kubetorch-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kubetorch might be problematic.
- kubetorch/__init__.py +60 -0
- kubetorch/cli.py +1985 -0
- kubetorch/cli_utils.py +1025 -0
- kubetorch/config.py +453 -0
- kubetorch/constants.py +18 -0
- kubetorch/docs/Makefile +18 -0
- kubetorch/docs/__init__.py +0 -0
- kubetorch/docs/_ext/json_globaltoc.py +42 -0
- kubetorch/docs/api/cli.rst +10 -0
- kubetorch/docs/api/python/app.rst +21 -0
- kubetorch/docs/api/python/cls.rst +19 -0
- kubetorch/docs/api/python/compute.rst +25 -0
- kubetorch/docs/api/python/config.rst +11 -0
- kubetorch/docs/api/python/fn.rst +19 -0
- kubetorch/docs/api/python/image.rst +14 -0
- kubetorch/docs/api/python/secret.rst +18 -0
- kubetorch/docs/api/python/volumes.rst +13 -0
- kubetorch/docs/api/python.rst +101 -0
- kubetorch/docs/conf.py +69 -0
- kubetorch/docs/index.rst +20 -0
- kubetorch/docs/requirements.txt +5 -0
- kubetorch/globals.py +285 -0
- kubetorch/logger.py +59 -0
- kubetorch/resources/__init__.py +0 -0
- kubetorch/resources/callables/__init__.py +0 -0
- kubetorch/resources/callables/cls/__init__.py +0 -0
- kubetorch/resources/callables/cls/cls.py +157 -0
- kubetorch/resources/callables/fn/__init__.py +0 -0
- kubetorch/resources/callables/fn/fn.py +133 -0
- kubetorch/resources/callables/module.py +1416 -0
- kubetorch/resources/callables/utils.py +174 -0
- kubetorch/resources/compute/__init__.py +0 -0
- kubetorch/resources/compute/app.py +261 -0
- kubetorch/resources/compute/compute.py +2596 -0
- kubetorch/resources/compute/decorators.py +139 -0
- kubetorch/resources/compute/rbac.py +74 -0
- kubetorch/resources/compute/utils.py +1114 -0
- kubetorch/resources/compute/websocket.py +137 -0
- kubetorch/resources/images/__init__.py +1 -0
- kubetorch/resources/images/image.py +414 -0
- kubetorch/resources/images/images.py +74 -0
- kubetorch/resources/secrets/__init__.py +2 -0
- kubetorch/resources/secrets/kubernetes_secrets_client.py +412 -0
- kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
- kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
- kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
- kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
- kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/providers.py +93 -0
- kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
- kubetorch/resources/secrets/secret.py +238 -0
- kubetorch/resources/secrets/secret_factory.py +70 -0
- kubetorch/resources/secrets/utils.py +209 -0
- kubetorch/resources/volumes/__init__.py +0 -0
- kubetorch/resources/volumes/volume.py +365 -0
- kubetorch/servers/__init__.py +0 -0
- kubetorch/servers/http/__init__.py +0 -0
- kubetorch/servers/http/distributed_utils.py +3223 -0
- kubetorch/servers/http/http_client.py +730 -0
- kubetorch/servers/http/http_server.py +1788 -0
- kubetorch/servers/http/server_metrics.py +278 -0
- kubetorch/servers/http/utils.py +728 -0
- kubetorch/serving/__init__.py +0 -0
- kubetorch/serving/autoscaling.py +173 -0
- kubetorch/serving/base_service_manager.py +363 -0
- kubetorch/serving/constants.py +83 -0
- kubetorch/serving/deployment_service_manager.py +478 -0
- kubetorch/serving/knative_service_manager.py +519 -0
- kubetorch/serving/raycluster_service_manager.py +582 -0
- kubetorch/serving/service_manager.py +18 -0
- kubetorch/serving/templates/deployment_template.yaml +17 -0
- kubetorch/serving/templates/knative_service_template.yaml +19 -0
- kubetorch/serving/templates/kt_setup_template.sh.j2 +81 -0
- kubetorch/serving/templates/pod_template.yaml +194 -0
- kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
- kubetorch/serving/templates/raycluster_template.yaml +35 -0
- kubetorch/serving/templates/service_template.yaml +21 -0
- kubetorch/serving/templates/workerset_template.yaml +36 -0
- kubetorch/serving/utils.py +377 -0
- kubetorch/utils.py +284 -0
- kubetorch-0.2.0.dist-info/METADATA +121 -0
- kubetorch-0.2.0.dist-info/RECORD +93 -0
- kubetorch-0.2.0.dist-info/WHEEL +4 -0
- kubetorch-0.2.0.dist-info/entry_points.txt +5 -0
kubetorch/resources/callables/module.py
@@ -0,0 +1,1416 @@
import asyncio
import json
import tempfile
import threading
import time
import urllib.parse
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Union

import websockets

from kubetorch.globals import config, service_url, service_url_async
from kubetorch.logger import get_logger
from kubetorch.resources.callables.utils import (
    get_names_for_reload_fallbacks,
    locate_working_dir,
)

from kubetorch.resources.compute.utils import (
    delete_cached_service_data,
    delete_configmaps,
    load_configmaps,
    VersionMismatchError,
)
from kubetorch.servers.http.http_client import HTTPClient
from kubetorch.servers.http.utils import (
    clean_and_validate_k8s_name,
    generate_unique_request_id,
    is_running_in_kubernetes,
)
from kubetorch.utils import (
    extract_host_port,
    get_kt_install_url,
    iso_timestamp_to_nanoseconds,
    LogVerbosity,
    ServerLogsFormatter,
)

logger = get_logger(__name__)


class Module:
    MODULE_TYPE = None

    def __init__(
        self,
        name: str,
        pointers: tuple,
    ):
        self._compute = None
        self._deployment_timestamp = None
        self._service_config = None
        self._http_client = None
        self._get_if_exists = True
        self._reload_prefixes = None
        self._serialization = "json"  # Default serialization format
        self._async = False
        self._remote_pointers = None
        self._service_name = None

        self.pointers = pointers
        self.name = (
            clean_and_validate_k8s_name(name, allow_full_length=False) if name else None
        )

    @property
    def module_name(self):
        """Name of the function or class."""
        return self.pointers[2]

    @property
    def reload_prefixes(self):
        return self._reload_prefixes or []

    @reload_prefixes.setter
    def reload_prefixes(self, value: Union[str, List[str]]):
        """Set the reload_prefixes property."""
        if isinstance(value, (list)):
            self._reload_prefixes = value
        elif isinstance(value, str):
            self._reload_prefixes = [value]
        else:
            raise ValueError("`reload_prefixes` must be a string or a list.")

    @property
    def namespace(self):
        """Namespace where the service is deployed."""
        if self.compute is not None:
            return self.compute.namespace
        return config.namespace

    @property
    def service_name(self):
        """Name of the knative service, formatted according to k8s regex rules."""
        if self._service_name:
            return self._service_name

        service_name = self.name

        if (
            config.username
            and not self.reload_prefixes
            and not service_name.startswith(config.username + "-")
        ):
            service_name = f"{config.username}-{service_name}"

        self._service_name = clean_and_validate_k8s_name(
            service_name, allow_full_length=True
        )
        return self._service_name

    @service_name.setter
    def service_name(self, value: str):
        self._service_name = clean_and_validate_k8s_name(value, allow_full_length=True)

    @property
    def compute(self):
        """Compute object corresponding to the module."""
        return self._compute

    @compute.setter
    def compute(self, compute: "Compute"):
        self._compute = compute

    @property
    def deployment_timestamp(self):
        if not self._deployment_timestamp:
            self._deployment_timestamp = (
                self.compute.service_manager.get_deployment_timestamp_annotation(
                    self.service_name
                )
            )
        return self._deployment_timestamp

    @deployment_timestamp.setter
    def deployment_timestamp(self, value: str):
        self._deployment_timestamp = value

    @property
    def remote_pointers(self):
        if self._remote_pointers:
            return self._remote_pointers

        source_dir = locate_working_dir(self.pointers[0])
        relative_module_path = (
            Path(self.pointers[0]).expanduser().relative_to(source_dir)
        )
        source_dir_name = Path(source_dir).name
        if self.compute.working_dir is not None:
            container_module_path = str(
                Path(self.compute.working_dir) / source_dir_name / relative_module_path
            )
        else:
            # Leave it as relative path
            container_module_path = str(Path(source_dir_name) / relative_module_path)
        self._remote_pointers = (
            container_module_path,
            self.pointers[1],
            self.pointers[2],
        )
        return self._remote_pointers

    @property
    def service_config(self) -> dict:
        """Knative service configuration loaded from Kubernetes API."""
        return self._service_config

    @service_config.setter
    def service_config(self, value: dict):
        self._service_config = value

    @property
    def base_endpoint(self):
        """Endpoint for the module."""
        if is_running_in_kubernetes():
            if not self._compute.endpoint:
                return self._compute._wait_for_endpoint()
            return self._compute.endpoint
        # URL format when using the NGINX proxy
        return f"http://localhost:{self._compute.client_port()}/{self.namespace}/{self.service_name}"

    @property
    def request_headers(self):
        if self.compute.freeze:
            return {}

        if self.deployment_timestamp:
            return {"X-Deployed-As-Of": self.deployment_timestamp}

        return {}

    @property
    def serialization(self):
        """Default serialization format for this module."""
        return self._serialization

    @serialization.setter
    def serialization(self, value: str):
        """Set the default serialization format for this module."""
        if value not in ["json", "pickle"]:
            raise ValueError("Serialization must be 'json' or 'pickle'")
        self._serialization = value

    @property
    def async_(self):
        """Whether to run the function or class methods in async mode."""
        return self._async

    @async_.setter
    def async_(self, value: bool):
        if not isinstance(value, bool):
            raise ValueError("`async_` must be a boolean")
        self._async = value

    @classmethod
    def from_name(
        cls,
        name: str,
        namespace: str = None,
        reload_prefixes: Union[str, List[str]] = [],
    ):
        """Reload an existing callable by its service name."""
        from kubernetes import client
        from kubernetes.config import (
            ConfigException,
            load_incluster_config,
            load_kube_config,
        )

        import kubetorch as kt

        try:
            load_incluster_config()
        except ConfigException:
            load_kube_config()
        objects_api = client.CustomObjectsApi()
        apps_v1_api = client.AppsV1Api()
        core_v1_api = client.CoreV1Api()

        namespace = namespace or config.namespace
        if isinstance(reload_prefixes, str):
            reload_prefixes = [reload_prefixes]
        potential_names = get_names_for_reload_fallbacks(
            name=name, prefixes=reload_prefixes
        )

        # Use unified service discovery from BaseServiceManager
        from kubetorch.serving.service_manager import BaseServiceManager

        all_services = BaseServiceManager.discover_services_static(
            namespace=namespace, objects_api=objects_api, apps_v1_api=apps_v1_api
        )

        # Create name-to-service lookup for efficient searching
        service_dict = {svc["name"]: svc for svc in all_services}

        # Try to find the first matching service across all service types
        for candidate in potential_names:

            service_info = service_dict.get(candidate)
            if service_info is None:
                continue

            compute = kt.Compute.from_template(service_info)

            pods = core_v1_api.list_namespaced_pod(
                namespace=namespace,
                label_selector=f"kubetorch.com/service={name}",
            )
            volumes = []

            # TODO: handle case where service is scaled to 0?
            if pods.items:
                # Use runtime Pod spec
                pod = pods.items[0]
                for v in pod.spec.volumes or []:
                    if v.persistent_volume_claim:
                        existing_volume = kt.Volume.from_name(name=v.name)
                        volumes.append(existing_volume)

            module_args = compute.get_env_vars(
                [
                    "KT_FILE_PATH",
                    "KT_MODULE_NAME",
                    "KT_CLS_OR_FN_NAME",
                    "KT_CALLABLE_TYPE",
                    "KT_INIT_ARGS",
                ]
            )
            pointers = (
                module_args["KT_FILE_PATH"],
                module_args["KT_MODULE_NAME"],
                module_args["KT_CLS_OR_FN_NAME"],
            )

            if module_args.get("KT_CALLABLE_TYPE") == "cls":
                init_args = json.loads(module_args.get("KT_INIT_ARGS") or "{}")
                reloaded_module = kt.Cls(
                    name=candidate, pointers=pointers, init_args=init_args
                )
            elif module_args.get("KT_CALLABLE_TYPE") == "fn":
                reloaded_module = kt.Fn(name=candidate, pointers=pointers)
            else:
                raise ValueError(
                    f"Unknown module type: {module_args.get('KT_CALLABLE_TYPE')}"
                )

            reloaded_module.service_name = candidate
            reloaded_module.compute = compute
            return reloaded_module

        raise ValueError(
            f"Service '{name}' not found in namespace '{namespace}' with reload_prefixes={reload_prefixes}"
        )

    def _client(self, *args, **kwargs):
        """Return the client through which to interact with the remote Module.
        If compute is not yet set, attempt to reload it.
        """
        if self._http_client is not None:
            return self._http_client

        if self.compute is None or self.service_config is None:
            namespace = self.namespace
            # When rebuilding the http client on reload, need to know whether to look for a prefix
            reload_prefixes = self.reload_prefixes
            logger.debug(
                f"Attempting to reload service '{self.service_name}' in namespace '{namespace}' with "
                f"reload_prefixes={reload_prefixes}"
            )
            reloaded_module = Module.from_name(
                name=self.service_name,
                namespace=namespace,
                reload_prefixes=reload_prefixes,
            )

            # Update settable attributes with reloaded module values
            self.compute = self.compute or reloaded_module.compute
            self.service_config = reloaded_module.service_config
            self.pointers = reloaded_module.pointers
            self.name = reloaded_module.name
            self.service_name = reloaded_module.service_name

        self._http_client = HTTPClient(
            base_url=self.endpoint(*args, **kwargs),
            compute=self.compute,
            service_name=self.service_name,
        )

        return self._http_client

    def endpoint(self, method_name: str = None):
        if not hasattr(self, "init_args"):
            return f"{self.base_endpoint}/{self.module_name}"
        else:
            return f"{self.base_endpoint}/{self.module_name}/{method_name}"

    def deploy(self):
        """
        Helper method to deploy modules specified by the @compute decorator. Used by `kt deploy` CLI command.
        Deploys the module to the specified compute.
        """
        if self.compute is None:
            raise ValueError("Compute must be set before deploying the module.")
        return self.to(self.compute, init_args=getattr(self, "init_args", None))

    async def deploy_async(self):
        """
        Async helper method to deploy modules specified by the @compute decorator. Used by `kt deploy` CLI command
        when multiple modules are present. Deploys the module to the specified compute asynchronously.
        """
        if self.compute is None:
            raise ValueError("Compute must be set before deploying the module.")
        return await self.to_async(
            self.compute, init_args=getattr(self, "init_args", None)
        )

    def to(
        self,
        compute: "Compute",
        init_args: Dict = None,
        stream_logs: Union[bool, None] = None,
        verbosity: Union[LogVerbosity, str] = None,
        get_if_exists: bool = False,
        reload_prefixes: Union[str, List[str]] = [],
        dryrun: bool = False,
    ):
        """
        Send the function or class to the specified compute.

        Args:
            compute (Compute): The compute to send the function or class to.
            init_args (Dict, optional): Initialization arguments, which may be relevant for a class.
            stream_logs (bool, optional): Whether to stream logs during service launch. If None, uses the global
                config value.
            verbosity (Union[verbosity, str], optional): Verbosity of the logs streamed back to the client.
                If not specified, will stream select service logs. Can also be controlled globally via the config
                value `log_verbosity`. Supported values: "debug", "info", "critical".
            get_if_exists (Union[bool, List[str]], optional): Controls how service lookup is performed to determine
                whether to send the service to the compute.

                - If False (default): Do not attempt to reload the service.
                - If True: Attempt to find an existing service using a standard fallback order
                  (e.g., username, git branch, then prod). If found, re-use that existing service.
            reload_prefixes (Union[str, List[str]], optional): A list of prefixes to use when reloading the function
                (e.g., ["qa", "prod", "git-branch-name"]). If not provided, will use the current username,
                git branch, and prod.
            dryrun (bool, optional): Whether to setup and return the object as a dryrun (``True``),
                or to actually launch the compute and service (``False``).
        Returns:
            Module: The module instance.

        Example:

            .. code-block:: python

                import kubetorch as kt

                remote_cls = kt.cls(SlowNumpyArray, name=name).to(
                    kt.Compute(cpus=".1"),
                    init_args={"size": 10},
                    stream_logs=True
                )
        """
        if get_if_exists:
            try:
                existing_service = self._get_existing_service(reload_prefixes)
                if existing_service:
                    logger.debug(
                        f"Reusing existing service: {existing_service.service_name}"
                    )
                    return existing_service
            except Exception as e:
                logger.info(
                    f"Service {self.service_name} not found in namespace {self.compute.namespace} "
                    f"with reload_prefixes={reload_prefixes}: {str(e)}"
                )

        self.compute = compute
        self.compute.service_name = self.service_name

        if hasattr(self, "init_args"):
            self.init_args = init_args

        # We need the deployment timestamp at the start of the update so we know that artifacts deployed **after**
        # this time are part of the current deployment. We actually set it at the end to ensure that the deployment is
        # successful.
        logger.debug(f"Deploying module: {self.service_name}")
        deployment_timestamp = datetime.now(timezone.utc).isoformat()
        install_url, use_editable = get_kt_install_url(self.compute.freeze)

        if not dryrun and not self.compute.freeze:
            self._rsync_repo_and_image_patches(install_url, use_editable, init_args)

        self._launch_service(
            install_url,
            use_editable,
            init_args,
            deployment_timestamp,
            stream_logs,
            verbosity,
            dryrun,
        )

        return self

    async def to_async(
        self,
        compute: "Compute",
        init_args: Dict = None,
        stream_logs: Union[bool, None] = None,
        verbosity: Union[LogVerbosity, str] = None,
        get_if_exists: bool = False,
        reload_prefixes: Union[str, List[str]] = [],
        dryrun: bool = False,
    ):
        """
        Async version of the `.to` method. Send the function or class to the specified compute asynchronously.

        Args:
            compute (Compute): The compute to send the function or class to.
            init_args (Dict, optional): Initialization arguments, which may be relevant for a class.
            stream_logs (bool, optional): Whether to stream logs during service launch. If None, uses the global
                config value.
            verbosity (Union[verbosity, str], optional): Verbosity of the logs streamed back to the client.
                If not specified, will stream select service logs. Can also be controlled globally via the config
                value `log_verbosity`. Supported values: "debug", "info", "critical".
            get_if_exists (Union[bool, List[str]], optional): Controls how service lookup is performed to determine
                whether to send the service to the compute.

                - If False (default): Do not attempt to reload the service.
                - If True: Attempt to find an existing service using a standard fallback order
                  (e.g., username, git branch, then prod). If found, re-use that existing service.
            reload_prefixes (Union[str, List[str]], optional): A list of prefixes to use when reloading the function
                (e.g., ["qa", "prod", "git-branch-name"]). If not provided, will use the current username,
                git branch, and prod.
            dryrun (bool, optional): Whether to setup and return the object as a dryrun (``True``),
                or to actually launch the compute and service (``False``).
        Returns:
            Module: The module instance.

        Example:

            .. code-block:: python

                import kubetorch as kt

                remote_cls = await kt.cls(SlowNumpyArray, name=name).to_async(
                    kt.Compute(cpus=".1"),
                    init_args={"size": 10},
                    stream_logs=True
                )
        """
        if get_if_exists:
            try:
                existing_service = await self._get_existing_service_async(
                    reload_prefixes
                )
                if existing_service:
                    logger.debug(
                        f"Reusing existing service: {existing_service.service_name}"
                    )
                    return existing_service
            except Exception as e:
                logger.info(
                    f"Service {self.compute.service_name} not found in namespace {self.compute.namespace} "
                    f"with reload_prefixes={reload_prefixes}: {str(e)}"
                )

        self.compute = compute
        self.compute.service_name = self.service_name

        if hasattr(self, "init_args"):
            self.init_args = init_args

        logger.debug(f"Deploying module: {self.service_name}")
        deployment_timestamp = datetime.now(timezone.utc).isoformat()
        install_url, use_editable = get_kt_install_url(self.compute.freeze)

        if not dryrun and not self.compute.freeze:
            await self._rsync_repo_and_image_patches_async(
                install_url, use_editable, init_args
            )

        await self._launch_service_async(
            install_url,
            use_editable,
            init_args,
            deployment_timestamp,
            stream_logs,
            verbosity,
            dryrun,
        )

        return self

    def _get_existing_service(self, reload_prefixes):
        try:
            existing_service = Module.from_name(
                self.service_name,
                namespace=self.namespace,
                reload_prefixes=reload_prefixes,
            )
            if existing_service:
                if self.compute:
                    # Replace the compute object, if the user has already constructed it locally
                    existing_service.compute = self.compute
                logger.info(
                    f"Existing service '{self.service_name}' found in namespace '{self.namespace}', not "
                    f"redeploying."
                )
                return existing_service
        except Exception as e:
            raise ValueError(
                f"Failed to reload service {self.service_name} in namespace {self.namespace} "
                f"and reload_prefixes={reload_prefixes}: {str(e)}"
            )

    async def _get_existing_service_async(self, reload_prefixes):
        try:
            existing_service = Module.from_name(
                self.service_name,
                namespace=self.namespace,
                reload_prefixes=reload_prefixes,
            )
            if existing_service:
                if self.compute:
                    # Replace the compute object, if the user has already constructed it locally
                    existing_service.compute = self.compute
                logger.info(
                    f"Existing service '{self.service_name}' found in namespace '{self.namespace}', not "
                    f"redeploying."
                )
                return existing_service
        except Exception as e:
            raise ValueError(
                f"Failed to reload service {self.service_name} in namespace {self.namespace} "
                f"and reload_prefixes={reload_prefixes}: {str(e)}"
            )

    def _rsync_repo_and_image_patches(self, install_url, use_editable, init_args):
        logger.debug("Rsyncing data to the rsync pod")
        source_dir = locate_working_dir(self.pointers[0])
        rsync_dirs = [str(source_dir)]
        if use_editable and install_url not in rsync_dirs:
            rsync_dirs.append(install_url)

        pointer_env_vars = self._get_pointer_env_vars(self.remote_pointers)
        metadata_env_vars = self._get_metadata_env_vars(init_args)
        service_dockerfile = self._get_service_dockerfile(
            {**pointer_env_vars, **metadata_env_vars}
        )
        self._construct_and_rsync_files(rsync_dirs, service_dockerfile)
        logger.debug(f"Rsync completed for service {self.service_name}")

    async def _rsync_repo_and_image_patches_async(
        self, install_url, use_editable, init_args
    ):
        logger.debug("Rsyncing data to the rsync pod")
        source_dir = locate_working_dir(self.pointers[0])
        rsync_dirs = [str(source_dir)]
        if use_editable and install_url not in rsync_dirs:
            rsync_dirs.append(install_url)

        pointer_env_vars = self._get_pointer_env_vars(self.remote_pointers)
        metadata_env_vars = self._get_metadata_env_vars(init_args)
        service_dockerfile = self._get_service_dockerfile(
            {**pointer_env_vars, **metadata_env_vars}
        )
        await self._construct_and_rsync_files_async(rsync_dirs, service_dockerfile)
        logger.debug(f"Rsync completed for service {self.service_name}")

    def _launch_service(
        self,
        install_url,
        use_editable,
        init_args,
        deployment_timestamp,
        stream_logs,
        verbosity,
        dryrun,
    ):
        # Start log streaming if enabled
        stop_event = threading.Event()
        log_thread = None
        if stream_logs is None:
            stream_logs = config.stream_logs or False

        launch_request_id = "-"
        if stream_logs and not dryrun:
            if verbosity is None:
                verbosity = config.log_verbosity

            # Create a unique request ID for this launch sequence
            launch_request_id = (
                f"launch_{generate_unique_request_id('launch', deployment_timestamp)}"
            )

            # Start log streaming in a separate thread
            log_thread = threading.Thread(
                target=self._stream_launch_logs,
                args=(
                    launch_request_id,
                    stop_event,
                    verbosity,
                    deployment_timestamp,
                ),
            )
            log_thread.daemon = True
            log_thread.start()

        try:
            startup_rsync_command = self._startup_rsync_command(use_editable, dryrun)

            # Launch the compute in the form of a service with the requested resources
            service_config = self.compute._launch(
                service_name=self.compute.service_name,
                install_url=install_url if not use_editable else None,
                pointer_env_vars=self._get_pointer_env_vars(self.remote_pointers),
                metadata_env_vars=self._get_metadata_env_vars(init_args),
                startup_rsync_command=startup_rsync_command,
                launch_id=launch_request_id,
                dryrun=dryrun,
            )
            self.service_config = service_config

            if not self.compute.freeze and not dryrun:
                self.deployment_timestamp = (
                    self.compute.service_manager.update_deployment_timestamp_annotation(
                        service_name=self.service_name,
                        new_timestamp=deployment_timestamp,
                    )
                )
            if not dryrun:
                self.compute._check_service_ready()
                # Additional health check to ensure HTTP server is ready
                self._wait_for_http_health()
        finally:
            # Stop log streaming
            if log_thread:
                stop_event.set()

    async def _launch_service_async(
        self,
        install_url,
        use_editable,
        init_args,
        deployment_timestamp,
        stream_logs,
        verbosity,
        dryrun,
    ):
        # Start log streaming if enabled
        stop_event = asyncio.Event()
        log_task = None
        if stream_logs is None:
            stream_logs = config.stream_logs or False

        launch_request_id = "-"
        if stream_logs and not dryrun:
            if verbosity is None:
                verbosity = config.log_verbosity

            # Create a unique request ID for this launch sequence
            launch_request_id = (
                f"launch_{generate_unique_request_id('launch', deployment_timestamp)}"
            )

            # Start log streaming as an async task
            log_task = asyncio.create_task(
                self._stream_launch_logs_async(
                    launch_request_id,
                    stop_event,
                    verbosity,
                    deployment_timestamp,
                )
            )

        try:
            startup_rsync_command = self._startup_rsync_command(use_editable, dryrun)

            # Launch the compute in the form of a service with the requested resources
            # Use the async version of _launch
            service_config = await self.compute._launch_async(
                service_name=self.compute.service_name,
                install_url=install_url if not use_editable else None,
                pointer_env_vars=self._get_pointer_env_vars(self.remote_pointers),
                metadata_env_vars=self._get_metadata_env_vars(init_args),
                startup_rsync_command=startup_rsync_command,
                launch_id=launch_request_id,
                dryrun=dryrun,
            )
            self.service_config = service_config

            if not self.compute.freeze and not dryrun:
                self.deployment_timestamp = (
                    self.compute.service_manager.update_deployment_timestamp_annotation(
                        service_name=self.service_name,
                        new_timestamp=deployment_timestamp,
                    )
                )
            if not dryrun:
                await self.compute._check_service_ready_async()
                await self._wait_for_http_health_async()
        finally:
            # Stop log streaming
            if log_task:
                stop_event.set()
                try:
                    await asyncio.wait_for(log_task, timeout=2.0)
                except asyncio.TimeoutError:
                    log_task.cancel()
                    try:
                        await log_task
                    except asyncio.CancelledError:
                        pass

    def _get_service_dockerfile(self, metadata_env_vars):
        image_instructions = self.compute._image_setup_and_instructions()

        image_instructions += "\n"
        for key, val in metadata_env_vars.items():
            if isinstance(val, Dict):
                val = json.dumps(val)
            image_instructions += f"ENV {key} {val}\n"

        logger.debug(
            f"Generated Dockerfile for service {self.service_name}:\n{image_instructions}"
        )
        return image_instructions

    def _construct_and_rsync_files(self, rsync_dirs, service_dockerfile):
        with tempfile.TemporaryDirectory() as tmpdir:
            temp_file = Path(tmpdir) / ".kt" / "image.dockerfile"
            temp_file.parent.mkdir(parents=True, exist_ok=True)
            temp_file.write_text(service_dockerfile)

            source_dir = str(Path(tmpdir) / ".kt")
            rsync_dirs.append(source_dir)

            logger.debug(f"Rsyncing directories: {rsync_dirs}")
            if is_running_in_kubernetes():
                self.compute.rsync_in_cluster(rsync_dirs)
            else:
                self.compute.rsync(rsync_dirs)

    async def _construct_and_rsync_files_async(self, rsync_dirs, service_dockerfile):
        with tempfile.TemporaryDirectory() as tmpdir:
            temp_file = Path(tmpdir) / ".kt" / "image.dockerfile"
            temp_file.parent.mkdir(parents=True, exist_ok=True)
            temp_file.write_text(service_dockerfile)

            source_dir = str(Path(tmpdir) / ".kt")
            rsync_dirs.append(source_dir)

            logger.debug(f"Rsyncing directories: {rsync_dirs}")
            if is_running_in_kubernetes():
                await self.compute.rsync_in_cluster_async(rsync_dirs)
            else:
                await self.compute.rsync_async(rsync_dirs)

    def _startup_rsync_command(self, use_editable, dryrun):
        if not use_editable or dryrun:
            return None

        # rsync from the rsync pod's file system directly
        startup_cmd = self.compute._rsync_svc_url()
        cmd = f"rsync -av {startup_cmd} ."
        return cmd

    def teardown(self):
        """Delete the service and all associated resources."""
        logger.info(f"Deleting service: {self.service_name}")

        # Use the compute's service manager - it already knows the correct type!
        teardown_success = self.compute.service_manager.teardown_service(
            service_name=self.service_name,
        )

        if not teardown_success:
            logger.error(f"Failed to teardown service {self.service_name}")
            return

        configmaps = load_configmaps(
            core_api=self.compute.core_api,
            service_name=self.service_name,
            namespace=self.compute.namespace,
        )
        if configmaps:
            logger.info(
                f"Deleting {len(configmaps)} configmap{'' if len(configmaps) == 1 else 's'}"
            )
            delete_configmaps(
                core_api=self.compute.core_api,
                configmaps=configmaps,
                namespace=self.compute.namespace,
            )

        logger.info("Deleting service data from cache in rsync pod")
        delete_cached_service_data(
            core_api=self.compute.core_api,
            service_name=self.service_name,
            namespace=self.compute.namespace,
        )

    def _get_pointer_env_vars(self, remote_pointers):
        (container_file_path, module_name, cls_or_fn_name) = remote_pointers
        return {
            "KT_FILE_PATH": container_file_path,
            "KT_MODULE_NAME": module_name,
            "KT_CLS_OR_FN_NAME": cls_or_fn_name,
        }

    def _get_metadata_env_vars(
        self,
        init_args: Dict,
    ) -> Dict:
        # TODO: add other callable metadata in addition to pointers (`is_generator`, `is_async`, etc.)
        import json

        distributed_config = self.compute.distributed_config
        return {
            "KT_INIT_ARGS": init_args,
            "KT_CALLABLE_TYPE": self.MODULE_TYPE,
            "KT_DISTRIBUTED_CONFIG": json.dumps(distributed_config)
            if distributed_config
            else None,
        }

    def _stream_launch_logs(
        self,
        request_id: str,
        stop_event: threading.Event,
        verbosity: LogVerbosity,
        deployment_timestamp: str,
    ):
        """Stream logs and events during service launch sequence."""
        try:
            # Only use "kubetorch" container to exclude queue-proxy (e.g. Knative sidecars) container logs which
            # are spammy with tons of healthcheck calls
            pod_query = (
                f'{{k8s_container_name="kubetorch"}} | json | request_id="{request_id}"'
            )
            event_query = f'{{service_name="unknown_service"}} | json | k8s_object_name=~"{self.service_name}.*" | k8s_namespace_name="{self.namespace}"'

            encoded_pod_query = urllib.parse.quote_plus(pod_query)
            encoded_event_query = urllib.parse.quote_plus(event_query)
            logger.debug(
                f"Streaming launch logs and events for service {self.service_name}"
            )

            def start_log_threads(host, port):
                def run_pod_logs():
                    self._run_log_stream(
                        request_id,
                        stop_event,
                        host,
                        port,
                        encoded_pod_query,
                        verbosity,
                        deployment_timestamp,
                        dedup=True,
                    )

                def run_event_logs():
                    self._run_log_stream(
                        request_id,
                        stop_event,
                        host,
                        port,
                        encoded_event_query,
                        verbosity,
                        deployment_timestamp,
                    )

                pod_thread = threading.Thread(target=run_pod_logs, daemon=True)
                event_thread = threading.Thread(target=run_event_logs, daemon=True)

                pod_thread.start()
                event_thread.start()

                # Don't block indefinitely on joins - use short timeouts
                pod_thread.join(timeout=1.0)
                event_thread.join(timeout=1.0)

            base_url = service_url()
            host, port = extract_host_port(base_url)
            logger.debug(
                f"Streaming launch logs with url={base_url} host={host} and local port {port}"
            )
            start_log_threads(host, port)

        except Exception as e:
            logger.error(f"Failed to stream launch logs: {e}")
            raise e

    async def _stream_launch_logs_async(
        self,
        request_id: str,
        stop_event: asyncio.Event,
        verbosity: LogVerbosity,
        deployment_timestamp: str,
    ):
        """Async version of _stream_launch_logs. Stream logs and events during service launch sequence."""
        try:
            # Only use "kubetorch" container to exclude queue-proxy (e.g. Knative sidecars) container logs which
            # are spammy with tons of healthcheck calls
            pod_query = (
                f'{{k8s_container_name="kubetorch"}} | json | request_id="{request_id}"'
            )
            event_query = f'{{service_name="unknown_service"}} | json | k8s_object_name=~"{self.service_name}.*" | k8s_namespace_name="{self.namespace}"'

            encoded_pod_query = urllib.parse.quote_plus(pod_query)
            encoded_event_query = urllib.parse.quote_plus(event_query)
            logger.debug(
                f"Streaming launch logs and events for service {self.service_name}"
            )

            base_url = await service_url_async()
            host, port = extract_host_port(base_url)
            logger.debug(
                f"Streaming launch logs with url={base_url} host={host} and local port {port}"
            )

            # Create async tasks for both log streams
            pod_task = asyncio.create_task(
                self._stream_logs_websocket(
                    request_id,
                    stop_event,
                    host=host,
                    port=port,
                    query=encoded_pod_query,
                    log_verbosity=verbosity,
                    deployment_timestamp=deployment_timestamp,
                    dedup=True,
                )
            )

            event_task = asyncio.create_task(
                self._stream_logs_websocket(
                    request_id,
                    stop_event,
                    host=host,
                    port=port,
                    query=encoded_event_query,
                    log_verbosity=verbosity,
                    deployment_timestamp=deployment_timestamp,
                )
            )

            # Wait for both tasks to complete or be cancelled
            try:
                await asyncio.gather(pod_task, event_task, return_exceptions=True)
            except Exception as e:
                logger.error(f"Error in async log streaming: {e}")

        except Exception as e:
            logger.error(f"Failed to stream launch logs: {e}")
            raise e

    def _run_log_stream(
        self,
        request_id: str,
        stop_event: threading.Event,
        host: str,
        port: int,
        query: str,
        log_verbosity: LogVerbosity,
        deployment_timestamp: str,
        dedup: bool = False,
    ):
        """Helper to run log streaming in an event loop"""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(
                self._stream_logs_websocket(
                    request_id,
                    stop_event,
                    host=host,
                    port=port,
                    query=query,
                    log_verbosity=log_verbosity,
                    deployment_timestamp=deployment_timestamp,
                    dedup=dedup,
                )
            )
        finally:
            loop.close()

    async def _run_log_stream_async(
        self,
        request_id: str,
        stop_event: asyncio.Event,
        host: str,
        port: int,
        query: str,
        log_verbosity: LogVerbosity,
        deployment_timestamp: str,
        dedup: bool = False,
    ):
        """Async helper to run log streaming directly in the current event loop"""
        await self._stream_logs_websocket(
            request_id,
            stop_event,
            host=host,
            port=port,
            query=query,
            log_verbosity=log_verbosity,
            deployment_timestamp=deployment_timestamp,
            dedup=dedup,
        )

    async def _stream_logs_websocket(
        self,
        request_id: str,
        stop_event: Union[threading.Event, asyncio.Event],
        host: str,
        port: int,
        query: str,
        log_verbosity: LogVerbosity,
        deployment_timestamp: str,
        dedup: bool = False,
    ):
        """Stream logs and events using Loki's websocket tail endpoint"""
        try:
            uri = f"ws://{host}:{port}/loki/api/v1/tail?query={query}"

            # Track the last timestamp we've seen to avoid duplicates
            last_timestamp = None

            # Track when we should stop
            stop_time = None

            # Track most recent deployment timestamp to filter out old logs / events
            start_timestamp = iso_timestamp_to_nanoseconds(deployment_timestamp)

            shown_event_messages = set()

            # Track seen log messages for deduplication
            seen_log_messages = set() if dedup else None

            # For formatting the server setup logs
            formatters = {}
            base_formatter = ServerLogsFormatter()
            websocket = None
            try:
                # Add timeout to prevent hanging connections
                websocket = await websockets.connect(
                    uri,
                    close_timeout=10,  # Max time to wait for close handshake
                    ping_interval=20,  # Send ping every 20 seconds
                    ping_timeout=10,  # Wait 10 seconds for pong
                )
                while True:
                    # If stop event is set, start counting down
                    # Handle both threading.Event and asyncio.Event
                    is_stop_set = (
                        stop_event.is_set()
                        if hasattr(stop_event, "is_set")
                        else stop_event.is_set()
                    )
                    if is_stop_set and stop_time is None:
                        stop_time = time.time() + 2  # 2 second grace period

                    # If we're past the grace period, exit
                    if stop_time is not None and time.time() > stop_time:
                        break

                    try:
                        # Use shorter timeout during grace period
                        timeout = 0.1 if stop_time is not None else 1.0
                        message = await asyncio.wait_for(
                            websocket.recv(), timeout=timeout
                        )
                        data = json.loads(message)

                        if data.get("streams"):
                            for stream in data["streams"]:
                                labels = stream.get("stream", {})
                                is_event = "k8s_event_count" in list(labels.keys())
                                for value in stream["values"]:
                                    ts_ns = int(value[0])
                                    if (
                                        start_timestamp is not None
                                        and ts_ns < start_timestamp
                                    ):
                                        continue
                                    log_line = value[1]
                                    if is_event:
                                        event_type = labels.get("detected_level", "")
                                        if (
                                            log_verbosity == LogVerbosity.CRITICAL
                                            and event_type == "Normal"
                                        ):
                                            # skip Normal events in MINIMAL
                                            continue

                                        try:
                                            msg = log_line
                                            reason = (
                                                labels.get("k8s_event_reason", ""),
                                            )

                                            # Note: relevant starting in release 0.1.19 (using OTel instead of Alloy)
                                            if isinstance(reason, tuple):
                                                reason = reason[0]

                                            event_type = labels.get(
                                                "detected_level", ""
                                            )

                                            if reason == "Unhealthy" and (
                                                "HTTP probe failed with statuscode: 503"
                                                in msg
                                                or "Startup probe failed" in msg
                                            ):
                                                # HTTP probe failures are expected during setup
                                                continue

                                            ignore_patterns = (
                                                "queue-proxy",
                                                "resolving reference: address not set for kind = service",
                                                "failed to get private k8s service endpoints:",
                                            )
                                            # Ignore queue-proxy events and gateway setup events
                                            if any(
                                                pattern in msg.lower()
                                                for pattern in ignore_patterns
                                            ):
                                                continue

                                            if msg in shown_event_messages:
                                                # Only show unique event messages
                                                continue

                                            shown_event_messages.add(msg)

                                        except Exception:
                                            # If parsing fails, just print the event as is
                                            pass

                                        if event_type == "Normal":
                                            if log_verbosity in [
                                                LogVerbosity.INFO,
                                                LogVerbosity.DEBUG,
                                            ]:
                                                print(
                                                    f'[EVENT] reason={reason} "{msg}"'
                                                )
                                        else:
                                            print(
                                                f'[EVENT] type={event_type} reason={reason} "{msg}"'
                                            )
                                        continue

                                    # Skip if we've already seen this timestamp
                                    if (
                                        last_timestamp is not None
                                        and value[0] <= last_timestamp
                                    ):
                                        continue
                                    last_timestamp = value[0]
                                    if log_verbosity in [
                                        LogVerbosity.DEBUG,
                                        LogVerbosity.INFO,
                                    ]:
                                        try:
                                            log_dict = json.loads(log_line)
                                        except json.JSONDecodeError:
                                            # setup steps pre server start are not JSON formatted
                                            log_dict = None

                                        if log_dict is not None:
                                            # at this stage we are post setup
                                            pod_name = log_dict.get("pod", request_id)
                                            levelname = log_dict.get(
                                                "levelname", "INFO"
                                            )
                                            ts = log_dict.get("asctime")
                                            message = log_dict.get("message", "")

                                            if (
                                                log_verbosity == LogVerbosity.CRITICAL
                                                and levelname
                                                not in ["ERROR", "CRITICAL"]
                                            ) or (
                                                log_verbosity == LogVerbosity.INFO
                                                and levelname == "DEBUG"
                                            ):
                                                continue

                                            log_line = f"{levelname} | {ts} | {message}"
                                            if pod_name not in formatters:
                                                formatters[
                                                    pod_name
                                                ] = ServerLogsFormatter(pod_name)
                                            formatter = formatters[pod_name]
                                        else:
                                            # streaming pre server setup logs, before we have the pod name
                                            formatter = base_formatter

                                        newline = "" if log_dict is None else None
                                        formatted_line = f"{formatter.start_color}{f'({self.service_name}) '}{log_line}{formatter.reset_color}"

                                        # Check for duplicates if dedup is enabled
                                        if seen_log_messages is not None:
                                            if message in seen_log_messages:
                                                continue
                                            seen_log_messages.add(message)

                                        print(formatted_line, end=newline)
                    except asyncio.TimeoutError:
                        # Timeout is expected, just continue the loop
                        continue
                    except websockets.exceptions.ConnectionClosed as e:
                        logger.debug(f"WebSocket connection closed: {str(e)}")
                        break
            finally:
                if websocket:
                    try:
                        # Use wait_for to prevent hanging on close
                        await asyncio.wait_for(websocket.close(), timeout=1.0)
                    except (asyncio.TimeoutError, Exception):
                        pass
        except Exception as e:
            logger.error(f"Error in websocket stream: {e}")
            raise e
        finally:
            # Ensure websocket is closed even if we didn't enter the try block
            if websocket:
                try:
                    # Use wait_for to prevent hanging on close
                    await asyncio.wait_for(websocket.close(), timeout=1.0)
                except (asyncio.TimeoutError, Exception):
                    pass

    def _wait_for_http_health(
        self, timeout=60, retry_interval=0.1, backoff=2, max_interval=10
    ):
        """Wait for the HTTP server to be ready by checking the /health endpoint.

        Args:
            timeout: Maximum time to wait in seconds
            retry_interval: Time between health check attempts in seconds
        """
        import time

        logger.info(
            f"Waiting for HTTP server to be ready for service {self.service_name}"
        )
        start_time = time.time()

        while time.time() - start_time < timeout:
            try:
                client = self._client()
                response = client.get(
                    endpoint=f"{self.base_endpoint}/health",
                    headers=self.request_headers,
                )
                if response.status_code == 200:
                    logger.info(f"HTTP server is ready for service {self.service_name}")
                    return
                else:
                    logger.debug(
                        f"Health check returned status {response.status_code}, retrying..."
                    )

            except VersionMismatchError as e:
                raise e

            except Exception as e:
                logger.debug(f"Health check failed: {e}, retrying...")

            time.sleep(retry_interval)
            retry_interval *= backoff  # Exponential backoff
            # Cap the retry interval to a maximum value
            retry_interval = min(retry_interval, max_interval)

        # If we get here, we've timed out
        logger.warning(
            f"HTTP health check timed out after {timeout}s for service {self.service_name}"
        )

    async def _wait_for_http_health_async(
        self, timeout=60, retry_interval=0.1, backoff=2, max_interval=10
    ):
        """Async version of _wait_for_http_health. Wait for the HTTP server to be ready by checking the /health endpoint.

        Args:
            timeout: Maximum time to wait in seconds
            retry_interval: Time between health check attempts in seconds
        """
        import asyncio

        logger.info(
            f"Waiting for HTTP server to be ready for service {self.service_name}"
        )
        start_time = time.time()

        while time.time() - start_time < timeout:
            try:
                client = self._client()
                response = client.get(
                    endpoint=f"{self.base_endpoint}/health",
                    headers=self.request_headers,
                )
                if response.status_code == 200:
                    logger.info(f"HTTP server is ready for service {self.service_name}")
                    return
                else:
                    logger.debug(
                        f"Health check returned status {response.status_code}, retrying..."
                    )
            except Exception as e:
                logger.debug(f"Health check failed: {e}, retrying...")

            await asyncio.sleep(retry_interval)
            retry_interval *= backoff  # Exponential backoff
            # Cap the retry interval to a maximum value
            retry_interval = min(retry_interval, max_interval)

        # If we get here, we've timed out
        logger.warning(
            f"HTTP health check timed out after {timeout}s for service {self.service_name}"
        )

    def __getstate__(self):
        """Remove local stateful values before pickle serialization."""
        state = self.__dict__.copy()
        # Remove local stateful values that shouldn't be serialized
        state["_http_client"] = None
        state["_service_config"] = None
        state["_remote_pointers"] = None
        # Pointers need to be converted to not be absolute paths if we're passing
        # the service elsewhere, e.g. into another service
        state["pointers"] = self.remote_pointers
        return state

    def __setstate__(self, state):
        """Restore state after pickle deserialization."""
        self.__dict__.update(state)
        # Reset local stateful values to None to ensure clean initialization
        self._http_client = None
        self._service_config = None
        self._remote_pointers = None

    def __del__(self):
        if hasattr(self, "_http_client") and self._http_client is not None:
            try:
                self._http_client.close()
            except Exception as e:
                logger.debug(f"Error closing HTTPClient in Module deletion: {e}")
            finally:
                self._http_client = None