kubetorch 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. kubetorch/__init__.py +59 -0
  2. kubetorch/cli.py +1939 -0
  3. kubetorch/cli_utils.py +967 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +269 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +159 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +140 -0
  30. kubetorch/resources/callables/module.py +1315 -0
  31. kubetorch/resources/callables/utils.py +203 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +253 -0
  34. kubetorch/resources/compute/compute.py +2414 -0
  35. kubetorch/resources/compute/decorators.py +137 -0
  36. kubetorch/resources/compute/utils.py +1026 -0
  37. kubetorch/resources/compute/websocket.py +135 -0
  38. kubetorch/resources/images/__init__.py +1 -0
  39. kubetorch/resources/images/image.py +412 -0
  40. kubetorch/resources/images/images.py +64 -0
  41. kubetorch/resources/secrets/__init__.py +2 -0
  42. kubetorch/resources/secrets/kubernetes_secrets_client.py +377 -0
  43. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  44. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  45. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  46. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  47. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  48. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  49. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  50. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  51. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  52. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  53. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  54. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  55. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  56. kubetorch/resources/secrets/provider_secrets/providers.py +92 -0
  57. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  58. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  59. kubetorch/resources/secrets/secret.py +224 -0
  60. kubetorch/resources/secrets/secret_factory.py +64 -0
  61. kubetorch/resources/secrets/utils.py +222 -0
  62. kubetorch/resources/volumes/__init__.py +0 -0
  63. kubetorch/resources/volumes/volume.py +340 -0
  64. kubetorch/servers/__init__.py +0 -0
  65. kubetorch/servers/http/__init__.py +0 -0
  66. kubetorch/servers/http/distributed_utils.py +2968 -0
  67. kubetorch/servers/http/http_client.py +802 -0
  68. kubetorch/servers/http/http_server.py +1622 -0
  69. kubetorch/servers/http/server_metrics.py +255 -0
  70. kubetorch/servers/http/utils.py +722 -0
  71. kubetorch/serving/__init__.py +0 -0
  72. kubetorch/serving/autoscaling.py +153 -0
  73. kubetorch/serving/base_service_manager.py +344 -0
  74. kubetorch/serving/constants.py +77 -0
  75. kubetorch/serving/deployment_service_manager.py +431 -0
  76. kubetorch/serving/knative_service_manager.py +487 -0
  77. kubetorch/serving/raycluster_service_manager.py +526 -0
  78. kubetorch/serving/service_manager.py +18 -0
  79. kubetorch/serving/templates/deployment_template.yaml +17 -0
  80. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  81. kubetorch/serving/templates/kt_setup_template.sh.j2 +91 -0
  82. kubetorch/serving/templates/pod_template.yaml +198 -0
  83. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  84. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  85. kubetorch/serving/templates/service_template.yaml +21 -0
  86. kubetorch/serving/templates/workerset_template.yaml +36 -0
  87. kubetorch/serving/utils.py +344 -0
  88. kubetorch/utils.py +263 -0
  89. kubetorch-0.2.5.dist-info/METADATA +75 -0
  90. kubetorch-0.2.5.dist-info/RECORD +92 -0
  91. kubetorch-0.2.5.dist-info/WHEEL +4 -0
  92. kubetorch-0.2.5.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,1315 @@
1
+ import asyncio
2
+ import json
3
+ import tempfile
4
+ import threading
5
+ import time
6
+ import urllib.parse
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+ from typing import Dict, List, Union
10
+
11
+ import websockets
12
+
13
+ from kubetorch.globals import config, service_url, service_url_async
14
+ from kubetorch.logger import get_logger
15
+ from kubetorch.resources.callables.utils import get_names_for_reload_fallbacks, locate_working_dir
16
+
17
+ from kubetorch.resources.compute.utils import (
18
+ delete_cached_service_data,
19
+ delete_configmaps,
20
+ load_configmaps,
21
+ VersionMismatchError,
22
+ )
23
+ from kubetorch.servers.http.http_client import HTTPClient
24
+ from kubetorch.servers.http.utils import (
25
+ clean_and_validate_k8s_name,
26
+ generate_unique_request_id,
27
+ is_running_in_kubernetes,
28
+ )
29
+ from kubetorch.serving.utils import has_k8s_credentials, KubernetesCredentialsError
30
+ from kubetorch.utils import (
31
+ extract_host_port,
32
+ get_kt_install_url,
33
+ iso_timestamp_to_nanoseconds,
34
+ LogVerbosity,
35
+ ServerLogsFormatter,
36
+ )
37
+
38
+ logger = get_logger(__name__)
39
+
40
+
41
class Module:
    """Base class for remote callables (functions and classes) deployed as kubetorch services."""

    # Overridden by subclasses (e.g. Fn / Cls) to tag the callable type; None in the base class.
    MODULE_TYPE = None
43
+
44
+ def __init__(
45
+ self,
46
+ name: str,
47
+ pointers: tuple,
48
+ ):
49
+ self._compute = None
50
+ self._deployment_timestamp = None
51
+ self._service_config = None
52
+ self._http_client = None
53
+ self._get_if_exists = True
54
+ self._reload_prefixes = None
55
+ self._serialization = "json" # Default serialization format
56
+ self._async = False
57
+ self._remote_pointers = None
58
+ self._service_name = None
59
+
60
+ self.pointers = pointers
61
+ self.name = clean_and_validate_k8s_name(name, allow_full_length=False) if name else None
62
+
63
+ @property
64
+ def module_name(self):
65
+ """Name of the function or class."""
66
+ return self.pointers[2]
67
+
68
+ @property
69
+ def reload_prefixes(self):
70
+ return self._reload_prefixes or []
71
+
72
+ @reload_prefixes.setter
73
+ def reload_prefixes(self, value: Union[str, List[str]]):
74
+ """Set the reload_prefixes property."""
75
+ if isinstance(value, (list)):
76
+ self._reload_prefixes = value
77
+ elif isinstance(value, str):
78
+ self._reload_prefixes = [value]
79
+ else:
80
+ raise ValueError("`reload_prefixes` must be a string or a list.")
81
+
82
+ @property
83
+ def namespace(self):
84
+ """Namespace where the service is deployed."""
85
+ if self.compute is not None:
86
+ return self.compute.namespace
87
+ return config.namespace
88
+
89
+ @property
90
+ def service_name(self):
91
+ """Name of the knative service, formatted according to k8s regex rules."""
92
+ if self._service_name:
93
+ return self._service_name
94
+
95
+ service_name = self.name
96
+
97
+ if config.username and not self.reload_prefixes and not service_name.startswith(config.username + "-"):
98
+ service_name = f"{config.username}-{service_name}"
99
+
100
+ self._service_name = clean_and_validate_k8s_name(service_name, allow_full_length=True)
101
+ return self._service_name
102
+
103
+ @service_name.setter
104
+ def service_name(self, value: str):
105
+ self._service_name = clean_and_validate_k8s_name(value, allow_full_length=True)
106
+
107
+ @property
108
+ def compute(self):
109
+ """Compute object corresponding to the module."""
110
+ return self._compute
111
+
112
+ @compute.setter
113
+ def compute(self, compute: "Compute"):
114
+ self._compute = compute
115
+
116
+ @property
117
+ def deployment_timestamp(self):
118
+ if not self._deployment_timestamp:
119
+ self._deployment_timestamp = self.compute.service_manager.get_deployment_timestamp_annotation(
120
+ self.service_name
121
+ )
122
+ return self._deployment_timestamp
123
+
124
+ @deployment_timestamp.setter
125
+ def deployment_timestamp(self, value: str):
126
+ self._deployment_timestamp = value
127
+
128
+ @property
129
+ def remote_pointers(self):
130
+ if self._remote_pointers:
131
+ return self._remote_pointers
132
+
133
+ source_dir, _ = locate_working_dir(self.pointers[0])
134
+ relative_module_path = Path(self.pointers[0]).expanduser().relative_to(source_dir)
135
+ source_dir_name = Path(source_dir).name
136
+ if self.compute.working_dir is not None:
137
+ container_module_path = str(Path(self.compute.working_dir) / source_dir_name / relative_module_path)
138
+ else:
139
+ # Leave it as relative path
140
+ container_module_path = str(Path(source_dir_name) / relative_module_path)
141
+ self._remote_pointers = (
142
+ container_module_path,
143
+ self.pointers[1],
144
+ self.pointers[2],
145
+ )
146
+ return self._remote_pointers
147
+
148
+ @property
149
+ def service_config(self) -> dict:
150
+ """Knative service configuration loaded from Kubernetes API."""
151
+ return self._service_config
152
+
153
+ @service_config.setter
154
+ def service_config(self, value: dict):
155
+ self._service_config = value
156
+
157
+ @property
158
+ def base_endpoint(self):
159
+ """Endpoint for the module."""
160
+ if is_running_in_kubernetes():
161
+ if not self._compute.endpoint:
162
+ return self._compute._wait_for_endpoint()
163
+ return self._compute.endpoint
164
+ # URL format when using the NGINX proxy
165
+ return f"http://localhost:{self._compute.client_port()}/{self.namespace}/{self.service_name}"
166
+
167
+ @property
168
+ def request_headers(self):
169
+ if self.compute.freeze:
170
+ return {}
171
+
172
+ if self.deployment_timestamp:
173
+ return {"X-Deployed-As-Of": self.deployment_timestamp}
174
+
175
+ return {}
176
+
177
+ @property
178
+ def serialization(self):
179
+ """Default serialization format for this module."""
180
+ return self._serialization
181
+
182
+ @serialization.setter
183
+ def serialization(self, value: str):
184
+ """Set the default serialization format for this module."""
185
+ if value not in ["json", "pickle"]:
186
+ raise ValueError("Serialization must be 'json' or 'pickle'")
187
+ self._serialization = value
188
+
189
+ @property
190
+ def async_(self):
191
+ """Whether to run the function or class methods in async mode."""
192
+ return self._async
193
+
194
+ @async_.setter
195
+ def async_(self, value: bool):
196
+ if not isinstance(value, bool):
197
+ raise ValueError("`async_` must be a boolean")
198
+ self._async = value
199
+
200
+ @classmethod
201
+ def from_name(
202
+ cls,
203
+ name: str,
204
+ namespace: str = None,
205
+ reload_prefixes: Union[str, List[str]] = [],
206
+ ):
207
+ """Reload an existing callable by its service name."""
208
+ from kubernetes import client
209
+ from kubernetes.config import ConfigException, load_incluster_config, load_kube_config
210
+
211
+ import kubetorch as kt
212
+
213
+ try:
214
+ load_incluster_config()
215
+ except ConfigException:
216
+ load_kube_config()
217
+ objects_api = client.CustomObjectsApi()
218
+ apps_v1_api = client.AppsV1Api()
219
+ core_v1_api = client.CoreV1Api()
220
+
221
+ namespace = namespace or config.namespace
222
+ if isinstance(reload_prefixes, str):
223
+ reload_prefixes = [reload_prefixes]
224
+ potential_names = get_names_for_reload_fallbacks(name=name, prefixes=reload_prefixes)
225
+
226
+ # Use unified service discovery from BaseServiceManager
227
+ from kubetorch.serving.service_manager import BaseServiceManager
228
+
229
+ all_services = BaseServiceManager.discover_services_static(
230
+ namespace=namespace, objects_api=objects_api, apps_v1_api=apps_v1_api
231
+ )
232
+
233
+ # Create name-to-service lookup for efficient searching
234
+ service_dict = {svc["name"]: svc for svc in all_services}
235
+
236
+ # Try to find the first matching service across all service types
237
+ for candidate in potential_names:
238
+ service_info = service_dict.get(candidate)
239
+ if service_info is None:
240
+ continue
241
+
242
+ compute = kt.Compute.from_template(service_info)
243
+
244
+ pods = core_v1_api.list_namespaced_pod(
245
+ namespace=namespace,
246
+ label_selector=f"kubetorch.com/service={name}",
247
+ )
248
+ volumes = []
249
+
250
+ # TODO: handle case where service is scaled to 0?
251
+ if pods.items:
252
+ # Use runtime Pod spec
253
+ pod = pods.items[0]
254
+ for v in pod.spec.volumes or []:
255
+ if v.persistent_volume_claim:
256
+ existing_volume = kt.Volume.from_name(name=v.name)
257
+ volumes.append(existing_volume)
258
+
259
+ module_args = compute.get_env_vars(
260
+ [
261
+ "KT_FILE_PATH",
262
+ "KT_MODULE_NAME",
263
+ "KT_CLS_OR_FN_NAME",
264
+ "KT_CALLABLE_TYPE",
265
+ "KT_INIT_ARGS",
266
+ ]
267
+ )
268
+ pointers = (
269
+ module_args["KT_FILE_PATH"],
270
+ module_args["KT_MODULE_NAME"],
271
+ module_args["KT_CLS_OR_FN_NAME"],
272
+ )
273
+
274
+ if module_args.get("KT_CALLABLE_TYPE") == "cls":
275
+ init_args = json.loads(module_args.get("KT_INIT_ARGS") or "{}")
276
+ reloaded_module = kt.Cls(name=candidate, pointers=pointers, init_args=init_args)
277
+ elif module_args.get("KT_CALLABLE_TYPE") == "fn":
278
+ reloaded_module = kt.Fn(name=candidate, pointers=pointers)
279
+ else:
280
+ raise ValueError(f"Unknown module type: {module_args.get('KT_CALLABLE_TYPE')}")
281
+
282
+ reloaded_module.service_name = candidate
283
+ reloaded_module.compute = compute
284
+ return reloaded_module
285
+
286
+ raise ValueError(
287
+ f"Service '{name}' not found in namespace '{namespace}' with reload_prefixes={reload_prefixes}"
288
+ )
289
+
290
    def _client(self, *args, **kwargs):
        """Return the client through which to interact with the remote Module.
        If compute is not yet set, attempt to reload it.

        Any args/kwargs are forwarded to ``endpoint()`` (e.g. a method name)
        when the client is first built; the client is cached afterwards, so
        later calls ignore them.
        """
        # Cached client short-circuits everything below.
        if self._http_client is not None:
            return self._http_client

        if self.compute is None or self.service_config is None:
            namespace = self.namespace
            # When rebuilding the http client on reload, need to know whether to look for a prefix
            reload_prefixes = self.reload_prefixes
            logger.debug(
                f"Attempting to reload service '{self.service_name}' in namespace '{namespace}' with "
                f"reload_prefixes={reload_prefixes}"
            )
            reloaded_module = Module.from_name(
                name=self.service_name,
                namespace=namespace,
                reload_prefixes=reload_prefixes,
            )

            # Update settable attributes with reloaded module values
            self.compute = self.compute or reloaded_module.compute
            # NOTE(review): `from_name` does not appear to populate `service_config`,
            # so this likely stays None — confirm whether that matters to callers.
            self.service_config = reloaded_module.service_config
            self.pointers = reloaded_module.pointers
            self.name = reloaded_module.name
            self.service_name = reloaded_module.service_name

        self._http_client = HTTPClient(
            base_url=self.endpoint(*args, **kwargs),
            compute=self.compute,
            service_name=self.service_name,
        )

        return self._http_client
325
+
326
+ def endpoint(self, method_name: str = None):
327
+ if not hasattr(self, "init_args"):
328
+ return f"{self.base_endpoint}/{self.module_name}"
329
+ else:
330
+ return f"{self.base_endpoint}/{self.module_name}/{method_name}"
331
+
332
+ def deploy(self):
333
+ """
334
+ Helper method to deploy modules specified by the @compute decorator. Used by `kt deploy` CLI command.
335
+ Deploys the module to the specified compute.
336
+ """
337
+ if self.compute is None:
338
+ raise ValueError("Compute must be set before deploying the module.")
339
+ return self.to(self.compute, init_args=getattr(self, "init_args", None))
340
+
341
+ async def deploy_async(self):
342
+ """
343
+ Async helper method to deploy modules specified by the @compute decorator. Used by `kt deploy` CLI command
344
+ when multiple modules are present. Deploys the module to the specified compute asynchronously.
345
+ """
346
+ if self.compute is None:
347
+ raise ValueError("Compute must be set before deploying the module.")
348
+ return await self.to_async(self.compute, init_args=getattr(self, "init_args", None))
349
+
350
+ def to(
351
+ self,
352
+ compute: "Compute",
353
+ init_args: Dict = None,
354
+ stream_logs: Union[bool, None] = None,
355
+ verbosity: Union[LogVerbosity, str] = None,
356
+ get_if_exists: bool = False,
357
+ reload_prefixes: Union[str, List[str]] = [],
358
+ dryrun: bool = False,
359
+ ):
360
+ """
361
+ Send the function or class to the specified compute.
362
+
363
+ Args:
364
+ compute (Compute): The compute to send the function or class to.
365
+ init_args (Dict, optional): Initialization arguments, which may be relevant for a class.
366
+ stream_logs (bool, optional): Whether to stream logs during service launch. If None, uses the global
367
+ config value.
368
+ verbosity (Union[verbosity, str], optional): Verbosity of the logs streamed back to the client.
369
+ If not specified, will stream select service logs. Can also be controlled globally via the config
370
+ value `log_verbosity`. Supported values: "debug", "info", "critical".
371
+ get_if_exists (Union[bool, List[str]], optional): Controls how service lookup is performed to determine
372
+ whether to send the service to the compute.
373
+
374
+ - If False (default): Do not attempt to reload the service.
375
+ - If True: Attempt to find an existing service using a standard fallback order
376
+ (e.g., username, git branch, then prod). If found, re-use that existing service.
377
+ reload_prefixes (Union[str, List[str]], optional): A list of prefixes to use when reloading the function
378
+ (e.g., ["qa", "prod", "git-branch-name"]). If not provided, will use the current username,
379
+ git branch, and prod.
380
+ dryrun (bool, optional): Whether to setup and return the object as a dryrun (``True``),
381
+ or to actually launch the compute and service (``False``).
382
+ Returns:
383
+ Module: The module instance.
384
+
385
+ Example:
386
+
387
+ .. code-block:: python
388
+
389
+ import kubetorch as kt
390
+
391
+ remote_cls = kt.cls(SlowNumpyArray, name=name).to(
392
+ kt.Compute(cpus=".1"),
393
+ init_args={"size": 10},
394
+ stream_logs=True
395
+ )
396
+ """
397
+ if not has_k8s_credentials():
398
+ raise KubernetesCredentialsError(
399
+ "Kubernetes credentials not found. Please ensure you are running in a Kubernetes cluster or have a valid kubeconfig file."
400
+ )
401
+
402
+ if get_if_exists:
403
+ try:
404
+ existing_service = self._get_existing_service(reload_prefixes)
405
+ if existing_service:
406
+ logger.debug(f"Reusing existing service: {existing_service.service_name}")
407
+ return existing_service
408
+ except Exception as e:
409
+ logger.debug(
410
+ f"Service {self.service_name} not found in namespace {compute.namespace} "
411
+ f"with reload_prefixes={reload_prefixes}: {str(e)}"
412
+ )
413
+
414
+ self.compute = compute
415
+ self.compute.service_name = self.service_name
416
+
417
+ if hasattr(self, "init_args"):
418
+ self.init_args = init_args
419
+
420
+ # We need the deployment timestamp at the start of the update so we know that artifacts deployed **after**
421
+ # this time are part of the current deployment. We actually set it at the end to ensure that the deployment is
422
+ # successful.
423
+ logger.debug(f"Deploying module: {self.service_name}")
424
+ deployment_timestamp = datetime.now(timezone.utc).isoformat()
425
+ install_url, use_editable = get_kt_install_url(self.compute.freeze)
426
+
427
+ if not dryrun and not self.compute.freeze:
428
+ self._rsync_repo_and_image_patches(install_url, use_editable, init_args)
429
+
430
+ self._launch_service(
431
+ install_url,
432
+ use_editable,
433
+ init_args,
434
+ deployment_timestamp,
435
+ stream_logs,
436
+ verbosity,
437
+ dryrun,
438
+ )
439
+
440
+ return self
441
+
442
+ async def to_async(
443
+ self,
444
+ compute: "Compute",
445
+ init_args: Dict = None,
446
+ stream_logs: Union[bool, None] = None,
447
+ verbosity: Union[LogVerbosity, str] = None,
448
+ get_if_exists: bool = False,
449
+ reload_prefixes: Union[str, List[str]] = [],
450
+ dryrun: bool = False,
451
+ ):
452
+ """
453
+ Async version of the `.to` method. Send the function or class to the specified compute asynchronously.
454
+
455
+ Args:
456
+ compute (Compute): The compute to send the function or class to.
457
+ init_args (Dict, optional): Initialization arguments, which may be relevant for a class.
458
+ stream_logs (bool, optional): Whether to stream logs during service launch. If None, uses the global
459
+ config value.
460
+ verbosity (Union[verbosity, str], optional): Verbosity of the logs streamed back to the client.
461
+ If not specified, will stream select service logs. Can also be controlled globally via the config
462
+ value `log_verbosity`. Supported values: "debug", "info", "critical".
463
+ get_if_exists (Union[bool, List[str]], optional): Controls how service lookup is performed to determine
464
+ whether to send the service to the compute.
465
+
466
+ - If False (default): Do not attempt to reload the service.
467
+ - If True: Attempt to find an existing service using a standard fallback order
468
+ (e.g., username, git branch, then prod). If found, re-use that existing service.
469
+ reload_prefixes (Union[str, List[str]], optional): A list of prefixes to use when reloading the function
470
+ (e.g., ["qa", "prod", "git-branch-name"]). If not provided, will use the current username,
471
+ git branch, and prod.
472
+ dryrun (bool, optional): Whether to setup and return the object as a dryrun (``True``),
473
+ or to actually launch the compute and service (``False``).
474
+ Returns:
475
+ Module: The module instance.
476
+
477
+ Example:
478
+
479
+ .. code-block:: python
480
+
481
+ import kubetorch as kt
482
+
483
+ remote_cls = await kt.cls(SlowNumpyArray, name=name).to_async(
484
+ kt.Compute(cpus=".1"),
485
+ init_args={"size": 10},
486
+ stream_logs=True
487
+ )
488
+ """
489
+ if get_if_exists:
490
+ try:
491
+ existing_service = await self._get_existing_service_async(reload_prefixes)
492
+ if existing_service:
493
+ logger.debug(f"Reusing existing service: {existing_service.service_name}")
494
+ return existing_service
495
+ except Exception as e:
496
+ logger.info(
497
+ f"Service {self.compute.service_name} not found in namespace {self.compute.namespace} "
498
+ f"with reload_prefixes={reload_prefixes}: {str(e)}"
499
+ )
500
+
501
+ self.compute = compute
502
+ self.compute.service_name = self.service_name
503
+
504
+ if hasattr(self, "init_args"):
505
+ self.init_args = init_args
506
+
507
+ logger.debug(f"Deploying module: {self.service_name}")
508
+ deployment_timestamp = datetime.now(timezone.utc).isoformat()
509
+ install_url, use_editable = get_kt_install_url(self.compute.freeze)
510
+
511
+ if not dryrun and not self.compute.freeze:
512
+ await self._rsync_repo_and_image_patches_async(install_url, use_editable, init_args)
513
+
514
+ await self._launch_service_async(
515
+ install_url,
516
+ use_editable,
517
+ init_args,
518
+ deployment_timestamp,
519
+ stream_logs,
520
+ verbosity,
521
+ dryrun,
522
+ )
523
+
524
+ return self
525
+
526
+ def _get_existing_service(self, reload_prefixes):
527
+ try:
528
+ existing_service = Module.from_name(
529
+ self.service_name,
530
+ namespace=self.namespace,
531
+ reload_prefixes=reload_prefixes,
532
+ )
533
+ if existing_service:
534
+ if self.compute:
535
+ # Replace the compute object, if the user has already constructed it locally
536
+ existing_service.compute = self.compute
537
+ logger.info(
538
+ f"Existing service '{self.service_name}' found in namespace '{self.namespace}', not "
539
+ f"redeploying."
540
+ )
541
+ return existing_service
542
+ except Exception as e:
543
+ raise ValueError(
544
+ f"Failed to reload service {self.service_name} in namespace {self.namespace} "
545
+ f"and reload_prefixes={reload_prefixes}: {str(e)}"
546
+ )
547
+
548
+ async def _get_existing_service_async(self, reload_prefixes):
549
+ try:
550
+ existing_service = Module.from_name(
551
+ self.service_name,
552
+ namespace=self.namespace,
553
+ reload_prefixes=reload_prefixes,
554
+ )
555
+ if existing_service:
556
+ if self.compute:
557
+ # Replace the compute object, if the user has already constructed it locally
558
+ existing_service.compute = self.compute
559
+ logger.info(
560
+ f"Existing service '{self.service_name}' found in namespace '{self.namespace}', not "
561
+ f"redeploying."
562
+ )
563
+ return existing_service
564
+ except Exception as e:
565
+ raise ValueError(
566
+ f"Failed to reload service {self.service_name} in namespace {self.namespace} "
567
+ f"and reload_prefixes={reload_prefixes}: {str(e)}"
568
+ )
569
+
570
+ def _get_rsync_dirs_and_dockerfile(self, install_url, use_editable, init_args):
571
+ source_dir, has_kt_dir = locate_working_dir(self.pointers[0])
572
+ rsync_dirs = [str(source_dir)]
573
+ if not has_kt_dir:
574
+ # Use the source file (.py) instead of directory
575
+ source_file = Path(f"{self.pointers[0]}/{self.pointers[1]}.py")
576
+ rsync_dirs = [str(source_file)]
577
+ logger.info(f"Package root not found; syncing file {source_file}")
578
+ else:
579
+ logger.info(f"Package root identified at {source_dir}; syncing directory")
580
+
581
+ if install_url.endswith(".whl") or (use_editable and install_url != str(source_dir)):
582
+ rsync_dirs.append(install_url)
583
+
584
+ pointer_env_vars = self._get_pointer_env_vars(self.remote_pointers)
585
+ metadata_env_vars = self._get_metadata_env_vars(init_args)
586
+ service_dockerfile = self._get_service_dockerfile({**pointer_env_vars, **metadata_env_vars})
587
+ return rsync_dirs, service_dockerfile
588
+
589
+ def _rsync_repo_and_image_patches(self, install_url, use_editable, init_args):
590
+ logger.debug("Rsyncing data to the rsync pod")
591
+ rsync_dirs, service_dockerfile = self._get_rsync_dirs_and_dockerfile(install_url, use_editable, init_args)
592
+ self._construct_and_rsync_files(rsync_dirs, service_dockerfile)
593
+ logger.debug(f"Rsync completed for service {self.service_name}")
594
+
595
+ async def _rsync_repo_and_image_patches_async(self, install_url, use_editable, init_args):
596
+ logger.debug("Rsyncing data to the rsync pod")
597
+ rsync_dirs, service_dockerfile = self._get_rsync_dirs_and_dockerfile(install_url, use_editable, init_args)
598
+ await self._construct_and_rsync_files_async(rsync_dirs, service_dockerfile)
599
+ logger.debug(f"Rsync completed for service {self.service_name}")
600
+
601
    def _launch_service(
        self,
        install_url,
        use_editable,
        init_args,
        deployment_timestamp,
        stream_logs,
        verbosity,
        dryrun,
    ):
        """Launch the service on the compute, optionally streaming launch logs.

        Blocks until the service (and its HTTP server) reports ready unless
        `dryrun`. Log streaming runs on a daemon thread and is signaled to stop
        in the `finally` block.
        """
        # Start log streaming if enabled
        stop_event = threading.Event()
        log_thread = None
        if stream_logs is None:
            stream_logs = config.stream_logs or False

        launch_request_id = "-"
        if stream_logs and not dryrun:
            if verbosity is None:
                verbosity = config.log_verbosity

            # Create a unique request ID for this launch sequence
            launch_request_id = f"launch_{generate_unique_request_id('launch', deployment_timestamp)}"

            # Start log streaming in a separate thread
            log_thread = threading.Thread(
                target=self._stream_launch_logs,
                args=(
                    launch_request_id,
                    stop_event,
                    verbosity,
                    deployment_timestamp,
                ),
            )
            # Daemon so a stuck stream can't block interpreter exit.
            log_thread.daemon = True
            log_thread.start()

        try:
            startup_rsync_command = self._startup_rsync_command(use_editable, install_url, dryrun)

            # Launch the compute in the form of a service with the requested resources
            service_config = self.compute._launch(
                service_name=self.compute.service_name,
                install_url=install_url if not use_editable else None,
                pointer_env_vars=self._get_pointer_env_vars(self.remote_pointers),
                metadata_env_vars=self._get_metadata_env_vars(init_args),
                startup_rsync_command=startup_rsync_command,
                launch_id=launch_request_id,
                dryrun=dryrun,
            )
            self.service_config = service_config

            # Annotate only after a successful launch, so artifacts deployed after
            # this timestamp are attributed to the current deployment.
            if not self.compute.freeze and not dryrun:
                self.deployment_timestamp = self.compute.service_manager.update_deployment_timestamp_annotation(
                    service_name=self.service_name,
                    new_timestamp=deployment_timestamp,
                )
            if not dryrun:
                self.compute._check_service_ready()
                # Additional health check to ensure HTTP server is ready
                self._wait_for_http_health()
        finally:
            # Stop log streaming
            # NOTE(review): thread is signaled but never joined, so a few log lines
            # may still print after this method returns — confirm acceptable.
            if log_thread:
                stop_event.set()
666
+
667
    async def _launch_service_async(
        self,
        install_url,
        use_editable,
        init_args,
        deployment_timestamp,
        stream_logs,
        verbosity,
        dryrun,
    ):
        """Async variant of `_launch_service`: launch the service, optionally
        streaming launch logs as an asyncio task.

        The log task is signaled to stop, waited on briefly, then cancelled if
        still running.
        """
        # Start log streaming if enabled
        stop_event = asyncio.Event()
        log_task = None
        if stream_logs is None:
            stream_logs = config.stream_logs or False

        launch_request_id = "-"
        if stream_logs and not dryrun:
            if verbosity is None:
                verbosity = config.log_verbosity

            # Create a unique request ID for this launch sequence
            launch_request_id = f"launch_{generate_unique_request_id('launch', deployment_timestamp)}"

            # Start log streaming as an async task
            log_task = asyncio.create_task(
                self._stream_launch_logs_async(
                    launch_request_id,
                    stop_event,
                    verbosity,
                    deployment_timestamp,
                )
            )

        try:
            startup_rsync_command = self._startup_rsync_command(use_editable, install_url, dryrun)

            # Launch the compute in the form of a service with the requested resources
            # Use the async version of _launch
            service_config = await self.compute._launch_async(
                service_name=self.compute.service_name,
                install_url=install_url if not use_editable else None,
                pointer_env_vars=self._get_pointer_env_vars(self.remote_pointers),
                metadata_env_vars=self._get_metadata_env_vars(init_args),
                startup_rsync_command=startup_rsync_command,
                launch_id=launch_request_id,
                dryrun=dryrun,
            )
            self.service_config = service_config

            # Annotate only after a successful launch (same rationale as the sync path).
            if not self.compute.freeze and not dryrun:
                self.deployment_timestamp = self.compute.service_manager.update_deployment_timestamp_annotation(
                    service_name=self.service_name,
                    new_timestamp=deployment_timestamp,
                )
            if not dryrun:
                await self.compute._check_service_ready_async()
                await self._wait_for_http_health_async()
        finally:
            # Stop log streaming
            if log_task:
                stop_event.set()
                # Give the task a grace period to drain, then cancel it.
                try:
                    await asyncio.wait_for(log_task, timeout=2.0)
                except asyncio.TimeoutError:
                    log_task.cancel()
                    try:
                        await log_task
                    except asyncio.CancelledError:
                        pass
737
+
738
    def _get_service_dockerfile(self, metadata_env_vars):
        """Build the Dockerfile text for this service.

        Starts from the compute's image setup instructions and, when any exist,
        appends one ENV line per metadata entry (dict values are JSON-encoded so
        they fit on a single line). Returns the (possibly empty/falsy) text.
        """
        image_instructions = self.compute._image_setup_and_instructions()

        # Only append ENV lines when there are image instructions to build on;
        # otherwise the falsy result is returned unchanged.
        if image_instructions:
            image_instructions += "\n"
            for key, val in metadata_env_vars.items():
                if isinstance(val, Dict):
                    # Dict values must be serialized to a single-line string.
                    val = json.dumps(val)
                # NOTE(review): space-separated `ENV key value` is the legacy
                # Dockerfile form; `ENV key=value` is the modern equivalent.
                image_instructions += f"ENV {key} {val}\n"

        logger.debug(f"Generated Dockerfile for service {self.service_name}:\n{image_instructions}")
        return image_instructions
750
+
751
+ def _construct_and_rsync_files(self, rsync_dirs, service_dockerfile):
752
+ with tempfile.TemporaryDirectory() as tmpdir:
753
+ temp_file = Path(tmpdir) / ".kt" / "image.dockerfile"
754
+ temp_file.parent.mkdir(parents=True, exist_ok=True)
755
+ temp_file.write_text(service_dockerfile)
756
+
757
+ source_dir = str(Path(tmpdir) / ".kt")
758
+ rsync_dirs.append(source_dir)
759
+
760
+ logger.debug(f"Rsyncing directories: {rsync_dirs}")
761
+ if is_running_in_kubernetes():
762
+ self.compute.rsync_in_cluster(rsync_dirs)
763
+ else:
764
+ self.compute.rsync(rsync_dirs)
765
+
766
+ async def _construct_and_rsync_files_async(self, rsync_dirs, service_dockerfile):
767
+ with tempfile.TemporaryDirectory() as tmpdir:
768
+ temp_file = Path(tmpdir) / ".kt" / "image.dockerfile"
769
+ temp_file.parent.mkdir(parents=True, exist_ok=True)
770
+ temp_file.write_text(service_dockerfile)
771
+
772
+ source_dir = str(Path(tmpdir) / ".kt")
773
+ rsync_dirs.append(source_dir)
774
+
775
+ logger.debug(f"Rsyncing directories: {rsync_dirs}")
776
+ if is_running_in_kubernetes():
777
+ await self.compute.rsync_in_cluster_async(rsync_dirs)
778
+ else:
779
+ await self.compute.rsync_async(rsync_dirs)
780
+
781
+ def _startup_rsync_command(self, use_editable, install_url, dryrun):
782
+ if dryrun:
783
+ return None
784
+
785
+ if use_editable or (install_url and install_url.endswith(".whl")):
786
+ # rsync from the rsync pod's file system directly
787
+ startup_cmd = self.compute._rsync_svc_url()
788
+ cmd = f"rsync -av {startup_cmd} ."
789
+ return cmd
790
+
791
+ return None
792
+
793
+ def teardown(self):
794
+ """Delete the service and all associated resources."""
795
+ logger.info(f"Deleting service: {self.service_name}")
796
+
797
+ # Use the compute's service manager - it already knows the correct type!
798
+ teardown_success = self.compute.service_manager.teardown_service(
799
+ service_name=self.service_name,
800
+ )
801
+
802
+ if not teardown_success:
803
+ logger.error(f"Failed to teardown service {self.service_name}")
804
+ return
805
+
806
+ configmaps = load_configmaps(
807
+ core_api=self.compute.core_api,
808
+ service_name=self.service_name,
809
+ namespace=self.compute.namespace,
810
+ )
811
+ if configmaps:
812
+ logger.info(f"Deleting {len(configmaps)} configmap{'' if len(configmaps) == 1 else 's'}")
813
+ delete_configmaps(
814
+ core_api=self.compute.core_api,
815
+ configmaps=configmaps,
816
+ namespace=self.compute.namespace,
817
+ )
818
+
819
+ logger.info("Deleting service data from cache in rsync pod")
820
+ delete_cached_service_data(
821
+ core_api=self.compute.core_api,
822
+ service_name=self.service_name,
823
+ namespace=self.compute.namespace,
824
+ )
825
+
826
+ def _get_pointer_env_vars(self, remote_pointers):
827
+ (container_file_path, module_name, cls_or_fn_name) = remote_pointers
828
+ return {
829
+ "KT_FILE_PATH": container_file_path,
830
+ "KT_MODULE_NAME": module_name,
831
+ "KT_CLS_OR_FN_NAME": cls_or_fn_name,
832
+ }
833
+
834
+ def _get_metadata_env_vars(
835
+ self,
836
+ init_args: Dict,
837
+ ) -> Dict:
838
+ # TODO: add other callable metadata in addition to pointers (`is_generator`, `is_async`, etc.)
839
+ import json
840
+
841
+ distributed_config = self.compute.distributed_config
842
+ return {
843
+ "KT_INIT_ARGS": init_args,
844
+ "KT_CALLABLE_TYPE": self.MODULE_TYPE,
845
+ "KT_DISTRIBUTED_CONFIG": json.dumps(distributed_config) if distributed_config else None,
846
+ }
847
+
848
    def _stream_launch_logs(
        self,
        request_id: str,
        stop_event: threading.Event,
        verbosity: LogVerbosity,
        deployment_timestamp: str,
    ):
        """Stream logs and events during service launch sequence.

        Builds two Loki queries (pod logs scoped to this launch's request_id,
        and k8s events for this service), then runs each stream on its own
        daemon thread via ``_run_log_stream``. Returns shortly after starting
        the threads; the caller stops streaming by setting ``stop_event``.
        """
        try:
            # Only use "kubetorch" container to exclude queue-proxy (e.g. Knative sidecars) container logs which
            # are spammy with tons of healthcheck calls
            pod_query = f'{{k8s_container_name="kubetorch"}} | json | request_id="{request_id}"'
            event_query = f'{{service_name="unknown_service"}} | json | k8s_object_name=~"{self.service_name}.*" | k8s_namespace_name="{self.namespace}"'

            # Queries are passed in a websocket URL query string, so they must be URL-encoded.
            encoded_pod_query = urllib.parse.quote_plus(pod_query)
            encoded_event_query = urllib.parse.quote_plus(event_query)
            logger.debug(f"Streaming launch logs and events for service {self.service_name}")

            def start_log_threads(host, port):
                # Each closure runs one websocket tail stream on a fresh event
                # loop inside its own thread (see _run_log_stream).
                def run_pod_logs():
                    self._run_log_stream(
                        request_id,
                        stop_event,
                        host,
                        port,
                        encoded_pod_query,
                        verbosity,
                        deployment_timestamp,
                        dedup=True,  # pod logs can repeat across replicas
                    )

                def run_event_logs():
                    self._run_log_stream(
                        request_id,
                        stop_event,
                        host,
                        port,
                        encoded_event_query,
                        verbosity,
                        deployment_timestamp,
                    )

                # Daemon threads so streaming never blocks interpreter exit.
                pod_thread = threading.Thread(target=run_pod_logs, daemon=True)
                event_thread = threading.Thread(target=run_event_logs, daemon=True)

                pod_thread.start()
                event_thread.start()

                # Don't block indefinitely on joins - use short timeouts
                # (the threads keep streaming in the background after this returns).
                pod_thread.join(timeout=1.0)
                event_thread.join(timeout=1.0)

            base_url = service_url()
            host, port = extract_host_port(base_url)
            logger.debug(f"Streaming launch logs with url={base_url} host={host} and local port {port}")
            start_log_threads(host, port)

        except Exception as e:
            logger.error(f"Failed to stream launch logs: {e}")
            raise e
908
+
909
    async def _stream_launch_logs_async(
        self,
        request_id: str,
        stop_event: asyncio.Event,
        verbosity: LogVerbosity,
        deployment_timestamp: str,
    ):
        """Async version of _stream_launch_logs. Stream logs and events during service launch sequence.

        Runs the pod-log and event streams as two asyncio tasks on the current
        event loop instead of worker threads, and waits for both to finish
        (they exit once ``stop_event`` is set).
        """
        try:
            # Only use "kubetorch" container to exclude queue-proxy (e.g. Knative sidecars) container logs which
            # are spammy with tons of healthcheck calls
            pod_query = f'{{k8s_container_name="kubetorch"}} | json | request_id="{request_id}"'
            event_query = f'{{service_name="unknown_service"}} | json | k8s_object_name=~"{self.service_name}.*" | k8s_namespace_name="{self.namespace}"'

            # Queries travel in a websocket URL query string, so URL-encode them.
            encoded_pod_query = urllib.parse.quote_plus(pod_query)
            encoded_event_query = urllib.parse.quote_plus(event_query)
            logger.debug(f"Streaming launch logs and events for service {self.service_name}")

            base_url = await service_url_async()
            host, port = extract_host_port(base_url)
            logger.debug(f"Streaming launch logs with url={base_url} host={host} and local port {port}")

            # Create async tasks for both log streams
            pod_task = asyncio.create_task(
                self._stream_logs_websocket(
                    request_id,
                    stop_event,
                    host=host,
                    port=port,
                    query=encoded_pod_query,
                    log_verbosity=verbosity,
                    deployment_timestamp=deployment_timestamp,
                    dedup=True,  # pod logs can repeat across replicas
                )
            )

            event_task = asyncio.create_task(
                self._stream_logs_websocket(
                    request_id,
                    stop_event,
                    host=host,
                    port=port,
                    query=encoded_event_query,
                    log_verbosity=verbosity,
                    deployment_timestamp=deployment_timestamp,
                )
            )

            # Wait for both tasks to complete or be cancelled
            try:
                # NOTE(review): with return_exceptions=True, gather folds task
                # exceptions into results rather than raising, so per-stream
                # failures are effectively swallowed here.
                await asyncio.gather(pod_task, event_task, return_exceptions=True)
            except Exception as e:
                logger.error(f"Error in async log streaming: {e}")

        except Exception as e:
            logger.error(f"Failed to stream launch logs: {e}")
            raise e
966
+
967
+ def _run_log_stream(
968
+ self,
969
+ request_id: str,
970
+ stop_event: threading.Event,
971
+ host: str,
972
+ port: int,
973
+ query: str,
974
+ log_verbosity: LogVerbosity,
975
+ deployment_timestamp: str,
976
+ dedup: bool = False,
977
+ ):
978
+ """Helper to run log streaming in an event loop"""
979
+ loop = asyncio.new_event_loop()
980
+ asyncio.set_event_loop(loop)
981
+ try:
982
+ loop.run_until_complete(
983
+ self._stream_logs_websocket(
984
+ request_id,
985
+ stop_event,
986
+ host=host,
987
+ port=port,
988
+ query=query,
989
+ log_verbosity=log_verbosity,
990
+ deployment_timestamp=deployment_timestamp,
991
+ dedup=dedup,
992
+ )
993
+ )
994
+ finally:
995
+ loop.close()
996
+
997
+ async def _run_log_stream_async(
998
+ self,
999
+ request_id: str,
1000
+ stop_event: asyncio.Event,
1001
+ host: str,
1002
+ port: int,
1003
+ query: str,
1004
+ log_verbosity: LogVerbosity,
1005
+ deployment_timestamp: str,
1006
+ dedup: bool = False,
1007
+ ):
1008
+ """Async helper to run log streaming directly in the current event loop"""
1009
+ await self._stream_logs_websocket(
1010
+ request_id,
1011
+ stop_event,
1012
+ host=host,
1013
+ port=port,
1014
+ query=query,
1015
+ log_verbosity=log_verbosity,
1016
+ deployment_timestamp=deployment_timestamp,
1017
+ dedup=dedup,
1018
+ )
1019
+
1020
    async def _stream_logs_websocket(
        self,
        request_id: str,
        stop_event: Union[threading.Event, asyncio.Event],
        host: str,
        port: int,
        query: str,
        log_verbosity: LogVerbosity,
        deployment_timestamp: str,
        dedup: bool = False,
    ):
        """Stream logs and events using Loki's websocket tail endpoint.

        Connects to ``/loki/api/v1/tail`` and prints each received entry,
        distinguishing k8s events (streams labeled with ``k8s_event_count``)
        from pod logs. Entries older than ``deployment_timestamp`` are skipped,
        noisy/expected events are filtered, and printing is gated on
        ``log_verbosity``. Runs until ``stop_event`` is set (plus a short grace
        period) or the websocket closes.

        Args:
            request_id: Fallback pod label for log lines missing a pod name.
            stop_event: Either a threading or asyncio Event; only ``is_set()``
                is used, which both types provide.
            host: Loki gateway host.
            port: Loki gateway port.
            query: URL-encoded Loki query string.
            log_verbosity: Verbosity filter for both logs and events.
            deployment_timestamp: ISO timestamp; entries before it are skipped.
            dedup: When True, suppress repeated log messages.
        """
        try:
            uri = f"ws://{host}:{port}/loki/api/v1/tail?query={query}"

            # Track the last timestamp we've seen to avoid duplicates
            last_timestamp = None

            # Track when we should stop
            stop_time = None

            # Track most recent deployment timestamp to filter out old logs / events
            start_timestamp = iso_timestamp_to_nanoseconds(deployment_timestamp)

            # Event messages already printed (events are shown at most once).
            shown_event_messages = set()

            # Track seen log messages for deduplication
            seen_log_messages = set() if dedup else None

            # For formatting the server setup logs
            formatters = {}
            base_formatter = ServerLogsFormatter()
            websocket = None
            try:
                # Add timeout to prevent hanging connections
                websocket = await websockets.connect(
                    uri,
                    close_timeout=10,  # Max time to wait for close handshake
                    ping_interval=20,  # Send ping every 20 seconds
                    ping_timeout=10,  # Wait 10 seconds for pong
                )
                while True:
                    # If stop event is set, start counting down
                    # Handle both threading.Event and asyncio.Event
                    # NOTE(review): both branches of this ternary are identical;
                    # threading.Event and asyncio.Event both expose is_set(), so
                    # the hasattr check is redundant.
                    is_stop_set = stop_event.is_set() if hasattr(stop_event, "is_set") else stop_event.is_set()
                    if is_stop_set and stop_time is None:
                        stop_time = time.time() + 2  # 2 second grace period

                    # If we're past the grace period, exit
                    if stop_time is not None and time.time() > stop_time:
                        break

                    try:
                        # Use shorter timeout during grace period
                        timeout = 0.1 if stop_time is not None else 1.0
                        message = await asyncio.wait_for(websocket.recv(), timeout=timeout)
                        data = json.loads(message)

                        if data.get("streams"):
                            for stream in data["streams"]:
                                labels = stream.get("stream", {})
                                # Event streams carry a k8s_event_count label;
                                # everything else is treated as a pod log line.
                                is_event = "k8s_event_count" in list(labels.keys())
                                for value in stream["values"]:
                                    # Loki values are [timestamp_ns, line] pairs.
                                    ts_ns = int(value[0])
                                    if start_timestamp is not None and ts_ns < start_timestamp:
                                        # Predates this deployment - skip.
                                        continue
                                    log_line = value[1]
                                    if is_event:
                                        event_type = labels.get("detected_level", "")
                                        if log_verbosity == LogVerbosity.CRITICAL and event_type == "Normal":
                                            # skip Normal events in MINIMAL
                                            continue

                                        try:
                                            msg = log_line
                                            reason = (labels.get("k8s_event_reason", ""),)

                                            # Note: relevant starting in release 0.1.19 (using OTel instead of Alloy)
                                            if isinstance(reason, tuple):
                                                reason = reason[0]

                                            event_type = labels.get("detected_level", "")

                                            if reason == "Unhealthy" and (
                                                "HTTP probe failed with statuscode: 503" in msg
                                                or "Startup probe failed" in msg
                                            ):
                                                # HTTP probe failures are expected during setup
                                                continue

                                            ignore_patterns = (
                                                "queue-proxy",
                                                "resolving reference: address not set for kind = service",
                                                "failed to get private k8s service endpoints:",
                                            )
                                            # Ignore queue-proxy events and gateway setup events
                                            if any(pattern in msg.lower() for pattern in ignore_patterns):
                                                continue

                                            if msg in shown_event_messages:
                                                # Only show unique event messages
                                                continue

                                            shown_event_messages.add(msg)

                                        except Exception:
                                            # If parsing fails, just print the event as is
                                            pass

                                        # Prefix the pod name only when several pods
                                        # exist and the event targets a Pod object.
                                        is_multi_pod = len(self.compute.pods()) > 1
                                        k8_object_type = labels.get("k8s_object_kind")
                                        k8_object_name = labels.get("k8s_object_name")
                                        add_pod_info = is_multi_pod and k8_object_type == "Pod"
                                        pod_info = f" | {k8_object_name} |" if add_pod_info else ""
                                        if event_type == "Normal":
                                            if log_verbosity in [
                                                LogVerbosity.INFO,
                                                LogVerbosity.DEBUG,
                                            ]:
                                                print(f'[EVENT]{pod_info} reason={reason} "{msg}"')
                                        else:
                                            print(f'[EVENT]{pod_info} type={event_type} reason={reason} "{msg}"')
                                        continue

                                    # Skip if we've already seen this timestamp
                                    if last_timestamp is not None and value[0] <= last_timestamp:
                                        continue
                                    last_timestamp = value[0]
                                    if log_verbosity in [
                                        LogVerbosity.DEBUG,
                                        LogVerbosity.INFO,
                                    ]:
                                        try:
                                            log_dict = json.loads(log_line)
                                        except json.JSONDecodeError:
                                            # setup steps pre server start are not JSON formatted
                                            log_dict = None

                                        if log_dict is not None:
                                            # at this stage we are post setup
                                            pod_name = log_dict.get("pod", request_id)
                                            levelname = log_dict.get("levelname", "INFO")
                                            ts = log_dict.get("asctime")
                                            message = log_dict.get("message", "")

                                            if (
                                                log_verbosity == LogVerbosity.CRITICAL
                                                and levelname not in ["ERROR", "CRITICAL"]
                                            ) or (log_verbosity == LogVerbosity.INFO and levelname == "DEBUG"):
                                                continue

                                            log_line = f"{levelname} | {ts} | {message}"
                                            # One formatter (color) per pod name.
                                            if pod_name not in formatters:
                                                formatters[pod_name] = ServerLogsFormatter(pod_name)
                                            formatter = formatters[pod_name]
                                        else:
                                            # streaming pre server setup logs, before we have the pod name
                                            formatter = base_formatter

                                        # Non-JSON setup lines already carry their own newline.
                                        newline = "" if log_dict is None else None
                                        formatted_line = f"{formatter.start_color}{f'({self.service_name}) '}{log_line}{formatter.reset_color}"

                                        # Check for duplicates if dedup is enabled
                                        # NOTE(review): when log_dict is None, `message`
                                        # may be unbound here on the first non-JSON line
                                        # with dedup=True - TODO confirm.
                                        if seen_log_messages is not None:
                                            if message in seen_log_messages:
                                                continue
                                            seen_log_messages.add(message)

                                        print(formatted_line, end=newline)
                    except asyncio.TimeoutError:
                        # Timeout is expected, just continue the loop
                        continue
                    except websockets.exceptions.ConnectionClosed as e:
                        logger.debug(f"WebSocket connection closed: {str(e)}")
                        break
            finally:
                if websocket:
                    try:
                        # Use wait_for to prevent hanging on close
                        await asyncio.wait_for(websocket.close(), timeout=1.0)
                    # NOTE(review): Exception already covers asyncio.TimeoutError
                    # here; the tuple is redundant but harmless.
                    except (asyncio.TimeoutError, Exception):
                        pass
        except Exception as e:
            logger.error(f"Error in websocket stream: {e}")
            raise e
        finally:
            # Ensure websocket is closed even if we didn't enter the try block
            if websocket:
                try:
                    # Use wait_for to prevent hanging on close
                    await asyncio.wait_for(websocket.close(), timeout=1.0)
                except (asyncio.TimeoutError, Exception):
                    pass
1213
+
1214
+ def _wait_for_http_health(self, timeout=60, retry_interval=0.1, backoff=2, max_interval=10):
1215
+ """Wait for the HTTP server to be ready by checking the /health endpoint.
1216
+
1217
+ Args:
1218
+ timeout: Maximum time to wait in seconds
1219
+ retry_interval: Time between health check attempts in seconds
1220
+ """
1221
+ import time
1222
+
1223
+ logger.info(f"Waiting for HTTP server to be ready for service {self.service_name}")
1224
+ start_time = time.time()
1225
+
1226
+ while time.time() - start_time < timeout:
1227
+ try:
1228
+ client = self._client()
1229
+ response = client.get(
1230
+ endpoint=f"{self.base_endpoint}/health",
1231
+ headers=self.request_headers,
1232
+ )
1233
+ if response.status_code == 200:
1234
+ logger.info(f"HTTP server is ready for service {self.service_name}")
1235
+ return
1236
+ else:
1237
+ logger.debug(f"Health check returned status {response.status_code}, retrying...")
1238
+
1239
+ except VersionMismatchError as e:
1240
+ raise e
1241
+
1242
+ except Exception as e:
1243
+ logger.debug(f"Health check failed: {e}, retrying...")
1244
+
1245
+ time.sleep(retry_interval)
1246
+ retry_interval *= backoff # Exponential backoff
1247
+ # Cap the retry interval to a maximum value
1248
+ retry_interval = min(retry_interval, max_interval)
1249
+
1250
+ # If we get here, we've timed out
1251
+ logger.warning(f"HTTP health check timed out after {timeout}s for service {self.service_name}")
1252
+
1253
+ async def _wait_for_http_health_async(self, timeout=60, retry_interval=0.1, backoff=2, max_interval=10):
1254
+ """Async version of _wait_for_http_health. Wait for the HTTP server to be ready by checking the /health endpoint.
1255
+
1256
+ Args:
1257
+ timeout: Maximum time to wait in seconds
1258
+ retry_interval: Time between health check attempts in seconds
1259
+ """
1260
+ import asyncio
1261
+
1262
+ logger.info(f"Waiting for HTTP server to be ready for service {self.service_name}")
1263
+ start_time = time.time()
1264
+
1265
+ while time.time() - start_time < timeout:
1266
+ try:
1267
+ client = self._client()
1268
+ response = client.get(
1269
+ endpoint=f"{self.base_endpoint}/health",
1270
+ headers=self.request_headers,
1271
+ )
1272
+ if response.status_code == 200:
1273
+ logger.info(f"HTTP server is ready for service {self.service_name}")
1274
+ return
1275
+ else:
1276
+ logger.debug(f"Health check returned status {response.status_code}, retrying...")
1277
+ except Exception as e:
1278
+ logger.debug(f"Health check failed: {e}, retrying...")
1279
+
1280
+ await asyncio.sleep(retry_interval)
1281
+ retry_interval *= backoff # Exponential backoff
1282
+ # Cap the retry interval to a maximum value
1283
+ retry_interval = min(retry_interval, max_interval)
1284
+
1285
+ # If we get here, we've timed out
1286
+ logger.warning(f"HTTP health check timed out after {timeout}s for service {self.service_name}")
1287
+
1288
+ def __getstate__(self):
1289
+ """Remove local stateful values before pickle serialization."""
1290
+ state = self.__dict__.copy()
1291
+ # Remove local stateful values that shouldn't be serialized
1292
+ state["_http_client"] = None
1293
+ state["_service_config"] = None
1294
+ state["_remote_pointers"] = None
1295
+ # Pointers need to be converted to not be absolute paths if we're passing
1296
+ # the service elsewhere, e.g. into another service
1297
+ state["pointers"] = self.remote_pointers
1298
+ return state
1299
+
1300
+ def __setstate__(self, state):
1301
+ """Restore state after pickle deserialization."""
1302
+ self.__dict__.update(state)
1303
+ # Reset local stateful values to None to ensure clean initialization
1304
+ self._http_client = None
1305
+ self._service_config = None
1306
+ self._remote_pointers = None
1307
+
1308
+ def __del__(self):
1309
+ if hasattr(self, "_http_client") and self._http_client is not None:
1310
+ try:
1311
+ self._http_client.close()
1312
+ except Exception as e:
1313
+ logger.debug(f"Error closing HTTPClient in Module deletion: {e}")
1314
+ finally:
1315
+ self._http_client = None