kubetorch 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kubetorch might be problematic. Click here for more details.

Files changed (93)
  1. kubetorch/__init__.py +60 -0
  2. kubetorch/cli.py +1985 -0
  3. kubetorch/cli_utils.py +1025 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +285 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +157 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +133 -0
  30. kubetorch/resources/callables/module.py +1416 -0
  31. kubetorch/resources/callables/utils.py +174 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +261 -0
  34. kubetorch/resources/compute/compute.py +2596 -0
  35. kubetorch/resources/compute/decorators.py +139 -0
  36. kubetorch/resources/compute/rbac.py +74 -0
  37. kubetorch/resources/compute/utils.py +1114 -0
  38. kubetorch/resources/compute/websocket.py +137 -0
  39. kubetorch/resources/images/__init__.py +1 -0
  40. kubetorch/resources/images/image.py +414 -0
  41. kubetorch/resources/images/images.py +74 -0
  42. kubetorch/resources/secrets/__init__.py +2 -0
  43. kubetorch/resources/secrets/kubernetes_secrets_client.py +412 -0
  44. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  45. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  46. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  47. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  48. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  49. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  50. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  51. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  52. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  53. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  54. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  55. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  56. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  57. kubetorch/resources/secrets/provider_secrets/providers.py +93 -0
  58. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  59. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  60. kubetorch/resources/secrets/secret.py +238 -0
  61. kubetorch/resources/secrets/secret_factory.py +70 -0
  62. kubetorch/resources/secrets/utils.py +209 -0
  63. kubetorch/resources/volumes/__init__.py +0 -0
  64. kubetorch/resources/volumes/volume.py +365 -0
  65. kubetorch/servers/__init__.py +0 -0
  66. kubetorch/servers/http/__init__.py +0 -0
  67. kubetorch/servers/http/distributed_utils.py +3223 -0
  68. kubetorch/servers/http/http_client.py +730 -0
  69. kubetorch/servers/http/http_server.py +1788 -0
  70. kubetorch/servers/http/server_metrics.py +278 -0
  71. kubetorch/servers/http/utils.py +728 -0
  72. kubetorch/serving/__init__.py +0 -0
  73. kubetorch/serving/autoscaling.py +173 -0
  74. kubetorch/serving/base_service_manager.py +363 -0
  75. kubetorch/serving/constants.py +83 -0
  76. kubetorch/serving/deployment_service_manager.py +478 -0
  77. kubetorch/serving/knative_service_manager.py +519 -0
  78. kubetorch/serving/raycluster_service_manager.py +582 -0
  79. kubetorch/serving/service_manager.py +18 -0
  80. kubetorch/serving/templates/deployment_template.yaml +17 -0
  81. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  82. kubetorch/serving/templates/kt_setup_template.sh.j2 +81 -0
  83. kubetorch/serving/templates/pod_template.yaml +194 -0
  84. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  85. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  86. kubetorch/serving/templates/service_template.yaml +21 -0
  87. kubetorch/serving/templates/workerset_template.yaml +36 -0
  88. kubetorch/serving/utils.py +377 -0
  89. kubetorch/utils.py +284 -0
  90. kubetorch-0.2.0.dist-info/METADATA +121 -0
  91. kubetorch-0.2.0.dist-info/RECORD +93 -0
  92. kubetorch-0.2.0.dist-info/WHEEL +4 -0
  93. kubetorch-0.2.0.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,1416 @@
1
+ import asyncio
2
+ import json
3
+ import tempfile
4
+ import threading
5
+ import time
6
+ import urllib.parse
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+ from typing import Dict, List, Union
10
+
11
+ import websockets
12
+
13
+ from kubetorch.globals import config, service_url, service_url_async
14
+ from kubetorch.logger import get_logger
15
+ from kubetorch.resources.callables.utils import (
16
+ get_names_for_reload_fallbacks,
17
+ locate_working_dir,
18
+ )
19
+
20
+ from kubetorch.resources.compute.utils import (
21
+ delete_cached_service_data,
22
+ delete_configmaps,
23
+ load_configmaps,
24
+ VersionMismatchError,
25
+ )
26
+ from kubetorch.servers.http.http_client import HTTPClient
27
+ from kubetorch.servers.http.utils import (
28
+ clean_and_validate_k8s_name,
29
+ generate_unique_request_id,
30
+ is_running_in_kubernetes,
31
+ )
32
+ from kubetorch.utils import (
33
+ extract_host_port,
34
+ get_kt_install_url,
35
+ iso_timestamp_to_nanoseconds,
36
+ LogVerbosity,
37
+ ServerLogsFormatter,
38
+ )
39
+
40
+ logger = get_logger(__name__)
41
+
42
+
43
class Module:
    """Base class for callables (functions or classes) deployed as Kubernetes services.

    A Module pairs local "pointers" to a Python callable with the Compute it is
    (or will be) deployed on, and exposes helpers for deploying, reloading, and
    calling the resulting service.
    """

    # Overridden by subclasses (e.g. fn/cls wrappers) to tag the callable type.
    MODULE_TYPE = None

    def __init__(
        self,
        name: str,
        pointers: tuple,
    ):
        """
        Args:
            name (str): User-provided name; sanitized into a valid k8s name
                (or left as None when not provided).
            pointers (tuple): ``(file_path, module_name, callable_name)``
                locating the function or class locally.
        """
        self._compute = None
        self._deployment_timestamp = None
        self._service_config = None
        self._http_client = None
        self._get_if_exists = True
        self._reload_prefixes = None
        self._serialization = "json"  # Default serialization format
        self._async = False
        self._remote_pointers = None
        self._service_name = None

        self.pointers = pointers
        self.name = (
            clean_and_validate_k8s_name(name, allow_full_length=False) if name else None
        )
66
+
67
+ @property
68
+ def module_name(self):
69
+ """Name of the function or class."""
70
+ return self.pointers[2]
71
+
72
+ @property
73
+ def reload_prefixes(self):
74
+ return self._reload_prefixes or []
75
+
76
+ @reload_prefixes.setter
77
+ def reload_prefixes(self, value: Union[str, List[str]]):
78
+ """Set the reload_prefixes property."""
79
+ if isinstance(value, (list)):
80
+ self._reload_prefixes = value
81
+ elif isinstance(value, str):
82
+ self._reload_prefixes = [value]
83
+ else:
84
+ raise ValueError("`reload_prefixes` must be a string or a list.")
85
+
86
    @property
    def namespace(self):
        """Namespace where the service is deployed.

        Prefers the attached compute's namespace; falls back to the global
        config default when no compute is set yet.
        """
        if self.compute is not None:
            return self.compute.namespace
        return config.namespace
92
+
93
    @property
    def service_name(self):
        """Name of the knative service, formatted according to k8s regex rules.

        Computed once and cached: when a username is configured and no reload
        prefixes are in play, the username is prepended (unless already present)
        so per-user deployments don't collide.
        """
        if self._service_name:
            return self._service_name

        service_name = self.name

        if (
            config.username
            and not self.reload_prefixes
            and not service_name.startswith(config.username + "-")
        ):
            service_name = f"{config.username}-{service_name}"

        self._service_name = clean_and_validate_k8s_name(
            service_name, allow_full_length=True
        )
        return self._service_name

    @service_name.setter
    def service_name(self, value: str):
        # Explicitly-set names are still sanitized into valid k8s names.
        self._service_name = clean_and_validate_k8s_name(value, allow_full_length=True)
116
+
117
+ @property
118
+ def compute(self):
119
+ """Compute object corresponding to the module."""
120
+ return self._compute
121
+
122
+ @compute.setter
123
+ def compute(self, compute: "Compute"):
124
+ self._compute = compute
125
+
126
    @property
    def deployment_timestamp(self):
        """ISO timestamp marking the current deployment.

        Lazily fetched from the service's annotation via the compute's service
        manager, then cached on the instance.
        """
        if not self._deployment_timestamp:
            self._deployment_timestamp = (
                self.compute.service_manager.get_deployment_timestamp_annotation(
                    self.service_name
                )
            )
        return self._deployment_timestamp

    @deployment_timestamp.setter
    def deployment_timestamp(self, value: str):
        # Set explicitly after a successful deploy (see _launch_service).
        self._deployment_timestamp = value
139
+
140
    @property
    def remote_pointers(self):
        """Pointers rewritten to the module's path inside the container.

        Maps the local file path onto where the synced working directory will
        live remotely (under ``compute.working_dir`` when set, otherwise a
        relative path), keeping the module and callable names unchanged.
        Cached after the first computation.
        """
        if self._remote_pointers:
            return self._remote_pointers

        source_dir = locate_working_dir(self.pointers[0])
        # Path of the module file relative to the detected working directory.
        relative_module_path = (
            Path(self.pointers[0]).expanduser().relative_to(source_dir)
        )
        source_dir_name = Path(source_dir).name
        if self.compute.working_dir is not None:
            container_module_path = str(
                Path(self.compute.working_dir) / source_dir_name / relative_module_path
            )
        else:
            # Leave it as relative path
            container_module_path = str(Path(source_dir_name) / relative_module_path)
        self._remote_pointers = (
            container_module_path,
            self.pointers[1],
            self.pointers[2],
        )
        return self._remote_pointers
163
+
164
+ @property
165
+ def service_config(self) -> dict:
166
+ """Knative service configuration loaded from Kubernetes API."""
167
+ return self._service_config
168
+
169
+ @service_config.setter
170
+ def service_config(self, value: dict):
171
+ self._service_config = value
172
+
173
    @property
    def base_endpoint(self):
        """Endpoint for the module."""
        if is_running_in_kubernetes():
            # In-cluster: use the compute's resolved endpoint, blocking until
            # it is known if it hasn't been resolved yet.
            if not self._compute.endpoint:
                return self._compute._wait_for_endpoint()
            return self._compute.endpoint
        # URL format when using the NGINX proxy
        # NOTE(review): assumes self._compute is already set here — an
        # AttributeError would surface if accessed before compute is assigned;
        # confirm callers always set compute first.
        return f"http://localhost:{self._compute.client_port()}/{self.namespace}/{self.service_name}"
182
+
183
+ @property
184
+ def request_headers(self):
185
+ if self.compute.freeze:
186
+ return {}
187
+
188
+ if self.deployment_timestamp:
189
+ return {"X-Deployed-As-Of": self.deployment_timestamp}
190
+
191
+ return {}
192
+
193
+ @property
194
+ def serialization(self):
195
+ """Default serialization format for this module."""
196
+ return self._serialization
197
+
198
+ @serialization.setter
199
+ def serialization(self, value: str):
200
+ """Set the default serialization format for this module."""
201
+ if value not in ["json", "pickle"]:
202
+ raise ValueError("Serialization must be 'json' or 'pickle'")
203
+ self._serialization = value
204
+
205
+ @property
206
+ def async_(self):
207
+ """Whether to run the function or class methods in async mode."""
208
+ return self._async
209
+
210
+ @async_.setter
211
+ def async_(self, value: bool):
212
+ if not isinstance(value, bool):
213
+ raise ValueError("`async_` must be a boolean")
214
+ self._async = value
215
+
216
+ @classmethod
217
+ def from_name(
218
+ cls,
219
+ name: str,
220
+ namespace: str = None,
221
+ reload_prefixes: Union[str, List[str]] = [],
222
+ ):
223
+ """Reload an existing callable by its service name."""
224
+ from kubernetes import client
225
+ from kubernetes.config import (
226
+ ConfigException,
227
+ load_incluster_config,
228
+ load_kube_config,
229
+ )
230
+
231
+ import kubetorch as kt
232
+
233
+ try:
234
+ load_incluster_config()
235
+ except ConfigException:
236
+ load_kube_config()
237
+ objects_api = client.CustomObjectsApi()
238
+ apps_v1_api = client.AppsV1Api()
239
+ core_v1_api = client.CoreV1Api()
240
+
241
+ namespace = namespace or config.namespace
242
+ if isinstance(reload_prefixes, str):
243
+ reload_prefixes = [reload_prefixes]
244
+ potential_names = get_names_for_reload_fallbacks(
245
+ name=name, prefixes=reload_prefixes
246
+ )
247
+
248
+ # Use unified service discovery from BaseServiceManager
249
+ from kubetorch.serving.service_manager import BaseServiceManager
250
+
251
+ all_services = BaseServiceManager.discover_services_static(
252
+ namespace=namespace, objects_api=objects_api, apps_v1_api=apps_v1_api
253
+ )
254
+
255
+ # Create name-to-service lookup for efficient searching
256
+ service_dict = {svc["name"]: svc for svc in all_services}
257
+
258
+ # Try to find the first matching service across all service types
259
+ for candidate in potential_names:
260
+
261
+ service_info = service_dict.get(candidate)
262
+ if service_info is None:
263
+ continue
264
+
265
+ compute = kt.Compute.from_template(service_info)
266
+
267
+ pods = core_v1_api.list_namespaced_pod(
268
+ namespace=namespace,
269
+ label_selector=f"kubetorch.com/service={name}",
270
+ )
271
+ volumes = []
272
+
273
+ # TODO: handle case where service is scaled to 0?
274
+ if pods.items:
275
+ # Use runtime Pod spec
276
+ pod = pods.items[0]
277
+ for v in pod.spec.volumes or []:
278
+ if v.persistent_volume_claim:
279
+ existing_volume = kt.Volume.from_name(name=v.name)
280
+ volumes.append(existing_volume)
281
+
282
+ module_args = compute.get_env_vars(
283
+ [
284
+ "KT_FILE_PATH",
285
+ "KT_MODULE_NAME",
286
+ "KT_CLS_OR_FN_NAME",
287
+ "KT_CALLABLE_TYPE",
288
+ "KT_INIT_ARGS",
289
+ ]
290
+ )
291
+ pointers = (
292
+ module_args["KT_FILE_PATH"],
293
+ module_args["KT_MODULE_NAME"],
294
+ module_args["KT_CLS_OR_FN_NAME"],
295
+ )
296
+
297
+ if module_args.get("KT_CALLABLE_TYPE") == "cls":
298
+ init_args = json.loads(module_args.get("KT_INIT_ARGS") or "{}")
299
+ reloaded_module = kt.Cls(
300
+ name=candidate, pointers=pointers, init_args=init_args
301
+ )
302
+ elif module_args.get("KT_CALLABLE_TYPE") == "fn":
303
+ reloaded_module = kt.Fn(name=candidate, pointers=pointers)
304
+ else:
305
+ raise ValueError(
306
+ f"Unknown module type: {module_args.get('KT_CALLABLE_TYPE')}"
307
+ )
308
+
309
+ reloaded_module.service_name = candidate
310
+ reloaded_module.compute = compute
311
+ return reloaded_module
312
+
313
+ raise ValueError(
314
+ f"Service '{name}' not found in namespace '{namespace}' with reload_prefixes={reload_prefixes}"
315
+ )
316
+
317
    def _client(self, *args, **kwargs):
        """Return the client through which to interact with the remote Module.
        If compute is not yet set, attempt to reload it.

        The HTTP client is constructed once and cached on the instance; *args
        and **kwargs are forwarded to ``endpoint()`` (e.g. a method name).
        """
        if self._http_client is not None:
            return self._http_client

        if self.compute is None or self.service_config is None:
            namespace = self.namespace
            # When rebuilding the http client on reload, need to know whether to look for a prefix
            reload_prefixes = self.reload_prefixes
            logger.debug(
                f"Attempting to reload service '{self.service_name}' in namespace '{namespace}' with "
                f"reload_prefixes={reload_prefixes}"
            )
            reloaded_module = Module.from_name(
                name=self.service_name,
                namespace=namespace,
                reload_prefixes=reload_prefixes,
            )

            # Update settable attributes with reloaded module values
            # (a locally-constructed compute, when present, takes precedence).
            self.compute = self.compute or reloaded_module.compute
            self.service_config = reloaded_module.service_config
            self.pointers = reloaded_module.pointers
            self.name = reloaded_module.name
            self.service_name = reloaded_module.service_name

        self._http_client = HTTPClient(
            base_url=self.endpoint(*args, **kwargs),
            compute=self.compute,
            service_name=self.service_name,
        )

        return self._http_client
352
+
353
+ def endpoint(self, method_name: str = None):
354
+ if not hasattr(self, "init_args"):
355
+ return f"{self.base_endpoint}/{self.module_name}"
356
+ else:
357
+ return f"{self.base_endpoint}/{self.module_name}/{method_name}"
358
+
359
+ def deploy(self):
360
+ """
361
+ Helper method to deploy modules specified by the @compute decorator. Used by `kt deploy` CLI command.
362
+ Deploys the module to the specified compute.
363
+ """
364
+ if self.compute is None:
365
+ raise ValueError("Compute must be set before deploying the module.")
366
+ return self.to(self.compute, init_args=getattr(self, "init_args", None))
367
+
368
+ async def deploy_async(self):
369
+ """
370
+ Async helper method to deploy modules specified by the @compute decorator. Used by `kt deploy` CLI command
371
+ when multiple modules are present. Deploys the module to the specified compute asynchronously.
372
+ """
373
+ if self.compute is None:
374
+ raise ValueError("Compute must be set before deploying the module.")
375
+ return await self.to_async(
376
+ self.compute, init_args=getattr(self, "init_args", None)
377
+ )
378
+
379
+ def to(
380
+ self,
381
+ compute: "Compute",
382
+ init_args: Dict = None,
383
+ stream_logs: Union[bool, None] = None,
384
+ verbosity: Union[LogVerbosity, str] = None,
385
+ get_if_exists: bool = False,
386
+ reload_prefixes: Union[str, List[str]] = [],
387
+ dryrun: bool = False,
388
+ ):
389
+ """
390
+ Send the function or class to the specified compute.
391
+
392
+ Args:
393
+ compute (Compute): The compute to send the function or class to.
394
+ init_args (Dict, optional): Initialization arguments, which may be relevant for a class.
395
+ stream_logs (bool, optional): Whether to stream logs during service launch. If None, uses the global
396
+ config value.
397
+ verbosity (Union[verbosity, str], optional): Verbosity of the logs streamed back to the client.
398
+ If not specified, will stream select service logs. Can also be controlled globally via the config
399
+ value `log_verbosity`. Supported values: "debug", "info", "critical".
400
+ get_if_exists (Union[bool, List[str]], optional): Controls how service lookup is performed to determine
401
+ whether to send the service to the compute.
402
+
403
+ - If False (default): Do not attempt to reload the service.
404
+ - If True: Attempt to find an existing service using a standard fallback order
405
+ (e.g., username, git branch, then prod). If found, re-use that existing service.
406
+ reload_prefixes (Union[str, List[str]], optional): A list of prefixes to use when reloading the function
407
+ (e.g., ["qa", "prod", "git-branch-name"]). If not provided, will use the current username,
408
+ git branch, and prod.
409
+ dryrun (bool, optional): Whether to setup and return the object as a dryrun (``True``),
410
+ or to actually launch the compute and service (``False``).
411
+ Returns:
412
+ Module: The module instance.
413
+
414
+ Example:
415
+
416
+ .. code-block:: python
417
+
418
+ import kubetorch as kt
419
+
420
+ remote_cls = kt.cls(SlowNumpyArray, name=name).to(
421
+ kt.Compute(cpus=".1"),
422
+ init_args={"size": 10},
423
+ stream_logs=True
424
+ )
425
+ """
426
+ if get_if_exists:
427
+ try:
428
+ existing_service = self._get_existing_service(reload_prefixes)
429
+ if existing_service:
430
+ logger.debug(
431
+ f"Reusing existing service: {existing_service.service_name}"
432
+ )
433
+ return existing_service
434
+ except Exception as e:
435
+ logger.info(
436
+ f"Service {self.service_name} not found in namespace {self.compute.namespace} "
437
+ f"with reload_prefixes={reload_prefixes}: {str(e)}"
438
+ )
439
+
440
+ self.compute = compute
441
+ self.compute.service_name = self.service_name
442
+
443
+ if hasattr(self, "init_args"):
444
+ self.init_args = init_args
445
+
446
+ # We need the deployment timestamp at the start of the update so we know that artifacts deployed **after**
447
+ # this time are part of the current deployment. We actually set it at the end to ensure that the deployment is
448
+ # successful.
449
+ logger.debug(f"Deploying module: {self.service_name}")
450
+ deployment_timestamp = datetime.now(timezone.utc).isoformat()
451
+ install_url, use_editable = get_kt_install_url(self.compute.freeze)
452
+
453
+ if not dryrun and not self.compute.freeze:
454
+ self._rsync_repo_and_image_patches(install_url, use_editable, init_args)
455
+
456
+ self._launch_service(
457
+ install_url,
458
+ use_editable,
459
+ init_args,
460
+ deployment_timestamp,
461
+ stream_logs,
462
+ verbosity,
463
+ dryrun,
464
+ )
465
+
466
+ return self
467
+
468
+ async def to_async(
469
+ self,
470
+ compute: "Compute",
471
+ init_args: Dict = None,
472
+ stream_logs: Union[bool, None] = None,
473
+ verbosity: Union[LogVerbosity, str] = None,
474
+ get_if_exists: bool = False,
475
+ reload_prefixes: Union[str, List[str]] = [],
476
+ dryrun: bool = False,
477
+ ):
478
+ """
479
+ Async version of the `.to` method. Send the function or class to the specified compute asynchronously.
480
+
481
+ Args:
482
+ compute (Compute): The compute to send the function or class to.
483
+ init_args (Dict, optional): Initialization arguments, which may be relevant for a class.
484
+ stream_logs (bool, optional): Whether to stream logs during service launch. If None, uses the global
485
+ config value.
486
+ verbosity (Union[verbosity, str], optional): Verbosity of the logs streamed back to the client.
487
+ If not specified, will stream select service logs. Can also be controlled globally via the config
488
+ value `log_verbosity`. Supported values: "debug", "info", "critical".
489
+ get_if_exists (Union[bool, List[str]], optional): Controls how service lookup is performed to determine
490
+ whether to send the service to the compute.
491
+
492
+ - If False (default): Do not attempt to reload the service.
493
+ - If True: Attempt to find an existing service using a standard fallback order
494
+ (e.g., username, git branch, then prod). If found, re-use that existing service.
495
+ reload_prefixes (Union[str, List[str]], optional): A list of prefixes to use when reloading the function
496
+ (e.g., ["qa", "prod", "git-branch-name"]). If not provided, will use the current username,
497
+ git branch, and prod.
498
+ dryrun (bool, optional): Whether to setup and return the object as a dryrun (``True``),
499
+ or to actually launch the compute and service (``False``).
500
+ Returns:
501
+ Module: The module instance.
502
+
503
+ Example:
504
+
505
+ .. code-block:: python
506
+
507
+ import kubetorch as kt
508
+
509
+ remote_cls = await kt.cls(SlowNumpyArray, name=name).to_async(
510
+ kt.Compute(cpus=".1"),
511
+ init_args={"size": 10},
512
+ stream_logs=True
513
+ )
514
+ """
515
+ if get_if_exists:
516
+ try:
517
+ existing_service = await self._get_existing_service_async(
518
+ reload_prefixes
519
+ )
520
+ if existing_service:
521
+ logger.debug(
522
+ f"Reusing existing service: {existing_service.service_name}"
523
+ )
524
+ return existing_service
525
+ except Exception as e:
526
+ logger.info(
527
+ f"Service {self.compute.service_name} not found in namespace {self.compute.namespace} "
528
+ f"with reload_prefixes={reload_prefixes}: {str(e)}"
529
+ )
530
+
531
+ self.compute = compute
532
+ self.compute.service_name = self.service_name
533
+
534
+ if hasattr(self, "init_args"):
535
+ self.init_args = init_args
536
+
537
+ logger.debug(f"Deploying module: {self.service_name}")
538
+ deployment_timestamp = datetime.now(timezone.utc).isoformat()
539
+ install_url, use_editable = get_kt_install_url(self.compute.freeze)
540
+
541
+ if not dryrun and not self.compute.freeze:
542
+ await self._rsync_repo_and_image_patches_async(
543
+ install_url, use_editable, init_args
544
+ )
545
+
546
+ await self._launch_service_async(
547
+ install_url,
548
+ use_editable,
549
+ init_args,
550
+ deployment_timestamp,
551
+ stream_logs,
552
+ verbosity,
553
+ dryrun,
554
+ )
555
+
556
+ return self
557
+
558
+ def _get_existing_service(self, reload_prefixes):
559
+ try:
560
+ existing_service = Module.from_name(
561
+ self.service_name,
562
+ namespace=self.namespace,
563
+ reload_prefixes=reload_prefixes,
564
+ )
565
+ if existing_service:
566
+ if self.compute:
567
+ # Replace the compute object, if the user has already constructed it locally
568
+ existing_service.compute = self.compute
569
+ logger.info(
570
+ f"Existing service '{self.service_name}' found in namespace '{self.namespace}', not "
571
+ f"redeploying."
572
+ )
573
+ return existing_service
574
+ except Exception as e:
575
+ raise ValueError(
576
+ f"Failed to reload service {self.service_name} in namespace {self.namespace} "
577
+ f"and reload_prefixes={reload_prefixes}: {str(e)}"
578
+ )
579
+
580
+ async def _get_existing_service_async(self, reload_prefixes):
581
+ try:
582
+ existing_service = Module.from_name(
583
+ self.service_name,
584
+ namespace=self.namespace,
585
+ reload_prefixes=reload_prefixes,
586
+ )
587
+ if existing_service:
588
+ if self.compute:
589
+ # Replace the compute object, if the user has already constructed it locally
590
+ existing_service.compute = self.compute
591
+ logger.info(
592
+ f"Existing service '{self.service_name}' found in namespace '{self.namespace}', not "
593
+ f"redeploying."
594
+ )
595
+ return existing_service
596
+ except Exception as e:
597
+ raise ValueError(
598
+ f"Failed to reload service {self.service_name} in namespace {self.namespace} "
599
+ f"and reload_prefixes={reload_prefixes}: {str(e)}"
600
+ )
601
+
602
    def _rsync_repo_and_image_patches(self, install_url, use_editable, init_args):
        """Rsync the working directory (plus an editable kubetorch checkout,
        when applicable) and the generated service dockerfile to the rsync pod.
        """
        logger.debug("Rsyncing data to the rsync pod")
        source_dir = locate_working_dir(self.pointers[0])
        rsync_dirs = [str(source_dir)]
        if use_editable and install_url not in rsync_dirs:
            # Editable installs ship the local kubetorch checkout too.
            rsync_dirs.append(install_url)

        pointer_env_vars = self._get_pointer_env_vars(self.remote_pointers)
        metadata_env_vars = self._get_metadata_env_vars(init_args)
        service_dockerfile = self._get_service_dockerfile(
            {**pointer_env_vars, **metadata_env_vars}
        )
        self._construct_and_rsync_files(rsync_dirs, service_dockerfile)
        logger.debug(f"Rsync completed for service {self.service_name}")
616
+
617
    async def _rsync_repo_and_image_patches_async(
        self, install_url, use_editable, init_args
    ):
        """Async counterpart of ``_rsync_repo_and_image_patches``; only the
        final rsync step is awaited."""
        logger.debug("Rsyncing data to the rsync pod")
        source_dir = locate_working_dir(self.pointers[0])
        rsync_dirs = [str(source_dir)]
        if use_editable and install_url not in rsync_dirs:
            # Editable installs ship the local kubetorch checkout too.
            rsync_dirs.append(install_url)

        pointer_env_vars = self._get_pointer_env_vars(self.remote_pointers)
        metadata_env_vars = self._get_metadata_env_vars(init_args)
        service_dockerfile = self._get_service_dockerfile(
            {**pointer_env_vars, **metadata_env_vars}
        )
        await self._construct_and_rsync_files_async(rsync_dirs, service_dockerfile)
        logger.debug(f"Rsync completed for service {self.service_name}")
633
+
634
+ def _launch_service(
635
+ self,
636
+ install_url,
637
+ use_editable,
638
+ init_args,
639
+ deployment_timestamp,
640
+ stream_logs,
641
+ verbosity,
642
+ dryrun,
643
+ ):
644
+ # Start log streaming if enabled
645
+ stop_event = threading.Event()
646
+ log_thread = None
647
+ if stream_logs is None:
648
+ stream_logs = config.stream_logs or False
649
+
650
+ launch_request_id = "-"
651
+ if stream_logs and not dryrun:
652
+ if verbosity is None:
653
+ verbosity = config.log_verbosity
654
+
655
+ # Create a unique request ID for this launch sequence
656
+ launch_request_id = (
657
+ f"launch_{generate_unique_request_id('launch', deployment_timestamp)}"
658
+ )
659
+
660
+ # Start log streaming in a separate thread
661
+ log_thread = threading.Thread(
662
+ target=self._stream_launch_logs,
663
+ args=(
664
+ launch_request_id,
665
+ stop_event,
666
+ verbosity,
667
+ deployment_timestamp,
668
+ ),
669
+ )
670
+ log_thread.daemon = True
671
+ log_thread.start()
672
+
673
+ try:
674
+ startup_rsync_command = self._startup_rsync_command(use_editable, dryrun)
675
+
676
+ # Launch the compute in the form of a service with the requested resources
677
+ service_config = self.compute._launch(
678
+ service_name=self.compute.service_name,
679
+ install_url=install_url if not use_editable else None,
680
+ pointer_env_vars=self._get_pointer_env_vars(self.remote_pointers),
681
+ metadata_env_vars=self._get_metadata_env_vars(init_args),
682
+ startup_rsync_command=startup_rsync_command,
683
+ launch_id=launch_request_id,
684
+ dryrun=dryrun,
685
+ )
686
+ self.service_config = service_config
687
+
688
+ if not self.compute.freeze and not dryrun:
689
+ self.deployment_timestamp = (
690
+ self.compute.service_manager.update_deployment_timestamp_annotation(
691
+ service_name=self.service_name,
692
+ new_timestamp=deployment_timestamp,
693
+ )
694
+ )
695
+ if not dryrun:
696
+ self.compute._check_service_ready()
697
+ # Additional health check to ensure HTTP server is ready
698
+ self._wait_for_http_health()
699
+ finally:
700
+ # Stop log streaming
701
+ if log_thread:
702
+ stop_event.set()
703
+
704
    async def _launch_service_async(
        self,
        install_url,
        use_editable,
        init_args,
        deployment_timestamp,
        stream_logs,
        verbosity,
        dryrun,
    ):
        """Async counterpart of ``_launch_service``.

        Streams launch logs via an asyncio task (rather than a thread) and
        performs a bounded wait/cancel on it during cleanup. The deployment
        timestamp annotation is only written after a successful (non-frozen,
        non-dryrun) launch.
        """
        # Start log streaming if enabled
        stop_event = asyncio.Event()
        log_task = None
        if stream_logs is None:
            stream_logs = config.stream_logs or False

        launch_request_id = "-"
        if stream_logs and not dryrun:
            if verbosity is None:
                verbosity = config.log_verbosity

            # Create a unique request ID for this launch sequence
            launch_request_id = (
                f"launch_{generate_unique_request_id('launch', deployment_timestamp)}"
            )

            # Start log streaming as an async task
            log_task = asyncio.create_task(
                self._stream_launch_logs_async(
                    launch_request_id,
                    stop_event,
                    verbosity,
                    deployment_timestamp,
                )
            )

        try:
            startup_rsync_command = self._startup_rsync_command(use_editable, dryrun)

            # Launch the compute in the form of a service with the requested resources
            # Use the async version of _launch
            service_config = await self.compute._launch_async(
                service_name=self.compute.service_name,
                install_url=install_url if not use_editable else None,
                pointer_env_vars=self._get_pointer_env_vars(self.remote_pointers),
                metadata_env_vars=self._get_metadata_env_vars(init_args),
                startup_rsync_command=startup_rsync_command,
                launch_id=launch_request_id,
                dryrun=dryrun,
            )
            self.service_config = service_config

            if not self.compute.freeze and not dryrun:
                self.deployment_timestamp = (
                    self.compute.service_manager.update_deployment_timestamp_annotation(
                        service_name=self.service_name,
                        new_timestamp=deployment_timestamp,
                    )
                )
            if not dryrun:
                await self.compute._check_service_ready_async()
                await self._wait_for_http_health_async()
        finally:
            # Stop log streaming: signal the task, give it 2s to drain, then
            # cancel and swallow the resulting CancelledError.
            if log_task:
                stop_event.set()
                try:
                    await asyncio.wait_for(log_task, timeout=2.0)
                except asyncio.TimeoutError:
                    log_task.cancel()
                    try:
                        await log_task
                    except asyncio.CancelledError:
                        pass
778
+
779
+ def _get_service_dockerfile(self, metadata_env_vars):
780
+ image_instructions = self.compute._image_setup_and_instructions()
781
+
782
+ image_instructions += "\n"
783
+ for key, val in metadata_env_vars.items():
784
+ if isinstance(val, Dict):
785
+ val = json.dumps(val)
786
+ image_instructions += f"ENV {key} {val}\n"
787
+
788
+ logger.debug(
789
+ f"Generated Dockerfile for service {self.service_name}:\n{image_instructions}"
790
+ )
791
+ return image_instructions
792
+
793
+ def _construct_and_rsync_files(self, rsync_dirs, service_dockerfile):
794
+ with tempfile.TemporaryDirectory() as tmpdir:
795
+ temp_file = Path(tmpdir) / ".kt" / "image.dockerfile"
796
+ temp_file.parent.mkdir(parents=True, exist_ok=True)
797
+ temp_file.write_text(service_dockerfile)
798
+
799
+ source_dir = str(Path(tmpdir) / ".kt")
800
+ rsync_dirs.append(source_dir)
801
+
802
+ logger.debug(f"Rsyncing directories: {rsync_dirs}")
803
+ if is_running_in_kubernetes():
804
+ self.compute.rsync_in_cluster(rsync_dirs)
805
+ else:
806
+ self.compute.rsync(rsync_dirs)
807
+
808
+ async def _construct_and_rsync_files_async(self, rsync_dirs, service_dockerfile):
809
+ with tempfile.TemporaryDirectory() as tmpdir:
810
+ temp_file = Path(tmpdir) / ".kt" / "image.dockerfile"
811
+ temp_file.parent.mkdir(parents=True, exist_ok=True)
812
+ temp_file.write_text(service_dockerfile)
813
+
814
+ source_dir = str(Path(tmpdir) / ".kt")
815
+ rsync_dirs.append(source_dir)
816
+
817
+ logger.debug(f"Rsyncing directories: {rsync_dirs}")
818
+ if is_running_in_kubernetes():
819
+ await self.compute.rsync_in_cluster_async(rsync_dirs)
820
+ else:
821
+ await self.compute.rsync_async(rsync_dirs)
822
+
823
+ def _startup_rsync_command(self, use_editable, dryrun):
824
+ if not use_editable or dryrun:
825
+ return None
826
+
827
+ # rsync from the rsync pod's file system directly
828
+ startup_cmd = self.compute._rsync_svc_url()
829
+ cmd = f"rsync -av {startup_cmd} ."
830
+ return cmd
831
+
832
+ def teardown(self):
833
+ """Delete the service and all associated resources."""
834
+ logger.info(f"Deleting service: {self.service_name}")
835
+
836
+ # Use the compute's service manager - it already knows the correct type!
837
+ teardown_success = self.compute.service_manager.teardown_service(
838
+ service_name=self.service_name,
839
+ )
840
+
841
+ if not teardown_success:
842
+ logger.error(f"Failed to teardown service {self.service_name}")
843
+ return
844
+
845
+ configmaps = load_configmaps(
846
+ core_api=self.compute.core_api,
847
+ service_name=self.service_name,
848
+ namespace=self.compute.namespace,
849
+ )
850
+ if configmaps:
851
+ logger.info(
852
+ f"Deleting {len(configmaps)} configmap{'' if len(configmaps) == 1 else 's'}"
853
+ )
854
+ delete_configmaps(
855
+ core_api=self.compute.core_api,
856
+ configmaps=configmaps,
857
+ namespace=self.compute.namespace,
858
+ )
859
+
860
+ logger.info("Deleting service data from cache in rsync pod")
861
+ delete_cached_service_data(
862
+ core_api=self.compute.core_api,
863
+ service_name=self.service_name,
864
+ namespace=self.compute.namespace,
865
+ )
866
+
867
+ def _get_pointer_env_vars(self, remote_pointers):
868
+ (container_file_path, module_name, cls_or_fn_name) = remote_pointers
869
+ return {
870
+ "KT_FILE_PATH": container_file_path,
871
+ "KT_MODULE_NAME": module_name,
872
+ "KT_CLS_OR_FN_NAME": cls_or_fn_name,
873
+ }
874
+
875
+ def _get_metadata_env_vars(
876
+ self,
877
+ init_args: Dict,
878
+ ) -> Dict:
879
+ # TODO: add other callable metadata in addition to pointers (`is_generator`, `is_async`, etc.)
880
+ import json
881
+
882
+ distributed_config = self.compute.distributed_config
883
+ return {
884
+ "KT_INIT_ARGS": init_args,
885
+ "KT_CALLABLE_TYPE": self.MODULE_TYPE,
886
+ "KT_DISTRIBUTED_CONFIG": json.dumps(distributed_config)
887
+ if distributed_config
888
+ else None,
889
+ }
890
+
891
+ def _stream_launch_logs(
892
+ self,
893
+ request_id: str,
894
+ stop_event: threading.Event,
895
+ verbosity: LogVerbosity,
896
+ deployment_timestamp: str,
897
+ ):
898
+ """Stream logs and events during service launch sequence."""
899
+ try:
900
+ # Only use "kubetorch" container to exclude queue-proxy (e.g. Knative sidecars) container logs which
901
+ # are spammy with tons of healthcheck calls
902
+ pod_query = (
903
+ f'{{k8s_container_name="kubetorch"}} | json | request_id="{request_id}"'
904
+ )
905
+ event_query = f'{{service_name="unknown_service"}} | json | k8s_object_name=~"{self.service_name}.*" | k8s_namespace_name="{self.namespace}"'
906
+
907
+ encoded_pod_query = urllib.parse.quote_plus(pod_query)
908
+ encoded_event_query = urllib.parse.quote_plus(event_query)
909
+ logger.debug(
910
+ f"Streaming launch logs and events for service {self.service_name}"
911
+ )
912
+
913
+ def start_log_threads(host, port):
914
+ def run_pod_logs():
915
+ self._run_log_stream(
916
+ request_id,
917
+ stop_event,
918
+ host,
919
+ port,
920
+ encoded_pod_query,
921
+ verbosity,
922
+ deployment_timestamp,
923
+ dedup=True,
924
+ )
925
+
926
+ def run_event_logs():
927
+ self._run_log_stream(
928
+ request_id,
929
+ stop_event,
930
+ host,
931
+ port,
932
+ encoded_event_query,
933
+ verbosity,
934
+ deployment_timestamp,
935
+ )
936
+
937
+ pod_thread = threading.Thread(target=run_pod_logs, daemon=True)
938
+ event_thread = threading.Thread(target=run_event_logs, daemon=True)
939
+
940
+ pod_thread.start()
941
+ event_thread.start()
942
+
943
+ # Don't block indefinitely on joins - use short timeouts
944
+ pod_thread.join(timeout=1.0)
945
+ event_thread.join(timeout=1.0)
946
+
947
+ base_url = service_url()
948
+ host, port = extract_host_port(base_url)
949
+ logger.debug(
950
+ f"Streaming launch logs with url={base_url} host={host} and local port {port}"
951
+ )
952
+ start_log_threads(host, port)
953
+
954
+ except Exception as e:
955
+ logger.error(f"Failed to stream launch logs: {e}")
956
+ raise e
957
+
958
+ async def _stream_launch_logs_async(
959
+ self,
960
+ request_id: str,
961
+ stop_event: asyncio.Event,
962
+ verbosity: LogVerbosity,
963
+ deployment_timestamp: str,
964
+ ):
965
+ """Async version of _stream_launch_logs. Stream logs and events during service launch sequence."""
966
+ try:
967
+ # Only use "kubetorch" container to exclude queue-proxy (e.g. Knative sidecars) container logs which
968
+ # are spammy with tons of healthcheck calls
969
+ pod_query = (
970
+ f'{{k8s_container_name="kubetorch"}} | json | request_id="{request_id}"'
971
+ )
972
+ event_query = f'{{service_name="unknown_service"}} | json | k8s_object_name=~"{self.service_name}.*" | k8s_namespace_name="{self.namespace}"'
973
+
974
+ encoded_pod_query = urllib.parse.quote_plus(pod_query)
975
+ encoded_event_query = urllib.parse.quote_plus(event_query)
976
+ logger.debug(
977
+ f"Streaming launch logs and events for service {self.service_name}"
978
+ )
979
+
980
+ base_url = await service_url_async()
981
+ host, port = extract_host_port(base_url)
982
+ logger.debug(
983
+ f"Streaming launch logs with url={base_url} host={host} and local port {port}"
984
+ )
985
+
986
+ # Create async tasks for both log streams
987
+ pod_task = asyncio.create_task(
988
+ self._stream_logs_websocket(
989
+ request_id,
990
+ stop_event,
991
+ host=host,
992
+ port=port,
993
+ query=encoded_pod_query,
994
+ log_verbosity=verbosity,
995
+ deployment_timestamp=deployment_timestamp,
996
+ dedup=True,
997
+ )
998
+ )
999
+
1000
+ event_task = asyncio.create_task(
1001
+ self._stream_logs_websocket(
1002
+ request_id,
1003
+ stop_event,
1004
+ host=host,
1005
+ port=port,
1006
+ query=encoded_event_query,
1007
+ log_verbosity=verbosity,
1008
+ deployment_timestamp=deployment_timestamp,
1009
+ )
1010
+ )
1011
+
1012
+ # Wait for both tasks to complete or be cancelled
1013
+ try:
1014
+ await asyncio.gather(pod_task, event_task, return_exceptions=True)
1015
+ except Exception as e:
1016
+ logger.error(f"Error in async log streaming: {e}")
1017
+
1018
+ except Exception as e:
1019
+ logger.error(f"Failed to stream launch logs: {e}")
1020
+ raise e
1021
+
1022
+ def _run_log_stream(
1023
+ self,
1024
+ request_id: str,
1025
+ stop_event: threading.Event,
1026
+ host: str,
1027
+ port: int,
1028
+ query: str,
1029
+ log_verbosity: LogVerbosity,
1030
+ deployment_timestamp: str,
1031
+ dedup: bool = False,
1032
+ ):
1033
+ """Helper to run log streaming in an event loop"""
1034
+ loop = asyncio.new_event_loop()
1035
+ asyncio.set_event_loop(loop)
1036
+ try:
1037
+ loop.run_until_complete(
1038
+ self._stream_logs_websocket(
1039
+ request_id,
1040
+ stop_event,
1041
+ host=host,
1042
+ port=port,
1043
+ query=query,
1044
+ log_verbosity=log_verbosity,
1045
+ deployment_timestamp=deployment_timestamp,
1046
+ dedup=dedup,
1047
+ )
1048
+ )
1049
+ finally:
1050
+ loop.close()
1051
+
1052
+ async def _run_log_stream_async(
1053
+ self,
1054
+ request_id: str,
1055
+ stop_event: asyncio.Event,
1056
+ host: str,
1057
+ port: int,
1058
+ query: str,
1059
+ log_verbosity: LogVerbosity,
1060
+ deployment_timestamp: str,
1061
+ dedup: bool = False,
1062
+ ):
1063
+ """Async helper to run log streaming directly in the current event loop"""
1064
+ await self._stream_logs_websocket(
1065
+ request_id,
1066
+ stop_event,
1067
+ host=host,
1068
+ port=port,
1069
+ query=query,
1070
+ log_verbosity=log_verbosity,
1071
+ deployment_timestamp=deployment_timestamp,
1072
+ dedup=dedup,
1073
+ )
1074
+
1075
    async def _stream_logs_websocket(
        self,
        request_id: str,
        stop_event: Union[threading.Event, asyncio.Event],
        host: str,
        port: int,
        query: str,
        log_verbosity: LogVerbosity,
        deployment_timestamp: str,
        dedup: bool = False,
    ):
        """Stream logs and events using Loki's websocket tail endpoint.

        Connects to Loki's ``/loki/api/v1/tail`` endpoint and prints matching
        pod log lines and Kubernetes events until ``stop_event`` is set, plus a
        2-second grace period to drain in-flight messages.

        Args:
            request_id: Launch request id, also used as the fallback pod label
                for log lines without a ``pod`` field.
            stop_event: threading.Event or asyncio.Event signalling shutdown.
            host: Loki gateway host.
            port: Loki gateway port.
            query: URL-encoded LogQL query to tail.
            log_verbosity: Verbosity filter applied to both logs and events.
            deployment_timestamp: ISO timestamp; entries older than this are
                dropped so logs from earlier deployments aren't replayed.
            dedup: When True, suppress repeated log messages.
        """
        try:
            uri = f"ws://{host}:{port}/loki/api/v1/tail?query={query}"

            # Track the last timestamp we've seen to avoid duplicates
            last_timestamp = None

            # Deadline for the post-stop grace period; None until stop is seen
            stop_time = None

            # Track most recent deployment timestamp to filter out old logs / events
            start_timestamp = iso_timestamp_to_nanoseconds(deployment_timestamp)

            # Event messages already printed (events are always deduped)
            shown_event_messages = set()

            # Track seen log messages for deduplication (only when dedup=True)
            seen_log_messages = set() if dedup else None

            # For formatting the server setup logs: one formatter per pod name
            formatters = {}
            base_formatter = ServerLogsFormatter()
            websocket = None
            # NOTE(review): if an exception is raised above this point (e.g. in
            # iso_timestamp_to_nanoseconds), the outer `finally` references
            # `websocket` before assignment — confirm this path can't NameError.
            try:
                # Add timeout to prevent hanging connections
                websocket = await websockets.connect(
                    uri,
                    close_timeout=10,  # Max time to wait for close handshake
                    ping_interval=20,  # Send ping every 20 seconds
                    ping_timeout=10,  # Wait 10 seconds for pong
                )
                while True:
                    # If stop event is set, start counting down.
                    # NOTE(review): both branches of this conditional are
                    # identical — threading.Event and asyncio.Event both expose
                    # is_set(), so the hasattr fallback is vestigial.
                    is_stop_set = (
                        stop_event.is_set()
                        if hasattr(stop_event, "is_set")
                        else stop_event.is_set()
                    )
                    if is_stop_set and stop_time is None:
                        stop_time = time.time() + 2  # 2 second grace period

                    # If we're past the grace period, exit
                    if stop_time is not None and time.time() > stop_time:
                        break

                    try:
                        # Use shorter timeout during grace period
                        timeout = 0.1 if stop_time is not None else 1.0
                        message = await asyncio.wait_for(
                            websocket.recv(), timeout=timeout
                        )
                        # Loki tail frames: {"streams": [{"stream": {labels},
                        # "values": [[ts_ns, line], ...]}, ...]}
                        data = json.loads(message)

                        if data.get("streams"):
                            for stream in data["streams"]:
                                labels = stream.get("stream", {})
                                # Event streams are distinguished by this label
                                is_event = "k8s_event_count" in list(labels.keys())
                                for value in stream["values"]:
                                    ts_ns = int(value[0])
                                    # Drop anything older than this deployment
                                    if (
                                        start_timestamp is not None
                                        and ts_ns < start_timestamp
                                    ):
                                        continue
                                    log_line = value[1]
                                    if is_event:
                                        event_type = labels.get("detected_level", "")
                                        if (
                                            log_verbosity == LogVerbosity.CRITICAL
                                            and event_type == "Normal"
                                        ):
                                            # Skip routine "Normal" events at CRITICAL verbosity
                                            continue

                                        try:
                                            msg = log_line
                                            # NOTE(review): this deliberately builds a
                                            # 1-tuple (trailing comma) and unwraps it
                                            # below — the pair could be a plain get().
                                            reason = (
                                                labels.get("k8s_event_reason", ""),
                                            )

                                            # Note: relevant starting in release 0.1.19 (using OTel instead of Alloy)
                                            if isinstance(reason, tuple):
                                                reason = reason[0]

                                            event_type = labels.get(
                                                "detected_level", ""
                                            )

                                            if reason == "Unhealthy" and (
                                                "HTTP probe failed with statuscode: 503"
                                                in msg
                                                or "Startup probe failed" in msg
                                            ):
                                                # HTTP probe failures are expected during setup
                                                continue

                                            ignore_patterns = (
                                                "queue-proxy",
                                                "resolving reference: address not set for kind = service",
                                                "failed to get private k8s service endpoints:",
                                            )
                                            # Ignore queue-proxy events and gateway setup events
                                            if any(
                                                pattern in msg.lower()
                                                for pattern in ignore_patterns
                                            ):
                                                continue

                                            if msg in shown_event_messages:
                                                # Only show unique event messages
                                                continue

                                            shown_event_messages.add(msg)

                                        except Exception:
                                            # If parsing fails, just print the event as is
                                            pass

                                        # Normal events omit the type= prefix at
                                        # INFO/DEBUG; everything else shows it
                                        if event_type == "Normal":
                                            if log_verbosity in [
                                                LogVerbosity.INFO,
                                                LogVerbosity.DEBUG,
                                            ]:
                                                print(
                                                    f'[EVENT] reason={reason} "{msg}"'
                                                )
                                        else:
                                            print(
                                                f'[EVENT] type={event_type} reason={reason} "{msg}"'
                                            )
                                        continue

                                    # Skip if we've already seen this timestamp
                                    if (
                                        last_timestamp is not None
                                        and value[0] <= last_timestamp
                                    ):
                                        continue
                                    last_timestamp = value[0]
                                    # Pod log lines are only ever printed at
                                    # DEBUG/INFO verbosity
                                    if log_verbosity in [
                                        LogVerbosity.DEBUG,
                                        LogVerbosity.INFO,
                                    ]:
                                        try:
                                            log_dict = json.loads(log_line)
                                        except json.JSONDecodeError:
                                            # setup steps pre server start are not JSON formatted
                                            log_dict = None

                                        if log_dict is not None:
                                            # at this stage we are post setup
                                            pod_name = log_dict.get("pod", request_id)
                                            levelname = log_dict.get(
                                                "levelname", "INFO"
                                            )
                                            ts = log_dict.get("asctime")
                                            # NOTE(review): `message` shadows the raw
                                            # websocket frame read above; in the
                                            # non-JSON branch the dedup check below
                                            # therefore keys on the whole frame —
                                            # confirm that's intended.
                                            message = log_dict.get("message", "")

                                            # Level-based filtering per verbosity
                                            if (
                                                log_verbosity == LogVerbosity.CRITICAL
                                                and levelname
                                                not in ["ERROR", "CRITICAL"]
                                            ) or (
                                                log_verbosity == LogVerbosity.INFO
                                                and levelname == "DEBUG"
                                            ):
                                                continue

                                            log_line = f"{levelname} | {ts} | {message}"
                                            # Lazily create one formatter per pod
                                            if pod_name not in formatters:
                                                formatters[
                                                    pod_name
                                                ] = ServerLogsFormatter(pod_name)
                                            formatter = formatters[pod_name]
                                        else:
                                            # streaming pre server setup logs, before we have the pod name
                                            formatter = base_formatter

                                        # Raw setup lines carry their own newline
                                        newline = "" if log_dict is None else None
                                        formatted_line = f"{formatter.start_color}{f'({self.service_name}) '}{log_line}{formatter.reset_color}"

                                        # Check for duplicates if dedup is enabled
                                        if seen_log_messages is not None:
                                            if message in seen_log_messages:
                                                continue
                                            seen_log_messages.add(message)

                                        print(formatted_line, end=newline)
                    except asyncio.TimeoutError:
                        # Timeout is expected, just continue the loop
                        continue
                    except websockets.exceptions.ConnectionClosed as e:
                        logger.debug(f"WebSocket connection closed: {str(e)}")
                        break
            finally:
                if websocket:
                    try:
                        # Use wait_for to prevent hanging on close
                        await asyncio.wait_for(websocket.close(), timeout=1.0)
                    # NOTE(review): Exception already subsumes TimeoutError here
                    except (asyncio.TimeoutError, Exception):
                        pass
        except Exception as e:
            logger.error(f"Error in websocket stream: {e}")
            raise e
        finally:
            # Ensure websocket is closed even if we didn't enter the try block
            if websocket:
                try:
                    # Use wait_for to prevent hanging on close
                    await asyncio.wait_for(websocket.close(), timeout=1.0)
                except (asyncio.TimeoutError, Exception):
                    pass
1298
+
1299
+ def _wait_for_http_health(
1300
+ self, timeout=60, retry_interval=0.1, backoff=2, max_interval=10
1301
+ ):
1302
+ """Wait for the HTTP server to be ready by checking the /health endpoint.
1303
+
1304
+ Args:
1305
+ timeout: Maximum time to wait in seconds
1306
+ retry_interval: Time between health check attempts in seconds
1307
+ """
1308
+ import time
1309
+
1310
+ logger.info(
1311
+ f"Waiting for HTTP server to be ready for service {self.service_name}"
1312
+ )
1313
+ start_time = time.time()
1314
+
1315
+ while time.time() - start_time < timeout:
1316
+ try:
1317
+ client = self._client()
1318
+ response = client.get(
1319
+ endpoint=f"{self.base_endpoint}/health",
1320
+ headers=self.request_headers,
1321
+ )
1322
+ if response.status_code == 200:
1323
+ logger.info(f"HTTP server is ready for service {self.service_name}")
1324
+ return
1325
+ else:
1326
+ logger.debug(
1327
+ f"Health check returned status {response.status_code}, retrying..."
1328
+ )
1329
+
1330
+ except VersionMismatchError as e:
1331
+ raise e
1332
+
1333
+ except Exception as e:
1334
+ logger.debug(f"Health check failed: {e}, retrying...")
1335
+
1336
+ time.sleep(retry_interval)
1337
+ retry_interval *= backoff # Exponential backoff
1338
+ # Cap the retry interval to a maximum value
1339
+ retry_interval = min(retry_interval, max_interval)
1340
+
1341
+ # If we get here, we've timed out
1342
+ logger.warning(
1343
+ f"HTTP health check timed out after {timeout}s for service {self.service_name}"
1344
+ )
1345
+
1346
+ async def _wait_for_http_health_async(
1347
+ self, timeout=60, retry_interval=0.1, backoff=2, max_interval=10
1348
+ ):
1349
+ """Async version of _wait_for_http_health. Wait for the HTTP server to be ready by checking the /health endpoint.
1350
+
1351
+ Args:
1352
+ timeout: Maximum time to wait in seconds
1353
+ retry_interval: Time between health check attempts in seconds
1354
+ """
1355
+ import asyncio
1356
+
1357
+ logger.info(
1358
+ f"Waiting for HTTP server to be ready for service {self.service_name}"
1359
+ )
1360
+ start_time = time.time()
1361
+
1362
+ while time.time() - start_time < timeout:
1363
+ try:
1364
+ client = self._client()
1365
+ response = client.get(
1366
+ endpoint=f"{self.base_endpoint}/health",
1367
+ headers=self.request_headers,
1368
+ )
1369
+ if response.status_code == 200:
1370
+ logger.info(f"HTTP server is ready for service {self.service_name}")
1371
+ return
1372
+ else:
1373
+ logger.debug(
1374
+ f"Health check returned status {response.status_code}, retrying..."
1375
+ )
1376
+ except Exception as e:
1377
+ logger.debug(f"Health check failed: {e}, retrying...")
1378
+
1379
+ await asyncio.sleep(retry_interval)
1380
+ retry_interval *= backoff # Exponential backoff
1381
+ # Cap the retry interval to a maximum value
1382
+ retry_interval = min(retry_interval, max_interval)
1383
+
1384
+ # If we get here, we've timed out
1385
+ logger.warning(
1386
+ f"HTTP health check timed out after {timeout}s for service {self.service_name}"
1387
+ )
1388
+
1389
+ def __getstate__(self):
1390
+ """Remove local stateful values before pickle serialization."""
1391
+ state = self.__dict__.copy()
1392
+ # Remove local stateful values that shouldn't be serialized
1393
+ state["_http_client"] = None
1394
+ state["_service_config"] = None
1395
+ state["_remote_pointers"] = None
1396
+ # Pointers need to be converted to not be absolute paths if we're passing
1397
+ # the service elsewhere, e.g. into another service
1398
+ state["pointers"] = self.remote_pointers
1399
+ return state
1400
+
1401
+ def __setstate__(self, state):
1402
+ """Restore state after pickle deserialization."""
1403
+ self.__dict__.update(state)
1404
+ # Reset local stateful values to None to ensure clean initialization
1405
+ self._http_client = None
1406
+ self._service_config = None
1407
+ self._remote_pointers = None
1408
+
1409
+ def __del__(self):
1410
+ if hasattr(self, "_http_client") and self._http_client is not None:
1411
+ try:
1412
+ self._http_client.close()
1413
+ except Exception as e:
1414
+ logger.debug(f"Error closing HTTPClient in Module deletion: {e}")
1415
+ finally:
1416
+ self._http_client = None