kubetorch 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. kubetorch/__init__.py +59 -0
  2. kubetorch/cli.py +1939 -0
  3. kubetorch/cli_utils.py +967 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +269 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +159 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +140 -0
  30. kubetorch/resources/callables/module.py +1315 -0
  31. kubetorch/resources/callables/utils.py +203 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +253 -0
  34. kubetorch/resources/compute/compute.py +2414 -0
  35. kubetorch/resources/compute/decorators.py +137 -0
  36. kubetorch/resources/compute/utils.py +1026 -0
  37. kubetorch/resources/compute/websocket.py +135 -0
  38. kubetorch/resources/images/__init__.py +1 -0
  39. kubetorch/resources/images/image.py +412 -0
  40. kubetorch/resources/images/images.py +64 -0
  41. kubetorch/resources/secrets/__init__.py +2 -0
  42. kubetorch/resources/secrets/kubernetes_secrets_client.py +377 -0
  43. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  44. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  45. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  46. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  47. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  48. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  49. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  50. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  51. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  52. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  53. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  54. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  55. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  56. kubetorch/resources/secrets/provider_secrets/providers.py +92 -0
  57. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  58. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  59. kubetorch/resources/secrets/secret.py +224 -0
  60. kubetorch/resources/secrets/secret_factory.py +64 -0
  61. kubetorch/resources/secrets/utils.py +222 -0
  62. kubetorch/resources/volumes/__init__.py +0 -0
  63. kubetorch/resources/volumes/volume.py +340 -0
  64. kubetorch/servers/__init__.py +0 -0
  65. kubetorch/servers/http/__init__.py +0 -0
  66. kubetorch/servers/http/distributed_utils.py +2968 -0
  67. kubetorch/servers/http/http_client.py +802 -0
  68. kubetorch/servers/http/http_server.py +1622 -0
  69. kubetorch/servers/http/server_metrics.py +255 -0
  70. kubetorch/servers/http/utils.py +722 -0
  71. kubetorch/serving/__init__.py +0 -0
  72. kubetorch/serving/autoscaling.py +153 -0
  73. kubetorch/serving/base_service_manager.py +344 -0
  74. kubetorch/serving/constants.py +77 -0
  75. kubetorch/serving/deployment_service_manager.py +431 -0
  76. kubetorch/serving/knative_service_manager.py +487 -0
  77. kubetorch/serving/raycluster_service_manager.py +526 -0
  78. kubetorch/serving/service_manager.py +18 -0
  79. kubetorch/serving/templates/deployment_template.yaml +17 -0
  80. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  81. kubetorch/serving/templates/kt_setup_template.sh.j2 +91 -0
  82. kubetorch/serving/templates/pod_template.yaml +198 -0
  83. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  84. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  85. kubetorch/serving/templates/service_template.yaml +21 -0
  86. kubetorch/serving/templates/workerset_template.yaml +36 -0
  87. kubetorch/serving/utils.py +344 -0
  88. kubetorch/utils.py +263 -0
  89. kubetorch-0.2.5.dist-info/METADATA +75 -0
  90. kubetorch-0.2.5.dist-info/RECORD +92 -0
  91. kubetorch-0.2.5.dist-info/WHEEL +4 -0
  92. kubetorch-0.2.5.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,431 @@
1
+ import os
2
+ import re
3
+ import time
4
+ from datetime import datetime, timezone
5
+ from typing import List, Optional, Tuple
6
+
7
+ from kubernetes import client
8
+
9
+ import kubetorch.serving.constants as serving_constants
10
+ from kubetorch.logger import get_logger
11
+ from kubetorch.resources.compute.utils import (
12
+ check_pod_events_for_errors,
13
+ check_pod_status_for_errors,
14
+ check_replicaset_events_for_errors,
15
+ ServiceTimeoutError,
16
+ )
17
+ from kubetorch.servers.http.utils import load_template
18
+ from kubetorch.serving.base_service_manager import BaseServiceManager
19
+ from kubetorch.serving.utils import nested_override
20
+
21
+ logger = get_logger(__name__)
22
+
23
+
24
class DeploymentServiceManager(BaseServiceManager):
    """Service manager for Kubernetes Deployments with distributed computing support."""

    def _create_or_update_deployment(
        self,
        name: str,
        module_name: str,
        pod_template: dict,
        replicas: int = 1,
        inactivity_ttl: str = None,
        custom_labels: dict = None,
        custom_annotations: dict = None,
        custom_template: dict = None,
        scheduler_name: str = None,
        queue_name: str = None,
        dryrun: bool = False,
    ) -> Tuple[dict, bool]:
        """Creates or updates a Deployment for distributed deployments.

        Renders the Deployment and Service templates, creates the client-facing
        Service (plus a headless Service for distributed workloads, used for pod
        discovery), then creates the Deployment. If the Deployment already
        exists (HTTP 409), only its replica count is reconciled.

        Args:
            name (str): Name for the Deployment and its Service.
            module_name (str): Module name; sanitized before use as a label value.
            pod_template (dict): Pod template, including resource requirements.
            replicas (int): Desired replica count. (Default: 1)
            inactivity_ttl (str, optional): Idle TTL annotation enabling auto-down.
            custom_labels (dict, optional): Extra labels merged into all resources.
            custom_annotations (dict, optional): Extra annotations merged in.
            custom_template (dict, optional): Overrides applied onto the rendered Deployment.
            scheduler_name (str, optional): Alternate scheduler name (e.g. KAI).
            queue_name (str, optional): Scheduler queue; only applied together with scheduler_name.
            dryrun (bool, optional): Use server-side dry-run; nothing is persisted. (Default: False)

        Returns:
            Tuple (created_deployment, is_new_deployment)
        """
        # Strip characters that are invalid in Kubernetes label values.
        clean_module_name = re.sub(r"[^A-Za-z0-9.-]|^[-.]|[-.]$", "", module_name)
        service_name = name  # Use regular service name, not headless

        # Shared base labels (previously this dict was built three times verbatim).
        base_labels = {
            **self.base_labels,
            serving_constants.KT_MODULE_LABEL: clean_module_name,
            serving_constants.KT_SERVICE_LABEL: name,
        }

        # Only the top-level Deployment carries the template label (marks it as
        # source-of-truth); pod-template and Service labels deliberately omit it.
        labels = {**base_labels, serving_constants.KT_TEMPLATE_LABEL: "deployment"}
        template_labels = dict(base_labels)
        service_labels = dict(base_labels)
        if custom_labels:
            labels.update(custom_labels)
            template_labels.update(custom_labels)
            service_labels.update(custom_labels)

        annotations = {
            "prometheus.io/scrape": "true",
            "prometheus.io/path": serving_constants.PROMETHEUS_HEALTH_ENDPOINT,
            "prometheus.io/port": "8080",
        }
        if custom_annotations:
            annotations.update(custom_annotations)

        if inactivity_ttl:
            annotations[serving_constants.INACTIVITY_TTL_ANNOTATION] = inactivity_ttl
            logger.info(f"Configuring auto-down after idle timeout ({inactivity_ttl})")

        if scheduler_name and queue_name:
            labels["kai.scheduler/queue"] = queue_name  # Useful for queries, etc
            template_labels["kai.scheduler/queue"] = queue_name  # Required for KAI to schedule pods

        deployment_timestamp = datetime.now(timezone.utc).isoformat()
        template_annotations = {"kubetorch.com/deployment_timestamp": deployment_timestamp}

        # Invariants shared by every load_template call below.
        template_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates")
        server_port = pod_template.get("containers", [{}])[0].get("ports", [{}])[0].get("containerPort", 32300)

        # Render the Deployment manifest.
        deployment = load_template(
            template_file=serving_constants.DEPLOYMENT_TEMPLATE_FILE,
            template_dir=template_dir,
            name=name,
            namespace=self.namespace,
            annotations=annotations,
            template_annotations=template_annotations,
            labels=labels,
            template_labels=template_labels,
            pod_template=pod_template,
            replicas=replicas,
        )

        if custom_template:
            nested_override(deployment, custom_template)

        # Distributed iff KT_DISTRIBUTED_CONFIG is set to a non-empty, non-"null" value.
        env_vars = pod_template.get("containers", [{}])[0].get("env", [])
        is_distributed = any(
            env.get("name") == "KT_DISTRIBUTED_CONFIG" and env.get("value") != "null" and env.get("value")
            for env in env_vars
        )

        def render_service(svc_name: str, headless: bool) -> dict:
            # Regular ClusterIP Service for client access; the headless variant
            # (distributed=True) exists only for per-pod discovery in distributed runs.
            return load_template(
                template_file=serving_constants.DEPLOYMENT_SERVICE_TEMPLATE_FILE,
                template_dir=template_dir,
                name=svc_name,
                namespace=self.namespace,
                annotations=annotations,
                labels=service_labels,
                deployment_name=name,
                module_name=clean_module_name,
                distributed=headless,
                server_port=server_port,
            )

        service = render_service(service_name, headless=False)
        headless_service = render_service(f"{service_name}-headless", headless=True) if is_distributed else None

        try:
            kwargs = {"dry_run": "All"} if dryrun else {}

            def create_service(body: dict, desc: str) -> None:
                # 409 (already exists) is benign: keep the existing Service.
                try:
                    self.core_api.create_namespaced_service(
                        namespace=self.namespace,
                        body=body,
                        **kwargs,
                    )
                    if not dryrun:
                        logger.info(f"Created {desc} in namespace {self.namespace}")
                except client.exceptions.ApiException as e:
                    if e.status == 409:
                        logger.info(f"{desc[0].upper()}{desc[1:]} already exists")
                    else:
                        raise

            # Create regular service first, then the headless one if distributed.
            create_service(service, f"service {service_name}")
            if headless_service:
                create_service(headless_service, f"headless service {service_name}-headless")

            # Create Deployment
            created_deployment = self.apps_v1_api.create_namespaced_deployment(
                namespace=self.namespace,
                body=deployment,
                **kwargs,
            )

            if dryrun:
                return created_deployment, False

            logger.info(f"Created Deployment {name} in namespace {self.namespace}")
            return created_deployment, True

        except client.exceptions.ApiException as e:
            if e.status != 409:
                logger.error(f"Failed to create Deployment: {str(e)}")
                raise e

            logger.info(f"Deployment {name} already exists, updating")
            existing_deployment = self.get_deployment(name)

            # Reconcile replica count if it drifted from the requested value.
            if existing_deployment.spec.replicas != replicas:
                try:
                    self.apps_v1_api.patch_namespaced_deployment(
                        name=name,
                        namespace=self.namespace,
                        body={"spec": {"replicas": replicas}},
                    )
                    logger.info(f"Updated Deployment {name} replicas to {replicas}")
                except Exception as patch_err:
                    # Distinct name avoids shadowing the outer ApiException `e`;
                    # bare raise preserves the original traceback.
                    logger.error(f"Failed to patch Deployment {name}: {patch_err}")
                    raise

            return existing_deployment, False
220
+ def get_deployment(self, deployment_name: str) -> dict:
221
+ """Retrieve a Deployment by name."""
222
+ try:
223
+ deployment = self.apps_v1_api.read_namespaced_deployment(
224
+ name=deployment_name,
225
+ namespace=self.namespace,
226
+ )
227
+ return deployment
228
+ except client.exceptions.ApiException as e:
229
+ logger.error(f"Failed to load Deployment '{deployment_name}': {str(e)}")
230
+ raise
231
+
232
+ def get_deployment_timestamp_annotation(self, service_name: str) -> Optional[str]:
233
+ """Get deployment timestamp annotation for Deployment services."""
234
+ try:
235
+ deployment = self.get_deployment(service_name)
236
+ if deployment and hasattr(deployment, "metadata") and hasattr(deployment.metadata, "annotations"):
237
+ return deployment.metadata.annotations.get("kubetorch.com/deployment_timestamp", None)
238
+ except client.exceptions.ApiException:
239
+ pass
240
+ return None
241
+
242
+ def update_deployment_timestamp_annotation(self, service_name: str, new_timestamp: str) -> str:
243
+ """Update deployment timestamp annotation for Deployment services."""
244
+ try:
245
+ patch_body = {"metadata": {"annotations": {"kubetorch.com/deployment_timestamp": new_timestamp}}}
246
+ self.apps_v1_api.patch_namespaced_deployment(
247
+ name=service_name,
248
+ namespace=self.namespace,
249
+ body=patch_body,
250
+ )
251
+ return new_timestamp
252
+ except client.exceptions.ApiException as e:
253
+ logger.error(f"Failed to update deployment timestamp for '{service_name}': {str(e)}")
254
+ raise
255
+
256
+ def create_or_update_service(
257
+ self,
258
+ service_name: str,
259
+ module_name: str,
260
+ pod_template: dict,
261
+ replicas: int = 1,
262
+ inactivity_ttl: str = None,
263
+ custom_labels: dict = None,
264
+ custom_annotations: dict = None,
265
+ custom_template: dict = None,
266
+ scheduler_name: str = None,
267
+ queue_name: str = None,
268
+ dryrun: bool = False,
269
+ **kwargs, # Ignore Knative-specific args like autoscaling_config, inactivity_ttl, etc.
270
+ ):
271
+ """
272
+ Creates a Deployment service.
273
+
274
+ Args:
275
+ service_name (str): Name for the pod/service.
276
+ module_name (str): Name of the module.
277
+ pod_template (dict): Template for the pod, including resource requirements.
278
+ replicas (int): Number of replicas for the service
279
+ custom_labels (dict, optional): Custom labels to add to the service.
280
+ custom_annotations (dict, optional): Custom annotations to add to the service.
281
+ custom_template (dict, optional): Custom template to apply to the service.
282
+ dryrun (bool, optional): Whether to run in dryrun mode (Default: `False`).
283
+ """
284
+ logger.info(f"Deploying Kubetorch service with name: {service_name}")
285
+ try:
286
+ created_service, _ = self._create_or_update_deployment(
287
+ name=service_name,
288
+ pod_template=pod_template,
289
+ module_name=module_name,
290
+ replicas=replicas,
291
+ inactivity_ttl=inactivity_ttl,
292
+ custom_labels=custom_labels,
293
+ custom_annotations=custom_annotations,
294
+ custom_template=custom_template,
295
+ scheduler_name=scheduler_name,
296
+ queue_name=queue_name,
297
+ dryrun=dryrun,
298
+ )
299
+ return created_service
300
+ except Exception as e:
301
+ logger.error(f"Failed to launch new Deployment: {str(e)}")
302
+ raise e
303
+
304
+ def get_pods_for_service(self, service_name: str, **kwargs) -> List[client.V1Pod]:
305
+ """Get all pods associated with this Deployment service.
306
+
307
+ Args:
308
+ service_name (str): Name of the service
309
+
310
+ Returns:
311
+ List[V1Pod]: List of running pods associated with the service.
312
+ """
313
+ return self.get_pods_for_service_static(
314
+ service_name=service_name,
315
+ namespace=self.namespace,
316
+ core_api=self.core_api,
317
+ )
318
+
319
+ def get_endpoint(self, service_name: str) -> str:
320
+ """Get the endpoint URL for a Deployment service."""
321
+ return f"http://{service_name}.{self.namespace}.svc.cluster.local:80"
322
+
323
+ def check_service_ready(
324
+ self,
325
+ service_name: str,
326
+ launch_timeout: int,
327
+ core_api: client.CoreV1Api = None,
328
+ **kwargs,
329
+ ) -> bool:
330
+ """Checks if the Deployment is ready to start serving requests.
331
+
332
+ Args:
333
+ service_name: Name of the Deployment service
334
+ launch_timeout: Timeout in seconds to wait for readiness
335
+ core_api: Core API instance (uses self.core_api if None)
336
+ **kwargs: Additional arguments (ignored for Deployments)
337
+
338
+ Returns:
339
+ True if service is ready
340
+
341
+ Raises:
342
+ ServiceTimeoutError: If service doesn't become ready within timeout
343
+ """
344
+ if core_api is None:
345
+ core_api = self.core_api
346
+
347
+ sleep_interval = 2
348
+ start_time = time.time()
349
+
350
+ logger.info(f"Checking Deployment {service_name} pod readiness (timeout: {launch_timeout} seconds)")
351
+
352
+ iteration = 0
353
+ while (time.time() - start_time) < launch_timeout:
354
+ iteration += 1
355
+ try:
356
+ # Get Deployment
357
+ deployment = self.get_deployment(service_name)
358
+ if not deployment:
359
+ logger.debug(f"Waiting for Deployment {service_name} to be created")
360
+ time.sleep(sleep_interval)
361
+ continue
362
+
363
+ # Check if all replicas are ready
364
+ ready_replicas = deployment.status.ready_replicas or 0
365
+ desired_replicas = deployment.spec.replicas or 0
366
+
367
+ if iteration % 3 == 0:
368
+ logger.debug(f"Deployment {service_name}: {ready_replicas}/{desired_replicas} replicas ready")
369
+
370
+ if ready_replicas >= desired_replicas and desired_replicas > 0:
371
+ logger.info(f"Deployment {service_name} pod(s) are now ready with {ready_replicas} replicas")
372
+ return True
373
+
374
+ # Check for pod-level issues
375
+ pods = self.get_pods_for_service(service_name)
376
+ for pod in pods:
377
+ # Check for image pull errors in container status
378
+ check_pod_status_for_errors(pod)
379
+
380
+ # Check pod events separately from the core API
381
+ check_pod_events_for_errors(pod, self.namespace, core_api)
382
+
383
+ # If no pods exist, check for ReplicaSet-level errors (like PriorityClass issues)
384
+ if not pods:
385
+ check_replicaset_events_for_errors(
386
+ namespace=self.namespace,
387
+ service_name=service_name,
388
+ apps_v1_api=self.apps_v1_api,
389
+ core_api=core_api,
390
+ )
391
+
392
+ except client.exceptions.ApiException as e:
393
+ logger.error(f"Error checking Deployment readiness: {e}")
394
+ raise
395
+
396
+ if iteration % 10 == 0:
397
+ elapsed = int(time.time() - start_time)
398
+ remaining = max(0, int(launch_timeout - elapsed))
399
+ logger.info(f"Deployment is not yet ready " f"(elapsed: {elapsed}s, remaining: {remaining}s)")
400
+
401
+ time.sleep(sleep_interval)
402
+
403
+ raise ServiceTimeoutError(f"Deployment {service_name} is not ready after {launch_timeout} seconds")
404
+
405
+ def teardown_service(self, service_name: str, console=None) -> bool:
406
+ """Teardown Deployment and associated resources.
407
+
408
+ Args:
409
+ service_name: Name of the Deployment to teardown
410
+ console: Optional Rich console for output
411
+
412
+ Returns:
413
+ True if teardown was successful, False otherwise
414
+ """
415
+ from kubetorch.resources.compute.utils import delete_deployment
416
+
417
+ try:
418
+ # Delete the Deployment and its associated service
419
+ delete_deployment(
420
+ apps_v1_api=self.apps_v1_api,
421
+ core_api=self.core_api,
422
+ name=service_name,
423
+ namespace=self.namespace,
424
+ console=console,
425
+ )
426
+
427
+ return True
428
+
429
+ except Exception as e:
430
+ logger.error(f"Failed to teardown Deployment {service_name}: {e}")
431
+ return False