kubetorch 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kubetorch might be problematic. Click here for more details.

Files changed (93) hide show
  1. kubetorch/__init__.py +60 -0
  2. kubetorch/cli.py +1985 -0
  3. kubetorch/cli_utils.py +1025 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +285 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +157 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +133 -0
  30. kubetorch/resources/callables/module.py +1416 -0
  31. kubetorch/resources/callables/utils.py +174 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +261 -0
  34. kubetorch/resources/compute/compute.py +2596 -0
  35. kubetorch/resources/compute/decorators.py +139 -0
  36. kubetorch/resources/compute/rbac.py +74 -0
  37. kubetorch/resources/compute/utils.py +1114 -0
  38. kubetorch/resources/compute/websocket.py +137 -0
  39. kubetorch/resources/images/__init__.py +1 -0
  40. kubetorch/resources/images/image.py +414 -0
  41. kubetorch/resources/images/images.py +74 -0
  42. kubetorch/resources/secrets/__init__.py +2 -0
  43. kubetorch/resources/secrets/kubernetes_secrets_client.py +412 -0
  44. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  45. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  46. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  47. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  48. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  49. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  50. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  51. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  52. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  53. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  54. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  55. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  56. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  57. kubetorch/resources/secrets/provider_secrets/providers.py +93 -0
  58. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  59. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  60. kubetorch/resources/secrets/secret.py +238 -0
  61. kubetorch/resources/secrets/secret_factory.py +70 -0
  62. kubetorch/resources/secrets/utils.py +209 -0
  63. kubetorch/resources/volumes/__init__.py +0 -0
  64. kubetorch/resources/volumes/volume.py +365 -0
  65. kubetorch/servers/__init__.py +0 -0
  66. kubetorch/servers/http/__init__.py +0 -0
  67. kubetorch/servers/http/distributed_utils.py +3223 -0
  68. kubetorch/servers/http/http_client.py +730 -0
  69. kubetorch/servers/http/http_server.py +1788 -0
  70. kubetorch/servers/http/server_metrics.py +278 -0
  71. kubetorch/servers/http/utils.py +728 -0
  72. kubetorch/serving/__init__.py +0 -0
  73. kubetorch/serving/autoscaling.py +173 -0
  74. kubetorch/serving/base_service_manager.py +363 -0
  75. kubetorch/serving/constants.py +83 -0
  76. kubetorch/serving/deployment_service_manager.py +478 -0
  77. kubetorch/serving/knative_service_manager.py +519 -0
  78. kubetorch/serving/raycluster_service_manager.py +582 -0
  79. kubetorch/serving/service_manager.py +18 -0
  80. kubetorch/serving/templates/deployment_template.yaml +17 -0
  81. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  82. kubetorch/serving/templates/kt_setup_template.sh.j2 +81 -0
  83. kubetorch/serving/templates/pod_template.yaml +194 -0
  84. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  85. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  86. kubetorch/serving/templates/service_template.yaml +21 -0
  87. kubetorch/serving/templates/workerset_template.yaml +36 -0
  88. kubetorch/serving/utils.py +377 -0
  89. kubetorch/utils.py +284 -0
  90. kubetorch-0.2.0.dist-info/METADATA +121 -0
  91. kubetorch-0.2.0.dist-info/RECORD +93 -0
  92. kubetorch-0.2.0.dist-info/WHEEL +4 -0
  93. kubetorch-0.2.0.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,478 @@
1
+ import os
2
+ import re
3
+ import time
4
+ from datetime import datetime, timezone
5
+ from typing import List, Optional, Tuple
6
+
7
+ from kubernetes import client
8
+
9
+ import kubetorch.serving.constants as serving_constants
10
+ from kubetorch.logger import get_logger
11
+ from kubetorch.resources.compute.utils import (
12
+ check_pod_events_for_errors,
13
+ check_pod_status_for_errors,
14
+ check_replicaset_events_for_errors,
15
+ ServiceTimeoutError,
16
+ )
17
+ from kubetorch.servers.http.utils import load_template
18
+ from kubetorch.serving.base_service_manager import BaseServiceManager
19
+ from kubetorch.serving.utils import nested_override
20
+
21
+ logger = get_logger(__name__)
22
+
23
+
24
class DeploymentServiceManager(BaseServiceManager):
    """Service manager for Kubernetes Deployments with distributed computing support."""

    def _create_or_update_deployment(
        self,
        name: str,
        module_name: str,
        pod_template: dict,
        replicas: int = 1,
        inactivity_ttl: str = None,
        custom_labels: dict = None,
        custom_annotations: dict = None,
        custom_template: dict = None,
        scheduler_name: str = None,
        queue_name: str = None,
        dryrun: bool = False,
    ) -> Tuple[dict, bool]:
        """Creates or updates a Deployment for distributed deployments.

        Renders and creates the backing ClusterIP Service (plus, for distributed
        modules, a second headless Service used for pod discovery) before
        creating the Deployment itself. A 409 on either Service creation is
        treated as "already exists" and reused; a 409 on the Deployment switches
        to update mode, where only the replica count is reconciled.

        Args:
            name (str): Deployment / Service name.
            module_name (str): Module name; sanitized into a k8s label value.
            pod_template (dict): Pod spec template inserted into the Deployment.
            replicas (int): Desired replica count. (Default: 1)
            inactivity_ttl (str, optional): Idle TTL recorded as an annotation.
            custom_labels (dict, optional): Extra labels merged into all resources.
            custom_annotations (dict, optional): Extra annotations merged in.
            custom_template (dict, optional): Overrides applied onto the rendered
                Deployment dict via ``nested_override``.
            scheduler_name (str, optional): Scheduler name; together with
                ``queue_name`` enables the KAI queue labels.
            queue_name (str, optional): KAI scheduler queue name.
            dryrun (bool, optional): If True, send requests with ``dry_run="All"``
                so nothing is persisted. (Default: ``False``)

        Returns:
            Tuple (created_deployment, is_new_deployment)
        """
        # Label values must be alphanumerics plus "-" and ".", and must not
        # start or end with "-" or "." - strip anything else.
        clean_module_name = re.sub(r"[^A-Za-z0-9.-]|^[-.]|[-.]$", "", module_name)
        service_name = name  # Use regular service name, not headless

        labels = {
            **self.base_labels,
            serving_constants.KT_MODULE_LABEL: clean_module_name,
            serving_constants.KT_SERVICE_LABEL: name,
            serving_constants.KT_TEMPLATE_LABEL: "deployment",  # Mark as source-of-truth
        }
        if custom_labels:
            labels.update(custom_labels)

        # Template labels (exclude template label - that's only for the top-level resource)
        template_labels = {
            **self.base_labels,
            serving_constants.KT_MODULE_LABEL: clean_module_name,
            serving_constants.KT_SERVICE_LABEL: name,
        }
        if custom_labels:
            template_labels.update(custom_labels)

        # Service labels (also exclude template label - supporting resource, not source-of-truth)
        service_labels = {
            **self.base_labels,
            serving_constants.KT_MODULE_LABEL: clean_module_name,
            serving_constants.KT_SERVICE_LABEL: name,
        }
        if custom_labels:
            service_labels.update(custom_labels)

        # Prometheus scrape annotations go on every resource created here.
        annotations = {
            "prometheus.io/scrape": "true",
            "prometheus.io/path": serving_constants.PROMETHEUS_HEALTH_ENDPOINT,
            "prometheus.io/port": "8080",
        }
        if custom_annotations:
            annotations.update(custom_annotations)

        if inactivity_ttl:
            annotations[serving_constants.INACTIVITY_TTL_ANNOTATION] = inactivity_ttl
            logger.info(f"Configuring auto-down after idle timeout ({inactivity_ttl})")

        if scheduler_name and queue_name:
            labels["kai.scheduler/queue"] = queue_name  # Useful for queries, etc
            template_labels[
                "kai.scheduler/queue"
            ] = queue_name  # Required for KAI to schedule pods

        # Stamp the pod template with a deploy timestamp so each redeploy
        # produces a template change.
        deployment_timestamp = datetime.now(timezone.utc).isoformat()
        template_annotations = {
            "kubetorch.com/deployment_timestamp": deployment_timestamp
        }

        # Create Deployment
        deployment = load_template(
            template_file=serving_constants.DEPLOYMENT_TEMPLATE_FILE,
            template_dir=os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "templates"
            ),
            name=name,
            namespace=self.namespace,
            annotations=annotations,
            template_annotations=template_annotations,
            labels=labels,
            template_labels=template_labels,
            pod_template=pod_template,
            replicas=replicas,
        )

        if custom_template:
            nested_override(deployment, custom_template)

        # Check if this is a distributed deployment: the first container carries
        # a non-empty, non-"null" KT_DISTRIBUTED_CONFIG env var.
        env_vars = pod_template.get("containers", [{}])[0].get("env", [])
        is_distributed = any(
            env.get("name") == "KT_DISTRIBUTED_CONFIG"
            and env.get("value") != "null"
            and env.get("value")
            for env in env_vars
        )

        # Create regular service with session affinity
        service = load_template(
            template_file=serving_constants.DEPLOYMENT_SERVICE_TEMPLATE_FILE,
            template_dir=os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "templates"
            ),
            name=service_name,
            namespace=self.namespace,
            annotations=annotations,
            labels=service_labels,
            deployment_name=name,
            module_name=clean_module_name,
            distributed=False,  # Keep regular service for client access
            server_port=pod_template.get("containers", [{}])[0]
            .get("ports", [{}])[0]
            .get("containerPort", 32300),
        )

        # For distributed deployments, also create a headless service for pod discovery
        headless_service = None
        if is_distributed:
            headless_service = load_template(
                template_file=serving_constants.DEPLOYMENT_SERVICE_TEMPLATE_FILE,
                template_dir=os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), "templates"
                ),
                name=f"{service_name}-headless",  # Use different name for headless
                namespace=self.namespace,
                annotations=annotations,
                labels=service_labels,
                deployment_name=name,
                module_name=clean_module_name,
                distributed=True,  # Make this one headless
                server_port=pod_template.get("containers", [{}])[0]
                .get("ports", [{}])[0]
                .get("containerPort", 32300),
            )

        try:
            kwargs = {"dry_run": "All"} if dryrun else {}

            # Create regular service first
            try:
                self.core_api.create_namespaced_service(
                    namespace=self.namespace,
                    body=service,
                    **kwargs,
                )
                if not dryrun:
                    logger.info(
                        f"Created service {service_name} in namespace {self.namespace}"
                    )
            except client.exceptions.ApiException as e:
                if e.status == 409:
                    # Service left over from a previous deploy; reuse it as-is.
                    logger.info(f"Service {service_name} already exists")
                else:
                    raise

            # Create headless service for distributed pod discovery
            if headless_service:
                try:
                    self.core_api.create_namespaced_service(
                        namespace=self.namespace,
                        body=headless_service,
                        **kwargs,
                    )
                    if not dryrun:
                        logger.info(
                            f"Created headless service {service_name}-headless in namespace {self.namespace}"
                        )
                except client.exceptions.ApiException as e:
                    if e.status == 409:
                        logger.info(
                            f"Headless service {service_name}-headless already exists"
                        )
                    else:
                        raise

            # Create Deployment
            created_deployment = self.apps_v1_api.create_namespaced_deployment(
                namespace=self.namespace,
                body=deployment,
                **kwargs,
            )

            if dryrun:
                # Server-side validation only; nothing was persisted.
                return created_deployment, False

            logger.info(f"Created Deployment {name} in namespace {self.namespace}")
            return created_deployment, True

        except client.exceptions.ApiException as e:
            if e.status == 409:
                # Deployment already exists: reconcile the replica count only.
                # NOTE(review): labels, annotations, and the pod template are NOT
                # patched on this path - confirm that is intentional.
                logger.info(f"Deployment {name} already exists, updating")
                existing_deployment = self.get_deployment(name)

                # Update replicas if different
                if existing_deployment.spec.replicas != replicas:
                    patch_body = {"spec": {"replicas": replicas}}
                    try:
                        self.apps_v1_api.patch_namespaced_deployment(
                            name=name,
                            namespace=self.namespace,
                            body=patch_body,
                        )
                        logger.info(f"Updated Deployment {name} replicas to {replicas}")
                    except Exception as e:
                        logger.error(f"Failed to patch Deployment {name}: {e}")
                        raise e

                return existing_deployment, False
            else:
                logger.error(f"Failed to create Deployment: {str(e)}")
                raise e
241
+
242
+ def get_deployment(self, deployment_name: str) -> dict:
243
+ """Retrieve a Deployment by name."""
244
+ try:
245
+ deployment = self.apps_v1_api.read_namespaced_deployment(
246
+ name=deployment_name,
247
+ namespace=self.namespace,
248
+ )
249
+ return deployment
250
+ except client.exceptions.ApiException as e:
251
+ logger.error(f"Failed to load Deployment '{deployment_name}': {str(e)}")
252
+ raise
253
+
254
+ def get_deployment_timestamp_annotation(self, service_name: str) -> Optional[str]:
255
+ """Get deployment timestamp annotation for Deployment services."""
256
+ try:
257
+ deployment = self.get_deployment(service_name)
258
+ if (
259
+ deployment
260
+ and hasattr(deployment, "metadata")
261
+ and hasattr(deployment.metadata, "annotations")
262
+ ):
263
+ return deployment.metadata.annotations.get(
264
+ "kubetorch.com/deployment_timestamp", None
265
+ )
266
+ except client.exceptions.ApiException:
267
+ pass
268
+ return None
269
+
270
+ def update_deployment_timestamp_annotation(
271
+ self, service_name: str, new_timestamp: str
272
+ ) -> str:
273
+ """Update deployment timestamp annotation for Deployment services."""
274
+ try:
275
+ patch_body = {
276
+ "metadata": {
277
+ "annotations": {"kubetorch.com/deployment_timestamp": new_timestamp}
278
+ }
279
+ }
280
+ self.apps_v1_api.patch_namespaced_deployment(
281
+ name=service_name,
282
+ namespace=self.namespace,
283
+ body=patch_body,
284
+ )
285
+ return new_timestamp
286
+ except client.exceptions.ApiException as e:
287
+ logger.error(
288
+ f"Failed to update deployment timestamp for '{service_name}': {str(e)}"
289
+ )
290
+ raise
291
+
292
+ def create_or_update_service(
293
+ self,
294
+ service_name: str,
295
+ module_name: str,
296
+ pod_template: dict,
297
+ replicas: int = 1,
298
+ inactivity_ttl: str = None,
299
+ custom_labels: dict = None,
300
+ custom_annotations: dict = None,
301
+ custom_template: dict = None,
302
+ scheduler_name: str = None,
303
+ queue_name: str = None,
304
+ dryrun: bool = False,
305
+ **kwargs, # Ignore Knative-specific args like autoscaling_config, inactivity_ttl, etc.
306
+ ):
307
+ """
308
+ Creates a Deployment service.
309
+
310
+ Args:
311
+ service_name (str): Name for the pod/service.
312
+ module_name (str): Name of the module.
313
+ pod_template (dict): Template for the pod, including resource requirements.
314
+ replicas (int): Number of replicas for the service
315
+ custom_labels (dict, optional): Custom labels to add to the service.
316
+ custom_annotations (dict, optional): Custom annotations to add to the service.
317
+ custom_template (dict, optional): Custom template to apply to the service.
318
+ dryrun (bool, optional): Whether to run in dryrun mode (Default: `False`).
319
+ """
320
+ logger.info(f"Deploying Kubetorch service with name: {service_name}")
321
+ try:
322
+ created_service, _ = self._create_or_update_deployment(
323
+ name=service_name,
324
+ pod_template=pod_template,
325
+ module_name=module_name,
326
+ replicas=replicas,
327
+ inactivity_ttl=inactivity_ttl,
328
+ custom_labels=custom_labels,
329
+ custom_annotations=custom_annotations,
330
+ custom_template=custom_template,
331
+ scheduler_name=scheduler_name,
332
+ queue_name=queue_name,
333
+ dryrun=dryrun,
334
+ )
335
+ return created_service
336
+ except Exception as e:
337
+ logger.error(f"Failed to launch new Deployment: {str(e)}")
338
+ raise e
339
+
340
+ def get_pods_for_service(self, service_name: str, **kwargs) -> List[client.V1Pod]:
341
+ """Get all pods associated with this Deployment service.
342
+
343
+ Args:
344
+ service_name (str): Name of the service
345
+
346
+ Returns:
347
+ List[V1Pod]: List of running pods associated with the service.
348
+ """
349
+ return self.get_pods_for_service_static(
350
+ service_name=service_name,
351
+ namespace=self.namespace,
352
+ core_api=self.core_api,
353
+ )
354
+
355
+ def get_endpoint(self, service_name: str) -> str:
356
+ """Get the endpoint URL for a Deployment service."""
357
+ return f"http://{service_name}.{self.namespace}.svc.cluster.local:80"
358
+
359
    def check_service_ready(
        self,
        service_name: str,
        launch_timeout: int,
        core_api: client.CoreV1Api = None,
        **kwargs,
    ) -> bool:
        """Checks if the Deployment is ready to start serving requests.

        Polls the Deployment status every couple of seconds until all desired
        replicas report ready, surfacing pod- and ReplicaSet-level errors
        (image pull failures, bad events) as exceptions along the way.

        Args:
            service_name: Name of the Deployment service
            launch_timeout: Timeout in seconds to wait for readiness
            core_api: Core API instance (uses self.core_api if None)
            **kwargs: Additional arguments (ignored for Deployments)

        Returns:
            True if service is ready

        Raises:
            ServiceTimeoutError: If service doesn't become ready within timeout
        """
        if core_api is None:
            core_api = self.core_api

        # Poll cadence in seconds; log lines below are throttled off `iteration`
        # so the log stays readable over long waits.
        sleep_interval = 2
        start_time = time.time()

        logger.info(
            f"Checking Deployment {service_name} pod readiness (timeout: {launch_timeout} seconds)"
        )

        iteration = 0
        while (time.time() - start_time) < launch_timeout:
            iteration += 1
            try:
                # Get Deployment
                # NOTE(review): get_deployment re-raises ApiException on 404, so
                # a not-yet-created Deployment raises here rather than hitting
                # the "waiting to be created" branch below - confirm intended.
                deployment = self.get_deployment(service_name)
                if not deployment:
                    logger.debug(f"Waiting for Deployment {service_name} to be created")
                    time.sleep(sleep_interval)
                    continue

                # Check if all replicas are ready. Status fields may be None
                # before the controller populates them, hence the `or 0`.
                ready_replicas = deployment.status.ready_replicas or 0
                desired_replicas = deployment.spec.replicas or 0

                if iteration % 3 == 0:
                    logger.debug(
                        f"Deployment {service_name}: {ready_replicas}/{desired_replicas} replicas ready"
                    )

                # desired_replicas > 0 guards against declaring a zero-replica
                # Deployment "ready".
                if ready_replicas >= desired_replicas and desired_replicas > 0:
                    logger.info(
                        f"Deployment {service_name} pod(s) are now ready with {ready_replicas} replicas"
                    )
                    return True

                # Check for pod-level issues; these helpers raise on fatal
                # conditions so we fail fast instead of waiting out the timeout.
                pods = self.get_pods_for_service(service_name)
                for pod in pods:
                    # Check for image pull errors in container status
                    check_pod_status_for_errors(pod)

                    # Check pod events separately from the core API
                    check_pod_events_for_errors(pod, self.namespace, core_api)

                # If no pods exist, check for ReplicaSet-level errors (like PriorityClass issues)
                if not pods:
                    check_replicaset_events_for_errors(
                        namespace=self.namespace,
                        service_name=service_name,
                        apps_v1_api=self.apps_v1_api,
                        core_api=core_api,
                    )

            except client.exceptions.ApiException as e:
                logger.error(f"Error checking Deployment readiness: {e}")
                raise

            # Periodic progress log (~every 20s at the 2s poll interval).
            if iteration % 10 == 0:
                elapsed = int(time.time() - start_time)
                remaining = max(0, int(launch_timeout - elapsed))
                logger.info(
                    f"Deployment is not yet ready "
                    f"(elapsed: {elapsed}s, remaining: {remaining}s)"
                )

            time.sleep(sleep_interval)

        raise ServiceTimeoutError(
            f"Deployment {service_name} is not ready after {launch_timeout} seconds"
        )
451
+
452
+ def teardown_service(self, service_name: str, console=None) -> bool:
453
+ """Teardown Deployment and associated resources.
454
+
455
+ Args:
456
+ service_name: Name of the Deployment to teardown
457
+ console: Optional Rich console for output
458
+
459
+ Returns:
460
+ True if teardown was successful, False otherwise
461
+ """
462
+ from kubetorch.resources.compute.utils import delete_deployment
463
+
464
+ try:
465
+ # Delete the Deployment and its associated service
466
+ delete_deployment(
467
+ apps_v1_api=self.apps_v1_api,
468
+ core_api=self.core_api,
469
+ name=service_name,
470
+ namespace=self.namespace,
471
+ console=console,
472
+ )
473
+
474
+ return True
475
+
476
+ except Exception as e:
477
+ logger.error(f"Failed to teardown Deployment {service_name}: {e}")
478
+ return False