kubetorch-0.2.5-py3-none-any.whl

This diff shows the contents of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between released versions.
Files changed (92)
  1. kubetorch/__init__.py +59 -0
  2. kubetorch/cli.py +1939 -0
  3. kubetorch/cli_utils.py +967 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +269 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +159 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +140 -0
  30. kubetorch/resources/callables/module.py +1315 -0
  31. kubetorch/resources/callables/utils.py +203 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +253 -0
  34. kubetorch/resources/compute/compute.py +2414 -0
  35. kubetorch/resources/compute/decorators.py +137 -0
  36. kubetorch/resources/compute/utils.py +1026 -0
  37. kubetorch/resources/compute/websocket.py +135 -0
  38. kubetorch/resources/images/__init__.py +1 -0
  39. kubetorch/resources/images/image.py +412 -0
  40. kubetorch/resources/images/images.py +64 -0
  41. kubetorch/resources/secrets/__init__.py +2 -0
  42. kubetorch/resources/secrets/kubernetes_secrets_client.py +377 -0
  43. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  44. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  45. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  46. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  47. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  48. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  49. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  50. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  51. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  52. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  53. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  54. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  55. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  56. kubetorch/resources/secrets/provider_secrets/providers.py +92 -0
  57. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  58. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  59. kubetorch/resources/secrets/secret.py +224 -0
  60. kubetorch/resources/secrets/secret_factory.py +64 -0
  61. kubetorch/resources/secrets/utils.py +222 -0
  62. kubetorch/resources/volumes/__init__.py +0 -0
  63. kubetorch/resources/volumes/volume.py +340 -0
  64. kubetorch/servers/__init__.py +0 -0
  65. kubetorch/servers/http/__init__.py +0 -0
  66. kubetorch/servers/http/distributed_utils.py +2968 -0
  67. kubetorch/servers/http/http_client.py +802 -0
  68. kubetorch/servers/http/http_server.py +1622 -0
  69. kubetorch/servers/http/server_metrics.py +255 -0
  70. kubetorch/servers/http/utils.py +722 -0
  71. kubetorch/serving/__init__.py +0 -0
  72. kubetorch/serving/autoscaling.py +153 -0
  73. kubetorch/serving/base_service_manager.py +344 -0
  74. kubetorch/serving/constants.py +77 -0
  75. kubetorch/serving/deployment_service_manager.py +431 -0
  76. kubetorch/serving/knative_service_manager.py +487 -0
  77. kubetorch/serving/raycluster_service_manager.py +526 -0
  78. kubetorch/serving/service_manager.py +18 -0
  79. kubetorch/serving/templates/deployment_template.yaml +17 -0
  80. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  81. kubetorch/serving/templates/kt_setup_template.sh.j2 +91 -0
  82. kubetorch/serving/templates/pod_template.yaml +198 -0
  83. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  84. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  85. kubetorch/serving/templates/service_template.yaml +21 -0
  86. kubetorch/serving/templates/workerset_template.yaml +36 -0
  87. kubetorch/serving/utils.py +344 -0
  88. kubetorch/utils.py +263 -0
  89. kubetorch-0.2.5.dist-info/METADATA +75 -0
  90. kubetorch-0.2.5.dist-info/RECORD +92 -0
  91. kubetorch-0.2.5.dist-info/WHEEL +4 -0
  92. kubetorch-0.2.5.dist-info/entry_points.txt +5 -0
--- /dev/null
+++ b/kubetorch/serving/raycluster_service_manager.py
@@ -0,0 +1,526 @@
+ import os
+ import re
+ import time
+ from datetime import datetime, timezone
+ from typing import List, Optional, Tuple
+
+ from kubernetes import client
+
+ import kubetorch.serving.constants as serving_constants
+ from kubetorch.logger import get_logger
+ from kubetorch.servers.http.utils import load_template
+ from kubetorch.serving.base_service_manager import BaseServiceManager
+ from kubetorch.serving.utils import nested_override
+
+ logger = get_logger(__name__)
+
+
+ class RayClusterServiceManager(BaseServiceManager):
+     """Service manager for Ray clusters with distributed Ray workload support."""
+
+     def _create_or_update_raycluster(
+         self,
+         name: str,
+         module_name: str,
+         pod_template: dict,
+         replicas: int = 1,
+         inactivity_ttl: str = None,
+         custom_labels: dict = None,
+         custom_annotations: dict = None,
+         custom_template: dict = None,
+         dryrun: bool = False,
+     ) -> Tuple[dict, bool]:
+         """Creates or updates a RayCluster for Ray distributed workloads.
+
+         Returns:
+             Tuple (created_raycluster, is_new_raycluster)
+         """
+         clean_module_name = re.sub(r"[^A-Za-z0-9.-]|^[-.]|[-.]$", "", module_name)
+
+         labels = {
+             **self.base_labels,
+             serving_constants.KT_MODULE_LABEL: clean_module_name,
+             serving_constants.KT_SERVICE_LABEL: name,
+             serving_constants.KT_TEMPLATE_LABEL: "raycluster",  # Mark as source-of-truth
+         }
+         if custom_labels:
+             labels.update(custom_labels)
+
+         # Template labels (exclude template label - that's only for the top-level resource)
+         # Add ray-node-type label to distinguish head from worker nodes
+         template_labels = {
+             **self.base_labels,
+             serving_constants.KT_MODULE_LABEL: clean_module_name,
+             serving_constants.KT_SERVICE_LABEL: name,
+         }
+         if custom_labels:
+             template_labels.update(custom_labels)
+
+         # Head node specific labels (for service selector)
+         head_template_labels = {
+             **template_labels,
+             "ray.io/node-type": "head",  # KubeRay standard label
+         }
+
+         # Worker node specific labels
+         worker_template_labels = {
+             **template_labels,
+             "ray.io/node-type": "worker",  # KubeRay standard label
+         }
+
+         annotations = {
+             "prometheus.io/scrape": "true",
+             "prometheus.io/path": serving_constants.PROMETHEUS_HEALTH_ENDPOINT,
+             "prometheus.io/port": "8080",
+             "ray.io/overwrite-container-cmd": "true",
+         }
+         if custom_annotations:
+             annotations.update(custom_annotations)
+
+         deployment_timestamp = datetime.now(timezone.utc).isoformat()
+         template_annotations = {"kubetorch.com/deployment_timestamp": deployment_timestamp}
+
+         if inactivity_ttl:
+             annotations[serving_constants.INACTIVITY_TTL_ANNOTATION] = inactivity_ttl
+             logger.info(f"Configuring auto-down after idle timeout ({inactivity_ttl})")
+
+         # Create RayCluster
+         worker_replicas = max(0, replicas - 1)  # Head node counts as 1 replica
+         raycluster = load_template(
+             template_file=serving_constants.RAYCLUSTER_TEMPLATE_FILE,
+             template_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates"),
+             name=name,
+             namespace=self.namespace,
+             annotations=annotations,
+             template_annotations=template_annotations,
+             labels=labels,
+             head_template_labels=head_template_labels,
+             worker_template_labels=worker_template_labels,
+             pod_template=pod_template,
+             worker_replicas=worker_replicas,
+         )
+
+         # Create Kubernetes Service pointing to head node HTTP server (like Deployments)
+         service_labels = {
+             **self.base_labels,
+             serving_constants.KT_MODULE_LABEL: clean_module_name,
+             serving_constants.KT_SERVICE_LABEL: name,
+         }
+         if custom_labels:
+             service_labels.update(custom_labels)
+
+         # Ray clusters are always distributed, so we need headless services for pod discovery
+         # Create regular service for client access (head node only)
+         service = load_template(
+             template_file=serving_constants.RAYCLUSTER_SERVICE_TEMPLATE_FILE,
+             template_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates"),
+             name=name,
+             namespace=self.namespace,
+             annotations=annotations,
+             labels=service_labels,
+             deployment_name=name,  # Use same parameter name as deployment for compatibility
+             module_name=clean_module_name,
+             distributed=False,  # Keep regular service for client access
+             server_port=pod_template.get("containers", [{}])[0].get("ports", [{}])[0].get("containerPort", 32300),
+         )
+
+         # Create headless service for Ray pod discovery (all nodes)
+         headless_service_labels = service_labels.copy()
+         headless_service = load_template(
+             template_file=serving_constants.RAYCLUSTER_SERVICE_TEMPLATE_FILE,
+             template_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates"),
+             name=f"{name}-headless",
+             namespace=self.namespace,
+             annotations=annotations,
+             labels=headless_service_labels,
+             deployment_name=name,
+             module_name=clean_module_name,
+             distributed=True,  # Make headless for pod discovery
+             server_port=pod_template.get("containers", [{}])[0].get("ports", [{}])[0].get("containerPort", 32300),
+         )
+
+         # For headless service, select all Ray nodes (not just head)
+         headless_service["spec"]["selector"].pop("ray.io/node-type", None)
+
+         if custom_template:
+             nested_override(raycluster, custom_template)
+
+         try:
+             kwargs = {"dry_run": "All"} if dryrun else {}
+
+             # Create Kubernetes Service first (regular service for client access)
+             try:
+                 self.core_api.create_namespaced_service(
+                     namespace=self.namespace,
+                     body=service,
+                     **kwargs,
+                 )
+                 if not dryrun:
+                     logger.info(f"Created service {name} in namespace {self.namespace}")
+             except client.exceptions.ApiException as e:
+                 if e.status == 409:
+                     logger.info(f"Service {name} already exists")
+                 else:
+                     raise
+
+             # Create headless service for Ray pod discovery (all nodes)
+             try:
+                 self.core_api.create_namespaced_service(
+                     namespace=self.namespace,
+                     body=headless_service,
+                     **kwargs,
+                 )
+                 if not dryrun:
+                     logger.info(f"Created headless service {name}-headless in namespace {self.namespace}")
+             except client.exceptions.ApiException as e:
+                 if e.status == 409:
+                     logger.info(f"Headless service {name}-headless already exists")
+                 else:
+                     raise
+
+             # Create RayCluster
+             created_raycluster = None
+             try:
+                 created_raycluster = self.objects_api.create_namespaced_custom_object(
+                     group="ray.io",
+                     version="v1",
+                     namespace=self.namespace,
+                     plural="rayclusters",
+                     body=raycluster,
+                     **kwargs,
+                 )
+             except client.exceptions.ApiException as e:
+                 if e.status == 404:
+                     logger.error(
+                         "RayCluster Custom Resource Definition (CRD) not found, please install the KubeRay operator"
+                     )
+                 raise e
+
+             if dryrun:
+                 return created_raycluster, False
+
+             logger.info(f"Created RayCluster {name} in namespace {self.namespace}")
+             return created_raycluster, True
+
+         except client.exceptions.ApiException as e:
+             if e.status == 409:
+                 logger.info(f"RayCluster {name} already exists, updating")
+                 try:
+                     # For RayCluster, we can patch the spec
+                     patch_body = {"spec": raycluster["spec"]}
+                     updated_raycluster = self.objects_api.patch_namespaced_custom_object(
+                         group="ray.io",
+                         version="v1",
+                         namespace=self.namespace,
+                         plural="rayclusters",
+                         name=name,
+                         body=patch_body,
+                     )
+                     logger.info(f"Updated RayCluster {name}")
+                     return updated_raycluster, False
+                 except Exception as patch_error:
+                     logger.error(f"Failed to patch RayCluster {name}: {patch_error}")
+                     raise patch_error
+
+             raise e
+
+     def get_raycluster(self, raycluster_name: str) -> dict:
+         """Retrieve a RayCluster by name."""
+         try:
+             raycluster = self.objects_api.get_namespaced_custom_object(
+                 group="ray.io",
+                 version="v1",
+                 namespace=self.namespace,
+                 plural="rayclusters",
+                 name=raycluster_name,
+             )
+             return raycluster
+         except client.exceptions.ApiException as e:
+             logger.error(f"Failed to load RayCluster '{raycluster_name}': {str(e)}")
+             raise
+
+     def get_deployment_timestamp_annotation(self, service_name: str) -> Optional[str]:
+         """Get deployment timestamp annotation for RayCluster services."""
+         try:
+             raycluster = self.get_raycluster(service_name)
+             if raycluster:
+                 return (
+                     raycluster.get("metadata", {})
+                     .get("annotations", {})
+                     .get("kubetorch.com/deployment_timestamp", None)
+                 )
+         except client.exceptions.ApiException:
+             pass
+         return None
+
+     def update_deployment_timestamp_annotation(self, service_name: str, new_timestamp: str) -> str:
+         """Update deployment timestamp annotation for RayCluster services."""
+         try:
+             patch_body = {"metadata": {"annotations": {"kubetorch.com/deployment_timestamp": new_timestamp}}}
+             self.objects_api.patch_namespaced_custom_object(
+                 group="ray.io",
+                 version="v1",
+                 namespace=self.namespace,
+                 plural="rayclusters",
+                 name=service_name,
+                 body=patch_body,
+             )
+             return new_timestamp
+         except client.exceptions.ApiException as e:
+             logger.error(f"Failed to update deployment timestamp for RayCluster '{service_name}': {str(e)}")
+             raise
+
+     def create_or_update_service(
+         self,
+         service_name: str,
+         module_name: str,
+         pod_template: dict,
+         replicas: int = 1,
+         inactivity_ttl: str = None,
+         custom_labels: dict = None,
+         custom_annotations: dict = None,
+         custom_template: dict = None,
+         dryrun: bool = False,
+         **kwargs,  # Ignore Knative-specific args like autoscaling_config, etc.
+     ):
+         """
+         Creates a RayCluster service.
+
+         Args:
+             service_name (str): Name for the RayCluster.
+             module_name (str): Name of the module.
+             pod_template (dict): Template for the pod, including resource requirements.
+             replicas (int): Number of replicas for the service (head + workers).
+             inactivity_ttl (str, optional): Idle duration after which the cluster is automatically downed.
+             custom_labels (dict, optional): Custom labels to add to the service.
+             custom_annotations (dict, optional): Custom annotations to add to the service.
+             custom_template (dict, optional): Custom template to apply to the service.
+             dryrun (bool, optional): Whether to run in dryrun mode (Default: `False`).
+         """
+         logger.info(f"Deploying Kubetorch RayCluster service with name: {service_name}")
+         try:
+             created_service, is_new_service = self._create_or_update_raycluster(
+                 name=service_name,
+                 pod_template=pod_template,
+                 module_name=module_name,
+                 replicas=replicas,
+                 inactivity_ttl=inactivity_ttl,
+                 custom_labels=custom_labels,
+                 custom_annotations=custom_annotations,
+                 custom_template=custom_template,
+                 dryrun=dryrun,
+             )
+             return created_service
+         except Exception as e:
+             logger.error(f"Failed to launch new RayCluster: {str(e)}")
+             raise e
+
+     def get_pods_for_service(self, service_name: str, **kwargs) -> List[client.V1Pod]:
+         """Get all pods associated with this RayCluster service.
+
+         Args:
+             service_name (str): Name of the service
+
+         Returns:
+             List[V1Pod]: List of running pods associated with the service.
+         """
+         return self.get_pods_for_service_static(
+             service_name=service_name,
+             namespace=self.namespace,
+             core_api=self.core_api,
+         )
+
+     def get_endpoint(self, service_name: str) -> str:
+         """Get the endpoint URL for a RayCluster service.
+
+         Returns the HTTP endpoint for the KubeTorch HTTP server running on the head node,
+         just like Deployment services.
+         """
+         return f"http://{service_name}.{self.namespace}.svc.cluster.local:80"
+
+     def check_service_ready(self, service_name: str, launch_timeout: int, **kwargs) -> bool:
+         """Checks if the RayCluster is ready to start serving requests.
+
+         Args:
+             service_name: Name of the RayCluster service
+             launch_timeout: Timeout in seconds to wait for readiness
+             **kwargs: Additional arguments (ignored for RayClusters)
+
+         Returns:
+             True if service is ready
+
+         Raises:
+             TimeoutError: If service doesn't become ready within timeout
+             RuntimeError: If RayCluster fails to start
+         """
+         sleep_interval = 2
+         start_time = time.time()
+
+         logger.info(f"Checking RayCluster {service_name} pod readiness (timeout: {launch_timeout} seconds)")
+
+         iteration = 0
+         while (time.time() - start_time) < launch_timeout:
+             iteration += 1
+             try:
+                 raycluster = self.get_raycluster(service_name)
+                 status = raycluster.get("status", {})
+
+                 # Check RayCluster state
+                 state = status.get("state", "-")
+                 if state == "ready":
+                     logger.info(f"RayCluster {service_name} is ready")
+                     return True
+                 elif state == "failed":
+                     raise RuntimeError(f"RayCluster {service_name} failed to start")
+
+                 # Calculate total expected replicas from head + all worker groups
+                 spec = raycluster.get("spec", {})
+
+                 # Head group replicas
+                 head_group_spec = spec.get("headGroupSpec", {})
+                 head_replicas = head_group_spec.get("replicas", 1)
+
+                 # Worker group replicas (sum across all worker groups)
+                 worker_groups = spec.get("workerGroupSpecs", [])
+                 worker_replicas = sum(worker_group.get("replicas", 0) for worker_group in worker_groups)
+
+                 total_expected_replicas = head_replicas + worker_replicas
+
+                 # Check pods are running
+                 pods = self.get_pods_for_service(service_name)
+                 running_pods = [pod for pod in pods if pod.status.phase == "Running"]
+
+                 # Count head and worker pods separately for better logging
+                 head_pods = [pod for pod in running_pods if pod.metadata.labels.get("ray.io/node-type") == "head"]
+                 worker_pods = [pod for pod in running_pods if pod.metadata.labels.get("ray.io/node-type") == "worker"]
+
+                 # Check for specific error conditions
+                 if head_pods:
+                     head_pod = head_pods[0]
+                     # Check for Ray installation errors in head pod
+                     ray_error = self._check_ray_installation_error(service_name, head_pod.metadata.name)
+                     if ray_error:
+                         raise RuntimeError(ray_error)
+
+                 if len(running_pods) >= total_expected_replicas:
+                     logger.info(
+                         f"RayCluster {service_name} is ready with {len(running_pods)} pods "
+                         f"({len(head_pods)} head, {len(worker_pods)} worker{'' if len(worker_pods) == 1 else 's'})"
+                     )
+                     return True
+
+                 # Log progress every 30 seconds
+                 if iteration % (30 // sleep_interval) == 0:
+                     elapsed = int(time.time() - start_time)
+                     remaining = launch_timeout - elapsed
+                     logger.info(
+                         f"RayCluster is not yet ready (elapsed: {elapsed}s, remaining: {remaining}s). "
+                         f"State: {state}, Running pods: {len(running_pods)}/{total_expected_replicas} "
+                         f"({len(head_pods)}/{head_replicas} head, {len(worker_pods)}/{worker_replicas} worker{'' if worker_replicas == 1 else 's'})"
+                     )
+
+             except RuntimeError as e:
+                 raise e
+             except Exception as e:
+                 logger.error(f"Error checking RayCluster readiness: {e}")
+
+             time.sleep(sleep_interval)
+
+         # Timeout reached
+         raise TimeoutError(f"RayCluster {service_name} did not become ready within {launch_timeout} seconds")
+
+     def teardown_service(self, service_name: str, console=None) -> bool:
+         """Teardown RayCluster and associated resources.
+
+         Args:
+             service_name: Name of the RayCluster to teardown
+             console: Optional Rich console for output
+
+         Returns:
+             True if teardown was successful, False otherwise
+         """
+         success = True
+
+         try:
+             # Delete the RayCluster
+             self.objects_api.delete_namespaced_custom_object(
+                 group="ray.io",
+                 version="v1",
+                 namespace=self.namespace,
+                 plural="rayclusters",
+                 name=service_name,
+             )
+             if console:
+                 console.print(f"✓ Deleted RayCluster [blue]{service_name}[/blue]")
+             else:
+                 logger.info(f"Deleted RayCluster {service_name}")
+
+         except client.exceptions.ApiException as e:
+             if e.status == 404:
+                 if console:
+                     console.print(f"[yellow]Note:[/yellow] RayCluster {service_name} not found or already deleted")
+                 else:
+                     logger.info(f"RayCluster {service_name} not found or already deleted")
+             else:
+                 if console:
+                     console.print(f"[red]Error:[/red] Failed to delete RayCluster {service_name}: {e}")
+                 else:
+                     logger.error(f"Failed to delete RayCluster {service_name}: {e}")
+                 success = False
+
+         try:
+             # Delete the associated Kubernetes service (created alongside RayCluster)
+             self.core_api.delete_namespaced_service(name=service_name, namespace=self.namespace)
+             if console:
+                 console.print(f"✓ Deleted service [blue]{service_name}[/blue]")
+             else:
+                 logger.info(f"Deleted service {service_name}")
+
+         except client.exceptions.ApiException as e:
+             if e.status == 404:
+                 if console:
+                     console.print(f"[yellow]Note:[/yellow] Service {service_name} not found or already deleted")
+                 else:
+                     logger.info(f"Service {service_name} not found or already deleted")
+             else:
+                 if console:
+                     console.print(f"[red]Error:[/red] Failed to delete service {service_name}: {e}")
+                 else:
+                     logger.error(f"Failed to delete service {service_name}: {e}")
+                 success = False
+
+         return success
+
+     def _check_ray_installation_error(self, service_name: str, head_pod_name: str) -> Optional[str]:
+         """Check if there's a Ray installation error in the head pod logs.
+
+         Args:
+             service_name: Name of the RayCluster service
+             head_pod_name: Name of the head pod
+
+         Returns:
+             Error message if Ray installation error is found, None otherwise
+         """
+         try:
+             head_logs = self.core_api.read_namespaced_pod_log(
+                 name=head_pod_name, namespace=self.namespace, tail_lines=100
+             )
+
+             # Check for Ray installation errors
+             if "ray: not found" in head_logs or "ray: command not found" in head_logs:
+                 return (
+                     f"RayCluster {service_name} failed to start: Ray is not installed in the container. "
+                     f"Please use a Ray-enabled image (e.g., rayproject/ray) or ensure Ray is installed in your container setup."
+                 )
+
+             # Check for Ray startup errors
+             if "Failed to start Ray server" in head_logs:
+                 return (
+                     f"RayCluster {service_name} failed to start: Ray server failed to start. "
+                     f"Check the head pod logs for more details."
+                 )
+
+         except client.exceptions.ApiException as e:
+             if e.status != 404:  # Pod might not be ready yet
+                 logger.warning(f"Could not check head pod logs: {e}")
+
+         return None
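
Note: a rough usage sketch of the manager above. The BaseServiceManager constructor is not part of this diff, so the constructor arguments and all values below are illustrative assumptions, not the package's confirmed API; only the method names and parameters match the file shown here.

# Hypothetical lifecycle sketch -- constructor kwargs are assumed, not confirmed.
from kubetorch.serving.raycluster_service_manager import RayClusterServiceManager

manager = RayClusterServiceManager(namespace="ml-team")  # assumed constructor signature

pod_template = {
    "containers": [
        {
            "name": "kt-server",
            "image": "rayproject/ray:2.9.0",  # example image, not shipped by this package
            "ports": [{"containerPort": 32300}],
        }
    ]
}

# Creates (or patches) the RayCluster plus a client Service and a headless
# discovery Service, as implemented in _create_or_update_raycluster above.
manager.create_or_update_service(
    service_name="train-job",
    module_name="my_trainer",
    pod_template=pod_template,
    replicas=3,            # 1 head + 2 workers, per the worker_replicas math above
    inactivity_ttl="30m",
)
manager.check_service_ready("train-job", launch_timeout=600)
print(manager.get_endpoint("train-job"))  # http://train-job.ml-team.svc.cluster.local:80
manager.teardown_service("train-job")
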
--- /dev/null
+++ b/kubetorch/serving/service_manager.py
@@ -0,0 +1,18 @@
+ # Backward compatibility imports - all service manager functionality is now in separate files
+ from kubetorch.logger import get_logger
+
+ # Import all service managers for backward compatibility and centralized access
+ from kubetorch.serving.base_service_manager import BaseServiceManager
+ from kubetorch.serving.deployment_service_manager import DeploymentServiceManager
+ from kubetorch.serving.knative_service_manager import KnativeServiceManager
+ from kubetorch.serving.raycluster_service_manager import RayClusterServiceManager
+
+ # Export all service managers
+ __all__ = [
+     "BaseServiceManager",
+     "DeploymentServiceManager",
+     "KnativeServiceManager",
+     "RayClusterServiceManager",
+ ]
+
+ logger = get_logger(__name__)
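
Note: because service_manager.py only re-exports the concrete managers, the legacy import path and the new per-file path resolve to the same class. A minimal check, assuming the package is installed:

from kubetorch.serving.service_manager import RayClusterServiceManager as OldPath
from kubetorch.serving.raycluster_service_manager import RayClusterServiceManager as NewPath

# Same object, so existing code importing from service_manager keeps working.
assert OldPath is NewPath
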
--- /dev/null
+++ b/kubetorch/serving/templates/deployment_template.yaml
@@ -0,0 +1,17 @@
+ apiVersion: apps/v1
+ kind: Deployment
+ metadata:
+   name: {{ name }}
+   namespace: {{ namespace }}
+   annotations: {{ annotations | tojson }}
+   labels: {{ labels | tojson }}
+ spec:
+   replicas: {{ replicas }}
+   selector:
+     matchLabels:
+       kubetorch.com/service: {{ name }}
+   template:
+     metadata:
+       annotations: {{ template_annotations | tojson }}
+       labels: {{ template_labels | tojson }}
+     spec: {{ pod_template | tojson }}
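
Note: kubetorch fills templates like this through load_template() in kubetorch/servers/http/utils.py, which is not included in this diff. The sketch below approximates that rendering step with plain Jinja2 and PyYAML so the template variables are easier to see; the directory path and the rendered values are illustrative only (tojson is a built-in Jinja2 filter in versions >= 2.9).

import json
import yaml
from jinja2 import Environment, FileSystemLoader

# Illustrative path; in an installed wheel the templates sit under kubetorch/serving/templates.
env = Environment(loader=FileSystemLoader("kubetorch/serving/templates"))

rendered = env.get_template("deployment_template.yaml").render(
    name="my-service",
    namespace="default",
    replicas=1,
    annotations={},
    labels={"kubetorch.com/service": "my-service"},
    template_annotations={},
    template_labels={"kubetorch.com/service": "my-service"},
    pod_template={"containers": [{"name": "kt-server", "image": "python:3.11"}]},
)

# Every injected value is emitted as JSON, which YAML parses as flow style,
# so the manifest round-trips cleanly into a dict for the Kubernetes client.
deployment = yaml.safe_load(rendered)
print(json.dumps(deployment["spec"]["template"]["spec"], indent=2))
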
--- /dev/null
+++ b/kubetorch/serving/templates/knative_service_template.yaml
@@ -0,0 +1,19 @@
+ apiVersion: serving.knative.dev/v1
+ kind: Service
+ metadata:
+   name: {{ name }}
+   namespace: {{ namespace }}
+   annotations: {{ annotations | tojson }}
+   labels: {{ labels | tojson }}
+ spec:
+   template:
+     metadata:
+       annotations: {{ template_annotations | tojson }}
+       labels: {{ template_labels | tojson }}
+     spec:
+       {% if container_concurrency is defined %}
+       containerConcurrency: {{ container_concurrency }}
+       {% endif %}
+       {% for key, value in pod_template.items() %}
+       {{ key }}: {{ value | tojson }}
+       {% endfor %}
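
Note: the `container_concurrency is defined` guard means containerConcurrency is emitted only when a value is actually passed to the template, while the for-loop splats every top-level key of pod_template directly into the revision spec. A minimal standalone Jinja2 check of the guard (not kubetorch's own rendering helper):

from jinja2 import Template

block = (
    "{% if container_concurrency is defined %}"
    "containerConcurrency: {{ container_concurrency }}"
    "{% endif %}"
)

print(repr(Template(block).render(container_concurrency=8)))  # 'containerConcurrency: 8'
print(repr(Template(block).render()))                          # '' -> the field is omitted entirely
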
--- /dev/null
+++ b/kubetorch/serving/templates/kt_setup_template.sh.j2
@@ -0,0 +1,91 @@
+ # Increase file descriptor limit for large-scale distributed jobs
+ ulimit -n 65536
+
+ {% if python_path %}
+ export PATH="{{ python_path }}:$PATH"
+ if command -v "{{ python_path }}" &> /dev/null; then
+     python_bin="{{ python_path }}"
+ fi
+ {% endif %}
+ # If the user set python_path to the exact executable, we'll use it directly here, but adding it to PATH
+ # above will have little effect. If they set it to a directory, then this command check will fail as desired,
+ # and we'll then look for python3 or python in PATH (starting with their directory) as desired.
+ if [[ -z "$python_bin" ]]; then
+     if command -v python3 &> /dev/null; then
+         python_bin="python3"
+     elif command -v python &> /dev/null; then
+         python_bin="python"
+     else
+         echo "Error: Neither python3 nor python found in PATH. Please set python_path to a valid Python executable."
+         exit 1
+     fi
+ fi
+ echo "Using Python binary: $python_bin"
+
+ {% if not freeze %}
+ if ! command -v rsync &> /dev/null; then
+     apt-get update && apt-get install -y rsync
+ fi
+ if ! command -v nohup &> /dev/null; then
+     apt-get update && apt-get install -y coreutils
+ fi
+
+ {% if install_cmd %}
+ # Use the explicitly provided install command
+ uv_pip_cmd="{{ install_cmd }}"
+ {% else %}
+
+ if $python_bin -c "import sys; exit(0 if sys.prefix != sys.base_prefix else 1)" 2>/dev/null; then
+     install_flags=""
+ else
+     install_flags="--system --break-system-packages"
+ fi
+
+ # Check if uv is available and set the appropriate command
+ if command -v uv &> /dev/null; then
+     # Use system-wide uv with the detected Python interpreter
+     uv_pip_cmd="uv pip install $install_flags --python=$python_bin"
+ elif $python_bin -m uv --version &> /dev/null; then
+     # Use Python module uv - it inherently uses the right Python
+     uv_pip_cmd="$python_bin -m uv pip install $install_flags"
+ else
+     # Install uv as a Python module and use it
+     echo "uv not found, installing it..."
+     $python_bin -m pip install uv
+     uv_pip_cmd="$python_bin -m uv pip install $install_flags"
+ fi
+ {% endif %}
+
+ # Export the install command as an environment variable for use in applications
+ echo "Setting KT_PIP_INSTALL_CMD env var to $uv_pip_cmd"
+ export KT_PIP_INSTALL_CMD="$uv_pip_cmd"
+ mkdir -p .kt
+ echo "$uv_pip_cmd" > .kt/kt_pip_install_cmd
+
+ {% if rsync_kt_local_cmd %}
+ {{ rsync_kt_local_cmd }}
+ {% if install_url and install_url.endswith('.whl') %}
+ {% set normalized_path = install_url.replace('\\', '/') %}
+ {% set wheel_filename = normalized_path.split('/')[-1] %}
+ $uv_pip_cmd "{{ wheel_filename }}[server]"
+ {% if install_otel %}
+ $uv_pip_cmd "{{ wheel_filename }}[otel]"
+ {% endif %}
+ {% else %}
+ $uv_pip_cmd -e "python_client[server]"
+ {% if install_otel %}
+ $uv_pip_cmd -e "python_client[otel]"
+ {% endif %}
+ {% endif %}
+ {% else %}
+ $uv_pip_cmd "kubetorch[server]=={{ install_url }}"
+ {% if install_otel %}
+ $uv_pip_cmd "kubetorch[otel]=={{ install_url }}"
+ {% endif %}
+ {% endif %}
+
+ {% endif %}
+
+ $python_bin -m uvicorn kubetorch.servers.http.http_server:app \
+     --host 0.0.0.0 \
+     --port {{ server_port }}
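
Note: the script exports KT_PIP_INSTALL_CMD and writes the same command to .kt/kt_pip_install_cmd "for use in applications". A hedged sketch of how user code running under this setup could consume it to install extra packages at runtime; the helper name, fallback logic, and package choice below are illustrative, not part of kubetorch.

import os
import shlex
import subprocess

def runtime_pip_install(package: str) -> None:
    # Prefer the env var set by the setup script; fall back to the file it writes.
    cmd = os.environ.get("KT_PIP_INSTALL_CMD")
    if not cmd and os.path.exists(".kt/kt_pip_install_cmd"):
        with open(".kt/kt_pip_install_cmd") as f:
            cmd = f.read().strip()
    if not cmd:
        raise RuntimeError("KT_PIP_INSTALL_CMD is not set; was the kt setup script run?")
    subprocess.run(shlex.split(cmd) + [package], check=True)

runtime_pip_install("numpy")  # example package, installed with the same uv/pip flags as the server
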