kubetorch-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kubetorch might be problematic.

Files changed (93)
  1. kubetorch/__init__.py +60 -0
  2. kubetorch/cli.py +1985 -0
  3. kubetorch/cli_utils.py +1025 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +285 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +157 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +133 -0
  30. kubetorch/resources/callables/module.py +1416 -0
  31. kubetorch/resources/callables/utils.py +174 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +261 -0
  34. kubetorch/resources/compute/compute.py +2596 -0
  35. kubetorch/resources/compute/decorators.py +139 -0
  36. kubetorch/resources/compute/rbac.py +74 -0
  37. kubetorch/resources/compute/utils.py +1114 -0
  38. kubetorch/resources/compute/websocket.py +137 -0
  39. kubetorch/resources/images/__init__.py +1 -0
  40. kubetorch/resources/images/image.py +414 -0
  41. kubetorch/resources/images/images.py +74 -0
  42. kubetorch/resources/secrets/__init__.py +2 -0
  43. kubetorch/resources/secrets/kubernetes_secrets_client.py +412 -0
  44. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  45. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  46. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  47. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  48. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  49. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  50. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  51. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  52. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  53. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  54. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  55. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  56. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  57. kubetorch/resources/secrets/provider_secrets/providers.py +93 -0
  58. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  59. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  60. kubetorch/resources/secrets/secret.py +238 -0
  61. kubetorch/resources/secrets/secret_factory.py +70 -0
  62. kubetorch/resources/secrets/utils.py +209 -0
  63. kubetorch/resources/volumes/__init__.py +0 -0
  64. kubetorch/resources/volumes/volume.py +365 -0
  65. kubetorch/servers/__init__.py +0 -0
  66. kubetorch/servers/http/__init__.py +0 -0
  67. kubetorch/servers/http/distributed_utils.py +3223 -0
  68. kubetorch/servers/http/http_client.py +730 -0
  69. kubetorch/servers/http/http_server.py +1788 -0
  70. kubetorch/servers/http/server_metrics.py +278 -0
  71. kubetorch/servers/http/utils.py +728 -0
  72. kubetorch/serving/__init__.py +0 -0
  73. kubetorch/serving/autoscaling.py +173 -0
  74. kubetorch/serving/base_service_manager.py +363 -0
  75. kubetorch/serving/constants.py +83 -0
  76. kubetorch/serving/deployment_service_manager.py +478 -0
  77. kubetorch/serving/knative_service_manager.py +519 -0
  78. kubetorch/serving/raycluster_service_manager.py +582 -0
  79. kubetorch/serving/service_manager.py +18 -0
  80. kubetorch/serving/templates/deployment_template.yaml +17 -0
  81. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  82. kubetorch/serving/templates/kt_setup_template.sh.j2 +81 -0
  83. kubetorch/serving/templates/pod_template.yaml +194 -0
  84. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  85. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  86. kubetorch/serving/templates/service_template.yaml +21 -0
  87. kubetorch/serving/templates/workerset_template.yaml +36 -0
  88. kubetorch/serving/utils.py +377 -0
  89. kubetorch/utils.py +284 -0
  90. kubetorch-0.2.0.dist-info/METADATA +121 -0
  91. kubetorch-0.2.0.dist-info/RECORD +93 -0
  92. kubetorch-0.2.0.dist-info/WHEEL +4 -0
  93. kubetorch-0.2.0.dist-info/entry_points.txt +5 -0
kubetorch/serving/raycluster_service_manager.py
@@ -0,0 +1,582 @@
+ import os
+ import re
+ import time
+ from datetime import datetime, timezone
+ from typing import List, Optional, Tuple
+
+ from kubernetes import client
+
+ import kubetorch.serving.constants as serving_constants
+ from kubetorch.logger import get_logger
+ from kubetorch.servers.http.utils import load_template
+ from kubetorch.serving.base_service_manager import BaseServiceManager
+ from kubetorch.serving.utils import nested_override
+
+ logger = get_logger(__name__)
+
+
+ class RayClusterServiceManager(BaseServiceManager):
+     """Service manager for Ray clusters with distributed Ray workload support."""
+
+     def _create_or_update_raycluster(
+         self,
+         name: str,
+         module_name: str,
+         pod_template: dict,
+         replicas: int = 1,
+         inactivity_ttl: str = None,
+         custom_labels: dict = None,
+         custom_annotations: dict = None,
+         custom_template: dict = None,
+         dryrun: bool = False,
+     ) -> Tuple[dict, bool]:
+         """Creates or updates a RayCluster for Ray distributed workloads.
+
+         Returns:
+             Tuple (created_raycluster, is_new_raycluster)
+         """
+         clean_module_name = re.sub(r"[^A-Za-z0-9.-]|^[-.]|[-.]$", "", module_name)
+
+         labels = {
+             **self.base_labels,
+             serving_constants.KT_MODULE_LABEL: clean_module_name,
+             serving_constants.KT_SERVICE_LABEL: name,
+             serving_constants.KT_TEMPLATE_LABEL: "raycluster",  # Mark as source-of-truth
+         }
+         if custom_labels:
+             labels.update(custom_labels)
+
+         # Template labels (exclude template label - that's only for the top-level resource)
+         # Add ray-node-type label to distinguish head from worker nodes
+         template_labels = {
+             **self.base_labels,
+             serving_constants.KT_MODULE_LABEL: clean_module_name,
+             serving_constants.KT_SERVICE_LABEL: name,
+         }
+         if custom_labels:
+             template_labels.update(custom_labels)
+
+         # Head node specific labels (for service selector)
+         head_template_labels = {
+             **template_labels,
+             "ray.io/node-type": "head",  # KubeRay standard label
+         }
+
+         # Worker node specific labels
+         worker_template_labels = {
+             **template_labels,
+             "ray.io/node-type": "worker",  # KubeRay standard label
+         }
+
+         annotations = {
+             "prometheus.io/scrape": "true",
+             "prometheus.io/path": serving_constants.PROMETHEUS_HEALTH_ENDPOINT,
+             "prometheus.io/port": "8080",
+             "ray.io/overwrite-container-cmd": "true",
+         }
+         if custom_annotations:
+             annotations.update(custom_annotations)
+
+         deployment_timestamp = datetime.now(timezone.utc).isoformat()
+         template_annotations = {
+             "kubetorch.com/deployment_timestamp": deployment_timestamp
+         }
+
+         if inactivity_ttl:
+             annotations[serving_constants.INACTIVITY_TTL_ANNOTATION] = inactivity_ttl
+             logger.info(f"Configuring auto-down after idle timeout ({inactivity_ttl})")
+
+         # Create RayCluster
+         worker_replicas = max(0, replicas - 1)  # Head node counts as 1 replica
+         raycluster = load_template(
+             template_file=serving_constants.RAYCLUSTER_TEMPLATE_FILE,
+             template_dir=os.path.join(
+                 os.path.dirname(os.path.abspath(__file__)), "templates"
+             ),
+             name=name,
+             namespace=self.namespace,
+             annotations=annotations,
+             template_annotations=template_annotations,
+             labels=labels,
+             head_template_labels=head_template_labels,
+             worker_template_labels=worker_template_labels,
+             pod_template=pod_template,
+             worker_replicas=worker_replicas,
+         )
+
+         # Create Kubernetes Service pointing to head node HTTP server (like Deployments)
+         service_labels = {
+             **self.base_labels,
+             serving_constants.KT_MODULE_LABEL: clean_module_name,
+             serving_constants.KT_SERVICE_LABEL: name,
+         }
+         if custom_labels:
+             service_labels.update(custom_labels)
+
+         # Ray clusters are always distributed, so we need headless services for pod discovery
+         # Create regular service for client access (head node only)
+         service = load_template(
+             template_file=serving_constants.RAYCLUSTER_SERVICE_TEMPLATE_FILE,
+             template_dir=os.path.join(
+                 os.path.dirname(os.path.abspath(__file__)), "templates"
+             ),
+             name=name,
+             namespace=self.namespace,
+             annotations=annotations,
+             labels=service_labels,
+             deployment_name=name,  # Use same parameter name as deployment for compatibility
+             module_name=clean_module_name,
+             distributed=False,  # Keep regular service for client access
+             server_port=pod_template.get("containers", [{}])[0]
+             .get("ports", [{}])[0]
+             .get("containerPort", 32300),
+         )
+
+         # Create headless service for Ray pod discovery (all nodes)
+         headless_service_labels = service_labels.copy()
+         headless_service = load_template(
+             template_file=serving_constants.RAYCLUSTER_SERVICE_TEMPLATE_FILE,
+             template_dir=os.path.join(
+                 os.path.dirname(os.path.abspath(__file__)), "templates"
+             ),
+             name=f"{name}-headless",
+             namespace=self.namespace,
+             annotations=annotations,
+             labels=headless_service_labels,
+             deployment_name=name,
+             module_name=clean_module_name,
+             distributed=True,  # Make headless for pod discovery
+             server_port=pod_template.get("containers", [{}])[0]
+             .get("ports", [{}])[0]
+             .get("containerPort", 32300),
+         )
+
+         # For headless service, select all Ray nodes (not just head)
+         headless_service["spec"]["selector"].pop("ray.io/node-type", None)
+
+         if custom_template:
+             nested_override(raycluster, custom_template)
+
+         try:
+             kwargs = {"dry_run": "All"} if dryrun else {}
+
+             # Create Kubernetes Service first (regular service for client access)
+             try:
+                 self.core_api.create_namespaced_service(
+                     namespace=self.namespace,
+                     body=service,
+                     **kwargs,
+                 )
+                 if not dryrun:
+                     logger.info(f"Created service {name} in namespace {self.namespace}")
+             except client.exceptions.ApiException as e:
+                 if e.status == 409:
+                     logger.info(f"Service {name} already exists")
+                 else:
+                     raise
+
+             # Create headless service for Ray pod discovery (all nodes)
+             try:
+                 self.core_api.create_namespaced_service(
+                     namespace=self.namespace,
+                     body=headless_service,
+                     **kwargs,
+                 )
+                 if not dryrun:
+                     logger.info(
+                         f"Created headless service {name}-headless in namespace {self.namespace}"
+                     )
+             except client.exceptions.ApiException as e:
+                 if e.status == 409:
+                     logger.info(f"Headless service {name}-headless already exists")
+                 else:
+                     raise
+
+             # Create RayCluster
+             created_raycluster = None
+             try:
+                 created_raycluster = self.objects_api.create_namespaced_custom_object(
+                     group="ray.io",
+                     version="v1",
+                     namespace=self.namespace,
+                     plural="rayclusters",
+                     body=raycluster,
+                     **kwargs,
+                 )
+             except client.exceptions.ApiException as e:
+                 if e.status == 404:
+                     logger.error(
+                         "RayCluster Custom Resource Definition (CRD) not found, please install the KubeRay operator"
+                     )
+                 raise e
+
+             if dryrun:
+                 return created_raycluster, False
+
+             logger.info(f"Created RayCluster {name} in namespace {self.namespace}")
+             return created_raycluster, True
+
+         except client.exceptions.ApiException as e:
+             if e.status == 409:
+                 logger.info(f"RayCluster {name} already exists, updating")
+                 try:
+                     # For RayCluster, we can patch the spec
+                     patch_body = {"spec": raycluster["spec"]}
+                     updated_raycluster = (
+                         self.objects_api.patch_namespaced_custom_object(
+                             group="ray.io",
+                             version="v1",
+                             namespace=self.namespace,
+                             plural="rayclusters",
+                             name=name,
+                             body=patch_body,
+                         )
+                     )
+                     logger.info(f"Updated RayCluster {name}")
+                     return updated_raycluster, False
+                 except Exception as patch_error:
+                     logger.error(f"Failed to patch RayCluster {name}: {patch_error}")
+                     raise patch_error
+
+             raise e
+
+     def get_raycluster(self, raycluster_name: str) -> dict:
+         """Retrieve a RayCluster by name."""
+         try:
+             raycluster = self.objects_api.get_namespaced_custom_object(
+                 group="ray.io",
+                 version="v1",
+                 namespace=self.namespace,
+                 plural="rayclusters",
+                 name=raycluster_name,
+             )
+             return raycluster
+         except client.exceptions.ApiException as e:
+             logger.error(f"Failed to load RayCluster '{raycluster_name}': {str(e)}")
+             raise
+
+     def get_deployment_timestamp_annotation(self, service_name: str) -> Optional[str]:
+         """Get deployment timestamp annotation for RayCluster services."""
+         try:
+             raycluster = self.get_raycluster(service_name)
+             if raycluster:
+                 return (
+                     raycluster.get("metadata", {})
+                     .get("annotations", {})
+                     .get("kubetorch.com/deployment_timestamp", None)
+                 )
+         except client.exceptions.ApiException:
+             pass
+         return None
+
+     def update_deployment_timestamp_annotation(
+         self, service_name: str, new_timestamp: str
+     ) -> str:
+         """Update deployment timestamp annotation for RayCluster services."""
+         try:
+             patch_body = {
+                 "metadata": {
+                     "annotations": {"kubetorch.com/deployment_timestamp": new_timestamp}
+                 }
+             }
+             self.objects_api.patch_namespaced_custom_object(
+                 group="ray.io",
+                 version="v1",
+                 namespace=self.namespace,
+                 plural="rayclusters",
+                 name=service_name,
+                 body=patch_body,
+             )
+             return new_timestamp
+         except client.exceptions.ApiException as e:
+             logger.error(
+                 f"Failed to update deployment timestamp for RayCluster '{service_name}': {str(e)}"
+             )
+             raise
+
+     def create_or_update_service(
+         self,
+         service_name: str,
+         module_name: str,
+         pod_template: dict,
+         replicas: int = 1,
+         inactivity_ttl: str = None,
+         custom_labels: dict = None,
+         custom_annotations: dict = None,
+         custom_template: dict = None,
+         dryrun: bool = False,
+         **kwargs,  # Ignore Knative-specific args like autoscaling_config, etc.
+     ):
+         """
+         Creates a RayCluster service.
+
+         Args:
+             service_name (str): Name for the RayCluster.
+             module_name (str): Name of the module.
+             pod_template (dict): Template for the pod, including resource requirements.
+             replicas (int): Number of replicas for the service (head + workers).
+             inactivity_ttl (str, optional): Idle timeout after which the cluster is automatically downed.
+             custom_labels (dict, optional): Custom labels to add to the service.
+             custom_annotations (dict, optional): Custom annotations to add to the service.
+             custom_template (dict, optional): Custom template to apply to the service.
+             dryrun (bool, optional): Whether to run in dryrun mode (Default: `False`).
+         """
+         logger.info(f"Deploying Kubetorch RayCluster service with name: {service_name}")
+         try:
+             created_service, is_new_service = self._create_or_update_raycluster(
+                 name=service_name,
+                 pod_template=pod_template,
+                 module_name=module_name,
+                 replicas=replicas,
+                 inactivity_ttl=inactivity_ttl,
+                 custom_labels=custom_labels,
+                 custom_annotations=custom_annotations,
+                 custom_template=custom_template,
+                 dryrun=dryrun,
+             )
+             return created_service
+         except Exception as e:
+             logger.error(f"Failed to launch new RayCluster: {str(e)}")
+             raise e
+
+     def get_pods_for_service(self, service_name: str, **kwargs) -> List[client.V1Pod]:
+         """Get all pods associated with this RayCluster service.
+
+         Args:
+             service_name (str): Name of the service
+
+         Returns:
+             List[V1Pod]: List of running pods associated with the service.
+         """
+         return self.get_pods_for_service_static(
+             service_name=service_name,
+             namespace=self.namespace,
+             core_api=self.core_api,
+         )
+
+     def get_endpoint(self, service_name: str) -> str:
+         """Get the endpoint URL for a RayCluster service.
+
+         Returns the HTTP endpoint for the KubeTorch HTTP server running on the head node,
+         just like Deployment services.
+         """
+         return f"http://{service_name}.{self.namespace}.svc.cluster.local:80"
+
+     def check_service_ready(
+         self, service_name: str, launch_timeout: int, **kwargs
+     ) -> bool:
+         """Checks if the RayCluster is ready to start serving requests.
+
+         Args:
+             service_name: Name of the RayCluster service
+             launch_timeout: Timeout in seconds to wait for readiness
+             **kwargs: Additional arguments (ignored for RayClusters)
+
+         Returns:
+             True if service is ready
+
+         Raises:
+             TimeoutError: If service doesn't become ready within timeout
+             RuntimeError: If RayCluster fails to start
+         """
+         sleep_interval = 2
+         start_time = time.time()
+
+         logger.info(
+             f"Checking RayCluster {service_name} pod readiness (timeout: {launch_timeout} seconds)"
+         )
+
+         iteration = 0
+         while (time.time() - start_time) < launch_timeout:
+             iteration += 1
+             try:
+                 raycluster = self.get_raycluster(service_name)
+                 status = raycluster.get("status", {})
+
+                 # Check RayCluster state
+                 state = status.get("state", "-")
+                 if state == "ready":
+                     logger.info(f"RayCluster {service_name} is ready")
+                     return True
+                 elif state == "failed":
+                     raise RuntimeError(f"RayCluster {service_name} failed to start")
+
+                 # Calculate total expected replicas from head + all worker groups
+                 spec = raycluster.get("spec", {})
+
+                 # Head group replicas
+                 head_group_spec = spec.get("headGroupSpec", {})
+                 head_replicas = head_group_spec.get("replicas", 1)
+
+                 # Worker group replicas (sum across all worker groups)
+                 worker_groups = spec.get("workerGroupSpecs", [])
+                 worker_replicas = sum(
+                     worker_group.get("replicas", 0) for worker_group in worker_groups
+                 )
+
+                 total_expected_replicas = head_replicas + worker_replicas
+
+                 # Check pods are running
+                 pods = self.get_pods_for_service(service_name)
+                 running_pods = [pod for pod in pods if pod.status.phase == "Running"]
+
+                 # Count head and worker pods separately for better logging
+                 head_pods = [
+                     pod
+                     for pod in running_pods
+                     if pod.metadata.labels.get("ray.io/node-type") == "head"
+                 ]
+                 worker_pods = [
+                     pod
+                     for pod in running_pods
+                     if pod.metadata.labels.get("ray.io/node-type") == "worker"
+                 ]
+
+                 # Check for specific error conditions
+                 if head_pods:
+                     head_pod = head_pods[0]
+                     # Check for Ray installation errors in head pod
+                     ray_error = self._check_ray_installation_error(
+                         service_name, head_pod.metadata.name
+                     )
+                     if ray_error:
+                         raise RuntimeError(ray_error)
+
+                 if len(running_pods) >= total_expected_replicas:
+                     logger.info(
+                         f"RayCluster {service_name} is ready with {len(running_pods)} pods "
+                         f"({len(head_pods)} head, {len(worker_pods)} worker{'' if len(worker_pods) == 1 else 's'})"
+                     )
+                     return True
+
+                 # Log progress every 30 seconds
+                 if iteration % (30 // sleep_interval) == 0:
+                     elapsed = int(time.time() - start_time)
+                     remaining = launch_timeout - elapsed
+                     logger.info(
+                         f"RayCluster is not yet ready (elapsed: {elapsed}s, remaining: {remaining}s). "
+                         f"State: {state}, Running pods: {len(running_pods)}/{total_expected_replicas} "
+                         f"({len(head_pods)}/{head_replicas} head, {len(worker_pods)}/{worker_replicas} worker{'' if worker_replicas == 1 else 's'})"
+                     )
+
+             except RuntimeError as e:
+                 raise e
+             except Exception as e:
+                 logger.error(f"Error checking RayCluster readiness: {e}")
+
+             time.sleep(sleep_interval)
+
+         # Timeout reached
+         raise TimeoutError(
+             f"RayCluster {service_name} did not become ready within {launch_timeout} seconds"
+         )
+
+     def teardown_service(self, service_name: str, console=None) -> bool:
+         """Tear down the RayCluster and its associated resources.
+
+         Args:
+             service_name: Name of the RayCluster to tear down
+             console: Optional Rich console for output
+
+         Returns:
+             True if teardown was successful, False otherwise
+         """
+         success = True
+
+         try:
+             # Delete the RayCluster
+             self.objects_api.delete_namespaced_custom_object(
+                 group="ray.io",
+                 version="v1",
+                 namespace=self.namespace,
+                 plural="rayclusters",
+                 name=service_name,
+             )
+             if console:
+                 console.print(f"✓ Deleted RayCluster [blue]{service_name}[/blue]")
+             else:
+                 logger.info(f"Deleted RayCluster {service_name}")
+
+         except client.exceptions.ApiException as e:
+             if e.status == 404:
+                 if console:
+                     console.print(
+                         f"[yellow]Note:[/yellow] RayCluster {service_name} not found or already deleted"
+                     )
+                 else:
+                     logger.info(
+                         f"RayCluster {service_name} not found or already deleted"
+                     )
+             else:
+                 if console:
+                     console.print(
+                         f"[red]Error:[/red] Failed to delete RayCluster {service_name}: {e}"
+                     )
+                 else:
+                     logger.error(f"Failed to delete RayCluster {service_name}: {e}")
+                 success = False
+
+         try:
+             # Delete the associated Kubernetes service (created alongside RayCluster)
+             self.core_api.delete_namespaced_service(
+                 name=service_name, namespace=self.namespace
+             )
+             if console:
+                 console.print(f"✓ Deleted service [blue]{service_name}[/blue]")
+             else:
+                 logger.info(f"Deleted service {service_name}")
+
+         except client.exceptions.ApiException as e:
+             if e.status == 404:
+                 if console:
+                     console.print(
+                         f"[yellow]Note:[/yellow] Service {service_name} not found or already deleted"
+                     )
+                 else:
+                     logger.info(f"Service {service_name} not found or already deleted")
+             else:
+                 if console:
+                     console.print(
+                         f"[red]Error:[/red] Failed to delete service {service_name}: {e}"
+                     )
+                 else:
+                     logger.error(f"Failed to delete service {service_name}: {e}")
+                 success = False
+
+         return success
+
+     def _check_ray_installation_error(
+         self, service_name: str, head_pod_name: str
+     ) -> Optional[str]:
+         """Check if there's a Ray installation error in the head pod logs.
+
+         Args:
+             service_name: Name of the RayCluster service
+             head_pod_name: Name of the head pod
+
+         Returns:
+             Error message if Ray installation error is found, None otherwise
+         """
+         try:
+             head_logs = self.core_api.read_namespaced_pod_log(
+                 name=head_pod_name, namespace=self.namespace, tail_lines=100
+             )
+
+             # Check for Ray installation errors
+             if "ray: not found" in head_logs or "ray: command not found" in head_logs:
+                 return (
+                     f"RayCluster {service_name} failed to start: Ray is not installed in the container. "
+                     f"Please use a Ray-enabled image (e.g., rayproject/ray) or ensure Ray is installed in your container setup."
+                 )
+
+             # Check for Ray startup errors
+             if "Failed to start Ray server" in head_logs:
+                 return (
+                     f"RayCluster {service_name} failed to start: Ray server failed to start. "
+                     f"Check the head pod logs for more details."
+                 )
+
+         except client.exceptions.ApiException as e:
+             if e.status != 404:  # Pod might not be ready yet
+                 logger.warning(f"Could not check head pod logs: {e}")
+
+         return None
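
Taken together, create_or_update_service, check_service_ready, get_endpoint, and teardown_service form the deploy/poll/teardown lifecycle for a Ray-backed service. The sketch below shows how that lifecycle might be driven; the constructor argument and the pod_template contents are illustrative assumptions, not taken from this diff.

# Hypothetical driver for the lifecycle above; the constructor argument and
# the pod_template layout are assumptions for illustration only.
from kubetorch.serving.raycluster_service_manager import RayClusterServiceManager

manager = RayClusterServiceManager(namespace="ml-team")  # assumed constructor signature
pod_template = {
    "containers": [
        {
            "name": "kt-server",
            "image": "rayproject/ray:2.9.0",
            "ports": [{"containerPort": 32300}],
        }
    ]
}

# Creates the RayCluster plus its regular and headless Services (1 head + 2 workers).
manager.create_or_update_service(
    service_name="trainer",
    module_name="trainer",
    pod_template=pod_template,
    replicas=3,
)

# Polls until the cluster state is "ready" and all expected pods are Running.
manager.check_service_ready("trainer", launch_timeout=600)
print(manager.get_endpoint("trainer"))  # http://trainer.ml-team.svc.cluster.local:80

# Deletes the RayCluster and its client-facing Service when done.
manager.teardown_service("trainer")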
kubetorch/serving/service_manager.py
@@ -0,0 +1,18 @@
+ # Backward compatibility imports - all service manager functionality is now in separate files
+ from kubetorch.logger import get_logger
+
+ # Import all service managers for backward compatibility and centralized access
+ from kubetorch.serving.base_service_manager import BaseServiceManager
+ from kubetorch.serving.deployment_service_manager import DeploymentServiceManager
+ from kubetorch.serving.knative_service_manager import KnativeServiceManager
+ from kubetorch.serving.raycluster_service_manager import RayClusterServiceManager
+
+ # Export all service managers
+ __all__ = [
+     "BaseServiceManager",
+     "DeploymentServiceManager",
+     "KnativeServiceManager",
+     "RayClusterServiceManager",
+ ]
+
+ logger = get_logger(__name__)
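
Because this shim re-exports every manager, code written against the pre-split module keeps working; for example, the old import path shown below remains valid.

# Old-style import path, still resolvable via the re-exports above.
from kubetorch.serving.service_manager import KnativeServiceManager, RayClusterServiceManager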
kubetorch/serving/templates/deployment_template.yaml
@@ -0,0 +1,17 @@
+ apiVersion: apps/v1
+ kind: Deployment
+ metadata:
+   name: {{ name }}
+   namespace: {{ namespace }}
+   annotations: {{ annotations | tojson }}
+   labels: {{ labels | tojson }}
+ spec:
+   replicas: {{ replicas }}
+   selector:
+     matchLabels:
+       kubetorch.com/service: {{ name }}
+   template:
+     metadata:
+       annotations: {{ template_annotations | tojson }}
+       labels: {{ template_labels | tojson }}
+     spec: {{ pod_template | tojson }}
kubetorch/serving/templates/knative_service_template.yaml
@@ -0,0 +1,19 @@
+ apiVersion: serving.knative.dev/v1
+ kind: Service
+ metadata:
+   name: {{ name }}
+   namespace: {{ namespace }}
+   annotations: {{ annotations | tojson }}
+   labels: {{ labels | tojson }}
+ spec:
+   template:
+     metadata:
+       annotations: {{ template_annotations | tojson }}
+       labels: {{ template_labels | tojson }}
+     spec:
+       {% if container_concurrency is defined %}
+       containerConcurrency: {{ container_concurrency }}
+       {% endif %}
+       {% for key, value in pod_template.items() %}
+       {{ key }}: {{ value | tojson }}
+       {% endfor %}
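
Both YAML templates lean on Jinja2's built-in tojson filter, so dicts passed in as context are embedded as JSON (which is valid YAML) rather than depending on YAML indentation. The sketch below renders the Deployment template with plain jinja2 and PyYAML to show the idea; it does not reproduce kubetorch's own load_template helper, and the context values are made up for illustration.

# Illustrative rendering of deployment_template.yaml with jinja2 + PyYAML;
# kubetorch's load_template helper is assumed to do something equivalent.
import yaml
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("kubetorch/serving/templates"))
template = env.get_template("deployment_template.yaml")
rendered = template.render(
    name="my-service",
    namespace="default",
    replicas=1,
    annotations={"prometheus.io/scrape": "true"},
    labels={"kubetorch.com/service": "my-service"},
    template_annotations={},
    template_labels={"kubetorch.com/service": "my-service"},
    pod_template={"containers": [{"name": "kt-server", "image": "python:3.11"}]},
)
manifest = yaml.safe_load(rendered)  # plain dict, ready for the Kubernetes API client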