kubetorch 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kubetorch/__init__.py +59 -0
- kubetorch/cli.py +1939 -0
- kubetorch/cli_utils.py +967 -0
- kubetorch/config.py +453 -0
- kubetorch/constants.py +18 -0
- kubetorch/docs/Makefile +18 -0
- kubetorch/docs/__init__.py +0 -0
- kubetorch/docs/_ext/json_globaltoc.py +42 -0
- kubetorch/docs/api/cli.rst +10 -0
- kubetorch/docs/api/python/app.rst +21 -0
- kubetorch/docs/api/python/cls.rst +19 -0
- kubetorch/docs/api/python/compute.rst +25 -0
- kubetorch/docs/api/python/config.rst +11 -0
- kubetorch/docs/api/python/fn.rst +19 -0
- kubetorch/docs/api/python/image.rst +14 -0
- kubetorch/docs/api/python/secret.rst +18 -0
- kubetorch/docs/api/python/volumes.rst +13 -0
- kubetorch/docs/api/python.rst +101 -0
- kubetorch/docs/conf.py +69 -0
- kubetorch/docs/index.rst +20 -0
- kubetorch/docs/requirements.txt +5 -0
- kubetorch/globals.py +269 -0
- kubetorch/logger.py +59 -0
- kubetorch/resources/__init__.py +0 -0
- kubetorch/resources/callables/__init__.py +0 -0
- kubetorch/resources/callables/cls/__init__.py +0 -0
- kubetorch/resources/callables/cls/cls.py +159 -0
- kubetorch/resources/callables/fn/__init__.py +0 -0
- kubetorch/resources/callables/fn/fn.py +140 -0
- kubetorch/resources/callables/module.py +1315 -0
- kubetorch/resources/callables/utils.py +203 -0
- kubetorch/resources/compute/__init__.py +0 -0
- kubetorch/resources/compute/app.py +253 -0
- kubetorch/resources/compute/compute.py +2414 -0
- kubetorch/resources/compute/decorators.py +137 -0
- kubetorch/resources/compute/utils.py +1026 -0
- kubetorch/resources/compute/websocket.py +135 -0
- kubetorch/resources/images/__init__.py +1 -0
- kubetorch/resources/images/image.py +412 -0
- kubetorch/resources/images/images.py +64 -0
- kubetorch/resources/secrets/__init__.py +2 -0
- kubetorch/resources/secrets/kubernetes_secrets_client.py +377 -0
- kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
- kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
- kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
- kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
- kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/providers.py +92 -0
- kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
- kubetorch/resources/secrets/secret.py +224 -0
- kubetorch/resources/secrets/secret_factory.py +64 -0
- kubetorch/resources/secrets/utils.py +222 -0
- kubetorch/resources/volumes/__init__.py +0 -0
- kubetorch/resources/volumes/volume.py +340 -0
- kubetorch/servers/__init__.py +0 -0
- kubetorch/servers/http/__init__.py +0 -0
- kubetorch/servers/http/distributed_utils.py +2968 -0
- kubetorch/servers/http/http_client.py +802 -0
- kubetorch/servers/http/http_server.py +1622 -0
- kubetorch/servers/http/server_metrics.py +255 -0
- kubetorch/servers/http/utils.py +722 -0
- kubetorch/serving/__init__.py +0 -0
- kubetorch/serving/autoscaling.py +153 -0
- kubetorch/serving/base_service_manager.py +344 -0
- kubetorch/serving/constants.py +77 -0
- kubetorch/serving/deployment_service_manager.py +431 -0
- kubetorch/serving/knative_service_manager.py +487 -0
- kubetorch/serving/raycluster_service_manager.py +526 -0
- kubetorch/serving/service_manager.py +18 -0
- kubetorch/serving/templates/deployment_template.yaml +17 -0
- kubetorch/serving/templates/knative_service_template.yaml +19 -0
- kubetorch/serving/templates/kt_setup_template.sh.j2 +91 -0
- kubetorch/serving/templates/pod_template.yaml +198 -0
- kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
- kubetorch/serving/templates/raycluster_template.yaml +35 -0
- kubetorch/serving/templates/service_template.yaml +21 -0
- kubetorch/serving/templates/workerset_template.yaml +36 -0
- kubetorch/serving/utils.py +344 -0
- kubetorch/utils.py +263 -0
- kubetorch-0.2.5.dist-info/METADATA +75 -0
- kubetorch-0.2.5.dist-info/RECORD +92 -0
- kubetorch-0.2.5.dist-info/WHEEL +4 -0
- kubetorch-0.2.5.dist-info/entry_points.txt +5 -0
kubetorch/serving/raycluster_service_manager.py
@@ -0,0 +1,526 @@
import os
import re
import time
from datetime import datetime, timezone
from typing import List, Optional, Tuple

from kubernetes import client

import kubetorch.serving.constants as serving_constants
from kubetorch.logger import get_logger
from kubetorch.servers.http.utils import load_template
from kubetorch.serving.base_service_manager import BaseServiceManager
from kubetorch.serving.utils import nested_override

logger = get_logger(__name__)


class RayClusterServiceManager(BaseServiceManager):
    """Service manager for Ray clusters with distributed Ray workload support."""

    def _create_or_update_raycluster(
        self,
        name: str,
        module_name: str,
        pod_template: dict,
        replicas: int = 1,
        inactivity_ttl: str = None,
        custom_labels: dict = None,
        custom_annotations: dict = None,
        custom_template: dict = None,
        dryrun: bool = False,
    ) -> Tuple[dict, bool]:
        """Creates or updates a RayCluster for Ray distributed workloads.

        Returns:
            Tuple (created_raycluster, is_new_raycluster)
        """
        clean_module_name = re.sub(r"[^A-Za-z0-9.-]|^[-.]|[-.]$", "", module_name)

        labels = {
            **self.base_labels,
            serving_constants.KT_MODULE_LABEL: clean_module_name,
            serving_constants.KT_SERVICE_LABEL: name,
            serving_constants.KT_TEMPLATE_LABEL: "raycluster",  # Mark as source-of-truth
        }
        if custom_labels:
            labels.update(custom_labels)

        # Template labels (exclude template label - that's only for the top-level resource)
        # Add ray-node-type label to distinguish head from worker nodes
        template_labels = {
            **self.base_labels,
            serving_constants.KT_MODULE_LABEL: clean_module_name,
            serving_constants.KT_SERVICE_LABEL: name,
        }
        if custom_labels:
            template_labels.update(custom_labels)

        # Head node specific labels (for service selector)
        head_template_labels = {
            **template_labels,
            "ray.io/node-type": "head",  # KubeRay standard label
        }

        # Worker node specific labels
        worker_template_labels = {
            **template_labels,
            "ray.io/node-type": "worker",  # KubeRay standard label
        }

        annotations = {
            "prometheus.io/scrape": "true",
            "prometheus.io/path": serving_constants.PROMETHEUS_HEALTH_ENDPOINT,
            "prometheus.io/port": "8080",
            "ray.io/overwrite-container-cmd": "true",
        }
        if custom_annotations:
            annotations.update(custom_annotations)

        deployment_timestamp = datetime.now(timezone.utc).isoformat()
        template_annotations = {"kubetorch.com/deployment_timestamp": deployment_timestamp}

        if inactivity_ttl:
            annotations[serving_constants.INACTIVITY_TTL_ANNOTATION] = inactivity_ttl
            logger.info(f"Configuring auto-down after idle timeout ({inactivity_ttl})")

        # Create RayCluster
        worker_replicas = max(0, replicas - 1)  # Head node counts as 1 replica
        raycluster = load_template(
            template_file=serving_constants.RAYCLUSTER_TEMPLATE_FILE,
            template_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates"),
            name=name,
            namespace=self.namespace,
            annotations=annotations,
            template_annotations=template_annotations,
            labels=labels,
            head_template_labels=head_template_labels,
            worker_template_labels=worker_template_labels,
            pod_template=pod_template,
            worker_replicas=worker_replicas,
        )

        # Create Kubernetes Service pointing to head node HTTP server (like Deployments)
        service_labels = {
            **self.base_labels,
            serving_constants.KT_MODULE_LABEL: clean_module_name,
            serving_constants.KT_SERVICE_LABEL: name,
        }
        if custom_labels:
            service_labels.update(custom_labels)

        # Ray clusters are always distributed, so we need headless services for pod discovery
        # Create regular service for client access (head node only)
        service = load_template(
            template_file=serving_constants.RAYCLUSTER_SERVICE_TEMPLATE_FILE,
            template_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates"),
            name=name,
            namespace=self.namespace,
            annotations=annotations,
            labels=service_labels,
            deployment_name=name,  # Use same parameter name as deployment for compatibility
            module_name=clean_module_name,
            distributed=False,  # Keep regular service for client access
            server_port=pod_template.get("containers", [{}])[0].get("ports", [{}])[0].get("containerPort", 32300),
        )

        # Create headless service for Ray pod discovery (all nodes)
        headless_service_labels = service_labels.copy()
        headless_service = load_template(
            template_file=serving_constants.RAYCLUSTER_SERVICE_TEMPLATE_FILE,
            template_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates"),
            name=f"{name}-headless",
            namespace=self.namespace,
            annotations=annotations,
            labels=headless_service_labels,
            deployment_name=name,
            module_name=clean_module_name,
            distributed=True,  # Make headless for pod discovery
            server_port=pod_template.get("containers", [{}])[0].get("ports", [{}])[0].get("containerPort", 32300),
        )

        # For headless service, select all Ray nodes (not just head)
        headless_service["spec"]["selector"].pop("ray.io/node-type", None)

        if custom_template:
            nested_override(raycluster, custom_template)

        try:
            kwargs = {"dry_run": "All"} if dryrun else {}

            # Create Kubernetes Service first (regular service for client access)
            try:
                self.core_api.create_namespaced_service(
                    namespace=self.namespace,
                    body=service,
                    **kwargs,
                )
                if not dryrun:
                    logger.info(f"Created service {name} in namespace {self.namespace}")
            except client.exceptions.ApiException as e:
                if e.status == 409:
                    logger.info(f"Service {name} already exists")
                else:
                    raise

            # Create headless service for Ray pod discovery (all nodes)
            try:
                self.core_api.create_namespaced_service(
                    namespace=self.namespace,
                    body=headless_service,
                    **kwargs,
                )
                if not dryrun:
                    logger.info(f"Created headless service {name}-headless in namespace {self.namespace}")
            except client.exceptions.ApiException as e:
                if e.status == 409:
                    logger.info(f"Headless service {name}-headless already exists")
                else:
                    raise

            # Create RayCluster
            created_raycluster = None
            try:
                created_raycluster = self.objects_api.create_namespaced_custom_object(
                    group="ray.io",
                    version="v1",
                    namespace=self.namespace,
                    plural="rayclusters",
                    body=raycluster,
                    **kwargs,
                )
            except client.exceptions.ApiException as e:
                if e.status == 404:
                    logger.error(
                        "RayCluster Custom Resource Definition (CRD) not found, please install the KubeRay operator"
                    )
                raise e

            if dryrun:
                return created_raycluster, False

            logger.info(f"Created RayCluster {name} in namespace {self.namespace}")
            return created_raycluster, True

        except client.exceptions.ApiException as e:
            if e.status == 409:
                logger.info(f"RayCluster {name} already exists, updating")
                try:
                    # For RayCluster, we can patch the spec
                    patch_body = {"spec": raycluster["spec"]}
                    updated_raycluster = self.objects_api.patch_namespaced_custom_object(
                        group="ray.io",
                        version="v1",
                        namespace=self.namespace,
                        plural="rayclusters",
                        name=name,
                        body=patch_body,
                    )
                    logger.info(f"Updated RayCluster {name}")
                    return updated_raycluster, False
                except Exception as patch_error:
                    logger.error(f"Failed to patch RayCluster {name}: {patch_error}")
                    raise patch_error

            raise e

    def get_raycluster(self, raycluster_name: str) -> dict:
        """Retrieve a RayCluster by name."""
        try:
            raycluster = self.objects_api.get_namespaced_custom_object(
                group="ray.io",
                version="v1",
                namespace=self.namespace,
                plural="rayclusters",
                name=raycluster_name,
            )
            return raycluster
        except client.exceptions.ApiException as e:
            logger.error(f"Failed to load RayCluster '{raycluster_name}': {str(e)}")
            raise

    def get_deployment_timestamp_annotation(self, service_name: str) -> Optional[str]:
        """Get deployment timestamp annotation for RayCluster services."""
        try:
            raycluster = self.get_raycluster(service_name)
            if raycluster:
                return (
                    raycluster.get("metadata", {})
                    .get("annotations", {})
                    .get("kubetorch.com/deployment_timestamp", None)
                )
        except client.exceptions.ApiException:
            pass
        return None

    def update_deployment_timestamp_annotation(self, service_name: str, new_timestamp: str) -> str:
        """Update deployment timestamp annotation for RayCluster services."""
        try:
            patch_body = {"metadata": {"annotations": {"kubetorch.com/deployment_timestamp": new_timestamp}}}
            self.objects_api.patch_namespaced_custom_object(
                group="ray.io",
                version="v1",
                namespace=self.namespace,
                plural="rayclusters",
                name=service_name,
                body=patch_body,
            )
            return new_timestamp
        except client.exceptions.ApiException as e:
            logger.error(f"Failed to update deployment timestamp for RayCluster '{service_name}': {str(e)}")
            raise

    def create_or_update_service(
        self,
        service_name: str,
        module_name: str,
        pod_template: dict,
        replicas: int = 1,
        inactivity_ttl: str = None,
        custom_labels: dict = None,
        custom_annotations: dict = None,
        custom_template: dict = None,
        dryrun: bool = False,
        **kwargs,  # Ignore Knative-specific args like autoscaling_config, inactivity_ttl, etc.
    ):
        """
        Creates a RayCluster service.

        Args:
            service_name (str): Name for the RayCluster.
            module_name (str): Name of the module.
            pod_template (dict): Template for the pod, including resource requirements.
            replicas (int): Number of replicas for the service (head + workers)
            custom_labels (dict, optional): Custom labels to add to the service.
            custom_annotations (dict, optional): Custom annotations to add to the service.
            custom_template (dict, optional): Custom template to apply to the service.
            dryrun (bool, optional): Whether to run in dryrun mode (Default: `False`).
        """
        logger.info(f"Deploying Kubetorch RayCluster service with name: {service_name}")
        try:
            created_service, is_new_service = self._create_or_update_raycluster(
                name=service_name,
                pod_template=pod_template,
                module_name=module_name,
                replicas=replicas,
                inactivity_ttl=inactivity_ttl,
                custom_labels=custom_labels,
                custom_annotations=custom_annotations,
                custom_template=custom_template,
                dryrun=dryrun,
            )
            return created_service
        except Exception as e:
            logger.error(f"Failed to launch new RayCluster: {str(e)}")
            raise e

    def get_pods_for_service(self, service_name: str, **kwargs) -> List[client.V1Pod]:
        """Get all pods associated with this RayCluster service.

        Args:
            service_name (str): Name of the service

        Returns:
            List[V1Pod]: List of running pods associated with the service.
        """
        return self.get_pods_for_service_static(
            service_name=service_name,
            namespace=self.namespace,
            core_api=self.core_api,
        )

    def get_endpoint(self, service_name: str) -> str:
        """Get the endpoint URL for a RayCluster service.

        Returns the HTTP endpoint for the KubeTorch HTTP server running on the head node,
        just like Deployment services.
        """
        return f"http://{service_name}.{self.namespace}.svc.cluster.local:80"

    def check_service_ready(self, service_name: str, launch_timeout: int, **kwargs) -> bool:
        """Checks if the RayCluster is ready to start serving requests.

        Args:
            service_name: Name of the RayCluster service
            launch_timeout: Timeout in seconds to wait for readiness
            **kwargs: Additional arguments (ignored for RayClusters)

        Returns:
            True if service is ready

        Raises:
            TimeoutError: If service doesn't become ready within timeout
            RuntimeError: If RayCluster fails to start
        """
        sleep_interval = 2
        start_time = time.time()

        logger.info(f"Checking RayCluster {service_name} pod readiness (timeout: {launch_timeout} seconds)")

        iteration = 0
        while (time.time() - start_time) < launch_timeout:
            iteration += 1
            try:
                raycluster = self.get_raycluster(service_name)
                status = raycluster.get("status", {})

                # Check RayCluster state
                state = status.get("state", "-")
                if state == "ready":
                    logger.info(f"RayCluster {service_name} is ready")
                    return True
                elif state == "failed":
                    raise RuntimeError(f"RayCluster {service_name} failed to start")

                # Calculate total expected replicas from head + all worker groups
                spec = raycluster.get("spec", {})

                # Head group replicas
                head_group_spec = spec.get("headGroupSpec", {})
                head_replicas = head_group_spec.get("replicas", 1)

                # Worker group replicas (sum across all worker groups)
                worker_groups = spec.get("workerGroupSpecs", [])
                worker_replicas = sum(worker_group.get("replicas", 0) for worker_group in worker_groups)

                total_expected_replicas = head_replicas + worker_replicas

                # Check pods are running
                pods = self.get_pods_for_service(service_name)
                running_pods = [pod for pod in pods if pod.status.phase == "Running"]

                # Count head and worker pods separately for better logging
                head_pods = [pod for pod in running_pods if pod.metadata.labels.get("ray.io/node-type") == "head"]
                worker_pods = [pod for pod in running_pods if pod.metadata.labels.get("ray.io/node-type") == "worker"]

                # Check for specific error conditions
                if head_pods:
                    head_pod = head_pods[0]
                    # Check for Ray installation errors in head pod
                    ray_error = self._check_ray_installation_error(service_name, head_pod.metadata.name)
                    if ray_error:
                        raise RuntimeError(ray_error)

                if len(running_pods) >= total_expected_replicas:
                    logger.info(
                        f"RayCluster {service_name} is ready with {len(running_pods)} pods "
                        f"({len(head_pods)} head, {len(worker_pods)} worker{'' if len(worker_pods) == 1 else 's'})"
                    )
                    return True

                # Log progress every 30 seconds
                if iteration % (30 // sleep_interval) == 0:
                    elapsed = int(time.time() - start_time)
                    remaining = launch_timeout - elapsed
                    logger.info(
                        f"RayCluster is not yet ready (elapsed: {elapsed}s, remaining: {remaining}s). "
                        f"State: {state}, Running pods: {len(running_pods)}/{total_expected_replicas} "
                        f"({len(head_pods)}/{head_replicas} head, {len(worker_pods)}/{worker_replicas} worker{'' if worker_replicas == 1 else 's'})"
                    )

            except RuntimeError as e:
                raise e
            except Exception as e:
                logger.error(f"Error checking RayCluster readiness: {e}")

            time.sleep(sleep_interval)

        # Timeout reached
        raise TimeoutError(f"RayCluster {service_name} did not become ready within {launch_timeout} seconds")

    def teardown_service(self, service_name: str, console=None) -> bool:
        """Teardown RayCluster and associated resources.

        Args:
            service_name: Name of the RayCluster to teardown
            console: Optional Rich console for output

        Returns:
            True if teardown was successful, False otherwise
        """
        success = True

        try:
            # Delete the RayCluster
            self.objects_api.delete_namespaced_custom_object(
                group="ray.io",
                version="v1",
                namespace=self.namespace,
                plural="rayclusters",
                name=service_name,
            )
            if console:
                console.print(f"✓ Deleted RayCluster [blue]{service_name}[/blue]")
            else:
                logger.info(f"Deleted RayCluster {service_name}")

        except client.exceptions.ApiException as e:
            if e.status == 404:
                if console:
                    console.print(f"[yellow]Note:[/yellow] RayCluster {service_name} not found or already deleted")
                else:
                    logger.info(f"RayCluster {service_name} not found or already deleted")
            else:
                if console:
                    console.print(f"[red]Error:[/red] Failed to delete RayCluster {service_name}: {e}")
                else:
                    logger.error(f"Failed to delete RayCluster {service_name}: {e}")
                success = False

        try:
            # Delete the associated Kubernetes service (created alongside RayCluster)
            self.core_api.delete_namespaced_service(name=service_name, namespace=self.namespace)
            if console:
                console.print(f"✓ Deleted service [blue]{service_name}[/blue]")
            else:
                logger.info(f"Deleted service {service_name}")

        except client.exceptions.ApiException as e:
            if e.status == 404:
                if console:
                    console.print(f"[yellow]Note:[/yellow] Service {service_name} not found or already deleted")
                else:
                    logger.info(f"Service {service_name} not found or already deleted")
            else:
                if console:
                    console.print(f"[red]Error:[/red] Failed to delete service {service_name}: {e}")
                else:
                    logger.error(f"Failed to delete service {service_name}: {e}")
                success = False

        return success

    def _check_ray_installation_error(self, service_name: str, head_pod_name: str) -> Optional[str]:
        """Check if there's a Ray installation error in the head pod logs.

        Args:
            service_name: Name of the RayCluster service
            head_pod_name: Name of the head pod

        Returns:
            Error message if Ray installation error is found, None otherwise
        """
        try:
            head_logs = self.core_api.read_namespaced_pod_log(
                name=head_pod_name, namespace=self.namespace, tail_lines=100
            )

            # Check for Ray installation errors
            if "ray: not found" in head_logs or "ray: command not found" in head_logs:
                return (
                    f"RayCluster {service_name} failed to start: Ray is not installed in the container. "
                    f"Please use a Ray-enabled image (e.g., rayproject/ray) or ensure Ray is installed in your container setup."
                )

            # Check for Ray startup errors
            if "Failed to start Ray server" in head_logs:
                return (
                    f"RayCluster {service_name} failed to start: Ray server failed to start. "
                    f"Check the head pod logs for more details."
                )

        except client.exceptions.ApiException as e:
            if e.status != 404:  # Pod might not be ready yet
                logger.warning(f"Could not check head pod logs: {e}")

        return None
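
The readiness check above drives everything off the `ray.io/v1` `rayclusters` custom resource and its `status.state` field, falling back to counting Running pods against `headGroupSpec` plus `workerGroupSpecs` replicas. For context only (not part of the package), here is a minimal standalone sketch of the same polling pattern using the official `kubernetes` Python client; the cluster name and namespace are placeholders.

# Illustrative sketch: poll a KubeRay RayCluster's status.state the same way
# check_service_ready does above. "my-raycluster" and "default" are placeholders.
import time

from kubernetes import client, config


def wait_for_raycluster_ready(name: str, namespace: str = "default", timeout: int = 300) -> bool:
    config.load_kube_config()  # use config.load_incluster_config() when running inside a pod
    objects_api = client.CustomObjectsApi()
    deadline = time.time() + timeout
    while time.time() < deadline:
        raycluster = objects_api.get_namespaced_custom_object(
            group="ray.io", version="v1", namespace=namespace, plural="rayclusters", name=name
        )
        state = raycluster.get("status", {}).get("state", "-")
        if state == "ready":
            return True
        if state == "failed":
            raise RuntimeError(f"RayCluster {name} failed to start")
        time.sleep(2)
    raise TimeoutError(f"RayCluster {name} did not become ready within {timeout} seconds")
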
kubetorch/serving/service_manager.py
@@ -0,0 +1,18 @@
# Backward compatibility imports - all service manager functionality is now in separate files
from kubetorch.logger import get_logger

# Import all service managers for backward compatibility and centralized access
from kubetorch.serving.base_service_manager import BaseServiceManager
from kubetorch.serving.deployment_service_manager import DeploymentServiceManager
from kubetorch.serving.knative_service_manager import KnativeServiceManager
from kubetorch.serving.raycluster_service_manager import RayClusterServiceManager

# Export all service managers
__all__ = [
    "BaseServiceManager",
    "DeploymentServiceManager",
    "KnativeServiceManager",
    "RayClusterServiceManager",
]

logger = get_logger(__name__)
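
Because this module only re-exports the managers, code written against the old consolidated `service_manager` module keeps working unchanged. A small illustrative check (not part of the package):

# Both import paths resolve to the same class after the split.
from kubetorch.serving.raycluster_service_manager import RayClusterServiceManager as direct_import
from kubetorch.serving.service_manager import RayClusterServiceManager as compat_import

assert direct_import is compat_import
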
kubetorch/serving/templates/deployment_template.yaml
@@ -0,0 +1,17 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ name }}
  namespace: {{ namespace }}
  annotations: {{ annotations | tojson }}
  labels: {{ labels | tojson }}
spec:
  replicas: {{ replicas }}
  selector:
    matchLabels:
      kubetorch.com/service: {{ name }}
  template:
    metadata:
      annotations: {{ template_annotations | tojson }}
      labels: {{ template_labels | tojson }}
    spec: {{ pod_template | tojson }}
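
For context, these templates are plain Jinja2; the built-in `tojson` filter serializes dict-valued parameters (labels, annotations, the whole pod spec) inline so the rendered output stays valid YAML. Below is a hypothetical rendering sketch using `jinja2` and PyYAML directly, with a trimmed stand-in template; kubetorch's own `load_template` helper is not reproduced here.

# Hypothetical sketch: render a kubetorch-style Jinja2 YAML template and parse the result.
# Assumes Jinja2 >= 2.9 (built-in `tojson` filter) and PyYAML are installed.
import jinja2
import yaml

TEMPLATE = """\
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ name }}
  labels: {{ labels | tojson }}
spec:
  replicas: {{ replicas }}
"""

rendered = jinja2.Template(TEMPLATE).render(
    name="example-svc",
    labels={"kubetorch.com/service": "example-svc"},
    replicas=2,
)
manifest = yaml.safe_load(rendered)
print(manifest["metadata"]["labels"])  # {'kubetorch.com/service': 'example-svc'}
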
kubetorch/serving/templates/knative_service_template.yaml
@@ -0,0 +1,19 @@
apiVersion: serving.knative.dev/v1
kind: Service
metadata:
  name: {{ name }}
  namespace: {{ namespace }}
  annotations: {{ annotations | tojson }}
  labels: {{ labels | tojson }}
spec:
  template:
    metadata:
      annotations: {{ template_annotations | tojson }}
      labels: {{ template_labels | tojson }}
    spec:
      {% if container_concurrency is defined %}
      containerConcurrency: {{ container_concurrency }}
      {% endif %}
      {% for key, value in pod_template.items() %}
      {{ key }}: {{ value | tojson }}
      {% endfor %}
kubetorch/serving/templates/kt_setup_template.sh.j2
@@ -0,0 +1,91 @@
# Increase file descriptor limit for large-scale distributed jobs
ulimit -n 65536

{% if python_path %}
export PATH="{{ python_path }}:$PATH"
if command -v "{{ python_path }}" &> /dev/null; then
    python_bin="{{ python_path }}"
fi
{% endif %}
# If the user set the python_path to exact executable, then we'll use it directly here but adding it to PATH
# above will have little effect. If they set it to a directory, then this command check will fail as desired,
# and we'll then look for python3 or python in PATH (starting with their directory) as desired.
if [[ -z "$python_bin" ]]; then
    if command -v python3 &> /dev/null; then
        python_bin="python3"
    elif command -v python &> /dev/null; then
        python_bin="python"
    else
        echo "Error: Neither python3 nor python found in PATH. Please set python_path to a valid Python executable."
        exit 1
    fi
fi
echo "Using Python binary: $python_bin"

{% if not freeze %}
if ! command -v rsync &> /dev/null; then
    apt-get update && apt-get install -y rsync
fi
if ! command -v nohup &> /dev/null; then
    apt-get update && apt-get install -y coreutils
fi

{% if install_cmd %}
# Use the explicitly provided install command
uv_pip_cmd="{{ install_cmd }}"
{% else %}

if $python_bin -c "import sys; exit(0 if sys.prefix != sys.base_prefix else 1)" 2>/dev/null; then
    install_flags=""
else
    install_flags="--system --break-system-packages"
fi

# Check if uv is available and set the appropriate command
if command -v uv &> /dev/null; then
    # Use system-wide uv with the detected Python interpreter
    uv_pip_cmd="uv pip install $install_flags --python=$python_bin"
elif $python_bin -m uv --version &> /dev/null; then
    # Use Python module uv - it inherently uses the right Python
    uv_pip_cmd="$python_bin -m uv pip install $install_flags"
else
    # Install uv as a Python module and use it
    echo "uv not found, installing it..."
    $python_bin -m pip install uv
    uv_pip_cmd="$python_bin -m uv pip install $install_flags"
fi
{% endif %}

# Export the install command as an environment variable for use in applications
echo "Setting KT_PIP_INSTALL_CMD env var to $uv_pip_cmd"
export KT_PIP_INSTALL_CMD="$uv_pip_cmd"
mkdir -p .kt
echo "$uv_pip_cmd" > .kt/kt_pip_install_cmd

{% if rsync_kt_local_cmd %}
{{ rsync_kt_local_cmd }}
{% if install_url and install_url.endswith('.whl') %}
{% set normalized_path = install_url.replace('\\', '/') %}
{% set wheel_filename = normalized_path.split('/')[-1] %}
$uv_pip_cmd "{{ wheel_filename }}[server]"
{% if install_otel %}
$uv_pip_cmd "{{ wheel_filename }}[otel]"
{% endif %}
{% else %}
$uv_pip_cmd -e "python_client[server]"
{% if install_otel %}
$uv_pip_cmd -e "python_client[otel]"
{% endif %}
{% endif %}
{% else %}
$uv_pip_cmd "kubetorch[server]=={{ install_url }}"
{% if install_otel %}
$uv_pip_cmd "kubetorch[otel]=={{ install_url }}"
{% endif %}
{% endif %}

{% endif %}

$python_bin -m uvicorn kubetorch.servers.http.http_server:app \
    --host 0.0.0.0 \
    --port {{ server_port }}
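
The install-flag selection above hinges on a single check: whether the target interpreter already lives in a virtual environment (`sys.prefix != sys.base_prefix`). Only a bare system interpreter gets `--system --break-system-packages`. An illustrative restatement of that check in Python (not part of the package):

# The same venv detection the setup script runs via `$python_bin -c ...`:
# inside a venv, sys.prefix points at the venv while sys.base_prefix points at the base install.
import sys


def uv_install_flags() -> str:
    in_virtualenv = sys.prefix != sys.base_prefix
    return "" if in_virtualenv else "--system --break-system-packages"


print(uv_install_flags())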