kubetorch 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kubetorch might be problematic. Click here for more details.
- kubetorch/__init__.py +60 -0
- kubetorch/cli.py +1985 -0
- kubetorch/cli_utils.py +1025 -0
- kubetorch/config.py +453 -0
- kubetorch/constants.py +18 -0
- kubetorch/docs/Makefile +18 -0
- kubetorch/docs/__init__.py +0 -0
- kubetorch/docs/_ext/json_globaltoc.py +42 -0
- kubetorch/docs/api/cli.rst +10 -0
- kubetorch/docs/api/python/app.rst +21 -0
- kubetorch/docs/api/python/cls.rst +19 -0
- kubetorch/docs/api/python/compute.rst +25 -0
- kubetorch/docs/api/python/config.rst +11 -0
- kubetorch/docs/api/python/fn.rst +19 -0
- kubetorch/docs/api/python/image.rst +14 -0
- kubetorch/docs/api/python/secret.rst +18 -0
- kubetorch/docs/api/python/volumes.rst +13 -0
- kubetorch/docs/api/python.rst +101 -0
- kubetorch/docs/conf.py +69 -0
- kubetorch/docs/index.rst +20 -0
- kubetorch/docs/requirements.txt +5 -0
- kubetorch/globals.py +285 -0
- kubetorch/logger.py +59 -0
- kubetorch/resources/__init__.py +0 -0
- kubetorch/resources/callables/__init__.py +0 -0
- kubetorch/resources/callables/cls/__init__.py +0 -0
- kubetorch/resources/callables/cls/cls.py +157 -0
- kubetorch/resources/callables/fn/__init__.py +0 -0
- kubetorch/resources/callables/fn/fn.py +133 -0
- kubetorch/resources/callables/module.py +1416 -0
- kubetorch/resources/callables/utils.py +174 -0
- kubetorch/resources/compute/__init__.py +0 -0
- kubetorch/resources/compute/app.py +261 -0
- kubetorch/resources/compute/compute.py +2596 -0
- kubetorch/resources/compute/decorators.py +139 -0
- kubetorch/resources/compute/rbac.py +74 -0
- kubetorch/resources/compute/utils.py +1114 -0
- kubetorch/resources/compute/websocket.py +137 -0
- kubetorch/resources/images/__init__.py +1 -0
- kubetorch/resources/images/image.py +414 -0
- kubetorch/resources/images/images.py +74 -0
- kubetorch/resources/secrets/__init__.py +2 -0
- kubetorch/resources/secrets/kubernetes_secrets_client.py +412 -0
- kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
- kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
- kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
- kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
- kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/providers.py +93 -0
- kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
- kubetorch/resources/secrets/secret.py +238 -0
- kubetorch/resources/secrets/secret_factory.py +70 -0
- kubetorch/resources/secrets/utils.py +209 -0
- kubetorch/resources/volumes/__init__.py +0 -0
- kubetorch/resources/volumes/volume.py +365 -0
- kubetorch/servers/__init__.py +0 -0
- kubetorch/servers/http/__init__.py +0 -0
- kubetorch/servers/http/distributed_utils.py +3223 -0
- kubetorch/servers/http/http_client.py +730 -0
- kubetorch/servers/http/http_server.py +1788 -0
- kubetorch/servers/http/server_metrics.py +278 -0
- kubetorch/servers/http/utils.py +728 -0
- kubetorch/serving/__init__.py +0 -0
- kubetorch/serving/autoscaling.py +173 -0
- kubetorch/serving/base_service_manager.py +363 -0
- kubetorch/serving/constants.py +83 -0
- kubetorch/serving/deployment_service_manager.py +478 -0
- kubetorch/serving/knative_service_manager.py +519 -0
- kubetorch/serving/raycluster_service_manager.py +582 -0
- kubetorch/serving/service_manager.py +18 -0
- kubetorch/serving/templates/deployment_template.yaml +17 -0
- kubetorch/serving/templates/knative_service_template.yaml +19 -0
- kubetorch/serving/templates/kt_setup_template.sh.j2 +81 -0
- kubetorch/serving/templates/pod_template.yaml +194 -0
- kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
- kubetorch/serving/templates/raycluster_template.yaml +35 -0
- kubetorch/serving/templates/service_template.yaml +21 -0
- kubetorch/serving/templates/workerset_template.yaml +36 -0
- kubetorch/serving/utils.py +377 -0
- kubetorch/utils.py +284 -0
- kubetorch-0.2.0.dist-info/METADATA +121 -0
- kubetorch-0.2.0.dist-info/RECORD +93 -0
- kubetorch-0.2.0.dist-info/WHEEL +4 -0
- kubetorch-0.2.0.dist-info/entry_points.txt +5 -0
|
@@ -0,0 +1,478 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import time
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from typing import List, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
from kubernetes import client
|
|
8
|
+
|
|
9
|
+
import kubetorch.serving.constants as serving_constants
|
|
10
|
+
from kubetorch.logger import get_logger
|
|
11
|
+
from kubetorch.resources.compute.utils import (
|
|
12
|
+
check_pod_events_for_errors,
|
|
13
|
+
check_pod_status_for_errors,
|
|
14
|
+
check_replicaset_events_for_errors,
|
|
15
|
+
ServiceTimeoutError,
|
|
16
|
+
)
|
|
17
|
+
from kubetorch.servers.http.utils import load_template
|
|
18
|
+
from kubetorch.serving.base_service_manager import BaseServiceManager
|
|
19
|
+
from kubetorch.serving.utils import nested_override
|
|
20
|
+
|
|
21
|
+
logger = get_logger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DeploymentServiceManager(BaseServiceManager):
|
|
25
|
+
"""Service manager for Kubernetes Deployments with distributed computing support."""
|
|
26
|
+
|
|
27
|
+
def _create_or_update_deployment(
|
|
28
|
+
self,
|
|
29
|
+
name: str,
|
|
30
|
+
module_name: str,
|
|
31
|
+
pod_template: dict,
|
|
32
|
+
replicas: int = 1,
|
|
33
|
+
inactivity_ttl: str = None,
|
|
34
|
+
custom_labels: dict = None,
|
|
35
|
+
custom_annotations: dict = None,
|
|
36
|
+
custom_template: dict = None,
|
|
37
|
+
scheduler_name: str = None,
|
|
38
|
+
queue_name: str = None,
|
|
39
|
+
dryrun: bool = False,
|
|
40
|
+
) -> Tuple[dict, bool]:
|
|
41
|
+
"""Creates or updates a Deployment for distributed deployments.
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
Tuple (created_deployment, is_new_deployment)
|
|
45
|
+
"""
|
|
46
|
+
clean_module_name = re.sub(r"[^A-Za-z0-9.-]|^[-.]|[-.]$", "", module_name)
|
|
47
|
+
service_name = name # Use regular service name, not headless
|
|
48
|
+
|
|
49
|
+
labels = {
|
|
50
|
+
**self.base_labels,
|
|
51
|
+
serving_constants.KT_MODULE_LABEL: clean_module_name,
|
|
52
|
+
serving_constants.KT_SERVICE_LABEL: name,
|
|
53
|
+
serving_constants.KT_TEMPLATE_LABEL: "deployment", # Mark as source-of-truth
|
|
54
|
+
}
|
|
55
|
+
if custom_labels:
|
|
56
|
+
labels.update(custom_labels)
|
|
57
|
+
|
|
58
|
+
# Template labels (exclude template label - that's only for the top-level resource)
|
|
59
|
+
template_labels = {
|
|
60
|
+
**self.base_labels,
|
|
61
|
+
serving_constants.KT_MODULE_LABEL: clean_module_name,
|
|
62
|
+
serving_constants.KT_SERVICE_LABEL: name,
|
|
63
|
+
}
|
|
64
|
+
if custom_labels:
|
|
65
|
+
template_labels.update(custom_labels)
|
|
66
|
+
|
|
67
|
+
# Service labels (also exclude template label - supporting resource, not source-of-truth)
|
|
68
|
+
service_labels = {
|
|
69
|
+
**self.base_labels,
|
|
70
|
+
serving_constants.KT_MODULE_LABEL: clean_module_name,
|
|
71
|
+
serving_constants.KT_SERVICE_LABEL: name,
|
|
72
|
+
}
|
|
73
|
+
if custom_labels:
|
|
74
|
+
service_labels.update(custom_labels)
|
|
75
|
+
|
|
76
|
+
annotations = {
|
|
77
|
+
"prometheus.io/scrape": "true",
|
|
78
|
+
"prometheus.io/path": serving_constants.PROMETHEUS_HEALTH_ENDPOINT,
|
|
79
|
+
"prometheus.io/port": "8080",
|
|
80
|
+
}
|
|
81
|
+
if custom_annotations:
|
|
82
|
+
annotations.update(custom_annotations)
|
|
83
|
+
|
|
84
|
+
if inactivity_ttl:
|
|
85
|
+
annotations[serving_constants.INACTIVITY_TTL_ANNOTATION] = inactivity_ttl
|
|
86
|
+
logger.info(f"Configuring auto-down after idle timeout ({inactivity_ttl})")
|
|
87
|
+
|
|
88
|
+
if scheduler_name and queue_name:
|
|
89
|
+
labels["kai.scheduler/queue"] = queue_name # Useful for queries, etc
|
|
90
|
+
template_labels[
|
|
91
|
+
"kai.scheduler/queue"
|
|
92
|
+
] = queue_name # Required for KAI to schedule pods
|
|
93
|
+
|
|
94
|
+
deployment_timestamp = datetime.now(timezone.utc).isoformat()
|
|
95
|
+
template_annotations = {
|
|
96
|
+
"kubetorch.com/deployment_timestamp": deployment_timestamp
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
# Create Deployment
|
|
100
|
+
deployment = load_template(
|
|
101
|
+
template_file=serving_constants.DEPLOYMENT_TEMPLATE_FILE,
|
|
102
|
+
template_dir=os.path.join(
|
|
103
|
+
os.path.dirname(os.path.abspath(__file__)), "templates"
|
|
104
|
+
),
|
|
105
|
+
name=name,
|
|
106
|
+
namespace=self.namespace,
|
|
107
|
+
annotations=annotations,
|
|
108
|
+
template_annotations=template_annotations,
|
|
109
|
+
labels=labels,
|
|
110
|
+
template_labels=template_labels,
|
|
111
|
+
pod_template=pod_template,
|
|
112
|
+
replicas=replicas,
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
if custom_template:
|
|
116
|
+
nested_override(deployment, custom_template)
|
|
117
|
+
|
|
118
|
+
# Check if this is a distributed deployment
|
|
119
|
+
env_vars = pod_template.get("containers", [{}])[0].get("env", [])
|
|
120
|
+
is_distributed = any(
|
|
121
|
+
env.get("name") == "KT_DISTRIBUTED_CONFIG"
|
|
122
|
+
and env.get("value") != "null"
|
|
123
|
+
and env.get("value")
|
|
124
|
+
for env in env_vars
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
# Create regular service with session affinity
|
|
128
|
+
service = load_template(
|
|
129
|
+
template_file=serving_constants.DEPLOYMENT_SERVICE_TEMPLATE_FILE,
|
|
130
|
+
template_dir=os.path.join(
|
|
131
|
+
os.path.dirname(os.path.abspath(__file__)), "templates"
|
|
132
|
+
),
|
|
133
|
+
name=service_name,
|
|
134
|
+
namespace=self.namespace,
|
|
135
|
+
annotations=annotations,
|
|
136
|
+
labels=service_labels,
|
|
137
|
+
deployment_name=name,
|
|
138
|
+
module_name=clean_module_name,
|
|
139
|
+
distributed=False, # Keep regular service for client access
|
|
140
|
+
server_port=pod_template.get("containers", [{}])[0]
|
|
141
|
+
.get("ports", [{}])[0]
|
|
142
|
+
.get("containerPort", 32300),
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
# For distributed deployments, also create a headless service for pod discovery
|
|
146
|
+
headless_service = None
|
|
147
|
+
if is_distributed:
|
|
148
|
+
headless_service = load_template(
|
|
149
|
+
template_file=serving_constants.DEPLOYMENT_SERVICE_TEMPLATE_FILE,
|
|
150
|
+
template_dir=os.path.join(
|
|
151
|
+
os.path.dirname(os.path.abspath(__file__)), "templates"
|
|
152
|
+
),
|
|
153
|
+
name=f"{service_name}-headless", # Use different name for headless
|
|
154
|
+
namespace=self.namespace,
|
|
155
|
+
annotations=annotations,
|
|
156
|
+
labels=service_labels,
|
|
157
|
+
deployment_name=name,
|
|
158
|
+
module_name=clean_module_name,
|
|
159
|
+
distributed=True, # Make this one headless
|
|
160
|
+
server_port=pod_template.get("containers", [{}])[0]
|
|
161
|
+
.get("ports", [{}])[0]
|
|
162
|
+
.get("containerPort", 32300),
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
try:
|
|
166
|
+
kwargs = {"dry_run": "All"} if dryrun else {}
|
|
167
|
+
|
|
168
|
+
# Create regular service first
|
|
169
|
+
try:
|
|
170
|
+
self.core_api.create_namespaced_service(
|
|
171
|
+
namespace=self.namespace,
|
|
172
|
+
body=service,
|
|
173
|
+
**kwargs,
|
|
174
|
+
)
|
|
175
|
+
if not dryrun:
|
|
176
|
+
logger.info(
|
|
177
|
+
f"Created service {service_name} in namespace {self.namespace}"
|
|
178
|
+
)
|
|
179
|
+
except client.exceptions.ApiException as e:
|
|
180
|
+
if e.status == 409:
|
|
181
|
+
logger.info(f"Service {service_name} already exists")
|
|
182
|
+
else:
|
|
183
|
+
raise
|
|
184
|
+
|
|
185
|
+
# Create headless service for distributed pod discovery
|
|
186
|
+
if headless_service:
|
|
187
|
+
try:
|
|
188
|
+
self.core_api.create_namespaced_service(
|
|
189
|
+
namespace=self.namespace,
|
|
190
|
+
body=headless_service,
|
|
191
|
+
**kwargs,
|
|
192
|
+
)
|
|
193
|
+
if not dryrun:
|
|
194
|
+
logger.info(
|
|
195
|
+
f"Created headless service {service_name}-headless in namespace {self.namespace}"
|
|
196
|
+
)
|
|
197
|
+
except client.exceptions.ApiException as e:
|
|
198
|
+
if e.status == 409:
|
|
199
|
+
logger.info(
|
|
200
|
+
f"Headless service {service_name}-headless already exists"
|
|
201
|
+
)
|
|
202
|
+
else:
|
|
203
|
+
raise
|
|
204
|
+
|
|
205
|
+
# Create Deployment
|
|
206
|
+
created_deployment = self.apps_v1_api.create_namespaced_deployment(
|
|
207
|
+
namespace=self.namespace,
|
|
208
|
+
body=deployment,
|
|
209
|
+
**kwargs,
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
if dryrun:
|
|
213
|
+
return created_deployment, False
|
|
214
|
+
|
|
215
|
+
logger.info(f"Created Deployment {name} in namespace {self.namespace}")
|
|
216
|
+
return created_deployment, True
|
|
217
|
+
|
|
218
|
+
except client.exceptions.ApiException as e:
|
|
219
|
+
if e.status == 409:
|
|
220
|
+
logger.info(f"Deployment {name} already exists, updating")
|
|
221
|
+
existing_deployment = self.get_deployment(name)
|
|
222
|
+
|
|
223
|
+
# Update replicas if different
|
|
224
|
+
if existing_deployment.spec.replicas != replicas:
|
|
225
|
+
patch_body = {"spec": {"replicas": replicas}}
|
|
226
|
+
try:
|
|
227
|
+
self.apps_v1_api.patch_namespaced_deployment(
|
|
228
|
+
name=name,
|
|
229
|
+
namespace=self.namespace,
|
|
230
|
+
body=patch_body,
|
|
231
|
+
)
|
|
232
|
+
logger.info(f"Updated Deployment {name} replicas to {replicas}")
|
|
233
|
+
except Exception as e:
|
|
234
|
+
logger.error(f"Failed to patch Deployment {name}: {e}")
|
|
235
|
+
raise e
|
|
236
|
+
|
|
237
|
+
return existing_deployment, False
|
|
238
|
+
else:
|
|
239
|
+
logger.error(f"Failed to create Deployment: {str(e)}")
|
|
240
|
+
raise e
|
|
241
|
+
|
|
242
|
+
def get_deployment(self, deployment_name: str) -> dict:
    """Fetch the Deployment object with the given name from this namespace.

    Raises:
        client.exceptions.ApiException: If the read fails; the error is
            logged before being re-raised.
    """
    try:
        return self.apps_v1_api.read_namespaced_deployment(
            name=deployment_name, namespace=self.namespace
        )
    except client.exceptions.ApiException as api_err:
        logger.error(f"Failed to load Deployment '{deployment_name}': {str(api_err)}")
        raise
|
|
253
|
+
|
|
254
|
+
def get_deployment_timestamp_annotation(self, service_name: str) -> Optional[str]:
    """Get deployment timestamp annotation for Deployment services.

    Returns the value of ``kubetorch.com/deployment_timestamp``, or None
    when the Deployment cannot be read, has no metadata, or carries no
    annotations (``metadata.annotations`` may be ``None`` rather than an
    empty dict, which the previous ``hasattr`` check did not guard).
    """
    try:
        deployment = self.get_deployment(service_name)
        # getattr(..., None) checks both attribute presence and None-ness;
        # hasattr alone passes when annotations is None, and None.get(...)
        # would raise an uncaught AttributeError.
        if (
            deployment
            and getattr(deployment, "metadata", None) is not None
            and getattr(deployment.metadata, "annotations", None) is not None
        ):
            return deployment.metadata.annotations.get(
                "kubetorch.com/deployment_timestamp", None
            )
    except client.exceptions.ApiException:
        # Best-effort lookup: treat API errors as "no timestamp".
        pass
    return None
|
|
269
|
+
|
|
270
|
+
def update_deployment_timestamp_annotation(
    self, service_name: str, new_timestamp: str
) -> str:
    """Patch the Deployment's ``kubetorch.com/deployment_timestamp`` annotation.

    Returns:
        The timestamp value that was written.

    Raises:
        client.exceptions.ApiException: If the patch fails; the error is
            logged before being re-raised.
    """
    annotation_patch = {
        "metadata": {
            "annotations": {"kubetorch.com/deployment_timestamp": new_timestamp}
        }
    }
    try:
        self.apps_v1_api.patch_namespaced_deployment(
            name=service_name,
            namespace=self.namespace,
            body=annotation_patch,
        )
    except client.exceptions.ApiException as api_err:
        logger.error(
            f"Failed to update deployment timestamp for '{service_name}': {str(api_err)}"
        )
        raise
    return new_timestamp
|
|
291
|
+
|
|
292
|
+
def create_or_update_service(
    self,
    service_name: str,
    module_name: str,
    pod_template: dict,
    replicas: int = 1,
    inactivity_ttl: str = None,
    custom_labels: dict = None,
    custom_annotations: dict = None,
    custom_template: dict = None,
    scheduler_name: str = None,
    queue_name: str = None,
    dryrun: bool = False,
    **kwargs,  # Ignore Knative-specific args like autoscaling_config, inactivity_ttl, etc.
):
    """Create (or update) the Deployment backing a Kubetorch service.

    Thin wrapper around ``_create_or_update_deployment`` that logs the
    launch, forwards every deployment option, and re-raises any failure
    after logging it.

    Args:
        service_name (str): Name for the pod/service.
        module_name (str): Name of the module.
        pod_template (dict): Template for the pod, including resource requirements.
        replicas (int): Number of replicas for the service.
        inactivity_ttl (str, optional): Idle TTL used for auto-down.
        custom_labels (dict, optional): Custom labels to add to the service.
        custom_annotations (dict, optional): Custom annotations to add to the service.
        custom_template (dict, optional): Custom template to apply to the service.
        scheduler_name (str, optional): Scheduler for the service's pods.
        queue_name (str, optional): Scheduler queue name.
        dryrun (bool, optional): Whether to run in dryrun mode (Default: `False`).

    Returns:
        The created (or pre-existing) Deployment object.
    """
    logger.info(f"Deploying Kubetorch service with name: {service_name}")
    try:
        deployment_obj, _is_new = self._create_or_update_deployment(
            name=service_name,
            pod_template=pod_template,
            module_name=module_name,
            replicas=replicas,
            inactivity_ttl=inactivity_ttl,
            custom_labels=custom_labels,
            custom_annotations=custom_annotations,
            custom_template=custom_template,
            scheduler_name=scheduler_name,
            queue_name=queue_name,
            dryrun=dryrun,
        )
    except Exception as launch_err:
        logger.error(f"Failed to launch new Deployment: {str(launch_err)}")
        raise launch_err
    return deployment_obj
|
|
339
|
+
|
|
340
|
+
def get_pods_for_service(self, service_name: str, **kwargs) -> List[client.V1Pod]:
    """Look up all running pods that belong to this Deployment service.

    Args:
        service_name (str): Name of the service

    Returns:
        List[V1Pod]: List of running pods associated with the service.
    """
    # Delegate to the shared static lookup on the base manager, scoped to
    # this manager's namespace and core API client.
    lookup_args = dict(
        service_name=service_name,
        namespace=self.namespace,
        core_api=self.core_api,
    )
    return self.get_pods_for_service_static(**lookup_args)
|
|
354
|
+
|
|
355
|
+
def get_endpoint(self, service_name: str) -> str:
    """Return the in-cluster HTTP endpoint URL for a Deployment service."""
    # Standard Kubernetes cluster-local DNS name, served on port 80.
    cluster_host = f"{service_name}.{self.namespace}.svc.cluster.local"
    return f"http://{cluster_host}:80"
|
|
358
|
+
|
|
359
|
+
def check_service_ready(
    self,
    service_name: str,
    launch_timeout: int,
    core_api: client.CoreV1Api = None,
    **kwargs,
) -> bool:
    """Checks if the Deployment is ready to start serving requests.

    Polls the Deployment every ``sleep_interval`` seconds until all desired
    replicas report ready, raising early if pod or ReplicaSet errors are
    detected by the helper checks.

    Args:
        service_name: Name of the Deployment service
        launch_timeout: Timeout in seconds to wait for readiness
        core_api: Core API instance (uses self.core_api if None)
        **kwargs: Additional arguments (ignored for Deployments)

    Returns:
        True if service is ready

    Raises:
        ServiceTimeoutError: If service doesn't become ready within timeout
    """
    if core_api is None:
        core_api = self.core_api

    # Poll cadence in seconds; also paces the periodic debug/info logs below.
    sleep_interval = 2
    start_time = time.time()

    logger.info(
        f"Checking Deployment {service_name} pod readiness (timeout: {launch_timeout} seconds)"
    )

    iteration = 0
    while (time.time() - start_time) < launch_timeout:
        iteration += 1
        try:
            # Get Deployment
            deployment = self.get_deployment(service_name)
            # NOTE(review): get_deployment re-raises ApiException rather than
            # returning None, so this falsy check looks defensive — confirm.
            if not deployment:
                logger.debug(f"Waiting for Deployment {service_name} to be created")
                time.sleep(sleep_interval)
                continue

            # Check if all replicas are ready. Both fields can be None on a
            # fresh Deployment, hence the `or 0` fallbacks.
            ready_replicas = deployment.status.ready_replicas or 0
            desired_replicas = deployment.spec.replicas or 0

            # Log status every 3rd iteration (~6s) to avoid log spam.
            if iteration % 3 == 0:
                logger.debug(
                    f"Deployment {service_name}: {ready_replicas}/{desired_replicas} replicas ready"
                )

            # `desired_replicas > 0` guards against declaring a 0-replica
            # Deployment "ready" before any pods exist.
            if ready_replicas >= desired_replicas and desired_replicas > 0:
                logger.info(
                    f"Deployment {service_name} pod(s) are now ready with {ready_replicas} replicas"
                )
                return True

            # Check for pod-level issues. These helpers raise on fatal
            # conditions so we fail fast instead of waiting out the timeout.
            pods = self.get_pods_for_service(service_name)
            for pod in pods:
                # Check for image pull errors in container status
                check_pod_status_for_errors(pod)

                # Check pod events separately from the core API
                check_pod_events_for_errors(pod, self.namespace, core_api)

            # If no pods exist, check for ReplicaSet-level errors (like PriorityClass issues)
            if not pods:
                check_replicaset_events_for_errors(
                    namespace=self.namespace,
                    service_name=service_name,
                    apps_v1_api=self.apps_v1_api,
                    core_api=core_api,
                )

        except client.exceptions.ApiException as e:
            # API errors while polling are not retried; surface them.
            logger.error(f"Error checking Deployment readiness: {e}")
            raise

        # Progress log every 10th iteration (~20s).
        if iteration % 10 == 0:
            elapsed = int(time.time() - start_time)
            remaining = max(0, int(launch_timeout - elapsed))
            logger.info(
                f"Deployment is not yet ready "
                f"(elapsed: {elapsed}s, remaining: {remaining}s)"
            )

        time.sleep(sleep_interval)

    raise ServiceTimeoutError(
        f"Deployment {service_name} is not ready after {launch_timeout} seconds"
    )
|
|
451
|
+
|
|
452
|
+
def teardown_service(self, service_name: str, console=None) -> bool:
    """Teardown Deployment and associated resources.

    Args:
        service_name: Name of the Deployment to teardown
        console: Optional Rich console for output

    Returns:
        True if teardown was successful, False otherwise
    """
    # Function-scope import — presumably avoids a circular import at module
    # load time; confirm before hoisting.
    from kubetorch.resources.compute.utils import delete_deployment

    try:
        # Removes the Deployment together with its associated service.
        delete_deployment(
            apps_v1_api=self.apps_v1_api,
            core_api=self.core_api,
            name=service_name,
            namespace=self.namespace,
            console=console,
        )
    except Exception as teardown_err:
        logger.error(f"Failed to teardown Deployment {service_name}: {teardown_err}")
        return False
    return True
|