kubetorch 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of kubetorch might be problematic.
- kubetorch/__init__.py +60 -0
- kubetorch/cli.py +1985 -0
- kubetorch/cli_utils.py +1025 -0
- kubetorch/config.py +453 -0
- kubetorch/constants.py +18 -0
- kubetorch/docs/Makefile +18 -0
- kubetorch/docs/__init__.py +0 -0
- kubetorch/docs/_ext/json_globaltoc.py +42 -0
- kubetorch/docs/api/cli.rst +10 -0
- kubetorch/docs/api/python/app.rst +21 -0
- kubetorch/docs/api/python/cls.rst +19 -0
- kubetorch/docs/api/python/compute.rst +25 -0
- kubetorch/docs/api/python/config.rst +11 -0
- kubetorch/docs/api/python/fn.rst +19 -0
- kubetorch/docs/api/python/image.rst +14 -0
- kubetorch/docs/api/python/secret.rst +18 -0
- kubetorch/docs/api/python/volumes.rst +13 -0
- kubetorch/docs/api/python.rst +101 -0
- kubetorch/docs/conf.py +69 -0
- kubetorch/docs/index.rst +20 -0
- kubetorch/docs/requirements.txt +5 -0
- kubetorch/globals.py +285 -0
- kubetorch/logger.py +59 -0
- kubetorch/resources/__init__.py +0 -0
- kubetorch/resources/callables/__init__.py +0 -0
- kubetorch/resources/callables/cls/__init__.py +0 -0
- kubetorch/resources/callables/cls/cls.py +157 -0
- kubetorch/resources/callables/fn/__init__.py +0 -0
- kubetorch/resources/callables/fn/fn.py +133 -0
- kubetorch/resources/callables/module.py +1416 -0
- kubetorch/resources/callables/utils.py +174 -0
- kubetorch/resources/compute/__init__.py +0 -0
- kubetorch/resources/compute/app.py +261 -0
- kubetorch/resources/compute/compute.py +2596 -0
- kubetorch/resources/compute/decorators.py +139 -0
- kubetorch/resources/compute/rbac.py +74 -0
- kubetorch/resources/compute/utils.py +1114 -0
- kubetorch/resources/compute/websocket.py +137 -0
- kubetorch/resources/images/__init__.py +1 -0
- kubetorch/resources/images/image.py +414 -0
- kubetorch/resources/images/images.py +74 -0
- kubetorch/resources/secrets/__init__.py +2 -0
- kubetorch/resources/secrets/kubernetes_secrets_client.py +412 -0
- kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
- kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
- kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
- kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
- kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/providers.py +93 -0
- kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
- kubetorch/resources/secrets/secret.py +238 -0
- kubetorch/resources/secrets/secret_factory.py +70 -0
- kubetorch/resources/secrets/utils.py +209 -0
- kubetorch/resources/volumes/__init__.py +0 -0
- kubetorch/resources/volumes/volume.py +365 -0
- kubetorch/servers/__init__.py +0 -0
- kubetorch/servers/http/__init__.py +0 -0
- kubetorch/servers/http/distributed_utils.py +3223 -0
- kubetorch/servers/http/http_client.py +730 -0
- kubetorch/servers/http/http_server.py +1788 -0
- kubetorch/servers/http/server_metrics.py +278 -0
- kubetorch/servers/http/utils.py +728 -0
- kubetorch/serving/__init__.py +0 -0
- kubetorch/serving/autoscaling.py +173 -0
- kubetorch/serving/base_service_manager.py +363 -0
- kubetorch/serving/constants.py +83 -0
- kubetorch/serving/deployment_service_manager.py +478 -0
- kubetorch/serving/knative_service_manager.py +519 -0
- kubetorch/serving/raycluster_service_manager.py +582 -0
- kubetorch/serving/service_manager.py +18 -0
- kubetorch/serving/templates/deployment_template.yaml +17 -0
- kubetorch/serving/templates/knative_service_template.yaml +19 -0
- kubetorch/serving/templates/kt_setup_template.sh.j2 +81 -0
- kubetorch/serving/templates/pod_template.yaml +194 -0
- kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
- kubetorch/serving/templates/raycluster_template.yaml +35 -0
- kubetorch/serving/templates/service_template.yaml +21 -0
- kubetorch/serving/templates/workerset_template.yaml +36 -0
- kubetorch/serving/utils.py +377 -0
- kubetorch/utils.py +284 -0
- kubetorch-0.2.0.dist-info/METADATA +121 -0
- kubetorch-0.2.0.dist-info/RECORD +93 -0
- kubetorch-0.2.0.dist-info/WHEEL +4 -0
- kubetorch-0.2.0.dist-info/entry_points.txt +5 -0
kubetorch/serving/knative_service_manager.py
@@ -0,0 +1,519 @@

import os
import re
import time
from datetime import datetime, timezone
from typing import List, Optional

from kubernetes import client

import kubetorch as kt
import kubetorch.serving.constants as serving_constants
from kubetorch.logger import get_logger
from kubetorch.resources.compute.utils import (
    check_pod_events_for_errors,
    check_pod_status_for_errors,
    check_revision_for_errors,
    ServiceTimeoutError,
)
from kubetorch.servers.http.utils import load_template
from kubetorch.serving.autoscaling import AutoscalingConfig
from kubetorch.serving.base_service_manager import BaseServiceManager
from kubetorch.serving.utils import nested_override, pod_is_running

logger = get_logger(__name__)


class KnativeServiceManager(BaseServiceManager):
    """Service manager for Knative services with autoscaling capabilities."""

    def _create_or_update_knative_service(
        self,
        name: str,
        module_name: str,
        pod_template: dict,
        autoscaling_config: AutoscalingConfig = None,
        gpu_annotations: dict = None,
        inactivity_ttl: str = None,
        custom_labels: dict = None,
        custom_annotations: dict = None,
        custom_template: dict = None,
        scheduler_name: str = None,
        queue_name: str = None,
        dryrun: bool = False,
    ) -> dict:
        """Creates or updates a Knative service based on the provided configuration.

        Returns:
            Dict
        """
        # Clean the module name to remove any invalid characters for labels
        clean_module_name = re.sub(r"[^A-Za-z0-9.-]|^[-.]|[-.]$", "", module_name)

        labels = {
            **self.base_labels,
            serving_constants.KT_MODULE_LABEL: clean_module_name,
            serving_constants.KT_SERVICE_LABEL: name,
            serving_constants.KT_TEMPLATE_LABEL: "ksvc",
        }

        if custom_labels:
            labels.update(custom_labels)

        # Template labels (exclude template label - that's only for the top-level resource)
        template_labels = {
            **self.base_labels,
            serving_constants.KT_MODULE_LABEL: clean_module_name,
            serving_constants.KT_SERVICE_LABEL: name,
        }

        if custom_labels:
            template_labels.update(custom_labels)

        template_annotations = {
            "networking.knative.dev/ingress.class": "kourier.ingress.networking.knative.dev",
        }

        annotations = {
            "prometheus.io/scrape": "true",
            "prometheus.io/port": "8080",
            "prometheus.io/path": serving_constants.PROMETHEUS_HEALTH_ENDPOINT,
            "serving.knative.dev/container-name": "kubetorch",
            "serving.knative.dev/probe-path": "/health",
        }
        if custom_annotations:
            annotations.update(custom_annotations)

        if scheduler_name and queue_name:
            labels["kai.scheduler/queue"] = queue_name  # Useful for queries, etc
            template_labels[
                "kai.scheduler/queue"
            ] = queue_name  # Required for KAI to schedule pods
            # Note: KAI wraps the Knative revision in a podgroup, expecting at least 1 pod to schedule initially
            # Only set min-scale=1 if user hasn't explicitly provided a min_scale value
            if autoscaling_config.min_scale is None:
                template_annotations["autoscaling.knative.dev/min-scale"] = "1"

        # Add autoscaling annotations (config always provided)
        autoscaling_annotations = autoscaling_config.convert_to_annotations()
        template_annotations.update(autoscaling_annotations)

        # Add progress deadline if specified (not an autoscaling annotation)
        if autoscaling_config.progress_deadline is not None:
            template_annotations[
                "serving.knative.dev/progress-deadline"
            ] = autoscaling_config.progress_deadline

        if inactivity_ttl:
            annotations[serving_constants.INACTIVITY_TTL_ANNOTATION] = inactivity_ttl
            logger.info(f"Configuring auto-down after idle timeout ({inactivity_ttl})")

        template_annotations.update(annotations)

        if gpu_annotations:
            template_annotations.update(gpu_annotations)

        deployment_timestamp = datetime.now(timezone.utc).isoformat()
        template_annotations.update(
            {"kubetorch.com/deployment_timestamp": deployment_timestamp}
        )

        # Set containerConcurrency based on autoscaling config
        # When using concurrency-based autoscaling, set containerConcurrency to match
        # the target to ensure the container's limit aligns with autoscaler expectations
        template_vars = {
            "name": name,
            "namespace": self.namespace,
            "annotations": annotations,
            "template_annotations": template_annotations,
            "labels": labels,
            "template_labels": template_labels,
            "pod_template": pod_template,
        }

        if autoscaling_config.concurrency is not None:
            template_vars["container_concurrency"] = autoscaling_config.concurrency

        service = load_template(
            template_file=serving_constants.KNATIVE_SERVICE_TEMPLATE_FILE,
            template_dir=os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "templates"
            ),
            **template_vars,
        )

        if custom_template:
            nested_override(service, custom_template)

        try:
            kwargs = {"dry_run": "All"} if dryrun else {}
            created_service: dict = self.objects_api.create_namespaced_custom_object(
                group="serving.knative.dev",
                version="v1",
                namespace=self.namespace,
                plural="services",
                body=service,
                **kwargs,
            )

            logger.info(
                f"Created Knative service {name} in namespace {self.namespace}",
            )
            return created_service

        except client.exceptions.ApiException as e:
            if e.status == 409:
                logger.info(f"Service {name} already exists, updating")
                existing_service = self.get_knative_service(name)
                return existing_service
            else:
                logger.error(
                    f"Failed to create Knative service: {str(e)}",
                )
                raise e

    def get_knative_service(self, service_name: str) -> dict:
        """Retrieve a Knative service by name."""
        try:
            service = self.objects_api.get_namespaced_custom_object(
                group="serving.knative.dev",
                version="v1",
                namespace=self.namespace,
                plural="services",
                name=service_name,
            )
            return service

        except client.exceptions.ApiException as e:
            logger.error(f"Failed to load Knative service '{service_name}': {str(e)}")
            raise

    def get_deployment_timestamp_annotation(self, service_name: str) -> Optional[str]:
        """Get deployment timestamp annotation for Knative services."""
        try:
            service = self.get_knative_service(service_name)
            if service:
                return (
                    service.get("metadata", {})
                    .get("annotations", {})
                    .get("kubetorch.com/deployment_timestamp", None)
                )
        except client.exceptions.ApiException:
            pass
        return None

    def update_deployment_timestamp_annotation(
        self, service_name: str, new_timestamp: str
    ) -> str:
        """Update deployment timestamp annotation for Knative services."""
        try:
            patch_body = {
                "metadata": {
                    "annotations": {"kubetorch.com/deployment_timestamp": new_timestamp}
                }
            }
            self.objects_api.patch_namespaced_custom_object(
                group="serving.knative.dev",
                version="v1",
                namespace=self.namespace,
                plural="services",
                name=service_name,
                body=patch_body,
            )
            return new_timestamp
        except client.exceptions.ApiException as e:
            logger.error(
                f"Failed to update deployment timestamp for Knative service '{service_name}': {str(e)}"
            )
            raise

    def get_knative_service_endpoint(self, service_name: str) -> str:
        """Get the endpoint URL for a Knative service."""
        try:
            service = self.get_knative_service(service_name)

            # Get the URL from the service status
            status = service.get("status", {})
            url = status.get("url")
            if url:
                return url

            # Fallback to constructing URL
            return f"http://{service_name}.{self.namespace}.svc.cluster.local"

        except Exception as e:
            logger.warning(f"Could not get Knative service URL for {service_name}: {e}")
            return f"http://{service_name}.{self.namespace}.svc.cluster.local"

    def create_or_update_service(
        self,
        service_name: str,
        module_name: str,
        pod_template: dict,
        autoscaling_config: AutoscalingConfig = None,
        gpu_annotations: dict = None,
        inactivity_ttl: str = None,
        custom_labels: dict = None,
        custom_annotations: dict = None,
        custom_template: dict = None,
        scheduler_name: str = None,
        queue_name: str = None,
        dryrun: bool = False,
        **kwargs,  # Ignore deployment-specific args like replicas
    ):
        """
        Creates a Knative service with autoscaling capabilities.
        """
        logger.info(
            f"Deploying Kubetorch autoscaling (Knative) service with name: {service_name}"
        )
        try:
            created_service = self._create_or_update_knative_service(
                name=service_name,
                pod_template=pod_template,
                module_name=module_name,
                autoscaling_config=autoscaling_config,
                gpu_annotations=gpu_annotations,
                inactivity_ttl=inactivity_ttl,
                custom_labels=custom_labels,
                custom_annotations=custom_annotations,
                custom_template=custom_template,
                scheduler_name=scheduler_name,
                queue_name=queue_name,
                dryrun=dryrun,
            )
            return created_service
        except Exception as e:
            logger.error(f"Failed to launch new Knative service: {str(e)}")
            raise e

    def get_endpoint(self, service_name: str) -> str:
        """Get the endpoint URL for a Knative service."""
        return self.get_knative_service_endpoint(service_name)

    def get_pods_for_service(self, service_name: str, **kwargs) -> List[client.V1Pod]:
        """Get all pods associated with this Knative service."""
        return self.get_pods_for_service_static(
            service_name=service_name,
            namespace=self.namespace,
            core_api=self.core_api,
        )

    def _status_condition_ready(self, status: dict) -> bool:
        """Check if service status conditions indicate ready state."""
        conditions = status.get("conditions", [])
        for condition in conditions:
            if condition.get("type") == "Ready":
                return condition.get("status") == "True"
        return False

    def check_service_ready(
        self,
        service_name: str,
        launch_timeout: int,
        objects_api: client.CustomObjectsApi = None,
        core_api: client.CoreV1Api = None,
        queue_name: str = None,
        scheduler_name: str = None,
        **kwargs,
    ) -> bool:
        """Checks if the Knative service is ready to start serving requests.

        Core checks:
        - Service status and conditions
        - Revision status and conditions
        - Pod status and conditions
        - Autoscaling conditions (min-scale, etc.)

        Common failure scenarios handled:
        - Image pull failures or delays
        - Container initialization and setup (pip installs, etc.)
        - User-defined image setup steps
        - Node provisioning delays or failures
        - Service health check failures
        - Container terminations
        - Autoscaling not meeting minimum requirements

        Note:
            This method checks all pods associated with the service, not just the first one.
            Service check will fail fast only for truly unrecoverable conditions (like missing images or autoscaling
            not being triggered or enabled).

            Unless there is a clear reason to terminate, will wait for the full specified timeout
            to allow autoscaling and node provisioning to work (where relevant).

        Args:
            service_name: Name of the Knative service
            launch_timeout: Timeout in seconds to wait for readiness
            objects_api: Objects API instance (uses self.objects_api if None)
            core_api: Core API instance (uses self.core_api if None)
            queue_name: Queue name for scheduling checks
            scheduler_name: Scheduler name for scheduling checks
            **kwargs: Additional arguments

        Returns:
            True if service is ready

        Raises:
            ServiceTimeoutError: If service doesn't become ready within timeout
            QueueUnschedulableError: If pods can't be scheduled due to queue issues
            ResourceNotAvailableError: If required resources aren't available
        """
        if objects_api is None:
            objects_api = self.objects_api
        if core_api is None:
            core_api = self.core_api

        sleep_interval = 2
        start_time = time.time()

        # Instead of spamming logs with each iteration, only log once
        displayed_msgs = {
            "service_status": False,
            "waiting_for_pods": None,
            "revision_status": False,
            "service_readiness": False,
            "autoscaling": False,
        }

        logger.info(
            f"Checking service {service_name} pod readiness (timeout: {launch_timeout} seconds)"
        )
        iteration = 0
        while (time.time() - start_time) < launch_timeout:
            iteration += 1
            try:
                service = objects_api.get_namespaced_custom_object(
                    group="serving.knative.dev",
                    version="v1",
                    namespace=self.namespace,
                    plural="services",
                    name=service_name,
                )
                status = service.get("status")
                if not status:
                    if not displayed_msgs["service_status"]:
                        logger.info(f"Waiting for service {service_name} status")
                        displayed_msgs["service_status"] = True
                    time.sleep(sleep_interval)
                    continue

                for cond in status.get("conditions", []):
                    if cond.get("type") == "Ready" and cond.get("reason") == "NotOwned":
                        raise kt.KnativeServiceConflictError(
                            f"Knative service '{service_name}' cannot become ready: {cond.get('message')}"
                        )

                # Check autoscaling conditions
                if not displayed_msgs["autoscaling"]:
                    logger.info("Checking autoscaling conditions")
                    displayed_msgs["autoscaling"] = True

                # Get the min-scale from annotations
                min_scale = 0
                if (
                    service.get("spec", {})
                    .get("template", {})
                    .get("metadata", {})
                    .get("annotations", {})
                ):
                    min_scale_str = service["spec"]["template"]["metadata"][
                        "annotations"
                    ].get("autoscaling.knative.dev/min-scale", "0")
                    min_scale = int(min_scale_str)

                if min_scale == 0 and self._status_condition_ready(status):
                    # Service is ready and allowed to scale to zero
                    logger.info(f"Service {service_name} is already marked as ready")
                    return True

                if min_scale == 0:
                    # Always need at least one pod
                    min_scale = 1

                # Get current number of Running pods
                pods = self.get_pods_for_service(service_name)
                running_pods = [p for p in pods if pod_is_running(p)]
                running_pods_count = len(running_pods)

                if running_pods_count < min_scale:
                    for pod in pods:
                        # Check for image pull errors in container status
                        check_pod_status_for_errors(pod, queue_name, scheduler_name)

                        # Check pod events separately from the core API
                        check_pod_events_for_errors(pod, self.namespace, core_api)

                    if (
                        displayed_msgs["waiting_for_pods"] is None
                        or displayed_msgs["waiting_for_pods"] != running_pods_count
                    ):
                        logger.info(
                            f"Waiting for minimum scale ({min_scale} pods), currently have {running_pods_count}"
                        )
                        displayed_msgs["waiting_for_pods"] = running_pods_count
                else:
                    if not displayed_msgs["service_readiness"]:
                        logger.info(
                            f"Min {min_scale} pod{'s are' if min_scale > 1 else ' is'} ready, waiting for service to be marked as ready"
                        )
                        displayed_msgs["service_readiness"] = True

                    if self._status_condition_ready(status):
                        logger.info(f"Service {service_name} is now ready")
                        return True

                if not displayed_msgs["revision_status"]:
                    logger.info("Checking service revision status")
                    displayed_msgs["revision_status"] = True

                latest_revision = status.get("latestCreatedRevisionName")
                if latest_revision:
                    check_revision_for_errors(
                        latest_revision, self.namespace, objects_api
                    )

            except client.exceptions.ApiException:
                raise

            if iteration % 10 == 0:
                elapsed = int(time.time() - start_time)
                remaining = max(0, int(launch_timeout - elapsed))
                logger.info(
                    f"Service is not yet marked as ready "
                    f"(elapsed: {elapsed}s, remaining: {remaining}s)"
                )

            time.sleep(sleep_interval)

        raise ServiceTimeoutError(
            f"Service {service_name} did not become ready within {launch_timeout} seconds. "
            "To update the timeout, set the `launch_timeout` parameter in the Compute class, or set the "
            "environment variable `KT_LAUNCH_TIMEOUT`."
        )

    def teardown_service(self, service_name: str, console=None) -> bool:
        """Teardown Knative service and associated resources.

        Args:
            service_name: Name of the Knative service to teardown
            console: Optional Rich console for output

        Returns:
            True if teardown was successful, False otherwise
        """
        from kubetorch.resources.compute.utils import delete_service

        try:
            # Delete the Knative service
            delete_service(
                custom_api=self.objects_api,
                name=service_name,
                namespace=self.namespace,
                console=console,
            )

            return True

        except Exception as e:
            logger.error(f"Failed to teardown Knative service {service_name}: {e}")
            return False