kubetorch 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kubetorch/__init__.py +59 -0
- kubetorch/cli.py +1939 -0
- kubetorch/cli_utils.py +967 -0
- kubetorch/config.py +453 -0
- kubetorch/constants.py +18 -0
- kubetorch/docs/Makefile +18 -0
- kubetorch/docs/__init__.py +0 -0
- kubetorch/docs/_ext/json_globaltoc.py +42 -0
- kubetorch/docs/api/cli.rst +10 -0
- kubetorch/docs/api/python/app.rst +21 -0
- kubetorch/docs/api/python/cls.rst +19 -0
- kubetorch/docs/api/python/compute.rst +25 -0
- kubetorch/docs/api/python/config.rst +11 -0
- kubetorch/docs/api/python/fn.rst +19 -0
- kubetorch/docs/api/python/image.rst +14 -0
- kubetorch/docs/api/python/secret.rst +18 -0
- kubetorch/docs/api/python/volumes.rst +13 -0
- kubetorch/docs/api/python.rst +101 -0
- kubetorch/docs/conf.py +69 -0
- kubetorch/docs/index.rst +20 -0
- kubetorch/docs/requirements.txt +5 -0
- kubetorch/globals.py +269 -0
- kubetorch/logger.py +59 -0
- kubetorch/resources/__init__.py +0 -0
- kubetorch/resources/callables/__init__.py +0 -0
- kubetorch/resources/callables/cls/__init__.py +0 -0
- kubetorch/resources/callables/cls/cls.py +159 -0
- kubetorch/resources/callables/fn/__init__.py +0 -0
- kubetorch/resources/callables/fn/fn.py +140 -0
- kubetorch/resources/callables/module.py +1315 -0
- kubetorch/resources/callables/utils.py +203 -0
- kubetorch/resources/compute/__init__.py +0 -0
- kubetorch/resources/compute/app.py +253 -0
- kubetorch/resources/compute/compute.py +2414 -0
- kubetorch/resources/compute/decorators.py +137 -0
- kubetorch/resources/compute/utils.py +1026 -0
- kubetorch/resources/compute/websocket.py +135 -0
- kubetorch/resources/images/__init__.py +1 -0
- kubetorch/resources/images/image.py +412 -0
- kubetorch/resources/images/images.py +64 -0
- kubetorch/resources/secrets/__init__.py +2 -0
- kubetorch/resources/secrets/kubernetes_secrets_client.py +377 -0
- kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
- kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
- kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
- kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
- kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/providers.py +92 -0
- kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
- kubetorch/resources/secrets/secret.py +224 -0
- kubetorch/resources/secrets/secret_factory.py +64 -0
- kubetorch/resources/secrets/utils.py +222 -0
- kubetorch/resources/volumes/__init__.py +0 -0
- kubetorch/resources/volumes/volume.py +340 -0
- kubetorch/servers/__init__.py +0 -0
- kubetorch/servers/http/__init__.py +0 -0
- kubetorch/servers/http/distributed_utils.py +2968 -0
- kubetorch/servers/http/http_client.py +802 -0
- kubetorch/servers/http/http_server.py +1622 -0
- kubetorch/servers/http/server_metrics.py +255 -0
- kubetorch/servers/http/utils.py +722 -0
- kubetorch/serving/__init__.py +0 -0
- kubetorch/serving/autoscaling.py +153 -0
- kubetorch/serving/base_service_manager.py +344 -0
- kubetorch/serving/constants.py +77 -0
- kubetorch/serving/deployment_service_manager.py +431 -0
- kubetorch/serving/knative_service_manager.py +487 -0
- kubetorch/serving/raycluster_service_manager.py +526 -0
- kubetorch/serving/service_manager.py +18 -0
- kubetorch/serving/templates/deployment_template.yaml +17 -0
- kubetorch/serving/templates/knative_service_template.yaml +19 -0
- kubetorch/serving/templates/kt_setup_template.sh.j2 +91 -0
- kubetorch/serving/templates/pod_template.yaml +198 -0
- kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
- kubetorch/serving/templates/raycluster_template.yaml +35 -0
- kubetorch/serving/templates/service_template.yaml +21 -0
- kubetorch/serving/templates/workerset_template.yaml +36 -0
- kubetorch/serving/utils.py +344 -0
- kubetorch/utils.py +263 -0
- kubetorch-0.2.5.dist-info/METADATA +75 -0
- kubetorch-0.2.5.dist-info/RECORD +92 -0
- kubetorch-0.2.5.dist-info/WHEEL +4 -0
- kubetorch-0.2.5.dist-info/entry_points.txt +5 -0
kubetorch/resources/compute/utils.py
@@ -0,0 +1,1026 @@
import importlib
import inspect
import os
import socket
import subprocess
import sys
from pathlib import Path
from typing import List, Optional, Union

from kubernetes import client
from kubernetes.client.rest import ApiException
from kubernetes.stream import stream

import kubetorch.globals
from kubetorch.logger import get_logger
from kubetorch.resources.callables.utils import get_local_install_path, locate_working_dir
from kubetorch.resources.secrets.kubernetes_secrets_client import KubernetesSecretsClient
from kubetorch.servers.http.utils import is_running_in_kubernetes, StartupError
from kubetorch.serving import constants as serving_constants
from kubetorch.serving.constants import KT_SERVICE_LABEL, KT_USERNAME_LABEL

logger = get_logger(__name__)

class KnativeServiceError(Exception):
    """Base exception for Knative service errors."""

    pass


class ImagePullError(KnativeServiceError):
    """Raised when container image pull fails."""

    pass


class ResourceNotAvailableError(Exception):
    """Raised when required compute resources (GPU, memory, etc.) are not available in the cluster."""

    pass


class ServiceHealthError(KnativeServiceError):
    """Raised when service health checks fail."""

    pass


class ServiceTimeoutError(KnativeServiceError):
    """Raised when service fails to become ready within timeout period."""

    pass


class QueueUnschedulableError(KnativeServiceError):
    """Raised when the service pod is unschedulable in the requested queue."""

    pass


class KnativeServiceConflictError(Exception):
    """Raised when a conflicting non-Knative Kubernetes Service prevents Knative service creation."""

    pass


class PodContainerError(Exception):
    """Raised when pod container is in a terminated or waiting state."""

    pass


class VersionMismatchError(Exception):
    """Raised when the Kubetorch client version is incompatible with the version running on the target cluster."""

    pass


class SecretNotFound(Exception):
    """Raised when trying to update a kubetorch secret that does not exist."""

    def __init__(self, secret_name: str, namespace: str):
        super().__init__(f"kubetorch secret {secret_name} was not found in {namespace} namespace")


class RsyncError(Exception):
    def __init__(self, cmd: str, returncode: int, stdout: str, stderr: str):
        self.cmd = cmd
        self.returncode = returncode
        self.stdout = stdout
        self.stderr = stderr
        super().__init__(f"Rsync failed (code={returncode}): {stderr.strip()}")

TERMINATE_EARLY_ERRORS = {
    "ContainerMissing": ImagePullError,
    "ImagePullBackOff": ImagePullError,
    "ErrImagePull": ImagePullError,
    "CrashLoopBackOff": ServiceHealthError,
    "BackOff": ServiceHealthError,
    "StartupError": StartupError,
    "FailedMount": StartupError,
}

def _run_bash(
    commands: Union[str, List[str]],
    core_api: "CoreV1Api",
    pod_names: List[str],
    namespace: str,
    container: str = None,
):
    if isinstance(commands, str):
        commands = [commands]
    commands = [["/bin/sh", "-c", f'{command}; echo "::EXIT_CODE::$?"'] for command in commands]

    if isinstance(pod_names, str):
        pod_names = [pod_names]

    ret_codes = []
    for exec_command in commands:
        for pod_name in pod_names:
            if not container:
                pod = core_api.read_namespaced_pod(name=pod_name, namespace=namespace)
                if not pod.spec.containers:
                    raise Exception(f"No containers found in pod {pod_name}")
                container = pod.spec.containers[0].name
            try:
                resp = stream(
                    core_api.connect_get_namespaced_pod_exec,
                    pod_name,
                    namespace,
                    container=container,
                    command=exec_command,
                    stderr=True,
                    stdin=False,
                    stdout=True,
                )

                resp = resp.splitlines()
                exit_code = 0

                for line in resp:
                    if "::EXIT_CODE::" in line:
                        try:
                            exit_code = int(line.split("::EXIT_CODE::")[-1].strip())
                            resp.remove(line)
                            break
                        except ValueError:
                            pass

                stdout = "\n".join(resp)

                if exit_code == 0:
                    ret_codes.append([exit_code, stdout, ""])
                else:
                    ret_codes.append([exit_code, "", stdout])

            except Exception as e:
                raise Exception(f"Failed to execute command {exec_command} on pod {pod_name}: {str(e)}")
    return ret_codes
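Not part of the packaged file: a minimal usage sketch of `_run_bash`, assuming a reachable cluster and placeholder pod names. It shows why the helper appends the `::EXIT_CODE::$?` sentinel — the Kubernetes exec stream does not report an exit status directly, so the sentinel line is parsed back out of the combined output and each result comes back as an `[exit_code, stdout, stderr]` triplet.

# Hypothetical usage sketch (not from the wheel); pod and namespace names are placeholders.
from kubernetes import client, config

config.load_kube_config()
core_api = client.CoreV1Api()

results = _run_bash(
    commands=["nvidia-smi -L", "df -h /"],
    core_api=core_api,
    pod_names=["my-service-0", "my-service-1"],
    namespace="default",
)
for exit_code, stdout, stderr in results:
    print(exit_code, stdout or stderr)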

def _get_rsync_exclude_options() -> str:
    """Get rsync exclude options using .gitignore and/or .ktignore if available."""
    from pathlib import Path

    # Allow users to hard override all of our settings
    if os.environ.get("KT_RSYNC_FILTERS"):
        logger.debug(
            f"KT_RSYNC_FILTERS environment variable set, using rsync filters: {os.environ['KT_RSYNC_FILTERS']}"
        )
        return os.environ["KT_RSYNC_FILTERS"]

    repo_root, _ = locate_working_dir(os.getcwd())
    gitignore_path = os.path.join(repo_root, ".gitignore")
    kt_ignore_path = os.path.join(repo_root, ".ktignore")

    exclude_args = ""
    if Path(kt_ignore_path).exists():
        exclude_args += f" --exclude-from='{kt_ignore_path}'"
    if Path(gitignore_path).exists():
        exclude_args += f" --exclude-from='{gitignore_path}'"
    # Add some reasonable default exclusions
    exclude_args += " --exclude='*.pyc' --exclude='__pycache__' --exclude='.venv' --exclude='.git'"

    return exclude_args.strip()
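A sketch (not in the package) of how the returned flags might be spliced into an rsync invocation; the destination address and module path are assumptions, not taken from kubetorch.

# Hypothetical sketch: the helper only builds the --exclude/--exclude-from flags.
import subprocess

excludes = _get_rsync_exclude_options()
cmd = f"rsync -az {excludes} ./ rsync://localhost:8873/data/my-service"  # placeholder destination
subprocess.run(cmd, shell=True, check=True)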

def is_pod_terminated(pod: client.V1Pod) -> bool:
    # Check if pod is marked for deletion
    if pod.metadata.deletion_timestamp is not None:
        return True

    # Check pod phase
    if pod.status.phase in ["Succeeded", "Failed"]:
        return True

    # Check container statuses
    if pod.status.container_statuses:
        for container in pod.status.container_statuses:
            if container.state.terminated:
                return True

    return False

# ----------------- ConfigMap utils ----------------- #
def load_configmaps(
    core_api: client.CoreV1Api,
    service_name: str,
    namespace: str,
    console: "Console" = None,
) -> List[str]:
    """List configmaps that start with a given service name."""
    try:
        configmaps = core_api.list_namespaced_config_map(
            namespace=namespace,
            label_selector=f"kubetorch.com/service={service_name}",
        )
        return [cm.metadata.name for cm in configmaps.items]
    except ApiException as e:
        if console:
            console.print(f"[yellow]Warning:[/yellow] Failed to list configmaps: {e}")
        return []

# ----------------- Resource Deletion Utils ----------------- #
def delete_configmaps(
    core_api: client.CoreV1Api,
    configmaps: List[str],
    namespace: str,
    console: "Console" = None,
    force: bool = False,
):
    """Delete the given list of configmaps."""

    grace_period_seconds, propagation_policy = None, None
    if force:
        grace_period_seconds = 0
        propagation_policy = "Foreground"

    for cm in configmaps:
        try:
            core_api.delete_namespaced_config_map(
                name=cm,
                namespace=namespace,
                grace_period_seconds=grace_period_seconds,
                propagation_policy=propagation_policy,
            )
            if console:
                console.print(f"✓ Deleted configmap [blue]{cm}[/blue]")
        except ApiException as e:
            if e.status == 404:
                if console:
                    console.print(f"[yellow]Warning:[/yellow] ConfigMap {cm} not found")
            else:
                if console:
                    console.print(f"[red]Error:[/red] Failed to delete configmap {cm}: {e}")

def delete_service(
    custom_api: client.CustomObjectsApi,
    name: str,
    namespace,
    console: "Console" = None,
    force: bool = False,
):
    """Delete a Knative service."""

    grace_period_seconds, propagation_policy = None, None
    if force:
        grace_period_seconds = 0
        propagation_policy = "Foreground"

    try:
        custom_api.delete_namespaced_custom_object(
            group="serving.knative.dev",
            version="v1",
            namespace=namespace,
            plural="services",
            name=name,
            grace_period_seconds=grace_period_seconds,
            propagation_policy=propagation_policy,
        )
        if console:
            console.print(f"✓ Deleted service [blue]{name}[/blue]")
    except ApiException as e:
        if e.status == 404:
            if console:
                console.print(f"[yellow]Note:[/yellow] Service {name} not found or already deleted")
        else:
            if console:
                console.print(f"[red]Error:[/red] Failed to delete service {name}: {e}")

def delete_deployment(
    apps_v1_api: client.AppsV1Api,
    core_api: client.CoreV1Api,
    name: str,
    namespace: str,
    console: "Console" = None,
    force: bool = False,
):
    """Delete a Deployment and its associated service."""
    grace_period_seconds, propagation_policy = None, None
    if force:
        grace_period_seconds = 0
        propagation_policy = "Foreground"
    try:
        # Delete the Deployment
        apps_v1_api.delete_namespaced_deployment(
            name=name,
            namespace=namespace,
            grace_period_seconds=grace_period_seconds,
            propagation_policy=propagation_policy,
        )
        if console:
            console.print(f"✓ Deleted deployment [blue]{name}[/blue]")
    except ApiException as e:
        if e.status == 404:
            if console:
                console.print(f"[yellow]Note:[/yellow] Deployment {name} not found or already deleted")
        else:
            if console:
                console.print(f"[red]Error:[/red] Failed to delete deployment {name}: {e}")

    # Delete the associated service (regular service, not headless)
    try:
        core_api.delete_namespaced_service(
            name=name,
            namespace=namespace,
            grace_period_seconds=grace_period_seconds,
            propagation_policy=propagation_policy,
        )
        if console:
            console.print(f"✓ Deleted service [blue]{name}[/blue]")
    except ApiException as e:
        if e.status == 404:
            if console:
                console.print(f"[yellow]Note:[/yellow] Service {name} not found or already deleted")
        else:
            if console:
                console.print(f"[red]Error:[/red] Failed to delete service {name}: {e}")

    # Also try to delete the headless service for distributed deployments
    try:
        core_api.delete_namespaced_service(
            name=f"{name}-headless",
            namespace=namespace,
            grace_period_seconds=grace_period_seconds,
            propagation_policy=propagation_policy,
        )
        if console:
            console.print(f"✓ Deleted headless service [blue]{name}-headless[/blue]")
    except ApiException as e:
        if e.status == 404:
            # This is normal for non-distributed deployments
            pass
        else:
            if console:
                console.print(f"[red]Error:[/red] Failed to delete headless service {name}-headless: {e}")

def delete_raycluster(
    custom_api: client.CustomObjectsApi,
    core_api: client.CoreV1Api,
    name: str,
    namespace: str,
    console: "Console" = None,
    force: bool = False,
):
    """Delete a RayCluster and its associated service."""

    grace_period_seconds, propagation_policy = None, None
    if force:
        grace_period_seconds = 0
        propagation_policy = "Foreground"

    try:
        # Delete the RayCluster
        custom_api.delete_namespaced_custom_object(
            group="ray.io",
            version="v1",
            namespace=namespace,
            plural="rayclusters",
            name=name,
            grace_period_seconds=grace_period_seconds,
            propagation_policy=propagation_policy,
        )
        if console:
            console.print(f"✓ Deleted RayCluster [blue]{name}[/blue]")
    except ApiException as e:
        if e.status == 404:
            if console:
                console.print(f"[yellow]Note:[/yellow] RayCluster {name} not found or already deleted")
        else:
            if console:
                console.print(f"[red]Error:[/red] Failed to delete RayCluster {name}: {e}")

    # Delete the associated service (created alongside RayCluster)
    try:
        core_api.delete_namespaced_service(
            name=name,
            namespace=namespace,
            grace_period_seconds=grace_period_seconds,
            propagation_policy=propagation_policy,
        )
        if console:
            console.print(f"✓ Deleted service [blue]{name}[/blue]")
    except ApiException as e:
        if e.status == 404:
            if console:
                console.print(f"[yellow]Note:[/yellow] Service {name} not found or already deleted")
        else:
            if console:
                console.print(f"[red]Error:[/red] Failed to delete service {name}: {e}")

    # Delete the headless service for Ray pod discovery
    try:
        core_api.delete_namespaced_service(
            name=f"{name}-headless",
            namespace=namespace,
            grace_period_seconds=grace_period_seconds,
            propagation_policy=propagation_policy,
        )
        if console:
            console.print(f"✓ Deleted headless service [blue]{name}-headless[/blue]")
    except ApiException as e:
        if e.status == 404:
            # This is normal for older Ray clusters without headless services
            pass
        else:
            if console:
                console.print(f"[red]Error:[/red] Failed to delete headless service {name}-headless: {e}")

def delete_resources_for_service(
    core_api: client.CoreV1Api,
    custom_api: client.CustomObjectsApi,
    configmaps: List[str],
    name: str,
    service_type: str = "knative",
    namespace: str = None,
    console: "Console" = None,
    force: bool = False,
):
    """Delete service resources based on service type."""
    # Delete the main service (Knative, Deployment, or RayCluster)
    if service_type == "deployment":
        apps_v1_api = client.AppsV1Api()
        delete_deployment(
            apps_v1_api=apps_v1_api,
            core_api=core_api,
            name=name,
            namespace=namespace,
            console=console,
            force=force,
        )
    elif service_type == "raycluster":
        delete_raycluster(
            custom_api=custom_api,
            core_api=core_api,
            name=name,
            namespace=namespace,
            console=console,
            force=force,
        )
    else:  # knative or unknown - try deleting as Knative service
        delete_service(
            custom_api=custom_api,
            name=name,
            namespace=namespace,
            console=console,
            force=force,
        )

    # Delete configmaps
    if configmaps:
        delete_configmaps(
            core_api=core_api,
            configmaps=configmaps,
            namespace=namespace,
            console=console,
            force=force,
        )

    delete_cached_service_data(core_api=core_api, service_name=name, namespace=namespace, console=console)

def delete_cached_service_data(
    core_api: client.CoreV1Api,
    service_name: str,
    namespace: str,
    console: "Console" = None,
):
    """Delete service data from the rsync pod."""
    try:
        # Find the rsync pod name in the provided namespace
        pods = core_api.list_namespaced_pod(namespace=namespace, label_selector="app=kubetorch-rsync")

        if not pods.items:
            if console:
                console.print(f"[yellow] No rsync pod found in namespace {namespace}[/yellow]")
            return

        pod_name = pods.items[0].metadata.name
        service_path = f"/data/{namespace}/{service_name}"

        shell_cmd = (
            f"if [ -d '{service_path}' ]; then rm -rf '{service_path}' && echo 'Deleted {service_path}'; "
            f"else echo 'Path {service_path} not found'; fi"
        )

        # Execute command based on environment
        if is_running_in_kubernetes():
            response = stream(
                core_api.connect_get_namespaced_pod_exec,
                name=pod_name,
                namespace=namespace,
                command=["sh", "-c", shell_cmd],
                stderr=True,
                stdin=False,
                stdout=True,
                tty=False,
            )
            output = response.strip()

        else:
            cmd = [
                "kubectl",
                "exec",
                "-n",
                namespace,
                pod_name,
                "--",
                "sh",
                "-c",
                shell_cmd,
            ]
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)

            if result.returncode != 0:
                if console:
                    console.print(f"[red]Error cleaning up cached data: {result.stderr}[/red]")
                return
            output = result.stdout.strip()

        if console:
            if "Deleted" in output:
                console.print(f"✓ Deleted cached data for [blue]{service_name}[/blue]")

    except subprocess.TimeoutExpired:
        if console:
            console.print("[red]Timeout while cleaning up cached service data[/red]")
        else:
            logger.debug("Timeout while cleaning up cached data")

    except Exception as e:
        if console:
            console.print(f"[red]Failed to clean up cached service data: {e}[/red]")
        else:
            logger.debug(f"Failed to clean up cached data: {e}")

def _collect_modules(target_str):
    from kubetorch.resources.callables.module import Module

    to_deploy = []

    if ":" in target_str:
        target_module_or_path, target_fn_or_class = target_str.split(":")
    else:
        target_module_or_path, target_fn_or_class = target_str, None

    if target_module_or_path.endswith(".py"):
        abs_path = Path(target_module_or_path).resolve()
        python_module_name = inspect.getmodulename(str(abs_path))

        sys.path.insert(0, str(abs_path.parent))
    else:
        python_module_name = target_module_or_path
        sys.path.append(".")

    module = importlib.import_module(python_module_name)

    if target_fn_or_class:
        if not hasattr(module, target_fn_or_class):
            raise ValueError(f"Function or class {target_fn_or_class} not found in {target_module_or_path}.")
        to_deploy = [getattr(module, target_fn_or_class)]
        if not isinstance(to_deploy[0], Module):
            raise ValueError(
                f"Function or class {target_fn_or_class} in {target_module_or_path} is not decorated with @kt.compute."
            )
    else:
        # Get all functions and classes to deploy
        for name in dir(module):
            obj = getattr(module, name)
            if isinstance(obj, Module):
                to_deploy.append(obj)
        if not to_deploy:
            raise ValueError(f"No functions or classes decorated with @kt.compute found in {target_module_or_path}.")

    return to_deploy, target_fn_or_class

def fetch_resources_for_teardown(
    namespace: str,
    target: str,
    core_api: client.CoreV1Api,
    custom_api: client.CustomObjectsApi,
    prefix: Optional[str] = None,
    username: Optional[str] = None,
    exact_match: bool = False,
) -> dict:
    """Fetches the resources for a given service.

    Returns a dictionary with the following keys:
    - services: {
        [service_name]: {
            "configmaps": List[str],
            "pods": List[str],
            "type": str,  # "knative" or "deployment"
        }
    }
    """
    from kubetorch.resources.callables.module import Module

    resources = {"services": {}}
    services = []

    if prefix in ["kt", "kubetorch", "knative"]:
        raise ValueError(f"Invalid prefix: {prefix} is reserved. Please delete these individually.")

    # Initialize apps API for deployments
    apps_v1_api = client.AppsV1Api()

    if username or prefix:
        # Search Knative services
        try:
            # Build label selector for Knative services - use template label to identify kubetorch services
            knative_label_selector = f"{serving_constants.KT_TEMPLATE_LABEL}=ksvc"
            if username:
                knative_label_selector += f",{KT_USERNAME_LABEL}={username}"

            response = custom_api.list_namespaced_custom_object(
                group="serving.knative.dev",
                version="v1",
                namespace=namespace,
                plural="services",
                label_selector=knative_label_selector,
            )
            items = response.get("items", [])
            knative_services = [
                item["metadata"]["name"] for item in items if (username or item["metadata"]["name"].startswith(prefix))
            ]
            services.extend(knative_services)
        except client.exceptions.ApiException as e:
            if e.status != 404:  # Ignore if Knative is not installed
                logger.warning(f"Failed to list Knative services: {e}")

        # Search Deployments
        try:
            # Build label selector for deployments - use KT_TEMPLATE_LABEL to identify kubetorch deployments
            deployment_label_selector = f"{serving_constants.KT_TEMPLATE_LABEL}=deployment"
            if username:
                deployment_label_selector += f",{KT_USERNAME_LABEL}={username}"

            deployments_response = apps_v1_api.list_namespaced_deployment(
                namespace=namespace,
                label_selector=deployment_label_selector,
            )
            deployment_services = [
                deployment.metadata.name
                for deployment in deployments_response.items
                if (username or deployment.metadata.name.startswith(prefix))
            ]
            services.extend(deployment_services)
        except client.exceptions.ApiException as e:
            logger.warning(f"Failed to list Deployments: {e}")

        # Search RayClusters
        try:
            # Build label selector for rayclusters - use template label to identify kubetorch rayclusters
            raycluster_label_selector = f"{serving_constants.KT_TEMPLATE_LABEL}=raycluster"
            if username:
                raycluster_label_selector += f",{KT_USERNAME_LABEL}={username}"

            response = custom_api.list_namespaced_custom_object(
                group="ray.io",
                version="v1",
                namespace=namespace,
                plural="rayclusters",
                label_selector=raycluster_label_selector,
            )
            items = response.get("items", [])
            raycluster_services = [
                item["metadata"]["name"] for item in items if (username or item["metadata"]["name"].startswith(prefix))
            ]
            services.extend(raycluster_services)
        except client.exceptions.ApiException as e:
            if e.status != 404:  # Ignore if Ray operator is not installed
                logger.warning(f"Failed to list RayClusters: {e}")

    else:
        if not target:
            raise ValueError("Please provide a service name or use the --all or --prefix flags")

        # Case when service_name is a module or file path (i.e. the `kt deploy` usage path)
        if ":" in target or ".py" in target or "." in target:
            to_down, _ = _collect_modules(target)
            services = [mod.service_name for mod in to_down if isinstance(mod, Module)]
        else:
            services = [target]
            # if the target is not prefixed with the username, add the username prefix
            username = kubetorch.globals.config.username
            if username and not exact_match and not target.startswith(username + "-"):
                services.append(username + "-" + target)

    for service_name in services:
        service_type = None
        service_found = False

        # Check if it's a Knative service
        try:
            service = custom_api.get_namespaced_custom_object(
                group="serving.knative.dev",
                version="v1",
                namespace=namespace,
                plural="services",
                name=service_name,
            )
            if service:
                service_type = "knative"
                service_found = True
        except client.exceptions.ApiException:
            pass

        # Check if it's a Deployment (if not found as Knative service)
        if not service_found:
            try:
                deployment = apps_v1_api.read_namespaced_deployment(name=service_name, namespace=namespace)
                # Only consider it if it has kubetorch template label
                if (
                    deployment.metadata.labels
                    and deployment.metadata.labels.get(serving_constants.KT_TEMPLATE_LABEL) == "deployment"
                ):
                    service_type = "deployment"
                    service_found = True
            except client.exceptions.ApiException:
                pass

        # Check if it's a RayCluster (if not found as Knative or Deployment)
        if not service_found:
            try:
                raycluster = custom_api.get_namespaced_custom_object(
                    group="ray.io",
                    version="v1",
                    namespace=namespace,
                    plural="rayclusters",
                    name=service_name,
                )
                if raycluster:
                    service_type = "raycluster"
                    service_found = True
            except client.exceptions.ApiException:
                pass

        # Get associated resources if service exists
        configmaps = load_configmaps(core_api, service_name, namespace)
        pods = core_api.list_namespaced_pod(namespace=namespace, label_selector=f"{KT_SERVICE_LABEL}={service_name}")
        pods = [pod.metadata.name for pod in pods.items]

        # Only add the service to the resources if it has configmaps, pods, or we found the service
        if service_found or configmaps or pods:
            resources["services"][service_name] = {
                "configmaps": configmaps,
                "pods": pods,
                "type": service_type or "unknown",
            }

    return resources
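A teardown sketch, not shipped in the wheel, wiring `fetch_resources_for_teardown` into `delete_resources_for_service`; the namespace and service name are placeholders, and `Console` is assumed to be rich's.

# Hypothetical teardown flow for a single named service.
from kubernetes import client, config
from rich.console import Console

config.load_kube_config()
core_api = client.CoreV1Api()
custom_api = client.CustomObjectsApi()
console = Console()

resources = fetch_resources_for_teardown(
    namespace="default",
    target="my-service",
    core_api=core_api,
    custom_api=custom_api,
)
for svc_name, info in resources["services"].items():
    delete_resources_for_service(
        core_api=core_api,
        custom_api=custom_api,
        configmaps=info["configmaps"],
        name=svc_name,
        service_type=info["type"],
        namespace="default",
        console=console,
    )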

# ----------------- Image Builder Utils ----------------- #
def _get_sync_package_paths(
    package: str,
):
    if "/" in package or "~" in package:
        package_path = (
            Path(package).expanduser()
            if Path(package).expanduser().is_absolute()
            else Path(locate_working_dir()[0]) / package
        )
        dest_dir = str(package_path.name)
    else:
        package_path = get_local_install_path(package)
        dest_dir = package

    if not (package_path and Path(package_path).exists()):
        raise ValueError(f"Could not locate local package {package}")

    full_path = Path(package_path).expanduser().resolve()
    return str(full_path), dest_dir

# ----------------- Error Handling Utils ----------------- #
def check_pod_status_for_errors(pod: client.V1Pod, queue_name: str = None, scheduler_name: str = None):
    """Check pod status for errors"""
    # Check for scheduling issues
    for condition in pod.status.conditions or []:
        if condition.type == "PodScheduled" and condition.status == "False" and condition.reason == "Unschedulable":
            msg = condition.message.lower()

            # Check if the pod is scheduled in the correct queue and scheduler
            if queue_name and scheduler_name:
                scheduler = pod.metadata.annotations.get("schedulerName", "")
                queue_label = pod.metadata.labels.get("kai.scheduler/queue")
                if queue_label == queue_name and scheduler == scheduler_name:
                    raise QueueUnschedulableError(
                        f"Pod {pod.metadata.name} could not be scheduled: {condition.message}"
                    )

            # Check for specific node selector/affinity/GPU type mismatches
            # without matching temporary resource exhaustion messages
            if any(
                x in msg
                for x in [
                    "node selector not matched",
                    "node affinity mismatch",
                    "unsupported gpu type",
                    "unknown instance type",
                    "didn't match pod's node affinity/selector",
                ]
            ):
                raise ResourceNotAvailableError(
                    f"Required compute resources are not configured in the cluster: {condition.message}"
                )

    # Check for container status errors
    if pod.status.container_statuses:
        for container_status in pod.status.container_statuses:
            if container_status.state and container_status.state.waiting:
                reason = container_status.state.waiting.reason
                message = container_status.state.waiting.message or ""
                if reason in TERMINATE_EARLY_ERRORS:
                    raise TERMINATE_EARLY_ERRORS[reason](f"Pod {pod.metadata.name}: {message}")

def check_pod_events_for_errors(pod: client.V1Pod, namespace: str, core_api: client.CoreV1Api):
    """Check pod events for scheduling errors"""
    try:
        events = core_api.list_namespaced_event(
            namespace=namespace,
            field_selector=f"involvedObject.name={pod.metadata.name}",
        ).items
        for event in events:
            # Check for Karpenter scheduling errors
            if (
                event.reason == "FailedScheduling"
                and event.source.component == "karpenter"
                and "no instance type has enough resources" in event.message
            ):
                raise ResourceNotAvailableError(f"Pod {pod.metadata.name} failed to schedule: {event.message}")
    except client.exceptions.ApiException as e:
        logger.warning(f"Error fetching events for pod {pod.metadata.name}: {e}")
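A readiness-poll sketch (hypothetical, not part of the file) showing how the checkers above could be combined to fail fast on image-pull or scheduling problems instead of waiting for a timeout; the pod name and namespace are placeholders.

# Hypothetical polling loop: raise early on terminal pod errors.
import time
from kubernetes import client, config

config.load_kube_config()
core_api = client.CoreV1Api()

for _ in range(30):
    pod = core_api.read_namespaced_pod(name="my-service-abc123", namespace="default")
    check_pod_status_for_errors(pod)                       # ImagePullError, ResourceNotAvailableError, ...
    check_pod_events_for_errors(pod, "default", core_api)  # Karpenter capacity errors
    if pod.status.phase == "Running" and not is_pod_terminated(pod):
        break
    time.sleep(2)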

def check_replicaset_events_for_errors(
    namespace: str,
    service_name: str,
    apps_v1_api: client.AppsV1Api,
    core_api: client.CoreV1Api,
):
    """Check ReplicaSet events for creation errors like missing PriorityClass.

    Args:
        namespace: Namespace to search for ReplicaSets
        service_name: Name of the service
        apps_v1_api: Apps API instance
        core_api: Core API instance

    Raises:
        ResourceNotAvailableError: If ReplicaSet creation fails due to missing resources
    """
    try:
        # Get ReplicaSets associated with this Deployment
        replicasets = apps_v1_api.list_namespaced_replica_set(
            namespace=namespace,
            label_selector=f"kubetorch.com/service={service_name}",
        ).items

        for replicaset in replicasets:
            # Check ReplicaSet events for FailedCreate errors
            events = core_api.list_namespaced_event(
                namespace=namespace,
                field_selector=f"involvedObject.name={replicaset.metadata.name}",
            ).items

            for event in events:
                if event.reason == "FailedCreate" and event.type == "Warning" and "forbidden" in event.message.lower():
                    # Check for specific PriorityClass errors
                    if "priorityclass" in event.message.lower():
                        raise ResourceNotAvailableError(
                            f"ReplicaSet {replicaset.metadata.name} failed to create pods: "
                            f"{event.message}. Please ensure the required PriorityClass exists in the cluster."
                        )
                    # Check for other forbidden errors
                    elif any(
                        error_type in event.message.lower()
                        for error_type in [
                            "forbidden",
                            "no priorityclass",
                            "priority class",
                        ]
                    ):
                        raise ResourceNotAvailableError(
                            f"ReplicaSet {replicaset.metadata.name} failed to create pods: "
                            f"{event.message}. Please check cluster configuration and permissions."
                        )

    except client.exceptions.ApiException as e:
        logger.warning(f"Error checking ReplicaSet events for {service_name}: {e}")
    except ResourceNotAvailableError:
        # Re-raise ResourceNotAvailableError to stop the readiness check
        raise

def check_revision_for_errors(revision_name: str, namespace: str, objects_api: client.CustomObjectsApi):
    """Check revision for errors"""
    try:
        revision = objects_api.get_namespaced_custom_object(
            group="serving.knative.dev",
            version="v1",
            namespace=namespace,
            plural="revisions",
            name=revision_name,
        )
        for cond in revision.get("status", {}).get("conditions", []):
            if cond["status"] == "False":
                reason = cond.get("reason")
                message = cond.get("message", f"Revision failed with reason: {reason}")
                if reason in TERMINATE_EARLY_ERRORS:
                    raise TERMINATE_EARLY_ERRORS[reason](f"Revision {revision_name}: {message}")
    except client.exceptions.ApiException as e:
        logger.warning(f"Error checking revision: {e}")

def is_port_available(port: int) -> bool:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(("localhost", port)) != 0


def find_available_port(start_port: int, max_tries: int = 10) -> int:
    for i in range(max_tries):
        port = start_port + i
        if is_port_available(port):
            return port
    raise RuntimeError(f"Could not find available port starting from {start_port}")
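A one-liner sketch (not from the source) of how the port helpers might back a local port-forward; the starting port is an assumption.

# Hypothetical: pick a free local port starting at 8080 for a port-forward.
local_port = find_available_port(8080)
print(f"forwarding on localhost:{local_port}")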

# --------------- Secrets utils ---------------------------


def get_parsed_secret(secret: client.V1Secret):
    labels = secret.metadata.labels
    secret = {
        "name": secret.metadata.name,
        "username": labels.get("kubetorch.com/username", None) if labels else None,
        "namespace": secret.metadata.namespace,
        "user_defined_name": labels.get("kubetorch.com/secret-name", None) if labels else None,
        "labels": labels,
        "annotations": secret.metadata.annotations,
        "type": secret.type,
        "data": secret.data,
    }
    return secret

def list_secrets(
    core_api: client.CoreV1Api,
    namespace: str = "default",
    prefix: str = None,
    all_namespaces: bool = False,
    filter_by_creator: bool = True,
    console: "Console" = None,
):
    try:
        if all_namespaces:
            secrets: client.V1SecretList = core_api.list_secret_for_all_namespaces()
        else:
            secrets: client.V1SecretList = core_api.list_namespaced_secret(namespace=namespace)
        if not secrets:
            return None
        filtered_secrets = []
        for secret in secrets.items:
            parsed_secret = get_parsed_secret(secret)
            user_defined_secret_name = parsed_secret.get("user_defined_name")
            if user_defined_secret_name:  # only secrets created by the kt api carry the user-defined name label
                if prefix and filter_by_creator:  # filter secrets by prefix + creator
                    if (
                        parsed_secret.get("user_defined_name").startswith(prefix)
                        and parsed_secret.get("username") == kubetorch.globals.config.username
                    ):
                        filtered_secrets.append(parsed_secret)
                elif prefix:  # filter secrets by prefix
                    if parsed_secret.get("user_defined_name").startswith(prefix):
                        filtered_secrets.append(parsed_secret)
                elif filter_by_creator:  # filter secrets by creator (the username set in kt.config)
                    if parsed_secret.get("username") == kubetorch.globals.config.username:
                        filtered_secrets.append(parsed_secret)
                else:  # No additional filters required
                    filtered_secrets.append(parsed_secret)
        return filtered_secrets

    except client.rest.ApiException as e:
        if console:
            console.print(f"[red]Failed to load secrets: {e}[/red]")
        return None
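A sketch, not included in the wheel, of listing kubetorch-managed secrets for the current user; `Console` is assumed to be rich's and the prefix is a placeholder.

# Hypothetical: list secrets created via the kt API whose user-defined name starts with "wandb".
from kubernetes import client, config
from rich.console import Console

config.load_kube_config()
secrets = list_secrets(client.CoreV1Api(), namespace="default", prefix="wandb", console=Console())
for s in secrets or []:
    print(s["user_defined_name"], s["namespace"], s["username"])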

def delete_secrets(
    secrets: List[str],
    secrets_client: KubernetesSecretsClient,
    console: "Console" = None,
):
    """Delete the given list of secrets."""
    for secret in secrets:
        secrets_client.delete_secret(secret, console=console)