kubetorch-0.2.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. kubetorch/__init__.py +59 -0
  2. kubetorch/cli.py +1939 -0
  3. kubetorch/cli_utils.py +967 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +269 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +159 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +140 -0
  30. kubetorch/resources/callables/module.py +1315 -0
  31. kubetorch/resources/callables/utils.py +203 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +253 -0
  34. kubetorch/resources/compute/compute.py +2414 -0
  35. kubetorch/resources/compute/decorators.py +137 -0
  36. kubetorch/resources/compute/utils.py +1026 -0
  37. kubetorch/resources/compute/websocket.py +135 -0
  38. kubetorch/resources/images/__init__.py +1 -0
  39. kubetorch/resources/images/image.py +412 -0
  40. kubetorch/resources/images/images.py +64 -0
  41. kubetorch/resources/secrets/__init__.py +2 -0
  42. kubetorch/resources/secrets/kubernetes_secrets_client.py +377 -0
  43. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  44. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  45. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  46. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  47. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  48. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  49. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  50. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  51. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  52. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  53. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  54. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  55. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  56. kubetorch/resources/secrets/provider_secrets/providers.py +92 -0
  57. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  58. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  59. kubetorch/resources/secrets/secret.py +224 -0
  60. kubetorch/resources/secrets/secret_factory.py +64 -0
  61. kubetorch/resources/secrets/utils.py +222 -0
  62. kubetorch/resources/volumes/__init__.py +0 -0
  63. kubetorch/resources/volumes/volume.py +340 -0
  64. kubetorch/servers/__init__.py +0 -0
  65. kubetorch/servers/http/__init__.py +0 -0
  66. kubetorch/servers/http/distributed_utils.py +2968 -0
  67. kubetorch/servers/http/http_client.py +802 -0
  68. kubetorch/servers/http/http_server.py +1622 -0
  69. kubetorch/servers/http/server_metrics.py +255 -0
  70. kubetorch/servers/http/utils.py +722 -0
  71. kubetorch/serving/__init__.py +0 -0
  72. kubetorch/serving/autoscaling.py +153 -0
  73. kubetorch/serving/base_service_manager.py +344 -0
  74. kubetorch/serving/constants.py +77 -0
  75. kubetorch/serving/deployment_service_manager.py +431 -0
  76. kubetorch/serving/knative_service_manager.py +487 -0
  77. kubetorch/serving/raycluster_service_manager.py +526 -0
  78. kubetorch/serving/service_manager.py +18 -0
  79. kubetorch/serving/templates/deployment_template.yaml +17 -0
  80. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  81. kubetorch/serving/templates/kt_setup_template.sh.j2 +91 -0
  82. kubetorch/serving/templates/pod_template.yaml +198 -0
  83. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  84. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  85. kubetorch/serving/templates/service_template.yaml +21 -0
  86. kubetorch/serving/templates/workerset_template.yaml +36 -0
  87. kubetorch/serving/utils.py +344 -0
  88. kubetorch/utils.py +263 -0
  89. kubetorch-0.2.5.dist-info/METADATA +75 -0
  90. kubetorch-0.2.5.dist-info/RECORD +92 -0
  91. kubetorch-0.2.5.dist-info/WHEEL +4 -0
  92. kubetorch-0.2.5.dist-info/entry_points.txt +5 -0
kubetorch/resources/compute/utils.py
@@ -0,0 +1,1026 @@
+ import importlib
+ import inspect
+ import os
+ import socket
+ import subprocess
+ import sys
+ from pathlib import Path
+ from typing import List, Optional, Union
+
+ from kubernetes import client
+ from kubernetes.client.rest import ApiException
+ from kubernetes.stream import stream
+
+ import kubetorch.globals
+ from kubetorch.logger import get_logger
+ from kubetorch.resources.callables.utils import get_local_install_path, locate_working_dir
+ from kubetorch.resources.secrets.kubernetes_secrets_client import KubernetesSecretsClient
+ from kubetorch.servers.http.utils import is_running_in_kubernetes, StartupError
+ from kubetorch.serving import constants as serving_constants
+ from kubetorch.serving.constants import KT_SERVICE_LABEL, KT_USERNAME_LABEL
+
+ logger = get_logger(__name__)
+
+
+ class KnativeServiceError(Exception):
+     """Base exception for Knative service errors."""
+
+     pass
+
+
+ class ImagePullError(KnativeServiceError):
+     """Raised when container image pull fails."""
+
+     pass
+
+
+ class ResourceNotAvailableError(Exception):
+     """Raised when required compute resources (GPU, memory, etc.) are not available in the cluster."""
+
+     pass
+
+
+ class ServiceHealthError(KnativeServiceError):
+     """Raised when service health checks fail."""
+
+     pass
+
+
+ class ServiceTimeoutError(KnativeServiceError):
+     """Raised when service fails to become ready within timeout period."""
+
+     pass
+
+
+ class QueueUnschedulableError(KnativeServiceError):
+     """Raised when the service pod is unschedulable in the requested queue."""
+
+     pass
+
+
+ class KnativeServiceConflictError(Exception):
+     """Raised when a conflicting non-Knative Kubernetes Service prevents Knative service creation."""
+
+     pass
+
+
+ class PodContainerError(Exception):
+     """Raised when pod container is in a terminated or waiting state."""
+
+     pass
+ class VersionMismatchError(Exception):
+     """Raised when the Kubetorch client version is incompatible with the version running on the target cluster."""
+
+     pass
+
+
+ class SecretNotFound(Exception):
+     """Raised when trying to update a kubetorch secret that does not exist."""
+
+     def __init__(self, secret_name: str, namespace: str):
+         super().__init__(f"kubetorch secret {secret_name} was not found in namespace {namespace}")
+
+
+ class RsyncError(Exception):
+     """Raised when an rsync command fails."""
+
+     def __init__(self, cmd: str, returncode: int, stdout: str, stderr: str):
+         self.cmd = cmd
+         self.returncode = returncode
+         self.stdout = stdout
+         self.stderr = stderr
+         super().__init__(f"Rsync failed (code={returncode}): {stderr.strip()}")
+
+
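+ # Pod, container, and revision failure reasons that should abort a readiness
+ # wait immediately, mapped to the exception raised for each reason.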
+ TERMINATE_EARLY_ERRORS = {
+     "ContainerMissing": ImagePullError,
+     "ImagePullBackOff": ImagePullError,
+     "ErrImagePull": ImagePullError,
+     "CrashLoopBackOff": ServiceHealthError,
+     "BackOff": ServiceHealthError,
+     "StartupError": StartupError,
+     "FailedMount": StartupError,
+ }
+
+
+ def _run_bash(
+     commands: Union[str, List[str]],
+     core_api: "CoreV1Api",
+     pod_names: List[str],
+     namespace: str,
+     container: str = None,
+ ):
+     """Run shell command(s) on the given pods via the Kubernetes exec API.
+
+     Returns a list of [exit_code, stdout, stderr] entries, one per command per pod.
+     """
+     if isinstance(commands, str):
+         commands = [commands]
+     commands = [["/bin/sh", "-c", f'{command}; echo "::EXIT_CODE::$?"'] for command in commands]
+
+     if isinstance(pod_names, str):
+         pod_names = [pod_names]
+
+     ret_codes = []
+     for exec_command in commands:
+         for pod_name in pod_names:
+             if not container:
+                 pod = core_api.read_namespaced_pod(name=pod_name, namespace=namespace)
+                 if not pod.spec.containers:
+                     raise Exception(f"No containers found in pod {pod_name}")
+                 container = pod.spec.containers[0].name
+             try:
+                 resp = stream(
+                     core_api.connect_get_namespaced_pod_exec,
+                     pod_name,
+                     namespace,
+                     container=container,
+                     command=exec_command,
+                     stderr=True,
+                     stdin=False,
+                     stdout=True,
+                 )
+
+                 resp = resp.splitlines()
+                 exit_code = 0
+
+                 for line in resp:
+                     if "::EXIT_CODE::" in line:
+                         try:
+                             exit_code = int(line.split("::EXIT_CODE::")[-1].strip())
+                             resp.remove(line)
+                             break
+                         except ValueError:
+                             pass
+
+                 stdout = "\n".join(resp)
+
+                 if exit_code == 0:
+                     ret_codes.append([exit_code, stdout, ""])
+                 else:
+                     ret_codes.append([exit_code, "", stdout])
+
+             except Exception as e:
+                 raise Exception(f"Failed to execute command {exec_command} on pod {pod_name}: {str(e)}") from e
+     return ret_codes
+
+
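+ # Example (illustrative; assumes a reachable cluster and a running pod):
+ #   core_api = client.CoreV1Api()
+ #   [[code, out, err]] = _run_bash("python --version", core_api, ["my-pod"], "default")
+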
+ def _get_rsync_exclude_options() -> str:
+     """Get rsync exclude options using .gitignore and/or .ktignore if available."""
+     from pathlib import Path
+
+     # Allow users to hard override all of our settings
+     if os.environ.get("KT_RSYNC_FILTERS"):
+         logger.debug(
+             f"KT_RSYNC_FILTERS environment variable set, using rsync filters: {os.environ['KT_RSYNC_FILTERS']}"
+         )
+         return os.environ["KT_RSYNC_FILTERS"]
+
+     repo_root, _ = locate_working_dir(os.getcwd())
+     gitignore_path = os.path.join(repo_root, ".gitignore")
+     kt_ignore_path = os.path.join(repo_root, ".ktignore")
+
+     exclude_args = ""
+     if Path(kt_ignore_path).exists():
+         exclude_args += f" --exclude-from='{kt_ignore_path}'"
+     if Path(gitignore_path).exists():
+         exclude_args += f" --exclude-from='{gitignore_path}'"
+     # Add some reasonable default exclusions
+     exclude_args += " --exclude='*.pyc' --exclude='__pycache__' --exclude='.venv' --exclude='.git'"
+
+     return exclude_args.strip()
+
+
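+ # Example override (hypothetical filter values; KT_RSYNC_FILTERS replaces all
+ # of the ignore-file and default exclusions above):
+ #   KT_RSYNC_FILTERS="--exclude='data/' --exclude='*.ckpt'"
+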
+ def is_pod_terminated(pod: client.V1Pod) -> bool:
+     # Check if pod is marked for deletion
+     if pod.metadata.deletion_timestamp is not None:
+         return True
+
+     # Check pod phase
+     if pod.status.phase in ["Succeeded", "Failed"]:
+         return True
+
+     # Check container statuses
+     if pod.status.container_statuses:
+         for container in pod.status.container_statuses:
+             if container.state.terminated:
+                 return True
+
+     return False
+
+
+ # ----------------- ConfigMap utils ----------------- #
+ def load_configmaps(
+     core_api: client.CoreV1Api,
+     service_name: str,
+     namespace: str,
+     console: "Console" = None,
+ ) -> List[str]:
+     """List the configmaps labeled for a given service."""
+     try:
+         configmaps = core_api.list_namespaced_config_map(
+             namespace=namespace,
+             label_selector=f"kubetorch.com/service={service_name}",
+         )
+         return [cm.metadata.name for cm in configmaps.items]
+     except ApiException as e:
+         if console:
+             console.print(f"[yellow]Warning:[/yellow] Failed to list configmaps: {e}")
+         return []
+
+
+ # ----------------- Resource Deletion Utils ----------------- #
+ def delete_configmaps(
+     core_api: client.CoreV1Api,
+     configmaps: List[str],
+     namespace: str,
+     console: "Console" = None,
+     force: bool = False,
+ ):
+     """Delete the given list of configmaps."""
+
+     grace_period_seconds, propagation_policy = None, None
+     if force:
+         grace_period_seconds = 0
+         propagation_policy = "Foreground"
+
+     for cm in configmaps:
+         try:
+             core_api.delete_namespaced_config_map(
+                 name=cm,
+                 namespace=namespace,
+                 grace_period_seconds=grace_period_seconds,
+                 propagation_policy=propagation_policy,
+             )
+             if console:
+                 console.print(f"✓ Deleted configmap [blue]{cm}[/blue]")
+         except ApiException as e:
+             if e.status == 404:
+                 if console:
+                     console.print(f"[yellow]Warning:[/yellow] ConfigMap {cm} not found")
+             else:
+                 if console:
+                     console.print(f"[red]Error:[/red] Failed to delete configmap {cm}: {e}")
+
+
+ def delete_service(
+     custom_api: client.CustomObjectsApi,
+     name: str,
+     namespace: str,
+     console: "Console" = None,
+     force: bool = False,
+ ):
+     """Delete a Knative service."""
+
+     grace_period_seconds, propagation_policy = None, None
+     if force:
+         grace_period_seconds = 0
+         propagation_policy = "Foreground"
+
+     try:
+         custom_api.delete_namespaced_custom_object(
+             group="serving.knative.dev",
+             version="v1",
+             namespace=namespace,
+             plural="services",
+             name=name,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted service [blue]{name}[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             if console:
+                 console.print(f"[yellow]Note:[/yellow] Service {name} not found or already deleted")
+         else:
+             if console:
+                 console.print(f"[red]Error:[/red] Failed to delete service {name}: {e}")
+
+
+ def delete_deployment(
+     apps_v1_api: client.AppsV1Api,
+     core_api: client.CoreV1Api,
+     name: str,
+     namespace: str,
+     console: "Console" = None,
+     force: bool = False,
+ ):
+     """Delete a Deployment and its associated service."""
+     grace_period_seconds, propagation_policy = None, None
+     if force:
+         grace_period_seconds = 0
+         propagation_policy = "Foreground"
+     try:
+         # Delete the Deployment
+         apps_v1_api.delete_namespaced_deployment(
+             name=name,
+             namespace=namespace,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted deployment [blue]{name}[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             if console:
+                 console.print(f"[yellow]Note:[/yellow] Deployment {name} not found or already deleted")
+         else:
+             if console:
+                 console.print(f"[red]Error:[/red] Failed to delete deployment {name}: {e}")
+
+     # Delete the associated service (regular service, not headless)
+     try:
+         core_api.delete_namespaced_service(
+             name=name,
+             namespace=namespace,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted service [blue]{name}[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             if console:
+                 console.print(f"[yellow]Note:[/yellow] Service {name} not found or already deleted")
+         else:
+             if console:
+                 console.print(f"[red]Error:[/red] Failed to delete service {name}: {e}")
+
+     # Also try to delete the headless service for distributed deployments
+     try:
+         core_api.delete_namespaced_service(
+             name=f"{name}-headless",
+             namespace=namespace,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted headless service [blue]{name}-headless[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             # This is normal for non-distributed deployments
+             pass
+         else:
+             if console:
+                 console.print(f"[red]Error:[/red] Failed to delete headless service {name}-headless: {e}")
+
+
+ def delete_raycluster(
+     custom_api: client.CustomObjectsApi,
+     core_api: client.CoreV1Api,
+     name: str,
+     namespace: str,
+     console: "Console" = None,
+     force: bool = False,
+ ):
+     """Delete a RayCluster and its associated service."""
+
+     grace_period_seconds, propagation_policy = None, None
+     if force:
+         grace_period_seconds = 0
+         propagation_policy = "Foreground"
+
+     try:
+         # Delete the RayCluster
+         custom_api.delete_namespaced_custom_object(
+             group="ray.io",
+             version="v1",
+             namespace=namespace,
+             plural="rayclusters",
+             name=name,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted RayCluster [blue]{name}[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             if console:
+                 console.print(f"[yellow]Note:[/yellow] RayCluster {name} not found or already deleted")
+         else:
+             if console:
+                 console.print(f"[red]Error:[/red] Failed to delete RayCluster {name}: {e}")
+
+     # Delete the associated service (created alongside RayCluster)
+     try:
+         core_api.delete_namespaced_service(
+             name=name,
+             namespace=namespace,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted service [blue]{name}[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             if console:
+                 console.print(f"[yellow]Note:[/yellow] Service {name} not found or already deleted")
+         else:
+             if console:
+                 console.print(f"[red]Error:[/red] Failed to delete service {name}: {e}")
+
+     # Delete the headless service for Ray pod discovery
+     try:
+         core_api.delete_namespaced_service(
+             name=f"{name}-headless",
+             namespace=namespace,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted headless service [blue]{name}-headless[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             # This is normal for older Ray clusters without headless services
+             pass
+         else:
+             if console:
+                 console.print(f"[red]Error:[/red] Failed to delete headless service {name}-headless: {e}")
+
+
+ def delete_resources_for_service(
+     core_api: client.CoreV1Api,
+     custom_api: client.CustomObjectsApi,
+     configmaps: List[str],
+     name: str,
+     service_type: str = "knative",
+     namespace: str = None,
+     console: "Console" = None,
+     force: bool = False,
+ ):
+     """Delete service resources based on service type."""
+     # Delete the main service (Knative, Deployment, or RayCluster)
+     if service_type == "deployment":
+         apps_v1_api = client.AppsV1Api()
+         delete_deployment(
+             apps_v1_api=apps_v1_api,
+             core_api=core_api,
+             name=name,
+             namespace=namespace,
+             console=console,
+             force=force,
+         )
+     elif service_type == "raycluster":
+         delete_raycluster(
+             custom_api=custom_api,
+             core_api=core_api,
+             name=name,
+             namespace=namespace,
+             console=console,
+             force=force,
+         )
+     else:  # knative or unknown - try deleting as Knative service
+         delete_service(
+             custom_api=custom_api,
+             name=name,
+             namespace=namespace,
+             console=console,
+             force=force,
+         )
+
+     # Delete configmaps
+     if configmaps:
+         delete_configmaps(
+             core_api=core_api,
+             configmaps=configmaps,
+             namespace=namespace,
+             console=console,
+             force=force,
+         )
+
+     delete_cached_service_data(core_api=core_api, service_name=name, namespace=namespace, console=console)
+
+
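+ # Example (illustrative; hypothetical resource names):
+ #   delete_resources_for_service(
+ #       core_api=client.CoreV1Api(),
+ #       custom_api=client.CustomObjectsApi(),
+ #       configmaps=["my-svc-config"],
+ #       name="my-svc",
+ #       service_type="deployment",
+ #       namespace="default",
+ #   )
+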
+ def delete_cached_service_data(
+     core_api: client.CoreV1Api,
+     service_name: str,
+     namespace: str,
+     console: "Console" = None,
+ ):
+     """Delete service data from the rsync pod."""
+     try:
+         # Find the rsync pod name in the provided namespace
+         pods = core_api.list_namespaced_pod(namespace=namespace, label_selector="app=kubetorch-rsync")
+
+         if not pods.items:
+             if console:
+                 console.print(f"[yellow]No rsync pod found in namespace {namespace}[/yellow]")
+             return
+
+         pod_name = pods.items[0].metadata.name
+         service_path = f"/data/{namespace}/{service_name}"
+
+         shell_cmd = (
+             f"if [ -d '{service_path}' ]; then rm -rf '{service_path}' && echo 'Deleted {service_path}'; "
+             f"else echo 'Path {service_path} not found'; fi"
+         )
+
+         # Execute command based on environment
+         if is_running_in_kubernetes():
+             response = stream(
+                 core_api.connect_get_namespaced_pod_exec,
+                 name=pod_name,
+                 namespace=namespace,
+                 command=["sh", "-c", shell_cmd],
+                 stderr=True,
+                 stdin=False,
+                 stdout=True,
+                 tty=False,
+             )
+             output = response.strip()
+
+         else:
+             cmd = [
+                 "kubectl",
+                 "exec",
+                 "-n",
+                 namespace,
+                 pod_name,
+                 "--",
+                 "sh",
+                 "-c",
+                 shell_cmd,
+             ]
+             result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
+
+             if result.returncode != 0:
+                 if console:
+                     console.print(f"[red]Error cleaning up cached data: {result.stderr}[/red]")
+                 return
+             output = result.stdout.strip()
+
+         if console:
+             if "Deleted" in output:
+                 console.print(f"✓ Deleted cached data for [blue]{service_name}[/blue]")
+
+     except subprocess.TimeoutExpired:
+         if console:
+             console.print("[red]Timeout while cleaning up cached service data[/red]")
+         else:
+             logger.debug("Timeout while cleaning up cached data")
+
+     except Exception as e:
+         if console:
+             console.print(f"[red]Failed to clean up cached service data: {e}[/red]")
+         else:
+             logger.debug(f"Failed to clean up cached data: {e}")
+
+
+ def _collect_modules(target_str):
+     from kubetorch.resources.callables.module import Module
+
+     to_deploy = []
+
+     if ":" in target_str:
+         target_module_or_path, target_fn_or_class = target_str.split(":")
+     else:
+         target_module_or_path, target_fn_or_class = target_str, None
+
+     if target_module_or_path.endswith(".py"):
+         abs_path = Path(target_module_or_path).resolve()
+         python_module_name = inspect.getmodulename(str(abs_path))
+
+         sys.path.insert(0, str(abs_path.parent))
+     else:
+         python_module_name = target_module_or_path
+         sys.path.append(".")
+
+     module = importlib.import_module(python_module_name)
+
+     if target_fn_or_class:
+         if not hasattr(module, target_fn_or_class):
+             raise ValueError(f"Function or class {target_fn_or_class} not found in {target_module_or_path}.")
+         to_deploy = [getattr(module, target_fn_or_class)]
+         if not isinstance(to_deploy[0], Module):
+             raise ValueError(
+                 f"Function or class {target_fn_or_class} in {target_module_or_path} is not decorated with @kt.compute."
+             )
+     else:
+         # Get all functions and classes to deploy
+         for name in dir(module):
+             obj = getattr(module, name)
+             if isinstance(obj, Module):
+                 to_deploy.append(obj)
+         if not to_deploy:
+             raise ValueError(f"No functions or classes decorated with @kt.compute found in {target_module_or_path}.")
+
+     return to_deploy, target_fn_or_class
+
+
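+ # Accepted target formats (illustrative):
+ #   "my_pkg.training"          -> every @kt.compute object in the module
+ #   "train.py:main"            -> a single decorated function from a script path
+ #   "my_pkg.training:Trainer"  -> a single decorated class
+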
+ def fetch_resources_for_teardown(
+     namespace: str,
+     target: str,
+     core_api: client.CoreV1Api,
+     custom_api: client.CustomObjectsApi,
+     prefix: Optional[str] = None,
+     username: Optional[str] = None,
+     exact_match: bool = False,
+ ) -> dict:
+     """Fetches the resources for a given service.
+
+     Returns a dictionary with the following keys:
+     - services: {
+         [service_name]: {
+             "configmaps": List[str],
+             "pods": List[str],
+             "type": str,  # "knative", "deployment", "raycluster", or "unknown"
+         }
+     }
+     """
+     from kubetorch.resources.callables.module import Module
+
+     resources = {"services": {}}
+     services = []
+
+     if prefix in ["kt", "kubetorch", "knative"]:
+         raise ValueError(f"Invalid prefix: {prefix} is reserved. Please delete these individually.")
+
+     # Initialize apps API for deployments
+     apps_v1_api = client.AppsV1Api()
+
+     if username or prefix:
+         # Search Knative services
+         try:
+             # Build label selector for Knative services - use template label to identify kubetorch services
+             knative_label_selector = f"{serving_constants.KT_TEMPLATE_LABEL}=ksvc"
+             if username:
+                 knative_label_selector += f",{KT_USERNAME_LABEL}={username}"
+
+             response = custom_api.list_namespaced_custom_object(
+                 group="serving.knative.dev",
+                 version="v1",
+                 namespace=namespace,
+                 plural="services",
+                 label_selector=knative_label_selector,
+             )
+             items = response.get("items", [])
+             knative_services = [
+                 item["metadata"]["name"] for item in items if (username or item["metadata"]["name"].startswith(prefix))
+             ]
+             services.extend(knative_services)
+         except client.exceptions.ApiException as e:
+             if e.status != 404:  # Ignore if Knative is not installed
+                 logger.warning(f"Failed to list Knative services: {e}")
+
+         # Search Deployments
+         try:
+             # Build label selector for deployments - use KT_TEMPLATE_LABEL to identify kubetorch deployments
+             deployment_label_selector = f"{serving_constants.KT_TEMPLATE_LABEL}=deployment"
+             if username:
+                 deployment_label_selector += f",{KT_USERNAME_LABEL}={username}"
+
+             deployments_response = apps_v1_api.list_namespaced_deployment(
+                 namespace=namespace,
+                 label_selector=deployment_label_selector,
+             )
+             deployment_services = [
+                 deployment.metadata.name
+                 for deployment in deployments_response.items
+                 if (username or deployment.metadata.name.startswith(prefix))
+             ]
+             services.extend(deployment_services)
+         except client.exceptions.ApiException as e:
+             logger.warning(f"Failed to list Deployments: {e}")
+
+         # Search RayClusters
+         try:
+             # Build label selector for rayclusters - use template label to identify kubetorch rayclusters
+             raycluster_label_selector = f"{serving_constants.KT_TEMPLATE_LABEL}=raycluster"
+             if username:
+                 raycluster_label_selector += f",{KT_USERNAME_LABEL}={username}"
+
+             response = custom_api.list_namespaced_custom_object(
+                 group="ray.io",
+                 version="v1",
+                 namespace=namespace,
+                 plural="rayclusters",
+                 label_selector=raycluster_label_selector,
+             )
+             items = response.get("items", [])
+             raycluster_services = [
+                 item["metadata"]["name"] for item in items if (username or item["metadata"]["name"].startswith(prefix))
+             ]
+             services.extend(raycluster_services)
+         except client.exceptions.ApiException as e:
+             if e.status != 404:  # Ignore if Ray operator is not installed
+                 logger.warning(f"Failed to list RayClusters: {e}")
+
+     else:
+         if not target:
+             raise ValueError("Please provide a service name or use the --all or --prefix flags")
+
+         # Case when service_name is a module or file path (i.e. the `kt deploy` usage path)
+         if ":" in target or ".py" in target or "." in target:
+             to_down, _ = _collect_modules(target)
+             services = [mod.service_name for mod in to_down if isinstance(mod, Module)]
+         else:
+             services = [target]
+             # if the target is not prefixed with the username, add the username prefix
+             username = kubetorch.globals.config.username
+             if username and not exact_match and not target.startswith(username + "-"):
+                 services.append(username + "-" + target)
+
+     for service_name in services:
+         service_type = None
+         service_found = False
+
+         # Check if it's a Knative service
+         try:
+             service = custom_api.get_namespaced_custom_object(
+                 group="serving.knative.dev",
+                 version="v1",
+                 namespace=namespace,
+                 plural="services",
+                 name=service_name,
+             )
+             if service:
+                 service_type = "knative"
+                 service_found = True
+         except client.exceptions.ApiException:
+             pass
+
+         # Check if it's a Deployment (if not found as Knative service)
+         if not service_found:
+             try:
+                 deployment = apps_v1_api.read_namespaced_deployment(name=service_name, namespace=namespace)
+                 # Only consider it if it has kubetorch template label
+                 if (
+                     deployment.metadata.labels
+                     and deployment.metadata.labels.get(serving_constants.KT_TEMPLATE_LABEL) == "deployment"
+                 ):
+                     service_type = "deployment"
+                     service_found = True
+             except client.exceptions.ApiException:
+                 pass
+
+         # Check if it's a RayCluster (if not found as Knative or Deployment)
+         if not service_found:
+             try:
+                 raycluster = custom_api.get_namespaced_custom_object(
+                     group="ray.io",
+                     version="v1",
+                     namespace=namespace,
+                     plural="rayclusters",
+                     name=service_name,
+                 )
+                 if raycluster:
+                     service_type = "raycluster"
+                     service_found = True
+             except client.exceptions.ApiException:
+                 pass
+
+         # Get associated resources if service exists
+         configmaps = load_configmaps(core_api, service_name, namespace)
+         pods = core_api.list_namespaced_pod(namespace=namespace, label_selector=f"{KT_SERVICE_LABEL}={service_name}")
+         pods = [pod.metadata.name for pod in pods.items]
+
+         # Only add the service to the resources if it has configmaps, pods, or we found the service
+         if service_found or configmaps or pods:
+             resources["services"][service_name] = {
+                 "configmaps": configmaps,
+                 "pods": pods,
+                 "type": service_type or "unknown",
+             }
+
+     return resources
+
+
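+ # Example (illustrative): gather everything owned by one user for teardown:
+ #   resources = fetch_resources_for_teardown(
+ #       namespace="default",
+ #       target=None,
+ #       core_api=client.CoreV1Api(),
+ #       custom_api=client.CustomObjectsApi(),
+ #       username="alice",
+ #   )
+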
+ # ----------------- Image Builder Utils ----------------- #
+ def _get_sync_package_paths(
+     package: str,
+ ):
+     if "/" in package or "~" in package:
+         package_path = (
+             Path(package).expanduser()
+             if Path(package).expanduser().is_absolute()
+             else Path(locate_working_dir()[0]) / package
+         )
+         dest_dir = str(package_path.name)
+     else:
+         package_path = get_local_install_path(package)
+         dest_dir = package
+
+     if not (package_path and Path(package_path).exists()):
+         raise ValueError(f"Could not locate local package {package}")
+
+     full_path = Path(package_path).expanduser().resolve()
+     return str(full_path), dest_dir
+
+
+ # ----------------- Error Handling Utils ----------------- #
+ def check_pod_status_for_errors(pod: client.V1Pod, queue_name: str = None, scheduler_name: str = None):
+     """Check pod status for errors"""
+     # Check for scheduling issues
+     for condition in pod.status.conditions or []:
+         if condition.type == "PodScheduled" and condition.status == "False" and condition.reason == "Unschedulable":
+             msg = condition.message.lower()
+
+             # Check if the pod is scheduled in the correct queue and scheduler
+             if queue_name and scheduler_name:
+                 scheduler = pod.metadata.annotations.get("schedulerName", "")
+                 queue_label = pod.metadata.labels.get("kai.scheduler/queue")
+                 if queue_label == queue_name and scheduler == scheduler_name:
+                     raise QueueUnschedulableError(
+                         f"Pod {pod.metadata.name} could not be scheduled: {condition.message}"
+                     )
+
+             # Check for specific node selector/affinity/GPU type mismatches
+             # without matching temporary resource exhaustion messages
+             if any(
+                 x in msg
+                 for x in [
+                     "node selector not matched",
+                     "node affinity mismatch",
+                     "unsupported gpu type",
+                     "unknown instance type",
+                     "didn't match pod's node affinity/selector",
+                 ]
+             ):
+                 raise ResourceNotAvailableError(
+                     f"Required compute resources are not configured in the cluster: {condition.message}"
+                 )
+
+     # Check for container status errors
+     if pod.status.container_statuses:
+         for container_status in pod.status.container_statuses:
+             if container_status.state and container_status.state.waiting:
+                 reason = container_status.state.waiting.reason
+                 message = container_status.state.waiting.message or ""
+                 if reason in TERMINATE_EARLY_ERRORS:
+                     raise TERMINATE_EARLY_ERRORS[reason](f"Pod {pod.metadata.name}: {message}")
+
+
+ def check_pod_events_for_errors(pod: client.V1Pod, namespace: str, core_api: client.CoreV1Api):
+     """Check pod events for scheduling errors"""
+     try:
+         events = core_api.list_namespaced_event(
+             namespace=namespace,
+             field_selector=f"involvedObject.name={pod.metadata.name}",
+         ).items
+         for event in events:
+             # Check for Karpenter scheduling errors
+             if (
+                 event.reason == "FailedScheduling"
+                 and event.source.component == "karpenter"
+                 and "no instance type has enough resources" in event.message
+             ):
+                 raise ResourceNotAvailableError(f"Pod {pod.metadata.name} failed to schedule: {event.message}")
+     except client.exceptions.ApiException as e:
+         logger.warning(f"Error fetching events for pod {pod.metadata.name}: {e}")
+
+
+ def check_replicaset_events_for_errors(
+     namespace: str,
+     service_name: str,
+     apps_v1_api: client.AppsV1Api,
+     core_api: client.CoreV1Api,
+ ):
+     """Check ReplicaSet events for creation errors like missing PriorityClass.
+
+     Args:
+         namespace: Namespace the service runs in
+         service_name: Name of the service
+         apps_v1_api: Apps API instance
+         core_api: Core API instance
+
+     Raises:
+         ResourceNotAvailableError: If ReplicaSet creation fails due to missing resources
+     """
+     try:
+         # Get ReplicaSets associated with this Deployment
+         replicasets = apps_v1_api.list_namespaced_replica_set(
+             namespace=namespace,
+             label_selector=f"kubetorch.com/service={service_name}",
+         ).items
+
+         for replicaset in replicasets:
+             # Check ReplicaSet events for FailedCreate errors
+             events = core_api.list_namespaced_event(
+                 namespace=namespace,
+                 field_selector=f"involvedObject.name={replicaset.metadata.name}",
+             ).items
+
+             for event in events:
+                 if event.reason == "FailedCreate" and event.type == "Warning" and "forbidden" in event.message.lower():
+                     # Check for specific PriorityClass errors
+                     if "priorityclass" in event.message.lower():
+                         raise ResourceNotAvailableError(
+                             f"ReplicaSet {replicaset.metadata.name} failed to create pods: "
+                             f"{event.message}. Please ensure the required PriorityClass exists in the cluster."
+                         )
+                     # Check for other forbidden errors
+                     elif any(
+                         error_type in event.message.lower()
+                         for error_type in [
+                             "forbidden",
+                             "no priorityclass",
+                             "priority class",
+                         ]
+                     ):
+                         raise ResourceNotAvailableError(
+                             f"ReplicaSet {replicaset.metadata.name} failed to create pods: "
+                             f"{event.message}. Please check cluster configuration and permissions."
+                         )
+
+     except client.exceptions.ApiException as e:
+         logger.warning(f"Error checking ReplicaSet events for {service_name}: {e}")
+     except ResourceNotAvailableError:
+         # Re-raise ResourceNotAvailableError to stop the readiness check
+         raise
+
+
+ def check_revision_for_errors(revision_name: str, namespace: str, objects_api: client.CustomObjectsApi):
+     """Check revision for errors"""
+     try:
+         revision = objects_api.get_namespaced_custom_object(
+             group="serving.knative.dev",
+             version="v1",
+             namespace=namespace,
+             plural="revisions",
+             name=revision_name,
+         )
+         for cond in revision.get("status", {}).get("conditions", []):
+             if cond["status"] == "False":
+                 reason = cond.get("reason")
+                 message = cond.get("message", f"Revision failed with reason: {reason}")
+                 if reason in TERMINATE_EARLY_ERRORS:
+                     raise TERMINATE_EARLY_ERRORS[reason](f"Revision {revision_name}: {message}")
+     except client.exceptions.ApiException as e:
+         logger.warning(f"Error checking revision: {e}")
+
+
+ def is_port_available(port: int) -> bool:
+     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+         return s.connect_ex(("localhost", port)) != 0
+
+
+ def find_available_port(start_port: int, max_tries: int = 10) -> int:
+     for i in range(max_tries):
+         port = start_port + i
+         if is_port_available(port):
+             return port
+     raise RuntimeError(f"Could not find available port starting from {start_port}")
+
+
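+ # Example (illustrative): pick a free local port for port-forwarding:
+ #   port = find_available_port(8080)
+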
+ # --------------- Secrets utils ---------------------------
+
+
+ def get_parsed_secret(secret: client.V1Secret):
+     labels = secret.metadata.labels
+     secret = {
+         "name": secret.metadata.name,
+         "username": labels.get("kubetorch.com/username", None) if labels else None,
+         "namespace": secret.metadata.namespace,
+         "user_defined_name": labels.get("kubetorch.com/secret-name", None) if labels else None,
+         "labels": labels,
+         "annotations": secret.metadata.annotations,
+         "type": secret.type,
+         "data": secret.data,
+     }
+     return secret
+
+
+ def list_secrets(
+     core_api: client.CoreV1Api,
+     namespace: str = "default",
+     prefix: str = None,
+     all_namespaces: bool = False,
+     filter_by_creator: bool = True,
+     console: "Console" = None,
+ ):
+     try:
+         if all_namespaces:
+             secrets: client.V1SecretList = core_api.list_secret_for_all_namespaces()
+         else:
+             secrets: client.V1SecretList = core_api.list_namespaced_secret(namespace=namespace)
+         if not secrets:
+             return None
+         filtered_secrets = []
+         for secret in secrets.items:
+             parsed_secret = get_parsed_secret(secret)
+             user_defined_secret_name = parsed_secret.get("user_defined_name")
+             if user_defined_secret_name:  # only consider secrets that were created via the kt API
+                 if prefix and filter_by_creator:  # filter secrets by prefix + creator
+                     if (
+                         parsed_secret.get("user_defined_name").startswith(prefix)
+                         and parsed_secret.get("username") == kubetorch.globals.config.username
+                     ):
+                         filtered_secrets.append(parsed_secret)
+                 elif prefix:  # filter secrets by prefix
+                     if parsed_secret.get("user_defined_name").startswith(prefix):
+                         filtered_secrets.append(parsed_secret)
+                 elif filter_by_creator:  # filter secrets by creator
+                     if parsed_secret.get("username") == kubetorch.globals.config.username:
+                         filtered_secrets.append(parsed_secret)
+                 else:  # No additional filters required
+                     filtered_secrets.append(parsed_secret)
+         return filtered_secrets
+
+     except client.rest.ApiException as e:
+         if console:
+             console.print(f"[red]Failed to load secrets: {e}[/red]")
+         return None
+
+
+ def delete_secrets(
+     secrets: List[str],
+     secrets_client: KubernetesSecretsClient,
+     console: "Console" = None,
+ ):
+     """Delete the given list of secrets."""
+     for secret in secrets:
+         secrets_client.delete_secret(secret, console=console)
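+
+
+ # Example (illustrative; assumes a constructed KubernetesSecretsClient):
+ #   mine = list_secrets(client.CoreV1Api(), namespace="default", prefix="aws")
+ #   delete_secrets([s["name"] for s in mine], secrets_client)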