kubetorch-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of kubetorch might be problematic.

Files changed (93)
  1. kubetorch/__init__.py +60 -0
  2. kubetorch/cli.py +1985 -0
  3. kubetorch/cli_utils.py +1025 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +285 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +157 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +133 -0
  30. kubetorch/resources/callables/module.py +1416 -0
  31. kubetorch/resources/callables/utils.py +174 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +261 -0
  34. kubetorch/resources/compute/compute.py +2596 -0
  35. kubetorch/resources/compute/decorators.py +139 -0
  36. kubetorch/resources/compute/rbac.py +74 -0
  37. kubetorch/resources/compute/utils.py +1114 -0
  38. kubetorch/resources/compute/websocket.py +137 -0
  39. kubetorch/resources/images/__init__.py +1 -0
  40. kubetorch/resources/images/image.py +414 -0
  41. kubetorch/resources/images/images.py +74 -0
  42. kubetorch/resources/secrets/__init__.py +2 -0
  43. kubetorch/resources/secrets/kubernetes_secrets_client.py +412 -0
  44. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  45. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  46. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  47. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  48. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  49. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  50. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  51. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  52. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  53. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  54. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  55. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  56. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  57. kubetorch/resources/secrets/provider_secrets/providers.py +93 -0
  58. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  59. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  60. kubetorch/resources/secrets/secret.py +238 -0
  61. kubetorch/resources/secrets/secret_factory.py +70 -0
  62. kubetorch/resources/secrets/utils.py +209 -0
  63. kubetorch/resources/volumes/__init__.py +0 -0
  64. kubetorch/resources/volumes/volume.py +365 -0
  65. kubetorch/servers/__init__.py +0 -0
  66. kubetorch/servers/http/__init__.py +0 -0
  67. kubetorch/servers/http/distributed_utils.py +3223 -0
  68. kubetorch/servers/http/http_client.py +730 -0
  69. kubetorch/servers/http/http_server.py +1788 -0
  70. kubetorch/servers/http/server_metrics.py +278 -0
  71. kubetorch/servers/http/utils.py +728 -0
  72. kubetorch/serving/__init__.py +0 -0
  73. kubetorch/serving/autoscaling.py +173 -0
  74. kubetorch/serving/base_service_manager.py +363 -0
  75. kubetorch/serving/constants.py +83 -0
  76. kubetorch/serving/deployment_service_manager.py +478 -0
  77. kubetorch/serving/knative_service_manager.py +519 -0
  78. kubetorch/serving/raycluster_service_manager.py +582 -0
  79. kubetorch/serving/service_manager.py +18 -0
  80. kubetorch/serving/templates/deployment_template.yaml +17 -0
  81. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  82. kubetorch/serving/templates/kt_setup_template.sh.j2 +81 -0
  83. kubetorch/serving/templates/pod_template.yaml +194 -0
  84. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  85. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  86. kubetorch/serving/templates/service_template.yaml +21 -0
  87. kubetorch/serving/templates/workerset_template.yaml +36 -0
  88. kubetorch/serving/utils.py +377 -0
  89. kubetorch/utils.py +284 -0
  90. kubetorch-0.2.0.dist-info/METADATA +121 -0
  91. kubetorch-0.2.0.dist-info/RECORD +93 -0
  92. kubetorch-0.2.0.dist-info/WHEEL +4 -0
  93. kubetorch-0.2.0.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,1114 @@
+ import importlib
+ import inspect
+ import os
+ import socket
+ import subprocess
+ import sys
+ from pathlib import Path
+ from typing import List, Optional, Union
+
+ from kubernetes import client
+ from kubernetes.client.rest import ApiException
+ from kubernetes.stream import stream
+
+ import kubetorch.globals
+ from kubetorch.logger import get_logger
+ from kubetorch.resources.callables.utils import (
+     get_local_install_path,
+     locate_working_dir,
+ )
+ from kubetorch.resources.secrets.kubernetes_secrets_client import (
+     KubernetesSecretsClient,
+ )
+ from kubetorch.servers.http.utils import is_running_in_kubernetes, StartupError
+ from kubetorch.serving import constants as serving_constants
+ from kubetorch.serving.constants import KT_SERVICE_LABEL, KT_USERNAME_LABEL
+
+ logger = get_logger(__name__)
+
+
+ class KnativeServiceError(Exception):
+     """Base exception for Knative service errors."""
+
+     pass
+
+
+ class ImagePullError(KnativeServiceError):
+     """Raised when a container image pull fails."""
+
+     pass
+
+
+ class ResourceNotAvailableError(Exception):
+     """Raised when required compute resources (GPU, memory, etc.) are not available in the cluster."""
+
+     pass
+
+
+ class ServiceHealthError(KnativeServiceError):
+     """Raised when service health checks fail."""
+
+     pass
+
+
+ class ServiceTimeoutError(KnativeServiceError):
+     """Raised when a service fails to become ready within the timeout period."""
+
+     pass
+
+
+ class QueueUnschedulableError(KnativeServiceError):
+     """Raised when the service pod is unschedulable in the requested queue."""
+
+     pass
+
+
+ class KnativeServiceConflictError(Exception):
+     """Raised when a conflicting non-Knative Kubernetes Service prevents Knative service creation."""
+
+     pass
+
+
+ class PodContainerError(Exception):
+     """Raised when a pod container is in a terminated or waiting state."""
+
+     pass
+
+
+ class VersionMismatchError(Exception):
+     """Raised when the Kubetorch client version is incompatible with the version running on the target cluster."""
+
+     pass
+
+
+ class SecretNotFound(Exception):
+     """Raised when attempting to update a kubetorch secret that does not exist."""
+
+     def __init__(self, secret_name: str, namespace: str):
+         super().__init__(
+             f"kubetorch secret {secret_name} was not found in namespace {namespace}"
+         )
+
+
+ class RsyncError(Exception):
+     def __init__(self, cmd: str, returncode: int, stdout: str, stderr: str):
+         self.cmd = cmd
+         self.returncode = returncode
+         self.stdout = stdout
+         self.stderr = stderr
+         super().__init__(f"Rsync failed (code={returncode}): {stderr.strip()}")
+
+
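+ # Maps pod/revision failure reasons to the exception that should abort a readiness
+ # wait immediately, rather than letting the wait run to its full timeout.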
+ TERMINATE_EARLY_ERRORS = {
+     "ContainerMissing": ImagePullError,
+     "ImagePullBackOff": ImagePullError,
+     "ErrImagePull": ImagePullError,
+     "CrashLoopBackOff": ServiceHealthError,
+     "BackOff": ServiceHealthError,
+     "StartupError": StartupError,
+     "FailedMount": StartupError,
+ }
+
+
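+ # The pod exec stream API returns only the command's output, not its exit status, so
+ # each command echoes a "::EXIT_CODE::<n>" sentinel that is parsed back out of the
+ # output. Illustrative usage (the pod name here is hypothetical):
+ #     _run_bash("ls /data", core_api, ["my-pod"], "default")
+ #     -> [[0, "<stdout>", ""]] on success, or [[code, "", "<output>"]] on failure.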
+ def _run_bash(
+     commands: Union[str, List[str]],
+     core_api: client.CoreV1Api,
+     pod_names: List[str],
+     namespace: str,
+     container: str = None,
+ ):
+     if isinstance(commands, str):
+         commands = [commands]
+     commands = [
+         ["/bin/sh", "-c", f'{command}; echo "::EXIT_CODE::$?"'] for command in commands
+     ]
+
+     if isinstance(pod_names, str):
+         pod_names = [pod_names]
+
+     ret_codes = []
+     for exec_command in commands:
+         for pod_name in pod_names:
+             if not container:
+                 pod = core_api.read_namespaced_pod(name=pod_name, namespace=namespace)
+                 if not pod.spec.containers:
+                     raise Exception(f"No containers found in pod {pod_name}")
+                 container = pod.spec.containers[0].name
+             try:
+                 resp = stream(
+                     core_api.connect_get_namespaced_pod_exec,
+                     pod_name,
+                     namespace,
+                     container=container,
+                     command=exec_command,
+                     stderr=True,
+                     stdin=False,
+                     stdout=True,
+                 )
+
+                 resp = resp.splitlines()
+                 exit_code = 0
+
+                 for line in resp:
+                     if "::EXIT_CODE::" in line:
+                         try:
+                             exit_code = int(line.split("::EXIT_CODE::")[-1].strip())
+                             resp.remove(line)
+                             break
+                         except ValueError:
+                             pass
+
+                 stdout = "\n".join(resp)
+
+                 if exit_code == 0:
+                     ret_codes.append([exit_code, stdout, ""])
+                 else:
+                     ret_codes.append([exit_code, "", stdout])
+
+             except Exception as e:
+                 raise Exception(
+                     f"Failed to execute command {exec_command} on pod {pod_name}: {str(e)}"
+                 )
+     return ret_codes
+
+
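+ # With a repo root of /repo containing both ignore files, the returned value would be
+ # a single-line string along the lines of (illustrative, wrapped here for readability):
+ #     --exclude-from='/repo/.ktignore' --exclude-from='/repo/.gitignore'
+ #     --exclude='*.pyc' --exclude='__pycache__' --exclude='.venv' --exclude='.git'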
+ def _get_rsync_exclude_options() -> str:
+     """Get rsync exclude options using .gitignore and/or .ktignore if available."""
+     # Allow users to hard override all of our settings
+     if os.environ.get("KT_RSYNC_FILTERS"):
+         logger.debug(
+             f"KT_RSYNC_FILTERS environment variable set, using rsync filters: {os.environ['KT_RSYNC_FILTERS']}"
+         )
+         return os.environ["KT_RSYNC_FILTERS"]
+
+     repo_root = locate_working_dir(os.getcwd())
+     gitignore_path = os.path.join(repo_root, ".gitignore")
+     kt_ignore_path = os.path.join(repo_root, ".ktignore")
+
+     exclude_args = ""
+     if Path(kt_ignore_path).exists():
+         exclude_args += f" --exclude-from='{kt_ignore_path}'"
+     if Path(gitignore_path).exists():
+         exclude_args += f" --exclude-from='{gitignore_path}'"
+     # Add some reasonable default exclusions
+     exclude_args += (
+         " --exclude='*.pyc' --exclude='__pycache__' --exclude='.venv' --exclude='.git'"
+     )
+
+     return exclude_args.strip()
+
+
+ def is_pod_terminated(pod: client.V1Pod) -> bool:
+     # Check if pod is marked for deletion
+     if pod.metadata.deletion_timestamp is not None:
+         return True
+
+     # Check pod phase
+     if pod.status.phase in ["Succeeded", "Failed"]:
+         return True
+
+     # Check container statuses
+     if pod.status.container_statuses:
+         for container in pod.status.container_statuses:
+             if container.state.terminated:
+                 return True
+
+     return False
+
+
+ # ----------------- ConfigMap utils ----------------- #
+ def load_configmaps(
+     core_api: client.CoreV1Api,
+     service_name: str,
+     namespace: str,
+     console: "Console" = None,
+ ) -> List[str]:
+     """List configmaps labeled with the given service name."""
+     try:
+         configmaps = core_api.list_namespaced_config_map(
+             namespace=namespace,
+             label_selector=f"kubetorch.com/service={service_name}",
+         )
+         return [cm.metadata.name for cm in configmaps.items]
+     except ApiException as e:
+         if console:
+             console.print(f"[yellow]Warning:[/yellow] Failed to list configmaps: {e}")
+         return []
+
+
+ # ----------------- Resource Deletion Utils ----------------- #
+ def delete_configmaps(
+     core_api: client.CoreV1Api,
+     configmaps: List[str],
+     namespace: str,
+     console: "Console" = None,
+     force: bool = False,
+ ):
+     """Delete the given list of configmaps."""
+
+     grace_period_seconds, propagation_policy = None, None
+     if force:
+         grace_period_seconds = 0
+         propagation_policy = "Foreground"
+
+     for cm in configmaps:
+         try:
+             core_api.delete_namespaced_config_map(
+                 name=cm,
+                 namespace=namespace,
+                 grace_period_seconds=grace_period_seconds,
+                 propagation_policy=propagation_policy,
+             )
+             if console:
+                 console.print(f"✓ Deleted configmap [blue]{cm}[/blue]")
+         except ApiException as e:
+             if e.status == 404:
+                 if console:
+                     console.print(f"[yellow]Warning:[/yellow] ConfigMap {cm} not found")
+             else:
+                 if console:
+                     console.print(
+                         f"[red]Error:[/red] Failed to delete configmap {cm}: {e}"
+                     )
+
+
+ def delete_service(
+     custom_api: client.CustomObjectsApi,
+     name: str,
+     namespace: str,
+     console: "Console" = None,
+     force: bool = False,
+ ):
+     """Delete a Knative service."""
+
+     grace_period_seconds, propagation_policy = None, None
+     if force:
+         grace_period_seconds = 0
+         propagation_policy = "Foreground"
+
+     try:
+         custom_api.delete_namespaced_custom_object(
+             group="serving.knative.dev",
+             version="v1",
+             namespace=namespace,
+             plural="services",
+             name=name,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted service [blue]{name}[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             if console:
+                 console.print(
+                     f"[yellow]Note:[/yellow] Service {name} not found or already deleted"
+                 )
+         else:
+             if console:
+                 console.print(f"[red]Error:[/red] Failed to delete service {name}: {e}")
+
+
+ def delete_deployment(
+     apps_v1_api: client.AppsV1Api,
+     core_api: client.CoreV1Api,
+     name: str,
+     namespace: str,
+     console: "Console" = None,
+     force: bool = False,
+ ):
+     """Delete a Deployment and its associated service."""
+     grace_period_seconds, propagation_policy = None, None
+     if force:
+         grace_period_seconds = 0
+         propagation_policy = "Foreground"
+     try:
+         # Delete the Deployment
+         apps_v1_api.delete_namespaced_deployment(
+             name=name,
+             namespace=namespace,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted deployment [blue]{name}[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             if console:
+                 console.print(
+                     f"[yellow]Note:[/yellow] Deployment {name} not found or already deleted"
+                 )
+         else:
+             if console:
+                 console.print(
+                     f"[red]Error:[/red] Failed to delete deployment {name}: {e}"
+                 )
+
+     # Delete the associated service (regular service, not headless)
+     try:
+         core_api.delete_namespaced_service(
+             name=name,
+             namespace=namespace,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted service [blue]{name}[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             if console:
+                 console.print(
+                     f"[yellow]Note:[/yellow] Service {name} not found or already deleted"
+                 )
+         else:
+             if console:
+                 console.print(f"[red]Error:[/red] Failed to delete service {name}: {e}")
+
+     # Also try to delete the headless service for distributed deployments
+     try:
+         core_api.delete_namespaced_service(
+             name=f"{name}-headless",
+             namespace=namespace,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted headless service [blue]{name}-headless[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             # This is normal for non-distributed deployments
+             pass
+         else:
+             if console:
+                 console.print(
+                     f"[red]Error:[/red] Failed to delete headless service {name}-headless: {e}"
+                 )
+
+
+ def delete_raycluster(
+     custom_api: client.CustomObjectsApi,
+     core_api: client.CoreV1Api,
+     name: str,
+     namespace: str,
+     console: "Console" = None,
+     force: bool = False,
+ ):
+     """Delete a RayCluster and its associated service."""
+
+     grace_period_seconds, propagation_policy = None, None
+     if force:
+         grace_period_seconds = 0
+         propagation_policy = "Foreground"
+
+     try:
+         # Delete the RayCluster
+         custom_api.delete_namespaced_custom_object(
+             group="ray.io",
+             version="v1",
+             namespace=namespace,
+             plural="rayclusters",
+             name=name,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted RayCluster [blue]{name}[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             if console:
+                 console.print(
+                     f"[yellow]Note:[/yellow] RayCluster {name} not found or already deleted"
+                 )
+         else:
+             if console:
+                 console.print(
+                     f"[red]Error:[/red] Failed to delete RayCluster {name}: {e}"
+                 )
+
+     # Delete the associated service (created alongside RayCluster)
+     try:
+         core_api.delete_namespaced_service(
+             name=name,
+             namespace=namespace,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted service [blue]{name}[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             if console:
+                 console.print(
+                     f"[yellow]Note:[/yellow] Service {name} not found or already deleted"
+                 )
+         else:
+             if console:
+                 console.print(f"[red]Error:[/red] Failed to delete service {name}: {e}")
+
+     # Delete the headless service for Ray pod discovery
+     try:
+         core_api.delete_namespaced_service(
+             name=f"{name}-headless",
+             namespace=namespace,
+             grace_period_seconds=grace_period_seconds,
+             propagation_policy=propagation_policy,
+         )
+         if console:
+             console.print(f"✓ Deleted headless service [blue]{name}-headless[/blue]")
+     except ApiException as e:
+         if e.status == 404:
+             # This is normal for older Ray clusters without headless services
+             pass
+         else:
+             if console:
+                 console.print(
+                     f"[red]Error:[/red] Failed to delete headless service {name}-headless: {e}"
+                 )
+
+
+ def delete_resources_for_service(
+     core_api: client.CoreV1Api,
+     custom_api: client.CustomObjectsApi,
+     configmaps: List[str],
+     name: str,
+     service_type: str = "knative",
+     namespace: str = None,
+     console: "Console" = None,
+     force: bool = False,
+ ):
+     """Delete service resources based on service type."""
+     # Delete the main service (Knative, Deployment, or RayCluster)
+     if service_type == "deployment":
+         apps_v1_api = client.AppsV1Api()
+         delete_deployment(
+             apps_v1_api=apps_v1_api,
+             core_api=core_api,
+             name=name,
+             namespace=namespace,
+             console=console,
+             force=force,
+         )
+     elif service_type == "raycluster":
+         delete_raycluster(
+             custom_api=custom_api,
+             core_api=core_api,
+             name=name,
+             namespace=namespace,
+             console=console,
+             force=force,
+         )
+     else:  # knative or unknown - try deleting as Knative service
+         delete_service(
+             custom_api=custom_api,
+             name=name,
+             namespace=namespace,
+             console=console,
+             force=force,
+         )
+
+     # Delete configmaps
+     if configmaps:
+         delete_configmaps(
+             core_api=core_api,
+             configmaps=configmaps,
+             namespace=namespace,
+             console=console,
+             force=force,
+         )
+
+     delete_cached_service_data(
+         core_api=core_api, service_name=name, namespace=namespace, console=console
+     )
+
+
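+ # Execs into the shared kubetorch-rsync pod and removes /data/{namespace}/{service}.
+ # Uses the in-cluster exec API when running inside Kubernetes, and shells out to
+ # kubectl otherwise.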
+ def delete_cached_service_data(
+     core_api: client.CoreV1Api,
+     service_name: str,
+     namespace: str,
+     console: "Console" = None,
+ ):
+     """Delete service data from the rsync pod."""
+     try:
+         # Find the rsync pod name in the provided namespace
+         pods = core_api.list_namespaced_pod(
+             namespace=namespace, label_selector="app=kubetorch-rsync"
+         )
+
+         if not pods.items:
+             if console:
+                 console.print(
+                     f"[yellow]No rsync pod found in namespace {namespace}[/yellow]"
+                 )
+             return
+
+         pod_name = pods.items[0].metadata.name
+         service_path = f"/data/{namespace}/{service_name}"
+
+         shell_cmd = (
+             f"if [ -d '{service_path}' ]; then rm -rf '{service_path}' && echo 'Deleted {service_path}'; "
+             f"else echo 'Path {service_path} not found'; fi"
+         )
+
+         # Execute command based on environment
+         if is_running_in_kubernetes():
+             response = stream(
+                 core_api.connect_get_namespaced_pod_exec,
+                 name=pod_name,
+                 namespace=namespace,
+                 command=["sh", "-c", shell_cmd],
+                 stderr=True,
+                 stdin=False,
+                 stdout=True,
+                 tty=False,
+             )
+             output = response.strip()
+
+         else:
+             cmd = [
+                 "kubectl",
+                 "exec",
+                 "-n",
+                 namespace,
+                 pod_name,
+                 "--",
+                 "sh",
+                 "-c",
+                 shell_cmd,
+             ]
+             result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
+
+             if result.returncode != 0:
+                 if console:
+                     console.print(
+                         f"[red]Error cleaning up cached data: {result.stderr}[/red]"
+                     )
+                 return
+             output = result.stdout.strip()
+
+         if console:
+             if "Deleted" in output:
+                 console.print(f"✓ Deleted cached data for [blue]{service_name}[/blue]")
+
+     except subprocess.TimeoutExpired:
+         if console:
+             console.print("[red]Timeout while cleaning up cached service data[/red]")
+         else:
+             logger.debug("Timeout while cleaning up cached data")
+
+     except Exception as e:
+         if console:
+             console.print(f"[red]Failed to clean up cached service data: {e}[/red]")
+         else:
+             logger.debug(f"Failed to clean up cached data: {e}")
+
+
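+ # Accepts targets of the form "path/to/file.py:obj", "pkg.module:obj", or a bare
+ # module/file, in which case every module-level object decorated with @kt.compute
+ # (i.e. every Module instance) is collected for deployment.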
+ def _collect_modules(target_str):
+     from kubetorch.resources.callables.module import Module
+
+     to_deploy = []
+
+     if ":" in target_str:
+         target_module_or_path, target_fn_or_class = target_str.split(":")
+     else:
+         target_module_or_path, target_fn_or_class = target_str, None
+
+     if target_module_or_path.endswith(".py"):
+         abs_path = Path(target_module_or_path).resolve()
+         python_module_name = inspect.getmodulename(str(abs_path))
+
+         sys.path.insert(0, str(abs_path.parent))
+     else:
+         python_module_name = target_module_or_path
+         sys.path.append(".")
+
+     module = importlib.import_module(python_module_name)
+
+     if target_fn_or_class:
+         if not hasattr(module, target_fn_or_class):
+             raise ValueError(
+                 f"Function or class {target_fn_or_class} not found in {target_module_or_path}."
+             )
+         to_deploy = [getattr(module, target_fn_or_class)]
+         if not isinstance(to_deploy[0], Module):
+             raise ValueError(
+                 f"Function or class {target_fn_or_class} in {target_module_or_path} is not decorated with @kt.compute."
+             )
+     else:
+         # Get all functions and classes to deploy
+         for name in dir(module):
+             obj = getattr(module, name)
+             if isinstance(obj, Module):
+                 to_deploy.append(obj)
+         if not to_deploy:
+             raise ValueError(
+                 f"No functions or classes decorated with @kt.compute found in {target_module_or_path}."
+             )
+
+     return to_deploy, target_fn_or_class
+
+
+ def fetch_resources_for_teardown(
+     namespace: str,
+     target: str,
+     core_api: client.CoreV1Api,
+     custom_api: client.CustomObjectsApi,
+     prefix: Optional[str] = None,
+     username: Optional[str] = None,
+ ) -> dict:
+     """Fetches the resources for a given service.
+
+     Returns a dictionary with the following keys:
+     - services: {
+         [service_name]: {
+             "configmaps": List[str],
+             "pods": List[str],
+             "type": str,  # "knative", "deployment", "raycluster", or "unknown"
+         }
+       }
+     """
+     from kubetorch.resources.callables.module import Module
+
+     resources = {"services": {}}
+     services = []
+
+     if prefix in ["kt", "kubetorch", "knative"]:
+         raise ValueError(
+             f"Invalid prefix: {prefix} is reserved. Please delete these individually."
+         )
+
+     # Initialize apps API for deployments
+     apps_v1_api = client.AppsV1Api()
+
+     if username or prefix:
+         # Search Knative services
+         try:
+             # Build label selector for Knative services - use template label to identify kubetorch services
+             knative_label_selector = f"{serving_constants.KT_TEMPLATE_LABEL}=ksvc"
+             if username:
+                 knative_label_selector += f",{KT_USERNAME_LABEL}={username}"
+
+             response = custom_api.list_namespaced_custom_object(
+                 group="serving.knative.dev",
+                 version="v1",
+                 namespace=namespace,
+                 plural="services",
+                 label_selector=knative_label_selector,
+             )
+             items = response.get("items", [])
+             knative_services = [
+                 item["metadata"]["name"]
+                 for item in items
+                 if (username or item["metadata"]["name"].startswith(prefix))
+             ]
+             services.extend(knative_services)
+         except client.exceptions.ApiException as e:
+             if e.status != 404:  # Ignore if Knative is not installed
+                 logger.warning(f"Failed to list Knative services: {e}")
+
+         # Search Deployments
+         try:
+             # Build label selector for deployments - use KT_TEMPLATE_LABEL to identify kubetorch deployments
+             deployment_label_selector = (
+                 f"{serving_constants.KT_TEMPLATE_LABEL}=deployment"
+             )
+             if username:
+                 deployment_label_selector += f",{KT_USERNAME_LABEL}={username}"
+
+             deployments_response = apps_v1_api.list_namespaced_deployment(
+                 namespace=namespace,
+                 label_selector=deployment_label_selector,
+             )
+             deployment_services = [
+                 deployment.metadata.name
+                 for deployment in deployments_response.items
+                 if (username or deployment.metadata.name.startswith(prefix))
+             ]
+             services.extend(deployment_services)
+         except client.exceptions.ApiException as e:
+             logger.warning(f"Failed to list Deployments: {e}")
+
+         # Search RayClusters
+         try:
+             # Build label selector for rayclusters - use template label to identify kubetorch rayclusters
+             raycluster_label_selector = (
+                 f"{serving_constants.KT_TEMPLATE_LABEL}=raycluster"
+             )
+             if username:
+                 raycluster_label_selector += f",{KT_USERNAME_LABEL}={username}"
+
+             response = custom_api.list_namespaced_custom_object(
+                 group="ray.io",
+                 version="v1",
+                 namespace=namespace,
+                 plural="rayclusters",
+                 label_selector=raycluster_label_selector,
+             )
+             items = response.get("items", [])
+             raycluster_services = [
+                 item["metadata"]["name"]
+                 for item in items
+                 if (username or item["metadata"]["name"].startswith(prefix))
+             ]
+             services.extend(raycluster_services)
+         except client.exceptions.ApiException as e:
+             if e.status != 404:  # Ignore if Ray operator is not installed
+                 logger.warning(f"Failed to list RayClusters: {e}")
+
+     else:
+         if not target:
+             raise ValueError(
+                 "Please provide a service name or use the --all or --prefix flags"
+             )
+
+         # Case when service_name is a module or file path (i.e. the `kt deploy` usage path)
+         if ":" in target or ".py" in target or "." in target:
+             to_down, _ = _collect_modules(target)
+             services = [mod.service_name for mod in to_down if isinstance(mod, Module)]
+         else:
+             services = [target]
+
+     for service_name in services:
+         service_type = None
+         service_found = False
+
+         # Check if it's a Knative service
+         try:
+             service = custom_api.get_namespaced_custom_object(
+                 group="serving.knative.dev",
+                 version="v1",
+                 namespace=namespace,
+                 plural="services",
+                 name=service_name,
+             )
+             if service:
+                 service_type = "knative"
+                 service_found = True
+         except client.exceptions.ApiException:
+             pass
+
+         # Check if it's a Deployment (if not found as Knative service)
+         if not service_found:
+             try:
+                 deployment = apps_v1_api.read_namespaced_deployment(
+                     name=service_name, namespace=namespace
+                 )
+                 # Only consider it if it has the kubetorch template label
+                 if (
+                     deployment.metadata.labels
+                     and deployment.metadata.labels.get(
+                         serving_constants.KT_TEMPLATE_LABEL
+                     )
+                     == "deployment"
+                 ):
+                     service_type = "deployment"
+                     service_found = True
+             except client.exceptions.ApiException:
+                 pass
+
+         # Check if it's a RayCluster (if not found as Knative or Deployment)
+         if not service_found:
+             try:
+                 raycluster = custom_api.get_namespaced_custom_object(
+                     group="ray.io",
+                     version="v1",
+                     namespace=namespace,
+                     plural="rayclusters",
+                     name=service_name,
+                 )
+                 if raycluster:
+                     service_type = "raycluster"
+                     service_found = True
+             except client.exceptions.ApiException:
+                 pass
+
+         # Get associated resources if service exists
+         configmaps = load_configmaps(core_api, service_name, namespace)
+         pods = core_api.list_namespaced_pod(
+             namespace=namespace, label_selector=f"{KT_SERVICE_LABEL}={service_name}"
+         )
+         pods = [pod.metadata.name for pod in pods.items]
+
+         # Only add the service to the resources if it has configmaps, pods, or we found the service
+         if service_found or configmaps or pods:
+             resources["services"][service_name] = {
+                 "configmaps": configmaps,
+                 "pods": pods,
+                 "type": service_type or "unknown",
+             }
+
+     return resources
+
+
+ # ----------------- Image Builder Utils ----------------- #
+ def _get_sync_package_paths(
+     package: str,
+ ):
+     if "/" in package or "~" in package:
+         package_path = (
+             Path(package).expanduser()
+             if Path(package).expanduser().is_absolute()
+             else Path(locate_working_dir()) / package
+         )
+         dest_dir = str(package_path.name)
+     else:
+         package_path = get_local_install_path(package)
+         dest_dir = package
+
+     if not (package_path and Path(package_path).exists()):
+         raise ValueError(f"Could not locate local package {package}")
+
+     full_path = Path(package_path).expanduser().resolve()
+     return str(full_path), dest_dir
+
+
+ # ----------------- Error Handling Utils ----------------- #
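+ # Inspects PodScheduled conditions and container waiting states, raising
+ # QueueUnschedulableError, ResourceNotAvailableError, or one of the
+ # TERMINATE_EARLY_ERRORS exceptions so that readiness waits can fail fast.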
+ def check_pod_status_for_errors(
+     pod: client.V1Pod, queue_name: str = None, scheduler_name: str = None
+ ):
+     """Check pod status for errors"""
+     # Check for scheduling issues
+     for condition in pod.status.conditions or []:
+         if (
+             condition.type == "PodScheduled"
+             and condition.status == "False"
+             and condition.reason == "Unschedulable"
+         ):
+             msg = condition.message.lower()
+
+             # Check if the pod is scheduled in the correct queue and scheduler
+             if queue_name and scheduler_name:
+                 scheduler = (pod.metadata.annotations or {}).get("schedulerName", "")
+                 queue_label = (pod.metadata.labels or {}).get("kai.scheduler/queue")
+                 if queue_label == queue_name and scheduler == scheduler_name:
+                     raise QueueUnschedulableError(
+                         f"Pod {pod.metadata.name} could not be scheduled: {condition.message}"
+                     )
+
+             # Check for specific node selector/affinity/GPU type mismatches
+             # without matching temporary resource exhaustion messages
+             if any(
+                 x in msg
+                 for x in [
+                     "node selector not matched",
+                     "node affinity mismatch",
+                     "unsupported gpu type",
+                     "unknown instance type",
+                 ]
+             ):
+                 raise ResourceNotAvailableError(
+                     f"Required compute resources are not configured in the cluster: {condition.message}"
+                 )
+
+     # Check for container status errors
+     if pod.status.container_statuses:
+         for container_status in pod.status.container_statuses:
+             if container_status.state and container_status.state.waiting:
+                 reason = container_status.state.waiting.reason
+                 message = container_status.state.waiting.message or ""
+                 if reason in TERMINATE_EARLY_ERRORS:
+                     raise TERMINATE_EARLY_ERRORS[reason](
+                         f"Pod {pod.metadata.name}: {message}"
+                     )
+
+
+ def check_pod_events_for_errors(
+     pod: client.V1Pod, namespace: str, core_api: client.CoreV1Api
+ ):
+     """Check pod events for scheduling errors"""
+     try:
+         events = core_api.list_namespaced_event(
+             namespace=namespace,
+             field_selector=f"involvedObject.name={pod.metadata.name}",
+         ).items
+         for event in events:
+             # Check for Karpenter scheduling errors
+             if (
+                 event.reason == "FailedScheduling"
+                 and event.source.component == "karpenter"
+                 and "no instance type has enough resources" in event.message
+             ):
+                 raise ResourceNotAvailableError(
+                     f"Pod {pod.metadata.name} failed to schedule: {event.message}"
+                 )
+     except client.exceptions.ApiException as e:
+         logger.warning(f"Error fetching events for pod {pod.metadata.name}: {e}")
+
+
+ def check_replicaset_events_for_errors(
+     namespace: str,
+     service_name: str,
+     apps_v1_api: client.AppsV1Api,
+     core_api: client.CoreV1Api,
+ ):
+     """Check ReplicaSet events for creation errors like missing PriorityClass.
+
+     Args:
+         namespace: Namespace the service's ReplicaSets live in
+         service_name: Name of the service
+         apps_v1_api: Apps API instance
+         core_api: Core API instance
+
+     Raises:
+         ResourceNotAvailableError: If ReplicaSet creation fails due to missing resources
+     """
+     try:
+         # Get ReplicaSets associated with this Deployment
+         replicasets = apps_v1_api.list_namespaced_replica_set(
+             namespace=namespace,
+             label_selector=f"kubetorch.com/service={service_name}",
+         ).items
+
+         for replicaset in replicasets:
+             # Check ReplicaSet events for FailedCreate errors
+             events = core_api.list_namespaced_event(
+                 namespace=namespace,
+                 field_selector=f"involvedObject.name={replicaset.metadata.name}",
+             ).items
+
+             for event in events:
+                 if (
+                     event.reason == "FailedCreate"
+                     and event.type == "Warning"
+                     and "forbidden" in event.message.lower()
+                 ):
+                     # Check for specific PriorityClass errors
+                     if "priorityclass" in event.message.lower():
+                         raise ResourceNotAvailableError(
+                             f"ReplicaSet {replicaset.metadata.name} failed to create pods: "
+                             f"{event.message}. Please ensure the required PriorityClass exists in the cluster."
+                         )
+                     # Check for other forbidden errors
+                     elif any(
+                         error_type in event.message.lower()
+                         for error_type in [
+                             "forbidden",
+                             "no priorityclass",
+                             "priority class",
+                         ]
+                     ):
+                         raise ResourceNotAvailableError(
+                             f"ReplicaSet {replicaset.metadata.name} failed to create pods: "
+                             f"{event.message}. Please check cluster configuration and permissions."
+                         )
+
+     except client.exceptions.ApiException as e:
+         logger.warning(f"Error checking ReplicaSet events for {service_name}: {e}")
+     except ResourceNotAvailableError:
+         # Re-raise ResourceNotAvailableError to stop the readiness check
+         raise
+
+
1001
+ def check_revision_for_errors(
1002
+ revision_name: str, namespace: str, objects_api: client.CustomObjectsApi
1003
+ ):
1004
+ """Check revision for errors"""
1005
+ try:
1006
+ revision = objects_api.get_namespaced_custom_object(
1007
+ group="serving.knative.dev",
1008
+ version="v1",
1009
+ namespace=namespace,
1010
+ plural="revisions",
1011
+ name=revision_name,
1012
+ )
1013
+ for cond in revision.get("status", {}).get("conditions", []):
1014
+ if cond["status"] == "False":
1015
+ reason = cond.get("reason")
1016
+ message = cond.get("message", f"Revision failed with reason: {reason}")
1017
+ if reason in TERMINATE_EARLY_ERRORS:
1018
+ raise TERMINATE_EARLY_ERRORS[reason](
1019
+ f"Revision {revision_name}: {message}"
1020
+ )
1021
+ except client.exceptions.ApiException as e:
1022
+ logger.warning(f"Error checking revision: {e}")
1023
+
1024
+
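+ # connect_ex returns 0 when something is already listening on the port, so a nonzero
+ # result means the port is free to bind.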
+ def is_port_available(port: int) -> bool:
+     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+         return s.connect_ex(("localhost", port)) != 0
+
+
+ def find_available_port(start_port: int, max_tries: int = 10) -> int:
+     for i in range(max_tries):
+         port = start_port + i
+         if is_port_available(port):
+             return port
+     raise RuntimeError(f"Could not find available port starting from {start_port}")
+
+
+ # --------------- Secrets utils ---------------------------
+
+
+ def get_parsed_secret(secret: client.V1Secret):
+     labels = secret.metadata.labels
+     secret = {
+         "name": secret.metadata.name,
+         "username": labels.get("kubetorch.com/username", None) if labels else None,
+         "namespace": secret.metadata.namespace,
+         "user_defined_name": labels.get("kubetorch.com/secret-name", None)
+         if labels
+         else None,
+         "labels": labels,
+         "annotations": secret.metadata.annotations,
+         "type": secret.type,
+         "data": secret.data,
+     }
+     return secret
+
+
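+ # Only secrets created through the kt API carry the kubetorch.com/secret-name label;
+ # everything else is skipped before the prefix/creator filters below are applied.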
+ def list_secrets(
+     core_api: client.CoreV1Api,
+     namespace: str = "default",
+     prefix: str = None,
+     all_namespaces: bool = False,
+     filter_by_creator: bool = True,
+     console: "Console" = None,
+ ):
+     try:
+         if all_namespaces:
+             secrets: client.V1SecretList = core_api.list_secret_for_all_namespaces()
+         else:
+             secrets: client.V1SecretList = core_api.list_namespaced_secret(
+                 namespace=namespace
+             )
+         if not secrets:
+             return None
+         filtered_secrets = []
+         for secret in secrets.items:
+             parsed_secret = get_parsed_secret(secret)
+             user_defined_secret_name = parsed_secret.get("user_defined_name")
+             if (
+                 user_defined_secret_name
+             ):  # only consider secrets that were created by the kt API, via the username set in kt.config
+                 if prefix and filter_by_creator:  # filter secrets by prefix + creator
+                     if (
+                         parsed_secret.get("user_defined_name").startswith(prefix)
+                         and parsed_secret.get("username")
+                         == kubetorch.globals.config.username
+                     ):
+                         filtered_secrets.append(parsed_secret)
+                 elif prefix:  # filter secrets by prefix
+                     if parsed_secret.get("user_defined_name").startswith(prefix):
+                         filtered_secrets.append(parsed_secret)
+                 elif filter_by_creator:  # filter secrets by creator
+                     if (
+                         parsed_secret.get("username")
+                         == kubetorch.globals.config.username
+                     ):
+                         filtered_secrets.append(parsed_secret)
+                 else:  # No additional filters required
+                     filtered_secrets.append(parsed_secret)
+         return filtered_secrets
+
+     except client.rest.ApiException as e:
+         if console:
+             console.print(f"[red]Failed to load secrets: {e}[/red]")
+         return None
+
+
+ def delete_secrets(
+     secrets: List[str],
+     secrets_client: KubernetesSecretsClient,
+     console: "Console" = None,
+ ):
+     """Delete the given list of secrets."""
+     for secret in secrets:
+         secrets_client.delete_secret(secret, console=console)