kubetorch 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kubetorch might be problematic. Click here for more details.

Files changed (93)
  1. kubetorch/__init__.py +60 -0
  2. kubetorch/cli.py +1985 -0
  3. kubetorch/cli_utils.py +1025 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +285 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +157 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +133 -0
  30. kubetorch/resources/callables/module.py +1416 -0
  31. kubetorch/resources/callables/utils.py +174 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +261 -0
  34. kubetorch/resources/compute/compute.py +2596 -0
  35. kubetorch/resources/compute/decorators.py +139 -0
  36. kubetorch/resources/compute/rbac.py +74 -0
  37. kubetorch/resources/compute/utils.py +1114 -0
  38. kubetorch/resources/compute/websocket.py +137 -0
  39. kubetorch/resources/images/__init__.py +1 -0
  40. kubetorch/resources/images/image.py +414 -0
  41. kubetorch/resources/images/images.py +74 -0
  42. kubetorch/resources/secrets/__init__.py +2 -0
  43. kubetorch/resources/secrets/kubernetes_secrets_client.py +412 -0
  44. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  45. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  46. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  47. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  48. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  49. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  50. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  51. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  52. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  53. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  54. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  55. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  56. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  57. kubetorch/resources/secrets/provider_secrets/providers.py +93 -0
  58. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  59. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  60. kubetorch/resources/secrets/secret.py +238 -0
  61. kubetorch/resources/secrets/secret_factory.py +70 -0
  62. kubetorch/resources/secrets/utils.py +209 -0
  63. kubetorch/resources/volumes/__init__.py +0 -0
  64. kubetorch/resources/volumes/volume.py +365 -0
  65. kubetorch/servers/__init__.py +0 -0
  66. kubetorch/servers/http/__init__.py +0 -0
  67. kubetorch/servers/http/distributed_utils.py +3223 -0
  68. kubetorch/servers/http/http_client.py +730 -0
  69. kubetorch/servers/http/http_server.py +1788 -0
  70. kubetorch/servers/http/server_metrics.py +278 -0
  71. kubetorch/servers/http/utils.py +728 -0
  72. kubetorch/serving/__init__.py +0 -0
  73. kubetorch/serving/autoscaling.py +173 -0
  74. kubetorch/serving/base_service_manager.py +363 -0
  75. kubetorch/serving/constants.py +83 -0
  76. kubetorch/serving/deployment_service_manager.py +478 -0
  77. kubetorch/serving/knative_service_manager.py +519 -0
  78. kubetorch/serving/raycluster_service_manager.py +582 -0
  79. kubetorch/serving/service_manager.py +18 -0
  80. kubetorch/serving/templates/deployment_template.yaml +17 -0
  81. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  82. kubetorch/serving/templates/kt_setup_template.sh.j2 +81 -0
  83. kubetorch/serving/templates/pod_template.yaml +194 -0
  84. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  85. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  86. kubetorch/serving/templates/service_template.yaml +21 -0
  87. kubetorch/serving/templates/workerset_template.yaml +36 -0
  88. kubetorch/serving/utils.py +377 -0
  89. kubetorch/utils.py +284 -0
  90. kubetorch-0.2.0.dist-info/METADATA +121 -0
  91. kubetorch-0.2.0.dist-info/RECORD +93 -0
  92. kubetorch-0.2.0.dist-info/WHEEL +4 -0
  93. kubetorch-0.2.0.dist-info/entry_points.txt +5 -0
kubetorch/cli.py ADDED
@@ -0,0 +1,1985 @@
1
+ import base64
2
+ import importlib
3
+ import inspect
4
+ import os
5
+ import signal
6
+ import subprocess
7
+ import sys
8
+ import textwrap
9
+ import time
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+ from typing import List
13
+ from urllib.parse import urlparse
14
+
15
+ import httpx
16
+ from kubernetes.client.rest import ApiException
17
+ from rich.syntax import Syntax
18
+
19
+ from kubetorch.servers.http.utils import is_running_in_kubernetes
20
+
21
+ from .cli_utils import (
22
+ create_table_for_output,
23
+ default_typer_values,
24
+ get_deployment_mode,
25
+ get_ingress_host,
26
+ get_last_updated,
27
+ get_logs_from_loki,
28
+ is_ingress_vpc_only,
29
+ load_ingress,
30
+ load_kubetorch_volumes_for_service,
31
+ port_forward_to_pod,
32
+ SecretAction,
33
+ service_name_argument,
34
+ validate_config_key,
35
+ validate_pods_exist,
36
+ validate_provided_pod,
37
+ VolumeAction,
38
+ )
39
+
40
+ from .utils import initialize_k8s_clients
41
+
42
+ try:
43
+ import typer
44
+
45
+ from rich.console import Console
46
+ from rich.panel import Panel
47
+ from rich.table import Table
48
+ except ImportError:
49
+ raise ImportError(
50
+ "Please install the required CLI dependencies: `pip install 'kubetorch[client] @ <install_url>'`"
51
+ )
52
+
53
+
54
+ import kubetorch.serving.constants as serving_constants
55
+
56
+ from kubetorch import globals
57
+ from kubetorch.config import ENV_MAPPINGS
58
+ from kubetorch.servers.http.utils import DEFAULT_DEBUG_PORT
59
+
60
+ from .constants import BULLET_UNICODE, KT_MOUNT_FOLDER
61
+
62
+ try:
63
+ from .internal.cli import register_internal_commands
64
+
65
+ _INTERNAL_COMMANDS_AVAILABLE = True
66
+ except ImportError:
67
+ _INTERNAL_COMMANDS_AVAILABLE = False
68
+
69
+ from .logger import get_logger
70
+
71
# Root Typer application that every `kt` sub-command below registers on,
# and the shared Rich console used for all user-facing output.
app = typer.Typer(add_completion=False)
console = Console()

# Register internal CLI commands if available
# (the flag is set by the guarded `.internal.cli` import above).
if _INTERNAL_COMMANDS_AVAILABLE:
    register_internal_commands(app)


# Module-level logger for non-console diagnostics.
logger = get_logger(__name__)
80
+
81
+
82
@app.command("check")
def kt_check(
    name: str = service_name_argument(help="Service name"),
    namespace: str = typer.Option(
        globals.config.namespace,
        "-n",
        "--namespace",
    ),
):
    """
    Run a comprehensive health check for a deployed service.

    Checks:

    - Deployment pod comes up and becomes ready (if not scaled to zero)

    - Rsync has succeeded

    - Service is marked as ready and service pod(s) are ready to serve traffic

    - GPU support configured (if applicable)

    - Log streaming configuration (if applicable)

    If a step fails, will dump ``kubectl describe`` and pod logs for relevant pods.
    """
    core_api, custom_api, apps_v1_api = initialize_k8s_clients()

    def dump_pod_debug(pod_name):
        """Print ``kubectl describe`` output and ``kubetorch`` container logs for a pod.

        Best-effort: any error while collecting the debug info is reported
        on the console instead of being raised.
        """
        try:
            describe_proc = subprocess.run(
                ["kubectl", "describe", "pod", pod_name, "-n", namespace],
                check=False,
                capture_output=True,
                text=True,
            )
            describe_output = (
                describe_proc.stdout or describe_proc.stderr or "<no output>"
            )

            logs_proc = subprocess.run(
                ["kubectl", "logs", pod_name, "-n", namespace, "-c", "kubetorch"],
                check=False,
                capture_output=True,
                text=True,
            )
            logs_output = logs_proc.stdout or logs_proc.stderr or "<no output>"

            console.print(
                Panel(
                    describe_output,
                    title=f"POD DESCRIPTION ({pod_name})",
                    border_style="yellow",
                    expand=False,
                )
            )
            console.print(
                Panel(
                    logs_output,
                    title=f"POD LOGS ({pod_name})",
                    border_style="yellow",
                    expand=False,
                )
            )
        except Exception as e:
            console.print(f"[red]Failed to dump pod info: {e}[/red]")

    def fail(msg, pod_names=None):
        """Print ``msg`` in red, dump debug info for ``pod_names`` and exit with code 1.

        Always raises ``typer.Exit``. That exception derives from
        ``RuntimeError``, so ``fail`` must never be called from inside a
        ``try`` body guarded by ``except Exception`` -- the exit would be
        swallowed and reported a second time as a different failure.
        """
        console.print(f"[red]{msg}[/red]")
        if pod_names:
            for pod_name in pod_names:
                dump_pod_debug(pod_name)
        raise typer.Exit(1)

    # Validate service exists and get deployment mode
    name, deployment_mode = get_deployment_mode(
        name, namespace, custom_api, apps_v1_api
    )

    console.print(f"[bold blue]Checking {deployment_mode} service...[/bold blue]")

    # 1. Deployment pod check
    console.print("[bold blue]Checking deployment pod...[/bold blue]")
    deploy_pods = validate_pods_exist(name, namespace, core_api)

    if not deploy_pods:
        if deployment_mode != "knative":
            fail("No Deployment pods found.")

        try:
            # Check if the Knative service is marked as ready (e.g. scaled to zero)
            service = custom_api.get_namespaced_custom_object(
                group="serving.knative.dev",
                version="v1",
                namespace=namespace,
                plural="services",
                name=name,
            )
            conditions = service.get("status", {}).get("conditions", [])
            ready = any(
                c.get("type") == "Ready" and c.get("status") == "True"
                for c in conditions
            )
        except Exception as e:
            fail(f"Failed to check Knative service status: {e}")

        # Decided outside the try block so the exit raised by fail() is not
        # caught by the broad handler above.
        if not ready:
            fail("Deployment pod not found and service is not READY.")

        console.print(
            f"[yellow]No deployment pods found. Service [bold]{name}[/bold] is scaled to zero but marked as 'READY'. "
            "It will scale up on demand.[/yellow]"
        )
        return

    # Pick the first pod that is running and not being deleted.
    deploy_pod = next(
        (
            p
            for p in deploy_pods
            if p.status.phase == "Running" and not p.metadata.deletion_timestamp
        ),
        None,
    )
    if not deploy_pod:
        fail(
            "No deployment pod in 'Running' state found.",
            [p.metadata.name for p in deploy_pods],
        )

    deploy_pod_name = deploy_pod.metadata.name

    # 2. Rsync check
    console.print("[bold blue]Checking rsync...[/bold blue]")
    current_working_dir = "."
    check_cmd = [
        "kubectl",
        "exec",
        deploy_pod_name,
        "-n",
        namespace,
        "--",
        "ls",
        "-l",
        current_working_dir,
    ]
    try:
        result = subprocess.run(
            check_cmd, capture_output=True, text=True, check=True
        )
    except subprocess.CalledProcessError as e:
        fail(
            f"Rsync directory check failed: {e.stderr or e.stdout}",
            [deploy_pod_name],
        )

    # `ls -l` prints a leading "total N" line; anything else is a real entry.
    entries = [
        line for line in result.stdout.splitlines() if not line.startswith("total")
    ]
    if not entries:
        fail("Rsync directory exists but is empty.", [deploy_pod_name])

    # 3. Service call check
    console.print("[bold blue]Checking service call...[/bold blue]")
    try:
        with port_forward_to_pod(
            pod_name=deploy_pod_name,
            namespace=namespace,
            local_port=32300,
            remote_port=32300,
        ) as local_port:
            url = f"http://localhost:{local_port}/health"
            resp = httpx.get(url, timeout=10)
    except Exception as e:
        fail(f"Service call check failed: {e}", [deploy_pod_name])

    # httpx.get() has already read the full body, so the response can be
    # inspected after the port-forward is closed.
    if not resp.is_success:
        fail(
            f"Service call failed: {resp.status_code} {resp.text}",
            [deploy_pod_name],
        )

    # 4. GPU + autoscaler test (if GPU requested)
    gpu_requested = any(
        c.resources.limits and "nvidia.com/gpu" in c.resources.limits
        for c in deploy_pod.spec.containers
    )
    if gpu_requested:
        console.print("[bold blue]Checking GPU plugin support...[/bold blue]")
        gpus_configured = False
        for node in core_api.list_node().items:
            # Guard against nodes that report no capacity at all.
            gpus = (node.status.capacity or {}).get("nvidia.com/gpu")
            if gpus and int(gpus) > 0:
                gpus_configured = True
                break

        if not gpus_configured:
            console.print(
                "[yellow]No GPU nodes currently configured on the cluster, is autoscaling configured?[/yellow]"
            )

        dcgm_namespace = globals.config.install_namespace
        dcgm_pods = core_api.list_namespaced_pod(
            namespace=dcgm_namespace,
            label_selector="app.kubernetes.io/name=dcgm-exporter",
        ).items
        if not dcgm_pods:
            console.print(
                f"[yellow]DCGM exporter not found in namespace {dcgm_namespace}[/yellow]"
            )

    # 5. Check logs
    if globals.config.stream_logs:
        try:
            # Log streaming counts as enabled when the Loki gateway service exists.
            streaming_enabled = core_api.read_namespaced_service(
                name=serving_constants.LOKI_GATEWAY_SERVICE_NAME,
                namespace=globals.config.install_namespace,
            )
        except ApiException:
            streaming_enabled = False

        if streaming_enabled:
            console.print("[bold blue]Checking log streaming...[/bold blue]")
            query = f'{{k8s_pod_name="{deploy_pod_name}", k8s_container_name="kubetorch"}}'
            try:
                logs = get_logs_from_loki(
                    query=query, print_pod_name=False, timeout=5.0
                )
            except Exception as e:
                fail(f"Logs check failed: {e}", [deploy_pod_name])

            if logs is None:
                fail("No logs found for service", [deploy_pod_name])

    console.print("[bold green]✓ All service checks passed[/bold green]")
330
+
331
+
332
@app.command("config")
def kt_config(
    action: str = typer.Argument(
        default="", help="Action to perform (set, unset, get, list)"
    ),
    key: str = typer.Argument(
        None, help="Config key (e.g., 'username')", callback=validate_config_key
    ),
    value: str = typer.Argument(None, help="Value to set"),
):
    """Manage Kubetorch configuration settings.

    Examples:

    .. code-block:: bash

        $ kt config set username johndoe

        $ kt config set volumes "volume_name_one, volume_name_two"

        $ kt config set volumes volume_name_one

        $ kt config unset username

        $ kt config get username

        $ kt config list
    """
    from kubetorch import config

    if action == "set":
        if not key or not value:
            console.print("[red]Both key and value are required for 'set'[/red]")
            raise typer.Exit(1)

        try:
            # config.set returns the value that is echoed back to the user.
            value = config.set(key, value)
            config.write()
            console.print(f"[green]{key} set to:[/green] [blue]{value}[/blue]")
        except ValueError as e:
            console.print(f"[red]Error setting {key}:[/red] {str(e)}")
            raise typer.Exit(1)

    elif action == "unset":
        if not key:
            console.print("[red]Key is required for 'unset'[/red]")
            raise typer.Exit(1)

        try:
            # Persist the removal first, then clear the in-memory value.
            config.write({key: None})
            config.set(key, None)
            console.print(f"[green]{key.capitalize()} unset[/green]")
        except ValueError as e:
            console.print(f"[red]Error unsetting {key}:[/red] {str(e)}")
            raise typer.Exit(1)

    elif action == "get":
        if not key:
            console.print("[red]Key is required for 'get'[/red]")
            raise typer.Exit(1)

        if key not in ENV_MAPPINGS:
            console.print(f"[red]Unknown config key:[/red] [bold]{key}[/bold]")
            raise typer.Exit(1)

        value = config.get(key)
        if value:
            console.print(f"[blue]{value}[/blue]")
        else:
            console.print(f"[yellow]{key.capitalize()} not set[/yellow]")

    elif action == "list" or not action:
        # No action at all behaves like `list`.
        console.print(dict(config))

    else:
        console.print(f"[red]Unknown action:[/red] [bold]{action}[/bold]")
        # Keep this list in sync with the branches above (it previously omitted `unset`).
        console.print("\nValid actions are: set, unset, get, list")
        raise typer.Exit(1)
411
+
412
+
413
@app.command("debug")
def kt_debug(
    pod: str = typer.Argument(..., help="Pod name"),
    namespace: str = typer.Option(
        globals.config.namespace,
        "-n",
        "--namespace",
    ),
    port: int = typer.Option(
        DEFAULT_DEBUG_PORT, help="Debug port used for remote debug server"
    ),
):
    """Start an interactive debugging session on the pod, which will connect to the debug server inside the service.
    Before running this command, you must call a method on the service with pdb=True or add a
    kt.deep_breakpoint() call into your code to enable debugging.
    """
    import webbrowser

    # The debug UI is opened in a local browser, so refuse to run in-cluster.
    if is_running_in_kubernetes():
        console.print(
            "[red]Debugging is not supported when running inside Kubernetes. Please run this command locally.[/red]"
        )
        raise typer.Exit(1)

    # The same port number is used locally and on the pod.
    ui_url = f"http://localhost:{port}"

    # Use the base path of web-pdb server as health endpoint because we're port-forwarding straight into the pod
    forward = port_forward_to_pod(
        pod_name=pod,
        namespace=namespace,
        local_port=port,
        remote_port=port,
        health_endpoint="/",
    )
    with forward:
        console.print(f"Opening debug UI at [blue]{ui_url}[/blue]")
        webbrowser.open(ui_url)
        console.print(
            "[yellow]Press Ctrl+C to stop the debugging session and close the UI.[/yellow]"
        )
        # Keep the port-forward alive until the user interrupts with Ctrl+C.
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            console.print("\n[yellow]Debugging session ended.[/yellow]")
            raise typer.Exit(0)
459
+
460
+ # Not every environment supports a UI. We should implement a more pdb like solution like this:
461
+ # https://github.com/ray-project/ray/pull/11739/files
462
+ # and then connect to it over a pty and kc port-forward instead of telnet.
463
+
464
+ # # Create a pseudo-terminal pair for bidirectional communication
465
+ # # leader: used by this process to communicate with the remote process
466
+ # # follower: used by the remote process (kubectl exec)
467
+ # leader, follower = pty.openpty()
468
+ #
469
+ # # Start the kubectl exec process, connecting it to the follower end of the pty
470
+ # # This allows the remote process to behave as if it's running in a real terminal
471
+ # process = subprocess.Popen(
472
+ # ssh_command,
473
+ # stdin=follower,
474
+ # stdout=follower,
475
+ # stderr=follower,
476
+ # close_fds=True, # Prevent file descriptor leaks
477
+ # )
478
+ #
479
+ # # Close the follower end in the parent process since we only need the leader
480
+ # # The child process (kubectl exec) will still have access to the follower
481
+ # os.close(follower)
482
+ #
483
+ # try:
484
+ # import select
485
+ # import sys
486
+ #
487
+ # import termios
488
+ # import tty
489
+ #
490
+ # # Save the original terminal settings so we can restore them later
491
+ # # This is important because we'll be putting the terminal in raw mode
492
+ # old_settings = termios.tcgetattr(sys.stdin)
493
+ #
494
+ # try:
495
+ # # Put the terminal in raw mode to disable line buffering and echo
496
+ # # This allows us to send keystrokes immediately to the remote pdb process
497
+ # # without waiting for Enter key or having them echoed locally
498
+ # tty.setraw(sys.stdin.fileno())
499
+ #
500
+ # # Main communication loop - continuously monitor both input and output
501
+ # while True:
502
+ # # Use select to check for data available on stdin (user input) or the pty (remote output)
503
+ # # This allows us to handle both directions of communication simultaneously
504
+ # rlist, _, _ = select.select([sys.stdin, leader], [], [], 0.1)
505
+ #
506
+ # for fd in rlist:
507
+ # if fd == sys.stdin:
508
+ # # User typed something - read it character by character and send to remote process
509
+ # # Reading character by character is necessary for interactive applications like pdb
510
+ # char = sys.stdin.read(1)
511
+ # if char:
512
+ # os.write(leader, char.encode())
513
+ # elif fd == leader:
514
+ # # Remote process (pdb) sent output - read it and display locally
515
+ # try:
516
+ # output = os.read(leader, 1024)
517
+ # if output:
518
+ # # Display the remote output in real-time
519
+ # sys.stdout.write(output.decode())
520
+ # sys.stdout.flush()
521
+ # else:
522
+ # # No more output means the remote process has ended
523
+ # return
524
+ # except OSError:
525
+ # # Process has ended or connection was closed
526
+ # return
527
+ #
528
+ # finally:
529
+ # # Always restore the original terminal settings when we're done
530
+ # # This ensures the terminal is left in a usable state
531
+ # termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
532
+ #
533
+ # except KeyboardInterrupt:
534
+ # print("\nInterrupted by user.")
535
+ # finally:
536
+ # # Clean up resources
537
+ # os.close(leader)
538
+ # process.terminate() # Send SIGTERM to the kubectl process
539
+ # process.wait() # Wait for the process to actually terminate
540
+
541
+
542
@app.command("deploy")
def kt_deploy(
    target: str = typer.Argument(
        ...,
        help="Python module or file to deploy, optionally followed by a "
        "single function or class to deploy. e.g. `my_module:my_cls`, or "
        "`my_file.py`.",
    ),
    # TODO
    # get_if_exists: bool = typer.Option(
    #     False,
    #     "--get-if-exists",
    #     help="Get the existing service if it exists (based on name) instead of redeploying. Local code changes will not be synced.",
    # ),
    # hard: bool = typer.Option(
    #     False,
    #     "--hard",
    #     help="Fully teardown each service and redeploy fresh.",
    # ),
):
    """Deploy a Python file or module to Kubetorch. This will deploy all functions and modules decorated with
    @kt.compute in the file or module."""
    import asyncio

    from kubetorch.resources.compute.utils import _collect_modules

    to_deploy, target_fn_or_class = _collect_modules(target)

    # Only list what was discovered when no single function/class was named.
    if not target_fn_or_class:
        console.print(
            f"Found the following functions and classes to deploy in {target}:"
        )
        for module in to_deploy:
            console.print(f"{BULLET_UNICODE} {module.name}")

    async def deploy_all_async():
        """Deploy every collected module concurrently and report the outcome."""
        tasks = []
        for module in to_deploy:
            console.print(f"Deploying {module.name}...")
            tasks.append(module.deploy_async())

        try:
            await asyncio.gather(*tasks)
        except Exception as e:
            console.print(f"Failed to deploy one or more modules: {e}")
            # Bare raise re-raises the active exception with its traceback intact.
            raise

        for module in to_deploy:
            console.print(f"Successfully deployed {module.name}.")

    asyncio.run(deploy_all_async())

    if not target_fn_or_class:
        console.print(f"Successfully deployed functions and modules from {target}.")
595
+
596
+
597
def _describe_curl_snippet(url, args_placeholder, host=None):
    """Render the example ``curl`` call for ``url``.

    A ``Host`` header line is emitted only when ``host`` is truthy. The
    snippet is built line by line so adding the header does not depend on
    string-replacing already rendered text.
    """
    payload = f'{{"args": {args_placeholder}, "kwargs": {{}}}}'
    lines = ["curl -X POST \\"]
    if host:
        lines.append(f'  -H "Host: {host}" \\')
    lines.append('  -H "Content-Type: application/json" \\')
    lines.append(f"  -d '{payload}' \\")
    lines.append(f"  {url}")
    return "\n".join(lines) + "\n"


def _describe_python_snippet(url, args_placeholder, host=None):
    """Render the example ``requests`` call for ``url``.

    A ``"Host"`` header entry is emitted only when ``host`` is truthy.
    """
    header_lines = []
    if host:
        header_lines.append(f'    "Host": "{host}",')
    header_lines.append('    "Content-Type": "application/json"')

    lines = [
        "import requests",
        "",
        f'url = "{url}"',
        "headers = {",
        *header_lines,
        "}",
        "data = {",
        f'    "args": {args_placeholder},',
        '    "kwargs": {}',
        "}",
        "",
        "response = requests.post(url, headers=headers, json=data)",
        "print(response.json())",
    ]
    return "\n".join(lines) + "\n"


@app.command("describe")
def kt_describe(
    name: str = service_name_argument(help="Service name"),
    namespace: str = typer.Option(
        globals.config.namespace,
        "-n",
        "--namespace",
    ),
):
    """
    Show basic info for calling the service depending on whether an ingress is configured.
    """

    core_api, custom_api, apps_v1_api = initialize_k8s_clients()

    endpoint_placeholder = "METHOD_OR_CLS_NAME"
    args_placeholder = []

    try:
        name, deployment_mode = get_deployment_mode(
            name, namespace, custom_api, apps_v1_api
        )
    except ApiException:
        console.print(
            f"[red] Failed to load service '{name}' in namespace '{namespace}'[/red]"
        )
        raise typer.Exit(1)

    try:
        console.print()
        # In-cluster DNS address, used whenever no external address is known.
        cluster_local_url = f"http://{name}.{namespace}.svc.cluster.local"
        base_url = globals.config.api_url

        ingress = load_ingress()
        # The Host header is only relevant when requests go through the ingress.
        host = get_ingress_host(ingress) if ingress else None

        if base_url:
            # A configured API URL wins; make sure it carries a scheme.
            if not urlparse(base_url).scheme:
                base_url = f"http://{base_url}"
        elif not ingress:
            console.print(
                "[yellow]No ingress found. Service is only accessible from inside the cluster.[/yellow]"
            )
            base_url = cluster_local_url
        else:
            # Use the first load-balancer entry published on the ingress, if any.
            load_balancer = ingress.status.load_balancer if ingress.status else None
            lb_ing = (
                load_balancer.ingress[0]
                if load_balancer and load_balancer.ingress
                else None
            )
            address = (lb_ing.hostname or lb_ing.ip) if lb_ing else None
            if address:
                base_url = f"http://{address}"
            else:
                console.print(
                    "[yellow]Ingress found but no address, falling back to cluster-local.[/yellow]"
                )
                base_url = cluster_local_url

        if ingress:
            # NOTE(review): this prints the service name under the "Host" label,
            # while the snippets below use the ingress host -- confirm which is intended.
            console.print(f"[bold]Host:[/bold] [green]{name}[/green]")

            if is_ingress_vpc_only(ingress.metadata.annotations):
                console.print()
                console.print(
                    "[yellow]Note: This is a VPC-only ingress (internal access only)[/yellow]"
                )

        console.print()

        if ingress:
            console.print("[bold]Calling the service using an ingress:[/bold]\n")
            # With ingress, use the full path structure
            service_path = f"/{namespace}/{name}/{endpoint_placeholder}"
        else:
            console.print("[bold]Calling the service from inside the cluster:[/bold]\n")
            service_path = f"/{endpoint_placeholder}"

        call_url = f"{base_url}{service_path}"

        curl_code = _describe_curl_snippet(call_url, args_placeholder, host=host)
        console.print(
            Panel(Syntax(curl_code, "bash"), title="With Curl", border_style="green")
        )
        console.print()

        python_code = _describe_python_snippet(call_url, args_placeholder, host=host)
        console.print(
            Panel(
                Syntax(python_code, "python"),
                title="With Python",
                border_style="green",
            )
        )
    except Exception as e:
        console.print(
            f"[red]Failed to describe service {name} in namespace {namespace}: {e}[/red]",
        )
        raise typer.Exit(1)
740
+
741
+
742
+ @app.command("list")
743
+ def kt_list(
744
+ namespace: str = typer.Option(
745
+ globals.config.namespace,
746
+ "-n",
747
+ "--namespace",
748
+ ),
749
+ sort_by_updated: bool = typer.Option(
750
+ False, "-s", "--sort", help="Sort by last update time"
751
+ ),
752
+ tag: str = typer.Option(
753
+ None,
754
+ "-t",
755
+ "--tag",
756
+ help="Service tag or prefix (ex: 'myusername', 'some-git-branch').",
757
+ ),
758
+ ):
759
+ """List all Kubetorch services.
760
+
761
+ Examples:
762
+
763
+ .. code-block:: bash
764
+
765
+ $ kt list
766
+
767
+ $ kt list -t dev-branch
768
+ """
769
+ core_api, custom_api, _ = initialize_k8s_clients()
770
+
771
+ # Import here to avoid circular imports
772
+ from kubetorch.serving.service_manager import BaseServiceManager
773
+
774
+ try:
775
+ # Use unified service discovery
776
+ unified_services = BaseServiceManager.discover_services_static(
777
+ namespace=namespace, name_filter=tag
778
+ )
779
+
780
+ if not unified_services:
781
+ console.print(
782
+ f"[yellow]No services found in {namespace} namespace[/yellow]"
783
+ )
784
+ return
785
+
786
+ # Optional second-level tag filtering
787
+ if tag:
788
+ unified_services = [
789
+ svc
790
+ for svc in unified_services
791
+ if tag in svc["name"]
792
+ or tag
793
+ in " ".join(
794
+ str(v)
795
+ for v in svc["resource"]
796
+ .get("metadata", {})
797
+ .get("labels", {})
798
+ .values()
799
+ )
800
+ ]
801
+ if not unified_services:
802
+ console.print(
803
+ f"[yellow]No services found in {namespace} namespace[/yellow]"
804
+ )
805
+ return
806
+
807
+ if sort_by_updated:
808
+
809
+ def get_update_time(svc):
810
+ # If not a ksvc, use creation timestamp as proxy for update time
811
+ return (
812
+ get_last_updated(svc["resource"])
813
+ if svc["template_type"] == "ksvc"
814
+ else svc["creation_timestamp"]
815
+ )
816
+
817
+ unified_services.sort(key=get_update_time, reverse=True)
818
+
819
+ # Get pod maps
820
+ pod_map = {
821
+ svc["name"]: BaseServiceManager.get_pods_for_service_static(
822
+ svc["name"], namespace, core_api
823
+ )
824
+ for svc in unified_services
825
+ }
826
+
827
+ # Create table
828
+ table_columns = [
829
+ ("SERVICE", "cyan"),
830
+ ("TYPE", "magenta"),
831
+ ("STATUS", "green"),
832
+ ("# OF PODS", "yellow"),
833
+ ("POD NAMES", "red"),
834
+ ("VOLUMES", "blue"),
835
+ ("LAST STATUS CHANGE", "yellow"),
836
+ ("TTL", "yellow"),
837
+ ("CREATOR", "yellow"),
838
+ ("QUEUE", "yellow"),
839
+ ("CPUs", "yellow"),
840
+ ("MEMORY", "yellow"),
841
+ ("GPUs", "yellow"),
842
+ ]
843
+ table = create_table_for_output(
844
+ columns=table_columns,
845
+ no_wrap_columns_names=["SERVICE"],
846
+ header_style={"bold": False},
847
+ )
848
+
849
+ for svc in unified_services:
850
+ name = svc["name"]
851
+ kind = svc["template_type"]
852
+ res = svc["resource"]
853
+ meta = res.get("metadata", {})
854
+ labels = meta.get("labels", {})
855
+ annotations = meta.get("annotations", {})
856
+ status_data = res.get("status", {})
857
+
858
+ # Get pods
859
+ pods = pod_map.get(name, [])
860
+
861
+ creation_ts = meta.get("creationTimestamp", None)
862
+ timestamp = (
863
+ datetime.fromisoformat(creation_ts.replace("Z", "+00:00")).strftime(
864
+ "%Y-%m-%d %H:%M:%S"
865
+ )
866
+ if creation_ts
867
+ else "Unknown"
868
+ )
869
+ ttl = annotations.get(serving_constants.INACTIVITY_TTL_ANNOTATION, "None")
870
+ creator = labels.get(serving_constants.KT_USERNAME_LABEL, "—")
871
+
872
+ volumes_display = load_kubetorch_volumes_for_service(
873
+ namespace, name, core_api
874
+ )
875
+
876
+ # Get resources from revision
877
+ cpu = memory = gpu = None
878
+ if kind == "ksvc":
879
+ cond = status_data.get("conditions", [{}])[0]
880
+ status = cond.get("status")
881
+ display_status = {
882
+ "True": "[green]Ready[/green]",
883
+ "Unknown": "[yellow]Creating[/yellow]",
884
+ }.get(status, "[red]Failed[/red]")
885
+ rev_name = status_data.get("latestCreatedRevisionName")
886
+ if rev_name:
887
+ try:
888
+ rev = custom_api.get_namespaced_custom_object(
889
+ group="serving.knative.dev",
890
+ version="v1",
891
+ namespace=namespace,
892
+ plural="revisions",
893
+ name=rev_name,
894
+ )
895
+ container = rev["spec"]["containers"][0]
896
+ reqs = container.get("resources", {}).get("requests", {})
897
+ cpu = reqs.get("cpu")
898
+ memory = reqs.get("memory")
899
+ gpu = reqs.get("nvidia.com/gpu") or reqs.get("gpu")
900
+ except Exception as e:
901
+ logger.warning(f"Could not get revision for {name}: {e}")
902
+ else:
903
+ # Process Deployment - now using consistent dict access
904
+ ready = res.get("status", {}).get("readyReplicas", 0) or 0
905
+ desired = res.get("spec", {}).get("replicas", 0) or 0
906
+ if kind == "raycluster":
907
+ state = status_data.get("state", "").lower()
908
+ conditions = {
909
+ c["type"]: c["status"]
910
+ for c in status_data.get("conditions", [])
911
+ }
912
+ if (
913
+ state == "ready"
914
+ and conditions.get("HeadPodReady") == "True"
915
+ and conditions.get("RayClusterProvisioned") == "True"
916
+ ):
917
+ display_status = "[green]Ready[/green]"
918
+ elif state in ("creating", "upscaling", "restarting", "updating"):
919
+ display_status = "[yellow]Scaling[/yellow]"
920
+ else:
921
+ display_status = "[red]Failed[/red]"
922
+ else:
923
+ display_status = (
924
+ "[green]Ready[/green]"
925
+ if ready == desired and desired > 0
926
+ else "[yellow]Scaling[/yellow]"
927
+ if ready < desired
928
+ else "[red]Failed[/red]"
929
+ )
930
+ try:
931
+ container = (
932
+ res.get("spec", {})
933
+ .get("template", {})
934
+ .get("spec", {})
935
+ .get("containers", [{}])[0]
936
+ )
937
+ reqs = container.get("resources", {}).get("requests", {})
938
+ cpu = reqs.get("cpu")
939
+ memory = reqs.get("memory")
940
+ gpu = reqs.get("nvidia.com/gpu") or reqs.get("gpu")
941
+ except Exception as e:
942
+ logger.warning(
943
+ f"Failed to get resources for {name} in namespace {namespace}: {e}"
944
+ )
945
+
946
+ # Common pod processing
947
+ pod_lines = []
948
+ queue = "—"
949
+ for pod in pods:
950
+ pod_status = pod.status.phase
951
+ ready = all(c.ready for c in (pod.status.container_statuses or []))
952
+ if ready and pod_status == "Running":
953
+ color = "green"
954
+ elif "Creating" in display_status or "Scaling" in display_status:
955
+ color = "yellow"
956
+ else:
957
+ color = "red"
958
+ pod_lines.append(f"[{color}]{pod.metadata.name}[/{color}]")
959
+ queue = pod.metadata.labels.get(
960
+ serving_constants.KAI_SCHEDULER_LABEL, queue
961
+ )
962
+
963
+ # Update service status if pod is pending
964
+ if pod_status == "Pending":
965
+ display_status = "[yellow]Pending[/yellow]"
966
+
967
+ table.add_row(
968
+ name,
969
+ f"[magenta]{kind}[/magenta]",
970
+ display_status,
971
+ str(len(pods)),
972
+ "\n".join(pod_lines),
973
+ "\n".join(volumes_display) or "-",
974
+ timestamp,
975
+ ttl,
976
+ creator,
977
+ queue,
978
+ cpu or "—",
979
+ memory or "—",
980
+ gpu or "—",
981
+ )
982
+
983
+ table.pad_bottom = 1
984
+ console.print(table)
985
+
986
+ except ApiException as e:
987
+ console.print(f"[red]Kubernetes API error: {e}[/red]")
988
+ raise typer.Exit(1)
989
+
990
+
991
@app.command("port-forward")
def kt_port_forward(
    name: str = service_name_argument(help="Service name"),
    local_port: int = typer.Argument(
        default=serving_constants.DEFAULT_KT_SERVER_PORT, help="Local port to bind to"
    ),
    remote_port: int = typer.Argument(
        default=serving_constants.DEFAULT_KT_SERVER_PORT,
        help="Remote port to forward to",
    ),
    namespace: str = typer.Option(
        globals.config.namespace,
        "-n",
        "--namespace",
    ),
    pod: str = typer.Option(
        None,
        "-p",
        "--pod",
        help="Name or index of a specific pod to port-forward to (0-based)",
    ),
):
    """
    Port forward a local port to the specified Kubetorch service.

    Examples:

    .. code-block:: bash

        $ kt port-forward my-service

        $ kt port-forward my-service 32300

        $ kt port-forward my-service -n custom-namespace

        $ kt port-forward my-service -p my-pod

    This allows you to access the service locally using `curl http://localhost:<port>`.
    """
    # Exit codes: 0 when the user interrupts the forward (SIGINT/SIGTERM),
    # 1 when the local port is busy, the forward cannot be established, or the
    # kubectl process dies on its own.

    from kubetorch.resources.compute.utils import is_port_available

    if not is_port_available(local_port):
        console.print(f"\n[red]Local port {local_port} is already in use.[/red]")
        raise typer.Exit(1)

    core_api, custom_api, apps_v1_api = initialize_k8s_clients()

    # get_deployment_mode returns a (name, mode) pair; only the name is needed here.
    name, _ = get_deployment_mode(name, namespace, custom_api, apps_v1_api)
    pods = validate_pods_exist(name, namespace, core_api)
    # Oldest pod first, so index 0 is the longest-lived pod.
    sorted_by_time = sorted(pods, key=lambda p: p.metadata.creation_timestamp)

    if pod:  # case when the user provides a pod
        pod_name = validate_provided_pod(
            service_name=name, provided_pod=pod, service_pods=sorted_by_time
        )
    else:  # if user does not provide pod, port-forward to the first pod by default
        pod_name = sorted_by_time[0].metadata.name

    # Bound before the helpers below so the closure can see it even if a
    # signal arrives before kubectl has been started.
    process = None

    def cleanup_process():
        # Clean up the port forward process
        if process:
            process.kill()

    def signal_handler(signum, frame):
        """Handle interrupt signals for graceful shutdown."""
        console.print(f"\nReceived signal {signum}, cleaning up port forward...")
        cleanup_process()
        console.print("Port forward stopped.")
        raise typer.Exit(0)

    # Register signal handlers for graceful shutdown
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    from kubetorch.serving.utils import wait_for_port_forward

    cmd = [
        "kubectl",
        "port-forward",
        f"pod/{pod_name}",
        f"{local_port}:{remote_port}",
        "--namespace",
        namespace,
    ]

    port_forward_msg = f"Starting port forward to {name} in namespace {namespace}"

    if pod:
        port_forward_msg = port_forward_msg + f", pod: [reset]{pod}"
    console.print(port_forward_msg)

    process = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, start_new_session=True
    )

    try:
        wait_for_port_forward(process, local_port)
        # NOTE(review): presumably a short settle delay for kubectl — confirm.
        time.sleep(2)
    except typer.Exit:
        # Raised by signal_handler when the user interrupts while waiting.
        # typer.Exit derives from Exception, so it must be let through before
        # the generic handler below or it is misreported as a failure.
        raise
    except Exception as e:
        # Surface the failure to the user and exit non-zero instead of
        # logging quietly and returning success.
        console.print(
            f"[red]Failed to establish port forward on port {local_port}: {e}[/red]"
        )
        cleanup_process()
        raise typer.Exit(1)

    console.print(
        f"[green]✓ Port forward active on localhost:{local_port} -> {pod_name}:{remote_port}[/green]"
    )
    console.print(f"[cyan]You can now run: curl http://localhost:{local_port}[/cyan]")
    console.print("[dim]Press Ctrl+C to stop the port forward[/dim]")

    # Keep the port forward running until interrupted
    try:
        while True:
            if process.poll() is not None:
                # kubectl exited by itself: report it as an error exit code.
                console.print(
                    "[red]Port forward process has terminated unexpectedly[/red]"
                )
                raise typer.Exit(1)
            time.sleep(1)
    except KeyboardInterrupt:
        # This should be handled by the signal handler, but just in case
        pass
    except typer.Exit:
        # Re-raise typer.Exit to maintain proper CLI behavior
        raise
    except Exception as e:
        console.print(f"[red]Error during port forwarding: {e}[/red]")
        raise typer.Exit(1)
    finally:
        cleanup_process()
1128
+
1129
+
1130
@app.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
def kt_run(
    ctx: typer.Context,
    name: str = typer.Option(None, "--name", help="Name for the run"),
    run_async: bool = typer.Option(
        False, "--async", help="Whether to run async and not stream logs live"
    ),
    # A file path is a string; an int annotation makes `--file train.py` unparseable.
    file: str = typer.Option(None, "--file", help="File where the app is defined in"),
):
    """
    Build and deploy a kubetorch app that runs the provided CLI command. In order for the app
    to be deployed, the file being run must be a Python file specifying a `kt.app` construction
    at the top of the file.

    Examples:

    .. code-block:: bash

        $ kt run python train.py --epochs 5
        $ kt run fastapi run my_app.py --name fastapi-app
    """
    from kubetorch import App

    # Everything typer did not recognise as an option is the command to run.
    cli_cmd = " ".join(ctx.args)
    cli_tokens = cli_cmd.split()
    if not cli_tokens:
        # Covers both "no args" and whitespace-only args.
        raise typer.BadParameter("You must provide a command to run.")
    elif cli_tokens[0].endswith(".py"):
        raise typer.BadParameter(
            "You must provide a full command to run, the Python file should not be the first argument. "
            "(e.g. `kt run python train.py`)"
        )

    python_file = file
    if not python_file:
        # Auto-detect: first token that is an existing `.py` file. The dot is
        # required so tokens such as "numpy" are not mistaken for a script.
        for arg in cli_tokens:
            if arg.endswith(".py") and Path(arg).exists():
                python_file = arg
                break

    if not python_file:
        console.print(
            f"[red]Could not detect python file with `kt.app` in {cli_cmd}. Pass it in with `--file`.[/red]"
        )
        raise typer.Exit(1)

    if not Path(python_file).is_file():
        # Only reachable for an explicit --file; auto-detected files exist.
        console.print(f"[red]Python file {python_file} does not exist[/red]")
        raise typer.Exit(1)

    # Set env vars for construction of app instance
    os.environ["KT_RUN"] = "1"
    os.environ["KT_RUN_CMD"] = cli_cmd
    os.environ["KT_RUN_FILE"] = python_file
    if name:
        os.environ["KT_RUN_NAME"] = name
    if run_async:
        os.environ["KT_RUN_ASYNC"] = "1"

    # Extract the app instance from the python file
    module_name = Path(python_file).stem
    python_file_dir = Path(python_file).resolve().parent

    # Add the directory containing the Python file to sys.path to support relative imports
    if str(python_file_dir) not in sys.path:
        sys.path.insert(0, str(python_file_dir))

    spec = importlib.util.spec_from_file_location(module_name, python_file)
    if spec is None or spec.loader is None:
        # spec_from_file_location returns None when no loader matches the
        # file (e.g. an unrecognised extension passed via --file).
        console.print(f"[red]Could not load {python_file} as a Python module[/red]")
        raise typer.Exit(1)

    module = importlib.util.module_from_spec(spec)
    # Registered before execution so the module can be found by name while it runs.
    sys.modules[module_name] = module
    spec.loader.exec_module(module)

    # Use the first module-level App instance found.
    app_instance = None
    for _, obj in inspect.getmembers(module):
        if isinstance(obj, App):
            app_instance = obj
            break
    if not app_instance:
        console.print(f"[red]Could not find kt.app definition in {python_file} [/red]")
        raise typer.Exit(1)

    app_instance.deploy()
1209
+
1210
+
1211
@app.command("secrets")
def kt_secrets(
    action: SecretAction = typer.Argument(
        SecretAction.list,
        help="Action to perform: list, create, update, delete, describe",
    ),
    name: str = typer.Argument(None, help="Secret name (for create or delete actions)"),
    prefix: str = typer.Option(
        None,
        "--prefix",
        "-x",
    ),
    namespace: str = typer.Option(
        "default",
        "-n",
        "--namespace",
    ),
    all_namespaces: bool = typer.Option(
        False,
        "--all-namespaces",
        "-A",
    ),
    yes: bool = typer.Option(False, "-y", "--yes", help="Deletion confirmation"),
    path: str = typer.Option(
        None, "--path", "-p", help="Path where the secret values are held"
    ),
    provider: str = typer.Option(
        None,
        "--provider",
        "-c",
        help="Provider corresponding to the secret (e.g. 'aws', 'gcp'). "
        "If not specified, secrets are loaded from the default provider path.",
    ),
    env_vars: List[str] = typer.Option(
        None,
        "--env-vars",
        "-v",
        help="Environment variable(s) key(s) whose value(s) will hold the secret value(s)",
    ),
    show_values: bool = typer.Option(
        False, "-s", "--show", help="Show secrets values in the describe output"
    ),
):
    """Manage secrets used in Kubetorch services.

    Examples:

    .. code-block:: bash

        $ kt secrets # list secrets in the default namespace

        $ kt secrets list -n my_namespace # list secrets in `my_namespace` namespace

        $ kt secrets -A # list secrets in all namespaces

        $ kt secrets create --provider aws # create a secret with the aws credentials in `default` namespace

        $ kt secrets create my_secret -v ENV_VAR_1 -v ENV_VAR_2 -n my_namespace # create a secret using env vars

        $ kt secrets delete my_secret -n my_namespace # delete a secret called `my_secret` from `my_namespace` namespace

        $ kt secrets delete aws # delete a secret called `aws` from `default` namespace
    """
    import kubetorch as kt
    from kubetorch.resources.compute.utils import delete_secrets, list_secrets
    from kubetorch.resources.secrets.kubernetes_secrets_client import (
        KubernetesSecretsClient,
    )

    secrets_client = KubernetesSecretsClient(namespace=namespace)

    core_api, custom_api, apps_v1_api = initialize_k8s_clients()

    if action == SecretAction.list:
        secrets = list_secrets(
            core_api=core_api,
            namespace=namespace,
            prefix=prefix,
            all_namespaces=all_namespaces,
            console=console,
            filter_by_creator=False,
        )

        table_columns = [
            ("SECRET", "blue"),
            ("CREATOR", "cyan"),
            ("NAMESPACE", "yellow"),
        ]
        table = create_table_for_output(
            columns=table_columns,
            no_wrap_columns_names=["SECRET"],
            header_style={"bold": True},
        )

        if not secrets:
            msg = "No secrets found"
            if not all_namespaces:
                if prefix:
                    msg += f" with prefix: {prefix}"
                msg += f" in namespace: {namespace}"
            console.print(f"[yellow]{msg}[/yellow]")
            raise typer.Exit(0)

        for secret in secrets:
            secret_name = secret.get(
                "user_defined_name"
            )  # TODO: maybe display the kt name? so it'll match kubectl get secrets
            creator = secret.get("username")
            # Separate local so the `namespace` parameter is not overwritten.
            secret_namespace = secret.get("namespace")
            table.add_row(secret_name, creator, secret_namespace)

        table.pad_bottom = 1
        console.print(table)

    elif action == SecretAction.create:
        if not (name or provider):
            console.print(
                "[red]Cannot create secret: name or provider must be specified.[/red]"
            )
            # Must be raised: a bare `typer.Exit(1)` expression does nothing
            # and lets creation proceed with neither name nor provider.
            raise typer.Exit(1)
        # Each env var name maps to itself.
        env_vars_dict = {key: key for key in env_vars} if env_vars else {}

        try:
            new_secret = kt.secret(
                name=name, provider=provider, path=path, env_vars=env_vars_dict
            )
            secrets_client.create_secret(secret=new_secret, console=console)
        except Exception as e:
            console.print(f"[red]Failed to create the secret: {e}[/red]")
            # A failed creation is an error, so report a non-zero exit code.
            raise typer.Exit(1)

    elif action == SecretAction.delete:
        # An explicit name acts as the prefix and restricts the search to
        # the selected namespace.
        prefix = name if name else prefix
        all_namespaces = False if name else all_namespaces
        secrets_to_delete = list_secrets(
            core_api=core_api,
            namespace=namespace,
            prefix=prefix,
            all_namespaces=all_namespaces,
            console=console,
        )

        username = globals.config.username
        secrets_to_delete_by_namespace: dict[str, list[str]] = {}
        for secret in secrets_to_delete:
            ns = secret.get("namespace")
            k8s_secret_name = secret.get("name")

            if all_namespaces:
                if secret.get("username") != username:
                    continue  # skip secrets not owned by user

            secrets_to_delete_by_namespace.setdefault(ns, []).append(k8s_secret_name)

        # Flatten names for display
        secrets_names = [
            secret_name
            for names in secrets_to_delete_by_namespace.values()
            for secret_name in names
        ]

        if not secrets_names:
            console.print(
                f"[yellow]No secrets to delete for username: {username}[/yellow]"
            )
            raise typer.Exit(0)

        secrets_word = "secret" if len(secrets_names) == 1 else "secrets"
        console.print(f"\nDeleting {len(secrets_names)} {secrets_word}...")

        for secret in secrets_names:
            console.print(f" - [blue]{secret}[/blue]")

        if not yes:
            confirm = typer.confirm("\nDo you want to proceed?")
            if not confirm:
                console.print("[yellow]Operation cancelled[/yellow]")
                raise typer.Exit(0)

        # One client per namespace, since the client is namespace-bound.
        for ns, secrets in secrets_to_delete_by_namespace.items():
            if secrets:
                client = KubernetesSecretsClient(namespace=ns)
                delete_secrets(
                    secrets=secrets,
                    console=console,
                    secrets_client=client,
                )

    elif action == SecretAction.describe:
        # Same name-as-prefix convention as the delete action.
        prefix = name if name else prefix
        all_namespaces = False if name else all_namespaces
        secrets_to_describe = list_secrets(
            core_api=core_api,
            namespace=namespace,
            prefix=prefix,
            all_namespaces=all_namespaces,
            filter_by_creator=False,
            console=console,
        )
        if not secrets_to_describe:
            console.print("[yellow] No secrets found[/yellow]")
            raise typer.Exit(0)

        for secret in secrets_to_describe:
            k8_name = secret.get("name")
            kt_name = secret.get("user_defined_name")
            console.print(f"[bold cyan]{kt_name}[/bold cyan]")
            console.print(f" K8 Name: [reset]{k8_name}")
            console.print(f' Namespace: {secret.get("namespace")}')
            console.print(f' Labels: [reset]{secret.get("labels")}')
            console.print(f' Type: {secret.get("type")}')
            # A secret may carry no data at all; treat that as empty.
            secret_data = secret.get("data") or {}
            if show_values:
                console.print(" Data:")
                for k, v in secret_data.items():
                    try:
                        # Values are base64-encoded; non-text payloads are
                        # shown as a placeholder.
                        decoded_value = base64.b64decode(v).decode("utf-8")
                    except Exception:
                        decoded_value = "<binary data>"
                    indented_value = textwrap.indent(decoded_value, " ")
                    indented_value = indented_value.replace("\n\n", "\n")
                    console.print(f" {k}:{indented_value}\n")
1431
+
1432
+
1433
@app.command("ssh")
def kt_ssh(
    name: str = service_name_argument(help="Service name"),
    namespace: str = typer.Option(
        globals.config.namespace,
        "-n",
        "--namespace",
    ),
    pod: str = typer.Option(
        None,
        "-p",
        "--pod",
        help="Name or index of a specific pod to SSH into (0-based)",
    ),
):
    """SSH into a remote service. By default, will SSH into the first pod.

    Examples:

    .. code-block:: bash

        $ kt ssh my_service
    """
    core_api, custom_api, apps_v1_api = initialize_k8s_clients()

    try:
        # Validate service exists and get deployment mode
        name, deployment_mode = get_deployment_mode(
            name, namespace, custom_api, apps_v1_api
        )

        # Get and validate pods
        pods = validate_pods_exist(name, namespace, core_api)

        # Oldest pod first, so "the first pod" is the longest-lived one.
        sorted_by_time = sorted(pods, key=lambda p: p.metadata.creation_timestamp)

        # case when the user provides a specific pod to ssh into
        if pod:
            pod_name = validate_provided_pod(
                service_name=name, provided_pod=pod, service_pods=sorted_by_time
            )
        # if pod is not provided, ssh into the first pod.
        else:
            pod_name = sorted_by_time[0].metadata.name

        console.print(
            f"[green]Found pod:[/green] [blue]{pod_name}[/blue] ({deployment_mode})"
        )
        console.print("[yellow]Connecting to pod...[/yellow]")

        # Still need subprocess for the interactive shell
        subprocess.run(
            ["kubectl", "exec", "-it", pod_name, "-n", namespace, "--", "/bin/bash"],
            check=True,
        )

    except ApiException as e:
        console.print(f"[red]Kubernetes API error: {e}[/red]")
        raise typer.Exit(1)
    except subprocess.CalledProcessError as e:
        # kubectl exited non-zero (for example the shell's last command
        # failed). Propagate the exit code rather than dumping a traceback.
        raise typer.Exit(e.returncode)
    except FileNotFoundError:
        # Raised by subprocess.run when the kubectl executable is not on PATH.
        console.print("[red]kubectl was not found on PATH[/red]")
        raise typer.Exit(1)
1492
+
1493
+
1494
@app.command("teardown")
def kt_teardown(
    name: str = service_name_argument(help="Service name", required=False),
    yes: bool = typer.Option(False, "-y", "--yes", help="Deletion confirmation"),
    teardown_all: bool = typer.Option(
        False, "-a", "--all", help="Deletes all services for the current user"
    ),
    prefix: str = typer.Option(
        "", "-p", "--prefix", help="Tear down all services with given prefix"
    ),
    namespace: str = typer.Option(
        globals.config.namespace,
        "-n",
        "--namespace",
    ),
    force: bool = typer.Option(
        False, "-f", "--force", help="Force deletion without graceful shutdown"
    ),
):
    """Delete a service and all its associated resources (deployments, configmaps, etc).


    Examples:

    .. code-block:: bash

        $ kt teardown my-service -y # force teardown resources corresponding to service

        $ kt teardown --all # teardown all resources corresponding to username

        $ kt teardown --prefix test # teardown resources with prefix "test"
    """
    from kubetorch import config
    from kubetorch.resources.compute.utils import (
        delete_resources_for_service,
        fetch_resources_for_teardown,
    )

    name, yes, teardown_all, namespace, prefix = default_typer_values(
        name, yes, teardown_all, namespace, prefix
    )

    core_api, custom_api, _ = initialize_k8s_clients()

    # Selection mode precedence: --all, then --prefix, then a single name.
    if teardown_all:
        if not config.username:
            console.print(
                "[red]Username is not found, can't delete all services. Please set up a username, provide a service "
                "name or use the --prefix flag[/red]"
            )
            raise typer.Exit(1)

        console.print(
            f"Deleting all services for username [blue]{config.username}[/blue]..."
        )

    elif prefix:
        console.print(
            f"Deleting all services with prefix [blue]{prefix}[/blue] in [blue]{namespace}[/blue] namespace..."
        )
    else:
        if not name:
            console.print(
                "[red]Please provide a service name or use the --all or --prefix flags[/red]"
            )
            raise typer.Exit(1)

        console.print(
            f"Finding resources for service [blue]{name}[/blue] in [blue]{namespace}[/blue] namespace..."
        )

    resources = fetch_resources_for_teardown(
        namespace=namespace,
        target=name,
        core_api=core_api,
        custom_api=custom_api,
        prefix=prefix,
        username=config.username if teardown_all else None,
    )

    services = list(resources["services"].keys())
    service_count = len(services)

    if teardown_all or prefix:
        service_word = "service" if service_count == 1 else "services"
        if not services:
            console.print("[yellow]No services are found[/yellow]")
            raise typer.Exit(0)
        else:
            console.print(
                f"[yellow]Found [bold]{service_count}[/bold] {service_word} to delete.[/yellow]"
            )

    if name and not services:
        console.print(f"[red]Service [bold]{name}[/bold] not found[/red]")
        raise typer.Exit(1)

    # Confirmation prompt for multiple services
    if not yes and service_count > 1:
        for service_name in services:
            console.print(f" • {service_name}")

    # List out resources to be deleted for each service.
    # The loop variable is deliberately not called `name`: the force path
    # below still needs the original target passed on the command line.
    console.print("\n[yellow]The following resources will be deleted:[/yellow]")
    for svc_name in services:
        service_info = resources["services"][svc_name]
        configmaps = service_info["configmaps"]
        service_type = service_info.get("type", "unknown")

        if service_type == "deployment":
            console.print(f"• Deployment: [blue]{svc_name}[/blue]")
            console.print(f"• Service: [blue]{svc_name}[/blue]")
        elif service_type == "knative":
            console.print(f"• Knative Service: [blue]{svc_name}[/blue]")
        elif service_type == "raycluster":
            console.print(f"• RayCluster: [blue]{svc_name}[/blue]")
            console.print(f"• Service: [blue]{svc_name}[/blue]")
        else:
            console.print(f"• Service: [blue]{svc_name}[/blue]")

        if configmaps:
            console.print("• ConfigMaps:")
            for cm in configmaps:
                console.print(f" - [blue]{cm}[/blue]")

    # Confirmation prompt for single service
    if (
        not yes and not force
    ):  # if --force is provided, we don't need additional confirmation
        confirm = typer.confirm("\nDo you want to proceed?")
        if not confirm:
            console.print("[yellow]Teardown cancelled[/yellow]")
            raise typer.Exit(0)

    # Delete resources
    if force:
        console.print("\n[yellow]Force deleting resources...[/yellow]")
    else:
        console.print("\n[yellow]Deleting resources...[/yellow]")

    for svc_name in services:
        service_info = resources["services"][svc_name]
        configmaps = service_info["configmaps"]
        service_type = service_info.get("type", "knative")

        delete_resources_for_service(
            core_api=core_api,
            custom_api=custom_api,
            configmaps=configmaps,
            name=svc_name,
            service_type=service_type,
            namespace=namespace,
            console=console,
            force=force,
        )

    # Force delete any remaining pods if --force flag is set
    if force:
        # Build list of service names to check for pods
        # Include both found services and the original target name (in case service was already deleted)
        service_names_to_check = list(services)
        if name and name not in service_names_to_check:
            service_names_to_check.append(name)

        # Names of pods already processed per service, so the orphan sweep
        # below can skip them without re-listing pods for every service on
        # every iteration.
        handled_pod_names = set()

        if service_names_to_check:
            console.print("\n[yellow]Force deleting any remaining pods...[/yellow]")
            for service_name in service_names_to_check:
                try:
                    # Get pods matching the service
                    pods = core_api.list_namespaced_pod(
                        namespace=namespace,
                        label_selector=f"kubetorch.com/service={service_name}",
                    ).items

                    for pod in pods:
                        handled_pod_names.add(pod.metadata.name)
                        try:
                            # grace_period_seconds=0 requests immediate deletion.
                            core_api.delete_namespaced_pod(
                                name=pod.metadata.name,
                                namespace=namespace,
                                grace_period_seconds=0,
                                propagation_policy="Background",
                            )
                            console.print(
                                f"✓ Force deleted pod [blue]{pod.metadata.name}[/blue]"
                            )
                        except ApiException as e:
                            if e.status != 404:  # Ignore if already deleted
                                console.print(
                                    f"[red]Failed to delete pod {pod.metadata.name}: {e}[/red]"
                                )
                except Exception as e:
                    console.print(
                        f"[red]Failed to list pods for service {service_name}: {e}[/red]"
                    )

        # Also check for any orphaned pods with kubetorch labels if using --all or --prefix
        if teardown_all or prefix:
            try:
                # A bare label key selects every pod that carries the label.
                label_selector = "kubetorch.com/service"
                if teardown_all and config.username:
                    label_selector += f",kubetorch.com/username={config.username}"

                all_pods = core_api.list_namespaced_pod(
                    namespace=namespace, label_selector=label_selector
                ).items

                # Filter by prefix if specified
                if prefix:
                    all_pods = [
                        p
                        for p in all_pods
                        if p.metadata.labels.get(
                            "kubetorch.com/service", ""
                        ).startswith(prefix)
                    ]

                # Delete any remaining pods not already handled
                for pod in all_pods:
                    if pod.metadata.name in handled_pod_names:
                        continue
                    try:
                        core_api.delete_namespaced_pod(
                            name=pod.metadata.name,
                            namespace=namespace,
                            grace_period_seconds=0,
                            propagation_policy="Background",
                        )
                        console.print(
                            f"✓ Force deleted orphaned pod [blue]{pod.metadata.name}[/blue]"
                        )
                    except ApiException as e:
                        if e.status != 404:
                            console.print(
                                f"[red]Failed to delete orphaned pod {pod.metadata.name}: {e}[/red]"
                            )
            except Exception as e:
                console.print(f"[red]Failed to list orphaned pods: {e}[/red]")

    console.print("\n[green]Teardown completed successfully[/green]")
1742
+
1743
+
1744
+ @app.command("volumes")
1745
+ def kt_volumes(
1746
+ action: VolumeAction = typer.Argument(VolumeAction.list, help="Action to perform"),
1747
+ name: str = typer.Argument(None, help="Volume name (for create action)"),
1748
+ storage_class: str = typer.Option(
1749
+ None, "--storage-class", "-c", help="Storage class"
1750
+ ),
1751
+ size: str = typer.Option(
1752
+ "10Gi", "--size", "-s", help="Volume size (default: 10Gi)"
1753
+ ),
1754
+ access_mode: str = typer.Option(
1755
+ "ReadWriteMany", "--access-mode", "-a", help="Access mode"
1756
+ ),
1757
+ mount_path: str = typer.Option(
1758
+ None,
1759
+ "--mount-path",
1760
+ "-m",
1761
+ help=f"Mount path (default: /{KT_MOUNT_FOLDER}/{{name}})",
1762
+ ),
1763
+ namespace: str = typer.Option(
1764
+ globals.config.namespace,
1765
+ "-n",
1766
+ "--namespace",
1767
+ ),
1768
+ all_namespaces: bool = typer.Option(
1769
+ False,
1770
+ "--all-namespaces",
1771
+ "-A",
1772
+ help="List volumes across all namespaces",
1773
+ ),
1774
+ ):
1775
+ """Manage volumes used in Kubetorch services.
1776
+
1777
+ Examples:
1778
+
1779
+ .. code-block:: bash
1780
+
1781
+ $ kt volumes
1782
+
1783
+ $ kt volumes -A
1784
+
1785
+ $ kt volumes create my-vol
1786
+
1787
+ $ kt volumes create my-vol -c gp3-csi -s 20Gi
1788
+
1789
+ $ kt volumes delete my-vol
1790
+
1791
+ $ kt volumes ssh my-vol
1792
+ """
1793
+ from kubernetes import client
1794
+
1795
+ from kubetorch import Volume
1796
+ from kubetorch.utils import load_kubeconfig
1797
+
1798
+ load_kubeconfig()
1799
+ core_v1 = client.CoreV1Api()
1800
+
1801
+ target_namespace = None
1802
+ if not all_namespaces:
1803
+ target_namespace = namespace or globals.config.namespace
1804
+
1805
+ if action == VolumeAction.list:
1806
+ try:
1807
+ if all_namespaces:
1808
+ pvcs = core_v1.list_persistent_volume_claim_for_all_namespaces()
1809
+ title = "Kubetorch Volumes (All Namespaces)"
1810
+ else:
1811
+ pvcs = core_v1.list_namespaced_persistent_volume_claim(
1812
+ namespace=target_namespace
1813
+ )
1814
+ title = f"Kubetorch Volumes (Namespace: {target_namespace})"
1815
+
1816
+ # List all Kubetorch PVCs
1817
+ kubetorch_pvcs = [
1818
+ pvc
1819
+ for pvc in pvcs.items
1820
+ if (pvc.metadata.annotations or {}).get("kubetorch.com/mount-path")
1821
+ ]
1822
+
1823
+ if not kubetorch_pvcs:
1824
+ if all_namespaces:
1825
+ console.print("[yellow]No volumes found in all namespaces[/yellow]")
1826
+ else:
1827
+ console.print(
1828
+ f"[yellow]No volumes found in namespace {target_namespace}[/yellow]"
1829
+ )
1830
+ return
1831
+
1832
+ table = Table(title=title)
1833
+ if all_namespaces:
1834
+ table.add_column("Namespace", style="green")
1835
+ table.add_column("Name", style="cyan")
1836
+ table.add_column("PVC Name", style="blue")
1837
+ table.add_column("Status", style="green")
1838
+ table.add_column("Size", style="yellow")
1839
+ table.add_column("Storage Class", style="magenta")
1840
+ table.add_column("Access Mode", style="white")
1841
+ table.add_column("Mount Path", style="dim")
1842
+
1843
+ for pvc in kubetorch_pvcs:
1844
+ # Extract volume name from PVC name
1845
+ volume_name = pvc.metadata.name
1846
+ status = pvc.status.phase
1847
+ size = pvc.spec.resources.requests.get("storage", "Unknown")
1848
+ storage_class = pvc.spec.storage_class_name or "Default"
1849
+ access_mode = (
1850
+ pvc.spec.access_modes[0] if pvc.spec.access_modes else "Unknown"
1851
+ )
1852
+
1853
+ # Get mount path from annotations
1854
+ annotations = pvc.metadata.annotations or {}
1855
+ mount_path_display = annotations.get(
1856
+ "kubetorch.com/mount-path", f"/{KT_MOUNT_FOLDER}/{volume_name}"
1857
+ )
1858
+
1859
+ status_color = (
1860
+ "green"
1861
+ if status == "Bound"
1862
+ else "yellow"
1863
+ if status == "Pending"
1864
+ else "red"
1865
+ )
1866
+
1867
+ row_data = []
1868
+ if all_namespaces:
1869
+ row_data.append(pvc.metadata.namespace)
1870
+
1871
+ row_data.extend(
1872
+ [
1873
+ volume_name,
1874
+ pvc.metadata.name,
1875
+ f"[{status_color}]{status}[/{status_color}]",
1876
+ size,
1877
+ storage_class,
1878
+ access_mode,
1879
+ mount_path_display,
1880
+ ]
1881
+ )
1882
+
1883
+ table.add_row(*row_data)
1884
+
1885
+ console.print(table)
1886
+
1887
+ except Exception as e:
1888
+ console.print(f"[red]Failed to list volumes: {e}[/red]")
1889
+ raise typer.Exit(1)
1890
+
1891
+ elif action == VolumeAction.ssh:
1892
+ if not name:
1893
+ console.print("[red]Volume name is required[/red]")
1894
+ raise typer.Exit(1)
1895
+
1896
+ volume = Volume.from_name(name=name, namespace=namespace, core_v1=core_v1)
1897
+ volume.ssh()
1898
+
1899
+ elif action == VolumeAction.create:
1900
+ if not name:
1901
+ console.print("[red]Volume name is required[/red]")
1902
+ raise typer.Exit(1)
1903
+
1904
+ if all_namespaces:
1905
+ console.print(
1906
+ "[red]Cannot create volume with --all-namespaces. Specify a namespace.[/red]"
1907
+ )
1908
+ raise typer.Exit(1)
1909
+
1910
+ try:
1911
+ volume = Volume(
1912
+ name=name,
1913
+ storage_class=storage_class,
1914
+ mount_path=mount_path,
1915
+ size=size,
1916
+ access_mode=access_mode,
1917
+ namespace=namespace,
1918
+ )
1919
+
1920
+ if volume.exists():
1921
+ console.print(
1922
+ f"[yellow]Volume {name} (PVC: {volume.pvc_name}) already exists in "
1923
+ f"namespace {namespace}[/yellow]"
1924
+ )
1925
+ return
1926
+
1927
+ console.print(f"Creating volume [blue]{name}[/blue]...")
1928
+ volume.create()
1929
+
1930
+ console.print(
1931
+ f"[green]✓[/green] Successfully created volume [blue]{name}[/blue]"
1932
+ )
1933
+ config = volume.config()
1934
+ for k, v in config.items():
1935
+ console.print(f"[bold]• {k}[/bold]: {v}")
1936
+
1937
+ except Exception as e:
1938
+ console.print(f"[red]Failed to create volume {name}: {e}[/red]")
1939
+ raise typer.Exit(1)
1940
+
1941
+ elif action == VolumeAction.delete:
1942
+ if not name:
1943
+ console.print("[red]Volume name is required[/red]")
1944
+ raise typer.Exit(1)
1945
+
1946
+ if all_namespaces:
1947
+ console.print(
1948
+ "[red]Cannot delete volume with --all-namespaces. Specify a namespace.[/red]"
1949
+ )
1950
+ raise typer.Exit(1)
1951
+
1952
+ try:
1953
+ volume = Volume.from_name(name=name, namespace=namespace, core_v1=core_v1)
1954
+
1955
+ console.print(f"Deleting volume [blue]{name}[/blue]...")
1956
+ volume.delete()
1957
+
1958
+ console.print(
1959
+ f"[green]✓[/green] Successfully deleted volume [blue]{name}[/blue]"
1960
+ )
1961
+
1962
+ except ValueError:
1963
+ console.print(
1964
+ f"[red]Volume {name} not found in namespace {namespace}[/red]"
1965
+ )
1966
+ raise typer.Exit(1)
1967
+
1968
+ except Exception as e:
1969
+ console.print(f"[red]Failed to delete volume {name}: {e}[/red]")
1970
+ raise typer.Exit(1)
1971
+
1972
+
1973
@app.callback(invoke_without_command=True, help="Kubetorch CLI")
def main(
    ctx: typer.Context,
    version: bool = typer.Option(
        None, "--version", "-v", help="Show the version and exit."
    ),
):
    """Root callback of the CLI: handle ``--version`` and bare invocations.

    Args:
        ctx: Context of the current invocation; used to detect whether a
            subcommand was requested and to render the top-level help.
        version: When truthy, print the installed kubetorch version and exit.

    Raises:
        typer.Exit: After printing the version, so that no subcommand given
            on the same command line is executed afterwards.
    """
    if version:
        # Imported lazily so the version lookup only happens when requested.
        from kubetorch import __version__

        print(f"{__version__}")
        # The option's help text promises "and exit"; without this the
        # callback would return and any subcommand would still run.
        raise typer.Exit()

    if ctx.invoked_subcommand is None:
        # Render help from the live context instead of shelling out to
        # `kubetorch --help`, which needs the executable on PATH and spawns
        # an extra shell + interpreter just to print usage.
        help_text = ctx.get_help()
        # NOTE(review): with Rich-enabled Typer the help may already have been
        # printed by get_help() and the returned string be empty - confirm.
        if help_text:
            typer.echo(help_text)