kubetorch-0.2.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. kubetorch/__init__.py +59 -0
  2. kubetorch/cli.py +1939 -0
  3. kubetorch/cli_utils.py +967 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +269 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +159 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +140 -0
  30. kubetorch/resources/callables/module.py +1315 -0
  31. kubetorch/resources/callables/utils.py +203 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +253 -0
  34. kubetorch/resources/compute/compute.py +2414 -0
  35. kubetorch/resources/compute/decorators.py +137 -0
  36. kubetorch/resources/compute/utils.py +1026 -0
  37. kubetorch/resources/compute/websocket.py +135 -0
  38. kubetorch/resources/images/__init__.py +1 -0
  39. kubetorch/resources/images/image.py +412 -0
  40. kubetorch/resources/images/images.py +64 -0
  41. kubetorch/resources/secrets/__init__.py +2 -0
  42. kubetorch/resources/secrets/kubernetes_secrets_client.py +377 -0
  43. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  44. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  45. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  46. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  47. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  48. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  49. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  50. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  51. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  52. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  53. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  54. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  55. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  56. kubetorch/resources/secrets/provider_secrets/providers.py +92 -0
  57. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  58. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  59. kubetorch/resources/secrets/secret.py +224 -0
  60. kubetorch/resources/secrets/secret_factory.py +64 -0
  61. kubetorch/resources/secrets/utils.py +222 -0
  62. kubetorch/resources/volumes/__init__.py +0 -0
  63. kubetorch/resources/volumes/volume.py +340 -0
  64. kubetorch/servers/__init__.py +0 -0
  65. kubetorch/servers/http/__init__.py +0 -0
  66. kubetorch/servers/http/distributed_utils.py +2968 -0
  67. kubetorch/servers/http/http_client.py +802 -0
  68. kubetorch/servers/http/http_server.py +1622 -0
  69. kubetorch/servers/http/server_metrics.py +255 -0
  70. kubetorch/servers/http/utils.py +722 -0
  71. kubetorch/serving/__init__.py +0 -0
  72. kubetorch/serving/autoscaling.py +153 -0
  73. kubetorch/serving/base_service_manager.py +344 -0
  74. kubetorch/serving/constants.py +77 -0
  75. kubetorch/serving/deployment_service_manager.py +431 -0
  76. kubetorch/serving/knative_service_manager.py +487 -0
  77. kubetorch/serving/raycluster_service_manager.py +526 -0
  78. kubetorch/serving/service_manager.py +18 -0
  79. kubetorch/serving/templates/deployment_template.yaml +17 -0
  80. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  81. kubetorch/serving/templates/kt_setup_template.sh.j2 +91 -0
  82. kubetorch/serving/templates/pod_template.yaml +198 -0
  83. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  84. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  85. kubetorch/serving/templates/service_template.yaml +21 -0
  86. kubetorch/serving/templates/workerset_template.yaml +36 -0
  87. kubetorch/serving/utils.py +344 -0
  88. kubetorch/utils.py +263 -0
  89. kubetorch-0.2.5.dist-info/METADATA +75 -0
  90. kubetorch-0.2.5.dist-info/RECORD +92 -0
  91. kubetorch-0.2.5.dist-info/WHEEL +4 -0
  92. kubetorch-0.2.5.dist-info/entry_points.txt +5 -0
kubetorch/cli.py ADDED
@@ -0,0 +1,1939 @@
1
+ import base64
2
+ import importlib.util
3
+ import inspect
4
+ import os
5
+ import signal
6
+ import subprocess
7
+ import sys
8
+ import textwrap
9
+ import time
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+ from typing import List
13
+ from urllib.parse import urlparse
14
+
15
+ import httpx
16
+ from kubernetes import client
17
+ from kubernetes.client.rest import ApiException
18
+ from rich.syntax import Syntax
19
+
20
+ from kubetorch.servers.http.utils import is_running_in_kubernetes
21
+
22
+ from .cli_utils import (
23
+ create_table_for_output,
24
+ default_typer_values,
25
+ get_deployment_mode,
26
+ get_ingress_host,
27
+ get_last_updated,
28
+ get_logs_from_loki,
29
+ is_ingress_vpc_only,
30
+ load_ingress,
31
+ load_kubetorch_volumes_from_pods,
32
+ notebook_placeholder,
33
+ port_forward_to_pod,
34
+ SecretAction,
35
+ service_name_argument,
36
+ validate_config_key,
37
+ validate_pods_exist,
38
+ validate_provided_pod,
39
+ VolumeAction,
40
+ )
41
+
42
+ from .utils import initialize_k8s_clients
43
+
44
+ try:
45
+ import typer
46
+
47
+ from rich.console import Console
48
+ from rich.panel import Panel
49
+ from rich.table import Table
50
+ except ImportError:
51
+ raise ImportError("Please install the required CLI dependencies: `pip install 'kubetorch[client] @ <install_url>'`")
52
+
53
+
54
+ import kubetorch.serving.constants as serving_constants
55
+
56
+ from kubetorch import globals
57
+ from kubetorch.config import ENV_MAPPINGS
58
+ from kubetorch.servers.http.utils import DEFAULT_DEBUG_PORT
59
+
60
+ from .constants import BULLET_UNICODE, KT_MOUNT_FOLDER
61
+
62
+ try:
63
+ from .internal.cli import register_internal_commands
64
+
65
+ _INTERNAL_COMMANDS_AVAILABLE = True
66
+ except ImportError:
67
+ _INTERNAL_COMMANDS_AVAILABLE = False
68
+
69
+ from .logger import get_logger
70
+
71
+ app = typer.Typer(add_completion=False)
72
+ console = Console()
73
+
74
+ # Register internal CLI commands if available
75
+ if _INTERNAL_COMMANDS_AVAILABLE:
76
+ register_internal_commands(app)
77
+
78
+
79
+ logger = get_logger(__name__)
80
+
81
+
82
+ @app.command("check")
83
+ def kt_check(
84
+ name: str = service_name_argument(help="Service name"),
85
+ namespace: str = typer.Option(
86
+ globals.config.namespace,
87
+ "-n",
88
+ "--namespace",
89
+ ),
90
+ ):
91
+ """
92
+ Run a comprehensive health check for a deployed service.
93
+
94
+ Checks:
95
+
96
+ - Deployment pod comes up and becomes ready (if not scaled to zero)
97
+
98
+ - Rsync has succeeded
99
+
100
+ - Service is marked as ready and service pod(s) are ready to serve traffic
101
+
102
+ - GPU support configured (if applicable)
103
+
104
+ - Log streaming configuration (if applicable)
105
+
106
+ If a step fails, will dump ``kubectl describe`` and pod logs for relevant pods.
107
+ """
108
+ core_api, custom_api, apps_v1_api = initialize_k8s_clients()
109
+
110
+ def dump_pod_debug(pod_name):
111
+ try:
112
+ describe_proc = subprocess.run(
113
+ ["kubectl", "describe", "pod", pod_name, "-n", namespace],
114
+ check=False,
115
+ capture_output=True,
116
+ text=True,
117
+ )
118
+ describe_output = describe_proc.stdout or describe_proc.stderr or "<no output>"
119
+
120
+ logs_proc = subprocess.run(
121
+ ["kubectl", "logs", pod_name, "-n", namespace, "-c", "kubetorch"],
122
+ check=False,
123
+ capture_output=True,
124
+ text=True,
125
+ )
126
+ logs_output = logs_proc.stdout or logs_proc.stderr or "<no output>"
127
+
128
+ console.print(
129
+ Panel(
130
+ describe_output,
131
+ title=f"POD DESCRIPTION ({pod_name})",
132
+ border_style="yellow",
133
+ expand=False,
134
+ )
135
+ )
136
+ console.print(
137
+ Panel(
138
+ logs_output,
139
+ title=f"POD LOGS ({pod_name})",
140
+ border_style="yellow",
141
+ expand=False,
142
+ )
143
+ )
144
+ except Exception as e:
145
+ console.print(f"[red]Failed to dump pod info: {e}[/red]")
146
+
147
+ def fail(msg, pod_names=None):
148
+ console.print(f"[red]{msg}[/red]")
149
+ if pod_names:
150
+ for pod_name in pod_names:
151
+ dump_pod_debug(pod_name)
152
+ raise typer.Exit(1)
153
+
154
+ try:
155
+ # Validate service exists and get deployment mode
156
+ name, deployment_mode = get_deployment_mode(name, namespace, custom_api, apps_v1_api)
157
+
158
+ console.print(f"[bold blue]Checking {deployment_mode} service...[/bold blue]")
159
+
160
+ # 1. Deployment pod check
161
+ console.print("[bold blue]Checking deployment pod...[/bold blue]")
162
+ deploy_pods = validate_pods_exist(name, namespace, core_api)
163
+
164
+ if not deploy_pods:
165
+ if deployment_mode == "knative":
166
+ try:
167
+ # Check if the Knative service is marked as ready (e.g. scaled to zero)
168
+ service = custom_api.get_namespaced_custom_object(
169
+ group="serving.knative.dev",
170
+ version="v1",
171
+ namespace=namespace,
172
+ plural="services",
173
+ name=name,
174
+ )
175
+ conditions = service.get("status", {}).get("conditions", [])
176
+ ready = any(c.get("type") == "Ready" and c.get("status") == "True" for c in conditions)
177
+ if ready:
178
+ console.print(
179
+ f"[yellow]No deployment pods found. Service [bold]{name}[/bold] is scaled to zero but marked as 'READY'. "
180
+ "It will scale up on demand.[/yellow]"
181
+ )
182
+ return
183
+ else:
184
+ fail("Deployment pod not found and service is not READY.")
185
+
186
+ except Exception as e:
187
+ fail(f"Failed to check Knative service status: {e}")
188
+ else:
189
+ fail("No Deployment pods found.")
190
+
191
+ deploy_pod = next(
192
+ (p for p in deploy_pods if p.status.phase == "Running" and not p.metadata.deletion_timestamp),
193
+ None,
194
+ )
195
+ if not deploy_pod:
196
+ fail(
197
+ "No deployment pod in 'Running' state found.",
198
+ [p.metadata.name for p in deploy_pods],
199
+ )
200
+
201
+ deploy_pod_name = deploy_pod.metadata.name
202
+ if deploy_pod.status.phase != "Running":
203
+ fail(
204
+ f"Deployment pod not running (status: {deploy_pod.status.phase})",
205
+ [deploy_pod_name],
206
+ )
207
+
208
+ # 2. Rsync check
209
+ console.print("[bold blue]Checking rsync...[/bold blue]")
210
+ current_working_dir = "."
211
+ check_cmd = [
212
+ "kubectl",
213
+ "exec",
214
+ deploy_pod_name,
215
+ "-n",
216
+ namespace,
217
+ "--",
218
+ "ls",
219
+ "-l",
220
+ current_working_dir,
221
+ ]
222
+ try:
223
+ result = subprocess.run(check_cmd, capture_output=True, text=True, check=True)
224
+ lines = result.stdout.splitlines()
225
+ entries = [line for line in lines if not line.startswith("total")]
226
+ if not entries:
227
+ fail("Rsync directory exists but is empty.", [deploy_pod_name])
228
+ except subprocess.CalledProcessError as e:
229
+ fail(
230
+ f"Rsync directory check failed: {e.stderr or e.stdout}",
231
+ [deploy_pod_name],
232
+ )
233
+
234
+ # 3. Service call check
235
+ console.print("[bold blue]Checking service call...[/bold blue]")
236
+ try:
237
+ with port_forward_to_pod(
238
+ pod_name=deploy_pod_name,
239
+ namespace=namespace,
240
+ local_port=32300,
241
+ remote_port=32300,
242
+ ) as local_port:
243
+ url = f"http://localhost:{local_port}/health"
244
+ resp = httpx.get(url, timeout=10)
245
+ if not resp.is_success:
246
+ fail(
247
+ f"Service call failed: {resp.status_code} {resp.text}",
248
+ [deploy_pod_name],
249
+ )
250
+ except Exception as e:
251
+ fail(f"Service call check failed: {e}", [deploy_pod_name])
252
+
253
+ # 4. GPU + autoscaler test (if GPU requested)
254
+ gpu_requested = any(
255
+ c.resources.limits and "nvidia.com/gpu" in c.resources.limits for c in deploy_pod.spec.containers
256
+ )
257
+ if gpu_requested:
258
+ gpus_configured = False
259
+ console.print("[bold blue]Checking GPU plugin support...[/bold blue]")
260
+ nodes = core_api.list_node().items
261
+ for node in nodes:
262
+ gpus = node.status.capacity.get("nvidia.com/gpu")
263
+ if gpus and int(gpus) > 0:
264
+ gpus_configured = True
265
+ break
266
+
267
+ if not gpus_configured:
268
+ console.print(
269
+ "[yellow]No GPU nodes currently configured on the cluster, is autoscaling configured?[/yellow]"
270
+ )
271
+
272
+ dcgm_exporter = True
273
+ dcgm_namespace = globals.config.install_namespace
274
+
275
+ pods = core_api.list_namespaced_pod(
276
+ namespace=dcgm_namespace,
277
+ label_selector="app.kubernetes.io/name=dcgm-exporter",
278
+ ).items
279
+ if not pods:
280
+ dcgm_exporter = False
281
+
282
+ if not dcgm_exporter:
283
+ console.print(f"[yellow]DCGM exporter not found in namespace {dcgm_namespace}[/yellow]")
284
+
285
+ # 5. Check logs
286
+ if globals.config.stream_logs:
287
+ try:
288
+ streaming_enabled = core_api.read_namespaced_service(
289
+ name=serving_constants.LOKI_GATEWAY_SERVICE_NAME,
290
+ namespace=globals.config.install_namespace,
291
+ )
292
+ except ApiException:
293
+ streaming_enabled = False
294
+
295
+ if streaming_enabled:
296
+
297
+ console.print("[bold blue]Checking log streaming...[/bold blue]")
298
+ query = f'{{k8s_pod_name="{deploy_pod_name}", k8s_container_name="kubetorch"}}'
299
+ try:
300
+ logs = get_logs_from_loki(query=query, print_pod_name=False, timeout=5.0)
301
+ if logs is None:
302
+ fail("No logs found for service", [deploy_pod_name])
303
+
304
+ except Exception as e:
305
+ fail(f"Logs check failed: {e}", [deploy_pod_name])
306
+
307
+ console.print("[bold green]✓ All service checks passed[/bold green]")
308
+
309
+ except typer.Exit:
310
+ # Just re-raise, don't print
311
+ raise
312
+
313
+
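The readiness fallback above keys off the Knative Service's `Ready` condition. A minimal standalone sketch of that check, assuming a local kubeconfig with cluster access; the namespace and service name are placeholders:

from kubernetes import client, config

config.load_kube_config()  # assumes a reachable cluster via the local kubeconfig
custom_api = client.CustomObjectsApi()

svc = custom_api.get_namespaced_custom_object(
    group="serving.knative.dev",
    version="v1",
    namespace="default",   # placeholder namespace
    plural="services",
    name="my-service",     # placeholder Knative Service name
)
conditions = svc.get("status", {}).get("conditions", [])
ready = any(c.get("type") == "Ready" and c.get("status") == "True" for c in conditions)
print("Ready" if ready else "Not ready")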
314
+ @app.command("config")
315
+ def kt_config(
316
+ action: str = typer.Argument(default="", help="Action to perform (set, unset, get, list)"),
317
+ key: str = typer.Argument(None, help="Config key (e.g., 'username')", callback=validate_config_key),
318
+ value: str = typer.Argument(None, help="Value to set"),
319
+ ):
320
+ """Manage Kubetorch configuration settings.
321
+
322
+ Examples:
323
+
324
+ .. code-block:: bash
325
+
326
+ $ kt config set username johndoe
327
+
328
+ $ kt config set volumes "volume_name_one, volume_name_two"
329
+
330
+ $ kt config set volumes volume_name_one
331
+
332
+ $ kt config unset username
333
+
334
+ $ kt config get username
335
+
336
+ $ kt config list
337
+ """
338
+ from kubetorch import config
339
+
340
+ if action == "set":
341
+ if not key or not value:
342
+ console.print("[red]Both key and value are required for 'set'[/red]")
343
+ raise typer.Exit(1)
344
+
345
+ try:
346
+ value = config.set(key, value) # validate value
347
+ config.write({key: value})
348
+ console.print(f"[green]{key} set to:[/green] [blue]{value}[/blue]")
349
+ except ValueError as e:
350
+ console.print(f"[red]Error setting {key}:[/red] {str(e)}")
351
+ raise typer.Exit(1)
352
+
353
+ elif action == "unset":
354
+ if not key:
355
+ console.print("[red]Key is required for 'unset'[/red]")
356
+ raise typer.Exit(1)
357
+
358
+ try:
359
+ config.set(key, None)
360
+ config.write({key: None})
361
+ console.print(f"[green]{key.capitalize()} unset[/green]")
362
+ except ValueError as e:
363
+ console.print(f"[red]Error unsetting {key}:[/red] {str(e)}")
364
+ raise typer.Exit(1)
365
+
366
+ elif action == "get":
367
+ if not key:
368
+ # Error panel
369
+ console.print("[red]Key is required for 'get'[/red]")
370
+ raise typer.Exit(1)
371
+
372
+ if key in ENV_MAPPINGS:
373
+ value = config.get(key)
374
+ if value:
375
+ console.print(f"[blue]{value}[/blue]")
376
+ else:
377
+ console.print(f"[yellow]{key.capitalize()} not set[/yellow]")
378
+ else:
379
+ console.print(f"[red]Unknown config key:[/red] [bold]{key}[/bold]")
380
+ raise typer.Exit(1)
381
+
382
+ elif action == "list" or not action:
383
+ console.print(dict(config))
384
+
385
+ else:
386
+ console.print(f"[red]Unknown action:[/red] [bold]{action}[/bold]")
387
+ console.print("\nValid actions are: set, get, list")
388
+ raise typer.Exit(1)
389
+
390
+
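For quick experimentation, the command above can also be exercised in-process with typer's test runner; a small sketch, assuming kubetorch and its CLI extras are installed:

from typer.testing import CliRunner

from kubetorch.cli import app

runner = CliRunner()
result = runner.invoke(app, ["config", "list"])  # same as `kt config list`
print(result.exit_code, result.output)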
391
+ @app.command("debug")
392
+ def kt_debug(
393
+ pod: str = typer.Argument(..., help="Pod name"),
394
+ namespace: str = typer.Option(
395
+ globals.config.namespace,
396
+ "-n",
397
+ "--namespace",
398
+ ),
399
+ port: int = typer.Option(DEFAULT_DEBUG_PORT, help="Debug port used for remote debug server"),
400
+ ):
401
+ """Start an interactive debugging session on the pod, which will connect to the debug server inside the service.
402
+ Before running this command, you must call a method on the service with pdb=True or add a
403
+ kt.deep_breakpoint() call into your code to enable debugging.
404
+ """
405
+ import webbrowser
406
+
407
+ if is_running_in_kubernetes():
408
+ console.print(
409
+ "[red]Debugging is not supported when running inside Kubernetes. Please run this command locally.[/red]"
410
+ )
411
+ raise typer.Exit(1)
412
+
413
+ # Use the base path of web-pdb server as health endpoint because we're port-forwarding straight into the pod
414
+ with port_forward_to_pod(
415
+ namespace=namespace,
416
+ pod_name=pod,
417
+ local_port=port,
418
+ remote_port=port,
419
+ health_endpoint="/",
420
+ ):
421
+ debug_ui_url = f"http://localhost:{port}"
422
+ console.print(f"Opening debug UI at [blue]{debug_ui_url}[/blue]")
423
+ webbrowser.open(debug_ui_url)
424
+ # Wait for the user to finish debugging
425
+ console.print("[yellow]Press Ctrl+C to stop the debugging session and close the UI.[/yellow]")
426
+ # Wait for a Ctrl+C to exit the debug session
427
+ try:
428
+ while True:
429
+ time.sleep(1)
430
+ except KeyboardInterrupt:
431
+ console.print("\n[yellow]Debugging session ended.[/yellow]")
432
+ raise typer.Exit(0)
433
+
434
+
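As the docstring notes, the debug server must already be running inside the service before `kt debug` is used; a sketch of the service-side half, where the surrounding function is purely illustrative:

import kubetorch as kt

def train_step(batch):      # illustrative function name
    kt.deep_breakpoint()    # enables the in-pod debug session that `kt debug` attaches to (per the docstring above)
    return sum(batch)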
435
+ @app.command("deploy")
436
+ def kt_deploy(
437
+ target: str = typer.Argument(
438
+ ...,
439
+ help="Python module or file to deploy, optionally followed by a "
440
+ "single function or class to deploy. e.e. `my_module:my_cls`, or "
441
+ "`my_file.py`.",
442
+ ),
443
+ ):
444
+ """Deploy a Python file or module to Kubetorch. This will deploy all functions and modules decorated with
445
+ @kt.compute in the file or module."""
446
+ from kubetorch.resources.compute.utils import _collect_modules
447
+
448
+ os.environ["KT_CLI_DEPLOY_MODE"] = "1"
449
+ to_deploy, target_fn_or_class = _collect_modules(target)
450
+
451
+ if not target_fn_or_class:
452
+ console.print(f"Found the following functions and classes to deploy in {target}:")
453
+ for module in to_deploy:
454
+ console.print(f"{BULLET_UNICODE} {module.name}")
455
+
456
+ import asyncio
457
+
458
+ async def deploy_all_async():
459
+ tasks = []
460
+ for module in to_deploy:
461
+ console.print(f"Deploying {module.name}...")
462
+ tasks.append(module.deploy_async())
463
+
464
+ try:
465
+ await asyncio.gather(*tasks)
466
+ for module in to_deploy:
467
+ console.print(f"Successfully deployed {module.name}.")
468
+ except Exception as e:
469
+ console.print(f"Failed to deploy one or more modules: {e}")
470
+ raise e
471
+
472
+ asyncio.run(deploy_all_async())
473
+
474
+ if not target_fn_or_class:
475
+ console.print(f"Successfully deployed functions and modules from {target}.")
476
+
477
+
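A sketch of a module that `kt deploy my_module.py` could discover, based on the docstring above; the decorator arguments shown are assumptions, not the package's documented signature:

import kubetorch as kt

@kt.compute(cpus="1")   # assumed arguments; `@kt.compute` per the docstring above
def preprocess(x):
    return [v * 2 for v in x]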
478
+ @app.command("describe")
479
+ def kt_describe(
480
+ name: str = service_name_argument(help="Service name"),
481
+ namespace: str = typer.Option(
482
+ globals.config.namespace,
483
+ "-n",
484
+ "--namespace",
485
+ ),
486
+ ):
487
+ """
488
+ Show basic info for calling the service depending on whether an ingress is configured.
489
+ """
490
+
491
+ core_api, custom_api, apps_v1_api = initialize_k8s_clients()
492
+
493
+ endpoint_placeholder = "METHOD_OR_CLS_NAME"
494
+ args_placeholder = []
495
+
496
+ try:
497
+ name, deployment_mode = get_deployment_mode(name, namespace, custom_api, apps_v1_api)
498
+ except ApiException:
499
+ console.print(f"[red] Failed to load service '{name}' in namespace '{namespace}'[/red]")
500
+ raise typer.Exit(1)
501
+
502
+ try:
503
+ console.print()
504
+ base_url = globals.config.api_url
505
+
506
+ ingress = load_ingress()
507
+ host = get_ingress_host(ingress) if ingress else f"{name}.{namespace}.svc.cluster.local"
508
+
509
+ if not base_url:
510
+ if not ingress:
511
+ console.print("[yellow]No ingress found. Service is only accessible from inside the cluster.[/yellow]")
512
+ base_url = f"http://{name}.{namespace}.svc.cluster.local"
513
+ else:
514
+ lb_ing = (
515
+ ingress.status.load_balancer.ingress[0]
516
+ if (ingress.status and ingress.status.load_balancer and ingress.status.load_balancer.ingress)
517
+ else None
518
+ )
519
+
520
+ address = lb_ing.hostname or lb_ing.ip if lb_ing else None
521
+ if address:
522
+ base_url = f"http://{address}"
523
+ else:
524
+ console.print("[yellow]Ingress found but no address, falling back to cluster-local.[/yellow]")
525
+ base_url = f"http://{name}.{namespace}.svc.cluster.local"
526
+ else:
527
+ parsed = urlparse(base_url)
528
+ if not parsed.scheme:
529
+ base_url = f"http://{base_url}"
530
+
531
+ if ingress:
532
+ console.print(f"[bold]Host:[/bold] [green]{name}[/green]")
533
+
534
+ vpc_only = is_ingress_vpc_only(ingress.metadata.annotations)
535
+ if vpc_only:
536
+ console.print()
537
+ console.print("[yellow]Note: This is a VPC-only ingress (internal access only)[/yellow]")
538
+
539
+ console.print()
540
+
541
+ if ingress:
542
+ console.print("[bold]Calling the service using an ingress:[/bold]\n")
543
+ # With ingress, use the full path structure
544
+ service_path = f"/{namespace}/{name}/{endpoint_placeholder}"
545
+ else:
546
+ console.print("[bold]Calling the service from inside the cluster:[/bold]\n")
547
+ service_path = f"/{endpoint_placeholder}"
548
+
549
+ curl_code = textwrap.dedent(
550
+ f"""\
551
+ curl -X POST \\
552
+ -H "Content-Type: application/json" \\
553
+ -d '{{"args": {args_placeholder}, "kwargs": {{}}}}' \\
554
+ {base_url}{service_path}
555
+ """
556
+ )
557
+ # Only add Host header if we have ingress
558
+ if ingress:
559
+ curl_code = curl_code.replace(
560
+ '-H "Content-Type: application/json"',
561
+ f'-H "Host: {host}" \\\n -H "Content-Type: application/json"',
562
+ )
563
+
564
+ console.print(Panel(Syntax(curl_code, "bash"), title="With Curl", border_style="green"))
565
+ console.print()
566
+
567
+ python_code = textwrap.dedent(
568
+ f"""\
569
+ import requests
570
+
571
+ url = "{base_url}{service_path}"
572
+ headers = {{
573
+ "Content-Type": "application/json"
574
+ }}
575
+ data = {{
576
+ "args": {args_placeholder},
577
+ "kwargs": {{}}
578
+ }}
579
+
580
+ response = requests.post(url, headers=headers, json=data)
581
+ print(response.json())
582
+ """
583
+ )
584
+ if ingress:
585
+ python_code = python_code.replace(
586
+ '"Content-Type": "application/json"',
587
+ f'"Host": "{host}",\n "Content-Type": "application/json"',
588
+ )
589
+ console.print(
590
+ Panel(
591
+ Syntax(python_code, "python"),
592
+ title="With Python",
593
+ border_style="green",
594
+ )
595
+ )
596
+ except Exception as e:
597
+ console.print(
598
+ f"[red]Failed to describe service {name} in namespace {namespace}: {e}[/red]",
599
+ )
600
+ raise typer.Exit(1)
601
+
602
+
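The generated Python snippet above uses `requests`; an equivalent sketch with `httpx` (already imported by this module), where every address and name is a placeholder:

import httpx

resp = httpx.post(
    "http://<ingress-address>/<namespace>/<service>/<method>",  # placeholders
    headers={"Host": "<ingress-host>", "Content-Type": "application/json"},
    json={"args": [], "kwargs": {}},
    timeout=30,
)
print(resp.json())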
603
+ @app.command("list")
604
+ def kt_list(
605
+ namespace: str = typer.Option(
606
+ globals.config.namespace,
607
+ "-n",
608
+ "--namespace",
609
+ ),
610
+ sort_by_updated: bool = typer.Option(False, "-s", "--sort", help="Sort by last update time"),
611
+ tag: str = typer.Option(
612
+ None,
613
+ "-t",
614
+ "--tag",
615
+ help="Service tag or prefix (ex: 'myusername', 'some-git-branch').",
616
+ ),
617
+ ):
618
+ """List all Kubetorch services.
619
+
620
+ Examples:
621
+
622
+ .. code-block:: bash
623
+
624
+ $ kt list
625
+
626
+ $ kt list -t dev-branch
627
+ """
628
+ core_api, custom_api, _ = initialize_k8s_clients()
629
+
630
+ # Import here to avoid circular imports
631
+ from kubetorch.serving.service_manager import BaseServiceManager
632
+
633
+ try:
634
+ # Use unified service discovery
635
+ unified_services = BaseServiceManager.discover_services_static(namespace=namespace, name_filter=tag)
636
+
637
+ if not unified_services:
638
+ console.print(f"[yellow]No services found in {namespace} namespace[/yellow]")
639
+ return
640
+
641
+ # Optional second-level tag filtering
642
+ if tag:
643
+ unified_services = [
644
+ svc
645
+ for svc in unified_services
646
+ if tag in svc["name"]
647
+ or tag in " ".join(str(v) for v in svc["resource"].get("metadata", {}).get("labels", {}).values())
648
+ ]
649
+ if not unified_services:
650
+ console.print(f"[yellow]No services found in {namespace} namespace[/yellow]")
651
+ return
652
+
653
+ if sort_by_updated:
654
+
655
+ def get_update_time(svc):
656
+ # If not a ksvc, use creation timestamp as proxy for update time
657
+ return (
658
+ get_last_updated(svc["resource"]) if svc["template_type"] == "ksvc" else svc["creation_timestamp"]
659
+ )
660
+
661
+ unified_services.sort(key=get_update_time, reverse=True)
662
+
663
+ try:
664
+ pods = core_api.list_namespaced_pod(
665
+ namespace=namespace, label_selector=f"{serving_constants.KT_SERVICE_LABEL}"
666
+ )
667
+ except client.exceptions.ApiException as e:
668
+ logger.warning(f"Failed to list pods for all services in namespace {namespace}: {e}")
669
+ return
670
+ pod_map = {
671
+ svc["name"]: [
672
+ pod for pod in pods.items if pod.metadata.labels.get(serving_constants.KT_SERVICE_LABEL) == svc["name"]
673
+ ]
674
+ for svc in unified_services
675
+ }
676
+
677
+ # Create table
678
+ table_columns = [
679
+ ("SERVICE", "cyan"),
680
+ ("TYPE", "magenta"),
681
+ ("STATUS", "green"),
682
+ ("# OF PODS", "yellow"),
683
+ ("POD NAMES", "red"),
684
+ ("VOLUMES", "blue"),
685
+ ("LAST STATUS CHANGE", "yellow"),
686
+ ("TTL", "yellow"),
687
+ ("CREATOR", "yellow"),
688
+ ("QUEUE", "yellow"),
689
+ ("CPUs", "yellow"),
690
+ ("MEMORY", "yellow"),
691
+ ("GPUs", "yellow"),
692
+ ]
693
+ table = create_table_for_output(
694
+ columns=table_columns,
695
+ no_wrap_columns_names=["SERVICE"],
696
+ header_style={"bold": False},
697
+ )
698
+
699
+ for svc in unified_services:
700
+ name = svc["name"]
701
+ kind = svc["template_type"]
702
+ res = svc["resource"]
703
+ meta = res.get("metadata", {})
704
+ labels = meta.get("labels", {})
705
+ annotations = meta.get("annotations", {})
706
+ status_data = res.get("status", {})
707
+
708
+ # Get pods
709
+ pods = pod_map.get(name, [])
710
+
711
+ creation_ts = meta.get("creationTimestamp", None)
712
+ timestamp = (
713
+ datetime.fromisoformat(creation_ts.replace("Z", "+00:00")).strftime("%Y-%m-%d %H:%M:%S")
714
+ if creation_ts
715
+ else "Unknown"
716
+ )
717
+ ttl = annotations.get(serving_constants.INACTIVITY_TTL_ANNOTATION, "None")
718
+ creator = labels.get(serving_constants.KT_USERNAME_LABEL, "—")
719
+
720
+ volumes_display = load_kubetorch_volumes_from_pods(pods)
721
+
722
+ # Get resources from revision
723
+ cpu = memory = gpu = None
724
+ if kind == "ksvc":
725
+ cond = status_data.get("conditions", [{}])[0]
726
+ status = cond.get("status")
727
+ display_status = {
728
+ "True": "[green]Ready[/green]",
729
+ "Unknown": "[yellow]Creating[/yellow]",
730
+ }.get(status, "[red]Failed[/red]")
731
+ rev_name = status_data.get("latestCreatedRevisionName")
732
+ if rev_name:
733
+ try:
734
+ rev = custom_api.get_namespaced_custom_object(
735
+ group="serving.knative.dev",
736
+ version="v1",
737
+ namespace=namespace,
738
+ plural="revisions",
739
+ name=rev_name,
740
+ )
741
+ container = rev["spec"]["containers"][0]
742
+ reqs = container.get("resources", {}).get("requests", {})
743
+ cpu = reqs.get("cpu")
744
+ memory = reqs.get("memory")
745
+ gpu = reqs.get("nvidia.com/gpu") or reqs.get("gpu")
746
+ except Exception as e:
747
+ logger.warning(f"Could not get revision for {name}: {e}")
748
+ else:
749
+ # Process Deployment - now using consistent dict access
750
+ ready = res.get("status", {}).get("readyReplicas", 0) or 0
751
+ desired = res.get("spec", {}).get("replicas", 0) or 0
752
+ if kind == "raycluster":
753
+ state = status_data.get("state", "").lower()
754
+ conditions = {c["type"]: c["status"] for c in status_data.get("conditions", [])}
755
+ if (
756
+ state == "ready"
757
+ and conditions.get("HeadPodReady") == "True"
758
+ and conditions.get("RayClusterProvisioned") == "True"
759
+ ):
760
+ display_status = "[green]Ready[/green]"
761
+ elif state in ("creating", "upscaling", "restarting", "updating"):
762
+ display_status = "[yellow]Scaling[/yellow]"
763
+ else:
764
+ display_status = "[red]Failed[/red]"
765
+ else:
766
+ display_status = (
767
+ "[green]Ready[/green]"
768
+ if ready == desired and desired > 0
769
+ else "[yellow]Scaling[/yellow]"
770
+ if ready < desired
771
+ else "[red]Failed[/red]"
772
+ )
773
+ try:
774
+ container = res.get("spec", {}).get("template", {}).get("spec", {}).get("containers", [{}])[0]
775
+ reqs = container.get("resources", {}).get("requests", {})
776
+ cpu = reqs.get("cpu")
777
+ memory = reqs.get("memory")
778
+ gpu = reqs.get("nvidia.com/gpu") or reqs.get("gpu")
779
+ except Exception as e:
780
+ logger.warning(f"Failed to get resources for {name} in namespace {namespace}: {e}")
781
+
782
+ # Common pod processing
783
+ pod_lines = []
784
+ queue = "—"
785
+ for pod in pods:
786
+ pod_status = pod.status.phase
787
+ ready = all(c.ready for c in (pod.status.container_statuses or []))
788
+ if ready and pod_status == "Running":
789
+ color = "green"
790
+ elif "Creating" in display_status or "Scaling" in display_status:
791
+ color = "yellow"
792
+ else:
793
+ color = "red"
794
+ pod_lines.append(f"[{color}]{pod.metadata.name}[/{color}]")
795
+ queue = pod.metadata.labels.get(serving_constants.KAI_SCHEDULER_LABEL, queue)
796
+
797
+ # Update service status if pod is pending
798
+ if pod_status == "Pending":
799
+ display_status = "[yellow]Pending[/yellow]"
800
+
801
+ table.add_row(
802
+ name,
803
+ f"[magenta]{kind}[/magenta]",
804
+ display_status,
805
+ str(len(pods)),
806
+ "\n".join(pod_lines),
807
+ "\n".join(volumes_display) or "-",
808
+ timestamp,
809
+ ttl,
810
+ creator,
811
+ queue,
812
+ cpu or "—",
813
+ memory or "—",
814
+ gpu or "—",
815
+ )
816
+
817
+ table.pad_bottom = 1
818
+ console.print(table)
819
+
820
+ except ApiException as e:
821
+ console.print(f"[red]Kubernetes API error: {e}[/red]")
822
+ raise typer.Exit(1)
823
+
824
+
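A rough sketch of the underlying discovery for Knative-backed services, assuming Knative Serving is installed and the local kubeconfig grants access; the namespace is a placeholder:

from kubernetes import client, config

config.load_kube_config()
custom_api = client.CustomObjectsApi()

ksvcs = custom_api.list_namespaced_custom_object(
    group="serving.knative.dev",
    version="v1",
    namespace="default",
    plural="services",
)
for item in ksvcs.get("items", []):
    print(item["metadata"]["name"])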
825
+ @app.command("port-forward")
826
+ def kt_port_forward(
827
+ name: str = service_name_argument(help="Service name"),
828
+ local_port: int = typer.Argument(default=serving_constants.DEFAULT_KT_SERVER_PORT, help="Local port to bind to"),
829
+ remote_port: int = typer.Argument(
830
+ default=serving_constants.DEFAULT_KT_SERVER_PORT,
831
+ help="Remote port to forward to",
832
+ ),
833
+ namespace: str = typer.Option(
834
+ globals.config.namespace,
835
+ "-n",
836
+ "--namespace",
837
+ ),
838
+ pod: str = typer.Option(
839
+ None,
840
+ "-p",
841
+ "--pod",
842
+ help="Name or index of a specific pod to load logs from (0-based)",
843
+ ),
844
+ ):
845
+ """
846
+ Port forward a local port to the specified Kubetorch service.
847
+
848
+ Examples:
849
+
850
+ .. code-block:: bash
851
+
852
+ $ kt port-forward my-service
853
+
854
+ $ kt port-forward my-service 32300
855
+
856
+ $ kt port-forward my-service -n custom-namespace
857
+
858
+ $ kt port-forward my-service -p my-pod
859
+
860
+ This allows you to access the service locally using `curl http://localhost:<port>`.
861
+ """
862
+
863
+ from kubetorch.resources.compute.utils import is_port_available
864
+
865
+ if not is_port_available(local_port):
866
+ console.print(f"\n[red]Local port {local_port} is already in use.[/red]")
867
+ raise typer.Exit(1)
868
+
869
+ core_api, custom_api, apps_v1_api = initialize_k8s_clients()
870
+
871
+ name, _ = get_deployment_mode(name, namespace, custom_api, apps_v1_api)
872
+ pods = validate_pods_exist(name, namespace, core_api)
873
+ sorted_by_time = sorted(pods, key=lambda pod: pod.metadata.creation_timestamp)
874
+
875
+ if pod: # case when the user provides a pod
876
+ pod_name = validate_provided_pod(service_name=name, provided_pod=pod, service_pods=sorted_by_time)
877
+ else: # if user does not provide pod, port-forward to the first pod by default
878
+ pod_name = sorted_by_time[0].metadata.name
879
+
880
+ process = None
881
+
882
+ def cleanup_process():
883
+ # Clean up the port forward process
884
+ if process:
885
+ process.kill()
886
+
887
+ def signal_handler(signum, frame):
888
+ """Handle interrupt signals for graceful shutdown."""
889
+ console.print(f"\nReceived signal {signum}, cleaning up port forward...")
890
+ cleanup_process()
891
+ console.print("Port forward stopped.")
892
+ raise typer.Exit(0)
893
+
894
+ # Register signal handlers for graceful shutdown
895
+ signal.signal(signal.SIGINT, signal_handler)
896
+ signal.signal(signal.SIGTERM, signal_handler)
897
+
898
+ from kubetorch.serving.utils import wait_for_port_forward
899
+
900
+ cmd = [
901
+ "kubectl",
902
+ "port-forward",
903
+ f"pod/{pod_name}",
904
+ f"{local_port}:{remote_port}",
905
+ "--namespace",
906
+ namespace,
907
+ ]
908
+
909
+ port_forward_msg = f"Starting port forward to {name} in namespace {namespace}"
910
+
911
+ if pod:
912
+ port_forward_msg = port_forward_msg + f", pod: [reset]{pod}"
913
+ console.print(port_forward_msg)
914
+
915
+ process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, start_new_session=True)
916
+
917
+ try:
918
+ wait_for_port_forward(process, local_port)
919
+ time.sleep(2)
920
+ except Exception as e:
921
+ logger.info(f"Failed to establish port forward on port {local_port}: {e}")
922
+ if process:
923
+ cleanup_process()
924
+ process = None
925
+ return
926
+
927
+ console.print(f"[green]✓ Port forward active on localhost:{local_port} -> {pod_name}:{remote_port}[/green]")
928
+ console.print(f"[cyan]You can now run: curl http://localhost:{local_port}[/cyan]")
929
+ console.print("[dim]Press Ctrl+C to stop the port forward[/dim]")
930
+
931
+ # Keep the port forward running until interrupted
932
+ try:
933
+ while True:
934
+ if process.poll() is not None:
935
+ # Process has terminated
936
+ console.print("[red]Port forward process has terminated unexpectedly[/red]")
937
+ break
938
+ time.sleep(1)
939
+ except KeyboardInterrupt:
940
+ # This should be handled by the signal handler, but just in case
941
+ pass
942
+
943
+ except typer.Exit:
944
+ # Re-raise typer.Exit to maintain proper CLI behavior
945
+ raise
946
+ except Exception as e:
947
+ console.print(f"[red]Error during port forwarding: {e}[/red]")
948
+ raise typer.Exit(1)
949
+ finally:
950
+ cleanup_process()
951
+
952
+
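Once the forward is active, the service can be probed locally; a sketch using the same `/health` endpoint and default port (32300) seen in the check command above:

import httpx

resp = httpx.get("http://localhost:32300/health", timeout=10)
print(resp.status_code, resp.text)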
953
+ @app.command("run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True})
954
+ def kt_run(
955
+ ctx: typer.Context,
956
+ name: str = typer.Option(None, "--name", help="Name for the run"),
957
+ run_async: bool = typer.Option(False, "--async", help="Whether to run async and not stream logs live"),
958
+ file: str = typer.Option(None, "--file", help="File where the app is defined"),
959
+ ):
960
+ """
961
+ Build and deploy a kubetorch app that runs the provided CLI command. In order for the app
962
+ to be deployed, the file being run must be a Python file specifying a `kt.app` construction
963
+ at the top of the file.
964
+
965
+ Examples:
966
+
967
+ .. code-block:: bash
968
+
969
+ $ kt run python train.py --epochs 5
970
+ $ kt run fastapi run my_app.py --name fastapi-app
971
+ """
972
+ from kubetorch import App
973
+
974
+ cli_cmd = " ".join(ctx.args)
975
+ if not cli_cmd:
976
+ raise typer.BadParameter("You must provide a command to run.")
977
+ elif cli_cmd.split()[0].endswith(".py"):
978
+ raise typer.BadParameter(
979
+ "You must provide a full command to run, the Python file should not be the first argument. "
980
+ "(e.g. `kt run python train.py`)"
981
+ )
982
+
983
+ python_file = file
984
+ if not python_file:
985
+ for arg in cli_cmd.split():
986
+ if arg.endswith("py") and Path(arg).exists():
987
+ python_file = arg
988
+ break
989
+
990
+ if not python_file:
991
+ console.print(
992
+ f"[red]Could not detect python file with `kt.app` in {cli_cmd}. Pass it in with `--file`.[/red]"
993
+ )
994
+ raise typer.Exit(1)
995
+
996
+ # Set env vars for construction of app instance
997
+ os.environ["KT_RUN"] = "1"
998
+ os.environ["KT_RUN_CMD"] = cli_cmd
999
+ os.environ["KT_RUN_FILE"] = python_file
1000
+ if name:
1001
+ os.environ["KT_RUN_NAME"] = name
1002
+ if run_async:
1003
+ os.environ["KT_RUN_ASYNC"] = "1"
1004
+
1005
+ # Extract the app instance from the python file
1006
+ module_name = Path(python_file).stem
1007
+ python_file_dir = Path(python_file).resolve().parent
1008
+
1009
+ # Add the directory containing the Python file to sys.path to support relative imports
1010
+ if str(python_file_dir) not in sys.path:
1011
+ sys.path.insert(0, str(python_file_dir))
1012
+
1013
+ spec = importlib.util.spec_from_file_location(module_name, python_file)
1014
+ module = importlib.util.module_from_spec(spec)
1015
+ sys.modules[module_name] = module
1016
+ spec.loader.exec_module(module)
1017
+
1018
+ app_instance = None
1019
+ for _, obj in inspect.getmembers(module):
1020
+ if isinstance(obj, App):
1021
+ app_instance = obj
1022
+ break
1023
+ if not app_instance:
1024
+ console.print(f"[red]Could not find kt.app definition in {python_file} [/red]")
1025
+ raise typer.Exit(1)
1026
+
1027
+ app_instance.deploy()
1028
+
1029
+
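A sketch of a script that `kt run python train.py --epochs 5` could target, with a `kt.app` construction at the top of the file as the docstring requires; the constructor arguments are assumptions:

import argparse

import kubetorch as kt

app = kt.app(name="train-job")  # assumed constructor arguments

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=1)
    args = parser.parse_args()
    print(f"training for {args.epochs} epochs")

if __name__ == "__main__":
    main()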
1030
+ @app.command("secrets")
1031
+ def kt_secrets(
1032
+ action: SecretAction = typer.Argument(
1033
+ SecretAction.list,
1034
+ help="Action to perform: list, create, update, delete, describe",
1035
+ ),
1036
+ name: str = typer.Argument(None, help="Secret name (for create or delete actions)"),
1037
+ prefix: str = typer.Option(
1038
+ None,
1039
+ "--prefix",
1040
+ "-x",
1041
+ ),
1042
+ namespace: str = typer.Option(
1043
+ "default",
1044
+ "-n",
1045
+ "--namespace",
1046
+ ),
1047
+ all_namespaces: bool = typer.Option(
1048
+ False,
1049
+ "--all-namespaces",
1050
+ "-A",
1051
+ ),
1052
+ yes: bool = typer.Option(False, "-y", "--yes", help="Deletion confirmation"),
1053
+ path: str = typer.Option(None, "--path", "-p", help="Path where the secret values are held"),
1054
+ provider: str = typer.Option(
1055
+ None,
1056
+ "--provider",
1057
+ "-c",
1058
+ help="Provider corresponding to the secret (e.g. 'aws', 'gcp'). "
1059
+ "If not specified, secrets are loaded from the default provider path.",
1060
+ ),
1061
+ env_vars: List[str] = typer.Option(
1062
+ None,
1063
+ "--env-vars",
1064
+ "-v",
1065
+ help="Environment variable(s) key(s) whose value(s) will hold the secret value(s)",
1066
+ ),
1067
+ show_values: bool = typer.Option(False, "-s", "--show", help="Show secrets values in the describe output"),
1068
+ ):
1069
+ """Manage secrets used in Kubetorch services.
1070
+
1071
+ Examples:
1072
+
1073
+ .. code-block:: bash
1074
+
1075
+ $ kt secrets # list secrets in the default namespace
1076
+
1077
+ $ kt secrets list -n my_namespace # list secrets in `my_namespace` namespace
1078
+
1079
+ $ kt secrets -A # list secrets in all namespaces
1080
+
1081
+ $ kt secrets create --provider aws # create a secret with the aws credentials in `default` namespace
1082
+
1083
+ $ kt secrets create my_secret -v ENV_VAR_1 -v ENV_VAR_2 -n my_namespace # create a secret using env vars
1084
+
1085
+ $ kt secrets delete my_secret -n my_namespace # delete a secret called `my_secret` from `my_namespace` namespace
1086
+
1087
+ $ kt secrets delete aws # delete a secret called `aws` from `default` namespace
1088
+ """
1089
+ import kubetorch as kt
1090
+ from kubetorch.resources.compute.utils import delete_secrets, list_secrets
1091
+ from kubetorch.resources.secrets.kubernetes_secrets_client import KubernetesSecretsClient
1092
+
1093
+ secrets_client = KubernetesSecretsClient(namespace=namespace)
1094
+
1095
+ core_api, custom_api, apps_v1_api = initialize_k8s_clients()
1096
+
1097
+ if action == SecretAction.list:
1098
+ secrets = list_secrets(
1099
+ core_api=core_api,
1100
+ namespace=namespace,
1101
+ prefix=prefix,
1102
+ all_namespaces=all_namespaces,
1103
+ console=console,
1104
+ filter_by_creator=False,
1105
+ )
1106
+
1107
+ table_columns = [
1108
+ ("SECRET", "blue"),
1109
+ ("CREATOR", "cyan"),
1110
+ ("NAMESPACE", "yellow"),
1111
+ ]
1112
+ table = create_table_for_output(
1113
+ columns=table_columns,
1114
+ no_wrap_columns_names=["SECRET"],
1115
+ header_style={"bold": True},
1116
+ )
1117
+
1118
+ if not secrets:
1119
+ msg = "No secrets found"
1120
+ if not all_namespaces:
1121
+ if prefix:
1122
+ msg += f" with prefix: {prefix}"
1123
+ msg += f" in namespace: {namespace}"
1124
+ console.print(f"[yellow]{msg}[/yellow]")
1125
+ raise typer.Exit(0)
1126
+
1127
+ for secret in secrets:
1128
+ secret_name = secret.get(
1129
+ "user_defined_name"
1130
+ ) # TODO: maybe display the kt name? so it'll match kubectl get secrets
1131
+ creator = secret.get("username")
1132
+ namespace = secret.get("namespace")
1133
+ table.add_row(secret_name, creator, namespace)
1134
+
1135
+ table.pad_bottom = 1
1136
+ console.print(table)
1137
+
1138
+ elif action == SecretAction.create:
1139
+ if not (name or provider):
1140
+ console.print("[red]Cannot create secret: name or provider must be specified.[/red]")
1141
+ raise typer.Exit(1)
1142
+ env_vars_dict = {key: key for key in env_vars} if env_vars else {}
1143
+
1144
+ try:
1145
+ new_secret = kt.secret(name=name, provider=provider, path=path, env_vars=env_vars_dict)
1146
+ secrets_client.create_secret(secret=new_secret, console=console)
1147
+ except Exception as e:
1148
+ console.print(f"[red]Failed to create the secret: {e}[/red]")
1149
+ raise typer.Exit(1)
1150
+
1151
+ elif action == SecretAction.delete:
1152
+ prefix = name if name else prefix
1153
+ all_namespaces = False if name else all_namespaces
1154
+ secrets_to_delete = list_secrets(
1155
+ core_api=core_api,
1156
+ namespace=namespace,
1157
+ prefix=prefix,
1158
+ all_namespaces=all_namespaces,
1159
+ console=console,
1160
+ )
1161
+
1162
+ username = globals.config.username
1163
+ secrets_to_delete_by_namespace: dict[str, list[str]] = {}
1164
+ for secret in secrets_to_delete:
1165
+ ns = secret.get("namespace")
1166
+ name = secret.get("name")
1167
+
1168
+ if all_namespaces:
1169
+ if secret.get("username") != username:
1170
+ continue # skip secrets not owned by user
1171
+
1172
+ secrets_to_delete_by_namespace.setdefault(ns, []).append(name)
1173
+
1174
+ # Flatten names for display
1175
+ secrets_names = [name for names in secrets_to_delete_by_namespace.values() for name in names]
1176
+
1177
+ if not secrets_names:
1178
+ console.print(f"[yellow]No secrets to delete for username: {username}[/yellow]")
1179
+ raise typer.Exit(0)
1180
+
1181
+ secrets_word = "secret" if len(secrets_names) == 1 else "secrets"
1182
+ console.print(f"\nDeleting {len(secrets_names)} {secrets_word}...")
1183
+
1184
+ for secret in secrets_names:
1185
+ console.print(f" - [blue]{secret}[/blue]")
1186
+
1187
+ if not yes:
1188
+ confirm = typer.confirm("\nDo you want to proceed?")
1189
+ if not confirm:
1190
+ console.print("[yellow]Operation cancelled[/yellow]")
1191
+ raise typer.Exit(0)
1192
+
1193
+ for ns, secrets in secrets_to_delete_by_namespace.items():
1194
+ if secrets:
1195
+ client = KubernetesSecretsClient(namespace=ns)
1196
+ delete_secrets(
1197
+ secrets=secrets,
1198
+ console=console,
1199
+ secrets_client=client,
1200
+ )
1201
+
1202
+ elif action == SecretAction.describe:
1203
+ prefix = name if name else prefix
1204
+ all_namespaces = False if name else all_namespaces
1205
+ secrets_to_describe = list_secrets(
1206
+ core_api=core_api,
1207
+ namespace=namespace,
1208
+ prefix=name or prefix,
1209
+ all_namespaces=all_namespaces,
1210
+ filter_by_creator=False,
1211
+ console=console,
1212
+ )
1213
+ if not secrets_to_describe:
1214
+ console.print("[yellow] No secrets found[/yellow]")
1215
+ raise typer.Exit(0)
1216
+
1217
+ for secret in secrets_to_describe:
1218
+ k8_name = secret.get("name")
1219
+ kt_name = secret.get("user_defined_name")
1220
+ console.print(f"[bold cyan]{kt_name}[/bold cyan]")
1221
+ console.print(f" K8 Name: [reset]{k8_name}")
1222
+ console.print(f' Namespace: {secret.get("namespace")}')
1223
+ console.print(f' Labels: [reset]{secret.get("labels")}')
1224
+ console.print(f' Type: {secret.get("type")}')
1225
+ secret_data = secret.get("data")
1226
+ if show_values:
1227
+ console.print(" Data:")
1228
+ for k, v in secret_data.items():
1229
+ try:
1230
+ decoded_value = base64.b64decode(v).decode("utf-8")
1231
+ except Exception:
1232
+ decoded_value = "<binary data>"
1233
+ indented_value = textwrap.indent(decoded_value, " ")
1234
+ indented_value = indented_value.replace("\n\n", "\n")
1235
+ console.print(f" {k}:{indented_value}\n")
1236
+
1237
+
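The describe action decodes secret values the same way this standalone snippet does, since data in a Kubernetes v1 Secret is base64-encoded:

import base64

encoded = "aGVsbG8td29ybGQ="                      # example payload
print(base64.b64decode(encoded).decode("utf-8"))  # -> hello-world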
1238
+ @app.command("ssh")
1239
+ def kt_ssh(
1240
+ name: str = service_name_argument(help="Service name"),
1241
+ namespace: str = typer.Option(
1242
+ globals.config.namespace,
1243
+ "-n",
1244
+ "--namespace",
1245
+ ),
1246
+ pod: str = typer.Option(
1247
+ None,
1248
+ "-p",
1249
+ "--pod",
1250
+ help="Name or index of a specific pod to load logs from (0-based)",
1251
+ ),
1252
+ ):
1253
+ """SSH into a remote service. By default, will SSH into the first pod.
1254
+
1255
+ Examples:
1256
+
1257
+ .. code-block:: bash
1258
+
1259
+ $ kt ssh my_service
1260
+ """
1261
+ core_api, custom_api, apps_v1_api = initialize_k8s_clients()
1262
+
1263
+ try:
1264
+ # Validate service exists and get deployment mode
1265
+ name, deployment_mode = get_deployment_mode(name, namespace, custom_api, apps_v1_api)
1266
+
1267
+ # Get and validate pods
1268
+ pods = validate_pods_exist(name, namespace, core_api)
1269
+
1270
+ sorted_by_time = sorted(pods, key=lambda pod: pod.metadata.creation_timestamp)
1271
+
1272
+ # case when the user provides a specific pod to ssh into
1273
+ if pod:
1274
+ pod_name = validate_provided_pod(service_name=name, provided_pod=pod, service_pods=sorted_by_time)
1275
+ # if pod is not provided, ssh into the first pod.
1276
+ else:
1277
+ pod_name = sorted_by_time[0].metadata.name
1278
+
1279
+ console.print(f"[green]Found pod:[/green] [blue]{pod_name}[/blue] ({deployment_mode})")
1280
+ console.print("[yellow]Connecting to pod...[/yellow]")
1281
+
1282
+ # Still need subprocess for the interactive shell
1283
+ subprocess.run(
1284
+ ["kubectl", "exec", "-it", pod_name, "-n", namespace, "--", "/bin/bash"],
1285
+ check=True,
1286
+ )
1287
+
1288
+ except ApiException as e:
1289
+ console.print(f"[red]Kubernetes API error: {e}[/red]")
1290
+ raise typer.Exit(1)
1291
+
1292
+
1293
+ @app.command("teardown")
1294
+ def kt_teardown(
1295
+ name: str = service_name_argument(help="Service name", required=False),
1296
+ yes: bool = typer.Option(False, "-y", "--yes", help="Deletion confirmation"),
1297
+ teardown_all: bool = typer.Option(False, "-a", "--all", help="Deletes all services for the current user"),
1298
+ prefix: str = typer.Option("", "-p", "--prefix", help="Tear down all services with given prefix"),
1299
+ namespace: str = typer.Option(
1300
+ globals.config.namespace,
1301
+ "-n",
1302
+ "--namespace",
1303
+ ),
1304
+ force: bool = typer.Option(False, "-f", "--force", help="Force deletion without graceful shutdown"),
1305
+ exact_match: bool = typer.Option(
1306
+ False,
1307
+ "-e",
1308
+ "--exact-match",
1309
+ help="Only delete the exact service name, not the prefixed version",
1310
+ ),
1311
+ ):
1312
+ """Delete a service and all its associated resources (deployments, configmaps, etc).
1313
+
1314
+
1315
+ Examples:
1316
+
1317
+ .. code-block:: bash
1318
+
1319
+ $ kt teardown my-service -y # force teardown resources corresponding to service
1320
+
1321
+ $ kt teardown --all # teardown all resources corresponding to username
1322
+
1323
+ $ kt teardown --prefix test # teardown resources with prefix "test"
1324
+ """
1325
+ from kubetorch import config
1326
+ from kubetorch.resources.compute.utils import delete_resources_for_service, fetch_resources_for_teardown
1327
+
1328
+ name, yes, teardown_all, namespace, prefix = default_typer_values(name, yes, teardown_all, namespace, prefix)
1329
+
1330
+ core_api, custom_api, _ = initialize_k8s_clients()
1331
+
1332
+ if teardown_all:
1333
+ if not config.username:
1334
+ console.print(
1335
+ "[red]Username is not found, can't delete all services. Please set up a username, provide a service "
1336
+ "name or use the --prefix flag[/red]"
1337
+ )
1338
+ raise typer.Exit(1)
1339
+
1340
+ console.print(f"Deleting all services for username [blue]{config.username}[/blue]...")
1341
+
1342
+ elif prefix:
1343
+ console.print(
1344
+ f"Deleting all services with prefix [blue]{prefix}[/blue] in [blue]{namespace}[/blue] namespace..."
1345
+ )
1346
+ else:
1347
+ if not name:
1348
+ console.print("[red]Please provide a service name or use the --all or --prefix flags[/red]")
1349
+ raise typer.Exit(1)
1350
+
1351
+ console.print(f"Finding resources for service [blue]{name}[/blue] in [blue]{namespace}[/blue] namespace...")
1352
+
1353
+ resources = fetch_resources_for_teardown(
1354
+ namespace=namespace,
1355
+ target=name,
1356
+ core_api=core_api,
1357
+ custom_api=custom_api,
1358
+ prefix=prefix,
1359
+ username=config.username if teardown_all else None,
1360
+ exact_match=exact_match,
1361
+ )
1362
+
1363
+ services = list(resources["services"].keys())
1364
+ service_count = len(services)
1365
+
1366
+ if teardown_all or prefix:
1367
+ service_word = "service" if service_count == 1 else "services"
1368
+ if not services:
1369
+ console.print("[yellow]No services found[/yellow]")
1370
+ raise typer.Exit(0)
1371
+ else:
1372
+ console.print(f"[yellow]Found [bold]{service_count}[/bold] {service_word} to delete.[/yellow]")
1373
+
1374
+ if name and not services:
1375
+ console.print(f"[red]Service [bold]{name}[/bold] not found[/red]")
1376
+ raise typer.Exit(1)
1377
+
1378
+ # Confirmation prompt for multiple services
1379
+ if not yes and service_count > 1:
1380
+ for service_name in services:
1381
+ console.print(f" • {service_name}")
1382
+
1383
+ # Confirmation prompt for single service
1384
+ if not yes and not force: # if --force is provided, we don't need additional confirmation
1385
+ confirm = typer.confirm("\nDo you want to proceed?")
1386
+ if not confirm:
1387
+ console.print("[yellow]Teardown cancelled[/yellow]")
1388
+ raise typer.Exit(0)
1389
+
1390
+ # Delete resources
1391
+ if force:
1392
+ console.print("\n[yellow]Force deleting resources...[/yellow]")
1393
+ else:
1394
+ console.print("\n[yellow]Deleting resources...[/yellow]")
1395
+
1396
+ service_types = set()
1397
+ for name in services:
1398
+ service_info = resources["services"][name]
1399
+ configmaps = service_info["configmaps"]
1400
+ service_type = service_info.get("type", "knative")
1401
+ service_types.add(service_type)
1402
+
1403
+ delete_resources_for_service(
1404
+ core_api=core_api,
1405
+ custom_api=custom_api,
1406
+ configmaps=configmaps,
1407
+ name=name,
1408
+ service_type=service_type,
1409
+ namespace=namespace,
1410
+ console=console,
1411
+ force=force,
1412
+ )
1413
+
1414
+ # Force delete any remaining pods if --force flag is set
1415
+ if force:
1416
+ # Build list of service names to check for pods
1417
+ # Include both found services and the original target name (in case service was already deleted)
1418
+ service_names_to_check = list(services)
1419
+ if name and name not in service_names_to_check:
1420
+ service_names_to_check.append(name)
1421
+
1422
+ if service_names_to_check:
1423
+ console.print("\n[yellow]Force deleting any remaining pods...[/yellow]")
1424
+ for service_name in service_names_to_check:
1425
+ try:
1426
+ # Get pods matching the service
1427
+ pods = core_api.list_namespaced_pod(
1428
+ namespace=namespace,
1429
+ label_selector=f"kubetorch.com/service={service_name}",
1430
+ ).items
1431
+
1432
+ if pods:
1433
+ for pod in pods:
1434
+ try:
1435
+ core_api.delete_namespaced_pod(
1436
+ name=pod.metadata.name,
1437
+ namespace=namespace,
1438
+ grace_period_seconds=0,
1439
+ propagation_policy="Background",
1440
+ )
1441
+ console.print(f"✓ Force deleted pod [blue]{pod.metadata.name}[/blue]")
1442
+ except ApiException as e:
1443
+ if e.status != 404: # Ignore if already deleted
1444
+ console.print(f"[red]Failed to delete pod {pod.metadata.name}: {e}[/red]")
1445
+ except Exception as e:
1446
+ console.print(f"[red]Failed to list pods for service {service_name}: {e}[/red]")
1447
+
1448
+ # Also check for any orphaned pods with kubetorch labels if using --all or --prefix
1449
+ if teardown_all or prefix:
1450
+ try:
1451
+ label_selector = "kubetorch.com/service"
1452
+ if teardown_all and config.username:
1453
+ label_selector += f",kubetorch.com/username={config.username}"
1454
+
1455
+ all_pods = core_api.list_namespaced_pod(namespace=namespace, label_selector=label_selector).items
1456
+
1457
+ # Filter by prefix if specified
1458
+ if prefix:
1459
+ all_pods = [
1460
+ p for p in all_pods if p.metadata.labels.get("kubetorch.com/service", "").startswith(prefix)
1461
+ ]
1462
+
1463
+ # Delete any remaining pods not already handled
1464
+ for pod in all_pods:
1465
+ if pod.metadata.name not in [
1466
+ p.metadata.name
1467
+ for s in service_names_to_check
1468
+ for p in core_api.list_namespaced_pod(
1469
+ namespace=namespace,
1470
+ label_selector=f"kubetorch.com/service={s}",
1471
+ ).items
1472
+ ]:
1473
+ try:
1474
+ core_api.delete_namespaced_pod(
1475
+ name=pod.metadata.name,
1476
+ namespace=namespace,
1477
+ grace_period_seconds=0,
1478
+ propagation_policy="Background",
1479
+ )
1480
+ console.print(f"✓ Force deleted orphaned pod [blue]{pod.metadata.name}[/blue]")
1481
+ except ApiException as e:
1482
+ if e.status != 404:
1483
+ console.print(f"[red]Failed to delete orphaned pod {pod.metadata.name}: {e}[/red]")
1484
+ except Exception as e:
1485
+ console.print(f"[red]Failed to list orphaned pods: {e}[/red]")
1486
+
1487
+ console.print("\n[green]Teardown completed successfully[/green]")
1488
+
1489
+
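A standalone sketch of the force-delete path above, removing pods that carry the kubetorch service label (label key taken from the code above; the namespace and service name are placeholders):

from kubernetes import client, config

config.load_kube_config()
core = client.CoreV1Api()

pods = core.list_namespaced_pod(
    namespace="default",
    label_selector="kubetorch.com/service=my-service",
).items
for pod in pods:
    core.delete_namespaced_pod(
        name=pod.metadata.name,
        namespace="default",
        grace_period_seconds=0,
        propagation_policy="Background",
    )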
1490
+ @app.command("volumes")
1491
+ def kt_volumes(
1492
+ action: VolumeAction = typer.Argument(VolumeAction.list, help="Action to perform"),
1493
+ name: str = typer.Argument(None, help="Volume name (for create action)"),
1494
+ storage_class: str = typer.Option(None, "--storage-class", "-c", help="Storage class"),
1495
+ size: str = typer.Option("10Gi", "--size", "-s", help="Volume size (default: 10Gi)"),
1496
+ access_mode: str = typer.Option("ReadWriteMany", "--access-mode", "-a", help="Access mode"),
1497
+ mount_path: str = typer.Option(
1498
+ None,
1499
+ "--mount-path",
1500
+ "-m",
1501
+ help=f"Mount path (default: /{KT_MOUNT_FOLDER}/{{name}})",
1502
+ ),
1503
+ namespace: str = typer.Option(
1504
+ globals.config.namespace,
1505
+ "-n",
1506
+ "--namespace",
1507
+ ),
1508
+ all_namespaces: bool = typer.Option(
1509
+ False,
1510
+ "--all-namespaces",
1511
+ "-A",
1512
+ help="List volumes across all namespaces",
1513
+ ),
1514
+ ):
1515
+ """Manage volumes used in Kubetorch services.
1516
+
1517
+ Examples:
1518
+
1519
+ .. code-block:: bash
1520
+
1521
+ $ kt volumes
1522
+
1523
+ $ kt volumes -A
1524
+
1525
+ $ kt volumes create my-vol
1526
+
1527
+ $ kt volumes create my-vol -c gp3-csi -s 20Gi
1528
+
1529
+ $ kt volumes delete my-vol
1530
+
1531
+ $ kt volumes ssh my-vol
1532
+ """
+ from kubernetes import client
+
+ from kubetorch import Volume
+ from kubetorch.utils import load_kubeconfig
+
+ load_kubeconfig()
+ core_v1 = client.CoreV1Api()
+
+ target_namespace = None
+ if not all_namespaces:
+ target_namespace = namespace or globals.config.namespace
+
+ if action == VolumeAction.list:
+ try:
+ if all_namespaces:
+ pvcs = core_v1.list_persistent_volume_claim_for_all_namespaces()
+ title = "Kubetorch Volumes (All Namespaces)"
+ else:
+ pvcs = core_v1.list_namespaced_persistent_volume_claim(namespace=target_namespace)
+ title = f"Kubetorch Volumes (Namespace: {target_namespace})"
+
+ # Keep only Kubetorch-managed PVCs, identified by the kubetorch.com/mount-path annotation
+ kubetorch_pvcs = [
+ pvc for pvc in pvcs.items if (pvc.metadata.annotations or {}).get("kubetorch.com/mount-path")
+ ]
+
+ if not kubetorch_pvcs:
+ if all_namespaces:
+ console.print("[yellow]No volumes found in any namespace[/yellow]")
+ else:
+ console.print(f"[yellow]No volumes found in namespace {target_namespace}[/yellow]")
+ return
+
+ table = Table(title=title)
+ if all_namespaces:
+ table.add_column("Namespace", style="green")
+ table.add_column("Name", style="cyan")
+ table.add_column("PVC Name", style="blue")
+ table.add_column("Status", style="green")
+ table.add_column("Size", style="yellow")
+ table.add_column("Storage Class", style="magenta")
+ table.add_column("Access Mode", style="white")
+ table.add_column("Mount Path", style="dim")
+
+ for pvc in kubetorch_pvcs:
+ # The volume name matches the PVC name
+ volume_name = pvc.metadata.name
+ status = pvc.status.phase
+ size = pvc.spec.resources.requests.get("storage", "Unknown")
+ storage_class = pvc.spec.storage_class_name or "Default"
+ access_mode = pvc.spec.access_modes[0] if pvc.spec.access_modes else "Unknown"
+
+ # Get mount path from annotations
+ annotations = pvc.metadata.annotations or {}
+ mount_path_display = annotations.get("kubetorch.com/mount-path", f"/{KT_MOUNT_FOLDER}/{volume_name}")
+
+ status_color = "green" if status == "Bound" else "yellow" if status == "Pending" else "red"
+
+ row_data = []
+ if all_namespaces:
+ row_data.append(pvc.metadata.namespace)
+
+ row_data.extend(
+ [
+ volume_name,
+ pvc.metadata.name,
+ f"[{status_color}]{status}[/{status_color}]",
+ size,
+ storage_class,
+ access_mode,
+ mount_path_display,
+ ]
+ )
+
+ table.add_row(*row_data)
+
+ console.print(table)
+
+ except Exception as e:
+ console.print(f"[red]Failed to list volumes: {e}[/red]")
+ raise typer.Exit(1)
+
+ elif action == VolumeAction.ssh:
+ if not name:
+ console.print("[red]Volume name is required[/red]")
+ raise typer.Exit(1)
+
+ volume = Volume.from_name(name=name, namespace=namespace, core_v1=core_v1)
+ volume.ssh()
+
+ elif action == VolumeAction.create:
+ if not name:
+ console.print("[red]Volume name is required[/red]")
+ raise typer.Exit(1)
+
+ if all_namespaces:
+ console.print("[red]Cannot create volume with --all-namespaces. Specify a namespace.[/red]")
+ raise typer.Exit(1)
+
+ try:
+ volume = Volume(
+ name=name,
+ storage_class=storage_class,
+ mount_path=mount_path,
+ size=size,
+ access_mode=access_mode,
+ namespace=namespace,
+ )
+
+ if volume.exists():
+ console.print(
+ f"[yellow]Volume {name} (PVC: {volume.pvc_name}) already exists in "
+ f"namespace {namespace}[/yellow]"
+ )
+ return
+
+ console.print(f"Creating volume [blue]{name}[/blue]...")
+ volume.create()
+
+ console.print(f"[green]✓[/green] Successfully created volume [blue]{name}[/blue]")
+ config = volume.config()
+ for k, v in config.items():
+ console.print(f"[bold]• {k}[/bold]: {v}")
+
+ except Exception as e:
+ console.print(f"[red]Failed to create volume {name}: {e}[/red]")
+ raise typer.Exit(1)
+
+ elif action == VolumeAction.delete:
+ if not name:
+ console.print("[red]Volume name is required[/red]")
+ raise typer.Exit(1)
+
+ if all_namespaces:
+ console.print("[red]Cannot delete volume with --all-namespaces. Specify a namespace.[/red]")
+ raise typer.Exit(1)
+
+ try:
+ volume = Volume.from_name(name=name, namespace=namespace, core_v1=core_v1)
+
+ console.print(f"Deleting volume [blue]{name}[/blue]...")
+ volume.delete()
+
+ console.print(f"[green]✓[/green] Successfully deleted volume [blue]{name}[/blue]")
+
+ except ValueError:
+ console.print(f"[red]Volume {name} not found in namespace {namespace}[/red]")
+ raise typer.Exit(1)
+
+ except Exception as e:
+ console.print(f"[red]Failed to delete volume {name}: {e}[/red]")
+ raise typer.Exit(1)
+
+
+ @app.command("notebook")
+ def kt_notebook(
+ name: str = typer.Argument(None, help="Service name"),
+ cpus: str = typer.Option(None, "--cpus", help="CPU resources (e.g., '2', '500m')"),
+ memory: str = typer.Option(None, "--memory", "-m", help="Memory resources (e.g., '4Gi', '512Mi')"),
+ gpus: str = typer.Option(None, "--gpus", help="Number of GPUs"),
+ image: str = typer.Option(None, "--image", "-i", help="Container image to use"),
+ namespace: str = typer.Option(
+ globals.config.namespace,
+ "-n",
+ "--namespace",
+ ),
+ local_port: int = typer.Option(8888, "--port", "-p", help="Local port for notebook access"),
+ inactivity_ttl: str = typer.Option(None, "--ttl", help="Inactivity TTL (e.g., '1h', '30m')"),
+ restart_kernels: bool = typer.Option(
+ True,
+ "--restart/--no-restart",
+ help="Restart notebook kernel sessions upon reconnect",
+ ),
+ ):
+ """
+ Launch a JupyterLab notebook server on a new or existing Kubetorch service. The notebook service will continue
+ running after you exit, and you can reconnect to it until the service is torn down.
+
+ Examples:
+
+ .. code-block:: bash
+
+ $ kt notebook tune-hpo # Launch notebook into new or existing service with name "tune-hpo"
+
+ $ kt notebook --cpus 4 --memory 8Gi # Launch with specific resources
+
+ $ kt notebook --gpus 1 --cpus 8 --memory 16Gi --image nvcr.io/nvidia/pytorch:23.10-py3 # Launch with GPU and custom image
+
+ $ kt notebook --gpus 1 --cpus 8 --memory 16Gi --no-restart # Don't restart kernels on reconnect
+ """
+ import webbrowser
+
+ import kubetorch as kt
+
+ if is_running_in_kubernetes():
+ console.print(
+ "[red]Notebook command is not supported when running inside Kubernetes. "
+ "Please run this command locally.[/red]"
+ )
+ raise typer.Exit(1)
+
+ # Build compute configuration
+ compute_kwargs = {
+ "namespace": namespace,
+ "cpus": cpus,
+ "memory": memory,
+ "gpus": gpus,
+ "inactivity_ttl": inactivity_ttl,
+ }
+
+ if image:
+ compute_kwargs["image"] = kt.Image(image_id=image)
+ else:
+ if gpus:
+ console.print(
+ "[yellow]Launching with GPUs but without a CUDA-enabled image may limit GPU usability; aborting. "
+ "Re-run with an appropriate image, for example: "
+ "[bold]`kt notebook --gpus 1 --image nvcr.io/nvidia/pytorch:23.10-py3`[/bold].[/yellow]"
+ )
+ return
+
+ compute_kwargs["image"] = kt.Image()
+
+ compute = kt.Compute(**compute_kwargs)
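+ # For example, `kt notebook --gpus 1 --image nvcr.io/nvidia/pytorch:23.10-py3` resolves to roughly:
+ #   kt.Compute(namespace=<default ns>, cpus=None, memory=None, gpus="1", inactivity_ttl=None,
+ #              image=kt.Image(image_id="nvcr.io/nvidia/pytorch:23.10-py3"))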
+
+ # Generate service name
+ service_name = name or "kt-notebook"
+
+ # Check if local port is available
+ from kubetorch.resources.compute.utils import find_available_port
+
+ original_port = local_port
+ try:
+ local_port = find_available_port(local_port, max_tries=5)
+ if local_port != original_port:
+ console.print(f"[yellow]Port {original_port} already in use, using port {local_port} instead.[/yellow]")
+ except RuntimeError:
+ console.print(f"\n[red]Ports {original_port}-{original_port + 4} are all in use.[/red]")
+ raise typer.Exit(1)
+
+ console.print("[cyan]Setting up notebook...[/cyan]")
+
+ try:
+ # If the service already exists -> load it, then compare to what was requested
+ # If the service doesn't exist -> deploy with requested parameters
+ remote_fn = kt.fn(notebook_placeholder, name=service_name).to(compute, stream_logs=False, get_if_exists=True)
+ compute = remote_fn.compute
+
+ # Check if requested parameters match the existing compute
+ mismatches = []
+ expected_params = {
+ "cpus": cpus,
+ "memory": memory,
+ "gpus": gpus,
+ "image": image,
+ }
+
+ for key, requested_value in expected_params.items():
+ if requested_value is None:
+ # Skip unset CLI options
+ continue
+
+ existing_value = getattr(compute, key, None)
+ if key == "image":
+ # compare image_ids
+ existing_value = getattr(existing_value, "image_id", existing_value)
+
+ if existing_value != requested_value:
+ mismatches.append((key, existing_value, requested_value))
+
+ if mismatches:
+ console.print("[yellow]Cannot reuse existing notebook due to mismatched parameters:[/yellow]")
+ for key, existing, requested in mismatches:
+ display_existing = existing if existing is not None else "<default>"
+ console.print(f" - [bold]{key}[/bold]: existing = '{display_existing}', requested = '{requested}'")
+ console.print(
+ f"\n[yellow]Delete the existing notebook service ([bold]`kt teardown {service_name}`[/bold]) "
+ "or create a new one with a different name.[/yellow]"
+ )
+ return
+
+ # Ensure JupyterLab is installed
+ compute.pip_install(["jupyterlab"])
+
+ # Get pod information
+ core_api, custom_api, apps_v1_api = initialize_k8s_clients()
+ pods = validate_pods_exist(remote_fn.service_name, namespace, core_api)
+ if not pods:
+ console.print(f"[red]No pods found for service {service_name}[/red]")
+ raise typer.Exit(1)
+
+ # Forward to the oldest pod in the service
+ pod_name = sorted(pods, key=lambda p: p.metadata.creation_timestamp)[0].metadata.name
+ console.print(f"[green]Service is up (pod: {pod_name})[/green]")
+
+ # Start JupyterLab in the background
+ jupyter_cmd = (
+ 'bash -c "nohup jupyter lab --ip=0.0.0.0 --port=8888 --no-browser '
+ "--allow-root --ServerApp.token='' --ServerApp.password='' "
+ "--NotebookApp.token='' --NotebookApp.password='' "
+ '> /tmp/jupyter.log 2>&1 &"'
+ )
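+ # The command above runs JupyterLab headless on port 8888 with token/password auth disabled,
+ # detached via nohup so it outlives this exec session; output goes to /tmp/jupyter.log, which
+ # is tailed below to detect startup.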
+ if restart_kernels:
+ start_cmd_result = compute.run_bash(jupyter_cmd)
+ if start_cmd_result and start_cmd_result[0][0] != 0:
+ console.print("[red]Error starting Jupyter Lab[/red]", start_cmd_result)
+ raise typer.Exit(1)
+
+ # Wait for JupyterLab to start
+ for i in range(5):
+ check_cmd = "tail -20 /tmp/jupyter.log"
+ result = compute.run_bash(check_cmd)
+ if result and result[0][0] == 0:
+ output = result[0][1]
+ if ("Jupyter Server" in output and "is running" in output) or ("Connecting to kernel" in output):
+ break
+ else:
+ console.print("[cyan]Waiting for Jupyter to start...[/cyan]")
+
+ time.sleep(5)
+
+ else:
+ if not restart_kernels:
+ console.print("[yellow]Jupyter may have failed to start; you may need to re-run with --restart.[/yellow]")
+
+ console.print(f"[cyan]Setting up port forward to localhost:{local_port}...[/cyan]")
+ cmd = [
+ "kubectl",
+ "port-forward",
+ f"pod/{pod_name}",
+ f"{local_port}:8888",
+ "--namespace",
+ namespace,
+ ]
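+ # Equivalent to running `kubectl port-forward pod/<pod-name> <local-port>:8888 --namespace <ns>` by hand.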
+
+ process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, start_new_session=True)
+
+ from kubetorch.serving.utils import wait_for_port_forward
+
+ try:
+ wait_for_port_forward(
+ process,
+ local_port,
+ health_endpoint=None,
+ validate_kubetorch_versions=False,
+ )
+ time.sleep(2)
+ except Exception as e:
+ console.print(f"[red]Failed to establish port forward: {e}[/red]")
+ if process:
+ try:
+ os.killpg(os.getpgid(process.pid), signal.SIGTERM)
+ process.wait()
+ except (ProcessLookupError, OSError):
+ pass
+ raise typer.Exit(1)
+
+ # Open in browser
+ notebook_url = f"http://localhost:{local_port}"
+ console.print(f"\n[green]✓ Notebook is ready at {notebook_url}[/green]")
+ console.print(
+ f"[yellow]Service '{remote_fn.service_name}' will stay alive after exit; reconnecting will restart "
+ "all kernel sessions[/yellow]"
+ )
+ console.print(f"\n[dim]To tear down: kt teardown {remote_fn.service_name}[/dim]")
+ console.print("[dim]Press Ctrl+C to stop port forwarding[/dim]\n")
+ if not os.getenv("KT_NO_BROWSER"):
+ webbrowser.open(notebook_url)
+
+ # Keep the port forward alive until it exits or the user interrupts
+ try:
+ while True:
+ if process.poll() is not None:
+ console.print("\n[yellow]Port forward process terminated[/yellow]")
+ break
+ time.sleep(1)
+ except KeyboardInterrupt:
+ console.print("\n[yellow]Stopping port forward...[/yellow]")
+ finally:
+ # Clean up port forward process only
+ if process:
+ try:
+ os.killpg(os.getpgid(process.pid), signal.SIGTERM)
+ process.wait()
+ except (ProcessLookupError, OSError):
+ pass
+
+ console.print(
+ f"\n[yellow]Service '{remote_fn.service_name}' is still running in namespace '{namespace}'[/yellow]"
+ )
+ console.print(f"[dim]To tear down: kt teardown {remote_fn.service_name}[/dim]")
+
+ except Exception as e:
+ console.print(f"[red]Error setting up notebook: {e}[/red]")
+ raise typer.Exit(1)
+
+
+ @app.callback(invoke_without_command=True, help="Kubetorch CLI")
+ def main(
+ ctx: typer.Context,
+ version: bool = typer.Option(None, "--version", "-v", help="Show the version and exit."),
+ ):
+ if version:
+ from kubetorch import __version__
+
+ print(__version__)
+ elif ctx.invoked_subcommand is None:
+ subprocess.run("kubetorch --help", shell=True)
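+ # e.g. `kt --version` (or `kt -v`) prints the installed kubetorch version; invoking the CLI with
+ # no subcommand falls back to showing the help text.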