kubetorch 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. kubetorch/__init__.py +59 -0
  2. kubetorch/cli.py +1939 -0
  3. kubetorch/cli_utils.py +967 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +269 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +159 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +140 -0
  30. kubetorch/resources/callables/module.py +1315 -0
  31. kubetorch/resources/callables/utils.py +203 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +253 -0
  34. kubetorch/resources/compute/compute.py +2414 -0
  35. kubetorch/resources/compute/decorators.py +137 -0
  36. kubetorch/resources/compute/utils.py +1026 -0
  37. kubetorch/resources/compute/websocket.py +135 -0
  38. kubetorch/resources/images/__init__.py +1 -0
  39. kubetorch/resources/images/image.py +412 -0
  40. kubetorch/resources/images/images.py +64 -0
  41. kubetorch/resources/secrets/__init__.py +2 -0
  42. kubetorch/resources/secrets/kubernetes_secrets_client.py +377 -0
  43. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  44. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  45. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  46. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  47. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  48. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  49. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  50. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  51. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  52. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  53. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  54. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  55. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  56. kubetorch/resources/secrets/provider_secrets/providers.py +92 -0
  57. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  58. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  59. kubetorch/resources/secrets/secret.py +224 -0
  60. kubetorch/resources/secrets/secret_factory.py +64 -0
  61. kubetorch/resources/secrets/utils.py +222 -0
  62. kubetorch/resources/volumes/__init__.py +0 -0
  63. kubetorch/resources/volumes/volume.py +340 -0
  64. kubetorch/servers/__init__.py +0 -0
  65. kubetorch/servers/http/__init__.py +0 -0
  66. kubetorch/servers/http/distributed_utils.py +2968 -0
  67. kubetorch/servers/http/http_client.py +802 -0
  68. kubetorch/servers/http/http_server.py +1622 -0
  69. kubetorch/servers/http/server_metrics.py +255 -0
  70. kubetorch/servers/http/utils.py +722 -0
  71. kubetorch/serving/__init__.py +0 -0
  72. kubetorch/serving/autoscaling.py +153 -0
  73. kubetorch/serving/base_service_manager.py +344 -0
  74. kubetorch/serving/constants.py +77 -0
  75. kubetorch/serving/deployment_service_manager.py +431 -0
  76. kubetorch/serving/knative_service_manager.py +487 -0
  77. kubetorch/serving/raycluster_service_manager.py +526 -0
  78. kubetorch/serving/service_manager.py +18 -0
  79. kubetorch/serving/templates/deployment_template.yaml +17 -0
  80. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  81. kubetorch/serving/templates/kt_setup_template.sh.j2 +91 -0
  82. kubetorch/serving/templates/pod_template.yaml +198 -0
  83. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  84. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  85. kubetorch/serving/templates/service_template.yaml +21 -0
  86. kubetorch/serving/templates/workerset_template.yaml +36 -0
  87. kubetorch/serving/utils.py +344 -0
  88. kubetorch/utils.py +263 -0
  89. kubetorch-0.2.5.dist-info/METADATA +75 -0
  90. kubetorch-0.2.5.dist-info/RECORD +92 -0
  91. kubetorch-0.2.5.dist-info/WHEEL +4 -0
  92. kubetorch-0.2.5.dist-info/entry_points.txt +5 -0
kubetorch/cli_utils.py ADDED
@@ -0,0 +1,967 @@
+ import asyncio
+ import base64
+ import hashlib
+ import json
+ import os
+ import signal
+ import subprocess
+ import threading
+ import time
+ import urllib.parse
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError
+ from contextlib import contextmanager
+ from datetime import datetime, timedelta
+ from enum import Enum
+ from pathlib import Path
+ from typing import List, Optional, Tuple
+
+ import httpx
+ import typer
+ import yaml
+ from kubernetes import client
+ from kubernetes.client.rest import ApiException
+ from pydantic import BaseModel
+ from rich import box
+ from rich.console import Console
+ from rich.style import Style
+ from rich.table import Table
+ from websocket import create_connection
+
+ import kubetorch.serving.constants as serving_constants
+ from kubetorch import globals
+ from kubetorch.config import KubetorchConfig
+ from kubetorch.constants import MAX_PORT_TRIES
+ from kubetorch.resources.compute.utils import is_port_available
+ from kubetorch.servers.http.utils import stream_logs_websocket_helper, StreamType
+ from kubetorch.serving.utils import wait_for_port_forward
+ from kubetorch.utils import load_kubeconfig
+
+ from .constants import BULLET_UNICODE, CPU_RATE, DOUBLE_SPACE_UNICODE, GPU_RATE
+ from .logger import get_logger
+
+ console = Console()
+ logger = get_logger(__name__)
+
+
+ # ------------------ Billing helpers ------------------
+ class UsageData(BaseModel):
+     date_start: str
+     date_end: str
+     cpu_hours: float
+     gpu_hours: float
+
+
+ class BillingTotals(BaseModel):
+     cpu: float
+     gpu: float
+
+
+ class BillingCosts(BaseModel):
+     cpu: float
+     gpu: float
+
+
+ class BillingRequest(BaseModel):
+     license_key: str
+     signature: str
+     file_name: str
+     username: Optional[str] = None
+     usage_data: UsageData
+     totals: BillingTotals
+     costs: BillingCosts
+
+
+ # ------------------ Generic helpers ------------------
+ class VolumeAction(str, Enum):
+     list = "list"
+     create = "create"
+     delete = "delete"
+     ssh = "ssh"
+
+
+ class SecretAction(str, Enum):
+     list = "list"
+     create = "create"
+     delete = "delete"
+     describe = "describe"
+
+
+ def default_typer_values(*args):
+     """Convert typer model arguments to their default values or types, so the CLI commands can also be
+     imported and called in Python if desired."""
+     new_args = []
+     for arg in args:
+         if isinstance(arg, (typer.models.OptionInfo, typer.models.ArgumentInfo)):
+             # Replace the typer model with its default value
+             arg = arg.default if arg.default is not None else arg.type
+         new_args.append(arg)
+     return new_args
+
+
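+ # Example (editor's illustration; the option and values below are hypothetical):
+ #   ns_opt = typer.Option("default", help="Namespace")
+ #   default_typer_values(ns_opt, "my-service")  # -> ["default", "my-service"]
+
+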
+ def validate_config_key(key: Optional[str] = None):
+     if key is None:
+         return
+
+     valid_keys = {name for name, attr in vars(KubetorchConfig).items() if isinstance(attr, property)}
+     if key not in valid_keys:
+         raise typer.BadParameter(f"Valid keys are: {', '.join(sorted(valid_keys))}")
+     return key
+
+
+ def get_pods_for_service_cli(name: str, namespace: str, core_api):
+     """Get pods for a service using the unified label selector."""
+     # The unified service label works for all deployment modes
+     label_selector = f"kubetorch.com/service={name}"
+     return core_api.list_namespaced_pod(
+         namespace=namespace,
+         label_selector=label_selector,
+     )
+
+
+ def service_name_argument(*args, required: bool = True, **kwargs):
+     def _lowercase(value: str) -> str:
+         return value.lower() if value else value
+
+     default = ... if required else ""
+     return typer.Argument(default, *args, callback=_lowercase, **kwargs)
+
+
+ def get_deployment_mode(name: str, namespace: str, custom_api, apps_v1_api) -> Tuple[str, str]:
+     """Validate that the service exists; return the (possibly username-prefixed) name and its deployment mode."""
+     try:
+         original_name = name
+         deployment_mode = detect_deployment_mode(name, namespace, custom_api, apps_v1_api)
+         # If the service is not found and not already prefixed with the username, try with the username prefix
+         if not deployment_mode and globals.config.username and not name.startswith(globals.config.username + "-"):
+             name = f"{globals.config.username}-{name}"
+             deployment_mode = detect_deployment_mode(name, namespace, custom_api, apps_v1_api)
+
+         if not deployment_mode:
+             console.print(f"[red]Failed to load service [bold]{original_name}[/bold] in namespace {namespace}[/red]")
+             raise typer.Exit(1)
+         console.print(f"Found [green]{deployment_mode}[/green] service [blue]{name}[/blue]")
+         return name, deployment_mode
+
+     except ApiException as e:
+         console.print(f"[red]Kubernetes API error: {e}[/red]")
+         raise typer.Exit(1)
+
+
+ def validate_pods_exist(name: str, namespace: str, core_api) -> list:
+     """Validate that pods exist for the service and return the pod list."""
+     pods = get_pods_for_service_cli(name, namespace, core_api)
+     if not pods.items:
+         console.print(f"\n[red]No pods found for service {name} in namespace {namespace}[/red]")
+         console.print(f"You can view the service's status using:\n [yellow] kt status {name}[/yellow]")
+         raise typer.Exit(1)
+     return pods.items
+
+
+ @contextmanager
+ def port_forward_to_pod(
+     pod_name,
+     namespace: str = None,
+     local_port: int = 8080,
+     remote_port: int = serving_constants.DEFAULT_NGINX_PORT,
+     health_endpoint: str = None,
+ ):
+     load_kubeconfig()
+     for attempt in range(MAX_PORT_TRIES):
+         candidate_port = local_port + attempt
+         if not is_port_available(candidate_port):
+             logger.debug(f"Local port {candidate_port} is already in use, trying again...")
+             continue
+
+         cmd = [
+             "kubectl",
+             "port-forward",
+             f"pod/{pod_name}",
+             f"{candidate_port}:{remote_port}",
+             "--namespace",
+             namespace,
+         ]
+         logger.debug(f"Running port-forward command: {' '.join(cmd)}")
+
+         process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, start_new_session=True)
+
+         try:
+             wait_for_port_forward(
+                 process,
+                 candidate_port,
+                 health_endpoint=health_endpoint,
+                 validate_kubetorch_versions=False,
+             )
+             time.sleep(2)
+             yield candidate_port
+             return
+
+         finally:
+             if process:
+                 try:
+                     os.killpg(os.getpgid(process.pid), signal.SIGTERM)
+                     process.wait()
+                 except (ProcessLookupError, OSError):
+                     # The process may have already terminated
+                     pass
+
+     raise RuntimeError(f"Could not bind an available port after {MAX_PORT_TRIES} attempts")
+
+
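+ # Example usage (editor's sketch; the pod name, namespace, and /health path are placeholders):
+ #   with port_forward_to_pod("my-pod", namespace="default", local_port=8080) as port:
+ #       resp = httpx.get(f"http://localhost:{port}/health")
+
+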
+ def get_last_updated(pod):
+     conditions = pod["status"].get("conditions", [])
+     latest = max(
+         (c.get("lastTransitionTime") for c in conditions if c.get("lastTransitionTime")),
+         default="",
+     )
+     return latest
+
+
+ # ------------------ Reporting helpers ------------------
+ def upload_report(
+     usage_data: dict,
+     signature: str,
+     costs: BillingCosts,
+     totals: BillingTotals,
+     file_name: str,
+     license_key: str,
+     username: str = None,
+ ):
+     billing_request = BillingRequest(
+         license_key=license_key,
+         signature=signature,
+         file_name=file_name,
+         username=username,
+         usage_data=UsageData(**usage_data),
+         costs=costs,
+         totals=totals,
+     )
+
+     url = "https://auth.run.house/v1/billing/report"
+     resp = httpx.post(url, json=billing_request.model_dump())
+     if resp.status_code != 200:
+         console.print("[red]Failed to send billing report[/red]")
+         raise typer.Exit(1)
+
+
+ def export_report_pdf(report_data, filename):
+     try:
+         from reportlab.lib import colors
+         from reportlab.lib.pagesizes import letter
+         from reportlab.pdfgen import canvas
+     except ImportError:
+         console.print(
+             "[red]ReportLab is required for downloading the report. Please install it "
+             "with `pip install reportlab`.[/red]"
+         )
+         raise typer.Exit(1)
+
+     usage_data: dict = report_data["usage_report"]
+     report_str = json.dumps(report_data, sort_keys=True)
+     signature = base64.b64encode(hashlib.sha256(report_str.encode()).digest()).decode()
+
+     c = canvas.Canvas(filename, pagesize=letter)
+     width, height = letter
+
+     # Sidebar
+     sidebar_color = colors.HexColor("#4B9CD3")
+     c.setFillColor(sidebar_color)
+     c.roundRect(0, 0, 18, height, 0, fill=1, stroke=0)
+     c.setFillColor(colors.black)
+
+     y = height - 60
+
+     # Header title
+     c.setFont("Helvetica-Bold", 26)
+     c.setFillColor(sidebar_color)
+     c.drawCentredString(width / 2, y, "Kubetorch Usage Report")
+     c.setFillColor(colors.black)
+     y -= 30
+
+     # Header bar
+     c.setStrokeColor(sidebar_color)
+     c.setLineWidth(2)
+     c.line(40, y, width - 40, y)
+     y -= 20
+
+     # Info box
+     c.setFillColor(colors.whitesmoke)
+     c.roundRect(40, y - 60, width - 80, 60, 8, fill=1, stroke=0)
+     c.setFillColor(colors.black)
+     c.setFont("Helvetica-Bold", 12)
+     c.drawString(55, y - 20, "Username:")
+     c.drawString(55, y - 35, "Cluster:")
+     c.setFont("Helvetica", 12)
+     c.drawString(130, y - 20, report_data["username"])
+     c.drawString(130, y - 35, report_data.get("cluster_name", "N/A"))
+     y -= 100
+
+     # Usage summary section
+     c.setFont("Helvetica-Bold", 15)
+     c.setFillColor(sidebar_color)
+     c.drawString(40, y, "Usage Summary")
+     c.setFillColor(colors.black)
+     y -= 25
+
+     # Table layout
+     table_left = 40
+     table_width = width - 80
+     row_height = 18
+     num_rows = 2  # header + data
+     table_height = row_height * num_rows
+
+     # Table header (centered text)
+     header_height = row_height
+     c.setFillColor(sidebar_color)
+     c.roundRect(table_left, y - header_height, table_width, header_height, 4, fill=1, stroke=0)
+     c.setFont("Helvetica-Bold", 11)
+     c.setFillColor(colors.white)
+     header_y = y - header_height + 5
+     c.drawString(table_left + 10, header_y, "Start Date")
+     c.drawString(table_left + 90, header_y, "End Date")
+     c.drawString(table_left + 200, header_y, "vCPU Hours")
+     c.drawString(table_left + 300, header_y, "GPU Hours")
+     c.setFillColor(colors.black)
+
+     # Dashed outline (starts at the header, not above it)
+     c.setStrokeColor(sidebar_color)
+     c.setDash(4, 4)
+     c.roundRect(table_left, y - table_height, table_width, table_height, 6, fill=0, stroke=1)
+     c.setDash()
+     y -= header_height
+
+     # Table rows
+     c.setFont("Helvetica", 10)
+     y -= row_height
+     c.drawString(table_left + 10, y + 5, usage_data["date_start"])
+     c.drawString(table_left + 90, y + 5, usage_data["date_end"])
+     c.drawRightString(table_left + 270, y + 5, f"{usage_data['cpu_hours']:.2f}")
+     c.drawRightString(table_left + 370, y + 5, f"{usage_data['gpu_hours']:.2f}")
+
+     y -= 30
+
+     # Invoice calculation
+     total_cpu = usage_data["cpu_hours"]
+     total_gpu = usage_data["gpu_hours"]
+     cpu_cost = total_cpu * CPU_RATE
+     gpu_cost = total_gpu * GPU_RATE
+     total_cost = cpu_cost + gpu_cost
+
+     y -= 20
+     c.setFont("Helvetica-Bold", 13)
+     c.setFillColor(sidebar_color)
+     c.drawString(40, y, "Invoice Summary")
+     c.setFillColor(colors.black)
+     y -= 18
+
+     c.setFont("Helvetica", 11)
+     c.drawString(50, y, f"Total vCPU Hours: {total_cpu:.2f} @ ${CPU_RATE:.2f}/hr")
+     c.drawRightString(width - 50, y, f"${cpu_cost:.2f}")
+     y -= 15
+     c.drawString(50, y, f"Total GPU Hours: {total_gpu:.2f} @ ${GPU_RATE:.2f}/hr")
+     c.drawRightString(width - 50, y, f"${gpu_cost:.2f}")
+     y -= 15
+
+     line_left = 50
+     line_right = width - 50
+     c.setStrokeColor(sidebar_color)
+     c.setLineWidth(1.5)
+     c.line(line_left, y, line_right, y)
+     y -= 15
+
+     c.setFont("Helvetica-Bold", 12)
+     c.drawString(50, y, "Total Due:")
+     c.setFillColor(colors.HexColor("#008000"))
+     c.drawRightString(width - 50, y, f"${total_cost:.2f}")
+     c.setFillColor(colors.black)
+     y -= 30
+
+     # Signature and footer
+     sig_y = 80
+     sig_val_y = sig_y - 15
+     footer_y = sig_val_y - 40
+
+     # Signature at the bottom
+     c.setFont("Helvetica-Bold", 12)
+     c.setFillColor(colors.black)
+     c.drawString(40, sig_y, "Signature:")
+     c.setFont("Courier-Oblique", 8)
+     c.setFillColor(colors.HexColor("#888888"))
+     c.drawString(40, sig_val_y, signature)
+     c.setFillColor(colors.black)
+
+     # Footer
+     c.setFont("Helvetica-Oblique", 8)
+     c.setFillColor(colors.HexColor("#888888"))
+     c.drawString(40, footer_y, f"Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+     c.setFillColor(colors.black)
+
+     c.save()
+     return signature
+
+
+ def print_usage_table(usage_data, cluster_name):
+     table = Table(title="Usage Summary")
+     table.add_column("Start Date")
+     table.add_column("End Date")
+     table.add_column("vCPU Hours")
+     table.add_column("GPU Hours")
+     table.add_row(
+         usage_data["date_start"],
+         usage_data["date_end"],
+         str(usage_data["cpu_hours"]),
+         str(usage_data["gpu_hours"]),
+     )
+     console.print(table)
+     console.print(f"[dim]Cluster: {cluster_name}[/dim]")
+
+
+ def get_last_n_calendar_weeks(n_weeks):
+     """Return a list of (week_start, week_end) tuples for the last n full calendar weeks (Mon–Sun),
+     not including the current week."""
+     today = datetime.utcnow().date()
+     # Step back to the Monday of the current week, then one more week so the
+     # current (possibly partial) week is excluded
+     last_monday = today - timedelta(days=today.weekday() + 7)
+
+     weeks = []
+     for i in range(n_weeks):
+         week_start = last_monday - timedelta(weeks=i)
+         week_end = week_start + timedelta(days=6)  # Monday + 6 days = Sunday
+         weeks.append((week_start, week_end))
+
+     weeks.reverse()  # so the oldest week comes first
+     return weeks
+
+
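+ # Worked example (editor's illustration): if today is Wednesday 2024-01-17,
+ #   get_last_n_calendar_weeks(2)
+ #   # -> [(date(2024, 1, 1), date(2024, 1, 7)), (date(2024, 1, 8), date(2024, 1, 14))]
+
+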
+ def get_usage_data(prom, weeks):
+     days = weeks * 7
+     end_time = datetime.now()
+     start_time = end_time - timedelta(days=days)
+
+     # Sum of CPU-seconds used across all cores for the container (e.g. 2 cores for 1 second = 2 CPU-seconds)
+     cpu_query = f'increase(container_cpu_usage_seconds_total{{container="kubetorch"}}[{days}d])'
+     cpu_result = prom.custom_query(cpu_query)
+
+     total_cpu_seconds = 0
+     if cpu_result:
+         for series in cpu_result:
+             total_cpu_seconds += float(series["value"][1])
+
+     cpu_hours = round(total_cpu_seconds / 3600, 2)
+
+     # Requested GPUs multiplied by the time they were running
+     gpu_query = f'sum_over_time(kube_pod_container_resource_requests{{resource="nvidia_com_gpu", container="kubetorch"}}[{days}d])'
+     gpu_result = prom.custom_query(gpu_query)
+
+     # Convert to "GPU hours" over the period
+     total_gpu_seconds = sum(float(s["value"][1]) for s in gpu_result or [])
+     gpu_hours = round(total_gpu_seconds / 3600, 2)
+
+     usage = {
+         "date_start": start_time.strftime("%Y-%m-%d"),
+         "date_end": end_time.strftime("%Y-%m-%d"),
+         "cpu_hours": cpu_hours,
+         "gpu_hours": gpu_hours,
+     }
+
+     return usage
+
+
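+ # Worked example of the CPU-hours arithmetic (editor's note): a pod running
+ # 2 cores flat out for 1 hour accumulates 2 * 3600 = 7200 CPU-seconds,
+ # i.e. 7200 / 3600 = 2.0 cpu_hours.
+
+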
+ # ------------------ Monitoring helpers ------------------
+ def get_service_metrics(prom, pod_name: str, pod_node: str, running_on_gpu: bool):
+     """Get CPU, memory, and (if relevant) GPU metrics for a pod."""
+
+     def extract_prometheus_metric_value(query) -> float:
+         result = prom.custom_query(query=query)
+         return float(result[0].get("value")[1]) if result else 0
+
+     # --- CPU metrics --- #
+     cpu_query_time_window = "30s"
+     used_cpu_query = (
+         f"sum(rate(container_cpu_usage_seconds_total{{pod='{pod_name}', "
+         f"container='kubetorch'}}[{cpu_query_time_window}]))"
+     )
+     requested_cpu_query = (
+         f"sum(kube_pod_container_resource_requests{{pod='{pod_name}', resource='cpu', container='kubetorch'}})"
+     )
+
+     used_cpu_result = extract_prometheus_metric_value(used_cpu_query)
+     requested_cpu_result = extract_prometheus_metric_value(requested_cpu_query)
+
+     cpu_util = (
+         round(100 * (used_cpu_result / requested_cpu_result), 3) if used_cpu_result and requested_cpu_result else 0
+     )
+
+     memory_usage_query = f"container_memory_usage_bytes{{pod='{pod_name}', container='kubetorch'}} / 1073741824"
+     memory_usage = round(extract_prometheus_metric_value(memory_usage_query), 3)
+
+     machine_mem_query = f"machine_memory_bytes{{node='{pod_node}'}} / 1073741824"  # convert bytes to GiB
+     machine_mem_result = extract_prometheus_metric_value(machine_mem_query)
+
+     cpu_mem_percent = round((memory_usage / machine_mem_result) * 100, 3) if machine_mem_result else 0
+     collected_metrics = {
+         "cpu_util": cpu_util,
+         "used_cpu": round(used_cpu_result, 4),
+         "requested_cpu": round(requested_cpu_result, 4),
+         "cpu_memory_usage": memory_usage,
+         "cpu_memory_total": round(machine_mem_result, 3),
+         "cpu_memory_usage_percent": cpu_mem_percent,
+     }
+
+     # --- GPU metrics --- #
+     if running_on_gpu:
+         gpu_util_query = f"DCGM_FI_DEV_GPU_UTIL{{exported_pod='{pod_name}', exported_container='kubetorch'}}"
+         gpu_mem_used_query = (
+             f"DCGM_FI_DEV_FB_USED{{exported_pod='{pod_name}', exported_container='kubetorch'}} * 1.048576 / 1000"
+         )  # convert MiB to MB, then to GB
+         gpu_mem_free_query = (
+             f"DCGM_FI_DEV_FB_FREE{{exported_pod='{pod_name}', exported_container='kubetorch'}} * 1.048576 / 1000"
+         )  # convert MiB to MB, then to GB
+
+         gpu_util = extract_prometheus_metric_value(gpu_util_query)
+         gpu_mem_used = round(extract_prometheus_metric_value(gpu_mem_used_query), 3)
+         gpu_mem_free = extract_prometheus_metric_value(gpu_mem_free_query)
+         gpu_mem_total = gpu_mem_free + gpu_mem_used
+         gpu_mem_percent = (
+             round(100 * (gpu_mem_used / gpu_mem_total), 2) if gpu_mem_used else 0
+         )  # rough approximation, since total allocated GPU memory is not collected
+
+         gpu_metrics = {
+             "gpu_util": gpu_util,
+             "gpu_memory_usage": gpu_mem_used,
+             "gpu_memory_total": round(gpu_mem_total, 3),
+             "gpu_memory_usage_percent": gpu_mem_percent,
+         }
+
+         collected_metrics.update(gpu_metrics)
+
+     return collected_metrics
+
+
+ def get_current_cluster_name():
+     try:
+         from kubernetes import config as k8s_config
+
+         k8s_config.load_incluster_config()
+         # In-cluster: return the configured cluster name, or a generic fallback
+         return os.environ.get("CLUSTER_NAME", "in-cluster")
+     except Exception:
+         pass
+
+     # Fall back to the kubeconfig file
+     kubeconfig_path = os.getenv("KUBECONFIG") or str(Path.home() / ".kube" / "config")
+     if not os.path.exists(kubeconfig_path):
+         return None
+
+     with open(kubeconfig_path, "r") as f:
+         kubeconfig = yaml.safe_load(f)
+     current_context = kubeconfig.get("current-context")
+     for context in kubeconfig.get("contexts", []):
+         if context["name"] == current_context:
+             return context["context"]["cluster"]
+     return None
+
+
+ def print_pod_info(pod_name, pod_idx, is_gpu, metrics=None, queue_name=None):
+     """Print pod info, with metrics if available."""
+     base_msg = f"{BULLET_UNICODE} [reset][bold cyan]{pod_name}[/bold cyan] (idx: {pod_idx})"
+     if queue_name:
+         base_msg += f" | [bold]Queue Name[/bold]: {queue_name}"
+     console.print(base_msg)
+     if metrics:
+         console.print(
+             f"{DOUBLE_SPACE_UNICODE}[bold]CPU[/bold]: [reset]{metrics['cpu_util']}% "
+             f"({metrics['used_cpu']} / {metrics['requested_cpu']}) | "
+             f"[bold]Memory[/bold]: {metrics['cpu_memory_usage']} / {metrics['cpu_memory_total']} "
+             f"[bold]GB[/bold] ({metrics['cpu_memory_usage_percent']}%)"
+         )
+         if is_gpu:
+             console.print(
+                 f"{DOUBLE_SPACE_UNICODE}GPU: [reset]{metrics['gpu_util']}% | "
+                 f"Memory: {metrics['gpu_memory_usage']} / {metrics['gpu_memory_total']} "
+                 f"GB ({metrics['gpu_memory_usage_percent']}%)"
+             )
+     else:
+         console.print(f"{DOUBLE_SPACE_UNICODE}[yellow]Metrics unavailable[/yellow]")
+
+
+ def _get_logs_from_loki_worker(uri: str, print_pod_name: bool):
+     """Worker function for getting logs from Loki; runs in a separate thread."""
+     ws = None
+     try:
+         ws = create_connection(uri)
+         message = ws.recv()
+         if not message:
+             return None
+         data = json.loads(message)
+         logs = []
+         if data.get("streams"):
+             for stream in data["streams"]:
+                 pod_name = f'({stream.get("stream").get("pod")}) ' if print_pod_name else ""
+                 for value in stream.get("values"):
+                     try:
+                         log_line = json.loads(value[1])
+                         log_name = log_line.get("name")
+                         if log_name == "print_redirect":
+                             logs.append(f'{pod_name}{log_line.get("message")}')
+                         elif log_name != "uvicorn.access":
+                             formatted_log = (
+                                 f"{pod_name}{log_line.get('asctime')} | {log_line.get('levelname')} | "
+                                 f"{log_line.get('message')}\n"
+                             )
+                             logs.append(formatted_log)
+                     except Exception:
+                         logs.append(value[1])
+         return logs
+     finally:
+         if ws:
+             try:
+                 ws.close()
+             except Exception:
+                 pass
+
+
+ def get_logs_from_loki(
+     query: str = None,
+     uri: str = None,
+     print_pod_name: bool = False,
+     timeout: float = 5.0,
+ ):
+     """Get logs from Loki with a fail-fast approach to avoid hanging."""
+     try:
+         # If a URI is provided, use it directly (skip the service URL setup)
+         if uri:
+             return _get_logs_from_loki_worker(uri, print_pod_name)
+
+         from kubetorch.utils import http_to_ws
+
+         base_url = globals.service_url()
+         target_uri = f"{http_to_ws(base_url)}/loki/api/v1/tail?query={urllib.parse.quote_plus(query)}"
+
+         # Use a thread timeout for the websocket worker, since websocket timeouts don't work reliably
+         executor = ThreadPoolExecutor(max_workers=1)
+         try:
+             future = executor.submit(_get_logs_from_loki_worker, target_uri, print_pod_name)
+             try:
+                 return future.result(timeout=timeout)
+             except TimeoutError:
+                 logger.debug(f"Loki websocket connection timed out after {timeout}s")
+                 return None
+             except Exception as e:
+                 logger.debug(f"Error in Loki websocket worker: {e}")
+                 return None
+         finally:
+             # Don't wait for stuck threads to complete
+             executor.shutdown(wait=False)
+
+     except Exception as e:
+         logger.debug(f"Error getting logs from Loki: {e}")
+         return None
+
+
+ def stream_logs_websocket(uri, stop_event, service_name, print_pod_name: bool = False):
+     """Stream logs using Loki's websocket tail endpoint."""
+
+     console.print(f"\nFollowing logs of [reset]{service_name}\n")
+
+     # Create a dedicated event loop and run the stream helper to completion
+     loop = asyncio.new_event_loop()
+     asyncio.set_event_loop(loop)
+     try:
+         loop.run_until_complete(
+             stream_logs_websocket_helper(
+                 uri=uri,
+                 stop_event=stop_event,
+                 stream_type=StreamType.CLI,
+                 print_pod_name=print_pod_name,
+             )
+         )
+     finally:
+         loop.close()
+         # Signal the log thread to stop without waiting; it handles its own cleanup
+         stop_event.set()
+
+
+ def get_logs_query(name: str, namespace: str, selected_pod: str, deployment_mode):
+     if selected_pod:
+         return f'{{k8s_pod_name=~"{selected_pod}",k8s_container_name="kubetorch"}} | json'
+
+     if deployment_mode in ["knative", "deployment"]:
+         # We need to fetch the pod names first, since Loki doesn't have a service_name label
+         core_api = client.CoreV1Api()
+         pods = validate_pods_exist(name, namespace, core_api)
+         pod_names = [pod.metadata.name for pod in pods]
+         return f'{{k8s_pod_name=~"{"|".join(pod_names)}",k8s_container_name="kubetorch"}} | json'
+
+     console.print(f"[red]Logs are not supported for deployment mode: {deployment_mode}[/red]")
+     return None
+
+
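+ # Example of a resulting LogQL query (editor's illustration; the two pod names are hypothetical):
+ #   {k8s_pod_name=~"svc-abc|svc-def",k8s_container_name="kubetorch"} | json
+
+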
+ def follow_logs_in_cli(
+     name: str,
+     namespace: str,
+     selected_pod: str,
+     deployment_mode,
+     print_pod_name: bool = False,
+ ):
+     """Stream logs when triggered by the CLI command."""
+     from kubetorch.utils import http_to_ws
+
+     stop_event = threading.Event()
+
+     # Set up the query
+     query = get_logs_query(name, namespace, selected_pod, deployment_mode)
+     if not query:
+         return
+     encoded_query = urllib.parse.quote_plus(query)
+
+     base_url = globals.service_url()
+     uri = f"{http_to_ws(base_url)}/loki/api/v1/tail?query={encoded_query}"
+
+     # Set up a signal handler to stop cleanly on Ctrl+C
+     def signal_handler(signum, frame):
+         stop_event.set()
+         raise KeyboardInterrupt()
+
+     original_handler = signal.signal(signal.SIGINT, signal_handler)
+
+     try:
+         stream_logs_websocket(
+             uri=uri,
+             stop_event=stop_event,
+             service_name=name,
+             print_pod_name=print_pod_name,
+         )
+     finally:
+         # Restore the original signal handler
+         signal.signal(signal.SIGINT, original_handler)
+
+
+ def is_ingress_vpc_only(annotations: dict):
+     # Check for internal LoadBalancer annotations
+     internal_checks = [
+         annotations.get("service.beta.kubernetes.io/aws-load-balancer-internal") == "true",
+         annotations.get("networking.gke.io/load-balancer-type") == "Internal",
+         annotations.get("service.beta.kubernetes.io/oci-load-balancer-internal") == "true",
+     ]
+     return any(internal_checks)
+
+
+ def load_ingress(namespace: str = globals.config.install_namespace):
+     networking_v1_api = client.NetworkingV1Api()
+     ingresses = networking_v1_api.list_namespaced_ingress(namespace=namespace)
+
+     for ingress in ingresses.items:
+         if ingress.metadata.name == "kubetorch-proxy-ingress":
+             return ingress
+
+
+ def get_ingress_host(ingress):
+     """Get the configured host from the kubetorch ingress."""
+     try:
+         return ingress.spec.rules[0].host
+     except Exception:
+         return None
+
+
+ def list_all_queues():
+     try:
+         custom_api = client.CustomObjectsApi()
+         queues = custom_api.list_cluster_custom_object(
+             group="scheduling.run.ai",
+             version="v2",
+             plural="queues",
+         )["items"]
+
+         if not queues:
+             console.print("[yellow]No queues found in the cluster[/yellow]")
+             return
+
+         # Insert the "default" queue if it's missing
+         if not any(q["metadata"]["name"] == "default" for q in queues):
+             default_children = [
+                 q["metadata"]["name"] for q in queues if q.get("spec", {}).get("parentQueue") == "default"
+             ]
+             queues.insert(
+                 0,
+                 {
+                     "metadata": {"name": "default"},
+                     "spec": {
+                         "parentQueue": "-",
+                         "children": default_children,
+                         "resources": {
+                             "cpu": {"quota": "-", "overQuotaWeight": "-"},
+                             "gpu": {"quota": "-", "overQuotaWeight": "-"},
+                             "memory": {"quota": "-", "overQuotaWeight": "-"},
+                         },
+                         "priority": "-",
+                     },
+                 },
+             )
+
+         queue_table = Table(title="Available Queues", header_style=Style(bold=True))
+         queue_table.add_column("QUEUE NAME", style="cyan")
+         queue_table.add_column("PRIORITY", style="magenta")
+         queue_table.add_column("PARENT", style="green")
+         queue_table.add_column("CHILDREN", style="yellow")
+         queue_table.add_column("CPU QUOTA", style="white")
+         queue_table.add_column("GPU QUOTA", style="white")
+         queue_table.add_column("MEMORY QUOTA", style="white")
+         queue_table.add_column("OVERQUOTA WEIGHT", style="blue")
+
+         for q in queues:
+             spec = q.get("spec", {})
+             resources = spec.get("resources", {})
+             cpu = resources.get("cpu", {})
+             gpu = resources.get("gpu", {})
+             memory = resources.get("memory", {})
+
+             queue_table.add_row(
+                 q["metadata"]["name"],
+                 str(spec.get("priority", "-")),
+                 spec.get("parentQueue", "-"),
+                 ", ".join(spec.get("children", [])) or "-",
+                 str(cpu.get("quota", "-")),
+                 str(gpu.get("quota", "-")),
+                 str(memory.get("quota", "-")),
+                 str(cpu.get("overQuotaWeight", "-")),  # use the CPU overQuotaWeight as representative
+             )
+
+         console.print(queue_table)
+         return
+
+     except client.exceptions.ApiException as e:
+         console.print(f"[red]Failed to list queues: {e}[/red]")
+         raise typer.Exit(1)
+
+
+ def detect_deployment_mode(name: str, namespace: str, custom_api, apps_v1_api):
+     """Detect whether a service is deployed as a Deployment, Knative service, or RayCluster."""
+     # First try Deployment
+     try:
+         apps_v1_api.read_namespaced_deployment(name=name, namespace=namespace)
+         return "deployment"
+     except ApiException:
+         pass
+
+     # Then try Knative
+     try:
+         custom_api.get_namespaced_custom_object(
+             group="serving.knative.dev",
+             version="v1",
+             namespace=namespace,
+             plural="services",
+             name=name,
+         )
+         return "knative"
+     except ApiException:
+         pass
+
+     # Then try RayCluster
+     try:
+         custom_api.get_namespaced_custom_object(
+             group="ray.io",
+             version="v1",
+             namespace=namespace,
+             plural="rayclusters",
+             name=name,
+         )
+         return "raycluster"
+     except ApiException:
+         pass
+
+     return None
+
+
+ def validate_provided_pod(service_name, provided_pod, service_pods):
+     if provided_pod is None:
+         return provided_pod
+
+     # Case when the user provides a pod index
+     if provided_pod.isnumeric():
+         pod = int(provided_pod)
+         if pod < 0 or pod >= len(service_pods):
+             console.print(f"[red]Pod index {pod} is out of range[/red]")
+             raise typer.Exit(1)
+         pod_name = service_pods[pod].metadata.name
+
+     # Case when the user provides a pod name
+     else:
+         pod_names = [pod.metadata.name for pod in service_pods]
+         if provided_pod not in pod_names:
+             console.print(f"[red]{service_name} does not have an associated pod called {provided_pod}[/red]")
+             raise typer.Exit(1)
+         pod_name = provided_pod
+
+     return pod_name
+
+
+ def load_kubetorch_volumes_from_pods(pods: List[client.V1Pod]) -> List[str]:
+     """Extract the names of PVC-backed volumes from a service's pods."""
+     volumes = []
+
+     if pods:
+         pod = pods[0]
+         for v in pod.spec.volumes or []:
+             if v.persistent_volume_claim:
+                 volumes.append(v.name)
+
+     return volumes
+
+
+ def load_kubetorch_volumes_for_service(namespace, service_name, core_v1) -> List[str]:
+     """Extract volume information from the service definition."""
+     try:
+         pods = core_v1.list_namespaced_pod(
+             namespace=namespace,
+             label_selector=f"kubetorch.com/service={service_name}",
+         )
+         return load_kubetorch_volumes_from_pods(pods.items)
+
+     except Exception as e:
+         logger.warning(f"Failed to extract volumes from service: {e}")
+         return []
+
+
+ def create_table_for_output(columns: List[tuple], no_wrap_columns_names: list = None, header_style: dict = None):
+     table = Table(box=box.SQUARE, header_style=Style(**(header_style or {})))
+     for name, style in columns:
+         if name in (no_wrap_columns_names or []):
+             # Always keep these columns (e.g. the service name) fully visible
+             table.add_column(name, style=style, no_wrap=True)
+         else:
+             table.add_column(name, style=style)
+
+     return table
+
+
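+ # Example usage (editor's sketch; the column names and styles are illustrative):
+ #   table = create_table_for_output(
+ #       columns=[("NAME", "cyan"), ("STATUS", "green")],
+ #       no_wrap_columns_names=["NAME"],
+ #       header_style={"bold": True},
+ #   )
+
+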
+ def notebook_placeholder():
+     """Placeholder function for the notebook service; keeps the pod alive for port forwarding."""
+     import time
+
+     time.sleep(3600)  # Keep alive for port forwarding