kubetorch 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kubetorch might be problematic. Click here for more details.

Files changed (93) hide show
  1. kubetorch/__init__.py +60 -0
  2. kubetorch/cli.py +1985 -0
  3. kubetorch/cli_utils.py +1025 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +285 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +157 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +133 -0
  30. kubetorch/resources/callables/module.py +1416 -0
  31. kubetorch/resources/callables/utils.py +174 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +261 -0
  34. kubetorch/resources/compute/compute.py +2596 -0
  35. kubetorch/resources/compute/decorators.py +139 -0
  36. kubetorch/resources/compute/rbac.py +74 -0
  37. kubetorch/resources/compute/utils.py +1114 -0
  38. kubetorch/resources/compute/websocket.py +137 -0
  39. kubetorch/resources/images/__init__.py +1 -0
  40. kubetorch/resources/images/image.py +414 -0
  41. kubetorch/resources/images/images.py +74 -0
  42. kubetorch/resources/secrets/__init__.py +2 -0
  43. kubetorch/resources/secrets/kubernetes_secrets_client.py +412 -0
  44. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  45. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  46. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  47. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  48. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  49. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  50. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  51. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  52. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  53. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  54. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  55. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  56. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  57. kubetorch/resources/secrets/provider_secrets/providers.py +93 -0
  58. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  59. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  60. kubetorch/resources/secrets/secret.py +238 -0
  61. kubetorch/resources/secrets/secret_factory.py +70 -0
  62. kubetorch/resources/secrets/utils.py +209 -0
  63. kubetorch/resources/volumes/__init__.py +0 -0
  64. kubetorch/resources/volumes/volume.py +365 -0
  65. kubetorch/servers/__init__.py +0 -0
  66. kubetorch/servers/http/__init__.py +0 -0
  67. kubetorch/servers/http/distributed_utils.py +3223 -0
  68. kubetorch/servers/http/http_client.py +730 -0
  69. kubetorch/servers/http/http_server.py +1788 -0
  70. kubetorch/servers/http/server_metrics.py +278 -0
  71. kubetorch/servers/http/utils.py +728 -0
  72. kubetorch/serving/__init__.py +0 -0
  73. kubetorch/serving/autoscaling.py +173 -0
  74. kubetorch/serving/base_service_manager.py +363 -0
  75. kubetorch/serving/constants.py +83 -0
  76. kubetorch/serving/deployment_service_manager.py +478 -0
  77. kubetorch/serving/knative_service_manager.py +519 -0
  78. kubetorch/serving/raycluster_service_manager.py +582 -0
  79. kubetorch/serving/service_manager.py +18 -0
  80. kubetorch/serving/templates/deployment_template.yaml +17 -0
  81. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  82. kubetorch/serving/templates/kt_setup_template.sh.j2 +81 -0
  83. kubetorch/serving/templates/pod_template.yaml +194 -0
  84. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  85. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  86. kubetorch/serving/templates/service_template.yaml +21 -0
  87. kubetorch/serving/templates/workerset_template.yaml +36 -0
  88. kubetorch/serving/utils.py +377 -0
  89. kubetorch/utils.py +284 -0
  90. kubetorch-0.2.0.dist-info/METADATA +121 -0
  91. kubetorch-0.2.0.dist-info/RECORD +93 -0
  92. kubetorch-0.2.0.dist-info/WHEEL +4 -0
  93. kubetorch-0.2.0.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,278 @@
1
+ import asyncio
2
+ import logging
3
+ import logging.config
4
+ import os
5
+ from datetime import datetime
6
+ from typing import Optional
7
+
8
+ from fastapi import FastAPI, Request
9
+ from fastapi.responses import Response
10
+
11
+ try:
12
+ from utils import ensure_structured_logging, LOG_CONFIG
13
+ except ImportError:
14
+ from .utils import ensure_structured_logging, LOG_CONFIG
15
+
16
+
17
# The heartbeat interval is the inactivity TTL divided by this value.
HEARTBEAT_INTERVAL_DIVISOR = 5

# Install the structured JSON logging configuration before creating loggers.
logging.config.dictConfig(LOG_CONFIG)
ensure_structured_logging()

logger = logging.getLogger(__name__)

# Optional override of this module's log level via KT_LOG_LEVEL; names that
# are not attributes of the logging module fall back to INFO.
log_level = os.getenv("KT_LOG_LEVEL")
if log_level:
    log_level = log_level.upper()
    _resolved_level = getattr(logging, log_level, logging.INFO)
    logger.setLevel(_resolved_level)
29
+
30
+
31
def get_inactivity_ttl_annotation() -> Optional[int]:
    """Return the configured inactivity TTL in seconds, if any.

    The value is read from the ``KT_INACTIVITY_TTL`` environment variable
    (which can be injected via the downward API) and converted with
    ``parse_ttl_string``.

    Returns:
        The TTL in seconds, or ``None`` when the variable is unset or empty,
        when its value cannot be parsed, or when parsing raises.
    """
    ttl_str = os.getenv("KT_INACTIVITY_TTL")
    if not ttl_str:
        return None

    try:
        ttl_seconds = parse_ttl_string(ttl_str)
    except Exception as e:
        # Best-effort lookup: a bad value must not break the caller.
        logger.error(f"Error getting pod TTL annotation: {e}")
        return None

    if ttl_seconds is None:
        # Make a misconfigured value visible instead of dropping it silently.
        logger.warning(f"Ignoring unparseable KT_INACTIVITY_TTL value: {ttl_str!r}")
    return ttl_seconds
47
+
48
+
49
def parse_ttl_string(ttl_str: str) -> Optional[int]:
    """Parse a TTL string into a number of seconds.

    Accepted forms (case-insensitive, surrounding whitespace ignored):

    * a bare non-negative integer, interpreted as seconds: ``300``
    * one or more ``<integer><unit>`` components, where unit is ``d``, ``h``,
      ``m`` or ``s``: ``5m``, ``1h``, ``1h30m``, ``1d``. Components are summed
      and may be separated by whitespace.

    Args:
        ttl_str: The TTL string to parse.

    Returns:
        The duration in seconds. ``None`` when the string is not made up
        entirely of valid components, or when a unit-based duration sums to
        zero. A bare ``"0"`` returns ``0``.
    """
    import re

    ttl_str = ttl_str.strip().lower()

    # Bare number means seconds. isdecimal() (unlike isdigit()) only accepts
    # characters that int() can actually convert.
    if ttl_str.isdecimal():
        return int(ttl_str)

    # Require the whole string to be duration components. Without this check
    # "1.5h" would be read as "5h" and "-5m" as "5m".
    if not re.fullmatch(r"(?:\d+[dhms]\s*)+", ttl_str):
        return None

    seconds_per_unit = {"d": 24 * 3600, "h": 3600, "m": 60, "s": 1}
    total_seconds = sum(
        int(value) * seconds_per_unit[unit]
        for value, unit in re.findall(r"(\d+)([dhms])", ttl_str)
    )

    return total_seconds if total_seconds > 0 else None
77
+
78
+
79
class HeartbeatManager:
    """Track request activity and periodically record a heartbeat metric.

    While requests are in flight, or finished within the last heartbeat
    interval, a background asyncio task increments the ``kt_heartbeat_sent``
    counter once per interval. The number of in-flight requests is mirrored
    in the ``http_server_active_requests`` gauge.

    ``prometheus_client`` is optional. When it is not installed the manager
    is still fully initialized and request tracking still works, but no
    metrics are recorded and ``start()`` does not launch the background task.
    """

    def __init__(self, ttl_seconds: int):
        """Initialize activity state and, if available, the Prometheus metrics.

        Args:
            ttl_seconds: Inactivity TTL in seconds. The heartbeat interval is
                this value divided by ``HEARTBEAT_INTERVAL_DIVISOR``, with a
                minimum of one second.
        """
        # Set all state before the optional import so that every method is
        # safe to call even when the metrics library is missing.
        self.ttl_seconds = ttl_seconds
        # Never let the interval reach 0: asyncio.sleep(0) would make the
        # heartbeat loop spin continuously for small TTLs.
        self.heartbeat_interval = max(
            1, self.ttl_seconds // HEARTBEAT_INTERVAL_DIVISOR
        )
        self.active_requests = 0
        self.last_activity = datetime.now()
        self.heartbeat_task: Optional[asyncio.Task] = None
        # NOTE(review): setup_otel_metrics in this file reads KT_SERVICE_NAME
        # rather than KT_SERVICE - confirm which variable is intended.
        self.service_name = os.getenv("KT_SERVICE", "unknown-service")
        self.kubetorch_version = os.getenv("KUBETORCH_VERSION", "0.0.0")
        self.service_namespace = os.getenv("POD_NAMESPACE", "default")
        self.service_type = os.getenv("KT_DEPLOYMENT_MODE", "deployment")
        self.heartbeat_counter = None
        self.active_requests_gauge = None

        try:
            from prometheus_client import Counter, Gauge
        except ImportError:
            logger.info(
                "Prometheus client not installed, heartbeat metrics not enabled"
            )
            return

        label_names = [
            "service_name",
            "kubetorch_version",
            "service_namespace",
            "service_type",
        ]
        self.heartbeat_counter = Counter(
            "kt_heartbeat_sent",
            "Total heartbeats sent",
            label_names,
        )
        self.active_requests_gauge = Gauge(
            "http_server_active_requests",
            "Number of currently active requests",
            label_names,
        )

        logger.info(
            f"Heartbeat Manager initialized: TTL={self.ttl_seconds}s, Interval={self.heartbeat_interval}s"
        )

    @property
    def labels(self):
        """Label values applied to every metric sample from this manager."""
        return {
            "service_name": self.service_name,
            "kubetorch_version": self.kubetorch_version,
            "service_namespace": self.service_namespace,
            "service_type": self.service_type,
        }

    async def start(self):
        """Start the background heartbeat task.

        Does nothing when metrics are unavailable (there would be nothing to
        record) or when a heartbeat task is already running.
        """
        if self.heartbeat_counter is None:
            logger.info("Heartbeat not started - heartbeat metrics not enabled")
            return
        if self.heartbeat_task is not None and not self.heartbeat_task.done():
            # Avoid orphaning an already running loop.
            return
        self.heartbeat_task = asyncio.create_task(self._heartbeat_loop())
        logger.info("Heartbeat started - tracking activity metrics")

    async def stop(self):
        """Cancel the background heartbeat task, if any, and wait for it to end."""
        if self.heartbeat_task:
            self.heartbeat_task.cancel()
            try:
                await self.heartbeat_task
            except asyncio.CancelledError:
                pass
            self.heartbeat_task = None

    def request_started(self):
        """Record that a request has started."""
        self.active_requests += 1
        if self.active_requests_gauge is not None:
            self.active_requests_gauge.labels(**self.labels).inc()
        self.last_activity = datetime.now()

    def request_finished(self):
        """Record that a request has finished."""
        # Keep the gauge in step with the clamped counter: an unmatched call
        # must not push the exported value below zero.
        if self.active_requests > 0:
            self.active_requests -= 1
            if self.active_requests_gauge is not None:
                self.active_requests_gauge.labels(**self.labels).dec()
        self.last_activity = datetime.now()

    async def _send_heartbeat(self):
        """Increment the heartbeat counter (no-op when metrics are disabled)."""
        if self.heartbeat_counter is None:
            return
        self.heartbeat_counter.labels(**self.labels).inc()
        logger.debug("Heartbeat recorded - counter incremented")

    async def _heartbeat_loop(self):
        """Sleep one interval at a time and record a heartbeat when active.

        A heartbeat is recorded when requests are in flight or when the last
        activity happened less than one interval ago. The loop exits on
        cancellation; any other error is logged and the loop continues.
        """
        while True:
            try:
                await asyncio.sleep(self.heartbeat_interval)

                if self.active_requests > 0:
                    await self._send_heartbeat()
                    continue

                # No request in flight: still count very recent activity.
                time_since_activity = (
                    datetime.now() - self.last_activity
                ).total_seconds()
                if time_since_activity < self.heartbeat_interval:
                    await self._send_heartbeat()
                else:
                    logger.debug(
                        "Skipping heartbeat - no active requests or recent activity"
                    )

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in heartbeat loop: {e}")
180
+
181
+
182
def setup_otel_metrics(app: FastAPI):
    """Instrument *app* with OpenTelemetry metrics exported for Prometheus.

    Installs a global meter provider backed by a Prometheus metric reader,
    registers a ``/metrics`` route on *app*, and adds an HTTP middleware that
    reports request start/finish to ``app.state.heartbeat_manager`` when one
    is set.

    Returns:
        ``(app, meter_provider)``, or ``(app, None)`` when the OpenTelemetry
        or prometheus_client packages cannot be imported.
    """
    try:
        from opentelemetry.exporter.prometheus import PrometheusMetricReader
        from opentelemetry.metrics import set_meter_provider
        from opentelemetry.sdk.metrics import MeterProvider
        from opentelemetry.sdk.resources import Resource
        from prometheus_client import (
            CollectorRegistry,
            CONTENT_TYPE_LATEST,
            generate_latest,
            Info,
        )
    except ImportError as e:
        logger.info(f"OpenTelemetry metrics not enabled: {e}")
        return app, None

    logger.info("Instrumenting FastAPI app for metrics")

    # Service identity comes from the environment.
    fallback_name = os.getenv("OTEL_SERVICE_NAME", "unknown-service")
    service_name = os.getenv("KT_SERVICE_NAME", fallback_name)
    namespace = os.getenv("POD_NAMESPACE", "default")

    resource_attributes = {
        "service.name": service_name,
        "service.version": os.getenv("KUBETORCH_VERSION", "0.0.0"),
        "service.namespace": namespace,
        "deployment.environment": namespace,
        "service.type": os.getenv("KT_DEPLOYMENT_MODE", "deployment"),
    }

    # The reader is created with target_info explicitly enabled.
    meter_provider = MeterProvider(
        resource=Resource.create(resource_attributes),
        metric_readers=[PrometheusMetricReader(disable_target_info=False)],
    )
    set_meter_provider(meter_provider)

    # Requests to these paths are not counted as activity.
    untracked_paths = ("/metrics", "/health")

    @app.get("/metrics")
    async def get_metrics(request: Request):
        """Expose Prometheus-formatted OpenTelemetry metrics"""
        manager = getattr(request.app.state, "heartbeat_manager", None)

        # Heartbeat configuration goes into a throwaway registry so it is
        # rebuilt on every scrape.
        extra_registry = None
        if manager:
            extra_registry = CollectorRegistry()
            Info(
                "heartbeat", "Heartbeat configuration info", registry=extra_registry
            ).info(
                {
                    "ttl_seconds": str(manager.ttl_seconds),
                    "interval_seconds": str(manager.heartbeat_interval),
                }
            )

        # Default registry first (includes the heartbeat metrics), then extras.
        payload = generate_latest().decode("utf-8")
        if extra_registry is not None:
            payload += generate_latest(extra_registry).decode("utf-8")

        return Response(content=payload, media_type=CONTENT_TYPE_LATEST)

    @app.middleware("http")
    async def track_requests(request: Request, call_next):
        """Middleware to track active requests for heartbeat"""
        manager = getattr(request.app.state, "heartbeat_manager", None)
        if not manager or request.url.path in untracked_paths:
            return await call_next(request)

        manager.request_started()
        try:
            return await call_next(request)
        finally:
            # Always balance request_started(), even if the handler raises.
            manager.request_finished()

    logger.info(f"OpenTelemetry metrics enabled for service: {service_name}")
    return app, meter_provider