kubetorch 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kubetorch/__init__.py +59 -0
- kubetorch/cli.py +1939 -0
- kubetorch/cli_utils.py +967 -0
- kubetorch/config.py +453 -0
- kubetorch/constants.py +18 -0
- kubetorch/docs/Makefile +18 -0
- kubetorch/docs/__init__.py +0 -0
- kubetorch/docs/_ext/json_globaltoc.py +42 -0
- kubetorch/docs/api/cli.rst +10 -0
- kubetorch/docs/api/python/app.rst +21 -0
- kubetorch/docs/api/python/cls.rst +19 -0
- kubetorch/docs/api/python/compute.rst +25 -0
- kubetorch/docs/api/python/config.rst +11 -0
- kubetorch/docs/api/python/fn.rst +19 -0
- kubetorch/docs/api/python/image.rst +14 -0
- kubetorch/docs/api/python/secret.rst +18 -0
- kubetorch/docs/api/python/volumes.rst +13 -0
- kubetorch/docs/api/python.rst +101 -0
- kubetorch/docs/conf.py +69 -0
- kubetorch/docs/index.rst +20 -0
- kubetorch/docs/requirements.txt +5 -0
- kubetorch/globals.py +269 -0
- kubetorch/logger.py +59 -0
- kubetorch/resources/__init__.py +0 -0
- kubetorch/resources/callables/__init__.py +0 -0
- kubetorch/resources/callables/cls/__init__.py +0 -0
- kubetorch/resources/callables/cls/cls.py +159 -0
- kubetorch/resources/callables/fn/__init__.py +0 -0
- kubetorch/resources/callables/fn/fn.py +140 -0
- kubetorch/resources/callables/module.py +1315 -0
- kubetorch/resources/callables/utils.py +203 -0
- kubetorch/resources/compute/__init__.py +0 -0
- kubetorch/resources/compute/app.py +253 -0
- kubetorch/resources/compute/compute.py +2414 -0
- kubetorch/resources/compute/decorators.py +137 -0
- kubetorch/resources/compute/utils.py +1026 -0
- kubetorch/resources/compute/websocket.py +135 -0
- kubetorch/resources/images/__init__.py +1 -0
- kubetorch/resources/images/image.py +412 -0
- kubetorch/resources/images/images.py +64 -0
- kubetorch/resources/secrets/__init__.py +2 -0
- kubetorch/resources/secrets/kubernetes_secrets_client.py +377 -0
- kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
- kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
- kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
- kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
- kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/providers.py +92 -0
- kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
- kubetorch/resources/secrets/secret.py +224 -0
- kubetorch/resources/secrets/secret_factory.py +64 -0
- kubetorch/resources/secrets/utils.py +222 -0
- kubetorch/resources/volumes/__init__.py +0 -0
- kubetorch/resources/volumes/volume.py +340 -0
- kubetorch/servers/__init__.py +0 -0
- kubetorch/servers/http/__init__.py +0 -0
- kubetorch/servers/http/distributed_utils.py +2968 -0
- kubetorch/servers/http/http_client.py +802 -0
- kubetorch/servers/http/http_server.py +1622 -0
- kubetorch/servers/http/server_metrics.py +255 -0
- kubetorch/servers/http/utils.py +722 -0
- kubetorch/serving/__init__.py +0 -0
- kubetorch/serving/autoscaling.py +153 -0
- kubetorch/serving/base_service_manager.py +344 -0
- kubetorch/serving/constants.py +77 -0
- kubetorch/serving/deployment_service_manager.py +431 -0
- kubetorch/serving/knative_service_manager.py +487 -0
- kubetorch/serving/raycluster_service_manager.py +526 -0
- kubetorch/serving/service_manager.py +18 -0
- kubetorch/serving/templates/deployment_template.yaml +17 -0
- kubetorch/serving/templates/knative_service_template.yaml +19 -0
- kubetorch/serving/templates/kt_setup_template.sh.j2 +91 -0
- kubetorch/serving/templates/pod_template.yaml +198 -0
- kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
- kubetorch/serving/templates/raycluster_template.yaml +35 -0
- kubetorch/serving/templates/service_template.yaml +21 -0
- kubetorch/serving/templates/workerset_template.yaml +36 -0
- kubetorch/serving/utils.py +344 -0
- kubetorch/utils.py +263 -0
- kubetorch-0.2.5.dist-info/METADATA +75 -0
- kubetorch-0.2.5.dist-info/RECORD +92 -0
- kubetorch-0.2.5.dist-info/WHEEL +4 -0
- kubetorch-0.2.5.dist-info/entry_points.txt +5 -0
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import logging.config
|
|
4
|
+
import os
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from fastapi import FastAPI, Request
|
|
9
|
+
from fastapi.responses import Response
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
from utils import ensure_structured_logging, LOG_CONFIG
|
|
13
|
+
except ImportError:
|
|
14
|
+
from .utils import ensure_structured_logging, LOG_CONFIG
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# The heartbeat interval is derived as (TTL seconds // HEARTBEAT_INTERVAL_DIVISOR).
HEARTBEAT_INTERVAL_DIVISOR = 5

# Set up our structured JSON logging
logging.config.dictConfig(LOG_CONFIG)
ensure_structured_logging()

logger = logging.getLogger(__name__)
# Set log level based on environment variable
log_level = os.getenv("KT_LOG_LEVEL")
if log_level:
    log_level = log_level.upper()
    # getattr() may resolve to a non-level attribute of the logging module
    # (e.g. BASIC_FORMAT, a str), which Logger.setLevel() rejects with
    # ValueError at import time. Only accept integer results; otherwise fall
    # back to INFO like any other unknown name.
    _resolved_level = getattr(logging, log_level, logging.INFO)
    logger.setLevel(_resolved_level if isinstance(_resolved_level, int) else logging.INFO)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_inactivity_ttl_annotation() -> Optional[int]:
    """
    Look up the inactivity TTL configured for this pod.

    The value is read from the ``KT_INACTIVITY_TTL`` environment variable
    (which can be injected via the downward API) and converted to seconds
    with ``parse_ttl_string``.

    Returns:
        The TTL in seconds, or None when the variable is unset or empty, or
        when reading/parsing it raises.
    """
    try:
        raw_ttl = os.getenv("KT_INACTIVITY_TTL")
        return parse_ttl_string(raw_ttl) if raw_ttl else None
    except Exception as e:
        # Best effort: a malformed TTL must not stop the caller.
        logger.error(f"Error getting pod TTL annotation: {e}")

    return None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def parse_ttl_string(ttl_str: str) -> Optional[int]:
    """Parse a TTL string to seconds. Supports formats: 300, 5m, 1h, 1h30m, 1d

    The input is stripped and lower-cased first. A bare run of ASCII digits is
    taken as seconds. Otherwise the whole string must be one or more
    ``<number><unit>`` components (units ``d``, ``h``, ``m``, ``s``),
    optionally separated by whitespace; the components are summed.

    Args:
        ttl_str: The TTL text, e.g. ``"300"``, ``"5m"`` or ``"1h30m"``.

    Returns:
        The TTL in seconds. None when the string does not match the supported
        formats, or when a duration string adds up to zero. A bare ``"0"``
        still returns 0.
    """
    import re

    ttl_str = ttl_str.strip().lower()

    # If it's just a number, assume seconds. isdigit() alone also accepts
    # characters such as superscript digits that int() cannot convert, so
    # restrict to ASCII.
    if ttl_str.isascii() and ttl_str.isdigit():
        return int(ttl_str)

    # Validate the whole string; scanning for components alone would accept
    # partial matches (e.g. "1.5h" would be read as 5 hours).
    if not re.fullmatch(r"\d+[dhms](?:\s*\d+[dhms])*", ttl_str, flags=re.ASCII):
        return None

    unit_seconds = {"d": 24 * 3600, "h": 3600, "m": 60, "s": 1}
    total_seconds = sum(
        int(value) * unit_seconds[unit]
        for value, unit in re.findall(r"(\d+)([dhms])", ttl_str, flags=re.ASCII)
    )

    return total_seconds if total_seconds > 0 else None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class HeartbeatManager:
    """Track request activity and periodically record a heartbeat metric.

    A background asyncio task wakes every ``heartbeat_interval`` seconds and
    increments the ``kt_heartbeat_sent`` counter if requests are in flight or
    there was activity within the last interval. When ``prometheus_client``
    is not installed the manager still tracks activity, but no metrics are
    recorded.
    """

    def __init__(self, ttl_seconds: int):
        """Create the manager.

        Args:
            ttl_seconds: Inactivity TTL in seconds; the heartbeat interval is
                ``ttl_seconds // HEARTBEAT_INTERVAL_DIVISOR``, at least 1.
        """
        # Set plain state before touching prometheus_client, so the instance
        # is fully usable even when the optional import below fails.
        self.ttl_seconds = ttl_seconds
        # Clamp to one second: a TTL smaller than the divisor floors to 0,
        # which would turn the heartbeat loop into a busy loop.
        self.heartbeat_interval = max(1, self.ttl_seconds // HEARTBEAT_INTERVAL_DIVISOR)
        self.active_requests = 0
        self.last_activity = datetime.now()
        self.heartbeat_task: Optional[asyncio.Task] = None
        self.service_name = os.getenv("KT_SERVICE", "unknown-service")
        self.kubetorch_version = os.getenv("KUBETORCH_VERSION", "0.0.0")
        self.service_namespace = os.getenv("POD_NAMESPACE", "default")
        self.service_type = os.getenv("KT_DEPLOYMENT_MODE", "deployment")

        # None means metrics are disabled; every metric update checks this.
        self.heartbeat_counter = None
        self.active_requests_gauge = None

        try:
            from prometheus_client import Counter, Gauge
        except ImportError:
            logger.info("Prometheus client not installed, heartbeat metrics not enabled")
            return

        label_names = ["service_name", "kubetorch_version", "service_namespace", "service_type"]
        self.heartbeat_counter = Counter(
            "kt_heartbeat_sent",
            "Total heartbeats sent",
            label_names,
        )
        self.active_requests_gauge = Gauge(
            "http_server_active_requests",
            "Number of currently active requests",
            label_names,
        )

        logger.info(f"Heartbeat Manager initialized: TTL={self.ttl_seconds}s, Interval={self.heartbeat_interval}s")

    @property
    def labels(self):
        """Label values applied to every metric sample from this manager."""
        return {
            "service_name": self.service_name,
            "kubetorch_version": self.kubetorch_version,
            "service_namespace": self.service_namespace,
            "service_type": self.service_type,
        }

    async def start(self):
        """Start the heartbeat manager"""
        # Starting twice would orphan the first task, which stop() could then
        # never cancel.
        if self.heartbeat_task is not None and not self.heartbeat_task.done():
            return
        self.heartbeat_task = asyncio.create_task(self._heartbeat_loop())
        logger.info("Heartbeat started - tracking activity metrics")

    async def stop(self):
        """Stop the heartbeat manager"""
        if self.heartbeat_task:
            self.heartbeat_task.cancel()
            try:
                await self.heartbeat_task
            except asyncio.CancelledError:
                pass
            # Clear the handle so a later start() creates a fresh task.
            self.heartbeat_task = None

    def request_started(self):
        """Called when a request starts"""
        self.active_requests += 1
        if self.active_requests_gauge is not None:
            self.active_requests_gauge.labels(**self.labels).inc()
        self.last_activity = datetime.now()

    def request_finished(self):
        """Called when a request finishes"""
        # Decrement the counter and the gauge together so an unmatched
        # finish cannot push the gauge below the counter (or below zero).
        if self.active_requests > 0:
            self.active_requests -= 1
            if self.active_requests_gauge is not None:
                self.active_requests_gauge.labels(**self.labels).dec()
        self.last_activity = datetime.now()

    async def _send_heartbeat(self):
        """Record heartbeat activity in metrics"""
        if self.heartbeat_counter is None:
            return
        self.heartbeat_counter.labels(**self.labels).inc()
        logger.debug("Heartbeat recorded - counter incremented")

    async def _heartbeat_loop(self):
        """Main heartbeat loop"""
        while True:
            try:
                await asyncio.sleep(self.heartbeat_interval)

                # Send a heartbeat while requests are in flight, or when the
                # last activity happened within the most recent interval.
                if self.active_requests > 0:
                    await self._send_heartbeat()
                    continue

                time_since_activity = (datetime.now() - self.last_activity).total_seconds()
                if time_since_activity < self.heartbeat_interval:
                    await self._send_heartbeat()
                else:
                    logger.debug("Skipping heartbeat - no active requests or recent activity")

            except asyncio.CancelledError:
                break
            except Exception as e:
                # Keep the loop alive; one failed iteration should not stop
                # future heartbeats.
                logger.error(f"Error in heartbeat loop: {e}")
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def setup_otel_metrics(app: FastAPI):
    """Setup OpenTelemetry metrics with Prometheus export for FastAPI

    Installs a global meter provider backed by a Prometheus metric reader,
    registers a ``/metrics`` route, and adds an HTTP middleware that reports
    request start/finish to ``app.state.heartbeat_manager`` when one is set.

    Args:
        app: The FastAPI application to instrument.

    Returns:
        A tuple ``(app, meter_provider)``; ``(app, None)`` when the
        OpenTelemetry or Prometheus packages cannot be imported.
    """
    try:
        from opentelemetry.exporter.prometheus import PrometheusMetricReader
        from opentelemetry.metrics import set_meter_provider
        from opentelemetry.sdk.metrics import MeterProvider
        from opentelemetry.sdk.resources import Resource
        from prometheus_client import CollectorRegistry, CONTENT_TYPE_LATEST, generate_latest, Info
    except ImportError as e:
        logger.debug(f"OpenTelemetry metrics not enabled: {e}")
        return app, None

    logger.info("Instrumenting FastAPI app for metrics")

    # Service identity comes from the environment.
    # NOTE(review): HeartbeatManager reads KT_SERVICE rather than
    # KT_SERVICE_NAME - confirm the difference is intentional.
    service_name = os.getenv("KT_SERVICE_NAME", os.getenv("OTEL_SERVICE_NAME", "unknown-service"))
    namespace = os.getenv("POD_NAMESPACE", "default")
    resource_attributes = {
        "service.name": service_name,
        "service.version": os.getenv("KUBETORCH_VERSION", "0.0.0"),
        "service.namespace": namespace,
        "deployment.environment": namespace,
        "service.type": os.getenv("KT_DEPLOYMENT_MODE", "deployment"),
    }

    # Meter provider = resource description + Prometheus reader, installed globally.
    meter_provider = MeterProvider(
        resource=Resource.create(resource_attributes),
        metric_readers=[PrometheusMetricReader(disable_target_info=False)],  # Explicitly enable target_info
    )
    set_meter_provider(meter_provider)

    @app.get("/metrics")
    async def get_metrics(request: Request):
        """Expose Prometheus-formatted OpenTelemetry metrics"""
        manager = getattr(request.app.state, "heartbeat_manager", None)

        # Heartbeat configuration goes into a throwaway per-request registry.
        extra_registry = None
        if manager:
            extra_registry = CollectorRegistry()
            Info("heartbeat", "Heartbeat configuration info", registry=extra_registry).info(
                {
                    "ttl_seconds": str(manager.ttl_seconds),
                    "interval_seconds": str(manager.heartbeat_interval),
                }
            )

        # Default registry first, then the heartbeat info when present.
        chunks = [generate_latest().decode("utf-8")]
        if extra_registry is not None:
            chunks.append(generate_latest(extra_registry).decode("utf-8"))

        return Response(content="".join(chunks), media_type=CONTENT_TYPE_LATEST)

    @app.middleware("http")
    async def track_requests(request: Request, call_next):
        """Middleware to track active requests for heartbeat"""
        manager = getattr(request.app.state, "heartbeat_manager", None)

        # The metrics and health endpoints do not count as activity.
        if not manager or request.url.path in ("/metrics", "/health"):
            return await call_next(request)

        manager.request_started()
        try:
            return await call_next(request)
        finally:
            manager.request_finished()

    logger.info(f"OpenTelemetry metrics enabled for service: {service_name}")
    return app, meter_provider
|