kubetorch 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. kubetorch/__init__.py +59 -0
  2. kubetorch/cli.py +1939 -0
  3. kubetorch/cli_utils.py +967 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +269 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +159 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +140 -0
  30. kubetorch/resources/callables/module.py +1315 -0
  31. kubetorch/resources/callables/utils.py +203 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +253 -0
  34. kubetorch/resources/compute/compute.py +2414 -0
  35. kubetorch/resources/compute/decorators.py +137 -0
  36. kubetorch/resources/compute/utils.py +1026 -0
  37. kubetorch/resources/compute/websocket.py +135 -0
  38. kubetorch/resources/images/__init__.py +1 -0
  39. kubetorch/resources/images/image.py +412 -0
  40. kubetorch/resources/images/images.py +64 -0
  41. kubetorch/resources/secrets/__init__.py +2 -0
  42. kubetorch/resources/secrets/kubernetes_secrets_client.py +377 -0
  43. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  44. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  45. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  46. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  47. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  48. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  49. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  50. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  51. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  52. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  53. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  54. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  55. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  56. kubetorch/resources/secrets/provider_secrets/providers.py +92 -0
  57. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  58. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  59. kubetorch/resources/secrets/secret.py +224 -0
  60. kubetorch/resources/secrets/secret_factory.py +64 -0
  61. kubetorch/resources/secrets/utils.py +222 -0
  62. kubetorch/resources/volumes/__init__.py +0 -0
  63. kubetorch/resources/volumes/volume.py +340 -0
  64. kubetorch/servers/__init__.py +0 -0
  65. kubetorch/servers/http/__init__.py +0 -0
  66. kubetorch/servers/http/distributed_utils.py +2968 -0
  67. kubetorch/servers/http/http_client.py +802 -0
  68. kubetorch/servers/http/http_server.py +1622 -0
  69. kubetorch/servers/http/server_metrics.py +255 -0
  70. kubetorch/servers/http/utils.py +722 -0
  71. kubetorch/serving/__init__.py +0 -0
  72. kubetorch/serving/autoscaling.py +153 -0
  73. kubetorch/serving/base_service_manager.py +344 -0
  74. kubetorch/serving/constants.py +77 -0
  75. kubetorch/serving/deployment_service_manager.py +431 -0
  76. kubetorch/serving/knative_service_manager.py +487 -0
  77. kubetorch/serving/raycluster_service_manager.py +526 -0
  78. kubetorch/serving/service_manager.py +18 -0
  79. kubetorch/serving/templates/deployment_template.yaml +17 -0
  80. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  81. kubetorch/serving/templates/kt_setup_template.sh.j2 +91 -0
  82. kubetorch/serving/templates/pod_template.yaml +198 -0
  83. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  84. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  85. kubetorch/serving/templates/service_template.yaml +21 -0
  86. kubetorch/serving/templates/workerset_template.yaml +36 -0
  87. kubetorch/serving/utils.py +344 -0
  88. kubetorch/utils.py +263 -0
  89. kubetorch-0.2.5.dist-info/METADATA +75 -0
  90. kubetorch-0.2.5.dist-info/RECORD +92 -0
  91. kubetorch-0.2.5.dist-info/WHEEL +4 -0
  92. kubetorch-0.2.5.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,255 @@
1
+ import asyncio
2
+ import logging
3
+ import logging.config
4
+ import os
5
+ from datetime import datetime
6
+ from typing import Optional
7
+
8
+ from fastapi import FastAPI, Request
9
+ from fastapi.responses import Response
10
+
11
+ try:
12
+ from utils import ensure_structured_logging, LOG_CONFIG
13
+ except ImportError:
14
+ from .utils import ensure_structured_logging, LOG_CONFIG
15
+
16
+
17
# Divisor applied to the inactivity TTL to derive how often heartbeats are recorded.
HEARTBEAT_INTERVAL_DIVISOR = 5

# Set up our structured JSON logging
logging.config.dictConfig(LOG_CONFIG)
ensure_structured_logging()

logger = logging.getLogger(__name__)
# Set log level based on environment variable
log_level = os.getenv("KT_LOG_LEVEL")
if log_level:
    log_level = log_level.upper()
    # Only accept attributes of `logging` that are real numeric levels. Other
    # upper-case attributes (e.g. BASIC_FORMAT) are strings, and passing them
    # to setLevel() would raise ValueError while this module is being imported.
    _resolved_level = getattr(logging, log_level, None)
    logger.setLevel(_resolved_level if isinstance(_resolved_level, int) else logging.INFO)
29
+
30
+
31
def get_inactivity_ttl_annotation() -> Optional[int]:
    """
    Look up the inactivity TTL configured for this pod.

    The value is read from the ``KT_INACTIVITY_TTL`` environment variable
    (presumably populated from a pod annotation via the downward API - confirm
    against the pod template) and converted with ``parse_ttl_string``.

    Returns:
        The TTL in seconds, or None when the variable is unset or empty, when
        it cannot be parsed, or when an unexpected error occurs (the error is
        logged, never raised).
    """
    try:
        raw_ttl = os.getenv("KT_INACTIVITY_TTL")
        return parse_ttl_string(raw_ttl) if raw_ttl else None
    except Exception as e:
        logger.error(f"Error getting pod TTL annotation: {e}")
        return None
47
+
48
+
49
def parse_ttl_string(ttl_str: str) -> Optional[int]:
    """Parse a TTL string to seconds.

    Supported formats: a bare number of seconds (``300``) or one or more
    ``<number><unit>`` components with unit ``d``, ``h``, ``m`` or ``s``
    (``5m``, ``1h``, ``1h30m``, ``1d``). Components may be separated by
    whitespace, and matching is case-insensitive.

    Args:
        ttl_str: The raw TTL string.

    Returns:
        The total number of seconds. None is returned when the string is not
        entirely made of valid components (e.g. ``1.5h``, ``-5m``, ``abc``) or
        when the components add up to zero. A bare ``0`` is returned as 0.
    """
    import re

    ttl_str = ttl_str.strip().lower()

    # If it's just a number, assume seconds. isdecimal() (unlike isdigit())
    # only accepts characters that int() can actually convert.
    if ttl_str.isdecimal():
        return int(ttl_str)

    # Validate the whole string first: a plain findall() would silently skip
    # unrecognised text and turn e.g. "1.5h" into 5 hours.
    if not re.fullmatch(r"(?:\d+[dhms]\s*)+", ttl_str):
        return None

    seconds_per_unit = {"d": 24 * 3600, "h": 3600, "m": 60, "s": 1}
    total_seconds = sum(
        int(value) * seconds_per_unit[unit] for value, unit in re.findall(r"(\d+)([dhms])", ttl_str)
    )

    return total_seconds if total_seconds > 0 else None
77
+
78
+
79
class HeartbeatManager:
    """Track request activity and record heartbeat metrics for a service.

    While requests are in flight (or finished within the last heartbeat
    interval), a background task periodically increments the
    ``kt_heartbeat_sent`` Prometheus counter. The number of in-flight requests
    is mirrored in the ``http_server_active_requests`` gauge.

    If ``prometheus_client`` is not installed the manager is still fully
    constructed, but metrics are disabled: ``heartbeat_counter`` and
    ``active_requests_gauge`` are None, ``start()`` does not launch the
    background task, and request tracking only updates in-memory state.
    """

    def __init__(self, ttl_seconds: int):
        """
        Args:
            ttl_seconds: Inactivity TTL in seconds; heartbeats are recorded
                every ``ttl_seconds // HEARTBEAT_INTERVAL_DIVISOR`` seconds
                (at least once per second).
        """
        # Initialise all state up front so the instance is usable even when
        # the optional prometheus dependency is missing.
        self.ttl_seconds = ttl_seconds
        # Clamp to >= 1: a zero interval would turn the heartbeat loop into a busy spin.
        self.heartbeat_interval = max(1, self.ttl_seconds // HEARTBEAT_INTERVAL_DIVISOR)
        self.active_requests = 0
        self.last_activity = datetime.now()
        self.heartbeat_task: Optional[asyncio.Task] = None
        # NOTE(review): setup_otel_metrics reads KT_SERVICE_NAME while this class reads
        # KT_SERVICE; fall back to the former so both report the same name - confirm
        # which variable is canonical.
        self.service_name = os.getenv("KT_SERVICE", os.getenv("KT_SERVICE_NAME", "unknown-service"))
        self.kubetorch_version = os.getenv("KUBETORCH_VERSION", "0.0.0")
        self.service_namespace = os.getenv("POD_NAMESPACE", "default")
        self.service_type = os.getenv("KT_DEPLOYMENT_MODE", "deployment")
        self.heartbeat_counter = None
        self.active_requests_gauge = None

        try:
            from prometheus_client import Counter, Gauge
        except ImportError:
            logger.info("Prometheus client not installed, heartbeat metrics not enabled")
            return

        label_names = ["service_name", "kubetorch_version", "service_namespace", "service_type"]
        self.heartbeat_counter = Counter(
            "kt_heartbeat_sent",
            "Total heartbeats sent",
            label_names,
        )
        self.active_requests_gauge = Gauge(
            "http_server_active_requests",
            "Number of currently active requests",
            label_names,
        )

        logger.info(f"Heartbeat Manager initialized: TTL={self.ttl_seconds}s, Interval={self.heartbeat_interval}s")

    @property
    def labels(self):
        """Label values attached to every metric sample emitted by this manager."""
        return {
            "service_name": self.service_name,
            "kubetorch_version": self.kubetorch_version,
            "service_namespace": self.service_namespace,
            "service_type": self.service_type,
        }

    async def start(self):
        """Start the heartbeat manager (no-op when metrics are disabled)"""
        if self.heartbeat_counter is None:
            # Heartbeats only exist as metrics; without them the loop has nothing to do.
            logger.info("Heartbeat not started - heartbeat metrics not enabled")
            return
        self.heartbeat_task = asyncio.create_task(self._heartbeat_loop())
        logger.info("Heartbeat started - tracking activity metrics")

    async def stop(self):
        """Stop the heartbeat manager and wait for the background task to exit"""
        if self.heartbeat_task:
            self.heartbeat_task.cancel()
            try:
                await self.heartbeat_task
            except asyncio.CancelledError:
                pass

    def request_started(self):
        """Called when a request starts"""
        self.active_requests += 1
        if self.active_requests_gauge is not None:
            self.active_requests_gauge.labels(**self.labels).inc()
        self.last_activity = datetime.now()

    def request_finished(self):
        """Called when a request finishes"""
        # Only decrement when something is actually in flight so the gauge
        # cannot drift below the (clamped) in-memory counter.
        if self.active_requests > 0:
            self.active_requests -= 1
            if self.active_requests_gauge is not None:
                self.active_requests_gauge.labels(**self.labels).dec()
        self.last_activity = datetime.now()

    async def _send_heartbeat(self):
        """Record heartbeat activity in metrics"""
        if self.heartbeat_counter is None:
            return
        self.heartbeat_counter.labels(**self.labels).inc()
        logger.debug("Heartbeat recorded - counter incremented")

    async def _heartbeat_loop(self):
        """Main heartbeat loop: sleep one interval, then record a heartbeat if there was activity"""
        while True:
            try:
                await asyncio.sleep(self.heartbeat_interval)

                # Send while requests are in flight, or when the last activity
                # happened within the interval that just elapsed.
                if self.active_requests > 0:
                    await self._send_heartbeat()
                    continue

                time_since_activity = (datetime.now() - self.last_activity).total_seconds()
                if time_since_activity < self.heartbeat_interval:
                    # Recent activity, send heartbeat even if no current requests
                    await self._send_heartbeat()
                else:
                    logger.debug("Skipping heartbeat - no active requests or recent activity")

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in heartbeat loop: {e}")
172
+
173
+
174
def setup_otel_metrics(app: FastAPI):
    """Setup OpenTelemetry metrics with Prometheus export for FastAPI.

    Installs a global OpenTelemetry ``MeterProvider`` backed by a
    ``PrometheusMetricReader``, registers a ``GET /metrics`` route on ``app``,
    and adds HTTP middleware that reports request start/finish to
    ``app.state.heartbeat_manager`` when one is present.

    Args:
        app: The FastAPI application to instrument.

    Returns:
        A tuple ``(app, meter_provider)``. ``meter_provider`` is None when the
        OpenTelemetry / prometheus_client packages are not importable, in which
        case ``app`` is returned untouched.
    """
    try:
        from opentelemetry.exporter.prometheus import PrometheusMetricReader
        from opentelemetry.metrics import set_meter_provider
        from opentelemetry.sdk.metrics import MeterProvider
        from opentelemetry.sdk.resources import Resource
        from prometheus_client import CollectorRegistry, CONTENT_TYPE_LATEST, generate_latest, Info
    except ImportError as e:
        logger.debug(f"OpenTelemetry metrics not enabled: {e}")
        return app, None

    logger.info("Instrumenting FastAPI app for metrics")

    # Get service info from environment.
    # NOTE(review): HeartbeatManager labels its metrics with KT_SERVICE; it is used
    # here as a last-resort fallback so both report the same service name when only
    # that variable is set - confirm which variable is canonical.
    service_name = os.getenv(
        "KT_SERVICE_NAME",
        os.getenv("OTEL_SERVICE_NAME", os.getenv("KT_SERVICE", "unknown-service")),
    )
    service_version = os.getenv("KUBETORCH_VERSION", "0.0.0")
    namespace = os.getenv("POD_NAMESPACE", "default")
    service_type = os.getenv("KT_DEPLOYMENT_MODE", "deployment")

    # Create resource with service information
    resource = Resource.create(
        {
            "service.name": service_name,
            "service.version": service_version,
            "service.namespace": namespace,
            "deployment.environment": namespace,
            "service.type": service_type,
        }
    )

    # Setup Prometheus metric reader
    prometheus_reader = PrometheusMetricReader(
        disable_target_info=False,  # Explicitly enable target_info
    )

    # Create meter provider with the resource and prometheus reader, and make it global
    meter_provider = MeterProvider(resource=resource, metric_readers=[prometheus_reader])
    set_meter_provider(meter_provider)

    # Paths whose requests must not count as service activity.
    untracked_paths = ("/metrics", "/health")

    # Add metrics endpoint
    @app.get("/metrics")
    async def get_metrics(request: Request):
        """Expose Prometheus-formatted OpenTelemetry metrics"""
        # Get all metrics from default registry (includes our heartbeat metrics)
        base_metrics = generate_latest().decode("utf-8")
        additional_metrics = ""

        manager = getattr(request.app.state, "heartbeat_manager", None)
        if manager:
            # Heartbeat configuration is published through a throwaway registry so the
            # Info metric is rebuilt per scrape instead of being registered globally.
            registry = CollectorRegistry()
            heartbeat_info = Info("heartbeat", "Heartbeat configuration info", registry=registry)
            heartbeat_info.info(
                {
                    "ttl_seconds": str(manager.ttl_seconds),
                    "interval_seconds": str(manager.heartbeat_interval),
                }
            )
            additional_metrics = generate_latest(registry).decode("utf-8")

        return Response(content=base_metrics + additional_metrics, media_type=CONTENT_TYPE_LATEST)

    # Add middleware to track active requests
    @app.middleware("http")
    async def track_requests(request: Request, call_next):
        """Middleware to track active requests for heartbeat"""
        manager = getattr(request.app.state, "heartbeat_manager", None)
        if not manager or request.url.path in untracked_paths:
            return await call_next(request)

        manager.request_started()
        try:
            return await call_next(request)
        finally:
            # Always balance request_started(), even if the handler raised.
            manager.request_finished()

    logger.info(f"OpenTelemetry metrics enabled for service: {service_name}")
    return app, meter_provider