dory-processor-sdk 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dory/__init__.py +101 -0
- dory/auth/__init__.py +10 -0
- dory/auth/oauth2.py +153 -0
- dory/auto_instrument.py +142 -0
- dory/cli/__init__.py +5 -0
- dory/cli/main.py +137 -0
- dory/cli/templates.py +123 -0
- dory/config/__init__.py +23 -0
- dory/config/defaults.py +24 -0
- dory/config/loader.py +430 -0
- dory/config/presets.py +73 -0
- dory/config/schema.py +84 -0
- dory/core/__init__.py +27 -0
- dory/core/app.py +434 -0
- dory/core/context.py +209 -0
- dory/core/lifecycle.py +214 -0
- dory/core/meta.py +121 -0
- dory/core/modes.py +479 -0
- dory/core/processor.py +564 -0
- dory/core/signals.py +122 -0
- dory/decorators.py +142 -0
- dory/edge/__init__.py +88 -0
- dory/edge/adaptive.py +644 -0
- dory/edge/detector.py +546 -0
- dory/edge/fencing.py +488 -0
- dory/edge/heartbeat.py +598 -0
- dory/edge/role.py +419 -0
- dory/errors/__init__.py +139 -0
- dory/errors/classification.py +362 -0
- dory/errors/codes.py +498 -0
- dory/geo/__init__.py +40 -0
- dory/geo/geolocalizer.py +1034 -0
- dory/health/__init__.py +12 -0
- dory/health/probes.py +210 -0
- dory/health/server.py +635 -0
- dory/k8s/__init__.py +80 -0
- dory/k8s/annotation_watcher.py +184 -0
- dory/k8s/client.py +251 -0
- dory/k8s/labels.py +505 -0
- dory/k8s/pod_metadata.py +182 -0
- dory/logging/__init__.py +9 -0
- dory/logging/logger.py +148 -0
- dory/metrics/__init__.py +7 -0
- dory/metrics/collector.py +301 -0
- dory/middleware/__init__.py +46 -0
- dory/middleware/connection_tracker.py +608 -0
- dory/middleware/request_id.py +325 -0
- dory/middleware/request_tracker.py +511 -0
- dory/migration/__init__.py +33 -0
- dory/migration/configmap.py +232 -0
- dory/migration/s3_store.py +594 -0
- dory/migration/serialization.py +135 -0
- dory/migration/state_manager.py +286 -0
- dory/migration/transfer.py +382 -0
- dory/monitoring/__init__.py +29 -0
- dory/monitoring/opentelemetry.py +489 -0
- dory/output/__init__.py +31 -0
- dory/output/envelope.py +137 -0
- dory/output/formatter.py +113 -0
- dory/output/rabbitmq.py +632 -0
- dory/output/routing.py +318 -0
- dory/output/validator.py +199 -0
- dory/py.typed +2 -0
- dory/recovery/__init__.py +60 -0
- dory/recovery/golden_image.py +487 -0
- dory/recovery/golden_snapshot.py +713 -0
- dory/recovery/golden_validator.py +518 -0
- dory/recovery/partial_recovery.py +482 -0
- dory/recovery/recovery_decision.py +242 -0
- dory/recovery/restart_detector.py +142 -0
- dory/recovery/state_validator.py +183 -0
- dory/resilience/__init__.py +45 -0
- dory/resilience/circuit_breaker.py +457 -0
- dory/resilience/retry.py +389 -0
- dory/simple.py +342 -0
- dory/types.py +68 -0
- dory/utils/__init__.py +31 -0
- dory/utils/errors.py +59 -0
- dory/utils/retry.py +115 -0
- dory/utils/timeout.py +80 -0
- dory_processor_sdk-0.0.1.dist-info/METADATA +424 -0
- dory_processor_sdk-0.0.1.dist-info/RECORD +86 -0
- dory_processor_sdk-0.0.1.dist-info/WHEEL +5 -0
- dory_processor_sdk-0.0.1.dist-info/entry_points.txt +2 -0
- dory_processor_sdk-0.0.1.dist-info/licenses/LICENSE +201 -0
- dory_processor_sdk-0.0.1.dist-info/top_level.txt +1 -0
dory/health/server.py
ADDED
|
@@ -0,0 +1,635 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Health and metrics HTTP server.
|
|
3
|
+
|
|
4
|
+
Provides endpoints for:
|
|
5
|
+
- /health - Liveness probe
|
|
6
|
+
- /ready - Readiness probe (matches Kubernetes convention)
|
|
7
|
+
- /metrics - Prometheus metrics
|
|
8
|
+
- /state - State transfer (GET/POST) for pod migration (authenticated, with timeout)
|
|
9
|
+
- /prestop - PreStop hook handler for graceful shutdown
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
import secrets
|
|
16
|
+
import time
|
|
17
|
+
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
|
|
18
|
+
from typing import TYPE_CHECKING, Callable, Awaitable
|
|
19
|
+
|
|
20
|
+
from aiohttp import web
|
|
21
|
+
|
|
22
|
+
import dory
|
|
23
|
+
from dory.health.probes import LivenessProbe, ReadinessProbe
|
|
24
|
+
from dory.utils.errors import DoryHealthError
|
|
25
|
+
from dory.migration.transfer import (
|
|
26
|
+
TransferConfig,
|
|
27
|
+
validate_state_size,
|
|
28
|
+
StateSizeExceeded,
|
|
29
|
+
StateTransferTimeout,
|
|
30
|
+
log_transfer_summary,
|
|
31
|
+
TransferMetrics,
|
|
32
|
+
ORCHESTRATOR_STATE_TIMEOUT_SEC,
|
|
33
|
+
ORCHESTRATOR_MAX_STATE_SIZE,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
from dory.metrics.collector import MetricsCollector
|
|
38
|
+
|
|
39
|
+
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# SDK detection header — must match orchestrator's sdkcheck.SDKVersionHeader
SDK_VERSION_HEADER = "X-Dory-SDK-Version"

# Type aliases for callbacks
# Synchronous zero-argument callable returning the state dict; the GET /state
# handler runs it in a worker thread.
StateGetter = Callable[[], dict]
# Coroutine function awaited by the POST /state handler with the decoded state.
StateRestorer = Callable[[dict], Awaitable[None]]
# Coroutine function awaited by the /prestop handler with no arguments.
PreStopHandler = Callable[[], Awaitable[None]]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class HealthServer:
|
|
51
|
+
"""
|
|
52
|
+
HTTP server for health probes, metrics, and state transfer.
|
|
53
|
+
|
|
54
|
+
Runs on a separate port from the main application.
|
|
55
|
+
Provides endpoints required by Dory Orchestrator for:
|
|
56
|
+
- Health probes (liveness/readiness)
|
|
57
|
+
- Prometheus metrics
|
|
58
|
+
- State transfer during pod migration
|
|
59
|
+
- PreStop hook for graceful shutdown
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
# Fixed endpoint paths (not configurable)
|
|
63
|
+
HEALTH_PATH = "/health"
|
|
64
|
+
READY_PATH = "/ready"
|
|
65
|
+
METRICS_PATH = "/metrics"
|
|
66
|
+
|
|
67
|
+
def __init__(
    self,
    port: int = 8080,
    metrics_collector: "MetricsCollector | None" = None,
    state_getter: StateGetter | None = None,
    state_restorer: StateRestorer | None = None,
    prestop_handler: PreStopHandler | None = None,
    state_token: str | None = None,
    transfer_config: TransferConfig | None = None,
):
    """
    Build a health server; nothing is bound until ``start()`` is awaited.

    Args:
        port: Port to listen on. ``0`` makes ``start()`` pick a free port.
        metrics_collector: Source of the /metrics payload, if any.
        state_getter: Callback producing the state served by GET /state.
        state_restorer: Callback receiving the state sent to POST /state.
        prestop_handler: Callback awaited by the /prestop endpoint.
        state_token: Bearer token protecting the /state endpoints. A falsy
            value falls back to the DORY_STATE_TOKEN environment variable;
            with neither set the endpoints are unauthenticated and a
            warning is logged.
        transfer_config: Timeouts and size limits for state transfer. A
            falsy value is replaced by a default ``TransferConfig()``.
    """
    # Listening port plus the optional collaborators and callbacks.
    self._port = port
    self._metrics_collector = metrics_collector
    self._state_getter = state_getter
    self._state_restorer = state_restorer
    self._prestop_handler = prestop_handler

    # Explicit token wins; otherwise consult the environment.
    if state_token:
        self._state_token = state_token
    else:
        self._state_token = os.environ.get("DORY_STATE_TOKEN")
    if not self._state_token:
        logger.warning(
            "DORY_STATE_TOKEN not configured - state endpoints are unauthenticated. "
            "Set DORY_STATE_TOKEN environment variable for production deployments."
        )

    # Transfer limits, logged next to the Orchestrator-side constants.
    if transfer_config:
        self._transfer_config = transfer_config
    else:
        self._transfer_config = TransferConfig()
    logger.info(
        f"State transfer configured: capture_timeout={self._transfer_config.capture_timeout_sec}s "
        f"(Orchestrator: {ORCHESTRATOR_STATE_TIMEOUT_SEC}s), "
        f"max_size={self._transfer_config.max_size_bytes:,}B "
        f"(Orchestrator: {ORCHESTRATOR_MAX_STATE_SIZE:,}B)"
    )

    # Worker threads used to run the synchronous state getter.
    self._executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="state-capture")

    self._liveness = LivenessProbe()
    self._readiness = ReadinessProbe()

    # aiohttp objects are created lazily by start().
    self._app: web.Application | None = None
    self._runner: web.AppRunner | None = None
    self._site: web.TCPSite | None = None
|
|
124
|
+
|
|
125
|
+
@property
def liveness_probe(self) -> LivenessProbe:
    """Liveness probe instance, exposed so callers can add custom checks."""
    probe = self._liveness
    return probe
|
|
129
|
+
|
|
130
|
+
@property
def readiness_probe(self) -> ReadinessProbe:
    """Readiness probe instance, exposed so callers can add custom checks."""
    probe = self._readiness
    return probe
|
|
134
|
+
|
|
135
|
+
def mark_ready(self) -> None:
    """Flag the application as ready by delegating to the readiness probe."""
    readiness = self._readiness
    readiness.mark_ready()
|
|
138
|
+
|
|
139
|
+
def mark_not_ready(self) -> None:
    """Flag the application as not ready by delegating to the readiness probe."""
    readiness = self._readiness
    readiness.mark_not_ready()
|
|
142
|
+
|
|
143
|
+
def set_state_getter(self, getter: StateGetter) -> None:
    """Install (or replace) the callback that GET /state uses to obtain state."""
    self._state_getter = getter
    return None
|
|
146
|
+
|
|
147
|
+
def set_state_restorer(self, restorer: StateRestorer) -> None:
    """Install (or replace) the callback that POST /state uses to restore state."""
    self._state_restorer = restorer
    return None
|
|
150
|
+
|
|
151
|
+
def set_prestop_handler(self, handler: PreStopHandler) -> None:
    """Install (or replace) the callback awaited by the /prestop endpoint."""
    self._prestop_handler = handler
    return None
|
|
154
|
+
|
|
155
|
+
def set_state_token(self, token: str) -> None:
    """Replace the bearer token that the /state endpoints check against."""
    self._state_token = token
    return None
|
|
158
|
+
|
|
159
|
+
@property
def port(self) -> int:
    """Port number currently recorded for the server.

    This is the constructor value until ``start()`` replaces a ``0`` with
    the port it selected.
    """
    current = self._port
    return current
|
|
163
|
+
|
|
164
|
+
def _find_available_port(self, start: int = 8080, end: int = 9000) -> int:
    """
    Probe ports one by one and return the first that can be bound.

    Args:
        start: First port to try.
        end: Upper bound of the scan; ``end`` itself is not tried.

    Returns:
        The first port in ``range(start, end)`` that bound on 0.0.0.0.

    Raises:
        DoryHealthError: If every candidate port failed to bind.
    """
    import socket

    for port in range(start, end):
        try:
            probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            # The context manager closes the probe socket again, so the
            # port is only known to have been free at this instant.
            with probe:
                probe.bind(("0.0.0.0", port))
                logger.debug(f"Found available port: {port}")
                return port
        except OSError:
            # Port busy or not bindable: move on to the next candidate.
            continue

    raise DoryHealthError(f"No available port found in range {start}-{end}")
|
|
190
|
+
|
|
191
|
+
def _verify_state_auth(self, request: web.Request) -> tuple[bool, str]:
    """
    Verify authentication for state endpoints.

    Expects an ``Authorization: Bearer <token>`` header and compares the
    token against the configured one in constant time.

    Args:
        request: The incoming HTTP request.

    Returns:
        Tuple of (is_authenticated, error_message). The message is empty
        when authentication succeeds or no token is configured.
    """
    expected_token = self._state_token

    # If no token configured, allow unauthenticated access (warning is
    # logged when the server is constructed).
    if not expected_token:
        return True, ""

    auth_header = request.headers.get("Authorization", "")
    if not auth_header:
        return False, "Missing Authorization header"

    # Expected format: "Bearer <token>"
    scheme = "Bearer "
    if not auth_header.startswith(scheme):
        return False, "Invalid Authorization header format (expected 'Bearer <token>')"

    provided_token = auth_header[len(scheme):]

    # compare_digest() only accepts ASCII-only str and raises TypeError
    # otherwise, which would turn a malformed header into a 500. Compare
    # the encoded bytes instead; "surrogatepass" keeps the encode from
    # failing on surrogate code points in header values.
    tokens_match = secrets.compare_digest(
        provided_token.encode("utf-8", "surrogatepass"),
        expected_token.encode("utf-8", "surrogatepass"),
    )
    if not tokens_match:
        return False, "Invalid authentication token"

    return True, ""
|
|
224
|
+
|
|
225
|
+
async def start(self) -> None:
    """
    Start the health server.

    If port is 0, automatically finds an available port.

    The application's request-body limit is raised to cover the configured
    state size, because aiohttp's default limit is 1 MiB and would reject
    larger POST /state bodies inside ``request.read()``.

    Raises:
        DoryHealthError: If server fails to start. Any runner created
            during the failed attempt is cleaned up first.
    """
    try:
        # +1 so a body of exactly max_size_bytes is readable whether
        # aiohttp compares with > or >=; never go below aiohttp's default
        # so smaller configured limits stay enforced by the handler.
        body_limit = max(1024 ** 2, self._transfer_config.max_size_bytes + 1)
        self._app = web.Application(client_max_size=body_limit)
        self._setup_routes()

        self._runner = web.AppRunner(self._app)
        await self._runner.setup()

        # If port is 0, find an available port (development mode)
        port = self._port
        if port == 0:
            # Warn if running in Kubernetes with auto-port selection
            if os.environ.get("KUBERNETES_SERVICE_HOST") or os.environ.get("DORY_POD_NAME"):
                logger.warning(
                    "Auto-port selection (port=0) is not recommended in Kubernetes. "
                    "K8s probes require a fixed port. Set DORY_HEALTH_PORT or use production preset."
                )
            # NOTE(review): the port is probed and released before TCPSite
            # binds it, so another process could take it in between.
            port = self._find_available_port()
            self._port = port

        self._site = web.TCPSite(
            self._runner,
            host="0.0.0.0",
            port=port,
        )
        await self._site.start()

        logger.info(f"Health server started on port {self._port}")

    except Exception as e:
        # Release whatever the failed attempt created so a retry does not
        # leak the runner.
        runner, self._runner = self._runner, None
        self._site = None
        self._app = None
        if runner is not None:
            try:
                await runner.cleanup()
            except Exception:
                logger.debug("Runner cleanup after failed start also failed", exc_info=True)
        raise DoryHealthError(f"Failed to start health server: {e}", cause=e) from e
|
|
264
|
+
|
|
265
|
+
async def stop(self) -> None:
    """
    Stop the health server.

    Cleans up the aiohttp runner (if one was started) and shuts down the
    state-capture executor without waiting for running tasks. The runner
    and site handles are cleared, so a repeated call does not clean up the
    same runner twice.
    """
    runner, self._runner = self._runner, None
    self._site = None

    try:
        if runner:
            await runner.cleanup()
    finally:
        # Shut the executor down even when runner cleanup raises.
        self._executor.shutdown(wait=False)

    logger.info("Health server stopped")
|
|
274
|
+
|
|
275
|
+
def _setup_routes(self) -> None:
    """Register every endpoint handler on the application's router."""
    router = self._app.router

    # (registration function, path, handler), in registration order.
    registrations = (
        # Probes and metrics
        (router.add_get, self.HEALTH_PATH, self._handle_health),
        (router.add_get, self.READY_PATH, self._handle_ready),
        (router.add_get, self.METRICS_PATH, self._handle_metrics),
        # State transfer endpoints (required by Dory Orchestrator)
        (router.add_get, "/state", self._handle_state_get),
        (router.add_post, "/state", self._handle_state_post),
        # PreStop hook endpoint (required by Dory Orchestrator)
        (router.add_get, "/prestop", self._handle_prestop),
        # Root endpoint for basic info
        (router.add_get, "/", self._handle_root),
    )
    for register, path, handler in registrations:
        register(path, handler)
|
|
290
|
+
|
|
291
|
+
async def _handle_root(self, request: web.Request) -> web.Response:
    """Serve basic service info: name, SDK version and the endpoint list.

    The SDK version is also sent in the X-Dory-SDK-Version response header.
    """
    endpoint_paths = [
        self.HEALTH_PATH,
        self.READY_PATH,
        self.METRICS_PATH,
        "/state",
        "/prestop",
    ]
    payload = {
        "service": "dory-processor",
        "sdk_version": dory.__version__,
        "endpoints": endpoint_paths,
    }
    version_header = {SDK_VERSION_HEADER: dory.__version__}
    return web.json_response(payload, headers=version_header)
|
|
307
|
+
|
|
308
|
+
async def _handle_health(self, request: web.Request) -> web.Response:
    """
    Serve the liveness probe.

    Responds 200 when the liveness check reports healthy and 503 otherwise;
    the body is the check result as JSON. The X-Dory-SDK-Version header is
    attached for orchestrator SDK detection.
    """
    outcome = await self._liveness.check()

    if outcome.healthy:
        status_code = 200
    else:
        status_code = 503

    body = outcome.to_dict()
    version_header = {SDK_VERSION_HEADER: dory.__version__}
    return web.json_response(body, status=status_code, headers=version_header)
|
|
323
|
+
|
|
324
|
+
async def _handle_ready(self, request: web.Request) -> web.Response:
    """
    Serve the readiness probe.

    Responds 200 when the readiness check reports healthy and 503
    otherwise; the body is the check result as JSON.
    """
    outcome = await self._readiness.check()

    if outcome.healthy:
        status_code = 200
    else:
        status_code = 503

    body = outcome.to_dict()
    return web.json_response(body, status=status_code)
|
|
334
|
+
|
|
335
|
+
async def _handle_metrics(self, request: web.Request) -> web.Response:
    """
    Serve metrics in Prometheus text format.

    Without a collector a placeholder comment is returned with status 200.
    If exporting fails, the error is logged and returned as a comment line
    with status 500.
    """
    collector = self._metrics_collector
    if collector is None:
        return web.Response(
            text="# No metrics collector configured\n",
            content_type="text/plain",
        )

    try:
        # Building the response stays inside the try so a bad export value
        # is reported the same way as a failing export call.
        response = web.Response(
            text=collector.export_prometheus(),
            content_type="text/plain; version=0.0.4",
            charset="utf-8",
        )
    except Exception as e:
        logger.error(f"Error exporting metrics: {e}")
        response = web.Response(
            text=f"# Error exporting metrics: {e}\n",
            content_type="text/plain",
            status=500,
        )
    return response
|
|
361
|
+
|
|
362
|
+
async def _handle_state_get(self, request: web.Request) -> web.Response:
    """
    Handle GET /state for state capture during migration.

    Runs the synchronous state getter in the worker pool and bounds the
    wait with ``capture_timeout_sec``, so a hung getter yields a 504
    instead of blocking the request indefinitely. The worker thread itself
    cannot be interrupted and keeps running after a timeout.

    Requires authentication via Bearer token when a state token is
    configured.

    Returns:
        200 with the state as JSON on success; 401 on failed
        authentication; 503 when no state getter is configured; 504 when
        capture exceeds the timeout; 413 when the serialized state exceeds
        the size limit; 500 on any other failure.
    """
    # Function-scope import: asyncio is not imported at module level.
    import asyncio

    # Verify authentication
    is_authenticated, auth_error = self._verify_state_auth(request)
    if not is_authenticated:
        logger.warning(f"State GET authentication failed: {auth_error}")
        return web.json_response({"error": auth_error}, status=401)

    if self._state_getter is None:
        logger.warning("State getter not configured, returning empty state")
        return web.json_response(
            {"error": "state_getter not configured", "data": {}},
            status=503,
        )

    config = self._transfer_config
    start_time = time.monotonic()

    def record(size_bytes, *, timed_out=False, size_exceeded=False):
        """Build, log and return the TransferMetrics for this attempt."""
        metrics = TransferMetrics(
            duration_sec=time.monotonic() - start_time,
            size_bytes=size_bytes,
            size_ratio=size_bytes / config.max_size_bytes if size_bytes else 0,
            timed_out=timed_out,
            size_exceeded=size_exceeded,
        )
        log_transfer_summary("capture", metrics, config)
        return metrics

    try:
        loop = asyncio.get_running_loop()
        try:
            # wait_for enforces the deadline; run_in_executor alone waits
            # for the getter however long it takes.
            state = await asyncio.wait_for(
                loop.run_in_executor(self._executor, self._state_getter),
                timeout=config.capture_timeout_sec,
            )
        except (asyncio.TimeoutError, FuturesTimeoutError):
            record(0, timed_out=True)
            return web.json_response(
                {
                    "error": f"State capture timed out after {config.capture_timeout_sec}s",
                    "timeout": True,
                },
                status=504,
            )

        # Guard against finishing just past the deadline (e.g. the event
        # loop was busy when the result arrived).
        capture_duration = time.monotonic() - start_time
        if capture_duration > config.capture_timeout_sec:
            record(0, timed_out=True)
            return web.json_response(
                {
                    "error": f"State capture timed out after {capture_duration:.1f}s "
                             f"(limit: {config.capture_timeout_sec}s)",
                    "timeout": True,
                },
                status=504,  # Gateway Timeout
            )

        # Serialize once; the same text is validated and sent.
        state_json = json.dumps(state)
        size_bytes = len(state_json.encode("utf-8"))

        try:
            validate_state_size(
                state_json,
                max_size=config.max_size_bytes,
                warn_threshold=config.size_warn_threshold,
            )
        except StateSizeExceeded as e:
            record(size_bytes, size_exceeded=True)
            return web.json_response(
                {
                    "error": str(e),
                    "size_exceeded": True,
                    "size_bytes": size_bytes,
                    "max_bytes": config.max_size_bytes,
                },
                status=413,  # Payload Too Large
            )

        # Success - log metrics
        metrics = record(size_bytes)
        logger.info(
            "State captured for transfer",
            extra={
                "state_keys": list(state.keys()),
                "size_bytes": size_bytes,
                "duration_sec": metrics.duration_sec,
            },
        )
        # Same body json_response(state) would produce, without a second
        # json.dumps pass over a potentially large state.
        return web.Response(text=state_json, content_type="application/json")

    except Exception as e:
        logger.error(f"Failed to capture state: {e}", exc_info=True)
        return web.json_response(
            {"error": f"Failed to capture state: {e}"},
            status=500,
        )
|
|
496
|
+
|
|
497
|
+
async def _handle_state_post(self, request: web.Request) -> web.Response:
    """
    Handle POST /state for state restoration during migration.

    Reads a JSON object from the request body and awaits the configured
    state restorer with it, bounded by ``restore_timeout_sec``.

    Requires authentication via Bearer token when a state token is
    configured.

    Returns:
        200 with timing and size details on success; 401 on failed
        authentication; 503 when no state restorer is configured; 413 when
        the body is too large; 400 when the body is not UTF-8 JSON or not
        a JSON object; 504 when the restore times out; 500 on any other
        failure.
    """
    import asyncio

    # Verify authentication
    is_authenticated, auth_error = self._verify_state_auth(request)
    if not is_authenticated:
        logger.warning(f"State POST authentication failed: {auth_error}")
        return web.json_response({"error": auth_error}, status=401)

    if self._state_restorer is None:
        logger.warning("State restorer not configured")
        return web.json_response(
            {"error": "state_restorer not configured"},
            status=503,
        )

    config = self._transfer_config
    start_time = time.monotonic()

    def too_large(size_bytes: int) -> web.Response:
        """Log and build the 413 response for an oversized state body."""
        logger.error(
            f"Incoming state size ({size_bytes:,}B) exceeds maximum "
            f"({config.max_size_bytes:,}B)"
        )
        return web.json_response(
            {
                "error": f"State size ({size_bytes:,} bytes) exceeds maximum "
                         f"({config.max_size_bytes:,} bytes)",
                "size_exceeded": True,
            },
            status=413,
        )

    try:
        # Reject on the declared length before buffering the body.
        declared_size = request.content_length
        if declared_size is not None and declared_size > config.max_size_bytes:
            return too_large(declared_size)

        # Read request body. aiohttp enforces its own body limit here and
        # signals it with an HTTP exception, which is a client error, not
        # a server failure.
        try:
            body = await request.read()
        except web.HTTPRequestEntityTooLarge:
            logger.error("Incoming state rejected: request body exceeds the server body limit")
            return web.json_response(
                {
                    "error": "State payload exceeds the maximum request body size "
                             "accepted by the server",
                    "size_exceeded": True,
                },
                status=413,
            )
        size_bytes = len(body)

        # Validate incoming state size
        if size_bytes > config.max_size_bytes:
            return too_large(size_bytes)

        state = json.loads(body.decode("utf-8"))
        if not isinstance(state, dict):
            # The restorer contract takes a dict; anything else is a bad
            # request rather than an internal error.
            logger.error(
                f"Invalid state in restore request: expected JSON object, got {type(state).__name__}"
            )
            return web.json_response(
                {"error": "Invalid state: expected a JSON object"},
                status=400,
            )

        logger.info(
            "Restoring state from transfer",
            extra={"state_keys": list(state.keys()), "size_bytes": size_bytes},
        )

        # Execute restore with timeout
        try:
            await asyncio.wait_for(
                self._state_restorer(state),
                timeout=config.restore_timeout_sec,
            )
        except asyncio.TimeoutError:
            duration = time.monotonic() - start_time
            logger.error(
                f"State restore timed out after {duration:.1f}s "
                f"(limit: {config.restore_timeout_sec}s)"
            )
            return web.json_response(
                {
                    "error": f"State restore timed out after {config.restore_timeout_sec}s",
                    "timeout": True,
                },
                status=504,
            )

        duration = time.monotonic() - start_time
        logger.info(
            f"State restored successfully in {duration:.2f}s",
            extra={"size_bytes": size_bytes, "duration_sec": duration},
        )

        # Warn if restore took significant time
        if duration > config.restore_timeout_sec * 0.5:
            logger.warning(
                f"State restore took {duration:.1f}s "
                f"({duration/config.restore_timeout_sec:.0%} of "
                f"{config.restore_timeout_sec}s timeout)"
            )

        return web.json_response({
            "status": "ok",
            "message": "State restored",
            "duration_sec": round(duration, 3),
            "size_bytes": size_bytes,
        })

    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON in state restore request: {e}")
        return web.json_response(
            {"error": f"Invalid JSON: {e}"},
            status=400,
        )
    except UnicodeDecodeError as e:
        # Body was not valid UTF-8: also a malformed request.
        logger.error(f"Invalid encoding in state restore request: {e}")
        return web.json_response(
            {"error": f"Invalid JSON: body is not valid UTF-8 ({e})"},
            status=400,
        )
    except Exception as e:
        logger.error(f"Failed to restore state: {e}", exc_info=True)
        return web.json_response(
            {"error": f"Failed to restore state: {e}"},
            status=500,
        )
|
|
608
|
+
|
|
609
|
+
async def _handle_prestop(self, request: web.Request) -> web.Response:
    """
    Handle GET /prestop, the PreStop hook endpoint.

    Marks the readiness probe as not ready, then awaits the configured
    PreStop handler if there is one. A failing handler is logged and
    otherwise ignored, so the response is always the same success payload.

    Returns:
        JSON response confirming prestop handling.
    """
    logger.info("PreStop hook invoked - preparing for shutdown")

    # Flip readiness first so the pod stops being offered new traffic.
    self._readiness.mark_not_ready()

    handler = self._prestop_handler
    if handler:
        try:
            await handler()
            logger.info("PreStop handler completed")
        except Exception as e:
            # Deliberately best-effort: shutdown must not be blocked.
            logger.error(f"PreStop handler error: {e}")

    confirmation = {
        "status": "ok",
        "message": "PreStop hook processed, ready for termination",
    }
    return web.json_response(confirmation)
|