dory-processor-sdk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. dory/__init__.py +101 -0
  2. dory/auth/__init__.py +10 -0
  3. dory/auth/oauth2.py +153 -0
  4. dory/auto_instrument.py +142 -0
  5. dory/cli/__init__.py +5 -0
  6. dory/cli/main.py +137 -0
  7. dory/cli/templates.py +123 -0
  8. dory/config/__init__.py +23 -0
  9. dory/config/defaults.py +24 -0
  10. dory/config/loader.py +430 -0
  11. dory/config/presets.py +73 -0
  12. dory/config/schema.py +84 -0
  13. dory/core/__init__.py +27 -0
  14. dory/core/app.py +434 -0
  15. dory/core/context.py +209 -0
  16. dory/core/lifecycle.py +214 -0
  17. dory/core/meta.py +121 -0
  18. dory/core/modes.py +479 -0
  19. dory/core/processor.py +564 -0
  20. dory/core/signals.py +122 -0
  21. dory/decorators.py +142 -0
  22. dory/edge/__init__.py +88 -0
  23. dory/edge/adaptive.py +644 -0
  24. dory/edge/detector.py +546 -0
  25. dory/edge/fencing.py +488 -0
  26. dory/edge/heartbeat.py +598 -0
  27. dory/edge/role.py +419 -0
  28. dory/errors/__init__.py +139 -0
  29. dory/errors/classification.py +362 -0
  30. dory/errors/codes.py +498 -0
  31. dory/geo/__init__.py +40 -0
  32. dory/geo/geolocalizer.py +1034 -0
  33. dory/health/__init__.py +12 -0
  34. dory/health/probes.py +210 -0
  35. dory/health/server.py +635 -0
  36. dory/k8s/__init__.py +80 -0
  37. dory/k8s/annotation_watcher.py +184 -0
  38. dory/k8s/client.py +251 -0
  39. dory/k8s/labels.py +505 -0
  40. dory/k8s/pod_metadata.py +182 -0
  41. dory/logging/__init__.py +9 -0
  42. dory/logging/logger.py +148 -0
  43. dory/metrics/__init__.py +7 -0
  44. dory/metrics/collector.py +301 -0
  45. dory/middleware/__init__.py +46 -0
  46. dory/middleware/connection_tracker.py +608 -0
  47. dory/middleware/request_id.py +325 -0
  48. dory/middleware/request_tracker.py +511 -0
  49. dory/migration/__init__.py +33 -0
  50. dory/migration/configmap.py +232 -0
  51. dory/migration/s3_store.py +594 -0
  52. dory/migration/serialization.py +135 -0
  53. dory/migration/state_manager.py +286 -0
  54. dory/migration/transfer.py +382 -0
  55. dory/monitoring/__init__.py +29 -0
  56. dory/monitoring/opentelemetry.py +489 -0
  57. dory/output/__init__.py +31 -0
  58. dory/output/envelope.py +137 -0
  59. dory/output/formatter.py +113 -0
  60. dory/output/rabbitmq.py +632 -0
  61. dory/output/routing.py +318 -0
  62. dory/output/validator.py +199 -0
  63. dory/py.typed +2 -0
  64. dory/recovery/__init__.py +60 -0
  65. dory/recovery/golden_image.py +487 -0
  66. dory/recovery/golden_snapshot.py +713 -0
  67. dory/recovery/golden_validator.py +518 -0
  68. dory/recovery/partial_recovery.py +482 -0
  69. dory/recovery/recovery_decision.py +242 -0
  70. dory/recovery/restart_detector.py +142 -0
  71. dory/recovery/state_validator.py +183 -0
  72. dory/resilience/__init__.py +45 -0
  73. dory/resilience/circuit_breaker.py +457 -0
  74. dory/resilience/retry.py +389 -0
  75. dory/simple.py +342 -0
  76. dory/types.py +68 -0
  77. dory/utils/__init__.py +31 -0
  78. dory/utils/errors.py +59 -0
  79. dory/utils/retry.py +115 -0
  80. dory/utils/timeout.py +80 -0
  81. dory_processor_sdk-0.0.1.dist-info/METADATA +424 -0
  82. dory_processor_sdk-0.0.1.dist-info/RECORD +86 -0
  83. dory_processor_sdk-0.0.1.dist-info/WHEEL +5 -0
  84. dory_processor_sdk-0.0.1.dist-info/entry_points.txt +2 -0
  85. dory_processor_sdk-0.0.1.dist-info/licenses/LICENSE +201 -0
  86. dory_processor_sdk-0.0.1.dist-info/top_level.txt +1 -0
dory/health/server.py ADDED
@@ -0,0 +1,635 @@
1
+ """
2
+ Health and metrics HTTP server.
3
+
4
+ Provides endpoints for:
5
+ - /health - Liveness probe
6
+ - /ready - Readiness probe (matches Kubernetes convention)
7
+ - /metrics - Prometheus metrics
8
+ - /state - State transfer (GET/POST) for pod migration (authenticated, with timeout)
9
+ - /prestop - PreStop hook handler for graceful shutdown
10
+ """
11
+
12
+ import json
13
+ import logging
14
+ import os
15
+ import secrets
16
+ import time
17
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
18
+ from typing import TYPE_CHECKING, Callable, Awaitable
19
+
20
+ from aiohttp import web
21
+
22
+ import dory
23
+ from dory.health.probes import LivenessProbe, ReadinessProbe
24
+ from dory.utils.errors import DoryHealthError
25
+ from dory.migration.transfer import (
26
+ TransferConfig,
27
+ validate_state_size,
28
+ StateSizeExceeded,
29
+ StateTransferTimeout,
30
+ log_transfer_summary,
31
+ TransferMetrics,
32
+ ORCHESTRATOR_STATE_TIMEOUT_SEC,
33
+ ORCHESTRATOR_MAX_STATE_SIZE,
34
+ )
35
+
36
+ if TYPE_CHECKING:
37
+ from dory.metrics.collector import MetricsCollector
38
+
39
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Response header used for SDK detection.
# Must match the orchestrator's sdkcheck.SDKVersionHeader.
SDK_VERSION_HEADER = "X-Dory-SDK-Version"

# Callback type aliases.
# StateGetter: synchronous, takes no arguments, returns the state dict.
StateGetter = Callable[[], dict]
# StateRestorer: coroutine function receiving the state dict.
StateRestorer = Callable[[dict], Awaitable[None]]
# PreStopHandler: coroutine function taking no arguments.
PreStopHandler = Callable[[], Awaitable[None]]
48
+
49
+
50
class HealthServer:
    """
    HTTP server exposing health probes, metrics, and state transfer.

    Intended to listen on its own port, separate from the main application.
    It serves the endpoints the Dory Orchestrator relies on:
    - liveness and readiness probes
    - Prometheus metrics
    - state capture/restore during pod migration
    - the PreStop hook used for graceful shutdown
    """

    # Probe and metrics paths are fixed; they are not configurable.
    HEALTH_PATH = "/health"
    READY_PATH = "/ready"
    METRICS_PATH = "/metrics"
66
+
67
def __init__(
    self,
    port: int = 8080,
    metrics_collector: "MetricsCollector | None" = None,
    state_getter: StateGetter | None = None,
    state_restorer: StateRestorer | None = None,
    prestop_handler: PreStopHandler | None = None,
    state_token: str | None = None,
    transfer_config: TransferConfig | None = None,
):
    """
    Create a health server (nothing is bound until ``start()`` is awaited).

    Args:
        port: Port to listen on.
        metrics_collector: Optional collector backing the /metrics endpoint.
        state_getter: Callback producing processor state for GET /state.
        state_restorer: Callback consuming processor state for POST /state.
        prestop_handler: Callback run by the /prestop hook.
        state_token: Bearer token protecting the /state endpoints. When
            falsy, the DORY_STATE_TOKEN environment variable is used. If
            neither is set, the state endpoints are unauthenticated (not
            recommended in production).
        transfer_config: Timeouts and size limits for state transfer. When
            falsy, a default ``TransferConfig()`` is used.
    """
    # Listening port plus the optional collaborators and callbacks.
    self._port = port
    self._metrics_collector = metrics_collector
    self._state_getter = state_getter
    self._state_restorer = state_restorer
    self._prestop_handler = prestop_handler

    # Explicit argument wins; otherwise fall back to the environment
    # (same variable name the Orchestrator uses).
    token = state_token if state_token else os.environ.get("DORY_STATE_TOKEN")
    self._state_token = token
    if not token:
        logger.warning(
            "DORY_STATE_TOKEN not configured - state endpoints are unauthenticated. "
            "Set DORY_STATE_TOKEN environment variable for production deployments."
        )

    # Transfer limits, logged next to the Orchestrator's own limits so a
    # mismatch is visible at startup.
    config = transfer_config if transfer_config else TransferConfig()
    self._transfer_config = config
    summary = (
        f"State transfer configured: capture_timeout={config.capture_timeout_sec}s "
        f"(Orchestrator: {ORCHESTRATOR_STATE_TIMEOUT_SEC}s), "
        f"max_size={config.max_size_bytes:,}B "
        f"(Orchestrator: {ORCHESTRATOR_MAX_STATE_SIZE:,}B)"
    )
    logger.info(summary)

    # Worker threads that run the synchronous state getter off the event loop.
    self._executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="state-capture")

    self._liveness = LivenessProbe()
    self._readiness = ReadinessProbe()

    # aiohttp objects are created in start().
    self._app: web.Application | None = None
    self._runner: web.AppRunner | None = None
    self._site: web.TCPSite | None = None
124
+
125
@property
def liveness_probe(self) -> LivenessProbe:
    """Liveness probe instance, exposed so callers can register custom checks."""
    return self._liveness
129
+
130
@property
def readiness_probe(self) -> ReadinessProbe:
    """Readiness probe instance, exposed so callers can register custom checks."""
    return self._readiness
134
+
135
def mark_ready(self) -> None:
    """Flag the application as ready to receive traffic (delegates to the readiness probe)."""
    self._readiness.mark_ready()
138
+
139
def mark_not_ready(self) -> None:
    """Flag the application as not ready (delegates to the readiness probe)."""
    self._readiness.mark_not_ready()
142
+
143
def set_state_getter(self, getter: StateGetter) -> None:
    """Install (or replace) the callback that produces processor state for GET /state."""
    self._state_getter = getter
146
+
147
def set_state_restorer(self, restorer: StateRestorer) -> None:
    """Install (or replace) the callback that restores processor state on POST /state."""
    self._state_restorer = restorer
150
+
151
def set_prestop_handler(self, handler: PreStopHandler) -> None:
    """Install (or replace) the callback run by the /prestop hook."""
    self._prestop_handler = handler
154
+
155
def set_state_token(self, token: str) -> None:
    """Replace the Bearer token required by the /state endpoints."""
    self._state_token = token
158
+
159
@property
def port(self) -> int:
    """Port the server is configured on; updated by start() when a port is auto-selected."""
    return self._port
163
+
164
def _find_available_port(self, start: int = 8080, end: int = 9000) -> int:
    """
    Probe a port range and return the first port that can be bound.

    Each candidate is tested by binding a throwaway TCP socket on all
    interfaces; the socket is closed again before returning, so the port is
    only known to have been free at probe time.

    Args:
        start: First port to try.
        end: Upper bound of the range (exclusive, as in ``range(start, end)``).

    Returns:
        The first port for which the bind succeeded.

    Raises:
        DoryHealthError: If no port in the range could be bound.
    """
    import socket

    for candidate in range(start, end):
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
                probe.bind(("0.0.0.0", candidate))
                logger.debug(f"Found available port: {candidate}")
                return candidate
        except OSError:
            # Port busy (or otherwise unusable): move on to the next one.
            pass

    raise DoryHealthError(f"No available port found in range {start}-{end}")
190
+
191
def _verify_state_auth(self, request: web.Request) -> tuple[bool, str]:
    """
    Verify authentication for the /state endpoints.

    Expects an ``Authorization: Bearer <token>`` header whose token equals
    the configured state token. The comparison is constant-time to avoid
    leaking token contents through timing.

    Args:
        request: The incoming HTTP request.

    Returns:
        Tuple of (is_authenticated, error_message). The message is an empty
        string when authentication succeeds.
    """
    expected_token = self._state_token

    # No token configured: unauthenticated access is allowed (a warning is
    # logged when the server is constructed).
    if not expected_token:
        return True, ""

    auth_header = request.headers.get("Authorization", "")
    if not auth_header:
        return False, "Missing Authorization header"

    # Expected format: "Bearer <token>"
    scheme_prefix = "Bearer "
    if not auth_header.startswith(scheme_prefix):
        return False, "Invalid Authorization header format (expected 'Bearer <token>')"

    provided_token = auth_header[len(scheme_prefix):]

    # secrets.compare_digest() only accepts str arguments that are pure
    # ASCII and raises TypeError otherwise. Header values are attacker
    # controlled, so compare bytes instead: a non-ASCII token is then simply
    # rejected rather than crashing the handler. "surrogatepass" makes the
    # encoding total, so no str value can raise here.
    provided_bytes = provided_token.encode("utf-8", "surrogatepass")
    expected_bytes = expected_token.encode("utf-8", "surrogatepass")
    if not secrets.compare_digest(provided_bytes, expected_bytes):
        return False, "Invalid authentication token"

    return True, ""
224
+
225
async def start(self) -> None:
    """
    Start the health server.

    If the configured port is 0, an available port is picked with
    ``_find_available_port()`` and stored so the ``port`` property reports it.

    Raises:
        DoryHealthError: If the server fails to start. Any partially
            initialised aiohttp runner is cleaned up first.
    """
    try:
        # aiohttp rejects request bodies above ``client_max_size`` (1 MiB by
        # default) while reading them. Raise that cap to the configured state
        # size limit so POST /state can accept states up to max_size_bytes.
        # The extra byte keeps a body of exactly max_size_bytes readable
        # whether the check is ">" or ">="; the handler enforces the limit.
        self._app = web.Application(
            client_max_size=self._transfer_config.max_size_bytes + 1,
        )
        self._setup_routes()

        self._runner = web.AppRunner(self._app)
        await self._runner.setup()

        # If port is 0, find an available port (development mode)
        port = self._port
        if port == 0:
            # Warn if running in Kubernetes with auto-port selection
            if os.environ.get("KUBERNETES_SERVICE_HOST") or os.environ.get("DORY_POD_NAME"):
                logger.warning(
                    "Auto-port selection (port=0) is not recommended in Kubernetes. "
                    "K8s probes require a fixed port. Set DORY_HEALTH_PORT or use production preset."
                )
            port = self._find_available_port()
            self._port = port

        self._site = web.TCPSite(
            self._runner,
            host="0.0.0.0",
            port=port,
        )
        await self._site.start()

        logger.info(f"Health server started on port {self._port}")

    except Exception as e:
        # Do not leak a half-started runner when setup or bind fails.
        runner = self._runner
        self._app = None
        self._runner = None
        self._site = None
        if runner is not None:
            try:
                await runner.cleanup()
            except Exception:
                logger.debug("Cleanup after failed health server start also failed", exc_info=True)
        raise DoryHealthError(f"Failed to start health server: {e}", cause=e) from e
264
+
265
async def stop(self) -> None:
    """Shut the server down: clean up the aiohttp runner, then release the executor."""
    runner = self._runner
    if runner:
        await runner.cleanup()

    # Do not wait for an in-flight state capture to finish.
    self._executor.shutdown(wait=False)

    logger.info("Health server stopped")
274
+
275
def _setup_routes(self) -> None:
    """Register every endpoint on the application's router."""
    router = self._app.router

    route_table = (
        # Probes and metrics
        ("GET", self.HEALTH_PATH, self._handle_health),
        ("GET", self.READY_PATH, self._handle_ready),
        ("GET", self.METRICS_PATH, self._handle_metrics),
        # State transfer endpoints (required by Dory Orchestrator)
        ("GET", "/state", self._handle_state_get),
        ("POST", "/state", self._handle_state_post),
        # PreStop hook endpoint (required by Dory Orchestrator)
        ("GET", "/prestop", self._handle_prestop),
        # Root endpoint for basic info
        ("GET", "/", self._handle_root),
    )

    for method, path, handler in route_table:
        if method == "POST":
            router.add_post(path, handler)
        else:
            router.add_get(path, handler)
290
+
291
async def _handle_root(self, request: web.Request) -> web.Response:
    """Serve GET /: service name, SDK version, and the list of endpoints."""
    sdk_version = dory.__version__
    endpoint_paths = [
        self.HEALTH_PATH,
        self.READY_PATH,
        self.METRICS_PATH,
        "/state",
        "/prestop",
    ]
    payload = {
        "service": "dory-processor",
        "sdk_version": sdk_version,
        "endpoints": endpoint_paths,
    }
    # The version header is what the orchestrator uses for SDK detection.
    return web.json_response(payload, headers={SDK_VERSION_HEADER: sdk_version})
307
+
308
async def _handle_health(self, request: web.Request) -> web.Response:
    """
    Serve the liveness probe.

    Responds 200 when the liveness check reports healthy and 503 otherwise.
    The response carries the X-Dory-SDK-Version header for orchestrator SDK
    detection.
    """
    outcome = await self._liveness.check()

    if outcome.healthy:
        http_status = 200
    else:
        http_status = 503

    return web.json_response(
        outcome.to_dict(),
        status=http_status,
        headers={SDK_VERSION_HEADER: dory.__version__},
    )
323
+
324
async def _handle_ready(self, request: web.Request) -> web.Response:
    """
    Serve the readiness probe.

    Responds 200 when the readiness check reports healthy and 503 otherwise.
    """
    outcome = await self._readiness.check()

    if outcome.healthy:
        http_status = 200
    else:
        http_status = 503

    return web.json_response(outcome.to_dict(), status=http_status)
334
+
335
async def _handle_metrics(self, request: web.Request) -> web.Response:
    """
    Serve the Prometheus metrics endpoint.

    Returns the collector's Prometheus text export. Without a collector a
    placeholder comment is returned with status 200; an export failure is
    reported as a comment line with status 500.
    """
    collector = self._metrics_collector
    if collector is None:
        return web.Response(
            text="# No metrics collector configured\n",
            content_type="text/plain",
        )

    try:
        exported = collector.export_prometheus()
        response = web.Response(
            text=exported,
            content_type="text/plain; version=0.0.4",
            charset="utf-8",
        )
    except Exception as e:
        logger.error(f"Error exporting metrics: {e}")
        response = web.Response(
            text=f"# Error exporting metrics: {e}\n",
            content_type="text/plain",
            status=500,
        )
    return response
361
+
362
async def _handle_state_get(self, request: web.Request) -> web.Response:
    """
    Handle GET /state for state capture during migration.

    Called by Dory Orchestrator to capture state from the old pod before
    transferring it to the new pod. Requires Bearer-token authentication
    when a state token is configured.

    The synchronous state getter runs in the server's thread pool and is
    awaited for at most ``capture_timeout_sec``; the serialized state must
    also pass ``validate_state_size``.

    Returns:
        200 with the state as JSON on success.
        401 if authentication fails.
        503 if no state getter is configured.
        504 if the capture exceeds the timeout.
        413 if the serialized state exceeds the size limit.
        500 on any other failure.
    """
    import asyncio

    # Verify authentication
    is_authenticated, auth_error = self._verify_state_auth(request)
    if not is_authenticated:
        logger.warning(f"State GET authentication failed: {auth_error}")
        return web.json_response({"error": auth_error}, status=401)

    if self._state_getter is None:
        logger.warning("State getter not configured, returning empty state")
        return web.json_response(
            {"error": "state_getter not configured", "data": {}},
            status=503,
        )

    config = self._transfer_config
    start_time = time.monotonic()

    def _report(
        size_bytes: int, *, timed_out: bool = False, size_exceeded: bool = False
    ) -> TransferMetrics:
        """Build the metrics for this capture attempt and log the summary."""
        metrics = TransferMetrics(
            duration_sec=time.monotonic() - start_time,
            size_bytes=size_bytes,
            size_ratio=(size_bytes / config.max_size_bytes) if size_bytes else 0,
            timed_out=timed_out,
            size_exceeded=size_exceeded,
        )
        log_transfer_summary("capture", metrics, config)
        return metrics

    try:
        # The getter is synchronous, so run it in the thread pool and bound
        # the wait. On timeout the worker thread is NOT interrupted: it keeps
        # its executor slot until the getter returns, but the orchestrator
        # gets a timely 504 instead of a hung request.
        loop = asyncio.get_running_loop()
        try:
            state = await asyncio.wait_for(
                loop.run_in_executor(self._executor, self._state_getter),
                timeout=config.capture_timeout_sec,
            )
        except (asyncio.TimeoutError, FuturesTimeoutError):
            _report(0, timed_out=True)
            return web.json_response(
                {
                    "error": f"State capture timed out after {config.capture_timeout_sec}s",
                    "timeout": True,
                },
                status=504,  # Gateway Timeout
            )

        # Serialize once; the same text is validated and sent as the body.
        state_json = json.dumps(state)
        size_bytes = len(state_json.encode("utf-8"))

        try:
            validate_state_size(
                state_json,
                max_size=config.max_size_bytes,
                warn_threshold=config.size_warn_threshold,
            )
        except StateSizeExceeded as e:
            _report(size_bytes, size_exceeded=True)
            return web.json_response(
                {
                    "error": str(e),
                    "size_exceeded": True,
                    "size_bytes": size_bytes,
                    "max_bytes": config.max_size_bytes,
                },
                status=413,  # Payload Too Large
            )

        # Success - log metrics
        metrics = _report(size_bytes)
        logger.info(
            "State captured for transfer",
            extra={
                "state_keys": list(state.keys()),
                "size_bytes": size_bytes,
                "duration_sec": metrics.duration_sec,
            },
        )
        # Same body and content type that web.json_response(state) would
        # produce, without serializing the state a second time.
        return web.Response(text=state_json, content_type="application/json")

    except Exception as e:
        logger.error(f"Failed to capture state: {e}", exc_info=True)
        return web.json_response(
            {"error": f"Failed to capture state: {e}"},
            status=500,
        )
496
+
497
async def _handle_state_post(self, request: web.Request) -> web.Response:
    """
    Handle POST /state for state restoration during migration.

    Called by Dory Orchestrator to restore state into the new pod after
    capturing it from the old pod. Requires Bearer-token authentication
    when a state token is configured. The restore callback is awaited for
    at most ``restore_timeout_sec``.

    Returns:
        200 with a confirmation payload on success.
        401 if authentication fails.
        503 if no state restorer is configured.
        413 if the body exceeds the size limit.
        400 if the body is not a UTF-8 encoded JSON object.
        504 if the restore exceeds the timeout.
        500 on any other failure.
    """
    import asyncio

    # Verify authentication
    is_authenticated, auth_error = self._verify_state_auth(request)
    if not is_authenticated:
        logger.warning(f"State POST authentication failed: {auth_error}")
        return web.json_response({"error": auth_error}, status=401)

    if self._state_restorer is None:
        logger.warning("State restorer not configured")
        return web.json_response(
            {"error": "state_restorer not configured"},
            status=503,
        )

    config = self._transfer_config
    max_bytes = config.max_size_bytes
    start_time = time.monotonic()

    def _too_large(size_bytes: int) -> web.Response:
        """Log and build the 413 response for an oversized state payload."""
        logger.error(
            f"Incoming state size ({size_bytes:,}B) exceeds maximum "
            f"({max_bytes:,}B)"
        )
        return web.json_response(
            {
                "error": f"State size ({size_bytes:,} bytes) exceeds maximum "
                f"({max_bytes:,} bytes)",
                "size_exceeded": True,
            },
            status=413,
        )

    try:
        # Reject an oversized upload from its declared length, before
        # buffering the body in memory.
        declared_size = request.content_length
        if declared_size is not None and declared_size > max_bytes:
            return _too_large(declared_size)

        try:
            body = await request.read()
        except web.HTTPRequestEntityTooLarge:
            # aiohttp enforces its own body cap while reading; report it as
            # "too large" instead of a generic 500.
            logger.error("Incoming state rejected: request body exceeds the server body size limit")
            return web.json_response(
                {
                    "error": "State size exceeds the server's request body size limit",
                    "size_exceeded": True,
                },
                status=413,
            )

        # Covers chunked uploads that declared no Content-Length.
        size_bytes = len(body)
        if size_bytes > max_bytes:
            return _too_large(size_bytes)

        state = json.loads(body.decode("utf-8"))
        if not isinstance(state, dict):
            # Valid JSON, but the restorer expects a mapping.
            logger.error(
                f"Invalid state restore payload: expected JSON object, got {type(state).__name__}"
            )
            return web.json_response(
                {"error": "Invalid state: expected a JSON object"},
                status=400,
            )

        logger.info(
            "Restoring state from transfer",
            extra={"state_keys": list(state.keys()), "size_bytes": size_bytes},
        )

        # Execute restore with timeout
        try:
            await asyncio.wait_for(
                self._state_restorer(state),
                timeout=config.restore_timeout_sec,
            )
        except asyncio.TimeoutError:
            duration = time.monotonic() - start_time
            logger.error(
                f"State restore timed out after {duration:.1f}s "
                f"(limit: {config.restore_timeout_sec}s)"
            )
            return web.json_response(
                {
                    "error": f"State restore timed out after {config.restore_timeout_sec}s",
                    "timeout": True,
                },
                status=504,
            )

        duration = time.monotonic() - start_time
        logger.info(
            f"State restored successfully in {duration:.2f}s",
            extra={"size_bytes": size_bytes, "duration_sec": duration},
        )

        # Warn if the restore used more than half of its time budget
        if duration > config.restore_timeout_sec * 0.5:
            logger.warning(
                f"State restore took {duration:.1f}s "
                f"({duration/config.restore_timeout_sec:.0%} of "
                f"{config.restore_timeout_sec}s timeout)"
            )

        return web.json_response({
            "status": "ok",
            "message": "State restored",
            "duration_sec": round(duration, 3),
            "size_bytes": size_bytes,
        })

    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # Malformed body (bad JSON or not UTF-8) is a client error.
        logger.error(f"Invalid JSON in state restore request: {e}")
        return web.json_response(
            {"error": f"Invalid JSON: {e}"},
            status=400,
        )
    except Exception as e:
        logger.error(f"Failed to restore state: {e}", exc_info=True)
        return web.json_response(
            {"error": f"Failed to restore state: {e}"},
            status=500,
        )
608
+
609
async def _handle_prestop(self, request: web.Request) -> web.Response:
    """
    Handle GET /prestop, the PreStop hook.

    Marks the server not ready, then runs the optional prestop handler.
    Handler failures are logged and swallowed so that shutdown is never
    blocked by this endpoint.

    Returns:
        JSON response confirming that the hook was processed.
    """
    logger.info("PreStop hook invoked - preparing for shutdown")

    # Fail readiness first so no new traffic is routed to this pod.
    self._readiness.mark_not_ready()

    handler = self._prestop_handler
    if handler:
        try:
            await handler()
            logger.info("PreStop handler completed")
        except Exception as e:
            # Deliberately best-effort: report and carry on.
            logger.error(f"PreStop handler error: {e}")

    confirmation = {
        "status": "ok",
        "message": "PreStop hook processed, ready for termination",
    }
    return web.json_response(confirmation)