dory-sdk 2.1.0__py3-none-any.whl → 2.1.4__py3-none-any.whl

This diff shows the differences between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
dory/health/server.py CHANGED
@@ -5,17 +5,32 @@ Provides endpoints for:
5
5
  - /healthz - Liveness probe
6
6
  - /ready - Readiness probe (matches Kubernetes convention)
7
7
  - /metrics - Prometheus metrics
8
- - /state - State transfer (GET/POST) for pod migration
8
+ - /state - State transfer (GET/POST) for pod migration (authenticated, with timeout)
9
9
  - /prestop - PreStop hook handler for graceful shutdown
10
10
  """
11
11
 
12
+ import json
12
13
  import logging
14
+ import os
15
+ import secrets
16
+ import time
17
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
13
18
  from typing import TYPE_CHECKING, Callable, Awaitable
14
19
 
15
20
  from aiohttp import web
16
21
 
17
22
  from dory.health.probes import LivenessProbe, ReadinessProbe
18
23
  from dory.utils.errors import DoryHealthError
24
+ from dory.migration.transfer import (
25
+ TransferConfig,
26
+ validate_state_size,
27
+ StateSizeExceeded,
28
+ StateTransferTimeout,
29
+ log_transfer_summary,
30
+ TransferMetrics,
31
+ ORCHESTRATOR_STATE_TIMEOUT_SEC,
32
+ ORCHESTRATOR_MAX_STATE_SIZE,
33
+ )
19
34
 
20
35
  if TYPE_CHECKING:
21
36
  from dory.metrics.collector import MetricsCollector
@@ -50,6 +65,8 @@ class HealthServer:
50
65
  state_getter: StateGetter | None = None,
51
66
  state_restorer: StateRestorer | None = None,
52
67
  prestop_handler: PreStopHandler | None = None,
68
+ state_token: str | None = None,
69
+ transfer_config: TransferConfig | None = None,
53
70
  ):
54
71
  """
55
72
  Initialize health server.
@@ -63,6 +80,11 @@ class HealthServer:
63
80
  state_getter: Callback to get processor state for /state GET
64
81
  state_restorer: Callback to restore processor state for /state POST
65
82
  prestop_handler: Callback for /prestop PreStop hook
83
+ state_token: Authentication token for /state endpoints. If not provided,
84
+ reads from DORY_STATE_TOKEN environment variable. If neither is set,
85
+ state endpoints are unauthenticated (not recommended in production).
86
+ transfer_config: Configuration for state transfer timeouts and size limits.
87
+ If not provided, uses defaults aligned with Orchestrator (25s timeout, 8MB max).
66
88
  """
67
89
  self._port = port
68
90
  self._health_path = health_path
@@ -73,6 +95,26 @@ class HealthServer:
73
95
  self._state_restorer = state_restorer
74
96
  self._prestop_handler = prestop_handler
75
97
 
98
+ # State endpoint authentication token (matches Orchestrator's DORY_STATE_TOKEN)
99
+ self._state_token = state_token or os.environ.get("DORY_STATE_TOKEN")
100
+ if not self._state_token:
101
+ logger.warning(
102
+ "DORY_STATE_TOKEN not configured - state endpoints are unauthenticated. "
103
+ "Set DORY_STATE_TOKEN environment variable for production deployments."
104
+ )
105
+
106
+ # State transfer configuration (aligned with Orchestrator limits)
107
+ self._transfer_config = transfer_config or TransferConfig()
108
+ logger.info(
109
+ f"State transfer configured: capture_timeout={self._transfer_config.capture_timeout_sec}s "
110
+ f"(Orchestrator: {ORCHESTRATOR_STATE_TIMEOUT_SEC}s), "
111
+ f"max_size={self._transfer_config.max_size_bytes:,}B "
112
+ f"(Orchestrator: {ORCHESTRATOR_MAX_STATE_SIZE:,}B)"
113
+ )
114
+
115
+ # Thread pool for running synchronous state getter with timeout
116
+ self._executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="state-capture")
117
+
76
118
  self._liveness = LivenessProbe()
77
119
  self._readiness = ReadinessProbe()
78
120
 
@@ -110,6 +152,44 @@ class HealthServer:
110
152
  """Set the callback for PreStop hook."""
111
153
  self._prestop_handler = handler
112
154
 
155
+ def set_state_token(self, token: str) -> None:
156
+ """Set the authentication token for state endpoints."""
157
+ self._state_token = token
158
+
159
+ def _verify_state_auth(self, request: web.Request) -> tuple[bool, str]:
160
+ """
161
+ Verify authentication for state endpoints.
162
+
163
+ Uses Bearer token authentication matching Orchestrator's implementation.
164
+ Token comparison uses constant-time comparison to prevent timing attacks.
165
+
166
+ Args:
167
+ request: The incoming HTTP request
168
+
169
+ Returns:
170
+ Tuple of (is_authenticated, error_message)
171
+ """
172
+ # If no token configured, allow unauthenticated access (with warning logged at startup)
173
+ if not self._state_token:
174
+ return True, ""
175
+
176
+ auth_header = request.headers.get("Authorization", "")
177
+
178
+ if not auth_header:
179
+ return False, "Missing Authorization header"
180
+
181
+ # Expected format: "Bearer <token>"
182
+ if not auth_header.startswith("Bearer "):
183
+ return False, "Invalid Authorization header format (expected 'Bearer <token>')"
184
+
185
+ provided_token = auth_header[7:] # Strip "Bearer " prefix
186
+
187
+ # Use constant-time comparison to prevent timing attacks
188
+ if not secrets.compare_digest(provided_token, self._state_token):
189
+ return False, "Invalid authentication token"
190
+
191
+ return True, ""
192
+
113
193
  async def start(self) -> None:
114
194
  """
115
195
  Start the health server.
@@ -140,7 +220,11 @@ class HealthServer:
140
220
  """Stop the health server."""
141
221
  if self._runner:
142
222
  await self._runner.cleanup()
143
- logger.info("Health server stopped")
223
+
224
+ # Shutdown executor
225
+ self._executor.shutdown(wait=False)
226
+
227
+ logger.info("Health server stopped")
144
228
 
145
229
  def _setup_routes(self) -> None:
146
230
  """Configure HTTP routes."""
@@ -227,9 +311,21 @@ class HealthServer:
227
311
  Called by Dory Orchestrator to capture state from old pod
228
312
  before transferring to new pod.
229
313
 
314
+ Requires authentication via Bearer token (DORY_STATE_TOKEN).
315
+ Enforces timeout and size limits to prevent Orchestrator timeouts.
316
+
230
317
  Returns:
231
318
  JSON response with processor state
232
319
  """
320
+ # Verify authentication
321
+ is_authenticated, auth_error = self._verify_state_auth(request)
322
+ if not is_authenticated:
323
+ logger.warning(f"State GET authentication failed: {auth_error}")
324
+ return web.json_response(
325
+ {"error": auth_error},
326
+ status=401,
327
+ )
328
+
233
329
  if self._state_getter is None:
234
330
  logger.warning("State getter not configured, returning empty state")
235
331
  return web.json_response({
@@ -237,10 +333,105 @@ class HealthServer:
237
333
  "data": {},
238
334
  }, status=503)
239
335
 
336
+ start_time = time.monotonic()
337
+ metrics: TransferMetrics | None = None
338
+
240
339
  try:
241
- state = self._state_getter()
242
- logger.info("State captured for transfer", extra={"state_keys": list(state.keys())})
340
+ # Run state getter with timeout in thread pool
341
+ # (state_getter is synchronous, but we need timeout control)
342
+ loop = __import__("asyncio").get_event_loop()
343
+ state = await loop.run_in_executor(
344
+ self._executor,
345
+ self._state_getter,
346
+ )
347
+
348
+ # Check if we're already over timeout (executor doesn't enforce timeout)
349
+ capture_duration = time.monotonic() - start_time
350
+ if capture_duration > self._transfer_config.capture_timeout_sec:
351
+ metrics = TransferMetrics(
352
+ duration_sec=capture_duration,
353
+ size_bytes=0,
354
+ size_ratio=0,
355
+ timed_out=True,
356
+ size_exceeded=False,
357
+ )
358
+ log_transfer_summary("capture", metrics, self._transfer_config)
359
+ return web.json_response(
360
+ {
361
+ "error": f"State capture timed out after {capture_duration:.1f}s "
362
+ f"(limit: {self._transfer_config.capture_timeout_sec}s)",
363
+ "timeout": True,
364
+ },
365
+ status=504, # Gateway Timeout
366
+ )
367
+
368
+ # Serialize and validate size
369
+ state_json = json.dumps(state)
370
+ size_bytes = len(state_json.encode("utf-8"))
371
+
372
+ # Validate size
373
+ try:
374
+ validate_state_size(
375
+ state_json,
376
+ max_size=self._transfer_config.max_size_bytes,
377
+ warn_threshold=self._transfer_config.size_warn_threshold,
378
+ )
379
+ except StateSizeExceeded as e:
380
+ metrics = TransferMetrics(
381
+ duration_sec=time.monotonic() - start_time,
382
+ size_bytes=size_bytes,
383
+ size_ratio=size_bytes / self._transfer_config.max_size_bytes,
384
+ timed_out=False,
385
+ size_exceeded=True,
386
+ )
387
+ log_transfer_summary("capture", metrics, self._transfer_config)
388
+ return web.json_response(
389
+ {
390
+ "error": str(e),
391
+ "size_exceeded": True,
392
+ "size_bytes": size_bytes,
393
+ "max_bytes": self._transfer_config.max_size_bytes,
394
+ },
395
+ status=413, # Payload Too Large
396
+ )
397
+
398
+ # Success - log metrics
399
+ metrics = TransferMetrics(
400
+ duration_sec=time.monotonic() - start_time,
401
+ size_bytes=size_bytes,
402
+ size_ratio=size_bytes / self._transfer_config.max_size_bytes,
403
+ timed_out=False,
404
+ size_exceeded=False,
405
+ )
406
+ log_transfer_summary("capture", metrics, self._transfer_config)
407
+
408
+ logger.info(
409
+ "State captured for transfer",
410
+ extra={
411
+ "state_keys": list(state.keys()),
412
+ "size_bytes": size_bytes,
413
+ "duration_sec": metrics.duration_sec,
414
+ },
415
+ )
243
416
  return web.json_response(state)
417
+
418
+ except FuturesTimeoutError:
419
+ metrics = TransferMetrics(
420
+ duration_sec=time.monotonic() - start_time,
421
+ size_bytes=0,
422
+ size_ratio=0,
423
+ timed_out=True,
424
+ size_exceeded=False,
425
+ )
426
+ log_transfer_summary("capture", metrics, self._transfer_config)
427
+ return web.json_response(
428
+ {
429
+ "error": f"State capture timed out after {self._transfer_config.capture_timeout_sec}s",
430
+ "timeout": True,
431
+ },
432
+ status=504,
433
+ )
434
+
244
435
  except Exception as e:
245
436
  logger.error(f"Failed to capture state: {e}")
246
437
  return web.json_response(
@@ -255,21 +446,104 @@ class HealthServer:
255
446
  Called by Dory Orchestrator to restore state to new pod
256
447
  after capturing from old pod.
257
448
 
449
+ Requires authentication via Bearer token (DORY_STATE_TOKEN).
450
+ Enforces timeout to prevent Orchestrator timeouts.
451
+
258
452
  Returns:
259
453
  JSON response confirming state restoration
260
454
  """
455
+ import asyncio
456
+
457
+ # Verify authentication
458
+ is_authenticated, auth_error = self._verify_state_auth(request)
459
+ if not is_authenticated:
460
+ logger.warning(f"State POST authentication failed: {auth_error}")
461
+ return web.json_response(
462
+ {"error": auth_error},
463
+ status=401,
464
+ )
465
+
261
466
  if self._state_restorer is None:
262
467
  logger.warning("State restorer not configured")
263
468
  return web.json_response({
264
469
  "error": "state_restorer not configured",
265
470
  }, status=503)
266
471
 
472
+ start_time = time.monotonic()
473
+
267
474
  try:
268
- state = await request.json()
269
- logger.info("Restoring state from transfer", extra={"state_keys": list(state.keys())})
270
- await self._state_restorer(state)
271
- logger.info("State restored successfully")
272
- return web.json_response({"status": "ok", "message": "State restored"})
475
+ # Read request body
476
+ body = await request.read()
477
+ size_bytes = len(body)
478
+
479
+ # Validate incoming state size
480
+ if size_bytes > self._transfer_config.max_size_bytes:
481
+ logger.error(
482
+ f"Incoming state size ({size_bytes:,}B) exceeds maximum "
483
+ f"({self._transfer_config.max_size_bytes:,}B)"
484
+ )
485
+ return web.json_response(
486
+ {
487
+ "error": f"State size ({size_bytes:,} bytes) exceeds maximum "
488
+ f"({self._transfer_config.max_size_bytes:,} bytes)",
489
+ "size_exceeded": True,
490
+ },
491
+ status=413,
492
+ )
493
+
494
+ state = json.loads(body.decode("utf-8"))
495
+ logger.info(
496
+ "Restoring state from transfer",
497
+ extra={"state_keys": list(state.keys()), "size_bytes": size_bytes},
498
+ )
499
+
500
+ # Execute restore with timeout
501
+ try:
502
+ await asyncio.wait_for(
503
+ self._state_restorer(state),
504
+ timeout=self._transfer_config.restore_timeout_sec,
505
+ )
506
+ except asyncio.TimeoutError:
507
+ duration = time.monotonic() - start_time
508
+ logger.error(
509
+ f"State restore timed out after {duration:.1f}s "
510
+ f"(limit: {self._transfer_config.restore_timeout_sec}s)"
511
+ )
512
+ return web.json_response(
513
+ {
514
+ "error": f"State restore timed out after {self._transfer_config.restore_timeout_sec}s",
515
+ "timeout": True,
516
+ },
517
+ status=504,
518
+ )
519
+
520
+ duration = time.monotonic() - start_time
521
+ logger.info(
522
+ f"State restored successfully in {duration:.2f}s",
523
+ extra={"size_bytes": size_bytes, "duration_sec": duration},
524
+ )
525
+
526
+ # Warn if restore took significant time
527
+ if duration > self._transfer_config.restore_timeout_sec * 0.5:
528
+ logger.warning(
529
+ f"State restore took {duration:.1f}s "
530
+ f"({duration/self._transfer_config.restore_timeout_sec:.0%} of "
531
+ f"{self._transfer_config.restore_timeout_sec}s timeout)"
532
+ )
533
+
534
+ return web.json_response({
535
+ "status": "ok",
536
+ "message": "State restored",
537
+ "duration_sec": round(duration, 3),
538
+ "size_bytes": size_bytes,
539
+ })
540
+
541
+ except json.JSONDecodeError as e:
542
+ logger.error(f"Invalid JSON in state restore request: {e}")
543
+ return web.json_response(
544
+ {"error": f"Invalid JSON: {e}"},
545
+ status=400,
546
+ )
273
547
  except Exception as e:
274
548
  logger.error(f"Failed to restore state: {e}")
275
549
  return web.json_response(
dory/k8s/__init__.py CHANGED
@@ -3,9 +3,78 @@
3
3
  from dory.k8s.client import K8sClient
4
4
  from dory.k8s.pod_metadata import PodMetadata
5
5
  from dory.k8s.annotation_watcher import AnnotationWatcher
6
+ from dory.k8s.labels import (
7
+ # Label keys
8
+ LABEL_MANAGED_BY,
9
+ LABEL_APP_NAME,
10
+ LABEL_PROCESSOR_ID,
11
+ LABEL_WORKLOAD_TYPE,
12
+ LABEL_WORKLOAD_LOCATION,
13
+ LABEL_NODE_TYPE,
14
+ LABEL_MIGRATED_FROM_EDGE,
15
+ LABEL_ORIGINAL_NODE,
16
+ # Label values
17
+ VALUE_ORCHESTRATOR_NAME,
18
+ VALUE_EDGE,
19
+ VALUE_MANAGED,
20
+ # Enums
21
+ WorkloadLocation,
22
+ NodeType,
23
+ # Builder
24
+ DoryLabels,
25
+ # Utilities
26
+ get_label,
27
+ get_app_name,
28
+ get_processor_id,
29
+ get_workload_location,
30
+ is_managed_by_dory,
31
+ is_edge_workload,
32
+ is_migrated_from_edge,
33
+ get_original_node,
34
+ detect_workload_context,
35
+ edge_node_selector,
36
+ managed_node_selector,
37
+ edge_toleration,
38
+ get_contract_documentation,
39
+ CONTRACT_VERSION,
40
+ )
6
41
 
7
42
  __all__ = [
43
+ # Core
8
44
  "K8sClient",
9
45
  "PodMetadata",
10
46
  "AnnotationWatcher",
47
+ # Label keys
48
+ "LABEL_MANAGED_BY",
49
+ "LABEL_APP_NAME",
50
+ "LABEL_PROCESSOR_ID",
51
+ "LABEL_WORKLOAD_TYPE",
52
+ "LABEL_WORKLOAD_LOCATION",
53
+ "LABEL_NODE_TYPE",
54
+ "LABEL_MIGRATED_FROM_EDGE",
55
+ "LABEL_ORIGINAL_NODE",
56
+ # Label values
57
+ "VALUE_ORCHESTRATOR_NAME",
58
+ "VALUE_EDGE",
59
+ "VALUE_MANAGED",
60
+ # Enums
61
+ "WorkloadLocation",
62
+ "NodeType",
63
+ # Builder
64
+ "DoryLabels",
65
+ # Utilities
66
+ "get_label",
67
+ "get_app_name",
68
+ "get_processor_id",
69
+ "get_workload_location",
70
+ "is_managed_by_dory",
71
+ "is_edge_workload",
72
+ "is_migrated_from_edge",
73
+ "get_original_node",
74
+ "detect_workload_context",
75
+ "edge_node_selector",
76
+ "managed_node_selector",
77
+ "edge_toleration",
78
+ "get_contract_documentation",
79
+ "CONTRACT_VERSION",
11
80
  ]