dory-sdk 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. dory/__init__.py +70 -0
  2. dory/auto_instrument.py +142 -0
  3. dory/cli/__init__.py +5 -0
  4. dory/cli/main.py +290 -0
  5. dory/cli/templates.py +333 -0
  6. dory/config/__init__.py +23 -0
  7. dory/config/defaults.py +50 -0
  8. dory/config/loader.py +361 -0
  9. dory/config/presets.py +325 -0
  10. dory/config/schema.py +152 -0
  11. dory/core/__init__.py +27 -0
  12. dory/core/app.py +404 -0
  13. dory/core/context.py +209 -0
  14. dory/core/lifecycle.py +214 -0
  15. dory/core/meta.py +121 -0
  16. dory/core/modes.py +479 -0
  17. dory/core/processor.py +654 -0
  18. dory/core/signals.py +122 -0
  19. dory/decorators.py +142 -0
  20. dory/errors/__init__.py +117 -0
  21. dory/errors/classification.py +362 -0
  22. dory/errors/codes.py +495 -0
  23. dory/health/__init__.py +10 -0
  24. dory/health/probes.py +210 -0
  25. dory/health/server.py +306 -0
  26. dory/k8s/__init__.py +11 -0
  27. dory/k8s/annotation_watcher.py +184 -0
  28. dory/k8s/client.py +251 -0
  29. dory/k8s/pod_metadata.py +182 -0
  30. dory/logging/__init__.py +9 -0
  31. dory/logging/logger.py +175 -0
  32. dory/metrics/__init__.py +7 -0
  33. dory/metrics/collector.py +301 -0
  34. dory/middleware/__init__.py +36 -0
  35. dory/middleware/connection_tracker.py +608 -0
  36. dory/middleware/request_id.py +321 -0
  37. dory/middleware/request_tracker.py +501 -0
  38. dory/migration/__init__.py +11 -0
  39. dory/migration/configmap.py +260 -0
  40. dory/migration/serialization.py +167 -0
  41. dory/migration/state_manager.py +301 -0
  42. dory/monitoring/__init__.py +23 -0
  43. dory/monitoring/opentelemetry.py +462 -0
  44. dory/py.typed +2 -0
  45. dory/recovery/__init__.py +60 -0
  46. dory/recovery/golden_image.py +480 -0
  47. dory/recovery/golden_snapshot.py +561 -0
  48. dory/recovery/golden_validator.py +518 -0
  49. dory/recovery/partial_recovery.py +479 -0
  50. dory/recovery/recovery_decision.py +242 -0
  51. dory/recovery/restart_detector.py +142 -0
  52. dory/recovery/state_validator.py +187 -0
  53. dory/resilience/__init__.py +45 -0
  54. dory/resilience/circuit_breaker.py +454 -0
  55. dory/resilience/retry.py +389 -0
  56. dory/sidecar/__init__.py +6 -0
  57. dory/sidecar/main.py +75 -0
  58. dory/sidecar/server.py +329 -0
  59. dory/simple.py +342 -0
  60. dory/types.py +75 -0
  61. dory/utils/__init__.py +25 -0
  62. dory/utils/errors.py +59 -0
  63. dory/utils/retry.py +115 -0
  64. dory/utils/timeout.py +80 -0
  65. dory_sdk-2.1.0.dist-info/METADATA +663 -0
  66. dory_sdk-2.1.0.dist-info/RECORD +69 -0
  67. dory_sdk-2.1.0.dist-info/WHEEL +5 -0
  68. dory_sdk-2.1.0.dist-info/entry_points.txt +3 -0
  69. dory_sdk-2.1.0.dist-info/top_level.txt +1 -0
dory/config/schema.py ADDED
@@ -0,0 +1,152 @@
1
+ """
2
+ Configuration schema for Dory SDK.
3
+
4
+ Uses Pydantic for validation and type coercion.
5
+ """
6
+
7
+ from typing import Optional
8
+ from pydantic import BaseModel, Field, field_validator
9
+
10
+ from dory.types import StateBackend, LogFormat
11
+ from dory.config.defaults import DEFAULT_CONFIG
12
+
13
+
14
class DoryConfig(BaseModel):
    """
    Dory SDK configuration schema.

    All configuration can be set via:
    1. YAML config file
    2. Environment variables (DORY_ prefix)
    3. Constructor arguments

    Every field default is read from ``DEFAULT_CONFIG``. Integer fields
    carry inclusive ``ge``/``le`` bounds, and ``state_backend``,
    ``log_level`` and ``log_format`` are additionally checked by the
    validators defined below. Unknown keys are ignored. Validation also
    runs on attribute assignment, so a value set after construction is
    checked (and ``log_level`` upper-cased) exactly like a constructor
    argument.
    """

    # Lifecycle timeouts
    startup_timeout_sec: int = Field(
        default=DEFAULT_CONFIG["startup_timeout_sec"],
        ge=1,
        le=300,
        description="Maximum time for startup in seconds",
    )
    shutdown_timeout_sec: int = Field(
        default=DEFAULT_CONFIG["shutdown_timeout_sec"],
        ge=1,
        le=300,
        description="Maximum time for shutdown in seconds",
    )
    health_check_interval_sec: int = Field(
        default=DEFAULT_CONFIG["health_check_interval_sec"],
        ge=1,
        le=60,
        description="Interval between health checks",
    )

    # Health server
    health_port: int = Field(
        default=DEFAULT_CONFIG["health_port"],
        ge=1,
        le=65535,
        description="Port for health/metrics HTTP server",
    )
    health_path: str = Field(
        default=DEFAULT_CONFIG["health_path"],
        description="Path for liveness probe",
    )
    ready_path: str = Field(
        default=DEFAULT_CONFIG["ready_path"],
        description="Path for readiness probe",
    )
    metrics_path: str = Field(
        default=DEFAULT_CONFIG["metrics_path"],
        description="Path for Prometheus metrics",
    )

    # State management
    state_backend: str = Field(
        default=DEFAULT_CONFIG["state_backend"],
        description="Backend for state persistence",
    )
    state_pvc_mount: str = Field(
        default=DEFAULT_CONFIG["state_pvc_mount"],
        description="Mount path for PVC state backend",
    )
    state_s3_bucket: Optional[str] = Field(
        default=DEFAULT_CONFIG["state_s3_bucket"],
        description="S3 bucket for state (if using S3 backend)",
    )
    state_s3_prefix: str = Field(
        default=DEFAULT_CONFIG["state_s3_prefix"],
        description="S3 key prefix for state objects",
    )

    # Recovery
    max_restart_attempts: int = Field(
        default=DEFAULT_CONFIG["max_restart_attempts"],
        ge=1,
        le=10,
        description="Max restarts before golden image reset",
    )
    restart_backoff_sec: int = Field(
        default=DEFAULT_CONFIG["restart_backoff_sec"],
        ge=0,
        le=300,
        description="Backoff delay between restarts",
    )
    golden_image_threshold: int = Field(
        default=DEFAULT_CONFIG["golden_image_threshold"],
        ge=1,
        le=10,
        description="Restart count triggering golden image reset",
    )

    # Logging
    log_level: str = Field(
        default=DEFAULT_CONFIG["log_level"],
        # Kept in sync with the levels accepted by validate_log_level.
        description="Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)",
    )
    log_format: str = Field(
        default=DEFAULT_CONFIG["log_format"],
        description="Log format (json or text)",
    )

    # Metrics
    metrics_enabled: bool = Field(
        default=DEFAULT_CONFIG["metrics_enabled"],
        description="Enable Prometheus metrics",
    )
    metrics_prefix: str = Field(
        default=DEFAULT_CONFIG["metrics_prefix"],
        description="Prefix for metric names",
    )

    @field_validator("state_backend")
    @classmethod
    def validate_state_backend(cls, v: str) -> str:
        """
        Validate state backend value.

        Args:
            v: Candidate backend name.

        Returns:
            ``v`` unchanged when it equals the value of a ``StateBackend`` member.

        Raises:
            ValueError: If ``v`` is not the value of any ``StateBackend`` member.
        """
        valid_backends = [b.value for b in StateBackend]
        if v not in valid_backends:
            raise ValueError(f"state_backend must be one of {valid_backends}")
        return v

    @field_validator("log_level")
    @classmethod
    def validate_log_level(cls, v: str) -> str:
        """
        Validate log level.

        The comparison is case-insensitive; the stored value is upper-cased.

        Args:
            v: Candidate level name in any letter case.

        Returns:
            The upper-cased level name.

        Raises:
            ValueError: If the upper-cased name is not a recognised level.
        """
        valid_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
        v_upper = v.upper()
        if v_upper not in valid_levels:
            raise ValueError(f"log_level must be one of {valid_levels}")
        return v_upper

    @field_validator("log_format")
    @classmethod
    def validate_log_format(cls, v: str) -> str:
        """
        Validate log format.

        Args:
            v: Candidate format name.

        Returns:
            ``v`` unchanged when it equals the value of a ``LogFormat`` member.

        Raises:
            ValueError: If ``v`` is not the value of any ``LogFormat`` member.
        """
        valid_formats = [f.value for f in LogFormat]
        if v not in valid_formats:
            raise ValueError(f"log_format must be one of {valid_formats}")
        return v

    model_config = {
        "extra": "ignore",  # Ignore unknown fields
        # Re-run bounds and validators when an attribute is assigned after
        # construction, so post-load overrides cannot bypass validation.
        "validate_assignment": True,
    }
dory/core/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ """Core modules for Dory SDK."""
2
+
3
+ from dory.core.processor import BaseProcessor
4
+ from dory.core.context import ExecutionContext
5
+ from dory.core.app import DoryApp
6
+ from dory.core.lifecycle import LifecycleManager
7
+ from dory.core.signals import SignalHandler
8
+ from dory.core.modes import (
9
+ ModeManager,
10
+ ProcessingMode,
11
+ ModeTransition,
12
+ ModeTransitionReason,
13
+ ModeConfig,
14
+ )
15
+
16
+ __all__ = [
17
+ "BaseProcessor",
18
+ "ExecutionContext",
19
+ "DoryApp",
20
+ "LifecycleManager",
21
+ "SignalHandler",
22
+ "ModeManager",
23
+ "ProcessingMode",
24
+ "ModeTransition",
25
+ "ModeTransitionReason",
26
+ "ModeConfig",
27
+ ]
dory/core/app.py ADDED
@@ -0,0 +1,404 @@
1
+ """
2
+ DoryApp - Main entry point for processor applications.
3
+
4
+ Orchestrates the entire processor lifecycle including:
5
+ - Configuration loading
6
+ - Health server startup
7
+ - Signal handling
8
+ - State restoration
9
+ - Processor lifecycle management
10
+ - Graceful shutdown
11
+ """
12
+
13
+ import asyncio
14
+ import logging
15
+ import sys
16
+ from typing import Type
17
+
18
+ from dory.core.processor import BaseProcessor
19
+ from dory.core.context import ExecutionContext
20
+ from dory.core.lifecycle import LifecycleManager
21
+ from dory.core.signals import SignalHandler
22
+ from dory.config.loader import ConfigLoader
23
+ from dory.config.schema import DoryConfig
24
+ from dory.health.server import HealthServer
25
+ from dory.migration.state_manager import StateManager
26
+ from dory.recovery.recovery_decision import RecoveryDecisionMaker
27
+ from dory.recovery.restart_detector import RestartDetector
28
+ from dory.logging.logger import setup_logging
29
+ from dory.metrics.collector import MetricsCollector
30
+ from dory.utils.errors import DoryStartupError, DoryStateError
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
class DoryApp:
    """
    Main entry point for Dory processor applications.

    Usage:
        from dory import DoryApp, BaseProcessor

        class MyProcessor(BaseProcessor):
            ...

        if __name__ == '__main__':
            DoryApp().run(MyProcessor)
    """

    def __init__(
        self,
        config_file: str | None = None,
        log_level: str | None = None,
    ):
        """
        Initialize DoryApp.

        Only stores the arguments; every component is created later by
        ``_initialize``.

        Args:
            config_file: Optional path to YAML config file
            log_level: Optional log level override
        """
        self._config_file = config_file
        self._log_level_override = log_level

        # Components (initialized in _initialize)
        self._config: DoryConfig | None = None
        self._context: ExecutionContext | None = None
        self._processor: BaseProcessor | None = None
        self._lifecycle: LifecycleManager | None = None
        self._signals: SignalHandler | None = None
        self._health_server: HealthServer | None = None
        self._state_manager: StateManager | None = None
        self._metrics: MetricsCollector | None = None
        self._restart_detector: RestartDetector | None = None
        self._recovery_decision: RecoveryDecisionMaker | None = None

    def run(self, processor_class: Type[BaseProcessor]) -> None:
        """
        Run the processor application.

        This is the main entry point that blocks until shutdown. It exits
        the interpreter with status 0 on ``KeyboardInterrupt`` and with
        status 1 on any other exception; otherwise it returns normally.

        Args:
            processor_class: Class implementing BaseProcessor
        """
        try:
            asyncio.run(self._run_async(processor_class))
        except KeyboardInterrupt:
            logger.info("Keyboard interrupt received")
            sys.exit(0)
        except Exception as e:
            # exc_info keeps the traceback; the message alone rarely
            # identifies where a fatal error originated.
            logger.error(f"Fatal error: {e}", exc_info=True)
            sys.exit(1)

    async def _run_async(self, processor_class: Type[BaseProcessor]) -> None:
        """
        Async implementation of the run loop.

        Runs initialization, health-server startup and the processor
        lifecycle in order. Cleanup always runs afterwards. Errors are
        logged and re-raised to the caller.

        Args:
            processor_class: Class implementing BaseProcessor
        """
        # Only used for the exit log line below; run() decides the real
        # process exit status.
        exit_code = 0

        try:
            # Phase 1: Initialize SDK components
            await self._initialize(processor_class)

            # Phase 2: Start health server
            await self._start_health_server()

            # Phase 3: Run processor lifecycle
            await self._run_processor_lifecycle()

        except DoryStartupError as e:
            logger.error(f"Startup failed: {e}")
            exit_code = 1
            raise

        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            exit_code = 1
            raise

        finally:
            # Phase 4: Cleanup
            await self._cleanup()
            logger.info(f"DoryApp exiting with code {exit_code}")

    @staticmethod
    def _sdk_version() -> str:
        """Return the installed dory-sdk version, or "unknown" if it cannot be resolved."""
        from importlib import metadata

        try:
            return metadata.version("dory-sdk")
        except metadata.PackageNotFoundError:
            return "unknown"

    async def _initialize(self, processor_class: Type[BaseProcessor]) -> None:
        """
        Initialize all SDK components.

        Loads the configuration, applies the optional log-level override,
        configures logging, builds the execution context and helper
        components, records the detected restart count on the context,
        instantiates the processor and installs the signal handlers.

        Args:
            processor_class: Class implementing BaseProcessor; it is
                instantiated with the execution context.
        """
        logger.debug("Initializing DoryApp components")

        # Load configuration
        config_loader = ConfigLoader(config_file=self._config_file)
        self._config = config_loader.load()

        # Apply log level override if provided. Re-validate instead of
        # assigning the attribute so the override goes through the same
        # checks and normalisation as a value from the file or environment.
        if self._log_level_override:
            self._config = DoryConfig.model_validate(
                {
                    **self._config.model_dump(),
                    "log_level": self._log_level_override,
                }
            )

        # Setup logging
        setup_logging(
            level=self._config.log_level,
            format=self._config.log_format,
        )

        logger.info("Dory SDK initializing", extra={
            "version": self._sdk_version(),
            "config": self._config.model_dump(),
        })

        # Create execution context from environment
        self._context = ExecutionContext.from_environment()

        # Initialize components
        self._lifecycle = LifecycleManager()
        self._signals = SignalHandler()
        self._state_manager = StateManager(
            backend=self._config.state_backend,
            config=self._config,
        )
        self._metrics = MetricsCollector()
        self._restart_detector = RestartDetector()
        self._recovery_decision = RecoveryDecisionMaker()

        # Detect restart count
        restart_info = await self._restart_detector.detect()
        self._context.set_attempt_number(restart_info.restart_count)

        logger.info(
            f"Execution context: pod={self._context.pod_name}, "
            f"processor_id={self._context.processor_id}, "
            f"attempt={self._context.attempt_number}, "
            f"is_migrating={self._context.is_migrating}"
        )

        # Create processor instance
        self._processor = processor_class(self._context)

        # Setup signal handlers
        self._signals.setup(
            shutdown_callback=self._trigger_shutdown,
            snapshot_callback=self._trigger_snapshot,
        )

        # Record startup metric
        self._metrics.record_startup_started()

    async def _start_health_server(self) -> None:
        """Start the health/metrics HTTP server on the configured port."""
        self._health_server = HealthServer(
            port=self._config.health_port,
            metrics_collector=self._metrics,
            state_getter=self._get_processor_state,
            state_restorer=self._restore_processor_state,
            prestop_handler=self._handle_prestop,
        )
        await self._health_server.start()
        logger.info(f"Health server started on port {self._config.health_port}")

    def _get_processor_state(self) -> dict:
        """
        Get processor state for /state GET endpoint (state capture).

        Returns:
            An empty dict when no processor exists yet; otherwise the
            processor state wrapped in the ApplicationState envelope; or
            ``{"error": <message>}`` when capturing the state raised.
        """
        if self._processor is None:
            logger.warning("Processor not initialized, returning empty state")
            return {}

        try:
            import os
            import time

            state = self._processor.get_state()
            # One timestamp for both time fields so they cannot disagree.
            now_utc = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

            # Wrap state in ApplicationState format expected by Orchestrator
            return {
                "pod_name": self._context.pod_name if self._context else "unknown",
                "app_name": os.environ.get("APP_NAME", "dory-processor"),
                "captured_at": now_utc,
                "state_version": "1.0",
                "data": state,
                "metrics": {},
                "connections": [],
                "active_sessions": 0,
                "session_data": {},
                "uptime_seconds": self._metrics.get_uptime_seconds() if self._metrics else 0.0,
                "request_count": self._metrics.get_request_count() if self._metrics else 0,
                "last_health_time": now_utc,
            }
        except Exception as e:
            logger.error(f"Failed to get processor state: {e}")
            return {"error": str(e)}

    async def _restore_processor_state(self, state: dict) -> None:
        """
        Restore processor state from /state POST endpoint (state transfer).

        Args:
            state: Either an ApplicationState envelope (processor data
                under ``"data"``) or the bare processor data itself.

        Raises:
            RuntimeError: If the processor has not been created yet.
        """
        if self._processor is None:
            raise RuntimeError("Processor not initialized, cannot restore state")

        # Extract processor data from ApplicationState format; fall back to
        # the whole payload when there is no "data" key.
        processor_data = state.get("data", state)

        logger.info("Restoring state from transfer", extra={
            "pod_name": state.get("pod_name", "unknown"),
            "state_version": state.get("state_version", "unknown"),
        })

        await self._processor.restore_state(processor_data)

    async def _save_state_snapshot(self, success_message: str, failure_prefix: str) -> None:
        """
        Capture the processor state and persist it via the state manager.

        Failures are logged and never raised, so callers on shutdown paths
        keep going.

        Args:
            success_message: Info message logged after a successful save.
            failure_prefix: Prefix of the error message logged on failure;
                the exception text is appended after ``": "``.
        """
        try:
            state = self._processor.get_state()
            await self._state_manager.save_state(
                processor_id=self._context.processor_id,
                state=state,
            )
            logger.info(success_message)
        except Exception as e:
            logger.error(f"{failure_prefix}: {e}")

    async def _handle_prestop(self) -> None:
        """Handle PreStop hook - prepare for graceful shutdown."""
        logger.info("PreStop hook: initiating graceful shutdown preparation")

        # Signal context that shutdown is coming
        if self._context:
            self._context.request_shutdown()

        # Mark health server as not ready to stop receiving traffic
        if self._health_server:
            self._health_server.mark_not_ready()

        # Save state before pod terminates - this is critical because
        # SIGTERM may arrive after the app has already started exiting
        if self._processor and self._state_manager and self._context:
            await self._save_state_snapshot(
                "State snapshot saved during PreStop",
                "Failed to save state during PreStop",
            )

    async def _run_processor_lifecycle(self) -> None:
        """
        Run the complete processor lifecycle.

        Probes for saved state, asks the recovery decision maker for a
        strategy, runs processor startup, restores state when the strategy
        asks for it and state is available, marks the health server ready
        and then runs the main loop until it returns or is cancelled.

        Raises:
            DoryStateError: If restoring state fails and the processor's
                ``on_state_restore_failed`` hook declines to continue.
        """
        # Check if saved state exists before deciding recovery strategy
        # This is important for detecting pod replacement (new pod, existing state)
        existing_state = None  # stays None when the probe below raises
        try:
            existing_state = await self._state_manager.load_state(
                processor_id=self._context.processor_id,
            )
            if existing_state is not None:
                logger.info("Existing state found in checkpoint")
        except Exception as e:
            logger.debug(f"No existing state found: {e}")
        state_exists = existing_state is not None

        # Determine recovery strategy
        strategy = self._recovery_decision.decide(
            restart_count=self._context.attempt_number,
            is_migrating=self._context.is_migrating,
            state_exists=state_exists,
        )

        logger.info(f"Recovery strategy: {strategy.name}")

        # Load state if needed (may already have it from check above)
        state = None
        if strategy.should_restore_state:
            try:
                if existing_state:
                    state = existing_state
                else:
                    # Probe failed or returned nothing usable: try once more.
                    state = await self._state_manager.load_state(
                        processor_id=self._context.processor_id,
                    )
                if state:
                    logger.info("State loaded from checkpoint")
            except Exception as e:
                logger.warning(f"Failed to load state: {e}")
                state = None

        # Run startup
        await self._lifecycle.run_startup(
            processor=self._processor,
            timeout=self._config.startup_timeout_sec,
        )

        # Restore state if available
        if state:
            try:
                await self._processor.restore_state(state)
                logger.info("State restored successfully")
            except Exception as e:
                logger.error(f"State restore failed: {e}")
                should_continue = await self._processor.on_state_restore_failed(e)
                if not should_continue:
                    raise DoryStateError("State restore failed and recovery declined", cause=e)

        # Mark as ready
        self._health_server.mark_ready()
        self._metrics.record_startup_completed()
        logger.info("Processor ready")

        # Run main loop
        try:
            await self._lifecycle.run_main_loop(
                processor=self._processor,
                context=self._context,
            )
        except asyncio.CancelledError:
            logger.info("Main loop cancelled")

    async def _trigger_shutdown(self) -> None:
        """
        Trigger graceful shutdown sequence.

        Registered as the shutdown callback of the signal handler.
        """
        logger.info("Shutdown triggered")

        # Signal context
        self._context.request_shutdown()

        # Mark health server as not ready
        if self._health_server:
            self._health_server.mark_not_ready()

        # Wait briefly for run() to exit
        await asyncio.sleep(0.5)

        # Run shutdown
        await self._lifecycle.run_shutdown(
            processor=self._processor,
            timeout=self._config.shutdown_timeout_sec,
        )

        # Snapshot state
        await self._save_state_snapshot(
            "State snapshot saved",
            "Failed to save state",
        )

        self._metrics.record_shutdown_completed()

    async def _trigger_snapshot(self) -> None:
        """Trigger state snapshot (SIGUSR1 handler)."""
        logger.info("State snapshot triggered")
        await self._save_state_snapshot(
            "State snapshot saved (debug)",
            "Failed to save state snapshot",
        )

    async def _cleanup(self) -> None:
        """
        Cleanup all components.

        Each step is isolated: this runs from a ``finally`` block, so an
        exception raised here would skip the remaining steps and replace
        whatever error is already propagating. Failures are logged instead.
        """
        logger.debug("Cleaning up DoryApp components")

        # Remove signal handlers
        if self._signals:
            try:
                self._signals.remove_handlers()
            except Exception as e:
                logger.error(f"Failed to remove signal handlers: {e}")

        # Stop health server
        if self._health_server:
            try:
                await self._health_server.stop()
            except Exception as e:
                logger.error(f"Failed to stop health server: {e}")

        # Flush metrics
        if self._metrics:
            try:
                self._metrics.flush()
            except Exception as e:
                logger.error(f"Failed to flush metrics: {e}")

        # Flush logs
        logging.shutdown()