dory-sdk 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. dory/__init__.py +70 -0
  2. dory/auto_instrument.py +142 -0
  3. dory/cli/__init__.py +5 -0
  4. dory/cli/main.py +290 -0
  5. dory/cli/templates.py +333 -0
  6. dory/config/__init__.py +23 -0
  7. dory/config/defaults.py +50 -0
  8. dory/config/loader.py +361 -0
  9. dory/config/presets.py +325 -0
  10. dory/config/schema.py +152 -0
  11. dory/core/__init__.py +27 -0
  12. dory/core/app.py +404 -0
  13. dory/core/context.py +209 -0
  14. dory/core/lifecycle.py +214 -0
  15. dory/core/meta.py +121 -0
  16. dory/core/modes.py +479 -0
  17. dory/core/processor.py +654 -0
  18. dory/core/signals.py +122 -0
  19. dory/decorators.py +142 -0
  20. dory/errors/__init__.py +117 -0
  21. dory/errors/classification.py +362 -0
  22. dory/errors/codes.py +495 -0
  23. dory/health/__init__.py +10 -0
  24. dory/health/probes.py +210 -0
  25. dory/health/server.py +306 -0
  26. dory/k8s/__init__.py +11 -0
  27. dory/k8s/annotation_watcher.py +184 -0
  28. dory/k8s/client.py +251 -0
  29. dory/k8s/pod_metadata.py +182 -0
  30. dory/logging/__init__.py +9 -0
  31. dory/logging/logger.py +175 -0
  32. dory/metrics/__init__.py +7 -0
  33. dory/metrics/collector.py +301 -0
  34. dory/middleware/__init__.py +36 -0
  35. dory/middleware/connection_tracker.py +608 -0
  36. dory/middleware/request_id.py +321 -0
  37. dory/middleware/request_tracker.py +501 -0
  38. dory/migration/__init__.py +11 -0
  39. dory/migration/configmap.py +260 -0
  40. dory/migration/serialization.py +167 -0
  41. dory/migration/state_manager.py +301 -0
  42. dory/monitoring/__init__.py +23 -0
  43. dory/monitoring/opentelemetry.py +462 -0
  44. dory/py.typed +2 -0
  45. dory/recovery/__init__.py +60 -0
  46. dory/recovery/golden_image.py +480 -0
  47. dory/recovery/golden_snapshot.py +561 -0
  48. dory/recovery/golden_validator.py +518 -0
  49. dory/recovery/partial_recovery.py +479 -0
  50. dory/recovery/recovery_decision.py +242 -0
  51. dory/recovery/restart_detector.py +142 -0
  52. dory/recovery/state_validator.py +187 -0
  53. dory/resilience/__init__.py +45 -0
  54. dory/resilience/circuit_breaker.py +454 -0
  55. dory/resilience/retry.py +389 -0
  56. dory/sidecar/__init__.py +6 -0
  57. dory/sidecar/main.py +75 -0
  58. dory/sidecar/server.py +329 -0
  59. dory/simple.py +342 -0
  60. dory/types.py +75 -0
  61. dory/utils/__init__.py +25 -0
  62. dory/utils/errors.py +59 -0
  63. dory/utils/retry.py +115 -0
  64. dory/utils/timeout.py +80 -0
  65. dory_sdk-2.1.0.dist-info/METADATA +663 -0
  66. dory_sdk-2.1.0.dist-info/RECORD +69 -0
  67. dory_sdk-2.1.0.dist-info/WHEEL +5 -0
  68. dory_sdk-2.1.0.dist-info/entry_points.txt +3 -0
  69. dory_sdk-2.1.0.dist-info/top_level.txt +1 -0
dory/health/server.py ADDED
@@ -0,0 +1,306 @@
1
+ """
2
+ Health and metrics HTTP server.
3
+
4
+ Provides endpoints for:
5
+ - /healthz - Liveness probe
6
+ - /ready - Readiness probe (matches Kubernetes convention)
7
+ - /metrics - Prometheus metrics
8
+ - /state - State transfer (GET/POST) for pod migration
9
+ - /prestop - PreStop hook handler for graceful shutdown
10
+ """
11
+
12
+ import logging
13
+ from typing import TYPE_CHECKING, Callable, Awaitable
14
+
15
+ from aiohttp import web
16
+
17
+ from dory.health.probes import LivenessProbe, ReadinessProbe
18
+ from dory.utils.errors import DoryHealthError
19
+
20
+ if TYPE_CHECKING:
21
+ from dory.metrics.collector import MetricsCollector
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # Type aliases for callbacks
26
+ StateGetter = Callable[[], dict]
27
+ StateRestorer = Callable[[dict], Awaitable[None]]
28
+ PreStopHandler = Callable[[], Awaitable[None]]
29
+
30
+
31
class HealthServer:
    """
    HTTP server for health probes, metrics, and state transfer.

    Runs on a separate port from the main application.
    Provides endpoints required by Dory Orchestrator for:
    - Health probes (liveness/readiness)
    - Prometheus metrics
    - State transfer during pod migration
    - PreStop hook for graceful shutdown

    The server is created lazily by ``start()`` and torn down by ``stop()``;
    both are safe to call more than once.
    """

    def __init__(
        self,
        port: int = 8080,
        health_path: str = "/healthz",
        ready_path: str = "/ready",  # Changed from /readyz to match Orchestrator
        metrics_path: str = "/metrics",
        metrics_collector: "MetricsCollector | None" = None,
        state_getter: StateGetter | None = None,
        state_restorer: StateRestorer | None = None,
        prestop_handler: PreStopHandler | None = None,
    ):
        """
        Initialize health server.

        Nothing is bound or started here; call ``start()`` to listen.

        Args:
            port: Port to listen on
            health_path: Path for liveness probe
            ready_path: Path for readiness probe
            metrics_path: Path for Prometheus metrics
            metrics_collector: Optional metrics collector for /metrics endpoint
            state_getter: Callback to get processor state for /state GET
            state_restorer: Callback to restore processor state for /state POST
            prestop_handler: Callback for /prestop PreStop hook
        """
        self._port = port
        self._health_path = health_path
        self._ready_path = ready_path
        self._metrics_path = metrics_path
        self._metrics_collector = metrics_collector
        self._state_getter = state_getter
        self._state_restorer = state_restorer
        self._prestop_handler = prestop_handler

        self._liveness = LivenessProbe()
        self._readiness = ReadinessProbe()

        # Populated by start(), reset to None by stop().
        self._app: web.Application | None = None
        self._runner: web.AppRunner | None = None
        self._site: web.TCPSite | None = None

    @property
    def liveness_probe(self) -> LivenessProbe:
        """Get liveness probe for adding custom checks."""
        return self._liveness

    @property
    def readiness_probe(self) -> ReadinessProbe:
        """Get readiness probe for adding custom checks."""
        return self._readiness

    def mark_ready(self) -> None:
        """Mark the application as ready to receive traffic."""
        self._readiness.mark_ready()

    def mark_not_ready(self) -> None:
        """Mark the application as not ready."""
        self._readiness.mark_not_ready()

    def set_state_getter(self, getter: StateGetter) -> None:
        """Set the callback for getting processor state."""
        self._state_getter = getter

    def set_state_restorer(self, restorer: StateRestorer) -> None:
        """Set the callback for restoring processor state."""
        self._state_restorer = restorer

    def set_prestop_handler(self, handler: PreStopHandler) -> None:
        """Set the callback for PreStop hook."""
        self._prestop_handler = handler

    async def start(self) -> None:
        """
        Start the health server.

        Calling ``start()`` while the server is already running is a no-op,
        so the running runner is never orphaned by a second call.

        Raises:
            DoryHealthError: If server fails to start
        """
        if self._runner is not None:
            logger.debug("Health server already started; ignoring start()")
            return

        runner: web.AppRunner | None = None
        try:
            self._app = web.Application()
            self._setup_routes()

            runner = web.AppRunner(self._app)
            await runner.setup()

            site = web.TCPSite(
                runner,
                host="0.0.0.0",
                port=self._port,
            )
            await site.start()
        except Exception as e:
            # Roll back partial start-up (e.g. the port was already in use)
            # so the runner set up above is not leaked.
            self._app = None
            if runner is not None:
                try:
                    await runner.cleanup()
                except Exception:
                    logger.debug(
                        "Cleanup after failed health server start also failed",
                        exc_info=True,
                    )
            raise DoryHealthError(f"Failed to start health server: {e}", cause=e) from e

        # Publish the runner/site only once the socket is actually listening.
        self._runner = runner
        self._site = site
        logger.info(f"Health server started on port {self._port}")

    async def stop(self) -> None:
        """Stop the health server. Does nothing if it is not running."""
        runner = self._runner
        if runner is None:
            return

        # Drop the references first: even if cleanup raises, the instance no
        # longer points at a half-closed runner and can be started again.
        self._runner = None
        self._site = None
        self._app = None

        await runner.cleanup()
        logger.info("Health server stopped")

    def _setup_routes(self) -> None:
        """Configure HTTP routes on ``self._app`` (must be set by the caller)."""
        router = self._app.router

        router.add_get(self._health_path, self._handle_health)
        router.add_get(self._ready_path, self._handle_ready)
        router.add_get(self._metrics_path, self._handle_metrics)

        # State transfer endpoints (required by Dory Orchestrator)
        router.add_get("/state", self._handle_state_get)
        router.add_post("/state", self._handle_state_post)

        # PreStop hook endpoint (required by Dory Orchestrator)
        router.add_get("/prestop", self._handle_prestop)

        # Root endpoint for basic info
        router.add_get("/", self._handle_root)

    async def _handle_root(self, request: web.Request) -> web.Response:
        """Handle root endpoint: list the service name and available paths."""
        return web.json_response({
            "service": "dory-processor",
            "endpoints": [
                self._health_path,
                self._ready_path,
                self._metrics_path,
                "/state",
                "/prestop",
            ],
        })

    async def _handle_health(self, request: web.Request) -> web.Response:
        """
        Handle liveness probe.

        Returns 200 if alive, 503 if unhealthy.
        """
        result = await self._liveness.check()

        status = 200 if result.healthy else 503
        return web.json_response(result.to_dict(), status=status)

    async def _handle_ready(self, request: web.Request) -> web.Response:
        """
        Handle readiness probe.

        Returns 200 if ready, 503 if not ready.
        """
        result = await self._readiness.check()

        status = 200 if result.healthy else 503
        return web.json_response(result.to_dict(), status=status)

    async def _handle_metrics(self, request: web.Request) -> web.Response:
        """
        Handle Prometheus metrics endpoint.

        Returns metrics in Prometheus text format; a placeholder comment
        (status 200) when no collector is configured, and status 500 with the
        error as a comment line when the export fails.
        """
        if self._metrics_collector is None:
            return web.Response(
                text="# No metrics collector configured\n",
                content_type="text/plain",
            )

        try:
            metrics_text = self._metrics_collector.export_prometheus()
            return web.Response(
                text=metrics_text,
                content_type="text/plain; version=0.0.4",
                charset="utf-8",
            )
        except Exception as e:
            logger.error(f"Error exporting metrics: {e}", exc_info=True)
            return web.Response(
                text=f"# Error exporting metrics: {e}\n",
                content_type="text/plain",
                status=500,
            )

    async def _handle_state_get(self, request: web.Request) -> web.Response:
        """
        Handle GET /state for state capture during migration.

        Called by Dory Orchestrator to capture state from old pod
        before transferring to new pod.

        Returns:
            JSON response with processor state; 503 when no state getter is
            configured, 500 when the getter (or serialization) fails.
        """
        if self._state_getter is None:
            logger.warning("State getter not configured, returning empty state")
            return web.json_response({
                "error": "state_getter not configured",
                "data": {},
            }, status=503)

        try:
            state = self._state_getter()
            logger.info("State captured for transfer", extra={"state_keys": list(state.keys())})
            return web.json_response(state)
        except Exception as e:
            logger.error(f"Failed to capture state: {e}", exc_info=True)
            return web.json_response(
                {"error": f"Failed to capture state: {e}"},
                status=500,
            )

    async def _handle_state_post(self, request: web.Request) -> web.Response:
        """
        Handle POST /state for state restoration during migration.

        Called by Dory Orchestrator to restore state to new pod
        after capturing from old pod.

        Returns:
            JSON response confirming state restoration; 503 when no state
            restorer is configured, 400 when the body is not a JSON object,
            500 when reading the body or restoring the state fails.
        """
        if self._state_restorer is None:
            logger.warning("State restorer not configured")
            return web.json_response({
                "error": "state_restorer not configured",
            }, status=503)

        # Step 1: parse and validate the payload. A malformed body is the
        # caller's fault (400) and must never reach the restorer.
        try:
            state = await request.json()
        except ValueError as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            logger.warning(f"Rejected state restore: invalid JSON body: {e}")
            return web.json_response(
                {"error": f"Invalid JSON body: {e}"},
                status=400,
            )
        except Exception as e:
            logger.error(f"Failed to restore state: {e}", exc_info=True)
            return web.json_response(
                {"error": f"Failed to restore state: {e}"},
                status=500,
            )

        if not isinstance(state, dict):
            logger.warning(
                f"Rejected state restore: expected JSON object, got {type(state).__name__}"
            )
            return web.json_response(
                {"error": "State payload must be a JSON object"},
                status=400,
            )

        # Step 2: hand the validated state to the application.
        try:
            logger.info("Restoring state from transfer", extra={"state_keys": list(state.keys())})
            await self._state_restorer(state)
            logger.info("State restored successfully")
            return web.json_response({"status": "ok", "message": "State restored"})
        except Exception as e:
            logger.error(f"Failed to restore state: {e}", exc_info=True)
            return web.json_response(
                {"error": f"Failed to restore state: {e}"},
                status=500,
            )

    async def _handle_prestop(self, request: web.Request) -> web.Response:
        """
        Handle GET /prestop for PreStop hook.

        Called by Kubernetes PreStop hook before pod termination.
        Allows the application to prepare for graceful shutdown.

        Returns:
            JSON response confirming prestop handling. Always 200: a failing
            prestop handler is logged but does not block termination.
        """
        logger.info("PreStop hook invoked - preparing for shutdown")

        # Mark as not ready to stop receiving new traffic
        self._readiness.mark_not_ready()

        if self._prestop_handler:
            try:
                await self._prestop_handler()
                logger.info("PreStop handler completed")
            except Exception as e:
                logger.error(f"PreStop handler error: {e}", exc_info=True)
                # Continue anyway - don't block shutdown

        return web.json_response({
            "status": "ok",
            "message": "PreStop hook processed, ready for termination",
        })
dory/k8s/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """Kubernetes integration utilities."""
2
+
3
+ from dory.k8s.client import K8sClient
4
+ from dory.k8s.pod_metadata import PodMetadata
5
+ from dory.k8s.annotation_watcher import AnnotationWatcher
6
+
7
+ __all__ = [
8
+ "K8sClient",
9
+ "PodMetadata",
10
+ "AnnotationWatcher",
11
+ ]
dory/k8s/annotation_watcher.py ADDED
@@ -0,0 +1,184 @@
1
+ """
2
+ Annotation watcher for migration signals.
3
+
4
+ Watches pod annotations for migration-related signals
5
+ from the orchestrator.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ from typing import Callable, Any
11
+
12
+ from dory.k8s.client import K8sClient
13
+ from dory.utils.errors import DoryK8sError
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class AnnotationWatcher:
    """
    Watches pod annotations for orchestrator signals.

    Monitors annotations:
    - dory.io/migration: "true" when migration imminent
    - dory.io/shutdown: "true" when shutdown requested
    - dory.io/snapshot: "true" when snapshot requested

    The dory.io/migration-deadline annotation is not a trigger; its latest
    polled value is exposed through ``get_migration_deadline()``.

    Annotations are polled every ``poll_interval`` seconds; a callback fires
    once each time its annotation changes to "true".
    """

    MIGRATION_ANNOTATION = "dory.io/migration"
    SHUTDOWN_ANNOTATION = "dory.io/shutdown"
    SNAPSHOT_ANNOTATION = "dory.io/snapshot"
    DEADLINE_ANNOTATION = "dory.io/migration-deadline"

    def __init__(
        self,
        k8s_client: K8sClient,
        pod_name: str,
        poll_interval: float = 5.0,
    ):
        """
        Initialize annotation watcher.

        Args:
            k8s_client: Kubernetes client
            pod_name: Name of pod to watch
            poll_interval: Seconds between polls
        """
        self._k8s_client = k8s_client
        self._pod_name = pod_name
        self._poll_interval = poll_interval

        self._running = False
        self._watch_task: asyncio.Task | None = None

        # Callbacks
        self._on_migration: Callable[[], Any] | None = None
        self._on_shutdown: Callable[[], Any] | None = None
        self._on_snapshot: Callable[[], Any] | None = None

        # State tracking: annotations as seen by the most recent poll.
        self._last_annotations: dict[str, str] = {}

    def on_migration(self, callback: Callable[[], Any]) -> None:
        """Set callback for migration signal (sync or async callable)."""
        self._on_migration = callback

    def on_shutdown(self, callback: Callable[[], Any]) -> None:
        """Set callback for shutdown signal (sync or async callable)."""
        self._on_shutdown = callback

    def on_snapshot(self, callback: Callable[[], Any]) -> None:
        """Set callback for snapshot signal (sync or async callable)."""
        self._on_snapshot = callback

    async def start(self) -> None:
        """Start watching annotations. No-op if already running."""
        if self._running:
            return

        self._running = True
        self._watch_task = asyncio.create_task(self._watch_loop())
        logger.info(f"Started annotation watcher for pod {self._pod_name}")

    async def stop(self) -> None:
        """Stop watching annotations and wait for the poll task to finish."""
        self._running = False

        if self._watch_task:
            self._watch_task.cancel()
            try:
                await self._watch_task
            except asyncio.CancelledError:
                pass
            self._watch_task = None

        logger.info("Annotation watcher stopped")

    async def _watch_loop(self) -> None:
        """Main watch loop: poll, then sleep, until ``stop()`` is called."""
        while self._running:
            try:
                await self._check_annotations()
            except DoryK8sError as e:
                logger.warning(f"Failed to check annotations: {e}")
            except Exception as e:
                # Keep polling: one bad iteration must not kill the watcher.
                logger.error(f"Unexpected error in annotation watcher: {e}", exc_info=True)

            await asyncio.sleep(self._poll_interval)

    async def _check_annotations(self) -> None:
        """
        Poll the pod's annotations once and dispatch callbacks for new signals.

        The polled annotations are stored before any callback runs, so a
        callback that calls ``get_migration_deadline()`` sees the deadline
        delivered together with the signal, and a failure while dispatching
        cannot make the same signal fire again on the next poll.
        """
        try:
            fetched = await self._k8s_client.get_pod_annotations(self._pod_name)
        except DoryK8sError:
            # Pod might not exist yet or API unavailable
            return

        # Work on a private copy. NOTE(review): presumably the client may
        # return None for a pod without annotations - treated as empty here.
        annotations: dict[str, str] = dict(fetched or {})

        # Evaluate every trigger against the previous poll first...
        migration_requested = self._annotation_changed(
            self.MIGRATION_ANNOTATION, annotations, "true"
        )
        shutdown_requested = self._annotation_changed(
            self.SHUTDOWN_ANNOTATION, annotations, "true"
        )
        snapshot_requested = self._annotation_changed(
            self.SNAPSHOT_ANNOTATION, annotations, "true"
        )

        # ...then publish the new state before handing control to callbacks.
        self._last_annotations = annotations

        if migration_requested:
            logger.info("Migration signal detected")
            if self._on_migration:
                await self._invoke_callback(self._on_migration)

        if shutdown_requested:
            logger.info("Shutdown signal detected")
            if self._on_shutdown:
                await self._invoke_callback(self._on_shutdown)

        if snapshot_requested:
            logger.info("Snapshot signal detected")
            if self._on_snapshot:
                await self._invoke_callback(self._on_snapshot)
                # Clear snapshot annotation after processing.
                # NOTE(review): the request is acknowledged only when a
                # snapshot callback is registered - confirm this matches the
                # orchestrator's expectation.
                if await self._clear_annotation(self.SNAPSHOT_ANNOTATION):
                    # Mirror the removal locally so that a new request made
                    # before the next poll is still seen as a change.
                    self._last_annotations.pop(self.SNAPSHOT_ANNOTATION, None)

    def _annotation_changed(
        self,
        key: str,
        new_annotations: dict[str, str],
        trigger_value: str,
    ) -> bool:
        """
        Check if annotation changed to trigger value.

        Args:
            key: Annotation name to inspect
            new_annotations: Annotations from the current poll
            trigger_value: Value that counts as a signal

        Returns:
            True only when the value differs from the previous poll and now
            equals ``trigger_value``.
        """
        old_value = self._last_annotations.get(key)
        new_value = new_annotations.get(key)

        return old_value != new_value and new_value == trigger_value

    async def _invoke_callback(self, callback: Callable[[], Any]) -> None:
        """
        Invoke callback, handling async/sync.

        The callback is called first and its result awaited when awaitable, so
        async functions, callable objects with an async ``__call__`` and plain
        functions returning a coroutine are all driven to completion.
        Exceptions raised by the callback are logged and swallowed.
        """
        # Local import keeps this class self-contained with respect to the
        # module's import block.
        import inspect

        try:
            outcome = callback()
            if inspect.isawaitable(outcome):
                await outcome
        except Exception as e:
            logger.error(f"Callback error: {e}", exc_info=True)

    async def _clear_annotation(self, key: str) -> bool:
        """
        Clear an annotation after processing.

        Args:
            key: Annotation name to remove from the watched pod

        Returns:
            True if the patch call completed, False if it failed with a
            DoryK8sError (which is logged as a warning).
        """
        try:
            await self._k8s_client.patch_pod_annotations(
                self._pod_name,
                {key: None},  # Setting to None removes the annotation
            )
        except DoryK8sError as e:
            logger.warning(f"Failed to clear annotation {key}: {e}")
            return False
        return True

    def get_migration_deadline(self) -> float | None:
        """
        Get migration deadline from annotations.

        Reads the value captured by the most recent poll; no API call is made.

        Returns:
            Unix timestamp of deadline, or None when the annotation is
            missing, empty, or not parseable as a float
        """
        deadline_str = self._last_annotations.get(self.DEADLINE_ANNOTATION)
        if not deadline_str:
            return None

        try:
            return float(deadline_str)
        except ValueError:
            return None