edda-framework 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
edda/hooks.py ADDED
@@ -0,0 +1,284 @@
"""
Hook system for extending Edda with custom observability and monitoring.

This module provides a Protocol-based hook system that allows users to integrate
their own observability tools (Logfire, Datadog, Jaeger, etc.) without coupling
the framework to any specific tool.

Example:
    >>> from edda.hooks import WorkflowHooks
    >>>
    >>> class MyHooks(WorkflowHooks):
    ...     async def on_workflow_start(self, instance_id, workflow_name, input_data):
    ...         print(f"Workflow {workflow_name} started: {instance_id}")
    ...
    ...     async def on_activity_complete(self, instance_id, step, activity_name, result, cache_hit):
    ...         print(f"Activity {activity_name} completed (cache_hit={cache_hit})")
    >>>
    >>> app = EddaApp(service_name="my-service", db_url="...", hooks=MyHooks())
"""

from __future__ import annotations

from abc import ABC
from typing import Any, Protocol


class WorkflowHooks(Protocol):
    """
    Protocol for workflow lifecycle hooks.

    Users can implement this protocol to add custom observability, logging,
    or monitoring to their workflows. All methods are optional - implement
    only the ones you need.

    The framework will check if a hook method exists before calling it, so
    partial implementations are fully supported.
    """

    async def on_workflow_start(
        self, instance_id: str, workflow_name: str, input_data: dict[str, Any]
    ) -> None:
        """
        Called when a workflow starts execution.

        Args:
            instance_id: Unique workflow instance ID
            workflow_name: Name of the workflow function
            input_data: Input parameters passed to the workflow
        """
        ...

    async def on_workflow_complete(self, instance_id: str, workflow_name: str, result: Any) -> None:
        """
        Called when a workflow completes successfully.

        Args:
            instance_id: Unique workflow instance ID
            workflow_name: Name of the workflow function
            result: Return value from the workflow
        """
        ...

    async def on_workflow_failed(
        self, instance_id: str, workflow_name: str, error: Exception
    ) -> None:
        """
        Called when a workflow fails with an exception.

        Args:
            instance_id: Unique workflow instance ID
            workflow_name: Name of the workflow function
            error: Exception that caused the failure
        """
        ...

    async def on_workflow_cancelled(self, instance_id: str, workflow_name: str) -> None:
        """
        Called when a workflow is cancelled.

        Args:
            instance_id: Unique workflow instance ID
            workflow_name: Name of the workflow function
        """
        ...

    async def on_activity_start(
        self,
        instance_id: str,
        step: int,
        activity_name: str,
        is_replaying: bool,
    ) -> None:
        """
        Called before an activity executes.

        Args:
            instance_id: Unique workflow instance ID
            step: Step number in the workflow
            activity_name: Name of the activity function
            is_replaying: True if this is a replay (cached result)
        """
        ...

    async def on_activity_complete(
        self,
        instance_id: str,
        step: int,
        activity_name: str,
        result: Any,
        cache_hit: bool,
    ) -> None:
        """
        Called after an activity completes successfully.

        Args:
            instance_id: Unique workflow instance ID
            step: Step number in the workflow
            activity_name: Name of the activity function
            result: Return value from the activity
            cache_hit: True if result was retrieved from cache (replay)
        """
        ...

    async def on_activity_failed(
        self,
        instance_id: str,
        step: int,
        activity_name: str,
        error: Exception,
    ) -> None:
        """
        Called when an activity fails with an exception.

        Args:
            instance_id: Unique workflow instance ID
            step: Step number in the workflow
            activity_name: Name of the activity function
            error: Exception that caused the failure
        """
        ...

    async def on_activity_retry(
        self,
        instance_id: str,
        activity_id: str,
        activity_name: str,
        error: Exception,
        attempt: int,
        delay: float,
    ) -> None:
        """
        Called when an activity is about to be retried after a failure.

        This hook is called BEFORE the retry delay (asyncio.sleep), allowing
        observability tools to track retry attempts in real-time.

        Args:
            instance_id: Unique workflow instance ID
            activity_id: Activity ID (e.g., "my_activity:1")
            activity_name: Name of the activity function
            error: Exception that caused the failure
            attempt: Current attempt number (1-indexed, before retry)
            delay: Backoff delay in seconds before the next retry
        """
        ...

    async def on_event_sent(
        self,
        event_type: str,
        event_source: str,
        event_data: dict[str, Any],
    ) -> None:
        """
        Called when an event is sent (transactional outbox).

        Args:
            event_type: CloudEvents type
            event_source: CloudEvents source
            event_data: Event payload
        """
        ...

    async def on_event_received(
        self,
        instance_id: str,
        event_type: str,
        event_data: dict[str, Any],
    ) -> None:
        """
        Called when a workflow receives an awaited event.

        Args:
            instance_id: Unique workflow instance ID
            event_type: CloudEvents type
            event_data: Event payload
        """
        ...


# Base class for convenient partial implementations
class HooksBase(WorkflowHooks, ABC):
    """
    Abstract base class for WorkflowHooks implementations.

    This can be used as a base class for partial implementations,
    so you don't have to implement all methods.

    Example:
        >>> class MyHooks(HooksBase):
        ...     async def on_workflow_start(self, instance_id, workflow_name, input_data):
        ...         print(f"Workflow started: {workflow_name}")
        ...     # Other methods are no-ops (inherited from HooksBase)
    """

    async def on_workflow_start(
        self, instance_id: str, workflow_name: str, input_data: dict[str, Any]
    ) -> None:
        pass

    async def on_workflow_complete(self, instance_id: str, workflow_name: str, result: Any) -> None:
        pass

    async def on_workflow_failed(
        self, instance_id: str, workflow_name: str, error: Exception
    ) -> None:
        pass

    async def on_workflow_cancelled(self, instance_id: str, workflow_name: str) -> None:
        pass

    async def on_activity_start(
        self,
        instance_id: str,
        step: int,
        activity_name: str,
        is_replaying: bool,
    ) -> None:
        pass

    async def on_activity_complete(
        self,
        instance_id: str,
        step: int,
        activity_name: str,
        result: Any,
        cache_hit: bool,
    ) -> None:
        pass

    async def on_activity_failed(
        self,
        instance_id: str,
        step: int,
        activity_name: str,
        error: Exception,
    ) -> None:
        pass

    async def on_activity_retry(
        self,
        instance_id: str,
        activity_id: str,
        activity_name: str,
        error: Exception,
        attempt: int,
        delay: float,
    ) -> None:
        pass

    async def on_event_sent(
        self,
        event_type: str,
        event_source: str,
        event_data: dict[str, Any],
    ) -> None:
        pass

    async def on_event_received(
        self,
        instance_id: str,
        event_type: str,
        event_data: dict[str, Any],
    ) -> None:
        pass
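A minimal usage sketch (not part of the package) for the hooks above: subclass HooksBase, override only the callbacks you need, and pass the instance to EddaApp as shown in the module docstring. The logger name and the commented-out db_url value are illustrative placeholders, not values defined by Edda.

import logging

from edda.hooks import HooksBase

logger = logging.getLogger("my_service.workflows")  # placeholder logger name


class LoggingHooks(HooksBase):
    """Partial implementation: unimplemented hooks stay no-ops via HooksBase."""

    async def on_workflow_start(self, instance_id, workflow_name, input_data):
        logger.info("workflow %s started (instance=%s)", workflow_name, instance_id)

    async def on_activity_retry(
        self, instance_id, activity_id, activity_name, error, attempt, delay
    ):
        logger.warning(
            "activity %s failed on attempt %d, retrying in %.1fs: %s",
            activity_name, attempt, delay, error,
        )

    async def on_workflow_failed(self, instance_id, workflow_name, error):
        logger.error("workflow %s failed (instance=%s): %s", workflow_name, instance_id, error)


# Wiring follows the constructor shown in the module docstring; the db_url here
# is an illustrative placeholder:
# app = EddaApp(service_name="my-service", db_url="...", hooks=LoggingHooks())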
edda/locking.py ADDED
@@ -0,0 +1,322 @@
"""
Distributed locking utilities for Edda framework.

This module provides helper functions and context managers for working with
distributed locks in multi-pod deployments.
"""

import asyncio
import os
import uuid
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager, suppress
from typing import Any

from edda.storage.protocol import StorageProtocol


def generate_worker_id(service_name: str) -> str:
    """
    Generate a unique worker ID for this process.

    The worker ID combines the service name, process ID, and a random UUID
    to ensure uniqueness across pods and restarts.

    Args:
        service_name: Name of the service (e.g., "order-service")

    Returns:
        Unique worker ID (e.g., "order-service-12345-a1b2c3d4")
    """
    pid = os.getpid()
    unique_id = uuid.uuid4().hex[:8]
    return f"{service_name}-{pid}-{unique_id}"


async def acquire_lock_with_retry(
    storage: StorageProtocol,
    instance_id: str,
    worker_id: str,
    max_retries: int = 3,
    retry_delay: float = 0.1,
    timeout_seconds: int = 30,
) -> bool:
    """
    Try to acquire lock with retries.

    This is useful in high-contention scenarios where multiple workers
    are trying to acquire the same lock simultaneously.

    Args:
        storage: Storage backend
        instance_id: Workflow instance to lock
        worker_id: Unique worker identifier
        max_retries: Maximum number of retry attempts
        retry_delay: Delay between retries in seconds
        timeout_seconds: Lock timeout in seconds

    Returns:
        True if lock was acquired, False otherwise
    """
    for attempt in range(max_retries):
        if await storage.try_acquire_lock(instance_id, worker_id, timeout_seconds):
            return True

        if attempt < max_retries - 1:
            await asyncio.sleep(retry_delay)

    return False


async def ensure_lock_held(
    storage: StorageProtocol,
    instance_id: str,
    worker_id: str,
) -> None:
    """
    Verify that we still hold the lock for a workflow instance.

    Raises RuntimeError if the lock is not held by this worker.

    Args:
        storage: Storage backend
        instance_id: Workflow instance
        worker_id: Unique worker identifier

    Raises:
        RuntimeError: If lock is not held by this worker
    """
    instance = await storage.get_instance(instance_id)
    if instance is None:
        raise RuntimeError(f"Workflow instance {instance_id} not found")

    if instance.get("locked_by") != worker_id:
        raise RuntimeError(
            f"Lock lost for instance {instance_id}. "
            f"Current lock holder: {instance.get('locked_by')}"
        )


@asynccontextmanager
async def workflow_lock(
    storage: StorageProtocol,
    instance_id: str,
    worker_id: str,
    timeout_seconds: int = 300,
    refresh_interval: float | None = None,
) -> AsyncIterator[None]:
    """
    Context manager for acquiring and releasing workflow locks.

    Automatically refreshes the lock periodically if refresh_interval is provided.

    Example:
        >>> async with workflow_lock(storage, instance_id, worker_id):
        ...     # Execute workflow
        ...     pass

    Args:
        storage: Storage backend
        instance_id: Workflow instance to lock
        worker_id: Unique worker identifier
        timeout_seconds: Lock timeout in seconds
        refresh_interval: Optional interval for lock refresh (seconds)

    Yields:
        None (lock is held during context)

    Raises:
        RuntimeError: If lock cannot be acquired
    """
    # Try to acquire lock
    acquired = await storage.try_acquire_lock(instance_id, worker_id, timeout_seconds)
    if not acquired:
        raise RuntimeError(f"Failed to acquire lock for instance {instance_id}")

    refresh_task: asyncio.Task[Any] | None = None

    try:
        # Start lock refresh task if requested
        if refresh_interval is not None:
            refresh_task = asyncio.create_task(
                _refresh_lock_periodically(storage, instance_id, worker_id, refresh_interval)
            )

        yield

    finally:
        # Cancel refresh task
        if refresh_task is not None:
            refresh_task.cancel()
            with suppress(asyncio.CancelledError):
                await refresh_task

        # Release lock
        await storage.release_lock(instance_id, worker_id)


async def _refresh_lock_periodically(
    storage: StorageProtocol,
    instance_id: str,
    worker_id: str,
    interval: float,
) -> None:
    """
    Periodically refresh a lock to prevent timeout.

    This is a background task that runs while a workflow is executing.

    Args:
        storage: Storage backend
        instance_id: Workflow instance
        worker_id: Unique worker identifier
        interval: Refresh interval in seconds
    """
    with suppress(asyncio.CancelledError):
        while True:
            await asyncio.sleep(interval)

            # Refresh the lock
            success = await storage.refresh_lock(instance_id, worker_id)
            if not success:
                # Lock was lost - this shouldn't happen in normal operation
                raise RuntimeError(
                    f"Lost lock for instance {instance_id} during refresh. "
                    "Workflow execution may be compromised."
                )


async def cleanup_stale_locks_periodically(
    storage: StorageProtocol,
    interval: int = 60,
) -> None:
    """
    Background task to periodically clean up stale locks.

    This should be run as a background task in your application to ensure
    that locks from crashed workers don't block workflows indefinitely.

    Note: This function only cleans up locks without resuming workflows.
    For automatic workflow resumption, use auto_resume_stale_workflows_periodically().

    Example:
        >>> asyncio.create_task(
        ...     cleanup_stale_locks_periodically(storage, interval=60)
        ... )

    Args:
        storage: Storage backend
        interval: Cleanup interval in seconds (default: 60)
    """
    with suppress(asyncio.CancelledError):
        while True:
            await asyncio.sleep(interval)

            # Clean up stale locks
            workflows = await storage.cleanup_stale_locks()

            if len(workflows) > 0:
                # Log cleanup (in a real implementation, use proper logging)
                print(f"Cleaned up {len(workflows)} stale locks")


async def auto_resume_stale_workflows_periodically(
    storage: StorageProtocol,
    replay_engine: Any,
    interval: int = 60,
) -> None:
    """
    Background task to periodically clean up stale locks and auto-resume workflows.

    This combines lock cleanup with automatic workflow resumption, ensuring
    that workflows interrupted by worker crashes are automatically recovered.

    Example:
        >>> asyncio.create_task(
        ...     auto_resume_stale_workflows_periodically(
        ...         storage, replay_engine, interval=60
        ...     )
        ... )

    Args:
        storage: Storage backend
        replay_engine: ReplayEngine instance for resuming workflows
        interval: Cleanup interval in seconds (default: 60)
    """
    with suppress(asyncio.CancelledError):
        while True:
            await asyncio.sleep(interval)

            # Clean up stale locks and get workflows to resume
            workflows_to_resume = await storage.cleanup_stale_locks()

            if len(workflows_to_resume) > 0:
                # Log cleanup (in a real implementation, use proper logging)
                print(f"Cleaned up {len(workflows_to_resume)} stale locks")

            # Auto-resume workflows
            for workflow in workflows_to_resume:
                instance_id = workflow["instance_id"]
                workflow_name = workflow["workflow_name"]
                source_hash = workflow["source_hash"]
                status = workflow.get("status", "running")

                try:
                    # Special handling for workflows in compensating state
                    if status == "compensating":
                        # Workflow crashed during compensation execution
                        # Only re-execute compensations, don't run workflow function
                        print(
                            f"Auto-resuming compensating workflow: {instance_id} "
                            f"(compensation recovery only, no workflow execution)"
                        )
                        success = await replay_engine.resume_compensating_workflow(instance_id)
                        if success:
                            print(f"Successfully completed compensations for: {instance_id}")
                        else:
                            print(f"Failed to complete compensations for: {instance_id}")
                        continue

                    # Normal workflow resumption (status='running')
                    # Check if workflow definition matches current Saga registry
                    # This prevents resuming workflows with outdated/incompatible code
                    current_definition = await storage.get_current_workflow_definition(
                        workflow_name
                    )

                    if current_definition is None:
                        print(
                            f"Skipping auto-resume for {instance_id}: "
                            f"workflow '{workflow_name}' not found in registry"
                        )
                        continue

                    if current_definition["source_hash"] != source_hash:
                        print(
                            f"Skipping auto-resume for {instance_id}: "
                            f"workflow definition has changed "
                            f"(old hash: {source_hash[:8]}..., "
                            f"new hash: {current_definition['source_hash'][:8]}...)"
                        )
                        continue

                    # Hash matches - safe to resume
                    print(f"Auto-resuming workflow: {workflow_name} (instance: {instance_id})")
                    await replay_engine.resume_by_name(instance_id, workflow_name)
                    print(f"Successfully resumed workflow: {instance_id}")
                except Exception as e:
                    # Log error but continue with other workflows
                    # In a real implementation, use proper logging
                    print(f"Failed to auto-resume workflow {instance_id}: {e}")


class LockNotAcquiredError(Exception):
    """Raised when a lock cannot be acquired."""

    pass


class LockLostError(Exception):
    """Raised when a lock is unexpectedly lost during execution."""

    pass
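A hedged usage sketch for the locking helpers above: acquire and hold a per-instance lock around workflow execution, letting workflow_lock handle refresh and release. Here `storage` is assumed to be any StorageProtocol implementation obtained elsewhere, and `run_workflow` is a hypothetical caller-supplied coroutine, not part of Edda.

from collections.abc import Awaitable, Callable

from edda.locking import generate_worker_id, workflow_lock
from edda.storage.protocol import StorageProtocol


async def execute_with_lock(
    storage: StorageProtocol,
    instance_id: str,
    run_workflow: Callable[[str], Awaitable[None]],  # hypothetical execution callback
) -> None:
    # Worker ID is unique per process, e.g. "order-service-12345-a1b2c3d4"
    worker_id = generate_worker_id("order-service")

    # Hold the per-instance lock for the duration of execution; the context manager
    # refreshes it every 30s so a long run does not exceed the 300s lock timeout,
    # and releases it on exit.
    async with workflow_lock(
        storage, instance_id, worker_id, timeout_seconds=300, refresh_interval=30.0
    ):
        await run_workflow(instance_id)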
@@ -0,0 +1,15 @@
"""
Transactional Outbox Pattern Implementation.

This module provides reliable event publishing using the transactional outbox pattern.
Events are first written to the database, then asynchronously published by a background
relayer process.
"""

from edda.outbox.relayer import OutboxRelayer
from edda.outbox.transactional import send_event_transactional

__all__ = [
    "OutboxRelayer",
    "send_event_transactional",
]
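The docstring above summarizes the transactional outbox pattern. A generic sketch of the idea follows; it is not Edda's actual API, and the table name, columns, and connection/publish interfaces are assumptions for illustration only.

import json
from collections.abc import Awaitable, Callable
from typing import Any


async def write_event_in_same_tx(conn: Any, event_type: str, data: dict[str, Any]) -> None:
    # Step 1: inside the same database transaction as the business state change,
    # append the event to an outbox table instead of publishing it directly.
    await conn.execute(
        "INSERT INTO outbox_events (event_type, payload) VALUES ($1, $2)",
        event_type,
        json.dumps(data),
    )


async def relay_once(conn: Any, publish: Callable[[str, dict[str, Any]], Awaitable[None]]) -> None:
    # Step 2: a background relayer (the role OutboxRelayer plays in Edda) reads
    # pending rows, publishes them to the broker, and marks them as sent.
    rows = await conn.fetch(
        "SELECT id, event_type, payload FROM outbox_events WHERE sent = FALSE"
    )
    for row in rows:
        await publish(row["event_type"], json.loads(row["payload"]))
        await conn.execute("UPDATE outbox_events SET sent = TRUE WHERE id = $1", row["id"])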