edda-framework 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edda/__init__.py +56 -0
- edda/activity.py +505 -0
- edda/app.py +996 -0
- edda/compensation.py +326 -0
- edda/context.py +489 -0
- edda/events.py +505 -0
- edda/exceptions.py +64 -0
- edda/hooks.py +284 -0
- edda/locking.py +322 -0
- edda/outbox/__init__.py +15 -0
- edda/outbox/relayer.py +274 -0
- edda/outbox/transactional.py +112 -0
- edda/pydantic_utils.py +316 -0
- edda/replay.py +799 -0
- edda/retry.py +207 -0
- edda/serialization/__init__.py +9 -0
- edda/serialization/base.py +83 -0
- edda/serialization/json.py +102 -0
- edda/storage/__init__.py +9 -0
- edda/storage/models.py +194 -0
- edda/storage/protocol.py +737 -0
- edda/storage/sqlalchemy_storage.py +1809 -0
- edda/viewer_ui/__init__.py +20 -0
- edda/viewer_ui/app.py +1399 -0
- edda/viewer_ui/components.py +1105 -0
- edda/viewer_ui/data_service.py +880 -0
- edda/visualizer/__init__.py +11 -0
- edda/visualizer/ast_analyzer.py +383 -0
- edda/visualizer/mermaid_generator.py +355 -0
- edda/workflow.py +218 -0
- edda_framework-0.1.0.dist-info/METADATA +748 -0
- edda_framework-0.1.0.dist-info/RECORD +35 -0
- edda_framework-0.1.0.dist-info/WHEEL +4 -0
- edda_framework-0.1.0.dist-info/entry_points.txt +2 -0
- edda_framework-0.1.0.dist-info/licenses/LICENSE +21 -0
edda/hooks.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hook system for extending Edda with custom observability and monitoring.
|
|
3
|
+
|
|
4
|
+
This module provides a Protocol-based hook system that allows users to integrate
|
|
5
|
+
their own observability tools (Logfire, Datadog, Jaeger, etc.) without coupling
|
|
6
|
+
the framework to any specific tool.
|
|
7
|
+
|
|
8
|
+
Example:
|
|
9
|
+
>>> from edda.hooks import WorkflowHooks
|
|
10
|
+
>>>
|
|
11
|
+
>>> class MyHooks(WorkflowHooks):
|
|
12
|
+
... async def on_workflow_start(self, instance_id, workflow_name, input_data):
|
|
13
|
+
... print(f"Workflow {workflow_name} started: {instance_id}")
|
|
14
|
+
...
|
|
15
|
+
... async def on_activity_complete(self, instance_id, step, activity_name, result, cache_hit):
|
|
16
|
+
... print(f"Activity {activity_name} completed (cache_hit={cache_hit})")
|
|
17
|
+
>>>
|
|
18
|
+
>>> app = EddaApp(service_name="my-service", db_url="...", hooks=MyHooks())
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from abc import ABC
|
|
24
|
+
from typing import Any, Protocol
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class WorkflowHooks(Protocol):
    """
    Structural interface for workflow lifecycle hooks.

    Implement this protocol to plug custom observability, logging, or
    monitoring into workflow execution. Every hook is optional: the
    framework checks that a hook method exists before calling it, so
    partial implementations are fully supported.
    """

    async def on_workflow_start(
        self, instance_id: str, workflow_name: str, input_data: dict[str, Any]
    ) -> None:
        """Invoked when a workflow starts execution.

        Args:
            instance_id: Unique workflow instance ID.
            workflow_name: Name of the workflow function.
            input_data: Input parameters passed to the workflow.
        """
        ...

    async def on_workflow_complete(self, instance_id: str, workflow_name: str, result: Any) -> None:
        """Invoked when a workflow completes successfully.

        Args:
            instance_id: Unique workflow instance ID.
            workflow_name: Name of the workflow function.
            result: Return value from the workflow.
        """
        ...

    async def on_workflow_failed(
        self, instance_id: str, workflow_name: str, error: Exception
    ) -> None:
        """Invoked when a workflow fails with an exception.

        Args:
            instance_id: Unique workflow instance ID.
            workflow_name: Name of the workflow function.
            error: Exception that caused the failure.
        """
        ...

    async def on_workflow_cancelled(self, instance_id: str, workflow_name: str) -> None:
        """Invoked when a workflow is cancelled.

        Args:
            instance_id: Unique workflow instance ID.
            workflow_name: Name of the workflow function.
        """
        ...

    async def on_activity_start(
        self,
        instance_id: str,
        step: int,
        activity_name: str,
        is_replaying: bool,
    ) -> None:
        """Invoked just before an activity executes.

        Args:
            instance_id: Unique workflow instance ID.
            step: Step number in the workflow.
            activity_name: Name of the activity function.
            is_replaying: True if this is a replay (cached result).
        """
        ...

    async def on_activity_complete(
        self,
        instance_id: str,
        step: int,
        activity_name: str,
        result: Any,
        cache_hit: bool,
    ) -> None:
        """Invoked after an activity completes successfully.

        Args:
            instance_id: Unique workflow instance ID.
            step: Step number in the workflow.
            activity_name: Name of the activity function.
            result: Return value from the activity.
            cache_hit: True if the result came from the replay cache.
        """
        ...

    async def on_activity_failed(
        self,
        instance_id: str,
        step: int,
        activity_name: str,
        error: Exception,
    ) -> None:
        """Invoked when an activity fails with an exception.

        Args:
            instance_id: Unique workflow instance ID.
            step: Step number in the workflow.
            activity_name: Name of the activity function.
            error: Exception that caused the failure.
        """
        ...

    async def on_activity_retry(
        self,
        instance_id: str,
        activity_id: str,
        activity_name: str,
        error: Exception,
        attempt: int,
        delay: float,
    ) -> None:
        """Invoked when an activity is about to be retried after a failure.

        Called BEFORE the retry backoff sleep, so observability tools can
        record retry attempts in real time.

        Args:
            instance_id: Unique workflow instance ID.
            activity_id: Activity ID (e.g., "my_activity:1").
            activity_name: Name of the activity function.
            error: Exception that caused the failure.
            attempt: Current attempt number (1-indexed, before retry).
            delay: Backoff delay in seconds before the next retry.
        """
        ...

    async def on_event_sent(
        self,
        event_type: str,
        event_source: str,
        event_data: dict[str, Any],
    ) -> None:
        """Invoked when an event is sent via the transactional outbox.

        Args:
            event_type: CloudEvents type.
            event_source: CloudEvents source.
            event_data: Event payload.
        """
        ...

    async def on_event_received(
        self,
        instance_id: str,
        event_type: str,
        event_data: dict[str, Any],
    ) -> None:
        """Invoked when a workflow receives an awaited event.

        Args:
            instance_id: Unique workflow instance ID.
            event_type: CloudEvents type.
            event_data: Event payload.
        """
        ...
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# No-op base class: subclass this when you only want to override a few hooks.
class HooksBase(WorkflowHooks, ABC):
    """
    Convenience base class for partial WorkflowHooks implementations.

    Every hook defined here is a no-op, so a subclass only needs to
    override the hooks it actually cares about.

    Example:
        >>> class MyHooks(HooksBase):
        ...     async def on_workflow_start(self, instance_id, workflow_name, input_data):
        ...         print(f"Workflow started: {workflow_name}")
        ...     # Other methods are no-ops (inherited from HooksBase)
    """

    async def on_workflow_start(
        self, instance_id: str, workflow_name: str, input_data: dict[str, Any]
    ) -> None:
        # Intentionally a no-op; override in subclasses.
        pass

    async def on_workflow_complete(self, instance_id: str, workflow_name: str, result: Any) -> None:
        pass

    async def on_workflow_failed(
        self, instance_id: str, workflow_name: str, error: Exception
    ) -> None:
        pass

    async def on_workflow_cancelled(self, instance_id: str, workflow_name: str) -> None:
        pass

    async def on_activity_start(
        self, instance_id: str, step: int, activity_name: str, is_replaying: bool
    ) -> None:
        pass

    async def on_activity_complete(
        self, instance_id: str, step: int, activity_name: str, result: Any, cache_hit: bool
    ) -> None:
        pass

    async def on_activity_failed(
        self, instance_id: str, step: int, activity_name: str, error: Exception
    ) -> None:
        pass

    async def on_activity_retry(
        self,
        instance_id: str,
        activity_id: str,
        activity_name: str,
        error: Exception,
        attempt: int,
        delay: float,
    ) -> None:
        pass

    async def on_event_sent(
        self, event_type: str, event_source: str, event_data: dict[str, Any]
    ) -> None:
        pass

    async def on_event_received(
        self, instance_id: str, event_type: str, event_data: dict[str, Any]
    ) -> None:
        pass
edda/locking.py
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Distributed locking utilities for Edda framework.
|
|
3
|
+
|
|
4
|
+
This module provides helper functions and context managers for working with
|
|
5
|
+
distributed locks in multi-pod deployments.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import os
|
|
10
|
+
import uuid
|
|
11
|
+
from collections.abc import AsyncIterator
|
|
12
|
+
from contextlib import asynccontextmanager, suppress
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from edda.storage.protocol import StorageProtocol
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def generate_worker_id(service_name: str) -> str:
    """
    Build a process-unique worker identifier.

    Combines the service name, the current process ID, and a short random
    UUID fragment so IDs stay unique across pods and across restarts of
    the same pod.

    Args:
        service_name: Name of the service (e.g., "order-service")

    Returns:
        Unique worker ID (e.g., "order-service-12345-a1b2c3d4")
    """
    # 8 hex chars of a UUID4 is enough entropy to avoid collisions here.
    random_suffix = uuid.uuid4().hex[:8]
    return f"{service_name}-{os.getpid()}-{random_suffix}"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
async def acquire_lock_with_retry(
    storage: StorageProtocol,
    instance_id: str,
    worker_id: str,
    max_retries: int = 3,
    retry_delay: float = 0.1,
    timeout_seconds: int = 30,
) -> bool:
    """
    Attempt to acquire a workflow lock, retrying on contention.

    Useful in high-contention scenarios where several workers race for
    the same lock at once. Sleeps ``retry_delay`` between attempts, but
    not after the final failed attempt.

    Args:
        storage: Storage backend
        instance_id: Workflow instance to lock
        worker_id: Unique worker identifier
        max_retries: Maximum number of retry attempts
        retry_delay: Delay between retries in seconds
        timeout_seconds: Lock timeout in seconds

    Returns:
        True if lock was acquired, False otherwise
    """
    attempts_left = max_retries
    while attempts_left > 0:
        acquired = await storage.try_acquire_lock(instance_id, worker_id, timeout_seconds)
        if acquired:
            return True
        attempts_left -= 1
        # Back off before the next attempt (skip the sleep after the last one).
        if attempts_left:
            await asyncio.sleep(retry_delay)
    return False
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
async def ensure_lock_held(
    storage: StorageProtocol,
    instance_id: str,
    worker_id: str,
) -> None:
    """
    Assert that this worker still holds the lock for a workflow instance.

    Args:
        storage: Storage backend
        instance_id: Workflow instance
        worker_id: Unique worker identifier

    Raises:
        RuntimeError: If the instance does not exist or the lock is held
            by a different worker (or nobody).
    """
    record = await storage.get_instance(instance_id)
    if record is None:
        raise RuntimeError(f"Workflow instance {instance_id} not found")

    holder = record.get("locked_by")
    if holder != worker_id:
        raise RuntimeError(
            f"Lock lost for instance {instance_id}. "
            f"Current lock holder: {holder}"
        )
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@asynccontextmanager
async def workflow_lock(
    storage: StorageProtocol,
    instance_id: str,
    worker_id: str,
    timeout_seconds: int = 300,
    refresh_interval: float | None = None,
) -> AsyncIterator[None]:
    """
    Async context manager that acquires and releases a workflow lock.

    If ``refresh_interval`` is given, a background task keeps refreshing
    the lock while the context body runs, so it cannot time out mid-flight.
    The lock is always released on exit, even if the body raises.

    Example:
        >>> async with workflow_lock(storage, instance_id, worker_id):
        ...     # Execute workflow
        ...     pass

    Args:
        storage: Storage backend
        instance_id: Workflow instance to lock
        worker_id: Unique worker identifier
        timeout_seconds: Lock timeout in seconds
        refresh_interval: Optional interval for lock refresh (seconds)

    Yields:
        None (lock is held during context)

    Raises:
        RuntimeError: If lock cannot be acquired
    """
    # Guard clause: bail out immediately if someone else holds the lock.
    if not await storage.try_acquire_lock(instance_id, worker_id, timeout_seconds):
        raise RuntimeError(f"Failed to acquire lock for instance {instance_id}")

    refresher: asyncio.Task[Any] | None = None
    try:
        if refresh_interval is not None:
            refresher = asyncio.create_task(
                _refresh_lock_periodically(storage, instance_id, worker_id, refresh_interval)
            )
        yield
    finally:
        # Stop the refresher first so it cannot re-extend a lock we are
        # about to release.
        if refresher is not None:
            refresher.cancel()
            with suppress(asyncio.CancelledError):
                await refresher
        await storage.release_lock(instance_id, worker_id)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
async def _refresh_lock_periodically(
    storage: StorageProtocol,
    instance_id: str,
    worker_id: str,
    interval: float,
) -> None:
    """
    Background loop that keeps a lock alive by refreshing it on a timer.

    Runs until cancelled (cancellation is swallowed and treated as a
    normal shutdown) or until a refresh fails.

    Args:
        storage: Storage backend
        instance_id: Workflow instance
        worker_id: Unique worker identifier
        interval: Refresh interval in seconds

    Raises:
        RuntimeError: If the lock is lost between refreshes.
    """
    with suppress(asyncio.CancelledError):
        while True:
            await asyncio.sleep(interval)
            # A failed refresh means another worker stole or expired the
            # lock — abort loudly rather than keep executing unprotected.
            if not await storage.refresh_lock(instance_id, worker_id):
                raise RuntimeError(
                    f"Lost lock for instance {instance_id} during refresh. "
                    "Workflow execution may be compromised."
                )
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
async def cleanup_stale_locks_periodically(
    storage: StorageProtocol,
    interval: int = 60,
) -> None:
    """
    Background loop that periodically removes stale locks.

    Run this as a background task so locks left behind by crashed workers
    don't block workflows forever. Cancellation is swallowed and treated
    as a normal shutdown.

    Note: this only clears locks — it does not resume the affected
    workflows. Use auto_resume_stale_workflows_periodically() for that.

    Example:
        >>> asyncio.create_task(
        ...     cleanup_stale_locks_periodically(storage, interval=60)
        ... )

    Args:
        storage: Storage backend
        interval: Cleanup interval in seconds (default: 60)
    """
    with suppress(asyncio.CancelledError):
        while True:
            await asyncio.sleep(interval)

            released = await storage.cleanup_stale_locks()
            if released:
                # TODO: replace print with structured logging.
                print(f"Cleaned up {len(released)} stale locks")
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
async def auto_resume_stale_workflows_periodically(
    storage: StorageProtocol,
    replay_engine: Any,
    interval: int = 60,
) -> None:
    """
    Background task that cleans up stale locks and auto-resumes the
    affected workflows.

    Every ``interval`` seconds, stale locks are cleared and each returned
    workflow is resumed: workflows stuck in ``compensating`` state get
    compensation-only recovery, while ``running`` workflows are resumed
    normally — but only if their recorded source hash still matches the
    currently registered definition (resuming against changed code would
    make replay nondeterministic). Per-workflow failures are logged and
    skipped so one bad instance cannot stall the loop. Cancellation is
    swallowed and treated as a normal shutdown.

    Example:
        >>> asyncio.create_task(
        ...     auto_resume_stale_workflows_periodically(
        ...         storage, replay_engine, interval=60
        ...     )
        ... )

    Args:
        storage: Storage backend
        replay_engine: ReplayEngine instance for resuming workflows
        interval: Cleanup interval in seconds (default: 60)
    """
    with suppress(asyncio.CancelledError):
        while True:
            await asyncio.sleep(interval)

            # Clean up stale locks and get workflows to resume.
            # NOTE(review): assumes cleanup_stale_locks() returns dicts with
            # "instance_id", "workflow_name", "source_hash" keys — confirm
            # against StorageProtocol.
            workflows_to_resume = await storage.cleanup_stale_locks()

            if len(workflows_to_resume) > 0:
                # Log cleanup (in a real implementation, use proper logging)
                print(f"Cleaned up {len(workflows_to_resume)} stale locks")

            # Auto-resume workflows
            for workflow in workflows_to_resume:
                instance_id = workflow["instance_id"]
                workflow_name = workflow["workflow_name"]
                source_hash = workflow["source_hash"]
                # Missing status defaults to "running" (normal resumption path).
                status = workflow.get("status", "running")

                try:
                    # Special handling for workflows in compensating state
                    if status == "compensating":
                        # Workflow crashed during compensation execution.
                        # Only re-execute compensations, don't run workflow function.
                        print(
                            f"Auto-resuming compensating workflow: {instance_id} "
                            f"(compensation recovery only, no workflow execution)"
                        )
                        success = await replay_engine.resume_compensating_workflow(instance_id)
                        if success:
                            print(f"Successfully completed compensations for: {instance_id}")
                        else:
                            print(f"Failed to complete compensations for: {instance_id}")
                        continue

                    # Normal workflow resumption (status='running').
                    # Check if workflow definition matches current Saga registry.
                    # This prevents resuming workflows with outdated/incompatible code.
                    current_definition = await storage.get_current_workflow_definition(
                        workflow_name
                    )

                    # Workflow no longer registered — cannot safely resume.
                    if current_definition is None:
                        print(
                            f"Skipping auto-resume for {instance_id}: "
                            f"workflow '{workflow_name}' not found in registry"
                        )
                        continue

                    # Code changed since the instance was started — replay
                    # against different source would be nondeterministic.
                    if current_definition["source_hash"] != source_hash:
                        print(
                            f"Skipping auto-resume for {instance_id}: "
                            f"workflow definition has changed "
                            f"(old hash: {source_hash[:8]}..., "
                            f"new hash: {current_definition['source_hash'][:8]}...)"
                        )
                        continue

                    # Hash matches - safe to resume
                    print(f"Auto-resuming workflow: {workflow_name} (instance: {instance_id})")
                    await replay_engine.resume_by_name(instance_id, workflow_name)
                    print(f"Successfully resumed workflow: {instance_id}")
                except Exception as e:
                    # Log error but continue with other workflows
                    # In a real implementation, use proper logging
                    print(f"Failed to auto-resume workflow {instance_id}: {e}")
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
class LockNotAcquiredError(Exception):
    """Signals that a distributed lock could not be obtained."""
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
class LockLostError(Exception):
    """Signals that a held lock was unexpectedly lost mid-execution."""
|
edda/outbox/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Transactional Outbox Pattern Implementation.
|
|
3
|
+
|
|
4
|
+
This module provides reliable event publishing using the transactional outbox pattern.
|
|
5
|
+
Events are first written to the database, then asynchronously published by a background
|
|
6
|
+
relayer process.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from edda.outbox.relayer import OutboxRelayer
|
|
10
|
+
from edda.outbox.transactional import send_event_transactional
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"OutboxRelayer",
|
|
14
|
+
"send_event_transactional",
|
|
15
|
+
]
|