edda-framework 0.1.0__py3-none-any.whl
- edda/__init__.py +56 -0
- edda/activity.py +505 -0
- edda/app.py +996 -0
- edda/compensation.py +326 -0
- edda/context.py +489 -0
- edda/events.py +505 -0
- edda/exceptions.py +64 -0
- edda/hooks.py +284 -0
- edda/locking.py +322 -0
- edda/outbox/__init__.py +15 -0
- edda/outbox/relayer.py +274 -0
- edda/outbox/transactional.py +112 -0
- edda/pydantic_utils.py +316 -0
- edda/replay.py +799 -0
- edda/retry.py +207 -0
- edda/serialization/__init__.py +9 -0
- edda/serialization/base.py +83 -0
- edda/serialization/json.py +102 -0
- edda/storage/__init__.py +9 -0
- edda/storage/models.py +194 -0
- edda/storage/protocol.py +737 -0
- edda/storage/sqlalchemy_storage.py +1809 -0
- edda/viewer_ui/__init__.py +20 -0
- edda/viewer_ui/app.py +1399 -0
- edda/viewer_ui/components.py +1105 -0
- edda/viewer_ui/data_service.py +880 -0
- edda/visualizer/__init__.py +11 -0
- edda/visualizer/ast_analyzer.py +383 -0
- edda/visualizer/mermaid_generator.py +355 -0
- edda/workflow.py +218 -0
- edda_framework-0.1.0.dist-info/METADATA +748 -0
- edda_framework-0.1.0.dist-info/RECORD +35 -0
- edda_framework-0.1.0.dist-info/WHEEL +4 -0
- edda_framework-0.1.0.dist-info/entry_points.txt +2 -0
- edda_framework-0.1.0.dist-info/licenses/LICENSE +21 -0
edda/replay.py
ADDED
@@ -0,0 +1,799 @@
"""
Replay engine for Edda framework.

This module implements the deterministic replay mechanism with activity result caching,
allowing workflows to resume from where they left off by replaying their
execution history.
"""

import hashlib
import inspect
import logging
import uuid
from collections.abc import Callable
from typing import Any

from edda.compensation import execute_compensations
from edda.context import WorkflowContext
from edda.events import WaitForEventException, WaitForTimerException
from edda.locking import workflow_lock
from edda.pydantic_utils import (
    enum_value_to_enum,
    extract_enum_from_annotation,
    extract_pydantic_model_from_annotation,
    from_json_dict,
    to_json_dict,
)
from edda.storage.protocol import StorageProtocol

logger = logging.getLogger(__name__)


class ReplayEngine:
    """
    Engine for executing and replaying workflows with deterministic behavior.

    The replay engine orchestrates workflow execution, handles lock acquisition,
    loads history for replay, and manages workflow lifecycle.
    """

    def __init__(
        self,
        storage: StorageProtocol,
        service_name: str,
        worker_id: str,
        hooks: Any = None,
        default_retry_policy: Any = None,
    ):
        """
        Initialize the replay engine.

        Args:
            storage: Storage backend
            service_name: Name of the service (e.g., "order-service")
            worker_id: Unique worker ID for this process
            hooks: Optional WorkflowHooks implementation for observability
            default_retry_policy: Default retry policy for all activities (RetryPolicy or None)
        """
        self.storage = storage
        self.service_name = service_name
        self.worker_id = worker_id
        self.hooks = hooks
        self.default_retry_policy = default_retry_policy
    def _prepare_workflow_input(
        self,
        workflow_func: Callable[..., Any],
        input_data: dict[str, Any],
    ) -> dict[str, Any]:
        """
        Prepare workflow input by converting JSON values to Pydantic models and Enums based on type hints.

        Supports:
        - Pydantic models: User
        - Enums: OrderStatus
        - List of Pydantic models: list[OrderItem]

        Args:
            workflow_func: The workflow function
            input_data: Input data from storage (JSON-compatible dicts)

        Returns:
            Processed input data with Pydantic models and Enums restored
        """
        from typing import get_args, get_origin

        sig = inspect.signature(workflow_func)
        processed_input: dict[str, Any] = {}

        # Detect single Pydantic model parameter pattern
        # Count non-ctx parameters
        workflow_params = [
            (name, param) for name, param in sig.parameters.items() if name not in ("ctx", "self")
        ]

        # If there's only one parameter and it's a Pydantic model,
        # and input_data doesn't have that parameter name as a key,
        # assume input_data IS the model data (CloudEvents case)
        if len(workflow_params) == 1:
            param_name, param = workflow_params[0]
            model = extract_pydantic_model_from_annotation(param.annotation)
            if model is not None and param_name not in input_data:
                # input_data = {"order_id": "...", "amount": ...}
                # → processed_input = {"input": PaymentWorkflowInput(...)}
                processed_input[param_name] = from_json_dict(input_data, model)
                return processed_input

        for param_name, param in sig.parameters.items():
            # Skip 'ctx' parameter (WorkflowContext)
            if param_name == "ctx" or param_name == "self":
                continue

            if param_name not in input_data:
                # Parameter not provided in input_data (may have default value)
                continue

            value = input_data[param_name]

            # Check if parameter has Pydantic model type hint
            model = extract_pydantic_model_from_annotation(param.annotation)
            if model is not None and isinstance(value, dict):
                # Restore Pydantic model from JSON dict
                value = from_json_dict(value, model)
            # Check if parameter has Enum type hint
            elif (enum_class := extract_enum_from_annotation(param.annotation)) is not None:
                # Only convert if not already an Enum instance (defensive programming)
                from enum import Enum

                if not isinstance(value, Enum):
                    value = enum_value_to_enum(value, enum_class)
            # Check if parameter is list[PydanticModel]
            elif get_origin(param.annotation) is list:
                args = get_args(param.annotation)
                if args and len(args) == 1:
                    # Check if list element is Pydantic model
                    element_model = extract_pydantic_model_from_annotation(args[0])
                    if element_model is not None and isinstance(value, list):
                        # Convert each dict to Pydantic model
                        value = [
                            from_json_dict(item, element_model) if isinstance(item, dict) else item
                            for item in value
                        ]

            processed_input[param_name] = value

        return processed_input
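    # ------------------------------------------------------------------
    # Illustrative sketch (not part of the published file): what the
    # single-Pydantic-parameter path above produces, using a hypothetical
    # model and workflow signature.
    #
    #     class PaymentInput(BaseModel):
    #         order_id: str
    #         amount: int
    #
    #     async def payment_workflow(ctx, input: PaymentInput): ...
    #
    #     engine._prepare_workflow_input(
    #         payment_workflow, {"order_id": "o-1", "amount": 100}
    #     )
    #     # -> {"input": PaymentInput(order_id="o-1", amount=100)}
    # ------------------------------------------------------------------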
    async def start_workflow(
        self,
        workflow_name: str,
        workflow_func: Callable[..., Any],
        input_data: dict[str, Any],
        lock_timeout_seconds: int | None = None,
    ) -> str:
        """
        Start a new workflow instance.

        Args:
            workflow_name: Name of the workflow
            workflow_func: The workflow function to execute
            input_data: Input parameters for the workflow
            lock_timeout_seconds: Lock timeout for this workflow (None = global default 300s)

        Returns:
            Instance ID of the started workflow
        """
        # Generate instance ID
        instance_id = f"{workflow_name}-{uuid.uuid4().hex}"

        # Extract source code for visualization
        try:
            source_code = inspect.getsource(workflow_func)
        except (OSError, TypeError) as e:
            # inspect.getsource can fail for lambdas, built-ins, REPL functions, etc.
            logger.warning(
                f"Could not extract source code for workflow '{workflow_name}': {e}. "
                "Hybrid diagram visualization will not be available."
            )
            source_code = f"# Source code not available\n# Workflow: {workflow_name}\n# Error: {e}"

        # Calculate source code hash
        source_hash = hashlib.sha256(source_code.encode("utf-8")).hexdigest()

        # Store workflow definition (idempotent)
        await self.storage.upsert_workflow_definition(
            workflow_name=workflow_name,
            source_hash=source_hash,
            source_code=source_code,
        )

        # Create workflow instance in storage
        await self.storage.create_instance(
            instance_id=instance_id,
            workflow_name=workflow_name,
            source_hash=source_hash,
            owner_service=self.service_name,
            input_data=input_data,
            lock_timeout_seconds=lock_timeout_seconds,
        )

        # Execute the workflow with distributed lock
        async with workflow_lock(self.storage, instance_id, self.worker_id):
            # Create context for new execution
            ctx = WorkflowContext(
                instance_id=instance_id,
                workflow_name=workflow_name,
                storage=self.storage,
                worker_id=self.worker_id,
                is_replaying=False,
                hooks=self.hooks,
            )
            # Set default retry policy for activity resolution
            ctx._app_retry_policy = self.default_retry_policy

            try:
                # Call hook: workflow start
                if self.hooks and hasattr(self.hooks, "on_workflow_start"):
                    await self.hooks.on_workflow_start(instance_id, workflow_name, input_data)

                # Prepare input: convert JSON dicts to Pydantic models based on type hints
                processed_input = self._prepare_workflow_input(workflow_func, input_data)

                # Execute workflow function
                result = await workflow_func(ctx, **processed_input)

                # Before marking as completed, check if workflow was cancelled
                instance = await ctx.storage.get_instance(instance_id)
                if instance and instance.get("status") == "cancelled":
                    from edda.exceptions import WorkflowCancelledException

                    raise WorkflowCancelledException(
                        f"Workflow {instance_id} was cancelled"
                    ) from None

                # Convert Pydantic model result to JSON dict for storage
                result_dict = to_json_dict(result)

                # Mark as completed
                await ctx._update_status("completed", {"result": result_dict})

                # Call hook: workflow complete
                if self.hooks and hasattr(self.hooks, "on_workflow_complete"):
                    await self.hooks.on_workflow_complete(instance_id, workflow_name, result)

                return instance_id

            except WaitForEventException as exc:
                # Workflow is waiting for an event
                # Before marking as waiting_for_event, check if workflow was cancelled
                instance = await ctx.storage.get_instance(instance_id)
                if instance and instance.get("status") == "cancelled":
                    from edda.exceptions import WorkflowCancelledException

                    raise WorkflowCancelledException(
                        f"Workflow {instance_id} was cancelled"
                    ) from None

                # Atomically register event subscription and release lock (distributed coroutines)
                # This ensures subscription is registered and lock is released in a single transaction
                # so ANY worker can resume the workflow when the event arrives
                from datetime import UTC, datetime, timedelta

                timeout_at = None
                if exc.timeout_seconds is not None:
                    timeout_at = datetime.now(UTC) + timedelta(seconds=exc.timeout_seconds)

                await self.storage.register_event_subscription_and_release_lock(
                    instance_id=instance_id,
                    worker_id=self.worker_id,
                    event_type=exc.event_type,
                    timeout_at=timeout_at,
                    activity_id=exc.activity_id,
                )

                # Update status to waiting_for_event
                await ctx._update_status("waiting_for_event")
                return instance_id

            except WaitForTimerException as exc:
                # Workflow is waiting for a timer
                # Before marking as waiting_for_timer, check if workflow was cancelled
                instance = await ctx.storage.get_instance(instance_id)
                if instance and instance.get("status") == "cancelled":
                    from edda.exceptions import WorkflowCancelledException

                    raise WorkflowCancelledException(
                        f"Workflow {instance_id} was cancelled"
                    ) from None

                # Atomically register timer subscription and release lock (distributed coroutines)
                # This ensures subscription is registered and lock is released in a single transaction
                # so ANY worker can resume the workflow when the timer expires
                # Use the expires_at from the exception (calculated at wait_timer() call time)
                # This ensures deterministic replay: the timer expiration time never changes
                await self.storage.register_timer_subscription_and_release_lock(
                    instance_id=instance_id,
                    worker_id=self.worker_id,
                    timer_id=exc.timer_id,
                    expires_at=exc.expires_at,
                    activity_id=exc.activity_id,
                )

                # Status is updated to 'waiting_for_timer' atomically
                # by register_timer_subscription_and_release_lock()
                return instance_id

            except Exception as error:
                # Check if this is a cancellation exception
                from edda.exceptions import WorkflowCancelledException

                if isinstance(error, WorkflowCancelledException):
                    # Workflow was cancelled during execution
                    print(f"[Workflow] {instance_id} was cancelled during execution")

                    # Execute compensations (idempotent - already executed ones will be skipped)
                    # This ensures all compensations are executed, even if some were already
                    # executed by cancel_workflow() in a concurrent process
                    print(f"[Workflow] Executing compensations for {instance_id}")
                    await execute_compensations(ctx)

                    # Ensure status is "cancelled"
                    await ctx._update_status("cancelled", {"reason": "Workflow cancelled by user"})

                    # Call hook: workflow cancelled
                    if self.hooks and hasattr(self.hooks, "on_workflow_cancelled"):
                        await self.hooks.on_workflow_cancelled(instance_id, workflow_name)

                    return instance_id

                # Execute compensations before marking as failed
                await execute_compensations(ctx)

                # Capture error details for debugging
                import traceback

                stack_trace = "".join(
                    traceback.format_exception(type(error), error, error.__traceback__)
                )

                # Mark as failed with detailed error information
                await ctx._update_status(
                    "failed",
                    {
                        "error_message": str(error),
                        "error_type": type(error).__name__,
                        "stack_trace": stack_trace,
                    },
                )

                # Call hook: workflow failed
                if self.hooks and hasattr(self.hooks, "on_workflow_failed"):
                    await self.hooks.on_workflow_failed(instance_id, workflow_name, error)

                raise
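    # ------------------------------------------------------------------
    # Illustrative sketch (not part of the published file): starting a
    # workflow through the engine, assuming some StorageProtocol
    # implementation `storage` and a hypothetical `order_workflow`
    # coroutine taking (ctx, ...) arguments.
    #
    #     engine = ReplayEngine(storage, service_name="order-service",
    #                           worker_id="worker-1")
    #     instance_id = await engine.start_workflow(
    #         workflow_name="order_workflow",
    #         workflow_func=order_workflow,
    #         input_data={"order_id": "o-1"},
    #     )
    # ------------------------------------------------------------------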
    async def resume_workflow(
        self,
        instance_id: str,
        workflow_func: Callable[..., Any],
        _event: Any = None,
        already_locked: bool = False,
    ) -> None:
        """
        Resume a workflow instance (with replay).

        This method performs deterministic replay of the workflow execution
        up to the point where it was paused, then continues execution.

        Args:
            instance_id: Workflow instance ID
            workflow_func: The workflow function to replay/execute
            _event: Optional event that triggered the resume (for wait_event)
            already_locked: If True, assumes the lock is already held by the caller
                (used in distributed coroutine event delivery)

        Raises:
            ValueError: If the instance is not found or has already failed
        """
        # Get instance metadata
        instance = await self.storage.get_instance(instance_id)
        if instance is None:
            raise ValueError(f"Workflow instance {instance_id} not found")

        if instance["status"] == "completed":
            # Already completed, nothing to do
            return

        if instance["status"] == "failed":
            # Cannot resume failed workflow
            raise ValueError(f"Cannot resume failed workflow {instance_id}")

        # Execute the workflow logic with or without lock acquisition
        if already_locked:
            # Lock already held by caller (distributed coroutine pattern)
            await self._execute_workflow_logic(instance, instance_id, workflow_func)
        else:
            # Acquire lock for this workflow
            async with workflow_lock(self.storage, instance_id, self.worker_id):
                await self._execute_workflow_logic(instance, instance_id, workflow_func)
    async def _execute_workflow_logic(
        self,
        instance: dict[str, Any],
        instance_id: str,
        workflow_func: Callable[..., Any],
    ) -> None:
        """
        Execute workflow logic (factored out to support both locked and unlocked execution).

        Args:
            instance: Workflow instance metadata
            instance_id: Workflow instance ID
            workflow_func: The workflow function to execute
        """
        # Create context for replay
        ctx = WorkflowContext(
            instance_id=instance_id,
            workflow_name=instance["workflow_name"],
            storage=self.storage,
            worker_id=self.worker_id,
            is_replaying=True,
            hooks=self.hooks,
        )
        # Set default retry policy for activity resolution
        ctx._app_retry_policy = self.default_retry_policy

        # Load history for replay
        await ctx._load_history()

        try:
            # Replay and continue execution
            input_data = instance["input_data"]

            # Prepare input: convert JSON dicts to Pydantic models based on type hints
            processed_input = self._prepare_workflow_input(workflow_func, input_data)

            result = await workflow_func(ctx, **processed_input)

            # Before marking as completed, check if workflow was cancelled
            instance_check = await ctx.storage.get_instance(instance_id)
            if instance_check and instance_check.get("status") == "cancelled":
                from edda.exceptions import WorkflowCancelledException

                raise WorkflowCancelledException(f"Workflow {instance_id} was cancelled")

            # Convert Pydantic model result to JSON dict for storage
            result_dict = to_json_dict(result)

            # Mark as completed
            await ctx._update_status("completed", {"result": result_dict})

        except WaitForEventException as exc:
            # Workflow is waiting for an event (again)
            # Atomically register event subscription and release lock (distributed coroutines)
            from datetime import UTC, datetime, timedelta

            timeout_at = None
            if exc.timeout_seconds is not None:
                timeout_at = datetime.now(UTC) + timedelta(seconds=exc.timeout_seconds)

            await self.storage.register_event_subscription_and_release_lock(
                instance_id=instance_id,
                worker_id=self.worker_id,
                event_type=exc.event_type,
                timeout_at=timeout_at,
                activity_id=exc.activity_id,
            )

            # Update status to waiting_for_event
            await ctx._update_status("waiting_for_event")

        except WaitForTimerException as exc:
            # Workflow is waiting for a timer (again)
            # Atomically register timer subscription and release lock (distributed coroutines)
            # Use the expires_at from the exception (calculated at wait_timer() call time)
            # This ensures deterministic replay: the timer expiration time never changes
            await self.storage.register_timer_subscription_and_release_lock(
                instance_id=instance_id,
                worker_id=self.worker_id,
                timer_id=exc.timer_id,
                expires_at=exc.expires_at,
                activity_id=exc.activity_id,
            )

            # Status is updated to 'waiting_for_timer' atomically
            # by register_timer_subscription_and_release_lock()

        except Exception as error:
            # Check if this is a cancellation exception
            from edda.exceptions import WorkflowCancelledException

            if isinstance(error, WorkflowCancelledException):
                # Workflow was cancelled during execution
                print(f"[Workflow] {instance_id} was cancelled during execution")

                # Execute compensations (idempotent - already executed ones will be skipped)
                # This ensures all compensations are executed, even if some were already
                # executed by cancel_workflow() in a concurrent process
                print(f"[Workflow] Executing compensations for {instance_id}")
                await execute_compensations(ctx)

                # Ensure status is "cancelled"
                await ctx._update_status("cancelled", {"reason": "Workflow cancelled by user"})
                return

            # Execute compensations before marking as failed
            await execute_compensations(ctx)

            # Capture error details for debugging
            import traceback

            stack_trace = "".join(
                traceback.format_exception(type(error), error, error.__traceback__)
            )

            # Mark as failed with detailed error information
            await ctx._update_status(
                "failed",
                {
                    "error_message": str(error),
                    "error_type": type(error).__name__,
                    "stack_trace": stack_trace,
                },
            )
            raise
    async def resume_by_name(
        self, instance_id: str, workflow_name: str, already_locked: bool = False
    ) -> None:
        """
        Resume a workflow by its name (convenience method for auto-recovery).

        This method looks up the workflow function from the global saga registry
        and resumes execution. This is primarily used by the auto-recovery mechanism
        after Stale Lock cleanup.

        Args:
            instance_id: Workflow instance ID
            workflow_name: Name of the workflow to resume
            already_locked: If True, assumes the lock is already held by the caller
                (used in distributed coroutine event delivery)

        Raises:
            ValueError: If workflow not found in registry or instance not found
        """
        # Import here to avoid circular dependency
        from edda.workflow import get_all_workflows

        # Look up workflow in workflow registry
        workflows = get_all_workflows()
        workflow_obj = workflows.get(workflow_name)

        if workflow_obj is None:
            raise ValueError(
                f"Workflow '{workflow_name}' not found in workflow registry. "
                f"Available workflows: {list(workflows.keys())}"
            )

        # Resume using the workflow function from the workflow
        await self.resume_workflow(
            instance_id=instance_id, workflow_func=workflow_obj.func, already_locked=already_locked
        )
    async def execute_with_lock(
        self,
        instance_id: str,
        workflow_func: Callable[..., Any],
        is_replay: bool = False,
    ) -> Any:
        """
        Execute workflow function with distributed lock.

        This is a lower-level method used by start_workflow and resume_workflow.

        Args:
            instance_id: Workflow instance ID
            workflow_func: The workflow function to execute
            is_replay: Whether this is a replay execution

        Returns:
            Workflow result
        """
        # Get instance
        instance = await self.storage.get_instance(instance_id)
        if instance is None:
            raise ValueError(f"Workflow instance {instance_id} not found")

        # Acquire lock
        async with workflow_lock(self.storage, instance_id, self.worker_id):
            # Create context
            ctx = WorkflowContext(
                instance_id=instance_id,
                workflow_name=instance["workflow_name"],
                storage=self.storage,
                worker_id=self.worker_id,
                is_replaying=is_replay,
                hooks=self.hooks,
            )
            # Set default retry policy for activity resolution
            ctx._app_retry_policy = self.default_retry_policy

            # Load history if replaying
            if is_replay:
                await ctx._load_history()

            # Execute workflow
            input_data = instance["input_data"]
            return await workflow_func(ctx, **input_data)
    async def cancel_workflow(self, instance_id: str, cancelled_by: str = "user") -> bool:
        """
        Cancel a running or waiting workflow.

        This method will:
        1. Verify the workflow is cancellable (not already completed/failed)
        2. Try to acquire lock (with short timeout)
        3. Execute compensations to clean up any side effects
        4. Mark the workflow as cancelled in storage

        Args:
            instance_id: Workflow instance ID to cancel
            cancelled_by: Who triggered the cancellation (e.g., "user", "admin", "timeout")

        Returns:
            True if successfully cancelled, False if:
            - Instance not found
            - Already completed/failed/cancelled
            - Lock acquisition failed (workflow is actively running)

        Example:
            >>> engine = ReplayEngine(storage, "service", "worker-1")
            >>> success = await engine.cancel_workflow("order-saga-abc123", "admin")
            >>> if success:
            ...     print("Workflow cancelled and compensations executed")
        """
        # Get instance to check status
        instance = await self.storage.get_instance(instance_id)
        if instance is None:
            return False

        current_status = instance["status"]

        # Only cancel running or waiting workflows
        if current_status not in ("running", "waiting_for_event", "waiting_for_timer"):
            return False

        # Try to acquire lock with short timeout (5 seconds)
        # If the workflow is actively executing, we may not be able to get the lock
        try:
            lock_acquired = await self.storage.try_acquire_lock(
                instance_id=instance_id,
                worker_id=self.worker_id,
                timeout_seconds=5,
            )

            if not lock_acquired:
                # Another worker has the lock, try to cancel anyway
                # The storage layer will handle atomicity
                return await self.storage.cancel_instance(instance_id, cancelled_by)

            try:
                # Re-fetch instance data AFTER acquiring lock
                print(f"[Cancel] Fetching instance data for {instance_id}")
                instance_locked = await self.storage.get_instance(instance_id)
                if instance_locked is None:
                    print(f"[Cancel] Instance {instance_id} not found after lock acquisition")
                    return False

                # Create context for compensation execution
                ctx = WorkflowContext(
                    instance_id=instance_id,
                    workflow_name=instance_locked["workflow_name"],
                    storage=self.storage,
                    worker_id=self.worker_id,
                    is_replaying=False,
                    hooks=self.hooks,
                )
                # Set default retry policy for activity resolution
                ctx._app_retry_policy = self.default_retry_policy

                # Execute compensations to clean up
                print(f"[Cancel] Executing compensations for {instance_id}")
                await execute_compensations(ctx)

                # Mark as cancelled in storage
                success = await self.storage.cancel_instance(instance_id, cancelled_by)

                return success

            finally:
                # Always release the lock
                await self.storage.release_lock(instance_id, self.worker_id)

        except Exception as error:
            # Log error but don't propagate
            import traceback

            print(f"[Cancel] Error cancelling workflow {instance_id}: {error}")
            traceback.print_exc()
            return False
    async def resume_compensating_workflow(self, instance_id: str) -> bool:
        """
        Resume a workflow that crashed during compensation execution.

        This method only re-executes incomplete compensations without running
        the workflow function. It determines the target status (failed/cancelled)
        from the instance metadata.

        Args:
            instance_id: Workflow instance ID

        Returns:
            True if compensations completed successfully, False otherwise
        """
        print(f"[ResumeCompensating] Starting compensation recovery for {instance_id}")

        try:
            # Acquire lock
            locked = await self.storage.try_acquire_lock(
                instance_id=instance_id,
                worker_id=self.worker_id,
                timeout_seconds=300,
            )

            if not locked:
                print(f"[ResumeCompensating] Could not acquire lock for {instance_id}")
                return False

            try:
                # Get instance data
                instance = await self.storage.get_instance(instance_id)
                if instance is None:
                    print(f"[ResumeCompensating] Instance {instance_id} not found")
                    return False

                # Check current status
                current_status = instance["status"]
                if current_status != "compensating":
                    print(
                        f"[ResumeCompensating] Instance {instance_id} is not in compensating state (status={current_status})"
                    )
                    return False

                # Determine target status based on history or metadata
                # If we can't determine, default to "failed"
                target_status = "failed"

                # Check history for cancellation markers
                history = await self.storage.get_history(instance_id)
                for event in history:
                    event_type = event.get("event_type", "")
                    if event_type == "WorkflowCancelled" or "cancel" in event_type.lower():
                        target_status = "cancelled"
                        break

                print(f"[ResumeCompensating] Target status after compensation: {target_status}")

                # Create context for compensation execution
                ctx = WorkflowContext(
                    instance_id=instance_id,
                    workflow_name=instance["workflow_name"],
                    storage=self.storage,
                    worker_id=self.worker_id,
                    is_replaying=False,
                    hooks=self.hooks,
                )
                # Set default retry policy for activity resolution
                ctx._app_retry_policy = self.default_retry_policy

                # Re-execute compensations (idempotent - skips already executed)
                print(f"[ResumeCompensating] Re-executing compensations for {instance_id}")
                await execute_compensations(ctx)

                # Mark with target status
                if target_status == "cancelled":
                    success = await self.storage.cancel_instance(instance_id, "crash_recovery")
                    print(f"[ResumeCompensating] Marked {instance_id} as cancelled")
                else:
                    await ctx._update_status(
                        "failed", {"error": "Workflow failed before compensation"}
                    )
                    print(f"[ResumeCompensating] Marked {instance_id} as failed")
                    success = True

                return success

            finally:
                # Always release the lock
                await self.storage.release_lock(instance_id, self.worker_id)

        except Exception as error:
            # Log error but don't propagate
            import traceback

            print(
                f"[ResumeCompensating] Error resuming compensating workflow {instance_id}: {error}"
            )
            traceback.print_exc()
            return False
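Usage sketch (not part of the package contents above; the storage construction and the order_workflow function are assumptions, everything else uses methods defined in this file):

    from edda.replay import ReplayEngine

    async def run(storage):  # storage: any StorageProtocol implementation (assumption)
        engine = ReplayEngine(storage, service_name="order-service", worker_id="worker-1")

        # Start a new instance: the engine persists the definition, acquires the
        # distributed lock, and runs the workflow until it completes or suspends.
        instance_id = await engine.start_workflow(
            workflow_name="order_workflow",
            workflow_func=order_workflow,  # hypothetical async def order_workflow(ctx, ...)
            input_data={"order_id": "o-1"},
        )

        # After a crash or stale-lock cleanup, any worker can replay and continue it,
        # provided the workflow is registered in the workflow registry.
        await engine.resume_by_name(instance_id, "order_workflow")

        # Cooperative cancellation runs compensations and marks the instance cancelled.
        await engine.cancel_workflow(instance_id, cancelled_by="admin")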