edda_framework-0.1.0-py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
edda/replay.py ADDED
@@ -0,0 +1,799 @@
1
+ """
2
+ Replay engine for Edda framework.
3
+
4
+ This module implements the deterministic replay mechanism with activity result caching,
5
+ allowing workflows to resume from where they left off by replaying their
6
+ execution history.
7
+ """
8
+
9
+ import hashlib
10
+ import inspect
11
+ import logging
12
+ import uuid
13
+ from collections.abc import Callable
14
+ from typing import Any
15
+
16
+ from edda.compensation import execute_compensations
17
+ from edda.context import WorkflowContext
18
+ from edda.events import WaitForEventException, WaitForTimerException
19
+ from edda.locking import workflow_lock
20
+ from edda.pydantic_utils import (
21
+ enum_value_to_enum,
22
+ extract_enum_from_annotation,
23
+ extract_pydantic_model_from_annotation,
24
+ from_json_dict,
25
+ to_json_dict,
26
+ )
27
+ from edda.storage.protocol import StorageProtocol
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ class ReplayEngine:
33
+ """
34
+ Engine for executing and replaying workflows with deterministic behavior.
35
+
36
+ The replay engine orchestrates workflow execution, handles lock acquisition,
37
+ loads history for replay, and manages workflow lifecycle.
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ storage: StorageProtocol,
43
+ service_name: str,
44
+ worker_id: str,
45
+ hooks: Any = None,
46
+ default_retry_policy: Any = None,
47
+ ):
48
+ """
49
+ Initialize the replay engine.
50
+
51
+ Args:
52
+ storage: Storage backend
53
+ service_name: Name of the service (e.g., "order-service")
54
+ worker_id: Unique worker ID for this process
55
+ hooks: Optional WorkflowHooks implementation for observability
56
+ default_retry_policy: Default retry policy for all activities (RetryPolicy or None)
57
+ """
58
+ self.storage = storage
59
+ self.service_name = service_name
60
+ self.worker_id = worker_id
61
+ self.hooks = hooks
62
+ self.default_retry_policy = default_retry_policy
63
+
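For orientation, a minimal construction sketch of the engine defined above (my_storage is a placeholder; any StorageProtocol implementation accepted by the constructor would do):

    from edda.replay import ReplayEngine

    engine = ReplayEngine(
        storage=my_storage,            # any StorageProtocol implementation
        service_name="order-service",
        worker_id="worker-1",
        hooks=None,                    # optional WorkflowHooks implementation
        default_retry_policy=None,     # optional default RetryPolicy for activities
    )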
64
+ def _prepare_workflow_input(
65
+ self,
66
+ workflow_func: Callable[..., Any],
67
+ input_data: dict[str, Any],
68
+ ) -> dict[str, Any]:
69
+ """
70
+ Prepare workflow input by converting JSON values to Pydantic models and Enums based on type hints.
71
+
72
+ Supports:
73
+ - Pydantic models (e.g., User)
74
+ - Enums (e.g., OrderStatus)
75
+ - Lists of Pydantic models (e.g., list[OrderItem])
76
+
77
+ Args:
78
+ workflow_func: The workflow function
79
+ input_data: Input data from storage (JSON-compatible dicts)
80
+
81
+ Returns:
82
+ Processed input data with Pydantic models and Enums restored
83
+ """
84
+ from typing import get_args, get_origin
85
+
86
+ sig = inspect.signature(workflow_func)
87
+ processed_input: dict[str, Any] = {}
88
+
89
+ # Detect single Pydantic model parameter pattern
90
+ # Count non-ctx parameters
91
+ workflow_params = [
92
+ (name, param) for name, param in sig.parameters.items() if name not in ("ctx", "self")
93
+ ]
94
+
95
+ # If there's only one parameter and it's a Pydantic model,
96
+ # and input_data doesn't have that parameter name as a key,
97
+ # assume input_data IS the model data (CloudEvents case)
98
+ if len(workflow_params) == 1:
99
+ param_name, param = workflow_params[0]
100
+ model = extract_pydantic_model_from_annotation(param.annotation)
101
+ if model is not None and param_name not in input_data:
102
+ # input_data = {"order_id": "...", "amount": ...}
103
+ # → processed_input = {"input": PaymentWorkflowInput(...)}
104
+ processed_input[param_name] = from_json_dict(input_data, model)
105
+ return processed_input
106
+
107
+ for param_name, param in sig.parameters.items():
108
+ # Skip 'ctx' parameter (WorkflowContext)
109
+ if param_name == "ctx" or param_name == "self":
110
+ continue
111
+
112
+ if param_name not in input_data:
113
+ # Parameter not provided in input_data (may have default value)
114
+ continue
115
+
116
+ value = input_data[param_name]
117
+
118
+ # Check if parameter has Pydantic model type hint
119
+ model = extract_pydantic_model_from_annotation(param.annotation)
120
+ if model is not None and isinstance(value, dict):
121
+ # Restore Pydantic model from JSON dict
122
+ value = from_json_dict(value, model)
123
+ # Check if parameter has Enum type hint
124
+ elif (enum_class := extract_enum_from_annotation(param.annotation)) is not None:
125
+ # Only convert if not already an Enum instance (defensive programming)
126
+ from enum import Enum
127
+
128
+ if not isinstance(value, Enum):
129
+ value = enum_value_to_enum(value, enum_class)
130
+ # Check if parameter is list[PydanticModel]
131
+ elif get_origin(param.annotation) is list:
132
+ args = get_args(param.annotation)
133
+ if args and len(args) == 1:
134
+ # Check if list element is Pydantic model
135
+ element_model = extract_pydantic_model_from_annotation(args[0])
136
+ if element_model is not None and isinstance(value, list):
137
+ # Convert each dict to Pydantic model
138
+ value = [
139
+ from_json_dict(item, element_model) if isinstance(item, dict) else item
140
+ for item in value
141
+ ]
142
+
143
+ processed_input[param_name] = value
144
+
145
+ return processed_input
146
+
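To illustrate the single-Pydantic-model case handled above, a hedged sketch reusing the engine from the earlier example; the PaymentInput model and workflow signature are invented for illustration:

    from pydantic import BaseModel

    class PaymentInput(BaseModel):
        order_id: str
        amount: float

    async def payment_workflow(ctx, input: PaymentInput):
        ...

    # A flat JSON dict with no "input" key is treated as the model payload itself
    # (the CloudEvents case) and restored to the annotated model:
    prepared = engine._prepare_workflow_input(
        payment_workflow, {"order_id": "o-1", "amount": 9.99}
    )
    # expected: {"input": PaymentInput(order_id="o-1", amount=9.99)}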
147
+ async def start_workflow(
148
+ self,
149
+ workflow_name: str,
150
+ workflow_func: Callable[..., Any],
151
+ input_data: dict[str, Any],
152
+ lock_timeout_seconds: int | None = None,
153
+ ) -> str:
154
+ """
155
+ Start a new workflow instance.
156
+
157
+ Args:
158
+ workflow_name: Name of the workflow
159
+ workflow_func: The workflow function to execute
160
+ input_data: Input parameters for the workflow
161
+ lock_timeout_seconds: Lock timeout for this workflow (None = global default 300s)
162
+
163
+ Returns:
164
+ Instance ID of the started workflow
165
+ """
166
+ # Generate instance ID
167
+ instance_id = f"{workflow_name}-{uuid.uuid4().hex}"
168
+
169
+ # Extract source code for visualization
170
+ try:
171
+ source_code = inspect.getsource(workflow_func)
172
+ except (OSError, TypeError) as e:
173
+ # inspect.getsource can fail for lambdas, built-ins, REPL functions, etc.
174
+ logger.warning(
175
+ f"Could not extract source code for workflow '{workflow_name}': {e}. "
176
+ "Hybrid diagram visualization will not be available."
177
+ )
178
+ source_code = f"# Source code not available\n# Workflow: {workflow_name}\n# Error: {e}"
179
+
180
+ # Calculate source code hash
181
+ source_hash = hashlib.sha256(source_code.encode("utf-8")).hexdigest()
182
+
183
+ # Store workflow definition (idempotent)
184
+ await self.storage.upsert_workflow_definition(
185
+ workflow_name=workflow_name,
186
+ source_hash=source_hash,
187
+ source_code=source_code,
188
+ )
189
+
190
+ # Create workflow instance in storage
191
+ await self.storage.create_instance(
192
+ instance_id=instance_id,
193
+ workflow_name=workflow_name,
194
+ source_hash=source_hash,
195
+ owner_service=self.service_name,
196
+ input_data=input_data,
197
+ lock_timeout_seconds=lock_timeout_seconds,
198
+ )
199
+
200
+ # Execute the workflow with distributed lock
201
+ async with workflow_lock(self.storage, instance_id, self.worker_id):
202
+ # Create context for new execution
203
+ ctx = WorkflowContext(
204
+ instance_id=instance_id,
205
+ workflow_name=workflow_name,
206
+ storage=self.storage,
207
+ worker_id=self.worker_id,
208
+ is_replaying=False,
209
+ hooks=self.hooks,
210
+ )
211
+ # Set default retry policy for activity resolution
212
+ ctx._app_retry_policy = self.default_retry_policy
213
+
214
+ try:
215
+ # Call hook: workflow start
216
+ if self.hooks and hasattr(self.hooks, "on_workflow_start"):
217
+ await self.hooks.on_workflow_start(instance_id, workflow_name, input_data)
218
+
219
+ # Prepare input: convert JSON dicts to Pydantic models based on type hints
220
+ processed_input = self._prepare_workflow_input(workflow_func, input_data)
221
+
222
+ # Execute workflow function
223
+ result = await workflow_func(ctx, **processed_input)
224
+
225
+ # Before marking as completed, check if workflow was cancelled
226
+ instance = await ctx.storage.get_instance(instance_id)
227
+ if instance and instance.get("status") == "cancelled":
228
+ from edda.exceptions import WorkflowCancelledException
229
+
230
+ raise WorkflowCancelledException(
231
+ f"Workflow {instance_id} was cancelled"
232
+ ) from None
233
+
234
+ # Convert Pydantic model result to JSON dict for storage
235
+ result_dict = to_json_dict(result)
236
+
237
+ # Mark as completed
238
+ await ctx._update_status("completed", {"result": result_dict})
239
+
240
+ # Call hook: workflow complete
241
+ if self.hooks and hasattr(self.hooks, "on_workflow_complete"):
242
+ await self.hooks.on_workflow_complete(instance_id, workflow_name, result)
243
+
244
+ return instance_id
245
+
246
+ except WaitForEventException as exc:
247
+ # Workflow is waiting for an event
248
+ # Before marking as waiting_for_event, check if workflow was cancelled
249
+ instance = await ctx.storage.get_instance(instance_id)
250
+ if instance and instance.get("status") == "cancelled":
251
+ from edda.exceptions import WorkflowCancelledException
252
+
253
+ raise WorkflowCancelledException(
254
+ f"Workflow {instance_id} was cancelled"
255
+ ) from None
256
+
257
+ # Atomically register event subscription and release lock (distributed coroutines)
258
+ # This ensures subscription is registered and lock is released in a single transaction
259
+ # so ANY worker can resume the workflow when the event arrives
260
+ from datetime import UTC, datetime, timedelta
261
+
262
+ timeout_at = None
263
+ if exc.timeout_seconds is not None:
264
+ timeout_at = datetime.now(UTC) + timedelta(seconds=exc.timeout_seconds)
265
+
266
+ await self.storage.register_event_subscription_and_release_lock(
267
+ instance_id=instance_id,
268
+ worker_id=self.worker_id,
269
+ event_type=exc.event_type,
270
+ timeout_at=timeout_at,
271
+ activity_id=exc.activity_id,
272
+ )
273
+
274
+ # Update status to waiting_for_event
275
+ await ctx._update_status("waiting_for_event")
276
+ return instance_id
277
+
278
+ except WaitForTimerException as exc:
279
+ # Workflow is waiting for a timer
280
+ # Before marking as waiting_for_timer, check if workflow was cancelled
281
+ instance = await ctx.storage.get_instance(instance_id)
282
+ if instance and instance.get("status") == "cancelled":
283
+ from edda.exceptions import WorkflowCancelledException
284
+
285
+ raise WorkflowCancelledException(
286
+ f"Workflow {instance_id} was cancelled"
287
+ ) from None
288
+
289
+ # Atomically register timer subscription and release lock (distributed coroutines)
290
+ # This ensures subscription is registered and lock is released in a single transaction
291
+ # so ANY worker can resume the workflow when the timer expires
292
+ # Use the expires_at from the exception (calculated at wait_timer() call time)
293
+ # This ensures deterministic replay: the timer expiration time never changes
294
+ await self.storage.register_timer_subscription_and_release_lock(
295
+ instance_id=instance_id,
296
+ worker_id=self.worker_id,
297
+ timer_id=exc.timer_id,
298
+ expires_at=exc.expires_at,
299
+ activity_id=exc.activity_id,
300
+ )
301
+
302
+ # Status is updated to 'waiting_for_timer' atomically
303
+ # by register_timer_subscription_and_release_lock()
304
+ return instance_id
305
+
306
+ except Exception as error:
307
+ # Check if this is a cancellation exception
308
+ from edda.exceptions import WorkflowCancelledException
309
+
310
+ if isinstance(error, WorkflowCancelledException):
311
+ # Workflow was cancelled during execution
312
+ print(f"[Workflow] {instance_id} was cancelled during execution")
313
+
314
+ # Execute compensations (idempotent - already executed ones will be skipped)
315
+ # This ensures all compensations are executed, even if some were already
316
+ # executed by cancel_workflow() in a concurrent process
317
+ print(f"[Workflow] Executing compensations for {instance_id}")
318
+ await execute_compensations(ctx)
319
+
320
+ # Ensure status is "cancelled"
321
+ await ctx._update_status("cancelled", {"reason": "Workflow cancelled by user"})
322
+
323
+ # Call hook: workflow cancelled
324
+ if self.hooks and hasattr(self.hooks, "on_workflow_cancelled"):
325
+ await self.hooks.on_workflow_cancelled(instance_id, workflow_name)
326
+
327
+ return instance_id
328
+
329
+ # Execute compensations before marking as failed
330
+ await execute_compensations(ctx)
331
+
332
+ # Capture error details for debugging
333
+ import traceback
334
+
335
+ stack_trace = "".join(
336
+ traceback.format_exception(type(error), error, error.__traceback__)
337
+ )
338
+
339
+ # Mark as failed with detailed error information
340
+ await ctx._update_status(
341
+ "failed",
342
+ {
343
+ "error_message": str(error),
344
+ "error_type": type(error).__name__,
345
+ "stack_trace": stack_trace,
346
+ },
347
+ )
348
+
349
+ # Call hook: workflow failed
350
+ if self.hooks and hasattr(self.hooks, "on_workflow_failed"):
351
+ await self.hooks.on_workflow_failed(instance_id, workflow_name, error)
352
+
353
+ raise
354
+
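A hedged usage sketch for start_workflow; the workflow function and its input are invented for illustration:

    from edda.context import WorkflowContext

    async def order_workflow(ctx: WorkflowContext, order_id: str) -> dict:
        ...  # call activities through ctx and return a JSON-serializable result

    instance_id = await engine.start_workflow(
        workflow_name="order_workflow",
        workflow_func=order_workflow,
        input_data={"order_id": "o-1"},
        lock_timeout_seconds=None,   # None -> global default (300s)
    )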
355
+ async def resume_workflow(
356
+ self,
357
+ instance_id: str,
358
+ workflow_func: Callable[..., Any],
359
+ _event: Any = None,
360
+ already_locked: bool = False,
361
+ ) -> None:
362
+ """
363
+ Resume a workflow instance (with replay).
364
+
365
+ This method performs deterministic replay of the workflow execution
366
+ up to the point where it was paused, then continues execution.
367
+
368
+ Args:
369
+ instance_id: Workflow instance ID
370
+ workflow_func: The workflow function to replay/execute
371
+ _event: Optional event that triggered the resume (for wait_event); currently unused
372
+ already_locked: If True, assumes the lock is already held by the caller
373
+ (used in distributed coroutine event delivery)
374
+
375
+ Raises:
376
+ ValueError: If the instance is not found or the workflow has already failed
377
+ """
378
+ # Get instance metadata
379
+ instance = await self.storage.get_instance(instance_id)
380
+ if instance is None:
381
+ raise ValueError(f"Workflow instance {instance_id} not found")
382
+
383
+ if instance["status"] == "completed":
384
+ # Already completed, nothing to do
385
+ return
386
+
387
+ if instance["status"] == "failed":
388
+ # Cannot resume failed workflow
389
+ raise ValueError(f"Cannot resume failed workflow {instance_id}")
390
+
391
+ # Execute the workflow logic with or without lock acquisition
392
+ if already_locked:
393
+ # Lock already held by caller (distributed coroutine pattern)
394
+ await self._execute_workflow_logic(instance, instance_id, workflow_func)
395
+ else:
396
+ # Acquire lock for this workflow
397
+ async with workflow_lock(self.storage, instance_id, self.worker_id):
398
+ await self._execute_workflow_logic(instance, instance_id, workflow_func)
399
+
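Continuing the earlier sketch, a hedged example of resuming the same instance (in practice the framework calls this when an awaited event or timer fires):

    await engine.resume_workflow(
        instance_id=instance_id,
        workflow_func=order_workflow,
        already_locked=False,   # True only when the caller already holds the lock
    )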
400
+ async def _execute_workflow_logic(
401
+ self,
402
+ instance: dict[str, Any],
403
+ instance_id: str,
404
+ workflow_func: Callable[..., Any],
405
+ ) -> None:
406
+ """
407
+ Execute workflow logic (factored out to support both locked and unlocked execution).
408
+
409
+ Args:
410
+ instance: Workflow instance metadata
411
+ instance_id: Workflow instance ID
412
+ workflow_func: The workflow function to execute
413
+ """
414
+ # Create context for replay
415
+ ctx = WorkflowContext(
416
+ instance_id=instance_id,
417
+ workflow_name=instance["workflow_name"],
418
+ storage=self.storage,
419
+ worker_id=self.worker_id,
420
+ is_replaying=True,
421
+ hooks=self.hooks,
422
+ )
423
+ # Set default retry policy for activity resolution
424
+ ctx._app_retry_policy = self.default_retry_policy
425
+
426
+ # Load history for replay
427
+ await ctx._load_history()
428
+
429
+ try:
430
+ # Replay and continue execution
431
+ input_data = instance["input_data"]
432
+
433
+ # Prepare input: convert JSON dicts to Pydantic models based on type hints
434
+ processed_input = self._prepare_workflow_input(workflow_func, input_data)
435
+
436
+ result = await workflow_func(ctx, **processed_input)
437
+
438
+ # Before marking as completed, check if workflow was cancelled
439
+ instance_check = await ctx.storage.get_instance(instance_id)
440
+ if instance_check and instance_check.get("status") == "cancelled":
441
+ from edda.exceptions import WorkflowCancelledException
442
+
443
+ raise WorkflowCancelledException(f"Workflow {instance_id} was cancelled")
444
+
445
+ # Convert Pydantic model result to JSON dict for storage
446
+ result_dict = to_json_dict(result)
447
+
448
+ # Mark as completed
449
+ await ctx._update_status("completed", {"result": result_dict})
450
+
451
+ except WaitForEventException as exc:
452
+ # Workflow is waiting for an event (again)
453
+ # Atomically register event subscription and release lock (distributed coroutines)
454
+ from datetime import UTC, datetime, timedelta
455
+
456
+ timeout_at = None
457
+ if exc.timeout_seconds is not None:
458
+ timeout_at = datetime.now(UTC) + timedelta(seconds=exc.timeout_seconds)
459
+
460
+ await self.storage.register_event_subscription_and_release_lock(
461
+ instance_id=instance_id,
462
+ worker_id=self.worker_id,
463
+ event_type=exc.event_type,
464
+ timeout_at=timeout_at,
465
+ activity_id=exc.activity_id,
466
+ )
467
+
468
+ # Update status to waiting_for_event
469
+ await ctx._update_status("waiting_for_event")
470
+
471
+ except WaitForTimerException as exc:
472
+ # Workflow is waiting for a timer (again)
473
+ # Atomically register timer subscription and release lock (distributed coroutines)
474
+ # Use the expires_at from the exception (calculated at wait_timer() call time)
475
+ # This ensures deterministic replay: the timer expiration time never changes
476
+ await self.storage.register_timer_subscription_and_release_lock(
477
+ instance_id=instance_id,
478
+ worker_id=self.worker_id,
479
+ timer_id=exc.timer_id,
480
+ expires_at=exc.expires_at,
481
+ activity_id=exc.activity_id,
482
+ )
483
+
484
+ # Status is updated to 'waiting_for_timer' atomically
485
+ # by register_timer_subscription_and_release_lock()
486
+
487
+ except Exception as error:
488
+ # Check if this is a cancellation exception
489
+ from edda.exceptions import WorkflowCancelledException
490
+
491
+ if isinstance(error, WorkflowCancelledException):
492
+ # Workflow was cancelled during execution
493
+ print(f"[Workflow] {instance_id} was cancelled during execution")
494
+
495
+ # Execute compensations (idempotent - already executed ones will be skipped)
496
+ # This ensures all compensations are executed, even if some were already
497
+ # executed by cancel_workflow() in a concurrent process
498
+ print(f"[Workflow] Executing compensations for {instance_id}")
499
+ await execute_compensations(ctx)
500
+
501
+ # Ensure status is "cancelled"
502
+ await ctx._update_status("cancelled", {"reason": "Workflow cancelled by user"})
503
+ return
504
+
505
+ # Execute compensations before marking as failed
506
+ await execute_compensations(ctx)
507
+
508
+ # Capture error details for debugging
509
+ import traceback
510
+
511
+ stack_trace = "".join(
512
+ traceback.format_exception(type(error), error, error.__traceback__)
513
+ )
514
+
515
+ # Mark as failed with detailed error information
516
+ await ctx._update_status(
517
+ "failed",
518
+ {
519
+ "error_message": str(error),
520
+ "error_type": type(error).__name__,
521
+ "stack_trace": stack_trace,
522
+ },
523
+ )
524
+ raise
525
+
526
+ async def resume_by_name(
527
+ self, instance_id: str, workflow_name: str, already_locked: bool = False
528
+ ) -> None:
529
+ """
530
+ Resume a workflow by its name (convenience method for auto-recovery).
531
+
532
+ This method looks up the workflow function in the global workflow registry
533
+ and resumes execution. This is primarily used by the auto-recovery mechanism
534
+ after stale-lock cleanup.
535
+
536
+ Args:
537
+ instance_id: Workflow instance ID
538
+ workflow_name: Name of the workflow to resume
539
+ already_locked: If True, assumes the lock is already held by the caller
540
+ (used in distributed coroutine event delivery)
541
+
542
+ Raises:
543
+ ValueError: If workflow not found in registry or instance not found
544
+ """
545
+ # Import here to avoid circular dependency
546
+ from edda.workflow import get_all_workflows
547
+
548
+ # Look up workflow in workflow registry
549
+ workflows = get_all_workflows()
550
+ workflow_obj = workflows.get(workflow_name)
551
+
552
+ if workflow_obj is None:
553
+ raise ValueError(
554
+ f"Workflow '{workflow_name}' not found in workflow registry. "
555
+ f"Available workflows: {list(workflows.keys())}"
556
+ )
557
+
558
+ # Resume using the function registered for this workflow name
559
+ await self.resume_workflow(
560
+ instance_id=instance_id, workflow_func=workflow_obj.func, already_locked=already_locked
561
+ )
562
+
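A hedged sketch of resume_by_name as the auto-recovery path might call it; the instance ID is a placeholder:

    await engine.resume_by_name(
        instance_id="order_workflow-abc123",
        workflow_name="order_workflow",
    )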
563
+ async def execute_with_lock(
564
+ self,
565
+ instance_id: str,
566
+ workflow_func: Callable[..., Any],
567
+ is_replay: bool = False,
568
+ ) -> Any:
569
+ """
570
+ Execute workflow function with distributed lock.
571
+
572
+ This is a lower-level helper for executing a workflow function under the distributed lock.
573
+
574
+ Args:
575
+ instance_id: Workflow instance ID
576
+ workflow_func: The workflow function to execute
577
+ is_replay: Whether this is a replay execution
578
+
579
+ Returns:
580
+ Workflow result
581
+ """
582
+ # Get instance
583
+ instance = await self.storage.get_instance(instance_id)
584
+ if instance is None:
585
+ raise ValueError(f"Workflow instance {instance_id} not found")
586
+
587
+ # Acquire lock
588
+ async with workflow_lock(self.storage, instance_id, self.worker_id):
589
+ # Create context
590
+ ctx = WorkflowContext(
591
+ instance_id=instance_id,
592
+ workflow_name=instance["workflow_name"],
593
+ storage=self.storage,
594
+ worker_id=self.worker_id,
595
+ is_replaying=is_replay,
596
+ hooks=self.hooks,
597
+ )
598
+ # Set default retry policy for activity resolution
599
+ ctx._app_retry_policy = self.default_retry_policy
600
+
601
+ # Load history if replaying
602
+ if is_replay:
603
+ await ctx._load_history()
604
+
605
+ # Execute workflow
606
+ input_data = instance["input_data"]
607
+ return await workflow_func(ctx, **input_data)
608
+
609
+ async def cancel_workflow(self, instance_id: str, cancelled_by: str = "user") -> bool:
610
+ """
611
+ Cancel a running or waiting workflow.
612
+
613
+ This method will:
614
+ 1. Verify the workflow is cancellable (not already completed/failed)
615
+ 2. Try to acquire lock (with short timeout)
616
+ 3. Execute compensations to clean up any side effects
617
+ 4. Mark the workflow as cancelled in storage
618
+
619
+ Args:
620
+ instance_id: Workflow instance ID to cancel
621
+ cancelled_by: Who triggered the cancellation (e.g., "user", "admin", "timeout")
622
+
623
+ Returns:
624
+ True if successfully cancelled, False if:
625
+ - Instance not found
626
+ - Already completed/failed/cancelled
627
+ - Lock acquisition failed (workflow is actively running)
628
+
629
+ Example:
630
+ >>> engine = ReplayEngine(storage, "service", "worker-1")
631
+ >>> success = await engine.cancel_workflow("order-saga-abc123", "admin")
632
+ >>> if success:
633
+ ... print("Workflow cancelled and compensations executed")
634
+ """
635
+ # Get instance to check status
636
+ instance = await self.storage.get_instance(instance_id)
637
+ if instance is None:
638
+ return False
639
+
640
+ current_status = instance["status"]
641
+
642
+ # Only cancel running or waiting workflows
643
+ if current_status not in ("running", "waiting_for_event", "waiting_for_timer"):
644
+ return False
645
+
646
+ # Try to acquire lock with short timeout (5 seconds)
647
+ # If the workflow is actively executing, we may not be able to get the lock
648
+ try:
649
+ lock_acquired = await self.storage.try_acquire_lock(
650
+ instance_id=instance_id,
651
+ worker_id=self.worker_id,
652
+ timeout_seconds=5,
653
+ )
654
+
655
+ if not lock_acquired:
656
+ # Another worker has the lock, try to cancel anyway
657
+ # The storage layer will handle atomicity
658
+ return await self.storage.cancel_instance(instance_id, cancelled_by)
659
+
660
+ try:
661
+ # Re-fetch instance data AFTER acquiring lock
662
+ print(f"[Cancel] Fetching instance data for {instance_id}")
663
+ instance_locked = await self.storage.get_instance(instance_id)
664
+ if instance_locked is None:
665
+ print(f"[Cancel] Instance {instance_id} not found after lock acquisition")
666
+ return False
667
+
668
+ # Create context for compensation execution
669
+ ctx = WorkflowContext(
670
+ instance_id=instance_id,
671
+ workflow_name=instance_locked["workflow_name"],
672
+ storage=self.storage,
673
+ worker_id=self.worker_id,
674
+ is_replaying=False,
675
+ hooks=self.hooks,
676
+ )
677
+ # Set default retry policy for activity resolution
678
+ ctx._app_retry_policy = self.default_retry_policy
679
+
680
+ # Execute compensations to clean up
681
+ print(f"[Cancel] Executing compensations for {instance_id}")
682
+ await execute_compensations(ctx)
683
+
684
+ # Mark as cancelled in storage
685
+ success = await self.storage.cancel_instance(instance_id, cancelled_by)
686
+
687
+ return success
688
+
689
+ finally:
690
+ # Always release the lock
691
+ await self.storage.release_lock(instance_id, self.worker_id)
692
+
693
+ except Exception as error:
694
+ # Log error but don't propagate
695
+ import traceback
696
+
697
+ print(f"[Cancel] Error cancelling workflow {instance_id}: {error}")
698
+ traceback.print_exc()
699
+ return False
700
+
701
+ async def resume_compensating_workflow(self, instance_id: str) -> bool:
702
+ """
703
+ Resume a workflow that crashed during compensation execution.
704
+
705
+ This method only re-executes incomplete compensations without running
706
+ the workflow function. It determines the target status (failed/cancelled)
707
+ from the instance metadata.
708
+
709
+ Args:
710
+ instance_id: Workflow instance ID
711
+
712
+ Returns:
713
+ True if compensations completed successfully, False otherwise
714
+ """
715
+ print(f"[ResumeCompensating] Starting compensation recovery for {instance_id}")
716
+
717
+ try:
718
+ # Acquire lock
719
+ locked = await self.storage.try_acquire_lock(
720
+ instance_id=instance_id,
721
+ worker_id=self.worker_id,
722
+ timeout_seconds=300,
723
+ )
724
+
725
+ if not locked:
726
+ print(f"[ResumeCompensating] Could not acquire lock for {instance_id}")
727
+ return False
728
+
729
+ try:
730
+ # Get instance data
731
+ instance = await self.storage.get_instance(instance_id)
732
+ if instance is None:
733
+ print(f"[ResumeCompensating] Instance {instance_id} not found")
734
+ return False
735
+
736
+ # Check current status
737
+ current_status = instance["status"]
738
+ if current_status != "compensating":
739
+ print(
740
+ f"[ResumeCompensating] Instance {instance_id} is not in compensating state (status={current_status})"
741
+ )
742
+ return False
743
+
744
+ # Determine target status based on history or metadata
745
+ # If we can't determine, default to "failed"
746
+ target_status = "failed"
747
+
748
+ # Check history for cancellation markers
749
+ history = await self.storage.get_history(instance_id)
750
+ for event in history:
751
+ event_type = event.get("event_type", "")
752
+ if event_type == "WorkflowCancelled" or "cancel" in event_type.lower():
753
+ target_status = "cancelled"
754
+ break
755
+
756
+ print(f"[ResumeCompensating] Target status after compensation: {target_status}")
757
+
758
+ # Create context for compensation execution
759
+ ctx = WorkflowContext(
760
+ instance_id=instance_id,
761
+ workflow_name=instance["workflow_name"],
762
+ storage=self.storage,
763
+ worker_id=self.worker_id,
764
+ is_replaying=False,
765
+ hooks=self.hooks,
766
+ )
767
+ # Set default retry policy for activity resolution
768
+ ctx._app_retry_policy = self.default_retry_policy
769
+
770
+ # Re-execute compensations (idempotent - skips already executed)
771
+ print(f"[ResumeCompensating] Re-executing compensations for {instance_id}")
772
+ await execute_compensations(ctx)
773
+
774
+ # Mark with target status
775
+ if target_status == "cancelled":
776
+ success = await self.storage.cancel_instance(instance_id, "crash_recovery")
777
+ print(f"[ResumeCompensating] Marked {instance_id} as cancelled")
778
+ else:
779
+ await ctx._update_status(
780
+ "failed", {"error": "Workflow failed before compensation"}
781
+ )
782
+ print(f"[ResumeCompensating] Marked {instance_id} as failed")
783
+ success = True
784
+
785
+ return success
786
+
787
+ finally:
788
+ # Always release the lock
789
+ await self.storage.release_lock(instance_id, self.worker_id)
790
+
791
+ except Exception as error:
792
+ # Log error but don't propagate
793
+ import traceback
794
+
795
+ print(
796
+ f"[ResumeCompensating] Error resuming compensating workflow {instance_id}: {error}"
797
+ )
798
+ traceback.print_exc()
799
+ return False
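As a closing illustration, a hedged sketch of a recovery loop built on resume_compensating_workflow; stuck_instance_ids is assumed to come from whatever scan a deployment uses to find instances left in the "compensating" state:

    async def recover_compensating(engine: ReplayEngine, stuck_instance_ids: list[str]) -> None:
        for instance_id in stuck_instance_ids:
            ok = await engine.resume_compensating_workflow(instance_id)
            if not ok:
                print(f"compensation recovery failed for {instance_id}")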