edda-framework 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edda/__init__.py +39 -5
- edda/app.py +383 -223
- edda/channels.py +1017 -0
- edda/compensation.py +22 -22
- edda/context.py +105 -52
- edda/integrations/opentelemetry/hooks.py +7 -2
- edda/locking.py +130 -67
- edda/replay.py +312 -82
- edda/storage/models.py +142 -24
- edda/storage/protocol.py +539 -118
- edda/storage/sqlalchemy_storage.py +1866 -328
- edda/viewer_ui/app.py +6 -1
- edda/viewer_ui/data_service.py +19 -22
- edda/workflow.py +43 -0
- {edda_framework-0.7.0.dist-info → edda_framework-0.9.0.dist-info}/METADATA +109 -9
- {edda_framework-0.7.0.dist-info → edda_framework-0.9.0.dist-info}/RECORD +19 -19
- edda/events.py +0 -505
- {edda_framework-0.7.0.dist-info → edda_framework-0.9.0.dist-info}/WHEEL +0 -0
- {edda_framework-0.7.0.dist-info → edda_framework-0.9.0.dist-info}/entry_points.txt +0 -0
- {edda_framework-0.7.0.dist-info → edda_framework-0.9.0.dist-info}/licenses/LICENSE +0 -0
edda/replay.py
CHANGED
@@ -13,9 +13,9 @@ import uuid
 from collections.abc import Callable
 from typing import Any

+from edda.channels import WaitForChannelMessageException, WaitForTimerException
 from edda.compensation import execute_compensations
 from edda.context import WorkflowContext
-from edda.events import WaitForEventException, WaitForTimerException
 from edda.locking import workflow_lock
 from edda.pydantic_utils import (
     enum_value_to_enum,
@@ -25,6 +25,7 @@ from edda.pydantic_utils import (
     to_json_dict,
 )
 from edda.storage.protocol import StorageProtocol
+from edda.workflow import RecurException

 logger = logging.getLogger(__name__)

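For applications upgrading from 0.7.x, this is the visible API move: the wait primitives now come from edda.channels, edda/events.py is deleted (see the file list above), and the event-oriented WaitForEventException is superseded by the channel-oriented WaitForChannelMessageException. A minimal sketch of the import change; the helper below is an assumption about application-level handling, only the import locations are confirmed by this diff:

# 0.7.x:
#   from edda.events import WaitForEventException, WaitForTimerException
# 0.9.x:
from edda.channels import WaitForChannelMessageException, WaitForTimerException

def is_wait_signal(exc: BaseException) -> bool:
    # True when a workflow suspended itself (timer or channel wait) rather than failed.
    return isinstance(exc, (WaitForChannelMessageException, WaitForTimerException))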
@@ -243,9 +244,9 @@ class ReplayEngine:

             return instance_id

-        except
-            # Workflow is waiting for
-            # Before marking as
+        except WaitForTimerException as exc:
+            # Workflow is waiting for a timer
+            # Before marking as waiting_for_timer, check if workflow was cancelled
             instance = await ctx.storage.get_instance(instance_id)
             if instance and instance.get("status") == "cancelled":
                 from edda.exceptions import WorkflowCancelledException
@@ -254,30 +255,26 @@
                     f"Workflow {instance_id} was cancelled"
                 ) from None

-            # Atomically register
+            # Atomically register timer subscription and release lock (distributed coroutines)
             # This ensures subscription is registered and lock is released in a single transaction
-            # so ANY worker can resume the workflow when the
-            from
-
-
-            if exc.timeout_seconds is not None:
-                timeout_at = datetime.now(UTC) + timedelta(seconds=exc.timeout_seconds)
-
-            await self.storage.register_event_subscription_and_release_lock(
+            # so ANY worker can resume the workflow when the timer expires
+            # Use the expires_at from the exception (calculated at wait_timer() call time)
+            # This ensures deterministic replay: the timer expiration time never changes
+            await self.storage.register_timer_subscription_and_release_lock(
                 instance_id=instance_id,
                 worker_id=self.worker_id,
-
-
+                timer_id=exc.timer_id,
+                expires_at=exc.expires_at,
                 activity_id=exc.activity_id,
             )

-            #
-
+            # Status is updated to 'waiting_for_timer' atomically
+            # by register_timer_subscription_and_release_lock()
             return instance_id

-        except
-            # Workflow is waiting for a
-            # Before marking as
+        except WaitForChannelMessageException as exc:
+            # Workflow is waiting for a message on a channel
+            # Before marking as waiting_for_message, check if workflow was cancelled
             instance = await ctx.storage.get_instance(instance_id)
             if instance and instance.get("status") == "cancelled":
                 from edda.exceptions import WorkflowCancelledException
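The functional change in this hunk: in 0.7.x the replay engine recomputed the wake-up time at subscription time (datetime.now(UTC) + timedelta(...)), so a replay could move the deadline; in 0.9.x the exception carries an expires_at fixed when wait_timer() first ran, and the engine only forwards it. A minimal sketch of the difference; the WaitForTimerException attributes are inferred from how the handler above reads them, not taken from edda's source:

from datetime import UTC, datetime, timedelta

def old_style_deadline(timeout_seconds: float) -> datetime:
    # 0.7.x behaviour (removed above): recomputed on every (re-)registration,
    # so the deadline drifted whenever the workflow was replayed.
    return datetime.now(UTC) + timedelta(seconds=timeout_seconds)

def new_style_deadline(exc) -> datetime:
    # 0.9.x behaviour (sketch): forward the value captured at the original
    # wait_timer() call; replaying the workflow cannot move the deadline.
    return exc.expires_at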
@@ -286,35 +283,70 @@
                     f"Workflow {instance_id} was cancelled"
                 ) from None

-            # Atomically register
+            # Atomically register channel receive and release lock (distributed coroutines)
             # This ensures subscription is registered and lock is released in a single transaction
-            # so ANY worker can resume the workflow when the
-
-            # This ensures deterministic replay: the timer expiration time never changes
-            await self.storage.register_timer_subscription_and_release_lock(
+            # so ANY worker can resume the workflow when the message arrives
+            await self.storage.register_channel_receive_and_release_lock(
                 instance_id=instance_id,
                 worker_id=self.worker_id,
-
-                expires_at=exc.expires_at,
+                channel=exc.channel,
                 activity_id=exc.activity_id,
+                timeout_seconds=exc.timeout_seconds,
             )

-            # Status is updated to '
-            # by
+            # Status is updated to 'waiting_for_message' atomically
+            # by register_channel_receive_and_release_lock()
             return instance_id

+        except RecurException as exc:
+            # Workflow is recurring (Erlang-style tail recursion pattern)
+            # This resets history growth in long-running loops by:
+            # 1. Completing the current instance (marking as "recurred")
+            # 2. Archiving the current history
+            # 3. Cleaning up subscriptions
+            # 4. Starting a new instance with the provided arguments
+            # 5. Linking new instance to old via `continued_from`
+
+            logger.info(f"Workflow {instance_id} recurring with args: {exc.kwargs}")
+
+            # Mark current workflow as "recurred"
+            await ctx._update_status("recurred", {"recur_kwargs": exc.kwargs})
+
+            # Archive history (move to archive table)
+            archived_count = await self.storage.archive_history(instance_id)
+            logger.info(f"Archived {archived_count} history entries for {instance_id}")
+
+            # Clean up all subscriptions (event/timer/message)
+            # This prevents old subscriptions from receiving events meant for the new instance
+            await self.storage.cleanup_instance_subscriptions(instance_id)
+
+            # Clear compensations (fresh start)
+            await ctx._clear_compensations()
+
+            # Create and start a new workflow instance
+            new_instance_id = await self._start_recurred_workflow(
+                workflow_name=workflow_name,
+                workflow_func=workflow_func,
+                input_data=exc.kwargs,
+                continued_from=instance_id,
+                lock_timeout_seconds=lock_timeout_seconds,
+            )
+
+            logger.info(f"Workflow {instance_id} recurred to {new_instance_id}")
+            return new_instance_id
+
         except Exception as error:
             # Check if this is a cancellation exception
             from edda.exceptions import WorkflowCancelledException

             if isinstance(error, WorkflowCancelledException):
                 # Workflow was cancelled during execution
-
+                logger.info("Workflow %s was cancelled during execution", instance_id)

                 # Execute compensations (idempotent - already executed ones will be skipped)
                 # This ensures all compensations are executed, even if some were already
                 # executed by cancel_workflow() in a concurrent process
-
+                logger.debug("Executing compensations for %s", instance_id)
                 await execute_compensations(ctx)

                 # Ensure status is "cancelled"
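The new RecurException branch gives edda an Erlang-style continue-as-new: instead of looping forever inside one instance and growing its history without bound, a workflow finishes the current instance (status "recurred", history archived, subscriptions cleaned up) and restarts with fresh arguments linked via continued_from. A self-contained toy illustrating that control flow; RecurExceptionDemo stands in for edda.workflow.RecurException, and everything else here is illustrative, not edda's API:

import asyncio

class RecurExceptionDemo(Exception):
    def __init__(self, **kwargs):
        self.kwargs = kwargs

async def workflow(ctx_unused, *, remaining: int):
    if remaining == 0:
        return "done"
    # Equivalent of requesting a recur: abandon this run, restart with new args.
    raise RecurExceptionDemo(remaining=remaining - 1)

async def run_with_recur(workflow_func, **input_data):
    generation = 0
    while True:
        try:
            return await workflow_func(None, **input_data), generation
        except RecurExceptionDemo as exc:
            # Mirrors the handler above: finish this "instance", then start a
            # fresh one whose input is exc.kwargs (continued_from link elided).
            input_data = exc.kwargs
            generation += 1

print(asyncio.run(run_with_recur(workflow, remaining=3)))  # ('done', 3)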
@@ -448,26 +480,6 @@
             # Mark as completed
             await ctx._update_status("completed", {"result": result_dict})

-        except WaitForEventException as exc:
-            # Workflow is waiting for an event (again)
-            # Atomically register event subscription and release lock (distributed coroutines)
-            from datetime import UTC, datetime, timedelta
-
-            timeout_at = None
-            if exc.timeout_seconds is not None:
-                timeout_at = datetime.now(UTC) + timedelta(seconds=exc.timeout_seconds)
-
-            await self.storage.register_event_subscription_and_release_lock(
-                instance_id=instance_id,
-                worker_id=self.worker_id,
-                event_type=exc.event_type,
-                timeout_at=timeout_at,
-                activity_id=exc.activity_id,
-            )
-
-            # Update status to waiting_for_event
-            await ctx._update_status("waiting_for_event")
-
         except WaitForTimerException as exc:
             # Workflow is waiting for a timer (again)
             # Atomically register timer subscription and release lock (distributed coroutines)
@@ -484,18 +496,70 @@
             # Status is updated to 'waiting_for_timer' atomically
             # by register_timer_subscription_and_release_lock()

+        except WaitForChannelMessageException as exc:
+            # Workflow is waiting for a message on a channel (again)
+            # Atomically register channel receive and release lock (distributed coroutines)
+            await self.storage.register_channel_receive_and_release_lock(
+                instance_id=instance_id,
+                worker_id=self.worker_id,
+                channel=exc.channel,
+                activity_id=exc.activity_id,
+                timeout_seconds=exc.timeout_seconds,
+            )
+
+            # Status is updated to 'waiting_for_message' atomically
+            # by register_channel_receive_and_release_lock()
+
+        except RecurException as exc:
+            # Workflow is recurring (Erlang-style tail recursion pattern)
+            # This resets history growth in long-running loops by:
+            # 1. Completing the current instance (marking as "recurred")
+            # 2. Archiving the current history
+            # 3. Cleaning up subscriptions
+            # 4. Starting a new instance with the provided arguments
+            # 5. Linking new instance to old via `continued_from`
+
+            logger.info(f"Workflow {instance_id} recurring with args: {exc.kwargs}")
+
+            # Mark current workflow as "recurred"
+            await ctx._update_status("recurred", {"recur_kwargs": exc.kwargs})
+
+            # Archive history (move to archive table)
+            archived_count = await self.storage.archive_history(instance_id)
+            logger.info(f"Archived {archived_count} history entries for {instance_id}")
+
+            # Clean up all subscriptions (event/timer/message)
+            # This prevents old subscriptions from receiving events meant for the new instance
+            await self.storage.cleanup_instance_subscriptions(instance_id)
+
+            # Clear compensations (fresh start)
+            await ctx._clear_compensations()
+
+            # Create and start a new workflow instance
+            # Note: we don't return the new instance_id here since _execute_workflow_logic returns None
+            # The new workflow will execute in its own context
+            await self._start_recurred_workflow(
+                workflow_name=instance["workflow_name"],
+                workflow_func=workflow_func,
+                input_data=exc.kwargs,
+                continued_from=instance_id,
+                lock_timeout_seconds=instance.get("lock_timeout_seconds"),
+            )
+
+            logger.info(f"Workflow {instance_id} recurred successfully")
+
         except Exception as error:
             # Check if this is a cancellation exception
             from edda.exceptions import WorkflowCancelledException

             if isinstance(error, WorkflowCancelledException):
                 # Workflow was cancelled during execution
-
+                logger.info("Workflow %s was cancelled during execution", instance_id)

                 # Execute compensations (idempotent - already executed ones will be skipped)
                 # This ensures all compensations are executed, even if some were already
                 # executed by cancel_workflow() in a concurrent process
-
+                logger.debug("Executing compensations for %s", instance_id)
                 await execute_compensations(ctx)

                 # Ensure status is "cancelled"
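Both channel-wait handlers consume the same attributes from WaitForChannelMessageException. A descriptive stand-in for the payload that reaches the storage layer, with field meanings inferred from the register_channel_receive_and_release_lock(...) calls above; the real class in edda.channels may carry more:

from dataclasses import dataclass

@dataclass
class ChannelWaitRequest:
    # Inferred shape only, not a class defined by edda.
    instance_id: str               # workflow instance going to sleep
    worker_id: str                 # worker releasing the distributed lock
    channel: str                   # exc.channel - channel name to subscribe to
    activity_id: str               # exc.activity_id - replay position of the receive
    timeout_seconds: float | None  # exc.timeout_seconds - optional receive timeout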
@@ -560,6 +624,171 @@
             instance_id=instance_id, workflow_func=workflow_obj.func, already_locked=already_locked
         )

+    async def _start_recurred_workflow(
+        self,
+        workflow_name: str,
+        workflow_func: Callable[..., Any],
+        input_data: dict[str, Any],
+        continued_from: str,
+        lock_timeout_seconds: int | None = None,
+    ) -> str:
+        """
+        Start a new workflow instance as a recurrence of an existing workflow.
+
+        This is an internal helper method used by the RecurException handler.
+        It creates a new workflow instance linked to the previous one via continued_from.
+
+        Args:
+            workflow_name: Name of the workflow
+            workflow_func: The workflow function to execute
+            input_data: Input parameters for the workflow (from recur() kwargs)
+            continued_from: Instance ID of the workflow that is recurring
+            lock_timeout_seconds: Lock timeout for this workflow (None = global default 300s)
+
+        Returns:
+            Instance ID of the new workflow
+        """
+        # Generate new instance ID
+        new_instance_id = f"{workflow_name}-{uuid.uuid4().hex}"
+
+        # Extract source code for visualization
+        try:
+            source_code = inspect.getsource(workflow_func)
+        except (OSError, TypeError) as e:
+            logger.warning(
+                f"Could not extract source code for workflow '{workflow_name}': {e}. "
+                "Hybrid diagram visualization will not be available."
+            )
+            source_code = f"# Source code not available\n# Workflow: {workflow_name}\n# Error: {e}"
+
+        # Calculate source code hash
+        source_hash = hashlib.sha256(source_code.encode("utf-8")).hexdigest()
+
+        # Store workflow definition (idempotent)
+        await self.storage.upsert_workflow_definition(
+            workflow_name=workflow_name,
+            source_hash=source_hash,
+            source_code=source_code,
+        )
+
+        # Create workflow instance in storage with continued_from reference
+        await self.storage.create_instance(
+            instance_id=new_instance_id,
+            workflow_name=workflow_name,
+            source_hash=source_hash,
+            owner_service=self.service_name,
+            input_data=input_data,
+            lock_timeout_seconds=lock_timeout_seconds,
+            continued_from=continued_from,
+        )
+
+        # Execute the new workflow with distributed lock
+        async with workflow_lock(self.storage, new_instance_id, self.worker_id):
+            # Create context for new execution
+            ctx = WorkflowContext(
+                instance_id=new_instance_id,
+                workflow_name=workflow_name,
+                storage=self.storage,
+                worker_id=self.worker_id,
+                is_replaying=False,
+                hooks=self.hooks,
+            )
+            # Set default retry policy for activity resolution
+            ctx._app_retry_policy = self.default_retry_policy
+
+            try:
+                # Call hook: workflow start
+                if self.hooks and hasattr(self.hooks, "on_workflow_start"):
+                    await self.hooks.on_workflow_start(new_instance_id, workflow_name, input_data)
+
+                # Prepare input: convert JSON dicts to Pydantic models based on type hints
+                processed_input = self._prepare_workflow_input(workflow_func, input_data)
+
+                # Execute workflow function
+                result = await workflow_func(ctx, **processed_input)
+
+                # Convert Pydantic model result to JSON dict for storage
+                result_dict = to_json_dict(result)
+
+                # Mark as completed
+                await ctx._update_status("completed", {"result": result_dict})
+
+                # Call hook: workflow complete
+                if self.hooks and hasattr(self.hooks, "on_workflow_complete"):
+                    await self.hooks.on_workflow_complete(new_instance_id, workflow_name, result)
+
+                return new_instance_id
+
+            except WaitForTimerException as exc:
+                # Workflow is waiting for a timer
+                await self.storage.register_timer_subscription_and_release_lock(
+                    instance_id=new_instance_id,
+                    worker_id=self.worker_id,
+                    timer_id=exc.timer_id,
+                    expires_at=exc.expires_at,
+                    activity_id=exc.activity_id,
+                )
+                return new_instance_id
+
+            except WaitForChannelMessageException as exc:
+                # Workflow is waiting for a message
+                await self.storage.register_channel_receive_and_release_lock(
+                    instance_id=new_instance_id,
+                    worker_id=self.worker_id,
+                    channel=exc.channel,
+                    activity_id=exc.activity_id,
+                    timeout_seconds=exc.timeout_seconds,
+                )
+                return new_instance_id
+
+            except RecurException as exc:
+                # Recur again immediately (nested recur)
+                logger.info(
+                    f"Workflow {new_instance_id} recurring immediately with args: {exc.kwargs}"
+                )
+
+                await ctx._update_status("recurred", {"recur_kwargs": exc.kwargs})
+                archived_count = await self.storage.archive_history(new_instance_id)
+                logger.info(f"Archived {archived_count} history entries for {new_instance_id}")
+
+                # Clean up all subscriptions (event/timer/message)
+                await self.storage.cleanup_instance_subscriptions(new_instance_id)
+
+                await ctx._clear_compensations()
+
+                # Recursively start another recurred workflow
+                return await self._start_recurred_workflow(
+                    workflow_name=workflow_name,
+                    workflow_func=workflow_func,
+                    input_data=exc.kwargs,
+                    continued_from=new_instance_id,
+                    lock_timeout_seconds=lock_timeout_seconds,
+                )
+
+            except Exception as error:
+                # Execute compensations before marking as failed
+                await execute_compensations(ctx)
+
+                import traceback
+
+                stack_trace = "".join(
+                    traceback.format_exception(type(error), error, error.__traceback__)
+                )
+
+                await ctx._update_status(
+                    "failed",
+                    {
+                        "error_message": str(error),
+                        "error_type": type(error).__name__,
+                        "stack_trace": stack_trace,
+                    },
+                )
+
+                if self.hooks and hasattr(self.hooks, "on_workflow_failed"):
+                    await self.hooks.on_workflow_failed(new_instance_id, workflow_name, error)
+
+                raise
+
     async def execute_with_lock(
         self,
         instance_id: str,
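_start_recurred_workflow links each new instance to its predecessor through continued_from, so a long-running recurring workflow leaves a chain of archived instances behind. A hedged sketch of walking that chain back to the original instance; it assumes the mapping returned by storage.get_instance() exposes the "continued_from" value that create_instance() stores, which this diff does not show:

async def recurrence_chain(storage, instance_id: str) -> list[str]:
    """Follow continued_from links from the newest instance back to the first.

    Assumption: get_instance() returns a dict containing "continued_from";
    only create_instance(..., continued_from=...) is visible in this diff.
    """
    chain = [instance_id]
    current = await storage.get_instance(instance_id)
    while current is not None and current.get("continued_from"):
        parent_id = current["continued_from"]
        chain.append(parent_id)
        current = await storage.get_instance(parent_id)
    return chain  # newest first, original instance last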
@@ -640,7 +869,12 @@
         current_status = instance["status"]

         # Only cancel running or waiting workflows
-        if current_status not in (
+        if current_status not in (
+            "running",
+            "waiting_for_event",
+            "waiting_for_timer",
+            "waiting_for_message",
+        ):
             return False

         # Try to acquire lock with short timeout (5 seconds)
@@ -659,10 +893,10 @@

         try:
             # Re-fetch instance data AFTER acquiring lock
-
+            logger.debug("Fetching instance data for %s", instance_id)
             instance_locked = await self.storage.get_instance(instance_id)
             if instance_locked is None:
-
+                logger.warning("Instance %s not found after lock acquisition", instance_id)
                 return False

             # Create context for compensation execution
@@ -678,7 +912,7 @@
             ctx._app_retry_policy = self.default_retry_policy

             # Execute compensations to clean up
-
+            logger.debug("Executing compensations for %s", instance_id)
             await execute_compensations(ctx)

             # Mark as cancelled in storage
@@ -692,10 +926,7 @@

         except Exception as error:
             # Log error but don't propagate
-
-
-            print(f"[Cancel] Error cancelling workflow {instance_id}: {error}")
-            traceback.print_exc()
+            logger.error("Error cancelling workflow %s: %s", instance_id, error, exc_info=True)
             return False

     async def resume_compensating_workflow(self, instance_id: str) -> bool:
@@ -712,7 +943,7 @@
         Returns:
             True if compensations completed successfully, False otherwise
         """
-
+        logger.info("Starting compensation recovery for %s", instance_id)

         try:
             # Acquire lock
@@ -723,21 +954,23 @@
             )

             if not locked:
-
+                logger.debug("Could not acquire lock for %s", instance_id)
                 return False

             try:
                 # Get instance data
                 instance = await self.storage.get_instance(instance_id)
                 if instance is None:
-
+                    logger.warning("Instance %s not found", instance_id)
                     return False

                 # Check current status
                 current_status = instance["status"]
                 if current_status != "compensating":
-
-
+                    logger.debug(
+                        "Instance %s is not in compensating state (status=%s)",
+                        instance_id,
+                        current_status,
                     )
                     return False

@@ -745,15 +978,12 @@
                 # If we can't determine, default to "failed"
                 target_status = "failed"

-                # Check history for cancellation markers
-
-
-
-                    if event_type == "WorkflowCancelled" or "cancel" in event_type.lower():
-                        target_status = "cancelled"
-                        break
+                # Check history for cancellation markers (LIMIT 1 optimization)
+                cancellation_event = await self.storage.find_first_cancellation_event(instance_id)
+                if cancellation_event is not None:
+                    target_status = "cancelled"

-
+                logger.debug("Target status after compensation: %s", target_status)

                 # Create context for compensation execution
                 ctx = WorkflowContext(
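The crash-recovery path no longer scans the whole history in Python to find a cancellation marker; it asks the storage backend for the first match with a LIMIT 1 query via find_first_cancellation_event(). A sketch of what that query could look like in the SQLAlchemy backend; the table and column names here are assumptions, not taken from edda/storage/sqlalchemy_storage.py:

from sqlalchemy import or_, select

async def find_first_cancellation_event_sketch(session, history_table, instance_id: str):
    # history_table is assumed to expose instance_id / event_type / id columns.
    stmt = (
        select(history_table)
        .where(history_table.c.instance_id == instance_id)
        .where(
            or_(
                history_table.c.event_type == "WorkflowCancelled",
                history_table.c.event_type.ilike("%cancel%"),
            )
        )
        .order_by(history_table.c.id)
        .limit(1)  # the "LIMIT 1 optimization" called out in the comment above
    )
    result = await session.execute(stmt)
    return result.first()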
@@ -768,18 +998,18 @@
                 ctx._app_retry_policy = self.default_retry_policy

                 # Re-execute compensations (idempotent - skips already executed)
-
+                logger.debug("Re-executing compensations for %s", instance_id)
                 await execute_compensations(ctx)

                 # Mark with target status
                 if target_status == "cancelled":
                     success = await self.storage.cancel_instance(instance_id, "crash_recovery")
-
+                    logger.info("Marked %s as cancelled", instance_id)
                 else:
                     await ctx._update_status(
                         "failed", {"error": "Workflow failed before compensation"}
                     )
-
+                    logger.info("Marked %s as failed", instance_id)
                     success = True

                 return success
@@ -790,10 +1020,10 @@

         except Exception as error:
             # Log error but don't propagate
-
-
-
-
+            logger.error(
+                "Error resuming compensating workflow %s: %s",
+                instance_id,
+                error,
+                exc_info=True,
             )
-            traceback.print_exc()
             return False
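A recurring theme in 0.9.0 is replacing print(...) and traceback.print_exc() with the module logger, so cancellation and compensation-recovery diagnostics now flow through standard logging. A minimal, assumed application-side configuration to surface them (standard library only; the "edda.replay" logger name follows from logging.getLogger(__name__) in edda/replay.py):

import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
# Raise verbosity for the replay engine to also see the new debug() calls.
logging.getLogger("edda.replay").setLevel(logging.DEBUG)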