edda-framework 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
edda/replay.py CHANGED
@@ -13,9 +13,9 @@ import uuid
 from collections.abc import Callable
 from typing import Any
 
+from edda.channels import WaitForChannelMessageException, WaitForTimerException
 from edda.compensation import execute_compensations
 from edda.context import WorkflowContext
-from edda.events import WaitForEventException, WaitForTimerException
 from edda.locking import workflow_lock
 from edda.pydantic_utils import (
     enum_value_to_enum,
@@ -25,6 +25,7 @@ from edda.pydantic_utils import (
     to_json_dict,
 )
 from edda.storage.protocol import StorageProtocol
+from edda.workflow import RecurException
 
 logger = logging.getLogger(__name__)
 
@@ -243,9 +244,9 @@ class ReplayEngine:
 
            return instance_id
 
-        except WaitForEventException as exc:
-            # Workflow is waiting for an event
-            # Before marking as waiting_for_event, check if workflow was cancelled
+        except WaitForTimerException as exc:
+            # Workflow is waiting for a timer
+            # Before marking as waiting_for_timer, check if workflow was cancelled
            instance = await ctx.storage.get_instance(instance_id)
            if instance and instance.get("status") == "cancelled":
                from edda.exceptions import WorkflowCancelledException
@@ -254,30 +255,26 @@ class ReplayEngine:
                    f"Workflow {instance_id} was cancelled"
                ) from None
 
-            # Atomically register event subscription and release lock (distributed coroutines)
+            # Atomically register timer subscription and release lock (distributed coroutines)
            # This ensures subscription is registered and lock is released in a single transaction
-            # so ANY worker can resume the workflow when the event arrives
-            from datetime import UTC, datetime, timedelta
-
-            timeout_at = None
-            if exc.timeout_seconds is not None:
-                timeout_at = datetime.now(UTC) + timedelta(seconds=exc.timeout_seconds)
-
-            await self.storage.register_event_subscription_and_release_lock(
+            # so ANY worker can resume the workflow when the timer expires
+            # Use the expires_at from the exception (calculated at wait_timer() call time)
+            # This ensures deterministic replay: the timer expiration time never changes
+            await self.storage.register_timer_subscription_and_release_lock(
                instance_id=instance_id,
                worker_id=self.worker_id,
-                event_type=exc.event_type,
-                timeout_at=timeout_at,
+                timer_id=exc.timer_id,
+                expires_at=exc.expires_at,
                activity_id=exc.activity_id,
            )
 
-            # Update status to waiting_for_event
-            await ctx._update_status("waiting_for_event")
+            # Status is updated to 'waiting_for_timer' atomically
+            # by register_timer_subscription_and_release_lock()
            return instance_id
 
-        except WaitForTimerException as exc:
-            # Workflow is waiting for a timer
-            # Before marking as waiting_for_timer, check if workflow was cancelled
+        except WaitForChannelMessageException as exc:
+            # Workflow is waiting for a message on a channel
+            # Before marking as waiting_for_message, check if workflow was cancelled
            instance = await ctx.storage.get_instance(instance_id)
            if instance and instance.get("status") == "cancelled":
                from edda.exceptions import WorkflowCancelledException
@@ -286,35 +283,70 @@ class ReplayEngine:
                    f"Workflow {instance_id} was cancelled"
                ) from None
 
-            # Atomically register timer subscription and release lock (distributed coroutines)
+            # Atomically register channel receive and release lock (distributed coroutines)
            # This ensures subscription is registered and lock is released in a single transaction
-            # so ANY worker can resume the workflow when the timer expires
-            # Use the expires_at from the exception (calculated at wait_timer() call time)
-            # This ensures deterministic replay: the timer expiration time never changes
-            await self.storage.register_timer_subscription_and_release_lock(
+            # so ANY worker can resume the workflow when the message arrives
+            await self.storage.register_channel_receive_and_release_lock(
                instance_id=instance_id,
                worker_id=self.worker_id,
-                timer_id=exc.timer_id,
-                expires_at=exc.expires_at,
+                channel=exc.channel,
                activity_id=exc.activity_id,
+                timeout_seconds=exc.timeout_seconds,
            )
 
-            # Status is updated to 'waiting_for_timer' atomically
-            # by register_timer_subscription_and_release_lock()
+            # Status is updated to 'waiting_for_message' atomically
+            # by register_channel_receive_and_release_lock()
            return instance_id
 
+        except RecurException as exc:
+            # Workflow is recurring (Erlang-style tail recursion pattern)
+            # This resets history growth in long-running loops by:
+            # 1. Completing the current instance (marking as "recurred")
+            # 2. Archiving the current history
+            # 3. Cleaning up subscriptions
+            # 4. Starting a new instance with the provided arguments
+            # 5. Linking new instance to old via `continued_from`
+
+            logger.info(f"Workflow {instance_id} recurring with args: {exc.kwargs}")
+
+            # Mark current workflow as "recurred"
+            await ctx._update_status("recurred", {"recur_kwargs": exc.kwargs})
+
+            # Archive history (move to archive table)
+            archived_count = await self.storage.archive_history(instance_id)
+            logger.info(f"Archived {archived_count} history entries for {instance_id}")
+
+            # Clean up all subscriptions (event/timer/message)
+            # This prevents old subscriptions from receiving events meant for the new instance
+            await self.storage.cleanup_instance_subscriptions(instance_id)
+
+            # Clear compensations (fresh start)
+            await ctx._clear_compensations()
+
+            # Create and start a new workflow instance
+            new_instance_id = await self._start_recurred_workflow(
+                workflow_name=workflow_name,
+                workflow_func=workflow_func,
+                input_data=exc.kwargs,
+                continued_from=instance_id,
+                lock_timeout_seconds=lock_timeout_seconds,
+            )
+
+            logger.info(f"Workflow {instance_id} recurred to {new_instance_id}")
+            return new_instance_id
+
        except Exception as error:
            # Check if this is a cancellation exception
            from edda.exceptions import WorkflowCancelledException
 
            if isinstance(error, WorkflowCancelledException):
                # Workflow was cancelled during execution
-                print(f"[Workflow] {instance_id} was cancelled during execution")
+                logger.info("Workflow %s was cancelled during execution", instance_id)
 
                # Execute compensations (idempotent - already executed ones will be skipped)
                # This ensures all compensations are executed, even if some were already
                # executed by cancel_workflow() in a concurrent process
-                print(f"[Workflow] Executing compensations for {instance_id}")
+                logger.debug("Executing compensations for %s", instance_id)
                await execute_compensations(ctx)
 
                # Ensure status is "cancelled"
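For illustration, the new RecurException branch amounts to an Erlang-style tail-recursion reset: the current instance is closed out as "recurred" and a fresh instance starts with exc.kwargs as its input, linked back via continued_from. The sketch below shows what a workflow using that pattern could look like; it is a guess at usage, not the documented edda API. Raising RecurException directly from workflow code and its kwargs= constructor argument are assumptions, since this diff only shows the engine reading exc.kwargs.

# Illustrative sketch only -- not the documented edda API. Assumes RecurException
# can be constructed with the next run's kwargs; the handler above only shows
# the engine reading them back from exc.kwargs.
from edda.context import WorkflowContext
from edda.workflow import RecurException


async def countdown(ctx: WorkflowContext, remaining: int) -> str:
    if remaining <= 0:
        return "done"
    # Hand the next iteration to a fresh instance instead of looping in-process:
    # the engine marks this instance "recurred", archives its history, clears
    # subscriptions and compensations, and starts a new instance whose input_data
    # is exactly these kwargs, linked back to this run via continued_from.
    raise RecurException(kwargs={"remaining": remaining - 1})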
@@ -448,26 +480,6 @@ class ReplayEngine:
            # Mark as completed
            await ctx._update_status("completed", {"result": result_dict})
 
-        except WaitForEventException as exc:
-            # Workflow is waiting for an event (again)
-            # Atomically register event subscription and release lock (distributed coroutines)
-            from datetime import UTC, datetime, timedelta
-
-            timeout_at = None
-            if exc.timeout_seconds is not None:
-                timeout_at = datetime.now(UTC) + timedelta(seconds=exc.timeout_seconds)
-
-            await self.storage.register_event_subscription_and_release_lock(
-                instance_id=instance_id,
-                worker_id=self.worker_id,
-                event_type=exc.event_type,
-                timeout_at=timeout_at,
-                activity_id=exc.activity_id,
-            )
-
-            # Update status to waiting_for_event
-            await ctx._update_status("waiting_for_event")
-
        except WaitForTimerException as exc:
            # Workflow is waiting for a timer (again)
            # Atomically register timer subscription and release lock (distributed coroutines)
@@ -484,18 +496,70 @@ class ReplayEngine:
            # Status is updated to 'waiting_for_timer' atomically
            # by register_timer_subscription_and_release_lock()
 
+        except WaitForChannelMessageException as exc:
+            # Workflow is waiting for a message on a channel (again)
+            # Atomically register channel receive and release lock (distributed coroutines)
+            await self.storage.register_channel_receive_and_release_lock(
+                instance_id=instance_id,
+                worker_id=self.worker_id,
+                channel=exc.channel,
+                activity_id=exc.activity_id,
+                timeout_seconds=exc.timeout_seconds,
+            )
+
+            # Status is updated to 'waiting_for_message' atomically
+            # by register_channel_receive_and_release_lock()
+
+        except RecurException as exc:
+            # Workflow is recurring (Erlang-style tail recursion pattern)
+            # This resets history growth in long-running loops by:
+            # 1. Completing the current instance (marking as "recurred")
+            # 2. Archiving the current history
+            # 3. Cleaning up subscriptions
+            # 4. Starting a new instance with the provided arguments
+            # 5. Linking new instance to old via `continued_from`
+
+            logger.info(f"Workflow {instance_id} recurring with args: {exc.kwargs}")
+
+            # Mark current workflow as "recurred"
+            await ctx._update_status("recurred", {"recur_kwargs": exc.kwargs})
+
+            # Archive history (move to archive table)
+            archived_count = await self.storage.archive_history(instance_id)
+            logger.info(f"Archived {archived_count} history entries for {instance_id}")
+
+            # Clean up all subscriptions (event/timer/message)
+            # This prevents old subscriptions from receiving events meant for the new instance
+            await self.storage.cleanup_instance_subscriptions(instance_id)
+
+            # Clear compensations (fresh start)
+            await ctx._clear_compensations()
+
+            # Create and start a new workflow instance
+            # Note: we don't return the new instance_id here since _execute_workflow_logic returns None
+            # The new workflow will execute in its own context
+            await self._start_recurred_workflow(
+                workflow_name=instance["workflow_name"],
+                workflow_func=workflow_func,
+                input_data=exc.kwargs,
+                continued_from=instance_id,
+                lock_timeout_seconds=instance.get("lock_timeout_seconds"),
+            )
+
+            logger.info(f"Workflow {instance_id} recurred successfully")
+
        except Exception as error:
            # Check if this is a cancellation exception
            from edda.exceptions import WorkflowCancelledException
 
            if isinstance(error, WorkflowCancelledException):
                # Workflow was cancelled during execution
-                print(f"[Workflow] {instance_id} was cancelled during execution")
+                logger.info("Workflow %s was cancelled during execution", instance_id)
 
                # Execute compensations (idempotent - already executed ones will be skipped)
                # This ensures all compensations are executed, even if some were already
                # executed by cancel_workflow() in a concurrent process
-                print(f"[Workflow] Executing compensations for {instance_id}")
+                logger.debug("Executing compensations for %s", instance_id)
                await execute_compensations(ctx)
 
                # Ensure status is "cancelled"
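Both wait handlers lean on the storage layer performing three things in one transaction: record the subscription, flip the instance status, and release the distributed lock, so a crash between those steps cannot strand the workflow. The sketch below is a conceptual illustration of that contract for the channel case only; the table and column names are invented here, an asyncpg-style connection is assumed, and the real implementation lives in edda.storage outside this diff.

# Conceptual sketch of the atomic "register + set status + release lock" step
# described in the comments above. Not edda's implementation: table and column
# names are assumptions, and an asyncpg-style connection is assumed.
async def register_channel_receive_and_release_lock_sketch(
    conn, *, instance_id: str, worker_id: str, channel: str,
    activity_id: str, timeout_seconds: float | None,
) -> None:
    async with conn.transaction():  # both writes commit or roll back together
        await conn.execute(
            "INSERT INTO channel_subscriptions"
            " (instance_id, channel, activity_id, timeout_seconds)"
            " VALUES ($1, $2, $3, $4)",
            instance_id, channel, activity_id, timeout_seconds,
        )
        await conn.execute(
            "UPDATE workflow_instances SET status = 'waiting_for_message',"
            " locked_by = NULL WHERE instance_id = $1 AND locked_by = $2",
            instance_id, worker_id,
        )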
@@ -560,6 +624,171 @@ class ReplayEngine:
            instance_id=instance_id, workflow_func=workflow_obj.func, already_locked=already_locked
        )
 
+    async def _start_recurred_workflow(
+        self,
+        workflow_name: str,
+        workflow_func: Callable[..., Any],
+        input_data: dict[str, Any],
+        continued_from: str,
+        lock_timeout_seconds: int | None = None,
+    ) -> str:
+        """
+        Start a new workflow instance as a recurrence of an existing workflow.
+
+        This is an internal helper method used by the RecurException handler.
+        It creates a new workflow instance linked to the previous one via continued_from.
+
+        Args:
+            workflow_name: Name of the workflow
+            workflow_func: The workflow function to execute
+            input_data: Input parameters for the workflow (from recur() kwargs)
+            continued_from: Instance ID of the workflow that is recurring
+            lock_timeout_seconds: Lock timeout for this workflow (None = global default 300s)
+
+        Returns:
+            Instance ID of the new workflow
+        """
+        # Generate new instance ID
+        new_instance_id = f"{workflow_name}-{uuid.uuid4().hex}"
+
+        # Extract source code for visualization
+        try:
+            source_code = inspect.getsource(workflow_func)
+        except (OSError, TypeError) as e:
+            logger.warning(
+                f"Could not extract source code for workflow '{workflow_name}': {e}. "
+                "Hybrid diagram visualization will not be available."
+            )
+            source_code = f"# Source code not available\n# Workflow: {workflow_name}\n# Error: {e}"
+
+        # Calculate source code hash
+        source_hash = hashlib.sha256(source_code.encode("utf-8")).hexdigest()
+
+        # Store workflow definition (idempotent)
+        await self.storage.upsert_workflow_definition(
+            workflow_name=workflow_name,
+            source_hash=source_hash,
+            source_code=source_code,
+        )
+
+        # Create workflow instance in storage with continued_from reference
+        await self.storage.create_instance(
+            instance_id=new_instance_id,
+            workflow_name=workflow_name,
+            source_hash=source_hash,
+            owner_service=self.service_name,
+            input_data=input_data,
+            lock_timeout_seconds=lock_timeout_seconds,
+            continued_from=continued_from,
+        )
+
+        # Execute the new workflow with distributed lock
+        async with workflow_lock(self.storage, new_instance_id, self.worker_id):
+            # Create context for new execution
+            ctx = WorkflowContext(
+                instance_id=new_instance_id,
+                workflow_name=workflow_name,
+                storage=self.storage,
+                worker_id=self.worker_id,
+                is_replaying=False,
+                hooks=self.hooks,
+            )
+            # Set default retry policy for activity resolution
+            ctx._app_retry_policy = self.default_retry_policy
+
+            try:
+                # Call hook: workflow start
+                if self.hooks and hasattr(self.hooks, "on_workflow_start"):
+                    await self.hooks.on_workflow_start(new_instance_id, workflow_name, input_data)
+
+                # Prepare input: convert JSON dicts to Pydantic models based on type hints
+                processed_input = self._prepare_workflow_input(workflow_func, input_data)
+
+                # Execute workflow function
+                result = await workflow_func(ctx, **processed_input)
+
+                # Convert Pydantic model result to JSON dict for storage
+                result_dict = to_json_dict(result)
+
+                # Mark as completed
+                await ctx._update_status("completed", {"result": result_dict})
+
+                # Call hook: workflow complete
+                if self.hooks and hasattr(self.hooks, "on_workflow_complete"):
+                    await self.hooks.on_workflow_complete(new_instance_id, workflow_name, result)
+
+                return new_instance_id
+
+            except WaitForTimerException as exc:
+                # Workflow is waiting for a timer
+                await self.storage.register_timer_subscription_and_release_lock(
+                    instance_id=new_instance_id,
+                    worker_id=self.worker_id,
+                    timer_id=exc.timer_id,
+                    expires_at=exc.expires_at,
+                    activity_id=exc.activity_id,
+                )
+                return new_instance_id
+
+            except WaitForChannelMessageException as exc:
+                # Workflow is waiting for a message
+                await self.storage.register_channel_receive_and_release_lock(
+                    instance_id=new_instance_id,
+                    worker_id=self.worker_id,
+                    channel=exc.channel,
+                    activity_id=exc.activity_id,
+                    timeout_seconds=exc.timeout_seconds,
+                )
+                return new_instance_id
+
+            except RecurException as exc:
+                # Recur again immediately (nested recur)
+                logger.info(
+                    f"Workflow {new_instance_id} recurring immediately with args: {exc.kwargs}"
+                )
+
+                await ctx._update_status("recurred", {"recur_kwargs": exc.kwargs})
+                archived_count = await self.storage.archive_history(new_instance_id)
+                logger.info(f"Archived {archived_count} history entries for {new_instance_id}")
+
+                # Clean up all subscriptions (event/timer/message)
+                await self.storage.cleanup_instance_subscriptions(new_instance_id)
+
+                await ctx._clear_compensations()
+
+                # Recursively start another recurred workflow
+                return await self._start_recurred_workflow(
+                    workflow_name=workflow_name,
+                    workflow_func=workflow_func,
+                    input_data=exc.kwargs,
+                    continued_from=new_instance_id,
+                    lock_timeout_seconds=lock_timeout_seconds,
+                )
+
+            except Exception as error:
+                # Execute compensations before marking as failed
+                await execute_compensations(ctx)
+
+                import traceback
+
+                stack_trace = "".join(
+                    traceback.format_exception(type(error), error, error.__traceback__)
+                )
+
+                await ctx._update_status(
+                    "failed",
+                    {
+                        "error_message": str(error),
+                        "error_type": type(error).__name__,
+                        "stack_trace": stack_trace,
+                    },
+                )
+
+                if self.hooks and hasattr(self.hooks, "on_workflow_failed"):
+                    await self.hooks.on_workflow_failed(new_instance_id, workflow_name, error)
+
+                raise
+
    async def execute_with_lock(
        self,
        instance_id: str,
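Because every recurrence passes continued_from to create_instance(), recurred runs form a linked chain in storage. As a usage illustration, a helper like the one below could walk that chain back to the original run; it assumes the instance dict returned by get_instance() exposes the continued_from value that create_instance() writes, which this diff does not show.

# Hypothetical helper, not part of edda: walk a recurred workflow's lineage.
# Assumes get_instance() returns a dict carrying the continued_from value
# written by create_instance() (not confirmed by this diff).
async def lineage(storage, instance_id: str) -> list[str]:
    chain = [instance_id]
    current = await storage.get_instance(instance_id)
    while current and current.get("continued_from"):
        parent_id = current["continued_from"]
        chain.append(parent_id)
        current = await storage.get_instance(parent_id)
    return chain  # newest instance first, the original run last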
@@ -640,7 +869,12 @@ class ReplayEngine:
        current_status = instance["status"]
 
        # Only cancel running or waiting workflows
-        if current_status not in ("running", "waiting_for_event", "waiting_for_timer"):
+        if current_status not in (
+            "running",
+            "waiting_for_event",
+            "waiting_for_timer",
+            "waiting_for_message",
+        ):
            return False
 
        # Try to acquire lock with short timeout (5 seconds)
@@ -659,10 +893,10 @@ class ReplayEngine:
 
        try:
            # Re-fetch instance data AFTER acquiring lock
-            print(f"[Cancel] Fetching instance data for {instance_id}")
+            logger.debug("Fetching instance data for %s", instance_id)
            instance_locked = await self.storage.get_instance(instance_id)
            if instance_locked is None:
-                print(f"[Cancel] Instance {instance_id} not found after lock acquisition")
+                logger.warning("Instance %s not found after lock acquisition", instance_id)
                return False
 
            # Create context for compensation execution
@@ -678,7 +912,7 @@ class ReplayEngine:
            ctx._app_retry_policy = self.default_retry_policy
 
            # Execute compensations to clean up
-            print(f"[Cancel] Executing compensations for {instance_id}")
+            logger.debug("Executing compensations for %s", instance_id)
            await execute_compensations(ctx)
 
            # Mark as cancelled in storage
@@ -692,10 +926,7 @@ class ReplayEngine:
 
        except Exception as error:
            # Log error but don't propagate
-            import traceback
-
-            print(f"[Cancel] Error cancelling workflow {instance_id}: {error}")
-            traceback.print_exc()
+            logger.error("Error cancelling workflow %s: %s", instance_id, error, exc_info=True)
            return False
 
    async def resume_compensating_workflow(self, instance_id: str) -> bool:
@@ -712,7 +943,7 @@ class ReplayEngine:
        Returns:
            True if compensations completed successfully, False otherwise
        """
-        print(f"[ResumeCompensating] Starting compensation recovery for {instance_id}")
+        logger.info("Starting compensation recovery for %s", instance_id)
 
        try:
            # Acquire lock
@@ -723,21 +954,23 @@ class ReplayEngine:
            )
 
            if not locked:
-                print(f"[ResumeCompensating] Could not acquire lock for {instance_id}")
+                logger.debug("Could not acquire lock for %s", instance_id)
                return False
 
            try:
                # Get instance data
                instance = await self.storage.get_instance(instance_id)
                if instance is None:
-                    print(f"[ResumeCompensating] Instance {instance_id} not found")
+                    logger.warning("Instance %s not found", instance_id)
                    return False
 
                # Check current status
                current_status = instance["status"]
                if current_status != "compensating":
-                    print(
-                        f"[ResumeCompensating] Instance {instance_id} is not in compensating state (status={current_status})"
+                    logger.debug(
+                        "Instance %s is not in compensating state (status=%s)",
+                        instance_id,
+                        current_status,
                    )
                    return False
 
@@ -745,15 +978,12 @@ class ReplayEngine:
                # If we can't determine, default to "failed"
                target_status = "failed"
 
-                # Check history for cancellation markers
-                history = await self.storage.get_history(instance_id)
-                for event in history:
-                    event_type = event.get("event_type", "")
-                    if event_type == "WorkflowCancelled" or "cancel" in event_type.lower():
-                        target_status = "cancelled"
-                        break
+                # Check history for cancellation markers (LIMIT 1 optimization)
+                cancellation_event = await self.storage.find_first_cancellation_event(instance_id)
+                if cancellation_event is not None:
+                    target_status = "cancelled"
 
-                print(f"[ResumeCompensating] Target status after compensation: {target_status}")
+                logger.debug("Target status after compensation: %s", target_status)
 
                # Create context for compensation execution
                ctx = WorkflowContext(
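The replacement above pushes the cancellation-marker scan into the storage layer instead of loading the full history and looping in Python. A conceptual sketch of what such a lookup might look like on a SQL backend follows; the table and column names are assumptions and the real query belongs to edda.storage, outside this diff.

# Conceptual sketch of a LIMIT 1 lookup -- not the actual edda.storage code.
# Table/column names are assumptions; an asyncpg-style connection is assumed.
async def find_first_cancellation_event_sketch(conn, instance_id: str):
    return await conn.fetchrow(
        "SELECT event_type FROM workflow_history"
        " WHERE instance_id = $1"
        " AND (event_type = 'WorkflowCancelled' OR event_type ILIKE '%cancel%')"
        " ORDER BY sequence_number"
        " LIMIT 1",
        instance_id,
    )  # returns None when no cancellation marker exists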
@@ -768,18 +998,18 @@ class ReplayEngine:
                ctx._app_retry_policy = self.default_retry_policy
 
                # Re-execute compensations (idempotent - skips already executed)
-                print(f"[ResumeCompensating] Re-executing compensations for {instance_id}")
+                logger.debug("Re-executing compensations for %s", instance_id)
                await execute_compensations(ctx)
 
                # Mark with target status
                if target_status == "cancelled":
                    success = await self.storage.cancel_instance(instance_id, "crash_recovery")
-                    print(f"[ResumeCompensating] Marked {instance_id} as cancelled")
+                    logger.info("Marked %s as cancelled", instance_id)
                else:
                    await ctx._update_status(
                        "failed", {"error": "Workflow failed before compensation"}
                    )
-                    print(f"[ResumeCompensating] Marked {instance_id} as failed")
+                    logger.info("Marked %s as failed", instance_id)
                    success = True
 
                return success
@@ -790,10 +1020,10 @@ class ReplayEngine:
 
        except Exception as error:
            # Log error but don't propagate
-            import traceback
-
-            print(
-                f"[ResumeCompensating] Error resuming compensating workflow {instance_id}: {error}"
+            logger.error(
+                "Error resuming compensating workflow %s: %s",
+                instance_id,
+                error,
+                exc_info=True,
            )
-            traceback.print_exc()
            return False
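Across these hunks every print() call is replaced with calls on the module-level logger (logging.getLogger(__name__), i.e. the "edda.replay" logger), so output that previously went straight to stdout now follows the host application's logging configuration. A minimal setup to keep seeing those messages, including the debug-level ones that replaced the [Cancel] and [ResumeCompensating] prints:

import logging

# Route edda's replay-engine messages to the console: info level covers the
# lifecycle messages, debug level covers the former [Cancel]/[ResumeCompensating]
# prints that are now logger.debug() calls.
logging.basicConfig(level=logging.INFO)
logging.getLogger("edda.replay").setLevel(logging.DEBUG)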