edda-framework 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
edda/app.py CHANGED
@@ -63,6 +63,9 @@ class EddaApp:
         notify_fallback_interval: int = 30,
         # Batch processing settings
         max_workflows_per_batch: int | Literal["auto", "auto:cpu"] = 10,
+        # Leader election settings (for coordinating background tasks across workers)
+        leader_heartbeat_interval: int = 15,
+        leader_lease_duration: int = 45,
     ):
         """
         Initialize Edda application.
@@ -100,6 +103,10 @@ class EddaApp:
                 - int: Fixed batch size (default: 10)
                 - "auto": Scale 10-100 based on queue depth
                 - "auto:cpu": Scale 10-100 based on CPU utilization (requires psutil)
+            leader_heartbeat_interval: Interval in seconds for leader heartbeat (default: 15).
+                Controls how often workers attempt to become/maintain leadership.
+            leader_lease_duration: Duration in seconds for leader lease (default: 45).
+                If leader fails to heartbeat within this time, another worker takes over.
         """
         self.db_url = db_url
         self.service_name = service_name
@@ -168,6 +175,12 @@ class EddaApp:
                 "Must be int, 'auto', or 'auto:cpu'."
             )
 
+        # Leader election settings (for coordinating background tasks)
+        self._leader_heartbeat_interval = leader_heartbeat_interval
+        self._leader_lease_duration = leader_lease_duration
+        self._is_leader = False
+        self._leader_tasks: list[asyncio.Task[Any]] = []
+
     def _create_storage(self, db_url: str) -> SQLAlchemyStorage:
         """
         Create storage backend from database URL.
@@ -309,19 +322,19 @@ class EddaApp:
 
         # Subscribe to workflow resumable notifications
         await self._notify_listener.subscribe(
-            "edda_workflow_resumable",
+            "workflow_resumable",
             self._on_workflow_resumable_notify,
         )
 
         # Subscribe to outbox notifications
         await self._notify_listener.subscribe(
-            "edda_outbox_pending",
+            "workflow_outbox_pending",
            self._on_outbox_pending_notify,
        )
 
         # Subscribe to timer expired notifications
         await self._notify_listener.subscribe(
-            "edda_timer_expired",
+            "workflow_timer_expired",
             self._on_timer_expired_notify,
         )
 
@@ -462,47 +475,202 @@ class EddaApp:
         self._initialized = False
 
     def _start_background_tasks(self) -> None:
-        """Start background maintenance tasks."""
-        # Task to cleanup stale locks and auto-resume workflows
-        auto_resume_task = asyncio.create_task(
-            auto_resume_stale_workflows_periodically(
-                self.storage,
-                self.replay_engine,
-                self.worker_id,
-                interval=60,  # Check every 60 seconds
-            )
-        )
-        self._background_tasks.append(auto_resume_task)
+        """Start background maintenance tasks.
 
-        # Task to check expired timers and resume workflows
-        timer_check_task = asyncio.create_task(
-            self._check_expired_timers_periodically(interval=10)  # Check every 10 seconds
-        )
-        self._background_tasks.append(timer_check_task)
-
-        # Task to check expired message subscriptions and fail workflows
-        # Note: CloudEvents timeouts are also handled here since wait_event() uses wait_message()
-        message_timeout_task = asyncio.create_task(
-            self._check_expired_message_subscriptions_periodically(
-                interval=10
-            )  # Check every 10 seconds
-        )
-        self._background_tasks.append(message_timeout_task)
+        Background tasks are divided into two categories:
+        1. All-worker tasks: Run on every worker (leader election, workflow resumption)
+        2. Leader-only tasks: Run only on the elected leader (timers, timeouts, cleanup)
+
+        This design reduces database polling load significantly in multi-worker deployments.
+        """
+        # Leader election loop (all workers participate)
+        leader_election_task = asyncio.create_task(self._leader_election_loop())
+        self._background_tasks.append(leader_election_task)
 
-        # Task to resume workflows after message delivery (fast resumption)
+        # Task to resume workflows after message delivery (all workers - competitive lock)
         message_resume_task = asyncio.create_task(
             self._resume_running_workflows_periodically(interval=1)  # Check every 1 second
         )
         self._background_tasks.append(message_resume_task)
 
-        # Task to cleanup old channel messages (orphaned messages)
-        message_cleanup_task = asyncio.create_task(
-            self._cleanup_old_messages_periodically(
-                interval=3600,  # Check every 1 hour
-                retention_days=self._message_retention_days,
+        # Note: Leader-only tasks (timer checks, message timeouts, stale workflow cleanup,
+        # old message cleanup) are started dynamically in _leader_election_loop() when
+        # this worker becomes the leader.
+
+    async def _leader_election_loop(self) -> None:
+        """
+        Leader election loop that runs on all workers.
+
+        Uses system lock to elect a single leader among all workers.
+        The leader runs maintenance tasks (timer checks, message timeouts, etc.).
+        Non-leaders only participate in workflow resumption.
+
+        If a leader task crashes, it will be automatically restarted.
+        """
+        while True:
+            try:
+                was_leader = self._is_leader
+
+                # Try to acquire/renew leadership
+                self._is_leader = await self.storage.try_acquire_system_lock(
+                    lock_name="edda_leader",
+                    worker_id=self.worker_id,
+                    timeout_seconds=self._leader_lease_duration,
+                )
+
+                if self._is_leader and not was_leader:
+                    # Became leader - start leader-only tasks
+                    logger.info(f"Worker {self.worker_id} became leader")
+                    self._leader_tasks = self._create_leader_only_tasks()
+
+                elif not self._is_leader and was_leader:
+                    # Lost leadership - cancel leader-only tasks
+                    logger.info(f"Worker {self.worker_id} lost leadership")
+                    await self._cancel_tasks(self._leader_tasks)
+                    self._leader_tasks = []
+
+                elif self._is_leader:
+                    # Still leader - check if any leader tasks have crashed and restart
+                    await self._monitor_and_restart_leader_tasks()
+
+                # Wait before next heartbeat
+                await asyncio.sleep(self._leader_heartbeat_interval)
+
+            except asyncio.CancelledError:
+                # Shutdown - cancel leader tasks and exit
+                await self._cancel_tasks(self._leader_tasks)
+                self._leader_tasks = []
+                raise
+            except Exception as e:
+                logger.error(f"Leader election error: {e}", exc_info=True)
+                self._is_leader = False
+                await self._cancel_tasks(self._leader_tasks)
+                self._leader_tasks = []
+                # Wait before retry
+                await asyncio.sleep(self._leader_heartbeat_interval)
+
+    def _create_leader_only_tasks(self) -> list[asyncio.Task[Any]]:
+        """
+        Create tasks that should only run on the leader worker.
+
+        These tasks are responsible for:
+        - Timer expiration checks
+        - Message subscription timeout checks
+        - Stale workflow auto-resume
+        - Old message cleanup
+        """
+        tasks = []
+
+        # Timer expiration check
+        tasks.append(
+            asyncio.create_task(
+                self._check_expired_timers_periodically(interval=10),
+                name="leader_timer_check",
             )
         )
-        self._background_tasks.append(message_cleanup_task)
+
+        # Message subscription timeout check
+        tasks.append(
+            asyncio.create_task(
+                self._check_expired_message_subscriptions_periodically(interval=10),
+                name="leader_message_timeout_check",
+            )
+        )
+
+        # Stale workflow auto-resume
+        tasks.append(
+            asyncio.create_task(
+                auto_resume_stale_workflows_periodically(
+                    self.storage,
+                    self.replay_engine,
+                    self.worker_id,
+                    interval=60,
+                ),
+                name="leader_stale_workflow_resume",
+            )
+        )
+
+        # Old message cleanup
+        tasks.append(
+            asyncio.create_task(
+                self._cleanup_old_messages_periodically(
+                    interval=3600,
+                    retention_days=self._message_retention_days,
+                ),
+                name="leader_message_cleanup",
+            )
+        )
+
+        return tasks
+
+    async def _cancel_tasks(self, tasks: list[asyncio.Task[Any]]) -> None:
+        """Cancel a list of tasks and wait for them to finish."""
+        for task in tasks:
+            task.cancel()
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+    async def _monitor_and_restart_leader_tasks(self) -> None:
+        """
+        Monitor leader tasks and restart any that have crashed.
+
+        This ensures leader-only tasks keep running even if they encounter errors.
+        """
+        task_creators = {
+            "leader_timer_check": lambda: asyncio.create_task(
+                self._check_expired_timers_periodically(interval=10),
+                name="leader_timer_check",
+            ),
+            "leader_message_timeout_check": lambda: asyncio.create_task(
+                self._check_expired_message_subscriptions_periodically(interval=10),
+                name="leader_message_timeout_check",
+            ),
+            "leader_stale_workflow_resume": lambda: asyncio.create_task(
+                auto_resume_stale_workflows_periodically(
+                    self.storage,
+                    self.replay_engine,
+                    self.worker_id,
+                    interval=60,
+                ),
+                name="leader_stale_workflow_resume",
+            ),
+            "leader_message_cleanup": lambda: asyncio.create_task(
+                self._cleanup_old_messages_periodically(
+                    interval=3600,
+                    retention_days=self._message_retention_days,
+                ),
+                name="leader_message_cleanup",
+            ),
+        }
+
+        # Check each task and restart if done (crashed)
+        new_tasks = []
+        for task in self._leader_tasks:
+            if task.done():
+                # Task has finished (possibly due to error)
+                task_name = task.get_name()
+                try:
+                    # Check if it raised an exception
+                    exc = task.exception()
+                    if exc is not None:
+                        logger.warning(
+                            f"Leader task {task_name} crashed with {type(exc).__name__}: {exc}, "
+                            "restarting..."
+                        )
+                except asyncio.CancelledError:
+                    # Task was cancelled, don't restart
+                    logger.debug(f"Leader task {task_name} was cancelled")
+                    continue
+
+                # Restart the task
+                if task_name in task_creators:
+                    new_task = task_creators[task_name]()
+                    new_tasks.append(new_task)
+                    logger.info(f"Restarted leader task: {task_name}")
+            else:
+                # Task is still running
+                new_tasks.append(task)
+
+        self._leader_tasks = new_tasks
 
     def _auto_register_workflows(self) -> None:
         """
edda/channels.py CHANGED
@@ -145,8 +145,22 @@ async def subscribe(
     Args:
         ctx: Workflow context
         channel: Channel name to subscribe to
-        mode: Subscription mode - "broadcast" (all subscribers receive all messages)
-            or "competing" (each message goes to only one subscriber)
+        mode: Subscription mode:
+            - "broadcast": All subscribers receive all messages (fan-out pattern)
+            - "competing": Each message goes to only one subscriber (work queue pattern)
+            - "direct": Receive messages sent via send_to() to this instance
+
+        The "direct" mode is syntactic sugar that subscribes to "channel:instance_id" internally,
+        allowing simpler code when receiving direct messages:
+
+            # Instead of this:
+            direct_channel = f"notifications:{ctx.instance_id}"
+            await subscribe(ctx, direct_channel, mode="broadcast")
+            msg = await receive(ctx, direct_channel)
+
+            # You can write:
+            await subscribe(ctx, "notifications", mode="direct")
+            msg = await receive(ctx, "notifications")
 
     Example:
         >>> @workflow
@@ -168,11 +182,29 @@ async def subscribe(
         ...     job = await receive(ctx, "jobs")
         ...     await execute_job(ctx, job.data, activity_id=f"job:{job.id}")
         ...     await ctx.recur()
+
+        >>> @workflow
+        ... async def direct_receiver(ctx: WorkflowContext, id: str):
+        ...     # Subscribe to receive direct messages via send_to()
+        ...     await subscribe(ctx, "notifications", mode="direct")
+        ...
+        ...     msg = await receive(ctx, "notifications")
+        ...     print(f"Received: {msg.data}")
     """
-    if mode not in ("broadcast", "competing"):
-        raise ValueError(f"Invalid subscription mode: {mode}. Must be 'broadcast' or 'competing'")
+    actual_channel = channel
+    actual_mode = mode
+
+    if mode == "direct":
+        # Transform to instance-specific channel
+        actual_channel = f"{channel}:{ctx.instance_id}"
+        actual_mode = "broadcast"
+        ctx._record_direct_subscription(channel)
+    elif mode not in ("broadcast", "competing"):
+        raise ValueError(
+            f"Invalid subscription mode: {mode}. Must be 'broadcast', 'competing', or 'direct'"
+        )
 
-    await ctx.storage.subscribe_to_channel(ctx.instance_id, channel, mode)
+    await ctx.storage.subscribe_to_channel(ctx.instance_id, actual_channel, actual_mode)
 
 
 async def unsubscribe(
@@ -185,11 +217,17 @@ async def unsubscribe(
     Note: Workflows are automatically unsubscribed from all channels when they
     complete, fail, or are cancelled. Explicit unsubscribe is usually not necessary.
 
+    For channels subscribed with mode="direct", use the original channel name
+    (not the transformed "channel:instance_id" form).
+
     Args:
         ctx: Workflow context
         channel: Channel name to unsubscribe from
     """
-    await ctx.storage.unsubscribe_from_channel(ctx.instance_id, channel)
+    actual_channel = channel
+    if ctx._is_direct_subscription(channel):
+        actual_channel = f"{channel}:{ctx.instance_id}"
+    await ctx.storage.unsubscribe_from_channel(ctx.instance_id, actual_channel)
 
 
 # =============================================================================
@@ -233,7 +271,12 @@ async def receive(
         ...     await process(ctx, msg.data, activity_id=f"process:{msg.id}")
         ...     await ctx.recur()
     """
-    # Generate activity ID
+    # Transform channel for direct subscriptions
+    actual_channel = channel
+    if ctx._is_direct_subscription(channel):
+        actual_channel = f"{channel}:{ctx.instance_id}"
+
+    # Generate activity ID (use original channel name for deterministic replay)
     if message_id is None:
         activity_id = ctx._generate_activity_id(f"receive_{channel}")
     else:
@@ -277,21 +320,21 @@ async def receive(
         raise RuntimeError(f"Unexpected cached result type: {type(cached_result)}")
 
     # Check for pending messages in the queue
-    pending = await ctx.storage.get_pending_channel_messages(ctx.instance_id, channel)
+    pending = await ctx.storage.get_pending_channel_messages(ctx.instance_id, actual_channel)
     if pending:
         # Get the first pending message
         msg_dict = pending[0]
         msg_id = msg_dict["message_id"]
 
         # For competing mode, try to claim the message
-        subscription = await _get_subscription(ctx, channel)
+        subscription = await _get_subscription(ctx, actual_channel)
         if subscription and subscription.get("mode") == "competing":
             claimed = await ctx.storage.claim_channel_message(msg_id, ctx.instance_id)
             if not claimed:
                 # Another worker claimed it, check next message
                 # For simplicity, raise exception to retry
                 raise WaitForChannelMessageException(
-                    channel=channel,
+                    channel=actual_channel,
                     timeout_seconds=timeout_seconds,
                     activity_id=activity_id,
                 )
@@ -299,7 +342,9 @@ async def receive(
             await ctx.storage.delete_channel_message(msg_id)
         else:
             # Broadcast mode - update cursor
-            await ctx.storage.update_delivery_cursor(channel, ctx.instance_id, msg_dict["id"])
+            await ctx.storage.update_delivery_cursor(
+                actual_channel, ctx.instance_id, msg_dict["id"]
+            )
 
         # Build the message
         raw_data = msg_dict.get("data")
@@ -337,7 +382,7 @@ async def receive(
 
     # No pending messages, raise exception to pause workflow
     raise WaitForChannelMessageException(
-        channel=channel,
+        channel=actual_channel,
         timeout_seconds=timeout_seconds,
         activity_id=activity_id,
     )
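
Under the hood, mode="direct" is only a naming convention: subscribe() stores a broadcast subscription on an instance-scoped channel, and receive()/unsubscribe() translate the caller-facing name back to that channel via the WorkflowContext bookkeeping shown below in context.py. A self-contained sketch of that resolution rule follows; resolve_channel is a hypothetical helper used here only for illustration.

# Sketch of the channel-name resolution behind mode="direct".
# `resolve_channel` is hypothetical; in edda the same mapping happens inside
# subscribe()/receive()/unsubscribe() using ctx._is_direct_subscription().

def resolve_channel(channel: str, instance_id: str, direct_subscriptions: set[str]) -> str:
    """Return the storage-level channel name for a caller-facing channel name."""
    if channel in direct_subscriptions:
        # Direct subscriptions are stored as broadcast subscriptions on an
        # instance-scoped channel, e.g. "notifications:wf-1234".
        return f"{channel}:{instance_id}"
    return channel


direct = {"notifications"}  # recorded when subscribe(..., mode="direct") runs
assert resolve_channel("notifications", "wf-1234", direct) == "notifications:wf-1234"
assert resolve_channel("jobs", "wf-1234", direct) == "jobs"  # non-direct channels pass through
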
edda/context.py CHANGED
@@ -69,6 +69,9 @@ class WorkflowContext:
         # Default retry policy from EddaApp (set by ReplayEngine)
         self._app_retry_policy: Any = None
 
+        # Direct subscriptions: channel names subscribed with mode="direct"
+        self._direct_subscriptions: set[str] = set()
+
     @property
     def storage(self) -> StorageProtocol:
         """
@@ -271,6 +274,27 @@ class WorkflowContext:
         """
         self.executed_activity_ids.add(activity_id)
 
+    def _record_direct_subscription(self, channel: str) -> None:
+        """
+        Record that a channel was subscribed in direct mode (internal use only).
+
+        Args:
+            channel: The original channel name (before transformation)
+        """
+        self._direct_subscriptions.add(channel)
+
+    def _is_direct_subscription(self, channel: str) -> bool:
+        """
+        Check if a channel was subscribed in direct mode (internal use only).
+
+        Args:
+            channel: The channel name to check
+
+        Returns:
+            True if the channel was subscribed with mode="direct"
+        """
+        return channel in self._direct_subscriptions
+
     async def _record_activity_completed(
         self,
         activity_id: str,
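
Taken together, the channels.py and context.py changes let a workflow deal only with the original channel name while storage sees the instance-scoped one. A hedged end-to-end sketch, assuming the import paths below: edda/channels.py and edda/context.py define these names in this diff, but the @workflow decorator's location and the sender-side send_to() signature are not shown here.

# Sketch: using mode="direct" end to end. The import path for `workflow` is an
# assumption; receive/subscribe/unsubscribe and WorkflowContext come from the
# modules changed in this diff.
from edda import workflow  # assumed export location
from edda.channels import receive, subscribe, unsubscribe
from edda.context import WorkflowContext


@workflow
async def direct_receiver(ctx: WorkflowContext) -> None:
    # Internally subscribes to f"notifications:{ctx.instance_id}" in broadcast
    # mode and records "notifications" in ctx._direct_subscriptions.
    await subscribe(ctx, "notifications", mode="direct")

    # receive() resolves the name back to the instance-scoped channel, while the
    # activity ID keeps the original name for deterministic replay.
    msg = await receive(ctx, "notifications")
    print(f"Received: {msg.data}")

    # Unsubscribe with the ORIGINAL channel name, not "notifications:<id>".
    await unsubscribe(ctx, "notifications")
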