edda-framework 0.6.0-py3-none-any.whl → 0.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
edda/compensation.py CHANGED
@@ -5,9 +5,12 @@ This module provides compensation transaction support for implementing
 the Saga pattern with automatic rollback on failure.
 """
 
+import logging
 from collections.abc import Callable
 from typing import TYPE_CHECKING, Any, TypeVar
 
+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from edda.context import WorkflowContext
 
@@ -197,12 +200,12 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
 
     # If no compensations, nothing to do
     if not compensations:
-        print(f"[Compensation] No compensations to execute for {ctx.instance_id}")
+        logger.debug("No compensations to execute for %s", ctx.instance_id)
         return
 
     # Mark as compensating BEFORE execution for crash recovery
     # This allows auto-resume to detect and restart incomplete compensation
-    print(f"[Compensation] Starting compensation execution for {ctx.instance_id}")
+    logger.debug("Starting compensation execution for %s", ctx.instance_id)
     await ctx._update_status("compensating", {"started_at": None})
 
     # Get already executed compensations to avoid duplicate execution
@@ -221,8 +224,10 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
 
         # Skip if already executed (idempotency)
         if compensation_id in executed_compensation_ids:
-            print(
-                f"[Compensation] Skipping already executed: {activity_name} (id={compensation_id})"
+            logger.debug(
+                "Skipping already executed compensation: %s (id=%s)",
+                activity_name,
+                compensation_id,
             )
             continue
 
@@ -232,20 +237,18 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
 
         # Skip if activity_name is None or not a string
        if not isinstance(activity_name, str):
-            print(f"[Compensation] Warning: Invalid activity_name: {activity_name}. Skipping.")
+            logger.warning("Invalid activity_name: %s. Skipping.", activity_name)
            continue
 
         # Log compensation execution
-        print(f"[Compensation] Executing: {activity_name} (id={compensation_id})")
+        logger.info("Executing compensation: %s (id=%s)", activity_name, compensation_id)
 
         try:
             # Look up compensation function from registry
             compensation_func = _COMPENSATION_REGISTRY.get(activity_name)
 
             if compensation_func is None:
-                print(
-                    f"[Compensation] Warning: Function '{activity_name}' not found in registry. Skipping."
-                )
+                logger.warning("Function '%s' not found in registry. Skipping.", activity_name)
                 continue
 
             # Execute the compensation function directly
@@ -271,20 +274,21 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
                 # This is expected in concurrent cancellation scenarios - silently ignore
                 error_msg = str(record_error)
                 if "UNIQUE constraint" in error_msg or "UNIQUE" in error_msg:
-                    print(
-                        f"[Compensation] {activity_name} already recorded by another process, skipping duplicate record"
+                    logger.debug(
+                        "%s already recorded by another process, skipping duplicate record",
+                        activity_name,
                     )
                 else:
                     # Other errors should be logged but not break the compensation flow
-                    print(
-                        f"[Compensation] Warning: Failed to record {activity_name} execution: {record_error}"
-                    )
+                    logger.warning("Failed to record %s execution: %s", activity_name, record_error)
 
-            print(f"[Compensation] Successfully executed: {activity_name}")
+            logger.info("Successfully executed compensation: %s", activity_name)
 
         except Exception as error:
             # Log but don't fail the rollback
-            print(f"[Compensation] Failed to execute {activity_name}: {error}")
+            logger.error(
+                "Failed to execute compensation %s: %s", activity_name, error, exc_info=True
+            )
 
 
             # Record compensation failure in history
@@ -304,13 +308,9 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
                 # UNIQUE constraint error means another process already recorded this failure
                 error_msg = str(record_error)
                 if "UNIQUE constraint" in error_msg or "UNIQUE" in error_msg:
-                    print(
-                        f"[Compensation] {activity_name} failure already recorded by another process"
-                    )
+                    logger.debug("%s failure already recorded by another process", activity_name)
                 else:
-                    print(
-                        f"[Compensation] Warning: Failed to record compensation failure: {record_error}"
-                    )
+                    logger.warning("Failed to record compensation failure: %s", record_error)
 
 
 async def clear_compensations(ctx: "WorkflowContext") -> None:
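Because the compensation path now reports through logging.getLogger(__name__) instead of print(), the embedding application decides what becomes visible. A minimal sketch using only the standard library; the handler and format choices are illustrative, not part of edda:

import logging

# Route edda's log records somewhere visible; basicConfig is the simplest option.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)

# Compensation messages are emitted under the module name "edda.compensation".
# INFO covers execution/success lines; DEBUG adds the idempotency-skip details.
logging.getLogger("edda.compensation").setLevel(logging.DEBUG)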
edda/context.py CHANGED
@@ -9,7 +9,7 @@ from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager
 from typing import TYPE_CHECKING, Any, cast
 
-from edda.events import ReceivedEvent
+from edda.channels import ChannelMessage, ReceivedEvent
 from edda.storage.protocol import StorageProtocol
 
 if TYPE_CHECKING:
@@ -191,6 +191,28 @@ class WorkflowContext:
                     extensions=extensions,
                 )
                 self._history_cache[activity_id] = received_event
+            elif event_type == "ChannelMessageReceived":
+                # Cache the message data for receive() replay
+                from datetime import UTC, datetime
+
+                raw_data = event_data.get("data", event_data.get("payload", {}))
+                data: dict[str, Any] | bytes = (
+                    raw_data if isinstance(raw_data, (dict, bytes)) else {}
+                )
+                # Parse published_at if available, otherwise use current time
+                published_at_str = event_data.get("published_at")
+                if published_at_str:
+                    published_at = datetime.fromisoformat(published_at_str.replace("Z", "+00:00"))
+                else:
+                    published_at = datetime.now(UTC)
+                message = ChannelMessage(
+                    data=data,
+                    channel=event_data.get("channel", "unknown"),
+                    id=event_data.get("id", "unknown"),
+                    metadata=event_data.get("metadata") or {},
+                    published_at=published_at,
+                )
+                self._history_cache[activity_id] = message
             elif event_type == "TimerExpired":
                 # Cache the timer result for wait_timer replay
                 # Timer returns None, so we cache the result field
@@ -340,56 +362,6 @@ class WorkflowContext:
         """
         await self.storage.update_instance_status(self.instance_id, status, output_data)
 
-    async def _register_event_subscription(
-        self,
-        event_type: str,
-        timeout_seconds: int | None = None,
-        activity_id: str | None = None,
-    ) -> None:
-        """
-        Register an event subscription for wait_event (internal use only).
-
-        This is called when a workflow calls wait_event() and needs to pause
-        until a matching event arrives.
-
-        Args:
-            event_type: CloudEvent type to wait for
-            timeout_seconds: Optional timeout in seconds
-            activity_id: The activity ID where wait_event was called
-        """
-        from datetime import UTC, datetime, timedelta
-
-        timeout_at = None
-        if timeout_seconds is not None:
-            timeout_at = datetime.now(UTC) + timedelta(seconds=timeout_seconds)
-
-        await self.storage.add_event_subscription(
-            instance_id=self.instance_id,
-            event_type=event_type,
-            timeout_at=timeout_at,
-        )
-
-        # Update current activity ID
-        if activity_id is not None:
-            await self.storage.update_instance_activity(self.instance_id, activity_id)
-
-    async def _record_event_received(self, activity_id: str, event_data: dict[str, Any]) -> None:
-        """
-        Record that an event was received during wait_event (internal use only).
-
-        This is called when resuming a workflow after an event arrives.
-
-        Args:
-            activity_id: The activity ID where wait_event was called
-            event_data: The received event data
-        """
-        await self.storage.append_history(
-            instance_id=self.instance_id,
-            activity_id=activity_id,
-            event_type="EventReceived",
-            event_data={"event_data": event_data},
-        )
-
     async def _push_compensation(self, compensation_action: Any, activity_id: str) -> None:
         """
         Register a compensation action for this workflow (internal use only).
@@ -479,6 +451,60 @@ class WorkflowContext:
         """
         return self.storage.in_transaction()
 
+    async def recur(self, **kwargs: Any) -> None:
+        """
+        Restart the workflow with fresh history (Erlang-style tail recursion).
+
+        This method prevents unbounded history growth in long-running loops by:
+        1. Completing the current workflow instance (marking as "recurred")
+        2. Archiving the current history (not deleted)
+        3. Starting a new workflow instance with the provided arguments
+        4. Linking the new instance to the old one via `continued_from`
+
+        This is similar to Erlang's tail recursion pattern where calling the same
+        function at the end of a loop prevents stack growth. In Edda, `recur()`
+        prevents history growth.
+
+        Args:
+            **kwargs: Arguments to pass to the new workflow instance.
+                These become the input parameters for the next iteration.
+
+        Raises:
+            RecurException: Always raised to signal the ReplayEngine to handle
+                the recur operation. This exception should not be caught.
+
+        Example:
+            >>> @workflow
+            ... async def notification_service(ctx: WorkflowContext, processed_count: int = 0):
+            ...     await join_group(ctx, group="order_watchers")
+            ...
+            ...     count = 0
+            ...     while True:
+            ...         msg = await wait_message(ctx, channel="order.completed")
+            ...         await send_notification(ctx, msg.data, activity_id=f"notify:{msg.id}")
+            ...
+            ...         count += 1
+            ...         if count >= 1000:
+            ...             # Reset history every 1000 iterations
+            ...             await ctx.recur(processed_count=processed_count + count)
+            ...             # Code after recur() is never executed
+
+        Note:
+            - Group memberships are NOT automatically transferred. You must re-join
+              groups in the new iteration if needed.
+            - The old workflow's history is archived, not deleted.
+            - The new instance has a `continued_from` field pointing to the old instance.
+            - During replay, if recur() was already called, this raises immediately
+              without re-executing previous activities.
+        """
+        from edda.pydantic_utils import to_json_dict
+        from edda.workflow import RecurException
+
+        # Convert Pydantic models and Enums to JSON-compatible values
+        processed_kwargs = {k: to_json_dict(v) for k, v in kwargs.items()}
+
+        raise RecurException(kwargs=processed_kwargs)
+
     def __repr__(self) -> str:
         """String representation of the context."""
         return (
@@ -22,9 +22,14 @@ try:
     from opentelemetry.context import Context
     from opentelemetry.sdk.resources import Resource
     from opentelemetry.sdk.trace import TracerProvider
-    from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
+    from opentelemetry.sdk.trace.export import (
+        BatchSpanProcessor,
+        ConsoleSpanExporter,
+    )
     from opentelemetry.trace import Span, Status, StatusCode, Tracer
-    from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
+    from opentelemetry.trace.propagation.tracecontext import (
+        TraceContextTextMapPropagator,
+    )
 
     _OPENTELEMETRY_AVAILABLE = True
 except ImportError:
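One detail in the new ChannelMessageReceived replay branch: the persisted published_at string is normalized with replace("Z", "+00:00") before datetime.fromisoformat() parses it, since a trailing "Z" (Zulu/UTC) suffix is only accepted by fromisoformat() on newer Python versions. A standalone sketch of that normalization; the helper name parse_published_at is illustrative, not part of edda:

from datetime import UTC, datetime

def parse_published_at(value: str | None) -> datetime:
    # Fall back to "now" when no timestamp was persisted with the record,
    # mirroring the replay branch above.
    if not value:
        return datetime.now(UTC)
    # Map the Zulu suffix to an explicit offset so fromisoformat() accepts it
    # regardless of which Python version reads the record.
    return datetime.fromisoformat(value.replace("Z", "+00:00"))

print(parse_published_at("2025-01-01T12:00:00Z"))  # 2025-01-01 12:00:00+00:00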
edda/locking.py CHANGED
@@ -6,7 +6,9 @@ distributed locks in multi-pod deployments.
 """
 
 import asyncio
+import logging
 import os
+import random
 import uuid
 from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager, suppress
@@ -14,6 +16,8 @@ from typing import Any
 
 from edda.storage.protocol import StorageProtocol
 
+logger = logging.getLogger(__name__)
+
 
 def generate_worker_id(service_name: str) -> str:
     """
@@ -188,6 +192,7 @@ async def _refresh_lock_periodically(
 
 async def cleanup_stale_locks_periodically(
     storage: StorageProtocol,
+    worker_id: str,
     interval: int = 60,
 ) -> None:
     """
@@ -199,30 +204,49 @@ async def cleanup_stale_locks_periodically(
     Note: This function only cleans up locks without resuming workflows.
     For automatic workflow resumption, use auto_resume_stale_workflows_periodically().
 
+    Uses system-level locking to ensure only one pod executes cleanup at a time.
+
     Example:
         >>> asyncio.create_task(
-        ...     cleanup_stale_locks_periodically(storage, interval=60)
+        ...     cleanup_stale_locks_periodically(storage, worker_id, interval=60)
         ... )
 
     Args:
         storage: Storage backend
+        worker_id: Unique identifier for this worker (for global lock coordination)
         interval: Cleanup interval in seconds (default: 60)
     """
     with suppress(asyncio.CancelledError):
         while True:
-            await asyncio.sleep(interval)
+            # Add jitter to prevent thundering herd in multi-pod deployments
+            jitter = random.uniform(0, interval * 0.3)
+            await asyncio.sleep(interval + jitter)
+
+            # Try to acquire global lock for this task
+            lock_acquired = await storage.try_acquire_system_lock(
+                lock_name="cleanup_stale_locks",
+                worker_id=worker_id,
+                timeout_seconds=interval,
+            )
+
+            if not lock_acquired:
+                # Another pod is handling this task
+                continue
 
-            # Clean up stale locks
-            workflows = await storage.cleanup_stale_locks()
+            try:
+                # Clean up stale locks
+                workflows = await storage.cleanup_stale_locks()
 
-            if len(workflows) > 0:
-                # Log cleanup (in a real implementation, use proper logging)
-                print(f"Cleaned up {len(workflows)} stale locks")
+                if len(workflows) > 0:
+                    logger.info("Cleaned up %d stale locks", len(workflows))
+            finally:
+                await storage.release_system_lock("cleanup_stale_locks", worker_id)
 
 
 async def auto_resume_stale_workflows_periodically(
     storage: StorageProtocol,
     replay_engine: Any,
+    worker_id: str,
     interval: int = 60,
 ) -> None:
     """
@@ -231,83 +255,122 @@ async def auto_resume_stale_workflows_periodically(
     This combines lock cleanup with automatic workflow resumption, ensuring
     that workflows interrupted by worker crashes are automatically recovered.
 
+    Uses system-level locking to ensure only one pod executes this task at a time,
+    preventing duplicate workflow execution (CRITICAL for safety).
+
     Example:
         >>> asyncio.create_task(
         ...     auto_resume_stale_workflows_periodically(
-        ...         storage, replay_engine, interval=60
+        ...         storage, replay_engine, worker_id, interval=60
         ...     )
         ... )
 
     Args:
         storage: Storage backend
         replay_engine: ReplayEngine instance for resuming workflows
+        worker_id: Unique identifier for this worker (for global lock coordination)
         interval: Cleanup interval in seconds (default: 60)
     """
     with suppress(asyncio.CancelledError):
         while True:
-            await asyncio.sleep(interval)
+            # Add jitter to prevent thundering herd in multi-pod deployments
+            jitter = random.uniform(0, interval * 0.3)
+            await asyncio.sleep(interval + jitter)
+
+            # Try to acquire global lock for this task
+            lock_acquired = await storage.try_acquire_system_lock(
+                lock_name="auto_resume_stale_workflows",
+                worker_id=worker_id,
+                timeout_seconds=interval,
+            )
 
-            # Clean up stale locks and get workflows to resume
-            workflows_to_resume = await storage.cleanup_stale_locks()
-
-            if len(workflows_to_resume) > 0:
-                # Log cleanup (in a real implementation, use proper logging)
-                print(f"Cleaned up {len(workflows_to_resume)} stale locks")
-
-            # Auto-resume workflows
-            for workflow in workflows_to_resume:
-                instance_id = workflow["instance_id"]
-                workflow_name = workflow["workflow_name"]
-                source_hash = workflow["source_hash"]
-                status = workflow.get("status", "running")
-
-                try:
-                    # Special handling for workflows in compensating state
-                    if status == "compensating":
-                        # Workflow crashed during compensation execution
-                        # Only re-execute compensations, don't run workflow function
-                        print(
-                            f"Auto-resuming compensating workflow: {instance_id} "
-                            f"(compensation recovery only, no workflow execution)"
+            if not lock_acquired:
+                # Another pod is handling this task
+                continue
+
+            try:
+                # Clean up stale locks and get workflows to resume
+                workflows_to_resume = await storage.cleanup_stale_locks()
+
+                if len(workflows_to_resume) > 0:
+                    logger.info("Cleaned up %d stale locks", len(workflows_to_resume))
+
+                # Auto-resume workflows
+                for workflow in workflows_to_resume:
+                    instance_id = workflow["instance_id"]
+                    workflow_name = workflow["workflow_name"]
+                    source_hash = workflow["source_hash"]
+                    status = workflow.get("status", "running")
+
+                    try:
+                        # Special handling for workflows in compensating state
+                        if status == "compensating":
+                            # Workflow crashed during compensation execution
+                            # Only re-execute compensations, don't run workflow function
+                            logger.info(
+                                "Auto-resuming compensating workflow: %s "
+                                "(compensation recovery only, no workflow execution)",
+                                instance_id,
+                            )
+                            success = await replay_engine.resume_compensating_workflow(
+                                instance_id
+                            )
+                            if success:
+                                logger.info(
+                                    "Successfully completed compensations for: %s",
+                                    instance_id,
+                                )
+                            else:
+                                logger.warning(
+                                    "Failed to complete compensations for: %s", instance_id
+                                )
+                            continue
+
+                        # Normal workflow resumption (status='running')
+                        # Check if workflow definition matches current Saga registry
+                        # This prevents resuming workflows with outdated/incompatible code
+                        current_definition = await storage.get_current_workflow_definition(
+                            workflow_name
                        )
-                        success = await replay_engine.resume_compensating_workflow(instance_id)
-                        if success:
-                            print(f"Successfully completed compensations for: {instance_id}")
-                        else:
-                            print(f"Failed to complete compensations for: {instance_id}")
-                        continue
-
-                    # Normal workflow resumption (status='running')
-                    # Check if workflow definition matches current Saga registry
-                    # This prevents resuming workflows with outdated/incompatible code
-                    current_definition = await storage.get_current_workflow_definition(
-                        workflow_name
-                    )
-
-                    if current_definition is None:
-                        print(
-                            f"Skipping auto-resume for {instance_id}: "
-                            f"workflow '{workflow_name}' not found in registry"
+
+                        if current_definition is None:
+                            logger.warning(
+                                "Skipping auto-resume for %s: "
+                                "workflow '%s' not found in registry",
+                                instance_id,
+                                workflow_name,
+                            )
+                            continue
+
+                        if current_definition["source_hash"] != source_hash:
+                            logger.warning(
+                                "Skipping auto-resume for %s: "
+                                "workflow definition has changed "
+                                "(old hash: %s..., new hash: %s...)",
+                                instance_id,
+                                source_hash[:8],
+                                current_definition["source_hash"][:8],
+                            )
+                            continue
+
+                        # Hash matches - safe to resume
+                        logger.info(
+                            "Auto-resuming workflow: %s (instance: %s)",
+                            workflow_name,
+                            instance_id,
                        )
-                        continue
-
-                    if current_definition["source_hash"] != source_hash:
-                        print(
-                            f"Skipping auto-resume for {instance_id}: "
-                            f"workflow definition has changed "
-                            f"(old hash: {source_hash[:8]}..., "
-                            f"new hash: {current_definition['source_hash'][:8]}...)"
+                        await replay_engine.resume_by_name(instance_id, workflow_name)
+                        logger.info("Successfully resumed workflow: %s", instance_id)
+                    except Exception as e:
+                        # Log error but continue with other workflows
+                        logger.error(
+                            "Failed to auto-resume workflow %s: %s",
+                            instance_id,
+                            e,
+                            exc_info=True,
                        )
-                        continue
-
-                    # Hash matches - safe to resume
-                    print(f"Auto-resuming workflow: {workflow_name} (instance: {instance_id})")
-                    await replay_engine.resume_by_name(instance_id, workflow_name)
-                    print(f"Successfully resumed workflow: {instance_id}")
-                except Exception as e:
-                    # Log error but continue with other workflows
-                    # In a real implementation, use proper logging
-                    print(f"Failed to auto-resume workflow {instance_id}: {e}")
+            finally:
+                await storage.release_system_lock("auto_resume_stale_workflows", worker_id)
 
 
 class LockNotAcquiredError(Exception):
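Both periodic maintenance tasks now require a worker_id and coordinate through storage.try_acquire_system_lock() / release_system_lock(), so at most one pod performs cleanup or auto-resume per interval; each loop also sleeps interval plus up to 30% random jitter so pods do not contend for the lock at the same instant. A minimal wiring sketch, assuming the application already has a StorageProtocol backend and a ReplayEngine; the service name "order-service" and the helper name start_auto_resume are illustrative:

import asyncio

from edda.locking import auto_resume_stale_workflows_periodically, generate_worker_id

async def start_auto_resume(storage, replay_engine) -> asyncio.Task:
    # One stable ID per worker process; it identifies this pod when competing
    # for the "auto_resume_stale_workflows" system lock.
    worker_id = generate_worker_id("order-service")
    return asyncio.create_task(
        auto_resume_stale_workflows_periodically(
            storage, replay_engine, worker_id, interval=60
        )
    )

The cleanup-only variant takes the same worker_id: cleanup_stale_locks_periodically(storage, worker_id, interval=60).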