edda-framework 0.6.0-py3-none-any.whl → 0.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
edda/compensation.py CHANGED
@@ -5,9 +5,12 @@ This module provides compensation transaction support for implementing
 the Saga pattern with automatic rollback on failure.
 """
 
+import logging
 from collections.abc import Callable
 from typing import TYPE_CHECKING, Any, TypeVar
 
+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from edda.context import WorkflowContext
 
@@ -197,12 +200,12 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
 
     # If no compensations, nothing to do
     if not compensations:
-        print(f"[Compensation] No compensations to execute for {ctx.instance_id}")
+        logger.debug("No compensations to execute for %s", ctx.instance_id)
         return
 
     # Mark as compensating BEFORE execution for crash recovery
     # This allows auto-resume to detect and restart incomplete compensation
-    print(f"[Compensation] Starting compensation execution for {ctx.instance_id}")
+    logger.debug("Starting compensation execution for %s", ctx.instance_id)
     await ctx._update_status("compensating", {"started_at": None})
 
     # Get already executed compensations to avoid duplicate execution
@@ -221,8 +224,10 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
 
         # Skip if already executed (idempotency)
         if compensation_id in executed_compensation_ids:
-            print(
-                f"[Compensation] Skipping already executed: {activity_name} (id={compensation_id})"
+            logger.debug(
+                "Skipping already executed compensation: %s (id=%s)",
+                activity_name,
+                compensation_id,
             )
             continue
 
@@ -232,20 +237,18 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
 
         # Skip if activity_name is None or not a string
        if not isinstance(activity_name, str):
-            print(f"[Compensation] Warning: Invalid activity_name: {activity_name}. Skipping.")
+            logger.warning("Invalid activity_name: %s. Skipping.", activity_name)
            continue
 
         # Log compensation execution
-        print(f"[Compensation] Executing: {activity_name} (id={compensation_id})")
+        logger.info("Executing compensation: %s (id=%s)", activity_name, compensation_id)
 
         try:
             # Look up compensation function from registry
             compensation_func = _COMPENSATION_REGISTRY.get(activity_name)
 
             if compensation_func is None:
-                print(
-                    f"[Compensation] Warning: Function '{activity_name}' not found in registry. Skipping."
-                )
+                logger.warning("Function '%s' not found in registry. Skipping.", activity_name)
                 continue
 
             # Execute the compensation function directly
@@ -271,20 +274,21 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
                 # This is expected in concurrent cancellation scenarios - silently ignore
                 error_msg = str(record_error)
                 if "UNIQUE constraint" in error_msg or "UNIQUE" in error_msg:
-                    print(
-                        f"[Compensation] {activity_name} already recorded by another process, skipping duplicate record"
+                    logger.debug(
+                        "%s already recorded by another process, skipping duplicate record",
+                        activity_name,
                     )
                 else:
                     # Other errors should be logged but not break the compensation flow
-                    print(
-                        f"[Compensation] Warning: Failed to record {activity_name} execution: {record_error}"
-                    )
+                    logger.warning("Failed to record %s execution: %s", activity_name, record_error)
 
-            print(f"[Compensation] Successfully executed: {activity_name}")
+            logger.info("Successfully executed compensation: %s", activity_name)
 
         except Exception as error:
             # Log but don't fail the rollback
-            print(f"[Compensation] Failed to execute {activity_name}: {error}")
+            logger.error(
+                "Failed to execute compensation %s: %s", activity_name, error, exc_info=True
+            )
 
 
             # Record compensation failure in history
@@ -304,13 +308,9 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
                 # UNIQUE constraint error means another process already recorded this failure
                 error_msg = str(record_error)
                 if "UNIQUE constraint" in error_msg or "UNIQUE" in error_msg:
-                    print(
-                        f"[Compensation] {activity_name} failure already recorded by another process"
-                    )
+                    logger.debug("%s failure already recorded by another process", activity_name)
                 else:
-                    print(
-                        f"[Compensation] Warning: Failed to record compensation failure: {record_error}"
-                    )
+                    logger.warning("Failed to record compensation failure: %s", record_error)
 
 
 async def clear_compensations(ctx: "WorkflowContext") -> None:
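Because the compensation path now reports through logging.getLogger(__name__) instead of print(), the embedding application decides what becomes visible. A minimal sketch using only the standard library; the handler and format choices are illustrative, not part of edda:

import logging

# Route edda's log records somewhere visible; basicConfig is the simplest option.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)

# Compensation messages are emitted under the module name "edda.compensation".
# INFO covers execution/success lines; DEBUG adds the idempotency-skip details.
logging.getLogger("edda.compensation").setLevel(logging.DEBUG)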
edda/context.py CHANGED
@@ -9,7 +9,7 @@ from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager
 from typing import TYPE_CHECKING, Any, cast
 
-from edda.events import ReceivedEvent
+from edda.channels import ChannelMessage, ReceivedEvent
 from edda.storage.protocol import StorageProtocol
 
 if TYPE_CHECKING:
@@ -191,6 +191,28 @@ class WorkflowContext:
                     extensions=extensions,
                 )
                 self._history_cache[activity_id] = received_event
+            elif event_type == "ChannelMessageReceived":
+                # Cache the message data for receive() replay
+                from datetime import UTC, datetime
+
+                raw_data = event_data.get("data", event_data.get("payload", {}))
+                data: dict[str, Any] | bytes = (
+                    raw_data if isinstance(raw_data, (dict, bytes)) else {}
+                )
+                # Parse published_at if available, otherwise use current time
+                published_at_str = event_data.get("published_at")
+                if published_at_str:
+                    published_at = datetime.fromisoformat(published_at_str.replace("Z", "+00:00"))
+                else:
+                    published_at = datetime.now(UTC)
+                message = ChannelMessage(
+                    data=data,
+                    channel=event_data.get("channel", "unknown"),
+                    id=event_data.get("id", "unknown"),
+                    metadata=event_data.get("metadata") or {},
+                    published_at=published_at,
+                )
+                self._history_cache[activity_id] = message
             elif event_type == "TimerExpired":
                 # Cache the timer result for wait_timer replay
                 # Timer returns None, so we cache the result field
@@ -340,56 +362,6 @@ class WorkflowContext:
         """
         await self.storage.update_instance_status(self.instance_id, status, output_data)
 
-    async def _register_event_subscription(
-        self,
-        event_type: str,
-        timeout_seconds: int | None = None,
-        activity_id: str | None = None,
-    ) -> None:
-        """
-        Register an event subscription for wait_event (internal use only).
-
-        This is called when a workflow calls wait_event() and needs to pause
-        until a matching event arrives.
-
-        Args:
-            event_type: CloudEvent type to wait for
-            timeout_seconds: Optional timeout in seconds
-            activity_id: The activity ID where wait_event was called
-        """
-        from datetime import UTC, datetime, timedelta
-
-        timeout_at = None
-        if timeout_seconds is not None:
-            timeout_at = datetime.now(UTC) + timedelta(seconds=timeout_seconds)
-
-        await self.storage.add_event_subscription(
-            instance_id=self.instance_id,
-            event_type=event_type,
-            timeout_at=timeout_at,
-        )
-
-        # Update current activity ID
-        if activity_id is not None:
-            await self.storage.update_instance_activity(self.instance_id, activity_id)
-
-    async def _record_event_received(self, activity_id: str, event_data: dict[str, Any]) -> None:
-        """
-        Record that an event was received during wait_event (internal use only).
-
-        This is called when resuming a workflow after an event arrives.
-
-        Args:
-            activity_id: The activity ID where wait_event was called
-            event_data: The received event data
-        """
-        await self.storage.append_history(
-            instance_id=self.instance_id,
-            activity_id=activity_id,
-            event_type="EventReceived",
-            event_data={"event_data": event_data},
-        )
-
     async def _push_compensation(self, compensation_action: Any, activity_id: str) -> None:
         """
         Register a compensation action for this workflow (internal use only).
@@ -479,6 +451,60 @@ class WorkflowContext:
         """
         return self.storage.in_transaction()
 
+    async def recur(self, **kwargs: Any) -> None:
+        """
+        Restart the workflow with fresh history (Erlang-style tail recursion).
+
+        This method prevents unbounded history growth in long-running loops by:
+        1. Completing the current workflow instance (marking as "recurred")
+        2. Archiving the current history (not deleted)
+        3. Starting a new workflow instance with the provided arguments
+        4. Linking the new instance to the old one via `continued_from`
+
+        This is similar to Erlang's tail recursion pattern where calling the same
+        function at the end of a loop prevents stack growth. In Edda, `recur()`
+        prevents history growth.
+
+        Args:
+            **kwargs: Arguments to pass to the new workflow instance.
+                These become the input parameters for the next iteration.
+
+        Raises:
+            RecurException: Always raised to signal the ReplayEngine to handle
+                the recur operation. This exception should not be caught.
+
+        Example:
+            >>> @workflow
+            ... async def notification_service(ctx: WorkflowContext, processed_count: int = 0):
+            ...     await join_group(ctx, group="order_watchers")
+            ...
+            ...     count = 0
+            ...     while True:
+            ...         msg = await wait_message(ctx, channel="order.completed")
+            ...         await send_notification(ctx, msg.data, activity_id=f"notify:{msg.id}")
+            ...
+            ...         count += 1
+            ...         if count >= 1000:
+            ...             # Reset history every 1000 iterations
+            ...             await ctx.recur(processed_count=processed_count + count)
+            ...             # Code after recur() is never executed
+
+        Note:
+            - Group memberships are NOT automatically transferred. You must re-join
+              groups in the new iteration if needed.
+            - The old workflow's history is archived, not deleted.
+            - The new instance has a `continued_from` field pointing to the old instance.
+            - During replay, if recur() was already called, this raises immediately
+              without re-executing previous activities.
+        """
+        from edda.pydantic_utils import to_json_dict
+        from edda.workflow import RecurException
+
+        # Convert Pydantic models and Enums to JSON-compatible values
+        processed_kwargs = {k: to_json_dict(v) for k, v in kwargs.items()}
+
+        raise RecurException(kwargs=processed_kwargs)
+
     def __repr__(self) -> str:
         """String representation of the context."""
         return (
@@ -22,9 +22,14 @@ try:
     from opentelemetry.context import Context
     from opentelemetry.sdk.resources import Resource
     from opentelemetry.sdk.trace import TracerProvider
-    from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
+    from opentelemetry.sdk.trace.export import (
+        BatchSpanProcessor,
+        ConsoleSpanExporter,
+    )
     from opentelemetry.trace import Span, Status, StatusCode, Tracer
-    from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
+    from opentelemetry.trace.propagation.tracecontext import (
+        TraceContextTextMapPropagator,
+    )
 
     _OPENTELEMETRY_AVAILABLE = True
 except ImportError:
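One detail in the new ChannelMessageReceived replay branch: the persisted published_at string is normalized with replace("Z", "+00:00") before datetime.fromisoformat() parses it, since a trailing "Z" (Zulu/UTC) suffix is only accepted by fromisoformat() on newer Python versions. A standalone sketch of that normalization; the helper name parse_published_at is illustrative, not part of edda:

from datetime import UTC, datetime

def parse_published_at(value: str | None) -> datetime:
    # Fall back to "now" when no timestamp was persisted with the record,
    # mirroring the replay branch above.
    if not value:
        return datetime.now(UTC)
    # Map the Zulu suffix to an explicit offset so fromisoformat() accepts it
    # regardless of which Python version reads the record.
    return datetime.fromisoformat(value.replace("Z", "+00:00"))

print(parse_published_at("2025-01-01T12:00:00Z"))  # 2025-01-01 12:00:00+00:00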
edda/locking.py CHANGED
@@ -6,7 +6,9 @@ distributed locks in multi-pod deployments.
 """
 
 import asyncio
+import logging
 import os
+import random
 import uuid
 from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager, suppress
@@ -14,6 +16,8 @@ from typing import Any
 
 from edda.storage.protocol import StorageProtocol
 
+logger = logging.getLogger(__name__)
+
 
 def generate_worker_id(service_name: str) -> str:
     """
@@ -188,6 +192,7 @@ async def _refresh_lock_periodically(
 
 async def cleanup_stale_locks_periodically(
     storage: StorageProtocol,
+    worker_id: str,
     interval: int = 60,
 ) -> None:
     """
@@ -199,30 +204,49 @@ async def cleanup_stale_locks_periodically(
     Note: This function only cleans up locks without resuming workflows.
     For automatic workflow resumption, use auto_resume_stale_workflows_periodically().
 
+    Uses system-level locking to ensure only one pod executes cleanup at a time.
+
     Example:
         >>> asyncio.create_task(
-        ...     cleanup_stale_locks_periodically(storage, interval=60)
+        ...     cleanup_stale_locks_periodically(storage, worker_id, interval=60)
         ... )
 
     Args:
         storage: Storage backend
+        worker_id: Unique identifier for this worker (for global lock coordination)
         interval: Cleanup interval in seconds (default: 60)
     """
     with suppress(asyncio.CancelledError):
         while True:
-            await asyncio.sleep(interval)
+            # Add jitter to prevent thundering herd in multi-pod deployments
+            jitter = random.uniform(0, interval * 0.3)
+            await asyncio.sleep(interval + jitter)
+
+            # Try to acquire global lock for this task
+            lock_acquired = await storage.try_acquire_system_lock(
+                lock_name="cleanup_stale_locks",
+                worker_id=worker_id,
+                timeout_seconds=interval,
+            )
+
+            if not lock_acquired:
+                # Another pod is handling this task
+                continue
 
-            # Clean up stale locks
-            workflows = await storage.cleanup_stale_locks()
+            try:
+                # Clean up stale locks
+                workflows = await storage.cleanup_stale_locks()
 
-            if len(workflows) > 0:
-                # Log cleanup (in a real implementation, use proper logging)
-                print(f"Cleaned up {len(workflows)} stale locks")
+                if len(workflows) > 0:
+                    logger.info("Cleaned up %d stale locks", len(workflows))
+            finally:
+                await storage.release_system_lock("cleanup_stale_locks", worker_id)
 
 
 async def auto_resume_stale_workflows_periodically(
     storage: StorageProtocol,
     replay_engine: Any,
+    worker_id: str,
     interval: int = 60,
 ) -> None:
     """
@@ -231,83 +255,122 @@ async def auto_resume_stale_workflows_periodically(
     This combines lock cleanup with automatic workflow resumption, ensuring
     that workflows interrupted by worker crashes are automatically recovered.
 
+    Uses system-level locking to ensure only one pod executes this task at a time,
+    preventing duplicate workflow execution (CRITICAL for safety).
+
     Example:
         >>> asyncio.create_task(
         ...     auto_resume_stale_workflows_periodically(
-        ...         storage, replay_engine, interval=60
+        ...         storage, replay_engine, worker_id, interval=60
         ...     )
         ... )
 
     Args:
         storage: Storage backend
         replay_engine: ReplayEngine instance for resuming workflows
+        worker_id: Unique identifier for this worker (for global lock coordination)
         interval: Cleanup interval in seconds (default: 60)
     """
     with suppress(asyncio.CancelledError):
         while True:
-            await asyncio.sleep(interval)
+            # Add jitter to prevent thundering herd in multi-pod deployments
+            jitter = random.uniform(0, interval * 0.3)
+            await asyncio.sleep(interval + jitter)
+
+            # Try to acquire global lock for this task
+            lock_acquired = await storage.try_acquire_system_lock(
+                lock_name="auto_resume_stale_workflows",
+                worker_id=worker_id,
+                timeout_seconds=interval,
+            )
 
-            # Clean up stale locks and get workflows to resume
-            workflows_to_resume = await storage.cleanup_stale_locks()
-
-            if len(workflows_to_resume) > 0:
-                # Log cleanup (in a real implementation, use proper logging)
-                print(f"Cleaned up {len(workflows_to_resume)} stale locks")
-
-            # Auto-resume workflows
-            for workflow in workflows_to_resume:
-                instance_id = workflow["instance_id"]
-                workflow_name = workflow["workflow_name"]
-                source_hash = workflow["source_hash"]
-                status = workflow.get("status", "running")
-
-                try:
-                    # Special handling for workflows in compensating state
-                    if status == "compensating":
-                        # Workflow crashed during compensation execution
-                        # Only re-execute compensations, don't run workflow function
-                        print(
-                            f"Auto-resuming compensating workflow: {instance_id} "
-                            f"(compensation recovery only, no workflow execution)"
+            if not lock_acquired:
+                # Another pod is handling this task
+                continue
+
+            try:
+                # Clean up stale locks and get workflows to resume
+                workflows_to_resume = await storage.cleanup_stale_locks()
+
+                if len(workflows_to_resume) > 0:
+                    logger.info("Cleaned up %d stale locks", len(workflows_to_resume))
+
+                # Auto-resume workflows
+                for workflow in workflows_to_resume:
+                    instance_id = workflow["instance_id"]
+                    workflow_name = workflow["workflow_name"]
+                    source_hash = workflow["source_hash"]
+                    status = workflow.get("status", "running")
+
+                    try:
+                        # Special handling for workflows in compensating state
+                        if status == "compensating":
+                            # Workflow crashed during compensation execution
+                            # Only re-execute compensations, don't run workflow function
+                            logger.info(
+                                "Auto-resuming compensating workflow: %s "
+                                "(compensation recovery only, no workflow execution)",
+                                instance_id,
+                            )
+                            success = await replay_engine.resume_compensating_workflow(
+                                instance_id
+                            )
+                            if success:
+                                logger.info(
+                                    "Successfully completed compensations for: %s",
+                                    instance_id,
+                                )
+                            else:
+                                logger.warning(
+                                    "Failed to complete compensations for: %s", instance_id
+                                )
+                            continue
+
+                        # Normal workflow resumption (status='running')
+                        # Check if workflow definition matches current Saga registry
+                        # This prevents resuming workflows with outdated/incompatible code
+                        current_definition = await storage.get_current_workflow_definition(
+                            workflow_name
                        )
-                        success = await replay_engine.resume_compensating_workflow(instance_id)
-                        if success:
-                            print(f"Successfully completed compensations for: {instance_id}")
-                        else:
-                            print(f"Failed to complete compensations for: {instance_id}")
-                        continue
-
-                    # Normal workflow resumption (status='running')
-                    # Check if workflow definition matches current Saga registry
-                    # This prevents resuming workflows with outdated/incompatible code
-                    current_definition = await storage.get_current_workflow_definition(
-                        workflow_name
-                    )
-
-                    if current_definition is None:
-                        print(
-                            f"Skipping auto-resume for {instance_id}: "
-                            f"workflow '{workflow_name}' not found in registry"
+
+                        if current_definition is None:
+                            logger.warning(
+                                "Skipping auto-resume for %s: "
+                                "workflow '%s' not found in registry",
+                                instance_id,
+                                workflow_name,
+                            )
+                            continue
+
+                        if current_definition["source_hash"] != source_hash:
+                            logger.warning(
+                                "Skipping auto-resume for %s: "
+                                "workflow definition has changed "
+                                "(old hash: %s..., new hash: %s...)",
+                                instance_id,
+                                source_hash[:8],
+                                current_definition["source_hash"][:8],
+                            )
+                            continue
+
+                        # Hash matches - safe to resume
+                        logger.info(
+                            "Auto-resuming workflow: %s (instance: %s)",
+                            workflow_name,
+                            instance_id,
                        )
-                        continue
-
-                    if current_definition["source_hash"] != source_hash:
-                        print(
-                            f"Skipping auto-resume for {instance_id}: "
-                            f"workflow definition has changed "
-                            f"(old hash: {source_hash[:8]}..., "
-                            f"new hash: {current_definition['source_hash'][:8]}...)"
+                        await replay_engine.resume_by_name(instance_id, workflow_name)
+                        logger.info("Successfully resumed workflow: %s", instance_id)
+                    except Exception as e:
+                        # Log error but continue with other workflows
+                        logger.error(
+                            "Failed to auto-resume workflow %s: %s",
+                            instance_id,
+                            e,
+                            exc_info=True,
                        )
-                        continue
-
-                    # Hash matches - safe to resume
-                    print(f"Auto-resuming workflow: {workflow_name} (instance: {instance_id})")
-                    await replay_engine.resume_by_name(instance_id, workflow_name)
-                    print(f"Successfully resumed workflow: {instance_id}")
-                except Exception as e:
-                    # Log error but continue with other workflows
-                    # In a real implementation, use proper logging
-                    print(f"Failed to auto-resume workflow {instance_id}: {e}")
+            finally:
+                await storage.release_system_lock("auto_resume_stale_workflows", worker_id)
 
 
 class LockNotAcquiredError(Exception):
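Both periodic maintenance tasks now require a worker_id and coordinate through storage.try_acquire_system_lock() / release_system_lock(), so at most one pod performs cleanup or auto-resume per interval; each loop also sleeps interval plus up to 30% random jitter so pods do not contend for the lock at the same instant. A minimal wiring sketch, assuming the application already has a StorageProtocol backend and a ReplayEngine; the service name "order-service" and the helper name start_auto_resume are illustrative:

import asyncio

from edda.locking import auto_resume_stale_workflows_periodically, generate_worker_id

async def start_auto_resume(storage, replay_engine) -> asyncio.Task:
    # One stable ID per worker process; it identifies this pod when competing
    # for the "auto_resume_stale_workflows" system lock.
    worker_id = generate_worker_id("order-service")
    return asyncio.create_task(
        auto_resume_stale_workflows_periodically(
            storage, replay_engine, worker_id, interval=60
        )
    )

The cleanup-only variant takes the same worker_id: cleanup_stale_locks_periodically(storage, worker_id, interval=60).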