edda-framework 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
edda/app.py CHANGED
@@ -63,6 +63,9 @@ class EddaApp:
         notify_fallback_interval: int = 30,
         # Batch processing settings
         max_workflows_per_batch: int | Literal["auto", "auto:cpu"] = 10,
+        # Leader election settings (for coordinating background tasks across workers)
+        leader_heartbeat_interval: int = 15,
+        leader_lease_duration: int = 45,
     ):
         """
         Initialize Edda application.
@@ -100,6 +103,10 @@ class EddaApp:
                 - int: Fixed batch size (default: 10)
                 - "auto": Scale 10-100 based on queue depth
                 - "auto:cpu": Scale 10-100 based on CPU utilization (requires psutil)
+            leader_heartbeat_interval: Interval in seconds for leader heartbeat (default: 15).
+                Controls how often workers attempt to become/maintain leadership.
+            leader_lease_duration: Duration in seconds for leader lease (default: 45).
+                If leader fails to heartbeat within this time, another worker takes over.
         """
         self.db_url = db_url
         self.service_name = service_name
@@ -168,6 +175,12 @@ class EddaApp:
                 "Must be int, 'auto', or 'auto:cpu'."
             )
 
+        # Leader election settings (for coordinating background tasks)
+        self._leader_heartbeat_interval = leader_heartbeat_interval
+        self._leader_lease_duration = leader_lease_duration
+        self._is_leader = False
+        self._leader_tasks: list[asyncio.Task[Any]] = []
+
     def _create_storage(self, db_url: str) -> SQLAlchemyStorage:
         """
         Create storage backend from database URL.
@@ -309,19 +322,19 @@ class EddaApp:
 
         # Subscribe to workflow resumable notifications
         await self._notify_listener.subscribe(
-            "edda_workflow_resumable",
+            "workflow_resumable",
             self._on_workflow_resumable_notify,
         )
 
         # Subscribe to outbox notifications
         await self._notify_listener.subscribe(
-            "edda_outbox_pending",
+            "workflow_outbox_pending",
            self._on_outbox_pending_notify,
        )
 
         # Subscribe to timer expired notifications
         await self._notify_listener.subscribe(
-            "edda_timer_expired",
+            "workflow_timer_expired",
             self._on_timer_expired_notify,
         )
 
@@ -462,47 +475,202 @@ class EddaApp:
         self._initialized = False
 
     def _start_background_tasks(self) -> None:
-        """Start background maintenance tasks."""
-        # Task to cleanup stale locks and auto-resume workflows
-        auto_resume_task = asyncio.create_task(
-            auto_resume_stale_workflows_periodically(
-                self.storage,
-                self.replay_engine,
-                self.worker_id,
-                interval=60,  # Check every 60 seconds
-            )
-        )
-        self._background_tasks.append(auto_resume_task)
+        """Start background maintenance tasks.
 
-        # Task to check expired timers and resume workflows
-        timer_check_task = asyncio.create_task(
-            self._check_expired_timers_periodically(interval=10)  # Check every 10 seconds
-        )
-        self._background_tasks.append(timer_check_task)
-
-        # Task to check expired message subscriptions and fail workflows
-        # Note: CloudEvents timeouts are also handled here since wait_event() uses wait_message()
-        message_timeout_task = asyncio.create_task(
-            self._check_expired_message_subscriptions_periodically(
-                interval=10
-            )  # Check every 10 seconds
-        )
-        self._background_tasks.append(message_timeout_task)
+        Background tasks are divided into two categories:
+        1. All-worker tasks: Run on every worker (leader election, workflow resumption)
+        2. Leader-only tasks: Run only on the elected leader (timers, timeouts, cleanup)
+
+        This design reduces database polling load significantly in multi-worker deployments.
+        """
+        # Leader election loop (all workers participate)
+        leader_election_task = asyncio.create_task(self._leader_election_loop())
+        self._background_tasks.append(leader_election_task)
 
-        # Task to resume workflows after message delivery (fast resumption)
+        # Task to resume workflows after message delivery (all workers - competitive lock)
         message_resume_task = asyncio.create_task(
             self._resume_running_workflows_periodically(interval=1)  # Check every 1 second
         )
         self._background_tasks.append(message_resume_task)
 
-        # Task to cleanup old channel messages (orphaned messages)
-        message_cleanup_task = asyncio.create_task(
-            self._cleanup_old_messages_periodically(
-                interval=3600,  # Check every 1 hour
-                retention_days=self._message_retention_days,
+        # Note: Leader-only tasks (timer checks, message timeouts, stale workflow cleanup,
+        # old message cleanup) are started dynamically in _leader_election_loop() when
+        # this worker becomes the leader.
+
+    async def _leader_election_loop(self) -> None:
+        """
+        Leader election loop that runs on all workers.
+
+        Uses system lock to elect a single leader among all workers.
+        The leader runs maintenance tasks (timer checks, message timeouts, etc.).
+        Non-leaders only participate in workflow resumption.
+
+        If a leader task crashes, it will be automatically restarted.
+        """
+        while True:
+            try:
+                was_leader = self._is_leader
+
+                # Try to acquire/renew leadership
+                self._is_leader = await self.storage.try_acquire_system_lock(
+                    lock_name="edda_leader",
+                    worker_id=self.worker_id,
+                    timeout_seconds=self._leader_lease_duration,
+                )
+
+                if self._is_leader and not was_leader:
+                    # Became leader - start leader-only tasks
+                    logger.info(f"Worker {self.worker_id} became leader")
+                    self._leader_tasks = self._create_leader_only_tasks()
+
+                elif not self._is_leader and was_leader:
+                    # Lost leadership - cancel leader-only tasks
+                    logger.info(f"Worker {self.worker_id} lost leadership")
+                    await self._cancel_tasks(self._leader_tasks)
+                    self._leader_tasks = []
+
+                elif self._is_leader:
+                    # Still leader - check if any leader tasks have crashed and restart
+                    await self._monitor_and_restart_leader_tasks()
+
+                # Wait before next heartbeat
+                await asyncio.sleep(self._leader_heartbeat_interval)
+
+            except asyncio.CancelledError:
+                # Shutdown - cancel leader tasks and exit
+                await self._cancel_tasks(self._leader_tasks)
+                self._leader_tasks = []
+                raise
+            except Exception as e:
+                logger.error(f"Leader election error: {e}", exc_info=True)
+                self._is_leader = False
+                await self._cancel_tasks(self._leader_tasks)
+                self._leader_tasks = []
+                # Wait before retry
+                await asyncio.sleep(self._leader_heartbeat_interval)
+
+    def _create_leader_only_tasks(self) -> list[asyncio.Task[Any]]:
+        """
+        Create tasks that should only run on the leader worker.
+
+        These tasks are responsible for:
+        - Timer expiration checks
+        - Message subscription timeout checks
+        - Stale workflow auto-resume
+        - Old message cleanup
+        """
+        tasks = []
+
+        # Timer expiration check
+        tasks.append(
+            asyncio.create_task(
+                self._check_expired_timers_periodically(interval=10),
+                name="leader_timer_check",
             )
         )
-        self._background_tasks.append(message_cleanup_task)
+
+        # Message subscription timeout check
+        tasks.append(
+            asyncio.create_task(
+                self._check_expired_message_subscriptions_periodically(interval=10),
+                name="leader_message_timeout_check",
+            )
+        )
+
+        # Stale workflow auto-resume
+        tasks.append(
+            asyncio.create_task(
+                auto_resume_stale_workflows_periodically(
+                    self.storage,
+                    self.replay_engine,
+                    self.worker_id,
+                    interval=60,
+                ),
+                name="leader_stale_workflow_resume",
+            )
+        )
+
+        # Old message cleanup
+        tasks.append(
+            asyncio.create_task(
+                self._cleanup_old_messages_periodically(
+                    interval=3600,
+                    retention_days=self._message_retention_days,
+                ),
+                name="leader_message_cleanup",
+            )
+        )
+
+        return tasks
+
+    async def _cancel_tasks(self, tasks: list[asyncio.Task[Any]]) -> None:
+        """Cancel a list of tasks and wait for them to finish."""
+        for task in tasks:
+            task.cancel()
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+    async def _monitor_and_restart_leader_tasks(self) -> None:
+        """
+        Monitor leader tasks and restart any that have crashed.
+
+        This ensures leader-only tasks keep running even if they encounter errors.
+        """
+        task_creators = {
+            "leader_timer_check": lambda: asyncio.create_task(
+                self._check_expired_timers_periodically(interval=10),
+                name="leader_timer_check",
+            ),
+            "leader_message_timeout_check": lambda: asyncio.create_task(
+                self._check_expired_message_subscriptions_periodically(interval=10),
+                name="leader_message_timeout_check",
+            ),
+            "leader_stale_workflow_resume": lambda: asyncio.create_task(
+                auto_resume_stale_workflows_periodically(
+                    self.storage,
+                    self.replay_engine,
+                    self.worker_id,
+                    interval=60,
+                ),
+                name="leader_stale_workflow_resume",
+            ),
+            "leader_message_cleanup": lambda: asyncio.create_task(
+                self._cleanup_old_messages_periodically(
+                    interval=3600,
+                    retention_days=self._message_retention_days,
+                ),
+                name="leader_message_cleanup",
+            ),
+        }
+
+        # Check each task and restart if done (crashed)
+        new_tasks = []
+        for task in self._leader_tasks:
+            if task.done():
+                # Task has finished (possibly due to error)
+                task_name = task.get_name()
+                try:
+                    # Check if it raised an exception
+                    exc = task.exception()
+                    if exc is not None:
+                        logger.warning(
+                            f"Leader task {task_name} crashed with {type(exc).__name__}: {exc}, "
+                            "restarting..."
+                        )
+                except asyncio.CancelledError:
+                    # Task was cancelled, don't restart
+                    logger.debug(f"Leader task {task_name} was cancelled")
+                    continue
+
+                # Restart the task
+                if task_name in task_creators:
+                    new_task = task_creators[task_name]()
+                    new_tasks.append(new_task)
+                    logger.info(f"Restarted leader task: {task_name}")
+            else:
+                # Task is still running
+                new_tasks.append(task)
+
+        self._leader_tasks = new_tasks
 
     def _auto_register_workflows(self) -> None:
         """
edda/channels.py CHANGED
@@ -145,8 +145,22 @@ async def subscribe(
     Args:
         ctx: Workflow context
         channel: Channel name to subscribe to
-        mode: Subscription mode - "broadcast" (all subscribers receive all messages)
-            or "competing" (each message goes to only one subscriber)
+        mode: Subscription mode:
+            - "broadcast": All subscribers receive all messages (fan-out pattern)
+            - "competing": Each message goes to only one subscriber (work queue pattern)
+            - "direct": Receive messages sent via send_to() to this instance
+
+        The "direct" mode is syntactic sugar that subscribes to "channel:instance_id" internally,
+        allowing simpler code when receiving direct messages:
+
+            # Instead of this:
+            direct_channel = f"notifications:{ctx.instance_id}"
+            await subscribe(ctx, direct_channel, mode="broadcast")
+            msg = await receive(ctx, direct_channel)
+
+            # You can write:
+            await subscribe(ctx, "notifications", mode="direct")
+            msg = await receive(ctx, "notifications")
 
     Example:
         >>> @workflow
@@ -168,11 +182,29 @@ async def subscribe(
         ...     job = await receive(ctx, "jobs")
         ...     await execute_job(ctx, job.data, activity_id=f"job:{job.id}")
         ...     await ctx.recur()
+
+        >>> @workflow
+        ... async def direct_receiver(ctx: WorkflowContext, id: str):
+        ...     # Subscribe to receive direct messages via send_to()
+        ...     await subscribe(ctx, "notifications", mode="direct")
+        ...
+        ...     msg = await receive(ctx, "notifications")
+        ...     print(f"Received: {msg.data}")
     """
-    if mode not in ("broadcast", "competing"):
-        raise ValueError(f"Invalid subscription mode: {mode}. Must be 'broadcast' or 'competing'")
+    actual_channel = channel
+    actual_mode = mode
+
+    if mode == "direct":
+        # Transform to instance-specific channel
+        actual_channel = f"{channel}:{ctx.instance_id}"
+        actual_mode = "broadcast"
+        ctx._record_direct_subscription(channel)
+    elif mode not in ("broadcast", "competing"):
+        raise ValueError(
+            f"Invalid subscription mode: {mode}. Must be 'broadcast', 'competing', or 'direct'"
+        )
 
-    await ctx.storage.subscribe_to_channel(ctx.instance_id, channel, mode)
+    await ctx.storage.subscribe_to_channel(ctx.instance_id, actual_channel, actual_mode)
 
 
 async def unsubscribe(
@@ -185,11 +217,17 @@ async def unsubscribe(
     Note: Workflows are automatically unsubscribed from all channels when they
     complete, fail, or are cancelled. Explicit unsubscribe is usually not necessary.
 
+    For channels subscribed with mode="direct", use the original channel name
+    (not the transformed "channel:instance_id" form).
+
     Args:
         ctx: Workflow context
         channel: Channel name to unsubscribe from
     """
-    await ctx.storage.unsubscribe_from_channel(ctx.instance_id, channel)
+    actual_channel = channel
+    if ctx._is_direct_subscription(channel):
+        actual_channel = f"{channel}:{ctx.instance_id}"
+    await ctx.storage.unsubscribe_from_channel(ctx.instance_id, actual_channel)
 
 
 # =============================================================================
@@ -233,7 +271,12 @@ async def receive(
         ...     await process(ctx, msg.data, activity_id=f"process:{msg.id}")
         ...     await ctx.recur()
     """
-    # Generate activity ID
+    # Transform channel for direct subscriptions
+    actual_channel = channel
+    if ctx._is_direct_subscription(channel):
+        actual_channel = f"{channel}:{ctx.instance_id}"
+
+    # Generate activity ID (use original channel name for deterministic replay)
     if message_id is None:
         activity_id = ctx._generate_activity_id(f"receive_{channel}")
     else:
@@ -277,21 +320,21 @@ async def receive(
         raise RuntimeError(f"Unexpected cached result type: {type(cached_result)}")
 
     # Check for pending messages in the queue
-    pending = await ctx.storage.get_pending_channel_messages(ctx.instance_id, channel)
+    pending = await ctx.storage.get_pending_channel_messages(ctx.instance_id, actual_channel)
     if pending:
         # Get the first pending message
         msg_dict = pending[0]
         msg_id = msg_dict["message_id"]
 
         # For competing mode, try to claim the message
-        subscription = await _get_subscription(ctx, channel)
+        subscription = await _get_subscription(ctx, actual_channel)
         if subscription and subscription.get("mode") == "competing":
             claimed = await ctx.storage.claim_channel_message(msg_id, ctx.instance_id)
             if not claimed:
                 # Another worker claimed it, check next message
                 # For simplicity, raise exception to retry
                 raise WaitForChannelMessageException(
-                    channel=channel,
+                    channel=actual_channel,
                     timeout_seconds=timeout_seconds,
                     activity_id=activity_id,
                 )
@@ -299,7 +342,9 @@ async def receive(
             await ctx.storage.delete_channel_message(msg_id)
         else:
             # Broadcast mode - update cursor
-            await ctx.storage.update_delivery_cursor(channel, ctx.instance_id, msg_dict["id"])
+            await ctx.storage.update_delivery_cursor(
+                actual_channel, ctx.instance_id, msg_dict["id"]
+            )
 
         # Build the message
         raw_data = msg_dict.get("data")
@@ -337,7 +382,7 @@ async def receive(
 
     # No pending messages, raise exception to pause workflow
     raise WaitForChannelMessageException(
-        channel=channel,
+        channel=actual_channel,
         timeout_seconds=timeout_seconds,
         activity_id=activity_id,
     )
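
Under the hood, mode="direct" is only a naming convention: subscribe() stores a broadcast subscription on an instance-scoped channel, and receive()/unsubscribe() translate the caller-facing name back to that channel via the WorkflowContext bookkeeping shown below in context.py. A self-contained sketch of that resolution rule follows; resolve_channel is a hypothetical helper used here only for illustration.

# Sketch of the channel-name resolution behind mode="direct".
# `resolve_channel` is hypothetical; in edda the same mapping happens inside
# subscribe()/receive()/unsubscribe() using ctx._is_direct_subscription().

def resolve_channel(channel: str, instance_id: str, direct_subscriptions: set[str]) -> str:
    """Return the storage-level channel name for a caller-facing channel name."""
    if channel in direct_subscriptions:
        # Direct subscriptions are stored as broadcast subscriptions on an
        # instance-scoped channel, e.g. "notifications:wf-1234".
        return f"{channel}:{instance_id}"
    return channel


direct = {"notifications"}  # recorded when subscribe(..., mode="direct") runs
assert resolve_channel("notifications", "wf-1234", direct) == "notifications:wf-1234"
assert resolve_channel("jobs", "wf-1234", direct) == "jobs"  # non-direct channels pass through
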
edda/context.py CHANGED
@@ -69,6 +69,9 @@ class WorkflowContext:
         # Default retry policy from EddaApp (set by ReplayEngine)
         self._app_retry_policy: Any = None
 
+        # Direct subscriptions: channel names subscribed with mode="direct"
+        self._direct_subscriptions: set[str] = set()
+
     @property
     def storage(self) -> StorageProtocol:
         """
@@ -271,6 +274,27 @@ class WorkflowContext:
         """
         self.executed_activity_ids.add(activity_id)
 
+    def _record_direct_subscription(self, channel: str) -> None:
+        """
+        Record that a channel was subscribed in direct mode (internal use only).
+
+        Args:
+            channel: The original channel name (before transformation)
+        """
+        self._direct_subscriptions.add(channel)
+
+    def _is_direct_subscription(self, channel: str) -> bool:
+        """
+        Check if a channel was subscribed in direct mode (internal use only).
+
+        Args:
+            channel: The channel name to check
+
+        Returns:
+            True if the channel was subscribed with mode="direct"
+        """
+        return channel in self._direct_subscriptions
+
     async def _record_activity_completed(
         self,
         activity_id: str,
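
Taken together, the channels.py and context.py changes let a workflow deal only with the original channel name while storage sees the instance-scoped one. A hedged end-to-end sketch, assuming the import paths below: edda/channels.py and edda/context.py define these names in this diff, but the @workflow decorator's location and the sender-side send_to() signature are not shown here.

# Sketch: using mode="direct" end to end. The import path for `workflow` is an
# assumption; receive/subscribe/unsubscribe and WorkflowContext come from the
# modules changed in this diff.
from edda import workflow  # assumed export location
from edda.channels import receive, subscribe, unsubscribe
from edda.context import WorkflowContext


@workflow
async def direct_receiver(ctx: WorkflowContext) -> None:
    # Internally subscribes to f"notifications:{ctx.instance_id}" in broadcast
    # mode and records "notifications" in ctx._direct_subscriptions.
    await subscribe(ctx, "notifications", mode="direct")

    # receive() resolves the name back to the instance-scoped channel, while the
    # activity ID keeps the original name for deterministic replay.
    msg = await receive(ctx, "notifications")
    print(f"Received: {msg.data}")

    # Unsubscribe with the ORIGINAL channel name, not "notifications:<id>".
    await unsubscribe(ctx, "notifications")
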