edda-framework 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
edda/app.py CHANGED
@@ -7,6 +7,8 @@ application for handling CloudEvents and executing workflows.
 
  import asyncio
  import json
+ import logging
+ import random
  import sys
  from collections.abc import Callable
  from typing import Any
@@ -24,6 +26,8 @@ from edda.replay import ReplayEngine
  from edda.retry import RetryPolicy
  from edda.storage.sqlalchemy_storage import SQLAlchemyStorage
 
+ logger = logging.getLogger(__name__)
+
 
  class EddaApp:
  """
@@ -41,9 +45,16 @@ class EddaApp:
  service_name: str,
  db_url: str,
  outbox_enabled: bool = False,
- broker_url: str = "http://broker-ingress.knative-eventing.svc.cluster.local/default/default",
+ broker_url: str | None = None,
  hooks: WorkflowHooks | None = None,
  default_retry_policy: "RetryPolicy | None" = None,
+ message_retention_days: int = 7,
+ # Connection pool settings (ignored for SQLite)
+ pool_size: int = 5,
+ max_overflow: int = 10,
+ pool_timeout: int = 30,
+ pool_recycle: int = 3600,
+ pool_pre_ping: bool = True,
  ):
  """
  Initialize Edda application.
@@ -52,18 +63,41 @@ class EddaApp:
  service_name: Service name for distributed execution (e.g., "order-service")
  db_url: Database URL (e.g., "sqlite:///workflow.db")
  outbox_enabled: Enable transactional outbox pattern
- broker_url: Knative Broker URL for outbox publishing
+ broker_url: Broker URL for outbox publishing. Required if outbox_enabled=True.
  hooks: Optional WorkflowHooks implementation for observability
  default_retry_policy: Default retry policy for all activities.
  If None, uses DEFAULT_RETRY_POLICY (5 attempts, exponential backoff).
  Can be overridden per-activity using @activity(retry_policy=...).
+ message_retention_days: Number of days to retain channel messages before automatic cleanup.
+ Defaults to 7 days. Messages older than this will be deleted
+ by a background task running every hour.
+ pool_size: Number of connections to keep open in the pool (default: 5).
+ Ignored for SQLite. For production, consider 20+.
+ max_overflow: Maximum number of connections to create above pool_size (default: 10).
+ Ignored for SQLite. For production, consider 40+.
+ pool_timeout: Seconds to wait for a connection from the pool (default: 30).
+ Ignored for SQLite.
+ pool_recycle: Seconds before a connection is recycled (default: 3600).
+ Helps prevent stale connections. Ignored for SQLite.
+ pool_pre_ping: If True, test connections before use (default: True).
+ Helps detect disconnected connections. Ignored for SQLite.
  """
  self.db_url = db_url
  self.service_name = service_name
  self.outbox_enabled = outbox_enabled
  self.broker_url = broker_url
+ if self.outbox_enabled and not self.broker_url:
+ raise ValueError("broker_url is required when outbox_enabled=True")
  self.hooks = hooks
  self.default_retry_policy = default_retry_policy
+ self._message_retention_days = message_retention_days
+
+ # Connection pool settings
+ self._pool_size = pool_size
+ self._max_overflow = max_overflow
+ self._pool_timeout = pool_timeout
+ self._pool_recycle = pool_recycle
+ self._pool_pre_ping = pool_pre_ping
 
  # Generate unique worker ID for this process
  self.worker_id = generate_worker_id(service_name)
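For readers upgrading from 0.6.0, a minimal sketch of how the new constructor parameters might be used. The `from edda.app import EddaApp` import path and the PostgreSQL URL are assumptions for illustration; the parameter names come from the signature above. Note that `broker_url` no longer has a Knative default, so it must be passed explicitly whenever `outbox_enabled=True`.

```python
# Illustrative sketch only (not part of the package diff).
from edda.app import EddaApp  # assumed import path, based on the file shown above

app = EddaApp(
    service_name="order-service",
    db_url="postgresql+asyncpg://edda:secret@db:5432/edda",  # placeholder URL
    outbox_enabled=True,
    # Required in 0.8.0 when outbox_enabled=True; 0.6.0's Knative default was removed.
    broker_url="http://broker-ingress.knative-eventing.svc.cluster.local/default/default",
    message_retention_days=14,  # keep channel messages for two weeks
    pool_size=20,               # pool settings are ignored for SQLite
    max_overflow=40,
    pool_pre_ping=True,
)
```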
@@ -100,18 +134,35 @@ class EddaApp:
  Returns:
  SQLAlchemyStorage instance
  """
+ # Check if using SQLite (connection pool settings not applicable)
+ is_sqlite = db_url.startswith("sqlite")
+
  # Convert plain sqlite:// URLs to use aiosqlite driver
  if db_url.startswith("sqlite:///"):
  db_url = db_url.replace("sqlite:///", "sqlite+aiosqlite:///", 1)
  elif db_url == "sqlite:///:memory:" or db_url.startswith("sqlite:///:memory:"):
  db_url = "sqlite+aiosqlite:///:memory:"
 
+ # Build engine kwargs
+ engine_kwargs: dict[str, Any] = {
+ "echo": False, # Set to True for SQL logging
+ "future": True,
+ }
+
+ # Add connection pool settings for non-SQLite databases
+ if not is_sqlite:
+ engine_kwargs.update(
+ {
+ "pool_size": self._pool_size,
+ "max_overflow": self._max_overflow,
+ "pool_timeout": self._pool_timeout,
+ "pool_recycle": self._pool_recycle,
+ "pool_pre_ping": self._pool_pre_ping,
+ }
+ )
+
  # Create async engine
- engine = create_async_engine(
- db_url,
- echo=False, # Set to True for SQL logging
- future=True,
- )
+ engine = create_async_engine(db_url, **engine_kwargs)
 
  return SQLAlchemyStorage(engine)
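The refactored `_create_storage` above amounts to passing SQLAlchemy's standard pool keywords to `create_async_engine` only for server databases; a standalone sketch of that logic (the function name and URLs are illustrative assumptions):

```python
# Standalone sketch of the engine construction shown above (illustrative only).
from typing import Any

from sqlalchemy.ext.asyncio import create_async_engine


def build_engine(db_url: str, pool_size: int = 5, max_overflow: int = 10):
    engine_kwargs: dict[str, Any] = {"echo": False, "future": True}
    if not db_url.startswith("sqlite"):
        # Pool settings only apply to server databases; aiosqlite ignores them.
        engine_kwargs.update(
            pool_size=pool_size,
            max_overflow=max_overflow,
            pool_timeout=30,
            pool_recycle=3600,
            pool_pre_ping=True,
        )
    elif db_url.startswith("sqlite:///"):
        db_url = db_url.replace("sqlite:///", "sqlite+aiosqlite:///", 1)
    return create_async_engine(db_url, **engine_kwargs)
```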
 
@@ -148,6 +199,7 @@ class EddaApp:
 
  # Initialize outbox relayer if enabled
  if self.outbox_enabled:
+ assert self.broker_url is not None # Validated in __init__
  self.outbox_relayer = OutboxRelayer(
  storage=self.storage,
  broker_url=self.broker_url,
@@ -194,6 +246,7 @@ class EddaApp:
  auto_resume_stale_workflows_periodically(
  self.storage,
  self.replay_engine,
+ self.worker_id,
  interval=60, # Check every 60 seconds
  )
  )
@@ -205,11 +258,29 @@ class EddaApp:
  )
  self._background_tasks.append(timer_check_task)
 
- # Task to check expired event timeouts and fail workflows
- event_timeout_task = asyncio.create_task(
- self._check_expired_event_timeouts_periodically(interval=10) # Check every 10 seconds
+ # Task to check expired message subscriptions and fail workflows
+ # Note: CloudEvents timeouts are also handled here since wait_event() uses wait_message()
+ message_timeout_task = asyncio.create_task(
+ self._check_expired_message_subscriptions_periodically(
+ interval=10
+ ) # Check every 10 seconds
+ )
+ self._background_tasks.append(message_timeout_task)
+
+ # Task to resume workflows after message delivery (fast resumption)
+ message_resume_task = asyncio.create_task(
+ self._resume_running_workflows_periodically(interval=1) # Check every 1 second
  )
- self._background_tasks.append(event_timeout_task)
+ self._background_tasks.append(message_resume_task)
+
+ # Task to cleanup old channel messages (orphaned messages)
+ message_cleanup_task = asyncio.create_task(
+ self._cleanup_old_messages_periodically(
+ interval=3600, # Check every 1 hour
+ retention_days=self._message_retention_days,
+ )
+ )
+ self._background_tasks.append(message_cleanup_task)
 
  def _auto_register_workflows(self) -> None:
  """
@@ -354,11 +425,7 @@ class EddaApp:
  try:
  await handler(event)
  except Exception as e:
- # Log error (in a real implementation, use proper logging)
- print(f"Error handling event {event_type}: {e}")
- import traceback
-
- traceback.print_exc()
+ logger.error("Error handling event %s: %s", event_type, e, exc_info=True)
 
  async def _deliver_event_to_waiting_workflows_safe(self, event: Any) -> None:
  """
@@ -370,35 +437,46 @@ class EddaApp:
  try:
  await self._deliver_event_to_waiting_workflows(event)
  except Exception as e:
- print(f"Error delivering event to waiting workflows: {e}")
- import traceback
-
- traceback.print_exc()
+ logger.error("Error delivering event to waiting workflows: %s", e, exc_info=True)
 
  async def _deliver_event_to_waiting_workflows(self, event: Any) -> None:
  """
- Deliver event to workflows waiting for this event type.
+ Deliver CloudEvent to workflows waiting for this event type.
 
- This method:
- 1. Finds workflows waiting for the event type
- 2. Records event data to workflow history
- 3. Removes event subscription
- 4. Resumes the workflow
+ This method supports two delivery patterns based on the 'eddainstanceid' extension:
+
+ 1. **Point-to-Point** (when 'eddainstanceid' is present):
+ Delivers to a specific workflow instance only.
+
+ 2. **Pub/Sub** (when 'eddainstanceid' is absent):
+ Delivers to ALL workflows waiting for this event type.
+
+ Both patterns use the Channel-based Message Queue system for delivery:
+ - Lock acquisition (Lock-First pattern)
+ - History recording (ChannelMessageReceived)
+ - Subscription cursor update (broadcast) or message deletion (competing)
+ - Status update to 'running'
+ - Lock release
+
+ Workflow resumption is handled by background task (_resume_running_workflows_periodically).
 
  Args:
  event: CloudEvent instance
  """
+ from edda.channels import publish
+
  event_type = event["type"]
  event_data = event.get_data()
 
- # Extract CloudEvents metadata
- event_metadata = {
- "type": event["type"],
- "source": event["source"],
- "id": event["id"],
- "time": event.get("time"),
- "datacontenttype": event.get("datacontenttype"),
- "subject": event.get("subject"),
+ # Extract CloudEvents metadata with ce_ prefix
+ # This allows ReceivedEvent to reconstruct CloudEvents attributes
+ metadata = {
+ "ce_type": event["type"],
+ "ce_source": event["source"],
+ "ce_id": event["id"],
+ "ce_time": event.get("time"),
+ "ce_datacontenttype": event.get("datacontenttype"),
+ "ce_subject": event.get("subject"),
  }
 
  # Extract extension attributes (any attributes not in the standard set)
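As the new docstring explains, a producer can target a single workflow instance by setting the `eddainstanceid` CloudEvents extension; a sketch from the sender's side using the `cloudevents` Python SDK (the event type, payload, and HTTP endpoint are illustrative assumptions):

```python
# Illustrative sketch of point-to-point vs. pub/sub delivery from the sender side.
from cloudevents.http import CloudEvent, to_structured
import requests  # any HTTP client that can POST the encoded event would do

event = CloudEvent(
    {
        "type": "order.shipped",          # becomes the channel name on the Edda side
        "source": "shipping-service",
        "eddainstanceid": "wf-1234",      # omit this attribute for pub/sub fan-out
    },
    {"order_id": "1234", "carrier": "DHL"},
)
headers, body = to_structured(event)
requests.post("http://order-service.default.svc/", data=body, headers=headers)
```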
@@ -414,112 +492,68 @@ class EddaApp:
  "data_base64",
  }
  extensions = {k: v for k, v in event.get_attributes().items() if k not in standard_attrs}
-
- # Find workflows waiting for this event type
- waiting_instances = await self.storage.find_waiting_instances(event_type)
-
- if not waiting_instances:
- return # No workflows waiting for this event
-
- print(
- f"[EventDelivery] Found {len(waiting_instances)} workflow(s) waiting for '{event_type}'"
- )
-
- for subscription in waiting_instances:
- instance_id = subscription["instance_id"]
-
- # Get workflow instance
- instance = await self.storage.get_instance(instance_id)
- if not instance:
- print(f"[EventDelivery] Warning: Instance {instance_id} not found, skipping")
- continue
-
- # Check if instance is still waiting
- if instance.get("status") != "waiting_for_event":
- print(
- f"[EventDelivery] Warning: Instance {instance_id} "
- f"status is '{instance.get('status')}', expected 'waiting_for_event', skipping"
- )
- continue
-
- # Get activity_id from the subscription (stored when wait_event was called)
- activity_id = subscription.get("activity_id")
- if not activity_id:
- print(
- f"[EventDelivery] Warning: No activity_id in subscription for {instance_id}, skipping"
- )
- continue
-
- workflow_name = instance["workflow_name"]
-
- # Distributed Coroutines: Acquire lock FIRST to prevent race conditions
- # This ensures only ONE pod processes this event, even if multiple pods
- # receive the event simultaneously
- lock_acquired = await self.storage.try_acquire_lock(
- instance_id, self.worker_id, timeout_seconds=300
+ if extensions:
+ metadata["ce_extensions"] = extensions
+
+ # Check for eddainstanceid extension attribute for Point-to-Point delivery
+ target_instance_id = extensions.get("eddainstanceid")
+
+ if target_instance_id:
+ # Point-to-Point: Deliver to specific instance only
+ logger.debug(
+ "Point-to-Point: Delivering '%s' to instance %s",
+ event_type,
+ target_instance_id,
  )
 
- if not lock_acquired:
- print(
- f"[EventDelivery] Another worker is processing {instance_id}, skipping "
- "(distributed coroutine - lock already held)"
- )
- continue
-
  try:
- print(
- f"[EventDelivery] Delivering event to workflow {instance_id} (activity_id: {activity_id})"
+ await publish(
+ self.storage,
+ channel=event_type,
+ data=event_data,
+ metadata=metadata,
+ target_instance_id=target_instance_id,
+ worker_id=self.worker_id,
+ )
+ logger.debug(
+ "Published '%s' to channel (target: %s)",
+ event_type,
+ target_instance_id,
  )
 
- # 1. Record event data and metadata to history
- try:
- await self.storage.append_history(
- instance_id,
- activity_id=activity_id,
- event_type="EventReceived",
- event_data={
- "payload": event_data,
- "metadata": event_metadata,
- "extensions": extensions,
- },
- )
- except Exception as history_error:
- # If history entry already exists (UNIQUE constraint), this event was already
- # delivered by another worker in a multi-process environment.
- # Skip workflow resumption to prevent duplicate processing.
- print(
- f"[EventDelivery] History already exists for activity_id {activity_id}: {history_error}"
- )
- print(
- f"[EventDelivery] Event '{event_type}' was already delivered by another worker, skipping"
- )
- continue
-
- # 2. Remove event subscription
- await self.storage.remove_event_subscription(instance_id, event_type)
+ except Exception as e:
+ logger.error(
+ "Error delivering to workflow %s: %s",
+ target_instance_id,
+ e,
+ exc_info=True,
+ )
 
- # 3. Resume workflow (lock already held by this worker - distributed coroutine pattern)
- if self.replay_engine is None:
- print("[EventDelivery] Error: Replay engine not initialized")
- continue
+ else:
+ # Pub/Sub: Deliver to ALL waiting instances
+ logger.debug("Pub/Sub: Publishing '%s' to channel", event_type)
 
- await self.replay_engine.resume_by_name(
- instance_id, workflow_name, already_locked=True
+ try:
+ message_id = await publish(
+ self.storage,
+ channel=event_type,
+ data=event_data,
+ metadata=metadata,
+ worker_id=self.worker_id,
  )
-
- print(
- f"[EventDelivery] ✅ Resumed workflow {instance_id} after receiving '{event_type}'"
+ logger.debug(
+ "Published '%s' to channel (message_id: %s)",
+ event_type,
+ message_id,
  )
 
  except Exception as e:
- print(f"[EventDelivery] ❌ Error resuming workflow {instance_id}: {e}")
- import traceback
-
- traceback.print_exc()
-
- finally:
- # Always release the lock, even if an error occurred
- await self.storage.release_lock(instance_id, self.worker_id)
+ logger.error(
+ "Error publishing to channel '%s': %s",
+ event_type,
+ e,
+ exc_info=True,
+ )
 
  async def _check_expired_timers(self) -> None:
  """
@@ -542,7 +576,7 @@ class EddaApp:
  if not expired_timers:
  return # No expired timers
 
- print(f"[TimerCheck] Found {len(expired_timers)} expired timer(s)")
+ logger.debug("Found %d expired timer(s)", len(expired_timers))
 
  for timer in expired_timers:
  instance_id = timer["instance_id"]
@@ -551,22 +585,12 @@ class EddaApp:
  activity_id = timer.get("activity_id")
 
  if not activity_id:
- print(f"[TimerCheck] Warning: No activity_id in timer for {instance_id}, skipping")
- continue
-
- # Get workflow instance
- instance = await self.storage.get_instance(instance_id)
- if not instance:
- print(f"[TimerCheck] Warning: Instance {instance_id} not found, skipping")
+ logger.warning("No activity_id in timer for %s, skipping", instance_id)
  continue
 
- # Check if instance is still waiting for timer
- if instance.get("status") != "waiting_for_timer":
- print(
- f"[TimerCheck] Warning: Instance {instance_id} "
- f"status is '{instance.get('status')}', expected 'waiting_for_timer', skipping"
- )
- continue
+ # Note: find_expired_timers() already filters by status='waiting_for_timer'
+ # and JOINs with workflow_instances, so no need for additional get_instance() call.
+ # The lock mechanism below handles race conditions.
 
  # Distributed Coroutines: Acquire lock FIRST to prevent race conditions
  # This ensures only ONE pod processes this timer, even if multiple pods
@@ -576,15 +600,18 @@ class EddaApp:
  )
 
  if not lock_acquired:
- print(
- f"[TimerCheck] Another worker is processing {instance_id}, skipping "
- "(distributed coroutine - lock already held)"
+ logger.debug(
+ "Another worker is processing %s, skipping (lock already held)",
+ instance_id,
  )
  continue
 
  try:
- print(
- f"[TimerCheck] Timer '{timer_id}' expired for workflow {instance_id} (activity_id: {activity_id})"
+ logger.debug(
+ "Timer '%s' expired for workflow %s (activity_id: %s)",
+ timer_id,
+ instance_id,
+ activity_id,
  )
 
  # 1. Record timer expiration to history (allows deterministic replay)
@@ -604,11 +631,14 @@ class EddaApp:
  # If history entry already exists (UNIQUE constraint), this timer was already
  # processed by another worker in a multi-process environment.
  # Skip workflow resumption to prevent duplicate processing.
- print(
- f"[TimerCheck] History already exists for activity_id {activity_id}: {history_error}"
+ logger.debug(
+ "History already exists for activity_id %s: %s",
+ activity_id,
+ history_error,
  )
- print(
- f"[TimerCheck] Timer '{timer_id}' was already processed by another worker, skipping"
+ logger.debug(
+ "Timer '%s' was already processed by another worker, skipping",
+ timer_id,
  )
  continue
 
@@ -617,22 +647,21 @@ class EddaApp:
 
  # 3. Resume workflow (lock already held by this worker - distributed coroutine pattern)
  if self.replay_engine is None:
- print("[TimerCheck] Error: Replay engine not initialized")
+ logger.error("Replay engine not initialized")
  continue
 
  await self.replay_engine.resume_by_name(
  instance_id, workflow_name, already_locked=True
  )
 
- print(
- f"[TimerCheck] ✅ Resumed workflow {instance_id} after timer '{timer_id}' expired"
+ logger.debug(
+ "Resumed workflow %s after timer '%s' expired",
+ instance_id,
+ timer_id,
  )
 
  except Exception as e:
- print(f"[TimerCheck] ❌ Error resuming workflow {instance_id}: {e}")
- import traceback
-
- traceback.print_exc()
+ logger.error("Error resuming workflow %s: %s", instance_id, e, exc_info=True)
 
  finally:
  # Always release the lock, even if an error occurred
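Both the timer and message paths follow the same Lock-First sequence described in the comments above; a condensed sketch of that pattern (the history event name is an assumption for the timer case, and subscription cleanup is omitted for brevity):

```python
# Condensed sketch of the Lock-First pattern used by the background checkers.
async def deliver_once(storage, replay_engine, worker_id, instance_id, workflow_name,
                       activity_id, timer_id):
    if not await storage.try_acquire_lock(instance_id, worker_id, timeout_seconds=300):
        return  # another worker owns this instance right now
    try:
        try:
            # The UNIQUE constraint on history makes this idempotent: a second worker
            # that somehow gets here fails the insert and skips resumption.
            await storage.append_history(
                instance_id,
                activity_id=activity_id,
                event_type="TimerFired",  # assumed name; the diff only shows the call site
                event_data={"timer_id": timer_id},
            )
        except Exception:
            return  # already processed elsewhere
        await replay_engine.resume_by_name(instance_id, workflow_name, already_locked=True)
    finally:
        await storage.release_lock(instance_id, worker_id)
```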
@@ -655,33 +684,30 @@ class EddaApp:
  await asyncio.sleep(interval)
  await self._check_expired_timers()
  except Exception as e:
- print(f"[TimerCheck] Error in periodic timer check: {e}")
- import traceback
-
- traceback.print_exc()
+ logger.error("Error in periodic timer check: %s", e, exc_info=True)
 
- async def _check_expired_event_timeouts(self) -> None:
+ async def _check_expired_message_subscriptions(self) -> None:
  """
- Check for event subscriptions that have timed out and fail those workflows.
+ Check for message subscriptions that have timed out and fail those workflows.
 
  This method:
- 1. Finds all event subscriptions where timeout_at <= now
+ 1. Finds all message subscriptions where timeout_at <= now
  2. For each timeout, acquires workflow lock (Lock-First pattern)
- 3. Records EventTimeout to history
- 4. Removes event subscription
- 5. Fails the workflow with EventTimeoutError
+ 3. Records MessageTimeout to history
+ 4. Removes message subscription
+ 5. Fails the workflow with TimeoutError
  """
- # Find all expired event subscriptions
- expired = await self.storage.find_expired_event_subscriptions()
+ # Find all expired message subscriptions
+ expired = await self.storage.find_expired_message_subscriptions()
 
  if not expired:
  return
 
- print(f"[EventTimeoutCheck] Found {len(expired)} expired event subscriptions")
+ logger.debug("Found %d expired message subscriptions", len(expired))
 
  for subscription in expired:
  instance_id = subscription["instance_id"]
- event_type = subscription["event_type"]
+ channel = subscription["channel"]
  timeout_at = subscription["timeout_at"]
  created_at = subscription["created_at"]
 
@@ -689,74 +715,92 @@ class EddaApp:
  # If we can't get the lock, another worker is processing this workflow
  lock_acquired = await self.storage.try_acquire_lock(instance_id, self.worker_id)
  if not lock_acquired:
- print(
- f"[EventTimeoutCheck] Could not acquire lock for workflow {instance_id}, skipping (another worker is processing)"
+ logger.debug(
+ "Could not acquire lock for workflow %s, skipping (another worker is processing)",
+ instance_id,
  )
  continue
 
  try:
- print(
- f"[EventTimeoutCheck] Event '{event_type}' timed out for workflow {instance_id}"
+ logger.debug(
+ "Message on channel '%s' timed out for workflow %s",
+ channel,
+ instance_id,
  )
 
- # Get workflow instance
- instance = await self.storage.get_instance(instance_id)
- if not instance:
- print(f"[EventTimeoutCheck] Workflow {instance_id} not found")
- continue
+ # Note: find_expired_message_subscriptions() JOINs with workflow_instances,
+ # so we know the instance exists. No need for separate get_instance() call.
 
- # Get activity_id from the subscription (stored when wait_event was called)
+ # Get activity_id from the subscription (stored when wait_message was called)
  activity_id = subscription.get("activity_id")
  if not activity_id:
- print(
- f"[EventTimeoutCheck] Warning: No activity_id in subscription for {instance_id}, skipping"
+ logger.warning(
+ "No activity_id in subscription for %s, skipping",
+ instance_id,
  )
  continue
 
- # 1. Record event timeout to history
+ # 1. Record message timeout to history
  # This allows the workflow to see what happened during replay
+ # Convert datetime to ISO string for JSON serialization
+ from datetime import datetime as dt_type
+
+ timeout_at_str = (
+ timeout_at.isoformat() if isinstance(timeout_at, dt_type) else str(timeout_at)
+ )
  try:
  await self.storage.append_history(
  instance_id,
  activity_id=activity_id,
- event_type="EventTimeout",
+ event_type="MessageTimeout",
  event_data={
- "event_type": event_type,
- "timeout_at": timeout_at,
- "error_message": f"Event '{event_type}' did not arrive within timeout",
+ "_error": True,
+ "error_type": "TimeoutError",
+ "error_message": f"Message on channel '{channel}' did not arrive within timeout",
+ "channel": channel,
+ "timeout_at": timeout_at_str,
  },
  )
  except Exception as history_error:
  # If history entry already exists, this timeout was already processed
- print(
- f"[EventTimeoutCheck] History already exists for activity_id {activity_id}: {history_error}"
+ logger.debug(
+ "History already exists for activity_id %s: %s",
+ activity_id,
+ history_error,
  )
- print(
- f"[EventTimeoutCheck] Timeout for '{event_type}' was already processed, skipping"
+ logger.debug(
+ "Timeout for channel '%s' was already processed, skipping",
+ channel,
  )
  continue
 
- # 2. Remove event subscription
- await self.storage.remove_event_subscription(instance_id, event_type)
+ # 2. Remove message subscription
+ await self.storage.remove_message_subscription(instance_id, channel)
 
- # 3. Fail the workflow with EventTimeoutError
- # Create error details similar to workflow failure
+ # 3. Fail the workflow with TimeoutError
  import traceback
 
  # Get timeout_seconds from timeout_at and created_at
- from datetime import datetime
-
- from edda.events import EventTimeoutError
-
+ # Handle both datetime objects and ISO strings
  try:
- timeout_dt = datetime.fromisoformat(timeout_at)
- created_dt = datetime.fromisoformat(created_at)
+ timeout_dt = (
+ timeout_at
+ if isinstance(timeout_at, dt_type)
+ else dt_type.fromisoformat(str(timeout_at))
+ )
+ created_dt = (
+ created_at
+ if isinstance(created_at, dt_type)
+ else dt_type.fromisoformat(str(created_at))
+ )
  # Calculate the original timeout duration (timeout_at - created_at)
  timeout_seconds = int((timeout_dt - created_dt).total_seconds())
  except Exception:
  timeout_seconds = 0 # Fallback
 
- error = EventTimeoutError(event_type, timeout_seconds)
+ error = TimeoutError(
+ f"Message on channel '{channel}' did not arrive within {timeout_seconds} seconds"
+ )
  stack_trace = "".join(
  traceback.format_exception(type(error), error, error.__traceback__)
  )
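The duration arithmetic above, pulled out as a standalone helper for clarity: it accepts either `datetime` objects or ISO-8601 strings (both can come back from the database depending on the driver) and returns the original timeout window in whole seconds:

```python
# Equivalent sketch of the timeout_seconds computation shown above.
from datetime import datetime


def original_timeout_seconds(timeout_at, created_at) -> int:
    def as_dt(value):
        return value if isinstance(value, datetime) else datetime.fromisoformat(str(value))

    try:
        return int((as_dt(timeout_at) - as_dt(created_at)).total_seconds())
    except Exception:
        return 0  # same fallback as the code above
```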
@@ -767,28 +811,26 @@ class EddaApp:
  "failed",
  {
  "error_message": str(error),
- "error_type": "EventTimeoutError",
+ "error_type": "TimeoutError",
  "stack_trace": stack_trace,
  },
  )
 
- print(
- f"[EventTimeoutCheck] ✅ Marked workflow {instance_id} as failed due to event timeout"
+ logger.debug(
+ "Marked workflow %s as failed due to message timeout",
+ instance_id,
  )
 
  except Exception as e:
- print(f"[EventTimeoutCheck] ❌ Error processing timeout for {instance_id}: {e}")
- import traceback
-
- traceback.print_exc()
+ logger.error("Error processing timeout for %s: %s", instance_id, e, exc_info=True)
 
  finally:
  # Always release the lock
  await self.storage.release_lock(instance_id, self.worker_id)
 
- async def _check_expired_event_timeouts_periodically(self, interval: int = 10) -> None:
+ async def _check_expired_message_subscriptions_periodically(self, interval: int = 10) -> None:
  """
- Background task to periodically check for expired event timeouts.
+ Background task to periodically check for expired message subscriptions.
 
  Args:
  interval: Check interval in seconds (default: 10)
@@ -799,12 +841,133 @@ class EddaApp:
  while True:
  try:
  await asyncio.sleep(interval)
- await self._check_expired_event_timeouts()
+ await self._check_expired_message_subscriptions()
  except Exception as e:
- print(f"[EventTimeoutCheck] Error in periodic timeout check: {e}")
- import traceback
+ logger.error("Error in periodic timeout check: %s", e, exc_info=True)
+
+ async def _resume_running_workflows_periodically(self, interval: int = 1) -> None:
+ """
+ Background task to resume workflows that are ready to run.
+
+ This provides fast resumption after message delivery. When deliver_message()
+ sets a workflow's status to 'running' and releases the lock, this task
+ will pick it up within 1 second and resume it.
+
+ Uses adaptive backoff to reduce DB load when no workflows are ready:
+ - When workflows are processed, uses base interval
+ - When no workflows found, exponentially backs off up to 60 seconds
+ - Always adds jitter to prevent thundering herd in multi-pod deployments
+
+ Args:
+ interval: Check interval in seconds (default: 1)
+ """
+ consecutive_empty = 0 # Track empty results for adaptive backoff
+ while True:
+ try:
+ # Adaptive backoff: longer sleep when no work available
+ jitter = random.uniform(0, interval * 0.3)
+ if consecutive_empty > 0:
+ # Exponential backoff: 2s, 4s, 8s, 16s, 32s, max 60s
+ backoff = min(interval * (2 ** min(consecutive_empty, 5)), 60)
+ else:
+ backoff = interval
+ await asyncio.sleep(backoff + jitter)
+
+ count = await self._resume_running_workflows()
+ if count == 0:
+ consecutive_empty += 1
+ else:
+ consecutive_empty = 0
+ except Exception as e:
+ consecutive_empty = 0 # Reset on error
+ logger.error("Error in periodic resume check: %s", e, exc_info=True)
+
+ async def _resume_running_workflows(self) -> int:
+ """
+ Find and resume workflows that are ready to run.
+
+ Finds workflows with status='running' that don't have a lock,
+ acquires a lock, and resumes them.
+
+ Returns:
+ Number of workflows successfully processed (lock acquired and resumed).
+ """
+ resumable = await self.storage.find_resumable_workflows()
+ processed_count = 0
+
+ for workflow_info in resumable:
+ instance_id = workflow_info["instance_id"]
+ workflow_name = workflow_info["workflow_name"]
+
+ try:
+ # Try to acquire lock (Lock-First pattern)
+ lock_acquired = await self.storage.try_acquire_lock(instance_id, self.worker_id)
+ if not lock_acquired:
+ # Another worker got it first, skip
+ continue
+
+ try:
+ # Resume the workflow
+ if self.replay_engine is None:
+ logger.error("ReplayEngine not initialized, skipping %s", instance_id)
+ continue
+ await self.replay_engine.resume_by_name(
+ instance_id, workflow_name, already_locked=True
+ )
+ processed_count += 1
+ finally:
+ # Always release lock
+ await self.storage.release_lock(instance_id, self.worker_id)
+
+ except Exception as e:
+ logger.error("Error resuming %s: %s", instance_id, e, exc_info=True)
+
+ return processed_count
+
+ async def _cleanup_old_messages_periodically(
+ self, interval: int = 3600, retention_days: int = 7
+ ) -> None:
+ """
+ Background task to periodically cleanup old channel messages.
+
+ Messages older than `retention_days` are deleted to prevent the database
+ from growing indefinitely with orphaned messages (messages that were
+ published but never received by any subscriber).
 
- traceback.print_exc()
+ Uses system-level locking to ensure only one pod executes cleanup at a time.
+
+ Args:
+ interval: Cleanup interval in seconds (default: 3600 = 1 hour)
+ retention_days: Number of days to retain messages (default: 7)
+
+ Note:
+ This runs indefinitely until the application is shut down.
+ """
+ while True:
+ try:
+ # Add jitter to prevent thundering herd in multi-pod deployments
+ jitter = random.uniform(0, interval * 0.3)
+ await asyncio.sleep(interval + jitter)
+
+ # Try to acquire global lock for this task
+ lock_acquired = await self.storage.try_acquire_system_lock(
+ lock_name="cleanup_old_messages",
+ worker_id=self.worker_id,
+ timeout_seconds=interval,
+ )
+
+ if not lock_acquired:
+ # Another pod is handling this task
+ continue
+
+ try:
+ deleted_count = await self.storage.cleanup_old_channel_messages(retention_days)
+ if deleted_count > 0:
+ logger.info("Cleaned up %d old channel messages", deleted_count)
+ finally:
+ await self.storage.release_system_lock("cleanup_old_messages", self.worker_id)
+ except Exception as e:
+ logger.error("Error cleaning up old messages: %s", e, exc_info=True)
 
  # -------------------------------------------------------------------------
  # ASGI Interface
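For the default `interval=1`, the adaptive backoff in `_resume_running_workflows_periodically` yields the schedule below when the resume queue stays empty (each sleep also gets up to 0.3 s of jitter); a quick check of the arithmetic:

```python
# Worked example of the adaptive backoff: after n consecutive empty polls the loop
# sleeps min(interval * 2**min(n, 5), 60) seconds, so with interval=1 the exponent
# caps at 5 and the sleep plateaus at 32 s; the 60 s ceiling only matters for
# larger base intervals.
interval = 1
for n in range(1, 8):
    backoff = min(interval * (2 ** min(n, 5)), 60)
    print(n, backoff)  # 1→2, 2→4, 3→8, 4→16, 5→32, 6→32, 7→32
```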
@@ -987,10 +1150,7 @@ class EddaApp:
 
  except Exception as e:
  # Internal error - log detailed traceback
- print(f"[Cancel] Error cancelling workflow {instance_id}: {e}")
- import traceback
-
- traceback.print_exc()
+ logger.error("Error cancelling workflow %s: %s", instance_id, e, exc_info=True)
 
  status = 500
  response_body = {"error": str(e), "type": type(e).__name__}