edda-framework 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edda/__init__.py +39 -5
- edda/app.py +383 -223
- edda/channels.py +992 -0
- edda/compensation.py +22 -22
- edda/context.py +77 -51
- edda/integrations/opentelemetry/hooks.py +7 -2
- edda/locking.py +130 -67
- edda/replay.py +312 -82
- edda/storage/models.py +165 -24
- edda/storage/protocol.py +575 -122
- edda/storage/sqlalchemy_storage.py +2073 -319
- edda/viewer_ui/app.py +558 -127
- edda/viewer_ui/components.py +81 -68
- edda/viewer_ui/data_service.py +61 -25
- edda/viewer_ui/theme.py +200 -0
- edda/workflow.py +43 -0
- {edda_framework-0.6.0.dist-info → edda_framework-0.8.0.dist-info}/METADATA +167 -9
- {edda_framework-0.6.0.dist-info → edda_framework-0.8.0.dist-info}/RECORD +21 -20
- {edda_framework-0.6.0.dist-info → edda_framework-0.8.0.dist-info}/WHEEL +1 -1
- edda/events.py +0 -505
- {edda_framework-0.6.0.dist-info → edda_framework-0.8.0.dist-info}/entry_points.txt +0 -0
- {edda_framework-0.6.0.dist-info → edda_framework-0.8.0.dist-info}/licenses/LICENSE +0 -0
edda/app.py
CHANGED
@@ -7,6 +7,8 @@ application for handling CloudEvents and executing workflows.
 
 import asyncio
 import json
+import logging
+import random
 import sys
 from collections.abc import Callable
 from typing import Any
@@ -24,6 +26,8 @@ from edda.replay import ReplayEngine
 from edda.retry import RetryPolicy
 from edda.storage.sqlalchemy_storage import SQLAlchemyStorage
 
+logger = logging.getLogger(__name__)
+
 
 class EddaApp:
     """
@@ -41,9 +45,16 @@ class EddaApp:
         service_name: str,
         db_url: str,
         outbox_enabled: bool = False,
-        broker_url: str =
+        broker_url: str | None = None,
         hooks: WorkflowHooks | None = None,
         default_retry_policy: "RetryPolicy | None" = None,
+        message_retention_days: int = 7,
+        # Connection pool settings (ignored for SQLite)
+        pool_size: int = 5,
+        max_overflow: int = 10,
+        pool_timeout: int = 30,
+        pool_recycle: int = 3600,
+        pool_pre_ping: bool = True,
     ):
         """
         Initialize Edda application.
@@ -52,18 +63,41 @@ class EddaApp:
             service_name: Service name for distributed execution (e.g., "order-service")
             db_url: Database URL (e.g., "sqlite:///workflow.db")
             outbox_enabled: Enable transactional outbox pattern
-            broker_url:
+            broker_url: Broker URL for outbox publishing. Required if outbox_enabled=True.
             hooks: Optional WorkflowHooks implementation for observability
             default_retry_policy: Default retry policy for all activities.
                 If None, uses DEFAULT_RETRY_POLICY (5 attempts, exponential backoff).
                 Can be overridden per-activity using @activity(retry_policy=...).
+            message_retention_days: Number of days to retain channel messages before automatic cleanup.
+                Defaults to 7 days. Messages older than this will be deleted
+                by a background task running every hour.
+            pool_size: Number of connections to keep open in the pool (default: 5).
+                Ignored for SQLite. For production, consider 20+.
+            max_overflow: Maximum number of connections to create above pool_size (default: 10).
+                Ignored for SQLite. For production, consider 40+.
+            pool_timeout: Seconds to wait for a connection from the pool (default: 30).
+                Ignored for SQLite.
+            pool_recycle: Seconds before a connection is recycled (default: 3600).
+                Helps prevent stale connections. Ignored for SQLite.
+            pool_pre_ping: If True, test connections before use (default: True).
+                Helps detect disconnected connections. Ignored for SQLite.
         """
         self.db_url = db_url
         self.service_name = service_name
         self.outbox_enabled = outbox_enabled
         self.broker_url = broker_url
+        if self.outbox_enabled and not self.broker_url:
+            raise ValueError("broker_url is required when outbox_enabled=True")
         self.hooks = hooks
         self.default_retry_policy = default_retry_policy
+        self._message_retention_days = message_retention_days
+
+        # Connection pool settings
+        self._pool_size = pool_size
+        self._max_overflow = max_overflow
+        self._pool_timeout = pool_timeout
+        self._pool_recycle = pool_recycle
+        self._pool_pre_ping = pool_pre_ping
 
         # Generate unique worker ID for this process
         self.worker_id = generate_worker_id(service_name)
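The new constructor surface added in 0.8.0 can be exercised roughly as follows. This is a minimal sketch: the database URL, broker URL, and tuning values are placeholders for illustration, not recommendations shipped with the package; only the parameter names and defaults come from the diff above.

    from edda.app import EddaApp

    app = EddaApp(
        service_name="order-service",
        db_url="postgresql+asyncpg://user:pass@db-host/orders",  # placeholder URL
        outbox_enabled=True,
        broker_url="amqp://guest:guest@rabbitmq/",  # required whenever outbox_enabled=True
        message_retention_days=14,  # channel messages older than this are cleaned up hourly
        pool_size=20,               # pool settings are ignored for SQLite URLs
        max_overflow=40,
    )

Passing outbox_enabled=True without a broker_url now raises ValueError at construction time rather than failing later in the outbox relayer.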
@@ -100,18 +134,35 @@ class EddaApp:
         Returns:
             SQLAlchemyStorage instance
         """
+        # Check if using SQLite (connection pool settings not applicable)
+        is_sqlite = db_url.startswith("sqlite")
+
         # Convert plain sqlite:// URLs to use aiosqlite driver
         if db_url.startswith("sqlite:///"):
             db_url = db_url.replace("sqlite:///", "sqlite+aiosqlite:///", 1)
         elif db_url == "sqlite:///:memory:" or db_url.startswith("sqlite:///:memory:"):
             db_url = "sqlite+aiosqlite:///:memory:"
 
+        # Build engine kwargs
+        engine_kwargs: dict[str, Any] = {
+            "echo": False,  # Set to True for SQL logging
+            "future": True,
+        }
+
+        # Add connection pool settings for non-SQLite databases
+        if not is_sqlite:
+            engine_kwargs.update(
+                {
+                    "pool_size": self._pool_size,
+                    "max_overflow": self._max_overflow,
+                    "pool_timeout": self._pool_timeout,
+                    "pool_recycle": self._pool_recycle,
+                    "pool_pre_ping": self._pool_pre_ping,
+                }
+            )
+
         # Create async engine
-        engine = create_async_engine(
-            db_url,
-            echo=False,  # Set to True for SQL logging
-            future=True,
-        )
+        engine = create_async_engine(db_url, **engine_kwargs)
 
         return SQLAlchemyStorage(engine)
 
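For a non-SQLite URL, the engine_kwargs branch above is effectively the same as calling SQLAlchemy's create_async_engine with the pool arguments spelled out. A rough expansion with the defaults from the diff (the PostgreSQL URL is a placeholder):

    from sqlalchemy.ext.asyncio import create_async_engine

    engine = create_async_engine(
        "postgresql+asyncpg://user:pass@db-host/orders",  # placeholder URL
        echo=False,
        future=True,
        pool_size=5,
        max_overflow=10,
        pool_timeout=30,
        pool_recycle=3600,
        pool_pre_ping=True,
    )

For SQLite URLs only echo and future are passed, matching the comment in the diff that pool settings are not applicable there.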
@@ -148,6 +199,7 @@ class EddaApp:
 
         # Initialize outbox relayer if enabled
         if self.outbox_enabled:
+            assert self.broker_url is not None  # Validated in __init__
             self.outbox_relayer = OutboxRelayer(
                 storage=self.storage,
                 broker_url=self.broker_url,
@@ -194,6 +246,7 @@ class EddaApp:
             auto_resume_stale_workflows_periodically(
                 self.storage,
                 self.replay_engine,
+                self.worker_id,
                 interval=60,  # Check every 60 seconds
             )
         )
@@ -205,11 +258,29 @@ class EddaApp:
         )
         self._background_tasks.append(timer_check_task)
 
-        # Task to check expired
-
-
+        # Task to check expired message subscriptions and fail workflows
+        # Note: CloudEvents timeouts are also handled here since wait_event() uses wait_message()
+        message_timeout_task = asyncio.create_task(
+            self._check_expired_message_subscriptions_periodically(
+                interval=10
+            )  # Check every 10 seconds
+        )
+        self._background_tasks.append(message_timeout_task)
+
+        # Task to resume workflows after message delivery (fast resumption)
+        message_resume_task = asyncio.create_task(
+            self._resume_running_workflows_periodically(interval=1)  # Check every 1 second
         )
-        self._background_tasks.append(
+        self._background_tasks.append(message_resume_task)
+
+        # Task to cleanup old channel messages (orphaned messages)
+        message_cleanup_task = asyncio.create_task(
+            self._cleanup_old_messages_periodically(
+                interval=3600,  # Check every 1 hour
+                retention_days=self._message_retention_days,
+            )
+        )
+        self._background_tasks.append(message_cleanup_task)
 
     def _auto_register_workflows(self) -> None:
         """
@@ -354,11 +425,7 @@ class EddaApp:
         try:
             await handler(event)
         except Exception as e:
-
-            print(f"Error handling event {event_type}: {e}")
-            import traceback
-
-            traceback.print_exc()
+            logger.error("Error handling event %s: %s", event_type, e, exc_info=True)
 
     async def _deliver_event_to_waiting_workflows_safe(self, event: Any) -> None:
         """
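Because error reporting now goes through the module-level logger instead of print() and traceback.print_exc(), host applications need a logging configuration to see these messages. A minimal sketch using only the standard library (the format string and levels are arbitrary choices, not mandated by the package):

    import logging

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)s %(levelname)s %(message)s",
    )
    # DEBUG additionally shows the new delivery, timer, and resume diagnostics.
    logging.getLogger("edda").setLevel(logging.DEBUG)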
@@ -370,35 +437,46 @@ class EddaApp:
         try:
             await self._deliver_event_to_waiting_workflows(event)
         except Exception as e:
-
-            import traceback
-
-            traceback.print_exc()
+            logger.error("Error delivering event to waiting workflows: %s", e, exc_info=True)
 
     async def _deliver_event_to_waiting_workflows(self, event: Any) -> None:
         """
-        Deliver
+        Deliver CloudEvent to workflows waiting for this event type.
 
-        This method:
-
-
-
-
+        This method supports two delivery patterns based on the 'eddainstanceid' extension:
+
+        1. **Point-to-Point** (when 'eddainstanceid' is present):
+           Delivers to a specific workflow instance only.
+
+        2. **Pub/Sub** (when 'eddainstanceid' is absent):
+           Delivers to ALL workflows waiting for this event type.
+
+        Both patterns use the Channel-based Message Queue system for delivery:
+        - Lock acquisition (Lock-First pattern)
+        - History recording (ChannelMessageReceived)
+        - Subscription cursor update (broadcast) or message deletion (competing)
+        - Status update to 'running'
+        - Lock release
+
+        Workflow resumption is handled by background task (_resume_running_workflows_periodically).
 
         Args:
             event: CloudEvent instance
         """
+        from edda.channels import publish
+
         event_type = event["type"]
         event_data = event.get_data()
 
-        # Extract CloudEvents metadata
-
-
-            "
-            "
-            "
-            "
-            "
+        # Extract CloudEvents metadata with ce_ prefix
+        # This allows ReceivedEvent to reconstruct CloudEvents attributes
+        metadata = {
+            "ce_type": event["type"],
+            "ce_source": event["source"],
+            "ce_id": event["id"],
+            "ce_time": event.get("time"),
+            "ce_datacontenttype": event.get("datacontenttype"),
+            "ce_subject": event.get("subject"),
         }
 
         # Extract extension attributes (any attributes not in the standard set)
@@ -414,112 +492,68 @@ class EddaApp:
             "data_base64",
         }
         extensions = {k: v for k, v in event.get_attributes().items() if k not in standard_attrs}
-
-
-
-
-
-
-
-
-
-
-
-
-            instance_id = subscription["instance_id"]
-
-            # Get workflow instance
-            instance = await self.storage.get_instance(instance_id)
-            if not instance:
-                print(f"[EventDelivery] Warning: Instance {instance_id} not found, skipping")
-                continue
-
-            # Check if instance is still waiting
-            if instance.get("status") != "waiting_for_event":
-                print(
-                    f"[EventDelivery] Warning: Instance {instance_id} "
-                    f"status is '{instance.get('status')}', expected 'waiting_for_event', skipping"
-                )
-                continue
-
-            # Get activity_id from the subscription (stored when wait_event was called)
-            activity_id = subscription.get("activity_id")
-            if not activity_id:
-                print(
-                    f"[EventDelivery] Warning: No activity_id in subscription for {instance_id}, skipping"
-                )
-                continue
-
-            workflow_name = instance["workflow_name"]
-
-            # Distributed Coroutines: Acquire lock FIRST to prevent race conditions
-            # This ensures only ONE pod processes this event, even if multiple pods
-            # receive the event simultaneously
-            lock_acquired = await self.storage.try_acquire_lock(
-                instance_id, self.worker_id, timeout_seconds=300
+        if extensions:
+            metadata["ce_extensions"] = extensions
+
+        # Check for eddainstanceid extension attribute for Point-to-Point delivery
+        target_instance_id = extensions.get("eddainstanceid")
+
+        if target_instance_id:
+            # Point-to-Point: Deliver to specific instance only
+            logger.debug(
+                "Point-to-Point: Delivering '%s' to instance %s",
+                event_type,
+                target_instance_id,
             )
 
-            if not lock_acquired:
-                print(
-                    f"[EventDelivery] Another worker is processing {instance_id}, skipping "
-                    "(distributed coroutine - lock already held)"
-                )
-                continue
-
             try:
-
-
+                await publish(
+                    self.storage,
+                    channel=event_type,
+                    data=event_data,
+                    metadata=metadata,
+                    target_instance_id=target_instance_id,
+                    worker_id=self.worker_id,
+                )
+                logger.debug(
+                    "Published '%s' to channel (target: %s)",
+                    event_type,
+                    target_instance_id,
                 )
 
-
-
-
-
-
-
-
-                        "payload": event_data,
-                        "metadata": event_metadata,
-                        "extensions": extensions,
-                    },
-                )
-            except Exception as history_error:
-                # If history entry already exists (UNIQUE constraint), this event was already
-                # delivered by another worker in a multi-process environment.
-                # Skip workflow resumption to prevent duplicate processing.
-                print(
-                    f"[EventDelivery] History already exists for activity_id {activity_id}: {history_error}"
-                )
-                print(
-                    f"[EventDelivery] Event '{event_type}' was already delivered by another worker, skipping"
-                )
-                continue
-
-            # 2. Remove event subscription
-            await self.storage.remove_event_subscription(instance_id, event_type)
+            except Exception as e:
+                logger.error(
+                    "Error delivering to workflow %s: %s",
+                    target_instance_id,
+                    e,
+                    exc_info=True,
+                )
 
-
-
-
-            continue
+        else:
+            # Pub/Sub: Deliver to ALL waiting instances
+            logger.debug("Pub/Sub: Publishing '%s' to channel", event_type)
 
-
-
+            try:
+                message_id = await publish(
+                    self.storage,
+                    channel=event_type,
+                    data=event_data,
+                    metadata=metadata,
+                    worker_id=self.worker_id,
                 )
-
-
-
+                logger.debug(
+                    "Published '%s' to channel (message_id: %s)",
+                    event_type,
+                    message_id,
                 )
 
             except Exception as e:
-
-
-
-
-
-
-                # Always release the lock, even if an error occurred
-                await self.storage.release_lock(instance_id, self.worker_id)
+                logger.error(
+                    "Error publishing to channel '%s': %s",
+                    event_type,
+                    e,
+                    exc_info=True,
+                )
 
     async def _check_expired_timers(self) -> None:
         """
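To hit the Point-to-Point path described in the docstring above, a producer sets the 'eddainstanceid' extension attribute on the CloudEvent; omitting it falls back to Pub/Sub delivery. A sketch using the cloudevents SDK (the event type, source, instance id, and payload are placeholders; how the event is POSTed to the service is not shown in this diff):

    from cloudevents.http import CloudEvent, to_structured

    attributes = {
        "type": "order.shipped",        # placeholder event type
        "source": "shipping-service",   # placeholder source
        "eddainstanceid": "wf-1234",    # placeholder workflow instance id; omit for Pub/Sub
    }
    event = CloudEvent(attributes, {"order_id": "A-1"})

    # to_structured() yields the HTTP headers and body to send to the EddaApp endpoint.
    headers, body = to_structured(event)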
@@ -542,7 +576,7 @@ class EddaApp:
         if not expired_timers:
             return  # No expired timers
 
-
+        logger.debug("Found %d expired timer(s)", len(expired_timers))
 
         for timer in expired_timers:
             instance_id = timer["instance_id"]
@@ -551,22 +585,12 @@ class EddaApp:
             activity_id = timer.get("activity_id")
 
             if not activity_id:
-
-                continue
-
-            # Get workflow instance
-            instance = await self.storage.get_instance(instance_id)
-            if not instance:
-                print(f"[TimerCheck] Warning: Instance {instance_id} not found, skipping")
+                logger.warning("No activity_id in timer for %s, skipping", instance_id)
                 continue
 
-            #
-
-
-                    f"[TimerCheck] Warning: Instance {instance_id} "
-                    f"status is '{instance.get('status')}', expected 'waiting_for_timer', skipping"
-                )
-                continue
+            # Note: find_expired_timers() already filters by status='waiting_for_timer'
+            # and JOINs with workflow_instances, so no need for additional get_instance() call.
+            # The lock mechanism below handles race conditions.
 
             # Distributed Coroutines: Acquire lock FIRST to prevent race conditions
             # This ensures only ONE pod processes this timer, even if multiple pods
@@ -576,15 +600,18 @@ class EddaApp:
             )
 
             if not lock_acquired:
-
-
-
+                logger.debug(
+                    "Another worker is processing %s, skipping (lock already held)",
+                    instance_id,
                 )
                 continue
 
             try:
-
-
+                logger.debug(
+                    "Timer '%s' expired for workflow %s (activity_id: %s)",
+                    timer_id,
+                    instance_id,
+                    activity_id,
                 )
 
                 # 1. Record timer expiration to history (allows deterministic replay)
@@ -604,11 +631,14 @@ class EddaApp:
                     # If history entry already exists (UNIQUE constraint), this timer was already
                     # processed by another worker in a multi-process environment.
                     # Skip workflow resumption to prevent duplicate processing.
-
-
+                    logger.debug(
+                        "History already exists for activity_id %s: %s",
+                        activity_id,
+                        history_error,
                     )
-
-
+                    logger.debug(
+                        "Timer '%s' was already processed by another worker, skipping",
+                        timer_id,
                     )
                     continue
 
@@ -617,22 +647,21 @@ class EddaApp:
 
                 # 3. Resume workflow (lock already held by this worker - distributed coroutine pattern)
                 if self.replay_engine is None:
-
+                    logger.error("Replay engine not initialized")
                     continue
 
                 await self.replay_engine.resume_by_name(
                     instance_id, workflow_name, already_locked=True
                 )
 
-
-
+                logger.debug(
+                    "Resumed workflow %s after timer '%s' expired",
+                    instance_id,
+                    timer_id,
                 )
 
             except Exception as e:
-
-                import traceback
-
-                traceback.print_exc()
+                logger.error("Error resuming workflow %s: %s", instance_id, e, exc_info=True)
 
             finally:
                 # Always release the lock, even if an error occurred
@@ -655,33 +684,30 @@ class EddaApp:
                 await asyncio.sleep(interval)
                 await self._check_expired_timers()
             except Exception as e:
-
-                import traceback
-
-                traceback.print_exc()
+                logger.error("Error in periodic timer check: %s", e, exc_info=True)
 
-    async def
+    async def _check_expired_message_subscriptions(self) -> None:
         """
-        Check for
+        Check for message subscriptions that have timed out and fail those workflows.
 
         This method:
-        1. Finds all
+        1. Finds all message subscriptions where timeout_at <= now
         2. For each timeout, acquires workflow lock (Lock-First pattern)
-        3. Records
-        4. Removes
-        5. Fails the workflow with
+        3. Records MessageTimeout to history
+        4. Removes message subscription
+        5. Fails the workflow with TimeoutError
         """
-        # Find all expired
-        expired = await self.storage.
+        # Find all expired message subscriptions
+        expired = await self.storage.find_expired_message_subscriptions()
 
         if not expired:
             return
 
-
+        logger.debug("Found %d expired message subscriptions", len(expired))
 
         for subscription in expired:
             instance_id = subscription["instance_id"]
-
+            channel = subscription["channel"]
             timeout_at = subscription["timeout_at"]
             created_at = subscription["created_at"]
 
@@ -689,74 +715,92 @@ class EddaApp:
             # If we can't get the lock, another worker is processing this workflow
             lock_acquired = await self.storage.try_acquire_lock(instance_id, self.worker_id)
             if not lock_acquired:
-
-
+                logger.debug(
+                    "Could not acquire lock for workflow %s, skipping (another worker is processing)",
+                    instance_id,
                 )
                 continue
 
             try:
-
-
+                logger.debug(
+                    "Message on channel '%s' timed out for workflow %s",
+                    channel,
+                    instance_id,
                 )
 
-                #
-                instance
-                if not instance:
-                    print(f"[EventTimeoutCheck] Workflow {instance_id} not found")
-                    continue
+                # Note: find_expired_message_subscriptions() JOINs with workflow_instances,
+                # so we know the instance exists. No need for separate get_instance() call.
 
-                # Get activity_id from the subscription (stored when
+                # Get activity_id from the subscription (stored when wait_message was called)
                 activity_id = subscription.get("activity_id")
                 if not activity_id:
-
-
+                    logger.warning(
+                        "No activity_id in subscription for %s, skipping",
+                        instance_id,
                     )
                     continue
 
-                # 1. Record
+                # 1. Record message timeout to history
                 # This allows the workflow to see what happened during replay
+                # Convert datetime to ISO string for JSON serialization
+                from datetime import datetime as dt_type
+
+                timeout_at_str = (
+                    timeout_at.isoformat() if isinstance(timeout_at, dt_type) else str(timeout_at)
+                )
                 try:
                     await self.storage.append_history(
                         instance_id,
                         activity_id=activity_id,
-                        event_type="
+                        event_type="MessageTimeout",
                         event_data={
-                            "
-                            "
-                            "error_message": f"
+                            "_error": True,
+                            "error_type": "TimeoutError",
+                            "error_message": f"Message on channel '{channel}' did not arrive within timeout",
+                            "channel": channel,
+                            "timeout_at": timeout_at_str,
                         },
                     )
                 except Exception as history_error:
                     # If history entry already exists, this timeout was already processed
-
-
+                    logger.debug(
+                        "History already exists for activity_id %s: %s",
+                        activity_id,
+                        history_error,
                     )
-
-
+                    logger.debug(
+                        "Timeout for channel '%s' was already processed, skipping",
+                        channel,
                     )
                     continue
 
-                # 2. Remove
-                await self.storage.
+                # 2. Remove message subscription
+                await self.storage.remove_message_subscription(instance_id, channel)
 
-                # 3. Fail the workflow with
-                # Create error details similar to workflow failure
+                # 3. Fail the workflow with TimeoutError
                 import traceback
 
                 # Get timeout_seconds from timeout_at and created_at
-
-
-                from edda.events import EventTimeoutError
-
+                # Handle both datetime objects and ISO strings
                 try:
-                    timeout_dt =
-
+                    timeout_dt = (
+                        timeout_at
+                        if isinstance(timeout_at, dt_type)
+                        else dt_type.fromisoformat(str(timeout_at))
+                    )
+                    created_dt = (
+                        created_at
+                        if isinstance(created_at, dt_type)
+                        else dt_type.fromisoformat(str(created_at))
+                    )
                     # Calculate the original timeout duration (timeout_at - created_at)
                     timeout_seconds = int((timeout_dt - created_dt).total_seconds())
                 except Exception:
                     timeout_seconds = 0  # Fallback
 
-                error =
+                error = TimeoutError(
+                    f"Message on channel '{channel}' did not arrive within {timeout_seconds} seconds"
+                )
                 stack_trace = "".join(
                     traceback.format_exception(type(error), error, error.__traceback__)
                 )
@@ -767,28 +811,26 @@ class EddaApp:
                     "failed",
                     {
                         "error_message": str(error),
-                        "error_type": "
+                        "error_type": "TimeoutError",
                         "stack_trace": stack_trace,
                     },
                 )
 
-
-
+                logger.debug(
+                    "Marked workflow %s as failed due to message timeout",
+                    instance_id,
                 )
 
             except Exception as e:
-
-                import traceback
-
-                traceback.print_exc()
+                logger.error("Error processing timeout for %s: %s", instance_id, e, exc_info=True)
 
             finally:
                 # Always release the lock
                 await self.storage.release_lock(instance_id, self.worker_id)
 
-    async def
+    async def _check_expired_message_subscriptions_periodically(self, interval: int = 10) -> None:
         """
-        Background task to periodically check for expired
+        Background task to periodically check for expired message subscriptions.
 
         Args:
             interval: Check interval in seconds (default: 10)
@@ -799,12 +841,133 @@ class EddaApp:
         while True:
             try:
                 await asyncio.sleep(interval)
-                await self.
+                await self._check_expired_message_subscriptions()
             except Exception as e:
-
-
+                logger.error("Error in periodic timeout check: %s", e, exc_info=True)
+
+    async def _resume_running_workflows_periodically(self, interval: int = 1) -> None:
+        """
+        Background task to resume workflows that are ready to run.
+
+        This provides fast resumption after message delivery. When deliver_message()
+        sets a workflow's status to 'running' and releases the lock, this task
+        will pick it up within 1 second and resume it.
+
+        Uses adaptive backoff to reduce DB load when no workflows are ready:
+        - When workflows are processed, uses base interval
+        - When no workflows found, exponentially backs off up to 60 seconds
+        - Always adds jitter to prevent thundering herd in multi-pod deployments
+
+        Args:
+            interval: Check interval in seconds (default: 1)
+        """
+        consecutive_empty = 0  # Track empty results for adaptive backoff
+        while True:
+            try:
+                # Adaptive backoff: longer sleep when no work available
+                jitter = random.uniform(0, interval * 0.3)
+                if consecutive_empty > 0:
+                    # Exponential backoff: 2s, 4s, 8s, 16s, 32s, max 60s
+                    backoff = min(interval * (2 ** min(consecutive_empty, 5)), 60)
+                else:
+                    backoff = interval
+                await asyncio.sleep(backoff + jitter)
+
+                count = await self._resume_running_workflows()
+                if count == 0:
+                    consecutive_empty += 1
+                else:
+                    consecutive_empty = 0
+            except Exception as e:
+                consecutive_empty = 0  # Reset on error
+                logger.error("Error in periodic resume check: %s", e, exc_info=True)
+
+    async def _resume_running_workflows(self) -> int:
+        """
+        Find and resume workflows that are ready to run.
+
+        Finds workflows with status='running' that don't have a lock,
+        acquires a lock, and resumes them.
+
+        Returns:
+            Number of workflows successfully processed (lock acquired and resumed).
+        """
+        resumable = await self.storage.find_resumable_workflows()
+        processed_count = 0
+
+        for workflow_info in resumable:
+            instance_id = workflow_info["instance_id"]
+            workflow_name = workflow_info["workflow_name"]
+
+            try:
+                # Try to acquire lock (Lock-First pattern)
+                lock_acquired = await self.storage.try_acquire_lock(instance_id, self.worker_id)
+                if not lock_acquired:
+                    # Another worker got it first, skip
+                    continue
+
+                try:
+                    # Resume the workflow
+                    if self.replay_engine is None:
+                        logger.error("ReplayEngine not initialized, skipping %s", instance_id)
+                        continue
+                    await self.replay_engine.resume_by_name(
+                        instance_id, workflow_name, already_locked=True
+                    )
+                    processed_count += 1
+                finally:
+                    # Always release lock
+                    await self.storage.release_lock(instance_id, self.worker_id)
+
+            except Exception as e:
+                logger.error("Error resuming %s: %s", instance_id, e, exc_info=True)
+
+        return processed_count
+
+    async def _cleanup_old_messages_periodically(
+        self, interval: int = 3600, retention_days: int = 7
+    ) -> None:
+        """
+        Background task to periodically cleanup old channel messages.
+
+        Messages older than `retention_days` are deleted to prevent the database
+        from growing indefinitely with orphaned messages (messages that were
+        published but never received by any subscriber).
 
-
+        Uses system-level locking to ensure only one pod executes cleanup at a time.
+
+        Args:
+            interval: Cleanup interval in seconds (default: 3600 = 1 hour)
+            retention_days: Number of days to retain messages (default: 7)
+
+        Note:
+            This runs indefinitely until the application is shut down.
+        """
+        while True:
+            try:
+                # Add jitter to prevent thundering herd in multi-pod deployments
+                jitter = random.uniform(0, interval * 0.3)
+                await asyncio.sleep(interval + jitter)
+
+                # Try to acquire global lock for this task
+                lock_acquired = await self.storage.try_acquire_system_lock(
+                    lock_name="cleanup_old_messages",
+                    worker_id=self.worker_id,
+                    timeout_seconds=interval,
+                )
+
+                if not lock_acquired:
+                    # Another pod is handling this task
+                    continue
+
+                try:
+                    deleted_count = await self.storage.cleanup_old_channel_messages(retention_days)
+                    if deleted_count > 0:
+                        logger.info("Cleaned up %d old channel messages", deleted_count)
+                finally:
+                    await self.storage.release_system_lock("cleanup_old_messages", self.worker_id)
+            except Exception as e:
+                logger.error("Error cleaning up old messages: %s", e, exc_info=True)
 
     # -------------------------------------------------------------------------
     # ASGI Interface
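The adaptive backoff used by _resume_running_workflows_periodically can be sanity-checked in isolation. This standalone sketch reproduces the arithmetic from the diff with interval=1 to show how the sleep grows as empty polls accumulate (it is an illustration, not part of the package):

    import random

    def next_sleep(consecutive_empty: int, interval: float = 1.0) -> float:
        # Same formula as the diff: exponential backoff capped at 60s, plus up to 30% jitter.
        jitter = random.uniform(0, interval * 0.3)
        if consecutive_empty > 0:
            backoff = min(interval * (2 ** min(consecutive_empty, 5)), 60)
        else:
            backoff = interval
        return backoff + jitter

    # Empty polls 0..6 give base sleeps of 1, 2, 4, 8, 16, 32, 32 seconds (before jitter).
    print([round(next_sleep(n), 2) for n in range(7)])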
@@ -987,10 +1150,7 @@ class EddaApp:
 
         except Exception as e:
             # Internal error - log detailed traceback
-
-            import traceback
-
-            traceback.print_exc()
+            logger.error("Error cancelling workflow %s: %s", instance_id, e, exc_info=True)
 
             status = 500
             response_body = {"error": str(e), "type": type(e).__name__}