edda-framework 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edda/app.py +428 -56
- edda/context.py +8 -0
- edda/outbox/relayer.py +21 -2
- edda/storage/__init__.py +8 -0
- edda/storage/notify_base.py +162 -0
- edda/storage/pg_notify.py +325 -0
- edda/storage/protocol.py +9 -1
- edda/storage/sqlalchemy_storage.py +193 -13
- edda/viewer_ui/app.py +26 -0
- edda/viewer_ui/data_service.py +4 -0
- {edda_framework-0.9.0.dist-info → edda_framework-0.10.0.dist-info}/METADATA +13 -1
- {edda_framework-0.9.0.dist-info → edda_framework-0.10.0.dist-info}/RECORD +15 -13
- {edda_framework-0.9.0.dist-info → edda_framework-0.10.0.dist-info}/WHEEL +0 -0
- {edda_framework-0.9.0.dist-info → edda_framework-0.10.0.dist-info}/entry_points.txt +0 -0
- {edda_framework-0.9.0.dist-info → edda_framework-0.10.0.dist-info}/licenses/LICENSE +0 -0
edda/app.py
CHANGED
@@ -8,10 +8,13 @@ application for handling CloudEvents and executing workflows.
 import asyncio
 import json
 import logging
+import math
 import random
 import sys
+import time
 from collections.abc import Callable
-from
+from datetime import datetime
+from typing import Any, Literal
 
 import uvloop
 from cloudevents.exceptions import GenericException as CloudEventsException
@@ -55,6 +58,11 @@ class EddaApp:
         pool_timeout: int = 30,
         pool_recycle: int = 3600,
         pool_pre_ping: bool = True,
+        # PostgreSQL LISTEN/NOTIFY settings
+        use_listen_notify: bool | None = None,
+        notify_fallback_interval: int = 30,
+        # Batch processing settings
+        max_workflows_per_batch: int | Literal["auto", "auto:cpu"] = 10,
     ):
         """
         Initialize Edda application.
@@ -81,6 +89,17 @@ class EddaApp:
                 Helps prevent stale connections. Ignored for SQLite.
             pool_pre_ping: If True, test connections before use (default: True).
                 Helps detect disconnected connections. Ignored for SQLite.
+            use_listen_notify: Enable PostgreSQL LISTEN/NOTIFY for instant notifications.
+                None (default) = auto-detect (enabled for PostgreSQL, disabled for others).
+                True = force enable (raises error if not PostgreSQL).
+                False = force disable (use polling only).
+            notify_fallback_interval: Polling interval in seconds when NOTIFY is enabled.
+                Used as backup for missed notifications. Default: 30 seconds.
+                SQLite/MySQL always use their default polling intervals.
+            max_workflows_per_batch: Maximum workflows to process per resume cycle.
+                - int: Fixed batch size (default: 10)
+                - "auto": Scale 10-100 based on queue depth
+                - "auto:cpu": Scale 10-100 based on CPU utilization (requires psutil)
         """
         self.db_url = db_url
         self.service_name = service_name
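For orientation, a minimal sketch of how the new constructor options could be combined. The db_url, service_name, use_listen_notify, notify_fallback_interval, and max_workflows_per_batch parameters appear in this diff; the import path and the concrete values are illustrative assumptions only.

from edda.app import EddaApp  # import path assumed from the file layout in this diff

# Illustrative only: force-enable LISTEN/NOTIFY (requires a PostgreSQL URL,
# otherwise use_listen_notify=True raises ValueError) and let the batch size
# scale with queue depth.
app = EddaApp(
    db_url="postgresql+asyncpg://edda:secret@localhost/edda",  # hypothetical DSN
    service_name="order-service",                              # hypothetical name
    use_listen_notify=True,          # None = auto-detect, False = polling only
    notify_fallback_interval=30,     # backup polling period in seconds
    max_workflows_per_batch="auto",  # or an int, or "auto:cpu" (needs psutil)
)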
@@ -99,6 +118,12 @@ class EddaApp:
         self._pool_recycle = pool_recycle
         self._pool_pre_ping = pool_pre_ping
 
+        # PostgreSQL LISTEN/NOTIFY settings
+        self._use_listen_notify = use_listen_notify
+        self._notify_fallback_interval = notify_fallback_interval
+        self._notify_listener: Any = None
+        self._notify_enabled = False
+
         # Generate unique worker ID for this process
         self.worker_id = generate_worker_id(service_name)
 
@@ -118,6 +143,31 @@ class EddaApp:
         self._background_tasks: list[asyncio.Task[Any]] = []
         self._initialized = False
 
+        # Wake event for notify-triggered background tasks
+        self._resume_wake_event: asyncio.Event | None = None
+        self._outbox_wake_event: asyncio.Event | None = None
+
+        # Rate limiting for NOTIFY handlers (to reduce thundering herd)
+        self._last_resume_notify_time: float = 0.0
+        self._last_outbox_notify_time: float = 0.0
+        self._notify_rate_limit: float = 0.1  # 100ms minimum interval
+
+        # Batch processing settings for load balancing
+        if isinstance(max_workflows_per_batch, int):
+            self._max_workflows_per_batch: int = max_workflows_per_batch
+            self._batch_size_strategy: str | None = None
+        elif max_workflows_per_batch == "auto":
+            self._max_workflows_per_batch = 10  # Initial value
+            self._batch_size_strategy = "queue"  # Scale based on queue depth
+        elif max_workflows_per_batch == "auto:cpu":
+            self._max_workflows_per_batch = 10  # Initial value
+            self._batch_size_strategy = "cpu"  # Scale based on CPU utilization
+        else:
+            raise ValueError(
+                f"Invalid max_workflows_per_batch: {max_workflows_per_batch}. "
+                "Must be int, 'auto', or 'auto:cpu'."
+            )
+
     def _create_storage(self, db_url: str) -> SQLAlchemyStorage:
         """
         Create storage backend from database URL.
@@ -166,6 +216,162 @@ class EddaApp:
 
         return SQLAlchemyStorage(engine)
 
+    def _is_postgresql_url(self, db_url: str) -> bool:
+        """Check if the database URL is for PostgreSQL."""
+        return db_url.startswith("postgresql")
+
+    async def _initialize_notify_listener(self) -> None:
+        """Initialize PostgreSQL LISTEN/NOTIFY listener if applicable.
+
+        This sets up the notification system based on configuration:
+        - None (auto): Enable for PostgreSQL, disable for others
+        - True: Force enable (error if not PostgreSQL)
+        - False: Force disable
+        """
+        is_pg = self._is_postgresql_url(self.db_url)
+
+        # Determine if we should use NOTIFY
+        if self._use_listen_notify is None:
+            # Auto-detect: enable for PostgreSQL only
+            should_use_notify = is_pg
+        elif self._use_listen_notify:
+            # Force enable: error if not PostgreSQL
+            if not is_pg:
+                raise ValueError(
+                    "use_listen_notify=True requires PostgreSQL database. "
+                    f"Current database URL starts with: {self.db_url.split(':')[0]}"
+                )
+            should_use_notify = True
+        else:
+            # Force disable
+            should_use_notify = False
+
+        if should_use_notify:
+            try:
+                from edda.storage.pg_notify import PostgresNotifyListener
+
+                # Convert SQLAlchemy URL to asyncpg DSN format
+                asyncpg_dsn = self._get_asyncpg_dsn(self.db_url)
+
+                self._notify_listener = PostgresNotifyListener(dsn=asyncpg_dsn)
+                await self._notify_listener.start()
+
+                # Set listener on storage for NOTIFY calls
+                self.storage.set_notify_listener(self._notify_listener)
+
+                # Initialize wake events for background tasks
+                self._resume_wake_event = asyncio.Event()
+                self._outbox_wake_event = asyncio.Event()
+
+                # Subscribe to notification channels
+                await self._setup_notify_subscriptions()
+
+                self._notify_enabled = True
+                logger.info(
+                    "PostgreSQL LISTEN/NOTIFY enabled "
+                    f"(fallback polling interval: {self._notify_fallback_interval}s)"
+                )
+
+            except ImportError:
+                logger.warning(
+                    "asyncpg not installed, falling back to polling-only mode. "
+                    "Install with: pip install edda[postgres-notify]"
+                )
+                self._notify_enabled = False
+            except Exception as e:
+                logger.warning(
+                    f"Failed to initialize NOTIFY listener: {e}. "
+                    "Falling back to polling-only mode."
+                )
+                self._notify_enabled = False
+        else:
+            db_type = self.db_url.split(":")[0]
+            logger.info(
+                f"LISTEN/NOTIFY not available for {db_type}, "
+                "using polling-only mode (default intervals)"
+            )
+
+    def _get_asyncpg_dsn(self, db_url: str) -> str:
+        """Convert SQLAlchemy PostgreSQL URL to asyncpg DSN format.
+
+        SQLAlchemy format: postgresql+asyncpg://user:pass@host/db
+        asyncpg format: postgresql://user:pass@host/db
+        """
+        # Remove +asyncpg driver suffix if present
+        if "+asyncpg" in db_url:
+            return db_url.replace("+asyncpg", "")
+        return db_url
+
+    async def _setup_notify_subscriptions(self) -> None:
+        """Set up LISTEN subscriptions for notification channels."""
+        if self._notify_listener is None:
+            return
+
+        # Subscribe to workflow resumable notifications
+        await self._notify_listener.subscribe(
+            "edda_workflow_resumable",
+            self._on_workflow_resumable_notify,
+        )
+
+        # Subscribe to outbox notifications
+        await self._notify_listener.subscribe(
+            "edda_outbox_pending",
+            self._on_outbox_pending_notify,
+        )
+
+        # Subscribe to timer expired notifications
+        await self._notify_listener.subscribe(
+            "edda_timer_expired",
+            self._on_timer_expired_notify,
+        )
+
+        logger.debug("Subscribed to NOTIFY channels")
+
+    async def _on_workflow_resumable_notify(self, _payload: str) -> None:
+        """Handle workflow resumable notification with rate limiting."""
+        try:
+            # Rate limit to reduce thundering herd
+            now = time.monotonic()
+            if now - self._last_resume_notify_time < self._notify_rate_limit:
+                return  # Skip if within rate limit window
+            self._last_resume_notify_time = now
+
+            # Wake up the resume polling loop
+            if self._resume_wake_event is not None:
+                self._resume_wake_event.set()
+        except Exception as e:
+            logger.warning(f"Error handling workflow resumable notify: {e}")
+
+    async def _on_outbox_pending_notify(self, _payload: str) -> None:
+        """Handle outbox pending notification with rate limiting."""
+        try:
+            # Rate limit to reduce thundering herd
+            now = time.monotonic()
+            if now - self._last_outbox_notify_time < self._notify_rate_limit:
+                return  # Skip if within rate limit window
+            self._last_outbox_notify_time = now
+
+            # Wake up the outbox polling loop
+            if self._outbox_wake_event is not None:
+                self._outbox_wake_event.set()
+        except Exception as e:
+            logger.warning(f"Error handling outbox pending notify: {e}")
+
+    async def _on_timer_expired_notify(self, _payload: str) -> None:
+        """Handle timer expired notification with rate limiting."""
+        try:
+            # Rate limit (shares with workflow resumable since they use same event)
+            now = time.monotonic()
+            if now - self._last_resume_notify_time < self._notify_rate_limit:
+                return  # Skip if within rate limit window
+            self._last_resume_notify_time = now
+
+            # Wake up the resume polling loop (timer expiry leads to workflow resume)
+            if self._resume_wake_event is not None:
+                self._resume_wake_event.set()
+        except Exception as e:
+            logger.warning(f"Error handling timer expired notify: {e}")
+
     async def initialize(self) -> None:
         """
         Initialize the application.
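The listener wiring above relies on the new edda/storage/pg_notify.py module, whose contents are not shown in this section. As background, here is a small self-contained sketch of the underlying PostgreSQL LISTEN/NOTIFY round trip using asyncpg directly; the channel name matches one used above, but the DSN is illustrative and this is not edda's PostgresNotifyListener implementation.

import asyncio
import asyncpg  # optional dependency; edda pulls it in via the postgres-notify extra

async def main() -> None:
    dsn = "postgresql://edda:secret@localhost/edda"  # hypothetical DSN (no +asyncpg suffix)
    listener = await asyncpg.connect(dsn)
    woken = asyncio.Event()

    def on_notify(conn, pid, channel, payload):
        # asyncpg invokes this callback for every NOTIFY on a subscribed channel
        woken.set()

    await listener.add_listener("edda_workflow_resumable", on_notify)

    # A second connection (e.g. the one that just unlocked a workflow) sends the signal
    sender = await asyncpg.connect(dsn)
    await sender.execute("SELECT pg_notify('edda_workflow_resumable', '')")

    await asyncio.wait_for(woken.wait(), timeout=5)  # wakes almost immediately
    await listener.close()
    await sender.close()

asyncio.run(main())

Note that _get_asyncpg_dsn above only strips the "+asyncpg" driver marker, so a SQLAlchemy URL like postgresql+asyncpg://user:pass@host/db becomes the plain postgresql://user:pass@host/db form that asyncpg expects.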
@@ -185,6 +391,9 @@ class EddaApp:
         # Initialize storage
         await self.storage.initialize()
 
+        # Initialize LISTEN/NOTIFY if enabled
+        await self._initialize_notify_listener()
+
         # Initialize replay engine
         self.replay_engine = ReplayEngine(
             storage=self.storage,
@@ -200,12 +409,17 @@ class EddaApp:
         # Initialize outbox relayer if enabled
         if self.outbox_enabled:
             assert self.broker_url is not None  # Validated in __init__
+            # Use longer poll interval with NOTIFY fallback
+            outbox_poll_interval = (
+                float(self._notify_fallback_interval) if self._notify_enabled else 1.0
+            )
             self.outbox_relayer = OutboxRelayer(
                 storage=self.storage,
                 broker_url=self.broker_url,
-                poll_interval=
+                poll_interval=outbox_poll_interval,
                 max_retries=3,
                 batch_size=10,
+                wake_event=self._outbox_wake_event,
             )
             await self.outbox_relayer.start()
 
@@ -227,6 +441,14 @@ class EddaApp:
         if self.outbox_relayer:
             await self.outbox_relayer.stop()
 
+        # Stop NOTIFY listener if enabled
+        if self._notify_listener is not None:
+            try:
+                await self._notify_listener.stop()
+                logger.info("NOTIFY listener stopped")
+            except Exception as e:
+                logger.warning(f"Error stopping NOTIFY listener: {e}")
+
         # Cancel background tasks
         for task in self._background_tasks:
             task.cancel()
@@ -588,9 +810,18 @@ class EddaApp:
                 logger.warning("No activity_id in timer for %s, skipping", instance_id)
                 continue
 
-            #
-            #
-
+            # Check if workflow is registered in this worker BEFORE acquiring lock
+            # In multi-app environments, another worker may own this workflow
+            from edda.workflow import get_all_workflows
+
+            workflows = get_all_workflows()
+            if workflow_name not in workflows:
+                logger.debug(
+                    "Skipping timer for unregistered workflow: " "instance_id=%s, workflow_name=%s",
+                    instance_id,
+                    workflow_name,
+                )
+                continue  # Let another worker handle it
 
             # Distributed Coroutines: Acquire lock FIRST to prevent race conditions
             # This ensures only ONE pod processes this timer, even if multiple pods
@@ -709,7 +940,25 @@ class EddaApp:
             instance_id = subscription["instance_id"]
             channel = subscription["channel"]
             timeout_at = subscription["timeout_at"]
-
+            workflow_name = subscription.get("workflow_name")
+
+            if not workflow_name:
+                logger.warning("No workflow_name in subscription for %s, skipping", instance_id)
+                continue
+
+            # Check if workflow is registered in this worker BEFORE acquiring lock
+            # In multi-app environments, another worker may own this workflow
+            from edda.workflow import get_all_workflows
+
+            workflows = get_all_workflows()
+            if workflow_name not in workflows:
+                logger.debug(
+                    "Skipping message subscription for unregistered workflow: "
+                    "instance_id=%s, workflow_name=%s",
+                    instance_id,
+                    workflow_name,
+                )
+                continue  # Let another worker handle it
 
             # Lock-First pattern: Try to acquire the lock before processing
             # If we can't get the lock, another worker is processing this workflow
@@ -777,48 +1026,20 @@ class EddaApp:
                 # 2. Remove message subscription
                 await self.storage.remove_message_subscription(instance_id, channel)
 
-                # 3.
-
-
-
-
-                try:
-                    timeout_dt = (
-                        timeout_at
-                        if isinstance(timeout_at, dt_type)
-                        else dt_type.fromisoformat(str(timeout_at))
-                    )
-                    created_dt = (
-                        created_at
-                        if isinstance(created_at, dt_type)
-                        else dt_type.fromisoformat(str(created_at))
-                    )
-                    # Calculate the original timeout duration (timeout_at - created_at)
-                    timeout_seconds = int((timeout_dt - created_dt).total_seconds())
-                except Exception:
-                    timeout_seconds = 0  # Fallback
-
-                error = TimeoutError(
-                    f"Message on channel '{channel}' did not arrive within {timeout_seconds} seconds"
-                )
-                stack_trace = "".join(
-                    traceback.format_exception(type(error), error, error.__traceback__)
-                )
+                # 3. Resume workflow (lock already held - distributed coroutine pattern)
+                # The workflow will replay and receive() will raise TimeoutError from cached history
+                if self.replay_engine is None:
+                    logger.error("Replay engine not initialized")
+                    continue
 
-
-
-                    instance_id,
-                    "failed",
-                    {
-                        "error_message": str(error),
-                        "error_type": "TimeoutError",
-                        "stack_trace": stack_trace,
-                    },
+                await self.replay_engine.resume_by_name(
+                    instance_id, workflow_name, already_locked=True
                 )
 
                 logger.debug(
-                    "
+                    "Resumed workflow %s after message timeout on channel '%s'",
                     instance_id,
+                    channel,
                 )
 
             except Exception as e:
@@ -851,27 +1072,51 @@ class EddaApp:
 
         This provides fast resumption after message delivery. When deliver_message()
         sets a workflow's status to 'running' and releases the lock, this task
-        will pick it up
+        will pick it up and resume it.
+
+        When NOTIFY is enabled:
+        - Wakes up immediately when notified via _resume_wake_event
+        - Falls back to notify_fallback_interval (default 30s) if no notifications
 
-
-        -
+        When NOTIFY is disabled (SQLite/MySQL):
+        - Uses adaptive backoff to reduce DB load when no workflows are ready
+        - When workflows are processed, uses base interval (1s)
         - When no workflows found, exponentially backs off up to 60 seconds
         - Always adds jitter to prevent thundering herd in multi-pod deployments
 
         Args:
-            interval:
+            interval: Base check interval in seconds (default: 1)
         """
         consecutive_empty = 0  # Track empty results for adaptive backoff
+
+        # Use longer fallback interval when NOTIFY is enabled
+        effective_interval = self._notify_fallback_interval if self._notify_enabled else interval
+
         while True:
             try:
-
-
-
-
-
+                if self._notify_enabled and self._resume_wake_event is not None:
+                    # NOTIFY mode: wait for event or timeout
+                    jitter = random.uniform(0, effective_interval * 0.1)
+                    try:
+                        await asyncio.wait_for(
+                            self._resume_wake_event.wait(),
+                            timeout=effective_interval + jitter,
+                        )
+                        # Clear the event for next notification
+                        self._resume_wake_event.clear()
+                        logger.debug("Resume task woken by NOTIFY")
+                    except TimeoutError:
+                        # Fallback polling timeout reached
+                        pass
                 else:
-
-
+                    # Polling mode: adaptive backoff
+                    jitter = random.uniform(0, interval * 0.3)
+                    if consecutive_empty > 0:
+                        # Exponential backoff: 2s, 4s, 8s, 16s, 32s, max 60s
+                        backoff = min(interval * (2 ** min(consecutive_empty, 5)), 60)
+                    else:
+                        backoff = interval
+                    await asyncio.sleep(backoff + jitter)
 
                 count = await self._resume_running_workflows()
                 if count == 0:
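To make the polling fallback schedule concrete, here is a small standalone snippet reproducing the backoff expression from the hunk above, assuming the default base interval of 1 second (jitter of up to 30% of the base interval is added on top in the actual loop):

# Mirrors: min(interval * (2 ** min(consecutive_empty, 5)), 60)
interval = 1  # base check interval in seconds (default)
for consecutive_empty in range(1, 8):
    backoff = min(interval * (2 ** min(consecutive_empty, 5)), 60)
    print(consecutive_empty, backoff)
# -> 2, 4, 8, 16, 32, then stays at 32; the 60-second cap only
#    comes into play for larger base intervals.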
@@ -882,6 +1127,58 @@ class EddaApp:
                 consecutive_empty = 0  # Reset on error
                 logger.error("Error in periodic resume check: %s", e, exc_info=True)
 
+    def _calculate_effective_batch_size(self, pending_count: int) -> int:
+        """
+        Calculate the effective batch size based on the configured strategy.
+
+        Args:
+            pending_count: Number of resumable workflows detected in the previous cycle.
+
+        Returns:
+            Effective batch size to use for the next cycle.
+
+        Strategies:
+        - None (static): Returns the configured _max_workflows_per_batch
+        - "queue": Scales 10-100 based on queue depth
+        - "cpu": Scales 10-100 based on CPU utilization (requires psutil)
+        """
+        if self._batch_size_strategy is None:
+            return self._max_workflows_per_batch
+
+        base_size = 10
+        max_size = 100
+
+        if self._batch_size_strategy == "queue":
+            # Queue-based scaling: scale up when more workflows are waiting
+            if pending_count <= base_size:
+                return base_size
+            scale_factor = min(math.ceil(pending_count / base_size), max_size // base_size)
+            return min(base_size * scale_factor, max_size)
+
+        elif self._batch_size_strategy == "cpu":
+            # CPU-based scaling: scale up when CPU is idle, down when busy
+            try:
+                import psutil  # type: ignore[import-untyped]
+
+                cpu_percent = psutil.cpu_percent(interval=None)  # Non-blocking
+
+                if cpu_percent < 30:
+                    return max_size  # Low load: process aggressively
+                elif cpu_percent < 50:
+                    return 50  # Medium load
+                elif cpu_percent < 70:
+                    return 20  # Higher load
+                else:
+                    return base_size  # High load: process conservatively
+            except ImportError:
+                logger.warning(
+                    "psutil not installed, falling back to default batch size. "
+                    "Install with: pip install edda-framework[cpu-monitor]"
+                )
+                return self._max_workflows_per_batch
+
+        return self._max_workflows_per_batch
+
     async def _resume_running_workflows(self) -> int:
         """
         Find and resume workflows that are ready to run.
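A quick check of the "queue" branch above with concrete numbers, as a plain-Python snippet that reproduces the same arithmetic outside the class:

import math

def queue_batch(pending_count: int, base_size: int = 10, max_size: int = 100) -> int:
    # Same arithmetic as the "queue" branch of _calculate_effective_batch_size
    if pending_count <= base_size:
        return base_size
    scale_factor = min(math.ceil(pending_count / base_size), max_size // base_size)
    return min(base_size * scale_factor, max_size)

assert queue_batch(7) == 10     # small backlog: stay at the base size
assert queue_batch(35) == 40    # ceil(35 / 10) = 4 -> 4 * 10
assert queue_batch(250) == 100  # capped at max_size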
@@ -889,13 +1186,21 @@ class EddaApp:
         Finds workflows with status='running' that don't have a lock,
         acquires a lock, and resumes them.
 
+        Uses batch limiting to ensure fair load distribution across workers.
+        Supports static batch size and dynamic auto-scaling strategies.
+
         Returns:
             Number of workflows successfully processed (lock acquired and resumed).
         """
-
+        effective_batch = self._max_workflows_per_batch
+        resumable = await self.storage.find_resumable_workflows(limit=effective_batch)
         processed_count = 0
 
         for workflow_info in resumable:
+            # Batch limit for load balancing across workers
+            if processed_count >= effective_batch:
+                break
+
             instance_id = workflow_info["instance_id"]
             workflow_name = workflow_info["workflow_name"]
 
@@ -903,7 +1208,7 @@ class EddaApp:
             # Try to acquire lock (Lock-First pattern)
             lock_acquired = await self.storage.try_acquire_lock(instance_id, self.worker_id)
             if not lock_acquired:
-                # Another worker got it first, skip
+                # Another worker got it first, skip (doesn't count toward limit)
                 continue
 
             try:
@@ -922,6 +1227,10 @@ class EddaApp:
             except Exception as e:
                 logger.error("Error resuming %s: %s", instance_id, e, exc_info=True)
 
+        # Update batch size for next cycle (auto modes only)
+        if self._batch_size_strategy is not None:
+            self._max_workflows_per_batch = self._calculate_effective_batch_size(len(resumable))
+
         return processed_count
 
     async def _cleanup_old_messages_periodically(
@@ -969,6 +1278,69 @@ class EddaApp:
         except Exception as e:
             logger.error("Error cleaning up old messages: %s", e, exc_info=True)
 
+    # -------------------------------------------------------------------------
+    # Query API Methods
+    # -------------------------------------------------------------------------
+
+    async def find_instances(
+        self,
+        *,
+        input_filters: dict[str, Any] | None = None,
+        status: str | None = None,
+        workflow_name: str | None = None,
+        instance_id: str | None = None,
+        started_after: datetime | None = None,
+        started_before: datetime | None = None,
+        limit: int = 50,
+        page_token: str | None = None,
+    ) -> dict[str, Any]:
+        """
+        Find workflow instances with filtering support.
+
+        This is a high-level API for querying workflow instances by various
+        criteria, including input parameter values.
+
+        Args:
+            input_filters: Filter by input data values. Keys are JSON paths,
+                values are expected values (exact match).
+                Example: {"order_id": "ORD-123"}
+            status: Filter by workflow status (e.g., "running", "completed")
+            workflow_name: Filter by workflow name (partial match, case-insensitive)
+            instance_id: Filter by instance ID (partial match, case-insensitive)
+            started_after: Filter instances started after this datetime (inclusive)
+            started_before: Filter instances started before this datetime (inclusive)
+            limit: Maximum number of instances to return per page (default: 50)
+            page_token: Cursor for pagination (from previous response)
+
+        Returns:
+            Dictionary containing:
+            - instances: List of matching workflow instances
+            - next_page_token: Cursor for the next page, or None if no more pages
+            - has_more: Boolean indicating if there are more pages
+
+        Example:
+            >>> # Find all instances with order_id = "ORD-123"
+            >>> result = await app.find_instances(input_filters={"order_id": "ORD-123"})
+            >>> for instance in result["instances"]:
+            ...     print(f"{instance['instance_id']}: {instance['status']}")
+
+            >>> # Find running instances with specific customer
+            >>> result = await app.find_instances(
+            ...     input_filters={"customer_id": "CUST-456"},
+            ...     status="running"
+            ... )
+        """
+        return await self.storage.list_instances(
+            limit=limit,
+            page_token=page_token,
+            status_filter=status,
+            workflow_name_filter=workflow_name,
+            instance_id_filter=instance_id,
+            started_after=started_after,
+            started_before=started_before,
+            input_filters=input_filters,
+        )
+
     # -------------------------------------------------------------------------
     # ASGI Interface
     # -------------------------------------------------------------------------
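The docstring examples above show single-page queries; a short sketch of paging through results using the documented next_page_token / has_more fields follows. The filter values and the workflow name are illustrative assumptions.

# Illustrative: collect every completed instance of a hypothetical "order_workflow",
# following the cursor until has_more is False.
instances = []
page_token = None
while True:
    result = await app.find_instances(
        workflow_name="order_workflow",
        status="completed",
        limit=50,
        page_token=page_token,
    )
    instances.extend(result["instances"])
    if not result["has_more"]:
        break
    page_token = result["next_page_token"]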
edda/context.py
CHANGED
@@ -217,6 +217,14 @@ class WorkflowContext:
                # Cache the timer result for wait_timer replay
                # Timer returns None, so we cache the result field
                self._history_cache[activity_id] = event_data.get("result")
+            elif event_type == "MessageTimeout":
+                # Cache the timeout error for receive() replay
+                # This allows TimeoutError to be raised and caught in workflow code
+                self._history_cache[activity_id] = {
+                    "_error": True,
+                    "error_type": event_data.get("error_type", "TimeoutError"),
+                    "error_message": event_data.get("error_message", "Message timeout"),
+                }
 
         self._history_loaded = True
 
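For orientation, a minimal standalone sketch of how a cached MessageTimeout entry can be turned back into an exception during replay. The dict shape matches what context.py now caches; the replay helper itself is an assumption for illustration, not code from this diff.

# Shape matches the MessageTimeout entry cached by WorkflowContext above.
cached = {
    "_error": True,
    "error_type": "TimeoutError",
    "error_message": "Message timeout",
}

def replay_cached_result(entry):
    # Hypothetical helper: re-raise cached errors, return normal results as-is.
    if isinstance(entry, dict) and entry.get("_error"):
        exc_class = {"TimeoutError": TimeoutError}.get(entry["error_type"], RuntimeError)
        raise exc_class(entry["error_message"])
    return entry

try:
    replay_cached_result(cached)
except TimeoutError as e:
    print(f"receive() would re-raise this during replay: {e}")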