edda-framework 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
edda/app.py CHANGED
@@ -8,10 +8,13 @@ application for handling CloudEvents and executing workflows.
  import asyncio
  import json
  import logging
+ import math
  import random
  import sys
+ import time
  from collections.abc import Callable
- from typing import Any
+ from datetime import datetime
+ from typing import Any, Literal

  import uvloop
  from cloudevents.exceptions import GenericException as CloudEventsException
@@ -55,6 +58,11 @@ class EddaApp:
  pool_timeout: int = 30,
  pool_recycle: int = 3600,
  pool_pre_ping: bool = True,
+ # PostgreSQL LISTEN/NOTIFY settings
+ use_listen_notify: bool | None = None,
+ notify_fallback_interval: int = 30,
+ # Batch processing settings
+ max_workflows_per_batch: int | Literal["auto", "auto:cpu"] = 10,
  ):
  """
  Initialize Edda application.
@@ -81,6 +89,17 @@ class EddaApp:
  Helps prevent stale connections. Ignored for SQLite.
  pool_pre_ping: If True, test connections before use (default: True).
  Helps detect disconnected connections. Ignored for SQLite.
+ use_listen_notify: Enable PostgreSQL LISTEN/NOTIFY for instant notifications.
+ None (default) = auto-detect (enabled for PostgreSQL, disabled for others).
+ True = force enable (raises error if not PostgreSQL).
+ False = force disable (use polling only).
+ notify_fallback_interval: Polling interval in seconds when NOTIFY is enabled.
+ Used as backup for missed notifications. Default: 30 seconds.
+ SQLite/MySQL always use their default polling intervals.
+ max_workflows_per_batch: Maximum workflows to process per resume cycle.
+ - int: Fixed batch size (default: 10)
+ - "auto": Scale 10-100 based on queue depth
+ - "auto:cpu": Scale 10-100 based on CPU utilization (requires psutil)
  """
  self.db_url = db_url
  self.service_name = service_name
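Taken together, the new constructor options can be combined as in the following minimal sketch. The database URL and service name are placeholder values, and the import path is only inferred from the file being edda/app.py; the parameter names and the postgresql+asyncpg URL format come from this diff.

    from edda.app import EddaApp

    app = EddaApp(
        db_url="postgresql+asyncpg://user:pass@localhost/edda",
        service_name="order-service",
        use_listen_notify=None,          # auto-detect: NOTIFY on PostgreSQL, polling elsewhere
        notify_fallback_interval=30,     # backup polling every 30s while NOTIFY is active
        max_workflows_per_batch="auto",  # scale batch size 10-100 with queue depth
    )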
@@ -99,6 +118,12 @@ class EddaApp:
  self._pool_recycle = pool_recycle
  self._pool_pre_ping = pool_pre_ping

+ # PostgreSQL LISTEN/NOTIFY settings
+ self._use_listen_notify = use_listen_notify
+ self._notify_fallback_interval = notify_fallback_interval
+ self._notify_listener: Any = None
+ self._notify_enabled = False
+
  # Generate unique worker ID for this process
  self.worker_id = generate_worker_id(service_name)

@@ -118,6 +143,31 @@ class EddaApp:
  self._background_tasks: list[asyncio.Task[Any]] = []
  self._initialized = False

+ # Wake event for notify-triggered background tasks
+ self._resume_wake_event: asyncio.Event | None = None
+ self._outbox_wake_event: asyncio.Event | None = None
+
+ # Rate limiting for NOTIFY handlers (to reduce thundering herd)
+ self._last_resume_notify_time: float = 0.0
+ self._last_outbox_notify_time: float = 0.0
+ self._notify_rate_limit: float = 0.1 # 100ms minimum interval
+
+ # Batch processing settings for load balancing
+ if isinstance(max_workflows_per_batch, int):
+ self._max_workflows_per_batch: int = max_workflows_per_batch
+ self._batch_size_strategy: str | None = None
+ elif max_workflows_per_batch == "auto":
+ self._max_workflows_per_batch = 10 # Initial value
+ self._batch_size_strategy = "queue" # Scale based on queue depth
+ elif max_workflows_per_batch == "auto:cpu":
+ self._max_workflows_per_batch = 10 # Initial value
+ self._batch_size_strategy = "cpu" # Scale based on CPU utilization
+ else:
+ raise ValueError(
+ f"Invalid max_workflows_per_batch: {max_workflows_per_batch}. "
+ "Must be int, 'auto', or 'auto:cpu'."
+ )
+
  def _create_storage(self, db_url: str) -> SQLAlchemyStorage:
  """
  Create storage backend from database URL.
@@ -166,6 +216,162 @@ class EddaApp:

  return SQLAlchemyStorage(engine)

+ def _is_postgresql_url(self, db_url: str) -> bool:
+ """Check if the database URL is for PostgreSQL."""
+ return db_url.startswith("postgresql")
+
+ async def _initialize_notify_listener(self) -> None:
+ """Initialize PostgreSQL LISTEN/NOTIFY listener if applicable.
+
+ This sets up the notification system based on configuration:
+ - None (auto): Enable for PostgreSQL, disable for others
+ - True: Force enable (error if not PostgreSQL)
+ - False: Force disable
+ """
+ is_pg = self._is_postgresql_url(self.db_url)
+
+ # Determine if we should use NOTIFY
+ if self._use_listen_notify is None:
+ # Auto-detect: enable for PostgreSQL only
+ should_use_notify = is_pg
+ elif self._use_listen_notify:
+ # Force enable: error if not PostgreSQL
+ if not is_pg:
+ raise ValueError(
+ "use_listen_notify=True requires PostgreSQL database. "
+ f"Current database URL starts with: {self.db_url.split(':')[0]}"
+ )
+ should_use_notify = True
+ else:
+ # Force disable
+ should_use_notify = False
+
+ if should_use_notify:
+ try:
+ from edda.storage.pg_notify import PostgresNotifyListener
+
+ # Convert SQLAlchemy URL to asyncpg DSN format
+ asyncpg_dsn = self._get_asyncpg_dsn(self.db_url)
+
+ self._notify_listener = PostgresNotifyListener(dsn=asyncpg_dsn)
+ await self._notify_listener.start()
+
+ # Set listener on storage for NOTIFY calls
+ self.storage.set_notify_listener(self._notify_listener)
+
+ # Initialize wake events for background tasks
+ self._resume_wake_event = asyncio.Event()
+ self._outbox_wake_event = asyncio.Event()
+
+ # Subscribe to notification channels
+ await self._setup_notify_subscriptions()
+
+ self._notify_enabled = True
+ logger.info(
+ "PostgreSQL LISTEN/NOTIFY enabled "
+ f"(fallback polling interval: {self._notify_fallback_interval}s)"
+ )
+
+ except ImportError:
+ logger.warning(
+ "asyncpg not installed, falling back to polling-only mode. "
+ "Install with: pip install edda[postgres-notify]"
+ )
+ self._notify_enabled = False
+ except Exception as e:
+ logger.warning(
+ f"Failed to initialize NOTIFY listener: {e}. "
+ "Falling back to polling-only mode."
+ )
+ self._notify_enabled = False
+ else:
+ db_type = self.db_url.split(":")[0]
+ logger.info(
+ f"LISTEN/NOTIFY not available for {db_type}, "
+ "using polling-only mode (default intervals)"
+ )
+
+ def _get_asyncpg_dsn(self, db_url: str) -> str:
+ """Convert SQLAlchemy PostgreSQL URL to asyncpg DSN format.
+
+ SQLAlchemy format: postgresql+asyncpg://user:pass@host/db
+ asyncpg format: postgresql://user:pass@host/db
+ """
+ # Remove +asyncpg driver suffix if present
+ if "+asyncpg" in db_url:
+ return db_url.replace("+asyncpg", "")
+ return db_url
+
+ async def _setup_notify_subscriptions(self) -> None:
+ """Set up LISTEN subscriptions for notification channels."""
+ if self._notify_listener is None:
+ return
+
+ # Subscribe to workflow resumable notifications
+ await self._notify_listener.subscribe(
+ "edda_workflow_resumable",
+ self._on_workflow_resumable_notify,
+ )
+
+ # Subscribe to outbox notifications
+ await self._notify_listener.subscribe(
+ "edda_outbox_pending",
+ self._on_outbox_pending_notify,
+ )
+
+ # Subscribe to timer expired notifications
+ await self._notify_listener.subscribe(
+ "edda_timer_expired",
+ self._on_timer_expired_notify,
+ )
+
+ logger.debug("Subscribed to NOTIFY channels")
+
+ async def _on_workflow_resumable_notify(self, _payload: str) -> None:
+ """Handle workflow resumable notification with rate limiting."""
+ try:
+ # Rate limit to reduce thundering herd
+ now = time.monotonic()
+ if now - self._last_resume_notify_time < self._notify_rate_limit:
+ return # Skip if within rate limit window
+ self._last_resume_notify_time = now
+
+ # Wake up the resume polling loop
+ if self._resume_wake_event is not None:
+ self._resume_wake_event.set()
+ except Exception as e:
+ logger.warning(f"Error handling workflow resumable notify: {e}")
+
+ async def _on_outbox_pending_notify(self, _payload: str) -> None:
+ """Handle outbox pending notification with rate limiting."""
+ try:
+ # Rate limit to reduce thundering herd
+ now = time.monotonic()
+ if now - self._last_outbox_notify_time < self._notify_rate_limit:
+ return # Skip if within rate limit window
+ self._last_outbox_notify_time = now
+
+ # Wake up the outbox polling loop
+ if self._outbox_wake_event is not None:
+ self._outbox_wake_event.set()
+ except Exception as e:
+ logger.warning(f"Error handling outbox pending notify: {e}")
+
+ async def _on_timer_expired_notify(self, _payload: str) -> None:
+ """Handle timer expired notification with rate limiting."""
+ try:
+ # Rate limit (shares with workflow resumable since they use same event)
+ now = time.monotonic()
+ if now - self._last_resume_notify_time < self._notify_rate_limit:
+ return # Skip if within rate limit window
+ self._last_resume_notify_time = now
+
+ # Wake up the resume polling loop (timer expiry leads to workflow resume)
+ if self._resume_wake_event is not None:
+ self._resume_wake_event.set()
+ except Exception as e:
+ logger.warning(f"Error handling timer expired notify: {e}")
+
  async def initialize(self) -> None:
  """
  Initialize the application.
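The PostgresNotifyListener imported from edda.storage.pg_notify is not part of this diff, so its internals are unknown. The sketch below only illustrates the asyncpg LISTEN plumbing that the calls above (start(), subscribe(channel, handler), stop()) appear to rely on; the class name, structure, and everything not visible in the hunk are assumptions.

    import asyncio
    from collections.abc import Awaitable, Callable

    import asyncpg  # optional dependency; the diff suggests `pip install edda[postgres-notify]`


    class NotifyListenerSketch:
        """Illustrative stand-in for PostgresNotifyListener (internals assumed)."""

        def __init__(self, dsn: str) -> None:
            self._dsn = dsn  # plain postgresql:// DSN, as produced by _get_asyncpg_dsn()
            self._conn: asyncpg.Connection | None = None

        async def start(self) -> None:
            # One dedicated connection carries all LISTEN subscriptions.
            self._conn = await asyncpg.connect(self._dsn)

        async def subscribe(
            self, channel: str, handler: Callable[[str], Awaitable[None]]
        ) -> None:
            assert self._conn is not None

            def _on_notify(conn, pid, chan, payload):  # asyncpg callback signature
                # Hand the payload to the async handler, e.g. _on_workflow_resumable_notify.
                asyncio.get_running_loop().create_task(handler(payload))

            await self._conn.add_listener(channel, _on_notify)

        async def stop(self) -> None:
            if self._conn is not None:
                await self._conn.close()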
@@ -185,6 +391,9 @@ class EddaApp:
  # Initialize storage
  await self.storage.initialize()

+ # Initialize LISTEN/NOTIFY if enabled
+ await self._initialize_notify_listener()
+
  # Initialize replay engine
  self.replay_engine = ReplayEngine(
  storage=self.storage,
@@ -200,12 +409,17 @@ class EddaApp:
  # Initialize outbox relayer if enabled
  if self.outbox_enabled:
  assert self.broker_url is not None # Validated in __init__
+ # Use longer poll interval with NOTIFY fallback
+ outbox_poll_interval = (
+ float(self._notify_fallback_interval) if self._notify_enabled else 1.0
+ )
  self.outbox_relayer = OutboxRelayer(
  storage=self.storage,
  broker_url=self.broker_url,
- poll_interval=1.0,
+ poll_interval=outbox_poll_interval,
  max_retries=3,
  batch_size=10,
+ wake_event=self._outbox_wake_event,
  )
  await self.outbox_relayer.start()

@@ -227,6 +441,14 @@ class EddaApp:
  if self.outbox_relayer:
  await self.outbox_relayer.stop()

+ # Stop NOTIFY listener if enabled
+ if self._notify_listener is not None:
+ try:
+ await self._notify_listener.stop()
+ logger.info("NOTIFY listener stopped")
+ except Exception as e:
+ logger.warning(f"Error stopping NOTIFY listener: {e}")
+
  # Cancel background tasks
  for task in self._background_tasks:
  task.cancel()
@@ -588,9 +810,18 @@ class EddaApp:
  logger.warning("No activity_id in timer for %s, skipping", instance_id)
  continue

- # Note: find_expired_timers() already filters by status='waiting_for_timer'
- # and JOINs with workflow_instances, so no need for additional get_instance() call.
- # The lock mechanism below handles race conditions.
+ # Check if workflow is registered in this worker BEFORE acquiring lock
+ # In multi-app environments, another worker may own this workflow
+ from edda.workflow import get_all_workflows
+
+ workflows = get_all_workflows()
+ if workflow_name not in workflows:
+ logger.debug(
+ "Skipping timer for unregistered workflow: " "instance_id=%s, workflow_name=%s",
+ instance_id,
+ workflow_name,
+ )
+ continue # Let another worker handle it

  # Distributed Coroutines: Acquire lock FIRST to prevent race conditions
  # This ensures only ONE pod processes this timer, even if multiple pods
@@ -709,7 +940,25 @@ class EddaApp:
  instance_id = subscription["instance_id"]
  channel = subscription["channel"]
  timeout_at = subscription["timeout_at"]
- created_at = subscription["created_at"]
+ workflow_name = subscription.get("workflow_name")
+
+ if not workflow_name:
+ logger.warning("No workflow_name in subscription for %s, skipping", instance_id)
+ continue
+
+ # Check if workflow is registered in this worker BEFORE acquiring lock
+ # In multi-app environments, another worker may own this workflow
+ from edda.workflow import get_all_workflows
+
+ workflows = get_all_workflows()
+ if workflow_name not in workflows:
+ logger.debug(
+ "Skipping message subscription for unregistered workflow: "
+ "instance_id=%s, workflow_name=%s",
+ instance_id,
+ workflow_name,
+ )
+ continue # Let another worker handle it

  # Lock-First pattern: Try to acquire the lock before processing
  # If we can't get the lock, another worker is processing this workflow
@@ -777,48 +1026,20 @@ class EddaApp:
  # 2. Remove message subscription
  await self.storage.remove_message_subscription(instance_id, channel)

- # 3. Fail the workflow with TimeoutError
- import traceback
-
- # Get timeout_seconds from timeout_at and created_at
- # Handle both datetime objects and ISO strings
- try:
- timeout_dt = (
- timeout_at
- if isinstance(timeout_at, dt_type)
- else dt_type.fromisoformat(str(timeout_at))
- )
- created_dt = (
- created_at
- if isinstance(created_at, dt_type)
- else dt_type.fromisoformat(str(created_at))
- )
- # Calculate the original timeout duration (timeout_at - created_at)
- timeout_seconds = int((timeout_dt - created_dt).total_seconds())
- except Exception:
- timeout_seconds = 0 # Fallback
-
- error = TimeoutError(
- f"Message on channel '{channel}' did not arrive within {timeout_seconds} seconds"
- )
- stack_trace = "".join(
- traceback.format_exception(type(error), error, error.__traceback__)
- )
+ # 3. Resume workflow (lock already held - distributed coroutine pattern)
+ # The workflow will replay and receive() will raise TimeoutError from cached history
+ if self.replay_engine is None:
+ logger.error("Replay engine not initialized")
+ continue

- # Update workflow status to failed with error details
- await self.storage.update_instance_status(
- instance_id,
- "failed",
- {
- "error_message": str(error),
- "error_type": "TimeoutError",
- "stack_trace": stack_trace,
- },
+ await self.replay_engine.resume_by_name(
+ instance_id, workflow_name, already_locked=True
  )

  logger.debug(
- "Marked workflow %s as failed due to message timeout",
+ "Resumed workflow %s after message timeout on channel '%s'",
  instance_id,
+ channel,
  )

  except Exception as e:
@@ -851,27 +1072,51 @@ class EddaApp:

  This provides fast resumption after message delivery. When deliver_message()
  sets a workflow's status to 'running' and releases the lock, this task
- will pick it up within 1 second and resume it.
+ will pick it up and resume it.
+
+ When NOTIFY is enabled:
+ - Wakes up immediately when notified via _resume_wake_event
+ - Falls back to notify_fallback_interval (default 30s) if no notifications

- Uses adaptive backoff to reduce DB load when no workflows are ready:
- When workflows are processed, uses base interval
+ When NOTIFY is disabled (SQLite/MySQL):
+ - Uses adaptive backoff to reduce DB load when no workflows are ready
+ - When workflows are processed, uses base interval (1s)
  - When no workflows found, exponentially backs off up to 60 seconds
  - Always adds jitter to prevent thundering herd in multi-pod deployments

  Args:
- interval: Check interval in seconds (default: 1)
+ interval: Base check interval in seconds (default: 1)
  """
  consecutive_empty = 0 # Track empty results for adaptive backoff
+
+ # Use longer fallback interval when NOTIFY is enabled
+ effective_interval = self._notify_fallback_interval if self._notify_enabled else interval
+
  while True:
  try:
- # Adaptive backoff: longer sleep when no work available
- jitter = random.uniform(0, interval * 0.3)
- if consecutive_empty > 0:
- # Exponential backoff: 2s, 4s, 8s, 16s, 32s, max 60s
- backoff = min(interval * (2 ** min(consecutive_empty, 5)), 60)
+ if self._notify_enabled and self._resume_wake_event is not None:
+ # NOTIFY mode: wait for event or timeout
+ jitter = random.uniform(0, effective_interval * 0.1)
+ try:
+ await asyncio.wait_for(
+ self._resume_wake_event.wait(),
+ timeout=effective_interval + jitter,
+ )
+ # Clear the event for next notification
+ self._resume_wake_event.clear()
+ logger.debug("Resume task woken by NOTIFY")
+ except TimeoutError:
+ # Fallback polling timeout reached
+ pass
  else:
- backoff = interval
- await asyncio.sleep(backoff + jitter)
+ # Polling mode: adaptive backoff
+ jitter = random.uniform(0, interval * 0.3)
+ if consecutive_empty > 0:
+ # Exponential backoff: 2s, 4s, 8s, 16s, 32s, max 60s
+ backoff = min(interval * (2 ** min(consecutive_empty, 5)), 60)
+ else:
+ backoff = interval
+ await asyncio.sleep(backoff + jitter)

  count = await self._resume_running_workflows()
  if count == 0:
@@ -882,6 +1127,58 @@ class EddaApp:
  consecutive_empty = 0 # Reset on error
  logger.error("Error in periodic resume check: %s", e, exc_info=True)

+ def _calculate_effective_batch_size(self, pending_count: int) -> int:
+ """
+ Calculate the effective batch size based on the configured strategy.
+
+ Args:
+ pending_count: Number of resumable workflows detected in the previous cycle.
+
+ Returns:
+ Effective batch size to use for the next cycle.
+
+ Strategies:
+ - None (static): Returns the configured _max_workflows_per_batch
+ - "queue": Scales 10-100 based on queue depth
+ - "cpu": Scales 10-100 based on CPU utilization (requires psutil)
+ """
+ if self._batch_size_strategy is None:
+ return self._max_workflows_per_batch
+
+ base_size = 10
+ max_size = 100
+
+ if self._batch_size_strategy == "queue":
+ # Queue-based scaling: scale up when more workflows are waiting
+ if pending_count <= base_size:
+ return base_size
+ scale_factor = min(math.ceil(pending_count / base_size), max_size // base_size)
+ return min(base_size * scale_factor, max_size)
+
+ elif self._batch_size_strategy == "cpu":
+ # CPU-based scaling: scale up when CPU is idle, down when busy
+ try:
+ import psutil # type: ignore[import-untyped]
+
+ cpu_percent = psutil.cpu_percent(interval=None) # Non-blocking
+
+ if cpu_percent < 30:
+ return max_size # Low load: process aggressively
+ elif cpu_percent < 50:
+ return 50 # Medium load
+ elif cpu_percent < 70:
+ return 20 # Higher load
+ else:
+ return base_size # High load: process conservatively
+ except ImportError:
+ logger.warning(
+ "psutil not installed, falling back to default batch size. "
+ "Install with: pip install edda-framework[cpu-monitor]"
+ )
+ return self._max_workflows_per_batch
+
+ return self._max_workflows_per_batch
+
  async def _resume_running_workflows(self) -> int:
  """
  Find and resume workflows that are ready to run.
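For the "queue" strategy, the scaling arithmetic reduces to the small helper below. It repeats the formula from the added method; the sample inputs in the comments are illustrative values, not taken from the diff.

    import math

    def queue_batch_size(pending_count: int, base_size: int = 10, max_size: int = 100) -> int:
        # Same formula as the "queue" branch of _calculate_effective_batch_size.
        if pending_count <= base_size:
            return base_size
        scale_factor = min(math.ceil(pending_count / base_size), max_size // base_size)
        return min(base_size * scale_factor, max_size)

    # pending=7   -> 10  (never below the base size)
    # pending=35  -> 40  (ceil(35 / 10) = 4, so 4 * 10)
    # pending=250 -> 100 (capped at max_size)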
@@ -889,13 +1186,21 @@ class EddaApp:
  Finds workflows with status='running' that don't have a lock,
  acquires a lock, and resumes them.

+ Uses batch limiting to ensure fair load distribution across workers.
+ Supports static batch size and dynamic auto-scaling strategies.
+
  Returns:
  Number of workflows successfully processed (lock acquired and resumed).
  """
- resumable = await self.storage.find_resumable_workflows()
+ effective_batch = self._max_workflows_per_batch
+ resumable = await self.storage.find_resumable_workflows(limit=effective_batch)
  processed_count = 0

  for workflow_info in resumable:
+ # Batch limit for load balancing across workers
+ if processed_count >= effective_batch:
+ break
+
  instance_id = workflow_info["instance_id"]
  workflow_name = workflow_info["workflow_name"]

@@ -903,7 +1208,7 @@ class EddaApp:
  # Try to acquire lock (Lock-First pattern)
  lock_acquired = await self.storage.try_acquire_lock(instance_id, self.worker_id)
  if not lock_acquired:
- # Another worker got it first, skip
+ # Another worker got it first, skip (doesn't count toward limit)
  continue

  try:
@@ -922,6 +1227,10 @@ class EddaApp:
  except Exception as e:
  logger.error("Error resuming %s: %s", instance_id, e, exc_info=True)

+ # Update batch size for next cycle (auto modes only)
+ if self._batch_size_strategy is not None:
+ self._max_workflows_per_batch = self._calculate_effective_batch_size(len(resumable))
+
  return processed_count

  async def _cleanup_old_messages_periodically(
@@ -969,6 +1278,69 @@ class EddaApp:
  except Exception as e:
  logger.error("Error cleaning up old messages: %s", e, exc_info=True)

+ # -------------------------------------------------------------------------
+ # Query API Methods
+ # -------------------------------------------------------------------------
+
+ async def find_instances(
+ self,
+ *,
+ input_filters: dict[str, Any] | None = None,
+ status: str | None = None,
+ workflow_name: str | None = None,
+ instance_id: str | None = None,
+ started_after: datetime | None = None,
+ started_before: datetime | None = None,
+ limit: int = 50,
+ page_token: str | None = None,
+ ) -> dict[str, Any]:
+ """
+ Find workflow instances with filtering support.
+
+ This is a high-level API for querying workflow instances by various
+ criteria, including input parameter values.
+
+ Args:
+ input_filters: Filter by input data values. Keys are JSON paths,
+ values are expected values (exact match).
+ Example: {"order_id": "ORD-123"}
+ status: Filter by workflow status (e.g., "running", "completed")
+ workflow_name: Filter by workflow name (partial match, case-insensitive)
+ instance_id: Filter by instance ID (partial match, case-insensitive)
+ started_after: Filter instances started after this datetime (inclusive)
+ started_before: Filter instances started before this datetime (inclusive)
+ limit: Maximum number of instances to return per page (default: 50)
+ page_token: Cursor for pagination (from previous response)
+
+ Returns:
+ Dictionary containing:
+ - instances: List of matching workflow instances
+ - next_page_token: Cursor for the next page, or None if no more pages
+ - has_more: Boolean indicating if there are more pages
+
+ Example:
+ >>> # Find all instances with order_id = "ORD-123"
+ >>> result = await app.find_instances(input_filters={"order_id": "ORD-123"})
+ >>> for instance in result["instances"]:
+ ... print(f"{instance['instance_id']}: {instance['status']}")
+
+ >>> # Find running instances with specific customer
+ >>> result = await app.find_instances(
+ ... input_filters={"customer_id": "CUST-456"},
+ ... status="running"
+ ... )
+ """
+ return await self.storage.list_instances(
+ limit=limit,
+ page_token=page_token,
+ status_filter=status,
+ workflow_name_filter=workflow_name,
+ instance_id_filter=instance_id,
+ started_after=started_after,
+ started_before=started_before,
+ input_filters=input_filters,
+ )
+
  # -------------------------------------------------------------------------
  # ASGI Interface
  # -------------------------------------------------------------------------
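The docstring examples fetch a single page. Draining all matching instances with the returned cursor would look roughly like the sketch below, run inside an async function; it uses only the documented instances / has_more / next_page_token keys, and the filter values are made up.

    # Collect every completed instance of a given workflow, page by page.
    instances = []
    page_token = None
    while True:
        result = await app.find_instances(
            status="completed",
            workflow_name="order",   # partial, case-insensitive match per the docstring
            limit=50,
            page_token=page_token,
        )
        instances.extend(result["instances"])
        if not result["has_more"]:
            break
        page_token = result["next_page_token"]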
edda/context.py CHANGED
@@ -217,6 +217,14 @@ class WorkflowContext:
  # Cache the timer result for wait_timer replay
  # Timer returns None, so we cache the result field
  self._history_cache[activity_id] = event_data.get("result")
+ elif event_type == "MessageTimeout":
+ # Cache the timeout error for receive() replay
+ # This allows TimeoutError to be raised and caught in workflow code
+ self._history_cache[activity_id] = {
+ "_error": True,
+ "error_type": event_data.get("error_type", "TimeoutError"),
+ "error_message": event_data.get("error_message", "Message timeout"),
+ }

  self._history_loaded = True
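With MessageTimeout events now cached in history, a replayed workflow sees the timeout as an exception raised from its receive call instead of the instance being marked failed outright (matching the change in app.py above). A rough workflow-side sketch; the exact receive() signature and the compensating activity are assumptions, not part of this diff.

    # Inside a workflow function, with `ctx` being the WorkflowContext.
    try:
        payment = await ctx.receive(channel="payment_confirmed", timeout=3600)
    except TimeoutError:
        # On replay, the cached MessageTimeout event re-raises TimeoutError here,
        # so the workflow can branch instead of failing the whole instance.
        payment = None
        await notify_customer_of_delay()  # hypothetical compensating activity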