aethergraph 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. aethergraph/__init__.py +49 -0
  2. aethergraph/config/__init__.py +0 -0
  3. aethergraph/config/config.py +121 -0
  4. aethergraph/config/context.py +16 -0
  5. aethergraph/config/llm.py +26 -0
  6. aethergraph/config/loader.py +60 -0
  7. aethergraph/config/runtime.py +9 -0
  8. aethergraph/contracts/errors/errors.py +44 -0
  9. aethergraph/contracts/services/artifacts.py +142 -0
  10. aethergraph/contracts/services/channel.py +72 -0
  11. aethergraph/contracts/services/continuations.py +23 -0
  12. aethergraph/contracts/services/eventbus.py +12 -0
  13. aethergraph/contracts/services/kv.py +24 -0
  14. aethergraph/contracts/services/llm.py +17 -0
  15. aethergraph/contracts/services/mcp.py +22 -0
  16. aethergraph/contracts/services/memory.py +108 -0
  17. aethergraph/contracts/services/resume.py +28 -0
  18. aethergraph/contracts/services/state_stores.py +33 -0
  19. aethergraph/contracts/services/wakeup.py +28 -0
  20. aethergraph/core/execution/base_scheduler.py +77 -0
  21. aethergraph/core/execution/forward_scheduler.py +777 -0
  22. aethergraph/core/execution/global_scheduler.py +634 -0
  23. aethergraph/core/execution/retry_policy.py +22 -0
  24. aethergraph/core/execution/step_forward.py +411 -0
  25. aethergraph/core/execution/step_result.py +18 -0
  26. aethergraph/core/execution/wait_types.py +72 -0
  27. aethergraph/core/graph/graph_builder.py +192 -0
  28. aethergraph/core/graph/graph_fn.py +219 -0
  29. aethergraph/core/graph/graph_io.py +67 -0
  30. aethergraph/core/graph/graph_refs.py +154 -0
  31. aethergraph/core/graph/graph_spec.py +115 -0
  32. aethergraph/core/graph/graph_state.py +59 -0
  33. aethergraph/core/graph/graphify.py +128 -0
  34. aethergraph/core/graph/interpreter.py +145 -0
  35. aethergraph/core/graph/node_handle.py +33 -0
  36. aethergraph/core/graph/node_spec.py +46 -0
  37. aethergraph/core/graph/node_state.py +63 -0
  38. aethergraph/core/graph/task_graph.py +747 -0
  39. aethergraph/core/graph/task_node.py +82 -0
  40. aethergraph/core/graph/utils.py +37 -0
  41. aethergraph/core/graph/visualize.py +239 -0
  42. aethergraph/core/runtime/ad_hoc_context.py +61 -0
  43. aethergraph/core/runtime/base_service.py +153 -0
  44. aethergraph/core/runtime/bind_adapter.py +42 -0
  45. aethergraph/core/runtime/bound_memory.py +69 -0
  46. aethergraph/core/runtime/execution_context.py +220 -0
  47. aethergraph/core/runtime/graph_runner.py +349 -0
  48. aethergraph/core/runtime/lifecycle.py +26 -0
  49. aethergraph/core/runtime/node_context.py +203 -0
  50. aethergraph/core/runtime/node_services.py +30 -0
  51. aethergraph/core/runtime/recovery.py +159 -0
  52. aethergraph/core/runtime/run_registration.py +33 -0
  53. aethergraph/core/runtime/runtime_env.py +157 -0
  54. aethergraph/core/runtime/runtime_registry.py +32 -0
  55. aethergraph/core/runtime/runtime_services.py +224 -0
  56. aethergraph/core/runtime/wakeup_watcher.py +40 -0
  57. aethergraph/core/tools/__init__.py +10 -0
  58. aethergraph/core/tools/builtins/channel_tools.py +194 -0
  59. aethergraph/core/tools/builtins/toolset.py +134 -0
  60. aethergraph/core/tools/toolkit.py +510 -0
  61. aethergraph/core/tools/waitable.py +109 -0
  62. aethergraph/plugins/channel/__init__.py +0 -0
  63. aethergraph/plugins/channel/adapters/__init__.py +0 -0
  64. aethergraph/plugins/channel/adapters/console.py +106 -0
  65. aethergraph/plugins/channel/adapters/file.py +102 -0
  66. aethergraph/plugins/channel/adapters/slack.py +285 -0
  67. aethergraph/plugins/channel/adapters/telegram.py +302 -0
  68. aethergraph/plugins/channel/adapters/webhook.py +104 -0
  69. aethergraph/plugins/channel/adapters/webui.py +134 -0
  70. aethergraph/plugins/channel/routes/__init__.py +0 -0
  71. aethergraph/plugins/channel/routes/console_routes.py +86 -0
  72. aethergraph/plugins/channel/routes/slack_routes.py +49 -0
  73. aethergraph/plugins/channel/routes/telegram_routes.py +26 -0
  74. aethergraph/plugins/channel/routes/webui_routes.py +136 -0
  75. aethergraph/plugins/channel/utils/__init__.py +0 -0
  76. aethergraph/plugins/channel/utils/slack_utils.py +278 -0
  77. aethergraph/plugins/channel/utils/telegram_utils.py +324 -0
  78. aethergraph/plugins/channel/websockets/slack_ws.py +68 -0
  79. aethergraph/plugins/channel/websockets/telegram_polling.py +151 -0
  80. aethergraph/plugins/mcp/fs_server.py +128 -0
  81. aethergraph/plugins/mcp/http_server.py +101 -0
  82. aethergraph/plugins/mcp/ws_server.py +180 -0
  83. aethergraph/plugins/net/http.py +10 -0
  84. aethergraph/plugins/utils/data_io.py +359 -0
  85. aethergraph/runner/__init__.py +5 -0
  86. aethergraph/runtime/__init__.py +62 -0
  87. aethergraph/server/__init__.py +3 -0
  88. aethergraph/server/app_factory.py +84 -0
  89. aethergraph/server/start.py +122 -0
  90. aethergraph/services/__init__.py +10 -0
  91. aethergraph/services/artifacts/facade.py +284 -0
  92. aethergraph/services/artifacts/factory.py +35 -0
  93. aethergraph/services/artifacts/fs_store.py +656 -0
  94. aethergraph/services/artifacts/jsonl_index.py +123 -0
  95. aethergraph/services/artifacts/paths.py +23 -0
  96. aethergraph/services/artifacts/sqlite_index.py +209 -0
  97. aethergraph/services/artifacts/utils.py +124 -0
  98. aethergraph/services/auth/dev.py +16 -0
  99. aethergraph/services/channel/channel_bus.py +293 -0
  100. aethergraph/services/channel/factory.py +44 -0
  101. aethergraph/services/channel/session.py +511 -0
  102. aethergraph/services/channel/wait_helpers.py +57 -0
  103. aethergraph/services/clock/clock.py +9 -0
  104. aethergraph/services/container/default_container.py +320 -0
  105. aethergraph/services/continuations/continuation.py +56 -0
  106. aethergraph/services/continuations/factory.py +34 -0
  107. aethergraph/services/continuations/stores/fs_store.py +264 -0
  108. aethergraph/services/continuations/stores/inmem_store.py +95 -0
  109. aethergraph/services/eventbus/inmem.py +21 -0
  110. aethergraph/services/features/static.py +10 -0
  111. aethergraph/services/kv/ephemeral.py +90 -0
  112. aethergraph/services/kv/factory.py +27 -0
  113. aethergraph/services/kv/layered.py +41 -0
  114. aethergraph/services/kv/sqlite_kv.py +128 -0
  115. aethergraph/services/llm/factory.py +157 -0
  116. aethergraph/services/llm/generic_client.py +542 -0
  117. aethergraph/services/llm/providers.py +3 -0
  118. aethergraph/services/llm/service.py +105 -0
  119. aethergraph/services/logger/base.py +36 -0
  120. aethergraph/services/logger/compat.py +50 -0
  121. aethergraph/services/logger/formatters.py +106 -0
  122. aethergraph/services/logger/std.py +203 -0
  123. aethergraph/services/mcp/helpers.py +23 -0
  124. aethergraph/services/mcp/http_client.py +70 -0
  125. aethergraph/services/mcp/mcp_tools.py +21 -0
  126. aethergraph/services/mcp/registry.py +14 -0
  127. aethergraph/services/mcp/service.py +100 -0
  128. aethergraph/services/mcp/stdio_client.py +70 -0
  129. aethergraph/services/mcp/ws_client.py +115 -0
  130. aethergraph/services/memory/bound.py +106 -0
  131. aethergraph/services/memory/distillers/episode.py +116 -0
  132. aethergraph/services/memory/distillers/rolling.py +74 -0
  133. aethergraph/services/memory/facade.py +633 -0
  134. aethergraph/services/memory/factory.py +78 -0
  135. aethergraph/services/memory/hotlog_kv.py +27 -0
  136. aethergraph/services/memory/indices.py +74 -0
  137. aethergraph/services/memory/io_helpers.py +72 -0
  138. aethergraph/services/memory/persist_fs.py +40 -0
  139. aethergraph/services/memory/resolver.py +152 -0
  140. aethergraph/services/metering/noop.py +4 -0
  141. aethergraph/services/prompts/file_store.py +41 -0
  142. aethergraph/services/rag/chunker.py +29 -0
  143. aethergraph/services/rag/facade.py +593 -0
  144. aethergraph/services/rag/index/base.py +27 -0
  145. aethergraph/services/rag/index/faiss_index.py +121 -0
  146. aethergraph/services/rag/index/sqlite_index.py +134 -0
  147. aethergraph/services/rag/index_factory.py +52 -0
  148. aethergraph/services/rag/parsers/md.py +7 -0
  149. aethergraph/services/rag/parsers/pdf.py +14 -0
  150. aethergraph/services/rag/parsers/txt.py +7 -0
  151. aethergraph/services/rag/utils/hybrid.py +39 -0
  152. aethergraph/services/rag/utils/make_fs_key.py +62 -0
  153. aethergraph/services/redactor/simple.py +16 -0
  154. aethergraph/services/registry/key_parsing.py +44 -0
  155. aethergraph/services/registry/registry_key.py +19 -0
  156. aethergraph/services/registry/unified_registry.py +185 -0
  157. aethergraph/services/resume/multi_scheduler_resume_bus.py +65 -0
  158. aethergraph/services/resume/router.py +73 -0
  159. aethergraph/services/schedulers/registry.py +41 -0
  160. aethergraph/services/secrets/base.py +7 -0
  161. aethergraph/services/secrets/env.py +8 -0
  162. aethergraph/services/state_stores/externalize.py +135 -0
  163. aethergraph/services/state_stores/graph_observer.py +131 -0
  164. aethergraph/services/state_stores/json_store.py +67 -0
  165. aethergraph/services/state_stores/resume_policy.py +119 -0
  166. aethergraph/services/state_stores/serialize.py +249 -0
  167. aethergraph/services/state_stores/utils.py +91 -0
  168. aethergraph/services/state_stores/validate.py +78 -0
  169. aethergraph/services/tracing/noop.py +18 -0
  170. aethergraph/services/waits/wait_registry.py +91 -0
  171. aethergraph/services/wakeup/memory_queue.py +57 -0
  172. aethergraph/services/wakeup/scanner_producer.py +56 -0
  173. aethergraph/services/wakeup/worker.py +31 -0
  174. aethergraph/tools/__init__.py +25 -0
  175. aethergraph/utils/optdeps.py +8 -0
  176. aethergraph-0.1.0a1.dist-info/METADATA +410 -0
  177. aethergraph-0.1.0a1.dist-info/RECORD +182 -0
  178. aethergraph-0.1.0a1.dist-info/WHEEL +5 -0
  179. aethergraph-0.1.0a1.dist-info/entry_points.txt +2 -0
  180. aethergraph-0.1.0a1.dist-info/licenses/LICENSE +176 -0
  181. aethergraph-0.1.0a1.dist-info/licenses/NOTICE +31 -0
  182. aethergraph-0.1.0a1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,777 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ from collections.abc import Awaitable, Callable
5
+ from datetime import datetime
6
+ import inspect
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ from aethergraph.contracts.services.resume import ResumeEvent
10
+ from aethergraph.contracts.services.wakeup import WakeupEvent
11
+
12
+ from ..graph.graph_refs import GRAPH_INPUTS_NODE_ID
13
+ from ..graph.node_spec import NodeEvent
14
+ from ..graph.node_state import TERMINAL_STATES, WAITING_STATES, NodeStatus
15
+ from ..graph.task_node import TaskNodeRuntime
16
+ from .base_scheduler import BaseScheduler
17
+ from .retry_policy import RetryPolicy
18
+ from .step_forward import step_forward
19
+
20
+ if TYPE_CHECKING:
21
+ from ..graph.task_graph import TaskGraph
22
+ from ..runtime.runtime_env import RuntimeEnv
23
+
24
+
25
+ def _is_plan(node) -> bool:
26
+ return getattr(node, "node_type", getattr(node, "type", None)) == "plan"
27
+
28
+
29
+ class ForwardScheduler(BaseScheduler):
30
+ """
31
+ Event-driven DAG scheduler for Aethergraph.
32
+
33
+ Overview
34
+ --------
35
+ The ForwardScheduler executes a TaskGraph in "forward" mode: it starts runnable
36
+ nodes as soon as their dependencies are DONE, up to a configurable concurrency
37
+ limit. It is fully event-driven (no busy polling) and reacts to:
38
+ • Task completions
39
+ • External resumes (human/robot/time/event) delivered via a control queue
40
+ • Backoff timers for retries
41
+
42
+ Responsibilities
43
+ ----------------
44
+ • Determine runnable nodes (deps satisfied, not terminal, not running)
45
+ • Start nodes (async) and invoke `step_forward(...)` for the work
46
+ • Transition node state to DONE, SKIPPED, FAILED, or WAITING_*
47
+ • Persist and publish Continuations when a node requests a wait
48
+ • Handle Resume/Wakeup events and re-start waiting nodes with a payload
49
+ • Enforce max concurrency and apply retry/backoff policy
50
+ • Optionally: stop early on the first terminal failure and/or mark dependents SKIPPED
51
+
52
+ Key Concepts
53
+ ------------
54
+ • Terminal states: {DONE, FAILED, SKIPPED} (plus any custom terminal states)
55
+ • Waiting states: {WAITING_HUMAN, WAITING_ROBOT, WAITING_TIME, WAITING_EVENT}
56
+ • Control events:
57
+ - ResumeEvent(node_id, payload): resume a WAITING_* node with payload
58
+ - WakeupEvent(node_id): resume due to timer/poll (payload supplied upstream)
59
+ • Concurrency: bounded by `max_concurrency`; resumed nodes are prioritized
60
+ • Retries: delegated to RetryPolicy (attempts, backoff); backoff sleepers are tracked
61
+
62
+ Data Structures
63
+ ---------------
64
+ • running_tasks: {node_id -> asyncio.Task}
65
+ • _events: asyncio.Queue[ResumeEvent | WakeupEvent] (control plane)
66
+ • _resume_payloads: {node_id -> dict} (payload stash until start)
67
+ • _resume_pending: set[node_id] (resumed but awaiting capacity)
68
+ • _backoff_tasks: {node_id -> asyncio.Task} (sleepers before retry)
69
+
70
+ Run Loop (high level)
71
+ ---------------------
72
+ 1) Drain any control events currently in `_events` (non-blocking) and handle them.
73
+ 2) Schedule work up to capacity:
74
+ a) Start resumed waiters in `_resume_pending` first.
75
+ b) Start newly "ready" nodes (deps DONE).
76
+ 3) If nothing is running or scheduled:
77
+ a) If graph is effectively terminal (no running, no waiters, no pending/backoffs), exit.
78
+ b) If any nodes are WAITING_*, block on `_events.get()` until a resume/wakeup arrives.
79
+ c) Otherwise, the graph is stalled (likely unmet deps or failures); raise.
80
+ 4) If there is running work, wait for FIRST_COMPLETED of:
81
+ - any running task, or
82
+ - a new control event from `_events`.
83
+ Then loop back to (1).
84
+
85
+ State Transitions (per node)
86
+ ----------------------------
87
+ • RUNNING → DONE:
88
+ - Persist outputs
89
+ - Emit NodeEvent(DONE)
90
+ • RUNNING → WAITING_*:
91
+ - Continuation already saved and notified by `step_forward`
92
+ - Emit NodeEvent(WAITING_*)
93
+ • RUNNING → FAILED:
94
+ - Set FAILED; emit NodeEvent(FAILED)
95
+ - If retry eligible: schedule backoff sleeper and requeue later
96
+ - If retries exhausted:
97
+ * If `skip_dependents_on_failure=True`: mark dependents SKIPPED (transitively)
98
+ * If `stop_on_first_error=True`: set `_terminated=True` to end the run
99
+ • (External) WAITING_* + Resume/Wakeup:
100
+ - Store payload, cancel backoff for that node (if any)
101
+ - Start immediately if capacity allows; else add to `_resume_pending`
102
+
103
+ Scheduling Order
104
+ ----------------
105
+ 1) Resumed waiters (capacity permitting)
106
+ 2) Newly ready nodes (dependencies satisfied)
107
+ This keeps the system responsive to external signals.
108
+
109
+ Termination Conditions
110
+ ----------------------
111
+ • Natural completion: all non-plan nodes are in terminal states and
112
+ there are no running tasks, backoffs, or pending resumes.
113
+ • Early stop: first terminal failure with `stop_on_first_error=True`.
114
+ • Stalled graph: no running tasks, no waiters, not terminal → raises RuntimeError.
115
+
116
+ Configuration
117
+ -------------
118
+ • max_concurrency: int = 4
119
+ • retry_policy: RetryPolicy
120
+ • stop_on_first_error: bool = False
121
+ • skip_dependents_on_failure: bool = True
122
+
123
+ Performance & Safety Notes
124
+ --------------------------
125
+ • The loop is idle when there is no work: it blocks on either task completion
126
+ or `_events.get()`. There is no busy waiting.
127
+ • `_events` is drained non-blockingly at the start of each iteration to reduce
128
+ resume latency and coalesce multiple resumes.
129
+ • All resume paths are capacity-aware; if full, node IDs sit in `_resume_pending`.
130
+ • Backoff timers are lightweight asyncio sleep tasks; they wake only when due.
131
+
132
+ Extension Points
133
+ ----------------
134
+ • add_listener(cb): subscribe to NodeEvent emissions for metrics/telemetry.
135
+ • _compute_ready(): override to implement custom gating/priority.
136
+ • _skip_dependents(failed_id): override if you need custom skip rules.
137
+
138
+ Typical Usage
139
+ -------------
140
+ env = RuntimeEnv(...); sched = ForwardScheduler(graph, env, max_concurrency=2)
141
+ result = await sched.run() # returns when the graph is effectively terminal
142
+ # External systems call: await sched.on_resume_event(node_id, payload)
143
+
144
+ Invariants
145
+ ----------
146
+ • A node is started at most once concurrently.
147
+ • Resumes are idempotent: last payload wins before the node (re)starts.
148
+ • Continuations are persisted before WAITING_* is reported.
149
+ """
150
+
    def __init__(
        self,
        graph: TaskGraph,
        env: RuntimeEnv,
        retry_policy: RetryPolicy | None = None,
        *,
        max_concurrency: int = 4,
        stop_on_first_error: bool = False,
        skip_dep_on_failure: bool = True,
        logger: Any | None = None,
    ):
        """Create a forward-mode scheduler over *graph*.

        Nodes are started as soon as their dependencies are DONE, up to
        ``max_concurrency`` concurrent tasks. Waiting nodes (WAITING_HUMAN,
        WAITING_EXTERNAL, ...) can be resumed later via the control-event
        queue (see ``on_resume_event``).

        Args:
            graph: TaskGraph to execute.
            env: RuntimeEnv providing runtime services and execution contexts.
            retry_policy: RetryPolicy for failed nodes; defaults to ``RetryPolicy()``.
            max_concurrency: Maximum number of concurrently running node tasks.
            stop_on_first_error: If True, end the whole run on the first
                terminal node failure.
            skip_dep_on_failure: If True, mark downstream dependents of a
                failed node SKIPPED while independent branches keep running.
            logger: Optional logger; when absent, diagnostics fall back to ``print``.
        """

        super().__init__(graph, mode="forward")
        self.env = env
        self.retry_policy = retry_policy or RetryPolicy()
        self.max_concurrency = max_concurrency
        self.stop_on_first_error = stop_on_first_error
        self.skip_dep_on_failure = skip_dep_on_failure

        # bookkeeping
        self._resume_payloads: dict[str, dict] = {}  # node_id -> stashed resume payload
        self._backoff_tasks: dict[str, asyncio.Task] = {}  # node_id -> retry backoff sleeper
        self._resume_pending: set[str] = set()  # resumed node_ids awaiting a free slot
        self._ready_pending: set[str] = set()  # explicitly-requested node_ids not yet started

        # control-plane queue of ResumeEvent/WakeupEvent; drained by run()
        self._events: asyncio.Queue = asyncio.Queue()
        self.loop: asyncio.AbstractEventLoop | None = (
            None  # used by MultiSchedulerResumeBus with cross-thread calls
        )
        self._nudge = asyncio.Event()
        self._resume_tokens: set[str] = set()  # for logging/debugging

        # listeners and callbacks (async NodeEvent subscribers; see add_listener)
        self._listeners: list[
            Callable[[NodeEvent], Awaitable[None]]
        ] = []

        # logger
        self.logger = logger
203
+ def bind_loop(self, loop: asyncio.AbstractEventLoop | None = None):
204
+ """Bind an event loop to this scheduler (for cross-thread resume calls)."""
205
+ self.loop = loop or asyncio.get_running_loop()
206
+
207
+ # --------- event listeners ---------
208
+ def add_listener(self, listener: Callable[[NodeEvent], Awaitable[None]]):
209
+ """Add an event listener that will be called on node events."""
210
+ if not inspect.iscoroutinefunction(listener):
211
+ raise ValueError("Listener must be an async function")
212
+ self._listeners.append(listener)
213
+
214
+ def _capacity(self) -> int:
215
+ """Return available capacity for new tasks."""
216
+ return self.max_concurrency - len(self.get_running_task_node_ids())
217
+
218
+ async def _try_start_immediately(self, node_id: str) -> bool:
219
+ """Try to start node now if waiting + capacity available; return True if started."""
220
+ if self._capacity() <= 0 or node_id in self.running_tasks:
221
+ return False
222
+ node = self._runtime(node_id)
223
+ if not node:
224
+ return False
225
+ if node.state.status not in WAITING_STATES:
226
+ return False
227
+ await self._start_node(node)
228
+ return True
229
+
230
+ async def _emit(self, event: NodeEvent):
231
+ """Emit an event to all listeners. Should not kill the scheduler if a listener fails."""
232
+ for cb in self._listeners:
233
+ try:
234
+ await cb(event)
235
+ except Exception as e:
236
+ if self.logger:
237
+ self.logger.warning(f"[ForwardScheduler._emit] Error in event listener: {e}")
238
+ else:
239
+ print(f"[ForwardScheduler._emit] Error in event listener: {e}")
240
+
241
+ # --------- public API ---------
242
+ async def deliver_resume(self, token: str):
243
+ """
244
+ Wake the engine: a continuation with `token` has been resolved.
245
+ Typically this means:
246
+ - mark the relevant node WAITING->READY
247
+ - schedule a tick
248
+ """
249
+ # Implementation choices:
250
+ # - if you keep a WaitRegistry in env, this might just trigger a run loop tick
251
+ # - if you keep an asyncio.Event per run, set() it
252
+ self._resume_tokens.add(token) # optional: track for logging
253
+ self._nudge.set() # asyncio.Event the main loop awaits
254
+
    async def run(self):
        """Main run loop. Schedules ready nodes, handles events, and manages concurrency.

        The loop works as follows:
        - Drain any pending control events (e.g. resume and wakeup).
        - Schedule ready nodes up to max_concurrency.
        - If no tasks are running and none were scheduled:
            - If all nodes are terminal (no backoffs/pending resumes/waiters), exit.
            - If any node is WAITING_*, block for a resume event.
            - Otherwise, raise RuntimeError (stalled graph, likely unmet deps).
        - If tasks are running, wait for either a task to complete or a control event.
        - Repeat until terminated.
        """
        self.loop = asyncio.get_running_loop()

        dirty = True  # something changed; try scheduling on this iteration
        MAX_DRAIN = 100  # max control events to drain in one go (to avoid starvation)
        while not self._terminated:
            # honor external pause requests before doing any work
            await self._pause_event.wait()

            # clear nudge so a deliver_resume() during this tick re-arms it
            self._nudge.clear()

            if dirty:
                # 1) drain already-queued control events (non-blocking)
                for _ in range(MAX_DRAIN):  # MAX_DRAIN guard keeps one tick bounded
                    try:
                        ev = self._events.get_nowait()
                    except asyncio.QueueEmpty:
                        break
                    await self._handle_events(ev)
                # 2) try to schedule work (resumed waiters first, then ready nodes)
                scheduled = await self._schedule_ready()
                dirty = False
            else:
                scheduled = 0

            running = list(self.running_tasks.values())

            # 3) no work currently running or scheduled
            if not running and scheduled == 0:
                nothing_pending = (not self._backoff_tasks) and (not self._resume_pending)
                if nothing_pending and not self._any_waiting():
                    # graph is effectively terminal (DONE/FAILED/SKIPPED only)
                    self._terminated = True
                    break

                if self._any_waiting():
                    # 4) BLOCK until a resume/wakeup arrives (no CPU spin)
                    ev = await self._events.get()
                    await self._handle_events(ev)
                    dirty = True
                    continue

                # stalled: neither running nor waiting nor terminal (likely unmet deps)
                raise RuntimeError("stalled")

            # 5) We have running tasks; wait for either a task to finish OR a control event
            ctrl = asyncio.create_task(self._events.get())
            try:
                done, _ = await asyncio.wait(running + [ctrl], return_when=asyncio.FIRST_COMPLETED)
                if ctrl in done:
                    ev = ctrl.result()
                    await self._handle_events(ev)
                # either a task completed or an event arrived → state changed
                dirty = True
            finally:
                # drop the helper task so it doesn't consume the next event while idle
                if not ctrl.done():
                    ctrl.cancel()
    async def run_from(self, node_ids: list[str]):
        """Run starting from specific nodes (e.g. after an external event).

        Nodes currently in any WAITING_* state are left untouched — they will
        be (re)started once their resume payload arrives; all others are
        started immediately.
        """
        for nid in node_ids:
            node = self.graph.node(nid)
            if node.state.status in (
                NodeStatus.WAITING_HUMAN,
                NodeStatus.WAITING_ROBOT,
                NodeStatus.WAITING_EXTERNAL,
                NodeStatus.WAITING_TIME,
                NodeStatus.WAITING_EVENT,
            ):
                # will be executed when resume_payload arrives
                continue
            await self._start_node(node)
340
+ async def terminate(self):
341
+ """Terminate execution; running tasks will complete but no new tasks will be started."""
342
+ self._terminated = True
343
+ # cancel backoff tasks
344
+ for task in self._backoff_tasks.values():
345
+ task.cancel()
346
+ # cancel running tasks
347
+ for task in self.running_tasks.values():
348
+ task.cancel()
349
+
    async def run_node(self, node):
        """Explicitly start a specific runtime node (e.g. for testing).

        Delegates to ``_start_node``; does not wait for the node to finish.
        """
        await self._start_node(node)
354
+ # ENFORCE capacity in run_one()
355
+ async def run_one_old(self, node: TaskNodeRuntime) -> dict[str, Any]:
356
+ # deps must be DONE (except inputs node)
357
+ for dep in node.dependencies or []:
358
+ if dep == GRAPH_INPUTS_NODE_ID:
359
+ continue
360
+ dep_node = self.graph.node(dep)
361
+ if dep_node is None or dep_node.state.status != NodeStatus.DONE:
362
+ raise RuntimeError(f"Cannot run node {node.node_id}: dependency {dep} not DONE")
363
+
364
+ # If we're already at capacity, wait until any running task completes
365
+ while self._capacity() <= 0:
366
+ # Wait for FIRST_COMPLETED among running tasks
367
+ running = list(self.running_tasks.values())
368
+ if not running:
369
+ break
370
+ done, _ = await asyncio.wait(running, return_when=asyncio.FIRST_COMPLETED)
371
+
372
+ # Start this node now that a slot is available
373
+ await self._start_node(node)
374
+
375
+ # Wait for this specific node to finish
376
+ task = self.running_tasks.get(node.node_id)
377
+ if task:
378
+ await task
379
+ return node.outputs or {}
380
+
    async def _wait_until_terminal(self, target_id: str):
        """Drive the scheduler event loop just enough to bring *target_id* to a
        terminal state; returns that terminal status.

        Used by ``run_one`` when the node parks in a WAITING_* state after its
        first execution round.
        """
        while True:
            node = self.graph.node(target_id)
            if node.state.status in TERMINAL_STATES:
                return node.state.status

            # Prioritize resume events already sitting in the queue
            try:
                ev = self._events.get_nowait()
                await self._handle_events(ev)
            except asyncio.QueueEmpty:
                pass

            # Try to (re)start anything that became runnable
            await self._schedule_ready()

            # If nothing running, block on the next control event (resume/wakeup), then loop
            running = list(self.running_tasks.values())
            if not running:
                ev = await self._events.get()
                await self._handle_events(ev)
            else:
                # Either a running task finishes or a control event arrives
                ctrl = asyncio.create_task(self._events.get())
                try:
                    done, _ = await asyncio.wait(
                        running + [ctrl], return_when=asyncio.FIRST_COMPLETED
                    )
                    if ctrl in done:
                        await self._handle_events(ctrl.result())
                finally:
                    # don't leave an idle getter that would swallow the next event
                    if not ctrl.done():
                        ctrl.cancel()
    async def run_one(self, node: TaskNodeRuntime) -> dict[str, Any]:
        """Run a single node to a terminal state and return its outputs.

        Waits for a free concurrency slot, starts the node, awaits its first
        execution round, and — if the node parked in a WAITING_* state —
        drives the control loop via ``_wait_until_terminal`` until the node is
        terminal. Always returns ``node.outputs or {}``; FAILED/SKIPPED do not
        raise here.
        """
        self.loop = asyncio.get_running_loop()  # ensure loop is set
        # deps DONE check (kept as-is) ...
        while self._capacity() <= 0:
            running = list(self.running_tasks.values())
            if not running:
                break
            await asyncio.wait(running, return_when=asyncio.FIRST_COMPLETED)

        await self._start_node(node)

        # Wait for the first execution round to finish
        task = self.running_tasks.get(node.node_id)
        if task:
            await task

        # If the node is WAITING_*, drive the loop until it becomes terminal
        n = self.graph.node(node.node_id)
        if n.state.status in WAITING_STATES:
            await self._wait_until_terminal(node.node_id)

        # Terminal: return outputs (or {} if failed/skipped)
        n = self.graph.node(node.node_id)
        if n.state.status == NodeStatus.DONE:
            return n.outputs or {}
        if n.state.status == NodeStatus.FAILED:
            # optionally raise an error here
            return n.outputs or {}
        # SKIPPED or others:
        return n.outputs or {}
448
+ async def step_next(self):
449
+ """Run exactly one step (for step-by-step execution)."""
450
+ r = self._compute_ready()
451
+ if r:
452
+ nid = next(iter(r))
453
+ await self._start_node(self.graph.node(nid))
454
+
455
+ # called by ResumeRouter when external/human resumes a waiting node
456
+ async def on_resume_event(self, run_id: str, node_id: str, payload: dict[str, Any]):
457
+ """Called by external event trigger to resume a waiting node.
458
+ We use async queue to schedule the resume event.
459
+ """
460
+ # NOTE: run_id is not needed for local scheduler, but we need it to match the signature with GlobalScheduler
461
+ await self._events.put(ResumeEvent(run_id, node_id, payload))
462
+
    # --------- internal methods ---------
    async def _schedule_ready(self) -> int:
        """Start as many runnable nodes as capacity allows; return the count started.

        Priority order: (1) resumed waiters, (2) explicitly requested nodes
        from ``_ready_pending`` (re-checking their deps), (3) normal ready
        nodes from ``_compute_ready``.
        """
        available = self._capacity()
        if available <= 0:
            return 0
        scheduled = 0

        # 1) resumed waiters first
        while available > 0 and self._resume_pending:
            nid = self._resume_pending.pop()
            node = self.graph.node(nid)
            if node and node.state.status in WAITING_STATES and nid not in self.running_tasks:
                await self._start_node(node)
                scheduled += 1
                available -= 1

        # 2) explicit-start ready nodes (from run_one) next
        while available > 0 and self._ready_pending:
            nid = self._ready_pending.pop()
            node = self.graph.node(nid)
            if (
                node
                and node.node_id not in self.running_tasks
                and node.state.status not in TERMINAL_STATES
            ):
                # still ensure deps satisfied
                if all(
                    (dep == GRAPH_INPUTS_NODE_ID)
                    or (self.graph.node(dep).state.status == NodeStatus.DONE)
                    for dep in (node.spec.dependencies or [])
                ):
                    await self._start_node(node)
                    scheduled += 1
                    available -= 1
                else:
                    pass  # deps not satisfied; node was popped and is dropped this round

        # 3) normal ready nodes (_compute_ready already excludes running/terminal/waiting)
        if available > 0:
            for nid in list(self._compute_ready())[:available]:
                await self._start_node(self.graph.node(nid))
                scheduled += 1

        return scheduled
    async def _skip_dependents(self, failed_node_id: str):
        """Mark downstream dependents of *failed_node_id* as SKIPPED if not
        already terminal/running."""
        # breadth-first over reverse edges (graph stores forward deps only,
        # so dependents are found by scanning all nodes)
        q = [failed_node_id]
        seen = set()
        while q:
            cur = q.pop(0)
            for n in self.graph.nodes:
                if cur in (n.spec.dependencies or []):
                    if n.node_id in seen:
                        continue
                    seen.add(n.node_id)
                    node = self.graph.node(n.node_id)
                    if (
                        node.state.status not in TERMINAL_STATES
                        and n.node_id not in self.running_tasks
                    ):
                        await self.graph.set_node_status(n.node_id, NodeStatus.SKIPPED)
                        # NOTE(review): the cascade continues only through nodes
                        # actually marked SKIPPED here; terminal/running dependents
                        # stop propagation — confirm this is the intended rule.
                        q.append(n.node_id)
    def _compute_ready(self) -> set[str]:
        """Return node_ids whose dependencies are DONE and that are not
        running/waiting/terminal.

        The function works as follows:
        - Iterate over all runtime nodes in the graph.
        - Skip plan nodes and nodes that are already done, failed, skipped, or waiting.
        - Skip nodes that are already running.
        - Check that every dependency (other than the graph-inputs pseudo-node)
          is DONE; a missing dependency is logged and disqualifies the node.
        """

        ready: set[str] = set()
        for node in self.graph.nodes:  # runtime nodes
            node_id = node.node_id
            node_status = node.state.status
            node_type = node.spec.type

            if node_type == "plan":
                continue  # skip plan nodes; TODO: we may deprecate plan node later
            if node_status in (
                NodeStatus.DONE,
                NodeStatus.FAILED,
                NodeStatus.SKIPPED,
                NodeStatus.WAITING_HUMAN,
                NodeStatus.WAITING_ROBOT,
                NodeStatus.WAITING_EXTERNAL,
                NodeStatus.WAITING_TIME,
                NodeStatus.WAITING_EVENT,
            ):
                # already done/waiting/failed
                continue

            if node_id in self.running_tasks:
                # already running
                continue

            # dependencies satisfied?
            deps_ok = True
            for dep in node.spec.dependencies or []:
                if dep == GRAPH_INPUTS_NODE_ID:
                    continue  # inputs pseudo-node is always satisfied
                dep_node = self._runtime(dep)
                if dep_node is None:
                    if self.logger:
                        self.logger.warning(
                            f"Node {node_id} has missing dependency {dep}; skipping"
                        )
                    else:
                        print(
                            f"[ForwardScheduler] Node {node_id} has missing dependency {dep}; skipping"
                        )
                    deps_ok = False
                    break
                # only DONE satisfies a dependency (SKIPPED/FAILED do not)
                if dep_node.state.status not in [NodeStatus.DONE]:
                    deps_ok = False
                    break
            if deps_ok:
                ready.add(node_id)

        return ready
589
+ def _runtime(self, node_id: str) -> TaskNodeRuntime:
590
+ # get runtime node by id
591
+ node = self.graph.node(node_id)
592
+ return node
593
+
594
    async def _start_node(self, node: TaskNodeRuntime):
        """Launch *node* as a background asyncio task.

        Flow:
        - Pop any buffered resume payload for the node. A node parked in a
          WAITING_* state without a payload is not started; it is recorded in
          ``_resume_pending`` and picked up once a payload arrives.
        - Otherwise spawn ``_runner``: set RUNNING, build an execution
          context, call ``step_forward``, then persist status/outputs and
          emit a ``NodeEvent`` according to the resulting status
          (DONE / WAITING_* / FAILED / SKIPPED).
        - FAILED nodes with remaining retry budget get a backoff sleeper
          task; exhausted ones may skip dependents and/or terminate the run
          depending on scheduler flags.
        - The spawned task is tracked in ``running_tasks`` and removes
          itself on completion via the done callback.
        """
        node_id = node.node_id

        # attach resume payload if any (WAITING_* -> RUNNING)
        resume_payload = self._resume_payloads.pop(node_id, None)

        if node.state.status in WAITING_STATES and resume_payload is None:
            # keep it pending; it will be scheduled once a payload arrives
            self._resume_pending.add(node_id)
            return

        async def _runner():
            # One full execution attempt of the node, run as its own task.
            try:
                await self.graph.set_node_status(node_id, NodeStatus.RUNNING)
                ctx = self.env.make_ctx(
                    node=node, resume_payload=resume_payload
                )  # ExecutionContext
                result = await step_forward(node=node, ctx=ctx, retry_policy=self.retry_policy)

                if result.status == NodeStatus.DONE:
                    # normalize between output/outputs
                    outs = result.outputs or {}

                    await self.graph.set_node_outputs(node_id, outs)
                    await self.graph.set_node_status(node_id, NodeStatus.DONE)

                    # publish outputs to env for downstream consumption
                    self.env.outputs_by_node[node.node_id] = outs

                    # emit event
                    # NOTE(review): event carries node.outputs, not the local
                    # `outs` just written via set_node_outputs -- confirm the
                    # graph reflects outputs synchronously, else the event may
                    # be stale.
                    # NOTE(review): datetime.utcnow() is deprecated in 3.12;
                    # consider datetime.now(timezone.utc) -- confirm consumers
                    # expect a naive-UTC-derived timestamp.
                    event = NodeEvent(
                        run_id=self.env.run_id,
                        graph_id=getattr(self.graph.spec, "graph_id", "inline"),
                        node_id=node.node_id,
                        status=str(NodeStatus.DONE),
                        outputs=node.outputs or {},
                        timestamp=datetime.utcnow().timestamp(),
                    )
                    await self._emit(event)

                elif result.status.startswith("WAITING_"):
                    # no outputs yet; continuation already persisted by ctx.storage via step_forward
                    # scheduler idles until on_resume() or wakeup queue triggers
                    await self.graph.set_node_status(node_id, result.status)

                    # emit event
                    event = NodeEvent(
                        run_id=self.env.run_id,
                        graph_id=getattr(self.graph.spec, "graph_id", "inline"),
                        node_id=node.node_id,
                        status=result.status,
                        outputs=node.outputs or {},
                        timestamp=datetime.utcnow().timestamp(),
                    )
                    await self._emit(event)

                elif result.status == NodeStatus.FAILED:
                    # step_forward already incremented attempts (if policy applies)
                    # If retry allowed, schedule backoff sleeper:
                    await self.graph.set_node_status(node_id, NodeStatus.FAILED)

                    # emit event
                    event = NodeEvent(
                        run_id=self.env.run_id,
                        graph_id=getattr(self.graph.spec, "graph_id", "inline"),
                        node_id=node.node_id,
                        status=str(NodeStatus.FAILED),
                        outputs=node.outputs or {},
                        timestamp=datetime.utcnow().timestamp(),
                    )
                    await self._emit(event)

                    attempts = getattr(node, "attempts", 0)
                    if attempts > 0 and attempts < self.retry_policy.max_attempts:
                        # retry budget left: sleep the policy backoff, then requeue
                        delay = self.retry_policy.backoff(
                            attempts - 1
                        ).total_seconds()  # attempts was incremented in step_forward
                        self._backoff_tasks[node.node_id] = asyncio.create_task(
                            self._sleep_and_requeue(node, delay)
                        )
                    else:
                        # retries exhausted: optionally stop or skip dependents
                        if self.skip_dep_on_failure:
                            await self._skip_dependents(node_id)
                        if self.stop_on_first_error:
                            # flip the master switch to stop the main loop

                            self._terminated = True

                elif result.status == NodeStatus.SKIPPED:
                    await self.graph.set_node_status(node_id, NodeStatus.SKIPPED)

                    # emit event
                    event = NodeEvent(
                        run_id=self.env.run_id,
                        graph_id=getattr(self.graph.spec, "graph_id", "inline"),
                        node_id=node.node_id,
                        status=str(NodeStatus.SKIPPED),
                        outputs=node.outputs or {},
                        timestamp=datetime.utcnow().timestamp(),
                    )
                    await self._emit(event)

                # record memory after step
                # record_after_step(self.env, node, result)
                # TODO: optionally map selected outputs into domain memory here

            except NotImplementedError:
                # subgraph logic not handled here; escalate to orchestrator
                await node.set_status(NodeStatus.FAILED)
            except asyncio.CancelledError:
                # task cancelled (e.g. on terminate);
                await node.set_status(NodeStatus.FAILED)
            finally:
                # remove from running tasks in caller
                pass

        task = asyncio.create_task(_runner())
        self.running_tasks[node_id] = task
        # cleanup when done
        task.add_done_callback(lambda t, nid=node_id: self.running_tasks.pop(nid, None))
715
+
716
+ async def _sleep_and_requeue(self, node: TaskNodeRuntime, delay: float):
717
+ try:
718
+ await asyncio.sleep(delay)
719
+ if not self._terminated:
720
+ await self._start_node(node)
721
+ except asyncio.CancelledError:
722
+ pass
723
+ finally:
724
+ self._backoff_tasks.pop(node.node_id, None)
725
+
726
+ async def _handle_events(self, ev):
727
+ """Handle control events (e.g., resume, wakeup).
728
+ The function works as follows:
729
+ - If the event is a ResumeEvent:
730
+ - Store the resume payload.
731
+ - Cancel any backoff task for the node.
732
+ - If the node is already running or max concurrency reached, mark it as pending and return.
733
+ - Otherwise, start the node.
734
+ - If the event is a WakeupEvent:
735
+ - If the node is not running and max concurrency not reached, start the node.
736
+ NOTE: This function assumes that the event queue is drained before scheduling new nodes.
737
+ """
738
+ # resume event for WAITING_* nodes
739
+ if isinstance(ev, ResumeEvent):
740
+ # store payload (idempotent; last write wins)
741
+ self._resume_payloads[ev.node_id] = ev.payload
742
+
743
+ # cancel any pending backoff for this node
744
+ task = self._backoff_tasks.pop(ev.node_id, None)
745
+ if task:
746
+ task.cancel()
747
+
748
+ # try start now, else mark pending
749
+ started = await self._try_start_immediately(ev.node_id)
750
+ if not started:
751
+ self._resume_pending.add(ev.node_id)
752
+ return
753
+
754
+ elif isinstance(ev, WakeupEvent):
755
+ started = await self._try_start_immediately(ev.node_id)
756
+ # If capacity is full, nothing else to do. When a slot frees, _schedule_ready will pick it.
757
+ return
758
+
759
+ def _all_nodes_terminal(self) -> bool:
760
+ # treat plan nodes as ignorable for completion
761
+ for node in self.graph.nodes:
762
+ if _is_plan(node):
763
+ continue
764
+ if node.state.status not in TERMINAL_STATES:
765
+ return False
766
+ return True
767
+
768
+ def _any_waiting(self) -> bool:
769
+ return any(
770
+ (not _is_plan(n)) and (n.state.status in WAITING_STATES) for n in self.graph.nodes
771
+ )
772
+
773
+ def post_resume_event_threadsafe(self, run_id: str, node_id: str, payload: dict):
774
+ if not self.loop or not self.loop.is_running():
775
+ # no-op or log; bus will warn
776
+ return
777
+ asyncio.run_coroutine_threadsafe(self.on_resume_event(run_id, node_id, payload), self.loop)