aethergraph 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aethergraph/__init__.py +49 -0
- aethergraph/config/__init__.py +0 -0
- aethergraph/config/config.py +121 -0
- aethergraph/config/context.py +16 -0
- aethergraph/config/llm.py +26 -0
- aethergraph/config/loader.py +60 -0
- aethergraph/config/runtime.py +9 -0
- aethergraph/contracts/errors/errors.py +44 -0
- aethergraph/contracts/services/artifacts.py +142 -0
- aethergraph/contracts/services/channel.py +72 -0
- aethergraph/contracts/services/continuations.py +23 -0
- aethergraph/contracts/services/eventbus.py +12 -0
- aethergraph/contracts/services/kv.py +24 -0
- aethergraph/contracts/services/llm.py +17 -0
- aethergraph/contracts/services/mcp.py +22 -0
- aethergraph/contracts/services/memory.py +108 -0
- aethergraph/contracts/services/resume.py +28 -0
- aethergraph/contracts/services/state_stores.py +33 -0
- aethergraph/contracts/services/wakeup.py +28 -0
- aethergraph/core/execution/base_scheduler.py +77 -0
- aethergraph/core/execution/forward_scheduler.py +777 -0
- aethergraph/core/execution/global_scheduler.py +634 -0
- aethergraph/core/execution/retry_policy.py +22 -0
- aethergraph/core/execution/step_forward.py +411 -0
- aethergraph/core/execution/step_result.py +18 -0
- aethergraph/core/execution/wait_types.py +72 -0
- aethergraph/core/graph/graph_builder.py +192 -0
- aethergraph/core/graph/graph_fn.py +219 -0
- aethergraph/core/graph/graph_io.py +67 -0
- aethergraph/core/graph/graph_refs.py +154 -0
- aethergraph/core/graph/graph_spec.py +115 -0
- aethergraph/core/graph/graph_state.py +59 -0
- aethergraph/core/graph/graphify.py +128 -0
- aethergraph/core/graph/interpreter.py +145 -0
- aethergraph/core/graph/node_handle.py +33 -0
- aethergraph/core/graph/node_spec.py +46 -0
- aethergraph/core/graph/node_state.py +63 -0
- aethergraph/core/graph/task_graph.py +747 -0
- aethergraph/core/graph/task_node.py +82 -0
- aethergraph/core/graph/utils.py +37 -0
- aethergraph/core/graph/visualize.py +239 -0
- aethergraph/core/runtime/ad_hoc_context.py +61 -0
- aethergraph/core/runtime/base_service.py +153 -0
- aethergraph/core/runtime/bind_adapter.py +42 -0
- aethergraph/core/runtime/bound_memory.py +69 -0
- aethergraph/core/runtime/execution_context.py +220 -0
- aethergraph/core/runtime/graph_runner.py +349 -0
- aethergraph/core/runtime/lifecycle.py +26 -0
- aethergraph/core/runtime/node_context.py +203 -0
- aethergraph/core/runtime/node_services.py +30 -0
- aethergraph/core/runtime/recovery.py +159 -0
- aethergraph/core/runtime/run_registration.py +33 -0
- aethergraph/core/runtime/runtime_env.py +157 -0
- aethergraph/core/runtime/runtime_registry.py +32 -0
- aethergraph/core/runtime/runtime_services.py +224 -0
- aethergraph/core/runtime/wakeup_watcher.py +40 -0
- aethergraph/core/tools/__init__.py +10 -0
- aethergraph/core/tools/builtins/channel_tools.py +194 -0
- aethergraph/core/tools/builtins/toolset.py +134 -0
- aethergraph/core/tools/toolkit.py +510 -0
- aethergraph/core/tools/waitable.py +109 -0
- aethergraph/plugins/channel/__init__.py +0 -0
- aethergraph/plugins/channel/adapters/__init__.py +0 -0
- aethergraph/plugins/channel/adapters/console.py +106 -0
- aethergraph/plugins/channel/adapters/file.py +102 -0
- aethergraph/plugins/channel/adapters/slack.py +285 -0
- aethergraph/plugins/channel/adapters/telegram.py +302 -0
- aethergraph/plugins/channel/adapters/webhook.py +104 -0
- aethergraph/plugins/channel/adapters/webui.py +134 -0
- aethergraph/plugins/channel/routes/__init__.py +0 -0
- aethergraph/plugins/channel/routes/console_routes.py +86 -0
- aethergraph/plugins/channel/routes/slack_routes.py +49 -0
- aethergraph/plugins/channel/routes/telegram_routes.py +26 -0
- aethergraph/plugins/channel/routes/webui_routes.py +136 -0
- aethergraph/plugins/channel/utils/__init__.py +0 -0
- aethergraph/plugins/channel/utils/slack_utils.py +278 -0
- aethergraph/plugins/channel/utils/telegram_utils.py +324 -0
- aethergraph/plugins/channel/websockets/slack_ws.py +68 -0
- aethergraph/plugins/channel/websockets/telegram_polling.py +151 -0
- aethergraph/plugins/mcp/fs_server.py +128 -0
- aethergraph/plugins/mcp/http_server.py +101 -0
- aethergraph/plugins/mcp/ws_server.py +180 -0
- aethergraph/plugins/net/http.py +10 -0
- aethergraph/plugins/utils/data_io.py +359 -0
- aethergraph/runner/__init__.py +5 -0
- aethergraph/runtime/__init__.py +62 -0
- aethergraph/server/__init__.py +3 -0
- aethergraph/server/app_factory.py +84 -0
- aethergraph/server/start.py +122 -0
- aethergraph/services/__init__.py +10 -0
- aethergraph/services/artifacts/facade.py +284 -0
- aethergraph/services/artifacts/factory.py +35 -0
- aethergraph/services/artifacts/fs_store.py +656 -0
- aethergraph/services/artifacts/jsonl_index.py +123 -0
- aethergraph/services/artifacts/paths.py +23 -0
- aethergraph/services/artifacts/sqlite_index.py +209 -0
- aethergraph/services/artifacts/utils.py +124 -0
- aethergraph/services/auth/dev.py +16 -0
- aethergraph/services/channel/channel_bus.py +293 -0
- aethergraph/services/channel/factory.py +44 -0
- aethergraph/services/channel/session.py +511 -0
- aethergraph/services/channel/wait_helpers.py +57 -0
- aethergraph/services/clock/clock.py +9 -0
- aethergraph/services/container/default_container.py +320 -0
- aethergraph/services/continuations/continuation.py +56 -0
- aethergraph/services/continuations/factory.py +34 -0
- aethergraph/services/continuations/stores/fs_store.py +264 -0
- aethergraph/services/continuations/stores/inmem_store.py +95 -0
- aethergraph/services/eventbus/inmem.py +21 -0
- aethergraph/services/features/static.py +10 -0
- aethergraph/services/kv/ephemeral.py +90 -0
- aethergraph/services/kv/factory.py +27 -0
- aethergraph/services/kv/layered.py +41 -0
- aethergraph/services/kv/sqlite_kv.py +128 -0
- aethergraph/services/llm/factory.py +157 -0
- aethergraph/services/llm/generic_client.py +542 -0
- aethergraph/services/llm/providers.py +3 -0
- aethergraph/services/llm/service.py +105 -0
- aethergraph/services/logger/base.py +36 -0
- aethergraph/services/logger/compat.py +50 -0
- aethergraph/services/logger/formatters.py +106 -0
- aethergraph/services/logger/std.py +203 -0
- aethergraph/services/mcp/helpers.py +23 -0
- aethergraph/services/mcp/http_client.py +70 -0
- aethergraph/services/mcp/mcp_tools.py +21 -0
- aethergraph/services/mcp/registry.py +14 -0
- aethergraph/services/mcp/service.py +100 -0
- aethergraph/services/mcp/stdio_client.py +70 -0
- aethergraph/services/mcp/ws_client.py +115 -0
- aethergraph/services/memory/bound.py +106 -0
- aethergraph/services/memory/distillers/episode.py +116 -0
- aethergraph/services/memory/distillers/rolling.py +74 -0
- aethergraph/services/memory/facade.py +633 -0
- aethergraph/services/memory/factory.py +78 -0
- aethergraph/services/memory/hotlog_kv.py +27 -0
- aethergraph/services/memory/indices.py +74 -0
- aethergraph/services/memory/io_helpers.py +72 -0
- aethergraph/services/memory/persist_fs.py +40 -0
- aethergraph/services/memory/resolver.py +152 -0
- aethergraph/services/metering/noop.py +4 -0
- aethergraph/services/prompts/file_store.py +41 -0
- aethergraph/services/rag/chunker.py +29 -0
- aethergraph/services/rag/facade.py +593 -0
- aethergraph/services/rag/index/base.py +27 -0
- aethergraph/services/rag/index/faiss_index.py +121 -0
- aethergraph/services/rag/index/sqlite_index.py +134 -0
- aethergraph/services/rag/index_factory.py +52 -0
- aethergraph/services/rag/parsers/md.py +7 -0
- aethergraph/services/rag/parsers/pdf.py +14 -0
- aethergraph/services/rag/parsers/txt.py +7 -0
- aethergraph/services/rag/utils/hybrid.py +39 -0
- aethergraph/services/rag/utils/make_fs_key.py +62 -0
- aethergraph/services/redactor/simple.py +16 -0
- aethergraph/services/registry/key_parsing.py +44 -0
- aethergraph/services/registry/registry_key.py +19 -0
- aethergraph/services/registry/unified_registry.py +185 -0
- aethergraph/services/resume/multi_scheduler_resume_bus.py +65 -0
- aethergraph/services/resume/router.py +73 -0
- aethergraph/services/schedulers/registry.py +41 -0
- aethergraph/services/secrets/base.py +7 -0
- aethergraph/services/secrets/env.py +8 -0
- aethergraph/services/state_stores/externalize.py +135 -0
- aethergraph/services/state_stores/graph_observer.py +131 -0
- aethergraph/services/state_stores/json_store.py +67 -0
- aethergraph/services/state_stores/resume_policy.py +119 -0
- aethergraph/services/state_stores/serialize.py +249 -0
- aethergraph/services/state_stores/utils.py +91 -0
- aethergraph/services/state_stores/validate.py +78 -0
- aethergraph/services/tracing/noop.py +18 -0
- aethergraph/services/waits/wait_registry.py +91 -0
- aethergraph/services/wakeup/memory_queue.py +57 -0
- aethergraph/services/wakeup/scanner_producer.py +56 -0
- aethergraph/services/wakeup/worker.py +31 -0
- aethergraph/tools/__init__.py +25 -0
- aethergraph/utils/optdeps.py +8 -0
- aethergraph-0.1.0a1.dist-info/METADATA +410 -0
- aethergraph-0.1.0a1.dist-info/RECORD +182 -0
- aethergraph-0.1.0a1.dist-info/WHEEL +5 -0
- aethergraph-0.1.0a1.dist-info/entry_points.txt +2 -0
- aethergraph-0.1.0a1.dist-info/licenses/LICENSE +176 -0
- aethergraph-0.1.0a1.dist-info/licenses/NOTICE +31 -0
- aethergraph-0.1.0a1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,777 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from collections.abc import Awaitable, Callable
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
import inspect
|
|
7
|
+
from typing import TYPE_CHECKING, Any
|
|
8
|
+
|
|
9
|
+
from aethergraph.contracts.services.resume import ResumeEvent
|
|
10
|
+
from aethergraph.contracts.services.wakeup import WakeupEvent
|
|
11
|
+
|
|
12
|
+
from ..graph.graph_refs import GRAPH_INPUTS_NODE_ID
|
|
13
|
+
from ..graph.node_spec import NodeEvent
|
|
14
|
+
from ..graph.node_state import TERMINAL_STATES, WAITING_STATES, NodeStatus
|
|
15
|
+
from ..graph.task_node import TaskNodeRuntime
|
|
16
|
+
from .base_scheduler import BaseScheduler
|
|
17
|
+
from .retry_policy import RetryPolicy
|
|
18
|
+
from .step_forward import step_forward
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from ..graph.task_graph import TaskGraph
|
|
22
|
+
from ..runtime.runtime_env import RuntimeEnv
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _is_plan(node) -> bool:
|
|
26
|
+
return getattr(node, "node_type", getattr(node, "type", None)) == "plan"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ForwardScheduler(BaseScheduler):
|
|
30
|
+
"""
|
|
31
|
+
Event-driven DAG scheduler for Aethergraph.
|
|
32
|
+
|
|
33
|
+
Overview
|
|
34
|
+
--------
|
|
35
|
+
The ForwardScheduler executes a TaskGraph in "forward" mode: it starts runnable
|
|
36
|
+
nodes as soon as their dependencies are DONE, up to a configurable concurrency
|
|
37
|
+
limit. It is fully event-driven (no busy polling) and reacts to:
|
|
38
|
+
• Task completions
|
|
39
|
+
• External resumes (human/robot/time/event) delivered via a control queue
|
|
40
|
+
• Backoff timers for retries
|
|
41
|
+
|
|
42
|
+
Responsibilities
|
|
43
|
+
----------------
|
|
44
|
+
• Determine runnable nodes (deps satisfied, not terminal, not running)
|
|
45
|
+
• Start nodes (async) and invoke `step_forward(...)` for the work
|
|
46
|
+
• Transition node state to DONE, SKIPPED, FAILED, or WAITING_*
|
|
47
|
+
• Persist and publish Continuations when a node requests a wait
|
|
48
|
+
• Handle Resume/Wakeup events and re-start waiting nodes with a payload
|
|
49
|
+
• Enforce max concurrency and apply retry/backoff policy
|
|
50
|
+
• Optionally: stop early on the first terminal failure and/or mark dependents SKIPPED
|
|
51
|
+
|
|
52
|
+
Key Concepts
|
|
53
|
+
------------
|
|
54
|
+
• Terminal states: {DONE, FAILED, SKIPPED} (plus any custom terminal states)
|
|
55
|
+
• Waiting states: {WAITING_HUMAN, WAITING_ROBOT, WAITING_TIME, WAITING_EVENT}
|
|
56
|
+
• Control events:
|
|
57
|
+
- ResumeEvent(node_id, payload): resume a WAITING_* node with payload
|
|
58
|
+
- WakeupEvent(node_id): resume due to timer/poll (payload supplied upstream)
|
|
59
|
+
• Concurrency: bounded by `max_concurrency`; resumed nodes are prioritized
|
|
60
|
+
• Retries: delegated to RetryPolicy (attempts, backoff); backoff sleepers are tracked
|
|
61
|
+
|
|
62
|
+
Data Structures
|
|
63
|
+
---------------
|
|
64
|
+
• running_tasks: {node_id -> asyncio.Task}
|
|
65
|
+
• _events: asyncio.Queue[ResumeEvent | WakeupEvent] (control plane)
|
|
66
|
+
• _resume_payloads: {node_id -> dict} (payload stash until start)
|
|
67
|
+
• _resume_pending: set[node_id] (resumed but awaiting capacity)
|
|
68
|
+
• _backoff_tasks: {node_id -> asyncio.Task} (sleepers before retry)
|
|
69
|
+
|
|
70
|
+
Run Loop (high level)
|
|
71
|
+
---------------------
|
|
72
|
+
1) Drain any control events currently in `_events` (non-blocking) and handle them.
|
|
73
|
+
2) Schedule work up to capacity:
|
|
74
|
+
a) Start resumed waiters in `_resume_pending` first.
|
|
75
|
+
b) Start newly "ready" nodes (deps DONE).
|
|
76
|
+
3) If nothing is running or scheduled:
|
|
77
|
+
a) If graph is effectively terminal (no running, no waiters, no pending/backoffs), exit.
|
|
78
|
+
b) If any nodes are WAITING_*, block on `_events.get()` until a resume/wakeup arrives.
|
|
79
|
+
c) Otherwise, the graph is stalled (likely unmet deps or failures); raise.
|
|
80
|
+
4) If there is running work, wait for FIRST_COMPLETED of:
|
|
81
|
+
- any running task, or
|
|
82
|
+
- a new control event from `_events`.
|
|
83
|
+
Then loop back to (1).
|
|
84
|
+
|
|
85
|
+
State Transitions (per node)
|
|
86
|
+
----------------------------
|
|
87
|
+
• RUNNING → DONE:
|
|
88
|
+
- Persist outputs
|
|
89
|
+
- Emit NodeEvent(DONE)
|
|
90
|
+
• RUNNING → WAITING_*:
|
|
91
|
+
- Continuation already saved and notified by `step_forward`
|
|
92
|
+
- Emit NodeEvent(WAITING_*)
|
|
93
|
+
• RUNNING → FAILED:
|
|
94
|
+
- Set FAILED; emit NodeEvent(FAILED)
|
|
95
|
+
- If retry eligible: schedule backoff sleeper and requeue later
|
|
96
|
+
- If retries exhausted:
|
|
97
|
+
* If `skip_dependents_on_failure=True`: mark dependents SKIPPED (transitively)
|
|
98
|
+
* If `stop_on_first_error=True`: set `_terminated=True` to end the run
|
|
99
|
+
• (External) WAITING_* + Resume/Wakeup:
|
|
100
|
+
- Store payload, cancel backoff for that node (if any)
|
|
101
|
+
- Start immediately if capacity allows; else add to `_resume_pending`
|
|
102
|
+
|
|
103
|
+
Scheduling Order
|
|
104
|
+
----------------
|
|
105
|
+
1) Resumed waiters (capacity permitting)
|
|
106
|
+
2) Newly ready nodes (dependencies satisfied)
|
|
107
|
+
This keeps the system responsive to external signals.
|
|
108
|
+
|
|
109
|
+
Termination Conditions
|
|
110
|
+
----------------------
|
|
111
|
+
• Natural completion: all non-plan nodes are in terminal states and
|
|
112
|
+
there are no running tasks, backoffs, or pending resumes.
|
|
113
|
+
• Early stop: first terminal failure with `stop_on_first_error=True`.
|
|
114
|
+
• Stalled graph: no running tasks, no waiters, not terminal → raises RuntimeError.
|
|
115
|
+
|
|
116
|
+
Configuration
|
|
117
|
+
-------------
|
|
118
|
+
• max_concurrency: int = 4
|
|
119
|
+
• retry_policy: RetryPolicy
|
|
120
|
+
• stop_on_first_error: bool = False
|
|
121
|
+
• skip_dependents_on_failure: bool = True
|
|
122
|
+
|
|
123
|
+
Performance & Safety Notes
|
|
124
|
+
--------------------------
|
|
125
|
+
• The loop is idle when there is no work: it blocks on either task completion
|
|
126
|
+
or `_events.get()`. There is no busy waiting.
|
|
127
|
+
• `_events` is drained non-blockingly at the start of each iteration to reduce
|
|
128
|
+
resume latency and coalesce multiple resumes.
|
|
129
|
+
• All resume paths are capacity-aware; if full, node IDs sit in `_resume_pending`.
|
|
130
|
+
• Backoff timers are lightweight asyncio sleep tasks; they wake only when due.
|
|
131
|
+
|
|
132
|
+
Extension Points
|
|
133
|
+
----------------
|
|
134
|
+
• add_listener(cb): subscribe to NodeEvent emissions for metrics/telemetry.
|
|
135
|
+
• _compute_ready(): override to implement custom gating/priority.
|
|
136
|
+
• _skip_dependents(failed_id): override if you need custom skip rules.
|
|
137
|
+
|
|
138
|
+
Typical Usage
|
|
139
|
+
-------------
|
|
140
|
+
env = RuntimeEnv(...); sched = ForwardScheduler(graph, env, max_concurrency=2)
|
|
141
|
+
result = await sched.run() # returns when the graph is effectively terminal
|
|
142
|
+
# External systems call: await sched.on_resume_event(node_id, payload)
|
|
143
|
+
|
|
144
|
+
Invariants
|
|
145
|
+
----------
|
|
146
|
+
• A node is started at most once concurrently.
|
|
147
|
+
• Resumes are idempotent: last payload wins before the node (re)starts.
|
|
148
|
+
• Continuations are persisted before WAITING_* is reported.
|
|
149
|
+
"""
|
|
150
|
+
|
|
151
|
+
def __init__(
    self,
    graph: TaskGraph,
    env: RuntimeEnv,
    retry_policy: RetryPolicy | None = None,
    *,
    max_concurrency: int = 4,
    stop_on_first_error: bool = False,
    skip_dep_on_failure: bool = True,
    logger: Any | None = None,
):
    """ForwardScheduler executes nodes in a forward manner, scheduling ready nodes as soon as their dependencies are met.

    It supports waiting nodes (WAITING_HUMAN, WAITING_EXTERNAL, etc.) and can resume them upon external events.

    Args:
        - graph: TaskGraph to execute.
        - env: RuntimeEnv providing runtime services and context.
        - retry_policy: RetryPolicy defining retry behavior for failed nodes.
          Defaults to a fresh RetryPolicy() when None.
        - max_concurrency: Maximum number of concurrent running tasks.
        - stop_on_first_error: If True, stops the entire graph execution on the first node failure.
        - skip_dep_on_failure: If True, skips downstream dependents of a failed node, but continues executing other independent nodes.
        - logger: Optional logger; when None, diagnostics fall back to print().
    """

    # Base class records the graph and execution mode ("forward").
    super().__init__(graph, mode="forward")
    self.env = env
    self.retry_policy = retry_policy or RetryPolicy()
    self.max_concurrency = max_concurrency
    self.stop_on_first_error = stop_on_first_error
    self.skip_dep_on_failure = skip_dep_on_failure

    # bookkeeping
    self._resume_payloads: dict[str, dict] = {}  # node_id -> resume payload
    self._backoff_tasks: dict[str, asyncio.Task] = {}  # node_id -> backoff task
    self._resume_pending: set[str] = set()  # node_ids with resume pending but not yet started
    self._ready_pending: set[str] = set()  # node_ids that became ready but not yet started

    # event to pause/resume execution
    self._events: asyncio.Queue = asyncio.Queue()
    self.loop: asyncio.AbstractEventLoop | None = (
        None  # used by MultiSchedulerResumeBus with cross-thread calls
    )
    # nudge is set by deliver_resume() to wake the main loop
    self._nudge = asyncio.Event()
    self._resume_tokens: set[str] = set()  # for logging/debugging

    # listeners and callbacks
    self._listeners: list[
        Callable[[NodeEvent], Awaitable[None]]
    ] = []  # Placeholder for event listeners

    # logger
    self.logger = logger
|
|
202
|
+
|
|
203
|
+
def bind_loop(self, loop: asyncio.AbstractEventLoop | None = None):
|
|
204
|
+
"""Bind an event loop to this scheduler (for cross-thread resume calls)."""
|
|
205
|
+
self.loop = loop or asyncio.get_running_loop()
|
|
206
|
+
|
|
207
|
+
# --------- event listeners ---------
|
|
208
|
+
def add_listener(self, listener: Callable[[NodeEvent], Awaitable[None]]):
    """Register an async callback invoked for every emitted NodeEvent.

    Raises:
        ValueError: if *listener* is not a coroutine function.
    """
    if inspect.iscoroutinefunction(listener):
        self._listeners.append(listener)
        return
    raise ValueError("Listener must be an async function")
|
|
213
|
+
|
|
214
|
+
def _capacity(self) -> int:
|
|
215
|
+
"""Return available capacity for new tasks."""
|
|
216
|
+
return self.max_concurrency - len(self.get_running_task_node_ids())
|
|
217
|
+
|
|
218
|
+
async def _try_start_immediately(self, node_id: str) -> bool:
|
|
219
|
+
"""Try to start node now if waiting + capacity available; return True if started."""
|
|
220
|
+
if self._capacity() <= 0 or node_id in self.running_tasks:
|
|
221
|
+
return False
|
|
222
|
+
node = self._runtime(node_id)
|
|
223
|
+
if not node:
|
|
224
|
+
return False
|
|
225
|
+
if node.state.status not in WAITING_STATES:
|
|
226
|
+
return False
|
|
227
|
+
await self._start_node(node)
|
|
228
|
+
return True
|
|
229
|
+
|
|
230
|
+
async def _emit(self, event: NodeEvent):
|
|
231
|
+
"""Emit an event to all listeners. Should not kill the scheduler if a listener fails."""
|
|
232
|
+
for cb in self._listeners:
|
|
233
|
+
try:
|
|
234
|
+
await cb(event)
|
|
235
|
+
except Exception as e:
|
|
236
|
+
if self.logger:
|
|
237
|
+
self.logger.warning(f"[ForwardScheduler._emit] Error in event listener: {e}")
|
|
238
|
+
else:
|
|
239
|
+
print(f"[ForwardScheduler._emit] Error in event listener: {e}")
|
|
240
|
+
|
|
241
|
+
# --------- public API ---------
|
|
242
|
+
async def deliver_resume(self, token: str):
    """
    Wake the engine: a continuation with `token` has been resolved.

    The token is recorded (for logging/debugging) and the main-loop nudge
    event is set so the run loop re-evaluates what can be scheduled.
    """
    # Track the token, then nudge the asyncio.Event the main loop awaits.
    self._resume_tokens.add(token)
    self._nudge.set()
|
|
254
|
+
|
|
255
|
+
async def run(self):
    """Main run loop. Schedules ready nodes, handles events, and manages concurrency.

    The loop works as follows:
    - Drain any pending control events (e.g. resume and wakeup).
    - Schedule ready nodes up to max_concurrency.
    - If no tasks are running and none were scheduled:
        - If all nodes are terminal, exit.
        - If any node is WAITING_*, block for a resume event.
        - Otherwise, raise error (stalled graph).
    - If tasks are running, wait for either a task to complete or a control event.
    - Repeat until terminated.

    Raises:
        RuntimeError: when the graph is stalled (nothing running, nothing
        waiting, yet not terminal — typically unmet dependencies).
    """
    self.loop = asyncio.get_running_loop()

    dirty = True  # something changed; try scheduling
    MAX_DRAIN = 100  # max control events to drain in one go (to avoid starvation)
    while not self._terminated:
        # Honor external pause requests before doing any work.
        await self._pause_event.wait()

        # clear nudge
        self._nudge.clear()

        if dirty:
            # 1) drain already-queued control events (non-blocking)
            for _ in range(MAX_DRAIN):  # optional MAX_DRAIN guard
                try:
                    ev = self._events.get_nowait()
                except asyncio.QueueEmpty:
                    break
                await self._handle_events(ev)
            # 2) try to schedule work
            scheduled = await self._schedule_ready()
            dirty = False
        else:
            scheduled = 0

        running = list(self.running_tasks.values())

        # 3) no work currently running or scheduled
        if not running and scheduled == 0:
            nothing_pending = (not self._backoff_tasks) and (not self._resume_pending)
            if nothing_pending and not self._any_waiting():
                # graph is effectively terminal (DONE/FAILED/SKIPPED only)
                self._terminated = True
                break

            if self._any_waiting():
                # 4) BLOCK until a resume/wakeup arrives (no CPU spin)
                ev = await self._events.get()
                await self._handle_events(ev)
                dirty = True
                continue

            # stalled: neither running nor waiting nor terminal (likely unmet deps)
            raise RuntimeError("stalled")

        # 5) We have running tasks; wait for either a task to finish OR a control event
        ctrl = asyncio.create_task(self._events.get())
        try:
            done, _ = await asyncio.wait(running + [ctrl], return_when=asyncio.FIRST_COMPLETED)
            if ctrl in done:
                ev = ctrl.result()
                await self._handle_events(ev)
            # either a task completed or an event arrived → state changed
            dirty = True
        finally:
            # Cancelling a still-pending Queue.get() task does not consume an
            # item, so no control event is lost by this cleanup.
            if not ctrl.done():
                ctrl.cancel()
|
|
324
|
+
|
|
325
|
+
async def run_from(self, node_ids: list[str]):
    """Run starting from specific nodes (e.g. after external event).

    Nodes currently parked in a WAITING_* state are left untouched — they
    will be (re)started when their resume payload arrives.
    """
    waiting = (
        NodeStatus.WAITING_HUMAN,
        NodeStatus.WAITING_ROBOT,
        NodeStatus.WAITING_EXTERNAL,
        NodeStatus.WAITING_TIME,
        NodeStatus.WAITING_EVENT,
    )
    for node_id in node_ids:
        candidate = self.graph.node(node_id)
        if candidate.state.status in waiting:
            # will be executed when resume_payload arrives
            continue
        await self._start_node(candidate)
|
|
339
|
+
|
|
340
|
+
async def terminate(self):
    """Terminate execution; running tasks will complete but no new tasks will be started.

    Flags the run as terminated, then cancels all backoff sleepers and all
    in-flight node tasks.
    """
    self._terminated = True
    # cancel backoff sleepers first, then in-flight node tasks
    for sleeper in self._backoff_tasks.values():
        sleeper.cancel()
    for in_flight in self.running_tasks.values():
        in_flight.cancel()
|
|
349
|
+
|
|
350
|
+
async def run_node(self, node):
    """Force a single node to start immediately (mainly useful in tests)."""
    await self._start_node(node)
|
|
353
|
+
|
|
354
|
+
# ENFORCE capacity in run_one()
|
|
355
|
+
async def run_one_old(self, node: TaskNodeRuntime) -> dict[str, Any]:
    """Legacy single-node runner; superseded by run_one().

    Validates that every dependency is DONE, waits for a concurrency slot,
    starts the node, awaits its task, and returns the node's outputs ({} when
    none). Unlike run_one(), it does not drive WAITING_* nodes to a terminal
    state afterwards.

    Raises:
        RuntimeError: when any dependency (other than the graph-inputs node)
        is not DONE.
    """
    # deps must be DONE (except inputs node)
    for dep in node.dependencies or []:
        if dep == GRAPH_INPUTS_NODE_ID:
            continue
        dep_node = self.graph.node(dep)
        if dep_node is None or dep_node.state.status != NodeStatus.DONE:
            raise RuntimeError(f"Cannot run node {node.node_id}: dependency {dep} not DONE")

    # If we're already at capacity, wait until any running task completes
    while self._capacity() <= 0:
        # Wait for FIRST_COMPLETED among running tasks
        running = list(self.running_tasks.values())
        if not running:
            break
        # NOTE(review): capacity is re-checked each iteration; this assumes
        # completed tasks are pruned from running_tasks by the node-runner
        # callback — confirm, otherwise this loop could spin.
        done, _ = await asyncio.wait(running, return_when=asyncio.FIRST_COMPLETED)

    # Start this node now that a slot is available
    await self._start_node(node)

    # Wait for this specific node to finish
    task = self.running_tasks.get(node.node_id)
    if task:
        await task
    return node.outputs or {}
|
|
380
|
+
|
|
381
|
+
async def _wait_until_terminal(self, target_id: str):
    """Drive the scheduler event loop just enough to bring target_id to a terminal state.

    Each iteration: check target status, drain one queued control event,
    re-run scheduling, then block on either task completion or the next
    control event. Returns the target's terminal NodeStatus.
    """
    while True:
        node = self.graph.node(target_id)
        if node.state.status in TERMINAL_STATES:
            return node.state.status

        # Prioritize resume events
        try:
            ev = self._events.get_nowait()
            await self._handle_events(ev)
        except asyncio.QueueEmpty:
            pass

        # Try to (re)start anything that became runnable
        await self._schedule_ready()

        # If nothing running, block on the next control event (resume/wakeup), then loop
        running = list(self.running_tasks.values())
        if not running:
            ev = await self._events.get()
            await self._handle_events(ev)
        else:
            # Either a running task finishes or a control event arrives
            ctrl = asyncio.create_task(self._events.get())
            try:
                done, _ = await asyncio.wait(
                    running + [ctrl], return_when=asyncio.FIRST_COMPLETED
                )
                if ctrl in done:
                    await self._handle_events(ctrl.result())
            finally:
                # A cancelled pending Queue.get() does not consume an item,
                # so no control event is lost here.
                if not ctrl.done():
                    ctrl.cancel()
|
|
415
|
+
|
|
416
|
+
async def run_one(self, node: TaskNodeRuntime) -> dict[str, Any]:
    """Run a single node by ID, return its outputs.

    Waits for a concurrency slot, starts the node, awaits its first
    execution round, and — when the node parks in a WAITING_* state —
    drives the event loop until the node reaches a terminal state.
    Returns the node's outputs, or {} when the node FAILED/SKIPPED
    without outputs (failures are NOT raised here).
    """
    self.loop = asyncio.get_running_loop()  # ensure loop is set
    # deps DONE check (kept as-is) ...
    while self._capacity() <= 0:
        running = list(self.running_tasks.values())
        if not running:
            break
        await asyncio.wait(running, return_when=asyncio.FIRST_COMPLETED)

    await self._start_node(node)

    # Wait for the first execution round to finish
    task = self.running_tasks.get(node.node_id)
    if task:
        await task

    # If the node is WAITING_*, drive the loop until it becomes terminal
    n = self.graph.node(node.node_id)
    if n.state.status in WAITING_STATES:
        await self._wait_until_terminal(node.node_id)

    # Terminal: return outputs (or {} if failed/skipped)
    n = self.graph.node(node.node_id)
    if n.state.status == NodeStatus.DONE:
        return n.outputs or {}
    if n.state.status == NodeStatus.FAILED:
        # optionally raise an error here
        return n.outputs or {}
    # SKIPPED or others:
    return n.outputs or {}
|
|
447
|
+
|
|
448
|
+
async def step_next(self):
    """Single-step execution: start at most one currently-ready node."""
    ready = self._compute_ready()
    if not ready:
        return
    node_id = next(iter(ready))
    await self._start_node(self.graph.node(node_id))
|
|
454
|
+
|
|
455
|
+
# called by ResumeRouter when external/human resumes a waiting node
|
|
456
|
+
async def on_resume_event(self, run_id: str, node_id: str, payload: dict[str, Any]):
    """Called by external event trigger to resume a waiting node.

    The resume is enqueued on the internal control queue and handled
    asynchronously by the run loop.

    NOTE: run_id is unused by this local scheduler; it exists only to keep
    the signature aligned with GlobalScheduler.
    """
    event = ResumeEvent(run_id, node_id, payload)
    await self._events.put(event)
|
|
462
|
+
|
|
463
|
+
# --------- internal methods ---------
|
|
464
|
+
async def _schedule_ready(self) -> int:
    """Start as many runnable nodes as capacity allows; return how many started.

    Scheduling priority:
      1) resumed waiters (_resume_pending)
      2) explicitly-queued ready nodes (_ready_pending, fed by run_one)
      3) ordinary ready nodes from _compute_ready()
    """
    available = self._capacity()
    if available <= 0:
        return 0
    scheduled = 0

    # 1) resumed waiters first
    while available > 0 and self._resume_pending:
        nid = self._resume_pending.pop()
        node = self.graph.node(nid)
        # Only start nodes still parked in a WAITING_* state; anything else
        # popped here is silently dropped.
        if node and node.state.status in WAITING_STATES and nid not in self.running_tasks:
            await self._start_node(node)
            scheduled += 1
            available -= 1

    # 2) explicit-start ready nodes (from run_one) next
    while available > 0 and self._ready_pending:
        nid = self._ready_pending.pop()
        node = self.graph.node(nid)
        if (
            node
            and node.node_id not in self.running_tasks
            and node.state.status not in TERMINAL_STATES
        ):
            # still ensure deps satisfied
            if all(
                (dep == GRAPH_INPUTS_NODE_ID)
                or (self.graph.node(dep).state.status == NodeStatus.DONE)
                for dep in (node.spec.dependencies or [])
            ):
                await self._start_node(node)
                scheduled += 1
                available -= 1
            else:
                # NOTE(review): a popped node with unmet deps is dropped from
                # _ready_pending rather than requeued — confirm intended.
                pass  # deps not satisfied; skip
    
    # 3) normal ready nodes
    if available > 0:
        # Slice bounds the number of starts, so `available` need not be
        # decremented inside this loop.
        for nid in list(self._compute_ready())[:available]:
            await self._start_node(self.graph.node(nid))
            scheduled += 1

    return scheduled
|
|
507
|
+
|
|
508
|
+
async def _skip_dependents(self, failed_node_id: str):
|
|
509
|
+
"""Mark all downstream dependents of failed_node_id as SKIPPED if not already terminal/running."""
|
|
510
|
+
# breadth-first over reverse edges
|
|
511
|
+
q = [failed_node_id]
|
|
512
|
+
seen = set()
|
|
513
|
+
while q:
|
|
514
|
+
cur = q.pop(0)
|
|
515
|
+
for n in self.graph.nodes:
|
|
516
|
+
if cur in (n.spec.dependencies or []):
|
|
517
|
+
if n.node_id in seen:
|
|
518
|
+
continue
|
|
519
|
+
seen.add(n.node_id)
|
|
520
|
+
node = self.graph.node(n.node_id)
|
|
521
|
+
if (
|
|
522
|
+
node.state.status not in TERMINAL_STATES
|
|
523
|
+
and n.node_id not in self.running_tasks
|
|
524
|
+
):
|
|
525
|
+
await self.graph.set_node_status(n.node_id, NodeStatus.SKIPPED)
|
|
526
|
+
q.append(n.node_id)
|
|
527
|
+
|
|
528
|
+
def _compute_ready(self) -> set[str]:
|
|
529
|
+
"""Nodes whose deps are completed/skipped and that are not running/waiting/failed.
|
|
530
|
+
Returns set of node_ids.
|
|
531
|
+
The function works as follows:
|
|
532
|
+
- Iterate over all nodes in the graph.
|
|
533
|
+
- Skip plan nodes and nodes that are already done, failed, skipped, or waiting.
|
|
534
|
+
- Skip nodes that are already running.
|
|
535
|
+
- Check if all dependencies of the node are satisfied (i.e., in DONE).
|
|
536
|
+
- If dependencies are satisfied, add the node_id to the ready set.
|
|
537
|
+
"""
|
|
538
|
+
|
|
539
|
+
ready: set[str] = set()
|
|
540
|
+
for node in self.graph.nodes: # runtime nodes
|
|
541
|
+
node_id = node.node_id
|
|
542
|
+
node_status = node.state.status
|
|
543
|
+
node_type = node.spec.type
|
|
544
|
+
|
|
545
|
+
if node_type == "plan":
|
|
546
|
+
continue # skip plan nodes; TODO: we may deprecate plan node later
|
|
547
|
+
if node_status in (
|
|
548
|
+
NodeStatus.DONE,
|
|
549
|
+
NodeStatus.FAILED,
|
|
550
|
+
NodeStatus.SKIPPED,
|
|
551
|
+
NodeStatus.WAITING_HUMAN,
|
|
552
|
+
NodeStatus.WAITING_ROBOT,
|
|
553
|
+
NodeStatus.WAITING_EXTERNAL,
|
|
554
|
+
NodeStatus.WAITING_TIME,
|
|
555
|
+
NodeStatus.WAITING_EVENT,
|
|
556
|
+
):
|
|
557
|
+
# already done/waiting/failed
|
|
558
|
+
continue
|
|
559
|
+
|
|
560
|
+
if node_id in self.running_tasks:
|
|
561
|
+
# already running
|
|
562
|
+
continue
|
|
563
|
+
|
|
564
|
+
# dependencies satisfied?
|
|
565
|
+
deps_ok = True
|
|
566
|
+
for dep in node.spec.dependencies or []:
|
|
567
|
+
if dep == GRAPH_INPUTS_NODE_ID:
|
|
568
|
+
continue # inputs node is always satisfied
|
|
569
|
+
dep_node = self._runtime(dep)
|
|
570
|
+
if dep_node is None:
|
|
571
|
+
if self.logger:
|
|
572
|
+
self.logger.warning(
|
|
573
|
+
f"Node {node_id} has missing dependency {dep}; skipping"
|
|
574
|
+
)
|
|
575
|
+
else:
|
|
576
|
+
print(
|
|
577
|
+
f"[ForwardScheduler] Node {node_id} has missing dependency {dep}; skipping"
|
|
578
|
+
)
|
|
579
|
+
deps_ok = False
|
|
580
|
+
break
|
|
581
|
+
if dep_node.state.status not in [NodeStatus.DONE]:
|
|
582
|
+
deps_ok = False
|
|
583
|
+
break
|
|
584
|
+
if deps_ok:
|
|
585
|
+
ready.add(node_id)
|
|
586
|
+
|
|
587
|
+
return ready
|
|
588
|
+
|
|
589
|
+
def _runtime(self, node_id: str) -> TaskNodeRuntime:
|
|
590
|
+
# get runtime node by id
|
|
591
|
+
node = self.graph.node(node_id)
|
|
592
|
+
return node
|
|
593
|
+
|
|
594
|
+
async def _start_node(self, node: TaskNodeRuntime):
|
|
595
|
+
node_id = node.node_id
|
|
596
|
+
|
|
597
|
+
# attach resume payload if any (WAITING_* -> RUNNING)
|
|
598
|
+
resume_payload = self._resume_payloads.pop(node_id, None)
|
|
599
|
+
|
|
600
|
+
if node.state.status in WAITING_STATES and resume_payload is None:
|
|
601
|
+
# keep it pending; it will be scheduled once a payload arrives
|
|
602
|
+
self._resume_pending.add(node_id)
|
|
603
|
+
return
|
|
604
|
+
|
|
605
|
+
async def _runner():
|
|
606
|
+
try:
|
|
607
|
+
await self.graph.set_node_status(node_id, NodeStatus.RUNNING)
|
|
608
|
+
ctx = self.env.make_ctx(
|
|
609
|
+
node=node, resume_payload=resume_payload
|
|
610
|
+
) # ExecutionContext
|
|
611
|
+
result = await step_forward(node=node, ctx=ctx, retry_policy=self.retry_policy)
|
|
612
|
+
|
|
613
|
+
if result.status == NodeStatus.DONE:
|
|
614
|
+
# normalize between output/outputs
|
|
615
|
+
outs = result.outputs or {}
|
|
616
|
+
|
|
617
|
+
await self.graph.set_node_outputs(node_id, outs)
|
|
618
|
+
await self.graph.set_node_status(node_id, NodeStatus.DONE)
|
|
619
|
+
|
|
620
|
+
# publish outputs to env for downstream consumption
|
|
621
|
+
self.env.outputs_by_node[node.node_id] = outs
|
|
622
|
+
|
|
623
|
+
# emit event
|
|
624
|
+
event = NodeEvent(
|
|
625
|
+
run_id=self.env.run_id,
|
|
626
|
+
graph_id=getattr(self.graph.spec, "graph_id", "inline"),
|
|
627
|
+
node_id=node.node_id,
|
|
628
|
+
status=str(NodeStatus.DONE),
|
|
629
|
+
outputs=node.outputs or {},
|
|
630
|
+
timestamp=datetime.utcnow().timestamp(),
|
|
631
|
+
)
|
|
632
|
+
await self._emit(event)
|
|
633
|
+
|
|
634
|
+
elif result.status.startswith("WAITING_"):
|
|
635
|
+
# no outputs yet; continuation already persisted by ctx.storage via step_forward
|
|
636
|
+
# scheduler idles until on_resume() or wakeup queue triggers
|
|
637
|
+
await self.graph.set_node_status(node_id, result.status)
|
|
638
|
+
|
|
639
|
+
# emit event
|
|
640
|
+
event = NodeEvent(
|
|
641
|
+
run_id=self.env.run_id,
|
|
642
|
+
graph_id=getattr(self.graph.spec, "graph_id", "inline"),
|
|
643
|
+
node_id=node.node_id,
|
|
644
|
+
status=result.status,
|
|
645
|
+
outputs=node.outputs or {},
|
|
646
|
+
timestamp=datetime.utcnow().timestamp(),
|
|
647
|
+
)
|
|
648
|
+
await self._emit(event)
|
|
649
|
+
|
|
650
|
+
elif result.status == NodeStatus.FAILED:
|
|
651
|
+
# step_forward already incremented attempts (if policy applies)
|
|
652
|
+
# If retry allowed, schedule backoff sleeper:
|
|
653
|
+
await self.graph.set_node_status(node_id, NodeStatus.FAILED)
|
|
654
|
+
|
|
655
|
+
# emit event
|
|
656
|
+
event = NodeEvent(
|
|
657
|
+
run_id=self.env.run_id,
|
|
658
|
+
graph_id=getattr(self.graph.spec, "graph_id", "inline"),
|
|
659
|
+
node_id=node.node_id,
|
|
660
|
+
status=str(NodeStatus.FAILED),
|
|
661
|
+
outputs=node.outputs or {},
|
|
662
|
+
timestamp=datetime.utcnow().timestamp(),
|
|
663
|
+
)
|
|
664
|
+
await self._emit(event)
|
|
665
|
+
|
|
666
|
+
attempts = getattr(node, "attempts", 0)
|
|
667
|
+
if attempts > 0 and attempts < self.retry_policy.max_attempts:
|
|
668
|
+
delay = self.retry_policy.backoff(
|
|
669
|
+
attempts - 1
|
|
670
|
+
).total_seconds() # attempts was incremented in step_forward
|
|
671
|
+
self._backoff_tasks[node.node_id] = asyncio.create_task(
|
|
672
|
+
self._sleep_and_requeue(node, delay)
|
|
673
|
+
)
|
|
674
|
+
else:
|
|
675
|
+
# retries exhausted: optionally stop or skip dependents
|
|
676
|
+
if self.skip_dep_on_failure:
|
|
677
|
+
await self._skip_dependents(node_id)
|
|
678
|
+
if self.stop_on_first_error:
|
|
679
|
+
# flip the master switch to stop the main loop
|
|
680
|
+
|
|
681
|
+
self._terminated = True
|
|
682
|
+
|
|
683
|
+
elif result.status == NodeStatus.SKIPPED:
|
|
684
|
+
await self.graph.set_node_status(node_id, NodeStatus.SKIPPED)
|
|
685
|
+
|
|
686
|
+
# emit event
|
|
687
|
+
event = NodeEvent(
|
|
688
|
+
run_id=self.env.run_id,
|
|
689
|
+
graph_id=getattr(self.graph.spec, "graph_id", "inline"),
|
|
690
|
+
node_id=node.node_id,
|
|
691
|
+
status=str(NodeStatus.SKIPPED),
|
|
692
|
+
outputs=node.outputs or {},
|
|
693
|
+
timestamp=datetime.utcnow().timestamp(),
|
|
694
|
+
)
|
|
695
|
+
await self._emit(event)
|
|
696
|
+
|
|
697
|
+
# record memory after step
|
|
698
|
+
# record_after_step(self.env, node, result)
|
|
699
|
+
# TODO: optionally map selected outputs into domain memory here
|
|
700
|
+
|
|
701
|
+
except NotImplementedError:
|
|
702
|
+
# subgraph logic not handled here; escalate to orchestrator
|
|
703
|
+
await node.set_status(NodeStatus.FAILED)
|
|
704
|
+
except asyncio.CancelledError:
|
|
705
|
+
# task cancelled (e.g. on terminate);
|
|
706
|
+
await node.set_status(NodeStatus.FAILED)
|
|
707
|
+
finally:
|
|
708
|
+
# remove from running tasks in caller
|
|
709
|
+
pass
|
|
710
|
+
|
|
711
|
+
task = asyncio.create_task(_runner())
|
|
712
|
+
self.running_tasks[node_id] = task
|
|
713
|
+
# cleanup when done
|
|
714
|
+
task.add_done_callback(lambda t, nid=node_id: self.running_tasks.pop(nid, None))
|
|
715
|
+
|
|
716
|
+
async def _sleep_and_requeue(self, node: TaskNodeRuntime, delay: float):
|
|
717
|
+
try:
|
|
718
|
+
await asyncio.sleep(delay)
|
|
719
|
+
if not self._terminated:
|
|
720
|
+
await self._start_node(node)
|
|
721
|
+
except asyncio.CancelledError:
|
|
722
|
+
pass
|
|
723
|
+
finally:
|
|
724
|
+
self._backoff_tasks.pop(node.node_id, None)
|
|
725
|
+
|
|
726
|
+
async def _handle_events(self, ev):
|
|
727
|
+
"""Handle control events (e.g., resume, wakeup).
|
|
728
|
+
The function works as follows:
|
|
729
|
+
- If the event is a ResumeEvent:
|
|
730
|
+
- Store the resume payload.
|
|
731
|
+
- Cancel any backoff task for the node.
|
|
732
|
+
- If the node is already running or max concurrency reached, mark it as pending and return.
|
|
733
|
+
- Otherwise, start the node.
|
|
734
|
+
- If the event is a WakeupEvent:
|
|
735
|
+
- If the node is not running and max concurrency not reached, start the node.
|
|
736
|
+
NOTE: This function assumes that the event queue is drained before scheduling new nodes.
|
|
737
|
+
"""
|
|
738
|
+
# resume event for WAITING_* nodes
|
|
739
|
+
if isinstance(ev, ResumeEvent):
|
|
740
|
+
# store payload (idempotent; last write wins)
|
|
741
|
+
self._resume_payloads[ev.node_id] = ev.payload
|
|
742
|
+
|
|
743
|
+
# cancel any pending backoff for this node
|
|
744
|
+
task = self._backoff_tasks.pop(ev.node_id, None)
|
|
745
|
+
if task:
|
|
746
|
+
task.cancel()
|
|
747
|
+
|
|
748
|
+
# try start now, else mark pending
|
|
749
|
+
started = await self._try_start_immediately(ev.node_id)
|
|
750
|
+
if not started:
|
|
751
|
+
self._resume_pending.add(ev.node_id)
|
|
752
|
+
return
|
|
753
|
+
|
|
754
|
+
elif isinstance(ev, WakeupEvent):
|
|
755
|
+
started = await self._try_start_immediately(ev.node_id)
|
|
756
|
+
# If capacity is full, nothing else to do. When a slot frees, _schedule_ready will pick it.
|
|
757
|
+
return
|
|
758
|
+
|
|
759
|
+
def _all_nodes_terminal(self) -> bool:
|
|
760
|
+
# treat plan nodes as ignorable for completion
|
|
761
|
+
for node in self.graph.nodes:
|
|
762
|
+
if _is_plan(node):
|
|
763
|
+
continue
|
|
764
|
+
if node.state.status not in TERMINAL_STATES:
|
|
765
|
+
return False
|
|
766
|
+
return True
|
|
767
|
+
|
|
768
|
+
def _any_waiting(self) -> bool:
|
|
769
|
+
return any(
|
|
770
|
+
(not _is_plan(n)) and (n.state.status in WAITING_STATES) for n in self.graph.nodes
|
|
771
|
+
)
|
|
772
|
+
|
|
773
|
+
def post_resume_event_threadsafe(self, run_id: str, node_id: str, payload: dict):
|
|
774
|
+
if not self.loop or not self.loop.is_running():
|
|
775
|
+
# no-op or log; bus will warn
|
|
776
|
+
return
|
|
777
|
+
asyncio.run_coroutine_threadsafe(self.on_resume_event(run_id, node_id, payload), self.loop)
|