langgraph-node-deadline 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ fail-fast: false
13
+ matrix:
14
+ python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: ${{ matrix.python-version }}
20
+ - name: Install
21
+ run: python -m pip install --upgrade pip && pip install -e ".[dev]"
22
+ - name: Run tests
23
+ run: pytest -q
24
+ - name: Run the demo
25
+ run: python examples/salvage_demo.py
@@ -0,0 +1,25 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ *.egg
9
+
10
+ # Virtual envs
11
+ .venv/
12
+ venv/
13
+ env/
14
+
15
+ # Test / tooling
16
+ .pytest_cache/
17
+ .mypy_cache/
18
+ .ruff_cache/
19
+ .coverage
20
+ htmlcov/
21
+
22
+ # OS / editor
23
+ .DS_Store
24
+ .idea/
25
+ .vscode/
@@ -0,0 +1,23 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. Format loosely follows
4
+ [Keep a Changelog](https://keepachangelog.com/); this project uses
5
+ [Semantic Versioning](https://semver.org/).
6
+
7
+ ## [0.1.0] — unreleased
8
+
9
+ Initial release.
10
+
11
+ ### Added
12
+ - `node_deadline_scope` / `node_deadline` — context manager binding a deadline
13
+ on a `time.monotonic()` basis.
14
+ - `node_deadline_in(seconds)` — relative-budget convenience scope.
15
+ - `clamp_to_node_deadline(budget_secs, *, reserve_secs=0.0)` — the core
16
+ primitive; clamps any proposed timeout to the binding deadline, fail-open.
17
+ - `cooperative_wait_for(awaitable, budget_secs, *, reserve_secs=0.0)` — a
18
+ deadline-clamped `asyncio.wait_for`.
19
+ - `get_node_deadline_remaining_secs()` and `node_deadline_exceeded()` readers.
20
+ - Runnable, dependency-free `examples/salvage_demo.py`.
21
+ - Full test suite covering fail-open semantics, the clamp monotonicity
22
+ invariant, nested-scope restoration, and async context propagation into and
23
+ out of child tasks.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Fred Becker
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,150 @@
1
+ Metadata-Version: 2.4
2
+ Name: langgraph-node-deadline
3
+ Version: 0.1.0
4
+ Summary: One binding deadline for every inner timeout in a LangGraph node — clamp inner budgets to the cooperative deadline so work salvages instead of getting killed by the watchdog.
5
+ Project-URL: Homepage, https://github.com/youknowfred/langgraph-node-deadline
6
+ Project-URL: Issues, https://github.com/youknowfred/langgraph-node-deadline/issues
7
+ Author: Fred Becker
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: agents,asyncio,cancellation,deadline,langchain,langgraph,llm,reliability,timeout
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Software Development :: Libraries
20
+ Classifier: Typing :: Typed
21
+ Requires-Python: >=3.9
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
24
+ Requires-Dist: pytest>=7; extra == 'dev'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # langgraph-node-deadline
28
+
29
+ **One binding deadline for every inner timeout in a LangGraph node.** Clamp inner
30
+ budgets to the node's cooperative deadline so heavy work **salvages a partial
31
+ result** instead of getting hard-killed by the watchdog and discarding everything.
32
+
33
+ Zero runtime dependencies. ~120 lines. Python 3.9+.
34
+
35
+ ```bash
36
+ pip install langgraph-node-deadline
37
+ ```
38
+
39
+ ---
40
+
41
+ ## The problem
42
+
43
+ A LangGraph node that does real work has *several layers each re-deriving their
44
+ own clock*: an outer `TimeoutPolicy` watchdog, an inner agent/tool budget, a
45
+ retry loop, a sub-planner that "wants" 60 seconds. When those clocks disagree,
46
+ the inner layers happily dispatch work the outer watchdog is **guaranteed to
47
+ kill** — and the kill is uncooperative. It cancels the node and **throws away
48
+ everything**, including the partial answer you could have returned.
49
+
50
+ You've seen the symptom: a long run times out into *nothing* after burning
51
+ minutes of paid LLM calls, and the user just sees "it failed." The upstream
52
+ issue is real and open: [langchain-ai/langgraph#5672 — *Run Cancellation Causes
53
+ Loss of Streamed State Not Yet Persisted*](https://github.com/langchain-ai/langgraph/issues/5672).
54
+
55
+ The trap, distilled: if your cooperative cancel and the watchdog are pinned to
56
+ the **same** number, the watchdog clock starts at *node entry — before your code
57
+ runs* — so your cancel loses the race deterministically. Equal timeouts lose.
58
+
59
+ ## The fix
60
+
61
+ Set **one** deadline at node entry. Make every inner timeout *clamp to it*
62
+ instead of re-deriving its own. Now inner calls yield at the node boundary, with
63
+ a little grace, **before** the watchdog fires — so your `try/except` actually
64
+ runs and you return a complete-but-shorter answer.
65
+
66
+ ```python
67
+ import asyncio
68
+ from langgraph_node_deadline import node_deadline_in, cooperative_wait_for
69
+
70
+ async def my_node(state):
71
+ # this node gets ~1.8s of cooperative runtime (a hair under its watchdog)
72
+ with node_deadline_in(1.8):
73
+ try:
74
+ # the planner asks for 5s, but gets clamped to what's actually left
75
+ result = await cooperative_wait_for(plan_and_write(state), budget_secs=5.0)
76
+ return {"draft": result}
77
+ except asyncio.TimeoutError:
78
+ # runs BEFORE the watchdog can kill us — keep the partial work
79
+ return {"draft": salvage_partial(state)}
80
+ ```
81
+
82
+ ## See it lose vs. salvage (30 seconds, no LangGraph needed)
83
+
84
+ ```bash
85
+ python examples/salvage_demo.py
86
+ ```
87
+
88
+ ```
89
+ Outer watchdog (LangGraph TimeoutPolicy): 2.0s | inner planner wants ~5s
90
+
91
+ NAIVE (inner ignores the node deadline)
92
+ -> LOST in 2.00s — outer watchdog cancelled the node, salvage code never ran, ALL work discarded
93
+
94
+ CLAMPED (inner clamps to the node deadline)
95
+ -> SALVAGED in 1.80s — kept 3 steps: ['step 1', 'step 2', 'step 3']
96
+ ```
97
+
98
+ Same work, same watchdog. One import decides whether you keep anything.
99
+
100
+ ## Wiring it into a real LangGraph node
101
+
102
+ Set the scope to a hair under whatever cap the executor enforces, then clamp
103
+ every inner timed call through it:
104
+
105
+ ```python
106
+ from langgraph_node_deadline import node_deadline_in, clamp_to_node_deadline, cooperative_wait_for
107
+
108
+ NODE_CAP_SECS = 30.0 # the hard cap your TimeoutPolicy enforces on this node
109
+
110
+ async def research_node(state):
111
+ with node_deadline_in(NODE_CAP_SECS - 1.0): # leave 1s of grace under the watchdog
112
+ # an inner retry loop, sub-agent, or tool call — all clamp to the same deadline
113
+ per_call = clamp_to_node_deadline(15.0, reserve_secs=2.0) # reserve finalize headroom
114
+ chunks = await cooperative_wait_for(retrieve(state), budget_secs=per_call)
115
+ return {"chunks": chunks}
116
+ ```
117
+
118
+ Because the deadline lives in a `contextvars.ContextVar`, and `asyncio` copies
119
+ the ambient context when it creates a task, the scope you open before you
120
+ `await` is visible to the agent task **and every subagent task it spawns** — no
121
+ threading the deadline through call signatures.
122
+
123
+ ## API
124
+
125
+ | Symbol | What it does |
126
+ | --- | --- |
127
+ | `node_deadline_in(seconds)` | Context manager. Set the binding deadline to `now + seconds`. Use at node entry. |
128
+ | `node_deadline_scope(deadline_monotonic)` | Context manager. Set the deadline to an absolute `time.monotonic()` timestamp (or `None` to clear). `node_deadline` is an alias. |
129
+ | `clamp_to_node_deadline(budget_secs, *, reserve_secs=0.0)` | **The core primitive.** Returns `min(budget_secs, remaining - reserve_secs)`, floored at 0. Returns `budget_secs` unchanged when no scope is active. |
130
+ | `cooperative_wait_for(awaitable, budget_secs, *, reserve_secs=0.0)` | `asyncio.wait_for` that never outlasts the node deadline. Raises `asyncio.TimeoutError` on the clamped budget. |
131
+ | `get_node_deadline_remaining_secs()` | Seconds left, or `None` if no scope. Never negative. |
132
+ | `node_deadline_exceeded()` | `True` only when a scope is active *and* its deadline has passed. Safe loop guard. |
133
+
134
+ **Fail-open by design.** With no active scope, every function behaves as if it
135
+ weren't there — so adding it to one node never changes the behavior of the rest
136
+ of your graph, your tests, or direct invocations.
137
+
138
+ ## Why a whole package for ~120 lines
139
+
140
+ Because the *lesson* is the hard part, not the code. This is the
141
+ [`derive-don't-pin`](https://github.com/langchain-ai/langgraph/issues/5672)
142
+ discipline extracted from a production agent that paid for it: a synthesis pool
143
+ that believed it had 43.5 seconds left *nine seconds before* the watchdog killed
144
+ the node — because four inner layers each trusted their own clock and none knew
145
+ the one the executor was actually enforcing. One binding deadline fixes the
146
+ entire class of bug.
147
+
148
+ ## License
149
+
150
+ MIT © 2026 Fred Becker. See [LICENSE](LICENSE).
@@ -0,0 +1,124 @@
1
+ # langgraph-node-deadline
2
+
3
+ **One binding deadline for every inner timeout in a LangGraph node.** Clamp inner
4
+ budgets to the node's cooperative deadline so heavy work **salvages a partial
5
+ result** instead of getting hard-killed by the watchdog and discarding everything.
6
+
7
+ Zero runtime dependencies. ~120 lines. Python 3.9+.
8
+
9
+ ```bash
10
+ pip install langgraph-node-deadline
11
+ ```
12
+
13
+ ---
14
+
15
+ ## The problem
16
+
17
+ A LangGraph node that does real work has *several layers each re-deriving their
18
+ own clock*: an outer `TimeoutPolicy` watchdog, an inner agent/tool budget, a
19
+ retry loop, a sub-planner that "wants" 60 seconds. When those clocks disagree,
20
+ the inner layers happily dispatch work the outer watchdog is **guaranteed to
21
+ kill** — and the kill is uncooperative. It cancels the node and **throws away
22
+ everything**, including the partial answer you could have returned.
23
+
24
+ You've seen the symptom: a long run times out into *nothing* after burning
25
+ minutes of paid LLM calls, and the user just sees "it failed." The upstream
26
+ issue is real and open: [langchain-ai/langgraph#5672 — *Run Cancellation Causes
27
+ Loss of Streamed State Not Yet Persisted*](https://github.com/langchain-ai/langgraph/issues/5672).
28
+
29
+ The trap, distilled: if your cooperative cancel and the watchdog are pinned to
30
+ the **same** number, the watchdog clock starts at *node entry — before your code
31
+ runs* — so your cancel loses the race deterministically. Equal timeouts lose.
32
+
33
+ ## The fix
34
+
35
+ Set **one** deadline at node entry. Make every inner timeout *clamp to it*
36
+ instead of re-deriving its own. Now inner calls yield at the node boundary, with
37
+ a little grace, **before** the watchdog fires — so your `try/except` actually
38
+ runs and you return a complete-but-shorter answer.
39
+
40
+ ```python
41
+ import asyncio
42
+ from langgraph_node_deadline import node_deadline_in, cooperative_wait_for
43
+
44
+ async def my_node(state):
45
+ # this node gets ~1.8s of cooperative runtime (a hair under its watchdog)
46
+ with node_deadline_in(1.8):
47
+ try:
48
+ # the planner asks for 5s, but gets clamped to what's actually left
49
+ result = await cooperative_wait_for(plan_and_write(state), budget_secs=5.0)
50
+ return {"draft": result}
51
+ except asyncio.TimeoutError:
52
+ # runs BEFORE the watchdog can kill us — keep the partial work
53
+ return {"draft": salvage_partial(state)}
54
+ ```
55
+
56
+ ## See it lose vs. salvage (30 seconds, no LangGraph needed)
57
+
58
+ ```bash
59
+ python examples/salvage_demo.py
60
+ ```
61
+
62
+ ```
63
+ Outer watchdog (LangGraph TimeoutPolicy): 2.0s | inner planner wants ~5s
64
+
65
+ NAIVE (inner ignores the node deadline)
66
+ -> LOST in 2.00s — outer watchdog cancelled the node, salvage code never ran, ALL work discarded
67
+
68
+ CLAMPED (inner clamps to the node deadline)
69
+ -> SALVAGED in 1.80s — kept 3 steps: ['step 1', 'step 2', 'step 3']
70
+ ```
71
+
72
+ Same work, same watchdog. One import decides whether you keep anything.
73
+
74
+ ## Wiring it into a real LangGraph node
75
+
76
+ Set the scope to a hair under whatever cap the executor enforces, then clamp
77
+ every inner timed call through it:
78
+
79
+ ```python
80
+ from langgraph_node_deadline import node_deadline_in, clamp_to_node_deadline, cooperative_wait_for
81
+
82
+ NODE_CAP_SECS = 30.0 # the hard cap your TimeoutPolicy enforces on this node
83
+
84
+ async def research_node(state):
85
+ with node_deadline_in(NODE_CAP_SECS - 1.0): # leave 1s of grace under the watchdog
86
+ # an inner retry loop, sub-agent, or tool call — all clamp to the same deadline
87
+ per_call = clamp_to_node_deadline(15.0, reserve_secs=2.0) # reserve finalize headroom
88
+ chunks = await cooperative_wait_for(retrieve(state), budget_secs=per_call)
89
+ return {"chunks": chunks}
90
+ ```
91
+
92
+ Because the deadline lives in a `contextvars.ContextVar`, and `asyncio` copies
93
+ the ambient context when it creates a task, the scope you open before you
94
+ `await` is visible to the agent task **and every subagent task it spawns** — no
95
+ threading the deadline through call signatures.
96
+
97
+ ## API
98
+
99
+ | Symbol | What it does |
100
+ | --- | --- |
101
+ | `node_deadline_in(seconds)` | Context manager. Set the binding deadline to `now + seconds`. Use at node entry. |
102
+ | `node_deadline_scope(deadline_monotonic)` | Context manager. Set the deadline to an absolute `time.monotonic()` timestamp (or `None` to clear). `node_deadline` is an alias. |
103
+ | `clamp_to_node_deadline(budget_secs, *, reserve_secs=0.0)` | **The core primitive.** Returns `min(budget_secs, remaining - reserve_secs)`, floored at 0. Returns `budget_secs` unchanged when no scope is active. |
104
+ | `cooperative_wait_for(awaitable, budget_secs, *, reserve_secs=0.0)` | `asyncio.wait_for` that never outlasts the node deadline. Raises `asyncio.TimeoutError` on the clamped budget. |
105
+ | `get_node_deadline_remaining_secs()` | Seconds left, or `None` if no scope. Never negative. |
106
+ | `node_deadline_exceeded()` | `True` only when a scope is active *and* its deadline has passed. Safe loop guard. |
107
+
108
+ **Fail-open by design.** With no active scope, every function behaves as if it
109
+ weren't there — so adding it to one node never changes the behavior of the rest
110
+ of your graph, your tests, or direct invocations.
111
+
112
+ ## Why a whole package for ~120 lines
113
+
114
+ Because the *lesson* is the hard part, not the code. This is the
115
+ [`derive-don't-pin`](https://github.com/langchain-ai/langgraph/issues/5672)
116
+ discipline extracted from a production agent that paid for it: a synthesis pool
117
+ that believed it had 43.5 seconds left *nine seconds before* the watchdog killed
118
+ the node — because four inner layers each trusted their own clock and none knew
119
+ the one the executor was actually enforcing. One binding deadline fixes the
120
+ entire class of bug.
121
+
122
+ ## License
123
+
124
+ MIT © 2026 Fred Becker. See [LICENSE](LICENSE).
@@ -0,0 +1,82 @@
1
+ """The README hero, runnable with zero dependencies.
2
+
3
+ Run it:
4
+
5
+ python examples/salvage_demo.py
6
+
7
+ Two nodes do the SAME work and live under the SAME outer watchdog. The only
8
+ difference is whether the inner planner clamps its budget to the binding node
9
+ deadline. One loses everything; one salvages a partial result.
10
+
11
+ This simulates a LangGraph node wrapped in a ``TimeoutPolicy`` watchdog. No
12
+ LangGraph install is needed — ``asyncio.wait_for`` plays the role of the
13
+ watchdog so you can see the mechanics directly.
14
+ """
15
+
16
+ import asyncio
17
+ import time
18
+
19
+ from langgraph_node_deadline import node_deadline_in, cooperative_wait_for
20
+
21
+ OUTER_WATCHDOG = 2.0 # simulates LangGraph TimeoutPolicy on the node (kills uncooperatively)
22
+ NODE_CAP = 1.8 # the cooperative deadline we enforce INSIDE the node (fires first)
23
+
24
+
25
async def long_planner(progress):
    """Simulate an inner agent/LLM loop that needs ~5s and accumulates partial work.

    Runs ten iterations; each one sleeps 0.5s and then appends a ``"step N"``
    entry (N counting from 1) to ``progress``. Because the caller owns the
    list, whatever was appended before a cancellation stays visible to it.
    """
    for step_number in range(1, 11):
        await asyncio.sleep(0.5)
        progress.append(f"step {step_number}")
30
+
31
+
32
+ # --- Naive node: the inner call re-derives its OWN 5s budget, blind to the watchdog.
33
async def naive_node():
    """Run the planner under its own fixed 5s timeout, ignoring any node deadline.

    Returns:
        ``("salvaged", progress)`` if the inner 5s timeout fires, otherwise
        ``("complete", progress)``. Under the demo's shorter outer watchdog the
        salvage branch is never reached, because the watchdog cancels first.
    """
    steps = []
    try:
        await asyncio.wait_for(long_planner(steps), timeout=5.0)
    except asyncio.TimeoutError:
        outcome = "salvaged"
    else:
        outcome = "complete"
    return outcome, steps
40
+
41
+
42
+ # --- Clamped node: the inner call clamps to the binding node deadline.
43
async def clamped_node():
    """Run the planner with its 5s request clamped to the binding node deadline.

    A ``NODE_CAP``-second deadline scope is opened first, so the inner
    ``cooperative_wait_for`` times out at roughly ``NODE_CAP`` rather than at
    the 5s it asks for, which is before ``OUTER_WATCHDOG`` elapses.

    Returns:
        ``("salvaged", progress)`` when the clamped timeout fires, otherwise
        ``("complete", progress)``.
    """
    steps = []
    with node_deadline_in(NODE_CAP):
        try:
            await cooperative_wait_for(long_planner(steps), budget_secs=5.0)
        except asyncio.TimeoutError:
            outcome = "salvaged"
        else:
            outcome = "complete"
    return outcome, steps
51
+
52
+
53
async def run(label, node):
    """Execute ``node()`` under the simulated outer watchdog and print the outcome.

    Args:
        label: Heading printed above the result line.
        node: Zero-argument coroutine function returning ``(outcome, progress)``.

    ``asyncio.wait_for`` with ``OUTER_WATCHDOG`` stands in for the executor's
    watchdog. If it fires, the node's return value is never produced, so the
    report says the work was lost.
    """
    t0 = time.monotonic()
    try:
        outcome, progress = await asyncio.wait_for(node(), timeout=OUTER_WATCHDOG)
        took = time.monotonic() - t0
        report = (
            f" {label}\n"
            f" -> {outcome.upper()} in {took:.2f}s — kept {len(progress)} steps: {progress}\n"
        )
        print(report)
    except asyncio.TimeoutError:
        took = time.monotonic() - t0
        report = (
            f" {label}\n"
            f" -> LOST in {took:.2f}s — outer watchdog cancelled the node, "
            f"salvage code never ran, ALL work discarded\n"
        )
        print(report)
69
+
70
+
71
async def main():
    """Print the banner, run the naive and clamped scenarios in order, then the tagline."""
    banner = (
        f"\nOuter watchdog (LangGraph TimeoutPolicy): {OUTER_WATCHDOG}s"
        f" | inner planner wants ~5s\n"
    )
    print(banner)

    scenarios = (
        ("NAIVE (inner ignores the node deadline)", naive_node),
        ("CLAMPED (inner clamps to the node deadline)", clamped_node),
    )
    # Sequential on purpose: each scenario gets its own fresh watchdog window.
    for label, node in scenarios:
        await run(label, node)

    print("Same work, same watchdog. One import decides whether you keep anything.\n")
79
+
80
+
81
+ if __name__ == "__main__":
82
+ asyncio.run(main())
@@ -0,0 +1,51 @@
1
[build-system]
# The [project] table uses PEP 639 metadata (an SPDX `license` string plus a
# `license-files` array); hatchling only understands that form from 1.27 on.
requires = ["hatchling>=1.27"]
build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "langgraph-node-deadline"
7
+ version = "0.1.0"
8
+ description = "One binding deadline for every inner timeout in a LangGraph node — clamp inner budgets to the cooperative deadline so work salvages instead of getting killed by the watchdog."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = "MIT"
12
+ license-files = ["LICENSE"]
13
+ authors = [{ name = "Fred Becker" }]
14
+ keywords = [
15
+ "langgraph",
16
+ "langchain",
17
+ "asyncio",
18
+ "timeout",
19
+ "deadline",
20
+ "cancellation",
21
+ "agents",
22
+ "llm",
23
+ "reliability",
24
+ ]
25
+ classifiers = [
26
+ "Development Status :: 4 - Beta",
27
+ "Intended Audience :: Developers",
28
+ "Programming Language :: Python :: 3",
29
+ "Programming Language :: Python :: 3.9",
30
+ "Programming Language :: Python :: 3.10",
31
+ "Programming Language :: Python :: 3.11",
32
+ "Programming Language :: Python :: 3.12",
33
+ "Programming Language :: Python :: 3.13",
34
+ "Topic :: Software Development :: Libraries",
35
+ "Typing :: Typed",
36
+ ]
37
+ dependencies = []
38
+
39
+ [project.optional-dependencies]
40
+ dev = ["pytest>=7", "pytest-asyncio>=0.23"]
41
+
42
+ [project.urls]
43
+ Homepage = "https://github.com/youknowfred/langgraph-node-deadline"
44
+ Issues = "https://github.com/youknowfred/langgraph-node-deadline/issues"
45
+
46
+ [tool.hatch.build.targets.wheel]
47
+ packages = ["src/langgraph_node_deadline"]
48
+
49
+ [tool.pytest.ini_options]
50
+ asyncio_mode = "auto"
51
+ testpaths = ["tests"]
@@ -0,0 +1,176 @@
1
+ """langgraph-node-deadline — one binding deadline for every inner timeout.
2
+
3
+ The problem this solves
4
+ -----------------------
5
+ A LangGraph node that does real work usually has *several* layers each
6
+ re-deriving their own clock: an outer ``TimeoutPolicy`` watchdog, an inner
7
+ agent/tool budget, a retry loop, a sub-planner that "wants" 60 seconds. When
8
+ those clocks disagree, the inner layers dispatch work the outer watchdog is
9
+ guaranteed to kill — and the kill is uncooperative: it cancels the node and
10
+ **discards everything**, including any partial result you could have salvaged.
11
+ (See the long-standing upstream report: langchain-ai/langgraph#5672, "Run
12
+ Cancellation Causes Loss of Streamed State Not Yet Persisted as a Checkpoint".)
13
+
14
+ The fix
15
+ -------
16
+ Establish **one** binding deadline at node entry and make every inner timeout
17
+ *clamp to it* instead of re-deriving its own. Then your inner calls always
18
+ yield at the node boundary — with a few hundred ms of grace — *before* the
19
+ watchdog fires, so a ``try/except`` around the inner call actually runs and you
20
+ return a complete-but-shorter answer instead of nothing.
21
+
22
+ How it threads through async code
23
+ ---------------------------------
24
+ The deadline lives in a :class:`contextvars.ContextVar`. ``asyncio`` tasks copy
25
+ the ambient context at creation, so a scope opened before you ``await`` is
26
+ visible to the agent task *and every subagent task it spawns*. The default is
27
+ ``None`` (no scope) and every consumer **fails open** — code that runs outside a
28
+ scope (unit tests, direct invocations) keeps its existing arithmetic unchanged.
29
+
30
+ Quick start
31
+ -----------
32
+ >>> import asyncio
33
+ >>> from langgraph_node_deadline import node_deadline_in, cooperative_wait_for
34
+ >>> async def node():
35
+ ... # this node is allowed ~1.8s of cooperative runtime
36
+ ... with node_deadline_in(1.8):
37
+ ... try:
38
+ ... # the planner "wants" 5s but will be clamped to what's left
39
+ ... return await cooperative_wait_for(planner(), budget_secs=5.0)
40
+ ... except asyncio.TimeoutError:
41
+ ... return salvage_partial() # runs BEFORE the outer watchdog kills us
42
+
43
+ See ``examples/salvage_demo.py`` for a runnable, dependency-free contrast
44
+ between the naive path (work discarded) and the clamped path (work salvaged).
45
+ """
46
+
47
+ from __future__ import annotations
48
+
49
+ import asyncio
50
+ import time
51
+ from contextlib import contextmanager
52
+ from contextvars import ContextVar
53
+ from typing import Awaitable, Iterator, Optional, TypeVar
54
+
55
+ __all__ = [
56
+ "node_deadline",
57
+ "node_deadline_scope",
58
+ "node_deadline_in",
59
+ "get_node_deadline_remaining_secs",
60
+ "node_deadline_exceeded",
61
+ "clamp_to_node_deadline",
62
+ "cooperative_wait_for",
63
+ ]
64
+
65
+ __version__ = "0.1.0"
66
+
67
+ _T = TypeVar("_T")
68
+
69
+ _node_deadline_monotonic: ContextVar[Optional[float]] = ContextVar(
70
+ "node_deadline_monotonic", default=None
71
+ )
72
+
73
+
74
@contextmanager
def node_deadline_scope(deadline_monotonic: Optional[float]) -> Iterator[None]:
    """Bind the cooperative node deadline for the duration of a ``with`` block.

    Args:
        deadline_monotonic: Absolute timestamp on the ``time.monotonic()``
            clock by which inner work should have yielded, or ``None`` to
            explicitly clear any enclosing deadline (fail-open) for the block.

    The previous value is put back when the block exits, whether normally or
    through an exception, so scopes nest: a tighter inner deadline gives way to
    the outer one as soon as the inner ``with`` ends.
    """
    restore_token = _node_deadline_monotonic.set(deadline_monotonic)
    try:
        yield
    finally:
        # Reset via the token rather than re-setting a value, so the enclosing
        # scope (or the "no scope" default) is restored exactly.
        _node_deadline_monotonic.reset(restore_token)
91
+
92
+
93
+ # Primary public alias — reads naturally at call sites that already hold an
94
+ # absolute monotonic deadline: ``with node_deadline(executor_deadline): ...``
95
+ node_deadline = node_deadline_scope
96
+
97
+
98
@contextmanager
def node_deadline_in(seconds: float) -> Iterator[None]:
    """Open a deadline scope that expires ``seconds`` from now.

    Shorthand for ``node_deadline_scope(time.monotonic() + seconds)``. The
    clock is read when the ``with`` block is entered. Use it at node entry
    when the budget is naturally expressed as "this node gets N seconds"
    rather than as an absolute monotonic timestamp.
    """
    absolute_deadline = time.monotonic() + seconds
    with node_deadline_scope(absolute_deadline):
        yield
109
+
110
+
111
def get_node_deadline_remaining_secs() -> Optional[float]:
    """Return the seconds left before the binding deadline.

    Returns:
        ``None`` when no deadline scope is active; otherwise the time left on
        the ``time.monotonic()`` clock, reported as ``0.0`` once the deadline
        has passed (never negative).
    """
    bound_deadline = _node_deadline_monotonic.get()
    if bound_deadline is not None:
        left = bound_deadline - time.monotonic()
        return left if left > 0.0 else 0.0
    return None
120
+
121
+
122
def node_deadline_exceeded() -> bool:
    """Report whether an active deadline scope has already expired.

    Returns ``False`` when no scope is active (fail-open), which makes it
    usable as a cooperative loop guard:
    ``while not node_deadline_exceeded(): ...``.
    """
    left = get_node_deadline_remaining_secs()
    if left is None:
        return False
    return left <= 0.0
130
+
131
+
132
def clamp_to_node_deadline(budget_secs: float, *, reserve_secs: float = 0.0) -> float:
    """Clamp a proposed budget/timeout to the real deadline remaining.

    This is the core primitive: every inner layer that is about to start a
    timed operation passes its desired budget through here, so it can never
    exceed the binding node deadline.

    Args:
        budget_secs: The timeout the inner layer *wants*.
        reserve_secs: Headroom to carve below the deadline (e.g. a phase
            transition / finalize buffer) so the clamped work still has time
            to wrap up before the cooperative cancel. Values below zero are
            treated as ``0.0``.

    Returns:
        ``budget_secs`` unchanged when no deadline scope is active (fail-open);
        otherwise ``min(budget_secs, remaining - reserve_secs)``, floored at
        ``0.0`` so it is always a valid ``asyncio.wait_for`` timeout.
    """
    remaining = get_node_deadline_remaining_secs()
    if remaining is None:
        return budget_secs
    # A negative reserve would *add* time on top of what is left and let the
    # result exceed the deadline, so headroom is only ever allowed to shrink
    # the budget. (``max`` also maps a NaN reserve to 0.0.)
    headroom = max(0.0, reserve_secs)
    return max(0.0, min(budget_secs, remaining - headroom))
154
+
155
+
156
async def cooperative_wait_for(
    awaitable: Awaitable[_T],
    budget_secs: Optional[float],
    *,
    reserve_secs: float = 0.0,
) -> _T:
    """``asyncio.wait_for`` that never outlasts the binding node deadline.

    Equivalent to ``asyncio.wait_for(awaitable, clamp_to_node_deadline(...))``.
    Because the clamped timeout fires *inside* the node, an
    ``asyncio.TimeoutError`` you catch here runs your salvage path **before**
    an outer watchdog can cancel the node and discard the work.

    Args:
        awaitable: The coroutine/future to wait on.
        budget_secs: The timeout the caller wants, in seconds. ``None`` means
            "no budget of my own", mirroring ``asyncio.wait_for``: the wait is
            then bounded only by the node deadline (minus ``reserve_secs``).
        reserve_secs: Headroom to keep free below the node deadline.

    With no active deadline scope this is a plain ``wait_for(awaitable,
    budget_secs)`` — fail-open, no behavior change (including an unbounded
    wait when ``budget_secs`` is ``None``).

    Raises:
        asyncio.TimeoutError: if the clamped budget elapses first.
    """
    if budget_secs is not None:
        timeout: Optional[float] = clamp_to_node_deadline(
            budget_secs, reserve_secs=reserve_secs
        )
    elif get_node_deadline_remaining_secs() is None:
        # No budget and no scope: keep wait_for's "wait forever" semantics.
        timeout = None
    else:
        # No budget of its own, but a deadline is bound: an infinite request
        # clamps down to whatever the node has left after the reserve.
        timeout = clamp_to_node_deadline(float("inf"), reserve_secs=reserve_secs)
    return await asyncio.wait_for(awaitable, timeout)
@@ -0,0 +1,153 @@
1
+ """Behavioral + invariant tests for langgraph-node-deadline.
2
+
3
+ asyncio_mode = "auto" (set in pyproject) runs ``async def`` tests directly.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import asyncio
9
+ import time
10
+
11
+ import pytest
12
+
13
+ from langgraph_node_deadline import (
14
+ clamp_to_node_deadline,
15
+ cooperative_wait_for,
16
+ get_node_deadline_remaining_secs,
17
+ node_deadline_exceeded,
18
+ node_deadline_in,
19
+ node_deadline_scope,
20
+ )
21
+
22
+
23
+ # --------------------------------------------------------------------------- #
24
+ # Fail-open: no scope active #
25
+ # --------------------------------------------------------------------------- #
26
+
27
def test_no_scope_is_fail_open():
    """Outside any scope the readers report 'no deadline' and clamp is the identity."""
    assert get_node_deadline_remaining_secs() is None
    assert node_deadline_exceeded() is False
    # The requested budget comes back untouched, with or without a reserve.
    for extra_kwargs in ({}, {"reserve_secs": 5.0}):
        assert clamp_to_node_deadline(60.0, **extra_kwargs) == 60.0
33
+
34
+
35
+ # --------------------------------------------------------------------------- #
36
+ # Core clamp behavior under an active scope #
37
+ # --------------------------------------------------------------------------- #
38
+
39
def test_clamp_under_scope_caps_to_remaining():
    """A 60s request inside a 10s scope is cut down to roughly the 10s left."""
    with node_deadline_in(10.0):
        granted = clamp_to_node_deadline(60.0)
    # Bounded by what is left, and never above the original request.
    lower, upper = 9.0, 10.0
    assert lower < granted <= upper
44
+
45
+
46
def test_clamp_passes_through_when_budget_is_smaller():
    """A request well under the remaining time is returned as asked."""
    with node_deadline_in(10.0):
        granted = clamp_to_node_deadline(2.0)
        # 2s fits comfortably inside the 10s scope, so nothing is trimmed.
        assert granted == pytest.approx(2.0, abs=0.05)
50
+
51
+
52
def test_reserve_carves_headroom():
    """A 2s reserve inside a 10s scope leaves roughly 8s for the clamped call."""
    with node_deadline_in(10.0):
        granted = clamp_to_node_deadline(60.0, reserve_secs=2.0)
    lower, upper = 7.0, 8.0
    assert lower < granted <= upper
56
+
57
+
58
def test_remaining_never_negative_and_exceeded_flips():
    """A deadline already in the past reads as 0.0 left, exceeded, and clamps to 0.0."""
    one_second_ago = time.monotonic() - 1.0
    with node_deadline_scope(one_second_ago):
        assert get_node_deadline_remaining_secs() == 0.0
        assert node_deadline_exceeded() is True
        assert clamp_to_node_deadline(5.0) == 0.0
64
+
65
+
66
+ # --------------------------------------------------------------------------- #
67
+ # Nesting: an inner tighter deadline reverts to the outer one #
68
+ # --------------------------------------------------------------------------- #
69
+
70
def test_nested_scope_restores_outer():
    """Leaving an inner scope restores the outer deadline; leaving both clears it."""
    with node_deadline_in(10.0):
        outer_before = get_node_deadline_remaining_secs()
        assert 9.0 < outer_before <= 10.0

        with node_deadline_in(1.0):
            inner_left = get_node_deadline_remaining_secs()
            assert inner_left <= 1.0

        # The tighter inner deadline is gone; the outer one is visible again.
        outer_after = get_node_deadline_remaining_secs()
        assert 8.5 < outer_after <= 10.0

    # Both scopes closed: back to fail-open.
    assert get_node_deadline_remaining_secs() is None
79
+
80
+
81
+ # --------------------------------------------------------------------------- #
82
+ # The invariant the package exists to guarantee #
83
+ # --------------------------------------------------------------------------- #
84
+
85
def test_clamp_monotonicity_invariant():
    """clamp(b) is always <= b AND <= remaining, for every budget."""
    tolerance = 1e-9
    sample_budgets = (0.0, 0.5, 1.0, 4.9, 5.0, 50.0, 5000.0)
    with node_deadline_in(5.0):
        # Read once up front; later clamps can only see less time left.
        ceiling = get_node_deadline_remaining_secs()
        assert ceiling is not None
        for requested in sample_budgets:
            granted = clamp_to_node_deadline(requested)
            assert granted <= requested + tolerance
            assert granted <= ceiling + tolerance
            assert granted >= 0.0
95
+
96
+
97
+ # --------------------------------------------------------------------------- #
98
+ # Context propagation into child tasks (the whole reason it's a contextvar) #
99
+ # --------------------------------------------------------------------------- #
100
+
101
async def test_scope_propagates_to_child_task():
    """A task created inside a scope sees that scope's deadline."""

    async def read_remaining_in_child():
        return get_node_deadline_remaining_secs()

    with node_deadline_in(5.0):
        # Created inside the scope, so the child starts with the bound deadline.
        child = asyncio.create_task(read_remaining_in_child())
        seen_by_child = await child

    assert seen_by_child is not None
    assert 4.0 < seen_by_child <= 5.0
110
+
111
+
112
async def test_scope_not_visible_to_task_created_outside():
    """A task created before the scope opens never observes the deadline."""

    async def read_remaining_in_child():
        # Yield once so the child actually runs while the parent's scope is
        # open; if the scope leaked across tasks it would show up here.
        await asyncio.sleep(0)
        return get_node_deadline_remaining_secs()

    # Created BEFORE the scope is entered.
    early_task = asyncio.create_task(read_remaining_in_child())
    with node_deadline_in(5.0):
        seen_by_child = await early_task
    assert seen_by_child is None
123
+
124
+
125
+ # --------------------------------------------------------------------------- #
126
+ # cooperative_wait_for #
127
+ # --------------------------------------------------------------------------- #
128
+
129
async def test_cooperative_wait_for_clamps_to_deadline():
    """A 5s request inside a ~0.2s scope times out at the scope's deadline."""
    began = time.monotonic()
    with node_deadline_in(0.2):
        with pytest.raises(asyncio.TimeoutError):
            # Wants 5s, but the node has only ~0.2s to give.
            await cooperative_wait_for(asyncio.sleep(5.0), budget_secs=5.0)
    took = time.monotonic() - began
    # Generous bound: proves the node deadline fired, not the 5s request.
    assert took < 1.0
137
+
138
+
139
async def test_cooperative_wait_for_returns_when_work_fits():
    """Work that finishes inside the deadline has its result passed through."""
    with node_deadline_in(5.0):
        outcome = await cooperative_wait_for(_quick(), budget_secs=5.0)
    assert outcome == "done"
143
+
144
+
145
async def test_cooperative_wait_for_fail_open_without_scope():
    """With no scope bound, the call acts like wait_for(awaitable, budget_secs)."""
    outcome = await cooperative_wait_for(_quick(), budget_secs=5.0)
    assert outcome == "done"
149
+
150
+
151
async def _quick():
    """Sleep for 0.01s, then return ``"done"``; fast work that fits any test budget."""
    outcome = "done"
    await asyncio.sleep(0.01)
    return outcome