dynamic-subgraphs 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dynamic_subgraphs-0.1.0/.claude/README.md +32 -0
- dynamic_subgraphs-0.1.0/.claude/context.md +90 -0
- dynamic_subgraphs-0.1.0/.claude/gotchas.md +328 -0
- dynamic_subgraphs-0.1.0/.claude/patterns.md +114 -0
- dynamic_subgraphs-0.1.0/.claude/roadmap.md +180 -0
- dynamic_subgraphs-0.1.0/.claude/workflows.md +96 -0
- dynamic_subgraphs-0.1.0/.editorconfig +18 -0
- dynamic_subgraphs-0.1.0/.env.example +23 -0
- dynamic_subgraphs-0.1.0/.gitignore +31 -0
- dynamic_subgraphs-0.1.0/.pre-commit-config.yaml +31 -0
- dynamic_subgraphs-0.1.0/.python-version +1 -0
- dynamic_subgraphs-0.1.0/AGENTS.md +44 -0
- dynamic_subgraphs-0.1.0/ARCHITECTURE.md +40 -0
- dynamic_subgraphs-0.1.0/CHANGELOG.md +38 -0
- dynamic_subgraphs-0.1.0/CODE_OF_CONDUCT.md +40 -0
- dynamic_subgraphs-0.1.0/CONTRIBUTING.md +64 -0
- dynamic_subgraphs-0.1.0/LICENSE +201 -0
- dynamic_subgraphs-0.1.0/NOTICE +24 -0
- dynamic_subgraphs-0.1.0/PKG-INFO +335 -0
- dynamic_subgraphs-0.1.0/README.md +291 -0
- dynamic_subgraphs-0.1.0/SECURITY.md +38 -0
- dynamic_subgraphs-0.1.0/app/__init__.py +1 -0
- dynamic_subgraphs-0.1.0/app/api/__init__.py +6 -0
- dynamic_subgraphs-0.1.0/app/api/__main__.py +18 -0
- dynamic_subgraphs-0.1.0/app/api/app.py +32 -0
- dynamic_subgraphs-0.1.0/app/api/deps.py +146 -0
- dynamic_subgraphs-0.1.0/app/api/errors.py +67 -0
- dynamic_subgraphs-0.1.0/app/api/jobs.py +136 -0
- dynamic_subgraphs-0.1.0/app/api/routers/__init__.py +1 -0
- dynamic_subgraphs-0.1.0/app/api/routers/chains.py +170 -0
- dynamic_subgraphs-0.1.0/app/api/routers/health.py +11 -0
- dynamic_subgraphs-0.1.0/app/api/routers/registry.py +38 -0
- dynamic_subgraphs-0.1.0/app/api/routers/runs.py +300 -0
- dynamic_subgraphs-0.1.0/app/api/run_config_store.py +51 -0
- dynamic_subgraphs-0.1.0/app/api/schemas.py +86 -0
- dynamic_subgraphs-0.1.0/app/api/serialize.py +76 -0
- dynamic_subgraphs-0.1.0/app/api/settings.py +53 -0
- dynamic_subgraphs-0.1.0/app/assembly.py +256 -0
- dynamic_subgraphs-0.1.0/app/compiler/__init__.py +6 -0
- dynamic_subgraphs-0.1.0/app/compiler/build.py +168 -0
- dynamic_subgraphs-0.1.0/app/compiler/errors.py +5 -0
- dynamic_subgraphs-0.1.0/app/main.py +202 -0
- dynamic_subgraphs-0.1.0/app/models/__init__.py +29 -0
- dynamic_subgraphs-0.1.0/app/models/graph_spec.py +51 -0
- dynamic_subgraphs-0.1.0/app/models/node_kinds.py +13 -0
- dynamic_subgraphs-0.1.0/app/models/run_state.py +44 -0
- dynamic_subgraphs-0.1.0/app/models/trace.py +31 -0
- dynamic_subgraphs-0.1.0/app/py.typed +1 -0
- dynamic_subgraphs-0.1.0/app/recording/__init__.py +27 -0
- dynamic_subgraphs-0.1.0/app/recording/mermaid.py +27 -0
- dynamic_subgraphs-0.1.0/app/recording/recorder.py +646 -0
- dynamic_subgraphs-0.1.0/app/registry/__init__.py +22 -0
- dynamic_subgraphs-0.1.0/app/registry/allowlists.py +30 -0
- dynamic_subgraphs-0.1.0/app/registry/definitions.py +91 -0
- dynamic_subgraphs-0.1.0/app/registry/errors.py +20 -0
- dynamic_subgraphs-0.1.0/app/registry/params.py +88 -0
- dynamic_subgraphs-0.1.0/app/registry/registry.py +214 -0
- dynamic_subgraphs-0.1.0/app/registry/validator.py +348 -0
- dynamic_subgraphs-0.1.0/app/runtime/__init__.py +133 -0
- dynamic_subgraphs-0.1.0/app/runtime/artifacts.py +176 -0
- dynamic_subgraphs-0.1.0/app/runtime/branch.py +103 -0
- dynamic_subgraphs-0.1.0/app/runtime/chat_models.py +39 -0
- dynamic_subgraphs-0.1.0/app/runtime/executor.py +304 -0
- dynamic_subgraphs-0.1.0/app/runtime/llm_runner.py +152 -0
- dynamic_subgraphs-0.1.0/app/runtime/model_providers.py +307 -0
- dynamic_subgraphs-0.1.0/app/runtime/parallel_map.py +342 -0
- dynamic_subgraphs-0.1.0/app/runtime/runners.py +218 -0
- dynamic_subgraphs-0.1.0/app/runtime/state.py +40 -0
- dynamic_subgraphs-0.1.0/app/runtime/subagents.py +172 -0
- dynamic_subgraphs-0.1.0/app/runtime/subgraph.py +238 -0
- dynamic_subgraphs-0.1.0/app/runtime/tools.py +583 -0
- dynamic_subgraphs-0.1.0/app/runtime/wait_for_event.py +88 -0
- dynamic_subgraphs-0.1.0/app/runtime/wrappers.py +162 -0
- dynamic_subgraphs-0.1.0/app/supervisor/__init__.py +51 -0
- dynamic_subgraphs-0.1.0/app/supervisor/graph.py +235 -0
- dynamic_subgraphs-0.1.0/app/supervisor/iteration.py +525 -0
- dynamic_subgraphs-0.1.0/app/supervisor/llm_planner.py +340 -0
- dynamic_subgraphs-0.1.0/app/supervisor/planner.py +26 -0
- dynamic_subgraphs-0.1.0/app/supervisor/state.py +45 -0
- dynamic_subgraphs-0.1.0/app/supervisor/supervisor.py +510 -0
- dynamic_subgraphs-0.1.0/docs/api.md +208 -0
- dynamic_subgraphs-0.1.0/docs/dynamic-graphs-canonical-design-v1.md +861 -0
- dynamic_subgraphs-0.1.0/docs/dynamic-graphs-design-claude.md +476 -0
- dynamic_subgraphs-0.1.0/docs/dynamic-graphs-design.md +851 -0
- dynamic_subgraphs-0.1.0/docs/evals/model-comparison-2026-06.md +102 -0
- dynamic_subgraphs-0.1.0/docs/index.md +21 -0
- dynamic_subgraphs-0.1.0/docs/iterative-supervisor.md +106 -0
- dynamic_subgraphs-0.1.0/docs/recipes.md +192 -0
- dynamic_subgraphs-0.1.0/docs/sdk-next-steps.md +179 -0
- dynamic_subgraphs-0.1.0/docs/superpowers/plans/2026-05-30-fastapi-layer.md +2519 -0
- dynamic_subgraphs-0.1.0/docs/superpowers/specs/2026-05-30-fastapi-layer-design.md +328 -0
- dynamic_subgraphs-0.1.0/dynamic_subgraphs/__init__.py +65 -0
- dynamic_subgraphs-0.1.0/dynamic_subgraphs/engine.py +525 -0
- dynamic_subgraphs-0.1.0/dynamic_subgraphs/py.typed +1 -0
- dynamic_subgraphs-0.1.0/dynamic_subgraphs/recording.py +169 -0
- dynamic_subgraphs-0.1.0/dynamic_subgraphs/types.py +63 -0
- dynamic_subgraphs-0.1.0/pyproject.toml +145 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Agent memory
|
|
2
|
+
|
|
3
|
+
Compressed, durable knowledge from prior coding-agent sessions so future
|
|
4
|
+
sessions (Claude Code, Cursor, Copilot, etc.) can pick up the project
|
|
5
|
+
context without re-deriving it.
|
|
6
|
+
|
|
7
|
+
These files **complement, don't replace**:
|
|
8
|
+
- `AGENTS.md` — package map + MVP sequence (orientation)
|
|
9
|
+
- `ARCHITECTURE.md` — package boundaries + dependency direction
|
|
10
|
+
- `docs/dynamic-graphs-canonical-design-v1.md` — canonical design brief
|
|
11
|
+
|
|
12
|
+
## Files
|
|
13
|
+
|
|
14
|
+
| File | Read when |
|
|
15
|
+
|---|---|
|
|
16
|
+
| `context.md` | Starting a new session — snapshot of what's shipped |
|
|
17
|
+
| `patterns.md` | About to add a feature — follow the established shapes |
|
|
18
|
+
| `gotchas.md` | Hit a strange LangGraph / OpenAI / Pydantic error |
|
|
19
|
+
| `workflows.md` | Setting up the dev loop, writing tests, debugging |
|
|
20
|
+
| `roadmap.md` | Deciding what to build next |
|
|
21
|
+
|
|
22
|
+
## How to maintain
|
|
23
|
+
|
|
24
|
+
When you learn something a future agent shouldn't have to re-discover:
|
|
25
|
+
- LangGraph / OpenAI / Pydantic surprise → `gotchas.md`
|
|
26
|
+
- Repeatable architectural shape that worked → `patterns.md`
|
|
27
|
+
- Workflow improvement → `workflows.md`
|
|
28
|
+
- Project state changed → `context.md`
|
|
29
|
+
- New candidate slice or shifted priorities → `roadmap.md`
|
|
30
|
+
|
|
31
|
+
Keep entries **short, specific, and load-bearing**. Save another agent a
|
|
32
|
+
debugging cycle. Don't write a textbook.
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# Project context
|
|
2
|
+
|
|
3
|
+
## What this is
|
|
4
|
+
|
|
5
|
+
**Dynamic Subgraphs**: a governed runtime where an LLM synthesizes a
|
|
6
|
+
transient LangGraph workflow per problem. The system validates, compiles,
|
|
7
|
+
executes, and records the graph, then discards the runtime object. Bounded
|
|
8
|
+
by a registry of node kinds — the "language" the planner composes from.
|
|
9
|
+
|
|
10
|
+
The thesis: *the registry is the language; the graph is its temporary
|
|
11
|
+
executable form*. Get the registry right and most other choices are
|
|
12
|
+
recoverable.
|
|
13
|
+
|
|
14
|
+
## What's shipped
|
|
15
|
+
|
|
16
|
+
| Layer | Status | Notes |
|
|
17
|
+
|---|---|---|
|
|
18
|
+
| Models + GraphSpec | ✓ | `app/models/` |
|
|
19
|
+
| Registry + validator | ✓ | `app/registry/` — trust boundary for everything downstream |
|
|
20
|
+
| Compiler (spec → StateGraph) | ✓ | `app/compiler/build.py` |
|
|
21
|
+
| Runtime: executor, runners, wrappers, state | ✓ | `app/runtime/` |
|
|
22
|
+
| Recording (full artifacts per run, failed runs included) | ✓ | `app/recording/`, `runs/<id>/` |
|
|
23
|
+
| Supervisor (plan → validate → execute → record → respond) | ✓ | `app/supervisor/`, with status taxonomy |
|
|
24
|
+
| LLM planner (ChatOpenAI, structured output, validation retry) | ✓ | `app/supervisor/llm_planner.py` |
|
|
25
|
+
| LLM runner for `llm_call` | ✓ | `app/runtime/llm_runner.py` |
|
|
26
|
+
| LLM-backed reduce (`llm_summarize`) | ✓ | same file |
|
|
27
|
+
| `parallel_map` (compiler-native + `Send` + JSON-tolerant input) | ✓ | `app/runtime/parallel_map.py` |
|
|
28
|
+
| `branch` (compiler-native + `add_conditional_edges`) | ✓ | `app/runtime/branch.py` |
|
|
29
|
+
| `wait_for_event` (compiler-native + LangGraph `interrupt()`) | ✓ | `app/runtime/wait_for_event.py` |
|
|
30
|
+
| Executor `checkpointer` + `paused` ExecutionResult + real `resume()` | ✓ | `app/runtime/executor.py` |
|
|
31
|
+
| Recorder `load_validated_spec` + per-call `overwrite` | ✓ | `app/recording/recorder.py` |
|
|
32
|
+
| Supervisor `resume(run_id, event)` + `paused`/`resume_failed` statuses | ✓ | `app/supervisor/supervisor.py` |
|
|
33
|
+
| `spawn_subagent` (echo default + OpenAI factory with role prompts) | ✓ | `app/runtime/subagents.py` |
|
|
34
|
+
| `emit_artifact` (echo default + `CollectingArtifactSink` / `FileArtifactSink`) | ✓ | `app/runtime/artifacts.py` |
|
|
35
|
+
| Shared utility: `render_value_for_prompt` (state.py) — value→prompt rendering | ✓ | dedup'd from llm_runner + subagents |
|
|
36
|
+
| Shared utility: `build_openai_chat` — single ChatOpenAI lazy-import seam | ✓ | `app/runtime/chat_models.py` |
|
|
37
|
+
| `Supervisor.replay(run_id, *, new_run_id=None)` — load recorded spec, re-execute fresh | ✓ | `app/supervisor/supervisor.py` |
|
|
38
|
+
| `Supervisor.run_iteratively(...)` — bounded meta-loop with `IterationDecider` Protocol | ✓ | `app/supervisor/iteration.py` |
|
|
39
|
+
| `LlmIterationDecider` + `build_openai_iteration_decider` — LLM judges output against criteria, emits structured replan/stop/ask/fail decisions | ✓ | `app/supervisor/iteration.py` |
|
|
40
|
+
| Real `tool_call` runners — `web_search` (DuckDuckGo + Bing scrape fallback), `policy_lookup`, `document_extract`, `create_follow_up_task` | ✓ (partial — see roadmap) | `app/runtime/tools.py` |
|
|
41
|
+
| `SearchProvider` Protocol + `TavilySearchProvider` (production) + env-aware factory (`build_default_search_provider`) — Tavily activates automatically when `TAVILY_API_KEY` is set, DDG+Bing fallback otherwise | ✓ | `app/runtime/tools.py` |
|
|
42
|
+
| Chain-level recording — `FileRecorder.record_chain` / `.load_chain`, `Supervisor.run_iteratively(record_chain=True)` writes `runs/<chain_id>/chain.json` + `chain.md` | ✓ | `app/recording/recorder.py` |
|
|
43
|
+
| Judge truncation fix — `LlmIterationDecider` value-render limit raised from 500 → 4000 chars, system prompt notes truncation is display-only | ✓ | `app/supervisor/iteration.py` |
|
|
44
|
+
| `strict_runners` flag on executor — refuses to fall back to default echoes | ✓ | `app/runtime/executor.py` |
|
|
45
|
+
|
|
46
|
+
## Executable node kinds
|
|
47
|
+
|
|
48
|
+
| Kind | Status | Path |
|
|
49
|
+
|---|---|---|
|
|
50
|
+
| `llm_call` | ✓ | runner |
|
|
51
|
+
| `tool_call` | ✓ | runner + fake-tool registry by default; real tools in `app/runtime/tools.py` (partial — see roadmap) |
|
|
52
|
+
| `reduce` | ✓ | runner; strategies: concat, merge_dict, llm_summarize |
|
|
53
|
+
| `parallel_map` | ✓ | compiler-handled (dispatcher/worker/join) |
|
|
54
|
+
| `branch` | ✓ | compiler-handled (passthrough + conditional_edges) |
|
|
55
|
+
| `wait_for_event` | ✓ | compiler-handled (`interrupt()` + checkpointer + resume) |
|
|
56
|
+
| `spawn_subagent` | ✓ | runner-handled; echo default, OpenAI-backed factory |
|
|
57
|
+
| `emit_artifact` | ✓ | runner-handled; echo default, `FileArtifactSink` wired in `main.py` |
|
|
58
|
+
|
|
59
|
+
**All 8 registry kinds executable.** The runtime is functionally complete for phase 1.
|
|
60
|
+
|
|
61
|
+
## Test surface
|
|
62
|
+
|
|
63
|
+
~222 tests under `tests/`, all passing with `uv run pytest -W error`.
|
|
64
|
+
Files mirror modules: `test_registry.py`, `test_validator.py`,
|
|
65
|
+
`test_wrappers.py`, `test_executor.py`, `test_parallel_map.py`,
|
|
66
|
+
`test_branch.py`, `test_wait_for_event.py`, `test_subagents.py`,
|
|
67
|
+
`test_emit_artifact.py`, `test_replay.py`, `test_iterative_supervisor.py`,
|
|
68
|
+
`test_tools.py`, `test_recording.py`, `test_supervisor.py`,
|
|
69
|
+
`test_llm_planner.py`, `test_llm_runner.py`, `test_e2e_pipeline.py`,
|
|
70
|
+
`test_graph_spec.py`.
|
|
71
|
+
|
|
72
|
+
## Demo entrypoint
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
uv run python -m app.main # token-free (StaticPlanner)
|
|
76
|
+
uv run python -m app.main --llm # real LLM planner + runner
|
|
77
|
+
uv run python -m app.main --llm "your prompt"
|
|
78
|
+
uv run python -m app.main --llm --run-id "exp-1"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
`--llm` swaps in `LLMPlanner` + `OpenAILlmRunner` + `LlmReduceRunner` and
|
|
82
|
+
widens the planner's reduce-strategy set to include `llm_summarize`.
|
|
83
|
+
Without `--llm`, every `llm_call` uses the mock runner and reduce is deterministic.
|
|
84
|
+
|
|
85
|
+
## Configuration
|
|
86
|
+
|
|
87
|
+
- `.env` (gitignored): `OPENAI_API_KEY`, `LANGSMITH_*`, optional `TAVILY_API_KEY`
|
|
88
|
+
- `python-dotenv` loaded by main.py
|
|
89
|
+
- Default LLM model: `gpt-5.4-nano` (override with `--model`)
|
|
90
|
+
- When `TAVILY_API_KEY` is set, `web_search` uses Tavily; otherwise falls back to DuckDuckGo+Bing scrape (lower quality, no key required). Free Tavily tier: https://tavily.com
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
# Gotchas
|
|
2
|
+
|
|
3
|
+
Things that bit us. Specific, reproducible, with the fix.
|
|
4
|
+
|
|
5
|
+
## LangGraph
|
|
6
|
+
|
|
7
|
+
### Static edges + `Command(goto=...)` don't override — they add
|
|
8
|
+
|
|
9
|
+
If a node returns `Command(goto="x")`, the **static `add_edge` outgoing
|
|
10
|
+
edges still fire in parallel**. Result: phantom downstream execution that
|
|
11
|
+
sees half-populated state.
|
|
12
|
+
|
|
13
|
+
Where this matters:
|
|
14
|
+
- The **supervisor** uses `add_conditional_edges` for failure routing so
|
|
15
|
+
`goto="respond"` is exclusive.
|
|
16
|
+
- `parallel_map`'s **join** checks `state["errors"]` itself and halts
|
|
17
|
+
with `Command(goto=END)` because workers' own `goto=END` doesn't stop the
|
|
18
|
+
worker→join edge from firing across Send branches.
|
|
19
|
+
|
|
20
|
+
Rule: if you want routing to be exclusive, use `add_conditional_edges`.
|
|
21
|
+
|
|
22
|
+
### `web_search` provider is environment-dependent
|
|
23
|
+
|
|
24
|
+
`build_default_search_provider()` (used by `build_grounded_tools`)
|
|
25
|
+
returns:
|
|
26
|
+
|
|
27
|
+
- `TavilySearchProvider` when `TAVILY_API_KEY` is in the environment.
|
|
28
|
+
This is the production path: LLM-agent-focused search, structured
|
|
29
|
+
snippets with relevance scores, synthesized `answer` field. Free tier
|
|
30
|
+
~1000 searches/month at https://tavily.com.
|
|
31
|
+
- `DuckDuckGoSearchProvider` otherwise. DDG's instant-answer endpoint
|
|
32
|
+
returns mostly definitional content; when that yields nothing, it
|
|
33
|
+
falls back to scraping Bing HTML — fragile, possibly TOS-violating,
|
|
34
|
+
low quality. Acceptable for development and demos.
|
|
35
|
+
|
|
36
|
+
Override explicitly with `build_default_search_provider(prefer_tavily=False)`
|
|
37
|
+
to force DDG even when a key is present, or pass `tavily_api_key=...`
|
|
38
|
+
to supply one without setting the env var.
|
|
39
|
+
|
|
40
|
+
The output shape is uniform across providers — downstream LLM nodes
|
|
41
|
+
consume `{tool, provider, query, answer, results: [{title, url, snippet, score?}]}`
|
|
42
|
+
regardless of which backend ran. Tests that pin a specific provider
|
|
43
|
+
must `monkeypatch.setenv` or `monkeypatch.delenv` for `TAVILY_API_KEY`
|
|
44
|
+
to control selection.
|
|
45
|
+
|
|
46
|
+
### Chain recording layout: chain dir is a sibling of iteration dirs
|
|
47
|
+
|
|
48
|
+
`Supervisor.run_iteratively("...", run_id="X")` produces:
|
|
49
|
+
|
|
50
|
+
    runs/
|
|
51
|
+
      X/          ← chain metadata (chain.json, chain.md)
|
|
52
|
+
      X_iter_1/   ← per-iteration GraphSpec/trace/output/etc.
|
|
53
|
+
      X_iter_2/
|
|
54
|
+
      ...
|
|
55
|
+
|
|
56
|
+
The chain dir and iteration dirs are siblings at the same level (flat
|
|
57
|
+
layout, not nested). To inspect a chain, read `runs/<chain_id>/chain.json`
|
|
58
|
+
or call `recorder.load_chain(chain_id)`. The per-iteration directories
|
|
59
|
+
are normal recorded runs and can be inspected or replayed independently.
|
|
60
|
+
|
|
61
|
+
If `chain_id` collides with an existing run_id (i.e., you ran
|
|
62
|
+
`sup.run(prompt, run_id="X")` and then `sup.run_iteratively(prompt, run_id="X")`),
|
|
63
|
+
the chain recording will overwrite the prior single-run's directory.
|
|
64
|
+
Pick a different `run_id` for chains or use `record_chain=False`.
|
|
65
|
+
|
|
66
|
+
### LlmIterationDecider truncation: 4000 chars per value, not 500
|
|
67
|
+
|
|
68
|
+
The judge sees an "Outputs produced (state.values):" section in its eval
|
|
69
|
+
prompt with each value truncated at `value_render_limit` chars
|
|
70
|
+
(default 4000). Real LLM outputs are routinely 1-4k chars, so the
|
|
71
|
+
default 500 we shipped initially was way too tight — the judge would
|
|
72
|
+
respond "I can't verify, the output looks truncated" to every prompt
|
|
73
|
+
with non-trivial output. The system prompt now explicitly tells the
|
|
74
|
+
judge that truncation is display-only and not to penalize it.
|
|
75
|
+
|
|
76
|
+
If you see "I can't verify" / "appears truncated" in judge gaps, bump
|
|
77
|
+
`value_render_limit` further on the decider construction.
|
|
78
|
+
|
|
79
|
+
### LlmIterationDecider defers obvious cases to the fallback decider
|
|
80
|
+
|
|
81
|
+
The LLM judge does NOT evaluate every iteration. It defers to the
|
|
82
|
+
`fallback` decider (default `StatusIterationDecider`) for:
|
|
83
|
+
|
|
84
|
+
- `paused` (framework will ask the user anyway)
|
|
85
|
+
- `plan_failed` / `validation_failed` / `compile_failed` (no output to judge)
|
|
86
|
+
- `record_failed` / `resume_failed` / `replay_failed` (infrastructure issues)
|
|
87
|
+
- `execution_failed` (unless `judge_failed_runs=True`)
|
|
88
|
+
|
|
89
|
+
The point: don't spend tokens on decisions the framework's status
|
|
90
|
+
taxonomy already settled. The LLM only runs on `ok` runs (and
|
|
91
|
+
optionally `execution_failed`). Tests assert invocation counts (`model.calls`) and
|
|
92
|
+
expect zero LLM calls for paused/error cases.
|
|
93
|
+
|
|
94
|
+
### `build_replan_prompt`'s output goes to the planner as a `prompt`
|
|
95
|
+
|
|
96
|
+
The iterative supervisor calls `Supervisor.run(replan_prompt, ...)` with
|
|
97
|
+
the text `build_replan_prompt` produced. The planner has no separate
|
|
98
|
+
"replan context" channel — it sees the verbose replan text as just a
|
|
99
|
+
new prompt. Currently this works because the verbose text contains the
|
|
100
|
+
original prompt, gaps, and prior outputs, but the planner doesn't
|
|
101
|
+
*structurally* know it's being replanned. A future refinement: add a
|
|
102
|
+
dedicated `replan_context` arg to `Supervisor.run()` so the planner's
|
|
103
|
+
system prompt can react to it explicitly.
|
|
104
|
+
|
|
105
|
+
### Replay does NOT re-plan and does NOT inherit checkpointer state
|
|
106
|
+
|
|
107
|
+
`Supervisor.replay(run_id)` loads the validated spec the recorder
|
|
108
|
+
persisted on the original run, executes it under a *new* `run_id`, and
|
|
109
|
+
writes a new run directory. Notably:
|
|
110
|
+
|
|
111
|
+
- The **planner is not called** during replay. The point is to re-run
|
|
112
|
+
the same shape, not to ask the planner what to do again.
|
|
113
|
+
- The **checkpointer is not seeded** with the original's state. If the
|
|
114
|
+
spec contains `wait_for_event`, the replay pauses fresh from the start
|
|
115
|
+
— it does NOT pick up from where the original left off.
|
|
116
|
+
- The original recording is **untouched**. New artifacts go to
|
|
117
|
+
`runs/<new_run_id>/`. Default `new_run_id` is
|
|
118
|
+
`<original>_replay_<utc_iso_timestamp>` so the original and the replay
|
|
119
|
+
are colocated for easy diffing.
|
|
120
|
+
|
|
121
|
+
Use `replay()` to compare LLM output across model versions or runner
|
|
122
|
+
code changes. Use `resume()` (different method!) to continue a paused
|
|
123
|
+
run from where it stopped.
|
|
124
|
+
|
|
125
|
+
### Echo defaults vs production factories — which is active matters
|
|
126
|
+
|
|
127
|
+
`default_runners()` returns **placeholder echo runners** for every
|
|
128
|
+
runner-handled kind. They're pure, deterministic, no I/O — perfect for
|
|
129
|
+
tests but **never what you want in production**. Each kind has a real
|
|
130
|
+
factory you wire via `runners={}` to override:
|
|
131
|
+
|
|
132
|
+
| Kind | Echo default | Production factory |
|
|
133
|
+
|-----------------|---------------------|-----------------------------------------------|
|
|
134
|
+
| `llm_call` | `run_llm_call` | `build_openai_llm_runner` -> `OpenAILlmRunner`|
|
|
135
|
+
| `tool_call` | `run_tool_call` (uses `DEFAULT_FAKE_TOOLS`) | real tools in `app/runtime/tools.py` (partial — see roadmap) |
|
|
136
|
+
| `reduce` | `run_reduce` (concat/merge_dict only) | `build_openai_reduce_runner` (adds llm_summarize) |
|
|
137
|
+
| `spawn_subagent`| `run_spawn_subagent`| `build_openai_spawn_subagent_runner` |
|
|
138
|
+
| `emit_artifact` | `run_emit_artifact` | `make_emit_artifact_runner(FileArtifactSink(...))` |
|
|
139
|
+
|
|
140
|
+
`main.py` swaps to production factories when `--llm` is set (or for
|
|
141
|
+
`emit_artifact`, always — file persistence is useful even in no-LLM
|
|
142
|
+
demos). If you ship a new client of the supervisor, you must wire the
|
|
143
|
+
production factories explicitly. The compiler will happily run with
|
|
144
|
+
echoes — it has no way to tell.
|
|
145
|
+
|
|
146
|
+
Hardening (now shipped): a `strict_runners=True` mode on the executor that
|
|
147
|
+
refuses to fall back to echoes. Flagged in roadmap.md.
|
|
148
|
+
|
|
149
|
+
### Subagent registry must match the Registry's `subagents` allowlist
|
|
150
|
+
|
|
151
|
+
The `Registry.subagents` set is what the validator checks `agent_name`
|
|
152
|
+
against. The `runners={NodeKind.SPAWN_SUBAGENT: make_spawn_subagent_runner(...)}`
|
|
153
|
+
is what actually executes the call.
|
|
154
|
+
|
|
155
|
+
If those two diverge — e.g., you advertise `critic` in the Registry but
|
|
156
|
+
only wire `document_specialist` in the runner — the validator passes but
|
|
157
|
+
the runtime raises `RuntimeError("No subagent registered for 'critic'")`,
|
|
158
|
+
caught as `execution_failed`.
|
|
159
|
+
|
|
160
|
+
Keep them aligned at the same wiring layer (main.py / the supervisor
|
|
161
|
+
construction site). `DEFAULT_SUBAGENT_SYSTEM_PROMPTS` covers the default
|
|
162
|
+
allowlist, but if you add a new name to the allowlist you must add a
|
|
163
|
+
prompt and a wired subagent for it too.
|
|
164
|
+
|
|
165
|
+
### Windows console can't print Unicode by default
|
|
166
|
+
|
|
167
|
+
LLM responses regularly contain `→`, `—`, curly quotes, etc. Windows'
|
|
168
|
+
default `cp1252` codec crashes when `print()` hits these characters.
|
|
169
|
+
`main.py` reconfigures `sys.stdout`/`sys.stderr` to UTF-8 with
|
|
170
|
+
`errors="replace"` at startup so the demo never dies for display reasons.
|
|
171
|
+
|
|
172
|
+
If you write new scripts that print LLM output, do the same:
|
|
173
|
+
```python
|
|
174
|
+
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### `interrupt()` raises on the first pass, returns on resume
|
|
178
|
+
|
|
179
|
+
`langgraph.types.interrupt(value)` does NOT return on the first execution
|
|
180
|
+
— it raises `GraphInterrupt` which LangGraph catches to persist state and
|
|
181
|
+
pause. On resume (`Command(resume=value)`), the same call returns the
|
|
182
|
+
resume value.
|
|
183
|
+
|
|
184
|
+
Implication for `wait_for_event`: any state-update code *before* the
|
|
185
|
+
interrupt call never makes it into state on the first pass. The
|
|
186
|
+
`make_wait_for_event_node` factory emits START + FINISH events *after*
|
|
187
|
+
the interrupt returns — meaning both events get their timestamps from
|
|
188
|
+
the resume pass, not the original pause. If you need to see "the run is
|
|
189
|
+
currently paused" in the trace, the supervisor or recorder must emit
|
|
190
|
+
that signal, not the wait node itself.
|
|
191
|
+
|
|
192
|
+
### `wait_for_event` requires a checkpointer at compile time, not run time
|
|
193
|
+
|
|
194
|
+
`LangGraphExecutor.compile()` raises `GraphCompilationError` if the spec
|
|
195
|
+
contains any `wait_for_event` node and the executor has no checkpointer.
|
|
196
|
+
This is deliberate: a missing checkpointer would otherwise silently turn
|
|
197
|
+
`interrupt()` into a runtime crash deep inside LangGraph internals.
|
|
198
|
+
|
|
199
|
+
For tests: `MemorySaver()` from `langgraph.checkpoint.memory`. For
|
|
200
|
+
production durability: `SqliteSaver(...)`.
|
|
201
|
+
|
|
202
|
+
### Resume uses `thread_id` = `run_id`
|
|
203
|
+
|
|
204
|
+
The executor passes `config={"configurable": {"thread_id": run_id}}` to
|
|
205
|
+
both `invoke` and `resume`. The checkpointer keys persisted state by
|
|
206
|
+
`thread_id`, so the supervisor's `run_id` is what ties a paused run to
|
|
207
|
+
its resumed continuation. Don't reuse `run_id` across logically-distinct
|
|
208
|
+
flows — it'll cross-contaminate checkpointer state.
|
|
209
|
+
|
|
210
|
+
### Branch routing requires character-for-character node-id match
|
|
211
|
+
|
|
212
|
+
The `branch` design uses the same name for both "the decision string" and
|
|
213
|
+
"the target node id". If an upstream LLM-call is told to emit "factual"
|
|
214
|
+
but the branch's `branches` list is `["factual_answer", "opinion_answer"]`,
|
|
215
|
+
the branch will halt because `"factual"` isn't in the branches set.
|
|
216
|
+
|
|
217
|
+
When prompting the planner (or hand-writing specs), the upstream node's
|
|
218
|
+
output must emit values that **exactly match** one of the branch names.
|
|
219
|
+
The planner prompt warns about this; reinforce it in node-specific
|
|
220
|
+
instructions when needed.
|
|
221
|
+
|
|
222
|
+
### `Send` payload IS the worker's state input
|
|
223
|
+
|
|
224
|
+
`Send(target, payload)` sets `payload` as that worker invocation's
|
|
225
|
+
complete state. Different Sends to the same target get isolated state.
|
|
226
|
+
Worker returns merge into global state via reducers.
|
|
227
|
+
|
|
228
|
+
To pass per-worker context: put it directly in the payload. Don't try to
|
|
229
|
+
broadcast a global "dispatch table" — that's just state pollution.
|
|
230
|
+
|
|
231
|
+
### Import cycle: `runtime` ↔ `compiler`
|
|
232
|
+
|
|
233
|
+
`app/runtime/__init__.py` exports `LangGraphExecutor`. The executor needs
|
|
234
|
+
`app.compiler.build`. The compiler imports `app.runtime.wrappers` and
|
|
235
|
+
`app.runtime.parallel_map`. → cycle.
|
|
236
|
+
|
|
237
|
+
Resolution: `LangGraphExecutor.compile()` imports `app.compiler.build`
|
|
238
|
+
**inside the method body**, not at module level. Don't move it back to a
|
|
239
|
+
top-level import.
|
|
240
|
+
|
|
241
|
+
### `StateGraph.compile()` is a runtime call, not a build step
|
|
242
|
+
|
|
243
|
+
You can call it inside a node, mid-execution. The whole project relies on
|
|
244
|
+
this — every supervisor run compiles a fresh transient graph from the
|
|
245
|
+
planner's spec.
|
|
246
|
+
|
|
247
|
+
## OpenAI structured output
|
|
248
|
+
|
|
249
|
+
### Strict json_schema mode rejects open dicts
|
|
250
|
+
|
|
251
|
+
`chat.with_structured_output(GraphSpec)` defaults to
|
|
252
|
+
`method="json_schema"` which requires `additionalProperties: false` on
|
|
253
|
+
every object schema. `GraphSpec` has `NodeSpec.params: dict[str, Any]`
|
|
254
|
+
which can't satisfy that — strict mode returns a 400.
|
|
255
|
+
|
|
256
|
+
Fix: `chat.with_structured_output(GraphSpec, method="function_calling")`.
|
|
257
|
+
Function-calling mode is more permissive and works with open dicts.
|
|
258
|
+
|
|
259
|
+
### LLM list outputs arrive as JSON-encoded strings
|
|
260
|
+
|
|
261
|
+
When `llm_call` is asked to "produce a JSON list of X", the runner
|
|
262
|
+
returns the *string* `'["a", "b", "c"]'`, not a Python list. Any
|
|
263
|
+
downstream consumer (like `parallel_map`'s `over` source) sees a string.
|
|
264
|
+
|
|
265
|
+
`parallel_map` opportunistically decodes two shapes:
|
|
266
|
+
1. Bare list: `"[a, b, c]"`
|
|
267
|
+
2. Single-key object: `"{\"<over_key>\": [a, b, c]}"` — LLMs *frequently*
|
|
268
|
+
wrap their list in an object whose key matches the requested name.
|
|
269
|
+
|
|
270
|
+
If you add another upstream-list consumer, do the same opportunistic
|
|
271
|
+
decode.
|
|
272
|
+
|
|
273
|
+
### Planner needs literal "START" / "END" strings
|
|
274
|
+
|
|
275
|
+
The planner can produce edges that don't use the literal `"START"` /
|
|
276
|
+
`"END"` sentinels, leading to `no_start_to_end_path` /
|
|
277
|
+
`unreachable_node` validation errors. The system prompt has a worked
|
|
278
|
+
example with the exact JSON shape — **don't remove it**.
|
|
279
|
+
|
|
280
|
+
### Input/output keys need character-for-character matches
|
|
281
|
+
|
|
282
|
+
A downstream node's `inputs: ["sources"]` must match exactly some upstream
|
|
283
|
+
node's `outputs: ["sources"]`. The validator rejects "source", "Sources",
|
|
284
|
+
"the_sources", etc. as `missing_upstream_input`.
|
|
285
|
+
|
|
286
|
+
The planner's retry message includes the set of declared output keys
|
|
287
|
+
from the previous attempt so the model can rename or align.
|
|
288
|
+
|
|
289
|
+
## Pydantic
|
|
290
|
+
|
|
291
|
+
### `AIMessage.content` enforces string type
|
|
292
|
+
|
|
293
|
+
Can't construct `AIMessage(content=42)` for tests — Pydantic v2 rejects
|
|
294
|
+
non-string content. Use a duck-typed class with `.content` attribute
|
|
295
|
+
instead:
|
|
296
|
+
|
|
297
|
+
```python
|
|
298
|
+
class _NonStringResponse:
|
|
299
|
+
    content = 42
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
### `EdgeSpec` uses `from_` with alias `"from"`
|
|
303
|
+
|
|
304
|
+
`from` is a Python keyword. The model has `populate_by_name=True` so both
|
|
305
|
+
`from_` and `from` work as input. **Always dump with `by_alias=True`** for
|
|
306
|
+
external artifacts (spec.json, planner outputs, etc.) so JSON consumers
|
|
307
|
+
see `"from"`.
|
|
308
|
+
|
|
309
|
+
## Python 3.13
|
|
310
|
+
|
|
311
|
+
### `datetime.utcnow()` is deprecated
|
|
312
|
+
|
|
313
|
+
Use `datetime.now(UTC)` everywhere. `-W error` catches this in CI.
|
|
314
|
+
|
|
315
|
+
## Dev environment
|
|
316
|
+
|
|
317
|
+
### `python app/main.py` vs `python -m app.main`
|
|
318
|
+
|
|
319
|
+
Running the file directly doesn't add the project root to `sys.path`, so
|
|
320
|
+
`from app.X import Y` fails. Either use `-m app.main` OR include the
|
|
321
|
+
`sys.path` bootstrap that's in main.py's header (gated on
|
|
322
|
+
`__name__ == "__main__" and __package__ in (None, "")`).
|
|
323
|
+
|
|
324
|
+
### Secrets in `.env`
|
|
325
|
+
|
|
326
|
+
`.env` is gitignored. `python-dotenv` `load_dotenv()` runs at the top of
|
|
327
|
+
main.py. If a key shows up in a transcript or PR, **rotate it** — local
|
|
328
|
+
git history isn't the only place keys can leak.
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# Established patterns
|
|
2
|
+
|
|
3
|
+
Patterns that worked across multiple slices. New code should follow them.
|
|
4
|
+
|
|
5
|
+
## 1. Dependency injection + lazy factory imports
|
|
6
|
+
|
|
7
|
+
When adding an external integration:
|
|
8
|
+
|
|
9
|
+
1. The class accepts the **abstract interface** in its constructor
|
|
10
|
+
(`BaseChatModel`, a `Runnable`, etc.).
|
|
11
|
+
2. A **factory function** constructs the concrete dependency and wires it.
|
|
12
|
+
3. The factory's heavy import (e.g., `langchain_openai`) is **local to the
|
|
13
|
+
factory body**, not at module level.
|
|
14
|
+
|
|
15
|
+
Examples: `OpenAILlmRunner`, `LLMPlanner`, `LlmReduceRunner` all take
|
|
16
|
+
`BaseChatModel` in constructor; `build_openai_*` factories import
|
|
17
|
+
`ChatOpenAI` locally.
|
|
18
|
+
|
|
19
|
+
Why: callers using mocks or alternate providers don't pay the optional-dep
|
|
20
|
+
cost. Tests construct fakes directly without any provider import.
|
|
21
|
+
|
|
22
|
+
## 2. Planner introspects what runtime can execute
|
|
23
|
+
|
|
24
|
+
`LLMPlanner`'s system prompt is templated from the actual runtime state:
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
executable_kinds = default_runners().keys() | COMPILER_HANDLED_KINDS
|
|
28
|
+
executable_reduce_strategies = ...  # injected per main.py wiring
|
|
29
|
+
tools = registry.tools
|
|
30
|
+
subagents = registry.subagents
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
When you add a new executable kind / reduce strategy / allowlisted tool,
|
|
34
|
+
the planner automatically advertises it. **Do not hardcode capability
|
|
35
|
+
lists in the prompt** — template them.
|
|
36
|
+
|
|
37
|
+
## 3. Node kinds: runner-handled vs compiler-handled
|
|
38
|
+
|
|
39
|
+
Two paths to "executable":
|
|
40
|
+
|
|
41
|
+
| Path | Mechanism | Use when | Examples |
|
|
42
|
+
|---|---|---|---|
|
|
43
|
+
| Runner-handled | `NodeRunner = (state, params) → dict`, registered in `default_runners()`, wrapped by `make_node_wrapper` | Kind has clean `(state, params) → result` semantics | `llm_call`, `tool_call`, `reduce` |
|
|
44
|
+
| Compiler-handled | Kind in `COMPILER_HANDLED_KINDS`; compiler emits multiple LangGraph nodes per `NodeSpec` | Kind needs `Send` fan-out, special edge wiring, or multiple internal nodes | `parallel_map` (dispatcher/worker/join) |
|
|
45
|
+
|
|
46
|
+
Both contribute to the union the planner sees. Pick the simpler path
|
|
47
|
+
unless you need multi-node expansion.
|
|
48
|
+
|
|
49
|
+
## 4. State envelope: explicit reducers
|
|
50
|
+
|
|
51
|
+
`DynamicRunState` is a `TypedDict` with `Annotated` reducer channels:
|
|
52
|
+
|
|
53
|
+
| Key | Reducer | Why |
|
|
54
|
+
|---|---|---|
|
|
55
|
+
| `values` | `merge_dicts` | Top-level dict merge; last-write-wins **per key** |
|
|
56
|
+
| `artifacts` | `merge_dicts` | Same |
|
|
57
|
+
| `metadata` | `merge_dicts` | Same |
|
|
58
|
+
| `errors` | `operator.add` | Append-only |
|
|
59
|
+
| `events` | `operator.add` | Append-only (trace) |
|
|
60
|
+
|
|
61
|
+
**Implication**: multiple parallel writers to the same `values` key
|
|
62
|
+
overwrite each other. To collect N parallel results, use **distinct keys**
|
|
63
|
+
(see `parallel_map`'s `<output_key>__<idx>` slot pattern) or a different
|
|
64
|
+
reducer.
|
|
65
|
+
|
|
66
|
+
## 5. Supervisor status taxonomy
|
|
67
|
+
|
|
68
|
+
Every known failure mode has a status string. To add a new failure stage:
|
|
69
|
+
|
|
70
|
+
1. Add the new status to the taxonomy doc on `SupervisorState`.
|
|
71
|
+
2. Catch the **specific** exception in the relevant supervisor node.
|
|
72
|
+
3. Return `{"status": "<x>_failed", "errors": [structured entry]}`.
|
|
73
|
+
4. Route via `add_conditional_edges` to short-circuit if needed.
|
|
74
|
+
5. Add a branch in `respond`'s response-message switch.
|
|
75
|
+
|
|
76
|
+
**Don't crash; classify**. Tests assert on status strings, not stack traces.
|
|
77
|
+
|
|
78
|
+
## 6. Validator is the trust boundary
|
|
79
|
+
|
|
80
|
+
Everything downstream of `validate_graph_spec` assumes well-formed input.
|
|
81
|
+
Compiler doesn't re-validate topology. Runners don't re-validate params.
|
|
82
|
+
Recorder writes whatever it's given.
|
|
83
|
+
|
|
84
|
+
If something needs checking, **add it to the validator**, not downstream.
|
|
85
|
+
|
|
86
|
+
## 7. Recording: failed runs are first class
|
|
87
|
+
|
|
88
|
+
The recorder writes a full set of artifacts for *every* run, including
|
|
89
|
+
failures. The supervisor catches recording exceptions and emits status
|
|
90
|
+
`record_failed` instead of crashing.
|
|
91
|
+
|
|
92
|
+
Never let recording errors kill the request.
|
|
93
|
+
|
|
94
|
+
## 8. Per-kind output mapping convention
|
|
95
|
+
|
|
96
|
+
Runners return `{"result": value}` by default; the wrapper maps it to the
|
|
97
|
+
node's `outputs[0]`. If a runner returns named outputs that exactly match
|
|
98
|
+
`outputs[*]`, they're routed directly. On a mismatch the wrapper raises, and the exception is
|
|
99
|
+
caught by the wrapper itself as an `errors` entry.
|
|
100
|
+
|
|
101
|
+
## 9. Surgical fix loop when LLM smoke fails
|
|
102
|
+
|
|
103
|
+
When `--llm` produces a failure status, each iteration should move the
|
|
104
|
+
failure **further down the pipeline**, never sideways:
|
|
105
|
+
|
|
106
|
+
1. Read the **stage** (`plan_failed`, `validation_failed`, …) and the
|
|
107
|
+
issue code or error message.
|
|
108
|
+
2. Make **one** surgical change targeting that exact failure.
|
|
109
|
+
3. Re-run `pytest -W error` (token-free) — must remain green.
|
|
110
|
+
4. Re-run `--llm` once.
|
|
111
|
+
5. Confirm the failure moved down the pipeline; repeat.
|
|
112
|
+
|
|
113
|
+
This works because the supervisor's status taxonomy names every failure
|
|
114
|
+
specifically. Don't bypass the taxonomy.
|