dynamic-subgraphs 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. dynamic_subgraphs-0.1.0/.claude/README.md +32 -0
  2. dynamic_subgraphs-0.1.0/.claude/context.md +90 -0
  3. dynamic_subgraphs-0.1.0/.claude/gotchas.md +328 -0
  4. dynamic_subgraphs-0.1.0/.claude/patterns.md +114 -0
  5. dynamic_subgraphs-0.1.0/.claude/roadmap.md +180 -0
  6. dynamic_subgraphs-0.1.0/.claude/workflows.md +96 -0
  7. dynamic_subgraphs-0.1.0/.editorconfig +18 -0
  8. dynamic_subgraphs-0.1.0/.env.example +23 -0
  9. dynamic_subgraphs-0.1.0/.gitignore +31 -0
  10. dynamic_subgraphs-0.1.0/.pre-commit-config.yaml +31 -0
  11. dynamic_subgraphs-0.1.0/.python-version +1 -0
  12. dynamic_subgraphs-0.1.0/AGENTS.md +44 -0
  13. dynamic_subgraphs-0.1.0/ARCHITECTURE.md +40 -0
  14. dynamic_subgraphs-0.1.0/CHANGELOG.md +38 -0
  15. dynamic_subgraphs-0.1.0/CODE_OF_CONDUCT.md +40 -0
  16. dynamic_subgraphs-0.1.0/CONTRIBUTING.md +64 -0
  17. dynamic_subgraphs-0.1.0/LICENSE +201 -0
  18. dynamic_subgraphs-0.1.0/NOTICE +24 -0
  19. dynamic_subgraphs-0.1.0/PKG-INFO +335 -0
  20. dynamic_subgraphs-0.1.0/README.md +291 -0
  21. dynamic_subgraphs-0.1.0/SECURITY.md +38 -0
  22. dynamic_subgraphs-0.1.0/app/__init__.py +1 -0
  23. dynamic_subgraphs-0.1.0/app/api/__init__.py +6 -0
  24. dynamic_subgraphs-0.1.0/app/api/__main__.py +18 -0
  25. dynamic_subgraphs-0.1.0/app/api/app.py +32 -0
  26. dynamic_subgraphs-0.1.0/app/api/deps.py +146 -0
  27. dynamic_subgraphs-0.1.0/app/api/errors.py +67 -0
  28. dynamic_subgraphs-0.1.0/app/api/jobs.py +136 -0
  29. dynamic_subgraphs-0.1.0/app/api/routers/__init__.py +1 -0
  30. dynamic_subgraphs-0.1.0/app/api/routers/chains.py +170 -0
  31. dynamic_subgraphs-0.1.0/app/api/routers/health.py +11 -0
  32. dynamic_subgraphs-0.1.0/app/api/routers/registry.py +38 -0
  33. dynamic_subgraphs-0.1.0/app/api/routers/runs.py +300 -0
  34. dynamic_subgraphs-0.1.0/app/api/run_config_store.py +51 -0
  35. dynamic_subgraphs-0.1.0/app/api/schemas.py +86 -0
  36. dynamic_subgraphs-0.1.0/app/api/serialize.py +76 -0
  37. dynamic_subgraphs-0.1.0/app/api/settings.py +53 -0
  38. dynamic_subgraphs-0.1.0/app/assembly.py +256 -0
  39. dynamic_subgraphs-0.1.0/app/compiler/__init__.py +6 -0
  40. dynamic_subgraphs-0.1.0/app/compiler/build.py +168 -0
  41. dynamic_subgraphs-0.1.0/app/compiler/errors.py +5 -0
  42. dynamic_subgraphs-0.1.0/app/main.py +202 -0
  43. dynamic_subgraphs-0.1.0/app/models/__init__.py +29 -0
  44. dynamic_subgraphs-0.1.0/app/models/graph_spec.py +51 -0
  45. dynamic_subgraphs-0.1.0/app/models/node_kinds.py +13 -0
  46. dynamic_subgraphs-0.1.0/app/models/run_state.py +44 -0
  47. dynamic_subgraphs-0.1.0/app/models/trace.py +31 -0
  48. dynamic_subgraphs-0.1.0/app/py.typed +1 -0
  49. dynamic_subgraphs-0.1.0/app/recording/__init__.py +27 -0
  50. dynamic_subgraphs-0.1.0/app/recording/mermaid.py +27 -0
  51. dynamic_subgraphs-0.1.0/app/recording/recorder.py +646 -0
  52. dynamic_subgraphs-0.1.0/app/registry/__init__.py +22 -0
  53. dynamic_subgraphs-0.1.0/app/registry/allowlists.py +30 -0
  54. dynamic_subgraphs-0.1.0/app/registry/definitions.py +91 -0
  55. dynamic_subgraphs-0.1.0/app/registry/errors.py +20 -0
  56. dynamic_subgraphs-0.1.0/app/registry/params.py +88 -0
  57. dynamic_subgraphs-0.1.0/app/registry/registry.py +214 -0
  58. dynamic_subgraphs-0.1.0/app/registry/validator.py +348 -0
  59. dynamic_subgraphs-0.1.0/app/runtime/__init__.py +133 -0
  60. dynamic_subgraphs-0.1.0/app/runtime/artifacts.py +176 -0
  61. dynamic_subgraphs-0.1.0/app/runtime/branch.py +103 -0
  62. dynamic_subgraphs-0.1.0/app/runtime/chat_models.py +39 -0
  63. dynamic_subgraphs-0.1.0/app/runtime/executor.py +304 -0
  64. dynamic_subgraphs-0.1.0/app/runtime/llm_runner.py +152 -0
  65. dynamic_subgraphs-0.1.0/app/runtime/model_providers.py +307 -0
  66. dynamic_subgraphs-0.1.0/app/runtime/parallel_map.py +342 -0
  67. dynamic_subgraphs-0.1.0/app/runtime/runners.py +218 -0
  68. dynamic_subgraphs-0.1.0/app/runtime/state.py +40 -0
  69. dynamic_subgraphs-0.1.0/app/runtime/subagents.py +172 -0
  70. dynamic_subgraphs-0.1.0/app/runtime/subgraph.py +238 -0
  71. dynamic_subgraphs-0.1.0/app/runtime/tools.py +583 -0
  72. dynamic_subgraphs-0.1.0/app/runtime/wait_for_event.py +88 -0
  73. dynamic_subgraphs-0.1.0/app/runtime/wrappers.py +162 -0
  74. dynamic_subgraphs-0.1.0/app/supervisor/__init__.py +51 -0
  75. dynamic_subgraphs-0.1.0/app/supervisor/graph.py +235 -0
  76. dynamic_subgraphs-0.1.0/app/supervisor/iteration.py +525 -0
  77. dynamic_subgraphs-0.1.0/app/supervisor/llm_planner.py +340 -0
  78. dynamic_subgraphs-0.1.0/app/supervisor/planner.py +26 -0
  79. dynamic_subgraphs-0.1.0/app/supervisor/state.py +45 -0
  80. dynamic_subgraphs-0.1.0/app/supervisor/supervisor.py +510 -0
  81. dynamic_subgraphs-0.1.0/docs/api.md +208 -0
  82. dynamic_subgraphs-0.1.0/docs/dynamic-graphs-canonical-design-v1.md +861 -0
  83. dynamic_subgraphs-0.1.0/docs/dynamic-graphs-design-claude.md +476 -0
  84. dynamic_subgraphs-0.1.0/docs/dynamic-graphs-design.md +851 -0
  85. dynamic_subgraphs-0.1.0/docs/evals/model-comparison-2026-06.md +102 -0
  86. dynamic_subgraphs-0.1.0/docs/index.md +21 -0
  87. dynamic_subgraphs-0.1.0/docs/iterative-supervisor.md +106 -0
  88. dynamic_subgraphs-0.1.0/docs/recipes.md +192 -0
  89. dynamic_subgraphs-0.1.0/docs/sdk-next-steps.md +179 -0
  90. dynamic_subgraphs-0.1.0/docs/superpowers/plans/2026-05-30-fastapi-layer.md +2519 -0
  91. dynamic_subgraphs-0.1.0/docs/superpowers/specs/2026-05-30-fastapi-layer-design.md +328 -0
  92. dynamic_subgraphs-0.1.0/dynamic_subgraphs/__init__.py +65 -0
  93. dynamic_subgraphs-0.1.0/dynamic_subgraphs/engine.py +525 -0
  94. dynamic_subgraphs-0.1.0/dynamic_subgraphs/py.typed +1 -0
  95. dynamic_subgraphs-0.1.0/dynamic_subgraphs/recording.py +169 -0
  96. dynamic_subgraphs-0.1.0/dynamic_subgraphs/types.py +63 -0
  97. dynamic_subgraphs-0.1.0/pyproject.toml +145 -0
@@ -0,0 +1,32 @@
1
+ # Agent memory
2
+
3
+ Compressed, durable knowledge from prior coding-agent sessions so future
4
+ sessions (Claude Code, Cursor, Copilot, etc.) can pick up the project
5
+ context without re-deriving it.
6
+
7
+ These files **complement, don't replace**:
8
+ - `AGENTS.md` — package map + MVP sequence (orientation)
9
+ - `ARCHITECTURE.md` — package boundaries + dependency direction
10
+ - `docs/dynamic-graphs-canonical-design-v1.md` — canonical design brief
11
+
12
+ ## Files
13
+
14
+ | File | Read when |
15
+ |---|---|
16
+ | `context.md` | Starting a new session — snapshot of what's shipped |
17
+ | `patterns.md` | About to add a feature — follow the established shapes |
18
+ | `gotchas.md` | Hit a strange LangGraph / OpenAI / Pydantic error |
19
+ | `workflows.md` | Setting up the dev loop, writing tests, debugging |
20
+ | `roadmap.md` | Deciding what to build next |
21
+
22
+ ## How to maintain
23
+
24
+ When you learn something a future agent shouldn't have to re-discover:
25
+ - LangGraph / OpenAI / Pydantic surprise → `gotchas.md`
26
+ - Repeatable architectural shape that worked → `patterns.md`
27
+ - Workflow improvement → `workflows.md`
28
+ - Project state changed → `context.md`
29
+ - New candidate slice or shifted priorities → `roadmap.md`
30
+
31
+ Keep entries **short, specific, and load-bearing**. Save another agent a
32
+ debugging cycle. Don't write a textbook.
@@ -0,0 +1,90 @@
1
+ # Project context
2
+
3
+ ## What this is
4
+
5
+ **Dynamic Subgraphs**: a governed runtime where an LLM synthesizes a
6
+ transient LangGraph workflow per problem. The system validates, compiles,
7
+ executes, and records the graph, then discards the runtime object. Bounded
8
+ by a registry of node kinds — the "language" the planner composes from.
9
+
10
+ The thesis: *the registry is the language; the graph is its temporary
11
+ executable form*. Get the registry right and most other choices are
12
+ recoverable.
13
+
14
+ ## What's shipped
15
+
16
+ | Layer | Status | Notes |
17
+ |---|---|---|
18
+ | Models + GraphSpec | ✓ | `app/models/` |
19
+ | Registry + validator | ✓ | `app/registry/` — trust boundary for everything downstream |
20
+ | Compiler (spec → StateGraph) | ✓ | `app/compiler/build.py` |
21
+ | Runtime: executor, runners, wrappers, state | ✓ | `app/runtime/` |
22
+ | Recording (full artifacts per run, failed runs included) | ✓ | `app/recording/`, `runs/<id>/` |
23
+ | Supervisor (plan → validate → execute → record → respond) | ✓ | `app/supervisor/`, with status taxonomy |
24
+ | LLM planner (ChatOpenAI, structured output, validation retry) | ✓ | `app/supervisor/llm_planner.py` |
25
+ | LLM runner for `llm_call` | ✓ | `app/runtime/llm_runner.py` |
26
+ | LLM-backed reduce (`llm_summarize`) | ✓ | same file |
27
+ | `parallel_map` (compiler-native + `Send` + JSON-tolerant input) | ✓ | `app/runtime/parallel_map.py` |
28
+ | `branch` (compiler-native + `add_conditional_edges`) | ✓ | `app/runtime/branch.py` |
29
+ | `wait_for_event` (compiler-native + LangGraph `interrupt()`) | ✓ | `app/runtime/wait_for_event.py` |
30
+ | Executor `checkpointer` + `paused` ExecutionResult + real `resume()` | ✓ | `app/runtime/executor.py` |
31
+ | Recorder `load_validated_spec` + per-call `overwrite` | ✓ | `app/recording/recorder.py` |
32
+ | Supervisor `resume(run_id, event)` + `paused`/`resume_failed` statuses | ✓ | `app/supervisor/supervisor.py` |
33
+ | `spawn_subagent` (echo default + OpenAI factory with role prompts) | ✓ | `app/runtime/subagents.py` |
34
+ | `emit_artifact` (echo default + `CollectingArtifactSink` / `FileArtifactSink`) | ✓ | `app/runtime/artifacts.py` |
35
+ | Shared utility: `render_value_for_prompt` (state.py) — value→prompt rendering | ✓ | dedup'd from llm_runner + subagents |
36
+ | Shared utility: `build_openai_chat` — single ChatOpenAI lazy-import seam | ✓ | `app/runtime/chat_models.py` |
37
+ | `Supervisor.replay(run_id, *, new_run_id=None)` — load recorded spec, re-execute fresh | ✓ | `app/supervisor/supervisor.py` |
38
+ | `Supervisor.run_iteratively(...)` — bounded meta-loop with `IterationDecider` Protocol | ✓ | `app/supervisor/iteration.py` |
39
+ | `LlmIterationDecider` + `build_openai_iteration_decider` — LLM judges output against criteria, emits structured replan/stop/ask/fail decisions | ✓ | `app/supervisor/iteration.py` |
40
+ | Real `tool_call` runners — `web_search` (DuckDuckGo + Bing scrape fallback), `policy_lookup`, `document_extract`, `create_follow_up_task` | ✓ (partial — see roadmap) | `app/runtime/tools.py` |
41
+ | `SearchProvider` Protocol + `TavilySearchProvider` (production) + env-aware factory (`build_default_search_provider`) — Tavily activates automatically when `TAVILY_API_KEY` is set, DDG+Bing fallback otherwise | ✓ | `app/runtime/tools.py` |
42
+ | Chain-level recording — `FileRecorder.record_chain` / `.load_chain`, `Supervisor.run_iteratively(record_chain=True)` writes `runs/<chain_id>/chain.json` + `chain.md` | ✓ | `app/recording/recorder.py` |
43
+ | Judge truncation fix — `LlmIterationDecider` value-render limit raised from 500 → 4000 chars, system prompt notes truncation is display-only | ✓ | `app/supervisor/iteration.py` |
44
+ | `strict_runners` flag on executor — refuses to fall back to default echoes | ✓ | `app/runtime/executor.py` |
45
+
46
+ ## Executable node kinds
47
+
48
+ | Kind | Status | Path |
49
+ |---|---|---|
50
+ | `llm_call` | ✓ | runner |
51
+ | `tool_call` | ✓ | runner; fake-tool registry by default, real tools partially shipped in `app/runtime/tools.py` (see roadmap) |
52
+ | `reduce` | ✓ | runner; strategies: concat, merge_dict, llm_summarize |
53
+ | `parallel_map` | ✓ | compiler-handled (dispatcher/worker/join) |
54
+ | `branch` | ✓ | compiler-handled (passthrough + conditional_edges) |
55
+ | `wait_for_event` | ✓ | compiler-handled (`interrupt()` + checkpointer + resume) |
56
+ | `spawn_subagent` | ✓ | runner-handled; echo default, OpenAI-backed factory |
57
+ | `emit_artifact` | ✓ | runner-handled; echo default, `FileArtifactSink` wired in `main.py` |
58
+
59
+ **All 8 registry kinds executable.** The runtime is functionally complete for phase 1.
60
+
61
+ ## Test surface
62
+
63
+ ~222 tests under `tests/`, all passing with `uv run pytest -W error`.
64
+ Files mirror modules: `test_registry.py`, `test_validator.py`,
65
+ `test_wrappers.py`, `test_executor.py`, `test_parallel_map.py`,
66
+ `test_branch.py`, `test_wait_for_event.py`, `test_subagents.py`,
67
+ `test_emit_artifact.py`, `test_replay.py`, `test_iterative_supervisor.py`,
68
+ `test_tools.py`, `test_recording.py`, `test_supervisor.py`,
69
+ `test_llm_planner.py`, `test_llm_runner.py`, `test_e2e_pipeline.py`,
70
+ `test_graph_spec.py`.
71
+
72
+ ## Demo entrypoint
73
+
74
+ ```
75
+ uv run python -m app.main # token-free (StaticPlanner)
76
+ uv run python -m app.main --llm # real LLM planner + runner
77
+ uv run python -m app.main --llm "your prompt"
78
+ uv run python -m app.main --llm --run-id "exp-1"
79
+ ```
80
+
81
+ `--llm` swaps in `LLMPlanner` + `OpenAILlmRunner` + `LlmReduceRunner` and
82
+ widens the planner's reduce-strategy set to include `llm_summarize`.
83
+ Without `--llm`, every `llm_call` is the mock and reduce is deterministic.
84
+
85
+ ## Configuration
86
+
87
+ - `.env` (gitignored): `OPENAI_API_KEY`, `LANGSMITH_*`, optional `TAVILY_API_KEY`
88
+ - `python-dotenv` loaded by main.py
89
+ - Default LLM model: `gpt-5.4-nano` (override with `--model`)
90
+ - When `TAVILY_API_KEY` is set, `web_search` uses Tavily; otherwise falls back to DuckDuckGo+Bing scrape (lower quality, no key required). Free Tavily tier: https://tavily.com
@@ -0,0 +1,328 @@
1
+ # Gotchas
2
+
3
+ Things that bit us. Specific, reproducible, with the fix.
4
+
5
+ ## LangGraph
6
+
7
+ ### `Command(goto=...)` doesn't override static edges — it adds to them
8
+
9
+ If a node returns `Command(goto="x")`, the **static `add_edge` outgoing
10
+ edges still fire in parallel**. Result: phantom downstream execution that
11
+ sees half-populated state.
12
+
13
+ Where this matters:
14
+ - The **supervisor** uses `add_conditional_edges` for failure routing so
15
+ `goto="respond"` is exclusive.
16
+ - `parallel_map`'s **join** checks `state["errors"]` itself and halts
17
+ with `Command(goto=END)` because workers' own goto=END doesn't stop the
18
+ worker→join edge from firing across Send branches.
19
+
20
+ Rule: if you want routing to be exclusive, use `add_conditional_edges`.
21
+
22
+ ### `web_search` provider is environment-dependent
23
+
24
+ `build_default_search_provider()` (used by `build_grounded_tools`)
25
+ returns:
26
+
27
+ - `TavilySearchProvider` when `TAVILY_API_KEY` is in the environment.
28
+ This is the production path: LLM-agent-focused search, structured
29
+ snippets with relevance scores, synthesized `answer` field. Free tier
30
+ ~1000 searches/month at https://tavily.com.
31
+ - `DuckDuckGoSearchProvider` otherwise. DDG's instant-answer endpoint
32
+ returns mostly definitional content; when that yields nothing, it
33
+ falls back to scraping Bing HTML — fragile, possibly TOS-violating,
34
+ low quality. Acceptable for development and demos.
35
+
36
+ Override explicitly with `build_default_search_provider(prefer_tavily=False)`
37
+ to force DDG even when a key is present, or pass `tavily_api_key=...`
38
+ to supply one without setting the env var.
39
+
40
+ The output shape is uniform across providers — downstream LLM nodes
41
+ consume `{tool, provider, query, answer, results: [{title, url, snippet, score?}]}`
42
+ regardless of which backend ran. Tests that pin a specific provider
43
+ must `monkeypatch.setenv` or `monkeypatch.delenv` for `TAVILY_API_KEY`
44
+ to control selection.
45
+
46
+ ### Chain recording layout: chain dir is a sibling of iteration dirs
47
+
48
+ `Supervisor.run_iteratively("...", run_id="X")` produces:
49
+
50
+ runs/
51
+ X/ ← chain metadata (chain.json, chain.md)
52
+ X_iter_1/ ← per-iteration GraphSpec/trace/output/etc.
53
+ X_iter_2/
54
+ ...
55
+
56
+ The chain dir and iteration dirs are siblings at the same level (flat
57
+ layout, not nested). To inspect a chain, read `runs/<chain_id>/chain.json`
58
+ or call `recorder.load_chain(chain_id)`. The per-iteration directories
59
+ are normal recorded runs and can be inspected or replayed independently.
60
+
61
+ If `chain_id` collides with an existing `run_id` (e.g., you ran
62
+ `sup.run(prompt, run_id="X")` and then `sup.run_iteratively(prompt, run_id="X")`),
63
+ the chain recording will overwrite the prior single run's directory.
64
+ Pick a different `run_id` for chains or use `record_chain=False`.
65
+
66
+ ### LlmIterationDecider truncation: 4000 char per value, not 500
67
+
68
+ The judge sees an "Outputs produced (state.values):" section in its eval
69
+ prompt with each value truncated at `value_render_limit` chars
70
+ (default 4000). Real LLM outputs are routinely 1-4k chars, so the
71
+ default 500 we shipped initially was way too tight — the judge would
72
+ respond "I can't verify, the output looks truncated" to every prompt
73
+ with non-trivial output. The system prompt now explicitly tells the
74
+ judge that truncation is display-only and not to penalize it.
75
+
76
+ If you see "I can't verify" / "appears truncated" in judge gaps, bump
77
+ `value_render_limit` further on the decider construction.
78
+
79
+ ### LlmIterationDecider defers obvious cases to the fallback decider
80
+
81
+ The LLM judge does NOT evaluate every iteration. It defers to the
82
+ `fallback` decider (default `StatusIterationDecider`) for:
83
+
84
+ - `paused` (framework will ask the user anyway)
85
+ - `plan_failed` / `validation_failed` / `compile_failed` (no output to judge)
86
+ - `record_failed` / `resume_failed` / `replay_failed` (infrastructure issues)
87
+ - `execution_failed` (unless `judge_failed_runs=True`)
88
+
89
+ The point: don't spend tokens on decisions the framework's status
90
+ taxonomy already settled. The LLM only runs on `ok` runs (and
91
+ optionally `execution_failed`). Test invocation counts (`model.calls`)
92
+ expect zero LLM calls for paused/error cases.
93
+
94
+ ### `build_replan_prompt`'s output goes to the planner as a `prompt`
95
+
96
+ The iterative supervisor calls `Supervisor.run(replan_prompt, ...)` with
97
+ the text `build_replan_prompt` produced. The planner has no separate
98
+ "replan context" channel — it sees the verbose replan text as just a
99
+ new prompt. Currently this works because the verbose text contains the
100
+ original prompt, gaps, and prior outputs, but the planner doesn't
101
+ *structurally* know it's being replanned. A future refinement: add a
102
+ dedicated `replan_context` arg to `Supervisor.run()` so the planner's
103
+ system prompt can react to it explicitly.
104
+
105
+ ### Replay does NOT re-plan and does NOT inherit checkpointer state
106
+
107
+ `Supervisor.replay(run_id)` loads the validated spec the recorder
108
+ persisted on the original run, executes it under a *new* `run_id`, and
109
+ writes a new run directory. Notably:
110
+
111
+ - The **planner is not called** during replay. The point is to re-run
112
+ the same shape, not to ask the planner what to do again.
113
+ - The **checkpointer is not seeded** with the original's state. If the
114
+ spec contains `wait_for_event`, the replay pauses fresh from the start
115
+ — it does NOT pick up from where the original left off.
116
+ - The original recording is **untouched**. New artifacts go to
117
+ `runs/<new_run_id>/`. Default `new_run_id` is
118
+ `<original>_replay_<utc_iso_timestamp>` so the original and the replay
119
+ are colocated for easy diffing.
120
+
121
+ Use `replay()` to compare LLM output across model versions or runner
122
+ code changes. Use `resume()` (different method!) to continue a paused
123
+ run from where it stopped.
124
+
125
+ ### Echo defaults vs production factories — which is active matters
126
+
127
+ `default_runners()` returns **placeholder echo runners** for every
128
+ runner-handled kind. They're pure, deterministic, no I/O — perfect for
129
+ tests but **never what you want in production**. Each kind has a real
130
+ factory you wire via `runners={}` to override:
131
+
132
+ | Kind | Echo default | Production factory |
133
+ |-----------------|---------------------|-----------------------------------------------|
134
+ | `llm_call` | `run_llm_call` | `build_openai_llm_runner` -> `OpenAILlmRunner`|
135
+ | `tool_call` | `run_tool_call` (uses `DEFAULT_FAKE_TOOLS`) | **no real tool factory yet** (roadmap) |
136
+ | `reduce` | `run_reduce` (concat/merge_dict only) | `build_openai_reduce_runner` (adds llm_summarize) |
137
+ | `spawn_subagent`| `run_spawn_subagent`| `build_openai_spawn_subagent_runner` |
138
+ | `emit_artifact` | `run_emit_artifact` | `make_emit_artifact_runner(FileArtifactSink(...))` |
139
+
140
+ `main.py` swaps to production factories when `--llm` is set (or for
141
+ `emit_artifact`, always — file persistence is useful even in no-LLM
142
+ demos). If you ship a new client of the supervisor, you must wire the
143
+ production factories explicitly. The compiler will happily run with
144
+ echoes — it has no way to tell.
145
+
146
+ Hardening: the executor's `strict_runners` flag (`app/runtime/executor.py`)
147
+ refuses to fall back to echoes. Enable it wherever an echo would be a silent bug.
148
+
149
+ ### Subagent registry must match the Registry's `subagents` allowlist
150
+
151
+ The `Registry.subagents` set is what the validator checks `agent_name`
152
+ against. The `runners={NodeKind.SPAWN_SUBAGENT: make_spawn_subagent_runner(...)}`
153
+ is what actually executes the call.
154
+
155
+ If those two diverge — e.g., you advertise `critic` in the Registry but
156
+ only wire `document_specialist` in the runner — the validator passes but
157
+ the runtime raises `RuntimeError("No subagent registered for 'critic'")`,
158
+ caught as `execution_failed`.
159
+
160
+ Keep them aligned at the same wiring layer (main.py / the supervisor
161
+ construction site). `DEFAULT_SUBAGENT_SYSTEM_PROMPTS` covers the default
162
+ allowlist, but if you add a new name to the allowlist you must add a
163
+ prompt and a wired subagent for it too.
164
+
165
+ ### Windows console can't print Unicode by default
166
+
167
+ LLM responses regularly contain `→`, `—`, curly quotes, etc. Windows'
168
+ default `cp1252` codec crashes when `print()` hits these characters.
169
+ `main.py` reconfigures `sys.stdout`/`sys.stderr` to UTF-8 with
170
+ `errors="replace"` at startup so the demo never dies for display reasons.
171
+
172
+ If you write new scripts that print LLM output, do the same:
173
+ ```python
174
+ sys.stdout.reconfigure(encoding="utf-8", errors="replace")
175
+ ```
176
+
177
+ ### `interrupt()` raises on the first pass, returns on resume
178
+
179
+ `langgraph.types.interrupt(value)` does NOT return on the first execution
180
+ — it raises `GraphInterrupt` which LangGraph catches to persist state and
181
+ pause. On resume (`Command(resume=value)`), the same call returns the
182
+ resume value.
183
+
184
+ Implication for `wait_for_event`: any state-update code *before* the
185
+ interrupt call never makes it into state on the first pass. The
186
+ `make_wait_for_event_node` factory emits START + FINISH events *after*
187
+ the interrupt returns — meaning both events get their timestamps from
188
+ the resume pass, not the original pause. If you need to see "the run is
189
+ currently paused" in the trace, the supervisor or recorder must emit
190
+ that signal, not the wait node itself.
191
+
192
+ ### `wait_for_event` requires a checkpointer at compile time, not run time
193
+
194
+ `LangGraphExecutor.compile()` raises `GraphCompilationError` if the spec
195
+ contains any `wait_for_event` node and the executor has no checkpointer.
196
+ This is deliberate: a missing checkpointer would otherwise silently turn
197
+ `interrupt()` into a runtime crash deep inside LangGraph internals.
198
+
199
+ For tests: `MemorySaver()` from `langgraph.checkpoint.memory`. For
200
+ production durability: `SqliteSaver(...)`.
201
+
202
+ ### Resume uses `thread_id` = `run_id`
203
+
204
+ The executor passes `config={"configurable": {"thread_id": run_id}}` to
205
+ both `invoke` and `resume`. The checkpointer keys persisted state by
206
+ `thread_id`, so the supervisor's `run_id` is what ties a paused run to
207
+ its resumed continuation. Don't reuse `run_id` across logically-distinct
208
+ flows — it'll cross-contaminate checkpointer state.
209
+
210
+ ### Branch routing requires character-for-character node-id match
211
+
212
+ The `branch` design uses the same name for both "the decision string" and
213
+ "the target node id". If an upstream LLM-call is told to emit "factual"
214
+ but the branch's `branches` list is `["factual_answer", "opinion_answer"]`,
215
+ the branch will halt because `"factual"` isn't in the branches set.
216
+
217
+ When prompting the planner (or hand-writing specs), the upstream node's
218
+ output must emit values that **exactly match** one of the branch names.
219
+ The planner prompt warns about this; reinforce it in node-specific
220
+ instructions when needed.
221
+
222
+ ### `Send` payload IS the worker's state input
223
+
224
+ `Send(target, payload)` sets `payload` as that worker invocation's
225
+ complete state. Different Sends to the same target get isolated state.
226
+ Worker returns merge into global state via reducers.
227
+
228
+ To pass per-worker context: put it directly in the payload. Don't try to
229
+ broadcast a global "dispatch table" — that's just state pollution.
230
+
231
+ ### Import cycle: `runtime` ↔ `compiler`
232
+
233
+ `app/runtime/__init__.py` exports `LangGraphExecutor`. The executor needs
234
+ `app.compiler.build`. The compiler imports `app.runtime.wrappers` and
235
+ `app.runtime.parallel_map`. → cycle.
236
+
237
+ Resolution: `LangGraphExecutor.compile()` imports `app.compiler.build`
238
+ **inside the method body**, not at module level. Don't move it back to a
239
+ top-level import.
240
+
241
+ ### `StateGraph.compile()` is a runtime call, not a build step
242
+
243
+ You can call it inside a node, mid-execution. The whole project relies on
244
+ this — every supervisor run compiles a fresh transient graph from the
245
+ planner's spec.
246
+
247
+ ## OpenAI structured output
248
+
249
+ ### Strict json_schema mode rejects open dicts
250
+
251
+ `chat.with_structured_output(GraphSpec)` defaults to
252
+ `method="json_schema"` which requires `additionalProperties: false` on
253
+ every object schema. `GraphSpec` has `NodeSpec.params: dict[str, Any]`
254
+ which can't satisfy that — strict mode returns a 400.
255
+
256
+ Fix: `chat.with_structured_output(GraphSpec, method="function_calling")`.
257
+ Function-calling mode is more permissive and works with open dicts.
258
+
259
+ ### LLM list outputs arrive as JSON-encoded strings
260
+
261
+ When `llm_call` is asked to "produce a JSON list of X", the runner
262
+ returns the *string* `'["a", "b", "c"]'`, not a Python list. Any
263
+ downstream consumer (like `parallel_map`'s `over` source) sees a string.
264
+
265
+ `parallel_map` opportunistically decodes two shapes:
266
+ 1. Bare list: `"[a, b, c]"`
267
+ 2. Single-key object: `"{\"<over_key>\": [a, b, c]}"` — LLMs *frequently*
268
+ wrap their list in an object whose key matches the requested name.
269
+
270
+ If you add another upstream-list consumer, do the same opportunistic
271
+ decode.
272
+
273
+ ### Planner needs literal "START" / "END" strings
274
+
275
+ The planner can produce edges that don't use the literal `"START"` /
276
+ `"END"` sentinels, leading to `no_start_to_end_path` /
277
+ `unreachable_node` validation errors. The system prompt has a worked
278
+ example with the exact JSON shape — **don't remove it**.
279
+
280
+ ### Input/output keys need character-for-character matches
281
+
282
+ A downstream node's `inputs: ["sources"]` must match exactly some upstream
283
+ node's `outputs: ["sources"]`. The validator rejects "source", "Sources",
284
+ "the_sources", etc. as `missing_upstream_input`.
285
+
286
+ The planner's retry message includes the set of declared output keys
287
+ from the previous attempt so the model can rename or align.
288
+
289
+ ## Pydantic
290
+
291
+ ### `AIMessage.content` enforces string type
292
+
293
+ Can't construct `AIMessage(content=42)` for tests — Pydantic v2 rejects
294
+ non-string content. Use a duck-typed class with `.content` attribute
295
+ instead:
296
+
297
+ ```python
298
+ class _NonStringResponse:
299
+ content = 42
300
+ ```
301
+
302
+ ### `EdgeSpec` uses `from_` with alias `"from"`
303
+
304
+ `from` is a Python keyword. The model has `populate_by_name=True` so both
305
+ `from_` and `from` work as input. **Always dump with `by_alias=True`** for
306
+ external artifacts (spec.json, planner outputs, etc.) so JSON consumers
307
+ see `"from"`.
308
+
309
+ ## Python 3.13
310
+
311
+ ### `datetime.utcnow()` is deprecated
312
+
313
+ Use `datetime.now(UTC)` everywhere. `-W error` catches this in CI.
314
+
315
+ ## Dev environment
316
+
317
+ ### `python app/main.py` vs `python -m app.main`
318
+
319
+ Running the file directly doesn't add the project root to `sys.path`, so
320
+ `from app.X import Y` fails. Either use `-m app.main` OR include the
321
+ `sys.path` bootstrap that's in main.py's header (gated on
322
+ `__name__ == "__main__" and __package__ in (None, "")`).
323
+
324
+ ### Secrets in `.env`
325
+
326
+ `.env` is gitignored. `python-dotenv` `load_dotenv()` runs at the top of
327
+ main.py. If a key shows up in a transcript or PR, **rotate it** — local
328
+ git history isn't the only place keys can leak.
@@ -0,0 +1,114 @@
1
+ # Established patterns
2
+
3
+ Patterns that worked across multiple slices. New code should follow them.
4
+
5
+ ## 1. Dependency injection + lazy factory imports
6
+
7
+ When adding an external integration:
8
+
9
+ 1. The class accepts the **abstract interface** in its constructor
10
+ (`BaseChatModel`, a `Runnable`, etc.).
11
+ 2. A **factory function** constructs the concrete dependency and wires it.
12
+ 3. The factory's heavy import (e.g., `langchain_openai`) is **local to the
13
+ factory body**, not at module level.
14
+
15
+ Examples: `OpenAILlmRunner`, `LLMPlanner`, `LlmReduceRunner` all take
16
+ `BaseChatModel` in constructor; `build_openai_*` factories import
17
+ `ChatOpenAI` locally.
18
+
19
+ Why: callers using mocks or alternate providers don't pay the optional-dep
20
+ cost. Tests construct fakes directly without any provider import.
21
+
22
+ ## 2. Planner introspects what runtime can execute
23
+
24
+ `LLMPlanner`'s system prompt is templated from the actual runtime state:
25
+
26
+ ```python
27
+ executable_kinds = default_runners().keys() | COMPILER_HANDLED_KINDS
28
+ executable_reduce_strategies = injected per main.py wiring
29
+ tools = registry.tools
30
+ subagents = registry.subagents
31
+ ```
32
+
33
+ When you add a new executable kind / reduce strategy / allowlisted tool,
34
+ the planner automatically advertises it. **Do not hardcode capability
35
+ lists in the prompt** — template them.
36
+
37
+ ## 3. Node kinds: runner-handled vs compiler-handled
38
+
39
+ Two paths to "executable":
40
+
41
+ | Path | Mechanism | Use when | Examples |
42
+ |---|---|---|---|
43
+ | Runner-handled | `NodeRunner = (state, params) → dict`, registered in `default_runners()`, wrapped by `make_node_wrapper` | Kind has clean `(state, params) → result` semantics | `llm_call`, `tool_call`, `reduce` |
44
+ | Compiler-handled | Kind in `COMPILER_HANDLED_KINDS`; compiler emits multiple LangGraph nodes per `NodeSpec` | Kind needs `Send` fan-out, special edge wiring, or multiple internal nodes | `parallel_map` (dispatcher/worker/join) |
45
+
46
+ Both contribute to the union the planner sees. Pick the simpler path
47
+ unless you need multi-node expansion.
48
+
49
+ ## 4. State envelope: explicit reducers
50
+
51
+ `DynamicRunState` is a `TypedDict` with `Annotated` reducer channels:
52
+
53
+ | Key | Reducer | Why |
54
+ |---|---|---|
55
+ | `values` | `merge_dicts` | Top-level dict merge; last-write-wins **per key** |
56
+ | `artifacts` | `merge_dicts` | Same |
57
+ | `metadata` | `merge_dicts` | Same |
58
+ | `errors` | `operator.add` | Append-only |
59
+ | `events` | `operator.add` | Append-only (trace) |
60
+
61
+ **Implication**: multiple parallel writers to the same `values` key
62
+ overwrite each other. To collect N parallel results, use **distinct keys**
63
+ (see `parallel_map`'s `<output_key>__<idx>` slot pattern) or a different
64
+ reducer.
65
+
66
+ ## 5. Supervisor status taxonomy
67
+
68
+ Every known failure mode has a status string. New failure stages:
69
+
70
+ 1. Add to the taxonomy doc on `SupervisorState`.
71
+ 2. Catch the **specific** exception in the relevant supervisor node.
72
+ 3. Return `{"status": "<x>_failed", "errors": [structured entry]}`.
73
+ 4. Route via `add_conditional_edges` to short-circuit if needed.
74
+ 5. Add a branch in `respond`'s response-message switch.
75
+
76
+ **Don't crash; classify**. Tests assert on status strings, not stack traces.
77
+
78
+ ## 6. Validator is the trust boundary
79
+
80
+ Everything downstream of `validate_graph_spec` assumes well-formed input.
81
+ Compiler doesn't re-validate topology. Runners don't re-validate params.
82
+ Recorder writes whatever it's given.
83
+
84
+ If something needs checking, **add it to the validator**, not downstream.
85
+
86
+ ## 7. Recording: failed runs are first class
87
+
88
+ The recorder writes a full set of artifacts for *every* run including
89
+ failures. The supervisor catches recording exceptions and emits status
90
+ `record_failed` instead of crashing.
91
+
92
+ Never let recording errors kill the request.
93
+
94
+ ## 8. Per-kind output mapping convention
95
+
96
+ Runners return `{"result": value}` by default; the wrapper maps it to the
97
+ node's `outputs[0]`. If a runner returns named outputs that exactly match
98
+ `outputs[*]`, they're routed directly. Mismatch → the wrapper raises, then
99
+ catches that exception itself and records it as an `errors` entry.
100
+
101
+ ## 9. Surgical fix loop when LLM smoke fails
102
+
103
+ When `--llm` produces a failure status, each iteration should move the
104
+ failure **further down the pipeline**, never sideways:
105
+
106
+ 1. Read the **stage** (`plan_failed`, `validation_failed`, …) and the
107
+ issue code or error message.
108
+ 2. Make **one** surgical change targeting that exact failure.
109
+ 3. Re-run `pytest -W error` (token-free) — must remain green.
110
+ 4. Re-run `--llm` once.
111
+ 5. Confirm the failure moved down the pipeline; repeat.
112
+
113
+ This works because the supervisor's status taxonomy names every failure
114
+ specifically. Don't bypass the taxonomy.