agentforge-py 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. agentforge/__init__.py +114 -0
  2. agentforge/_testing/__init__.py +19 -0
  3. agentforge/_testing/fake_llm.py +126 -0
  4. agentforge/_testing/fake_tool.py +122 -0
  5. agentforge/_tools/__init__.py +14 -0
  6. agentforge/_tools/calculator.py +102 -0
  7. agentforge/_tools/decorator.py +300 -0
  8. agentforge/_tools/file_read.py +112 -0
  9. agentforge/_tools/shell.py +134 -0
  10. agentforge/_tools/web_search.py +207 -0
  11. agentforge/agent.py +817 -0
  12. agentforge/auth.py +42 -0
  13. agentforge/cli/__init__.py +18 -0
  14. agentforge/cli/_build.py +323 -0
  15. agentforge/cli/_scaffold_state.py +250 -0
  16. agentforge/cli/_shared_scaffold.py +174 -0
  17. agentforge/cli/config_cmd.py +174 -0
  18. agentforge/cli/db_cmd.py +262 -0
  19. agentforge/cli/debug_cmd.py +168 -0
  20. agentforge/cli/docs_cmd.py +217 -0
  21. agentforge/cli/eval_cmd.py +181 -0
  22. agentforge/cli/health_cmd.py +139 -0
  23. agentforge/cli/list_modules.py +85 -0
  24. agentforge/cli/main.py +81 -0
  25. agentforge/cli/manifest_apply.py +368 -0
  26. agentforge/cli/module_cmd.py +247 -0
  27. agentforge/cli/new_cmd.py +171 -0
  28. agentforge/cli/run_cmd.py +234 -0
  29. agentforge/cli/upgrade_cmd.py +230 -0
  30. agentforge/config/__init__.py +45 -0
  31. agentforge/eval/__init__.py +18 -0
  32. agentforge/eval/consistency.py +107 -0
  33. agentforge/eval/coverage.py +100 -0
  34. agentforge/eval/format_compliance.py +107 -0
  35. agentforge/eval/regression.py +143 -0
  36. agentforge/findings.py +166 -0
  37. agentforge/guardrails/__init__.py +32 -0
  38. agentforge/guardrails/allowlist.py +49 -0
  39. agentforge/guardrails/capability_check.py +58 -0
  40. agentforge/guardrails/engine.py +289 -0
  41. agentforge/guardrails/pii_redact_basic.py +61 -0
  42. agentforge/guardrails/prompt_injection_basic.py +90 -0
  43. agentforge/memory/__init__.py +16 -0
  44. agentforge/memory/in_memory.py +130 -0
  45. agentforge/memory/in_memory_graph.py +262 -0
  46. agentforge/memory/in_memory_vector.py +167 -0
  47. agentforge/pipeline/__init__.py +26 -0
  48. agentforge/pipeline/engine.py +189 -0
  49. agentforge/pipeline/errors.py +19 -0
  50. agentforge/pipeline/tool.py +93 -0
  51. agentforge/py.typed +0 -0
  52. agentforge/recording.py +189 -0
  53. agentforge/renderers/__init__.py +28 -0
  54. agentforge/renderers/_defaults.py +32 -0
  55. agentforge/renderers/markdown.py +44 -0
  56. agentforge/renderers/patch_applier.py +46 -0
  57. agentforge/renderers/registry.py +108 -0
  58. agentforge/renderers/scorecard.py +59 -0
  59. agentforge/renderers/span_table.py +71 -0
  60. agentforge/replay.py +260 -0
  61. agentforge/resolver_register.py +41 -0
  62. agentforge/retrieval.py +410 -0
  63. agentforge/runtime.py +63 -0
  64. agentforge/strategies/__init__.py +27 -0
  65. agentforge/strategies/_base.py +280 -0
  66. agentforge/strategies/_plan.py +93 -0
  67. agentforge/strategies/multi_agent.py +541 -0
  68. agentforge/strategies/plan_execute.py +506 -0
  69. agentforge/strategies/react.py +237 -0
  70. agentforge/strategies/tot.py +472 -0
  71. agentforge/templates/_shared/.cursorrules +12 -0
  72. agentforge/templates/_shared/.github/copilot-instructions.md +13 -0
  73. agentforge/templates/_shared/.gitkeep +0 -0
  74. agentforge/templates/_shared/AGENTS.md.tmpl +123 -0
  75. agentforge/templates/_shared/CLAUDE.md +13 -0
  76. agentforge/templates/_shared/docs/runbooks/01-set-up-new-agent.md.tmpl +67 -0
  77. agentforge/templates/_shared/docs/runbooks/02-add-a-tool.md +67 -0
  78. agentforge/templates/_shared/docs/runbooks/03-add-a-pipeline-task.md +69 -0
  79. agentforge/templates/_shared/docs/runbooks/04-pick-reasoning-strategy.md +67 -0
  80. agentforge/templates/_shared/docs/runbooks/05-write-prompts.md +75 -0
  81. agentforge/templates/_shared/docs/runbooks/06-test-your-agent.md +75 -0
  82. agentforge/templates/_shared/docs/runbooks/07-debug-a-run.md +70 -0
  83. agentforge/templates/_shared/docs/runbooks/08-add-memory.md +75 -0
  84. agentforge/templates/_shared/docs/runbooks/09-add-mcp.md +78 -0
  85. agentforge/templates/_shared/docs/runbooks/10-add-evaluators.md +76 -0
  86. agentforge/templates/_shared/docs/runbooks/11-add-safety-guardrails.md +83 -0
  87. agentforge/templates/_shared/docs/runbooks/12-add-observability.md +77 -0
  88. agentforge/templates/_shared/docs/runbooks/13-configure-multi-provider.md +91 -0
  89. agentforge/templates/_shared/docs/runbooks/14-deploy-your-agent.md +70 -0
  90. agentforge/templates/_shared/docs/runbooks/15-upgrade-your-agent.md +67 -0
  91. agentforge/templates/_shared/docs/runbooks/16-configuration-reference.md +81 -0
  92. agentforge/templates/_shared/docs/runbooks/17-add-reranker.md +78 -0
  93. agentforge/templates/_shared/docs/runbooks/18-add-hybrid-search.md +78 -0
  94. agentforge/templates/_shared/docs/runbooks/19-add-graphrag.md +83 -0
  95. agentforge/templates/_shared/docs/runbooks/20-apply-schema-migrations.md +92 -0
  96. agentforge/templates/_shared/docs/runbooks/21-use-streaming-guardrails.md +82 -0
  97. agentforge/templates/_shared/docs/runbooks/README.md.tmpl +68 -0
  98. agentforge/templates/code-reviewer/.env.example +8 -0
  99. agentforge/templates/code-reviewer/.gitignore +7 -0
  100. agentforge/templates/code-reviewer/README.md +12 -0
  101. agentforge/templates/code-reviewer/agentforge.yaml +23 -0
  102. agentforge/templates/code-reviewer/copier.yml +34 -0
  103. agentforge/templates/code-reviewer/pyproject.toml +18 -0
  104. agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  105. agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
  106. agentforge/templates/docs-qa/.env.example +8 -0
  107. agentforge/templates/docs-qa/.gitignore +7 -0
  108. agentforge/templates/docs-qa/README.md +14 -0
  109. agentforge/templates/docs-qa/agentforge.yaml +19 -0
  110. agentforge/templates/docs-qa/copier.yml +31 -0
  111. agentforge/templates/docs-qa/pyproject.toml +18 -0
  112. agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  113. agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
  114. agentforge/templates/minimal/.env.example +11 -0
  115. agentforge/templates/minimal/.gitignore +10 -0
  116. agentforge/templates/minimal/README.md +28 -0
  117. agentforge/templates/minimal/agentforge.yaml +10 -0
  118. agentforge/templates/minimal/copier.yml +52 -0
  119. agentforge/templates/minimal/pyproject.toml +18 -0
  120. agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  121. agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/main.py +34 -0
  122. agentforge/templates/patch-bot/.env.example +8 -0
  123. agentforge/templates/patch-bot/.gitignore +7 -0
  124. agentforge/templates/patch-bot/README.md +13 -0
  125. agentforge/templates/patch-bot/agentforge.yaml +15 -0
  126. agentforge/templates/patch-bot/copier.yml +31 -0
  127. agentforge/templates/patch-bot/pyproject.toml +18 -0
  128. agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  129. agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
  130. agentforge/templates/research/.env.example +8 -0
  131. agentforge/templates/research/.gitignore +7 -0
  132. agentforge/templates/research/README.md +14 -0
  133. agentforge/templates/research/agentforge.yaml +17 -0
  134. agentforge/templates/research/copier.yml +31 -0
  135. agentforge/templates/research/pyproject.toml +18 -0
  136. agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  137. agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/main.py +31 -0
  138. agentforge/templates/triage/.env.example +8 -0
  139. agentforge/templates/triage/.gitignore +7 -0
  140. agentforge/templates/triage/README.md +14 -0
  141. agentforge/templates/triage/agentforge.yaml +25 -0
  142. agentforge/templates/triage/copier.yml +31 -0
  143. agentforge/templates/triage/pyproject.toml +18 -0
  144. agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  145. agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/main.py +30 -0
  146. agentforge/testing/__init__.py +69 -0
  147. agentforge/testing/conformance.py +40 -0
  148. agentforge/testing/factory.py +89 -0
  149. agentforge/testing/fixtures.py +42 -0
  150. agentforge/testing/llm.py +235 -0
  151. agentforge/testing/recording.py +177 -0
  152. agentforge/tools/__init__.py +41 -0
  153. agentforge_py-0.2.1.dist-info/METADATA +158 -0
  154. agentforge_py-0.2.1.dist-info/RECORD +157 -0
  155. agentforge_py-0.2.1.dist-info/WHEEL +4 -0
  156. agentforge_py-0.2.1.dist-info/entry_points.txt +2 -0
  157. agentforge_py-0.2.1.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,67 @@
1
+ # 01 — Set up a new AgentForge agent
2
+
3
+ > **Goal:** verify the freshly scaffolded `{{ project_slug }}`
4
+ > agent runs end-to-end against your provider.
5
+ > **Time:** ~10 minutes.
6
+ > **Prereqs:** none (this is runbook 01).
7
+
8
+ ## TL;DR
9
+
10
+ ```bash
11
+ cd {{ project_slug }}
12
+ uv sync
13
+ cp .env.example .env # then fill in real credentials
14
+ agentforge config validate
15
+ agentforge run "hello"
16
+ ```
17
+
18
+ ## Step by step
19
+
20
+ 1. **Install dependencies** with `uv sync`. AgentForge uses uv
21
+ workspaces; the lock file pins every version.
22
+ 2. **Configure credentials** by copying `.env.example` to `.env`
23
+ and filling in the LLM provider's API key (`ANTHROPIC_API_KEY`,
24
+ `AWS_ACCESS_KEY_ID`, or `OPENAI_API_KEY` depending on what
25
+ you scaffolded). Never check `.env` into git.
26
+ 3. **Validate the config** with `agentforge config validate`.
27
+ That parses `agentforge.yaml`, expands `${ENV_VAR}` references,
28
+ and surfaces any unknown keys.
29
+ 4. **Run the agent** with `agentforge run "your first task"`.
30
+ The default output is rich-formatted; pass `--output-format
31
+ json` for structured output (handy in CI).
32
+ 5. **Check the trace** — every step the agent emitted is on the
33
+ returned `RunResult`. `agentforge run --record "..."` followed
34
+ by `agentforge debug --replay <run_id>` lets you step through.
35
+
36
+ ## Variations
37
+
38
+ - **Different provider** — edit `agentforge.yaml > providers >
39
+ default` to point at the provider you prefer, then update
40
+ `.env` accordingly. See runbook 13 for multi-provider setups.
41
+ - **No credentials yet** — `MockLLMClient.deterministic("ok")`
42
+ in your own tests lets you exercise the loop without hitting
43
+ a real API (see runbook 06).
44
+ - **Batch / CI mode** — `agentforge run --no-prompts --task-file
45
+ ./task.txt --output-format json` is the script-friendly path.
46
+
47
+ ## Troubleshooting
48
+
49
+ | Symptom | Cause | Fix |
50
+ |---|---|---|
51
+ | `agentforge: command not found` | uv venv isn't activated | `uv sync` then prefix commands with `uv run` |
52
+ | `config invalid` exit code 2 | unknown YAML key | check the diff between your `agentforge.yaml` and `agentforge config schema` output |
53
+ | `No LLM provider registered` | provider package not installed | `agentforge add module <provider>` (e.g. `agentforge add module bedrock`) |
54
+ | `BudgetExceeded` on first run | budget too low for the model | bump `agent.budget.usd` in `agentforge.yaml` |
55
+
56
+ ## Related
57
+
58
+ - Runbook 02 — Add a tool
59
+ - Runbook 06 — Test your agent
60
+ - Runbook 13 — Configure multi-provider
61
+ - Feature spec: `docs/features/feat-011-scaffolding-and-upgrade.md`
62
+
63
+ <!-- agentforge:end-managed -->
64
+
65
+ <!-- agentforge:custom -->
66
+ <!-- Project-specific setup notes go here. Survives upgrades. -->
67
+ <!-- agentforge:end-custom -->
@@ -0,0 +1,67 @@
1
+ # 02 — Add a tool
2
+
3
+ > **Goal:** make a new capability available to your agent's
4
+ > reasoning loop.
5
+ > **Time:** ~10 minutes.
6
+ > **Prereqs:** runbook 01 done.
7
+
8
+ ## TL;DR
9
+
10
+ ```python
11
+ from agentforge import tool
12
+
13
+ @tool
14
+ async def fetch_weather(*, city: str) -> str:
15
+ """Return the current weather summary for `city`."""
16
+ return await my_weather_api(city)
17
+
18
+ agent = Agent(model="...", tools=[fetch_weather])
19
+ ```
20
+
21
+ ## Step by step
22
+
23
+ 1. **Decide tool surface** — what kwargs does the LLM pass? Each
24
+ becomes a typed parameter on the function. Use Pydantic models
25
+ only when the inputs are nested.
26
+ 2. **Author the tool** with the `@tool` decorator. Type hints
27
+ drive the JSON schema the LLM sees; do NOT hand-write a schema.
28
+ 3. **Add a docstring** — the first line is what the LLM reads to
29
+ decide when to call your tool. Keep it short and
30
+ behaviour-focused: "Returns X given Y", not "This tool will...".
31
+ 4. **Pass the tool to `Agent(tools=[...])`** or list it under
32
+ `agent.tools:` in `agentforge.yaml` so it's auto-resolved on
33
+ `agentforge run`.
34
+ 5. **Cover with a test** — instantiate via `FakeTool.fake(...)`
35
+ for tests where the real call would be too slow / costly
36
+ (see runbook 06).
37
+
38
+ ## Variations
39
+
40
+ - **Class-based tool** — subclass `Tool` directly when you need
41
+ per-instance state (DB pool, HTTP client). Pattern in
42
+ `agentforge_core.contracts.tool`.
43
+ - **Destructive tools** — set `capabilities: ClassVar =
44
+ frozenset({"destructive"})` on the class. The `capability_check`
45
+ guardrail (runbook 11) will deny it unless allowlisted.
46
+ - **Long-running tools** — return early with a status; let the
47
+ next iteration check completion. Don't `await` for 30 seconds.
48
+
49
+ ## Troubleshooting
50
+
51
+ | Symptom | Cause | Fix |
52
+ |---|---|---|
53
+ | LLM never calls the tool | docstring too vague | rewrite to be behaviour-focused: "Fetches X for Y" |
54
+ | `ValidationError` on tool call | type hints don't match LLM args | check the JSON schema with `tool.to_spec()` |
55
+ | Tool runs but observation lost | tool returns `None` | return a string (or a dict; the framework JSON-serialises) |
56
+ | Tool ran twice unexpectedly | LLM retried | check that the previous observation was surfaced clearly; vague observations cause retries |
57
+
58
+ ## Related
59
+
60
+ - Runbook 06 — Test your agent (covers `FakeTool.fake`)
61
+ - Runbook 11 — Add safety guardrails (capability gating)
62
+ - Feature spec: `docs/features/feat-004-tools-system.md`
63
+
64
+ <!-- agentforge:end-managed -->
65
+
66
+ <!-- agentforge:custom -->
67
+ <!-- agentforge:end-custom -->
@@ -0,0 +1,69 @@
1
+ # 03 — Add a pipeline task
2
+
3
+ > **Goal:** insert a deterministic, non-LLM step into the agent's
4
+ > workflow (e.g. parse a file, call a metric API, normalise
5
+ > input).
6
+ > **Time:** ~15 minutes.
7
+ > **Prereqs:** runbook 02 (you understand tools).
8
+
9
+ ## TL;DR
10
+
11
+ ```python
12
+ from agentforge import Pipeline, Task
13
+
14
+ class FetchPRMetadata(Task):
15
+ async def run(self, *, pr_url: str) -> dict:
16
+ return await my_github_client.get_pr(pr_url)
17
+
18
+ pipeline = Pipeline([FetchPRMetadata, AgentStep, RenderReport])
19
+ ```
20
+
21
+ ## Step by step
22
+
23
+ 1. **Identify deterministic boundaries** — anything that has a
24
+ stable function from inputs to outputs (file parsing, API
25
+ normalisation, score thresholding) is a Task, not an LLM call.
26
+ 2. **Author the Task** — subclass `Task`; declare typed inputs
27
+ and outputs; `async def run(...)` is the body.
28
+ 3. **Compose** with `Pipeline([T1, T2, T3])` — tasks run in
29
+ order, each receiving the previous one's output.
30
+ 4. **Mix LLM steps** — use the framework's `AgentStep` wrapper
31
+ to drop an agent run into a pipeline; the surrounding tasks
32
+ handle deterministic pre/post processing.
33
+ 5. **Capture failures** — Tasks raise `TaskError` for
34
+ recoverable cases; the framework surfaces it as a step in
35
+ the agent's trace.
36
+
37
+ ## Variations
38
+
39
+ - **Parallel tasks** — `Pipeline.parallel([T1, T2])` runs them
40
+ concurrently and joins their outputs into a dict.
41
+ - **Conditional branching** — wrap with
42
+ `Pipeline.branch(condition_fn, true_pipe, false_pipe)`.
43
+ - **Retry** — set `retries=N` on the Task class; the framework
44
+ re-runs with exponential backoff.
45
+
46
+ ## Troubleshooting
47
+
48
+ | Symptom | Cause | Fix |
49
+ |---|---|---|
50
+ | Task output not visible to LLM | task output was never passed into the agent step's context | thread output through `AgentStep`'s task context |
51
+ | Pipeline fails fast on first error | default behaviour | wrap with `Pipeline.tolerant(...)` for best-effort runs |
52
+ | Memory blows up on large pipelines | task results held in RAM | persist intermediate outputs to memory store (runbook 08) |
53
+
54
+ ## Related
55
+
56
+ - Runbook 02 — Add a tool (different shape: tools are
57
+ LLM-invoked, tasks are deterministic)
58
+ - Runbook 08 — Add memory
59
+ - Feature spec: `docs/features/feat-015-pipelines-and-deterministic-tasks.md`
60
+
61
+ > **Note:** Pipelines + Tasks are feat-015 territory. If your
62
+ > framework version pre-dates that feature, this runbook is
63
+ > aspirational — fall back to wrapping deterministic steps as
64
+ > tools.
65
+
66
+ <!-- agentforge:end-managed -->
67
+
68
+ <!-- agentforge:custom -->
69
+ <!-- agentforge:end-custom -->
@@ -0,0 +1,67 @@
1
+ # 04 — Pick a reasoning strategy
2
+
3
+ > **Goal:** choose the right `ReasoningStrategy` for your task.
4
+ > **Time:** ~5 minutes.
5
+ > **Prereqs:** runbook 01 done.
6
+
7
+ ## TL;DR
8
+
9
+ ```yaml
10
+ # agentforge.yaml
11
+ agent:
12
+ strategy: react # default — most agents stay here
13
+ # strategy: plan-execute # multi-step plans with verification
14
+ # strategy: tree-of-thoughts # search over candidate paths
15
+ # strategy: multi-agent # supervisor + worker fan-out
16
+ ```
17
+
18
+ ## Step by step
19
+
20
+ 1. **Default to ReAct.** It's the simplest stable loop: think →
21
+ act → observe. Most agent failures come from prompts or
22
+ tools, not the loop itself. Switching strategies on Day 1 is
23
+ premature.
24
+ 2. **Move to Plan-Execute** when the task naturally decomposes
25
+ into a plan + execution: code reviewers, multi-file edits,
26
+ research with structured outputs.
27
+ 3. **Move to Tree-of-Thoughts** when you have a verifier and
28
+ need to explore multiple candidate paths. Expensive — only
29
+ when the task warrants it.
30
+ 4. **Move to Multi-Agent** when distinct sub-agents have
31
+ meaningfully different system prompts / tool sets (security
32
+ reviewer + style reviewer + correctness reviewer).
33
+ 5. **Measure before switching** — runbook 10 covers evaluators.
34
+ Don't change strategy without a baseline.
35
+
36
+ ## Variations
37
+
38
+ - **Custom strategy** — subclass `ReasoningStrategy` and
39
+ register via `@register("strategies", "my-name")`. Run
40
+ `run_strategy_conformance` from `agentforge.testing` against
41
+ it.
42
+ - **Iteration cap** — `agent.max_iterations` (default 25) is
43
+ enforced by every shipped strategy; ToT respects it
44
+ per-branch.
45
+ - **Budget reservation** — strategies coordinate with
46
+ `BudgetPolicy` automatically; you don't need to thread cost
47
+ manually.
48
+
49
+ ## Troubleshooting
50
+
51
+ | Symptom | Cause | Fix |
52
+ |---|---|---|
53
+ | Agent loops indefinitely | tool observations too vague to make progress | improve tool docstrings + observation strings |
54
+ | `iteration_cap` finish_reason | max_iterations too low | bump it or switch to Plan-Execute |
55
+ | Plan-Execute "plan" step is junk | system prompt didn't give planning hints | seed examples in the prompt; runbook 05 |
56
+ | ToT cost spike | verifier too lenient, expanding too many branches | tighten verifier prompt; cap `max_branches` |
57
+
58
+ ## Related
59
+
60
+ - Runbook 05 — Write prompts
61
+ - Runbook 10 — Add evaluators (baseline before switching)
62
+ - Feature spec: `docs/features/feat-002-reasoning-strategies.md`
63
+
64
+ <!-- agentforge:end-managed -->
65
+
66
+ <!-- agentforge:custom -->
67
+ <!-- agentforge:end-custom -->
@@ -0,0 +1,75 @@
1
+ # 05 — Write prompts
2
+
3
+ > **Goal:** author a system prompt that produces consistent
4
+ > agent behaviour.
5
+ > **Time:** ~20 minutes.
6
+ > **Prereqs:** runbook 01 done.
7
+
8
+ ## TL;DR
9
+
10
+ ```yaml
11
+ # agentforge.yaml
12
+ agent:
13
+ system_prompt_file: ./prompts/system.md
14
+ ```
15
+
16
+ ```markdown
17
+ <!-- prompts/system.md -->
18
+ You are a {{ role }}. Your job is {{ goal }}.
19
+
20
+ ## Tools
21
+ You have these tools available: {{ tool_summary }}.
22
+ Use them only when you cannot answer from existing context.
23
+
24
+ ## Output
25
+ Produce a `SimpleFinding` object with severity in {low, medium, high}.
26
+
27
+ ## Style
28
+ Be concise. Cite sources. Refuse silently if asked to bypass safety.
29
+ ```
30
+
31
+ ## Step by step
32
+
33
+ 1. **Start with a role + goal sentence.** Two sentences is
34
+ enough. Long preambles dilute the rest of the prompt.
35
+ 2. **List tools and their purpose.** The LLM already knows the
36
+ schemas (the framework injects them). What it doesn't know
37
+ is *when* to prefer one over another. Tell it.
38
+ 3. **Define output shape** — point at the finding variant
39
+ (`SimpleFinding`, `PatchFinding`, etc.) you want. The
40
+ framework will enforce it via the configured renderer.
41
+ 4. **Pin style** — concise, cite, refuse-silently. Models
42
+ respect concrete style rules more than vibe descriptors.
43
+ 5. **Iterate on examples.** Add 1-3 worked examples for hard
44
+ cases. Examples are cheaper than rule-tweaking.
45
+
46
+ ## Variations
47
+
48
+ - **Per-strategy prompts** — multi-agent supervisors carry a
49
+ separate prompt under `agent.workers.<role>.system_prompt`.
50
+ - **Tool-specific framing** — for tools whose names are
51
+ ambiguous, restate intent at point of use: "Call `lookup_user`
52
+ with the email from the issue body".
53
+ - **Dynamic context** — use Jinja in the prompt file; the
54
+ framework expands `{{ runtime_context.user }}` etc. at run
55
+ time.
56
+
57
+ ## Troubleshooting
58
+
59
+ | Symptom | Cause | Fix |
60
+ |---|---|---|
61
+ | Output drifts from the schema | prompt over-emphasises prose | move output-shape rule to the top |
62
+ | Model refuses to answer | safety phrasing too defensive | tone down "you must never..." → "decline politely when..." |
63
+ | Repeated tool calls with same args | tool docstring + system prompt disagree | reconcile; the docstring wins for the LLM |
64
+ | Verbose, low-signal responses | no concision rule | add "respond in ≤ 200 words" |
65
+
66
+ ## Related
67
+
68
+ - Runbook 02 — Add a tool (tool docstrings)
69
+ - Runbook 10 — Add evaluators (measure prompt impact)
70
+ - Feature spec: `docs/features/feat-008-findings-and-output-shapes.md`
71
+
72
+ <!-- agentforge:end-managed -->
73
+
74
+ <!-- agentforge:custom -->
75
+ <!-- agentforge:end-custom -->
@@ -0,0 +1,75 @@
1
+ # 06 — Test your agent
2
+
3
+ > **Goal:** unit-test your agent without hitting a real LLM or
4
+ > network.
5
+ > **Time:** ~15 minutes.
6
+ > **Prereqs:** runbook 02 (you have at least one tool).
7
+
8
+ ## TL;DR
9
+
10
+ ```python
11
+ import pytest
12
+ from agentforge.testing import MockLLMClient, FakeTool, agent_factory
13
+
14
+ @pytest.mark.asyncio
15
+ async def test_population_lookup() -> None:
16
+ llm = MockLLMClient.from_script([
17
+ {"text": "Looking up.",
18
+ "tool_calls": [{"name": "search", "args": {"q": "Spain"}}]},
19
+ {"text": "47.5M", "stop_reason": "end_turn"},
20
+ ])
21
+ web = FakeTool.fake("search", lambda **kw: "47.5M people")
22
+ agent = agent_factory(model=llm, tools=[web])
23
+ result = await agent.run("How many in Spain?")
24
+ assert "47.5M" in result.output
25
+ ```
26
+
27
+ ## Step by step
28
+
29
+ 1. **Use `MockLLMClient.from_script(...)`** for tests that need
30
+ to drive specific LLM responses. `deterministic("ok")` works
31
+ when you only care that the loop completes.
32
+ 2. **Stub tools with `FakeTool.fake(name, fn)`** — accepts a
33
+ static value or a callable. Preserves the real tool's
34
+ `name` so the LLM sees the same surface.
35
+ 3. **Use `agent_factory(...)`** instead of raw `Agent(...)`. It
36
+ bakes in safe defaults (in-memory store, no log filter
37
+ mutation, low budget) so tests stay isolated.
38
+ 4. **Assert on `result.output`** for the answer, and on
39
+ `mock_llm.tool_calls_observed` for the LLM's tool-use
40
+ sequence. Both are cheaper than parsing trace strings.
41
+ 5. **Record once, replay forever.** For tests that exercise a
42
+ real provider response, `record_llm(real, "cassette.jsonl")`
43
+ captures it; subsequent runs use
44
+ `MockLLMClient.from_recording(...)`.
45
+
46
+ ## Variations
47
+
48
+ - **Property-based tests** — pair `agent_factory` with Hypothesis
49
+ strategies for input fuzzing.
50
+ - **Golden sets** — `agentforge-testing` ships
51
+ `GoldenSetRunner.from_jsonl(...)`; fixture lines hold
52
+ `task` + `expected` (exact / contains / regex / any_of).
53
+ - **Snapshot rendering** — `assert_snapshot(text, path)` for
54
+ scorecard / patch output. `UPDATE_SNAPSHOTS=1 pytest`
55
+ re-records.
56
+
57
+ ## Troubleshooting
58
+
59
+ | Symptom | Cause | Fix |
60
+ |---|---|---|
61
+ | `MockLLMClient exhausted` | the agent made more LLM calls than scripted | extend the script or relax with `deterministic` |
62
+ | Test runs hit a real API | imported the real client by accident | wrap construction in `agent_factory(model=mock_llm)` |
63
+ | Stub-tool not invoked | name mismatch between FakeTool and what the script says the LLM called | both must use the same `name=` value |
64
+ | Flaky asyncio teardown warning | event-loop GC noise on macOS | already filtered in pyproject; safe to ignore |
65
+
66
+ ## Related
67
+
68
+ - Runbook 02 — Add a tool
69
+ - Runbook 10 — Add evaluators
70
+ - Feature spec: `docs/features/feat-016-testing-framework.md`
71
+
72
+ <!-- agentforge:end-managed -->
73
+
74
+ <!-- agentforge:custom -->
75
+ <!-- agentforge:end-custom -->
@@ -0,0 +1,70 @@
1
+ # 07 — Debug a run
2
+
3
+ > **Goal:** reproduce a failed run locally and step through what
4
+ > the agent saw.
5
+ > **Time:** ~15 minutes.
6
+ > **Prereqs:** runbooks 01 + 06.
7
+
8
+ ## TL;DR
9
+
10
+ ```bash
11
+ # In the offending env (or anywhere with a recorded run):
12
+ agentforge run --record "the failing task"
13
+ # Note the printed run_id, then:
14
+ agentforge debug --replay <run_id>
15
+ > step
16
+ > state
17
+ > inspect tool_call.arguments
18
+ > quit
19
+ ```
20
+
21
+ ## Step by step
22
+
23
+ 1. **Reproduce with recording.** `agentforge run --record "..."`
24
+ persists every `Step` to the configured memory store under
25
+ `category="__step"`. Without `--record`, the trace dies with
26
+ the process.
27
+ 2. **Pick the run_id.** It prints to stdout at run end (also
28
+ present on `RunResult.run_id`).
29
+ 3. **Open the REPL** with `agentforge debug --replay <run_id>`.
30
+ Reads from memory; no LLM call required.
31
+ 4. **Step + inspect.** `step` advances; `state` prints the
32
+ current step's payload; `inspect <dotted-path>` drills in
33
+ (`inspect tool_call.arguments`). `back` rewinds; `steps`
34
+ lists the whole trace.
35
+ 5. **Bisect.** If the failure is in step 17 of 22, `--to-step
36
+ N` on `agentforge run --replay` re-runs the loop up to step
37
+ N with the recorded LLM responses and stops.
38
+
39
+ ## Variations
40
+
41
+ - **Replay tools** — `replay_tools(memory, run_id, [your_tools])`
42
+ returns wrappers whose `run()` returns the recorded
43
+ observation. Pair with `ReplayLLMClient.from_recording(...)`
44
+ for byte-identical replays.
45
+ - **Cassette replay** — `agentforge run --replay <run_id>
46
+ --to-step 5` is the CLI surface around the same primitives.
47
+ - **Tracing** — the OTel root span ID is in
48
+ `RunResult.metadata` if observability is enabled (runbook 12);
49
+ cross-reference with your APM dashboard.
50
+
51
+ ## Troubleshooting
52
+
53
+ | Symptom | Cause | Fix |
54
+ |---|---|---|
55
+ | `No recorded steps for run_id` | run was not recorded | add `--record` to the next reproduction; configure `modules.memory` |
56
+ | Replay diverges from original | tool or LLM args drifted | use `ReplayLLMClient.from_recording` AND `replay_tools` together |
57
+ | REPL EOF errors | piped stdin without newline | append `\n` to the scripted input |
58
+ | `ReplayExhausted` | trying to replay further than recorded | task changed; re-record with the new task |
59
+
60
+ ## Related
61
+
62
+ - Runbook 06 — Test your agent
63
+ - Runbook 08 — Add memory (recording lives in the memory store)
64
+ - Feature spec: `docs/features/feat-017-cli-runtime.md` (debug,
65
+ replay)
66
+
67
+ <!-- agentforge:end-managed -->
68
+
69
+ <!-- agentforge:custom -->
70
+ <!-- agentforge:end-custom -->
@@ -0,0 +1,75 @@
1
+ # 08 — Add memory / persistence
2
+
3
+ > **Goal:** swap the default in-memory store for a durable
4
+ > backend (SQLite / Postgres / Neo4j / SurrealDB).
5
+ > **Time:** ~10 minutes.
6
+ > **Prereqs:** runbook 01.
7
+
8
+ ## TL;DR
9
+
10
+ ```yaml
11
+ # agentforge.yaml
12
+ modules:
13
+ memory:
14
+ driver: postgres
15
+ config:
16
+ dsn: "${DATABASE_URL}"
17
+ min_size: 1
18
+ max_size: 10
19
+ ```
20
+
21
+ ```bash
22
+ agentforge add module memory-postgres
23
+ agentforge db migrate
24
+ ```
25
+
26
+ ## Step by step
27
+
28
+ 1. **Pick a driver.** Default to SQLite for single-host
29
+ deployments; Postgres for managed-database / multi-writer;
30
+ Neo4j or SurrealDB if you need graph relationships
31
+ (supersede chains, finding lineage).
32
+ 2. **Install the module.** `agentforge add module memory-<driver>`
33
+ uses the framework's manifest applier (no manual pip).
34
+ 3. **Configure** under `modules.memory` in `agentforge.yaml`.
35
+ Use `${ENV_VAR}` interpolation for credentials — never put
36
+ them in the YAML literal.
37
+ 4. **Run schema migration** with `agentforge db migrate`. The
38
+ command is a no-op for drivers that create their schema
39
+ eagerly (in-memory, sqlite); a real DDL pass for postgres /
40
+ neo4j / surrealdb.
41
+ 5. **Verify** with `agentforge db query 'category:__step'` — if
42
+ your previous runs were recorded, you'll see step claims.
43
+
44
+ ## Variations
45
+
46
+ - **Drop-in driver swap** — `agentforge swap memory sqlite
47
+ postgres` migrates the configuration; data migration is
48
+ separate (`agentforge db backup` then `db restore`).
49
+ - **Multiple categories** — write your own claims via
50
+ `agent.memory.put(Claim(category="custom", ...))`. Reserved
51
+ categories (`__step`, `__eval`, `__run`) belong to the
52
+ framework.
53
+ - **TTL** — drivers that declare the `ttl` capability honour
54
+ `agent.memory.set_ttl(...)`. Check with
55
+ `agent.memory.supports("ttl")`.
56
+
57
+ ## Troubleshooting
58
+
59
+ | Symptom | Cause | Fix |
60
+ |---|---|---|
61
+ | `No module registered for memory:postgres` | driver not installed | `agentforge add module memory-postgres` |
62
+ | `connection refused` | DSN points at the wrong host / port | check `${DATABASE_URL}` expansion via `agentforge config show --resolved` |
63
+ | `delete() requires at least one filter` | called `memory.delete()` with no args | pass `run_id=` / `category=` / `older_than=` |
64
+ | Schema version mismatch on upgrade | driver schema bumped | `agentforge db backup` → `agentforge db migrate` → `agentforge db restore` |
65
+
66
+ ## Related
67
+
68
+ - Runbook 14 — Deploy your agent (DSN secret management)
69
+ - Runbook 15 — Upgrade your agent (schema migrations)
70
+ - Feature spec: `docs/features/feat-005-persistence-and-memory.md`
71
+
72
+ <!-- agentforge:end-managed -->
73
+
74
+ <!-- agentforge:custom -->
75
+ <!-- agentforge:end-custom -->
@@ -0,0 +1,78 @@
1
+ # 09 — Add MCP servers
2
+
3
+ > **Goal:** consume Anthropic's Model Context Protocol (MCP) tool servers
4
+ > as if they were native tools, or expose your agent's tools as
5
+ > an MCP server.
6
+ > **Time:** ~10 minutes.
7
+ > **Prereqs:** runbook 02.
8
+
9
+ ## TL;DR
10
+
11
+ ```yaml
12
+ # agentforge.yaml
13
+ modules:
14
+ protocols:
15
+ - name: mcp
16
+ config:
17
+ servers:
18
+ - command: ["uv", "run", "filesystem-mcp"]
19
+ cwd: ./mcp-servers
20
+ expose_local_tools: true # turn this agent into an MCP server too
21
+ ```
22
+
23
+ ```bash
24
+ agentforge add module mcp
25
+ ```
26
+
27
+ ## Step by step
28
+
29
+ 1. **Install the MCP module.** `agentforge add module mcp` —
30
+ adds `agentforge-mcp` to dependencies and registers the
31
+ protocol under `modules.protocols`.
32
+ 2. **Declare upstream servers.** `servers:` is a list of command
33
+ specifications. Each spawns on agent start; the framework
34
+ handles handshake and tool discovery.
35
+ 3. **Restart the agent.** Discovered MCP tools appear in the
36
+ agent's tool list automatically; the LLM sees their schemas
37
+ alongside framework-native tools.
38
+ 4. **(Optional) Expose your tools.** `expose_local_tools: true`
39
+ makes this agent's tools available as an MCP server, so other
40
+ agents (or Claude Desktop) can call into it.
41
+ 5. **Verify** with `agentforge list tools` — MCP tools have an
42
+ `mcp:` prefix in the resolver listing.
43
+
44
+ ## Variations
45
+
46
+ - **Per-server allowlist** — `servers[].tools: ["read_file",
47
+ "list_directory"]` restricts what gets exposed from each
48
+ server. Use this for least-privilege.
49
+ - **Auth headers** — `servers[].auth.bearer: "${MCP_TOKEN}"` for
50
+ hosted MCP servers behind auth.
51
+ - **Capability negotiation** — `expose_local_tools.exclude:
52
+ [...]` strips internal tools from the exposed MCP surface.
53
+
54
+ ## Troubleshooting
55
+
56
+ | Symptom | Cause | Fix |
57
+ |---|---|---|
58
+ | MCP server fails to spawn | wrong command path | check `agentforge config show --resolved` matches your shell's view |
59
+ | Tools not visible to LLM | discovery race | bump `servers[].start_timeout` (default 5s) |
60
+ | Tool calls hang | MCP server blocking on stdio | check the server's logs; MCP requires line-delimited JSON |
61
+ | `permission denied` from exposed server | client called a tool that is not in the allowlist | add it to `expose_local_tools.tools` or remove it from the deny list (e.g. `expose_local_tools.exclude`) |
62
+
63
+ ## Related
64
+
65
+ - Runbook 02 — Add a (native) tool
66
+ - Runbook 11 — Add safety guardrails (MCP tools go through the
67
+ same gates)
68
+ - Feature spec: `docs/features/feat-013-mcp-integration.md`
69
+
70
+ > **Note:** MCP integration is feat-013. If this framework
71
+ > version pre-dates the module shipping, install
72
+ > `agentforge-mcp` manually from the framework repo's
73
+ > `packages/`.
74
+
75
+ <!-- agentforge:end-managed -->
76
+
77
+ <!-- agentforge:custom -->
78
+ <!-- agentforge:end-custom -->