agentforge-py 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentforge/__init__.py +114 -0
- agentforge/_testing/__init__.py +19 -0
- agentforge/_testing/fake_llm.py +126 -0
- agentforge/_testing/fake_tool.py +122 -0
- agentforge/_tools/__init__.py +14 -0
- agentforge/_tools/calculator.py +102 -0
- agentforge/_tools/decorator.py +300 -0
- agentforge/_tools/file_read.py +112 -0
- agentforge/_tools/shell.py +134 -0
- agentforge/_tools/web_search.py +207 -0
- agentforge/agent.py +817 -0
- agentforge/auth.py +42 -0
- agentforge/cli/__init__.py +18 -0
- agentforge/cli/_build.py +323 -0
- agentforge/cli/_scaffold_state.py +250 -0
- agentforge/cli/_shared_scaffold.py +174 -0
- agentforge/cli/config_cmd.py +174 -0
- agentforge/cli/db_cmd.py +262 -0
- agentforge/cli/debug_cmd.py +168 -0
- agentforge/cli/docs_cmd.py +217 -0
- agentforge/cli/eval_cmd.py +181 -0
- agentforge/cli/health_cmd.py +139 -0
- agentforge/cli/list_modules.py +85 -0
- agentforge/cli/main.py +81 -0
- agentforge/cli/manifest_apply.py +368 -0
- agentforge/cli/module_cmd.py +247 -0
- agentforge/cli/new_cmd.py +171 -0
- agentforge/cli/run_cmd.py +234 -0
- agentforge/cli/upgrade_cmd.py +230 -0
- agentforge/config/__init__.py +45 -0
- agentforge/eval/__init__.py +18 -0
- agentforge/eval/consistency.py +107 -0
- agentforge/eval/coverage.py +100 -0
- agentforge/eval/format_compliance.py +107 -0
- agentforge/eval/regression.py +143 -0
- agentforge/findings.py +166 -0
- agentforge/guardrails/__init__.py +32 -0
- agentforge/guardrails/allowlist.py +49 -0
- agentforge/guardrails/capability_check.py +58 -0
- agentforge/guardrails/engine.py +289 -0
- agentforge/guardrails/pii_redact_basic.py +61 -0
- agentforge/guardrails/prompt_injection_basic.py +90 -0
- agentforge/memory/__init__.py +16 -0
- agentforge/memory/in_memory.py +130 -0
- agentforge/memory/in_memory_graph.py +262 -0
- agentforge/memory/in_memory_vector.py +167 -0
- agentforge/pipeline/__init__.py +26 -0
- agentforge/pipeline/engine.py +189 -0
- agentforge/pipeline/errors.py +19 -0
- agentforge/pipeline/tool.py +93 -0
- agentforge/py.typed +0 -0
- agentforge/recording.py +189 -0
- agentforge/renderers/__init__.py +28 -0
- agentforge/renderers/_defaults.py +32 -0
- agentforge/renderers/markdown.py +44 -0
- agentforge/renderers/patch_applier.py +46 -0
- agentforge/renderers/registry.py +108 -0
- agentforge/renderers/scorecard.py +59 -0
- agentforge/renderers/span_table.py +71 -0
- agentforge/replay.py +260 -0
- agentforge/resolver_register.py +41 -0
- agentforge/retrieval.py +410 -0
- agentforge/runtime.py +63 -0
- agentforge/strategies/__init__.py +27 -0
- agentforge/strategies/_base.py +280 -0
- agentforge/strategies/_plan.py +93 -0
- agentforge/strategies/multi_agent.py +541 -0
- agentforge/strategies/plan_execute.py +506 -0
- agentforge/strategies/react.py +237 -0
- agentforge/strategies/tot.py +472 -0
- agentforge/templates/_shared/.cursorrules +12 -0
- agentforge/templates/_shared/.github/copilot-instructions.md +13 -0
- agentforge/templates/_shared/.gitkeep +0 -0
- agentforge/templates/_shared/AGENTS.md.tmpl +123 -0
- agentforge/templates/_shared/CLAUDE.md +13 -0
- agentforge/templates/_shared/docs/runbooks/01-set-up-new-agent.md.tmpl +67 -0
- agentforge/templates/_shared/docs/runbooks/02-add-a-tool.md +67 -0
- agentforge/templates/_shared/docs/runbooks/03-add-a-pipeline-task.md +69 -0
- agentforge/templates/_shared/docs/runbooks/04-pick-reasoning-strategy.md +67 -0
- agentforge/templates/_shared/docs/runbooks/05-write-prompts.md +75 -0
- agentforge/templates/_shared/docs/runbooks/06-test-your-agent.md +75 -0
- agentforge/templates/_shared/docs/runbooks/07-debug-a-run.md +70 -0
- agentforge/templates/_shared/docs/runbooks/08-add-memory.md +75 -0
- agentforge/templates/_shared/docs/runbooks/09-add-mcp.md +78 -0
- agentforge/templates/_shared/docs/runbooks/10-add-evaluators.md +76 -0
- agentforge/templates/_shared/docs/runbooks/11-add-safety-guardrails.md +83 -0
- agentforge/templates/_shared/docs/runbooks/12-add-observability.md +77 -0
- agentforge/templates/_shared/docs/runbooks/13-configure-multi-provider.md +91 -0
- agentforge/templates/_shared/docs/runbooks/14-deploy-your-agent.md +70 -0
- agentforge/templates/_shared/docs/runbooks/15-upgrade-your-agent.md +67 -0
- agentforge/templates/_shared/docs/runbooks/16-configuration-reference.md +81 -0
- agentforge/templates/_shared/docs/runbooks/17-add-reranker.md +78 -0
- agentforge/templates/_shared/docs/runbooks/18-add-hybrid-search.md +78 -0
- agentforge/templates/_shared/docs/runbooks/19-add-graphrag.md +83 -0
- agentforge/templates/_shared/docs/runbooks/20-apply-schema-migrations.md +92 -0
- agentforge/templates/_shared/docs/runbooks/21-use-streaming-guardrails.md +82 -0
- agentforge/templates/_shared/docs/runbooks/README.md.tmpl +68 -0
- agentforge/templates/code-reviewer/.env.example +8 -0
- agentforge/templates/code-reviewer/.gitignore +7 -0
- agentforge/templates/code-reviewer/README.md +12 -0
- agentforge/templates/code-reviewer/agentforge.yaml +23 -0
- agentforge/templates/code-reviewer/copier.yml +34 -0
- agentforge/templates/code-reviewer/pyproject.toml +18 -0
- agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
- agentforge/templates/docs-qa/.env.example +8 -0
- agentforge/templates/docs-qa/.gitignore +7 -0
- agentforge/templates/docs-qa/README.md +14 -0
- agentforge/templates/docs-qa/agentforge.yaml +19 -0
- agentforge/templates/docs-qa/copier.yml +31 -0
- agentforge/templates/docs-qa/pyproject.toml +18 -0
- agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
- agentforge/templates/minimal/.env.example +11 -0
- agentforge/templates/minimal/.gitignore +10 -0
- agentforge/templates/minimal/README.md +28 -0
- agentforge/templates/minimal/agentforge.yaml +10 -0
- agentforge/templates/minimal/copier.yml +52 -0
- agentforge/templates/minimal/pyproject.toml +18 -0
- agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/main.py +34 -0
- agentforge/templates/patch-bot/.env.example +8 -0
- agentforge/templates/patch-bot/.gitignore +7 -0
- agentforge/templates/patch-bot/README.md +13 -0
- agentforge/templates/patch-bot/agentforge.yaml +15 -0
- agentforge/templates/patch-bot/copier.yml +31 -0
- agentforge/templates/patch-bot/pyproject.toml +18 -0
- agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
- agentforge/templates/research/.env.example +8 -0
- agentforge/templates/research/.gitignore +7 -0
- agentforge/templates/research/README.md +14 -0
- agentforge/templates/research/agentforge.yaml +17 -0
- agentforge/templates/research/copier.yml +31 -0
- agentforge/templates/research/pyproject.toml +18 -0
- agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/main.py +31 -0
- agentforge/templates/triage/.env.example +8 -0
- agentforge/templates/triage/.gitignore +7 -0
- agentforge/templates/triage/README.md +14 -0
- agentforge/templates/triage/agentforge.yaml +25 -0
- agentforge/templates/triage/copier.yml +31 -0
- agentforge/templates/triage/pyproject.toml +18 -0
- agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/main.py +30 -0
- agentforge/testing/__init__.py +69 -0
- agentforge/testing/conformance.py +40 -0
- agentforge/testing/factory.py +89 -0
- agentforge/testing/fixtures.py +42 -0
- agentforge/testing/llm.py +235 -0
- agentforge/testing/recording.py +177 -0
- agentforge/tools/__init__.py +41 -0
- agentforge_py-0.2.1.dist-info/METADATA +158 -0
- agentforge_py-0.2.1.dist-info/RECORD +157 -0
- agentforge_py-0.2.1.dist-info/WHEEL +4 -0
- agentforge_py-0.2.1.dist-info/entry_points.txt +2 -0
- agentforge_py-0.2.1.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# 01 — Set up a new AgentForge agent
|
|
2
|
+
|
|
3
|
+
> **Goal:** verify the freshly scaffolded `{{ project_slug }}`
|
|
4
|
+
> agent runs end-to-end against your provider.
|
|
5
|
+
> **Time:** ~10 minutes.
|
|
6
|
+
> **Prereqs:** none (this is runbook 01).
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
cd {{ project_slug }}
|
|
12
|
+
uv sync
|
|
13
|
+
cp .env.example .env # then fill in real credentials
|
|
14
|
+
agentforge config validate
|
|
15
|
+
agentforge run "hello"
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Step by step
|
|
19
|
+
|
|
20
|
+
1. **Install dependencies** with `uv sync`. AgentForge uses uv
|
|
21
|
+
workspaces; the lock file pins every version.
|
|
22
|
+
2. **Configure credentials** by copying `.env.example` to `.env`
|
|
23
|
+
and filling in the LLM provider's API key (`ANTHROPIC_API_KEY`,
|
|
24
|
+
`AWS_ACCESS_KEY_ID`, or `OPENAI_API_KEY` depending on what
|
|
25
|
+
you scaffolded). Never check `.env` into git.
|
|
26
|
+
3. **Validate the config** with `agentforge config validate`.
|
|
27
|
+
That parses `agentforge.yaml`, expands `${ENV_VAR}` references,
|
|
28
|
+
and surfaces any unknown keys.
|
|
29
|
+
4. **Run the agent** with `agentforge run "your first task"`.
|
|
30
|
+
The default output is rich-formatted; pass `--output-format
|
|
31
|
+
json` for structured output (handy in CI).
|
|
32
|
+
5. **Check the trace** — every step the agent emitted is on the
|
|
33
|
+
returned `RunResult`. `agentforge run --record "..."` followed
|
|
34
|
+
by `agentforge debug --replay <run_id>` lets you step through.
|
|
35
|
+
|
|
36
|
+
## Variations
|
|
37
|
+
|
|
38
|
+
- **Different provider** — edit `agentforge.yaml > providers >
|
|
39
|
+
default` to point at the provider you prefer, then update
|
|
40
|
+
`.env` accordingly. See runbook 13 for multi-provider setups.
|
|
41
|
+
- **No credentials yet** — `MockLLMClient.deterministic("ok")`
|
|
42
|
+
in your own tests lets you exercise the loop without hitting
|
|
43
|
+
a real API (see runbook 06).
|
|
44
|
+
- **Batch / CI mode** — `agentforge run --no-prompts --task-file
|
|
45
|
+
./task.txt --output-format json` is the script-friendly path.
|
|
46
|
+
|
|
47
|
+
## Troubleshooting
|
|
48
|
+
|
|
49
|
+
| Symptom | Cause | Fix |
|
|
50
|
+
|---|---|---|
|
|
51
|
+
| `agentforge: command not found` | uv venv isn't activated | `uv sync` then prefix commands with `uv run` |
|
|
52
|
+
| `config invalid` exit code 2 | unknown YAML key | check the diff between your `agentforge.yaml` and `agentforge config schema` output |
|
|
53
|
+
| `No LLM provider registered` | provider package not installed | `agentforge add module <provider>` (e.g. `agentforge add module bedrock`) |
|
|
54
|
+
| `BudgetExceeded` on first run | budget too low for the model | bump `agent.budget.usd` in `agentforge.yaml` |
|
|
55
|
+
|
|
56
|
+
## Related
|
|
57
|
+
|
|
58
|
+
- Runbook 02 — Add a tool
|
|
59
|
+
- Runbook 06 — Test your agent
|
|
60
|
+
- Runbook 13 — Configure multi-provider
|
|
61
|
+
- Feature spec: `docs/features/feat-011-scaffolding-and-upgrade.md`
|
|
62
|
+
|
|
63
|
+
<!-- agentforge:end-managed -->
|
|
64
|
+
|
|
65
|
+
<!-- agentforge:custom -->
|
|
66
|
+
<!-- Project-specific setup notes go here. Survives upgrades. -->
|
|
67
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# 02 — Add a tool
|
|
2
|
+
|
|
3
|
+
> **Goal:** make a new capability available to your agent's
|
|
4
|
+
> reasoning loop.
|
|
5
|
+
> **Time:** ~10 minutes.
|
|
6
|
+
> **Prereqs:** runbook 01 done.
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```python
|
|
11
|
+
from agentforge import tool
|
|
12
|
+
|
|
13
|
+
@tool
|
|
14
|
+
async def fetch_weather(*, city: str) -> str:
|
|
15
|
+
"""Return the current weather summary for `city`."""
|
|
16
|
+
return await my_weather_api(city)
|
|
17
|
+
|
|
18
|
+
agent = Agent(model="...", tools=[fetch_weather])
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Step by step
|
|
22
|
+
|
|
23
|
+
1. **Decide tool surface** — what kwargs does the LLM pass? Each
|
|
24
|
+
becomes a typed parameter on the function. Use Pydantic models
|
|
25
|
+
only when the inputs are nested.
|
|
26
|
+
2. **Author the tool** with the `@tool` decorator. Type hints
|
|
27
|
+
drive the JSON schema the LLM sees; do NOT hand-write a schema.
|
|
28
|
+
3. **Add a docstring** — the first line is what the LLM reads to
|
|
29
|
+
decide when to call your tool. Keep it short and
|
|
30
|
+
behaviour-focused: "Returns X given Y", not "This tool will...".
|
|
31
|
+
4. **Pass the tool to `Agent(tools=[...])`** or list it under
|
|
32
|
+
`agent.tools:` in `agentforge.yaml` so it's auto-resolved on
|
|
33
|
+
`agentforge run`.
|
|
34
|
+
5. **Cover with a test** — instantiate via `FakeTool.fake(...)`
|
|
35
|
+
for tests where the real call would be too slow / costly
|
|
36
|
+
(see runbook 06).
|
|
37
|
+
|
|
38
|
+
## Variations
|
|
39
|
+
|
|
40
|
+
- **Class-based tool** — subclass `Tool` directly when you need
|
|
41
|
+
per-instance state (DB pool, HTTP client). Pattern in
|
|
42
|
+
`agentforge_core.contracts.tool`.
|
|
43
|
+
- **Destructive tools** — set `capabilities: ClassVar =
|
|
44
|
+
frozenset({"destructive"})` on the class. The `capability_check`
|
|
45
|
+
guardrail (runbook 11) will deny it unless allowlisted.
|
|
46
|
+
- **Long-running tools** — return early with a status; let the
|
|
47
|
+
next iteration check completion. Don't `await` for 30 seconds.
|
|
48
|
+
|
|
49
|
+
## Troubleshooting
|
|
50
|
+
|
|
51
|
+
| Symptom | Cause | Fix |
|
|
52
|
+
|---|---|---|
|
|
53
|
+
| LLM never calls the tool | docstring too vague | rewrite to be behaviour-focused: "Fetches X for Y" |
|
|
54
|
+
| `ValidationError` on tool call | type hints don't match LLM args | check the JSON schema with `tool.to_spec()` |
|
|
55
|
+
| Tool runs but observation lost | tool returns `None` | return a string (or a dict; the framework JSON-serialises) |
|
|
56
|
+
| Tool ran twice unexpectedly | LLM retried | check that the previous observation was surfaced clearly; vague observations cause retries |
|
|
57
|
+
|
|
58
|
+
## Related
|
|
59
|
+
|
|
60
|
+
- Runbook 06 — Test your agent (covers `FakeTool.fake`)
|
|
61
|
+
- Runbook 11 — Add safety guardrails (capability gating)
|
|
62
|
+
- Feature spec: `docs/features/feat-004-tools-system.md`
|
|
63
|
+
|
|
64
|
+
<!-- agentforge:end-managed -->
|
|
65
|
+
|
|
66
|
+
<!-- agentforge:custom -->
|
|
67
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# 03 — Add a pipeline task
|
|
2
|
+
|
|
3
|
+
> **Goal:** insert a deterministic, non-LLM step into the agent's
|
|
4
|
+
> workflow (e.g. parse a file, call a metric API, normalise
|
|
5
|
+
> input).
|
|
6
|
+
> **Time:** ~15 minutes.
|
|
7
|
+
> **Prereqs:** runbook 02 (you understand tools).
|
|
8
|
+
|
|
9
|
+
## TL;DR
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
from agentforge import Pipeline, Task
|
|
13
|
+
|
|
14
|
+
class FetchPRMetadata(Task):
|
|
15
|
+
async def run(self, *, pr_url: str) -> dict:
|
|
16
|
+
return await my_github_client.get_pr(pr_url)
|
|
17
|
+
|
|
18
|
+
pipeline = Pipeline([FetchPRMetadata, AgentStep, RenderReport])
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Step by step
|
|
22
|
+
|
|
23
|
+
1. **Identify deterministic boundaries** — anything that has a
|
|
24
|
+
stable function from inputs to outputs (file parsing, API
|
|
25
|
+
normalisation, score thresholding) is a Task, not an LLM call.
|
|
26
|
+
2. **Author the Task** — subclass `Task`; declare typed inputs
|
|
27
|
+
and outputs; `async def run(...)` is the body.
|
|
28
|
+
3. **Compose** with `Pipeline([T1, T2, T3])` — tasks run in
|
|
29
|
+
order, each receiving the previous one's output.
|
|
30
|
+
4. **Mix LLM steps** — use the framework's `AgentStep` wrapper
|
|
31
|
+
to drop an agent run into a pipeline; the surrounding tasks
|
|
32
|
+
handle deterministic pre/post processing.
|
|
33
|
+
5. **Capture failures** — Tasks raise `TaskError` for
|
|
34
|
+
recoverable cases; the framework surfaces it as a step in
|
|
35
|
+
the agent's trace.
|
|
36
|
+
|
|
37
|
+
## Variations
|
|
38
|
+
|
|
39
|
+
- **Parallel tasks** — `Pipeline.parallel([T1, T2])` runs them
|
|
40
|
+
concurrently and joins their outputs into a dict.
|
|
41
|
+
- **Conditional branching** — wrap with `Pipeline.branch(
|
|
42
|
+
condition_fn, true_pipe, false_pipe)`.
|
|
43
|
+
- **Retry** — set `retries=N` on the Task class; the framework
|
|
44
|
+
re-runs with exponential backoff.
|
|
45
|
+
|
|
46
|
+
## Troubleshooting
|
|
47
|
+
|
|
48
|
+
| Symptom | Cause | Fix |
|
|
49
|
+
|---|---|---|
|
|
50
|
+
| Task output not visible to LLM | the task's output was never passed into the agent step's context | thread output through `AgentStep`'s task context |
|
|
51
|
+
| Pipeline fails fast on first error | default behaviour | wrap with `Pipeline.tolerant(...)` for best-effort runs |
|
|
52
|
+
| Memory blows up on large pipelines | task results held in RAM | persist intermediate outputs to memory store (runbook 08) |
|
|
53
|
+
|
|
54
|
+
## Related
|
|
55
|
+
|
|
56
|
+
- Runbook 02 — Add a tool (different shape: tools are
|
|
57
|
+
LLM-invoked, tasks are deterministic)
|
|
58
|
+
- Runbook 08 — Add memory
|
|
59
|
+
- Feature spec: `docs/features/feat-015-pipelines-and-deterministic-tasks.md`
|
|
60
|
+
|
|
61
|
+
> **Note:** Pipelines + Tasks are feat-015 territory. If your
|
|
62
|
+
> framework version pre-dates that feature, this runbook is
|
|
63
|
+
> aspirational — fall back to wrapping deterministic steps as
|
|
64
|
+
> tools.
|
|
65
|
+
|
|
66
|
+
<!-- agentforge:end-managed -->
|
|
67
|
+
|
|
68
|
+
<!-- agentforge:custom -->
|
|
69
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# 04 — Pick a reasoning strategy
|
|
2
|
+
|
|
3
|
+
> **Goal:** choose the right `ReasoningStrategy` for your task.
|
|
4
|
+
> **Time:** ~5 minutes.
|
|
5
|
+
> **Prereqs:** runbook 01 done.
|
|
6
|
+
|
|
7
|
+
## TL;DR
|
|
8
|
+
|
|
9
|
+
```yaml
|
|
10
|
+
# agentforge.yaml
|
|
11
|
+
agent:
|
|
12
|
+
strategy: react # default — most agents stay here
|
|
13
|
+
# strategy: plan-execute # multi-step plans with verification
|
|
14
|
+
# strategy: tree-of-thoughts # search over candidate paths
|
|
15
|
+
# strategy: multi-agent # supervisor + worker fan-out
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Step by step
|
|
19
|
+
|
|
20
|
+
1. **Default to ReAct.** It's the simplest stable loop: think →
|
|
21
|
+
act → observe. Most agent failures come from prompts or
|
|
22
|
+
tools, not the loop itself. Switching strategies on Day 1 is
|
|
23
|
+
premature.
|
|
24
|
+
2. **Move to Plan-Execute** when the task naturally decomposes
|
|
25
|
+
into a plan + execution: code reviewers, multi-file edits,
|
|
26
|
+
research with structured outputs.
|
|
27
|
+
3. **Move to Tree-of-Thoughts** when you have a verifier and
|
|
28
|
+
need to explore multiple candidate paths. Expensive — only
|
|
29
|
+
when the task warrants it.
|
|
30
|
+
4. **Move to Multi-Agent** when distinct sub-agents have
|
|
31
|
+
meaningfully different system prompts / tool sets (security
|
|
32
|
+
reviewer + style reviewer + correctness reviewer).
|
|
33
|
+
5. **Measure before switching** — runbook 10 covers evaluators.
|
|
34
|
+
Don't change strategy without a baseline.
|
|
35
|
+
|
|
36
|
+
## Variations
|
|
37
|
+
|
|
38
|
+
- **Custom strategy** — subclass `ReasoningStrategy` and
|
|
39
|
+
register via `@register("strategies", "my-name")`. Run
|
|
40
|
+
`run_strategy_conformance` from `agentforge.testing` against
|
|
41
|
+
it.
|
|
42
|
+
- **Iteration cap** — `agent.max_iterations` (default 25) is
|
|
43
|
+
enforced by every shipped strategy; ToT respects it
|
|
44
|
+
per-branch.
|
|
45
|
+
- **Budget reservation** — strategies coordinate with
|
|
46
|
+
`BudgetPolicy` automatically; you don't need to thread cost
|
|
47
|
+
manually.
|
|
48
|
+
|
|
49
|
+
## Troubleshooting
|
|
50
|
+
|
|
51
|
+
| Symptom | Cause | Fix |
|
|
52
|
+
|---|---|---|
|
|
53
|
+
| Agent loops indefinitely | tool observations too vague to make progress | improve tool docstrings + observation strings |
|
|
54
|
+
| `iteration_cap` finish_reason | max_iterations too low | bump it or switch to Plan-Execute |
|
|
55
|
+
| Plan-Execute "plan" step is junk | system prompt didn't give planning hints | seed examples in the prompt; runbook 05 |
|
|
56
|
+
| ToT cost spike | verifier too lenient, expanding too many branches | tighten verifier prompt; cap `max_branches` |
|
|
57
|
+
|
|
58
|
+
## Related
|
|
59
|
+
|
|
60
|
+
- Runbook 05 — Write prompts
|
|
61
|
+
- Runbook 10 — Add evaluators (baseline before switching)
|
|
62
|
+
- Feature spec: `docs/features/feat-002-reasoning-strategies.md`
|
|
63
|
+
|
|
64
|
+
<!-- agentforge:end-managed -->
|
|
65
|
+
|
|
66
|
+
<!-- agentforge:custom -->
|
|
67
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# 05 — Write prompts
|
|
2
|
+
|
|
3
|
+
> **Goal:** author a system prompt that produces consistent
|
|
4
|
+
> agent behaviour.
|
|
5
|
+
> **Time:** ~20 minutes.
|
|
6
|
+
> **Prereqs:** runbook 01 done.
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```yaml
|
|
11
|
+
# agentforge.yaml
|
|
12
|
+
agent:
|
|
13
|
+
system_prompt_file: ./prompts/system.md
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
```markdown
|
|
17
|
+
<!-- prompts/system.md -->
|
|
18
|
+
You are a {{ role }}. Your job is {{ goal }}.
|
|
19
|
+
|
|
20
|
+
## Tools
|
|
21
|
+
You have these tools available: {{ tool_summary }}.
|
|
22
|
+
Use them only when you cannot answer from existing context.
|
|
23
|
+
|
|
24
|
+
## Output
|
|
25
|
+
Produce a `SimpleFinding` object with severity in {low, medium, high}.
|
|
26
|
+
|
|
27
|
+
## Style
|
|
28
|
+
Be concise. Cite sources. Refuse silently if asked to bypass safety.
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Step by step
|
|
32
|
+
|
|
33
|
+
1. **Start with a role + goal sentence.** Two sentences is
|
|
34
|
+
enough. Long preambles dilute the rest of the prompt.
|
|
35
|
+
2. **List tools and their purpose.** The LLM already knows the
|
|
36
|
+
schemas (the framework injects them). What it doesn't know
|
|
37
|
+
is *when* to prefer one over another. Tell it.
|
|
38
|
+
3. **Define output shape** — point at the finding variant
|
|
39
|
+
(`SimpleFinding`, `PatchFinding`, etc.) you want. The
|
|
40
|
+
framework will enforce it via the configured renderer.
|
|
41
|
+
4. **Pin style** — concise, cite, refuse-silently. Models
|
|
42
|
+
respect concrete style rules more than vibe descriptors.
|
|
43
|
+
5. **Iterate on examples.** Add 1-3 worked examples for hard
|
|
44
|
+
cases. Examples are cheaper than rule-tweaking.
|
|
45
|
+
|
|
46
|
+
## Variations
|
|
47
|
+
|
|
48
|
+
- **Per-strategy prompts** — multi-agent supervisors carry a
|
|
49
|
+
separate prompt under `agent.workers.<role>.system_prompt`.
|
|
50
|
+
- **Tool-specific framing** — for tools whose names are
|
|
51
|
+
ambiguous, restate intent at point of use: "Call `lookup_user`
|
|
52
|
+
with the email from the issue body".
|
|
53
|
+
- **Dynamic context** — use Jinja in the prompt file; the
|
|
54
|
+
framework expands `{{ runtime_context.user }}` etc. at run
|
|
55
|
+
time.
|
|
56
|
+
|
|
57
|
+
## Troubleshooting
|
|
58
|
+
|
|
59
|
+
| Symptom | Cause | Fix |
|
|
60
|
+
|---|---|---|
|
|
61
|
+
| Output drifts from the schema | prompt over-emphasises prose | move output-shape rule to the top |
|
|
62
|
+
| Model refuses to answer | safety phrasing too defensive | tone down "you must never..." → "decline politely when..." |
|
|
63
|
+
| Repeated tool calls with same args | tool docstring + system prompt disagree | reconcile; the docstring wins for the LLM |
|
|
64
|
+
| Verbose, low-signal responses | no concision rule | add "respond in ≤ 200 words" |
|
|
65
|
+
|
|
66
|
+
## Related
|
|
67
|
+
|
|
68
|
+
- Runbook 02 — Add a tool (tool docstrings)
|
|
69
|
+
- Runbook 10 — Add evaluators (measure prompt impact)
|
|
70
|
+
- Feature spec: `docs/features/feat-008-findings-and-output-shapes.md`
|
|
71
|
+
|
|
72
|
+
<!-- agentforge:end-managed -->
|
|
73
|
+
|
|
74
|
+
<!-- agentforge:custom -->
|
|
75
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# 06 — Test your agent
|
|
2
|
+
|
|
3
|
+
> **Goal:** unit-test your agent without hitting a real LLM or
|
|
4
|
+
> network.
|
|
5
|
+
> **Time:** ~15 minutes.
|
|
6
|
+
> **Prereqs:** runbook 02 (you have at least one tool).
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```python
|
|
11
|
+
import pytest
|
|
12
|
+
from agentforge.testing import MockLLMClient, FakeTool, agent_factory
|
|
13
|
+
|
|
14
|
+
@pytest.mark.asyncio
|
|
15
|
+
async def test_population_lookup() -> None:
|
|
16
|
+
llm = MockLLMClient.from_script([
|
|
17
|
+
{"text": "Looking up.",
|
|
18
|
+
"tool_calls": [{"name": "search", "args": {"q": "Spain"}}]},
|
|
19
|
+
{"text": "47.5M", "stop_reason": "end_turn"},
|
|
20
|
+
])
|
|
21
|
+
web = FakeTool.fake("search", lambda **kw: "47.5M people")
|
|
22
|
+
agent = agent_factory(model=llm, tools=[web])
|
|
23
|
+
result = await agent.run("How many in Spain?")
|
|
24
|
+
assert "47.5M" in result.output
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Step by step
|
|
28
|
+
|
|
29
|
+
1. **Use `MockLLMClient.from_script(...)`** for tests that need
|
|
30
|
+
to drive specific LLM responses. `deterministic("ok")` works
|
|
31
|
+
when you only care that the loop completes.
|
|
32
|
+
2. **Stub tools with `FakeTool.fake(name, fn)`** — accepts a
|
|
33
|
+
static value or a callable. Preserves the real tool's
|
|
34
|
+
`name` so the LLM sees the same surface.
|
|
35
|
+
3. **Use `agent_factory(...)`** instead of raw `Agent(...)`. It
|
|
36
|
+
bakes in safe defaults (in-memory store, no log filter
|
|
37
|
+
mutation, low budget) so tests stay isolated.
|
|
38
|
+
4. **Assert on `result.output`** for the answer, and on
|
|
39
|
+
`mock_llm.tool_calls_observed` for the LLM's tool-use
|
|
40
|
+
sequence. Both are cheaper than parsing trace strings.
|
|
41
|
+
5. **Record once, replay forever.** For tests that exercise a
|
|
42
|
+
real provider response, `record_llm(real, "cassette.jsonl")`
|
|
43
|
+
captures it; subsequent runs use
|
|
44
|
+
`MockLLMClient.from_recording(...)`.
|
|
45
|
+
|
|
46
|
+
## Variations
|
|
47
|
+
|
|
48
|
+
- **Property-based tests** — pair `agent_factory` with Hypothesis
|
|
49
|
+
strategies for input fuzzing.
|
|
50
|
+
- **Golden sets** — `agentforge-testing` ships
|
|
51
|
+
`GoldenSetRunner.from_jsonl(...)`; fixture lines hold
|
|
52
|
+
`task` + `expected` (exact / contains / regex / any_of).
|
|
53
|
+
- **Snapshot rendering** — `assert_snapshot(text, path)` for
|
|
54
|
+
scorecard / patch output. `UPDATE_SNAPSHOTS=1 pytest`
|
|
55
|
+
re-records.
|
|
56
|
+
|
|
57
|
+
## Troubleshooting
|
|
58
|
+
|
|
59
|
+
| Symptom | Cause | Fix |
|
|
60
|
+
|---|---|---|
|
|
61
|
+
| `MockLLMClient exhausted` | the agent made more LLM calls than scripted | extend the script or relax with `deterministic` |
|
|
62
|
+
| Test runs hit a real API | imported the real client by accident | wrap construction in `agent_factory(model=mock_llm)` |
|
|
63
|
+
| Stub-tool not invoked | name mismatch between FakeTool and what the script says the LLM called | both must use the same `name=` value |
|
|
64
|
+
| Flaky asyncio teardown warning | event-loop GC noise on macOS | already filtered in pyproject; safe to ignore |
|
|
65
|
+
|
|
66
|
+
## Related
|
|
67
|
+
|
|
68
|
+
- Runbook 02 — Add a tool
|
|
69
|
+
- Runbook 10 — Add evaluators
|
|
70
|
+
- Feature spec: `docs/features/feat-016-testing-framework.md`
|
|
71
|
+
|
|
72
|
+
<!-- agentforge:end-managed -->
|
|
73
|
+
|
|
74
|
+
<!-- agentforge:custom -->
|
|
75
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# 07 — Debug a run
|
|
2
|
+
|
|
3
|
+
> **Goal:** reproduce a failed run locally and step through what
|
|
4
|
+
> the agent saw.
|
|
5
|
+
> **Time:** ~15 minutes.
|
|
6
|
+
> **Prereqs:** runbooks 01 + 06.
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
# In the offending env (or anywhere with a recorded run):
|
|
12
|
+
agentforge run --record "the failing task"
|
|
13
|
+
# Note the printed run_id, then:
|
|
14
|
+
agentforge debug --replay <run_id>
|
|
15
|
+
> step
|
|
16
|
+
> state
|
|
17
|
+
> inspect tool_call.arguments
|
|
18
|
+
> quit
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Step by step
|
|
22
|
+
|
|
23
|
+
1. **Reproduce with recording.** `agentforge run --record "..."`
|
|
24
|
+
persists every `Step` to the configured memory store under
|
|
25
|
+
`category="__step"`. Without `--record`, the trace dies with
|
|
26
|
+
the process.
|
|
27
|
+
2. **Pick the run_id.** It prints to stdout at run end (also
|
|
28
|
+
present on `RunResult.run_id`).
|
|
29
|
+
3. **Open the REPL** with `agentforge debug --replay <run_id>`.
|
|
30
|
+
Reads from memory; no LLM call required.
|
|
31
|
+
4. **Step + inspect.** `step` advances; `state` prints the
|
|
32
|
+
current step's payload; `inspect <dotted-path>` drills in
|
|
33
|
+
(`inspect tool_call.arguments`). `back` rewinds; `steps`
|
|
34
|
+
lists the whole trace.
|
|
35
|
+
5. **Bisect.** If the failure is in step 17 of 22, `--to-step
|
|
36
|
+
N` on `agentforge run --replay` re-runs the loop up to step
|
|
37
|
+
N with the recorded LLM responses and stops.
|
|
38
|
+
|
|
39
|
+
## Variations
|
|
40
|
+
|
|
41
|
+
- **Replay tools** — `replay_tools(memory, run_id, [your_tools])`
|
|
42
|
+
returns wrappers whose `run()` returns the recorded
|
|
43
|
+
observation. Pair with `ReplayLLMClient.from_recording(...)`
|
|
44
|
+
for byte-identical replays.
|
|
45
|
+
- **Cassette replay** — `agentforge run --replay <run_id>
|
|
46
|
+
--to-step 5` is the CLI surface around the same primitives.
|
|
47
|
+
- **Tracing** — the OTel root span ID is in
|
|
48
|
+
`RunResult.metadata` if observability is enabled (runbook 12);
|
|
49
|
+
cross-reference with your APM dashboard.
|
|
50
|
+
|
|
51
|
+
## Troubleshooting
|
|
52
|
+
|
|
53
|
+
| Symptom | Cause | Fix |
|
|
54
|
+
|---|---|---|
|
|
55
|
+
| `No recorded steps for run_id` | run was not recorded | add `--record` on the next reproduction; configure `modules.memory` |
|
|
56
|
+
| Replay diverges from original | tool or LLM args drifted | use `ReplayLLMClient.from_recording` AND `replay_tools` together |
|
|
57
|
+
| REPL EOF errors | piped stdin without newline | append `\n` to the scripted input |
|
|
58
|
+
| `ReplayExhausted` | trying to replay further than recorded | task changed; re-record with the new task |
|
|
59
|
+
|
|
60
|
+
## Related
|
|
61
|
+
|
|
62
|
+
- Runbook 06 — Test your agent
|
|
63
|
+
- Runbook 08 — Add memory (recording lives in the memory store)
|
|
64
|
+
- Feature spec: `docs/features/feat-017-cli-runtime.md` (debug,
|
|
65
|
+
replay)
|
|
66
|
+
|
|
67
|
+
<!-- agentforge:end-managed -->
|
|
68
|
+
|
|
69
|
+
<!-- agentforge:custom -->
|
|
70
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# 08 — Add memory / persistence
|
|
2
|
+
|
|
3
|
+
> **Goal:** swap the default in-memory store for a durable
|
|
4
|
+
> backend (SQLite / Postgres / Neo4j / SurrealDB).
|
|
5
|
+
> **Time:** ~10 minutes.
|
|
6
|
+
> **Prereqs:** runbook 01.
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```yaml
|
|
11
|
+
# agentforge.yaml
|
|
12
|
+
modules:
|
|
13
|
+
memory:
|
|
14
|
+
driver: postgres
|
|
15
|
+
config:
|
|
16
|
+
dsn: "${DATABASE_URL}"
|
|
17
|
+
min_size: 1
|
|
18
|
+
max_size: 10
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
agentforge add module memory-postgres
|
|
23
|
+
agentforge db migrate
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Step by step
|
|
27
|
+
|
|
28
|
+
1. **Pick a driver.** Default to SQLite for single-host
|
|
29
|
+
deployments; Postgres for managed-database / multi-writer;
|
|
30
|
+
Neo4j or SurrealDB if you need graph relationships
|
|
31
|
+
(supersede chains, finding lineage).
|
|
32
|
+
2. **Install the module.** `agentforge add module memory-<driver>`
|
|
33
|
+
uses the framework's manifest applier (no manual pip).
|
|
34
|
+
3. **Configure** under `modules.memory` in `agentforge.yaml`.
|
|
35
|
+
Use `${ENV_VAR}` interpolation for credentials — never put
|
|
36
|
+
them in the YAML literal.
|
|
37
|
+
4. **Run schema migration** with `agentforge db migrate`. The
|
|
38
|
+
command is a no-op for drivers that create their schema
|
|
39
|
+
eagerly (in-memory, sqlite); a real DDL pass for postgres /
|
|
40
|
+
neo4j / surrealdb.
|
|
41
|
+
5. **Verify** with `agentforge db query 'category:__step'` — if
|
|
42
|
+
your previous runs were recorded, you'll see step claims.
|
|
43
|
+
|
|
44
|
+
## Variations
|
|
45
|
+
|
|
46
|
+
- **Drop-in driver swap** — `agentforge swap memory sqlite
|
|
47
|
+
postgres` migrates the configuration; data migration is
|
|
48
|
+
separate (`agentforge db backup` then `db restore`).
|
|
49
|
+
- **Multiple categories** — write your own claims via
|
|
50
|
+
`agent.memory.put(Claim(category="custom", ...))`. Reserved
|
|
51
|
+
categories (`__step`, `__eval`, `__run`) belong to the
|
|
52
|
+
framework.
|
|
53
|
+
- **TTL** — drivers that declare the `ttl` capability honour
|
|
54
|
+
`agent.memory.set_ttl(...)`. Check with
|
|
55
|
+
`agent.memory.supports("ttl")`.
|
|
56
|
+
|
|
57
|
+
## Troubleshooting
|
|
58
|
+
|
|
59
|
+
| Symptom | Cause | Fix |
|
|
60
|
+
|---|---|---|
|
|
61
|
+
| `No module registered for memory:postgres` | driver not installed | `agentforge add module memory-postgres` |
|
|
62
|
+
| `connection refused` | DSN points at the wrong host / port | check `${DATABASE_URL}` expansion via `agentforge config show --resolved` |
|
|
63
|
+
| `delete() requires at least one filter` | called `memory.delete()` with no args | pass `run_id=` / `category=` / `older_than=` |
|
|
64
|
+
| Schema version mismatch on upgrade | driver schema bumped | `agentforge db backup` → `agentforge db migrate` → `agentforge db restore` |
|
|
65
|
+
|
|
66
|
+
## Related
|
|
67
|
+
|
|
68
|
+
- Runbook 14 — Deploy your agent (DSN secret management)
|
|
69
|
+
- Runbook 15 — Upgrade your agent (schema migrations)
|
|
70
|
+
- Feature spec: `docs/features/feat-005-persistence-and-memory.md`
|
|
71
|
+
|
|
72
|
+
<!-- agentforge:end-managed -->
|
|
73
|
+
|
|
74
|
+
<!-- agentforge:custom -->
|
|
75
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# 09 — Add MCP servers
|
|
2
|
+
|
|
3
|
+
> **Goal:** consume Anthropic's Model Context Protocol (MCP) tool servers
|
|
4
|
+
> as if they were native tools, or expose your agent's tools as
|
|
5
|
+
> an MCP server.
|
|
6
|
+
> **Time:** ~10 minutes.
|
|
7
|
+
> **Prereqs:** runbook 02.
|
|
8
|
+
|
|
9
|
+
## TL;DR
|
|
10
|
+
|
|
11
|
+
```yaml
|
|
12
|
+
# agentforge.yaml
|
|
13
|
+
modules:
|
|
14
|
+
protocols:
|
|
15
|
+
- name: mcp
|
|
16
|
+
config:
|
|
17
|
+
servers:
|
|
18
|
+
- command: ["uv", "run", "filesystem-mcp"]
|
|
19
|
+
cwd: ./mcp-servers
|
|
20
|
+
expose_local_tools: true # turn this agent into an MCP server too
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
agentforge add module mcp
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Step by step
|
|
28
|
+
|
|
29
|
+
1. **Install the MCP module.** `agentforge add module mcp` —
|
|
30
|
+
adds `agentforge-mcp` to dependencies and registers the
|
|
31
|
+
protocol under `modules.protocols`.
|
|
32
|
+
2. **Declare upstream servers.** `servers:` is a list of command
|
|
33
|
+
specifications. Each spawns on agent start; the framework
|
|
34
|
+
handles handshake and tool discovery.
|
|
35
|
+
3. **Restart the agent.** Discovered MCP tools appear in the
|
|
36
|
+
agent's tool list automatically; the LLM sees their schemas
|
|
37
|
+
alongside framework-native tools.
|
|
38
|
+
4. **(Optional) Expose your tools.** `expose_local_tools: true`
|
|
39
|
+
makes this agent's tools available as an MCP server, so other
|
|
40
|
+
agents (or Claude Desktop) can call into it.
|
|
41
|
+
5. **Verify** with `agentforge list tools` — MCP tools have an
|
|
42
|
+
`mcp:` prefix in the resolver listing.
|
|
43
|
+
|
|
44
|
+
## Variations
|
|
45
|
+
|
|
46
|
+
- **Per-server allowlist** — `servers[].tools: ["read_file",
|
|
47
|
+
"list_directory"]` restricts what gets exposed from each
|
|
48
|
+
server. Use this for least-privilege.
|
|
49
|
+
- **Auth headers** — `servers[].auth.bearer: "${MCP_TOKEN}"` for
|
|
50
|
+
hosted MCP servers behind auth.
|
|
51
|
+
- **Capability negotiation** — `expose_local_tools.exclude:
|
|
52
|
+
[...]` strips internal tools from the exposed MCP surface.
|
|
53
|
+
|
|
54
|
+
## Troubleshooting
|
|
55
|
+
|
|
56
|
+
| Symptom | Cause | Fix |
|
|
57
|
+
|---|---|---|
|
|
58
|
+
| MCP server fails to spawn | wrong command path | check that the command shown by `agentforge config show --resolved` matches what your shell resolves |
|
|
59
|
+
| Tools not visible to LLM | discovery race | bump `servers[].start_timeout` (default 5s) |
|
|
60
|
+
| Tool calls hang | MCP server blocking on stdio | check the server's logs; MCP requires line-delimited JSON |
|
|
61
|
+
| `permission denied` from exposed server | client called a tool that is not in the allowlist | add the tool to `expose_local_tools.tools` or remove it from `expose_local_tools.exclude` |
|
|
62
|
+
|
|
63
|
+
## Related
|
|
64
|
+
|
|
65
|
+
- Runbook 02 — Add a (native) tool
|
|
66
|
+
- Runbook 11 — Add safety guardrails (MCP tools go through the
|
|
67
|
+
same gates)
|
|
68
|
+
- Feature spec: `docs/features/feat-013-mcp-integration.md`
|
|
69
|
+
|
|
70
|
+
> **Note:** MCP integration is feat-013. If this framework
|
|
71
|
+
> version pre-dates the module shipping, install
|
|
72
|
+
> `agentforge-mcp` manually from the framework repo's
|
|
73
|
+
> `packages/`.
|
|
74
|
+
|
|
75
|
+
<!-- agentforge:end-managed -->
|
|
76
|
+
|
|
77
|
+
<!-- agentforge:custom -->
|
|
78
|
+
<!-- agentforge:end-custom -->
|