agentforge-py 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentforge/__init__.py +114 -0
- agentforge/_testing/__init__.py +19 -0
- agentforge/_testing/fake_llm.py +126 -0
- agentforge/_testing/fake_tool.py +122 -0
- agentforge/_tools/__init__.py +14 -0
- agentforge/_tools/calculator.py +102 -0
- agentforge/_tools/decorator.py +300 -0
- agentforge/_tools/file_read.py +112 -0
- agentforge/_tools/shell.py +134 -0
- agentforge/_tools/web_search.py +207 -0
- agentforge/agent.py +817 -0
- agentforge/auth.py +42 -0
- agentforge/cli/__init__.py +18 -0
- agentforge/cli/_build.py +323 -0
- agentforge/cli/_scaffold_state.py +250 -0
- agentforge/cli/_shared_scaffold.py +174 -0
- agentforge/cli/config_cmd.py +174 -0
- agentforge/cli/db_cmd.py +262 -0
- agentforge/cli/debug_cmd.py +168 -0
- agentforge/cli/docs_cmd.py +217 -0
- agentforge/cli/eval_cmd.py +181 -0
- agentforge/cli/health_cmd.py +139 -0
- agentforge/cli/list_modules.py +85 -0
- agentforge/cli/main.py +81 -0
- agentforge/cli/manifest_apply.py +368 -0
- agentforge/cli/module_cmd.py +247 -0
- agentforge/cli/new_cmd.py +171 -0
- agentforge/cli/run_cmd.py +234 -0
- agentforge/cli/upgrade_cmd.py +230 -0
- agentforge/config/__init__.py +45 -0
- agentforge/eval/__init__.py +18 -0
- agentforge/eval/consistency.py +107 -0
- agentforge/eval/coverage.py +100 -0
- agentforge/eval/format_compliance.py +107 -0
- agentforge/eval/regression.py +143 -0
- agentforge/findings.py +166 -0
- agentforge/guardrails/__init__.py +32 -0
- agentforge/guardrails/allowlist.py +49 -0
- agentforge/guardrails/capability_check.py +58 -0
- agentforge/guardrails/engine.py +289 -0
- agentforge/guardrails/pii_redact_basic.py +61 -0
- agentforge/guardrails/prompt_injection_basic.py +90 -0
- agentforge/memory/__init__.py +16 -0
- agentforge/memory/in_memory.py +130 -0
- agentforge/memory/in_memory_graph.py +262 -0
- agentforge/memory/in_memory_vector.py +167 -0
- agentforge/pipeline/__init__.py +26 -0
- agentforge/pipeline/engine.py +189 -0
- agentforge/pipeline/errors.py +19 -0
- agentforge/pipeline/tool.py +93 -0
- agentforge/py.typed +0 -0
- agentforge/recording.py +189 -0
- agentforge/renderers/__init__.py +28 -0
- agentforge/renderers/_defaults.py +32 -0
- agentforge/renderers/markdown.py +44 -0
- agentforge/renderers/patch_applier.py +46 -0
- agentforge/renderers/registry.py +108 -0
- agentforge/renderers/scorecard.py +59 -0
- agentforge/renderers/span_table.py +71 -0
- agentforge/replay.py +260 -0
- agentforge/resolver_register.py +41 -0
- agentforge/retrieval.py +410 -0
- agentforge/runtime.py +63 -0
- agentforge/strategies/__init__.py +27 -0
- agentforge/strategies/_base.py +280 -0
- agentforge/strategies/_plan.py +93 -0
- agentforge/strategies/multi_agent.py +541 -0
- agentforge/strategies/plan_execute.py +506 -0
- agentforge/strategies/react.py +237 -0
- agentforge/strategies/tot.py +472 -0
- agentforge/templates/_shared/.cursorrules +12 -0
- agentforge/templates/_shared/.github/copilot-instructions.md +13 -0
- agentforge/templates/_shared/.gitkeep +0 -0
- agentforge/templates/_shared/AGENTS.md.tmpl +123 -0
- agentforge/templates/_shared/CLAUDE.md +13 -0
- agentforge/templates/_shared/docs/runbooks/01-set-up-new-agent.md.tmpl +67 -0
- agentforge/templates/_shared/docs/runbooks/02-add-a-tool.md +67 -0
- agentforge/templates/_shared/docs/runbooks/03-add-a-pipeline-task.md +69 -0
- agentforge/templates/_shared/docs/runbooks/04-pick-reasoning-strategy.md +67 -0
- agentforge/templates/_shared/docs/runbooks/05-write-prompts.md +75 -0
- agentforge/templates/_shared/docs/runbooks/06-test-your-agent.md +75 -0
- agentforge/templates/_shared/docs/runbooks/07-debug-a-run.md +70 -0
- agentforge/templates/_shared/docs/runbooks/08-add-memory.md +75 -0
- agentforge/templates/_shared/docs/runbooks/09-add-mcp.md +78 -0
- agentforge/templates/_shared/docs/runbooks/10-add-evaluators.md +76 -0
- agentforge/templates/_shared/docs/runbooks/11-add-safety-guardrails.md +83 -0
- agentforge/templates/_shared/docs/runbooks/12-add-observability.md +77 -0
- agentforge/templates/_shared/docs/runbooks/13-configure-multi-provider.md +91 -0
- agentforge/templates/_shared/docs/runbooks/14-deploy-your-agent.md +70 -0
- agentforge/templates/_shared/docs/runbooks/15-upgrade-your-agent.md +67 -0
- agentforge/templates/_shared/docs/runbooks/16-configuration-reference.md +81 -0
- agentforge/templates/_shared/docs/runbooks/17-add-reranker.md +78 -0
- agentforge/templates/_shared/docs/runbooks/18-add-hybrid-search.md +78 -0
- agentforge/templates/_shared/docs/runbooks/19-add-graphrag.md +83 -0
- agentforge/templates/_shared/docs/runbooks/20-apply-schema-migrations.md +92 -0
- agentforge/templates/_shared/docs/runbooks/21-use-streaming-guardrails.md +82 -0
- agentforge/templates/_shared/docs/runbooks/README.md.tmpl +68 -0
- agentforge/templates/code-reviewer/.env.example +8 -0
- agentforge/templates/code-reviewer/.gitignore +7 -0
- agentforge/templates/code-reviewer/README.md +12 -0
- agentforge/templates/code-reviewer/agentforge.yaml +23 -0
- agentforge/templates/code-reviewer/copier.yml +34 -0
- agentforge/templates/code-reviewer/pyproject.toml +18 -0
- agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
- agentforge/templates/docs-qa/.env.example +8 -0
- agentforge/templates/docs-qa/.gitignore +7 -0
- agentforge/templates/docs-qa/README.md +14 -0
- agentforge/templates/docs-qa/agentforge.yaml +19 -0
- agentforge/templates/docs-qa/copier.yml +31 -0
- agentforge/templates/docs-qa/pyproject.toml +18 -0
- agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
- agentforge/templates/minimal/.env.example +11 -0
- agentforge/templates/minimal/.gitignore +10 -0
- agentforge/templates/minimal/README.md +28 -0
- agentforge/templates/minimal/agentforge.yaml +10 -0
- agentforge/templates/minimal/copier.yml +52 -0
- agentforge/templates/minimal/pyproject.toml +18 -0
- agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/main.py +34 -0
- agentforge/templates/patch-bot/.env.example +8 -0
- agentforge/templates/patch-bot/.gitignore +7 -0
- agentforge/templates/patch-bot/README.md +13 -0
- agentforge/templates/patch-bot/agentforge.yaml +15 -0
- agentforge/templates/patch-bot/copier.yml +31 -0
- agentforge/templates/patch-bot/pyproject.toml +18 -0
- agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
- agentforge/templates/research/.env.example +8 -0
- agentforge/templates/research/.gitignore +7 -0
- agentforge/templates/research/README.md +14 -0
- agentforge/templates/research/agentforge.yaml +17 -0
- agentforge/templates/research/copier.yml +31 -0
- agentforge/templates/research/pyproject.toml +18 -0
- agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/main.py +31 -0
- agentforge/templates/triage/.env.example +8 -0
- agentforge/templates/triage/.gitignore +7 -0
- agentforge/templates/triage/README.md +14 -0
- agentforge/templates/triage/agentforge.yaml +25 -0
- agentforge/templates/triage/copier.yml +31 -0
- agentforge/templates/triage/pyproject.toml +18 -0
- agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/main.py +30 -0
- agentforge/testing/__init__.py +69 -0
- agentforge/testing/conformance.py +40 -0
- agentforge/testing/factory.py +89 -0
- agentforge/testing/fixtures.py +42 -0
- agentforge/testing/llm.py +235 -0
- agentforge/testing/recording.py +177 -0
- agentforge/tools/__init__.py +41 -0
- agentforge_py-0.2.1.dist-info/METADATA +158 -0
- agentforge_py-0.2.1.dist-info/RECORD +157 -0
- agentforge_py-0.2.1.dist-info/WHEEL +4 -0
- agentforge_py-0.2.1.dist-info/entry_points.txt +2 -0
- agentforge_py-0.2.1.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# 10 — Add evaluators
|
|
2
|
+
|
|
3
|
+
> **Goal:** score each agent run on quality so regressions are
|
|
4
|
+
> caught before they ship.
|
|
5
|
+
> **Time:** ~20 minutes.
|
|
6
|
+
> **Prereqs:** runbook 06.
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```yaml
|
|
11
|
+
# agentforge.yaml
|
|
12
|
+
modules:
|
|
13
|
+
evaluators:
|
|
14
|
+
- name: faithfulness # LLM-judge
|
|
15
|
+
- name: coverage # deterministic
|
|
16
|
+
config:
|
|
17
|
+
required_facts: ["population", "year"]
|
|
18
|
+
- name: regression-vs-baseline
|
|
19
|
+
config:
|
|
20
|
+
baseline_path: ./tests/baselines/answers.jsonl
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
agentforge eval --fixtures ./tests/golden.jsonl --threshold 0.8
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Step by step
|
|
28
|
+
|
|
29
|
+
1. **Mix deterministic + LLM-judge.** Deterministic graders
|
|
30
|
+
(coverage, format-compliance, regression-vs-baseline,
|
|
31
|
+
consistency) are cheap; ship them everywhere. Use LLM-judge
|
|
32
|
+
graders (faithfulness, groundedness, hallucination,
|
|
33
|
+
relevance, helpfulness, correctness) when no rule captures
|
|
34
|
+
the property — they cost LLM calls per evaluation.
|
|
35
|
+
2. **Declare under `modules.evaluators`.** Each entry has a
|
|
36
|
+
`name` (resolver key) and optional `config`. The framework
|
|
37
|
+
instantiates and runs them post-run, attaching scores to
|
|
38
|
+
`RunResult.eval_scores`.
|
|
39
|
+
3. **Wire into CI.** `agentforge eval --fixtures golden.jsonl
|
|
40
|
+
--threshold 0.8 --output-format junit > eval.xml` exits 5
|
|
41
|
+
when the mean score is below the threshold.
|
|
42
|
+
4. **Threshold per evaluator** (when one matters more than the
|
|
43
|
+
others) goes in the evaluator's own `config` block.
|
|
44
|
+
5. **Custom evaluators** subclass `Evaluator` and register with
|
|
45
|
+
`@register("evaluators", "my-name")`. Run
|
|
46
|
+
`run_evaluator_conformance(my_eval)` to verify the contract.
|
|
47
|
+
|
|
48
|
+
## Variations
|
|
49
|
+
|
|
50
|
+
- **Cost gating** — each LLM-judge declares
|
|
51
|
+
`cost_estimate_usd`. `BudgetPolicy` skips them when the run's
|
|
52
|
+
remaining budget would be exceeded.
|
|
53
|
+
- **GEval rubrics** — `agentforge-eval-geval` lets you define
|
|
54
|
+
arbitrary judge rubrics in YAML.
|
|
55
|
+
- **Snapshot diff** — for outputs that should stay byte-stable,
|
|
56
|
+
pair an evaluator with `agentforge_testing.assert_snapshot`.
|
|
57
|
+
|
|
58
|
+
## Troubleshooting
|
|
59
|
+
|
|
60
|
+
| Symptom | Cause | Fix |
|
|
61
|
+
|---|---|---|
|
|
62
|
+
| `No module registered for evaluators:faithfulness` | LLM-judge pkg missing | `agentforge add module eval-geval` |
|
|
63
|
+
| Evaluators didn't run | budget exhausted before eval pass | bump `agent.budget.usd` or drop expensive judges |
|
|
64
|
+
| Threshold passes but quality regressed | mean masked outliers | switch CI to per-fixture threshold or run with `--threshold-per-evaluator` |
|
|
65
|
+
| Judge gives same score every time | judge prompt too vague | tighten the rubric; add 2-3 worked examples |
|
|
66
|
+
|
|
67
|
+
## Related
|
|
68
|
+
|
|
69
|
+
- Runbook 06 — Test your agent
|
|
70
|
+
- Runbook 12 — Add observability (eval scores feed dashboards)
|
|
71
|
+
- Feature spec: `docs/features/feat-006-evaluators-and-benchmarks.md`
|
|
72
|
+
|
|
73
|
+
<!-- agentforge:end-managed -->
|
|
74
|
+
|
|
75
|
+
<!-- agentforge:custom -->
|
|
76
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# 11 — Add safety guardrails
|
|
2
|
+
|
|
3
|
+
> **Goal:** layer input validation, output redaction, and
|
|
4
|
+
> tool-call gating onto your agent.
|
|
5
|
+
> **Time:** ~15 minutes.
|
|
6
|
+
> **Prereqs:** runbook 02.
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```yaml
|
|
11
|
+
# agentforge.yaml
|
|
12
|
+
modules:
|
|
13
|
+
guardrails:
|
|
14
|
+
defaults: true # framework basics auto-installed
|
|
15
|
+
input:
|
|
16
|
+
- prompt_injection_basic
|
|
17
|
+
output:
|
|
18
|
+
- pii_redact_basic
|
|
19
|
+
tool_gates:
|
|
20
|
+
- capability_check
|
|
21
|
+
- allowlist:
|
|
22
|
+
allowed: ["web_search", "calculator"]
|
|
23
|
+
guardrail_policy:
|
|
24
|
+
on_input_violation: block
|
|
25
|
+
on_output_violation: redact
|
|
26
|
+
on_tool_violation: block
|
|
27
|
+
fail_open: false
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Step by step
|
|
31
|
+
|
|
32
|
+
1. **Start with the basics.** `prompt_injection_basic` +
|
|
33
|
+
`pii_redact_basic` + `capability_check` cover the obvious
|
|
34
|
+
cases out of the box; they ship with the framework.
|
|
35
|
+
2. **Add an allowlist** if your tools include anything
|
|
36
|
+
`destructive`. `capability_check` already denies destructive
|
|
37
|
+
tools by default; `allowlist` is a tighter second layer.
|
|
38
|
+
3. **Pick a policy.** `block` is the safe default for input and
|
|
39
|
+
tool violations; `redact` for outputs lets the run complete
|
|
40
|
+
with PII stripped. `fail_open: false` (the default) treats
|
|
41
|
+
validator exceptions as failures.
|
|
42
|
+
4. **Add vendor modules** when basics aren't enough. `presidio`
|
|
43
|
+
for richer PII, `llmguard` for richer prompt-injection,
|
|
44
|
+
`nemo` for programmable Colang rails, `llamaguard` for the
|
|
45
|
+
Llama Guard 3 classifier. Each is a separate pip install.
|
|
46
|
+
5. **Audit decisions.** Every validator call emits an
|
|
47
|
+
`agentforge.audit` log record and appends to
|
|
48
|
+
`RunResult.guardrail_events`. Configure your log pipeline to
|
|
49
|
+
stream the audit logger to a security store.
|
|
50
|
+
|
|
51
|
+
## Variations
|
|
52
|
+
|
|
53
|
+
- **Custom validator.** Subclass `InputValidator` /
|
|
54
|
+
`OutputValidator` / `ToolCallGate` from
|
|
55
|
+
`agentforge_core.contracts.guardrails`, register with
|
|
56
|
+
`@register("guardrails.input", "my-name")`.
|
|
57
|
+
- **Score-only mode** — Presidio + LLM Guard support a
|
|
58
|
+
`score-only` action that reports without modifying content.
|
|
59
|
+
Useful for triage dashboards.
|
|
60
|
+
- **Conformance test custom validators** with
|
|
61
|
+
`run_input_validator_conformance` /
|
|
62
|
+
`run_output_validator_conformance` / `run_tool_gate_conformance` from
|
|
63
|
+
`agentforge.testing`.
|
|
64
|
+
|
|
65
|
+
## Troubleshooting
|
|
66
|
+
|
|
67
|
+
| Symptom | Cause | Fix |
|
|
68
|
+
|---|---|---|
|
|
69
|
+
| `GuardrailViolation` at startup | input flagged | inspect `RunResult.guardrail_events`; relax to `warn` if false-positive |
|
|
70
|
+
| PII still in output | `pii_redact_basic` regexes don't catch your case | install `agentforge-guard-presidio` for richer detection |
|
|
71
|
+
| Destructive tool still ran | `capability_check` was disabled in config | re-enable; ensure `Tool.capabilities` includes `"destructive"` |
|
|
72
|
+
| Tests fail with `GuardrailViolation` | tests use prompts that look like injection | mock the validator in tests, or rephrase the test prompt |
|
|
73
|
+
|
|
74
|
+
## Related
|
|
75
|
+
|
|
76
|
+
- Runbook 12 — Add observability (audit stream)
|
|
77
|
+
- Runbook 14 — Deploy your agent (policy hardening)
|
|
78
|
+
- Feature spec: `docs/features/feat-018-safety-and-security-guardrails.md`
|
|
79
|
+
|
|
80
|
+
<!-- agentforge:end-managed -->
|
|
81
|
+
|
|
82
|
+
<!-- agentforge:custom -->
|
|
83
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# 12 — Add observability
|
|
2
|
+
|
|
3
|
+
> **Goal:** stream structured logs + distributed traces from
|
|
4
|
+
> every agent run to your APM stack.
|
|
5
|
+
> **Time:** ~15 minutes.
|
|
6
|
+
> **Prereqs:** runbook 01.
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```yaml
|
|
11
|
+
# agentforge.yaml
|
|
12
|
+
logging:
|
|
13
|
+
format: json
|
|
14
|
+
run_id_filter: true
|
|
15
|
+
modules:
|
|
16
|
+
observability:
|
|
17
|
+
- name: otel
|
|
18
|
+
config:
|
|
19
|
+
endpoint: "${OTEL_EXPORTER_OTLP_ENDPOINT}"
|
|
20
|
+
service_name: "{{ project_slug }}"
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
agentforge add module otel
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Step by step
|
|
28
|
+
|
|
29
|
+
1. **Turn on JSON logging.** `logging.format: json` swaps the
|
|
30
|
+
default text formatter for `JSONFormatter`; every log line
|
|
31
|
+
becomes one JSON object suitable for piping into a log
|
|
32
|
+
aggregator.
|
|
33
|
+
2. **Enable run_id propagation.** `run_id_filter: true`
|
|
34
|
+
installs a logging filter that attaches the active run's
|
|
35
|
+
`run_id` to every record under that run's context. Use it to
|
|
35
|
+
cross-reference runs across components.
|
|
37
|
+
3. **Install OTel.** `agentforge add module otel` adds
|
|
38
|
+
`agentforge-otel`; the framework's root span (`agent.run`)
|
|
39
|
+
then becomes the parent of every strategy / LLM / tool span.
|
|
40
|
+
4. **Point at your collector.** OTLP/gRPC by default; set
|
|
41
|
+
`OTEL_EXPORTER_OTLP_ENDPOINT` (or hard-code in the YAML).
|
|
42
|
+
Service name = project slug by default.
|
|
43
|
+
5. **Custom hooks.** Implement `on_step(step)` /
|
|
44
|
+
`on_finish(result)` callables and pass them to `Agent(on_step=...,
|
|
45
|
+
on_finish=...)` for bespoke metrics; multiple hooks fan out
|
|
46
|
+
in parallel.
|
|
47
|
+
|
|
48
|
+
## Variations
|
|
49
|
+
|
|
50
|
+
- **Custom log channels.** Audit decisions go to
|
|
51
|
+
`agentforge.audit`; route them to a security store separately
|
|
52
|
+
from app logs.
|
|
53
|
+
- **Vendor backends** — Langfuse / Phoenix / Evidently / StatsD
|
|
54
|
+
modules each wrap their own SDK behind the same hook
|
|
55
|
+
contract. Add via `agentforge add module <name>`.
|
|
56
|
+
- **Cost dashboards.** `RunResult.cost_usd` + `eval_scores` are
|
|
57
|
+
cheap series for daily cost-vs-quality charts.
|
|
58
|
+
|
|
59
|
+
## Troubleshooting
|
|
60
|
+
|
|
61
|
+
| Symptom | Cause | Fix |
|
|
62
|
+
|---|---|---|
|
|
63
|
+
| No spans in OTel UI | exporter endpoint wrong | check `agentforge config show --resolved` then curl the OTLP endpoint |
|
|
64
|
+
| Run id missing from logs | run_id_filter disabled | re-enable in YAML; restart the process |
|
|
65
|
+
| Hook breaks the run | exceptions in hooks default to log-and-continue | check the hook's error log; framework isolates failures |
|
|
66
|
+
| Spans missing inside strategies | older `agentforge-otel`; iteration spans land in 0.2+ | upgrade the module |
|
|
67
|
+
|
|
68
|
+
## Related
|
|
69
|
+
|
|
70
|
+
- Runbook 11 — Add safety guardrails (audit stream)
|
|
71
|
+
- Runbook 14 — Deploy your agent
|
|
72
|
+
- Feature spec: `docs/features/feat-009-observability.md`
|
|
73
|
+
|
|
74
|
+
<!-- agentforge:end-managed -->
|
|
75
|
+
|
|
76
|
+
<!-- agentforge:custom -->
|
|
77
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# 13 — Configure multi-provider
|
|
2
|
+
|
|
3
|
+
> **Goal:** run different model classes for reasoning, judging,
|
|
4
|
+
> and embedding without rewriting your agent.
|
|
5
|
+
> **Time:** ~10 minutes.
|
|
6
|
+
> **Prereqs:** runbook 01.
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```yaml
|
|
11
|
+
# agentforge.yaml
|
|
12
|
+
providers:
|
|
13
|
+
default:
|
|
14
|
+
type: anthropic # native Anthropic API
|
|
15
|
+
model: claude-sonnet-4-7
|
|
16
|
+
judge:
|
|
17
|
+
type: anthropic
|
|
18
|
+
model: claude-haiku-4-5 # cheaper judge
|
|
19
|
+
embed:
|
|
20
|
+
type: voyage
|
|
21
|
+
model: voyage-3-large
|
|
22
|
+
agent:
|
|
23
|
+
model: anthropic:claude-sonnet-4-7
|
|
24
|
+
modules:
|
|
25
|
+
evaluators:
|
|
26
|
+
- name: faithfulness
|
|
27
|
+
config:
|
|
28
|
+
judge_provider: judge
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
**Available provider drivers (v0.2):**
|
|
32
|
+
|
|
33
|
+
| `type:` | Package | Capabilities |
|
|
34
|
+
|---|---|---|
|
|
35
|
+
| `bedrock` | `agentforge-bedrock` | tools, json_mode, caching, thinking, streaming |
|
|
36
|
+
| `anthropic` | `agentforge-anthropic` | tools, json_mode, caching, thinking, streaming |
|
|
37
|
+
| `openai` | `agentforge-openai` | tools, json_mode, streaming, vision (gpt-4o*) |
|
|
38
|
+
| `ollama` | `agentforge-ollama` | tools, streaming (local; zero cost) |
|
|
39
|
+
| `litellm` | `agentforge-litellm` | tools (router → 100+ backends) |
|
|
40
|
+
| `voyage` | `agentforge-voyage` | embedding-only; matryoshka |
|
|
41
|
+
|
|
42
|
+
## Step by step
|
|
43
|
+
|
|
44
|
+
1. **Name your providers** under the top-level `providers:` map.
|
|
45
|
+
`default` is the one `agent.model` falls back to; named
|
|
46
|
+
entries (`judge`, `embed`, `summariser`) can be addressed by
|
|
47
|
+
downstream modules.
|
|
48
|
+
2. **Pick the reasoning model.** `agent.model` is the agent's
|
|
49
|
+
primary LLM. Use the strongest model you can afford.
|
|
50
|
+
3. **Use a cheaper judge** for LLM-judge evaluators. Per
|
|
51
|
+
feat-006, judge graders take a `judge_provider` config that
|
|
52
|
+
resolves the named provider. Cheap haiku-class models bring
|
|
53
|
+
judge cost down 10x with marginal quality loss for boolean
|
|
54
|
+
evaluations.
|
|
55
|
+
4. **Separate embedding from reasoning.** Vector indexing
|
|
56
|
+
typically benefits from a dedicated embedder
|
|
57
|
+
(`voyage-3`, `text-embedding-3-large`). Wire it into
|
|
58
|
+
`modules.retriever.embedding_provider`.
|
|
59
|
+
5. **Per-module overrides.** Any module that takes an LLM
|
|
60
|
+
(guardrails / evaluators / etc.) can name a provider.
|
|
61
|
+
|
|
62
|
+
## Variations
|
|
63
|
+
|
|
64
|
+
- **Fallback chain.** Use `agentforge_core.production.FallbackChain`
|
|
65
|
+
to wrap two providers; primary first, secondary on
|
|
66
|
+
`RateLimitError` / `ServiceError`.
|
|
67
|
+
- **Different providers per environment.** `agentforge.dev.yaml`
|
|
68
|
+
overlay points at a cheap dev model; `agentforge.prod.yaml`
|
|
69
|
+
swaps to the production tier. `AGENTFORGE_ENV=prod` selects.
|
|
70
|
+
- **Mock provider for tests.** Register `MockLLMClient` as a
|
|
71
|
+
named provider so config-driven tests reuse it.
|
|
72
|
+
|
|
73
|
+
## Troubleshooting
|
|
74
|
+
|
|
75
|
+
| Symptom | Cause | Fix |
|
|
76
|
+
|---|---|---|
|
|
77
|
+
| `No LLM provider registered for X` | provider package not installed | `agentforge add module <X>` |
|
|
78
|
+
| Judge cost > reasoning cost | judge running on the same big model | name a cheaper judge provider |
|
|
79
|
+
| Embedder shape mismatch | mixed-dimension stores | pin embedding model + dimension in the vector store config |
|
|
80
|
+
| Runs intermittently fail with 5xx | provider outage | wrap with `FallbackChain` |
|
|
81
|
+
|
|
82
|
+
## Related
|
|
83
|
+
|
|
84
|
+
- Runbook 10 — Add evaluators (judge_provider)
|
|
85
|
+
- Runbook 14 — Deploy your agent (environment overlays)
|
|
86
|
+
- Feature spec: `docs/features/feat-003-llm-provider-abstraction.md`
|
|
87
|
+
|
|
88
|
+
<!-- agentforge:end-managed -->
|
|
89
|
+
|
|
90
|
+
<!-- agentforge:custom -->
|
|
91
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# 14 — Deploy your agent
|
|
2
|
+
|
|
3
|
+
> **Goal:** get the agent running somewhere durable (container,
|
|
4
|
+
> serverless, batch job) with proper secrets and observability.
|
|
5
|
+
> **Time:** ~30 minutes.
|
|
6
|
+
> **Prereqs:** runbooks 01, 08, 12.
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```dockerfile
|
|
11
|
+
FROM python:3.13-slim
|
|
12
|
+
WORKDIR /app
|
|
13
|
+
RUN pip install --no-cache-dir uv
|
|
14
|
+
COPY pyproject.toml uv.lock ./
|
|
15
|
+
COPY src/ ./src/
|
|
16
|
+
COPY agentforge.yaml ./
|
|
17
|
+
RUN uv sync --frozen
|
|
18
|
+
ENV AGENTFORGE_ENV=prod
|
|
19
|
+
CMD ["uv", "run", "agentforge", "run", "--task-file", "/in/task.txt", "--output-format", "json"]
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Step by step
|
|
23
|
+
|
|
24
|
+
1. **Pin every dependency.** `uv.lock` must ship with the
|
|
25
|
+
image. `uv sync --frozen` enforces that.
|
|
26
|
+
2. **Use environment overlays.** Ship `agentforge.yaml` +
|
|
27
|
+
`agentforge.prod.yaml`; set `AGENTFORGE_ENV=prod` in the
|
|
28
|
+
container. The framework merges the overlay automatically.
|
|
29
|
+
3. **Mount secrets via env.** `${AWS_ACCESS_KEY_ID}` etc. in
|
|
30
|
+
the YAML resolve from the container's env. Never bake
|
|
31
|
+
secrets into the image.
|
|
32
|
+
4. **Provision the memory store.** If using Postgres, run
|
|
33
|
+
`agentforge db migrate` as a pre-deploy step (helm hook,
|
|
34
|
+
k8s Job, deployment script).
|
|
35
|
+
5. **Configure observability** — export
|
|
36
|
+
`OTEL_EXPORTER_OTLP_ENDPOINT`, `OTEL_RESOURCE_ATTRIBUTES=service.name=...`.
|
|
37
|
+
6. **Health probe.** `agentforge health --output-format json`
|
|
38
|
+
exits 0 when config + modules + backends are all OK; perfect
|
|
39
|
+
for k8s readiness probes.
|
|
40
|
+
|
|
41
|
+
## Variations
|
|
42
|
+
|
|
43
|
+
- **Serverless.** Same image, different entrypoint. Lambda /
|
|
44
|
+
Cloud Run trigger calls `agentforge run` with the task from
|
|
45
|
+
the event.
|
|
46
|
+
- **Batch worker.** Loop over a queue; reuse the Agent across
|
|
47
|
+
tasks. `Agent` is thread-safe; each `run` creates fresh
|
|
48
|
+
per-run state.
|
|
49
|
+
- **Multi-tenant.** One Agent per tenant; route requests by
|
|
50
|
+
`project` / `agent` claim namespace.
|
|
51
|
+
|
|
52
|
+
## Troubleshooting
|
|
53
|
+
|
|
54
|
+
| Symptom | Cause | Fix |
|
|
55
|
+
|---|---|---|
|
|
56
|
+
| Container exits 2 on start | config invalid in the prod overlay | check `agentforge config validate --env prod` locally |
|
|
57
|
+
| `connection refused` on DB | network policy blocking | mount the secret AND open egress |
|
|
58
|
+
| OTel spans not appearing | service.name not set | export `OTEL_RESOURCE_ATTRIBUTES=service.name=<your-agent>` |
|
|
59
|
+
| Probe fails intermittently | cold-start LLM auth | bump probe initial delay; cache provider client across requests |
|
|
60
|
+
|
|
61
|
+
## Related
|
|
62
|
+
|
|
63
|
+
- Runbook 08 — Add memory (DSN secrets, migration)
|
|
64
|
+
- Runbook 12 — Add observability
|
|
65
|
+
- Runbook 15 — Upgrade your agent (release process)
|
|
66
|
+
|
|
67
|
+
<!-- agentforge:end-managed -->
|
|
68
|
+
|
|
69
|
+
<!-- agentforge:custom -->
|
|
70
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# 15 — Upgrade your agent
|
|
2
|
+
|
|
3
|
+
> **Goal:** pull the latest framework changes into this project
|
|
4
|
+
> without losing your customisations.
|
|
5
|
+
> **Time:** ~15 minutes.
|
|
6
|
+
> **Prereqs:** runbook 01.
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
agentforge upgrade --dry-run # preview
|
|
12
|
+
agentforge upgrade # apply
|
|
13
|
+
agentforge status # any drift?
|
|
14
|
+
pytest -q
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Step by step
|
|
18
|
+
|
|
19
|
+
1. **Read the framework's CHANGELOG.** Open
|
|
20
|
+
`docs/features/README.md` from the framework repo (or the
|
|
21
|
+
release notes) and skim what shipped between your version
|
|
22
|
+
and current.
|
|
23
|
+
2. **Stage clean.** Commit any uncommitted work first.
|
|
24
|
+
`agentforge upgrade` is a three-way merge — easier to
|
|
25
|
+
resolve from a clean tree.
|
|
26
|
+
3. **Dry-run.** `agentforge upgrade --dry-run` prints the diff
|
|
27
|
+
without writing. Use to scope the review.
|
|
28
|
+
4. **Apply.** `agentforge upgrade` runs Copier's `run_update`,
|
|
29
|
+
merging managed files against the recorded template
|
|
30
|
+
version. Custom sections of three-section docs are
|
|
31
|
+
preserved automatically; non-managed code is left alone.
|
|
32
|
+
5. **Resolve conflicts.** Copier surfaces conflicts in `.rej`
|
|
33
|
+
files. Edit by hand or `agentforge fork <path>` to claim
|
|
34
|
+
the file outright (future upgrades skip it).
|
|
35
|
+
6. **Verify.** `agentforge status` should show no `DRIFTED`
|
|
36
|
+
files; `pytest -q` should pass.
|
|
37
|
+
|
|
38
|
+
## Variations
|
|
39
|
+
|
|
40
|
+
- **Fork a file.** `agentforge fork src/myagent/agent_runtime.py`
|
|
41
|
+
strips the marker and flips the lock entry to `forked: true`.
|
|
42
|
+
Future upgrades skip it.
|
|
43
|
+
- **Unfork.** `agentforge unfork <path>` re-prepends the marker;
|
|
44
|
+
next upgrade re-pulls framework content (lossy).
|
|
45
|
+
- **Pin a target ref.** `agentforge upgrade --to <ref>` points
|
|
46
|
+
at a specific template ref instead of the latest. Useful for
|
|
47
|
+
staged rollouts.
|
|
48
|
+
|
|
49
|
+
## Troubleshooting
|
|
50
|
+
|
|
51
|
+
| Symptom | Cause | Fix |
|
|
52
|
+
|---|---|---|
|
|
53
|
+
| `No .agentforge-state/answers.yml` | this directory wasn't scaffolded by `agentforge new` | upgrade only works on scaffolded projects |
|
|
54
|
+
| `.rej` conflict file | three-way merge couldn't auto-resolve | edit by hand; the `.rej` carries the framework's preferred shape |
|
|
55
|
+
| Custom section in runbook overwritten | edit went above the `<!-- agentforge:end-managed -->` marker | move custom content below the marker, restore from git |
|
|
56
|
+
| DB schema out of date | driver bumped its schema | `agentforge db backup` → `agentforge db migrate` → `agentforge db restore` |
|
|
57
|
+
|
|
58
|
+
## Related
|
|
59
|
+
|
|
60
|
+
- Runbook 08 — Add memory (db migrate during upgrade)
|
|
61
|
+
- Runbook 14 — Deploy your agent (release process)
|
|
62
|
+
- Feature spec: `docs/features/feat-011-scaffolding-and-upgrade.md`
|
|
63
|
+
|
|
64
|
+
<!-- agentforge:end-managed -->
|
|
65
|
+
|
|
66
|
+
<!-- agentforge:custom -->
|
|
67
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# 16 — Configuration reference
|
|
2
|
+
|
|
3
|
+
> **Goal:** find the canonical shape of every `agentforge.yaml`
|
|
4
|
+
> field without re-reading source.
|
|
5
|
+
> **Time:** ~5 minutes (lookup).
|
|
6
|
+
> **Prereqs:** none.
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
agentforge config schema | less # print the full JSON schema
|
|
12
|
+
agentforge config show --resolved # see what your YAML actually parsed to
|
|
13
|
+
agentforge config validate # fast-fail on bad keys
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Step by step
|
|
17
|
+
|
|
18
|
+
1. **Schema is the truth.** `agentforge config schema` prints
|
|
19
|
+
the Pydantic-derived JSON schema for `AgentForgeConfig`. No
|
|
20
|
+
guessing.
|
|
21
|
+
2. **Resolved view.** `agentforge config show --resolved` prints
|
|
22
|
+
the parsed config with `${ENV_VAR}` interpolation expanded,
|
|
23
|
+
env overlay merged, and CLI overrides applied. This is the
|
|
24
|
+
source of truth for "what will the agent actually run with?"
|
|
25
|
+
3. **Validate** before commit. `agentforge config validate` is
|
|
26
|
+
the same parse the runtime does; exit code 2 means the YAML
|
|
27
|
+
has unknown keys, bad types, or invalid env references.
|
|
28
|
+
|
|
29
|
+
## Top-level sections
|
|
30
|
+
|
|
31
|
+
| Section | Purpose |
|
|
32
|
+
|---|---|
|
|
33
|
+
| `agent` | name, model, strategy, system prompt, tools, budget, max_iterations, llm_options |
|
|
34
|
+
| `modules` | memory / graph / retriever / evaluators / observability / tools / protocols / guardrails |
|
|
35
|
+
| `providers` | named LLM clients (default + judge + embed + custom) |
|
|
36
|
+
| `logging` | level, run_id_filter, format (text\|json) |
|
|
37
|
+
| `output` | finding variant defaults, renderer choice, thresholds |
|
|
38
|
+
| `guardrail_policy` | on_input / on_output / on_tool violation actions, audit_channel, fail_open |
|
|
39
|
+
|
|
40
|
+
## Environment + override order
|
|
41
|
+
|
|
42
|
+
CLI flags > `--override` flags > `agentforge.<env>.yaml` overlay >
|
|
43
|
+
`agentforge.yaml` > defaults.
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
agentforge run \
|
|
47
|
+
--env prod \
|
|
48
|
+
--override agent.budget.usd=20 \
|
|
49
|
+
--override providers.default.model=claude-haiku-4-5 \
|
|
50
|
+
"your task"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Variations
|
|
54
|
+
|
|
55
|
+
- **Schema export** — `agentforge config schema > schema.json`
|
|
56
|
+
feeds IDE YAML language servers (e.g. vscode-yaml) for autocomplete.
|
|
57
|
+
- **Per-module schemas** — installed modules contribute schemas
|
|
58
|
+
to `modules.<section>.config`. `agentforge config validate
|
|
59
|
+
--strict` enforces.
|
|
60
|
+
- **`AGENTFORGE_CONFIG`** + `AGENTFORGE_ENV` +
|
|
61
|
+
`AGENTFORGE_LOG_LEVEL` env vars are the three shortcuts that don't require
|
|
62
|
+
flags.
|
|
63
|
+
|
|
64
|
+
## Troubleshooting
|
|
65
|
+
|
|
66
|
+
| Symptom | Cause | Fix |
|
|
67
|
+
|---|---|---|
|
|
68
|
+
| `unknown field` on a key you expected to be valid | typo, or the key was renamed in a major release | check the schema; spec changes are listed in the CHANGELOG |
|
|
69
|
+
| `${VAR}` not resolving | env var unset | `agentforge config show --resolved` reports the missing one |
|
|
70
|
+
| Override not taking effect | wrong dotted path | overrides are dotted: `agent.budget.usd=10`, not `budget.usd` |
|
|
71
|
+
| `fail_open: true` slipped into prod | dev overlay leaked | rotate env-overlay names; ship only the prod overlay to prod |
|
|
72
|
+
|
|
73
|
+
## Related
|
|
74
|
+
|
|
75
|
+
- Every other runbook (they all link back here)
|
|
76
|
+
- Feature spec: `docs/features/feat-012-configuration-system.md`
|
|
77
|
+
|
|
78
|
+
<!-- agentforge:end-managed -->
|
|
79
|
+
|
|
80
|
+
<!-- agentforge:custom -->
|
|
81
|
+
<!-- agentforge:end-custom -->
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# 17 — Add a reranker
|
|
2
|
+
|
|
3
|
+
> **Goal:** improve retrieval precision by re-scoring the top-k
|
|
4
|
+
> candidates a vector store returned, then keeping the best.
|
|
5
|
+
> **Time:** ~10 minutes.
|
|
6
|
+
> **Prereqs:** runbook 08 (retrieval already wired).
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```yaml
|
|
11
|
+
# agentforge.yaml
|
|
12
|
+
retrieval:
|
|
13
|
+
embedder:
|
|
14
|
+
driver: voyage
|
|
15
|
+
config: {model: voyage-3-large}
|
|
16
|
+
vector_store:
|
|
17
|
+
driver: postgres
|
|
18
|
+
config: {dsn: $POSTGRES_DSN, table: docs}
|
|
19
|
+
reranker:
|
|
20
|
+
name: cohere # or: sentence_transformers / voyage / mixedbread
|
|
21
|
+
config:
|
|
22
|
+
api_key: $COHERE_API_KEY
|
|
23
|
+
model: rerank-english-v3.0
|
|
24
|
+
top_k: 4 # keep the top 4 after re-scoring
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Step by step
|
|
28
|
+
|
|
29
|
+
1. **Pick a reranker driver.** Built-in choices:
|
|
30
|
+
- `sentence_transformers` — local cross-encoder; no API key, slower.
|
|
31
|
+
- `cohere` — managed; fast; needs `COHERE_API_KEY`.
|
|
32
|
+
- `voyage` — managed; high quality; needs `VOYAGE_API_KEY`.
|
|
33
|
+
- `mixedbread` — managed; needs `MIXEDBREAD_API_KEY`.
|
|
34
|
+
2. **Install the matching package.**
|
|
35
|
+
`agentforge add module reranker-cohere` (or `-voyage`,
|
|
36
|
+
`-mixedbread`, `-sentence-transformers`).
|
|
37
|
+
3. **Drop the `reranker:` block** into `retrieval:`. The
|
|
38
|
+
`Retriever` looks up the driver via the `agentforge.rerankers`
|
|
39
|
+
entry-point category and slots it after the vector / hybrid
|
|
40
|
+
search stage.
|
|
41
|
+
4. **Set `top_k`.** The reranker runs over the vector store's
|
|
42
|
+
`top_k_pre` candidates and returns `top_k`. Common settings:
|
|
43
|
+
`top_k_pre=20, top_k=4` for cost-aware, `top_k_pre=50,
|
|
44
|
+
top_k=8` for quality-aware.
|
|
45
|
+
5. **Test it.** `await retriever.retrieve("query")` returns
|
|
46
|
+
`VectorMatch` rows already in reranked order — the
|
|
47
|
+
`score` field reflects the reranker's score, not the
|
|
48
|
+
original vector similarity.
|
|
49
|
+
|
|
50
|
+
## Variations
|
|
51
|
+
|
|
52
|
+
- **Two-stage** — keep an embedding-based fast path with a
|
|
53
|
+
reranker only on cold queries. Set
|
|
54
|
+
`retrieval.reranker.always: false`.
|
|
55
|
+
- **Custom reranker** — implement the `Reranker` ABC in
|
|
56
|
+
`agentforge_core.contracts.reranker` and register it via the
|
|
57
|
+
`agentforge.rerankers` entry-point in your module's
|
|
58
|
+
`pyproject.toml`.
|
|
59
|
+
|
|
60
|
+
## Troubleshooting
|
|
61
|
+
|
|
62
|
+
| Symptom | Cause | Fix |
|
|
63
|
+
|---|---|---|
|
|
64
|
+
| `No reranker registered for X` | package not installed | `agentforge add module reranker-X` |
|
|
65
|
+
| Latency 2-3x higher | local cross-encoder on CPU | switch to managed (Cohere / Voyage) |
|
|
66
|
+
| Top result is wrong | reranker model mismatch with corpus language | pick the matching `rerank-multilingual-v3.0` or similar |
|
|
67
|
+
| Cost spike | reranker called per request, hot path | cache by query hash or move reranker to async batch path |
|
|
68
|
+
|
|
69
|
+
## Related
|
|
70
|
+
|
|
71
|
+
- Runbook 08 — Add memory + retrieval
|
|
72
|
+
- Runbook 18 — Add hybrid search
|
|
73
|
+
- Feature spec: `docs/features/feat-021-reranker.md`
|
|
74
|
+
|
|
75
|
+
<!-- agentforge:end-managed -->
|
|
76
|
+
|
|
77
|
+
<!-- agentforge:custom -->
|
|
78
|
+
<!-- agentforge:end-custom -->
|