agentforge-py 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentforge/__init__.py +114 -0
- agentforge/_testing/__init__.py +19 -0
- agentforge/_testing/fake_llm.py +126 -0
- agentforge/_testing/fake_tool.py +122 -0
- agentforge/_tools/__init__.py +14 -0
- agentforge/_tools/calculator.py +102 -0
- agentforge/_tools/decorator.py +300 -0
- agentforge/_tools/file_read.py +112 -0
- agentforge/_tools/shell.py +134 -0
- agentforge/_tools/web_search.py +207 -0
- agentforge/agent.py +817 -0
- agentforge/auth.py +42 -0
- agentforge/cli/__init__.py +18 -0
- agentforge/cli/_build.py +323 -0
- agentforge/cli/_scaffold_state.py +250 -0
- agentforge/cli/_shared_scaffold.py +174 -0
- agentforge/cli/config_cmd.py +174 -0
- agentforge/cli/db_cmd.py +262 -0
- agentforge/cli/debug_cmd.py +168 -0
- agentforge/cli/docs_cmd.py +217 -0
- agentforge/cli/eval_cmd.py +181 -0
- agentforge/cli/health_cmd.py +139 -0
- agentforge/cli/list_modules.py +85 -0
- agentforge/cli/main.py +81 -0
- agentforge/cli/manifest_apply.py +368 -0
- agentforge/cli/module_cmd.py +247 -0
- agentforge/cli/new_cmd.py +171 -0
- agentforge/cli/run_cmd.py +234 -0
- agentforge/cli/upgrade_cmd.py +230 -0
- agentforge/config/__init__.py +45 -0
- agentforge/eval/__init__.py +18 -0
- agentforge/eval/consistency.py +107 -0
- agentforge/eval/coverage.py +100 -0
- agentforge/eval/format_compliance.py +107 -0
- agentforge/eval/regression.py +143 -0
- agentforge/findings.py +166 -0
- agentforge/guardrails/__init__.py +32 -0
- agentforge/guardrails/allowlist.py +49 -0
- agentforge/guardrails/capability_check.py +58 -0
- agentforge/guardrails/engine.py +289 -0
- agentforge/guardrails/pii_redact_basic.py +61 -0
- agentforge/guardrails/prompt_injection_basic.py +90 -0
- agentforge/memory/__init__.py +16 -0
- agentforge/memory/in_memory.py +130 -0
- agentforge/memory/in_memory_graph.py +262 -0
- agentforge/memory/in_memory_vector.py +167 -0
- agentforge/pipeline/__init__.py +26 -0
- agentforge/pipeline/engine.py +189 -0
- agentforge/pipeline/errors.py +19 -0
- agentforge/pipeline/tool.py +93 -0
- agentforge/py.typed +0 -0
- agentforge/recording.py +189 -0
- agentforge/renderers/__init__.py +28 -0
- agentforge/renderers/_defaults.py +32 -0
- agentforge/renderers/markdown.py +44 -0
- agentforge/renderers/patch_applier.py +46 -0
- agentforge/renderers/registry.py +108 -0
- agentforge/renderers/scorecard.py +59 -0
- agentforge/renderers/span_table.py +71 -0
- agentforge/replay.py +260 -0
- agentforge/resolver_register.py +41 -0
- agentforge/retrieval.py +410 -0
- agentforge/runtime.py +63 -0
- agentforge/strategies/__init__.py +27 -0
- agentforge/strategies/_base.py +280 -0
- agentforge/strategies/_plan.py +93 -0
- agentforge/strategies/multi_agent.py +541 -0
- agentforge/strategies/plan_execute.py +506 -0
- agentforge/strategies/react.py +237 -0
- agentforge/strategies/tot.py +472 -0
- agentforge/templates/_shared/.cursorrules +12 -0
- agentforge/templates/_shared/.github/copilot-instructions.md +13 -0
- agentforge/templates/_shared/.gitkeep +0 -0
- agentforge/templates/_shared/AGENTS.md.tmpl +123 -0
- agentforge/templates/_shared/CLAUDE.md +13 -0
- agentforge/templates/_shared/docs/runbooks/01-set-up-new-agent.md.tmpl +67 -0
- agentforge/templates/_shared/docs/runbooks/02-add-a-tool.md +67 -0
- agentforge/templates/_shared/docs/runbooks/03-add-a-pipeline-task.md +69 -0
- agentforge/templates/_shared/docs/runbooks/04-pick-reasoning-strategy.md +67 -0
- agentforge/templates/_shared/docs/runbooks/05-write-prompts.md +75 -0
- agentforge/templates/_shared/docs/runbooks/06-test-your-agent.md +75 -0
- agentforge/templates/_shared/docs/runbooks/07-debug-a-run.md +70 -0
- agentforge/templates/_shared/docs/runbooks/08-add-memory.md +75 -0
- agentforge/templates/_shared/docs/runbooks/09-add-mcp.md +78 -0
- agentforge/templates/_shared/docs/runbooks/10-add-evaluators.md +76 -0
- agentforge/templates/_shared/docs/runbooks/11-add-safety-guardrails.md +83 -0
- agentforge/templates/_shared/docs/runbooks/12-add-observability.md +77 -0
- agentforge/templates/_shared/docs/runbooks/13-configure-multi-provider.md +91 -0
- agentforge/templates/_shared/docs/runbooks/14-deploy-your-agent.md +70 -0
- agentforge/templates/_shared/docs/runbooks/15-upgrade-your-agent.md +67 -0
- agentforge/templates/_shared/docs/runbooks/16-configuration-reference.md +81 -0
- agentforge/templates/_shared/docs/runbooks/17-add-reranker.md +78 -0
- agentforge/templates/_shared/docs/runbooks/18-add-hybrid-search.md +78 -0
- agentforge/templates/_shared/docs/runbooks/19-add-graphrag.md +83 -0
- agentforge/templates/_shared/docs/runbooks/20-apply-schema-migrations.md +92 -0
- agentforge/templates/_shared/docs/runbooks/21-use-streaming-guardrails.md +82 -0
- agentforge/templates/_shared/docs/runbooks/README.md.tmpl +68 -0
- agentforge/templates/code-reviewer/.env.example +8 -0
- agentforge/templates/code-reviewer/.gitignore +7 -0
- agentforge/templates/code-reviewer/README.md +12 -0
- agentforge/templates/code-reviewer/agentforge.yaml +23 -0
- agentforge/templates/code-reviewer/copier.yml +34 -0
- agentforge/templates/code-reviewer/pyproject.toml +18 -0
- agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
- agentforge/templates/docs-qa/.env.example +8 -0
- agentforge/templates/docs-qa/.gitignore +7 -0
- agentforge/templates/docs-qa/README.md +14 -0
- agentforge/templates/docs-qa/agentforge.yaml +19 -0
- agentforge/templates/docs-qa/copier.yml +31 -0
- agentforge/templates/docs-qa/pyproject.toml +18 -0
- agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
- agentforge/templates/minimal/.env.example +11 -0
- agentforge/templates/minimal/.gitignore +10 -0
- agentforge/templates/minimal/README.md +28 -0
- agentforge/templates/minimal/agentforge.yaml +10 -0
- agentforge/templates/minimal/copier.yml +52 -0
- agentforge/templates/minimal/pyproject.toml +18 -0
- agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/main.py +34 -0
- agentforge/templates/patch-bot/.env.example +8 -0
- agentforge/templates/patch-bot/.gitignore +7 -0
- agentforge/templates/patch-bot/README.md +13 -0
- agentforge/templates/patch-bot/agentforge.yaml +15 -0
- agentforge/templates/patch-bot/copier.yml +31 -0
- agentforge/templates/patch-bot/pyproject.toml +18 -0
- agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
- agentforge/templates/research/.env.example +8 -0
- agentforge/templates/research/.gitignore +7 -0
- agentforge/templates/research/README.md +14 -0
- agentforge/templates/research/agentforge.yaml +17 -0
- agentforge/templates/research/copier.yml +31 -0
- agentforge/templates/research/pyproject.toml +18 -0
- agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/main.py +31 -0
- agentforge/templates/triage/.env.example +8 -0
- agentforge/templates/triage/.gitignore +7 -0
- agentforge/templates/triage/README.md +14 -0
- agentforge/templates/triage/agentforge.yaml +25 -0
- agentforge/templates/triage/copier.yml +31 -0
- agentforge/templates/triage/pyproject.toml +18 -0
- agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
- agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/main.py +30 -0
- agentforge/testing/__init__.py +69 -0
- agentforge/testing/conformance.py +40 -0
- agentforge/testing/factory.py +89 -0
- agentforge/testing/fixtures.py +42 -0
- agentforge/testing/llm.py +235 -0
- agentforge/testing/recording.py +177 -0
- agentforge/tools/__init__.py +41 -0
- agentforge_py-0.2.1.dist-info/METADATA +158 -0
- agentforge_py-0.2.1.dist-info/RECORD +157 -0
- agentforge_py-0.2.1.dist-info/WHEEL +4 -0
- agentforge_py-0.2.1.dist-info/entry_points.txt +2 -0
- agentforge_py-0.2.1.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""`agentforge upgrade/fork/unfork/status` commands (feat-011 chunks 4-5).
|
|
2
|
+
|
|
3
|
+
- **upgrade**: wraps Copier's `copier update` for the linked
|
|
4
|
+
template; refreshes the managed-files lock.
|
|
5
|
+
- **fork**: strip the framework marker from a file + flag it as
|
|
6
|
+
forked in the lock. Future upgrades skip it.
|
|
7
|
+
- **unfork**: restore from the template (lossy — overwrites local
|
|
8
|
+
edits). Re-runs the per-file render and updates the lock.
|
|
9
|
+
- **status**: walks the lock and prints managed / forked / drifted
|
|
10
|
+
/ missing per file.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
import yaml
|
|
20
|
+
from agentforge_core.production.exceptions import ModuleError
|
|
21
|
+
|
|
22
|
+
from agentforge.cli._scaffold_state import (
|
|
23
|
+
answers_path,
|
|
24
|
+
file_status,
|
|
25
|
+
hash_content,
|
|
26
|
+
marker_for,
|
|
27
|
+
read_lock,
|
|
28
|
+
strip_marker,
|
|
29
|
+
write_lock,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def register_upgrade_cmds(sub: argparse._SubParsersAction) -> None:  # type: ignore[type-arg]
    """Register the upgrade / fork / unfork / status subcommands on *sub*.

    Every parser stores its handler function under the ``_handler``
    default so the caller can look it up on the parsed namespace.
    """
    # upgrade: optional target version plus a dry-run switch.
    upgrade_parser = sub.add_parser(
        "upgrade",
        help="Pull framework updates into this agent (three-way merge).",
    )
    upgrade_parser.add_argument("--to", default=None, help="Target version (default: latest).")
    upgrade_parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would change without writing.",
    )
    upgrade_parser.set_defaults(_handler=_run_upgrade)

    # fork: one positional path.
    fork_parser = sub.add_parser("fork", help="Claim a managed file — future upgrades skip it.")
    fork_parser.add_argument("path", help="Path to the file to fork (relative to cwd).")
    fork_parser.set_defaults(_handler=_run_fork)

    # unfork: one positional path.
    unfork_parser = sub.add_parser("unfork", help="Restore a forked file to the template version.")
    unfork_parser.add_argument("path", help="Path to the file to unfork.")
    unfork_parser.set_defaults(_handler=_run_unfork)

    # status: no arguments.
    status_parser = sub.add_parser(
        "status",
        help="Show managed / forked / drifted files in this agent.",
    )
    status_parser.set_defaults(_handler=_run_status)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ----------------------------------------------------------------------
|
|
64
|
+
# upgrade
|
|
65
|
+
# ----------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _run_upgrade(
    args: argparse.Namespace,
    *,
    cwd: Path | None = None,
) -> int:
    """Run `copier update` against the linked template + refresh lock.

    Copier handles the three-way merge against the answer file's
    recorded template-version. We only handle the lock refresh
    afterwards.

    Args:
        args: Parsed CLI namespace; reads ``dry_run`` and ``to``.
        cwd: Directory to operate on; defaults to the current directory.

    Returns:
        0 on success (including a dry run), 1 when the answers file is
        missing or the Copier update fails.
    """
    work_dir = cwd if cwd is not None else Path.cwd()
    if not answers_path(work_dir).exists():
        sys.stderr.write(
            "No .agentforge-state/answers.yml; this directory wasn't scaffolded by "
            "`agentforge new`. Nothing to upgrade.\n"
        )
        return 1

    if args.dry_run:
        # Dry run only prints a notice; Copier is not invoked.
        sys.stdout.write(" → dry-run: not actually running copier update\n")
        return 0

    try:
        _run_copier_update(work_dir, to=args.to)
    except ModuleError as exc:
        sys.stderr.write(f"upgrade failed: {exc}\n")
        return 1

    # Private helper, imported once here rather than on every loop iteration.
    from agentforge.cli._scaffold_state import _strip_marker_for_hash  # noqa: PLC0415

    # Refresh the lock: re-hash every still-managed file against its
    # new content. Forked entries stay flagged.
    lock = read_lock(work_dir)
    new_lock: dict[str, dict[str, object]] = {}
    for rel, entry in lock.items():
        path = work_dir / rel
        if not path.exists():
            # Entries whose file no longer exists are dropped from the lock.
            continue
        if entry.get("forked"):
            new_lock[rel] = entry
            continue
        try:
            content = path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            # Unreadable as UTF-8 text: keep the previous entry unchanged.
            new_lock[rel] = entry
            continue
        # Strip marker before hashing.
        body = _strip_marker_for_hash(content)
        new_lock[rel] = {**entry, "hash": hash_content(body)}
    write_lock(work_dir, new_lock)
    sys.stdout.write(" → upgrade complete; lock refreshed.\n")
    return 0
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _run_copier_update(cwd: Path, *, to: str | None) -> None:
    """Call Copier's ``run_update`` on *cwd*, targeting *to* (or ``"HEAD"``).

    Any exception raised by Copier is re-raised as ``ModuleError`` with
    the original exception chained as its cause.
    """
    from copier import run_update  # noqa: PLC0415

    # A falsy `to` (None or empty string) selects "HEAD".
    target_ref = to or "HEAD"
    destination = str(cwd)
    try:
        run_update(
            dst_path=destination,
            vcs_ref=target_ref,
            defaults=True,
            overwrite=True,
            quiet=False,
        )
    except Exception as exc:
        raise ModuleError(f"copier update failed: {exc}") from exc
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# ----------------------------------------------------------------------
|
|
139
|
+
# fork / unfork
|
|
140
|
+
# ----------------------------------------------------------------------
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _run_fork(args: argparse.Namespace, *, cwd: Path | None = None) -> int:
    """Mark ``args.path`` as forked in the managed-files lock.

    Calls ``strip_marker`` on the file, sets ``forked`` to True on its
    lock entry and writes the lock back. Returns 0 on success, 1 when
    the path is not present in the lock.
    """
    base_dir = Path.cwd() if cwd is None else cwd
    rel_path = args.path
    managed = read_lock(base_dir)

    if rel_path not in managed:
        sys.stderr.write(f"{rel_path} is not in the managed-files lock; nothing to fork.\n")
        return 1

    strip_marker(base_dir / rel_path)
    forked_entry = {**managed[rel_path], "forked": True}
    managed[rel_path] = forked_entry
    write_lock(base_dir, managed)
    sys.stdout.write(f" → forked {rel_path}. Future upgrades will skip it.\n")
    return 0
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _run_unfork(args: argparse.Namespace, *, cwd: Path | None = None) -> int:
    """Clear the forked flag on ``args.path`` and re-prepend its marker.

    This does not restore the template-version content (that needs a
    Copier re-run); it only flips the flag, rewrites the marker and
    recomputes the hash. The next `agentforge upgrade` re-renders the
    file.

    Args:
        args: Parsed CLI namespace; reads ``path``.
        cwd: Directory to operate on; defaults to the current directory.

    Returns:
        0 on success, 1 when the path is not in the lock or is not
        currently flagged as forked.
    """
    work_dir = cwd if cwd is not None else Path.cwd()
    rel = args.path
    lock = read_lock(work_dir)
    if rel not in lock:
        sys.stderr.write(f"{rel} is not in the managed-files lock.\n")
        return 1
    if not lock[rel].get("forked"):
        sys.stderr.write(f"{rel} is not forked.\n")
        return 1

    target = work_dir / rel
    entry = {**lock[rel], "forked": False}

    content: str | None = None
    if target.exists():
        try:
            content = target.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            # The file cannot be read as UTF-8 text. Do NOT treat it as
            # empty: that would overwrite it with just the marker and
            # record the hash of an empty body. Leave the file and the
            # recorded hash alone, and only clear the flag.
            content = None

    if content is not None:
        from agentforge.cli._scaffold_state import _strip_marker_for_hash  # noqa: PLC0415

        body = _strip_marker_for_hash(content)
        marker = marker_for(
            target.suffix,
            lock[rel].get("source_module", "template:unknown"),
            lock[rel].get("source_version", "0"),
            hash_content(body)[:12],
        )
        # Only rewrite the file when marker_for produced a non-empty marker.
        if marker:
            target.write_text(marker + "\n" + body, encoding="utf-8")
        entry["hash"] = hash_content(body)

    lock[rel] = entry
    write_lock(work_dir, lock)
    sys.stdout.write(f" → unforked {rel}. Run `agentforge upgrade` to pull template content.\n")
    return 0
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# ----------------------------------------------------------------------
|
|
198
|
+
# status
|
|
199
|
+
# ----------------------------------------------------------------------
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _run_status(args: argparse.Namespace, *, cwd: Path | None = None) -> int:
    """Print lock entries grouped as managed / forked / drifted / missing.

    Groups with no files are omitted. Always returns 0, including when
    the lock is empty.
    """
    del args
    base_dir = Path.cwd() if cwd is None else cwd
    managed = read_lock(base_dir)
    if not managed:
        sys.stdout.write("No managed-files lock; this directory wasn't scaffolded.\n")
        return 0

    # Fixed label order drives both the buckets and the output order.
    labels = ("managed", "forked", "drifted", "missing")
    buckets: dict[str, list[str]] = {label: [] for label in labels}
    for rel_path, entry in sorted(managed.items()):
        buckets[file_status(base_dir, rel_path, entry)].append(rel_path)

    for label in labels:
        members = buckets[label]
        if members:
            sys.stdout.write(f"\n{label.upper()} ({len(members)})\n")
            for member in members:
                sys.stdout.write(f" {member}\n")
    return 0
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
__all__ = ["register_upgrade_cmds"]
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# Suppress unused-import warning in module-level imports the file
|
|
229
|
+
# uses transitively.
|
|
230
|
+
_ = yaml
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Configuration loader for `agentforge.yaml` — re-export from core.
|
|
2
|
+
|
|
3
|
+
feat-012 moved the canonical schema + loader to `agentforge-core`
|
|
4
|
+
so the resolver can compose module-side Pydantic schemas without
|
|
5
|
+
importing the runtime package. This module stays as a re-export
|
|
6
|
+
for the historical `from agentforge.config import ...` path.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from agentforge_core.config import (
|
|
12
|
+
AgentConfig,
|
|
13
|
+
AgentForgeConfig,
|
|
14
|
+
BudgetConfig,
|
|
15
|
+
EvaluatorEntry,
|
|
16
|
+
GraphModuleConfig,
|
|
17
|
+
LoggingConfig,
|
|
18
|
+
MemoryModuleConfig,
|
|
19
|
+
ModuleEntry,
|
|
20
|
+
ModulesConfig,
|
|
21
|
+
ObservabilityEntry,
|
|
22
|
+
OutputConfig,
|
|
23
|
+
ProviderConfig,
|
|
24
|
+
RetrieverModuleConfig,
|
|
25
|
+
load_config,
|
|
26
|
+
parse_overrides,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"AgentConfig",
|
|
31
|
+
"AgentForgeConfig",
|
|
32
|
+
"BudgetConfig",
|
|
33
|
+
"EvaluatorEntry",
|
|
34
|
+
"GraphModuleConfig",
|
|
35
|
+
"LoggingConfig",
|
|
36
|
+
"MemoryModuleConfig",
|
|
37
|
+
"ModuleEntry",
|
|
38
|
+
"ModulesConfig",
|
|
39
|
+
"ObservabilityEntry",
|
|
40
|
+
"OutputConfig",
|
|
41
|
+
"ProviderConfig",
|
|
42
|
+
"RetrieverModuleConfig",
|
|
43
|
+
"load_config",
|
|
44
|
+
"parse_overrides",
|
|
45
|
+
]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Deterministic evaluators shipped in `agentforge` (feat-006).
|
|
2
|
+
|
|
3
|
+
Zero-cost graders that don't call an LLM — safe to run on every
|
|
4
|
+
output. LLM-judge graders ship separately in `agentforge-eval-geval`.
|
|
5
|
+
|
|
6
|
+
Each grader is constructible directly (`Coverage(reference={...})`),
|
|
7
|
+
or addressable by name through the resolver (`"coverage"` etc.) when
|
|
8
|
+
the runtime is asked to look up a grader by string.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from agentforge.eval.consistency import Consistency
|
|
14
|
+
from agentforge.eval.coverage import Coverage
|
|
15
|
+
from agentforge.eval.format_compliance import FormatCompliance
|
|
16
|
+
from agentforge.eval.regression import RegressionVsBaseline
|
|
17
|
+
|
|
18
|
+
__all__ = ["Consistency", "Coverage", "FormatCompliance", "RegressionVsBaseline"]
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""`Consistency` — deterministic grader for "same input → same output".
|
|
2
|
+
|
|
3
|
+
Re-runs the task N times via a caller-supplied async function and
|
|
4
|
+
scores the agreement of the N outputs against the original output.
|
|
5
|
+
The re-run function is the seam — for unit tests it can be a
|
|
6
|
+
scripted-response function; in production it typically wraps another
|
|
7
|
+
`Agent.run(task)` call.
|
|
8
|
+
|
|
9
|
+
The grader declares `cost_estimate_usd = 0.0` against the evaluator
|
|
10
|
+
budget gate (it doesn't bill itself), but the re-run function calls
|
|
11
|
+
the LLM and bills against the run's `BudgetPolicy` like any other
|
|
12
|
+
agent call. The caller is responsible for ensuring the runner
|
|
13
|
+
respects the same budget if they want a unified cost cap.
|
|
14
|
+
|
|
15
|
+
Score = fraction of re-runs whose output matches the original. The
|
|
16
|
+
match function defaults to strict equality; pass a custom
|
|
17
|
+
`matcher` for fuzzy comparison (cosine similarity, normalised
|
|
18
|
+
string compare, etc.).
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from collections.abc import Awaitable, Callable
|
|
24
|
+
from typing import Any, ClassVar
|
|
25
|
+
|
|
26
|
+
from agentforge_core.contracts.evaluator import EvalResult, Evaluator
|
|
27
|
+
|
|
28
|
+
Runner = Callable[[str], Awaitable[Any]]
|
|
29
|
+
"""Async function that re-executes the task and returns the new output."""
|
|
30
|
+
|
|
31
|
+
Matcher = Callable[[Any, Any], bool]
|
|
32
|
+
"""Equality check between the original output and one re-run output."""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Consistency(Evaluator):
    """Grade how many of N re-runs reproduce the original output."""

    name: ClassVar[str] = "consistency"
    # This grader makes no LLM call of its own, so it declares a $0
    # estimate. Whatever the caller-supplied runner does is outside this
    # class; a failure raised by the runner surfaces as a "fail" result.
    cost_estimate_usd: ClassVar[float] = 0.0

    def __init__(
        self,
        *,
        runner: Runner,
        n_samples: int = 3,
        matcher: Matcher | None = None,
    ) -> None:
        """Store the runner, sample count and matcher.

        Raises:
            ValueError: if ``n_samples`` is below 1.
        """
        if n_samples < 1:
            raise ValueError(f"n_samples must be >= 1; got {n_samples}")
        self._runner = runner
        self._n = n_samples
        # Fall back to strict equality when no matcher is supplied.
        self._matcher: Matcher = _strict_eq if matcher is None else matcher

    async def evaluate(self, finding: Any, context: dict[str, Any]) -> EvalResult:
        """Re-run ``context["task"]`` N times and score agreement with *finding*.

        Returns a "fail" result with score 0.0 when the task is missing
        or not a string, or as soon as any re-run raises. Otherwise the
        score is ``agreements / n_samples``.
        """
        task = context.get("task")
        if not isinstance(task, str):
            return EvalResult(
                evaluator=self.name,
                score=0.0,
                label="fail",
                reasoning="context['task'] missing or not a string",
            )

        # Accept either an object exposing `.output` or the raw output.
        original = finding.output if hasattr(finding, "output") else finding
        total = self._n

        agreements = 0
        rerun_outputs: list[Any] = []
        for attempt in range(1, total + 1):
            try:
                replay = await self._runner(task)
            except Exception as exc:
                # Abort on the first failing re-run; report what was collected.
                return EvalResult(
                    evaluator=self.name,
                    score=0.0,
                    label="fail",
                    reasoning=f"re-run {attempt}/{total} raised {type(exc).__name__}: {exc}",
                    raw={"rerun_outputs": rerun_outputs},
                )
            rerun_outputs.append(replay)
            if self._matcher(original, replay):
                agreements += 1

        if agreements == total:
            label = "pass"
        elif agreements > 0:
            label = "warn"
        else:
            label = "fail"

        return EvalResult(
            evaluator=self.name,
            score=agreements / total,
            label=label,
            reasoning=f"{agreements}/{total} re-runs matched the original",
            raw={
                "n_samples": total,
                "agreements": agreements,
                "original": original,
                "rerun_outputs": rerun_outputs,
            },
        )
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _strict_eq(a: Any, b: Any) -> bool:
|
|
104
|
+
return bool(a == b)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
__all__ = ["Consistency", "Matcher", "Runner"]
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""`Coverage` — deterministic grader for "what fraction of expected items did the agent find?"
|
|
2
|
+
|
|
3
|
+
The reference set is supplied at construction; the grader extracts the
|
|
4
|
+
agent's items from `RunResult.output` (string match by default; callable
|
|
5
|
+
override for structural extraction) and computes
|
|
6
|
+
`score = |intersection| / |reference|` clamped to `[0, 1]`.
|
|
7
|
+
|
|
8
|
+
Use for code-review agents (did the agent flag every known issue?),
|
|
9
|
+
RAG agents (did the answer cite every required source?), and any
|
|
10
|
+
task where ground truth is "should mention exactly these things".
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from collections.abc import Callable, Iterable
|
|
16
|
+
from typing import Any, ClassVar
|
|
17
|
+
|
|
18
|
+
from agentforge_core.contracts.evaluator import EvalResult, Evaluator
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Coverage(Evaluator):
    """Score the share of reference items that appear in the output.

    Usage:

        Coverage(reference={"sql injection", "xss", "csrf"})

        # With a custom extractor for structured output:
        Coverage(
            reference={"item-1", "item-2"},
            extractor=lambda out: set(out["found"]),
        )

    Without an extractor, each reference item is looked up in the
    output text by case-insensitive substring containment. With an
    extractor, the items it returns are compared case-insensitively
    against the reference set.
    """

    name: ClassVar[str] = "coverage"
    cost_estimate_usd: ClassVar[float] = 0.0

    def __init__(
        self,
        *,
        reference: Iterable[str],
        extractor: Callable[[str | dict[str, Any]], set[str]] | None = None,
    ) -> None:
        """Store the non-empty reference items and optional extractor.

        Raises:
            ValueError: if no truthy reference item is supplied.
        """
        kept = {item for item in reference if item}
        if not kept:
            raise ValueError("Coverage requires a non-empty reference set")
        self._reference: frozenset[str] = frozenset(kept)
        self._extractor = extractor

    async def evaluate(self, finding: Any, context: dict[str, Any]) -> EvalResult:
        """Compare *finding* against the reference set; *context* is unused."""
        del context
        output = finding.output if hasattr(finding, "output") else finding
        present, extracted = self._find_present(output)

        total = len(self._reference)
        matched = sorted(present)
        missing = sorted(item for item in self._reference if item.lower() not in present)

        if not missing:
            label = "pass"
        elif matched:
            label = "warn"
        else:
            label = "fail"

        if missing:
            reasoning = f"matched {len(matched)}/{total}; missing={missing}"
        else:
            reasoning = f"matched {len(matched)}/{total}; all present"

        return EvalResult(
            evaluator=self.name,
            score=(total - len(missing)) / total,
            label=label,
            reasoning=reasoning,
            raw={
                "matched": matched,
                "missing": missing,
                "extracted": sorted(extracted),
            },
        )

    def _find_present(self, output: Any) -> tuple[set[str], set[str]]:
        """Return ``(matched_lowercased, raw_extracted)`` for *output*.

        The first element holds the lower-cased reference items found in
        the output. The second is what the extractor returned, or a
        one-element set with the output text when no extractor is set.
        """
        if self._extractor is None:
            # Default: substring search in a single lower-cased text blob.
            text = output if isinstance(output, str) else str(output)
            haystack = text.lower()
            hits = {ref.lower() for ref in self._reference if ref.lower() in haystack}
            return hits, {text}

        extracted = self._extractor(output)
        lowered = {item.lower() for item in extracted}
        hits = {ref.lower() for ref in self._reference if ref.lower() in lowered}
        return hits, extracted
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
__all__ = ["Coverage"]
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""`FormatCompliance` — deterministic grader for output shape.
|
|
2
|
+
|
|
3
|
+
Three modes pick the constraint applied to `RunResult.output`:
|
|
4
|
+
- `regex=<pattern>` — output (str) must match the pattern.
|
|
5
|
+
- `pydantic_model=<BaseModel subclass>` — output validates against
|
|
6
|
+
the model. Accepts dict output directly or a string that
|
|
7
|
+
parses as JSON.
|
|
8
|
+
- `json_parseable=True` — output (str) must parse as JSON. No
|
|
9
|
+
schema enforcement.
|
|
10
|
+
|
|
11
|
+
Exactly one mode must be set at construction. Future modes (Lark
|
|
12
|
+
grammars, ANTLR, JSON Schema via the `jsonschema` dep) can land in
|
|
13
|
+
follow-ups without breaking the constructor.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import re
|
|
20
|
+
from typing import Any, ClassVar
|
|
21
|
+
|
|
22
|
+
from agentforge_core.contracts.evaluator import EvalResult, Evaluator
|
|
23
|
+
from pydantic import BaseModel, ValidationError
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class FormatCompliance(Evaluator):
    """Binary shape check: score 1.0 with label "pass", else 0.0 with "fail"."""

    name: ClassVar[str] = "format_compliance"
    cost_estimate_usd: ClassVar[float] = 0.0

    def __init__(
        self,
        *,
        regex: str | None = None,
        pydantic_model: type[BaseModel] | None = None,
        json_parseable: bool = False,
    ) -> None:
        """Select exactly one checking mode.

        Raises:
            ValueError: if zero or more than one mode is set.
        """
        # Booleans sum as 0/1, giving the number of active modes.
        active_modes = sum(
            (regex is not None, pydantic_model is not None, bool(json_parseable))
        )
        if active_modes != 1:
            raise ValueError(
                "FormatCompliance requires exactly one of regex=, pydantic_model=, json_parseable="
            )
        self._regex = None if regex is None else re.compile(regex)
        self._model = pydantic_model
        self._json_parseable = json_parseable

    async def evaluate(self, finding: Any, context: dict[str, Any]) -> EvalResult:
        """Dispatch to the configured mode's check; *context* is unused."""
        del context
        output = finding.output if hasattr(finding, "output") else finding
        if self._regex is not None:
            return self._check_regex(output)
        if self._model is not None:
            return self._check_model(output)
        return self._check_json(output)

    def _check_regex(self, output: Any) -> EvalResult:
        """Pass when *output* is a string the pattern matches in full."""
        pattern = self._regex
        assert pattern is not None
        if not isinstance(output, str):
            return _fail(f"regex mode requires string output; got {type(output).__name__}")
        if pattern.fullmatch(output) is None:
            return _fail(f"output did not match regex {pattern.pattern!r}")
        return _pass(f"output matched regex {pattern.pattern!r}")

    def _check_model(self, output: Any) -> EvalResult:
        """Pass when a dict, or a JSON string, validates against the model."""
        model = self._model
        assert model is not None
        candidate: Any
        if isinstance(output, dict):
            candidate = output
        elif isinstance(output, str):
            # Strings are decoded as JSON before validation.
            try:
                candidate = json.loads(output)
            except json.JSONDecodeError as exc:
                return _fail(f"output is not JSON-parseable: {exc.msg}")
        else:
            return _fail(
                f"pydantic_model mode requires dict or JSON string; got {type(output).__name__}"
            )

        try:
            model.model_validate(candidate)
        except ValidationError as exc:
            return _fail(f"validation failed: {exc.errors(include_url=False)}")
        return _pass(f"output validates against {model.__name__}")

    def _check_json(self, output: Any) -> EvalResult:
        """Pass when *output* is a dict or a string that parses as JSON."""
        if isinstance(output, dict):
            return _pass("output is already a dict (JSON-compatible)")
        if not isinstance(output, str):
            return _fail(f"json_parseable mode requires str or dict; got {type(output).__name__}")
        try:
            json.loads(output)
        except json.JSONDecodeError as exc:
            return _fail(f"output is not JSON-parseable: {exc.msg}")
        return _pass("output parses as JSON")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _pass(reasoning: str) -> EvalResult:
    """Build a passing ``EvalResult`` (score 1.0) carrying *reasoning*."""
    return EvalResult(
        evaluator=FormatCompliance.name,
        score=1.0,
        label="pass",
        reasoning=reasoning,
    )
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _fail(reasoning: str) -> EvalResult:
    """Build a failing ``EvalResult`` (score 0.0) carrying *reasoning*."""
    return EvalResult(
        evaluator=FormatCompliance.name,
        score=0.0,
        label="fail",
        reasoning=reasoning,
    )
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
__all__ = ["FormatCompliance"]
|