agentforge-py 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. agentforge/__init__.py +114 -0
  2. agentforge/_testing/__init__.py +19 -0
  3. agentforge/_testing/fake_llm.py +126 -0
  4. agentforge/_testing/fake_tool.py +122 -0
  5. agentforge/_tools/__init__.py +14 -0
  6. agentforge/_tools/calculator.py +102 -0
  7. agentforge/_tools/decorator.py +300 -0
  8. agentforge/_tools/file_read.py +112 -0
  9. agentforge/_tools/shell.py +134 -0
  10. agentforge/_tools/web_search.py +207 -0
  11. agentforge/agent.py +817 -0
  12. agentforge/auth.py +42 -0
  13. agentforge/cli/__init__.py +18 -0
  14. agentforge/cli/_build.py +323 -0
  15. agentforge/cli/_scaffold_state.py +250 -0
  16. agentforge/cli/_shared_scaffold.py +174 -0
  17. agentforge/cli/config_cmd.py +174 -0
  18. agentforge/cli/db_cmd.py +262 -0
  19. agentforge/cli/debug_cmd.py +168 -0
  20. agentforge/cli/docs_cmd.py +217 -0
  21. agentforge/cli/eval_cmd.py +181 -0
  22. agentforge/cli/health_cmd.py +139 -0
  23. agentforge/cli/list_modules.py +85 -0
  24. agentforge/cli/main.py +81 -0
  25. agentforge/cli/manifest_apply.py +368 -0
  26. agentforge/cli/module_cmd.py +247 -0
  27. agentforge/cli/new_cmd.py +171 -0
  28. agentforge/cli/run_cmd.py +234 -0
  29. agentforge/cli/upgrade_cmd.py +230 -0
  30. agentforge/config/__init__.py +45 -0
  31. agentforge/eval/__init__.py +18 -0
  32. agentforge/eval/consistency.py +107 -0
  33. agentforge/eval/coverage.py +100 -0
  34. agentforge/eval/format_compliance.py +107 -0
  35. agentforge/eval/regression.py +143 -0
  36. agentforge/findings.py +166 -0
  37. agentforge/guardrails/__init__.py +32 -0
  38. agentforge/guardrails/allowlist.py +49 -0
  39. agentforge/guardrails/capability_check.py +58 -0
  40. agentforge/guardrails/engine.py +289 -0
  41. agentforge/guardrails/pii_redact_basic.py +61 -0
  42. agentforge/guardrails/prompt_injection_basic.py +90 -0
  43. agentforge/memory/__init__.py +16 -0
  44. agentforge/memory/in_memory.py +130 -0
  45. agentforge/memory/in_memory_graph.py +262 -0
  46. agentforge/memory/in_memory_vector.py +167 -0
  47. agentforge/pipeline/__init__.py +26 -0
  48. agentforge/pipeline/engine.py +189 -0
  49. agentforge/pipeline/errors.py +19 -0
  50. agentforge/pipeline/tool.py +93 -0
  51. agentforge/py.typed +0 -0
  52. agentforge/recording.py +189 -0
  53. agentforge/renderers/__init__.py +28 -0
  54. agentforge/renderers/_defaults.py +32 -0
  55. agentforge/renderers/markdown.py +44 -0
  56. agentforge/renderers/patch_applier.py +46 -0
  57. agentforge/renderers/registry.py +108 -0
  58. agentforge/renderers/scorecard.py +59 -0
  59. agentforge/renderers/span_table.py +71 -0
  60. agentforge/replay.py +260 -0
  61. agentforge/resolver_register.py +41 -0
  62. agentforge/retrieval.py +410 -0
  63. agentforge/runtime.py +63 -0
  64. agentforge/strategies/__init__.py +27 -0
  65. agentforge/strategies/_base.py +280 -0
  66. agentforge/strategies/_plan.py +93 -0
  67. agentforge/strategies/multi_agent.py +541 -0
  68. agentforge/strategies/plan_execute.py +506 -0
  69. agentforge/strategies/react.py +237 -0
  70. agentforge/strategies/tot.py +472 -0
  71. agentforge/templates/_shared/.cursorrules +12 -0
  72. agentforge/templates/_shared/.github/copilot-instructions.md +13 -0
  73. agentforge/templates/_shared/.gitkeep +0 -0
  74. agentforge/templates/_shared/AGENTS.md.tmpl +123 -0
  75. agentforge/templates/_shared/CLAUDE.md +13 -0
  76. agentforge/templates/_shared/docs/runbooks/01-set-up-new-agent.md.tmpl +67 -0
  77. agentforge/templates/_shared/docs/runbooks/02-add-a-tool.md +67 -0
  78. agentforge/templates/_shared/docs/runbooks/03-add-a-pipeline-task.md +69 -0
  79. agentforge/templates/_shared/docs/runbooks/04-pick-reasoning-strategy.md +67 -0
  80. agentforge/templates/_shared/docs/runbooks/05-write-prompts.md +75 -0
  81. agentforge/templates/_shared/docs/runbooks/06-test-your-agent.md +75 -0
  82. agentforge/templates/_shared/docs/runbooks/07-debug-a-run.md +70 -0
  83. agentforge/templates/_shared/docs/runbooks/08-add-memory.md +75 -0
  84. agentforge/templates/_shared/docs/runbooks/09-add-mcp.md +78 -0
  85. agentforge/templates/_shared/docs/runbooks/10-add-evaluators.md +76 -0
  86. agentforge/templates/_shared/docs/runbooks/11-add-safety-guardrails.md +83 -0
  87. agentforge/templates/_shared/docs/runbooks/12-add-observability.md +77 -0
  88. agentforge/templates/_shared/docs/runbooks/13-configure-multi-provider.md +91 -0
  89. agentforge/templates/_shared/docs/runbooks/14-deploy-your-agent.md +70 -0
  90. agentforge/templates/_shared/docs/runbooks/15-upgrade-your-agent.md +67 -0
  91. agentforge/templates/_shared/docs/runbooks/16-configuration-reference.md +81 -0
  92. agentforge/templates/_shared/docs/runbooks/17-add-reranker.md +78 -0
  93. agentforge/templates/_shared/docs/runbooks/18-add-hybrid-search.md +78 -0
  94. agentforge/templates/_shared/docs/runbooks/19-add-graphrag.md +83 -0
  95. agentforge/templates/_shared/docs/runbooks/20-apply-schema-migrations.md +92 -0
  96. agentforge/templates/_shared/docs/runbooks/21-use-streaming-guardrails.md +82 -0
  97. agentforge/templates/_shared/docs/runbooks/README.md.tmpl +68 -0
  98. agentforge/templates/code-reviewer/.env.example +8 -0
  99. agentforge/templates/code-reviewer/.gitignore +7 -0
  100. agentforge/templates/code-reviewer/README.md +12 -0
  101. agentforge/templates/code-reviewer/agentforge.yaml +23 -0
  102. agentforge/templates/code-reviewer/copier.yml +34 -0
  103. agentforge/templates/code-reviewer/pyproject.toml +18 -0
  104. agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  105. agentforge/templates/code-reviewer/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
  106. agentforge/templates/docs-qa/.env.example +8 -0
  107. agentforge/templates/docs-qa/.gitignore +7 -0
  108. agentforge/templates/docs-qa/README.md +14 -0
  109. agentforge/templates/docs-qa/agentforge.yaml +19 -0
  110. agentforge/templates/docs-qa/copier.yml +31 -0
  111. agentforge/templates/docs-qa/pyproject.toml +18 -0
  112. agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  113. agentforge/templates/docs-qa/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
  114. agentforge/templates/minimal/.env.example +11 -0
  115. agentforge/templates/minimal/.gitignore +10 -0
  116. agentforge/templates/minimal/README.md +28 -0
  117. agentforge/templates/minimal/agentforge.yaml +10 -0
  118. agentforge/templates/minimal/copier.yml +52 -0
  119. agentforge/templates/minimal/pyproject.toml +18 -0
  120. agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  121. agentforge/templates/minimal/src/{{project_slug.replace('-', '_')}}/main.py +34 -0
  122. agentforge/templates/patch-bot/.env.example +8 -0
  123. agentforge/templates/patch-bot/.gitignore +7 -0
  124. agentforge/templates/patch-bot/README.md +13 -0
  125. agentforge/templates/patch-bot/agentforge.yaml +15 -0
  126. agentforge/templates/patch-bot/copier.yml +31 -0
  127. agentforge/templates/patch-bot/pyproject.toml +18 -0
  128. agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  129. agentforge/templates/patch-bot/src/{{project_slug.replace('-', '_')}}/main.py +32 -0
  130. agentforge/templates/research/.env.example +8 -0
  131. agentforge/templates/research/.gitignore +7 -0
  132. agentforge/templates/research/README.md +14 -0
  133. agentforge/templates/research/agentforge.yaml +17 -0
  134. agentforge/templates/research/copier.yml +31 -0
  135. agentforge/templates/research/pyproject.toml +18 -0
  136. agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  137. agentforge/templates/research/src/{{project_slug.replace('-', '_')}}/main.py +31 -0
  138. agentforge/templates/triage/.env.example +8 -0
  139. agentforge/templates/triage/.gitignore +7 -0
  140. agentforge/templates/triage/README.md +14 -0
  141. agentforge/templates/triage/agentforge.yaml +25 -0
  142. agentforge/templates/triage/copier.yml +31 -0
  143. agentforge/templates/triage/pyproject.toml +18 -0
  144. agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/__init__.py +5 -0
  145. agentforge/templates/triage/src/{{project_slug.replace('-', '_')}}/main.py +30 -0
  146. agentforge/testing/__init__.py +69 -0
  147. agentforge/testing/conformance.py +40 -0
  148. agentforge/testing/factory.py +89 -0
  149. agentforge/testing/fixtures.py +42 -0
  150. agentforge/testing/llm.py +235 -0
  151. agentforge/testing/recording.py +177 -0
  152. agentforge/tools/__init__.py +41 -0
  153. agentforge_py-0.2.1.dist-info/METADATA +158 -0
  154. agentforge_py-0.2.1.dist-info/RECORD +157 -0
  155. agentforge_py-0.2.1.dist-info/WHEEL +4 -0
  156. agentforge_py-0.2.1.dist-info/entry_points.txt +2 -0
  157. agentforge_py-0.2.1.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,230 @@
1
+ """`agentforge upgrade/fork/unfork/status` commands (feat-011 chunks 4-5).
2
+
3
+ - **upgrade**: wraps Copier's `copier update` for the linked
4
+ template; refreshes the managed-files lock.
5
+ - **fork**: strip the framework marker from a file + flag it as
6
+ forked in the lock. Future upgrades skip it.
7
+ - **unfork**: clear the forked flag, re-prepend the framework marker and
8
+ re-hash the file. The next `agentforge upgrade` re-renders the file.
9
+ - **status**: walks the lock and prints managed / forked / drifted
10
+ / missing per file.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import sys
17
+ from pathlib import Path
18
+
19
+ import yaml
20
+ from agentforge_core.production.exceptions import ModuleError
21
+
22
+ from agentforge.cli._scaffold_state import (
23
+ answers_path,
24
+ file_status,
25
+ hash_content,
26
+ marker_for,
27
+ read_lock,
28
+ strip_marker,
29
+ write_lock,
30
+ )
31
+
32
+
33
def register_upgrade_cmds(sub: argparse._SubParsersAction) -> None:  # type: ignore[type-arg]
    """Register the `upgrade`, `fork`, `unfork` and `status` subcommands on ``sub``.

    Every subparser records its handler function under the ``_handler``
    default, pointing at the matching `_run_*` function of this module.
    """
    # upgrade: optional target version plus a dry-run switch.
    upgrade_parser = sub.add_parser(
        "upgrade", help="Pull framework updates into this agent (three-way merge)."
    )
    upgrade_parser.add_argument("--to", default=None, help="Target version (default: latest).")
    upgrade_parser.add_argument(
        "--dry-run", action="store_true", help="Show what would change without writing."
    )
    upgrade_parser.set_defaults(_handler=_run_upgrade)

    # fork: takes the path of the managed file to claim.
    fork_parser = sub.add_parser(
        "fork", help="Claim a managed file — future upgrades skip it."
    )
    fork_parser.add_argument("path", help="Path to the file to fork (relative to cwd).")
    fork_parser.set_defaults(_handler=_run_fork)

    # unfork: takes the path of a previously forked file.
    unfork_parser = sub.add_parser(
        "unfork", help="Restore a forked file to the template version."
    )
    unfork_parser.add_argument("path", help="Path to the file to unfork.")
    unfork_parser.set_defaults(_handler=_run_unfork)

    # status: no arguments of its own.
    status_parser = sub.add_parser(
        "status", help="Show managed / forked / drifted files in this agent."
    )
    status_parser.set_defaults(_handler=_run_status)
61
+
62
+
63
+ # ----------------------------------------------------------------------
64
+ # upgrade
65
+ # ----------------------------------------------------------------------
66
+
67
+
68
def _run_upgrade(
    args: argparse.Namespace,
    *,
    cwd: Path | None = None,
) -> int:
    """Run `copier update` against the linked template + refresh lock.

    Copier handles the three-way merge against the answer file's
    recorded template-version. We only handle the lock refresh
    afterwards.

    Args:
        args: Parsed CLI namespace; ``args.dry_run`` and ``args.to`` are read.
        cwd: Directory to operate on; defaults to the current working
            directory.

    Returns:
        ``0`` on success (including a dry run); ``1`` when the answers file
        is absent or the Copier update raised `ModuleError`.
    """
    work_dir = cwd if cwd is not None else Path.cwd()
    if not answers_path(work_dir).exists():
        sys.stderr.write(
            "No .agentforge-state/answers.yml; this directory wasn't scaffolded by "
            "`agentforge new`. Nothing to upgrade.\n"
        )
        return 1

    if args.dry_run:
        sys.stdout.write(" → dry-run: not actually running copier update\n")
        return 0

    try:
        _run_copier_update(work_dir, to=args.to)
    except ModuleError as exc:
        sys.stderr.write(f"upgrade failed: {exc}\n")
        return 1

    # Resolved once, ahead of the loop, instead of re-running the import
    # statement for every managed file.
    from agentforge.cli._scaffold_state import _strip_marker_for_hash  # noqa: PLC0415

    # Refresh the lock: re-hash every still-managed file against its
    # new content. Forked entries stay flagged.
    lock = read_lock(work_dir)
    new_lock: dict[str, dict[str, object]] = {}
    for rel, entry in lock.items():
        path = work_dir / rel
        if not path.exists():
            # Files that are gone after the update are dropped from the lock.
            continue
        if entry.get("forked"):
            new_lock[rel] = entry
            continue
        try:
            content = path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            # Unreadable as UTF-8 text: keep the previous entry untouched.
            new_lock[rel] = entry
            continue
        # Strip marker before hashing.
        body = _strip_marker_for_hash(content)
        new_lock[rel] = {**entry, "hash": hash_content(body)}
    write_lock(work_dir, new_lock)
    sys.stdout.write(" → upgrade complete; lock refreshed.\n")
    return 0
121
+
122
+
123
def _run_copier_update(cwd: Path, *, to: str | None) -> None:
    """Call Copier's ``run_update`` on ``cwd``, re-raising any failure as `ModuleError`.

    ``to`` is forwarded as the VCS ref; a falsy value falls back to ``"HEAD"``.
    """
    # Function-scope import, kept outside the ``try`` so an ImportError is
    # not rewrapped as a ModuleError.
    from copier import run_update  # noqa: PLC0415

    target_ref = to or "HEAD"
    update_options = {
        "dst_path": str(cwd),
        "vcs_ref": target_ref,
        "defaults": True,
        "overwrite": True,
        "quiet": False,
    }
    try:
        run_update(**update_options)
    except Exception as exc:
        raise ModuleError(f"copier update failed: {exc}") from exc
136
+
137
+
138
+ # ----------------------------------------------------------------------
139
+ # fork / unfork
140
+ # ----------------------------------------------------------------------
141
+
142
+
143
def _run_fork(args: argparse.Namespace, *, cwd: Path | None = None) -> int:
    """Strip the marker from ``args.path`` and flag its lock entry as forked.

    Returns ``1`` (after writing a message to stderr) when the path has no
    entry in the lock, otherwise ``0``.
    """
    base_dir = Path.cwd() if cwd is None else cwd
    rel = args.path
    entries = read_lock(base_dir)

    # Guard clause: only files already tracked in the lock can be forked.
    if rel not in entries:
        sys.stderr.write(f"{rel} is not in the managed-files lock; nothing to fork.\n")
        return 1

    strip_marker(base_dir / rel)
    forked_entry = dict(entries[rel])
    forked_entry["forked"] = True
    entries[rel] = forked_entry
    write_lock(base_dir, entries)

    sys.stdout.write(f" → forked {rel}. Future upgrades will skip it.\n")
    return 0
156
+
157
+
158
def _run_unfork(args: argparse.Namespace, *, cwd: Path | None = None) -> int:
    """Clear the forked flag on ``args.path`` and re-prepend its marker.

    The template-version content is NOT restored here; that would need
    Copier to run again. The file's current body is kept, the marker is
    written back on top of it, and the lock hash is recomputed. The next
    `agentforge upgrade` will re-render the file.

    A file that cannot be read as UTF-8 text is left untouched on disk and
    keeps its recorded hash; only its forked flag is cleared. Rewriting it
    from an empty fallback would replace its content with just the marker.

    Args:
        args: Parsed CLI namespace; ``args.path`` is the lock key to unfork.
        cwd: Directory to operate on; defaults to the current working
            directory.

    Returns:
        ``0`` on success; ``1`` when the path is not in the lock or is not
        currently forked.
    """
    work_dir = cwd if cwd is not None else Path.cwd()
    rel = args.path
    lock = read_lock(work_dir)
    if rel not in lock:
        sys.stderr.write(f"{rel} is not in the managed-files lock.\n")
        return 1
    entry = lock[rel]
    if not entry.get("forked"):
        sys.stderr.write(f"{rel} is not forked.\n")
        return 1

    updated: dict[str, object] = {**entry, "forked": False}
    target = work_dir / rel

    # ``None`` means "missing or unreadable": nothing is rewritten or re-hashed.
    content: str | None = None
    if target.exists():
        try:
            content = target.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            content = None

    if content is not None:
        from agentforge.cli._scaffold_state import _strip_marker_for_hash  # noqa: PLC0415

        body = _strip_marker_for_hash(content)
        digest = hash_content(body)
        marker = marker_for(
            target.suffix,
            entry.get("source_module", "template:unknown"),
            entry.get("source_version", "0"),
            digest[:12],
        )
        # The file is rewritten only when `marker_for` returns a non-empty marker.
        if marker:
            target.write_text(marker + "\n" + body, encoding="utf-8")
        updated["hash"] = digest

    lock[rel] = updated
    write_lock(work_dir, lock)
    sys.stdout.write(f" → unforked {rel}. Run `agentforge upgrade` to pull template content.\n")
    return 0
195
+
196
+
197
+ # ----------------------------------------------------------------------
198
+ # status
199
+ # ----------------------------------------------------------------------
200
+
201
+
202
def _run_status(args: argparse.Namespace, *, cwd: Path | None = None) -> int:
    """Print every lock entry grouped as managed / forked / drifted / missing.

    Groups are printed in that fixed order and empty groups are omitted.
    Always returns ``0``, including when the lock is empty.
    """
    del args
    base_dir = Path.cwd() if cwd is None else cwd
    lock = read_lock(base_dir)
    if not lock:
        sys.stdout.write("No managed-files lock; this directory wasn't scaffolded.\n")
        return 0

    labels = ("managed", "forked", "drifted", "missing")
    grouped: dict[str, list[str]] = {label: [] for label in labels}
    # Walk paths in sorted order so each group is listed alphabetically.
    for rel in sorted(lock):
        grouped[file_status(base_dir, rel, lock[rel])].append(rel)

    for label in labels:
        members = grouped[label]
        if members:
            sys.stdout.write(f"\n{label.upper()} ({len(members)})\n")
            for rel in members:
                sys.stdout.write(f" {rel}\n")
    return 0
223
+
224
+
225
+ __all__ = ["register_upgrade_cmds"]
226
+
227
+
228
+ # Suppress unused-import warning in module-level imports the file
229
+ # uses transitively.
230
+ _ = yaml
@@ -0,0 +1,45 @@
1
+ """Configuration loader for `agentforge.yaml` — re-export from core.
2
+
3
+ feat-012 moved the canonical schema + loader to `agentforge-core`
4
+ so the resolver can compose module-side Pydantic schemas without
5
+ importing the runtime package. This module stays as a re-export
6
+ for the historical `from agentforge.config import ...` path.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from agentforge_core.config import (
12
+ AgentConfig,
13
+ AgentForgeConfig,
14
+ BudgetConfig,
15
+ EvaluatorEntry,
16
+ GraphModuleConfig,
17
+ LoggingConfig,
18
+ MemoryModuleConfig,
19
+ ModuleEntry,
20
+ ModulesConfig,
21
+ ObservabilityEntry,
22
+ OutputConfig,
23
+ ProviderConfig,
24
+ RetrieverModuleConfig,
25
+ load_config,
26
+ parse_overrides,
27
+ )
28
+
29
+ __all__ = [
30
+ "AgentConfig",
31
+ "AgentForgeConfig",
32
+ "BudgetConfig",
33
+ "EvaluatorEntry",
34
+ "GraphModuleConfig",
35
+ "LoggingConfig",
36
+ "MemoryModuleConfig",
37
+ "ModuleEntry",
38
+ "ModulesConfig",
39
+ "ObservabilityEntry",
40
+ "OutputConfig",
41
+ "ProviderConfig",
42
+ "RetrieverModuleConfig",
43
+ "load_config",
44
+ "parse_overrides",
45
+ ]
@@ -0,0 +1,18 @@
1
+ """Deterministic evaluators shipped in `agentforge` (feat-006).
2
+
3
+ Zero-cost graders that don't call an LLM — safe to run on every
4
+ output. LLM-judge graders ship separately in `agentforge-eval-geval`.
5
+
6
+ Each grader is constructible directly (`Coverage(reference={...})`),
7
+ or addressable by name through the resolver (`"coverage"` etc.) when
8
+ the runtime is asked to look up a grader by string.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from agentforge.eval.consistency import Consistency
14
+ from agentforge.eval.coverage import Coverage
15
+ from agentforge.eval.format_compliance import FormatCompliance
16
+ from agentforge.eval.regression import RegressionVsBaseline
17
+
18
+ __all__ = ["Consistency", "Coverage", "FormatCompliance", "RegressionVsBaseline"]
@@ -0,0 +1,107 @@
1
+ """`Consistency` — deterministic grader for "same input → same output".
2
+
3
+ Re-runs the task N times via a caller-supplied async function and
4
+ scores the agreement of the N outputs against the original output.
5
+ The re-run function is the seam — for unit tests it can be a
6
+ scripted-response function; in production it typically wraps another
7
+ `Agent.run(task)` call.
8
+
9
+ The grader declares `cost_estimate_usd = 0.0` against the evaluator
10
+ budget gate (it doesn't bill itself), but the re-run function calls
11
+ the LLM and bills against the run's `BudgetPolicy` like any other
12
+ agent call. The caller is responsible for ensuring the runner
13
+ respects the same budget if they want a unified cost cap.
14
+
15
+ Score = fraction of re-runs whose output matches the original. The
16
+ match function defaults to strict equality; pass a custom
17
+ `matcher` for fuzzy comparison (cosine similarity, normalised
18
+ string compare, etc.).
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from collections.abc import Awaitable, Callable
24
+ from typing import Any, ClassVar
25
+
26
+ from agentforge_core.contracts.evaluator import EvalResult, Evaluator
27
+
28
+ Runner = Callable[[str], Awaitable[Any]]
29
+ """Async function that re-executes the task and returns the new output."""
30
+
31
+ Matcher = Callable[[Any, Any], bool]
32
+ """Equality check between the original output and one re-run output."""
33
+
34
+
35
class Consistency(Evaluator):
    """Grade how many of N re-runs reproduce the original output.

    The score is ``agreements / n_samples``. The label is ``"pass"`` when
    every re-run matched, ``"warn"`` when only some did, and ``"fail"``
    when none did.
    """

    name: ClassVar[str] = "consistency"
    # The grader itself does not call an LLM; re-runs bill against the
    # outer run's BudgetPolicy via the caller-supplied runner. The
    # evaluator gate treats this as $0 so it isn't skipped even on a
    # tight budget; budget exhaustion will manifest as the runner
    # itself raising BudgetExceeded.
    cost_estimate_usd: ClassVar[float] = 0.0

    def __init__(
        self,
        *,
        runner: Runner,
        n_samples: int = 3,
        matcher: Matcher | None = None,
    ) -> None:
        """Store the re-run function, the sample count and the matcher.

        Raises:
            ValueError: if ``n_samples`` is smaller than 1.
        """
        if n_samples < 1:
            raise ValueError(f"n_samples must be >= 1; got {n_samples}")
        self._runner = runner
        self._n = n_samples
        # Strict equality is used when no matcher is supplied.
        self._matcher: Matcher = _strict_eq if matcher is None else matcher

    async def evaluate(self, finding: Any, context: dict[str, Any]) -> EvalResult:
        """Re-run ``context["task"]`` N times and score agreement with ``finding``.

        A missing or non-string task, or a re-run that raises, short-circuits
        to a zero-score ``"fail"`` result.
        """
        task = context.get("task")
        if not isinstance(task, str):
            return EvalResult(
                evaluator=self.name,
                score=0.0,
                label="fail",
                reasoning="context['task'] missing or not a string",
            )

        # Accept either an object carrying `.output` or the bare output itself.
        original = getattr(finding, "output", finding)
        total = self._n
        rerun_outputs: list[Any] = []
        agreements = 0

        for attempt in range(1, total + 1):
            try:
                replay = await self._runner(task)
            except Exception as exc:
                # Outputs collected before the failure are kept for diagnostics.
                return EvalResult(
                    evaluator=self.name,
                    score=0.0,
                    label="fail",
                    reasoning=f"re-run {attempt}/{total} raised {type(exc).__name__}: {exc}",
                    raw={"rerun_outputs": rerun_outputs},
                )
            rerun_outputs.append(replay)
            if self._matcher(original, replay):
                agreements += 1

        if agreements == total:
            label = "pass"
        elif agreements > 0:
            label = "warn"
        else:
            label = "fail"

        return EvalResult(
            evaluator=self.name,
            score=agreements / total,
            label=label,
            reasoning=f"{agreements}/{total} re-runs matched the original",
            raw={
                "n_samples": total,
                "agreements": agreements,
                "original": original,
                "rerun_outputs": rerun_outputs,
            },
        )
101
+
102
+
103
def _strict_eq(a: Any, b: Any) -> bool:
    """Default matcher: evaluate ``a == b`` and coerce the result to ``bool``."""
    are_equal = a == b
    return bool(are_equal)
105
+
106
+
107
+ __all__ = ["Consistency", "Matcher", "Runner"]
@@ -0,0 +1,100 @@
1
+ """`Coverage` — deterministic grader for "what fraction of expected items did the agent find?"
2
+
3
+ The reference set is supplied at construction; the grader extracts the
4
+ agent's items from `RunResult.output` (string match by default; callable
5
+ override for structural extraction) and computes
6
+ `score = |intersection| / |reference|` clamped to `[0, 1]`.
7
+
8
+ Use for code-review agents (did the agent flag every known issue?),
9
+ RAG agents (did the answer cite every required source?), and any
10
+ task where ground truth is "should mention exactly these things".
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from collections.abc import Callable, Iterable
16
+ from typing import Any, ClassVar
17
+
18
+ from agentforge_core.contracts.evaluator import EvalResult, Evaluator
19
+
20
+
21
class Coverage(Evaluator):
    """Fraction of expected items present in the agent's output.

    Construction:

        Coverage(reference={"sql injection", "xss", "csrf"})

        # Or with a custom extractor (e.g. structured output):
        Coverage(
            reference={"item-1", "item-2"},
            extractor=lambda out: set(out["found"]),
        )

    By default, items are matched against the output by case-
    insensitive substring containment. Pass `extractor` for structural
    matching (e.g. read a list from `output["findings"]`).

    Matching ignores case, so reference items that differ only by case
    are collapsed into a single expected item. A bare string passed as
    `reference` is treated as one item, not as an iterable of characters.
    """

    name: ClassVar[str] = "coverage"
    cost_estimate_usd: ClassVar[float] = 0.0

    def __init__(
        self,
        *,
        reference: Iterable[str],
        extractor: Callable[[str | dict[str, Any]], set[str]] | None = None,
    ) -> None:
        """Normalise the reference set and store the optional extractor.

        Args:
            reference: Expected items; empty strings are dropped.
            extractor: Optional callable that turns the output into the set
                of items the agent produced.

        Raises:
            ValueError: if no non-empty reference item remains.
        """
        # A str is itself iterable; without this guard `reference="xss"`
        # would silently become the character set {"x", "s"}.
        items: Iterable[str] = [reference] if isinstance(reference, str) else reference

        # Collapse case-variants ("XSS" / "xss") so the matched count, the
        # missing list and the score all use the same denominator. Sorting
        # first makes the spelling that is kept deterministic.
        by_key: dict[str, str] = {}
        for item in sorted(r for r in items if r):
            by_key.setdefault(item.lower(), item)
        if not by_key:
            raise ValueError("Coverage requires a non-empty reference set")
        self._reference: frozenset[str] = frozenset(by_key.values())
        self._extractor = extractor

    async def evaluate(self, finding: Any, context: dict[str, Any]) -> EvalResult:
        """Score ``finding`` as the share of reference items it contains.

        The label is ``"pass"`` when nothing is missing, ``"warn"`` when
        some items matched, and ``"fail"`` when none did. ``context`` is
        unused.
        """
        del context
        output = finding.output if hasattr(finding, "output") else finding
        found_normalised, raw_found = self._find_present(output)

        total = len(self._reference)
        matched = sorted(found_normalised)
        missing = sorted(r for r in self._reference if r.lower() not in found_normalised)
        score = (total - len(missing)) / total
        label = "pass" if not missing else "warn" if matched else "fail"

        return EvalResult(
            evaluator=self.name,
            score=score,
            label=label,
            reasoning=(
                f"matched {len(matched)}/{total}; missing={missing}"
                if missing
                else f"matched {len(matched)}/{total}; all present"
            ),
            raw={
                "matched": matched,
                "missing": missing,
                "extracted": sorted(raw_found),
            },
        )

    def _find_present(self, output: Any) -> tuple[set[str], set[str]]:
        """Return `(matched_normalised, raw_extracted)`.

        `matched_normalised` is the subset of `self._reference`, lower-
        cased, that the output contains. `raw_extracted` is whatever the
        extractor produced (for diagnostics in `raw`); without an
        extractor it is a one-element set holding the output text.
        """
        if self._extractor is not None:
            raw_found = self._extractor(output)
            found_lower = {item.lower() for item in raw_found}
            matched = {r.lower() for r in self._reference if r.lower() in found_lower}
            return matched, raw_found

        # Default substring match against a single text blob.
        text = output if isinstance(output, str) else str(output)
        text_lower = text.lower()
        matched = {r.lower() for r in self._reference if r.lower() in text_lower}
        return matched, {text}
98
+
99
+
100
+ __all__ = ["Coverage"]
@@ -0,0 +1,107 @@
1
+ """`FormatCompliance` — deterministic grader for output shape.
2
+
3
+ Three modes pick the constraint applied to `RunResult.output`:
4
+ - `regex=<pattern>` — output (str) must match the pattern.
5
+ - `pydantic_model=<BaseModel subclass>` — output validates against
6
+ the model. Accepts dict output directly or a string that
7
+ parses as JSON.
8
+ - `json_parseable=True` — output (str) must parse as JSON. No
9
+ schema enforcement.
10
+
11
+ Exactly one mode must be set at construction. Future modes (Lark
12
+ grammars, ANTLR, JSON Schema via the `jsonschema` dep) can land in
13
+ follow-ups without breaking the constructor.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import re
20
+ from typing import Any, ClassVar
21
+
22
+ from agentforge_core.contracts.evaluator import EvalResult, Evaluator
23
+ from pydantic import BaseModel, ValidationError
24
+
25
+
26
class FormatCompliance(Evaluator):
    """Binary shape check: score 1.0 with label `"pass"`, or 0.0 with label `"fail"`."""

    name: ClassVar[str] = "format_compliance"
    cost_estimate_usd: ClassVar[float] = 0.0

    def __init__(
        self,
        *,
        regex: str | None = None,
        pydantic_model: type[BaseModel] | None = None,
        json_parseable: bool = False,
    ) -> None:
        """Select exactly one constraint mode.

        Raises:
            ValueError: if zero or more than one mode is requested.
        """
        requested = [regex is not None, pydantic_model is not None, bool(json_parseable)]
        if requested.count(True) != 1:
            raise ValueError(
                "FormatCompliance requires exactly one of regex=, pydantic_model=, json_parseable="
            )
        # The pattern is compiled once here rather than on every evaluation.
        self._regex = None if regex is None else re.compile(regex)
        self._model = pydantic_model
        self._json_parseable = json_parseable

    async def evaluate(self, finding: Any, context: dict[str, Any]) -> EvalResult:
        """Apply the configured mode to the finding's output; ``context`` is unused."""
        del context
        output = getattr(finding, "output", finding)
        # Regex takes precedence, then the model; JSON parsing is the fallback.
        if self._regex is not None:
            return self._check_regex(output)
        if self._model is not None:
            return self._check_model(output)
        return self._check_json(output)

    def _check_regex(self, output: Any) -> EvalResult:
        """Pass when ``output`` is a string that the pattern matches in full."""
        pattern = self._regex
        assert pattern is not None
        if not isinstance(output, str):
            return _fail(f"regex mode requires string output; got {type(output).__name__}")
        if pattern.fullmatch(output) is None:
            return _fail(f"output did not match regex {pattern.pattern!r}")
        return _pass(f"output matched regex {pattern.pattern!r}")

    def _check_model(self, output: Any) -> EvalResult:
        """Pass when a dict, or a JSON string, validates against the model."""
        model = self._model
        assert model is not None
        candidate: Any
        if isinstance(output, str):
            # String output has to decode as JSON before it can be validated.
            try:
                candidate = json.loads(output)
            except json.JSONDecodeError as exc:
                return _fail(f"output is not JSON-parseable: {exc.msg}")
        elif isinstance(output, dict):
            candidate = output
        else:
            return _fail(
                f"pydantic_model mode requires dict or JSON string; got {type(output).__name__}"
            )
        try:
            model.model_validate(candidate)
        except ValidationError as exc:
            return _fail(f"validation failed: {exc.errors(include_url=False)}")
        return _pass(f"output validates against {model.__name__}")

    def _check_json(self, output: Any) -> EvalResult:
        """Pass for a dict, or for a string that parses as JSON."""
        if isinstance(output, dict):
            return _pass("output is already a dict (JSON-compatible)")
        if isinstance(output, str):
            try:
                json.loads(output)
            except json.JSONDecodeError as exc:
                return _fail(f"output is not JSON-parseable: {exc.msg}")
            return _pass("output parses as JSON")
        return _fail(f"json_parseable mode requires str or dict; got {type(output).__name__}")
97
+
98
+
99
def _pass(reasoning: str) -> EvalResult:
    """Build the passing `FormatCompliance` result (score 1.0, label ``"pass"``)."""
    verdict = EvalResult(
        evaluator=FormatCompliance.name,
        score=1.0,
        label="pass",
        reasoning=reasoning,
    )
    return verdict
101
+
102
+
103
def _fail(reasoning: str) -> EvalResult:
    """Build the failing `FormatCompliance` result (score 0.0, label ``"fail"``)."""
    verdict = EvalResult(
        evaluator=FormatCompliance.name,
        score=0.0,
        label="fail",
        reasoning=reasoning,
    )
    return verdict
105
+
106
+
107
+ __all__ = ["FormatCompliance"]