@event4u/agent-config 2.19.0 → 2.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/.agent-src/commands/agent-status.md +29 -0
  2. package/.agent-src/commands/onboard.md +221 -81
  3. package/.agent-src/packs/README.md +49 -0
  4. package/.agent-src/packs/agency-delivery.yml +63 -0
  5. package/.agent-src/packs/content-engine.yml +53 -0
  6. package/.agent-src/packs/founder-mvp.yml +51 -0
  7. package/.agent-src/presets/README.md +26 -0
  8. package/.agent-src/presets/balanced.yml +34 -0
  9. package/.agent-src/presets/fast.yml +31 -0
  10. package/.agent-src/presets/strict.yml +38 -0
  11. package/.agent-src/profiles/README.md +29 -0
  12. package/.agent-src/profiles/agency.yml +27 -0
  13. package/.agent-src/profiles/content_creator.yml +25 -0
  14. package/.agent-src/profiles/developer.yml +26 -0
  15. package/.agent-src/profiles/finance.yml +24 -0
  16. package/.agent-src/profiles/founder.yml +25 -0
  17. package/.agent-src/profiles/ops.yml +25 -0
  18. package/.agent-src/rules/no-cheap-questions.md +25 -17
  19. package/.agent-src/skills/adr-create/SKILL.md +78 -68
  20. package/.agent-src/skills/subagent-orchestration/SKILL.md +33 -0
  21. package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
  22. package/.agent-src/templates/skill-archive-note.md +101 -0
  23. package/.claude-plugin/marketplace.json +1 -1
  24. package/CHANGELOG.md +52 -30
  25. package/README.md +68 -72
  26. package/config/agent-settings.template.yml +22 -0
  27. package/docs/adrs/caveman/0001-default-off-until-bench.md +93 -0
  28. package/docs/adrs/caveman/README.md +9 -0
  29. package/docs/adrs/cost/0001-hard-stop-hook.md +114 -0
  30. package/docs/adrs/cost/README.md +9 -0
  31. package/docs/adrs/memory/0001-consumer-side-snapshot.md +111 -0
  32. package/docs/adrs/memory/README.md +9 -0
  33. package/docs/adrs/router/0001-three-tier-routing.md +119 -0
  34. package/docs/adrs/router/README.md +9 -0
  35. package/docs/adrs/schema/0001-json-schema-frontmatter.md +102 -0
  36. package/docs/adrs/schema/README.md +9 -0
  37. package/docs/adrs/smoke/0001-per-tier-smoke-scripts.md +99 -0
  38. package/docs/adrs/smoke/README.md +9 -0
  39. package/docs/architecture/current-onboard-baseline.md +126 -0
  40. package/docs/architecture/current-safety-behavior.md +137 -0
  41. package/docs/archive/CHANGELOG-pre-2.16.0.md +48 -0
  42. package/docs/contracts/adr-layout.md +108 -0
  43. package/docs/contracts/benchmark-corpus-spec.md +97 -0
  44. package/docs/contracts/benchmark-report-schema.md +111 -0
  45. package/docs/contracts/command-clusters.md +1 -0
  46. package/docs/contracts/command-taxonomy.md +137 -0
  47. package/docs/contracts/compression-default-kill-criterion.md +69 -0
  48. package/docs/contracts/config-presets.md +144 -0
  49. package/docs/contracts/cost-dashboard.md +143 -0
  50. package/docs/contracts/cost-enforcement.md +134 -0
  51. package/docs/contracts/file-ownership-matrix.json +0 -7
  52. package/docs/contracts/mcp-tool-inventory.md +53 -0
  53. package/docs/contracts/measurement-baseline.md +102 -0
  54. package/docs/contracts/namespace.md +125 -0
  55. package/docs/contracts/profile-system.md +142 -0
  56. package/docs/contracts/safety-model.md +129 -0
  57. package/docs/contracts/smoke-contracts.md +144 -0
  58. package/docs/contracts/workflow-packs.md +121 -0
  59. package/docs/decisions/ADR-010-profile-pack-preset-boundary.md +132 -0
  60. package/docs/decisions/INDEX.md +1 -0
  61. package/docs/featured-commands.md +27 -0
  62. package/docs/parity/bench-ruflo.json +58 -0
  63. package/docs/parity/bench.json +41 -0
  64. package/docs/parity/ruflo.md +46 -0
  65. package/docs/profiles.md +91 -0
  66. package/package.json +1 -1
  67. package/scripts/_cli/cmd_explain.py +250 -0
  68. package/scripts/_lib/bench_cost.py +138 -0
  69. package/scripts/_lib/bench_quality.py +118 -0
  70. package/scripts/_lib/bench_report.py +150 -0
  71. package/scripts/agent-config +13 -0
  72. package/scripts/audit_adr_coverage.py +175 -0
  73. package/scripts/audit_mcp_tools.py +146 -0
  74. package/scripts/bench_baseline_ready.py +108 -0
  75. package/scripts/bench_drift_check.py +151 -0
  76. package/scripts/bench_per_tool.py +216 -0
  77. package/scripts/bench_run.py +155 -0
  78. package/scripts/config/__init__.py +9 -0
  79. package/scripts/config/presets.py +206 -0
  80. package/scripts/config/profiles.py +173 -0
  81. package/scripts/cost/budget.mjs +73 -12
  82. package/scripts/cost/preflight.mjs +89 -0
  83. package/scripts/lint_archived_skills.py +143 -0
  84. package/scripts/lint_bench_corpus.py +161 -0
  85. package/scripts/lint_namespace.py +135 -0
  86. package/scripts/skill_overlap.py +204 -0
  87. package/scripts/skill_usage_collect.py +191 -0
  88. package/scripts/skill_usage_report.py +162 -0
  89. package/scripts/smoke/kernel.sh +101 -0
  90. package/scripts/smoke/router.sh +129 -0
  91. package/scripts/smoke/schema.sh +71 -0
  92. package/scripts/smoke/skills.sh +101 -0
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/env python3
2
+ """Drift detector for the bench corpus — step-4 Phase 3 Step 2.
3
+
4
+ Compares the latest `bench/reports/<stamp>-<corpus>.json` against the
5
+ previous N reports (default 5) for the same corpus. Drift defined as:
6
+
7
+ - selection-accuracy: latest is more than `accuracy_drop_pp` below
8
+ the rolling mean (default 5 pp)
9
+ - cost: latest USD total is more than `cost_increase_pct` above the
10
+ rolling mean (default 20 %); skipped when source != "captured"
11
+ - quality: latest quality_score is more than `quality_drop_pp`
12
+ below the rolling mean (default 10 pp); skipped when source ==
13
+ "not_collected"
14
+
15
+ Exit codes:
16
+ 0 — no drift detected (or no baseline yet — warn-only)
17
+ 1 — argument / read error
18
+ 2 — drift detected (CI surface; not a merge gate per roadmap)
19
+
20
+ CLI:
21
+ python3 scripts/bench_drift_check.py --corpus dev
22
+ python3 scripts/bench_drift_check.py --corpus dev --window 5 --json
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import json
28
+ import sys
29
+ from pathlib import Path
30
+ from typing import Any
31
+
32
+ REPO_ROOT = Path(__file__).resolve().parent.parent
33
+ sys.path.insert(0, str(REPO_ROOT / "scripts"))
34
+
35
+ from _lib import script_output # type: ignore[import-not-found] # noqa: E402
36
+
37
+
38
+ def _load_reports(reports_dir: Path, corpus: str) -> list[tuple[Path, dict[str, Any]]]:
39
+ out: list[tuple[Path, dict[str, Any]]] = []
40
+ for p in sorted(reports_dir.glob(f"*-{corpus}.json")):
41
+ try:
42
+ out.append((p, json.loads(p.read_text(encoding="utf-8"))))
43
+ except (OSError, json.JSONDecodeError) as exc:
44
+ script_output.warn(f" ⚠️ skip unreadable report {p.name}: {exc}")
45
+ return out
46
+
47
+
48
+ def _mean(values: list[float]) -> float:
49
+ return sum(values) / len(values) if values else 0.0
50
+
51
+
52
+ def _check(latest: dict[str, Any], baseline: list[dict[str, Any]],
53
+ thresholds: dict[str, float]) -> list[dict[str, Any]]:
54
+ findings: list[dict[str, Any]] = []
55
+
56
+ sel_latest = float(latest["selection"]["selection_accuracy"])
57
+ sel_baseline = _mean([float(r["selection"]["selection_accuracy"]) for r in baseline])
58
+ sel_drop_pp = (sel_baseline - sel_latest) * 100.0
59
+ if sel_drop_pp > thresholds["accuracy_drop_pp"]:
60
+ findings.append({
61
+ "axis": "selection_accuracy",
62
+ "latest": sel_latest, "baseline_mean": sel_baseline,
63
+ "delta_pp": -sel_drop_pp, "threshold_pp": -thresholds["accuracy_drop_pp"],
64
+ })
65
+
66
+ captured = [r for r in baseline + [latest] if r["cost"].get("source") == "captured"]
67
+ if len(captured) >= 2 and latest["cost"].get("source") == "captured":
68
+ cost_latest = float(latest["cost"]["totals"]["cost_usd"])
69
+ baseline_costs = [float(r["cost"]["totals"]["cost_usd"])
70
+ for r in baseline if r["cost"].get("source") == "captured"]
71
+ if baseline_costs:
72
+ cost_baseline = _mean(baseline_costs)
73
+ if cost_baseline > 0:
74
+ pct = (cost_latest - cost_baseline) / cost_baseline * 100.0
75
+ if pct > thresholds["cost_increase_pct"]:
76
+ findings.append({
77
+ "axis": "cost_usd",
78
+ "latest": cost_latest, "baseline_mean": cost_baseline,
79
+ "delta_pct": pct, "threshold_pct": thresholds["cost_increase_pct"],
80
+ })
81
+
82
+ if latest["quality"].get("source") != "not_collected":
83
+ q_latest = float(latest["quality"]["quality_score"])
84
+ q_baseline = _mean([float(r["quality"]["quality_score"])
85
+ for r in baseline
86
+ if r["quality"].get("source") != "not_collected"])
87
+ if q_baseline:
88
+ q_drop_pp = (q_baseline - q_latest) * 100.0
89
+ if q_drop_pp > thresholds["quality_drop_pp"]:
90
+ findings.append({
91
+ "axis": "quality_score",
92
+ "latest": q_latest, "baseline_mean": q_baseline,
93
+ "delta_pp": -q_drop_pp, "threshold_pp": -thresholds["quality_drop_pp"],
94
+ })
95
+
96
+ return findings
97
+
98
+
99
def main(argv: list[str] | None = None) -> int:
    """Run the drift check from the command line.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 when no drift was found (or fewer than two reports exist),
        1 for an invalid ``--window`` or a report missing expected fields,
        2 when at least one axis drifted.
    """
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--corpus", default="dev")
    ap.add_argument("--reports-dir", default="bench/reports")
    ap.add_argument("--window", type=int, default=5, help="rolling window size (default 5)")
    ap.add_argument("--accuracy-drop-pp", type=float, default=5.0)
    ap.add_argument("--cost-increase-pct", type=float, default=20.0)
    ap.add_argument("--quality-drop-pp", type=float, default=10.0)
    ap.add_argument("--json", action="store_true", help="emit JSON instead of Markdown")
    args = ap.parse_args(argv)

    # A window of 0 yields an empty baseline (always "ok"); a negative one
    # makes the slice below select the wrong reports. Reject both, using
    # the exit code this script documents for argument errors.
    if args.window < 1:
        sys.stderr.write(f"error: --window must be >= 1 (got {args.window})\n")
        return 1

    reports = _load_reports(REPO_ROOT / args.reports_dir, args.corpus)
    if len(reports) < 2:
        msg = (f" ℹ️ bench-drift · corpus={args.corpus} · "
               f"{len(reports)} report(s) — need ≥ 2 to compare; no drift gate yet.")
        if args.json:
            print(json.dumps({"status": "warmup", "reports": len(reports)}))
        else:
            print(msg)
        return 0

    latest_path, latest = reports[-1]
    # Up to `window` reports immediately preceding the latest one.
    baseline = [r for _, r in reports[-(args.window + 1):-1]]
    thresholds = {
        "accuracy_drop_pp": args.accuracy_drop_pp,
        "cost_increase_pct": args.cost_increase_pct,
        "quality_drop_pp": args.quality_drop_pp,
    }
    try:
        findings = _check(latest, baseline, thresholds)
    except (KeyError, TypeError, ValueError, AttributeError) as exc:
        # A report that parses as JSON but lacks the expected sections is a
        # read error, not drift: report it and use exit code 1.
        sys.stderr.write(
            f"error: malformed report in window ending at {latest_path.name}: {exc!r}\n"
        )
        return 1

    payload = {
        "status": "drift" if findings else "ok",
        "corpus": args.corpus,
        "latest_report": latest_path.name,
        "baseline_window": len(baseline),
        "thresholds": thresholds,
        "findings": findings,
    }
    if args.json:
        print(json.dumps(payload, indent=2))
    else:
        emoji = "⚠️" if findings else "✅"
        print(f" {emoji} bench-drift · corpus={args.corpus} · "
              f"latest={latest_path.name} · window={len(baseline)} · "
              f"findings={len(findings)}")
        for finding in findings:
            print(f" · {finding['axis']}: latest={finding['latest']:.4f} "
                  f"baseline_mean={finding['baseline_mean']:.4f}")
    return 2 if findings else 0
148
+
149
+
150
+ if __name__ == "__main__":
151
+ sys.exit(main())
@@ -0,0 +1,216 @@
1
+ #!/usr/bin/env python3
2
+ """Per-tool projection-fidelity bench — step-4 Phase 4.
3
+
4
+ Re-runs the keyword-overlap selection scorer against each projected
5
+ tool surface and computes:
6
+
7
+ fidelity(tool) = selection_accuracy(tool) / selection_accuracy(reference)
8
+
9
+ Reference = Augment projection (most complete per roadmap). Threshold
10
+ for "fit for purpose" is >= 0.85.
11
+
12
+ Surfaces:
13
+ - `.augment/skills/` skill projection automated (reference)
14
+ - `.claude/skills/` skill projection automated
15
+ - `.cursor/rules/` rules-only not_applicable (no skill projection)
16
+ - `.windsurfrules` single concatenated not_applicable
17
+ - `.clinerules/` rules-only not_applicable
18
+
19
+ Usage:
20
+ python3 scripts/bench_per_tool.py --corpus dev
21
+ python3 scripts/bench_per_tool.py --corpus dev --json
22
+ python3 scripts/bench_per_tool.py --corpus dev --threshold 0.85
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import datetime as dt
29
+ import json
30
+ import re
31
+ import sys
32
+ from pathlib import Path
33
+
34
+ try:
35
+ import yaml
36
+ except ImportError:
37
+ sys.stderr.write("error: PyYAML required (pip install pyyaml)\n")
38
+ sys.exit(2)
39
+
40
+ # Reuse tokenization + ranking from the reference runner so the only
41
+ # axis that changes between tools is the skill catalogue on disk.
42
+ from bench_runner import rank_skills # type: ignore # noqa: E402
43
+
44
+ REPO_ROOT = Path(__file__).resolve().parent.parent
45
+ CORPUS_DIR = REPO_ROOT / "tests" / "eval"
46
+ REPORTS_DIR = REPO_ROOT / "bench" / "reports"
47
+
48
+ # tool_id -> (skills_root, kind). kind = "skills" | "rules_only" | "single_file".
49
+ SURFACES: dict[str, tuple[Path, str]] = {
50
+ "augment": (REPO_ROOT / ".augment" / "skills", "skills"),
51
+ "claude": (REPO_ROOT / ".claude" / "skills", "skills"),
52
+ "cursor": (REPO_ROOT / ".cursor" / "rules", "rules_only"),
53
+ "cline": (REPO_ROOT / ".clinerules", "rules_only"),
54
+ "windsurf":(REPO_ROOT / ".windsurfrules", "single_file"),
55
+ }
56
+
57
+ REFERENCE_TOOL = "augment"
58
+
59
+
60
def load_descriptions(root: Path) -> dict[str, str]:
    """Return ``{skill_name: 'name + description'}`` for SKILL.md files under *root*.

    Each immediate child of *root* is checked for a ``SKILL.md`` whose text
    starts with a ``---`` delimited YAML front-matter block. Skills are
    skipped (never fatal) when the file is missing, unreadable, has no
    front matter, has front matter that is not valid YAML or not a
    mapping, or has an empty ``description``.

    Args:
        root: Directory holding one sub-directory per skill.

    Returns:
        Mapping from the front-matter ``name`` (falling back to the
        directory name) to ``"<name> <description>"``. Empty when *root*
        is not a directory.
    """
    out: dict[str, str] = {}
    if not root.is_dir():
        return out
    for skill_dir in sorted(root.iterdir()):
        skill_md = skill_dir / "SKILL.md"
        if not skill_md.is_file():
            continue
        try:
            text = skill_md.read_text(encoding="utf-8")
        except (OSError, ValueError):
            # ValueError covers UnicodeDecodeError; one unreadable skill
            # must not abort scoring of the whole surface.
            continue
        m = re.search(r"^---\s*\n(.*?)\n---\s*\n", text, re.DOTALL)
        if not m:
            continue
        try:
            fm = yaml.safe_load(m.group(1)) or {}
        except yaml.YAMLError:
            continue
        if not isinstance(fm, dict):
            # Front matter such as a bare string or list has no fields.
            continue
        desc = fm.get("description") or ""
        # Coerce so a numeric `name:` cannot produce a non-string key.
        name = str(fm.get("name") or skill_dir.name)
        if desc:
            out[name] = f"{name} {desc}"
    return out
82
+
83
+
84
def score_corpus(skills: dict[str, str], prompts: list[dict], top_k: int) -> dict:
    """Score how often the ranker surfaces an expected skill for each prompt.

    A prompt counts as a hit when at least one of its ``expected_skills``
    appears in the ``rank_skills`` result for that prompt.

    Args:
        skills: Skill catalogue passed through to ``rank_skills``.
        prompts: Corpus entries; each needs ``id`` and ``prompt`` and may
            carry ``expected_skills``.
        top_k: Passed through to ``rank_skills``.

    Returns:
        Totals, the hit ratio rounded to four decimals (0.0 for an empty
        corpus), the catalogue size and a per-prompt breakdown.
    """
    per_prompt = []
    for entry in prompts:
        ranked = rank_skills(entry["prompt"], skills, top_k)
        wanted = set(entry.get("expected_skills", []))
        per_prompt.append({
            "id": entry["id"],
            "expected": sorted(wanted),
            "ranked": ranked,
            "hit": bool(wanted & set(ranked)),
        })

    total = len(prompts)
    hit_count = sum(1 for row in per_prompt if row["hit"])
    accuracy = round(hit_count / total, 4) if total else 0.0
    return {
        "prompts_total": total,
        "prompts_hit": hit_count,
        "selection_accuracy": accuracy,
        "skill_count": len(skills),
        "per_prompt": per_prompt,
    }
103
+
104
+
105
def evaluate(corpus_path: Path, top_k: int, threshold: float) -> dict:
    """Score every surface in ``SURFACES`` and derive fidelity against the reference.

    Surfaces whose kind is not ``"skills"`` are recorded as
    ``not_applicable``; skill surfaces with no loadable skills are recorded
    as ``error``. For the remaining surfaces, fidelity is the surface's
    selection accuracy divided by the reference tool's accuracy (0.0 when
    the reference accuracy is zero or the reference was not scored).

    Args:
        corpus_path: YAML corpus file providing ``prompts`` and ``corpus_id``.
        top_k: Ranking depth handed to ``score_corpus``.
        threshold: Minimum fidelity for a surface to pass.

    Returns:
        The ``projection-fidelity-v1`` summary dict, including per-tool
        results and the non-reference tools that fell below *threshold*.
    """
    corpus = yaml.safe_load(corpus_path.read_text(encoding="utf-8"))
    prompts = corpus["prompts"]

    results: dict[str, dict] = {}
    for tool, (root, kind) in SURFACES.items():
        rel_path = str(root.relative_to(REPO_ROOT))
        if kind != "skills":
            results[tool] = {
                "status": "not_applicable",
                "reason": f"surface is {kind}; no SKILL.md projection",
                "path": rel_path,
            }
            continue
        catalogue = load_descriptions(root)
        if not catalogue:
            results[tool] = {"status": "error", "reason": "no skills found",
                             "path": rel_path}
            continue
        entry = score_corpus(catalogue, prompts, top_k)
        entry["status"] = "ok"
        entry["path"] = rel_path
        results[tool] = entry

    reference = results.get(REFERENCE_TOOL, {})
    if reference.get("status") == "ok":
        ref_acc = reference.get("selection_accuracy", 0.0)
    else:
        ref_acc = 0.0

    scored = [(tool, r) for tool, r in results.items() if r.get("status") == "ok"]
    for _, r in scored:
        fidelity = (r["selection_accuracy"] / ref_acc) if ref_acc else 0.0
        r["fidelity"] = round(fidelity, 4)
        # The pass decision uses the unrounded ratio.
        r["passed_threshold"] = fidelity >= threshold
    below = [tool for tool, r in scored
             if tool != REFERENCE_TOOL and not r["passed_threshold"]]

    return {
        "schema": "projection-fidelity-v1",
        "generated_at": dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "corpus_id": corpus["corpus_id"],
        "top_k": top_k,
        "threshold": threshold,
        "reference_tool": REFERENCE_TOOL,
        "reference_accuracy": ref_acc,
        "tools": results,
        "below_threshold": below,
    }
151
+
152
+
153
def render_markdown(summary: dict) -> str:
    """Render a projection-fidelity *summary* as a Markdown report.

    The report has a title, a metadata line, one table row per tool and a
    closing line that either lists the tools below threshold or states
    that all projections passed. Tools whose status is not ``"ok"`` get a
    row of dashes. The returned text ends with a newline.
    """
    header = [
        f"# Projection fidelity — {summary['corpus_id']}",
        "",
        f"_Generated {summary['generated_at']} · top-K={summary['top_k']} · "
        f"threshold={summary['threshold']:.2f} · reference=`{summary['reference_tool']}`_",
        "",
        "| tool | status | skills | accuracy | fidelity | pass |",
        "|---|---|---:|---:|---:|---|",
    ]

    rows = []
    for tool, r in summary["tools"].items():
        status = r.get("status", "?")
        if status == "ok":
            mark = "✅" if r["passed_threshold"] else "❌"
            rows.append(
                f"| `{tool}` | ok | {r['skill_count']} | "
                f"{r['selection_accuracy']:.2%} | {r['fidelity']:.2f} | "
                f"{mark} |"
            )
        else:
            rows.append(f"| `{tool}` | {status} | — | — | — | — |")

    laggards = summary["below_threshold"]
    if laggards:
        footer = (f"**Below threshold:** {', '.join(laggards)} "
                  f"→ inspect `scripts/_lib/generate_tools.py` projection mapping.")
    else:
        footer = "**All projections fit-for-purpose** (≥ threshold)."

    return "\n".join([*header, *rows, "", footer]) + "\n"
179
+
180
+
181
def main(argv=None) -> int:
    """Run the per-tool projection-fidelity bench from the command line.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        2 when the corpus file does not exist, 1 when at least one
        non-reference tool is below the fidelity threshold, otherwise 0.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--corpus", default="dev")
    ap.add_argument("--top-k", type=int, default=3)
    ap.add_argument("--threshold", type=float, default=0.85)
    ap.add_argument("--json", action="store_true")
    ap.add_argument("--write-report", action="store_true",
                    help="emit bench/reports/<ts>-<corpus>-projection.{json,md}")
    args = ap.parse_args(argv)

    corpus_path = CORPUS_DIR / f"corpus-{args.corpus}.yaml"
    if not corpus_path.is_file():
        sys.stderr.write(f"error: corpus not found: {corpus_path}\n")
        return 2

    summary = evaluate(corpus_path, args.top_k, args.threshold)

    if args.write_report:
        REPORTS_DIR.mkdir(parents=True, exist_ok=True)
        # Colons are replaced so the timestamp is usable in a file name.
        stamp = summary["generated_at"].replace(":", "-")
        base = REPORTS_DIR / f"{stamp}-{args.corpus}-projection"
        # Append the extension instead of using with_suffix(): a corpus id
        # containing a dot would otherwise have its tail replaced.
        json_path = base.with_name(base.name + ".json")
        md_path = base.with_name(base.name + ".md")
        # The Markdown contains non-ASCII characters; write UTF-8 explicitly
        # so the result does not depend on the platform's locale encoding.
        json_path.write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8")
        md_path.write_text(render_markdown(summary), encoding="utf-8")
        sys.stderr.write(f"wrote {base}.json + {base}.md\n")

    if args.json:
        print(json.dumps(summary, indent=2))
    else:
        print(render_markdown(summary))

    return 1 if summary["below_threshold"] else 0
212
+
213
+
214
+ if __name__ == "__main__":
215
+ sys.exit(main())
216
+
@@ -0,0 +1,155 @@
1
+ #!/usr/bin/env python3
2
+ """Bench orchestrator — step-4 measurement-and-benchmark Phase 2.
3
+
4
+ Wraps the selection-accuracy baseline collector (`scripts/bench_runner.py`),
5
+ captures token / cost data from `agents/cost-tracking/sessions.jsonl` if
6
+ present (per ruflo pattern, external-findings § 2), runs structural
7
+ quality assertions per prompt, and emits a versioned JSON + Markdown
8
+ report under `bench/reports/` per
9
+ `docs/contracts/benchmark-report-schema.md`.
10
+
11
+ Usage:
12
+ python3 scripts/bench_run.py --corpus dev
13
+ python3 scripts/bench_run.py --corpus dev --top-k 3 --quiet
14
+ python3 scripts/bench_run.py --corpus dev --agent-output outputs.json
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import argparse
19
+ import sys
20
+ from pathlib import Path
21
+
22
+ REPO_ROOT = Path(__file__).resolve().parent.parent
23
+ sys.path.insert(0, str(REPO_ROOT / "scripts"))
24
+
25
+ from _lib import script_output # type: ignore[import-not-found] # noqa: E402
26
+ from _lib.bench_cost import aggregate_sessions # noqa: E402
27
+ from _lib.bench_quality import score_corpus # noqa: E402
28
+ from _lib.bench_report import ( # noqa: E402
29
+ report_paths,
30
+ render_markdown,
31
+ utc_now_filename_stamp,
32
+ utc_now_iso,
33
+ write_json,
34
+ write_markdown,
35
+ )
36
+ from bench_runner import run_corpus # noqa: E402
37
+
38
+ try:
39
+ import yaml
40
+ except ImportError:
41
+ script_output.error("error: PyYAML required (pip install pyyaml)")
42
+ sys.exit(2)
43
+
44
+ BENCH_RUN_VERSION = "0.1.0"
45
+ PRICING_PATH = REPO_ROOT / "bench" / "pricing.yaml"
46
+ SESSIONS_JSONL = REPO_ROOT / "agents" / "cost-tracking" / "sessions.jsonl"
47
+ REPORTS_DIR = REPO_ROOT / "bench" / "reports"
48
+ CORPUS_DIR = REPO_ROOT / "tests" / "eval"
49
+ BASELINE_COLLECTOR = REPO_ROOT / "scripts" / "bench_runner.py"
50
+
51
+
52
def _baseline_sha_or_mtime() -> str:
    """Return an identifier for the baseline collector script on disk.

    Yields ``"mtime:<seconds>"`` from the file's modification time
    (truncated to an integer), or ``"unavailable"`` when the file cannot
    be stat'ed.
    """
    try:
        stat_result = BASELINE_COLLECTOR.stat()
    except OSError:
        return "unavailable"
    return f"mtime:{int(stat_result.st_mtime)}"
57
+
58
+
59
+ def _verdict(selection: dict, quality: dict) -> dict[str, str]:
60
+ sel = "pass" if selection["passed"] else "fail"
61
+ if quality["source"] == "not_collected":
62
+ qual = "not_collected"
63
+ overall = "partial"
64
+ else:
65
+ qual = "pass" if quality["quality_score"] >= 0.60 else "fail"
66
+ overall = "pass" if (sel == "pass" and qual == "pass") else "fail"
67
+ return {"selection": sel, "quality": qual, "overall": overall}
68
+
69
+
70
def build_report(
    corpus_path: Path,
    top_k: int,
    agent_output: Path | None,
) -> dict:
    """Assemble the full bench report dict for one corpus.

    Runs the selection collector, aggregates session cost data, scores
    quality for the corpus prompts and combines them with corpus / runner
    metadata and a verdict.

    Args:
        corpus_path: Corpus YAML file; must live under ``REPO_ROOT``.
        top_k: Ranking depth passed to the selection collector.
        agent_output: Optional path handed to the quality scorer.

    Returns:
        The report dict with ``schema_version`` 1.
    """
    selection = run_corpus(corpus_path, top_k)

    corpus_yaml = yaml.safe_load(corpus_path.read_text(encoding="utf-8"))
    prompts = corpus_yaml.get("prompts", [])

    cost = aggregate_sessions(SESSIONS_JSONL, PRICING_PATH)
    quality = score_corpus(prompts, agent_output)
    verdict = _verdict(selection, quality)

    generated_at = utc_now_iso()
    corpus_meta = {
        "id": selection["corpus_id"],
        "path": str(corpus_path.relative_to(REPO_ROOT)),
        "prompt_count": len(prompts),
    }
    runner_meta = {
        "bench_run_version": BENCH_RUN_VERSION,
        "baseline_collector": str(BASELINE_COLLECTOR.relative_to(REPO_ROOT)),
        "baseline_collector_sha": _baseline_sha_or_mtime(),
    }

    report = {"schema_version": 1, "generated_at": generated_at}
    report["corpus"] = corpus_meta
    report["runner"] = runner_meta
    report["selection"] = selection
    report["cost"] = cost
    report["quality"] = quality
    report["verdict"] = verdict
    return report
99
+
100
+
101
def main(argv: list[str] | None = None) -> int:
    """Build a bench report for one corpus, write it, and print a summary.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        2 when the corpus file does not exist, 0 when the overall verdict
        is ``"pass"`` or ``"partial"``, otherwise 1.
    """
    # __doc__ is None when docstrings are stripped (python -OO).
    summary_line = (__doc__ or "Bench orchestrator").splitlines()[0]
    ap = argparse.ArgumentParser(description=summary_line)
    ap.add_argument("--corpus", default="dev", help="corpus id (default: dev)")
    ap.add_argument("--top-k", type=int, default=3)
    ap.add_argument("--agent-output", type=Path, default=None,
                    help="Path to JSON {id: output_text} for quality scoring (Phase 3)")
    ap.add_argument("--quiet", action="store_true",
                    help="Print only the report path + headline")
    ap.add_argument("--stamp", default=None,
                    help="Override timestamp (test hook); defaults to UTC now")
    ap.add_argument("--no-write", action="store_true",
                    help="Compute the report but do not write files (dry run)")
    args = ap.parse_args(argv)

    corpus_path = CORPUS_DIR / f"corpus-{args.corpus}.yaml"
    if not corpus_path.is_file():
        script_output.error(f"error: corpus not found: {corpus_path}")
        return 2

    report = build_report(corpus_path, args.top_k, args.agent_output)
    stamp = args.stamp or utc_now_filename_stamp()
    json_path, md_path = report_paths(REPORTS_DIR, report["corpus"]["id"], stamp)

    if not args.no_write:
        write_json(json_path, report)
        write_markdown(md_path, report)

    verdict = report["verdict"]

    if args.quiet:
        sel = report["selection"]
        qual = report["quality"]
        cost = report["cost"]
        # NOTE(review): scripts/bench_drift_check.py reads the cost total as
        # `cost_usd`, whereas this headline used `total_cost_usd`. Both are
        # accepted until the key emitted by _lib.bench_cost is confirmed.
        totals = cost.get("totals") or {}
        total_usd = totals.get("total_cost_usd", totals.get("cost_usd", 0.0))
        # Presumably the score can be absent or None when quality was not
        # collected; show 0 rather than fail while formatting a summary.
        quality_score = qual.get("quality_score")
        if quality_score is None:
            quality_score = 0.0
        headline = (
            f"bench {report['corpus']['id']} · "
            f"selection {sel['selection_accuracy']:.2%} ({verdict['selection']}) · "
            f"cost ${total_usd:.6f} ({cost.get('source', 'n/a')}) · "
            f"quality {quality_score:.2%} ({verdict['quality']}) · "
            f"overall {verdict['overall']}"
        )
        print(headline)
        if not args.no_write:
            print(f"report: {md_path.relative_to(REPO_ROOT)}")
    else:
        print(render_markdown(report))
        if not args.no_write:
            print(f"\n→ json: {json_path.relative_to(REPO_ROOT)}")
            print(f"→ markdown: {md_path.relative_to(REPO_ROOT)}")

    # Exit zero on overall pass OR partial (partial = quality_not_collected by design).
    return 0 if verdict["overall"] in ("pass", "partial") else 1
152
+
153
+
154
+ if __name__ == "__main__":
155
+ sys.exit(main(sys.argv[1:]))
@@ -0,0 +1,9 @@
1
+ """Config-layer loaders (profile, preset, pack).
2
+
3
+ Phase 1 of step-15 product refinement. Single home for the audience /
4
+ governance / workflow axes introduced by
5
+ :mod:`docs.contracts.profile-system`,
6
+ :mod:`docs.contracts.config-presets`, and the upcoming workflow-packs
7
+ contract. Loaders here are pure, read-only, lazy-PyYAML; they layer on
8
+ top of :mod:`scripts._lib.agent_settings` for project-root anchoring.
9
+ """