@event4u/agent-config 2.19.0 → 2.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/commands/agent-status.md +29 -0
- package/.agent-src/commands/onboard.md +221 -81
- package/.agent-src/packs/README.md +49 -0
- package/.agent-src/packs/agency-delivery.yml +63 -0
- package/.agent-src/packs/content-engine.yml +53 -0
- package/.agent-src/packs/founder-mvp.yml +51 -0
- package/.agent-src/presets/README.md +26 -0
- package/.agent-src/presets/balanced.yml +34 -0
- package/.agent-src/presets/fast.yml +31 -0
- package/.agent-src/presets/strict.yml +38 -0
- package/.agent-src/profiles/README.md +29 -0
- package/.agent-src/profiles/agency.yml +27 -0
- package/.agent-src/profiles/content_creator.yml +25 -0
- package/.agent-src/profiles/developer.yml +26 -0
- package/.agent-src/profiles/finance.yml +24 -0
- package/.agent-src/profiles/founder.yml +25 -0
- package/.agent-src/profiles/ops.yml +25 -0
- package/.agent-src/rules/no-cheap-questions.md +25 -17
- package/.agent-src/skills/adr-create/SKILL.md +78 -68
- package/.agent-src/skills/subagent-orchestration/SKILL.md +33 -0
- package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
- package/.agent-src/templates/skill-archive-note.md +101 -0
- package/.claude-plugin/marketplace.json +1 -1
- package/CHANGELOG.md +52 -30
- package/README.md +68 -72
- package/config/agent-settings.template.yml +22 -0
- package/docs/adrs/caveman/0001-default-off-until-bench.md +93 -0
- package/docs/adrs/caveman/README.md +9 -0
- package/docs/adrs/cost/0001-hard-stop-hook.md +114 -0
- package/docs/adrs/cost/README.md +9 -0
- package/docs/adrs/memory/0001-consumer-side-snapshot.md +111 -0
- package/docs/adrs/memory/README.md +9 -0
- package/docs/adrs/router/0001-three-tier-routing.md +119 -0
- package/docs/adrs/router/README.md +9 -0
- package/docs/adrs/schema/0001-json-schema-frontmatter.md +102 -0
- package/docs/adrs/schema/README.md +9 -0
- package/docs/adrs/smoke/0001-per-tier-smoke-scripts.md +99 -0
- package/docs/adrs/smoke/README.md +9 -0
- package/docs/architecture/current-onboard-baseline.md +126 -0
- package/docs/architecture/current-safety-behavior.md +137 -0
- package/docs/archive/CHANGELOG-pre-2.16.0.md +48 -0
- package/docs/contracts/adr-layout.md +108 -0
- package/docs/contracts/benchmark-corpus-spec.md +97 -0
- package/docs/contracts/benchmark-report-schema.md +111 -0
- package/docs/contracts/command-clusters.md +1 -0
- package/docs/contracts/command-taxonomy.md +137 -0
- package/docs/contracts/compression-default-kill-criterion.md +69 -0
- package/docs/contracts/config-presets.md +144 -0
- package/docs/contracts/cost-dashboard.md +143 -0
- package/docs/contracts/cost-enforcement.md +134 -0
- package/docs/contracts/file-ownership-matrix.json +0 -7
- package/docs/contracts/mcp-tool-inventory.md +53 -0
- package/docs/contracts/measurement-baseline.md +102 -0
- package/docs/contracts/namespace.md +125 -0
- package/docs/contracts/profile-system.md +142 -0
- package/docs/contracts/safety-model.md +129 -0
- package/docs/contracts/smoke-contracts.md +144 -0
- package/docs/contracts/workflow-packs.md +121 -0
- package/docs/decisions/ADR-010-profile-pack-preset-boundary.md +132 -0
- package/docs/decisions/INDEX.md +1 -0
- package/docs/featured-commands.md +27 -0
- package/docs/parity/bench-ruflo.json +58 -0
- package/docs/parity/bench.json +41 -0
- package/docs/parity/ruflo.md +46 -0
- package/docs/profiles.md +91 -0
- package/package.json +1 -1
- package/scripts/_cli/cmd_explain.py +250 -0
- package/scripts/_lib/bench_cost.py +138 -0
- package/scripts/_lib/bench_quality.py +118 -0
- package/scripts/_lib/bench_report.py +150 -0
- package/scripts/agent-config +13 -0
- package/scripts/audit_adr_coverage.py +175 -0
- package/scripts/audit_mcp_tools.py +146 -0
- package/scripts/bench_baseline_ready.py +108 -0
- package/scripts/bench_drift_check.py +151 -0
- package/scripts/bench_per_tool.py +216 -0
- package/scripts/bench_run.py +155 -0
- package/scripts/config/__init__.py +9 -0
- package/scripts/config/presets.py +206 -0
- package/scripts/config/profiles.py +173 -0
- package/scripts/cost/budget.mjs +73 -12
- package/scripts/cost/preflight.mjs +89 -0
- package/scripts/lint_archived_skills.py +143 -0
- package/scripts/lint_bench_corpus.py +161 -0
- package/scripts/lint_namespace.py +135 -0
- package/scripts/skill_overlap.py +204 -0
- package/scripts/skill_usage_collect.py +191 -0
- package/scripts/skill_usage_report.py +162 -0
- package/scripts/smoke/kernel.sh +101 -0
- package/scripts/smoke/router.sh +129 -0
- package/scripts/smoke/schema.sh +71 -0
- package/scripts/smoke/skills.sh +101 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Drift detector for the bench corpus — step-4 Phase 3 Step 2.
|
|
3
|
+
|
|
4
|
+
Compares the latest `bench/reports/<stamp>-<corpus>.json` against the
|
|
5
|
+
previous N reports (default 5) for the same corpus. Drift defined as:
|
|
6
|
+
|
|
7
|
+
- selection-accuracy: latest is more than `accuracy_drop_pp` below
|
|
8
|
+
the rolling mean (default 5 pp)
|
|
9
|
+
- cost: latest USD total is more than `cost_increase_pct` above the
|
|
10
|
+
rolling mean (default 20 %); skipped when source != "captured"
|
|
11
|
+
- quality: latest quality_score is more than `quality_drop_pp`
|
|
12
|
+
below the rolling mean (default 10 pp); skipped when source ==
|
|
13
|
+
"not_collected"
|
|
14
|
+
|
|
15
|
+
Exit codes:
|
|
16
|
+
0 — no drift detected (or no baseline yet — warn-only)
|
|
17
|
+
1 — argument / read error
|
|
18
|
+
2 — drift detected (CI surface; not a merge gate per roadmap)
|
|
19
|
+
|
|
20
|
+
CLI:
|
|
21
|
+
python3 scripts/bench_drift_check.py --corpus dev
|
|
22
|
+
python3 scripts/bench_drift_check.py --corpus dev --window 5 --json
|
|
23
|
+
"""
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import argparse
|
|
27
|
+
import json
|
|
28
|
+
import sys
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import Any
|
|
31
|
+
|
|
32
|
+
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
33
|
+
sys.path.insert(0, str(REPO_ROOT / "scripts"))
|
|
34
|
+
|
|
35
|
+
from _lib import script_output # type: ignore[import-not-found] # noqa: E402
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _load_reports(reports_dir: Path, corpus: str) -> list[tuple[Path, dict[str, Any]]]:
|
|
39
|
+
out: list[tuple[Path, dict[str, Any]]] = []
|
|
40
|
+
for p in sorted(reports_dir.glob(f"*-{corpus}.json")):
|
|
41
|
+
try:
|
|
42
|
+
out.append((p, json.loads(p.read_text(encoding="utf-8"))))
|
|
43
|
+
except (OSError, json.JSONDecodeError) as exc:
|
|
44
|
+
script_output.warn(f" ⚠️ skip unreadable report {p.name}: {exc}")
|
|
45
|
+
return out
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _mean(values: list[float]) -> float:
|
|
49
|
+
return sum(values) / len(values) if values else 0.0
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _check(latest: dict[str, Any], baseline: list[dict[str, Any]],
           thresholds: dict[str, float]) -> list[dict[str, Any]]:
    """Compare *latest* against the rolling *baseline* and list drifted axes.

    Args:
        latest: the newest report.
        baseline: the earlier reports that form the rolling window.
        thresholds: limits keyed ``accuracy_drop_pp``, ``cost_increase_pct``
            and ``quality_drop_pp``.

    Returns:
        One finding dict per drifted axis (``selection_accuracy``,
        ``cost_usd``, ``quality_score``); empty when nothing drifted.

    Raises:
        KeyError: when a report lacks a field this check reads.
    """

    def _cost_usd(report: dict[str, Any]) -> float:
        # This check historically read ``cost_usd``; the bench_run.py
        # headline reads the same total as ``total_cost_usd``. Accept
        # either so a captured report cannot fail on the key name alone.
        totals = report["cost"]["totals"]
        key = "cost_usd" if "cost_usd" in totals else "total_cost_usd"
        return float(totals[key])

    findings: list[dict[str, Any]] = []

    # --- selection accuracy: drop in percentage points vs. rolling mean ---
    sel_latest = float(latest["selection"]["selection_accuracy"])
    sel_baseline = _mean([float(r["selection"]["selection_accuracy"]) for r in baseline])
    sel_drop_pp = (sel_baseline - sel_latest) * 100.0
    if sel_drop_pp > thresholds["accuracy_drop_pp"]:
        findings.append({
            "axis": "selection_accuracy",
            "latest": sel_latest, "baseline_mean": sel_baseline,
            "delta_pp": -sel_drop_pp, "threshold_pp": -thresholds["accuracy_drop_pp"],
        })

    # --- cost: only "captured" reports take part, on both sides ---
    if latest["cost"].get("source") == "captured":
        baseline_costs = [_cost_usd(r) for r in baseline
                          if r["cost"].get("source") == "captured"]
        cost_baseline = _mean(baseline_costs)
        # A zero mean (no captured baseline, or all-zero cost) gives no
        # meaningful percentage, so the axis is skipped.
        if cost_baseline > 0:
            cost_latest = _cost_usd(latest)
            pct = (cost_latest - cost_baseline) / cost_baseline * 100.0
            if pct > thresholds["cost_increase_pct"]:
                findings.append({
                    "axis": "cost_usd",
                    "latest": cost_latest, "baseline_mean": cost_baseline,
                    "delta_pct": pct, "threshold_pct": thresholds["cost_increase_pct"],
                })

    # --- quality: skipped while the latest report has not collected it ---
    if latest["quality"].get("source") != "not_collected":
        q_latest = float(latest["quality"]["quality_score"])
        q_baseline = _mean([float(r["quality"]["quality_score"])
                            for r in baseline
                            if r["quality"].get("source") != "not_collected"])
        # A zero mean means there is no collected baseline to compare with.
        if q_baseline:
            q_drop_pp = (q_baseline - q_latest) * 100.0
            if q_drop_pp > thresholds["quality_drop_pp"]:
                findings.append({
                    "axis": "quality_score",
                    "latest": q_latest, "baseline_mean": q_baseline,
                    "delta_pp": -q_drop_pp, "threshold_pp": -thresholds["quality_drop_pp"],
                })

    return findings
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the drift check from the command line.

    Args:
        argv: argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` when no drift is found or fewer than two reports exist,
        ``1`` for an invalid ``--window`` or a report missing expected
        fields, ``2`` when at least one axis drifted.
    """
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--corpus", default="dev")
    ap.add_argument("--reports-dir", default="bench/reports")
    ap.add_argument("--window", type=int, default=5, help="rolling window size (default 5)")
    ap.add_argument("--accuracy-drop-pp", type=float, default=5.0)
    ap.add_argument("--cost-increase-pct", type=float, default=20.0)
    ap.add_argument("--quality-drop-pp", type=float, default=10.0)
    ap.add_argument("--json", action="store_true", help="emit JSON instead of Markdown")
    args = ap.parse_args(argv)

    # A window of 0 yields an empty baseline and negative values slice an
    # arbitrary part of the history; both would report "ok" misleadingly.
    if args.window < 1:
        print(f"error: --window must be >= 1 (got {args.window})", file=sys.stderr)
        return 1

    reports = _load_reports(REPO_ROOT / args.reports_dir, args.corpus)
    if len(reports) < 2:
        msg = (f" ℹ️ bench-drift · corpus={args.corpus} · "
               f"{len(reports)} report(s) — need ≥ 2 to compare; no drift gate yet.")
        if args.json:
            print(json.dumps({"status": "warmup", "reports": len(reports)}))
        else:
            print(msg)
        return 0

    latest_path, latest = reports[-1]
    # The ``window`` reports immediately preceding the latest one.
    baseline = [r for _, r in reports[-(args.window + 1):-1]]
    thresholds = {
        "accuracy_drop_pp": args.accuracy_drop_pp,
        "cost_increase_pct": args.cost_increase_pct,
        "quality_drop_pp": args.quality_drop_pp,
    }
    try:
        findings = _check(latest, baseline, thresholds)
    except (KeyError, TypeError, ValueError) as exc:
        # Report a structurally broken report as a read error rather
        # than letting a traceback escape.
        print(f"error: malformed report in window ending at "
              f"{latest_path.name}: {exc!r}", file=sys.stderr)
        return 1

    payload = {
        "status": "drift" if findings else "ok",
        "corpus": args.corpus,
        "latest_report": latest_path.name,
        "baseline_window": len(baseline),
        "thresholds": thresholds,
        "findings": findings,
    }
    if args.json:
        print(json.dumps(payload, indent=2))
    else:
        emoji = "⚠️" if findings else "✅"
        print(f" {emoji} bench-drift · corpus={args.corpus} · "
              f"latest={latest_path.name} · window={len(baseline)} · "
              f"findings={len(findings)}")
        for f in findings:
            print(f" · {f['axis']}: latest={f['latest']:.4f} "
                  f"baseline_mean={f['baseline_mean']:.4f}")
    return 2 if findings else 0
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Per-tool projection-fidelity bench — step-4 Phase 4.
|
|
3
|
+
|
|
4
|
+
Re-runs the keyword-overlap selection scorer against each projected
|
|
5
|
+
tool surface and computes:
|
|
6
|
+
|
|
7
|
+
fidelity(tool) = selection_accuracy(tool) / selection_accuracy(reference)
|
|
8
|
+
|
|
9
|
+
Reference = Augment projection (most complete per roadmap). Threshold
|
|
10
|
+
for "fit for purpose" is >= 0.85.
|
|
11
|
+
|
|
12
|
+
Surfaces:
|
|
13
|
+
- `.augment/skills/` skill projection automated (reference)
|
|
14
|
+
- `.claude/skills/` skill projection automated
|
|
15
|
+
- `.cursor/rules/` rules-only not_applicable (no skill projection)
|
|
16
|
+
- `.windsurfrules` single concatenated not_applicable
|
|
17
|
+
- `.clinerules/` rules-only not_applicable
|
|
18
|
+
|
|
19
|
+
Usage:
|
|
20
|
+
python3 scripts/bench_per_tool.py --corpus dev
|
|
21
|
+
python3 scripts/bench_per_tool.py --corpus dev --json
|
|
22
|
+
python3 scripts/bench_per_tool.py --corpus dev --threshold 0.85
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import datetime as dt
|
|
29
|
+
import json
|
|
30
|
+
import re
|
|
31
|
+
import sys
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
|
|
34
|
+
try:
|
|
35
|
+
import yaml
|
|
36
|
+
except ImportError:
|
|
37
|
+
sys.stderr.write("error: PyYAML required (pip install pyyaml)\n")
|
|
38
|
+
sys.exit(2)
|
|
39
|
+
|
|
40
|
+
# Reuse tokenization + ranking from the reference runner so the only
|
|
41
|
+
# axis that changes between tools is the skill catalogue on disk.
|
|
42
|
+
from bench_runner import rank_skills # type: ignore # noqa: E402
|
|
43
|
+
|
|
44
|
+
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
45
|
+
CORPUS_DIR = REPO_ROOT / "tests" / "eval"
|
|
46
|
+
REPORTS_DIR = REPO_ROOT / "bench" / "reports"
|
|
47
|
+
|
|
48
|
+
# tool_id -> (skills_root, kind). kind = "skills" | "rules_only" | "single_file".
|
|
49
|
+
SURFACES: dict[str, tuple[Path, str]] = {
|
|
50
|
+
"augment": (REPO_ROOT / ".augment" / "skills", "skills"),
|
|
51
|
+
"claude": (REPO_ROOT / ".claude" / "skills", "skills"),
|
|
52
|
+
"cursor": (REPO_ROOT / ".cursor" / "rules", "rules_only"),
|
|
53
|
+
"cline": (REPO_ROOT / ".clinerules", "rules_only"),
|
|
54
|
+
"windsurf":(REPO_ROOT / ".windsurfrules", "single_file"),
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
REFERENCE_TOOL = "augment"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def load_descriptions(root: Path) -> dict[str, str]:
    """Return ``{skill_name: "name description"}`` for SKILL.md files under *root*.

    Each immediate child of *root* that contains a ``SKILL.md`` is read,
    and the ``---``-delimited front matter at the very top of the file is
    parsed as YAML. The skill name comes from the ``name`` field, falling
    back to the directory name.

    A skill is skipped when the file is missing, has no front matter, the
    front matter is invalid YAML or is not a mapping, or its
    ``description`` is empty.

    Args:
        root: directory whose children are skill directories.

    Returns:
        The mapping described above; empty when *root* is not a directory.
    """
    out: dict[str, str] = {}
    if not root.is_dir():
        return out
    for skill_dir in sorted(root.iterdir()):
        skill_md = skill_dir / "SKILL.md"
        if not skill_md.is_file():
            continue
        text = skill_md.read_text(encoding="utf-8")
        # No re.MULTILINE: the front matter must open the file.
        m = re.search(r"^---\s*\n(.*?)\n---\s*\n", text, re.DOTALL)
        if not m:
            continue
        try:
            fm = yaml.safe_load(m.group(1))
        except yaml.YAMLError:
            continue
        if not isinstance(fm, dict):
            # safe_load returns None, a scalar or a list for front matter
            # that is not a mapping; none of those carry fields to read.
            continue
        desc = fm.get("description") or ""
        name = fm.get("name") or skill_dir.name
        if desc:
            out[name] = f"{name} {desc}"
    return out
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def score_corpus(skills: dict[str, str], prompts: list[dict], top_k: int) -> dict:
    """Score top-K skill selection for every prompt against *skills*.

    A prompt counts as a hit when at least one of its ``expected_skills``
    appears in the names returned by ``rank_skills``.

    Returns:
        A dict with ``prompts_total``, ``prompts_hit``,
        ``selection_accuracy`` (rounded to 4 places, ``0.0`` for an empty
        corpus), ``skill_count`` and the ``per_prompt`` detail rows.
    """
    rows = []
    for entry in prompts:
        top = rank_skills(entry["prompt"], skills, top_k)
        wanted = set(entry.get("expected_skills", []))
        matched = bool(wanted & set(top))
        rows.append({
            "id": entry["id"],
            "expected": sorted(wanted),
            "ranked": top,
            "hit": matched,
        })

    total = len(prompts)
    hit_count = sum(1 for row in rows if row["hit"])
    accuracy = round(hit_count / total, 4) if total else 0.0
    return {
        "prompts_total": total,
        "prompts_hit": hit_count,
        "selection_accuracy": accuracy,
        "skill_count": len(skills),
        "per_prompt": rows,
    }
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def evaluate(corpus_path: Path, top_k: int, threshold: float) -> dict:
    """Score every surface in ``SURFACES`` and compute fidelity vs. the reference.

    Surfaces whose kind is not ``"skills"`` are recorded as
    ``not_applicable``; skill surfaces that yield no descriptions are
    recorded as ``error``. Every scored surface gets
    ``fidelity = selection_accuracy / reference_accuracy`` (``0.0`` when
    the reference accuracy is zero or unavailable) and a
    ``passed_threshold`` flag.

    Returns:
        The ``projection-fidelity-v1`` summary dict, including the list of
        non-reference tools below *threshold*.
    """
    corpus = yaml.safe_load(corpus_path.read_text(encoding="utf-8"))
    prompts = corpus["prompts"]

    results: dict[str, dict] = {}
    for tool, (root, kind) in SURFACES.items():
        rel_path = str(root.relative_to(REPO_ROOT))
        if kind != "skills":
            results[tool] = {
                "status": "not_applicable",
                "reason": f"surface is {kind}; no SKILL.md projection",
                "path": rel_path,
            }
            continue
        skills = load_descriptions(root)
        if skills:
            entry = score_corpus(skills, prompts, top_k)
            entry["status"] = "ok"
            entry["path"] = rel_path
        else:
            entry = {"status": "error", "reason": "no skills found", "path": rel_path}
        results[tool] = entry

    reference = results.get(REFERENCE_TOOL, {})
    reference_ok = reference.get("status") == "ok"
    ref_acc = reference.get("selection_accuracy", 0.0) if reference_ok else 0.0

    # NOTE(review): only surfaces with status "ok" are graded, so a skills
    # surface recorded as "error" never lands in below_threshold --
    # confirm that is intended.
    below = []
    for tool, entry in results.items():
        if entry.get("status") == "ok":
            ratio = entry["selection_accuracy"] / ref_acc if ref_acc else 0.0
            entry["fidelity"] = round(ratio, 4)
            entry["passed_threshold"] = ratio >= threshold
            if not entry["passed_threshold"] and tool != REFERENCE_TOOL:
                below.append(tool)

    return {
        "schema": "projection-fidelity-v1",
        "generated_at": dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "corpus_id": corpus["corpus_id"],
        "top_k": top_k,
        "threshold": threshold,
        "reference_tool": REFERENCE_TOOL,
        "reference_accuracy": ref_acc,
        "tools": results,
        "below_threshold": below,
    }
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def render_markdown(summary: dict) -> str:
    """Render a projection-fidelity *summary* as a Markdown report.

    The report has a title, a one-line run description, a table with one
    row per tool (surfaces that were not scored show dashes) and a closing
    line naming the tools below threshold, if any.

    Returns:
        The Markdown text, ending with a single newline.
    """
    title = f"# Projection fidelity — {summary['corpus_id']}"
    run_info = (
        f"_Generated {summary['generated_at']} · top-K={summary['top_k']} · "
        f"threshold={summary['threshold']:.2f} · reference=`{summary['reference_tool']}`_"
    )
    out = [
        title,
        "",
        run_info,
        "",
        "| tool | status | skills | accuracy | fidelity | pass |",
        "|---|---|---:|---:|---:|---|",
    ]

    for tool, row in summary["tools"].items():
        status = row.get("status", "?")
        if status == "ok":
            cells = [
                f"`{tool}`",
                "ok",
                f"{row['skill_count']}",
                f"{row['selection_accuracy']:.2%}",
                f"{row['fidelity']:.2f}",
                "✅" if row["passed_threshold"] else "❌",
            ]
            out.append("| " + " | ".join(cells) + " |")
        else:
            out.append(f"| `{tool}` | {status} | — | — | — | — |")

    failing = summary["below_threshold"]
    if failing:
        closing = (f"**Below threshold:** {', '.join(failing)} "
                   f"→ inspect `scripts/_lib/generate_tools.py` projection mapping.")
    else:
        closing = "**All projections fit-for-purpose** (≥ threshold)."
    out.extend(["", closing])
    return "\n".join(out) + "\n"
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def main(argv=None) -> int:
    """Run the per-tool projection-fidelity bench from the command line.

    Args:
        argv: argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``2`` when the corpus file does not exist, ``1`` when at least one
        tool is below the fidelity threshold, otherwise ``0``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--corpus", default="dev")
    ap.add_argument("--top-k", type=int, default=3)
    ap.add_argument("--threshold", type=float, default=0.85)
    ap.add_argument("--json", action="store_true")
    ap.add_argument("--write-report", action="store_true",
                    help="emit bench/reports/<ts>-<corpus>-projection.{json,md}")
    args = ap.parse_args(argv)

    corpus_path = CORPUS_DIR / f"corpus-{args.corpus}.yaml"
    if not corpus_path.is_file():
        sys.stderr.write(f"error: corpus not found: {corpus_path}\n")
        return 2

    summary = evaluate(corpus_path, args.top_k, args.threshold)

    if args.write_report:
        REPORTS_DIR.mkdir(parents=True, exist_ok=True)
        # ":" is not a valid file-name character on every platform.
        stamp = summary["generated_at"].replace(":", "-")
        # Build both names from the stem directly: Path.with_suffix would
        # treat a dot inside the corpus id as the start of the suffix and
        # cut the name short.
        stem = f"{stamp}-{args.corpus}-projection"
        json_path = REPORTS_DIR / f"{stem}.json"
        md_path = REPORTS_DIR / f"{stem}.md"
        # The Markdown contains non-ASCII symbols; pin UTF-8 so the write
        # does not depend on the locale's default encoding.
        json_path.write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8")
        md_path.write_text(render_markdown(summary), encoding="utf-8")
        sys.stderr.write(f"wrote {json_path} + {md_path}\n")

    if args.json:
        print(json.dumps(summary, indent=2))
    else:
        print(render_markdown(summary))

    return 1 if summary["below_threshold"] else 0
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
216
|
+
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Bench orchestrator — step-4 measurement-and-benchmark Phase 2.
|
|
3
|
+
|
|
4
|
+
Wraps the selection-accuracy baseline collector (`scripts/bench_runner.py`),
|
|
5
|
+
captures token / cost data from `agents/cost-tracking/sessions.jsonl` if
|
|
6
|
+
present (per ruflo pattern, external-findings § 2), runs structural
|
|
7
|
+
quality assertions per prompt, and emits a versioned JSON + Markdown
|
|
8
|
+
report under `bench/reports/` per
|
|
9
|
+
`docs/contracts/benchmark-report-schema.md`.
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python3 scripts/bench_run.py --corpus dev
|
|
13
|
+
python3 scripts/bench_run.py --corpus dev --top-k 3 --quiet
|
|
14
|
+
python3 scripts/bench_run.py --corpus dev --agent-output outputs.json
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import sys
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
23
|
+
sys.path.insert(0, str(REPO_ROOT / "scripts"))
|
|
24
|
+
|
|
25
|
+
from _lib import script_output # type: ignore[import-not-found] # noqa: E402
|
|
26
|
+
from _lib.bench_cost import aggregate_sessions # noqa: E402
|
|
27
|
+
from _lib.bench_quality import score_corpus # noqa: E402
|
|
28
|
+
from _lib.bench_report import ( # noqa: E402
|
|
29
|
+
report_paths,
|
|
30
|
+
render_markdown,
|
|
31
|
+
utc_now_filename_stamp,
|
|
32
|
+
utc_now_iso,
|
|
33
|
+
write_json,
|
|
34
|
+
write_markdown,
|
|
35
|
+
)
|
|
36
|
+
from bench_runner import run_corpus # noqa: E402
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
import yaml
|
|
40
|
+
except ImportError:
|
|
41
|
+
script_output.error("error: PyYAML required (pip install pyyaml)")
|
|
42
|
+
sys.exit(2)
|
|
43
|
+
|
|
44
|
+
BENCH_RUN_VERSION = "0.1.0"
|
|
45
|
+
PRICING_PATH = REPO_ROOT / "bench" / "pricing.yaml"
|
|
46
|
+
SESSIONS_JSONL = REPO_ROOT / "agents" / "cost-tracking" / "sessions.jsonl"
|
|
47
|
+
REPORTS_DIR = REPO_ROOT / "bench" / "reports"
|
|
48
|
+
CORPUS_DIR = REPO_ROOT / "tests" / "eval"
|
|
49
|
+
BASELINE_COLLECTOR = REPO_ROOT / "scripts" / "bench_runner.py"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _baseline_sha_or_mtime() -> str:
    """Return a provenance tag for the baseline collector script.

    The tag is ``"mtime:<seconds>"`` built from the modification time of
    ``BASELINE_COLLECTOR``, or ``"unavailable"`` when the file cannot be
    stat'ed. Despite the name, no hash is computed here.
    """
    try:
        modified = BASELINE_COLLECTOR.stat().st_mtime
    except OSError:
        return "unavailable"
    return f"mtime:{int(modified)}"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _verdict(selection: dict, quality: dict) -> dict[str, str]:
|
|
60
|
+
sel = "pass" if selection["passed"] else "fail"
|
|
61
|
+
if quality["source"] == "not_collected":
|
|
62
|
+
qual = "not_collected"
|
|
63
|
+
overall = "partial"
|
|
64
|
+
else:
|
|
65
|
+
qual = "pass" if quality["quality_score"] >= 0.60 else "fail"
|
|
66
|
+
overall = "pass" if (sel == "pass" and qual == "pass") else "fail"
|
|
67
|
+
return {"selection": sel, "quality": qual, "overall": overall}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def build_report(
    corpus_path: Path,
    top_k: int,
    agent_output: Path | None,
) -> dict:
    """Assemble the full bench report for one corpus.

    Runs the selection baseline, re-reads the corpus for its prompt list,
    aggregates cost from the sessions log, scores quality and combines
    them with the verdict.

    Args:
        corpus_path: corpus YAML file; must live under ``REPO_ROOT``.
        top_k: number of ranked skills considered per prompt.
        agent_output: optional path handed to the quality scorer.

    Returns:
        The report dict (``schema_version`` 1).
    """
    selection = run_corpus(corpus_path, top_k)
    corpus_doc = yaml.safe_load(corpus_path.read_text(encoding="utf-8"))
    prompts = corpus_doc.get("prompts", [])
    cost = aggregate_sessions(SESSIONS_JSONL, PRICING_PATH)
    quality = score_corpus(prompts, agent_output)
    verdict = _verdict(selection, quality)

    report: dict = {"schema_version": 1, "generated_at": utc_now_iso()}
    report["corpus"] = {
        "id": selection["corpus_id"],
        "path": str(corpus_path.relative_to(REPO_ROOT)),
        "prompt_count": len(prompts),
    }
    report["runner"] = {
        "bench_run_version": BENCH_RUN_VERSION,
        "baseline_collector": str(BASELINE_COLLECTOR.relative_to(REPO_ROOT)),
        "baseline_collector_sha": _baseline_sha_or_mtime(),
    }
    report["selection"] = selection
    report["cost"] = cost
    report["quality"] = quality
    report["verdict"] = verdict
    return report
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the bench for one corpus, emit the report and return an exit code.

    Args:
        argv: argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``2`` when the corpus file is missing, ``0`` when the overall
        verdict is ``"pass"`` or ``"partial"``, otherwise ``1``.
    """
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    parser.add_argument("--corpus", default="dev", help="corpus id (default: dev)")
    parser.add_argument("--top-k", type=int, default=3)
    parser.add_argument("--agent-output", type=Path, default=None,
                        help="Path to JSON {id: output_text} for quality scoring (Phase 3)")
    parser.add_argument("--quiet", action="store_true",
                        help="Print only the report path + headline")
    parser.add_argument("--stamp", default=None,
                        help="Override timestamp (test hook); defaults to UTC now")
    parser.add_argument("--no-write", action="store_true",
                        help="Compute the report but do not write files (dry run)")
    opts = parser.parse_args(argv)

    corpus_path = CORPUS_DIR / f"corpus-{opts.corpus}.yaml"
    if not corpus_path.is_file():
        script_output.error(f"error: corpus not found: {corpus_path}")
        return 2

    report = build_report(corpus_path, opts.top_k, opts.agent_output)
    stamp = opts.stamp or utc_now_filename_stamp()
    json_path, md_path = report_paths(REPORTS_DIR, report["corpus"]["id"], stamp)

    persist = not opts.no_write
    if persist:
        write_json(json_path, report)
        write_markdown(md_path, report)

    verdict = report["verdict"]
    selection = report["selection"]
    quality = report["quality"]
    cost = report["cost"]
    headline = " · ".join([
        f"bench {report['corpus']['id']}",
        f"selection {selection['selection_accuracy']:.2%} ({verdict['selection']})",
        f"cost ${cost['totals']['total_cost_usd']:.6f} ({cost.get('source', 'n/a')})",
        f"quality {quality['quality_score']:.2%} ({verdict['quality']})",
        f"overall {verdict['overall']}",
    ])

    if opts.quiet:
        print(headline)
        if persist:
            print(f"report: {md_path.relative_to(REPO_ROOT)}")
    else:
        print(render_markdown(report))
        if persist:
            print(f"\n→ json: {json_path.relative_to(REPO_ROOT)}")
            print(f"→ markdown: {md_path.relative_to(REPO_ROOT)}")

    # "partial" means quality was not collected, which is by design, so it
    # exits zero just like "pass".
    if verdict["overall"] in ("pass", "partial"):
        return 0
    return 1
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Config-layer loaders (profile, preset, pack).
|
|
2
|
+
|
|
3
|
+
Phase 1 of step-15 product refinement. Single home for the audience /
|
|
4
|
+
governance / workflow axes introduced by
|
|
5
|
+
:mod:`docs.contracts.profile-system`,
|
|
6
|
+
:mod:`docs.contracts.config-presets`, and the workflow-packs
|
|
7
|
+
contract. Loaders here are pure, read-only, lazy-PyYAML; they layer on
|
|
8
|
+
top of :mod:`scripts._lib.agent_settings` for project-root anchoring.
|
|
9
|
+
"""
|