@event4u/agent-config 2.20.0 → 2.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/commands/agent-status.md +16 -0
- package/.agent-src/rules/caveman-speak.md +2 -0
- package/.agent-src/skills/compress-memory/SKILL.md +119 -0
- package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
- package/.claude-plugin/marketplace.json +2 -1
- package/CHANGELOG.md +59 -43
- package/README.md +5 -5
- package/docs/architecture.md +1 -1
- package/docs/archive/CHANGELOG-pre-2.17.0.md +63 -0
- package/docs/benchmarks.md +74 -0
- package/docs/catalog.md +3 -2
- package/docs/contracts/caveman-telemetry.md +83 -0
- package/docs/contracts/compression-default-kill-criterion.md +82 -35
- package/docs/contracts/cost-summary-schema.md +107 -0
- package/docs/contracts/file-ownership-matrix.json +41 -0
- package/package.json +1 -1
- package/scripts/_lib/bench_caveman.py +273 -0
- package/scripts/_lib/bench_caveman_report.py +152 -0
- package/scripts/bench_compress_memory.py +168 -0
- package/scripts/bench_run.py +119 -1
- package/scripts/caveman_stats.py +119 -0
- package/scripts/check_command_count_messaging.py +2 -2
- package/scripts/compress_memory.py +172 -0
- package/scripts/cost_by_conversation.py +78 -0
- package/scripts/cost_summary.py +97 -0
- package/scripts/lint_roadmap_complexity.py +3 -2
- package/scripts/update_counts.py +7 -5
- package/scripts/validate_caveman_carveouts.py +129 -0
- package/scripts/validate_safe_paths.py +118 -0
- package/scripts/verify_roadmap_closure.py +327 -0
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# Caveman bench report serializer — step-16 Phase 1 Step 5.
|
|
2
|
+
#
|
|
3
|
+
# Emits the caveman-v1 JSON + Markdown shape. Distinct schema_version
|
|
4
|
+
# ("caveman-v1") from the selection-accuracy bench (v1) because the
|
|
5
|
+
# blocks are disjoint: caveman has no `selection`/`quality`, and the
|
|
6
|
+
# selection bench has no three-arm compression metrics.
|
|
7
|
+
"""Caveman bench report serializer."""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from _lib.bench_caveman import ARMS, PromptResult, aggregate_results, compute_cost
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def build_caveman_report(
    *,
    results: list[PromptResult],
    corpus_path_rel: str,
    generated_at: str,
    bench_run_version: str,
    model: str,
    transport: str,
    pricing_rates: dict[str, float],
    pricing_sourced_on: str | None,
) -> dict[str, Any]:
    """Assemble the ``caveman-v1`` report dictionary.

    Args:
        results: Per-prompt bench results; passed to ``aggregate_results``
            and ``compute_cost`` and serialised one by one.
        corpus_path_rel: Corpus path recorded under ``corpus.path``.
        generated_at: Timestamp string recorded verbatim.
        bench_run_version: Runner version recorded under ``runner``.
        model: Model name recorded under both ``runner`` and ``cost``.
        transport: Transport label recorded under ``runner``.
        pricing_rates: Rates handed to ``compute_cost``.
        pricing_sourced_on: Pricing date recorded under ``cost`` (may be None).

    Returns:
        The report mapping with keys ``schema_version``, ``generated_at``,
        ``corpus``, ``runner``, ``caveman``, ``cost`` and ``verdict``.
    """
    aggregate = aggregate_results(results)

    cost = compute_cost(results, pricing_rates)
    # NOTE(review): "source" is always "live-api", whatever `transport` says —
    # confirm that is intended for dry-run reports.
    cost["source"] = "live-api"
    cost["model"] = model
    cost["pricing_sourced_on"] = pricing_sourced_on

    error_count = cost["totals"]["errors"]
    # Any recorded error downgrades the verdict from "measured" to "partial".
    overall = "partial" if error_count != 0 else "measured"

    corpus_block = {
        "id": "caveman",
        "path": corpus_path_rel,
        "prompt_count": len(results),
    }
    runner_block = {
        "bench_run_version": bench_run_version,
        "transport": transport,
        "model": model,
    }
    caveman_block = {
        "arms": list(ARMS),
        "aggregate": aggregate,
        "per_prompt": [_prompt_block(item) for item in results],
    }
    verdict_block = {"overall": overall, "errors": error_count}

    return {
        "schema_version": "caveman-v1",
        "generated_at": generated_at,
        "corpus": corpus_block,
        "runner": runner_block,
        "caveman": caveman_block,
        "cost": cost,
        "verdict": verdict_block,
    }
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _prompt_block(r: PromptResult) -> dict[str, Any]:
    """Serialise one prompt result into its ``per_prompt`` report entry.

    ``realised_carve_out_pct`` is taken from the ``compressed`` arm and is
    ``None`` when that arm is absent from ``r.arms``.
    """
    realised = None
    if "compressed" in r.arms:
        realised = r.arms["compressed"].realised_carve_out_pct

    arm_blocks: dict[str, Any] = {}
    for arm_name, outcome in r.arms.items():
        arm_blocks[arm_name] = {
            "input_tokens": outcome.input_tokens,
            "output_tokens": outcome.output_tokens,
            "latency_ms": outcome.latency_ms,
            "output_chars": outcome.output_chars,
            "carve_out_chars": outcome.carve_out_chars,
            "error": outcome.error,
            "text": outcome.text,
        }

    return {
        "id": r.id,
        "category": r.category,
        "expected_carve_out_pct": r.expected_carve_out_pct,
        "realised_carve_out_pct": realised,
        "savings_vs_raw": r.savings_vs_raw,
        "savings_vs_terse": r.savings_vs_terse,
        "arms": arm_blocks,
    }
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _fmt_pct(x: float | None) -> str:
    """Format a ratio as a two-decimal percentage; anything non-numeric becomes an em dash."""
    if not isinstance(x, (int, float)):
        return "—"
    return f"{x:.2%}"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def render_caveman_markdown(report: dict[str, Any]) -> str:
    """Render a ``caveman-v1`` report dictionary as a Markdown document.

    The document has four sections: headline, per-arm token totals,
    per-prompt results and notes.

    Args:
        report: Mapping with the ``caveman``, ``cost``, ``corpus``,
            ``runner``, ``verdict`` and ``generated_at`` keys read below.

    Returns:
        The Markdown text, lines joined with ``"\\n"``.
    """
    cv = report["caveman"]
    agg = cv["aggregate"]
    cost = report["cost"]
    totals = cost["totals"]
    vs_raw = agg["savings_vs_raw"]
    vs_terse = agg["savings_vs_terse"]

    head = [
        f"# Caveman Bench Report — `caveman` · {report['generated_at']}",
        "",
        "## Headline",
        "",
        f"- prompts: **{report['corpus']['prompt_count']}** · "
        f"arms: **{', '.join(cv['arms'])}** · "
        f"model: **{report['runner']['model']}** · "
        f"transport: **{report['runner']['transport']}**",
        f"- median savings vs raw: **{_fmt_pct(vs_raw['median'])}** "
        f"(p10 {_fmt_pct(vs_raw['p10'])} · p90 {_fmt_pct(vs_raw['p90'])})",
        f"- median savings vs terse-control: **{_fmt_pct(vs_terse['median'])}** "
        f"(p10 {_fmt_pct(vs_terse['p10'])} · p90 {_fmt_pct(vs_terse['p90'])})",
        f"- median realised carve-out share (compressed arm): **{_fmt_pct(agg['realised_carve_out_pct']['median'])}** "
        f"(expected median {_fmt_pct(agg['expected_carve_out_pct']['median'])})",
        f"- total cost: **${totals['total_cost_usd']:.6f}** "
        f"(calls {totals['calls']} · errors {totals['errors']})",
        f"- verdict: **{report['verdict']['overall']}**",
        "",
    ]

    per_arm = [
        "## Per-arm token totals",
        "",
        "| arm | calls | input_tokens | output_tokens | median out/prompt |",
        "|---|---:|---:|---:|---:|",
    ]
    for arm in cv["arms"]:
        a = cost["per_arm"][arm]
        m = agg["output_tokens"][arm]["median"]
        # The headline medians go through the None-tolerant _fmt_pct; give the
        # token median the same tolerance instead of letting ":.0f" raise
        # TypeError on a missing value.
        median_cell = f"{m:.0f}" if isinstance(m, (int, float)) else "—"
        per_arm.append(
            f"| `{arm}` | {a['calls']} | {a['input_tokens']} | {a['output_tokens']} | {median_cell} |"
        )
    per_arm.append("")

    per_prompt = [
        "## Per-prompt results",
        "",
        "| id | category | exp.carve | real.carve | out.compressed | out.terse | out.uncompressed | vs raw | vs terse |",
        "|---|---|---:|---:|---:|---:|---:|---:|---:|",
    ]
    for r in cv["per_prompt"]:
        arms = r["arms"]
        # A missing arm (or missing token count) is shown as an em dash.
        oc = arms.get("compressed", {}).get("output_tokens", "—")
        ot = arms.get("terse_control", {}).get("output_tokens", "—")
        ou = arms.get("uncompressed", {}).get("output_tokens", "—")
        per_prompt.append(
            f"| `{r['id']}` | {r['category']} | "
            f"{_fmt_pct(r['expected_carve_out_pct'])} | {_fmt_pct(r['realised_carve_out_pct'])} | "
            f"{oc} | {ot} | {ou} | "
            f"{_fmt_pct(r['savings_vs_raw'])} | {_fmt_pct(r['savings_vs_terse'])} |"
        )
    per_prompt.append("")

    notes = [
        "## Notes",
        "",
        f"- corpus: `{report['corpus']['path']}`",
        f"- pricing: `bench/pricing.yaml` (sourced {cost.get('pricing_sourced_on') or '—'})",
        "- schema: `caveman-v1` (see `docs/contracts/benchmark-report-schema.md`)",
        f"- bench_run version: `{report['runner']['bench_run_version']}`",
        "",
    ]
    return "\n".join(head + per_arm + per_prompt + notes)
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Offline bench for input-side memory compression (Phase 2 / Step 11).
|
|
3
|
+
|
|
4
|
+
Runs `compress_memory.py` over a fixed corpus of memory-target files, records
|
|
5
|
+
pre/post char counts, approximates input-token savings (chars / 4 — the
|
|
6
|
+
GPT-4 / Claude rule of thumb), and emits `bench/reports/caveman-v2.{json,md}`.
|
|
7
|
+
|
|
8
|
+
Offline (no API calls). Cadence-aligned with `docs/benchmarks.md`. Citation
|
|
9
|
+
in `bench/reports/caveman-v2.md` notes the chars→tokens approximation and
|
|
10
|
+
points at upstream tiktoken / claude-tokenizer if a calibrated number is
|
|
11
|
+
later needed.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import shutil
|
|
17
|
+
import statistics
|
|
18
|
+
import subprocess
|
|
19
|
+
import sys
|
|
20
|
+
import tempfile
|
|
21
|
+
from datetime import datetime, timezone
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
25
|
+
COMPRESS_SCRIPT = REPO_ROOT / "scripts" / "compress_memory.py"
|
|
26
|
+
REPORT_JSON = REPO_ROOT / "bench" / "reports" / "caveman-v2.json"
|
|
27
|
+
REPORT_MD = REPO_ROOT / "bench" / "reports" / "caveman-v2.md"
|
|
28
|
+
|
|
29
|
+
CORPUS: list[tuple[str, str]] = [
|
|
30
|
+
("AGENTS.md", "thin-root-package"),
|
|
31
|
+
(".agent-src.uncompressed/templates/AGENTS.md", "thin-root-consumer-template"),
|
|
32
|
+
(".agent-src/templates/AGENTS.md", "thin-root-consumer-generated"),
|
|
33
|
+
("docs/contracts/ai-council-config.md", "prose-heavy-contract"),
|
|
34
|
+
("docs/contracts/implement-ticket-flow.md", "prose-heavy-contract"),
|
|
35
|
+
("docs/contracts/command-clusters.md", "prose-heavy-contract"),
|
|
36
|
+
("docs/contracts/mental-models.md", "prose-heavy-contract"),
|
|
37
|
+
("docs/contracts/kernel-membership.md", "prose-heavy-contract"),
|
|
38
|
+
("docs/contracts/load-context-budget-model.md", "prose-heavy-contract"),
|
|
39
|
+
("docs/contracts/mcp-cloud-scope.md", "prose-heavy-contract"),
|
|
40
|
+
("docs/contracts/context-spine.md", "prose-heavy-contract"),
|
|
41
|
+
("docs/contracts/rule-classification.md", "rule-classification"),
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def chars_to_tokens(n: int) -> int:
    """Estimate a token count as ``n / 4``, rounded with the built-in ``round``."""
    quarter = n / 4
    return round(quarter)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def bench_one(rel_path: str, category: str) -> dict:
    """Measure the size change ``compress_memory.py`` produces on one corpus file.

    The file is copied into a temporary directory, the compressor script is
    run on the copy, and the copy's size is read before and after the run.

    Args:
        rel_path: Path of the corpus file, relative to ``REPO_ROOT``.
        category: Category label copied into the result row.

    Returns:
        A row with the before/after sizes, deltas and saving percentages, or
        an error row (``"error"`` key set) when the file is missing or the
        compressor exits non-zero.
    """
    src = REPO_ROOT / rel_path
    if not src.is_file():
        return {"path": rel_path, "category": category, "error": "not-found"}

    with tempfile.TemporaryDirectory() as tmp:
        target = Path(tmp) / src.name
        shutil.copy(src, target)
        # st_size is a byte count; it matches the character count only for
        # ASCII content.
        before_chars = target.stat().st_size
        result = subprocess.run(
            [sys.executable, str(COMPRESS_SCRIPT), str(target)],
            capture_output=True, text=True, cwd=REPO_ROOT,
        )
        if result.returncode != 0:
            return {"path": rel_path, "category": category,
                    "error": f"exit-{result.returncode}", "stderr": result.stderr[:200]}
        after_chars = target.stat().st_size

    before_tok = chars_to_tokens(before_chars)
    after_tok = chars_to_tokens(after_chars)
    return {
        "path": rel_path,
        "category": category,
        "before_chars": before_chars,
        "after_chars": after_chars,
        "delta_chars": after_chars - before_chars,
        # Guarded like the token percentage below: an empty source file would
        # otherwise raise ZeroDivisionError.
        "saving_pct_chars": (before_chars - after_chars) * 100 / before_chars if before_chars else 0.0,
        "before_tokens_est": before_tok,
        "after_tokens_est": after_tok,
        "delta_tokens_est": after_tok - before_tok,
        "saving_pct_tokens_est": (before_tok - after_tok) * 100 / before_tok if before_tok else 0.0,
    }
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def aggregate(rows: list[dict]) -> dict:
    """Summarise per-file bench rows into corpus-level statistics.

    Rows carrying an ``"error"`` key count towards ``calls`` and ``errors``
    but are left out of every statistic. With fewer than ten usable rows the
    p10/p90 figures fall back to the minimum/maximum saving.
    """
    usable: list[dict] = []
    per_category: dict[str, list[float]] = {}
    for row in rows:
        if "error" in row:
            continue
        usable.append(row)
        per_category.setdefault(row["category"], []).append(row["saving_pct_chars"])

    pcts = [row["saving_pct_chars"] for row in usable]

    if len(pcts) >= 10:
        deciles = statistics.quantiles(pcts, n=10)
        p10, p90 = deciles[0], deciles[8]
    else:
        p10 = min(pcts, default=0.0)
        p90 = max(pcts, default=0.0)

    median = statistics.median(pcts) if pcts else 0.0
    spread = statistics.pstdev(pcts) if len(pcts) > 1 else 0.0

    chars_saved = 0
    tokens_saved = 0
    for row in usable:
        chars_saved += row["before_chars"] - row["after_chars"]
        tokens_saved += row["before_tokens_est"] - row["after_tokens_est"]

    return {
        "calls": len(rows),
        "errors": len(rows) - len(usable),
        "median_saving_pct": median,
        "p10_saving_pct": p10,
        "p90_saving_pct": p90,
        "stdev_saving_pct": spread,
        "total_chars_saved": chars_saved,
        "total_tokens_est_saved": tokens_saved,
        "by_category_median_pct": {
            name: statistics.median(values) for name, values in per_category.items()
        },
    }
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def render_md(payload: dict) -> str:
    """Render the caveman-v2 payload as a Markdown report.

    Reads ``payload["generated_at"]``, ``payload["aggregate"]`` and
    ``payload["rows"]``; the methodology and interpretation sections are
    fixed text.
    """
    agg = payload["aggregate"]

    out: list[str] = [
        "# caveman-v2 — input-side memory compression bench",
        "",
        f"**Generated:** {payload['generated_at']}",
        "**Schema:** `caveman-v2` (input-side; offline; chars→tokens via /4 heuristic)",
        "**Script:** `scripts/bench_compress_memory.py`",
        "",
        "## Headline",
        "",
        f"- Median char saving: **{agg['median_saving_pct']:+.2f}%** (p10 {agg['p10_saving_pct']:+.2f}% · p90 {agg['p90_saving_pct']:+.2f}%)",
        f"- Total chars saved across corpus: **{agg['total_chars_saved']:+,}**",
        f"- Total tokens (estimate) saved across corpus: **{agg['total_tokens_est_saved']:+,}**",
        f"- Files: {agg['calls']} · errors: {agg['errors']}",
        "",
        "## By category (median %)",
        "",
        "| Category | Median saving |",
        "|---|---:|",
    ]

    # One table row per category, in sorted order.
    out.extend(
        f"| {cat} | {med:+.2f}% |"
        for cat, med in sorted(agg["by_category_median_pct"].items())
    )

    out.extend([
        "",
        "## Per file",
        "",
        "| Path | Category | Before | After | Δ chars | Saving % |",
        "|---|---|---:|---:|---:|---:|",
    ])
    for row in payload["rows"]:
        if "error" not in row:
            out.append(
                f"| `{row['path']}` | {row['category']} | {row['before_chars']:,} | {row['after_chars']:,} | "
                f"{row['delta_chars']:+,} | {row['saving_pct_chars']:+.2f}% |"
            )
            continue
        # Failed rows show the error text in the last column.
        out.append(f"| `{row['path']}` | {row['category']} | — | — | — | {row['error']} |")

    out.extend([
        "",
        "## Methodology",
        "",
        "- Offline run: `compress_memory.py` writes `.original.md` backup + frontmatter (`original_sha256`, `compressed_at`). The frontmatter pair (≈ 120 chars) is the fixed compression tax — files with little prose net negative.",
        "- chars → tokens approximation: `tokens ≈ chars / 4` (GPT-4 / Claude English rule of thumb). Calibrated number requires `tiktoken` or `claude-tokenizer`; deferred until a consumer requests pinpoint numbers.",
        "- The `caveman-v1` output-side verdict (`vs_terse` median −9.27%) is orthogonal — input-side savings apply to the always-loaded memory budget, not the reply stream.",
        "",
        "## Interpretation",
        "",
        "- **Thin-Root files net negative.** `AGENTS.md` and `templates/AGENTS.md` already follow `agents-md-thin-root` (≥ 40 % pointer ratio). The compressor's frontmatter pair adds more bytes than the sparse prose loses. **Do not compress Thin-Root files.**",
        "- **Prose-heavy contract docs net 3–6 % saving.** Useful but modest. Pays off when the file is large and frequently loaded.",
        "- **Rule of thumb:** target files with > 5 KB and visible paragraph prose; skip pointer-only files.",
        "",
    ])
    return "\n".join(out)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def main() -> int:
    """Bench every corpus entry, write the JSON and Markdown reports, and return 0."""
    rows = [bench_one(path, category) for path, category in CORPUS]
    generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    payload = {
        "generated_at": generated_at,
        "schema": "caveman-v2",
        "rows": rows,
        "aggregate": aggregate(rows),
    }

    # Both reports live in the same directory, so creating one parent is enough.
    REPORT_JSON.parent.mkdir(parents=True, exist_ok=True)
    json_text = json.dumps(payload, indent=2) + "\n"
    REPORT_JSON.write_text(json_text, encoding="utf-8")
    REPORT_MD.write_text(render_md(payload), encoding="utf-8")

    for written in (REPORT_JSON, REPORT_MD):
        print(f"wrote: {written}")
    median = payload["aggregate"]["median_saving_pct"]
    print(f"median saving: {median:+.2f}%")
    return 0
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
if __name__ == "__main__":
|
|
168
|
+
sys.exit(main())
|
package/scripts/bench_run.py
CHANGED
|
@@ -21,6 +21,7 @@ from pathlib import Path
|
|
|
21
21
|
|
|
22
22
|
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
23
23
|
sys.path.insert(0, str(REPO_ROOT / "scripts"))
|
|
24
|
+
sys.path.insert(0, str(REPO_ROOT))
|
|
24
25
|
|
|
25
26
|
from _lib import script_output # type: ignore[import-not-found] # noqa: E402
|
|
26
27
|
from _lib.bench_cost import aggregate_sessions # noqa: E402
|
|
@@ -33,6 +34,9 @@ from _lib.bench_report import ( # noqa: E402
|
|
|
33
34
|
write_json,
|
|
34
35
|
write_markdown,
|
|
35
36
|
)
|
|
37
|
+
from _lib import bench_caveman # noqa: E402
|
|
38
|
+
from _lib.bench_caveman_report import build_caveman_report, render_caveman_markdown # noqa: E402
|
|
39
|
+
from _lib.bench_cost import load_pricing # noqa: E402
|
|
36
40
|
from bench_runner import run_corpus # noqa: E402
|
|
37
41
|
|
|
38
42
|
try:
|
|
@@ -41,11 +45,12 @@ except ImportError:
|
|
|
41
45
|
script_output.error("error: PyYAML required (pip install pyyaml)")
|
|
42
46
|
sys.exit(2)
|
|
43
47
|
|
|
44
|
-
BENCH_RUN_VERSION = "0.
|
|
48
|
+
BENCH_RUN_VERSION = "0.2.0"
|
|
45
49
|
PRICING_PATH = REPO_ROOT / "bench" / "pricing.yaml"
|
|
46
50
|
SESSIONS_JSONL = REPO_ROOT / "agents" / "cost-tracking" / "sessions.jsonl"
|
|
47
51
|
REPORTS_DIR = REPO_ROOT / "bench" / "reports"
|
|
48
52
|
CORPUS_DIR = REPO_ROOT / "tests" / "eval"
|
|
53
|
+
CAVEMAN_CORPUS = REPO_ROOT / "bench" / "corpora" / "caveman" / "prompts.yaml"
|
|
49
54
|
BASELINE_COLLECTOR = REPO_ROOT / "scripts" / "bench_runner.py"
|
|
50
55
|
|
|
51
56
|
|
|
@@ -110,8 +115,21 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
110
115
|
help="Override timestamp (test hook); defaults to UTC now")
|
|
111
116
|
ap.add_argument("--no-write", action="store_true",
|
|
112
117
|
help="Compute the report but do not write files (dry run)")
|
|
118
|
+
ap.add_argument("--caveman", action="store_true",
|
|
119
|
+
help="Run the caveman three-arm compression bench instead of the "
|
|
120
|
+
"selection-accuracy bench (step-16 Phase 1).")
|
|
121
|
+
ap.add_argument("--caveman-max-prompts", type=int, default=None,
|
|
122
|
+
help="Cap prompts in the caveman bench (smoke test).")
|
|
123
|
+
ap.add_argument("--caveman-dry-run", action="store_true",
|
|
124
|
+
help="Caveman: skip live API calls; emit a stub report with "
|
|
125
|
+
"zero tokens (wiring check only).")
|
|
126
|
+
ap.add_argument("--caveman-report-tag", default="caveman-v1",
|
|
127
|
+
help="Filename tag for the caveman report (default: caveman-v1).")
|
|
113
128
|
args = ap.parse_args(argv)
|
|
114
129
|
|
|
130
|
+
if args.caveman:
|
|
131
|
+
return _run_caveman(args)
|
|
132
|
+
|
|
115
133
|
corpus_path = CORPUS_DIR / f"corpus-{args.corpus}.yaml"
|
|
116
134
|
if not corpus_path.is_file():
|
|
117
135
|
script_output.error(f"error: corpus not found: {corpus_path}")
|
|
@@ -151,5 +169,105 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
151
169
|
return 0 if verdict["overall"] in ("pass", "partial") else 1
|
|
152
170
|
|
|
153
171
|
|
|
172
|
+
class _DryRunClient:
    """Offline stand-in client used by ``--caveman-dry-run``.

    Its ``ask`` method returns an empty, zero-token ``CouncilResponse``.
    """

    def ask(self, system_prompt: str, user_prompt: str, max_tokens: int = 1024):
        """Ignore the arguments and return an empty ``CouncilResponse`` stub."""
        # Imported inside the method, so ai_council is only loaded when a
        # stub call is actually made.
        from ai_council.clients import CouncilResponse

        stub_fields = {
            "provider": "dry-run",
            "model": "stub",
            "text": "",
            "input_tokens": 0,
            "output_tokens": 0,
            "latency_ms": 0,
            "error": None,
        }
        return CouncilResponse(**stub_fields)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _build_anthropic_client():
    """Build an ``AnthropicClient`` from the key returned by ``load_anthropic_key``."""
    from ai_council.clients import AnthropicClient, load_anthropic_key

    api_key = load_anthropic_key()
    return AnthropicClient(api_key=api_key)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _run_caveman(args: argparse.Namespace) -> int:
    """Run the caveman three-arm bench and emit its JSON/Markdown reports.

    With ``--caveman-dry-run`` a stub client replaces the Anthropic client.
    Unless ``--no-write`` is given, the report is written twice: once under
    the fixed ``<tag>.{json,md}`` names and once under the timestamped path
    returned by ``report_paths``.

    Args:
        args: Parsed command-line namespace; reads the ``caveman_*`` options
            plus ``quiet``, ``no_write`` and ``stamp``.

    Returns:
        ``2`` when the corpus file is missing or the Anthropic client cannot
        be built, ``1`` when the cost totals record any error, else ``0``.
    """
    if not CAVEMAN_CORPUS.is_file():
        script_output.error(f"error: caveman corpus not found: {CAVEMAN_CORPUS}")
        return 2

    if args.caveman_dry_run:
        client = _DryRunClient()
        transport = "dry-run"
        model = "stub"
    else:
        try:
            client = _build_anthropic_client()
        except Exception as exc:  # noqa: BLE001
            script_output.error(f"error: cannot build Anthropic client: {exc}")
            return 2
        transport = "api"
        model = getattr(client, "model", "claude-sonnet-4-5")

    def _progress(done: int, total: int, pid: str, arm: str, ar) -> None:
        # Per-call progress line on stderr; silenced by --quiet.
        if args.quiet:
            return
        err = f" ERR={ar.error}" if ar.error else ""
        print(f"[{done:>3}/{total}] {pid} · {arm:<14} "
              f"in={ar.input_tokens:>4} out={ar.output_tokens:>4} "
              f"{ar.latency_ms:>5}ms{err}", file=sys.stderr)

    def _pct(value) -> str:
        # The report renderer treats these medians as possibly None; formatting
        # None with ":.2%" raises TypeError, so show an em dash instead.
        return f"{value:.2%}" if isinstance(value, (int, float)) else "—"

    results = bench_caveman.run_caveman_bench(
        client, CAVEMAN_CORPUS,
        max_prompts=args.caveman_max_prompts,
        on_progress=_progress,
    )

    rates, sourced_on = load_pricing(PRICING_PATH)
    sonnet_rates = rates.get("sonnet", {"input": 0.0, "output": 0.0})

    report = build_caveman_report(
        results=results,
        corpus_path_rel=str(CAVEMAN_CORPUS.relative_to(REPO_ROOT)),
        generated_at=utc_now_iso(),
        bench_run_version=BENCH_RUN_VERSION,
        model=model,
        transport=transport,
        pricing_rates=sonnet_rates,
        pricing_sourced_on=sourced_on,
    )

    stamp = args.stamp or utc_now_filename_stamp()
    # Only the JSON trail path is used; the Markdown trail file is derived
    # from it below, so the second element is deliberately ignored.
    json_path, _ = report_paths(REPORTS_DIR, args.caveman_report_tag, stamp)
    # Override: caveman roadmap pins the filename to `caveman-v1.{json,md}` (no stamp).
    fixed_json = REPORTS_DIR / f"{args.caveman_report_tag}.json"
    fixed_md = REPORTS_DIR / f"{args.caveman_report_tag}.md"

    # Render once and reuse for both files and the console; skipped entirely
    # when nothing would consume it (--quiet together with --no-write).
    needs_markdown = (not args.no_write) or (not args.quiet)
    markdown = render_caveman_markdown(report) if needs_markdown else ""

    if not args.no_write:
        write_json(fixed_json, report)
        fixed_md.parent.mkdir(parents=True, exist_ok=True)
        fixed_md.write_text(markdown, encoding="utf-8")
        # Also drop a timestamped copy for the cadence trail.
        write_json(json_path, report)
        json_path.with_suffix(".md").write_text(markdown, encoding="utf-8")

    cost = report["cost"]
    aggregate = report["caveman"]["aggregate"]
    headline = (
        f"caveman · prompts {report['corpus']['prompt_count']} · "
        f"calls {cost['totals']['calls']} · errors {cost['totals']['errors']} · "
        f"vs_raw med {_pct(aggregate['savings_vs_raw']['median'])} · "
        f"vs_terse med {_pct(aggregate['savings_vs_terse']['median'])} · "
        f"cost ${cost['totals']['total_cost_usd']:.6f}"
    )
    if args.quiet:
        print(headline)
        if not args.no_write:
            print(f"report: {fixed_md.relative_to(REPO_ROOT)}")
    else:
        print(markdown)
        if not args.no_write:
            print(f"\n→ json: {fixed_json.relative_to(REPO_ROOT)}")
            print(f"→ markdown: {fixed_md.relative_to(REPO_ROOT)}")
            print(f"→ trail: {json_path.relative_to(REPO_ROOT)}")

    return 0 if cost["totals"]["errors"] == 0 else 1
|
|
270
|
+
|
|
271
|
+
|
|
154
272
|
if __name__ == "__main__":
|
|
155
273
|
sys.exit(main(sys.argv[1:]))
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Caveman per-session / per-conversation / lifetime token-delta lens.
|
|
3
|
+
|
|
4
|
+
Reads sessions.jsonl, groups by sessionId + conversation_id, emits per-row
|
|
5
|
+
caveman delta tokens. Honors the suspended-multiplier contract in
|
|
6
|
+
`docs/contracts/caveman-telemetry.md` (delta = 0 while suspended).
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
import argparse, json, sys
|
|
10
|
+
from collections import defaultdict
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
14
|
+
DEFAULT_JSONL = REPO_ROOT / "agents" / "cost-tracking" / "sessions.jsonl"
|
|
15
|
+
TELEMETRY_DOC = REPO_ROOT / "docs" / "contracts" / "caveman-telemetry.md"
|
|
16
|
+
|
|
17
|
+
# Mirrors `docs/contracts/caveman-telemetry.md` `v1` constants.
|
|
18
|
+
MULTIPLIER_VERSION = "v1"
|
|
19
|
+
MULTIPLIER_VALUE = 0.9155
|
|
20
|
+
MULTIPLIER_ACTIVE = False # suspended pending v2
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _load(path: Path) -> list[dict]:
    """Read a JSONL file and return its JSON-object rows.

    Blank lines, lines starting with ``#``, lines that are not valid JSON and
    JSON values that are not objects are skipped. A missing file yields an
    empty list.

    Args:
        path: Location of the ``sessions.jsonl``-style file.

    Returns:
        The decoded rows, in file order.
    """
    if not path.is_file():
        return []
    rows: list[dict] = []
    # Iterate the file handle rather than str.splitlines(): splitlines() also
    # breaks on U+2028/U+2029/NEL/form-feed, which may appear unescaped inside
    # a JSON string and would tear that record in two.
    with path.open(encoding="utf-8") as handle:
        for raw in handle:
            line = raw.strip()
            if not line or line.startswith("#"):
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort reader: a malformed line is dropped, not fatal.
                continue
            # Callers use row.get(...); keep only JSON objects so a stray
            # list or scalar line cannot crash them.
            if isinstance(record, dict):
                rows.append(record)
    return rows
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _delta(row: dict) -> int:
    """Return the caveman token delta for one row, or 0 while the multiplier is suspended.

    When active, a numeric ``caveman_delta_tokens`` is used as-is; otherwise
    a positive ``caveman_compressed_tokens`` is scaled by the multiplier.
    """
    if MULTIPLIER_ACTIVE:
        stated = row.get("caveman_delta_tokens")
        if isinstance(stated, (int, float)):
            return int(stated)
        tokens = row.get("caveman_compressed_tokens")
        if isinstance(tokens, (int, float)) and tokens > 0:
            return int(tokens * MULTIPLIER_VALUE - tokens)
    return 0
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def aggregate(rows: list[dict]) -> dict:
    """Total delta and compressed tokens per session, per conversation and overall.

    Rows without a session or conversation identifier are grouped under
    ``"unknown"``. Each bucket's ``sessions`` field is incremented once per
    row that lands in it.
    """

    def _empty_bucket() -> dict:
        return {"sessions": 0, "delta_tokens": 0, "compressed_tokens": 0}

    per_session: dict[str, dict] = defaultdict(_empty_bucket)
    per_conversation: dict[str, dict] = defaultdict(_empty_bucket)
    lifetime = _empty_bucket()

    for row in rows:
        session_key = str(row.get("sessionId") or row.get("session_id") or "unknown")
        conversation_key = str(row.get("conversation_id") or "unknown")
        row_delta = _delta(row)
        row_compressed = int(row.get("caveman_compressed_tokens") or 0)
        targets = (per_session[session_key], per_conversation[conversation_key], lifetime)
        for bucket in targets:
            bucket["sessions"] += 1
            bucket["delta_tokens"] += row_delta
            bucket["compressed_tokens"] += row_compressed

    report = {
        "schema_version": "caveman-stats/v1",
        "multiplier_version": MULTIPLIER_VERSION,
        "multiplier_value": MULTIPLIER_VALUE,
        "multiplier_active": MULTIPLIER_ACTIVE,
        "lifetime": lifetime,
    }
    # Plain dicts, so the defaultdict type does not leak into the report.
    report["by_session"] = dict(per_session)
    report["by_conversation"] = dict(per_conversation)
    return report
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def render_text(report: dict) -> str:
    """Render the caveman-stats report as plain text, ending with a newline.

    Shows the schema and multiplier state, the lifetime totals and one line
    per conversation (sorted by id); a suspension note is appended when the
    multiplier is inactive.
    """
    active = report["multiplier_active"]
    state = "ACTIVE" if active else "SUSPENDED"
    lifetime = report["lifetime"]

    out = [
        f"caveman-stats {report['schema_version']} · multiplier {report['multiplier_version']}"
        f" ({state}) · "
        f"value {report['multiplier_value']:.4f}",
        "",
        f" lifetime: {lifetime['sessions']} sessions · "
        f"delta_tokens = {lifetime['delta_tokens']:+,} · "
        f"compressed_tokens = {lifetime['compressed_tokens']:,}",
        "",
        " by conversation:",
    ]
    out.extend(
        f" {cid}: {bucket['sessions']} sessions · "
        f"delta = {bucket['delta_tokens']:+,} · compressed = {bucket['compressed_tokens']:,}"
        for cid, bucket in sorted(report["by_conversation"].items())
    )
    if not active:
        out.extend([
            "",
            " Note: multiplier suspended — see docs/contracts/caveman-telemetry.md",
            " (delta_tokens = 0 until kill-criterion satisfied in caveman-v2).",
        ])
    return "\n".join(out) + "\n"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse the CLI options, aggregate the JSONL input and print the report.

    ``--input`` selects the JSONL file (default ``DEFAULT_JSONL``) and
    ``--format`` chooses between ``text`` and ``json`` output. Always
    returns 0.
    """
    summary = __doc__.splitlines()[0]
    parser = argparse.ArgumentParser(description=summary)
    parser.add_argument("--input", type=Path, default=DEFAULT_JSONL)
    parser.add_argument("--format", choices=["text", "json"], default="text")
    options = parser.parse_args(argv)

    report = aggregate(_load(options.input))

    if options.format != "json":
        rendered = render_text(report)
    else:
        rendered = json.dumps(report, indent=2)
    print(rendered)
    return 0
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
if __name__ == "__main__":
|
|
119
|
+
sys.exit(main())
|
|
@@ -17,7 +17,7 @@ Canonical counts:
|
|
|
17
17
|
Patterns checked (per file):
|
|
18
18
|
|
|
19
19
|
README.md
|
|
20
|
-
hero
|
|
20
|
+
hero badge "/badge/Commands-{N}-…" → active
|
|
21
21
|
browse line "Browse all {N} active commands" → active
|
|
22
22
|
browse meta "{N} files total" → total
|
|
23
23
|
browse meta "{N} are deprecation shims" → shims
|
|
@@ -84,7 +84,7 @@ def main() -> int:
|
|
|
84
84
|
|
|
85
85
|
checks = [
|
|
86
86
|
# README.md
|
|
87
|
-
(README, r"
|
|
87
|
+
(README, r"/badge/Commands-(\d+)-", active, "hero badge"),
|
|
88
88
|
(README, r"Browse all (\d+) active commands", active, "browse line"),
|
|
89
89
|
(README, r"\+ (\d+) native commands\)", active, "tools blurb"),
|
|
90
90
|
# docs/getting-started.md
|