@event4u/agent-config 2.20.1 → 2.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/commands/agent-status.md +16 -0
- package/.agent-src/rules/caveman-speak.md +2 -0
- package/.agent-src/skills/adversarial-review/SKILL.md +2 -1
- package/.agent-src/skills/canvas-design/SKILL.md +11 -6
- package/.agent-src/skills/compress-memory/SKILL.md +119 -0
- package/.agent-src/skills/fe-design/SKILL.md +8 -0
- package/.agent-src/skills/prompt-optimizer/SKILL.md +29 -5
- package/.agent-src/skills/react-shadcn-ui/SKILL.md +9 -0
- package/.agent-src/skills/refine-prompt/SKILL.md +57 -0
- package/.agent-src/skills/tailwind-engineer/SKILL.md +14 -0
- package/.agent-src/templates/agents/agent-project-settings.example.yml +53 -1
- package/.claude-plugin/marketplace.json +2 -1
- package/CHANGELOG.md +101 -138
- package/README.md +5 -5
- package/docs/architecture.md +2 -2
- package/docs/archive/CHANGELOG-pre-2.20.0.md +159 -0
- package/docs/benchmarks.md +74 -0
- package/docs/catalog.md +5 -3
- package/docs/contracts/caveman-telemetry.md +83 -0
- package/docs/contracts/compression-default-kill-criterion.md +82 -35
- package/docs/contracts/cost-summary-schema.md +107 -0
- package/docs/contracts/file-ownership-matrix.json +48 -0
- package/docs/guidelines/prompt-templates.md +166 -0
- package/package.json +1 -1
- package/scripts/_lib/bench_caveman.py +273 -0
- package/scripts/_lib/bench_caveman_report.py +152 -0
- package/scripts/bench_compress_memory.py +168 -0
- package/scripts/bench_run.py +119 -1
- package/scripts/caveman_stats.py +119 -0
- package/scripts/check_command_count_messaging.py +2 -2
- package/scripts/compress_memory.py +172 -0
- package/scripts/cost_by_conversation.py +78 -0
- package/scripts/cost_summary.py +97 -0
- package/scripts/update_counts.py +7 -5
- package/scripts/validate_caveman_carveouts.py +129 -0
- package/scripts/validate_safe_paths.py +118 -0
- package/scripts/verify_roadmap_closure.py +327 -0
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
# Caveman compression bench — step-16 Phase 1 Step 4.
|
|
2
|
+
#
|
|
3
|
+
# Three-arm live bench against bench/corpora/caveman/prompts.yaml:
|
|
4
|
+
# compressed — system prompt embeds caveman-speak rule (aggressive).
|
|
5
|
+
# terse_control — system prompt = "Answer concisely. …" (carve-out-free baseline).
|
|
6
|
+
# uncompressed — generic helpful-assistant system prompt.
|
|
7
|
+
#
|
|
8
|
+
# Token counts come from Anthropic API `usage` (authoritative). Carve-out
|
|
9
|
+
# share is measured via regex extraction on the reply text; chars/4 yields
|
|
10
|
+
# an estimated carve-out-token figure for the carve-out-tax accounting.
|
|
11
|
+
#
|
|
12
|
+
# Cost-touch: 10 prompts × 3 arms × claude-sonnet-4-5 (~$3/M in, ~$15/M out).
|
|
13
|
+
"""Caveman compression bench runner."""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
import statistics
|
|
18
|
+
import time
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
import yaml
|
|
24
|
+
|
|
25
|
+
# ── system prompts per arm ──────────────────────────────────────────────
|
|
26
|
+
|
|
27
|
+
SYSTEM_PROMPT_COMPRESSED = """You are speaking in CAVEMAN-SPEAK mode (speak_scope=aggressive).
|
|
28
|
+
|
|
29
|
+
Compress all body prose to caveman grammar:
|
|
30
|
+
- Drop articles (the, a, an).
|
|
31
|
+
- Drop linking auxiliaries (is, are, was, be) where unambiguous.
|
|
32
|
+
- Drop pronouns when context is clear.
|
|
33
|
+
- Keep nouns, verbs, key adjectives, negation, numbers.
|
|
34
|
+
- Example: "I will now check the file and see if it exists" -> "Check file. Exists?"
|
|
35
|
+
|
|
36
|
+
Carve-outs — preserve BYTE-FOR-BYTE (do NOT compress these):
|
|
37
|
+
1. Triple-backtick code/literal blocks (any language, including ALL-CAPS Iron-Law fences).
|
|
38
|
+
2. Numbered-options lines matching ^\\d+\\.\\s + a **Recommendation:** label.
|
|
39
|
+
3. Backtick spans (file paths, command names, identifiers).
|
|
40
|
+
4. Status markers: lines starting with ❌, ⚠️, or ✅.
|
|
41
|
+
5. Mode markers.
|
|
42
|
+
6. Markdown tables.
|
|
43
|
+
7. Deliverables (PR titles, commit messages, ticket summaries, articles, the prompt
|
|
44
|
+
line of any single question asked to the user).
|
|
45
|
+
|
|
46
|
+
Apply caveman compression aggressively to every other prose surface."""
|
|
47
|
+
|
|
48
|
+
SYSTEM_PROMPT_TERSE = (
|
|
49
|
+
"Answer concisely. Skip preamble. Do not restate the question. "
|
|
50
|
+
"Avoid filler phrases ('Let me', 'Here is', 'I will'). Get to the answer."
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
SYSTEM_PROMPT_UNCOMPRESSED = (
|
|
54
|
+
"You are a helpful AI assistant. Answer the user's question clearly and completely."
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
ARMS: tuple[str, ...] = ("compressed", "terse_control", "uncompressed")
|
|
58
|
+
ARM_SYSTEM_PROMPT: dict[str, str] = {
|
|
59
|
+
"compressed": SYSTEM_PROMPT_COMPRESSED,
|
|
60
|
+
"terse_control": SYSTEM_PROMPT_TERSE,
|
|
61
|
+
"uncompressed": SYSTEM_PROMPT_UNCOMPRESSED,
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
# ── carve-out detection ────────────────────────────────────────────────
|
|
65
|
+
|
|
66
|
+
_RE_TRIPLE_BACKTICK = re.compile(r"```[\s\S]*?```")
_RE_BACKTICK_SPAN = re.compile(r"`[^`\n]+`")
_RE_NUMBERED_LINE = re.compile(r"^>?\s*\d+\.\s.*$", re.MULTILINE)
_RE_STATUS_LINE = re.compile(r"^(❌|⚠️|✅).*$", re.MULTILINE)
_RE_TABLE_LINE = re.compile(r"^\s*\|.*\|\s*$", re.MULTILINE)
_RE_RECOMMENDATION = re.compile(r"^\*\*(Recommendation|Empfehlung):\*\*.*$", re.MULTILINE)


def carve_out_chars(text: str) -> int:
    """Count the characters of *text* covered by at least one carve-out pattern.

    The count is in characters (``str`` positions), not encoded bytes, so it
    is directly comparable with ``len(text)``. Regions matched by several
    patterns (for example a backtick span inside a table row) are counted
    once: every match marks its span in a coverage mask and the mask is
    summed at the end.

    Args:
        text: Reply text to inspect.

    Returns:
        Number of covered characters; 0 for an empty string.
    """
    if not text:
        return 0
    # One flag per character; overlapping matches simply re-mark the same flags.
    covered = bytearray(len(text))
    for pattern in (
        _RE_TRIPLE_BACKTICK, _RE_BACKTICK_SPAN, _RE_NUMBERED_LINE,
        _RE_STATUS_LINE, _RE_TABLE_LINE, _RE_RECOMMENDATION,
    ):
        for match in pattern.finditer(text):
            start, end = match.span()
            # Slice assignment marks the whole span without a per-index loop.
            covered[start:end] = b"\x01" * (end - start)
    return sum(covered)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# ── data shapes ────────────────────────────────────────────────────────
|
|
90
|
+
|
|
91
|
+
@dataclass
class ArmResult:
    """Record of one arm invocation for one prompt."""

    arm: str  # arm name the call was made for
    text: str  # reply text ("" when the call raised)
    input_tokens: int
    output_tokens: int
    latency_ms: int
    output_chars: int  # len() of the reply text
    carve_out_chars: int  # characters of the reply covered by carve-out patterns
    error: str | None = None  # error description, None when no error was recorded

    @property
    def realised_carve_out_pct(self) -> float:
        """Return ``carve_out_chars / output_chars`` as a fraction.

        Yields 0.0 when ``output_chars`` is zero so that empty replies never
        divide by zero.
        """
        if not self.output_chars:
            return 0.0
        return self.carve_out_chars / self.output_chars
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@dataclass
class PromptResult:
    """All arm results collected for a single corpus prompt."""

    id: str
    category: str
    expected_carve_out_pct: float
    arms: dict[str, ArmResult] = field(default_factory=dict)  # keyed by arm name

    def _savings_against(self, baseline_arm: str) -> float | None:
        """Return the output-token saving of ``compressed`` versus *baseline_arm*.

        The saving is ``1 - compressed_tokens / baseline_tokens``. ``None`` is
        returned when the comparison is not meaningful: either arm is missing,
        either arm recorded an error, or the baseline produced zero tokens.
        """
        compressed = self.arms.get("compressed")
        baseline = self.arms.get(baseline_arm)
        if compressed is None or baseline is None:
            return None
        # A failed call is stored with output_tokens=0; without this guard a
        # failed compressed arm would be reported as a 100 % saving.
        if compressed.error or baseline.error:
            return None
        if baseline.output_tokens == 0:
            return None
        return 1.0 - (compressed.output_tokens / baseline.output_tokens)

    @property
    def savings_vs_raw(self) -> float | None:
        """Saving of the compressed arm versus the ``uncompressed`` arm, or None."""
        return self._savings_against("uncompressed")

    @property
    def savings_vs_terse(self) -> float | None:
        """Saving of the compressed arm versus the ``terse_control`` arm, or None."""
        return self._savings_against("terse_control")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ── corpus + runner ────────────────────────────────────────────────────
|
|
132
|
+
|
|
133
|
+
def load_corpus(corpus_path: Path) -> list[dict[str, Any]]:
    """Read a prompts YAML file and return its list of prompt dicts.

    Args:
        corpus_path: Path to a YAML file whose top-level mapping has a
            ``prompts`` key holding a list.

    Returns:
        The non-empty list stored under ``prompts``.

    Raises:
        ValueError: If the document is not a mapping, ``prompts`` is not a
            list, or the list is missing/empty.
    """
    data = yaml.safe_load(corpus_path.read_text(encoding="utf-8")) or {}
    # A top-level list or scalar would otherwise fail with AttributeError on .get().
    if not isinstance(data, dict):
        raise ValueError(f"malformed corpus (top level must be a mapping): {corpus_path}")
    prompts = data.get("prompts") or []
    if not prompts:
        raise ValueError(f"empty corpus: {corpus_path}")
    if not isinstance(prompts, list):
        raise ValueError(f"malformed corpus ('prompts' must be a list): {corpus_path}")
    return prompts
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def run_arm(
    client: Any,
    arm: str,
    user_prompt: str,
    *,
    max_tokens: int = 1024,
) -> ArmResult:
    """Call ``client.ask`` once for *arm* and package the outcome.

    Args:
        client: Object exposing ``ask(system, user_prompt, max_tokens=...)``.
        arm: Key into ``ARM_SYSTEM_PROMPT`` selecting the system prompt.
        user_prompt: Prompt text forwarded to the client.
        max_tokens: Passed through to ``client.ask``.

    Returns:
        An ``ArmResult``. When ``client.ask`` raises, the result carries empty
        text, zero counts, the elapsed time and ``str(exc)`` as ``error``.

    Raises:
        KeyError: If *arm* is not a key of ``ARM_SYSTEM_PROMPT``.
    """
    started = time.monotonic()
    system_prompt = ARM_SYSTEM_PROMPT[arm]

    def _elapsed_ms() -> float:
        return (time.monotonic() - started) * 1000

    try:
        response = client.ask(system_prompt, user_prompt, max_tokens=max_tokens)
    except Exception as exc:  # noqa: BLE001
        # Any client failure becomes an error row so the bench keeps running.
        return ArmResult(
            arm=arm,
            text="",
            input_tokens=0,
            output_tokens=0,
            latency_ms=int(_elapsed_ms()),
            output_chars=0,
            carve_out_chars=0,
            error=str(exc),
        )

    reply = response.text or ""
    return ArmResult(
        arm=arm,
        text=reply,
        input_tokens=int(response.input_tokens or 0),
        output_tokens=int(response.output_tokens or 0),
        # Use the client-reported latency when present, else our own clock.
        latency_ms=int(response.latency_ms or _elapsed_ms()),
        output_chars=len(reply),
        carve_out_chars=carve_out_chars(reply),
        error=response.error,
    )
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# ── aggregation ────────────────────────────────────────────────────────────
|
|
171
|
+
|
|
172
|
+
def _stats(values: list[float]) -> dict[str, float]:
    """Summarise *values* as n, median, p10, p90 and population stdev.

    Percentiles use linear interpolation between the two nearest ranks of the
    sorted data. An empty input yields all zeros; a single value is returned
    as every statistic with a stdev of 0.0.
    """
    if not values:
        return {"n": 0, "median": 0.0, "p10": 0.0, "p90": 0.0, "stdev": 0.0}

    ordered = sorted(values)
    count = len(ordered)
    last = count - 1

    def _interpolate(fraction: float) -> float:
        # With one sample every percentile is that sample.
        if count == 1:
            return ordered[0]
        position = last * fraction
        lower = int(position)
        upper = min(lower + 1, last)
        return ordered[lower] + (ordered[upper] - ordered[lower]) * (position - lower)

    if count > 1:
        spread = statistics.pstdev(ordered)
    else:
        spread = 0.0

    return {
        "n": count,
        "median": statistics.median(ordered),
        "p10": _interpolate(0.10),
        "p90": _interpolate(0.90),
        "stdev": spread,
    }
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def aggregate_results(results: list[PromptResult]) -> dict[str, Any]:
    """Summarise compression metrics across all prompt results.

    Args:
        results: Per-prompt results as produced by the bench runner.

    Returns:
        A dict with ``_stats`` summaries for ``savings_vs_raw``,
        ``savings_vs_terse``, ``realised_carve_out_pct`` (compressed arm,
        non-empty replies only), ``expected_carve_out_pct`` and, under
        ``output_tokens``, one summary per arm in ``ARMS``.
    """
    # Evaluate each savings property once per prompt, dropping undefined ones.
    vs_raw = [s for s in (r.savings_vs_raw for r in results) if s is not None]
    vs_terse = [s for s in (r.savings_vs_terse for r in results) if s is not None]

    realised_carve_pct: list[float] = []
    per_arm_tokens: dict[str, list[int]] = {a: [] for a in ARMS}
    for r in results:
        compressed = r.arms.get("compressed")
        if compressed is not None and compressed.output_chars:
            realised_carve_pct.append(compressed.realised_carve_out_pct)
        for arm in ARMS:
            ar = r.arms.get(arm)
            # Failed calls carry output_tokens=0; including them would drag
            # the per-arm median and percentiles towards zero.
            if ar is not None and not ar.error:
                per_arm_tokens[arm].append(ar.output_tokens)

    expected_carve_pct = [r.expected_carve_out_pct for r in results]

    return {
        "savings_vs_raw": _stats(vs_raw),
        "savings_vs_terse": _stats(vs_terse),
        "realised_carve_out_pct": _stats(realised_carve_pct),
        "expected_carve_out_pct": _stats(expected_carve_pct),
        "output_tokens": {
            arm: _stats([float(v) for v in per_arm_tokens[arm]]) for arm in ARMS
        },
    }
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def compute_cost(results: list[PromptResult], pricing: dict[str, float]) -> dict[str, Any]:
    """Total up token usage over every arm call and price it.

    Args:
        results: Per-prompt results whose ``arms`` hold the individual calls.
        pricing: Per-million-token rates under the keys ``"input"`` and
            ``"output"``; a missing key is treated as 0.0.

    Returns:
        ``{"totals": ..., "per_arm": ...}`` where ``totals`` holds the token
        sums, call count, error count and ``total_cost_usd`` (rounded to six
        decimals) and ``per_arm`` holds token sums and call counts for each
        arm in ``ARMS``.
    """
    per_arm: dict[str, dict[str, int]] = {
        name: {"input_tokens": 0, "output_tokens": 0, "calls": 0} for name in ARMS
    }
    tokens_in = 0
    tokens_out = 0
    call_count = 0
    error_count = 0

    for prompt_result in results:
        for arm_name, outcome in prompt_result.arms.items():
            tokens_in += outcome.input_tokens
            tokens_out += outcome.output_tokens
            call_count += 1
            if outcome.error:
                error_count += 1
            bucket = per_arm[arm_name]
            bucket["input_tokens"] += outcome.input_tokens
            bucket["output_tokens"] += outcome.output_tokens
            bucket["calls"] += 1

    input_cost = tokens_in / 1e6 * pricing.get("input", 0.0)
    output_cost = tokens_out / 1e6 * pricing.get("output", 0.0)
    totals = {
        "input_tokens": tokens_in,
        "output_tokens": tokens_out,
        "calls": call_count,
        "errors": error_count,
        "total_cost_usd": round(input_cost + output_cost, 6),
    }
    return {"totals": totals, "per_arm": per_arm}
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# ── orchestrator ───────────────────────────────────────────────────────────
|
|
244
|
+
|
|
245
|
+
def run_caveman_bench(
    client: Any,
    corpus_path: Path,
    *,
    max_prompts: int | None = None,
    max_tokens: int = 1024,
    on_progress: Any = None,
) -> list[PromptResult]:
    """Execute every arm in ``ARMS`` for each corpus prompt.

    Args:
        client: Forwarded to ``run_arm``.
        corpus_path: YAML corpus loaded through ``load_corpus``.
        max_prompts: When truthy, only the first ``max_prompts`` prompts run.
        max_tokens: Forwarded to ``run_arm``.
        on_progress: Optional callable invoked after each arm call as
            ``on_progress(done, total, prompt_id, arm, arm_result)``.

    Returns:
        One ``PromptResult`` per processed prompt, in corpus order.
    """
    corpus = load_corpus(corpus_path)
    selected = corpus[:max_prompts] if max_prompts else corpus

    expected_calls = len(selected) * len(ARMS)
    completed = 0
    collected: list[PromptResult] = []

    for entry in selected:
        prompt_result = PromptResult(
            id=str(entry["id"]),
            category=str(entry.get("category", "unknown")),
            expected_carve_out_pct=float(entry.get("expected_carve_out_pct", 0.0)),
        )
        for arm_name in ARMS:
            outcome = run_arm(client, arm_name, str(entry["prompt"]), max_tokens=max_tokens)
            prompt_result.arms[arm_name] = outcome
            completed += 1
            if on_progress:
                on_progress(completed, expected_calls, prompt_result.id, arm_name, outcome)
        collected.append(prompt_result)

    return collected
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# Caveman bench report serializer — step-16 Phase 1 Step 5.
|
|
2
|
+
#
|
|
3
|
+
# Emits the caveman-v1 JSON + Markdown shape. Distinct schema_version
|
|
4
|
+
# ("caveman-v1") from the selection-accuracy bench (v1) because the
|
|
5
|
+
# blocks are disjoint: caveman has no `selection`/`quality`, and the
|
|
6
|
+
# selection bench has no three-arm compression metrics.
|
|
7
|
+
"""Caveman bench report serializer."""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from _lib.bench_caveman import ARMS, PromptResult, aggregate_results, compute_cost
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def build_caveman_report(
    *,
    results: list[PromptResult],
    corpus_path_rel: str,
    generated_at: str,
    bench_run_version: str,
    model: str,
    transport: str,
    pricing_rates: dict[str, float],
    pricing_sourced_on: str | None,
) -> dict[str, Any]:
    """Assemble the ``caveman-v1`` report dict from per-prompt results.

    The aggregate block comes from ``aggregate_results`` and the cost block
    from ``compute_cost``, extended with ``source``, ``model`` and
    ``pricing_sourced_on``. The verdict is ``"measured"`` when no call
    recorded an error and ``"partial"`` otherwise.
    """
    aggregate = aggregate_results(results)

    cost = compute_cost(results, pricing_rates)
    cost.update(
        source="live-api",
        model=model,
        pricing_sourced_on=pricing_sourced_on,
    )
    error_count = cost["totals"]["errors"]

    corpus_block = {
        "id": "caveman",
        "path": corpus_path_rel,
        "prompt_count": len(results),
    }
    runner_block = {
        "bench_run_version": bench_run_version,
        "transport": transport,
        "model": model,
    }
    caveman_block = {
        "arms": list(ARMS),
        "aggregate": aggregate,
        "per_prompt": [_prompt_block(item) for item in results],
    }
    if error_count == 0:
        overall = "measured"
    else:
        overall = "partial"

    return {
        "schema_version": "caveman-v1",
        "generated_at": generated_at,
        "corpus": corpus_block,
        "runner": runner_block,
        "caveman": caveman_block,
        "cost": cost,
        "verdict": {"overall": overall, "errors": error_count},
    }
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _prompt_block(r: PromptResult) -> dict[str, Any]:
    """Convert one ``PromptResult`` into its report dict.

    ``realised_carve_out_pct`` is taken from the ``compressed`` arm and is
    ``None`` when that arm is absent. Every arm present is serialised with
    its token counts, latency, character counts, error and reply text.
    """
    compressed = r.arms.get("compressed") if "compressed" in r.arms else None
    realised = compressed.realised_carve_out_pct if "compressed" in r.arms else None

    arm_blocks: dict[str, Any] = {}
    for name, outcome in r.arms.items():
        arm_blocks[name] = {
            "input_tokens": outcome.input_tokens,
            "output_tokens": outcome.output_tokens,
            "latency_ms": outcome.latency_ms,
            "output_chars": outcome.output_chars,
            "carve_out_chars": outcome.carve_out_chars,
            "error": outcome.error,
            "text": outcome.text,
        }

    return {
        "id": r.id,
        "category": r.category,
        "expected_carve_out_pct": r.expected_carve_out_pct,
        "realised_carve_out_pct": realised,
        "savings_vs_raw": r.savings_vs_raw,
        "savings_vs_terse": r.savings_vs_terse,
        "arms": arm_blocks,
    }
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _fmt_pct(x: float | None) -> str:
    """Format a fraction as a two-decimal percentage, or an em dash if not numeric."""
    if not isinstance(x, (int, float)):
        return "—"
    return f"{x:.2%}"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def render_caveman_markdown(report: dict[str, Any]) -> str:
    """Render a ``caveman-v1`` report dict as Markdown.

    The document has four sections: a headline summary, a per-arm token
    table, a per-prompt results table and a notes list. Sections are joined
    with newlines; percentages are formatted by ``_fmt_pct``.
    """
    caveman = report["caveman"]
    agg = caveman["aggregate"]
    cost = report["cost"]
    totals = cost["totals"]
    runner = report["runner"]
    raw = agg["savings_vs_raw"]
    terse = agg["savings_vs_terse"]

    out: list[str] = []

    # ── headline ──
    out.append(f"# Caveman Bench Report — `caveman` · {report['generated_at']}")
    out.extend(["", "## Headline", ""])
    out.append(
        f"- prompts: **{report['corpus']['prompt_count']}** · "
        f"arms: **{', '.join(caveman['arms'])}** · "
        f"model: **{runner['model']}** · "
        f"transport: **{runner['transport']}**"
    )
    out.append(
        f"- median savings vs raw: **{_fmt_pct(raw['median'])}** "
        f"(p10 {_fmt_pct(raw['p10'])} · p90 {_fmt_pct(raw['p90'])})"
    )
    out.append(
        f"- median savings vs terse-control: **{_fmt_pct(terse['median'])}** "
        f"(p10 {_fmt_pct(terse['p10'])} · p90 {_fmt_pct(terse['p90'])})"
    )
    out.append(
        "- median realised carve-out share (compressed arm): "
        f"**{_fmt_pct(agg['realised_carve_out_pct']['median'])}** "
        f"(expected median {_fmt_pct(agg['expected_carve_out_pct']['median'])})"
    )
    out.append(
        f"- total cost: **${totals['total_cost_usd']:.6f}** "
        f"(calls {totals['calls']} · errors {totals['errors']})"
    )
    out.append(f"- verdict: **{report['verdict']['overall']}**")
    out.append("")

    # ── per-arm token totals ──
    out.extend([
        "## Per-arm token totals",
        "",
        "| arm | calls | input_tokens | output_tokens | median out/prompt |",
        "|---|---:|---:|---:|---:|",
    ])
    for arm_name in caveman["arms"]:
        arm_cost = cost["per_arm"][arm_name]
        median_out = agg["output_tokens"][arm_name]["median"]
        out.append(
            f"| `{arm_name}` | {arm_cost['calls']} | {arm_cost['input_tokens']} | "
            f"{arm_cost['output_tokens']} | {median_out:.0f} |"
        )
    out.append("")

    # ── per-prompt results ──
    out.extend([
        "## Per-prompt results",
        "",
        "| id | category | exp.carve | real.carve | out.compressed | out.terse | out.uncompressed | vs raw | vs terse |",
        "|---|---|---:|---:|---:|---:|---:|---:|---:|",
    ])
    for row in caveman["per_prompt"]:
        row_arms = row["arms"]
        # A missing arm, or one without output_tokens, renders as an em dash.
        out_compressed, out_terse, out_raw = (
            row_arms.get(name, {}).get("output_tokens", "—")
            for name in ("compressed", "terse_control", "uncompressed")
        )
        out.append(
            f"| `{row['id']}` | {row['category']} | "
            f"{_fmt_pct(row['expected_carve_out_pct'])} | {_fmt_pct(row['realised_carve_out_pct'])} | "
            f"{out_compressed} | {out_terse} | {out_raw} | "
            f"{_fmt_pct(row['savings_vs_raw'])} | {_fmt_pct(row['savings_vs_terse'])} |"
        )
    out.append("")

    # ── notes ──
    out.extend([
        "## Notes",
        "",
        f"- corpus: `{report['corpus']['path']}`",
        f"- pricing: `bench/pricing.yaml` (sourced {cost.get('pricing_sourced_on') or '—'})",
        "- schema: `caveman-v1` (see `docs/contracts/benchmark-report-schema.md`)",
        f"- bench_run version: `{runner['bench_run_version']}`",
        "",
    ])
    return "\n".join(out)
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Offline bench for input-side memory compression (Phase 2 / Step 11).
|
|
3
|
+
|
|
4
|
+
Runs `compress_memory.py` over a fixed corpus of memory-target files, records
|
|
5
|
+
pre/post char counts, approximates input-token savings (chars / 4 — the
|
|
6
|
+
GPT-4 / Claude rule of thumb), and emits `bench/reports/caveman-v2.{json,md}`.
|
|
7
|
+
|
|
8
|
+
Offline (no API calls). Cadence-aligned with `docs/benchmarks.md`. Citation
|
|
9
|
+
in `bench/reports/caveman-v2.md` notes the chars→tokens approximation and
|
|
10
|
+
points at upstream tiktoken / claude-tokenizer if a calibrated number is
|
|
11
|
+
later needed.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import shutil
|
|
17
|
+
import statistics
|
|
18
|
+
import subprocess
|
|
19
|
+
import sys
|
|
20
|
+
import tempfile
|
|
21
|
+
from datetime import datetime, timezone
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
25
|
+
COMPRESS_SCRIPT = REPO_ROOT / "scripts" / "compress_memory.py"
|
|
26
|
+
REPORT_JSON = REPO_ROOT / "bench" / "reports" / "caveman-v2.json"
|
|
27
|
+
REPORT_MD = REPO_ROOT / "bench" / "reports" / "caveman-v2.md"
|
|
28
|
+
|
|
29
|
+
CORPUS: list[tuple[str, str]] = [
|
|
30
|
+
("AGENTS.md", "thin-root-package"),
|
|
31
|
+
(".agent-src.uncompressed/templates/AGENTS.md", "thin-root-consumer-template"),
|
|
32
|
+
(".agent-src/templates/AGENTS.md", "thin-root-consumer-generated"),
|
|
33
|
+
("docs/contracts/ai-council-config.md", "prose-heavy-contract"),
|
|
34
|
+
("docs/contracts/implement-ticket-flow.md", "prose-heavy-contract"),
|
|
35
|
+
("docs/contracts/command-clusters.md", "prose-heavy-contract"),
|
|
36
|
+
("docs/contracts/mental-models.md", "prose-heavy-contract"),
|
|
37
|
+
("docs/contracts/kernel-membership.md", "prose-heavy-contract"),
|
|
38
|
+
("docs/contracts/load-context-budget-model.md", "prose-heavy-contract"),
|
|
39
|
+
("docs/contracts/mcp-cloud-scope.md", "prose-heavy-contract"),
|
|
40
|
+
("docs/contracts/context-spine.md", "prose-heavy-contract"),
|
|
41
|
+
("docs/contracts/rule-classification.md", "rule-classification"),
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def chars_to_tokens(n: int) -> int:
    """Estimate a token count as ``n / 4``, rounded with the built-in ``round``."""
    quarter = n / 4
    return round(quarter)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def bench_one(rel_path: str, category: str) -> dict:
    """Compress a temporary copy of one corpus file and measure the size change.

    The file is copied into a temporary directory, ``compress_memory.py`` is
    run on the copy with the current interpreter, and the copy's size before
    and after is compared. The repository file itself is never modified.

    Args:
        rel_path: Path of the file relative to ``REPO_ROOT``.
        category: Label copied verbatim into the result row.

    Returns:
        A row dict. On success it holds the before/after sizes, deltas,
        saving percentages and chars/4 token estimates. On failure it holds
        ``path``, ``category`` and ``error`` (``"not-found"`` or
        ``"exit-<code>"`` plus the first 200 characters of stderr).
    """
    src = REPO_ROOT / rel_path
    if not src.is_file():
        return {"path": rel_path, "category": category, "error": "not-found"}
    with tempfile.TemporaryDirectory() as tmp:
        target = Path(tmp) / src.name
        shutil.copy(src, target)
        # NOTE: st_size is a byte count; for non-ASCII text it exceeds the
        # number of characters, so the "chars" figures are byte-based.
        before_chars = target.stat().st_size
        result = subprocess.run(
            [sys.executable, str(COMPRESS_SCRIPT), str(target)],
            capture_output=True, text=True, cwd=REPO_ROOT, check=False,
        )
        if result.returncode != 0:
            return {"path": rel_path, "category": category,
                    "error": f"exit-{result.returncode}", "stderr": result.stderr[:200]}
        after_chars = target.stat().st_size
    before_tok = chars_to_tokens(before_chars)
    after_tok = chars_to_tokens(after_chars)
    return {
        "path": rel_path,
        "category": category,
        "before_chars": before_chars,
        "after_chars": after_chars,
        "delta_chars": after_chars - before_chars,
        # Guarded like the token percentage below: an empty source file must
        # not abort the whole bench with ZeroDivisionError.
        "saving_pct_chars": (before_chars - after_chars) * 100 / before_chars if before_chars else 0.0,
        "before_tokens_est": before_tok,
        "after_tokens_est": after_tok,
        "delta_tokens_est": after_tok - before_tok,
        "saving_pct_tokens_est": (before_tok - after_tok) * 100 / before_tok if before_tok else 0.0,
    }
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def aggregate(rows: list[dict]) -> dict:
    """Summarise per-file bench rows into corpus-level figures.

    Rows containing an ``"error"`` key count towards ``calls`` and ``errors``
    only. With ten or more successful rows, p10/p90 are the first and ninth
    deciles from ``statistics.quantiles``; with fewer they fall back to the
    minimum and maximum (0.0 when there are none).
    """
    succeeded = [row for row in rows if "error" not in row]
    savings = [row["saving_pct_chars"] for row in succeeded]

    if len(savings) >= 10:
        deciles = statistics.quantiles(savings, n=10)
        p10, p90 = deciles[0], deciles[8]
    else:
        p10 = min(savings, default=0.0)
        p90 = max(savings, default=0.0)

    per_category: dict[str, list[float]] = {}
    for row in succeeded:
        per_category.setdefault(row["category"], []).append(row["saving_pct_chars"])

    chars_saved = 0
    tokens_saved = 0
    for row in succeeded:
        chars_saved += row["before_chars"] - row["after_chars"]
        tokens_saved += row["before_tokens_est"] - row["after_tokens_est"]

    return {
        "calls": len(rows),
        "errors": len(rows) - len(succeeded),
        "median_saving_pct": statistics.median(savings) if savings else 0.0,
        "p10_saving_pct": p10,
        "p90_saving_pct": p90,
        "stdev_saving_pct": statistics.pstdev(savings) if len(savings) > 1 else 0.0,
        "total_chars_saved": chars_saved,
        "total_tokens_est_saved": tokens_saved,
        "by_category_median_pct": {
            name: statistics.median(vals) for name, vals in per_category.items()
        },
    }
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def render_md(payload: dict) -> str:
|
|
102
|
+
agg = payload["aggregate"]
|
|
103
|
+
lines = [
|
|
104
|
+
"# caveman-v2 — input-side memory compression bench",
|
|
105
|
+
"",
|
|
106
|
+
f"**Generated:** {payload['generated_at']}",
|
|
107
|
+
f"**Schema:** `caveman-v2` (input-side; offline; chars→tokens via /4 heuristic)",
|
|
108
|
+
f"**Script:** `scripts/bench_compress_memory.py`",
|
|
109
|
+
"",
|
|
110
|
+
"## Headline",
|
|
111
|
+
"",
|
|
112
|
+
f"- Median char saving: **{agg['median_saving_pct']:+.2f}%** (p10 {agg['p10_saving_pct']:+.2f}% · p90 {agg['p90_saving_pct']:+.2f}%)",
|
|
113
|
+
f"- Total chars saved across corpus: **{agg['total_chars_saved']:+,}**",
|
|
114
|
+
f"- Total tokens (estimate) saved across corpus: **{agg['total_tokens_est_saved']:+,}**",
|
|
115
|
+
f"- Files: {agg['calls']} · errors: {agg['errors']}",
|
|
116
|
+
"",
|
|
117
|
+
"## By category (median %)",
|
|
118
|
+
"",
|
|
119
|
+
"| Category | Median saving |",
|
|
120
|
+
"|---|---:|",
|
|
121
|
+
]
|
|
122
|
+
for cat, med in sorted(agg["by_category_median_pct"].items()):
|
|
123
|
+
lines.append(f"| {cat} | {med:+.2f}% |")
|
|
124
|
+
lines += ["", "## Per file", "",
|
|
125
|
+
"| Path | Category | Before | After | Δ chars | Saving % |",
|
|
126
|
+
"|---|---|---:|---:|---:|---:|"]
|
|
127
|
+
for r in payload["rows"]:
|
|
128
|
+
if "error" in r:
|
|
129
|
+
lines.append(f"| `{r['path']}` | {r['category']} | — | — | — | {r['error']} |")
|
|
130
|
+
else:
|
|
131
|
+
lines.append(
|
|
132
|
+
f"| `{r['path']}` | {r['category']} | {r['before_chars']:,} | {r['after_chars']:,} | "
|
|
133
|
+
f"{r['delta_chars']:+,} | {r['saving_pct_chars']:+.2f}% |"
|
|
134
|
+
)
|
|
135
|
+
lines += ["", "## Methodology",
|
|
136
|
+
"",
|
|
137
|
+
"- Offline run: `compress_memory.py` writes `.original.md` backup + frontmatter (`original_sha256`, `compressed_at`). The frontmatter pair (≈ 120 chars) is the fixed compression tax — files with little prose net negative.",
|
|
138
|
+
"- chars → tokens approximation: `tokens ≈ chars / 4` (GPT-4 / Claude English rule of thumb). Calibrated number requires `tiktoken` or `claude-tokenizer`; deferred until a consumer requests pinpoint numbers.",
|
|
139
|
+
"- The `caveman-v1` output-side verdict (`vs_terse` median −9.27%) is orthogonal — input-side savings apply to the always-loaded memory budget, not the reply stream.",
|
|
140
|
+
"",
|
|
141
|
+
"## Interpretation",
|
|
142
|
+
"",
|
|
143
|
+
"- **Thin-Root files net negative.** `AGENTS.md` and `templates/AGENTS.md` already follow `agents-md-thin-root` (≥ 40 % pointer ratio). The compressor's frontmatter pair adds more bytes than the sparse prose loses. **Do not compress Thin-Root files.**",
|
|
144
|
+
"- **Prose-heavy contract docs net 3–6 % saving.** Useful but modest. Pays off when the file is large and frequently loaded.",
|
|
145
|
+
"- **Rule of thumb:** target files with > 5 KB and visible paragraph prose; skip pointer-only files.",
|
|
146
|
+
""]
|
|
147
|
+
return "\n".join(lines)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def main() -> int:
    """Bench every ``CORPUS`` entry and write the JSON and Markdown reports.

    Creates the report directory if needed, writes ``REPORT_JSON`` and
    ``REPORT_MD``, prints both paths and the median saving, and returns 0.
    """
    rows = [bench_one(path, category) for path, category in CORPUS]
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    payload = {
        "generated_at": stamp,
        "schema": "caveman-v2",
        "rows": rows,
        "aggregate": aggregate(rows),
    }

    REPORT_JSON.parent.mkdir(parents=True, exist_ok=True)
    json_text = json.dumps(payload, indent=2) + "\n"
    REPORT_JSON.write_text(json_text, encoding="utf-8")
    REPORT_MD.write_text(render_md(payload), encoding="utf-8")

    for written in (REPORT_JSON, REPORT_MD):
        print(f"wrote: {written}")
    median = payload["aggregate"]["median_saving_pct"]
    print(f"median saving: {median:+.2f}%")
    return 0
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
if __name__ == "__main__":
|
|
168
|
+
sys.exit(main())
|