@event4u/agent-config 2.20.0 → 2.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/commands/agent-status.md +16 -0
- package/.agent-src/rules/caveman-speak.md +2 -0
- package/.agent-src/skills/compress-memory/SKILL.md +119 -0
- package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
- package/.claude-plugin/marketplace.json +2 -1
- package/CHANGELOG.md +59 -43
- package/README.md +5 -5
- package/docs/architecture.md +1 -1
- package/docs/archive/CHANGELOG-pre-2.17.0.md +63 -0
- package/docs/benchmarks.md +74 -0
- package/docs/catalog.md +3 -2
- package/docs/contracts/caveman-telemetry.md +83 -0
- package/docs/contracts/compression-default-kill-criterion.md +82 -35
- package/docs/contracts/cost-summary-schema.md +107 -0
- package/docs/contracts/file-ownership-matrix.json +41 -0
- package/package.json +1 -1
- package/scripts/_lib/bench_caveman.py +273 -0
- package/scripts/_lib/bench_caveman_report.py +152 -0
- package/scripts/bench_compress_memory.py +168 -0
- package/scripts/bench_run.py +119 -1
- package/scripts/caveman_stats.py +119 -0
- package/scripts/check_command_count_messaging.py +2 -2
- package/scripts/compress_memory.py +172 -0
- package/scripts/cost_by_conversation.py +78 -0
- package/scripts/cost_summary.py +97 -0
- package/scripts/lint_roadmap_complexity.py +3 -2
- package/scripts/update_counts.py +7 -5
- package/scripts/validate_caveman_carveouts.py +129 -0
- package/scripts/validate_safe_paths.py +118 -0
- package/scripts/verify_roadmap_closure.py +327 -0
|
@@ -5,14 +5,16 @@ keep-beta-until: 2026-08-14
|
|
|
5
5
|
|
|
6
6
|
# Compression default — kill-criterion
|
|
7
7
|
|
|
8
|
-
> **Status:**
|
|
9
|
-
> closeout
|
|
8
|
+
> **Status:** v1-measured · criterion not met · default stays `off` · **Owner:** `step-16-caveman-substance.md`
|
|
9
|
+
> Phase 1 closeout · **Sources:** [`bench/reports/caveman-v1.md`](../../bench/reports/caveman-v1.md) ·
|
|
10
|
+
> [`council-synthesis.md` § 7](../../agents/audit-2026-05-14-north-star/council-synthesis.md) ·
|
|
11
|
+
> [`caveman-v1-kc-verdict.json`](../../agents/council-responses/caveman-v1-kc-verdict.json) <!-- council-ref-allowed: ADR decision trace for v1 kill-criterion verdict -->
|
|
10
12
|
|
|
11
13
|
## Rule
|
|
12
14
|
|
|
13
15
|
```
|
|
14
|
-
DEFAULT STAYS OFF UNTIL `task bench` PRODUCES A
|
|
15
|
-
DECISION OWNED BY
|
|
16
|
+
DEFAULT STAYS OFF UNTIL `task bench -- --caveman` PRODUCES A vs_terse MEDIAN ≥ 30 % WITH < 5 % QUALITY REGRESSION.
|
|
17
|
+
DECISION OWNED BY THE NEXT BENCH CLOSEOUT, NOT BY THIS DOC.
|
|
16
18
|
```
|
|
17
19
|
|
|
18
20
|
1. **Current state.** `caveman.speak_scope` defaults `off`. Carve-outs
|
|
@@ -21,49 +23,94 @@ DECISION OWNED BY step-4 CLOSEOUT, NOT BY THIS DOC OR BY step-99.
|
|
|
21
23
|
[`caveman-speak`](../../.agent-src.uncompressed/rules/caveman-speak.md)
|
|
22
24
|
but the feature is non-promoted: no skill recommends turning it on,
|
|
23
25
|
no preset enables it, no profile depends on it.
|
|
24
|
-
2. **
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
26
|
+
2. **Baselines.** Every published `bench/reports/caveman-v<N>.{json,md}`
|
|
27
|
+
measures three arms (`compressed` · `terse-control` ·
|
|
28
|
+
`uncompressed`) and reports two savings columns:
|
|
29
|
+
- `vs_raw` — median savings against the uncompressed arm.
|
|
30
|
+
- `vs_terse` — **load-bearing** median savings against the
|
|
31
|
+
`Answer concisely.` terse-control arm. `vs_raw` is inflated by the
|
|
32
|
+
carve-out-tax-free pure-prose case and is **not** the gate metric.
|
|
33
|
+
3. **Decision table.** Read the latest `bench/reports/caveman-v<N>.md`
|
|
34
|
+
and apply exactly one of:
|
|
35
|
+
|
|
36
|
+
| Measured `vs_terse` median | Quality regression on corpus | Verdict |
|
|
33
37
|
|---|---|---|
|
|
34
|
-
| <
|
|
35
|
-
|
|
|
36
|
-
| ≥ 30 % |
|
|
38
|
+
| < 0 % | any | **Criterion not met — defer.** Keep default `off`. No telemetry multiplier. Next move owned by the corpus-widening / methodology-revision step that produces `caveman-v<N+1>`. |
|
|
39
|
+
| 0 % – < 30 % | any | **Hold.** Keep default `off`. Authorised follow-up: widen corpus or tune carve-out share; no default flip. |
|
|
40
|
+
| ≥ 30 % | < 5 % | **Flip default on** — `caveman.speak_scope` defaults to a non-`off` value (separate roadmap), carve-outs stay, statusline surfaces lifetime tokens saved. |
|
|
41
|
+
| ≥ 30 % | ≥ 5 % | **Hold** — repeat the window once with tuned intensity ladder; second hold → deprecate. |
|
|
37
42
|
|
|
38
43
|
"Quality regression" = host-side rubric on the corpus per
|
|
39
|
-
`
|
|
40
|
-
`
|
|
44
|
+
`benchmark-report-schema.md`. Numbers checked into the published
|
|
45
|
+
`caveman-v<N>.json` as the decision artefact.
|
|
41
46
|
4. **No interim flip.** The default does not move on anecdote,
|
|
42
|
-
gut feeling, or a single
|
|
43
|
-
|
|
47
|
+
gut feeling, or a single positive prompt. Only a published
|
|
48
|
+
`caveman-v<N>` report with a `vs_terse` median in the "Flip" row
|
|
49
|
+
above authorises a default change, under a follow-up roadmap.
|
|
50
|
+
|
|
51
|
+
## v1 verdict (2026-05-16)
|
|
52
|
+
|
|
53
|
+
[`bench/reports/caveman-v1.md`](../../bench/reports/caveman-v1.md)
|
|
54
|
+
landed 30 calls · $0.0805 · 0 errors · `claude-sonnet-4-5`:
|
|
55
|
+
|
|
56
|
+
| Metric | Median | p10 | p90 |
|
|
57
|
+
|---|---:|---:|---:|
|
|
58
|
+
| `vs_raw` savings | +23.51 % | −18.29 % | +52.53 % |
|
|
59
|
+
| **`vs_terse` savings** | **−9.27 %** | **−109.85 %** | +51.32 % |
|
|
60
|
+
| Realised carve-out share (compressed arm) | 30.67 % | — | — |
|
|
61
|
+
|
|
62
|
+
Per row 1 of the table, the v1 verdict is **criterion not met — defer**.
|
|
63
|
+
Default stays `off`; no telemetry multiplier ships; no rule retirement
|
|
64
|
+
in this roadmap. Wins exist only on pure-prose prompts (caveman-09
|
|
65
|
+
+50.5 %, caveman-10 +58.4 %); carve-out-heavy prompts drag the median
|
|
66
|
+
negative (caveman-04 path-list −108 %, caveman-06 mode-marker −123 %).
|
|
67
|
+
|
|
68
|
+
### Council split (recorded, not decisive)
|
|
69
|
+
|
|
70
|
+
Council run [`caveman-v1-kc-verdict.json`](../../agents/council-responses/caveman-v1-kc-verdict.json) <!-- council-ref-allowed: ADR decision trace for v1 kill-criterion verdict -->
|
|
71
|
+
(2 members · 1 round · $0.0514 actual) split:
|
|
72
|
+
|
|
73
|
+
- **`claude-sonnet-4-5`** → Decision A.1 (deprecate now) + Decision B.3
|
|
74
|
+
(suspend telemetry). Reasoning: the roadmap pinned `vs_terse` as
|
|
75
|
+
load-bearing; the data falsified it; retreating to `vs_raw` is
|
|
76
|
+
post-hoc rationalisation.
|
|
77
|
+
- **`gpt-4o`** → Decision A.3 (hold + re-bench with widened corpus +
|
|
78
|
+
revised terse-control prompt) + Decision B.2 (per-category
|
|
79
|
+
multipliers, suppress negatives). Reasoning: 10 prompts is a
|
|
80
|
+
razor-thin sample; the terse-control prompt may under-compress; the
|
|
81
|
+
carve-out validator (Phase 4) is not yet shipped, so we are
|
|
82
|
+
measuring a half-implemented feature.
|
|
83
|
+
|
|
84
|
+
**Synthesis (criterion-not-met + defer).** Both members agreed `vs_terse`
|
|
85
|
+
is the right gate. Neither's strongest path is taken in full inside
|
|
86
|
+
step-16: deprecation is reserved for a follow-up roadmap once v2 confirms
|
|
87
|
+
v1; re-bench is reserved for a follow-up roadmap with the methodology
|
|
88
|
+
revision the council requested. Step-16 ships the infrastructure (corpus,
|
|
89
|
+
bench arm, validator), records the v1 verdict, suspends the telemetry
|
|
90
|
+
multiplier, and hands the deprecate-vs-rebench call to the v2 roadmap.
|
|
44
91
|
|
|
45
92
|
## Why this is parked, not decided
|
|
46
93
|
|
|
47
|
-
The council split (Opus = remove now, o1 = measure-then-decide)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
94
|
+
The 2026-05-14 council split (Opus = remove now, o1 = measure-then-decide)
|
|
95
|
+
predated v1 numbers. The 2026-05-16 council split (Sonnet = deprecate now,
|
|
96
|
+
GPT-4o = re-bench) is informed by v1 but disagrees on which methodological
|
|
97
|
+
weakness is decisive. The kill table above gives every future bench run a
|
|
98
|
+
deterministic resolution path and stops every downstream roadmap from
|
|
99
|
+
re-litigating compression on every PR.
|
|
51
100
|
|
|
52
101
|
## Cross-references
|
|
53
102
|
|
|
54
|
-
-
|
|
55
|
-
—
|
|
56
|
-
- `
|
|
57
|
-
—
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
— implements the carve-outs and the statusline integration the
|
|
61
|
-
"flip default on" branch depends on; blocks the default flip until
|
|
62
|
-
acceptance is green.
|
|
103
|
+
- [`bench/reports/caveman-v1.md`](../../bench/reports/caveman-v1.md)
|
|
104
|
+
— v1 measurement; canonical baseline this doc cites.
|
|
105
|
+
- [`docs/benchmarks.md`](../benchmarks.md)
|
|
106
|
+
— cadence + when the next bench run is mandatory.
|
|
107
|
+
- [`caveman-telemetry`](caveman-telemetry.md)
|
|
108
|
+
— multiplier contract; records the suspended state v2 must lift.
|
|
63
109
|
- [`caveman-speak`](../../.agent-src.uncompressed/rules/caveman-speak.md)
|
|
64
110
|
— runtime rule; reads `caveman.speak_scope` from settings.
|
|
65
111
|
|
|
66
112
|
## Done
|
|
67
113
|
|
|
68
|
-
This doc
|
|
69
|
-
|
|
114
|
+
This doc reflects the v1 verdict. It is **not** an action item. The next
|
|
115
|
+
bench closeout (against `caveman-v2` once a widened corpus or revised
|
|
116
|
+
methodology is shipped) closes the loop.
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
---
|
|
2
|
+
stability: beta
|
|
3
|
+
keep-beta-until: 2026-08-15
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# cost-summary schema (`cost-summary/v1`)
|
|
7
|
+
|
|
8
|
+
Stable JSON contract for inter-tool consumption of cost-tracking data
|
|
9
|
+
emitted by [`scripts/cost_summary.py`](../../scripts/cost_summary.py).
|
|
10
|
+
Schema-versioned so downstream consumers can pin and migrate explicitly.
|
|
11
|
+
|
|
12
|
+
Design reference: Ruflo `scripts/summary.mjs` (upstream cite). Our shape
|
|
13
|
+
diverges to align with the local `agents/cost-tracking/sessions.jsonl`
|
|
14
|
+
fields and the caveman-suspended-multiplier contract.
|
|
15
|
+
|
|
16
|
+
## Envelope
|
|
17
|
+
|
|
18
|
+
```json
|
|
19
|
+
{
|
|
20
|
+
"schema_version": "cost-summary/v1",
|
|
21
|
+
"generated_at": "2026-05-16T23:45:00Z",
|
|
22
|
+
"totals": { ... },
|
|
23
|
+
"by_session": [ ... ],
|
|
24
|
+
"by_conversation": [ ... ],
|
|
25
|
+
"by_model": [ ... ]
|
|
26
|
+
}
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
| Field | Type | Notes |
|
|
30
|
+
|---|---|---|
|
|
31
|
+
| `schema_version` | string | Pinned to `cost-summary/v1`. Downstream consumers MUST refuse unknown versions. |
|
|
32
|
+
| `generated_at` | string (ISO-8601 UTC, `Z` suffix) | Emit time. |
|
|
33
|
+
| `totals` | object | Lifetime aggregates — see `totals` below. |
|
|
34
|
+
| `by_session` | array | Per `sessionId` row; ordered by `sessionId` ascending. |
|
|
35
|
+
| `by_conversation` | array | Per `conversation_id` row; ordered by `conversation_id` ascending. |
|
|
36
|
+
| `by_model` | array | Per `model` row; ordered by `model` ascending. |
|
|
37
|
+
|
|
38
|
+
## `totals` shape
|
|
39
|
+
|
|
40
|
+
```json
|
|
41
|
+
{
|
|
42
|
+
"sessions": 123,
|
|
43
|
+
"total_cost_usd": 1.2345,
|
|
44
|
+
"input_tokens": 100000,
|
|
45
|
+
"output_tokens": 50000,
|
|
46
|
+
"caveman_delta_tokens": 0,
|
|
47
|
+
"caveman_multiplier_version": "v1",
|
|
48
|
+
"caveman_multiplier_active": false
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
`caveman_delta_tokens` is always `0` while
|
|
53
|
+
`caveman_multiplier_active == false` — see
|
|
54
|
+
[`caveman-telemetry.md`](caveman-telemetry.md) for the suspension contract.
|
|
55
|
+
|
|
56
|
+
## `by_session` / `by_conversation` row shape
|
|
57
|
+
|
|
58
|
+
```json
|
|
59
|
+
{
|
|
60
|
+
"key": "<sessionId or conversation_id>",
|
|
61
|
+
"sessions": 12,
|
|
62
|
+
"total_cost_usd": 0.4567,
|
|
63
|
+
"input_tokens": 8000,
|
|
64
|
+
"output_tokens": 4500,
|
|
65
|
+
"caveman_delta_tokens": 0
|
|
66
|
+
}
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
The `key` field is the grouping identifier; consumers identify the
|
|
70
|
+
group by inspecting which array the row lives in.
|
|
71
|
+
|
|
72
|
+
## `by_model` row shape
|
|
73
|
+
|
|
74
|
+
```json
|
|
75
|
+
{
|
|
76
|
+
"model": "claude-3-5-sonnet-20241022",
|
|
77
|
+
"sessions": 12,
|
|
78
|
+
"total_cost_usd": 0.4567,
|
|
79
|
+
"input_tokens": 8000,
|
|
80
|
+
"output_tokens": 4500
|
|
81
|
+
}
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
`by_model` omits caveman fields — the multiplier is dialect-scoped, not
|
|
85
|
+
model-scoped.
|
|
86
|
+
|
|
87
|
+
## Stability guarantees
|
|
88
|
+
|
|
89
|
+
- **Field additions** are **non-breaking**: consumers MUST ignore unknown fields.
|
|
90
|
+
- **Field removals or renames** are **breaking** and bump `schema_version` (`cost-summary/v1` → `cost-summary/v2`).
|
|
91
|
+
- **Type changes** are **breaking** and bump `schema_version` (`cost-summary/v1` → `cost-summary/v2`).
|
|
92
|
+
- Downstream consumers SHOULD pin to a specific `schema_version` and
|
|
93
|
+
MUST refuse unknown ones; the pin is the migration boundary.
|
|
94
|
+
|
|
95
|
+
## Downstream consumers
|
|
96
|
+
|
|
97
|
+
- `agent-status` skill — surfaces lifetime / current-conversation slice.
|
|
98
|
+
- Future `cost-export-to-monitoring` scripts (deferred; trigger:
|
|
99
|
+
consumer request) would wrap this JSON to push to Prometheus / OTLP.
|
|
100
|
+
|
|
101
|
+
## See also
|
|
102
|
+
|
|
103
|
+
- [`caveman-telemetry.md`](caveman-telemetry.md) — defines the
|
|
104
|
+
`caveman_*` fields and the suspended-multiplier contract.
|
|
105
|
+
- [`scripts/cost_summary.py`](../../scripts/cost_summary.py) — implementation.
|
|
106
|
+
- [`scripts/cost_by_conversation.py`](../../scripts/cost_by_conversation.py) — narrower per-conversation lens with the same JSONL source.
|
|
107
|
+
- [`scripts/caveman_stats.py`](../../scripts/caveman_stats.py) — caveman-only delta lens with the same JSONL source.
|
|
@@ -1800,6 +1800,12 @@
|
|
|
1800
1800
|
"load_context": [],
|
|
1801
1801
|
"load_context_eager": []
|
|
1802
1802
|
},
|
|
1803
|
+
".agent-src.uncompressed/skills/compress-memory/SKILL.md": {
|
|
1804
|
+
"kind": "skill",
|
|
1805
|
+
"rule_type": null,
|
|
1806
|
+
"load_context": [],
|
|
1807
|
+
"load_context_eager": []
|
|
1808
|
+
},
|
|
1803
1809
|
".agent-src.uncompressed/skills/content-funnel-design/SKILL.md": {
|
|
1804
1810
|
"kind": "skill",
|
|
1805
1811
|
"rule_type": null,
|
|
@@ -6396,6 +6402,13 @@
|
|
|
6396
6402
|
"via": "self",
|
|
6397
6403
|
"depth": 0
|
|
6398
6404
|
},
|
|
6405
|
+
{
|
|
6406
|
+
"source": ".agent-src.uncompressed/rules/caveman-speak.md",
|
|
6407
|
+
"target": ".agent-src.uncompressed/skills/compress-memory/SKILL.md",
|
|
6408
|
+
"type": "READ_ONLY",
|
|
6409
|
+
"via": "body_link",
|
|
6410
|
+
"depth": 1
|
|
6411
|
+
},
|
|
6399
6412
|
{
|
|
6400
6413
|
"source": ".agent-src.uncompressed/rules/cli-output-handling.md",
|
|
6401
6414
|
"target": ".agent-src.uncompressed/rules/cli-output-handling.md",
|
|
@@ -8048,6 +8061,34 @@
|
|
|
8048
8061
|
"via": "self",
|
|
8049
8062
|
"depth": 0
|
|
8050
8063
|
},
|
|
8064
|
+
{
|
|
8065
|
+
"source": ".agent-src.uncompressed/skills/compress-memory/SKILL.md",
|
|
8066
|
+
"target": ".agent-src.uncompressed/rules/caveman-speak.md",
|
|
8067
|
+
"type": "READ_ONLY",
|
|
8068
|
+
"via": "body_link",
|
|
8069
|
+
"depth": 1
|
|
8070
|
+
},
|
|
8071
|
+
{
|
|
8072
|
+
"source": ".agent-src.uncompressed/skills/compress-memory/SKILL.md",
|
|
8073
|
+
"target": ".agent-src.uncompressed/rules/role-mode-adherence.md",
|
|
8074
|
+
"type": "READ_ONLY",
|
|
8075
|
+
"via": "body_link",
|
|
8076
|
+
"depth": 1
|
|
8077
|
+
},
|
|
8078
|
+
{
|
|
8079
|
+
"source": ".agent-src.uncompressed/skills/compress-memory/SKILL.md",
|
|
8080
|
+
"target": ".agent-src.uncompressed/skills/agents-md-thin-root/SKILL.md",
|
|
8081
|
+
"type": "READ_ONLY",
|
|
8082
|
+
"via": "body_link",
|
|
8083
|
+
"depth": 1
|
|
8084
|
+
},
|
|
8085
|
+
{
|
|
8086
|
+
"source": ".agent-src.uncompressed/skills/compress-memory/SKILL.md",
|
|
8087
|
+
"target": ".agent-src.uncompressed/skills/compress-memory/SKILL.md",
|
|
8088
|
+
"type": "WRITE",
|
|
8089
|
+
"via": "self",
|
|
8090
|
+
"depth": 0
|
|
8091
|
+
},
|
|
8051
8092
|
{
|
|
8052
8093
|
"source": ".agent-src.uncompressed/skills/content-funnel-design/SKILL.md",
|
|
8053
8094
|
"target": ".agent-src.uncompressed/skills/activation-design/SKILL.md",
|
package/package.json
CHANGED
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
# Caveman compression bench — step-16 Phase 1 Step 4.
|
|
2
|
+
#
|
|
3
|
+
# Three-arm live bench against bench/corpora/caveman/prompts.yaml:
|
|
4
|
+
# compressed — system prompt embeds caveman-speak rule (aggressive).
|
|
5
|
+
# terse_control — system prompt = "Answer concisely. …" (carve-out-free baseline).
|
|
6
|
+
# uncompressed — generic helpful-assistant system prompt.
|
|
7
|
+
#
|
|
8
|
+
# Token counts come from Anthropic API `usage` (authoritative). Carve-out
|
|
9
|
+
# share is measured via regex extraction on the reply text; chars/4 yields
|
|
10
|
+
# an estimated carve-out-token figure for the carve-out-tax accounting.
|
|
11
|
+
#
|
|
12
|
+
# Cost-touch: 10 prompts × 3 arms × claude-sonnet-4-5 (~$3/M in, ~$15/M out).
|
|
13
|
+
"""Caveman compression bench runner."""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
import statistics
|
|
18
|
+
import time
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
import yaml
|
|
24
|
+
|
|
25
|
+
# ── system prompts per arm ──────────────────────────────────────────────
|
|
26
|
+
|
|
27
|
+
SYSTEM_PROMPT_COMPRESSED = """You are speaking in CAVEMAN-SPEAK mode (speak_scope=aggressive).
|
|
28
|
+
|
|
29
|
+
Compress all body prose to caveman grammar:
|
|
30
|
+
- Drop articles (the, a, an).
|
|
31
|
+
- Drop linking auxiliaries (is, are, was, be) where unambiguous.
|
|
32
|
+
- Drop pronouns when context is clear.
|
|
33
|
+
- Keep nouns, verbs, key adjectives, negation, numbers.
|
|
34
|
+
- Example: "I will now check the file and see if it exists" -> "Check file. Exists?"
|
|
35
|
+
|
|
36
|
+
Carve-outs — preserve BYTE-FOR-BYTE (do NOT compress these):
|
|
37
|
+
1. Triple-backtick code/literal blocks (any language, including ALL-CAPS Iron-Law fences).
|
|
38
|
+
2. Numbered-options lines matching ^\\d+\\.\\s + a **Recommendation:** label.
|
|
39
|
+
3. Backtick spans (file paths, command names, identifiers).
|
|
40
|
+
4. Status markers: lines starting with ❌, ⚠️, or ✅.
|
|
41
|
+
5. Mode markers.
|
|
42
|
+
6. Markdown tables.
|
|
43
|
+
7. Deliverables (PR titles, commit messages, ticket summaries, articles, the prompt
|
|
44
|
+
line of any single question asked to the user).
|
|
45
|
+
|
|
46
|
+
Apply caveman compression aggressively to every other prose surface."""
|
|
47
|
+
|
|
48
|
+
SYSTEM_PROMPT_TERSE = (
|
|
49
|
+
"Answer concisely. Skip preamble. Do not restate the question. "
|
|
50
|
+
"Avoid filler phrases ('Let me', 'Here is', 'I will'). Get to the answer."
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
SYSTEM_PROMPT_UNCOMPRESSED = (
|
|
54
|
+
"You are a helpful AI assistant. Answer the user's question clearly and completely."
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
ARMS: tuple[str, ...] = ("compressed", "terse_control", "uncompressed")
|
|
58
|
+
ARM_SYSTEM_PROMPT: dict[str, str] = {
|
|
59
|
+
"compressed": SYSTEM_PROMPT_COMPRESSED,
|
|
60
|
+
"terse_control": SYSTEM_PROMPT_TERSE,
|
|
61
|
+
"uncompressed": SYSTEM_PROMPT_UNCOMPRESSED,
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
# ── carve-out detection ────────────────────────────────────────────────
|
|
65
|
+
|
|
66
|
+
# Carve-out patterns. The line-anchored ones use `[^\S\n]` ("any whitespace
# except newline") instead of bare `\s`: under re.MULTILINE a bare `\s` also
# matches "\n", which lets a match start on a preceding blank line (or run on
# past the end of its own line) and counts that extra text as carve-out.
_RE_TRIPLE_BACKTICK = re.compile(r"```[\s\S]*?```")
_RE_BACKTICK_SPAN = re.compile(r"`[^`\n]+`")
_RE_NUMBERED_LINE = re.compile(r"^>?[^\S\n]*\d+\.[^\S\n].*$", re.MULTILINE)
_RE_STATUS_LINE = re.compile(r"^(❌|⚠️|✅).*$", re.MULTILINE)
_RE_TABLE_LINE = re.compile(r"^[^\S\n]*\|.*\|[^\S\n]*$", re.MULTILINE)
_RE_RECOMMENDATION = re.compile(r"^\*\*(Recommendation|Empfehlung):\*\*.*$", re.MULTILINE)


def carve_out_chars(text: str) -> int:
    """Count the characters of ``text`` covered by at least one carve-out.

    The count is in characters (``str`` indices), not encoded bytes. Regions
    matched by several patterns are counted once: every match marks its span
    in a per-character mask and the result is the number of marked positions.

    Args:
        text: Reply text to scan. Empty text yields ``0``.

    Returns:
        Number of characters inside the union of all carve-out matches.
    """
    if not text:
        return 0
    # One flag per character; 1 means "inside some carve-out".
    covered = bytearray(len(text))
    for pattern in (
        _RE_TRIPLE_BACKTICK, _RE_BACKTICK_SPAN, _RE_NUMBERED_LINE,
        _RE_STATUS_LINE, _RE_TABLE_LINE, _RE_RECOMMENDATION,
    ):
        for match in pattern.finditer(text):
            start, end = match.span()
            # Same-length slice assignment marks the whole span in one step.
            covered[start:end] = b"\x01" * (end - start)
    return sum(covered)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# ── data shapes ────────────────────────────────────────────────────────
|
|
90
|
+
|
|
91
|
+
@dataclass
class ArmResult:
    """Outcome of a single arm invocation for one prompt."""

    # Arm name the call was made under.
    arm: str
    # Reply text (empty string when the call produced none).
    text: str
    # Token counts for the call.
    input_tokens: int
    output_tokens: int
    # Call latency in milliseconds.
    latency_ms: int
    # Character count of the reply and of its carve-out regions.
    output_chars: int
    carve_out_chars: int
    # Error description, or None when no error was recorded.
    error: str | None = None

    @property
    def realised_carve_out_pct(self) -> float:
        """Return carve-out characters as a fraction of output characters.

        Yields ``0.0`` when ``output_chars`` is zero, avoiding a division by
        zero. The value is a ratio, not a percentage scaled by 100.
        """
        if not self.output_chars:
            return 0.0
        return self.carve_out_chars / self.output_chars
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@dataclass
class PromptResult:
    """Per-prompt bench record: corpus metadata plus one result per arm."""

    # Prompt identifier, category and expected carve-out share from the corpus.
    id: str
    category: str
    expected_carve_out_pct: float
    # Arm name -> result for that arm.
    arms: dict[str, ArmResult] = field(default_factory=dict)

    def _savings_against(self, baseline_arm: str) -> float | None:
        """Return output-token savings of the compressed arm vs ``baseline_arm``.

        Savings are ``1 - compressed_tokens / baseline_tokens``; negative
        values mean the compressed reply was longer than the baseline.

        Returns ``None`` when the comparison is not meaningful: either arm is
        missing, either arm recorded an error, or the baseline produced zero
        output tokens. The error check matters because a failed compressed
        call with ``output_tokens == 0`` would otherwise evaluate to ``1.0``
        (100 % savings) and be counted as a perfect win.
        """
        compressed = self.arms.get("compressed")
        baseline = self.arms.get(baseline_arm)
        if compressed is None or baseline is None:
            return None
        if compressed.error or baseline.error:
            return None
        if baseline.output_tokens == 0:
            return None
        return 1.0 - (compressed.output_tokens / baseline.output_tokens)

    @property
    def savings_vs_raw(self) -> float | None:
        """Savings of ``compressed`` against the ``uncompressed`` arm, or None."""
        return self._savings_against("uncompressed")

    @property
    def savings_vs_terse(self) -> float | None:
        """Savings of ``compressed`` against the ``terse_control`` arm, or None."""
        return self._savings_against("terse_control")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ── corpus + runner ────────────────────────────────────────────────────
|
|
132
|
+
|
|
133
|
+
def load_corpus(corpus_path: Path) -> list[dict[str, Any]]:
    """Read the prompt corpus YAML and return its list of prompt dicts.

    The file is expected to be a mapping with a non-empty ``prompts`` list.

    Args:
        corpus_path: Path to the corpus YAML file (read as UTF-8).

    Returns:
        The value of the top-level ``prompts`` key.

    Raises:
        ValueError: If the document root is not a mapping, ``prompts`` is
            missing or empty, or ``prompts`` is not a list.
    """
    data = yaml.safe_load(corpus_path.read_text(encoding="utf-8")) or {}
    # A list or scalar root has no `.get`; report it as a corpus problem
    # rather than letting an AttributeError escape.
    if not isinstance(data, dict):
        raise ValueError(f"malformed corpus (root is not a mapping): {corpus_path}")
    prompts = data.get("prompts") or []
    if not prompts:
        raise ValueError(f"empty corpus: {corpus_path}")
    if not isinstance(prompts, list):
        raise ValueError(f"malformed corpus ('prompts' is not a list): {corpus_path}")
    return prompts
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def run_arm(
    client: Any,
    arm: str,
    user_prompt: str,
    *,
    max_tokens: int = 1024,
) -> ArmResult:
    """Run one arm for one prompt and package the outcome as an ArmResult.

    Args:
        client: Object exposing ``ask(system, user_prompt, max_tokens=...)``.
        arm: Key into ``ARM_SYSTEM_PROMPT`` selecting the system prompt.
        user_prompt: The user message sent to the client.
        max_tokens: Passed through to ``client.ask``.

    Returns:
        An ArmResult. If ``client.ask`` raises, the result carries empty text,
        zero token/char counts, the elapsed time, and ``str(exc)`` as error.
    """
    started = time.monotonic()
    system_prompt = ARM_SYSTEM_PROMPT[arm]
    try:
        reply = client.ask(system_prompt, user_prompt, max_tokens=max_tokens)
    except Exception as exc:  # noqa: BLE001
        # Best-effort: a failed call becomes an error record, not a crash.
        elapsed_ms = int((time.monotonic() - started) * 1000)
        return ArmResult(
            arm=arm,
            text="",
            input_tokens=0,
            output_tokens=0,
            latency_ms=elapsed_ms,
            output_chars=0,
            carve_out_chars=0,
            error=str(exc),
        )

    reply_text = reply.text or ""
    tokens_in = int(reply.input_tokens or 0)
    tokens_out = int(reply.output_tokens or 0)
    # Prefer the latency reported on the reply; otherwise use wall-clock time.
    latency = int(reply.latency_ms or (time.monotonic() - started) * 1000)
    return ArmResult(
        arm=arm,
        text=reply_text,
        input_tokens=tokens_in,
        output_tokens=tokens_out,
        latency_ms=latency,
        output_chars=len(reply_text),
        carve_out_chars=carve_out_chars(reply_text),
        error=reply.error,
    )
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# ── aggregation ────────────────────────────────────────────────────────────
|
|
171
|
+
|
|
172
|
+
def _stats(values: list[float]) -> dict[str, float]:
|
|
173
|
+
"""Median / p10 / p90 / stdev / n on a list of floats. Empty → zeros."""
|
|
174
|
+
if not values:
|
|
175
|
+
return {"n": 0, "median": 0.0, "p10": 0.0, "p90": 0.0, "stdev": 0.0}
|
|
176
|
+
s = sorted(values)
|
|
177
|
+
n = len(s)
|
|
178
|
+
def _pct(p: float) -> float:
|
|
179
|
+
if n == 1:
|
|
180
|
+
return s[0]
|
|
181
|
+
k = (n - 1) * p
|
|
182
|
+
lo, hi = int(k), min(int(k) + 1, n - 1)
|
|
183
|
+
return s[lo] + (s[hi] - s[lo]) * (k - lo)
|
|
184
|
+
return {
|
|
185
|
+
"n": n,
|
|
186
|
+
"median": statistics.median(s),
|
|
187
|
+
"p10": _pct(0.10),
|
|
188
|
+
"p90": _pct(0.90),
|
|
189
|
+
"stdev": statistics.pstdev(s) if n > 1 else 0.0,
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def aggregate_results(results: list[PromptResult]) -> dict[str, Any]:
    """Aggregate per-prompt results into corpus-level summary statistics.

    Args:
        results: One PromptResult per corpus prompt.

    Returns:
        A dict with ``_stats`` summaries under ``savings_vs_raw``,
        ``savings_vs_terse``, ``realised_carve_out_pct`` and
        ``expected_carve_out_pct``, plus ``output_tokens`` mapping each arm
        in ``ARMS`` to a summary of its output-token counts.
    """
    vs_raw: list[float] = []
    vs_terse: list[float] = []
    for r in results:
        # Read each savings property once; None means "no valid comparison".
        raw = r.savings_vs_raw
        if raw is not None:
            vs_raw.append(raw)
        terse = r.savings_vs_terse
        if terse is not None:
            vs_terse.append(terse)

    realised_carve_pct = [
        r.arms["compressed"].realised_carve_out_pct
        for r in results
        if "compressed" in r.arms and r.arms["compressed"].output_chars
    ]
    expected_carve_pct = [r.expected_carve_out_pct for r in results]

    per_arm_tokens: dict[str, list[int]] = {a: [] for a in ARMS}
    for r in results:
        for arm in ARMS:
            ar = r.arms.get(arm)
            # An errored call is not a measurement: keeping it would add a
            # spurious low sample and pull the arm's median down.
            if ar is not None and not ar.error:
                per_arm_tokens[arm].append(ar.output_tokens)

    return {
        "savings_vs_raw": _stats(vs_raw),
        "savings_vs_terse": _stats(vs_terse),
        "realised_carve_out_pct": _stats(realised_carve_pct),
        "expected_carve_out_pct": _stats(expected_carve_pct),
        "output_tokens": {
            arm: _stats([float(v) for v in per_arm_tokens[arm]]) for arm in ARMS
        },
    }
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def compute_cost(results: list[PromptResult], pricing: dict[str, float]) -> dict[str, Any]:
    """Total up token usage and cost over every arm of every prompt.

    Args:
        results: Per-prompt results whose ``arms`` hold the individual calls.
        pricing: Price per one million tokens under the keys ``"input"`` and
            ``"output"``; a missing key is treated as ``0.0``.

    Returns:
        ``{"totals": ..., "per_arm": ...}``. ``totals`` carries token sums,
        call and error counts and ``total_cost_usd`` rounded to six decimals;
        ``per_arm`` carries token sums and call counts for each arm in ``ARMS``.
    """
    per_arm: dict[str, dict[str, int]] = {
        name: {"input_tokens": 0, "output_tokens": 0, "calls": 0} for name in ARMS
    }
    totals = {"input_tokens": 0, "output_tokens": 0, "calls": 0, "errors": 0}

    for prompt_result in results:
        for arm_name, outcome in prompt_result.arms.items():
            # The same three counters advance in the grand total and the arm bucket.
            for bucket in (totals, per_arm[arm_name]):
                bucket["input_tokens"] += outcome.input_tokens
                bucket["output_tokens"] += outcome.output_tokens
                bucket["calls"] += 1
            if outcome.error:
                totals["errors"] += 1

    input_cost = totals["input_tokens"] / 1e6 * pricing.get("input", 0.0)
    output_cost = totals["output_tokens"] / 1e6 * pricing.get("output", 0.0)
    totals["total_cost_usd"] = round(input_cost + output_cost, 6)
    return {"totals": totals, "per_arm": per_arm}
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# ── orchestrator ───────────────────────────────────────────────────────────
|
|
244
|
+
|
|
245
|
+
def run_caveman_bench(
    client: Any,
    corpus_path: Path,
    *,
    max_prompts: int | None = None,
    max_tokens: int = 1024,
    on_progress: Any = None,
) -> list[PromptResult]:
    """Run every arm in ``ARMS`` over the corpus and collect the results.

    Args:
        client: Passed to ``run_arm`` for each call.
        corpus_path: Corpus file handed to ``load_corpus``.
        max_prompts: When truthy, only the first ``max_prompts`` prompts are
            run; ``None`` or ``0`` runs the whole corpus.
        max_tokens: Passed through to ``run_arm``.
        on_progress: Optional callback invoked after each call as
            ``on_progress(done, total, prompt_id, arm, arm_result)``.

    Returns:
        One PromptResult per prompt, in corpus order.
    """
    corpus = load_corpus(corpus_path)
    selected = corpus[:max_prompts] if max_prompts else corpus
    planned_calls = len(selected) * len(ARMS)
    completed = 0
    collected: list[PromptResult] = []

    for entry in selected:
        record = PromptResult(
            id=str(entry["id"]),
            category=str(entry.get("category", "unknown")),
            expected_carve_out_pct=float(entry.get("expected_carve_out_pct", 0.0)),
        )
        user_prompt = str(entry["prompt"])
        for arm_name in ARMS:
            outcome = run_arm(client, arm_name, user_prompt, max_tokens=max_tokens)
            record.arms[arm_name] = outcome
            completed += 1
            if on_progress:
                on_progress(completed, planned_calls, record.id, arm_name, outcome)
        collected.append(record)

    return collected
|