@event4u/agent-config 4.9.0 → 5.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/commands/implement-ticket.md +5 -4
- package/.agent-src/contexts/execution/roadmap-process-loop.md +30 -4
- package/.agent-src/rules/language-and-tone.md +4 -10
- package/.agent-src/rules/linked-projects-onboarding-gate.md +82 -0
- package/.agent-src/rules/roadmap-progress-sync.md +39 -5
- package/.agent-src/scripts/update_roadmap_progress.py +63 -7
- package/.agent-src/skills/command-routing/SKILL.md +5 -4
- package/.agent-src/skills/roadmap-management/SKILL.md +121 -21
- package/.agent-src/skills/roadmap-writing/SKILL.md +63 -0
- package/.agent-src/templates/agent-settings.md +16 -0
- package/.agent-src/templates/roadmaps.md +22 -1
- package/.agent-src/templates/scripts/work_engine/_lib/agent_settings.py +20 -3
- package/.claude-plugin/marketplace.json +1 -1
- package/CHANGELOG.md +106 -0
- package/CONTRIBUTING.md +19 -0
- package/README.md +12 -1
- package/dist/cli/registry.js +0 -2
- package/dist/cli/registry.js.map +1 -1
- package/dist/discovery/deprecation-report.md +1 -1
- package/dist/discovery/discovery-manifest.json +36 -14
- package/dist/discovery/discovery-manifest.json.sha256 +1 -1
- package/dist/discovery/discovery-manifest.summary.md +3 -3
- package/dist/discovery/orphan-report.md +1 -1
- package/dist/discovery/packs.json +6 -5
- package/dist/discovery/trust-report.md +3 -3
- package/dist/discovery/workspaces.json +5 -4
- package/dist/mcp/registry-manifest.json +3 -3
- package/dist/router.json +1 -1671
- package/docs/architecture.md +1 -1
- package/docs/benchmark.md +20 -8
- package/docs/benchmarks.md +11 -0
- package/docs/catalog.md +3 -2
- package/docs/contracts/benchmark-corpus-spec.md +31 -3
- package/docs/contracts/command-surface-tiers.md +1 -1
- package/docs/contracts/hook-architecture-v1.md +33 -0
- package/docs/contracts/migrate-command.md +197 -0
- package/docs/contracts/settings-api.md +2 -1
- package/docs/contracts/value-dashboard-spec.md +374 -0
- package/docs/contracts/value-report-schema.md +150 -0
- package/docs/decisions/ADR-031-validation-severity-tiers-and-projection-roundtrip.md +97 -0
- package/docs/decisions/ADR-032-linked-projects-scope.md +118 -0
- package/docs/decisions/INDEX.md +2 -0
- package/docs/getting-started.md +1 -1
- package/docs/guidelines/agent-infra/installed-tools-manifest.md +6 -3
- package/docs/guidelines/agent-infra/language-and-tone-examples.md +35 -0
- package/docs/guides/cross-repo-linked-projects.md +86 -0
- package/docs/migration/v1-to-v2.md +40 -27
- package/docs/value.md +84 -0
- package/package.json +8 -8
- package/scripts/__pycache__/validate_frontmatter.cpython-312.pyc +0 -0
- package/scripts/_cli/cmd_migrate.py +264 -102
- package/scripts/_cli/cmd_settings_migrate.py +2 -1
- package/scripts/_dispatch.bash +147 -49
- package/scripts/_lib/__pycache__/__init__.cpython-312.pyc +0 -0
- package/scripts/_lib/__pycache__/agent_src.cpython-312.pyc +0 -0
- package/scripts/_lib/agent_settings.py +20 -3
- package/scripts/_lib/install_regenerator.py +129 -0
- package/scripts/_lib/linked_projects.py +238 -0
- package/scripts/_lib/value_ladder.py +599 -0
- package/scripts/_lib/value_report.py +441 -0
- package/scripts/bench_rtk_savings.py +320 -0
- package/scripts/check_no_local_settings_committed.py +51 -0
- package/scripts/compile_router.py +19 -5
- package/scripts/expected_perms.json +1 -1
- package/scripts/first_run_gate_hook.py +178 -0
- package/scripts/hook_manifest.yaml +16 -7
- package/scripts/hooks/dispatch_hook.py +27 -0
- package/scripts/hooks/dispatch_issues.py +136 -0
- package/scripts/hooks_doctor.py +40 -1
- package/scripts/install.py +25 -21
- package/scripts/lint_agents_layout.py +5 -4
- package/scripts/lint_bench_corpus.py +86 -4
- package/scripts/lint_global_paths.py +4 -3
- package/scripts/lint_marketplace_install_completeness.py +188 -0
- package/scripts/lint_value_dashboard.py +218 -0
- package/scripts/render_benchmark_md.py +6 -2
- package/scripts/render_value_md.py +355 -0
- package/scripts/repro/repro_marketplace_install_gap.sh +161 -0
- package/scripts/roadmap_progress_hook.py +23 -0
- package/scripts/router_telemetry.py +470 -0
- package/scripts/validate_frontmatter.py +23 -9
- package/scripts/_cli/cmd_migrate_to_global.py +0 -415
|
@@ -0,0 +1,599 @@
|
|
|
1
|
+
"""Pure normaliser: raw bench reports → `value-v1` rung dicts.
|
|
2
|
+
|
|
3
|
+
Phase 1 Step 2 of `agents/roadmaps/road-to-readable-value-dashboard.md`.
|
|
4
|
+
|
|
5
|
+
This module is **pure** — no I/O, no file reads, no clock. Inputs are
|
|
6
|
+
already-loaded dicts; outputs are rung dicts conforming to
|
|
7
|
+
`docs/contracts/value-report-schema.md`. The companion
|
|
8
|
+
`scripts/_lib/value_report.py` owns the I/O wrapper that loads the raw
|
|
9
|
+
reports, calls these functions, and writes the assembled JSON.
|
|
10
|
+
|
|
11
|
+
Rung dict shape (see `value-report-schema.md` for the full contract):
|
|
12
|
+
|
|
13
|
+
{
|
|
14
|
+
"id": "<kebab-case>",
|
|
15
|
+
"label": "<German + English>",
|
|
16
|
+
"what_it_does": "<≤ 80 char phrase>",
|
|
17
|
+
"token_delta": <signed int>,
|
|
18
|
+
"eur_delta": <float>,
|
|
19
|
+
"cumulative_pct": <signed float>, # filled in by assemble_ladder
|
|
20
|
+
"confidence": "measured" | "estimated" | "vendor-claim" | "pending",
|
|
21
|
+
"source_report": "<relative path>",
|
|
22
|
+
"footnote": "<optional caveat>", # omitted when no caveat
|
|
23
|
+
}
|
|
24
|
+
"""
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from typing import Any, Dict, List, Optional
|
|
28
|
+
|
|
29
|
+
# ── Reference scale defaults ────────────────────────────────────────────
|
|
30
|
+
|
|
31
|
+
# Default reference scale. The per-function fallbacks used elsewhere in this
# module for `requests` (1000) and `avg_output_tokens` (600) mirror these values.
DEFAULT_REFERENCE_SCALE = dict(
    requests=1000,
    avg_input_tokens=8000,
    avg_output_tokens=600,
    model_tier="sonnet",
)
|
|
37
|
+
|
|
38
|
+
# ── Pricing ─────────────────────────────────────────────────────────────
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def price_tokens_eur(
    input_tokens: int,
    output_tokens: int,
    pricing_row: Dict[str, Any],
    eur_per_usd: float = 0.92,
) -> float:
    """Return the € cost of an (input, output) token pair.

    Args:
        input_tokens: Number of input tokens to price (may be negative
            when pricing a saving).
        output_tokens: Number of output tokens to price.
        pricing_row: Mapping read for the keys ``"input"`` and
            ``"output"``; each is treated as a USD rate per one million
            tokens. A missing key counts as a rate of 0.0.
        eur_per_usd: Multiplier applied to the USD total (default 0.92).

    Returns:
        The combined input + output cost converted to €.
    """
    usd_for_input = (input_tokens / 1_000_000.0) * float(
        pricing_row.get("input", 0.0)
    )
    usd_for_output = (output_tokens / 1_000_000.0) * float(
        pricing_row.get("output", 0.0)
    )
    usd_total = usd_for_input + usd_for_output
    return usd_total * eur_per_usd
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def price_input_delta_eur(
    token_delta_per_request: int,
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> float:
    """Return the € value of a per-request input-token delta.

    The delta is multiplied by ``reference_scale["requests"]`` (1000 when
    the key is absent) and priced as input tokens only, with zero output
    tokens, via `price_tokens_eur` at its default exchange rate.
    """
    request_count = int(reference_scale.get("requests", 1000))
    scaled_input_tokens = token_delta_per_request * request_count
    return price_tokens_eur(scaled_input_tokens, 0, pricing_row)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def price_output_delta_eur(
    token_delta_per_request: int,
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> float:
    """Return the € value of a per-request output-token delta.

    The delta is multiplied by ``reference_scale["requests"]`` (1000 when
    the key is absent) and priced as output tokens only, with zero input
    tokens, via `price_tokens_eur` at its default exchange rate.
    """
    request_count = int(reference_scale.get("requests", 1000))
    scaled_output_tokens = token_delta_per_request * request_count
    return price_tokens_eur(0, scaled_output_tokens, pricing_row)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ── Pending-rung factory ────────────────────────────────────────────────
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def pending_rung(
    rung_id: str,
    label: str,
    what_it_does: str,
    source_report: str,
    footnote: Optional[str] = None,
) -> Dict[str, Any]:
    """Build a placeholder rung for a measurement that is not available yet.

    The rung carries zero token / € deltas and ``confidence="pending"``.
    A ``"footnote"`` key is appended only when `footnote` is truthy, so
    both ``None`` and the empty string leave it out.
    """
    placeholder: Dict[str, Any] = dict(
        id=rung_id,
        label=label,
        what_it_does=what_it_does,
        token_delta=0,
        eur_delta=0.0,
        cumulative_pct=0.0,  # overwritten later by assemble_ladder
        confidence="pending",
        source_report=source_report,
    )
    if not footnote:
        return placeholder
    placeholder["footnote"] = footnote
    return placeholder
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# ── Rung extractors ─────────────────────────────────────────────────────
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def baseline_rung(reference_scale: Dict[str, Any]) -> Dict[str, Any]:
    """Return the zero-point rung of the ladder.

    Every delta is zero and the rung is marked ``measured``. The
    `reference_scale` argument is accepted but not read by this function.
    """
    zero_point: Dict[str, Any] = dict(
        id="baseline",
        label="Ohne Paket / Without package",
        what_it_does="Baseline — der nackte Request ohne Paket-Regeln.",
        token_delta=0,
        eur_delta=0.0,
        cumulative_pct=0.0,
        confidence="measured",
        source_report="n/a",
    )
    return zero_point
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def load_rung_from_router(
    router: Optional[Dict[str, Any]],
    rule_chars: Optional[Dict[str, int]],
    charter_chars: int,
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> Dict[str, Any]:
    """Build the package-load rung from the router's kernel list.

    Unlike `load_rung_from_frugality`, which reads a pre-computed
    footprint record, this function sums per-rule character counts for
    the ids listed under ``router["kernel"]``.

    Args:
        router: Decoded router dict; only its ``"kernel"`` entry is read.
        rule_chars: ``{rule_id: char_count}`` mapping. ``None`` is treated
            as empty, and ids missing from the mapping count as 0 chars.
        charter_chars: Character count of the charter, added to the
            kernel total.
        reference_scale: Forwarded to `price_input_delta_eur`.
        pricing_row: Forwarded to `price_input_delta_eur`.

    Returns:
        A ``measured`` rung whose ``token_delta`` is the combined
        character count floor-divided by 4, or a ``pending`` rung citing
        ``dist/router.json`` when `router` is falsy or has no ``"kernel"``
        key.
    """
    rung_label = "Mit Paket (Regeln laden) / With package (rule load)"
    rung_summary = "Die immer-aktiven Regeln landen im Kontext jedes Requests."
    router_path = "dist/router.json"

    router_usable = bool(router) and "kernel" in router
    if not router_usable:
        return pending_rung(
            "load",
            rung_label,
            rung_summary,
            router_path,
            footnote="Run scripts/compile_router.py to generate the router.",
        )

    chars_by_rule = rule_chars or {}
    kernel_rules = list(router["kernel"])
    kernel_chars = 0
    for rule_id in kernel_rules:
        # Unknown rule ids contribute nothing rather than raising.
        kernel_chars += int(chars_by_rule.get(rule_id, 0))
    combined_chars = kernel_chars + int(charter_chars)
    # 4 chars/token approximation, consistent with measure_frugality_savings.py.
    approx_tokens = combined_chars // 4

    return {
        "id": "load",
        "label": rung_label,
        "what_it_does": rung_summary,
        "token_delta": approx_tokens,
        "eur_delta": price_input_delta_eur(
            approx_tokens, reference_scale, pricing_row
        ),
        "cumulative_pct": 0.0,
        "confidence": "measured",
        "source_report": router_path,
        "footnote": (
            f"Kernel = {len(kernel_rules)} rules ({kernel_chars} chars) "
            f"+ charter ({int(charter_chars)} chars); tokens ≈ chars / 4."
        ),
    }
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def load_rung_from_frugality(
    frugality_record: Optional[Dict[str, Any]],
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> Dict[str, Any]:
    """Build the package-load rung from a frugality baseline record.

    **Deprecated**: kept as a back-compat fallback for when
    ``dist/router.json`` is missing; new callers should prefer
    `load_rung_from_router()`.

    Args:
        frugality_record: One decoded record; its ``"metric_a_footprint"``
            mapping is read for ``kernel_total_chars``,
            ``tier_1_total_chars``, ``tier_2_total_chars`` and
            ``charter_chars`` (each defaulting to 0 when absent).
        reference_scale: Forwarded to `price_input_delta_eur`.
        pricing_row: Forwarded to `price_input_delta_eur`.

    Returns:
        A ``measured`` rung whose ``token_delta`` is the summed character
        count floor-divided by 4, or a ``pending`` rung when
        `frugality_record` is falsy.
    """
    rung_label = "Mit Paket (Regeln laden) / With package (rule load)"
    rung_summary = "Die immer-aktiven Regeln landen im Kontext jedes Requests."
    record_path = "agents/runtime/frugality/baseline.jsonl"

    if not frugality_record:
        return pending_rung(
            "load",
            rung_label,
            rung_summary,
            record_path,
            footnote="Run scripts/measure_frugality_savings.py to populate.",
        )

    footprint = frugality_record.get("metric_a_footprint", {})
    footprint_keys = (
        "kernel_total_chars",
        "tier_1_total_chars",
        "tier_2_total_chars",
        "charter_chars",
    )
    always_loaded_chars = sum(int(footprint.get(key, 0)) for key in footprint_keys)
    # 4 chars/token approximation, consistent with measure_frugality_savings.py.
    approx_tokens = always_loaded_chars // 4

    return {
        "id": "load",
        "label": rung_label,
        "what_it_does": rung_summary,
        "token_delta": approx_tokens,
        "eur_delta": price_input_delta_eur(
            approx_tokens, reference_scale, pricing_row
        ),
        "cumulative_pct": 0.0,
        "confidence": "measured",
        "source_report": record_path,
        "footnote": (
            "Always-loaded footprint = kernel + tier_1 + tier_2 + charter; "
            "tokens ≈ chars / 4."
        ),
    }
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def condense_rung_from_telegraph_v2(
    telegraph_v2: Optional[Dict[str, Any]],
    baseline_input_tokens: int,
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> Dict[str, Any]:
    """Build the condense rung from the telegraph-v2 aggregate.

    The saving percentage is the plain mean of the per-category medians
    in ``aggregate["by_category_median_pct"]``, skipping every category
    whose name starts with ``"thin-root-"``. When no category survives
    that filter, ``aggregate["median_saving_pct"]`` (default 0.0) is used
    instead.

    Args:
        telegraph_v2: Decoded report; must contain an ``"aggregate"`` key.
        baseline_input_tokens: Token count the saving percentage is
            applied to.
        reference_scale: Forwarded to `price_input_delta_eur`.
        pricing_row: Forwarded to `price_input_delta_eur`.

    Returns:
        A ``measured`` rung whose ``token_delta`` is negative for a
        positive saving percentage, or a ``pending`` rung when the report
        is falsy or lacks ``"aggregate"``.
    """
    rung_label = "+ condense (Regeln eindampfen) / + condense (rule shrink)"
    rung_summary = "Build-Schritt schrumpft Regel-Dateien vor dem Ausliefern."
    report_path = "internal/bench/reports/telegraph-v2.json"

    report_usable = bool(telegraph_v2) and "aggregate" in telegraph_v2
    if not report_usable:
        return pending_rung(
            "condense",
            rung_label,
            rung_summary,
            report_path,
            footnote="Run scripts/bench_telegraph.py to populate.",
        )

    summary = telegraph_v2["aggregate"]
    # Keep only the medians of non-Thin-Root categories.
    category_medians = [
        pct
        for category, pct in summary.get("by_category_median_pct", {}).items()
        if not category.startswith("thin-root-")
    ]
    if category_medians:
        # Simple mean across the remaining category medians.
        saving_pct = sum(category_medians) / len(category_medians)
    else:
        saving_pct = float(summary.get("median_saving_pct", 0.0))

    # A positive saving percentage becomes a negative token delta.
    tokens_saved = int(round(baseline_input_tokens * saving_pct / 100.0))
    token_delta = -tokens_saved

    return {
        "id": "condense",
        "label": rung_label,
        "what_it_does": rung_summary,
        "token_delta": token_delta,
        "eur_delta": price_input_delta_eur(
            token_delta, reference_scale, pricing_row
        ),
        "cumulative_pct": 0.0,
        "confidence": "measured",
        "source_report": report_path,
        "footnote": (
            "Aggregate across non-Thin-Root categories; Thin-Root files "
            "(AGENTS.md variants) net negative (~−4%) and are excluded "
            "from the rung — surfaced separately."
        ),
    }
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def rtk_rung_from_report(
    rtk_report: Optional[Dict[str, Any]],
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> Dict[str, Any]:
    """Build the rtk rung from a decoded rtk report.

    Reads ``rtk_report["aggregate"]["tokens_saved_per_request"]`` and
    turns it into a negative ``token_delta`` priced as input tokens.

    Returns:
        A ``measured`` rung when the saved-token count is positive.
        Otherwise a ``pending`` rung: one footnote for a falsy report,
        another for a report whose saved-token count is zero or negative.
    """
    rung_label = "+ rtk (CLI-Output filtern) / + rtk (filter CLI output)"
    rung_summary = "rtk schneidet verbose CLI-Ausgabe vor dem Modell-Input weg."
    report_path = "internal/bench/reports/rtk/latest.json"

    if not rtk_report:
        return pending_rung(
            "rtk",
            rung_label,
            rung_summary,
            report_path,
            footnote="Install rtk and run scripts/bench_rtk_savings.py.",
        )

    summary = rtk_report.get("aggregate", {})
    saved_per_request = int(summary.get("tokens_saved_per_request", 0))
    if saved_per_request <= 0:
        # Negative values take this branch too, although the footnote
        # text only mentions 0.
        return pending_rung(
            "rtk",
            rung_label,
            rung_summary,
            report_path,
            footnote=(
                "Report present but aggregate.tokens_saved_per_request "
                "is 0 — re-run scripts/bench_rtk_savings.py with the full "
                "corpus."
            ),
        )

    # Savings are expressed as a negative token delta.
    token_delta = -saved_per_request
    return {
        "id": "rtk",
        "label": rung_label,
        "what_it_does": rung_summary,
        "token_delta": token_delta,
        "eur_delta": price_input_delta_eur(
            token_delta, reference_scale, pricing_row
        ),
        "cumulative_pct": 0.0,
        "confidence": "measured",
        "source_report": report_path,
    }
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def terse_rung_from_telegraph_v1(
    telegraph_v1: Optional[Dict[str, Any]],
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> Dict[str, Any]:
    """Build the terse rung from the telegraph-v1 ``savings_vs_terse`` median.

    The median is read from
    ``telegraph_v1["telegraph"]["aggregate"]["savings_vs_terse"]["median"]``
    (0.0 when any inner key is absent) and applied to
    ``reference_scale["avg_output_tokens"]`` (default 600). A positive
    median means fewer output tokens than the terse control and yields a
    negative ``token_delta``; a negative median yields a positive one.

    The footnote always states the measured value, and its wording
    follows the sign of the median so the text never contradicts the
    number printed next to it.

    Args:
        telegraph_v1: Decoded report; must contain a ``"telegraph"`` key.
        reference_scale: Read for ``avg_output_tokens`` and forwarded to
            `price_output_delta_eur`.
        pricing_row: Forwarded to `price_output_delta_eur`.

    Returns:
        A ``measured`` rung priced on the output side, or a ``pending``
        rung when the report is falsy or lacks ``"telegraph"``.
    """
    rung_label = "+ terse (Antworten knapper) / + terse (shorter replies)"
    rung_summary = "Telegraph-Stil zielt auf knappere Modell-Antworten."
    report_path = "internal/bench/reports/telegraph-v1.json"

    if not telegraph_v1 or "telegraph" not in telegraph_v1:
        return pending_rung(
            "terse",
            rung_label,
            rung_summary,
            report_path,
            footnote="Run scripts/bench_telegraph.py to populate.",
        )

    arms = telegraph_v1["telegraph"].get("aggregate", {})
    vs_terse = arms.get("savings_vs_terse", {})
    median = float(vs_terse.get("median", 0.0))

    # Output-side: positive median → fewer output tokens than terse control.
    avg_output = int(reference_scale.get("avg_output_tokens", 600))
    token_delta = -int(round(avg_output * median))

    # Pick the wording from the sign of the measurement. The negative
    # branch reproduces the original footnote text exactly.
    if median < 0:
        outcome = "Telegraph liefert hier mehr Tokens, nicht weniger."
    elif median > 0:
        outcome = "Telegraph liefert hier weniger Tokens."
    else:
        outcome = "Telegraph liefert hier gleich viele Tokens."
    note = (
        "Honest: gemessener Median = "
        f"{median * 100:+.2f}% gegen 'sei knapp' — {outcome} "
        "Wir messen, wir verstecken nicht."
    )

    return {
        "id": "terse",
        "label": rung_label,
        "what_it_does": rung_summary,
        "token_delta": token_delta,
        "eur_delta": price_output_delta_eur(
            token_delta, reference_scale, pricing_row
        ),
        "cumulative_pct": 0.0,
        "confidence": "measured",
        "source_report": report_path,
        "footnote": note,
    }
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
# ── Behaviour-metric extractors ─────────────────────────────────────────
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def selection_metric_from_dev_reports(
    with_report: Optional[Dict[str, Any]],
    without_report: Optional[Dict[str, Any]],
) -> Dict[str, Any]:
    """Right-skill selection: top-K hit rate with vs. without.

    Args:
        with_report: Decoded report for the "with" arm, or ``None``.
            ``report["selection"]["selection_accuracy"]`` is read, and
            ``report["results"]["mode"]`` supplies the metric's mode.
        without_report: Decoded report for the "without" arm, or
            ``None``; only its selection accuracy is read.

    Returns:
        A metric dict. When both reports are falsy, all values are
        ``None`` and the mode is ``"dry-run"``. Otherwise ``"delta"`` is
        ``with - without`` rounded to 4 places (``None`` unless both
        accuracies are present) and the mode falls back to ``"live"``
        when the "with" report does not name one.
    """
    metric: Dict[str, Any] = {
        "id": "selection",
        "label": "Right-skill selection / Richtige Skill-Wahl",
        "what_this_means": (
            "Wie oft das passende Skill aktiviert wird (top-K Treffer)."
        ),
        "with": None,
        "without": None,
        "delta": None,
        "unit": "pct",
        "mode": "dry-run",
        "source_report": "internal/bench/reports/ab/<dev-corpus-pair>.json",
    }
    if not with_report and not without_report:
        return metric

    # `or {}` also covers an explicit `"selection": None`, matching the
    # guard already applied to `"results"` below.
    with_acc = ((with_report or {}).get("selection") or {}).get(
        "selection_accuracy"
    )
    without_acc = ((without_report or {}).get("selection") or {}).get(
        "selection_accuracy"
    )

    metric["with"] = with_acc
    metric["without"] = without_acc
    if with_acc is not None and without_acc is not None:
        metric["delta"] = round(with_acc - without_acc, 4)
    metric["mode"] = ((with_report or {}).get("results") or {}).get("mode") or "live"
    metric["source_report"] = "internal/bench/reports/ab/"
    return metric
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def destructive_stops_metric(
    with_stops: Optional[int],
    without_stops: Optional[int],
    total: int = 5,
    mode: str = "live",
) -> Dict[str, Any]:
    """Destructive-op stops: counts out of `total`, not percentages.

    Args:
        with_stops: Number of stops observed in the "with" arm, or
            ``None`` when not measured.
        without_stops: Number of stops observed in the "without" arm, or
            ``None`` when not measured.
        total: Denominator quoted in the explanatory text (default 5).
        mode: Mode reported when at least one count is present. Defaults
            to ``"live"``, the value this function previously hard-coded,
            and mirrors the `mode` parameter of `ask_vs_act_metric` and
            `completion_metric`.

    Returns:
        A metric dict. When both counts are ``None`` the mode is always
        ``"dry-run"`` and the source report is a placeholder path.
        ``"delta"`` is ``with_stops - without_stops`` only when both
        counts are present.
    """
    both_missing = with_stops is None and without_stops is None
    both_present = with_stops is not None and without_stops is not None

    if both_missing:
        report_path = "internal/bench/reports/ab/<destructive-corpus-pair>.json"
    else:
        report_path = "internal/bench/reports/ab/"

    return {
        "id": "destructive-stops",
        "label": "Destructive-op stops / Stopps bei riskanten Aktionen",
        "what_this_means": (
            "Wie oft der Agent vor destructive ops anhält / nachfragt "
            f"(von {total})."
        ),
        "with": with_stops,
        "without": without_stops,
        "delta": (with_stops - without_stops) if both_present else None,
        "unit": "count",
        # Nothing measured is reported as a dry run regardless of `mode`.
        "mode": "dry-run" if both_missing else mode,
        "source_report": report_path,
    }
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def ask_vs_act_metric(
    with_ratio: Optional[float],
    without_ratio: Optional[float],
    mode: str = "live",
) -> Dict[str, Any]:
    """Build the ask-vs-act ratio metric (lower = more decisive).

    When both ratios are ``None`` the metric is reported with empty
    values and mode ``"dry-run"``, whatever `mode` was passed. Otherwise
    the ratios are echoed, `mode` is used as given, and ``"delta"`` is
    ``with_ratio - without_ratio`` rounded to 4 places, or ``None`` when
    only one side is present.
    """
    nothing_measured = with_ratio is None and without_ratio is None
    both_measured = with_ratio is not None and without_ratio is not None

    difference = None
    if both_measured:
        difference = round(with_ratio - without_ratio, 4)

    return {
        "id": "ask-vs-act",
        "label": "Ask-vs-act ratio / Fragen vs. Handeln",
        "what_this_means": (
            "Verhältnis Rückfragen zu Aktionen — niedriger = entschlossener."
        ),
        "with": with_ratio,
        "without": without_ratio,
        "delta": difference,
        "unit": "ratio",
        "mode": "dry-run" if nothing_measured else mode,
        "source_report": "internal/bench/reports/ab/",
    }
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def completion_metric(
    with_rate: Optional[float],
    without_rate: Optional[float],
    mode: str = "live",
) -> Dict[str, Any]:
    """Build the task-completion-rate metric.

    When both rates are ``None`` the metric is reported with empty
    values, mode ``"dry-run"`` and a placeholder source path. Otherwise
    the rates are echoed, `mode` is used as given, and ``"delta"`` is
    ``with_rate - without_rate`` rounded to 4 places, or ``None`` when
    only one side is present.
    """
    nothing_measured = with_rate is None and without_rate is None
    both_measured = with_rate is not None and without_rate is not None

    difference = None
    if both_measured:
        difference = round(with_rate - without_rate, 4)

    if nothing_measured:
        report_path = "internal/bench/reports/ab/<trackb-pair>.json"
    else:
        report_path = "internal/bench/reports/ab/"

    return {
        "id": "completion",
        "label": "Task completion rate / Aufgaben fertig",
        "what_this_means": (
            "Anteil der Aufgaben, die der Agent vollständig abschließt."
        ),
        "with": with_rate,
        "without": without_rate,
        "delta": difference,
        "unit": "pct",
        "mode": "dry-run" if nothing_measured else mode,
        "source_report": report_path,
    }
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
# ── Assembler ───────────────────────────────────────────────────────────
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
def assemble_ladder(
    rungs: List[Dict[str, Any]],
    baseline_input_tokens: int,
) -> List[Dict[str, Any]]:
    """Return copies of `rungs` with ``cumulative_pct`` filled in, in order.

    Each rung is shallow-copied, so the input dicts are left untouched.
    A running token total is kept across the list; rungs whose
    ``confidence`` is ``"pending"`` add nothing to it, but still receive
    the running percentage reached so far. The percentage is the running
    total relative to `baseline_input_tokens`, rounded to 3 places, or
    0.0 when the baseline is not positive.
    """
    assembled: List[Dict[str, Any]] = []
    token_total = 0
    for source_rung in rungs:
        entry = dict(source_rung)
        if entry.get("confidence") != "pending":
            token_total += int(entry.get("token_delta", 0))
        if baseline_input_tokens > 0:
            share = round(100.0 * token_total / baseline_input_tokens, 3)
        else:
            share = 0.0
        entry["cumulative_pct"] = share
        assembled.append(entry)
    return assembled
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def compute_totals(
    rungs: List[Dict[str, Any]],
    baseline_input_tokens: int,
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> Dict[str, Any]:
    """Compute the totals block for a ladder.

    Token deltas of every rung whose ``confidence`` is not ``"pending"``
    are summed. The sum is expressed as a percentage of
    `baseline_input_tokens` (0.0 when the baseline is not positive),
    priced in €, and classified as ``"net-saving"`` (negative),
    ``"net-cost"`` (positive) or ``"break-even"`` (zero).

    Returns:
        A dict with ``cumulative_token_delta``, ``cumulative_eur_delta``
        (rounded to 4 places), ``cumulative_pct`` (rounded to 3 places)
        and ``net_verdict``.
    """
    counted = [rung for rung in rungs if rung.get("confidence") != "pending"]
    token_sum = sum(int(rung.get("token_delta", 0)) for rung in counted)

    pct_of_baseline = 0.0
    if baseline_input_tokens > 0:
        pct_of_baseline = round(100.0 * token_sum / baseline_input_tokens, 3)

    # NOTE(review): the whole sum is priced as input tokens, while the
    # terse rung prices its own delta with price_output_delta_eur, so this
    # total can differ from the sum of per-rung eur_delta — confirm intent.
    eur_total = price_input_delta_eur(token_sum, reference_scale, pricing_row)

    if token_sum == 0:
        outcome = "break-even"
    elif token_sum < 0:
        outcome = "net-saving"
    else:
        outcome = "net-cost"

    return {
        "cumulative_token_delta": token_sum,
        "cumulative_eur_delta": round(eur_total, 4),
        "cumulative_pct": pct_of_baseline,
        "net_verdict": outcome,
    }
|