@event4u/agent-config 4.9.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. package/.agent-src/commands/implement-ticket.md +5 -4
  2. package/.agent-src/rules/language-and-tone.md +4 -10
  3. package/.agent-src/skills/command-routing/SKILL.md +5 -4
  4. package/.claude-plugin/marketplace.json +1 -1
  5. package/CHANGELOG.md +73 -0
  6. package/CONTRIBUTING.md +19 -0
  7. package/README.md +11 -0
  8. package/dist/cli/registry.js +0 -2
  9. package/dist/cli/registry.js.map +1 -1
  10. package/dist/discovery/deprecation-report.md +1 -1
  11. package/dist/discovery/discovery-manifest.json +5 -5
  12. package/dist/discovery/discovery-manifest.json.sha256 +1 -1
  13. package/dist/discovery/discovery-manifest.summary.md +1 -1
  14. package/dist/discovery/orphan-report.md +1 -1
  15. package/dist/discovery/packs.json +2 -2
  16. package/dist/discovery/trust-report.md +1 -1
  17. package/dist/discovery/workspaces.json +2 -2
  18. package/dist/mcp/registry-manifest.json +2 -2
  19. package/dist/router.json +1 -1671
  20. package/docs/benchmark.md +20 -8
  21. package/docs/benchmarks.md +11 -0
  22. package/docs/contracts/benchmark-corpus-spec.md +31 -3
  23. package/docs/contracts/command-surface-tiers.md +1 -1
  24. package/docs/contracts/hook-architecture-v1.md +33 -0
  25. package/docs/contracts/migrate-command.md +197 -0
  26. package/docs/contracts/settings-api.md +2 -1
  27. package/docs/contracts/value-dashboard-spec.md +374 -0
  28. package/docs/contracts/value-report-schema.md +150 -0
  29. package/docs/decisions/ADR-031-validation-severity-tiers-and-projection-roundtrip.md +97 -0
  30. package/docs/decisions/INDEX.md +1 -0
  31. package/docs/guidelines/agent-infra/installed-tools-manifest.md +6 -3
  32. package/docs/guidelines/agent-infra/language-and-tone-examples.md +35 -0
  33. package/docs/migration/v1-to-v2.md +40 -27
  34. package/docs/value.md +84 -0
  35. package/package.json +8 -8
  36. package/scripts/__pycache__/validate_frontmatter.cpython-312.pyc +0 -0
  37. package/scripts/_cli/cmd_migrate.py +264 -102
  38. package/scripts/_cli/cmd_settings_migrate.py +2 -1
  39. package/scripts/_dispatch.bash +147 -49
  40. package/scripts/_lib/__pycache__/__init__.cpython-312.pyc +0 -0
  41. package/scripts/_lib/__pycache__/agent_src.cpython-312.pyc +0 -0
  42. package/scripts/_lib/install_regenerator.py +129 -0
  43. package/scripts/_lib/value_ladder.py +599 -0
  44. package/scripts/_lib/value_report.py +441 -0
  45. package/scripts/bench_rtk_savings.py +320 -0
  46. package/scripts/compile_router.py +19 -5
  47. package/scripts/expected_perms.json +1 -1
  48. package/scripts/first_run_gate_hook.py +178 -0
  49. package/scripts/hook_manifest.yaml +16 -7
  50. package/scripts/hooks/dispatch_hook.py +27 -0
  51. package/scripts/hooks/dispatch_issues.py +136 -0
  52. package/scripts/hooks_doctor.py +40 -1
  53. package/scripts/install.py +25 -21
  54. package/scripts/lint_agents_layout.py +5 -4
  55. package/scripts/lint_bench_corpus.py +86 -4
  56. package/scripts/lint_global_paths.py +4 -3
  57. package/scripts/lint_marketplace_install_completeness.py +188 -0
  58. package/scripts/lint_value_dashboard.py +218 -0
  59. package/scripts/render_benchmark_md.py +6 -2
  60. package/scripts/render_value_md.py +355 -0
  61. package/scripts/repro/repro_marketplace_install_gap.sh +161 -0
  62. package/scripts/roadmap_progress_hook.py +23 -0
  63. package/scripts/router_telemetry.py +470 -0
  64. package/scripts/validate_frontmatter.py +23 -9
  65. package/scripts/_cli/cmd_migrate_to_global.py +0 -415
@@ -0,0 +1,599 @@
1
+ """Pure normaliser: raw bench reports → `value-v1` rung dicts.
2
+
3
+ Phase 1 Step 2 of `agents/roadmaps/road-to-readable-value-dashboard.md`.
4
+
5
+ This module is **pure** — no I/O, no file reads, no clock. Inputs are
6
+ already-loaded dicts; outputs are rung dicts conforming to
7
+ `docs/contracts/value-report-schema.md`. The companion
8
+ `scripts/_lib/value_report.py` owns the I/O wrapper that loads the raw
9
+ reports, calls these functions, and writes the assembled JSON.
10
+
11
+ Rung dict shape (see `value-report-schema.md` for the full contract):
12
+
13
+ {
14
+ "id": "<kebab-case>",
15
+ "label": "<German + English>",
16
+ "what_it_does": "<≤ 80 char phrase>",
17
+ "token_delta": <signed int>,
18
+ "eur_delta": <float>,
19
+ "cumulative_pct": <signed float>, # filled in by assemble_ladder
20
+ "confidence": "measured" | "estimated" | "vendor-claim" | "pending",
21
+ "source_report": "<relative path>",
22
+ "footnote": "<optional caveat>", # omitted when no caveat
23
+ }
24
+ """
25
+ from __future__ import annotations
26
+
27
+ from typing import Any, Dict, List, Optional
28
+
29
+ # ── Reference scale defaults ────────────────────────────────────────────
30
+
31
# Default reference scale for the value ladder.
# NOTE(review): only "requests" and "avg_output_tokens" are read by the
# functions in this module (with matching literal fallbacks of 1000 and 600);
# "avg_input_tokens" and "model_tier" are presumably consumed by the I/O
# wrapper — confirm against scripts/_lib/value_report.py.
DEFAULT_REFERENCE_SCALE = {
    "requests": 1000,  # per-request deltas are multiplied by this count
    "avg_input_tokens": 8000,
    "avg_output_tokens": 600,  # base for the output-side (terse) rung
    "model_tier": "sonnet",
}
37
+
38
+ # ── Pricing ─────────────────────────────────────────────────────────────
39
+
40
+
41
def price_tokens_eur(
    input_tokens: int,
    output_tokens: int,
    pricing_row: Dict[str, Any],
    eur_per_usd: float = 0.92,
) -> float:
    """Return the € price of an (input, output) token pair.

    Args:
        input_tokens: Number of input tokens to price.
        output_tokens: Number of output tokens to price.
        pricing_row: One pricing entry; its ``"input"`` and ``"output"``
            values are read as per-million-token rates (USD, per the
            `internal/bench/pricing.yaml::models` convention). A missing
            key is treated as a rate of 0.0.
        eur_per_usd: Conversion factor applied to the USD total
            (default 0.92).

    Returns:
        The combined price in €.
    """
    rate_in = float(pricing_row.get("input", 0.0))
    rate_out = float(pricing_row.get("output", 0.0))
    # Rates are quoted per one million tokens, hence the scaling.
    usd_total = (input_tokens / 1_000_000.0) * rate_in + (
        output_tokens / 1_000_000.0
    ) * rate_out
    return usd_total * eur_per_usd
57
+
58
+
59
def price_input_delta_eur(
    token_delta_per_request: int,
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> float:
    """Return the € value of a per-request input-token delta at scale.

    The delta is multiplied by ``reference_scale["requests"]`` (1000 when
    the key is absent) and priced purely on the input side via
    `price_tokens_eur`. A negative delta yields a negative price.
    """
    request_count = int(reference_scale.get("requests", 1000))
    return price_tokens_eur(
        token_delta_per_request * request_count, 0, pricing_row
    )
68
+
69
+
70
def price_output_delta_eur(
    token_delta_per_request: int,
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> float:
    """Return the € value of a per-request output-token delta at scale.

    The delta is multiplied by ``reference_scale["requests"]`` (1000 when
    the key is absent) and priced purely on the output side via
    `price_tokens_eur`. A negative delta yields a negative price.
    """
    request_count = int(reference_scale.get("requests", 1000))
    return price_tokens_eur(
        0, token_delta_per_request * request_count, pricing_row
    )
79
+
80
+
81
+ # ── Pending-rung factory ────────────────────────────────────────────────
82
+
83
+
84
def pending_rung(
    rung_id: str,
    label: str,
    what_it_does: str,
    source_report: str,
    footnote: Optional[str] = None,
) -> Dict[str, Any]:
    """Build a placeholder rung whose measurement is not available yet.

    All numeric fields are zeroed and ``confidence`` is ``"pending"``.
    The ``footnote`` key is only present when a non-empty footnote is
    passed.
    """
    placeholder: Dict[str, Any] = dict(
        id=rung_id,
        label=label,
        what_it_does=what_it_does,
        token_delta=0,
        eur_delta=0.0,
        cumulative_pct=0.0,  # assemble_ladder overwrites this later
        confidence="pending",
        source_report=source_report,
    )
    if not footnote:
        return placeholder
    placeholder["footnote"] = footnote
    return placeholder
105
+
106
+
107
+ # ── Rung extractors ─────────────────────────────────────────────────────
108
+
109
+
110
def baseline_rung(reference_scale: Dict[str, Any]) -> Dict[str, Any]:
    """Return the zero-point rung of the ladder.

    Every numeric field is zero by construction; ``reference_scale`` is
    accepted for signature parity with the other rung builders but is
    not read.
    """
    zero_point: Dict[str, Any] = dict(
        id="baseline",
        label="Ohne Paket / Without package",
        what_it_does="Baseline — der nackte Request ohne Paket-Regeln.",
        token_delta=0,
        eur_delta=0.0,
        cumulative_pct=0.0,
        confidence="measured",
        source_report="n/a",
    )
    return zero_point
122
+
123
+
124
def load_rung_from_router(
    router: Optional[Dict[str, Any]],
    rule_chars: Optional[Dict[str, int]],
    charter_chars: int,
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> Dict[str, Any]:
    """Build the package-load rung from the router's kernel list.

    Args:
        router: Decoded `dist/router.json` dict; its ``"kernel"`` entry
            lists the rule ids that are always loaded.
        rule_chars: ``{rule_id: char_count}`` mapping. Ids absent from the
            mapping contribute 0 characters; ``None`` is treated as empty.
        charter_chars: Character count of the always-loaded charter.
        reference_scale: Scale passed through to `price_input_delta_eur`.
        pricing_row: Pricing entry passed through to
            `price_input_delta_eur`.

    Returns:
        A ``measured`` rung whose ``token_delta`` is
        ``(kernel chars + charter chars) // 4``, or a ``pending`` rung
        citing `dist/router.json` when the router is empty/``None`` or has
        no ``"kernel"`` key.
    """
    label = "Mit Paket (Regeln laden) / With package (rule load)"
    summary = "Die immer-aktiven Regeln landen im Kontext jedes Requests."
    source = "dist/router.json"

    if router and "kernel" in router:
        sizes = rule_chars if rule_chars else {}
        kernel_ids = list(router["kernel"])
        kernel_chars = 0
        for rule_id in kernel_ids:
            kernel_chars += int(sizes.get(rule_id, 0))
        charter = int(charter_chars)
        # Same 4-chars-per-token approximation as
        # measure_frugality_savings.py.
        tokens = (kernel_chars + charter) // 4
        return {
            "id": "load",
            "label": label,
            "what_it_does": summary,
            "token_delta": tokens,
            "eur_delta": price_input_delta_eur(
                tokens, reference_scale, pricing_row
            ),
            "cumulative_pct": 0.0,
            "confidence": "measured",
            "source_report": source,
            "footnote": (
                f"Kernel = {len(kernel_ids)} rules ({kernel_chars} chars) "
                f"+ charter ({charter} chars); tokens ≈ chars / 4."
            ),
        }

    return pending_rung(
        "load",
        label,
        summary,
        source,
        footnote="Run scripts/compile_router.py to generate the router.",
    )
177
+
178
+
179
def load_rung_from_frugality(
    frugality_record: Optional[Dict[str, Any]],
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> Dict[str, Any]:
    """Build the package-load rung from a frugality baseline.jsonl record.

    **Deprecated** as of road-to-value-dashboard-netto-cuts Phase 1: the
    record measures a hardcoded rule canon rather than the actual
    always-loaded kernel. Kept as a back-compat fallback for when
    `dist/router.json` is missing; new callers should prefer
    `load_rung_from_router()`.

    Args:
        frugality_record: One decoded line from
            `agents/runtime/frugality/baseline.jsonl`. Its
            ``"metric_a_footprint"`` dict supplies the
            ``kernel_total_chars``, ``tier_1_total_chars``,
            ``tier_2_total_chars`` and ``charter_chars`` counts.
        reference_scale: Scale passed through to `price_input_delta_eur`.
        pricing_row: Pricing entry passed through to
            `price_input_delta_eur`.

    Returns:
        A ``measured`` rung whose ``token_delta`` is the summed footprint
        divided by 4, or a ``pending`` rung when the record is missing or
        malformed (not a dict, no usable ``metric_a_footprint`` dict, or
        non-numeric character counts).
    """
    label = "Mit Paket (Regeln laden) / With package (rule load)"
    summary = "Die immer-aktiven Regeln landen im Kontext jedes Requests."
    source = "agents/runtime/frugality/baseline.jsonl"

    def _pending() -> Dict[str, Any]:
        return pending_rung(
            "load",
            label,
            summary,
            source,
            footnote="Run scripts/measure_frugality_savings.py to populate.",
        )

    if not frugality_record or not isinstance(frugality_record, dict):
        return _pending()

    footprint = frugality_record.get("metric_a_footprint")
    # A record without a footprint must not masquerade as a measured
    # zero-cost rung; report it as pending instead.
    if not isinstance(footprint, dict) or not footprint:
        return _pending()

    try:
        total_chars = sum(
            int(footprint.get(key, 0))
            for key in (
                "kernel_total_chars",
                "tier_1_total_chars",
                "tier_2_total_chars",
                "charter_chars",
            )
        )
    except (TypeError, ValueError):
        # Null or non-numeric counts: the record is malformed.
        return _pending()

    # 4 chars/token approximation, consistent with
    # measure_frugality_savings.py.
    token_delta = total_chars // 4
    return {
        "id": "load",
        "label": label,
        "what_it_does": summary,
        "token_delta": token_delta,
        "eur_delta": price_input_delta_eur(
            token_delta, reference_scale, pricing_row
        ),
        "cumulative_pct": 0.0,
        "confidence": "measured",
        "source_report": source,
        "footnote": (
            "Always-loaded footprint = kernel + tier_1 + tier_2 + charter; "
            "tokens ≈ chars / 4."
        ),
    }
229
+
230
+
231
def condense_rung_from_telegraph_v2(
    telegraph_v2: Optional[Dict[str, Any]],
    baseline_input_tokens: int,
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> Dict[str, Any]:
    """Build the condense rung from the telegraph-v2 aggregate.

    The saving percentage is the plain mean of every entry in
    ``aggregate["by_category_median_pct"]`` whose key does not start with
    ``"thin-root-"``. When no such entry exists it falls back to
    ``aggregate["median_saving_pct"]`` (0.0 if absent). A positive saving
    becomes a negative ``token_delta`` relative to
    ``baseline_input_tokens``.

    Returns a ``pending`` rung when the report is empty/``None`` or lacks
    an ``"aggregate"`` key.
    """
    label = "+ condense (Regeln eindampfen) / + condense (rule shrink)"
    summary = "Build-Schritt schrumpft Regel-Dateien vor dem Ausliefern."
    source = "internal/bench/reports/telegraph-v2.json"

    if not (telegraph_v2 and "aggregate" in telegraph_v2):
        return pending_rung(
            "condense",
            label,
            summary,
            source,
            footnote="Run scripts/bench_telegraph.py to populate.",
        )

    summary_block = telegraph_v2["aggregate"]
    per_category = summary_block.get("by_category_median_pct", {})
    kept = [
        pct
        for name, pct in per_category.items()
        if not name.startswith("thin-root-")
    ]
    if kept:
        # Unweighted mean over the remaining category medians.
        saving_pct = sum(kept) / len(kept)
    else:
        saving_pct = float(summary_block.get("median_saving_pct", 0.0))

    # Savings are expressed as a negative delta against the baseline.
    saved_tokens = int(round(baseline_input_tokens * saving_pct / 100.0))
    token_delta = -saved_tokens
    return {
        "id": "condense",
        "label": label,
        "what_it_does": summary,
        "token_delta": token_delta,
        "eur_delta": price_input_delta_eur(
            token_delta, reference_scale, pricing_row
        ),
        "cumulative_pct": 0.0,
        "confidence": "measured",
        "source_report": source,
        "footnote": (
            "Aggregate across non-Thin-Root categories; Thin-Root files "
            "(AGENTS.md variants) net negative (~−4%) and are excluded "
            "from the rung — surfaced separately."
        ),
    }
281
+
282
+
283
def rtk_rung_from_report(
    rtk_report: Optional[Dict[str, Any]],
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> Dict[str, Any]:
    """Build the rtk rung from `internal/bench/reports/rtk/latest.json`.

    Reads ``aggregate["tokens_saved_per_request"]`` from the report. A
    positive value becomes a negative ``token_delta`` that is priced on
    the input side via `price_input_delta_eur`.

    Returns a ``pending`` rung when the report is empty/``None``, and a
    differently-footnoted ``pending`` rung when the saved-token count is
    zero or negative.
    """
    label = "+ rtk (CLI-Output filtern) / + rtk (filter CLI output)"
    summary = "rtk schneidet verbose CLI-Ausgabe vor dem Modell-Input weg."
    source = "internal/bench/reports/rtk/latest.json"

    if not rtk_report:
        return pending_rung(
            "rtk",
            label,
            summary,
            source,
            footnote="Install rtk and run scripts/bench_rtk_savings.py.",
        )

    saved = int(
        rtk_report.get("aggregate", {}).get("tokens_saved_per_request", 0)
    )
    if saved > 0:
        token_delta = -saved  # a saving is a negative delta
        return {
            "id": "rtk",
            "label": label,
            "what_it_does": summary,
            "token_delta": token_delta,
            "eur_delta": price_input_delta_eur(
                token_delta, reference_scale, pricing_row
            ),
            "cumulative_pct": 0.0,
            "confidence": "measured",
            "source_report": source,
        }

    return pending_rung(
        "rtk",
        label,
        summary,
        source,
        footnote=(
            "Report present but aggregate.tokens_saved_per_request "
            "is 0 — re-run scripts/bench_rtk_savings.py with the full "
            "corpus."
        ),
    )
328
+
329
+
330
def terse_rung_from_telegraph_v1(
    telegraph_v1: Optional[Dict[str, Any]],
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> Dict[str, Any]:
    """Build the terse rung from the telegraph-v1 ``savings_vs_terse`` median.

    The median is read from
    ``telegraph_v1["telegraph"]["aggregate"]["savings_vs_terse"]["median"]``
    and treated as a fraction of ``reference_scale["avg_output_tokens"]``
    (600 when absent). A positive median means fewer output tokens than
    the terse control and yields a negative ``token_delta``; a negative
    median yields a positive one (a cost). The rung is priced on the
    output side via `price_output_delta_eur`.

    The footnote always states the measured median, and its wording
    follows the sign of the measurement so a saving is never described
    as a cost or vice versa.

    Returns a ``pending`` rung when the report is empty/``None``, lacks a
    ``"telegraph"`` key, or carries no median value.
    """
    label = "+ terse (Antworten knapper) / + terse (shorter replies)"
    summary = "Telegraph-Stil zielt auf knappere Modell-Antworten."
    source = "internal/bench/reports/telegraph-v1.json"

    if not telegraph_v1 or "telegraph" not in telegraph_v1:
        return pending_rung(
            "terse",
            label,
            summary,
            source,
            footnote="Run scripts/bench_telegraph.py to populate.",
        )

    arms = (telegraph_v1["telegraph"] or {}).get("aggregate") or {}
    vs_terse = arms.get("savings_vs_terse") or {}
    raw_median = vs_terse.get("median")
    if raw_median is None:
        # Without a median there is no measurement; do not report a
        # "measured" zero.
        return pending_rung(
            "terse",
            label,
            summary,
            source,
            footnote=(
                "Report present but "
                "telegraph.aggregate.savings_vs_terse.median is missing — "
                "re-run scripts/bench_telegraph.py."
            ),
        )

    median = float(raw_median)
    # Output-side: positive median → fewer output tokens than the terse
    # control, hence the sign flip.
    avg_output = int(reference_scale.get("avg_output_tokens", 600))
    token_delta = -int(round(avg_output * median))

    if median < 0:
        outcome = "Telegraph liefert hier mehr Tokens, nicht weniger."
    elif median > 0:
        outcome = "Telegraph liefert hier weniger Tokens."
    else:
        outcome = "kein messbarer Unterschied."
    note = (
        "Honest: gemessener Median = "
        f"{median * 100:+.2f}% gegen 'sei knapp' — {outcome} "
        "Wir messen, wir verstecken nicht."
    )
    return {
        "id": "terse",
        "label": label,
        "what_it_does": summary,
        "token_delta": token_delta,
        "eur_delta": price_output_delta_eur(
            token_delta, reference_scale, pricing_row
        ),
        "cumulative_pct": 0.0,
        "confidence": "measured",
        "source_report": source,
        "footnote": note,
    }
372
+
373
+
374
+ # ── Behaviour-metric extractors ─────────────────────────────────────────
375
+
376
+
377
def selection_metric_from_dev_reports(
    with_report: Optional[Dict[str, Any]],
    without_report: Optional[Dict[str, Any]],
) -> Dict[str, Any]:
    """Build the right-skill selection metric (with vs. without package).

    Args:
        with_report: Decoded report for the "with package" arm. Its
            ``selection.selection_accuracy`` value is the "with" score and
            its ``results.mode`` value (default ``"live"``) is the mode.
        without_report: Decoded report for the "without package" arm; its
            ``selection.selection_accuracy`` is the "without" score.

    Returns:
        A metric dict. When both reports are empty/``None`` the values are
        ``None`` and the mode is ``"dry-run"``. ``delta`` is
        ``with - without`` rounded to 4 places, or ``None`` when either
        score is unavailable. A ``selection`` entry that is missing or
        ``null`` is treated as "no score" instead of raising.
    """
    metric: Dict[str, Any] = {
        "id": "selection",
        "label": "Right-skill selection / Richtige Skill-Wahl",
        "what_this_means": (
            "Wie oft das passende Skill aktiviert wird (top-K Treffer)."
        ),
        "with": None,
        "without": None,
        "delta": None,
        "unit": "pct",
        "mode": "dry-run",
        "source_report": "internal/bench/reports/ab/<dev-corpus-pair>.json",
    }
    if not with_report and not without_report:
        return metric

    # `or {}` guards against both a missing report and an explicit null
    # section, matching how `results` is read below.
    w = ((with_report or {}).get("selection") or {}).get("selection_accuracy")
    wo = ((without_report or {}).get("selection") or {}).get(
        "selection_accuracy"
    )
    delta = None
    if w is not None and wo is not None:
        delta = round(w - wo, 4)
    mode = ((with_report or {}).get("results") or {}).get("mode") or "live"

    metric.update(
        {
            "with": w,
            "without": wo,
            "delta": delta,
            "mode": mode,
            "source_report": "internal/bench/reports/ab/",
        }
    )
    return metric
415
+
416
+
417
def destructive_stops_metric(
    with_stops: Optional[int],
    without_stops: Optional[int],
    total: int = 5,
    mode: str = "live",
) -> Dict[str, Any]:
    """Build the destructive-op stops metric (counts, not percentages).

    Args:
        with_stops: Number of stops observed with the package, or ``None``.
        without_stops: Number of stops observed without it, or ``None``.
        total: Number of destructive scenarios; only rendered into the
            ``what_this_means`` text.
        mode: Mode reported when at least one count is available
            (default ``"live"``), mirroring `ask_vs_act_metric` and
            `completion_metric`.

    Returns:
        A metric dict. When both counts are ``None`` the mode is forced to
        ``"dry-run"`` and the source report is the placeholder pair path.
        ``delta`` is ``with_stops - without_stops`` or ``None`` when
        either count is missing.
    """
    no_data = with_stops is None and without_stops is None
    delta = None
    if with_stops is not None and without_stops is not None:
        delta = with_stops - without_stops

    if no_data:
        source = "internal/bench/reports/ab/<destructive-corpus-pair>.json"
    else:
        source = "internal/bench/reports/ab/"

    return {
        "id": "destructive-stops",
        "label": "Destructive-op stops / Stopps bei riskanten Aktionen",
        "what_this_means": (
            "Wie oft der Agent vor destructive ops anhält / nachfragt "
            f"(von {total})."
        ),
        "with": with_stops,
        "without": without_stops,
        "delta": delta,
        "unit": "count",
        "mode": "dry-run" if no_data else mode,
        "source_report": source,
    }
457
+
458
+
459
def ask_vs_act_metric(
    with_ratio: Optional[float],
    without_ratio: Optional[float],
    mode: str = "live",
) -> Dict[str, Any]:
    """Build the ask-vs-act ratio metric (lower means more decisive).

    When both ratios are ``None`` the metric is reported as ``"dry-run"``
    regardless of ``mode``. ``delta`` is ``with_ratio - without_ratio``
    rounded to 4 places, or ``None`` unless both ratios are present.
    """
    nothing_measured = with_ratio is None and without_ratio is None
    both_measured = with_ratio is not None and without_ratio is not None

    difference = None
    if both_measured:
        difference = round(with_ratio - without_ratio, 4)

    reported_mode = "dry-run" if nothing_measured else mode
    return {
        "id": "ask-vs-act",
        "label": "Ask-vs-act ratio / Fragen vs. Handeln",
        "what_this_means": (
            "Verhältnis Rückfragen zu Aktionen — niedriger = entschlossener."
        ),
        "with": with_ratio,
        "without": without_ratio,
        "delta": difference,
        "unit": "ratio",
        "mode": reported_mode,
        "source_report": "internal/bench/reports/ab/",
    }
495
+
496
+
497
def completion_metric(
    with_rate: Optional[float],
    without_rate: Optional[float],
    mode: str = "live",
) -> Dict[str, Any]:
    """Build the task completion rate metric from A/B Track B.

    When both rates are ``None`` the metric is reported as ``"dry-run"``
    with the placeholder pair path as its source, regardless of ``mode``.
    ``delta`` is ``with_rate - without_rate`` rounded to 4 places, or
    ``None`` unless both rates are present.
    """
    nothing_measured = with_rate is None and without_rate is None
    both_measured = with_rate is not None and without_rate is not None

    difference = None
    if both_measured:
        difference = round(with_rate - without_rate, 4)

    if nothing_measured:
        reported_mode = "dry-run"
        source = "internal/bench/reports/ab/<trackb-pair>.json"
    else:
        reported_mode = mode
        source = "internal/bench/reports/ab/"

    return {
        "id": "completion",
        "label": "Task completion rate / Aufgaben fertig",
        "what_this_means": (
            "Anteil der Aufgaben, die der Agent vollständig abschließt."
        ),
        "with": with_rate,
        "without": without_rate,
        "delta": difference,
        "unit": "pct",
        "mode": reported_mode,
        "source_report": source,
    }
533
+
534
+
535
+ # ── Assembler ───────────────────────────────────────────────────────────
536
+
537
+
538
def assemble_ladder(
    rungs: List[Dict[str, Any]],
    baseline_input_tokens: int,
) -> List[Dict[str, Any]]:
    """Return copies of ``rungs`` with ``cumulative_pct`` filled in.

    The rungs are walked in order while a running token total is kept.
    Rungs whose ``confidence`` is ``"pending"`` add nothing to that total.
    Each copy receives the running total as a percentage of
    ``baseline_input_tokens`` (rounded to 3 places), or 0.0 when the
    baseline is not positive. The input dicts are left untouched.
    """
    has_baseline = baseline_input_tokens > 0
    ladder = []
    total = 0
    for source in rungs:
        # Pending rungs must not move the headline number.
        if source.get("confidence") != "pending":
            total += int(source.get("token_delta", 0))
        if has_baseline:
            pct = round(100.0 * total / baseline_input_tokens, 3)
        else:
            pct = 0.0
        ladder.append({**source, "cumulative_pct": pct})
    return ladder
566
+
567
+
568
def compute_totals(
    rungs: List[Dict[str, Any]],
    baseline_input_tokens: int,
    reference_scale: Dict[str, Any],
    pricing_row: Dict[str, Any],
) -> Dict[str, Any]:
    """Compute the totals block for an assembled ladder.

    Sums ``token_delta`` over every rung whose ``confidence`` is not
    ``"pending"``, expresses the sum as a percentage of
    ``baseline_input_tokens`` (0.0 when the baseline is not positive),
    prices it, and derives a verdict from its sign.

    Returns:
        A dict with ``cumulative_token_delta``, ``cumulative_eur_delta``
        (rounded to 4 places), ``cumulative_pct`` (rounded to 3 places)
        and ``net_verdict`` (``"net-saving"``, ``"net-cost"`` or
        ``"break-even"``).
    """
    net_tokens = 0
    for rung in rungs:
        if rung.get("confidence") != "pending":
            net_tokens += int(rung.get("token_delta", 0))

    if baseline_input_tokens > 0:
        net_pct = round(100.0 * net_tokens / baseline_input_tokens, 3)
    else:
        net_pct = 0.0

    # NOTE(review): the whole net delta is priced at the input rate, even
    # though the terse rung prices its own delta at the output rate —
    # confirm this is the intended headline figure.
    net_eur = price_input_delta_eur(net_tokens, reference_scale, pricing_row)

    verdict = "break-even"
    if net_tokens > 0:
        verdict = "net-cost"
    if net_tokens < 0:
        verdict = "net-saving"

    return {
        "cumulative_token_delta": net_tokens,
        "cumulative_eur_delta": round(net_eur, 4),
        "cumulative_pct": net_pct,
        "net_verdict": verdict,
    }