@event4u/agent-config 4.9.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/.agent-src/commands/implement-ticket.md +5 -4
  2. package/.agent-src/rules/language-and-tone.md +4 -10
  3. package/.agent-src/skills/command-routing/SKILL.md +5 -4
  4. package/.claude-plugin/marketplace.json +1 -1
  5. package/CHANGELOG.md +73 -0
  6. package/CONTRIBUTING.md +19 -0
  7. package/README.md +11 -0
  8. package/dist/cli/registry.js +0 -2
  9. package/dist/cli/registry.js.map +1 -1
  10. package/dist/discovery/deprecation-report.md +1 -1
  11. package/dist/discovery/discovery-manifest.json +5 -5
  12. package/dist/discovery/discovery-manifest.json.sha256 +1 -1
  13. package/dist/discovery/discovery-manifest.summary.md +1 -1
  14. package/dist/discovery/orphan-report.md +1 -1
  15. package/dist/discovery/packs.json +2 -2
  16. package/dist/discovery/trust-report.md +1 -1
  17. package/dist/discovery/workspaces.json +2 -2
  18. package/dist/mcp/registry-manifest.json +2 -2
  19. package/dist/router.json +1 -1671
  20. package/docs/benchmark.md +20 -8
  21. package/docs/benchmarks.md +11 -0
  22. package/docs/contracts/benchmark-corpus-spec.md +31 -3
  23. package/docs/contracts/command-surface-tiers.md +1 -1
  24. package/docs/contracts/hook-architecture-v1.md +33 -0
  25. package/docs/contracts/migrate-command.md +197 -0
  26. package/docs/contracts/settings-api.md +2 -1
  27. package/docs/contracts/value-dashboard-spec.md +374 -0
  28. package/docs/contracts/value-report-schema.md +150 -0
  29. package/docs/decisions/ADR-031-validation-severity-tiers-and-projection-roundtrip.md +97 -0
  30. package/docs/decisions/INDEX.md +1 -0
  31. package/docs/guidelines/agent-infra/installed-tools-manifest.md +6 -3
  32. package/docs/guidelines/agent-infra/language-and-tone-examples.md +35 -0
  33. package/docs/migration/v1-to-v2.md +40 -27
  34. package/docs/value.md +84 -0
  35. package/package.json +8 -8
  36. package/scripts/__pycache__/validate_frontmatter.cpython-312.pyc +0 -0
  37. package/scripts/_cli/cmd_migrate.py +264 -102
  38. package/scripts/_cli/cmd_settings_migrate.py +2 -1
  39. package/scripts/_dispatch.bash +147 -49
  40. package/scripts/_lib/__pycache__/__init__.cpython-312.pyc +0 -0
  41. package/scripts/_lib/__pycache__/agent_src.cpython-312.pyc +0 -0
  42. package/scripts/_lib/install_regenerator.py +129 -0
  43. package/scripts/_lib/value_ladder.py +599 -0
  44. package/scripts/_lib/value_report.py +441 -0
  45. package/scripts/bench_rtk_savings.py +320 -0
  46. package/scripts/compile_router.py +19 -5
  47. package/scripts/expected_perms.json +1 -1
  48. package/scripts/first_run_gate_hook.py +178 -0
  49. package/scripts/hook_manifest.yaml +16 -7
  50. package/scripts/hooks/dispatch_hook.py +27 -0
  51. package/scripts/hooks/dispatch_issues.py +136 -0
  52. package/scripts/hooks_doctor.py +40 -1
  53. package/scripts/install.py +25 -21
  54. package/scripts/lint_agents_layout.py +5 -4
  55. package/scripts/lint_bench_corpus.py +86 -4
  56. package/scripts/lint_global_paths.py +4 -3
  57. package/scripts/lint_marketplace_install_completeness.py +188 -0
  58. package/scripts/lint_value_dashboard.py +218 -0
  59. package/scripts/render_benchmark_md.py +6 -2
  60. package/scripts/render_value_md.py +355 -0
  61. package/scripts/repro/repro_marketplace_install_gap.sh +161 -0
  62. package/scripts/roadmap_progress_hook.py +23 -0
  63. package/scripts/router_telemetry.py +470 -0
  64. package/scripts/validate_frontmatter.py +23 -9
  65. package/scripts/_cli/cmd_migrate_to_global.py +0 -415
@@ -0,0 +1,218 @@
1
+ #!/usr/bin/env python3
2
+ """Lint `docs/value.md` for structural invariants.
3
+
4
+ Phase 5 Step 3 of `agents/roadmaps/road-to-readable-value-dashboard.md`.
5
+
6
+ Invariants enforced (any violation → exit 1):
7
+
8
+ 1. Required sections present (intro / Reference scale / Panel A / Panel B
9
+ / Glossar / NETTO line).
10
+ 2. Every cost-ladder rung row cites a `source_report` (or `n/a` for the
11
+ baseline rung) — no rung sneaks in without traceability.
12
+ 3. No `measured` rung renders a `pending` source — internal consistency
13
+ of confidence ↔ source state.
14
+ 4. No negative-saving label: the literal string "Ersparnis" must not
15
+ appear in a row where the displayed Δ-token value is positive (the
16
+ load + terse rungs are *costs*, not savings; mislabelling either is
17
+ a credibility failure the page explicitly forbids).
18
+ 5. The `latest.json` exists and its `cost_ladder` rung ids match the
19
+ five canonical rungs — the renderer cannot silently drop a rung.
20
+
21
+ The linter loads `internal/bench/reports/value/latest.json` directly
22
+ (not just the rendered `.md`) for items (3) and (5) — the rendered
23
+ text alone is too lossy.
24
+
25
+ Output: one violation per line in non-quiet mode; one-line summary in
26
+ quiet mode. Exit 0 on clean, 1 on any violation.
27
+ """
28
+ from __future__ import annotations
29
+
30
+ import argparse
31
+ import json
32
+ import re
33
+ import sys
34
+ from pathlib import Path
35
+ from typing import Any, Dict, List
36
+
37
+
38
# Two levels up from this file, i.e. the directory that contains the
# directory this script lives in.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Rendered dashboard that gets linted.
DASHBOARD = REPO_ROOT / "docs" / "value.md"
# Machine-readable report used for the deep (non-text) checks.
LATEST = REPO_ROOT / "internal" / "bench" / "reports" / "value" / "latest.json"

# Substrings that must each occur somewhere in the dashboard text.
# Matching is a plain `in` test, so "**NETTO" matches any bold NETTO line.
REQUIRED_SECTIONS = (
    "# Value Dashboard",
    "## Reference scale",
    "## Panel A",
    "## Panel B",
    "## Glossar",
    "**NETTO",
)

# Expected `cost_ladder` rung ids; the comparison is order-sensitive.
CANONICAL_RUNG_IDS = ("baseline", "load", "condense", "rtk", "terse")
52
+
53
+
54
+ def _log(msg: str, quiet: bool, *, err: bool = False) -> None:
55
+ if err:
56
+ print(msg, file=sys.stderr)
57
+ elif not quiet:
58
+ print(msg)
59
+
60
+
61
def check_required_sections(text: str) -> List[str]:
    """Return one violation message per required marker absent from ``text``.

    Each entry of ``REQUIRED_SECTIONS`` is looked up as a plain substring;
    the result preserves the order of ``REQUIRED_SECTIONS``.
    """
    missing: List[str] = []
    for heading in REQUIRED_SECTIONS:
        if heading in text:
            continue
        missing.append(f"missing required section: '{heading}'")
    return missing
67
+
68
+
69
def check_source_citations(report: Dict[str, Any]) -> List[str]:
    """Flag every cost-ladder rung that lacks a usable ``source_report``.

    A falsy value (missing key, ``None``, empty string, ...) is reported as
    "no source_report field"; a truthy value that is not a non-blank string
    is reported as "empty source_report". A missing or null ``cost_ladder``
    yields no violations.
    """
    problems: List[str] = []
    for rung in report.get("cost_ladder", []) or []:
        cited = rung.get("source_report")
        rung_id = rung.get("id")
        if not cited:
            problems.append(f"rung '{rung_id}' has no source_report field")
        elif not (isinstance(cited, str) and cited.strip()):
            problems.append(f"rung '{rung_id}' has empty source_report")
    return problems
83
+
84
+
85
def check_confidence_vs_source(report: Dict[str, Any]) -> List[str]:
    """Verify that each ``measured`` rung points at a report that exists.

    Only rungs whose ``confidence`` equals ``"measured"`` are inspected.
    A source of ``""`` or ``"n/a"`` (the baseline rung) is exempt. Any other
    source is resolved relative to ``REPO_ROOT`` and must exist on disk.
    """
    problems: List[str] = []
    for rung in report.get("cost_ladder", []) or []:
        cited = rung.get("source_report") or ""
        if rung.get("confidence") == "measured" and cited not in ("", "n/a"):
            if not (REPO_ROOT / cited).exists():
                problems.append(
                    f"rung '{rung.get('id')}' is 'measured' but its "
                    f"source_report does not exist: {cited}"
                )
    return problems
101
+
102
+
103
def check_no_negative_savings(text: str) -> List[str]:
    """Reject Panel A rows that call a positive token delta a saving.

    Heuristic: only table rows (lines starting with "|") between the
    "## Panel A" heading and the next "## " heading are examined. A row is
    flagged when it contains the German word "Ersparnis" and its first
    signed, integer-shaped cell (e.g. "+4 843" / "-186") starts with "+".
    """
    # First pipe-delimited cell made of a sign followed by digits/spaces.
    signed_cell = re.compile(r"\|\s*([+-][0-9 ]+)\s*\|")
    problems: List[str] = []
    inside_panel = False
    for row in text.splitlines():
        if row.startswith("## Panel A"):
            inside_panel = True
        elif inside_panel and row.startswith("## "):
            # Next section heading: Panel A is over.
            break
        elif inside_panel and row.startswith("|") and "Ersparnis" in row:
            hit = signed_cell.search(row)
            delta = hit.group(1).strip() if hit else ""
            if delta.startswith("+"):
                problems.append(
                    "row labelled 'Ersparnis' has a positive Δ-token value: "
                    f"{delta!r} — positive deltas are costs, not savings."
                )
    return problems
133
+
134
+
135
def check_canonical_rung_set(report: Dict[str, Any]) -> List[str]:
    """Require the ladder's rung ids to equal ``CANONICAL_RUNG_IDS`` in order.

    Returns a single violation message when the sequence of ``id`` values
    in ``cost_ladder`` differs from the canonical one, otherwise an empty
    list.
    """
    seen = tuple(r.get("id") for r in (report.get("cost_ladder", []) or []))
    if seen == tuple(CANONICAL_RUNG_IDS):
        return []
    return [
        f"cost_ladder rung ids must be {CANONICAL_RUNG_IDS}, "
        f"got {seen}"
    ]
144
+
145
+
146
def lint(quiet: bool = False) -> int:
    """Run all dashboard checks and report the result.

    Text-only checks (required sections, no mislabelled savings) run on
    ``DASHBOARD``. If ``LATEST`` exists, the report-based checks (source
    citations, confidence vs. source, canonical rung set) run as well;
    if it does not, the dashboard is treated as a placeholder and only the
    text checks apply.

    Args:
        quiet: Forwarded to ``_log`` for the failure lines. Failures are
            logged with ``err=True`` and the success summaries with
            ``quiet=False``, so both are still printed.

    Returns:
        0 when no violation was found, 1 otherwise (including a missing
        dashboard or a report that is not a valid JSON object).
    """
    violations: List[str] = []

    if not DASHBOARD.exists():
        _log(
            f"FAIL: dashboard not found: {DASHBOARD.relative_to(REPO_ROOT)}",
            quiet,
            err=True,
        )
        return 1
    # Decode explicitly as UTF-8: the checks look for non-ASCII markers
    # (e.g. "Δ"), and the locale default is not UTF-8 on every platform.
    text = DASHBOARD.read_text(encoding="utf-8")
    violations.extend(check_required_sections(text))
    violations.extend(check_no_negative_savings(text))

    if not LATEST.exists():
        # No JSON to deep-check — that's a placeholder dashboard.
        # Required-sections check still applies; we degrade gracefully.
        if violations:
            for v in violations:
                _log(f"FAIL: {v}", quiet, err=True)
            return 1
        _log(
            "lint_value_dashboard: dashboard is a placeholder "
            "(no value-v1.json yet) — structural checks pass.",
            quiet=False,
        )
        return 0

    try:
        report = json.loads(LATEST.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        _log(f"FAIL: {LATEST.name} is not valid JSON: {exc}", quiet, err=True)
        return 1
    if not isinstance(report, dict):
        # Valid JSON, but e.g. a list or a scalar: the checks below call
        # `.get` on it, so fail cleanly instead of raising AttributeError.
        _log(
            f"FAIL: {LATEST.name} must contain a JSON object, "
            f"got {type(report).__name__}",
            quiet,
            err=True,
        )
        return 1

    violations.extend(check_source_citations(report))
    violations.extend(check_confidence_vs_source(report))
    violations.extend(check_canonical_rung_set(report))

    if violations:
        for v in violations:
            _log(f"FAIL: {v}", quiet, err=True)
        return 1
    _log(
        (
            "lint_value_dashboard: OK — "
            f"{len(report.get('cost_ladder', []))} rungs, "
            f"{len(report.get('behaviour', []))} behaviour metrics, all "
            "sections present, all sources cited."
        ),
        quiet=False,
    )
    return 0
198
+
199
+
200
def parse_args(argv: List[str]) -> argparse.Namespace:
    """Parse the linter's command line; the only option is ``--quiet``."""
    cli = argparse.ArgumentParser(
        description="Lint docs/value.md for structural invariants."
    )
    cli.add_argument(
        "--quiet", action="store_true", help="Suppress non-error output."
    )
    return cli.parse_args(argv)
210
+
211
+
212
def main(argv: List[str] | None = None) -> int:
    """CLI entry point: parse ``argv`` (default ``sys.argv[1:]``) and lint.

    Returns the exit status produced by ``lint``.
    """
    if argv is None:
        argv = sys.argv[1:]
    options = parse_args(argv)
    return lint(quiet=options.quiet)
215
+
216
+
217
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
@@ -103,10 +103,14 @@ def render_headline(track_a: dict, track_b: dict) -> str:
103
103
  lines = [
104
104
  "## Headline",
105
105
  "",
106
+ "> **Track A confirms surface availability** — a precondition, not an impact metric. "
107
+ "For the impact view (cost-ladder + behaviour with vs. without), see "
108
+ "[`docs/value.md`](value.md).",
109
+ "",
106
110
  "| Metric | with | without | delta |",
107
111
  "|---|---|---|---|",
108
- f"| Track A trigger-accuracy | {fmt_pct(a_with_acc)} | {fmt_pct(a_wo_acc)} | "
109
- f"{fmt_pct((a_with_acc or 0) - (a_wo_acc or 0))} |",
112
+ f"| Track A surface-availability | {fmt_pct(a_with_acc)} | {fmt_pct(a_wo_acc)} | "
113
+ f"{fmt_pct((a_with_acc or 0) - (a_wo_acc or 0))} _(structural — files present)_ |",
110
114
  f"| Track B completion-rate | {fmt_pct(b_with_comp)} | {fmt_pct(b_wo_comp)} | "
111
115
  f"{fmt_pct((b_with_comp or 0) - (b_wo_comp or 0))} |",
112
116
  f"| Track B mean wall-time | {fmt_num(b_results.get('mean_wall_time'))}s "
@@ -0,0 +1,355 @@
1
+ #!/usr/bin/env python3
2
+ """Render `docs/value.md` from the latest `value-v1` JSON report.
3
+
4
+ Phase 4 Step 1 of `agents/roadmaps/road-to-readable-value-dashboard.md`.
5
+
6
+ This renderer is **deterministic** — it does not run any bench, only
7
+ formats existing reports. Mirrors `render_benchmark_md.py`'s placeholder
8
+ discipline: when the report is missing, write a placeholder document
9
+ explaining how to produce one. Never errors.
10
+
11
+ The dashboard has two panels:
12
+ - Panel A — cost ladder (cumulative, min → max)
13
+ - Panel B — behaviour (with vs. without)
14
+
15
+ Each panel uses plain language, prints `confidence` markers inline,
16
+ and ends with a bold NETTO line that lifts the totals out of the
17
+ table.
18
+ """
19
+ from __future__ import annotations
20
+
21
+ import argparse
22
+ import json
23
+ import sys
24
+ from datetime import datetime, timezone
25
+ from pathlib import Path
26
+ from typing import Any, Dict, Optional
27
+
28
+
29
# Two levels up from this file, i.e. the directory that contains the
# directory this script lives in.
REPO_ROOT = Path(__file__).resolve().parent.parent
VALUE_REPORTS_DIR = REPO_ROOT / "internal" / "bench" / "reports" / "value"
# Input: the report this renderer formats.
LATEST = VALUE_REPORTS_DIR / "latest.json"
# Output: the generated dashboard.
OUT_PATH = REPO_ROOT / "docs" / "value.md"

# NOTE(review): not referenced by any function visible in this file view;
# presumably documents the markers the rendered page must contain — confirm.
REQUIRED_SECTIONS = (
    "## Reference scale",
    "## Panel A",
    "## Panel B",
    "## Glossar",
    "**NETTO",
)
41
+
42
+
43
def utc_iso() -> str:
    """Return the current UTC time as an ISO-8601 string, second precision."""
    now = datetime.now(tz=timezone.utc)
    # Zeroing the microseconds makes isoformat() stop at whole seconds.
    return now.replace(microsecond=0).isoformat()
45
+
46
+
47
def safe_load(path: Path) -> Optional[Dict[str, Any]]:
    """Load a JSON object from ``path``, or return ``None`` if unavailable.

    ``None`` is returned when the file does not exist, is not valid UTF-8,
    is not valid JSON, or parses to something other than a JSON object.
    The file is always decoded as UTF-8 rather than the locale default.

    Args:
        path: Location of the JSON report.

    Returns:
        The parsed object as a dict, or ``None``.
    """
    if not path.exists():
        return None
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError both derive from
        # ValueError; either means "no usable report".
        return None
    # Callers use `.get` on the result, so anything but an object is unusable.
    return data if isinstance(data, dict) else None
54
+
55
+
56
def fmt_signed_int(value: int) -> str:
    """Format ``value`` with an explicit sign and spaces as thousands separators."""
    grouped = format(value, "+,")
    return grouped.replace(",", " ")
58
+
59
+
60
def fmt_eur(value: float) -> str:
    """Format ``value`` as a signed euro amount with two decimals."""
    return "{:+.2f} €".format(value)
62
+
63
+
64
def fmt_pct(value: float) -> str:
    """Format ``value`` as a signed percentage with two decimals.

    The value is printed as-is; it is not multiplied by 100.
    """
    return "{:+.2f}%".format(value)
66
+
67
+
68
def confidence_badge(level: str) -> str:
    """Map a confidence level to its display badge.

    Unknown levels are returned unchanged.
    """
    known = dict(
        [
            ("measured", "✅ gemessen"),
            ("estimated", "≈ geschätzt"),
            ("vendor-claim", "⚠️ vendor-claim"),
            ("pending", "⏳ pending"),
        ]
    )
    try:
        return known[level]
    except KeyError:
        return level
76
+
77
+
78
def mode_badge(mode: str) -> str:
    """Decorate the run mode: "live" and "dry-run" get a badge, others pass through."""
    if mode == "dry-run":
        return "⚠️ dry-run"
    return "✅ live" if mode == "live" else mode
84
+
85
+
86
def render_intro(report: Dict[str, Any]) -> str:
    """Render the page title, the reading guide and the "Reference scale" section.

    Reads ``report["reference_scale"]`` and falls back to built-in defaults
    for any key that is missing (1000 requests, 8000 input tokens, 600
    output tokens, tier "sonnet", pricing date "—").

    Returns:
        The Markdown text (German) ending with a newline.
    """
    ref = report.get("reference_scale", {})
    requests = ref.get("requests", 1000)
    avg_in = ref.get("avg_input_tokens", 8000)
    avg_out = ref.get("avg_output_tokens", 600)
    tier = ref.get("model_tier", "sonnet")
    sourced = ref.get("pricing_sourced_on", "—")
    # The `:,` format specs below require numeric values.
    return (
        f"# Value Dashboard — was kostet das Paket, was bringt es?\n"
        "\n"
        "> Diese Seite beantwortet **eine** Frage in echten Zahlen: "
        "*Wie viel mehr Tokens kostet mich das Paket, und wie viel "
        "spart es danach wieder ein?* Generiert von "
        "`scripts/render_value_md.py` aus dem letzten `value-v1` Report; "
        "Quelle: `internal/bench/reports/value/latest.json`.\n"
        "\n"
        "## Wie diese Seite zu lesen ist\n"
        "\n"
        "**Panel A (Kostenleiter)** — von oben nach unten lesen. Jede "
        "Stufe sagt: *was sie macht*, *wie viele Input-Tokens sie pro "
        "Request hinzufügt oder spart*, *was das in € auf "
        f"{requests:,} Requests kostet*, und *wo wir kumulativ stehen*. "
        "Die fett gedruckte **NETTO**-Zeile am Ende ist die Antwort.\n"
        "\n"
        "**Panel B (Verhalten)** — vier reale Vergleiche, *mit* vs. "
        "*ohne* Paket. Hier liegt der nicht-Token-Wert: passende Skill-"
        "Auswahl, Stopps bei riskanten Aktionen, weniger Rückfragen, "
        "mehr abgeschlossene Aufgaben.\n"
        "\n"
        "**Confidence-Marker** an jeder Stufe: `✅ gemessen` = echter "
        "Wert aus einem Report im Repo · `⏳ pending` = noch nicht "
        "gemessen, Stufe trägt 0 zur Summe bei · `⚠️ vendor-claim` = "
        "Behauptung eines Herstellers, nicht selbst gemessen.\n"
        "\n"
        "## Reference scale\n"
        "\n"
        f"- **{requests:,}** Requests, durchschnittlich "
        f"**{avg_in:,}** Input-Tokens und **{avg_out:,}** Output-Tokens "
        "pro Request\n"
        f"- Modell-Tier: `{tier}` · "
        f"Preisstand `{sourced}` (Quelle: `internal/bench/pricing.yaml`)\n"
        "- Wer einen anderen Workload fährt, rechnet selbst nach — die "
        "Methodik ist offengelegt; nichts ist hardcodiert versteckt.\n"
    )
130
+
131
+
132
def render_panel_a(report: Dict[str, Any]) -> str:
    """Render Panel A: the cost-ladder table followed by the bold NETTO line.

    One table row is emitted per entry of ``report["cost_ladder"]`` (plus
    an extra footnote row when the rung has a ``footnote``). The totals
    for the NETTO line come from ``report["totals"]``.

    A JSON ``null`` for ``cost_ladder``, ``totals``, ``reference_scale`` or
    any of the numeric fields is treated like a missing key (empty / zero)
    instead of raising, because ``dict.get`` defaults only apply to keys
    that are absent.

    Raises:
        KeyError: if a rung has no ``id`` or no ``label``.

    Returns:
        The Markdown text (German) for the panel.
    """
    lines = [
        "## Panel A — Kostenleiter (kumulativ, min → max)\n",
        "Liest sich von oben nach unten. Positive Δ-Werte = das Paket "
        "*kostet* Tokens (Regel-Load ist die ehrliche Up-Front-Steuer); "
        "negative Δ-Werte = das Paket *spart* Tokens.\n",
        "| Stufe | Was sie tut | Δ Tokens | Δ € (1k Req) | Kumulativ | Quelle |",
        "|---|---|---:|---:|---:|---|",
    ]
    for rung in report.get("cost_ladder") or []:
        # The baseline rung is the reference point, so its label is bold.
        if rung["id"] == "baseline":
            label_cell = f"**{rung['label']}**"
        else:
            label_cell = rung["label"]
        what = rung.get("what_it_does", "")
        # `or 0` maps an explicit null to zero.
        token_delta = int(rung.get("token_delta") or 0)
        eur_delta = float(rung.get("eur_delta") or 0.0)
        cum = float(rung.get("cumulative_pct") or 0.0)
        conf = confidence_badge(rung.get("confidence", "pending"))
        source = rung.get("source_report", "")
        # Honesty stamp: an `up-front-cost` note on the load rung.
        if rung["id"] == "load" and token_delta > 0:
            what = f"{what} ⚠️ erst teurer"
        lines.append(
            f"| {label_cell} | {what} | "
            f"{fmt_signed_int(token_delta)} | {fmt_eur(eur_delta)} | "
            f"{fmt_pct(cum)} | `{source}` · {conf} |"
        )
        if rung.get("footnote"):
            lines.append(
                f"| | _Fußnote:_ {rung['footnote']} | | | | |"
            )

    totals = report.get("totals") or {}
    cum_tokens = int(totals.get("cumulative_token_delta") or 0)
    cum_eur = float(totals.get("cumulative_eur_delta") or 0.0)
    cum_pct = float(totals.get("cumulative_pct") or 0.0)
    verdict = totals.get("net_verdict", "—")
    # Unknown verdicts are shown verbatim after "NETTO:".
    verdict_label = {
        "net-saving": "**NETTO: Ersparnis** ✅",
        "net-cost": "**NETTO: Mehrkosten** ⚠️",
        "break-even": "**NETTO: Break-Even** ⚖️",
    }.get(verdict, f"**NETTO: {verdict}**")
    requests = (report.get("reference_scale") or {}).get("requests", 1000)
    lines.extend(
        [
            "",
            f"{verdict_label} — "
            f"**{fmt_signed_int(cum_tokens)} Tokens / Request**, "
            f"**{fmt_eur(cum_eur)}** auf "
            f"{requests:,} Requests, "
            f"kumulativ **{fmt_pct(cum_pct)}** vs. Baseline.\n",
        ]
    )
    return "\n".join(lines)
186
+
187
+
188
def render_panel_b(report: Dict[str, Any]) -> str:
    """Render Panel B: the with/without behaviour comparison table.

    One row is emitted per entry of ``report["behaviour"]``. The ``with``,
    ``without`` and ``delta`` values are formatted according to the
    metric's ``unit`` ("pct", "count", "ratio", "seconds"); ``None`` is
    shown as "—" and anything else falls back to ``str``.

    Returns:
        The Markdown text (German) ending with a newline.
    """

    def _cell(raw: Any, unit: str) -> str:
        # Format a single table value for the given unit.
        if raw is None:
            return "—"
        numeric = isinstance(raw, (int, float))
        if unit == "pct" and numeric:
            return f"{float(raw) * 100:.1f}%"
        if unit == "count":
            return str(int(raw))
        if unit == "ratio" and numeric:
            return f"{float(raw):.3f}"
        if unit == "seconds" and numeric:
            return f"{float(raw):.1f}s"
        return str(raw)

    rows = [
        "## Panel B — Verhalten (mit vs. ohne)\n",
        "Vier reale Vergleiche aus echten Bench-Runs. Hier liegt der "
        "Wert, den Tokens allein nicht messen: ob der Agent das "
        "richtige Skill wählt, bei riskanten Aktionen stoppt, weniger "
        "rückfragt und mehr Aufgaben abschließt.\n",
        "| Metrik | Was es bedeutet | Mit Paket | Ohne Paket | Δ | Mode |",
        "|---|---|---:|---:|---:|---|",
    ]
    for entry in report.get("behaviour", []):
        name = entry["label"]
        meaning = entry.get("what_this_means", "")
        unit = entry.get("unit", "")
        badge = mode_badge(entry.get("mode", "dry-run"))
        cells = [_cell(entry.get(key), unit) for key in ("with", "without", "delta")]
        rows.append(
            f"| {name} | {meaning} | {cells[0]} | {cells[1]} | {cells[2]} | {badge} |"
        )
    return "\n".join(rows) + "\n"
224
+
225
+
226
def render_glossary() -> str:
    """Return the static "Glossar" section (German, plain-language definitions).

    The text is a fixed literal; it takes no input from the report.
    """
    return (
        "## Glossar\n"
        "\n"
        "Plain-language Definitionen für den nicht-Entwickler-Reader.\n"
        "\n"
        "- **Token** — die Einheit, in der ein Sprachmodell abrechnet. "
        "Faustregel: ein Token ≈ 4 Zeichen deutsch/englischer Prosa. "
        "1.000 Tokens ≈ 750 Wörter.\n"
        "- **Input-Tokens** — alles, was das Modell pro Turn liest "
        "(System-Prompt, immer-aktive Regeln, deine Nachricht, frühere "
        "Konversation). Das Paket fügt hier Regeln hinzu — Installation "
        "kostet Input-Tokens.\n"
        "- **Output-Tokens** — was das Modell zurückschreibt. Meist "
        "weniger als Input. Pro Token teurer als Input.\n"
        "- **condense** — ein Build-Schritt, der die Regel-Dateien "
        "vor dem Ausliefern schrumpft (`.agent-src.uncondensed` → "
        "`.agent-src`). Spart Input-Tokens bei jedem Request.\n"
        "- **rtk** — der *Rust Token Killer*, ein CLI-Wrapper, der "
        "verbose Output (`git status`, lint-Output, test-Runner) "
        "filtert, bevor das Modell ihn liest. Spart Input-Tokens auf "
        "Tool-Calls.\n"
        "- **terse / telegraph** — ein Stil (kurze Phrasen, "
        "weggelassene Artikel), den der Agent für knappere Antworten "
        "nutzt. Spart Output-Tokens — wenn der Korpus es belohnt.\n"
        "- **Ohne Paket / Mit Paket** — *without the package* / *with "
        "the package* — die zwei Arme des A/B-Vergleichs.\n"
        "- **€-per-1k-requests** — Token-Kosten auf der "
        "Referenz-Skala (1.000 Requests durchschnittlicher Größe, "
        "gepreist mit den aktuellen Sonnet-Raten aus "
        "`internal/bench/pricing.yaml`).\n"
    )
258
+
259
+
260
def render_methodology(report: Dict[str, Any]) -> str:
    """Render the "Methodik & Quellen" section and the render timestamp.

    The list of source files is a fixed literal. If ``report["notes"]`` is
    non-empty, each note is appended as a bullet under a "Hinweise" heading.
    The section ends with a "Last rendered" line carrying the current UTC
    time, so the output differs between runs.
    """
    notes = report.get("notes", [])
    # Entries are joined with "\n" at the end; entries that already end in
    # "\n" therefore produce a blank line after them.
    lines = [
        "## Methodik & Quellen\n",
        "Diese Seite ist eine **abgeleitete** Sicht — keine eigene "
        "Messung. Sie fasst drei bestehende Bench-Surfaces zusammen "
        "(siehe Spalte 'Quelle' in Panel A). Die maschinen-lesbaren "
        "Roh-Reports bleiben die Source-of-Truth:\n",
        "- `internal/bench/reports/telegraph-v1.json` / `telegraph-v2.json` "
        "— Telegraph/Condense-Messungen.\n",
        "- `agents/runtime/frugality/baseline.jsonl` — der Paket-Load "
        "(Metric A footprint).\n",
        "- `internal/bench/reports/rtk/latest.json` — die rtk-Messung "
        "(neu, Phase 2).\n",
        "- `internal/bench/reports/ab/*-ab-trackb-{with,without}.json` "
        "— A/B Track B (Verhalten).\n",
        "- `internal/bench/reports/*-dev.json` — Dev-Korpus Selection-"
        "Accuracy.\n",
        "",
        "**A/B-technischer Anhang:** [`docs/benchmark.md`](benchmark.md) "
        "trägt die Cache-Key-, Integrity- und Methodik-Details des "
        "A/B-Benches — wer den Variant-Axis-Beweis sehen will, liest "
        "dort weiter.\n",
        "",
    ]
    if notes:
        lines.append("**Hinweise aus dem Report:**\n")
        for note in notes:
            lines.append(f"- {note}")
        lines.append("")
    lines.append(f"_Last rendered: `{utc_iso()}`_\n")
    return "\n".join(lines)
292
+
293
+
294
def render_placeholder() -> str:
    """Return the placeholder page written when no report is available.

    The page (German) explains how to produce a report and ends with a
    "Last rendered" line carrying the current UTC time.
    """
    return (
        "# Value Dashboard — Platzhalter\n"
        "\n"
        "_Es liegt noch kein `value-v1` Report unter "
        "`internal/bench/reports/value/latest.json` vor._\n"
        "\n"
        "Einen erzeugen mit:\n"
        "\n"
        "```sh\n"
        "task value\n"
        "```\n"
        "\n"
        "Die Methodik dieses Dashboards ist beschrieben in "
        "`docs/contracts/value-dashboard-spec.md` und der zugehörigen "
        "Roadmap `agents/roadmaps/road-to-readable-value-dashboard.md`.\n"
        "\n"
        f"_Last rendered: {utc_iso()}_\n"
    )
313
+
314
+
315
def render(quiet: bool = False) -> int:
    """Write ``OUT_PATH`` from the report at ``LATEST``.

    When ``safe_load`` yields nothing usable (or an empty report), a
    placeholder page is written instead. The output directory is created
    if needed. The file is always written as UTF-8, because the page
    contains characters (e.g. "✅", "€") that many locale encodings cannot
    represent.

    Args:
        quiet: When true, the one-line confirmation on stdout is skipped.

    Returns:
        Always 0.
    """
    report = safe_load(LATEST)
    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    if not report:
        OUT_PATH.write_text(render_placeholder(), encoding="utf-8")
        if not quiet:
            sys.stdout.write(
                f"render_value_md: no report — wrote placeholder to "
                f"{OUT_PATH.relative_to(REPO_ROOT)}\n"
            )
        return 0
    parts = [
        render_intro(report),
        render_panel_a(report),
        render_panel_b(report),
        render_glossary(),
        render_methodology(report),
    ]
    OUT_PATH.write_text("\n".join(parts), encoding="utf-8")
    if not quiet:
        sys.stdout.write(
            f"render_value_md: wrote {OUT_PATH.relative_to(REPO_ROOT)}\n"
        )
    return 0
339
+
340
+
341
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the renderer's command line; the only option is ``--quiet``."""
    cli = argparse.ArgumentParser(
        description="Render docs/value.md from the latest value-v1 report."
    )
    cli.add_argument(
        "--quiet",
        action="store_true",
        help="Suppress stdout.",
    )
    return cli.parse_args(argv)
347
+
348
+
349
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv`` (default ``sys.argv[1:]``) and render.

    Returns the exit status produced by ``render``.
    """
    if argv is None:
        argv = sys.argv[1:]
    options = parse_args(argv)
    return render(quiet=options.quiet)
352
+
353
+
354
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())