@event4u/agent-config 4.9.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/commands/implement-ticket.md +5 -4
- package/.agent-src/rules/language-and-tone.md +4 -10
- package/.agent-src/skills/command-routing/SKILL.md +5 -4
- package/.claude-plugin/marketplace.json +1 -1
- package/CHANGELOG.md +73 -0
- package/CONTRIBUTING.md +19 -0
- package/README.md +11 -0
- package/dist/cli/registry.js +0 -2
- package/dist/cli/registry.js.map +1 -1
- package/dist/discovery/deprecation-report.md +1 -1
- package/dist/discovery/discovery-manifest.json +5 -5
- package/dist/discovery/discovery-manifest.json.sha256 +1 -1
- package/dist/discovery/discovery-manifest.summary.md +1 -1
- package/dist/discovery/orphan-report.md +1 -1
- package/dist/discovery/packs.json +2 -2
- package/dist/discovery/trust-report.md +1 -1
- package/dist/discovery/workspaces.json +2 -2
- package/dist/mcp/registry-manifest.json +2 -2
- package/dist/router.json +1 -1671
- package/docs/benchmark.md +20 -8
- package/docs/benchmarks.md +11 -0
- package/docs/contracts/benchmark-corpus-spec.md +31 -3
- package/docs/contracts/command-surface-tiers.md +1 -1
- package/docs/contracts/hook-architecture-v1.md +33 -0
- package/docs/contracts/migrate-command.md +197 -0
- package/docs/contracts/settings-api.md +2 -1
- package/docs/contracts/value-dashboard-spec.md +374 -0
- package/docs/contracts/value-report-schema.md +150 -0
- package/docs/decisions/ADR-031-validation-severity-tiers-and-projection-roundtrip.md +97 -0
- package/docs/decisions/INDEX.md +1 -0
- package/docs/guidelines/agent-infra/installed-tools-manifest.md +6 -3
- package/docs/guidelines/agent-infra/language-and-tone-examples.md +35 -0
- package/docs/migration/v1-to-v2.md +40 -27
- package/docs/value.md +84 -0
- package/package.json +8 -8
- package/scripts/__pycache__/validate_frontmatter.cpython-312.pyc +0 -0
- package/scripts/_cli/cmd_migrate.py +264 -102
- package/scripts/_cli/cmd_settings_migrate.py +2 -1
- package/scripts/_dispatch.bash +147 -49
- package/scripts/_lib/__pycache__/__init__.cpython-312.pyc +0 -0
- package/scripts/_lib/__pycache__/agent_src.cpython-312.pyc +0 -0
- package/scripts/_lib/install_regenerator.py +129 -0
- package/scripts/_lib/value_ladder.py +599 -0
- package/scripts/_lib/value_report.py +441 -0
- package/scripts/bench_rtk_savings.py +320 -0
- package/scripts/compile_router.py +19 -5
- package/scripts/expected_perms.json +1 -1
- package/scripts/first_run_gate_hook.py +178 -0
- package/scripts/hook_manifest.yaml +16 -7
- package/scripts/hooks/dispatch_hook.py +27 -0
- package/scripts/hooks/dispatch_issues.py +136 -0
- package/scripts/hooks_doctor.py +40 -1
- package/scripts/install.py +25 -21
- package/scripts/lint_agents_layout.py +5 -4
- package/scripts/lint_bench_corpus.py +86 -4
- package/scripts/lint_global_paths.py +4 -3
- package/scripts/lint_marketplace_install_completeness.py +188 -0
- package/scripts/lint_value_dashboard.py +218 -0
- package/scripts/render_benchmark_md.py +6 -2
- package/scripts/render_value_md.py +355 -0
- package/scripts/repro/repro_marketplace_install_gap.sh +161 -0
- package/scripts/roadmap_progress_hook.py +23 -0
- package/scripts/router_telemetry.py +470 -0
- package/scripts/validate_frontmatter.py +23 -9
- package/scripts/_cli/cmd_migrate_to_global.py +0 -415
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Lint `docs/value.md` for structural invariants.
|
|
3
|
+
|
|
4
|
+
Phase 5 Step 3 of `agents/roadmaps/road-to-readable-value-dashboard.md`.
|
|
5
|
+
|
|
6
|
+
Invariants enforced (any violation → exit 1):
|
|
7
|
+
|
|
8
|
+
1. Required sections present (intro / Reference scale / Panel A / Panel B
|
|
9
|
+
/ Glossar / NETTO line).
|
|
10
|
+
2. Every cost-ladder rung row cites a `source_report` (or `n/a` for the
|
|
11
|
+
baseline rung) — no rung sneaks in without traceability.
|
|
12
|
+
3. No `measured` rung renders a `pending` source — internal consistency
|
|
13
|
+
of confidence ↔ source state.
|
|
14
|
+
4. No negative-saving label: the literal string "Ersparnis" must not
|
|
15
|
+
appear in a row where the displayed Δ-token value is positive (the
|
|
16
|
+
load + terse rungs are *costs*, not savings; mislabelling either is
|
|
17
|
+
a credibility failure the page explicitly forbids).
|
|
18
|
+
5. The `latest.json` exists and its `cost_ladder` rung ids match the
|
|
19
|
+
five canonical rungs — the renderer cannot silently drop a rung.
|
|
20
|
+
|
|
21
|
+
The linter loads `internal/bench/reports/value/latest.json` directly
|
|
22
|
+
(not just the rendered `.md`) for items (3) and (5) — the rendered
|
|
23
|
+
text alone is too lossy.
|
|
24
|
+
|
|
25
|
+
Output: one violation per line in non-quiet mode; one-line summary in
|
|
26
|
+
quiet mode. Exit 0 on clean, 1 on any violation.
|
|
27
|
+
"""
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import argparse
|
|
31
|
+
import json
|
|
32
|
+
import re
|
|
33
|
+
import sys
|
|
34
|
+
from pathlib import Path
|
|
35
|
+
from typing import Any, Dict, List
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Two directory levels above this script's resolved location.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Rendered dashboard inspected by the text-based checks.
DASHBOARD = REPO_ROOT.joinpath("docs", "value.md")
# JSON report inspected by the report-based checks.
LATEST = REPO_ROOT.joinpath("internal", "bench", "reports", "value", "latest.json")

# Substrings that must each occur somewhere in the dashboard text.
REQUIRED_SECTIONS = (
    "# Value Dashboard",
    "## Reference scale",
    "## Panel A",
    "## Panel B",
    "## Glossar",
    "**NETTO",
)

# Expected `cost_ladder` rung ids, in this exact order.
CANONICAL_RUNG_IDS = ("baseline", "load", "condense", "rtk", "terse")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _log(msg: str, quiet: bool, *, err: bool = False) -> None:
    """Print ``msg`` to stderr when ``err`` is set, else to stdout unless ``quiet``.

    Error messages are never suppressed by ``quiet``.
    """
    if quiet and not err:
        return
    print(msg, file=sys.stderr if err else sys.stdout)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def check_required_sections(text: str) -> List[str]:
    """Return one violation per ``REQUIRED_SECTIONS`` entry that is absent from ``text``.

    Presence is a plain substring test; the entries are checked in declaration
    order, so the violations come back in that order too.
    """
    missing: List[str] = []
    for section in REQUIRED_SECTIONS:
        if section in text:
            continue
        missing.append(f"missing required section: '{section}'")
    return missing
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def check_source_citations(report: Dict[str, Any]) -> List[str]:
    """Return a violation for every cost-ladder rung without a usable ``source_report``.

    A falsy value (missing key, ``None``, ``""``) is reported as a missing
    field; a truthy value that is not a string, or a string containing only
    whitespace, is reported as empty. A missing or null ``cost_ladder`` yields
    no violations.
    """
    problems: List[str] = []
    ladder = report.get("cost_ladder", []) or []
    for entry in ladder:
        cited = entry.get("source_report")
        if not cited:
            reason = "has no source_report field"
        elif not isinstance(cited, str) or not cited.strip():
            reason = "has empty source_report"
        else:
            continue
        problems.append(f"rung '{entry.get('id')}' {reason}")
    return problems
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def check_confidence_vs_source(report: Dict[str, Any]) -> List[str]:
    """Return a violation for every ``measured`` rung whose ``source_report`` is not on disk.

    Only rungs with ``confidence == "measured"`` are inspected. A falsy source
    or the literal ``"n/a"`` (the baseline rung) is skipped. Any other string
    is resolved against ``REPO_ROOT`` and must exist.

    A truthy non-string ``source_report`` is skipped here rather than joined
    onto a path: joining would raise ``TypeError`` and abort the whole lint
    run, and ``check_source_citations`` already reports such a value.
    """
    violations: List[str] = []
    for rung in report.get("cost_ladder", []) or []:
        if rung.get("confidence") != "measured":
            continue
        source = rung.get("source_report") or ""
        if not isinstance(source, str):
            continue  # malformed citation; not resolvable as a path
        if source in ("", "n/a"):
            continue  # baseline rung
        if not (REPO_ROOT / source).exists():
            violations.append(
                f"rung '{rung.get('id')}' is 'measured' but its "
                f"source_report does not exist: {source}"
            )
    return violations
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def check_no_negative_savings(text: str) -> List[str]:
    """Flag Panel A rows labelled "Ersparnis" that display a positive token delta.

    Heuristic: walk the lines between the ``## Panel A`` heading and the next
    ``## `` heading, look only at table rows (lines starting with ``|``) that
    contain the German word "Ersparnis", and read the first cell shaped like
    a signed integer (rendered as ``+4 843`` / ``-186``).

    The delta is compared numerically, so a zero rendered as ``+0`` is not
    treated as a cost, and a cell must contain at least one digit to count as
    a delta at all.

    Returns:
        One violation message per offending row, in document order.
    """
    # Sign, one mandatory digit, then digits or the spaces used as thousands
    # separators. Compiled once instead of on every row.
    delta_cell = re.compile(r"\|\s*([+-][0-9][0-9 ]*)\s*\|")

    violations: List[str] = []
    in_panel_a = False
    for line in text.splitlines():
        if line.startswith("## Panel A"):
            in_panel_a = True
            continue
        if not in_panel_a:
            continue
        if line.startswith("## "):
            break  # next section: Panel A is over
        if not line.startswith("|") or "Ersparnis" not in line:
            continue
        match = delta_cell.search(line)
        if match is None:
            continue
        token_value = match.group(1).strip()
        # Compare the number, not the sign character: "+0" is not a cost.
        if int(token_value.replace(" ", "")) > 0:
            violations.append(
                "row labelled 'Ersparnis' has a positive Δ-token value: "
                f"{token_value!r} — positive deltas are costs, not savings."
            )
    return violations
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def check_canonical_rung_set(report: Dict[str, Any]) -> List[str]:
    """Return a single violation unless the rung ids equal ``CANONICAL_RUNG_IDS`` in order.

    A missing or null ``cost_ladder`` counts as an empty id sequence.
    """
    seen = tuple(entry.get("id") for entry in (report.get("cost_ladder", []) or []))
    if seen == tuple(CANONICAL_RUNG_IDS):
        return []
    return [f"cost_ladder rung ids must be {CANONICAL_RUNG_IDS}, got {seen}"]
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _emit_violations(violations: List[str], quiet: bool) -> int:
    """Log each violation as a ``FAIL:`` line on stderr and return exit code 1."""
    for violation in violations:
        _log(f"FAIL: {violation}", quiet, err=True)
    return 1


def lint(quiet: bool = False) -> int:
    """Run every dashboard check and return the process exit code.

    The text checks run against ``DASHBOARD``. When ``LATEST`` is absent the
    dashboard is treated as a placeholder and only the text checks apply;
    otherwise the JSON report is loaded and the report checks run as well.

    Args:
        quiet: Forwarded to ``_log`` for the failure lines. Those go to stderr
            and are always shown; the closing summary line is always printed.

    Returns:
        0 when no check reported a violation; 1 on any violation, a missing
        dashboard, or an unusable report file.
    """
    if not DASHBOARD.exists():
        _log(
            f"FAIL: dashboard not found: {DASHBOARD.relative_to(REPO_ROOT)}",
            quiet,
            err=True,
        )
        return 1

    # Decode explicitly: the checks look for non-ASCII markers such as "Δ",
    # so the result must not depend on the platform's locale encoding.
    text = DASHBOARD.read_text(encoding="utf-8")
    violations: List[str] = []
    violations.extend(check_required_sections(text))
    violations.extend(check_no_negative_savings(text))

    if not LATEST.exists():
        # No JSON to deep-check — that's a placeholder dashboard.
        # Required-sections check still applies; we degrade gracefully.
        if violations:
            return _emit_violations(violations, quiet)
        _log(
            "lint_value_dashboard: dashboard is a placeholder "
            "(no value-v1.json yet) — structural checks pass.",
            quiet=False,
        )
        return 0

    try:
        report = json.loads(LATEST.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        _log(f"FAIL: {LATEST.name} is not valid JSON: {exc}", quiet, err=True)
        return 1
    if not isinstance(report, dict):
        # Every report check calls report.get(); fail cleanly on a list/scalar.
        _log(
            f"FAIL: {LATEST.name} must contain a JSON object, "
            f"got {type(report).__name__}",
            quiet,
            err=True,
        )
        return 1

    violations.extend(check_source_citations(report))
    violations.extend(check_confidence_vs_source(report))
    violations.extend(check_canonical_rung_set(report))

    if violations:
        return _emit_violations(violations, quiet)

    # `or []` keeps the summary from failing on an explicit JSON null.
    rung_count = len(report.get("cost_ladder", []) or [])
    behaviour_count = len(report.get("behaviour", []) or [])
    _log(
        (
            "lint_value_dashboard: OK — "
            f"{rung_count} rungs, "
            f"{behaviour_count} behaviour metrics, all "
            "sections present, all sources cited."
        ),
        quiet=False,
    )
    return 0
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def parse_args(argv: List[str]) -> argparse.Namespace:
    """Parse ``argv``; the only recognised option is the ``--quiet`` flag."""
    cli = argparse.ArgumentParser(
        description="Lint docs/value.md for structural invariants."
    )
    cli.add_argument("--quiet", action="store_true", help="Suppress non-error output.")
    return cli.parse_args(argv)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def main(argv: List[str] | None = None) -> int:
    """Parse the arguments (defaulting to ``sys.argv[1:]``) and run ``lint``.

    Returns:
        The exit code produced by ``lint``.
    """
    if argv is None:
        argv = sys.argv[1:]
    options = parse_args(argv)
    return lint(quiet=options.quiet)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -103,10 +103,14 @@ def render_headline(track_a: dict, track_b: dict) -> str:
|
|
|
103
103
|
lines = [
|
|
104
104
|
"## Headline",
|
|
105
105
|
"",
|
|
106
|
+
"> **Track A confirms surface availability** — a precondition, not an impact metric. "
|
|
107
|
+
"For the impact view (cost-ladder + behaviour with vs. without), see "
|
|
108
|
+
"[`docs/value.md`](value.md).",
|
|
109
|
+
"",
|
|
106
110
|
"| Metric | with | without | delta |",
|
|
107
111
|
"|---|---|---|---|",
|
|
108
|
-
f"| Track A
|
|
109
|
-
f"{fmt_pct((a_with_acc or 0) - (a_wo_acc or 0))} |",
|
|
112
|
+
f"| Track A surface-availability | {fmt_pct(a_with_acc)} | {fmt_pct(a_wo_acc)} | "
|
|
113
|
+
f"{fmt_pct((a_with_acc or 0) - (a_wo_acc or 0))} _(structural — files present)_ |",
|
|
110
114
|
f"| Track B completion-rate | {fmt_pct(b_with_comp)} | {fmt_pct(b_wo_comp)} | "
|
|
111
115
|
f"{fmt_pct((b_with_comp or 0) - (b_wo_comp or 0))} |",
|
|
112
116
|
f"| Track B mean wall-time | {fmt_num(b_results.get('mean_wall_time'))}s "
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Render `docs/value.md` from the latest `value-v1` JSON report.
|
|
3
|
+
|
|
4
|
+
Phase 4 Step 1 of `agents/roadmaps/road-to-readable-value-dashboard.md`.
|
|
5
|
+
|
|
6
|
+
This renderer is **deterministic** — it does not run any bench, only
|
|
7
|
+
formats existing reports. Mirrors `render_benchmark_md.py`'s placeholder
|
|
8
|
+
discipline: when the report is missing, write a placeholder document
|
|
9
|
+
explaining how to produce one. Never errors.
|
|
10
|
+
|
|
11
|
+
The dashboard has two panels:
|
|
12
|
+
- Panel A — cost ladder (cumulative, min → max)
|
|
13
|
+
- Panel B — behaviour (with vs. without)
|
|
14
|
+
|
|
15
|
+
Each panel uses plain language, prints `confidence` markers inline,
|
|
16
|
+
and ends with a bold NETTO line that lifts the totals out of the
|
|
17
|
+
table.
|
|
18
|
+
"""
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import argparse
|
|
22
|
+
import json
|
|
23
|
+
import sys
|
|
24
|
+
from datetime import datetime, timezone
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Any, Dict, Optional
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Two directory levels above this script's resolved location.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Directory holding the value reports.
VALUE_REPORTS_DIR = REPO_ROOT.joinpath("internal", "bench", "reports", "value")
# Report the renderer reads.
LATEST = VALUE_REPORTS_DIR.joinpath("latest.json")
# Markdown file the renderer writes.
OUT_PATH = REPO_ROOT.joinpath("docs", "value.md")

# Section markers the rendered document is expected to contain.
REQUIRED_SECTIONS = (
    "## Reference scale",
    "## Panel A",
    "## Panel B",
    "## Glossar",
    "**NETTO",
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def utc_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with whole-second precision."""
    now = datetime.now(tz=timezone.utc)
    # With microseconds zeroed, isoformat() omits the fractional part.
    return now.replace(microsecond=0).isoformat()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def safe_load(path: Path) -> Optional[Dict[str, Any]]:
    """Load a JSON object from ``path``, returning ``None`` when that is not possible.

    ``None`` is returned when the file is missing or unreadable, is not valid
    UTF-8, is not valid JSON, or decodes to something other than a JSON
    object. The function never raises for these cases, so the caller can fall
    back to its placeholder output.
    """
    try:
        # JSON interchange is UTF-8; do not depend on the locale encoding.
        raw = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # Missing file, directory, permission problem, or undecodable bytes.
        return None
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        return None
    # Callers use dict methods on the result; a list or scalar is unusable.
    return data if isinstance(data, dict) else None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def fmt_signed_int(value: int) -> str:
    """Format ``value`` with an explicit sign and spaces as thousands separators."""
    grouped = format(value, "+,")
    return grouped.replace(",", " ")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def fmt_eur(value: float) -> str:
    """Format ``value`` as a signed amount with two decimals followed by `` €``."""
    return format(value, "+.2f") + " €"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def fmt_pct(value: float) -> str:
    """Format ``value`` as a signed number with two decimals followed by ``%``.

    The value is printed as given; it is not multiplied by 100.
    """
    return "{:+.2f}%".format(value)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def confidence_badge(level: str) -> str:
    """Map a confidence level to its display badge; unknown levels are returned unchanged."""
    known = {
        "measured": "✅ gemessen",
        "estimated": "≈ geschätzt",
        "vendor-claim": "⚠️ vendor-claim",
        "pending": "⏳ pending",
    }
    try:
        return known[level]
    except KeyError:
        return level
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def mode_badge(mode: str) -> str:
    """Map a run mode to its display badge; any other value is returned unchanged."""
    for name, badge in (("live", "✅ live"), ("dry-run", "⚠️ dry-run")):
        if mode == name:
            return badge
    return mode
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def render_intro(report: Dict[str, Any]) -> str:
    """Render the page title, the reading guide and the "Reference scale" section.

    Values come from ``report["reference_scale"]``; a missing key falls back
    to 1000 requests, 8000 input tokens, 600 output tokens, tier ``sonnet``
    and ``—`` for the pricing date.
    """
    scale = report.get("reference_scale", {})
    requests = scale.get("requests", 1000)
    avg_in = scale.get("avg_input_tokens", 8000)
    avg_out = scale.get("avg_output_tokens", 600)
    tier = scale.get("model_tier", "sonnet")
    sourced = scale.get("pricing_sourced_on", "—")

    title = "# Value Dashboard — was kostet das Paket, was bringt es?\n"
    lead = (
        "> Diese Seite beantwortet **eine** Frage in echten Zahlen: "
        "*Wie viel mehr Tokens kostet mich das Paket, und wie viel "
        "spart es danach wieder ein?* Generiert von "
        "`scripts/render_value_md.py` aus dem letzten `value-v1` Report; "
        "Quelle: `internal/bench/reports/value/latest.json`.\n"
    )
    guide_heading = "## Wie diese Seite zu lesen ist\n"
    panel_a_guide = (
        "**Panel A (Kostenleiter)** — von oben nach unten lesen. Jede "
        "Stufe sagt: *was sie macht*, *wie viele Input-Tokens sie pro "
        "Request hinzufügt oder spart*, *was das in € auf "
        f"{requests:,} Requests kostet*, und *wo wir kumulativ stehen*. "
        "Die fett gedruckte **NETTO**-Zeile am Ende ist die Antwort.\n"
    )
    panel_b_guide = (
        "**Panel B (Verhalten)** — vier reale Vergleiche, *mit* vs. "
        "*ohne* Paket. Hier liegt der nicht-Token-Wert: passende Skill-"
        "Auswahl, Stopps bei riskanten Aktionen, weniger Rückfragen, "
        "mehr abgeschlossene Aufgaben.\n"
    )
    confidence_guide = (
        "**Confidence-Marker** an jeder Stufe: `✅ gemessen` = echter "
        "Wert aus einem Report im Repo · `⏳ pending` = noch nicht "
        "gemessen, Stufe trägt 0 zur Summe bei · `⚠️ vendor-claim` = "
        "Behauptung eines Herstellers, nicht selbst gemessen.\n"
    )
    scale_heading = "## Reference scale\n"
    scale_bullets = (
        f"- **{requests:,}** Requests, durchschnittlich "
        f"**{avg_in:,}** Input-Tokens und **{avg_out:,}** Output-Tokens "
        "pro Request\n"
        f"- Modell-Tier: `{tier}` · "
        f"Preisstand `{sourced}` (Quelle: `internal/bench/pricing.yaml`)\n"
        "- Wer einen anderen Workload fährt, rechnet selbst nach — die "
        "Methodik ist offengelegt; nichts ist hardcodiert versteckt.\n"
    )
    # Every block already ends in a newline; joining with "\n" puts one blank
    # line between consecutive blocks.
    return "\n".join(
        (
            title,
            lead,
            guide_heading,
            panel_a_guide,
            panel_b_guide,
            confidence_guide,
            scale_heading,
            scale_bullets,
        )
    )
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def render_panel_a(report: Dict[str, Any]) -> str:
    """Render Panel A: the cost-ladder table followed by the bold NETTO line.

    One table row is emitted per entry of ``report["cost_ladder"]``, plus an
    extra footnote row when the rung carries a truthy ``footnote``. The
    closing sentence is built from ``report["totals"]`` and the request count
    in ``report["reference_scale"]``.

    JSON ``null`` is tolerated: a null ``cost_ladder``, ``totals`` or
    ``reference_scale`` is treated as empty, and a null numeric or text field
    on a rung gets the same default as a missing key.

    Raises:
        KeyError: if a rung lacks ``id`` or ``label``.
    """
    # NOTE(review): the header hard-codes "1k Req" while the NETTO line prints
    # reference_scale.requests — confirm the two cannot diverge.
    lines = [
        "## Panel A — Kostenleiter (kumulativ, min → max)\n",
        "Liest sich von oben nach unten. Positive Δ-Werte = das Paket "
        "*kostet* Tokens (Regel-Load ist die ehrliche Up-Front-Steuer); "
        "negative Δ-Werte = das Paket *spart* Tokens.\n",
        "| Stufe | Was sie tut | Δ Tokens | Δ € (1k Req) | Kumulativ | Quelle |",
        "|---|---|---:|---:|---:|---|",
    ]
    for rung in report.get("cost_ladder") or []:
        if rung["id"] == "baseline":
            label_cell = f"**{rung['label']}**"
        else:
            label_cell = rung["label"]
        # `or <default>` maps an explicit null to the missing-key default.
        what = rung.get("what_it_does") or ""
        token_delta = int(rung.get("token_delta") or 0)
        eur_delta = float(rung.get("eur_delta") or 0.0)
        cum = float(rung.get("cumulative_pct") or 0.0)
        conf = confidence_badge(rung.get("confidence") or "pending")
        source = rung.get("source_report") or ""
        # Honesty stamp: an `up-front-cost` note on the load rung.
        if rung["id"] == "load" and token_delta > 0:
            what = f"{what} ⚠️ erst teurer"
        lines.append(
            f"| {label_cell} | {what} | "
            f"{fmt_signed_int(token_delta)} | {fmt_eur(eur_delta)} | "
            f"{fmt_pct(cum)} | `{source}` · {conf} |"
        )
        if rung.get("footnote"):
            lines.append(
                f"| | _Fußnote:_ {rung['footnote']} | | | | |"
            )

    totals = report.get("totals") or {}
    cum_tokens = int(totals.get("cumulative_token_delta") or 0)
    cum_eur = float(totals.get("cumulative_eur_delta") or 0.0)
    cum_pct = float(totals.get("cumulative_pct") or 0.0)
    verdict = totals.get("net_verdict", "—")
    verdict_label = {
        "net-saving": "**NETTO: Ersparnis** ✅",
        "net-cost": "**NETTO: Mehrkosten** ⚠️",
        "break-even": "**NETTO: Break-Even** ⚖️",
    }.get(verdict, f"**NETTO: {verdict}**")
    requests = (report.get("reference_scale") or {}).get("requests", 1000)
    lines.extend(
        [
            "",
            f"{verdict_label} — "
            f"**{fmt_signed_int(cum_tokens)} Tokens / Request**, "
            f"**{fmt_eur(cum_eur)}** auf "
            f"{requests:,} Requests, "
            f"kumulativ **{fmt_pct(cum_pct)}** vs. Baseline.\n",
        ]
    )
    return "\n".join(lines)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _fmt_metric_value(value: Any, unit: str) -> str:
    """Format one behaviour-metric value according to its ``unit``.

    ``None`` renders as ``—``. Numeric values are formatted per unit:
    ``pct`` is multiplied by 100 and shown with one decimal, ``count`` is
    truncated to an integer, ``ratio`` gets three decimals and ``seconds``
    one decimal plus ``s``. Everything else falls back to ``str(value)``.
    """
    if value is None:
        return "—"
    if isinstance(value, (int, float)):
        if unit == "pct":
            return f"{float(value) * 100:.1f}%"
        if unit == "count":
            return str(int(value))
        if unit == "ratio":
            return f"{float(value):.3f}"
        if unit == "seconds":
            return f"{float(value):.1f}s"
    # Non-numeric values (including a non-numeric "count") are shown verbatim
    # instead of raising from int().
    return str(value)


def render_panel_b(report: Dict[str, Any]) -> str:
    """Render Panel B: the with-vs-without behaviour table.

    One row is emitted per entry of ``report["behaviour"]``; a missing or
    null list produces just the heading, intro paragraph and table header.

    Raises:
        KeyError: if a metric lacks ``label``.
    """
    lines = [
        "## Panel B — Verhalten (mit vs. ohne)\n",
        "Vier reale Vergleiche aus echten Bench-Runs. Hier liegt der "
        "Wert, den Tokens allein nicht messen: ob der Agent das "
        "richtige Skill wählt, bei riskanten Aktionen stoppt, weniger "
        "rückfragt und mehr Aufgaben abschließt.\n",
        "| Metrik | Was es bedeutet | Mit Paket | Ohne Paket | Δ | Mode |",
        "|---|---|---:|---:|---:|---|",
    ]
    for metric in report.get("behaviour") or []:
        label = metric["label"]
        what = metric.get("what_this_means", "")
        unit = metric.get("unit", "")
        mode = mode_badge(metric.get("mode", "dry-run"))
        with_v = _fmt_metric_value(metric.get("with"), unit)
        without_v = _fmt_metric_value(metric.get("without"), unit)
        delta_v = _fmt_metric_value(metric.get("delta"), unit)
        lines.append(
            f"| {label} | {what} | {with_v} | {without_v} | {delta_v} | {mode} |"
        )
    return "\n".join(lines) + "\n"
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def render_glossary() -> str:
    """Return the static "Glossar" section: heading, lead sentence and bullet list."""
    heading = "## Glossar\n"
    lead = "Plain-language Definitionen für den nicht-Entwickler-Reader.\n"
    entries = (
        "- **Token** — die Einheit, in der ein Sprachmodell abrechnet. "
        "Faustregel: ein Token ≈ 4 Zeichen deutsch/englischer Prosa. "
        "1.000 Tokens ≈ 750 Wörter.\n",
        "- **Input-Tokens** — alles, was das Modell pro Turn liest "
        "(System-Prompt, immer-aktive Regeln, deine Nachricht, frühere "
        "Konversation). Das Paket fügt hier Regeln hinzu — Installation "
        "kostet Input-Tokens.\n",
        "- **Output-Tokens** — was das Modell zurückschreibt. Meist "
        "weniger als Input. Pro Token teurer als Input.\n",
        "- **condense** — ein Build-Schritt, der die Regel-Dateien "
        "vor dem Ausliefern schrumpft (`.agent-src.uncondensed` → "
        "`.agent-src`). Spart Input-Tokens bei jedem Request.\n",
        "- **rtk** — der *Rust Token Killer*, ein CLI-Wrapper, der "
        "verbose Output (`git status`, lint-Output, test-Runner) "
        "filtert, bevor das Modell ihn liest. Spart Input-Tokens auf "
        "Tool-Calls.\n",
        "- **terse / telegraph** — ein Stil (kurze Phrasen, "
        "weggelassene Artikel), den der Agent für knappere Antworten "
        "nutzt. Spart Output-Tokens — wenn der Korpus es belohnt.\n",
        "- **Ohne Paket / Mit Paket** — *without the package* / *with "
        "the package* — die zwei Arme des A/B-Vergleichs.\n",
        "- **€-per-1k-requests** — Token-Kosten auf der "
        "Referenz-Skala (1.000 Requests durchschnittlicher Größe, "
        "gepreist mit den aktuellen Sonnet-Raten aus "
        "`internal/bench/pricing.yaml`).\n",
    )
    # Heading, lead and list each end in a newline; the "\n" separator adds
    # the blank line between them.
    return "\n".join((heading, lead, "".join(entries)))
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def render_methodology(report: Dict[str, Any]) -> str:
    """Render the "Methodik & Quellen" section with the list of raw report paths.

    When ``report["notes"]`` is non-empty, each note is appended as a bullet
    under a "Hinweise aus dem Report" heading. The section ends with a
    "Last rendered" timestamp taken from ``utc_iso()``.
    """
    remarks = report.get("notes", [])
    parts = [
        "## Methodik & Quellen\n",
        "Diese Seite ist eine **abgeleitete** Sicht — keine eigene "
        "Messung. Sie fasst drei bestehende Bench-Surfaces zusammen "
        "(siehe Spalte 'Quelle' in Panel A). Die maschinen-lesbaren "
        "Roh-Reports bleiben die Source-of-Truth:\n",
        "- `internal/bench/reports/telegraph-v1.json` / `telegraph-v2.json` "
        "— Telegraph/Condense-Messungen.\n",
        "- `agents/runtime/frugality/baseline.jsonl` — der Paket-Load "
        "(Metric A footprint).\n",
        "- `internal/bench/reports/rtk/latest.json` — die rtk-Messung "
        "(neu, Phase 2).\n",
        "- `internal/bench/reports/ab/*-ab-trackb-{with,without}.json` "
        "— A/B Track B (Verhalten).\n",
        "- `internal/bench/reports/*-dev.json` — Dev-Korpus Selection-"
        "Accuracy.\n",
        "",
        "**A/B-technischer Anhang:** [`docs/benchmark.md`](benchmark.md) "
        "trägt die Cache-Key-, Integrity- und Methodik-Details des "
        "A/B-Benches — wer den Variant-Axis-Beweis sehen will, liest "
        "dort weiter.\n",
        "",
    ]
    if remarks:
        parts.append("**Hinweise aus dem Report:**\n")
        parts.extend(f"- {remark}" for remark in remarks)
        parts.append("")
    stamp = utc_iso()
    parts.append(f"_Last rendered: `{stamp}`_\n")
    return "\n".join(parts)
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def render_placeholder() -> str:
    """Return the placeholder document used when no value report is available.

    It explains where the report is expected, how to produce one, where the
    methodology is described, and ends with a timestamp from ``utc_iso()``.
    """
    title = "# Value Dashboard — Platzhalter\n"
    status = (
        "_Es liegt noch kein `value-v1` Report unter "
        "`internal/bench/reports/value/latest.json` vor._\n"
    )
    how_to = "Einen erzeugen mit:\n"
    command = "```sh\n" "task value\n" "```\n"
    pointers = (
        "Die Methodik dieses Dashboards ist beschrieben in "
        "`docs/contracts/value-dashboard-spec.md` und der zugehörigen "
        "Roadmap `agents/roadmaps/road-to-readable-value-dashboard.md`.\n"
    )
    footer = f"_Last rendered: {utc_iso()}_\n"
    # Each block ends in a newline; the "\n" separator yields the blank lines.
    return "\n".join((title, status, how_to, command, pointers, footer))
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def render(quiet: bool = False) -> int:
    """Write ``OUT_PATH`` from the report at ``LATEST`` and return exit code 0.

    When ``safe_load`` yields no usable report (``None``, empty, or not a
    JSON object) the placeholder document is written instead. The parent
    directory of ``OUT_PATH`` is created if needed.

    Args:
        quiet: When true, the one-line confirmation on stdout is suppressed.

    Returns:
        Always 0.
    """
    report = safe_load(LATEST)
    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)

    # A non-dict report cannot be rendered (the renderers call report.get()).
    has_report = isinstance(report, dict) and bool(report)
    if has_report:
        document = "\n".join(
            [
                render_intro(report),
                render_panel_a(report),
                render_panel_b(report),
                render_glossary(),
                render_methodology(report),
            ]
        )
    else:
        document = render_placeholder()

    # The document contains characters such as "✅", "⏳" and "Δ" that many
    # locale encodings cannot represent; always write UTF-8.
    OUT_PATH.write_text(document, encoding="utf-8")

    if not quiet:
        target = OUT_PATH.relative_to(REPO_ROOT)
        if has_report:
            sys.stdout.write(f"render_value_md: wrote {target}\n")
        else:
            sys.stdout.write(
                f"render_value_md: no report — wrote placeholder to "
                f"{target}\n"
            )
    return 0
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse ``argv``; the only recognised option is the ``--quiet`` flag."""
    cli = argparse.ArgumentParser(
        description="Render docs/value.md from the latest value-v1 report."
    )
    cli.add_argument(
        "--quiet",
        action="store_true",
        help="Suppress stdout.",
    )
    return cli.parse_args(argv)
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse the arguments (defaulting to ``sys.argv[1:]``) and run ``render``.

    Returns:
        The exit code produced by ``render``.
    """
    if argv is None:
        argv = sys.argv[1:]
    options = parse_args(argv)
    return render(quiet=options.quiet)
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
|