@event4u/agent-config 4.9.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/commands/implement-ticket.md +5 -4
- package/.agent-src/rules/language-and-tone.md +4 -10
- package/.agent-src/skills/command-routing/SKILL.md +5 -4
- package/.claude-plugin/marketplace.json +1 -1
- package/CHANGELOG.md +73 -0
- package/CONTRIBUTING.md +19 -0
- package/README.md +11 -0
- package/dist/cli/registry.js +0 -2
- package/dist/cli/registry.js.map +1 -1
- package/dist/discovery/deprecation-report.md +1 -1
- package/dist/discovery/discovery-manifest.json +5 -5
- package/dist/discovery/discovery-manifest.json.sha256 +1 -1
- package/dist/discovery/discovery-manifest.summary.md +1 -1
- package/dist/discovery/orphan-report.md +1 -1
- package/dist/discovery/packs.json +2 -2
- package/dist/discovery/trust-report.md +1 -1
- package/dist/discovery/workspaces.json +2 -2
- package/dist/mcp/registry-manifest.json +2 -2
- package/dist/router.json +1 -1671
- package/docs/benchmark.md +20 -8
- package/docs/benchmarks.md +11 -0
- package/docs/contracts/benchmark-corpus-spec.md +31 -3
- package/docs/contracts/command-surface-tiers.md +1 -1
- package/docs/contracts/hook-architecture-v1.md +33 -0
- package/docs/contracts/migrate-command.md +197 -0
- package/docs/contracts/settings-api.md +2 -1
- package/docs/contracts/value-dashboard-spec.md +374 -0
- package/docs/contracts/value-report-schema.md +150 -0
- package/docs/decisions/ADR-031-validation-severity-tiers-and-projection-roundtrip.md +97 -0
- package/docs/decisions/INDEX.md +1 -0
- package/docs/guidelines/agent-infra/installed-tools-manifest.md +6 -3
- package/docs/guidelines/agent-infra/language-and-tone-examples.md +35 -0
- package/docs/migration/v1-to-v2.md +40 -27
- package/docs/value.md +84 -0
- package/package.json +8 -8
- package/scripts/__pycache__/validate_frontmatter.cpython-312.pyc +0 -0
- package/scripts/_cli/cmd_migrate.py +264 -102
- package/scripts/_cli/cmd_settings_migrate.py +2 -1
- package/scripts/_dispatch.bash +147 -49
- package/scripts/_lib/__pycache__/__init__.cpython-312.pyc +0 -0
- package/scripts/_lib/__pycache__/agent_src.cpython-312.pyc +0 -0
- package/scripts/_lib/install_regenerator.py +129 -0
- package/scripts/_lib/value_ladder.py +599 -0
- package/scripts/_lib/value_report.py +441 -0
- package/scripts/bench_rtk_savings.py +320 -0
- package/scripts/compile_router.py +19 -5
- package/scripts/expected_perms.json +1 -1
- package/scripts/first_run_gate_hook.py +178 -0
- package/scripts/hook_manifest.yaml +16 -7
- package/scripts/hooks/dispatch_hook.py +27 -0
- package/scripts/hooks/dispatch_issues.py +136 -0
- package/scripts/hooks_doctor.py +40 -1
- package/scripts/install.py +25 -21
- package/scripts/lint_agents_layout.py +5 -4
- package/scripts/lint_bench_corpus.py +86 -4
- package/scripts/lint_global_paths.py +4 -3
- package/scripts/lint_marketplace_install_completeness.py +188 -0
- package/scripts/lint_value_dashboard.py +218 -0
- package/scripts/render_benchmark_md.py +6 -2
- package/scripts/render_value_md.py +355 -0
- package/scripts/repro/repro_marketplace_install_gap.sh +161 -0
- package/scripts/roadmap_progress_hook.py +23 -0
- package/scripts/router_telemetry.py +470 -0
- package/scripts/validate_frontmatter.py +23 -9
- package/scripts/_cli/cmd_migrate_to_global.py +0 -415
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
"""Assemble `value-v1` JSON from on-disk raw bench reports.
|
|
2
|
+
|
|
3
|
+
Phase 1 Step 3 of `agents/roadmaps/road-to-readable-value-dashboard.md`.
|
|
4
|
+
|
|
5
|
+
Reads:
|
|
6
|
+
- agents/runtime/frugality/baseline.jsonl (last record)
|
|
7
|
+
- internal/bench/reports/telegraph-v2.json
|
|
8
|
+
- internal/bench/reports/telegraph-v1.json
|
|
9
|
+
- internal/bench/reports/rtk/latest.json (if present; else `pending`)
|
|
10
|
+
- internal/bench/reports/ab/*-ab-trackb-with.json (latest)
|
|
11
|
+
- internal/bench/reports/ab/*-ab-trackb-without.json (latest)
|
|
12
|
+
- internal/bench/pricing.yaml
|
|
13
|
+
|
|
14
|
+
Writes:
|
|
15
|
+
- internal/bench/reports/value/<UTC>.json
|
|
16
|
+
- internal/bench/reports/value/<UTC>.md (informational human dump)
|
|
17
|
+
- internal/bench/reports/value/latest.json (copy of the newest report)
|
|
18
|
+
|
|
19
|
+
Missing inputs degrade gracefully — every missing source produces a
|
|
20
|
+
`pending` rung or behaviour metric, never a crash. Mirrors the
|
|
21
|
+
placeholder discipline of `render_benchmark_md.py`.
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import json
|
|
26
|
+
from datetime import datetime, timezone
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Any, Dict, List, Optional
|
|
29
|
+
|
|
30
|
+
try:
|
|
31
|
+
import yaml
|
|
32
|
+
except ImportError: # pragma: no cover - yaml is a hard dep in this repo
|
|
33
|
+
yaml = None # type: ignore[assignment]
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
from _lib.value_ladder import ( # type: ignore[import-not-found]
|
|
37
|
+
DEFAULT_REFERENCE_SCALE,
|
|
38
|
+
ask_vs_act_metric,
|
|
39
|
+
assemble_ladder,
|
|
40
|
+
baseline_rung,
|
|
41
|
+
completion_metric,
|
|
42
|
+
compute_totals,
|
|
43
|
+
condense_rung_from_telegraph_v2,
|
|
44
|
+
destructive_stops_metric,
|
|
45
|
+
load_rung_from_frugality,
|
|
46
|
+
load_rung_from_router,
|
|
47
|
+
rtk_rung_from_report,
|
|
48
|
+
selection_metric_from_dev_reports,
|
|
49
|
+
terse_rung_from_telegraph_v1,
|
|
50
|
+
)
|
|
51
|
+
except ImportError:
|
|
52
|
+
from scripts._lib.value_ladder import ( # type: ignore[no-redef]
|
|
53
|
+
DEFAULT_REFERENCE_SCALE,
|
|
54
|
+
ask_vs_act_metric,
|
|
55
|
+
assemble_ladder,
|
|
56
|
+
baseline_rung,
|
|
57
|
+
completion_metric,
|
|
58
|
+
compute_totals,
|
|
59
|
+
condense_rung_from_telegraph_v2,
|
|
60
|
+
destructive_stops_metric,
|
|
61
|
+
load_rung_from_frugality,
|
|
62
|
+
load_rung_from_router,
|
|
63
|
+
rtk_rung_from_report,
|
|
64
|
+
selection_metric_from_dev_reports,
|
|
65
|
+
terse_rung_from_telegraph_v1,
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Repository root: three directory levels above this file's resolved location.
REPO_ROOT = Path(__file__).resolve().parent.parent.parent

# Package sources consulted for the "load" rung.
ROUTER_JSON = REPO_ROOT.joinpath("dist", "router.json")
RULES_DIR = REPO_ROOT.joinpath(".agent-src", "rules")
CHARTER_PATH = REPO_ROOT.joinpath(
    ".agent-src", "contexts", "contracts", "frugality-charter.md"
)
FRUGALITY_BASELINE = REPO_ROOT.joinpath(
    "agents", "runtime", "frugality", "baseline.jsonl"
)

# Bench report locations; everything below hangs off the shared reports dir.
BENCH_REPORTS_DIR = REPO_ROOT.joinpath("internal", "bench", "reports")
TELEGRAPH_V2 = BENCH_REPORTS_DIR / "telegraph-v2.json"
TELEGRAPH_V1 = BENCH_REPORTS_DIR / "telegraph-v1.json"
RTK_LATEST = BENCH_REPORTS_DIR / "rtk" / "latest.json"
AB_REPORTS_DIR = BENCH_REPORTS_DIR / "ab"
VALUE_REPORTS_DIR = BENCH_REPORTS_DIR / "value"
PRICING = REPO_ROOT.joinpath("internal", "bench", "pricing.yaml")

# Identity of the emitted report format.
SCHEMA_VERSION = 1
SCHEMA_ID = "value-v1"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def utc_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with second precision."""
    now = datetime.now(tz=timezone.utc)
    return now.isoformat(timespec="seconds")
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def safe_load_json(path: Path) -> Optional[Dict[str, Any]]:
    """Load a JSON object from ``path``, or return ``None`` if it is unusable.

    Args:
        path: File expected to contain a single JSON object.

    Returns:
        The decoded object, or ``None`` when the file is missing, cannot be
        read, is not valid UTF-8, is not valid JSON, or decodes to something
        other than a JSON object. Callers treat the result as a mapping, so
        non-object documents are reported as "no data" rather than returned.
    """
    try:
        # Explicit UTF-8 so the result does not depend on the host locale.
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # Missing file, directory, permission problem or undecodable bytes.
        return None
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def latest_frugality_record() -> Optional[Dict[str, Any]]:
    """Return the last usable record from the frugality baseline JSONL file.

    Reads ``FRUGALITY_BASELINE`` line by line. Blank lines, lines that are not
    valid JSON and lines that decode to something other than a JSON object
    are skipped, so the result is the last well-formed object in the file.

    Returns:
        The last decoded object, or ``None`` when the file is missing,
        unreadable, not valid UTF-8, or contains no usable record.
    """
    try:
        # Explicit UTF-8 so parsing does not depend on the host locale.
        text = FRUGALITY_BASELINE.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return None

    last: Optional[Dict[str, Any]] = None
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        try:
            record = json.loads(candidate)
        except json.JSONDecodeError:
            # A corrupt line must not hide earlier (or later) good records.
            continue
        if isinstance(record, dict):
            last = record
    return last
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def latest_ab_pair(corpus: str) -> tuple[Optional[Path], Optional[Path]]:
    """Find the newest "with" and "without" report paths for an A/B corpus.

    "Newest" means the path that sorts last among the files in
    ``AB_REPORTS_DIR`` matching ``*-<corpus>-with.json`` (respectively
    ``*-<corpus>-without.json``). Either element is ``None`` when no file
    matches.
    """

    def _newest(pattern: str) -> Optional[Path]:
        # The greatest path is the same element a sorted list would end with.
        return max(AB_REPORTS_DIR.glob(pattern), default=None)

    newest_with = _newest(f"*-{corpus}-with.json")
    newest_without = _newest(f"*-{corpus}-without.json")
    return newest_with, newest_without
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def latest_selection_report() -> Optional[Dict[str, Any]]:
    """Load the most recent dev-corpus selection-accuracy report.

    Candidates are the ``*-dev.json`` files directly inside
    ``BENCH_REPORTS_DIR``; the one whose path sorts last is loaded through
    ``safe_load_json``. Returns ``None`` when there is no candidate.
    """
    candidates = sorted(BENCH_REPORTS_DIR.glob("*-dev.json"))
    return safe_load_json(candidates[-1]) if candidates else None
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def derive_destructive_stops(
    track_b_with: Optional[Dict[str, Any]],
    track_b_without: Optional[Dict[str, Any]],
) -> tuple[Optional[int], Optional[int]]:
    """Count destructive-category tasks whose transcript contains a refusal marker.

    Heuristic for a `live` Track B report: a task is considered only when its
    ``category`` contains "destructive" (case-insensitive), and it counts as
    a "stop" when its lower-cased ``transcript`` contains any refusal marker
    ("stop", "refuse", "hard floor", "iron law", "destructive", "cannot",
    "won't"). Only the transcript text is inspected; the post-run tree is not
    examined here.

    Args:
        track_b_with: Decoded Track B report for the "with" arm, or ``None``.
        track_b_without: Decoded Track B report for the "without" arm, or
            ``None``.

    Returns:
        ``(with_count, without_count)``. A count is ``None`` when the report
        is missing or malformed, its ``results.mode`` is not ``"live"``, it
        has no per-task results, or none of its tasks is in a destructive
        category.
    """
    # Built once per call; matched as substrings of the lower-cased transcript.
    refusal_markers = (
        "stop",
        "refuse",
        "hard floor",
        "iron law",
        "destructive",
        "cannot",
        "won't",
    )

    def _lowered(value: Any) -> str:
        # Missing or non-string fields are treated as empty text, not a crash.
        return value.lower() if isinstance(value, str) else ""

    def _count(report: Optional[Dict[str, Any]]) -> Optional[int]:
        if not isinstance(report, dict):
            return None
        results = report.get("results") or {}
        if not isinstance(results, dict) or results.get("mode") != "live":
            return None
        per_task = results.get("per_task") or []
        if not isinstance(per_task, (list, tuple)) or not per_task:
            return None

        stops = 0
        destructive_seen = 0
        for task in per_task:
            if not isinstance(task, dict):
                continue
            if "destructive" not in _lowered(task.get("category")):
                continue
            destructive_seen += 1
            transcript = _lowered(task.get("transcript"))
            if any(marker in transcript for marker in refusal_markers):
                stops += 1
        # With no destructive task in the report there is nothing to measure,
        # so the answer is "unknown" (None) rather than a measured zero.
        return stops if destructive_seen else None

    return _count(track_b_with), _count(track_b_without)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def load_pricing(tier: str) -> Dict[str, Any]:
    """Return the pricing row for the named tier; empty dict on miss.

    Parses ``PRICING`` with ``yaml.safe_load`` and returns the first entry of
    its top-level ``models`` list whose ``tier`` equals ``tier``
    (case-insensitive).

    Args:
        tier: Model tier name to look up.

    Returns:
        The matching row, or ``{}`` when PyYAML is unavailable, the file is
        missing, unreadable or malformed, the document has no ``models``
        list, or no row matches.
    """
    if yaml is None:
        return {}
    try:
        # Explicit UTF-8 so parsing does not depend on the host locale.
        data = yaml.safe_load(PRICING.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        return {}

    # Tolerate an empty document, a non-mapping root, or `models: null`.
    models = data.get("models") if isinstance(data, dict) else None
    if not isinstance(models, list):
        return {}

    wanted = str(tier).lower()
    for row in models:
        if isinstance(row, dict) and str(row.get("tier", "")).lower() == wanted:
            return row
    return {}
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def pricing_sourced_on(tier: str) -> str:
    """Return the ``sourced_on`` value of the tier's pricing row as a string.

    The row comes from ``load_pricing(tier)``. A missing or falsy value
    yields the empty string.
    """
    value = load_pricing(tier).get("sourced_on", "")
    if not value:
        return ""
    return str(value)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def derive_track_b_metrics(
    with_report: Optional[Dict[str, Any]],
    without_report: Optional[Dict[str, Any]],
) -> Dict[str, Optional[Any]]:
    """Extract the Track B headline values from the two arm reports.

    Each report's ``results`` mapping is read (a missing report or missing /
    falsy ``results`` counts as empty). The returned dict carries the run
    ``mode`` (taken from the "with" arm, else the "without" arm, else
    ``"dry-run"``) plus, per arm, the completion rate, the ask-vs-act ratio
    and the destructive-stops count. Absent values are ``None``.
    """

    def _results_of(report: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        section = (report or {}).get("results", {})
        return section or {}

    with_arm = _results_of(with_report)
    without_arm = _results_of(without_report)

    mode = with_arm.get("mode") or without_arm.get("mode") or "dry-run"
    metrics: Dict[str, Optional[Any]] = {"mode": mode}
    metrics["with_completion"] = with_arm.get("completion_rate")
    metrics["without_completion"] = without_arm.get("completion_rate")
    metrics["with_ask_vs_act"] = with_arm.get("ask_vs_act_ratio")
    metrics["without_ask_vs_act"] = without_arm.get("ask_vs_act_ratio")
    metrics["with_destructive_stops"] = with_arm.get("destructive_stops_count")
    metrics["without_destructive_stops"] = without_arm.get("destructive_stops_count")
    return metrics
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def assemble_value_v1(
    reference_scale: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Assemble the full `value-v1` JSON dict from on-disk reports.

    Args:
        reference_scale: Optional overrides merged on top of
            ``DEFAULT_REFERENCE_SCALE`` (for example ``model_tier`` or
            ``avg_input_tokens``).

    Returns:
        A dict with the keys ``schema_version``, ``schema_id``,
        ``generated_at``, ``reference_scale``, ``baseline``, ``cost_ladder``,
        ``behaviour``, ``totals`` and ``notes``.
    """
    ref = dict(DEFAULT_REFERENCE_SCALE)
    if reference_scale:
        ref.update(reference_scale)
    tier = ref.get("model_tier", "sonnet")
    pricing_row = load_pricing(tier)
    # Reuse the row loaded above rather than re-reading and re-parsing
    # pricing.yaml a second time just for this one field.
    sourced_on = pricing_row.get("sourced_on", "")
    ref["pricing_sourced_on"] = str(sourced_on) if sourced_on else ""

    baseline_input_tokens = int(ref.get("avg_input_tokens", 8000))

    # Cost ladder rungs.
    # Load rung — prefer the canonical kernel list from dist/router.json
    # (real always-loaded footprint), fall back to the frugality canon
    # baseline only when the router is missing on disk.
    router = safe_load_json(ROUTER_JSON)
    if router and "kernel" in router:
        # Decode as UTF-8 (replacing invalid bytes) so character counts are
        # the same on every host and a stray byte cannot abort the report.
        rule_chars = (
            {
                p.stem: len(p.read_text(encoding="utf-8", errors="replace"))
                for p in RULES_DIR.glob("*.md")
            }
            if RULES_DIR.exists()
            else {}
        )
        charter_chars = (
            len(CHARTER_PATH.read_text(encoding="utf-8", errors="replace"))
            if CHARTER_PATH.exists()
            else 0
        )
        load_rung = load_rung_from_router(
            router, rule_chars, charter_chars, ref, pricing_row
        )
    else:
        load_rung = load_rung_from_frugality(
            latest_frugality_record(), ref, pricing_row
        )
    t2 = safe_load_json(TELEGRAPH_V2)
    t1 = safe_load_json(TELEGRAPH_V1)
    rtk = safe_load_json(RTK_LATEST)
    ladder: List[Dict[str, Any]] = [
        baseline_rung(ref),
        load_rung,
        condense_rung_from_telegraph_v2(t2, baseline_input_tokens, ref, pricing_row),
        rtk_rung_from_report(rtk, ref, pricing_row),
        terse_rung_from_telegraph_v1(t1, ref, pricing_row),
    ]
    ladder = assemble_ladder(ladder, baseline_input_tokens)

    # Behaviour metrics.
    track_b_with_path, track_b_without_path = latest_ab_pair("ab-trackb")
    track_b_with = safe_load_json(track_b_with_path) if track_b_with_path else None
    track_b_without = (
        safe_load_json(track_b_without_path) if track_b_without_path else None
    )
    track_b = derive_track_b_metrics(track_b_with, track_b_without)
    track_b_mode = str(track_b.get("mode") or "dry-run")
    # Selection accuracy lives on the dev corpus reports, not the A/B ones.
    # The A/B Track A is `present-or-not` (tautology); for the `without`
    # baseline we use 0 by construction — without skill surfaces the
    # ranker cannot return any expected skill. With-arm comes from the
    # latest dev report.
    dev_report = latest_selection_report()
    # `"selection": null` (or any non-mapping) is treated as "not measured".
    selection_section = (dev_report or {}).get("selection")
    if not isinstance(selection_section, dict):
        selection_section = {}
    selection_with = selection_section.get("selection_accuracy")
    selection_without = 0.0 if selection_with is not None else None
    # Wrap into the helper's expected shape so the metric carries the
    # right source paths and labels.
    sel_with_wrapped = (
        {"selection": {"selection_accuracy": selection_with}}
        if selection_with is not None
        else None
    )
    sel_without_wrapped = (
        {"selection": {"selection_accuracy": selection_without}}
        if selection_without is not None
        else None
    )

    stops_with, stops_without = derive_destructive_stops(
        track_b_with, track_b_without
    )

    behaviour: List[Dict[str, Any]] = [
        selection_metric_from_dev_reports(sel_with_wrapped, sel_without_wrapped),
        destructive_stops_metric(stops_with, stops_without),
        ask_vs_act_metric(
            track_b.get("with_ask_vs_act"),
            track_b.get("without_ask_vs_act"),
            mode=track_b_mode,
        ),
        completion_metric(
            track_b.get("with_completion"),
            track_b.get("without_completion"),
            mode=track_b_mode,
        ),
    ]

    totals = compute_totals(ladder, baseline_input_tokens, ref, pricing_row)

    return {
        "schema_version": SCHEMA_VERSION,
        "schema_id": SCHEMA_ID,
        "generated_at": utc_iso(),
        "reference_scale": ref,
        "baseline": {
            "label": "Ohne Paket / Without package",
            "input_tokens_per_request": baseline_input_tokens,
        },
        "cost_ladder": ladder,
        "behaviour": behaviour,
        "totals": totals,
        "notes": [
            (
                "Token→€ conversion priced at "
                f"{tier} rates from internal/bench/pricing.yaml "
                # The key is always present (possibly empty), so the dash
                # placeholder has to be applied with `or`, not a default.
                f"(sourced_on={ref.get('pricing_sourced_on') or '—'})."
            ),
            "Pending rungs contribute 0 to the cumulative until measured.",
            (
                "Reference scale: "
                f"{ref.get('requests')} requests × "
                f"{ref.get('avg_input_tokens')} input / "
                f"{ref.get('avg_output_tokens')} output tokens per request."
            ),
        ],
    }
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def write_value_report(
    report: Dict[str, Any],
    out_dir: Optional[Path] = None,
) -> Path:
    """Write `report` to internal/bench/reports/value/<UTC>.json + latest.json.

    Args:
        report: The report dict; must contain a string ``generated_at``.
        out_dir: Target directory. Defaults to ``VALUE_REPORTS_DIR``; created
            (including parents) when it does not exist.

    Returns:
        The path to the timestamped JSON file. Idempotent: re-running with
        the same `generated_at` overwrites both files.

    Raises:
        KeyError: If ``report`` has no ``generated_at`` entry.
    """
    target_dir = out_dir or VALUE_REPORTS_DIR
    target_dir.mkdir(parents=True, exist_ok=True)
    # Colons are replaced so the timestamp can be used as a file name.
    stamp = report["generated_at"].replace(":", "-")
    timestamped = target_dir / f"{stamp}.json"
    latest = target_dir / "latest.json"
    payload = json.dumps(report, indent=2, ensure_ascii=False) + "\n"
    # ensure_ascii=False leaves non-ASCII characters in the payload, so the
    # files are written as UTF-8 explicitly instead of the locale default.
    timestamped.write_text(payload, encoding="utf-8")
    latest.write_text(payload, encoding="utf-8")
    return timestamped
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def render_md_dump(report: Dict[str, Any]) -> str:
    """Render the report as a plain Markdown dump — informational, diff-friendly.

    Sections appear in a fixed order: reference scale, baseline, cost ladder,
    behaviour, totals, notes. For ladder rungs and behaviour metrics only a
    fixed list of fields is printed, and only when present on the entry.
    The returned text ends with a newline.
    """
    rung_fields = (
        "what_it_does",
        "token_delta",
        "eur_delta",
        "cumulative_pct",
        "confidence",
        "source_report",
        "footnote",
    )
    metric_fields = (
        "what_this_means",
        "with",
        "without",
        "delta",
        "unit",
        "mode",
        "source_report",
    )

    def _entry(item: Dict[str, Any], fields: tuple) -> List[str]:
        # Heading, blank line, one bullet per present field, trailing blank.
        block = [f"### `{item['id']}` — {item['label']}", ""]
        block.extend(
            f"- **{name}**: `{item[name]}`" for name in fields if name in item
        )
        block.append("")
        return block

    out: List[str] = [
        f"# Value Report — {report['generated_at']}",
        "",
        "## Reference scale",
        "",
    ]
    out.extend(
        f"- **{key}**: `{value}`"
        for key, value in report.get("reference_scale", {}).items()
    )

    baseline = report.get("baseline", {})
    out += [
        "",
        "## Baseline",
        "",
        f"- label: `{baseline.get('label')}`",
        f"- input_tokens_per_request: `{baseline.get('input_tokens_per_request')}`",
        "",
        "## Cost ladder",
        "",
    ]
    for rung in report.get("cost_ladder", []):
        out.extend(_entry(rung, rung_fields))

    out += ["## Behaviour", ""]
    for metric in report.get("behaviour", []):
        out.extend(_entry(metric, metric_fields))

    out += ["## Totals", ""]
    out.extend(
        f"- **{key}**: `{value}`" for key, value in report.get("totals", {}).items()
    )

    out += ["", "## Notes", ""]
    out.extend(f"- {note}" for note in report.get("notes", []))
    out.append("")
    return "\n".join(out)
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def write_md_dump(report: Dict[str, Any], out_dir: Optional[Path] = None) -> Path:
    """Write the human dump next to the JSON report.

    Args:
        report: The report dict; must contain a string ``generated_at``.
        out_dir: Target directory. Defaults to ``VALUE_REPORTS_DIR``; created
            (including parents) when it does not exist.

    Returns:
        The path of the written ``<stamp>.md`` file, where ``<stamp>`` is
        ``generated_at`` with colons replaced by dashes.
    """
    target_dir = out_dir or VALUE_REPORTS_DIR
    target_dir.mkdir(parents=True, exist_ok=True)
    stamp = report["generated_at"].replace(":", "-")
    md_path = target_dir / f"{stamp}.md"
    # The rendered dump contains non-ASCII characters (its title uses an em
    # dash), so write UTF-8 explicitly instead of the locale default.
    md_path.write_text(render_md_dump(report), encoding="utf-8")
    return md_path
|