@event4u/agent-config 2.18.0 → 2.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. package/.agent-src/commands/agent-status.md +29 -0
  2. package/.agent-src/commands/onboard.md +221 -81
  3. package/.agent-src/commands/refine-ticket.md +3 -0
  4. package/.agent-src/packs/README.md +49 -0
  5. package/.agent-src/packs/agency-delivery.yml +63 -0
  6. package/.agent-src/packs/content-engine.yml +53 -0
  7. package/.agent-src/packs/founder-mvp.yml +51 -0
  8. package/.agent-src/personas/README.md +8 -0
  9. package/.agent-src/presets/README.md +26 -0
  10. package/.agent-src/presets/balanced.yml +34 -0
  11. package/.agent-src/presets/fast.yml +31 -0
  12. package/.agent-src/presets/strict.yml +38 -0
  13. package/.agent-src/profiles/README.md +29 -0
  14. package/.agent-src/profiles/agency.yml +27 -0
  15. package/.agent-src/profiles/content_creator.yml +25 -0
  16. package/.agent-src/profiles/developer.yml +26 -0
  17. package/.agent-src/profiles/finance.yml +24 -0
  18. package/.agent-src/profiles/founder.yml +25 -0
  19. package/.agent-src/profiles/ops.yml +25 -0
  20. package/.agent-src/rules/no-cheap-questions.md +25 -17
  21. package/.agent-src/skills/adr-create/SKILL.md +78 -68
  22. package/.agent-src/skills/refine-ticket/SKILL.md +3 -0
  23. package/.agent-src/skills/subagent-orchestration/SKILL.md +33 -0
  24. package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
  25. package/.agent-src/templates/skill-archive-note.md +101 -0
  26. package/.agent-src/user-types/README.md +124 -0
  27. package/.agent-src/user-types/_template/user-type.md +95 -0
  28. package/.agent-src/user-types/galabau-field-crew.md +100 -0
  29. package/.agent-src/user-types/metalworking-shop.md +105 -0
  30. package/.agent-src/user-types/truck-driver.md +113 -0
  31. package/.claude-plugin/marketplace.json +1 -1
  32. package/CHANGELOG.md +91 -30
  33. package/README.md +68 -72
  34. package/config/agent-settings.template.yml +22 -0
  35. package/docs/adrs/caveman/0001-default-off-until-bench.md +93 -0
  36. package/docs/adrs/caveman/README.md +9 -0
  37. package/docs/adrs/cost/0001-hard-stop-hook.md +114 -0
  38. package/docs/adrs/cost/README.md +9 -0
  39. package/docs/adrs/memory/0001-consumer-side-snapshot.md +111 -0
  40. package/docs/adrs/memory/README.md +9 -0
  41. package/docs/adrs/router/0001-three-tier-routing.md +119 -0
  42. package/docs/adrs/router/README.md +9 -0
  43. package/docs/adrs/schema/0001-json-schema-frontmatter.md +102 -0
  44. package/docs/adrs/schema/README.md +9 -0
  45. package/docs/adrs/smoke/0001-per-tier-smoke-scripts.md +99 -0
  46. package/docs/adrs/smoke/README.md +9 -0
  47. package/docs/architecture/current-onboard-baseline.md +126 -0
  48. package/docs/architecture/current-safety-behavior.md +137 -0
  49. package/docs/archive/CHANGELOG-pre-2.16.0.md +48 -0
  50. package/docs/contracts/adr-layout.md +108 -0
  51. package/docs/contracts/adr-mcp-runtime.md +128 -0
  52. package/docs/contracts/adr-user-types-axis.md +127 -0
  53. package/docs/contracts/benchmark-corpus-spec.md +97 -0
  54. package/docs/contracts/benchmark-report-schema.md +111 -0
  55. package/docs/contracts/command-clusters.md +1 -0
  56. package/docs/contracts/command-taxonomy.md +137 -0
  57. package/docs/contracts/compression-default-kill-criterion.md +69 -0
  58. package/docs/contracts/config-presets.md +144 -0
  59. package/docs/contracts/cost-dashboard.md +143 -0
  60. package/docs/contracts/cost-enforcement.md +134 -0
  61. package/docs/contracts/file-ownership-matrix.json +0 -7
  62. package/docs/contracts/mcp-tool-inventory.md +53 -0
  63. package/docs/contracts/measurement-baseline.md +102 -0
  64. package/docs/contracts/namespace.md +125 -0
  65. package/docs/contracts/profile-system.md +142 -0
  66. package/docs/contracts/safety-model.md +129 -0
  67. package/docs/contracts/smoke-contracts.md +144 -0
  68. package/docs/contracts/user-type-schema.md +146 -0
  69. package/docs/contracts/workflow-packs.md +121 -0
  70. package/docs/decisions/ADR-010-profile-pack-preset-boundary.md +132 -0
  71. package/docs/decisions/INDEX.md +1 -0
  72. package/docs/featured-commands.md +27 -0
  73. package/docs/parity/bench-ruflo.json +58 -0
  74. package/docs/parity/bench.json +41 -0
  75. package/docs/parity/ruflo.md +46 -0
  76. package/docs/profiles.md +91 -0
  77. package/docs/recruits/_template.md +81 -0
  78. package/package.json +1 -1
  79. package/scripts/_cli/cmd_explain.py +250 -0
  80. package/scripts/_lib/bench_cost.py +138 -0
  81. package/scripts/_lib/bench_quality.py +118 -0
  82. package/scripts/_lib/bench_report.py +150 -0
  83. package/scripts/agent-config +13 -0
  84. package/scripts/audit_adr_coverage.py +175 -0
  85. package/scripts/audit_mcp_tools.py +146 -0
  86. package/scripts/bench_baseline_ready.py +108 -0
  87. package/scripts/bench_drift_check.py +151 -0
  88. package/scripts/bench_per_tool.py +216 -0
  89. package/scripts/bench_run.py +155 -0
  90. package/scripts/compress.py +48 -2
  91. package/scripts/config/__init__.py +9 -0
  92. package/scripts/config/presets.py +206 -0
  93. package/scripts/config/profiles.py +173 -0
  94. package/scripts/cost/budget.mjs +73 -12
  95. package/scripts/cost/preflight.mjs +89 -0
  96. package/scripts/lint_archived_skills.py +143 -0
  97. package/scripts/lint_bench_corpus.py +161 -0
  98. package/scripts/lint_namespace.py +135 -0
  99. package/scripts/schemas/user-type.schema.json +35 -0
  100. package/scripts/skill_linter.py +139 -4
  101. package/scripts/skill_overlap.py +204 -0
  102. package/scripts/skill_tools/audit_user_type_coverage.py +148 -0
  103. package/scripts/skill_usage_collect.py +191 -0
  104. package/scripts/skill_usage_report.py +162 -0
  105. package/scripts/smoke/kernel.sh +101 -0
  106. package/scripts/smoke/router.sh +129 -0
  107. package/scripts/smoke/schema.sh +71 -0
  108. package/scripts/smoke/skills.sh +101 -0
@@ -0,0 +1,175 @@
1
+ #!/usr/bin/env python3
2
+ """Audit per-area ADR coverage against docs/contracts/ and the canonical
3
+ AREAS inventory. Contract: docs/contracts/adr-layout.md.
4
+
5
+ Modes:
6
+ --report (default) one-shot inventory: which areas exist, ADR count
7
+ per area, contracts missing a bootstrap ADR.
8
+ --check exit 1 on hard failures (number gaps, missing area README,
9
+ broken supersedes); exit 0 with warnings on missing
10
+ bootstrap ADRs and dangling references.
11
+ --regen-area-readme <area>
12
+ rewrite docs/adrs/<area>/README.md from the area's ADR
13
+ frontmatter. Idempotent.
14
+ """
15
+ from __future__ import annotations
16
+ import argparse, re, sys
17
+ from pathlib import Path
18
+
19
+ ROOT = Path(__file__).resolve().parent.parent
20
+ ADR_ROOT = ROOT / "docs" / "adrs"
21
+ CONTRACT_ROOT = ROOT / "docs" / "contracts"
22
+
23
+ # Canonical area inventory. To add an area: add it here, then run
24
+ # `python3 scripts/audit_adr_coverage.py --check` in the same PR.
25
+ AREAS: dict[str, dict[str, str]] = {
26
+ "cost": {"contract": "cost-enforcement.md",
27
+ "scope": "Budget ladder, hard-stop hook, cost reporting and dashboards."},
28
+ "caveman": {"contract": "compression-default-kill-criterion.md",
29
+ "scope": "Caveman-speak compression, decompression, reversibility guards."},
30
+ "schema": {"contract": "agents/docs/frontmatter-contract.md",
31
+ "scope": "Frontmatter schemas, v2 rigor, lint behaviour for skills / rules / commands."},
32
+ "router": {"contract": "rule-router.md",
33
+ "scope": "router.json shape, tier semantics, dispatch precedence."},
34
+ "smoke": {"contract": "smoke-contracts.md",
35
+ "scope": "Per-tier smoke contracts, baseline locks, regression gates."},
36
+ "memory": {"contract": "agent-memory-contract.md",
37
+ "scope": "Memory MCP, propose / promote / poison flow, runtime-trust scoring."},
38
+ }
39
+
40
+ NAMED = re.compile(r"^(\d{4})-([a-z0-9-]+)\.md$")
41
+ FM = re.compile(r"^---\n(.*?)\n---", re.DOTALL)
42
+ FIELD = re.compile(r"^([a-z_]+):\s*(.+?)\s*$", re.MULTILINE)
43
+
44
+
45
def parse_fm(text: str) -> dict[str, str]:
    """Parse the leading ``---`` frontmatter block of *text* into a flat dict.

    The block must start on the very first line of *text* and is closed by
    the next line that begins with ``---``. Only un-indented ``key: value``
    lines whose key consists of lowercase letters and underscores are
    recognised; everything else (lists, nested mappings, comments) is ignored.

    Fields are parsed one line at a time, so a key with an empty value
    (``supersedes:``) can never absorb the following line as its value.
    Such empty keys are omitted from the result, letting callers fall back
    to their own defaults via ``dict.get``.

    Args:
        text: Full contents of a Markdown file.

    Returns:
        Mapping of field name to value, with surrounding whitespace and
        quote characters removed. Empty when there is no frontmatter.
    """
    opener = "---\n"
    if not text.startswith(opener):
        return {}
    # Search after the opener so its own newline is not mistaken for the
    # start of the closing delimiter.
    end = text.find("\n---", len(opener))
    if end == -1:
        return {}

    fields: dict[str, str] = {}
    for line in text[len(opener):end].split("\n"):
        key, sep, raw = line.partition(":")
        if not sep or not re.fullmatch(r"[a-z_]+", key):
            continue
        value = raw.strip().strip(" \"'")
        if value:
            fields[key] = value
    return fields
50
+
51
+
52
def scan_area(area: str) -> tuple[list[dict], list[str]]:
    """Collect the ADR files of one area and validate their numbering.

    Args:
        area: Directory name under ``ADR_ROOT``.

    Returns:
        ``(adrs, errors)``. ``adrs`` holds one dict per well-named ADR file,
        in filename order, containing the parsed frontmatter plus the
        filename-derived keys ``num``, ``slug`` and ``path``. ``errors``
        lists badly named files and the first numbering gap, if any.
        Both are empty when the area directory does not exist.
    """
    area_dir = ADR_ROOT / area
    errs: list[str] = []
    if not area_dir.exists():
        return [], errs

    adrs: list[dict] = []
    for p in sorted(area_dir.glob("*.md")):
        if p.name == "README.md":
            continue
        m = NAMED.match(p.name)
        if not m:
            errs.append(f"{area}/{p.name}: filename does not match NNNN-<slug>.md")
            continue
        fm = parse_fm(p.read_text(encoding="utf-8"))
        # Filename-derived keys are merged last so that a frontmatter field
        # that happens to be called num / slug / path cannot shadow them
        # (which would break the gap check below and the README links).
        adrs.append({**fm, "num": m.group(1), "slug": m.group(2), "path": p.name})

    # Gap check: ADR numbers must run 0001, 0002, ... without holes.
    # Only the first mismatch is reported; later ones are consequences of it.
    for expected, adr in enumerate(adrs, start=1):
        actual = int(adr["num"])
        if actual != expected:
            errs.append(f"{area}/: number gap at position {expected} (got {actual:04d})")
            break
    return adrs, errs
76
+
77
+
78
def _contract_path(meta: dict[str, str]) -> Path:
    """Return the on-disk location of the contract named in ``meta["contract"]``.

    A reference containing ``/`` is taken as a path relative to the
    repository root; a bare filename is looked up under ``CONTRACT_ROOT``.
    """
    reference = meta["contract"]
    if "/" in reference:
        return ROOT / reference
    return CONTRACT_ROOT / reference
83
+
84
+
85
def render_area_readme(area: str, meta: dict[str, str], adrs: list[dict]) -> str:
    """Render the Markdown README for one ADR area.

    Args:
        area: Area name, used in the heading.
        meta: The area's inventory entry; ``scope`` and ``contract`` are read.
        adrs: ADR records as produced by ``scan_area``.

    Returns:
        The complete README text, terminated by a newline. Paths are always
        written with forward slashes so the output is identical on every
        platform.
    """
    lines = [f"# ADRs — `{area}`", "",
             f"> {meta['scope']}", ""]

    contract_path = _contract_path(meta)
    published = contract_path.exists()
    if published:
        repo_rel = contract_path.relative_to(ROOT)
    else:
        reference = meta["contract"]
        repo_rel = Path(reference if "/" in reference else f"docs/contracts/{reference}")
    # Markdown links need "/" separators; str(Path) would emit "\" on Windows.
    rel_text = repo_rel.as_posix()

    if published:
        # The README lives at docs/adrs/<area>/README.md, so the repository
        # root is three directory levels above it.
        lines.append(f"Contract: [`{rel_text}`](../../../{rel_text}).")
    else:
        lines.append(f"Contract: _not yet published_ (`{rel_text}`).")

    lines += ["",
              "| # | Title | Status | Date | Supersedes |",
              "|---|---|---|---|---|"]
    for a in adrs:
        # Prefer the frontmatter "decision" text; fall back to the filename slug.
        title = a.get("decision", a["slug"]).replace("-", " ").title()
        lines.append(f"| [{a['num']}]({a['path']}) | {title} | "
                     f"{a.get('status','—')} | {a.get('date','—')} | "
                     f"{a.get('supersedes','—')} |")
    if not adrs:
        lines.append("| _none yet_ | — | — | — | — |")
    return "\n".join(lines) + "\n"
108
+
109
+
110
def cmd_report(args) -> int:
    """Print a Markdown coverage table covering every entry in ``AREAS``.

    ``args`` is accepted so the handler has the same shape as the other
    ``cmd_*`` functions; it is not read. Always returns 0.
    """
    print("\n".join([
        "## ADR coverage report",
        "",
        "| Area | Contract | ADRs | README | Status |",
        "|---|---|---:|:---:|---|",
    ]))

    areas_without_adr = 0
    for area, meta in AREAS.items():
        adrs, _errors = scan_area(area)
        has_readme = (ADR_ROOT / area / "README.md").exists()
        readme_cell = "✅" if has_readme else "—"

        if _contract_path(meta).exists():
            contract_cell = meta["contract"]
        else:
            contract_cell = f"_{meta['contract']}_ (no contract)"

        if adrs:
            status = "ok"
        else:
            status = "missing bootstrap"
            areas_without_adr += 1

        print(f"| `{area}` | {contract_cell} | {len(adrs)} | {readme_cell} | {status} |")

    print()
    print(f"BASELINE: {len(AREAS)} canonical areas · {areas_without_adr} missing bootstrap ADR(s)")
    return 0
128
+
129
+
130
def cmd_check(args) -> int:
    """Validate every area in ``AREAS`` and summarise the outcome.

    Hard failures are the errors reported by ``scan_area`` plus a missing
    ``README.md`` in an area that already has ADRs. An area with no ADR at
    all only produces a warning. Diagnostics go to stderr, the summary line
    to stdout. ``args`` is not read.

    Returns:
        1 if at least one hard failure was found, otherwise 0.
    """
    hard_failures = 0
    warnings = 0
    for area, meta in AREAS.items():
        adrs, problems = scan_area(area)
        for problem in problems:
            print(f"❌ {problem}", file=sys.stderr)
        hard_failures += len(problems)

        if not adrs:
            print(f"⚠️ {area}/: no bootstrap ADR yet (contract: {meta['contract']})", file=sys.stderr)
            warnings += 1
        elif not (ADR_ROOT / area / "README.md").exists():
            print(f"❌ {area}/: README.md missing", file=sys.stderr)
            hard_failures += 1

    print(f"BASELINE: {hard_failures} hard fail(s) · {warnings} warn(s)")
    if hard_failures:
        return 1
    return 0
144
+
145
+
146
def cmd_regen_area_readme(args) -> int:
    """Rewrite ``README.md`` for the area named in ``args.regen_area_readme``.

    Scan errors are echoed to stderr but do not prevent the README from
    being written. The area directory is created when missing.

    Returns:
        1 when the area is not listed in ``AREAS``, otherwise 0.
    """
    area = args.regen_area_readme
    meta = AREAS.get(area)
    if meta is None:
        print(f"❌ unknown area '{area}' — add to AREAS inventory first", file=sys.stderr)
        return 1

    adrs, problems = scan_area(area)
    for problem in problems:
        print(f"❌ {problem}", file=sys.stderr)

    readme = ADR_ROOT / area / "README.md"
    readme.parent.mkdir(parents=True, exist_ok=True)
    content = render_area_readme(area, meta, adrs)
    readme.write_text(content, encoding="utf-8")
    print(f"wrote {readme.relative_to(ROOT)}")
    return 0
159
+
160
+
161
def main() -> int:
    """Parse the command line and dispatch to the selected mode.

    Modes are mutually exclusive. ``--report`` is the default and is also
    accepted explicitly, matching the module docstring.

    Returns:
        The exit code of the selected ``cmd_*`` handler.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Keep the hand-formatted mode table in the module docstring intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    grp = ap.add_mutually_exclusive_group()
    grp.add_argument("--report", action="store_true",
                     help="Print the coverage inventory (default).")
    grp.add_argument("--check", action="store_true")
    grp.add_argument("--regen-area-readme", metavar="AREA")
    args = ap.parse_args()

    if args.check:
        return cmd_check(args)
    # Compare against None so an empty AREA is reported as an unknown area
    # instead of silently falling through to the report.
    if args.regen_area_readme is not None:
        return cmd_regen_area_readme(args)
    return cmd_report(args)
172
+
173
+
174
+ if __name__ == "__main__":
175
+ sys.exit(main())
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env python3
2
+ """MCP-tool inventory generator. Reads the source-of-truth catalog at
3
+ `scripts/mcp_server/consumer_tool_catalog.json` and the handler
4
+ registry at `scripts/mcp_server/tools.py`, emits
5
+ `docs/contracts/mcp-tool-inventory.md` with every tool cited by
6
+ `<file>:<line>`. README's MCP-tool count line links here; the bare
7
+ claim is banned.
8
+
9
+ Contract: step-11 Phase 5 Step 3
10
+ (agents/roadmaps/step-11-ruflo-parity.md).
11
+
12
+ Modes:
13
+ --check exit non-zero if the generated inventory drifts from
14
+ the on-disk file (CI gate).
15
+ --write regenerate the inventory file in-place (default).
16
+ """
17
+ from __future__ import annotations
18
+ import argparse, json, re, sys
19
+ from pathlib import Path
20
+
21
+ ROOT = Path(__file__).resolve().parent.parent
22
+ CATALOG = ROOT / "scripts/mcp_server/consumer_tool_catalog.json"
23
+ TOOLS_PY = ROOT / "scripts/mcp_server/tools.py"
24
+ OUT = ROOT / "docs/contracts/mcp-tool-inventory.md"
25
+
26
+ # Match `"<name>": BuiltinTool(` in the ALLOWLIST dict.
27
+ HANDLER_RE = re.compile(r'^\s*"([a-z_]+)"\s*:\s*BuiltinTool\(')
28
+ # Match `"name": "<name>",` in the catalog json (for catalog citations).
29
+ CATALOG_NAME_RE = re.compile(r'^\s*"name"\s*:\s*"([a-z_]+)"\s*,?\s*$')
30
+
31
+
32
def _index_handlers() -> dict[str, int]:
    """Map each tool name registered in ``TOOLS_PY`` to its 1-based line number.

    A line counts as a registration when it matches ``HANDLER_RE``. If a
    name is matched more than once, the last occurrence is kept.
    """
    source_lines = TOOLS_PY.read_text(encoding="utf-8").splitlines()
    located: dict[str, int] = {}
    for lineno, text in enumerate(source_lines, 1):
        hit = HANDLER_RE.match(text)
        if hit is None:
            continue
        located[hit.group(1)] = lineno
    return located
39
+
40
+
41
def _index_catalog_lines() -> dict[str, int]:
    """Map each tool name in ``CATALOG`` to the 1-based line of its ``"name"`` entry.

    Lines are matched with ``CATALOG_NAME_RE``. If a name appears more than
    once, the first occurrence is kept.
    """
    catalog_lines = CATALOG.read_text(encoding="utf-8").splitlines()
    located: dict[str, int] = {}
    for lineno, text in enumerate(catalog_lines, 1):
        hit = CATALOG_NAME_RE.match(text)
        if hit is not None:
            # setdefault leaves an existing (earlier) line number untouched.
            located.setdefault(hit.group(1), lineno)
    return located
48
+
49
+
50
+ def _render(catalog: dict, handlers: dict[str, int], cat_lines: dict[str, int]) -> str:
51
+ tools = catalog["tools"]
52
+ total = len(tools)
53
+ by_transport: dict[str, int] = {}
54
+ by_side_effect: dict[str, int] = {}
55
+ for t in tools:
56
+ for tr in t["implemented_on"]:
57
+ by_transport[tr] = by_transport.get(tr, 0) + 1
58
+ by_side_effect[t["side_effect"]] = by_side_effect.get(t["side_effect"], 0) + 1
59
+ stub_count = sum(1 for t in tools if not t["implemented_on"])
60
+ transport_summary = ", ".join(f"{k}={v}" for k, v in sorted(by_transport.items())) or "none"
61
+ side_effect_summary = ", ".join(f"{k}={v}" for k, v in sorted(by_side_effect.items()))
62
+
63
+ lines: list[str] = []
64
+ lines.append("---")
65
+ lines.append("stability: beta")
66
+ lines.append("keep-beta-until: 2026-08-14")
67
+ lines.append("---")
68
+ lines.append("")
69
+ lines.append("# MCP tool inventory")
70
+ lines.append("")
71
+ lines.append("> Generated by [`scripts/audit_mcp_tools.py`](../../scripts/audit_mcp_tools.py)")
72
+ lines.append("> from the source-of-truth catalog")
73
+ lines.append("> [`scripts/mcp_server/consumer_tool_catalog.json`](../../scripts/mcp_server/consumer_tool_catalog.json).")
74
+ lines.append("> Do **not** hand-edit; rerun `python3 scripts/audit_mcp_tools.py --write`.")
75
+ lines.append(">")
76
+ lines.append("> Step-11 Phase 5 Step 3 (`step-11-ruflo-parity.md`).")
77
+ lines.append("")
78
+ lines.append("## Summary")
79
+ lines.append("")
80
+ lines.append(f"- **Total tools:** {total}")
81
+ lines.append(f"- **By transport:** {transport_summary}")
82
+ lines.append(f"- **By side-effect:** {side_effect_summary}")
83
+ lines.append(f"- **Discovery-only stubs (no implementation):** {stub_count}")
84
+ lines.append("")
85
+ lines.append("## Tools")
86
+ lines.append("")
87
+ lines.append("| Tool | Side-effect | Transports | Catalog | Handler |")
88
+ lines.append("|---|---|---|---|---|")
89
+ for t in tools:
90
+ name = t["name"]
91
+ side = t["side_effect"]
92
+ transports = ", ".join(t["implemented_on"]) if t["implemented_on"] else "_(stub)_"
93
+ cat_line = cat_lines.get(name)
94
+ cat_cite = (
95
+ f"[`consumer_tool_catalog.json:{cat_line}`](../../scripts/mcp_server/consumer_tool_catalog.json#L{cat_line})"
96
+ if cat_line else "_missing_"
97
+ )
98
+ h_line = handlers.get(name)
99
+ h_cite = (
100
+ f"[`tools.py:{h_line}`](../../scripts/mcp_server/tools.py#L{h_line})"
101
+ if h_line else "_stub-only_"
102
+ )
103
+ lines.append(f"| `{name}` | `{side}` | {transports} | {cat_cite} | {h_cite} |")
104
+ lines.append("")
105
+ lines.append("## Glossary")
106
+ lines.append("")
107
+ lines.append("- **Side-effect** — `ro` (read-only) · `fs-write` (filesystem write) · `shell` (spawns processes).")
108
+ lines.append("- **Transports** — `stdio` (`scripts/mcp_server/`) · `worker` (`workers/mcp/`). A tool may live on both.")
109
+ lines.append("- **Stub** — catalog-listed for discovery; returns the `not_implemented` envelope from")
110
+ lines.append(" [`mcp-tool-stub-envelope.md`](mcp-tool-stub-envelope.md) until promoted.")
111
+ lines.append("")
112
+ return "\n".join(lines) + "\n"
113
+
114
+
115
def main() -> int:
    """Regenerate the inventory file, or verify it is current with ``--check``.

    Without ``--check`` the rendered inventory is written to ``OUT``
    (``--write`` is the default behaviour). With ``--check`` nothing is
    written and the on-disk file is compared to the freshly rendered text.

    Returns:
        0 on success; 1 when ``--check`` finds the file missing or stale.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    mode = parser.add_mutually_exclusive_group()
    mode.add_argument("--check", action="store_true", help="Drift gate: exit 1 if file is stale.")
    mode.add_argument("--write", action="store_true", help="Regenerate the inventory file.")
    parser.add_argument("--quiet", action="store_true")
    opts = parser.parse_args()

    catalog = json.loads(CATALOG.read_text(encoding="utf-8"))
    rendered = _render(catalog, _index_handlers(), _index_catalog_lines())
    shown_path = OUT.relative_to(ROOT)
    tool_count = len(catalog["tools"])

    if not opts.check:
        OUT.parent.mkdir(parents=True, exist_ok=True)
        OUT.write_text(rendered, encoding="utf-8")
        if not opts.quiet:
            print(f"✅ wrote {shown_path} · {tool_count} tool(s)")
        return 0

    # Drift gate: a missing file is treated as empty and therefore stale.
    current = OUT.read_text(encoding="utf-8") if OUT.exists() else ""
    if current != rendered:
        print(f"❌ {shown_path} drifted from generator.", file=sys.stderr)
        print(" Run: python3 scripts/audit_mcp_tools.py --write", file=sys.stderr)
        return 1
    if not opts.quiet:
        print(f"BASELINE: {shown_path} is in sync · {tool_count} tool(s)")
    return 0
143
+
144
+
145
+ if __name__ == "__main__":
146
+ sys.exit(main())
@@ -0,0 +1,108 @@
1
+ #!/usr/bin/env python3
2
+ """Baseline-closure check — step-4 Phase 3 Step 4.
3
+
4
+ Returns exit 0 iff the 60-day clock has elapsed since
5
+ `bench/baseline-start.txt` AND `bench/reports/` contains at least
6
+ `--min-reports` complete runs for the named corpus (default 30).
7
+
8
+ Read by P2 enforcement roadmaps as their precondition (G1 gate in
9
+ step-99). This is the single arbiter of "are we allowed to flip
10
+ defaults yet" — no other timer is authoritative.
11
+
12
+ Exit codes:
13
+ 0 — baseline ready (clock elapsed AND report count met)
14
+ 1 — argument / file error
15
+ 2 — baseline not ready (clock OR reports insufficient)
16
+
17
+ CLI:
18
+ python3 scripts/bench_baseline_ready.py
19
+ python3 scripts/bench_baseline_ready.py --corpus dev --min-days 60 --min-reports 30
20
+ python3 scripts/bench_baseline_ready.py --json
21
+ """
22
+ from __future__ import annotations
23
+
24
+ import argparse
25
+ import json
26
+ import sys
27
+ from datetime import date, datetime, timezone
28
+ from pathlib import Path
29
+
30
+ REPO_ROOT = Path(__file__).resolve().parent.parent
31
+
32
+
33
+ def _read_baseline_start(path: Path) -> date | None:
34
+ if not path.exists():
35
+ return None
36
+ for line in path.read_text(encoding="utf-8").splitlines():
37
+ stripped = line.strip()
38
+ if not stripped or stripped.startswith("#"):
39
+ continue
40
+ try:
41
+ return datetime.strptime(stripped, "%Y-%m-%d").date()
42
+ except ValueError:
43
+ continue
44
+ return None
45
+
46
+
47
def main(argv: list[str] | None = None) -> int:
    """Decide whether the benchmark baseline period is complete.

    The baseline is ready when at least ``--min-days`` days (UTC) have
    passed since the date in the baseline file and the reports directory
    holds at least ``--min-reports`` files named ``*-<corpus>.json``.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 when ready, 2 when still warming up, 1 when the baseline file is
        missing or contains no date.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    parser.add_argument("--corpus", default="dev")
    parser.add_argument("--reports-dir", default="bench/reports")
    parser.add_argument("--baseline-file", default="bench/baseline-start.txt")
    parser.add_argument("--min-days", type=int, default=60)
    parser.add_argument("--min-reports", type=int, default=30)
    parser.add_argument("--json", action="store_true")
    opts = parser.parse_args(argv)

    baseline_path = REPO_ROOT / opts.baseline_file
    start = _read_baseline_start(baseline_path)
    if start is None:
        reason = f"baseline-start file missing or unreadable: {baseline_path}"
        if opts.json:
            print(json.dumps({"status": "error", "reason": reason}))
        else:
            print(f" ❌ {reason}", file=sys.stderr)
        return 1

    # Clock condition, measured in whole UTC days.
    today = datetime.now(timezone.utc).date()
    elapsed = (today - start).days
    clock_ok = elapsed >= opts.min_days

    # Report-count condition; a missing directory counts as zero reports.
    reports_dir = REPO_ROOT / opts.reports_dir
    if reports_dir.exists():
        found = sum(1 for _ in reports_dir.glob(f"*-{opts.corpus}.json"))
    else:
        found = 0
    enough_reports = found >= opts.min_reports

    ready = clock_ok and enough_reports
    if opts.json:
        summary = {
            "status": "ready" if ready else "warmup",
            "corpus": opts.corpus,
            "baseline_start": start.isoformat(),
            "today": today.isoformat(),
            "days_elapsed": elapsed,
            "min_days": opts.min_days,
            "days_ok": clock_ok,
            "report_count": found,
            "min_reports": opts.min_reports,
            "reports_ok": enough_reports,
        }
        print(json.dumps(summary, indent=2))
    else:
        if ready:
            emoji, verdict = "✅", "READY"
        else:
            emoji, verdict = "⏳", "WARMUP"
        print(
            f" {emoji} bench-baseline · corpus={opts.corpus} · "
            f"{verdict} · days={elapsed}/{opts.min_days} · "
            f"reports={found}/{opts.min_reports}"
        )
    return 0 if ready else 2
105
+
106
+
107
+ if __name__ == "__main__":
108
+ sys.exit(main())
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/env python3
2
+ """Drift detector for the bench corpus — step-4 Phase 3 Step 2.
3
+
4
+ Compares the latest `bench/reports/<stamp>-<corpus>.json` against the
5
+ previous N reports (default 5) for the same corpus. Drift defined as:
6
+
7
+ - selection-accuracy: latest is more than `accuracy_drop_pp` below
8
+ the rolling mean (default 5 pp)
9
+ - cost: latest USD total is more than `cost_increase_pct` above the
10
+ rolling mean (default 20 %); skipped when source != "captured"
11
+ - quality: latest quality_score is more than `quality_drop_pp`
12
+ below the rolling mean (default 10 pp); skipped when source ==
13
+ "not_collected"
14
+
15
+ Exit codes:
16
+ 0 — no drift detected (or no baseline yet — warn-only)
17
+ 1 — argument / read error
18
+ 2 — drift detected (CI surface; not a merge gate per roadmap)
19
+
20
+ CLI:
21
+ python3 scripts/bench_drift_check.py --corpus dev
22
+ python3 scripts/bench_drift_check.py --corpus dev --window 5 --json
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import json
28
+ import sys
29
+ from pathlib import Path
30
+ from typing import Any
31
+
32
+ REPO_ROOT = Path(__file__).resolve().parent.parent
33
+ sys.path.insert(0, str(REPO_ROOT / "scripts"))
34
+
35
+ from _lib import script_output # type: ignore[import-not-found] # noqa: E402
36
+
37
+
38
def _load_reports(reports_dir: Path, corpus: str) -> list[tuple[Path, dict[str, Any]]]:
    """Load every ``*-<corpus>.json`` report in *reports_dir*, in filename order.

    Files that cannot be read or decoded are reported through
    ``script_output.warn`` and left out of the result.

    Returns:
        ``(path, parsed_json)`` pairs sorted by path.
    """
    loaded: list[tuple[Path, dict[str, Any]]] = []
    for report_path in sorted(reports_dir.glob(f"*-{corpus}.json")):
        try:
            parsed = json.loads(report_path.read_text(encoding="utf-8"))
        except (OSError, json.JSONDecodeError) as exc:
            script_output.warn(f" ⚠️ skip unreadable report {report_path.name}: {exc}")
        else:
            loaded.append((report_path, parsed))
    return loaded
46
+
47
+
48
+ def _mean(values: list[float]) -> float:
49
+ return sum(values) / len(values) if values else 0.0
50
+
51
+
52
def _check(latest: dict[str, Any], baseline: list[dict[str, Any]],
           thresholds: dict[str, float]) -> list[dict[str, Any]]:
    """Compare the latest report with the baseline window on three axes.

    Args:
        latest: The newest report.
        baseline: The earlier reports forming the rolling window.
        thresholds: Limits keyed ``accuracy_drop_pp``, ``cost_increase_pct``
            and ``quality_drop_pp``.

    Returns:
        One finding dict per axis that exceeded its limit, in the order
        selection accuracy, cost, quality. Empty when nothing drifted.
    """
    drift: list[dict[str, Any]] = []

    # Axis 1: selection accuracy, compared in percentage points.
    acc_now = float(latest["selection"]["selection_accuracy"])
    acc_ref = _mean([float(rep["selection"]["selection_accuracy"]) for rep in baseline])
    acc_drop_pp = (acc_ref - acc_now) * 100.0
    if acc_drop_pp > thresholds["accuracy_drop_pp"]:
        drift.append({
            "axis": "selection_accuracy",
            "latest": acc_now, "baseline_mean": acc_ref,
            "delta_pp": -acc_drop_pp, "threshold_pp": -thresholds["accuracy_drop_pp"],
        })

    # Axis 2: cost. Needs a captured latest report and at least one captured
    # baseline report; otherwise the axis is skipped.
    latest_captured = latest["cost"].get("source") == "captured"
    captured_baseline = [rep for rep in baseline
                         if rep["cost"].get("source") == "captured"]
    if latest_captured and captured_baseline:
        cost_now = float(latest["cost"]["totals"]["cost_usd"])
        cost_ref = _mean([float(rep["cost"]["totals"]["cost_usd"])
                          for rep in captured_baseline])
        if cost_ref > 0:
            increase_pct = (cost_now - cost_ref) / cost_ref * 100.0
            if increase_pct > thresholds["cost_increase_pct"]:
                drift.append({
                    "axis": "cost_usd",
                    "latest": cost_now, "baseline_mean": cost_ref,
                    "delta_pct": increase_pct, "threshold_pct": thresholds["cost_increase_pct"],
                })

    # Axis 3: quality. Skipped when the latest report collected no quality
    # data or when the baseline mean is zero.
    if latest["quality"].get("source") == "not_collected":
        return drift
    quality_now = float(latest["quality"]["quality_score"])
    quality_ref = _mean([float(rep["quality"]["quality_score"])
                         for rep in baseline
                         if rep["quality"].get("source") != "not_collected"])
    if quality_ref:
        quality_drop_pp = (quality_ref - quality_now) * 100.0
        if quality_drop_pp > thresholds["quality_drop_pp"]:
            drift.append({
                "axis": "quality_score",
                "latest": quality_now, "baseline_mean": quality_ref,
                "delta_pp": -quality_drop_pp, "threshold_pp": -thresholds["quality_drop_pp"],
            })
    return drift
97
+
98
+
99
def main(argv: list[str] | None = None) -> int:
    """Run the drift check for one corpus and print the verdict.

    The newest report is compared with up to ``--window`` reports that
    precede it. With fewer than two reports there is nothing to compare and
    the run is treated as warm-up.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 when no drift was found (or during warm-up), 2 when at least one
        axis drifted, 1 when ``--window`` is smaller than 1.
    """
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--corpus", default="dev")
    ap.add_argument("--reports-dir", default="bench/reports")
    ap.add_argument("--window", type=int, default=5, help="rolling window size (default 5)")
    ap.add_argument("--accuracy-drop-pp", type=float, default=5.0)
    ap.add_argument("--cost-increase-pct", type=float, default=20.0)
    ap.add_argument("--quality-drop-pp", type=float, default=10.0)
    ap.add_argument("--json", action="store_true", help="emit JSON instead of Markdown")
    args = ap.parse_args(argv)

    # A window of 0 would produce an empty baseline whose means are all 0.0,
    # so the check would report "ok" without comparing anything; a negative
    # window would select an arbitrary slice. Reject both up front with the
    # documented argument-error exit code.
    if args.window < 1:
        reason = f"--window must be >= 1 (got {args.window})"
        if args.json:
            print(json.dumps({"status": "error", "reason": reason}))
        else:
            print(f" ❌ {reason}", file=sys.stderr)
        return 1

    reports = _load_reports(REPO_ROOT / args.reports_dir, args.corpus)
    if len(reports) < 2:
        msg = (f" ℹ️ bench-drift · corpus={args.corpus} · "
               f"{len(reports)} report(s) — need ≥ 2 to compare; no drift gate yet.")
        if args.json:
            print(json.dumps({"status": "warmup", "reports": len(reports)}))
        else:
            print(msg)
        return 0

    latest_path, latest = reports[-1]
    # The `window` reports immediately before the latest one (fewer if the
    # history is shorter).
    baseline = [r for _, r in reports[-(args.window + 1):-1]]
    thresholds = {
        "accuracy_drop_pp": args.accuracy_drop_pp,
        "cost_increase_pct": args.cost_increase_pct,
        "quality_drop_pp": args.quality_drop_pp,
    }
    findings = _check(latest, baseline, thresholds)

    payload = {
        "status": "drift" if findings else "ok",
        "corpus": args.corpus,
        "latest_report": latest_path.name,
        "baseline_window": len(baseline),
        "thresholds": thresholds,
        "findings": findings,
    }
    if args.json:
        print(json.dumps(payload, indent=2))
    else:
        emoji = "⚠️" if findings else "✅"
        print(f" {emoji} bench-drift · corpus={args.corpus} · "
              f"latest={latest_path.name} · window={len(baseline)} · "
              f"findings={len(findings)}")
        for f in findings:
            print(f" · {f['axis']}: latest={f['latest']:.4f} "
                  f"baseline_mean={f['baseline_mean']:.4f}")
    return 2 if findings else 0
148
+
149
+
150
+ if __name__ == "__main__":
151
+ sys.exit(main())