npm - @event4u/agent-config - Versions diffs - 2.18.0 → 2.20.0 - Mend

@event4u/agent-config 2.18.0 → 2.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108) hide show

package/.agent-src/commands/agent-status.md +29 -0
package/.agent-src/commands/onboard.md +221 -81
package/.agent-src/commands/refine-ticket.md +3 -0
package/.agent-src/packs/README.md +49 -0
package/.agent-src/packs/agency-delivery.yml +63 -0
package/.agent-src/packs/content-engine.yml +53 -0
package/.agent-src/packs/founder-mvp.yml +51 -0
package/.agent-src/personas/README.md +8 -0
package/.agent-src/presets/README.md +26 -0
package/.agent-src/presets/balanced.yml +34 -0
package/.agent-src/presets/fast.yml +31 -0
package/.agent-src/presets/strict.yml +38 -0
package/.agent-src/profiles/README.md +29 -0
package/.agent-src/profiles/agency.yml +27 -0
package/.agent-src/profiles/content_creator.yml +25 -0
package/.agent-src/profiles/developer.yml +26 -0
package/.agent-src/profiles/finance.yml +24 -0
package/.agent-src/profiles/founder.yml +25 -0
package/.agent-src/profiles/ops.yml +25 -0
package/.agent-src/rules/no-cheap-questions.md +25 -17
package/.agent-src/skills/adr-create/SKILL.md +78 -68
package/.agent-src/skills/refine-ticket/SKILL.md +3 -0
package/.agent-src/skills/subagent-orchestration/SKILL.md +33 -0
package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
package/.agent-src/templates/skill-archive-note.md +101 -0
package/.agent-src/user-types/README.md +124 -0
package/.agent-src/user-types/_template/user-type.md +95 -0
package/.agent-src/user-types/galabau-field-crew.md +100 -0
package/.agent-src/user-types/metalworking-shop.md +105 -0
package/.agent-src/user-types/truck-driver.md +113 -0
package/.claude-plugin/marketplace.json +1 -1
package/CHANGELOG.md +91 -30
package/README.md +68 -72
package/config/agent-settings.template.yml +22 -0
package/docs/adrs/caveman/0001-default-off-until-bench.md +93 -0
package/docs/adrs/caveman/README.md +9 -0
package/docs/adrs/cost/0001-hard-stop-hook.md +114 -0
package/docs/adrs/cost/README.md +9 -0
package/docs/adrs/memory/0001-consumer-side-snapshot.md +111 -0
package/docs/adrs/memory/README.md +9 -0
package/docs/adrs/router/0001-three-tier-routing.md +119 -0
package/docs/adrs/router/README.md +9 -0
package/docs/adrs/schema/0001-json-schema-frontmatter.md +102 -0
package/docs/adrs/schema/README.md +9 -0
package/docs/adrs/smoke/0001-per-tier-smoke-scripts.md +99 -0
package/docs/adrs/smoke/README.md +9 -0
package/docs/architecture/current-onboard-baseline.md +126 -0
package/docs/architecture/current-safety-behavior.md +137 -0
package/docs/archive/CHANGELOG-pre-2.16.0.md +48 -0
package/docs/contracts/adr-layout.md +108 -0
package/docs/contracts/adr-mcp-runtime.md +128 -0
package/docs/contracts/adr-user-types-axis.md +127 -0
package/docs/contracts/benchmark-corpus-spec.md +97 -0
package/docs/contracts/benchmark-report-schema.md +111 -0
package/docs/contracts/command-clusters.md +1 -0
package/docs/contracts/command-taxonomy.md +137 -0
package/docs/contracts/compression-default-kill-criterion.md +69 -0
package/docs/contracts/config-presets.md +144 -0
package/docs/contracts/cost-dashboard.md +143 -0
package/docs/contracts/cost-enforcement.md +134 -0
package/docs/contracts/file-ownership-matrix.json +0 -7
package/docs/contracts/mcp-tool-inventory.md +53 -0
package/docs/contracts/measurement-baseline.md +102 -0
package/docs/contracts/namespace.md +125 -0
package/docs/contracts/profile-system.md +142 -0
package/docs/contracts/safety-model.md +129 -0
package/docs/contracts/smoke-contracts.md +144 -0
package/docs/contracts/user-type-schema.md +146 -0
package/docs/contracts/workflow-packs.md +121 -0
package/docs/decisions/ADR-010-profile-pack-preset-boundary.md +132 -0
package/docs/decisions/INDEX.md +1 -0
package/docs/featured-commands.md +27 -0
package/docs/parity/bench-ruflo.json +58 -0
package/docs/parity/bench.json +41 -0
package/docs/parity/ruflo.md +46 -0
package/docs/profiles.md +91 -0
package/docs/recruits/_template.md +81 -0
package/package.json +1 -1
package/scripts/_cli/cmd_explain.py +250 -0
package/scripts/_lib/bench_cost.py +138 -0
package/scripts/_lib/bench_quality.py +118 -0
package/scripts/_lib/bench_report.py +150 -0
package/scripts/agent-config +13 -0
package/scripts/audit_adr_coverage.py +175 -0
package/scripts/audit_mcp_tools.py +146 -0
package/scripts/bench_baseline_ready.py +108 -0
package/scripts/bench_drift_check.py +151 -0
package/scripts/bench_per_tool.py +216 -0
package/scripts/bench_run.py +155 -0
package/scripts/compress.py +48 -2
package/scripts/config/__init__.py +9 -0
package/scripts/config/presets.py +206 -0
package/scripts/config/profiles.py +173 -0
package/scripts/cost/budget.mjs +73 -12
package/scripts/cost/preflight.mjs +89 -0
package/scripts/lint_archived_skills.py +143 -0
package/scripts/lint_bench_corpus.py +161 -0
package/scripts/lint_namespace.py +135 -0
package/scripts/schemas/user-type.schema.json +35 -0
package/scripts/skill_linter.py +139 -4
package/scripts/skill_overlap.py +204 -0
package/scripts/skill_tools/audit_user_type_coverage.py +148 -0
package/scripts/skill_usage_collect.py +191 -0
package/scripts/skill_usage_report.py +162 -0
package/scripts/smoke/kernel.sh +101 -0
package/scripts/smoke/router.sh +129 -0
package/scripts/smoke/schema.sh +71 -0
package/scripts/smoke/skills.sh +101 -0

package/scripts/audit_adr_coverage.py ADDED Viewed

@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""Audit per-area ADR coverage against docs/contracts/ and the canonical
+AREAS inventory. Contract: docs/contracts/adr-layout.md.
+Modes:
+  --report   (default) one-shot inventory: which areas exist, ADR count
+             per area, contracts missing a bootstrap ADR.
+  --check    exit 1 on hard failures (number gaps, missing area README,
+             broken supersedes); exit 0 with warnings on missing
+             bootstrap ADRs and dangling references.
+  --regen-area-readme <area>
+             rewrite docs/adrs/<area>/README.md from the area's ADR
+             frontmatter. Idempotent.
+"""
+from __future__ import annotations
+import argparse, re, sys
+from pathlib import Path
+# Repository root: the parent of the directory that contains this file.
+ROOT = Path(__file__).resolve().parents[1]
+ADR_ROOT = ROOT.joinpath("docs", "adrs")
+CONTRACT_ROOT = ROOT.joinpath("docs", "contracts")
+# Canonical area inventory. To add an area: add it here, then run
+# `python3 scripts/audit_adr_coverage.py --check` in the same PR.
+# Each entry maps an area name to its contract reference (a bare filename
+# or a path containing "/") and a one-line scope description.
+AREAS: dict[str, dict[str, str]] = {
+    "cost": {
+        "contract": "cost-enforcement.md",
+        "scope": "Budget ladder, hard-stop hook, cost reporting and dashboards.",
+    },
+    "caveman": {
+        "contract": "compression-default-kill-criterion.md",
+        "scope": "Caveman-speak compression, decompression, reversibility guards.",
+    },
+    "schema": {
+        "contract": "agents/docs/frontmatter-contract.md",
+        "scope": "Frontmatter schemas, v2 rigor, lint behaviour for skills / rules / commands.",
+    },
+    "router": {
+        "contract": "rule-router.md",
+        "scope": "router.json shape, tier semantics, dispatch precedence.",
+    },
+    "smoke": {
+        "contract": "smoke-contracts.md",
+        "scope": "Per-tier smoke contracts, baseline locks, regression gates.",
+    },
+    "memory": {
+        "contract": "agent-memory-contract.md",
+        "scope": "Memory MCP, propose / promote / poison flow, runtime-trust scoring.",
+    },
+}
+# ADR filename: four-digit number, hyphen, lowercase slug, ".md".
+NAMED = re.compile(r"^(\d{4})-([a-z0-9-]+)\.md$")
+# Frontmatter block: a "---" fence at the very start of the text up to the
+# next "---" (no MULTILINE, so "^" only matches at position 0).
+FM = re.compile(r"^---\n(.*?)\n---", re.DOTALL)
+# One flat "key: value" line. Only non-newline whitespace ([^\S\n]) may
+# surround the value: plain `\s` also matches "\n", so an empty `key:` line
+# would swallow the following line as its value and hide that line's key.
+# Lines with an empty value are simply not reported.
+FIELD = re.compile(r"^([a-z_]+):[^\S\n]*(\S.*?)[^\S\n]*$", re.MULTILINE)
+def parse_fm(text: str) -> dict[str, str]:
+    """Extract flat ``key: value`` frontmatter fields from *text*.
+    Returns an empty dict when the FM pattern finds no frontmatter block.
+    Spaces and quote characters are trimmed from both ends of each value;
+    when a key repeats, the last occurrence wins.
+    """
+    block = FM.search(text)
+    if block is None:
+        return {}
+    fields: dict[str, str] = {}
+    for key, raw_value in FIELD.findall(block.group(1)):
+        fields[key] = raw_value.strip(" \"'")
+    return fields
+def scan_area(area: str) -> tuple[list[dict], list[str]]:
+    """Collect the ADR files of one area and validate their numbering.
+    Returns ``(adrs, errors)``. Each ADR record holds ``num``, ``slug`` and
+    ``path`` (the filename) merged with the file's parsed frontmatter, in
+    sorted-filename order. A missing area directory yields two empty lists.
+    """
+    errs: list[str] = []
+    adrs: list[dict] = []
+    area_dir = ADR_ROOT / area
+    if not area_dir.exists():
+        return adrs, errs
+    for md_path in sorted(area_dir.glob("*.md")):
+        filename = md_path.name
+        if filename == "README.md":
+            continue
+        named = NAMED.match(filename)
+        if named is None:
+            errs.append(f"{area}/{filename}: filename does not match NNNN-<slug>.md")
+            continue
+        number, slug = named.groups()
+        record = {"num": number, "slug": slug, "path": filename}
+        # Frontmatter keys are merged last, so they take precedence on a clash.
+        record.update(parse_fm(md_path.read_text(encoding="utf-8")))
+        adrs.append(record)
+    # Numbers must run 1, 2, 3, ... with no holes; report the first deviation only.
+    expected = 1
+    for record in adrs:
+        actual = int(record["num"])
+        if actual != expected:
+            errs.append(f"{area}/: number gap at position {expected} (got {actual:04d})")
+            break
+        expected += 1
+    return adrs, errs
+def _contract_path(meta: dict[str, str]) -> Path:
+    """Turn an area's ``contract`` reference into a filesystem path.
+    A reference containing "/" is taken as repo-relative; a bare filename is
+    looked up under docs/contracts/.
+    """
+    reference = meta["contract"]
+    base_dir = ROOT if "/" in reference else CONTRACT_ROOT
+    return base_dir / reference
+def render_area_readme(area: str, meta: dict[str, str], adrs: list[dict]) -> str:
+    """Render the Markdown text of ``docs/adrs/<area>/README.md``.
+    Args:
+        area: Area name, used in the heading.
+        meta: The area's inventory entry; ``scope`` and ``contract`` are read.
+        adrs: ADR records (``num``, ``slug``, ``path`` plus any parsed
+            frontmatter fields such as ``decision``, ``status``, ``date``,
+            ``supersedes``).
+    Returns:
+        The README text, ending with a single newline.
+    """
+    lines = [f"# ADRs — `{area}`", "",
+             f"> {meta['scope']}", ""]
+    contract = meta["contract"]
+    # Repo-relative contract path: a reference containing "/" is already
+    # repo-relative, a bare filename lives under docs/contracts/.
+    repo_rel = Path(contract) if "/" in contract else Path("docs") / "contracts" / contract
+    # Markdown link targets need forward slashes on every platform.
+    rel_text = repo_rel.as_posix()
+    if _contract_path(meta).exists():
+        # The README sits in docs/adrs/<area>/, three directories below the
+        # repo root, so the link climbs three levels.
+        lines.append(f"Contract: [`{rel_text}`](../../../{rel_text}).")
+    else:
+        lines.append(f"Contract: _not yet published_ (`{rel_text}`).")
+    lines += ["",
+              "| # | Title | Status | Date | Supersedes |",
+              "|---|---|---|---|---|"]
+    for a in adrs:
+        # Title comes from the `decision` field, falling back to the slug.
+        title = a.get("decision", a["slug"]).replace("-", " ").title()
+        lines.append(f"| [{a['num']}]({a['path']}) | {title} | "
+                     f"{a.get('status','—')} | {a.get('date','—')} | "
+                     f"{a.get('supersedes','—')} |")
+    if not adrs:
+        lines.append("| _none yet_ | — | — | — | — |")
+    return "\n".join(lines) + "\n"
+def cmd_report(args) -> int:
+    """Print a Markdown coverage table with one row per area in AREAS.
+    *args* is not read. Always returns 0.
+    """
+    for heading_line in ("## ADR coverage report",
+                         "",
+                         "| Area | Contract | ADRs | README | Status |",
+                         "|---|---|---:|:---:|---|"):
+        print(heading_line)
+    areas_without_adr = 0
+    for area, meta in AREAS.items():
+        adrs, _errors = scan_area(area)
+        has_readme = (ADR_ROOT / area / "README.md").exists()
+        contract = meta["contract"]
+        if _contract_path(meta).exists():
+            contract_cell = contract
+        else:
+            contract_cell = f"_{contract}_ (no contract)"
+        if adrs:
+            status = "ok"
+        else:
+            status = "missing bootstrap"
+            areas_without_adr += 1
+        readme_cell = "✅" if has_readme else "—"
+        print(f"| `{area}` | {contract_cell} | {len(adrs)} | {readme_cell} | {status} |")
+    print()
+    print(f"BASELINE: {len(AREAS)} canonical areas · {areas_without_adr} missing bootstrap ADR(s)")
+    return 0
+def cmd_check(args) -> int:
+    """Gate run: report hard failures and warnings per area on stderr.
+    Hard failures are the errors returned by scan_area plus a missing
+    README.md in an area that has ADRs. An area without any ADR only
+    warns. Returns 1 when at least one hard failure was found, else 0.
+    *args* is not read.
+    """
+    hard = 0
+    warn = 0
+    for area, meta in AREAS.items():
+        adrs, errs = scan_area(area)
+        for problem in errs:
+            print(f"❌ {problem}", file=sys.stderr)
+        hard += len(errs)
+        if not adrs:
+            print(f"⚠️  {area}/: no bootstrap ADR yet (contract: {meta['contract']})", file=sys.stderr)
+            warn += 1
+            continue
+        # The README requirement only applies once the area holds an ADR.
+        if not (ADR_ROOT / area / "README.md").exists():
+            print(f"❌ {area}/: README.md missing", file=sys.stderr)
+            hard += 1
+    print(f"BASELINE: {hard} hard fail(s) · {warn} warn(s)")
+    return int(hard > 0)
+def cmd_regen_area_readme(args) -> int:
+    """Rewrite docs/adrs/<area>/README.md for ``args.regen_area_readme``.
+    Returns 1 for an area that is not in AREAS. Scan errors are printed to
+    stderr but do not stop the README from being written; returns 0 then.
+    """
+    area = args.regen_area_readme
+    meta = AREAS.get(area)
+    if meta is None:
+        print(f"❌ unknown area '{area}' — add to AREAS inventory first", file=sys.stderr)
+        return 1
+    adrs, errs = scan_area(area)
+    for problem in errs:
+        print(f"❌ {problem}", file=sys.stderr)
+    readme_path = ADR_ROOT / area / "README.md"
+    # Create the area directory if it does not exist yet.
+    readme_path.parent.mkdir(parents=True, exist_ok=True)
+    content = render_area_readme(area, meta, adrs)
+    readme_path.write_text(content, encoding="utf-8")
+    print(f"wrote {readme_path.relative_to(ROOT)}")
+    return 0
+def main() -> int:
+    """Parse the command line and dispatch to one of the three modes.
+    Returns the exit status of the selected mode. With no mode flag the
+    report mode runs.
+    """
+    ap = argparse.ArgumentParser(
+        description=__doc__,
+        # Keep the module docstring's layout in --help instead of reflowing it.
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    grp = ap.add_mutually_exclusive_group()
+    # The module docstring advertises --report as the default mode, so it
+    # must be accepted explicitly rather than rejected as an unknown flag.
+    grp.add_argument("--report", action="store_true",
+                     help="one-shot inventory (default)")
+    grp.add_argument("--check", action="store_true")
+    grp.add_argument("--regen-area-readme", metavar="AREA")
+    args = ap.parse_args()
+    if args.check:
+        return cmd_check(args)
+    # `is not None` so an explicitly passed empty AREA is reported as an
+    # unknown area instead of silently falling through to the report.
+    if args.regen_area_readme is not None:
+        return cmd_regen_area_readme(args)
+    return cmd_report(args)

package/scripts/audit_mcp_tools.py ADDED Viewed

@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+"""MCP-tool inventory generator. Reads the source-of-truth catalog at
+`scripts/mcp_server/consumer_tool_catalog.json` and the handler
+registry at `scripts/mcp_server/tools.py`, emits
+`docs/contracts/mcp-tool-inventory.md` with every tool cited by
+`<file>:<line>`. README's MCP-tool count line links here; the bare
+claim is banned.
+Contract: step-11 Phase 5 Step 3
+(agents/roadmaps/step-11-ruflo-parity.md).
+Modes:
+  --check    exit non-zero if the generated inventory drifts from
+             the on-disk file (CI gate).
+  --write    regenerate the inventory file in-place (default).
+"""
+from __future__ import annotations
+import argparse, json, re, sys
+from pathlib import Path
+# Repository root: the parent of the directory that contains this file.
+ROOT = Path(__file__).resolve().parents[1]
+CATALOG = ROOT / "scripts" / "mcp_server" / "consumer_tool_catalog.json"
+TOOLS_PY = ROOT / "scripts" / "mcp_server" / "tools.py"
+OUT = ROOT / "docs" / "contracts" / "mcp-tool-inventory.md"
+# Match `"<name>": BuiltinTool(` in the ALLOWLIST dict.
+HANDLER_RE = re.compile(r'^\s*"([a-z_]+)"\s*:\s*BuiltinTool\(')
+# Match `"name": "<name>",` in the catalog json (for catalog citations).
+CATALOG_NAME_RE = re.compile(r'^\s*"name"\s*:\s*"([a-z_]+)"\s*,?\s*$')
+def _index_handlers() -> dict[str, int]:
+    """Map each handler name found in TOOLS_PY to its 1-based line number.
+    A line counts when HANDLER_RE matches it; if a name appears on several
+    lines, the last one wins.
+    """
+    source_lines = TOOLS_PY.read_text(encoding="utf-8").splitlines()
+    matches = ((HANDLER_RE.match(text), lineno)
+               for lineno, text in enumerate(source_lines, start=1))
+    return {hit.group(1): lineno for hit, lineno in matches if hit}
+def _index_catalog_lines() -> dict[str, int]:
+    """Map each tool name found in CATALOG to its 1-based line number.
+    A line counts when CATALOG_NAME_RE matches it; if a name appears on
+    several lines, the first one is kept.
+    """
+    first_seen: dict[str, int] = {}
+    catalog_lines = CATALOG.read_text(encoding="utf-8").splitlines()
+    for lineno, text in enumerate(catalog_lines, start=1):
+        hit = CATALOG_NAME_RE.match(text)
+        if hit:
+            # setdefault leaves an earlier line number untouched.
+            first_seen.setdefault(hit.group(1), lineno)
+    return first_seen
+def _render(catalog: dict, handlers: dict[str, int], cat_lines: dict[str, int]) -> str:
+    """Build the full Markdown text of the MCP tool inventory.
+    Args:
+        catalog: Parsed catalog; ``catalog["tools"]`` is a list of dicts with
+            ``name``, ``side_effect`` and ``implemented_on`` keys.
+        handlers: Tool name -> line number used for the handler citation.
+        cat_lines: Tool name -> line number used for the catalog citation.
+    Returns:
+        The document text: frontmatter, summary, one table row per tool and
+        a glossary.
+    """
+    tools = catalog["tools"]
+    transport_counts: dict[str, int] = {}
+    side_effect_counts: dict[str, int] = {}
+    stub_total = 0
+    for tool in tools:
+        implemented = tool["implemented_on"]
+        for transport in implemented:
+            transport_counts[transport] = transport_counts.get(transport, 0) + 1
+        effect = tool["side_effect"]
+        side_effect_counts[effect] = side_effect_counts.get(effect, 0) + 1
+        # A tool with no transport at all is a discovery-only stub.
+        if not implemented:
+            stub_total += 1
+    def _summarise(counts: dict[str, int]) -> str:
+        # "key=count" pairs in key order, comma separated.
+        return ", ".join(f"{key}={count}" for key, count in sorted(counts.items()))
+    transport_summary = _summarise(transport_counts) or "none"
+    side_effect_summary = _summarise(side_effect_counts)
+    out: list[str] = [
+        "---",
+        "stability: beta",
+        "keep-beta-until: 2026-08-14",
+        "---",
+        "",
+        "# MCP tool inventory",
+        "",
+        "> Generated by [`scripts/audit_mcp_tools.py`](../../scripts/audit_mcp_tools.py)",
+        "> from the source-of-truth catalog",
+        "> [`scripts/mcp_server/consumer_tool_catalog.json`](../../scripts/mcp_server/consumer_tool_catalog.json).",
+        "> Do **not** hand-edit; rerun `python3 scripts/audit_mcp_tools.py --write`.",
+        ">",
+        "> Step-11 Phase 5 Step 3 (`step-11-ruflo-parity.md`).",
+        "",
+        "## Summary",
+        "",
+        f"- **Total tools:** {len(tools)}",
+        f"- **By transport:** {transport_summary}",
+        f"- **By side-effect:** {side_effect_summary}",
+        f"- **Discovery-only stubs (no implementation):** {stub_total}",
+        "",
+        "## Tools",
+        "",
+        "| Tool | Side-effect | Transports | Catalog | Handler |",
+        "|---|---|---|---|---|",
+    ]
+    for tool in tools:
+        name = tool["name"]
+        side = tool["side_effect"]
+        implemented = tool["implemented_on"]
+        transports = ", ".join(implemented) if implemented else "_(stub)_"
+        cat_line = cat_lines.get(name)
+        if cat_line:
+            cat_cite = f"[`consumer_tool_catalog.json:{cat_line}`](../../scripts/mcp_server/consumer_tool_catalog.json#L{cat_line})"
+        else:
+            cat_cite = "_missing_"
+        h_line = handlers.get(name)
+        if h_line:
+            h_cite = f"[`tools.py:{h_line}`](../../scripts/mcp_server/tools.py#L{h_line})"
+        else:
+            h_cite = "_stub-only_"
+        out.append(f"| `{name}` | `{side}` | {transports} | {cat_cite} | {h_cite} |")
+    out.extend([
+        "",
+        "## Glossary",
+        "",
+        "- **Side-effect** — `ro` (read-only) · `fs-write` (filesystem write) · `shell` (spawns processes).",
+        "- **Transports** — `stdio` (`scripts/mcp_server/`) · `worker` (`workers/mcp/`). A tool may live on both.",
+        "- **Stub** — catalog-listed for discovery; returns the `not_implemented` envelope from",
+        "  [`mcp-tool-stub-envelope.md`](mcp-tool-stub-envelope.md) until promoted.",
+        "",
+    ])
+    return "\n".join(out) + "\n"
+def main() -> int:
+    """Regenerate the inventory file, or with --check verify it is current.
+    Returns 1 only when --check finds the on-disk file differs from the
+    freshly rendered text (a missing file counts as different); otherwise 0.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    mode = parser.add_mutually_exclusive_group()
+    mode.add_argument("--check", action="store_true", help="Drift gate: exit 1 if file is stale.")
+    mode.add_argument("--write", action="store_true", help="Regenerate the inventory file.")
+    parser.add_argument("--quiet", action="store_true")
+    opts = parser.parse_args()
+    catalog = json.loads(CATALOG.read_text(encoding="utf-8"))
+    rendered = _render(catalog, _index_handlers(), _index_catalog_lines())
+    rel_out = OUT.relative_to(ROOT)
+    tool_total = len(catalog["tools"])
+    if not opts.check:
+        # Write mode is the default when --check is absent.
+        OUT.parent.mkdir(parents=True, exist_ok=True)
+        OUT.write_text(rendered, encoding="utf-8")
+        if not opts.quiet:
+            print(f"✅ wrote {rel_out} · {tool_total} tool(s)")
+        return 0
+    current = OUT.read_text(encoding="utf-8") if OUT.exists() else ""
+    if current == rendered:
+        if not opts.quiet:
+            print(f"BASELINE: {rel_out} is in sync · {tool_total} tool(s)")
+        return 0
+    print(f"❌ {rel_out} drifted from generator.", file=sys.stderr)
+    print("   Run: python3 scripts/audit_mcp_tools.py --write", file=sys.stderr)
+    return 1
+if __name__ == "__main__":
+    sys.exit(main())

package/scripts/bench_baseline_ready.py ADDED Viewed

@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""Baseline-closure check — step-4 Phase 3 Step 4.
+Returns exit 0 iff the 60-day clock has elapsed since
+`bench/baseline-start.txt` AND `bench/reports/` contains at least
+`--min-reports` complete runs for the named corpus (default 30).
+Read by P2 enforcement roadmaps as their precondition (G1 gate in
+step-99). This is the single arbiter of "are we allowed to flip
+defaults yet" — no other timer is authoritative.
+Exit codes:
+    0 — baseline ready (clock elapsed AND report count met)
+    1 — argument / file error
+    2 — baseline not ready (clock OR reports insufficient)
+CLI:
+    python3 scripts/bench_baseline_ready.py
+    python3 scripts/bench_baseline_ready.py --corpus dev --min-days 60 --min-reports 30
+    python3 scripts/bench_baseline_ready.py --json
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from datetime import date, datetime, timezone
+from pathlib import Path
+REPO_ROOT = Path(__file__).resolve().parent.parent
+def _read_baseline_start(path: Path) -> date | None:
+    """Return the first ``YYYY-MM-DD`` date found in the file at *path*.
+    Blank lines, lines starting with "#" and lines that do not parse as a
+    date are skipped. Returns None when the file does not exist or holds
+    no parseable date.
+    """
+    if not path.exists():
+        return None
+    candidates = (raw.strip() for raw in path.read_text(encoding="utf-8").splitlines())
+    for candidate in candidates:
+        if candidate == "" or candidate.startswith("#"):
+            continue
+        try:
+            parsed = datetime.strptime(candidate, "%Y-%m-%d")
+        except ValueError:
+            # Not a date line; keep looking.
+            continue
+        return parsed.date()
+    return None
+def main(argv: list[str] | None = None) -> int:
+    """Decide whether the bench baseline window is closed.
+    Args:
+        argv: Command-line arguments; None lets argparse read sys.argv.
+    Returns:
+        0 when both the day count and the report count meet their minimums,
+        1 on an argument error or a missing / unreadable baseline file,
+        2 when the baseline is not ready yet.
+    """
+    ap = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    ap.add_argument("--corpus", default="dev")
+    ap.add_argument("--reports-dir", default="bench/reports")
+    ap.add_argument("--baseline-file", default="bench/baseline-start.txt")
+    ap.add_argument("--min-days", type=int, default=60)
+    ap.add_argument("--min-reports", type=int, default=30)
+    ap.add_argument("--json", action="store_true")
+    try:
+        args = ap.parse_args(argv)
+    except SystemExit as exc:
+        # argparse exits with status 2 on usage errors, which would be
+        # indistinguishable from "baseline not ready". Report them with the
+        # documented argument-error code instead; --help (status 0) still
+        # propagates unchanged.
+        if exc.code in (0, None):
+            raise
+        return 1
+    baseline_path = REPO_ROOT / args.baseline_file
+    start = _read_baseline_start(baseline_path)
+    if start is None:
+        msg = f"baseline-start file missing or unreadable: {baseline_path}"
+        if args.json:
+            print(json.dumps({"status": "error", "reason": msg}))
+        else:
+            print(f"  ❌  {msg}", file=sys.stderr)
+        return 1
+    # Day arithmetic is done on UTC calendar dates.
+    today = datetime.now(timezone.utc).date()
+    days_elapsed = (today - start).days
+    days_ok = days_elapsed >= args.min_days
+    reports_dir = REPO_ROOT / args.reports_dir
+    # A report is any file named "<anything>-<corpus>.json" in the reports dir.
+    report_count = (
+        len(list(reports_dir.glob(f"*-{args.corpus}.json")))
+        if reports_dir.exists() else 0
+    )
+    reports_ok = report_count >= args.min_reports
+    ready = days_ok and reports_ok
+    payload = {
+        "status": "ready" if ready else "warmup",
+        "corpus": args.corpus,
+        "baseline_start": start.isoformat(),
+        "today": today.isoformat(),
+        "days_elapsed": days_elapsed,
+        "min_days": args.min_days,
+        "days_ok": days_ok,
+        "report_count": report_count,
+        "min_reports": args.min_reports,
+        "reports_ok": reports_ok,
+    }
+    if args.json:
+        print(json.dumps(payload, indent=2))
+    else:
+        emoji = "✅" if ready else "⏳"
+        verdict = "READY" if ready else "WARMUP"
+        print(
+            f"  {emoji}  bench-baseline · corpus={args.corpus} · "
+            f"{verdict} · days={days_elapsed}/{args.min_days} · "
+            f"reports={report_count}/{args.min_reports}"
+        )
+    return 0 if ready else 2
+if __name__ == "__main__":
+    sys.exit(main())

package/scripts/bench_drift_check.py ADDED Viewed

@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+"""Drift detector for the bench corpus — step-4 Phase 3 Step 2.
+Compares the latest `bench/reports/<stamp>-<corpus>.json` against the
+previous N reports (default 5) for the same corpus. Drift defined as:
+    - selection-accuracy: latest is more than `accuracy_drop_pp` below
+      the rolling mean (default 5 pp)
+    - cost: latest USD total is more than `cost_increase_pct` above the
+      rolling mean (default 20 %); skipped when source != "captured"
+    - quality: latest quality_score is more than `quality_drop_pp`
+      below the rolling mean (default 10 pp); skipped when source ==
+      "not_collected"
+Exit codes:
+    0 — no drift detected (or no baseline yet — warn-only)
+    1 — argument / read error
+    2 — drift detected (CI surface; not a merge gate per roadmap)
+CLI:
+    python3 scripts/bench_drift_check.py --corpus dev
+    python3 scripts/bench_drift_check.py --corpus dev --window 5 --json
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Any
+REPO_ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(REPO_ROOT / "scripts"))
+from _lib import script_output  # type: ignore[import-not-found]  # noqa: E402
+def _load_reports(reports_dir: Path, corpus: str) -> list[tuple[Path, dict[str, Any]]]:
+    """Load every ``*-<corpus>.json`` report in *reports_dir*.
+    Returns ``(path, parsed_json)`` pairs in sorted-filename order. A report
+    that cannot be read or parsed is skipped with a warning instead of
+    aborting the run.
+    """
+    out: list[tuple[Path, dict[str, Any]]] = []
+    for p in sorted(reports_dir.glob(f"*-{corpus}.json")):
+        try:
+            parsed = json.loads(p.read_text(encoding="utf-8"))
+        except (OSError, ValueError) as exc:
+            # ValueError covers json.JSONDecodeError and also
+            # UnicodeDecodeError, so a report that is not valid UTF-8 is
+            # skipped like any other unreadable file.
+            script_output.warn(f"  ⚠️  skip unreadable report {p.name}: {exc}")
+            continue
+        out.append((p, parsed))
+    return out
+def _mean(values: list[float]) -> float:
+    """Return the arithmetic mean of *values*, or 0.0 for an empty list."""
+    if not values:
+        return 0.0
+    return sum(values) / len(values)
+def _check(latest: dict[str, Any], baseline: list[dict[str, Any]],
+           thresholds: dict[str, float]) -> list[dict[str, Any]]:
+    """Compare *latest* against the mean of *baseline* on three axes.
+    Args:
+        latest: The newest report; ``selection``, ``cost`` and ``quality``
+            sections are read.
+        baseline: Earlier reports of the same shape.
+        thresholds: ``accuracy_drop_pp``, ``cost_increase_pct`` and
+            ``quality_drop_pp`` limits.
+    Returns:
+        One finding dict per axis whose limit is exceeded; empty when none.
+    """
+    drift: list[dict[str, Any]] = []
+    # Axis 1: selection accuracy, as a drop in percentage points.
+    latest_acc = float(latest["selection"]["selection_accuracy"])
+    mean_acc = _mean([float(rep["selection"]["selection_accuracy"]) for rep in baseline])
+    acc_drop_pp = (mean_acc - latest_acc) * 100.0
+    if acc_drop_pp > thresholds["accuracy_drop_pp"]:
+        drift.append({
+            "axis": "selection_accuracy",
+            "latest": latest_acc, "baseline_mean": mean_acc,
+            "delta_pp": -acc_drop_pp, "threshold_pp": -thresholds["accuracy_drop_pp"],
+        })
+    # Axis 2: cost, only when the latest report and at least one baseline
+    # report carry captured cost data.
+    captured_baseline = [rep for rep in baseline
+                         if rep["cost"].get("source") == "captured"]
+    latest_is_captured = latest["cost"].get("source") == "captured"
+    if latest_is_captured and captured_baseline:
+        latest_cost = float(latest["cost"]["totals"]["cost_usd"])
+        mean_cost = _mean([float(rep["cost"]["totals"]["cost_usd"])
+                           for rep in captured_baseline])
+        # A zero (or negative) baseline gives no meaningful percentage.
+        if mean_cost > 0:
+            increase_pct = (latest_cost - mean_cost) / mean_cost * 100.0
+            if increase_pct > thresholds["cost_increase_pct"]:
+                drift.append({
+                    "axis": "cost_usd",
+                    "latest": latest_cost, "baseline_mean": mean_cost,
+                    "delta_pct": increase_pct, "threshold_pct": thresholds["cost_increase_pct"],
+                })
+    # Axis 3: quality score, skipped when the latest report did not collect it.
+    if latest["quality"].get("source") != "not_collected":
+        latest_quality = float(latest["quality"]["quality_score"])
+        collected = [float(rep["quality"]["quality_score"])
+                     for rep in baseline
+                     if rep["quality"].get("source") != "not_collected"]
+        mean_quality = _mean(collected)
+        # A zero mean also covers "no baseline report collected quality".
+        if mean_quality:
+            quality_drop_pp = (mean_quality - latest_quality) * 100.0
+            if quality_drop_pp > thresholds["quality_drop_pp"]:
+                drift.append({
+                    "axis": "quality_score",
+                    "latest": latest_quality, "baseline_mean": mean_quality,
+                    "delta_pp": -quality_drop_pp, "threshold_pp": -thresholds["quality_drop_pp"],
+                })
+    return drift
+def main(argv: list[str] | None = None) -> int:
+    """Compare the newest bench report with the preceding window of reports.
+    Args:
+        argv: Command-line arguments; None lets argparse read sys.argv.
+    Returns:
+        0 when no drift is found or fewer than two reports exist,
+        1 on an argument error,
+        2 when at least one axis drifted.
+    """
+    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument("--corpus", default="dev")
+    ap.add_argument("--reports-dir", default="bench/reports")
+    ap.add_argument("--window", type=int, default=5, help="rolling window size (default 5)")
+    ap.add_argument("--accuracy-drop-pp", type=float, default=5.0)
+    ap.add_argument("--cost-increase-pct", type=float, default=20.0)
+    ap.add_argument("--quality-drop-pp", type=float, default=10.0)
+    ap.add_argument("--json", action="store_true", help="emit JSON instead of Markdown")
+    try:
+        args = ap.parse_args(argv)
+        # A window below 1 slices an empty (or unintended) baseline and
+        # would report "ok" without comparing anything.
+        if args.window < 1:
+            ap.error("--window must be >= 1")
+    except SystemExit as exc:
+        # argparse exits with status 2 on usage errors, which would be
+        # indistinguishable from "drift detected". Report them with the
+        # documented argument-error code instead; --help (status 0) still
+        # propagates unchanged.
+        if exc.code in (0, None):
+            raise
+        return 1
+    reports = _load_reports(REPO_ROOT / args.reports_dir, args.corpus)
+    if len(reports) < 2:
+        msg = (f"  ℹ️  bench-drift · corpus={args.corpus} · "
+               f"{len(reports)} report(s) — need ≥ 2 to compare; no drift gate yet.")
+        if args.json:
+            print(json.dumps({"status": "warmup", "reports": len(reports)}))
+        else:
+            print(msg)
+        return 0
+    latest_path, latest = reports[-1]
+    # Up to `window` reports immediately preceding the latest one.
+    baseline = [r for _, r in reports[-(args.window + 1):-1]]
+    thresholds = {
+        "accuracy_drop_pp": args.accuracy_drop_pp,
+        "cost_increase_pct": args.cost_increase_pct,
+        "quality_drop_pp": args.quality_drop_pp,
+    }
+    findings = _check(latest, baseline, thresholds)
+    payload = {
+        "status": "drift" if findings else "ok",
+        "corpus": args.corpus,
+        "latest_report": latest_path.name,
+        "baseline_window": len(baseline),
+        "thresholds": thresholds,
+        "findings": findings,
+    }
+    if args.json:
+        print(json.dumps(payload, indent=2))
+    else:
+        emoji = "⚠️" if findings else "✅"
+        print(f"  {emoji}  bench-drift · corpus={args.corpus} · "
+              f"latest={latest_path.name} · window={len(baseline)} · "
+              f"findings={len(findings)}")
+        for f in findings:
+            print(f"     · {f['axis']}: latest={f['latest']:.4f} "
+                  f"baseline_mean={f['baseline_mean']:.4f}")
+    return 2 if findings else 0
+if __name__ == "__main__":
+    sys.exit(main())