npm - @event4u/agent-config - Versions diffs - 2.12.0 → 2.13.0 - Mend

@event4u/agent-config 2.12.0 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)

package/.agent-src/commands/council/analysis.md +142 -0
package/.agent-src/commands/council/debate.md +129 -0
package/.agent-src/commands/council/default.md +8 -0
package/.agent-src/commands/council/design.md +16 -12
package/.agent-src/commands/council/optimize.md +16 -15
package/.agent-src/commands/council/pr.md +12 -12
package/.agent-src/commands/council.md +48 -2
package/.agent-src/personas/advisors/contrarian.md +95 -0
package/.agent-src/personas/advisors/executor.md +99 -0
package/.agent-src/personas/advisors/expansionist.md +98 -0
package/.agent-src/personas/advisors/first-principles.md +98 -0
package/.agent-src/personas/advisors/outsider.md +102 -0
package/.agent-src/rules/copilot-routing.md +19 -0
package/.agent-src/rules/devcontainer-routing.md +20 -0
package/.agent-src/rules/laravel-routing.md +20 -0
package/.agent-src/rules/symfony-routing.md +20 -0
package/.agent-src/skills/ai-council/SKILL.md +180 -2
package/.agent-src/skills/copilot-config/SKILL.md +1 -1
package/.agent-src/skills/devcontainer/SKILL.md +1 -1
package/.agent-src/skills/laravel/SKILL.md +1 -1
package/.agent-src/skills/project-analysis-core/SKILL.md +1 -1
package/.agent-src/skills/project-analyzer/SKILL.md +1 -1
package/.agent-src/skills/symfony-workflow/SKILL.md +1 -1
package/.agent-src/skills/universal-project-analysis/SKILL.md +1 -1
package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
package/.claude-plugin/marketplace.json +3 -1
package/AGENTS.md +1 -1
package/CHANGELOG.md +47 -0
package/CONTRIBUTING.md +5 -0
package/README.md +3 -3
package/config/agent-settings.template.yml +5 -93
package/docs/architecture/multi-tool-projection.md +53 -0
package/docs/architecture/{compression.md → source-projection.md} +21 -3
package/docs/architecture.md +5 -5
package/docs/catalog.md +21 -11
package/docs/contracts/adr-architectural-consensus-mechanism.md +67 -0
package/docs/contracts/ai-council-config.md +186 -0
package/docs/contracts/command-clusters.md +57 -1
package/docs/contracts/multi-tool-projection-fidelity.md +109 -0
package/docs/getting-started.md +2 -2
package/package.json +1 -1
package/scripts/_archive/README.md +59 -0
package/scripts/ai_council/_default_prices.py +10 -1
package/scripts/ai_council/advisors.py +148 -0
package/scripts/ai_council/clients.py +172 -0
package/scripts/ai_council/config.py +368 -0
package/scripts/ai_council/consensus.py +290 -0
package/scripts/ai_council/orchestrator.py +628 -14
package/scripts/ai_council/prompts.py +335 -0
package/scripts/check_compressed_paths.py +6 -1
package/scripts/ci_time_ratio.py +168 -0
package/scripts/council_cli.py +973 -29
package/scripts/measure_projection_bytes.py +159 -0
package/scripts/measure_roadmap_trajectory.py +112 -0
package/scripts/probe_projection_fidelity.py +202 -0
package/scripts/score_skill_selection.py +198 -0
package/scripts/skill_collision_clusters.py +162 -0
/package/scripts/{_backfill_skill_domains.py → _archive/_backfill_skill_domains.py} +0 -0
/package/scripts/{_bootstrap_tier_frontmatter.py → _archive/_bootstrap_tier_frontmatter.py} +0 -0
/package/scripts/{_p43_bodies.py → _archive/_p43_bodies.py} +0 -0
/package/scripts/{_p43_compress.py → _archive/_p43_compress.py} +0 -0
/package/scripts/{_p4_migrate.py → _archive/_p4_migrate.py} +0 -0
/package/scripts/{_phase2_shim_helper.py → _archive/_phase2_shim_helper.py} +0 -0
/package/scripts/{_pilot_council_question.py → _archive/_pilot_council_question.py} +0 -0

package/scripts/measure_projection_bytes.py ADDED Viewed

@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+"""Measure per-tool projection bytes.
+Phase 2.1 deliverable for `agents/roadmaps/step-1-v2-feedback-followup.md`
+(council finding U1 — the 0.45 % source/dist headline metric measures the
+wrong boundary). Replaces the single headline figure with per-tool numbers
+and an explicit projection-method label.
+Usage:
+    python3 scripts/measure_projection_bytes.py           # human-readable
+    python3 scripts/measure_projection_bytes.py --json    # machine-readable
+    python3 scripts/measure_projection_bytes.py --regenerate
+        # runs `task clean-tools && task generate-tools` with *all* tools
+        # enabled (via temporary .agent-tools.yml override) before measuring,
+        # then restores the original `.agent-tools.yml`. Use this to produce
+        # a complete table when the local repo only enables a subset.
+Output is intentionally non-cached and read fresh from disk every run.
+"""
+from __future__ import annotations
+import argparse
+import json
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+import yaml
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+# (surface, kind, projection-method). Surface paths are relative to the repo
+# root. `kind` is "dir" (walk recursively) or "file" (single file size).
+SURFACES: list[tuple[str, str, str]] = [
+    (".agent-src.uncompressed", "dir", "verbose source (input)"),
+    (".agent-src", "dir", "source projection (path-rewrite + .npmignore)"),
+    (".augment", "dir", "Augment Code — copies (rules) + symlinks (skills/cmds)"),
+    (".claude", "dir", "Claude Code — pure symlinks"),
+    (".cursor", "dir", "Cursor — per-rule `.mdc` materialized + symlinks"),
+    (".clinerules", "dir", "Cline — pure symlinks"),
+    (".windsurf", "dir", "Windsurf — per-rule wave-8 `.md` + symlinks"),
+    (".windsurfrules", "file", "Windsurf legacy — concatenated single file"),
+    ("GEMINI.md", "file", "Gemini CLI — symlink → AGENTS.md"),
+]
+def _measure_dir(path: Path) -> tuple[int, int, int]:
+    """Return (file_count, symlink_count, materialized_bytes) for *path*."""
+    if not path.exists():
+        return (0, 0, 0)
+    files = 0
+    links = 0
+    size = 0
+    for p in path.rglob("*"):
+        if p.is_symlink():
+            links += 1
+        elif p.is_file():
+            files += 1
+            try:
+                size += p.stat().st_size
+            except OSError:
+                pass
+    return (files, links, size)
+def _measure_file(path: Path) -> tuple[int, int, int]:
+    if path.is_symlink():
+        return (0, 1, 0)
+    if path.is_file():
+        return (1, 0, path.stat().st_size)
+    return (0, 0, 0)
+def collect() -> list[dict]:
+    rows: list[dict] = []
+    for surface, kind, method in SURFACES:
+        path = PROJECT_ROOT / surface
+        files, links, size = (
+            _measure_dir(path) if kind == "dir" else _measure_file(path)
+        )
+        rows.append(
+            {
+                "surface": surface,
+                "kind": kind,
+                "method": method,
+                "files": files,
+                "symlinks": links,
+                "bytes_materialized": size,
+                "exists": files + links > 0,
+            }
+        )
+    return rows
+def _temporarily_enable_all_tools() -> str | None:
+    tools_file = PROJECT_ROOT / ".agent-tools.yml"
+    if not tools_file.exists():
+        return None
+    original = tools_file.read_text()
+    data = yaml.safe_load(original) or {}
+    data["tools"] = [
+        "claude-code", "claude-desktop", "augment", "copilot",
+        "cursor", "windsurf", "cline", "gemini",
+    ]
+    tools_file.write_text(
+        "# TEMPORARY override by measure_projection_bytes.py — restored on exit\n"
+        + yaml.safe_dump(data, sort_keys=False)
+    )
+    return original
+def regenerate_all() -> None:
+    backup = _temporarily_enable_all_tools()
+    try:
+        subprocess.run(["task", "clean-tools"], check=True, capture_output=True)
+        subprocess.run(["task", "generate-tools"], check=True, capture_output=True)
+    finally:
+        if backup is not None:
+            (PROJECT_ROOT / ".agent-tools.yml").write_text(backup)
+def render_table(rows: list[dict]) -> str:
+    width = max(len(r["surface"]) for r in rows)
+    lines = [f"{'Surface':<{width}}  Files  Symlinks  Bytes        Method"]
+    lines.append("-" * (width + 50))
+    for r in rows:
+        lines.append(
+            f"{r['surface']:<{width}}  {r['files']:>5}  {r['symlinks']:>8}  "
+            f"{r['bytes_materialized']:>10,}  {r['method']}"
+        )
+    return "\n".join(lines)
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--json", action="store_true", help="machine-readable output")
+    parser.add_argument(
+        "--regenerate",
+        action="store_true",
+        help="regenerate all tool projections before measuring",
+    )
+    args = parser.parse_args()
+    if args.regenerate:
+        if not shutil.which("task"):
+            print("❌  `task` CLI required for --regenerate", file=sys.stderr)
+            return 2
+        regenerate_all()
+    rows = collect()
+    if args.json:
+        print(json.dumps({"surfaces": rows}, indent=2))
+    else:
+        print(render_table(rows))
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

package/scripts/measure_roadmap_trajectory.py ADDED Viewed

@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+"""Phase 5.1 — Roadmap commitment-history measurement.
+Walks `agents/roadmaps/archive/` and computes per-roadmap checkbox
+completion ratio at archival time. Output: one-line trajectory metric
+per roadmap, plus an aggregate `agents/reports/roadmap-trajectory.json`.
+Checkbox grammar (mirrors `scripts/roadmap_progress_check.py`):
+- `[ ]` — open
+- `[x]` — done
+- `[~]` — in-progress
+- `[-]` — cancelled / dropped (counts neither toward open nor closed)
+Trajectory metric = closed / (open + closed + in-progress); cancelled
+items are excluded from the denominator so a cleanly archived "we
+decided not to do this" doesn't dilute the score.
+"""
+from __future__ import annotations
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+ROOT = Path(__file__).resolve().parent.parent
+ARCHIVE = ROOT / "agents" / "roadmaps" / "archive"
+REPORT = ROOT / "agents" / "reports" / "roadmap-trajectory.json"
+CHECKBOX = re.compile(r"^\s*[-*]\s*\[(?P<state>[ x~\-])\]", re.MULTILINE)
+def measure(path: Path) -> dict:
+    text = path.read_text(encoding="utf-8", errors="replace")
+    counts = {"open": 0, "done": 0, "wip": 0, "cancelled": 0}
+    for m in CHECKBOX.finditer(text):
+        state = m.group("state")
+        if state == " ":
+            counts["open"] += 1
+        elif state == "x":
+            counts["done"] += 1
+        elif state == "~":
+            counts["wip"] += 1
+        elif state == "-":
+            counts["cancelled"] += 1
+    denom = counts["open"] + counts["done"] + counts["wip"]
+    ratio = (counts["done"] / denom) if denom else None
+    return {
+        "file": str(path.relative_to(ROOT)),
+        "counts": counts,
+        "completion_ratio": ratio,
+        "total_actionable": denom,
+    }
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--archive", default=str(ARCHIVE))
+    ap.add_argument("--report", default=str(REPORT))
+    ap.add_argument("--print-table", action="store_true")
+    args = ap.parse_args()
+    archive = Path(args.archive)
+    if not archive.exists():
+        print(f"❌  archive not found: {archive}", file=sys.stderr)
+        return 2
+    rows = [measure(p) for p in sorted(archive.glob("*.md"))]
+    # Aggregate: mean, median, count above 80%, count zero-completion
+    ratios = [r["completion_ratio"] for r in rows if r["completion_ratio"] is not None]
+    aggregate = {
+        "roadmaps": len(rows),
+        "scored": len(ratios),
+        "mean": (sum(ratios) / len(ratios)) if ratios else None,
+        "median": sorted(ratios)[len(ratios) // 2] if ratios else None,
+        "above_80pct": sum(1 for r in ratios if r >= 0.80),
+        "below_50pct": sum(1 for r in ratios if r < 0.50),
+        "zero_completion": sum(1 for r in ratios if r == 0.0),
+    }
+    out = Path(args.report)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    out.write_text(
+        json.dumps({"aggregate": aggregate, "rows": rows}, indent=2) + "\n",
+        encoding="utf-8",
+    )
+    print(f"✅  Wrote {out.relative_to(ROOT)}")
+    print(f"   roadmaps={aggregate['roadmaps']} scored={aggregate['scored']}")
+    if aggregate["mean"] is not None:
+        print(
+            f"   mean={aggregate['mean']:.1%}  median={aggregate['median']:.1%}  "
+            f"above_80%={aggregate['above_80pct']}  below_50%={aggregate['below_50pct']}  "
+            f"zero={aggregate['zero_completion']}"
+        )
+    if args.print_table:
+        print()
+        print(f"   {'file':70s} {'ratio':>7s} {'done':>5s} {'open':>5s} {'wip':>5s} {'cx':>5s}")
+        for r in sorted(rows, key=lambda x: (x["completion_ratio"] is None, -(x["completion_ratio"] or 0))):
+            ratio = "—" if r["completion_ratio"] is None else f"{r['completion_ratio']:.1%}"
+            print(
+                f"   {Path(r['file']).name:70s} {ratio:>7s} "
+                f"{r['counts']['done']:>5d} {r['counts']['open']:>5d} "
+                f"{r['counts']['wip']:>5d} {r['counts']['cancelled']:>5d}"
+            )
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

package/scripts/probe_projection_fidelity.py ADDED Viewed

@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+"""Phase 4.2 — Probe per-tool projection fidelity against the fixture.
+Reads tests/fixtures/projection_fidelity/fixtures.yml, walks the
+projected trees (.augment/, .claude/, .cursor/, .clinerules/,
+.windsurfrules, .windsurf/), and records pass/fail/partial per check.
+Output: agents/reports/projection-fidelity.json + stdout summary.
+Pure stdlib (PyYAML reuse from scripts/_lib if installed; otherwise
+inline minimal YAML loader for the fixture's restricted shape).
+"""
+from __future__ import annotations
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+try:
+    import yaml  # type: ignore
+except ImportError:  # pragma: no cover
+    print("❌  PyYAML required (already a project dep)", file=sys.stderr)
+    sys.exit(2)
+ROOT = Path(__file__).resolve().parent.parent
+TREES = {
+    "augment": ROOT / ".augment",
+    "claude": ROOT / ".claude",
+    "cursor_mdc": ROOT / ".cursor" / "rules",
+    "cursor_commands": ROOT / ".cursor" / "commands",
+    "cline": ROOT / ".clinerules",
+    "windsurf": ROOT / ".windsurfrules",
+    "windsurf_workflows": ROOT / ".windsurf" / "workflows",
+}
+def parse_frontmatter(path: Path) -> tuple[dict, str]:
+    if not path.exists():
+        return {}, ""
+    text = path.read_text(encoding="utf-8")
+    if not text.startswith("---"):
+        return {}, text
+    parts = text.split("---", 2)
+    if len(parts) < 3:
+        return {}, text
+    try:
+        fm = yaml.safe_load(parts[1]) or {}
+    except yaml.YAMLError:
+        fm = {}
+    return fm if isinstance(fm, dict) else {}, parts[2]
+def locate(tree_key: str, entry_type: str, src: str) -> Path | None:
+    """Locate the projected artefact in a given tree."""
+    name = Path(src).stem  # 'laravel-routing'
+    if entry_type == "rule":
+        if tree_key in ("augment", "claude"):
+            p = TREES[tree_key] / "rules" / Path(src).name
+            return p if p.exists() else None
+        if tree_key == "cursor_mdc":
+            p = TREES[tree_key] / f"{name}.mdc"
+            return p if p.exists() else None
+        if tree_key == "cline":
+            p = TREES[tree_key] / f"{name}.md"
+            return p if p.exists() else None
+        if tree_key == "windsurf":
+            return TREES[tree_key] if TREES[tree_key].exists() else None
+    if entry_type == "skill":
+        if tree_key in ("augment", "claude"):
+            p = TREES[tree_key] / "skills" / Path(src).parent.name / "SKILL.md"
+            return p if p.exists() else None
+    if entry_type == "command":
+        if tree_key == "augment":
+            p = TREES[tree_key] / "commands" / Path(src).name
+            return p if p.exists() else None
+        if tree_key == "claude":
+            p = TREES[tree_key] / "skills" / name / "SKILL.md"
+            return p if p.exists() else None
+        if tree_key == "cursor_commands":
+            p = TREES[tree_key] / f"{name}.md"
+            return p if p.exists() else None
+        if tree_key == "windsurf_workflows":
+            p = TREES[tree_key] / f"{name}.md"
+            return p if p.exists() else None
+    return None
+def check_entry(entry: dict) -> dict:
+    out = {"id": entry["id"], "type": entry["type"], "tier": entry.get("tier"), "results": {}}
+    for tool, spec in (entry.get("checks") or {}).items():
+        result = {"status": "pass", "details": []}
+        expect_present = spec.get("present", True)
+        path = locate(tool, entry["type"], entry["source"])
+        if tool == "windsurf" and spec.get("concatenated_in"):
+            fp = ROOT / spec["concatenated_in"]
+            if not fp.exists():
+                result["status"] = "fail"
+                result["details"].append(f"missing concat file {spec['concatenated_in']}")
+            else:
+                body = fp.read_text(encoding="utf-8")
+                needle = spec.get("body_contains")
+                if needle and needle not in body:
+                    result["status"] = "fail"
+                    result["details"].append(f"body missing '{needle}'")
+                if spec.get("routes_to_visible") is False and "routes_to" in body:
+                    result["details"].append("note: routes_to leaks into concat (info)")
+            out["results"][tool] = result
+            continue
+        if expect_present and path is None:
+            result["status"] = "fail"
+            result["details"].append("file not found")
+            out["results"][tool] = result
+            continue
+        if not expect_present:
+            if path is not None:
+                result["status"] = "fail"
+                result["details"].append(f"unexpected file at {path}")
+            else:
+                result["details"].append(f"absent (ok: {spec.get('rationale', '')})")
+            out["results"][tool] = result
+            continue
+        fm, body = parse_frontmatter(path)
+        for key in spec.get("frontmatter_keys", []) or []:
+            if key not in fm:
+                result["status"] = "fail"
+                result["details"].append(f"frontmatter missing '{key}'")
+        for key in spec.get("frontmatter_drops", []) or []:
+            if key in fm:
+                result["status"] = "fail"
+                result["details"].append(f"frontmatter unexpectedly contains '{key}'")
+        if spec.get("alwaysApply") is not None and fm.get("alwaysApply") != spec["alwaysApply"]:
+            result["status"] = "partial"
+            result["details"].append(
+                f"alwaysApply={fm.get('alwaysApply')!r} expected {spec['alwaysApply']!r}"
+            )
+        trig_kw = spec.get("triggers_keyword_contains") or []
+        trig_pp = spec.get("triggers_path_prefix_contains") or []
+        if trig_kw or trig_pp:
+            trigs = fm.get("triggers") or []
+            kws = [t.get("keyword") for t in trigs if isinstance(t, dict) and t.get("keyword")]
+            pps = [t.get("path_prefix") for t in trigs if isinstance(t, dict) and t.get("path_prefix")]
+            for kw in trig_kw:
+                if kw not in kws:
+                    result["status"] = "fail"
+                    result["details"].append(f"trigger keyword '{kw}' missing")
+            for pp in trig_pp:
+                if pp not in pps:
+                    result["status"] = "fail"
+                    result["details"].append(f"trigger path_prefix '{pp}' missing")
+        routes = spec.get("routes_to_contains") or []
+        if routes:
+            rt = fm.get("routes_to") or []
+            for r in routes:
+                if r not in rt:
+                    result["status"] = "fail"
+                    result["details"].append(f"routes_to missing '{r}'")
+        body_needle = spec.get("body_contains")
+        if body_needle and body_needle not in body:
+            result["status"] = "fail"
+            result["details"].append(f"body missing '{body_needle}'")
+        out["results"][tool] = result
+    return out
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--fixture", default="tests/fixtures/projection_fidelity/fixtures.yml")
+    ap.add_argument("--report", default="agents/reports/projection-fidelity.json")
+    args = ap.parse_args()
+    fixture = yaml.safe_load((ROOT / args.fixture).read_text(encoding="utf-8"))
+    entries = fixture.get("entries", [])
+    results = [check_entry(e) for e in entries]
+    summary = {"pass": 0, "partial": 0, "fail": 0}
+    for e in results:
+        for r in e["results"].values():
+            summary[r["status"]] += 1
+    report = {"summary": summary, "entries": results}
+    out = ROOT / args.report
+    out.parent.mkdir(parents=True, exist_ok=True)
+    out.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8")
+    print(f"✅  Wrote {args.report}")
+    print(f"   pass={summary['pass']} partial={summary['partial']} fail={summary['fail']}")
+    for e in results:
+        for tool, r in e["results"].items():
+            if r["status"] != "pass":
+                print(f"   {r['status']:7s} {e['id']:40s} {tool:18s} {'; '.join(r['details'])}")
+    return 0 if summary["fail"] == 0 else 1
+if __name__ == "__main__":
+    sys.exit(main())

package/scripts/score_skill_selection.py ADDED Viewed

@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+"""Selection-accuracy scorer (council file 05, Phase 2.2).
+Reads `tests/fixtures/skill_selection/fixtures.yml` and a predictions
+JSON (`{fixture_id: selected_skill_name}`), then computes:
+- (a) intended-skill hit rate — exact `intended` match
+- (b) correct-cluster hit rate — any member of the same cluster
+Per-cluster pass/fail uses the Round-3 protocol:
+    pass = (a) >= 0.90  OR  (b) >= 0.95
+    fail = (a) <  0.80  AND  (b) <  0.80   →  cluster needs `routes_to`
+Predictions source:
+- `--predictions <path>`: external JSON file (LLM run, eval harness, manual).
+- `--baseline`: built-in TF-IDF-style description-similarity baseline. The
+  baseline does NOT speak for any specific host tool; it estimates what
+  pure description-matching would do and provides a numeric floor.
+Output: human-readable summary on stdout + machine JSON to
+`agents/reports/skill-selection-accuracy.json` (or `--out`).
+"""
+from __future__ import annotations
+import argparse
+import json
+import math
+import re
+import sys
+from collections import Counter, defaultdict
+from pathlib import Path
+import yaml
+REPO_ROOT = Path(__file__).resolve().parent.parent
+FIXTURES = REPO_ROOT / "tests" / "fixtures" / "skill_selection" / "fixtures.yml"
+CLUSTERS = REPO_ROOT / "agents" / "reports" / "skill-collision-clusters.json"
+SKILLS_DIR = REPO_ROOT / ".agent-src.uncompressed" / "skills"
+DEFAULT_OUT = REPO_ROOT / "agents" / "reports" / "skill-selection-accuracy.json"
+PASS_A = 0.90
+PASS_B = 0.95
+FAIL_THRESHOLD = 0.80
+STOPWORDS = {
+    "the", "and", "for", "with", "when", "use", "or", "of", "to", "a", "an",
+    "is", "in", "on", "by", "be", "at", "as", "it", "if", "are", "this",
+    "that", "from", "but", "not", "can", "any", "all", "no", "after",
+    "before", "during", "user", "agent", "code", "project", "via", "into",
+    "onto", "even", "without", "naming", "uses", "used", "using", "also",
+    "etc", "across", "between",
+}
+def tokenize(text: str) -> list[str]:
+    tokens = re.findall(r"[A-Za-z][A-Za-z0-9_-]{2,}", text.lower())
+    return [t for t in tokens if t not in STOPWORDS and not t.isdigit()]
+def load_skills() -> dict[str, str]:
+    out = {}
+    for skill_md in sorted(SKILLS_DIR.glob("*/SKILL.md")):
+        text = skill_md.read_text()
+        if not text.startswith("---"):
+            continue
+        parts = text.split("---", 2)
+        if len(parts) < 3:
+            continue
+        try:
+            fm = yaml.safe_load(parts[1]) or {}
+        except yaml.YAMLError:
+            continue
+        name = fm.get("name") or skill_md.parent.name
+        desc = (fm.get("description") or "").strip()
+        if desc:
+            out[name] = desc
+    return out
+def tfidf_vectors(docs: dict[str, str]) -> tuple[dict[str, dict[str, float]], dict[str, float]]:
+    n_docs = len(docs)
+    df: Counter[str] = Counter()
+    tokenized = {k: tokenize(v) for k, v in docs.items()}
+    for toks in tokenized.values():
+        for term in set(toks):
+            df[term] += 1
+    idf = {term: math.log((n_docs + 1) / (count + 1)) + 1 for term, count in df.items()}
+    vectors: dict[str, dict[str, float]] = {}
+    for name, toks in tokenized.items():
+        tf = Counter(toks)
+        vectors[name] = {term: tf[term] * idf.get(term, 0.0) for term in tf}
+    return vectors, idf
+def cosine(a: dict[str, float], b: dict[str, float]) -> float:
+    if not a or not b:
+        return 0.0
+    common = set(a) & set(b)
+    dot = sum(a[t] * b[t] for t in common)
+    na = math.sqrt(sum(v * v for v in a.values()))
+    nb = math.sqrt(sum(v * v for v in b.values()))
+    if na == 0 or nb == 0:
+        return 0.0
+    return dot / (na * nb)
+def baseline_predict(fixtures: list[dict], skills: dict[str, str]) -> dict[str, str]:
+    vectors, idf = tfidf_vectors(skills)
+    preds: dict[str, str] = {}
+    for fx in fixtures:
+        prompt_tokens = tokenize(fx["prompt"])
+        tf = Counter(prompt_tokens)
+        pv = {term: tf[term] * idf.get(term, 0.0) for term in tf}
+        best_name, best_score = "", -1.0
+        for name, vec in vectors.items():
+            score = cosine(pv, vec)
+            if score > best_score:
+                best_name, best_score = name, score
+        preds[fx["id"]] = best_name
+    return preds
+def score(fixtures: list[dict], clusters: list[dict], preds: dict[str, str]) -> dict:
+    # Look up cluster membership by intended-skill (robust to cluster_id renumbering).
+    by_member: dict[str, set[str]] = {}
+    for c in clusters:
+        members = set(c["members"])
+        for m in members:
+            by_member[m] = members
+    per_cluster = defaultdict(lambda: {"total": 0, "hits_a": 0, "hits_b": 0, "misses": [], "label": ""})
+    for fx in fixtures:
+        intended = fx["intended"]
+        members = by_member.get(intended, {intended})
+        # Stable label: sorted members joined — survives cluster_id renumbering.
+        cid = fx.get("cluster") or "+".join(sorted(members)[:2])
+        pred = preds.get(fx["id"], "")
+        rec = per_cluster[cid]
+        rec["total"] += 1
+        rec["label"] = ",".join(sorted(members))
+        if pred == intended:
+            rec["hits_a"] += 1
+        if pred in members:
+            rec["hits_b"] += 1
+        else:
+            rec["misses"].append({"id": fx["id"], "intended": intended, "predicted": pred})
+    results = []
+    for cid, rec in sorted(per_cluster.items()):
+        a = rec["hits_a"] / rec["total"]
+        b = rec["hits_b"] / rec["total"]
+        if a >= PASS_A or b >= PASS_B:
+            verdict = "pass"
+        elif a < FAIL_THRESHOLD and b < FAIL_THRESHOLD:
+            verdict = "fail-needs-routes_to"
+        else:
+            verdict = "mixed"
+        results.append({"cluster": cid, "n": rec["total"], "hit_a": round(a, 3),
+                        "hit_b": round(b, 3), "verdict": verdict, "misses": rec["misses"]})
+    total = sum(r["n"] for r in results)
+    overall_a = sum(r["hit_a"] * r["n"] for r in results) / total if total else 0.0
+    overall_b = sum(r["hit_b"] * r["n"] for r in results) / total if total else 0.0
+    return {"clusters": results,
+            "overall": {"n": total, "hit_a": round(overall_a, 3), "hit_b": round(overall_b, 3)}}
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("--predictions", type=Path, help="JSON file: {fixture_id: skill_name}")
+    p.add_argument("--baseline", action="store_true", help="Use built-in TF-IDF baseline")
+    p.add_argument("--source", default="external", help="Label recorded in output")
+    p.add_argument("--out", type=Path, default=DEFAULT_OUT)
+    args = p.parse_args()
+    if not args.predictions and not args.baseline:
+        print("❌  Specify --predictions <file> or --baseline", file=sys.stderr)
+        return 2
+    fixtures = yaml.safe_load(FIXTURES.read_text())["fixtures"]
+    clusters = json.loads(CLUSTERS.read_text())["clusters"]
+    skills = load_skills()
+    if args.baseline:
+        preds = baseline_predict(fixtures, skills)
+        source = "tfidf-baseline"
+    else:
+        preds = json.loads(args.predictions.read_text())
+        source = args.source
+    report = score(fixtures, clusters, preds)
+    report["source"] = source
+    args.out.parent.mkdir(parents=True, exist_ok=True)
+    args.out.write_text(json.dumps(report, indent=2) + "\n")
+    print(f"✅  Wrote {args.out.relative_to(REPO_ROOT)}  (source={source})")
+    print(f"   overall: hit_a={report['overall']['hit_a']:.3f}  hit_b={report['overall']['hit_b']:.3f}  n={report['overall']['n']}")
+    for c in report["clusters"]:
+        print(f"   {c['cluster']:6}  n={c['n']:2}  hit_a={c['hit_a']:.2f}  hit_b={c['hit_b']:.2f}  {c['verdict']}")
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())