@event4u/agent-config 2.11.0 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/.agent-src/commands/council/analysis.md +142 -0
  2. package/.agent-src/commands/council/debate.md +129 -0
  3. package/.agent-src/commands/council/default.md +8 -0
  4. package/.agent-src/commands/council/design.md +16 -12
  5. package/.agent-src/commands/council/optimize.md +16 -15
  6. package/.agent-src/commands/council/pr.md +12 -12
  7. package/.agent-src/commands/council.md +48 -2
  8. package/.agent-src/personas/advisors/contrarian.md +95 -0
  9. package/.agent-src/personas/advisors/executor.md +99 -0
  10. package/.agent-src/personas/advisors/expansionist.md +98 -0
  11. package/.agent-src/personas/advisors/first-principles.md +98 -0
  12. package/.agent-src/personas/advisors/outsider.md +102 -0
  13. package/.agent-src/rules/copilot-routing.md +19 -0
  14. package/.agent-src/rules/devcontainer-routing.md +20 -0
  15. package/.agent-src/rules/laravel-routing.md +20 -0
  16. package/.agent-src/rules/symfony-routing.md +20 -0
  17. package/.agent-src/skills/ai-council/SKILL.md +180 -2
  18. package/.agent-src/skills/canvas-design/SKILL.md +132 -0
  19. package/.agent-src/skills/canvas-design/evals/triggers.json +16 -0
  20. package/.agent-src/skills/copilot-config/SKILL.md +1 -1
  21. package/.agent-src/skills/devcontainer/SKILL.md +1 -1
  22. package/.agent-src/skills/doc-coauthoring/SKILL.md +129 -0
  23. package/.agent-src/skills/doc-coauthoring/evals/triggers.json +16 -0
  24. package/.agent-src/skills/laravel/SKILL.md +1 -1
  25. package/.agent-src/skills/project-analysis-core/SKILL.md +1 -1
  26. package/.agent-src/skills/project-analyzer/SKILL.md +1 -1
  27. package/.agent-src/skills/skill-writing/SKILL.md +101 -16
  28. package/.agent-src/skills/sql-writing/SKILL.md +1 -1
  29. package/.agent-src/skills/symfony-workflow/SKILL.md +1 -1
  30. package/.agent-src/skills/universal-project-analysis/SKILL.md +1 -1
  31. package/.agent-src/templates/agents/agent-project-settings.example.yml +1 -1
  32. package/.claude-plugin/marketplace.json +5 -1
  33. package/AGENTS.md +1 -1
  34. package/CHANGELOG.md +78 -0
  35. package/CONTRIBUTING.md +5 -0
  36. package/README.md +3 -3
  37. package/config/agent-settings.template.yml +5 -84
  38. package/docs/architecture/multi-tool-projection.md +53 -0
  39. package/docs/architecture/{compression.md → source-projection.md} +21 -3
  40. package/docs/architecture.md +6 -6
  41. package/docs/catalog.md +21 -11
  42. package/docs/contracts/adr-architectural-consensus-mechanism.md +67 -0
  43. package/docs/contracts/adr-level-6-productization.md +2 -2
  44. package/docs/contracts/ai-council-config.md +186 -0
  45. package/docs/contracts/command-clusters.md +57 -1
  46. package/docs/contracts/multi-tool-projection-fidelity.md +109 -0
  47. package/docs/getting-started.md +2 -2
  48. package/package.json +1 -1
  49. package/scripts/_archive/README.md +59 -0
  50. package/scripts/ai_council/_default_prices.py +10 -1
  51. package/scripts/ai_council/advisors.py +148 -0
  52. package/scripts/ai_council/clients.py +189 -4
  53. package/scripts/ai_council/config.py +368 -0
  54. package/scripts/ai_council/consensus.py +290 -0
  55. package/scripts/ai_council/orchestrator.py +634 -16
  56. package/scripts/ai_council/prompts.py +335 -0
  57. package/scripts/check_compressed_paths.py +6 -1
  58. package/scripts/check_references.py +25 -0
  59. package/scripts/ci_time_ratio.py +168 -0
  60. package/scripts/council_cli.py +1007 -32
  61. package/scripts/measure_projection_bytes.py +159 -0
  62. package/scripts/measure_roadmap_trajectory.py +112 -0
  63. package/scripts/probe_projection_fidelity.py +202 -0
  64. package/scripts/run_skill_evals.py +185 -0
  65. package/scripts/schemas/skill.schema.json +4 -0
  66. package/scripts/score_skill_selection.py +198 -0
  67. package/scripts/skill_collision_clusters.py +162 -0
  68. package/scripts/skill_linter.py +71 -1
  69. /package/scripts/{_backfill_skill_domains.py → _archive/_backfill_skill_domains.py} +0 -0
  70. /package/scripts/{_bootstrap_tier_frontmatter.py → _archive/_bootstrap_tier_frontmatter.py} +0 -0
  71. /package/scripts/{_p43_bodies.py → _archive/_p43_bodies.py} +0 -0
  72. /package/scripts/{_p43_compress.py → _archive/_p43_compress.py} +0 -0
  73. /package/scripts/{_p4_migrate.py → _archive/_p4_migrate.py} +0 -0
  74. /package/scripts/{_phase2_shim_helper.py → _archive/_phase2_shim_helper.py} +0 -0
  75. /package/scripts/{_pilot_council_question.py → _archive/_pilot_council_question.py} +0 -0
@@ -0,0 +1,159 @@
1
+ #!/usr/bin/env python3
2
+ """Measure per-tool projection bytes.
3
+
4
+ Phase 2.1 deliverable for `agents/roadmaps/step-1-v2-feedback-followup.md`
5
+ (council finding U1 — the 0.45 % source/dist headline metric measures the
6
+ wrong boundary). Replaces the single headline figure with per-tool numbers
7
+ and an explicit projection-method label.
8
+
9
+ Usage:
10
+ python3 scripts/measure_projection_bytes.py # human-readable
11
+ python3 scripts/measure_projection_bytes.py --json # machine-readable
12
+ python3 scripts/measure_projection_bytes.py --regenerate
13
+ # runs `task clean-tools && task generate-tools` with *all* tools
14
+ # enabled (via temporary .agent-tools.yml override) before measuring,
15
+ # then restores the original `.agent-tools.yml`. Use this to produce
16
+ # a complete table when the local repo only enables a subset.
17
+
18
+ Output is intentionally non-cached and read fresh from disk every run.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import json
25
+ import shutil
26
+ import subprocess
27
+ import sys
28
+ from pathlib import Path
29
+
30
+ import yaml
31
+
32
# Repository root: this script lives one directory below it.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# One row per measured surface, as (surface, kind, projection-method).
# `surface` is a path relative to the repository root; `kind` selects the
# measuring strategy: "dir" is walked recursively, "file" is sized on its own.
# The third field is a free-text label that is echoed into the report.
SURFACES: list[tuple[str, str, str]] = [
    # Source trees.
    (".agent-src.uncompressed", "dir", "verbose source (input)"),
    (".agent-src", "dir", "source projection (path-rewrite + .npmignore)"),
    # Per-tool projections.
    (".augment", "dir", "Augment Code — copies (rules) + symlinks (skills/cmds)"),
    (".claude", "dir", "Claude Code — pure symlinks"),
    (".cursor", "dir", "Cursor — per-rule `.mdc` materialized + symlinks"),
    (".clinerules", "dir", "Cline — pure symlinks"),
    (".windsurf", "dir", "Windsurf — per-rule wave-8 `.md` + symlinks"),
    # Single-file surfaces.
    (".windsurfrules", "file", "Windsurf legacy — concatenated single file"),
    ("GEMINI.md", "file", "Gemini CLI — symlink → AGENTS.md"),
]
47
+
48
+
49
def _measure_dir(path: Path) -> tuple[int, int, int]:
    """Tally what lives beneath *path*.

    Returns ``(file_count, symlink_count, materialized_bytes)``. Symlinks are
    counted but never sized; only regular, non-symlink files contribute
    bytes. A path that does not exist yields ``(0, 0, 0)``.
    """
    file_count = link_count = total_bytes = 0
    if path.exists():
        for entry in path.rglob("*"):
            if entry.is_symlink():
                link_count += 1
                continue
            if not entry.is_file():
                continue
            file_count += 1
            try:
                total_bytes += entry.stat().st_size
            except OSError:
                # Best effort: the file is still counted, its size is skipped.
                continue
    return (file_count, link_count, total_bytes)
66
+
67
+
68
def _measure_file(path: Path) -> tuple[int, int, int]:
    """Measure a single-file surface.

    Returns ``(file_count, symlink_count, materialized_bytes)``: a symlink
    counts as one link with zero bytes, a regular file as one file with its
    size, and anything else (missing path, directory) as all zeros.
    """
    if path.is_symlink():
        counts = (0, 1, 0)
    elif path.is_file():
        counts = (1, 0, path.stat().st_size)
    else:
        counts = (0, 0, 0)
    return counts
74
+
75
+
76
def collect() -> list[dict]:
    """Measure every entry of ``SURFACES`` and return one report row each.

    Each row carries the surface path, its kind and method label, the file
    and symlink counts, the materialized byte total, and an ``exists`` flag
    that is true when at least one file or symlink was found.
    """

    def _row(surface: str, kind: str, method: str) -> dict:
        measurer = _measure_dir if kind == "dir" else _measure_file
        files, links, size = measurer(PROJECT_ROOT / surface)
        return {
            "surface": surface,
            "kind": kind,
            "method": method,
            "files": files,
            "symlinks": links,
            "bytes_materialized": size,
            "exists": files + links > 0,
        }

    return [_row(*spec) for spec in SURFACES]
95
+
96
+
97
def _temporarily_enable_all_tools() -> str | None:
    """Rewrite `.agent-tools.yml` so that every tool is enabled.

    Returns the original file text so the caller can restore it, or ``None``
    when the file does not exist (in which case nothing is written).

    The file is read and written as UTF-8 explicitly: the banner written
    below contains a non-ASCII character, and relying on the platform default
    encoding could fail mid-write, after the file has been truncated.
    """
    tools_file = PROJECT_ROOT / ".agent-tools.yml"
    if not tools_file.exists():
        return None
    original = tools_file.read_text(encoding="utf-8")
    data = yaml.safe_load(original) or {}
    data["tools"] = [
        "claude-code", "claude-desktop", "augment", "copilot",
        "cursor", "windsurf", "cline", "gemini",
    ]
    tools_file.write_text(
        "# TEMPORARY override by measure_projection_bytes.py — restored on exit\n"
        + yaml.safe_dump(data, sort_keys=False),
        encoding="utf-8",
    )
    return original


def regenerate_all() -> None:
    """Regenerate all tool projections with every tool temporarily enabled.

    Runs `task clean-tools` then `task generate-tools` from the repository
    root, so the Taskfile that is picked up belongs to the same repository
    whose `.agent-tools.yml` was overridden, whatever the caller's working
    directory. The original `.agent-tools.yml` is restored afterwards, also
    on failure.

    Raises:
        subprocess.CalledProcessError: if either task exits non-zero. The
            captured stderr is echoed first so the cause is visible.
    """
    backup = _temporarily_enable_all_tools()
    try:
        for target in ("clean-tools", "generate-tools"):
            try:
                subprocess.run(
                    ["task", target],
                    check=True,
                    capture_output=True,
                    cwd=PROJECT_ROOT,
                )
            except subprocess.CalledProcessError as exc:
                # Output is captured, so without this the reason is lost.
                if exc.stderr:
                    print(
                        exc.stderr.decode("utf-8", errors="replace"),
                        file=sys.stderr,
                        end="",
                    )
                raise
    finally:
        if backup is not None:
            (PROJECT_ROOT / ".agent-tools.yml").write_text(backup, encoding="utf-8")
122
+
123
+
124
def render_table(rows: list[dict]) -> str:
    """Render *rows* as a fixed-width, human-readable table.

    Each row needs the keys ``surface``, ``files``, ``symlinks``,
    ``bytes_materialized`` and ``method``. The header labels are padded with
    the same column widths as the data rows so the columns line up. An empty
    *rows* list yields just the header and the separator instead of raising.
    """
    # Never narrower than the header label, and safe for an empty list.
    width = max([len("Surface"), *(len(r["surface"]) for r in rows)])
    lines = [
        f"{'Surface':<{width}} {'Files':>5} {'Symlinks':>8} {'Bytes':>10} Method",
        "-" * (width + 50),
    ]
    lines.extend(
        f"{r['surface']:<{width}} {r['files']:>5} {r['symlinks']:>8} "
        f"{r['bytes_materialized']:>10,} {r['method']}"
        for r in rows
    )
    return "\n".join(lines)
134
+
135
+
136
def main() -> int:
    """Parse the command line, optionally regenerate, then print the report.

    Returns 0 on success, or 2 when ``--regenerate`` is requested but the
    `task` executable cannot be found on PATH.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--json", action="store_true", help="machine-readable output")
    cli.add_argument(
        "--regenerate",
        action="store_true",
        help="regenerate all tool projections before measuring",
    )
    options = cli.parse_args()

    if options.regenerate:
        if not shutil.which("task"):
            print("❌ `task` CLI required for --regenerate", file=sys.stderr)
            return 2
        regenerate_all()

    rows = collect()
    rendered = (
        json.dumps({"surfaces": rows}, indent=2) if options.json else render_table(rows)
    )
    print(rendered)
    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env python3
2
+ """Phase 5.1 — Roadmap commitment-history measurement.
3
+
4
+ Walks `agents/roadmaps/archive/` and computes per-roadmap checkbox
5
+ completion ratio at archival time. Output: one-line trajectory metric
6
+ per roadmap, plus an aggregate `agents/reports/roadmap-trajectory.json`.
7
+
8
+ Checkbox grammar (mirrors `scripts/roadmap_progress_check.py`):
9
+ - `[ ]` — open
10
+ - `[x]` — done
11
+ - `[~]` — in-progress
12
+ - `[-]` — cancelled / dropped (counts neither toward open nor closed)
13
+
14
+ Trajectory metric = closed / (open + closed + in-progress); cancelled
15
+ items are excluded from the denominator so a cleanly archived "we
16
+ decided not to do this" doesn't dilute the score.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import argparse
22
+ import json
23
+ import re
24
+ import sys
25
+ from pathlib import Path
26
+
27
# Repository root: this script lives one directory below it.
ROOT = Path(__file__).resolve().parents[1]
# Default input directory and output report; both can be overridden on the CLI.
ARCHIVE = ROOT.joinpath("agents", "roadmaps", "archive")
REPORT = ROOT.joinpath("agents", "reports", "roadmap-trajectory.json")

# A list bullet (`-` or `*`) followed by a bracketed one-character state:
# space, lowercase `x`, `~` or `-`. Matching is per line (MULTILINE).
CHECKBOX = re.compile(r"^\s*[-*]\s*\[(?P<state>[ x~\-])\]", re.MULTILINE)
32
+
33
+
34
def measure(path: Path) -> dict:
    """Count checkbox states in one roadmap file and derive its completion ratio.

    The ratio is ``done / (open + done + wip)``; cancelled items are counted
    but left out of the denominator. It is ``None`` when the file has no
    actionable checkbox.

    The ``file`` field is the path relative to ``ROOT`` when the file lives
    inside the repository. A path that cannot be expressed that way (a
    relative or out-of-repo ``--archive``) is reported as given instead of
    raising ``ValueError``.
    """
    text = path.read_text(encoding="utf-8", errors="replace")

    bucket_for = {" ": "open", "x": "done", "~": "wip", "-": "cancelled"}
    counts = {"open": 0, "done": 0, "wip": 0, "cancelled": 0}
    for m in CHECKBOX.finditer(text):
        counts[bucket_for[m.group("state")]] += 1

    denom = counts["open"] + counts["done"] + counts["wip"]
    ratio = (counts["done"] / denom) if denom else None

    # Try the path as given first, so paths that already worked keep their
    # exact output, then its resolved form, then fall back to the raw path.
    shown = str(path)
    for candidate in (path, path.resolve()):
        try:
            shown = str(candidate.relative_to(ROOT))
            break
        except ValueError:
            continue

    return {
        "file": shown,
        "counts": counts,
        "completion_ratio": ratio,
        "total_actionable": denom,
    }
55
+
56
+
57
def main() -> int:
    """Measure every archived roadmap, write the JSON report, print a summary.

    Returns 0 on success, or 2 when the archive directory does not exist.

    The aggregate ``median`` is a true median: for an even number of scored
    roadmaps it is the mean of the two middle values. The report path is
    printed relative to ``ROOT`` when possible and verbatim otherwise, so a
    ``--report`` outside the repository does not crash after the file has
    been written.
    """
    from statistics import median  # local import: only needed here

    ap = argparse.ArgumentParser()
    ap.add_argument("--archive", default=str(ARCHIVE))
    ap.add_argument("--report", default=str(REPORT))
    ap.add_argument("--print-table", action="store_true")
    args = ap.parse_args()

    archive = Path(args.archive)
    if not archive.exists():
        print(f"❌ archive not found: {archive}", file=sys.stderr)
        return 2

    rows = [measure(p) for p in sorted(archive.glob("*.md"))]

    # Only roadmaps with at least one actionable checkbox have a ratio.
    ratios = [r["completion_ratio"] for r in rows if r["completion_ratio"] is not None]
    aggregate = {
        "roadmaps": len(rows),
        "scored": len(ratios),
        "mean": (sum(ratios) / len(ratios)) if ratios else None,
        "median": median(ratios) if ratios else None,
        "above_80pct": sum(1 for r in ratios if r >= 0.80),
        "below_50pct": sum(1 for r in ratios if r < 0.50),
        "zero_completion": sum(1 for r in ratios if r == 0.0),
    }

    out = Path(args.report)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(
        json.dumps({"aggregate": aggregate, "rows": rows}, indent=2) + "\n",
        encoding="utf-8",
    )

    try:
        shown = out.relative_to(ROOT)
    except ValueError:
        shown = out
    print(f"✅ Wrote {shown}")
    print(f" roadmaps={aggregate['roadmaps']} scored={aggregate['scored']}")
    if aggregate["mean"] is not None:
        print(
            f" mean={aggregate['mean']:.1%} median={aggregate['median']:.1%} "
            f"above_80%={aggregate['above_80pct']} below_50%={aggregate['below_50pct']} "
            f"zero={aggregate['zero_completion']}"
        )
    if args.print_table:
        print()
        print(f" {'file':70s} {'ratio':>7s} {'done':>5s} {'open':>5s} {'wip':>5s} {'cx':>5s}")
        # Highest ratio first; unscored roadmaps last.
        for r in sorted(rows, key=lambda x: (x["completion_ratio"] is None, -(x["completion_ratio"] or 0))):
            ratio = "—" if r["completion_ratio"] is None else f"{r['completion_ratio']:.1%}"
            print(
                f" {Path(r['file']).name:70s} {ratio:>7s} "
                f"{r['counts']['done']:>5d} {r['counts']['open']:>5d} "
                f"{r['counts']['wip']:>5d} {r['counts']['cancelled']:>5d}"
            )
    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,202 @@
1
+ #!/usr/bin/env python3
2
+ """Phase 4.2 — Probe per-tool projection fidelity against the fixture.
3
+
4
+ Reads tests/fixtures/projection_fidelity/fixtures.yml, walks the
5
+ projected trees (.augment/, .claude/, .cursor/, .clinerules/,
6
+ .windsurfrules, .windsurf/), and records pass/fail/partial per check.
7
+
8
+ Output: agents/reports/projection-fidelity.json + stdout summary.
9
+
10
+ Requires PyYAML (already a project dependency); the script exits with
11
+ status 2 when it cannot be imported.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import json
18
+ import re
19
+ import sys
20
+ from pathlib import Path
21
+
22
+ try:
23
+ import yaml # type: ignore
24
+ except ImportError: # pragma: no cover
25
+ print("❌ PyYAML required (already a project dep)", file=sys.stderr)
26
+ sys.exit(2)
27
+
28
# Repository root: this script lives one directory below it.
ROOT = Path(__file__).resolve().parents[1]

# Check key -> projected location that is probed for that key.
TREES = {
    "augment": ROOT.joinpath(".augment"),
    "claude": ROOT.joinpath(".claude"),
    "cursor_mdc": ROOT.joinpath(".cursor", "rules"),
    "cursor_commands": ROOT.joinpath(".cursor", "commands"),
    "cline": ROOT.joinpath(".clinerules"),
    "windsurf": ROOT.joinpath(".windsurfrules"),
    "windsurf_workflows": ROOT.joinpath(".windsurf", "workflows"),
}
39
+
40
+
41
def parse_frontmatter(path: Path) -> tuple[dict, str]:
    """Split a file into its YAML frontmatter mapping and the remaining text.

    Returns ``({}, "")`` when *path* does not exist, and ``({}, text)`` when
    the text does not start with ``---`` or has no second ``---``.
    Frontmatter that fails to parse, or that is not a mapping, is returned
    as an empty dict alongside the text after the second ``---``.
    """
    if not path.exists():
        return {}, ""
    raw = path.read_text(encoding="utf-8")
    pieces = raw.split("---", 2) if raw.startswith("---") else []
    if len(pieces) < 3:
        return {}, raw
    _, header, remainder = pieces
    try:
        meta = yaml.safe_load(header) or {}
    except yaml.YAMLError:
        meta = {}
    if not isinstance(meta, dict):
        meta = {}
    return meta, remainder
55
+
56
+
57
def locate(tree_key: str, entry_type: str, src: str) -> Path | None:
    """Return the projected artefact for *src* in a tree, or ``None``.

    The expected location depends on the ``(entry_type, tree_key)`` pair;
    pairs that are not listed below have no projection and yield ``None``.
    A listed location is returned only if it exists on disk.
    """
    source = Path(src)
    stem = source.stem  # e.g. 'laravel-routing'

    def _in_rules(base: Path) -> Path:
        return base / "rules" / source.name

    def _as_skill_dir(base: Path) -> Path:
        return base / "skills" / source.parent.name / "SKILL.md"

    def _stem_md(base: Path) -> Path:
        return base / f"{stem}.md"

    builders = {
        ("rule", "augment"): _in_rules,
        ("rule", "claude"): _in_rules,
        ("rule", "cursor_mdc"): lambda base: base / f"{stem}.mdc",
        ("rule", "cline"): _stem_md,
        # The windsurf rule surface is the tree path itself.
        ("rule", "windsurf"): lambda base: base,
        ("skill", "augment"): _as_skill_dir,
        ("skill", "claude"): _as_skill_dir,
        ("command", "augment"): lambda base: base / "commands" / source.name,
        ("command", "claude"): lambda base: base / "skills" / stem / "SKILL.md",
        ("command", "cursor_commands"): _stem_md,
        ("command", "windsurf_workflows"): _stem_md,
    }

    build = builders.get((entry_type, tree_key))
    if build is None:
        return None
    candidate = build(TREES[tree_key])
    return candidate if candidate.exists() else None
90
+
91
+
92
def check_entry(entry: dict) -> dict:
    """Run every per-tool check declared on one fixture entry.

    *entry* must provide ``id``, ``type`` and ``source``; ``tier`` and
    ``checks`` are optional. Returns a dict with the entry's ``id``,
    ``type``, ``tier`` and a ``results`` mapping of tool name to
    ``{"status": "pass" | "partial" | "fail", "details": [...]}``.

    Status precedence is fail > partial > pass: an ``alwaysApply`` mismatch
    only downgrades a result that is still passing, so it can never replace
    a failure that an earlier check recorded. A check whose spec is null is
    treated as an empty spec instead of raising.
    """

    def _fail(result: dict, message: str) -> None:
        result["status"] = "fail"
        result["details"].append(message)

    out = {"id": entry["id"], "type": entry["type"], "tier": entry.get("tier"), "results": {}}
    for tool, spec in (entry.get("checks") or {}).items():
        spec = spec or {}
        result = {"status": "pass", "details": []}
        # Registered up front; every branch below mutates it in place.
        out["results"][tool] = result
        expect_present = spec.get("present", True)
        path = locate(tool, entry["type"], entry["source"])

        # The windsurf concat surface is checked against one file only.
        if tool == "windsurf" and spec.get("concatenated_in"):
            fp = ROOT / spec["concatenated_in"]
            if not fp.exists():
                _fail(result, f"missing concat file {spec['concatenated_in']}")
            else:
                body = fp.read_text(encoding="utf-8")
                needle = spec.get("body_contains")
                if needle and needle not in body:
                    _fail(result, f"body missing '{needle}'")
                if spec.get("routes_to_visible") is False and "routes_to" in body:
                    # Informational only: the status is left untouched.
                    result["details"].append("note: routes_to leaks into concat (info)")
            continue

        if not expect_present:
            if path is not None:
                _fail(result, f"unexpected file at {path}")
            else:
                result["details"].append(f"absent (ok: {spec.get('rationale', '')})")
            continue
        if path is None:
            _fail(result, "file not found")
            continue

        fm, body = parse_frontmatter(path)
        for key in spec.get("frontmatter_keys", []) or []:
            if key not in fm:
                _fail(result, f"frontmatter missing '{key}'")
        for key in spec.get("frontmatter_drops", []) or []:
            if key in fm:
                _fail(result, f"frontmatter unexpectedly contains '{key}'")

        if spec.get("alwaysApply") is not None and fm.get("alwaysApply") != spec["alwaysApply"]:
            if result["status"] == "pass":
                result["status"] = "partial"
            result["details"].append(
                f"alwaysApply={fm.get('alwaysApply')!r} expected {spec['alwaysApply']!r}"
            )

        trig_kw = spec.get("triggers_keyword_contains") or []
        trig_pp = spec.get("triggers_path_prefix_contains") or []
        if trig_kw or trig_pp:
            trigs = fm.get("triggers") or []
            kws = [t.get("keyword") for t in trigs if isinstance(t, dict) and t.get("keyword")]
            pps = [t.get("path_prefix") for t in trigs if isinstance(t, dict) and t.get("path_prefix")]
            for kw in trig_kw:
                if kw not in kws:
                    _fail(result, f"trigger keyword '{kw}' missing")
            for pp in trig_pp:
                if pp not in pps:
                    _fail(result, f"trigger path_prefix '{pp}' missing")

        routes = spec.get("routes_to_contains") or []
        if routes:
            rt = fm.get("routes_to") or []
            for r in routes:
                if r not in rt:
                    _fail(result, f"routes_to missing '{r}'")

        body_needle = spec.get("body_contains")
        if body_needle and body_needle not in body:
            _fail(result, f"body missing '{body_needle}'")
    return out
170
+
171
+
172
def main() -> int:
    """Run all fixture checks, write the JSON report and print a summary.

    Returns 0 when no check failed, 1 when at least one failed, and 2 when
    the fixture file does not exist. An empty fixture, or one whose
    ``entries`` key is null, is treated as having no entries rather than
    crashing.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--fixture", default="tests/fixtures/projection_fidelity/fixtures.yml")
    ap.add_argument("--report", default="agents/reports/projection-fidelity.json")
    args = ap.parse_args()

    fixture_path = ROOT / args.fixture
    if not fixture_path.exists():
        print(f"❌ fixture not found: {fixture_path}", file=sys.stderr)
        return 2

    # safe_load returns None for an empty document.
    fixture = yaml.safe_load(fixture_path.read_text(encoding="utf-8")) or {}
    entries = fixture.get("entries") or []
    results = [check_entry(e) for e in entries]

    summary = {"pass": 0, "partial": 0, "fail": 0}
    for e in results:
        for r in e["results"].values():
            summary[r["status"]] += 1

    report = {"summary": summary, "entries": results}
    out = ROOT / args.report
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8")

    print(f"✅ Wrote {args.report}")
    print(f" pass={summary['pass']} partial={summary['partial']} fail={summary['fail']}")
    # Only non-passing checks are listed individually.
    for e in results:
        for tool, r in e["results"].items():
            if r["status"] != "pass":
                print(f" {r['status']:7s} {e['id']:40s} {tool:18s} {'; '.join(r['details'])}")
    return 0 if summary["fail"] == 0 else 1


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,185 @@
1
+ #!/usr/bin/env python3
2
+ """Quantitative skill-eval orchestrator (skill-writing § 7).
3
+
4
+ Scaffolds, aggregates, and reports sub-agent eval runs for a skill.
5
+
6
+ Sub-agent SPAWNING is per-environment (Claude Code, Augment Code,
7
+ council) and is left as a stub `_spawn_subagent(...)` that authors
8
+ implement once for their environment. The rest of the loop —
9
+ scaffold / aggregate / report — works out of the box and reads /
10
+ writes JSON files in `runs/`.
11
+
12
+ Layout per skill:
13
+
14
+ .agent-src.uncompressed/skills/{name}/evals/
15
+ evals.json
16
+ runs/ # gitignored
17
+ {timestamp}-baseline/{scenario_id}/output.txt
18
+ {timestamp}-baseline/{scenario_id}/grade.json
19
+ {timestamp}-with-skill/{scenario_id}/output.txt
20
+ {timestamp}-with-skill/{scenario_id}/grade.json
21
+ {timestamp}-benchmark.json
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import json
28
+ import sys
29
+ from datetime import datetime, timezone
30
+ from pathlib import Path
31
+ from typing import Any
32
+
33
# Repository root: this script lives one directory below it.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Directory holding one sub-directory per skill.
SKILLS_ROOT = REPO_ROOT.joinpath(".agent-src.uncompressed", "skills")
35
+
36
+
37
def _skill_dir(skill: str) -> Path:
    """Return the directory of *skill* under ``SKILLS_ROOT``.

    Exits the process with an error message when it is not a directory.
    """
    candidate = SKILLS_ROOT / skill
    if candidate.is_dir():
        return candidate
    sys.exit(f"error: skill {skill!r} not found at {candidate}")
42
+
43
+
44
def _evals_dir(skill: str) -> Path:
    """Return the ``evals`` directory inside the skill's own directory."""
    return _skill_dir(skill).joinpath("evals")
46
+
47
+
48
def _load_evals(skill: str) -> dict[str, Any]:
    """Load and parse the skill's ``evals/evals.json``.

    Exits the process with an error message when the file is absent.
    """
    spec_file = _evals_dir(skill) / "evals.json"
    if spec_file.exists():
        return json.loads(spec_file.read_text(encoding="utf-8"))
    sys.exit(f"error: {spec_file} not found — create it before scaffolding")
53
+
54
+
55
def _timestamp() -> str:
    """Return the current UTC time as ``YYYYMMDDTHHMMSSZ``."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.strftime("%Y%m%dT%H%M%SZ")
57
+
58
+
59
def _spawn_subagent(prompt: str, *, load_skill: str | None) -> dict[str, Any]:
    """STUB: to be implemented once per environment.

    An implementation must return a dict of the form
    ``{"output": str, "elapsed_s": float, "tokens_in": int, "tokens_out": int}``.
    ``load_skill=None`` means a baseline run; any other value names the
    skill to load into the sub-agent's context.

    Raises:
        NotImplementedError: always, until an implementation is provided.
    """
    message = (
        "implement _spawn_subagent for this environment (Claude Code, "
        "Augment, council, ...) — see docstring contract"
    )
    raise NotImplementedError(message)
70
+
71
+
72
def _grade_assertions(output: str, run_dir: Path, assertions: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Grade each assertion against a run and return one result per assertion.

    Supported kinds:
      - ``contains``: passes when ``value`` is a substring of *output*.
      - ``file_exists``: passes when ``path`` exists under *run_dir* or as
        given (resolved by the process, not by *run_dir*).
      - ``rubric``: not graded here; ``pass`` is ``None`` with a note.
    Any other kind is reported as a failure with an explanatory note.
    """

    def _grade(assertion: dict[str, Any]) -> dict[str, Any]:
        kind = assertion.get("kind")
        if kind == "contains":
            needle = assertion["value"]
            return {"kind": kind, "value": needle, "pass": needle in output}
        if kind == "file_exists":
            rel = assertion["path"]
            found = (run_dir / rel).exists() or Path(rel).exists()
            return {"kind": kind, "path": rel, "pass": found}
        if kind == "rubric":
            return {"kind": kind, "criterion": assertion["criterion"], "pass": None,
                    "note": "rubric grading requires sub-agent — fill in manually or via grader"}
        return {"kind": kind, "pass": False, "note": f"unknown assertion kind {kind!r}"}

    return [_grade(assertion) for assertion in assertions]
88
+
89
+
90
def cmd_scaffold(skill: str) -> int:
    """Create the run directories and ``meta.json`` files for a new eval run.

    For each arm (``baseline``, ``with-skill``) and each scenario in the
    skill's ``evals.json``, creates ``runs/{timestamp}-{arm}/{scenario_id}/``
    and writes the scenario's prompt and assertions to ``meta.json``.
    Exits with an error when there are no scenarios. Returns 0.
    """
    scenarios = _load_evals(skill).get("scenarios", [])
    if not scenarios:
        sys.exit("error: evals.json has no scenarios")

    stamp = _timestamp()
    runs_root = _evals_dir(skill) / "runs"
    for arm in ("baseline", "with-skill"):
        arm_root = runs_root / f"{stamp}-{arm}"
        for scenario in scenarios:
            target = arm_root / scenario["id"]
            target.mkdir(parents=True, exist_ok=True)
            meta = {
                "skill": skill,
                "arm": arm,
                "scenario_id": scenario["id"],
                "prompt": scenario["prompt"],
                "assertions": scenario.get("assertions", []),
                "timestamp": stamp,
            }
            (target / "meta.json").write_text(
                json.dumps(meta, indent=2) + "\n", encoding="utf-8"
            )

    print(f"scaffolded {len(scenarios)} scenarios × 2 arms at runs/{stamp}-{{baseline,with-skill}}/")
    print(f"timestamp: {stamp}")
    return 0
109
+
110
+
111
def cmd_aggregate(skill: str, run: str) -> int:
    """Collect the ``grade.json`` files of one run into ``{run}-benchmark.json``.

    For every scenario in ``evals.json`` and each arm, reads
    ``runs/{run}-{arm}/{scenario_id}/grade.json``. An arm without a grade
    file is recorded as ``missing``. A scenario counts toward an arm's total
    only when it has at least one result and every result has
    ``pass is True``. Returns 0.

    The ``runs`` directory is created if needed, so aggregating before any
    scaffold still writes a benchmark (with all arms ``missing``) instead of
    failing with ``FileNotFoundError``.
    """
    runs = _evals_dir(skill) / "runs"
    spec = _load_evals(skill)
    bench: dict[str, Any] = {"skill": skill, "run": run, "generated_at": _timestamp(), "scenarios": []}
    totals = {"baseline_pass": 0, "with_skill_pass": 0, "scenarios": 0}
    # Arm name -> key in `totals` that counts fully passing scenarios.
    total_key = {"baseline": "baseline_pass", "with-skill": "with_skill_pass"}

    for sc in spec.get("scenarios", []):
        row: dict[str, Any] = {"id": sc["id"], "arms": {}}
        for arm, key in total_key.items():
            grade_f = runs / f"{run}-{arm}" / sc["id"] / "grade.json"
            if not grade_f.exists():
                row["arms"][arm] = {"status": "missing", "pass_count": 0, "total": 0}
                continue
            g = json.loads(grade_f.read_text(encoding="utf-8"))
            results = g.get("results", [])
            passed = sum(1 for r in results if r.get("pass") is True)
            row["arms"][arm] = {"status": "graded", "pass_count": passed, "total": len(results),
                                "elapsed_s": g.get("elapsed_s"), "tokens_in": g.get("tokens_in"),
                                "tokens_out": g.get("tokens_out")}
            if results and passed == len(results):
                totals[key] += 1
        bench["scenarios"].append(row)
        totals["scenarios"] += 1

    bench["totals"] = totals
    out = runs / f"{run}-benchmark.json"
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(bench, indent=2) + "\n", encoding="utf-8")
    print(f"wrote {out.relative_to(REPO_ROOT)}")
    print(f" baseline pass: {totals['baseline_pass']}/{totals['scenarios']}")
    print(f" with-skill pass: {totals['with_skill_pass']}/{totals['scenarios']}")
    return 0
143
+
144
+
145
def cmd_report(skill: str, run: str) -> int:
    """Print a Markdown report for a previously aggregated run.

    Reads ``runs/{run}-benchmark.json`` (exits with an error when absent) and
    prints one table row per scenario with each arm's pass count and the
    with-skill minus baseline deltas for ``tokens_out`` and ``elapsed_s``.
    Missing or null metrics are treated as 0. Returns 0.
    """
    bench_f = _evals_dir(skill) / "runs" / f"{run}-benchmark.json"
    if not bench_f.exists():
        sys.exit(f"error: {bench_f} not found — run aggregate first")
    bench = json.loads(bench_f.read_text(encoding="utf-8"))

    for heading in (
        f"# Skill eval report — {skill} @ {run}\n",
        "| Scenario | Baseline | With skill | Δ tokens_out | Δ elapsed_s |",
        "|---|---|---|---|---|",
    ):
        print(heading)

    for scenario in bench["scenarios"]:
        base_arm = scenario["arms"].get("baseline", {})
        skill_arm = scenario["arms"].get("with-skill", {})
        base_score = f"{base_arm.get('pass_count', 0)}/{base_arm.get('total', 0)}"
        skill_score = f"{skill_arm.get('pass_count', 0)}/{skill_arm.get('total', 0)}"
        token_delta = (skill_arm.get("tokens_out") or 0) - (base_arm.get("tokens_out") or 0)
        time_delta = (skill_arm.get("elapsed_s") or 0) - (base_arm.get("elapsed_s") or 0)
        print(f"| {scenario['id']} | {base_score} | {skill_score} | {token_delta:+d} | {time_delta:+.2f} |")

    t = bench["totals"]
    print(f"\n**Totals:** baseline {t['baseline_pass']}/{t['scenarios']} · with-skill {t['with_skill_pass']}/{t['scenarios']}")
    return 0
164
+
165
+
166
def main() -> int:
    """Dispatch the ``scaffold``, ``aggregate`` and ``report`` sub-commands.

    Every sub-command takes the skill name; ``aggregate`` and ``report``
    also require ``--run``. Returns the selected command's exit status.
    """
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    commands = parser.add_subparsers(dest="cmd", required=True)
    for name in ("scaffold", "aggregate", "report"):
        command = commands.add_parser(name)
        command.add_argument("skill")
        if name == "scaffold":
            continue
        command.add_argument("--run", required=True, help="run timestamp (from scaffold output)")
    args = parser.parse_args()

    if args.cmd == "scaffold":
        return cmd_scaffold(args.skill)
    handler = {"aggregate": cmd_aggregate, "report": cmd_report}.get(args.cmd)
    if handler is None:
        return 1
    return handler(args.skill, args.run)


if __name__ == "__main__":
    sys.exit(main())
@@ -47,6 +47,10 @@
47
47
  "enum": ["senior"],
48
48
  "description": "Optional tier marker. `senior` opts the skill into the Senior-Tier Required Structure check (Context-First lead, Related Skills, Proactive Triggers, Output Artifacts) per .agent-src.uncompressed/rules/skill-quality.md."
49
49
  },
50
+ "meta_skill": {
51
+ "type": "boolean",
52
+ "description": "Opt-out of the linter's `skill_too_large` warn for skills whose purpose IS breadth (skill-writing, agent-docs-writing, skill-reviewer). Meta-skills inherently bundle multiple procedures and inline examples. Use sparingly — every meta_skill: true is a load-on-context trade-off."
53
+ },
50
54
  "external_source": {
51
55
  "type": "string",
52
56
  "format": "uri",