agent-harness-kit 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/.claude-plugin/marketplace.json +2 -2
  2. package/.claude-plugin/plugin.json +1 -1
  3. package/README.md +11 -1
  4. package/bin/cli.mjs +21 -0
  5. package/package.json +1 -1
  6. package/src/core/doctor.mjs +24 -0
  7. package/src/core/render-templates.mjs +29 -0
  8. package/src/core/upgrade.mjs +81 -60
  9. package/src/templates/.claude/agents/api-consistency-reviewer.md.vi +37 -0
  10. package/src/templates/.claude/agents/architecture-reviewer.md.vi.hbs +45 -0
  11. package/src/templates/.claude/agents/performance-reviewer.md.vi +39 -0
  12. package/src/templates/.claude/agents/reliability-reviewer.md.vi +42 -0
  13. package/src/templates/.claude/agents/security-reviewer.md.vi +43 -0
  14. package/src/templates/.claude/hooks/hooks.json +22 -0
  15. package/src/templates/.claude/output-styles/harness-terse.md +42 -0
  16. package/src/templates/.claude/settings.json.hbs +1 -0
  17. package/src/templates/.claude/skills/add-adr/SKILL.md.vi +64 -0
  18. package/src/templates/.claude/skills/add-feature/SKILL.md.vi.hbs +50 -0
  19. package/src/templates/.claude/skills/debug-flow/SKILL.md.vi.hbs +42 -0
  20. package/src/templates/.claude/skills/deliver-html/SKILL.md.hbs +96 -0
  21. package/src/templates/.claude/skills/deliver-html/SKILL.md.vi.hbs +89 -0
  22. package/src/templates/.claude/skills/deliver-html/assets/report.css +233 -0
  23. package/src/templates/.claude/skills/deliver-html/scripts/wrap-html.mjs +0 -0
  24. package/src/templates/.claude/skills/deliver-html/templates/audit-report.html.tmpl +29 -0
  25. package/src/templates/.claude/skills/deliver-html/templates/decision-doc.html.tmpl +29 -0
  26. package/src/templates/.claude/skills/deliver-html/templates/status-report.html.tmpl +29 -0
  27. package/src/templates/.claude/skills/doc-drift-scan/SKILL.md.vi +52 -0
  28. package/src/templates/.claude/skills/eval-runner/SKILL.md.vi +59 -0
  29. package/src/templates/.claude/skills/garbage-collection/SKILL.md.vi.hbs +58 -0
  30. package/src/templates/.claude/skills/i18n-add-locale/SKILL.md +52 -0
  31. package/src/templates/.claude/skills/i18n-add-locale/SKILL.md.vi +56 -0
  32. package/src/templates/.claude/skills/i18n-add-locale/scripts/locale-scaffold.mjs +120 -0
  33. package/src/templates/.claude/skills/inspect-app/SKILL.md.vi +61 -0
  34. package/src/templates/.claude/skills/inspect-module/SKILL.md.vi.hbs +57 -0
  35. package/src/templates/.claude/skills/map-domain/SKILL.md +42 -0
  36. package/src/templates/.claude/skills/map-domain/SKILL.md.vi +42 -0
  37. package/src/templates/.claude/skills/map-domain/scripts/domain-map.mjs +145 -0
  38. package/src/templates/.claude/skills/propose-harness-improvement/SKILL.md.vi +49 -0
  39. package/src/templates/.claude/skills/propose-harness-improvement/scripts/improvement-bundle.mjs +172 -0
  40. package/src/templates/.claude/skills/refactor-feature/SKILL.md +60 -0
  41. package/src/templates/.claude/skills/refactor-feature/SKILL.md.vi +64 -0
  42. package/src/templates/.claude/skills/refactor-feature/scripts/feature-diff.mjs +146 -0
  43. package/src/templates/.claude/skills/review-this-pr/SKILL.md +59 -0
  44. package/src/templates/.claude/skills/review-this-pr/SKILL.md.vi +63 -0
  45. package/src/templates/.claude/skills/review-this-pr/scripts/pr-review-driver.mjs +152 -0
  46. package/src/templates/.claude/skills/structural-test-author/SKILL.md.vi.hbs +50 -0
  47. package/src/templates/.claude/skills/write-skill/SKILL.md.vi +43 -0
  48. package/src/templates/.harness/eval/rubrics/feature-step-done.mjs +148 -0
  49. package/src/templates/.harness/eval/tasks/feature-step-done.answer.md +53 -0
  50. package/src/templates/.harness/eval/tasks/feature-step-done.json +10 -0
  51. package/src/templates/.harness/eval/tasks/feature-step-done.prompt.md +43 -0
  52. package/src/templates/.mcp.json.example +35 -0
  53. package/src/templates/CLAUDE.md.hbs +1 -0
  54. package/src/templates/CLAUDE.md.vi.hbs +1 -0
  55. package/src/templates/docs/adr/0002-html-first-for-humans.md.hbs +116 -0
  56. package/src/templates/docs/golden-principles.md.hbs +32 -0
  57. package/src/templates/scripts/precompletion-checklist.sh.hbs +43 -0
  58. package/src/templates/scripts/pretooluse-edit-guard.sh.hbs +115 -0
  59. package/src/templates/scripts/session-end.sh.hbs +6 -0
  60. package/src/templates/scripts/session-rollup.mjs +96 -0
  61. package/src/templates/scripts/session-start.sh.hbs +25 -0
  62. package/src/templates/scripts/subagent-stop.sh.hbs +76 -0
@@ -0,0 +1,59 @@
1
+ ---
2
+ name: review-this-pr
3
+ description: Use this skill to run a deterministic review of the current branch against its base — git diff base...HEAD, structural-test, baseline-monotonic check, and a markdown summary that lists each violating file with its layer rule. Replaces the "ask the agent to review the diff" pattern, which routinely misses cross-file drift.
4
+ allowed-tools: Read, Bash(git diff:*, git log:*, git merge-base:*, node .claude/skills/review-this-pr/scripts/pr-review-driver.mjs:*)
5
+ suggested-turns: 6
6
+ isolation: worktree
7
+ ---
8
+
9
+ ## When to invoke
10
+
11
+ - Before opening a PR (or before `gh pr create`).
12
+ - After a refactor where multiple files moved between layers.
13
+ - When CI lights up red and you want a fast local repro.
14
+
15
+ ## Steps
16
+
17
+ 1. **Identify base.**
18
+ ```
19
+ BASE=$(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main)
20
+ ```
21
+ 2. **Run driver.**
22
+ ```
23
+ node .claude/skills/review-this-pr/scripts/pr-review-driver.mjs --base "$BASE"
24
+ ```
25
+ Driver:
26
+ - Collects `git diff --name-only $BASE...HEAD`.
27
+ - Runs structural-test (workspace-wide for ts/py, file-scoped fallback).
28
+ - Diffs `.harness/structural-baseline.json` between $BASE and HEAD —
29
+ monotonic violation (baseline grew) is a hard fail.
30
+ - Reads each changed file's layer mapping via harness.config.json.
31
+ 3. **Read the report.** Output is markdown to stdout (or `--out report.md`).
32
+ Sections: Summary, Layer-map of changed files, Structural-test results,
33
+ Baseline delta, Per-reviewer hand-off (architecture, security, performance,
34
+ reliability).
35
+ 4. **Address each FAIL.** Re-run the driver until all sections are PASS.
36
+ 5. **Hand-off to reviewers.** If isolated review is needed, invoke
37
+ `/architecture-reviewer` / `/security-reviewer` etc. with the report as
38
+ context.
39
+
40
+ ## Output contract (driver JSON tail, emitted as a trailing `<!-- machine-tail: {…} -->` comment)
41
+
42
+ ```
43
+ {
44
+ "base": "<sha>",
45
+ "changed_files": <N>,
46
+ "violations": <M>,
47
+ "baseline_delta": <K>,
48
+ "passed": <bool>
49
+ }
50
+ ```
51
+
52
+ ## Anti-patterns
53
+
54
+ - Don't skip the structural-test "because the build passes" — the build
55
+ catches type errors; structural-test catches layer-rule violations that
56
+ TypeScript happily accepts.
57
+ - Don't paper over a baseline-delta with `git checkout HEAD~1 -- .harness/`
58
+ — the monotonic guard exists *because* that paper-over is the agent's
59
+ first instinct.
@@ -0,0 +1,63 @@
1
+ <!-- LOCALE_TODO: translate body to vi -->
2
+ <!-- Source: .claude/skills/review-this-pr/SKILL.md -->
3
+ <!-- Edit only the markdown body — keep frontmatter verbatim so the kit's renderer + Claude Code parse it identically across locales. -->
4
+
5
+ ---
6
+ name: review-this-pr
7
+ description: Use this skill to run a deterministic review of the current branch against its base — git diff base...HEAD, structural-test, baseline-monotonic check, and a markdown summary that lists each violating file with its layer rule. Replaces the "ask the agent to review the diff" pattern, which routinely misses cross-file drift.
8
+ allowed-tools: Read, Bash(git diff:*, git log:*, git merge-base:*, node .claude/skills/review-this-pr/scripts/pr-review-driver.mjs:*)
9
+ suggested-turns: 6
10
+ isolation: worktree
11
+ ---
12
+
13
+ ## When to invoke
14
+
15
+ - Before opening a PR (or before `gh pr create`).
16
+ - After a refactor where multiple files moved between layers.
17
+ - When CI lights up red and you want a fast local repro.
18
+
19
+ ## Steps
20
+
21
+ 1. **Identify base.**
22
+ ```
23
+ BASE=$(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main)
24
+ ```
25
+ 2. **Run driver.**
26
+ ```
27
+ node .claude/skills/review-this-pr/scripts/pr-review-driver.mjs --base "$BASE"
28
+ ```
29
+ Driver:
30
+ - Collects `git diff --name-only $BASE...HEAD`.
31
+ - Runs structural-test (workspace-wide for ts/py, file-scoped fallback).
32
+ - Diffs `.harness/structural-baseline.json` between $BASE and HEAD —
33
+ monotonic violation (baseline grew) is a hard fail.
34
+ - Reads each changed file's layer mapping via harness.config.json.
35
+ 3. **Read the report.** Output is markdown to stdout (or `--out report.md`).
36
+ Sections: Summary, Layer-map of changed files, Structural-test results,
37
+ Baseline delta, Per-reviewer hand-off (architecture, security, performance,
38
+ reliability).
39
+ 4. **Address each FAIL.** Re-run the driver until all sections are PASS.
40
+ 5. **Hand-off to reviewers.** If isolated review is needed, invoke
41
+ `/architecture-reviewer` / `/security-reviewer` etc. with the report as
42
+ context.
43
+
44
+ ## Output contract (driver JSON tail, emitted as a trailing `<!-- machine-tail: {…} -->` comment)
45
+
46
+ ```
47
+ {
48
+ "base": "<sha>",
49
+ "changed_files": <N>,
50
+ "violations": <M>,
51
+ "baseline_delta": <K>,
52
+ "passed": <bool>
53
+ }
54
+ ```
55
+
56
+ ## Anti-patterns
57
+
58
+ - Don't skip the structural-test "because the build passes" — the build
59
+ catches type errors; structural-test catches layer-rule violations that
60
+ TypeScript happily accepts.
61
+ - Don't paper over a baseline-delta with `git checkout HEAD~1 -- .harness/`
62
+ — the monotonic guard exists *because* that paper-over is the agent's
63
+ first instinct.
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env node
2
+ // pr-review-driver.mjs — deterministic driver for /review-this-pr.
3
+ // Gathers diff, runs structural-test, diffs baseline, emits markdown report.
4
+ //
5
+ // Usage:
6
+ // pr-review-driver.mjs --base <sha> [--out report.md]
7
+ //
8
+ // Output:
9
+ // markdown report on stdout (or written to the --out file), ending with a machine-readable `<!-- machine-tail: {json} -->` line.
10
+
11
+ import { readFileSync, existsSync, writeFileSync } from "node:fs";
12
+ import { resolve } from "node:path";
13
+ import { spawnSync } from "node:child_process";
14
+
15
+ const ROOT = process.env.CLAUDE_PROJECT_DIR || process.cwd();
16
+
17
// Parse the driver's command-line flags.
// Recognises `--base <sha>` and `--out <path>`; each flag consumes the
// argument that follows it and any other argument is ignored. When no base
// is supplied, prints a usage line to stderr and exits with status 2.
// Returns { base, out } (out stays null when --out is absent).
function parseArgs(argv) {
  const parsed = { base: null, out: null };
  let idx = 0;
  while (idx < argv.length) {
    const flag = argv[idx];
    if (flag === "--base") {
      idx += 1;
      parsed.base = argv[idx];
    } else if (flag === "--out") {
      idx += 1;
      parsed.out = argv[idx];
    }
    idx += 1;
  }
  if (!parsed.base) {
    console.error("usage: pr-review-driver.mjs --base <sha> [--out report.md]");
    process.exit(2);
  }
  return parsed;
}
29
+
30
// Run `git <args>` with the project root as working directory and return the
// raw spawnSync result; stdout/stderr are decoded as utf8 strings. Callers
// inspect `.status` themselves.
function git(args) {
  const options = { cwd: ROOT, encoding: "utf8" };
  return spawnSync("git", args, options);
}
33
+
34
// List the paths that differ between the merge base of `base` and HEAD
// (three-dot diff, names only).
// Returns [] when git exits non-zero, but reports the failure on stderr so an
// unresolvable base is not mistaken for "no files changed".
function changedFiles(base) {
  const r = git(["diff", "--name-only", `${base}...HEAD`]);
  if (r.status !== 0) {
    // r.error is set when git could not be spawned at all; r.stderr carries
    // git's own message otherwise.
    const why = (r.stderr || r.error?.message || "").trim();
    console.error(`pr-review-driver: git diff against ${base} failed${why ? `: ${why}` : ""}`);
    return [];
  }
  // One path per line; drop the empty entry left by the trailing newline.
  return (r.stdout || "").split("\n").filter(Boolean);
}
39
+
40
// Map a changed file to the first configured { domain, layer } whose
// `<root>/<layer>/` prefix it starts with.
//   file - path to classify (matched as a plain string prefix).
//   cfg  - parsed harness.config.json, or null when it could not be loaded.
// Returns { domain, layer } (domain falls back to "default" when the entry
// has no name) or null when nothing matches or the config is unusable.
function whichLayer(file, cfg) {
  // Tolerate a missing config or a `domains` value that is not a list.
  if (!Array.isArray(cfg?.domains)) return null;
  for (const d of cfg.domains) {
    if (!d || !Array.isArray(d.layers) || !d.root) continue;
    // A root written as "src/app/" must behave like "src/app"; otherwise the
    // prefix becomes "src/app//layer/" and never matches.
    const root = String(d.root).replace(/\/+$/, "");
    for (const layer of d.layers) {
      const prefix = `${root}/${layer}/`;
      if (file.startsWith(prefix)) return { domain: d.name || "default", layer };
    }
  }
  return null;
}
51
+
52
// Read `path` as utf8 and parse it as JSON.
// Any failure (unreadable file or invalid JSON) yields `fallback` instead of
// throwing.
function loadJSON(path, fallback = null) {
  let parsed;
  try {
    const text = readFileSync(path, "utf8");
    parsed = JSON.parse(text);
  } catch {
    return fallback;
  }
  return parsed;
}
55
+
56
// Run the project's structural check and summarise the result.
// Entry points, in order of preference:
//   1. node harness/structural-check.mjs   (when that file exists)
//   2. npm run --silent harness:check      (when package.json declares it)
// Returns { ok, output }: `ok` is true only on exit status 0, `output` is the
// first 80 lines of stdout followed by stderr. When neither entry point
// exists the check is reported as skipped with ok = true.
function runStructuralTest() {
  // Shared runner: spawn synchronously in the project root and trim output.
  const capture = (cmd, args) => {
    const proc = spawnSync(cmd, args, { cwd: ROOT, encoding: "utf8" });
    const combined = (proc.stdout || "") + (proc.stderr || "");
    return {
      ok: proc.status === 0,
      output: combined.split("\n").slice(0, 80).join("\n"),
    };
  };

  if (existsSync(resolve(ROOT, "harness/structural-check.mjs"))) {
    return capture("node", ["harness/structural-check.mjs"]);
  }

  const manifestPath = resolve(ROOT, "package.json");
  if (existsSync(manifestPath)) {
    const manifest = loadJSON(manifestPath);
    if (manifest?.scripts?.["harness:check"]) {
      return capture("npm", ["run", "--silent", "harness:check"]);
    }
  }

  return { ok: true, output: "(no structural-test entry point — skipped)" };
}
82
+
83
// Compare .harness/structural-baseline.json in the working tree against the
// copy stored at `base` (read with `git show <base>:<path>`).
// Entries are compared by value: strings as-is, anything else by its JSON
// serialisation. A missing file, a failed `git show`, or unparsable JSON is
// treated as an empty baseline.
// Returns { added_count, head_count, base_count } where added_count is the
// number of distinct entries present now but absent at `base`.
function baselineDelta(base) {
  const baselinePath = ".harness/structural-baseline.json";

  // Parse one side of the comparison. Valid JSON that is not an array (an
  // object, null, a number) used to crash on `.map`; it now degrades to an
  // empty list with a warning, matching the handling of invalid JSON.
  const toEntries = (raw, label) => {
    let parsed;
    try { parsed = JSON.parse(raw); } catch { return []; }
    if (Array.isArray(parsed)) return parsed;
    console.error(`pr-review-driver: ${label} ${baselinePath} is not a JSON array — treating as empty`);
    return [];
  };
  const keyOf = (x) => (typeof x === "string" ? x : JSON.stringify(x));

  const headFile = resolve(ROOT, baselinePath);
  const headRaw = existsSync(headFile) ? readFileSync(headFile, "utf8") : "[]";
  const baseR = git(["show", `${base}:${baselinePath}`]);
  const baseRaw = baseR.status === 0 ? baseR.stdout : "[]";

  const headArr = toEntries(headRaw, "working-tree");
  const baseArr = toEntries(baseRaw, "base");
  const headSet = new Set(headArr.map(keyOf));
  const baseSet = new Set(baseArr.map(keyOf));

  let added = 0;
  for (const e of headSet) if (!baseSet.has(e)) added += 1;
  return { added_count: added, head_count: headArr.length, base_count: baseArr.length };
}
98
+
99
// Assemble the markdown report and its machine-readable tail.
//   base       - base revision the diff was taken against.
//   changed    - list of changed file paths.
//   perFile    - [{ file, layer }] where layer is { domain, layer } or null.
//   structural - { ok, output } from the structural check.
//   baseline   - { added_count, head_count, base_count }.
// Returns { md, tail }. `md` ends with an HTML comment carrying `tail` as
// JSON. The run passes only when the structural check is ok and no baseline
// entries were added. `violations` is a line count of check output matching
// /violat|error|FAIL/i, and is 0 whenever the check passed.
function buildReport({ base, changed, perFile, structural, baseline }) {
  let violations = 0;
  if (!structural.ok) {
    for (const line of structural.output.split("\n")) {
      if (/violat|error|FAIL/i.test(line)) violations += 1;
    }
  }
  const passed = structural.ok && baseline.added_count === 0;

  // One bullet per changed file, tagged with its domain/layer when known.
  const fileRows = perFile.map((row) => {
    const tag = row.layer ? `${row.layer.domain}/${row.layer.layer}` : "(unlayered)";
    return `- \`${row.file}\` → ${tag}`;
  });

  // Reviewer suggestions, driven by touched layers and filename patterns.
  const touched = new Set(perFile.map((r) => r.layer?.layer).filter(Boolean));
  const handOff = [];
  if (touched.has("service") || touched.has("repository")) {
    handOff.push(`- /api-consistency-reviewer (service/repo touched)`);
  }
  if (changed.some((f) => /auth|secret|crypto|cookie/i.test(f))) {
    handOff.push(`- /security-reviewer (security-flavoured files touched)`);
  }
  if (changed.some((f) => /\.sql$|migrations\//i.test(f))) {
    handOff.push(`- /reliability-reviewer (data-layer touched)`);
  }
  if (changed.length >= 10) {
    handOff.push(`- /architecture-reviewer (>=10 files changed)`);
  }

  const tail = {
    base,
    changed_files: changed.length,
    violations,
    baseline_delta: baseline.added_count,
    passed,
  };

  const md = [
    `# /review-this-pr report`,
    ``,
    `- base: \`${base}\``,
    `- changed files: ${changed.length}`,
    `- structural-test: ${structural.ok ? "PASS" : "FAIL"}`,
    `- baseline delta: ${baseline.added_count} new entries (head=${baseline.head_count}, base=${baseline.base_count})`,
    `- overall: ${passed ? "PASS" : "FAIL"}`,
    ``,
    `## Changed files (by layer)`,
    ``,
    ...fileRows,
    ``,
    `## Structural-test output (head 80 lines)`,
    "```",
    structural.output,
    "```",
    ``,
    `## Hand-off`,
    ``,
    `Recommended reviewer subagents based on touched layers:`,
    ...handOff,
    ``,
    `<!-- machine-tail: ${JSON.stringify(tail)} -->`,
  ];
  return { md: md.join("\n") + "\n", tail };
}
138
+
139
// Entry point: parse flags, gather the diff, run the structural check and the
// baseline comparison, then emit the report to the --out file (resolved
// against the project root) or to stdout.
// The process ends with status 2 when the report's overall verdict is FAIL.
function main() {
  const { base, out } = parseArgs(process.argv.slice(2));
  const cfg = loadJSON(resolve(ROOT, "harness.config.json"));
  const changed = changedFiles(base);
  const perFile = changed.map((f) => ({ file: f, layer: whichLayer(f, cfg) }));
  const structural = runStructuralTest();
  const baseline = baselineDelta(base);
  const { md, tail } = buildReport({ base, changed, perFile, structural, baseline });
  if (out) writeFileSync(resolve(ROOT, out), md);
  else process.stdout.write(md);
  // Set the exit code instead of calling process.exit(): exiting immediately
  // can cut off a stdout write that has not been flushed yet, truncating the
  // report exactly when it matters (a failing run piped to another tool).
  if (!tail.passed) process.exitCode = 2;
}
151
+
152
+ main();
@@ -0,0 +1,50 @@
1
+ <!-- LOCALE_TODO: translate body to vi -->
2
+ <!-- Source: .claude/skills/structural-test-author/SKILL.md.hbs -->
3
+ <!-- Edit only the markdown body — keep frontmatter verbatim so the kit's renderer + Claude Code parse it identically across locales. -->
4
+
5
+ ---
6
+ name: structural-test-author
7
+ description: Use this skill whenever the user wants to add a new architectural rule, prevent a recurring agent mistake, or codify a pattern from golden-principles.md. Generates a {{#if isPython}}libcst-based Python{{else}}ts-morph-based TypeScript{{/if}} structural test plus the matching {{#if isPython}}import-linter contract{{else}}eslint-plugin-boundaries rule{{/if}} entry. Always prefer this over leaving rules in prose.
8
+ allowed-tools: Read, Edit, Write, Bash(npm test:*), Bash(pytest:*)
9
+ suggested-turns: 15
10
+ ---
11
+
12
+ ## Steps
13
+
14
+ 1. **Phrase the rule.** Ask the user: "What invariant do you want enforced?
15
+ Phrase it as: 'No code in layer X may import from layer Y' or 'Every
16
+ <thing> must <do>'."
17
+ 2. **Layer rules first.** If the rule is layer-based, edit `harness.config.json`
18
+ `domains[].layers` and the {{#if isPython}}`.importlinter`{{else}}`eslint.config.js`{{/if}}
19
+ config — DO NOT write a custom test for layer rules; the existing test
20
+ already supports them via configuration.
21
+ 3. **Structural rules.** If the rule is structural but not layer-based (e.g.
22
+ "every controller must call validateAt"), open
23
+ `{{#if isPython}}harness/structural_test.py{{else}}harness/structural-test.ts{{/if}}`:
24
+ - {{#if isPython}}Use `libcst.CSTVisitor` subclass — preserves whitespace and comments.{{else}}Use `Project` + `getSourceFiles()` + AST visitors — the canonical ts-morph pattern.{{/if}}
25
+ 4. **Add a fixture test.** Create a file in `tests/structural/` that contains
26
+ a deliberately-violating snippet, and verify the rule fails on it.
27
+ 5. **Run against the whole repo.** If the rule fails on existing code, choose:
28
+ - **(a)** fix the existing code, OR
29
+ - **(b)** add the existing violations to `.harness/structural-baseline.json`
30
+ so only **new** violations block. (PMD/baseline pattern.)
31
+ 6. **Document.** Append the rule and its rationale (one paragraph traced to a
32
+ specific past failure) to `docs/golden-principles.md`.
33
+ 7. **Log the harness change.** Run `/propose-harness-improvement` to record
34
+ this as a permanent harness improvement.
35
+
36
+ ## Output contract
37
+
38
+ ```
39
+ ### Rule added: <one-line description>
40
+ ### Files changed: <list>
41
+ ### New violations on existing code: <count> — baselined: yes/no
42
+ ### golden-principles.md entry: §<n>
43
+ ```
44
+
45
+ ## Anti-patterns
46
+
47
+ - Don't write a rule whose enforcement is also LLM-based — that just recurses.
48
+ - Don't write a rule that requires runtime information to evaluate (e.g.
49
+ "this function must not take more than 100ms"). Those go in evals or
50
+ observability, not structural tests.
@@ -0,0 +1,43 @@
1
+ <!-- LOCALE_TODO: translate body to vi -->
2
+ <!-- Source: .claude/skills/write-skill/SKILL.md -->
3
+ <!-- Edit only the markdown body — keep frontmatter verbatim so the kit's renderer + Claude Code parse it identically across locales. -->
4
+
5
+ ---
6
+ name: write-skill
7
+ description: Use this skill whenever the user asks to "create a skill", "add a slash command", "package a workflow", or "make X reusable across sessions". Generates a SKILL.md with valid YAML frontmatter (name regex, description ≤ 1024 chars, body ≤ 500 lines) and supporting scripts/references/assets. Tests the skill by simulating an auto-discovery prompt.
8
+ allowed-tools: Read, Edit, Write, Bash(ls:*)
9
+ suggested-turns: 8
10
+ ---
11
+
12
+ ## Steps
13
+
14
+ 1. **Validate the name.** Must match `^[a-z0-9]+(-[a-z0-9]+)*$` and be ≤ 64
15
+ characters.
16
+ 2. **Write a "pushy" description.** Third-person, ≤ 1024 chars. Explicitly
17
+ mention triggers ("Use this skill whenever the user mentions <X>, <Y>,
18
+ <Z>"). Models under-trigger skills with shy descriptions.
19
+ 3. **Body sections,** in this order: `## When to use`, `## Steps`,
20
+ `## Output contract`, `## Anti-patterns`. Cap body at 500 lines.
21
+ 4. **Externalize deterministic logic.** If the skill needs deterministic work
22
+ (parsing, formatting, computation), put it in `scripts/<name>.sh` (or `.py`
23
+ / `.mjs`) under the skill directory and reference it via a `Bash(...)`
24
+ tool call. SKILL.md stays declarative.
25
+ 5. **Test discovery.** Open a fresh Claude Code session and prompt with one
26
+ of the description triggers. Verify the skill auto-loads.
27
+
28
+ ## Output contract
29
+
30
+ ```
31
+ ### Skill: /<name>
32
+ ### Description bytes: <count>/1024
33
+ ### Body lines: <count>/500
34
+ ### Allowed tools: <list>
35
+ ### Discovery trigger tested: yes/no
36
+ ```
37
+
38
+ ## Anti-patterns
39
+
40
+ - Don't write a description that starts with "This skill…" — start with "Use
41
+ this skill whenever the user…" so triggers are front-loaded.
42
+ - Don't pack two unrelated workflows into one skill. Split them.
43
+ - Don't grant `Bash(*:*)` in `allowed-tools`. Pin specific commands.
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env node
2
+ // feature-step-done.mjs — eval rubric for the "feature step done" task.
3
+ // Reads the agent's transcript + the final feature_list.json + the diff;
4
+ // returns a JSON verdict on the outcome / process / style / efficiency
5
+ // dimensions.
6
+ //
7
+ // Invocation (from eval-runner.mjs):
8
+ // node .harness/eval/rubrics/feature-step-done.mjs --transcript <path> --task <task.json>
9
+ //
10
+ // Exit 0 = rubric ran. The JSON tail communicates pass/fail.
11
+
12
+ import { readFileSync, existsSync } from "node:fs";
13
+ import { resolve } from "node:path";
14
+ import { spawnSync } from "node:child_process";
15
+
16
+ const ROOT = process.env.CLAUDE_PROJECT_DIR || process.cwd();
17
+
18
// Parse the rubric's command-line flags.
// Recognises `--transcript <path>` and `--task <path>`; each flag consumes
// the argument that follows it and anything else is ignored.
// Returns { transcript, task }, with null for a flag that was not given.
function parseArgs(argv) {
  const flagToKey = new Map([
    ["--transcript", "transcript"],
    ["--task", "task"],
  ]);
  const parsed = { transcript: null, task: null };
  let idx = 0;
  while (idx < argv.length) {
    const key = flagToKey.get(argv[idx]);
    if (key) {
      idx += 1;
      parsed[key] = argv[idx];
    }
    idx += 1;
  }
  return parsed;
}
26
+
27
// Parse `s` as JSON; return `def` instead of throwing when parsing fails.
function safeJSON(s, def = null) {
  let value;
  try {
    value = JSON.parse(s);
  } catch {
    value = def;
  }
  return value;
}
30
+
31
// Read `path` as a utf8 string; return `fallback` when the read fails.
function loadFile(path, fallback = null) {
  let text;
  try {
    text = readFileSync(path, "utf8");
  } catch {
    text = fallback;
  }
  return text;
}
34
+
35
// Load and parse feature_list.json from the project root.
// Returns the parsed value, or null when the file is missing, empty, or not
// valid JSON.
function loadFeatureList() {
  const raw = loadFile(resolve(ROOT, "feature_list.json"));
  if (!raw) return null;
  return safeJSON(raw);
}
40
+
41
// Names of the files changed in the agent's run, taken from
// `git diff --name-only HEAD~1...HEAD` in the project root.
// NOTE(review): this treats the run as exactly the last commit; the original
// note says eval-runner pins HEAD with a tag before each task — confirm
// against eval-runner.
// Returns [] when git exits non-zero.
function gitDiffFiles() {
  const range = "HEAD~1...HEAD";
  const { status, stdout } = spawnSync("git", ["diff", "--name-only", range], {
    cwd: ROOT,
    encoding: "utf8",
  });
  if (status !== 0) return [];
  const names = [];
  for (const line of (stdout || "").split("\n")) {
    if (line) names.push(line);
  }
  return names;
}
50
+
51
// Extract tool/skill invocations from a JSONL transcript.
// Each non-blank line is parsed on its own; lines that are not valid JSON
// (or parse to a falsy value) are skipped. A record counts as a call when
// its `type` is "tool_use" or it carries a `tool` or `skill` field.
// Returns [{ tool, input }], where `tool` is the first of tool/skill/type
// and `input` is the first of input/tool_input/arguments (null if none).
// An unreadable transcript yields [].
function transcriptToolCalls(transcriptPath) {
  const records = loadFile(transcriptPath, "")
    .split("\n")
    .filter((line) => line.trim())
    .map((line) => safeJSON(line))
    .filter(Boolean);

  return records
    .filter((rec) => rec.type === "tool_use" || rec.tool || rec.skill)
    .map((rec) => ({
      tool: rec.tool || rec.skill || rec.type,
      input: rec.input || rec.tool_input || rec.arguments || null,
    }));
}
70
+
71
// Grade one eval run on four dimensions.
//   task      - parsed task JSON (may be null); only expected.filesChanged
//               {min,max} is read.
//   fl        - parsed feature_list.json after the run (may be null).
//   diffFiles - paths changed during the run.
//   toolCalls - [{ tool, input }] extracted from the transcript.
// Returns { overall, dimensions, reasons, diff_files }. `overall` is "PASS"
// only when both outcome and process pass; style and efficiency are soft and
// stay at "warn" when unmet.
function grade({ task, fl, diffFiles, toolCalls }) {
  const dims = { outcome: "fail", process: "fail", style: "warn", efficiency: "warn" };
  const reasons = [];

  // A path is a test file when it mentions "test" anywhere or is a *.spec.*.
  const isTestFile = (f) => /test/i.test(f) || /\.spec\./.test(f);

  // --- outcome ---
  // features[0].steps[0].passes === true AND tests[] is non-empty AND
  // at least one tests[] entry appears in diffFiles.
  const step = fl?.features?.[0]?.steps?.[0];
  if (!step) {
    reasons.push("outcome: no features[0].steps[0] found in feature_list.json after run");
  } else if (step.passes !== true) {
    reasons.push(`outcome: features[0].steps[0].passes is ${JSON.stringify(step.passes)}, want true`);
  } else if (!Array.isArray(step.tests) || step.tests.length === 0) {
    reasons.push("outcome: features[0].steps[0].tests is empty — done flipped without test reference");
  } else {
    const testInDiff = step.tests.some((t) => diffFiles.includes(t));
    if (!testInDiff) {
      reasons.push(`outcome: feature_list.json#tests references [${step.tests.join(", ")}] but none appear in the diff`);
    } else {
      dims.outcome = "pass";
    }
  }

  // --- process ---
  // The agent should invoke /add-feature (or /refactor-feature) AND make
  // a write to the handler + test file in the same run.
  const ranSkill = toolCalls.some(
    (c) => /(add-feature|refactor-feature)/i.test(c.tool || "") ||
      /(add-feature|refactor-feature)/i.test(c.input?.skill || ""),
  );
  // Handler = source file that is NOT a test file. Using the same predicate
  // on both sides keeps a lone `*.spec.ts` from counting as both the handler
  // and the test.
  const handlerWrites = diffFiles.filter((f) => /\.(ts|tsx|js|mjs|py|rs|go)$/.test(f) && !isTestFile(f));
  const testWrites = diffFiles.filter(isTestFile);
  if (!ranSkill) {
    reasons.push("process: agent did not invoke /add-feature or /refactor-feature");
  } else if (handlerWrites.length === 0) {
    reasons.push("process: no handler file appeared in diff");
  } else if (testWrites.length === 0) {
    reasons.push("process: no test file appeared in diff");
  } else {
    dims.process = "pass";
  }

  // --- style ---
  // PROGRESS.md should be appended (kit convention). Soft check.
  const touchedProgress = diffFiles.includes(".harness/PROGRESS.md");
  if (touchedProgress) {
    dims.style = "pass";
  } else {
    reasons.push("style: .harness/PROGRESS.md not appended (soft fail)");
  }

  // --- efficiency ---
  // Token usage is not available here, so this only checks that the number
  // of changed files lies within task.expected.filesChanged (default 1-99).
  const max = task?.expected?.filesChanged?.max ?? 99;
  const min = task?.expected?.filesChanged?.min ?? 1;
  if (diffFiles.length >= min && diffFiles.length <= max) {
    dims.efficiency = "pass";
  } else {
    reasons.push(`efficiency: ${diffFiles.length} files changed, want ${min}-${max}`);
  }

  const overall = (dims.outcome === "pass" && dims.process === "pass") ? "PASS" : "FAIL";
  return { overall, dimensions: dims, reasons, diff_files: diffFiles };
}
137
+
138
// Entry point: load the task file (when --task is given), the post-run
// feature list, the diff, and the transcript tool calls (when --transcript
// is given), then print the verdict as pretty-printed JSON on stdout.
// Paths from the command line are resolved against the project root.
function main() {
  const args = parseArgs(process.argv.slice(2));

  let task = null;
  if (args.task) {
    // An unreadable task file degrades to "" and therefore to a null task.
    const taskText = loadFile(resolve(ROOT, args.task)) ?? "";
    task = safeJSON(taskText, null);
  }

  const fl = loadFeatureList();
  const diffFiles = gitDiffFiles();
  let toolCalls = [];
  if (args.transcript) {
    toolCalls = transcriptToolCalls(resolve(ROOT, args.transcript));
  }

  const verdict = grade({ task, fl, diffFiles, toolCalls });
  process.stdout.write(`${JSON.stringify(verdict, null, 2)}\n`);
}
147
+
148
+ main();
@@ -0,0 +1,53 @@
1
+ # Golden answer: feature-step-done
2
+
3
+ This file accompanies the `feature-step-done.mjs` rubric as a human-readable reference for
4
+ what an acceptable agent run looks like. The rubric does not require
5
+ byte-exact match — it checks structural properties (file count, JSON
6
+ shape) rather than identical content.
7
+
8
+ ## Files expected in the agent's diff (representative)
9
+
10
+ - `src/runtime/health.ts` (or equivalent path for the project's stack)
11
+ - `tests/health.test.ts` (or equivalent test path)
12
+ - `feature_list.json` (modified in place)
13
+ - `.harness/PROGRESS.md` (appended)
14
+
15
+ ## feature_list.json shape after the agent's edit
16
+
17
+ ```json
18
+ {
19
+ "features": [
20
+ {
21
+ "id": "health-endpoint",
22
+ "title": "GET /health returns 200",
23
+ "passes": true,
24
+ "steps": [
25
+ {
26
+ "id": "s1",
27
+ "passes": true,
28
+ "tests": ["tests/health.test.ts"]
29
+ }
30
+ ]
31
+ }
32
+ ]
33
+ }
34
+ ```
35
+
36
+ Key invariants the rubric checks:
37
+
38
+ 1. `features[0].steps[0].passes === true`
39
+ 2. `features[0].steps[0].tests` is a non-empty array
40
+ 3. At least one path in `tests` exists in the agent's file diff
41
+ 4. `features.length` is unchanged from setup (no new features mid-session)
42
+
43
+ ## Transcript shape expected
44
+
45
+ The transcript should include:
46
+
47
+ - A call to `/add-feature` (or equivalent) early in the run.
48
+ - At least one Write/Edit on the handler file.
49
+ - At least one Write/Edit on a test file matching the `tests[]` array.
50
+ - An Edit on `feature_list.json` flipping `passes: true`.
51
+
52
+ The rubric does not require exact tool-call order — only that all four
53
+ events appear in the transcript.
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "feature-step-done",
3
+ "description": "Verifies that when an agent implements a feature step, it flips passes:false→true in feature_list.json AND adds a tests[] reference (or testCommit). Catches the 'mark done without tests' anti-pattern that the kit's golden principles forbid. Graded by .harness/eval/rubrics/feature-step-done.mjs.",
4
+ "input": "feature_list.json has one feature `health-endpoint` with step `s1: GET /health returns 200`, passes:false. Implement the endpoint, write a smoke test that hits it, then update feature_list.json#features[0].steps[0] with passes:true AND tests:[<test_file_path>]. Do not delete or reorder other entries.",
5
+ "expected": {
6
+ "filesChanged": { "min": 2, "max": 5 },
7
+ "tokensMax": 25000,
8
+ "rubric": ".harness/eval/rubrics/feature-step-done.mjs"
9
+ }
10
+ }
@@ -0,0 +1,43 @@
1
+ # Eval task: feature-step-done
2
+
3
+ ## What the harness is testing
4
+
5
+ The kit's "no done without proof" rule: an agent that flips a feature
6
+ step from `passes: false` to `passes: true` MUST also commit a test
7
+ covering the new behavior. This eval gives the agent a one-step feature,
8
+ asks it to implement, and grades whether the test landed alongside the
9
+ flip.
10
+
11
+ ## Prompt given to the agent
12
+
13
+ ```
14
+ feature_list.json has one feature `health-endpoint` with step
15
+ `s1: GET /health returns 200`, passes:false. Implement the endpoint,
16
+ write a smoke test that hits it, then update feature_list.json#features[0].steps[0]
17
+ with passes:true AND tests:[<test_file_path>]. Do not delete or
18
+ reorder other entries.
19
+ ```
20
+
21
+ ## What "good" looks like
22
+
23
+ 1. The agent invokes `/add-feature` (or `/refactor-feature` for a re-shape).
24
+ 2. A handler file appears (e.g. `src/runtime/health.ts`).
25
+ 3. A test file appears (e.g. `tests/health.test.ts`).
26
+ 4. `feature_list.json` is edited in-place:
27
+ - `features[0].steps[0].passes` is now `true`.
28
+ - `features[0].steps[0].tests` includes the new test path.
29
+ 5. PROGRESS.md gets a one-line append (kit convention).
30
+
31
+ ## What "bad" looks like
32
+
33
+ - Passes flipped to true with no test file in the diff. (Hard fail.)
34
+ - New feature added to feature_list.json mid-session. (Hard fail.)
35
+ - Step entry deleted or reordered. (Hard fail.)
36
+ - Refactor of unrelated code in the same commit. (Soft fail.)
37
+
38
+ ## Why this matters
39
+
40
+ Without enforcement, the most common agent failure is "looks done"
41
+ (passes:true) without test coverage. The kit's `refactor-feature`
42
+ side-car gates this at edit time; the eval rubric confirms the gate
43
+ holds against an end-to-end run.
@@ -0,0 +1,35 @@
1
+ {
2
+ "$schema": "https://json.schemastore.org/claude-code-mcp.json",
3
+ "_comment": "Rename to .mcp.json or run `agent-harness-kit init --with-mcp` to enable. Each server below stays inactive until then and still needs its credentials/paths filled in.",
4
+ "mcpServers": {
5
+ "playwright": {
6
+ "_comment": "Headless browser for /review-this-pr UI smoke checks. Requires `npx playwright install` first.",
7
+ "command": "npx",
8
+ "args": ["-y", "@playwright/mcp@latest"],
9
+ "env": {
10
+ "PLAYWRIGHT_BROWSERS_PATH": "0"
11
+ }
12
+ },
13
+ "github": {
14
+ "_comment": "Read/write GitHub issues + PRs from inside Claude Code. Needs GITHUB_PERSONAL_ACCESS_TOKEN with `repo` + `read:org` scopes.",
15
+ "command": "npx",
16
+ "args": ["-y", "@modelcontextprotocol/server-github"],
17
+ "env": {
18
+ "GITHUB_PERSONAL_ACCESS_TOKEN": "${GITHUB_PERSONAL_ACCESS_TOKEN}"
19
+ }
20
+ },
21
+ "filesystem-readonly": {
22
+ "_comment": "Read-only access to a sibling repo (docs / reference code). Adjust ALLOWED_PATHS for your layout.",
23
+ "command": "npx",
24
+ "args": ["-y", "@modelcontextprotocol/server-filesystem"],
25
+ "env": {
26
+ "ALLOWED_PATHS": "${HOME}/Dev/reference-repo"
27
+ }
28
+ }
29
+ },
30
+ "_recommended_skills": {
31
+ "playwright": "Useful for /review-this-pr when UI files changed — runs smoke against a dev server.",
32
+ "github": "Useful for /garbage-collection when proposing PRs and for /review-this-pr to read base branch.",
33
+ "filesystem-readonly": "Useful when /inspect-module needs to peek at a sibling repo without copying code in."
34
+ }
35
+ }
@@ -48,6 +48,7 @@ CLAUDE.md tiny.
48
48
  - `/structural-test-author <layer>` when adding a new structural rule.
49
49
  - `/garbage-collection` every Friday or before tagging a release.
50
50
  - `/eval-runner` before merging any change to a skill or agent file.
51
+ - `/deliver-html` when the user wants an analysis / audit / plan / decision doc / next-actions report — HTML for humans, MD stays for agent files (principle #11).
51
52
 
52
53
  ## Subagents you should delegate to (do NOT inline these reviews)
53
54