agent-harness-kit 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/.claude-plugin/marketplace.json +2 -2
  2. package/.claude-plugin/plugin.json +1 -1
  3. package/bin/cli.mjs +21 -0
  4. package/package.json +1 -1
  5. package/src/core/doctor.mjs +24 -0
  6. package/src/core/render-templates.mjs +29 -0
  7. package/src/core/upgrade.mjs +81 -60
  8. package/src/templates/.claude/agents/api-consistency-reviewer.md.vi +37 -0
  9. package/src/templates/.claude/agents/architecture-reviewer.md.vi.hbs +45 -0
  10. package/src/templates/.claude/agents/performance-reviewer.md.vi +39 -0
  11. package/src/templates/.claude/agents/reliability-reviewer.md.vi +42 -0
  12. package/src/templates/.claude/agents/security-reviewer.md.vi +43 -0
  13. package/src/templates/.claude/hooks/hooks.json +22 -0
  14. package/src/templates/.claude/output-styles/harness-terse.md +42 -0
  15. package/src/templates/.claude/settings.json.hbs +1 -0
  16. package/src/templates/.claude/skills/add-adr/SKILL.md.vi +64 -0
  17. package/src/templates/.claude/skills/add-feature/SKILL.md.vi.hbs +50 -0
  18. package/src/templates/.claude/skills/debug-flow/SKILL.md.vi.hbs +42 -0
  19. package/src/templates/.claude/skills/doc-drift-scan/SKILL.md.vi +52 -0
  20. package/src/templates/.claude/skills/eval-runner/SKILL.md.vi +59 -0
  21. package/src/templates/.claude/skills/garbage-collection/SKILL.md.vi.hbs +58 -0
  22. package/src/templates/.claude/skills/i18n-add-locale/SKILL.md +52 -0
  23. package/src/templates/.claude/skills/i18n-add-locale/SKILL.md.vi +56 -0
  24. package/src/templates/.claude/skills/i18n-add-locale/scripts/locale-scaffold.mjs +120 -0
  25. package/src/templates/.claude/skills/inspect-app/SKILL.md.vi +61 -0
  26. package/src/templates/.claude/skills/inspect-module/SKILL.md.vi.hbs +57 -0
  27. package/src/templates/.claude/skills/map-domain/SKILL.md +42 -0
  28. package/src/templates/.claude/skills/map-domain/SKILL.md.vi +42 -0
  29. package/src/templates/.claude/skills/map-domain/scripts/domain-map.mjs +145 -0
  30. package/src/templates/.claude/skills/propose-harness-improvement/SKILL.md.vi +49 -0
  31. package/src/templates/.claude/skills/propose-harness-improvement/scripts/improvement-bundle.mjs +172 -0
  32. package/src/templates/.claude/skills/refactor-feature/SKILL.md +60 -0
  33. package/src/templates/.claude/skills/refactor-feature/SKILL.md.vi +64 -0
  34. package/src/templates/.claude/skills/refactor-feature/scripts/feature-diff.mjs +146 -0
  35. package/src/templates/.claude/skills/review-this-pr/SKILL.md +59 -0
  36. package/src/templates/.claude/skills/review-this-pr/SKILL.md.vi +63 -0
  37. package/src/templates/.claude/skills/review-this-pr/scripts/pr-review-driver.mjs +152 -0
  38. package/src/templates/.claude/skills/structural-test-author/SKILL.md.vi.hbs +50 -0
  39. package/src/templates/.claude/skills/write-skill/SKILL.md.vi +43 -0
  40. package/src/templates/.harness/eval/rubrics/feature-step-done.mjs +148 -0
  41. package/src/templates/.harness/eval/tasks/feature-step-done.answer.md +53 -0
  42. package/src/templates/.harness/eval/tasks/feature-step-done.json +10 -0
  43. package/src/templates/.harness/eval/tasks/feature-step-done.prompt.md +43 -0
  44. package/src/templates/.mcp.json.example +35 -0
  45. package/src/templates/scripts/pretooluse-edit-guard.sh.hbs +115 -0
  46. package/src/templates/scripts/session-end.sh.hbs +6 -0
  47. package/src/templates/scripts/session-rollup.mjs +96 -0
  48. package/src/templates/scripts/session-start.sh.hbs +25 -0
  49. package/src/templates/scripts/subagent-stop.sh.hbs +76 -0
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env node
2
+ // feature-diff.mjs — deterministic gate for /refactor-feature.
3
+ // Diffs feature_list.json#features[*].steps[*] between a base ref and the
4
+ // current working copy. Returns violations when:
5
+ // - step.passes flipped false → true without step.tests[] or step.testCommit
6
+ // - step.id silently renamed (no renamed_from)
7
+ // - step disappeared without replaced_by
8
+ //
9
+ // Exit codes:
10
+ // 0 → no violations
11
+ // 2 → violations present (printed as JSON to stdout)
12
+ // 3 → input error (working-copy file missing, or invalid JSON on either side); a file absent at the base ref exits 0
13
+
14
+ import { readFileSync, existsSync } from "node:fs";
15
+ import { resolve } from "node:path";
16
+ import { spawnSync } from "node:child_process";
17
+
18
+ const ROOT = process.env.CLAUDE_PROJECT_DIR || process.cwd();
19
+
20
// Parses the CLI arguments of feature-diff.
//
// Recognised flags (each takes exactly one value):
//   --before-ref <ref>   git ref to diff against (default "HEAD")
//   --after-file <path>  feature list path relative to the project root
//                        (default "feature_list.json")
//
// Unknown arguments are ignored. A flag given without a value is an input
// error: it is reported on stderr and the process exits with code 3. Without
// this check the option would become `undefined`, the later `git show` would
// fail, and the gate would report "no prior file" with exit code 0.
function parseArgs(argv) {
  const out = { beforeRef: "HEAD", afterFile: "feature_list.json" };
  const flags = new Map([
    ["--before-ref", "beforeRef"],
    ["--after-file", "afterFile"],
  ]);
  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    const key = flags.get(flag);
    if (!key) continue;
    const value = argv[++i];
    if (value === undefined) {
      console.error(`feature-diff: ${flag} requires a value`);
      process.exit(3);
    }
    out[key] = value;
  }
  return out;
}
28
+
29
// Returns the content of `path` as stored at git `ref` (via `git show
// <ref>:<path>` run in ROOT), or null when git exits with a non-zero status.
function gitShow(ref, path) {
  const objectSpec = `${ref}:${path}`;
  const { status, stdout } = spawnSync("git", ["show", objectSpec], {
    cwd: ROOT,
    encoding: "utf8",
  });
  return status === 0 ? stdout : null;
}
34
+
35
// Parses `s` as JSON. A falsy input (empty string, null, undefined) yields
// null. Invalid JSON is reported on stderr, naming `label`, and terminates
// the process with exit code 3.
function safeJSON(s, label) {
  if (!s) {
    return null;
  }
  let parsed;
  try {
    parsed = JSON.parse(s);
  } catch (err) {
    console.error(`feature-diff: invalid JSON in ${label}: ${err.message}`);
    process.exit(3);
  }
  return parsed;
}
43
+
44
// Builds a Map of stepId → { featureId, step } over every step of every
// feature in `featureList`. Steps without a truthy `id` are skipped, and
// null/undefined feature entries are tolerated. When two steps share an id
// the later one wins.
function indexSteps(featureList) {
  const idx = new Map();
  for (const f of (featureList?.features || [])) {
    for (const s of (f?.steps || [])) {
      if (s && s.id) idx.set(s.id, { featureId: f.id, step: s });
    }
  }
  return idx;
}

// Compares two parsed feature lists and reports gate violations.
//
// Parameters:
//   before — parsed feature list at the base ref (may be null)
//   after  — parsed feature list from the working copy (may be null)
//
// Returns { violations, renames, doneTransitions }:
//   violations      — entries of kind "step_disappeared", "done_without_proof"
//                     or "renamed_from_typo", each with a `fix` hint
//   renames         — { from, to, kind: "renamed_from" } for every old step
//                     id that an AFTER step claims through `renamed_from`
//   doneTransitions — steps whose `passes` went false → true
//
// A renamed step is followed to its new id before the false → true check, so
// renaming a step cannot be used to flip `passes` without proof. Entries that
// stem from a rename carry an extra `renamed_from` field with the old id.
function diff(before, after) {
  const beforeIdx = indexSteps(before);
  const afterIdx = indexSteps(after);
  const violations = [];
  const renames = [];
  const doneTransitions = [];

  // Reverse lookup: old step id → first AFTER step that lists it under
  // renamed_from (array or single value). Built once instead of rescanning
  // afterIdx for every disappeared step. A missing renamed_from only adds an
  // `undefined` key, which can never equal a real (truthy) step id.
  const renamedTo = new Map();
  for (const [newId, { step: newStep }] of afterIdx) {
    const claimed = Array.isArray(newStep.renamed_from)
      ? newStep.renamed_from
      : [newStep.renamed_from];
    for (const oldId of claimed) {
      if (!renamedTo.has(oldId)) renamedTo.set(oldId, newId);
    }
  }

  // Disappearances + done-transitions (work over before).
  for (const [id, { featureId, step }] of beforeIdx) {
    let post = afterIdx.get(id);
    // Identity reported for this step; switches to the new id after a rename.
    let ident = { step_id: id, feature_id: featureId };
    if (!post) {
      if (!renamedTo.has(id)) {
        // Disappeared without a rename: allowed only when the BEFORE version
        // already carried replaced_by.
        if (!step.replaced_by) {
          violations.push({
            kind: "step_disappeared",
            step_id: id,
            feature_id: featureId,
            fix: `Add 'replaced_by: <new_step_id>' to the step before deleting, OR mark the new step's 'renamed_from'.`,
          });
        }
        continue;
      }
      const newId = renamedTo.get(id);
      renames.push({ from: id, to: newId, kind: "renamed_from" });
      post = afterIdx.get(newId);
      ident = { step_id: newId, feature_id: post.featureId, renamed_from: id };
    }
    // passes transition false → true (also checked across a rename).
    if (step.passes === false && post.step.passes === true) {
      doneTransitions.push({ ...ident });
      const hasTests = Array.isArray(post.step.tests) && post.step.tests.length > 0;
      const hasCommit = typeof post.step.testCommit === "string" && post.step.testCommit.length > 0;
      if (!hasTests && !hasCommit) {
        violations.push({
          kind: "done_without_proof",
          ...ident,
          fix: `Add 'tests: [...]' (test file paths) or 'testCommit: <sha>' before flipping passes:true.`,
        });
      }
    }
  }

  // Newly-introduced steps with renamed_from referring to nonexistent ids
  // (paranoia: catches typos in the renamed_from value).
  for (const [id, { step }] of afterIdx) {
    if (beforeIdx.has(id)) continue;
    const refs = Array.isArray(step.renamed_from) ? step.renamed_from
      : (typeof step.renamed_from === "string" ? [step.renamed_from] : []);
    for (const ref of refs) {
      if (!beforeIdx.has(ref)) {
        violations.push({
          kind: "renamed_from_typo",
          step_id: id,
          missing_ref: ref,
          fix: `'renamed_from' must reference a step that existed at HEAD. Check the spelling.`,
        });
      }
    }
  }
  return { violations, renames, doneTransitions };
}
125
+
126
// Entry point: diffs the feature list at --before-ref against the working
// copy and prints the result as JSON on stdout.
//
// Exit codes (set through process.exitCode so pending stdout/stderr writes
// are flushed before the process ends, instead of being cut short by
// process.exit()):
//   0 → no violations, or the file could not be read at the base ref
//   2 → violations present
//   3 → working-copy file missing (invalid JSON exits 3 inside safeJSON)
function main() {
  const { beforeRef, afterFile } = parseArgs(process.argv.slice(2));
  const beforeRaw = gitShow(beforeRef, afterFile);
  if (beforeRaw === null) {
    // First-time addition — nothing to diff.
    process.stdout.write(JSON.stringify({ violations: [], note: `no prior ${afterFile} at ${beforeRef}` }) + "\n");
    return;
  }
  const afterPath = resolve(ROOT, afterFile);
  if (!existsSync(afterPath)) {
    console.error(`feature-diff: missing ${afterFile} in working copy`);
    process.exitCode = 3;
    return;
  }
  const before = safeJSON(beforeRaw, `${beforeRef}:${afterFile}`);
  const after = safeJSON(readFileSync(afterPath, "utf8"), afterFile);
  const result = diff(before, after);
  process.stdout.write(JSON.stringify(result, null, 2) + "\n");
  if (result.violations.length > 0) process.exitCode = 2;
}
145
+
146
+ main();
@@ -0,0 +1,59 @@
1
+ ---
2
+ name: review-this-pr
3
+ description: Use this skill to run a deterministic review of the current branch against its base — git diff base...HEAD, structural-test, baseline-monotonic check, and a markdown summary that lists each violating file with its layer rule. Replaces the "ask the agent to review the diff" pattern, which routinely misses cross-file drift.
4
+ allowed-tools: Read, Bash(git diff:*, git log:*, git merge-base:*, node .claude/skills/review-this-pr/scripts/pr-review-driver.mjs:*)
5
+ suggested-turns: 6
6
+ isolation: worktree
7
+ ---
8
+
9
+ ## When to invoke
10
+
11
+ - Before opening a PR (or before `gh pr create`).
12
+ - After a refactor where multiple files moved between layers.
13
+ - When CI lights up red and you want a fast local repro.
14
+
15
+ ## Steps
16
+
17
+ 1. **Identify base.**
18
+ ```
19
+ BASE=$(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main)
20
+ ```
21
+ 2. **Run driver.**
22
+ ```
23
+ node .claude/skills/review-this-pr/scripts/pr-review-driver.mjs --base "$BASE"
24
+ ```
25
+ Driver:
26
+ - Collects `git diff --name-only $BASE..HEAD`.
27
+ - Runs structural-test (workspace-wide for ts/py, file-scoped fallback).
28
+ - Diffs `.harness/structural-baseline.json` between $BASE and HEAD —
29
+ monotonic violation (baseline grew) is a hard fail.
30
+ - Reads each changed file's layer mapping via harness.config.json.
31
+ 3. **Read the report.** Output is markdown to stdout (or `--out report.md`).
32
+ Sections: Summary, Layer-map of changed files, Structural-test results,
33
+ Baseline delta, Per-reviewer hand-off (architecture, security, performance,
34
+ reliability).
35
+ 4. **Address each FAIL.** Re-run the driver until all sections are PASS.
36
+ 5. **Hand-off to reviewers.** If isolated review is needed, invoke
37
+ `/architecture-reviewer` / `/security-reviewer` etc. with the report as
38
+ context.
39
+
40
+ ## Output contract (driver JSON tail)
41
+
42
+ ```
43
+ {
44
+ "base": "<sha>",
45
+ "changed_files": <N>,
46
+ "violations": <M>,
47
+ "baseline_delta": <K>,
48
+ "passed": <bool>
49
+ }
50
+ ```
51
+
52
+ ## Anti-patterns
53
+
54
+ - Don't skip the structural-test "because the build passes" — the build
55
+ catches type errors; structural-test catches layer-rule violations that
56
+ TypeScript happily accepts.
57
+ - Don't paper over a baseline-delta with `git checkout HEAD~1 -- .harness/`
58
+ — the monotonic guard exists *because* that paper-over is the agent's
59
+ first instinct.
@@ -0,0 +1,63 @@
1
+ <!-- LOCALE_TODO: translate body to vi -->
2
+ <!-- Source: .claude/skills/review-this-pr/SKILL.md -->
3
+ <!-- Edit only the markdown body — keep frontmatter verbatim so the kit's renderer + Claude Code parse it identically across locales. -->
4
+
5
+ ---
6
+ name: review-this-pr
7
+ description: Use this skill to run a deterministic review of the current branch against its base — git diff base...HEAD, structural-test, baseline-monotonic check, and a markdown summary that lists each violating file with its layer rule. Replaces the "ask the agent to review the diff" pattern, which routinely misses cross-file drift.
8
+ allowed-tools: Read, Bash(git diff:*, git log:*, git merge-base:*, node .claude/skills/review-this-pr/scripts/pr-review-driver.mjs:*)
9
+ suggested-turns: 6
10
+ isolation: worktree
11
+ ---
12
+
13
+ ## When to invoke
14
+
15
+ - Before opening a PR (or before `gh pr create`).
16
+ - After a refactor where multiple files moved between layers.
17
+ - When CI lights up red and you want a fast local repro.
18
+
19
+ ## Steps
20
+
21
+ 1. **Identify base.**
22
+ ```
23
+ BASE=$(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main)
24
+ ```
25
+ 2. **Run driver.**
26
+ ```
27
+ node .claude/skills/review-this-pr/scripts/pr-review-driver.mjs --base "$BASE"
28
+ ```
29
+ Driver:
30
+ - Collects `git diff --name-only $BASE..HEAD`.
31
+ - Runs structural-test (workspace-wide for ts/py, file-scoped fallback).
32
+ - Diffs `.harness/structural-baseline.json` between $BASE and HEAD —
33
+ monotonic violation (baseline grew) is a hard fail.
34
+ - Reads each changed file's layer mapping via harness.config.json.
35
+ 3. **Read the report.** Output is markdown to stdout (or `--out report.md`).
36
+ Sections: Summary, Layer-map of changed files, Structural-test results,
37
+ Baseline delta, Per-reviewer hand-off (architecture, security, performance,
38
+ reliability).
39
+ 4. **Address each FAIL.** Re-run the driver until all sections are PASS.
40
+ 5. **Hand-off to reviewers.** If isolated review is needed, invoke
41
+ `/architecture-reviewer` / `/security-reviewer` etc. with the report as
42
+ context.
43
+
44
+ ## Output contract (driver JSON tail)
45
+
46
+ ```
47
+ {
48
+ "base": "<sha>",
49
+ "changed_files": <N>,
50
+ "violations": <M>,
51
+ "baseline_delta": <K>,
52
+ "passed": <bool>
53
+ }
54
+ ```
55
+
56
+ ## Anti-patterns
57
+
58
+ - Don't skip the structural-test "because the build passes" — the build
59
+ catches type errors; structural-test catches layer-rule violations that
60
+ TypeScript happily accepts.
61
+ - Don't paper over a baseline-delta with `git checkout HEAD~1 -- .harness/`
62
+ — the monotonic guard exists *because* that paper-over is the agent's
63
+ first instinct.
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env node
2
+ // pr-review-driver.mjs — deterministic driver for /review-this-pr.
3
+ // Gathers diff, runs structural-test, diffs baseline, emits markdown report.
4
+ //
5
+ // Usage:
6
+ // pr-review-driver.mjs --base <sha> [--out report.md]
7
+ //
8
+ // Output:
9
+ // stdout markdown (or the --out file), ending with a `<!-- machine-tail: {json} -->` comment line (machine-readable).
10
+
11
+ import { readFileSync, existsSync, writeFileSync } from "node:fs";
12
+ import { resolve } from "node:path";
13
+ import { spawnSync } from "node:child_process";
14
+
15
+ const ROOT = process.env.CLAUDE_PROJECT_DIR || process.cwd();
16
+
17
// Parses the driver's CLI arguments into { base, out }.
//   --base <sha>   required; when absent the usage line is printed on stderr
//                  and the process exits with code 2
//   --out <path>   optional report destination (null when not given)
// Any other argument is ignored.
function parseArgs(argv) {
  const parsed = { base: null, out: null };
  let pos = 0;
  while (pos < argv.length) {
    const token = argv[pos];
    if (token === "--base") {
      pos += 1;
      parsed.base = argv[pos];
    } else if (token === "--out") {
      pos += 1;
      parsed.out = argv[pos];
    }
    pos += 1;
  }
  if (parsed.base) {
    return parsed;
  }
  console.error("usage: pr-review-driver.mjs --base <sha> [--out report.md]");
  process.exit(2);
  return parsed;
}
29
+
30
// Runs `git` with the given argument list inside ROOT and returns the raw
// spawnSync result (status, stdout and stderr as UTF-8 strings).
function git(args) {
  const options = { cwd: ROOT, encoding: "utf8" };
  return spawnSync("git", args, options);
}
33
+
34
// Lists the paths reported by `git diff --name-only <base>...HEAD`.
// Returns an empty array when git exits with a non-zero status.
function changedFiles(base) {
  const range = `${base}...HEAD`;
  const { status, stdout } = git(["diff", "--name-only", range]);
  if (status !== 0) {
    return [];
  }
  const names = [];
  for (const line of (stdout || "").split("\n")) {
    if (line) names.push(line);
  }
  return names;
}
39
+
40
// Maps a changed file to the first configured domain layer whose directory
// (`<root>/<layer>/`) prefixes the file path.
// Returns { domain, layer } — domain falls back to "default" when the domain
// has no name — or null when cfg has no domains or nothing matches.
// Domains lacking `layers` or `root` are skipped.
function whichLayer(file, cfg) {
  const domains = cfg?.domains;
  if (!domains) {
    return null;
  }
  for (const domain of domains) {
    const usable = domain.layers && domain.root;
    if (!usable) continue;
    for (const layer of domain.layers) {
      const dir = `${domain.root}/${layer}/`;
      if (file.startsWith(dir)) {
        return { domain: domain.name || "default", layer };
      }
    }
  }
  return null;
}
51
+
52
// Reads `path` as UTF-8 and parses it as JSON. Any failure (unreadable file
// or invalid JSON) yields `fallback` instead of throwing.
function loadJSON(path, fallback = null) {
  let parsed;
  try {
    const text = readFileSync(path, "utf8");
    parsed = JSON.parse(text);
  } catch {
    return fallback;
  }
  return parsed;
}
55
+
56
// Runs the project's structural test and reports { ok, output }.
// Entry points, in order of preference:
//   1. `node harness/structural-check.mjs` when that file exists under ROOT
//   2. `npm run --silent harness:check` when package.json declares the script
// `ok` is true when the command exits with status 0; `output` is the first 80
// lines of stdout followed by stderr. When neither entry point exists the
// check is reported as passed with a "skipped" note. Never throws.
function runStructuralTest() {
  const capture = (command, args) => {
    const proc = spawnSync(command, args, { cwd: ROOT, encoding: "utf8" });
    const combined = (proc.stdout || "") + (proc.stderr || "");
    return {
      ok: proc.status === 0,
      output: combined.split("\n").slice(0, 80).join("\n"),
    };
  };

  if (existsSync(resolve(ROOT, "harness/structural-check.mjs"))) {
    return capture("node", ["harness/structural-check.mjs"]);
  }

  const manifestPath = resolve(ROOT, "package.json");
  if (existsSync(manifestPath)) {
    const manifest = loadJSON(manifestPath);
    if (manifest?.scripts?.["harness:check"]) {
      return capture("npm", ["run", "--silent", "harness:check"]);
    }
  }

  return { ok: true, output: "(no structural-test entry point — skipped)" };
}
82
+
83
// Compares .harness/structural-baseline.json in the working copy against the
// version stored at `base` and counts entries that exist only in the working
// copy. A missing file, a failed `git show`, or unparsable JSON is treated as
// an empty list on that side. Entries are compared as strings (non-string
// entries through their JSON serialisation).
// Returns { added_count, head_count, base_count }.
// NOTE(review): a baseline that parses to something other than an array will
// throw at `.map` — confirm the baseline schema is always a JSON array.
function baselineDelta(base) {
  const baselinePath = ".harness/structural-baseline.json";
  const parseOrEmpty = (raw) => {
    try {
      return JSON.parse(raw);
    } catch {
      return [];
    }
  };
  const keyOf = (entry) => (typeof entry === "string" ? entry : JSON.stringify(entry));

  const localPath = resolve(ROOT, baselinePath);
  let headRaw = "[]";
  if (existsSync(localPath)) {
    headRaw = readFileSync(localPath, "utf8");
  }
  const shown = git(["show", `${base}:${baselinePath}`]);
  const baseRaw = shown.status === 0 ? shown.stdout : "[]";

  const headArr = parseOrEmpty(headRaw);
  const baseArr = parseOrEmpty(baseRaw);
  const headKeys = new Set(headArr.map(keyOf));
  const baseKeys = new Set(baseArr.map(keyOf));

  let addedCount = 0;
  for (const key of headKeys) {
    if (!baseKeys.has(key)) addedCount += 1;
  }
  return { added_count: addedCount, head_count: headArr.length, base_count: baseArr.length };
}
98
+
99
// Renders the markdown review report and its machine-readable tail.
//
// Input fields:
//   base       — base sha shown in the summary
//   changed    — list of changed file paths
//   perFile    — [{ file, layer }] where layer is { domain, layer } or null
//   structural — { ok, output } from the structural test run
//   baseline   — { added_count, head_count, base_count }
//
// Returns { md, tail }. `tail` holds base, changed_files, violations,
// baseline_delta and passed; the same object is embedded in the last line of
// `md` as an HTML comment. `violations` counts the structural output lines
// matching /violat|error|FAIL/i and is 0 when the structural test passed.
// The report passes only when the structural test is ok and no baseline
// entries were added.
function buildReport({ base, changed, perFile, structural, baseline }) {
  let violations = 0;
  if (!structural.ok) {
    for (const line of structural.output.split("\n")) {
      if (/violat|error|FAIL/i.test(line)) violations += 1;
    }
  }
  const passed = structural.ok && baseline.added_count === 0;

  const fileRows = perFile.map((row) => {
    const tag = row.layer ? `${row.layer.domain}/${row.layer.layer}` : "(unlayered)";
    return `- \`${row.file}\` → ${tag}`;
  });

  // Reviewer suggestions are driven by touched layer names and path patterns.
  const touchedLayers = new Set();
  for (const row of perFile) {
    const layerName = row.layer?.layer;
    if (layerName) touchedLayers.add(layerName);
  }
  const handOff = [];
  if (touchedLayers.has("service") || touchedLayers.has("repository")) {
    handOff.push(`- /api-consistency-reviewer (service/repo touched)`);
  }
  if (changed.some((f) => /auth|secret|crypto|cookie/i.test(f))) {
    handOff.push(`- /security-reviewer (security-flavoured files touched)`);
  }
  if (changed.some((f) => /\.sql$|migrations\//i.test(f))) {
    handOff.push(`- /reliability-reviewer (data-layer touched)`);
  }
  if (changed.length >= 10) {
    handOff.push(`- /architecture-reviewer (>=10 files changed)`);
  }

  const tail = { base, changed_files: changed.length, violations, baseline_delta: baseline.added_count, passed };

  const lines = [
    `# /review-this-pr report`,
    ``,
    `- base: \`${base}\``,
    `- changed files: ${changed.length}`,
    `- structural-test: ${structural.ok ? "PASS" : "FAIL"}`,
    `- baseline delta: ${baseline.added_count} new entries (head=${baseline.head_count}, base=${baseline.base_count})`,
    `- overall: ${passed ? "PASS" : "FAIL"}`,
    ``,
    `## Changed files (by layer)`,
    ``,
    ...fileRows,
    ``,
    `## Structural-test output (head 80 lines)`,
    "```",
    structural.output,
    "```",
    ``,
    `## Hand-off`,
    ``,
    `Recommended reviewer subagents based on touched layers:`,
    ...handOff,
    ``,
    `<!-- machine-tail: ${JSON.stringify(tail)} -->`,
  ];
  return { md: `${lines.join("\n")}\n`, tail };
}
138
+
139
// Entry point: gathers the changed files, runs the structural test, computes
// the baseline delta and emits the markdown report — to the --out file
// (resolved against ROOT) when given, otherwise to stdout.
//
// A failing report sets exit code 2 through process.exitCode rather than
// process.exit(), so a report still being written to a piped stdout is
// flushed in full before the process ends.
function main() {
  const { base, out } = parseArgs(process.argv.slice(2));
  const cfg = loadJSON(resolve(ROOT, "harness.config.json"));
  const changed = changedFiles(base);
  const perFile = changed.map((f) => ({ file: f, layer: whichLayer(f, cfg) }));
  const structural = runStructuralTest();
  const baseline = baselineDelta(base);
  const { md, tail } = buildReport({ base, changed, perFile, structural, baseline });
  if (out) writeFileSync(resolve(ROOT, out), md);
  else process.stdout.write(md);
  if (!tail.passed) process.exitCode = 2;
}
151
+
152
+ main();
@@ -0,0 +1,50 @@
1
+ <!-- LOCALE_TODO: translate body to vi -->
2
+ <!-- Source: .claude/skills/structural-test-author/SKILL.md.hbs -->
3
+ <!-- Edit only the markdown body — keep frontmatter verbatim so the kit's renderer + Claude Code parse it identically across locales. -->
4
+
5
+ ---
6
+ name: structural-test-author
7
+ description: Use this skill whenever the user wants to add a new architectural rule, prevent a recurring agent mistake, or codify a pattern from golden-principles.md. Generates a {{#if isPython}}libcst-based Python{{else}}ts-morph-based TypeScript{{/if}} structural test plus the matching {{#if isPython}}import-linter contract{{else}}eslint-plugin-boundaries rule{{/if}} entry. Always prefer this over leaving rules in prose.
8
+ allowed-tools: Read, Edit, Write, Bash(npm test:*), Bash(pytest:*)
9
+ suggested-turns: 15
10
+ ---
11
+
12
+ ## Steps
13
+
14
+ 1. **Phrase the rule.** Ask the user: "What invariant do you want enforced?
15
+ Phrase it as: 'No code in layer X may import from layer Y' or 'Every
16
+ <thing> must <do>'."
17
+ 2. **Layer rules first.** If the rule is layer-based, edit `harness.config.json`
18
+ `domains[].layers` and the {{#if isPython}}`.importlinter`{{else}}`eslint.config.js`{{/if}}
19
+ config — DO NOT write a custom test for layer rules; the existing test
20
+ already supports them via configuration.
21
+ 3. **Structural rules.** If the rule is structural but not layer-based (e.g.
22
+ "every controller must call validateAt"), open
23
+ `{{#if isPython}}harness/structural_test.py{{else}}harness/structural-test.ts{{/if}}`:
24
+ - {{#if isPython}}Use `libcst.CSTVisitor` subclass — preserves whitespace and comments.{{else}}Use `Project` + `getSourceFiles()` + AST visitors — the canonical ts-morph pattern.{{/if}}
25
+ 4. **Add a fixture test.** Create a file in `tests/structural/` that contains
26
+ a deliberately-violating snippet, and verify the rule fails on it.
27
+ 5. **Run against the whole repo.** If the rule fails on existing code, choose:
28
+ - **(a)** fix the existing code, OR
29
+ - **(b)** add the existing violations to `.harness/structural-baseline.json`
30
+ so only **new** violations block. (PMD/baseline pattern.)
31
+ 6. **Document.** Append the rule and its rationale (one paragraph traced to a
32
+ specific past failure) to `docs/golden-principles.md`.
33
+ 7. **Log the harness change.** Run `/propose-harness-improvement` to record
34
+ this as a permanent harness improvement.
35
+
36
+ ## Output contract
37
+
38
+ ```
39
+ ### Rule added: <one-line description>
40
+ ### Files changed: <list>
41
+ ### New violations on existing code: <count> — baselined: yes/no
42
+ ### golden-principles.md entry: §<n>
43
+ ```
44
+
45
+ ## Anti-patterns
46
+
47
+ - Don't write a rule whose enforcement is also LLM-based — that just recurses.
48
+ - Don't write a rule that requires runtime information to evaluate (e.g.
49
+ "this function must not take more than 100ms"). Those go in evals or
50
+ observability, not structural tests.
@@ -0,0 +1,43 @@
1
+ <!-- LOCALE_TODO: translate body to vi -->
2
+ <!-- Source: .claude/skills/write-skill/SKILL.md -->
3
+ <!-- Edit only the markdown body — keep frontmatter verbatim so the kit's renderer + Claude Code parse it identically across locales. -->
4
+
5
+ ---
6
+ name: write-skill
7
+ description: Use this skill whenever the user asks to "create a skill", "add a slash command", "package a workflow", or "make X reusable across sessions". Generates a SKILL.md with valid YAML frontmatter (name regex, description ≤ 1024 chars, body ≤ 500 lines) and supporting scripts/references/assets. Tests the skill by simulating an auto-discovery prompt.
8
+ allowed-tools: Read, Edit, Write, Bash(ls:*)
9
+ suggested-turns: 8
10
+ ---
11
+
12
+ ## Steps
13
+
14
+ 1. **Validate the name.** Must match `^[a-z0-9]+(-[a-z0-9]+)*$` and be ≤ 64
15
+ characters.
16
+ 2. **Write a "pushy" description.** Third-person, ≤ 1024 chars. Explicitly
17
+ mention triggers ("Use this skill whenever the user mentions <X>, <Y>,
18
+ <Z>"). Models under-trigger skills with shy descriptions.
19
+ 3. **Body sections,** in this order: `## When to use`, `## Steps`,
20
+ `## Output contract`, `## Anti-patterns`. Cap body at 500 lines.
21
+ 4. **Externalize deterministic logic.** If the skill needs deterministic work
22
+ (parsing, formatting, computation), put it in `scripts/<name>.sh` (or `.py`
23
+ / `.mjs`) under the skill directory and reference it via a `Bash(...)`
24
+ tool call. SKILL.md stays declarative.
25
+ 5. **Test discovery.** Open a fresh Claude Code session and prompt with one
26
+ of the description triggers. Verify the skill auto-loads.
27
+
28
+ ## Output contract
29
+
30
+ ```
31
+ ### Skill: /<name>
32
+ ### Description bytes: <count>/1024
33
+ ### Body lines: <count>/500
34
+ ### Allowed tools: <list>
35
+ ### Discovery trigger tested: yes/no
36
+ ```
37
+
38
+ ## Anti-patterns
39
+
40
+ - Don't write a description that starts with "This skill…" — start with "Use
41
+ this skill whenever the user…" so triggers are front-loaded.
42
+ - Don't pack two unrelated workflows into one skill. Split them.
43
+ - Don't grant `Bash(*:*)` in `allowed-tools`. Pin specific commands.
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env node
2
+ // feature-step-done.mjs — eval rubric for the "feature step done" task.
3
+ // Reads the agent's transcript + the final feature_list.json + the diff;
4
+ // returns a JSON verdict on the outcome / process / style / efficiency
5
+ // dimensions.
6
+ //
7
+ // Invocation (from eval-runner.mjs):
8
+ // node .harness/eval/rubrics/feature-step-done.mjs --transcript <path> --task <task.json>
9
+ //
10
+ // Exit 0 = rubric ran. The JSON tail communicates pass/fail.
11
+
12
+ import { readFileSync, existsSync } from "node:fs";
13
+ import { resolve } from "node:path";
14
+ import { spawnSync } from "node:child_process";
15
+
16
+ const ROOT = process.env.CLAUDE_PROJECT_DIR || process.cwd();
17
+
18
// Parses the rubric's CLI arguments into { transcript, task }. Both default
// to null; each flag consumes the argument that follows it, and anything
// else is ignored.
function parseArgs(argv) {
  const parsed = { transcript: null, task: null };
  let pos = 0;
  while (pos < argv.length) {
    const token = argv[pos];
    if (token === "--transcript") {
      pos += 1;
      parsed.transcript = argv[pos];
    } else if (token === "--task") {
      pos += 1;
      parsed.task = argv[pos];
    }
    pos += 1;
  }
  return parsed;
}
26
+
27
// Parses `s` as JSON, returning `def` (null by default) when parsing fails.
function safeJSON(s, def = null) {
  let value;
  try {
    value = JSON.parse(s);
  } catch {
    // Deliberate best-effort: malformed input maps to the caller's default.
    return def;
  }
  return value;
}
30
+
31
// Reads `path` as UTF-8 text, returning `fallback` (null by default) when the
// file cannot be read.
function loadFile(path, fallback = null) {
  let text;
  try {
    text = readFileSync(path, "utf8");
  } catch {
    // Deliberate best-effort: an unreadable file maps to the fallback.
    return fallback;
  }
  return text;
}
34
+
35
// Loads and parses feature_list.json from ROOT. Returns null when the file is
// missing, empty, or not valid JSON.
function loadFeatureList() {
  const raw = loadFile(resolve(ROOT, "feature_list.json"));
  if (!raw) {
    return null;
  }
  return safeJSON(raw);
}
40
+
41
// Lists the files reported by `git diff --name-only HEAD~1...HEAD` in ROOT.
// Per the original note, the eval-runner pins HEAD with a tag before each
// task, so HEAD~1 stands for the commit before the eval started.
// Returns an empty array when git exits with a non-zero status.
function gitDiffFiles() {
  const { status, stdout } = spawnSync(
    "git",
    ["diff", "--name-only", "HEAD~1...HEAD"],
    { cwd: ROOT, encoding: "utf8" },
  );
  if (status !== 0) {
    return [];
  }
  const files = [];
  for (const line of (stdout || "").split("\n")) {
    if (line) files.push(line);
  }
  return files;
}
50
+
51
// Extracts tool invocations from a JSONL transcript.
//
// Two record shapes are recognised per line:
//   1. Flat records — `type === "tool_use"`, or a truthy `tool` / `skill`
//      field. The tool name is taken from `tool`, `skill`, `name`, then
//      `type`; the input from `input`, `tool_input`, then `arguments`.
//   2. Message records whose `message.content` is an array of content blocks;
//      every block with `type === "tool_use"` contributes its `name` and
//      `input`.
// NOTE(review): shape 2 assumes claude-cli stream-json nests tool_use blocks
// under message.content, as in the Anthropic Messages API — confirm against
// the transcripts eval-runner actually records.
//
// Returns [{ tool, input }] in transcript order. Blank lines and lines that
// are not valid JSON are skipped; an unreadable transcript yields [].
function transcriptToolCalls(transcriptPath) {
  let body = "";
  try {
    body = readFileSync(transcriptPath, "utf8");
  } catch {
    // Best-effort: a missing transcript simply means no recorded tool calls.
    body = "";
  }

  const calls = [];
  for (const line of body.split("\n")) {
    if (!line.trim()) continue;
    let rec;
    try {
      rec = JSON.parse(line);
    } catch {
      continue;
    }
    if (!rec) continue;

    if (rec.type === "tool_use" || rec.tool || rec.skill) {
      calls.push({
        tool: rec.tool || rec.skill || rec.name || rec.type,
        input: rec.input || rec.tool_input || rec.arguments || null,
      });
      // A flat record is counted once; its nested content is not re-scanned.
      continue;
    }

    const blocks = rec.message?.content;
    if (!Array.isArray(blocks)) continue;
    for (const block of blocks) {
      if (block && block.type === "tool_use") {
        calls.push({ tool: block.name || block.type, input: block.input || null });
      }
    }
  }
  return calls;
}
70
+
71
// Grades one eval run on four dimensions and explains every miss.
//
// Input fields:
//   task      — parsed task JSON (may be null); only
//               task.expected.filesChanged.{min,max} is read
//   fl        — parsed feature_list.json after the run (may be null)
//   diffFiles — paths changed during the run
//   toolCalls — [{ tool, input }] extracted from the transcript
//
// Returns { overall, dimensions, reasons, diff_files }. `overall` is "PASS"
// only when both the outcome and process dimensions pass; style and
// efficiency start at "warn" and never affect the overall verdict.
function grade({ task, fl, diffFiles, toolCalls }) {
  const dimensions = { outcome: "fail", process: "fail", style: "warn", efficiency: "warn" };
  const reasons = [];

  // --- outcome ---
  // The first step of the first feature must be marked passing, list at
  // least one test, and at least one listed test must be part of the diff.
  const firstStep = fl?.features?.[0]?.steps?.[0];
  let outcomeIssue = null;
  if (!firstStep) {
    outcomeIssue = "outcome: no features[0].steps[0] found in feature_list.json after run";
  } else if (firstStep.passes !== true) {
    outcomeIssue = `outcome: features[0].steps[0].passes is ${JSON.stringify(firstStep.passes)}, want true`;
  } else if (!Array.isArray(firstStep.tests) || firstStep.tests.length === 0) {
    outcomeIssue = "outcome: features[0].steps[0].tests is empty — done flipped without test reference";
  } else if (!firstStep.tests.some((t) => diffFiles.includes(t))) {
    outcomeIssue = `outcome: feature_list.json#tests references [${firstStep.tests.join(", ")}] but none appear in the diff`;
  }
  if (outcomeIssue === null) {
    dimensions.outcome = "pass";
  } else {
    reasons.push(outcomeIssue);
  }

  // --- process ---
  // A feature skill must have been invoked, and the diff must contain both a
  // non-test source file and a test file.
  const mentionsSkill = (text) => /(add-feature|refactor-feature)/i.test(text || "");
  const ranSkill = toolCalls.some((c) => mentionsSkill(c.tool) || mentionsSkill(c.input?.skill));
  const handlerWrites = [];
  const testWrites = [];
  for (const f of diffFiles) {
    const looksLikeTest = /test/i.test(f);
    if (/\.(ts|tsx|js|mjs|py|rs|go)$/.test(f) && !looksLikeTest) handlerWrites.push(f);
    if (looksLikeTest || /\.spec\./.test(f)) testWrites.push(f);
  }
  let processIssue = null;
  if (!ranSkill) {
    processIssue = "process: agent did not invoke /add-feature or /refactor-feature";
  } else if (handlerWrites.length === 0) {
    processIssue = "process: no handler file appeared in diff";
  } else if (testWrites.length === 0) {
    processIssue = "process: no test file appeared in diff";
  }
  if (processIssue === null) {
    dimensions.process = "pass";
  } else {
    reasons.push(processIssue);
  }

  // --- style ---
  // Soft check: the run should have touched .harness/PROGRESS.md.
  if (diffFiles.includes(".harness/PROGRESS.md")) {
    dimensions.style = "pass";
  } else {
    reasons.push("style: .harness/PROGRESS.md not appended (soft fail)");
  }

  // --- efficiency ---
  // Token counts are not available here, so the number of changed files is
  // checked against the task's bounds (1..99 when the task gives none).
  const bounds = task?.expected?.filesChanged;
  const max = bounds?.max ?? 99;
  const min = bounds?.min ?? 1;
  const changedCount = diffFiles.length;
  if (changedCount < min || changedCount > max) {
    reasons.push(`efficiency: ${changedCount} files changed, want ${min}-${max}`);
  } else {
    dimensions.efficiency = "pass";
  }

  const bothPass = dimensions.outcome === "pass" && dimensions.process === "pass";
  return {
    overall: bothPass ? "PASS" : "FAIL",
    dimensions,
    reasons,
    diff_files: diffFiles,
  };
}
137
+
138
// Entry point: loads the task file and the transcript (both optional, paths
// resolved against ROOT), gathers the feature list and the git diff, grades
// the run and prints the verdict as pretty-printed JSON on stdout.
function main() {
  const args = parseArgs(process.argv.slice(2));

  let task = null;
  if (args.task) {
    const rawTask = loadFile(resolve(ROOT, args.task));
    task = safeJSON(rawTask ?? "", null);
  }

  const fl = loadFeatureList();
  const diffFiles = gitDiffFiles();

  let toolCalls = [];
  if (args.transcript) {
    toolCalls = transcriptToolCalls(resolve(ROOT, args.transcript));
  }

  const verdict = grade({ task, fl, diffFiles, toolCalls });
  process.stdout.write(`${JSON.stringify(verdict, null, 2)}\n`);
}
147
+
148
+ main();