agent-harness-kit 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/.claude-plugin/marketplace.json +27 -0
  2. package/.claude-plugin/plugin.json +25 -0
  3. package/LICENSE +21 -0
  4. package/README.md +165 -0
  5. package/bin/cli.mjs +261 -0
  6. package/package.json +64 -0
  7. package/src/core/detect-stack.mjs +181 -0
  8. package/src/core/doctor.mjs +106 -0
  9. package/src/core/patch-package-json.mjs +53 -0
  10. package/src/core/render-templates.mjs +277 -0
  11. package/src/core/upgrade.mjs +274 -0
  12. package/src/templates/.claude/agents/api-consistency-reviewer.md +33 -0
  13. package/src/templates/.claude/agents/architecture-reviewer.md.hbs +41 -0
  14. package/src/templates/.claude/agents/performance-reviewer.md +35 -0
  15. package/src/templates/.claude/agents/reliability-reviewer.md +38 -0
  16. package/src/templates/.claude/agents/security-reviewer.md +39 -0
  17. package/src/templates/.claude/hooks/hooks.json.hbs +39 -0
  18. package/src/templates/.claude/settings.json.hbs +25 -0
  19. package/src/templates/.claude/skills/add-adr/SKILL.md +60 -0
  20. package/src/templates/.claude/skills/add-feature/SKILL.md.hbs +50 -0
  21. package/src/templates/.claude/skills/debug-flow/SKILL.md.hbs +38 -0
  22. package/src/templates/.claude/skills/doc-drift-scan/SKILL.md +43 -0
  23. package/src/templates/.claude/skills/eval-runner/SKILL.md +55 -0
  24. package/src/templates/.claude/skills/garbage-collection/SKILL.md.hbs +49 -0
  25. package/src/templates/.claude/skills/inspect-app/SKILL.md +57 -0
  26. package/src/templates/.claude/skills/inspect-module/SKILL.md.hbs +53 -0
  27. package/src/templates/.claude/skills/propose-harness-improvement/SKILL.md +43 -0
  28. package/src/templates/.claude/skills/structural-test-author/SKILL.md.hbs +46 -0
  29. package/src/templates/.claude/skills/write-skill/SKILL.md +39 -0
  30. package/src/templates/CLAUDE.md.hbs +70 -0
  31. package/src/templates/_adapter-python/.importlinter +14 -0
  32. package/src/templates/_adapter-python/harness/__init__.py +0 -0
  33. package/src/templates/_adapter-python/harness/eval_runner.py +281 -0
  34. package/src/templates/_adapter-python/harness/structural_test.py +195 -0
  35. package/src/templates/_adapter-typescript/.dependency-cruiser.cjs +27 -0
  36. package/src/templates/_adapter-typescript/eslint.config.mjs +38 -0
  37. package/src/templates/_adapter-typescript/harness/eval-runner.mjs +322 -0
  38. package/src/templates/_adapter-typescript/harness/structural-test.mjs +125 -0
  39. package/src/templates/_ci/.github/workflows/eval-nightly.yml +59 -0
  40. package/src/templates/_ci/.github/workflows/harness.yml +55 -0
  41. package/src/templates/docs/adr/0001-use-agent-harness-kit.md.hbs +56 -0
  42. package/src/templates/docs/agent-failures.md +25 -0
  43. package/src/templates/docs/architecture.md.hbs +47 -0
  44. package/src/templates/docs/core-beliefs.md.hbs +41 -0
  45. package/src/templates/docs/golden-principles.md.hbs +80 -0
  46. package/src/templates/docs/tech-debt-tracker.md +30 -0
  47. package/src/templates/feature_list.json.hbs +29 -0
  48. package/src/templates/harness.config.json.hbs +40 -0
  49. package/src/templates/scripts/dev-up.sh.hbs +51 -0
  50. package/src/templates/scripts/harness-report.mjs +189 -0
  51. package/src/templates/scripts/install-git-hooks.sh +18 -0
  52. package/src/templates/scripts/pre-push.sh +21 -0
  53. package/src/templates/scripts/precompletion-checklist.sh.hbs +99 -0
  54. package/src/templates/scripts/structural-test-on-edit.sh.hbs +53 -0
  55. package/src/templates/scripts/telemetry-on-skill.sh +26 -0
@@ -0,0 +1,322 @@
1
+ // harness/eval-runner.mjs — drive Claude Code through .harness/eval/tasks/*.json
2
+ // and grade each on outcome / process / style / efficiency.
3
+ //
4
+ // Per-task JSONL row goes to .harness/eval/results/<sha>.jsonl. On regression
5
+ // (any task failing in CI), exit 1 so the workflow blocks merge.
6
+ //
7
+ // Transports:
8
+ // --transport=claude-cli spawn `claude -p` and capture stream-json transcript (default)
9
+ // --transport=mock synthetic transcript — use in CI smoke-tests, no API key needed
10
+ //
11
+ // Sets:
12
+ // --quick first 3 tasks (~$0.30, ~2 min on Sonnet)
13
+ // --full all tasks (~$2, ~15 min)
14
+ // --tasks <glob> custom set (matched as an exact file name or substring, not a shell glob)
15
+ //
16
+ // Usage:
17
+ // node harness/eval-runner.mjs --quick
18
+ // node harness/eval-runner.mjs --full --transport=mock # CI smoke-test
19
+ // node harness/eval-runner.mjs --tasks 01-trivial-endpoint.json
20
+
21
+ import { readFile, writeFile, mkdir, readdir, appendFile } from "node:fs/promises";
22
+ import { existsSync } from "node:fs";
23
+ import { resolve, join, dirname } from "node:path";
24
+ import { spawn, execSync } from "node:child_process";
25
+ import { argv, exit, env, cwd } from "node:process";
26
+
27
// Parse the eval-runner command line.
//
// argv: the full process argument vector; parsing starts at index 2, after the
//       node binary and the script path.
// Returns { quick, full, tasksGlob, transport, out }.
//
// Value flags are accepted as `--flag value` or `--flag=value`. A value flag
// with no value is a hard error (exit 2) rather than a silent `undefined`,
// because a dangling `--transport` would otherwise fall back to the paid
// default transport. Unrecognized arguments are still ignored, but a warning
// is printed so a typo such as `--quik` does not silently run the full set.
function parseArgs(argv) {
  const opts = {
    quick: false,
    full: false,
    tasksGlob: null,
    transport: "claude-cli",
    out: null,
  };
  // Flag name -> option key for every flag that carries a value.
  const valueFlags = new Map([
    ["--tasks", "tasksGlob"],
    ["--transport", "transport"],
    ["--out", "out"],
  ]);

  for (let i = 2; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === "--quick") {
      opts.quick = true;
      continue;
    }
    if (arg === "--full") {
      opts.full = true;
      continue;
    }
    if (arg === "--help" || arg === "-h") {
      console.log(USAGE);
      exit(0);
    }

    const eq = arg.indexOf("=");
    const flag = eq === -1 ? arg : arg.slice(0, eq);
    const key = valueFlags.get(flag);
    if (key !== undefined) {
      // `--flag=value` carries its value inline; `--flag value` consumes the next argument.
      const value = eq === -1 ? argv[++i] : arg.slice(eq + 1);
      if (value === undefined) {
        console.error(`Missing value for ${flag}.`);
        exit(2);
      }
      opts[key] = value;
      continue;
    }

    console.error(`Ignoring unrecognized argument: ${arg}`);
  }
  return opts;
}
52
+
53
// Help text printed for --help / -h. Lists every flag parseArgs accepts,
// including --out, which was previously undocumented here.
const USAGE = `Usage: node harness/eval-runner.mjs [--quick|--full|--tasks <glob>] [--transport <name>] [--out <path>]

Transports:
  claude-cli (default)  spawn \`claude -p\` and capture stream-json
  mock                  synthetic transcript for CI smoke-tests

Options:
  --out <path>          write JSONL results to <path> instead of .harness/eval/results/<sha>.jsonl

See PUBLISHING.md for token budget and cost notes.`;
60
+
61
// Load task definitions from `.harness/eval/tasks` under the current working
// directory.
//
// Only `*.json` files are considered, in sorted name order. `opts.tasksGlob`
// keeps the files whose name contains that text (an exact name is a special
// case of this); otherwise `opts.quick` keeps the first three. Each returned
// task is the parsed JSON plus a `_file` property holding the file's path.
// Prints an error and exits with code 1 when the tasks directory is missing.
async function loadTasks(opts) {
  const tasksDir = resolve(cwd(), ".harness/eval/tasks");
  if (!existsSync(tasksDir)) {
    console.error(`No tasks directory at ${tasksDir}. Run \`agent-harness-kit init\` first.`);
    exit(1);
  }

  const jsonFiles = (await readdir(tasksDir)).filter((name) => name.endsWith(".json"));
  jsonFiles.sort();

  let selected = jsonFiles;
  if (opts.tasksGlob) {
    // Substring containment already covers the exact-name case.
    selected = jsonFiles.filter((name) => name.includes(opts.tasksGlob));
  } else if (opts.quick) {
    selected = jsonFiles.slice(0, 3);
  }

  const loaded = [];
  for (const name of selected) {
    const fullPath = join(tasksDir, name);
    const parsed = JSON.parse(await readFile(fullPath, "utf8"));
    loaded.push({ ...parsed, _file: fullPath });
  }
  return loaded;
}
80
+
81
+ // ---- transports ----
82
+
83
// Transport registry: name -> (task) => Promise<{ events, stderr }>.
const TRANSPORTS = {
  // Real driver: spawn `claude -p` with stream-json output and flatten the
  // wire format into the same shape the mock transport produces (so the
  // graders don't have to know about both shapes).
  //
  // Real wire format (Claude Code 2.1.x):
  //   {type:"assistant", message:{content:[{type:"tool_use", name, input}]}}
  //   {type:"user", message:{content:[{type:"tool_result", ...}]}}
  //   {type:"result", usage:{input_tokens, output_tokens, cache_*}, total_cost_usd}
  //
  // Flat shape graders consume:
  //   {type:"tool_use", tool:<name>, path:<input.file_path|input.path>}
  //   {type:"token_usage", total:<sum of all token fields>}
  //
  // Rejects when the process cannot be spawned or ends with a non-zero status.
  "claude-cli": (task) =>
    new Promise((resolve, reject) => {
      const proc = spawn(
        "claude",
        [
          "-p",
          task.input,
          "--output-format",
          "stream-json",
          "--verbose",
          "--max-turns",
          "20",
        ],
        { stdio: ["ignore", "pipe", "pipe"] },
      );
      const events = [];
      let stderr = "";
      let buf = "";

      const ingest = (raw) => {
        // Always keep the raw event for debugging.
        events.push({ raw, type: raw.type });
        // Flatten tool_use blocks from assistant messages.
        if (raw.type === "assistant" && raw.message?.content) {
          for (const block of raw.message.content) {
            if (block.type !== "tool_use") continue;
            // /skill invocations come in as the Skill tool with input.skill.
            if (block.name === "Skill" && block.input?.skill) {
              events.push({ type: "tool_use", tool: block.input.skill });
            }
            const path = block.input?.file_path ?? block.input?.path ?? null;
            events.push({ type: "tool_use", tool: block.name, path });
          }
        }
        // Final result has aggregated usage.
        if (raw.type === "result" && raw.usage) {
          const u = raw.usage;
          const total =
            (u.input_tokens ?? 0) +
            (u.output_tokens ?? 0) +
            (u.cache_creation_input_tokens ?? 0) +
            (u.cache_read_input_tokens ?? 0);
          events.push({ type: "token_usage", total });
        }
      };

      const ingestLine = (line) => {
        if (!line.trim()) return;
        try {
          ingest(JSON.parse(line));
        } catch {
          /* non-JSON line (rare) — ignore */
        }
      };

      // Decode as UTF-8 at the stream level so a multi-byte character split
      // across two chunks is not corrupted.
      proc.stdout.setEncoding("utf8");
      proc.stderr.setEncoding("utf8");

      proc.stdout.on("data", (chunk) => {
        buf += chunk;
        const lines = buf.split("\n");
        // The last element is an incomplete line (or ""); keep it for the next chunk.
        buf = lines.pop() ?? "";
        for (const line of lines) ingestLine(line);
      });
      proc.stderr.on("data", (chunk) => {
        stderr += chunk;
      });
      proc.on("error", reject);
      // Settle on "close", not "exit": "exit" can fire while stdout still has
      // unread data, which would drop trailing events.
      proc.on("close", (code, signal) => {
        // Flush a final line that was not newline-terminated (often the `result` event).
        ingestLine(buf);
        buf = "";
        if (code !== 0) {
          return reject(new Error(`claude exited ${code ?? signal}: ${stderr.slice(0, 500)}`));
        }
        resolve({ events, stderr });
      });
    }),

  // Mock transport — produces a synthetic transcript that satisfies the
  // default expectations of the shipped tasks. Used in CI to verify the
  // driver shape end-to-end without burning API tokens.
  mock: async (task) => {
    const expected = task.expected ?? {};
    const events = [];
    for (const skill of expected.skillsInvoked ?? []) {
      events.push({ type: "tool_use", tool: skill });
    }
    const minFiles = expected.filesChanged?.min ?? 1;
    for (let i = 0; i < minFiles; i++) {
      events.push({ type: "tool_use", tool: "Write", path: `src/mock-${i}.ts` });
    }
    events.push({
      type: "token_usage",
      total: Math.min(expected.tokensMax ?? 5000, 5000),
    });
    return { events, stderr: "" };
  },
};
186
+
187
+ // ---- graders ----
188
+
189
// Outcome dimension: does the project's structural check pass?
//
// Only graded when the task declares `expected.structuralTest === "pass"`;
// otherwise the score is null. The check is `npm run --silent harness:check`,
// run synchronously with its output discarded; any failure to run it cleanly
// scores 0.
function gradeOutcome(task) {
  const wantsPass = task.expected?.structuralTest === "pass";
  if (!wantsPass) {
    return { dim: "outcome", score: null, info: "no expectation" };
  }

  let checkPassed = true;
  try {
    execSync("npm run --silent harness:check", { stdio: "ignore" });
  } catch {
    checkPassed = false;
  }

  return checkPassed
    ? { dim: "outcome", score: 1, info: "structural test passed" }
    : { dim: "outcome", score: 0, info: "structural test failed" };
}
200
+
201
// Process dimension: did the run invoke every skill the task expects?
//
// A skill counts as invoked when some `tool_use` event in the transcript has
// it as its `tool`. Tasks with no `expected.skillsInvoked` entries are not
// graded (score null).
function gradeProcess(task, transcript) {
  const wanted = task.expected?.skillsInvoked ?? [];
  if (wanted.length === 0) return { dim: "process", score: null };

  const seenTools = new Set();
  for (const event of transcript.events) {
    if (event.type === "tool_use") seenTools.add(event.tool);
  }

  const absent = wanted.filter((skill) => !seenTools.has(skill));
  const allInvoked = absent.length === 0;
  return {
    dim: "process",
    score: allInvoked ? 1 : 0,
    info: allInvoked ? "all expected skills invoked" : `missing skills: ${absent.join(", ")}`,
  };
}
217
+
218
// Style dimension: did the run touch a reasonable number of files?
//
// Counts the distinct `path` values across Write / Edit / MultiEdit tool_use
// events and checks the count against `task.expected.filesChanged`
// ({ min, max }). Either bound may be omitted: a missing `min` means no lower
// limit and a missing `max` means no upper limit. Previously an omitted bound
// compared against `undefined` and always failed. Tasks without
// `filesChanged` are not graded (score null).
function gradeStyle(task, transcript) {
  const range = task.expected?.filesChanged;
  if (!range) return { dim: "style", score: null };

  const writeTools = new Set(["Write", "Edit", "MultiEdit"]);
  const writes = transcript.events.filter(
    (e) => e.type === "tool_use" && writeTools.has(e.tool),
  );
  // Events without a path cannot be attributed to a file, so they are not counted.
  const distinct = new Set(writes.map((e) => e.path).filter(Boolean)).size;

  const min = range.min ?? 0;
  const max = range.max ?? Infinity;
  const ok = distinct >= min && distinct <= max;

  let expectation;
  if (range.min != null && range.max != null) expectation = `${range.min}-${range.max}`;
  else if (range.min != null) expectation = `at least ${range.min}`;
  else if (range.max != null) expectation = `at most ${range.max}`;
  else expectation = "any number";

  return {
    dim: "style",
    score: ok ? 1 : 0,
    info: `${distinct} files changed (expected ${expectation})`,
  };
}
232
+
233
// Efficiency dimension: did the run stay within the task's token budget?
//
// Adds up `total` over every `token_usage` event (a missing total counts as
// 0) and compares the sum with `task.expected.tokensMax`. Tasks without a
// truthy `tokensMax` are not graded (score null).
function gradeEfficiency(task, transcript) {
  const cap = task.expected?.tokensMax;
  if (!cap) return { dim: "efficiency", score: null };

  let tokens = 0;
  for (const event of transcript.events) {
    if (event.type !== "token_usage") continue;
    tokens += event.total ?? 0;
  }

  const withinBudget = tokens <= cap;
  return {
    dim: "efficiency",
    score: withinBudget ? 1 : 0,
    info: `${tokens} tokens (cap ${cap})`,
  };
}
245
+
246
// Short hash of the current git HEAD, or "no-git" when the git command fails.
// The command's stderr is discarded so a failure prints nothing.
function gitSha() {
  let output;
  try {
    output = execSync("git rev-parse --short HEAD", { stdio: ["ignore", "pipe", "ignore"] });
  } catch {
    return "no-git";
  }
  return output.toString().trim();
}
255
+
256
// Run every selected task through the chosen transport, grade it, and append
// one JSON line per task to the results file.
//
// opts:
//   quick / tasksGlob — task selection, forwarded to loadTasks
//   transport         — key into TRANSPORTS (default "claude-cli")
//   out               — results file path (default .harness/eval/results/<sha>.jsonl)
//
// Returns { results, passed, outPath, sha }, where `passed` is the number of
// passing tasks. When no task matches, returns only { results: [], passed: 0 }.
// Exits with code 2 on an unknown transport.
//
// A task whose transport throws is recorded with an `error` field, reported on
// stderr, and never counted as passed: the agent did not actually run, so a
// green structural check alone must not make the task pass.
export async function runEval(opts = {}) {
  const tasks = await loadTasks(opts);
  if (tasks.length === 0) {
    console.error("No tasks matched.");
    return { results: [], passed: 0 };
  }
  const transport = TRANSPORTS[opts.transport ?? "claude-cli"];
  if (!transport) {
    console.error(
      `Unknown transport: ${opts.transport}. Try: ${Object.keys(TRANSPORTS).join(", ")}`,
    );
    exit(2);
  }

  const sha = gitSha();
  const outPath = opts.out ?? resolve(cwd(), `.harness/eval/results/${sha}.jsonl`);
  await mkdir(dirname(outPath), { recursive: true });

  const results = [];
  for (const task of tasks) {
    let transcript;
    let transportError = null;
    try {
      transcript = await transport(task);
    } catch (err) {
      transportError = err?.message ?? String(err);
      console.error(`Transport failed for task ${task.id}: ${transportError}`);
      // Grade against an empty transcript so the row still lists every dimension.
      transcript = { events: [], stderr: transportError };
    }
    const grades = [
      gradeOutcome(task),
      gradeProcess(task, transcript),
      gradeStyle(task, transcript),
      gradeEfficiency(task, transcript),
    ].filter((g) => g.score !== null);

    const passed =
      transportError === null && grades.length > 0 && grades.every((g) => g.score === 1);
    const row = {
      taskId: task.id,
      sha,
      ts: new Date().toISOString(),
      grades,
      passed,
    };
    if (transportError !== null) row.error = transportError;
    results.push(row);
    await appendFile(outPath, JSON.stringify(row) + "\n");
  }

  return { results, passed: results.filter((r) => r.passed).length, outPath, sha };
}
303
+
304
// Print a human-readable summary of an eval run to stdout: one header line,
// then one line per task and one line per graded dimension.
//
// Accepts the object returned by runEval. When no tasks ran, runEval returns
// no `sha` or `outPath`, so a short notice is printed instead of a header
// full of "undefined".
function summarize({ results, passed, outPath, sha }) {
  if (results.length === 0) {
    console.log("\nEval run — no tasks were run.");
    return;
  }
  console.log(`\nEval run ${sha} — ${passed}/${results.length} passed (${outPath})`);
  for (const r of results) {
    const mark = r.passed ? "✓" : "✗";
    console.log(` ${mark} ${r.taskId}`);
    for (const g of r.grades) {
      const m = g.score === 1 ? "✓" : "✗";
      console.log(` ${m} ${g.dim}: ${g.info}`);
    }
  }
}
315
+
316
// CLI entry — only runs when invoked directly, not when imported by tests.
//
// The script path is converted with pathToFileURL instead of being pasted
// after "file://": a hand-built URL never equals import.meta.url on Windows
// or for paths that need percent-encoding (spaces, "#", non-ASCII). The path
// is also resolved through realpath first so launching via a symlink still
// counts as direct invocation.
const invokedDirectly = await (async () => {
  if (!argv[1]) return false;
  const { pathToFileURL } = await import("node:url");
  const { realpathSync } = await import("node:fs");
  let scriptPath = argv[1];
  try {
    scriptPath = realpathSync(scriptPath);
  } catch {
    /* path cannot be resolved — compare using the path as given */
  }
  return import.meta.url === pathToFileURL(scriptPath).href;
})();

if (invokedDirectly) {
  const opts = parseArgs(argv);
  const summary = await runEval(opts);
  summarize(summary);
  // Only CI runs turn a failing task into a non-zero exit status.
  if (env.CI === "true" && summary.passed < summary.results.length) exit(1);
}
@@ -0,0 +1,125 @@
1
+ // harness/structural-test.mjs — forward-only layer enforcement.
2
+ //
3
+ // Reads harness.config.json. For each domain, parses every source file's
4
+ // imports (via ts-morph) and asserts that no import goes "backward" through
5
+ // the layer order. Violations already present in existing code are baselined into
6
+ // .harness/structural-baseline.json on first run.
7
+ //
8
+ // Exit codes:
9
+ // 0 — clean (or only baselined violations)
10
+ // 1 — ts-morph is not installed
+ // 2 — new violations found (Claude Code reads stderr and re-prompts)
11
+
12
+ import { readFileSync, existsSync, writeFileSync } from "node:fs";
13
+ import { resolve, dirname } from "node:path";
14
+ import { mkdirSync } from "node:fs";
15
+
16
// ts-morph is loaded dynamically so that a missing dependency produces a
// one-line install hint and exit code 1 instead of a module-resolution stack trace.
let Project;
try {
  ({ Project } = await import("ts-morph"));
} catch {
  console.error(
    "ts-morph is not installed. Run `npm install --save-dev ts-morph`.",
  );
  process.exit(1);
}

// All paths are resolved against the directory the command is run from.
const ROOT = process.cwd();
// Harness configuration; the code below reads `cfg.domains`, each entry
// providing `name`, `root` and an ordered `layers` list.
const cfg = JSON.parse(readFileSync(resolve(ROOT, "harness.config.json"), "utf8"));
const baselinePath = resolve(ROOT, ".harness/structural-baseline.json");
// Known violation keys loaded from the baseline file; an empty set when the
// file does not exist yet.
const baseline = existsSync(baselinePath)
  ? new Set(JSON.parse(readFileSync(baselinePath, "utf8")))
  : new Set();

// CLI flag --file <path> scopes the check to one file (used by the hook).
// The path is resolved against ROOT; if the flag is given more than once the
// last occurrence wins.
const args = process.argv.slice(2);
let scopedFile = null;
for (let i = 0; i < args.length; i++) {
  if (args[i] === "--file" && i + 1 < args.length) scopedFile = resolve(ROOT, args[i + 1]);
}
39
+
40
// Classify a source file: which configured domain and layer does it belong to?
//
// filePath: absolute (or ROOT-relative) path of the file.
// Returns { layer, domain } for the first domain in cfg.domains that contains
// the file and the first of that domain's layers found in the path, or null
// when the file is outside every domain or in no layer.
//
// Matching is done on the part of the path BELOW the domain root. Matching on
// the full absolute path (as before) misclassified whole projects whenever a
// parent directory happened to share a layer name — e.g. a checkout under
// `~/repo/` made every file look like it lived in the `repo` layer. The
// domain root must also match on a path-segment boundary, so root `src` no
// longer claims files under `srcfoo/`.
function layerOf(filePath) {
  const toPosix = (p) => String(p).replaceAll("\\", "/");
  const root = toPosix(ROOT).replace(/\/+$/, "");
  const abs = toPosix(filePath);
  const rel = abs.startsWith(`${root}/`) ? abs.slice(root.length + 1) : abs;

  for (const d of cfg.domains) {
    // Normalize the configured root: no leading "./", no trailing "/".
    const domainRoot = toPosix(d.root).replace(/^(\.\/)+/, "").replace(/\/+$/, "");

    // `inside` is the file's path relative to the domain root.
    let inside;
    if (domainRoot === "" || domainRoot === ".") {
      inside = rel;
    } else if (rel === domainRoot) {
      inside = "";
    } else if (rel.startsWith(`${domainRoot}/`)) {
      inside = rel.slice(domainRoot.length + 1);
    } else {
      // Also accept the domain root nested deeper in the tree (e.g. a package
      // directory in a monorepo).
      const at = rel.indexOf(`/${domainRoot}/`);
      if (at === -1) continue;
      inside = rel.slice(at + domainRoot.length + 2);
    }

    // Leading "/" lets a layer directory directly under the domain root match `/${layer}/`.
    const probe = `/${inside}`;
    for (const layer of d.layers) {
      if (probe.includes(`/${layer}/`) || probe.endsWith(`/${layer}.ts`)) {
        return { layer, domain: d };
      }
    }
  }
  return null;
}
55
+
56
// Position of `layer` within the ordered `layers` list, or -1 when it is not listed.
function indexOf(layer, layers) {
  const position = layers.indexOf(layer);
  return position;
}
59
+
60
// ---- scan ----
//
// Load the project (from tsconfig.json when present, otherwise by glob), find
// every import that points from a lower layer to a higher layer of the same
// domain, then either baseline the findings (first run) or report new ones.

const tsconfigPath = resolve(ROOT, "tsconfig.json");
const hasTsconfig = existsSync(tsconfigPath);
const project = new Project({
  tsConfigFilePath: hasTsconfig ? tsconfigPath : undefined,
  skipAddingFilesFromTsConfig: false,
});
if (!hasTsconfig) {
  // Without a tsconfig, glob for sources but keep dependencies out of the scan.
  project.addSourceFilesAtPaths(["**/*.{ts,tsx,mts,cts}", "!**/node_modules/**"]);
}

// Baseline keys are stored relative to ROOT so a committed baseline works on
// any checkout (CI, another machine). Absolute-path keys written by earlier
// versions are still honoured when reading.
const posixRoot = ROOT.replaceAll("\\", "/").replace(/\/+$/, "");
const portable = (p) => (p.startsWith(`${posixRoot}/`) ? p.slice(posixRoot.length + 1) : p);
const scopedPath = scopedFile ? scopedFile.replaceAll("\\", "/") : null;

// Always scan the whole project: the first-run baseline must cover every
// existing violation even when this invocation is scoped to a single file.
const allViolations = [];
for (const sf of project.getSourceFiles()) {
  const sourcePath = sf.getFilePath();
  const src = layerOf(sourcePath);
  if (!src) continue;
  const sourceIdx = indexOf(src.layer, src.domain.layers);

  for (const imp of sf.getImportDeclarations()) {
    const target = imp.getModuleSpecifierSourceFile();
    if (!target) continue;
    const targetPath = target.getFilePath();
    const tgt = layerOf(targetPath);
    if (!tgt || tgt.domain.name !== src.domain.name) continue;
    const targetIdx = indexOf(tgt.layer, tgt.domain.layers);
    // forward-only: source layer index must be >= target layer index
    if (sourceIdx < targetIdx) {
      allViolations.push({
        file: sourcePath,
        line: imp.getStartLineNumber(),
        from: src.layer,
        to: tgt.layer,
        domain: src.domain.name,
        layers: src.domain.layers,
        key: `${portable(sourcePath)}::${portable(targetPath)}`,
        legacyKey: `${sourcePath}::${targetPath}`,
      });
    }
  }
}

// First-run baseline behavior: if no baseline file exists, write the current
// set as the baseline and exit clean. Subsequent runs only block on NEW
// violations. The file is written even when the set is empty; otherwise the
// first violation ever introduced into a clean project would itself be
// baselined on the next run instead of being reported.
if (!existsSync(baselinePath)) {
  mkdirSync(dirname(baselinePath), { recursive: true });
  const keys = [...new Set(allViolations.map((v) => v.key))];
  writeFileSync(baselinePath, JSON.stringify(keys, null, 2) + "\n");
  if (allViolations.length > 0) {
    console.log(
      `✓ structural test: baselined ${allViolations.length} existing violations (.harness/structural-baseline.json).`,
    );
    console.log(
      ` New violations introduced after this point will block. Existing ones can be fixed incrementally.`,
    );
  } else {
    console.log("✓ structural test passed");
  }
  process.exit(0);
}

// New = not baselined (under either key form) and, for scoped runs, in the requested file.
const violations = allViolations.filter(
  (v) =>
    !baseline.has(v.key) &&
    !baseline.has(v.legacyKey) &&
    (scopedPath === null || v.file === scopedPath),
);

if (violations.length === 0) {
  console.log("✓ structural test passed");
  process.exit(0);
}

for (const v of violations) {
  console.error(`✖ ${v.file}:${v.line} layer=${v.from} → ${v.to} (must be forward-only)`);
}
console.error(`\n${violations.length} new layer violation(s). Fix the import direction.`);
// Show the layer order of each domain that actually has a violation, not just the first configured one.
const reportedDomains = new Map(violations.map((v) => [v.domain, v.layers]));
for (const [name, domainLayers] of reportedDomains) {
  console.error(`Layer order for domain "${name}": ${domainLayers?.join(" → ")}`);
}
process.exit(2);
@@ -0,0 +1,59 @@
1
# Nightly (and on-demand) run of the harness eval suite.
#
# Scheduled runs have no inputs: the task set defaults to "quick" and the
# transport is chosen in the "Run eval" step based on whether an API key is set.
name: harness eval (nightly)

on:
  schedule:
    - cron: "0 6 * * *"
  workflow_dispatch:
    inputs:
      set:
        description: "quick (3 tasks) or full (all tasks)"
        type: choice
        options: [quick, full]
        default: quick
      transport:
        description: "claude-cli (real run, costs tokens) or mock (CI smoke-test, free)"
        type: choice
        options: [mock, claude-cli]
        default: mock

permissions:
  contents: read

jobs:
  eval:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "20"
      - run: npm ci || npm install
      # The eval runner spawns a `claude` binary from PATH, so the CLI must be
      # installed globally; `npx ... --version` only runs it from the npx cache
      # and leaves nothing on PATH.
      - name: Install and verify Claude Code CLI
        if: ${{ inputs.transport == 'claude-cli' || (github.event_name == 'schedule') }}
        run: |
          npm install -g @anthropic-ai/claude-code
          claude --version
      - name: Run eval
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          CI: "true"
          # Inputs are passed through the environment rather than being
          # interpolated into the script text.
          SET: ${{ inputs.set || 'quick' }}
          TRANSPORT: ${{ inputs.transport }}
        run: |
          # Default to mock for unattended schedule runs unless ANTHROPIC_API_KEY is set.
          # The key is read from the environment; the secret itself is never
          # expanded into the script.
          if [ -z "$TRANSPORT" ]; then
            if [ -n "$ANTHROPIC_API_KEY" ]; then
              TRANSPORT="claude-cli"
            else
              TRANSPORT="mock"
            fi
          fi
          if [ -f harness.config.json ] && grep -q '"language": "python"' harness.config.json; then
            python -m harness.eval_runner "--$SET" "--transport=$TRANSPORT"
          else
            node harness/eval-runner.mjs "--$SET" "--transport=$TRANSPORT"
          fi
      - uses: actions/upload-artifact@v4
        if: always()
        with:
          name: eval-results
          path: .harness/eval/results/
@@ -0,0 +1,55 @@
1
# Structural (layer-order) check plus a lint pass.
# Runs on pushes and pull requests targeting main, and on manual dispatch.
name: harness

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  workflow_dispatch:

permissions:
  contents: read
  pull-requests: read

jobs:
  structural:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v4
      # Both toolchains are set up unconditionally; the steps below decide
      # which one to use from the files present in the repository.
      - uses: actions/setup-node@v4
        with:
          node-version: "20"
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      # Package manager is picked from the lockfile: pnpm, then yarn, else npm
      # (`npm ci`, falling back to `npm install` when that fails).
      - name: Install Node deps (if package.json present)
        run: |
          if [ -f package.json ]; then
            if [ -f pnpm-lock.yaml ]; then npm i -g pnpm && pnpm install --frozen-lockfile
            elif [ -f yarn.lock ]; then npm i -g yarn && yarn install --frozen-lockfile
            else npm ci || npm install
            fi
          fi
      # Best effort: the two install commands end in `|| true`, so a failed
      # install does not fail this step.
      - name: Install Python deps (if pyproject.toml present)
        run: |
          if [ -f pyproject.toml ]; then
            python -m pip install --upgrade pip
            pip install libcst import-linter ruff || true
            pip install -e '.[dev]' || pip install -e . || true
          fi
      # Python projects are detected by `"language": "python"` in
      # harness.config.json; everything else runs the npm script.
      - name: Structural test
        run: |
          if [ -f harness.config.json ] && grep -q '"language": "python"' harness.config.json; then
            python -m harness.structural_test
          else
            npm run --silent harness:check
          fi
      # Lint is advisory: continue-on-error keeps a lint failure from failing the job.
      - name: Lint
        continue-on-error: true
        run: |
          if [ -f package.json ] && grep -q '"lint"' package.json; then
            npm run --silent lint
          elif command -v ruff >/dev/null 2>&1; then
            ruff check .
          fi
@@ -0,0 +1,56 @@
1
+ # ADR 0001 — Adopt agent-harness-kit
2
+
3
+ - **Status:** accepted
4
+ - **Date:** {{now "yyyy-MM-dd"}}
5
+ - **Deciders:** project owner
6
+
7
+ ## Context
8
+
9
+ This is a single-developer project that uses Claude Code for the bulk of
10
+ implementation work. Agent-driven development without a harness produces
11
+ predictable failure modes:
12
+
13
+ - duplicated helpers across modules
14
+ - backward layer dependencies
15
+ - silent test removal or skip
16
+ - doc drift from code reality
17
+ - unbounded retries and missing timeouts
18
+
19
+ Hand-engineering a preventive measure against each of these failures is achievable but
20
+ slow and easy to forget. A shared starter kit codifies the patterns that
21
+ OpenAI, Stripe, Anthropic, and Mitchell Hashimoto have publicly demonstrated
22
+ work.
23
+
24
+ ## Decision
25
+
26
+ Adopt `agent-harness-kit v{{kitVersion}}` as the harness layer. Specifically:
27
+
28
+ - Use the layer order `{{layersJoined}}` and enforce it via the structural
29
+ test bundled with the kit.
30
+ - Run the PostToolUse + Stop hooks shipped by the kit unmodified.
31
+ - Use the 10 starter skills and 5 reviewer subagents as the baseline; add or
32
+ remove via subsequent ADRs.
33
+ - Run `/garbage-collection` weekly.
34
+
35
+ ## Consequences
36
+
37
+ Positive
38
+
39
+ - Time-to-mistake-fix drops to ~30 seconds (PostToolUse hook).
40
+ - The `feature_list.json` + `PROGRESS.md` pair gives every session a clean
41
+ starting context, regardless of conversation length.
42
+
43
+ Negative
44
+
45
+ - The layer order is opinionated. Some valid architectures (hexagonal,
46
+ vertical-slice) require an ADR override.
47
+ - Kit upgrades introduce sidecar files (`*.harness-new`) that must be
48
+ diffed manually for user-modified files.
49
+
50
+ ## Alternatives considered
51
+
52
+ - **Roll our own.** Rejected: too slow, and the literature converges on the
53
+ same patterns.
54
+ - **Use Claudify (1700 skills, 9 subagents).** Rejected: the over-engineered
55
+ antipattern this kit explicitly avoids.
56
+ - **No harness.** Rejected: see Context.
@@ -0,0 +1,25 @@
1
+ # Agent failures log
2
+
3
+ This is the running log of agent mistakes that triggered a harness
4
+ improvement. Each entry should answer: what happened, what we did to make
5
+ sure it never happens again, and where the prevention now lives.
6
+
7
+ The `/propose-harness-improvement` skill appends entries here automatically.
8
+
9
+ > "Anytime you find an agent makes a mistake, you take the time to engineer
10
+ > a solution such that the agent never makes that mistake again."
11
+ > — Mitchell Hashimoto, _My AI Adoption Journey_ (Feb 5, 2026)
12
+
13
+ ## Format
14
+
15
+ ```
16
+ ### YYYY-MM-DD <slug>
17
+ - **Symptom:** <what went wrong>
18
+ - **Classification:** (a) missing context | (b) missing rule | (c) missing tool/skill | (d) wrong layer
19
+ - **Fix applied:** <what we did>
20
+ - **Fix lives in:** path/or/file
21
+ ```
22
+
23
+ ## Entries
24
+
25
+ _(empty — this file fills up over time as `/propose-harness-improvement` is invoked.)_
@@ -0,0 +1,47 @@
1
+ # Architecture — {{projectName}}
2
+
3
+ This document is the source of truth for how code is organized. Any deviation
4
+ must be justified in an ADR under `docs/adr/`.
5
+
6
+ ## Layer order (forward-only)
7
+
8
+ ```
9
+ {{layersJoined}}
10
+ ```
11
+
12
+ Code in a higher layer may import from any lower layer. Code in a lower layer
13
+ **must not** import from a higher layer. The structural test enforces this
14
+ mechanically — see `harness.config.json` and the
15
+ `{{#if isPython}}python -m harness.structural_test{{else}}npm run harness:check{{/if}}` command.
16
+
17
+ ## Layer responsibilities
18
+
19
+ | Layer | Responsibility |
20
+ | ----------- | --------------------------------------------------------------------------- |
21
+ | `types` | Pure data shapes. No I/O, no business logic, no framework imports. |
22
+ | `config` | Static configuration (env loading, feature flags, constants). |
23
+ | `repo` | Persistence and external-system gateways. Returns plain values. |
24
+ | `service` | Business logic. Orchestrates `repo` calls. Pure where possible. |
25
+ | `runtime` | Framework adapters: HTTP routes, CLI commands, queue handlers. |
26
+ | `ui` | Rendering, components, presentation logic. |
27
+
28
+ ## Cross-cutting concerns: `providers/`
29
+
30
+ Auth, telemetry, feature flags, observability — anything that would otherwise
31
+ cut across layers — enters through `providers/`. Each provider exposes a
32
+ single typed interface; consumers depend on the interface, not the
33
+ implementation.
34
+
35
+ ## Adding a new module
36
+
37
+ 1. Decide which layers it touches.
38
+ 2. Run `/inspect-module <existing-similar-module>` to mirror the pattern.
39
+ 3. Create files under `src/{domain}/{layer}/`.
40
+ 4. Write tests in the same layer.
41
+ 5. Run the structural test. If it fails, do **not** disable it — fix the import.
42
+
43
+ ## Recent decisions
44
+
45
+ (Most recent first. Created automatically by `/add-adr`.)
46
+
47
+ - `0001-use-agent-harness-kit.md` — Adopt agent-harness-kit as the harness layer.