agent-harness-kit 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +27 -0
- package/.claude-plugin/plugin.json +25 -0
- package/LICENSE +21 -0
- package/README.md +165 -0
- package/bin/cli.mjs +261 -0
- package/package.json +64 -0
- package/src/core/detect-stack.mjs +181 -0
- package/src/core/doctor.mjs +106 -0
- package/src/core/patch-package-json.mjs +53 -0
- package/src/core/render-templates.mjs +277 -0
- package/src/core/upgrade.mjs +274 -0
- package/src/templates/.claude/agents/api-consistency-reviewer.md +33 -0
- package/src/templates/.claude/agents/architecture-reviewer.md.hbs +41 -0
- package/src/templates/.claude/agents/performance-reviewer.md +35 -0
- package/src/templates/.claude/agents/reliability-reviewer.md +38 -0
- package/src/templates/.claude/agents/security-reviewer.md +39 -0
- package/src/templates/.claude/hooks/hooks.json.hbs +39 -0
- package/src/templates/.claude/settings.json.hbs +25 -0
- package/src/templates/.claude/skills/add-adr/SKILL.md +60 -0
- package/src/templates/.claude/skills/add-feature/SKILL.md.hbs +50 -0
- package/src/templates/.claude/skills/debug-flow/SKILL.md.hbs +38 -0
- package/src/templates/.claude/skills/doc-drift-scan/SKILL.md +43 -0
- package/src/templates/.claude/skills/eval-runner/SKILL.md +55 -0
- package/src/templates/.claude/skills/garbage-collection/SKILL.md.hbs +49 -0
- package/src/templates/.claude/skills/inspect-app/SKILL.md +57 -0
- package/src/templates/.claude/skills/inspect-module/SKILL.md.hbs +53 -0
- package/src/templates/.claude/skills/propose-harness-improvement/SKILL.md +43 -0
- package/src/templates/.claude/skills/structural-test-author/SKILL.md.hbs +46 -0
- package/src/templates/.claude/skills/write-skill/SKILL.md +39 -0
- package/src/templates/CLAUDE.md.hbs +70 -0
- package/src/templates/_adapter-python/.importlinter +14 -0
- package/src/templates/_adapter-python/harness/__init__.py +0 -0
- package/src/templates/_adapter-python/harness/eval_runner.py +281 -0
- package/src/templates/_adapter-python/harness/structural_test.py +195 -0
- package/src/templates/_adapter-typescript/.dependency-cruiser.cjs +27 -0
- package/src/templates/_adapter-typescript/eslint.config.mjs +38 -0
- package/src/templates/_adapter-typescript/harness/eval-runner.mjs +322 -0
- package/src/templates/_adapter-typescript/harness/structural-test.mjs +125 -0
- package/src/templates/_ci/.github/workflows/eval-nightly.yml +59 -0
- package/src/templates/_ci/.github/workflows/harness.yml +55 -0
- package/src/templates/docs/adr/0001-use-agent-harness-kit.md.hbs +56 -0
- package/src/templates/docs/agent-failures.md +25 -0
- package/src/templates/docs/architecture.md.hbs +47 -0
- package/src/templates/docs/core-beliefs.md.hbs +41 -0
- package/src/templates/docs/golden-principles.md.hbs +80 -0
- package/src/templates/docs/tech-debt-tracker.md +30 -0
- package/src/templates/feature_list.json.hbs +29 -0
- package/src/templates/harness.config.json.hbs +40 -0
- package/src/templates/scripts/dev-up.sh.hbs +51 -0
- package/src/templates/scripts/harness-report.mjs +189 -0
- package/src/templates/scripts/install-git-hooks.sh +18 -0
- package/src/templates/scripts/pre-push.sh +21 -0
- package/src/templates/scripts/precompletion-checklist.sh.hbs +99 -0
- package/src/templates/scripts/structural-test-on-edit.sh.hbs +53 -0
- package/src/templates/scripts/telemetry-on-skill.sh +26 -0
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
// harness/eval-runner.mjs — drive Claude Code through .harness/eval/tasks/*.json
|
|
2
|
+
// and grade each on outcome / process / style / efficiency.
|
|
3
|
+
//
|
|
4
|
+
// Per-task JSONL row goes to .harness/eval/results/<sha>.jsonl. On regression
|
|
5
|
+
// (any task failing in CI), exit 1 so the workflow blocks merge.
|
|
6
|
+
//
|
|
7
|
+
// Transports:
|
|
8
|
+
// --transport=claude-cli spawn `claude -p` and capture stream-json transcript (default)
|
|
9
|
+
// --transport=mock synthetic transcript — use in CI smoke-tests, no API key needed
|
|
10
|
+
//
|
|
11
|
+
// Sets:
|
|
12
|
+
// --quick first 3 tasks (~$0.30, ~2 min on Sonnet)
|
|
13
|
+
// --full all tasks (~$2, ~15 min)
|
|
14
|
+
// --tasks <glob> custom set
|
|
15
|
+
//
|
|
16
|
+
// Usage:
|
|
17
|
+
// node harness/eval-runner.mjs --quick
|
|
18
|
+
// node harness/eval-runner.mjs --full --transport=mock # CI smoke-test
|
|
19
|
+
// node harness/eval-runner.mjs --tasks 01-trivial-endpoint.json
|
|
20
|
+
|
|
21
|
+
import { readFile, writeFile, mkdir, readdir, appendFile } from "node:fs/promises";
|
|
22
|
+
import { existsSync } from "node:fs";
|
|
23
|
+
import { resolve, join, dirname } from "node:path";
|
|
24
|
+
import { spawn, execSync } from "node:child_process";
|
|
25
|
+
import { argv, exit, env, cwd } from "node:process";
|
|
26
|
+
|
|
27
|
+
/**
 * Parse command-line arguments into eval-runner options.
 *
 * Recognised flags:
 *   --quick / --full                      boolean switches
 *   --tasks <v> | --tasks=<v>             stored as `tasksGlob`
 *   --transport <v> | --transport=<v>     stored as `transport`
 *   --out <v> | --out=<v>                 stored as `out`
 *   --help | -h                           print USAGE and exit 0
 *
 * A value flag with no value (end of argv, or followed by another `--flag`)
 * is reported and the process exits with code 2 rather than silently storing
 * `undefined` — a bare `--tasks` used to fall through to running every task.
 * Unknown arguments are still ignored, but now with a warning on stderr.
 *
 * @param {string[]} argv - Full process argv; parsing starts at index 2.
 * @returns {{quick: boolean, full: boolean, tasksGlob: (string|null), transport: string, out: (string|null)}}
 */
function parseArgs(argv) {
  const opts = {
    quick: false,
    full: false,
    tasksGlob: null,
    transport: "claude-cli",
    out: null,
  };
  // Flags that carry a value, mapped to the option key they populate.
  const valueFlags = new Map([
    ["--tasks", "tasksGlob"],
    ["--transport", "transport"],
    ["--out", "out"],
  ]);

  for (let i = 2; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === "--quick") {
      opts.quick = true;
      continue;
    }
    if (arg === "--full") {
      opts.full = true;
      continue;
    }
    if (arg === "--help" || arg === "-h") {
      console.log(USAGE);
      exit(0);
    }

    // Split `--flag=value` on the FIRST "=" so values may contain "=".
    const eq = arg.indexOf("=");
    const flag = eq === -1 ? arg : arg.slice(0, eq);
    const key = valueFlags.get(flag);
    if (key === undefined) {
      console.error(`Ignoring unknown argument: ${arg}`);
      continue;
    }
    if (eq !== -1) {
      opts[key] = arg.slice(eq + 1);
      continue;
    }
    const value = argv[i + 1];
    if (value === undefined || value.startsWith("--")) {
      console.error(`Missing value for ${flag}`);
      exit(2);
    }
    opts[key] = value;
    i++; // the value has been consumed
  }
  return opts;
}
|
|
52
|
+
|
|
53
|
+
// Help text printed by `--help` / `-h`. Lists every flag parseArgs accepts.
const USAGE = `Usage: node harness/eval-runner.mjs [--quick|--full|--tasks <glob>] [--transport <name>] [--out <path>]

Task sets:
  --quick               first 3 tasks
  --full                all tasks (also the default when no set is given)
  --tasks <glob>        only tasks whose file name contains or matches <glob>

Transports:
  claude-cli (default)  spawn \`claude -p\` and capture stream-json
  mock                  synthetic transcript for CI smoke-tests

Output:
  --out <path>          JSONL results file (default: .harness/eval/results/<sha>.jsonl)

See PUBLISHING.md for token budget and cost notes.`;
|
|
60
|
+
|
|
61
|
+
/**
 * Load task definitions from `.harness/eval/tasks/*.json` under the current
 * working directory, sorted by file name.
 *
 * Selection:
 *   - `opts.tasksGlob`: keep files whose name equals or contains the pattern;
 *     when the pattern contains `*` or `?` it is also matched as a glob
 *     against the whole file name.
 *   - otherwise `opts.quick`: keep the first three files.
 *   - otherwise: keep everything.
 *
 * Exits the process with code 1 when the tasks directory does not exist.
 *
 * @param {{tasksGlob?: (string|null), quick?: boolean}} opts
 * @returns {Promise<object[]>} Parsed task objects, each with `_file` set to
 *   the absolute path it was read from.
 * @throws {Error} When a task file cannot be read or is not valid JSON; the
 *   message names the file and the original error is kept as `cause`.
 */
async function loadTasks(opts) {
  const dir = resolve(cwd(), ".harness/eval/tasks");
  if (!existsSync(dir)) {
    console.error(`No tasks directory at ${dir}. Run \`agent-harness-kit init\` first.`);
    exit(1);
  }

  let files = (await readdir(dir)).filter((f) => f.endsWith(".json")).sort();
  if (opts.tasksGlob) {
    const pattern = opts.tasksGlob;
    // Translate `*` / `?` wildcards into an anchored RegExp; every other
    // regex metacharacter is escaped so it matches literally.
    const wildcard = /[*?]/.test(pattern)
      ? new RegExp(
          `^${pattern
            .replace(/[.+^${}()|[\]\\]/g, "\\$&")
            .replace(/\*/g, ".*")
            .replace(/\?/g, ".")}$`,
        )
      : null;
    files = files.filter(
      (f) => f === pattern || f.includes(pattern) || (wildcard !== null && wildcard.test(f)),
    );
  } else if (opts.quick) {
    files = files.slice(0, 3);
  }

  const tasks = [];
  for (const f of files) {
    const file = join(dir, f);
    let task;
    try {
      task = JSON.parse(await readFile(file, "utf8"));
    } catch (err) {
      // Name the offending file: a bare SyntaxError is useless with many tasks.
      throw new Error(`Failed to load task ${file}: ${err.message}`, { cause: err });
    }
    tasks.push({ ...task, _file: file });
  }
  return tasks;
}
|
|
80
|
+
|
|
81
|
+
// ---- transports ----
|
|
82
|
+
|
|
83
|
+
// Transport registry: each entry takes a task and resolves to
// `{ events, stderr }`, where `events` is the flat transcript graders consume.
const TRANSPORTS = {
  // Real driver: spawn `claude -p` with stream-json output and flatten the
  // wire format into the same shape the mock transport produces (so the
  // graders don't have to know about both shapes).
  //
  // Real wire format (Claude Code 2.1.x):
  //   {type:"assistant", message:{content:[{type:"tool_use", name, input}]}}
  //   {type:"user", message:{content:[{type:"tool_result", ...}]}}
  //   {type:"result", usage:{input_tokens, output_tokens, cache_*}, total_cost_usd}
  //
  // Flat shape graders consume:
  //   {type:"tool_use", tool:<name>, path:<input.file_path|input.path>}
  //   {type:"token_usage", total:<sum of all token fields>}
  //
  // Rejects when the process cannot be spawned or ends with a non-zero code.
  "claude-cli": (task) =>
    new Promise((resolveRun, rejectRun) => {
      const proc = spawn(
        "claude",
        ["-p", task.input, "--output-format", "stream-json", "--verbose", "--max-turns", "20"],
        { stdio: ["ignore", "pipe", "pipe"] },
      );
      // Decode as UTF-8 at the stream level so a multi-byte character split
      // across two chunks is not corrupted.
      proc.stdout.setEncoding("utf8");
      proc.stderr.setEncoding("utf8");

      const events = [];
      let stderr = "";
      let buf = "";

      const ingest = (raw) => {
        // Always keep the raw event for debugging.
        events.push({ raw, type: raw.type });
        // Flatten tool_use blocks from assistant messages.
        if (raw.type === "assistant" && raw.message?.content) {
          for (const block of raw.message.content) {
            if (block.type !== "tool_use") continue;
            // /skill invocations come in as the Skill tool with input.skill.
            if (block.name === "Skill" && block.input?.skill) {
              events.push({ type: "tool_use", tool: block.input.skill });
            }
            const filePath = block.input?.file_path ?? block.input?.path ?? null;
            events.push({ type: "tool_use", tool: block.name, path: filePath });
          }
        }
        // Final result has aggregated usage.
        if (raw.type === "result" && raw.usage) {
          const u = raw.usage;
          const total =
            (u.input_tokens ?? 0) +
            (u.output_tokens ?? 0) +
            (u.cache_creation_input_tokens ?? 0) +
            (u.cache_read_input_tokens ?? 0);
          events.push({ type: "token_usage", total });
        }
      };

      const ingestLine = (line) => {
        if (!line.trim()) return;
        try {
          ingest(JSON.parse(line));
        } catch {
          /* non-JSON or unexpectedly shaped line (rare) — ignore */
        }
      };

      proc.stdout.on("data", (chunk) => {
        buf += chunk;
        const lines = buf.split("\n");
        buf = lines.pop() ?? ""; // keep the incomplete tail for the next chunk
        for (const line of lines) ingestLine(line);
      });
      proc.stderr.on("data", (chunk) => {
        stderr += chunk;
      });
      proc.on("error", rejectRun);
      // Wait for "close", not "exit": "exit" can fire while stdout still has
      // buffered data, which would drop the final `result` event.
      proc.on("close", (code, signal) => {
        // Flush a last line that arrived without a trailing newline.
        ingestLine(buf);
        buf = "";
        if (code !== 0) {
          const how = code ?? `signal ${signal}`;
          return rejectRun(new Error(`claude exited ${how}: ${stderr.slice(0, 500)}`));
        }
        resolveRun({ events, stderr });
      });
    }),

  // Mock transport — produces a synthetic transcript that satisfies the
  // default expectations of the shipped tasks. Used in CI to verify the
  // driver shape end-to-end without burning API tokens.
  mock: async (task) => {
    const expected = task.expected ?? {};
    const events = [];
    for (const skill of expected.skillsInvoked ?? []) {
      events.push({ type: "tool_use", tool: skill });
    }
    const minFiles = expected.filesChanged?.min ?? 1;
    for (let i = 0; i < minFiles; i++) {
      events.push({ type: "tool_use", tool: "Write", path: `src/mock-${i}.ts` });
    }
    events.push({
      type: "token_usage",
      total: Math.min(expected.tokensMax ?? 5000, 5000),
    });
    return { events, stderr: "" };
  },
};
|
|
186
|
+
|
|
187
|
+
// ---- graders ----
|
|
188
|
+
|
|
189
|
+
/**
 * Outcome grader: when the task expects the structural test to pass, run
 * `npm run --silent harness:check` and score 1 if it exits cleanly, 0 if the
 * command throws.
 *
 * @param {object} task - Task definition; reads `task.expected.structuralTest`.
 * @returns {{dim: string, score: (number|null), info: string}} `score` is
 *   null when the task sets no structural-test expectation.
 */
function gradeOutcome(task) {
  const expectsPass = task.expected?.structuralTest === "pass";
  if (!expectsPass) {
    return { dim: "outcome", score: null, info: "no expectation" };
  }

  let succeeded = true;
  try {
    execSync("npm run --silent harness:check", { stdio: "ignore" });
  } catch {
    succeeded = false;
  }

  return succeeded
    ? { dim: "outcome", score: 1, info: "structural test passed" }
    : { dim: "outcome", score: 0, info: "structural test failed" };
}
|
|
200
|
+
|
|
201
|
+
/**
 * Process grader: checks that every skill listed in
 * `task.expected.skillsInvoked` appears as the `tool` of some `tool_use`
 * event in the transcript.
 *
 * @param {object} task - Task definition.
 * @param {{events: object[]}} transcript - Flattened transcript.
 * @returns {{dim: string, score: (number|null), info?: string}} `score` is
 *   null (and `info` absent) when the task lists no expected skills.
 */
function gradeProcess(task, transcript) {
  const wanted = task.expected?.skillsInvoked ?? [];
  if (wanted.length === 0) return { dim: "process", score: null };

  // Collect the name of every tool that was used at least once.
  const seen = new Set();
  for (const event of transcript.events) {
    if (event.type === "tool_use") seen.add(event.tool);
  }

  const absent = wanted.filter((skill) => !seen.has(skill));
  if (absent.length > 0) {
    return { dim: "process", score: 0, info: `missing skills: ${absent.join(", ")}` };
  }
  return { dim: "process", score: 1, info: "all expected skills invoked" };
}
|
|
217
|
+
|
|
218
|
+
/**
 * Style grader: counts the distinct file paths touched by Write / Edit /
 * MultiEdit tool calls and checks the count against
 * `task.expected.filesChanged` (`{min, max}`).
 *
 * Either bound may be omitted: a missing `min` means 0 and a missing `max`
 * means unbounded. Comparing against `undefined` previously failed every
 * task that set only one bound.
 *
 * @param {object} task - Task definition.
 * @param {{events: object[]}} transcript - Flattened transcript.
 * @returns {{dim: string, score: (number|null), info?: string}} `score` is
 *   null (and `info` absent) when the task sets no `filesChanged` expectation.
 */
function gradeStyle(task, transcript) {
  const range = task.expected?.filesChanged;
  if (!range) return { dim: "style", score: null };

  const min = range.min ?? 0;
  const max = range.max ?? Infinity;

  const writeTools = new Set(["Write", "Edit", "MultiEdit"]);
  const touched = new Set();
  for (const event of transcript.events) {
    // Events without a path (null / missing) do not count as a changed file.
    if (event.type === "tool_use" && writeTools.has(event.tool) && event.path) {
      touched.add(event.path);
    }
  }

  const distinct = touched.size;
  const ok = distinct >= min && distinct <= max;
  const upper = max === Infinity ? "∞" : max;
  return {
    dim: "style",
    score: ok ? 1 : 0,
    info: `${distinct} files changed (expected ${min}-${upper})`,
  };
}
|
|
232
|
+
|
|
233
|
+
/**
 * Efficiency grader: sums `total` over all `token_usage` events and compares
 * the sum with `task.expected.tokensMax`.
 *
 * A transcript with no `token_usage` event at all (for example a run whose
 * transport failed) scores 0 instead of being counted as "0 tokens", which
 * used to pass any cap.
 *
 * @param {object} task - Task definition.
 * @param {{events: object[]}} transcript - Flattened transcript.
 * @returns {{dim: string, score: (number|null), info?: string}} `score` is
 *   null (and `info` absent) when the task sets no (or a zero) token cap.
 */
function gradeEfficiency(task, transcript) {
  const cap = task.expected?.tokensMax;
  if (!cap) return { dim: "efficiency", score: null };

  const usage = transcript.events.filter((e) => e.type === "token_usage");
  if (usage.length === 0) {
    return { dim: "efficiency", score: 0, info: `no token usage recorded (cap ${cap})` };
  }

  const tokens = usage.reduce((sum, e) => sum + (e.total ?? 0), 0);
  return {
    dim: "efficiency",
    score: tokens <= cap ? 1 : 0,
    info: `${tokens} tokens (cap ${cap})`,
  };
}
|
|
245
|
+
|
|
246
|
+
/**
 * Short commit hash of HEAD, used to name the results file.
 *
 * @returns {string} Trimmed output of `git rev-parse --short HEAD`, or the
 *   literal "no-git" when the command fails.
 */
function gitSha() {
  let output;
  try {
    // stderr is discarded so a missing repository does not print git's error.
    output = execSync("git rev-parse --short HEAD", {
      stdio: ["ignore", "pipe", "ignore"],
    });
  } catch {
    return "no-git";
  }
  return output.toString().trim();
}
|
|
255
|
+
|
|
256
|
+
/**
 * Run every selected task through the chosen transport, grade it, and append
 * one JSON row per task to the results file.
 *
 * A task passes only when its transport completed, at least one grader
 * produced a score, and every produced score is 1. A transport failure is
 * logged to stderr and recorded on the row as `error`; previously it was
 * swallowed and the task was graded as an ordinary empty transcript.
 *
 * Exits the process with code 2 when `opts.transport` names no transport.
 *
 * @param {object} [opts]
 * @param {string} [opts.transport="claude-cli"] - Key into TRANSPORTS.
 * @param {string|null} [opts.out] - Results file; defaults to
 *   `.harness/eval/results/<sha>.jsonl` under the current working directory.
 * @param {boolean} [opts.quick] - Forwarded to loadTasks.
 * @param {string|null} [opts.tasksGlob] - Forwarded to loadTasks.
 * @returns {Promise<{results: object[], passed: number, outPath?: string, sha?: string}>}
 *   `outPath` and `sha` are absent when no task matched.
 */
export async function runEval(opts = {}) {
  const tasks = await loadTasks(opts);
  if (tasks.length === 0) {
    console.error("No tasks matched.");
    return { results: [], passed: 0 };
  }

  // Own-property lookup only, so names like "constructor" are not mistaken
  // for a transport.
  const transportName = opts.transport ?? "claude-cli";
  const transport = Object.hasOwn(TRANSPORTS, transportName)
    ? TRANSPORTS[transportName]
    : undefined;
  if (!transport) {
    console.error(
      `Unknown transport: ${opts.transport}. Try: ${Object.keys(TRANSPORTS).join(", ")}`,
    );
    exit(2);
  }

  const sha = gitSha();
  const outPath = opts.out ?? resolve(cwd(), `.harness/eval/results/${sha}.jsonl`);
  await mkdir(dirname(outPath), { recursive: true });

  const results = [];
  for (const task of tasks) {
    let transcript;
    let transportError = null;
    try {
      transcript = await transport(task);
    } catch (err) {
      transportError = err?.message ?? String(err);
      console.error(`Task ${task.id}: transport failed: ${transportError}`);
      // Graders still run on an empty transcript so the row keeps its shape.
      transcript = { events: [], stderr: transportError };
    }

    const grades = [
      gradeOutcome(task),
      gradeProcess(task, transcript),
      gradeStyle(task, transcript),
      gradeEfficiency(task, transcript),
    ].filter((g) => g.score !== null);

    const passed =
      transportError === null && grades.length > 0 && grades.every((g) => g.score === 1);
    const row = {
      taskId: task.id,
      sha,
      ts: new Date().toISOString(),
      grades,
      passed,
      // Only present on failed runs, so successful rows keep their old shape.
      ...(transportError === null ? {} : { error: transportError }),
    };
    results.push(row);
    await appendFile(outPath, JSON.stringify(row) + "\n");
  }

  return { results, passed: results.filter((r) => r.passed).length, outPath, sha };
}
|
|
303
|
+
|
|
304
|
+
/**
 * Print a human-readable summary of an eval run to stdout: one header line,
 * then one line per task and one line per grade.
 *
 * `sha` and `outPath` are optional because runEval omits them when no task
 * matched; they are left out of the header instead of printing "undefined".
 *
 * @param {{results: object[], passed: number, outPath?: string, sha?: string}} summary
 * @returns {void}
 */
function summarize({ results, passed, outPath, sha }) {
  const runLabel = sha === undefined ? "" : ` ${sha}`;
  const location = outPath === undefined ? "no results written" : outPath;
  console.log(`\nEval run${runLabel} — ${passed}/${results.length} passed (${location})`);

  for (const r of results) {
    const mark = r.passed ? "✓" : "✗";
    console.log(`  ${mark} ${r.taskId}`);
    for (const g of r.grades) {
      const m = g.score === 1 ? "✓" : "✗";
      const detail = g.info === undefined ? "" : `: ${g.info}`;
      console.log(`    ${m} ${g.dim}${detail}`);
    }
  }
}
|
|
315
|
+
|
|
316
|
+
// CLI entry — only runs when invoked directly, not when imported by tests.
// `argv[1]` is a filesystem path while `import.meta.url` is a percent-encoded
// file URL, so the path is converted with pathToFileURL before comparing. A
// hand-built `file://${argv[1]}` never matches on Windows or when the path
// contains characters that URLs escape (spaces, "#", non-ASCII, ...).
if (argv[1] && import.meta.url === (await import("node:url")).pathToFileURL(argv[1]).href) {
  const opts = parseArgs(argv);
  const summary = await runEval(opts);
  summarize(summary);
  // In CI any failing task blocks the workflow.
  if (env.CI === "true" && summary.passed < summary.results.length) exit(1);
}
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
// harness/structural-test.mjs — forward-only layer enforcement.
|
|
2
|
+
//
|
|
3
|
+
// Reads harness.config.json. For each domain, parses every source file's
|
|
4
|
+
// imports (via ts-morph) and asserts that no import goes "backward" through
|
|
5
|
+
// the layer order. Violations already present in existing code are baselined into
|
|
6
|
+
// .harness/structural-baseline.json on first run.
|
|
7
|
+
//
|
|
8
|
+
// Exit codes:
|
|
9
|
+
// 0 — clean (or only baselined violations)
|
|
10
|
+
// 2 — new violations found (Claude Code reads stderr and re-prompts)
|
|
11
|
+
|
|
12
|
+
import { readFileSync, existsSync, writeFileSync } from "node:fs";
|
|
13
|
+
import { resolve, dirname } from "node:path";
|
|
14
|
+
import { mkdirSync } from "node:fs";
|
|
15
|
+
|
|
16
|
+
// Load ts-morph dynamically so that a missing dev dependency produces an
// actionable message and exit code 1 rather than a module-resolution trace.
let Project;
try {
  const tsMorph = await import("ts-morph");
  Project = tsMorph.Project;
} catch {
  console.error("ts-morph is not installed. Run `npm install --save-dev ts-morph`.");
  process.exit(1);
}
|
|
25
|
+
|
|
26
|
+
const ROOT = process.cwd();

// Read and parse a JSON file; on failure print a one-line explanation and
// exit with code 1 instead of crashing with a raw stack trace.
function readJsonOrExit(filePath, label) {
  try {
    return JSON.parse(readFileSync(filePath, "utf8"));
  } catch (err) {
    console.error(`structural test: cannot read ${label} at ${filePath}: ${err.message}`);
    process.exit(1);
  }
}

const cfg = readJsonOrExit(resolve(ROOT, "harness.config.json"), "harness config");
const baselinePath = resolve(ROOT, ".harness/structural-baseline.json");
// Keys of violations that are already accepted; empty when no baseline exists.
const baseline = existsSync(baselinePath)
  ? new Set(readJsonOrExit(baselinePath, "structural baseline"))
  : new Set();

// CLI flag --file <path> (or --file=<path>) scopes the check to one file
// (used by the hook). When given more than once, the last one wins.
const args = process.argv.slice(2);
let scopedFile = null;
for (let i = 0; i < args.length; i++) {
  let value = null;
  if (args[i] === "--file" && i + 1 < args.length) value = args[i + 1];
  else if (args[i].startsWith("--file=")) value = args[i].slice("--file=".length);
  if (value === null) continue;

  const absolute = resolve(ROOT, value);
  // The scoped path is compared with ts-morph's getFilePath(), which is
  // documented to use "/" separators; on Windows resolve() yields "\", so
  // normalise or the scoped check would silently match nothing.
  scopedFile = process.platform === "win32" ? absolute.replace(/\\/g, "/") : absolute;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Classify a source file into a configured domain and layer.
 *
 * The file path is first reduced to the part below the project ROOT (when it
 * lives there) and then to the part below the domain root, and only that
 * remainder is searched for a layer name. Searching the whole absolute path,
 * as before, let an ancestor directory such as `/home/me/config/project`
 * classify every file as the `config` layer, and a bare prefix test let the
 * root `src` also claim `src-old/`.
 *
 * A layer matches when the remainder contains a `/<layer>/` directory or ends
 * in a `<layer>.ts|.tsx|.mts|.cts` file. Layers are tried in configured
 * order, and domains in configured order.
 *
 * @param {string} filePath - Absolute (or project-relative) path using "/".
 * @returns {{layer: string, domain: object} | null} The first matching layer
 *   and its domain config entry, or null when nothing matches.
 */
function layerOf(filePath) {
  const underRoot = filePath === ROOT || filePath.startsWith(`${ROOT}/`);
  const rel = underRoot ? filePath.slice(ROOT.length + 1) : filePath;
  // A leading "/" lets every path segment be matched uniformly as "/name/".
  const haystack = rel.startsWith("/") ? rel : `/${rel}`;
  const layerFileExts = [".ts", ".tsx", ".mts", ".cts"];

  for (const d of cfg.domains) {
    // Tolerate "./src" and "src/" spellings of the domain root.
    const root = String(d.root ?? "").replace(/^\.\//, "").replace(/\/+$/, "");

    let inside;
    if (root === "" || root === ".") {
      inside = haystack; // the domain spans the whole project
    } else {
      const at = haystack.indexOf(`/${root}/`);
      if (at === -1) continue;
      inside = haystack.slice(at + root.length + 1); // starts with "/"
    }

    for (const layer of d.layers) {
      const isLayerDir = inside.includes(`/${layer}/`);
      const isLayerFile = layerFileExts.some((ext) => inside.endsWith(`/${layer}${ext}`));
      if (isLayerDir || isLayerFile) {
        return { layer, domain: d };
      }
    }
  }
  return null;
}
|
|
55
|
+
|
|
56
|
+
/**
 * Position of `layer` in the ordered `layers` list.
 *
 * @param {string} layer - Layer name to look up.
 * @param {string[]} layers - Layer names in configured order.
 * @returns {number} Zero-based index, or -1 when `layer` is not listed.
 */
function indexOf(layer, layers) {
  const position = layers.indexOf(layer);
  return position;
}
|
|
59
|
+
|
|
60
|
+
// Build the ts-morph project. With a tsconfig.json the file list comes from
// it; without one, fall back to globbing TypeScript sources under the
// current directory.
const tsConfigPath = resolve(ROOT, "tsconfig.json");
const hasTsConfig = existsSync(tsConfigPath);
const project = new Project({
  tsConfigFilePath: hasTsConfig ? tsConfigPath : undefined,
  skipAddingFilesFromTsConfig: false,
});
if (!hasTsConfig) {
  // Exclude node_modules: the bare "**" glob would otherwise load every
  // TypeScript file shipped by dependencies.
  project.addSourceFilesAtPaths(["**/*.{ts,tsx,mts,cts}", "!**/node_modules/**"]);
}
|
|
69
|
+
|
|
70
|
+
// Express a path relative to ROOT when it lives under it, so baseline keys
// are identical on every machine and in CI.
const toRepoRelative = (p) => (p.startsWith(`${ROOT}/`) ? p.slice(ROOT.length + 1) : p);

// Scan every (in-scope) source file and collect imports that point to a
// later layer of the same domain and are not already baselined.
const violations = [];
for (const sf of project.getSourceFiles()) {
  const sourcePath = sf.getFilePath();
  if (scopedFile && sourcePath !== scopedFile) continue;
  const src = layerOf(sourcePath);
  if (!src) continue; // file belongs to no configured layer
  const sourceIdx = indexOf(src.layer, src.domain.layers);

  for (const imp of sf.getImportDeclarations()) {
    const target = imp.getModuleSpecifierSourceFile();
    if (!target) continue; // import did not resolve to a project source file
    const targetPath = target.getFilePath();
    const tgt = layerOf(targetPath);
    // Only imports within the same domain are subject to the layer order.
    if (!tgt || tgt.domain.name !== src.domain.name) continue;
    const targetIdx = indexOf(tgt.layer, tgt.domain.layers);
    // forward-only: source layer index must be >= target layer index
    if (sourceIdx < targetIdx) {
      const key = `${toRepoRelative(sourcePath)}::${toRepoRelative(targetPath)}`;
      // Baselines written by earlier versions stored absolute paths; keep
      // honouring them so existing baselines are not invalidated.
      const legacyKey = `${sourcePath}::${targetPath}`;
      if (baseline.has(key) || baseline.has(legacyKey)) continue;
      violations.push({
        file: sourcePath,
        line: imp.getStartLineNumber(),
        from: src.layer,
        to: tgt.layer,
        domain: src.domain.name,
        key,
      });
    }
  }
}
|
|
99
|
+
|
|
100
|
+
// First-run baseline behavior: if no baseline file exists, write the current
// set as the baseline and exit clean. Subsequent runs only block on NEW
// violations.
//
// Two guards apply:
//  - A `--file` scoped run never writes the baseline. It has seen only one
//    file, so baselining would accept the violation that was just introduced
//    and omit every other existing one.
//  - A clean first run writes an EMPTY baseline. Otherwise the baseline stays
//    absent and the first violations added later would be treated as
//    "existing" and silently accepted.
const hasBaseline = existsSync(baselinePath);
if (!hasBaseline && !scopedFile) {
  mkdirSync(dirname(baselinePath), { recursive: true });
  writeFileSync(baselinePath, JSON.stringify(violations.map((v) => v.key), null, 2) + "\n");
  if (violations.length > 0) {
    console.log(
      `✓ structural test: baselined ${violations.length} existing violations (.harness/structural-baseline.json).`,
    );
    console.log(
      `  New violations introduced after this point will block. Existing ones can be fixed incrementally.`,
    );
    process.exit(0);
  }
}

if (violations.length === 0) {
  console.log("✓ structural test passed");
  process.exit(0);
}

for (const v of violations) {
  console.error(`✖ ${v.file}:${v.line} layer=${v.from} → ${v.to} (must be forward-only)`);
}
console.error(`\n${violations.length} new layer violation(s). Fix the import direction.`);
// Print the layer order of every domain that has a violation, not just the
// first configured domain.
const offendingDomains = new Set(violations.map((v) => v.domain));
for (const d of cfg.domains) {
  if (!offendingDomains.has(d.name)) continue;
  console.error(`Layer order for domain "${d.name}": ${d.layers?.join(" → ")}`);
}
process.exit(2);
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
name: harness eval (nightly)

on:
  schedule:
    - cron: "0 6 * * *"
  workflow_dispatch:
    inputs:
      set:
        description: "quick (3 tasks) or full (all tasks)"
        type: choice
        options: [quick, full]
        default: quick
      transport:
        description: "claude-cli (real run, costs tokens) or mock (CI smoke-test, free)"
        type: choice
        options: [mock, claude-cli]
        default: mock

permissions:
  contents: read

jobs:
  eval:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "20"
      - run: npm ci || npm install
      # The eval runner spawns the `claude` binary, so it has to be installed
      # on PATH; `npx ... --version` alone does not leave it there.
      - name: Install Claude Code CLI
        if: ${{ inputs.transport == 'claude-cli' || (github.event_name == 'schedule') }}
        run: |
          npm install -g @anthropic-ai/claude-code
          claude --version
      - name: Run eval
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          CI: "true"
          # Inputs are passed through the environment rather than being
          # interpolated into the script text.
          INPUT_SET: ${{ inputs.set || 'quick' }}
          INPUT_TRANSPORT: ${{ inputs.transport }}
        run: |
          SET="$INPUT_SET"
          # Default to mock for unattended schedule runs unless ANTHROPIC_API_KEY is set.
          TRANSPORT="$INPUT_TRANSPORT"
          if [ -z "$TRANSPORT" ]; then
            if [ -n "$ANTHROPIC_API_KEY" ]; then
              TRANSPORT="claude-cli"
            else
              TRANSPORT="mock"
            fi
          fi
          if [ -f harness.config.json ] && grep -q '"language": "python"' harness.config.json; then
            python -m harness.eval_runner "--$SET" "--transport=$TRANSPORT"
          else
            node harness/eval-runner.mjs "--$SET" "--transport=$TRANSPORT"
          fi
      - uses: actions/upload-artifact@v4
        if: always()
        with:
          name: eval-results
          path: .harness/eval/results/
          # `.harness` is a hidden directory; upload-artifact v4 skips hidden
          # files unless this is set.
          include-hidden-files: true
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
name: harness
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
workflow_dispatch:
|
|
9
|
+
|
|
10
|
+
permissions:
|
|
11
|
+
contents: read
|
|
12
|
+
pull-requests: read
|
|
13
|
+
|
|
14
|
+
jobs:
|
|
15
|
+
structural:
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
timeout-minutes: 5
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v4
|
|
20
|
+
- uses: actions/setup-node@v4
|
|
21
|
+
with:
|
|
22
|
+
node-version: "20"
|
|
23
|
+
- uses: actions/setup-python@v5
|
|
24
|
+
with:
|
|
25
|
+
python-version: "3.12"
|
|
26
|
+
- name: Install Node deps (if package.json present)
|
|
27
|
+
run: |
|
|
28
|
+
if [ -f package.json ]; then
|
|
29
|
+
if [ -f pnpm-lock.yaml ]; then npm i -g pnpm && pnpm install --frozen-lockfile
|
|
30
|
+
elif [ -f yarn.lock ]; then npm i -g yarn && yarn install --frozen-lockfile
|
|
31
|
+
else npm ci || npm install
|
|
32
|
+
fi
|
|
33
|
+
fi
|
|
34
|
+
- name: Install Python deps (if pyproject.toml present)
|
|
35
|
+
run: |
|
|
36
|
+
if [ -f pyproject.toml ]; then
|
|
37
|
+
python -m pip install --upgrade pip
|
|
38
|
+
pip install libcst import-linter ruff || true
|
|
39
|
+
pip install -e '.[dev]' || pip install -e . || true
|
|
40
|
+
fi
|
|
41
|
+
- name: Structural test
|
|
42
|
+
run: |
|
|
43
|
+
if [ -f harness.config.json ] && grep -q '"language": "python"' harness.config.json; then
|
|
44
|
+
python -m harness.structural_test
|
|
45
|
+
else
|
|
46
|
+
npm run --silent harness:check
|
|
47
|
+
fi
|
|
48
|
+
- name: Lint
|
|
49
|
+
continue-on-error: true
|
|
50
|
+
run: |
|
|
51
|
+
if [ -f package.json ] && grep -q '"lint"' package.json; then
|
|
52
|
+
npm run --silent lint
|
|
53
|
+
elif command -v ruff >/dev/null 2>&1; then
|
|
54
|
+
ruff check .
|
|
55
|
+
fi
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# ADR 0001 — Adopt agent-harness-kit
|
|
2
|
+
|
|
3
|
+
- **Status:** accepted
|
|
4
|
+
- **Date:** {{now "yyyy-MM-dd"}}
|
|
5
|
+
- **Deciders:** project owner
|
|
6
|
+
|
|
7
|
+
## Context
|
|
8
|
+
|
|
9
|
+
This is a single-developer project that uses Claude Code for the bulk of
|
|
10
|
+
implementation work. Agent-driven development without a harness produces
|
|
11
|
+
predictable failure modes:
|
|
12
|
+
|
|
13
|
+
- duplicated helpers across modules
|
|
14
|
+
- backward layer dependencies
|
|
15
|
+
- silent test removal or skip
|
|
16
|
+
- doc drift from code reality
|
|
17
|
+
- unbounded retries and missing timeouts
|
|
18
|
+
|
|
19
|
+
Hand-engineering a preventive measure for each of these failures is achievable but
|
|
20
|
+
slow and easy to forget. A shared starter kit codifies the patterns that
|
|
21
|
+
OpenAI, Stripe, Anthropic, and Mitchell Hashimoto have publicly demonstrated
|
|
22
|
+
work.
|
|
23
|
+
|
|
24
|
+
## Decision
|
|
25
|
+
|
|
26
|
+
Adopt `agent-harness-kit v{{kitVersion}}` as the harness layer. Specifically:
|
|
27
|
+
|
|
28
|
+
- Use the layer order `{{layersJoined}}` and enforce it via the structural
|
|
29
|
+
test bundled with the kit.
|
|
30
|
+
- Run the PostToolUse + Stop hooks shipped by the kit unmodified.
|
|
31
|
+
- Use the 11 starter skills and 5 reviewer subagents as the baseline; add or
|
|
32
|
+
remove via subsequent ADRs.
|
|
33
|
+
- Run `/garbage-collection` weekly.
|
|
34
|
+
|
|
35
|
+
## Consequences
|
|
36
|
+
|
|
37
|
+
Positive
|
|
38
|
+
|
|
39
|
+
- Time-to-mistake-fix drops to ~30 seconds (PostToolUse hook).
|
|
40
|
+
- The `feature_list.json` + `PROGRESS.md` pair gives every session a clean
|
|
41
|
+
starting context, regardless of conversation length.
|
|
42
|
+
|
|
43
|
+
Negative
|
|
44
|
+
|
|
45
|
+
- The layer order is opinionated. Some valid architectures (hexagonal,
|
|
46
|
+
vertical-slice) require an ADR override.
|
|
47
|
+
- Kit upgrades introduce sidecar files (`*.harness-new`) that must be
|
|
48
|
+
diffed manually for user-modified files.
|
|
49
|
+
|
|
50
|
+
## Alternatives considered
|
|
51
|
+
|
|
52
|
+
- **Roll our own.** Rejected: too slow, and the literature converges on the
|
|
53
|
+
same patterns.
|
|
54
|
+
- **Use Claudify (1700 skills, 9 subagents).** Rejected: the over-engineered
|
|
55
|
+
antipattern this kit explicitly avoids.
|
|
56
|
+
- **No harness.** Rejected: see Context.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Agent failures log
|
|
2
|
+
|
|
3
|
+
This is the running log of agent mistakes that triggered a harness
|
|
4
|
+
improvement. Each entry should answer: what happened, what we did to make
|
|
5
|
+
sure it never happens again, and where the prevention now lives.
|
|
6
|
+
|
|
7
|
+
The `/propose-harness-improvement` skill appends entries here automatically.
|
|
8
|
+
|
|
9
|
+
> "Anytime you find an agent makes a mistake, you take the time to engineer
|
|
10
|
+
> a solution such that the agent never makes that mistake again."
|
|
11
|
+
> — Mitchell Hashimoto, _My AI Adoption Journey_ (Feb 5, 2026)
|
|
12
|
+
|
|
13
|
+
## Format
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
### YYYY-MM-DD <slug>
|
|
17
|
+
- **Symptom:** <what went wrong>
|
|
18
|
+
- **Classification:** (a) missing context | (b) missing rule | (c) missing tool/skill | (d) wrong layer
|
|
19
|
+
- **Fix applied:** <what we did>
|
|
20
|
+
- **Fix lives in:** path/or/file
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Entries
|
|
24
|
+
|
|
25
|
+
_(empty — this file fills up over time as `/propose-harness-improvement` is invoked.)_
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Architecture — {{projectName}}
|
|
2
|
+
|
|
3
|
+
This document is the source of truth for how code is organized. Any deviation
|
|
4
|
+
must be justified in an ADR under `docs/adr/`.
|
|
5
|
+
|
|
6
|
+
## Layer order (forward-only)
|
|
7
|
+
|
|
8
|
+
```
|
|
9
|
+
{{layersJoined}}
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
Code in a higher layer may import from any lower layer. Code in a lower layer
|
|
13
|
+
**must not** import from a higher layer. The structural test enforces this
|
|
14
|
+
mechanically — see `harness.config.json` and the
|
|
15
|
+
`{{#if isPython}}python -m harness.structural_test{{else}}npm run harness:check{{/if}}` command.
|
|
16
|
+
|
|
17
|
+
## Layer responsibilities
|
|
18
|
+
|
|
19
|
+
| Layer | Responsibility |
|
|
20
|
+
| ----------- | --------------------------------------------------------------------------- |
|
|
21
|
+
| `types` | Pure data shapes. No I/O, no business logic, no framework imports. |
|
|
22
|
+
| `config` | Static configuration (env loading, feature flags, constants). |
|
|
23
|
+
| `repo` | Persistence and external-system gateways. Returns plain values. |
|
|
24
|
+
| `service` | Business logic. Orchestrates `repo` calls. Pure where possible. |
|
|
25
|
+
| `runtime` | Framework adapters: HTTP routes, CLI commands, queue handlers. |
|
|
26
|
+
| `ui` | Rendering, components, presentation logic. |
|
|
27
|
+
|
|
28
|
+
## Cross-cutting concerns: `providers/`
|
|
29
|
+
|
|
30
|
+
Auth, telemetry, feature flags, observability — anything that would otherwise
|
|
31
|
+
cut across layers — enters through `providers/`. Each provider exposes a
|
|
32
|
+
single typed interface; consumers depend on the interface, not the
|
|
33
|
+
implementation.
|
|
34
|
+
|
|
35
|
+
## Adding a new module
|
|
36
|
+
|
|
37
|
+
1. Decide which layers it touches.
|
|
38
|
+
2. Run `/inspect-module <existing-similar-module>` to mirror the pattern.
|
|
39
|
+
3. Create files under `src/{domain}/{layer}/`.
|
|
40
|
+
4. Write tests in the same layer.
|
|
41
|
+
5. Run the structural test. If it fails, do **not** disable it — fix the import.
|
|
42
|
+
|
|
43
|
+
## Recent decisions
|
|
44
|
+
|
|
45
|
+
(Most recent first. Created automatically by `/add-adr`.)
|
|
46
|
+
|
|
47
|
+
- `0001-use-agent-harness-kit.md` — Adopt agent-harness-kit as the harness layer.
|