agent-harness-kit 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/.claude-plugin/marketplace.json +27 -0
  2. package/.claude-plugin/plugin.json +25 -0
  3. package/LICENSE +21 -0
  4. package/README.md +165 -0
  5. package/bin/cli.mjs +261 -0
  6. package/package.json +64 -0
  7. package/src/core/detect-stack.mjs +181 -0
  8. package/src/core/doctor.mjs +106 -0
  9. package/src/core/patch-package-json.mjs +53 -0
  10. package/src/core/render-templates.mjs +277 -0
  11. package/src/core/upgrade.mjs +274 -0
  12. package/src/templates/.claude/agents/api-consistency-reviewer.md +33 -0
  13. package/src/templates/.claude/agents/architecture-reviewer.md.hbs +41 -0
  14. package/src/templates/.claude/agents/performance-reviewer.md +35 -0
  15. package/src/templates/.claude/agents/reliability-reviewer.md +38 -0
  16. package/src/templates/.claude/agents/security-reviewer.md +39 -0
  17. package/src/templates/.claude/hooks/hooks.json.hbs +39 -0
  18. package/src/templates/.claude/settings.json.hbs +25 -0
  19. package/src/templates/.claude/skills/add-adr/SKILL.md +60 -0
  20. package/src/templates/.claude/skills/add-feature/SKILL.md.hbs +50 -0
  21. package/src/templates/.claude/skills/debug-flow/SKILL.md.hbs +38 -0
  22. package/src/templates/.claude/skills/doc-drift-scan/SKILL.md +43 -0
  23. package/src/templates/.claude/skills/eval-runner/SKILL.md +55 -0
  24. package/src/templates/.claude/skills/garbage-collection/SKILL.md.hbs +49 -0
  25. package/src/templates/.claude/skills/inspect-app/SKILL.md +57 -0
  26. package/src/templates/.claude/skills/inspect-module/SKILL.md.hbs +53 -0
  27. package/src/templates/.claude/skills/propose-harness-improvement/SKILL.md +43 -0
  28. package/src/templates/.claude/skills/structural-test-author/SKILL.md.hbs +46 -0
  29. package/src/templates/.claude/skills/write-skill/SKILL.md +39 -0
  30. package/src/templates/CLAUDE.md.hbs +70 -0
  31. package/src/templates/_adapter-python/.importlinter +14 -0
  32. package/src/templates/_adapter-python/harness/__init__.py +0 -0
  33. package/src/templates/_adapter-python/harness/eval_runner.py +281 -0
  34. package/src/templates/_adapter-python/harness/structural_test.py +195 -0
  35. package/src/templates/_adapter-typescript/.dependency-cruiser.cjs +27 -0
  36. package/src/templates/_adapter-typescript/eslint.config.mjs +38 -0
  37. package/src/templates/_adapter-typescript/harness/eval-runner.mjs +322 -0
  38. package/src/templates/_adapter-typescript/harness/structural-test.mjs +125 -0
  39. package/src/templates/_ci/.github/workflows/eval-nightly.yml +59 -0
  40. package/src/templates/_ci/.github/workflows/harness.yml +55 -0
  41. package/src/templates/docs/adr/0001-use-agent-harness-kit.md.hbs +56 -0
  42. package/src/templates/docs/agent-failures.md +25 -0
  43. package/src/templates/docs/architecture.md.hbs +47 -0
  44. package/src/templates/docs/core-beliefs.md.hbs +41 -0
  45. package/src/templates/docs/golden-principles.md.hbs +80 -0
  46. package/src/templates/docs/tech-debt-tracker.md +30 -0
  47. package/src/templates/feature_list.json.hbs +29 -0
  48. package/src/templates/harness.config.json.hbs +40 -0
  49. package/src/templates/scripts/dev-up.sh.hbs +51 -0
  50. package/src/templates/scripts/harness-report.mjs +189 -0
  51. package/src/templates/scripts/install-git-hooks.sh +18 -0
  52. package/src/templates/scripts/pre-push.sh +21 -0
  53. package/src/templates/scripts/precompletion-checklist.sh.hbs +99 -0
  54. package/src/templates/scripts/structural-test-on-edit.sh.hbs +53 -0
  55. package/src/templates/scripts/telemetry-on-skill.sh +26 -0
@@ -0,0 +1,41 @@
1
+ # Core beliefs
2
+
3
+ Beliefs are higher-level than golden principles. Beliefs explain *why* the
4
+ project exists; principles explain *how* the code is shaped. If a belief
5
+ changes, expect the principles to ripple.
6
+
7
+ ## What this codebase is
8
+
9
+ {{description}}
10
+
11
+ ## Why these constraints exist
12
+
13
+ 1. **Solo developer, no review queue.** Every constraint must pull weight
14
+ without a second pair of eyes. The harness IS the review queue.
15
+ 2. **Agent-driven development is the default mode.** Code is written by
16
+ Claude Code with a human in the loop, not the other way around. Patterns
17
+ that humans tolerate but agents abuse (vague names, "just one more flag",
18
+ lazy `any` / `Dict[str, Any]`) are out.
19
+ 3. **Time-to-mistake-fix matters more than time-to-write.** A mistake that
20
+ surfaces in the PostToolUse hook costs ~30 seconds. The same mistake in
21
+ a code review costs minutes. The same mistake in production costs hours.
22
+ Every constraint is timed against this gradient.
23
+
24
+ ## What we're optimizing for
25
+
26
+ - Throughput per dev-hour at constant quality.
27
+ - Refactor blast radius — changes should stay within one domain.
28
+ - Decisional consistency — two consecutive sessions should produce the same
29
+ shape of solution to the same problem.
30
+
31
+ ## What we're NOT optimizing for
32
+
33
+ - Multi-team coordination, RFC queues, or governance.
34
+ - Frontier-grade test coverage. Agent-written unit tests are a liability;
35
+ feature-level tests are the floor.
36
+ - Maximum flexibility. The harness is opinionated on purpose.
37
+
38
+ ---
39
+
40
+ _Edit this file when the project's purpose changes — not when you change a
41
+ library or a layer name. For those, write an ADR._
@@ -0,0 +1,80 @@
1
+ # Golden principles
2
+
3
+ These are invariants that must hold across the codebase. Each one traces to a
4
+ specific past failure or a deliberate trade-off. **Every line here must be
5
+ mechanically enforceable** — if it can't be, it doesn't belong here; promote
6
+ it to a structural test or demote it to a comment in the affected file.
7
+
8
+ The garbage-collection ritual (`/garbage-collection`) diffs the codebase
9
+ against this file weekly.
10
+
11
+ ## 1. Forward-only layer dependencies
12
+
13
+ `{{layersJoined}}`
14
+
15
+ Why: prevents circular imports, makes refactors local, mirrors OpenAI's Codex
16
+ codebase rule.
17
+ Enforced by: structural test (`harness.config.json` `domains[].layers`).
18
+
19
+ ## 2. Validate at boundaries; trust internals
20
+
21
+ External input (HTTP body, CLI arg, file content) is parsed into a typed
22
+ object at the runtime boundary. Internal code assumes the type holds.
23
+
24
+ Why: removes "defensive" type checks scattered across services that hide
25
+ bugs.
26
+ Enforced by: code review + `security-reviewer` subagent.
27
+
28
+ ## 3. Shared utilities live in `src/shared/`
29
+
30
+ Before adding a helper to a module, search `src/shared/` for an existing one.
31
+ If you write a duplicate, the garbage-collection skill will surface it.
32
+
33
+ Why: a real recurring failure mode in agent-generated code is duplicated
34
+ helpers. OpenAI's Codex team explicitly tracks this.
35
+ Enforced by: `garbage-collection` skill (duplicate-utility scan).
36
+
37
+ ## 4. Tests are end-to-end through one feature
38
+
39
+ A test exercises one entry from `feature_list.json` end-to-end. We don't
40
+ write isolated unit tests for inner helpers unless a bug repro demands one.
41
+
42
+ Why: agent-generated unit tests mock everything and verify nothing.
43
+ Enforced by: code review.
44
+
45
+ ## 5. Bounded retries and timeouts on every external call
46
+
47
+ Every `fetch`/`httpx`/`requests` call has an explicit timeout. Every retry
48
+ loop has both `maxAttempts` and a deadline. No `while True:` in production
49
+ code.
50
+
51
+ Why: agents love infinite retries.
52
+ Enforced by: `reliability-reviewer` subagent.
53
+
54
+ ## 6. JSON beats Markdown for state the agent updates
55
+
56
+ `feature_list.json`, `.harness/installed.json`, structural-baseline — all
57
+ JSON. Anthropic's long-running-agent guide: "the model is less likely to
58
+ inappropriately change or overwrite JSON files compared to Markdown files."
59
+
60
+ Why: the agent treats Markdown as freely-editable prose.
61
+ Enforced by: file format choice.
62
+
63
+ ## 7. Every agent failure becomes a permanent prevention
64
+
65
+ When the agent does something wrong, the response is **not** to add a "be
66
+ careful about X" line to CLAUDE.md. It is to:
67
+
68
+ - add context to `docs/`, OR
69
+ - add a structural test rule, OR
70
+ - add a hook, OR
71
+ - add a skill.
72
+
73
+ Why: Mitchell Hashimoto's discipline. CLAUDE.md is a table of contents — it
74
+ won't be re-read on every action.
75
+ Enforced by: `/propose-harness-improvement` skill.
76
+
77
+ ---
78
+
79
+ _Add new principles via `/structural-test-author`, which forces you to
80
+ codify the enforcement mechanism alongside the rule._
@@ -0,0 +1,30 @@
1
+ # Tech debt tracker
2
+
3
+ A flat append-only log of known compromises. Each entry has a date, a slug,
4
+ a location, a description, a cost, a payoff condition, and a status.
5
+
6
+ > "Technical debt is a high-interest loan best paid down in continuous
7
+ > small increments." — OpenAI Codex harness team
8
+
9
+ The `/garbage-collection` skill scans this file every Friday and proposes
10
+ the top-3 highest-leverage entries to address.
11
+
12
+ ## Format
13
+
14
+ ```
15
+ ### YYYY-MM-DD <slug>
16
+ - Location: path/or/area
17
+ - Why it's debt: <one paragraph>
18
+ - Cost: <effort to fix>
19
+ - Payoff condition: <what should trigger the fix>
20
+ - Status: open | in-progress | closed
21
+ ```
22
+
23
+ ## Entries
24
+
25
+ ### 2026-01-01 example-entry
26
+ - Location: src/example/repo/legacy.ts
27
+ - Why it's debt: hand-rolled fetch wrapper instead of the shared client
28
+ - Cost: 1 hour
29
+ - Payoff condition: when we add the next external call
30
+ - Status: open
@@ -0,0 +1,29 @@
1
+ {
2
+ "$schema": "./.harness/feature-list.schema.json",
3
+ "version": "0.1",
4
+ "project": "{{projectName}}",
5
+ "features": [
6
+ {
7
+ "id": "health-endpoint",
8
+ "title": "GET /health returns {status:'ok'}",
9
+ "passes": false,
10
+ "steps": [
11
+ { "id": "type", "title": "Define HealthResponse in types/", "done": false },
12
+ { "id": "service", "title": "Implement getHealth() in service/", "done": false },
13
+ { "id": "runtime", "title": "Wire route in runtime/", "done": false },
14
+ { "id": "smoke", "title": "curl localhost returns 200 + {status:'ok'}", "done": false }
15
+ ],
16
+ "domain": "default"
17
+ },
18
+ {
19
+ "id": "not-found-page",
20
+ "title": "Custom 404 page (or handler)",
21
+ "passes": false,
22
+ "steps": [
23
+ { "id": "ui", "title": "Add 404 view/handler in ui/ or runtime/", "done": false },
24
+ { "id": "smoke", "title": "curl /no-such-path returns 404 with the custom body", "done": false }
25
+ ],
26
+ "domain": "default"
27
+ }
28
+ ]
29
+ }
@@ -0,0 +1,40 @@
1
+ {
2
+ "$schema": "https://raw.githubusercontent.com/tuanle96/agent-harness-kit/v{{kitVersion}}/schema.json",
3
+ "version": "{{kitVersion}}",
4
+ "language": "{{language}}",
5
+ "framework": "{{framework}}",
6
+ "preset": "{{preset}}",
7
+ "domains": [
8
+ {
9
+ "name": "default",
10
+ "root": "{{#if isPython}}app{{else}}src{{/if}}",
11
+ "layers": [{{#each layers}}"{{this}}"{{#unless @last}}, {{/unless}}{{/each}}]
12
+ }
13
+ ],
14
+ "providers": ["auth", "telemetry", "feature-flags"],
15
+ "goldenPrinciples": "docs/golden-principles.md",
16
+ "structuralTest": {
17
+ "engine": "{{#if isPython}}libcst{{else}}ts-morph{{/if}}",
18
+ "configPath": ".harness/structural-test.config.json",
19
+ "blockOnViolation": true
20
+ },
21
+ "evals": {
22
+ "tasksDir": ".harness/eval/tasks",
23
+ "scheduleCron": "0 6 * * *",
24
+ "dimensions": ["outcome", "process", "style", "efficiency"]
25
+ },
26
+ "garbageCollection": {
27
+ "frequency": "weekly",
28
+ "maxFixesPerRun": 3,
29
+ "scope": ["dead-imports", "duplicate-utils", "layer-violations", "doc-drift"]
30
+ },
31
+ "models": {
32
+ "main": "claude-sonnet-4-6",
33
+ "reviewers": "claude-sonnet-4-6",
34
+ "explore": "claude-haiku-4-5"
35
+ },
36
+ "budgets": {
37
+ "perRunUsd": 2.0,
38
+ "perDayUsd": 10.0
39
+ }
40
+ }
@@ -0,0 +1,51 @@
1
#!/usr/bin/env bash
# Start the dev server and wait until it answers a readiness probe.
# Used by `/debug-flow` and by humans during interactive work.
#
# Environment:
#   PORT         port to serve on and probe (default depends on the framework)
#   HEALTH_PATH  URL path used for the readiness probe (default: /)
set -euo pipefail

# Pick the framework default first, then let the caller's PORT override it.
# Assigning PORT up front would turn every later ${PORT:-…} fallback into a
# no-op, so the framework-specific defaults would never apply.
DEFAULT_PORT=3000
{{#if isFastapi}}DEFAULT_PORT=8000{{/if}}
{{#if isDjango}}DEFAULT_PORT=8000{{/if}}
{{#if isFlask}}DEFAULT_PORT=5000{{/if}}
PORT="${PORT:-$DEFAULT_PORT}"
HEALTH_PATH="${HEALTH_PATH:-/}"
# Exported so server commands that are not given the port as an argument can
# still see the chosen value in their environment.
export PORT

echo "[dev-up] starting dev server on port $PORT…"
{{#if isNextjs}}
npm run dev &
{{else if isFastapi}}
uvicorn app.main:app --reload --port "$PORT" &
{{else if isDjango}}
python manage.py runserver "$PORT" &
{{else if isFlask}}
flask --app app run --debug --port "$PORT" &
{{else if isExpress}}
node ./src/server.js &
{{else if isFastify}}
node ./src/server.js &
{{else if isNestjs}}
npm run start:dev &
{{else if isPython}}
python -m app &
{{else}}
npm run dev &
{{/if}}
SERVER_PID=$!

# Stop the background server when this script exits or is interrupted.
cleanup() {
  if kill -0 "$SERVER_PID" 2>/dev/null; then
    kill "$SERVER_PID" || true
  fi
}
trap cleanup EXIT INT TERM

# Wait for readiness: up to 60 probes, 0.5s apart (about 30s).
ready=0
for ((attempt = 0; attempt < 60; attempt++)); do
  # --max-time keeps a hung connection from stalling the whole loop.
  if curl -fs --max-time 2 "http://localhost:$PORT$HEALTH_PATH" >/dev/null 2>&1; then
    echo "[dev-up] ready at http://localhost:$PORT$HEALTH_PATH"
    ready=1
    break
  fi
  # No point probing a server that has already exited.
  if ! kill -0 "$SERVER_PID" 2>/dev/null; then
    echo "[dev-up] dev server exited before becoming ready" >&2
    break
  fi
  sleep 0.5
done

if [ "$ready" -eq 0 ] && kill -0 "$SERVER_PID" 2>/dev/null; then
  echo "[dev-up] warning: no response from http://localhost:$PORT$HEALTH_PATH after 30s; server left running" >&2
fi

# Hand control back to the foreground process (propagates its exit status).
wait "$SERVER_PID"
@@ -0,0 +1,189 @@
1
+ #!/usr/bin/env node
2
+ // harness:report — aggregate eval results + skill telemetry into a per-skill
3
+ // summary. Reads .harness/eval/results/*.jsonl and .harness/telemetry.jsonl.
4
+ //
5
+ // Output:
6
+ // ### Eval results (last 7 days)
7
+ // <per-task: pass rate, run count, avg tokens>
8
+ // ### Skill invocations (last 7 days)
9
+ // <per-skill: invocation count, last seen>
10
+ // ### Drift signals
11
+ // <skills not invoked in the last 7 days; tasks failing in their most recent run>
12
+ //
13
+ // No external deps — pure Node stdlib.
14
+
15
+ import { readdir, readFile, stat } from "node:fs/promises";
16
+ import { existsSync } from "node:fs";
17
+ import { resolve, join } from "node:path";
18
+
19
// All paths are resolved against the directory the report is launched from.
const ROOT = process.cwd();
const RESULTS_DIR = resolve(ROOT, ".harness/eval/results");
const TELEMETRY = resolve(ROOT, ".harness/telemetry.jsonl");

// Captured once so every row is compared against the same instant.
const NOW = Date.now();

// Width of the reporting window, in milliseconds.
const SEVEN_DAYS = 1000 * 60 * 60 * 24 * 7;
24
+
25
/**
 * Read a JSON Lines file into an array of parsed values.
 *
 * A path that does not exist yields an empty array. Blank lines and lines
 * that fail `JSON.parse` are dropped without reporting an error.
 *
 * @param {string} path file to read
 * @returns {Promise<any[]>} parsed values, in file order
 */
async function readJsonl(path) {
  if (!existsSync(path)) {
    return [];
  }
  const text = await readFile(path, "utf8");
  return text
    .split("\n")
    .filter((line) => line.trim() !== "")
    .flatMap((line) => {
      try {
        // Wrapped in an array so a parsed JSON array is kept as one value.
        return [JSON.parse(line)];
      } catch {
        return []; // malformed line: skip it
      }
    });
}
39
+
40
/**
 * Load every row from the `*.jsonl` files directly inside RESULTS_DIR.
 *
 * Each row is tagged with `_mtime`, the modification time (ms) of the file it
 * came from. Returns an empty array when RESULTS_DIR does not exist.
 *
 * @returns {Promise<object[]>} all rows, grouped by file in directory order
 */
async function loadEvalResults() {
  if (!existsSync(RESULTS_DIR)) {
    return [];
  }
  const collected = [];
  const names = (await readdir(RESULTS_DIR)).filter((name) => name.endsWith(".jsonl"));
  for (const name of names) {
    const file = join(RESULTS_DIR, name);
    const { mtimeMs } = await stat(file);
    for (const row of await readJsonl(file)) {
      row._mtime = mtimeMs;
      collected.push(row);
    }
  }
  return collected;
}
56
+
57
/**
 * Keep the rows whose timestamp is no more than SEVEN_DAYS older than NOW.
 *
 * The timestamp is `row[key]` parsed as a date when that field is truthy;
 * otherwise the row's `_mtime`, or 0 when that is missing too.
 *
 * @param {object[]} rows rows to filter
 * @param {string} [key="ts"] name of the timestamp field
 * @returns {object[]} a new array with the rows inside the window
 */
function recent(rows, key = "ts") {
  const timestampOf = (row) => {
    if (row[key]) {
      return new Date(row[key]).getTime();
    }
    return row._mtime ?? 0;
  };
  return rows.filter((row) => NOW - timestampOf(row) <= SEVEN_DAYS);
}
63
+
64
/**
 * Total the token counts reported by a row's "efficiency" grades.
 *
 * A grade contributes only when its `info` begins with "<digits> tokens";
 * grades of other dimensions, or without a matching `info`, add nothing.
 *
 * @param {{grades?: Array<{dim?: string, info?: string}>}} row eval result row
 * @returns {number} summed token count (0 when there are no grades)
 */
function tokensOf(row) {
  let total = 0;
  for (const grade of row.grades ?? []) {
    if (grade.dim !== "efficiency") continue;
    const hit = grade.info?.match(/^(\d+) tokens/);
    if (hit) {
      total += parseInt(hit[1], 10);
    }
  }
  return total;
}
72
+
73
/**
 * Format `num / total` as a whole-number percentage string.
 *
 * @param {number} num numerator
 * @param {number} total denominator
 * @returns {string} e.g. "67%", or "n/a" when `total` is 0
 */
function fmtPct(num, total) {
  if (total === 0) {
    return "n/a";
  }
  const ratio = num / total;
  return Math.round(ratio * 100) + "%";
}
77
+
78
/**
 * Print the per-task eval table: pass rate, run count and average tokens.
 *
 * Rows are grouped by `taskId`. A row without a `taskId` is reported under
 * "(unknown)" rather than crashing the report, and tasks are listed in
 * ascending order of their id.
 *
 * @param {object[]} rows eval rows already restricted to the reporting window
 * @returns {void} output goes to stdout via console.log
 */
function summarizeEvals(rows) {
  const byTask = new Map();
  for (const r of rows) {
    // Normalise the key so the padEnd() call below always has a string.
    const label = String(r.taskId ?? "(unknown)");
    const arr = byTask.get(label) ?? [];
    arr.push(r);
    byTask.set(label, arr);
  }
  console.log(`\n### Eval results (last 7 days, ${rows.length} runs)`);
  if (rows.length === 0) {
    console.log(" (no recent runs — try `npm run harness:eval -- --quick --transport=mock`)");
    return;
  }
  console.log(
    " task pass-rate runs avg-tokens",
  );
  console.log(
    " ---------------------- ---------- ----- ----------",
  );
  // Sort on the task id itself; a comparator-less sort() would compare the
  // stringified [key, rows] pairs instead.
  const ordered = [...byTask.entries()].sort((a, b) =>
    a[0] < b[0] ? -1 : a[0] > b[0] ? 1 : 0,
  );
  for (const [taskId, taskRows] of ordered) {
    const passed = taskRows.filter((r) => r.passed).length;
    const tokens = taskRows.reduce((s, r) => s + tokensOf(r), 0);
    // Every group holds at least one row, so the division is safe.
    const avgTokens = Math.round(tokens / taskRows.length);
    const pct = fmtPct(passed, taskRows.length);
    console.log(
      ` ${taskId.padEnd(22)} ${pct.padStart(8)} ${String(taskRows.length).padStart(3)} ${String(avgTokens).padStart(8)}`,
    );
  }
}
106
+
107
/**
 * Print the per-skill invocation table: event count and most recent timestamp.
 *
 * Skills are listed by descending invocation count. A row without a `skill`
 * is reported under "(unknown)". "last-seen" is the greatest `ts` among the
 * events that have one, and "?" only when none of them do.
 *
 * @param {object[]} rows telemetry rows already restricted to the window
 * @returns {void} output goes to stdout via console.log
 */
function summarizeTelemetry(rows) {
  console.log(`\n### Skill invocations (last 7 days, ${rows.length} events)`);
  if (rows.length === 0) {
    console.log(
      " (no skill invocations recorded — telemetry hook may not be installed)",
    );
    console.log(
      " Verify `.claude/hooks/hooks.json` includes the Skill matcher.",
    );
    return;
  }
  const bySkill = new Map();
  for (const r of rows) {
    // Normalise the key so the padEnd() call below always has a string.
    const label = String(r.skill ?? "(unknown)");
    const arr = bySkill.get(label) ?? [];
    arr.push(r);
    bySkill.set(label, arr);
  }
  console.log(" skill invocations last-seen");
  console.log(" ----------------------------- ----------- --------------------");
  const ordered = [...bySkill.entries()].sort(
    (a, b) => b[1].length - a[1].length,
  );
  for (const [skill, events] of ordered) {
    // Ignore events without a timestamp: Array.prototype.sort() puts
    // undefined last, so one such event would otherwise hide every real value.
    const last = events
      .map((e) => e.ts)
      .filter((ts) => ts != null)
      .sort()
      .at(-1);
    console.log(
      ` ${skill.padEnd(29)} ${String(events.length).padStart(8)} ${last ?? "?"}`,
    );
  }
}
138
+
139
/**
 * Print drift signals: known skills missing from the telemetry rows, and
 * tasks whose most recent eval run (by `ts`) did not pass.
 *
 * Neither input array is modified.
 *
 * @param {object[]} evalRows eval rows already restricted to the window
 * @param {object[]} telemetryRows telemetry rows already restricted to the window
 * @returns {void} output goes to stdout via console.log
 */
function driftSignals(evalRows, telemetryRows) {
  console.log(`\n### Drift signals`);
  const knownSkills = [
    "inspect-module",
    "inspect-app",
    "garbage-collection",
    "doc-drift-scan",
    "add-feature",
    "add-adr",
    "structural-test-author",
    "propose-harness-improvement",
    "write-skill",
    "debug-flow",
    "eval-runner",
  ];
  const seen = new Set(telemetryRows.map((r) => r.skill));
  const unseen = knownSkills.filter((s) => !seen.has(s));
  if (unseen.length > 0) {
    console.log(` skills not invoked in 7 days: ${unseen.join(", ")}`);
  }
  // Tasks failing in their most recent run.
  // Sort a copy (the caller's array keeps its order) and coerce `ts` to a
  // string so a missing or non-string timestamp cannot throw.
  const tsKey = (r) => String(r.ts ?? "");
  const chronological = [...evalRows].sort((a, b) => tsKey(a).localeCompare(tsKey(b)));
  const latest = new Map();
  for (const r of chronological) {
    latest.set(r.taskId, r); // later rows overwrite earlier ones
  }
  const regressing = [...latest.values()].filter((r) => !r.passed);
  if (regressing.length > 0) {
    console.log(
      ` tasks failing in their latest run: ${regressing.map((r) => r.taskId).join(", ")}`,
    );
  }
  if (unseen.length === 0 && regressing.length === 0) {
    console.log(" (none)");
  }
}
174
+
175
/**
 * Entry point: load eval results and telemetry, narrow both to the reporting
 * window, then print the three report sections in order.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const evalRows = recent(await loadEvalResults());
  const telemetryRows = recent(await readJsonl(TELEMETRY));

  const generatedAt = new Date().toISOString();
  console.log("=== agent-harness-kit report ===");
  console.log(`Generated: ${generatedAt}`);

  summarizeEvals(evalRows);
  summarizeTelemetry(telemetryRows);
  driftSignals(evalRows, telemetryRows);

  console.log("");
}

await main();
@@ -0,0 +1,18 @@
1
#!/usr/bin/env bash
# Install scripts/pre-push.sh as the git pre-push hook for this repo.
#
# The hooks directory is resolved through git itself instead of assuming
# ./.git/hooks, so this also works where `.git` is a file rather than a
# directory (linked worktrees, submodules).
set -e

if ! command -v git >/dev/null 2>&1 ||
  ! hooks_dir=$(git rev-parse --git-path hooks 2>/dev/null); then
  echo "Not a git repo — run this script from the repo root." >&2
  exit 1
fi

mkdir -p "$hooks_dir"

# The hook is a thin shim: the real logic stays in the versioned script.
cat > "$hooks_dir/pre-push" <<'HOOK'
#!/usr/bin/env bash
exec bash scripts/pre-push.sh "$@"
HOOK
chmod +x "$hooks_dir/pre-push"

echo "✓ git pre-push hook installed (delegates to scripts/pre-push.sh)"
@@ -0,0 +1,21 @@
1
#!/usr/bin/env bash
# Git pre-push hook body (Stripe-style "shift feedback left"): runs the
# structural test and then the linter, aborting on the first failing command.
# Kept under scripts/ so it is versioned with the repo; install-git-hooks.sh
# wires it into git.
set -e

# True when harness.config.json exists and declares the Python language.
uses_python_harness() {
  [ -f harness.config.json ] || return 1
  grep -q '"language": "python"' harness.config.json
}

# True when package.json exists and mentions a "lint" entry.
has_npm_lint_script() {
  [ -f package.json ] || return 1
  grep -q '"lint"' package.json
}

echo "[pre-push] running structural test…"
if uses_python_harness; then
  python -m harness.structural_test
else
  npm run --silent harness:check
fi

echo "[pre-push] running lint…"
if has_npm_lint_script; then
  npm run --silent lint
elif command -v ruff >/dev/null 2>&1; then
  ruff check .
fi

echo "[pre-push] OK"
@@ -0,0 +1,99 @@
1
#!/usr/bin/env bash
# Stop hook — LangChain's "PreCompletionChecklist" / Ralph Wiggum loop.
# On first stop: run deterministic checks; if any fail, re-inject *structured*
# failure context (not just check names) via stderr and exit 2. On second
# stop (stop_hook_active=true), exit 0 to allow real exit.
#
# Optional: set AHK_HEADLESS_RECOVER=1 to spawn `claude -p` in the background
# for a short recovery run of up to 5 turns (costs tokens; off by default).
set -e

INPUT=$(cat)

# CRITICAL: avoid infinite loops. If the hook already ran, do not block again.
# jq is preferred; when it is missing or cannot parse the payload, fall back
# to a plain pattern match so the guard still works.
STOP_ACTIVE=""
if command -v jq >/dev/null 2>&1; then
  STOP_ACTIVE=$(printf '%s' "$INPUT" | jq -r '.stop_hook_active // false' 2>/dev/null) || STOP_ACTIVE=""
fi
if [ -z "$STOP_ACTIVE" ] &&
  printf '%s' "$INPUT" | grep -Eq '"stop_hook_active"[[:space:]]*:[[:space:]]*true'; then
  STOP_ACTIVE=true
fi
if [ "$STOP_ACTIVE" = "true" ]; then
  exit 0
fi

# Capture structured output per check. We use temp files so we can quote the
# tail back to Claude verbatim — names alone are not enough context for the
# agent to act on.
TMPDIR_HOOK=$(mktemp -d -t ahk-stop-hook.XXXXXX)
# Set to 1 when a background recovery job takes over the directory.
KEEP_TMPDIR=0
# Preserve the script's exit code through the cleanup trap — otherwise the
# trailing `rm` resets the final status to 0 and Claude never sees the block.
trap 'rc=$?; [ "$KEEP_TMPDIR" = "1" ] || rm -rf "$TMPDIR_HOOK"; exit $rc' EXIT

# run_check NAME CMD [ARGS...]
# Runs CMD, storing stdout+stderr in $TMPDIR_HOOK/NAME.out. On failure, NAME
# is appended to $TMPDIR_HOOK/failed.list and 1 is returned.
run_check() {
  local name="$1"
  shift
  local out="$TMPDIR_HOOK/$name.out"
  if "$@" >"$out" 2>&1; then
    return 0
  else
    echo "$name" >> "$TMPDIR_HOOK/failed.list"
    return 1
  fi
}

# Structural test.
if [ -f harness.config.json ]; then
  if grep -q '"language": "python"' harness.config.json; then
    run_check structural-test python -m harness.structural_test || true
  else
    run_check structural-test npm run --silent harness:check || true
  fi
fi

# Lint.
if [ -f package.json ] && grep -q '"lint"' package.json; then
  run_check lint npm run --silent lint || true
elif [ -f pyproject.toml ] && command -v ruff >/dev/null 2>&1; then
  run_check ruff ruff check . || true
fi

# Nothing failed: allow the stop.
if [ ! -s "$TMPDIR_HOOK/failed.list" ]; then
  exit 0
fi

# Build a structured failure report for Claude. The agent gets: which checks
# failed, the last 50 lines of each failure, and the files most recently
# touched (so the agent can correlate failures with its own edits).
{
  echo
  echo "=== Pre-completion checklist failed ==="
  while IFS= read -r failed; do
    echo
    echo "--- $failed ---"
    tail -50 "$TMPDIR_HOOK/$failed.out" 2>/dev/null || true
  done < "$TMPDIR_HOOK/failed.list"

  # Only print the git sections when git is available to fill them.
  if command -v git >/dev/null 2>&1; then
    echo
    echo "--- recent changes (last 10 modified files) ---"
    git status --short 2>/dev/null | head -10 || true
    echo
    echo "--- last 3 commits ---"
    git log --oneline -3 2>/dev/null || true
  fi

  echo
  echo "Fix the failing check(s) and re-run them locally before declaring"
  echo "the task complete. Do NOT disable a check to make the hook pass."
} >&2

# Optional: opt-in headless recovery. Spawns a short `claude -p` run to
# attempt the fix autonomously. Useful for unattended CI / cron contexts.
# Off by default because it costs tokens.
if [ "${AHK_HEADLESS_RECOVER:-}" = "1" ] && command -v claude >/dev/null 2>&1; then
  FAILED_LIST=$(tr '\n' ' ' < "$TMPDIR_HOOK/failed.list")
  echo "[ahk] AHK_HEADLESS_RECOVER=1 — spawning recovery turn for: $FAILED_LIST" >&2
  # The recovery run reads $TMPDIR_HOOK after this hook has exited, so the
  # EXIT trap must not delete it; the background job removes it when done.
  KEEP_TMPDIR=1
  (
    claude -p \
      "The pre-completion checklist failed: $FAILED_LIST. Read the failure output in $TMPDIR_HOOK and apply the smallest fix. Do not disable any check." \
      --max-turns 5 \
      >"$TMPDIR_HOOK/recover.out" 2>&1 || true
    rm -rf "$TMPDIR_HOOK"
  ) </dev/null >/dev/null 2>&1 &
  # Don't wait — let the next session pick up the partially-applied fix.
fi

exit 2
@@ -0,0 +1,53 @@
1
#!/usr/bin/env bash
# PostToolUse hook — runs the structural test on the file just edited.
# Defensive: never blocks on missing tooling. Exit code 2 = block + Claude reads stderr.
set -e

INPUT=$(cat)
if ! command -v jq >/dev/null 2>&1; then
  exit 0 # jq missing — silently skip rather than spuriously blocking
fi

# An unparsable payload must not abort the hook with jq's own exit status.
FILE=$(printf '%s' "$INPUT" | jq -r '.tool_input.file_path // empty' 2>/dev/null) || exit 0
if [ -z "$FILE" ]; then
  exit 0
fi

# Only run on source files.
case "$FILE" in
  *.ts | *.tsx | *.js | *.jsx | *.mjs | *.cjs) ENGINE=ts ;;
  *.py) ENGINE=py ;;
  *) exit 0 ;;
esac

# Allow opt-out via env var — useful on Windows / macOS where some hook
# events are flaky (open issues #45065 and #6305).
if [ "${AHK_HOOK_MODE:-}" = "warn" ]; then
  echo "[ahk] hook running in warn-only mode (AHK_HOOK_MODE=warn)" >&2
  exit 0
fi

# run_scoped_check RERUN_HINT CMD [ARGS...]
# Runs CMD, relays the last 50 lines of its combined output to stderr, and
# exits the hook with status 2 when CMD fails. The status is taken from CMD
# itself: piping straight into `tail` would report tail's status instead, and
# the hook would never block.
run_scoped_check() {
  local rerun="$1"
  shift
  local output
  local status=0
  output=$("$@" 2>&1) || status=$?
  if [ -n "$output" ]; then
    printf '%s\n' "$output" | tail -50 >&2
  fi
  if [ "$status" -eq 0 ]; then
    return 0
  fi
  {
    echo
    echo "Structural test failed for $FILE."
    echo "Layer order: see harness.config.json."
    echo "Run \`$rerun\` for full output."
    echo "Fix the violation before continuing — do NOT disable the test."
  } >&2
  exit 2
}

# Run the structural test scoped to this file. A missing runner is skipped
# rather than reported as a violation.
if [ "$ENGINE" = "ts" ]; then
  command -v npm >/dev/null 2>&1 || exit 0
  run_scoped_check "npm run harness:check" \
    npm run --silent harness:check -- --file "$FILE"
elif [ "$ENGINE" = "py" ]; then
  command -v python >/dev/null 2>&1 || exit 0
  run_scoped_check "python -m harness.structural_test" \
    python -m harness.structural_test --file "$FILE"
fi
exit 0
@@ -0,0 +1,26 @@
1
#!/usr/bin/env bash
# PostToolUse telemetry hook: appends one JSON line per Skill tool invocation
# to .harness/telemetry.jsonl, recording a UTC timestamp, the event name, the
# skill name and the short git SHA ("no-git" when that cannot be resolved).
# Observation only — every early-out path exits 0.
set -e

payload=$(cat)
command -v jq >/dev/null 2>&1 || exit 0 # jq missing — skip silently rather than spuriously blocking

# Only Skill tool calls are recorded.
tool_name=$(jq -r '.tool_name // empty' <<<"$payload")
if [ "$tool_name" != "Skill" ]; then
  exit 0
fi

skill_name=$(jq -r '.tool_input.skill // empty' <<<"$payload")
if [ -z "$skill_name" ]; then
  exit 0
fi

mkdir -p .harness
entry=$(jq -nc --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
  --arg skill "$skill_name" \
  --arg sha "$(git rev-parse --short HEAD 2>/dev/null || echo 'no-git')" \
  '{ts: $ts, event: "skill_invoked", skill: $skill, sha: $sha}')
echo "$entry" >> .harness/telemetry.jsonl
exit 0