agent-harness-kit 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +27 -0
- package/.claude-plugin/plugin.json +25 -0
- package/LICENSE +21 -0
- package/README.md +165 -0
- package/bin/cli.mjs +261 -0
- package/package.json +64 -0
- package/src/core/detect-stack.mjs +181 -0
- package/src/core/doctor.mjs +106 -0
- package/src/core/patch-package-json.mjs +53 -0
- package/src/core/render-templates.mjs +277 -0
- package/src/core/upgrade.mjs +274 -0
- package/src/templates/.claude/agents/api-consistency-reviewer.md +33 -0
- package/src/templates/.claude/agents/architecture-reviewer.md.hbs +41 -0
- package/src/templates/.claude/agents/performance-reviewer.md +35 -0
- package/src/templates/.claude/agents/reliability-reviewer.md +38 -0
- package/src/templates/.claude/agents/security-reviewer.md +39 -0
- package/src/templates/.claude/hooks/hooks.json.hbs +39 -0
- package/src/templates/.claude/settings.json.hbs +25 -0
- package/src/templates/.claude/skills/add-adr/SKILL.md +60 -0
- package/src/templates/.claude/skills/add-feature/SKILL.md.hbs +50 -0
- package/src/templates/.claude/skills/debug-flow/SKILL.md.hbs +38 -0
- package/src/templates/.claude/skills/doc-drift-scan/SKILL.md +43 -0
- package/src/templates/.claude/skills/eval-runner/SKILL.md +55 -0
- package/src/templates/.claude/skills/garbage-collection/SKILL.md.hbs +49 -0
- package/src/templates/.claude/skills/inspect-app/SKILL.md +57 -0
- package/src/templates/.claude/skills/inspect-module/SKILL.md.hbs +53 -0
- package/src/templates/.claude/skills/propose-harness-improvement/SKILL.md +43 -0
- package/src/templates/.claude/skills/structural-test-author/SKILL.md.hbs +46 -0
- package/src/templates/.claude/skills/write-skill/SKILL.md +39 -0
- package/src/templates/CLAUDE.md.hbs +70 -0
- package/src/templates/_adapter-python/.importlinter +14 -0
- package/src/templates/_adapter-python/harness/__init__.py +0 -0
- package/src/templates/_adapter-python/harness/eval_runner.py +281 -0
- package/src/templates/_adapter-python/harness/structural_test.py +195 -0
- package/src/templates/_adapter-typescript/.dependency-cruiser.cjs +27 -0
- package/src/templates/_adapter-typescript/eslint.config.mjs +38 -0
- package/src/templates/_adapter-typescript/harness/eval-runner.mjs +322 -0
- package/src/templates/_adapter-typescript/harness/structural-test.mjs +125 -0
- package/src/templates/_ci/.github/workflows/eval-nightly.yml +59 -0
- package/src/templates/_ci/.github/workflows/harness.yml +55 -0
- package/src/templates/docs/adr/0001-use-agent-harness-kit.md.hbs +56 -0
- package/src/templates/docs/agent-failures.md +25 -0
- package/src/templates/docs/architecture.md.hbs +47 -0
- package/src/templates/docs/core-beliefs.md.hbs +41 -0
- package/src/templates/docs/golden-principles.md.hbs +80 -0
- package/src/templates/docs/tech-debt-tracker.md +30 -0
- package/src/templates/feature_list.json.hbs +29 -0
- package/src/templates/harness.config.json.hbs +40 -0
- package/src/templates/scripts/dev-up.sh.hbs +51 -0
- package/src/templates/scripts/harness-report.mjs +189 -0
- package/src/templates/scripts/install-git-hooks.sh +18 -0
- package/src/templates/scripts/pre-push.sh +21 -0
- package/src/templates/scripts/precompletion-checklist.sh.hbs +99 -0
- package/src/templates/scripts/structural-test-on-edit.sh.hbs +53 -0
- package/src/templates/scripts/telemetry-on-skill.sh +26 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Core beliefs
|
|
2
|
+
|
|
3
|
+
Beliefs are higher-level than golden principles. Beliefs explain *why* the
|
|
4
|
+
project exists; principles explain *how* the code is shaped. If a belief
|
|
5
|
+
changes, expect the principles to ripple.
|
|
6
|
+
|
|
7
|
+
## What this codebase is
|
|
8
|
+
|
|
9
|
+
{{description}}
|
|
10
|
+
|
|
11
|
+
## Why these constraints exist
|
|
12
|
+
|
|
13
|
+
1. **Solo developer, no review queue.** Every constraint must pull weight
|
|
14
|
+
without a second pair of eyes. The harness IS the review queue.
|
|
15
|
+
2. **Agent-driven development is the default mode.** Code is written by
|
|
16
|
+
Claude Code with a human in the loop, not the other way around. Patterns
|
|
17
|
+
that humans tolerate but agents abuse (vague names, "just one more flag",
|
|
18
|
+
lazy `any` / `Dict[str, Any]`) are out.
|
|
19
|
+
3. **Time-to-mistake-fix matters more than time-to-write.** A mistake that
|
|
20
|
+
surfaces in the PostToolUse hook costs ~30 seconds. The same mistake in
|
|
21
|
+
a code review costs minutes. The same mistake in production costs hours.
|
|
22
|
+
Every constraint is timed against this gradient.
|
|
23
|
+
|
|
24
|
+
## What we're optimizing for
|
|
25
|
+
|
|
26
|
+
- Throughput per dev-hour at constant quality.
|
|
27
|
+
- Refactor blast radius — changes should stay within one domain.
|
|
28
|
+
- Decisional consistency — two consecutive sessions should produce the same
|
|
29
|
+
shape of solution to the same problem.
|
|
30
|
+
|
|
31
|
+
## What we're NOT optimizing for
|
|
32
|
+
|
|
33
|
+
- Multi-team coordination, RFC queues, or governance.
|
|
34
|
+
- Frontier-grade test coverage. Agent-written unit tests are a liability;
|
|
35
|
+
feature-level tests are the floor.
|
|
36
|
+
- Maximum flexibility. The harness is opinionated on purpose.
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
_Edit this file when the project's purpose changes — not when you change a
|
|
41
|
+
library or a layer name. For those, write an ADR._
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# Golden principles
|
|
2
|
+
|
|
3
|
+
These are invariants that must hold across the codebase. Each one traces to a
|
|
4
|
+
specific past failure or a deliberate trade-off. **Every line here must be
|
|
5
|
+
mechanically enforceable** — if it can't be, it doesn't belong here; promote
|
|
6
|
+
it to a structural test or demote it to a comment in the affected file.
|
|
7
|
+
|
|
8
|
+
The garbage-collection ritual (`/garbage-collection`) diffs the codebase
|
|
9
|
+
against this file weekly.
|
|
10
|
+
|
|
11
|
+
## 1. Forward-only layer dependencies
|
|
12
|
+
|
|
13
|
+
`{{layersJoined}}`
|
|
14
|
+
|
|
15
|
+
Why: prevents circular imports, makes refactors local, mirrors OpenAI's Codex
|
|
16
|
+
codebase rule.
|
|
17
|
+
Enforced by: structural test (`harness.config.json` `domains[].layers`).
|
|
18
|
+
|
|
19
|
+
## 2. Validate at boundaries; trust internals
|
|
20
|
+
|
|
21
|
+
External input (HTTP body, CLI arg, file content) is parsed into a typed
|
|
22
|
+
object at the runtime boundary. Internal code assumes the type holds.
|
|
23
|
+
|
|
24
|
+
Why: removes "defensive" type checks scattered across services that hide
|
|
25
|
+
bugs.
|
|
26
|
+
Enforced by: code review + `security-reviewer` subagent.
|
|
27
|
+
|
|
28
|
+
## 3. Shared utilities live in `src/shared/`
|
|
29
|
+
|
|
30
|
+
Before adding a helper to a module, search `src/shared/` for an existing one.
|
|
31
|
+
If you write a duplicate, the garbage-collection skill will surface it.
|
|
32
|
+
|
|
33
|
+
Why: a real recurring failure mode in agent-generated code is duplicated
|
|
34
|
+
helpers. OpenAI's Codex team explicitly tracks this.
|
|
35
|
+
Enforced by: `garbage-collection` skill (duplicate-utility scan).
|
|
36
|
+
|
|
37
|
+
## 4. Tests are end-to-end through one feature
|
|
38
|
+
|
|
39
|
+
A test exercises one entry from `feature_list.json` end-to-end. We don't
|
|
40
|
+
write isolated unit tests for inner helpers unless a bug repro demands one.
|
|
41
|
+
|
|
42
|
+
Why: agent-generated unit tests mock everything and verify nothing.
|
|
43
|
+
Enforced by: code review.
|
|
44
|
+
|
|
45
|
+
## 5. Bounded retries and timeouts on every external call
|
|
46
|
+
|
|
47
|
+
Every `fetch`/`httpx`/`requests` call has an explicit timeout. Every retry
|
|
48
|
+
loop has both `maxAttempts` and a deadline. No `while True:` in production
|
|
49
|
+
code.
|
|
50
|
+
|
|
51
|
+
Why: agents love infinite retries.
|
|
52
|
+
Enforced by: `reliability-reviewer` subagent.
|
|
53
|
+
|
|
54
|
+
## 6. JSON beats Markdown for state the agent updates
|
|
55
|
+
|
|
56
|
+
`feature_list.json`, `.harness/installed.json`, structural-baseline — all
|
|
57
|
+
JSON. Anthropic's long-running-agent guide: "the model is less likely to
|
|
58
|
+
inappropriately change or overwrite JSON files compared to Markdown files."
|
|
59
|
+
|
|
60
|
+
Why: the agent treats Markdown as freely-editable prose.
|
|
61
|
+
Enforced by: file format choice.
|
|
62
|
+
|
|
63
|
+
## 7. Every agent failure becomes a permanent prevention
|
|
64
|
+
|
|
65
|
+
When the agent does something wrong, the response is **not** to add a "be
|
|
66
|
+
careful about X" line to CLAUDE.md. It is to:
|
|
67
|
+
|
|
68
|
+
- add context to `docs/`, OR
|
|
69
|
+
- add a structural test rule, OR
|
|
70
|
+
- add a hook, OR
|
|
71
|
+
- add a skill.
|
|
72
|
+
|
|
73
|
+
Why: Mitchell Hashimoto's discipline. CLAUDE.md is a table of contents — it
|
|
74
|
+
won't be re-read on every action.
|
|
75
|
+
Enforced by: `/propose-harness-improvement` skill.
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
_Add new principles via `/structural-test-author`, which forces you to
|
|
80
|
+
codify the enforcement mechanism alongside the rule._
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Tech debt tracker
|
|
2
|
+
|
|
3
|
+
A flat append-only log of known compromises. Each entry has a date, a
|
|
4
|
+
location, a description, and a payoff condition.
|
|
5
|
+
|
|
6
|
+
> "Technical debt is a high-interest loan best paid down in continuous
|
|
7
|
+
> small increments." — OpenAI Codex harness team
|
|
8
|
+
|
|
9
|
+
The `/garbage-collection` skill scans this file every Friday and proposes
|
|
10
|
+
the top-3 highest-leverage entries to address.
|
|
11
|
+
|
|
12
|
+
## Format
|
|
13
|
+
|
|
14
|
+
```
|
|
15
|
+
### YYYY-MM-DD <slug>
|
|
16
|
+
- Location: path/or/area
|
|
17
|
+
- Why it's debt: <one paragraph>
|
|
18
|
+
- Cost: <effort to fix>
|
|
19
|
+
- Payoff condition: <what should trigger the fix>
|
|
20
|
+
- Status: open | in-progress | closed
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Entries
|
|
24
|
+
|
|
25
|
+
### 2026-01-01 example-entry
|
|
26
|
+
- Location: src/example/repo/legacy.ts
|
|
27
|
+
- Why it's debt: hand-rolled fetch wrapper instead of the shared client
|
|
28
|
+
- Cost: 1 hour
|
|
29
|
+
- Payoff condition: when we add the next external call
|
|
30
|
+
- Status: open
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "./.harness/feature-list.schema.json",
|
|
3
|
+
"version": "0.1",
|
|
4
|
+
"project": "{{projectName}}",
|
|
5
|
+
"features": [
|
|
6
|
+
{
|
|
7
|
+
"id": "health-endpoint",
|
|
8
|
+
"title": "GET /health returns {status:'ok'}",
|
|
9
|
+
"passes": false,
|
|
10
|
+
"steps": [
|
|
11
|
+
{ "id": "type", "title": "Define HealthResponse in types/", "done": false },
|
|
12
|
+
{ "id": "service", "title": "Implement getHealth() in service/", "done": false },
|
|
13
|
+
{ "id": "runtime", "title": "Wire route in runtime/", "done": false },
|
|
14
|
+
{ "id": "smoke", "title": "curl localhost returns 200 + {status:'ok'}", "done": false }
|
|
15
|
+
],
|
|
16
|
+
"domain": "default"
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"id": "not-found-page",
|
|
20
|
+
"title": "Custom 404 page (or handler)",
|
|
21
|
+
"passes": false,
|
|
22
|
+
"steps": [
|
|
23
|
+
{ "id": "ui", "title": "Add 404 view/handler in ui/ or runtime/", "done": false },
|
|
24
|
+
{ "id": "smoke", "title": "curl /no-such-path returns 404 with the custom body", "done": false }
|
|
25
|
+
],
|
|
26
|
+
"domain": "default"
|
|
27
|
+
}
|
|
28
|
+
]
|
|
29
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://raw.githubusercontent.com/tuanle96/agent-harness-kit/v{{kitVersion}}/schema.json",
|
|
3
|
+
"version": "{{kitVersion}}",
|
|
4
|
+
"language": "{{language}}",
|
|
5
|
+
"framework": "{{framework}}",
|
|
6
|
+
"preset": "{{preset}}",
|
|
7
|
+
"domains": [
|
|
8
|
+
{
|
|
9
|
+
"name": "default",
|
|
10
|
+
"root": "{{#if isPython}}app{{else}}src{{/if}}",
|
|
11
|
+
"layers": [{{#each layers}}"{{this}}"{{#unless @last}}, {{/unless}}{{/each}}]
|
|
12
|
+
}
|
|
13
|
+
],
|
|
14
|
+
"providers": ["auth", "telemetry", "feature-flags"],
|
|
15
|
+
"goldenPrinciples": "docs/golden-principles.md",
|
|
16
|
+
"structuralTest": {
|
|
17
|
+
"engine": "{{#if isPython}}libcst{{else}}ts-morph{{/if}}",
|
|
18
|
+
"configPath": ".harness/structural-test.config.json",
|
|
19
|
+
"blockOnViolation": true
|
|
20
|
+
},
|
|
21
|
+
"evals": {
|
|
22
|
+
"tasksDir": ".harness/eval/tasks",
|
|
23
|
+
"scheduleCron": "0 6 * * *",
|
|
24
|
+
"dimensions": ["outcome", "process", "style", "efficiency"]
|
|
25
|
+
},
|
|
26
|
+
"garbageCollection": {
|
|
27
|
+
"frequency": "weekly",
|
|
28
|
+
"maxFixesPerRun": 3,
|
|
29
|
+
"scope": ["dead-imports", "duplicate-utils", "layer-violations", "doc-drift"]
|
|
30
|
+
},
|
|
31
|
+
"models": {
|
|
32
|
+
"main": "claude-sonnet-4-6",
|
|
33
|
+
"reviewers": "claude-sonnet-4-6",
|
|
34
|
+
"explore": "claude-haiku-4-5"
|
|
35
|
+
},
|
|
36
|
+
"budgets": {
|
|
37
|
+
"perRunUsd": 2.0,
|
|
38
|
+
"perDayUsd": 10.0
|
|
39
|
+
}
|
|
40
|
+
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Start the dev server and wait until it answers a readiness probe.
# Used by `/debug-flow` and by humans during interactive work.
#
# Environment:
#   PORT         Port used for the readiness probe and passed to the server
#                where its start command accepts one. Defaults to the
#                framework-specific value chosen below.
#   HEALTH_PATH  URL path polled for readiness (default: /).
set -euo pipefail

# Choose the framework default first, then apply it to PORT exactly once.
# Chaining several PORT="${PORT:-…}" lines does not work: after the first one
# PORT is always set, so later framework-specific fallbacks never apply.
DEFAULT_PORT=3000
{{#if isFastapi}}DEFAULT_PORT=8000{{/if}}
{{#if isDjango}}DEFAULT_PORT=8000{{/if}}
{{#if isFlask}}DEFAULT_PORT=5000{{/if}}
PORT="${PORT:-$DEFAULT_PORT}"
HEALTH_PATH="${HEALTH_PATH:-/}"

echo "[dev-up] starting dev server on port ${PORT}…"
{{#if isNextjs}}
npm run dev &
{{else if isFastapi}}
uvicorn app.main:app --reload --port "$PORT" &
{{else if isDjango}}
python manage.py runserver "$PORT" &
{{else if isFlask}}
flask --app app run --debug --port "$PORT" &
{{else if isExpress}}
node ./src/server.js &
{{else if isFastify}}
node ./src/server.js &
{{else if isNestjs}}
npm run start:dev &
{{else if isPython}}
python -m app &
{{else}}
npm run dev &
{{/if}}
SERVER_PID=$!

# Stop the background server when this script exits or is interrupted.
cleanup() {
  if kill -0 "$SERVER_PID" 2>/dev/null; then
    kill "$SERVER_PID" || true
  fi
}
trap cleanup EXIT INT TERM

# Wait for readiness: up to 60 probes, 0.5s apart (roughly 30s).
state=waiting
for ((attempt = 1; attempt <= 60; attempt++)); do
  if curl -fs "http://localhost:$PORT$HEALTH_PATH" >/dev/null 2>&1; then
    echo "[dev-up] ready at http://localhost:$PORT$HEALTH_PATH"
    state=ready
    break
  fi
  # Do not keep polling a server process that has already exited.
  if ! kill -0 "$SERVER_PID" 2>/dev/null; then
    echo "[dev-up] dev server exited before becoming ready" >&2
    state=exited
    break
  fi
  sleep 0.5
done

if [ "$state" = "waiting" ]; then
  # The server may still come up later, so warn instead of killing it.
  echo "[dev-up] warning: no response from http://localhost:$PORT$HEALTH_PATH after ~30s" >&2
fi

# Block for as long as the server runs. `wait` is the last command, so the
# server's exit status becomes this script's exit status.
wait "$SERVER_PID"
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// harness:report — aggregate eval results + skill telemetry into a per-skill
|
|
3
|
+
// summary. Reads .harness/eval/results/*.jsonl and .harness/telemetry.jsonl.
|
|
4
|
+
//
|
|
5
|
+
// Output:
|
|
6
|
+
// ### Eval results (last 7 days)
|
|
7
|
+
// <per-task: pass/fail counts, avg tokens>
|
|
8
|
+
// ### Skill invocations (last 7 days)
|
|
9
|
+
//   <per-skill: invocation count, last seen>
|
|
10
|
+
// ### Drift signals
|
|
11
|
+
// <skills that haven't been invoked in N days; tasks that have started failing>
|
|
12
|
+
//
|
|
13
|
+
// No external deps — pure Node stdlib.
|
|
14
|
+
|
|
15
|
+
import { readdir, readFile, stat } from "node:fs/promises";
|
|
16
|
+
import { existsSync } from "node:fs";
|
|
17
|
+
import { resolve, join } from "node:path";
|
|
18
|
+
|
|
19
|
+
const ROOT = process.cwd();
|
|
20
|
+
const RESULTS_DIR = resolve(ROOT, ".harness/eval/results");
|
|
21
|
+
const TELEMETRY = resolve(ROOT, ".harness/telemetry.jsonl");
|
|
22
|
+
const NOW = Date.now();
|
|
23
|
+
const SEVEN_DAYS = 7 * 24 * 60 * 60 * 1000;
|
|
24
|
+
|
|
25
|
+
/**
 * Read a JSON Lines file into an array of parsed values.
 *
 * @param {string} path - File to read.
 * @returns {Promise<unknown[]>} One entry per parseable non-blank line, in
 *   file order. A missing file yields an empty array; lines that are not
 *   valid JSON are skipped.
 */
async function readJsonl(path) {
  if (!existsSync(path)) return [];
  const text = await readFile(path, "utf8");
  return text
    .split("\n")
    .filter((candidate) => candidate.trim())
    .flatMap((candidate) => {
      // Wrap in a one-element array so parsed arrays stay intact under flatMap.
      try {
        return [JSON.parse(candidate)];
      } catch {
        return []; // skip malformed line
      }
    });
}
|
|
39
|
+
|
|
40
|
+
/**
 * Load every row from the `*.jsonl` files in RESULTS_DIR.
 *
 * Each row is tagged with `_mtime`, the modification time (ms) of the file it
 * came from.
 *
 * @returns {Promise<object[]>} All rows, file by file in directory-listing
 *   order; empty when RESULTS_DIR does not exist.
 */
async function loadEvalResults() {
  if (!existsSync(RESULTS_DIR)) return [];
  const collected = [];
  const names = (await readdir(RESULTS_DIR)).filter((name) => name.endsWith(".jsonl"));
  for (const name of names) {
    const file = join(RESULTS_DIR, name);
    const { mtimeMs } = await stat(file);
    for (const row of await readJsonl(file)) {
      row._mtime = mtimeMs;
      collected.push(row);
    }
  }
  return collected;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Keep only the rows whose timestamp is within the last seven days.
 *
 * The timestamp is taken from `row[key]` when it parses as a date. Otherwise
 * it falls back to `row._mtime`, then to 0. Previously an unparsable value
 * produced NaN and the row was dropped even when `_mtime` was available.
 *
 * @param {object[]} rows - Rows to filter.
 * @param {string} [key="ts"] - Property holding the row's timestamp.
 * @returns {object[]} A new array with the recent rows, in input order.
 */
function recent(rows, key = "ts") {
  return rows.filter((r) => {
    const parsed = r[key] ? new Date(r[key]).getTime() : NaN;
    const t = Number.isNaN(parsed) ? (r._mtime ?? 0) : parsed;
    return NOW - t <= SEVEN_DAYS;
  });
}
|
|
63
|
+
|
|
64
|
+
/**
 * Sum the token counts reported by a row's "efficiency" grades.
 *
 * A grade contributes when its `info` is a string starting with
 * "<digits> tokens". Anything else contributes 0, including a missing or
 * non-array `grades`, null grade entries, and non-string `info`. Such rows
 * come from hand-editable JSONL, so they must not abort the report.
 *
 * @param {object} row - One eval result row.
 * @returns {number} Total tokens across the row's efficiency grades.
 */
function tokensOf(row) {
  const grades = Array.isArray(row.grades) ? row.grades : [];
  return grades
    .filter((g) => g?.dim === "efficiency")
    .reduce((sum, g) => {
      const m = typeof g.info === "string" ? g.info.match(/^(\d+) tokens/) : null;
      return sum + (m ? parseInt(m[1], 10) : 0);
    }, 0);
}
|
|
72
|
+
|
|
73
|
+
/**
 * Format `num / total` as a whole-number percentage.
 *
 * @param {number} num - Numerator.
 * @param {number} total - Denominator.
 * @returns {string} e.g. "67%", or "n/a" when `total` is 0.
 */
function fmtPct(num, total) {
  return total === 0 ? "n/a" : `${Math.round((num / total) * 100)}%`;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Print the per-task eval table: pass rate, run count and average tokens.
 *
 * Rows are grouped by `taskId`. Rows without one are grouped under
 * "(unknown)" instead of crashing on `undefined.padEnd`. Groups are printed in
 * ascending task-id order.
 *
 * @param {object[]} rows - Eval rows already filtered to the reporting window.
 * @returns {void}
 */
function summarizeEvals(rows) {
  const byTask = new Map();
  for (const r of rows) {
    const key = String(r.taskId ?? "(unknown)");
    const group = byTask.get(key) ?? [];
    group.push(r);
    byTask.set(key, group);
  }
  console.log(`\n### Eval results (last 7 days, ${rows.length} runs)`);
  if (rows.length === 0) {
    console.log(" (no recent runs — try `npm run harness:eval -- --quick --transport=mock`)");
    return;
  }
  console.log(
    " task pass-rate runs avg-tokens",
  );
  console.log(
    " ---------------------- ---------- ----- ----------",
  );
  // Sort on the task id itself, not on the stringified [id, rows] pair.
  const ordered = [...byTask.entries()].sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
  for (const [taskId, taskRows] of ordered) {
    const passed = taskRows.filter((r) => r.passed).length;
    const tokens = taskRows.reduce((s, r) => s + tokensOf(r), 0);
    // Every group holds at least one row, so the division is safe.
    const avgTokens = Math.round(tokens / taskRows.length);
    const pct = fmtPct(passed, taskRows.length);
    console.log(
      ` ${taskId.padEnd(22)} ${pct.padStart(8)} ${String(taskRows.length).padStart(3)} ${String(avgTokens).padStart(8)}`,
    );
  }
}
|
|
106
|
+
|
|
107
|
+
/**
 * Print the per-skill invocation table: event count and last-seen timestamp.
 *
 * Rows are grouped by `skill`. Rows without one are grouped under "(unknown)"
 * instead of crashing on `undefined.padEnd`. Skills are listed from most to
 * least invoked.
 *
 * @param {object[]} rows - Telemetry rows already filtered to the reporting
 *   window.
 * @returns {void}
 */
function summarizeTelemetry(rows) {
  console.log(`\n### Skill invocations (last 7 days, ${rows.length} events)`);
  if (rows.length === 0) {
    console.log(
      " (no skill invocations recorded — telemetry hook may not be installed)",
    );
    console.log(
      " Verify `.claude/hooks/hooks.json` includes the Skill matcher.",
    );
    return;
  }
  const bySkill = new Map();
  for (const r of rows) {
    const key = String(r.skill ?? "(unknown)");
    const group = bySkill.get(key) ?? [];
    group.push(r);
    bySkill.set(key, group);
  }
  console.log(" skill invocations last-seen");
  console.log(" ----------------------------- ----------- --------------------");
  const ranked = [...bySkill.entries()].sort((a, b) => b[1].length - a[1].length);
  for (const [skill, events] of ranked) {
    // Default sort is lexicographic; Array.prototype.sort always places
    // undefined last, so a missing ts prints as "?".
    const last = events
      .map((e) => e.ts)
      .sort()
      .at(-1);
    console.log(
      ` ${skill.padEnd(29)} ${String(events.length).padStart(8)} ${last ?? "?"}`,
    );
  }
}
|
|
138
|
+
|
|
139
|
+
/**
 * Print drift signals: known skills absent from recent telemetry, and tasks
 * whose most recent eval run did not pass.
 *
 * "Most recent" is decided by the row's parsed `ts`, falling back to `_mtime`
 * (then 0) when `ts` is missing or unparsable. The caller's `evalRows` array
 * is not reordered.
 *
 * @param {object[]} evalRows - Recent eval rows.
 * @param {object[]} telemetryRows - Recent telemetry rows.
 * @returns {void}
 */
function driftSignals(evalRows, telemetryRows) {
  console.log(`\n### Drift signals`);
  const knownSkills = [
    "inspect-module",
    "inspect-app",
    "garbage-collection",
    "doc-drift-scan",
    "add-feature",
    "add-adr",
    "structural-test-author",
    "propose-harness-improvement",
    "write-skill",
    "debug-flow",
    "eval-runner",
  ];
  const seen = new Set(telemetryRows.map((r) => r.skill));
  const unseen = knownSkills.filter((s) => !seen.has(s));
  if (unseen.length > 0) {
    console.log(` skills not invoked in 7 days: ${unseen.join(", ")}`);
  }
  // Tasks failing in their most recent run.
  const runTime = (r) => {
    const parsed = r.ts ? new Date(r.ts).getTime() : NaN;
    return Number.isNaN(parsed) ? (r._mtime ?? 0) : parsed;
  };
  const latest = new Map();
  // Sort a copy oldest-first so the last write per task is its newest run.
  for (const r of [...evalRows].sort((a, b) => runTime(a) - runTime(b))) {
    latest.set(r.taskId, r);
  }
  const regressing = [...latest.values()].filter((r) => !r.passed);
  if (regressing.length > 0) {
    console.log(
      ` tasks failing in their latest run: ${regressing.map((r) => r.taskId).join(", ")}`,
    );
  }
  if (unseen.length === 0 && regressing.length === 0) {
    console.log(" (none)");
  }
}
|
|
174
|
+
|
|
175
|
+
/**
 * Entry point: load eval results and telemetry, narrow both to the recent
 * window, then print the report header and its three sections.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Load in the same order as before: eval results first, then telemetry.
  const allEvals = await loadEvalResults();
  const allTelemetry = await readJsonl(TELEMETRY);
  const [evalRows, telemetryRows] = [recent(allEvals), recent(allTelemetry)];

  console.log("=== agent-harness-kit report ===");
  console.log(`Generated: ${new Date().toISOString()}`);

  summarizeEvals(evalRows);
  summarizeTelemetry(telemetryRows);
  driftSignals(evalRows, telemetryRows);

  console.log("");
}
|
|
188
|
+
|
|
189
|
+
await main();
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Install scripts/pre-push.sh as the git pre-push hook for this repo.
# Run from the repo root. An existing pre-push hook that does not already
# delegate to scripts/pre-push.sh is saved as pre-push.bak before being replaced.
set -e

# `.git` is a directory in a normal clone but a plain file in linked worktrees
# and submodules, so test for existence rather than for a directory.
if [ ! -e .git ]; then
  echo "Not a git repo — run this script from the repo root." >&2
  exit 1
fi

# Ask git where hooks live, so worktrees, submodules and core.hooksPath are
# honoured. Fall back to the conventional location if git cannot answer.
HOOKS_DIR=$(git rev-parse --git-path hooks 2>/dev/null || true)
if [ -z "$HOOKS_DIR" ]; then
  HOOKS_DIR=.git/hooks
fi
HOOK_FILE="$HOOKS_DIR/pre-push"

mkdir -p "$HOOKS_DIR"

# Do not silently destroy a hook someone else installed.
if [ -f "$HOOK_FILE" ] && ! grep -q 'scripts/pre-push.sh' "$HOOK_FILE"; then
  cp "$HOOK_FILE" "$HOOK_FILE.bak"
  echo "Existing pre-push hook saved as $HOOK_FILE.bak" >&2
fi

cat > "$HOOK_FILE" <<'HOOK'
#!/usr/bin/env bash
exec bash scripts/pre-push.sh "$@"
HOOK
chmod +x "$HOOK_FILE"

echo "✓ git pre-push hook installed (delegates to scripts/pre-push.sh)"
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# pre-push hook — Stripe "shift-feedback-left" pattern. Runs only the
|
|
3
|
+
# deterministic checks (structural test + linter); no test suite is run here.
|
|
4
|
+
# Lives in scripts/ so it ships with the repo; install via install-git-hooks.sh.
|
|
5
|
+
set -e
|
|
6
|
+
|
|
7
|
+
echo "[pre-push] running structural test…"
|
|
8
|
+
if [ -f harness.config.json ] && grep -q '"language": "python"' harness.config.json; then
|
|
9
|
+
python -m harness.structural_test
|
|
10
|
+
else
|
|
11
|
+
npm run --silent harness:check
|
|
12
|
+
fi
|
|
13
|
+
|
|
14
|
+
echo "[pre-push] running lint…"
|
|
15
|
+
if [ -f package.json ] && grep -q '"lint"' package.json; then
|
|
16
|
+
npm run --silent lint
|
|
17
|
+
elif command -v ruff >/dev/null 2>&1; then
|
|
18
|
+
ruff check .
|
|
19
|
+
fi
|
|
20
|
+
|
|
21
|
+
echo "[pre-push] OK"
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Stop hook — LangChain's "PreCompletionChecklist" / Ralph Wiggum loop.
# On first stop: run deterministic checks; if any fail, re-inject *structured*
# failure context (not just check names) via stderr and exit 2. On second
# stop (stop_hook_active=true), exit 0 to allow real exit.
#
# Optional: set AHK_HEADLESS_RECOVER=1 to spawn `claude -p` in the background
# for one turn of recovery (costs tokens; off by default).
#
# Input:  the hook payload (JSON) on stdin.
# Exit:   0 = allow the stop, 2 = block and hand stderr back to Claude.
set -e

INPUT=$(cat)

# CRITICAL: avoid infinite loops. If the hook already ran, do not block again.
# jq is preferred. Without it, fall back to a pattern match on the raw payload
# so that a missing jq cannot make the hook block on every stop.
STOP_HOOK_ACTIVE=false
if command -v jq >/dev/null 2>&1; then
  STOP_HOOK_ACTIVE=$(printf '%s' "$INPUT" | jq -r '.stop_hook_active // false' 2>/dev/null || echo false)
elif printf '%s' "$INPUT" | grep -Eq '"stop_hook_active"[[:space:]]*:[[:space:]]*true'; then
  STOP_HOOK_ACTIVE=true
fi
if [ "$STOP_HOOK_ACTIVE" = "true" ]; then
  exit 0
fi

# Capture structured output per check. We use temp files so we can quote the
# tail back to Claude verbatim — names alone are not enough context for the
# agent to act on.
TMPDIR_HOOK=$(mktemp -d -t ahk-stop-hook.XXXXXX)
# Set to 1 when a background recovery turn still needs the directory.
KEEP_TMPDIR=0
# Preserve the script's exit code through the cleanup trap — otherwise the
# trailing `rm` resets the final status to 0 and Claude never sees the block.
trap 'rc=$?; [ "$KEEP_TMPDIR" = "1" ] || rm -rf "$TMPDIR_HOOK"; exit $rc' EXIT

# run_check NAME CMD [ARGS...]
# Runs CMD, storing stdout+stderr in $TMPDIR_HOOK/NAME.out. On failure, appends
# NAME to $TMPDIR_HOOK/failed.list and returns 1.
run_check() {
  local name="$1"
  shift
  local out="$TMPDIR_HOOK/$name.out"
  if "$@" >"$out" 2>&1; then
    return 0
  else
    echo "$name" >> "$TMPDIR_HOOK/failed.list"
    return 1
  fi
}

# Structural test.
if [ -f harness.config.json ]; then
  if grep -q '"language": "python"' harness.config.json; then
    run_check structural-test python -m harness.structural_test || true
  else
    run_check structural-test npm run --silent harness:check || true
  fi
fi

# Lint.
if [ -f package.json ] && grep -q '"lint"' package.json; then
  run_check lint npm run --silent lint || true
elif [ -f pyproject.toml ] && command -v ruff >/dev/null 2>&1; then
  run_check ruff ruff check . || true
fi

# Nothing failed (or nothing ran): allow the stop.
if [ ! -s "$TMPDIR_HOOK/failed.list" ]; then
  exit 0
fi

# Build a structured failure report for Claude. The agent gets: which checks
# failed, the last 50 lines of each failure, the first 10 entries of
# `git status --short`, and the last 3 commits (so the agent can correlate
# failures with its own edits).
{
  echo
  echo "=== Pre-completion checklist failed ==="
  while IFS= read -r failed; do
    echo
    echo "--- $failed ---"
    tail -50 "$TMPDIR_HOOK/$failed.out" 2>/dev/null || true
  done < "$TMPDIR_HOOK/failed.list"

  echo
  echo "--- recent changes (last 10 modified files) ---"
  if command -v git >/dev/null 2>&1; then
    git status --short 2>/dev/null | head -10 || true
    echo
    echo "--- last 3 commits ---"
    git log --oneline -3 2>/dev/null || true
  fi

  echo
  echo "Fix the failing check(s) and re-run them locally before declaring"
  echo "the task complete. Do NOT disable a check to make the hook pass."
} >&2

# Optional: opt-in headless recovery. Spawns a one-turn `claude -p` to
# attempt the fix autonomously. Useful for unattended CI / cron contexts.
# Off by default because it costs tokens.
if [ "${AHK_HEADLESS_RECOVER:-}" = "1" ] && command -v claude >/dev/null 2>&1; then
  FAILED_LIST=$(tr '\n' ' ' < "$TMPDIR_HOOK/failed.list")
  # The background turn reads the failure output from, and writes its own log
  # into, the temp dir. Skip the cleanup so the directory is not deleted
  # underneath it when this hook exits a moment later.
  KEEP_TMPDIR=1
  echo "[ahk] AHK_HEADLESS_RECOVER=1 — spawning recovery turn for: $FAILED_LIST" >&2
  echo "[ahk] recovery output is kept in $TMPDIR_HOOK" >&2
  claude -p \
    "The pre-completion checklist failed: $FAILED_LIST. Read the failure output in $TMPDIR_HOOK and apply the smallest fix. Do not disable any check." \
    --max-turns 5 \
    >"$TMPDIR_HOOK/recover.out" 2>&1 &
  # Don't wait — let the next session pick up the partially-applied fix.
fi

exit 2
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# PostToolUse hook — runs the structural test on the file just edited.
# Defensive: never blocks on missing tooling. Exit code 2 = block + Claude reads stderr.
#
# Input: the hook payload (JSON) on stdin; the edited path is read from
#        .tool_input.file_path.
# Env:   AHK_HOOK_MODE=warn skips the check entirely.
set -e

INPUT=$(cat)
if ! command -v jq >/dev/null 2>&1; then
  exit 0 # jq missing — silently skip rather than spuriously blocking
fi

# A payload jq cannot parse is treated like "no file": skip, never block.
FILE=$(printf '%s' "$INPUT" | jq -r '.tool_input.file_path // empty' 2>/dev/null) || exit 0
[ -z "$FILE" ] && exit 0

# Only run on source files; the extension selects the engine.
case "$FILE" in
  *.ts|*.tsx|*.js|*.jsx|*.mjs|*.cjs) ENGINE=ts ;;
  *.py) ENGINE=py ;;
  *) exit 0 ;;
esac

# Allow opt-out via env var — useful on Windows / macOS where some hook
# events are flaky (open issues #45065 and #6305).
if [ "${AHK_HOOK_MODE:-}" = "warn" ]; then
  echo "[ahk] hook running in warn-only mode (AHK_HOOK_MODE=warn)" >&2
  exit 0
fi

# run_scoped_check CMD [ARGS...]
# Runs CMD, forwards the last 50 lines of its combined output to stderr, and
# returns CMD's own exit status. Piping straight into `tail` would report
# tail's status instead, so a failing check would never be noticed.
run_scoped_check() {
  local output status=0
  output=$("$@" 2>&1) || status=$?
  if [ -n "$output" ]; then
    printf '%s\n' "$output" | tail -50 >&2
  fi
  return "$status"
}

# Run the structural test scoped to this file. Capture output so we can
# return only the relevant lines via stderr to Claude.
if [ "$ENGINE" = "ts" ]; then
  command -v npm >/dev/null 2>&1 || exit 0 # tooling missing — skip
  if ! run_scoped_check npm run --silent harness:check -- --file "$FILE"; then
    cat >&2 <<EOF

Structural test failed for $FILE.
Layer order: see harness.config.json.
Run \`npm run harness:check\` for full output.
Fix the violation before continuing — do NOT disable the test.
EOF
    exit 2
  fi
elif [ "$ENGINE" = "py" ]; then
  command -v python >/dev/null 2>&1 || exit 0 # tooling missing — skip
  if ! run_scoped_check python -m harness.structural_test --file "$FILE"; then
    cat >&2 <<EOF

Structural test failed for $FILE.
Layer order: see harness.config.json.
Run \`python -m harness.structural_test\` for full output.
Fix the violation before continuing — do NOT disable the test.
EOF
    exit 2
  fi
fi
exit 0
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# PostToolUse telemetry hook — logs every Skill invocation to
# .harness/telemetry.jsonl. Pure observation; never blocks.
#
# Each record has: ts (UTC), event ("skill_invoked"), skill, and sha (short
# HEAD commit, or "no-git"). No success or duration data is recorded.
#
# Input: the hook payload (JSON) on stdin; .tool_name and .tool_input.skill
#        are read from it.
#
# Deliberately no `set -e`: every step that can fail exits 0 explicitly, so a
# failing jq, mkdir or append can never become this hook's exit status.

INPUT=$(cat)
if ! command -v jq >/dev/null 2>&1; then
  exit 0 # jq missing — skip silently rather than spuriously blocking
fi

TOOL=$(printf '%s' "$INPUT" | jq -r '.tool_name // empty' 2>/dev/null) || exit 0
[ "$TOOL" = "Skill" ] || exit 0

SKILL=$(printf '%s' "$INPUT" | jq -r '.tool_input.skill // empty' 2>/dev/null) || exit 0
[ -n "$SKILL" ] || exit 0

mkdir -p .harness 2>/dev/null || exit 0

TS=$(date -u +%Y-%m-%dT%H:%M:%SZ)
SHA=$(git rev-parse --short HEAD 2>/dev/null || echo 'no-git')

# Build the record with jq so the skill name is JSON-escaped correctly.
LINE=$(jq -nc --arg ts "$TS" \
  --arg skill "$SKILL" \
  --arg sha "$SHA" \
  '{ts: $ts, event: "skill_invoked", skill: $skill, sha: $sha}') || exit 0
printf '%s\n' "$LINE" >> .harness/telemetry.jsonl || exit 0
exit 0
|