agent-harness-kit 0.5.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +29 -0
- package/bin/cli.mjs +10 -1
- package/package.json +1 -1
- package/src/core/detect-stack.mjs +32 -0
- package/src/core/render-templates.mjs +111 -4
- package/src/templates/.claude/hooks/hooks.json +87 -0
- package/src/templates/CLAUDE.md.hbs +1 -1
- package/src/templates/CLAUDE.md.vi.hbs +70 -0
- package/src/templates/_adapter-kotlin/harness/structural-check.mjs.hbs +286 -0
- package/src/templates/_adapter-rust/harness/structural-check.mjs.hbs +333 -60
- package/src/templates/_adapter-swift/harness/structural-check.mjs.hbs +285 -0
- package/src/templates/harness.config.json.hbs +5 -3
- package/src/templates/scripts/_lib/approx-tokens.mjs +48 -0
- package/src/templates/scripts/_lib/json-pick.mjs +278 -0
- package/src/templates/scripts/harness-report.mjs +95 -1
- package/src/templates/scripts/pre-compact.sh.hbs +121 -0
- package/src/templates/scripts/pre-push.sh +42 -8
- package/src/templates/scripts/precompletion-checklist.sh.hbs +143 -24
- package/src/templates/scripts/pretooluse-bash-guard.sh.hbs +146 -0
- package/src/templates/scripts/session-end.sh.hbs +48 -0
- package/src/templates/scripts/session-start.sh.hbs +139 -0
- package/src/templates/scripts/structural-test-on-edit.sh.hbs +56 -4
- package/src/templates/scripts/telemetry-on-skill.sh +32 -10
- package/src/templates/.claude/hooks/hooks.json.hbs +0 -39
|
@@ -20,7 +20,9 @@ const ROOT = process.cwd();
|
|
|
20
20
|
const RESULTS_DIR = resolve(ROOT, ".harness/eval/results");
|
|
21
21
|
const TELEMETRY = resolve(ROOT, ".harness/telemetry.jsonl");
|
|
22
22
|
const NOW = Date.now();
|
|
23
|
-
const
|
|
23
|
+
const ONE_DAY = 24 * 60 * 60 * 1000;
|
|
24
|
+
const SEVEN_DAYS = 7 * ONE_DAY;
|
|
25
|
+
const FOURTEEN_DAYS = 14 * ONE_DAY;
|
|
24
26
|
|
|
25
27
|
async function readJsonl(path) {
|
|
26
28
|
if (!existsSync(path)) return [];
|
|
@@ -61,6 +63,16 @@ function recent(rows, key = "ts") {
|
|
|
61
63
|
});
|
|
62
64
|
}
|
|
63
65
|
|
|
66
|
+
// Rows aged 7–14 days. Used as the comparator for week-over-week deltas
|
|
67
|
+
// so users can spot drift instead of staring at a single-week snapshot.
|
|
68
|
+
/**
 * Select the rows that are older than seven days but no older than fourteen,
 * measured against the module-level NOW.
 *
 * A row's time is parsed from `row[key]` when that field is truthy; otherwise
 * the numeric `row._mtime` is used, falling back to 0 when that is missing too.
 * Rows whose timestamp cannot be parsed yield NaN and are left out.
 *
 * @param {Array<object>} rows - Records to filter.
 * @param {string} [key="ts"] - Name of the timestamp field on each row.
 * @returns {Array<object>} Rows aged in the (7d, 14d] window.
 */
function priorWeek(rows, key = "ts") {
  const inPriorWindow = (row) => {
    const stamp = row[key];
    const millis = stamp ? new Date(stamp).getTime() : (row._mtime ?? 0);
    const elapsed = NOW - millis;
    if (elapsed <= SEVEN_DAYS) return false;
    return elapsed <= FOURTEEN_DAYS;
  };
  return rows.filter(inPriorWindow);
}
|
|
75
|
+
|
|
64
76
|
function tokensOf(row) {
|
|
65
77
|
return (row.grades ?? [])
|
|
66
78
|
.filter((g) => g.dim === "efficiency")
|
|
@@ -172,16 +184,98 @@ function driftSignals(evalRows, telemetryRows) {
|
|
|
172
184
|
}
|
|
173
185
|
}
|
|
174
186
|
|
|
187
|
+
// Aggregate eval rows by task into { passed, total, tokens }.
|
|
188
|
+
/**
 * Roll eval rows up per task.
 *
 * @param {Iterable<object>} rows - Eval rows; each is read for `taskId`,
 *   `passed`, and whatever `tokensOf` extracts from it.
 * @returns {Map<*, {passed: number, total: number, tokens: number}>} One entry
 *   per distinct `taskId`, in first-seen order: run count, passing-run count,
 *   and summed tokens.
 */
function aggregateEvals(rows) {
  const totals = new Map();
  for (const row of rows) {
    const id = row.taskId;
    if (!totals.has(id)) {
      totals.set(id, { passed: 0, total: 0, tokens: 0 });
    }
    const entry = totals.get(id);
    entry.total += 1;
    if (row.passed) entry.passed += 1;
    entry.tokens += tokensOf(row);
  }
  return totals;
}
|
|
199
|
+
|
|
200
|
+
// Render a single delta line. signMode controls icon meaning — for pass-rate,
|
|
201
|
+
// up is good; for tokens, up is bad; for skill invocations, neutral.
|
|
202
|
+
/**
 * Format one "now ← prior" delta cell.
 *
 * @param {number} now - Current-window value.
 * @param {number|null|undefined} then - Prior-window value; null/undefined
 *   means there is no comparator and the cell is rendered as "(new)".
 * @param {"neutral"|"good-up"|"good-down"} [signMode="neutral"] - How to judge
 *   the direction: "good-up" marks increases "+" and decreases "-",
 *   "good-down" is the reverse, anything else leaves the marker blank.
 * @param {string} [unit=""] - Suffix appended to every number (e.g. "%").
 * @returns {string} The formatted cell.
 */
function fmtDelta(now, then, signMode = "neutral", unit = "") {
  // `== null` covers both undefined and null, so a null prior is not
  // silently coerced to 0 by the subtraction below.
  if (then == null) return `(new) ${now}${unit}`;
  const diff = now - then;
  // Same "now ← prior" orientation as the non-zero branch (was "→").
  if (diff === 0) return `${now}${unit} ← ${then}${unit} (=)`;
  const arrow = diff > 0 ? "↑" : "↓";
  // The marker answers "is this an improvement?" independent of direction.
  let marker = " ";
  if (signMode === "good-up") marker = diff > 0 ? "+" : "-";
  else if (signMode === "good-down") marker = diff > 0 ? "-" : "+";
  return `${now}${unit} ← ${then}${unit} (${arrow}${marker} ${Math.abs(diff)}${unit})`;
}
|
|
213
|
+
|
|
214
|
+
/**
 * Print the week-over-week section of the report: per-task pass-rate and
 * average-token deltas, followed by per-skill invocation deltas.
 *
 * @param {Array<object>} evalRecent - Eval rows from the current window.
 * @param {Array<object>} evalPrior - Eval rows from the comparison window.
 * @param {Array<object>} telRecent - Telemetry rows from the current window.
 * @param {Array<object>} telPrior - Telemetry rows from the comparison window.
 * @returns {void} Output goes to stdout via console.log.
 */
function weekOverWeek(evalRecent, evalPrior, telRecent, telPrior) {
  // Rows may lack `taskId` / `skill`; coerce before calling padEnd so one
  // malformed row cannot crash the whole report with a TypeError.
  const label = (value) => String(value ?? "(unknown)");
  const signed = (n) => `${n >= 0 ? "+" : ""}${n}`;
  const passRate = (agg) => (agg ? Math.round((agg.passed / agg.total) * 100) : null);
  const avgTokens = (agg) => (agg && agg.total > 0 ? Math.round(agg.tokens / agg.total) : 0);
  const countBy = (rows, field) => {
    const counts = new Map();
    for (const row of rows) counts.set(row[field], (counts.get(row[field]) ?? 0) + 1);
    return counts;
  };

  console.log(`\n### Week-over-week (last 7d vs prior 7d)`);
  const aRecent = aggregateEvals(evalRecent);
  const aPrior = aggregateEvals(evalPrior);

  if (aRecent.size === 0 && aPrior.size === 0) {
    console.log(" (no eval data in either window — run `npm run harness:eval`)");
  } else {
    console.log(" task pass-rate (now ← prior) avg-tokens (now ← prior)");
    console.log(" ---------------------- ---------------------------- --------------------------");
    const taskIds = new Set([...aRecent.keys(), ...aPrior.keys()]);
    for (const t of [...taskIds].sort()) {
      const now = aRecent.get(t);
      const prior = aPrior.get(t);
      const nowRate = passRate(now);
      const priorRate = passRate(prior);
      const nowTok = avgTokens(now);
      const priorTok = avgTokens(prior);

      let rateCell;
      if (nowRate === null) rateCell = "(absent now)";
      else if (priorRate === null) rateCell = `${nowRate}% (new)`;
      else rateCell = `${nowRate}% ← ${priorRate}% (${signed(nowRate - priorRate)})`;

      // A task missing from one window has no meaningful token delta; do not
      // print a difference against an implicit 0 (mirrors the pass-rate cell).
      let tokCell;
      if ((nowTok === 0 && priorTok === 0) || !now) tokCell = "—";
      else if (!prior) tokCell = `${nowTok} (new)`;
      else tokCell = `${nowTok} ← ${priorTok} (${signed(nowTok - priorTok)})`;

      console.log(
        ` ${label(t).padEnd(22)} ${rateCell.padEnd(30)} ${tokCell}`,
      );
    }
  }

  // Skill invocation deltas.
  const recentBySkill = countBy(telRecent, "skill");
  const priorBySkill = countBy(telPrior, "skill");

  const allSkills = new Set([...recentBySkill.keys(), ...priorBySkill.keys()]);
  if (allSkills.size > 0) {
    console.log("\n skill invocations (now ← prior)");
    console.log(" ----------------------------- -------------------------------");
    for (const s of [...allSkills].sort()) {
      const n = recentBySkill.get(s) ?? 0;
      const p = priorBySkill.get(s) ?? 0;
      const cell = p === 0 ? `${n} (new)` : `${n} ← ${p} (${signed(n - p)})`;
      console.log(` ${label(s).padEnd(29)} ${cell}`);
    }
  }
}
|
|
265
|
+
|
|
175
266
|
/**
 * Entry point: load eval results and telemetry, split each into the current
 * and comparison windows, then print the report sections in order.
 *
 * @returns {Promise<void>} Resolves once every section has been printed.
 */
async function main() {
  const allEvals = await loadEvalResults();
  const allTelemetry = await readJsonl(TELEMETRY);

  const currentEvals = recent(allEvals);
  const previousEvals = priorWeek(allEvals);
  const currentTelemetry = recent(allTelemetry);
  const previousTelemetry = priorWeek(allTelemetry);

  console.log("=== agent-harness-kit report ===");
  console.log(`Generated: ${new Date().toISOString()}`);

  // Sections render in this fixed order.
  const sections = [
    () => summarizeEvals(currentEvals),
    () => summarizeTelemetry(currentTelemetry),
    () => weekOverWeek(currentEvals, previousEvals, currentTelemetry, previousTelemetry),
    () => driftSignals(currentEvals, currentTelemetry),
  ];
  for (const render of sections) {
    render();
  }

  console.log("");
}
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# PreCompact hook — write a small snapshot of state to
# .harness/compaction-snapshot.json BEFORE the context compactor runs.
# The companion SessionStart hook (matcher: compact) reads this snapshot
# back and re-injects the salient fields so the post-compaction model
# knows which feature it was working on, which branch, and how dirty
# the tree was.
#
# This is the kit's answer to the "I lost everything after compaction"
# failure mode that recurs in long sessions. Pair with:
#   - SessionStart matcher compact → re-inject
#   - PostCompact (not implemented; SessionStart does the work)
#
# Snapshot contents:
#   {
#     "compacted_at": "2026-05-16T19:00:00Z",
#     "branch": "main",
#     "sha": "abc1234",
#     "uncommitted": 7,
#     "feature": "auth-endpoint — POST /auth/login",
#     "trigger": "manual|auto",
#     "estimated_tokens_removed": 5000
#   }
#
# The hook NEVER blocks (exit 0 always). PreCompact can technically block
# compaction but doing so defeats the entire point.
set -eo pipefail

# Enforce the never-block contract: `set -e` still stops at the first failing
# step, but whatever status that step produced (jq exits 2 on a bad
# --argjson, for example) is replaced by 0 on the way out.
trap 'exit 0' EXIT

INPUT=$(cat)
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# have_jq — succeeds when jq is on PATH and AHK_DISABLE_JQ is not "1".
have_jq() {
  [ "${AHK_DISABLE_JQ:-}" = "1" ] && return 1
  command -v jq >/dev/null 2>&1
}

# have_jp — succeeds when either jq or the bundled Node picker is usable.
have_jp() {
  have_jq && return 0
  command -v node >/dev/null 2>&1 && [ -f "$SCRIPT_DIR/_lib/json-pick.mjs" ] && return 0
  return 1
}

# jp FILTER [FILE] — evaluate FILTER with jq, else with _lib/json-pick.mjs.
# Reads FILE when given, stdin otherwise.
jp() {
  if have_jq; then
    if [ -n "$2" ]; then jq -r "$1" "$2"; else jq -r "$1"; fi
  else
    if [ -n "$2" ]; then
      node "$SCRIPT_DIR/_lib/json-pick.mjs" "$1" "$2"
    else
      node "$SCRIPT_DIR/_lib/json-pick.mjs" "$1"
    fi
  fi
}

# json_number VALUE — echo VALUE when it is a plain JSON number, else 0.
# Keeps non-numeric hook input out of `jq --argjson` and raw JSON text.
json_number() {
  if [[ "$1" =~ ^-?[0-9]+(\.[0-9]+)?$ ]]; then
    printf '%s' "$1"
  else
    printf '0'
  fi
}

# json_escape VALUE — escape backslashes and double quotes for embedding in
# a JSON string literal. Control characters are not handled; this is only
# used by the last-resort writer below.
json_escape() {
  printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
}

TRIGGER=""
TOKENS=""
if have_jp; then
  TRIGGER=$(echo "$INPUT" | jp '.trigger // "auto"' 2>/dev/null || true)
  TOKENS=$(echo "$INPUT" | jp '.estimated_tokens_removed // 0' 2>/dev/null || true)
fi

mkdir -p .harness

TS=$(date -u +%Y-%m-%dT%H:%M:%SZ)
BR="(no-git)"
SHA="(no-git)"
COUNT=0
if command -v git >/dev/null 2>&1 && git rev-parse --git-dir >/dev/null 2>&1; then
  # `git branch --show-current` prints nothing and still exits 0 on a
  # detached HEAD, so test the output rather than the exit status.
  BR=$(git branch --show-current 2>/dev/null || true)
  [ -n "$BR" ] || BR="(detached)"
  SHA=$(git rev-parse --short HEAD 2>/dev/null || echo "(none)")
  COUNT=$(git status --short 2>/dev/null | wc -l | tr -d ' ')
fi

# First feature in feature_list.json whose `passes` is false, as "id — title".
FEAT=""
if [ -f feature_list.json ]; then
  if have_jq; then
    FEAT=$(jq -r 'first(.features[] | select(.passes == false)) | "\(.id) — \(.title)"' \
      feature_list.json 2>/dev/null || true)
  elif command -v node >/dev/null 2>&1; then
    FEAT=$(node -e "
      const f = JSON.parse(require('fs').readFileSync('feature_list.json','utf8'));
      const o = (f.features || []).find(x => x.passes === false);
      if (o) process.stdout.write(o.id + ' — ' + o.title);
    " 2>/dev/null || true)
  fi
fi

# Compose JSON via Node when available — handles escaping right. Every value
# travels through the environment and the script text is single-quoted, so a
# quote or backslash in a branch name or in the hook payload can never become
# part of the JavaScript source.
if command -v node >/dev/null 2>&1; then
  AHK_SNAP_TS="$TS" \
  AHK_SNAP_BRANCH="$BR" \
  AHK_SNAP_SHA="$SHA" \
  AHK_SNAP_COUNT="$COUNT" \
  AHK_SNAP_FEATURE="$FEAT" \
  AHK_SNAP_TRIGGER="$TRIGGER" \
  AHK_SNAP_TOKENS="$TOKENS" \
  node -e '
    const fs = require("fs");
    const env = process.env;
    const snap = {
      compacted_at: env.AHK_SNAP_TS,
      branch: env.AHK_SNAP_BRANCH,
      sha: env.AHK_SNAP_SHA,
      uncommitted: parseInt(env.AHK_SNAP_COUNT, 10) || 0,
      feature: env.AHK_SNAP_FEATURE || "",
      trigger: env.AHK_SNAP_TRIGGER || "auto",
      estimated_tokens_removed: parseInt(env.AHK_SNAP_TOKENS, 10) || 0
    };
    fs.writeFileSync(".harness/compaction-snapshot.json", JSON.stringify(snap, null, 2) + "\n");
  '
elif have_jq; then
  jq -n --arg ts "$TS" --arg br "$BR" --arg sha "$SHA" \
    --argjson cnt "$(json_number "$COUNT")" --arg feat "$FEAT" \
    --arg trig "${TRIGGER:-auto}" --argjson tok "$(json_number "$TOKENS")" \
    '{compacted_at: $ts, branch: $br, sha: $sha, uncommitted: $cnt,
      feature: $feat, trigger: $trig, estimated_tokens_removed: $tok}' \
    > .harness/compaction-snapshot.json
else
  # No JSON tool available — write a minimal record. SessionStart compact
  # branch reads fields individually so partial records still work.
  cat > .harness/compaction-snapshot.json <<EOF
{
  "compacted_at": "$(json_escape "$TS")",
  "branch": "$(json_escape "$BR")",
  "sha": "$(json_escape "$SHA")",
  "uncommitted": $(json_number "$COUNT"),
  "feature": "$(json_escape "$FEAT")",
  "trigger": "$(json_escape "${TRIGGER:-auto}")"
}
EOF
fi
exit 0
|
|
@@ -2,7 +2,32 @@
|
|
|
2
2
|
# pre-push hook — Stripe "shift-feedback-left" pattern. Runs only the
|
|
3
3
|
# deterministic checks (structural test + linter + tests on changed files).
|
|
4
4
|
# Lives in scripts/ so it ships with the repo; install via install-git-hooks.sh.
|
|
5
|
-
set -
|
|
5
|
+
set -eo pipefail
|
|
6
|
+
|
|
7
|
+
# Resolve script dir so we can find _lib/json-pick.mjs (Node fallback for jq).
|
|
8
|
+
# Without this fallback, `jq` missing on a fresh CI image silently disabled
|
|
9
|
+
# the baseline-monotonic guard — a known audit hole.
|
|
10
|
+
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
11
|
+
# have_jq — report whether jq may be used.
# Returns 1 when AHK_DISABLE_JQ is exactly "1"; otherwise returns the status
# of looking jq up on PATH.
have_jq() {
  if [ "${AHK_DISABLE_JQ:-}" = "1" ]; then
    return 1
  fi
  command -v jq >/dev/null 2>&1
}
|
|
15
|
+
# have_jp — report whether some JSON picker is usable: jq (per have_jq), or
# node on PATH together with the bundled _lib/json-pick.mjs next to this script.
have_jp() {
  if have_jq; then
    return 0
  fi
  if command -v node >/dev/null 2>&1 && [ -f "$SCRIPT_DIR/_lib/json-pick.mjs" ]; then
    return 0
  fi
  return 1
}
|
|
20
|
+
# jp FILTER [FILE] — evaluate FILTER against FILE (or stdin when FILE is
# empty or absent). Uses `jq -r` when have_jq succeeds, otherwise the bundled
# Node picker at _lib/json-pick.mjs.
jp() {
  if have_jq; then
    if [ -z "$2" ]; then
      jq -r "$1"
    else
      jq -r "$1" "$2"
    fi
  elif [ -z "$2" ]; then
    node "$SCRIPT_DIR/_lib/json-pick.mjs" "$1"
  else
    node "$SCRIPT_DIR/_lib/json-pick.mjs" "$1" "$2"
  fi
}
|
|
6
31
|
|
|
7
32
|
# Baseline monotonic guard. .harness/structural-baseline.json is decreasing-
|
|
8
33
|
# only — fixes REMOVE entries; no path should ADD them. Catches the "mask
|
|
@@ -10,11 +35,11 @@ set -e
|
|
|
10
35
|
# Runs first because a grown baseline silently masks structural-test failures.
|
|
11
36
|
BASELINE_FILE=".harness/structural-baseline.json"
|
|
12
37
|
if [ -f "$BASELINE_FILE" ] \
|
|
13
|
-
&&
|
|
38
|
+
&& have_jp \
|
|
14
39
|
&& git rev-parse --verify HEAD >/dev/null 2>&1 \
|
|
15
40
|
&& git cat-file -e "HEAD:$BASELINE_FILE" 2>/dev/null; then
|
|
16
|
-
CURRENT_COUNT=$(
|
|
17
|
-
HEAD_COUNT=$(git show "HEAD:$BASELINE_FILE" 2>/dev/null |
|
|
41
|
+
CURRENT_COUNT=$(jp 'length' "$BASELINE_FILE" 2>/dev/null || echo 0)
|
|
42
|
+
HEAD_COUNT=$(git show "HEAD:$BASELINE_FILE" 2>/dev/null | jp 'length' 2>/dev/null || echo 0)
|
|
18
43
|
if [ "$CURRENT_COUNT" -gt "$HEAD_COUNT" ]; then
|
|
19
44
|
{
|
|
20
45
|
echo
|
|
@@ -33,11 +58,20 @@ if [ -f "$BASELINE_FILE" ] \
|
|
|
33
58
|
fi
|
|
34
59
|
fi
|
|
35
60
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
61
|
+
# Structural test. Skipped when `structuralTest.engine` is explicitly "none"
|
|
62
|
+
# (e.g. during scaffold of a polyglot repo where the adapter is not yet
|
|
63
|
+
# wired). Without this guard the push fails silently because
|
|
64
|
+
# `npm run harness:check` has no matching script.
|
|
65
|
+
# Both config probes accept any whitespace around the colon, so compact JSON
# ("language":"python") selects the Python runner just like pretty-printed
# JSON does instead of silently falling through to the npm branch.
if [ -f harness.config.json ] \
  && grep -qE '"engine"[[:space:]]*:[[:space:]]*"none"' harness.config.json; then
  echo "[pre-push] structural test skipped (structuralTest.engine: none)"
else
  echo "[pre-push] running structural test…"
  if [ -f harness.config.json ] \
    && grep -qE '"language"[[:space:]]*:[[:space:]]*"python"' harness.config.json; then
    python -m harness.structural_test
  else
    npm run --silent harness:check
  fi
fi
|
|
42
76
|
|
|
43
77
|
echo "[pre-push] running lint…"
|
|
@@ -12,9 +12,43 @@ set -e
|
|
|
12
12
|
|
|
13
13
|
INPUT=$(cat)
|
|
14
14
|
|
|
15
|
+
# Resolve the directory this hook lives in (used to find _lib/json-pick.mjs).
|
|
16
|
+
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
17
|
+
|
|
18
|
+
# have_jq — env-overridable probe. AHK_DISABLE_JQ=1 forces the Node fallback,
|
|
19
|
+
# used by tests to exercise the jq-less code path on machines that have jq
|
|
20
|
+
# installed locally.
|
|
21
|
+
# Returns 1 when AHK_DISABLE_JQ is exactly "1"; otherwise returns the status
# of looking jq up on PATH.
have_jq() {
  if [ "${AHK_DISABLE_JQ:-}" = "1" ]; then
    return 1
  fi
  command -v jq >/dev/null 2>&1
}
|
|
25
|
+
# jp — JSON picker. Uses `jq` when available, else falls back to a bundled
|
|
26
|
+
# Node script with a jq-subset implementation. Keeps hooks portable on
|
|
27
|
+
# minimal CI / Windows where jq is not installed by default. Without this
|
|
28
|
+
# fallback, the entire pre-completion check used to be a silent no-op.
|
|
29
|
+
# jp FILTER [FILE] — evaluate FILTER against FILE, or against stdin when FILE
# is empty or absent. Dispatches to `jq -r` when have_jq succeeds and to the
# bundled _lib/json-pick.mjs Node script otherwise.
jp() {
  if have_jq; then
    if [ -z "$2" ]; then
      jq -r "$1"
    else
      jq -r "$1" "$2"
    fi
  elif [ -z "$2" ]; then
    node "$SCRIPT_DIR/_lib/json-pick.mjs" "$1"
  else
    node "$SCRIPT_DIR/_lib/json-pick.mjs" "$1" "$2"
  fi
}
|
|
40
|
+
# Probe: do we have either jq or the Node fallback? Node is always
|
|
41
|
+
# present (kit's `engines` field requires >=20), so this is just an explicit
|
|
42
|
+
# probe and a fail-loud branch if even node is missing.
|
|
43
|
+
# Succeeds when jq is usable (per have_jq), or when node is on PATH and the
# bundled _lib/json-pick.mjs exists next to this script; returns 1 otherwise.
have_jp() {
  if have_jq; then
    return 0
  fi
  if command -v node >/dev/null 2>&1 && [ -f "$SCRIPT_DIR/_lib/json-pick.mjs" ]; then
    return 0
  fi
  return 1
}
|
|
48
|
+
|
|
15
49
|
# CRITICAL: avoid infinite loops. If the hook already ran, do not block again.
|
|
16
|
-
if
|
|
17
|
-
if [ "$(echo "$INPUT" |
|
|
50
|
+
if have_jp; then
|
|
51
|
+
if [ "$(echo "$INPUT" | jp '.stop_hook_active // false')" = "true" ]; then
|
|
18
52
|
exit 0
|
|
19
53
|
fi
|
|
20
54
|
fi
|
|
@@ -39,8 +73,12 @@ run_check() {
|
|
|
39
73
|
fi
|
|
40
74
|
}
|
|
41
75
|
|
|
42
|
-
# Structural test.
|
|
43
|
-
|
|
76
|
+
# Structural test. Skipped when `structuralTest.engine` is explicitly "none"
|
|
77
|
+
# (e.g. during scaffold of a polyglot repo where the adapter is not yet
|
|
78
|
+
# wired). Without this guard the check fails silently with an empty body
|
|
79
|
+
# because `npm run harness:check` has no matching script.
|
|
80
|
+
if [ -f harness.config.json ] \
|
|
81
|
+
&& ! grep -qE '"engine"[[:space:]]*:[[:space:]]*"none"' harness.config.json; then
|
|
44
82
|
if grep -q '"language": "python"' harness.config.json; then
|
|
45
83
|
run_check structural-test python -m harness.structural_test || true
|
|
46
84
|
else
|
|
@@ -55,12 +93,17 @@ elif [ -f pyproject.toml ] && command -v ruff >/dev/null 2>&1; then
|
|
|
55
93
|
run_check ruff ruff check . || true
|
|
56
94
|
fi
|
|
57
95
|
|
|
58
|
-
# CLAUDE.md
|
|
59
|
-
#
|
|
60
|
-
#
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
96
|
+
# CLAUDE.md size caps. Two complementary signals:
|
|
97
|
+
# - maxInstructions (default 200): bullet/numbered-item count. Suits
|
|
98
|
+
# ASCII-heavy English where a bullet ≈ a fixed token weight.
|
|
99
|
+
# - maxTokens (default 0 = off): approximate token cap. Catches drift
|
|
100
|
+
# in non-ASCII content (Vietnamese, CJK, etc.) where 200 bullets
|
|
101
|
+
# may carry 2–3× more tokens than the HumanLayer baseline measured.
|
|
102
|
+
# Both checks fire independently — exceed either → block.
|
|
103
|
+
if [ -f harness.config.json ] && have_jp; then
|
|
104
|
+
CMD_PATH=$(jp '.claudeMd.path // "CLAUDE.md"' harness.config.json)
|
|
105
|
+
CMD_CAP=$(jp '.claudeMd.maxInstructions // 200' harness.config.json)
|
|
106
|
+
CMD_TOK_CAP=$(jp '.claudeMd.maxTokens // 0' harness.config.json)
|
|
64
107
|
if [ -f "$CMD_PATH" ] && [ "$CMD_CAP" -gt 0 ] 2>/dev/null; then
|
|
65
108
|
CMD_COUNT=$(grep -cE '^[[:space:]]*([-*]|[0-9]+\.)[[:space:]]' "$CMD_PATH" 2>/dev/null || echo 0)
|
|
66
109
|
if [ "$CMD_COUNT" -gt "$CMD_CAP" ]; then
|
|
@@ -81,6 +124,24 @@ if [ -f harness.config.json ] && command -v jq >/dev/null 2>&1; then
|
|
|
81
124
|
echo "claude-md-cap" >> "$TMPDIR_HOOK/failed.list"
|
|
82
125
|
fi
|
|
83
126
|
fi
|
|
127
|
+
if [ -f "$CMD_PATH" ] && [ "$CMD_TOK_CAP" -gt 0 ] 2>/dev/null \
|
|
128
|
+
&& command -v node >/dev/null 2>&1 \
|
|
129
|
+
&& [ -f "$SCRIPT_DIR/_lib/approx-tokens.mjs" ]; then
|
|
130
|
+
CMD_TOK=$(node "$SCRIPT_DIR/_lib/approx-tokens.mjs" "$CMD_PATH" 2>/dev/null || echo 0)
|
|
131
|
+
if [ "$CMD_TOK" -gt "$CMD_TOK_CAP" ]; then
|
|
132
|
+
{
|
|
133
|
+
echo "$CMD_PATH approximate token count: $CMD_TOK (cap: $CMD_TOK_CAP)"
|
|
134
|
+
echo
|
|
135
|
+
echo "Heuristic token cap — set because instruction count alone misses"
|
|
136
|
+
echo "drift in non-ASCII content (Vietnamese, CJK) where a bullet can"
|
|
137
|
+
echo "carry 2-3x more tokens than the HumanLayer baseline measured."
|
|
138
|
+
echo
|
|
139
|
+
echo "Adjust the cap (with justification) in harness.config.json:"
|
|
140
|
+
echo " .claudeMd.maxTokens"
|
|
141
|
+
} > "$TMPDIR_HOOK/claude-md-tokens.out"
|
|
142
|
+
echo "claude-md-tokens" >> "$TMPDIR_HOOK/failed.list"
|
|
143
|
+
fi
|
|
144
|
+
fi
|
|
84
145
|
fi
|
|
85
146
|
|
|
86
147
|
# Multi-layer review trigger. When uncommitted/staged/untracked changes touch
|
|
@@ -90,7 +151,7 @@ fi
|
|
|
90
151
|
# with a mechanical count off `harness.config.json` `domains[].layers` /
|
|
91
152
|
# `.root`. Fires once per stop; the loop guard (`stop_hook_active`) lets the
|
|
92
153
|
# next stop succeed after the agent has read the recommendation.
|
|
93
|
-
if [ -f harness.config.json ] &&
|
|
154
|
+
if [ -f harness.config.json ] && have_jp && command -v git >/dev/null 2>&1; then
|
|
94
155
|
CHANGED=$(
|
|
95
156
|
{
|
|
96
157
|
git diff --name-only 2>/dev/null || true
|
|
@@ -99,23 +160,29 @@ if [ -f harness.config.json ] && command -v jq >/dev/null 2>&1 && command -v git
|
|
|
99
160
|
} | sort -u
|
|
100
161
|
)
|
|
101
162
|
if [ -n "$CHANGED" ]; then
|
|
102
|
-
NUM_DOMAINS=$(
|
|
163
|
+
NUM_DOMAINS=$(jp '.domains | length' harness.config.json 2>/dev/null || echo 0)
|
|
103
164
|
MULTI_OUT="$TMPDIR_HOOK/multi-layer-review.out"
|
|
104
165
|
: > "$MULTI_OUT"
|
|
105
166
|
MULTI_HIT=0
|
|
106
167
|
i=0
|
|
107
168
|
while [ "$i" -lt "$NUM_DOMAINS" ]; do
|
|
108
|
-
ROOT=$(
|
|
109
|
-
DOMAIN=$(
|
|
169
|
+
ROOT=$(jp ".domains[$i].root" harness.config.json)
|
|
170
|
+
DOMAIN=$(jp ".domains[$i].name" harness.config.json)
|
|
171
|
+
# Optional layerDirPattern — supports conventions where the layer
|
|
172
|
+
# directory is not literally `{layer}`. Example: a Rust workspace
|
|
173
|
+
# with crates named `unibot-types`, `unibot-crypto`, ... uses
|
|
174
|
+
# `"layerDirPattern": "unibot-{layer}"`. Defaults to `{layer}`.
|
|
175
|
+
LAYER_PATTERN=$(jp ".domains[$i].layerDirPattern // \"{layer}\"" harness.config.json)
|
|
110
176
|
TOUCHED_COUNT=0
|
|
111
177
|
TOUCHED_NAMES=""
|
|
112
178
|
while IFS= read -r layer; do
|
|
113
179
|
[ -z "$layer" ] && continue
|
|
114
|
-
|
|
180
|
+
LAYER_DIR=$(printf '%s' "$LAYER_PATTERN" | sed "s/{layer}/$layer/g")
|
|
181
|
+
if echo "$CHANGED" | grep -qE "^${ROOT}/${LAYER_DIR}(/|$)"; then
|
|
115
182
|
TOUCHED_COUNT=$((TOUCHED_COUNT + 1))
|
|
116
183
|
TOUCHED_NAMES="$TOUCHED_NAMES $layer"
|
|
117
184
|
fi
|
|
118
|
-
done < <(
|
|
185
|
+
done < <(jp ".domains[$i].layers[]" harness.config.json)
|
|
119
186
|
if [ "$TOUCHED_COUNT" -ge 2 ]; then
|
|
120
187
|
echo "Domain '$DOMAIN' has changes spanning $TOUCHED_COUNT layers:$TOUCHED_NAMES" >> "$MULTI_OUT"
|
|
121
188
|
MULTI_HIT=1
|
|
@@ -177,8 +244,8 @@ HEADLESS_SOURCE=""
|
|
|
177
244
|
if [ "${AHK_HEADLESS_RECOVER:-}" = "1" ]; then
|
|
178
245
|
HEADLESS_RECOVER=1
|
|
179
246
|
HEADLESS_SOURCE="AHK_HEADLESS_RECOVER"
|
|
180
|
-
elif [ -f harness.config.json ] &&
|
|
181
|
-
CFG_VAL=$(
|
|
247
|
+
elif [ -f harness.config.json ] && have_jp; then
|
|
248
|
+
CFG_VAL=$(jp '.recovery.headless // false' harness.config.json 2>/dev/null)
|
|
182
249
|
if [ "$CFG_VAL" = "true" ]; then
|
|
183
250
|
HEADLESS_RECOVER=1
|
|
184
251
|
HEADLESS_SOURCE="harness.config.json:.recovery.headless"
|
|
@@ -186,12 +253,64 @@ elif [ -f harness.config.json ] && command -v jq >/dev/null 2>&1; then
|
|
|
186
253
|
fi
|
|
187
254
|
if [ "$HEADLESS_RECOVER" = "1" ] && command -v claude >/dev/null 2>&1; then
|
|
188
255
|
FAILED_LIST=$(tr '\n' ' ' < "$TMPDIR_HOOK/failed.list")
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
#
|
|
256
|
+
|
|
257
|
+
# Concurrency guard. Two Stop events in different sessions (e.g. user
|
|
258
|
+
# working in two terminals, or an unattended CI rerun firing while a
|
|
259
|
+
# previous recovery is still active) used to race and edit the same
|
|
260
|
+
# files. The lock is a directory created atomically with `mkdir`; the
|
|
261
|
+
# PID file inside lets us detect stale locks left by a crashed parent.
|
|
262
|
+
mkdir -p .harness
|
|
263
|
+
LOCK_DIR=".harness/recovery.lock"
|
|
264
|
+
LOCK_STALE_MAX_SECS=${AHK_RECOVERY_LOCK_STALE_SECS:-1800}
|
|
265
|
+
|
|
266
|
+
if mkdir "$LOCK_DIR" 2>/dev/null; then
|
|
267
|
+
# We won the race — spawn the recovery turn. Snapshot the failure
|
|
268
|
+
# context into the lock dir BEFORE the parent's EXIT trap deletes
|
|
269
|
+
# TMPDIR_HOOK; otherwise the subshell's redirect to recover.out
|
|
270
|
+
# races the parent's cleanup and the subshell dies before claude
|
|
271
|
+
# can run. Everything the recovery needs (failed.list, per-check
|
|
272
|
+
# output, recover.out) now lives inside LOCK_DIR — self-contained.
|
|
273
|
+
cp -r "$TMPDIR_HOOK/." "$LOCK_DIR/snapshot/" 2>/dev/null || true
|
|
274
|
+
(
|
|
275
|
+
# Trap removes the lock on subshell EXIT (success, failure, or signal).
|
|
276
|
+
trap 'rm -rf "$LOCK_DIR"' EXIT
|
|
277
|
+
claude -p \
|
|
278
|
+
"The pre-completion checklist failed: $FAILED_LIST. Read the failure output in $LOCK_DIR/snapshot and apply the smallest fix. Do not disable any check." \
|
|
279
|
+
--max-turns 5 \
|
|
280
|
+
>"$LOCK_DIR/recover.out" 2>&1
|
|
281
|
+
) &
|
|
282
|
+
SUB_PID=$!
|
|
283
|
+
# Parent writes metadata SYNCHRONOUSLY before printing the "spawned"
|
|
284
|
+
# message so a second Stop firing immediately after never sees an
|
|
285
|
+
# empty pid file. Subsecond races between mkdir and these writes are
|
|
286
|
+
# closed by the bounded read-loop in the lock-held branch below.
|
|
287
|
+
echo "$SUB_PID" > "$LOCK_DIR/pid"
|
|
288
|
+
date +%s > "$LOCK_DIR/started_at"
|
|
289
|
+
echo "$HEADLESS_SOURCE" > "$LOCK_DIR/source"
|
|
290
|
+
echo "[ahk] headless recovery spawned (source=$HEADLESS_SOURCE, wrapper-pid=$SUB_PID, lock=$LOCK_DIR)" >&2
|
|
291
|
+
else
|
|
292
|
+
# Lock already held. Read who holds it and decide: live → skip,
|
|
293
|
+
# stale → reclaim. We never block the user's Stop on the lock —
|
|
294
|
+
# worst case we skip a recovery turn that the next Stop can retry.
|
|
295
|
+
# Bounded wait for the pid file to materialize — closes the race
|
|
296
|
+
# window between the parent's `mkdir` and its `echo $SUB_PID > pid`.
|
|
297
|
+
for _ in 1 2 3 4 5 6 7 8 9 10; do
|
|
298
|
+
[ -s "$LOCK_DIR/pid" ] && break
|
|
299
|
+
sleep 0.05
|
|
300
|
+
done
|
|
301
|
+
EXISTING_PID=$(cat "$LOCK_DIR/pid" 2>/dev/null || true)
|
|
302
|
+
STARTED_AT=$(cat "$LOCK_DIR/started_at" 2>/dev/null || echo 0)
|
|
303
|
+
NOW=$(date +%s)
|
|
304
|
+
AGE=$((NOW - STARTED_AT))
|
|
305
|
+
if [ -n "$EXISTING_PID" ] && kill -0 "$EXISTING_PID" 2>/dev/null; then
|
|
306
|
+
echo "[ahk] headless recovery skipped — another session already running (pid=$EXISTING_PID, age=${AGE}s, lock=$LOCK_DIR)" >&2
|
|
307
|
+
elif [ "$AGE" -gt "$LOCK_STALE_MAX_SECS" ]; then
|
|
308
|
+
echo "[ahk] headless recovery: removing stale lock (pid=$EXISTING_PID, age=${AGE}s > ${LOCK_STALE_MAX_SECS}s); next stop will retry. lock=$LOCK_DIR" >&2
|
|
309
|
+
rm -rf "$LOCK_DIR"
|
|
310
|
+
else
|
|
311
|
+
echo "[ahk] headless recovery skipped — lock present with dead pid=$EXISTING_PID (age=${AGE}s, will reclaim after ${LOCK_STALE_MAX_SECS}s). lock=$LOCK_DIR" >&2
|
|
312
|
+
fi
|
|
313
|
+
fi
|
|
195
314
|
fi
|
|
196
315
|
|
|
197
316
|
exit 2
|