agent-harness-kit 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -2
- package/src/templates/.claude/agents/architecture-reviewer.md.hbs +1 -1
- package/src/templates/.claude/skills/propose-harness-improvement/SKILL.md +6 -0
- package/src/templates/CLAUDE.md.hbs +2 -0
- package/src/templates/docs/agent-failures.md +1 -1
- package/src/templates/docs/golden-principles.md.hbs +45 -0
- package/src/templates/harness.config.json.hbs +4 -0
- package/src/templates/scripts/pre-push.sh +29 -0
- package/src/templates/scripts/precompletion-checklist.sh.hbs +80 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-harness-kit",
|
|
3
|
-
"version": "0.3.0",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Solo-dev harness engineering kit for Claude Code. Layered architecture, structural tests, garbage-collection ritual, review subagents — without the enterprise overhead.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -40,7 +40,9 @@
|
|
|
40
40
|
"scripts": {
|
|
41
41
|
"test": "node --test tests/*.test.mjs",
|
|
42
42
|
"lint": "echo 'no-op (kit is plain ESM JS)'",
|
|
43
|
-
"selftest": "node bin/cli.mjs --version"
|
|
43
|
+
"selftest": "node bin/cli.mjs --version",
|
|
44
|
+
"harness:eval": "node src/templates/_adapter-typescript/harness/eval-runner.mjs",
|
|
45
|
+
"harness:check": "echo 'no-op (kit-level structural rules are TBD; see .harness/eval/tasks/03-add-structural-rule.json)'"
|
|
44
46
|
},
|
|
45
47
|
"dependencies": {
|
|
46
48
|
"@inquirer/prompts": "^7.0.0",
|
|
package/src/templates/.claude/agents/architecture-reviewer.md.hbs
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: architecture-reviewer
|
|
3
|
-
description: Use this agent
|
|
3
|
+
description: Use this agent when the Stop hook surfaces a `multi-layer-review` flag (changes span ≥2 layers in a single domain — mechanical count, not self-judgment), or when a change adds a new domain / modifies imports across module boundaries. Verifies the {{layersJoined}} rule, provider boundaries, and golden-principles.md compliance. Read-only — never modifies files.
|
|
4
4
|
tools: Read, Grep, Glob, Bash({{#if isPython}}python -m harness.structural_test{{else}}npm run harness:check{{/if}}), Bash(git diff:*)
|
|
5
5
|
model: sonnet
|
|
6
6
|
---
|
|
package/src/templates/.claude/skills/propose-harness-improvement/SKILL.md
CHANGED
|
@@ -18,6 +18,12 @@ suggested-turns: 8
|
|
|
18
18
|
invoke `/write-skill`.
|
|
19
19
|
- **(d) Wrong layer / architecture** — the structure invited the
|
|
20
20
|
mistake. Fix: write an ADR via `/add-adr`.
|
|
21
|
+
- **(e) Wrong instruction in prompt** — the failure traces back to a
|
|
22
|
+
skill/agent prompt that was ambiguous, misleading, or under-constrained.
|
|
23
|
+
The agent followed the prompt correctly but the prompt itself led astray.
|
|
24
|
+
Fix: edit the offending file under `.claude/skills/<name>/SKILL.md` or
|
|
25
|
+
`.claude/agents/<name>.md`. Re-run `/eval-runner` afterward to confirm
|
|
26
|
+
the regression is closed.
|
|
21
27
|
3. **Append entry** to `docs/agent-failures.md` with: date, symptom, fix,
|
|
22
28
|
fix-type, file modified.
|
|
23
29
|
4. **Apply the fix in the right place.** NEVER paper over with a CLAUDE.md
|
|
package/src/templates/CLAUDE.md.hbs
CHANGED
|
@@ -68,3 +68,5 @@ Full list: `docs/golden-principles.md`.
|
|
|
68
68
|
imports across layers).
|
|
69
69
|
- Don't update CLAUDE.md without proposing a harness improvement
|
|
70
70
|
(`/propose-harness-improvement`).
|
|
71
|
+
- Don't grow CLAUDE.md past 200 instructions — Stop hook blocks the stop on
|
|
72
|
+
overflow (HumanLayer measurement). Excess belongs in `docs/` or @-imports.
|
|
package/src/templates/docs/agent-failures.md
CHANGED
|
@@ -15,7 +15,7 @@ The `/propose-harness-improvement` skill appends entries here automatically.
|
|
|
15
15
|
```
|
|
16
16
|
### YYYY-MM-DD <slug>
|
|
17
17
|
- **Symptom:** <what went wrong>
|
|
18
|
-
- **Classification:** (a) missing context | (b) missing rule | (c) missing tool/skill | (d) wrong layer
|
|
18
|
+
- **Classification:** (a) missing context | (b) missing rule | (c) missing tool/skill | (d) wrong layer | (e) wrong instruction in prompt
|
|
19
19
|
- **Fix applied:** <what we did>
|
|
20
20
|
- **Fix lives in:** path/or/file
|
|
21
21
|
```
|
|
package/src/templates/docs/golden-principles.md.hbs
CHANGED
|
@@ -74,6 +74,51 @@ Why: Mitchell Hashimoto's discipline. CLAUDE.md is a table of contents — it
|
|
|
74
74
|
won't be re-read on every action.
|
|
75
75
|
Enforced by: `/propose-harness-improvement` skill.
|
|
76
76
|
|
|
77
|
+
## 8. CLAUDE.md is bounded — at most 200 instructions
|
|
78
|
+
|
|
79
|
+
CLAUDE.md is loaded into context every session. Beyond ~150-200 instructions
|
|
80
|
+
(HumanLayer measurement) agents stop following it reliably; verbose
|
|
81
|
+
CLAUDE.md silently degrades behavior. Promote details to `docs/` or use
|
|
82
|
+
`@-imports` to load context on demand.
|
|
83
|
+
|
|
84
|
+
Why: closes principle 7's loop — without a hard cap, "every failure becomes
|
|
85
|
+
a CLAUDE.md line" is the path of least resistance and CLAUDE.md grows
|
|
86
|
+
unbounded until the agent ignores it.
|
|
87
|
+
Enforced by: Stop hook (`scripts/precompletion-checklist.sh`) counts
|
|
88
|
+
bullets and numbered items in `CLAUDE.md` against
|
|
89
|
+
`harness.config.json` `claudeMd.maxInstructions` (default 200) and blocks
|
|
90
|
+
the stop on overflow.
|
|
91
|
+
|
|
92
|
+
## 9. Baselines are decreasing-only
|
|
93
|
+
|
|
94
|
+
`.harness/structural-baseline.json` lists existing violations the codebase
|
|
95
|
+
inherits when a new structural rule is introduced. New code must not add to
|
|
96
|
+
this file — fixes only REMOVE entries.
|
|
97
|
+
|
|
98
|
+
Why: a growing baseline silently masks structural-test failures. Without
|
|
99
|
+
this guard, the path of least resistance for a violation is "append it to
|
|
100
|
+
the baseline," which defeats the rule. PMD's baseline pattern works only
|
|
101
|
+
because it's enforced as monotonic.
|
|
102
|
+
Enforced by: pre-push hook (`scripts/pre-push.sh`) compares
|
|
103
|
+
`.harness/structural-baseline.json` length to its HEAD version and blocks
|
|
104
|
+
the push when the count grew.
|
|
105
|
+
|
|
106
|
+
## 10. Reviewer subagent triggers are mechanical, not self-judged
|
|
107
|
+
|
|
108
|
+
`architecture-reviewer` runs when changes span ≥2 layers in a single
|
|
109
|
+
domain. The decision is made by counting layers off
|
|
110
|
+
`harness.config.json` `domains[].layers` against the changed-file set —
|
|
111
|
+
not by the agent guessing whether its diff "touches multiple layers".
|
|
112
|
+
|
|
113
|
+
Why: self-judged triggers fail open on borderline cases. The agent that
|
|
114
|
+
just shipped a layer-spanning change is the one least equipped to notice
|
|
115
|
+
it. Mechanical counting closes that gap.
|
|
116
|
+
Enforced by: Stop hook (`scripts/precompletion-checklist.sh`) emits a
|
|
117
|
+
`multi-layer-review` failure when `git` reveals ≥2 touched layers in any
|
|
118
|
+
domain. The agent reads the recommendation, invokes
|
|
119
|
+
`architecture-reviewer` (or documents why review is unnecessary), and the
|
|
120
|
+
loop guard (`stop_hook_active`) lets the next stop succeed.
|
|
121
|
+
|
|
77
122
|
---
|
|
78
123
|
|
|
79
124
|
_Add new principles via `/structural-test-author`, which forces you to
|
|
package/src/templates/harness.config.json.hbs
CHANGED
|
@@ -28,6 +28,10 @@
|
|
|
28
28
|
"maxFixesPerRun": 3,
|
|
29
29
|
"scope": ["dead-imports", "duplicate-utils", "layer-violations", "doc-drift"]
|
|
30
30
|
},
|
|
31
|
+
"claudeMd": {
|
|
32
|
+
"path": "CLAUDE.md",
|
|
33
|
+
"maxInstructions": 200
|
|
34
|
+
},
|
|
31
35
|
"models": {
|
|
32
36
|
"main": "claude-sonnet-4-6",
|
|
33
37
|
"reviewers": "claude-sonnet-4-6",
|
|
package/src/templates/scripts/pre-push.sh
CHANGED
|
@@ -4,6 +4,35 @@
|
|
|
4
4
|
# Lives in scripts/ so it ships with the repo; install via install-git-hooks.sh.
|
|
5
5
|
set -e
|
|
6
6
|
|
|
7
|
+
# Baseline monotonic guard. .harness/structural-baseline.json is decreasing-
|
|
8
|
+
# only — fixes REMOVE entries; no path should ADD them. Catches the "mask
|
|
9
|
+
# violations by baselining them" anti-pattern before code leaves the machine.
|
|
10
|
+
# Runs first because a grown baseline silently masks structural-test failures.
|
|
11
|
+
BASELINE_FILE=".harness/structural-baseline.json"
|
|
12
|
+
if [ -f "$BASELINE_FILE" ] \
|
|
13
|
+
&& command -v jq >/dev/null 2>&1 \
|
|
14
|
+
&& git rev-parse --verify HEAD >/dev/null 2>&1 \
|
|
15
|
+
&& git cat-file -e "HEAD:$BASELINE_FILE" 2>/dev/null; then
|
|
16
|
+
CURRENT_COUNT=$(jq 'length' "$BASELINE_FILE" 2>/dev/null || echo 0)
|
|
17
|
+
HEAD_COUNT=$(git show "HEAD:$BASELINE_FILE" 2>/dev/null | jq 'length' 2>/dev/null || echo 0)
|
|
18
|
+
if [ "$CURRENT_COUNT" -gt "$HEAD_COUNT" ]; then
|
|
19
|
+
{
|
|
20
|
+
echo
|
|
21
|
+
echo "[pre-push] BLOCKED: structural-baseline.json grew vs HEAD"
|
|
22
|
+
echo " Previous: $HEAD_COUNT entries"
|
|
23
|
+
echo " Current: $CURRENT_COUNT entries (+$((CURRENT_COUNT - HEAD_COUNT)))"
|
|
24
|
+
echo
|
|
25
|
+
echo "Baseline is decreasing-only. New violations should be FIXED,"
|
|
26
|
+
echo "not appended to the baseline."
|
|
27
|
+
echo
|
|
28
|
+
echo "To bypass intentionally (e.g. legitimate refactor that re-baselines"
|
|
29
|
+
echo "across a domain boundary):"
|
|
30
|
+
echo " git push --no-verify # then document the reason in the commit"
|
|
31
|
+
} >&2
|
|
32
|
+
exit 2
|
|
33
|
+
fi
|
|
34
|
+
fi
|
|
35
|
+
|
|
7
36
|
echo "[pre-push] running structural test…"
|
|
8
37
|
if [ -f harness.config.json ] && grep -q '"language": "python"' harness.config.json; then
|
|
9
38
|
python -m harness.structural_test
|
|
package/src/templates/scripts/precompletion-checklist.sh.hbs
CHANGED
|
@@ -53,6 +53,86 @@ elif [ -f pyproject.toml ] && command -v ruff >/dev/null 2>&1; then
|
|
|
53
53
|
run_check ruff ruff check . || true
|
|
54
54
|
fi
|
|
55
55
|
|
|
56
|
+
# CLAUDE.md instruction cap. HumanLayer measurement: agents stop following
|
|
57
|
+
# CLAUDE.md reliably beyond ~150-200 bullets/numbered items. Treat the file
|
|
58
|
+
# as a table of contents; promote details to docs/ or @-imports.
|
|
59
|
+
if [ -f harness.config.json ] && command -v jq >/dev/null 2>&1; then
|
|
60
|
+
CMD_PATH=$(jq -r '.claudeMd.path // "CLAUDE.md"' harness.config.json)
|
|
61
|
+
CMD_CAP=$(jq -r '.claudeMd.maxInstructions // 200' harness.config.json)
|
|
62
|
+
if [ -f "$CMD_PATH" ] && [ "$CMD_CAP" -gt 0 ] 2>/dev/null; then
|
|
63
|
+
CMD_COUNT=$(grep -cE '^[[:space:]]*([-*]|[0-9]+\.)[[:space:]]' "$CMD_PATH" 2>/dev/null || echo 0)
|
|
64
|
+
if [ "$CMD_COUNT" -gt "$CMD_CAP" ]; then
|
|
65
|
+
{
|
|
66
|
+
echo "$CMD_PATH instruction count: $CMD_COUNT (cap: $CMD_CAP)"
|
|
67
|
+
echo
|
|
68
|
+
echo "HumanLayer measurement: agents stop following CLAUDE.md reliably"
|
|
69
|
+
echo "beyond ~150-200 instructions. Your file exceeds the cap."
|
|
70
|
+
echo
|
|
71
|
+
echo "Fix options:"
|
|
72
|
+
echo " - extract sections to docs/ and link from CLAUDE.md"
|
|
73
|
+
echo " - use @-imports to load detailed context on demand"
|
|
74
|
+
echo " - delete obsolete rules (run /garbage-collection)"
|
|
75
|
+
echo
|
|
76
|
+
echo "Adjust the cap (with justification) in harness.config.json:"
|
|
77
|
+
echo " .claudeMd.maxInstructions"
|
|
78
|
+
} > "$TMPDIR_HOOK/claude-md-cap.out"
|
|
79
|
+
echo "claude-md-cap" >> "$TMPDIR_HOOK/failed.list"
|
|
80
|
+
fi
|
|
81
|
+
fi
|
|
82
|
+
fi
|
|
83
|
+
|
|
84
|
+
# Multi-layer review trigger. When uncommitted/staged/untracked changes touch
|
|
85
|
+
# ≥2 layers within a single domain, the `architecture-reviewer` subagent
|
|
86
|
+
# should run before commit. Replaces the agent's self-judgment about "touches
|
|
87
|
+
# multiple layers" (the §4.3 #3 ambiguity in the harness-techniques research)
|
|
88
|
+
# with a mechanical count off `harness.config.json` `domains[].layers` /
|
|
89
|
+
# `.root`. Fires once per stop; the loop guard (`stop_hook_active`) lets the
|
|
90
|
+
# next stop succeed after the agent has read the recommendation.
|
|
91
|
+
if [ -f harness.config.json ] && command -v jq >/dev/null 2>&1 && command -v git >/dev/null 2>&1; then
|
|
92
|
+
CHANGED=$(
|
|
93
|
+
{
|
|
94
|
+
git diff --name-only 2>/dev/null || true
|
|
95
|
+
git diff --name-only --cached 2>/dev/null || true
|
|
96
|
+
git ls-files --others --exclude-standard 2>/dev/null || true
|
|
97
|
+
} | sort -u
|
|
98
|
+
)
|
|
99
|
+
if [ -n "$CHANGED" ]; then
|
|
100
|
+
NUM_DOMAINS=$(jq '.domains | length' harness.config.json 2>/dev/null || echo 0)
|
|
101
|
+
MULTI_OUT="$TMPDIR_HOOK/multi-layer-review.out"
|
|
102
|
+
: > "$MULTI_OUT"
|
|
103
|
+
MULTI_HIT=0
|
|
104
|
+
i=0
|
|
105
|
+
while [ "$i" -lt "$NUM_DOMAINS" ]; do
|
|
106
|
+
ROOT=$(jq -r ".domains[$i].root" harness.config.json)
|
|
107
|
+
DOMAIN=$(jq -r ".domains[$i].name" harness.config.json)
|
|
108
|
+
TOUCHED_COUNT=0
|
|
109
|
+
TOUCHED_NAMES=""
|
|
110
|
+
while IFS= read -r layer; do
|
|
111
|
+
[ -z "$layer" ] && continue
|
|
112
|
+
if echo "$CHANGED" | grep -qE "^${ROOT}/${layer}(/|$)"; then
|
|
113
|
+
TOUCHED_COUNT=$((TOUCHED_COUNT + 1))
|
|
114
|
+
TOUCHED_NAMES="$TOUCHED_NAMES $layer"
|
|
115
|
+
fi
|
|
116
|
+
done < <(jq -r ".domains[$i].layers[]" harness.config.json)
|
|
117
|
+
if [ "$TOUCHED_COUNT" -ge 2 ]; then
|
|
118
|
+
echo "Domain '$DOMAIN' has changes spanning $TOUCHED_COUNT layers:$TOUCHED_NAMES" >> "$MULTI_OUT"
|
|
119
|
+
MULTI_HIT=1
|
|
120
|
+
fi
|
|
121
|
+
i=$((i + 1))
|
|
122
|
+
done
|
|
123
|
+
if [ "$MULTI_HIT" = "1" ]; then
|
|
124
|
+
{
|
|
125
|
+
echo
|
|
126
|
+
echo "Recommend invoking the 'architecture-reviewer' subagent before commit."
|
|
127
|
+
echo "Mechanical detection — replaces self-judgment about 'touches multiple layers'."
|
|
128
|
+
echo "If review is genuinely not needed (e.g. mass-rename across layers), state"
|
|
129
|
+
echo "the reason in the commit message and re-run; the loop guard will release."
|
|
130
|
+
} >> "$MULTI_OUT"
|
|
131
|
+
echo "multi-layer-review" >> "$TMPDIR_HOOK/failed.list"
|
|
132
|
+
fi
|
|
133
|
+
fi
|
|
134
|
+
fi
|
|
135
|
+
|
|
56
136
|
if [ ! -s "$TMPDIR_HOOK/failed.list" ]; then
|
|
57
137
|
exit 0
|
|
58
138
|
fi
|