agent-harness-kit 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/bin/cli.mjs +21 -0
- package/package.json +1 -1
- package/src/core/doctor.mjs +24 -0
- package/src/core/render-templates.mjs +29 -0
- package/src/core/upgrade.mjs +81 -60
- package/src/templates/.claude/agents/api-consistency-reviewer.md.vi +37 -0
- package/src/templates/.claude/agents/architecture-reviewer.md.vi.hbs +45 -0
- package/src/templates/.claude/agents/performance-reviewer.md.vi +39 -0
- package/src/templates/.claude/agents/reliability-reviewer.md.vi +42 -0
- package/src/templates/.claude/agents/security-reviewer.md.vi +43 -0
- package/src/templates/.claude/hooks/hooks.json +22 -0
- package/src/templates/.claude/output-styles/harness-terse.md +42 -0
- package/src/templates/.claude/settings.json.hbs +1 -0
- package/src/templates/.claude/skills/add-adr/SKILL.md.vi +64 -0
- package/src/templates/.claude/skills/add-feature/SKILL.md.vi.hbs +50 -0
- package/src/templates/.claude/skills/debug-flow/SKILL.md.vi.hbs +42 -0
- package/src/templates/.claude/skills/doc-drift-scan/SKILL.md.vi +52 -0
- package/src/templates/.claude/skills/eval-runner/SKILL.md.vi +59 -0
- package/src/templates/.claude/skills/garbage-collection/SKILL.md.vi.hbs +58 -0
- package/src/templates/.claude/skills/i18n-add-locale/SKILL.md +52 -0
- package/src/templates/.claude/skills/i18n-add-locale/SKILL.md.vi +56 -0
- package/src/templates/.claude/skills/i18n-add-locale/scripts/locale-scaffold.mjs +120 -0
- package/src/templates/.claude/skills/inspect-app/SKILL.md.vi +61 -0
- package/src/templates/.claude/skills/inspect-module/SKILL.md.vi.hbs +57 -0
- package/src/templates/.claude/skills/map-domain/SKILL.md +42 -0
- package/src/templates/.claude/skills/map-domain/SKILL.md.vi +42 -0
- package/src/templates/.claude/skills/map-domain/scripts/domain-map.mjs +145 -0
- package/src/templates/.claude/skills/propose-harness-improvement/SKILL.md.vi +49 -0
- package/src/templates/.claude/skills/propose-harness-improvement/scripts/improvement-bundle.mjs +172 -0
- package/src/templates/.claude/skills/refactor-feature/SKILL.md +60 -0
- package/src/templates/.claude/skills/refactor-feature/SKILL.md.vi +64 -0
- package/src/templates/.claude/skills/refactor-feature/scripts/feature-diff.mjs +146 -0
- package/src/templates/.claude/skills/review-this-pr/SKILL.md +59 -0
- package/src/templates/.claude/skills/review-this-pr/SKILL.md.vi +63 -0
- package/src/templates/.claude/skills/review-this-pr/scripts/pr-review-driver.mjs +152 -0
- package/src/templates/.claude/skills/structural-test-author/SKILL.md.vi.hbs +50 -0
- package/src/templates/.claude/skills/write-skill/SKILL.md.vi +43 -0
- package/src/templates/.harness/eval/rubrics/feature-step-done.mjs +148 -0
- package/src/templates/.harness/eval/tasks/feature-step-done.answer.md +53 -0
- package/src/templates/.harness/eval/tasks/feature-step-done.json +10 -0
- package/src/templates/.harness/eval/tasks/feature-step-done.prompt.md +43 -0
- package/src/templates/.mcp.json.example +35 -0
- package/src/templates/scripts/pretooluse-edit-guard.sh.hbs +115 -0
- package/src/templates/scripts/session-end.sh.hbs +6 -0
- package/src/templates/scripts/session-rollup.mjs +96 -0
- package/src/templates/scripts/session-start.sh.hbs +25 -0
- package/src/templates/scripts/subagent-stop.sh.hbs +76 -0
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// feature-diff.mjs — deterministic gate for /refactor-feature.
|
|
3
|
+
// Diffs feature_list.json#features[*].steps[*] between a base ref and the
|
|
4
|
+
// current working copy. Returns violations when:
|
|
5
|
+
// - step.passes flipped false → true without step.tests[] or step.testCommit
|
|
6
|
+
// - step.id silently renamed (no renamed_from)
|
|
7
|
+
// - step disappeared without replaced_by
|
|
8
|
+
//
|
|
9
|
+
// Exit codes:
|
|
10
|
+
// 0 → no violations
|
|
11
|
+
// 2 → violations present (printed as JSON to stdout)
|
|
12
|
+
// 3 → input error (missing ref / unreadable file)
|
|
13
|
+
|
|
14
|
+
import { readFileSync, existsSync } from "node:fs";
|
|
15
|
+
import { resolve } from "node:path";
|
|
16
|
+
import { spawnSync } from "node:child_process";
|
|
17
|
+
|
|
18
|
+
const ROOT = process.env.CLAUDE_PROJECT_DIR || process.cwd();
|
|
19
|
+
|
|
20
|
+
// Parse CLI flags; unrecognized tokens are ignored.
//   --before-ref <ref>   git ref to diff against (default "HEAD")
//   --after-file <path>  feature-list path in the working copy
//                        (default "feature_list.json")
function parseArgs(argv) {
  const parsed = { beforeRef: "HEAD", afterFile: "feature_list.json" };
  let i = 0;
  while (i < argv.length) {
    const flag = argv[i];
    i += 1;
    if (flag === "--before-ref") {
      parsed.beforeRef = argv[i];
      i += 1;
    } else if (flag === "--after-file") {
      parsed.afterFile = argv[i];
      i += 1;
    }
  }
  return parsed;
}
|
|
28
|
+
|
|
29
|
+
// Read `path` as it exists at git ref `ref` (via `git show ref:path`).
// Returns the blob text, or null when git fails — e.g. the file was not
// yet committed at that ref.
function gitShow(ref, path) {
  const result = spawnSync("git", ["show", `${ref}:${path}`], {
    cwd: ROOT,
    encoding: "utf8",
  });
  return result.status === 0 ? result.stdout : null;
}
|
|
34
|
+
|
|
35
|
+
// Parse JSON or terminate with exit code 3 (input error), naming the
// offending source in the message. Falsy input yields null, which
// callers treat as "file absent".
function safeJSON(s, label) {
  if (!s) {
    return null;
  }
  let parsed;
  try {
    parsed = JSON.parse(s);
  } catch (e) {
    console.error(`feature-diff: invalid JSON in ${label}: ${e.message}`);
    process.exit(3);
  }
  return parsed;
}
|
|
43
|
+
|
|
44
|
+
// Flatten a feature list into Map<stepId, { featureId, step }>.
// Steps without an id are skipped.
function indexSteps(featureList) {
  const byId = new Map();
  for (const feature of (featureList?.features || [])) {
    for (const step of (feature.steps || [])) {
      if (step && step.id) {
        byId.set(step.id, { featureId: feature.id, step });
      }
    }
  }
  return byId;
}

// Compare two feature lists. Returns:
//   violations      — disappeared steps lacking replaced_by/renamed_from,
//                     passes false→true flips without tests/testCommit,
//                     and renamed_from entries naming ids that never
//                     existed at the base ref.
//   renames         — resolved renamed_from links (from → to).
//   doneTransitions — every step whose `passes` flipped false → true.
function diff(before, after) {
  const beforeIdx = indexSteps(before);
  const afterIdx = indexSteps(after);
  const violations = [];
  const renames = [];
  const doneTransitions = [];

  // Does any AFTER step claim `id` via renamed_from (string or array
  // form)? Records the link on first match.
  const renamedInAfter = (id) => {
    for (const [newId, { step: newStep }] of afterIdx) {
      const declared = newStep.renamed_from;
      const matches = Array.isArray(declared)
        ? declared.includes(id)
        : declared === id;
      if (matches) {
        renames.push({ from: id, to: newId, kind: "renamed_from" });
        return true;
      }
    }
    return false;
  };

  // Pass 1 — walk the BEFORE steps: disappearances and done-transitions.
  for (const [id, { featureId, step }] of beforeIdx) {
    const post = afterIdx.get(id);
    if (!post) {
      // Disappeared. Allowed only when the BEFORE step carried
      // replaced_by, or an AFTER step claims it via renamed_from.
      const renamedAway = renamedInAfter(id);
      if (!renamedAway && !step.replaced_by) {
        violations.push({
          kind: "step_disappeared",
          step_id: id,
          feature_id: featureId,
          fix: `Add 'replaced_by: <new_step_id>' to the step before deleting, OR mark the new step's 'renamed_from'.`,
        });
      }
      continue;
    }
    // passes flipped false → true: requires proof (tests or testCommit).
    if (step.passes === false && post.step.passes === true) {
      doneTransitions.push({ step_id: id, feature_id: featureId });
      const hasTests = Array.isArray(post.step.tests) && post.step.tests.length > 0;
      const hasCommit = typeof post.step.testCommit === "string" && post.step.testCommit.length > 0;
      if (!hasTests && !hasCommit) {
        violations.push({
          kind: "done_without_proof",
          step_id: id,
          feature_id: featureId,
          fix: `Add 'tests: [...]' (test file paths) or 'testCommit: <sha>' before flipping passes:true.`,
        });
      }
    }
  }

  // Pass 2 — new AFTER steps whose renamed_from names an id that never
  // existed at the base ref (catches typos in the renamed_from value).
  for (const [id, { step }] of afterIdx) {
    if (beforeIdx.has(id)) continue;
    const declared = step.renamed_from;
    const refs = Array.isArray(declared)
      ? declared
      : (typeof declared === "string" ? [declared] : []);
    for (const ref of refs) {
      if (beforeIdx.has(ref)) continue;
      violations.push({
        kind: "renamed_from_typo",
        step_id: id,
        missing_ref: ref,
        fix: `'renamed_from' must reference a step that existed at HEAD. Check the spelling.`,
      });
    }
  }

  return { violations, renames, doneTransitions };
}
|
|
125
|
+
|
|
126
|
+
// Entry point. Diffs feature_list.json between --before-ref and the
// working copy, prints the result as JSON, and exits:
//   0 → clean (or the file did not exist at the base ref)
//   2 → violations present
//   3 → input error (missing working-copy file / invalid JSON)
function main() {
  const { beforeRef, afterFile } = parseArgs(process.argv.slice(2));

  const beforeRaw = gitShow(beforeRef, afterFile);
  if (beforeRaw === null) {
    // First-time addition — nothing to diff against.
    process.stdout.write(JSON.stringify({ violations: [], note: `no prior ${afterFile} at ${beforeRef}` }) + "\n");
    process.exit(0);
  }

  const afterPath = resolve(ROOT, afterFile);
  if (!existsSync(afterPath)) {
    console.error(`feature-diff: missing ${afterFile} in working copy`);
    process.exit(3);
  }

  const before = safeJSON(beforeRaw, `${beforeRef}:${afterFile}`);
  const after = safeJSON(readFileSync(afterPath, "utf8"), afterFile);
  const result = diff(before, after);
  process.stdout.write(JSON.stringify(result, null, 2) + "\n");
  if (result.violations.length > 0) process.exit(2);
}
|
|
145
|
+
|
|
146
|
+
main();
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: review-this-pr
|
|
3
|
+
description: Use this skill to run a deterministic review of the current branch against its base — git diff base...HEAD, structural-test, baseline-monotonic check, and a markdown summary that lists each violating file with its layer rule. Replaces the "ask the agent to review the diff" pattern, which routinely misses cross-file drift.
|
|
4
|
+
allowed-tools: Read, Bash(git diff:*, git log:*, git merge-base:*, node .claude/skills/review-this-pr/scripts/pr-review-driver.mjs:*)
|
|
5
|
+
suggested-turns: 6
|
|
6
|
+
isolation: worktree
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## When to invoke
|
|
10
|
+
|
|
11
|
+
- Before opening a PR (or before `gh pr create`).
|
|
12
|
+
- After a refactor where multiple files moved between layers.
|
|
13
|
+
- When CI lights up red and you want a fast local repro.
|
|
14
|
+
|
|
15
|
+
## Steps
|
|
16
|
+
|
|
17
|
+
1. **Identify base.**
|
|
18
|
+
```
|
|
19
|
+
BASE=$(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main)
|
|
20
|
+
```
|
|
21
|
+
2. **Run driver.**
|
|
22
|
+
```
|
|
23
|
+
node .claude/skills/review-this-pr/scripts/pr-review-driver.mjs --base "$BASE"
|
|
24
|
+
```
|
|
25
|
+
Driver:
|
|
26
|
+
- Collects `git diff --name-only $BASE..HEAD`.
|
|
27
|
+
- Runs structural-test (workspace-wide for ts/py, file-scoped fallback).
|
|
28
|
+
- Diffs `.harness/structural-baseline.json` between $BASE and HEAD —
|
|
29
|
+
monotonic violation (baseline grew) is a hard fail.
|
|
30
|
+
- Reads each changed file's layer mapping via harness.config.json.
|
|
31
|
+
3. **Read the report.** Output is markdown to stdout (or `--out report.md`).
|
|
32
|
+
Sections: Summary, Layer-map of changed files, Structural-test results,
|
|
33
|
+
Baseline delta, Per-reviewer hand-off (architecture, security, performance,
|
|
34
|
+
reliability).
|
|
35
|
+
4. **Address each FAIL.** Re-run the driver until all sections are PASS.
|
|
36
|
+
5. **Hand-off to reviewers.** If isolated review is needed, invoke
|
|
37
|
+
`/architecture-reviewer` / `/security-reviewer` etc. with the report as
|
|
38
|
+
context.
|
|
39
|
+
|
|
40
|
+
## Output contract (driver JSON tail)
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
{
|
|
44
|
+
"base": "<sha>",
|
|
45
|
+
"changed_files": <N>,
|
|
46
|
+
"violations": <M>,
|
|
47
|
+
"baseline_delta": <K>,
|
|
48
|
+
"passed": <bool>
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Anti-patterns
|
|
53
|
+
|
|
54
|
+
- Don't skip the structural-test "because the build passes" — the build
|
|
55
|
+
catches type errors; structural-test catches layer-rule violations that
|
|
56
|
+
TypeScript happily accepts.
|
|
57
|
+
- Don't paper over a baseline-delta with `git checkout HEAD~1 -- .harness/`
|
|
58
|
+
— the monotonic guard exists *because* that paper-over is the agent's
|
|
59
|
+
first instinct.
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
<!-- LOCALE_TODO: translate body to vi -->
|
|
2
|
+
<!-- Source: .claude/skills/review-this-pr/SKILL.md -->
|
|
3
|
+
<!-- Edit only the markdown body — keep frontmatter verbatim so the kit's renderer + Claude Code parse it identically across locales. -->
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
name: review-this-pr
|
|
7
|
+
description: Use this skill to run a deterministic review of the current branch against its base — git diff base...HEAD, structural-test, baseline-monotonic check, and a markdown summary that lists each violating file with its layer rule. Replaces the "ask the agent to review the diff" pattern, which routinely misses cross-file drift.
|
|
8
|
+
allowed-tools: Read, Bash(git diff:*, git log:*, git merge-base:*, node .claude/skills/review-this-pr/scripts/pr-review-driver.mjs:*)
|
|
9
|
+
suggested-turns: 6
|
|
10
|
+
isolation: worktree
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## When to invoke
|
|
14
|
+
|
|
15
|
+
- Before opening a PR (or before `gh pr create`).
|
|
16
|
+
- After a refactor where multiple files moved between layers.
|
|
17
|
+
- When CI lights up red and you want a fast local repro.
|
|
18
|
+
|
|
19
|
+
## Steps
|
|
20
|
+
|
|
21
|
+
1. **Identify base.**
|
|
22
|
+
```
|
|
23
|
+
BASE=$(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main)
|
|
24
|
+
```
|
|
25
|
+
2. **Run driver.**
|
|
26
|
+
```
|
|
27
|
+
node .claude/skills/review-this-pr/scripts/pr-review-driver.mjs --base "$BASE"
|
|
28
|
+
```
|
|
29
|
+
Driver:
|
|
30
|
+
- Collects `git diff --name-only $BASE..HEAD`.
|
|
31
|
+
- Runs structural-test (workspace-wide for ts/py, file-scoped fallback).
|
|
32
|
+
- Diffs `.harness/structural-baseline.json` between $BASE and HEAD —
|
|
33
|
+
monotonic violation (baseline grew) is a hard fail.
|
|
34
|
+
- Reads each changed file's layer mapping via harness.config.json.
|
|
35
|
+
3. **Read the report.** Output is markdown to stdout (or `--out report.md`).
|
|
36
|
+
Sections: Summary, Layer-map of changed files, Structural-test results,
|
|
37
|
+
Baseline delta, Per-reviewer hand-off (architecture, security, performance,
|
|
38
|
+
reliability).
|
|
39
|
+
4. **Address each FAIL.** Re-run the driver until all sections are PASS.
|
|
40
|
+
5. **Hand-off to reviewers.** If isolated review is needed, invoke
|
|
41
|
+
`/architecture-reviewer` / `/security-reviewer` etc. with the report as
|
|
42
|
+
context.
|
|
43
|
+
|
|
44
|
+
## Output contract (driver JSON tail)
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
{
|
|
48
|
+
"base": "<sha>",
|
|
49
|
+
"changed_files": <N>,
|
|
50
|
+
"violations": <M>,
|
|
51
|
+
"baseline_delta": <K>,
|
|
52
|
+
"passed": <bool>
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Anti-patterns
|
|
57
|
+
|
|
58
|
+
- Don't skip the structural-test "because the build passes" — the build
|
|
59
|
+
catches type errors; structural-test catches layer-rule violations that
|
|
60
|
+
TypeScript happily accepts.
|
|
61
|
+
- Don't paper over a baseline-delta with `git checkout HEAD~1 -- .harness/`
|
|
62
|
+
— the monotonic guard exists *because* that paper-over is the agent's
|
|
63
|
+
first instinct.
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// pr-review-driver.mjs — deterministic driver for /review-this-pr.
|
|
3
|
+
// Gathers diff, runs structural-test, diffs baseline, emits markdown report.
|
|
4
|
+
//
|
|
5
|
+
// Usage:
|
|
6
|
+
// pr-review-driver.mjs --base <sha> [--out report.md]
|
|
7
|
+
//
|
|
8
|
+
// Output:
|
|
9
|
+
// stdout markdown + trailing JSON tail line (machine-readable).
|
|
10
|
+
|
|
11
|
+
import { readFileSync, existsSync, writeFileSync } from "node:fs";
|
|
12
|
+
import { resolve } from "node:path";
|
|
13
|
+
import { spawnSync } from "node:child_process";
|
|
14
|
+
|
|
15
|
+
const ROOT = process.env.CLAUDE_PROJECT_DIR || process.cwd();
|
|
16
|
+
|
|
17
|
+
// Parse CLI flags. --base is mandatory; prints usage and exits 2
// when it is absent.
function parseArgs(argv) {
  const parsed = { base: null, out: null };
  let i = 0;
  while (i < argv.length) {
    const flag = argv[i];
    i += 1;
    if (flag === "--base") {
      parsed.base = argv[i];
      i += 1;
    } else if (flag === "--out") {
      parsed.out = argv[i];
      i += 1;
    }
  }
  if (!parsed.base) {
    console.error("usage: pr-review-driver.mjs --base <sha> [--out report.md]");
    process.exit(2);
  }
  return parsed;
}
|
|
29
|
+
|
|
30
|
+
// Run git with `args` at the project root, capturing text output.
// Returns the raw spawnSync result (status / stdout / stderr).
function git(args) {
  const options = { cwd: ROOT, encoding: "utf8" };
  return spawnSync("git", args, options);
}
|
|
33
|
+
|
|
34
|
+
// Names of files changed since merge-base `base` (three-dot diff
// against HEAD). Empty array when git fails, e.g. an unknown ref.
function changedFiles(base) {
  const result = git(["diff", "--name-only", `${base}...HEAD`]);
  if (result.status !== 0) {
    return [];
  }
  const stdout = result.stdout || "";
  return stdout.split("\n").filter((line) => line.length > 0);
}
|
|
39
|
+
|
|
40
|
+
// Map a repo-relative file path onto { domain, layer } by matching the
// `<root>/<layer>/` prefixes declared in harness.config.json#domains.
// Returns null when the file falls under no configured layer, or when
// the config itself is absent/malformed.
function whichLayer(file, cfg) {
  const domains = cfg?.domains;
  if (!domains) return null;
  for (const domain of domains) {
    if (!domain.layers || !domain.root) continue;
    const hit = domain.layers.find((layer) => file.startsWith(`${domain.root}/${layer}/`));
    if (hit !== undefined) {
      return { domain: domain.name || "default", layer: hit };
    }
  }
  return null;
}
|
|
51
|
+
|
|
52
|
+
// Read and parse a JSON file; `fallback` on any read or parse failure.
function loadJSON(path, fallback = null) {
  try {
    const raw = readFileSync(path, "utf8");
    return JSON.parse(raw);
  } catch {
    return fallback;
  }
}
|
|
55
|
+
|
|
56
|
+
// Run the structural test suite and capture its verdict.
// Preference order:
//   1. node harness/structural-check.mjs (polyglot adapters)
//   2. npm run harness:check (when package.json declares the script)
//   3. neither present → skipped, reported as ok.
// Never throws. Output is capped at the first 80 lines of
// stdout+stderr combined.
function runStructuralTest() {
  // Spawn `cmd args` at ROOT and shape the result uniformly —
  // extracted to remove the previously duplicated result-building code.
  const capture = (cmd, args) => {
    const r = spawnSync(cmd, args, { cwd: ROOT, encoding: "utf8" });
    return {
      ok: r.status === 0,
      output: ((r.stdout || "") + (r.stderr || "")).split("\n").slice(0, 80).join("\n"),
    };
  };
  if (existsSync(resolve(ROOT, "harness/structural-check.mjs"))) {
    return capture("node", ["harness/structural-check.mjs"]);
  }
  if (existsSync(resolve(ROOT, "package.json"))) {
    const pj = loadJSON(resolve(ROOT, "package.json"));
    if (pj?.scripts?.["harness:check"]) {
      return capture("npm", ["run", "--silent", "harness:check"]);
    }
  }
  return { ok: true, output: "(no structural-test entry point — skipped)" };
}
|
|
82
|
+
|
|
83
|
+
// Count baseline entries added between git ref `base` and the working
// copy. The baseline is monotonic: any added entry is a hard review
// failure. Entries may be strings or objects; objects are compared by
// their JSON serialization.
function baselineDelta(base) {
  const baselinePath = ".harness/structural-baseline.json";
  const headRaw = existsSync(resolve(ROOT, baselinePath))
    ? readFileSync(resolve(ROOT, baselinePath), "utf8") : "[]";
  const baseR = git(["show", `${base}:${baselinePath}`]);
  const baseRaw = baseR.status === 0 ? baseR.stdout : "[]";
  // Treat unparsable OR non-array baselines as empty — previously a
  // baseline that parsed to an object/number crashed on `.map`.
  const toArray = (raw) => {
    try {
      const parsed = JSON.parse(raw);
      return Array.isArray(parsed) ? parsed : [];
    } catch {
      return [];
    }
  };
  const headArr = toArray(headRaw);
  const baseArr = toArray(baseRaw);
  const key = (x) => (typeof x === "string" ? x : JSON.stringify(x));
  const headSet = new Set(headArr.map(key));
  const baseSet = new Set(baseArr.map(key));
  const added = [...headSet].filter((e) => !baseSet.has(e));
  return { added_count: added.length, head_count: headArr.length, base_count: baseArr.length };
}
|
|
98
|
+
|
|
99
|
+
// Assemble the markdown report plus the machine-readable JSON tail.
// `passed` requires BOTH a clean structural run and a non-growing
// baseline; `violations` is a rough line count of the structural
// output's error/FAIL lines (0 when the run was clean).
function buildReport({ base, changed, perFile, structural, baseline }) {
  const violations = structural.ok
    ? 0
    : structural.output.split("\n").filter((l) => /violat|error|FAIL/i.test(l)).length;
  const passed = structural.ok && baseline.added_count === 0;

  const lines = [];
  const emit = (...ls) => lines.push(...ls);

  emit(`# /review-this-pr report`, ``);
  emit(`- base: \`${base}\``);
  emit(`- changed files: ${changed.length}`);
  emit(`- structural-test: ${structural.ok ? "PASS" : "FAIL"}`);
  emit(`- baseline delta: ${baseline.added_count} new entries (head=${baseline.head_count}, base=${baseline.base_count})`);
  emit(`- overall: ${passed ? "PASS" : "FAIL"}`, ``);

  emit(`## Changed files (by layer)`, ``);
  for (const row of perFile) {
    const tag = row.layer ? `${row.layer.domain}/${row.layer.layer}` : "(unlayered)";
    emit(`- \`${row.file}\` → ${tag}`);
  }
  emit(``);

  emit(`## Structural-test output (head 80 lines)`);
  emit("```", structural.output, "```", ``);

  emit(`## Hand-off`, ``);
  emit(`Recommended reviewer subagents based on touched layers:`);
  const layers = new Set(perFile.map((r) => r.layer?.layer).filter(Boolean));
  if (layers.has("service") || layers.has("repository")) emit(`- /api-consistency-reviewer (service/repo touched)`);
  if (changed.some((f) => /auth|secret|crypto|cookie/i.test(f))) emit(`- /security-reviewer (security-flavoured files touched)`);
  if (changed.some((f) => /\.sql$|migrations\//i.test(f))) emit(`- /reliability-reviewer (data-layer touched)`);
  if (changed.length >= 10) emit(`- /architecture-reviewer (>=10 files changed)`);
  emit(``);

  const tail = { base, changed_files: changed.length, violations, baseline_delta: baseline.added_count, passed };
  emit(`<!-- machine-tail: ${JSON.stringify(tail)} -->`);
  return { md: lines.join("\n") + "\n", tail };
}
|
|
138
|
+
|
|
139
|
+
// Entry point. Gathers the diff + structural results, renders the
// report to stdout (or to --out when given), and exits 2 when the
// review failed.
function main() {
  const { base, out } = parseArgs(process.argv.slice(2));
  const cfg = loadJSON(resolve(ROOT, "harness.config.json"));
  const changed = changedFiles(base);
  const perFile = changed.map((f) => ({ file: f, layer: whichLayer(f, cfg) }));
  const structural = runStructuralTest();
  const baseline = baselineDelta(base);
  const { md, tail } = buildReport({ base, changed, perFile, structural, baseline });
  if (out) {
    writeFileSync(resolve(ROOT, out), md);
  } else {
    process.stdout.write(md);
  }
  if (!tail.passed) process.exit(2);
}
|
|
151
|
+
|
|
152
|
+
main();
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
<!-- LOCALE_TODO: translate body to vi -->
|
|
2
|
+
<!-- Source: .claude/skills/structural-test-author/SKILL.md.hbs -->
|
|
3
|
+
<!-- Edit only the markdown body — keep frontmatter verbatim so the kit's renderer + Claude Code parse it identically across locales. -->
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
name: structural-test-author
|
|
7
|
+
description: Use this skill whenever the user wants to add a new architectural rule, prevent a recurring agent mistake, or codify a pattern from golden-principles.md. Generates a {{#if isPython}}libcst-based Python{{else}}ts-morph-based TypeScript{{/if}} structural test plus the matching {{#if isPython}}import-linter contract{{else}}eslint-plugin-boundaries rule{{/if}} entry. Always prefer this over leaving rules in prose.
|
|
8
|
+
allowed-tools: Read, Edit, Write, Bash(npm test:*), Bash(pytest:*)
|
|
9
|
+
suggested-turns: 15
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Steps
|
|
13
|
+
|
|
14
|
+
1. **Phrase the rule.** Ask the user: "What invariant do you want enforced?
|
|
15
|
+
Phrase it as: 'No code in layer X may import from layer Y' or 'Every
|
|
16
|
+
<thing> must <do>'."
|
|
17
|
+
2. **Layer rules first.** If the rule is layer-based, edit `harness.config.json`
|
|
18
|
+
`domains[].layers` and the {{#if isPython}}`.importlinter`{{else}}`eslint.config.js`{{/if}}
|
|
19
|
+
config — DO NOT write a custom test for layer rules; the existing test
|
|
20
|
+
already supports them via configuration.
|
|
21
|
+
3. **Structural rules.** If the rule is structural but not layer-based (e.g.
|
|
22
|
+
"every controller must call validateAt"), open
|
|
23
|
+
`{{#if isPython}}harness/structural_test.py{{else}}harness/structural-test.ts{{/if}}`:
|
|
24
|
+
- {{#if isPython}}Use `libcst.CSTVisitor` subclass — preserves whitespace and comments.{{else}}Use `Project` + `getSourceFiles()` + AST visitors — the canonical ts-morph pattern.{{/if}}
|
|
25
|
+
4. **Add a fixture test.** Create a file in `tests/structural/` that contains
|
|
26
|
+
a deliberately-violating snippet, and verify the rule fails on it.
|
|
27
|
+
5. **Run against the whole repo.** If the rule fails on existing code, choose:
|
|
28
|
+
- **(a)** fix the existing code, OR
|
|
29
|
+
- **(b)** add the existing violations to `.harness/structural-baseline.json`
|
|
30
|
+
so only **new** violations block. (PMD/baseline pattern.)
|
|
31
|
+
6. **Document.** Append the rule and its rationale (one paragraph traced to a
|
|
32
|
+
specific past failure) to `docs/golden-principles.md`.
|
|
33
|
+
7. **Log the harness change.** Run `/propose-harness-improvement` to record
|
|
34
|
+
this as a permanent harness improvement.
|
|
35
|
+
|
|
36
|
+
## Output contract
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
### Rule added: <one-line description>
|
|
40
|
+
### Files changed: <list>
|
|
41
|
+
### New violations on existing code: <count> — baselined: yes/no
|
|
42
|
+
### golden-principles.md entry: §<n>
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Anti-patterns
|
|
46
|
+
|
|
47
|
+
- Don't write a rule whose enforcement is also LLM-based — that just recurses.
|
|
48
|
+
- Don't write a rule that requires runtime information to evaluate (e.g.
|
|
49
|
+
"this function must not take more than 100ms"). Those go in evals or
|
|
50
|
+
observability, not structural tests.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
<!-- LOCALE_TODO: translate body to vi -->
|
|
2
|
+
<!-- Source: .claude/skills/write-skill/SKILL.md -->
|
|
3
|
+
<!-- Edit only the markdown body — keep frontmatter verbatim so the kit's renderer + Claude Code parse it identically across locales. -->
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
name: write-skill
|
|
7
|
+
description: Use this skill whenever the user asks to "create a skill", "add a slash command", "package a workflow", or "make X reusable across sessions". Generates a SKILL.md with valid YAML frontmatter (name regex, description ≤ 1024 chars, body ≤ 500 lines) and supporting scripts/references/assets. Tests the skill by simulating an auto-discovery prompt.
|
|
8
|
+
allowed-tools: Read, Edit, Write, Bash(ls:*)
|
|
9
|
+
suggested-turns: 8
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Steps
|
|
13
|
+
|
|
14
|
+
1. **Validate the name.** Must match `^[a-z0-9]+(-[a-z0-9]+)*$` and be ≤ 64
|
|
15
|
+
characters.
|
|
16
|
+
2. **Write a "pushy" description.** Third-person, ≤ 1024 chars. Explicitly
|
|
17
|
+
mention triggers ("Use this skill whenever the user mentions <X>, <Y>,
|
|
18
|
+
<Z>"). Models under-trigger skills with shy descriptions.
|
|
19
|
+
3. **Body sections,** in this order: `## When to use`, `## Steps`,
|
|
20
|
+
`## Output contract`, `## Anti-patterns`. Cap body at 500 lines.
|
|
21
|
+
4. **Externalize deterministic logic.** If the skill needs deterministic work
|
|
22
|
+
(parsing, formatting, computation), put it in `scripts/<name>.sh` (or `.py`
|
|
23
|
+
/ `.mjs`) under the skill directory and reference it via a `Bash(...)`
|
|
24
|
+
tool call. SKILL.md stays declarative.
|
|
25
|
+
5. **Test discovery.** Open a fresh Claude Code session and prompt with one
|
|
26
|
+
of the description triggers. Verify the skill auto-loads.
|
|
27
|
+
|
|
28
|
+
## Output contract
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
### Skill: /<name>
|
|
32
|
+
### Description bytes: <count>/1024
|
|
33
|
+
### Body lines: <count>/500
|
|
34
|
+
### Allowed tools: <list>
|
|
35
|
+
### Discovery trigger tested: yes/no
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Anti-patterns
|
|
39
|
+
|
|
40
|
+
- Don't write a description that starts with "This skill…" — start with "Use
|
|
41
|
+
this skill whenever the user…" so triggers are front-loaded.
|
|
42
|
+
- Don't pack two unrelated workflows into one skill. Split them.
|
|
43
|
+
- Don't grant `Bash(*:*)` in `allowed-tools`. Pin specific commands.
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// feature-step-done.mjs — eval rubric for the "feature step done" task.
|
|
3
|
+
// Reads the agent's transcript + the final feature_list.json + the diff;
|
|
4
|
+
// returns a JSON verdict on the outcome / process / style / efficiency
|
|
5
|
+
// dimensions.
|
|
6
|
+
//
|
|
7
|
+
// Invocation (from eval-runner.mjs):
|
|
8
|
+
// node .harness/eval/rubrics/feature-step-done.mjs --transcript <path> --task <task.json>
|
|
9
|
+
//
|
|
10
|
+
// Exit 0 = rubric ran. The JSON tail communicates pass/fail.
|
|
11
|
+
|
|
12
|
+
import { readFileSync, existsSync } from "node:fs";
|
|
13
|
+
import { resolve } from "node:path";
|
|
14
|
+
import { spawnSync } from "node:child_process";
|
|
15
|
+
|
|
16
|
+
const ROOT = process.env.CLAUDE_PROJECT_DIR || process.cwd();
|
|
17
|
+
|
|
18
|
+
// Parse --transcript / --task CLI flags; both default to null.
function parseArgs(argv) {
  const parsed = { transcript: null, task: null };
  let i = 0;
  while (i < argv.length) {
    const flag = argv[i];
    i += 1;
    if (flag === "--transcript") {
      parsed.transcript = argv[i];
      i += 1;
    } else if (flag === "--task") {
      parsed.task = argv[i];
      i += 1;
    }
  }
  return parsed;
}
|
|
26
|
+
|
|
27
|
+
// JSON.parse that yields `def` instead of throwing on bad input.
function safeJSON(s, def = null) {
  let value = def;
  try {
    value = JSON.parse(s);
  } catch {
    // malformed JSON — keep the default
  }
  return value;
}
|
|
30
|
+
|
|
31
|
+
// Read a UTF-8 text file; `fallback` when missing or unreadable.
function loadFile(path, fallback = null) {
  let text = fallback;
  try {
    text = readFileSync(path, "utf8");
  } catch {
    // missing / unreadable — keep the fallback
  }
  return text;
}
|
|
34
|
+
|
|
35
|
+
// Parse the project's feature_list.json; null when the file is absent
// or contains invalid JSON.
function loadFeatureList() {
  const raw = loadFile(resolve(ROOT, "feature_list.json"));
  if (!raw) {
    return null;
  }
  return safeJSON(raw);
}
|
|
40
|
+
|
|
41
|
+
// Files changed in the agent's run, relative to HEAD~1 (one commit
// before the eval started — eval-runner pins HEAD with a tag before
// each task). Empty array when git fails.
function gitDiffFiles() {
  const result = spawnSync("git", ["diff", "--name-only", "HEAD~1...HEAD"], {
    cwd: ROOT,
    encoding: "utf8",
  });
  if (result.status !== 0) {
    return [];
  }
  return (result.stdout || "").split("\n").filter((f) => f.length > 0);
}
|
|
50
|
+
|
|
51
|
+
// Extract tool/skill invocations from a claude-cli stream-json (JSONL)
// transcript. Each matching record yields { tool, input } so the rubric
// can spot skill invocations like /add-feature.
function transcriptToolCalls(transcriptPath) {
  const body = loadFile(transcriptPath, "");
  const calls = [];
  for (const line of body.split("\n")) {
    if (!line.trim()) continue;
    const rec = safeJSON(line);
    if (!rec) continue;
    const looksLikeToolRecord = rec.type === "tool_use" || rec.tool || rec.skill;
    if (!looksLikeToolRecord) continue;
    calls.push({
      tool: rec.tool || rec.skill || rec.type,
      input: rec.input || rec.tool_input || rec.arguments || null,
    });
  }
  return calls;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Grade one eval run along four dimensions.
 *
 * @param {object} args
 * @param {object|null} args.task - parsed task spec (reads expected.filesChanged)
 * @param {object|null} args.fl - parsed feature_list.json after the run
 * @param {string[]} args.diffFiles - files changed in the run's git diff
 * @param {Array<{tool: string, input: *}>} args.toolCalls - transcript tool calls
 * @returns {{overall: string, dimensions: object, reasons: string[], diff_files: string[]}}
 *   overall is "PASS" only when both outcome and process pass; style and
 *   efficiency are advisory (start as "warn", never fail the run).
 */
function grade({ task, fl, diffFiles, toolCalls }) {
  const dims = { outcome: "fail", process: "fail", style: "warn", efficiency: "warn" };
  const reasons = [];

  // --- outcome ---
  // features[0].steps[0].passes === true AND tests[] is non-empty AND
  // at least one tests[] entry appears in diffFiles.
  const step = fl?.features?.[0]?.steps?.[0];
  if (!step) {
    reasons.push("outcome: no features[0].steps[0] found in feature_list.json after run");
  } else if (step.passes !== true) {
    reasons.push(`outcome: features[0].steps[0].passes is ${JSON.stringify(step.passes)}, want true`);
  } else if (!Array.isArray(step.tests) || step.tests.length === 0) {
    reasons.push("outcome: features[0].steps[0].tests is empty — done flipped without test reference");
  } else if (!step.tests.some((t) => diffFiles.includes(t))) {
    reasons.push(`outcome: feature_list.json#tests references [${step.tests.join(", ")}] but none appear in the diff`);
  } else {
    dims.outcome = "pass";
  }

  // --- process ---
  // The agent should invoke /add-feature (or /refactor-feature) AND make
  // a write to the handler + test file in the same run.
  const SKILL_RE = /(add-feature|refactor-feature)/i;
  const ranSkill = toolCalls.some(
    (c) => SKILL_RE.test(c.tool || "") || SKILL_RE.test(c.input?.skill || ""),
  );
  // A test file is anything with "test" in its path or a ".spec." segment.
  const isTestFile = (f) => /test/i.test(f) || /\.spec\./.test(f);
  // FIX: previously only /test/i was excluded from handler writes, so a
  // lone "foo.spec.js" counted as BOTH a handler and a test write and the
  // process check could pass with no real handler change. Exclude all
  // test files from handlers.
  const handlerWrites = diffFiles.filter(
    (f) => /\.(ts|tsx|js|mjs|py|rs|go)$/.test(f) && !isTestFile(f),
  );
  const testWrites = diffFiles.filter(isTestFile);
  if (!ranSkill) {
    reasons.push("process: agent did not invoke /add-feature or /refactor-feature");
  } else if (handlerWrites.length === 0) {
    reasons.push("process: no handler file appeared in diff");
  } else if (testWrites.length === 0) {
    reasons.push("process: no test file appeared in diff");
  } else {
    dims.process = "pass";
  }

  // --- style ---
  // PROGRESS.md should be appended (kit convention). Soft check.
  if (diffFiles.includes(".harness/PROGRESS.md")) {
    dims.style = "pass";
  } else {
    reasons.push("style: .harness/PROGRESS.md not appended (soft fail)");
  }

  // --- efficiency ---
  // expected.tokensMax — actual token count comes from transcript meta.
  // Without that we can't grade hard; warn-pass if filesChanged within
  // task.expected.filesChanged bounds.
  const max = task?.expected?.filesChanged?.max ?? 99;
  const min = task?.expected?.filesChanged?.min ?? 1;
  if (diffFiles.length >= min && diffFiles.length <= max) {
    dims.efficiency = "pass";
  } else {
    reasons.push(`efficiency: ${diffFiles.length} files changed, want ${min}-${max}`);
  }

  const overall = (dims.outcome === "pass" && dims.process === "pass") ? "PASS" : "FAIL";
  return { overall, dimensions: dims, reasons, diff_files: diffFiles };
}
|
|
137
|
+
|
|
138
|
+
/**
 * Entry point: grade the current run and print a JSON verdict to stdout.
 * Reads optional --transcript / --task paths (resolved against ROOT).
 */
function main() {
  const args = parseArgs(process.argv.slice(2));
  let task = null;
  if (args.task) {
    const taskText = loadFile(resolve(ROOT, args.task)) ?? "";
    task = safeJSON(taskText, null);
  }
  const toolCalls = args.transcript
    ? transcriptToolCalls(resolve(ROOT, args.transcript))
    : [];
  const verdict = grade({
    task,
    fl: loadFeatureList(),
    diffFiles: gitDiffFiles(),
    toolCalls,
  });
  process.stdout.write(`${JSON.stringify(verdict, null, 2)}\n`);
}
|
|
147
|
+
|
|
148
|
+
main();
|