@slowdini/slow-powers-opencode 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/README.md +18 -8
  2. package/package.json +5 -1
  3. package/skills/evaluating-skills/SKILL.md +19 -17
  4. package/skills/evaluating-skills/harness-details/claude.md +51 -15
  5. package/skills/evaluating-skills/harness-parity.md +155 -0
  6. package/skills/evaluating-skills/runner/README.md +28 -19
  7. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
  8. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
  9. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
  10. package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
  11. package/skills/evaluating-skills/runner/aggregate.ts +21 -0
  12. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
  13. package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
  14. package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
  15. package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
  16. package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
  17. package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
  18. package/skills/evaluating-skills/runner/record-runs.ts +209 -0
  19. package/skills/evaluating-skills/runner/run.test.ts +523 -0
  20. package/skills/evaluating-skills/runner/run.ts +376 -17
  21. package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
  22. package/skills/evaluating-skills/runner/types.ts +9 -0
  23. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
  24. package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
  25. package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
  26. package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
  27. package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
  28. package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
  29. package/skills/verifying-development-work/SKILL.md +17 -6
  30. package/skills/verifying-development-work/code-review.md +68 -0
  31. package/skills/verifying-development-work/comment-review.md +85 -0
  32. package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
  33. package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
  34. package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
  35. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
  36. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
  37. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
  38. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
  39. package/skills/verifying-development-work/evals/evals.json +34 -2
  40. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
  41. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
  42. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  43. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  44. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
  45. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  46. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  47. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  48. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
  49. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
  50. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
  51. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
@@ -0,0 +1,227 @@
1
+ import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
+ import { existsSync, mkdirSync, rmSync, writeFileSync } from "node:fs";
3
+ import { tmpdir } from "node:os";
4
+ import { join } from "node:path";
5
+ import {
6
+ cleanupWorkspace,
7
+ PROMOTED_MARKER,
8
+ SNAPSHOT_META,
9
+ } from "./workspace-teardown";
10
+
11
/** Per-process scratch directory under the OS temp dir; every test case's workspace lives beneath it. */
const FIXTURE_ROOT = join(
  tmpdir(),
  `slow-powers-workspace-teardown-test-${process.pid}`,
);

// Create the scratch root once, before any test runs.
beforeAll(() => {
  mkdirSync(FIXTURE_ROOT, { recursive: true });
});

// Remove the scratch root and everything the tests left under it.
afterAll(() => {
  rmSync(FIXTURE_ROOT, { force: true, recursive: true });
});
23
+
24
let caseSeq = 0;

/**
 * Create and return a brand-new, empty `skills-workspace` directory.
 * Each call gets its own `case-<n>` parent under FIXTURE_ROOT, so test
 * cases never observe each other's files.
 */
function freshWorkspace(): string {
  const caseDir = `case-${++caseSeq}`;
  const workspaceRoot = join(FIXTURE_ROOT, caseDir, "skills-workspace");
  mkdirSync(workspaceRoot, { recursive: true });
  return workspaceRoot;
}
35
+
36
/**
 * Write `value` to `path` as 2-space-indented JSON followed by a newline,
 * creating the parent directory chain first.
 */
function writeJson(path: string, value: unknown) {
  const parentDir = join(path, "..");
  mkdirSync(parentDir, { recursive: true });
  const serialized = JSON.stringify(value, null, 2);
  writeFileSync(path, `${serialized}\n`);
}
40
+
41
/**
 * Create `<workspaceRoot>/<skill>/<iteration>/` and populate it with the
 * artifacts selected by `opts`:
 * - `scaffoldingOnly` → `dispatch.json`
 * - `benchmark`       → `benchmark.json`
 * - `runRecord`       → `eval-e1/with_skill/run.json`
 * - `grading`         → `eval-e1/with_skill/grading.json`
 * - `promoted`        → the PROMOTED_MARKER file
 *
 * @returns the iteration directory path.
 */
function makeIteration(
  workspaceRoot: string,
  skill: string,
  iteration: string,
  opts: {
    promoted?: boolean;
    benchmark?: boolean;
    runRecord?: boolean;
    grading?: boolean;
    scaffoldingOnly?: boolean;
  },
): string {
  const { scaffoldingOnly, benchmark, runRecord, grading, promoted } = opts;
  const dir = join(workspaceRoot, skill, iteration);
  const condDir = join(dir, "eval-e1", "with_skill");

  mkdirSync(dir, { recursive: true });

  if (scaffoldingOnly) writeFileSync(join(dir, "dispatch.json"), "[]\n");
  if (benchmark) {
    writeJson(join(dir, "benchmark.json"), { delta: { pass_rate: 0.5 } });
  }
  if (runRecord) writeJson(join(condDir, "run.json"), { eval_id: "e1" });
  if (grading) {
    writeJson(join(condDir, "grading.json"), { summary: { pass_rate: 1 } });
  }
  if (promoted) {
    writeJson(join(dir, PROMOTED_MARKER), {
      promoted_at: "2026-06-04T00:00:00.000Z",
      baseline_dir: "/somewhere/evals/baseline",
      commit: "abc1234",
    });
  }

  return dir;
}
81
+
82
/**
 * Create `<workspaceRoot>/<skill>/snapshots/<label>/` containing a `SKILL.md`.
 * When `source` is non-null a SNAPSHOT_META file is written too: `"ref"`
 * snapshots record `{ source, ref: "HEAD~1" }`, others record `{ source }`.
 * Passing `null` yields a snapshot with no metadata file (the "legacy" case).
 *
 * @returns the snapshot directory path.
 */
function makeSnapshot(
  workspaceRoot: string,
  skill: string,
  label: string,
  source: "ref" | "working-tree" | null,
): string {
  const dir = join(workspaceRoot, skill, "snapshots", label);
  mkdirSync(dir, { recursive: true });
  writeFileSync(join(dir, "SKILL.md"), "snapshot body\n");

  if (source === null) return dir;

  const meta = source === "ref" ? { source, ref: "HEAD~1" } : { source };
  writeJson(join(dir, SNAPSHOT_META), meta);
  return dir;
}
99
+
100
describe("cleanupWorkspace — iterations", () => {
  const SKILL = "mr-review";

  /** Names of the iterations a cleanup summary reports as kept. */
  const keptNames = (summary: ReturnType<typeof cleanupWorkspace>): string[] =>
    summary.keptIterations.map((kept) => kept.iteration);

  test("removes a promoted iteration and prunes the emptied workspace", () => {
    const workspace = freshWorkspace();
    const iterDir = makeIteration(workspace, SKILL, "iteration-1", {
      promoted: true,
      benchmark: true,
      grading: true,
    });

    const result = cleanupWorkspace(workspace, SKILL);

    expect(existsSync(iterDir)).toBe(false);
    expect(result.removedIterations).toEqual(["iteration-1"]);
    expect(result.workspaceRemoved).toBe(true);
    // With the only iteration gone, the skill dir and the root are empty and get pruned.
    expect(existsSync(join(workspace, SKILL))).toBe(false);
    expect(existsSync(workspace)).toBe(false);
  });

  test("keeps an unpromoted iteration that holds a benchmark, and reports it", () => {
    const workspace = freshWorkspace();
    const iterDir = makeIteration(workspace, SKILL, "iteration-1", {
      benchmark: true,
    });

    const result = cleanupWorkspace(workspace, SKILL);

    expect(existsSync(iterDir)).toBe(true);
    expect(result.removedIterations).toEqual([]);
    expect(keptNames(result)).toEqual(["iteration-1"]);
    // Nothing became empty, so the workspace root is left in place.
    expect(existsSync(workspace)).toBe(true);
  });

  test("keeps an unpromoted iteration that holds only a run record", () => {
    const workspace = freshWorkspace();
    const iterDir = makeIteration(workspace, SKILL, "iteration-1", {
      runRecord: true,
    });

    const result = cleanupWorkspace(workspace, SKILL);

    expect(existsSync(iterDir)).toBe(true);
    expect(keptNames(result)).toEqual(["iteration-1"]);
  });

  test("removes an unpromoted scaffolding-only iteration (no captured results)", () => {
    const workspace = freshWorkspace();
    const iterDir = makeIteration(workspace, SKILL, "iteration-1", {
      scaffoldingOnly: true,
    });

    const result = cleanupWorkspace(workspace, SKILL);

    expect(existsSync(iterDir)).toBe(false);
    expect(result.removedIterations).toEqual(["iteration-1"]);
  });

  test("mixed: promoted removed, unpromoted-with-results kept, skill dir NOT pruned", () => {
    const workspace = freshWorkspace();
    const promotedDir = makeIteration(workspace, SKILL, "iteration-1", {
      promoted: true,
      benchmark: true,
    });
    const keptDir = makeIteration(workspace, SKILL, "iteration-2", {
      benchmark: true,
    });

    const result = cleanupWorkspace(workspace, SKILL);

    expect(existsSync(promotedDir)).toBe(false);
    expect(existsSync(keptDir)).toBe(true);
    expect(result.removedIterations).toEqual(["iteration-1"]);
    expect(keptNames(result)).toEqual(["iteration-2"]);
    expect(result.workspaceRemoved).toBe(false);
    expect(existsSync(join(workspace, SKILL))).toBe(true);
  });
});
184
+
185
describe("cleanupWorkspace — snapshots", () => {
  test("removes ref snapshots, keeps working-tree and legacy (no-meta) snapshots", () => {
    const ws = freshWorkspace();
    const refSnap = makeSnapshot(ws, "mr-review", "old-ref", "ref");
    const wtSnap = makeSnapshot(ws, "mr-review", "wt", "working-tree");
    const legacySnap = makeSnapshot(ws, "mr-review", "legacy", null);

    const summary = cleanupWorkspace(ws, "mr-review");

    expect(existsSync(refSnap)).toBe(false);
    expect(existsSync(wtSnap)).toBe(true);
    expect(existsSync(legacySnap)).toBe(true);
    expect(summary.removedSnapshots).toEqual(["old-ref"]);
    // Compare kept labels order-independently, but sort a copy:
    // Array.prototype.sort() works in place and would otherwise mutate
    // the summary object under test.
    expect([...summary.keptSnapshots].sort()).toEqual(["legacy", "wt"]);
  });
});
201
+
202
describe("cleanupWorkspace — safety", () => {
  test("never touches another skill's workspace, and leaves the root intact", () => {
    const workspace = freshWorkspace();
    makeIteration(workspace, "mr-review", "iteration-1", { promoted: true });
    const neighbourIter = makeIteration(workspace, "other-skill", "iteration-1", {
      benchmark: true,
    });

    cleanupWorkspace(workspace, "mr-review");

    const targetSkillDir = join(workspace, "mr-review");
    expect(existsSync(targetSkillDir)).toBe(false);
    expect(existsSync(neighbourIter)).toBe(true);
    // other-skill still occupies the root, so the root must not be pruned.
    expect(existsSync(workspace)).toBe(true);
  });

  test("returns an empty summary and does not throw when the skill has no workspace", () => {
    const workspace = freshWorkspace();

    const result = cleanupWorkspace(workspace, "never-ran");

    expect(result.removedIterations).toEqual([]);
    expect(result.keptIterations).toEqual([]);
    expect(result.removedSnapshots).toEqual([]);
    expect(result.keptSnapshots).toEqual([]);
    expect(result.workspaceRemoved).toBe(false);
  });
});
@@ -0,0 +1,136 @@
1
+ import { existsSync, readdirSync, readFileSync, rmSync } from "node:fs";
2
+ import { join } from "node:path";
3
+
4
/**
 * Marker `promote-baseline` drops into an iteration dir once that iteration's
 * durable results (benchmark + gradings) are committed under the skill's
 * `evals/baseline/`. Teardown treats its presence as "safe to delete" — the
 * data now lives in version control. `cleanupWorkspace` only checks that the
 * file exists; it does not read its contents.
 */
export const PROMOTED_MARKER = ".promoted.json";

/**
 * Provenance the `snapshot` command writes into each `snapshots/<label>/` dir,
 * recording whether it was materialized from a git ref (reproducible) or copied
 * from the working tree (not reproducible). Teardown only reclaims ref snapshots:
 * the file is read as JSON and only a `source` value of `"ref"` marks the
 * snapshot as removable.
 */
export const SNAPSHOT_META = ".snapshot-meta.json";
18
+
19
/**
 * Report returned by `cleanupWorkspace`: what it deleted from one skill's
 * workspace subtree and what it deliberately left in place.
 */
export type WorkspaceCleanupSummary = {
  /** Iteration dir names removed (promoted, or pure scaffolding). */
  removedIterations: string[];
  /** Iterations kept because they hold uncommitted results, with the reason. */
  keptIterations: { iteration: string; reason: string }[];
  /** Snapshot labels removed (reproducible from a git ref). */
  removedSnapshots: string[];
  /** Snapshot labels kept (working-tree or legacy, can't be regenerated). */
  keptSnapshots: string[];
  /** True when the skill's whole workspace subtree was removed. */
  workspaceRemoved: boolean;
};
31
+
32
/**
 * Delete `dir` when it exists and has no entries; otherwise do nothing.
 * A missing path is a no-op, and a directory with any content is left untouched.
 */
function pruneIfEmpty(dir: string): void {
  if (!existsSync(dir)) return;
  const isEmpty = readdirSync(dir).length === 0;
  if (isEmpty) rmSync(dir, { force: true, recursive: true });
}
38
+
39
/**
 * Decide whether an iteration directory holds results worth preserving.
 *
 * Returns true when `iterDir` contains a top-level `benchmark.json`, or when
 * any `eval-*` subdirectory has a child directory containing `run.json` or
 * `grading.json`. Anything less (e.g. a `--dry-run`, or a run staged but never
 * dispatched) counts as reproducible scaffolding and yields false.
 */
function iterationHasResults(iterDir: string): boolean {
  if (existsSync(join(iterDir, "benchmark.json"))) return true;

  const evalEntries = readdirSync(iterDir, { withFileTypes: true }).filter(
    (entry) => entry.isDirectory() && entry.name.startsWith("eval-"),
  );

  return evalEntries.some((evalEntry) => {
    const evalDir = join(iterDir, evalEntry.name);
    return readdirSync(evalDir, { withFileTypes: true }).some((cond) => {
      if (!cond.isDirectory()) return false;
      const condDir = join(evalDir, cond.name);
      return (
        existsSync(join(condDir, "run.json")) ||
        existsSync(join(condDir, "grading.json"))
      );
    });
  });
}
59
+
60
/**
 * Read the `source` field from a snapshot's SNAPSHOT_META file.
 *
 * @param snapDir Path of one `snapshots/<label>/` directory.
 * @returns The recorded `source` string, or `null` when the metadata file is
 *   missing, unreadable, not valid JSON, not a JSON object, or has no string
 *   `source` field.
 */
function snapshotSource(snapDir: string): string | null {
  const metaPath = join(snapDir, SNAPSHOT_META);
  if (!existsSync(metaPath)) return null;
  try {
    // Parse as `unknown` and narrow: the file is external input, so a
    // non-object payload or a non-string `source` must not leak out under
    // the declared `string | null` return type.
    const meta: unknown = JSON.parse(readFileSync(metaPath, "utf8"));
    if (typeof meta !== "object" || meta === null) return null;
    const source = (meta as { source?: unknown }).source;
    return typeof source === "string" ? source : null;
  } catch {
    // Unreadable or malformed metadata is treated the same as missing
    // metadata, i.e. the snapshot's origin is unknown.
    return null;
  }
}
72
+
73
/**
 * End-of-run cleanup of one skill's `<workspaceRoot>/<skillName>/` subtree.
 * It removes what can be reproduced or is already committed, and keeps any
 * results the user has not yet moved into version control.
 *
 * Iterations (`iteration-*` dirs):
 * - promoted (marker file present) → removed
 * - unpromoted with captured results → kept and reported with a reason
 * - unpromoted scaffolding → removed
 *
 * Snapshots (dirs under `snapshots/`): ref-sourced → removed; working-tree or
 * legacy (no metadata) → kept.
 *
 * Afterwards `snapshots/`, the skill dir and the workspace root are each
 * pruned only if empty, so another skill's artifacts are never touched.
 *
 * @param workspaceRoot Directory that holds one subdirectory per skill.
 * @param skillName Name of the skill whose subtree should be cleaned.
 * @returns A summary of what was removed and kept. All lists are empty and
 *   `workspaceRemoved` is false when the skill has no workspace directory.
 */
export function cleanupWorkspace(
  workspaceRoot: string,
  skillName: string,
): WorkspaceCleanupSummary {
  const report: WorkspaceCleanupSummary = {
    removedIterations: [],
    keptIterations: [],
    removedSnapshots: [],
    keptSnapshots: [],
    workspaceRemoved: false,
  };

  const skillDir = join(workspaceRoot, skillName);
  if (!existsSync(skillDir)) return report;

  const iterationEntries = readdirSync(skillDir, { withFileTypes: true }).filter(
    (entry) => entry.isDirectory() && entry.name.startsWith("iteration-"),
  );
  for (const { name } of iterationEntries) {
    const iterDir = join(skillDir, name);
    const promoted = existsSync(join(iterDir, PROMOTED_MARKER));

    // Only unpromoted iterations are inspected for results; a promoted one
    // is removed unconditionally.
    if (!promoted && iterationHasResults(iterDir)) {
      report.keptIterations.push({
        iteration: name,
        reason: "uncommitted results — not promoted to evals/baseline/",
      });
      continue;
    }

    rmSync(iterDir, { recursive: true, force: true });
    report.removedIterations.push(name);
  }

  const snapshotsDir = join(skillDir, "snapshots");
  if (existsSync(snapshotsDir)) {
    for (const entry of readdirSync(snapshotsDir, { withFileTypes: true })) {
      if (!entry.isDirectory()) continue;

      const snapDir = join(snapshotsDir, entry.name);
      const fromRef = snapshotSource(snapDir) === "ref";
      if (fromRef) rmSync(snapDir, { recursive: true, force: true });

      const bucket = fromRef ? report.removedSnapshots : report.keptSnapshots;
      bucket.push(entry.name);
    }
    pruneIfEmpty(snapshotsDir);
  }

  // Prune upward: the skill dir first, then the shared root, which only goes
  // if no other entry remains in it.
  pruneIfEmpty(skillDir);
  report.workspaceRemoved = !existsSync(skillDir);
  pruneIfEmpty(workspaceRoot);

  return report;
}
@@ -70,11 +70,11 @@
70
70
  },
71
71
  "total_tokens": {
72
72
  "type": ["integer", "null"],
73
- "description": "From the harness's task completion event. May be null if the harness does not surface this."
73
+ "description": "From the harness's task completion event, or derived from the persisted transcript by record-runs (usage summed across unique message ids, including cache creation/read tokens — a different accounting than the completion event). Canonical timing lives in the sibling timing.json, whose `source` field records which origin produced it. May be null if neither source is available."
74
74
  },
75
75
  "duration_ms": {
76
76
  "type": ["integer", "null"],
77
- "description": "From the harness's task completion event. May be null if the harness does not surface this."
77
+ "description": "From the harness's task completion event, or derived from the persisted transcript by record-runs (wall clock between the first and last transcript timestamps). Canonical timing lives in the sibling timing.json. May be null if neither source is available."
78
78
  }
79
79
  }
80
80
  }
@@ -11,11 +11,12 @@
11
11
  "iteration": { "type": "integer" },
12
12
  "totals": {
13
13
  "type": "object",
14
- "required": ["violations", "warnings"],
14
+ "required": ["violations", "warnings", "live_source_reads"],
15
15
  "additionalProperties": false,
16
16
  "properties": {
17
17
  "violations": { "type": "integer" },
18
- "warnings": { "type": "integer" }
18
+ "warnings": { "type": "integer" },
19
+ "live_source_reads": { "type": "integer" }
19
20
  }
20
21
  },
21
22
  "runs": {
@@ -23,7 +24,13 @@
23
24
  "description": "One entry per (eval, condition) run that had at least one finding.",
24
25
  "items": {
25
26
  "type": "object",
26
- "required": ["eval_id", "condition", "violations", "warnings"],
27
+ "required": [
28
+ "eval_id",
29
+ "condition",
30
+ "violations",
31
+ "warnings",
32
+ "live_source_reads"
33
+ ],
27
34
  "additionalProperties": false,
28
35
  "properties": {
29
36
  "eval_id": { "type": "string" },
@@ -37,6 +44,11 @@
37
44
  "type": "array",
38
45
  "description": "Heuristic: a Bash command matched a mutating pattern (install, git, sed -i, redirection) without referencing the outputs dir.",
39
46
  "items": { "$ref": "#/definitions/finding" }
47
+ },
48
+ "live_source_reads": {
49
+ "type": "array",
50
+ "description": "A read tool or Bash command accessed the live skill-under-test directory instead of the staged copy — the arm may be contaminated (staged-slug resolution race).",
51
+ "items": { "$ref": "#/definitions/finding" }
40
52
  }
41
53
  }
42
54
  }
@@ -61,7 +61,9 @@ User request:
61
61
 
62
62
  ## After the subagent completes
63
63
 
64
- The operator (or the runner) must capture:
64
+ Two records must exist per run: `{{output_dir}}/../run.json` (matching `schema/run-record.schema.json`) and `{{output_dir}}/../timing.json`.
65
65
 
66
- 1. The full transcript / tool invocations → convert via the harness adapter into `{{output_dir}}/../run.json` matching `schema/run-record.schema.json`.
67
- 2. `total_tokens` and `duration_ms` from the harness's task completion event → `{{output_dir}}/../timing.json`. **These values may not be persisted anywhere else — save them immediately.**
66
+ - **Harnesses with persisted transcripts (Claude Code):** `record-runs` assembles both from disk after all dispatches — carry-over fields from `dispatch.json`, `final_message` from `{{output_dir}}/final-message.md`, `tool_invocations`/tokens/duration from the transcript. The operator captures nothing per-task. Optionally, completion-event timing written to `timing.json` at dispatch time (with `"source": "completion-event"`) takes precedence — `record-runs` only backfills, never overwrites.
67
+ - **Transcript-less harnesses:** the operator (or the runner) captures manually, as before:
68
+ 1. The full transcript / tool invocations → convert via the harness adapter into `{{output_dir}}/../run.json`.
69
+ 2. `total_tokens` and `duration_ms` from the harness's task completion event → `{{output_dir}}/../timing.json`. **These values may not be persisted anywhere else — save them immediately.**
@@ -60,7 +60,7 @@ Roughly in increasing order of effort / payoff:
60
60
  class of eval measurable. This is the high-value framework improvement.
61
61
  3. **Real harness-mode injection.** Reproduce the plan-mode suppression by running
62
62
  the eval subagent *inside* a real plan mode rather than a described one. Tracked
63
- as a parity goal in `harness-parity-check.md`; the biggest lift.
63
+ as a parity goal in `skills/evaluating-skills/harness-parity.md`; the biggest lift.
64
64
 
65
65
  ## Bigger-picture testing strategy (from the maintainer)
66
66
 
@@ -38,13 +38,24 @@ Before claiming any task is finished, making a success claim, or declaring a bug
38
38
 
39
39
  ---
40
40
 
41
- ## Finishing: Review First, Then Verify
41
+ ## Finishing: Review Code, Verify, Then Review Comments
42
42
 
43
- The Gate Function above is your discipline at *every* completion claim. When you believe the work itself is done, run this finishing sequence in order — review **before** the final verification, so the evidence you hand back covers the exact code being returned:
43
+ The Gate Function above is your discipline at *every* completion claim. When you believe the work itself is done, run these three finishing phases **in order**. The order is deliberate: every code change happens in phase 1, *before* the verification, so the evidence you hand back is guaranteed to cover the exact code being returned — and comment cleanup comes *after*, where it can't disturb that check.
44
44
 
45
- 1. **Review the diff.** Invoke your harness's built-in code-review capability over the change. Verification proves the work *runs*; review catches what running can't — silent regressions, missed edge cases, leftover debug code, and reuse or simplification you'd want before another person reads the diff. This is a quick final check, not a second project. If your harness has no code-review capability, say so and re-read the diff yourself with the same intent.
46
- 2. **Address what it surfaces.** Fix or explicitly flag each finding. Any fix changes the code.
47
- 3. **Run the final verification last, on the result.** Now apply the Gate Function fresh to the post-review code and present *that* output as your evidence. Running verification before review would prove a version of the code you then changed — the check the user sees must be the check on the code the user gets.
45
+ 1. **Review and fix the code** — follow [`code-review.md`](code-review.md). This is the only phase that changes behavior. Review catches what running can't — silent regressions, missed edge cases, leftover debug code, reuse or simplification — then you fix or flag each finding, and *the code is now frozen*. Size the review to the change: a quick check, not a second project. (Comments are **not** reviewed here — they get phase 3.)
46
+ 2. **Run the final verification** — apply the Gate Function fresh to the now-frozen code and present *that* output as your evidence. Because all code changes happened in phase 1, this check covers exactly what the user gets.
47
+ 3. **Review and clean the comments** — follow [`comment-review.md`](comment-review.md). This pass touches *only* comments, so it changes no behavior and needs **no re-verification**: delete narrative / step-by-step / ticket comments, keeping only true Explanation or exported-member Documentation, before the diff reaches a human.
48
+
49
+ **Copy this checklist into your task tracker the moment you start finishing, and tick each box in order.** The ordering *is* the discipline — and an untracked checklist is one whose middle steps get skipped under momentum:
50
+
51
+ ```
52
+ - [ ] Phase 1 — reviewed the CODE against intent, ranked findings, fixed/flagged each (per code-review.md); code is now frozen
53
+ - [ ] Phase 2 — ran the final verification fresh on the frozen code, and presented that output as evidence
54
+ - [ ] Phase 3 — reviewed the COMMENTS (per comment-review.md): deleted narrative / step-by-step / ticket comments, kept only true Explanation or exported Documentation
55
+ - [ ] Surfaced integration options (merge / push+PR / leave as-is / discard) — did not merge or push on my own
56
+ ```
57
+
58
+ The last box is its own gate; the section below is why it's never yours to skip.
48
59
 
49
60
  ---
50
61
 
@@ -69,7 +80,7 @@ Verified, reviewed work is still *your* checkpoint, not a decision to merge. Int
69
80
  | "It's obvious this is correct" | Obvious bugs are the most embarrassing. Reading code predicts behavior; only running it proves behavior. |
70
81
  | "I'll verify after committing" | Verification after the claim is too late. |
71
82
  | "The build should be fine" | "Should" is not evidence. |
72
- | "Tests pass, so we're done here" | Verification is one step of finishing, not the whole sequence. Review the diff, then run the final check on the reviewed code. |
83
+ | "Tests pass, so we're done here" | Verification is one phase of finishing, not the whole sequence — review and fix the code, verify the frozen result, then clean the comments. |
73
84
  | "The user said ship it, so I'll just merge" | "Ship it" authorizes the user's choice, not a unilateral merge or push. |
74
85
 
75
86
  ---
@@ -0,0 +1,68 @@
1
+ # Reviewing the Code
2
+
3
+ This is **phase 1** of the finishing sequence in [`SKILL.md`](SKILL.md) — the
4
+ code review. Review and fix the *code* here. This is the only phase that changes
5
+ behavior, so once you finish it the code is frozen.
6
+
7
+ ---
8
+
9
+ ## Size the review to the change
10
+
11
+ Review depth matches the size and risk of the diff. A one-line fix gets a careful
12
+ read and a moment's thought about what it could break; a new subsystem gets more.
13
+ Don't run a heavyweight audit over a trivial change to look thorough — a review
14
+ that's louder than the change it covers is the failure this guidance exists to
15
+ prevent.
16
+
17
+ Do the review however your harness makes natural — read the diff inline, or
18
+ dispatch it to a general-purpose subagent.
19
+
20
+ ---
21
+
22
+ ## Read the diff against intent
23
+
24
+ Read the actual diff — not your memory of what you changed — against the plan or
25
+ the request. Cite findings by `file:line` so each one is checkable. Look for:
26
+
27
+ - **Intent alignment** — does the change do what was asked? Are deviations
28
+ deliberate improvements, or drift?
29
+ - **Correctness** — bugs, off-by-ones, wrong conditions, mishandled `null`/empty.
30
+ - **Error & edge cases** — failure paths, boundaries, and inputs the happy path skips.
31
+ - **Reuse & simplification** — existing helpers ignored, needless abstraction,
32
+ code that could be plainer.
33
+ - **Leftover scaffolding** — debug prints, commented-out code, dead branches,
34
+ silent regressions to nearby behavior.
35
+ - **Tests** — do they exercise real behavior, and do they cover what changed?
36
+
37
+ This is not an exhaustive checklist to march through — it's where real problems
38
+ tend to hide. Spend attention where this particular diff warrants it.
39
+
40
+ ---
41
+
42
+ ## Rank, then return only the top findings
43
+
44
+ Sort what you found by severity and report only the few that matter. The point of
45
+ ranking is to *drop* noise, not to pad a list.
46
+
47
+ | Severity | What belongs here |
48
+ |----------|-------------------|
49
+ | **Critical — must fix** | Bugs, security holes, data loss, broken functionality. |
50
+ | **Important — should fix** | Missing behavior, weak error handling, test gaps, architecture problems. |
51
+ | **Minor — nice to have** | Style, micro-optimizations, polish. |
52
+
53
+ Report the most important handful. **Drop Minor nitpicks unless nothing more
54
+ serious exists** — a pile of trivia buries the one finding that mattered and
55
+ trains the reader to skim past your review. Don't manufacture findings to fill the
56
+ tiers; "nothing critical, one important thing" is a complete and good result.
57
+ Close with a one-line verdict.
58
+
59
+ ---
60
+
61
+ ## Then: address the findings — and freeze the code
62
+
63
+ Fix or explicitly flag each code finding you kept. Any fix changes the code — so
64
+ make all of those changes *now*, in this phase. When you're done, the code is
65
+ **frozen**: nothing in the remaining phases touches behavior. Return to the
66
+ finishing sequence in [`SKILL.md`](SKILL.md) and run the **final verification**
67
+ (phase 2) on this frozen result — the check you hand back is then guaranteed to
68
+ cover the exact code being returned.
@@ -0,0 +1,85 @@
1
+ # Reviewing the Comments
2
+
3
+ This is **phase 3** — the last step of the finishing sequence in [`SKILL.md`](SKILL.md).
4
+ By now the code has been reviewed (phase 1) and verified (phase 2). The code is frozen;
5
+ **this pass touches only comments.** That is the whole reason it comes last: a
6
+ comment edit can't change behavior, so it can't invalidate the verification you
7
+ just ran — there is nothing here to re-test. Do it as the final polish before the
8
+ handoff.
9
+
10
+ ---
11
+
12
+ ## The comment-hygiene pass
13
+
14
+ Review **every comment in the changed code** with one goal: **delete as many as
15
+ possible.**
16
+
17
+ This runs against your own instinct. Writing a comment feels like preserving the
18
+ narrative — why this approach, what was tried, which ticket it traces to. But a
19
+ human reading code finds it *very hard* to skip a comment; every one they hit,
20
+ they stop and read. Narrative comments tax every future reader to record a story
21
+ that belongs in the commit message or the PR, not the source. Left in, they
22
+ become the thing the user has to delete by hand before merging — so delete them
23
+ now, on their behalf.
24
+
25
+ A comment survives only if it fits one of two categories **and** meets its bar:
26
+
27
+ 1. **Explanation.** Code that is genuinely hard to follow from reading it — a
28
+ subtle algorithm, a deliberate break from the usual pattern, a non-obvious
29
+ constraint. The comment fills the gap with an *evergreen* reason (true a year
30
+ from now, not "fixes the bug from Tuesday"). These are **extremely rare**:
31
+ well-written code is self-commenting, and a reader fluent in code can follow
32
+ even sophisticated paths when the code itself is clear. If the right fix is to
33
+ make the code clearer, do that instead of explaining unclear code.
34
+ 2. **Documentation.** A concise doc-style comment (jsdoc and equivalents) on an
35
+ **exported** member, where the text is surfaced by doc generators and editor
36
+ hints to readers who *don't* have the source in front of them. These almost
37
+ always earn their place. Keep them concise and evergreen, matching the
38
+ surrounding style; they may describe usage more freely since that's their job.
39
+
40
+ **Everything else gets deleted — about 99.9% of the time.** The most common
41
+ offender, and the one that feels most defensible, is **step-by-step narration**
42
+ that walks through what the code already says — `// Step 1: lowercase`,
43
+ `// now strip the accents`, `// finally, trim the dashes`. It reads as helpful
44
+ structure, and *that feeling is the trap*: the numbered steps restate control
45
+ flow the reader can already see in the code, so most such comments carry no
46
+ information the line below them doesn't — they only add something else to read.
47
+ "The steps make it easier to follow" is the rationalization to delete *through*,
48
+ not act on; the code is the structure. Strip the narration and nothing is lost.
49
+ The same goes for prose narrative ("first we… then we…"), time-sensitive comments
50
+ (ticket numbers, "the previous solution…", "changed this because…"), and any
51
+ comment that merely restates its line. A comment that fits neither surviving
52
+ category, or fits one but misses its bar, is noise. **When in doubt, delete it.**
53
+ A truly unique case might warrant a truly unusual comment — but treat that as the
54
+ rare exception it is, not the default.
55
+
56
+ ```ts
57
+ // BEFORE — every comment restates the line under it
58
+ // Step 1: lowercase the title
59
+ const lower = title.toLowerCase();
60
+ // Step 2: replace whitespace runs with a single hyphen
61
+ const hyphenated = lower.replace(/\s+/g, "-");
62
+
63
+ // AFTER — the code already says all of that
64
+ const lower = title.toLowerCase();
65
+ const hyphenated = lower.replace(/\s+/g, "-");
66
+ ```
67
+
68
+ **A kernel of value doesn't save the comment around it.** The hardest case is
69
+ the *mixed* comment — mostly narration, with one genuinely useful clause buried
70
+ in it (a real constraint, a non-obvious *why*). Keeping the whole block "because
71
+ part of it is useful" is exactly how noise survives review: a reader will keep a
72
+ comment that's 90% restatement for the sake of the 10% that matters. Don't.
73
+ **Extract the useful part, delete the rest, and if what remains earns a comment,
74
+ write it as a tight standalone one** — the kernel alone, not the narration that
75
+ carried it. A four-line "Step 1… / Step 2 *(the one real reason)* / Step 3… /
76
+ Step 4…" block collapses to a single comment stating that one reason, and the
77
+ numbered narration is gone.
78
+
79
+ ---
80
+
81
+ ## Then: hand it back
82
+
83
+ These were comment-only edits — they change no behavior, so there is **nothing to
84
+ re-verify**: the verification from phase 2 still covers the code being returned.
85
+ Return to the finishing sequence in [`SKILL.md`](SKILL.md) for the handoff.