@slowdini/slow-powers-opencode 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -8
- package/package.json +5 -1
- package/skills/evaluating-skills/SKILL.md +19 -17
- package/skills/evaluating-skills/harness-details/claude.md +51 -15
- package/skills/evaluating-skills/harness-parity.md +155 -0
- package/skills/evaluating-skills/runner/README.md +28 -19
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
- package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
- package/skills/evaluating-skills/runner/aggregate.ts +21 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
- package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
- package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
- package/skills/evaluating-skills/runner/record-runs.ts +209 -0
- package/skills/evaluating-skills/runner/run.test.ts +523 -0
- package/skills/evaluating-skills/runner/run.ts +376 -17
- package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
- package/skills/evaluating-skills/runner/types.ts +9 -0
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
- package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
- package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
- package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
- package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
- package/skills/verifying-development-work/SKILL.md +17 -6
- package/skills/verifying-development-work/code-review.md +68 -0
- package/skills/verifying-development-work/comment-review.md +85 -0
- package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
- package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
- package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/evals.json +34 -2
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
import { afterAll, beforeAll, describe, expect, test } from "bun:test";
|
|
2
|
+
import { existsSync, mkdirSync, rmSync, writeFileSync } from "node:fs";
|
|
3
|
+
import { tmpdir } from "node:os";
|
|
4
|
+
import { join } from "node:path";
|
|
5
|
+
import {
|
|
6
|
+
cleanupWorkspace,
|
|
7
|
+
PROMOTED_MARKER,
|
|
8
|
+
SNAPSHOT_META,
|
|
9
|
+
} from "./workspace-teardown";
|
|
10
|
+
|
|
11
|
+
/**
 * Scratch directory for this test file, placed under the OS temp dir and
 * suffixed with the current process id.
 */
const FIXTURE_ROOT = join(
  tmpdir(),
  `slow-powers-workspace-teardown-test-${process.pid}`,
);

// The scratch root is created once up front and removed once at the end.
beforeAll(function createFixtureRoot() {
  mkdirSync(FIXTURE_ROOT, { recursive: true });
});

afterAll(function removeFixtureRoot() {
  rmSync(FIXTURE_ROOT, { recursive: true, force: true });
});
|
|
23
|
+
|
|
24
|
+
/** Counter bumped on every `freshWorkspace()` call so each case gets its own dir. */
let caseSeq = 0;

/**
 * Create an empty `skills-workspace` directory under a new
 * `FIXTURE_ROOT/case-<n>/` folder and return its path.
 */
function freshWorkspace(): string {
  caseSeq += 1;
  const caseDir = join(FIXTURE_ROOT, `case-${caseSeq}`);
  const root = join(caseDir, "skills-workspace");
  mkdirSync(root, { recursive: true });
  return root;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Write `value` to `path` as 2-space-indented JSON followed by a newline,
 * creating the parent directory first if it does not exist.
 */
function writeJson(path: string, value: unknown) {
  const parentDir = join(path, "..");
  mkdirSync(parentDir, { recursive: true });
  const serialized = JSON.stringify(value, null, 2);
  writeFileSync(path, `${serialized}\n`);
}
|
|
40
|
+
|
|
41
|
+
/**
 * Create `<workspaceRoot>/<skill>/<iteration>/` and fill it with the artifacts
 * selected by `opts`:
 *
 * - `scaffoldingOnly` → `dispatch.json`
 * - `benchmark` → `benchmark.json`
 * - `runRecord` → `eval-e1/with_skill/run.json`
 * - `grading` → `eval-e1/with_skill/grading.json`
 * - `promoted` → the `PROMOTED_MARKER` file
 *
 * @returns the iteration directory path
 */
function makeIteration(
  workspaceRoot: string,
  skill: string,
  iteration: string,
  opts: {
    promoted?: boolean;
    benchmark?: boolean;
    runRecord?: boolean;
    grading?: boolean;
    scaffoldingOnly?: boolean;
  },
): string {
  const { scaffoldingOnly, benchmark, runRecord, grading, promoted } = opts;
  const iterationDir = join(workspaceRoot, skill, iteration);
  const conditionDir = join(iterationDir, "eval-e1", "with_skill");

  mkdirSync(iterationDir, { recursive: true });

  if (scaffoldingOnly) {
    writeFileSync(join(iterationDir, "dispatch.json"), "[]\n");
  }
  if (benchmark) {
    const aggregate = { delta: { pass_rate: 0.5 } };
    writeJson(join(iterationDir, "benchmark.json"), aggregate);
  }
  if (runRecord) {
    writeJson(join(conditionDir, "run.json"), { eval_id: "e1" });
  }
  if (grading) {
    writeJson(join(conditionDir, "grading.json"), {
      summary: { pass_rate: 1 },
    });
  }
  if (promoted) {
    const marker = {
      promoted_at: "2026-06-04T00:00:00.000Z",
      baseline_dir: "/somewhere/evals/baseline",
      commit: "abc1234",
    };
    writeJson(join(iterationDir, PROMOTED_MARKER), marker);
  }

  return iterationDir;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Create `<workspaceRoot>/<skill>/snapshots/<label>/` containing a `SKILL.md`.
 * When `source` is non-null a `SNAPSHOT_META` file is written as well; a
 * `"ref"` source additionally records `ref: "HEAD~1"`. Passing `null` leaves
 * the snapshot without any meta file.
 *
 * @returns the snapshot directory path
 */
function makeSnapshot(
  workspaceRoot: string,
  skill: string,
  label: string,
  source: "ref" | "working-tree" | null,
): string {
  const snapshotDir = join(workspaceRoot, skill, "snapshots", label);
  mkdirSync(snapshotDir, { recursive: true });
  writeFileSync(join(snapshotDir, "SKILL.md"), "snapshot body\n");

  if (source === null) return snapshotDir;

  const meta = source === "ref" ? { source, ref: "HEAD~1" } : { source };
  writeJson(join(snapshotDir, SNAPSHOT_META), meta);
  return snapshotDir;
}
|
|
99
|
+
|
|
100
|
+
describe("cleanupWorkspace — iterations", () => {
  const skill = "mr-review";

  test("removes a promoted iteration and prunes the emptied workspace", () => {
    const workspace = freshWorkspace();
    const iterationDir = makeIteration(workspace, skill, "iteration-1", {
      promoted: true,
      benchmark: true,
      grading: true,
    });

    const result = cleanupWorkspace(workspace, skill);

    expect(existsSync(iterationDir)).toBe(false);
    expect(result.removedIterations).toEqual(["iteration-1"]);
    expect(result.workspaceRemoved).toBe(true);
    // With nothing left inside, both the skill dir and the root are pruned.
    expect(existsSync(join(workspace, "mr-review"))).toBe(false);
    expect(existsSync(workspace)).toBe(false);
  });

  test("keeps an unpromoted iteration that holds a benchmark, and reports it", () => {
    const workspace = freshWorkspace();
    const iterationDir = makeIteration(workspace, skill, "iteration-1", {
      benchmark: true,
    });

    const result = cleanupWorkspace(workspace, skill);
    const keptNames = result.keptIterations.map(({ iteration }) => iteration);

    expect(existsSync(iterationDir)).toBe(true);
    expect(result.removedIterations).toEqual([]);
    expect(keptNames).toEqual(["iteration-1"]);
    // The iteration survived, so the workspace is not empty and must stay.
    expect(existsSync(workspace)).toBe(true);
  });

  test("keeps an unpromoted iteration that holds only a run record", () => {
    const workspace = freshWorkspace();
    const iterationDir = makeIteration(workspace, skill, "iteration-1", {
      runRecord: true,
    });

    const result = cleanupWorkspace(workspace, skill);
    const keptNames = result.keptIterations.map(({ iteration }) => iteration);

    expect(existsSync(iterationDir)).toBe(true);
    expect(keptNames).toEqual(["iteration-1"]);
  });

  test("removes an unpromoted scaffolding-only iteration (no captured results)", () => {
    const workspace = freshWorkspace();
    const iterationDir = makeIteration(workspace, skill, "iteration-1", {
      scaffoldingOnly: true,
    });

    const result = cleanupWorkspace(workspace, skill);

    expect(existsSync(iterationDir)).toBe(false);
    expect(result.removedIterations).toEqual(["iteration-1"]);
  });

  test("mixed: promoted removed, unpromoted-with-results kept, skill dir NOT pruned", () => {
    const workspace = freshWorkspace();
    const promotedDir = makeIteration(workspace, skill, "iteration-1", {
      promoted: true,
      benchmark: true,
    });
    const keptDir = makeIteration(workspace, skill, "iteration-2", {
      benchmark: true,
    });

    const result = cleanupWorkspace(workspace, skill);
    const keptNames = result.keptIterations.map(({ iteration }) => iteration);

    expect(existsSync(promotedDir)).toBe(false);
    expect(existsSync(keptDir)).toBe(true);
    expect(result.removedIterations).toEqual(["iteration-1"]);
    expect(keptNames).toEqual(["iteration-2"]);
    expect(result.workspaceRemoved).toBe(false);
    expect(existsSync(join(workspace, "mr-review"))).toBe(true);
  });
});
|
|
184
|
+
|
|
185
|
+
describe("cleanupWorkspace — snapshots", () => {
  test("removes ref snapshots, keeps working-tree and legacy (no-meta) snapshots", () => {
    const workspace = freshWorkspace();
    const skill = "mr-review";
    // One snapshot per provenance: git ref, working tree, and no meta file at all.
    const fromRef = makeSnapshot(workspace, skill, "old-ref", "ref");
    const fromWorkingTree = makeSnapshot(workspace, skill, "wt", "working-tree");
    const withoutMeta = makeSnapshot(workspace, skill, "legacy", null);

    const result = cleanupWorkspace(workspace, skill);

    expect(existsSync(fromRef)).toBe(false);
    expect(existsSync(fromWorkingTree)).toBe(true);
    expect(existsSync(withoutMeta)).toBe(true);
    expect(result.removedSnapshots).toEqual(["old-ref"]);
    // Directory listing order is not asserted, hence the sort.
    expect(result.keptSnapshots.sort()).toEqual(["legacy", "wt"]);
  });
});
|
|
201
|
+
|
|
202
|
+
describe("cleanupWorkspace — safety", () => {
  test("never touches another skill's workspace, and leaves the root intact", () => {
    const workspace = freshWorkspace();
    makeIteration(workspace, "mr-review", "iteration-1", { promoted: true });
    const neighbourIteration = makeIteration(
      workspace,
      "other-skill",
      "iteration-1",
      { benchmark: true },
    );

    cleanupWorkspace(workspace, "mr-review");

    expect(existsSync(join(workspace, "mr-review"))).toBe(false);
    expect(existsSync(neighbourIteration)).toBe(true);
    // other-skill still occupies the root, so the root must not be pruned.
    expect(existsSync(workspace)).toBe(true);
  });

  test("returns an empty summary and does not throw when the skill has no workspace", () => {
    const workspace = freshWorkspace();

    const result = cleanupWorkspace(workspace, "never-ran");

    const listFields = [
      "removedIterations",
      "keptIterations",
      "removedSnapshots",
      "keptSnapshots",
    ] as const;
    for (const field of listFields) {
      expect(result[field]).toEqual([]);
    }
    expect(result.workspaceRemoved).toBe(false);
  });
});
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import {
  existsSync,
  readdirSync,
  readFileSync,
  rmdirSync,
  rmSync,
} from "node:fs";
import { join } from "node:path";
|
|
3
|
+
|
|
4
|
+
/**
 * File name of the marker that `promote-baseline` leaves inside an iteration
 * dir after the iteration's durable results (benchmark + gradings) have been
 * committed under the skill's `evals/baseline/`. Teardown reads the marker's
 * presence as "safe to delete": the data is already in version control.
 */
export const PROMOTED_MARKER = ".promoted.json";

/**
 * File name of the provenance record the `snapshot` command writes into each
 * `snapshots/<label>/` dir. It states whether the snapshot was materialized
 * from a git ref (reproducible) or copied from the working tree (not
 * reproducible). Only ref snapshots are reclaimed by teardown.
 */
export const SNAPSHOT_META = ".snapshot-meta.json";

/** What a workspace cleanup removed and what it deliberately left in place. */
export type WorkspaceCleanupSummary = {
  /** Names of iteration dirs that were deleted (promoted, or pure scaffolding). */
  removedIterations: string[];
  /** Iterations left in place because they hold uncommitted results, plus why. */
  keptIterations: { iteration: string; reason: string }[];
  /** Labels of deleted snapshots (each reproducible from a git ref). */
  removedSnapshots: string[];
  /** Labels of preserved snapshots (working-tree or legacy; not regenerable). */
  keptSnapshots: string[];
  /** Whether the skill's entire workspace subtree ended up removed. */
  workspaceRemoved: boolean;
};
|
|
31
|
+
|
|
32
|
+
/**
 * Remove `dir` only if it exists and is empty.
 *
 * The removal uses non-recursive `rmdirSync`, which refuses to delete a
 * directory that has entries. If something is created inside `dir` between
 * the emptiness check and the removal, the directory is left in place rather
 * than being recursively deleted along with the new content.
 *
 * @param dir - Directory to prune.
 * @throws Any filesystem error other than the directory having become
 *   non-empty or having disappeared in the meantime.
 */
function pruneIfEmpty(dir: string): void {
  if (!existsSync(dir) || readdirSync(dir).length > 0) return;
  try {
    rmdirSync(dir);
  } catch (err) {
    const code = (err as { code?: string } | null)?.code;
    // ENOTEMPTY/EEXIST: content appeared after the check, so keep the dir.
    // ENOENT: someone else already removed it, so there is nothing to do.
    if (code !== "ENOTEMPTY" && code !== "EEXIST" && code !== "ENOENT") {
      throw err;
    }
  }
}
|
|
38
|
+
|
|
39
|
+
/**
 * Decide whether an iteration dir holds "captured results" worth preserving.
 *
 * It does when it contains a top-level `benchmark.json`, or when any
 * `eval-*` subdirectory has a condition subdirectory containing `run.json` or
 * `grading.json`. Anything short of that (e.g. a `--dry-run`, or a run staged
 * but never dispatched) counts as reproducible scaffolding.
 *
 * @param iterDir - Path of the iteration directory to inspect.
 * @returns `true` if any such result file is present, otherwise `false`.
 */
function iterationHasResults(iterDir: string): boolean {
  if (existsSync(join(iterDir, "benchmark.json"))) return true;

  const evalEntries = readdirSync(iterDir, { withFileTypes: true }).filter(
    (entry) => entry.isDirectory() && entry.name.startsWith("eval-"),
  );

  return evalEntries.some((evalEntry) => {
    const evalDir = join(iterDir, evalEntry.name);
    return readdirSync(evalDir, { withFileTypes: true }).some(
      (cond) =>
        cond.isDirectory() &&
        ["run.json", "grading.json"].some((file) =>
          existsSync(join(evalDir, cond.name, file)),
        ),
    );
  });
}
|
|
59
|
+
|
|
60
|
+
/**
 * Read the `source` field from a snapshot's `SNAPSHOT_META` file.
 *
 * The parsed JSON is treated as untrusted: the value is returned only when
 * the file holds an object whose `source` is a string. Every other case (no
 * meta file, unreadable or invalid JSON, a non-object payload, or a missing
 * or non-string `source`) yields `null`.
 *
 * @param snapDir - Path of the `snapshots/<label>/` directory.
 * @returns The recorded source string, or `null` when none can be determined.
 */
function snapshotSource(snapDir: string): string | null {
  const metaPath = join(snapDir, SNAPSHOT_META);
  if (!existsSync(metaPath)) return null;

  let meta: unknown;
  try {
    meta = JSON.parse(readFileSync(metaPath, "utf8"));
  } catch {
    // Unreadable or malformed meta is handled the same as having no meta.
    return null;
  }

  if (typeof meta !== "object" || meta === null) return null;
  const source = (meta as { source?: unknown }).source;
  return typeof source === "string" ? source : null;
}
|
|
72
|
+
|
|
73
|
+
/**
 * End-of-run cleanup of one skill's `skills-workspace/<skill>/` subtree. The
 * aim is that a finished eval leaves nothing behind that wasn't meant to be
 * committed, while never deleting results the user has not yet moved into
 * version control.
 *
 * Rules:
 * - Iteration with the promoted marker → removed.
 * - Unpromoted iteration holding captured results → kept and reported.
 * - Unpromoted iteration with only scaffolding → removed.
 * - Snapshot whose meta says `"ref"` → removed; any other snapshot → kept.
 * - `snapshots/`, the skill dir and the workspace root are pruned only when
 *   empty; a non-empty one (e.g. holding another skill's artifacts) is left
 *   alone.
 *
 * @param workspaceRoot - Path of the shared `skills-workspace` directory.
 * @param skillName - Name of the skill whose subtree is cleaned.
 * @returns A summary of what was removed and what was kept. All lists are
 *   empty when the skill has no workspace directory.
 */
export function cleanupWorkspace(
  workspaceRoot: string,
  skillName: string,
): WorkspaceCleanupSummary {
  const summary: WorkspaceCleanupSummary = {
    removedIterations: [],
    keptIterations: [],
    removedSnapshots: [],
    keptSnapshots: [],
    workspaceRemoved: false,
  };

  const skillDir = join(workspaceRoot, skillName);
  if (!existsSync(skillDir)) return summary;

  const iterationEntries = readdirSync(skillDir, { withFileTypes: true }).filter(
    (entry) => entry.isDirectory() && entry.name.startsWith("iteration-"),
  );
  for (const { name } of iterationEntries) {
    const iterDir = join(skillDir, name);
    // Promoted iterations are always disposable; unpromoted ones only when
    // they captured no results.
    const disposable =
      existsSync(join(iterDir, PROMOTED_MARKER)) ||
      !iterationHasResults(iterDir);
    if (disposable) {
      rmSync(iterDir, { recursive: true, force: true });
      summary.removedIterations.push(name);
      continue;
    }
    summary.keptIterations.push({
      iteration: name,
      reason: "uncommitted results — not promoted to evals/baseline/",
    });
  }

  const snapshotsDir = join(skillDir, "snapshots");
  if (existsSync(snapshotsDir)) {
    const snapshotEntries = readdirSync(snapshotsDir, {
      withFileTypes: true,
    }).filter((entry) => entry.isDirectory());
    for (const { name } of snapshotEntries) {
      const snapDir = join(snapshotsDir, name);
      if (snapshotSource(snapDir) !== "ref") {
        summary.keptSnapshots.push(name);
        continue;
      }
      rmSync(snapDir, { recursive: true, force: true });
      summary.removedSnapshots.push(name);
    }
    pruneIfEmpty(snapshotsDir);
  }

  // Prune from the inside out, recording whether the skill dir itself went.
  pruneIfEmpty(skillDir);
  summary.workspaceRemoved = !existsSync(skillDir);
  pruneIfEmpty(workspaceRoot);

  return summary;
}
|
|
@@ -70,11 +70,11 @@
|
|
|
70
70
|
},
|
|
71
71
|
"total_tokens": {
|
|
72
72
|
"type": ["integer", "null"],
|
|
73
|
-
"description": "From the harness's task completion event. May be null if
|
|
73
|
+
"description": "From the harness's task completion event, or derived from the persisted transcript by record-runs (usage summed across unique message ids, including cache creation/read tokens — a different accounting than the completion event). Canonical timing lives in the sibling timing.json, whose `source` field records which origin produced it. May be null if neither source is available."
|
|
74
74
|
},
|
|
75
75
|
"duration_ms": {
|
|
76
76
|
"type": ["integer", "null"],
|
|
77
|
-
"description": "From the harness's task completion event. May be null if
|
|
77
|
+
"description": "From the harness's task completion event, or derived from the persisted transcript by record-runs (wall clock between the first and last transcript timestamps). Canonical timing lives in the sibling timing.json. May be null if neither source is available."
|
|
78
78
|
}
|
|
79
79
|
}
|
|
80
80
|
}
|
|
@@ -11,11 +11,12 @@
|
|
|
11
11
|
"iteration": { "type": "integer" },
|
|
12
12
|
"totals": {
|
|
13
13
|
"type": "object",
|
|
14
|
-
"required": ["violations", "warnings"],
|
|
14
|
+
"required": ["violations", "warnings", "live_source_reads"],
|
|
15
15
|
"additionalProperties": false,
|
|
16
16
|
"properties": {
|
|
17
17
|
"violations": { "type": "integer" },
|
|
18
|
-
"warnings": { "type": "integer" }
|
|
18
|
+
"warnings": { "type": "integer" },
|
|
19
|
+
"live_source_reads": { "type": "integer" }
|
|
19
20
|
}
|
|
20
21
|
},
|
|
21
22
|
"runs": {
|
|
@@ -23,7 +24,13 @@
|
|
|
23
24
|
"description": "One entry per (eval, condition) run that had at least one finding.",
|
|
24
25
|
"items": {
|
|
25
26
|
"type": "object",
|
|
26
|
-
"required": [
|
|
27
|
+
"required": [
|
|
28
|
+
"eval_id",
|
|
29
|
+
"condition",
|
|
30
|
+
"violations",
|
|
31
|
+
"warnings",
|
|
32
|
+
"live_source_reads"
|
|
33
|
+
],
|
|
27
34
|
"additionalProperties": false,
|
|
28
35
|
"properties": {
|
|
29
36
|
"eval_id": { "type": "string" },
|
|
@@ -37,6 +44,11 @@
|
|
|
37
44
|
"type": "array",
|
|
38
45
|
"description": "Heuristic: a Bash command matched a mutating pattern (install, git, sed -i, redirection) without referencing the outputs dir.",
|
|
39
46
|
"items": { "$ref": "#/definitions/finding" }
|
|
47
|
+
},
|
|
48
|
+
"live_source_reads": {
|
|
49
|
+
"type": "array",
|
|
50
|
+
"description": "A read tool or Bash command accessed the live skill-under-test directory instead of the staged copy — the arm may be contaminated (staged-slug resolution race).",
|
|
51
|
+
"items": { "$ref": "#/definitions/finding" }
|
|
40
52
|
}
|
|
41
53
|
}
|
|
42
54
|
}
|
|
@@ -61,7 +61,9 @@ User request:
|
|
|
61
61
|
|
|
62
62
|
## After the subagent completes
|
|
63
63
|
|
|
64
|
-
|
|
64
|
+
Two records must exist per run: `{{output_dir}}/../run.json` (matching `schema/run-record.schema.json`) and `{{output_dir}}/../timing.json`.
|
|
65
65
|
|
|
66
|
-
|
|
67
|
-
|
|
66
|
+
- **Harnesses with persisted transcripts (Claude Code):** `record-runs` assembles both from disk after all dispatches — carry-over fields from `dispatch.json`, `final_message` from `{{output_dir}}/final-message.md`, `tool_invocations`/tokens/duration from the transcript. The operator captures nothing per-task. Optionally, completion-event timing written to `timing.json` at dispatch time (with `"source": "completion-event"`) takes precedence — `record-runs` only backfills, never overwrites.
|
|
67
|
+
- **Transcript-less harnesses:** the operator (or the runner) captures manually, as before:
|
|
68
|
+
1. The full transcript / tool invocations → convert via the harness adapter into `{{output_dir}}/../run.json`.
|
|
69
|
+
2. `total_tokens` and `duration_ms` from the harness's task completion event → `{{output_dir}}/../timing.json`. **These values may not be persisted anywhere else — save them immediately.**
|
|
@@ -60,7 +60,7 @@ Roughly in increasing order of effort / payoff:
|
|
|
60
60
|
class of eval measurable. This is the high-value framework improvement.
|
|
61
61
|
3. **Real harness-mode injection.** Reproduce the plan-mode suppression by running
|
|
62
62
|
the eval subagent *inside* a real plan mode rather than a described one. Tracked
|
|
63
|
-
as a parity goal in `harness-parity
|
|
63
|
+
as a parity goal in `skills/evaluating-skills/harness-parity.md`; the biggest lift.
|
|
64
64
|
|
|
65
65
|
## Bigger-picture testing strategy (from the maintainer)
|
|
66
66
|
|
|
@@ -38,13 +38,24 @@ Before claiming any task is finished, making a success claim, or declaring a bug
|
|
|
38
38
|
|
|
39
39
|
---
|
|
40
40
|
|
|
41
|
-
## Finishing: Review
|
|
41
|
+
## Finishing: Review Code, Verify, Then Review Comments
|
|
42
42
|
|
|
43
|
-
The Gate Function above is your discipline at *every* completion claim. When you believe the work itself is done, run
|
|
43
|
+
The Gate Function above is your discipline at *every* completion claim. When you believe the work itself is done, run these three finishing phases **in order**. The order is deliberate: every code change happens in phase 1, *before* the verification, so the evidence you hand back is guaranteed to cover the exact code being returned — and comment cleanup comes *after*, where it can't disturb that check.
|
|
44
44
|
|
|
45
|
-
1. **Review
|
|
46
|
-
2. **
|
|
47
|
-
3. **
|
|
45
|
+
1. **Review and fix the code** — follow [`code-review.md`](code-review.md). This is the only phase that changes behavior. Review catches what running can't — silent regressions, missed edge cases, leftover debug code, reuse or simplification — then you fix or flag each finding, and *the code is now frozen*. Size the review to the change: a quick check, not a second project. (Comments are **not** reviewed here — they get phase 3.)
|
|
46
|
+
2. **Run the final verification** — apply the Gate Function fresh to the now-frozen code and present *that* output as your evidence. Because all code changes happened in phase 1, this check covers exactly what the user gets.
|
|
47
|
+
3. **Review and clean the comments** — follow [`comment-review.md`](comment-review.md). This pass touches *only* comments, so it changes no behavior and needs **no re-verification**: delete narrative / step-by-step / ticket comments, keeping only true Explanation or exported-member Documentation, before the diff reaches a human.
|
|
48
|
+
|
|
49
|
+
**Copy this checklist into your task tracker the moment you start finishing, and tick each box in order.** The ordering *is* the discipline — and an untracked checklist is one whose middle steps get skipped under momentum:
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
- [ ] Phase 1 — reviewed the CODE against intent, ranked findings, fixed/flagged each (per code-review.md); code is now frozen
|
|
53
|
+
- [ ] Phase 2 — ran the final verification fresh on the frozen code, and presented that output as evidence
|
|
54
|
+
- [ ] Phase 3 — reviewed the COMMENTS (per comment-review.md): deleted narrative / step-by-step / ticket comments, kept only true Explanation or exported Documentation
|
|
55
|
+
- [ ] Surfaced integration options (merge / push+PR / leave as-is / discard) — did not merge or push on my own
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
The last box is its own gate; the section below is why it's never yours to skip.
|
|
48
59
|
|
|
49
60
|
---
|
|
50
61
|
|
|
@@ -69,7 +80,7 @@ Verified, reviewed work is still *your* checkpoint, not a decision to merge. Int
|
|
|
69
80
|
| "It's obvious this is correct" | Obvious bugs are the most embarrassing. Reading code predicts behavior; only running it proves behavior. |
|
|
70
81
|
| "I'll verify after committing" | Verification after the claim is too late. |
|
|
71
82
|
| "The build should be fine" | "Should" is not evidence. |
|
|
72
|
-
| "Tests pass, so we're done here" | Verification is one
|
|
83
|
+
| "Tests pass, so we're done here" | Verification is one phase of finishing, not the whole sequence — review and fix the code, verify the frozen result, then clean the comments. |
|
|
73
84
|
| "The user said ship it, so I'll just merge" | "Ship it" authorizes the user's choice, not a unilateral merge or push. |
|
|
74
85
|
|
|
75
86
|
---
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# Reviewing the Code
|
|
2
|
+
|
|
3
|
+
This is **phase 1** of the finishing sequence in [`SKILL.md`](SKILL.md) — the
|
|
4
|
+
code review. Review and fix the *code* here. This is the only phase that changes
|
|
5
|
+
behavior, so once you finish it the code is frozen.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Size the review to the change
|
|
10
|
+
|
|
11
|
+
Review depth matches the size and risk of the diff. A one-line fix gets a careful
|
|
12
|
+
read and a moment's thought about what it could break; a new subsystem gets more.
|
|
13
|
+
Don't run a heavyweight audit over a trivial change to look thorough — a review
|
|
14
|
+
that's louder than the change it covers is the failure this guidance exists to
|
|
15
|
+
prevent.
|
|
16
|
+
|
|
17
|
+
Do the review however your harness makes natural — read the diff inline, or
|
|
18
|
+
dispatch it to a general-purpose subagent.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Read the diff against intent
|
|
23
|
+
|
|
24
|
+
Read the actual diff — not your memory of what you changed — against the plan or
|
|
25
|
+
the request. Cite findings by `file:line` so each one is checkable. Look for:
|
|
26
|
+
|
|
27
|
+
- **Intent alignment** — does the change do what was asked? Are deviations
|
|
28
|
+
deliberate improvements, or drift?
|
|
29
|
+
- **Correctness** — bugs, off-by-ones, wrong conditions, mishandled `null`/empty.
|
|
30
|
+
- **Error & edge cases** — failure paths, boundaries, and inputs the happy path skips.
|
|
31
|
+
- **Reuse & simplification** — existing helpers ignored, needless abstraction,
|
|
32
|
+
code that could be plainer.
|
|
33
|
+
- **Leftover scaffolding** — debug prints, commented-out code, dead branches,
|
|
34
|
+
silent regressions to nearby behavior.
|
|
35
|
+
- **Tests** — do they exercise real behavior, and do they cover what changed?
|
|
36
|
+
|
|
37
|
+
This is not an exhaustive checklist to march through — it's where real problems
|
|
38
|
+
tend to hide. Spend attention where this particular diff warrants it.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Rank, then return only the top findings
|
|
43
|
+
|
|
44
|
+
Sort what you found by severity and report only the few that matter. The point of
|
|
45
|
+
ranking is to *drop* noise, not to pad a list.
|
|
46
|
+
|
|
47
|
+
| Severity | What belongs here |
|
|
48
|
+
|----------|-------------------|
|
|
49
|
+
| **Critical — must fix** | Bugs, security holes, data loss, broken functionality. |
|
|
50
|
+
| **Important — should fix** | Missing behavior, weak error handling, test gaps, architecture problems. |
|
|
51
|
+
| **Minor — nice to have** | Style, micro-optimizations, polish. |
|
|
52
|
+
|
|
53
|
+
Report the most important handful. **Drop Minor nitpicks unless nothing more
|
|
54
|
+
serious exists** — a pile of trivia buries the one finding that mattered and
|
|
55
|
+
trains the reader to skim past your review. Don't manufacture findings to fill the
|
|
56
|
+
tiers; "nothing critical, one important thing" is a complete and good result.
|
|
57
|
+
Close with a one-line verdict.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Then: address the findings — and freeze the code
|
|
62
|
+
|
|
63
|
+
Fix or explicitly flag each code finding you kept. Any fix changes the code — so
|
|
64
|
+
make all of those changes *now*, in this phase. When you're done, the code is
|
|
65
|
+
**frozen**: nothing in the remaining phases touches behavior. Return to the
|
|
66
|
+
finishing sequence in [`SKILL.md`](SKILL.md) and run the **final verification**
|
|
67
|
+
(phase 2) on this frozen result — the check you hand back is then guaranteed to
|
|
68
|
+
cover the exact code being returned.
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Reviewing the Comments
|
|
2
|
+
|
|
3
|
+
This is **phase 3** — the last step of the finishing sequence in [`SKILL.md`](SKILL.md).
|
|
4
|
+
By now the code has been reviewed (phase 1) and verified (phase 2). The code is frozen;
|
|
5
|
+
**this pass touches only comments.** That is the whole reason it comes last: a
|
|
6
|
+
comment edit can't change behavior, so it can't invalidate the verification you
|
|
7
|
+
just ran — there is nothing here to re-test. Do it as the final polish before the
|
|
8
|
+
handoff.
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## The comment-hygiene pass
|
|
13
|
+
|
|
14
|
+
Review **every comment in the changed code** with one goal: **delete as many as
|
|
15
|
+
possible.**
|
|
16
|
+
|
|
17
|
+
This runs against your own instinct. Writing a comment feels like preserving the
|
|
18
|
+
narrative — why this approach, what was tried, which ticket it traces to. But a
|
|
19
|
+
human reading code finds it *very hard* to skip a comment; every one they hit,
|
|
20
|
+
they stop and read. Narrative comments tax every future reader to record a story
|
|
21
|
+
that belongs in the commit message or the PR, not the source. Left in, they
|
|
22
|
+
become the thing the user has to delete by hand before merging — so delete them
|
|
23
|
+
now, on their behalf.
|
|
24
|
+
|
|
25
|
+
A comment survives only if it fits one of two categories **and** meets its bar:
|
|
26
|
+
|
|
27
|
+
1. **Explanation.** Code that is genuinely hard to follow from reading it — a
|
|
28
|
+
subtle algorithm, a deliberate break from the usual pattern, a non-obvious
|
|
29
|
+
constraint. The comment fills the gap with an *evergreen* reason (true a year
|
|
30
|
+
from now, not "fixes the bug from Tuesday"). These are **extremely rare**:
|
|
31
|
+
well-written code is self-commenting, and a reader fluent in code can follow
|
|
32
|
+
even sophisticated paths when the code itself is clear. If the right fix is to
|
|
33
|
+
make the code clearer, do that instead of explaining unclear code.
|
|
34
|
+
2. **Documentation.** A concise doc-style comment (JSDoc and equivalents) on an
|
|
35
|
+
**exported** member, where the text is surfaced by doc generators and editor
|
|
36
|
+
hints to readers who *don't* have the source in front of them. These almost
|
|
37
|
+
always earn their place. Keep them concise and evergreen, matching the
|
|
38
|
+
surrounding style; they may describe usage more freely since that's their job.
|
|
39
|
+
|
|
40
|
+
**Everything else gets deleted — about 99.9% of the time.** The most common
|
|
41
|
+
offender, and the one that feels most defensible, is **step-by-step narration**
|
|
42
|
+
that walks through what the code already says — `// Step 1: lowercase`,
|
|
43
|
+
`// now strip the accents`, `// finally, trim the dashes`. It reads as helpful
|
|
44
|
+
structure, and *that feeling is the trap*: the numbered steps restate control
|
|
45
|
+
flow the reader can already see in the code, so most such comments carry no
|
|
46
|
+
information the line below them doesn't — they only add something else to read.
|
|
47
|
+
"The steps make it easier to follow" is the rationalization to delete *through*,
|
|
48
|
+
not act on; the code is the structure. Strip the narration and nothing is lost.
|
|
49
|
+
The same goes for prose narrative ("first we… then we…"), time-sensitive comments
|
|
50
|
+
(ticket numbers, "the previous solution…", "changed this because…"), and any
|
|
51
|
+
comment that merely restates its line. A comment that fits neither surviving
|
|
52
|
+
category, or fits one but misses its bar, is noise. **When in doubt, delete it.**
|
|
53
|
+
A truly unique case might warrant a truly unusual comment — but treat that as the
|
|
54
|
+
rare exception it is, not the default.
|
|
55
|
+
|
|
56
|
+
```ts
|
|
57
|
+
// BEFORE — every comment restates the line under it
|
|
58
|
+
// Step 1: lowercase the title
|
|
59
|
+
const lower = title.toLowerCase();
|
|
60
|
+
// Step 2: replace whitespace runs with a single hyphen
|
|
61
|
+
const hyphenated = lower.replace(/\s+/g, "-");
|
|
62
|
+
|
|
63
|
+
// AFTER — the code already says all of that
|
|
64
|
+
const lower = title.toLowerCase();
|
|
65
|
+
const hyphenated = lower.replace(/\s+/g, "-");
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**A kernel of value doesn't save the comment around it.** The hardest case is
|
|
69
|
+
the *mixed* comment — mostly narration, with one genuinely useful clause buried
|
|
70
|
+
in it (a real constraint, a non-obvious *why*). Keeping the whole block "because
|
|
71
|
+
part of it is useful" is exactly how noise survives review: a reviewer will keep a
|
|
72
|
+
comment that's 90% restatement for the sake of the 10% that matters. Don't.
|
|
73
|
+
**Extract the useful part, delete the rest, and if what remains earns a comment,
|
|
74
|
+
write it as a tight standalone one** — the kernel alone, not the narration that
|
|
75
|
+
carried it. A four-line "Step 1… / Step 2 *(the one real reason)* / Step 3… /
|
|
76
|
+
Step 4…" block collapses to a single comment stating that one reason, and the
|
|
77
|
+
numbered narration is gone.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Then: hand it back
|
|
82
|
+
|
|
83
|
+
These were comment-only edits — they change no behavior, so there is **nothing to
|
|
84
|
+
re-verify**: the verification from phase 2 still covers the code being returned.
|
|
85
|
+
Return to the finishing sequence in [`SKILL.md`](SKILL.md) for the handoff.
|