@slowdini/slow-powers-opencode 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -8
- package/package.json +5 -1
- package/skills/evaluating-skills/SKILL.md +19 -17
- package/skills/evaluating-skills/harness-details/claude.md +51 -15
- package/skills/evaluating-skills/harness-parity.md +155 -0
- package/skills/evaluating-skills/runner/README.md +28 -19
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
- package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
- package/skills/evaluating-skills/runner/aggregate.ts +21 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
- package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
- package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
- package/skills/evaluating-skills/runner/record-runs.ts +209 -0
- package/skills/evaluating-skills/runner/run.test.ts +523 -0
- package/skills/evaluating-skills/runner/run.ts +376 -17
- package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
- package/skills/evaluating-skills/runner/types.ts +9 -0
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
- package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
- package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
- package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
- package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
- package/skills/verifying-development-work/SKILL.md +17 -6
- package/skills/verifying-development-work/code-review.md +68 -0
- package/skills/verifying-development-work/comment-review.md +85 -0
- package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
- package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
- package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/evals.json +34 -2
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
|
@@ -31,6 +31,7 @@ import type {
|
|
|
31
31
|
EvalsConfig,
|
|
32
32
|
} from "./types";
|
|
33
33
|
import { validateEvalsConfig } from "./validate";
|
|
34
|
+
import { cleanupWorkspace, SNAPSHOT_META } from "./workspace-teardown";
|
|
34
35
|
|
|
35
36
|
export const STAGED_SKILL_PREFIX = "slow-powers-eval-";
|
|
36
37
|
export const STAGED_SIBLING_MANIFEST = ".slow-powers-eval-manifest.json";
|
|
@@ -41,6 +42,16 @@ export function stageSkillForCC(opts: {
|
|
|
41
42
|
condition: string;
|
|
42
43
|
skillName: string;
|
|
43
44
|
repoRoot: string;
|
|
45
|
+
/**
|
|
46
|
+
* Source skill directory whose sibling assets are copied alongside the staged
|
|
47
|
+
* SKILL.md — everything next to SKILL.md except SKILL.md itself, the `evals/`
|
|
48
|
+
* dir, and the snapshot bookkeeping file. A multi-file skill whose SKILL.md
|
|
49
|
+
* links a sibling (e.g. `[code-review.md](code-review.md)`) would otherwise be
|
|
50
|
+
* staged with a dangling link: the agent can't resolve the reference relative
|
|
51
|
+
* to the staged dir, so the linked guidance is silently unreachable. Mirrors
|
|
52
|
+
* the sibling-asset copy in `snapshot`. Omit to stage SKILL.md alone.
|
|
53
|
+
*/
|
|
54
|
+
assetsDir?: string;
|
|
44
55
|
/**
|
|
45
56
|
* When set, stage under this verbatim identifier instead of the conspicuous
|
|
46
57
|
* `slow-powers-eval-…` slug. Used by `--stage-name` to A/B a natural name
|
|
@@ -56,6 +67,16 @@ export function stageSkillForCC(opts: {
|
|
|
56
67
|
const skillDir = join(opts.repoRoot, ".claude", "skills", slug);
|
|
57
68
|
mkdirSync(skillDir, { recursive: true });
|
|
58
69
|
writeFileSync(join(skillDir, "SKILL.md"), opts.content);
|
|
70
|
+
if (opts.assetsDir !== undefined && existsSync(opts.assetsDir)) {
|
|
71
|
+
for (const entry of readdirSync(opts.assetsDir)) {
|
|
72
|
+
if (entry === "SKILL.md" || entry === "evals" || entry === SNAPSHOT_META)
|
|
73
|
+
continue;
|
|
74
|
+
const src = join(opts.assetsDir, entry);
|
|
75
|
+
const dst = join(skillDir, entry);
|
|
76
|
+
if (statSync(src).isDirectory()) cpSync(src, dst, { recursive: true });
|
|
77
|
+
else cpSync(src, dst);
|
|
78
|
+
}
|
|
79
|
+
}
|
|
59
80
|
return slug;
|
|
60
81
|
}
|
|
61
82
|
|
|
@@ -78,6 +99,7 @@ export function registerStagedSkillForCleanup(
|
|
|
78
99
|
manifest = {
|
|
79
100
|
created_at: new Date().toISOString(),
|
|
80
101
|
staged_under_test: name,
|
|
102
|
+
skills_dir_preexisting: true,
|
|
81
103
|
created_entries: [],
|
|
82
104
|
};
|
|
83
105
|
}
|
|
@@ -89,6 +111,14 @@ export function registerStagedSkillForCleanup(
|
|
|
89
111
|
type SiblingManifest = {
|
|
90
112
|
created_at: string;
|
|
91
113
|
staged_under_test: string;
|
|
114
|
+
/**
|
|
115
|
+
* Whether `.claude/skills` already existed when staging began. When false the
|
|
116
|
+
* runner created it, so {@link cleanupStagedSkills} may remove the whole tree
|
|
117
|
+
* (and prune an emptied `.claude`); when true (or absent, on older manifests)
|
|
118
|
+
* cleanup falls back to the surgical per-entry restore so a user's own
|
|
119
|
+
* project skills are left intact.
|
|
120
|
+
*/
|
|
121
|
+
skills_dir_preexisting?: boolean;
|
|
92
122
|
created_entries: Array<{
|
|
93
123
|
name: string;
|
|
94
124
|
preexisting: boolean;
|
|
@@ -102,6 +132,7 @@ export function stageSiblingSkills(opts: {
|
|
|
102
132
|
repoRoot: string;
|
|
103
133
|
}): SiblingManifest {
|
|
104
134
|
const skillsDir = join(opts.repoRoot, ".claude", "skills");
|
|
135
|
+
const skillsDirPreexisting = existsSync(skillsDir);
|
|
105
136
|
mkdirSync(skillsDir, { recursive: true });
|
|
106
137
|
|
|
107
138
|
const siblings = readdirSync(opts.skillsSourceDir).filter((name) => {
|
|
@@ -114,6 +145,7 @@ export function stageSiblingSkills(opts: {
|
|
|
114
145
|
const manifest: SiblingManifest = {
|
|
115
146
|
created_at: new Date().toISOString(),
|
|
116
147
|
staged_under_test: opts.skillUnderTest,
|
|
148
|
+
skills_dir_preexisting: skillsDirPreexisting,
|
|
117
149
|
created_entries: [],
|
|
118
150
|
};
|
|
119
151
|
|
|
@@ -153,8 +185,18 @@ export function stageSiblingSkills(opts: {
|
|
|
153
185
|
return manifest;
|
|
154
186
|
}
|
|
155
187
|
|
|
188
|
+
/**
 * Deletes `dir` when it is present and contains no entries; otherwise a no-op.
 * Lets cleanup drop a `.claude` directory the runner emptied while never
 * touching one that still holds the user's own files (e.g. `settings.json`).
 */
function pruneIfEmpty(dir: string): void {
  if (!existsSync(dir)) return;
  const remaining = readdirSync(dir);
  if (remaining.length !== 0) return;
  rmSync(dir, { recursive: true, force: true });
}
|
|
196
|
+
|
|
156
197
|
export function cleanupStagedSkills(repoRoot: string): void {
|
|
157
|
-
const
|
|
198
|
+
const claudeDir = join(repoRoot, ".claude");
|
|
199
|
+
const skillsDir = join(claudeDir, "skills");
|
|
158
200
|
if (!existsSync(skillsDir)) return;
|
|
159
201
|
|
|
160
202
|
for (const entry of readdirSync(skillsDir)) {
|
|
@@ -171,6 +213,18 @@ export function cleanupStagedSkills(repoRoot: string): void {
|
|
|
171
213
|
rmSync(manifestPath, { force: true });
|
|
172
214
|
return;
|
|
173
215
|
}
|
|
216
|
+
|
|
217
|
+
// The runner created `.claude/skills` this run, so it can't be holding any of
|
|
218
|
+
// the user's own skills — remove the whole staged tree (including any stray,
|
|
219
|
+
// non-prefixed dirs a recursive eval left behind), then prune an emptied
|
|
220
|
+
// `.claude`. In a real project `.claude/settings.json` keeps `.claude`
|
|
221
|
+
// non-empty, so only the scaffolding we created is removed.
|
|
222
|
+
if (manifest.skills_dir_preexisting === false) {
|
|
223
|
+
rmSync(skillsDir, { recursive: true, force: true });
|
|
224
|
+
pruneIfEmpty(claudeDir);
|
|
225
|
+
return;
|
|
226
|
+
}
|
|
227
|
+
|
|
174
228
|
for (const e of manifest.created_entries) {
|
|
175
229
|
const target = join(skillsDir, e.name);
|
|
176
230
|
rmSync(target, { recursive: true, force: true });
|
|
@@ -185,7 +239,13 @@ export function cleanupStagedSkills(repoRoot: string): void {
|
|
|
185
239
|
type Mode = "new-skill" | "revision";
|
|
186
240
|
|
|
187
241
|
type Args = {
|
|
188
|
-
command:
|
|
242
|
+
command:
|
|
243
|
+
| "run"
|
|
244
|
+
| "snapshot"
|
|
245
|
+
| "teardown-guard"
|
|
246
|
+
| "teardown"
|
|
247
|
+
| "ingest"
|
|
248
|
+
| "finalize";
|
|
189
249
|
mode?: Mode;
|
|
190
250
|
baseline?: string;
|
|
191
251
|
label?: string;
|
|
@@ -197,6 +257,8 @@ type Args = {
|
|
|
197
257
|
guard: boolean;
|
|
198
258
|
stageName?: string;
|
|
199
259
|
planMode: boolean;
|
|
260
|
+
ref?: string;
|
|
261
|
+
subagentsDir?: string;
|
|
200
262
|
};
|
|
201
263
|
|
|
202
264
|
function die(msg: string): never {
|
|
@@ -204,14 +266,59 @@ function die(msg: string): never {
|
|
|
204
266
|
process.exit(1);
|
|
205
267
|
}
|
|
206
268
|
|
|
269
|
+
/**
 * Fetches the raw contents of `<ref>:./<relPath>` with `git show`. Git is
 * spawned in `cwd`, and the `./` prefix makes it resolve `relPath` against
 * that directory.
 *
 * Git is invoked directly with an argv array (no shell), so neither the ref
 * nor the path is interpolated into a shell string. Follows the same
 * `Bun.spawnSync` git pattern as `promote-baseline.ts:gitHead`.
 *
 * @param cwd Directory git runs in.
 * @param ref Revision to read the object from.
 * @param relPath File path relative to `cwd`.
 * @returns Git's stdout as a Buffer — write it out as-is, never via
 *   `.toString()`, so binary assets round-trip intact — or `null` when git
 *   exits non-zero (the object doesn't exist at that ref).
 */
function gitShowBytes(
  cwd: string,
  ref: string,
  relPath: string,
): Buffer | null {
  const objectSpec = `${ref}:./${relPath}`;
  const { exitCode, stdout } = Bun.spawnSync(["git", "show", objectSpec], {
    cwd,
    stdout: "pipe",
    stderr: "pipe",
  });
  return exitCode === 0 ? Buffer.from(stdout) : null;
}
|
|
290
|
+
|
|
291
|
+
/**
 * Lists every file under `cwd` as it existed at `<ref>`, as paths relative to
 * `cwd` (git's default ls-tree output strips the cwd prefix).
 *
 * Runs `git ls-tree` with `-z` so entries are NUL-terminated and emitted
 * verbatim. Without it git C-quotes any path containing non-ASCII or special
 * characters (e.g. `"caf\303\251.md"`), and a quoted name can't be fed back
 * to `git show <ref>:./<path>` — the file would be silently skipped. For the
 * same reason names are not trimmed: leading/trailing whitespace is part of
 * the path.
 *
 * @param cwd Directory git runs in; must be inside a repository.
 * @param ref Revision whose tree is listed.
 * @returns Relative file paths, in git's output order.
 *
 * `die`s with git's stderr on failure — a bad ref or a cwd outside any repo
 * surfaces here.
 */
function gitLsFiles(cwd: string, ref: string): string[] {
  const res = Bun.spawnSync(
    ["git", "ls-tree", "-r", "--name-only", "-z", ref, "."],
    {
      cwd,
      stdout: "pipe",
      stderr: "pipe",
    },
  );
  if (res.exitCode !== 0)
    die(`git ls-tree failed for ref ${ref}: ${res.stderr.toString().trim()}`);
  return res.stdout
    .toString()
    .split("\0")
    .filter((path) => path.length > 0);
}
|
|
310
|
+
|
|
207
311
|
function parseArgs(argv: string[]): Args {
|
|
208
312
|
const positionals = argv.filter((a) => !a.startsWith("--"));
|
|
313
|
+
const COMMANDS: Args["command"][] = [
|
|
314
|
+
"snapshot",
|
|
315
|
+
"teardown-guard",
|
|
316
|
+
"teardown",
|
|
317
|
+
"ingest",
|
|
318
|
+
"finalize",
|
|
319
|
+
];
|
|
209
320
|
const command: Args["command"] =
|
|
210
|
-
positionals[0]
|
|
211
|
-
? "snapshot"
|
|
212
|
-
: positionals[0] === "teardown-guard"
|
|
213
|
-
? "teardown-guard"
|
|
214
|
-
: "run";
|
|
321
|
+
COMMANDS.find((c) => c === positionals[0]) ?? "run";
|
|
215
322
|
|
|
216
323
|
const flag = (name: string): string | undefined => {
|
|
217
324
|
const i = argv.indexOf(`--${name}`);
|
|
@@ -252,6 +359,8 @@ function parseArgs(argv: string[]): Args {
|
|
|
252
359
|
guard: has("guard"),
|
|
253
360
|
stageName: flag("stage-name"),
|
|
254
361
|
planMode: has("plan-mode"),
|
|
362
|
+
ref: flag("ref"),
|
|
363
|
+
subagentsDir: flag("subagents-dir"),
|
|
255
364
|
};
|
|
256
365
|
}
|
|
257
366
|
|
|
@@ -289,8 +398,6 @@ function conditionNamesFor(mode: Mode): [string, string] {
|
|
|
289
398
|
function commandSnapshot(args: Args, ctx: RunContext): void {
|
|
290
399
|
if (!args.label) die("snapshot requires --label <name>");
|
|
291
400
|
const skillDir = ctx.skillSubdir;
|
|
292
|
-
const skillMd = join(skillDir, "SKILL.md");
|
|
293
|
-
if (!existsSync(skillMd)) die(`skill not found: ${skillMd}`);
|
|
294
401
|
|
|
295
402
|
const destDir = join(
|
|
296
403
|
ctx.workspaceRoot,
|
|
@@ -303,6 +410,14 @@ function commandSnapshot(args: Args, ctx: RunContext): void {
|
|
|
303
410
|
`snapshot already exists: ${destDir}\n` +
|
|
304
411
|
" Use a different --label or delete the existing snapshot first.",
|
|
305
412
|
);
|
|
413
|
+
|
|
414
|
+
if (args.ref !== undefined) {
|
|
415
|
+
snapshotFromRef(args.ref, skillDir, destDir, ctx.skillName);
|
|
416
|
+
return;
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
const skillMd = join(skillDir, "SKILL.md");
|
|
420
|
+
if (!existsSync(skillMd)) die(`skill not found: ${skillMd}`);
|
|
306
421
|
ensureDir(destDir);
|
|
307
422
|
|
|
308
423
|
cpSync(skillMd, join(destDir, "SKILL.md"));
|
|
@@ -314,9 +429,53 @@ function commandSnapshot(args: Args, ctx: RunContext): void {
|
|
|
314
429
|
else cpSync(src, dst);
|
|
315
430
|
}
|
|
316
431
|
|
|
432
|
+
// Record provenance so teardown keeps this (working-tree) snapshot — unlike a
|
|
433
|
+
// ref snapshot, it can't be regenerated from git.
|
|
434
|
+
writeJson(join(destDir, SNAPSHOT_META), { source: "working-tree" });
|
|
435
|
+
|
|
317
436
|
console.log(`Snapshotted ${ctx.skillName} → ${destDir}`);
|
|
318
437
|
}
|
|
319
438
|
|
|
439
|
+
/**
 * Snapshots the skill (SKILL.md + sibling assets) as it existed at a git ref,
 * read straight from the object database without touching the working tree
 * (issue #122). The `evals/` directory is excluded to match the working-tree
 * branch. Git runs from `skillDir`, which must sit inside a repo; a bad ref or a
 * skill absent at that ref `die`s with a clear message.
 *
 * @param ref Git revision to snapshot from.
 * @param skillDir Skill directory; used as git's cwd and as the path root.
 * @param destDir Directory the snapshot is written into (created if needed).
 * @param skillName Skill name, used only in the confirmation log line.
 */
function snapshotFromRef(
  ref: string,
  skillDir: string,
  destDir: string,
  skillName: string,
): void {
  const skillMd = gitShowBytes(skillDir, ref, "SKILL.md");
  if (skillMd === null)
    die(
      `skill not found at ${ref}: ${join(skillDir, "SKILL.md")}\n` +
        " Check the ref exists and that the skill was present there (and that this is a git repo).",
    );

  // Resolve the asset list BEFORE writing anything: gitLsFiles dies on
  // failure, and dying after destDir exists would leave a half-written
  // snapshot behind that a retry with the same label would trip over.
  const assetPaths = gitLsFiles(skillDir, ref).filter(
    (relPath) =>
      relPath !== "SKILL.md" &&
      relPath !== "evals" &&
      !relPath.startsWith("evals/"),
  );

  ensureDir(destDir);
  writeFileSync(join(destDir, "SKILL.md"), skillMd);

  for (const relPath of assetPaths) {
    const bytes = gitShowBytes(skillDir, ref, relPath);
    if (bytes === null) continue; // listed but unreadable (e.g. submodule/gitlink)
    const dst = join(destDir, relPath);
    ensureDir(dirname(dst));
    writeFileSync(dst, bytes);
  }

  // Record provenance so teardown can reclaim this snapshot — it's fully
  // reproducible from the ref.
  writeJson(join(destDir, SNAPSHOT_META), { source: "ref", ref });

  console.log(`Snapshotted ${skillName} at ${ref} → ${destDir}`);
}
|
|
478
|
+
|
|
320
479
|
function commandRun(args: Args, ctx: RunContext): void {
|
|
321
480
|
if (!args.mode) die("--mode required: new-skill | revision");
|
|
322
481
|
if (args.mode !== "new-skill" && args.mode !== "revision")
|
|
@@ -481,6 +640,7 @@ function commandRun(args: Args, ctx: RunContext): void {
|
|
|
481
640
|
condition: condName,
|
|
482
641
|
skillName: ctx.skillName,
|
|
483
642
|
repoRoot: ctx.stageRoot,
|
|
643
|
+
assetsDir: dirname(condSkillPath),
|
|
484
644
|
stageNameOverride: args.stageName,
|
|
485
645
|
});
|
|
486
646
|
};
|
|
@@ -652,7 +812,7 @@ function commandRun(args: Args, ctx: RunContext): void {
|
|
|
652
812
|
if (args.dryRun) console.log("\n--dry-run: stopping after workspace prep.");
|
|
653
813
|
else
|
|
654
814
|
console.log(
|
|
655
|
-
"\nNext: read dispatch.json
|
|
815
|
+
"\nNext: read dispatch.json and dispatch each task as a subagent. Then run `ingest --iteration <N> --subagents-dir <path>` (Claude Code), or write run.json + timing.json to the paths in each task by hand and run the chained steps individually (transcript-less harnesses).",
|
|
656
816
|
);
|
|
657
817
|
}
|
|
658
818
|
|
|
@@ -975,15 +1135,12 @@ function buildManifest(opts: {
|
|
|
975
1135
|
"",
|
|
976
1136
|
"**Transcript correlation:** Each task has an `agent_description` field of the form `<eval_id>:<condition>:i<N>-<nonce>`. When dispatching the subagent via the host's primitive (e.g. Claude Code's Agent tool), pass this string verbatim as the dispatch `description` — do not reconstruct it. The per-run nonce keeps descriptions unique across iterations sharing one session's subagents dir, so the transcript adapter correlates each subagent's persisted transcript back to the right `(eval, condition)` slot without collisions.",
|
|
977
1137
|
"",
|
|
978
|
-
"After
|
|
979
|
-
"",
|
|
980
|
-
"1. Write `run.json` matching `skills/evaluating-skills/schema/run-record.schema.json` (enforced at runtime by grade/fill-transcripts/detect-stray-writes). Carry over `eval_id`, `condition`, `skill_path` (`null` on the without_skill arm), `prompt`, and `files` from the task; populate `final_message` from the subagent's reply; leave `tool_invocations` as `[]` for now — `evals:fill-transcripts` will populate it from the persisted transcript in a later step.",
|
|
981
|
-
"2. Capture `total_tokens` and `duration_ms` from the harness's task completion event into `timing.json`. These values may not be persisted anywhere else — save them immediately.",
|
|
1138
|
+
"After all dispatches (Claude Code):",
|
|
982
1139
|
"",
|
|
983
|
-
"
|
|
1140
|
+
'1. Run `bun run evals:ingest -- --skill <name> --iteration <N> --subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/` — a fixed-order chain of record-runs (assembles every task\'s `run.json` from `dispatch.json` + the subagent\'s own `outputs/final-message.md` + the persisted transcript, and backfills `timing.json` with transcript-derived tokens/duration; never clobbers an existing record), fill-transcripts, detect-stray-writes, and grade. Optional higher-fidelity timing: write `{ "total_tokens": <n>, "duration_ms": <n>, "source": "completion-event" }` from the task completion event to `timing.json` right after a dispatch — completion-event numbers always win over the backfill.',
|
|
1141
|
+
"2. Dispatch the judge tasks ingest lists, then run `bun run evals:finalize -- --skill <name> --iteration <N>` for the benchmark.",
|
|
984
1142
|
"",
|
|
985
|
-
"
|
|
986
|
-
"4. Run `bun run evals:grade --skill <name> --iteration <N>` to grade.",
|
|
1143
|
+
"On a harness without persisted transcripts, instead write each task's `run.json` (matching `skills/evaluating-skills/schema/run-record.schema.json`, enforced at runtime by grade/fill-transcripts/detect-stray-writes) and `timing.json` by hand when its subagent returns: carry over `eval_id`, `condition`, `skill_path` (`null` on the without_skill arm), `prompt`, and `files` from the task; populate `final_message` from the subagent's reply; leave `tool_invocations` as `[]`; capture `total_tokens`/`duration_ms` from the task completion event immediately — they may not be persisted anywhere else.",
|
|
987
1144
|
"",
|
|
988
1145
|
"## Dispatches",
|
|
989
1146
|
"",
|
|
@@ -1008,6 +1165,174 @@ function buildManifest(opts: {
|
|
|
1008
1165
|
return header + entries;
|
|
1009
1166
|
}
|
|
1010
1167
|
|
|
1168
|
+
// ---------------------------------------------------------------------------
|
|
1169
|
+
// ingest / finalize — fixed-order orchestrators over the sibling commands.
|
|
1170
|
+
//
|
|
1171
|
+
// The eval loop has exactly two points where only the in-harness agent can act
|
|
1172
|
+
// (dispatching eval subagents, dispatching judge subagents). Everything between
|
|
1173
|
+
// them is mechanical, so each stretch is one command: `ingest` runs the
|
|
1174
|
+
// post-dispatch chain and stops at the judge hand-off; `finalize` runs the
|
|
1175
|
+
// post-judge chain and prints the benchmark. No workspace-state inference —
|
|
1176
|
+
// each always runs the same steps in the same order, and every sub-step keeps
|
|
1177
|
+
// its own skip-if-done guard, so re-running after a fix is safe.
|
|
1178
|
+
// ---------------------------------------------------------------------------
|
|
1179
|
+
|
|
1180
|
+
/** One step of an orchestrated chain: a display label plus the argv to spawn. */
export type StepCommand = { label: string; argv: string[] };

/**
 * Builds the fixed-order post-dispatch chain run by `ingest`:
 * record-runs → fill-transcripts → detect-stray-writes → grade.
 *
 * Every step is `bun run <runnerDir>/<step>.ts` followed by the shared
 * `--skill-dir`, `--skill` and `--iteration` flags; the two transcript-reading
 * steps additionally receive `--subagents-dir`.
 *
 * @param opts.runnerDir Directory containing the step scripts.
 * @param opts.skillDir Value passed as `--skill-dir`.
 * @param opts.skill Value passed as `--skill`.
 * @param opts.iteration Iteration number, stringified for `--iteration`.
 * @param opts.subagentsDir Value passed as `--subagents-dir`.
 * @returns The four steps, in execution order.
 */
export function buildIngestCommands(opts: {
  runnerDir: string;
  skillDir: string;
  skill: string;
  iteration: number;
  subagentsDir: string;
}): StepCommand[] {
  const { runnerDir, skillDir, skill, iteration, subagentsDir } = opts;
  const baseFlags = [
    "--skill-dir",
    skillDir,
    "--skill",
    skill,
    "--iteration",
    String(iteration),
  ];
  const transcriptFlags = ["--subagents-dir", subagentsDir];

  // Each step's label doubles as its script's base name.
  const step = (label: string, extraFlags: string[] = []): StepCommand => ({
    label,
    argv: [
      "bun",
      "run",
      join(runnerDir, `${label}.ts`),
      ...baseFlags,
      ...extraFlags,
    ],
  });

  return [
    step("record-runs", transcriptFlags),
    // record-runs subsumes this for the records it wrote; it still fills any
    // pre-existing (agent-written) run.json with empty tool_invocations.
    step("fill-transcripts", transcriptFlags),
    step("detect-stray-writes"),
    step("grade"),
  ];
}
|
|
1221
|
+
|
|
1222
|
+
/**
 * Builds the fixed-order post-judge chain run by `finalize`:
 * `grade.ts --finalize` followed by `aggregate.ts`.
 *
 * Both steps are `bun run <runnerDir>/<script>` with the shared `--skill-dir`,
 * `--skill` and `--iteration` flags; only the grade step gets `--finalize`.
 *
 * @param opts.runnerDir Directory containing `grade.ts` and `aggregate.ts`.
 * @param opts.skillDir Value passed as `--skill-dir`.
 * @param opts.skill Value passed as `--skill`.
 * @param opts.iteration Iteration number, stringified for `--iteration`.
 * @returns The two steps, in execution order.
 */
export function buildFinalizeCommands(opts: {
  runnerDir: string;
  skillDir: string;
  skill: string;
  iteration: number;
}): StepCommand[] {
  const commonFlags = [
    "--skill-dir",
    opts.skillDir,
    "--skill",
    opts.skill,
    "--iteration",
    String(opts.iteration),
  ];
  const invoke = (scriptFile: string, ...trailing: string[]): string[] => [
    "bun",
    "run",
    join(opts.runnerDir, scriptFile),
    ...commonFlags,
    ...trailing,
  ];
  return [
    { label: "grade --finalize", argv: invoke("grade.ts", "--finalize") },
    { label: "aggregate", argv: invoke("aggregate.ts") },
  ];
}
|
|
1253
|
+
|
|
1254
|
+
/**
 * Executes `steps` sequentially and halts at the first one that exits
 * non-zero. Halting matters: grade's `__skill_invoked` code-check silently
 * degrades to an LLM judge when `tool_invocations` is missing, so grading
 * after a failed record/fill step would quietly lose the deterministic check.
 *
 * @param steps Steps to run, in order. A banner with each step's label is
 *   logged before it is spawned.
 * @param spawn Runs one step and returns its exit code. Defaults to
 *   `Bun.spawnSync` with inherited stdout/stderr, mapping a missing exit code
 *   to 1; injectable for tests.
 * @returns `{ failedAt }` — the label of the step that failed, or `null` when
 *   every step exited 0 (including when `steps` is empty).
 */
export function runSteps(
  steps: StepCommand[],
  spawn: (step: StepCommand) => number = (step) =>
    Bun.spawnSync(step.argv, { stdout: "inherit", stderr: "inherit" })
      .exitCode ?? 1,
): { failedAt: string | null } {
  // `find` stops iterating at the first failing step, so later steps never run.
  const firstFailure = steps.find((candidate) => {
    console.log(`\n── ${candidate.label} ──`);
    return spawn(candidate) !== 0;
  });
  return { failedAt: firstFailure === undefined ? null : firstFailure.label };
}
|
|
1272
|
+
|
|
1273
|
+
/**
 * `ingest` command: runs the post-dispatch chain built by
 * `buildIngestCommands` and stops at the judge hand-off, printing what to do
 * next.
 *
 * Requires `--iteration` and `--subagents-dir`; `die`s if either is missing
 * or if any step in the chain fails. After the chain succeeds it reads
 * `total_tasks` from the iteration's `judge-tasks.json` to decide whether
 * judge dispatches are needed before `finalize`.
 *
 * @param args Parsed CLI arguments.
 * @param ctx Run context supplying the skill name/dir and workspace root.
 */
function commandIngest(args: Args, ctx: RunContext): void {
  if (args.iteration === undefined) die("ingest requires --iteration <N>");
  if (!args.subagentsDir)
    die(
      "ingest requires --subagents-dir <path> (Claude Code persists subagent transcripts under ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/)",
    );
  const { failedAt } = runSteps(
    buildIngestCommands({
      runnerDir: import.meta.dir,
      skillDir: ctx.skillDir,
      skill: ctx.skillName,
      iteration: args.iteration,
      subagentsDir: args.subagentsDir,
    }),
  );
  if (failedAt)
    die(
      `ingest stopped at '${failedAt}'. Fix the failure and re-run ingest — completed steps skip work that's already done.`,
    );

  const judgeTasksPath = join(
    ctx.workspaceRoot,
    ctx.skillName,
    `iteration-${args.iteration}`,
    "judge-tasks.json",
  );
  let totalTasks: number | null = null;
  try {
    const recorded = readJson<{ total_tasks?: unknown }>(
      judgeTasksPath,
    ).total_tasks;
    // Only trust a real number; anything else is treated as "unknown".
    totalTasks = typeof recorded === "number" ? recorded : null;
  } catch {
    // grade always writes judge-tasks.json; treat a read failure as unknown.
  }
  if (totalTasks === 0) {
    console.log(
      `\n✅ Ingest complete — no judge dispatches needed.\nNext: bun run evals:finalize -- --skill ${ctx.skillName} --iteration ${args.iteration}`,
    );
  } else {
    // Emit the count with its own trailing space so an unknown count doesn't
    // leave a double space in "Dispatch the  judge task(s)".
    const countPrefix = totalTasks === null ? "" : `${totalTasks} `;
    console.log(
      `\n✅ Ingest complete. Dispatch the ${countPrefix}judge task(s) grade listed above (judge-tasks.json), then:\n bun run evals:finalize -- --skill ${ctx.skillName} --iteration ${args.iteration}`,
    );
  }
}
|
|
1316
|
+
|
|
1317
|
+
/**
 * `finalize` command: runs the post-judge chain built by
 * `buildFinalizeCommands` and then points the user at teardown.
 *
 * Requires `--iteration`; `die`s if it is missing or if any step in the chain
 * fails.
 *
 * @param args Parsed CLI arguments.
 * @param ctx Run context supplying the skill name and skill directory.
 */
function commandFinalize(args: Args, ctx: RunContext): void {
  const { iteration } = args;
  if (iteration === undefined) die("finalize requires --iteration <N>");

  const chain = buildFinalizeCommands({
    runnerDir: import.meta.dir,
    skillDir: ctx.skillDir,
    skill: ctx.skillName,
    iteration,
  });
  const outcome = runSteps(chain);
  if (outcome.failedAt)
    die(
      `finalize stopped at '${outcome.failedAt}'. Fix the failure and re-run finalize.`,
    );

  console.log(
    `\n✅ Finalize complete. Read the benchmark above, then tear down: bun run evals:teardown --skill ${ctx.skillName}`,
  );
}
|
|
1335
|
+
|
|
1011
1336
|
if (import.meta.main) {
|
|
1012
1337
|
const argv = Bun.argv.slice(2);
|
|
1013
1338
|
const args = parseArgs(argv);
|
|
@@ -1018,6 +1343,8 @@ if (import.meta.main) {
|
|
|
1018
1343
|
die(err instanceof Error ? err.message : String(err));
|
|
1019
1344
|
}
|
|
1020
1345
|
if (args.command === "snapshot") commandSnapshot(args, ctx);
|
|
1346
|
+
else if (args.command === "ingest") commandIngest(args, ctx);
|
|
1347
|
+
else if (args.command === "finalize") commandFinalize(args, ctx);
|
|
1021
1348
|
else if (args.command === "teardown-guard") {
|
|
1022
1349
|
const torn = teardownGuard(ctx.stageRoot);
|
|
1023
1350
|
console.log(
|
|
@@ -1025,5 +1352,37 @@ if (import.meta.main) {
|
|
|
1025
1352
|
? "🛡 Write guard removed."
|
|
1026
1353
|
: "No write guard was installed — nothing to remove.",
|
|
1027
1354
|
);
|
|
1355
|
+
} else if (args.command === "teardown") {
|
|
1356
|
+
// Full end-of-run teardown: disarm the guard, remove the staged skill set
|
|
1357
|
+
// (and prune a `.claude` the runner emptied), then reclaim the workspace —
|
|
1358
|
+
// leaving the user's own `.claude/settings.json`, pre-existing project
|
|
1359
|
+
// skills, and any uncommitted eval results intact.
|
|
1360
|
+
const torn = teardownGuard(ctx.stageRoot);
|
|
1361
|
+
cleanupStagedSkills(ctx.stageRoot);
|
|
1362
|
+
const ws = cleanupWorkspace(ctx.workspaceRoot, ctx.skillName);
|
|
1363
|
+
console.log(
|
|
1364
|
+
`🧹 Eval teardown complete: staged skill set removed${
|
|
1365
|
+
torn ? " and write guard disarmed" : ""
|
|
1366
|
+
}.`,
|
|
1367
|
+
);
|
|
1368
|
+
const reclaimed = ws.removedIterations.length + ws.removedSnapshots.length;
|
|
1369
|
+
if (reclaimed > 0) {
|
|
1370
|
+
console.log(
|
|
1371
|
+
` Reclaimed ${ws.removedIterations.length} workspace iteration(s)` +
|
|
1372
|
+
` and ${ws.removedSnapshots.length} reproducible snapshot(s).`,
|
|
1373
|
+
);
|
|
1374
|
+
}
|
|
1375
|
+
if (ws.keptIterations.length > 0) {
|
|
1376
|
+
const lines = ws.keptIterations.map(
|
|
1377
|
+
(k) => ` - ${k.iteration} (${k.reason})`,
|
|
1378
|
+
);
|
|
1379
|
+
console.warn(
|
|
1380
|
+
`⚠ Kept ${ws.keptIterations.length} workspace iteration(s) with results ` +
|
|
1381
|
+
`not yet committed:\n${lines.join("\n")}\n` +
|
|
1382
|
+
` Commit them, e.g.:\n` +
|
|
1383
|
+
` bun run evals:promote-baseline --skill ${ctx.skillName} --iteration <N>\n` +
|
|
1384
|
+
` or delete ${join("skills-workspace", ctx.skillName)}/ manually to discard.`,
|
|
1385
|
+
);
|
|
1386
|
+
}
|
|
1028
1387
|
} else commandRun(args, ctx);
|
|
1029
1388
|
}
|
|
@@ -25,6 +25,26 @@ export const BASH_MUTATION_PATTERNS: Array<{ re: RegExp; reason: string }> = [
|
|
|
25
25
|
re: /\bgit\s+(commit|add|push|checkout|reset|restore|merge|rebase)\b/,
|
|
26
26
|
reason: "git mutation",
|
|
27
27
|
},
|
|
28
|
+
{
|
|
29
|
+
re: /\bgit\s+worktree\s+add\b/,
|
|
30
|
+
reason: "git worktree add (working tree outside the sandbox)",
|
|
31
|
+
},
|
|
32
|
+
// A create/copy/move/link verb whose operand is a path under `.claude` —
|
|
33
|
+
// catches stray writes to the harness config dir that aren't a `>` redirect
|
|
34
|
+
// (those are caught below). Read-only verbs (`cat`, `ls`) aren't listed, so
|
|
35
|
+
// inspecting `.claude` stays allowed.
|
|
36
|
+
{
|
|
37
|
+
re: /\b(cp|mv|mkdir|touch|ln|rsync|install)\b[^|;&\n]*\.claude(\/|\b)/,
|
|
38
|
+
reason: "path under .claude",
|
|
39
|
+
},
|
|
40
|
+
// The same create verbs whose operand is a top-level `skills/` directory —
|
|
41
|
+
// catches a bare `skills/` left in the cwd. `skills-workspace` and other
|
|
42
|
+
// `skills`-prefixed names are excluded by the trailing `/`, whitespace, or
|
|
43
|
+
// end-of-string boundary.
|
|
44
|
+
{
|
|
45
|
+
re: /\b(cp|mv|mkdir|touch|ln|rsync)\b[^|;&\n]*[\s'"=/]\.{0,2}\/?skills(\/|\s|$)/,
|
|
46
|
+
reason: "creates a bare skills/ dir",
|
|
47
|
+
},
|
|
28
48
|
{ re: /(^|\s)(>>?|tee)\s/, reason: "output redirection to a file" },
|
|
29
49
|
];
|
|
30
50
|
|
|
@@ -109,4 +109,13 @@ export const SKILL_INVOKED_META_ID = "__skill_invoked";
|
|
|
109
109
|
/**
 * Token and duration accounting for a single eval run.
 * NOTE(review): presumably the on-disk shape of each task's `timing.json` —
 * confirm against the reader/writer.
 */
export type TimingRecord = {
  /** Total tokens for the run; may be absent or `null` when not captured. */
  total_tokens?: number | null;
  /** Run duration in milliseconds; may be absent or `null` when not captured. */
  duration_ms?: number | null;
  /**
   * Where the numbers came from. "completion-event" = captured by the
   * dispatching agent from the harness's task completion event;
   * "transcript" = derived by record-runs from the persisted transcript
   * (includes cache accounting — a different metric, not comparable 1:1).
   * Absent on records written before provenance was tracked
   * (completion-event in practice).
   */
  source?: "completion-event" | "transcript";
};
|