@slowdini/slow-powers-opencode 0.3.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -72
- package/bootstrap.md +1 -7
- package/opencode/plugins/slow-powers.js +69 -5
- package/package.json +14 -17
- package/skills/evaluating-skills/SKILL.md +90 -338
- package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
- package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
- package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
- package/skills/hardening-plans/SKILL.md +29 -7
- package/skills/hardening-plans/evals/baseline/BASELINE.md +11 -6
- package/skills/hardening-plans/evals/baseline/NOTES.md +72 -58
- package/skills/hardening-plans/evals/baseline/benchmark.json +25 -25
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +2 -2
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +2 -2
- package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__new_skill.json +32 -0
- package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__old_skill.json +32 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +3 -3
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +8 -8
- package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__old_skill.json +39 -0
- package/skills/hardening-plans/evals/evals.json +46 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
- package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
- package/skills/evaluating-skills/harness-details/claude.md +0 -194
- package/skills/evaluating-skills/harness-parity.md +0 -155
- package/skills/evaluating-skills/runner/README.md +0 -163
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
- package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
- package/skills/evaluating-skills/runner/aggregate.ts +0 -269
- package/skills/evaluating-skills/runner/context.test.ts +0 -181
- package/skills/evaluating-skills/runner/context.ts +0 -90
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
- package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
- package/skills/evaluating-skills/runner/grade.test.ts +0 -347
- package/skills/evaluating-skills/runner/grade.ts +0 -603
- package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
- package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
- package/skills/evaluating-skills/runner/guard/install.ts +0 -147
- package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
- package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
- package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
- package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
- package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
- package/skills/evaluating-skills/runner/record-runs.ts +0 -209
- package/skills/evaluating-skills/runner/run.test.ts +0 -1703
- package/skills/evaluating-skills/runner/run.ts +0 -1388
- package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
- package/skills/evaluating-skills/runner/types.ts +0 -121
- package/skills/evaluating-skills/runner/validate-all.ts +0 -54
- package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
- package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
- package/skills/evaluating-skills/runner/validate.test.ts +0 -56
- package/skills/evaluating-skills/runner/validate.ts +0 -21
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
- package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
- package/skills/evaluating-skills/schema/evals.schema.json +0 -105
- package/skills/evaluating-skills/schema/grading.schema.json +0 -84
- package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
- package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
- package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
- package/skills/evaluating-skills/templates/evals.json.example +0 -17
- package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
|
@@ -1,1388 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bun
|
|
2
|
-
import { randomBytes } from "node:crypto";
|
|
3
|
-
import {
|
|
4
|
-
cpSync,
|
|
5
|
-
existsSync,
|
|
6
|
-
mkdirSync,
|
|
7
|
-
mkdtempSync,
|
|
8
|
-
readdirSync,
|
|
9
|
-
readFileSync,
|
|
10
|
-
rmSync,
|
|
11
|
-
statSync,
|
|
12
|
-
writeFileSync,
|
|
13
|
-
} from "node:fs";
|
|
14
|
-
import { tmpdir } from "node:os";
|
|
15
|
-
import { basename, dirname, join } from "node:path";
|
|
16
|
-
import {
|
|
17
|
-
renderAvailableSkillsBlock,
|
|
18
|
-
renderPlanModeContext,
|
|
19
|
-
} from "./adapters/claude-code-session";
|
|
20
|
-
import { detectRunContext, type Harness, type RunContext } from "./context";
|
|
21
|
-
import { installGuard, teardownGuard } from "./guard/install";
|
|
22
|
-
import {
|
|
23
|
-
detectPluginShadows,
|
|
24
|
-
formatShadowBanner,
|
|
25
|
-
resolveConfigDir,
|
|
26
|
-
} from "./plugin-shadow";
|
|
27
|
-
import type {
|
|
28
|
-
AvailableSkill,
|
|
29
|
-
ConditionsRecord,
|
|
30
|
-
Eval,
|
|
31
|
-
EvalsConfig,
|
|
32
|
-
} from "./types";
|
|
33
|
-
import { validateEvalsConfig } from "./validate";
|
|
34
|
-
import { cleanupWorkspace, SNAPSHOT_META } from "./workspace-teardown";
|
|
35
|
-
|
|
36
|
-
/**
 * Directory-name prefix for skills this runner stages under `.claude/skills`.
 * `stageSkillForCC` builds its default slug from it, and `cleanupStagedSkills`
 * removes every entry whose name starts with it.
 */
export const STAGED_SKILL_PREFIX = "slow-powers-eval-";
|
|
37
|
-
/**
 * File name (inside `.claude/skills`) of the JSON manifest that records which
 * directories the runner staged, so a later cleanup can remove or restore them.
 */
export const STAGED_SIBLING_MANIFEST = ".slow-powers-eval-manifest.json";
|
|
38
|
-
|
|
39
|
-
/**
 * Stages a skill under `<repoRoot>/.claude/skills/<slug>`: writes `content` as
 * `SKILL.md` and, when `assetsDir` is given and exists, copies its sibling
 * assets next to it.
 *
 * @returns the directory name (slug) the skill was staged under.
 */
export function stageSkillForCC(opts: {
  content: string;
  iteration: number;
  condition: string;
  skillName: string;
  repoRoot: string;
  /**
   * Directory of the source skill. Everything beside its SKILL.md is copied
   * next to the staged SKILL.md, except SKILL.md itself, the `evals/` dir, and
   * the snapshot bookkeeping file. Without this copy, a multi-file skill whose
   * SKILL.md links a sibling (e.g. `[code-review.md](code-review.md)`) would be
   * staged with a dangling link, leaving the linked guidance unreachable from
   * the staged dir. Mirrors the sibling-asset copy in `snapshot`. Leave unset
   * to stage SKILL.md on its own.
   */
  assetsDir?: string;
  /**
   * Verbatim directory name to stage under, replacing the conspicuous
   * `slow-powers-eval-…` slug. Used by `--stage-name` to A/B a natural name
   * against the eval-flagged one (issue #144 Step 2). The prefix scan in
   * `cleanupStagedSkills` does not catch a custom name, so the caller must
   * also call `registerStagedSkillForCleanup` to have it removed next run.
   */
  stageNameOverride?: string;
}): string {
  const { content, iteration, condition, skillName, repoRoot, assetsDir } =
    opts;
  const defaultSlug = `${STAGED_SKILL_PREFIX}${iteration}-${condition}__${skillName}`;
  const slug = opts.stageNameOverride ?? defaultSlug;

  const stagedDir = join(repoRoot, ".claude", "skills", slug);
  mkdirSync(stagedDir, { recursive: true });
  writeFileSync(join(stagedDir, "SKILL.md"), content);

  // Nothing else to copy when no (existing) asset source was supplied.
  if (assetsDir === undefined || !existsSync(assetsDir)) return slug;

  const excluded = new Set<string>(["SKILL.md", "evals", SNAPSHOT_META]);
  for (const name of readdirSync(assetsDir)) {
    if (excluded.has(name)) continue;
    const from = join(assetsDir, name);
    const to = join(stagedDir, name);
    // Directories need a recursive copy; plain files are copied as-is.
    cpSync(from, to, { recursive: statSync(from).isDirectory() });
  }
  return slug;
}
|
|
82
|
-
|
|
83
|
-
/**
 * Adds a custom-named staged skill dir (one created via `stageNameOverride`) to
 * the sibling manifest's `created_entries` so the next run's
 * `cleanupStagedSkills` removes it — the prefix scan only catches
 * `slow-powers-eval-…` names. Idempotent: a name already recorded is left alone.
 *
 * A missing manifest is created. A manifest that is not valid JSON, or that
 * lacks a `created_entries` array, is replaced by a fresh one instead of
 * aborting the run; `cleanupStagedSkills` likewise discards a manifest it
 * cannot parse. The fresh manifest marks `.claude/skills` as preexisting, so
 * cleanup takes the per-entry path and never deletes the whole directory.
 *
 * @param repoRoot root under which `.claude/skills` lives.
 * @param name     directory name (inside `.claude/skills`) to register.
 */
export function registerStagedSkillForCleanup(
  repoRoot: string,
  name: string,
): void {
  const skillsDir = join(repoRoot, ".claude", "skills");
  const manifestPath = join(skillsDir, STAGED_SIBLING_MANIFEST);

  let manifest: SiblingManifest | undefined;
  if (existsSync(manifestPath)) {
    const raw = readFileSync(manifestPath, "utf8");
    try {
      const parsed = JSON.parse(raw);
      if (Array.isArray(parsed?.created_entries)) manifest = parsed;
    } catch {
      // Unparseable manifest: fall through and start a fresh one below.
    }
  }
  if (manifest === undefined) {
    manifest = {
      created_at: new Date().toISOString(),
      staged_under_test: name,
      skills_dir_preexisting: true,
      created_entries: [],
    };
  }

  if (manifest.created_entries.some((e) => e.name === name)) return;
  manifest.created_entries.push({ name, preexisting: false });

  // The skills dir may not exist yet if nothing has been staged so far.
  mkdirSync(skillsDir, { recursive: true });
  writeFileSync(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`);
}
|
|
110
|
-
|
|
111
|
-
/** One record in the manifest: a directory the runner placed in `.claude/skills`. */
type SiblingManifestEntry = {
  /** Directory name inside `.claude/skills`. */
  name: string;
  /** True when a directory of that name already existed before staging. */
  preexisting: boolean;
  /** Where the pre-existing directory was copied before being replaced. */
  backup_path?: string;
};

/** Bookkeeping written to `.claude/skills` so a later cleanup can undo staging. */
type SiblingManifest = {
  created_at: string;
  staged_under_test: string;
  /**
   * Records whether `.claude/skills` was already present when staging started.
   * `false` means the runner created it, so {@link cleanupStagedSkills} may
   * delete the whole tree (and prune an emptied `.claude`). `true` — or a
   * missing value, as on older manifests — makes cleanup fall back to the
   * surgical per-entry restore so a user's own project skills stay intact.
   */
  skills_dir_preexisting?: boolean;
  created_entries: SiblingManifestEntry[];
};
|
|
128
|
-
|
|
129
|
-
/**
 * Copies every sibling skill (each directory in `skillsSourceDir` that holds a
 * `SKILL.md`, other than the skill under test) into
 * `<repoRoot>/.claude/skills`, leaving out each skill's `evals/` subtree.
 *
 * A destination directory that already exists is first copied to a fresh temp
 * directory and then replaced. Everything staged is recorded in the manifest
 * file so `cleanupStagedSkills` can remove it or restore the backup.
 *
 * The manifest is written even when staging fails part-way: each entry is
 * recorded before its destructive step, and the file is flushed in `finally`.
 * Otherwise a failed copy would leave a user's skill moved to a temp backup
 * with nothing on disk pointing at it.
 *
 * @returns the manifest that was written.
 */
export function stageSiblingSkills(opts: {
  skillUnderTest: string;
  skillsSourceDir: string;
  repoRoot: string;
}): SiblingManifest {
  const skillsDir = join(opts.repoRoot, ".claude", "skills");
  const skillsDirPreexisting = existsSync(skillsDir);
  mkdirSync(skillsDir, { recursive: true });

  const siblings = readdirSync(opts.skillsSourceDir).filter((name) => {
    if (name === opts.skillUnderTest) return false;
    const srcDir = join(opts.skillsSourceDir, name);
    if (!statSync(srcDir).isDirectory()) return false;
    return existsSync(join(srcDir, "SKILL.md"));
  });

  const manifest: SiblingManifest = {
    created_at: new Date().toISOString(),
    staged_under_test: opts.skillUnderTest,
    skills_dir_preexisting: skillsDirPreexisting,
    created_entries: [],
  };

  try {
    for (const name of siblings) {
      const srcDir = join(opts.skillsSourceDir, name);
      const dstDir = join(skillsDir, name);
      const evalsSubdir = join(srcDir, "evals");

      const entry: SiblingManifest["created_entries"][number] = {
        name,
        preexisting: false,
      };

      if (existsSync(dstDir)) {
        const backupRoot = mkdtempSync(
          join(tmpdir(), "slow-powers-eval-backup-"),
        );
        const backupPath = join(backupRoot, name);
        cpSync(dstDir, backupPath, { recursive: true });
        // Only advertise the backup once it is complete, and record the entry
        // before the original is removed so cleanup can always restore it.
        entry.preexisting = true;
        entry.backup_path = backupPath;
        manifest.created_entries.push(entry);
        rmSync(dstDir, { recursive: true, force: true });
      } else {
        // Recorded up front so a partial copy is still removed by cleanup.
        manifest.created_entries.push(entry);
      }

      cpSync(srcDir, dstDir, {
        recursive: true,
        filter: (src) =>
          src !== evalsSubdir && !src.startsWith(`${evalsSubdir}/`),
      });
    }
  } finally {
    writeFileSync(
      join(skillsDir, STAGED_SIBLING_MANIFEST),
      `${JSON.stringify(manifest, null, 2)}\n`,
    );
  }
  return manifest;
}
|
|
187
|
-
|
|
188
|
-
/**
 * Deletes `dir` when it exists and contains no entries; otherwise leaves it
 * alone. Used to prune a `.claude` the runner emptied without ever touching a
 * `.claude` that still holds the user's own files (e.g. `settings.json`).
 */
function pruneIfEmpty(dir: string): void {
  if (!existsSync(dir)) return;
  if (readdirSync(dir).length > 0) return;
  rmSync(dir, { recursive: true, force: true });
}
|
|
196
|
-
|
|
197
|
-
/**
 * Undoes earlier staging under `<repoRoot>/.claude/skills`.
 *
 * 1. Removes every entry whose name starts with `STAGED_SKILL_PREFIX`.
 * 2. If a manifest is present, either removes the whole skills dir (when the
 *    manifest says the runner created it) or removes each recorded entry and
 *    restores its backup where one exists, then deletes the manifest.
 *
 * A manifest that cannot be parsed is deleted and nothing else is touched.
 */
export function cleanupStagedSkills(repoRoot: string): void {
  const claudeDir = join(repoRoot, ".claude");
  const skillsDir = join(claudeDir, "skills");
  if (!existsSync(skillsDir)) return;

  const wipe = { recursive: true, force: true } as const;

  const prefixed = readdirSync(skillsDir).filter((entry) =>
    entry.startsWith(STAGED_SKILL_PREFIX),
  );
  for (const entry of prefixed) {
    rmSync(join(skillsDir, entry), wipe);
  }

  const manifestPath = join(skillsDir, STAGED_SIBLING_MANIFEST);
  if (!existsSync(manifestPath)) return;

  let manifest: SiblingManifest;
  try {
    manifest = JSON.parse(readFileSync(manifestPath, "utf8"));
  } catch {
    rmSync(manifestPath, { force: true });
    return;
  }

  // `.claude/skills` was created by the runner this run, so none of the user's
  // own skills can live in it: drop the entire staged tree (stray non-prefixed
  // dirs from a recursive eval included) and then prune `.claude` if that left
  // it empty. In a real project `.claude/settings.json` keeps `.claude`
  // non-empty, so only scaffolding the runner created disappears.
  if (manifest.skills_dir_preexisting === false) {
    rmSync(skillsDir, wipe);
    pruneIfEmpty(claudeDir);
    return;
  }

  for (const e of manifest.created_entries) {
    const target = join(skillsDir, e.name);
    rmSync(target, wipe);
    // Nothing to restore unless a backup of a pre-existing dir is available.
    if (!e.preexisting || !e.backup_path || !existsSync(e.backup_path)) {
      continue;
    }
    cpSync(e.backup_path, target, { recursive: true });
    rmSync(dirname(e.backup_path), wipe);
  }
  rmSync(manifestPath, { force: true });
}
|
|
238
|
-
|
|
239
|
-
/**
 * Comparison mode of a run. `conditionNamesFor` maps `new-skill` to the
 * conditions with_skill / without_skill and `revision` to old_skill / new_skill.
 */
type Mode = "new-skill" | "revision";
|
|
240
|
-
|
|
241
|
-
/** Sub-commands the CLI understands; `run` is the default. */
type ArgsCommand =
  | "run"
  | "snapshot"
  | "teardown-guard"
  | "teardown"
  | "ingest"
  | "finalize";

/** Parsed command-line arguments, as produced by `parseArgs`. */
type Args = {
  /** Selected sub-command. */
  command: ArgsCommand;
  /** Value of `--mode`. */
  mode?: Mode;
  /** Value of `--baseline`. */
  baseline?: string;
  /** Value of `--label`. */
  label?: string;
  /** Value of `--iteration`, parsed as an integer. */
  iteration?: number;
  /** Ids from the comma-separated `--only` list. */
  only?: string[];
  /** Ids from the comma-separated `--skip` list. */
  skip?: string[];
  /** True when `--dry-run` is present. */
  dryRun: boolean;
  /** True when `--no-stage` is present. */
  noStage: boolean;
  /** True when `--guard` is present. */
  guard: boolean;
  /** Value of `--stage-name`. */
  stageName?: string;
  /** True when `--plan-mode` is present. */
  planMode: boolean;
  /** Value of `--ref`. */
  ref?: string;
  /** Value of `--subagents-dir`. */
  subagentsDir?: string;
};
|
|
263
|
-
|
|
264
|
-
/**
 * Reports a fatal error as `error: <msg>` via `console.error` and ends the
 * process with exit code 1. Never returns.
 */
function die(msg: string): never {
  const line = `error: ${msg}`;
  console.error(line);
  return process.exit(1);
}
|
|
268
|
-
|
|
269
|
-
/**
 * Fetches the contents of `<ref>:./<relPath>` from git; the `./` prefix makes
 * git resolve `relPath` relative to `cwd`.
 *
 * The raw stdout is returned as a Buffer so callers can write it out unchanged
 * (never `.toString()` it — binary assets must round-trip intact). Git is
 * spawned directly with an argument array (no shell), so neither the ref nor
 * the path is interpolated into a shell string. Follows the same
 * `Bun.spawnSync` git pattern as `promote-baseline.ts:gitHead`.
 *
 * @returns the object's bytes, or `null` when git exits non-zero (e.g. the
 *          object does not exist at that ref).
 */
function gitShowBytes(
  cwd: string,
  ref: string,
  relPath: string,
): Buffer | null {
  const objectSpec = `${ref}:./${relPath}`;
  const { exitCode, stdout } = Bun.spawnSync(["git", "show", objectSpec], {
    cwd,
    stdout: "pipe",
    stderr: "pipe",
  });
  return exitCode === 0 ? Buffer.from(stdout) : null;
}
|
|
290
|
-
|
|
291
|
-
/**
 * Lists every file under `cwd` as it existed at `<ref>`, as paths relative to
 * `cwd` (git's default ls-tree output strips the cwd prefix). `die`s with git's
 * stderr on failure — a bad ref or a cwd outside any repo surfaces here.
 *
 * Runs `git ls-tree` with `-z` so paths come back NUL-terminated and unquoted.
 * Without it git C-quotes unusual names (non-ASCII bytes, quotes, control
 * characters), and trimming each line would also damage names with leading or
 * trailing whitespace. Either way the later per-file lookup would miss the
 * object and the asset would be dropped.
 */
function gitLsFiles(cwd: string, ref: string): string[] {
  const res = Bun.spawnSync(
    ["git", "ls-tree", "-r", "-z", "--name-only", ref, "."],
    {
      cwd,
      stdout: "pipe",
      stderr: "pipe",
    },
  );
  if (res.exitCode !== 0)
    die(`git ls-tree failed for ref ${ref}: ${res.stderr.toString().trim()}`);
  return res.stdout
    .toString()
    .split("\0")
    .filter((path) => path.length > 0);
}
|
|
310
|
-
|
|
311
|
-
/**
 * Turns the raw argument vector into an `Args` record.
 *
 * The command is the first argument that does not start with `--`, provided it
 * names a known sub-command; otherwise it defaults to `run`. Valued flags take
 * the argument that follows them (`--name value`); a missing value, or one that
 * itself starts with `--`, is fatal. Boolean flags are true when present.
 */
function parseArgs(argv: string[]): Args {
  const KNOWN_COMMANDS: Args["command"][] = [
    "snapshot",
    "teardown-guard",
    "teardown",
    "ingest",
    "finalize",
  ];
  const firstPositional = argv.find((a) => !a.startsWith("--"));
  const command: Args["command"] =
    KNOWN_COMMANDS.find((c) => c === firstPositional) ?? "run";

  // Value of `--<name>`, or undefined when the flag is absent.
  const valueOf = (name: string): string | undefined => {
    const at = argv.indexOf(`--${name}`);
    if (at === -1) return undefined;
    const next = argv[at + 1];
    if (next === undefined || next.startsWith("--")) {
      die(`flag --${name} requires a value`);
    }
    return next;
  };

  const isSet = (name: string) => argv.includes(`--${name}`);

  // Splits a comma-separated id list, dropping blanks.
  const idList = (raw: string | undefined): string[] | undefined => {
    if (raw === undefined) return undefined;
    return raw
      .split(",")
      .map((part) => part.trim())
      .filter(Boolean);
  };

  const rawIteration = valueOf("iteration");
  let iteration: number | undefined;
  if (rawIteration !== undefined) {
    iteration = Number(rawIteration);
    if (!Number.isInteger(iteration))
      die(`--iteration must be an integer, got ${rawIteration}`);
  }

  return {
    command,
    mode: valueOf("mode") as Mode | undefined,
    baseline: valueOf("baseline"),
    label: valueOf("label"),
    iteration,
    only: idList(valueOf("only")),
    skip: idList(valueOf("skip")),
    dryRun: isSet("dry-run"),
    noStage: isSet("no-stage"),
    guard: isSet("guard"),
    stageName: valueOf("stage-name"),
    planMode: isSet("plan-mode"),
    ref: valueOf("ref"),
    subagentsDir: valueOf("subagents-dir"),
  };
}
|
|
366
|
-
|
|
367
|
-
/** Creates `path` (and any missing parents) unless something already exists there. */
function ensureDir(path: string): void {
  if (existsSync(path)) return;
  mkdirSync(path, { recursive: true });
}
|
|
370
|
-
|
|
371
|
-
/** Writes `value` to `path` as 2-space-indented JSON followed by a newline. */
function writeJson(path: string, value: unknown): void {
  const serialized = JSON.stringify(value, null, 2);
  writeFileSync(path, `${serialized}\n`);
}
|
|
374
|
-
|
|
375
|
-
/**
 * Reads `path` as UTF-8 and parses it as JSON. The result is returned as `T`
 * without any runtime validation.
 */
function readJson<T>(path: string): T {
  const raw = readFileSync(path, "utf8");
  return JSON.parse(raw);
}
|
|
378
|
-
|
|
379
|
-
/**
 * Picks the iteration number for a run.
 *
 * @param workspaceSkillDir directory that holds the `iteration-<N>` folders.
 * @param override          explicit iteration number; returned as-is when given.
 * @returns `override` if provided; otherwise one more than the highest
 *          existing `iteration-<N>` entry, or 1 when there is none.
 *
 * Only entries whose suffix is a plain run of decimal digits are counted.
 * Passing the suffix straight to `Number()` would also accept names such as
 * `iteration-2.5`, `iteration-1e3` or `iteration-0x10`, yielding fractional or
 * wildly wrong iteration numbers.
 */
function nextIteration(workspaceSkillDir: string, override?: number): number {
  if (override !== undefined) return override;
  if (!existsSync(workspaceSkillDir)) return 1;

  const prefix = "iteration-";
  let highest = 0;
  for (const entry of readdirSync(workspaceSkillDir)) {
    if (!entry.startsWith(prefix)) continue;
    const suffix = entry.slice(prefix.length);
    if (!/^\d+$/.test(suffix)) continue;
    highest = Math.max(highest, Number(suffix));
  }
  return highest + 1;
}
|
|
391
|
-
|
|
392
|
-
/**
 * Names of the two conditions compared in the given mode, in order:
 * `new-skill` → with_skill / without_skill; anything else → old_skill / new_skill.
 */
function conditionNamesFor(mode: Mode): [string, string] {
  if (mode === "new-skill") return ["with_skill", "without_skill"];
  return ["old_skill", "new_skill"];
}
|
|
397
|
-
|
|
398
|
-
/**
 * `snapshot` command: copies the skill (SKILL.md plus everything beside it
 * except `evals`) into `<workspaceRoot>/<skillName>/snapshots/<label>`.
 *
 * Requires `--label`, and refuses to overwrite an existing snapshot. With
 * `--ref` the snapshot is taken from git via `snapshotFromRef`; otherwise it is
 * copied from the working tree and tagged with `source: "working-tree"`.
 */
function commandSnapshot(args: Args, ctx: RunContext): void {
  const { label, ref } = args;
  if (!label) die("snapshot requires --label <name>");

  const skillDir = ctx.skillSubdir;
  const destDir = join(ctx.workspaceRoot, ctx.skillName, "snapshots", label);
  if (existsSync(destDir)) {
    die(
      `snapshot already exists: ${destDir}\n` +
        " Use a different --label or delete the existing snapshot first.",
    );
  }

  if (ref !== undefined) {
    snapshotFromRef(ref, skillDir, destDir, ctx.skillName);
    return;
  }

  const skillMd = join(skillDir, "SKILL.md");
  if (!existsSync(skillMd)) die(`skill not found: ${skillMd}`);
  ensureDir(destDir);
  cpSync(skillMd, join(destDir, "SKILL.md"));

  const siblings = readdirSync(skillDir).filter(
    (entry) => entry !== "SKILL.md" && entry !== "evals",
  );
  for (const entry of siblings) {
    const from = join(skillDir, entry);
    const to = join(destDir, entry);
    cpSync(from, to, { recursive: statSync(from).isDirectory() });
  }

  // A working-tree snapshot cannot be regenerated from git the way a ref
  // snapshot can, so its provenance is recorded to let teardown keep it.
  writeJson(join(destDir, SNAPSHOT_META), { source: "working-tree" });

  console.log(`Snapshotted ${ctx.skillName} → ${destDir}`);
}
|
|
438
|
-
|
|
439
|
-
/**
 * Writes a snapshot of the skill (SKILL.md plus sibling assets) as it was at a
 * git ref. Contents are read through `gitShowBytes` / `gitLsFiles`, i.e. from
 * the object database, so the working tree is not touched (issue #122). The
 * `evals/` directory is left out, matching the working-tree snapshot. Git is
 * run from `skillDir`, which therefore has to be inside a repo; a bad ref or a
 * skill that is absent at that ref ends the process with a clear message.
 */
function snapshotFromRef(
  ref: string,
  skillDir: string,
  destDir: string,
  skillName: string,
): void {
  const skillMdBytes = gitShowBytes(skillDir, ref, "SKILL.md");
  if (skillMdBytes === null) {
    die(
      `skill not found at ${ref}: ${join(skillDir, "SKILL.md")}\n` +
        " Check the ref exists and that the skill was present there (and that this is a git repo).",
    );
  }

  ensureDir(destDir);
  writeFileSync(join(destDir, "SKILL.md"), skillMdBytes);

  const assetPaths = gitLsFiles(skillDir, ref).filter(
    (relPath) =>
      relPath !== "SKILL.md" &&
      relPath !== "evals" &&
      !relPath.startsWith("evals/"),
  );
  for (const relPath of assetPaths) {
    const bytes = gitShowBytes(skillDir, ref, relPath);
    // Listed but unreadable (e.g. submodule/gitlink): skip it.
    if (bytes === null) continue;
    const target = join(destDir, relPath);
    ensureDir(dirname(target));
    writeFileSync(target, bytes);
  }

  // A ref snapshot is fully reproducible from the ref; recording that lets
  // teardown reclaim it.
  writeJson(join(destDir, SNAPSHOT_META), { source: "ref", ref });

  console.log(`Snapshotted ${skillName} at ${ref} → ${destDir}`);
}
|
|
478
|
-
|
|
479
|
-
function commandRun(args: Args, ctx: RunContext): void {
|
|
480
|
-
if (!args.mode) die("--mode required: new-skill | revision");
|
|
481
|
-
if (args.mode !== "new-skill" && args.mode !== "revision")
|
|
482
|
-
die(`unknown --mode: ${args.mode}`);
|
|
483
|
-
if (args.mode === "revision" && !args.baseline)
|
|
484
|
-
die("revision mode requires --baseline <label>");
|
|
485
|
-
|
|
486
|
-
const skillDir = ctx.skillSubdir;
|
|
487
|
-
const skillMd = join(skillDir, "SKILL.md");
|
|
488
|
-
if (!existsSync(skillMd)) die(`skill not found: ${skillMd}`);
|
|
489
|
-
|
|
490
|
-
const evalsPath = join(skillDir, "evals", "evals.json");
|
|
491
|
-
if (!existsSync(evalsPath)) die(`evals.json not found: ${evalsPath}`);
|
|
492
|
-
|
|
493
|
-
const config: EvalsConfig = validateEvalsConfig(
|
|
494
|
-
readJson(evalsPath),
|
|
495
|
-
evalsPath,
|
|
496
|
-
);
|
|
497
|
-
if (config.skill_name !== ctx.skillName)
|
|
498
|
-
console.warn(
|
|
499
|
-
`warning: evals.json skill_name (${config.skill_name}) does not match the skill folder (${ctx.skillName}). Proceeding with ${ctx.skillName}.`,
|
|
500
|
-
);
|
|
501
|
-
|
|
502
|
-
let selectedEvals: Eval[];
|
|
503
|
-
try {
|
|
504
|
-
selectedEvals = selectEvals(config.evals, {
|
|
505
|
-
only: args.only,
|
|
506
|
-
skip: args.skip,
|
|
507
|
-
});
|
|
508
|
-
} catch (err) {
|
|
509
|
-
die(err instanceof Error ? err.message : String(err));
|
|
510
|
-
}
|
|
511
|
-
|
|
512
|
-
const workspaceSkillDir = join(ctx.workspaceRoot, ctx.skillName);
|
|
513
|
-
const iteration = nextIteration(workspaceSkillDir, args.iteration);
|
|
514
|
-
const iterationDir = join(workspaceSkillDir, `iteration-${iteration}`);
|
|
515
|
-
|
|
516
|
-
// A per-run nonce makes each dispatch description globally unique. The
|
|
517
|
-
// subagents dir is shared across iterations of one parent session, so a bare
|
|
518
|
-
// `<eval>:<condition>` description repeats and fill-transcripts could fill an
|
|
519
|
-
// iteration's run from a colliding agent in another iteration. `i<N>-<nonce>`
|
|
520
|
-
// also disambiguates re-running the same iteration number.
|
|
521
|
-
const runNonce = `${Date.now().toString(36)}-${randomBytes(3).toString("hex")}`;
|
|
522
|
-
const runTag = `i${iteration}-${runNonce}`;
|
|
523
|
-
|
|
524
|
-
if (existsSync(iterationDir) && args.iteration === undefined)
|
|
525
|
-
die(
|
|
526
|
-
`iteration-${iteration} already exists; pass --iteration to overwrite explicitly`,
|
|
527
|
-
);
|
|
528
|
-
|
|
529
|
-
const [conditionA, conditionB] = conditionNamesFor(args.mode);
|
|
530
|
-
|
|
531
|
-
let skillPathForA: string | null;
|
|
532
|
-
let skillPathForB: string | null;
|
|
533
|
-
if (args.mode === "new-skill") {
|
|
534
|
-
skillPathForA = skillMd;
|
|
535
|
-
skillPathForB = null;
|
|
536
|
-
} else {
|
|
537
|
-
const baselineSkill = join(
|
|
538
|
-
workspaceSkillDir,
|
|
539
|
-
"snapshots",
|
|
540
|
-
args.baseline as string,
|
|
541
|
-
"SKILL.md",
|
|
542
|
-
);
|
|
543
|
-
if (!existsSync(baselineSkill))
|
|
544
|
-
die(
|
|
545
|
-
`baseline snapshot not found: ${baselineSkill}\n` +
|
|
546
|
-
` Run: bun run evals:snapshot --skill ${ctx.skillName} --skill-dir ${ctx.skillDir} --label ${args.baseline} (before editing)`,
|
|
547
|
-
);
|
|
548
|
-
skillPathForA = baselineSkill;
|
|
549
|
-
skillPathForB = skillMd;
|
|
550
|
-
}
|
|
551
|
-
|
|
552
|
-
console.log(
|
|
553
|
-
`Preparing ${ctx.skillName} iteration-${iteration} (${args.mode})`,
|
|
554
|
-
);
|
|
555
|
-
console.log(` ${conditionA}: ${skillPathForA ?? "(no skill)"}`);
|
|
556
|
-
console.log(` ${conditionB}: ${skillPathForB ?? "(no skill)"}`);
|
|
557
|
-
if (selectedEvals.length !== config.evals.length) {
|
|
558
|
-
const [flagName, ids] = args.only
|
|
559
|
-
? ["--only", args.only]
|
|
560
|
-
: ["--skip", args.skip ?? []];
|
|
561
|
-
console.log(
|
|
562
|
-
` selection: ${selectedEvals.length} of ${config.evals.length} evals (${flagName} ${ids.join(", ")})`,
|
|
563
|
-
);
|
|
564
|
-
}
|
|
565
|
-
if (args.noStage)
|
|
566
|
-
console.log(
|
|
567
|
-
" staging: disabled (--no-stage) — skills will be inlined into dispatch_prompt for harnesses without project-local skill discovery",
|
|
568
|
-
);
|
|
569
|
-
|
|
570
|
-
ensureDir(iterationDir);
|
|
571
|
-
cpSync(skillMd, join(iterationDir, "skill-snapshot.md"));
|
|
572
|
-
|
|
573
|
-
// Always disarm a prior run's guard before re-staging, so a crashed run can't
|
|
574
|
-
// leave the write-blocking hook armed across runs.
|
|
575
|
-
teardownGuard(ctx.stageRoot);
|
|
576
|
-
|
|
577
|
-
if (!args.noStage) cleanupStagedSkills(ctx.stageRoot);
|
|
578
|
-
|
|
579
|
-
if (!args.noStage) {
|
|
580
|
-
stageSiblingSkills({
|
|
581
|
-
skillUnderTest: ctx.skillName,
|
|
582
|
-
skillsSourceDir: ctx.skillDir,
|
|
583
|
-
repoRoot: ctx.stageRoot,
|
|
584
|
-
});
|
|
585
|
-
}
|
|
586
|
-
|
|
587
|
-
const bootstrapContent =
|
|
588
|
-
ctx.bootstrapPath !== null ? readFileSync(ctx.bootstrapPath, "utf8") : null;
|
|
589
|
-
|
|
590
|
-
// `--plan-mode` (issue #142): inject the harness's verbatim plan-mode
|
|
591
|
-
// procedure as an operating-context layer. The profile is a bundled asset
|
|
592
|
-
// resolved relative to this runner (mirroring the guard-script resolution
|
|
593
|
-
// below) and keyed by harness, so a harness without a profile simply has no
|
|
594
|
-
// `--plan-mode` and the portable dispatch contract is unchanged.
|
|
595
|
-
const planModeContent = args.planMode
|
|
596
|
-
? resolvePlanModeProfile(ctx.harness)
|
|
597
|
-
: null;
|
|
598
|
-
if (args.planMode)
|
|
599
|
-
console.log(
|
|
600
|
-
` plan-mode: injecting ${ctx.harness} plan-mode profile as operating context (issue #142; necessary-not-sufficient fidelity layer)`,
|
|
601
|
-
);
|
|
602
|
-
|
|
603
|
-
// Sibling skill metadata, shared across conditions. Empty when --no-stage
|
|
604
|
-
// (nothing is staged, so nothing is discoverable to list).
|
|
605
|
-
const siblingSkills: AvailableSkill[] = args.noStage
|
|
606
|
-
? []
|
|
607
|
-
: ctx.siblingSkillNames.map((name) => {
|
|
608
|
-
const p = join(ctx.skillDir, name, "SKILL.md");
|
|
609
|
-
return { name, path: p, description: getSkillDescription(p) };
|
|
610
|
-
});
|
|
611
|
-
|
|
612
|
-
// `--stage-name` overrides the conspicuous `slow-powers-eval-…` slug with a
|
|
613
|
-
// verbatim name (issue #144 Step 2: A/B a natural name against the eval slug).
|
|
614
|
-
// It targets the single staging condition, so reject the case where both
|
|
615
|
-
// conditions stage (e.g. revision mode) — one name can't cover two dirs — and
|
|
616
|
-
// refuse to clobber a dir that already exists (a real project skill the user
|
|
617
|
-
// owns; cleanup has already removed our own prior custom dirs by this point).
|
|
618
|
-
if (args.stageName !== undefined && !args.noStage) {
|
|
619
|
-
if (skillPathForA !== null && skillPathForB !== null) {
|
|
620
|
-
die(
|
|
621
|
-
"--stage-name is only supported when exactly one condition stages the skill (e.g. --mode new-skill); both conditions stage here.",
|
|
622
|
-
);
|
|
623
|
-
}
|
|
624
|
-
const target = join(ctx.stageRoot, ".claude", "skills", args.stageName);
|
|
625
|
-
if (existsSync(target)) {
|
|
626
|
-
die(
|
|
627
|
-
`--stage-name "${args.stageName}": ${target} already exists; refusing to clobber it. Remove it or choose a different name.`,
|
|
628
|
-
);
|
|
629
|
-
}
|
|
630
|
-
}
|
|
631
|
-
|
|
632
|
-
const stageFor = (
|
|
633
|
-
condName: string,
|
|
634
|
-
condSkillPath: string | null,
|
|
635
|
-
): string | null => {
|
|
636
|
-
if (!condSkillPath || args.noStage) return null;
|
|
637
|
-
return stageSkillForCC({
|
|
638
|
-
content: readFileSync(condSkillPath, "utf8"),
|
|
639
|
-
iteration,
|
|
640
|
-
condition: condName,
|
|
641
|
-
skillName: ctx.skillName,
|
|
642
|
-
repoRoot: ctx.stageRoot,
|
|
643
|
-
assetsDir: dirname(condSkillPath),
|
|
644
|
-
stageNameOverride: args.stageName,
|
|
645
|
-
});
|
|
646
|
-
};
|
|
647
|
-
|
|
648
|
-
const conditionASlug = stageFor(conditionA, skillPathForA);
|
|
649
|
-
const conditionBSlug = stageFor(conditionB, skillPathForB);
|
|
650
|
-
|
|
651
|
-
// A custom-named dir isn't caught by cleanupStagedSkills's prefix scan; record
|
|
652
|
-
// it in the sibling manifest so the next run removes it.
|
|
653
|
-
if (
|
|
654
|
-
args.stageName !== undefined &&
|
|
655
|
-
(conditionASlug === args.stageName || conditionBSlug === args.stageName)
|
|
656
|
-
) {
|
|
657
|
-
registerStagedSkillForCleanup(ctx.stageRoot, args.stageName);
|
|
658
|
-
}
|
|
659
|
-
|
|
660
|
-
const conditions: ConditionsRecord = {
|
|
661
|
-
mode: args.mode,
|
|
662
|
-
baseline: args.baseline,
|
|
663
|
-
conditions: [
|
|
664
|
-
{
|
|
665
|
-
name: conditionA,
|
|
666
|
-
skill_path: skillPathForA,
|
|
667
|
-
staged_skill_slug: conditionASlug,
|
|
668
|
-
},
|
|
669
|
-
{
|
|
670
|
-
name: conditionB,
|
|
671
|
-
skill_path: skillPathForB,
|
|
672
|
-
staged_skill_slug: conditionBSlug,
|
|
673
|
-
},
|
|
674
|
-
],
|
|
675
|
-
timestamp: new Date().toISOString(),
|
|
676
|
-
harness: ctx.harness,
|
|
677
|
-
run_nonce: runNonce,
|
|
678
|
-
};
|
|
679
|
-
writeJson(join(iterationDir, "conditions.json"), conditions);
|
|
680
|
-
|
|
681
|
-
// availableSkills for a condition = siblings + the skill-under-test when
|
|
682
|
-
// that condition loads it. Empty when nothing was staged.
|
|
683
|
-
const availableSkillsFor = (
|
|
684
|
-
condSkillPath: string | null,
|
|
685
|
-
): AvailableSkill[] => {
|
|
686
|
-
if (args.noStage) return [];
|
|
687
|
-
const skills = [...siblingSkills];
|
|
688
|
-
if (condSkillPath) {
|
|
689
|
-
skills.push({
|
|
690
|
-
name: ctx.skillName,
|
|
691
|
-
path: condSkillPath,
|
|
692
|
-
description: getSkillDescription(condSkillPath),
|
|
693
|
-
});
|
|
694
|
-
}
|
|
695
|
-
return skills;
|
|
696
|
-
};
|
|
697
|
-
|
|
698
|
-
const tasks: DispatchTask[] = [];
|
|
699
|
-
for (const ev of selectedEvals) {
|
|
700
|
-
const evalDir = join(iterationDir, `eval-${ev.id}`);
|
|
701
|
-
ensureDir(evalDir);
|
|
702
|
-
|
|
703
|
-
for (const [condName, condSkillPath, condSlug] of [
|
|
704
|
-
[conditionA, skillPathForA, conditionASlug],
|
|
705
|
-
[conditionB, skillPathForB, conditionBSlug],
|
|
706
|
-
] as const) {
|
|
707
|
-
const condDir = join(evalDir, condName);
|
|
708
|
-
const outputsDir = join(condDir, "outputs");
|
|
709
|
-
ensureDir(outputsDir);
|
|
710
|
-
|
|
711
|
-
const fixtures = copyFixtures(ev, skillDir, condDir);
|
|
712
|
-
tasks.push(
|
|
713
|
-
buildDispatchTask({
|
|
714
|
-
evalId: ev.id,
|
|
715
|
-
condition: condName,
|
|
716
|
-
skillPath: condSkillPath,
|
|
717
|
-
stagedSkillSlug: condSlug,
|
|
718
|
-
userPrompt: ev.prompt,
|
|
719
|
-
fixtures,
|
|
720
|
-
outputsDir,
|
|
721
|
-
condDir,
|
|
722
|
-
bootstrapContent,
|
|
723
|
-
planModeContent,
|
|
724
|
-
skillName: ctx.skillName,
|
|
725
|
-
availableSkills: availableSkillsFor(condSkillPath),
|
|
726
|
-
runTag,
|
|
727
|
-
}),
|
|
728
|
-
);
|
|
729
|
-
}
|
|
730
|
-
}
|
|
731
|
-
|
|
732
|
-
const manifestPath = join(iterationDir, "dispatch-manifest.md");
|
|
733
|
-
writeFileSync(
|
|
734
|
-
manifestPath,
|
|
735
|
-
buildManifest({
|
|
736
|
-
skillName: ctx.skillName,
|
|
737
|
-
mode: args.mode,
|
|
738
|
-
baseline: args.baseline,
|
|
739
|
-
iteration,
|
|
740
|
-
tasks,
|
|
741
|
-
}),
|
|
742
|
-
);
|
|
743
|
-
|
|
744
|
-
// Write each prompt to its own file and reference it by path in dispatch.json.
|
|
745
|
-
// The orchestrator then dispatches with a short "read this file" prompt instead
|
|
746
|
-
// of reproducing the full prompt verbatim per Task call.
|
|
747
|
-
for (const task of tasks) {
|
|
748
|
-
writeFileSync(task.dispatch_prompt_path, task.dispatch_prompt);
|
|
749
|
-
}
|
|
750
|
-
|
|
751
|
-
const dispatchJsonPath = join(iterationDir, "dispatch.json");
|
|
752
|
-
writeJson(dispatchJsonPath, {
|
|
753
|
-
skill_name: ctx.skillName,
|
|
754
|
-
iteration,
|
|
755
|
-
run_nonce: runNonce,
|
|
756
|
-
iteration_dir: iterationDir,
|
|
757
|
-
mode: args.mode,
|
|
758
|
-
baseline: args.baseline ?? null,
|
|
759
|
-
plan_mode: args.planMode,
|
|
760
|
-
conditions: conditions.conditions,
|
|
761
|
-
harness: ctx.harness,
|
|
762
|
-
tasks: tasks.map(({ dispatch_prompt: _omit, ...rest }) => rest),
|
|
763
|
-
});
|
|
764
|
-
|
|
765
|
-
// Opt-in hard guard. Stages a PreToolUse hook that blocks subagent
|
|
766
|
-
// writes/installs outside the eval sandbox while dispatches run.
|
|
767
|
-
if (args.guard && !args.dryRun) {
|
|
768
|
-
if (args.noStage) {
|
|
769
|
-
console.warn(
|
|
770
|
-
"\n⚠ --guard requires staging enabled; skipping guard install.",
|
|
771
|
-
);
|
|
772
|
-
} else {
|
|
773
|
-
const guardScriptPath = join(import.meta.dir, "guard", "guard.ts");
|
|
774
|
-
installGuard({
|
|
775
|
-
stageRoot: ctx.stageRoot,
|
|
776
|
-
workspaceRoot: ctx.workspaceRoot,
|
|
777
|
-
guardScriptPath,
|
|
778
|
-
});
|
|
779
|
-
console.log(
|
|
780
|
-
"\n🛡 Write guard armed: a PreToolUse hook is staged in .claude/settings.local.json\n" +
|
|
781
|
-
" and will block writes/installs outside the eval sandbox during dispatches.\n" +
|
|
782
|
-
" It auto-expires in 6h and is removed on the next run; to remove it now:\n" +
|
|
783
|
-
" bun run evals:teardown-guard --skill <name>",
|
|
784
|
-
);
|
|
785
|
-
}
|
|
786
|
-
}
|
|
787
|
-
|
|
788
|
-
// Plugin-shadow preflight (Claude Code): a staged skill name that is also
|
|
789
|
-
// discoverable from an enabled plugin or the global skills dir contaminates the
|
|
790
|
-
// run — subagents inherit this session's plugins, so both copies are reachable.
|
|
791
|
-
// The runner can't unload a plugin from a live session; it only flags it. The
|
|
792
|
-
// report is persisted so the aggregator can surface it in validity_warnings.
|
|
793
|
-
if (ctx.harness === "claude-code") {
|
|
794
|
-
const shadowReport = detectPluginShadows({
|
|
795
|
-
configDir: resolveConfigDir(),
|
|
796
|
-
cwd: ctx.stageRoot,
|
|
797
|
-
stagedSkillNames: [ctx.skillName, ...ctx.siblingSkillNames],
|
|
798
|
-
});
|
|
799
|
-
if (shadowReport.shadowed.length > 0) {
|
|
800
|
-
writeJson(join(iterationDir, "plugin-shadow.json"), shadowReport);
|
|
801
|
-
console.warn(formatShadowBanner(shadowReport));
|
|
802
|
-
}
|
|
803
|
-
}
|
|
804
|
-
|
|
805
|
-
console.log(`\nWorkspace prepared: ${iterationDir}`);
|
|
806
|
-
console.log(`Dispatch manifest: ${manifestPath}`);
|
|
807
|
-
console.log(`Dispatch tasks: ${dispatchJsonPath}`);
|
|
808
|
-
console.log(
|
|
809
|
-
`\n${tasks.length} dispatches required (${selectedEvals.length} evals × 2 conditions).`,
|
|
810
|
-
);
|
|
811
|
-
|
|
812
|
-
if (args.dryRun) console.log("\n--dry-run: stopping after workspace prep.");
|
|
813
|
-
else
|
|
814
|
-
console.log(
|
|
815
|
-
"\nNext: read dispatch.json and dispatch each task as a subagent. Then run `ingest --iteration <N> --subagents-dir <path>` (Claude Code), or write run.json + timing.json to the paths in each task by hand and run the chained steps individually (transcript-less harnesses).",
|
|
816
|
-
);
|
|
817
|
-
}
|
|
818
|
-
|
|
819
|
-
/**
 * One unit of work for the orchestrator: a single (eval, condition) pair,
 * as produced by `buildDispatchTask`.
 */
type DispatchTask = {
  /** Id of the eval this task belongs to. */
  eval_id: string;
  /** Name of the condition (arm) this task runs under. */
  condition: string;
  /** Path of the skill file this condition loads, or null when it loads none. */
  skill_path: string | null;
  /** Identifier the skill was staged under, or null when nothing was staged. */
  staged_skill_slug: string | null;
  /** The eval's prompt, passed through as the user request. */
  user_prompt: string;
  /** Paths of the fixture files made available to the subagent. */
  fixtures: string[];
  /** Directory the subagent is told to write its outputs into. */
  outputs_dir: string;
  /** Location of this task's `run.json` (inside the condition directory). */
  run_record_path: string;
  /** Location of this task's `timing.json` (inside the condition directory). */
  timing_path: string;
  /** `<eval_id>:<condition>`, with `:<runTag>` appended when a run tag is supplied. */
  agent_description: string;
  /**
   * Absolute path to the file holding the full dispatch prompt. The orchestrator
   * dispatches each subagent with a short "read this file and follow it" prompt
   * rather than inlining the prompt, so it never has to reproduce ~KB of text per
   * Task call. `dispatch_prompt` carries the same text in-memory (for manifest
   * building and unit tests) but is stripped from the serialized dispatch.json.
   */
  dispatch_prompt_path: string;
  /** Full prompt text, kept in memory only (see `dispatch_prompt_path`). */
  dispatch_prompt: string;
};
|
|
840
|
-
|
|
841
|
-
export type { AvailableSkill } from "./types";
|
|
842
|
-
|
|
843
|
-
/**
 * Narrows `evals` to the subset chosen with `--only` or `--skip`.
 *
 * - Neither flag given: the input array is returned as-is.
 * - Both flags given, or an empty id list: throws.
 * - Any id that is not present in `evals`: throws, listing the unknown ids
 *   and the ids that are available, so a typo fails loudly up front.
 *
 * The result always follows the order of `evals`, never the order in which
 * ids were passed on the command line.
 *
 * @param evals Every eval defined in the config.
 * @param opts  Ids to keep (`only`) or to drop (`skip`); mutually exclusive.
 * @returns The selected evals.
 * @throws Error on conflicting flags, an empty id list, or unknown ids.
 */
export function selectEvals(
  evals: Eval[],
  opts: { only?: string[]; skip?: string[] },
): Eval[] {
  const { only, skip } = opts;
  if (only && skip) {
    throw new Error("use only one of --only / --skip, not both");
  }

  const requested = only ?? skip;
  if (requested === undefined) return evals;
  if (requested.length === 0) {
    throw new Error("--only/--skip requires at least one eval id");
  }

  const knownIds = new Set(evals.map((e) => e.id));
  const unknownIds = requested.filter((id) => !knownIds.has(id));
  if (unknownIds.length) {
    throw new Error(
      `unknown eval id(s): ${unknownIds.join(", ")}. ` +
        `Available ids: ${[...knownIds].join(", ")}`,
    );
  }

  // `--only` keeps the requested ids, `--skip` keeps everything else.
  const requestedIds = new Set(requested);
  const keepRequested = Boolean(only);
  return evals.filter((e) => requestedIds.has(e.id) === keepRequested);
}
|
|
875
|
-
|
|
876
|
-
/**
 * Copies an eval's fixture files into `<condDir>/inputs`.
 *
 * Each entry in `ev.files` is resolved against `<skillDir>/evals` and copied
 * under its base name only; directories are copied recursively. A fixture
 * that does not exist is reported through `die`.
 *
 * @param ev       Eval whose `files` list names the fixtures.
 * @param skillDir Directory whose `evals` subdirectory holds the fixtures.
 * @param condDir  Condition directory that receives the `inputs` folder.
 * @returns Destination paths in the order of `ev.files`; an empty array (and
 *          no `inputs` folder) when the eval lists no files.
 */
function copyFixtures(ev: Eval, skillDir: string, condDir: string): string[] {
  const fixtureNames = ev.files;
  if (!fixtureNames || fixtureNames.length === 0) return [];

  const inputsDir = join(condDir, "inputs");
  ensureDir(inputsDir);

  return fixtureNames.map((relPath) => {
    const source = join(skillDir, "evals", relPath);
    if (!existsSync(source)) die(`fixture not found: ${source}`);

    const destination = join(inputsDir, basename(relPath));
    const isDirectory = statSync(source).isDirectory();
    if (isDirectory) {
      cpSync(source, destination, { recursive: true });
    } else {
      cpSync(source, destination);
    }
    return destination;
  });
}
|
|
891
|
-
|
|
892
|
-
/**
 * Loads the plan-mode procedure profile for a harness (issue #142).
 *
 * The profile is looked up at `profiles/<harness>/plan-mode.md` relative to
 * this runner's own directory (`import.meta.dir`). When no such file exists
 * the failure is reported through `die` with the expected path, instead of
 * silently running without the plan-mode layer.
 *
 * @param harness Harness whose profile should be loaded.
 * @returns The profile file's contents, read as UTF-8.
 */
function resolvePlanModeProfile(harness: Harness): string {
  const pathSegments = ["profiles", harness, "plan-mode.md"];
  const profilePath = join(import.meta.dir, ...pathSegments);

  const profileMissing = !existsSync(profilePath);
  if (profileMissing) {
    const message =
      `--plan-mode: no plan-mode profile exists for harness '${harness}' ` +
      `(expected ${profilePath}). This is a Claude-tier fidelity layer; a ` +
      "harness without a profile leaves the portable dispatch contract unchanged.";
    die(message);
  }

  return readFileSync(profilePath, "utf8");
}
|
|
917
|
-
|
|
918
|
-
/**
 * Extracts the one-line `description:` value from a skill file.
 *
 * The key must start its line (leading indentation is tolerated), so a key
 * that merely ends in "description:" — e.g. `short-description:` — or prose
 * that mentions "description:" mid-line is not mistaken for it. A value
 * wrapped in matching single or double quotes is returned without the quotes.
 *
 * @param skillPath Path of the skill file to read.
 * @returns The description text, or "No description available." when the
 *          file cannot be read or has no `description:` line.
 */
function getSkillDescription(skillPath: string): string {
  try {
    const content = readFileSync(skillPath, "utf8");
    const match = content.match(/^[ \t]*description:\s*([^\n\r]+)/m);
    if (match) {
      let desc = match[1].trim();
      const doubleQuoted = desc.startsWith('"') && desc.endsWith('"');
      const singleQuoted = desc.startsWith("'") && desc.endsWith("'");
      if (doubleQuoted || singleQuoted) {
        desc = desc.slice(1, -1).trim();
      }
      return desc;
    }
  } catch {
    // Best-effort lookup: an unreadable or missing file falls through to the
    // placeholder below rather than failing the caller.
  }
  return "No description available.";
}
|
|
935
|
-
|
|
936
|
-
/**
 * Strips the skill-under-test's list entry out of bootstrap text, so that a
 * condition running without the skill sees no mention of it there.
 *
 * An entry is a top-level `*` or `-` bullet whose line contains the
 * backticked `skillName`, together with the indented, non-blank lines that
 * directly follow it. The first line that is not such a continuation ends
 * the entry; every other line is kept unchanged.
 *
 * @param content   Bootstrap text to scrub.
 * @param skillName Name of the skill whose entry should be removed.
 * @returns The bootstrap text without that entry.
 */
export function redactSkillFromBootstrap(
  content: string,
  skillName: string,
): string {
  const backtickedName = `\`${skillName}\``;
  let insideDroppedEntry = false;

  const keptLines = content.split("\n").filter((line) => {
    // Indented continuation lines belong to the entry being removed.
    if (insideDroppedEntry && /^\s+\S/.test(line)) return false;

    // Any other line ends the previous entry; it may itself open a new one.
    insideDroppedEntry =
      /^[*-]\s/.test(line) && line.includes(backtickedName);
    return !insideDroppedEntry;
  });

  return keptLines.join("\n");
}
|
|
965
|
-
|
|
966
|
-
/**
 * Assembles the dispatch task for one (eval, condition) pair: the paths the
 * orchestrator needs plus the full prompt text handed to the subagent.
 *
 * The prompt is built from up to four parts, concatenated in this order:
 *   1. the bootstrap file wrapped in `<session-start-context>` (if supplied),
 *   2. the available-skills block (if it renders to anything),
 *   3. the plan-mode context block (if `planModeContent` is supplied),
 *   4. the task framing: skill note, fixtures, output rules, user request.
 *
 * @returns The task record; `dispatch_prompt` holds the assembled prompt.
 */
export function buildDispatchTask(opts: {
  evalId: string;
  condition: string;
  skillPath: string | null;
  stagedSkillSlug: string | null;
  userPrompt: string;
  fixtures: string[];
  outputsDir: string;
  condDir: string;
  bootstrapContent: string | null;
  /**
   * Plan-mode procedure profile text to inject as an operating-context
   * block (set by `--plan-mode`, issue #142); null/undefined omits the block.
   * It does not depend on the skill, so it is passed through unredacted.
   */
  planModeContent?: string | null;
  skillName: string;
  availableSkills: AvailableSkill[];
  /**
   * Per-run uniqueness suffix appended to the dispatch description so
   * descriptions stay distinct across iterations and re-runs. Optional; when
   * absent the description is just `<evalId>:<condition>`.
   */
  runTag?: string;
}): DispatchTask {
  const {
    evalId,
    condition,
    skillPath,
    stagedSkillSlug,
    userPrompt,
    fixtures,
    outputsDir,
    condDir,
    bootstrapContent,
    planModeContent,
    skillName,
    availableSkills,
    runTag,
  } = opts;

  // Sort a copy by name so the listing order never depends on the caller.
  const stagedSkills = availableSkills
    .slice()
    .sort((a, b) => a.name.localeCompare(b.name));

  // Decides what, if anything, the task framing says about the skill.
  const describeSkill = (): string => {
    if (stagedSkillSlug) {
      // Staged copy: only name the identifier it is registered under. No
      // instruction to invoke it — whether to use the skill is left to the
      // skill's own triggering (issues #119 / #144).
      return `The \`${skillName}\` skill is registered under the identifier \`${stagedSkillSlug}\` and is discoverable via the Skill tool. If you invoke it, use that identifier.`;
    }
    if (skillPath) {
      // Not staged: inline the skill file itself into the prompt.
      const inlinedLines = [
        "The following skill is loaded into your operating guidelines. Apply it where relevant to the user's request.",
        "",
        `<skill name="${basename(dirname(skillPath))}">`,
        readFileSync(skillPath, "utf8").trim(),
        "</skill>",
      ];
      return inlinedLines.join("\n");
    }
    if (stagedSkills.length > 0 || bootstrapContent) {
      // Skill-absent arm with other session context present: say nothing,
      // so the prompt does not point at the skill that is meant to be absent.
      return "";
    }
    return "No skill is loaded. Respond as you naturally would.";
  };
  const skillBlock = describeSkill();

  let fixturesBlock = "Available fixture files: none";
  if (fixtures.length) {
    const listing = fixtures.map((f) => ` - ${f}`).join("\n");
    fixturesBlock = `Available fixture files:\n${listing}`;
  }

  // A condition that loads no skill must not see the skill named in the
  // bootstrap either, so scrub its entry before embedding the bootstrap.
  let effectiveBootstrap = bootstrapContent;
  if (bootstrapContent && !skillPath && !stagedSkillSlug) {
    effectiveBootstrap = redactSkillFromBootstrap(bootstrapContent, skillName);
  }

  const sections: string[] = [];

  if (effectiveBootstrap) {
    const bootstrapLines = [
      "<session-start-context>",
      "The following guidelines were loaded at session start by the slow-powers plugin",
      "(equivalent to the SessionStart hook firing in a real user's environment):",
      "",
      effectiveBootstrap.trim(),
      "</session-start-context>",
    ];
    sections.push(`${bootstrapLines.join("\n")}\n`);
  }

  const availableSkillsBlock = renderAvailableSkillsBlock(stagedSkills);
  if (availableSkillsBlock) {
    sections.push(`${availableSkillsBlock}\n\n`);
  }

  // Plan-mode context sits after the session-start surfaces and before the
  // task framing (issue #142).
  const planModeBlock = planModeContent
    ? renderPlanModeContext(planModeContent)
    : "";
  if (planModeBlock) {
    sections.push(`${planModeBlock}\n\n`);
  }

  const taskLines = [
    "You are executing a single test case for a skill evaluation framework.",
    "Treat this as a real user request — do NOT optimize behavior for the eval.",
    ...(skillBlock ? ["", skillBlock] : []),
    "",
    fixturesBlock,
    `Output directory: ${outputsDir}`,
    "",
    "Instructions:",
    "- Write any files you produce into the output directory.",
    `- After completing the task, write your final user-facing response to ${outputsDir}/final-message.md.`,
    "- Do not write outside the output directory.",
    "",
    "User request:",
    userPrompt,
  ];
  sections.push(taskLines.join("\n"));

  const descriptionParts = runTag
    ? [evalId, condition, runTag]
    : [evalId, condition];

  return {
    eval_id: evalId,
    condition,
    skill_path: skillPath,
    staged_skill_slug: stagedSkillSlug,
    user_prompt: userPrompt,
    fixtures,
    outputs_dir: outputsDir,
    run_record_path: join(condDir, "run.json"),
    timing_path: join(condDir, "timing.json"),
    agent_description: descriptionParts.join(":"),
    dispatch_prompt_path: join(condDir, "dispatch-prompt.txt"),
    dispatch_prompt: sections.join(""),
  };
}
|
|
1117
|
-
|
|
1118
|
-
/**
 * Renders the human-readable dispatch manifest (markdown) for one iteration:
 * a usage header followed by one section per task with its `run.json` /
 * `timing.json` paths and the full dispatch prompt in a fenced code block.
 *
 * The fence around each prompt is sized to be longer than any run of
 * backticks inside that prompt, so a prompt that itself contains fenced code
 * (for example an inlined skill file) cannot close the block early. Prompts
 * without such runs still get the usual three-backtick fence.
 *
 * @param opts.skillName Skill being evaluated (shown in the title).
 * @param opts.mode      Run mode, shown in the header.
 * @param opts.baseline  Optional baseline label, shown next to the mode.
 * @param opts.iteration Iteration number (shown in the title).
 * @param opts.tasks     Tasks to list, in dispatch order.
 * @returns The manifest text.
 */
function buildManifest(opts: {
  skillName: string;
  mode: Mode;
  baseline?: string;
  iteration: number;
  tasks: DispatchTask[];
}): string {
  // Smallest backtick fence (at least three) that `text` cannot terminate.
  const fenceFor = (text: string): string => {
    const runs = text.match(/`+/g) ?? [];
    const longestRun = runs.reduce(
      (longest, run) => Math.max(longest, run.length),
      0,
    );
    return "`".repeat(Math.max(3, longestRun + 1));
  };

  const header = [
    `# Dispatch manifest — ${opts.skillName} iteration-${opts.iteration}`,
    "",
    `Mode: ${opts.mode}${opts.baseline ? ` (baseline: ${opts.baseline})` : ""}`,
    `Generated: ${new Date().toISOString()}`,
    `Total dispatches: ${opts.tasks.length}`,
    "",
    "## How to use this manifest",
    "",
    'In an agent session, read `dispatch.json` (sibling of this file) instead of this manifest. Each task has a `dispatch_prompt_path` field pointing at the file that holds the full prompt — dispatch the subagent with a short "read this file and follow it" instruction rather than inlining the prompt — plus exact paths for `run.json` and `timing.json`.',
    "",
    "**Transcript correlation:** Each task has an `agent_description` field of the form `<eval_id>:<condition>:i<N>-<nonce>`. When dispatching the subagent via the host's primitive (e.g. Claude Code's Agent tool), pass this string verbatim as the dispatch `description` — do not reconstruct it. The per-run nonce keeps descriptions unique across iterations sharing one session's subagents dir, so the transcript adapter correlates each subagent's persisted transcript back to the right `(eval, condition)` slot without collisions.",
    "",
    "After all dispatches (Claude Code):",
    "",
    '1. Run `bun run evals:ingest -- --skill <name> --iteration <N> --subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/` — a fixed-order chain of record-runs (assembles every task\'s `run.json` from `dispatch.json` + the subagent\'s own `outputs/final-message.md` + the persisted transcript, and backfills `timing.json` with transcript-derived tokens/duration; never clobbers an existing record), fill-transcripts, detect-stray-writes, and grade. Optional higher-fidelity timing: write `{ "total_tokens": <n>, "duration_ms": <n>, "source": "completion-event" }` from the task completion event to `timing.json` right after a dispatch — completion-event numbers always win over the backfill.',
    "2. Dispatch the judge tasks ingest lists, then run `bun run evals:finalize -- --skill <name> --iteration <N>` for the benchmark.",
    "",
    "On a harness without persisted transcripts, instead write each task's `run.json` (matching `skills/evaluating-skills/schema/run-record.schema.json`, enforced at runtime by grade/fill-transcripts/detect-stray-writes) and `timing.json` by hand when its subagent returns: carry over `eval_id`, `condition`, `skill_path` (`null` on the without_skill arm), `prompt`, and `files` from the task; populate `final_message` from the subagent's reply; leave `tool_invocations` as `[]`; capture `total_tokens`/`duration_ms` from the task completion event immediately — they may not be persisted anywhere else.",
    "",
    "## Dispatches",
    "",
  ].join("\n");

  const entries = opts.tasks
    .map((t) => {
      const fence = fenceFor(t.dispatch_prompt);
      return [
        `### ${t.eval_id} / ${t.condition}`,
        "",
        `- run.json: ${t.run_record_path}`,
        `- timing.json: ${t.timing_path}`,
        "",
        fence,
        t.dispatch_prompt,
        fence,
        "",
      ].join("\n");
    })
    .join("\n");

  return header + entries;
}
|
|
1167
|
-
|
|
1168
|
-
// ---------------------------------------------------------------------------
|
|
1169
|
-
// ingest / finalize — fixed-order orchestrators over the sibling commands.
|
|
1170
|
-
//
|
|
1171
|
-
// The eval loop has exactly two points where only the in-harness agent can act
|
|
1172
|
-
// (dispatching eval subagents, dispatching judge subagents). Everything between
|
|
1173
|
-
// them is mechanical, so each stretch is one command: `ingest` runs the
|
|
1174
|
-
// post-dispatch chain and stops at the judge hand-off; `finalize` runs the
|
|
1175
|
-
// post-judge chain and prints the benchmark. No workspace-state inference —
|
|
1176
|
-
// each always runs the same steps in the same order, and every sub-step keeps
|
|
1177
|
-
// its own skip-if-done guard, so re-running after a fix is safe.
|
|
1178
|
-
// ---------------------------------------------------------------------------
|
|
1179
|
-
|
|
1180
|
-
/** One orchestrated sub-step: a label printed before it runs, plus the argv to spawn. */
export type StepCommand = { label: string; argv: string[] };
|
|
1181
|
-
|
|
1182
|
-
/**
 * Builds the fixed, ordered command list behind `ingest`:
 * record-runs → fill-transcripts → detect-stray-writes → grade.
 *
 * Every step runs `bun run <runnerDir>/<step>.ts` with the shared
 * `--skill-dir` / `--skill` / `--iteration` flags; the first two steps also
 * receive `--subagents-dir`. fill-transcripts still follows record-runs so
 * that a pre-existing, agent-written `run.json` with empty
 * `tool_invocations` gets filled as well.
 *
 * @returns The steps, in the order they must run.
 */
export function buildIngestCommands(opts: {
  runnerDir: string;
  skillDir: string;
  skill: string;
  iteration: number;
  subagentsDir: string;
}): StepCommand[] {
  const { runnerDir, skillDir, skill, iteration, subagentsDir } = opts;

  const commonFlags = [
    "--skill-dir",
    skillDir,
    "--skill",
    skill,
    "--iteration",
    String(iteration),
  ];
  const transcriptFlags = [...commonFlags, "--subagents-dir", subagentsDir];

  // Each step's label doubles as the name of the script that implements it.
  const step = (label: string, flags: string[]): StepCommand => ({
    label,
    argv: ["bun", "run", join(runnerDir, `${label}.ts`), ...flags],
  });

  return [
    step("record-runs", transcriptFlags),
    step("fill-transcripts", transcriptFlags),
    step("detect-stray-writes", commonFlags),
    step("grade", commonFlags),
  ];
}
|
|
1221
|
-
|
|
1222
|
-
/**
 * Builds the fixed, ordered command list behind `finalize`:
 * `grade --finalize` followed by `aggregate`.
 *
 * Both steps run through `bun run` against a script in `runnerDir` and
 * receive the same `--skill-dir` / `--skill` / `--iteration` flags.
 *
 * @returns The steps, in the order they must run.
 */
export function buildFinalizeCommands(opts: {
  runnerDir: string;
  skillDir: string;
  skill: string;
  iteration: number;
}): StepCommand[] {
  const commonFlags = [
    "--skill-dir",
    opts.skillDir,
    "--skill",
    opts.skill,
    "--iteration",
    String(opts.iteration),
  ];

  // argv for one runner script: common flags first, then step-specific ones.
  const bunRun = (scriptFile: string, ...extraFlags: string[]): string[] => [
    "bun",
    "run",
    join(opts.runnerDir, scriptFile),
    ...commonFlags,
    ...extraFlags,
  ];

  const gradeStep: StepCommand = {
    label: "grade --finalize",
    argv: bunRun("grade.ts", "--finalize"),
  };
  const aggregateStep: StepCommand = {
    label: "aggregate",
    argv: bunRun("aggregate.ts"),
  };
  return [gradeStep, aggregateStep];
}
|
|
1253
|
-
|
|
1254
|
-
/**
 * Executes `steps` one after another and stops at the first step whose exit
 * code is not zero. Stopping matters: grading after a failed record/fill
 * step would run without `tool_invocations`, and the `__skill_invoked`
 * check would then quietly fall back from its code-check to an LLM judge.
 *
 * @param steps Steps to run, in order. Each label is printed before it runs.
 * @param spawn Runs one step and returns its exit code. The default spawns
 *              the step's argv synchronously with inherited stdout/stderr
 *              and treats a missing exit code as 1.
 * @returns `{ failedAt }` — the label of the failing step, or null when
 *          every step exited with 0.
 */
export function runSteps(
  steps: StepCommand[],
  spawn: (step: StepCommand) => number = (step) => {
    const result = Bun.spawnSync(step.argv, {
      stdout: "inherit",
      stderr: "inherit",
    });
    return result.exitCode ?? 1;
  },
): { failedAt: string | null } {
  // `find` stops at the first failing step, so later steps never start.
  const failedStep = steps.find((step) => {
    console.log(`\n── ${step.label} ──`);
    return spawn(step) !== 0;
  });
  return { failedAt: failedStep === undefined ? null : failedStep.label };
}
|
|
1272
|
-
|
|
1273
|
-
/**
 * `ingest` subcommand: runs the ingest step chain for one iteration and then
 * tells the user what to do next.
 *
 * Requires `args.iteration` and `args.subagentsDir`; exits through `die` when
 * either is missing or when any ingest step fails. After the chain succeeds
 * it reads `total_tasks` from
 * `<workspaceRoot>/<skill>/iteration-<N>/judge-tasks.json` to decide whether
 * judge tasks still need dispatching before finalize.
 *
 * @param args parsed CLI arguments (`iteration`, `subagentsDir`)
 * @param ctx  run context supplying `skillDir`, `skillName`, `workspaceRoot`
 */
function commandIngest(args: Args, ctx: RunContext): void {
  if (args.iteration === undefined) die("ingest requires --iteration <N>");
  if (!args.subagentsDir)
    die(
      "ingest requires --subagents-dir <path> (Claude Code persists subagent transcripts under ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/)",
    );
  const { failedAt } = runSteps(
    buildIngestCommands({
      runnerDir: import.meta.dir,
      skillDir: ctx.skillDir,
      skill: ctx.skillName,
      iteration: args.iteration,
      subagentsDir: args.subagentsDir,
    }),
  );
  if (failedAt)
    die(
      `ingest stopped at '${failedAt}'. Fix the failure and re-run ingest — completed steps skip work that's already done.`,
    );

  const judgeTasksPath = join(
    ctx.workspaceRoot,
    ctx.skillName,
    `iteration-${args.iteration}`,
    "judge-tasks.json",
  );
  // null means "count unknown": the file was unreadable or had no total_tasks.
  let totalTasks: number | null = null;
  try {
    totalTasks =
      readJson<{ total_tasks?: number }>(judgeTasksPath).total_tasks ?? null;
  } catch {
    // grade always writes judge-tasks.json; treat a read failure as unknown.
  }

  const finalizeHint = `bun run evals:finalize -- --skill ${ctx.skillName} --iteration ${args.iteration}`;
  if (totalTasks === 0) {
    console.log(
      `\n✅ Ingest complete — no judge dispatches needed.\nNext: ${finalizeHint}`,
    );
  } else {
    // Only print the count (with its trailing space) when it is known, so an
    // unknown count does not leave a double space in "Dispatch the  judge".
    const countLabel = totalTasks === null ? "" : `${totalTasks} `;
    console.log(
      `\n✅ Ingest complete. Dispatch the ${countLabel}judge task(s) grade listed above (judge-tasks.json), then:\n ${finalizeHint}`,
    );
  }
}
|
|
1316
|
-
|
|
1317
|
-
/**
 * `finalize` subcommand: runs the finalize step chain (built by
 * `buildFinalizeCommands`) for one iteration and prints the teardown hint.
 *
 * Exits through `die` when `args.iteration` is missing or when any step in
 * the chain fails.
 *
 * @param args parsed CLI arguments (`iteration`)
 * @param ctx  run context supplying `skillDir` and `skillName`
 */
function commandFinalize(args: Args, ctx: RunContext): void {
  if (args.iteration === undefined) die("finalize requires --iteration <N>");

  const steps = buildFinalizeCommands({
    runnerDir: import.meta.dir,
    skillDir: ctx.skillDir,
    skill: ctx.skillName,
    iteration: args.iteration,
  });
  const outcome = runSteps(steps);
  if (outcome.failedAt) {
    die(
      `finalize stopped at '${outcome.failedAt}'. Fix the failure and re-run finalize.`,
    );
  }

  console.log(
    `\n✅ Finalize complete. Read the benchmark above, then tear down: bun run evals:teardown --skill ${ctx.skillName}`,
  );
}
|
|
1335
|
-
|
|
1336
|
-
// CLI entry point: only runs when this file is executed directly.
if (import.meta.main) {
  const argv = Bun.argv.slice(2);
  const args = parseArgs(argv);
  let ctx: RunContext;
  try {
    ctx = detectRunContext(argv);
  } catch (err) {
    // Surface context-detection failures as a plain message.
    die(err instanceof Error ? err.message : String(err));
  }

  switch (args.command) {
    case "snapshot":
      commandSnapshot(args, ctx);
      break;

    case "ingest":
      commandIngest(args, ctx);
      break;

    case "finalize":
      commandFinalize(args, ctx);
      break;

    case "teardown-guard": {
      // Only disarm the write guard; leave staged skills and workspace alone.
      if (teardownGuard(ctx.stageRoot)) {
        console.log("🛡 Write guard removed.");
      } else {
        console.log("No write guard was installed — nothing to remove.");
      }
      break;
    }

    case "teardown": {
      // Full end-of-run teardown: disarm the guard, remove the staged skill
      // set (and prune a `.claude` the runner emptied), then reclaim the
      // workspace — leaving the user's own `.claude/settings.json`,
      // pre-existing project skills, and any uncommitted eval results intact.
      const guardRemoved = teardownGuard(ctx.stageRoot);
      cleanupStagedSkills(ctx.stageRoot);
      const { removedIterations, removedSnapshots, keptIterations } =
        cleanupWorkspace(ctx.workspaceRoot, ctx.skillName);

      const guardNote = guardRemoved ? " and write guard disarmed" : "";
      console.log(
        `🧹 Eval teardown complete: staged skill set removed${guardNote}.`,
      );

      if (removedIterations.length + removedSnapshots.length > 0) {
        console.log(
          ` Reclaimed ${removedIterations.length} workspace iteration(s) and ${removedSnapshots.length} reproducible snapshot(s).`,
        );
      }

      if (keptIterations.length > 0) {
        // One bullet per iteration that was kept, with the reason it was kept.
        const keptList = keptIterations
          .map((k) => ` - ${k.iteration} (${k.reason})`)
          .join("\n");
        console.warn(
          [
            `⚠ Kept ${keptIterations.length} workspace iteration(s) with results not yet committed:`,
            keptList,
            ` Commit them, e.g.:`,
            ` bun run evals:promote-baseline --skill ${ctx.skillName} --iteration <N>`,
            ` or delete ${join("skills-workspace", ctx.skillName)}/ manually to discard.`,
          ].join("\n"),
        );
      }
      break;
    }

    default:
      // Any other command falls through to the regular run.
      commandRun(args, ctx);
  }
}
|