@slowdini/slow-powers-opencode 0.1.5 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -13
- package/package.json +5 -1
- package/skills/auditing-slow-powers-usage/evals/evals.json +3 -3
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
- package/skills/evaluating-skills/SKILL.md +22 -20
- package/skills/evaluating-skills/examples/{verification-before-completion-evals.json → verifying-development-work-evals.json} +2 -2
- package/skills/evaluating-skills/harness-details/claude.md +51 -15
- package/skills/evaluating-skills/harness-parity.md +155 -0
- package/skills/evaluating-skills/pressure-scenarios.md +1 -1
- package/skills/evaluating-skills/runner/README.md +28 -19
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
- package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
- package/skills/evaluating-skills/runner/aggregate.ts +21 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
- package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
- package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
- package/skills/evaluating-skills/runner/record-runs.ts +209 -0
- package/skills/evaluating-skills/runner/run.test.ts +523 -0
- package/skills/evaluating-skills/runner/run.ts +376 -17
- package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
- package/skills/evaluating-skills/runner/types.ts +9 -0
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
- package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
- package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
- package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
- package/skills/hardening-plans/SKILL.md +1 -1
- package/skills/systematic-debugging/SKILL.md +4 -0
- package/skills/test-driven-development/SKILL.md +2 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
- package/skills/verifying-development-work/SKILL.md +99 -0
- package/skills/verifying-development-work/code-review.md +68 -0
- package/skills/verifying-development-work/comment-review.md +85 -0
- package/skills/verifying-development-work/evals/baseline/BASELINE.md +23 -0
- package/skills/verifying-development-work/evals/baseline/NOTES.md +87 -0
- package/skills/verifying-development-work/evals/baseline/benchmark.json +54 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/evals.json +178 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.ts +24 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.test.ts +25 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.ts +18 -0
- package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.test.ts +19 -0
- package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.ts +24 -0
- package/skills/working-in-isolation/SKILL.md +2 -2
- package/skills/writing-skills/SKILL.md +2 -3
- package/skills/finishing-a-development-branch/SKILL.md +0 -96
- package/skills/finishing-a-development-branch/evals/evals.json +0 -41
- package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +0 -4
- package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +0 -5
- package/skills/verification-before-completion/SKILL.md +0 -65
- package/skills/verification-before-completion/evals/baseline/BASELINE.md +0 -22
- package/skills/verification-before-completion/evals/baseline/NOTES.md +0 -75
- package/skills/verification-before-completion/evals/baseline/benchmark.json +0 -51
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verification-before-completion/evals/evals.json +0 -77
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/api.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/consumer.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/tsconfig.json +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.test.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.ts +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env bun
|
|
2
2
|
import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
3
|
-
import { join } from "node:path";
|
|
3
|
+
import { join, relative, resolve } from "node:path";
|
|
4
4
|
import { detectRunContext } from "./context";
|
|
5
5
|
import { classifyBash, isUnder, pathArg, WRITE_TOOLS } from "./sandbox-policy";
|
|
6
6
|
import type { ConditionsRecord, RunRecord, ToolInvocation } from "./types";
|
|
@@ -71,6 +71,81 @@ export function detectStrayWrites(
|
|
|
71
71
|
return { violations, warnings };
|
|
72
72
|
}
|
|
73
73
|
|
|
74
|
+
/**
 * Names of the read-only tools whose arguments carry a target path (the path
 * is extracted with `pathArg`).
 */
const READ_TOOLS = new Set([
  "Read",
  "Glob",
  "Grep",
]);

/** Reason text attached to every live-source-read finding. */
const LIVE_SOURCE_REASON =
  "reads the live skill source instead of its staged copy — the arm may be contaminated";
|
|
79
|
+
|
|
80
|
+
/**
 * Backslash-escape every regular-expression metacharacter in `s` so the
 * result can be embedded in a `RegExp` source and match `s` literally.
 */
function escapeRegExp(s: string): string {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metaChars, (ch) => `\\${ch}`);
}
|
|
83
|
+
|
|
84
|
+
/**
 * Flag tool invocations that read the **live** skill-under-test directory.
 *
 * Eval subagents are only ever meant to see the *staged* copy of the skill
 * (`.claude/skills/<slug>/`, or the inlined SKILL.md under `--no-stage`). A
 * read of the live source typically means the Skill tool couldn't resolve the
 * staged slug yet (mid-session registry refresh race) and the agent improvised
 * — fatal in revision mode, where the old_skill arm then reads new-skill
 * content. Reads are detected, not blocked: the guard stays read-permissive,
 * so this surfaces post-hoc as a validity warning.
 *
 * - Read-tool calls (Read/Glob/Grep) whose path arg resolves under the live
 *   dir are flagged; relative paths resolve against `repoRoot`.
 * - Bash commands that reference the live dir (absolute, or repo-relative
 *   text) are flagged. A staged copy under `.claude/skills/` can carry the
 *   same `skills/<name>` relative text (e.g. via `--stage-name`), so that
 *   prefix is excluded.
 * - The repo-relative text match is skipped when the live dir is outside
 *   `repoRoot` or *is* `repoRoot`: in the latter case the relative path is the
 *   empty string, which would otherwise match almost any command.
 *
 * @param invocations Tool calls to inspect (only name, args and ordinal are read).
 * @param liveSkillDir Live skill directory; resolved to an absolute path.
 * @param repoRoot Repository root used to derive the repo-relative form.
 * @returns One finding per offending invocation, in input order.
 */
export function detectLiveSourceReads(
  invocations: Array<Pick<ToolInvocation, "name" | "args" | "ordinal">>,
  liveSkillDir: string,
  repoRoot: string,
): StrayFinding[] {
  const findings: StrayFinding[] = [];
  const liveDir = resolve(liveSkillDir);
  const rel = relative(repoRoot, liveDir);

  // `relative` yields "" when both paths are the same directory and a path
  // whose FIRST SEGMENT is ".." when the target lies outside `repoRoot`.
  // Compare the whole segment so an in-repo dir named e.g. "..foo" still counts.
  const firstSegment = rel.split(/[\\/]/)[0];
  const hasRelativeForm = rel !== "" && firstSegment !== "..";
  const relRe = hasRelativeForm
    ? new RegExp(
        // The lookbehind fires at the boundary char itself, so it checks for a
        // bare `.claude` — the `/` is consumed by the boundary group.
        `(?<!\\.claude)(^|[\\s'"=:(/])${escapeRegExp(rel)}(/|[\\s'")]|$)`,
      )
    : null;

  for (const inv of invocations) {
    if (READ_TOOLS.has(inv.name)) {
      const p = pathArg(inv.args);
      if (p && isUnder(p, liveDir, repoRoot)) {
        findings.push({
          tool: inv.name,
          path: p,
          ordinal: inv.ordinal,
          reason: LIVE_SOURCE_REASON,
        });
      }
      continue;
    }

    if (inv.name === "Bash") {
      const args = inv.args as { command?: unknown } | undefined;
      // A missing or non-string command becomes "", which matches neither form.
      const command = typeof args?.command === "string" ? args.command : "";
      if (command.includes(liveDir) || relRe?.test(command)) {
        findings.push({
          tool: "Bash",
          command,
          ordinal: inv.ordinal,
          reason: LIVE_SOURCE_REASON,
        });
      }
    }
  }

  return findings;
}
|
|
148
|
+
|
|
74
149
|
if (import.meta.main) {
|
|
75
150
|
const argv = Bun.argv.slice(2);
|
|
76
151
|
const flag = (name: string): string | undefined => {
|
|
@@ -127,10 +202,12 @@ if (import.meta.main) {
|
|
|
127
202
|
condition: string;
|
|
128
203
|
violations: StrayFinding[];
|
|
129
204
|
warnings: StrayFinding[];
|
|
205
|
+
live_source_reads: StrayFinding[];
|
|
130
206
|
};
|
|
131
207
|
const runs: RunReport[] = [];
|
|
132
208
|
let totalViolations = 0;
|
|
133
209
|
let totalWarnings = 0;
|
|
210
|
+
let totalLiveReads = 0;
|
|
134
211
|
|
|
135
212
|
for (const evalDir of evalDirs) {
|
|
136
213
|
const evalId = evalDir.replace(/^eval-/, "");
|
|
@@ -149,23 +226,38 @@ if (import.meta.main) {
|
|
|
149
226
|
const outputsDir =
|
|
150
227
|
outputsByKey.get(`${evalId}:${cond}`) ?? join(condDir, "outputs");
|
|
151
228
|
const findings = detectStrayWrites(invocations, outputsDir, repoRoot);
|
|
152
|
-
|
|
229
|
+
const liveReads = detectLiveSourceReads(
|
|
230
|
+
invocations,
|
|
231
|
+
ctx.skillSubdir,
|
|
232
|
+
repoRoot,
|
|
233
|
+
);
|
|
234
|
+
if (
|
|
235
|
+
findings.violations.length ||
|
|
236
|
+
findings.warnings.length ||
|
|
237
|
+
liveReads.length
|
|
238
|
+
) {
|
|
153
239
|
runs.push({
|
|
154
240
|
eval_id: evalId,
|
|
155
241
|
condition: cond,
|
|
156
242
|
violations: findings.violations,
|
|
157
243
|
warnings: findings.warnings,
|
|
244
|
+
live_source_reads: liveReads,
|
|
158
245
|
});
|
|
159
246
|
}
|
|
160
247
|
totalViolations += findings.violations.length;
|
|
161
248
|
totalWarnings += findings.warnings.length;
|
|
249
|
+
totalLiveReads += liveReads.length;
|
|
162
250
|
}
|
|
163
251
|
}
|
|
164
252
|
|
|
165
253
|
const report = {
|
|
166
254
|
generated: new Date().toISOString(),
|
|
167
255
|
iteration: Number(iteration),
|
|
168
|
-
totals: {
|
|
256
|
+
totals: {
|
|
257
|
+
violations: totalViolations,
|
|
258
|
+
warnings: totalWarnings,
|
|
259
|
+
live_source_reads: totalLiveReads,
|
|
260
|
+
},
|
|
169
261
|
runs,
|
|
170
262
|
};
|
|
171
263
|
const outPath = join(iterationDir, "stray-writes.json");
|
|
@@ -182,11 +274,15 @@ if (import.meta.main) {
|
|
|
182
274
|
console.warn(
|
|
183
275
|
`⚠ ${r.eval_id}/${r.condition}: Bash ${w.reason} (ordinal ${w.ordinal}): ${w.command}`,
|
|
184
276
|
);
|
|
277
|
+
for (const l of r.live_source_reads)
|
|
278
|
+
console.warn(
|
|
279
|
+
`⚠ ${r.eval_id}/${r.condition}: ${l.tool} read the live skill source (ordinal ${l.ordinal}): ${l.path ?? l.command}`,
|
|
280
|
+
);
|
|
185
281
|
}
|
|
186
|
-
if (totalViolations === 0 && totalWarnings === 0)
|
|
187
|
-
console.log("✓ No out-of-bounds writes detected.");
|
|
282
|
+
if (totalViolations === 0 && totalWarnings === 0 && totalLiveReads === 0)
|
|
283
|
+
console.log("✓ No out-of-bounds writes or live-source reads detected.");
|
|
188
284
|
else
|
|
189
285
|
console.warn(
|
|
190
|
-
`\n${totalViolations} violation(s), ${totalWarnings} warning(s). Runs with violations edited files outside their sandbox — treat those data points as tainted.`,
|
|
286
|
+
`\n${totalViolations} violation(s), ${totalWarnings} warning(s), ${totalLiveReads} live-source read(s). Runs with violations edited files outside their sandbox; runs with live-source reads saw the live skill instead of their staged copy — treat those data points as tainted.`,
|
|
191
287
|
);
|
|
192
288
|
}
|
|
@@ -68,4 +68,61 @@ describe("guard decide", () => {
|
|
|
68
68
|
true,
|
|
69
69
|
);
|
|
70
70
|
});
|
|
71
|
+
|
|
72
|
+
test("denies git worktree add (working tree outside the sandbox)", () => {
  const command = "git worktree add ../wt -b scratch";
  const { allow, reason } = decide("Bash", { command }, marker());
  expect(allow).toBe(false);
  // The denial reason is expected to mention worktrees (case-insensitive).
  expect(reason).toMatch(/worktree/i);
});

test("denies Bash that creates a path under .claude via a non-redirect verb", () => {
  const creatingCommands = ["mkdir -p .claude/foo", "cp out.txt .claude/bar"];
  for (const command of creatingCommands) {
    expect(decide("Bash", { command }, marker()).allow).toBe(false);
  }
});

test("denies Bash that creates a bare skills/ dir", () => {
  const creatingCommands = ["mkdir skills", "cp -r src ./skills"];
  for (const command of creatingCommands) {
    expect(decide("Bash", { command }, marker()).allow).toBe(false);
  }
});

test("still allows reads of .claude (no create verb)", () => {
  const readingCommands = ["cat .claude/settings.json", "ls .claude"];
  for (const command of readingCommands) {
    expect(decide("Bash", { command }, marker()).allow).toBe(true);
  }
});

test("allows a create scoped to the .claude/skills staging root (allowed-root escape)", () => {
  const command = "mkdir -p /work/.claude/skills/staged-x";
  const verdict = decide("Bash", { command }, marker());
  expect(verdict.allow).toBe(true);
});

test("does not flag skills-workspace as a bare skills/ write", () => {
  const command = "mkdir -p /work/skills-workspace/x/outputs";
  const verdict = decide("Bash", { command }, marker());
  expect(verdict.allow).toBe(true);
});
|
|
71
128
|
});
|
|
@@ -8,6 +8,7 @@ import {
|
|
|
8
8
|
} from "node:fs";
|
|
9
9
|
import { tmpdir } from "node:os";
|
|
10
10
|
import { join } from "node:path";
|
|
11
|
+
import { PROMOTED_MARKER } from "./workspace-teardown";
|
|
11
12
|
|
|
12
13
|
const FIXTURE_ROOT = join(tmpdir(), `slow-powers-promote-test-${process.pid}`);
|
|
13
14
|
const PROMOTE_TS = join(import.meta.dir, "promote-baseline.ts");
|
|
@@ -137,6 +138,56 @@ describe("promote-baseline.ts (--skill-dir, isolated CWD)", () => {
|
|
|
137
138
|
expect(provenance).toContain("Judge model | unspecified");
|
|
138
139
|
});
|
|
139
140
|
|
|
141
|
+
test("drops a .promoted.json marker into the iteration dir for teardown", () => {
  const root = join(FIXTURE_ROOT, "promote-marker");

  // Skill source tree: <root>/skill-dir/mr-review/SKILL.md
  const skillDir = join(root, "skill-dir");
  const skillSub = join(skillDir, "mr-review");
  const skillBody = "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n";
  mkdirSync(skillSub, { recursive: true });
  writeFileSync(join(skillSub, "SKILL.md"), skillBody);

  // Workspace the CLI runs in: <root>/work/skills-workspace/mr-review/iteration-3
  const cwd = join(root, "work");
  const iterationDir = join(cwd, "skills-workspace", "mr-review", "iteration-3");
  mkdirSync(iterationDir, { recursive: true });
  writeJson(join(iterationDir, "benchmark.json"), {
    delta: { pass_rate: 0 },
  });

  const argv = [
    "bun",
    "run",
    PROMOTE_TS,
    "--skill-dir",
    skillDir,
    "--skill",
    "mr-review",
    "--iteration",
    "3",
  ];
  const res = Bun.spawnSync(argv, { cwd, stdout: "pipe", stderr: "pipe" });
  // Check stderr first so a failing run reports its message, not just the code.
  expect(res.stderr.toString()).toBe("");
  expect(res.exitCode).toBe(0);

  const markerPath = join(iterationDir, PROMOTED_MARKER);
  expect(existsSync(markerPath)).toBe(true);

  const rawMarker = readFileSync(markerPath, "utf8");
  const marker = JSON.parse(rawMarker) as {
    promoted_at: string;
    baseline_dir: string;
  };
  expect(marker.promoted_at).toBeTruthy();
  expect(marker.baseline_dir).toBe(join(skillSub, "evals", "baseline"));
});
|
|
190
|
+
|
|
140
191
|
test("records agent and judge models in provenance when flags are passed", () => {
|
|
141
192
|
const root = join(FIXTURE_ROOT, "promote-models");
|
|
142
193
|
|
|
@@ -10,6 +10,7 @@ import {
|
|
|
10
10
|
import { join } from "node:path";
|
|
11
11
|
import { detectRunContext } from "./context";
|
|
12
12
|
import type { ConditionsRecord } from "./types";
|
|
13
|
+
import { PROMOTED_MARKER } from "./workspace-teardown";
|
|
13
14
|
|
|
14
15
|
function die(msg: string): never {
|
|
15
16
|
console.error(`error: ${msg}`);
|
|
@@ -120,7 +121,8 @@ export function promoteBaseline(opts: PromoteOptions): {
|
|
|
120
121
|
"`bun run evals:promote-baseline -- --skill " +
|
|
121
122
|
`${opts.skillName} --iteration <N>` +
|
|
122
123
|
"` after aggregating. The ephemeral workspace (run records, timing,",
|
|
123
|
-
"dispatch files, produced outputs) stays gitignored under `skills-workspace
|
|
124
|
+
"dispatch files, produced outputs) stays gitignored under `skills-workspace/`",
|
|
125
|
+
"and is reclaimable by `evals:teardown` once promoted (this commit's marker).",
|
|
124
126
|
"",
|
|
125
127
|
"| Field | Value |",
|
|
126
128
|
"|-------|-------|",
|
|
@@ -141,6 +143,22 @@ export function promoteBaseline(opts: PromoteOptions): {
|
|
|
141
143
|
].join("\n");
|
|
142
144
|
writeFileSync(join(baselineDir, "BASELINE.md"), `${provenance}\n`);
|
|
143
145
|
|
|
146
|
+
// Mark the iteration as committed so `teardown` can safely reclaim its
|
|
147
|
+
// workspace — without this marker teardown preserves the iteration as
|
|
148
|
+
// uncommitted results.
|
|
149
|
+
writeFileSync(
|
|
150
|
+
join(iterationDir, PROMOTED_MARKER),
|
|
151
|
+
`${JSON.stringify(
|
|
152
|
+
{
|
|
153
|
+
promoted_at: new Date().toISOString(),
|
|
154
|
+
baseline_dir: baselineDir,
|
|
155
|
+
commit: head,
|
|
156
|
+
},
|
|
157
|
+
null,
|
|
158
|
+
2,
|
|
159
|
+
)}\n`,
|
|
160
|
+
);
|
|
161
|
+
|
|
144
162
|
return { baselineDir, gradingsCopied };
|
|
145
163
|
}
|
|
146
164
|
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
import { afterEach, beforeEach, describe, expect, test } from "bun:test";
|
|
2
|
+
import {
|
|
3
|
+
existsSync,
|
|
4
|
+
mkdirSync,
|
|
5
|
+
readFileSync,
|
|
6
|
+
rmSync,
|
|
7
|
+
writeFileSync,
|
|
8
|
+
} from "node:fs";
|
|
9
|
+
import { tmpdir } from "node:os";
|
|
10
|
+
import { join } from "node:path";
|
|
11
|
+
import { recordRuns } from "./record-runs";
|
|
12
|
+
import type { RunRecord, TimingRecord } from "./types";
|
|
13
|
+
|
|
14
|
+
// Scratch root for this test file; the pid keeps concurrent runs apart.
const ROOT = join(tmpdir(), ["record-runs-test", process.pid].join("-"));

// Per-test directories, reassigned in `beforeEach`.
let iterationDir: string;
let subagentsDir: string;
|
|
18
|
+
|
|
19
|
+
/**
 * Serializes `lines` as JSON Lines: one `JSON.stringify`-ed entry per line,
 * joined by "\n" and followed by a single trailing "\n" (so an empty list
 * yields just "\n").
 */
function jsonl(lines: object[]): string {
  const rows: string[] = [];
  for (const entry of lines) {
    rows.push(JSON.stringify(entry));
  }
  return rows.join("\n") + "\n";
}
|
|
22
|
+
|
|
23
|
+
/**
 * Builds a minimal four-entry transcript: a user kickoff, an assistant turn
 * that issues one Bash tool call, the matching tool result, and a closing
 * assistant turn whose only content is `finalText`. Both assistant turns
 * carry usage counters, and every entry carries a timestamp.
 */
function transcriptLines(finalText: string): object[] {
  const kickoff = {
    type: "user",
    timestamp: "2026-06-04T10:00:00.000Z",
    message: { role: "user", content: "go" },
  };

  const bashCall = {
    type: "tool_use",
    id: "toolu_1",
    name: "Bash",
    input: { command: "ls" },
  };
  const toolCallTurn = {
    type: "assistant",
    timestamp: "2026-06-04T10:00:10.000Z",
    message: {
      id: "msg_1",
      role: "assistant",
      usage: {
        input_tokens: 100,
        output_tokens: 20,
        cache_creation_input_tokens: 30,
        cache_read_input_tokens: 50,
      },
      content: [bashCall],
    },
  };

  const toolResultTurn = {
    type: "user",
    timestamp: "2026-06-04T10:00:12.000Z",
    message: {
      role: "user",
      content: [{ type: "tool_result", tool_use_id: "toolu_1", content: "ok" }],
    },
  };

  const closingTurn = {
    type: "assistant",
    timestamp: "2026-06-04T10:01:00.000Z",
    message: {
      id: "msg_2",
      role: "assistant",
      usage: {
        input_tokens: 200,
        output_tokens: 40,
        cache_creation_input_tokens: 0,
        cache_read_input_tokens: 60,
      },
      content: [{ type: "text", text: finalText }],
    },
  };

  return [kickoff, toolCallTurn, toolResultTurn, closingTurn];
}
|
|
80
|
+
|
|
81
|
+
// Sum of every usage counter in transcriptLines:
// msg_1 (100 + 20 + 30 + 50) plus msg_2 (200 + 40 + 0 + 60) = 500.
const TRANSCRIPT_TOKENS = 100 + 20 + 30 + 50 + (200 + 40 + 0 + 60);
// Span between the first (10:00:00.000) and last (10:01:00.000) timestamps.
const TRANSCRIPT_DURATION_MS =
  Date.parse("2026-06-04T10:01:00.000Z") -
  Date.parse("2026-06-04T10:00:00.000Z");
|
|
85
|
+
|
|
86
|
+
/**
 * Writes a subagent fixture into `subagentsDir`: `<name>.meta.json` holding
 * the agent type and `description`, then `<name>.jsonl` holding `lines` in
 * JSON Lines form.
 */
function writeSubagent(name: string, description: string, lines: object[]) {
  const metaPath = join(subagentsDir, `${name}.meta.json`);
  const transcriptPath = join(subagentsDir, `${name}.jsonl`);
  const meta = { agentType: "general-purpose", description };

  writeFileSync(metaPath, JSON.stringify(meta));
  writeFileSync(transcriptPath, jsonl(lines));
}
|
|
93
|
+
|
|
94
|
+
/** One eval/condition slot to lay out on disk with `writeIteration`. */
type FixtureTask = {
  eval_id: string;
  condition: string;
  /** Written to outputs/final-message.md when present. */
  finalMessage?: string;
};
|
|
99
|
+
|
|
100
|
+
/**
 * Lays out one `eval-<id>/<condition>/outputs` directory per task under
 * `iterationDir` (writing `final-message.md` when the task provides one),
 * then writes `dispatch.json` shaped like run.ts serializes it.
 *
 * @returns The serialized task entries, in the same order as `tasks`.
 */
function writeIteration(tasks: FixtureTask[]) {
  const serialized = [];

  for (const task of tasks) {
    const condDir = join(iterationDir, `eval-${task.eval_id}`, task.condition);
    const outputsDir = join(condDir, "outputs");
    mkdirSync(outputsDir, { recursive: true });

    if (task.finalMessage !== undefined) {
      writeFileSync(join(outputsDir, "final-message.md"), task.finalMessage);
    }

    // The without_skill arm has no staged skill, so both skill fields are null.
    const hasSkill = task.condition !== "without_skill";
    serialized.push({
      eval_id: task.eval_id,
      condition: task.condition,
      skill_path: hasSkill ? "/staged/skill/SKILL.md" : null,
      staged_skill_slug: hasSkill ? "test-slug" : null,
      user_prompt: `Do the ${task.eval_id} task`,
      fixtures: [join(condDir, "inputs", "fixture.txt")],
      outputs_dir: outputsDir,
      run_record_path: join(condDir, "run.json"),
      timing_path: join(condDir, "timing.json"),
      agent_description: `${task.eval_id}:${task.condition}:i1-nonce1`,
      dispatch_prompt_path: join(condDir, "dispatch-prompt.txt"),
    });
  }

  const dispatch = { run_nonce: "nonce1", tasks: serialized };
  writeFileSync(
    join(iterationDir, "dispatch.json"),
    JSON.stringify(dispatch, null, 2),
  );
  return serialized;
}
|
|
130
|
+
|
|
131
|
+
/** Reads and parses `eval-<evalId>/<condition>/run.json` under `iterationDir`. */
function readRun(evalId: string, condition: string): RunRecord {
  const runPath = join(iterationDir, `eval-${evalId}`, condition, "run.json");
  const raw = readFileSync(runPath, "utf8");
  return JSON.parse(raw);
}
|
|
139
|
+
|
|
140
|
+
/** Reads and parses `eval-<evalId>/<condition>/timing.json` under `iterationDir`. */
function readTiming(evalId: string, condition: string): TimingRecord {
  const timingPath = join(
    iterationDir,
    `eval-${evalId}`,
    condition,
    "timing.json",
  );
  const raw = readFileSync(timingPath, "utf8");
  return JSON.parse(raw);
}
|
|
148
|
+
|
|
149
|
+
// Fresh, randomly named iteration and subagent dirs under ROOT for each test.
beforeEach(() => {
  const randomSuffix = () => Math.random().toString(36).slice(2);
  iterationDir = join(ROOT, `iter-${randomSuffix()}`);
  subagentsDir = join(ROOT, `sub-${randomSuffix()}`);
  for (const dir of [iterationDir, subagentsDir]) {
    mkdirSync(dir, { recursive: true });
  }
});

// Remove the whole scratch root after each test; `force` tolerates a missing dir.
afterEach(() => {
  rmSync(ROOT, { recursive: true, force: true });
});
|
|
157
|
+
|
|
158
|
+
describe("recordRuns", () => {
  // Agent descriptions as `writeIteration` serializes them for eval "crash".
  const WITH_SKILL_AGENT = "crash:with_skill:i1-nonce1";
  const WITHOUT_SKILL_AGENT = "crash:without_skill:i1-nonce1";

  /** Path of a file inside the eval-crash/with_skill slot of the current iteration. */
  const slotFile = (file: string): string =>
    join(iterationDir, "eval-crash", "with_skill", file);

  test("assembles run.json and timing.json for every task from disk", () => {
    writeIteration([
      { eval_id: "crash", condition: "with_skill", finalMessage: "Fixed it." },
      {
        eval_id: "crash",
        condition: "without_skill",
        finalMessage: "Done, I think.",
      },
    ]);
    writeSubagent("agent-a", WITH_SKILL_AGENT, transcriptLines("unused"));
    writeSubagent("agent-b", WITHOUT_SKILL_AGENT, transcriptLines("unused"));

    const result = recordRuns({ iterationDir, subagentsDir });
    expect(result.recorded).toBe(2);
    expect(result.missingTranscript).toBe(0);

    const withSkill = readRun("crash", "with_skill");
    expect(withSkill.eval_id).toBe("crash");
    expect(withSkill.condition).toBe("with_skill");
    expect(withSkill.skill_path).toBe("/staged/skill/SKILL.md");
    expect(withSkill.prompt).toBe("Do the crash task");
    expect(withSkill.files).toHaveLength(1);
    // final-message.md is present, so it is used rather than the transcript text.
    expect(withSkill.final_message).toBe("Fixed it.");
    expect(withSkill.tool_invocations).toHaveLength(1);
    const [firstInvocation] = withSkill.tool_invocations;
    expect(firstInvocation).toMatchObject({ name: "Bash", ordinal: 0 });

    const withoutSkill = readRun("crash", "without_skill");
    expect(withoutSkill.skill_path).toBeNull();

    const timing = readTiming("crash", "with_skill");
    expect(timing.total_tokens).toBe(TRANSCRIPT_TOKENS);
    expect(timing.duration_ms).toBe(TRANSCRIPT_DURATION_MS);
    expect(timing.source).toBe("transcript");
  });

  test("skips existing run.json without --overwrite, replaces with it", () => {
    const [task] = writeIteration([
      { eval_id: "crash", condition: "with_skill", finalMessage: "New." },
    ]);
    writeSubagent("agent-a", WITH_SKILL_AGENT, transcriptLines("unused"));

    // Pre-existing record that must survive a run without `overwrite`.
    const handWritten = {
      eval_id: "crash",
      condition: "with_skill",
      skill_path: "/staged/skill/SKILL.md",
      prompt: "Do the crash task",
      files: [],
      final_message: "Agent-authored.",
      tool_invocations: [],
    };
    writeFileSync(task.run_record_path, JSON.stringify(handWritten));

    const skipped = recordRuns({ iterationDir, subagentsDir });
    expect(skipped.recorded).toBe(0);
    expect(skipped.skippedExisting).toBe(1);
    const untouched = readRun("crash", "with_skill");
    expect(untouched.final_message).toBe("Agent-authored.");

    const replaced = recordRuns({
      iterationDir,
      subagentsDir,
      overwrite: true,
    });
    expect(replaced.recorded).toBe(1);
    const rewritten = readRun("crash", "with_skill");
    expect(rewritten.final_message).toBe("New.");
  });

  test("backfills timing.json only when absent", () => {
    const [task] = writeIteration([
      { eval_id: "crash", condition: "with_skill", finalMessage: "Done." },
    ]);
    writeSubagent("agent-a", WITH_SKILL_AGENT, transcriptLines("unused"));
    const captured = { total_tokens: 12345, duration_ms: 9000 };
    writeFileSync(task.timing_path, JSON.stringify(captured));

    recordRuns({ iterationDir, subagentsDir });

    // Agent-captured completion-event timing wins; not overwritten.
    const timing = readTiming("crash", "with_skill");
    expect(timing.total_tokens).toBe(12345);
    expect(timing.duration_ms).toBe(9000);
    expect(timing.source).toBeUndefined();
  });

  test("falls back to the transcript's final assistant text when final-message.md is missing", () => {
    writeIteration([{ eval_id: "crash", condition: "with_skill" }]);
    writeSubagent(
      "agent-a",
      WITH_SKILL_AGENT,
      transcriptLines("Closing summary from transcript."),
    );

    const result = recordRuns({ iterationDir, subagentsDir });
    expect(result.recorded).toBe(1);

    const run = readRun("crash", "with_skill");
    expect(run.final_message).toBe("Closing summary from transcript.");
  });

  test("skips the slot entirely when no final-message source exists", () => {
    writeIteration([{ eval_id: "crash", condition: "with_skill" }]);
    // No final-message.md, no transcript.

    const result = recordRuns({ iterationDir, subagentsDir });
    expect(result.recorded).toBe(0);
    expect(result.skippedNoFinalMessage).toBe(1);
    for (const file of ["run.json", "timing.json"]) {
      expect(existsSync(slotFile(file))).toBe(false);
    }
  });

  test("writes run.json with empty invocations and no timing.json when the transcript is missing", () => {
    writeIteration([
      { eval_id: "crash", condition: "with_skill", finalMessage: "Done." },
    ]);
    // final-message.md exists but no subagent transcript matches.

    const result = recordRuns({ iterationDir, subagentsDir });
    expect(result.recorded).toBe(1);
    expect(result.missingTranscript).toBe(1);

    const run = readRun("crash", "with_skill");
    expect(run.final_message).toBe("Done.");
    expect(run.tool_invocations).toEqual([]);
    expect(existsSync(slotFile("timing.json"))).toBe(false);
  });

  test("throws when dispatch.json is absent", () => {
    // Hand-authored/operator runs have no dispatch.json — the manual path owns them.
    const attempt = () => recordRuns({ iterationDir, subagentsDir });
    expect(attempt).toThrow(/dispatch\.json/);
  });
});
|