@slowdini/slow-powers-opencode 0.3.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -72
- package/bootstrap.md +1 -7
- package/opencode/plugins/slow-powers.js +69 -5
- package/package.json +14 -17
- package/skills/evaluating-skills/SKILL.md +90 -338
- package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
- package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
- package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
- package/skills/hardening-plans/SKILL.md +29 -7
- package/skills/hardening-plans/evals/baseline/BASELINE.md +11 -6
- package/skills/hardening-plans/evals/baseline/NOTES.md +72 -58
- package/skills/hardening-plans/evals/baseline/benchmark.json +25 -25
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +2 -2
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +2 -2
- package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__new_skill.json +32 -0
- package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__old_skill.json +32 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +3 -3
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +8 -8
- package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__old_skill.json +39 -0
- package/skills/hardening-plans/evals/evals.json +46 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
- package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
- package/skills/evaluating-skills/harness-details/claude.md +0 -194
- package/skills/evaluating-skills/harness-parity.md +0 -155
- package/skills/evaluating-skills/runner/README.md +0 -163
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
- package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
- package/skills/evaluating-skills/runner/aggregate.ts +0 -269
- package/skills/evaluating-skills/runner/context.test.ts +0 -181
- package/skills/evaluating-skills/runner/context.ts +0 -90
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
- package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
- package/skills/evaluating-skills/runner/grade.test.ts +0 -347
- package/skills/evaluating-skills/runner/grade.ts +0 -603
- package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
- package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
- package/skills/evaluating-skills/runner/guard/install.ts +0 -147
- package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
- package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
- package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
- package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
- package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
- package/skills/evaluating-skills/runner/record-runs.ts +0 -209
- package/skills/evaluating-skills/runner/run.test.ts +0 -1703
- package/skills/evaluating-skills/runner/run.ts +0 -1388
- package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
- package/skills/evaluating-skills/runner/types.ts +0 -121
- package/skills/evaluating-skills/runner/validate-all.ts +0 -54
- package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
- package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
- package/skills/evaluating-skills/runner/validate.test.ts +0 -56
- package/skills/evaluating-skills/runner/validate.ts +0 -21
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
- package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
- package/skills/evaluating-skills/schema/evals.schema.json +0 -105
- package/skills/evaluating-skills/schema/grading.schema.json +0 -84
- package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
- package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
- package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
- package/skills/evaluating-skills/templates/evals.json.example +0 -17
- package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
|
@@ -1,314 +0,0 @@
|
|
|
1
|
-
import { afterEach, beforeEach, describe, expect, test } from "bun:test";
|
|
2
|
-
import {
|
|
3
|
-
existsSync,
|
|
4
|
-
mkdirSync,
|
|
5
|
-
readFileSync,
|
|
6
|
-
rmSync,
|
|
7
|
-
writeFileSync,
|
|
8
|
-
} from "node:fs";
|
|
9
|
-
import { tmpdir } from "node:os";
|
|
10
|
-
import { join } from "node:path";
|
|
11
|
-
import { recordRuns } from "./record-runs";
|
|
12
|
-
import type { RunRecord, TimingRecord } from "./types";
|
|
13
|
-
|
|
14
|
-
// Scratch root for this test file, placed under the OS temp directory and
// suffixed with the current process id.
const ROOT = join(tmpdir(), ["record-runs-test", process.pid].join("-"));

// Assigned to fresh directories under ROOT before every test.
let iterationDir: string;
let subagentsDir: string;
|
|
18
|
-
|
|
19
|
-
/** Serializes each object as one JSON document per line, with a trailing newline. */
function jsonl(lines: object[]): string {
  const encoded = lines.map((entry) => JSON.stringify(entry));
  return encoded.join("\n") + "\n";
}
|
|
22
|
-
|
|
23
|
-
/**
 * Builds a minimal four-entry transcript fixture: a user prompt, an assistant
 * turn issuing one `Bash` tool call, the matching tool result, and a closing
 * assistant turn whose only content is `finalText`. Both assistant turns carry
 * a `usage` block and every entry carries a timestamp.
 */
function transcriptLines(finalText: string): object[] {
  const openingPrompt = {
    type: "user",
    timestamp: "2026-06-04T10:00:00.000Z",
    message: { role: "user", content: "go" },
  };

  const toolCall = {
    type: "tool_use",
    id: "toolu_1",
    name: "Bash",
    input: { command: "ls" },
  };
  const toolCallTurn = {
    type: "assistant",
    timestamp: "2026-06-04T10:00:10.000Z",
    message: {
      id: "msg_1",
      role: "assistant",
      usage: {
        input_tokens: 100,
        output_tokens: 20,
        cache_creation_input_tokens: 30,
        cache_read_input_tokens: 50,
      },
      content: [toolCall],
    },
  };

  const toolResultTurn = {
    type: "user",
    timestamp: "2026-06-04T10:00:12.000Z",
    message: {
      role: "user",
      content: [{ type: "tool_result", tool_use_id: "toolu_1", content: "ok" }],
    },
  };

  const closingTurn = {
    type: "assistant",
    timestamp: "2026-06-04T10:01:00.000Z",
    message: {
      id: "msg_2",
      role: "assistant",
      usage: {
        input_tokens: 200,
        output_tokens: 40,
        cache_creation_input_tokens: 0,
        cache_read_input_tokens: 60,
      },
      content: [{ type: "text", text: finalText }],
    },
  };

  return [openingPrompt, toolCallTurn, toolResultTurn, closingTurn];
}
|
|
80
|
-
|
|
81
|
-
// Sum of every usage field over both assistant turns built by transcriptLines:
// msg_1 → 100 + 20 + 30 + 50 = 200, msg_2 → 200 + 40 + 0 + 60 = 300.
const TRANSCRIPT_TOKENS = 200 + 300;
// First timestamp 10:00:00.000, last timestamp 10:01:00.000 → one minute.
const TRANSCRIPT_DURATION_MS = 60 * 1_000;
|
|
85
|
-
|
|
86
|
-
/**
 * Writes a subagent fixture pair into `subagentsDir`: `<name>.meta.json`
 * (a fixed agent type plus the given `description`) and `<name>.jsonl`
 * (the transcript entries, one JSON document per line).
 */
function writeSubagent(name: string, description: string, lines: object[]) {
  const meta = { agentType: "general-purpose", description };
  const metaPath = join(subagentsDir, `${name}.meta.json`);
  const transcriptPath = join(subagentsDir, `${name}.jsonl`);

  writeFileSync(metaPath, JSON.stringify(meta));
  writeFileSync(transcriptPath, jsonl(lines));
}
|
|
93
|
-
|
|
94
|
-
/** One (eval, condition) slot that a test wants laid out on disk. */
type FixtureTask = {
  /** Eval identifier; the slot lives under `eval-<eval_id>`. */
  eval_id: string;
  /** Condition name, used as the slot's sub-directory. */
  condition: string;
  /** When present, written to `outputs/final-message.md`. */
  finalMessage?: string;
};
|
|
99
|
-
|
|
100
|
-
/**
 * Lays out `eval-<id>/<condition>/outputs` under `iterationDir` for every
 * task, seeds `outputs/final-message.md` when the task supplies one, and
 * writes a `dispatch.json` listing all tasks (shaped like run.ts serializes
 * it). Returns the serialized task entries so tests can reach their paths.
 */
function writeIteration(tasks: FixtureTask[]) {
  const serialized = tasks.map(({ eval_id, condition, finalMessage }) => {
    const slotDir = join(iterationDir, `eval-${eval_id}`, condition);
    const outputs = join(slotDir, "outputs");
    mkdirSync(outputs, { recursive: true });

    if (finalMessage !== undefined) {
      writeFileSync(join(outputs, "final-message.md"), finalMessage);
    }

    // The "without_skill" condition carries no staged skill at all.
    const noSkill = condition === "without_skill";

    return {
      eval_id,
      condition,
      skill_path: noSkill ? null : "/staged/skill/SKILL.md",
      staged_skill_slug: noSkill ? null : "test-slug",
      user_prompt: `Do the ${eval_id} task`,
      fixtures: [join(slotDir, "inputs", "fixture.txt")],
      outputs_dir: outputs,
      run_record_path: join(slotDir, "run.json"),
      timing_path: join(slotDir, "timing.json"),
      agent_description: `${eval_id}:${condition}:i1-nonce1`,
      dispatch_prompt_path: join(slotDir, "dispatch-prompt.txt"),
    };
  });

  const dispatchBody = { run_nonce: "nonce1", tasks: serialized };
  writeFileSync(
    join(iterationDir, "dispatch.json"),
    JSON.stringify(dispatchBody, null, 2),
  );

  return serialized;
}
|
|
130
|
-
|
|
131
|
-
/** Loads and parses the `run.json` stored for the given eval/condition slot. */
function readRun(evalId: string, condition: string): RunRecord {
  const runPath = join(iterationDir, `eval-${evalId}`, condition, "run.json");
  const raw = readFileSync(runPath, "utf8");
  return JSON.parse(raw);
}
|
|
139
|
-
|
|
140
|
-
/** Loads and parses the `timing.json` stored for the given eval/condition slot. */
function readTiming(evalId: string, condition: string): TimingRecord {
  const timingPath = join(
    iterationDir,
    `eval-${evalId}`,
    condition,
    "timing.json",
  );
  const raw = readFileSync(timingPath, "utf8");
  return JSON.parse(raw);
}
|
|
148
|
-
|
|
149
|
-
// Give every test its own randomly named iteration and subagents directories.
beforeEach(() => {
  const randomSuffix = () => Math.random().toString(36).slice(2);
  iterationDir = join(ROOT, `iter-${randomSuffix()}`);
  subagentsDir = join(ROOT, `sub-${randomSuffix()}`);
  for (const dir of [iterationDir, subagentsDir]) {
    mkdirSync(dir, { recursive: true });
  }
});

// Remove the whole scratch root after each test; `force` tolerates it being absent.
afterEach(() => {
  rmSync(ROOT, { recursive: true, force: true });
});
|
|
157
|
-
|
|
158
|
-
describe("recordRuns", () => {
  // Every scenario centres on the same slot: eval "crash", condition "with_skill".
  const WITH_SKILL_AGENT = "crash:with_skill:i1-nonce1";

  /** Registers the with_skill subagent transcript, ending in `finalText`. */
  const seedWithSkillTranscript = (finalText = "unused") =>
    writeSubagent("agent-a", WITH_SKILL_AGENT, transcriptLines(finalText));

  /** Whether `file` exists inside the with_skill slot directory. */
  const slotHas = (file: string) =>
    existsSync(join(iterationDir, "eval-crash", "with_skill", file));

  test("assembles run.json and timing.json for every task from disk", () => {
    writeIteration([
      { eval_id: "crash", condition: "with_skill", finalMessage: "Fixed it." },
      {
        eval_id: "crash",
        condition: "without_skill",
        finalMessage: "Done, I think.",
      },
    ]);
    seedWithSkillTranscript();
    writeSubagent(
      "agent-b",
      "crash:without_skill:i1-nonce1",
      transcriptLines("unused"),
    );

    const outcome = recordRuns({ iterationDir, subagentsDir });
    expect(outcome.recorded).toBe(2);
    expect(outcome.missingTranscript).toBe(0);

    const withSkill = readRun("crash", "with_skill");
    expect(withSkill.eval_id).toBe("crash");
    expect(withSkill.condition).toBe("with_skill");
    expect(withSkill.skill_path).toBe("/staged/skill/SKILL.md");
    expect(withSkill.prompt).toBe("Do the crash task");
    expect(withSkill.files).toHaveLength(1);
    expect(withSkill.final_message).toBe("Fixed it.");
    expect(withSkill.tool_invocations).toHaveLength(1);
    expect(withSkill.tool_invocations[0]).toMatchObject({
      name: "Bash",
      ordinal: 0,
    });

    const withoutSkill = readRun("crash", "without_skill");
    expect(withoutSkill.skill_path).toBeNull();

    const timing = readTiming("crash", "with_skill");
    expect(timing.total_tokens).toBe(TRANSCRIPT_TOKENS);
    expect(timing.duration_ms).toBe(TRANSCRIPT_DURATION_MS);
    expect(timing.source).toBe("transcript");
  });

  test("skips existing run.json without --overwrite, replaces with it", () => {
    const [task] = writeIteration([
      { eval_id: "crash", condition: "with_skill", finalMessage: "New." },
    ]);
    seedWithSkillTranscript();

    // A record that was already on disk before record-runs ran.
    const handWritten = {
      eval_id: "crash",
      condition: "with_skill",
      skill_path: "/staged/skill/SKILL.md",
      prompt: "Do the crash task",
      files: [],
      final_message: "Agent-authored.",
      tool_invocations: [],
    };
    writeFileSync(task.run_record_path, JSON.stringify(handWritten));

    const firstPass = recordRuns({ iterationDir, subagentsDir });
    expect(firstPass.recorded).toBe(0);
    expect(firstPass.skippedExisting).toBe(1);
    const untouched = readRun("crash", "with_skill");
    expect(untouched.final_message).toBe("Agent-authored.");

    const secondPass = recordRuns({
      iterationDir,
      subagentsDir,
      overwrite: true,
    });
    expect(secondPass.recorded).toBe(1);
    const rewritten = readRun("crash", "with_skill");
    expect(rewritten.final_message).toBe("New.");
  });

  test("backfills timing.json only when absent", () => {
    const [task] = writeIteration([
      { eval_id: "crash", condition: "with_skill", finalMessage: "Done." },
    ]);
    seedWithSkillTranscript();
    const captured = { total_tokens: 12345, duration_ms: 9000 };
    writeFileSync(task.timing_path, JSON.stringify(captured));

    recordRuns({ iterationDir, subagentsDir });

    // The pre-existing timing record must survive untouched.
    const timing = readTiming("crash", "with_skill");
    expect(timing.total_tokens).toBe(12345);
    expect(timing.duration_ms).toBe(9000);
    expect(timing.source).toBeUndefined();
  });

  test("falls back to the transcript's final assistant text when final-message.md is missing", () => {
    writeIteration([{ eval_id: "crash", condition: "with_skill" }]);
    seedWithSkillTranscript("Closing summary from transcript.");

    const outcome = recordRuns({ iterationDir, subagentsDir });
    expect(outcome.recorded).toBe(1);

    const run = readRun("crash", "with_skill");
    expect(run.final_message).toBe("Closing summary from transcript.");
  });

  test("skips the slot entirely when no final-message source exists", () => {
    // Neither final-message.md nor a transcript is provided.
    writeIteration([{ eval_id: "crash", condition: "with_skill" }]);

    const outcome = recordRuns({ iterationDir, subagentsDir });
    expect(outcome.recorded).toBe(0);
    expect(outcome.skippedNoFinalMessage).toBe(1);
    expect(slotHas("run.json")).toBe(false);
    expect(slotHas("timing.json")).toBe(false);
  });

  test("writes run.json with empty invocations and no timing.json when the transcript is missing", () => {
    // final-message.md is present, but no subagent transcript matches the slot.
    writeIteration([
      { eval_id: "crash", condition: "with_skill", finalMessage: "Done." },
    ]);

    const outcome = recordRuns({ iterationDir, subagentsDir });
    expect(outcome.recorded).toBe(1);
    expect(outcome.missingTranscript).toBe(1);

    const run = readRun("crash", "with_skill");
    expect(run.final_message).toBe("Done.");
    expect(run.tool_invocations).toEqual([]);
    expect(slotHas("timing.json")).toBe(false);
  });

  test("throws when dispatch.json is absent", () => {
    // Hand-authored/operator runs have no dispatch.json — the manual path owns them.
    const attempt = () => recordRuns({ iterationDir, subagentsDir });
    expect(attempt).toThrow(/dispatch\.json/);
  });
});
|
|
@@ -1,209 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bun
|
|
2
|
-
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
3
|
-
import { join } from "node:path";
|
|
4
|
-
import {
|
|
5
|
-
findByDescription,
|
|
6
|
-
parseTranscriptFull,
|
|
7
|
-
} from "./adapters/claude-code-transcript";
|
|
8
|
-
import { detectRunContext } from "./context";
|
|
9
|
-
import type { RunRecord, TimingRecord } from "./types";
|
|
10
|
-
import { validateAgainstSchema } from "./validate-schema";
|
|
11
|
-
|
|
12
|
-
/** Prints `error: <msg>` on stderr, then terminates the process with exit code 1. */
function die(msg: string): never {
  console.error("error: " + msg);
  process.exit(1);
}
|
|
16
|
-
|
|
17
|
-
/**
 * The subset of a `dispatch.json` task entry that record-runs reads (see
 * DispatchTask in run.ts — `dispatch_prompt` is stripped from the serialized
 * file).
 */
type DispatchTask = {
  /** Copied into `run.json` as `eval_id`. */
  eval_id: string;
  /** Copied into `run.json` as `condition`. */
  condition: string;
  /** Copied into `run.json` as `skill_path`; may be null. */
  skill_path: string | null;
  /** Becomes the `prompt` field of `run.json`. */
  user_prompt: string;
  /** Becomes the `files` field of `run.json`. */
  fixtures: string[];
  /** Directory searched for `final-message.md`. */
  outputs_dir: string;
  /** Where the assembled `run.json` is written. */
  run_record_path: string;
  /** Where `timing.json` is written. */
  timing_path: string;
  /** Description used to look up the matching subagent transcript. */
  agent_description: string;
};
|
|
30
|
-
|
|
31
|
-
/** Per-outcome counters returned by `recordRuns`. */
export type RecordRunsResult = {
  /** Slots for which a `run.json` was written. */
  recorded: number;
  /** Slots left alone because `run.json` already existed and overwrite was off. */
  skippedExisting: number;
  /** Slots skipped because no final message could be sourced. */
  skippedNoFinalMessage: number;
  /** Slots for which no matching subagent transcript was found. */
  missingTranscript: number;
};
|
|
37
|
-
|
|
38
|
-
/**
 * Reports whether the timing record at `timingPath` was written from a
 * transcript, i.e. its `source` field is `"transcript"`. A file that cannot be
 * read or parsed, or that lacks the marker, counts as NOT transcript-derived
 * so that it is never clobbered.
 */
function isTranscriptDerivedTiming(timingPath: string): boolean {
  try {
    const parsed: unknown = JSON.parse(readFileSync(timingPath, "utf8"));
    return (
      typeof parsed === "object" &&
      parsed !== null &&
      (parsed as { source?: unknown }).source === "transcript"
    );
  } catch {
    return false;
  }
}

/**
 * Assembles a `run.json` (and backfills `timing.json`) for every task in the
 * iteration's `dispatch.json`, from sources already on disk:
 *
 * - carry-over fields (`prompt` ← `user_prompt`, `files` ← `fixtures`,
 *   `eval_id`/`condition`/`skill_path`) from `dispatch.json`;
 * - `final_message` from `<outputs_dir>/final-message.md` (the dispatch prompt
 *   instructs the subagent to write it), falling back to the transcript's last
 *   assistant text when that file is missing or contains only whitespace;
 * - `tool_invocations`, tokens, and duration from the persisted Claude Code
 *   transcript (Claude-Code-tier, like fill-transcripts — transcript-less
 *   harnesses keep hand-authoring these records).
 *
 * Existing records win:
 *
 * - an existing `run.json` is skipped unless `overwrite` is set;
 * - `timing.json` is written when absent. Under `overwrite` it is refreshed
 *   only if the existing file is itself transcript-derived
 *   (`source: "transcript"`). Completion-event numbers captured at dispatch
 *   time are never replaced by transcript-derived ones, which include cache
 *   accounting and are not comparable 1:1.
 *
 * A slot with no usable final message (no non-empty `final-message.md` and no
 * transcript text) is skipped entirely: neither file is written.
 *
 * @param opts.iterationDir Directory containing `dispatch.json`.
 * @param opts.subagentsDir Directory searched for subagent transcripts.
 * @param opts.overwrite Replace existing `run.json` files (default `false`).
 * @returns Counters describing what happened to each slot.
 * @throws Error when `dispatch.json` is missing, is not valid JSON, is not a
 *   JSON object, or has a `tasks` value that is not an array. Whatever
 *   `validateAgainstSchema` raises for an invalid record also propagates
 *   (presumably an Error — confirm against validate-schema).
 */
export function recordRuns(opts: {
  iterationDir: string;
  subagentsDir: string;
  overwrite?: boolean;
}): RecordRunsResult {
  const { iterationDir, subagentsDir, overwrite = false } = opts;

  const dispatchPath = join(iterationDir, "dispatch.json");
  if (!existsSync(dispatchPath)) {
    throw new Error(
      `${dispatchPath} not found — record-runs assembles records from dispatch.json and only supports runner-built iterations. For hand-authored runs, write run.json + timing.json manually (see schema/run-record.schema.json).`,
    );
  }

  // Parse defensively so a corrupt file is reported with its path instead of
  // surfacing as a bare SyntaxError/TypeError.
  let parsedDispatch: unknown;
  try {
    parsedDispatch = JSON.parse(readFileSync(dispatchPath, "utf8"));
  } catch (err) {
    throw new Error(
      `${dispatchPath} is not valid JSON: ${err instanceof Error ? err.message : String(err)}`,
    );
  }
  if (typeof parsedDispatch !== "object" || parsedDispatch === null) {
    throw new Error(`${dispatchPath} must contain a JSON object`);
  }
  const rawTasks = (parsedDispatch as { tasks?: unknown }).tasks;
  if (rawTasks != null && !Array.isArray(rawTasks)) {
    throw new Error(`${dispatchPath}: "tasks" must be an array`);
  }
  // A missing `tasks` key means "nothing to record". Individual task shapes
  // are trusted as written by the runner and are not validated here.
  const tasks: DispatchTask[] =
    rawTasks == null ? [] : (rawTasks as DispatchTask[]);

  const result: RecordRunsResult = {
    recorded: 0,
    skippedExisting: 0,
    skippedNoFinalMessage: 0,
    missingTranscript: 0,
  };

  for (const task of tasks) {
    const slot = `${task.eval_id}/${task.condition}`;

    const subagent = findByDescription(subagentsDir, task.agent_description);
    const summary = subagent ? parseTranscriptFull(subagent.jsonlPath) : null;
    if (!subagent) {
      console.warn(
        `miss ${slot}: no subagent transcript with description='${task.agent_description}'`,
      );
      result.missingTranscript++;
    }

    // run.json — skip if the agent/operator already wrote one.
    if (existsSync(task.run_record_path) && !overwrite) {
      console.log(
        `skip ${slot}: run.json already exists (use --overwrite to replace)`,
      );
      result.skippedExisting++;
    } else {
      const finalMessagePath = join(task.outputs_dir, "final-message.md");
      // A whitespace-only file is treated like a missing one, so it can
      // neither block the transcript fallback nor produce a blank record.
      const fromFile = existsSync(finalMessagePath)
        ? readFileSync(finalMessagePath, "utf8").trim()
        : "";

      let finalMessage: string | null = null;
      if (fromFile !== "") {
        finalMessage = fromFile;
      } else if (summary?.final_text) {
        console.warn(
          `warn ${slot}: ${finalMessagePath} missing or empty — using the transcript's last assistant text as final_message`,
        );
        finalMessage = summary.final_text;
      }
      if (finalMessage === null) {
        console.warn(
          `skip ${slot}: no final-message.md content and no transcript text — was this task dispatched? Not writing a blank record.`,
        );
        result.skippedNoFinalMessage++;
        // Skips the timing backfill for this slot as well.
        continue;
      }

      const record: RunRecord = {
        eval_id: task.eval_id,
        condition: task.condition,
        skill_path: task.skill_path,
        prompt: task.user_prompt,
        files: task.fixtures,
        final_message: finalMessage,
        tool_invocations: summary?.tool_invocations ?? [],
        // Timing lives in timing.json; run.json never carries it.
        total_tokens: null,
        duration_ms: null,
      };
      // Validate before writing so an invalid record never reaches disk.
      validateAgainstSchema<RunRecord>(
        "run-record",
        record,
        task.run_record_path,
      );
      writeFileSync(
        task.run_record_path,
        `${JSON.stringify(record, null, 2)}\n`,
      );
      console.log(
        `record ${slot}: wrote run.json with ${record.tool_invocations.length} tool_invocations`,
      );
      result.recorded++;
    }

    // timing.json — backfill only; completion-event numbers always win.
    // `overwrite` may refresh a record this tool wrote earlier, but never one
    // captured by the agent/operator.
    if (summary) {
      const mayWriteTiming =
        !existsSync(task.timing_path) ||
        (overwrite && isTranscriptDerivedTiming(task.timing_path));
      if (mayWriteTiming) {
        const timing: TimingRecord = {
          total_tokens: summary.total_tokens,
          duration_ms: summary.duration_ms,
          source: "transcript",
        };
        writeFileSync(task.timing_path, `${JSON.stringify(timing, null, 2)}\n`);
      }
    }
  }

  return result;
}
|
|
159
|
-
|
|
160
|
-
/**
 * Parses the CLI flags this script owns:
 *
 * - `--iteration <value>` (required)
 * - `--subagents-dir <path>` (required)
 * - `--overwrite` (boolean, present or absent)
 *
 * A flag counts as missing when it is absent, is the last token, or is
 * immediately followed by another `--flag` — so `--iteration --overwrite` is
 * reported as a missing `--iteration` instead of silently using
 * `"--overwrite"` as the iteration value.
 *
 * @param argv Command-line tokens, without the runtime and script path.
 * @returns The two required values plus the `overwrite` switch. Calls `die`
 *   (which does not return) when a required flag has no value.
 */
function parseArgs(argv: string[]) {
  const flag = (name: string): string | undefined => {
    const i = argv.indexOf(`--${name}`);
    if (i === -1) return undefined;
    const value = argv[i + 1];
    // The next token being another flag means this one was given no value.
    if (value === undefined || value.startsWith("--")) return undefined;
    return value;
  };
  const iteration = flag("iteration");
  const subagentsDir = flag("subagents-dir");
  const overwrite = argv.includes("--overwrite");
  if (!iteration) die("missing --iteration");
  if (!subagentsDir)
    die(
      "missing --subagents-dir (e.g. ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/)",
    );
  return { iteration, subagentsDir, overwrite };
}
|
|
176
|
-
|
|
177
|
-
// CLI entry point: runs only when this module is the script being executed.
if (import.meta.main) {
  const cliArgs = Bun.argv.slice(2);
  const { iteration, subagentsDir, overwrite } = parseArgs(cliArgs);
  const ctx = detectRunContext(cliArgs);

  if (!existsSync(subagentsDir)) {
    die(`subagents-dir not found: ${subagentsDir}`);
  }

  // Iterations live at <workspaceRoot>/<skillName>/iteration-<n>.
  const iterationDir = join(
    ctx.workspaceRoot,
    ctx.skillName,
    `iteration-${iteration}`,
  );
  if (!existsSync(iterationDir)) {
    die(`not found: ${iterationDir}`);
  }

  // Report recordRuns failures as a one-line error instead of a stack trace.
  let result: RecordRunsResult;
  try {
    result = recordRuns({ iterationDir, subagentsDir, overwrite });
  } catch (err) {
    die(err instanceof Error ? err.message : String(err));
  }

  const {
    recorded,
    skippedExisting,
    skippedNoFinalMessage,
    missingTranscript,
  } = result;
  console.log(
    `\nRecorded: ${recorded}, skipped (existing run.json): ${skippedExisting}, skipped (no final message): ${skippedNoFinalMessage}, missing transcript: ${missingTranscript}`,
  );
  if (missingTranscript > 0) {
    console.warn(
      "Missing transcripts mean the dispatching agent's dispatch `description` did not match the task's `agent_description` in dispatch.json. Those slots got empty tool_invocations (transcript_check assertions will grade unverifiable) and no transcript-derived timing.",
    );
  }

  // Point the operator at the follow-up commands.
  const { skillName } = ctx;
  console.log(
    `\nNext: bun run evals:detect-stray-writes -- --skill ${skillName} --iteration ${iteration}\nThen: bun run evals:grade -- --skill ${skillName} --iteration ${iteration}`,
  );
}
|