@slowdini/slow-powers-opencode 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85):
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +69 -5
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/hardening-plans/SKILL.md +29 -7
  16. package/skills/hardening-plans/evals/baseline/BASELINE.md +11 -6
  17. package/skills/hardening-plans/evals/baseline/NOTES.md +72 -58
  18. package/skills/hardening-plans/evals/baseline/benchmark.json +25 -25
  19. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +2 -2
  20. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +2 -2
  21. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__new_skill.json +39 -0
  22. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__old_skill.json +39 -0
  23. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__new_skill.json +39 -0
  24. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__old_skill.json +39 -0
  25. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__new_skill.json +32 -0
  26. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__old_skill.json +32 -0
  27. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__new_skill.json +39 -0
  28. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__old_skill.json +39 -0
  29. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__new_skill.json +39 -0
  30. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__old_skill.json +39 -0
  31. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +3 -3
  32. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +8 -8
  33. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__new_skill.json +39 -0
  34. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__old_skill.json +39 -0
  35. package/skills/hardening-plans/evals/evals.json +46 -0
  36. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  37. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  38. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  39. package/skills/evaluating-skills/harness-parity.md +0 -155
  40. package/skills/evaluating-skills/runner/README.md +0 -163
  41. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  42. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  43. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  44. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  45. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  46. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  47. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  48. package/skills/evaluating-skills/runner/context.ts +0 -90
  49. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  50. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  51. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  52. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  53. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  54. package/skills/evaluating-skills/runner/grade.ts +0 -603
  55. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  56. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  57. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  58. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  59. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  60. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  61. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  62. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  63. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  64. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  65. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  66. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  67. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  68. package/skills/evaluating-skills/runner/run.ts +0 -1388
  69. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  70. package/skills/evaluating-skills/runner/types.ts +0 -121
  71. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  72. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  73. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  74. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  75. package/skills/evaluating-skills/runner/validate.ts +0 -21
  76. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  77. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  78. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  79. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  80. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  81. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  82. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  83. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  84. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  85. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -1,314 +0,0 @@
1
- import { afterEach, beforeEach, describe, expect, test } from "bun:test";
2
- import {
3
- existsSync,
4
- mkdirSync,
5
- readFileSync,
6
- rmSync,
7
- writeFileSync,
8
- } from "node:fs";
9
- import { tmpdir } from "node:os";
10
- import { join } from "node:path";
11
- import { recordRuns } from "./record-runs";
12
- import type { RunRecord, TimingRecord } from "./types";
13
-
14
- const ROOT = join(tmpdir(), `record-runs-test-${process.pid}`);
15
-
16
- let iterationDir: string;
17
- let subagentsDir: string;
18
-
19
- function jsonl(lines: object[]): string {
20
- return `${lines.map((l) => JSON.stringify(l)).join("\n")}\n`;
21
- }
22
-
23
- /** A minimal transcript with usage, timestamps, one tool call, and final text. */
24
- function transcriptLines(finalText: string): object[] {
25
- return [
26
- {
27
- type: "user",
28
- timestamp: "2026-06-04T10:00:00.000Z",
29
- message: { role: "user", content: "go" },
30
- },
31
- {
32
- type: "assistant",
33
- timestamp: "2026-06-04T10:00:10.000Z",
34
- message: {
35
- id: "msg_1",
36
- role: "assistant",
37
- usage: {
38
- input_tokens: 100,
39
- output_tokens: 20,
40
- cache_creation_input_tokens: 30,
41
- cache_read_input_tokens: 50,
42
- },
43
- content: [
44
- {
45
- type: "tool_use",
46
- id: "toolu_1",
47
- name: "Bash",
48
- input: { command: "ls" },
49
- },
50
- ],
51
- },
52
- },
53
- {
54
- type: "user",
55
- timestamp: "2026-06-04T10:00:12.000Z",
56
- message: {
57
- role: "user",
58
- content: [
59
- { type: "tool_result", tool_use_id: "toolu_1", content: "ok" },
60
- ],
61
- },
62
- },
63
- {
64
- type: "assistant",
65
- timestamp: "2026-06-04T10:01:00.000Z",
66
- message: {
67
- id: "msg_2",
68
- role: "assistant",
69
- usage: {
70
- input_tokens: 200,
71
- output_tokens: 40,
72
- cache_creation_input_tokens: 0,
73
- cache_read_input_tokens: 60,
74
- },
75
- content: [{ type: "text", text: finalText }],
76
- },
77
- },
78
- ];
79
- }
80
-
81
- // Token math for transcriptLines: msg_1 (100+20+30+50) + msg_2 (200+40+0+60) = 500.
82
- const TRANSCRIPT_TOKENS = 500;
83
- // 10:00:00.000 → 10:01:00.000
84
- const TRANSCRIPT_DURATION_MS = 60_000;
85
-
86
- function writeSubagent(name: string, description: string, lines: object[]) {
87
- writeFileSync(
88
- join(subagentsDir, `${name}.meta.json`),
89
- JSON.stringify({ agentType: "general-purpose", description }),
90
- );
91
- writeFileSync(join(subagentsDir, `${name}.jsonl`), jsonl(lines));
92
- }
93
-
94
- type FixtureTask = {
95
- eval_id: string;
96
- condition: string;
97
- finalMessage?: string; // written to outputs/final-message.md when present
98
- };
99
-
100
- /** Builds an iteration dir + dispatch.json shaped like run.ts serializes it. */
101
- function writeIteration(tasks: FixtureTask[]) {
102
- const serialized = tasks.map((t) => {
103
- const condDir = join(iterationDir, `eval-${t.eval_id}`, t.condition);
104
- const outputsDir = join(condDir, "outputs");
105
- mkdirSync(outputsDir, { recursive: true });
106
- if (t.finalMessage !== undefined) {
107
- writeFileSync(join(outputsDir, "final-message.md"), t.finalMessage);
108
- }
109
- return {
110
- eval_id: t.eval_id,
111
- condition: t.condition,
112
- skill_path:
113
- t.condition === "without_skill" ? null : "/staged/skill/SKILL.md",
114
- staged_skill_slug: t.condition === "without_skill" ? null : "test-slug",
115
- user_prompt: `Do the ${t.eval_id} task`,
116
- fixtures: [join(condDir, "inputs", "fixture.txt")],
117
- outputs_dir: outputsDir,
118
- run_record_path: join(condDir, "run.json"),
119
- timing_path: join(condDir, "timing.json"),
120
- agent_description: `${t.eval_id}:${t.condition}:i1-nonce1`,
121
- dispatch_prompt_path: join(condDir, "dispatch-prompt.txt"),
122
- };
123
- });
124
- writeFileSync(
125
- join(iterationDir, "dispatch.json"),
126
- JSON.stringify({ run_nonce: "nonce1", tasks: serialized }, null, 2),
127
- );
128
- return serialized;
129
- }
130
-
131
- function readRun(evalId: string, condition: string): RunRecord {
132
- return JSON.parse(
133
- readFileSync(
134
- join(iterationDir, `eval-${evalId}`, condition, "run.json"),
135
- "utf8",
136
- ),
137
- );
138
- }
139
-
140
- function readTiming(evalId: string, condition: string): TimingRecord {
141
- return JSON.parse(
142
- readFileSync(
143
- join(iterationDir, `eval-${evalId}`, condition, "timing.json"),
144
- "utf8",
145
- ),
146
- );
147
- }
148
-
149
- beforeEach(() => {
150
- iterationDir = join(ROOT, `iter-${Math.random().toString(36).slice(2)}`);
151
- subagentsDir = join(ROOT, `sub-${Math.random().toString(36).slice(2)}`);
152
- mkdirSync(iterationDir, { recursive: true });
153
- mkdirSync(subagentsDir, { recursive: true });
154
- });
155
-
156
- afterEach(() => rmSync(ROOT, { recursive: true, force: true }));
157
-
158
- describe("recordRuns", () => {
159
- test("assembles run.json and timing.json for every task from disk", () => {
160
- writeIteration([
161
- { eval_id: "crash", condition: "with_skill", finalMessage: "Fixed it." },
162
- {
163
- eval_id: "crash",
164
- condition: "without_skill",
165
- finalMessage: "Done, I think.",
166
- },
167
- ]);
168
- writeSubagent(
169
- "agent-a",
170
- "crash:with_skill:i1-nonce1",
171
- transcriptLines("unused"),
172
- );
173
- writeSubagent(
174
- "agent-b",
175
- "crash:without_skill:i1-nonce1",
176
- transcriptLines("unused"),
177
- );
178
-
179
- const result = recordRuns({ iterationDir, subagentsDir });
180
- expect(result.recorded).toBe(2);
181
- expect(result.missingTranscript).toBe(0);
182
-
183
- const run = readRun("crash", "with_skill");
184
- expect(run.eval_id).toBe("crash");
185
- expect(run.condition).toBe("with_skill");
186
- expect(run.skill_path).toBe("/staged/skill/SKILL.md");
187
- expect(run.prompt).toBe("Do the crash task");
188
- expect(run.files).toHaveLength(1);
189
- expect(run.final_message).toBe("Fixed it.");
190
- expect(run.tool_invocations).toHaveLength(1);
191
- expect(run.tool_invocations[0]).toMatchObject({ name: "Bash", ordinal: 0 });
192
-
193
- expect(readRun("crash", "without_skill").skill_path).toBeNull();
194
-
195
- const timing = readTiming("crash", "with_skill");
196
- expect(timing.total_tokens).toBe(TRANSCRIPT_TOKENS);
197
- expect(timing.duration_ms).toBe(TRANSCRIPT_DURATION_MS);
198
- expect(timing.source).toBe("transcript");
199
- });
200
-
201
- test("skips existing run.json without --overwrite, replaces with it", () => {
202
- const [task] = writeIteration([
203
- { eval_id: "crash", condition: "with_skill", finalMessage: "New." },
204
- ]);
205
- writeSubagent(
206
- "agent-a",
207
- "crash:with_skill:i1-nonce1",
208
- transcriptLines("unused"),
209
- );
210
- const handWritten = {
211
- eval_id: "crash",
212
- condition: "with_skill",
213
- skill_path: "/staged/skill/SKILL.md",
214
- prompt: "Do the crash task",
215
- files: [],
216
- final_message: "Agent-authored.",
217
- tool_invocations: [],
218
- };
219
- writeFileSync(task.run_record_path, JSON.stringify(handWritten));
220
-
221
- const skipped = recordRuns({ iterationDir, subagentsDir });
222
- expect(skipped.recorded).toBe(0);
223
- expect(skipped.skippedExisting).toBe(1);
224
- expect(readRun("crash", "with_skill").final_message).toBe(
225
- "Agent-authored.",
226
- );
227
-
228
- const replaced = recordRuns({
229
- iterationDir,
230
- subagentsDir,
231
- overwrite: true,
232
- });
233
- expect(replaced.recorded).toBe(1);
234
- expect(readRun("crash", "with_skill").final_message).toBe("New.");
235
- });
236
-
237
- test("backfills timing.json only when absent", () => {
238
- const [task] = writeIteration([
239
- { eval_id: "crash", condition: "with_skill", finalMessage: "Done." },
240
- ]);
241
- writeSubagent(
242
- "agent-a",
243
- "crash:with_skill:i1-nonce1",
244
- transcriptLines("unused"),
245
- );
246
- writeFileSync(
247
- task.timing_path,
248
- JSON.stringify({ total_tokens: 12345, duration_ms: 9000 }),
249
- );
250
-
251
- recordRuns({ iterationDir, subagentsDir });
252
-
253
- // Agent-captured completion-event timing wins; not overwritten.
254
- const timing = readTiming("crash", "with_skill");
255
- expect(timing.total_tokens).toBe(12345);
256
- expect(timing.duration_ms).toBe(9000);
257
- expect(timing.source).toBeUndefined();
258
- });
259
-
260
- test("falls back to the transcript's final assistant text when final-message.md is missing", () => {
261
- writeIteration([{ eval_id: "crash", condition: "with_skill" }]);
262
- writeSubagent(
263
- "agent-a",
264
- "crash:with_skill:i1-nonce1",
265
- transcriptLines("Closing summary from transcript."),
266
- );
267
-
268
- const result = recordRuns({ iterationDir, subagentsDir });
269
- expect(result.recorded).toBe(1);
270
- expect(readRun("crash", "with_skill").final_message).toBe(
271
- "Closing summary from transcript.",
272
- );
273
- });
274
-
275
- test("skips the slot entirely when no final-message source exists", () => {
276
- writeIteration([{ eval_id: "crash", condition: "with_skill" }]);
277
- // No final-message.md, no transcript.
278
-
279
- const result = recordRuns({ iterationDir, subagentsDir });
280
- expect(result.recorded).toBe(0);
281
- expect(result.skippedNoFinalMessage).toBe(1);
282
- expect(
283
- existsSync(join(iterationDir, "eval-crash", "with_skill", "run.json")),
284
- ).toBe(false);
285
- expect(
286
- existsSync(join(iterationDir, "eval-crash", "with_skill", "timing.json")),
287
- ).toBe(false);
288
- });
289
-
290
- test("writes run.json with empty invocations and no timing.json when the transcript is missing", () => {
291
- writeIteration([
292
- { eval_id: "crash", condition: "with_skill", finalMessage: "Done." },
293
- ]);
294
- // final-message.md exists but no subagent transcript matches.
295
-
296
- const result = recordRuns({ iterationDir, subagentsDir });
297
- expect(result.recorded).toBe(1);
298
- expect(result.missingTranscript).toBe(1);
299
-
300
- const run = readRun("crash", "with_skill");
301
- expect(run.final_message).toBe("Done.");
302
- expect(run.tool_invocations).toEqual([]);
303
- expect(
304
- existsSync(join(iterationDir, "eval-crash", "with_skill", "timing.json")),
305
- ).toBe(false);
306
- });
307
-
308
- test("throws when dispatch.json is absent", () => {
309
- // Hand-authored/operator runs have no dispatch.json — the manual path owns them.
310
- expect(() => recordRuns({ iterationDir, subagentsDir })).toThrow(
311
- /dispatch\.json/,
312
- );
313
- });
314
- });
@@ -1,209 +0,0 @@
1
- #!/usr/bin/env bun
2
- import { existsSync, readFileSync, writeFileSync } from "node:fs";
3
- import { join } from "node:path";
4
- import {
5
- findByDescription,
6
- parseTranscriptFull,
7
- } from "./adapters/claude-code-transcript";
8
- import { detectRunContext } from "./context";
9
- import type { RunRecord, TimingRecord } from "./types";
10
- import { validateAgainstSchema } from "./validate-schema";
11
-
12
- function die(msg: string): never {
13
- console.error(`error: ${msg}`);
14
- process.exit(1);
15
- }
16
-
17
- /** The dispatch.json task shape record-runs consumes (see DispatchTask in
18
- * run.ts — `dispatch_prompt` is stripped from the serialized file). */
19
- type DispatchTask = {
20
- eval_id: string;
21
- condition: string;
22
- skill_path: string | null;
23
- user_prompt: string;
24
- fixtures: string[];
25
- outputs_dir: string;
26
- run_record_path: string;
27
- timing_path: string;
28
- agent_description: string;
29
- };
30
-
31
- export type RecordRunsResult = {
32
- recorded: number;
33
- skippedExisting: number;
34
- skippedNoFinalMessage: number;
35
- missingTranscript: number;
36
- };
37
-
38
- /**
39
- * Assembles a schema-valid `run.json` (and backfills `timing.json`) for every
40
- * task in the iteration's `dispatch.json`, from sources already on disk:
41
- *
42
- * - carry-over fields (`prompt` ← `user_prompt`, `files` ← `fixtures`,
43
- * `eval_id`/`condition`/`skill_path`) from `dispatch.json`;
44
- * - `final_message` from `<outputs_dir>/final-message.md` (the dispatch prompt
45
- * instructs the subagent to write it), falling back to the transcript's last
46
- * assistant text;
47
- * - `tool_invocations`, tokens, and duration from the persisted Claude Code
48
- * transcript (Claude-Code-tier, like fill-transcripts — transcript-less
49
- * harnesses keep hand-authoring these records).
50
- *
51
- * Existing records always win: an agent/operator-written `run.json` is skipped
52
- * without `overwrite`, and `timing.json` is backfill-only — completion-event
53
- * numbers captured at dispatch time are never replaced by transcript-derived
54
- * ones, which include cache accounting and are not comparable 1:1.
55
- */
56
- export function recordRuns(opts: {
57
- iterationDir: string;
58
- subagentsDir: string;
59
- overwrite?: boolean;
60
- }): RecordRunsResult {
61
- const { iterationDir, subagentsDir, overwrite = false } = opts;
62
-
63
- const dispatchPath = join(iterationDir, "dispatch.json");
64
- if (!existsSync(dispatchPath)) {
65
- throw new Error(
66
- `${dispatchPath} not found — record-runs assembles records from dispatch.json and only supports runner-built iterations. For hand-authored runs, write run.json + timing.json manually (see schema/run-record.schema.json).`,
67
- );
68
- }
69
- const dispatch = JSON.parse(readFileSync(dispatchPath, "utf8")) as {
70
- tasks?: DispatchTask[];
71
- };
72
- const tasks = dispatch.tasks ?? [];
73
-
74
- const result: RecordRunsResult = {
75
- recorded: 0,
76
- skippedExisting: 0,
77
- skippedNoFinalMessage: 0,
78
- missingTranscript: 0,
79
- };
80
-
81
- for (const task of tasks) {
82
- const slot = `${task.eval_id}/${task.condition}`;
83
-
84
- const subagent = findByDescription(subagentsDir, task.agent_description);
85
- const summary = subagent ? parseTranscriptFull(subagent.jsonlPath) : null;
86
- if (!subagent) {
87
- console.warn(
88
- `miss ${slot}: no subagent transcript with description='${task.agent_description}'`,
89
- );
90
- result.missingTranscript++;
91
- }
92
-
93
- // run.json — skip if the agent/operator already wrote one.
94
- if (existsSync(task.run_record_path) && !overwrite) {
95
- console.log(
96
- `skip ${slot}: run.json already exists (use --overwrite to replace)`,
97
- );
98
- result.skippedExisting++;
99
- } else {
100
- const finalMessagePath = join(task.outputs_dir, "final-message.md");
101
- let finalMessage: string | null = null;
102
- if (existsSync(finalMessagePath)) {
103
- finalMessage = readFileSync(finalMessagePath, "utf8").trim();
104
- } else if (summary?.final_text) {
105
- console.warn(
106
- `warn ${slot}: ${finalMessagePath} missing — using the transcript's last assistant text as final_message`,
107
- );
108
- finalMessage = summary.final_text;
109
- }
110
- if (finalMessage === null) {
111
- console.warn(
112
- `skip ${slot}: no final-message.md and no transcript text — was this task dispatched? Not writing a blank record.`,
113
- );
114
- result.skippedNoFinalMessage++;
115
- continue;
116
- }
117
-
118
- const record: RunRecord = {
119
- eval_id: task.eval_id,
120
- condition: task.condition,
121
- skill_path: task.skill_path,
122
- prompt: task.user_prompt,
123
- files: task.fixtures,
124
- final_message: finalMessage,
125
- tool_invocations: summary?.tool_invocations ?? [],
126
- // Timing lives in timing.json; run.json never carries it.
127
- total_tokens: null,
128
- duration_ms: null,
129
- };
130
- validateAgainstSchema<RunRecord>(
131
- "run-record",
132
- record,
133
- task.run_record_path,
134
- );
135
- writeFileSync(
136
- task.run_record_path,
137
- `${JSON.stringify(record, null, 2)}\n`,
138
- );
139
- console.log(
140
- `record ${slot}: wrote run.json with ${record.tool_invocations.length} tool_invocations`,
141
- );
142
- result.recorded++;
143
- }
144
-
145
- // timing.json — backfill only; completion-event numbers always win.
146
- const timingExists = existsSync(task.timing_path);
147
- if (summary && (!timingExists || overwrite)) {
148
- const timing: TimingRecord = {
149
- total_tokens: summary.total_tokens,
150
- duration_ms: summary.duration_ms,
151
- source: "transcript",
152
- };
153
- writeFileSync(task.timing_path, `${JSON.stringify(timing, null, 2)}\n`);
154
- }
155
- }
156
-
157
- return result;
158
- }
159
-
160
- function parseArgs(argv: string[]) {
161
- const flag = (name: string): string | undefined => {
162
- const i = argv.indexOf(`--${name}`);
163
- if (i === -1) return undefined;
164
- return argv[i + 1];
165
- };
166
- const iteration = flag("iteration");
167
- const subagentsDir = flag("subagents-dir");
168
- const overwrite = argv.includes("--overwrite");
169
- if (!iteration) die("missing --iteration");
170
- if (!subagentsDir)
171
- die(
172
- "missing --subagents-dir (e.g. ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/)",
173
- );
174
- return { iteration, subagentsDir, overwrite };
175
- }
176
-
177
- if (import.meta.main) {
178
- const argv = Bun.argv.slice(2);
179
- const { iteration, subagentsDir, overwrite } = parseArgs(argv);
180
- const ctx = detectRunContext(argv);
181
-
182
- if (!existsSync(subagentsDir))
183
- die(`subagents-dir not found: ${subagentsDir}`);
184
-
185
- const iterationDir = join(
186
- ctx.workspaceRoot,
187
- ctx.skillName,
188
- `iteration-${iteration}`,
189
- );
190
- if (!existsSync(iterationDir)) die(`not found: ${iterationDir}`);
191
-
192
- let result: RecordRunsResult;
193
- try {
194
- result = recordRuns({ iterationDir, subagentsDir, overwrite });
195
- } catch (err) {
196
- die(err instanceof Error ? err.message : String(err));
197
- }
198
-
199
- console.log(
200
- `\nRecorded: ${result.recorded}, skipped (existing run.json): ${result.skippedExisting}, skipped (no final message): ${result.skippedNoFinalMessage}, missing transcript: ${result.missingTranscript}`,
201
- );
202
- if (result.missingTranscript > 0)
203
- console.warn(
204
- "Missing transcripts mean the dispatching agent's dispatch `description` did not match the task's `agent_description` in dispatch.json. Those slots got empty tool_invocations (transcript_check assertions will grade unverifiable) and no transcript-derived timing.",
205
- );
206
- console.log(
207
- `\nNext: bun run evals:detect-stray-writes -- --skill ${ctx.skillName} --iteration ${iteration}\nThen: bun run evals:grade -- --skill ${ctx.skillName} --iteration ${iteration}`,
208
- );
209
- }