@slowdini/slow-powers-opencode 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +1 -1
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  16. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  17. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  18. package/skills/evaluating-skills/harness-parity.md +0 -155
  19. package/skills/evaluating-skills/runner/README.md +0 -163
  20. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  21. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  22. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  23. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  24. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  25. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  26. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  27. package/skills/evaluating-skills/runner/context.ts +0 -90
  28. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  29. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  30. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  31. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  32. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  33. package/skills/evaluating-skills/runner/grade.ts +0 -603
  34. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  35. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  36. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  37. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  38. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  39. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  40. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  41. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  42. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  43. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  44. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  45. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  46. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  47. package/skills/evaluating-skills/runner/run.ts +0 -1388
  48. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  49. package/skills/evaluating-skills/runner/types.ts +0 -121
  50. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  51. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  52. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  53. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  54. package/skills/evaluating-skills/runner/validate.ts +0 -21
  55. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  56. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  57. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  58. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  59. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  60. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  61. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  62. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  63. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  64. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -1,314 +0,0 @@
1
- import { afterEach, beforeEach, describe, expect, test } from "bun:test";
2
- import {
3
- existsSync,
4
- mkdirSync,
5
- readFileSync,
6
- rmSync,
7
- writeFileSync,
8
- } from "node:fs";
9
- import { tmpdir } from "node:os";
10
- import { join } from "node:path";
11
- import { recordRuns } from "./record-runs";
12
- import type { RunRecord, TimingRecord } from "./types";
13
-
14
// Scratch root for this test file, namespaced by the current process id.
const ROOT = join(tmpdir(), "record-runs-test-" + process.pid);

// Per-test working directories; both are reassigned in beforeEach.
let iterationDir: string;
let subagentsDir: string;
18
-
19
/** Serializes each object as one JSON line; the result always ends with a newline. */
function jsonl(lines: object[]): string {
  const encoded: string[] = [];
  for (const entry of lines) {
    encoded.push(JSON.stringify(entry));
  }
  return encoded.join("\n") + "\n";
}
22
-
23
/**
 * Builds a minimal four-entry transcript: a user turn, an assistant turn with
 * one Bash tool call, the matching tool result, and a closing assistant turn
 * whose only content is `finalText`. Both assistant turns carry usage counts.
 */
function transcriptLines(finalText: string): object[] {
  // Opening user turn.
  const kickoff = {
    type: "user",
    timestamp: "2026-06-04T10:00:00.000Z",
    message: { role: "user", content: "go" },
  };

  // First assistant turn: a single tool call.
  const toolCall = {
    type: "assistant",
    timestamp: "2026-06-04T10:00:10.000Z",
    message: {
      id: "msg_1",
      role: "assistant",
      usage: {
        input_tokens: 100,
        output_tokens: 20,
        cache_creation_input_tokens: 30,
        cache_read_input_tokens: 50,
      },
      content: [
        {
          type: "tool_use",
          id: "toolu_1",
          name: "Bash",
          input: { command: "ls" },
        },
      ],
    },
  };

  // Tool result, delivered as a user turn.
  const toolResult = {
    type: "user",
    timestamp: "2026-06-04T10:00:12.000Z",
    message: {
      role: "user",
      content: [{ type: "tool_result", tool_use_id: "toolu_1", content: "ok" }],
    },
  };

  // Closing assistant turn carrying the final text.
  const closing = {
    type: "assistant",
    timestamp: "2026-06-04T10:01:00.000Z",
    message: {
      id: "msg_2",
      role: "assistant",
      usage: {
        input_tokens: 200,
        output_tokens: 40,
        cache_creation_input_tokens: 0,
        cache_read_input_tokens: 60,
      },
      content: [{ type: "text", text: finalText }],
    },
  };

  return [kickoff, toolCall, toolResult, closing];
}
80
-
81
// Every usage field of both assistant messages in transcriptLines, summed:
// msg_1 (100+20+30+50) plus msg_2 (200+40+0+60) = 500.
const TRANSCRIPT_TOKENS = 100 + 20 + 30 + 50 + (200 + 40 + 0 + 60);
// First timestamp 10:00:00.000 → last timestamp 10:01:00.000.
const TRANSCRIPT_DURATION_MS = 60 * 1000;
85
-
86
/**
 * Writes a subagent fixture pair into `subagentsDir`: `<name>.meta.json`
 * (agent type plus description) and `<name>.jsonl` (the transcript lines).
 */
function writeSubagent(name: string, description: string, lines: object[]) {
  const meta = { agentType: "general-purpose", description };
  const metaPath = join(subagentsDir, `${name}.meta.json`);
  const transcriptPath = join(subagentsDir, `${name}.jsonl`);

  writeFileSync(metaPath, JSON.stringify(meta));
  writeFileSync(transcriptPath, jsonl(lines));
}
93
-
94
/** One eval/condition slot for writeIteration to lay out on disk. */
type FixtureTask = {
  eval_id: string;
  condition: string;
  /** Written to outputs/final-message.md when present. */
  finalMessage?: string;
};
99
-
100
/**
 * Builds an iteration dir + dispatch.json shaped like run.ts serializes it.
 *
 * For each task this creates `eval-<eval_id>/<condition>/outputs/` under
 * `iterationDir` and, when `finalMessage` is given, writes it to
 * `final-message.md` there. Returns the serialized task entries that were
 * written to dispatch.json.
 */
function writeIteration(tasks: FixtureTask[]) {
  /** Lays out one slot on disk and returns its dispatch.json entry. */
  const layOutSlot = (fixture: FixtureTask) => {
    const slotDir = join(
      iterationDir,
      `eval-${fixture.eval_id}`,
      fixture.condition,
    );
    const outputsDir = join(slotDir, "outputs");
    mkdirSync(outputsDir, { recursive: true });

    if (fixture.finalMessage !== undefined) {
      writeFileSync(join(outputsDir, "final-message.md"), fixture.finalMessage);
    }

    // The "without_skill" condition carries no staged skill.
    const unskilled = fixture.condition === "without_skill";
    return {
      eval_id: fixture.eval_id,
      condition: fixture.condition,
      skill_path: unskilled ? null : "/staged/skill/SKILL.md",
      staged_skill_slug: unskilled ? null : "test-slug",
      user_prompt: `Do the ${fixture.eval_id} task`,
      fixtures: [join(slotDir, "inputs", "fixture.txt")],
      outputs_dir: outputsDir,
      run_record_path: join(slotDir, "run.json"),
      timing_path: join(slotDir, "timing.json"),
      agent_description: `${fixture.eval_id}:${fixture.condition}:i1-nonce1`,
      dispatch_prompt_path: join(slotDir, "dispatch-prompt.txt"),
    };
  };

  const serialized = tasks.map((fixture) => layOutSlot(fixture));
  const dispatchBody = JSON.stringify(
    { run_nonce: "nonce1", tasks: serialized },
    null,
    2,
  );
  writeFileSync(join(iterationDir, "dispatch.json"), dispatchBody);
  return serialized;
}
130
-
131
/** Reads and parses `run.json` from the given eval/condition slot of the current iteration. */
function readRun(evalId: string, condition: string): RunRecord {
  const runPath = join(iterationDir, `eval-${evalId}`, condition, "run.json");
  const raw = readFileSync(runPath, "utf8");
  return JSON.parse(raw);
}
139
-
140
/** Reads and parses `timing.json` from the given eval/condition slot of the current iteration. */
function readTiming(evalId: string, condition: string): TimingRecord {
  const timingPath = join(
    iterationDir,
    `eval-${evalId}`,
    condition,
    "timing.json",
  );
  const raw = readFileSync(timingPath, "utf8");
  return JSON.parse(raw);
}
148
-
149
// Every test gets freshly created, randomly named iteration and subagents
// directories under ROOT.
beforeEach(() => {
  const randomSuffix = () => Math.random().toString(36).slice(2);
  iterationDir = join(ROOT, `iter-${randomSuffix()}`);
  subagentsDir = join(ROOT, `sub-${randomSuffix()}`);
  for (const dir of [iterationDir, subagentsDir]) {
    mkdirSync(dir, { recursive: true });
  }
});

// Remove the whole scratch root once a test finishes.
afterEach(() => {
  rmSync(ROOT, { recursive: true, force: true });
});
157
-
158
describe("recordRuns", () => {
  // Agent descriptions exactly as writeIteration emits them for the "crash" eval.
  const WITH_SKILL_AGENT = "crash:with_skill:i1-nonce1";
  const WITHOUT_SKILL_AGENT = "crash:without_skill:i1-nonce1";

  /** Whether `fileName` exists in the crash/with_skill slot of the current iteration. */
  const slotHas = (fileName: string): boolean =>
    existsSync(join(iterationDir, "eval-crash", "with_skill", fileName));

  test("assembles run.json and timing.json for every task from disk", () => {
    writeIteration([
      { eval_id: "crash", condition: "with_skill", finalMessage: "Fixed it." },
      {
        eval_id: "crash",
        condition: "without_skill",
        finalMessage: "Done, I think.",
      },
    ]);
    writeSubagent("agent-a", WITH_SKILL_AGENT, transcriptLines("unused"));
    writeSubagent("agent-b", WITHOUT_SKILL_AGENT, transcriptLines("unused"));

    const outcome = recordRuns({ iterationDir, subagentsDir });
    expect(outcome.recorded).toBe(2);
    expect(outcome.missingTranscript).toBe(0);

    // Carry-over fields, final message, and tool calls for the with_skill slot.
    const withSkill = readRun("crash", "with_skill");
    expect(withSkill.eval_id).toBe("crash");
    expect(withSkill.condition).toBe("with_skill");
    expect(withSkill.skill_path).toBe("/staged/skill/SKILL.md");
    expect(withSkill.prompt).toBe("Do the crash task");
    expect(withSkill.files).toHaveLength(1);
    expect(withSkill.final_message).toBe("Fixed it.");
    expect(withSkill.tool_invocations).toHaveLength(1);
    expect(withSkill.tool_invocations[0]).toMatchObject({
      name: "Bash",
      ordinal: 0,
    });

    expect(readRun("crash", "without_skill").skill_path).toBeNull();

    // Timing is derived from the transcript when nothing was captured earlier.
    const derived = readTiming("crash", "with_skill");
    expect(derived.total_tokens).toBe(TRANSCRIPT_TOKENS);
    expect(derived.duration_ms).toBe(TRANSCRIPT_DURATION_MS);
    expect(derived.source).toBe("transcript");
  });

  test("skips existing run.json without --overwrite, replaces with it", () => {
    const [slot] = writeIteration([
      { eval_id: "crash", condition: "with_skill", finalMessage: "New." },
    ]);
    writeSubagent("agent-a", WITH_SKILL_AGENT, transcriptLines("unused"));

    // A record that was already on disk before record-runs ran.
    const preexisting = {
      eval_id: "crash",
      condition: "with_skill",
      skill_path: "/staged/skill/SKILL.md",
      prompt: "Do the crash task",
      files: [],
      final_message: "Agent-authored.",
      tool_invocations: [],
    };
    writeFileSync(slot.run_record_path, JSON.stringify(preexisting));

    const firstPass = recordRuns({ iterationDir, subagentsDir });
    expect(firstPass.recorded).toBe(0);
    expect(firstPass.skippedExisting).toBe(1);
    expect(readRun("crash", "with_skill").final_message).toBe(
      "Agent-authored.",
    );

    const secondPass = recordRuns({
      iterationDir,
      subagentsDir,
      overwrite: true,
    });
    expect(secondPass.recorded).toBe(1);
    expect(readRun("crash", "with_skill").final_message).toBe("New.");
  });

  test("backfills timing.json only when absent", () => {
    const [slot] = writeIteration([
      { eval_id: "crash", condition: "with_skill", finalMessage: "Done." },
    ]);
    writeSubagent("agent-a", WITH_SKILL_AGENT, transcriptLines("unused"));
    const captured = { total_tokens: 12345, duration_ms: 9000 };
    writeFileSync(slot.timing_path, JSON.stringify(captured));

    recordRuns({ iterationDir, subagentsDir });

    // Agent-captured completion-event timing wins; not overwritten.
    const kept = readTiming("crash", "with_skill");
    expect(kept.total_tokens).toBe(12345);
    expect(kept.duration_ms).toBe(9000);
    expect(kept.source).toBeUndefined();
  });

  test("falls back to the transcript's final assistant text when final-message.md is missing", () => {
    writeIteration([{ eval_id: "crash", condition: "with_skill" }]);
    writeSubagent(
      "agent-a",
      WITH_SKILL_AGENT,
      transcriptLines("Closing summary from transcript."),
    );

    const outcome = recordRuns({ iterationDir, subagentsDir });
    expect(outcome.recorded).toBe(1);
    expect(readRun("crash", "with_skill").final_message).toBe(
      "Closing summary from transcript.",
    );
  });

  test("skips the slot entirely when no final-message source exists", () => {
    writeIteration([{ eval_id: "crash", condition: "with_skill" }]);
    // No final-message.md, no transcript.

    const outcome = recordRuns({ iterationDir, subagentsDir });
    expect(outcome.recorded).toBe(0);
    expect(outcome.skippedNoFinalMessage).toBe(1);
    expect(slotHas("run.json")).toBe(false);
    expect(slotHas("timing.json")).toBe(false);
  });

  test("writes run.json with empty invocations and no timing.json when the transcript is missing", () => {
    writeIteration([
      { eval_id: "crash", condition: "with_skill", finalMessage: "Done." },
    ]);
    // final-message.md exists but no subagent transcript matches.

    const outcome = recordRuns({ iterationDir, subagentsDir });
    expect(outcome.recorded).toBe(1);
    expect(outcome.missingTranscript).toBe(1);

    const written = readRun("crash", "with_skill");
    expect(written.final_message).toBe("Done.");
    expect(written.tool_invocations).toEqual([]);
    expect(slotHas("timing.json")).toBe(false);
  });

  test("throws when dispatch.json is absent", () => {
    // Hand-authored/operator runs have no dispatch.json — the manual path owns them.
    const attempt = () => recordRuns({ iterationDir, subagentsDir });
    expect(attempt).toThrow(/dispatch\.json/);
  });
});
@@ -1,209 +0,0 @@
1
- #!/usr/bin/env bun
2
- import { existsSync, readFileSync, writeFileSync } from "node:fs";
3
- import { join } from "node:path";
4
- import {
5
- findByDescription,
6
- parseTranscriptFull,
7
- } from "./adapters/claude-code-transcript";
8
- import { detectRunContext } from "./context";
9
- import type { RunRecord, TimingRecord } from "./types";
10
- import { validateAgainstSchema } from "./validate-schema";
11
-
12
/** Prints `error: <msg>` via console.error, then exits the process with code 1. */
function die(msg: string): never {
  console.error("error: " + msg);
  return process.exit(1);
}
16
-
17
/**
 * One `tasks[]` entry of dispatch.json, limited to the fields record-runs
 * reads. Mirrors DispatchTask in run.ts, whose `dispatch_prompt` is stripped
 * from the serialized file.
 */
type DispatchTask = {
  eval_id: string;
  condition: string;
  skill_path: string | null;
  /** Copied into the run record as `prompt`. */
  user_prompt: string;
  /** Copied into the run record as `files`. */
  fixtures: string[];
  /** Directory searched for `final-message.md`. */
  outputs_dir: string;
  run_record_path: string;
  timing_path: string;
  /** Description used to look up the matching subagent transcript. */
  agent_description: string;
};
30
-
31
/** Per-iteration counters returned by recordRuns. */
export type RecordRunsResult = {
  /** Slots for which a run.json was written. */
  recorded: number;
  /** Slots left alone because run.json already existed and overwrite was off. */
  skippedExisting: number;
  /** Slots skipped because neither final-message.md nor transcript text was available. */
  skippedNoFinalMessage: number;
  /** Tasks whose agent description matched no subagent transcript. */
  missingTranscript: number;
};
37
-
38
/**
 * Assembles a schema-valid `run.json` (and backfills `timing.json`) for every
 * task in the iteration's `dispatch.json`, from sources already on disk:
 *
 * - carry-over fields (`prompt` ← `user_prompt`, `files` ← `fixtures`,
 *   `eval_id`/`condition`/`skill_path`) from `dispatch.json`;
 * - `final_message` from `<outputs_dir>/final-message.md` (the dispatch prompt
 *   instructs the subagent to write it), falling back to the transcript's last
 *   assistant text;
 * - `tool_invocations`, tokens, and duration from the persisted Claude Code
 *   transcript (Claude-Code-tier, like fill-transcripts — transcript-less
 *   harnesses keep hand-authoring these records).
 *
 * Existing records always win: an agent/operator-written `run.json` is skipped
 * without `overwrite`, and `timing.json` is strictly backfill-only — it is
 * written only when absent, and `overwrite` never applies to it, so
 * completion-event numbers captured at dispatch time are never replaced by
 * transcript-derived ones, which include cache accounting and are not
 * comparable 1:1.
 *
 * A task with neither `final-message.md` nor transcript text is skipped
 * entirely: no `run.json` and no `timing.json` is written for it.
 *
 * @param opts.iterationDir - Directory that contains `dispatch.json`.
 * @param opts.subagentsDir - Directory searched for subagent transcripts by
 *   each task's `agent_description`.
 * @param opts.overwrite - Replace an existing `run.json` (default `false`).
 *   Has no effect on `timing.json`.
 * @returns Counters describing what was recorded and what was skipped.
 * @throws Error when `dispatch.json` is missing, is not valid JSON, or has a
 *   `tasks` value that is not an array.
 */
export function recordRuns(opts: {
  iterationDir: string;
  subagentsDir: string;
  overwrite?: boolean;
}): RecordRunsResult {
  const { iterationDir, subagentsDir, overwrite = false } = opts;

  const dispatchPath = join(iterationDir, "dispatch.json");
  if (!existsSync(dispatchPath)) {
    throw new Error(
      `${dispatchPath} not found — record-runs assembles records from dispatch.json and only supports runner-built iterations. For hand-authored runs, write run.json + timing.json manually (see schema/run-record.schema.json).`,
    );
  }

  // Name the offending file on a parse failure instead of surfacing a bare
  // SyntaxError or "not iterable" TypeError.
  const rawDispatch = readFileSync(dispatchPath, "utf8");
  let parsedDispatch: { tasks?: unknown } | null;
  try {
    parsedDispatch = JSON.parse(rawDispatch) as { tasks?: unknown } | null;
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`${dispatchPath} is not valid JSON: ${reason}`);
  }
  const rawTasks = parsedDispatch?.tasks ?? [];
  if (!Array.isArray(rawTasks)) {
    throw new Error(`${dispatchPath}: expected "tasks" to be an array`);
  }
  const tasks = rawTasks as DispatchTask[];

  const result: RecordRunsResult = {
    recorded: 0,
    skippedExisting: 0,
    skippedNoFinalMessage: 0,
    missingTranscript: 0,
  };

  for (const task of tasks) {
    const slot = `${task.eval_id}/${task.condition}`;

    const subagent = findByDescription(subagentsDir, task.agent_description);
    const summary = subagent ? parseTranscriptFull(subagent.jsonlPath) : null;
    if (!subagent) {
      console.warn(
        `miss ${slot}: no subagent transcript with description='${task.agent_description}'`,
      );
      result.missingTranscript++;
    }

    // run.json — skip if the agent/operator already wrote one.
    if (existsSync(task.run_record_path) && !overwrite) {
      console.log(
        `skip ${slot}: run.json already exists (use --overwrite to replace)`,
      );
      result.skippedExisting++;
    } else {
      const finalMessagePath = join(task.outputs_dir, "final-message.md");
      let finalMessage: string | null = null;
      if (existsSync(finalMessagePath)) {
        finalMessage = readFileSync(finalMessagePath, "utf8").trim();
      } else if (summary?.final_text) {
        console.warn(
          `warn ${slot}: ${finalMessagePath} missing — using the transcript's last assistant text as final_message`,
        );
        finalMessage = summary.final_text;
      }
      if (finalMessage === null) {
        console.warn(
          `skip ${slot}: no final-message.md and no transcript text — was this task dispatched? Not writing a blank record.`,
        );
        result.skippedNoFinalMessage++;
        // Skips the timing backfill below as well: the slot gets no files.
        continue;
      }

      const record: RunRecord = {
        eval_id: task.eval_id,
        condition: task.condition,
        skill_path: task.skill_path,
        prompt: task.user_prompt,
        files: task.fixtures,
        final_message: finalMessage,
        tool_invocations: summary?.tool_invocations ?? [],
        // Timing lives in timing.json; run.json never carries it.
        total_tokens: null,
        duration_ms: null,
      };
      validateAgainstSchema<RunRecord>(
        "run-record",
        record,
        task.run_record_path,
      );
      writeFileSync(
        task.run_record_path,
        `${JSON.stringify(record, null, 2)}\n`,
      );
      console.log(
        `record ${slot}: wrote run.json with ${record.tool_invocations.length} tool_invocations`,
      );
      result.recorded++;
    }

    // timing.json — backfill only; completion-event numbers always win. The
    // `overwrite` flag is deliberately not consulted here: honouring it would
    // replace agent-captured timing with transcript-derived numbers.
    if (summary && !existsSync(task.timing_path)) {
      const timing: TimingRecord = {
        total_tokens: summary.total_tokens,
        duration_ms: summary.duration_ms,
        source: "transcript",
      };
      writeFileSync(task.timing_path, `${JSON.stringify(timing, null, 2)}\n`);
    }
  }

  return result;
}
159
-
160
/**
 * Parses the record-runs CLI arguments.
 *
 * Recognizes `--iteration <value>`, `--subagents-dir <path>`, and the boolean
 * `--overwrite`. Calls `die` when `--iteration` or `--subagents-dir` is absent
 * or has no value.
 *
 * @param argv - CLI arguments with the runtime and script entries removed.
 * @returns The iteration identifier, the subagents directory, and the
 *   overwrite flag.
 */
function parseArgs(argv: string[]) {
  const flag = (name: string): string | undefined => {
    const i = argv.indexOf(`--${name}`);
    if (i === -1) return undefined;
    const value = argv[i + 1];
    // A following `--option` token is the next flag, not this flag's value;
    // without this check `--iteration --overwrite` would parse as
    // iteration "--overwrite".
    if (value === undefined || value.startsWith("--")) return undefined;
    return value;
  };
  const iteration = flag("iteration");
  const subagentsDir = flag("subagents-dir");
  const overwrite = argv.includes("--overwrite");
  if (!iteration) die("missing --iteration");
  if (!subagentsDir)
    die(
      "missing --subagents-dir (e.g. ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/)",
    );
  return { iteration, subagentsDir, overwrite };
}
176
-
177
// CLI entry point, guarded by import.meta.main. Resolves the iteration
// directory from the detected run context, records every dispatched task,
// then prints a summary and the follow-up commands.
if (import.meta.main) {
  const cliArgs = Bun.argv.slice(2);
  const parsed = parseArgs(cliArgs);
  const ctx = detectRunContext(cliArgs);

  if (!existsSync(parsed.subagentsDir)) {
    die(`subagents-dir not found: ${parsed.subagentsDir}`);
  }

  const iterationDir = join(
    ctx.workspaceRoot,
    ctx.skillName,
    `iteration-${parsed.iteration}`,
  );
  if (!existsSync(iterationDir)) {
    die(`not found: ${iterationDir}`);
  }

  // Any failure inside recordRuns becomes a one-line error and exit code 1.
  const result: RecordRunsResult = (() => {
    try {
      return recordRuns({
        iterationDir,
        subagentsDir: parsed.subagentsDir,
        overwrite: parsed.overwrite,
      });
    } catch (err) {
      return die(err instanceof Error ? err.message : String(err));
    }
  })();

  console.log(
    `\nRecorded: ${result.recorded}, skipped (existing run.json): ${result.skippedExisting}, skipped (no final message): ${result.skippedNoFinalMessage}, missing transcript: ${result.missingTranscript}`,
  );
  if (result.missingTranscript > 0) {
    console.warn(
      "Missing transcripts mean the dispatching agent's dispatch `description` did not match the task's `agent_description` in dispatch.json. Those slots got empty tool_invocations (transcript_check assertions will grade unverifiable) and no transcript-derived timing.",
    );
  }
  console.log(
    `\nNext: bun run evals:detect-stray-writes -- --skill ${ctx.skillName} --iteration ${parsed.iteration}\nThen: bun run evals:grade -- --skill ${ctx.skillName} --iteration ${parsed.iteration}`,
  );
}