@slowdini/slow-powers-opencode 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/README.md +18 -8
  2. package/package.json +5 -1
  3. package/skills/evaluating-skills/SKILL.md +19 -17
  4. package/skills/evaluating-skills/harness-details/claude.md +51 -15
  5. package/skills/evaluating-skills/harness-parity.md +155 -0
  6. package/skills/evaluating-skills/runner/README.md +28 -19
  7. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
  8. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
  9. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
  10. package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
  11. package/skills/evaluating-skills/runner/aggregate.ts +21 -0
  12. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
  13. package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
  14. package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
  15. package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
  16. package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
  17. package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
  18. package/skills/evaluating-skills/runner/record-runs.ts +209 -0
  19. package/skills/evaluating-skills/runner/run.test.ts +523 -0
  20. package/skills/evaluating-skills/runner/run.ts +376 -17
  21. package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
  22. package/skills/evaluating-skills/runner/types.ts +9 -0
  23. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
  24. package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
  25. package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
  26. package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
  27. package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
  28. package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
  29. package/skills/verifying-development-work/SKILL.md +17 -6
  30. package/skills/verifying-development-work/code-review.md +68 -0
  31. package/skills/verifying-development-work/comment-review.md +85 -0
  32. package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
  33. package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
  34. package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
  35. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
  36. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
  37. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
  38. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
  39. package/skills/verifying-development-work/evals/evals.json +34 -2
  40. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
  41. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
  42. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  43. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  44. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
  45. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  46. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  47. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  48. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
  49. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
  50. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
  51. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
@@ -6,6 +6,7 @@ import {
6
6
  findByDescription,
7
7
  listSubagents,
8
8
  parseTranscript,
9
+ parseTranscriptFull,
9
10
  } from "./claude-code-transcript";
10
11
 
11
12
  const FIXTURE_ROOT = join(tmpdir(), `claude-code-adapter-test-${process.pid}`);
@@ -193,6 +194,227 @@ describe("parseTranscript", () => {
193
194
  });
194
195
  });
195
196
 
197
describe("parseTranscriptFull", () => {
  // Usage payload with fixed input/cache counts (100 + 50 + 200) and a
  // caller-chosen output count, so expected totals are easy to derive.
  const usageWithOutput = (outputTokens: number) => ({
    input_tokens: 100,
    cache_creation_input_tokens: 50,
    cache_read_input_tokens: 200,
    output_tokens: outputTokens,
  });

  // Serializes `records` to a jsonl fixture under FIXTURE_ROOT and returns
  // the path of the file that was written.
  const writeTranscript = (
    fileName: string,
    records: Parameters<typeof jsonl>[0],
  ): string => {
    const fixturePath = join(FIXTURE_ROOT, fileName);
    writeFileSync(fixturePath, jsonl(records));
    return fixturePath;
  };

  test("sums usage across unique message ids, deduping repeated ids", () => {
    // A single API response is written as several jsonl lines (one per
    // content block), each repeating the same message.id and usage, so the
    // two msg_aaa lines below must contribute only once.
    const transcript = writeTranscript("full-dedup.jsonl", [
      {
        type: "user",
        timestamp: "2026-06-04T10:00:00.000Z",
        message: { role: "user", content: "go" },
      },
      {
        type: "assistant",
        timestamp: "2026-06-04T10:00:05.000Z",
        message: {
          id: "msg_aaa",
          role: "assistant",
          usage: usageWithOutput(10),
          content: [{ type: "text", text: "first block" }],
        },
      },
      {
        type: "assistant",
        timestamp: "2026-06-04T10:00:06.000Z",
        message: {
          id: "msg_aaa",
          role: "assistant",
          usage: usageWithOutput(10),
          content: [
            {
              type: "tool_use",
              id: "toolu_1",
              name: "Bash",
              input: { command: "ls" },
            },
          ],
        },
      },
      {
        type: "assistant",
        timestamp: "2026-06-04T10:01:00.000Z",
        message: {
          id: "msg_bbb",
          role: "assistant",
          usage: usageWithOutput(40),
          content: [{ type: "text", text: "done" }],
        },
      },
    ]);

    const { total_tokens: totalTokens } = parseTranscriptFull(transcript);
    // msg_aaa once (100+50+200+10) plus msg_bbb (100+50+200+40) = 750
    expect(totalTokens).toBe(750);
  });

  test("returns null total_tokens when no usage objects present", () => {
    const transcript = writeTranscript("full-no-usage.jsonl", [
      {
        type: "assistant",
        message: {
          role: "assistant",
          content: [{ type: "text", text: "hi" }],
        },
      },
    ]);
    expect(parseTranscriptFull(transcript).total_tokens).toBeNull();
  });

  test("derives duration_ms from first and last line timestamps", () => {
    const transcript = writeTranscript("full-duration.jsonl", [
      {
        type: "user",
        timestamp: "2026-06-04T10:00:00.000Z",
        message: { role: "user", content: "go" },
      },
      {
        type: "assistant",
        timestamp: "2026-06-04T10:02:30.500Z",
        message: {
          id: "msg_x",
          role: "assistant",
          content: [{ type: "text", text: "done" }],
        },
      },
    ]);
    expect(parseTranscriptFull(transcript).duration_ms).toBe(150_500);
  });

  test("returns null duration_ms with fewer than two timestamps", () => {
    const transcript = writeTranscript("full-one-ts.jsonl", [
      {
        type: "assistant",
        timestamp: "2026-06-04T10:00:00.000Z",
        message: { role: "assistant", content: [] },
      },
      { type: "assistant", message: { role: "assistant", content: [] } },
    ]);
    expect(parseTranscriptFull(transcript).duration_ms).toBeNull();
  });

  test("final_text is the concatenated text of the last assistant message", () => {
    const transcript = writeTranscript("full-final-text.jsonl", [
      {
        type: "assistant",
        message: {
          id: "msg_1",
          role: "assistant",
          content: [{ type: "text", text: "intermediate" }],
        },
      },
      {
        type: "assistant",
        message: {
          id: "msg_2",
          role: "assistant",
          content: [
            { type: "text", text: "All tests pass." },
            {
              type: "tool_use",
              id: "toolu_z",
              name: "Bash",
              input: { command: "true" },
            },
            { type: "text", text: "Wrapping up." },
          ],
        },
      },
      {
        type: "user",
        message: {
          role: "user",
          content: [
            { type: "tool_result", tool_use_id: "toolu_z", content: "ok" },
          ],
        },
      },
    ]);
    const { final_text: finalText } = parseTranscriptFull(transcript);
    expect(finalText).toBe("All tests pass.\nWrapping up.");
  });

  test("final_text is null when no assistant text exists", () => {
    const transcript = writeTranscript("full-no-text.jsonl", [
      { type: "user", message: { role: "user", content: "hi" } },
    ]);
    expect(parseTranscriptFull(transcript).final_text).toBeNull();
  });

  test("tool_invocations matches parseTranscript output", () => {
    const transcript = writeTranscript("full-invocations.jsonl", [
      {
        type: "assistant",
        timestamp: "2026-06-04T10:00:00.000Z",
        message: {
          id: "msg_1",
          role: "assistant",
          usage: usageWithOutput(5),
          content: [
            {
              type: "tool_use",
              id: "toolu_q",
              name: "Read",
              input: { file_path: "/tmp/a" },
            },
          ],
        },
      },
      {
        type: "user",
        timestamp: "2026-06-04T10:00:02.000Z",
        message: {
          role: "user",
          content: [
            {
              type: "tool_result",
              tool_use_id: "toolu_q",
              content: "contents",
            },
          ],
        },
      },
    ]);
    const expectedInvocations = parseTranscript(transcript);
    expect(parseTranscriptFull(transcript).tool_invocations).toEqual(
      expectedInvocations,
    );
  });
});
417
+
196
418
  describe("listSubagents / findByDescription", () => {
197
419
  test("matches subagents by meta description", () => {
198
420
  const dir = join(FIXTURE_ROOT, "subagents");
@@ -15,12 +15,31 @@ type ToolResultBlock = {
15
15
  content: string | unknown[];
16
16
  };
17
17
 
18
- type ContentBlock = ToolUseBlock | ToolResultBlock | { type: string };
18
/** A plain-text content block inside a message. */
type TextBlock = { type: "text"; text: string };

/**
 * Any content block a message may carry. The trailing `{ type: string }`
 * member keeps block kinds this module does not model assignable.
 */
type ContentBlock = ToolUseBlock | ToolResultBlock | TextBlock | { type: string };

/** Token counters attached to a message; every field is optional. */
type UsageRecord = {
  input_tokens?: number;
  output_tokens?: number;
  cache_creation_input_tokens?: number;
  cache_read_input_tokens?: number;
};

/** The `message` payload of one transcript line. */
type TranscriptMessage = {
  /** Message identifier; usage totals are deduplicated on it. */
  id?: string;
  role?: string;
  usage?: UsageRecord;
  /** Either a bare string or a list of content blocks. */
  content?: string | ContentBlock[];
};

/** One parsed line of a jsonl transcript. */
type TranscriptRecord = {
  type: "user" | "assistant" | string;
  /** Timestamp string; consumers parse it with `Date.parse`. */
  timestamp?: string;
  message?: TranscriptMessage;
};
@@ -47,21 +66,25 @@ function stringifyResult(content: ToolResultBlock["content"]): string {
47
66
  return JSON.stringify(content);
48
67
  }
49
68
 
50
- export function parseTranscript(jsonlPath: string): ToolInvocation[] {
69
/**
 * Read a jsonl transcript and return one record per line that holds a JSON
 * object.
 *
 * Blank lines and lines that fail to parse are skipped. Lines that parse to
 * something other than a plain object (`null`, numbers, strings, arrays) are
 * skipped as well: callers dereference `record.message` and
 * `record.timestamp`, which would throw on `null`.
 *
 * @param jsonlPath Path of the transcript file to read.
 * @returns The parsed records in file order.
 * @throws Whatever `readFileSync` throws when the file cannot be read.
 */
function readRecords(jsonlPath: string): TranscriptRecord[] {
  const raw = readFileSync(jsonlPath, "utf8");
  const records: TranscriptRecord[] = [];

  for (const line of raw.split("\n")) {
    if (line.trim().length === 0) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Best-effort parse: a malformed line must not abort the whole read.
      continue;
    }

    const isPlainObject =
      typeof parsed === "object" && parsed !== null && !Array.isArray(parsed);
    if (!isPlainObject) continue;

    records.push(parsed as TranscriptRecord);
  }

  return records;
}
82
+
83
+ function extractInvocations(records: TranscriptRecord[]): ToolInvocation[] {
84
+ const invocations: ToolInvocation[] = [];
85
+ const indexById = new Map<string, number>();
64
86
 
87
+ for (const record of records) {
65
88
  const blocks = flattenContent(record.message?.content);
66
89
 
67
90
  if (record.type === "assistant") {
@@ -93,6 +116,79 @@ export function parseTranscript(jsonlPath: string): ToolInvocation[] {
93
116
  return invocations;
94
117
  }
95
118
 
119
/**
 * Read the jsonl transcript at `jsonlPath` and return the tool invocations
 * extracted from its records.
 */
export function parseTranscript(jsonlPath: string): ToolInvocation[] {
  const records = readRecords(jsonlPath);
  return extractInvocations(records);
}
122
+
123
export type TranscriptSummary = {
  /** Tool invocations extracted from the same records `parseTranscript` reads. */
  tool_invocations: ToolInvocation[];
  /**
   * Sum of usage across unique API responses. One response spans multiple
   * jsonl lines (one per content block) and repeats the same `message.id` +
   * `usage` on each, so totals are deduped by `message.id`. Includes cache
   * creation/read tokens — a different accounting than the harness's task
   * completion event. `null` when no assistant record carries both an id and
   * a usage object.
   */
  total_tokens: number | null;
  /**
   * Wall clock between the earliest and latest parseable line timestamps.
   * `null` when fewer than two lines carry a parseable timestamp.
   */
  duration_ms: number | null;
  /**
   * Text blocks of the last assistant message that contains text, joined
   * with newlines. Lines sharing a `message.id` belong to one message and are
   * concatenated in file order. `null` when no assistant text exists.
   */
  final_text: string | null;
};

/**
 * Summarize a jsonl transcript: tool invocations, deduplicated token total,
 * wall-clock duration and the final assistant text.
 *
 * @param jsonlPath Path of the transcript file to read.
 * @returns The summary; each metric is `null` when the transcript does not
 *   contain the data needed to derive it.
 */
export function parseTranscriptFull(jsonlPath: string): TranscriptSummary {
  const records = readRecords(jsonlPath);

  // Treat anything that is not a finite number as 0 so a malformed usage
  // field cannot turn the sum into a string or NaN.
  const tokenCount = (value: unknown): number =>
    typeof value === "number" && Number.isFinite(value) ? value : 0;

  const usageById = new Map<string, UsageRecord>();
  let earliestTs = Number.POSITIVE_INFINITY;
  let latestTs = Number.NEGATIVE_INFINITY;
  let timestampCount = 0;
  let finalTexts: string[] = [];
  let finalTextId: string | undefined;

  for (const record of records) {
    if (record.timestamp) {
      const ts = Date.parse(record.timestamp);
      if (!Number.isNaN(ts)) {
        // Track min/max rather than first/last so out-of-order lines can
        // never yield a negative duration.
        earliestTs = Math.min(earliestTs, ts);
        latestTs = Math.max(latestTs, ts);
        timestampCount++;
      }
    }

    if (record.type !== "assistant") continue;

    const { id, usage } = record.message ?? {};
    // Repeated ids overwrite each other, so each response is counted once.
    if (id && usage) usageById.set(id, usage);

    const texts = flattenContent(record.message?.content)
      .filter((b): b is TextBlock => b.type === "text")
      .map((b) => b.text);
    if (texts.length === 0) continue;

    if (id && id === finalTextId) {
      // Another line of the message already being collected: append.
      finalTexts.push(...texts);
    } else {
      // A different (or id-less) message with text supersedes the previous one.
      finalTexts = texts;
      finalTextId = id;
    }
  }

  let totalTokens: number | null = null;
  if (usageById.size > 0) {
    totalTokens = 0;
    for (const usage of usageById.values()) {
      totalTokens +=
        tokenCount(usage.input_tokens) +
        tokenCount(usage.output_tokens) +
        tokenCount(usage.cache_creation_input_tokens) +
        tokenCount(usage.cache_read_input_tokens);
    }
  }

  return {
    tool_invocations: extractInvocations(records),
    total_tokens: totalTokens,
    duration_ms: timestampCount >= 2 ? latestTs - earliestTs : null,
    final_text: finalTexts.length > 0 ? finalTexts.join("\n") : null,
  };
}
191
+
96
192
  export type SubagentMeta = {
97
193
  agentType?: string;
98
194
  description?: string;
@@ -186,6 +186,226 @@ describe("aggregate.ts user-mode (--skill-dir, isolated CWD)", () => {
186
186
  ).toBe(true);
187
187
  });
188
188
 
189
// Creates <FIXTURE_ROOT>/<fixtureName>/skill-dir/mr-review/SKILL.md plus an
// empty iteration-1 workspace under <fixtureName>/work, and returns the paths
// the scenarios below need.
const stageMrReviewIteration = (fixtureName: string) => {
  const root = join(FIXTURE_ROOT, fixtureName);
  const skillDir = join(root, "skill-dir");
  const skillSub = join(skillDir, "mr-review");
  mkdirSync(skillSub, { recursive: true });
  const skillMd = join(skillSub, "SKILL.md");
  writeFileSync(
    skillMd,
    "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
  );

  const cwd = join(root, "work");
  const iterationDir = join(
    cwd,
    "skills-workspace",
    "mr-review",
    "iteration-1",
  );
  mkdirSync(iterationDir, { recursive: true });
  return { skillDir, skillMd, cwd, iterationDir };
};

// Writes conditions.json for the iteration with the given mode and arms.
const writeConditionsFile = (
  iterationDir: string,
  mode: string,
  arms: Array<{ name: string; skill_path: string | null }>,
) => {
  writeJson(join(iterationDir, "conditions.json"), {
    mode,
    conditions: arms,
    timestamp: new Date().toISOString(),
    harness: "claude-code",
  });
};

// Writes a fully passing grading.json and the supplied timing.json for one
// arm of eval "e1".
const writePassingRun = (
  iterationDir: string,
  arm: string,
  timing: unknown,
) => {
  const armDir = join(iterationDir, "eval-e1", arm);
  mkdirSync(armDir, { recursive: true });
  writeJson(join(armDir, "grading.json"), {
    assertion_results: [],
    summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
  });
  writeJson(join(armDir, "timing.json"), timing);
};

// Runs aggregate.ts for iteration 1 of mr-review, asserts a clean exit, and
// returns the validity_warnings recorded in benchmark.json.
const collectValidityWarnings = (stage: {
  skillDir: string;
  cwd: string;
  iterationDir: string;
}): string[] => {
  const argv = [
    "bun",
    "run",
    AGGREGATE_TS,
    "--skill-dir",
    stage.skillDir,
    "--skill",
    "mr-review",
    "--iteration",
    "1",
  ];
  const res = Bun.spawnSync(argv, {
    cwd: stage.cwd,
    stdout: "pipe",
    stderr: "pipe",
  });
  expect(res.exitCode).toBe(0);
  const benchmark = JSON.parse(
    readFileSync(join(stage.iterationDir, "benchmark.json"), "utf8"),
  ) as { validity_warnings: string[] };
  return benchmark.validity_warnings;
};

test("surfaces live-source reads as validity_warnings", () => {
  const stage = stageMrReviewIteration("agg-live-reads");
  writeConditionsFile(stage.iterationDir, "revision", [
    { name: "old_skill", skill_path: stage.skillMd },
    { name: "new_skill", skill_path: stage.skillMd },
  ]);
  for (const arm of ["old_skill", "new_skill"]) {
    writePassingRun(stage.iterationDir, arm, {
      total_tokens: 100,
      duration_ms: 1,
    });
  }
  writeJson(join(stage.iterationDir, "stray-writes.json"), {
    generated: new Date().toISOString(),
    iteration: 1,
    totals: { violations: 0, warnings: 0, live_source_reads: 1 },
    runs: [
      {
        eval_id: "e1",
        condition: "old_skill",
        violations: [],
        warnings: [],
        live_source_reads: [
          {
            tool: "Read",
            path: stage.skillMd,
            ordinal: 0,
            reason: "x",
          },
        ],
      },
    ],
  });

  const warnings = collectValidityWarnings(stage);
  const flagged = warnings.some(
    (w) => w.includes("e1/old_skill") && /live skill source/i.test(w),
  );
  expect(flagged).toBe(true);
});

test("warns when timing sources are mixed across the compared runs", () => {
  const stage = stageMrReviewIteration("agg-mixed-timing");
  writeConditionsFile(stage.iterationDir, "new-skill", [
    { name: "with_skill", skill_path: stage.skillMd },
    { name: "without_skill", skill_path: null },
  ]);
  // with_skill carries timing without a `source` field (the shape written
  // before provenance was recorded); without_skill is marked as backfilled
  // from the transcript.
  writePassingRun(stage.iterationDir, "with_skill", {
    total_tokens: 5000,
    duration_ms: 1000,
  });
  writePassingRun(stage.iterationDir, "without_skill", {
    total_tokens: 90000,
    duration_ms: 1200,
    source: "transcript",
  });

  const warnings = collectValidityWarnings(stage);
  const flagged = warnings.some(
    (w) => w.includes("timing source") && w.includes("transcript"),
  );
  expect(flagged).toBe(true);
});

test("does not warn when all timing comes from one source", () => {
  const stage = stageMrReviewIteration("agg-same-timing");
  writeConditionsFile(stage.iterationDir, "new-skill", [
    { name: "with_skill", skill_path: stage.skillMd },
    { name: "without_skill", skill_path: null },
  ]);
  for (const arm of ["with_skill", "without_skill"]) {
    writePassingRun(stage.iterationDir, arm, {
      total_tokens: 100,
      duration_ms: 1,
      source: "transcript",
    });
  }

  const warnings = collectValidityWarnings(stage);
  const flagged = warnings.some((w) => w.includes("timing source"));
  expect(flagged).toBe(false);
});
408
+
189
409
  test("surfaces plugin-shadow findings as validity_warnings", () => {
190
410
  const root = join(FIXTURE_ROOT, "agg-shadow");
191
411
  const skillDir = join(root, "skill-dir");
@@ -94,6 +94,11 @@ for (const c of conditions.conditions) {
94
94
  }
95
95
 
96
96
  let missingGradings = 0;
97
+ // Timing provenance across all runs in the comparison. "completion-event"
98
+ // (the agent-captured default, also assumed when `source` is absent) and
99
+ // "transcript" (record-runs backfill, includes cache accounting) measure
100
+ // different things — a delta mixing them is comparing two metrics.
101
+ const timingSources = new Set<string>();
97
102
  for (const evalDir of evalDirs) {
98
103
  for (const cond of conditionNames) {
99
104
  const condDir = join(iterationDir, evalDir, cond);
@@ -116,6 +121,11 @@ for (const evalDir of evalDirs) {
116
121
  byCondition[cond].tokens.push(timing.total_tokens);
117
122
  if (typeof timing.duration_ms === "number")
118
123
  byCondition[cond].durations.push(timing.duration_ms);
124
+ if (
125
+ typeof timing.total_tokens === "number" ||
126
+ typeof timing.duration_ms === "number"
127
+ )
128
+ timingSources.add(timing.source ?? "completion-event");
119
129
  }
120
130
  }
121
131
  }
@@ -168,6 +178,11 @@ const delta = {
168
178
  };
169
179
 
170
180
  const validityWarnings: string[] = [];
181
+ if (timingSources.size > 1) {
182
+ validityWarnings.push(
183
+ `runs mix timing sources (${[...timingSources].sort().join(", ")}) — transcript-derived totals include cache accounting, so the token/duration delta compares two different metrics. Re-record one side or read the delta as a rough signal only.`,
184
+ );
185
+ }
171
186
  for (const cond of conditionNames) {
172
187
  const s = runSummary[cond];
173
188
  if (s.skill_invocation_rate != null && s.skill_invocation_rate < 1) {
@@ -188,6 +203,7 @@ if (existsSync(strayPath)) {
188
203
  eval_id: string;
189
204
  condition: string;
190
205
  violations?: unknown[];
206
+ live_source_reads?: unknown[];
191
207
  }>;
192
208
  };
193
209
  for (const r of stray.runs ?? []) {
@@ -196,6 +212,11 @@ if (existsSync(strayPath)) {
196
212
  validityWarnings.push(
197
213
  `${r.eval_id}/${r.condition} wrote ${n} file(s) outside its outputs dir — data point may be tainted (see stray-writes.json).`,
198
214
  );
215
+ const reads = r.live_source_reads?.length ?? 0;
216
+ if (reads > 0)
217
+ validityWarnings.push(
218
+ `${r.eval_id}/${r.condition} read the live skill source ${reads} time(s) instead of its staged copy — the arm may be contaminated (staged-slug resolution race; see stray-writes.json).`,
219
+ );
199
220
  }
200
221
  } catch {
201
222
  // ignore a malformed report rather than failing aggregation