@slowdini/slow-powers-opencode 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -8
- package/package.json +5 -1
- package/skills/evaluating-skills/SKILL.md +19 -17
- package/skills/evaluating-skills/harness-details/claude.md +51 -15
- package/skills/evaluating-skills/harness-parity.md +155 -0
- package/skills/evaluating-skills/runner/README.md +28 -19
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
- package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
- package/skills/evaluating-skills/runner/aggregate.ts +21 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
- package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
- package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
- package/skills/evaluating-skills/runner/record-runs.ts +209 -0
- package/skills/evaluating-skills/runner/run.test.ts +523 -0
- package/skills/evaluating-skills/runner/run.ts +376 -17
- package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
- package/skills/evaluating-skills/runner/types.ts +9 -0
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
- package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
- package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
- package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
- package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
- package/skills/verifying-development-work/SKILL.md +17 -6
- package/skills/verifying-development-work/code-review.md +68 -0
- package/skills/verifying-development-work/comment-review.md +85 -0
- package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
- package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
- package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/evals.json +34 -2
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
|
@@ -6,6 +6,7 @@ import {
|
|
|
6
6
|
findByDescription,
|
|
7
7
|
listSubagents,
|
|
8
8
|
parseTranscript,
|
|
9
|
+
parseTranscriptFull,
|
|
9
10
|
} from "./claude-code-transcript";
|
|
10
11
|
|
|
11
12
|
const FIXTURE_ROOT = join(tmpdir(), `claude-code-adapter-test-${process.pid}`);
|
|
@@ -193,6 +194,227 @@ describe("parseTranscript", () => {
|
|
|
193
194
|
});
|
|
194
195
|
});
|
|
195
196
|
|
|
197
|
+
describe("parseTranscriptFull", () => {
  /** Usage payload with fixed input/cache counts; only `output_tokens` varies. */
  function usage(output: number) {
    return {
      input_tokens: 100,
      cache_creation_input_tokens: 50,
      cache_read_input_tokens: 200,
      output_tokens: output,
    };
  }

  /** Serialises `records` into a jsonl fixture under FIXTURE_ROOT and returns its path. */
  function writeFixture(
    fileName: string,
    records: Parameters<typeof jsonl>[0],
  ): string {
    const fixturePath = join(FIXTURE_ROOT, fileName);
    writeFileSync(fixturePath, jsonl(records));
    return fixturePath;
  }

  test("sums usage across unique message ids, deduping repeated ids", () => {
    // A single API response is written as several jsonl lines (one per content
    // block), each repeating message.id + usage, so it must be tallied once.
    const fixture = writeFixture("full-dedup.jsonl", [
      {
        type: "user",
        timestamp: "2026-06-04T10:00:00.000Z",
        message: { role: "user", content: "go" },
      },
      {
        type: "assistant",
        timestamp: "2026-06-04T10:00:05.000Z",
        message: {
          id: "msg_aaa",
          role: "assistant",
          usage: usage(10),
          content: [{ type: "text", text: "first block" }],
        },
      },
      {
        type: "assistant",
        timestamp: "2026-06-04T10:00:06.000Z",
        message: {
          id: "msg_aaa",
          role: "assistant",
          usage: usage(10),
          content: [
            {
              type: "tool_use",
              id: "toolu_1",
              name: "Bash",
              input: { command: "ls" },
            },
          ],
        },
      },
      {
        type: "assistant",
        timestamp: "2026-06-04T10:01:00.000Z",
        message: {
          id: "msg_bbb",
          role: "assistant",
          usage: usage(40),
          content: [{ type: "text", text: "done" }],
        },
      },
    ]);

    const { total_tokens } = parseTranscriptFull(fixture);
    // msg_aaa once (100+50+200+10 = 360) plus msg_bbb (100+50+200+40 = 390) = 750
    expect(total_tokens).toBe(750);
  });

  test("returns null total_tokens when no usage objects present", () => {
    const fixture = writeFixture("full-no-usage.jsonl", [
      {
        type: "assistant",
        message: {
          role: "assistant",
          content: [{ type: "text", text: "hi" }],
        },
      },
    ]);

    const { total_tokens } = parseTranscriptFull(fixture);
    expect(total_tokens).toBeNull();
  });

  test("derives duration_ms from first and last line timestamps", () => {
    const fixture = writeFixture("full-duration.jsonl", [
      {
        type: "user",
        timestamp: "2026-06-04T10:00:00.000Z",
        message: { role: "user", content: "go" },
      },
      {
        type: "assistant",
        timestamp: "2026-06-04T10:02:30.500Z",
        message: {
          id: "msg_x",
          role: "assistant",
          content: [{ type: "text", text: "done" }],
        },
      },
    ]);

    // 10:00:00.000 -> 10:02:30.500 is 150.5 seconds.
    const { duration_ms } = parseTranscriptFull(fixture);
    expect(duration_ms).toBe(150_500);
  });

  test("returns null duration_ms with fewer than two timestamps", () => {
    // Only the first of the two records carries a timestamp.
    const fixture = writeFixture("full-one-ts.jsonl", [
      {
        type: "assistant",
        timestamp: "2026-06-04T10:00:00.000Z",
        message: { role: "assistant", content: [] },
      },
      { type: "assistant", message: { role: "assistant", content: [] } },
    ]);

    const { duration_ms } = parseTranscriptFull(fixture);
    expect(duration_ms).toBeNull();
  });

  test("final_text is the concatenated text of the last assistant message", () => {
    const fixture = writeFixture("full-final-text.jsonl", [
      {
        type: "assistant",
        message: {
          id: "msg_1",
          role: "assistant",
          content: [{ type: "text", text: "intermediate" }],
        },
      },
      {
        type: "assistant",
        message: {
          id: "msg_2",
          role: "assistant",
          content: [
            { type: "text", text: "All tests pass." },
            {
              type: "tool_use",
              id: "toolu_z",
              name: "Bash",
              input: { command: "true" },
            },
            { type: "text", text: "Wrapping up." },
          ],
        },
      },
      {
        type: "user",
        message: {
          role: "user",
          content: [
            { type: "tool_result", tool_use_id: "toolu_z", content: "ok" },
          ],
        },
      },
    ]);

    // The tool_use block between the two text blocks is not part of the text.
    const { final_text } = parseTranscriptFull(fixture);
    expect(final_text).toBe("All tests pass.\nWrapping up.");
  });

  test("final_text is null when no assistant text exists", () => {
    const fixture = writeFixture("full-no-text.jsonl", [
      { type: "user", message: { role: "user", content: "hi" } },
    ]);

    const { final_text } = parseTranscriptFull(fixture);
    expect(final_text).toBeNull();
  });

  test("tool_invocations matches parseTranscript output", () => {
    const fixture = writeFixture("full-invocations.jsonl", [
      {
        type: "assistant",
        timestamp: "2026-06-04T10:00:00.000Z",
        message: {
          id: "msg_1",
          role: "assistant",
          usage: usage(5),
          content: [
            {
              type: "tool_use",
              id: "toolu_q",
              name: "Read",
              input: { file_path: "/tmp/a" },
            },
          ],
        },
      },
      {
        type: "user",
        timestamp: "2026-06-04T10:00:02.000Z",
        message: {
          role: "user",
          content: [
            {
              type: "tool_result",
              tool_use_id: "toolu_q",
              content: "contents",
            },
          ],
        },
      },
    ]);

    const { tool_invocations } = parseTranscriptFull(fixture);
    expect(tool_invocations).toEqual(parseTranscript(fixture));
  });
});
|
|
417
|
+
|
|
196
418
|
describe("listSubagents / findByDescription", () => {
|
|
197
419
|
test("matches subagents by meta description", () => {
|
|
198
420
|
const dir = join(FIXTURE_ROOT, "subagents");
|
|
@@ -15,12 +15,31 @@ type ToolResultBlock = {
|
|
|
15
15
|
content: string | unknown[];
|
|
16
16
|
};
|
|
17
17
|
|
|
18
|
-
type
|
|
18
|
+
/** A plain-text content block. */
type TextBlock = { type: "text"; text: string };

/**
 * Any content block of a message. The trailing open `{ type: string }` member
 * admits block kinds that are not modelled explicitly here.
 */
type ContentBlock = ToolUseBlock | ToolResultBlock | TextBlock | { type: string };

/** Token counters reported in a message's `usage`; every counter is optional. */
type UsageRecord = Partial<
  Record<
    | "input_tokens"
    | "output_tokens"
    | "cache_creation_input_tokens"
    | "cache_read_input_tokens",
    number
  >
>;
|
|
19
35
|
|
|
20
36
|
/** One parsed line of a jsonl transcript. */
type TranscriptRecord = {
  /** Record kind; "user" and "assistant" are the values this module inspects. */
  type: "user" | "assistant" | string;
  /** Timestamp string of the line, when present. */
  timestamp?: string;
  /** Message payload of the line, when present. */
  message?: {
    /** Message identifier, when present. */
    id?: string;
    role?: string;
    /** Token counters for the message, when present. */
    usage?: UsageRecord;
    /** Either a bare string or a list of content blocks. */
    content?: string | ContentBlock[];
  };
};
|
|
@@ -47,21 +66,25 @@ function stringifyResult(content: ToolResultBlock["content"]): string {
|
|
|
47
66
|
return JSON.stringify(content);
|
|
48
67
|
}
|
|
49
68
|
|
|
50
|
-
|
|
69
|
+
/**
 * Reads a jsonl transcript and returns its parsed records in file order.
 *
 * Empty lines and lines that are not valid JSON are skipped. Lines that parse
 * to something other than an object (`null`, numbers, strings, booleans,
 * arrays) are skipped as well, so callers can dereference record fields
 * without a null check.
 *
 * @param jsonlPath - Path of the jsonl file; it is read synchronously as UTF-8.
 * @returns The object records that parsed successfully.
 */
function readRecords(jsonlPath: string): TranscriptRecord[] {
  const raw = readFileSync(jsonlPath, "utf8");
  const records: TranscriptRecord[] = [];
  for (const line of raw.split("\n")) {
    if (line.length === 0) continue;
    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      // skip malformed lines
      continue;
    }
    // JSON.parse also accepts bare values such as `null` or `42`. Those are not
    // records, and a `null` entry would crash the first `record.x` access.
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      continue;
    }
    records.push(parsed as TranscriptRecord);
  }
  return records;
}
|
|
82
|
+
|
|
83
|
+
function extractInvocations(records: TranscriptRecord[]): ToolInvocation[] {
|
|
84
|
+
const invocations: ToolInvocation[] = [];
|
|
85
|
+
const indexById = new Map<string, number>();
|
|
64
86
|
|
|
87
|
+
for (const record of records) {
|
|
65
88
|
const blocks = flattenContent(record.message?.content);
|
|
66
89
|
|
|
67
90
|
if (record.type === "assistant") {
|
|
@@ -93,6 +116,79 @@ export function parseTranscript(jsonlPath: string): ToolInvocation[] {
|
|
|
93
116
|
return invocations;
|
|
94
117
|
}
|
|
95
118
|
|
|
119
|
+
/**
 * Reads the transcript at `jsonlPath` and returns the tool invocations
 * extracted from its records.
 *
 * @param jsonlPath - Path of the jsonl transcript file.
 * @returns The invocations produced by `extractInvocations` for that file.
 */
export function parseTranscript(jsonlPath: string): ToolInvocation[] {
  const records = readRecords(jsonlPath);
  return extractInvocations(records);
}
|
|
122
|
+
|
|
123
|
+
/** Everything `parseTranscriptFull` derives from a single transcript file. */
export type TranscriptSummary = {
  /** Tool invocations extracted from the transcript's records. */
  tool_invocations: ToolInvocation[];
  /**
   * Input + output + cache-creation + cache-read tokens, summed over unique
   * `message.id`s. A single API response is written as several jsonl lines
   * (one per content block) that all repeat the same `message.id` and `usage`,
   * hence the dedupe. Because cache tokens are included, this is a different
   * accounting than the harness's task completion event. `null` when no
   * assistant record carries both an id and a usage object.
   */
  total_tokens: number | null;
  /**
   * Milliseconds between the first and last parseable line timestamps, or
   * `null` when fewer than two lines have one.
   */
  duration_ms: number | null;
  /**
   * Text blocks of the last assistant message that has any, joined with
   * newlines; `null` when no assistant text exists.
   */
  final_text: string | null;
};
|
|
138
|
+
|
|
139
|
+
/**
 * Summarises a transcript file: tool invocations, total token usage,
 * wall-clock duration and the final assistant text.
 *
 * - Usage is keyed by `message.id`, so the several jsonl lines that make up a
 *   single API response are counted once (the last line's usage wins).
 * - Duration is the last parseable timestamp minus the first, in file order.
 * - Final text is gathered across consecutive text-bearing lines that share
 *   the same `message.id`, because one response is written one content block
 *   per line. A text-bearing line with a different id (or no id) starts over.
 *
 * @param jsonlPath - Path of the jsonl transcript file.
 * @returns The summary; each scalar field is `null` when it cannot be derived.
 */
export function parseTranscriptFull(jsonlPath: string): TranscriptSummary {
  const records = readRecords(jsonlPath);

  const usageById = new Map<string, UsageRecord>();
  let firstTs: number | null = null;
  let lastTs: number | null = null;
  let timestampCount = 0;
  // Text of the most recent assistant message that produced any text, plus
  // that message's id so later lines of the same response can append to it.
  let finalParts: string[] = [];
  let finalMessageId: string | undefined;

  for (const record of records) {
    if (record.timestamp) {
      const ts = Date.parse(record.timestamp);
      // Unparseable timestamps are ignored rather than poisoning the duration.
      if (!Number.isNaN(ts)) {
        if (firstTs === null) firstTs = ts;
        lastTs = ts;
        timestampCount++;
      }
    }

    if (record.type !== "assistant") continue;

    const { id, usage } = record.message ?? {};
    if (id && usage) usageById.set(id, usage);

    const texts = flattenContent(record.message?.content)
      .filter((b): b is TextBlock => b.type === "text")
      .map((b) => b.text);
    if (texts.length === 0) continue;

    if (id && id === finalMessageId) {
      // Another content-block line of the same response: extend its text.
      finalParts.push(...texts);
    } else {
      finalParts = texts;
      finalMessageId = id;
    }
  }

  let totalTokens: number | null = null;
  if (usageById.size > 0) {
    totalTokens = 0;
    for (const usage of usageById.values()) {
      totalTokens +=
        (usage.input_tokens ?? 0) +
        (usage.output_tokens ?? 0) +
        (usage.cache_creation_input_tokens ?? 0) +
        (usage.cache_read_input_tokens ?? 0);
    }
  }

  return {
    tool_invocations: extractInvocations(records),
    total_tokens: totalTokens,
    // A single timestamp cannot describe an interval.
    duration_ms:
      timestampCount >= 2 && firstTs !== null && lastTs !== null
        ? lastTs - firstTs
        : null,
    final_text: finalParts.length > 0 ? finalParts.join("\n") : null,
  };
}
|
|
191
|
+
|
|
96
192
|
export type SubagentMeta = {
|
|
97
193
|
agentType?: string;
|
|
98
194
|
description?: string;
|
|
@@ -186,6 +186,226 @@ describe("aggregate.ts user-mode (--skill-dir, isolated CWD)", () => {
|
|
|
186
186
|
).toBe(true);
|
|
187
187
|
});
|
|
188
188
|
|
|
189
|
+
// Fixture plumbing shared by the three validity-warning tests below.

/** Creates the skill dir (with SKILL.md) and the iteration-1 workspace for a fixture. */
const seedIteration = (fixtureName: string) => {
  const root = join(FIXTURE_ROOT, fixtureName);
  const skillDir = join(root, "skill-dir");
  const skillSub = join(skillDir, "mr-review");
  mkdirSync(skillSub, { recursive: true });
  const skillFile = join(skillSub, "SKILL.md");
  writeFileSync(
    skillFile,
    "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
  );

  const cwd = join(root, "work");
  const iterationDir = join(
    cwd,
    "skills-workspace",
    "mr-review",
    "iteration-1",
  );
  mkdirSync(iterationDir, { recursive: true });
  return { skillDir, skillFile, cwd, iterationDir };
};

/** Writes conditions.json for the iteration. */
const writeConditions = (
  iterationDir: string,
  mode: string,
  conditions: Array<{ name: string; skill_path: string | null }>,
) => {
  writeJson(join(iterationDir, "conditions.json"), {
    mode,
    conditions,
    timestamp: new Date().toISOString(),
    harness: "claude-code",
  });
};

/** Writes a passing grading.json and the given timing.json for one arm of eval e1. */
const writeArm = (iterationDir: string, cond: string, timing: unknown) => {
  const condDir = join(iterationDir, "eval-e1", cond);
  mkdirSync(condDir, { recursive: true });
  writeJson(join(condDir, "grading.json"), {
    assertion_results: [],
    summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
  });
  writeJson(join(condDir, "timing.json"), timing);
};

/** Runs aggregate.ts for the fixture, expects exit code 0, and returns benchmark.json's validity_warnings. */
const aggregateWarnings = (fx: {
  skillDir: string;
  cwd: string;
  iterationDir: string;
}): string[] => {
  const res = Bun.spawnSync(
    [
      "bun",
      "run",
      AGGREGATE_TS,
      "--skill-dir",
      fx.skillDir,
      "--skill",
      "mr-review",
      "--iteration",
      "1",
    ],
    { cwd: fx.cwd, stdout: "pipe", stderr: "pipe" },
  );
  expect(res.exitCode).toBe(0);
  const benchmark = JSON.parse(
    readFileSync(join(fx.iterationDir, "benchmark.json"), "utf8"),
  ) as { validity_warnings: string[] };
  return benchmark.validity_warnings;
};

test("surfaces live-source reads as validity_warnings", () => {
  const fx = seedIteration("agg-live-reads");
  writeConditions(fx.iterationDir, "revision", [
    { name: "old_skill", skill_path: fx.skillFile },
    { name: "new_skill", skill_path: fx.skillFile },
  ]);
  for (const cond of ["old_skill", "new_skill"]) {
    writeArm(fx.iterationDir, cond, { total_tokens: 100, duration_ms: 1 });
  }
  // Stray-writes report in which only the old_skill arm read the live source.
  writeJson(join(fx.iterationDir, "stray-writes.json"), {
    generated: new Date().toISOString(),
    iteration: 1,
    totals: { violations: 0, warnings: 0, live_source_reads: 1 },
    runs: [
      {
        eval_id: "e1",
        condition: "old_skill",
        violations: [],
        warnings: [],
        live_source_reads: [
          {
            tool: "Read",
            path: fx.skillFile,
            ordinal: 0,
            reason: "x",
          },
        ],
      },
    ],
  });

  const warnings = aggregateWarnings(fx);
  const flagged = warnings.some(
    (w) => w.includes("e1/old_skill") && /live skill source/i.test(w),
  );
  expect(flagged).toBe(true);
});

test("warns when timing sources are mixed across the compared runs", () => {
  const fx = seedIteration("agg-mixed-timing");
  writeConditions(fx.iterationDir, "new-skill", [
    { name: "with_skill", skill_path: fx.skillFile },
    { name: "without_skill", skill_path: null },
  ]);
  // One arm has agent-captured completion-event timing (no source field, the
  // pre-provenance shape); the other was backfilled from the transcript.
  writeArm(fx.iterationDir, "with_skill", {
    total_tokens: 5000,
    duration_ms: 1000,
  });
  writeArm(fx.iterationDir, "without_skill", {
    total_tokens: 90000,
    duration_ms: 1200,
    source: "transcript",
  });

  const warnings = aggregateWarnings(fx);
  const flagged = warnings.some(
    (w) => w.includes("timing source") && w.includes("transcript"),
  );
  expect(flagged).toBe(true);
});

test("does not warn when all timing comes from one source", () => {
  const fx = seedIteration("agg-same-timing");
  writeConditions(fx.iterationDir, "new-skill", [
    { name: "with_skill", skill_path: fx.skillFile },
    { name: "without_skill", skill_path: null },
  ]);
  for (const cond of ["with_skill", "without_skill"]) {
    writeArm(fx.iterationDir, cond, {
      total_tokens: 100,
      duration_ms: 1,
      source: "transcript",
    });
  }

  const warnings = aggregateWarnings(fx);
  const flagged = warnings.some((w) => w.includes("timing source"));
  expect(flagged).toBe(false);
});
|
|
408
|
+
|
|
189
409
|
test("surfaces plugin-shadow findings as validity_warnings", () => {
|
|
190
410
|
const root = join(FIXTURE_ROOT, "agg-shadow");
|
|
191
411
|
const skillDir = join(root, "skill-dir");
|
|
@@ -94,6 +94,11 @@ for (const c of conditions.conditions) {
|
|
|
94
94
|
}
|
|
95
95
|
|
|
96
96
|
let missingGradings = 0;
|
|
97
|
+
// Timing provenance across all runs in the comparison. "completion-event"
|
|
98
|
+
// (the agent-captured default, also assumed when `source` is absent) and
|
|
99
|
+
// "transcript" (record-runs backfill, includes cache accounting) measure
|
|
100
|
+
// different things — a delta mixing them is comparing two metrics.
|
|
101
|
+
const timingSources = new Set<string>();
|
|
97
102
|
for (const evalDir of evalDirs) {
|
|
98
103
|
for (const cond of conditionNames) {
|
|
99
104
|
const condDir = join(iterationDir, evalDir, cond);
|
|
@@ -116,6 +121,11 @@ for (const evalDir of evalDirs) {
|
|
|
116
121
|
byCondition[cond].tokens.push(timing.total_tokens);
|
|
117
122
|
if (typeof timing.duration_ms === "number")
|
|
118
123
|
byCondition[cond].durations.push(timing.duration_ms);
|
|
124
|
+
// Record the provenance of every run that contributed a numeric token or
// duration figure; a missing `source` counts as "completion-event".
const hasTimingMetric =
  typeof timing.total_tokens === "number" ||
  typeof timing.duration_ms === "number";
if (hasTimingMetric) {
  timingSources.add(timing.source ?? "completion-event");
}
|
|
119
129
|
}
|
|
120
130
|
}
|
|
121
131
|
}
|
|
@@ -168,6 +178,11 @@ const delta = {
|
|
|
168
178
|
};
|
|
169
179
|
|
|
170
180
|
const validityWarnings: string[] = [];
|
|
181
|
+
// More than one timing source across the compared runs means the token and
// duration deltas mix two accountings, so surface that as a validity warning.
if (timingSources.size > 1) {
  const sourceList = [...timingSources].sort().join(", ");
  validityWarnings.push(
    `runs mix timing sources (${sourceList}) — transcript-derived totals include cache accounting, so the token/duration delta compares two different metrics. Re-record one side or read the delta as a rough signal only.`,
  );
}
|
|
171
186
|
for (const cond of conditionNames) {
|
|
172
187
|
const s = runSummary[cond];
|
|
173
188
|
if (s.skill_invocation_rate != null && s.skill_invocation_rate < 1) {
|
|
@@ -188,6 +203,7 @@ if (existsSync(strayPath)) {
|
|
|
188
203
|
eval_id: string;
|
|
189
204
|
condition: string;
|
|
190
205
|
violations?: unknown[];
|
|
206
|
+
live_source_reads?: unknown[];
|
|
191
207
|
}>;
|
|
192
208
|
};
|
|
193
209
|
for (const r of stray.runs ?? []) {
|
|
@@ -196,6 +212,11 @@ if (existsSync(strayPath)) {
|
|
|
196
212
|
validityWarnings.push(
|
|
197
213
|
`${r.eval_id}/${r.condition} wrote ${n} file(s) outside its outputs dir — data point may be tainted (see stray-writes.json).`,
|
|
198
214
|
);
|
|
215
|
+
// Flag runs whose stray-writes entry lists reads of the live skill source.
const liveReadCount = (r.live_source_reads ?? []).length;
if (liveReadCount > 0) {
  validityWarnings.push(
    `${r.eval_id}/${r.condition} read the live skill source ${liveReadCount} time(s) instead of its staged copy — the arm may be contaminated (staged-slug resolution race; see stray-writes.json).`,
  );
}
|
|
199
220
|
}
|
|
200
221
|
} catch {
|
|
201
222
|
// ignore a malformed report rather than failing aggregation
|