@astudioplus/compressor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. package/CHANGELOG.md +52 -0
  2. package/LICENSE +20 -0
  3. package/README.md +167 -0
  4. package/dist/adapters/agents-md.d.ts +2 -0
  5. package/dist/adapters/agents-md.js +91 -0
  6. package/dist/adapters/apply.d.ts +3 -0
  7. package/dist/adapters/apply.js +83 -0
  8. package/dist/adapters/claude-code.d.ts +2 -0
  9. package/dist/adapters/claude-code.js +403 -0
  10. package/dist/adapters/copilot.d.ts +2 -0
  11. package/dist/adapters/copilot.js +418 -0
  12. package/dist/adapters/cursor.d.ts +2 -0
  13. package/dist/adapters/cursor.js +149 -0
  14. package/dist/adapters/index.d.ts +11 -0
  15. package/dist/adapters/index.js +19 -0
  16. package/dist/adapters/markers.d.ts +7 -0
  17. package/dist/adapters/markers.js +129 -0
  18. package/dist/adapters/types.d.ts +44 -0
  19. package/dist/adapters/types.js +1 -0
  20. package/dist/bench/ablate.d.ts +35 -0
  21. package/dist/bench/ablate.js +163 -0
  22. package/dist/bench/cell.d.ts +33 -0
  23. package/dist/bench/cell.js +437 -0
  24. package/dist/bench/results.d.ts +37 -0
  25. package/dist/bench/results.js +157 -0
  26. package/dist/bench/runner.d.ts +24 -0
  27. package/dist/bench/runner.js +121 -0
  28. package/dist/bench/tasks.d.ts +4 -0
  29. package/dist/bench/tasks.js +147 -0
  30. package/dist/bench/types.d.ts +109 -0
  31. package/dist/bench/types.js +1 -0
  32. package/dist/claude/transcripts.d.ts +30 -0
  33. package/dist/claude/transcripts.js +154 -0
  34. package/dist/cli/commands/benchmark.d.ts +33 -0
  35. package/dist/cli/commands/benchmark.js +203 -0
  36. package/dist/cli/commands/compress.d.ts +8 -0
  37. package/dist/cli/commands/compress.js +45 -0
  38. package/dist/cli/commands/count.d.ts +5 -0
  39. package/dist/cli/commands/count.js +25 -0
  40. package/dist/cli/commands/hook.d.ts +6 -0
  41. package/dist/cli/commands/hook.js +30 -0
  42. package/dist/cli/commands/init.d.ts +16 -0
  43. package/dist/cli/commands/init.js +76 -0
  44. package/dist/cli/commands/report.d.ts +90 -0
  45. package/dist/cli/commands/report.js +464 -0
  46. package/dist/cli/commands/savings.d.ts +38 -0
  47. package/dist/cli/commands/savings.js +196 -0
  48. package/dist/cli/commands/set-mode.d.ts +5 -0
  49. package/dist/cli/commands/set-mode.js +13 -0
  50. package/dist/cli/commands/stats.d.ts +5 -0
  51. package/dist/cli/commands/stats.js +51 -0
  52. package/dist/cli/commands/status.d.ts +1 -0
  53. package/dist/cli/commands/status.js +11 -0
  54. package/dist/cli/commands/uninstall.d.ts +7 -0
  55. package/dist/cli/commands/uninstall.js +22 -0
  56. package/dist/cli/index.d.ts +2 -0
  57. package/dist/cli/index.js +146 -0
  58. package/dist/copilot-hook-entry.d.ts +1 -0
  59. package/dist/copilot-hook-entry.js +36 -0
  60. package/dist/copilot-hook.js +1000 -0
  61. package/dist/engine/detect.d.ts +2 -0
  62. package/dist/engine/detect.js +47 -0
  63. package/dist/engine/index.d.ts +4 -0
  64. package/dist/engine/index.js +90 -0
  65. package/dist/engine/policy.d.ts +2 -0
  66. package/dist/engine/policy.js +48 -0
  67. package/dist/engine/tiers/code.d.ts +7 -0
  68. package/dist/engine/tiers/code.js +206 -0
  69. package/dist/engine/tiers/logs.d.ts +4 -0
  70. package/dist/engine/tiers/logs.js +139 -0
  71. package/dist/engine/tiers/structural.d.ts +28 -0
  72. package/dist/engine/tiers/structural.js +199 -0
  73. package/dist/engine/types.d.ts +71 -0
  74. package/dist/engine/types.js +5 -0
  75. package/dist/hook/copilot.d.ts +5 -0
  76. package/dist/hook/copilot.js +136 -0
  77. package/dist/hook/core.d.ts +36 -0
  78. package/dist/hook/core.js +138 -0
  79. package/dist/hook/exit.d.ts +22 -0
  80. package/dist/hook/exit.js +56 -0
  81. package/dist/hook/post-tool-use.d.ts +5 -0
  82. package/dist/hook/post-tool-use.js +57 -0
  83. package/dist/hook-entry.d.ts +1 -0
  84. package/dist/hook-entry.js +35 -0
  85. package/dist/hook.js +946 -0
  86. package/dist/index.d.ts +15 -0
  87. package/dist/index.js +16 -0
  88. package/dist/ledger/read.d.ts +9 -0
  89. package/dist/ledger/read.js +91 -0
  90. package/dist/ledger/write.d.ts +29 -0
  91. package/dist/ledger/write.js +61 -0
  92. package/dist/packs/atoms.d.ts +3 -0
  93. package/dist/packs/atoms.js +108 -0
  94. package/dist/packs/modes.d.ts +3 -0
  95. package/dist/packs/modes.js +34 -0
  96. package/dist/packs/render.d.ts +24 -0
  97. package/dist/packs/render.js +115 -0
  98. package/dist/packs/types.d.ts +32 -0
  99. package/dist/packs/types.js +1 -0
  100. package/dist/paths.d.ts +29 -0
  101. package/dist/paths.js +87 -0
  102. package/dist/tokens/estimate.d.ts +12 -0
  103. package/dist/tokens/estimate.js +23 -0
  104. package/dist/tokens/exact.d.ts +5 -0
  105. package/dist/tokens/exact.js +16 -0
  106. package/dist/tokens/index.d.ts +2 -0
  107. package/dist/tokens/index.js +2 -0
  108. package/package.json +77 -0
@@ -0,0 +1,437 @@
1
+ import { exec, execFile } from 'node:child_process';
2
+ import { cp, mkdir, mkdtemp, readFile, realpath, rm, writeFile } from 'node:fs/promises';
3
+ import { tmpdir } from 'node:os';
4
+ import path from 'node:path';
5
+ import { promisify } from 'node:util';
6
+ import { addUsage, encodeProjectDir, readSessionUsage, } from "../claude/transcripts.js";
7
+ import { resolveHookCommand } from "../paths.js";
8
+ const execAsync = promisify(exec); // promise form of exec: runs a shell command string (used for check commands)
9
+ const execFileAsync = promisify(execFile); // promise form of execFile: argv array, no shell (used for git and claude)
10
+ const CLAUDE_TIMEOUT_MS = 600_000; // 10 minutes; passed as `timeout` for each claude invocation
11
+ const CHECK_TIMEOUT_MS = 600_000; // 10 minutes; passed as `timeout` for each check command
12
+ const MAX_BUFFER = 32 * 1024 * 1024; // 32 MiB; passed as child-process `maxBuffer`
13
+ const HOOK_MATCHER = 'Read|Bash|Grep|Glob'; // `matcher` written into the PostToolUse hook entry of the cell settings
14
/**
 * True only for non-null, non-array objects, i.e. values whose fields can be
 * looked up by name after JSON.parse.
 */
function isRecord(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}
17
/**
 * Coerces an untrusted JSON field to a finite number. Anything that is not a
 * finite number (missing, string, NaN, ±Infinity) becomes 0.
 */
function num(value) {
  if (typeof value !== 'number') {
    return 0;
  }
  return Number.isFinite(value) ? value : 0;
}
20
/** Fresh all-zero usage record; a new object on every call so it is safe as a reduce seed. */
function zeroUsage() {
  const empty = { input: 0, output: 0, cacheCreation: 0, cacheRead: 0 };
  return empty;
}
23
/**
 * Renders any thrown value as text for a result row: `message` for Error
 * instances, String(value) otherwise. Text longer than 400 UTF-16 code units
 * is cut to 400 and suffixed with an ellipsis.
 */
function errorMessage(error) {
  const limit = 400;
  let text;
  if (error instanceof Error) {
    text = error.message;
  } else {
    text = String(error);
  }
  if (text.length <= limit) {
    return text;
  }
  return `${text.slice(0, limit)}…`;
}
27
/**
 * Tries `git init -q` inside the workspace (30 s timeout). Every failure is
 * deliberately ignored: the workspace is usable without a repository.
 */
async function gitInitBestEffort(workspace) {
  const gitArgs = ['init', '-q'];
  const spawnOptions = { cwd: workspace, timeout: 30_000 };
  try {
    await execFileAsync('git', gitArgs, spawnOptions);
  } catch {
    // git is missing or init failed; carry on without a repo
  }
}
35
/**
 * Builds the hook command string installed in a cell: the command resolved
 * for the variant's base mode, followed by the variant's trimmed extra
 * arguments (Variant.hookArgs, e.g. '--marker-style informative') when it has
 * any, so experiments can vary engine behavior per variant.
 *
 * `root` exists for tests; when omitted, resolveHookCommand is called with the
 * mode alone so it applies its own default.
 *
 * Throws when the variant's baseMode is 'full'.
 */
export function hookCommandForVariant(variant, root) {
  if (variant.baseMode === 'full') {
    throw new Error(`variant ${variant.id}: hook requires baseMode optimized|slim`);
  }
  // keep the arity resolveHookCommand sees: one argument unless a root was given
  const resolveArgs = root === undefined ? [variant.baseMode] : [variant.baseMode, root];
  const command = resolveHookCommand(...resolveArgs);
  const suffix = variant.hookArgs?.trim() ?? '';
  if (suffix === '') {
    return command;
  }
  return `${command} ${suffix}`;
}
51
/**
 * Materializes a variant on disk and returns the path of the settings file.
 *
 * - When the variant has both a style name and a style body, writes
 *   `<styleName>.md` under `<workspace>/.claude/output-styles` and
 *   `<scratch>/output-styles` (style resolution under --bare may use either
 *   scope, so both are populated).
 * - Always writes `<scratch>/cell-settings.json` with bypassPermissions, plus
 *   `outputStyle` when a style name is set and a PostToolUse hook entry when
 *   the variant enables the hook.
 */
async function writeVariantArtifacts(variant, workspace, scratch) {
  const hasStyleFile = variant.styleBody !== null && variant.styleName !== null;
  if (hasStyleFile) {
    const styleFile = `${variant.styleName}.md`;
    const styleDirs = [
      path.join(workspace, '.claude', 'output-styles'),
      path.join(scratch, 'output-styles'),
    ];
    for (const dir of styleDirs) {
      await mkdir(dir, { recursive: true });
    }
    for (const dir of styleDirs) {
      await writeFile(path.join(dir, styleFile), variant.styleBody, 'utf8');
    }
  }
  const settings = {
    // Headless cells must work unprompted inside their throwaway workspace;
    // denied Edit/Bash calls otherwise corrupt the measurement (the model
    // spins on retries instead of doing the task).
    permissions: { defaultMode: 'bypassPermissions' },
    ...(variant.styleName !== null ? { outputStyle: variant.styleName } : {}),
    ...(variant.hook
      ? {
          hooks: {
            PostToolUse: [
              {
                matcher: HOOK_MATCHER,
                hooks: [{ type: 'command', command: hookCommandForVariant(variant) }],
              },
            ],
          },
        }
      : {}),
  };
  const settingsPath = path.join(scratch, 'cell-settings.json');
  await writeFile(settingsPath, `${JSON.stringify(settings, null, 2)}\n`, 'utf8');
  return settingsPath;
}
87
/**
 * Runs a shell check command in `cwd` and classifies the outcome.
 *
 * Returns `{ kind: 'ran', passed }` when the command actually ran to an exit
 * code (resolved = passed; rejected with a numeric `code` = failed), and
 * `{ kind: 'infra', message }` when it could not be judged (timeout, killed
 * by signal, spawn failure — anything whose `code` is not a number).
 *
 * The child gets the same MAX_BUFFER as the claude invocation: with exec's
 * default 1 MiB cap, a check that prints a lot (a verbose test runner) is
 * killed mid-run and would be misreported as an infra failure.
 */
async function runCommandCheck(command, cwd) {
  try {
    await execAsync(command, { cwd, timeout: CHECK_TIMEOUT_MS, maxBuffer: MAX_BUFFER });
    return { kind: 'ran', passed: true };
  } catch (error) {
    // exec rejects with the exit status in `code`; non-numeric codes
    // (null on signal/timeout, string errno names) mean the check never
    // produced a verdict
    const code = error.code;
    if (typeof code === 'number') {
      return { kind: 'ran', passed: code === 0 };
    }
    return { kind: 'infra', message: errorMessage(error) };
  }
}
100
/**
 * Runs a command-kind check against the workspace and returns whether it
 * passed. Returns null when the check is not a command or when the command
 * could not be judged.
 */
async function baselineCheck(check, workspace) {
  if (check.kind === 'command') {
    const result = await runCommandCheck(check.command, workspace);
    if (result.kind === 'ran') {
      return result.passed;
    }
  }
  return null;
}
107
/**
 * Decides whether a cell succeeded.
 *
 * Command checks run once in the workspace; if the command cannot be judged,
 * `success` is null and `checkError` explains why. For any other check kind
 * the conversation is the answer: the cell passes when the pattern matches
 * ANY single turn's result text. An invalid pattern or flags yields
 * `success: null` with an `answer-regex invalid` note.
 */
async function judgeSuccess(check, workspace, resultTexts) {
  if (check.kind === 'command') {
    const result = await runCommandCheck(check.command, workspace);
    return result.kind === 'infra'
      ? { success: null, checkError: `success check failed to run: ${result.message}` }
      : { success: result.passed, checkError: null };
  }
  try {
    const matcher = new RegExp(check.pattern, check.flags);
    const matchesTurn = (text) => {
      // 'g'/'y' flags keep lastIndex between .test calls; rewind per text
      matcher.lastIndex = 0;
      return matcher.test(text);
    };
    return { success: resultTexts.some(matchesTurn), checkError: null };
  } catch (error) {
    return { success: null, checkError: `answer-regex invalid: ${errorMessage(error)}` };
  }
}
132
/**
 * Environment for the claude child process (and therefore for the
 * PostToolUse hook it spawns): a copy of the current environment with two
 * overrides.
 *
 * - CLAUDE_CONFIG_DIR points at the cell's scratch dir to isolate the cell.
 * - COMPRESSOR_NO_LEDGER=1 keeps benchmark cells out of the user's live
 *   savings ledger. Hook-bearing cells run the real hook, and without the
 *   kill switch every worthwhile compression would append a synthetic event
 *   to ~/.compressor/ledger, corrupting what `compressor savings` reports.
 *
 * Exported for tests.
 */
export function cellEnv(scratch) {
  const overrides = { CLAUDE_CONFIG_DIR: scratch, COMPRESSOR_NO_LEDGER: '1' };
  return Object.assign({}, process.env, overrides);
}
143
/**
 * Runs one headless claude turn in the workspace and resolves to its raw
 * stdout (expected to be the result JSON).
 *
 * The binary is COMPRESSOR_CLAUDE_BIN when set, otherwise `claude`. Passing
 * `resumeSessionId` continues that session via `--resume`. Rejects when the
 * child exits non-zero, exceeds CLAUDE_TIMEOUT_MS, or overflows MAX_BUFFER.
 */
async function invokeClaude(spec, workspace, scratch, settingsFile, prompt, resumeSessionId) {
  const claudeBin = process.env.COMPRESSOR_CLAUDE_BIN ?? 'claude';
  // documented headless continuation: claude -p "<prompt>" --resume <id>
  const resumeArgs = resumeSessionId === undefined ? [] : ['--resume', resumeSessionId];
  const cliArgs = [
    '--bare',
    '-p',
    prompt,
    '--output-format',
    'json',
    '--model',
    spec.model,
    '--settings',
    settingsFile,
    ...resumeArgs,
  ];
  const spawnOptions = {
    cwd: workspace,
    env: cellEnv(scratch),
    timeout: CLAUDE_TIMEOUT_MS,
    maxBuffer: MAX_BUFFER,
  };
  // .mjs/.js bins (test stubs) are not directly executable: run them via node
  const runViaNode = /\.(mjs|js)$/.test(claudeBin);
  const child = runViaNode
    ? await execFileAsync(process.execPath, [claudeBin, ...cliArgs], spawnOptions)
    : await execFileAsync(claudeBin, cliArgs, spawnOptions);
  return child.stdout;
}
172
/**
 * Parses the result JSON printed by one claude invocation into a normalized
 * turn record.
 *
 * Missing or mistyped fields fall back to neutral values: counters to 0,
 * `sessionId`/`costUsd` to null, `resultText` to '', `servedModels` to [].
 *
 * Throws when stdout is not valid JSON (the message carries the first 200
 * characters of trimmed stdout, or '(empty stdout)') or is not a JSON object.
 */
function parseResultJson(stdout) {
  let payload;
  try {
    payload = JSON.parse(stdout);
  } catch {
    const preview = stdout.trim().slice(0, 200);
    const shown = preview === '' ? '(empty stdout)' : preview;
    throw new Error(`result JSON parse failed: ${shown}`);
  }
  if (!isRecord(payload)) {
    throw new Error('result JSON parse failed: not an object');
  }
  const rawUsage = isRecord(payload['usage']) ? payload['usage'] : {};
  const modelUsage = isRecord(payload['modelUsage']) ? payload['modelUsage'] : {};
  const denials = payload['permission_denials'];
  const sessionId = payload['session_id'];
  const cost = payload['total_cost_usd'];
  const result = payload['result'];
  return {
    sessionId: typeof sessionId === 'string' ? sessionId : null,
    // the keys of modelUsage are the models that served the turn
    servedModels: Object.keys(modelUsage),
    usage: {
      input: num(rawUsage['input_tokens']),
      output: num(rawUsage['output_tokens']),
      cacheCreation: num(rawUsage['cache_creation_input_tokens']),
      cacheRead: num(rawUsage['cache_read_input_tokens']),
    },
    costUsd: typeof cost === 'number' ? cost : null,
    durationMs: num(payload['duration_ms']),
    numTurns: num(payload['num_turns']),
    permissionDenials: Array.isArray(denials) ? denials.length : 0,
    resultText: typeof result === 'string' ? result : '',
  };
}
203
/** Path of a session transcript: `<scratch>/projects/<encoded workspace>/<sessionId>.jsonl`. */
function transcriptFilePath(scratch, workspace, sessionId) {
  const projectDir = encodeProjectDir(workspace);
  const fileName = `${sessionId}.jsonl`;
  return path.join(scratch, 'projects', projectDir, fileName);
}
206
+ /**
207
+ * Transcript totals and summed per-turn result JSONs count the same API
208
+ * responses, so they must roughly agree. Divergence beyond this relative
209
+ * tolerance means one of the two known failure topologies happened: a
210
+ * resumed session forked ids and the final transcript does NOT carry the
211
+ * full copied history (transcript ≪ sum: usage silently undercounts to
212
+ * roughly the last turn), or per-turn result JSONs report cumulative
213
+ * session usage (sum ≫ transcript: the fallback double-counts). Neither is
214
+ * detectable from one side alone; the cell is flagged data-quality-suspect.
215
+ */
216
+ export const USAGE_MISMATCH_TOLERANCE = 0.25; // a fraction of the LARGER of the two token totals (multiTurnUsage multiplies it by Math.max of both)
217
/** Sum of all four token buckets of a usage record. */
function totalTokens(usage) {
  const { input, output, cacheCreation, cacheRead } = usage;
  return input + output + cacheCreation + cacheRead;
}
220
/**
 * Cell-level usage for multi-turn cells.
 *
 * The FINAL transcript (as read by readSessionUsage) is treated as
 * authoritative across all turns. Whenever it cannot be used — no session
 * id, the transcript reports zero turns, or reading it throws — the result
 * falls back to the sum of the per-turn result JSON usage with no note.
 *
 * When the transcript IS used, its total is cross-checked against the summed
 * per-turn total. A gap larger than USAGE_MISMATCH_TOLERANCE of the larger
 * value sets `suspect` to a data-quality note instead of silently reporting
 * a possibly wrong total. The check is skipped when the per-turn sum is 0.
 */
async function multiTurnUsage(scratch, workspace, sessionId, turnUsage) {
  const summed = turnUsage.reduce(addUsage, zeroUsage());
  const fallback = { totals: summed, suspect: null };
  if (sessionId === null) {
    return fallback;
  }
  try {
    const transcript = transcriptFilePath(scratch, workspace, sessionId);
    const session = await readSessionUsage(transcript);
    if (session.turns === 0) {
      return fallback;
    }
    const fromTranscript = totalTokens(session.totals);
    const fromTurns = totalTokens(summed);
    const allowedGap = Math.max(fromTranscript, fromTurns) * USAGE_MISMATCH_TOLERANCE;
    const diverged = fromTurns > 0 && Math.abs(fromTranscript - fromTurns) > allowedGap;
    let suspect = null;
    if (diverged) {
      suspect = `usage data-quality: final transcript totals (${fromTranscript} tokens) diverge from summed per-turn usage (${fromTurns} tokens) by >${Math.round(USAGE_MISMATCH_TOLERANCE * 100)}% — resumed session may have forked without full history, or per-turn result JSONs may be cumulative`;
    }
    return { totals: session.totals, suspect };
  } catch {
    return fallback;
  }
}
252
/**
 * Counts tool_use blocks per tool name in a transcript (.jsonl) file.
 *
 * Only `assistant` lines whose message has an array `content` are
 * considered; blank and unparseable lines are skipped. The same API response
 * can appear on several lines, so lines are keyed by requestId, else
 * message.id, else a synthetic per-line key, and the LAST line seen for a
 * key replaces earlier ones before counting.
 *
 * Returns `{}` when the file cannot be read.
 */
async function countToolCalls(transcriptFile) {
  let raw;
  try {
    raw = await readFile(transcriptFile, 'utf8');
  } catch {
    return {};
  }
  const toolNamesByResponse = new Map();
  let anonymousLines = 0;
  for (const rawLine of raw.split('\n')) {
    const line = rawLine.trim();
    if (line === '') {
      continue;
    }
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      continue;
    }
    if (!isRecord(entry) || entry['type'] !== 'assistant') {
      continue;
    }
    const message = entry['message'];
    if (!isRecord(message) || !Array.isArray(message['content'])) {
      continue;
    }
    const toolNames = message['content']
      .filter((block) => isRecord(block) && block['type'] === 'tool_use' && typeof block['name'] === 'string')
      .map((block) => block['name']);
    let responseKey;
    if (typeof entry['requestId'] === 'string') {
      responseKey = entry['requestId'];
    } else if (typeof message['id'] === 'string') {
      responseKey = message['id'];
    } else {
      // no usable id: give the line its own key so it is never merged
      responseKey = `anon-${anonymousLines}`;
      anonymousLines += 1;
    }
    toolNamesByResponse.set(responseKey, toolNames);
  }
  const counts = {};
  for (const toolNames of toolNamesByResponse.values()) {
    for (const toolName of toolNames) {
      counts[toolName] = (counts[toolName] ?? 0) + 1;
    }
  }
  return counts;
}
302
/**
 * Best-effort recursive removal of a temp directory.
 *
 * Does nothing for an empty path, and refuses anything that is not strictly
 * inside the real path of the OS temp dir (the temp dir itself, a path that
 * climbs out with `..`, or one on another root). All errors are swallowed.
 */
async function cleanupTempDir(dir) {
  if (dir === '') {
    return;
  }
  try {
    const tempRoot = await realpath(tmpdir());
    const relative = path.relative(tempRoot, dir);
    const insideTemp = relative !== '' && !relative.startsWith('..') && !path.isAbsolute(relative);
    if (insideTemp) {
      await rm(dir, { recursive: true, force: true });
    }
  } catch {
    // best-effort: a leftover temp dir is not worth failing the cell
  }
}
317
/**
 * Runs one benchmark cell (task × variant × trial) end to end and returns its
 * result row. It never rejects: any failure becomes an error row.
 *
 * Steps: create a throwaway workspace and config dir, copy the task fixture
 * (minus the answer key), write the variant's settings, run the baseline
 * check, play the scripted prompts (each later turn resuming the previous
 * turn's session), then collect tool calls, usage and the success verdict.
 * Both temp dirs are removed afterwards.
 *
 * On failure the row still carries the usage and cost of the turns that
 * completed, so a budget ceiling summing `costUsd` sees real spend;
 * durationMs, numTurns and permissionDenials are reported as 0.
 */
export async function runCell(spec, ctx) {
  const identity = {
    runId: ctx.runId,
    taskId: spec.task.id,
    variantId: spec.variant.id,
    trial: spec.trial,
    model: spec.model,
  };
  const isMultiTurn = spec.task.turns !== undefined;
  let workspace = '';
  let scratch = '';
  let baselineCheckPassed = null;
  // declared outside the try so the error row can still report what the
  // completed turns consumed
  const turnUsage = [];
  const turnCosts = [];
  const spentUsd = () => turnCosts.length === 0 ? null : turnCosts.reduce((total, cost) => total + cost, 0);
  const sumOver = (items, pick) => items.reduce((total, item) => total + pick(item), 0);
  try {
    // realpath both dirs so the encoded transcript dir matches the cwd the
    // child reports (macOS tmpdir is a symlinked /var/folders path)
    workspace = await realpath(await mkdtemp(path.join(tmpdir(), 'compressor-bench-ws-')));
    scratch = await realpath(await mkdtemp(path.join(tmpdir(), 'compressor-bench-cfg-')));
    // fix.patch.json is the answer key; copying it would hand the agent the
    // literal solution
    const fixtureDir = path.join(ctx.fixturesDir, spec.task.fixture);
    const withoutAnswerKey = (src) => path.basename(src) !== 'fix.patch.json';
    await cp(fixtureDir, workspace, { recursive: true, filter: withoutAnswerKey });
    await gitInitBestEffort(workspace);
    const settingsFile = await writeVariantArtifacts(spec.variant, workspace, scratch);
    baselineCheckPassed = await baselineCheck(spec.task.check, workspace);
    // scripted conversation: the task prompt first, then each follow-up turn
    // resumed from the session id the previous turn reported (ids can fork
    // on resume, so always chain from the immediately preceding turn)
    const prompts = [spec.task.prompt, ...(spec.task.turns ?? [])];
    const turns = [];
    for (let index = 0; index < prompts.length; index += 1) {
      const label = prompts.length > 1 ? `turn ${index + 1}/${prompts.length}: ` : '';
      let resume;
      if (index > 0) {
        const previousSession = turns[index - 1]?.sessionId ?? null;
        if (previousSession === null) {
          throw new Error(`${label}previous turn reported no session_id to --resume from`);
        }
        resume = previousSession;
      }
      let turn;
      try {
        const stdout = await invokeClaude(spec, workspace, scratch, settingsFile, prompts[index], resume);
        turn = parseResultJson(stdout);
      } catch (error) {
        // single-shot keeps its original error; conversations get the label
        if (label === '') {
          throw error;
        }
        throw new Error(`${label}${errorMessage(error)}`);
      }
      turns.push(turn);
      turnUsage.push(turn.usage);
      if (typeof turn.costUsd === 'number') {
        turnCosts.push(turn.costUsd);
      }
    }
    const lastTurn = turns[turns.length - 1];
    if (lastTurn === undefined) {
      throw new Error('no turns ran'); // unreachable: prompts is never empty
    }
    // the final transcript covers the whole conversation (toolCalls + usage)
    let toolCalls = {};
    if (lastTurn.sessionId !== null) {
      toolCalls = await countToolCalls(transcriptFilePath(scratch, workspace, lastTurn.sessionId));
    }
    const multi = isMultiTurn
      ? await multiTurnUsage(scratch, workspace, lastTurn.sessionId, turnUsage)
      : null;
    const usage = multi === null ? lastTurn.usage : multi.totals;
    const resultTexts = turns.map((turn) => turn.resultText);
    const { success, checkError } = await judgeSuccess(spec.task.check, workspace, resultTexts);
    const problems = [checkError, multi?.suspect ?? null].filter((note) => note !== null);
    return {
      ...identity,
      servedModels: [...new Set(turns.flatMap((turn) => turn.servedModels))],
      baselineCheckPassed,
      success,
      usage,
      // each invocation reports its own totals: sum across turns
      costUsd: spentUsd(),
      durationMs: sumOver(turns, (turn) => turn.durationMs),
      numTurns: sumOver(turns, (turn) => turn.numTurns),
      permissionDenials: sumOver(turns, (turn) => turn.permissionDenials),
      ...(isMultiTurn ? { turnUsage: [...turnUsage] } : {}),
      toolCalls,
      sessionId: lastTurn.sessionId,
      ...(problems.length > 0 ? { error: problems.join('; ') } : {}),
      timestamp: new Date().toISOString(),
    };
  } catch (error) {
    return {
      ...identity,
      servedModels: [],
      baselineCheckPassed,
      success: null,
      // completed turns still count: usage sums them (keeping `usage`
      // consistent with `turnUsage`) and costUsd carries the partial spend
      usage: turnUsage.reduce(addUsage, zeroUsage()),
      costUsd: spentUsd(),
      durationMs: 0,
      numTurns: 0,
      permissionDenials: 0,
      ...(isMultiTurn && turnUsage.length > 0 ? { turnUsage: [...turnUsage] } : {}),
      toolCalls: {},
      sessionId: null,
      error: errorMessage(error),
      timestamp: new Date().toISOString(),
    };
  } finally {
    await cleanupTempDir(workspace);
    await cleanupTempDir(scratch);
  }
}
@@ -0,0 +1,37 @@
1
+ import type { CellResult, RunMeta } from './types.ts';
2
+ export declare function runFilePath(outDir: string, runId: string): string; // path of the results file: <outDir>/<runId>.jsonl
3
+ export declare function newRunId(): string; // `bench-YYYYMMDD-HHMMSS`, built from the local-time Date getters
4
+ export declare function appendResult(outDir: string, runId: string, row: CellResult): Promise<string>; // creates outDir if needed, appends the row as one JSON line, resolves to the results file path
5
+ export declare function writeRunMeta(outDir: string, meta: RunMeta): Promise<string>; // creates outDir if needed, writes <meta.runId>.meta.json (2-space JSON), resolves to that path
6
+ export declare function readRun(outDir: string, runId: string): Promise<{ // never rejects on missing or garbled files
7
+ meta: RunMeta | null; // null when the meta file is missing, unparseable, or has no string runId
8
+ results: CellResult[]; // rows with string taskId and variantId only; unparseable lines are skipped, a missing file gives []
9
+ }>;
10
+ /**
11
+ * Post-run balance assertion: cross-variant comparison is valid only when
12
+ * every variant executed the same number of cells (the runner schedules
13
+ * variants innermost and stops group-atomically, so an imbalance means
14
+ * something defeated that — e.g. results concatenated from separate arm runs
15
+ * with independent budget ceilings, each truncating at its own point).
16
+ * Returns a warning string, or null when balanced. Skipped cells (budget
17
+ * ceiling / no-cost breaker) are not counted as executed.
18
+ */
19
+ export declare function balanceWarning(results: readonly CellResult[]): string | null; // also null for an empty result list; a row is "skipped" when its error starts with 'skipped:'
20
+ export interface VariantAggregate { // per-variant summary produced by aggregate()
21
+ variantId: string;
22
+ cells: number; // all rows for the variant, errored ones included
23
+ errors: number; // cells - valid
24
+ /** non-error cells — medians are 0-on-empty, so deltas must check this */
25
+ valid: number;
26
+ successRate: number | null; // share of success === true among valid rows with a boolean success; null when none were judged
27
+ medianInput: number; // this and the medians below are taken over valid rows only
28
+ medianOutput: number;
29
+ medianCacheCreation: number;
30
+ medianCacheRead: number;
31
+ medianCostUsd: number | null; // null when no valid row has a numeric costUsd
32
+ medianDurationMs: number;
33
+ medianTurns: number;
34
+ iqrOutput: [number, number]; // [25th, 75th] percentile of output tokens, linearly interpolated
35
+ toolCallTotals: Record<string, number>; // per-tool call counts summed over valid rows
36
+ }
37
+ export declare function aggregate(results: CellResult[]): VariantAggregate[]; // one entry per variantId, in order of first appearance in `results`
@@ -0,0 +1,157 @@
1
+ import { appendFile, mkdir, readFile, writeFile } from 'node:fs/promises';
2
+ import path from 'node:path';
3
/** Path of a run's results file: `<outDir>/<runId>.jsonl`. */
export function runFilePath(outDir, runId) {
  const fileName = `${runId}.jsonl`;
  return path.join(outDir, fileName);
}
6
/** Path of a run's metadata file: `<outDir>/<runId>.meta.json`. */
function metaFilePath(outDir, runId) {
  const fileName = `${runId}.meta.json`;
  return path.join(outDir, fileName);
}
9
/**
 * Builds a run id of the form `bench-YYYYMMDD-HHMMSS` from the current local
 * time. Resolution is one second.
 */
export function newRunId() {
  const now = new Date();
  const twoDigits = (n) => String(n).padStart(2, '0');
  const datePart = `${now.getFullYear()}` + twoDigits(now.getMonth() + 1) + twoDigits(now.getDate());
  const timePart = [now.getHours(), now.getMinutes(), now.getSeconds()]
    .map((part) => twoDigits(part))
    .join('');
  return `bench-${datePart}-${timePart}`;
}
16
/**
 * Appends one result row as a single JSON line to the run's results file,
 * creating `outDir` first when needed. Resolves to the results file path.
 */
export async function appendResult(outDir, runId, row) {
  await mkdir(outDir, { recursive: true });
  const target = runFilePath(outDir, runId);
  const line = `${JSON.stringify(row)}\n`;
  await appendFile(target, line, 'utf8');
  return target;
}
22
/**
 * Writes the run metadata as pretty-printed JSON to `<meta.runId>.meta.json`
 * in `outDir` (created when needed), replacing any existing file. Resolves to
 * the file path.
 */
export async function writeRunMeta(outDir, meta) {
  await mkdir(outDir, { recursive: true });
  const target = metaFilePath(outDir, meta.runId);
  const body = `${JSON.stringify(meta, null, 2)}\n`;
  await writeFile(target, body, 'utf8');
  return target;
}
28
/** True only for non-null, non-array objects (a JSON object after parsing). */
function isRecord(value) {
  if (typeof value !== 'object') {
    return false;
  }
  return value !== null && !Array.isArray(value);
}
31
/**
 * Loads a run from disk without ever rejecting on missing or garbled files.
 *
 * - `meta` is the parsed meta file when it is a JSON object with a string
 *   `runId`; otherwise null.
 * - `results` holds every line of the results file that parses to a JSON
 *   object with string `taskId` and `variantId`. Blank, unparseable or
 *   incomplete lines are dropped, and a missing file yields an empty list.
 */
export async function readRun(outDir, runId) {
  let meta = null;
  try {
    const candidate = JSON.parse(await readFile(metaFilePath(outDir, runId), 'utf8'));
    if (isRecord(candidate) && typeof candidate['runId'] === 'string') {
      meta = candidate;
    }
  } catch {
    meta = null;
  }
  let body = '';
  try {
    body = await readFile(runFilePath(outDir, runId), 'utf8');
  } catch {
    body = '';
  }
  // parse one JSONL line into a row, or undefined when it is unusable
  const parseRow = (line) => {
    let candidate;
    try {
      candidate = JSON.parse(line);
    } catch {
      return undefined;
    }
    const usable = isRecord(candidate) &&
      typeof candidate['taskId'] === 'string' &&
      typeof candidate['variantId'] === 'string';
    return usable ? candidate : undefined;
  };
  const results = [];
  for (const rawLine of body.split('\n')) {
    const line = rawLine.trim();
    if (line === '') {
      continue;
    }
    const row = parseRow(line);
    if (row !== undefined) {
      results.push(row);
    }
  }
  return { meta, results };
}
69
/**
 * Post-run balance assertion: cross-variant comparison is valid only when
 * every variant executed the same number of cells (the runner schedules
 * variants innermost and stops group-atomically, so an imbalance means
 * something defeated that — e.g. results concatenated from separate arm runs
 * with independent budget ceilings, each truncating at its own point).
 *
 * Skipped cells (rows whose `error` starts with 'skipped:') are not counted
 * as executed, but their variant is still registered with a count of 0. A
 * variant whose cells were ALL skipped is the most extreme imbalance, and
 * leaving it out of the tally would make the remaining variants look
 * balanced.
 *
 * @param results rows of one or more runs
 * @returns a warning string listing `variantId=count` pairs, or null when
 *   all variants executed the same number of cells (including no rows at all)
 */
export function balanceWarning(results) {
  const executed = new Map();
  for (const row of results) {
    const soFar = executed.get(row.variantId) ?? 0;
    // rows come from unvalidated JSONL, so `error` may not be a string
    const skipped = typeof row.error === 'string' && row.error.startsWith('skipped:');
    executed.set(row.variantId, skipped ? soFar : soFar + 1);
  }
  const tallies = [...executed.values()];
  const first = tallies[0];
  if (first === undefined || tallies.every((count) => count === first)) {
    return null;
  }
  const detail = [...executed.entries()]
    .map(([variantId, count]) => `${variantId}=${count}`)
    .join(', ');
  return `WARNING: unbalanced variants — executed cell counts differ (${detail}); drop task×trial groups missing from any variant before comparing`;
}
95
/**
 * Linear-interpolated quantile (numpy default) of an already ascending-sorted
 * array. `p` is a fraction in [0, 1]. Returns 0 for empty input.
 */
function quantile(sorted, p) {
  const count = sorted.length;
  if (count === 0) {
    return 0;
  }
  const position = (count - 1) * p;
  const lowerIndex = Math.floor(position);
  const upperIndex = Math.ceil(position);
  const lower = sorted[lowerIndex] ?? 0;
  const upper = sorted[upperIndex] ?? lower;
  const fraction = position - lowerIndex;
  return lower + (upper - lower) * fraction;
}
105
/** Returns a numerically ascending copy of `values`; the input is not modified. */
function sortedAsc(values) {
  const copy = [...values];
  copy.sort((left, right) => left - right);
  return copy;
}
108
/** Median (50th percentile, linearly interpolated) of an unsorted list; 0 when empty. */
function median(values) {
  const ordered = sortedAsc(values);
  return quantile(ordered, 0.5);
}
111
/**
 * Summarizes result rows per variant, in order of first appearance.
 *
 * Rows with an `error` are counted in `cells`/`errors` but excluded from
 * every statistic. Medians are 0 when a variant has no valid rows (check
 * `valid` before comparing). `successRate` is null when no valid row has a
 * boolean verdict, and `medianCostUsd` is null when no valid row reports a
 * numeric cost.
 */
export function aggregate(results) {
  const rowsByVariant = new Map();
  for (const row of results) {
    let bucket = rowsByVariant.get(row.variantId);
    if (bucket === undefined) {
      bucket = [];
      rowsByVariant.set(row.variantId, bucket);
    }
    bucket.push(row);
  }
  return [...rowsByVariant].map(([variantId, rows]) => {
    const valid = rows.filter((row) => row.error == null);
    const medianOf = (pick) => median(valid.map((row) => pick(row)));
    // success rate only over rows that were actually judged true/false
    const judged = valid.filter((row) => typeof row.success === 'boolean');
    const passed = judged.filter((row) => row.success === true).length;
    const successRate = judged.length === 0 ? null : passed / judged.length;
    const costs = valid
      .map((row) => row.costUsd)
      .filter((cost) => typeof cost === 'number');
    // sorted once and reused for the median and both quartiles
    const outputs = sortedAsc(valid.map((row) => row.usage.output));
    const toolCallTotals = {};
    for (const row of valid) {
      for (const [tool, count] of Object.entries(row.toolCalls)) {
        toolCallTotals[tool] = (toolCallTotals[tool] ?? 0) + count;
      }
    }
    return {
      variantId,
      cells: rows.length,
      errors: rows.length - valid.length,
      valid: valid.length,
      successRate,
      medianInput: medianOf((row) => row.usage.input),
      medianOutput: quantile(outputs, 0.5),
      medianCacheCreation: medianOf((row) => row.usage.cacheCreation),
      medianCacheRead: medianOf((row) => row.usage.cacheRead),
      medianCostUsd: costs.length === 0 ? null : median(costs),
      medianDurationMs: medianOf((row) => row.durationMs),
      medianTurns: medianOf((row) => row.numTurns),
      iqrOutput: [quantile(outputs, 0.25), quantile(outputs, 0.75)],
      toolCallTotals,
    };
  });
}
@@ -0,0 +1,24 @@
1
+ import type { CellResult, SuiteSpec, Variant } from './types.ts';
2
+ export interface RunOptions { // options accepted by runBenchmark; NOTE(review): field notes below are inferred from names and sibling modules, confirm against runner.js
3
+ suite: SuiteSpec;
4
+ variants: Variant[];
5
+ trials: number; // presumably trials per task×variant — TODO confirm
6
+ model: string;
7
+ maxBudgetUsd: number; // presumably the dollar ceiling that MAX_CONSECUTIVE_NO_COST_CELLS protects — TODO confirm
8
+ concurrency: number;
9
+ outDir: string; // presumably where the results (.jsonl) and meta files are written — TODO confirm
10
+ fixturesDir: string; // presumably the root that task fixture paths are resolved against — TODO confirm
11
+ onProgress?: (line: string) => void; // optional sink for progress lines
12
+ }
13
+ /**
14
+ * Cells that report no cost (timeouts, errors, subscription/Bedrock auth)
15
+ * still bill real API spend, so the dollar ceiling cannot see them. After
16
+ * this many consecutive no-cost cells the ceiling is unenforceable and the
17
+ * runner stops scheduling instead of burning the whole grid.
18
+ */
19
+ export declare const MAX_CONSECUTIVE_NO_COST_CELLS = 3; // NOTE(review): how the streak is counted and reset lives in runner.js (not shown here) — confirm there
20
+ export declare function runBenchmark(opts: RunOptions): Promise<{ // runs the benchmark described by opts; implementation not shown here
21
+ runId: string; // NOTE(review): presumably generated by newRunId — confirm
22
+ results: CellResult[];
23
+ resultsFile: string; // NOTE(review): presumably the .jsonl path rows were appended to — confirm
24
+ }>;