@astudioplus/compressor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +52 -0
- package/LICENSE +20 -0
- package/README.md +167 -0
- package/dist/adapters/agents-md.d.ts +2 -0
- package/dist/adapters/agents-md.js +91 -0
- package/dist/adapters/apply.d.ts +3 -0
- package/dist/adapters/apply.js +83 -0
- package/dist/adapters/claude-code.d.ts +2 -0
- package/dist/adapters/claude-code.js +403 -0
- package/dist/adapters/copilot.d.ts +2 -0
- package/dist/adapters/copilot.js +418 -0
- package/dist/adapters/cursor.d.ts +2 -0
- package/dist/adapters/cursor.js +149 -0
- package/dist/adapters/index.d.ts +11 -0
- package/dist/adapters/index.js +19 -0
- package/dist/adapters/markers.d.ts +7 -0
- package/dist/adapters/markers.js +129 -0
- package/dist/adapters/types.d.ts +44 -0
- package/dist/adapters/types.js +1 -0
- package/dist/bench/ablate.d.ts +35 -0
- package/dist/bench/ablate.js +163 -0
- package/dist/bench/cell.d.ts +33 -0
- package/dist/bench/cell.js +437 -0
- package/dist/bench/results.d.ts +37 -0
- package/dist/bench/results.js +157 -0
- package/dist/bench/runner.d.ts +24 -0
- package/dist/bench/runner.js +121 -0
- package/dist/bench/tasks.d.ts +4 -0
- package/dist/bench/tasks.js +147 -0
- package/dist/bench/types.d.ts +109 -0
- package/dist/bench/types.js +1 -0
- package/dist/claude/transcripts.d.ts +30 -0
- package/dist/claude/transcripts.js +154 -0
- package/dist/cli/commands/benchmark.d.ts +33 -0
- package/dist/cli/commands/benchmark.js +203 -0
- package/dist/cli/commands/compress.d.ts +8 -0
- package/dist/cli/commands/compress.js +45 -0
- package/dist/cli/commands/count.d.ts +5 -0
- package/dist/cli/commands/count.js +25 -0
- package/dist/cli/commands/hook.d.ts +6 -0
- package/dist/cli/commands/hook.js +30 -0
- package/dist/cli/commands/init.d.ts +16 -0
- package/dist/cli/commands/init.js +76 -0
- package/dist/cli/commands/report.d.ts +90 -0
- package/dist/cli/commands/report.js +464 -0
- package/dist/cli/commands/savings.d.ts +38 -0
- package/dist/cli/commands/savings.js +196 -0
- package/dist/cli/commands/set-mode.d.ts +5 -0
- package/dist/cli/commands/set-mode.js +13 -0
- package/dist/cli/commands/stats.d.ts +5 -0
- package/dist/cli/commands/stats.js +51 -0
- package/dist/cli/commands/status.d.ts +1 -0
- package/dist/cli/commands/status.js +11 -0
- package/dist/cli/commands/uninstall.d.ts +7 -0
- package/dist/cli/commands/uninstall.js +22 -0
- package/dist/cli/index.d.ts +2 -0
- package/dist/cli/index.js +146 -0
- package/dist/copilot-hook-entry.d.ts +1 -0
- package/dist/copilot-hook-entry.js +36 -0
- package/dist/copilot-hook.js +1000 -0
- package/dist/engine/detect.d.ts +2 -0
- package/dist/engine/detect.js +47 -0
- package/dist/engine/index.d.ts +4 -0
- package/dist/engine/index.js +90 -0
- package/dist/engine/policy.d.ts +2 -0
- package/dist/engine/policy.js +48 -0
- package/dist/engine/tiers/code.d.ts +7 -0
- package/dist/engine/tiers/code.js +206 -0
- package/dist/engine/tiers/logs.d.ts +4 -0
- package/dist/engine/tiers/logs.js +139 -0
- package/dist/engine/tiers/structural.d.ts +28 -0
- package/dist/engine/tiers/structural.js +199 -0
- package/dist/engine/types.d.ts +71 -0
- package/dist/engine/types.js +5 -0
- package/dist/hook/copilot.d.ts +5 -0
- package/dist/hook/copilot.js +136 -0
- package/dist/hook/core.d.ts +36 -0
- package/dist/hook/core.js +138 -0
- package/dist/hook/exit.d.ts +22 -0
- package/dist/hook/exit.js +56 -0
- package/dist/hook/post-tool-use.d.ts +5 -0
- package/dist/hook/post-tool-use.js +57 -0
- package/dist/hook-entry.d.ts +1 -0
- package/dist/hook-entry.js +35 -0
- package/dist/hook.js +946 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +16 -0
- package/dist/ledger/read.d.ts +9 -0
- package/dist/ledger/read.js +91 -0
- package/dist/ledger/write.d.ts +29 -0
- package/dist/ledger/write.js +61 -0
- package/dist/packs/atoms.d.ts +3 -0
- package/dist/packs/atoms.js +108 -0
- package/dist/packs/modes.d.ts +3 -0
- package/dist/packs/modes.js +34 -0
- package/dist/packs/render.d.ts +24 -0
- package/dist/packs/render.js +115 -0
- package/dist/packs/types.d.ts +32 -0
- package/dist/packs/types.js +1 -0
- package/dist/paths.d.ts +29 -0
- package/dist/paths.js +87 -0
- package/dist/tokens/estimate.d.ts +12 -0
- package/dist/tokens/estimate.js +23 -0
- package/dist/tokens/exact.d.ts +5 -0
- package/dist/tokens/exact.js +16 -0
- package/dist/tokens/index.d.ts +2 -0
- package/dist/tokens/index.js +2 -0
- package/package.json +77 -0
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
import { exec, execFile } from 'node:child_process';
|
|
2
|
+
import { cp, mkdir, mkdtemp, readFile, realpath, rm, writeFile } from 'node:fs/promises';
|
|
3
|
+
import { tmpdir } from 'node:os';
|
|
4
|
+
import path from 'node:path';
|
|
5
|
+
import { promisify } from 'node:util';
|
|
6
|
+
import { addUsage, encodeProjectDir, readSessionUsage, } from "../claude/transcripts.js";
|
|
7
|
+
import { resolveHookCommand } from "../paths.js";
|
|
8
|
+
const execAsync = promisify(exec);
|
|
9
|
+
const execFileAsync = promisify(execFile);
|
|
10
|
+
const CLAUDE_TIMEOUT_MS = 600_000;
|
|
11
|
+
const CHECK_TIMEOUT_MS = 600_000;
|
|
12
|
+
const MAX_BUFFER = 32 * 1024 * 1024;
|
|
13
|
+
const HOOK_MATCHER = 'Read|Bash|Grep|Glob';
|
|
14
|
+
/** True for object-like values that are neither null nor arrays. */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === 'object';
}
|
|
17
|
+
/** Passes finite numbers through; everything else (non-numbers, NaN, ±Infinity) becomes 0. */
function num(value) {
    if (typeof value !== 'number') {
        return 0;
    }
    return Number.isFinite(value) ? value : 0;
}
|
|
20
|
+
/** Fresh all-zero usage record (a new object per call, so it is safe as a reduce seed). */
function zeroUsage() {
    const empty = { input: 0, output: 0, cacheCreation: 0, cacheRead: 0 };
    return empty;
}
|
|
23
|
+
/**
 * Renders any thrown value as a short, single-string message.
 *
 * Error instances contribute their `message`; anything else is stringified.
 * Text longer than 400 UTF-16 units is cut and suffixed with an ellipsis.
 * The cut never lands between the two halves of a surrogate pair, so the
 * result contains no dangling high surrogate introduced by truncation.
 */
function errorMessage(error) {
    const limit = 400;
    const text = error instanceof Error ? error.message : String(error);
    if (text.length <= limit) {
        return text;
    }
    let end = limit;
    const lastUnit = text.charCodeAt(end - 1);
    // a high surrogate at the cut point means its low half would be dropped
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        end -= 1;
    }
    return `${text.slice(0, end)}…`;
}
|
|
27
|
+
/**
 * Runs `git init -q` in the workspace with a 30 s timeout.
 * Any failure is ignored on purpose: the workspace is usable without a repo.
 */
async function gitInitBestEffort(workspace) {
    const initArgs = ['init', '-q'];
    const spawnOptions = { cwd: workspace, timeout: 30_000 };
    try {
        await execFileAsync('git', initArgs, spawnOptions);
    }
    catch {
        // git missing or init failed — workspace works without it
    }
}
|
|
35
|
+
/**
 * Builds the hook command string installed for a benchmark cell.
 *
 * The resolved bundle command for the variant's base mode comes first; the
 * variant's optional `hookArgs` (e.g. '--marker-style informative') are
 * trimmed and appended after a single space so experiments can vary engine
 * behavior per variant. `root` is exposed for tests only; production callers
 * omit it and get the package default.
 *
 * Throws when the variant's baseMode is 'full' (a hook needs optimized|slim).
 */
export function hookCommandForVariant(variant, root) {
    const mode = variant.baseMode;
    if (mode === 'full') {
        throw new Error(`variant ${variant.id}: hook requires baseMode optimized|slim`);
    }
    let command;
    if (root === undefined) {
        command = resolveHookCommand(mode);
    }
    else {
        command = resolveHookCommand(mode, root);
    }
    const suffix = variant.hookArgs?.trim() ?? '';
    if (suffix === '') {
        return command;
    }
    return `${command} ${suffix}`;
}
|
|
51
|
+
/**
 * Materializes a variant's on-disk artifacts and returns the path of the
 * settings file written into `scratch` (cell-settings.json).
 *
 * When the variant carries both a style name and a style body, the style
 * file is written under the workspace's .claude/output-styles AND under
 * scratch/output-styles.
 */
async function writeVariantArtifacts(variant, workspace, scratch) {
    const { styleBody, styleName } = variant;
    if (styleBody !== null && styleName !== null) {
        // style resolution under --bare may use either scope: write both
        const styleDirs = [
            path.join(workspace, '.claude', 'output-styles'),
            path.join(scratch, 'output-styles'),
        ];
        for (const dir of styleDirs) {
            await mkdir(dir, { recursive: true });
        }
        for (const dir of styleDirs) {
            await writeFile(path.join(dir, `${styleName}.md`), styleBody, 'utf8');
        }
    }
    // Headless cells must work unprompted inside their throwaway workspace;
    // denied Edit/Bash calls otherwise corrupt the measurement (the model
    // spins on retries instead of doing the task — observed live: 16 turns
    // of denial loops with the correct fix in hand).
    const settings = { permissions: { defaultMode: 'bypassPermissions' } };
    if (styleName !== null) {
        settings['outputStyle'] = styleName;
    }
    if (variant.hook) {
        const hookEntry = { type: 'command', command: hookCommandForVariant(variant) };
        settings['hooks'] = {
            PostToolUse: [{ matcher: HOOK_MATCHER, hooks: [hookEntry] }],
        };
    }
    const settingsPath = path.join(scratch, 'cell-settings.json');
    await writeFile(settingsPath, `${JSON.stringify(settings, null, 2)}\n`, 'utf8');
    return settingsPath;
}
|
|
87
|
+
/**
 * Runs a shell check command in `cwd` and classifies the outcome.
 *
 * Returns `{ kind: 'ran', passed }` when the command produced an exit status
 * (zero resolves, a numeric non-zero code rejects), and
 * `{ kind: 'infra', message }` when no numeric exit code is available
 * (the rejection carries a non-numeric or missing `code`).
 *
 * The output buffer is raised to MAX_BUFFER: with exec's default limit a
 * chatty check (verbose test runner) is terminated once its output exceeds
 * the limit and rejects with a string error code, which would be reported
 * as an infra failure instead of the command's real pass/fail result.
 */
async function runCommandCheck(command, cwd) {
    try {
        await execAsync(command, { cwd, timeout: CHECK_TIMEOUT_MS, maxBuffer: MAX_BUFFER });
        return { kind: 'ran', passed: true };
    }
    catch (error) {
        // exec rejects with the child's exit status in `code` when it exited on its own
        const code = error?.code;
        if (typeof code === 'number') {
            return { kind: 'ran', passed: code === 0 };
        }
        return { kind: 'infra', message: errorMessage(error) };
    }
}
|
|
100
|
+
/**
 * Runs a command-kind check once in the workspace and reports whether it
 * passed. Returns null for non-command checks and when the command could
 * not be judged (infra outcome).
 */
async function baselineCheck(check, workspace) {
    if (check.kind === 'command') {
        const result = await runCommandCheck(check.command, workspace);
        if (result.kind === 'ran') {
            return result.passed;
        }
    }
    return null;
}
|
|
107
|
+
/**
 * Decides whether a cell succeeded.
 *
 * Command checks run once in the workspace (after the final turn); a check
 * that could not run yields `success: null` plus a `checkError` note.
 * For answer-regex checks the conversation is the answer: success means the
 * pattern matches ANY single turn's result text (see the semantics note in
 * tasks.ts). An invalid pattern yields `success: null` with a `checkError`.
 */
async function judgeSuccess(check, workspace, resultTexts) {
    if (check.kind !== 'command') {
        try {
            const matcher = new RegExp(check.pattern, check.flags);
            const matchesTurn = (text) => {
                matcher.lastIndex = 0; // 'g'/'y' flags carry state across .test calls
                return matcher.test(text);
            };
            return { success: resultTexts.some(matchesTurn), checkError: null };
        }
        catch (error) {
            return { success: null, checkError: `answer-regex invalid: ${errorMessage(error)}` };
        }
    }
    const outcome = await runCommandCheck(check.command, workspace);
    return outcome.kind === 'infra'
        ? { success: null, checkError: `success check failed to run: ${outcome.message}` }
        : { success: outcome.passed, checkError: null };
}
|
|
132
|
+
/**
 * Environment for the claude child process (and therefore for the
 * PostToolUse hook it spawns): a copy of the current process environment
 * with two overrides.
 *
 * CLAUDE_CONFIG_DIR points at the cell's scratch dir to isolate the cell.
 * COMPRESSOR_NO_LEDGER=1 keeps benchmark cells out of the user's LIVE
 * savings ledger — hook-bearing cells run the real hook, and without the
 * kill switch every worthwhile compression would append a synthetic event
 * to ~/.compressor/ledger, corrupting what `compressor savings` reports.
 * Exported for tests.
 */
export function cellEnv(scratch) {
    const env = { ...process.env };
    env.CLAUDE_CONFIG_DIR = scratch;
    env.COMPRESSOR_NO_LEDGER = '1';
    return env;
}
|
|
143
|
+
/**
 * Runs one headless claude turn in the workspace and returns its raw stdout.
 *
 * The binary comes from COMPRESSOR_CLAUDE_BIN (default 'claude'). When
 * `resumeSessionId` is given, the turn continues that session via --resume.
 * Rejects with whatever the child-process call rejects with.
 */
async function invokeClaude(spec, workspace, scratch, settingsFile, prompt, resumeSessionId) {
    const bin = process.env.COMPRESSOR_CLAUDE_BIN ?? 'claude';
    // documented headless continuation: claude -p "<prompt>" --resume <id>
    const resumeArgs = resumeSessionId !== undefined ? ['--resume', resumeSessionId] : [];
    const args = [
        '--bare',
        '-p', prompt,
        '--output-format', 'json',
        '--model', spec.model,
        '--settings', settingsFile,
        ...resumeArgs,
    ];
    const options = {
        cwd: workspace,
        env: cellEnv(scratch),
        timeout: CLAUDE_TIMEOUT_MS,
        maxBuffer: MAX_BUFFER,
    };
    // .mjs/.js bins (test stubs) are not directly executable: run via node
    const viaNode = /\.(mjs|js)$/.test(bin);
    const file = viaNode ? process.execPath : bin;
    const argv = viaNode ? [bin, ...args] : args;
    const { stdout } = await execFileAsync(file, argv, options);
    return stdout;
}
|
|
172
|
+
/**
 * Parses the JSON a headless claude turn printed and normalizes it.
 *
 * Throws 'result JSON parse failed: …' when stdout is not JSON (quoting up
 * to 200 characters of it, or '(empty stdout)') or is not a JSON object.
 * Missing or mistyped fields degrade to neutral values: null for
 * session id / cost, 0 for counters and token fields, '' for result text,
 * and an empty list for served models.
 */
function parseResultJson(stdout) {
    let parsed;
    try {
        parsed = JSON.parse(stdout);
    }
    catch {
        const head = stdout.trim().slice(0, 200);
        const shown = head === '' ? '(empty stdout)' : head;
        throw new Error(`result JSON parse failed: ${shown}`);
    }
    if (!isRecord(parsed)) {
        throw new Error('result JSON parse failed: not an object');
    }
    const recordOrEmpty = (value) => (isRecord(value) ? value : {});
    const rawUsage = recordOrEmpty(parsed['usage']);
    const sessionId = parsed['session_id'];
    const cost = parsed['total_cost_usd'];
    const denials = parsed['permission_denials'];
    const result = parsed['result'];
    return {
        sessionId: typeof sessionId === 'string' ? sessionId : null,
        servedModels: Object.keys(recordOrEmpty(parsed['modelUsage'])),
        usage: {
            input: num(rawUsage['input_tokens']),
            output: num(rawUsage['output_tokens']),
            cacheCreation: num(rawUsage['cache_creation_input_tokens']),
            cacheRead: num(rawUsage['cache_read_input_tokens']),
        },
        costUsd: typeof cost === 'number' ? cost : null,
        durationMs: num(parsed['duration_ms']),
        numTurns: num(parsed['num_turns']),
        permissionDenials: Array.isArray(denials) ? denials.length : 0,
        resultText: typeof result === 'string' ? result : '',
    };
}
|
|
203
|
+
/** Path of a session's transcript: <scratch>/projects/<encoded workspace>/<sessionId>.jsonl. */
function transcriptFilePath(scratch, workspace, sessionId) {
    const projectDir = encodeProjectDir(workspace);
    const fileName = `${sessionId}.jsonl`;
    return path.join(scratch, 'projects', projectDir, fileName);
}
|
|
206
|
+
/**
 * Relative tolerance for the transcript-vs-turns usage cross-check.
 *
 * Transcript totals and summed per-turn result JSONs count the same API
 * responses, so they must roughly agree. Divergence beyond this tolerance
 * means one of the two known failure topologies happened:
 *  - a resumed session forked ids and the final transcript does NOT carry
 *    the full copied history (transcript ≪ sum: usage silently undercounts
 *    to roughly the last turn), or
 *  - per-turn result JSONs report cumulative session usage (sum ≫
 *    transcript: the fallback double-counts).
 * Neither is detectable from one side alone; the cell is flagged
 * data-quality-suspect.
 */
export const USAGE_MISMATCH_TOLERANCE = 0.25;
/** Adds up the four token buckets of a usage record. */
function totalTokens(usage) {
    const { input, output, cacheCreation, cacheRead } = usage;
    return input + output + cacheCreation + cacheRead;
}
|
|
220
|
+
/**
 * Cell-level usage for multi-turn cells.
 *
 * The FINAL transcript, deduped by requestId (readSessionUsage), is
 * authoritative across all turns — resumed sessions carry the full history,
 * and per-turn result JSONs would double count anything the API reported on
 * more than one turn. The summed turn result JSONs are the fallback when
 * there is no session id, the transcript has no turns, or reading it fails.
 *
 * When the transcript IS used, it is cross-checked against the summed
 * per-turn usage; disagreement beyond USAGE_MISMATCH_TOLERANCE flags the
 * cell instead of silently reporting a wrong total (`suspect` carries the
 * data-quality note, otherwise null).
 */
async function multiTurnUsage(scratch, workspace, sessionId, turnUsage) {
    const summed = turnUsage.reduce(addUsage, zeroUsage());
    const fallback = { totals: summed, suspect: null };
    if (sessionId === null) {
        return fallback;
    }
    try {
        const transcript = transcriptFilePath(scratch, workspace, sessionId);
        const session = await readSessionUsage(transcript);
        if (session.turns === 0) {
            return fallback;
        }
        const fromTranscript = totalTokens(session.totals);
        const fromTurns = totalTokens(summed);
        const limit = Math.max(fromTranscript, fromTurns) * USAGE_MISMATCH_TOLERANCE;
        const diverges = fromTurns > 0 && Math.abs(fromTranscript - fromTurns) > limit;
        let suspect = null;
        if (diverges) {
            suspect = `usage data-quality: final transcript totals (${fromTranscript} tokens) diverge from summed per-turn usage (${fromTurns} tokens) by >${Math.round(USAGE_MISMATCH_TOLERANCE * 100)}% — resumed session may have forked without full history, or per-turn result JSONs may be cumulative`;
        }
        return { totals: session.totals, suspect };
    }
    catch {
        return fallback;
    }
}
|
|
252
|
+
/**
 * Counts tool_use blocks per tool name in a JSONL transcript.
 *
 * An unreadable file yields {}. Blank lines, unparseable lines, and
 * non-assistant entries are skipped, as are entries whose message content
 * is not an array.
 */
async function countToolCalls(transcriptFile) {
    let raw;
    try {
        raw = await readFile(transcriptFile, 'utf8');
    }
    catch {
        return {};
    }
    // PLAN.md: the same API response can appear on multiple transcript lines —
    // dedupe by requestId/message.id, last occurrence wins (matches
    // readSessionUsage in src/claude/transcripts.ts)
    const latestByResponse = new Map();
    let anonymous = 0;
    const isNamedToolUse = (block) => isRecord(block) && block['type'] === 'tool_use' && typeof block['name'] === 'string';
    for (const rawLine of raw.split('\n')) {
        const line = rawLine.trim();
        if (line === '') {
            continue;
        }
        let entry;
        try {
            entry = JSON.parse(line);
        }
        catch {
            continue;
        }
        if (!isRecord(entry) || entry['type'] !== 'assistant') {
            continue;
        }
        const message = entry['message'];
        if (!isRecord(message) || !Array.isArray(message['content'])) {
            continue;
        }
        const toolNames = message['content'].filter(isNamedToolUse).map((block) => block['name']);
        let key;
        if (typeof entry['requestId'] === 'string') {
            key = entry['requestId'];
        }
        else if (typeof message['id'] === 'string') {
            key = message['id'];
        }
        else {
            // entries without any id can never collide with each other
            key = `anon-${anonymous}`;
            anonymous += 1;
        }
        latestByResponse.set(key, toolNames);
    }
    const counts = {};
    for (const toolNames of latestByResponse.values()) {
        for (const name of toolNames) {
            counts[name] = (counts[name] ?? 0) + 1;
        }
    }
    return counts;
}
|
|
302
|
+
/**
 * Best-effort recursive removal of `dir`, refusing anything that is not
 * strictly inside the OS temp dir.
 *
 * Does nothing for an empty string, for the temp dir itself, and for any
 * path that resolves outside it (the relative path is the `..` segment,
 * begins with a `..` segment, or is absolute — e.g. another drive).
 * Children whose own name merely starts with two dots (`..foo`) are inside
 * the temp dir and are removed. Every error is swallowed: cleanup must never
 * fail the caller.
 */
async function cleanupTempDir(dir) {
    if (dir === '') {
        return;
    }
    try {
        const tmpReal = await realpath(tmpdir());
        const rel = path.relative(tmpReal, dir);
        // compare whole path segments: a plain startsWith('..') would also
        // reject legitimate children named like '..cache'
        const escapesTmp = rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);
        if (rel === '' || escapesTmp) {
            return;
        }
        await rm(dir, { recursive: true, force: true });
    }
    catch {
        // best-effort
    }
}
|
|
317
|
+
/**
 * Executes one benchmark cell (task × variant × trial) and returns its
 * result row. Never rejects for failures inside the cell: any error becomes
 * a row with `success: null` and an `error` message.
 *
 * The cell runs in two fresh temp dirs (workspace + config scratch), which
 * are removed afterwards. Multi-turn tasks are played as a scripted
 * conversation where every follow-up prompt resumes the previous turn's
 * session.
 */
export async function runCell(spec, ctx) {
    const identity = {
        runId: ctx.runId,
        taskId: spec.task.id,
        variantId: spec.variant.id,
        trial: spec.trial,
        model: spec.model,
    };
    const isMultiTurn = spec.task.turns !== undefined;
    let workspace = '';
    let scratch = '';
    let baselineCheckPassed = null;
    // accumulated outside the try so a failed turn still reports completed
    // turns — including their COSTS: every completed turn's costUsd is known
    // at failure time, and discarding it would leave the runner's budget
    // ceiling blind to real spend on exactly the runs that misbehave
    const turnUsage = [];
    const turnCosts = [];
    // each invocation reports its own totals: sum across turns (null when no turn reported a cost)
    const spentSoFar = () => {
        if (turnCosts.length === 0) {
            return null;
        }
        return turnCosts.reduce((sum, cost) => sum + cost, 0);
    };
    try {
        // realpath both sides so the encoded transcript dir matches the cwd the
        // child reports (macOS tmpdir is a symlinked /var/folders path)
        workspace = await realpath(await mkdtemp(path.join(tmpdir(), 'compressor-bench-ws-')));
        scratch = await realpath(await mkdtemp(path.join(tmpdir(), 'compressor-bench-cfg-')));
        // fix.patch.json is the answer key (scripted fix for stubs/fixture tests);
        // copying it would hand the agent the literal solution
        const fixtureRoot = path.join(ctx.fixturesDir, spec.task.fixture);
        await cp(fixtureRoot, workspace, {
            recursive: true,
            filter: (src) => path.basename(src) !== 'fix.patch.json',
        });
        await gitInitBestEffort(workspace);
        const settingsFile = await writeVariantArtifacts(spec.variant, workspace, scratch);
        baselineCheckPassed = await baselineCheck(spec.task.check, workspace);
        // scripted conversation: first the task prompt, then each turn resumed
        // from the previous turn's session id (sessions can fork ids on resume,
        // so each turn chains from the one before it)
        const prompts = [spec.task.prompt, ...(spec.task.turns ?? [])];
        const turns = [];
        for (let index = 0; index < prompts.length; index += 1) {
            const prompt = prompts[index];
            const label = prompts.length > 1 ? `turn ${index + 1}/${prompts.length}: ` : '';
            let resume;
            if (index > 0) {
                const prevSession = turns[index - 1]?.sessionId ?? null;
                if (prevSession === null) {
                    throw new Error(`${label}previous turn reported no session_id to --resume from`);
                }
                resume = prevSession;
            }
            let parsed;
            try {
                parsed = parseResultJson(await invokeClaude(spec, workspace, scratch, settingsFile, prompt, resume));
            }
            catch (error) {
                // single-shot keeps its original message; conversations get the label
                throw label === '' ? error : new Error(`${label}${errorMessage(error)}`);
            }
            turns.push(parsed);
            turnUsage.push(parsed.usage);
            if (typeof parsed.costUsd === 'number') {
                turnCosts.push(parsed.costUsd);
            }
        }
        const final = turns[turns.length - 1];
        if (final === undefined) {
            throw new Error('no turns ran'); // unreachable: prompts is never empty
        }
        // final transcript covers the whole conversation (toolCalls + usage)
        let toolCalls = {};
        if (final.sessionId !== null) {
            toolCalls = await countToolCalls(transcriptFilePath(scratch, workspace, final.sessionId));
        }
        const multi = isMultiTurn
            ? await multiTurnUsage(scratch, workspace, final.sessionId, turnUsage)
            : null;
        const usage = multi === null ? final.usage : multi.totals;
        const verdict = await judgeSuccess(spec.task.check, workspace, turns.map((turn) => turn.resultText));
        const problems = [verdict.checkError, multi?.suspect ?? null].filter((note) => note !== null);
        // fold the per-turn counters in one pass
        const served = new Set();
        let durationMs = 0;
        let numTurns = 0;
        let permissionDenials = 0;
        for (const turn of turns) {
            for (const model of turn.servedModels) {
                served.add(model);
            }
            durationMs += turn.durationMs;
            numTurns += turn.numTurns;
            permissionDenials += turn.permissionDenials;
        }
        return {
            ...identity,
            servedModels: [...served],
            baselineCheckPassed,
            success: verdict.success,
            usage,
            costUsd: spentSoFar(),
            durationMs,
            numTurns,
            permissionDenials,
            ...(isMultiTurn ? { turnUsage: [...turnUsage] } : {}),
            toolCalls,
            sessionId: final.sessionId,
            ...(problems.length > 0 ? { error: problems.join('; ') } : {}),
            timestamp: new Date().toISOString(),
        };
    }
    catch (error) {
        // a failed/garbled turn errors the cell, but completed turns still
        // count: usage sums them (keeps `usage` consistent with `turnUsage` —
        // an aggregator summing either must see the same spend) and costUsd
        // carries the partial spend so the runner's budget ceiling sees it
        const partialTurnUsage = isMultiTurn && turnUsage.length > 0 ? { turnUsage: [...turnUsage] } : {};
        return {
            ...identity,
            servedModels: [],
            baselineCheckPassed,
            success: null,
            usage: turnUsage.reduce(addUsage, zeroUsage()),
            costUsd: spentSoFar(),
            durationMs: 0,
            numTurns: 0,
            permissionDenials: 0,
            ...partialTurnUsage,
            toolCalls: {},
            sessionId: null,
            error: errorMessage(error),
            timestamp: new Date().toISOString(),
        };
    }
    finally {
        await cleanupTempDir(workspace);
        await cleanupTempDir(scratch);
    }
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import type { CellResult, RunMeta } from './types.ts';
|
|
2
|
+
/** Path of a run's results file: `<outDir>/<runId>.jsonl`. */
export declare function runFilePath(outDir: string, runId: string): string;
/** New run id of the form `bench-YYYYMMDD-HHMMSS`, built from the local clock. */
export declare function newRunId(): string;
/**
 * Appends `row` as one JSON line to the run's results file, creating
 * `outDir` if needed.
 * @returns the results file path
 */
export declare function appendResult(outDir: string, runId: string, row: CellResult): Promise<string>;
/**
 * Writes `meta` (pretty-printed JSON) to `<outDir>/<meta.runId>.meta.json`,
 * creating `outDir` if needed.
 * @returns the meta file path
 */
export declare function writeRunMeta(outDir: string, meta: RunMeta): Promise<string>;
/**
 * Loads a run from disk without throwing on missing or malformed files.
 * `meta` is null unless the meta file parses to an object with a string
 * `runId`. `results` holds every results-file line that parses to an object
 * with string `taskId` and `variantId`; other lines are skipped.
 */
export declare function readRun(outDir: string, runId: string): Promise<{
    meta: RunMeta | null;
    results: CellResult[];
}>;
|
|
10
|
+
/**
 * Post-run balance assertion: cross-variant comparison is valid only when
 * every variant executed the same number of cells (the runner schedules
 * variants innermost and stops group-atomically, so an imbalance means
 * something defeated that — e.g. results concatenated from separate arm runs
 * with independent budget ceilings, each truncating at its own point).
 * Returns a warning string, or null when balanced. Skipped cells (budget
 * ceiling / no-cost breaker) are not counted as executed.
 *
 * @param results rows to check; rows whose `error` starts with `skipped:` are ignored
 * @returns a warning listing the per-variant executed counts, or null when
 *   all counts are equal (including when there are no executed rows)
 */
export declare function balanceWarning(results: readonly CellResult[]): string | null;
|
|
20
|
+
/** Per-variant summary produced by `aggregate`. All medians are taken over valid (non-error) cells. */
export interface VariantAggregate {
    variantId: string;
    /** all rows recorded for the variant, errored or not */
    cells: number;
    /** rows carrying a non-null `error` (cells − valid) */
    errors: number;
    /** non-error cells — medians are 0-on-empty, so deltas must check this */
    valid: number;
    /** share of valid cells with a boolean `success` that succeeded; null when none was judged */
    successRate: number | null;
    medianInput: number;
    medianOutput: number;
    medianCacheCreation: number;
    medianCacheRead: number;
    /** median over valid cells that report a numeric cost; null when none does */
    medianCostUsd: number | null;
    medianDurationMs: number;
    medianTurns: number;
    /** [25th, 75th] percentile of output tokens over valid cells (linear interpolation) */
    iqrOutput: [number, number];
    /** tool-call counts summed per tool name across valid cells */
    toolCallTotals: Record<string, number>;
}
|
|
37
|
+
/**
 * Groups result rows by `variantId` and summarizes each group.
 * @returns one aggregate per variant, in the order variants first appear in `results`
 */
export declare function aggregate(results: CellResult[]): VariantAggregate[];
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import { appendFile, mkdir, readFile, writeFile } from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
/** Location of a run's JSONL results file inside `outDir`. */
export function runFilePath(outDir, runId) {
    const fileName = `${runId}.jsonl`;
    return path.join(outDir, fileName);
}
|
|
6
|
+
/** Location of a run's metadata file (`<runId>.meta.json`) inside `outDir`. */
function metaFilePath(outDir, runId) {
    const fileName = `${runId}.meta.json`;
    return path.join(outDir, fileName);
}
|
|
9
|
+
/**
 * Generates a run id from the local clock: `bench-YYYYMMDD-HHMMSS`.
 * Resolution is one second.
 */
export function newRunId() {
    const now = new Date();
    const twoDigits = (n) => String(n).padStart(2, '0');
    const datePart = [
        now.getFullYear(),
        twoDigits(now.getMonth() + 1),
        twoDigits(now.getDate()),
    ].join('');
    const timePart = [now.getHours(), now.getMinutes(), now.getSeconds()]
        .map((part) => twoDigits(part))
        .join('');
    return `bench-${datePart}-${timePart}`;
}
|
|
16
|
+
/**
 * Appends one result row to the run's JSONL file (one JSON document per
 * line), creating `outDir` first if it does not exist.
 * Returns the path of the file written.
 */
export async function appendResult(outDir, runId, row) {
    await mkdir(outDir, { recursive: true });
    const target = runFilePath(outDir, runId);
    const line = `${JSON.stringify(row)}\n`;
    await appendFile(target, line, 'utf8');
    return target;
}
|
|
22
|
+
/**
 * Writes the run metadata as pretty-printed JSON (2-space indent, trailing
 * newline), creating `outDir` first if it does not exist. The file name is
 * derived from `meta.runId`. Returns the path of the file written.
 */
export async function writeRunMeta(outDir, meta) {
    await mkdir(outDir, { recursive: true });
    const target = metaFilePath(outDir, meta.runId);
    const body = `${JSON.stringify(meta, null, 2)}\n`;
    await writeFile(target, body, 'utf8');
    return target;
}
|
|
28
|
+
/** Narrowing helper: object-like, not null, not an array. */
function isRecord(value) {
    const objectLike = typeof value === 'object' && value !== null;
    return objectLike && !Array.isArray(value);
}
|
|
31
|
+
/**
 * Loads a run's metadata and result rows from `outDir`; never throws for
 * missing or malformed files.
 *
 * `meta` is the parsed meta file when it is an object with a string
 * `runId`, otherwise null. `results` contains, in file order, every JSONL
 * line that parses to an object with string `taskId` and `variantId`;
 * blank, unparseable, or differently-shaped lines are skipped. A missing
 * results file yields an empty list.
 */
export async function readRun(outDir, runId) {
    let meta = null;
    try {
        const candidate = JSON.parse(await readFile(metaFilePath(outDir, runId), 'utf8'));
        const looksLikeMeta = isRecord(candidate) && typeof candidate['runId'] === 'string';
        if (looksLikeMeta) {
            meta = candidate;
        }
    }
    catch {
        meta = null;
    }
    let text;
    try {
        text = await readFile(runFilePath(outDir, runId), 'utf8');
    }
    catch {
        text = '';
    }
    const isResultRow = (value) => isRecord(value) &&
        typeof value['taskId'] === 'string' &&
        typeof value['variantId'] === 'string';
    const results = [];
    for (const rawLine of text.split('\n')) {
        const line = rawLine.trim();
        if (line === '') {
            continue;
        }
        let row;
        try {
            row = JSON.parse(line);
        }
        catch {
            continue;
        }
        if (isResultRow(row)) {
            results.push(row);
        }
    }
    return { meta, results };
}
|
|
69
|
+
/**
 * Post-run balance check. Comparing variants is only valid when each one
 * executed the same number of cells (the runner schedules variants
 * innermost and stops group-atomically, so an imbalance means something
 * defeated that — e.g. results concatenated from separate arm runs with
 * independent budget ceilings, each truncating at its own point).
 *
 * Rows whose error starts with 'skipped:' (budget ceiling / no-cost
 * breaker) do not count as executed. Returns a warning string naming the
 * per-variant counts, or null when the counts agree or nothing executed.
 */
export function balanceWarning(results) {
    const executed = new Map();
    for (const row of results) {
        const wasSkipped = row.error?.startsWith('skipped:') === true;
        if (!wasSkipped) {
            const seen = executed.get(row.variantId) ?? 0;
            executed.set(row.variantId, seen + 1);
        }
    }
    // balanced means at most one distinct count across all variants
    const distinctCounts = new Set(executed.values());
    if (distinctCounts.size <= 1) {
        return null;
    }
    const parts = [];
    for (const [variantId, count] of executed) {
        parts.push(`${variantId}=${count}`);
    }
    const detail = parts.join(', ');
    return `WARNING: unbalanced variants — executed cell counts differ (${detail}); drop task×trial groups missing from any variant before comparing`;
}
|
|
95
|
+
/**
 * Quantile `p` of an ascending-sorted array using linear interpolation
 * between the two nearest ranks (numpy default). Empty input yields 0.
 */
function quantile(sorted, p) {
    const size = sorted.length;
    if (size === 0) {
        return 0;
    }
    const position = (size - 1) * p;
    const lowerIndex = Math.floor(position);
    const upperIndex = Math.ceil(position);
    const lower = sorted[lowerIndex] ?? 0;
    const upper = sorted[upperIndex] ?? lower;
    const fraction = position - lowerIndex;
    return lower + (upper - lower) * fraction;
}
|
|
105
|
+
/** Returns a numerically ascending copy; the input is left untouched. */
function sortedAsc(values) {
    const copy = [...values];
    copy.sort((left, right) => left - right);
    return copy;
}
|
|
108
|
+
/** Median of an unsorted list (0 for an empty list, via quantile). */
function median(values) {
    const ordered = sortedAsc(values);
    return quantile(ordered, 0.5);
}
|
|
111
|
+
/**
 * Summarizes result rows per variant.
 *
 * Rows are grouped by `variantId` (output order = first appearance). Only
 * rows without an error feed the statistics. The success rate is taken over
 * valid rows whose `success` is a boolean and is null when there are none;
 * the cost median is taken over valid rows with a numeric `costUsd` and is
 * null when there are none.
 */
export function aggregate(results) {
    const grouped = new Map();
    for (const row of results) {
        const bucket = grouped.get(row.variantId);
        if (bucket !== undefined) {
            bucket.push(row);
        }
        else {
            grouped.set(row.variantId, [row]);
        }
    }
    return [...grouped].map(([variantId, rows]) => {
        const valid = rows.filter((r) => r.error == null);
        const judged = valid.filter((r) => typeof r.success === 'boolean');
        let successRate = null;
        if (judged.length > 0) {
            const passed = judged.filter((r) => r.success === true).length;
            successRate = passed / judged.length;
        }
        const costs = valid
            .map((r) => r.costUsd)
            .filter((c) => typeof c === 'number');
        // sorted once: reused for the median and both IQR bounds
        const outputs = sortedAsc(valid.map((r) => r.usage.output));
        const toolCallTotals = {};
        for (const r of valid) {
            for (const [name, count] of Object.entries(r.toolCalls)) {
                toolCallTotals[name] = (toolCallTotals[name] ?? 0) + count;
            }
        }
        const medianOf = (pick) => median(valid.map(pick));
        return {
            variantId,
            cells: rows.length,
            errors: rows.length - valid.length,
            valid: valid.length,
            successRate,
            medianInput: medianOf((r) => r.usage.input),
            medianOutput: quantile(outputs, 0.5),
            medianCacheCreation: medianOf((r) => r.usage.cacheCreation),
            medianCacheRead: medianOf((r) => r.usage.cacheRead),
            medianCostUsd: costs.length === 0 ? null : median(costs),
            medianDurationMs: medianOf((r) => r.durationMs),
            medianTurns: medianOf((r) => r.numTurns),
            iqrOutput: [quantile(outputs, 0.25), quantile(outputs, 0.75)],
            toolCallTotals,
        };
    });
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { CellResult, SuiteSpec, Variant } from './types.ts';
|
|
2
|
+
/**
 * Inputs to `runBenchmark`.
 * NOTE(review): the runner implementation is not visible here, so the
 * field notes below are read from names and types only — confirm against runner.js.
 */
export interface RunOptions {
    suite: SuiteSpec;
    variants: Variant[];
    /** presumably repetitions per task × variant — TODO confirm */
    trials: number;
    model: string;
    /** presumably the dollar ceiling for the whole run — TODO confirm */
    maxBudgetUsd: number;
    concurrency: number;
    /** presumably where the run's result files are written — TODO confirm */
    outDir: string;
    fixturesDir: string;
    /** optional sink for progress lines */
    onProgress?: (line: string) => void;
}
|
|
13
|
+
/**
 * Cells that report no cost (timeouts, errors, subscription/Bedrock auth)
 * still bill real API spend, so the dollar ceiling cannot see them. After
 * this many consecutive no-cost cells the ceiling is unenforceable and the
 * runner stops scheduling instead of burning the whole grid.
 */
export declare const MAX_CONSECUTIVE_NO_COST_CELLS = 3;
/**
 * Runs the benchmark described by `opts`.
 * @returns the run id, the collected cell results, and the path of the results file
 */
export declare function runBenchmark(opts: RunOptions): Promise<{
    runId: string;
    results: CellResult[];
    resultsFile: string;
}>;
|