@astudioplus/compressor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +52 -0
- package/LICENSE +20 -0
- package/README.md +167 -0
- package/dist/adapters/agents-md.d.ts +2 -0
- package/dist/adapters/agents-md.js +91 -0
- package/dist/adapters/apply.d.ts +3 -0
- package/dist/adapters/apply.js +83 -0
- package/dist/adapters/claude-code.d.ts +2 -0
- package/dist/adapters/claude-code.js +403 -0
- package/dist/adapters/copilot.d.ts +2 -0
- package/dist/adapters/copilot.js +418 -0
- package/dist/adapters/cursor.d.ts +2 -0
- package/dist/adapters/cursor.js +149 -0
- package/dist/adapters/index.d.ts +11 -0
- package/dist/adapters/index.js +19 -0
- package/dist/adapters/markers.d.ts +7 -0
- package/dist/adapters/markers.js +129 -0
- package/dist/adapters/types.d.ts +44 -0
- package/dist/adapters/types.js +1 -0
- package/dist/bench/ablate.d.ts +35 -0
- package/dist/bench/ablate.js +163 -0
- package/dist/bench/cell.d.ts +33 -0
- package/dist/bench/cell.js +437 -0
- package/dist/bench/results.d.ts +37 -0
- package/dist/bench/results.js +157 -0
- package/dist/bench/runner.d.ts +24 -0
- package/dist/bench/runner.js +121 -0
- package/dist/bench/tasks.d.ts +4 -0
- package/dist/bench/tasks.js +147 -0
- package/dist/bench/types.d.ts +109 -0
- package/dist/bench/types.js +1 -0
- package/dist/claude/transcripts.d.ts +30 -0
- package/dist/claude/transcripts.js +154 -0
- package/dist/cli/commands/benchmark.d.ts +33 -0
- package/dist/cli/commands/benchmark.js +203 -0
- package/dist/cli/commands/compress.d.ts +8 -0
- package/dist/cli/commands/compress.js +45 -0
- package/dist/cli/commands/count.d.ts +5 -0
- package/dist/cli/commands/count.js +25 -0
- package/dist/cli/commands/hook.d.ts +6 -0
- package/dist/cli/commands/hook.js +30 -0
- package/dist/cli/commands/init.d.ts +16 -0
- package/dist/cli/commands/init.js +76 -0
- package/dist/cli/commands/report.d.ts +90 -0
- package/dist/cli/commands/report.js +464 -0
- package/dist/cli/commands/savings.d.ts +38 -0
- package/dist/cli/commands/savings.js +196 -0
- package/dist/cli/commands/set-mode.d.ts +5 -0
- package/dist/cli/commands/set-mode.js +13 -0
- package/dist/cli/commands/stats.d.ts +5 -0
- package/dist/cli/commands/stats.js +51 -0
- package/dist/cli/commands/status.d.ts +1 -0
- package/dist/cli/commands/status.js +11 -0
- package/dist/cli/commands/uninstall.d.ts +7 -0
- package/dist/cli/commands/uninstall.js +22 -0
- package/dist/cli/index.d.ts +2 -0
- package/dist/cli/index.js +146 -0
- package/dist/copilot-hook-entry.d.ts +1 -0
- package/dist/copilot-hook-entry.js +36 -0
- package/dist/copilot-hook.js +1000 -0
- package/dist/engine/detect.d.ts +2 -0
- package/dist/engine/detect.js +47 -0
- package/dist/engine/index.d.ts +4 -0
- package/dist/engine/index.js +90 -0
- package/dist/engine/policy.d.ts +2 -0
- package/dist/engine/policy.js +48 -0
- package/dist/engine/tiers/code.d.ts +7 -0
- package/dist/engine/tiers/code.js +206 -0
- package/dist/engine/tiers/logs.d.ts +4 -0
- package/dist/engine/tiers/logs.js +139 -0
- package/dist/engine/tiers/structural.d.ts +28 -0
- package/dist/engine/tiers/structural.js +199 -0
- package/dist/engine/types.d.ts +71 -0
- package/dist/engine/types.js +5 -0
- package/dist/hook/copilot.d.ts +5 -0
- package/dist/hook/copilot.js +136 -0
- package/dist/hook/core.d.ts +36 -0
- package/dist/hook/core.js +138 -0
- package/dist/hook/exit.d.ts +22 -0
- package/dist/hook/exit.js +56 -0
- package/dist/hook/post-tool-use.d.ts +5 -0
- package/dist/hook/post-tool-use.js +57 -0
- package/dist/hook-entry.d.ts +1 -0
- package/dist/hook-entry.js +35 -0
- package/dist/hook.js +946 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +16 -0
- package/dist/ledger/read.d.ts +9 -0
- package/dist/ledger/read.js +91 -0
- package/dist/ledger/write.d.ts +29 -0
- package/dist/ledger/write.js +61 -0
- package/dist/packs/atoms.d.ts +3 -0
- package/dist/packs/atoms.js +108 -0
- package/dist/packs/modes.d.ts +3 -0
- package/dist/packs/modes.js +34 -0
- package/dist/packs/render.d.ts +24 -0
- package/dist/packs/render.js +115 -0
- package/dist/packs/types.d.ts +32 -0
- package/dist/packs/types.js +1 -0
- package/dist/paths.d.ts +29 -0
- package/dist/paths.js +87 -0
- package/dist/tokens/estimate.d.ts +12 -0
- package/dist/tokens/estimate.js +23 -0
- package/dist/tokens/exact.d.ts +5 -0
- package/dist/tokens/exact.js +16 -0
- package/dist/tokens/index.d.ts +2 -0
- package/dist/tokens/index.js +2 -0
- package/package.json +77 -0
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import { runCell } from "./cell.js";
|
|
2
|
+
import { appendResult, newRunId, runFilePath, writeRunMeta } from "./results.js";
|
|
3
|
+
/**
 * Upper bound on back-to-back cells that finish without a numeric cost.
 *
 * A cell can come back with no cost (timeouts, errors, subscription/Bedrock
 * auth) while still billing real API spend, which the dollar ceiling cannot
 * see. Once this many no-cost cells occur in a row the ceiling is treated as
 * unenforceable and the runner stops scheduling rather than burning the
 * whole grid.
 */
export const MAX_CONSECUTIVE_NO_COST_CELLS = 3;
|
|
10
|
+
/**
 * Builds a placeholder result row for a cell that produced no measurement.
 *
 * Identity fields are copied from the cell; every metric is zeroed or null,
 * and `costUsd` is null so the row counts as "no cost" to the caller.
 *
 * @param runId id of the run the row belongs to
 * @param cell  the cell (task, variant, trial, model) the row stands in for
 * @param error text stored in the row's `error` field
 * @returns a result row stamped with the current time (ISO 8601)
 */
function zeroedResult(runId, cell, error) {
    const { task, variant, trial, model } = cell;
    const noUsage = { input: 0, output: 0, cacheCreation: 0, cacheRead: 0 };
    const row = {
        runId,
        taskId: task.id,
        variantId: variant.id,
        trial,
        model,
        servedModels: [],
        baselineCheckPassed: null,
        success: null,
        usage: noUsage,
        costUsd: null,
        durationMs: 0,
        numTurns: 0,
        permissionDenials: 0,
        toolCalls: {},
        sessionId: null,
        error,
        timestamp: new Date().toISOString(),
    };
    return row;
}
|
|
31
|
+
/**
 * One-word (or error-text) status for a result row, used in progress lines.
 *
 * @param row a result row
 * @returns the row's `error` when one is set; otherwise 'unchecked' when
 *          `success` is null, 'pass' when it is truthy, 'fail' otherwise
 */
function statusOf(row) {
    const { error, success } = row;
    if (error !== undefined) {
        return error;
    }
    if (success === null) {
        return 'unchecked';
    }
    if (success) {
        return 'pass';
    }
    return 'fail';
}
|
|
38
|
+
/**
 * Runs every trial × task × variant cell of a suite through a worker pool,
 * appending each result row to the run's results file as it completes.
 *
 * Reads from `opts`: `outDir`, `suite` ({ name, tasks }), `variants`,
 * `model`, `trials`, `maxBudgetUsd`, `fixturesDir`, `concurrency`
 * (0/NaN/missing falls back to 2) and the optional `onProgress(line)`.
 *
 * Scheduling stops (remaining cells are recorded as zeroed "skipped" rows)
 * once accumulated cost reaches `maxBudgetUsd`, or once
 * MAX_CONSECUTIVE_NO_COST_CELLS cells in a row reported no usable cost.
 *
 * @returns `{ runId, results, resultsFile }`; `results` is indexed in
 *          scheduling order (trial, then task, then variant)
 */
export async function runBenchmark(opts) {
    const runId = newRunId();
    const resultsFile = runFilePath(opts.outDir, runId);
    await writeRunMeta(opts.outDir, {
        runId,
        suite: opts.suite.name,
        variantIds: opts.variants.map((v) => v.id),
        model: opts.model,
        trials: opts.trials,
        startedAt: new Date().toISOString(),
        maxBudgetUsd: opts.maxBudgetUsd,
    });
    // variants innermost: an early budget stop still covers every variant on
    // the tasks that did run. Enforced group-atomically below: consecutive
    // cells form one task×trial group (group size = variant count), the stop
    // decision is made ONCE at a group's first cell and shared by the rest of
    // the group, so a mid-group ceiling trip can never leave some arms of a
    // task×trial measured and others skipped — cross-arm comparisons need
    // complete groups.
    const cells = [];
    for (let trial = 1; trial <= opts.trials; trial += 1) {
        for (const task of opts.suite.tasks) {
            for (const variant of opts.variants) {
                cells.push({ task, variant, trial, model: opts.model });
            }
        }
    }
    const groupSize = Math.max(1, opts.variants.length);
    const results = new Array(cells.length);
    let spentUsd = 0;
    let noCostStreak = 0;
    // shared cursor: each worker claims the next unclaimed cell index
    let next = 0;
    // task×trial group index → skip reason decided at the group's first cell
    // (null = the whole group runs)
    const groupSkip = new Map();
    const progress = (line) => opts.onProgress?.(line);
    const worker = async () => {
        while (next < cells.length) {
            // claim synchronously (no await between read and increment) so two
            // workers can never take the same cell
            const index = next;
            next += 1;
            const cell = cells[index];
            if (cell === undefined)
                return;
            const label = `${cell.task.id} × ${cell.variant.id} trial ${cell.trial}`;
            const group = Math.floor(index / groupSize);
            let skipReason = groupSkip.get(group);
            if (skipReason === undefined) {
                skipReason =
                    spentUsd >= opts.maxBudgetUsd
                        ? `skipped: budget ceiling ${opts.maxBudgetUsd} USD reached`
                        : noCostStreak >= MAX_CONSECUTIVE_NO_COST_CELLS
                            ? `skipped: ${MAX_CONSECUTIVE_NO_COST_CELLS} consecutive cells reported no cost — budget ceiling ${opts.maxBudgetUsd} USD is unenforceable`
                            : null;
                groupSkip.set(group, skipReason);
            }
            let row;
            if (skipReason !== null) {
                row = zeroedResult(runId, cell, skipReason);
            }
            else {
                try {
                    row = await runCell(cell, { runId, fixturesDir: opts.fixturesDir });
                }
                catch (error) {
                    // runCell catches internally; this guards the pool regardless
                    row = zeroedResult(runId, cell, error instanceof Error ? error.message : String(error));
                }
                // Only a finite number is a usable cost. Adding NaN would turn
                // spentUsd into NaN, after which `spentUsd >= maxBudgetUsd` is
                // false forever and the ceiling silently stops working; treat
                // such a cell as "no cost reported" instead.
                if (Number.isFinite(row.costUsd)) {
                    spentUsd += row.costUsd;
                    noCostStreak = 0;
                }
                else {
                    noCostStreak += 1;
                }
            }
            results[index] = row;
            await appendResult(opts.outDir, runId, row);
            progress(`[${index + 1}/${cells.length}] ${label}: ${statusOf(row)} (spent $${spentUsd.toFixed(4)})`);
        }
    };
    // at least one worker, never more workers than cells
    const poolSize = Math.min(Math.max(1, Math.floor(opts.concurrency) || 2), Math.max(1, cells.length));
    await Promise.all(Array.from({ length: poolSize }, () => worker()));
    return { runId, results, resultsFile };
}
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import { readFile } from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
/** True for non-null, non-array objects (the shape of a parsed JSON object). */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === 'object';
}
|
|
6
|
+
/** True only for string primitives containing at least one character. */
function nonEmptyString(value) {
    if (typeof value !== 'string') {
        return false;
    }
    return value !== '';
}
|
|
9
|
+
/** True when `value` is an array whose elements are all strings (empty arrays pass). */
function isStringArray(value) {
    if (!Array.isArray(value)) {
        return false;
    }
    return !value.some((item) => typeof item !== 'string');
}
|
|
12
|
+
/** Human-readable text for a caught value: `.message` for Errors, `String(value)` otherwise. */
function message(error) {
    if (error instanceof Error) {
        return error.message;
    }
    return String(error);
}
|
|
15
|
+
/**
 * Validates a task's raw `check` value and returns a normalized check object.
 *
 * Accepted kinds:
 * - 'command': requires a non-empty `command` string.
 * - 'answer-regex': requires a non-empty `pattern` string that compiles as a
 *   RegExp (together with `flags`, which is optional but must be a string).
 *   `flags` is only present on the result when it was supplied.
 *
 * @param raw    the untrusted `check` value from the suite file
 * @param taskId task id, used as the prefix of every error message
 * @throws Error describing the first validation failure found
 */
function parseCheck(raw, taskId) {
    if (!isRecord(raw)) {
        throw new Error(`task ${taskId}: check must be an object`);
    }
    const kind = raw['kind'];
    switch (kind) {
        case 'command': {
            const command = raw['command'];
            if (nonEmptyString(command)) {
                return { kind: 'command', command };
            }
            throw new Error(`task ${taskId}: command check requires a non-empty command string`);
        }
        case 'answer-regex': {
            const pattern = raw['pattern'];
            if (!nonEmptyString(pattern)) {
                throw new Error(`task ${taskId}: answer-regex check requires a non-empty pattern string`);
            }
            const flags = raw['flags'];
            const hasFlags = flags !== undefined;
            if (hasFlags && typeof flags !== 'string') {
                throw new Error(`task ${taskId}: answer-regex flags must be a string`);
            }
            // compile once purely to surface a bad pattern/flags at load time
            try {
                new RegExp(pattern, flags);
            }
            catch (error) {
                throw new Error(`task ${taskId}: answer-regex pattern does not compile — ${message(error)}`);
            }
            return hasFlags
                ? { kind: 'answer-regex', pattern, flags }
                : { kind: 'answer-regex', pattern };
        }
        default:
            throw new Error(`task ${taskId}: unknown check kind ${JSON.stringify(kind)} (expected 'command' or 'answer-regex')`);
    }
}
|
|
48
|
+
/**
 * Validates a task's raw `turns` value: a non-empty array of non-empty
 * follow-up prompt strings. Returns a new array with the same entries.
 *
 * Multi-turn check semantics (tasks with `turns`):
 *  - command checks run once, in the workspace, after the FINAL turn.
 *  - answer-regex checks treat the whole conversation as the answer: the cell
 *    passes when the pattern matches ANY single turn's result text. Each turn
 *    is judged separately (so `^` anchors apply per turn), never only the
 *    final turn — follow-ups like "draft a PR comment" would otherwise erase
 *    evidence the first turn already produced.
 *
 * @param raw    the untrusted `turns` value from the suite file
 * @param taskId task id, used as the prefix of every error message
 * @throws Error when `raw` is not a non-empty array or an entry is invalid
 */
function parseTurns(raw, taskId) {
    const isNonEmptyArray = Array.isArray(raw) && raw.length !== 0;
    if (!isNonEmptyArray) {
        throw new Error(`task ${taskId}: turns must be a non-empty array of follow-up prompt strings`);
    }
    const checkedTurn = (turn, index) => {
        if (nonEmptyString(turn)) {
            return turn;
        }
        throw new Error(`task ${taskId}: turns[${index}] must be a non-empty string`);
    };
    return raw.map((turn, index) => checkedTurn(turn, index));
}
|
|
68
|
+
/**
 * Validates a task's raw `fixture` value: a non-empty bare directory name.
 * Anything containing `/` or `\`, and the special names `.` and `..`, is
 * rejected so the name cannot point outside the fixtures root.
 *
 * @param raw    the untrusted `fixture` value from the suite file
 * @param taskId task id, used as the prefix of every error message
 * @returns the fixture name unchanged
 * @throws Error when the value is not an acceptable bare name
 */
function parseFixture(raw, taskId) {
    if (!nonEmptyString(raw)) {
        throw new Error(`task ${taskId}: fixture must be a non-empty string`);
    }
    const hasSeparator = /[\\/]/.test(raw);
    const isDotSegment = raw === '.' || raw === '..';
    if (hasSeparator || isDotSegment) {
        throw new Error(`task ${taskId}: fixture must be a bare directory name (no path separators), got ${JSON.stringify(raw)}`);
    }
    return raw;
}
|
|
77
|
+
/**
 * Validates one raw task entry and returns the normalized task object
 * `{ id, prompt, fixture, check }`, plus `turns` and/or `tags` when supplied.
 *
 * Validation order (the first failure throws): object shape, id, duplicate
 * id, prompt, fixture, check, tags, turns.
 *
 * @param raw     the untrusted task entry from the suite file
 * @param index   position in the suite's task list (used in errors before an id is known)
 * @param seenIds ids accepted so far; the new id is added to it (mutated)
 * @throws Error describing the first validation failure found
 */
function parseTask(raw, index, seenIds) {
    if (!isRecord(raw)) {
        throw new Error(`task at index ${index}: must be an object`);
    }
    const id = raw['id'];
    if (!nonEmptyString(id)) {
        throw new Error(`task at index ${index}: id must be a non-empty string`);
    }
    if (seenIds.has(id)) {
        throw new Error(`task ${id}: duplicate id`);
    }
    // registered before the remaining checks run
    seenIds.add(id);
    const prompt = raw['prompt'];
    if (!nonEmptyString(prompt)) {
        throw new Error(`task ${id}: prompt must be a non-empty string`);
    }
    const fixture = parseFixture(raw['fixture'], id);
    const check = parseCheck(raw['check'], id);
    const tags = raw['tags'];
    const hasTags = tags !== undefined;
    if (hasTags && !isStringArray(tags)) {
        throw new Error(`task ${id}: tags must be an array of strings`);
    }
    const task = { id, prompt, fixture, check };
    const rawTurns = raw['turns'];
    if (rawTurns !== undefined) {
        task.turns = parseTurns(rawTurns, id);
    }
    if (hasTags) {
        task.tags = tags;
    }
    return task;
}
|
|
109
|
+
/**
 * Reads a suite JSON file and validates it into `{ name, tasks }`.
 *
 * The root must be an object with a non-empty `name` string and a non-empty
 * `tasks` array; each task is validated by `parseTask`, which also rejects
 * duplicate ids.
 *
 * @param suitePath path of the suite JSON file
 * @throws Error prefixed with `suite <path>:` when the file is unreadable,
 *         is not valid JSON, or fails validation
 */
export async function loadSuite(suitePath) {
    // every failure is reported with the same "suite <path>: " prefix
    const fail = (detail) => new Error(`suite ${suitePath}: ${detail}`);
    let text;
    try {
        text = await readFile(suitePath, 'utf8');
    }
    catch (error) {
        throw fail(`unreadable — ${message(error)}`);
    }
    let root;
    try {
        root = JSON.parse(text);
    }
    catch (error) {
        throw fail(`invalid JSON — ${message(error)}`);
    }
    if (!isRecord(root)) {
        throw fail('root must be an object');
    }
    const name = root['name'];
    if (!nonEmptyString(name)) {
        throw fail('name must be a non-empty string');
    }
    const rawTasks = root['tasks'];
    const hasTasks = Array.isArray(rawTasks) && rawTasks.length !== 0;
    if (!hasTasks) {
        throw fail('tasks must be a non-empty array');
    }
    const seenIds = new Set();
    try {
        const tasks = rawTasks.map((entry, position) => parseTask(entry, position, seenIds));
        return { name, tasks };
    }
    catch (error) {
        throw fail(message(error));
    }
}
|
|
144
|
+
/**
 * Fixture root shipped alongside a suite file: `<suiteDir>/../fixtures`,
 * returned as an absolute path.
 */
export function suiteFixturesDir(suitePath) {
    const suiteDir = path.dirname(suitePath);
    return path.resolve(suiteDir, '..', 'fixtures');
}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import type { Mode } from '../engine/types.ts';
|
|
2
|
+
import type { UsageTotals } from '../claude/transcripts.ts';
|
|
3
|
+
/**
 * How a task's outcome is judged. Binary, no vibes.
 *
 * - `command`: run in the cell workspace; exit 0 = pass.
 * - `answer-regex`: matched against the model's answer text (output-heavy
 *   tasks). For multi-turn tasks each turn's result text is tested
 *   separately and a match on any single turn passes. `flags` are optional
 *   RegExp flags.
 */
export type TaskCheck =
    | {
          kind: 'command';
          command: string;
      }
    | {
          kind: 'answer-regex';
          pattern: string;
          flags?: string;
      };
|
|
16
|
+
/** One benchmark task as loaded from a suite file. */
export interface TaskSpec {
    /** non-empty identifier; unique within a suite */
    id: string;
    /** first (or only) prompt sent to the model */
    prompt: string;
    /**
     * Follow-up prompts forming a scripted multi-turn conversation: the runner
     * sends `prompt`, then each entry via `claude -p --resume <session-id>`.
     * Absent = single-shot (existing behavior).
     */
    turns?: string[];
    /**
     * Directory name under bench/fixtures/ copied into the cell workspace.
     * Must be a bare name (no path separators).
     */
    fixture: string;
    /** how the outcome is judged */
    check: TaskCheck;
    /** e.g. ['log-heavy', 'output-heavy', 'large-file'] — reporting only */
    tags?: string[];
}
|
|
31
|
+
/** A named, non-empty collection of tasks loaded from one suite file. */
export interface SuiteSpec {
    /** non-empty suite name; recorded in the run metadata */
    name: string;
    /** the suite's tasks, in file order */
    tasks: TaskSpec[];
}
|
|
35
|
+
/**
 * One experimental condition (an "arm" of the benchmark).
 *
 * Modes map to variants 1:1; ablations add variants such as
 * 'optimized-minus-out.no-preamble' (that atom removed) and
 * 'optimized-plus-tokens.drop-articles' (a rejected atom added back).
 */
export interface Variant {
    /** variant identifier, recorded on every result row */
    id: string;
    /** the mode this variant is derived from */
    baseMode: Mode;
    /** rendered output-style body to install; null = no artifacts (full baseline) */
    styleBody: string | null;
    /** style name for settings.outputStyle; null for full */
    styleName: string | null;
    /** install the compression hook in this cell */
    hook: boolean;
    /**
     * Extra args appended to the hook command (e.g. '--marker-style informative')
     * so experiments can vary engine behavior per variant.
     */
    hookArgs?: string;
}
|
|
55
|
+
/** One unit of work for the runner: a task under a variant, for one trial. */
export interface CellSpec {
    task: TaskSpec;
    variant: Variant;
    /** 1-based trial number */
    trial: number;
    /** requested model */
    model: string;
}
|
|
62
|
+
/** One result row: the measured (or placeholder) outcome of a single cell. */
export interface CellResult {
    runId: string;
    taskId: string;
    variantId: string;
    trial: number;
    /** the model that was requested for the cell */
    model: string;
    /** modelUsage keys from the result JSON — fallback can silently substitute */
    servedModels: string[];
    /**
     * For command checks: did the check pass BEFORE the agent ran? A bugfix
     * fixture whose check already passes is vacuous — report flags these.
     */
    baselineCheckPassed: boolean | null;
    /**
     * null = the check itself errored (infra problem, not task failure).
     * Also null on the placeholder rows the runner writes for skipped or
     * crashed cells.
     */
    success: boolean | null;
    usage: UsageTotals;
    /** null when the cell reported no cost */
    costUsd: number | null;
    durationMs: number;
    numTurns: number;
    /**
     * Count of permission_denials in the result JSON. Non-zero denials corrupt
     * the measurement (the model burns turns retrying instead of working) —
     * the report flags them as a data-quality problem.
     */
    permissionDenials: number;
    /**
     * Per-turn usage for multi-turn cells (one entry per scripted turn, from
     * each turn's result JSON). Absent for single-shot cells. Cell-level
     * `usage` stays authoritative (summed from the final transcript).
     */
    turnUsage?: UsageTotals[];
    /** tool_use counts by tool name, from the session transcript */
    toolCalls: Record<string, number>;
    sessionId: string | null;
    /** cell-level infrastructure failure (claude crashed, timeout, parse error) */
    error?: string;
    /** ISO 8601 time the row was created */
    timestamp: string;
}
|
|
100
|
+
/** Metadata written once at the start of a benchmark run. */
export interface RunMeta {
    runId: string;
    /** name of the suite being run */
    suite: string;
    /** ids of every variant in the run */
    variantIds: string[];
    model: string;
    trials: number;
    /** ISO 8601 time the run started */
    startedAt: string;
    /**
     * Hard cost ceiling in USD; the runner stops scheduling further cells
     * once accumulated spend reaches it.
     */
    maxBudgetUsd: number;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
 * Token counters, summed over one or more usage records. The fields mirror
 * the `input_tokens`, `output_tokens`, `cache_creation_input_tokens` and
 * `cache_read_input_tokens` usage keys respectively.
 */
export interface UsageTotals {
    input: number;
    output: number;
    cacheCreation: number;
    cacheRead: number;
}
|
|
7
|
+
/** Token usage extracted from one session transcript (.jsonl) file. */
export interface SessionUsage {
    /** session id from the transcript, or the file's base name when none was found */
    sessionId: string;
    /** path of the transcript file that was read */
    file: string;
    /** number of distinct assistant requests counted */
    turns: number;
    /** usage summed over all counted requests */
    totals: UsageTotals;
    /** usage per model name ('unknown' when the entry carries none) */
    byModel: Record<string, UsageTotals>;
    /** the share of `totals` that came from sidechain entries */
    sidechain: UsageTotals;
    /** timestamp of the first counted entry, when any carried one */
    firstTimestamp?: string;
    /** timestamp of the last counted entry, when any carried one */
    lastTimestamp?: string;
}
|
|
17
|
+
/**
 * Encodes a project cwd the way Claude Code names its per-project directory:
 * every character outside [a-zA-Z0-9] becomes '-'. When the encoded name is
 * longer than 200 chars it is truncated to 200 and suffixed with '-' plus a
 * base36 hash of the ORIGINAL path (verified against the 2.1.170 binary).
 */
export declare function encodeProjectDir(absPath: string): string;
|
|
23
|
+
/** Field-wise sum of two usage totals; returns a new object and mutates neither input. */
export declare function addUsage(a: UsageTotals, b: UsageTotals): UsageTotals;
|
|
24
|
+
/** Sums the `totals` of every session; all-zero totals for an empty list. */
export declare function aggregateUsage(sessions: SessionUsage[]): UsageTotals;
|
|
25
|
+
/**
 * Lists the `.jsonl` transcript files of one project, sorted by file name.
 *
 * Looks in `<configDir>/projects/<encoded projectDir>`. `configDir` defaults
 * to `$CLAUDE_CONFIG_DIR`, then `~/.claude`. When `since` is given, files
 * last modified before it are left out. Resolves to an empty list when the
 * directory cannot be read.
 */
export declare function findTranscripts(opts: {
    projectDir: string;
    configDir?: string;
    since?: Date;
}): Promise<string[]>;
|
|
30
|
+
/**
 * Reads one transcript file line by line and totals the token usage of its
 * assistant entries, counting each request once (the last line seen for a
 * request wins). Blank and unparseable lines are skipped.
 */
export declare function readSessionUsage(file: string): Promise<SessionUsage>;
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import { createReadStream } from 'node:fs';
|
|
2
|
+
import { readdir, stat } from 'node:fs/promises';
|
|
3
|
+
import { homedir } from 'node:os';
|
|
4
|
+
import { basename, join } from 'node:path';
|
|
5
|
+
import { createInterface } from 'node:readline';
|
|
6
|
+
/** Longest encoded directory name kept as-is; longer ones are truncated and hash-suffixed. */
const ENCODED_DIR_MAX = 200;
/**
 * 32-bit java-style string hash (h = 31*h + charCode, wrapped to a signed
 * int32), matching Claude Code's long-path suffix.
 */
function hash32(text) {
    let acc = 0;
    for (let pos = 0; pos < text.length; pos += 1) {
        // Math.imul keeps the multiply in 32-bit space; `| 0` wraps the sum
        acc = (Math.imul(acc, 31) + text.charCodeAt(pos)) | 0;
    }
    return acc;
}
/**
 * Claude Code encodes the project cwd as replace(/[^a-zA-Z0-9]/g, '-'); when
 * the result exceeds 200 chars it is truncated and suffixed with a base36
 * hash of the ORIGINAL path (verified against the 2.1.170 binary).
 *
 * @param absPath project directory path to encode
 * @returns the directory name used under `<configDir>/projects`
 */
export function encodeProjectDir(absPath) {
    const dashed = absPath.replace(/[^a-zA-Z0-9]/g, '-');
    if (dashed.length > ENCODED_DIR_MAX) {
        // hash the untouched path, not the dashed form
        const suffix = Math.abs(hash32(absPath)).toString(36);
        return `${dashed.slice(0, ENCODED_DIR_MAX)}-${suffix}`;
    }
    return dashed;
}
|
|
27
|
+
/** Returns a fresh all-zero usage totals object (a new object on every call). */
const emptyTotals = () => {
    const zeroed = {
        input: 0,
        output: 0,
        cacheCreation: 0,
        cacheRead: 0,
    };
    return zeroed;
};
|
|
33
|
+
/**
 * Field-wise sum of two usage totals.
 *
 * @returns a new object; neither argument is modified
 */
export function addUsage(a, b) {
    const input = a.input + b.input;
    const output = a.output + b.output;
    const cacheCreation = a.cacheCreation + b.cacheCreation;
    const cacheRead = a.cacheRead + b.cacheRead;
    return { input, output, cacheCreation, cacheRead };
}
|
|
41
|
+
/**
 * Sums the `totals` of every session.
 *
 * @returns the combined totals; all zeros for an empty list
 */
export function aggregateUsage(sessions) {
    let combined = emptyTotals();
    sessions.forEach((session) => {
        combined = addUsage(combined, session.totals);
    });
    return combined;
}
|
|
44
|
+
/**
 * Lists the `.jsonl` transcript files for a project, sorted by file name.
 *
 * The directory searched is `<configDir>/projects/<encoded projectDir>`,
 * where `configDir` is `opts.configDir`, else `$CLAUDE_CONFIG_DIR`, else
 * `~/.claude`.
 *
 * @param opts `projectDir` (required), optional `configDir`, optional `since`
 *             (files last modified before it are left out)
 * @returns full paths of matching files; an empty list when the directory
 *          cannot be read. A file that cannot be stat'ed is left out when
 *          `since` is given.
 */
export async function findTranscripts(opts) {
    const root = opts.configDir ?? process.env.CLAUDE_CONFIG_DIR ?? join(homedir(), '.claude');
    const projectPath = join(root, 'projects', encodeProjectDir(opts.projectDir));
    let entries;
    try {
        entries = await readdir(projectPath);
    }
    catch {
        // missing/unreadable project directory simply means "no transcripts"
        return [];
    }
    const candidates = entries.filter((entry) => entry.endsWith('.jsonl')).sort();
    const transcripts = [];
    for (const entry of candidates) {
        const fullPath = join(projectPath, entry);
        let keep = true;
        if (opts.since) {
            try {
                const { mtimeMs } = await stat(fullPath);
                keep = !(mtimeMs < opts.since.getTime());
            }
            catch {
                keep = false;
            }
        }
        if (keep) {
            transcripts.push(fullPath);
        }
    }
    return transcripts;
}
|
|
71
|
+
/** True for any non-null object — arrays included. */
function isRecord(value) {
    if (value === null) {
        return false;
    }
    return typeof value === 'object';
}
|
|
74
|
+
/** Returns `value` when it is a finite number; 0 for anything else. */
function num(value) {
    if (typeof value !== 'number') {
        return 0;
    }
    return Number.isFinite(value) ? value : 0;
}
|
|
77
|
+
/**
 * Converts a raw `usage` record from a transcript entry into usage totals.
 * Each counter falls back to 0 when its key is missing or not a finite number.
 */
function usageFrom(usage) {
    const input = num(usage['input_tokens']);
    const output = num(usage['output_tokens']);
    const cacheCreation = num(usage['cache_creation_input_tokens']);
    const cacheRead = num(usage['cache_read_input_tokens']);
    return { input, output, cacheCreation, cacheRead };
}
|
|
85
|
+
/**
 * Streams a transcript (.jsonl) file and totals the token usage of its
 * assistant entries.
 *
 * Only lines that parse as JSON objects with `type === 'assistant'` and an
 * object-valued `message.usage` are counted; everything else is skipped.
 *
 * @param file path of the transcript file
 * @returns session id (falling back to the file's base name), the file path,
 *          the number of distinct requests, overall / per-model / sidechain
 *          totals, and the first and last timestamps seen on counted entries
 */
export async function readSessionUsage(file) {
    // The same requestId appears on multiple lines for streamed updates; the
    // last occurrence carries final usage, so Map.set overwrites earlier ones.
    const byRequest = new Map();
    let seenSessionId = '';
    let firstTimestamp;
    let lastTimestamp;
    let anonymousSeq = 0;
    const lines = createInterface({
        input: createReadStream(file, 'utf8'),
        crlfDelay: Infinity,
    });
    for await (const rawLine of lines) {
        const text = rawLine.trim();
        if (text === '') {
            continue;
        }
        let entry;
        try {
            entry = JSON.parse(text);
        }
        catch {
            continue;
        }
        if (!isRecord(entry) || entry['type'] !== 'assistant') {
            continue;
        }
        const message = entry['message'];
        if (!isRecord(message)) {
            continue;
        }
        const usage = message['usage'];
        if (!isRecord(usage)) {
            continue;
        }
        // first session id found wins
        if (seenSessionId === '' && typeof entry['sessionId'] === 'string') {
            seenSessionId = entry['sessionId'];
        }
        const stamp = entry['timestamp'];
        if (typeof stamp === 'string') {
            if (firstTimestamp === undefined) {
                firstTimestamp = stamp;
            }
            lastTimestamp = stamp;
        }
        // dedupe key: requestId, else message.id, else a unique synthetic key
        let key;
        if (typeof entry['requestId'] === 'string') {
            key = entry['requestId'];
        }
        else if (typeof message['id'] === 'string') {
            key = message['id'];
        }
        else {
            key = `anon-${anonymousSeq}`;
            anonymousSeq += 1;
        }
        const model = typeof message['model'] === 'string' ? message['model'] : 'unknown';
        byRequest.set(key, {
            usage: usageFrom(usage),
            model,
            sidechain: entry['isSidechain'] === true,
        });
    }
    let totals = emptyTotals();
    let sidechain = emptyTotals();
    const byModel = {};
    for (const request of byRequest.values()) {
        totals = addUsage(totals, request.usage);
        const modelSoFar = byModel[request.model] ?? emptyTotals();
        byModel[request.model] = addUsage(modelSoFar, request.usage);
        if (request.sidechain) {
            sidechain = addUsage(sidechain, request.usage);
        }
    }
    const sessionId = seenSessionId === '' ? basename(file, '.jsonl') : seenSessionId;
    return {
        sessionId,
        file,
        turns: byRequest.size,
        totals,
        byModel,
        sidechain,
        firstTimestamp,
        lastTimestamp,
    };
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
 * Options of the benchmark CLI command. Most values are declared as strings
 * here even where they name a number (`trials`, `concurrency`,
 * `maxBudgetUsd`).
 */
export interface BenchmarkCliOptions {
    suite: string;
    modes: string;
    trials: string;
    model: string;
    ablate?: string;
    ablateAdd?: string;
    /** comma-separated atom categories (output|behavior) for group ablation */
    ablateGroup?: string;
    /** commander --no-hook: defaults true */
    hook: boolean;
    /** extra args appended to the hook command in every hook-bearing variant */
    hookArgs?: string;
    /**
     * Comma-separated marker styles: fans each hook-bearing variant out into
     * one arm per style WITHIN this run (shared budget ceiling, balanced groups).
     */
    markerStyles?: string;
    concurrency: string;
    maxBudgetUsd: string;
    out: string;
}
|
|
23
|
+
/**
 * Preflight for marker-style experiments.
 *
 * The hook entry parses argv fail-open, so a STALE dist/hook.js that predates
 * --marker-style ignores the flag silently and every arm measures identical
 * 'plain' behavior — a three-arm run of pure noise with zero errors anywhere.
 *
 * This verifies the exact installed hook command by piping the same
 * over-budget payload through it with two different styles and requiring the
 * outputs to differ. COMPRESSOR_NO_LEDGER keeps the probe out of the live
 * savings ledger.
 *
 * @param baseHookCommand the installed hook command to probe
 */
export declare function assertHookHandlesMarkerStyle(baseHookCommand: string): Promise<void>;
|
|
33
|
+
/**
 * Entry point of the benchmark CLI command.
 * NOTE(review): the implementation is not visible from this declaration;
 * presumably it parses the string-valued options and starts a benchmark
 * run — confirm against benchmark.js.
 */
export declare function runBenchmarkCommand(opts: BenchmarkCliOptions): Promise<void>;
|