@dotsetlabs/dotclaw 2.4.0 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +9 -10
- package/README.md +8 -4
- package/config-examples/runtime.json +34 -8
- package/config-examples/tool-policy.json +12 -2
- package/container/agent-runner/package-lock.json +2 -2
- package/container/agent-runner/package.json +1 -1
- package/container/agent-runner/src/agent-config.ts +19 -3
- package/container/agent-runner/src/container-protocol.ts +11 -0
- package/container/agent-runner/src/context-overflow-recovery.ts +39 -0
- package/container/agent-runner/src/index.ts +603 -165
- package/container/agent-runner/src/openrouter-input.ts +159 -0
- package/container/agent-runner/src/system-prompt.ts +13 -3
- package/container/agent-runner/src/tool-loop-policy.ts +741 -0
- package/container/agent-runner/src/tools.ts +211 -8
- package/dist/agent-context.d.ts +1 -0
- package/dist/agent-context.d.ts.map +1 -1
- package/dist/agent-context.js +21 -9
- package/dist/agent-context.js.map +1 -1
- package/dist/agent-execution.d.ts +2 -0
- package/dist/agent-execution.d.ts.map +1 -1
- package/dist/agent-execution.js +164 -15
- package/dist/agent-execution.js.map +1 -1
- package/dist/agent-semaphore.d.ts +24 -1
- package/dist/agent-semaphore.d.ts.map +1 -1
- package/dist/agent-semaphore.js +109 -20
- package/dist/agent-semaphore.js.map +1 -1
- package/dist/cli.js +3 -11
- package/dist/cli.js.map +1 -1
- package/dist/config.d.ts +2 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +2 -0
- package/dist/config.js.map +1 -1
- package/dist/container-protocol.d.ts +22 -0
- package/dist/container-protocol.d.ts.map +1 -1
- package/dist/container-protocol.js.map +1 -1
- package/dist/container-runner.d.ts +7 -0
- package/dist/container-runner.d.ts.map +1 -1
- package/dist/container-runner.js +417 -143
- package/dist/container-runner.js.map +1 -1
- package/dist/db.d.ts.map +1 -1
- package/dist/db.js +46 -12
- package/dist/db.js.map +1 -1
- package/dist/error-messages.d.ts.map +1 -1
- package/dist/error-messages.js +18 -4
- package/dist/error-messages.js.map +1 -1
- package/dist/failover-policy.d.ts +41 -0
- package/dist/failover-policy.d.ts.map +1 -0
- package/dist/failover-policy.js +261 -0
- package/dist/failover-policy.js.map +1 -0
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/ipc-dispatcher.d.ts.map +1 -1
- package/dist/ipc-dispatcher.js +27 -43
- package/dist/ipc-dispatcher.js.map +1 -1
- package/dist/mcp-config.d.ts +22 -0
- package/dist/mcp-config.d.ts.map +1 -0
- package/dist/mcp-config.js +94 -0
- package/dist/mcp-config.js.map +1 -0
- package/dist/memory-backend.d.ts +27 -0
- package/dist/memory-backend.d.ts.map +1 -0
- package/dist/memory-backend.js +112 -0
- package/dist/memory-backend.js.map +1 -0
- package/dist/memory-recall.d.ts.map +1 -1
- package/dist/memory-recall.js +135 -22
- package/dist/memory-recall.js.map +1 -1
- package/dist/memory-store.d.ts +1 -0
- package/dist/memory-store.d.ts.map +1 -1
- package/dist/memory-store.js +55 -7
- package/dist/memory-store.js.map +1 -1
- package/dist/message-pipeline.d.ts +24 -0
- package/dist/message-pipeline.d.ts.map +1 -1
- package/dist/message-pipeline.js +131 -27
- package/dist/message-pipeline.js.map +1 -1
- package/dist/metrics.d.ts +1 -0
- package/dist/metrics.d.ts.map +1 -1
- package/dist/metrics.js +9 -0
- package/dist/metrics.js.map +1 -1
- package/dist/providers/discord/discord-provider.d.ts.map +1 -1
- package/dist/providers/discord/discord-provider.js +72 -4
- package/dist/providers/discord/discord-provider.js.map +1 -1
- package/dist/providers/telegram/telegram-provider.d.ts.map +1 -1
- package/dist/providers/telegram/telegram-provider.js +65 -3
- package/dist/providers/telegram/telegram-provider.js.map +1 -1
- package/dist/recall-policy.d.ts +12 -0
- package/dist/recall-policy.d.ts.map +1 -0
- package/dist/recall-policy.js +89 -0
- package/dist/recall-policy.js.map +1 -0
- package/dist/runtime-config.d.ts +33 -0
- package/dist/runtime-config.d.ts.map +1 -1
- package/dist/runtime-config.js +109 -9
- package/dist/runtime-config.js.map +1 -1
- package/dist/streaming.d.ts.map +1 -1
- package/dist/streaming.js +125 -33
- package/dist/streaming.js.map +1 -1
- package/dist/task-scheduler.d.ts.map +1 -1
- package/dist/task-scheduler.js +4 -2
- package/dist/task-scheduler.js.map +1 -1
- package/dist/tool-policy.d.ts.map +1 -1
- package/dist/tool-policy.js +26 -4
- package/dist/tool-policy.js.map +1 -1
- package/dist/trace-writer.d.ts +12 -0
- package/dist/trace-writer.d.ts.map +1 -1
- package/dist/trace-writer.js.map +1 -1
- package/dist/turn-hygiene.d.ts +14 -0
- package/dist/turn-hygiene.d.ts.map +1 -0
- package/dist/turn-hygiene.js +214 -0
- package/dist/turn-hygiene.js.map +1 -0
- package/dist/webhook.d.ts.map +1 -1
- package/dist/webhook.js +1 -0
- package/dist/webhook.js.map +1 -1
- package/package.json +15 -1
- package/scripts/benchmark-baseline.js +365 -0
- package/scripts/benchmark-harness.js +1413 -0
- package/scripts/benchmark-scenarios.js +301 -0
- package/scripts/canary-suite.js +123 -0
- package/scripts/generate-controlled-traces.js +230 -0
- package/scripts/release-slo-check.js +214 -0
- package/scripts/run-live-canary.js +339 -0
|
@@ -0,0 +1,1413 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import fs from 'node:fs';
|
|
4
|
+
import os from 'node:os';
|
|
5
|
+
import path from 'node:path';
|
|
6
|
+
import { execSync } from 'node:child_process';
|
|
7
|
+
|
|
8
|
+
import { buildReport, loadTraces, percentile } from './benchmark-baseline.js';
|
|
9
|
+
import { evaluateScenarioMetrics, evaluateScenarioThresholds } from './benchmark-scenarios.js';
|
|
10
|
+
import { evaluateReleaseSlo } from './release-slo-check.js';
|
|
11
|
+
|
|
12
|
+
// Default number of bootstrap resampling iterations (--bootstrap).
const DEFAULT_BOOTSTRAP_ITERATIONS = 1200;
// Default recovery window passed to scenario evaluation: ten minutes, in milliseconds.
const DEFAULT_RECOVERY_WINDOW_MS = 10 * 60 * 1000;
// Default trace look-back window, in days, used when --since is not given.
const DEFAULT_DAYS = 7;
// Version number stamped into snapshot and manifest files as `schema_version`.
const SNAPSHOT_SCHEMA_VERSION = 1;
|
|
16
|
+
|
|
17
|
+
/**
 * Parse the harness command line into an options object.
 *
 * The first token is the command (trimmed, lower-cased, defaulting to "help").
 * The remaining tokens are scanned for known flags; unknown tokens are ignored.
 * A value-taking flag that is the last token (no value follows) is ignored.
 * A numeric flag with an out-of-range or non-numeric value still consumes its
 * value token but leaves the default in place.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {object} Parsed options with defaults applied.
 */
function parseArgs(argv) {
  const [commandRaw, ...rest] = argv;
  const args = {
    command: (commandRaw || 'help').trim().toLowerCase(),
    runId: '',
    label: '',
    days: DEFAULT_DAYS,
    since: '',
    until: '',
    source: '',
    excludeSource: '',
    chatId: '',
    dir: '',
    outputDir: '',
    before: '',
    after: '',
    baseline: '',
    candidate: '',
    enforce: false,
    superiorityGate: false,
    bootstrap: DEFAULT_BOOTSTRAP_ITERATIONS,
    recoveryWindowMs: DEFAULT_RECOVERY_WINDOW_MS,
    latencyTolerance: 0.05,
    tokenTolerance: 0.05,
  };

  // Flags whose following token is stored verbatim.
  const textFlags = new Map([
    ['--run-id', 'runId'],
    ['--label', 'label'],
    ['--since', 'since'],
    ['--until', 'until'],
    ['--source', 'source'],
    ['--exclude-source', 'excludeSource'],
    ['--chat-id', 'chatId'],
    ['--dir', 'dir'],
    ['--output-dir', 'outputDir'],
    ['--before', 'before'],
    ['--after', 'after'],
    ['--baseline', 'baseline'],
    ['--candidate', 'candidate'],
  ]);

  const isPositive = (n) => n > 0;
  const isUnitInterval = (n) => n >= 0 && n <= 1;
  const identity = (n) => n;

  // Flags whose following token must be a finite number passing `accepts`.
  const numberFlags = new Map([
    ['--days', { key: 'days', accepts: isPositive, convert: Math.floor }],
    ['--bootstrap', { key: 'bootstrap', accepts: (n) => n >= 200, convert: Math.floor }],
    ['--recovery-window-ms', { key: 'recoveryWindowMs', accepts: isPositive, convert: Math.floor }],
    ['--latency-tolerance', { key: 'latencyTolerance', accepts: isUnitInterval, convert: identity }],
    ['--token-tolerance', { key: 'tokenTolerance', accepts: isUnitInterval, convert: identity }],
  ]);

  // Flags that take no value.
  const switchFlags = new Map([
    ['--enforce', 'enforce'],
    ['--superiority-gate', 'superiorityGate'],
  ]);

  for (let index = 0; index < rest.length; index += 1) {
    const flag = rest[index];

    if (switchFlags.has(flag)) {
      args[switchFlags.get(flag)] = true;
      continue;
    }

    // Every remaining known flag needs a following token.
    if (index + 1 >= rest.length) continue;
    const rawValue = rest[index + 1];

    if (textFlags.has(flag)) {
      args[textFlags.get(flag)] = rawValue;
      index += 1;
      continue;
    }

    const numeric = numberFlags.get(flag);
    if (numeric) {
      const value = Number(rawValue);
      if (Number.isFinite(value) && numeric.accepts(value)) {
        args[numeric.key] = numeric.convert(value);
      }
      // The value token is consumed even when it was rejected.
      index += 1;
    }
  }

  return args;
}
|
|
162
|
+
|
|
163
|
+
/**
 * Print the harness help text (commands and examples) to stdout
 * as a single newline-joined message.
 */
function usage() {
  const helpLines = [
    'DotClaw Benchmark Harness',
    '',
    'Commands:',
    ' init --run-id <id> [--days <n>|--since <iso>] [--until <iso>] [--source <list>] [--dir <traces>] [--output-dir <path>]',
    ' Create a run and capture the "overall_start" snapshot.',
    '',
    ' capture --run-id <id> --label <name> [--days <n>|--since <iso>] [--until <iso>] [--source <list>] [--exclude-source <list>] [--chat-id <id,list>] [--dir <traces>] [--output-dir <path>]',
    ' Capture a named snapshot (for tranche before/after and final).',
    '',
    ' compare --run-id <id> --before <label|file> --after <label|file> [--bootstrap <n>] [--superiority-gate] [--latency-tolerance <0..1>] [--token-tolerance <0..1>] [--enforce]',
    ' Compare two snapshots with statistical tests.',
    '',
    ' headtohead --run-id <id> --baseline <label|file> --candidate <label|file> [--bootstrap <n>] [--latency-tolerance <0..1>] [--token-tolerance <0..1>] [--enforce]',
    ' DotClaw-vs-baseline comparison with superiority gate enforcement.',
    '',
    ' report --run-id <id> [--bootstrap <n>] [--enforce]',
    ' Build run-level report (overall + tranche before/after pairs).',
    '',
    'Examples:',
    ' npm run bench:harness -- init --run-id parity-superiority-20260207 --days 14',
    ' npm run bench:harness -- capture --run-id parity-superiority-20260207 --label tranche1_before --since 2026-02-07T20:00:00Z --source dotclaw,live-canary',
    ' npm run bench:harness -- capture --run-id parity-superiority-20260207 --label tranche1_after --since 2026-02-07T22:00:00Z --source dotclaw,live-canary',
    ' npm run bench:harness -- report --run-id parity-superiority-20260207 --enforce',
  ];
  const helpText = helpLines.join('\n');
  console.log(helpText);
}
|
|
190
|
+
|
|
191
|
+
/**
 * Create a directory, including any missing parent directories.
 *
 * @param {string} dirPath - Directory to create.
 */
function ensureDir(dirPath) {
  const mkdirOptions = { recursive: true };
  fs.mkdirSync(dirPath, mkdirOptions);
}
|
|
194
|
+
|
|
195
|
+
/**
 * Write a value as pretty-printed JSON (2-space indent, trailing newline).
 *
 * The payload is written to `<filePath>.tmp` first and then renamed over the
 * target, so readers never observe a half-written file. If the write or the
 * rename fails, the temporary file is removed before the error is rethrown.
 *
 * @param {string} filePath - Destination path.
 * @param {*} value - JSON-serializable value.
 * @throws Rethrows any serialization or filesystem error.
 */
function writeJson(filePath, value) {
  // Serialize before touching the filesystem so a JSON error leaves no trace.
  const payload = `${JSON.stringify(value, null, 2)}\n`;
  const tmpPath = `${filePath}.tmp`;
  try {
    fs.writeFileSync(tmpPath, payload);
    fs.renameSync(tmpPath, filePath);
  } catch (err) {
    try {
      fs.rmSync(tmpPath, { force: true });
    } catch {
      // Best-effort cleanup only; the original failure is the one to report.
    }
    throw err;
  }
}
|
|
200
|
+
|
|
201
|
+
/**
 * Read and parse a JSON file, returning `fallback` on any failure.
 *
 * Both read errors (e.g. missing file) and parse errors are swallowed
 * on purpose; callers get the fallback instead.
 *
 * @param {string} filePath - File to read as UTF-8.
 * @param {*} [fallback=null] - Value returned when reading or parsing fails.
 * @returns {*} The parsed JSON value, or `fallback`.
 */
function readJson(filePath, fallback = null) {
  let result;
  try {
    const text = fs.readFileSync(filePath, 'utf-8');
    result = JSON.parse(text);
  } catch {
    result = fallback;
  }
  return result;
}
|
|
208
|
+
|
|
209
|
+
/**
 * Turn a free-form label into a filename-safe token.
 *
 * The label is trimmed and lower-cased, every run of characters outside
 * `a-z 0-9 . _ -` becomes a single underscore, and leading/trailing
 * underscores are stripped. Falsy input yields an empty string.
 *
 * @param {*} label - Raw label.
 * @returns {string} Normalized label (possibly empty).
 */
function normalizeLabel(label) {
  const lowered = String(label || '').trim().toLowerCase();
  const underscored = lowered.replace(/[^a-z0-9._-]+/g, '_');
  return underscored.replace(/^_+|_+$/g, '');
}
|
|
216
|
+
|
|
217
|
+
/**
 * Format a time as an ISO-8601 string with ":" and "." replaced by "-".
 *
 * @param {number} [now=Date.now()] - Epoch milliseconds.
 * @returns {string} e.g. "1970-01-01T00-00-00-000Z".
 */
function timestampToken(now = Date.now()) {
  return new Date(now).toISOString().replace(/[:.]/g, '-');
}
|
|
221
|
+
|
|
222
|
+
/**
 * Locate the DotClaw home directory.
 *
 * @returns {string} `DOTCLAW_HOME` when set to a non-empty value,
 *   otherwise `<user home>/.dotclaw`.
 */
function resolveDotclawHome() {
  const override = process.env.DOTCLAW_HOME;
  if (override) {
    return override;
  }
  return path.join(os.homedir(), '.dotclaw');
}
|
|
225
|
+
|
|
226
|
+
/**
 * Resolve the directory under which all harness runs are stored.
 *
 * @param {{outputDir?: string}} args - Parsed CLI options.
 * @returns {string} The absolute form of `--output-dir` when given,
 *   otherwise `<dotclaw home>/reports/benchmark-harness`.
 */
function resolveHarnessRoot(args) {
  const explicit = args.outputDir ? args.outputDir.trim() : '';
  if (explicit) {
    return path.resolve(explicit);
  }
  return path.join(resolveDotclawHome(), 'reports', 'benchmark-harness');
}
|
|
232
|
+
|
|
233
|
+
/**
 * Resolve the directory of a single harness run.
 *
 * @param {{runId: string}} args - Parsed CLI options.
 * @returns {string} `<harness root>/<trimmed run id>`.
 * @throws {Error} When the run id is empty after trimming.
 */
function resolveRunDir(args) {
  const runId = args.runId.trim();
  if (runId.length === 0) {
    throw new Error('--run-id is required');
  }
  const harnessRoot = resolveHarnessRoot(args);
  return path.join(harnessRoot, runId);
}
|
|
240
|
+
|
|
241
|
+
/**
 * Resolve the directory that trace files are loaded from.
 *
 * @param {{dir?: string}} args - Parsed CLI options.
 * @returns {string} The absolute form of `--dir` when given,
 *   otherwise `<dotclaw home>/traces`.
 */
function resolveTraceDir(args) {
  const explicit = args.dir ? args.dir.trim() : '';
  if (explicit) {
    return path.resolve(explicit);
  }
  return path.join(resolveDotclawHome(), 'traces');
}
|
|
247
|
+
|
|
248
|
+
/**
 * Parse a comma-separated list into a Set of trimmed, non-empty entries.
 *
 * @param {*} value - Raw comma-separated text (falsy is treated as empty).
 * @param {{lower?: boolean}} [options] - Entries are lower-cased unless
 *   `lower` is exactly `false`.
 * @returns {Set<string>|null} The entries in first-seen order, or `null`
 *   when the list has no non-empty entry.
 */
function parseCsvSet(value, options = {}) {
  const keepCase = options.lower === false;
  const unique = new Set();
  for (const piece of String(value || '').split(',')) {
    const token = keepCase ? piece.trim() : piece.trim().toLowerCase();
    if (token) {
      unique.add(token);
    }
  }
  return unique.size > 0 ? unique : null;
}
|
|
259
|
+
|
|
260
|
+
/**
 * Interpret a CLI time value as epoch milliseconds.
 *
 * A positive finite number is returned as-is (already milliseconds).
 * Anything else is handed to `Date.parse`. Blank or unparseable input
 * yields `fallback`.
 *
 * @param {*} value - Raw text such as "1700000000000" or an ISO date.
 * @param {*} fallback - Returned when no timestamp can be derived.
 * @returns {*} Epoch milliseconds, or `fallback`.
 */
function resolveTimestamp(value, fallback) {
  const text = String(value || '').trim();
  if (text.length === 0) {
    return fallback;
  }
  const asNumber = Number(text);
  if (Number.isFinite(asNumber) && asNumber > 0) {
    return asNumber;
  }
  const asDate = Date.parse(text);
  return Number.isNaN(asDate) ? fallback : asDate;
}
|
|
268
|
+
|
|
269
|
+
/**
 * Canonicalize a trace source name.
 *
 * @param {*} value - Raw source value.
 * @returns {string} The trimmed, lower-cased name, or "unknown" when the
 *   value is falsy or blank.
 */
function normalizeSource(value) {
  const cleaned = String(value || '').trim().toLowerCase();
  return cleaned === '' ? 'unknown' : cleaned;
}
|
|
273
|
+
|
|
274
|
+
/**
 * Build the trace filter object from parsed CLI options.
 *
 * `sinceMs` defaults to `args.days` days before now, and `untilMs` defaults
 * to `Infinity`. Source filters are lower-cased; chat ids keep their case.
 *
 * @param {object} args - Parsed CLI options (`since`, `until`, `days`,
 *   `source`, `excludeSource`, `chatId`).
 * @returns {{sinceMs: *, untilMs: *, includeSources: *, excludeSources: *, includeChats: *}}
 */
function resolveTraceFilters(args) {
  const defaultSince = Date.now() - (args.days * 24 * 60 * 60 * 1000);
  return {
    sinceMs: resolveTimestamp(args.since, defaultSince),
    untilMs: resolveTimestamp(args.until, Infinity),
    includeSources: parseCsvSet(args.source),
    excludeSources: parseCsvSet(args.excludeSource),
    includeChats: parseCsvSet(args.chatId, { lower: false }),
  };
}
|
|
288
|
+
|
|
289
|
+
/**
 * Describe the git state of the current working directory.
 *
 * @returns {{sha: (string|null), dirty: (boolean|null)}} The HEAD commit and
 *   whether `git status --porcelain` printed anything. Both fields are `null`
 *   when either git command fails.
 */
function currentGitInfo() {
  // stderr is discarded so a missing repository does not print noise.
  const runGit = (command) => execSync(command, {
    encoding: 'utf-8',
    stdio: ['ignore', 'pipe', 'ignore'],
  }).trim();

  try {
    const sha = runGit('git rev-parse HEAD');
    const porcelain = runGit('git status --porcelain');
    return { sha, dirty: porcelain.length > 0 };
  } catch {
    return { sha: null, dirty: null };
  }
}
|
|
298
|
+
|
|
299
|
+
/**
 * Bucket an error message or code into a coarse error class.
 *
 * Matching is case-insensitive and the first matching class wins, in the
 * order auth, rate_limit, timeout, context_overflow.
 *
 * The word "required" on its own is NOT an auth signal: validation errors
 * such as "field is required" must not be counted as auth failures.
 * "payment required" is still caught through "payment".
 *
 * @param {*} message - Error text or code (falsy is treated as empty).
 * @returns {'auth'|'rate_limit'|'timeout'|'context_overflow'|'unknown'}
 */
function classifyErrorMessage(message) {
  const lower = String(message || '').toLowerCase();
  if (/invalid.?api.?key|unauthorized|forbidden|payment|insufficient.?credit|\b401\b|\b402\b|\b403\b/.test(lower)) {
    return 'auth';
  }
  if (/rate.?limit|too many requests|\b429\b/.test(lower)) {
    return 'rate_limit';
  }
  // `timed.?out` also covers the Node error code ETIMEDOUT and "timed-out".
  if (/timeout|timed.?out|deadline|econnreset|econnrefused|enotfound|eai_again/.test(lower)) {
    return 'timeout';
  }
  if (/context.?length|maximum.?context|too many tokens|token.?limit/.test(lower)) {
    return 'context_overflow';
  }
  return 'unknown';
}
|
|
315
|
+
|
|
316
|
+
/**
 * Down-sample an array to at most `maxCount` items picked at a fixed stride.
 *
 * @param {Array} values - Source items; a non-array yields `[]`.
 * @param {number} maxCount - Maximum number of items; a non-finite or
 *   non-positive value yields `[]`.
 * @returns {Array} A copy of `values` when it already fits, otherwise the
 *   items at indices `floor(i * length / maxCount)`.
 */
function evenlySample(values, maxCount) {
  if (!Array.isArray(values) || !Number.isFinite(maxCount) || maxCount <= 0) {
    return [];
  }
  const total = values.length;
  if (total <= maxCount) {
    return Array.from(values);
  }
  const stride = total / maxCount;
  const lastIndex = total - 1;
  const picked = [];
  let slot = 0;
  while (slot < maxCount) {
    // Clamp defensively so rounding can never index past the end.
    picked.push(values[Math.min(lastIndex, Math.floor(slot * stride))]);
    slot += 1;
  }
  return picked;
}
|
|
328
|
+
|
|
329
|
+
/**
 * Read a non-negative numeric measurement from a trace field.
 *
 * Only numbers and non-blank strings are considered. `null`, `undefined`,
 * blank strings, booleans and objects count as "not recorded", because
 * `Number(null)` and `Number('')` would otherwise silently become 0.
 *
 * @param {*} value - Raw field value.
 * @returns {number|null} The finite value >= 0, or `null` when absent/invalid.
 */
function parseNonNegativeSignal(value) {
  const isNumber = typeof value === 'number';
  const isNonBlankString = typeof value === 'string' && value.trim() !== '';
  if (!isNumber && !isNonBlankString) return null;
  const parsed = Number(value);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : null;
}

/**
 * Extract raw per-record signals from trace records, overall and per source.
 *
 * For each record:
 * - a non-blank string `error_code` marks a failure (success flag 0) and is
 *   counted under its error class; otherwise the record is a success
 *   (flag 1) and gets an "empty output" flag (1 when `output_text` is
 *   missing or blank);
 * - `latency_ms`, `tokens_prompt` and `tokens_completion` are collected when
 *   they hold a non-negative number; missing or `null` fields are skipped
 *   rather than recorded as 0;
 * - every entry of `tool_calls` contributes a 1/0 flag from its `ok` field.
 *
 * Numeric series are down-sampled with `evenlySample` (4000 overall,
 * 2000 per source); flag arrays are returned in full.
 *
 * @param {Iterable<object>} records - Trace records.
 * @returns {object} Flags, samples, `error_class_counts` and `per_source`.
 */
function extractRawSignals(records) {
  const successFlags = [];
  const emptySuccessFlags = [];
  const latencies = [];
  const promptTokens = [];
  const completionTokens = [];
  const toolCallSuccessFlags = [];
  const bySource = new Map();
  const errorClassCounts = {
    auth: 0,
    rate_limit: 0,
    timeout: 0,
    context_overflow: 0,
    unknown: 0,
  };

  for (const row of records) {
    const source = normalizeSource(row.source);
    let sourceSignals = bySource.get(source);
    if (!sourceSignals) {
      sourceSignals = {
        success: [],
        empty: [],
        latencies: [],
        tool: [],
        promptTokens: [],
        completionTokens: [],
      };
      bySource.set(source, sourceSignals);
    }

    const hasError = typeof row.error_code === 'string' && row.error_code.trim().length > 0;
    if (hasError) {
      const errorClass = classifyErrorMessage(row.error_code);
      errorClassCounts[errorClass] += 1;
      successFlags.push(0);
      sourceSignals.success.push(0);
    } else {
      successFlags.push(1);
      sourceSignals.success.push(1);
      const outputText = typeof row.output_text === 'string' ? row.output_text.trim() : '';
      const emptyFlag = outputText ? 0 : 1;
      emptySuccessFlags.push(emptyFlag);
      sourceSignals.empty.push(emptyFlag);
    }

    const latency = parseNonNegativeSignal(row.latency_ms);
    if (latency !== null) {
      latencies.push(latency);
      sourceSignals.latencies.push(latency);
    }
    const prompt = parseNonNegativeSignal(row.tokens_prompt);
    if (prompt !== null) {
      promptTokens.push(prompt);
      sourceSignals.promptTokens.push(prompt);
    }
    const completion = parseNonNegativeSignal(row.tokens_completion);
    if (completion !== null) {
      completionTokens.push(completion);
      sourceSignals.completionTokens.push(completion);
    }

    const toolCalls = Array.isArray(row.tool_calls) ? row.tool_calls : [];
    for (const call of toolCalls) {
      const flag = call?.ok ? 1 : 0;
      toolCallSuccessFlags.push(flag);
      sourceSignals.tool.push(flag);
    }
  }

  const perSource = {};
  for (const [source, signals] of bySource.entries()) {
    perSource[source] = {
      success_flags: signals.success,
      empty_success_flags: signals.empty,
      tool_call_success_flags: signals.tool,
      latencies_ms_sample: evenlySample(signals.latencies, 2000),
      prompt_tokens_sample: evenlySample(signals.promptTokens, 2000),
      completion_tokens_sample: evenlySample(signals.completionTokens, 2000),
    };
  }

  return {
    success_flags: successFlags,
    empty_success_flags: emptySuccessFlags,
    tool_call_success_flags: toolCallSuccessFlags,
    latencies_ms_sample: evenlySample(latencies, 4000),
    prompt_tokens_sample: evenlySample(promptTokens, 4000),
    completion_tokens_sample: evenlySample(completionTokens, 4000),
    error_class_counts: errorClassCounts,
    per_source: perSource,
  };
}
|
|
420
|
+
|
|
421
|
+
/**
 * Capture a benchmark snapshot from the trace directory.
 *
 * Loads the traces selected by the CLI filters, runs the baseline report,
 * scenario metrics/thresholds and release-SLO evaluation over them, and
 * bundles the results with the raw signals and capture metadata.
 *
 * @param {object} args - Parsed CLI options.
 * @returns {object} Snapshot object ready to be serialized as JSON.
 */
function buildSnapshot(args) {
  const traceDir = resolveTraceDir(args);
  const filters = resolveTraceFilters(args);
  const records = loadTraces(traceDir, filters);

  const baseline = buildReport(records, traceDir, args.days);
  const scenarioMetrics = evaluateScenarioMetrics(records, {
    recoveryWindowMs: args.recoveryWindowMs,
  });
  const scenarioFailures = evaluateScenarioThresholds(scenarioMetrics);
  const releaseSlo = evaluateReleaseSlo(baseline);
  const raw = extractRawSignals(records);

  // Open-ended bounds (e.g. Infinity) are stored as null.
  const toIsoOrNull = (ms) => (Number.isFinite(ms) ? new Date(ms).toISOString() : null);
  // Sets are stored as plain arrays; an absent filter is stored as null.
  const toListOrNull = (set) => (set ? [...set] : null);

  return {
    schema_version: SNAPSHOT_SCHEMA_VERSION,
    run_id: args.runId,
    label: normalizeLabel(args.label),
    captured_at: new Date().toISOString(),
    window_days: args.days,
    window: {
      since: toIsoOrNull(filters.sinceMs),
      until: toIsoOrNull(filters.untilMs),
    },
    filters: {
      source: toListOrNull(filters.includeSources),
      exclude_source: toListOrNull(filters.excludeSources),
      chat_id: toListOrNull(filters.includeChats),
    },
    trace_dir: traceDir,
    git: currentGitInfo(),
    baseline,
    scenarios: {
      ...scenarioMetrics,
      failures: scenarioFailures,
    },
    release_slo: releaseSlo,
    raw,
  };
}
|
|
457
|
+
|
|
458
|
+
/**
 * Load a run's `manifest.json`, or build a fresh manifest when it is unusable.
 *
 * A fresh manifest is used when the file is missing, unreadable, not valid
 * JSON, or valid JSON that is not an object (e.g. `null`, an array or a
 * string). The returned manifest always has an array `snapshots` field.
 *
 * @param {string} runDir - Run directory; its base name is the default run id.
 * @returns {object} The manifest.
 */
function loadManifest(runDir) {
  const manifestPath = path.join(runDir, 'manifest.json');
  const fallback = {
    schema_version: SNAPSHOT_SCHEMA_VERSION,
    run_id: path.basename(runDir),
    created_at: new Date().toISOString(),
    snapshots: [],
  };
  const loaded = readJson(manifestPath, fallback);
  // A file holding `null`, `[]` or a primitive parses fine but is not a manifest.
  const isManifestObject = loaded !== null && typeof loaded === 'object' && !Array.isArray(loaded);
  const manifest = isManifestObject ? loaded : fallback;
  if (!Array.isArray(manifest.snapshots)) manifest.snapshots = [];
  return manifest;
}
|
|
470
|
+
|
|
471
|
+
/**
 * Persist a run manifest as `<runDir>/manifest.json`.
 *
 * @param {string} runDir - Run directory.
 * @param {object} manifest - Manifest to write.
 */
function saveManifest(runDir, manifest) {
  const manifestPath = path.join(runDir, 'manifest.json');
  writeJson(manifestPath, manifest);
}
|
|
474
|
+
|
|
475
|
+
/**
 * Record a snapshot in the manifest (mutates `manifest.snapshots`).
 *
 * An existing entry with the same label is replaced; otherwise the entry is
 * appended. The list is then re-sorted by `captured_at`.
 *
 * @param {{snapshots: object[]}} manifest - Manifest to update in place.
 * @param {string} snapshotFile - Snapshot path; only its base name is stored.
 * @param {object} snapshot - Snapshot providing `label`, `captured_at` and
 *   optionally `baseline.records_total` (defaults to 0).
 */
function addSnapshotToManifest(manifest, snapshotFile, snapshot) {
  const entry = {
    label: snapshot.label,
    file: path.basename(snapshotFile),
    captured_at: snapshot.captured_at,
    records_total: snapshot?.baseline?.records_total ?? 0,
  };

  const { snapshots } = manifest;
  const position = snapshots.findIndex(({ label }) => label === entry.label);
  if (position === -1) {
    snapshots.push(entry);
  } else {
    snapshots[position] = entry;
  }

  const byCaptureTime = (left, right) => (
    String(left.captured_at).localeCompare(String(right.captured_at))
  );
  snapshots.sort(byCaptureTime);
}
|
|
490
|
+
|
|
491
|
+
/**
 * Find the snapshot file for a label inside a run directory.
 *
 * @param {string} runDir - Run directory.
 * @param {*} label - Snapshot label; it is normalized before lookup.
 * @returns {string|null} `<runDir>/snapshots/<label>.json` when that file
 *   exists, otherwise `null` (also for a label that normalizes to empty).
 */
function resolveSnapshotPathFromLabel(runDir, label) {
  const clean = normalizeLabel(label);
  if (!clean) {
    return null;
  }
  const candidate = path.join(runDir, 'snapshots', `${clean}.json`);
  return fs.existsSync(candidate) ? candidate : null;
}
|
|
498
|
+
|
|
499
|
+
/**
 * Resolve a snapshot reference (file path, file name, or label) to a path.
 *
 * Lookup order:
 * 1. an existing path ending in ".json" (returned as an absolute path);
 * 2. an existing entry named `value` under `<runDir>/snapshots`;
 * 3. a label lookup via `resolveSnapshotPathFromLabel`.
 *
 * @param {string} runDir - Run directory.
 * @param {*} value - Snapshot reference from the command line.
 * @returns {string|null} The resolved path, or `null` when nothing matches.
 */
function resolveSnapshotPath(runDir, value) {
  const reference = String(value || '').trim();
  if (reference === '') {
    return null;
  }

  const looksLikeJsonFile = reference.endsWith('.json');
  if (looksLikeJsonFile && fs.existsSync(reference)) {
    return path.resolve(reference);
  }

  const insideRun = path.join(runDir, 'snapshots', reference);
  if (fs.existsSync(insideRun)) {
    return insideRun;
  }

  return resolveSnapshotPathFromLabel(runDir, reference);
}
|
|
510
|
+
|
|
511
|
+
/**
 * Approximate the error function erf(x) with a five-term polynomial in
 * t = 1 / (1 + p|x|), evaluated by Horner's rule. The coefficients match
 * the Abramowitz & Stegun 7.1.26 approximation. The result is odd in x.
 *
 * @param {number} x - Input value.
 * @returns {number} Approximate erf(x).
 */
function erf(x) {
  const magnitude = Math.abs(x);
  const t = 1 / (1 + 0.3275911 * magnitude);

  // Lower-order coefficients a4..a1, folded onto the leading coefficient a5.
  const lowerCoefficients = [-1.453152027, 1.421413741, -0.284496736, 0.254829592];
  const polynomial = lowerCoefficients.reduce((acc, coefficient) => acc * t + coefficient, 1.061405429);

  const y = 1 - (polynomial * t * Math.exp(-magnitude * magnitude));
  return x < 0 ? -1 * y : 1 * y;
}
|
|
523
|
+
|
|
524
|
+
/**
 * Standard normal cumulative distribution function, computed from `erf`.
 *
 * @param {number} x - z-score.
 * @returns {number} 0.5 * (1 + erf(x / sqrt(2))).
 */
function normalCdf(x) {
  const scaled = x / Math.SQRT2;
  return 0.5 * (1 + erf(scaled));
}
|
|
527
|
+
|
|
528
|
+
/**
 * Compare two proportions (successes out of trials) before vs. after.
 *
 * The p-value is a two-sided z-test using the pooled standard error. The
 * 95% interval on the delta uses the unpooled standard error. Both variances
 * are floored at 1e-12 to avoid dividing by zero.
 *
 * @param {{n1: *, s1: *, n2: *, s2: *}} params - Trials (`n`) and successes
 *   (`s`) for the before (1) and after (2) groups. Invalid or negative counts
 *   are treated as 0, and successes are capped at trials.
 * @returns {{before: ?number, after: ?number, delta: ?number, p_value: ?number,
 *   ci95: Array, significant: boolean}} `delta`, `p_value` and `ci95` are
 *   null when either group has no trials.
 */
function twoProportionStats(params) {
  const toCount = (raw) => Math.max(0, Number(raw) || 0);
  const n1 = toCount(params.n1);
  const n2 = toCount(params.n2);
  const s1 = Math.min(n1, toCount(params.s1));
  const s2 = Math.min(n2, toCount(params.s2));

  if (n1 === 0 || n2 === 0) {
    return {
      before: n1 > 0 ? s1 / n1 : null,
      after: n2 > 0 ? s2 / n2 : null,
      delta: null,
      p_value: null,
      ci95: [null, null],
      significant: false,
    };
  }

  const p1 = s1 / n1;
  const p2 = s2 / n2;
  const delta = p2 - p1;

  // Two-sided z-test with the pooled proportion.
  const pooled = (s1 + s2) / (n1 + n2);
  const pooledVariance = pooled * (1 - pooled) * (1 / n1 + 1 / n2);
  const sePooled = Math.sqrt(Math.max(1e-12, pooledVariance));
  const z = delta / sePooled;
  const twoSided = 2 * (1 - normalCdf(Math.abs(z)));
  const pValue = Math.max(0, Math.min(1, twoSided));

  // Interval on the difference with the unpooled standard error.
  const unpooledVariance = (p1 * (1 - p1)) / n1 + (p2 * (1 - p2)) / n2;
  const se = Math.sqrt(Math.max(1e-12, unpooledVariance));

  return {
    before: p1,
    after: p2,
    delta,
    p_value: pValue,
    ci95: [delta - 1.96 * se, delta + 1.96 * se],
    significant: pValue < 0.05,
  };
}
|
|
567
|
+
|
|
568
|
+
/**
 * Create a deterministic pseudo-random generator (mulberry32) from a seed.
 *
 * The internal state is kept as an unsigned 32-bit integer. Letting it grow
 * as an unbounded double would pass 2^53 after a few million draws and start
 * losing low bits. Because the mixing steps only read the state modulo 2^32,
 * wrapping it does not change the produced sequence.
 *
 * @param {number} seed - Seed; coerced to an unsigned 32-bit integer.
 * @returns {function(): number} Generator returning values in [0, 1).
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return () => {
    state = (state + 0x6D2B79F5) >>> 0;
    let r = state;
    r = Math.imul(r ^ (r >>> 15), r | 1);
    r ^= r + Math.imul(r ^ (r >>> 7), r | 61);
    return ((r ^ (r >>> 14)) >>> 0) / 4294967296;
  };
}
|
|
578
|
+
|
|
579
|
+
/**
 * Draw a bootstrap resample: `values.length` items picked uniformly with
 * replacement, calling `rng` once per item in order.
 *
 * @param {Array} values - Source items.
 * @param {function(): number} rng - Generator returning values in [0, 1).
 * @returns {Array} New array of the same length as `values`.
 */
function sampleWithReplacement(values, rng) {
  const size = values.length;
  return Array.from({ length: size }, () => values[Math.floor(rng() * size)]);
}
|
|
587
|
+
|
|
588
|
+
/**
 * Bootstrap the change in a statistic between two samples.
 *
 * Non-finite values are dropped first. With fewer than 20 values on either
 * side no inference is attempted; only the point statistics are reported.
 * Otherwise both samples are resampled with replacement `iterations` times
 * (at least 200) using a generator seeded from the sample sizes and
 * iteration count, so equal inputs give equal results.
 *
 * @param {{before: *, after: *, iterations: ?number, statFn: function(number[]): number}} params
 * @returns {{before: ?number, after: ?number, delta: ?number, p_value: ?number,
 *   ci95: Array, significant: boolean}} `significant` is true when the 95%
 *   percentile interval of the resampled deltas excludes zero.
 */
function bootstrapDelta(params) {
  const keepFinite = (list) => (Array.isArray(list) ? list.filter(Number.isFinite) : []);
  const before = keepFinite(params.before);
  const after = keepFinite(params.after);
  const iterations = Math.max(200, Math.floor(params.iterations || DEFAULT_BOOTSTRAP_ITERATIONS));
  const { statFn } = params;

  const tooSmall = before.length < 20 || after.length < 20;
  if (tooSmall) {
    return {
      before: before.length > 0 ? statFn(before) : null,
      after: after.length > 0 ? statFn(after) : null,
      delta: null,
      p_value: null,
      ci95: [null, null],
      significant: false,
    };
  }

  const beforeStat = statFn(before);
  const afterStat = statFn(after);

  const seed = (before.length * 2654435761 + after.length * 1013904223 + iterations) >>> 0;
  const rng = mulberry32(seed);

  const deltas = [];
  for (let round = 0; round < iterations; round += 1) {
    const resampledBefore = sampleWithReplacement(before, rng);
    const resampledAfter = sampleWithReplacement(after, rng);
    deltas.push(statFn(resampledAfter) - statFn(resampledBefore));
  }
  deltas.sort((left, right) => left - right);

  const ciLow = percentile(deltas, 2.5);
  const ciHigh = percentile(deltas, 97.5);
  const observedDelta = afterStat - beforeStat;

  // Count resampled deltas at or beyond zero on the opposite side of the observed change.
  const contradictsObserved = observedDelta >= 0 ? (v) => v <= 0 : (v) => v >= 0;
  const oppositeSignCount = deltas.filter(contradictsObserved).length;
  // Two-sided p-value, floored at 1/iterations so it is never reported as 0.
  const pValue = Math.max(1 / iterations, Math.min(1, (2 * oppositeSignCount) / iterations));

  const intervalExcludesZero = ciLow !== null && ciHigh !== null && (ciLow > 0 || ciHigh < 0);
  return {
    before: beforeStat,
    after: afterStat,
    delta: observedDelta,
    p_value: pValue,
    ci95: [ciLow, ciHigh],
    significant: intervalExcludesZero,
  };
}
|
|
633
|
+
|
|
634
|
+
/**
 * Annotate a statistical result with improvement/regression verdicts.
 *
 * With direction "up" a positive delta is an improvement; with any other
 * direction a negative delta is. A non-finite delta is neither.
 *
 * @param {string} name - Metric name.
 * @param {object} [stats] - Result carrying `delta` and `significant`; its
 *   fields are copied into the output after `name` and `direction`.
 * @param {string} direction - Desired direction of change ("up" or other).
 * @returns {object} `stats` plus `name`, `direction`, `improved`,
 *   `regressed`, `significant_improvement` and `significant_regression`.
 */
function summarizeDirectionalResult(name, stats, direction) {
  const delta = stats?.delta;
  let improved = false;
  let regressed = false;
  if (Number.isFinite(delta)) {
    const higherIsBetter = direction === 'up';
    improved = higherIsBetter ? delta > 0 : delta < 0;
    regressed = higherIsBetter ? delta < 0 : delta > 0;
  }
  const significant = Boolean(stats?.significant);

  return {
    name,
    direction,
    ...stats,
    improved,
    regressed,
    significant_improvement: improved && significant,
    significant_regression: regressed && significant,
  };
}
|
|
653
|
+
|
|
654
|
+
/**
 * Compute each source's share of a snapshot's records.
 *
 * @param {object} [snapshot] - Snapshot whose `baseline.records_by_source`
 *   is a list of `{source, records}` rows.
 * @returns {Object<string, number>} Normalized source name mapped to
 *   `records / total`, for sources with a positive count. Empty when the
 *   list is missing or the total is not a positive finite number.
 */
function getSourceMix(snapshot) {
  const bySource = snapshot?.baseline?.records_by_source;
  const rows = Array.isArray(bySource) ? bySource : [];

  let total = 0;
  for (const item of rows) {
    total += Number(item.records || 0);
  }
  if (!Number.isFinite(total) || total <= 0) {
    return {};
  }

  const mix = {};
  for (const row of rows) {
    const source = normalizeSource(row.source);
    const records = Number(row.records || 0);
    if (records <= 0) continue;
    mix[source] = records / total;
  }
  return mix;
}
|
|
669
|
+
|
|
670
|
+
/**
 * Weighted mean of per-source values.
 *
 * Only sources that have both a finite value in `parts` and a finite,
 * positive weight in `weights` contribute; the result is normalized by the
 * weight that was actually applied, not by the sum of all weights.
 *
 * @param {Object<string, number>|null|undefined} parts - Value per source.
 * @param {Object<string, number>|null|undefined} weights - Weight per source.
 *   A missing `weights` object is treated like an empty one.
 * @returns {number|null} The weighted mean, or null when nothing contributed.
 */
function weightedAverage(parts, weights) {
  let weightedSum = 0;
  let appliedWeight = 0;
  for (const [source, value] of Object.entries(parts || {})) {
    if (!Number.isFinite(value)) continue;
    // Optional chaining mirrors the `parts || {}` guard for a null `weights`.
    const weight = Number(weights?.[source] || 0);
    if (!Number.isFinite(weight) || weight <= 0) continue;
    weightedSum += value * weight;
    appliedWeight += weight;
  }
  return appliedWeight > 0 ? weightedSum / appliedWeight : null;
}
|
|
683
|
+
|
|
684
|
+
/**
 * Produce a list of `count` finite numbers drawn from `values`.
 *
 * Non-finite entries are dropped first. When enough values remain they are
 * thinned with `evenlySample`; otherwise the list is repeated cyclically
 * until `count` entries have been produced.
 *
 * @param {Array|*} values - Candidate values; a non-array yields an empty list.
 * @param {number} count - Number of entries wanted.
 * @returns {number[]} The resampled list, empty when `count <= 0` or no
 *   finite values exist.
 */
function resampleToCount(values, count) {
  const finite = Array.isArray(values)
    ? values.filter((entry) => Number.isFinite(entry))
    : [];
  if (count <= 0 || finite.length === 0) return [];
  if (finite.length >= count) return evenlySample(finite, count);

  // Too few values: cycle through them until the quota is met.
  const padded = [];
  let cursor = 0;
  while (cursor < count) {
    padded.push(finite[cursor % finite.length]);
    cursor += 1;
  }
  return padded;
}
|
|
696
|
+
|
|
697
|
+
/**
 * Estimate a latency percentile under a given source mix.
 *
 * For every source with a finite, positive weight, the source's
 * `raw.per_source[source].latencies_ms_sample` is resampled to roughly
 * `weight * 4000` entries (at least one); the resampled lists are pooled and
 * the percentile is taken over the pool.
 *
 * @param {object|null|undefined} snapshot - Snapshot holding `raw.per_source`.
 * @param {Object<string, number>|null|undefined} mix - Weight per source.
 * @param {number} p - Percentile forwarded to `percentile`.
 * @returns {number|null} The pooled percentile, or null when no source
 *   contributed any samples.
 */
function weightedLatencyPercentile(snapshot, mix, p) {
  const POOL_TARGET = 4000;
  const samplesBySource = snapshot?.raw?.per_source || {};
  const pooled = [];
  let weightSeen = 0;

  for (const [source, rawWeight] of Object.entries(mix || {})) {
    const weight = Number(rawWeight || 0);
    const usableWeight = Number.isFinite(weight) && weight > 0;
    if (!usableWeight) continue;
    weightSeen += weight;

    const latencies = samplesBySource[source]?.latencies_ms_sample || [];
    if (!Array.isArray(latencies) || latencies.length === 0) continue;

    const quota = Math.max(1, Math.round(weight * POOL_TARGET));
    for (const sample of resampleToCount(latencies, quota)) {
      pooled.push(sample);
    }
  }

  if (pooled.length === 0 || weightSeen <= 0) return null;
  return percentile(pooled, p);
}
|
|
715
|
+
|
|
716
|
+
/**
 * Re-weight a snapshot's per-source core metrics by a source mix.
 *
 * Per-source rates and token figures come from
 * `snapshot.baseline.records_by_source`; `error_rate` is derived as
 * `1 - success_rate`. Each metric is then combined with `weightedAverage`,
 * and the latency percentiles come from `weightedLatencyPercentile`.
 *
 * Fields are coerced with `Number(...)`, so a `null` field becomes 0 while a
 * missing (`undefined`) field becomes NaN and is left out of the average.
 *
 * @param {object|null|undefined} snapshot - Snapshot to summarize.
 * @param {Object<string, number>} mix - Weight per normalized source name.
 * @returns {object} One value (number or null) per core metric.
 */
function weightedCoreValues(snapshot, mix) {
  const bySource = snapshot?.baseline?.records_by_source;
  const rows = Array.isArray(bySource) ? bySource : [];
  const finiteOrNaN = (value) => (Number.isFinite(value) ? value : NaN);

  const averagedMetrics = [
    'success_rate',
    'error_rate',
    'empty_success_rate',
    'tool_success_rate',
    'prompt_tokens_per_success',
    'completion_tokens_per_success',
    'total_tokens_per_success',
  ];
  const parts = {};
  for (const metricName of averagedMetrics) {
    parts[metricName] = {};
  }

  for (const row of rows) {
    const source = normalizeSource(row.source);
    const successRate = Number(row.success_rate);
    const tokenUsage = row?.token_usage && typeof row.token_usage === 'object'
      ? row.token_usage
      : {};
    const perSource = {
      success_rate: finiteOrNaN(successRate),
      error_rate: finiteOrNaN(Number.isFinite(successRate) ? 1 - successRate : NaN),
      empty_success_rate: finiteOrNaN(Number(row.empty_success_rate)),
      tool_success_rate: finiteOrNaN(Number(row.tool_success_rate)),
      prompt_tokens_per_success: finiteOrNaN(Number(tokenUsage.prompt_per_success)),
      completion_tokens_per_success: finiteOrNaN(Number(tokenUsage.completion_per_success)),
      total_tokens_per_success: finiteOrNaN(Number(tokenUsage.total_per_success)),
    };
    // A later row for the same normalized source replaces the earlier one.
    for (const metricName of averagedMetrics) {
      parts[metricName][source] = perSource[metricName];
    }
  }

  return {
    success_rate: weightedAverage(parts.success_rate, mix),
    error_rate: weightedAverage(parts.error_rate, mix),
    empty_success_rate: weightedAverage(parts.empty_success_rate, mix),
    tool_success_rate: weightedAverage(parts.tool_success_rate, mix),
    latency_p50_ms: weightedLatencyPercentile(snapshot, mix, 50),
    latency_p95_ms: weightedLatencyPercentile(snapshot, mix, 95),
    latency_p99_ms: weightedLatencyPercentile(snapshot, mix, 99),
    prompt_tokens_per_success: weightedAverage(parts.prompt_tokens_per_success, mix),
    completion_tokens_per_success: weightedAverage(parts.completion_tokens_per_success, mix),
    total_tokens_per_success: weightedAverage(parts.total_tokens_per_success, mix)
  };
}
|
|
776
|
+
|
|
777
|
+
/**
 * Summarize a before/after pair for a production-weighted metric.
 *
 * No statistical test is performed here: `p_value` is null, `ci95` is
 * `[null, null]` and every significance flag is false.
 *
 * @param {string} name - Metric name.
 * @param {string} direction - `'up'` when larger is better; anything else
 *   means smaller is better.
 * @param {*} beforeValue - Baseline value; non-finite input becomes null.
 * @param {*} afterValue - Candidate value; non-finite input becomes null.
 * @returns {object} Summary in the same shape as the core comparison entries.
 */
function summarizeWeightedMetric(name, direction, beforeValue, afterValue) {
  const finiteOrNull = (value) => (Number.isFinite(value) ? Number(value) : null);
  const before = finiteOrNull(beforeValue);
  const after = finiteOrNull(afterValue);

  let delta = null;
  let improved = false;
  let regressed = false;
  if (before !== null && after !== null) {
    delta = after - before;
    const higherIsBetter = direction === 'up';
    improved = higherIsBetter ? delta > 0 : delta < 0;
    regressed = higherIsBetter ? delta < 0 : delta > 0;
  }

  return {
    name,
    direction,
    before,
    after,
    delta,
    p_value: null,
    ci95: [null, null],
    significant: false,
    improved,
    regressed,
    significant_improvement: false,
    significant_regression: false,
  };
}
|
|
798
|
+
|
|
799
|
+
/**
 * Build a stats object for a metric compared without any statistical test.
 *
 * @param {*} beforeValue - Baseline value; non-finite input becomes null.
 * @param {*} afterValue - Candidate value; non-finite input becomes null.
 * @returns {{before: (number|null), after: (number|null), delta: (number|null),
 *   p_value: null, ci95: Array<null>, significant: boolean}} `delta` is
 *   `after - before` when both are available, otherwise null; `significant`
 *   is always false.
 */
function deterministicDeltaStats(beforeValue, afterValue) {
  const finiteOrNull = (value) => (Number.isFinite(value) ? Number(value) : null);
  const before = finiteOrNull(beforeValue);
  const after = finiteOrNull(afterValue);
  const bothKnown = before !== null && after !== null;

  return {
    before,
    after,
    delta: bothKnown ? after - before : null,
    p_value: null,
    ci95: [null, null],
    significant: false
  };
}
|
|
812
|
+
|
|
813
|
+
/**
 * Derive tokens-per-successful-record figures from a snapshot baseline.
 *
 * Reads `baseline.token_usage.prompt_total`, `baseline.token_usage.completion_total`
 * and `baseline.records_success`. Results are rounded to two decimals.
 *
 * @param {object|null|undefined} snapshot - Snapshot to read.
 * @returns {{prompt_tokens_per_success: (number|null),
 *   completion_tokens_per_success: (number|null),
 *   total_tokens_per_success: (number|null)}} All three are null when there
 *   are no successful records; individual entries are null when their input
 *   is not a finite number.
 */
function extractTokensPerSuccess(snapshot) {
  const baseline = snapshot?.baseline;
  const promptTotal = Number(baseline?.token_usage?.prompt_total || 0);
  const completionTotal = Number(baseline?.token_usage?.completion_total || 0);
  const successes = Number(baseline?.records_success || 0);

  const hasSuccesses = Number.isFinite(successes) && successes > 0;
  if (!hasSuccesses) {
    return {
      prompt_tokens_per_success: null,
      completion_tokens_per_success: null,
      total_tokens_per_success: null,
    };
  }

  const perSuccess = (total) => (Number.isFinite(total) ? total / successes : null);
  const roundedOrNull = (value) => (Number.isFinite(value) ? Number(value.toFixed(2)) : null);

  const promptRate = perSuccess(promptTotal);
  const completionRate = perSuccess(completionTotal);
  // The total is only meaningful when both components are available.
  let totalRate = null;
  if (Number.isFinite(promptRate) && Number.isFinite(completionRate)) {
    totalRate = promptRate + completionRate;
  }

  return {
    prompt_tokens_per_success: roundedOrNull(promptRate),
    completion_tokens_per_success: roundedOrNull(completionRate),
    total_tokens_per_success: roundedOrNull(totalRate),
  };
}
|
|
835
|
+
|
|
836
|
+
/**
 * Compare two snapshots metric by metric.
 *
 * Rate metrics go through `twoProportionStats`, latency percentiles through
 * `bootstrapDelta`, and tokens-per-success through `deterministicDeltaStats`
 * (no significance test). Four scenario pass rates are compared as
 * proportions. A second, production-weighted view re-weights both snapshots
 * by the source mix of the *before* snapshot.
 *
 * @param {object} beforeSnapshot - Baseline snapshot.
 * @param {object} afterSnapshot - Candidate snapshot.
 * @param {{bootstrapIterations?: number}} [options] - `bootstrapIterations`
 *   is floored and raised to at least 200; a falsy value falls back to
 *   `DEFAULT_BOOTSTRAP_ITERATIONS`.
 * @returns {object} Labels, both snapshots, per-metric comparisons and a
 *   summary whose `passed` is true when no metric regressed significantly.
 */
export function evaluateSnapshotComparison(beforeSnapshot, afterSnapshot, options = {}) {
  const bootstrapIterations = Math.max(
    200,
    Math.floor(options.bootstrapIterations || DEFAULT_BOOTSTRAP_ITERATIONS)
  );
  const toCount = (value) => Number(value || 0);
  const baselineBefore = beforeSnapshot?.baseline;
  const baselineAfter = afterSnapshot?.baseline;

  // --- Local comparison helpers -------------------------------------------
  const compareProportion = (name, direction, s1, n1, s2, n2) =>
    summarizeDirectionalResult(name, twoProportionStats({ s1, n1, s2, n2 }), direction);

  const compareLatency = (name, p) =>
    summarizeDirectionalResult(
      name,
      bootstrapDelta({
        before: beforeSnapshot?.raw?.latencies_ms_sample || [],
        after: afterSnapshot?.raw?.latencies_ms_sample || [],
        iterations: bootstrapIterations,
        statFn: (values) => percentile(values, p),
      }),
      'down'
    );

  // --- Raw counts -----------------------------------------------------------
  const totalBefore = toCount(baselineBefore?.records_total);
  const totalAfter = toCount(baselineAfter?.records_total);
  const errorsBefore = toCount(baselineBefore?.records_error);
  const errorsAfter = toCount(baselineAfter?.records_error);
  const successBefore = toCount(baselineBefore?.records_success);
  const successAfter = toCount(baselineAfter?.records_success);
  const emptyBefore = toCount(baselineBefore?.empty_success_responses);
  const emptyAfter = toCount(baselineAfter?.empty_success_responses);

  const toolTotalBefore = toCount(baselineBefore?.tool_calls?.total);
  const toolFailedBefore = toCount(baselineBefore?.tool_calls?.failed);
  const toolTotalAfter = toCount(baselineAfter?.tool_calls?.total);
  const toolFailedAfter = toCount(baselineAfter?.tool_calls?.failed);
  // Clamp so an inconsistent failed > total never yields a negative count.
  const toolOkBefore = Math.max(0, toolTotalBefore - toolFailedBefore);
  const toolOkAfter = Math.max(0, toolTotalAfter - toolFailedAfter);

  // --- Core metrics ---------------------------------------------------------
  const successRate = compareProportion(
    'success_rate', 'up', successBefore, totalBefore, successAfter, totalAfter
  );
  const errorRate = compareProportion(
    'error_rate', 'down', errorsBefore, totalBefore, errorsAfter, totalAfter
  );
  // Empty responses are measured against successful records, not all records.
  const emptySuccessRate = compareProportion(
    'empty_success_rate', 'down', emptyBefore, successBefore, emptyAfter, successAfter
  );
  const toolSuccessRate = compareProportion(
    'tool_success_rate', 'up', toolOkBefore, toolTotalBefore, toolOkAfter, toolTotalAfter
  );

  const latencyP50 = compareLatency('latency_p50_ms', 50);
  const latencyP95 = compareLatency('latency_p95_ms', 95);
  const latencyP99 = compareLatency('latency_p99_ms', 99);

  const tokensBefore = extractTokensPerSuccess(beforeSnapshot);
  const tokensAfter = extractTokensPerSuccess(afterSnapshot);
  const compareTokens = (name) =>
    summarizeDirectionalResult(
      name,
      deterministicDeltaStats(tokensBefore[name], tokensAfter[name]),
      'down'
    );
  const promptTokensPerSuccess = compareTokens('prompt_tokens_per_success');
  const completionTokensPerSuccess = compareTokens('completion_tokens_per_success');
  const totalTokensPerSuccess = compareTokens('total_tokens_per_success');

  // --- Scenario pass rates --------------------------------------------------
  const scenarioKeys = ['memory_carryover', 'tool_heavy', 'transient_recovery', 'context_recovery'];
  const scenarioResults = scenarioKeys.map((key) => {
    const scenarioBefore = beforeSnapshot?.scenarios?.scenarios?.[key];
    const scenarioAfter = afterSnapshot?.scenarios?.scenarios?.[key];
    const candidatesBefore = toCount(scenarioBefore?.candidates);
    const candidatesAfter = toCount(scenarioAfter?.candidates);
    const passRate = compareProportion(
      `scenario_${key}_pass_rate`,
      'up',
      toCount(scenarioBefore?.passed),
      candidatesBefore,
      toCount(scenarioAfter?.passed),
      candidatesAfter
    );
    return {
      ...passRate,
      before_candidates: candidatesBefore,
      after_candidates: candidatesAfter,
    };
  });

  const core = [
    successRate,
    errorRate,
    emptySuccessRate,
    toolSuccessRate,
    latencyP50,
    latencyP95,
    latencyP99,
    promptTokensPerSuccess,
    completionTokensPerSuccess,
    totalTokensPerSuccess
  ];
  const all = core.concat(scenarioResults);
  const namesWhere = (items, predicate) => items.filter(predicate).map((item) => item.name);
  const significantImprovements = namesWhere(all, (item) => item.significant_improvement);
  const significantRegressions = namesWhere(all, (item) => item.significant_regression);

  // --- Production-weighted view --------------------------------------------
  // Both snapshots are weighted by the BEFORE snapshot's source mix.
  const productionSourceMix = getSourceMix(beforeSnapshot);
  const weightedBefore = weightedCoreValues(beforeSnapshot, productionSourceMix);
  const weightedAfter = weightedCoreValues(afterSnapshot, productionSourceMix);
  const weightedDirections = [
    ['success_rate', 'up'],
    ['error_rate', 'down'],
    ['empty_success_rate', 'down'],
    ['tool_success_rate', 'up'],
    ['latency_p50_ms', 'down'],
    ['latency_p95_ms', 'down'],
    ['latency_p99_ms', 'down'],
    ['prompt_tokens_per_success', 'down'],
    ['completion_tokens_per_success', 'down'],
    ['total_tokens_per_success', 'down'],
  ];
  const weightedCore = {};
  for (const [metricName, direction] of weightedDirections) {
    weightedCore[metricName] = summarizeWeightedMetric(
      metricName,
      direction,
      weightedBefore[metricName],
      weightedAfter[metricName]
    );
  }
  const weightedItems = Object.values(weightedCore);
  const weightedSummary = {
    improvements: namesWhere(weightedItems, (item) => item.improved),
    regressions: namesWhere(weightedItems, (item) => item.regressed),
  };

  return {
    before_label: beforeSnapshot?.label || 'before',
    after_label: afterSnapshot?.label || 'after',
    before_snapshot: beforeSnapshot,
    after_snapshot: afterSnapshot,
    bootstrap_iterations: bootstrapIterations,
    comparisons: {
      core: {
        success_rate: successRate,
        error_rate: errorRate,
        empty_success_rate: emptySuccessRate,
        tool_success_rate: toolSuccessRate,
        latency_p50_ms: latencyP50,
        latency_p95_ms: latencyP95,
        latency_p99_ms: latencyP99,
        prompt_tokens_per_success: promptTokensPerSuccess,
        completion_tokens_per_success: completionTokensPerSuccess,
        total_tokens_per_success: totalTokensPerSuccess,
      },
      scenarios: scenarioResults,
      production_weighted: {
        source_mix: productionSourceMix,
        core: weightedCore,
        summary: weightedSummary
      }
    },
    summary: {
      significant_improvements: significantImprovements,
      significant_regressions: significantRegressions,
      improved_without_significance: namesWhere(all, (item) => item.improved && !item.significant),
      regressed_without_significance: namesWhere(all, (item) => item.regressed && !item.significant),
      passed: significantRegressions.length === 0,
    },
  };
}
|
|
1018
|
+
|
|
1019
|
+
/**
 * Select the comparison entry to gate on for a metric.
 *
 * The production-weighted entry is preferred; the plain core entry is the
 * fallback. An entry is usable only when both `before` and `after` are
 * finite numbers.
 *
 * @param {object|null|undefined} comparison - Comparison result to look in.
 * @param {string} metricName - Metric key, e.g. `'success_rate'`.
 * @returns {object|null} A copy of the chosen entry with `source` set to
 *   `'production_weighted'` or `'core'`, or null when neither is usable.
 */
function pickWeightedOrCoreMetric(comparison, metricName) {
  const groups = comparison?.comparisons;
  const candidates = [
    ['production_weighted', groups?.production_weighted?.core?.[metricName]],
    ['core', groups?.core?.[metricName]],
  ];

  for (const [source, entry] of candidates) {
    const usable = Boolean(entry)
      && Number.isFinite(entry.before)
      && Number.isFinite(entry.after);
    if (usable) {
      return { ...entry, source };
    }
  }
  return null;
}
|
|
1030
|
+
|
|
1031
|
+
/**
 * Ratio `after / before` that never divides by zero.
 *
 * @param {*} before - Denominator.
 * @param {*} after - Numerator.
 * @returns {number|null} The ratio, or null when either input is not a
 *   finite number or `before` is 0.
 */
function safeRatio(before, after) {
  const computable = Number.isFinite(before) && Number.isFinite(after) && before !== 0;
  return computable ? after / before : null;
}
|
|
1035
|
+
|
|
1036
|
+
/**
 * Decide whether a candidate is at least as good as its baseline.
 *
 * Checks run in four groups, in this order:
 *   1. reliability rates, which must not move in the wrong direction;
 *   2. the memory-carryover and tool-heavy scenario pass rates, which must
 *      not drop;
 *   3. p95 / p99 latency, which may grow by at most `latencyTolerance`;
 *   4. tokens per success, which may grow by at most `tokenTolerance`.
 * A metric without finite before/after values is recorded as skipped and
 * does not fail the gate. Comparisons allow a 1e-9 slack.
 *
 * @param {object} comparison - Result of `evaluateSnapshotComparison`.
 * @param {{latencyTolerance?: number, tokenTolerance?: number}} [options] -
 *   Relative tolerances; each defaults to 0.05 when not a finite number.
 * @returns {object} Labels, the tolerances used, every check, the failure
 *   messages, and `passed` (true when there are no failures).
 */
export function evaluateSuperiorityGate(comparison, options = {}) {
  const toleranceOrDefault = (value) => (Number.isFinite(value) ? Number(value) : 0.05);
  const latencyTolerance = toleranceOrDefault(options.latencyTolerance);
  const tokenTolerance = toleranceOrDefault(options.tokenTolerance);
  const epsilon = 1e-9;
  const checks = [];
  const failures = [];

  const recordSkipped = (metric) => {
    checks.push({ metric, status: 'skipped', reason: 'insufficient_data' });
  };

  // 1. Reliability rates: no movement in the wrong direction.
  const reliabilityDirections = [
    ['success_rate', 'up'],
    ['error_rate', 'down'],
    ['empty_success_rate', 'down'],
    ['tool_success_rate', 'up']
  ];
  for (const [name, direction] of reliabilityDirections) {
    const metric = pickWeightedOrCoreMetric(comparison, name);
    if (!metric) {
      recordSkipped(name);
      continue;
    }
    const before = Number(metric.before);
    const after = Number(metric.after);
    let passed;
    if (direction === 'up') {
      passed = after + epsilon >= before;
    } else {
      passed = after <= before + epsilon;
    }
    checks.push({
      metric: name,
      source: metric.source,
      direction,
      before,
      after,
      delta: Number((after - before).toFixed(6)),
      passed
    });
    if (!passed) {
      failures.push(`${name} regressed (${before} -> ${after})`);
    }
  }

  // 2. Required scenario pass rates: must not drop.
  const scenarioList = Array.isArray(comparison?.comparisons?.scenarios)
    ? comparison.comparisons.scenarios
    : [];
  const requiredScenarioMetrics = [
    'scenario_memory_carryover_pass_rate',
    'scenario_tool_heavy_pass_rate'
  ];
  for (const metricName of requiredScenarioMetrics) {
    const metric = scenarioList.find((item) => item?.name === metricName);
    const usable = Boolean(metric)
      && Number.isFinite(metric.before)
      && Number.isFinite(metric.after);
    if (!usable) {
      recordSkipped(metricName);
      continue;
    }
    const before = Number(metric.before);
    const after = Number(metric.after);
    const passed = after + epsilon >= before;
    checks.push({
      metric: metricName,
      direction: 'up',
      before,
      after,
      delta: Number((after - before).toFixed(6)),
      before_candidates: metric.before_candidates,
      after_candidates: metric.after_candidates,
      passed
    });
    if (!passed) {
      failures.push(`${metricName} regressed (${before} -> ${after})`);
    }
  }

  // 3 and 4. "Lower is better" metrics allowed to grow by a relative tolerance.
  const checkWithinTolerance = (metricNames, tolerance) => {
    for (const metricName of metricNames) {
      const metric = pickWeightedOrCoreMetric(comparison, metricName);
      if (!metric) {
        recordSkipped(metricName);
        continue;
      }
      const before = Number(metric.before);
      const after = Number(metric.after);
      const ratio = safeRatio(before, after);
      const maxAllowed = before * (1 + tolerance);
      // Fall back to a strict no-growth check if the ceiling is not finite.
      const passed = Number.isFinite(maxAllowed)
        ? after <= maxAllowed + epsilon
        : after <= before + epsilon;
      checks.push({
        metric: metricName,
        source: metric.source,
        before,
        after,
        ratio,
        max_allowed: Number.isFinite(maxAllowed) ? Number(maxAllowed.toFixed(2)) : null,
        tolerance,
        passed
      });
      if (!passed) {
        failures.push(`${metricName} above tolerance (${before} -> ${after}, tolerance=${tolerance})`);
      }
    }
  };
  checkWithinTolerance(['latency_p95_ms', 'latency_p99_ms'], latencyTolerance);
  checkWithinTolerance(
    ['prompt_tokens_per_success', 'completion_tokens_per_success', 'total_tokens_per_success'],
    tokenTolerance
  );

  return {
    baseline_label: comparison?.before_label || 'baseline',
    candidate_label: comparison?.after_label || 'candidate',
    latency_tolerance: latencyTolerance,
    token_tolerance: tokenTolerance,
    checks,
    failures,
    passed: failures.length === 0
  };
}
|
|
1177
|
+
|
|
1178
|
+
/**
 * CLI command: build a snapshot, store it under the run directory and
 * register it in the run manifest.
 *
 * The snapshot is written to `<runDir>/snapshots/<label>.json`, and a short
 * JSON summary is printed to stdout.
 *
 * @param {object} args - Parsed CLI arguments; `label` and `runId` are read
 *   here and the whole object is forwarded to the helpers.
 * @returns {object} The summary that was printed.
 * @throws {Error} When the normalized label is empty.
 */
function captureSnapshotCommand(args) {
  const label = normalizeLabel(args.label);
  if (!label) {
    throw new Error('--label is required');
  }

  const runDir = resolveRunDir(args);
  const snapshotsDir = path.join(runDir, 'snapshots');
  ensureDir(snapshotsDir);

  const manifest = loadManifest(runDir);
  const snapshot = buildSnapshot({ ...args, label });
  const targetPath = path.join(snapshotsDir, `${label}.json`);
  writeJson(targetPath, snapshot);
  addSnapshotToManifest(manifest, targetPath, snapshot);
  saveManifest(runDir, manifest);

  const baseline = snapshot?.baseline;
  const summary = {
    run_id: args.runId,
    label,
    snapshot_path: targetPath,
    records_total: baseline?.records_total ?? 0,
    success_rate: baseline?.success_rate ?? null,
    p95_latency_ms: baseline?.latency_ms?.p95 ?? null,
    release_slo_passed: snapshot?.release_slo?.passed ?? null,
  };
  console.log(JSON.stringify(summary, null, 2));
  return summary;
}
|
|
1205
|
+
|
|
1206
|
+
/**
 * CLI command: initialize a run directory and capture its first snapshot.
 *
 * Ensures `<runDir>/snapshots` exists, stamps the manifest with the run id
 * (keeping an existing `created_at`), then captures a snapshot labelled
 * `overall_start`.
 *
 * @param {object} args - Parsed CLI arguments; `runId` is read here.
 * @returns {object} The summary returned by `captureSnapshotCommand`.
 */
function initRunCommand(args) {
  const runDir = resolveRunDir(args);
  ensureDir(path.join(runDir, 'snapshots'));

  const manifest = loadManifest(runDir);
  if (!manifest.created_at) {
    manifest.created_at = new Date().toISOString();
  }
  manifest.run_id = args.runId;
  saveManifest(runDir, manifest);

  const startArgs = { ...args, label: 'overall_start' };
  return captureSnapshotCommand(startArgs);
}
|
|
1215
|
+
|
|
1216
|
+
/**
 * Resolve and read the two snapshots of a comparison.
 *
 * @param {object} args - Parsed CLI arguments used to locate the run directory.
 * @param {string} beforeRef - Reference to the baseline snapshot.
 * @param {string} afterRef - Reference to the candidate snapshot.
 * @returns {{before: object, after: object}} The parsed snapshots.
 * @throws {Error} When a reference cannot be resolved or a file cannot be read.
 */
function loadComparisonSnapshots(args, beforeRef, afterRef) {
  const runDir = resolveRunDir(args);
  // Resolve both references before reporting on either one.
  const [baselinePath, candidatePath] = [beforeRef, afterRef].map(
    (ref) => resolveSnapshotPath(runDir, ref)
  );
  if (!baselinePath) {
    throw new Error(`Unable to resolve baseline snapshot: ${beforeRef}`);
  }
  if (!candidatePath) {
    throw new Error(`Unable to resolve candidate snapshot: ${afterRef}`);
  }

  const before = readJson(baselinePath);
  const after = readJson(candidatePath);
  const bothLoaded = Boolean(before) && Boolean(after);
  if (!bothLoaded) {
    throw new Error('Failed to read one or both snapshots');
  }
  return { before, after };
}
|
|
1229
|
+
|
|
1230
|
+
/**
 * CLI command: compare two snapshots and print the result as JSON.
 *
 * With `args.superiorityGate` set, the superiority gate is evaluated as well,
 * attached as `superiority_gate`, and decides the pass/fail outcome;
 * otherwise the comparison's own summary decides. With `args.enforce` set, a
 * failed outcome sets the process exit code to 1.
 *
 * @param {object} args - Parsed CLI arguments (`before`, `after`,
 *   `bootstrap`, `superiorityGate`, `latencyTolerance`, `tokenTolerance`,
 *   `enforce`).
 * @returns {object} The object that was printed.
 */
function compareSnapshotsCommand(args) {
  const snapshots = loadComparisonSnapshots(args, args.before, args.after);
  const comparison = evaluateSnapshotComparison(snapshots.before, snapshots.after, {
    bootstrapIterations: args.bootstrap,
  });

  let gate = null;
  if (args.superiorityGate) {
    const { latencyTolerance, tokenTolerance } = args;
    gate = evaluateSuperiorityGate(comparison, { latencyTolerance, tokenTolerance });
  }

  const output = gate ? { ...comparison, superiority_gate: gate } : comparison;
  console.log(JSON.stringify(output, null, 2));

  const passed = gate ? gate.passed : comparison.summary.passed;
  if (args.enforce && !passed) {
    process.exitCode = 1;
  }
  return output;
}
|
|
1251
|
+
|
|
1252
|
+
/**
 * CLI command: head-to-head comparison of a baseline and a candidate snapshot.
 *
 * The superiority gate is always evaluated here. The printed object carries
 * the comparison, the gate result and the two references. With `args.enforce`
 * set, a failed gate sets the process exit code to 1.
 *
 * @param {object} args - Parsed CLI arguments (`baseline`, `candidate`,
 *   `bootstrap`, `latencyTolerance`, `tokenTolerance`, `enforce`).
 * @returns {object} The object that was printed.
 * @throws {Error} When `baseline` or `candidate` is missing.
 */
function headToHeadCommand(args) {
  const { baseline, candidate } = args;
  if (!baseline || !candidate) {
    throw new Error('--baseline and --candidate are required for headtohead');
  }

  const snapshots = loadComparisonSnapshots(args, baseline, candidate);
  const comparison = evaluateSnapshotComparison(snapshots.before, snapshots.after, {
    bootstrapIterations: args.bootstrap,
  });
  const { latencyTolerance, tokenTolerance } = args;
  const gate = evaluateSuperiorityGate(comparison, { latencyTolerance, tokenTolerance });

  const output = {
    mode: 'head_to_head',
    baseline_ref: baseline,
    candidate_ref: candidate,
    ...comparison,
    superiority_gate: gate
  };
  console.log(JSON.stringify(output, null, 2));

  if (args.enforce && !gate.passed) {
    process.exitCode = 1;
  }
  return output;
}
|
|
1277
|
+
|
|
1278
|
+
/**
 * Build every comparison a run report needs from the manifest's snapshots.
 *
 * Snapshots are loaded from `<runDir>/snapshots/<entry.file>`; entries whose
 * file is missing or unreadable are ignored. Two kinds of comparison are
 * produced:
 *   - `overall`: `overall_start` against `overall_end`, or, when there is no
 *     `overall_end`, against the snapshot of the last manifest entry;
 *   - `tranche`: each `tranche<N>_before` against its `tranche<N>_after`,
 *     ordered by the numeric value of N.
 *
 * The digits of a tranche label are kept exactly as written, so zero-padded
 * labels such as `tranche01_before` pair with `tranche01_after` instead of
 * being looked up as `tranche1_*` and silently dropped.
 *
 * @param {{snapshots?: Array<{label: string, file: string}>}} manifest - Run manifest.
 * @param {string} runDir - Run directory that contains `snapshots/`.
 * @param {number} bootstrapIterations - Forwarded to `evaluateSnapshotComparison`.
 * @returns {Array<{kind: string, key: string, result: object}>} The comparisons.
 */
function buildRunComparisons(manifest, runDir, bootstrapIterations) {
  const manifestEntries = manifest.snapshots || [];
  const snapshotsByLabel = new Map();
  for (const entry of manifestEntries) {
    const file = entry?.file ? path.join(runDir, 'snapshots', entry.file) : null;
    if (!file || !fs.existsSync(file)) continue;
    const snapshot = readJson(file);
    if (!snapshot) continue;
    snapshotsByLabel.set(entry.label, snapshot);
  }

  const comparisons = [];

  const start = snapshotsByLabel.get('overall_start');
  if (start) {
    // Prefer an explicit end snapshot; otherwise use the most recent manifest entry.
    const latestEntry = manifestEntries[manifestEntries.length - 1];
    const terminal = snapshotsByLabel.get('overall_end')
      || (latestEntry ? snapshotsByLabel.get(latestEntry.label) : null);
    if (terminal) {
      comparisons.push({
        kind: 'overall',
        key: 'overall_start->terminal',
        result: evaluateSnapshotComparison(start, terminal, { bootstrapIterations }),
      });
    }
  }

  // Collect the raw digit strings so the labels can be rebuilt verbatim.
  const trancheTokens = [];
  for (const label of Array.from(snapshotsByLabel.keys()).sort()) {
    const match = /^tranche(\d+)_before$/.exec(label);
    if (match) trancheTokens.push(match[1]);
  }
  // Numeric order; the stable sort keeps label order for equal values ("01" vs "1").
  trancheTokens.sort((a, b) => Number(a) - Number(b));

  for (const token of trancheTokens) {
    const before = snapshotsByLabel.get(`tranche${token}_before`);
    const after = snapshotsByLabel.get(`tranche${token}_after`);
    if (!before || !after) continue;
    comparisons.push({
      kind: 'tranche',
      key: `tranche${token}`,
      result: evaluateSnapshotComparison(before, after, { bootstrapIterations }),
    });
  }

  return comparisons;
}
|
|
1332
|
+
|
|
1333
|
+
/**
 * Build the aggregate report for one run, write it to
 * `<runDir>/reports/report-<timestamp>.json`, and print a compact JSON
 * summary to stdout.
 *
 * The run passes when no comparison lists a significant regression. With
 * `args.enforce` set, a failing run sets `process.exitCode` to 1 (the process
 * is not terminated here).
 *
 * @param {object} args Parsed CLI arguments; reads `runId`, `bootstrap` and
 *   `enforce`, and is passed whole to `resolveRunDir`.
 * @returns {object} The full report object that was written to disk.
 */
function reportCommand(args) {
  const runDir = resolveRunDir(args);
  const manifest = loadManifest(runDir);
  const comparisons = buildRunComparisons(manifest, runDir, args.bootstrap);

  // Flatten one summary list across all comparisons, prefixing every metric
  // with the key of the comparison it came from.
  const tagMetrics = (field) => {
    const tagged = [];
    for (const item of comparisons) {
      const labelled = item.result.summary[field].map(metric => `${item.key}:${metric}`);
      for (const text of labelled) {
        tagged.push(text);
      }
    }
    return tagged;
  };

  const significantRegressions = tagMetrics('significant_regressions');
  const significantImprovements = tagMetrics('significant_improvements');
  const passed = significantRegressions.length === 0;

  const report = {
    schema_version: SNAPSHOT_SCHEMA_VERSION,
    run_id: args.runId,
    generated_at: new Date().toISOString(),
    snapshots: manifest.snapshots || [],
    comparisons,
    summary: {
      significant_improvements: significantImprovements,
      significant_regressions: significantRegressions,
      passed,
    },
  };

  // Persist the full report before printing the summary that points at it.
  const reportsDir = path.join(runDir, 'reports');
  ensureDir(reportsDir);
  const reportPath = path.join(reportsDir, `report-${timestampToken()}.json`);
  writeJson(reportPath, report);

  const output = {
    run_id: args.runId,
    report_path: reportPath,
    comparisons: comparisons.length,
    significant_improvements: significantImprovements.length,
    significant_regressions: significantRegressions.length,
    passed,
  };
  console.log(JSON.stringify(output, null, 2));

  if (args.enforce && !passed) {
    process.exitCode = 1;
  }
  return report;
}
|
|
1376
|
+
|
|
1377
|
+
/**
 * CLI entry point: parse `process.argv` and run the requested sub-command.
 *
 * Recognised commands are `init`, `capture`, `compare`, `headtohead` and
 * `report`; `help` and any other value print the usage text. An error thrown
 * by a command is logged to stderr with a `[benchmark-harness]` prefix and
 * sets `process.exitCode` to 1 instead of propagating.
 */
function main() {
  const args = parseArgs(process.argv.slice(2));

  // Handlers are wrapped in arrows so nothing runs until a command is chosen.
  // A Map is used so that only these exact command names can match.
  const handlers = new Map([
    ['init', () => initRunCommand(args)],
    ['capture', () => captureSnapshotCommand(args)],
    ['compare', () => {
      if (!args.before || !args.after) {
        throw new Error('--before and --after are required for compare');
      }
      compareSnapshotsCommand(args);
    }],
    ['headtohead', () => headToHeadCommand(args)],
    ['report', () => reportCommand(args)],
  ]);

  try {
    const handler = handlers.get(args.command);
    if (handler) {
      handler();
    } else {
      // Covers the explicit 'help' command as well as unknown commands.
      usage();
    }
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`[benchmark-harness] ${message}`);
    process.exitCode = 1;
  }
}
|
|
1410
|
+
|
|
1411
|
+
// Run the CLI only when this module is the process entry point, not when it
// is imported by another module.
//
// `import.meta.url` is a normalised file URL (percent-encoded, forward
// slashes). Comparing it with the raw string `file://` + argv[1] fails for
// script paths containing spaces, non-ASCII characters or Windows
// separators, and the CLI would then silently do nothing. Passing the
// candidate through the URL parser applies the same normalisation; any path
// that matched as a raw string still matches.
// NOTE(review): a script started through a symlink may still not match if
// argv[1] keeps the link path — confirm whether that invocation is needed.
{
  const entryPath = process.argv[1];
  let isEntryPoint = false;
  if (entryPath) {
    try {
      isEntryPoint = import.meta.url === new URL(`file://${entryPath}`).href;
    } catch {
      // An argv[1] that cannot be parsed as a URL cannot identify this module.
      isEntryPoint = false;
    }
  }
  if (isEntryPoint) {
    main();
  }
}
|