nodebench-mcp 2.54.0 → 2.56.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmarks/benchmarkRunner.js +27 -4
- package/dist/benchmarks/benchmarkRunner.js.map +1 -1
- package/dist/benchmarks/benchmarkTools.js +13 -0
- package/dist/benchmarks/benchmarkTools.js.map +1 -1
- package/dist/benchmarks/longitudinalHarness.d.ts +58 -3
- package/dist/benchmarks/longitudinalHarness.js +518 -20
- package/dist/benchmarks/longitudinalHarness.js.map +1 -1
- package/dist/benchmarks/longitudinalTypes.d.ts +16 -0
- package/dist/benchmarks/longitudinalTypes.js.map +1 -1
- package/dist/benchmarks/perturbations.d.ts +57 -0
- package/dist/benchmarks/perturbations.js +235 -0
- package/dist/benchmarks/perturbations.js.map +1 -0
- package/package.json +1 -1
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
#!/usr/bin/env npx tsx
|
|
2
|
+
// @ts-nocheck — standalone CLI script generated by external tooling; not part of the library build
|
|
2
3
|
/**
|
|
3
4
|
* longitudinalHarness.ts — Longitudinal dogfood benchmark harness for NodeBench MCP.
|
|
4
5
|
*
|
|
@@ -25,6 +26,90 @@ import { learningTools } from "../tools/learningTools.js";
|
|
|
25
26
|
import { flywheelTools } from "../tools/flywheelTools.js";
|
|
26
27
|
import { createMetaTools } from "../tools/metaTools.js";
|
|
27
28
|
import { createProgressiveDiscoveryTools } from "../tools/progressiveDiscoveryTools.js";
|
|
29
|
+
/**
 * Seeded PRNG for deterministic perturbation randomness.
 *
 * Linear congruential generator with a 31-bit state: the same seed always
 * produces the same sequence of values.
 *
 * @param {number} seed - Initial generator state.
 * @returns {() => number} Function returning the next value in the half-open
 *   range [0, 1).
 */
function seededRandom(seed) {
    // One more than the largest possible state, so the quotient never reaches 1.
    const RANGE = 0x80000000;
    let state = seed;
    return () => {
        // LCG step; the mask keeps the state within 0..0x7fffffff.
        state = (state * 1664525 + 1013904223) & 0x7fffffff;
        // Dividing by 0x7fffffff returned exactly 1 for the maximal state, which
        // lets Math.floor(rng() * length) produce an out-of-bounds index.
        return state / RANGE;
    };
}
|
|
37
|
+
/**
 * Catalogue of perturbations that can be applied to a simulated session.
 *
 * Every entry has a `type` tag, a human-readable `description`, a `severity`
 * label and an `apply(session)` function that returns the perturbed session
 * (some entries also write to the `causal_events` table as a side effect).
 */
const PERTURBATIONS = [
    {
        type: "thread_reset",
        description: "Clear causal_events for user before session (simulates new thread)",
        severity: "high",
        apply(session) {
            // Remove every stored causal event for this user.
            getDb()
                .prepare("DELETE FROM causal_events WHERE userId = ?")
                .run(session.userId);
            // With memory wiped, the session is flagged as restating context
            // and repeating a question.
            return { ...session, contextRestated: true, repeatQuestionDetected: true };
        },
    },
    {
        type: "tool_failure",
        description: "Randomly mark 1-2 tools in chain as failed (tests graceful degradation)",
        severity: "medium",
        apply(session) {
            const nextRandom = seededRandom(session.runId.length + session.sessionIndex);
            // One or two synthetic failures, decided by the first random draw.
            const failures = nextRandom() > 0.5 ? 2 : 1;
            const synthetic = [];
            let n = 0;
            while (n < failures) {
                synthetic.push(`perturbation:tool_failure_injected_${n}`);
                n += 1;
            }
            // Each injected failure costs 0.5 judge points, floored at 1.0.
            return {
                ...session,
                errors: [...session.errors, ...synthetic],
                judgeScore: Math.max(1.0, session.judgeScore - 0.5 * failures),
            };
        },
    },
    {
        type: "stale_memory",
        description: "Inject a causal_event with 30-day-old timestamp for a different entity",
        severity: "low",
        apply(session) {
            const store = getDb();
            const thirtyDaysMs = 30 * 24 * 60 * 60 * 1000;
            const backdated = new Date(Date.now() - thirtyDaysMs).toISOString();
            const insertStale = store.prepare(`
    INSERT INTO causal_events (id, userId, eventType, payload, createdAt)
    VALUES (?, ?, ?, ?, ?)
  `);
            insertStale.run(genId("ce_stale"), session.userId, "stale_injection", JSON.stringify({ entity: "StaleCorpXYZ", scenarioId: "stale_test", injected: true }), backdated);
            // The session object is returned untouched; only the database changes.
            return session;
        },
    },
    {
        type: "model_swap",
        description: "Jitter judge score by +/-0.3 to simulate different model behavior",
        severity: "low",
        apply(session) {
            const nextRandom = seededRandom(session.sessionIndex * 31 + session.runId.length);
            // Map the draw onto an offset of at most 0.3 either way.
            const offset = (nextRandom() - 0.5) * 0.6;
            const shifted = session.judgeScore + offset;
            // Keep the score inside the 1.0 - 5.0 judge scale.
            return {
                ...session,
                judgeScore: Math.max(1.0, Math.min(5.0, shifted)),
            };
        },
    },
    {
        type: "schema_change",
        description: "Skip one field from packet output (tests downstream handling of missing fields)",
        severity: "medium",
        apply(session) {
            // Model the missing field by clearing exportProduced and docking 0.2
            // judge points (floored at 1.0).
            const docked = session.judgeScore - 0.2;
            return {
                ...session,
                exportProduced: false,
                judgeScore: Math.max(1.0, docked),
            };
        },
    },
];
|
|
110
|
+
/**
 * Pick a perturbation round-robin from PERTURBATIONS.
 *
 * @param {number} sessionIndex - 1-based session counter; index 1 maps to the
 *   first catalogue entry and the selection wraps around after the last one.
 * @returns {object} The selected PERTURBATIONS entry.
 */
function selectPerturbation(sessionIndex) {
    const count = PERTURBATIONS.length;
    // The % operator keeps the sign of the dividend, so an index below 1 used to
    // yield a negative slot and therefore undefined; normalise it to 0..count-1.
    const slot = (((sessionIndex - 1) % count) + count) % count;
    return PERTURBATIONS[slot];
}
|
|
28
113
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
29
114
|
// Constants
|
|
30
115
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
@@ -138,11 +223,55 @@ CREATE TABLE IF NOT EXISTS causal_events (
|
|
|
138
223
|
createdAt TEXT NOT NULL DEFAULT (datetime('now'))
|
|
139
224
|
);
|
|
140
225
|
|
|
226
|
+
CREATE TABLE IF NOT EXISTS session_actions (
|
|
227
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
228
|
+
sessionRunId TEXT NOT NULL,
|
|
229
|
+
actionIndex INTEGER NOT NULL,
|
|
230
|
+
toolName TEXT NOT NULL,
|
|
231
|
+
inputSummary TEXT NOT NULL DEFAULT '',
|
|
232
|
+
outputSummary TEXT NOT NULL DEFAULT '',
|
|
233
|
+
latencyMs INTEGER NOT NULL DEFAULT 0,
|
|
234
|
+
passed INTEGER NOT NULL DEFAULT 0,
|
|
235
|
+
skipped INTEGER NOT NULL DEFAULT 0,
|
|
236
|
+
error TEXT,
|
|
237
|
+
createdAt TEXT NOT NULL DEFAULT (datetime('now')),
|
|
238
|
+
FOREIGN KEY (sessionRunId) REFERENCES longitudinal_sessions(runId)
|
|
239
|
+
);
|
|
240
|
+
|
|
241
|
+
CREATE TABLE IF NOT EXISTS benchmark_rollups (
|
|
242
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
243
|
+
period TEXT NOT NULL,
|
|
244
|
+
periodKey TEXT NOT NULL,
|
|
245
|
+
totalRuns INTEGER NOT NULL DEFAULT 0,
|
|
246
|
+
completionRate REAL NOT NULL DEFAULT 0,
|
|
247
|
+
avgJudgeScore REAL NOT NULL DEFAULT 0,
|
|
248
|
+
rca REAL NOT NULL DEFAULT 0,
|
|
249
|
+
prr REAL NOT NULL DEFAULT 0,
|
|
250
|
+
durabilityScore REAL NOT NULL DEFAULT 0,
|
|
251
|
+
topFailureMode TEXT NOT NULL DEFAULT 'none',
|
|
252
|
+
createdAt TEXT NOT NULL DEFAULT (datetime('now')),
|
|
253
|
+
UNIQUE(period, periodKey)
|
|
254
|
+
);
|
|
255
|
+
|
|
256
|
+
CREATE TABLE IF NOT EXISTS workflow_maturity (
|
|
257
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
258
|
+
scenarioId TEXT NOT NULL,
|
|
259
|
+
maturityLevel TEXT NOT NULL,
|
|
260
|
+
label TEXT NOT NULL,
|
|
261
|
+
evidence TEXT NOT NULL DEFAULT '',
|
|
262
|
+
batchId TEXT NOT NULL,
|
|
263
|
+
createdAt TEXT NOT NULL DEFAULT (datetime('now')),
|
|
264
|
+
UNIQUE(scenarioId, batchId)
|
|
265
|
+
);
|
|
266
|
+
|
|
141
267
|
CREATE INDEX IF NOT EXISTS idx_longitudinal_batch ON longitudinal_sessions(batchId);
|
|
142
268
|
CREATE INDEX IF NOT EXISTS idx_longitudinal_user ON longitudinal_sessions(userId);
|
|
143
269
|
CREATE INDEX IF NOT EXISTS idx_longitudinal_cohort ON longitudinal_sessions(cohortSize);
|
|
144
270
|
CREATE INDEX IF NOT EXISTS idx_founder_packets_entity ON founder_packets(entityId, scenarioId);
|
|
145
271
|
CREATE INDEX IF NOT EXISTS idx_causal_events_user ON causal_events(userId);
|
|
272
|
+
CREATE INDEX IF NOT EXISTS idx_session_actions_run ON session_actions(sessionRunId);
|
|
273
|
+
CREATE INDEX IF NOT EXISTS idx_benchmark_rollups_period ON benchmark_rollups(period, periodKey);
|
|
274
|
+
CREATE INDEX IF NOT EXISTS idx_workflow_maturity_scenario ON workflow_maturity(scenarioId);
|
|
146
275
|
`;
|
|
147
276
|
function ensureSchema() {
|
|
148
277
|
const db = getDb();
|
|
@@ -160,6 +289,13 @@ function ensureSchema() {
|
|
|
160
289
|
catch {
|
|
161
290
|
db.exec("DROP TABLE IF EXISTS causal_events");
|
|
162
291
|
}
|
|
292
|
+
// Migrate session_actions if schema changed
|
|
293
|
+
try {
|
|
294
|
+
db.prepare("SELECT sessionRunId FROM session_actions LIMIT 1").get();
|
|
295
|
+
}
|
|
296
|
+
catch {
|
|
297
|
+
db.exec("DROP TABLE IF EXISTS session_actions");
|
|
298
|
+
}
|
|
163
299
|
db.exec(LONGITUDINAL_SCHEMA);
|
|
164
300
|
}
|
|
165
301
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
@@ -217,6 +353,16 @@ function persistSession(session, batchId, cohortSize) {
|
|
|
217
353
|
(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
218
354
|
`).run(session.runId, batchId, cohortSize, session.userId, session.role, session.scenarioId, session.sessionIndex, session.timeHorizon, session.surface, session.toolCallCount, session.latencyMs, session.packetGenerated ? 1 : 0, session.packetReused ? 1 : 0, session.repeatQuestionDetected ? 1 : 0, session.contextRestated ? 1 : 0, session.exportProduced ? 1 : 0, session.judgeScore, JSON.stringify(session.errors));
|
|
219
355
|
}
|
|
356
|
+
/**
 * Write the per-action records of one session into the session_actions table,
 * one row per record, in the order given.
 *
 * @param {string} sessionRunId - Value stored in the sessionRunId column of every row.
 * @param {Array<object>} actions - Records carrying actionIndex, toolName,
 *   inputSummary, outputSummary, latencyMs, passed, skipped and an optional error.
 */
function persistActionRecords(sessionRunId, actions) {
    const insertAction = getDb().prepare(`
    INSERT INTO session_actions (sessionRunId, actionIndex, toolName, inputSummary, outputSummary, latencyMs, passed, skipped, error)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
  `);
    // Booleans are stored as 0/1 integers.
    const asFlag = (value) => (value ? 1 : 0);
    for (const record of actions) {
        const { actionIndex, toolName, inputSummary, outputSummary, latencyMs, passed, skipped, error } = record;
        // A missing error is stored as NULL.
        insertAction.run(sessionRunId, actionIndex, toolName, inputSummary, outputSummary, latencyMs, asFlag(passed), asFlag(skipped), error ?? null);
    }
}
|
|
220
366
|
/**
|
|
221
367
|
* Issue 1 fix: Check founder_packets table for a prior packet matching this entity+scenario.
|
|
222
368
|
* Returns true only if sessionIndex > 1 AND a stored packet exists.
|
|
@@ -266,7 +412,7 @@ function recordCausalEvent(db, userId, scenarioId, sessionIndex) {
|
|
|
266
412
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
267
413
|
// Session Simulation
|
|
268
414
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
269
|
-
async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batchId, cohortSize) {
|
|
415
|
+
async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batchId, cohortSize, perturbation) {
|
|
270
416
|
const tools = await getAllTools();
|
|
271
417
|
const chain = SCENARIO_TOOL_CHAINS[scenarioId];
|
|
272
418
|
if (!chain) {
|
|
@@ -288,25 +434,69 @@ async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batc
|
|
|
288
434
|
// Issue 2 fix: check causal_events for prior memory from this user
|
|
289
435
|
const hasPriorMemory = sessionIndex > 1 && hasCausalMemory(db, user.userId);
|
|
290
436
|
// Context restated only if session > 1 AND no causal memory exists
|
|
291
|
-
|
|
437
|
+
let contextRestated = sessionIndex > 1 && !hasPriorMemory;
|
|
292
438
|
// Repeat question: if context was restated, the user likely re-asked old questions.
|
|
293
|
-
|
|
439
|
+
let repeatQuestionDetected = contextRestated;
|
|
294
440
|
// Issue 1 fix: if packet reused and sessionIndex > 1, skip regeneration of the chain
|
|
295
441
|
// (but still run non-packet tools like record_event)
|
|
296
442
|
const skipRegeneration = packetReused && sessionIndex > 1;
|
|
297
443
|
let allToolsFound = true;
|
|
444
|
+
// Per-action tracking
|
|
445
|
+
const actionRecords = [];
|
|
446
|
+
// Apply thread_reset perturbation BEFORE chain (wipes causal memory)
|
|
447
|
+
if (perturbation?.type === "thread_reset") {
|
|
448
|
+
const db2 = getDb();
|
|
449
|
+
db2.prepare("DELETE FROM causal_events WHERE userId = ?").run(user.userId);
|
|
450
|
+
}
|
|
451
|
+
// Apply stale_memory perturbation BEFORE chain (injects stale data)
|
|
452
|
+
if (perturbation?.type === "stale_memory") {
|
|
453
|
+
const db2 = getDb();
|
|
454
|
+
const staleDate = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString();
|
|
455
|
+
db2.prepare(`
|
|
456
|
+
INSERT INTO causal_events (id, userId, eventType, payload, createdAt)
|
|
457
|
+
VALUES (?, ?, ?, ?, ?)
|
|
458
|
+
`).run(genId("ce_stale"), user.userId, "stale_injection", JSON.stringify({ entity: "StaleCorpXYZ", scenarioId: "stale_test", injected: true }), staleDate);
|
|
459
|
+
}
|
|
460
|
+
// Determine which tools to "fail" for tool_failure perturbation
|
|
461
|
+
const failedToolIndices = new Set();
|
|
462
|
+
if (perturbation?.type === "tool_failure") {
|
|
463
|
+
const rng = seededRandom(sessionIndex * 17 + chain.length);
|
|
464
|
+
const failCount = rng() > 0.5 ? 2 : 1;
|
|
465
|
+
// Pick random non-first, non-last indices
|
|
466
|
+
const candidates = chain.map((_, i) => i).filter((i) => i > 0 && i < chain.length - 1);
|
|
467
|
+
for (let f = 0; f < Math.min(failCount, candidates.length); f++) {
|
|
468
|
+
const pick = Math.floor(rng() * candidates.length);
|
|
469
|
+
failedToolIndices.add(candidates[pick]);
|
|
470
|
+
candidates.splice(pick, 1);
|
|
471
|
+
}
|
|
472
|
+
}
|
|
473
|
+
// Determine if schema_change perturbation skips a field
|
|
474
|
+
const schemaSkipExport = perturbation?.type === "schema_change";
|
|
298
475
|
// Run the tool chain
|
|
299
476
|
for (let i = 0; i < chain.length; i++) {
|
|
300
477
|
const toolName = chain[i];
|
|
301
478
|
const isCoreTool = i === 0; // first tool in chain is core
|
|
302
479
|
// Issue 3 fix: check if tool exists in loaded tools before calling
|
|
303
480
|
if (!availableToolNames.has(toolName)) {
|
|
481
|
+
actionRecords.push({
|
|
482
|
+
actionIndex: i, toolName, inputSummary: "", outputSummary: "",
|
|
483
|
+
latencyMs: 0, passed: !isCoreTool, skipped: true, error: isCoreTool ? `tool_not_found:${toolName}` : undefined,
|
|
484
|
+
});
|
|
304
485
|
if (isCoreTool) {
|
|
305
|
-
// Core tool missing is a real error
|
|
306
486
|
errors.push(`tool_not_found:${toolName}`);
|
|
307
487
|
allToolsFound = false;
|
|
308
488
|
}
|
|
309
|
-
|
|
489
|
+
toolCallCount++;
|
|
490
|
+
continue;
|
|
491
|
+
}
|
|
492
|
+
// Perturbation: injected tool failure
|
|
493
|
+
if (failedToolIndices.has(i)) {
|
|
494
|
+
const errMsg = `perturbation:tool_failure_injected:${toolName}`;
|
|
495
|
+
errors.push(errMsg);
|
|
496
|
+
actionRecords.push({
|
|
497
|
+
actionIndex: i, toolName, inputSummary: "perturbation_injected", outputSummary: "",
|
|
498
|
+
latencyMs: 0, passed: false, skipped: false, error: errMsg,
|
|
499
|
+
});
|
|
310
500
|
toolCallCount++;
|
|
311
501
|
continue;
|
|
312
502
|
}
|
|
@@ -314,23 +504,36 @@ async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batc
|
|
|
314
504
|
// Issue 1 fix: skip memo/analysis tools if reusing prior packet
|
|
315
505
|
if (skipRegeneration && toolName !== "record_event" && toolName !== "track_milestone" && toolName !== "check_mcp_setup") {
|
|
316
506
|
toolCallCount++;
|
|
317
|
-
|
|
318
|
-
if (
|
|
507
|
+
const skipExport = toolName === "render_decision_memo";
|
|
508
|
+
if (skipExport && !schemaSkipExport) {
|
|
319
509
|
exportProduced = true;
|
|
320
510
|
}
|
|
511
|
+
actionRecords.push({
|
|
512
|
+
actionIndex: i, toolName, inputSummary: "skip_reuse", outputSummary: "packet_reused",
|
|
513
|
+
latencyMs: 0, passed: true, skipped: true,
|
|
514
|
+
});
|
|
321
515
|
continue;
|
|
322
516
|
}
|
|
323
517
|
// Build scenario-appropriate args
|
|
324
518
|
const args = buildToolArgs(toolName, user, scenarioId);
|
|
325
519
|
const result = await callTool(tool, args);
|
|
326
520
|
toolCallCount++;
|
|
521
|
+
actionRecords.push({
|
|
522
|
+
actionIndex: i, toolName,
|
|
523
|
+
inputSummary: JSON.stringify(args).slice(0, 200),
|
|
524
|
+
outputSummary: result.ok ? String(result.result).slice(0, 200) : "",
|
|
525
|
+
latencyMs: result.ms, passed: result.ok, skipped: false,
|
|
526
|
+
error: result.ok ? undefined : result.error?.slice(0, 200),
|
|
527
|
+
});
|
|
327
528
|
if (!result.ok) {
|
|
328
529
|
errors.push(`${toolName}:${result.error?.slice(0, 120)}`);
|
|
329
530
|
}
|
|
330
531
|
// Detect packet generation from memo/export tools
|
|
331
532
|
if (toolName === "render_decision_memo" && result.ok) {
|
|
332
533
|
packetGenerated = true;
|
|
333
|
-
|
|
534
|
+
if (!schemaSkipExport) {
|
|
535
|
+
exportProduced = true;
|
|
536
|
+
}
|
|
334
537
|
}
|
|
335
538
|
}
|
|
336
539
|
// Issue 1 fix: store packet after generation so future sessions can reuse
|
|
@@ -362,7 +565,18 @@ async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batc
|
|
|
362
565
|
: scenarioId === "important_change"
|
|
363
566
|
? "engine_api"
|
|
364
567
|
: "mcp";
|
|
365
|
-
|
|
568
|
+
// Apply model_swap perturbation: jitter the judge score
|
|
569
|
+
if (perturbation?.type === "model_swap") {
|
|
570
|
+
const rng = seededRandom(sessionIndex * 31 + runId.length);
|
|
571
|
+
const jitter = (rng() - 0.5) * 0.6; // -0.3 to +0.3
|
|
572
|
+
judgeScore = Math.max(1.0, Math.min(5.0, judgeScore + jitter));
|
|
573
|
+
}
|
|
574
|
+
// Apply thread_reset perturbation: force context restated
|
|
575
|
+
if (perturbation?.type === "thread_reset") {
|
|
576
|
+
contextRestated = true;
|
|
577
|
+
repeatQuestionDetected = true;
|
|
578
|
+
}
|
|
579
|
+
let session = {
|
|
366
580
|
runId,
|
|
367
581
|
userId: user.userId,
|
|
368
582
|
role: user.role,
|
|
@@ -380,7 +594,12 @@ async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batc
|
|
|
380
594
|
judgeScore,
|
|
381
595
|
errors,
|
|
382
596
|
};
|
|
597
|
+
// Store perturbation type as metadata in errors for tracking
|
|
598
|
+
if (perturbation) {
|
|
599
|
+
session = { ...session, errors: [...session.errors, `perturbation:${perturbation.type}`] };
|
|
600
|
+
}
|
|
383
601
|
persistSession(session, batchId, cohortSize);
|
|
602
|
+
persistActionRecords(session.runId, actionRecords);
|
|
384
603
|
return session;
|
|
385
604
|
}
|
|
386
605
|
function buildToolArgs(toolName, user, scenarioId) {
|
|
@@ -527,6 +746,205 @@ export function generateCohortReport(sessions, cohortSize, layer) {
|
|
|
527
746
|
};
|
|
528
747
|
}
|
|
529
748
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
749
|
+
// Drift Durability Score
|
|
750
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
751
|
+
/**
 * Summarise how sessions behaved under perturbation.
 *
 * A session counts as perturbed when any of its errors starts with
 * "perturbation:". All three rates are percentages and default to 100 when
 * there is nothing to measure.
 *
 * @param {Array<object>} sessions - Records with errors (string[]), toolCallCount and judgeScore.
 * @returns {{driftRecoveryRate: number, perturbationSurvivalRate: number, staleMemoryRejectionRate: number}}
 */
export function computeDriftMetrics(sessions) {
    const isPerturbed = (session) => session.errors.some((err) => err.startsWith("perturbation:"));
    // Split the input once into perturbed and clean sessions.
    const perturbed = [];
    const clean = [];
    for (const session of sessions) {
        if (isPerturbed(session)) {
            perturbed.push(session);
        }
        else {
            clean.push(session);
        }
    }
    if (perturbed.length === 0) {
        return { driftRecoveryRate: 100, perturbationSurvivalRate: 100, staleMemoryRejectionRate: 100 };
    }
    // Baseline for the stale-memory check: mean judge score of the clean
    // sessions, or 3.5 when there are none.
    let cleanAvg = 3.5;
    if (clean.length > 0) {
        let cleanTotal = 0;
        for (const session of clean) {
            cleanTotal += session.judgeScore;
        }
        cleanAvg = cleanTotal / clean.length;
    }
    let recoveredCount = 0; // made tool calls and scored at least 2.0
    let survivedCount = 0; // scored at least 3.0
    let staleCount = 0; // carries a stale_memory marker
    let staleRejectedCount = 0; // stale session within 0.5 of the clean baseline
    for (const session of perturbed) {
        if (session.toolCallCount > 0 && session.judgeScore >= 2.0) {
            recoveredCount += 1;
        }
        if (session.judgeScore >= 3.0) {
            survivedCount += 1;
        }
        if (session.errors.some((err) => err.includes("stale_memory"))) {
            staleCount += 1;
            if (session.judgeScore >= cleanAvg - 0.5) {
                staleRejectedCount += 1;
            }
        }
    }
    const driftRecoveryRate = (recoveredCount / perturbed.length) * 100;
    const perturbationSurvivalRate = (survivedCount / perturbed.length) * 100;
    const staleMemoryRejectionRate = staleCount > 0 ? (staleRejectedCount / staleCount) * 100 : 100;
    return { driftRecoveryRate, perturbationSurvivalRate, staleMemoryRejectionRate };
}
|
|
775
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
776
|
+
// Composite Durability Score
|
|
777
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
778
|
+
/**
 * Compute the weighted composite durability score for a set of sessions.
 *
 * Weights: completionStability 25%, rerunSavings 20%, artifactQuality 20%,
 * memoryUsefulness 15%, driftResistance 10%, crossSessionContinuity 10%.
 *
 * @param {Array<object>} sessions - Session records; this function reads
 *   userId, sessionIndex, toolCallCount, judgeScore and contextRestated, and
 *   passes the list to computePRR, computeRCA and computeDriftMetrics.
 * @returns {object} `composite` (rounded, clamped to 0-100) plus the six
 *   component scores rounded to one decimal; all zeros for an empty list.
 */
export function computeDurabilityScore(sessions) {
    if (sessions.length === 0) {
        return { composite: 0, completionStability: 0, rerunSavings: 0, artifactQuality: 0, memoryUsefulness: 0, driftResistance: 0, crossSessionContinuity: 0 };
    }
    const roundToTenth = (value) => Math.round(value * 10) / 10;
    // completionStability (25%): share of sessions with tool calls and judge >= 2.0
    const completed = sessions.filter((s) => s.toolCallCount > 0 && s.judgeScore >= 2.0).length;
    const completionStability = (completed / sessions.length) * 100;
    // rerunSavings (20%): packet reuse rate (PRR)
    const rerunSavings = computePRR(sessions);
    // artifactQuality (20%): average judge score on a 0-100 scale
    const avgJudge = sessions.reduce((a, s) => a + s.judgeScore, 0) / sessions.length;
    const artifactQuality = (avgJudge / 5.0) * 100;
    // memoryUsefulness (15%): RCA scaled down by the stale-memory pollution rate
    const rca = computeRCA(sessions);
    const drift = computeDriftMetrics(sessions);
    const staleMemoryPollutionRate = 1 - drift.staleMemoryRejectionRate / 100;
    const memoryUsefulness = (rca / 100) * (1 - staleMemoryPollutionRate) * 100;
    // driftResistance (10%): perturbation survival rate
    const driftResistance = drift.perturbationSurvivalRate;
    // crossSessionContinuity (10%): share of multi-session users with at least
    // one later session that did not restate context.
    // Grouping uses a Map: on a plain object a userId such as "constructor" or
    // "__proto__" resolves to an inherited member and breaks the grouping.
    const sessionsByUser = new Map();
    for (const s of sessions) {
        const key = String(s.userId);
        const bucket = sessionsByUser.get(key);
        if (bucket) {
            bucket.push(s);
        }
        else {
            sessionsByUser.set(key, [s]);
        }
    }
    const multiSessionUsers = [...sessionsByUser.values()].filter((us) => us.length > 1);
    const continuityCount = multiSessionUsers.filter((us) => us.some((s) => s.sessionIndex > 1 && !s.contextRestated)).length;
    const crossSessionContinuity = multiSessionUsers.length > 0
        ? (continuityCount / multiSessionUsers.length) * 100
        : 0;
    // Weighted composite
    const composite = Math.round(completionStability * 0.25 +
        rerunSavings * 0.20 +
        artifactQuality * 0.20 +
        memoryUsefulness * 0.15 +
        driftResistance * 0.10 +
        crossSessionContinuity * 0.10);
    return {
        composite: Math.max(0, Math.min(100, composite)),
        completionStability: roundToTenth(completionStability),
        rerunSavings: roundToTenth(rerunSavings),
        artifactQuality: roundToTenth(artifactQuality),
        memoryUsefulness: roundToTenth(memoryUsefulness),
        driftResistance: roundToTenth(driftResistance),
        crossSessionContinuity: roundToTenth(crossSessionContinuity),
    };
}
|
|
832
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
833
|
+
// Period Rollups
|
|
834
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
835
|
+
/**
 * Aggregate sessions into one rollup row for the current period and persist
 * it to benchmark_rollups (INSERT OR REPLACE on period + periodKey).
 *
 * All sessions are grouped under the period containing "now"; the key is
 * derived in UTC for every period type.
 *
 * @param {Array<object>} sessions - Records with toolCallCount, judgeScore and errors (string[]).
 * @param {"daily"|"weekly"|"monthly"} period - Rollup granularity.
 * @returns {Array<object>} A one-element array with the rollup, or an empty
 *   array (and no database write) when `sessions` is empty.
 * @throws {RangeError} If sessions are given and `period` is not one of the supported values.
 */
export function computeRollup(sessions, period) {
    // Nothing to aggregate and nothing to persist.
    if (sessions.length === 0) {
        return [];
    }
    const now = new Date();
    const keyFor = (d) => {
        switch (period) {
            case "daily":
                return d.toISOString().slice(0, 10); // 2026-03-24
            case "weekly": {
                // UTC getters keep the week key on the same clock as the ISO-string
                // based daily and monthly keys.
                const jan1 = new Date(Date.UTC(d.getUTCFullYear(), 0, 1));
                const daysIntoYear = (d.getTime() - jan1.getTime()) / 86400000;
                const weekNum = Math.ceil((daysIntoYear + jan1.getUTCDay() + 1) / 7);
                return `${d.getUTCFullYear()}-W${String(weekNum).padStart(2, "0")}`;
            }
            case "monthly":
                return d.toISOString().slice(0, 7); // 2026-03
            default:
                // Fail early instead of producing an undefined key for a NOT NULL column.
                throw new RangeError(`Unsupported rollup period: ${String(period)}`);
        }
    };
    const periodKey = keyFor(now);
    const completedCount = sessions.filter((s) => s.toolCallCount > 0 && s.judgeScore >= 2.0).length;
    const completionRate = (completedCount / sessions.length) * 100;
    const avgJudgeScore = sessions.reduce((a, s) => a + s.judgeScore, 0) / sessions.length;
    const rca = computeRCA(sessions);
    const prr = computePRR(sessions);
    const durability = computeDurabilityScore(sessions);
    // Top failure mode: tally errors by the text before the first ":".
    // A Map is used because a prefix such as "constructor" would collide with
    // inherited members of a plain object and corrupt the counts.
    const errorCounts = new Map();
    for (const s of sessions) {
        for (const e of s.errors) {
            const prefix = e.split(":")[0];
            errorCounts.set(prefix, (errorCounts.get(prefix) ?? 0) + 1);
        }
    }
    const ranked = [...errorCounts.entries()].sort((a, b) => b[1] - a[1]);
    const topFailureMode = ranked.length > 0 ? `${ranked[0][0]}(${ranked[0][1]})` : "none";
    const rollup = {
        period,
        periodKey,
        totalRuns: sessions.length,
        completionRate: Math.round(completionRate * 10) / 10,
        avgJudgeScore: Math.round(avgJudgeScore * 100) / 100,
        rca: Math.round(rca * 10) / 10,
        prr: Math.round(prr * 10) / 10,
        durabilityScore: durability.composite,
        topFailureMode,
        createdAt: now.toISOString(),
    };
    // Persist
    const db = getDb();
    db.prepare(`
    INSERT OR REPLACE INTO benchmark_rollups (period, periodKey, totalRuns, completionRate, avgJudgeScore, rca, prr, durabilityScore, topFailureMode, createdAt)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `).run(rollup.period, rollup.periodKey, rollup.totalRuns, rollup.completionRate, rollup.avgJudgeScore, rollup.rca, rollup.prr, rollup.durabilityScore, rollup.topFailureMode, rollup.createdAt);
    return [rollup];
}
|
|
890
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
891
|
+
// Maturity Levels
|
|
892
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
893
|
+
/**
 * Grade one scenario's workflow maturity (levels A through E) from its
 * sessions and persist the verdict to workflow_maturity
 * (INSERT OR REPLACE on scenarioId + batchId).
 *
 * @param {string} scenarioId - Scenario to grade; only sessions with this scenarioId are considered.
 * @param {Array<object>} sessions - Session records (reads scenarioId and judgeScore here).
 * @param {string} batchId - Batch identifier stored with the assessment.
 * @returns {{level: string, label: string, scenarioId: string, evidence: string}}
 */
export function computeMaturityLevel(scenarioId, sessions, batchId) {
    const relevant = sessions.filter((s) => s.scenarioId === scenarioId);
    const n = relevant.length;
    // Mean, sample variance (n - 1 denominator) and coefficient of variation
    // of the judge scores.
    let total = 0;
    for (const s of relevant) {
        total += s.judgeScore;
    }
    const avg = n > 0 ? total / n : 0;
    let variance = 0;
    if (n > 1) {
        let squaredDeviation = 0;
        for (const s of relevant) {
            squaredDeviation += Math.pow(s.judgeScore - avg, 2);
        }
        variance = squaredDeviation / (n - 1);
    }
    const coeffVar = avg > 0 ? (Math.sqrt(variance) / avg) * 100 : 100;
    const prr = computePRR(relevant);
    const rca = computeRCA(relevant);
    const drift = computeDriftMetrics(relevant);
    const durability = computeDurabilityScore(relevant);
    // Tiers are tested from most to least demanding; the first match wins.
    const classify = () => {
        if (durability.composite > 85 && prr > 80 && rca > 90 && n >= 30) {
            return ["E", "institutional", `PRR=${prr.toFixed(0)}% RCA=${rca.toFixed(0)}% durability=${durability.composite} n=${n} over 30+ sessions`];
        }
        const survivesPerturbation = n >= 10 && drift.perturbationSurvivalRate > 80;
        if (survivesPerturbation && drift.driftRecoveryRate > 70) {
            return ["D", "durable", `pertSurvival=${drift.perturbationSurvivalRate.toFixed(0)}% driftRecovery=${drift.driftRecoveryRate.toFixed(0)}% n=${n}`];
        }
        if (survivesPerturbation && prr > 0) {
            return ["C", "hardened", `PRR=${prr.toFixed(0)}% pertSurvival=${drift.perturbationSurvivalRate.toFixed(0)}% n=${n}`];
        }
        if (n >= 5 && coeffVar < 20) {
            return ["B", "stable", `CV=${coeffVar.toFixed(1)}% avg=${avg.toFixed(2)} n=${n}`];
        }
        if (n >= 1 && avg >= 2.0) {
            return ["A", "smoke-ready", `avg=${avg.toFixed(2)} n=${n}`];
        }
        // Fallback: still reported as level A, flagged as marginal.
        return ["A", "smoke-ready (marginal)", `avg=${avg.toFixed(2)} n=${n} — below smoke threshold`];
    };
    const [level, label, evidence] = classify();
    // Persist
    getDb().prepare(`
    INSERT OR REPLACE INTO workflow_maturity (scenarioId, maturityLevel, label, evidence, batchId)
    VALUES (?, ?, ?, ?, ?)
  `).run(scenarioId, level, label, evidence, batchId);
    return { level, label, scenarioId, evidence };
}
|
|
947
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
530
948
|
// N-level Runners
|
|
531
949
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
532
950
|
/**
|
|
@@ -564,60 +982,95 @@ export async function runN5() {
|
|
|
564
982
|
return report;
|
|
565
983
|
}
|
|
566
984
|
/**
|
|
567
|
-
* N=10:
|
|
568
|
-
* Tests session-continuity
|
|
985
|
+
* N=10: 5 users x 2 sessions. Sessions 6-10 receive perturbations.
|
|
986
|
+
* Tests session-continuity and drift resistance.
|
|
569
987
|
*/
|
|
570
988
|
/**
 * Run the N=10 cohort: the first five cohort users, two sessions each.
 * The first five sessions (in execution order) run clean; sessions six to ten
 * each receive a perturbation chosen by selectPerturbation. Prints the cohort,
 * durability, maturity and (when present) rollup reports.
 *
 * @returns {Promise<object>} The cohort report produced by generateCohortReport.
 */
export async function runN10() {
    const batchId = genId("batch");
    const users = COHORT_USERS.slice(0, 5);
    console.log(`\n=== N=10: Session Continuity + Perturbations — 5 users x 2 sessions ===\n`);
    console.log(` Sessions 1-5: clean baseline | Sessions 6-10: perturbed\n`);
    // Flatten the user x session grid so the running position doubles as the
    // global session counter.
    const plan = users.flatMap((user) => [1, 2].map((sessionIdx) => ({ user, sessionIdx })));
    const sessions = [];
    for (const [position, { user, sessionIdx }] of plan.entries()) {
        const globalIdx = position + 1;
        const scenario = user.typicalScenarios[(sessionIdx - 1) % user.typicalScenarios.length];
        const horizon = sessionIdx === 1 ? "same_session" : "next_day";
        // Only sessions after the fifth are perturbed.
        const perturbation = globalIdx > 5 ? selectPerturbation(globalIdx) : undefined;
        const session = await simulateSession(user, scenario, sessionIdx, horizon, batchId, 10, perturbation);
        sessions.push(session);
        printSessionLine(session, perturbation);
    }
    const report = generateCohortReport(sessions, 10, "n10");
    const drift = computeDriftMetrics(sessions);
    const durability = computeDurabilityScore(sessions);
    const rollups = computeRollup(sessions, "daily");
    // One maturity assessment per distinct scenario, in first-seen order.
    const scenarioIds = [...new Set(sessions.map((s) => s.scenarioId))];
    const maturityAssessments = scenarioIds.map((id) => computeMaturityLevel(id, sessions, batchId));
    printReport(report, "N=10");
    printDurabilityReport(durability, drift);
    printMaturityReport(maturityAssessments);
    if (rollups.length > 0) {
        printRollupSummary(rollups);
    }
    return report;
}
|
|
588
1021
|
/**
|
|
589
1022
|
* N=100: 10 users x 10 sessions each (simulated across time horizons).
|
|
590
|
-
*
|
|
1023
|
+
* Sessions 1-20: clean baseline. Sessions 21-100: perturbed.
|
|
1024
|
+
* Measures RCA + PRR compounding over time + drift durability.
|
|
591
1025
|
*/
|
|
592
1026
|
export async function runN100() {
|
|
593
1027
|
const batchId = genId("batch");
|
|
594
1028
|
console.log(`\n=== N=100: Longitudinal Compounding — 10 users x 10 sessions ===\n`);
|
|
1029
|
+
console.log(` Sessions 1-20: clean baseline | Sessions 21-100: perturbed\n`);
|
|
595
1030
|
const sessions = [];
|
|
1031
|
+
let globalIdx = 0;
|
|
596
1032
|
for (const user of COHORT_USERS) {
|
|
597
1033
|
for (let sessionIdx = 1; sessionIdx <= 10; sessionIdx++) {
|
|
1034
|
+
globalIdx++;
|
|
598
1035
|
const scenario = user.typicalScenarios[(sessionIdx - 1) % user.typicalScenarios.length];
|
|
599
1036
|
// Spread sessions across time horizons to simulate real usage patterns
|
|
600
1037
|
const horizonIdx = Math.min(sessionIdx - 1, TIME_HORIZONS.length - 1);
|
|
601
1038
|
const horizon = TIME_HORIZONS[horizonIdx];
|
|
602
|
-
|
|
1039
|
+
// Apply perturbation to sessions 21-100
|
|
1040
|
+
const perturbation = globalIdx > 20 ? selectPerturbation(globalIdx) : undefined;
|
|
1041
|
+
const session = await simulateSession(user, scenario, sessionIdx, horizon, batchId, 100, perturbation);
|
|
603
1042
|
sessions.push(session);
|
|
604
|
-
printSessionLine(session);
|
|
1043
|
+
printSessionLine(session, perturbation);
|
|
605
1044
|
}
|
|
606
1045
|
}
|
|
607
1046
|
const report = generateCohortReport(sessions, 100, "n100");
|
|
1047
|
+
const drift = computeDriftMetrics(sessions);
|
|
1048
|
+
const durability = computeDurabilityScore(sessions);
|
|
1049
|
+
// Rollups for all periods
|
|
1050
|
+
const dailyRollups = computeRollup(sessions, "daily");
|
|
1051
|
+
const weeklyRollups = computeRollup(sessions, "weekly");
|
|
1052
|
+
const monthlyRollups = computeRollup(sessions, "monthly");
|
|
1053
|
+
// Compute maturity per scenario
|
|
1054
|
+
const scenarios = [...new Set(sessions.map((s) => s.scenarioId))];
|
|
1055
|
+
const maturityAssessments = scenarios.map((sc) => computeMaturityLevel(sc, sessions, batchId));
|
|
608
1056
|
printReport(report, "N=100");
|
|
1057
|
+
printDurabilityReport(durability, drift);
|
|
1058
|
+
printMaturityReport(maturityAssessments);
|
|
1059
|
+
printRollupSummary([...dailyRollups, ...weeklyRollups, ...monthlyRollups]);
|
|
609
1060
|
return report;
|
|
610
1061
|
}
|
|
611
1062
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
612
1063
|
// Output Formatting
|
|
613
1064
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
614
|
-
function printSessionLine(s) {
|
|
615
|
-
const
|
|
1065
|
+
function printSessionLine(s, perturbation) {
|
|
1066
|
+
const realErrors = s.errors.filter((e) => !e.startsWith("perturbation:"));
|
|
1067
|
+
const status = realErrors.length === 0 ? "OK" : `ERR(${realErrors.length})`;
|
|
616
1068
|
const reuse = s.packetReused ? "REUSE" : s.packetGenerated ? "NEW" : "NONE";
|
|
617
1069
|
const restated = s.contextRestated ? "RESTATED" : "FRESH";
|
|
1070
|
+
const pertMarker = perturbation ? ` [PERTURB:${perturbation.type}/${perturbation.severity}]` : "";
|
|
618
1071
|
console.log(` [${s.role.padEnd(10)}] sess=${s.sessionIndex} ${s.scenarioId.padEnd(18)} ` +
|
|
619
1072
|
`tools=${s.toolCallCount} ${s.latencyMs}ms judge=${s.judgeScore.toFixed(1)} ` +
|
|
620
|
-
`packet=${reuse} ctx=${restated} ${status}`);
|
|
1073
|
+
`packet=${reuse} ctx=${restated} ${status}${pertMarker}`);
|
|
621
1074
|
}
|
|
622
1075
|
function printReport(report, label) {
|
|
623
1076
|
const passLabel = report.passed ? "PASS" : "FAIL";
|
|
@@ -644,6 +1097,51 @@ function printReport(report, label) {
|
|
|
644
1097
|
╚══════════════════════════════════════════════════════════════╝
|
|
645
1098
|
`);
|
|
646
1099
|
}
|
|
1100
|
+
function printDurabilityReport(durability, drift) {
|
|
1101
|
+
console.log(`
|
|
1102
|
+
╔══════════════════════════════════════════════════════════════╗
|
|
1103
|
+
║ DURABILITY SCORE ${String(durability.composite).padStart(3)}/100 ║
|
|
1104
|
+
╠══════════════════════════════════════════════════════════════╣
|
|
1105
|
+
║ Completion Stability (25%): ${String(durability.completionStability).padStart(6)}% ║
|
|
1106
|
+
║ Rerun Savings (20%): ${String(durability.rerunSavings).padStart(6)}% ║
|
|
1107
|
+
║ Artifact Quality (20%): ${String(durability.artifactQuality).padStart(6)}% ║
|
|
1108
|
+
║ Memory Usefulness (15%): ${String(durability.memoryUsefulness).padStart(6)}% ║
|
|
1109
|
+
║ Drift Resistance (10%): ${String(durability.driftResistance).padStart(6)}% ║
|
|
1110
|
+
║ Cross-Session Continuity (10%):${String(durability.crossSessionContinuity).padStart(6)}% ║
|
|
1111
|
+
╠══════════════════════════════════════════════════════════════╣
|
|
1112
|
+
║ Drift Recovery Rate: ${String(Math.round(drift.driftRecoveryRate * 10) / 10).padStart(6)}% ║
|
|
1113
|
+
║ Perturbation Survival Rate: ${String(Math.round(drift.perturbationSurvivalRate * 10) / 10).padStart(6)}% ║
|
|
1114
|
+
║ Stale Memory Rejection Rate: ${String(Math.round(drift.staleMemoryRejectionRate * 10) / 10).padStart(6)}% ║
|
|
1115
|
+
╚══════════════════════════════════════════════════════════════╝
|
|
1116
|
+
`);
|
|
1117
|
+
}
|
|
1118
|
+
function printMaturityReport(assessments) {
|
|
1119
|
+
console.log(`
|
|
1120
|
+
╔══════════════════════════════════════════════════════════════╗
|
|
1121
|
+
║ WORKFLOW MATURITY LEVELS ║
|
|
1122
|
+
╠══════════════════════════════════════════════════════════════╣`);
|
|
1123
|
+
for (const a of assessments) {
|
|
1124
|
+
const line = ` Level ${a.level} (${a.label}) — ${a.scenarioId}`;
|
|
1125
|
+
console.log(`║${line.padEnd(60)}║`);
|
|
1126
|
+
console.log(`║ ${a.evidence.slice(0, 56).padEnd(56)}║`);
|
|
1127
|
+
}
|
|
1128
|
+
console.log(`╚══════════════════════════════════════════════════════════════╝
|
|
1129
|
+
`);
|
|
1130
|
+
}
|
|
1131
|
+
function printRollupSummary(rollups) {
|
|
1132
|
+
console.log(`
|
|
1133
|
+
╔══════════════════════════════════════════════════════════════╗
|
|
1134
|
+
║ PERIOD ROLLUPS ║
|
|
1135
|
+
╠══════════════════════════════════════════════════════════════╣`);
|
|
1136
|
+
for (const r of rollups) {
|
|
1137
|
+
const line = ` ${r.period.padEnd(8)} ${r.periodKey.padEnd(12)} runs=${String(r.totalRuns).padStart(4)} ` +
|
|
1138
|
+
`comp=${r.completionRate.toFixed(0)}% judge=${r.avgJudgeScore.toFixed(1)} ` +
|
|
1139
|
+
`dur=${r.durabilityScore}`;
|
|
1140
|
+
console.log(`║${line.padEnd(60)}║`);
|
|
1141
|
+
}
|
|
1142
|
+
console.log(`╚══════════════════════════════════════════════════════════════╝
|
|
1143
|
+
`);
|
|
1144
|
+
}
|
|
647
1145
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
648
1146
|
// CLI Entry Point
|
|
649
1147
|
// ═══════════════════════════════════════════════════════════════════════════
|