nodebench-mcp 2.53.0 → 2.55.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmarks/benchmarkRunner.js +27 -4
- package/dist/benchmarks/benchmarkRunner.js.map +1 -1
- package/dist/benchmarks/benchmarkTools.js +13 -0
- package/dist/benchmarks/benchmarkTools.js.map +1 -1
- package/dist/benchmarks/longitudinalHarness.d.ts +58 -3
- package/dist/benchmarks/longitudinalHarness.js +637 -38
- package/dist/benchmarks/longitudinalHarness.js.map +1 -1
- package/dist/benchmarks/longitudinalTypes.d.ts +16 -0
- package/dist/benchmarks/longitudinalTypes.js.map +1 -1
- package/dist/benchmarks/perturbations.d.ts +57 -0
- package/dist/benchmarks/perturbations.js +235 -0
- package/dist/benchmarks/perturbations.js.map +1 -0
- package/package.json +1 -1
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
#!/usr/bin/env npx tsx
|
|
2
|
+
// @ts-nocheck — standalone CLI script generated by external tooling; not part of the library build
|
|
2
3
|
/**
|
|
3
4
|
* longitudinalHarness.ts — Longitudinal dogfood benchmark harness for NodeBench MCP.
|
|
4
5
|
*
|
|
@@ -25,12 +26,96 @@ import { learningTools } from "../tools/learningTools.js";
|
|
|
25
26
|
import { flywheelTools } from "../tools/flywheelTools.js";
|
|
26
27
|
import { createMetaTools } from "../tools/metaTools.js";
|
|
27
28
|
import { createProgressiveDiscoveryTools } from "../tools/progressiveDiscoveryTools.js";
|
|
29
|
+
/**
 * Seeded PRNG for deterministic perturbation randomness.
 *
 * Linear congruential generator (multiplier 1664525, increment 1013904223)
 * whose state is masked to 31 bits after every step.
 *
 * @param {number} seed - Initial generator state.
 * @returns {() => number} Function that returns the next value in [0, 1).
 */
function seededRandom(seed) {
    let state = seed;
    return () => {
        state = (state * 1664525 + 1013904223) & 0x7fffffff;
        // Divide by 2^31 (one more than the largest masked state) so the result
        // is always strictly below 1. Dividing by 0x7fffffff could return exactly
        // 1, which makes Math.floor(rng() * length) land one past the last index.
        return state / 0x80000000;
    };
}
|
|
37
|
+
/**
 * Catalogue of perturbations that can be applied to a simulated session.
 * Each entry carries a `type`, a human-readable `description`, a `severity`
 * and an `apply(session)` function that returns the (possibly modified) session.
 */
const PERTURBATIONS = [
    {
        type: "thread_reset",
        description: "Clear causal_events for user before session (simulates new thread)",
        severity: "high",
        apply(session) {
            // Remove every causal event of this user, then flag the session as
            // having restated context and repeated a question.
            getDb().prepare("DELETE FROM causal_events WHERE userId = ?").run(session.userId);
            return { ...session, contextRestated: true, repeatQuestionDetected: true };
        },
    },
    {
        type: "tool_failure",
        description: "Randomly mark 1-2 tools in chain as failed (tests graceful degradation)",
        severity: "medium",
        apply(session) {
            // One or two synthetic errors, chosen deterministically from the seed.
            const nextRandom = seededRandom(session.runId.length + session.sessionIndex);
            const failures = nextRandom() > 0.5 ? 2 : 1;
            const injected = Array.from({ length: failures }, (_, n) => `perturbation:tool_failure_injected_${n}`);
            return {
                ...session,
                errors: [...session.errors, ...injected],
                // Each injected failure costs 0.5, floored at 1.0.
                judgeScore: Math.max(1.0, session.judgeScore - 0.5 * failures),
            };
        },
    },
    {
        type: "stale_memory",
        description: "Inject a causal_event with 30-day-old timestamp for a different entity",
        severity: "low",
        apply(session) {
            const db = getDb();
            const thirtyDaysMs = 30 * 24 * 60 * 60 * 1000;
            const staleDate = new Date(Date.now() - thirtyDaysMs).toISOString();
            const insertStale = db.prepare(`
INSERT INTO causal_events (id, userId, eventType, payload, createdAt)
VALUES (?, ?, ?, ?, ?)
`);
            insertStale.run(genId("ce_stale"), session.userId, "stale_injection", JSON.stringify({ entity: "StaleCorpXYZ", scenarioId: "stale_test", injected: true }), staleDate);
            // The session object is returned untouched; only the database changes.
            return session;
        },
    },
    {
        type: "model_swap",
        description: "Jitter judge score by +/-0.3 to simulate different model behavior",
        severity: "low",
        apply(session) {
            const nextRandom = seededRandom(session.sessionIndex * 31 + session.runId.length);
            // (value - 0.5) * 0.6 spans roughly -0.3 .. +0.3.
            const offset = (nextRandom() - 0.5) * 0.6;
            const jittered = session.judgeScore + offset;
            return {
                ...session,
                judgeScore: Math.max(1.0, Math.min(5.0, jittered)),
            };
        },
    },
    {
        type: "schema_change",
        description: "Skip one field from packet output (tests downstream handling of missing fields)",
        severity: "medium",
        apply(session) {
            // Marks the export as not produced and deducts 0.2, floored at 1.0.
            const penalised = Math.max(1.0, session.judgeScore - 0.2);
            return {
                ...session,
                exportProduced: false,
                judgeScore: penalised,
            };
        },
    },
];
|
|
110
|
+
/**
 * Picks a perturbation by cycling through PERTURBATIONS.
 *
 * @param {number} sessionIndex - Session index; index 1 maps to the first entry.
 * @returns {object|undefined} The entry at (sessionIndex - 1) modulo the list length.
 */
function selectPerturbation(sessionIndex) {
    const slot = (sessionIndex - 1) % PERTURBATIONS.length;
    return PERTURBATIONS[slot];
}
|
|
28
113
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
29
114
|
// Constants
|
|
30
115
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
31
116
|
// Threshold table with one entry per key n1 / n5 / n10 / n100 (presumably cohort sizes — confirm against the caller that reads it).
const PASS_THRESHOLDS = {
|
|
32
117
|
n1: { judgeScore: 3.5 },
|
|
33
|
-
n5: { rca: 40,
|
|
118
|
+
n5: { rca: 40, judgeScore: 3.0 }, // n5 uses a judgeScore threshold instead of prr: it is single-session, so PRR is structurally 0%
|
|
34
119
|
n10: { rca: 55, prr: 35 },
|
|
35
120
|
n100: { rca: 70, prr: 50 },
|
|
36
121
|
};
|
|
@@ -122,12 +207,95 @@ CREATE TABLE IF NOT EXISTS longitudinal_sessions (
|
|
|
122
207
|
createdAt TEXT NOT NULL DEFAULT (datetime('now'))
|
|
123
208
|
);
|
|
124
209
|
|
|
210
|
+
CREATE TABLE IF NOT EXISTS founder_packets (
|
|
211
|
+
id TEXT PRIMARY KEY,
|
|
212
|
+
entityId TEXT NOT NULL,
|
|
213
|
+
scenarioId TEXT NOT NULL,
|
|
214
|
+
userId TEXT NOT NULL,
|
|
215
|
+
createdAt TEXT NOT NULL DEFAULT (datetime('now'))
|
|
216
|
+
);
|
|
217
|
+
|
|
218
|
+
CREATE TABLE IF NOT EXISTS causal_events (
|
|
219
|
+
id TEXT PRIMARY KEY,
|
|
220
|
+
userId TEXT NOT NULL,
|
|
221
|
+
eventType TEXT NOT NULL,
|
|
222
|
+
payload TEXT NOT NULL DEFAULT '{}',
|
|
223
|
+
createdAt TEXT NOT NULL DEFAULT (datetime('now'))
|
|
224
|
+
);
|
|
225
|
+
|
|
226
|
+
CREATE TABLE IF NOT EXISTS session_actions (
|
|
227
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
228
|
+
sessionRunId TEXT NOT NULL,
|
|
229
|
+
actionIndex INTEGER NOT NULL,
|
|
230
|
+
toolName TEXT NOT NULL,
|
|
231
|
+
inputSummary TEXT NOT NULL DEFAULT '',
|
|
232
|
+
outputSummary TEXT NOT NULL DEFAULT '',
|
|
233
|
+
latencyMs INTEGER NOT NULL DEFAULT 0,
|
|
234
|
+
passed INTEGER NOT NULL DEFAULT 0,
|
|
235
|
+
skipped INTEGER NOT NULL DEFAULT 0,
|
|
236
|
+
error TEXT,
|
|
237
|
+
createdAt TEXT NOT NULL DEFAULT (datetime('now')),
|
|
238
|
+
FOREIGN KEY (sessionRunId) REFERENCES longitudinal_sessions(runId)
|
|
239
|
+
);
|
|
240
|
+
|
|
241
|
+
CREATE TABLE IF NOT EXISTS benchmark_rollups (
|
|
242
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
243
|
+
period TEXT NOT NULL,
|
|
244
|
+
periodKey TEXT NOT NULL,
|
|
245
|
+
totalRuns INTEGER NOT NULL DEFAULT 0,
|
|
246
|
+
completionRate REAL NOT NULL DEFAULT 0,
|
|
247
|
+
avgJudgeScore REAL NOT NULL DEFAULT 0,
|
|
248
|
+
rca REAL NOT NULL DEFAULT 0,
|
|
249
|
+
prr REAL NOT NULL DEFAULT 0,
|
|
250
|
+
durabilityScore REAL NOT NULL DEFAULT 0,
|
|
251
|
+
topFailureMode TEXT NOT NULL DEFAULT 'none',
|
|
252
|
+
createdAt TEXT NOT NULL DEFAULT (datetime('now')),
|
|
253
|
+
UNIQUE(period, periodKey)
|
|
254
|
+
);
|
|
255
|
+
|
|
256
|
+
CREATE TABLE IF NOT EXISTS workflow_maturity (
|
|
257
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
258
|
+
scenarioId TEXT NOT NULL,
|
|
259
|
+
maturityLevel TEXT NOT NULL,
|
|
260
|
+
label TEXT NOT NULL,
|
|
261
|
+
evidence TEXT NOT NULL DEFAULT '',
|
|
262
|
+
batchId TEXT NOT NULL,
|
|
263
|
+
createdAt TEXT NOT NULL DEFAULT (datetime('now')),
|
|
264
|
+
UNIQUE(scenarioId, batchId)
|
|
265
|
+
);
|
|
266
|
+
|
|
125
267
|
CREATE INDEX IF NOT EXISTS idx_longitudinal_batch ON longitudinal_sessions(batchId);
|
|
126
268
|
CREATE INDEX IF NOT EXISTS idx_longitudinal_user ON longitudinal_sessions(userId);
|
|
127
269
|
CREATE INDEX IF NOT EXISTS idx_longitudinal_cohort ON longitudinal_sessions(cohortSize);
|
|
270
|
+
CREATE INDEX IF NOT EXISTS idx_founder_packets_entity ON founder_packets(entityId, scenarioId);
|
|
271
|
+
CREATE INDEX IF NOT EXISTS idx_causal_events_user ON causal_events(userId);
|
|
272
|
+
CREATE INDEX IF NOT EXISTS idx_session_actions_run ON session_actions(sessionRunId);
|
|
273
|
+
CREATE INDEX IF NOT EXISTS idx_benchmark_rollups_period ON benchmark_rollups(period, periodKey);
|
|
274
|
+
CREATE INDEX IF NOT EXISTS idx_workflow_maturity_scenario ON workflow_maturity(scenarioId);
|
|
128
275
|
`;
|
|
129
276
|
/**
 * Creates the longitudinal benchmark schema.
 *
 * Before executing LONGITUDINAL_SCHEMA, probes founder_packets, causal_events
 * and session_actions for a column the current schema expects. When a probe
 * throws, the table is dropped (these are benchmark-only tables, not user data).
 */
function ensureSchema() {
    const db = getDb();
    // [probe statement, drop statement] — one pair per table that may need migrating.
    const legacyChecks = [
        ["SELECT userId FROM founder_packets LIMIT 1", "DROP TABLE IF EXISTS founder_packets"],
        ["SELECT userId FROM causal_events LIMIT 1", "DROP TABLE IF EXISTS causal_events"],
        ["SELECT sessionRunId FROM session_actions LIMIT 1", "DROP TABLE IF EXISTS session_actions"],
    ];
    for (const [probeSql, dropSql] of legacyChecks) {
        try {
            db.prepare(probeSql).get();
        }
        catch {
            db.exec(dropSql);
        }
    }
    db.exec(LONGITUDINAL_SCHEMA);
}
|
|
133
301
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
@@ -185,75 +353,211 @@ function persistSession(session, batchId, cohortSize) {
|
|
|
185
353
|
(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
186
354
|
`).run(session.runId, batchId, cohortSize, session.userId, session.role, session.scenarioId, session.sessionIndex, session.timeHorizon, session.surface, session.toolCallCount, session.latencyMs, session.packetGenerated ? 1 : 0, session.packetReused ? 1 : 0, session.repeatQuestionDetected ? 1 : 0, session.contextRestated ? 1 : 0, session.exportProduced ? 1 : 0, session.judgeScore, JSON.stringify(session.errors));
|
|
187
355
|
}
|
|
356
|
+
/**
 * Inserts one session_actions row per action record of a session run.
 *
 * @param {string} sessionRunId - Value stored in the sessionRunId column of every row.
 * @param {Iterable<object>} actions - Action records; boolean `passed`/`skipped`
 *   are stored as 1/0 and a missing `error` is stored as null.
 */
function persistActionRecords(sessionRunId, actions) {
    const insertAction = getDb().prepare(`
INSERT INTO session_actions (sessionRunId, actionIndex, toolName, inputSummary, outputSummary, latencyMs, passed, skipped, error)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
`);
    for (const { actionIndex, toolName, inputSummary, outputSummary, latencyMs, passed, skipped, error } of actions) {
        const passedFlag = passed ? 1 : 0;
        const skippedFlag = skipped ? 1 : 0;
        insertAction.run(sessionRunId, actionIndex, toolName, inputSummary, outputSummary, latencyMs, passedFlag, skippedFlag, error ?? null);
    }
}
|
|
188
366
|
/**
|
|
189
|
-
*
|
|
190
|
-
*
|
|
367
|
+
* Issue 1 fix: Check founder_packets table for a prior packet stored for this entity (any scenario — the scenario argument is not used in the lookup).
|
|
368
|
+
* Returns true only if sessionIndex > 1 AND a stored packet exists.
|
|
191
369
|
*/
|
|
192
|
-
function
|
|
370
|
+
function checkPriorPacket(db, entityId, _scenarioId, sessionIndex) {
    // A first session has nothing earlier to reuse.
    const isFirstSession = sessionIndex <= 1;
    if (isFirstSession) {
        return false;
    }
    // Reuse packet if ANY prior packet exists for this entity (cross-scenario reuse)
    const lookup = db.prepare(`
SELECT COUNT(*) as c FROM founder_packets
WHERE entityId = ?
`);
    const countRow = lookup.get(entityId);
    const storedPackets = countRow?.c ?? 0;
    return storedPackets > 0;
}
|
|
380
|
+
/**
 * Issue 1 fix: Persist a founder_packets row so later sessions can reuse the packet.
 * Nothing is written when a row for the same entity and scenario is already present.
 */
function storePriorPacket(db, entityId, scenarioId, userId) {
    // Look for an existing row for this entity+scenario before inserting.
    const existing = db.prepare(`
SELECT COUNT(*) as c FROM founder_packets WHERE entityId = ? AND scenarioId = ?
`).get(entityId, scenarioId);
    const alreadyStored = (existing?.c ?? 0) !== 0;
    if (alreadyStored) {
        return;
    }
    const insertPacket = db.prepare(`
INSERT INTO founder_packets (id, entityId, scenarioId, userId) VALUES (?, ?, ?, ?)
`);
    insertPacket.run(genId("pkt"), entityId, scenarioId, userId);
}
|
|
394
|
+
/**
 * Issue 2 fix: Report whether causal_events holds at least one row for the user.
 * When it does, memory carries forward and context does NOT need restating.
 */
function hasCausalMemory(db, userId) {
    const lookup = db.prepare(`
SELECT COUNT(*) as c FROM causal_events WHERE userId = ?
`);
    const countRow = lookup.get(userId);
    const eventCount = countRow?.c ?? 0;
    return eventCount > 0;
}
|
|
404
|
+
/**
 * Issue 2 fix: Insert a "session_start" causal event for the user so that
 * later sessions find existing memory. The payload is the JSON-encoded
 * scenarioId and sessionIndex.
 */
function recordCausalEvent(db, userId, scenarioId, sessionIndex) {
    const insertEvent = db.prepare(`
INSERT INTO causal_events (id, userId, eventType, payload) VALUES (?, ?, ?, ?)
`);
    insertEvent.run(genId("ce"), userId, "session_start", JSON.stringify({ scenarioId, sessionIndex }));
}
|
|
202
412
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
203
413
|
// Session Simulation
|
|
204
414
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
205
|
-
async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batchId, cohortSize) {
|
|
415
|
+
async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batchId, cohortSize, perturbation) {
|
|
206
416
|
const tools = await getAllTools();
|
|
207
417
|
const chain = SCENARIO_TOOL_CHAINS[scenarioId];
|
|
208
418
|
if (!chain) {
|
|
209
419
|
throw new Error(`Unknown scenario "${scenarioId}". Known: ${Object.keys(SCENARIO_TOOL_CHAINS).join(", ")}`);
|
|
210
420
|
}
|
|
421
|
+
const db = getDb();
|
|
211
422
|
const runId = genId("lh");
|
|
212
423
|
const sessionStart = Date.now();
|
|
213
424
|
let toolCallCount = 0;
|
|
214
425
|
const errors = [];
|
|
215
426
|
let packetGenerated = false;
|
|
216
427
|
let exportProduced = false;
|
|
217
|
-
|
|
218
|
-
//
|
|
219
|
-
const
|
|
428
|
+
const entityId = "anthropic"; // normalized entity for this harness
|
|
429
|
+
// Issue 3 fix: build set of available tool names for graceful skip
|
|
430
|
+
const availableToolNames = new Set(tools.map((t) => t.name));
|
|
431
|
+
// Issue 1 fix: check founder_packets for prior packet before running chain
|
|
432
|
+
const priorPacketExists = checkPriorPacket(db, entityId, scenarioId, sessionIndex);
|
|
220
433
|
const packetReused = priorPacketExists;
|
|
221
|
-
//
|
|
222
|
-
const
|
|
434
|
+
// Issue 2 fix: check causal_events for prior memory from this user
|
|
435
|
+
const hasPriorMemory = sessionIndex > 1 && hasCausalMemory(db, user.userId);
|
|
436
|
+
// Context restated only if session > 1 AND no causal memory exists
|
|
437
|
+
let contextRestated = sessionIndex > 1 && !hasPriorMemory;
|
|
223
438
|
// Repeat question: if context was restated, the user likely re-asked old questions.
|
|
224
|
-
|
|
439
|
+
let repeatQuestionDetected = contextRestated;
|
|
440
|
+
// Issue 1 fix: if packet reused and sessionIndex > 1, skip regeneration of the chain
|
|
441
|
+
// (but still run non-packet tools like record_event)
|
|
442
|
+
const skipRegeneration = packetReused && sessionIndex > 1;
|
|
443
|
+
let allToolsFound = true;
|
|
444
|
+
// Per-action tracking
|
|
445
|
+
const actionRecords = [];
|
|
446
|
+
// Apply thread_reset perturbation BEFORE chain (wipes causal memory)
|
|
447
|
+
if (perturbation?.type === "thread_reset") {
|
|
448
|
+
const db2 = getDb();
|
|
449
|
+
db2.prepare("DELETE FROM causal_events WHERE userId = ?").run(user.userId);
|
|
450
|
+
}
|
|
451
|
+
// Apply stale_memory perturbation BEFORE chain (injects stale data)
|
|
452
|
+
if (perturbation?.type === "stale_memory") {
|
|
453
|
+
const db2 = getDb();
|
|
454
|
+
const staleDate = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString();
|
|
455
|
+
db2.prepare(`
|
|
456
|
+
INSERT INTO causal_events (id, userId, eventType, payload, createdAt)
|
|
457
|
+
VALUES (?, ?, ?, ?, ?)
|
|
458
|
+
`).run(genId("ce_stale"), user.userId, "stale_injection", JSON.stringify({ entity: "StaleCorpXYZ", scenarioId: "stale_test", injected: true }), staleDate);
|
|
459
|
+
}
|
|
460
|
+
// Determine which tools to "fail" for tool_failure perturbation
|
|
461
|
+
const failedToolIndices = new Set();
|
|
462
|
+
if (perturbation?.type === "tool_failure") {
|
|
463
|
+
const rng = seededRandom(sessionIndex * 17 + chain.length);
|
|
464
|
+
const failCount = rng() > 0.5 ? 2 : 1;
|
|
465
|
+
// Pick random non-first, non-last indices
|
|
466
|
+
const candidates = chain.map((_, i) => i).filter((i) => i > 0 && i < chain.length - 1);
|
|
467
|
+
for (let f = 0; f < Math.min(failCount, candidates.length); f++) {
|
|
468
|
+
const pick = Math.floor(rng() * candidates.length);
|
|
469
|
+
failedToolIndices.add(candidates[pick]);
|
|
470
|
+
candidates.splice(pick, 1);
|
|
471
|
+
}
|
|
472
|
+
}
|
|
473
|
+
// Determine if schema_change perturbation skips a field
|
|
474
|
+
const schemaSkipExport = perturbation?.type === "schema_change";
|
|
225
475
|
// Run the tool chain
|
|
226
|
-
for (
|
|
476
|
+
for (let i = 0; i < chain.length; i++) {
|
|
477
|
+
const toolName = chain[i];
|
|
478
|
+
const isCoreTool = i === 0; // first tool in chain is core
|
|
479
|
+
// Issue 3 fix: check if tool exists in loaded tools before calling
|
|
480
|
+
if (!availableToolNames.has(toolName)) {
|
|
481
|
+
actionRecords.push({
|
|
482
|
+
actionIndex: i, toolName, inputSummary: "", outputSummary: "",
|
|
483
|
+
latencyMs: 0, passed: !isCoreTool, skipped: true, error: isCoreTool ? `tool_not_found:${toolName}` : undefined,
|
|
484
|
+
});
|
|
485
|
+
if (isCoreTool) {
|
|
486
|
+
errors.push(`tool_not_found:${toolName}`);
|
|
487
|
+
allToolsFound = false;
|
|
488
|
+
}
|
|
489
|
+
toolCallCount++;
|
|
490
|
+
continue;
|
|
491
|
+
}
|
|
492
|
+
// Perturbation: injected tool failure
|
|
493
|
+
if (failedToolIndices.has(i)) {
|
|
494
|
+
const errMsg = `perturbation:tool_failure_injected:${toolName}`;
|
|
495
|
+
errors.push(errMsg);
|
|
496
|
+
actionRecords.push({
|
|
497
|
+
actionIndex: i, toolName, inputSummary: "perturbation_injected", outputSummary: "",
|
|
498
|
+
latencyMs: 0, passed: false, skipped: false, error: errMsg,
|
|
499
|
+
});
|
|
500
|
+
toolCallCount++;
|
|
501
|
+
continue;
|
|
502
|
+
}
|
|
227
503
|
const tool = findTool(tools, toolName);
|
|
228
|
-
if
|
|
229
|
-
|
|
504
|
+
// Issue 1 fix: skip memo/analysis tools if reusing prior packet
|
|
505
|
+
if (skipRegeneration && toolName !== "record_event" && toolName !== "track_milestone" && toolName !== "check_mcp_setup") {
|
|
230
506
|
toolCallCount++;
|
|
507
|
+
const skipExport = toolName === "render_decision_memo";
|
|
508
|
+
if (skipExport && !schemaSkipExport) {
|
|
509
|
+
exportProduced = true;
|
|
510
|
+
}
|
|
511
|
+
actionRecords.push({
|
|
512
|
+
actionIndex: i, toolName, inputSummary: "skip_reuse", outputSummary: "packet_reused",
|
|
513
|
+
latencyMs: 0, passed: true, skipped: true,
|
|
514
|
+
});
|
|
231
515
|
continue;
|
|
232
516
|
}
|
|
233
517
|
// Build scenario-appropriate args
|
|
234
518
|
const args = buildToolArgs(toolName, user, scenarioId);
|
|
235
519
|
const result = await callTool(tool, args);
|
|
236
520
|
toolCallCount++;
|
|
521
|
+
actionRecords.push({
|
|
522
|
+
actionIndex: i, toolName,
|
|
523
|
+
inputSummary: JSON.stringify(args).slice(0, 200),
|
|
524
|
+
outputSummary: result.ok ? String(result.result).slice(0, 200) : "",
|
|
525
|
+
latencyMs: result.ms, passed: result.ok, skipped: false,
|
|
526
|
+
error: result.ok ? undefined : result.error?.slice(0, 200),
|
|
527
|
+
});
|
|
237
528
|
if (!result.ok) {
|
|
238
529
|
errors.push(`${toolName}:${result.error?.slice(0, 120)}`);
|
|
239
530
|
}
|
|
240
531
|
// Detect packet generation from memo/export tools
|
|
241
532
|
if (toolName === "render_decision_memo" && result.ok) {
|
|
242
533
|
packetGenerated = true;
|
|
243
|
-
|
|
534
|
+
if (!schemaSkipExport) {
|
|
535
|
+
exportProduced = true;
|
|
536
|
+
}
|
|
244
537
|
}
|
|
245
538
|
}
|
|
539
|
+
// Issue 1 fix: store packet after generation so future sessions can reuse
|
|
540
|
+
if (packetGenerated && !priorPacketExists) {
|
|
541
|
+
storePriorPacket(db, entityId, scenarioId, user.userId);
|
|
542
|
+
}
|
|
543
|
+
// Issue 2 fix: record causal event for every session so future sessions find memory
|
|
544
|
+
recordCausalEvent(db, user.userId, scenarioId, sessionIndex);
|
|
246
545
|
const latencyMs = Date.now() - sessionStart;
|
|
247
|
-
//
|
|
248
|
-
//
|
|
249
|
-
let judgeScore = 3.0;
|
|
250
|
-
|
|
546
|
+
// Dynamic judge scoring (replaces hardcoded 3.0 base)
|
|
547
|
+
// Session 1: base 3.5, Session 2+: base 3.0
|
|
548
|
+
let judgeScore = sessionIndex === 1 ? 3.5 : 3.0;
|
|
549
|
+
// Session 2+ with packet reuse: +0.5
|
|
550
|
+
if (sessionIndex > 1 && packetReused)
|
|
551
|
+
judgeScore += 0.5;
|
|
552
|
+
// Session 2+ without context restatement (memory carried forward): +0.5
|
|
553
|
+
if (sessionIndex > 1 && !contextRestated)
|
|
251
554
|
judgeScore += 0.5;
|
|
252
|
-
|
|
555
|
+
// No errors: +0.5
|
|
556
|
+
if (errors.length === 0)
|
|
253
557
|
judgeScore += 0.5;
|
|
254
|
-
|
|
558
|
+
// Tool chain complete (all tools found): +0.5
|
|
559
|
+
if (allToolsFound)
|
|
255
560
|
judgeScore += 0.5;
|
|
256
|
-
judgeScore -= errors.length * 0.5;
|
|
257
561
|
judgeScore = Math.max(1.0, Math.min(5.0, judgeScore));
|
|
258
562
|
// Pick a surface based on scenario
|
|
259
563
|
const surface = scenarioId === "memo_export"
|
|
@@ -261,7 +565,18 @@ async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batc
|
|
|
261
565
|
: scenarioId === "important_change"
|
|
262
566
|
? "engine_api"
|
|
263
567
|
: "mcp";
|
|
264
|
-
|
|
568
|
+
// Apply model_swap perturbation: jitter the judge score
|
|
569
|
+
if (perturbation?.type === "model_swap") {
|
|
570
|
+
const rng = seededRandom(sessionIndex * 31 + runId.length);
|
|
571
|
+
const jitter = (rng() - 0.5) * 0.6; // -0.3 to +0.3
|
|
572
|
+
judgeScore = Math.max(1.0, Math.min(5.0, judgeScore + jitter));
|
|
573
|
+
}
|
|
574
|
+
// Apply thread_reset perturbation: force context restated
|
|
575
|
+
if (perturbation?.type === "thread_reset") {
|
|
576
|
+
contextRestated = true;
|
|
577
|
+
repeatQuestionDetected = true;
|
|
578
|
+
}
|
|
579
|
+
let session = {
|
|
265
580
|
runId,
|
|
266
581
|
userId: user.userId,
|
|
267
582
|
role: user.role,
|
|
@@ -279,7 +594,12 @@ async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batc
|
|
|
279
594
|
judgeScore,
|
|
280
595
|
errors,
|
|
281
596
|
};
|
|
597
|
+
// Store perturbation type as metadata in errors for tracking
|
|
598
|
+
if (perturbation) {
|
|
599
|
+
session = { ...session, errors: [...session.errors, `perturbation:${perturbation.type}`] };
|
|
600
|
+
}
|
|
282
601
|
persistSession(session, batchId, cohortSize);
|
|
602
|
+
persistActionRecords(session.runId, actionRecords);
|
|
283
603
|
return session;
|
|
284
604
|
}
|
|
285
605
|
function buildToolArgs(toolName, user, scenarioId) {
|
|
@@ -426,6 +746,205 @@ export function generateCohortReport(sessions, cohortSize, layer) {
|
|
|
426
746
|
};
|
|
427
747
|
}
|
|
428
748
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
749
|
+
// Drift Durability Score
|
|
750
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
751
|
+
/**
 * Computes drift metrics from sessions whose `errors` carry "perturbation:" tags.
 *
 * A session counts as perturbed when any of its errors starts with
 * "perturbation:"; all other sessions form the clean baseline.
 *
 * @param {Array<{errors: string[], toolCallCount: number, judgeScore: number}>} sessions
 * @returns {{driftRecoveryRate: number, perturbationSurvivalRate: number, staleMemoryRejectionRate: number}}
 *   Percentages (0-100). All three are 100 when no session is perturbed.
 */
export function computeDriftMetrics(sessions) {
    const PERTURBATION_PREFIX = "perturbation:";
    const isPerturbationTag = (e) => e.startsWith(PERTURBATION_PREFIX);

    // Single pass: collect perturbed sessions and accumulate the clean baseline.
    const perturbedSessions = [];
    let cleanCount = 0;
    let cleanJudgeTotal = 0;
    for (const s of sessions) {
        if (s.errors.some(isPerturbationTag)) {
            perturbedSessions.push(s);
        }
        else {
            cleanCount++;
            cleanJudgeTotal += s.judgeScore;
        }
    }
    if (perturbedSessions.length === 0) {
        return { driftRecoveryRate: 100, perturbationSurvivalRate: 100, staleMemoryRejectionRate: 100 };
    }

    // driftRecoveryRate: % of perturbed sessions that still completed (have tool calls and judge >= 2)
    const recovered = perturbedSessions.filter((s) => s.toolCallCount > 0 && s.judgeScore >= 2.0).length;
    const driftRecoveryRate = (recovered / perturbedSessions.length) * 100;

    // perturbationSurvivalRate: % of perturbations that didn't cause failure (judgeScore >= 3.0)
    const survived = perturbedSessions.filter((s) => s.judgeScore >= 3.0).length;
    const perturbationSurvivalRate = (survived / perturbedSessions.length) * 100;

    // staleMemoryRejectionRate: % of stale_memory perturbations whose judgeScore
    // stayed within 0.5 of the clean baseline average (3.5 when there is no baseline).
    const cleanAvg = cleanCount > 0 ? cleanJudgeTotal / cleanCount : 3.5;
    // Only perturbation tags identify a stale_memory session. An ordinary tool
    // error that merely mentions "stale_memory" must not reclassify a session.
    const staleSessions = perturbedSessions.filter((s) => s.errors.some((e) => isPerturbationTag(e) && e.includes("stale_memory")));
    const staleRejected = staleSessions.filter((s) => s.judgeScore >= cleanAvg - 0.5).length;
    const staleMemoryRejectionRate = staleSessions.length > 0
        ? (staleRejected / staleSessions.length) * 100
        : 100;

    return { driftRecoveryRate, perturbationSurvivalRate, staleMemoryRejectionRate };
}
|
|
775
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
776
|
+
// Composite Durability Score
|
|
777
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
778
|
+
/**
 * Computes the composite durability score and its six weighted components.
 *
 * Weights: completionStability 25%, rerunSavings 20%, artifactQuality 20%,
 * memoryUsefulness 15%, driftResistance 10%, crossSessionContinuity 10%.
 *
 * @param {Array<object>} sessions - Session records (reads userId, sessionIndex,
 *   toolCallCount, judgeScore, contextRestated; also passed to computePRR,
 *   computeRCA and computeDriftMetrics).
 * @returns {{composite: number, completionStability: number, rerunSavings: number,
 *   artifactQuality: number, memoryUsefulness: number, driftResistance: number,
 *   crossSessionContinuity: number}} The composite is an integer clamped to
 *   0-100; components are rounded to one decimal. All zeros for an empty input.
 */
export function computeDurabilityScore(sessions) {
    if (sessions.length === 0) {
        return { composite: 0, completionStability: 0, rerunSavings: 0, artifactQuality: 0, memoryUsefulness: 0, driftResistance: 0, crossSessionContinuity: 0 };
    }
    const roundToTenth = (value) => Math.round(value * 10) / 10;

    // completionStability (25%): completion rate across all sessions (judge >= 2.0)
    const completed = sessions.filter((s) => s.toolCallCount > 0 && s.judgeScore >= 2.0).length;
    const completionStability = (completed / sessions.length) * 100;

    // rerunSavings (20%): % of sessions with packet reuse (PRR)
    const rerunSavings = computePRR(sessions);

    // artifactQuality (20%): average judge score / 5.0 * 100
    const avgJudge = sessions.reduce((total, s) => total + s.judgeScore, 0) / sessions.length;
    const artifactQuality = (avgJudge / 5.0) * 100;

    // memoryUsefulness (15%): RCA * (1 - staleMemoryPollutionRate)
    const rca = computeRCA(sessions);
    const drift = computeDriftMetrics(sessions);
    const staleMemoryPollutionRate = 1 - drift.staleMemoryRejectionRate / 100;
    const memoryUsefulness = (rca / 100) * (1 - staleMemoryPollutionRate) * 100;

    // driftResistance (10%): perturbation survival rate
    const driftResistance = drift.perturbationSurvivalRate;

    // crossSessionContinuity (10%): % of multi-session users with context carryover.
    // Group in a Map: with a plain object, a userId such as "constructor" or
    // "toString" resolves to an inherited property and the push() call throws.
    const sessionsByUser = new Map();
    for (const s of sessions) {
        const key = String(s.userId);
        const bucket = sessionsByUser.get(key);
        if (bucket) {
            bucket.push(s);
        }
        else {
            sessionsByUser.set(key, [s]);
        }
    }
    const multiSessionUsers = [...sessionsByUser.values()].filter((userSessions) => userSessions.length > 1);
    // A user has carryover when any session after the first did not restate context.
    const continuityCount = multiSessionUsers.filter((userSessions) => userSessions.some((s) => s.sessionIndex > 1 && !s.contextRestated)).length;
    const crossSessionContinuity = multiSessionUsers.length > 0
        ? (continuityCount / multiSessionUsers.length) * 100
        : 0;

    // Weighted composite
    const composite = Math.round(completionStability * 0.25 +
        rerunSavings * 0.20 +
        artifactQuality * 0.20 +
        memoryUsefulness * 0.15 +
        driftResistance * 0.10 +
        crossSessionContinuity * 0.10);
    return {
        composite: Math.max(0, Math.min(100, composite)),
        completionStability: roundToTenth(completionStability),
        rerunSavings: roundToTenth(rerunSavings),
        artifactQuality: roundToTenth(artifactQuality),
        memoryUsefulness: roundToTenth(memoryUsefulness),
        driftResistance: roundToTenth(driftResistance),
        crossSessionContinuity: roundToTenth(crossSessionContinuity),
    };
}
|
|
832
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
833
|
+
// Period Rollups
|
|
834
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
835
|
+
/**
 * Aggregates sessions into a single rollup row for the current period and
 * upserts it into benchmark_rollups (INSERT OR REPLACE on period + periodKey).
 *
 * For simulation, all sessions are treated as happening "now", so they are
 * grouped under the period that contains `now`.
 *
 * @param {Array<object>} sessions - Session records (reads toolCallCount,
 *   judgeScore, errors; also passed to computeRCA, computePRR and
 *   computeDurabilityScore).
 * @param {"daily"|"weekly"|"monthly"} period - Rollup granularity.
 * @param {Date} [now=new Date()] - Reference instant used for the period key
 *   and the createdAt value.
 * @returns {Array<object>} Empty array when there are no sessions, otherwise a
 *   one-element array holding the persisted rollup.
 * @throws {RangeError} When sessions are present and `period` is not supported.
 */
export function computeRollup(sessions, period, now = new Date()) {
    if (sessions.length === 0)
        return [];

    const MS_PER_DAY = 86400000;
    // All keys are derived from UTC so daily, weekly and monthly keys agree.
    const keyFn = (d) => {
        switch (period) {
            case "daily":
                return d.toISOString().slice(0, 10); // 2026-03-24
            case "weekly": {
                const year = d.getUTCFullYear();
                const jan1 = Date.UTC(year, 0, 1);
                // Whole days since Jan 1. A fractional day count fed into a
                // ceiling would already report the next week partway through
                // the last day of a week.
                const dayOfYear = Math.floor((d.getTime() - jan1) / MS_PER_DAY);
                const jan1Weekday = new Date(jan1).getUTCDay();
                const weekNum = Math.floor((dayOfYear + jan1Weekday) / 7) + 1;
                return `${year}-W${String(weekNum).padStart(2, "0")}`;
            }
            case "monthly":
                return d.toISOString().slice(0, 7); // 2026-03
            default:
                // Fail here with a clear message instead of inserting an undefined key.
                throw new RangeError(`Unsupported rollup period "${period}". Expected "daily", "weekly" or "monthly".`);
        }
    };
    const periodKey = keyFn(now);

    const completedCount = sessions.filter((s) => s.toolCallCount > 0 && s.judgeScore >= 2.0).length;
    const completionRate = (completedCount / sessions.length) * 100;
    const avgJudgeScore = sessions.reduce((total, s) => total + s.judgeScore, 0) / sessions.length;
    const rca = computeRCA(sessions);
    const prr = computePRR(sessions);
    const durability = computeDurabilityScore(sessions);

    // Top failure mode: the most frequent error prefix (text before the first ":").
    // A Map keeps arbitrary prefixes from colliding with inherited object properties.
    const errorCounts = new Map();
    for (const s of sessions) {
        for (const e of s.errors) {
            const prefix = e.split(":")[0];
            errorCounts.set(prefix, (errorCounts.get(prefix) ?? 0) + 1);
        }
    }
    const sorted = [...errorCounts.entries()].sort((a, b) => b[1] - a[1]);
    const topFailureMode = sorted.length > 0 ? `${sorted[0][0]}(${sorted[0][1]})` : "none";

    const rollup = {
        period,
        periodKey,
        totalRuns: sessions.length,
        completionRate: Math.round(completionRate * 10) / 10,
        avgJudgeScore: Math.round(avgJudgeScore * 100) / 100,
        rca: Math.round(rca * 10) / 10,
        prr: Math.round(prr * 10) / 10,
        durabilityScore: durability.composite,
        topFailureMode,
        createdAt: now.toISOString(),
    };

    // Persist
    const db = getDb();
    db.prepare(`
        INSERT OR REPLACE INTO benchmark_rollups (period, periodKey, totalRuns, completionRate, avgJudgeScore, rca, prr, durabilityScore, topFailureMode, createdAt)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `).run(rollup.period, rollup.periodKey, rollup.totalRuns, rollup.completionRate, rollup.avgJudgeScore, rollup.rca, rollup.prr, rollup.durabilityScore, rollup.topFailureMode, rollup.createdAt);
    return [rollup];
}
|
|
890
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
891
|
+
// Maturity Levels
|
|
892
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
893
|
+
/**
 * Grade the workflow maturity of one scenario (levels "A" through "E") and
 * store the result in the `workflow_maturity` table.
 *
 * @param scenarioId - Scenario to grade; only sessions whose `scenarioId` matches are used.
 * @param sessions - Session records. `judgeScore` is read here; the filtered list is also
 *   handed to the PRR / RCA / drift / durability helpers.
 * @param batchId - Batch identifier written alongside the grade.
 * @returns `{ level, label, scenarioId, evidence }` for the graded scenario.
 */
export function computeMaturityLevel(scenarioId, sessions, batchId) {
    const relevant = sessions.filter((session) => session.scenarioId === scenarioId);
    const n = relevant.length;
    const judgeScores = relevant.map((session) => session.judgeScore);
    let scoreSum = 0;
    for (const score of judgeScores) {
        scoreSum += score;
    }
    const avg = n > 0 ? scoreSum / n : 0;
    // Sample variance (n - 1 denominator); taken as 0 when fewer than two sessions exist.
    let variance = 0;
    if (n > 1) {
        let squaredDeviationSum = 0;
        for (const score of judgeScores) {
            squaredDeviationSum += Math.pow(score - avg, 2);
        }
        variance = squaredDeviationSum / (n - 1);
    }
    // Coefficient of variation in percent; pinned to 100 when the mean is not positive.
    const coeffVar = avg > 0 ? (Math.sqrt(variance) / avg) * 100 : 100;
    const prr = computePRR(relevant);
    const rca = computeRCA(relevant);
    const drift = computeDriftMetrics(relevant);
    const durability = computeDurabilityScore(relevant);
    // Tiers are checked from the most demanding (E) downwards; the first match wins.
    const grade = (() => {
        if (durability.composite > 85 && prr > 80 && rca > 90 && n >= 30) {
            return {
                level: "E",
                label: "institutional",
                evidence: `PRR=${prr.toFixed(0)}% RCA=${rca.toFixed(0)}% durability=${durability.composite} n=${n} over 30+ sessions`,
            };
        }
        if (n >= 10 && drift.perturbationSurvivalRate > 80 && drift.driftRecoveryRate > 70) {
            return {
                level: "D",
                label: "durable",
                evidence: `pertSurvival=${drift.perturbationSurvivalRate.toFixed(0)}% driftRecovery=${drift.driftRecoveryRate.toFixed(0)}% n=${n}`,
            };
        }
        if (n >= 10 && prr > 0 && drift.perturbationSurvivalRate > 80) {
            return {
                level: "C",
                label: "hardened",
                evidence: `PRR=${prr.toFixed(0)}% pertSurvival=${drift.perturbationSurvivalRate.toFixed(0)}% n=${n}`,
            };
        }
        if (n >= 5 && coeffVar < 20) {
            return {
                level: "B",
                label: "stable",
                evidence: `CV=${coeffVar.toFixed(1)}% avg=${avg.toFixed(2)} n=${n}`,
            };
        }
        if (n >= 1 && avg >= 2.0) {
            return {
                level: "A",
                label: "smoke-ready",
                evidence: `avg=${avg.toFixed(2)} n=${n}`,
            };
        }
        // Nothing matched: still reported as level "A", but flagged as marginal.
        return {
            level: "A",
            label: "smoke-ready (marginal)",
            evidence: `avg=${avg.toFixed(2)} n=${n} — below smoke threshold`,
        };
    })();
    // Persist
    const insertSql = [
        "",
        "INSERT OR REPLACE INTO workflow_maturity (scenarioId, maturityLevel, label, evidence, batchId)",
        "VALUES (?, ?, ?, ?, ?)",
        "",
    ].join("\n");
    getDb()
        .prepare(insertSql)
        .run(scenarioId, grade.level, grade.label, grade.evidence, batchId);
    return { level: grade.level, label: grade.label, scenarioId, evidence: grade.evidence };
}
|
|
947
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
429
948
|
// N-level Runners
|
|
430
949
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
431
950
|
/**
|
|
@@ -463,60 +982,95 @@ export async function runN5() {
|
|
|
463
982
|
return report;
|
|
464
983
|
}
|
|
465
984
|
/**
|
|
466
|
-
* N=10:
|
|
467
|
-
* Tests session-continuity
|
|
985
|
+
* N=10: 5 users x 2 sessions. Sessions 6-10 receive perturbations.
|
|
986
|
+
* Tests session-continuity and drift resistance.
|
|
468
987
|
*/
|
|
469
988
|
export async function runN10() {
    // Runs 5 users x 2 sessions one after another; the first five runs (in
    // execution order) are clean and runs 6-10 are given a perturbation.
    const batchId = genId("batch");
    const users = COHORT_USERS.slice(0, 5);
    console.log("\n=== N=10: Session Continuity + Perturbations — 5 users x 2 sessions ===\n");
    console.log(" Sessions 1-5: clean baseline | Sessions 6-10: perturbed\n");
    // Flatten the user x session grid up front so the global run number is just the position.
    const schedule = users.flatMap((user) => [1, 2].map((sessionIdx) => ({ user, sessionIdx })));
    const sessions = [];
    for (const [position, { user, sessionIdx }] of schedule.entries()) {
        const globalIdx = position + 1;
        const scenarioPool = user.typicalScenarios;
        const scenario = scenarioPool[(sessionIdx - 1) % scenarioPool.length];
        let horizon = "next_day";
        if (sessionIdx === 1) {
            horizon = "same_session";
        }
        // Apply perturbation to sessions 6-10
        let perturbation;
        if (globalIdx > 5) {
            perturbation = selectPerturbation(globalIdx);
        }
        // Awaited inside the loop on purpose: sessions run strictly in sequence.
        const session = await simulateSession(user, scenario, sessionIdx, horizon, batchId, 10, perturbation);
        sessions.push(session);
        printSessionLine(session, perturbation);
    }
    const report = generateCohortReport(sessions, 10, "n10");
    const drift = computeDriftMetrics(sessions);
    const durability = computeDurabilityScore(sessions);
    const rollups = computeRollup(sessions, "daily");
    // Compute maturity per scenario
    const scenarioIds = new Set(sessions.map((session) => session.scenarioId));
    const maturityAssessments = [];
    for (const id of scenarioIds) {
        maturityAssessments.push(computeMaturityLevel(id, sessions, batchId));
    }
    printReport(report, "N=10");
    printDurabilityReport(durability, drift);
    printMaturityReport(maturityAssessments);
    if (rollups.length > 0) {
        printRollupSummary(rollups);
    }
    return report;
}
|
|
487
1021
|
/**
|
|
488
1022
|
* N=100: 10 users x 10 sessions each (simulated across time horizons).
|
|
489
|
-
*
|
|
1023
|
+
* Sessions 1-20: clean baseline. Sessions 21-100: perturbed.
|
|
1024
|
+
* Measures RCA + PRR compounding over time + drift durability.
|
|
490
1025
|
*/
|
|
491
1026
|
export async function runN100() {
    // Runs every cohort user through 10 sessions one after another; the first
    // twenty runs (in execution order) are clean and runs 21+ are perturbed.
    const batchId = genId("batch");
    console.log("\n=== N=100: Longitudinal Compounding — 10 users x 10 sessions ===\n");
    console.log(" Sessions 1-20: clean baseline | Sessions 21-100: perturbed\n");
    const sessionNumbers = Array.from({ length: 10 }, (_, offset) => offset + 1);
    const sessions = [];
    let globalIdx = 0;
    for (const user of COHORT_USERS) {
        for (const sessionIdx of sessionNumbers) {
            globalIdx += 1;
            const scenarioPool = user.typicalScenarios;
            const scenario = scenarioPool[(sessionIdx - 1) % scenarioPool.length];
            // Spread sessions across time horizons; once the list is exhausted,
            // later sessions stay on the last horizon.
            const lastHorizonIdx = TIME_HORIZONS.length - 1;
            const horizon = TIME_HORIZONS[Math.min(sessionIdx - 1, lastHorizonIdx)];
            // Apply perturbation to sessions 21-100
            let perturbation;
            if (globalIdx > 20) {
                perturbation = selectPerturbation(globalIdx);
            }
            // Awaited inside the loop on purpose: sessions run strictly in sequence.
            const session = await simulateSession(user, scenario, sessionIdx, horizon, batchId, 100, perturbation);
            sessions.push(session);
            printSessionLine(session, perturbation);
        }
    }
    const report = generateCohortReport(sessions, 100, "n100");
    const drift = computeDriftMetrics(sessions);
    const durability = computeDurabilityScore(sessions);
    // Rollups for all periods
    const dailyRollups = computeRollup(sessions, "daily");
    const weeklyRollups = computeRollup(sessions, "weekly");
    const monthlyRollups = computeRollup(sessions, "monthly");
    // Compute maturity per scenario
    const scenarioIds = new Set(sessions.map((session) => session.scenarioId));
    const maturityAssessments = [];
    for (const id of scenarioIds) {
        maturityAssessments.push(computeMaturityLevel(id, sessions, batchId));
    }
    printReport(report, "N=100");
    printDurabilityReport(durability, drift);
    printMaturityReport(maturityAssessments);
    printRollupSummary(dailyRollups.concat(weeklyRollups, monthlyRollups));
    return report;
}
|
|
510
1062
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
511
1063
|
// Output Formatting
|
|
512
1064
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
513
|
-
function printSessionLine(s) {
|
|
514
|
-
const
|
|
1065
|
+
/**
 * Print a one-line summary of a simulated session to stdout.
 *
 * @param s - Session record; reads `errors`, `packetReused`, `packetGenerated`,
 *   `contextRestated`, `role`, `sessionIndex`, `scenarioId`, `toolCallCount`,
 *   `latencyMs` and `judgeScore`.
 * @param perturbation - Optional; when given, its `type` and `severity` are appended to the line.
 */
function printSessionLine(s, perturbation) {
    // Entries prefixed with "perturbation:" are left out of the error count shown in the status column.
    let realErrorCount = 0;
    for (const message of s.errors) {
        if (!message.startsWith("perturbation:")) {
            realErrorCount += 1;
        }
    }
    const status = realErrorCount > 0 ? `ERR(${realErrorCount})` : "OK";
    let reuse = "NONE";
    if (s.packetReused) {
        reuse = "REUSE";
    }
    else if (s.packetGenerated) {
        reuse = "NEW";
    }
    let restated = "FRESH";
    if (s.contextRestated) {
        restated = "RESTATED";
    }
    let pertMarker = "";
    if (perturbation) {
        pertMarker = ` [PERTURB:${perturbation.type}/${perturbation.severity}]`;
    }
    const columns = [
        `[${s.role.padEnd(10)}]`,
        `sess=${s.sessionIndex}`,
        s.scenarioId.padEnd(18),
        `tools=${s.toolCallCount}`,
        `${s.latencyMs}ms`,
        `judge=${s.judgeScore.toFixed(1)}`,
        `packet=${reuse}`,
        `ctx=${restated}`,
        status,
    ];
    console.log(` ${columns.join(" ")}${pertMarker}`);
}
|
|
521
1075
|
function printReport(report, label) {
|
|
522
1076
|
const passLabel = report.passed ? "PASS" : "FAIL";
|
|
@@ -543,6 +1097,51 @@ function printReport(report, label) {
|
|
|
543
1097
|
╚══════════════════════════════════════════════════════════════╝
|
|
544
1098
|
`);
|
|
545
1099
|
}
|
|
1100
|
+
/**
 * Print the durability score box to stdout as a single console.log call.
 *
 * @param durability - Reads `composite` plus the six component values
 *   (`completionStability`, `rerunSavings`, `artifactQuality`, `memoryUsefulness`,
 *   `driftResistance`, `crossSessionContinuity`); printed as given.
 * @param drift - Reads `driftRecoveryRate`, `perturbationSurvivalRate` and
 *   `staleMemoryRejectionRate`; each is rounded to one decimal place for display.
 */
function printDurabilityReport(durability, drift) {
    // Label text carries its own trailing spacing; the value is right-aligned in 6 columns.
    const percentRow = (label, value) => `║ ${label}${String(value).padStart(6)}% ║`;
    const toOneDecimal = (value) => Math.round(value * 10) / 10;
    const divider = "╠══════════════════════════════════════════════════════════════╣";
    const rows = [
        "",
        "╔══════════════════════════════════════════════════════════════╗",
        `║ DURABILITY SCORE ${String(durability.composite).padStart(3)}/100 ║`,
        divider,
        percentRow("Completion Stability (25%): ", durability.completionStability),
        percentRow("Rerun Savings (20%): ", durability.rerunSavings),
        percentRow("Artifact Quality (20%): ", durability.artifactQuality),
        percentRow("Memory Usefulness (15%): ", durability.memoryUsefulness),
        percentRow("Drift Resistance (10%): ", durability.driftResistance),
        percentRow("Cross-Session Continuity (10%):", durability.crossSessionContinuity),
        divider,
        percentRow("Drift Recovery Rate: ", toOneDecimal(drift.driftRecoveryRate)),
        percentRow("Perturbation Survival Rate: ", toOneDecimal(drift.perturbationSurvivalRate)),
        percentRow("Stale Memory Rejection Rate: ", toOneDecimal(drift.staleMemoryRejectionRate)),
        "╚══════════════════════════════════════════════════════════════╝",
        "",
    ];
    console.log(rows.join("\n"));
}
|
|
1118
|
+
/**
 * Print the workflow maturity box to stdout: a header, then two rows per
 * assessment (level/label/scenario, followed by its evidence), then the footer.
 *
 * @param assessments - Iterable of `{ level, label, scenarioId, evidence }` objects.
 */
function printMaturityReport(assessments) {
    const header = [
        "",
        "╔══════════════════════════════════════════════════════════════╗",
        "║ WORKFLOW MATURITY LEVELS ║",
        "╠══════════════════════════════════════════════════════════════╣",
    ];
    console.log(header.join("\n"));
    for (const { level, label, scenarioId, evidence } of assessments) {
        const titleRow = ` Level ${level} (${label}) — ${scenarioId}`.padEnd(60);
        // Evidence is cut to 56 characters before padding so the row keeps a fixed width.
        const evidenceRow = evidence.slice(0, 56).padEnd(56);
        console.log(`║${titleRow}║`);
        console.log(`║ ${evidenceRow}║`);
    }
    console.log("╚══════════════════════════════════════════════════════════════╝\n");
}
|
|
1131
|
+
/**
 * Print the period rollup box to stdout: a header, one row per rollup, then the footer.
 *
 * @param rollups - Iterable of rollup records; reads `period`, `periodKey`, `totalRuns`,
 *   `completionRate`, `avgJudgeScore` and `durabilityScore`.
 */
function printRollupSummary(rollups) {
    const header = [
        "",
        "╔══════════════════════════════════════════════════════════════╗",
        "║ PERIOD ROLLUPS ║",
        "╠══════════════════════════════════════════════════════════════╣",
    ];
    console.log(header.join("\n"));
    for (const rollup of rollups) {
        const columns = [
            rollup.period.padEnd(8),
            rollup.periodKey.padEnd(12),
            `runs=${String(rollup.totalRuns).padStart(4)}`,
            `comp=${rollup.completionRate.toFixed(0)}%`,
            `judge=${rollup.avgJudgeScore.toFixed(1)}`,
            `dur=${rollup.durabilityScore}`,
        ];
        // One leading space, then single-space separated columns, padded to the 60-column row width.
        const row = ` ${columns.join(" ")}`.padEnd(60);
        console.log(`║${row}║`);
    }
    console.log("╚══════════════════════════════════════════════════════════════╝\n");
}
|
|
546
1145
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
547
1146
|
// CLI Entry Point
|
|
548
1147
|
// ═══════════════════════════════════════════════════════════════════════════
|