nodebench-mcp 2.54.0 → 2.56.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env npx tsx
2
+ // @ts-nocheck — standalone CLI script generated by external tooling; not part of the library build
2
3
  /**
3
4
  * longitudinalHarness.ts — Longitudinal dogfood benchmark harness for NodeBench MCP.
4
5
  *
@@ -25,6 +26,90 @@ import { learningTools } from "../tools/learningTools.js";
25
26
  import { flywheelTools } from "../tools/flywheelTools.js";
26
27
  import { createMetaTools } from "../tools/metaTools.js";
27
28
  import { createProgressiveDiscoveryTools } from "../tools/progressiveDiscoveryTools.js";
29
/**
 * Seeded PRNG for deterministic perturbation randomness.
 *
 * Linear congruential generator (multiplier 1664525, increment 1013904223)
 * whose state is masked to its low 31 bits after every step.
 *
 * @param {number} seed - Initial generator state.
 * @returns {() => number} Function returning the next pseudo-random value in [0, 1).
 */
function seededRandom(seed) {
  // 2^31 is one more than the largest masked state (0x7fffffff), so the
  // quotient can never reach 1.0. Dividing by 0x7fffffff itself allowed an
  // exact 1.0, which turns `Math.floor(rng() * length)` into an out-of-range index.
  const STATE_RANGE = 0x80000000;
  let state = seed;
  return () => {
    state = (state * 1664525 + 1013904223) & 0x7fffffff;
    return state / STATE_RANGE;
  };
}
37
/**
 * Catalogue of drift perturbations that can be applied to a session.
 *
 * Each entry carries a `type` tag, a human-readable `description`, a
 * `severity` label and an `apply(session)` function that returns the
 * (possibly modified) session. Two of the entries also write to the
 * `causal_events` table as a side effect.
 */
const PERTURBATIONS = [
  {
    type: "thread_reset",
    description: "Clear causal_events for user before session (simulates new thread)",
    severity: "high",
    apply(session) {
      // Drop every causal event recorded for this user.
      const database = getDb();
      const wipe = database.prepare("DELETE FROM causal_events WHERE userId = ?");
      wipe.run(session.userId);
      // With memory gone, the session is flagged as restated and repeated.
      return { ...session, contextRestated: true, repeatQuestionDetected: true };
    },
  },
  {
    type: "tool_failure",
    description: "Randomly mark 1-2 tools in chain as failed (tests graceful degradation)",
    severity: "medium",
    apply(session) {
      // Deterministic choice between one and two synthetic failures.
      const nextRandom = seededRandom(session.runId.length + session.sessionIndex);
      let failures = 1;
      if (nextRandom() > 0.5) {
        failures = 2;
      }
      const synthetic = Array.from(
        { length: failures },
        (_, n) => `perturbation:tool_failure_injected_${n}`,
      );
      // Half a point off per injected failure, floored at 1.0.
      const penalised = session.judgeScore - 0.5 * failures;
      return {
        ...session,
        errors: [...session.errors, ...synthetic],
        judgeScore: Math.max(1.0, penalised),
      };
    },
  },
  {
    type: "stale_memory",
    description: "Inject a causal_event with 30-day-old timestamp for a different entity",
    severity: "low",
    apply(session) {
      const database = getDb();
      const thirtyDaysMs = 30 * 24 * 60 * 60 * 1000;
      const staleStamp = new Date(Date.now() - thirtyDaysMs).toISOString();
      const insert = database.prepare(`
        INSERT INTO causal_events (id, userId, eventType, payload, createdAt)
        VALUES (?, ?, ?, ?, ?)
      `);
      insert.run(
        genId("ce_stale"),
        session.userId,
        "stale_injection",
        JSON.stringify({ entity: "StaleCorpXYZ", scenarioId: "stale_test", injected: true }),
        staleStamp,
      );
      // The session object is handed back untouched; only the table changed.
      return session;
    },
  },
  {
    type: "model_swap",
    description: "Jitter judge score by +/-0.3 to simulate different model behavior",
    severity: "low",
    apply(session) {
      const nextRandom = seededRandom(session.sessionIndex * 31 + session.runId.length);
      // Centre the draw on zero, then scale to a +/-0.3 band.
      const jitter = (nextRandom() - 0.5) * 0.6;
      const shifted = session.judgeScore + jitter;
      const clamped = Math.max(1.0, Math.min(5.0, shifted));
      return { ...session, judgeScore: clamped };
    },
  },
  {
    type: "schema_change",
    description: "Skip one field from packet output (tests downstream handling of missing fields)",
    severity: "medium",
    apply(session) {
      // Mark the export as missing and shave 0.2 off the score (floor 1.0).
      const lowered = Math.max(1.0, session.judgeScore - 0.2);
      return { ...session, exportProduced: false, judgeScore: lowered };
    },
  },
];
110
/**
 * Picks a perturbation for a 1-based session index by cycling through
 * PERTURBATIONS in order (index 1 maps to the first entry).
 *
 * @param {number} sessionIndex - 1-based session counter.
 * @returns {object|undefined} The selected perturbation, or undefined when
 *   the catalogue is empty.
 */
function selectPerturbation(sessionIndex) {
  const count = PERTURBATIONS.length;
  if (count === 0) {
    return undefined;
  }
  // `%` keeps the sign of the dividend, so index 0 or a negative index would
  // otherwise produce a negative offset and an undefined lookup.
  const offset = (((sessionIndex - 1) % count) + count) % count;
  return PERTURBATIONS[offset];
}
28
113
  // ═══════════════════════════════════════════════════════════════════════════
29
114
  // Constants
30
115
  // ═══════════════════════════════════════════════════════════════════════════
@@ -138,11 +223,55 @@ CREATE TABLE IF NOT EXISTS causal_events (
138
223
  createdAt TEXT NOT NULL DEFAULT (datetime('now'))
139
224
  );
140
225
 
226
+ CREATE TABLE IF NOT EXISTS session_actions (
227
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
228
+ sessionRunId TEXT NOT NULL,
229
+ actionIndex INTEGER NOT NULL,
230
+ toolName TEXT NOT NULL,
231
+ inputSummary TEXT NOT NULL DEFAULT '',
232
+ outputSummary TEXT NOT NULL DEFAULT '',
233
+ latencyMs INTEGER NOT NULL DEFAULT 0,
234
+ passed INTEGER NOT NULL DEFAULT 0,
235
+ skipped INTEGER NOT NULL DEFAULT 0,
236
+ error TEXT,
237
+ createdAt TEXT NOT NULL DEFAULT (datetime('now')),
238
+ FOREIGN KEY (sessionRunId) REFERENCES longitudinal_sessions(runId)
239
+ );
240
+
241
+ CREATE TABLE IF NOT EXISTS benchmark_rollups (
242
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
243
+ period TEXT NOT NULL,
244
+ periodKey TEXT NOT NULL,
245
+ totalRuns INTEGER NOT NULL DEFAULT 0,
246
+ completionRate REAL NOT NULL DEFAULT 0,
247
+ avgJudgeScore REAL NOT NULL DEFAULT 0,
248
+ rca REAL NOT NULL DEFAULT 0,
249
+ prr REAL NOT NULL DEFAULT 0,
250
+ durabilityScore REAL NOT NULL DEFAULT 0,
251
+ topFailureMode TEXT NOT NULL DEFAULT 'none',
252
+ createdAt TEXT NOT NULL DEFAULT (datetime('now')),
253
+ UNIQUE(period, periodKey)
254
+ );
255
+
256
+ CREATE TABLE IF NOT EXISTS workflow_maturity (
257
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
258
+ scenarioId TEXT NOT NULL,
259
+ maturityLevel TEXT NOT NULL,
260
+ label TEXT NOT NULL,
261
+ evidence TEXT NOT NULL DEFAULT '',
262
+ batchId TEXT NOT NULL,
263
+ createdAt TEXT NOT NULL DEFAULT (datetime('now')),
264
+ UNIQUE(scenarioId, batchId)
265
+ );
266
+
141
267
  CREATE INDEX IF NOT EXISTS idx_longitudinal_batch ON longitudinal_sessions(batchId);
142
268
  CREATE INDEX IF NOT EXISTS idx_longitudinal_user ON longitudinal_sessions(userId);
143
269
  CREATE INDEX IF NOT EXISTS idx_longitudinal_cohort ON longitudinal_sessions(cohortSize);
144
270
  CREATE INDEX IF NOT EXISTS idx_founder_packets_entity ON founder_packets(entityId, scenarioId);
145
271
  CREATE INDEX IF NOT EXISTS idx_causal_events_user ON causal_events(userId);
272
+ CREATE INDEX IF NOT EXISTS idx_session_actions_run ON session_actions(sessionRunId);
273
+ CREATE INDEX IF NOT EXISTS idx_benchmark_rollups_period ON benchmark_rollups(period, periodKey);
274
+ CREATE INDEX IF NOT EXISTS idx_workflow_maturity_scenario ON workflow_maturity(scenarioId);
146
275
  `;
147
276
  function ensureSchema() {
148
277
  const db = getDb();
@@ -160,6 +289,13 @@ function ensureSchema() {
160
289
  catch {
161
290
  db.exec("DROP TABLE IF EXISTS causal_events");
162
291
  }
292
+ // Migrate session_actions if schema changed
293
+ try {
294
+ db.prepare("SELECT sessionRunId FROM session_actions LIMIT 1").get();
295
+ }
296
+ catch {
297
+ db.exec("DROP TABLE IF EXISTS session_actions");
298
+ }
163
299
  db.exec(LONGITUDINAL_SCHEMA);
164
300
  }
165
301
  // ═══════════════════════════════════════════════════════════════════════════
@@ -217,6 +353,16 @@ function persistSession(session, batchId, cohortSize) {
217
353
  (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
218
354
  `).run(session.runId, batchId, cohortSize, session.userId, session.role, session.scenarioId, session.sessionIndex, session.timeHorizon, session.surface, session.toolCallCount, session.latencyMs, session.packetGenerated ? 1 : 0, session.packetReused ? 1 : 0, session.repeatQuestionDetected ? 1 : 0, session.contextRestated ? 1 : 0, session.exportProduced ? 1 : 0, session.judgeScore, JSON.stringify(session.errors));
219
355
  }
356
/**
 * Writes one `session_actions` row per action record of a session run.
 *
 * @param {string} sessionRunId - Run id stored in the `sessionRunId` column.
 * @param {Iterable<object>} actions - Action records with `actionIndex`,
 *   `toolName`, `inputSummary`, `outputSummary`, `latencyMs`, `passed`,
 *   `skipped` and optional `error`.
 * @returns {void}
 */
function persistActionRecords(sessionRunId, actions) {
  const records = Array.from(actions ?? []);
  // Nothing to write: skip opening the database and preparing a statement.
  if (records.length === 0) {
    return;
  }
  const db = getDb();
  const insert = db.prepare(`
    INSERT INTO session_actions (sessionRunId, actionIndex, toolName, inputSummary, outputSummary, latencyMs, passed, skipped, error)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
  `);
  for (const record of records) {
    insert.run(
      sessionRunId,
      record.actionIndex,
      record.toolName,
      // The summary/latency columns are declared NOT NULL with defaults; fall
      // back to those defaults rather than binding undefined, mirroring the
      // existing `error ?? null` guard.
      record.inputSummary ?? "",
      record.outputSummary ?? "",
      record.latencyMs ?? 0,
      record.passed ? 1 : 0,
      record.skipped ? 1 : 0,
      record.error ?? null,
    );
  }
}
220
366
  /**
221
367
  * Issue 1 fix: Check founder_packets table for a prior packet matching this entity+scenario.
222
368
  * Returns true only if sessionIndex > 1 AND a stored packet exists.
@@ -266,7 +412,7 @@ function recordCausalEvent(db, userId, scenarioId, sessionIndex) {
266
412
  // ═══════════════════════════════════════════════════════════════════════════
267
413
  // Session Simulation
268
414
  // ═══════════════════════════════════════════════════════════════════════════
269
- async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batchId, cohortSize) {
415
+ async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batchId, cohortSize, perturbation) {
270
416
  const tools = await getAllTools();
271
417
  const chain = SCENARIO_TOOL_CHAINS[scenarioId];
272
418
  if (!chain) {
@@ -288,25 +434,69 @@ async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batc
288
434
  // Issue 2 fix: check causal_events for prior memory from this user
289
435
  const hasPriorMemory = sessionIndex > 1 && hasCausalMemory(db, user.userId);
290
436
  // Context restated only if session > 1 AND no causal memory exists
291
- const contextRestated = sessionIndex > 1 && !hasPriorMemory;
437
+ let contextRestated = sessionIndex > 1 && !hasPriorMemory;
292
438
  // Repeat question: if context was restated, the user likely re-asked old questions.
293
- const repeatQuestionDetected = contextRestated;
439
+ let repeatQuestionDetected = contextRestated;
294
440
  // Issue 1 fix: if packet reused and sessionIndex > 1, skip regeneration of the chain
295
441
  // (but still run non-packet tools like record_event)
296
442
  const skipRegeneration = packetReused && sessionIndex > 1;
297
443
  let allToolsFound = true;
444
+ // Per-action tracking
445
+ const actionRecords = [];
446
+ // Apply thread_reset perturbation BEFORE chain (wipes causal memory)
447
+ if (perturbation?.type === "thread_reset") {
448
+ const db2 = getDb();
449
+ db2.prepare("DELETE FROM causal_events WHERE userId = ?").run(user.userId);
450
+ }
451
+ // Apply stale_memory perturbation BEFORE chain (injects stale data)
452
+ if (perturbation?.type === "stale_memory") {
453
+ const db2 = getDb();
454
+ const staleDate = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString();
455
+ db2.prepare(`
456
+ INSERT INTO causal_events (id, userId, eventType, payload, createdAt)
457
+ VALUES (?, ?, ?, ?, ?)
458
+ `).run(genId("ce_stale"), user.userId, "stale_injection", JSON.stringify({ entity: "StaleCorpXYZ", scenarioId: "stale_test", injected: true }), staleDate);
459
+ }
460
+ // Determine which tools to "fail" for tool_failure perturbation
461
+ const failedToolIndices = new Set();
462
+ if (perturbation?.type === "tool_failure") {
463
+ const rng = seededRandom(sessionIndex * 17 + chain.length);
464
+ const failCount = rng() > 0.5 ? 2 : 1;
465
+ // Pick random non-first, non-last indices
466
+ const candidates = chain.map((_, i) => i).filter((i) => i > 0 && i < chain.length - 1);
467
+ for (let f = 0; f < Math.min(failCount, candidates.length); f++) {
468
+ const pick = Math.floor(rng() * candidates.length);
469
+ failedToolIndices.add(candidates[pick]);
470
+ candidates.splice(pick, 1);
471
+ }
472
+ }
473
+ // Determine if schema_change perturbation skips a field
474
+ const schemaSkipExport = perturbation?.type === "schema_change";
298
475
  // Run the tool chain
299
476
  for (let i = 0; i < chain.length; i++) {
300
477
  const toolName = chain[i];
301
478
  const isCoreTool = i === 0; // first tool in chain is core
302
479
  // Issue 3 fix: check if tool exists in loaded tools before calling
303
480
  if (!availableToolNames.has(toolName)) {
481
+ actionRecords.push({
482
+ actionIndex: i, toolName, inputSummary: "", outputSummary: "",
483
+ latencyMs: 0, passed: !isCoreTool, skipped: true, error: isCoreTool ? `tool_not_found:${toolName}` : undefined,
484
+ });
304
485
  if (isCoreTool) {
305
- // Core tool missing is a real error
306
486
  errors.push(`tool_not_found:${toolName}`);
307
487
  allToolsFound = false;
308
488
  }
309
- // Non-core tool missing: skip gracefully, don't count as error
489
+ toolCallCount++;
490
+ continue;
491
+ }
492
+ // Perturbation: injected tool failure
493
+ if (failedToolIndices.has(i)) {
494
+ const errMsg = `perturbation:tool_failure_injected:${toolName}`;
495
+ errors.push(errMsg);
496
+ actionRecords.push({
497
+ actionIndex: i, toolName, inputSummary: "perturbation_injected", outputSummary: "",
498
+ latencyMs: 0, passed: false, skipped: false, error: errMsg,
499
+ });
310
500
  toolCallCount++;
311
501
  continue;
312
502
  }
@@ -314,23 +504,36 @@ async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batc
314
504
  // Issue 1 fix: skip memo/analysis tools if reusing prior packet
315
505
  if (skipRegeneration && toolName !== "record_event" && toolName !== "track_milestone" && toolName !== "check_mcp_setup") {
316
506
  toolCallCount++;
317
- // Still count as generated since we're reusing
318
- if (toolName === "render_decision_memo") {
507
+ const skipExport = toolName === "render_decision_memo";
508
+ if (skipExport && !schemaSkipExport) {
319
509
  exportProduced = true;
320
510
  }
511
+ actionRecords.push({
512
+ actionIndex: i, toolName, inputSummary: "skip_reuse", outputSummary: "packet_reused",
513
+ latencyMs: 0, passed: true, skipped: true,
514
+ });
321
515
  continue;
322
516
  }
323
517
  // Build scenario-appropriate args
324
518
  const args = buildToolArgs(toolName, user, scenarioId);
325
519
  const result = await callTool(tool, args);
326
520
  toolCallCount++;
521
+ actionRecords.push({
522
+ actionIndex: i, toolName,
523
+ inputSummary: JSON.stringify(args).slice(0, 200),
524
+ outputSummary: result.ok ? String(result.result).slice(0, 200) : "",
525
+ latencyMs: result.ms, passed: result.ok, skipped: false,
526
+ error: result.ok ? undefined : result.error?.slice(0, 200),
527
+ });
327
528
  if (!result.ok) {
328
529
  errors.push(`${toolName}:${result.error?.slice(0, 120)}`);
329
530
  }
330
531
  // Detect packet generation from memo/export tools
331
532
  if (toolName === "render_decision_memo" && result.ok) {
332
533
  packetGenerated = true;
333
- exportProduced = true;
534
+ if (!schemaSkipExport) {
535
+ exportProduced = true;
536
+ }
334
537
  }
335
538
  }
336
539
  // Issue 1 fix: store packet after generation so future sessions can reuse
@@ -362,7 +565,18 @@ async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batc
362
565
  : scenarioId === "important_change"
363
566
  ? "engine_api"
364
567
  : "mcp";
365
- const session = {
568
+ // Apply model_swap perturbation: jitter the judge score
569
+ if (perturbation?.type === "model_swap") {
570
+ const rng = seededRandom(sessionIndex * 31 + runId.length);
571
+ const jitter = (rng() - 0.5) * 0.6; // -0.3 to +0.3
572
+ judgeScore = Math.max(1.0, Math.min(5.0, judgeScore + jitter));
573
+ }
574
+ // Apply thread_reset perturbation: force context restated
575
+ if (perturbation?.type === "thread_reset") {
576
+ contextRestated = true;
577
+ repeatQuestionDetected = true;
578
+ }
579
+ let session = {
366
580
  runId,
367
581
  userId: user.userId,
368
582
  role: user.role,
@@ -380,7 +594,12 @@ async function simulateSession(user, scenarioId, sessionIndex, timeHorizon, batc
380
594
  judgeScore,
381
595
  errors,
382
596
  };
597
+ // Store perturbation type as metadata in errors for tracking
598
+ if (perturbation) {
599
+ session = { ...session, errors: [...session.errors, `perturbation:${perturbation.type}`] };
600
+ }
383
601
  persistSession(session, batchId, cohortSize);
602
+ persistActionRecords(session.runId, actionRecords);
384
603
  return session;
385
604
  }
386
605
  function buildToolArgs(toolName, user, scenarioId) {
@@ -527,6 +746,205 @@ export function generateCohortReport(sessions, cohortSize, layer) {
527
746
  };
528
747
  }
529
748
  // ═══════════════════════════════════════════════════════════════════════════
749
+ // Drift Durability Score
750
+ // ═══════════════════════════════════════════════════════════════════════════
751
/**
 * Computes drift metrics over a batch of sessions.
 *
 * A session counts as perturbed when any entry in its `errors` array starts
 * with "perturbation:"; all other sessions form the clean baseline.
 *
 * @param {Array<object>} sessions - Sessions with `errors`, `toolCallCount`
 *   and `judgeScore`.
 * @returns {{driftRecoveryRate: number, perturbationSurvivalRate: number, staleMemoryRejectionRate: number}}
 *   Percentages in 0-100. All three are 100 when no session is perturbed.
 */
export function computeDriftMetrics(sessions) {
  const PERTURBATION_PREFIX = "perturbation:";
  // Exact marker for the stale_memory perturbation. A substring match on
  // "stale_memory" would also catch ordinary tool error text in sessions
  // perturbed by something else.
  const STALE_MEMORY_MARKER = "perturbation:stale_memory";

  // Partition in one pass instead of scanning every error list twice.
  const perturbedSessions = [];
  const cleanSessions = [];
  for (const s of sessions) {
    const isPerturbed = s.errors.some((e) => e.startsWith(PERTURBATION_PREFIX));
    (isPerturbed ? perturbedSessions : cleanSessions).push(s);
  }
  if (perturbedSessions.length === 0) {
    return { driftRecoveryRate: 100, perturbationSurvivalRate: 100, staleMemoryRejectionRate: 100 };
  }

  // driftRecoveryRate: % of perturbed sessions that still completed
  // (at least one tool call and judge score >= 2).
  const recovered = perturbedSessions.filter((s) => s.toolCallCount > 0 && s.judgeScore >= 2.0).length;
  const driftRecoveryRate = (recovered / perturbedSessions.length) * 100;

  // perturbationSurvivalRate: % of perturbed sessions with judge score >= 3.
  const survived = perturbedSessions.filter((s) => s.judgeScore >= 3.0).length;
  const perturbationSurvivalRate = (survived / perturbedSessions.length) * 100;

  // staleMemoryRejectionRate: % of stale_memory sessions whose judge score
  // stayed within 0.5 of the clean average (3.5 is used when there is no
  // clean session to average).
  const cleanAvg = cleanSessions.length > 0
    ? cleanSessions.reduce((sum, s) => sum + s.judgeScore, 0) / cleanSessions.length
    : 3.5;
  const staleSessions = perturbedSessions.filter((s) => s.errors.includes(STALE_MEMORY_MARKER));
  const staleRejected = staleSessions.filter((s) => s.judgeScore >= cleanAvg - 0.5).length;
  const staleMemoryRejectionRate = staleSessions.length > 0
    ? (staleRejected / staleSessions.length) * 100
    : 100;

  return { driftRecoveryRate, perturbationSurvivalRate, staleMemoryRejectionRate };
}
775
+ // ═══════════════════════════════════════════════════════════════════════════
776
+ // Composite Durability Score
777
+ // ═══════════════════════════════════════════════════════════════════════════
778
/**
 * Computes the weighted composite durability score and its six components.
 *
 * Weights: completionStability 25%, rerunSavings 20%, artifactQuality 20%,
 * memoryUsefulness 15%, driftResistance 10%, crossSessionContinuity 10%.
 *
 * @param {Array<object>} sessions - Sessions with `userId`, `sessionIndex`,
 *   `toolCallCount`, `judgeScore`, `contextRestated` and `errors`.
 * @returns {{composite: number, completionStability: number, rerunSavings: number, artifactQuality: number, memoryUsefulness: number, driftResistance: number, crossSessionContinuity: number}}
 *   `composite` is an integer clamped to 0-100; components are rounded to one
 *   decimal. Every field is 0 for an empty input.
 */
export function computeDurabilityScore(sessions) {
  if (sessions.length === 0) {
    return { composite: 0, completionStability: 0, rerunSavings: 0, artifactQuality: 0, memoryUsefulness: 0, driftResistance: 0, crossSessionContinuity: 0 };
  }
  const roundToTenth = (value) => Math.round(value * 10) / 10;

  // completionStability: share of sessions with a tool call and judge >= 2.0.
  const completed = sessions.filter((s) => s.toolCallCount > 0 && s.judgeScore >= 2.0).length;
  const completionStability = (completed / sessions.length) * 100;

  // rerunSavings: PRR as reported by computePRR (defined elsewhere in the file).
  const rerunSavings = computePRR(sessions);

  // artifactQuality: average judge score expressed as a percentage of 5.0.
  const avgJudge = sessions.reduce((sum, s) => sum + s.judgeScore, 0) / sessions.length;
  const artifactQuality = (avgJudge / 5.0) * 100;

  // memoryUsefulness: RCA scaled down by the stale-memory pollution rate.
  const rca = computeRCA(sessions);
  const drift = computeDriftMetrics(sessions);
  const staleMemoryPollutionRate = 1 - drift.staleMemoryRejectionRate / 100;
  const memoryUsefulness = (rca / 100) * (1 - staleMemoryPollutionRate) * 100;

  // driftResistance: perturbation survival rate.
  const driftResistance = drift.perturbationSurvivalRate;

  // crossSessionContinuity: share of multi-session users with at least one
  // later session (sessionIndex > 1) that did not restate context.
  // A Map is used for grouping because a plain object inherits keys such as
  // "constructor" from Object.prototype, which made the old lookup return a
  // function and crash on `.push` for such user ids.
  // NOTE(review): a Map does not coerce keys, so numeric 1 and "1" are now
  // distinct users; ids appear to be strings — confirm.
  const sessionsByUser = new Map();
  for (const s of sessions) {
    const bucket = sessionsByUser.get(s.userId);
    if (bucket) {
      bucket.push(s);
    } else {
      sessionsByUser.set(s.userId, [s]);
    }
  }
  const multiSessionUsers = [...sessionsByUser.values()].filter((us) => us.length > 1);
  const continuityCount = multiSessionUsers
    .filter((us) => us.some((s) => s.sessionIndex > 1 && !s.contextRestated))
    .length;
  const crossSessionContinuity = multiSessionUsers.length > 0
    ? (continuityCount / multiSessionUsers.length) * 100
    : 0;

  // Weighted composite.
  const composite = Math.round(
    completionStability * 0.25 +
    rerunSavings * 0.20 +
    artifactQuality * 0.20 +
    memoryUsefulness * 0.15 +
    driftResistance * 0.10 +
    crossSessionContinuity * 0.10,
  );
  return {
    composite: Math.max(0, Math.min(100, composite)),
    completionStability: roundToTenth(completionStability),
    rerunSavings: roundToTenth(rerunSavings),
    artifactQuality: roundToTenth(artifactQuality),
    memoryUsefulness: roundToTenth(memoryUsefulness),
    driftResistance: roundToTenth(driftResistance),
    crossSessionContinuity: roundToTenth(crossSessionContinuity),
  };
}
832
+ // ═══════════════════════════════════════════════════════════════════════════
833
+ // Period Rollups
834
+ // ═══════════════════════════════════════════════════════════════════════════
835
/**
 * Aggregates a batch of sessions into a single rollup row for the current
 * period and upserts it into `benchmark_rollups`.
 *
 * All sessions are grouped under the period containing "now"; the key is
 * derived in UTC for every period type.
 *
 * @param {Array<object>} sessions - Sessions with `toolCallCount`,
 *   `judgeScore` and `errors`.
 * @param {"daily"|"weekly"|"monthly"} period - Rollup granularity.
 * @returns {Array<object>} A one-element array holding the persisted rollup,
 *   or an empty array when `sessions` is empty.
 * @throws {RangeError} When `period` is not one of the supported values and
 *   there is at least one session to roll up.
 */
export function computeRollup(sessions, period) {
  if (sessions.length === 0) {
    return [];
  }
  const now = new Date();

  const periodKeyFor = (d) => {
    switch (period) {
      case "daily":
        return d.toISOString().slice(0, 10); // e.g. 2026-03-24
      case "weekly": {
        // Same week formula as before, but on UTC fields so the weekly key
        // agrees with the UTC-based daily and monthly keys.
        const year = d.getUTCFullYear();
        const jan1 = new Date(Date.UTC(year, 0, 1));
        const daysIntoYear = (d.getTime() - jan1.getTime()) / 86400000;
        const weekNum = Math.ceil((daysIntoYear + jan1.getUTCDay() + 1) / 7);
        return `${year}-W${String(weekNum).padStart(2, "0")}`;
      }
      case "monthly":
        return d.toISOString().slice(0, 7); // e.g. 2026-03
      default:
        // Previously fell through and produced an undefined key for the
        // NOT NULL periodKey column.
        throw new RangeError(`Unsupported rollup period: ${period}`);
    }
  };
  const periodKey = periodKeyFor(now);

  const completedCount = sessions.filter((s) => s.toolCallCount > 0 && s.judgeScore >= 2.0).length;
  const completionRate = (completedCount / sessions.length) * 100;
  const avgJudgeScore = sessions.reduce((sum, s) => sum + s.judgeScore, 0) / sessions.length;
  const rca = computeRCA(sessions);
  const prr = computePRR(sessions);
  const durability = computeDurabilityScore(sessions);

  // Top failure mode, keyed by the text before the first ":" of each error.
  // "perturbation:" entries are injected metadata (the session printer also
  // excludes them from its error count), so they are not counted here.
  // A Map avoids collisions with Object.prototype keys such as "constructor".
  const errorCounts = new Map();
  for (const s of sessions) {
    for (const e of s.errors) {
      if (e.startsWith("perturbation:")) {
        continue;
      }
      const prefix = e.split(":")[0];
      errorCounts.set(prefix, (errorCounts.get(prefix) ?? 0) + 1);
    }
  }
  const ranked = [...errorCounts.entries()].sort((a, b) => b[1] - a[1]);
  const topFailureMode = ranked.length > 0 ? `${ranked[0][0]}(${ranked[0][1]})` : "none";

  const rollup = {
    period,
    periodKey,
    totalRuns: sessions.length,
    completionRate: Math.round(completionRate * 10) / 10,
    avgJudgeScore: Math.round(avgJudgeScore * 100) / 100,
    rca: Math.round(rca * 10) / 10,
    prr: Math.round(prr * 10) / 10,
    durabilityScore: durability.composite,
    topFailureMode,
    createdAt: now.toISOString(),
  };

  // Persist; (period, periodKey) is unique, so a rerun replaces the row.
  const db = getDb();
  db.prepare(`
    INSERT OR REPLACE INTO benchmark_rollups (period, periodKey, totalRuns, completionRate, avgJudgeScore, rca, prr, durabilityScore, topFailureMode, createdAt)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `).run(rollup.period, rollup.periodKey, rollup.totalRuns, rollup.completionRate, rollup.avgJudgeScore, rollup.rca, rollup.prr, rollup.durabilityScore, rollup.topFailureMode, rollup.createdAt);
  return [rollup];
}
890
+ // ═══════════════════════════════════════════════════════════════════════════
891
+ // Maturity Levels
892
+ // ═══════════════════════════════════════════════════════════════════════════
893
/**
 * Classifies one scenario's sessions into a maturity level (A-E) and upserts
 * the assessment into `workflow_maturity`.
 *
 * Levels are tested from highest to lowest:
 *   E "institutional": durability > 85, PRR > 80, RCA > 90, n >= 30
 *   D "durable":       n >= 10, perturbation survival > 80, drift recovery > 70
 *   C "hardened":      n >= 10, PRR > 0, perturbation survival > 80
 *   B "stable":        n >= 5, coefficient of variation of judge scores < 20%
 *   A "smoke-ready":   n >= 1 and average judge score >= 2.0 (otherwise marginal)
 * Levels C and D additionally require at least one perturbed session.
 *
 * @param {string} scenarioId - Scenario to assess.
 * @param {Array<object>} sessions - All sessions; filtered by `scenarioId`.
 * @param {string} batchId - Batch id stored with the assessment.
 * @returns {{level: string, label: string, scenarioId: string, evidence: string}}
 */
export function computeMaturityLevel(scenarioId, sessions, batchId) {
  const scenarioSessions = sessions.filter((s) => s.scenarioId === scenarioId);
  const n = scenarioSessions.length;
  const scores = scenarioSessions.map((s) => s.judgeScore);
  const avg = n > 0 ? scores.reduce((a, b) => a + b, 0) / n : 0;
  // Sample variance (n - 1 denominator); zero when there is a single score.
  const variance = n > 1
    ? scores.reduce((a, s) => a + Math.pow(s - avg, 2), 0) / (n - 1)
    : 0;
  const coeffVar = avg > 0 ? (Math.sqrt(variance) / avg) * 100 : 100;
  const prr = computePRR(scenarioSessions);
  const rca = computeRCA(scenarioSessions);
  const drift = computeDriftMetrics(scenarioSessions);
  const durability = computeDurabilityScore(scenarioSessions);

  // The drift metrics default to 100% when no session was perturbed. Without
  // this guard a scenario that was never perturbed would be graded "durable"
  // on survival numbers that measured nothing.
  const perturbedCount = scenarioSessions
    .filter((s) => s.errors.some((e) => e.startsWith("perturbation:")))
    .length;
  const hasPerturbationEvidence = perturbedCount > 0;

  let level;
  let label;
  let evidence;
  if (durability.composite > 85 && prr > 80 && rca > 90 && n >= 30) {
    level = "E";
    label = "institutional";
    evidence = `PRR=${prr.toFixed(0)}% RCA=${rca.toFixed(0)}% durability=${durability.composite} n=${n} over 30+ sessions`;
  }
  else if (hasPerturbationEvidence && n >= 10 && drift.perturbationSurvivalRate > 80 && drift.driftRecoveryRate > 70) {
    level = "D";
    label = "durable";
    evidence = `pertSurvival=${drift.perturbationSurvivalRate.toFixed(0)}% driftRecovery=${drift.driftRecoveryRate.toFixed(0)}% n=${n}`;
  }
  else if (hasPerturbationEvidence && n >= 10 && prr > 0 && drift.perturbationSurvivalRate > 80) {
    level = "C";
    label = "hardened";
    evidence = `PRR=${prr.toFixed(0)}% pertSurvival=${drift.perturbationSurvivalRate.toFixed(0)}% n=${n}`;
  }
  else if (n >= 5 && coeffVar < 20) {
    level = "B";
    label = "stable";
    evidence = `CV=${coeffVar.toFixed(1)}% avg=${avg.toFixed(2)} n=${n}`;
  }
  else if (n >= 1 && avg >= 2.0) {
    level = "A";
    label = "smoke-ready";
    evidence = `avg=${avg.toFixed(2)} n=${n}`;
  }
  else {
    level = "A";
    label = "smoke-ready (marginal)";
    evidence = `avg=${avg.toFixed(2)} n=${n} — below smoke threshold`;
  }

  // Persist; (scenarioId, batchId) is unique, so a rerun replaces the row.
  const db = getDb();
  db.prepare(`
    INSERT OR REPLACE INTO workflow_maturity (scenarioId, maturityLevel, label, evidence, batchId)
    VALUES (?, ?, ?, ?, ?)
  `).run(scenarioId, level, label, evidence, batchId);
  return { level, label, scenarioId, evidence };
}
947
+ // ═══════════════════════════════════════════════════════════════════════════
530
948
  // N-level Runners
531
949
  // ═══════════════════════════════════════════════════════════════════════════
532
950
  /**
@@ -564,60 +982,95 @@ export async function runN5() {
564
982
  return report;
565
983
  }
566
984
  /**
567
- * N=10: 10 users x 1 session each OR 5 users x 2 sessions.
568
- * Tests session-continuity metrics.
985
+ * N=10: 5 users x 2 sessions. Sessions 6-10 receive perturbations.
986
+ * Tests session-continuity and drift resistance.
569
987
  */
570
988
  export async function runN10() {
571
989
  const batchId = genId("batch");
572
990
  const users = COHORT_USERS.slice(0, 5);
573
- console.log(`\n=== N=10: Session Continuity — 5 users x 2 sessions ===\n`);
991
+ console.log(`\n=== N=10: Session Continuity + Perturbations — 5 users x 2 sessions ===\n`);
992
+ console.log(` Sessions 1-5: clean baseline | Sessions 6-10: perturbed\n`);
574
993
  const sessions = [];
994
+ let globalIdx = 0;
575
995
  for (const user of users) {
576
996
  for (let sessionIdx = 1; sessionIdx <= 2; sessionIdx++) {
997
+ globalIdx++;
577
998
  const scenario = user.typicalScenarios[(sessionIdx - 1) % user.typicalScenarios.length];
578
999
  const horizon = sessionIdx === 1 ? "same_session" : "next_day";
579
- const session = await simulateSession(user, scenario, sessionIdx, horizon, batchId, 10);
1000
+ // Apply perturbation to sessions 6-10
1001
+ const perturbation = globalIdx > 5 ? selectPerturbation(globalIdx) : undefined;
1002
+ const session = await simulateSession(user, scenario, sessionIdx, horizon, batchId, 10, perturbation);
580
1003
  sessions.push(session);
581
- printSessionLine(session);
1004
+ printSessionLine(session, perturbation);
582
1005
  }
583
1006
  }
584
1007
  const report = generateCohortReport(sessions, 10, "n10");
1008
+ const drift = computeDriftMetrics(sessions);
1009
+ const durability = computeDurabilityScore(sessions);
1010
+ const rollups = computeRollup(sessions, "daily");
1011
+ // Compute maturity per scenario
1012
+ const scenarios = [...new Set(sessions.map((s) => s.scenarioId))];
1013
+ const maturityAssessments = scenarios.map((sc) => computeMaturityLevel(sc, sessions, batchId));
585
1014
  printReport(report, "N=10");
1015
+ printDurabilityReport(durability, drift);
1016
+ printMaturityReport(maturityAssessments);
1017
+ if (rollups.length > 0)
1018
+ printRollupSummary(rollups);
586
1019
  return report;
587
1020
  }
588
1021
  /**
589
1022
  * N=100: 10 users x 10 sessions each (simulated across time horizons).
590
- * Measures RCA + PRR compounding over time.
1023
+ * Sessions 1-20: clean baseline. Sessions 21-100: perturbed.
1024
+ * Measures RCA + PRR compounding over time + drift durability.
591
1025
  */
592
1026
  export async function runN100() {
593
1027
  const batchId = genId("batch");
594
1028
  console.log(`\n=== N=100: Longitudinal Compounding — 10 users x 10 sessions ===\n`);
1029
+ console.log(` Sessions 1-20: clean baseline | Sessions 21-100: perturbed\n`);
595
1030
  const sessions = [];
1031
+ let globalIdx = 0;
596
1032
  for (const user of COHORT_USERS) {
597
1033
  for (let sessionIdx = 1; sessionIdx <= 10; sessionIdx++) {
1034
+ globalIdx++;
598
1035
  const scenario = user.typicalScenarios[(sessionIdx - 1) % user.typicalScenarios.length];
599
1036
  // Spread sessions across time horizons to simulate real usage patterns
600
1037
  const horizonIdx = Math.min(sessionIdx - 1, TIME_HORIZONS.length - 1);
601
1038
  const horizon = TIME_HORIZONS[horizonIdx];
602
- const session = await simulateSession(user, scenario, sessionIdx, horizon, batchId, 100);
1039
+ // Apply perturbation to sessions 21-100
1040
+ const perturbation = globalIdx > 20 ? selectPerturbation(globalIdx) : undefined;
1041
+ const session = await simulateSession(user, scenario, sessionIdx, horizon, batchId, 100, perturbation);
603
1042
  sessions.push(session);
604
- printSessionLine(session);
1043
+ printSessionLine(session, perturbation);
605
1044
  }
606
1045
  }
607
1046
  const report = generateCohortReport(sessions, 100, "n100");
1047
+ const drift = computeDriftMetrics(sessions);
1048
+ const durability = computeDurabilityScore(sessions);
1049
+ // Rollups for all periods
1050
+ const dailyRollups = computeRollup(sessions, "daily");
1051
+ const weeklyRollups = computeRollup(sessions, "weekly");
1052
+ const monthlyRollups = computeRollup(sessions, "monthly");
1053
+ // Compute maturity per scenario
1054
+ const scenarios = [...new Set(sessions.map((s) => s.scenarioId))];
1055
+ const maturityAssessments = scenarios.map((sc) => computeMaturityLevel(sc, sessions, batchId));
608
1056
  printReport(report, "N=100");
1057
+ printDurabilityReport(durability, drift);
1058
+ printMaturityReport(maturityAssessments);
1059
+ printRollupSummary([...dailyRollups, ...weeklyRollups, ...monthlyRollups]);
609
1060
  return report;
610
1061
  }
611
1062
  // ═══════════════════════════════════════════════════════════════════════════
612
1063
  // Output Formatting
613
1064
  // ═══════════════════════════════════════════════════════════════════════════
614
- function printSessionLine(s) {
615
- const status = s.errors.length === 0 ? "OK" : `ERR(${s.errors.length})`;
1065
+ function printSessionLine(s, perturbation) {
1066
+ const realErrors = s.errors.filter((e) => !e.startsWith("perturbation:"));
1067
+ const status = realErrors.length === 0 ? "OK" : `ERR(${realErrors.length})`;
616
1068
  const reuse = s.packetReused ? "REUSE" : s.packetGenerated ? "NEW" : "NONE";
617
1069
  const restated = s.contextRestated ? "RESTATED" : "FRESH";
1070
+ const pertMarker = perturbation ? ` [PERTURB:${perturbation.type}/${perturbation.severity}]` : "";
618
1071
  console.log(` [${s.role.padEnd(10)}] sess=${s.sessionIndex} ${s.scenarioId.padEnd(18)} ` +
619
1072
  `tools=${s.toolCallCount} ${s.latencyMs}ms judge=${s.judgeScore.toFixed(1)} ` +
620
- `packet=${reuse} ctx=${restated} ${status}`);
1073
+ `packet=${reuse} ctx=${restated} ${status}${pertMarker}`);
621
1074
  }
622
1075
  function printReport(report, label) {
623
1076
  const passLabel = report.passed ? "PASS" : "FAIL";
@@ -644,6 +1097,51 @@ function printReport(report, label) {
644
1097
  ╚══════════════════════════════════════════════════════════════╝
645
1098
  `);
646
1099
  }
1100
/**
 * Prints the durability report as a single boxed table via console.log.
 *
 * Rows: the composite score out of 100, the six weighted component
 * percentages read from `durability`, and the three rates read from `drift`
 * (each rounded to one decimal before printing).
 *
 * @param {object} durability - Object with `composite`, `completionStability`,
 *   `rerunSavings`, `artifactQuality`, `memoryUsefulness`, `driftResistance`
 *   and `crossSessionContinuity`.
 * @param {object} drift - Object with `driftRecoveryRate`,
 *   `perturbationSurvivalRate` and `staleMemoryRejectionRate`.
 */
+ function printDurabilityReport(durability, drift) {
1101
+ console.log(`
1102
+ ╔══════════════════════════════════════════════════════════════╗
1103
+ ║ DURABILITY SCORE ${String(durability.composite).padStart(3)}/100 ║
1104
+ ╠══════════════════════════════════════════════════════════════╣
1105
+ ║ Completion Stability (25%): ${String(durability.completionStability).padStart(6)}% ║
1106
+ ║ Rerun Savings (20%): ${String(durability.rerunSavings).padStart(6)}% ║
1107
+ ║ Artifact Quality (20%): ${String(durability.artifactQuality).padStart(6)}% ║
1108
+ ║ Memory Usefulness (15%): ${String(durability.memoryUsefulness).padStart(6)}% ║
1109
+ ║ Drift Resistance (10%): ${String(durability.driftResistance).padStart(6)}% ║
1110
+ ║ Cross-Session Continuity (10%):${String(durability.crossSessionContinuity).padStart(6)}% ║
1111
+ ╠══════════════════════════════════════════════════════════════╣
1112
+ ║ Drift Recovery Rate: ${String(Math.round(drift.driftRecoveryRate * 10) / 10).padStart(6)}% ║
1113
+ ║ Perturbation Survival Rate: ${String(Math.round(drift.perturbationSurvivalRate * 10) / 10).padStart(6)}% ║
1114
+ ║ Stale Memory Rejection Rate: ${String(Math.round(drift.staleMemoryRejectionRate * 10) / 10).padStart(6)}% ║
1115
+ ╚══════════════════════════════════════════════════════════════╝
1116
+ `);
1117
+ }
1118
/**
 * Prints the workflow maturity assessments as a boxed list via console.log.
 *
 * Each assessment produces two rows: one with its level, label and scenario
 * id (padded to 60 characters), and one with its evidence string truncated
 * to 56 characters.
 *
 * @param {Array<object>} assessments - Objects with `level`, `label`,
 *   `scenarioId` and `evidence`.
 */
+ function printMaturityReport(assessments) {
1119
+ console.log(`
1120
+ ╔══════════════════════════════════════════════════════════════╗
1121
+ ║ WORKFLOW MATURITY LEVELS ║
1122
+ ╠══════════════════════════════════════════════════════════════╣`);
1123
+ for (const a of assessments) {
1124
+ const line = ` Level ${a.level} (${a.label}) — ${a.scenarioId}`;
1125
+ console.log(`║${line.padEnd(60)}║`);
1126
+ console.log(`║ ${a.evidence.slice(0, 56).padEnd(56)}║`);
1127
+ }
1128
+ console.log(`╚══════════════════════════════════════════════════════════════╝
1129
+ `);
1130
+ }
1131
/**
 * Prints period rollups as a boxed list via console.log, one row per rollup.
 *
 * Each row shows the period, period key, run count, completion rate (no
 * decimals), average judge score (one decimal) and durability score, padded
 * to 60 characters.
 *
 * @param {Array<object>} rollups - Objects with `period`, `periodKey`,
 *   `totalRuns`, `completionRate`, `avgJudgeScore` and `durabilityScore`.
 */
+ function printRollupSummary(rollups) {
1132
+ console.log(`
1133
+ ╔══════════════════════════════════════════════════════════════╗
1134
+ ║ PERIOD ROLLUPS ║
1135
+ ╠══════════════════════════════════════════════════════════════╣`);
1136
+ for (const r of rollups) {
1137
+ const line = ` ${r.period.padEnd(8)} ${r.periodKey.padEnd(12)} runs=${String(r.totalRuns).padStart(4)} ` +
1138
+ `comp=${r.completionRate.toFixed(0)}% judge=${r.avgJudgeScore.toFixed(1)} ` +
1139
+ `dur=${r.durabilityScore}`;
1140
+ console.log(`║${line.padEnd(60)}║`);
1141
+ }
1142
+ console.log(`╚══════════════════════════════════════════════════════════════╝
1143
+ `);
1144
+ }
647
1145
  // ═══════════════════════════════════════════════════════════════════════════
648
1146
  // CLI Entry Point
649
1147
  // ═══════════════════════════════════════════════════════════════════════════