nodebench-mcp 2.34.0 → 2.35.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,809 @@
1
+ /**
2
+ * dogfoodJudgeTools — Dogfood Judge Fix System (Phase 13)
3
+ *
4
+ * Measures whether NodeBench actually removes repeat cognition:
5
+ * - Session recording: track manual corrections, repeated questions, packet usefulness
6
+ * - 6-dimension judging: truth, compression, anticipation, output, delegation, trust
7
+ * - Failure triage: classify by canonical system layer taxonomy
8
+ * - Replay verification: prove fixes work, detect regressions
9
+ * - Repeat cognition metrics: the core compound metric
10
+ */
11
+ import { getDb, genId } from "../db.js";
12
+ /* ------------------------------------------------------------------ */
13
+ /* Schema bootstrap (idempotent) */
14
+ /* ------------------------------------------------------------------ */
15
let _schemaReady = false;
/**
 * Bootstrap the SQLite schema used by the dogfood judge tools.
 *
 * Runs a single batch of `CREATE TABLE IF NOT EXISTS` / `CREATE INDEX IF NOT
 * EXISTS` statements against the handle returned by `getDb()`. A module-level
 * flag remembers that the batch already ran, so later calls return without
 * touching the database. The flag is only set after `exec` returns, so a
 * failed bootstrap is retried on the next call.
 */
export function ensureDogfoodSchema() {
    if (!_schemaReady) {
        const ddl = `
            CREATE TABLE IF NOT EXISTS dogfood_sessions (
                sessionId TEXT PRIMARY KEY,
                loopType TEXT NOT NULL,
                startedAt INTEGER NOT NULL,
                endedAt INTEGER,
                transcript TEXT,
                packetVersionUsed TEXT,
                artifactsProduced TEXT,
                manualCorrections TEXT,
                repeatedQuestions TEXT,
                timeToFirstUsefulOutput INTEGER,
                delegationSucceeded INTEGER,
                packetExported INTEGER,
                overallNotes TEXT
            );

            CREATE TABLE IF NOT EXISTS judge_runs (
                runId TEXT PRIMARY KEY,
                sessionId TEXT REFERENCES dogfood_sessions(sessionId),
                judgedAt INTEGER NOT NULL,
                truthQuality REAL,
                compressionQuality REAL,
                anticipationQuality REAL,
                outputQuality REAL,
                delegationQuality REAL,
                trustQuality REAL,
                overallScore REAL,
                notes TEXT,
                failureClasses TEXT
            );

            CREATE TABLE IF NOT EXISTS failure_cases (
                caseId TEXT PRIMARY KEY,
                sessionId TEXT REFERENCES dogfood_sessions(sessionId),
                judgeRunId TEXT REFERENCES judge_runs(runId),
                symptom TEXT NOT NULL,
                rootCause TEXT NOT NULL,
                systemLayer TEXT NOT NULL,
                severity TEXT DEFAULT 'medium',
                frequency INTEGER DEFAULT 1,
                fixAttemptId TEXT,
                status TEXT DEFAULT 'open',
                createdAt INTEGER NOT NULL
            );

            CREATE TABLE IF NOT EXISTS fix_attempts (
                attemptId TEXT PRIMARY KEY,
                caseId TEXT REFERENCES failure_cases(caseId),
                failureClass TEXT NOT NULL,
                rootCause TEXT NOT NULL,
                layerCorrected TEXT NOT NULL,
                description TEXT NOT NULL,
                replayProof TEXT,
                regressionProtection TEXT,
                status TEXT DEFAULT 'proposed',
                createdAt INTEGER NOT NULL
            );

            CREATE TABLE IF NOT EXISTS replay_runs (
                replayId TEXT PRIMARY KEY,
                originalSessionId TEXT REFERENCES dogfood_sessions(sessionId),
                fixAttemptId TEXT REFERENCES fix_attempts(attemptId),
                replayedAt INTEGER NOT NULL,
                priorScores TEXT,
                newScores TEXT,
                improved INTEGER,
                regressionDetected INTEGER,
                notes TEXT
            );

            CREATE TABLE IF NOT EXISTS repeat_question_events (
                eventId TEXT PRIMARY KEY,
                question TEXT NOT NULL,
                sessionId TEXT,
                priorSessionId TEXT,
                timeSinceLastAsked INTEGER,
                shouldHaveBeenWarm INTEGER DEFAULT 1,
                detectedAt INTEGER NOT NULL
            );

            CREATE TABLE IF NOT EXISTS manual_correction_events (
                eventId TEXT PRIMARY KEY,
                sessionId TEXT,
                field TEXT NOT NULL,
                beforeValue TEXT,
                afterValue TEXT,
                correctionType TEXT,
                detectedAt INTEGER NOT NULL
            );

            CREATE TABLE IF NOT EXISTS packet_usefulness_ratings (
                ratingId TEXT PRIMARY KEY,
                sessionId TEXT,
                packetType TEXT,
                exported INTEGER DEFAULT 0,
                delegated INTEGER DEFAULT 0,
                reused INTEGER DEFAULT 0,
                abandoned INTEGER DEFAULT 0,
                humanEditsCount INTEGER DEFAULT 0,
                ratedAt INTEGER NOT NULL
            );

            CREATE INDEX IF NOT EXISTS idx_dogfood_sessions_loop ON dogfood_sessions(loopType);
            CREATE INDEX IF NOT EXISTS idx_judge_runs_session ON judge_runs(sessionId);
            CREATE INDEX IF NOT EXISTS idx_failure_cases_session ON failure_cases(sessionId);
            CREATE INDEX IF NOT EXISTS idx_failure_cases_status ON failure_cases(status);
            CREATE INDEX IF NOT EXISTS idx_failure_cases_layer ON failure_cases(systemLayer);
            CREATE INDEX IF NOT EXISTS idx_fix_attempts_case ON fix_attempts(caseId);
            CREATE INDEX IF NOT EXISTS idx_replay_runs_session ON replay_runs(originalSessionId);
            CREATE INDEX IF NOT EXISTS idx_repeat_questions_session ON repeat_question_events(sessionId);
            CREATE INDEX IF NOT EXISTS idx_manual_corrections_session ON manual_correction_events(sessionId);
            CREATE INDEX IF NOT EXISTS idx_packet_ratings_session ON packet_usefulness_ratings(sessionId);
        `;
        getDb().exec(ddl);
        // Only reached when exec() did not throw.
        _schemaReady = true;
    }
}
136
+ /* ------------------------------------------------------------------ */
137
+ /* Canonical system layer taxonomy */
138
+ /* ------------------------------------------------------------------ */
139
const SYSTEM_LAYERS = [
    "ingestion",
    "canonicalization",
    "change_detection",
    "contradiction",
    "suppression",
    "packet_construction",
    "artifact_rendering",
    "trace_lineage",
    "provider_bus",
    "role_overlay",
    "ux_explanation",
];
/** The canonical dogfood loops; shared by the tool schemas and the regression gate. */
const LOOP_TYPES = ["weekly_reset", "pre_delegation", "company_search"];
/** Score arguments of judge_session, in the order they are bound to the INSERT. */
const JUDGE_DIMENSIONS = [
    "truthQuality",
    "compressionQuality",
    "anticipationQuality",
    "outputQuality",
    "delegationQuality",
    "trustQuality",
];
const MIN_JUDGE_SCORE = 1;
const MAX_JUDGE_SCORE = 5;
const DEFAULT_SESSION_LIMIT = 10;
const DEFAULT_WINDOW_DAYS = 30;
const MS_PER_DAY = 86400000;
/* ------------------------------------------------------------------ */
/* Internal helpers */
/* ------------------------------------------------------------------ */
/** Round to two decimal places. */
function roundTo2(value) {
    return Math.round(value * 100) / 100;
}
/**
 * Decode a JSON-array TEXT column. NULL/empty columns read as an empty array;
 * malformed JSON still throws, exactly like a bare JSON.parse.
 */
function parseJsonArray(text) {
    return text ? JSON.parse(text) : [];
}
/** Boolean → 0/1 for an INTEGER column (missing values count as 0). */
function toFlag(value) {
    return value ? 1 : 0;
}
/** Tri-state boolean → INTEGER column: null/undefined stay NULL, otherwise 1 or 0. */
function toNullableFlag(value) {
    return value == null ? null : toFlag(value);
}
/**
 * Turn a caller-supplied `limit` into something safe to bind to `LIMIT ?`.
 * SQLite treats a negative LIMIT as "no limit" and rejects fractional values,
 * so anything that is not a non-negative finite number falls back to the
 * documented default and fractions are truncated.
 */
function normalizeLimit(limit) {
    if (typeof limit !== "number" || !Number.isFinite(limit) || limit < 0) {
        return DEFAULT_SESSION_LIMIT;
    }
    return Math.trunc(limit);
}
/**
 * Read the six judge scores from the tool arguments, in JUDGE_DIMENSIONS order.
 *
 * @throws {RangeError} when a score is missing, not a finite number, or outside
 *   the 1-5 range advertised by the input schema. Without this check such a
 *   value would be averaged into overallScore and feed the regression gate.
 */
function readJudgeScores(args) {
    return JUDGE_DIMENSIONS.map((dimension) => {
        const value = args[dimension];
        const valid = typeof value === "number" &&
            Number.isFinite(value) &&
            value >= MIN_JUDGE_SCORE &&
            value <= MAX_JUDGE_SCORE;
        if (!valid) {
            throw new RangeError(`${dimension} must be a number between ${MIN_JUDGE_SCORE} and ${MAX_JUDGE_SCORE}, got ${String(value)}`);
        }
        return value;
    });
}
/* ------------------------------------------------------------------ */
/* Tools */
/* ------------------------------------------------------------------ */
/**
 * Tool definitions for the dogfood judge system. Each entry carries a `name`,
 * a `description`, a JSON-schema `inputSchema` and an async `handler(args)`
 * that reads/writes the tables created by ensureDogfoodSchema(). Read-only
 * tools additionally carry `annotations: { readOnlyHint: true }`.
 */
export const dogfoodJudgeTools = [
    // ─── 1. start_dogfood_session ──────────────────────────────────
    {
        name: "start_dogfood_session",
        description: "Start a new dogfood session for one of the 3 canonical loops (weekly_reset, pre_delegation, company_search). Returns sessionId for subsequent recording.",
        inputSchema: {
            type: "object",
            properties: {
                loopType: {
                    type: "string",
                    enum: [...LOOP_TYPES],
                    description: "Which canonical dogfood loop is being tested",
                },
                packetVersionUsed: {
                    type: "string",
                    description: "Version/ID of the packet template being tested (optional)",
                },
            },
            required: ["loopType"],
        },
        // Inserts a new dogfood_sessions row and returns its generated id.
        handler: async (args) => {
            ensureDogfoodSchema();
            const db = getDb();
            const sessionId = genId("dfs");
            const now = Date.now();
            db.prepare(`INSERT INTO dogfood_sessions (sessionId, loopType, startedAt, packetVersionUsed)
                VALUES (?, ?, ?, ?)`).run(sessionId, args.loopType, now, args.packetVersionUsed ?? null);
            return { sessionId, loopType: args.loopType, startedAt: now };
        },
    },
    // ─── 2. end_dogfood_session ────────────────────────────────────
    {
        name: "end_dogfood_session",
        description: "End a dogfood session with summary metrics: time-to-first-useful-output, delegation success, packet export status, and notes.",
        inputSchema: {
            type: "object",
            properties: {
                sessionId: { type: "string", description: "Session to end" },
                notes: { type: "string", description: "Overall session notes" },
                timeToFirstUsefulOutput: {
                    type: "number",
                    description: "Milliseconds until first useful output was produced",
                },
                delegationSucceeded: {
                    type: "boolean",
                    description: "Whether delegation worked without restatement",
                },
                packetExported: {
                    type: "boolean",
                    description: "Whether the packet was exported/shared",
                },
            },
            required: ["sessionId"],
        },
        // Stamps endedAt and the summary columns; `updated` is false when no
        // row matched the sessionId. Omitted optional fields are written as NULL.
        handler: async (args) => {
            ensureDogfoodSchema();
            const db = getDb();
            const now = Date.now();
            const result = db
                .prepare(`UPDATE dogfood_sessions
                    SET endedAt = ?, overallNotes = ?, timeToFirstUsefulOutput = ?,
                        delegationSucceeded = ?, packetExported = ?
                    WHERE sessionId = ?`)
                .run(now, args.notes ?? null, args.timeToFirstUsefulOutput ?? null, toNullableFlag(args.delegationSucceeded), toNullableFlag(args.packetExported), args.sessionId);
            return {
                sessionId: args.sessionId,
                endedAt: now,
                updated: result.changes > 0,
            };
        },
    },
    // ─── 3. record_manual_correction ───────────────────────────────
    {
        name: "record_manual_correction",
        description: "Track a human correction to agent output. Every correction is evidence of a system gap — the system should have gotten this right.",
        inputSchema: {
            type: "object",
            properties: {
                sessionId: { type: "string", description: "Dogfood session ID" },
                field: { type: "string", description: "Which field/section was corrected" },
                beforeValue: { type: "string", description: "What the system produced" },
                afterValue: { type: "string", description: "What the human corrected it to" },
                correctionType: {
                    type: "string",
                    enum: ["factual", "priority", "scope", "tone", "missing"],
                    description: "Category of correction",
                },
            },
            required: ["sessionId", "field", "correctionType"],
        },
        // Writes the event row, then mirrors it into the session's
        // manualCorrections JSON array when the session exists.
        // NOTE(review): the event is recorded even if the session is unknown.
        handler: async (args) => {
            ensureDogfoodSchema();
            const db = getDb();
            const eventId = genId("mc");
            const now = Date.now();
            const before = args.beforeValue ?? null;
            const after = args.afterValue ?? null;
            db.prepare(`INSERT INTO manual_correction_events
                (eventId, sessionId, field, beforeValue, afterValue, correctionType, detectedAt)
                VALUES (?, ?, ?, ?, ?, ?, ?)`).run(eventId, args.sessionId, args.field, before, after, args.correctionType, now);
            const session = db
                .prepare(`SELECT manualCorrections FROM dogfood_sessions WHERE sessionId = ?`)
                .get(args.sessionId);
            if (session) {
                const corrections = parseJsonArray(session.manualCorrections);
                corrections.push({
                    field: args.field,
                    before,
                    after,
                    type: args.correctionType,
                });
                db.prepare(`UPDATE dogfood_sessions SET manualCorrections = ? WHERE sessionId = ?`).run(JSON.stringify(corrections), args.sessionId);
            }
            return { eventId, sessionId: args.sessionId, recorded: true };
        },
    },
    // ─── 4. record_repeated_question ───────────────────────────────
    {
        name: "record_repeated_question",
        description: "Track a question the user asked that NodeBench should have already known. This is the core failure signal — repeat cognition means the system isn't compounding.",
        inputSchema: {
            type: "object",
            properties: {
                question: { type: "string", description: "The repeated question" },
                sessionId: { type: "string", description: "Current session ID" },
                priorSessionId: {
                    type: "string",
                    description: "Session where this was previously asked (optional)",
                },
            },
            required: ["question"],
        },
        // Writes the event row, then mirrors the question into the session's
        // repeatedQuestions JSON array when a known sessionId was given.
        handler: async (args) => {
            ensureDogfoodSchema();
            const db = getDb();
            const eventId = genId("rq");
            const now = Date.now();
            // Elapsed time is measured from the START of the prior session;
            // it stays null when no prior session is given or it is unknown.
            let timeSinceLastAsked = null;
            if (args.priorSessionId) {
                const prior = db
                    .prepare(`SELECT startedAt FROM dogfood_sessions WHERE sessionId = ?`)
                    .get(args.priorSessionId);
                if (prior) {
                    timeSinceLastAsked = now - prior.startedAt;
                }
            }
            db.prepare(`INSERT INTO repeat_question_events
                (eventId, question, sessionId, priorSessionId, timeSinceLastAsked, shouldHaveBeenWarm, detectedAt)
                VALUES (?, ?, ?, ?, ?, 1, ?)`).run(eventId, args.question, args.sessionId ?? null, args.priorSessionId ?? null, timeSinceLastAsked, now);
            if (args.sessionId) {
                const session = db
                    .prepare(`SELECT repeatedQuestions FROM dogfood_sessions WHERE sessionId = ?`)
                    .get(args.sessionId);
                if (session) {
                    const questions = parseJsonArray(session.repeatedQuestions);
                    questions.push(args.question);
                    db.prepare(`UPDATE dogfood_sessions SET repeatedQuestions = ? WHERE sessionId = ?`).run(JSON.stringify(questions), args.sessionId);
                }
            }
            return { eventId, question: args.question, timeSinceLastAsked, recorded: true };
        },
    },
    // ─── 5. rate_packet_usefulness ─────────────────────────────────
    {
        name: "rate_packet_usefulness",
        description: "Rate a packet's real-world utility: was it exported, delegated, reused, or abandoned? How many human edits were needed?",
        inputSchema: {
            type: "object",
            properties: {
                sessionId: { type: "string", description: "Dogfood session ID" },
                packetType: {
                    type: "string",
                    description: "Type of packet (weekly_reset, pre_delegation, company_search, etc.)",
                },
                exported: { type: "boolean", description: "Was the packet exported?" },
                delegated: { type: "boolean", description: "Was the packet delegated to someone?" },
                reused: { type: "boolean", description: "Was the packet reused in another context?" },
                abandoned: { type: "boolean", description: "Was the packet abandoned?" },
                humanEditsCount: {
                    type: "number",
                    description: "Number of human edits required before the packet was usable",
                },
            },
            required: ["sessionId", "packetType"],
        },
        // Inserts one rating row; omitted booleans are stored as 0 and an
        // omitted edit count as 0.
        handler: async (args) => {
            ensureDogfoodSchema();
            const db = getDb();
            const ratingId = genId("pur");
            const now = Date.now();
            db.prepare(`INSERT INTO packet_usefulness_ratings
                (ratingId, sessionId, packetType, exported, delegated, reused, abandoned, humanEditsCount, ratedAt)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`).run(ratingId, args.sessionId, args.packetType, toFlag(args.exported), toFlag(args.delegated), toFlag(args.reused), toFlag(args.abandoned), args.humanEditsCount ?? 0, now);
            return { ratingId, sessionId: args.sessionId, recorded: true };
        },
    },
    // ─── 6. judge_session ──────────────────────────────────────────
    {
        name: "judge_session",
        description: "Score a dogfood session on 6 dimensions (1-5 each): truth, compression, anticipation, output, delegation, trust. Returns overall score and records failure classes.",
        inputSchema: {
            type: "object",
            properties: {
                sessionId: { type: "string", description: "Session to judge" },
                truthQuality: {
                    type: "number",
                    minimum: 1,
                    maximum: 5,
                    description: "1-5: Were facts correct? Did it hallucinate?",
                },
                compressionQuality: {
                    type: "number",
                    minimum: 1,
                    maximum: 5,
                    description: "1-5: Was context compressed without losing signal?",
                },
                anticipationQuality: {
                    type: "number",
                    minimum: 1,
                    maximum: 5,
                    description: "1-5: Did it anticipate what you needed next?",
                },
                outputQuality: {
                    type: "number",
                    minimum: 1,
                    maximum: 5,
                    description: "1-5: Was the artifact/output directly usable?",
                },
                delegationQuality: {
                    type: "number",
                    minimum: 1,
                    maximum: 5,
                    description: "1-5: Could you hand the output to someone without restatement?",
                },
                trustQuality: {
                    type: "number",
                    minimum: 1,
                    maximum: 5,
                    description: "1-5: Did you trust the output enough to act on it?",
                },
                notes: { type: "string", description: "Judge notes" },
                failureClasses: {
                    type: "array",
                    items: { type: "string" },
                    description: "Array of failure class strings (e.g. 'stale_entity', 'missing_change', 'wrong_priority')",
                },
            },
            required: [
                "sessionId",
                "truthQuality",
                "compressionQuality",
                "anticipationQuality",
                "outputQuality",
                "delegationQuality",
                "trustQuality",
            ],
        },
        // Validates the six scores, stores them with their unweighted mean
        // (rounded to 2 decimals) and returns the per-dimension breakdown.
        // Rejects with RangeError before writing anything if a score is invalid.
        handler: async (args) => {
            ensureDogfoodSchema();
            const scores = readJudgeScores(args);
            const db = getDb();
            const runId = genId("jr");
            const now = Date.now();
            const overallScore = roundTo2(scores.reduce((sum, score) => sum + score, 0) / scores.length);
            db.prepare(`INSERT INTO judge_runs
                (runId, sessionId, judgedAt, truthQuality, compressionQuality, anticipationQuality,
                 outputQuality, delegationQuality, trustQuality, overallScore, notes, failureClasses)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`).run(runId, args.sessionId, now, ...scores, overallScore, args.notes ?? null, args.failureClasses ? JSON.stringify(args.failureClasses) : null);
            return {
                runId,
                sessionId: args.sessionId,
                overallScore,
                dimensions: {
                    truth: args.truthQuality,
                    compression: args.compressionQuality,
                    anticipation: args.anticipationQuality,
                    output: args.outputQuality,
                    delegation: args.delegationQuality,
                    trust: args.trustQuality,
                },
            };
        },
    },
    // ─── 7. classify_failure ───────────────────────────────────────
    {
        name: "classify_failure",
        description: "Classify a failure by canonical system layer taxonomy. Tracks symptom, root cause, and system layer for structured triage.",
        inputSchema: {
            type: "object",
            properties: {
                sessionId: { type: "string", description: "Dogfood session ID" },
                judgeRunId: { type: "string", description: "Judge run that identified this failure" },
                symptom: { type: "string", description: "What the user observed" },
                rootCause: { type: "string", description: "Why it happened (5-whys root cause)" },
                systemLayer: {
                    type: "string",
                    enum: [...SYSTEM_LAYERS],
                    description: "Which system layer is responsible",
                },
                severity: {
                    type: "string",
                    enum: ["low", "medium", "high", "critical"],
                    description: "Severity level (default: medium)",
                },
            },
            required: ["sessionId", "symptom", "rootCause", "systemLayer"],
        },
        // Records a new failure case and reports how many OTHER open cases
        // already exist in the same layer (counted before the insert).
        handler: async (args) => {
            ensureDogfoodSchema();
            const db = getDb();
            const caseId = genId("fc");
            const now = Date.now();
            const severity = args.severity ?? "medium";
            // COUNT(*) rather than a row fetch: a LIMIT-ed SELECT would cap
            // the reported number at the limit instead of the real total.
            const existingOpen = db
                .prepare(`SELECT COUNT(*) AS count FROM failure_cases
                    WHERE systemLayer = ? AND status = 'open'`)
                .get(args.systemLayer);
            db.prepare(`INSERT INTO failure_cases
                (caseId, sessionId, judgeRunId, symptom, rootCause, systemLayer, severity, createdAt)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)`).run(caseId, args.sessionId, args.judgeRunId ?? null, args.symptom, args.rootCause, args.systemLayer, severity, now);
            return {
                caseId,
                systemLayer: args.systemLayer,
                severity,
                existingOpenInLayer: existingOpen.count,
                recorded: true,
            };
        },
    },
    // ─── 8. record_fix_attempt ─────────────────────────────────────
    {
        name: "record_fix_attempt",
        description: "Record a fix attempt with replay proof and regression protection description. Links to a failure case.",
        inputSchema: {
            type: "object",
            properties: {
                caseId: { type: "string", description: "Failure case being fixed" },
                failureClass: { type: "string", description: "Class of failure being addressed" },
                rootCause: { type: "string", description: "Root cause being fixed" },
                layerCorrected: {
                    type: "string",
                    enum: [...SYSTEM_LAYERS],
                    description: "Which system layer was corrected",
                },
                description: { type: "string", description: "What was changed" },
                replayProof: {
                    type: "object",
                    properties: {
                        priorScore: { type: "number" },
                        newScore: { type: "number" },
                        improved: { type: "boolean" },
                    },
                    description: "JSON proof: prior vs new scores",
                },
                regressionProtection: {
                    type: "string",
                    description: "What prevents this from regressing",
                },
            },
            required: ["caseId", "failureClass", "rootCause", "layerCorrected", "description"],
        },
        // A fix counts as "verified" only when replayProof.improved is truthy;
        // the linked failure case then moves to 'fixed', otherwise to
        // 'investigating'.
        handler: async (args) => {
            ensureDogfoodSchema();
            const db = getDb();
            const attemptId = genId("fix");
            const now = Date.now();
            const verified = Boolean(args.replayProof?.improved);
            const status = verified ? "verified" : "proposed";
            db.prepare(`INSERT INTO fix_attempts
                (attemptId, caseId, failureClass, rootCause, layerCorrected, description, replayProof, regressionProtection, status, createdAt)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`).run(attemptId, args.caseId, args.failureClass, args.rootCause, args.layerCorrected, args.description, args.replayProof ? JSON.stringify(args.replayProof) : null, args.regressionProtection ?? null, status, now);
            db.prepare(`UPDATE failure_cases SET status = ?, fixAttemptId = ? WHERE caseId = ?`).run(verified ? "fixed" : "investigating", attemptId, args.caseId);
            return { attemptId, caseId: args.caseId, status, recorded: true };
        },
    },
    // ─── 9. get_dogfood_sessions ───────────────────────────────────
    {
        name: "get_dogfood_sessions",
        description: "List recent dogfood sessions with their judge scores. Filter by loop type.",
        inputSchema: {
            type: "object",
            properties: {
                loopType: {
                    type: "string",
                    enum: [...LOOP_TYPES],
                    description: "Filter by loop type (optional)",
                },
                limit: {
                    type: "number",
                    description: "Max sessions to return (default 10)",
                },
            },
        },
        annotations: { readOnlyHint: true },
        // Returns the newest sessions (optionally one loop type) with their
        // JSON columns decoded and the most recent judge run attached.
        handler: async (args) => {
            ensureDogfoodSchema();
            const db = getDb();
            // Every input is optional, so tolerate a missing args object.
            const loopType = args?.loopType;
            const limit = normalizeLimit(args?.limit);
            const sessions = loopType
                ? db
                    .prepare(`SELECT * FROM dogfood_sessions WHERE loopType = ? ORDER BY startedAt DESC LIMIT ?`)
                    .all(loopType, limit)
                : db
                    .prepare(`SELECT * FROM dogfood_sessions ORDER BY startedAt DESC LIMIT ?`)
                    .all(limit);
            // Prepared once and reused for every session.
            const latestJudgeRun = db.prepare(`SELECT overallScore, truthQuality, compressionQuality, anticipationQuality,
                    outputQuality, delegationQuality, trustQuality, failureClasses
                FROM judge_runs WHERE sessionId = ? ORDER BY judgedAt DESC LIMIT 1`);
            const enriched = sessions.map((s) => {
                const judgeRun = latestJudgeRun.get(s.sessionId);
                return {
                    ...s,
                    manualCorrections: parseJsonArray(s.manualCorrections),
                    repeatedQuestions: parseJsonArray(s.repeatedQuestions),
                    artifactsProduced: parseJsonArray(s.artifactsProduced),
                    judgeScore: judgeRun
                        ? {
                            overall: judgeRun.overallScore,
                            truth: judgeRun.truthQuality,
                            compression: judgeRun.compressionQuality,
                            anticipation: judgeRun.anticipationQuality,
                            output: judgeRun.outputQuality,
                            delegation: judgeRun.delegationQuality,
                            trust: judgeRun.trustQuality,
                            failureClasses: parseJsonArray(judgeRun.failureClasses),
                        }
                        : null,
                };
            });
            return { sessions: enriched, count: enriched.length };
        },
    },
    // ─── 10. get_failure_triage ────────────────────────────────────
    {
        name: "get_failure_triage",
        description: "Get open failure cases grouped by system layer with frequency counts. The triage board for fixing system gaps.",
        inputSchema: {
            type: "object",
            properties: {
                status: {
                    type: "string",
                    enum: ["open", "investigating", "fixed", "wont_fix"],
                    description: "Filter by status (default: open)",
                },
            },
        },
        annotations: { readOnlyHint: true },
        // Lists cases with the requested status (most severe first, then
        // newest) grouped by layer, plus a per-layer summary sorted by
        // critical count and then total count.
        handler: async (args) => {
            ensureDogfoodSchema();
            const db = getDb();
            const status = args?.status ?? "open";
            const cases = db
                .prepare(`SELECT caseId, sessionId, symptom, rootCause, systemLayer, severity, frequency, status, createdAt
                    FROM failure_cases WHERE status = ? ORDER BY
                    CASE severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1 WHEN 'medium' THEN 2 ELSE 3 END,
                    createdAt DESC`)
                .all(status);
            // Group in a Map: systemLayer is stored free text, and a plain
            // object would resolve names such as "constructor" to inherited
            // members and break the grouping.
            const grouped = new Map();
            for (const c of cases) {
                const bucket = grouped.get(c.systemLayer);
                if (bucket) {
                    bucket.push(c);
                }
                else {
                    grouped.set(c.systemLayer, [c]);
                }
            }
            const layerSummary = [...grouped].map(([layer, items]) => ({
                layer,
                count: items.length,
                criticalCount: items.filter((i) => i.severity === "critical").length,
                highCount: items.filter((i) => i.severity === "high").length,
            }));
            return {
                status,
                totalCases: cases.length,
                byLayer: Object.fromEntries(grouped),
                layerSummary: layerSummary.sort((a, b) => b.criticalCount - a.criticalCount || b.count - a.count),
            };
        },
    },
    // ─── 11. get_regression_gate ───────────────────────────────────
    {
        name: "get_regression_gate",
        description: "Check if the 3 canonical loops pass. Returns per-loop scores, overall pass/fail, and regression detection.",
        inputSchema: {
            type: "object",
            properties: {},
        },
        annotations: { readOnlyHint: true },
        // A loop passes when its latest overall score is >= 3.5; a loop with
        // no judge runs fails. A regression is flagged when any loop's latest
        // score is lower than the one before it.
        handler: async () => {
            ensureDogfoodSchema();
            const db = getDb();
            const PASS_THRESHOLD = 3.5;
            const recentScores = db.prepare(`SELECT jr.overallScore
                FROM judge_runs jr
                JOIN dogfood_sessions ds ON jr.sessionId = ds.sessionId
                WHERE ds.loopType = ?
                ORDER BY jr.judgedAt DESC LIMIT 5`);
            const results = {};
            let allPassed = true;
            let regressionsDetected = false;
            for (const loop of LOOP_TYPES) {
                // Newest first: trend[0] is the latest score.
                const trend = recentScores.all(loop).map((row) => row.overallScore);
                const latestScore = trend.length > 0 ? trend[0] : null;
                const passed = latestScore != null && latestScore >= PASS_THRESHOLD;
                if (!passed) {
                    allPassed = false;
                }
                if (trend.length >= 2 && trend[0] < trend[1]) {
                    regressionsDetected = true;
                }
                results[loop] = { latestScore, trend, passed };
            }
            const openFailures = db
                .prepare(`SELECT COUNT(*) as count FROM failure_cases WHERE status = 'open'`)
                .get();
            return {
                weeklyResetScore: results.weekly_reset.latestScore,
                preDelegationScore: results.pre_delegation.latestScore,
                companySearchScore: results.company_search.latestScore,
                passed: allPassed,
                regressions: regressionsDetected,
                details: results,
                openFailureCount: openFailures.count,
            };
        },
    },
    // ─── 12. get_repeat_cognition_metrics ──────────────────────────
    {
        name: "get_repeat_cognition_metrics",
        description: "The key compound metric. Measures repeat question rate, manual reconstruction count, packet abandonment rate, delegation-without-restatement rate, and average time-to-useful-output.",
        inputSchema: {
            type: "object",
            properties: {
                daysSince: {
                    type: "number",
                    description: "Look back N days (default 30)",
                },
            },
        },
        annotations: { readOnlyHint: true },
        // Aggregates events recorded in the look-back window into rates and
        // the 0-100 compound score produced by computeCompoundScore().
        handler: async (args) => {
            ensureDogfoodSchema();
            const db = getDb();
            const days = args?.daysSince ?? DEFAULT_WINDOW_DAYS;
            const since = Date.now() - days * MS_PER_DAY;
            const countSince = (sql) => db.prepare(sql).get(since).count;
            const sessionCount = countSince(`SELECT COUNT(*) as count FROM dogfood_sessions WHERE startedAt >= ?`);
            const repeatQuestionCount = countSince(`SELECT COUNT(*) as count FROM repeat_question_events WHERE detectedAt >= ?`);
            const manualCorrectionCount = countSince(`SELECT COUNT(*) as count FROM manual_correction_events WHERE detectedAt >= ?`);
            const packetRatings = db
                .prepare(`SELECT * FROM packet_usefulness_ratings WHERE ratedAt >= ?`)
                .all(since);
            const totalRated = packetRatings.length;
            const abandoned = packetRatings.filter((r) => r.abandoned === 1).length;
            const totalHumanEdits = packetRatings.reduce((sum, r) => sum + (r.humanEditsCount ?? 0), 0);
            // Only sessions that actually reported a time are averaged.
            const times = db
                .prepare(`SELECT timeToFirstUsefulOutput FROM dogfood_sessions
                    WHERE startedAt >= ? AND timeToFirstUsefulOutput IS NOT NULL`)
                .all(since);
            const avgTimeToUsefulOutput = times.length > 0
                ? Math.round(times.reduce((sum, t) => sum + t.timeToFirstUsefulOutput, 0) / times.length)
                : null;
            // Percentage (0-100) of sessions with a recorded delegation
            // outcome that succeeded; null when none recorded one.
            const delegationSessions = db
                .prepare(`SELECT delegationSucceeded FROM dogfood_sessions
                    WHERE startedAt >= ? AND delegationSucceeded IS NOT NULL`)
                .all(since);
            const delegationSuccessRate = delegationSessions.length > 0
                ? Math.round((delegationSessions.filter((s) => s.delegationSucceeded === 1).length /
                    delegationSessions.length) *
                    100)
                : null;
            // Per-session rates; all zero when the window holds no sessions.
            const repeatRate = sessionCount > 0 ? repeatQuestionCount / sessionCount : 0;
            const correctionRate = sessionCount > 0 ? manualCorrectionCount / sessionCount : 0;
            const abandonmentRate = totalRated > 0 ? abandoned / totalRated : 0;
            return {
                window: { days, since: new Date(since).toISOString() },
                totalSessions: sessionCount,
                repeatQuestionRate: roundTo2(repeatRate),
                repeatQuestionCount,
                manualCorrectionCount,
                packetAbandonmentRate: roundTo2(abandonmentRate),
                delegationWithoutRestatementRate: delegationSuccessRate,
                averageTimeToUsefulOutputMs: avgTimeToUsefulOutput,
                totalHumanEdits,
                compoundScore: computeCompoundScore({
                    repeatRate,
                    correctionRate,
                    abandonmentRate,
                    // NOTE(review): with no delegation data this scores as 0,
                    // which lowers the compound score — confirm that is intended.
                    delegationRate: delegationSuccessRate ?? 0,
                }),
            };
        },
    },
];
792
+ /* ------------------------------------------------------------------ */
793
+ /* Compound score: 0-100, higher is better */
794
+ /* ------------------------------------------------------------------ */
795
/**
 * Fold the four repeat-cognition metrics into one score, rounded to two
 * decimals.
 *
 * @param {{repeatRate: number, correctionRate: number, abandonmentRate: number, delegationRate: number}} metrics
 *   The three rates are "lower is better" and are inverted into a penalty
 *   score floored at 0; `delegationRate` is "higher is better" and is used
 *   as given (the caller supplies it on a 0-100 scale).
 * @returns {number} Weighted sum: repeat 35%, correction 25%, abandonment 15%,
 *   delegation 25%.
 */
function computeCompoundScore(metrics) {
    // 100 minus a per-unit penalty, never below zero.
    const invert = (rate, penaltyPerUnit) => Math.max(0, 100 - rate * penaltyPerUnit);
    const weightedParts = [
        [invert(metrics.repeatRate, 100), 0.35],
        [invert(metrics.correctionRate, 50), 0.25],
        [invert(metrics.abandonmentRate, 100), 0.15],
        [metrics.delegationRate, 0.25],
    ];
    let total = 0;
    for (const [part, weight] of weightedParts) {
        total += part * weight;
    }
    return Math.round(total * 100) / 100;
}
809
+ //# sourceMappingURL=dogfoodJudgeTools.js.map