npm - nodebench-mcp - Versions diffs - 2.34.0 → 2.35.0 - Mend

nodebench-mcp 2.34.0 → 2.35.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

package/dist/benchmarks/ambientBench.d.ts +27 -0
package/dist/benchmarks/ambientBench.js +900 -0
package/dist/benchmarks/ambientBench.js.map +1 -0
package/dist/benchmarks/index.d.ts +1 -0
package/dist/benchmarks/index.js +2 -0
package/dist/benchmarks/index.js.map +1 -0
package/dist/dashboard/operatingDashboardHtml.d.ts +23 -0
package/dist/dashboard/operatingDashboardHtml.js +2036 -0
package/dist/dashboard/operatingDashboardHtml.js.map +1 -0
package/dist/dashboard/operatingServer.d.ts +23 -0
package/dist/dashboard/operatingServer.js +704 -0
package/dist/dashboard/operatingServer.js.map +1 -0
package/dist/index.js +13 -4
package/dist/index.js.map +1 -1
package/dist/tools/dogfoodJudgeTools.d.ts +13 -0
package/dist/tools/dogfoodJudgeTools.js +809 -0
package/dist/tools/dogfoodJudgeTools.js.map +1 -0
package/dist/tools/localDashboardTools.js +36 -0
package/dist/tools/localDashboardTools.js.map +1 -1
package/dist/tools/progressiveDiscoveryTools.js +1 -1
package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
package/dist/tools/toolRegistry.js +166 -0
package/dist/tools/toolRegistry.js.map +1 -1
package/dist/toolsetRegistry.js +2 -0
package/dist/toolsetRegistry.js.map +1 -1
package/package.json +1 -1

package/dist/tools/dogfoodJudgeTools.js ADDED Viewed

@@ -0,0 +1,809 @@
+/**
+ * dogfoodJudgeTools — Dogfood Judge Fix System (Phase 13)
+ *
+ * Measures whether NodeBench actually removes repeat cognition:
+ * - Session recording: track manual corrections, repeated questions, packet usefulness
+ * - 6-dimension judging: truth, compression, anticipation, output, delegation, trust
+ * - Failure triage: classify by canonical system layer taxonomy
+ * - Replay verification: prove fixes work, detect regressions
+ * - Repeat cognition metrics: the core compound metric
+ */
+import { getDb, genId } from "../db.js";
+/* ------------------------------------------------------------------ */
+/*  Schema bootstrap (idempotent)                                      */
+/* ------------------------------------------------------------------ */
+/** Module-level latch: set to true once the DDL below has been executed. */
+let _schemaReady = false;
+/**
+ * Creates the dogfood-judge tables and indexes if they do not already exist.
+ *
+ * Every statement uses IF NOT EXISTS and the `_schemaReady` latch short-circuits
+ * repeat calls, so the tool handlers in this file call it at the top of every
+ * invocation.
+ *
+ * Storage conventions used by the handlers in this file:
+ *  - array/object values (manualCorrections, repeatedQuestions, failureClasses,
+ *    replayProof) are written as JSON strings into TEXT columns;
+ *  - boolean inputs are written as 1/0 into INTEGER columns;
+ *  - timestamps are Date.now() values.
+ *
+ * NOTE(review): the latch is not keyed to the handle returned by getDb(); if
+ * that handle can change during the process lifetime the schema would not be
+ * re-created on the new handle — confirm against ../db.js.
+ */
+export function ensureDogfoodSchema() {
+    if (_schemaReady)
+        return;
+    const db = getDb();
+    db.exec(`
+    CREATE TABLE IF NOT EXISTS dogfood_sessions (
+      sessionId TEXT PRIMARY KEY,
+      loopType TEXT NOT NULL,
+      startedAt INTEGER NOT NULL,
+      endedAt INTEGER,
+      transcript TEXT,
+      packetVersionUsed TEXT,
+      artifactsProduced TEXT,
+      manualCorrections TEXT,
+      repeatedQuestions TEXT,
+      timeToFirstUsefulOutput INTEGER,
+      delegationSucceeded INTEGER,
+      packetExported INTEGER,
+      overallNotes TEXT
+    );
+    CREATE TABLE IF NOT EXISTS judge_runs (
+      runId TEXT PRIMARY KEY,
+      sessionId TEXT REFERENCES dogfood_sessions(sessionId),
+      judgedAt INTEGER NOT NULL,
+      truthQuality REAL,
+      compressionQuality REAL,
+      anticipationQuality REAL,
+      outputQuality REAL,
+      delegationQuality REAL,
+      trustQuality REAL,
+      overallScore REAL,
+      notes TEXT,
+      failureClasses TEXT
+    );
+    CREATE TABLE IF NOT EXISTS failure_cases (
+      caseId TEXT PRIMARY KEY,
+      sessionId TEXT REFERENCES dogfood_sessions(sessionId),
+      judgeRunId TEXT REFERENCES judge_runs(runId),
+      symptom TEXT NOT NULL,
+      rootCause TEXT NOT NULL,
+      systemLayer TEXT NOT NULL,
+      severity TEXT DEFAULT 'medium',
+      frequency INTEGER DEFAULT 1,
+      fixAttemptId TEXT,
+      status TEXT DEFAULT 'open',
+      createdAt INTEGER NOT NULL
+    );
+    CREATE TABLE IF NOT EXISTS fix_attempts (
+      attemptId TEXT PRIMARY KEY,
+      caseId TEXT REFERENCES failure_cases(caseId),
+      failureClass TEXT NOT NULL,
+      rootCause TEXT NOT NULL,
+      layerCorrected TEXT NOT NULL,
+      description TEXT NOT NULL,
+      replayProof TEXT,
+      regressionProtection TEXT,
+      status TEXT DEFAULT 'proposed',
+      createdAt INTEGER NOT NULL
+    );
+    CREATE TABLE IF NOT EXISTS replay_runs (
+      replayId TEXT PRIMARY KEY,
+      originalSessionId TEXT REFERENCES dogfood_sessions(sessionId),
+      fixAttemptId TEXT REFERENCES fix_attempts(attemptId),
+      replayedAt INTEGER NOT NULL,
+      priorScores TEXT,
+      newScores TEXT,
+      improved INTEGER,
+      regressionDetected INTEGER,
+      notes TEXT
+    );
+    CREATE TABLE IF NOT EXISTS repeat_question_events (
+      eventId TEXT PRIMARY KEY,
+      question TEXT NOT NULL,
+      sessionId TEXT,
+      priorSessionId TEXT,
+      timeSinceLastAsked INTEGER,
+      shouldHaveBeenWarm INTEGER DEFAULT 1,
+      detectedAt INTEGER NOT NULL
+    );
+    CREATE TABLE IF NOT EXISTS manual_correction_events (
+      eventId TEXT PRIMARY KEY,
+      sessionId TEXT,
+      field TEXT NOT NULL,
+      beforeValue TEXT,
+      afterValue TEXT,
+      correctionType TEXT,
+      detectedAt INTEGER NOT NULL
+    );
+    CREATE TABLE IF NOT EXISTS packet_usefulness_ratings (
+      ratingId TEXT PRIMARY KEY,
+      sessionId TEXT,
+      packetType TEXT,
+      exported INTEGER DEFAULT 0,
+      delegated INTEGER DEFAULT 0,
+      reused INTEGER DEFAULT 0,
+      abandoned INTEGER DEFAULT 0,
+      humanEditsCount INTEGER DEFAULT 0,
+      ratedAt INTEGER NOT NULL
+    );
+    CREATE INDEX IF NOT EXISTS idx_dogfood_sessions_loop ON dogfood_sessions(loopType);
+    CREATE INDEX IF NOT EXISTS idx_judge_runs_session ON judge_runs(sessionId);
+    CREATE INDEX IF NOT EXISTS idx_failure_cases_session ON failure_cases(sessionId);
+    CREATE INDEX IF NOT EXISTS idx_failure_cases_status ON failure_cases(status);
+    CREATE INDEX IF NOT EXISTS idx_failure_cases_layer ON failure_cases(systemLayer);
+    CREATE INDEX IF NOT EXISTS idx_fix_attempts_case ON fix_attempts(caseId);
+    CREATE INDEX IF NOT EXISTS idx_replay_runs_session ON replay_runs(originalSessionId);
+    CREATE INDEX IF NOT EXISTS idx_repeat_questions_session ON repeat_question_events(sessionId);
+    CREATE INDEX IF NOT EXISTS idx_manual_corrections_session ON manual_correction_events(sessionId);
+    CREATE INDEX IF NOT EXISTS idx_packet_ratings_session ON packet_usefulness_ratings(sessionId);
+  `);
+    // Flip the latch only after exec() returned, so a failed bootstrap is retried on the next call.
+    _schemaReady = true;
+}
+/* ------------------------------------------------------------------ */
+/*  Canonical system layer taxonomy                                    */
+/* ------------------------------------------------------------------ */
+// Allowed values for the `systemLayer` input of classify_failure and the
+// `layerCorrected` input of record_fix_attempt; both tools spread this list
+// into their JSON-schema `enum`.
+const SYSTEM_LAYERS = [
+    "ingestion",
+    "canonicalization",
+    "change_detection",
+    "contradiction",
+    "suppression",
+    "packet_construction",
+    "artifact_rendering",
+    "trace_lineage",
+    "provider_bus",
+    "role_overlay",
+    "ux_explanation",
+];
+/* ------------------------------------------------------------------ */
+/*  Tools                                                              */
+/* ------------------------------------------------------------------ */
+export const dogfoodJudgeTools = [
+    // ─── 1. start_dogfood_session ──────────────────────────────────
+    {
+        name: "start_dogfood_session",
+        description: "Start a new dogfood session for one of the 3 canonical loops (weekly_reset, pre_delegation, company_search). Returns sessionId for subsequent recording.",
+        inputSchema: {
+            type: "object",
+            properties: {
+                loopType: {
+                    type: "string",
+                    enum: ["weekly_reset", "pre_delegation", "company_search"],
+                    description: "Which canonical dogfood loop is being tested",
+                },
+                packetVersionUsed: {
+                    type: "string",
+                    description: "Version/ID of the packet template being tested (optional)",
+                },
+            },
+            required: ["loopType"],
+        },
+        handler: async (args) => {
+            ensureDogfoodSchema();
+            const db = getDb();
+            const sessionId = genId("dfs");
+            const now = Date.now();
+            db.prepare(`INSERT INTO dogfood_sessions (sessionId, loopType, startedAt, packetVersionUsed)
+         VALUES (?, ?, ?, ?)`).run(sessionId, args.loopType, now, args.packetVersionUsed ?? null);
+            return { sessionId, loopType: args.loopType, startedAt: now };
+        },
+    },
+    // ─── 2. end_dogfood_session ────────────────────────────────────
+    {
+        name: "end_dogfood_session",
+        description: "End a dogfood session with summary metrics: time-to-first-useful-output, delegation success, packet export status, and notes.",
+        inputSchema: {
+            type: "object",
+            properties: {
+                sessionId: { type: "string", description: "Session to end" },
+                notes: { type: "string", description: "Overall session notes" },
+                timeToFirstUsefulOutput: {
+                    type: "number",
+                    description: "Milliseconds until first useful output was produced",
+                },
+                delegationSucceeded: {
+                    type: "boolean",
+                    description: "Whether delegation worked without restatement",
+                },
+                packetExported: {
+                    type: "boolean",
+                    description: "Whether the packet was exported/shared",
+                },
+            },
+            required: ["sessionId"],
+        },
+        handler: async (args) => {
+            ensureDogfoodSchema();
+            const db = getDb();
+            const now = Date.now();
+            const result = db.prepare(`UPDATE dogfood_sessions
+         SET endedAt = ?, overallNotes = ?, timeToFirstUsefulOutput = ?,
+             delegationSucceeded = ?, packetExported = ?
+         WHERE sessionId = ?`).run(now, args.notes ?? null, args.timeToFirstUsefulOutput ?? null, args.delegationSucceeded != null ? (args.delegationSucceeded ? 1 : 0) : null, args.packetExported != null ? (args.packetExported ? 1 : 0) : null, args.sessionId);
+            return {
+                sessionId: args.sessionId,
+                endedAt: now,
+                updated: result.changes > 0,
+            };
+        },
+    },
+    // ─── 3. record_manual_correction ───────────────────────────────
+    {
+        name: "record_manual_correction",
+        description: "Track a human correction to agent output. Every correction is evidence of a system gap — the system should have gotten this right.",
+        inputSchema: {
+            type: "object",
+            properties: {
+                sessionId: { type: "string", description: "Dogfood session ID" },
+                field: { type: "string", description: "Which field/section was corrected" },
+                beforeValue: { type: "string", description: "What the system produced" },
+                afterValue: { type: "string", description: "What the human corrected it to" },
+                correctionType: {
+                    type: "string",
+                    enum: ["factual", "priority", "scope", "tone", "missing"],
+                    description: "Category of correction",
+                },
+            },
+            required: ["sessionId", "field", "correctionType"],
+        },
+        handler: async (args) => {
+            ensureDogfoodSchema();
+            const db = getDb();
+            const eventId = genId("mc");
+            const now = Date.now();
+            db.prepare(`INSERT INTO manual_correction_events
+         (eventId, sessionId, field, beforeValue, afterValue, correctionType, detectedAt)
+         VALUES (?, ?, ?, ?, ?, ?, ?)`).run(eventId, args.sessionId, args.field, args.beforeValue ?? null, args.afterValue ?? null, args.correctionType, now);
+            // Also update session's manualCorrections array
+            const session = db
+                .prepare(`SELECT manualCorrections FROM dogfood_sessions WHERE sessionId = ?`)
+                .get(args.sessionId);
+            if (session) {
+                const corrections = session.manualCorrections
+                    ? JSON.parse(session.manualCorrections)
+                    : [];
+                corrections.push({
+                    field: args.field,
+                    before: args.beforeValue ?? null,
+                    after: args.afterValue ?? null,
+                    type: args.correctionType,
+                });
+                db.prepare(`UPDATE dogfood_sessions SET manualCorrections = ? WHERE sessionId = ?`).run(JSON.stringify(corrections), args.sessionId);
+            }
+            return { eventId, sessionId: args.sessionId, recorded: true };
+        },
+    },
+    // ─── 4. record_repeated_question ───────────────────────────────
+    {
+        name: "record_repeated_question",
+        description: "Track a question the user asked that NodeBench should have already known. This is the core failure signal — repeat cognition means the system isn't compounding.",
+        inputSchema: {
+            type: "object",
+            properties: {
+                question: { type: "string", description: "The repeated question" },
+                sessionId: { type: "string", description: "Current session ID" },
+                priorSessionId: {
+                    type: "string",
+                    description: "Session where this was previously asked (optional)",
+                },
+            },
+            required: ["question"],
+        },
+        handler: async (args) => {
+            ensureDogfoodSchema();
+            const db = getDb();
+            const eventId = genId("rq");
+            const now = Date.now();
+            // Calculate time since last asked if priorSessionId provided
+            let timeSinceLastAsked = null;
+            if (args.priorSessionId) {
+                const prior = db
+                    .prepare(`SELECT startedAt FROM dogfood_sessions WHERE sessionId = ?`)
+                    .get(args.priorSessionId);
+                if (prior) {
+                    timeSinceLastAsked = now - prior.startedAt;
+                }
+            }
+            db.prepare(`INSERT INTO repeat_question_events
+         (eventId, question, sessionId, priorSessionId, timeSinceLastAsked, shouldHaveBeenWarm, detectedAt)
+         VALUES (?, ?, ?, ?, ?, 1, ?)`).run(eventId, args.question, args.sessionId ?? null, args.priorSessionId ?? null, timeSinceLastAsked, now);
+            // Also update session's repeatedQuestions array
+            if (args.sessionId) {
+                const session = db
+                    .prepare(`SELECT repeatedQuestions FROM dogfood_sessions WHERE sessionId = ?`)
+                    .get(args.sessionId);
+                if (session) {
+                    const questions = session.repeatedQuestions
+                        ? JSON.parse(session.repeatedQuestions)
+                        : [];
+                    questions.push(args.question);
+                    db.prepare(`UPDATE dogfood_sessions SET repeatedQuestions = ? WHERE sessionId = ?`).run(JSON.stringify(questions), args.sessionId);
+                }
+            }
+            return { eventId, question: args.question, timeSinceLastAsked, recorded: true };
+        },
+    },
+    // ─── 5. rate_packet_usefulness ─────────────────────────────────
+    {
+        name: "rate_packet_usefulness",
+        description: "Rate a packet's real-world utility: was it exported, delegated, reused, or abandoned? How many human edits were needed?",
+        inputSchema: {
+            type: "object",
+            properties: {
+                sessionId: { type: "string", description: "Dogfood session ID" },
+                packetType: {
+                    type: "string",
+                    description: "Type of packet (weekly_reset, pre_delegation, company_search, etc.)",
+                },
+                exported: { type: "boolean", description: "Was the packet exported?" },
+                delegated: { type: "boolean", description: "Was the packet delegated to someone?" },
+                reused: { type: "boolean", description: "Was the packet reused in another context?" },
+                abandoned: { type: "boolean", description: "Was the packet abandoned?" },
+                humanEditsCount: {
+                    type: "number",
+                    description: "Number of human edits required before the packet was usable",
+                },
+            },
+            required: ["sessionId", "packetType"],
+        },
+        handler: async (args) => {
+            ensureDogfoodSchema();
+            const db = getDb();
+            const ratingId = genId("pur");
+            const now = Date.now();
+            db.prepare(`INSERT INTO packet_usefulness_ratings
+         (ratingId, sessionId, packetType, exported, delegated, reused, abandoned, humanEditsCount, ratedAt)
+         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`).run(ratingId, args.sessionId, args.packetType, args.exported ? 1 : 0, args.delegated ? 1 : 0, args.reused ? 1 : 0, args.abandoned ? 1 : 0, args.humanEditsCount ?? 0, now);
+            return { ratingId, sessionId: args.sessionId, recorded: true };
+        },
+    },
+    // ─── 6. judge_session ──────────────────────────────────────────
+    {
+        name: "judge_session",
+        description: "Score a dogfood session on 6 dimensions (1-5 each): truth, compression, anticipation, output, delegation, trust. Returns overall score and records failure classes.",
+        inputSchema: {
+            type: "object",
+            properties: {
+                sessionId: { type: "string", description: "Session to judge" },
+                truthQuality: {
+                    type: "number",
+                    minimum: 1,
+                    maximum: 5,
+                    description: "1-5: Were facts correct? Did it hallucinate?",
+                },
+                compressionQuality: {
+                    type: "number",
+                    minimum: 1,
+                    maximum: 5,
+                    description: "1-5: Was context compressed without losing signal?",
+                },
+                anticipationQuality: {
+                    type: "number",
+                    minimum: 1,
+                    maximum: 5,
+                    description: "1-5: Did it anticipate what you needed next?",
+                },
+                outputQuality: {
+                    type: "number",
+                    minimum: 1,
+                    maximum: 5,
+                    description: "1-5: Was the artifact/output directly usable?",
+                },
+                delegationQuality: {
+                    type: "number",
+                    minimum: 1,
+                    maximum: 5,
+                    description: "1-5: Could you hand the output to someone without restatement?",
+                },
+                trustQuality: {
+                    type: "number",
+                    minimum: 1,
+                    maximum: 5,
+                    description: "1-5: Did you trust the output enough to act on it?",
+                },
+                notes: { type: "string", description: "Judge notes" },
+                failureClasses: {
+                    type: "array",
+                    items: { type: "string" },
+                    description: "Array of failure class strings (e.g. 'stale_entity', 'missing_change', 'wrong_priority')",
+                },
+            },
+            required: [
+                "sessionId",
+                "truthQuality",
+                "compressionQuality",
+                "anticipationQuality",
+                "outputQuality",
+                "delegationQuality",
+                "trustQuality",
+            ],
+        },
+        handler: async (args) => {
+            ensureDogfoodSchema();
+            const db = getDb();
+            const runId = genId("jr");
+            const now = Date.now();
+            const scores = [
+                args.truthQuality,
+                args.compressionQuality,
+                args.anticipationQuality,
+                args.outputQuality,
+                args.delegationQuality,
+                args.trustQuality,
+            ];
+            const overallScore = Math.round((scores.reduce((a, b) => a + b, 0) / scores.length) * 100) / 100;
+            db.prepare(`INSERT INTO judge_runs
+         (runId, sessionId, judgedAt, truthQuality, compressionQuality, anticipationQuality,
+          outputQuality, delegationQuality, trustQuality, overallScore, notes, failureClasses)
+         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`).run(runId, args.sessionId, now, args.truthQuality, args.compressionQuality, args.anticipationQuality, args.outputQuality, args.delegationQuality, args.trustQuality, overallScore, args.notes ?? null, args.failureClasses ? JSON.stringify(args.failureClasses) : null);
+            return {
+                runId,
+                sessionId: args.sessionId,
+                overallScore,
+                dimensions: {
+                    truth: args.truthQuality,
+                    compression: args.compressionQuality,
+                    anticipation: args.anticipationQuality,
+                    output: args.outputQuality,
+                    delegation: args.delegationQuality,
+                    trust: args.trustQuality,
+                },
+            };
+        },
+    },
+    // ─── 7. classify_failure ───────────────────────────────────────
+    {
+        name: "classify_failure",
+        description: "Classify a failure by canonical system layer taxonomy. Tracks symptom, root cause, and system layer for structured triage.",
+        inputSchema: {
+            type: "object",
+            properties: {
+                sessionId: { type: "string", description: "Dogfood session ID" },
+                judgeRunId: { type: "string", description: "Judge run that identified this failure" },
+                symptom: { type: "string", description: "What the user observed" },
+                rootCause: { type: "string", description: "Why it happened (5-whys root cause)" },
+                systemLayer: {
+                    type: "string",
+                    enum: [...SYSTEM_LAYERS],
+                    description: "Which system layer is responsible",
+                },
+                severity: {
+                    type: "string",
+                    enum: ["low", "medium", "high", "critical"],
+                    description: "Severity level (default: medium)",
+                },
+            },
+            required: ["sessionId", "symptom", "rootCause", "systemLayer"],
+        },
+        handler: async (args) => {
+            ensureDogfoodSchema();
+            const db = getDb();
+            const caseId = genId("fc");
+            const now = Date.now();
+            // Check if a similar failure exists (same layer + similar symptom)
+            const existing = db
+                .prepare(`SELECT caseId, frequency FROM failure_cases
+           WHERE systemLayer = ? AND status = 'open'
+           ORDER BY createdAt DESC LIMIT 5`)
+                .all(args.systemLayer);
+            db.prepare(`INSERT INTO failure_cases
+         (caseId, sessionId, judgeRunId, symptom, rootCause, systemLayer, severity, createdAt)
+         VALUES (?, ?, ?, ?, ?, ?, ?, ?)`).run(caseId, args.sessionId, args.judgeRunId ?? null, args.symptom, args.rootCause, args.systemLayer, args.severity ?? "medium", now);
+            return {
+                caseId,
+                systemLayer: args.systemLayer,
+                severity: args.severity ?? "medium",
+                existingOpenInLayer: existing.length,
+                recorded: true,
+            };
+        },
+    },
+    // ─── 8. record_fix_attempt ─────────────────────────────────────
+    {
+        name: "record_fix_attempt",
+        description: "Record a fix attempt with replay proof and regression protection description. Links to a failure case.",
+        inputSchema: {
+            type: "object",
+            properties: {
+                caseId: { type: "string", description: "Failure case being fixed" },
+                failureClass: { type: "string", description: "Class of failure being addressed" },
+                rootCause: { type: "string", description: "Root cause being fixed" },
+                layerCorrected: {
+                    type: "string",
+                    enum: [...SYSTEM_LAYERS],
+                    description: "Which system layer was corrected",
+                },
+                description: { type: "string", description: "What was changed" },
+                replayProof: {
+                    type: "object",
+                    properties: {
+                        priorScore: { type: "number" },
+                        newScore: { type: "number" },
+                        improved: { type: "boolean" },
+                    },
+                    description: "JSON proof: prior vs new scores",
+                },
+                regressionProtection: {
+                    type: "string",
+                    description: "What prevents this from regressing",
+                },
+            },
+            required: ["caseId", "failureClass", "rootCause", "layerCorrected", "description"],
+        },
+        handler: async (args) => {
+            ensureDogfoodSchema();
+            const db = getDb();
+            const attemptId = genId("fix");
+            const now = Date.now();
+            const status = args.replayProof?.improved ? "verified" : "proposed";
+            db.prepare(`INSERT INTO fix_attempts
+         (attemptId, caseId, failureClass, rootCause, layerCorrected, description, replayProof, regressionProtection, status, createdAt)
+         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`).run(attemptId, args.caseId, args.failureClass, args.rootCause, args.layerCorrected, args.description, args.replayProof ? JSON.stringify(args.replayProof) : null, args.regressionProtection ?? null, status, now);
+            // Update failure case status
+            if (status === "verified") {
+                db.prepare(`UPDATE failure_cases SET status = 'fixed', fixAttemptId = ? WHERE caseId = ?`).run(attemptId, args.caseId);
+            }
+            else {
+                db.prepare(`UPDATE failure_cases SET status = 'investigating', fixAttemptId = ? WHERE caseId = ?`).run(attemptId, args.caseId);
+            }
+            return { attemptId, caseId: args.caseId, status, recorded: true };
+        },
+    },
+    // ─── 9. get_dogfood_sessions ───────────────────────────────────
+    {
+        name: "get_dogfood_sessions",
+        description: "List recent dogfood sessions with their judge scores. Filter by loop type.",
+        inputSchema: {
+            type: "object",
+            properties: {
+                loopType: {
+                    type: "string",
+                    enum: ["weekly_reset", "pre_delegation", "company_search"],
+                    description: "Filter by loop type (optional)",
+                },
+                limit: {
+                    type: "number",
+                    description: "Max sessions to return (default 10)",
+                },
+            },
+        },
+        annotations: { readOnlyHint: true },
+        handler: async (args) => {
+            ensureDogfoodSchema();
+            const db = getDb();
+            const limit = args.limit ?? 10;
+            let sessions;
+            if (args.loopType) {
+                sessions = db
+                    .prepare(`SELECT * FROM dogfood_sessions WHERE loopType = ? ORDER BY startedAt DESC LIMIT ?`)
+                    .all(args.loopType, limit);
+            }
+            else {
+                sessions = db
+                    .prepare(`SELECT * FROM dogfood_sessions ORDER BY startedAt DESC LIMIT ?`)
+                    .all(limit);
+            }
+            // Attach judge scores
+            const enriched = sessions.map((s) => {
+                const judgeRun = db
+                    .prepare(`SELECT overallScore, truthQuality, compressionQuality, anticipationQuality,
+                    outputQuality, delegationQuality, trustQuality, failureClasses
+             FROM judge_runs WHERE sessionId = ? ORDER BY judgedAt DESC LIMIT 1`)
+                    .get(s.sessionId);
+                return {
+                    ...s,
+                    manualCorrections: s.manualCorrections ? JSON.parse(s.manualCorrections) : [],
+                    repeatedQuestions: s.repeatedQuestions ? JSON.parse(s.repeatedQuestions) : [],
+                    artifactsProduced: s.artifactsProduced ? JSON.parse(s.artifactsProduced) : [],
+                    judgeScore: judgeRun
+                        ? {
+                            overall: judgeRun.overallScore,
+                            truth: judgeRun.truthQuality,
+                            compression: judgeRun.compressionQuality,
+                            anticipation: judgeRun.anticipationQuality,
+                            output: judgeRun.outputQuality,
+                            delegation: judgeRun.delegationQuality,
+                            trust: judgeRun.trustQuality,
+                            failureClasses: judgeRun.failureClasses
+                                ? JSON.parse(judgeRun.failureClasses)
+                                : [],
+                        }
+                        : null,
+                };
+            });
+            return { sessions: enriched, count: enriched.length };
+        },
+    },
+    // ─── 10. get_failure_triage ────────────────────────────────────
+    {
+        name: "get_failure_triage",
+        description: "Get open failure cases grouped by system layer with frequency counts. The triage board for fixing system gaps.",
+        inputSchema: {
+            type: "object",
+            properties: {
+                status: {
+                    type: "string",
+                    enum: ["open", "investigating", "fixed", "wont_fix"],
+                    description: "Filter by status (default: open)",
+                },
+            },
+        },
+        annotations: { readOnlyHint: true },
+        handler: async (args) => {
+            ensureDogfoodSchema();
+            const db = getDb();
+            const status = args.status ?? "open";
+            const cases = db
+                .prepare(`SELECT caseId, sessionId, symptom, rootCause, systemLayer, severity, frequency, status, createdAt
+           FROM failure_cases WHERE status = ? ORDER BY
+           CASE severity WHEN 'critical' THEN 0 WHEN 'high' THEN 1 WHEN 'medium' THEN 2 ELSE 3 END,
+           createdAt DESC`)
+                .all(status);
+            // Group by system layer
+            const byLayer = {};
+            for (const c of cases) {
+                if (!byLayer[c.systemLayer])
+                    byLayer[c.systemLayer] = [];
+                byLayer[c.systemLayer].push(c);
+            }
+            // Layer summary
+            const layerSummary = Object.entries(byLayer).map(([layer, items]) => ({
+                layer,
+                count: items.length,
+                criticalCount: items.filter((i) => i.severity === "critical").length,
+                highCount: items.filter((i) => i.severity === "high").length,
+            }));
+            return {
+                status,
+                totalCases: cases.length,
+                byLayer,
+                layerSummary: layerSummary.sort((a, b) => b.criticalCount - a.criticalCount || b.count - a.count),
+            };
+        },
+    },
+    // ─── 11. get_regression_gate ───────────────────────────────────
+    {
+        name: "get_regression_gate",
+        description: "Check if the 3 canonical loops pass. Returns per-loop scores, overall pass/fail, and regression detection.",
+        inputSchema: {
+            type: "object",
+            properties: {},
+        },
+        annotations: { readOnlyHint: true },
+        handler: async () => {
+            ensureDogfoodSchema();
+            const db = getDb();
+            const PASS_THRESHOLD = 3.5;
+            const loops = ["weekly_reset", "pre_delegation", "company_search"];
+            const results = {};
+            let allPassed = true;
+            let regressionsDetected = false;
+            for (const loop of loops) {
+                // Get last 5 judge scores for this loop type
+                const scores = db
+                    .prepare(`SELECT jr.overallScore
+             FROM judge_runs jr
+             JOIN dogfood_sessions ds ON jr.sessionId = ds.sessionId
+             WHERE ds.loopType = ?
+             ORDER BY jr.judgedAt DESC LIMIT 5`)
+                    .all(loop);
+                const trend = scores.map((s) => s.overallScore);
+                const latestScore = trend.length > 0 ? trend[0] : null;
+                const passed = latestScore != null && latestScore >= PASS_THRESHOLD;
+                if (!passed)
+                    allPassed = false;
+                // Check for regression: if latest score is lower than previous
+                if (trend.length >= 2 && trend[0] < trend[1]) {
+                    regressionsDetected = true;
+                }
+                results[loop] = { latestScore, trend, passed };
+            }
+            // Count open failures
+            const openFailures = db
+                .prepare(`SELECT COUNT(*) as count FROM failure_cases WHERE status = 'open'`)
+                .get();
+            return {
+                weeklyResetScore: results.weekly_reset.latestScore,
+                preDelegationScore: results.pre_delegation.latestScore,
+                companySearchScore: results.company_search.latestScore,
+                passed: allPassed,
+                regressions: regressionsDetected,
+                details: results,
+                openFailureCount: openFailures.count,
+            };
+        },
+    },
+    // ─── 12. get_repeat_cognition_metrics ──────────────────────────
+    {
+        name: "get_repeat_cognition_metrics",
+        description: "The key compound metric. Measures repeat question rate, manual reconstruction count, packet abandonment rate, delegation-without-restatement rate, and average time-to-useful-output.",
+        inputSchema: {
+            type: "object",
+            properties: {
+                daysSince: {
+                    type: "number",
+                    description: "Look back N days (default 30)",
+                },
+            },
+        },
+        annotations: { readOnlyHint: true },
+        handler: async (args) => {
+            ensureDogfoodSchema();
+            const db = getDb();
+            const since = Date.now() - (args.daysSince ?? 30) * 86400000;
+            // Total sessions in window
+            const totalSessions = db
+                .prepare(`SELECT COUNT(*) as count FROM dogfood_sessions WHERE startedAt >= ?`)
+                .get(since);
+            // Repeat questions in window
+            const repeatQuestions = db
+                .prepare(`SELECT COUNT(*) as count FROM repeat_question_events WHERE detectedAt >= ?`)
+                .get(since);
+            // Manual corrections in window
+            const manualCorrections = db
+                .prepare(`SELECT COUNT(*) as count FROM manual_correction_events WHERE detectedAt >= ?`)
+                .get(since);
+            // Packet ratings in window
+            const packetRatings = db
+                .prepare(`SELECT * FROM packet_usefulness_ratings WHERE ratedAt >= ?`)
+                .all(since);
+            const totalRated = packetRatings.length;
+            const abandoned = packetRatings.filter((r) => r.abandoned === 1).length;
+            const delegated = packetRatings.filter((r) => r.delegated === 1).length;
+            const totalHumanEdits = packetRatings.reduce((sum, r) => sum + (r.humanEditsCount ?? 0), 0);
+            // Average time-to-first-useful-output
+            const times = db
+                .prepare(`SELECT timeToFirstUsefulOutput FROM dogfood_sessions
+           WHERE startedAt >= ? AND timeToFirstUsefulOutput IS NOT NULL`)
+                .all(since);
+            const avgTimeToUsefulOutput = times.length > 0
+                ? Math.round(times.reduce((s, t) => s + t.timeToFirstUsefulOutput, 0) / times.length)
+                : null;
+            // Delegation without restatement rate
+            const delegationSessions = db
+                .prepare(`SELECT delegationSucceeded FROM dogfood_sessions
+           WHERE startedAt >= ? AND delegationSucceeded IS NOT NULL`)
+                .all(since);
+            const delegationSuccessRate = delegationSessions.length > 0
+                ? Math.round((delegationSessions.filter((s) => s.delegationSucceeded === 1).length /
+                    delegationSessions.length) *
+                    100)
+                : null;
+            return {
+                window: { days: args.daysSince ?? 30, since: new Date(since).toISOString() },
+                totalSessions: totalSessions.count,
+                repeatQuestionRate: totalSessions.count > 0
+                    ? Math.round((repeatQuestions.count / totalSessions.count) * 100) / 100
+                    : 0,
+                repeatQuestionCount: repeatQuestions.count,
+                manualCorrectionCount: manualCorrections.count,
+                packetAbandonmentRate: totalRated > 0 ? Math.round((abandoned / totalRated) * 100) / 100 : 0,
+                delegationWithoutRestatementRate: delegationSuccessRate,
+                averageTimeToUsefulOutputMs: avgTimeToUsefulOutput,
+                totalHumanEdits,
+                compoundScore: computeCompoundScore({
+                    repeatRate: totalSessions.count > 0
+                        ? repeatQuestions.count / totalSessions.count
+                        : 0,
+                    correctionRate: totalSessions.count > 0
+                        ? manualCorrections.count / totalSessions.count
+                        : 0,
+                    abandonmentRate: totalRated > 0 ? abandoned / totalRated : 0,
+                    delegationRate: delegationSuccessRate ?? 0,
+                }),
+            };
+        },
+    },
+];
+/* ------------------------------------------------------------------ */
+/*  Compound score: 0-100, higher is better                           */
+/* ------------------------------------------------------------------ */
+function computeCompoundScore(metrics) {
+    // Lower repeat/correction/abandonment is better → invert
+    // Higher delegation is better → keep
+    const repeatScore = Math.max(0, 100 - metrics.repeatRate * 100);
+    const correctionScore = Math.max(0, 100 - metrics.correctionRate * 50);
+    const abandonmentScore = Math.max(0, 100 - metrics.abandonmentRate * 100);
+    const delegationScore = metrics.delegationRate; // already 0-100
+    // Weighted average (repeat cognition weighted highest)
+    const score = repeatScore * 0.35 +
+        correctionScore * 0.25 +
+        abandonmentScore * 0.15 +
+        delegationScore * 0.25;
+    return Math.round(score * 100) / 100;
+}
+//# sourceMappingURL=dogfoodJudgeTools.js.map