npm - verifiable-thinking-mcp - Versions diffs - 0.4.0 - Mend

verifiable-thinking-mcp 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

package/LICENSE +21 -0
package/README.md +339 -0
package/package.json +75 -0
package/src/index.ts +38 -0
package/src/lib/cache.ts +246 -0
package/src/lib/compression.ts +804 -0
package/src/lib/compute/cache.ts +86 -0
package/src/lib/compute/classifier.ts +555 -0
package/src/lib/compute/confidence.ts +79 -0
package/src/lib/compute/context.ts +154 -0
package/src/lib/compute/extract.ts +200 -0
package/src/lib/compute/filter.ts +224 -0
package/src/lib/compute/index.ts +171 -0
package/src/lib/compute/math.ts +247 -0
package/src/lib/compute/patterns.ts +564 -0
package/src/lib/compute/registry.ts +145 -0
package/src/lib/compute/solvers/arithmetic.ts +65 -0
package/src/lib/compute/solvers/calculus.ts +249 -0
package/src/lib/compute/solvers/derivation-core.ts +371 -0
package/src/lib/compute/solvers/derivation-latex.ts +160 -0
package/src/lib/compute/solvers/derivation-mistakes.ts +1046 -0
package/src/lib/compute/solvers/derivation-simplify.ts +451 -0
package/src/lib/compute/solvers/derivation-transform.ts +620 -0
package/src/lib/compute/solvers/derivation.ts +67 -0
package/src/lib/compute/solvers/facts.ts +120 -0
package/src/lib/compute/solvers/formula.ts +728 -0
package/src/lib/compute/solvers/index.ts +36 -0
package/src/lib/compute/solvers/logic.ts +422 -0
package/src/lib/compute/solvers/probability.ts +307 -0
package/src/lib/compute/solvers/statistics.ts +262 -0
package/src/lib/compute/solvers/word-problems.ts +408 -0
package/src/lib/compute/types.ts +107 -0
package/src/lib/concepts.ts +111 -0
package/src/lib/domain.ts +731 -0
package/src/lib/extraction.ts +912 -0
package/src/lib/index.ts +122 -0
package/src/lib/judge.ts +260 -0
package/src/lib/math/ast.ts +842 -0
package/src/lib/math/index.ts +8 -0
package/src/lib/math/operators.ts +171 -0
package/src/lib/math/tokenizer.ts +477 -0
package/src/lib/patterns.ts +200 -0
package/src/lib/session.ts +825 -0
package/src/lib/think/challenge.ts +323 -0
package/src/lib/think/complexity.ts +504 -0
package/src/lib/think/confidence-drift.ts +507 -0
package/src/lib/think/consistency.ts +347 -0
package/src/lib/think/guidance.ts +188 -0
package/src/lib/think/helpers.ts +568 -0
package/src/lib/think/hypothesis.ts +216 -0
package/src/lib/think/index.ts +127 -0
package/src/lib/think/prompts.ts +262 -0
package/src/lib/think/route.ts +358 -0
package/src/lib/think/schema.ts +98 -0
package/src/lib/think/scratchpad-schema.ts +662 -0
package/src/lib/think/spot-check.ts +961 -0
package/src/lib/think/types.ts +93 -0
package/src/lib/think/verification.ts +260 -0
package/src/lib/tokens.ts +177 -0
package/src/lib/verification.ts +620 -0
package/src/prompts/index.ts +10 -0
package/src/prompts/templates.ts +336 -0
package/src/resources/index.ts +8 -0
package/src/resources/sessions.ts +196 -0
package/src/tools/compress.ts +138 -0
package/src/tools/index.ts +5 -0
package/src/tools/scratchpad.ts +2659 -0
package/src/tools/sessions.ts +144 -0

package/src/lib/think/route.ts ADDED Viewed

@@ -0,0 +1,358 @@
+/**
+ * Complexity-based routing for reasoning tasks
+ *
+ * This module encapsulates ALL routing logic that was previously in the benchmark runner.
+ * The runner should just call routeQuestion() and follow the instructions.
+ */
+import { detectMetaDomain } from "../domain.ts";
+import {
+  assessPromptComplexity,
+  type ComplexityResult,
+  getTrivialPrompt,
+  isTrivialQuestion,
+} from "./complexity.ts";
+import {
+  formatDomainExplanatoryPrompt,
+  getDomainSystemPrompt,
+  getSystemPrompt,
+  getUserPrompt,
+  getVerbosity,
+  type Verbosity,
+} from "./prompts.ts";
+import { needsSpotCheck } from "./spot-check.ts";
+// =============================================================================
+// EXPLANATORY QUESTION DETECTION
+// =============================================================================
+/**
+ * Decide whether a question mainly asks for an explanation or description
+ * rather than a single factual/numeric answer.
+ *
+ * Explanatory questions benefit from reasoning but NOT from spot-check
+ * verification, which is designed for factual/numeric answers.
+ *
+ * Matching is case-insensitive and ignores surrounding whitespace, so the
+ * start-anchored verbs ("explain", "describe", ...) are still recognised when
+ * the prompt begins with spaces or a newline.
+ *
+ * @param question The raw question text
+ * @returns true when an explanatory cue is present and no factual cue overrides it
+ */
+export function isExplanatoryQuestion(question: string): boolean {
+  // Trim first: the ^-anchored patterns below must see the first real word.
+  const lower = question.trim().toLowerCase();
+  // Primary indicators: explicit explanation requests
+  const explanatoryVerbs = [
+    /^explain\b/,
+    /\bexplain\s+(why|how|what|the|step)/,
+    /^describe\b/,
+    /\bdescribe\s+(how|what|the)/,
+    /^compare\b/,
+    /\bcompare\s+(and\s+)?contrast/,
+    /^discuss\b/,
+    /\bdiscuss\s+(why|how|the)/,
+    /^outline\b/,
+    /^summarize\b/,
+    /\bwhat\s+is\s+the\s+difference/,
+    /\bwhat\s+are\s+the\s+differences/,
+    /\bwhy\s+is\s+this\s+important/,
+    /\bwhy\s+does\s+this\s+matter/,
+  ];
+  // Without any explanatory cue the question cannot be explanatory
+  const hasExplanatoryVerb = explanatoryVerbs.some((p) => p.test(lower));
+  if (!hasExplanatoryVerb) return false;
+  // Exclusions: questions that look explanatory but have factual answers
+  const factualIndicators = [
+    /\bwhat\s+is\s+the\s+(value|answer|result|sum|product|number)\b/,
+    /\bhow\s+many\b/,
+    /\bhow\s+much\b/,
+    /\bcalculate\b/,
+    /\bcompute\b/,
+    /\bsolve\b/,
+    /=\s*\?/, // equation to solve
+  ];
+  const isFactual = factualIndicators.some((p) => p.test(lower));
+  return !isFactual;
+}
+// =============================================================================
+// OVERTHINKING DETECTOR
+// =============================================================================
+/** Verdict produced by the overthinking heuristic for one question. */
+export interface OverthinkingResult {
+  /** True when the question is considered prone to overthinking */
+  prone: boolean;
+  /** Tag naming the heuristic that matched; null when nothing matched */
+  reason: string | null;
+  /** "direct" = bypass extended reasoning; null = proceed normally */
+  recommendation: "direct" | null;
+}
+/**
+ * Flag questions where extended step-by-step reasoning tends to introduce
+ * mistakes that a direct answer would avoid.
+ *
+ * Four signals are computed on the lower-cased question:
+ * - a binary choice ("better to A or B", "switch or stay", ...)
+ * - a conditional setup ("first trigger: click", "given that ...", ...)
+ * - a compact prompt (under 200 characters) naming chambers, bullets or doors
+ * - a known game-theory scenario (Russian roulette, Monty Hall, ...)
+ *
+ * The question is reported as overthinking-prone when either
+ * (binary choice AND conditional setup AND compact-with-numbers) or
+ * (game-theory scenario AND binary choice) holds; the first combination
+ * takes precedence when both apply.
+ *
+ * Evidence recorded by the authors: benchmark case sota_russian_roulette was
+ * answered FIRE (correct) by the baseline and Spin (wrong) via the tool.
+ *
+ * @param question The raw question text
+ * @returns Verdict, matched reason tag (or null) and routing recommendation
+ */
+export function detectOverthinking(question: string): OverthinkingResult {
+  const text = question.toLowerCase();
+  const matchesAny = (patterns: RegExp[]): boolean => patterns.some((re) => re.test(text));
+  // Signal 1: binary decision, e.g. "First X happened. Better to A or B?"
+  const binaryChoice = matchesAny([
+    /better to\s+(\w+)(\s+\w+)?\s+(or|vs\.?)\s+(\w+)/i, // "better to SPIN again or FIRE"
+    /should you\s+(\w+)(\s+\w+)?\s+(or|vs\.?)\s+(\w+)/i, // "should you switch doors or stay"
+    /\b(spin|fire|switch|stay|fold|call|hit|stand)\s+(again\s+)?(or|vs\.?)\s+(spin|fire|switch|stay|fold|call|hit|stand)\b/i,
+    /\b(spin|fire|switch|stay)\b.*\b(or|vs\.?)\b.*\b(spin|fire|switch|stay)\b/i, // Loose match
+  ]);
+  // Signal 2: conditional probability setup, e.g. "Given X, what is Y?"
+  const conditionalSetup = matchesAny([
+    /first\s+(trigger|shot|draw|flip|roll).*?(click|empty|miss|heads|tails)/i, // "First trigger: click"
+    /given (that|the)\s+\w+/i, // "Given that X"
+    /after\s+(seeing|getting|drawing|rolling)\s+\w+/i, // "After seeing X"
+    /\w+\s+already\s+(happened|occurred|fired|clicked)/i, // "X already happened"
+  ]);
+  // Signal 3: short question (length taken from the original text) with a numeric setup
+  const compactNumeric =
+    question.length < 200 && /\d+[-\s]?chamber|\d+\s+bullet|\d+\s+door/i.test(text);
+  // Signal 4: well-known game-theory scenarios
+  const gameTheory = matchesAny([
+    /revolver|russian roulette/i,
+    /monty hall/i,
+    /prisoner'?s dilemma/i,
+    /\d+\s+doors?.*goat/i,
+    /envelope\s+paradox/i,
+  ]);
+  // Pick the first matching combination, if any
+  let reason: string | null = null;
+  if (binaryChoice && conditionalSetup && compactNumeric) {
+    reason = "binary_decision_with_conditional_probability";
+  } else if (gameTheory && binaryChoice) {
+    reason = "game_theory_binary_decision";
+  }
+  if (reason === null) {
+    return { prone: false, reason: null, recommendation: null };
+  }
+  return { prone: true, reason, recommendation: "direct" };
+}
+// =============================================================================
+// TYPES
+// =============================================================================
+/** The execution paths a question can be routed to. */
+export type RoutingPath =
+  | "trivial"
+  | "direct"
+  | "reasoning";
+/** Everything routeQuestion() hands back so the caller can execute the chosen path. */
+export interface RouteResult {
+  /** Path selected for this question */
+  path: RoutingPath;
+  /** Complexity tier from the assessment (kept for logging) */
+  tier: ComplexityResult["tier"];
+  /** Complexity score (0-1) */
+  score: number;
+  /** Verbosity level the prompts were built with */
+  verbosity: Verbosity;
+  /** LLM calls needed for this path; the type only admits 1 */
+  steps: 1;
+  /** True when the question was classified as explanatory */
+  isExplanatory: boolean;
+  /** Detected meta-domain (coding, scientific, educational, financial, general) */
+  metaDomain: string;
+  /**
+   * Whether the answer should be spot-checked. Only the reasoning path can set
+   * this to true; the trivial and direct paths always report false.
+   */
+  shouldSpotCheck: boolean;
+  /** Outcome of the overthinking heuristic */
+  overthinking: OverthinkingResult;
+  /** Pre-built prompts for the selected path */
+  prompts: RoutePrompts;
+}
+/** Prompt bundle returned alongside a routing decision. */
+export interface RoutePrompts {
+  /** System/user prompt pair for the main reasoning/answer call */
+  main: {
+    system: string;
+    user: string;
+  };
+}
+// =============================================================================
+// MAIN ROUTING FUNCTION
+// =============================================================================
+/**
+ * Pick the reasoning path for a question and pre-build the prompts for it.
+ *
+ * Decision order (first match wins):
+ * 1. trivial question           -> "trivial" path, trivial prompt
+ * 2. Low tier, no trap pattern  -> "direct" path, baseline (or explanatory) prompt
+ * 3. overthinking-prone         -> "direct" path, plain baseline prompt
+ * 4. everything else            -> "reasoning" path, reasoning (or explanatory) prompt
+ *
+ * Spot-checking is only ever requested on the reasoning path.
+ *
+ * @param question The question/problem to solve
+ * @returns The chosen path together with tier, score, verbosity, flags and prompts
+ */
+export function routeQuestion(question: string): RouteResult {
+  const complexity = assessPromptComplexity(question);
+  const trivial = isTrivialQuestion(question);
+  const explanatory = isExplanatoryQuestion(question);
+  const verbosity = getVerbosity(question);
+  const tier = complexity.tier;
+  const metaDomain = detectMetaDomain(question);
+  const overthinking = detectOverthinking(question);
+  const spotCheckResult = needsSpotCheck(question);
+  // Spot-check targets factual answers: skip it for trivial and explanatory questions.
+  const shouldSpotCheck = !trivial && !explanatory && spotCheckResult.required;
+  // Domain-aware prompt pair used for explanatory questions
+  const explanatoryPrompts = (): RoutePrompts["main"] => ({
+    system: getDomainSystemPrompt(metaDomain),
+    user: formatDomainExplanatoryPrompt(question, metaDomain),
+  });
+  // Standard prompt pair used for everything else
+  const standardPrompts = (type: "baseline" | "reasoning"): RoutePrompts["main"] => ({
+    system: getSystemPrompt(type, verbosity),
+    user: getUserPrompt(type, question, verbosity),
+  });
+  // Single place where the result is assembled; only path, spot-check flag and prompt vary.
+  const result = (
+    path: RoutingPath,
+    spotCheck: boolean,
+    main: RoutePrompts["main"],
+  ): RouteResult => ({
+    path,
+    tier, // Original tier is always reported, even when a bypass changes the path
+    score: complexity.score,
+    verbosity,
+    steps: 1,
+    isExplanatory: explanatory,
+    metaDomain,
+    shouldSpotCheck: spotCheck,
+    overthinking,
+    prompts: { main },
+  });
+  // 1. Trivial: minimal prompt, never spot-checked
+  if (trivial) {
+    return result("trivial", false, getTrivialPrompt(question));
+  }
+  // Trap detection: a Low-tier question with structural trap patterns is boosted to
+  // reasoning (evidence noted by the authors: trap_sunk_cost baseline=NO correct,
+  // tool=YES wrong when answered directly). Questions ABOUT cognitive biases
+  // ("meta_question" signal) describe traps without setting them and are not boosted.
+  const isMetaQuestion = complexity.explanation.intensity_signals.includes("meta_question");
+  const hasTrapPattern = !explanatory && !isMetaQuestion && spotCheckResult.required;
+  // 2. Low tier without traps: direct answer, no spot-check
+  if (tier === "Low" && !hasTrapPattern) {
+    return result(
+      "direct",
+      false,
+      explanatory ? explanatoryPrompts() : standardPrompts("baseline"),
+    );
+  }
+  // 3. Overthinking bypass: answer directly with the baseline prompt, no spot-check
+  // (evidence noted by the authors: sota_russian_roulette baseline=FIRE correct,
+  // tool=Spin wrong).
+  if (overthinking.prone && overthinking.recommendation === "direct") {
+    return result("direct", false, standardPrompts("baseline"));
+  }
+  // 4. Everything else, including Low tier boosted by a trap pattern: step-by-step reasoning
+  return result(
+    "reasoning",
+    shouldSpotCheck,
+    explanatory ? explanatoryPrompts() : standardPrompts("reasoning"),
+  );
+}
+// =============================================================================
+// CONVENIENCE: Get complexity info without full routing
+// =============================================================================
+/** Lightweight complexity summary returned by getComplexityInfo(). */
+export interface ComplexityInfo {
+  /** Complexity tier reported by the assessment */
+  tier: ComplexityResult["tier"];
+  /** Complexity score reported by the assessment */
+  score: number;
+  /** Result of the trivial-question check */
+  trivial: boolean;
+  /** Domain reported by the assessment, or null */
+  domain: string | null;
+  /** Intensity signals reported by the assessment */
+  signals: string[];
+}
+/**
+ * Summarise a question's complexity without choosing a path or building prompts.
+ *
+ * @param question The question/problem to assess
+ * @returns Tier, score, trivial flag, detected domain and intensity signals
+ */
+export function getComplexityInfo(question: string): ComplexityInfo {
+  const { tier, score, explanation } = assessPromptComplexity(question);
+  const trivial = isTrivialQuestion(question);
+  return {
+    tier,
+    score,
+    trivial,
+    domain: explanation.domain_detected,
+    signals: explanation.intensity_signals,
+  };
+}

package/src/lib/think/schema.ts ADDED Viewed

@@ -0,0 +1,98 @@
+/**
+ * Think Tool Schema - Rich structured reasoning schema
+ * Zod schemas and types for the think tool
+ */
+import { z } from "zod";
+// ============================================================================
+// SCHEMA - Rich structured reasoning schema
+// ============================================================================
+/** Structured form of a next action: a required action plus optional tool details. */
+const StructuredNextActionSchema = z
+  .object({
+    tool: z.string().optional().describe("Tool to use"),
+    action: z.string().describe("Specific action to perform"),
+    parameters: z.record(z.string(), z.unknown()).optional().describe("Tool parameters"),
+    expectedOutput: z.string().optional().describe("Expected result"),
+  })
+  .describe("Structured action with tool details");
+/** Next action: either a plain description string or a structured action object. */
+export const NextActionSchema = z.union([
+  z.string().describe("Simple description of next action"),
+  StructuredNextActionSchema,
+]);
+/** Integer >= 1, used wherever a step number is expected. */
+const stepIndex = () => z.number().int().min(1);
+/** Required free-text field with the given description. */
+const text = (description: string) => z.string().describe(description);
+/** Optional free-text field with the given description. */
+const optionalText = (description: string) => z.string().optional().describe(description);
+/** Boolean switch with a default value and description. */
+const flag = (fallback: boolean, description: string) =>
+  z.boolean().default(fallback).describe(description);
+/**
+ * Input schema for the think tool: one structured reasoning step plus the
+ * optional revision, branching, tool-tracking, session and feature-toggle fields.
+ */
+export const ThinkSchema = z.object({
+  // --- Core required fields ---
+  step_number: stepIndex().describe("Sequential step number starting from 1"),
+  estimated_total: stepIndex().describe("Estimated total steps needed"),
+  purpose: text(
+    "Step category: analysis, action, reflection, decision, summary, validation, exploration, hypothesis, correction, planning",
+  ),
+  context: text("What is already known. Include prior findings."),
+  thought: text("Current reasoning process"),
+  outcome: text("Expected or actual result from this step"),
+  next_action: NextActionSchema.describe("What to do next"),
+  rationale: text("Why this next action was chosen"),
+  // --- Completion ---
+  is_final_step: flag(false, "Mark as final step"),
+  // --- Confidence tracking ---
+  confidence: z.number().min(0).max(1).optional().describe("Confidence in this step (0-1)"),
+  uncertainty_notes: optionalText("Specific uncertainties or assumptions"),
+  // --- Revision support ---
+  revises_step: stepIndex().optional().describe("Step number being revised"),
+  revision_reason: optionalText("Why revising earlier step"),
+  // --- Branching support ---
+  branch_from: stepIndex().optional().describe("Step to branch from"),
+  branch_id: optionalText("Branch identifier"),
+  branch_name: optionalText("Human-readable branch name"),
+  // --- Dependencies ---
+  dependencies: z.array(stepIndex()).optional().describe("Steps this depends on"),
+  // --- Tool tracking ---
+  tools_used: z.array(z.string()).optional().describe("Tools used in this step"),
+  external_context: z
+    .record(z.string(), z.unknown())
+    .optional()
+    .describe("External data/tool outputs"),
+  // --- Session ---
+  session_id: optionalText("Session ID for multi-turn"),
+  // --- Guidance/verification extensions ---
+  guidance: flag(true, "Enable proactive guidance"),
+  verify: flag(false, "Run domain verification"),
+  domain: z.enum(["math", "logic", "code", "general"]).optional().describe("Domain hint"),
+  local_compute: flag(false, "Try local compute for math"),
+  // --- Local compute augmentation: inject computed values into thought ---
+  augment_compute: flag(
+    false,
+    "Extract and inject locally computed values into thought (math, logic, probability, facts)",
+  ),
+  system_prompt: optionalText(
+    "System prompt context for domain-aware filtering of compute augmentation",
+  ),
+  // --- Compression control ---
+  compression_level: z
+    .enum(["none", "auto", "aggressive"])
+    .default("auto")
+    .describe(
+      "Compression level: none (disabled), auto (entropy-based), aggressive (always compress long text)",
+    ),
+  // --- Baseline mode: pure pass-through, no features ---
+  baseline: flag(false, "Baseline mode: bypass all features"),
+});
+/** Parsed (output) type of ThinkSchema. */
+export type ThinkArgs = z.infer<typeof ThinkSchema>;