verifiable-thinking-mcp 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +339 -0
  3. package/package.json +75 -0
  4. package/src/index.ts +38 -0
  5. package/src/lib/cache.ts +246 -0
  6. package/src/lib/compression.ts +804 -0
  7. package/src/lib/compute/cache.ts +86 -0
  8. package/src/lib/compute/classifier.ts +555 -0
  9. package/src/lib/compute/confidence.ts +79 -0
  10. package/src/lib/compute/context.ts +154 -0
  11. package/src/lib/compute/extract.ts +200 -0
  12. package/src/lib/compute/filter.ts +224 -0
  13. package/src/lib/compute/index.ts +171 -0
  14. package/src/lib/compute/math.ts +247 -0
  15. package/src/lib/compute/patterns.ts +564 -0
  16. package/src/lib/compute/registry.ts +145 -0
  17. package/src/lib/compute/solvers/arithmetic.ts +65 -0
  18. package/src/lib/compute/solvers/calculus.ts +249 -0
  19. package/src/lib/compute/solvers/derivation-core.ts +371 -0
  20. package/src/lib/compute/solvers/derivation-latex.ts +160 -0
  21. package/src/lib/compute/solvers/derivation-mistakes.ts +1046 -0
  22. package/src/lib/compute/solvers/derivation-simplify.ts +451 -0
  23. package/src/lib/compute/solvers/derivation-transform.ts +620 -0
  24. package/src/lib/compute/solvers/derivation.ts +67 -0
  25. package/src/lib/compute/solvers/facts.ts +120 -0
  26. package/src/lib/compute/solvers/formula.ts +728 -0
  27. package/src/lib/compute/solvers/index.ts +36 -0
  28. package/src/lib/compute/solvers/logic.ts +422 -0
  29. package/src/lib/compute/solvers/probability.ts +307 -0
  30. package/src/lib/compute/solvers/statistics.ts +262 -0
  31. package/src/lib/compute/solvers/word-problems.ts +408 -0
  32. package/src/lib/compute/types.ts +107 -0
  33. package/src/lib/concepts.ts +111 -0
  34. package/src/lib/domain.ts +731 -0
  35. package/src/lib/extraction.ts +912 -0
  36. package/src/lib/index.ts +122 -0
  37. package/src/lib/judge.ts +260 -0
  38. package/src/lib/math/ast.ts +842 -0
  39. package/src/lib/math/index.ts +8 -0
  40. package/src/lib/math/operators.ts +171 -0
  41. package/src/lib/math/tokenizer.ts +477 -0
  42. package/src/lib/patterns.ts +200 -0
  43. package/src/lib/session.ts +825 -0
  44. package/src/lib/think/challenge.ts +323 -0
  45. package/src/lib/think/complexity.ts +504 -0
  46. package/src/lib/think/confidence-drift.ts +507 -0
  47. package/src/lib/think/consistency.ts +347 -0
  48. package/src/lib/think/guidance.ts +188 -0
  49. package/src/lib/think/helpers.ts +568 -0
  50. package/src/lib/think/hypothesis.ts +216 -0
  51. package/src/lib/think/index.ts +127 -0
  52. package/src/lib/think/prompts.ts +262 -0
  53. package/src/lib/think/route.ts +358 -0
  54. package/src/lib/think/schema.ts +98 -0
  55. package/src/lib/think/scratchpad-schema.ts +662 -0
  56. package/src/lib/think/spot-check.ts +961 -0
  57. package/src/lib/think/types.ts +93 -0
  58. package/src/lib/think/verification.ts +260 -0
  59. package/src/lib/tokens.ts +177 -0
  60. package/src/lib/verification.ts +620 -0
  61. package/src/prompts/index.ts +10 -0
  62. package/src/prompts/templates.ts +336 -0
  63. package/src/resources/index.ts +8 -0
  64. package/src/resources/sessions.ts +196 -0
  65. package/src/tools/compress.ts +138 -0
  66. package/src/tools/index.ts +5 -0
  67. package/src/tools/scratchpad.ts +2659 -0
  68. package/src/tools/sessions.ts +144 -0
@@ -0,0 +1,358 @@
1
+ /**
2
+ * Complexity-based routing for reasoning tasks
3
+ *
4
+ * This module encapsulates ALL routing logic that was previously in the benchmark runner.
5
+ * The runner should just call routeQuestion() and follow the instructions.
6
+ */
7
+
8
+ import { detectMetaDomain } from "../domain.ts";
9
+ import {
10
+ assessPromptComplexity,
11
+ type ComplexityResult,
12
+ getTrivialPrompt,
13
+ isTrivialQuestion,
14
+ } from "./complexity.ts";
15
+ import {
16
+ formatDomainExplanatoryPrompt,
17
+ getDomainSystemPrompt,
18
+ getSystemPrompt,
19
+ getUserPrompt,
20
+ getVerbosity,
21
+ type Verbosity,
22
+ } from "./prompts.ts";
23
+ import { needsSpotCheck } from "./spot-check.ts";
24
+
25
+ // =============================================================================
26
+ // EXPLANATORY QUESTION DETECTION
27
+ // =============================================================================
28
+
29
/**
 * Detect whether a question is primarily explanatory/descriptive.
 *
 * Such questions benefit from reasoning but NOT from spot-check verification,
 * since verification is designed for factual/numeric answers, not open-ended
 * explanations.
 *
 * Matching is case-insensitive and ignores surrounding whitespace, so the
 * start-anchored verbs ("explain", "describe", "compare", ...) are still
 * recognised when the question arrives indented or preceded by a newline.
 *
 * @param question Raw question text.
 * @returns `true` when an explanatory phrasing is present and none of the
 *   factual indicators (e.g. "how many", "calculate", "= ?") is found.
 */
export function isExplanatoryQuestion(question: string): boolean {
  // Trim first: the `^`-anchored patterns below would otherwise miss input
  // such as "  Describe photosynthesis".
  const lower = question.trim().toLowerCase();

  // Primary indicators: explicit explanation requests
  const explanatoryVerbs: readonly RegExp[] = [
    /^explain\b/,
    /\bexplain\s+(why|how|what|the|step)/,
    /^describe\b/,
    /\bdescribe\s+(how|what|the)/,
    /^compare\b/,
    /\bcompare\s+(and\s+)?contrast/,
    /^discuss\b/,
    /\bdiscuss\s+(why|how|the)/,
    /^outline\b/,
    /^summarize\b/,
    /\bwhat\s+is\s+the\s+difference/,
    /\bwhat\s+are\s+the\s+differences/,
    /\bwhy\s+is\s+this\s+important/,
    /\bwhy\s+does\s+this\s+matter/,
  ];

  // Without an explanatory phrasing there is nothing more to check.
  if (!explanatoryVerbs.some((p) => p.test(lower))) return false;

  // Exclusions: questions that look explanatory but have factual answers
  const factualIndicators: readonly RegExp[] = [
    /\bwhat\s+is\s+the\s+(value|answer|result|sum|product|number)\b/,
    /\bhow\s+many\b/,
    /\bhow\s+much\b/,
    /\bcalculate\b/,
    /\bcompute\b/,
    /\bsolve\b/,
    /=\s*\?/, // equation to solve
  ];

  return !factualIndicators.some((p) => p.test(lower));
}
73
+
74
+ // =============================================================================
75
+ // OVERTHINKING DETECTOR
76
+ // =============================================================================
77
+
78
export interface OverthinkingResult {
  /** True when the question matched one of the overthinking rules */
  prone: boolean;
  /** Label of the rule that matched, or null when none did */
  reason: string | null;
  /** "direct" to bypass step-by-step reasoning, null to proceed normally */
  recommendation: "direct" | null;
}

/**
 * Flag questions where extended step-by-step reasoning tends to introduce
 * errors that a direct answer would avoid.
 *
 * Two rules are applied, in this order:
 *
 * 1. binary decision + conditional setup + short question with a numeric
 *    setup ("6-chamber", "2 bullets", "3 doors")
 *    -> reason "binary_decision_with_conditional_probability"
 * 2. known game-theory problem + binary decision
 *    -> reason "game_theory_binary_decision"
 *
 * Evidence recorded by the original authors: benchmark case
 * sota_russian_roulette had baseline=FIRE (correct) and tool=Spin (wrong).
 *
 * @param question Raw question text; matched case-insensitively.
 * @returns A result with `prone: true` and `recommendation: "direct"` when a
 *   rule matched, otherwise all-null/false.
 */
export function detectOverthinking(question: string): OverthinkingResult {
  const text = question.toLowerCase();
  const matchesAny = (patterns: readonly RegExp[]): boolean =>
    patterns.some((pattern) => pattern.test(text));

  // "First X happened. Better to A or B?"
  const binaryDecision = matchesAny([
    /better to\s+(\w+)(\s+\w+)?\s+(or|vs\.?)\s+(\w+)/i, // "better to SPIN again or FIRE"
    /should you\s+(\w+)(\s+\w+)?\s+(or|vs\.?)\s+(\w+)/i, // "should you switch doors or stay"
    /\b(spin|fire|switch|stay|fold|call|hit|stand)\s+(again\s+)?(or|vs\.?)\s+(spin|fire|switch|stay|fold|call|hit|stand)\b/i,
    /\b(spin|fire|switch|stay)\b.*\b(or|vs\.?)\b.*\b(spin|fire|switch|stay)\b/i, // Loose match
  ]);

  // "Given X, what is Y?" or "First X happened, then..."
  const conditionalSetup = matchesAny([
    /first\s+(trigger|shot|draw|flip|roll).*?(click|empty|miss|heads|tails)/i, // "First trigger: click"
    /given (that|the)\s+\w+/i, // "Given that X"
    /after\s+(seeing|getting|drawing|rolling)\s+\w+/i, // "After seeing X"
    /\w+\s+already\s+(happened|occurred|fired|clicked)/i, // "X already happened"
  ]);

  // Short questions with a specific numeric setup are often well-defined.
  const compactWithNumbers =
    question.length < 200 && /\d+[-\s]?chamber|\d+\s+bullet|\d+\s+door/i.test(text);

  // Named game-theory / probability puzzles.
  const gameTheory = matchesAny([
    /revolver|russian roulette/i,
    /monty hall/i,
    /prisoner'?s dilemma/i,
    /\d+\s+doors?.*goat/i,
    /envelope\s+paradox/i,
  ]);

  let reason: string | null = null;
  if (binaryDecision && conditionalSetup && compactWithNumbers) {
    reason = "binary_decision_with_conditional_probability";
  } else if (gameTheory && binaryDecision) {
    reason = "game_theory_binary_decision";
  }

  if (reason === null) {
    return { prone: false, reason: null, recommendation: null };
  }
  return { prone: true, reason, recommendation: "direct" };
}
166
+
167
+ // =============================================================================
168
+ // TYPES
169
+ // =============================================================================
170
+
171
/** The three execution paths `routeQuestion` can select. */
export type RoutingPath = "trivial" | "direct" | "reasoning";

/** Prompts handed back to the caller alongside the routing decision. */
export interface RoutePrompts {
  /** System/user prompt pair for the single LLM call */
  main: { system: string; user: string };
}

/** Everything the caller needs to execute the chosen path. */
export interface RouteResult {
  /** Selected execution path */
  path: RoutingPath;
  /** Complexity tier, reported for logging */
  tier: ComplexityResult["tier"];
  /** Complexity score (0-1) */
  score: number;
  /** Verbosity level the prompts were built with */
  verbosity: Verbosity;
  /** Number of LLM calls this path requires (always 1) */
  steps: 1;
  /** Whether the question was classified as explanatory */
  isExplanatory: boolean;
  /** Detected meta-domain (coding, scientific, educational, financial, general) */
  metaDomain: string;
  /**
   * Whether the caller should spot-check the answer. `routeQuestion` always
   * sets this to false on the trivial and direct paths; on the reasoning path
   * it reflects trap-pattern detection for non-explanatory questions.
   */
  shouldSpotCheck: boolean;
  /** Result of the overthinking detector for this question */
  overthinking: OverthinkingResult;
  /** Pre-built prompts for the chosen path */
  prompts: RoutePrompts;
}
200
+
201
+ // =============================================================================
202
+ // MAIN ROUTING FUNCTION
203
+ // =============================================================================
204
+
205
/**
 * Choose the reasoning path for a question and pre-build its prompts.
 *
 * Decision order (first match wins):
 * 1. "trivial"   - `isTrivialQuestion` is true; uses `getTrivialPrompt`.
 * 2. "direct"    - tier is "Low" and no trap pattern applies.
 * 3. "direct"    - the overthinking detector recommends a direct answer.
 * 4. "reasoning" - everything else, including Low-tier questions that were
 *                  boosted because a trap pattern was detected.
 *
 * Spot-checking is only ever requested on the "reasoning" path.
 *
 * @param question The question/problem to solve
 * @returns The routing decision, logging metadata and the prompts to send
 */
export function routeQuestion(question: string): RouteResult {
  const complexity = assessPromptComplexity(question);
  const trivial = isTrivialQuestion(question);
  const explanatory = isExplanatoryQuestion(question);
  const verbosity = getVerbosity(question);
  const metaDomain = detectMetaDomain(question);
  const overthinking = detectOverthinking(question);
  const spotCheckResult = needsSpotCheck(question);

  // Every path returns the same metadata; only the path, the spot-check flag
  // and the main prompt differ.
  const buildResult = (
    path: RoutingPath,
    shouldSpotCheck: boolean,
    main: RoutePrompts["main"],
  ): RouteResult => ({
    path,
    tier: complexity.tier,
    score: complexity.score,
    verbosity,
    steps: 1,
    isExplanatory: explanatory,
    metaDomain,
    shouldSpotCheck,
    overthinking,
    prompts: { main },
  });

  // Standard prompts for non-explanatory questions
  const standardPrompts = (type: "baseline" | "reasoning") => ({
    system: getSystemPrompt(type, verbosity),
    user: getUserPrompt(type, question, verbosity),
  });

  // Explanatory questions get domain-aware prompts (token-light steering);
  // everything else gets the standard prompt of the requested type.
  const promptsFor = (type: "baseline" | "reasoning") =>
    explanatory
      ? {
          system: getDomainSystemPrompt(metaDomain),
          user: formatDomainExplanatoryPrompt(question, metaDomain),
        }
      : standardPrompts(type);

  // === TRIVIAL: minimal prompt, never spot-checked ===
  if (trivial) {
    return buildResult("trivial", false, getTrivialPrompt(question));
  }

  // === TRAP DETECTION ===
  // Some questions have structural traps that need reasoning even at Low tier.
  // Evidence: trap_sunk_cost baseline=NO (correct), tool=YES (wrong) when using direct.
  // Meta-questions (questions ABOUT cognitive biases rather than ones that
  // trigger them) describe traps but do not set them, so they are excluded.
  const isMetaQuestion = complexity.explanation.intensity_signals.includes("meta_question");
  const hasTrapPattern = !explanatory && !isMetaQuestion && spotCheckResult.required;

  // === LOW: direct answer, unless a trap pattern boosts it to reasoning ===
  if (complexity.tier === "Low" && !hasTrapPattern) {
    return buildResult("direct", false, promptsFor("baseline"));
  }

  // === OVERTHINKING BYPASS: direct answer despite a higher tier or a trap ===
  // Evidence: sota_russian_roulette baseline=FIRE (correct), tool=Spin (wrong).
  // The original tier is still reported for logging.
  // NOTE(review): unlike the other non-trivial paths this one always uses the
  // standard baseline prompt, even when the question is explanatory - confirm
  // that this is intended.
  if (overthinking.prone && overthinking.recommendation === "direct") {
    return buildResult("direct", false, standardPrompts("baseline"));
  }

  // === REASONING: step-by-step prompt ===
  // Spot-check only non-explanatory questions that need it (meta-questions are
  // not excluded here, matching the trap check's looser sibling condition).
  const shouldSpotCheck = !explanatory && spotCheckResult.required;
  return buildResult("reasoning", shouldSpotCheck, promptsFor("reasoning"));
}
333
+
334
+ // =============================================================================
335
+ // CONVENIENCE: Get complexity info without full routing
336
+ // =============================================================================
337
+
338
/** Lightweight complexity summary returned by `getComplexityInfo`. */
export interface ComplexityInfo {
  /** Complexity tier reported by `assessPromptComplexity` */
  tier: ComplexityResult["tier"];
  /** Complexity score reported by `assessPromptComplexity` */
  score: number;
  /** Result of `isTrivialQuestion` for the same question */
  trivial: boolean;
  /** `explanation.domain_detected` from the complexity assessment */
  domain: string | null;
  /** `explanation.intensity_signals` from the complexity assessment */
  signals: string[];
}

/**
 * Assess a question's complexity without building prompts or picking a route.
 *
 * @param question The question/problem to assess
 * @returns Tier, score, triviality flag, detected domain and intensity signals
 */
export function getComplexityInfo(question: string): ComplexityInfo {
  const { tier, score, explanation } = assessPromptComplexity(question);
  const trivial = isTrivialQuestion(question);

  return {
    tier,
    score,
    trivial,
    domain: explanation.domain_detected,
    signals: explanation.intensity_signals,
  };
}
@@ -0,0 +1,98 @@
1
+ /**
2
+ * Think Tool Schema - Rich structured reasoning schema
3
+ * Zod schemas and types for the think tool
4
+ */
5
+
6
+ import { z } from "zod";
7
+
8
+ // ============================================================================
9
+ // SCHEMA - Rich structured reasoning schema
10
+ // ============================================================================
11
+
12
// Object form of a next action: an action plus optional tool details.
const structuredNextAction = z
  .object({
    tool: z.string().optional().describe("Tool to use"),
    action: z.string().describe("Specific action to perform"),
    parameters: z.record(z.string(), z.unknown()).optional().describe("Tool parameters"),
    expectedOutput: z.string().optional().describe("Expected result"),
  })
  .describe("Structured action with tool details");

/**
 * Next action for a think step: either a plain description string or a
 * structured object naming the action and, optionally, the tool to use.
 */
export const NextActionSchema = z.union([
  z.string().describe("Simple description of next action"),
  structuredNextAction,
]);
23
+
24
// Step numbers are 1-based integers wherever they appear in the schema.
const stepNumber = () => z.number().int().min(1);

// Free-form string-keyed bag of arbitrary values.
const unknownRecord = () => z.record(z.string(), z.unknown());

/**
 * Input schema for the think tool: one structured reasoning step plus the
 * optional revision, branching, tooling and feature-toggle fields.
 */
export const ThinkSchema = z.object({
  // --- Core required fields ---
  step_number: stepNumber().describe("Sequential step number starting from 1"),
  estimated_total: stepNumber().describe("Estimated total steps needed"),
  purpose: z
    .string()
    .describe(
      "Step category: analysis, action, reflection, decision, summary, validation, exploration, hypothesis, correction, planning",
    ),
  context: z.string().describe("What is already known. Include prior findings."),
  thought: z.string().describe("Current reasoning process"),
  outcome: z.string().describe("Expected or actual result from this step"),
  next_action: NextActionSchema.describe("What to do next"),
  rationale: z.string().describe("Why this next action was chosen"),

  // --- Completion ---
  is_final_step: z.boolean().default(false).describe("Mark as final step"),

  // --- Confidence tracking ---
  confidence: z.number().min(0).max(1).optional().describe("Confidence in this step (0-1)"),
  uncertainty_notes: z.string().optional().describe("Specific uncertainties or assumptions"),

  // --- Revision support ---
  revises_step: stepNumber().optional().describe("Step number being revised"),
  revision_reason: z.string().optional().describe("Why revising earlier step"),

  // --- Branching support ---
  branch_from: stepNumber().optional().describe("Step to branch from"),
  branch_id: z.string().optional().describe("Branch identifier"),
  branch_name: z.string().optional().describe("Human-readable branch name"),

  // --- Dependencies ---
  dependencies: z.array(stepNumber()).optional().describe("Steps this depends on"),

  // --- Tool tracking ---
  tools_used: z.array(z.string()).optional().describe("Tools used in this step"),
  external_context: unknownRecord().optional().describe("External data/tool outputs"),

  // --- Session ---
  session_id: z.string().optional().describe("Session ID for multi-turn"),

  // --- Guidance/verification extensions ---
  guidance: z.boolean().default(true).describe("Enable proactive guidance"),
  verify: z.boolean().default(false).describe("Run domain verification"),
  domain: z.enum(["math", "logic", "code", "general"]).optional().describe("Domain hint"),
  local_compute: z.boolean().default(false).describe("Try local compute for math"),

  // --- Local compute augmentation: inject computed values into thought ---
  augment_compute: z
    .boolean()
    .default(false)
    .describe(
      "Extract and inject locally computed values into thought (math, logic, probability, facts)",
    ),
  system_prompt: z
    .string()
    .optional()
    .describe("System prompt context for domain-aware filtering of compute augmentation"),

  // --- Compression control ---
  compression_level: z
    .enum(["none", "auto", "aggressive"])
    .default("auto")
    .describe(
      "Compression level: none (disabled), auto (entropy-based), aggressive (always compress long text)",
    ),

  // --- Baseline mode: pure pass-through, no features ---
  baseline: z.boolean().default(false).describe("Baseline mode: bypass all features"),
});

/** Parsed think-tool arguments, as inferred from `ThinkSchema`. */
export type ThinkArgs = z.infer<typeof ThinkSchema>;