verifiable-thinking-mcp 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/LICENSE +21 -0
  2. package/README.md +339 -0
  3. package/package.json +75 -0
  4. package/src/index.ts +38 -0
  5. package/src/lib/cache.ts +246 -0
  6. package/src/lib/compression.ts +804 -0
  7. package/src/lib/compute/cache.ts +86 -0
  8. package/src/lib/compute/classifier.ts +555 -0
  9. package/src/lib/compute/confidence.ts +79 -0
  10. package/src/lib/compute/context.ts +154 -0
  11. package/src/lib/compute/extract.ts +200 -0
  12. package/src/lib/compute/filter.ts +224 -0
  13. package/src/lib/compute/index.ts +171 -0
  14. package/src/lib/compute/math.ts +247 -0
  15. package/src/lib/compute/patterns.ts +564 -0
  16. package/src/lib/compute/registry.ts +145 -0
  17. package/src/lib/compute/solvers/arithmetic.ts +65 -0
  18. package/src/lib/compute/solvers/calculus.ts +249 -0
  19. package/src/lib/compute/solvers/derivation-core.ts +371 -0
  20. package/src/lib/compute/solvers/derivation-latex.ts +160 -0
  21. package/src/lib/compute/solvers/derivation-mistakes.ts +1046 -0
  22. package/src/lib/compute/solvers/derivation-simplify.ts +451 -0
  23. package/src/lib/compute/solvers/derivation-transform.ts +620 -0
  24. package/src/lib/compute/solvers/derivation.ts +67 -0
  25. package/src/lib/compute/solvers/facts.ts +120 -0
  26. package/src/lib/compute/solvers/formula.ts +728 -0
  27. package/src/lib/compute/solvers/index.ts +36 -0
  28. package/src/lib/compute/solvers/logic.ts +422 -0
  29. package/src/lib/compute/solvers/probability.ts +307 -0
  30. package/src/lib/compute/solvers/statistics.ts +262 -0
  31. package/src/lib/compute/solvers/word-problems.ts +408 -0
  32. package/src/lib/compute/types.ts +107 -0
  33. package/src/lib/concepts.ts +111 -0
  34. package/src/lib/domain.ts +731 -0
  35. package/src/lib/extraction.ts +912 -0
  36. package/src/lib/index.ts +122 -0
  37. package/src/lib/judge.ts +260 -0
  38. package/src/lib/math/ast.ts +842 -0
  39. package/src/lib/math/index.ts +8 -0
  40. package/src/lib/math/operators.ts +171 -0
  41. package/src/lib/math/tokenizer.ts +477 -0
  42. package/src/lib/patterns.ts +200 -0
  43. package/src/lib/session.ts +825 -0
  44. package/src/lib/think/challenge.ts +323 -0
  45. package/src/lib/think/complexity.ts +504 -0
  46. package/src/lib/think/confidence-drift.ts +507 -0
  47. package/src/lib/think/consistency.ts +347 -0
  48. package/src/lib/think/guidance.ts +188 -0
  49. package/src/lib/think/helpers.ts +568 -0
  50. package/src/lib/think/hypothesis.ts +216 -0
  51. package/src/lib/think/index.ts +127 -0
  52. package/src/lib/think/prompts.ts +262 -0
  53. package/src/lib/think/route.ts +358 -0
  54. package/src/lib/think/schema.ts +98 -0
  55. package/src/lib/think/scratchpad-schema.ts +662 -0
  56. package/src/lib/think/spot-check.ts +961 -0
  57. package/src/lib/think/types.ts +93 -0
  58. package/src/lib/think/verification.ts +260 -0
  59. package/src/lib/tokens.ts +177 -0
  60. package/src/lib/verification.ts +620 -0
  61. package/src/prompts/index.ts +10 -0
  62. package/src/prompts/templates.ts +336 -0
  63. package/src/resources/index.ts +8 -0
  64. package/src/resources/sessions.ts +196 -0
  65. package/src/tools/compress.ts +138 -0
  66. package/src/tools/index.ts +5 -0
  67. package/src/tools/scratchpad.ts +2659 -0
  68. package/src/tools/sessions.ts +144 -0
@@ -0,0 +1,961 @@
1
+ /**
2
+ * Generalized Spot-Check for Trap Questions
3
+ *
4
+ * NO LLM calls - pure structural heuristics for <1ms overhead.
5
+ * Detects trap patterns by STRUCTURE, not by problem NAME.
6
+ *
7
+ * Design principles:
8
+ * 1. O(n) single-pass detection
9
+ * 2. Structural pattern matching (not "bat and ball", but "additive system")
10
+ * 3. Synthesizes warnings based on mathematical structure
11
+ * 4. False positives OK - warns rather than corrects
12
+ */
13
+
14
/** Outcome of running a spot-check against a candidate answer. */
export interface SpotCheckResult {
  /** True when no trap pattern was detected in the answer. */
  passed: boolean;
  /** Description of the suspected trap; null when nothing was flagged. */
  warning: string | null;
  /** Nudge asking the LLM to reconsider; null when nothing was flagged. */
  hint: string | null;
  /** Structural trap category (never a problem name); null when nothing was flagged. */
  trapType: string | null;
  /** Confidence in the detection, on a 0-1 scale. */
  confidence: number;
}
26
+
27
/** Verdict of the cheap pre-filter that decides whether a question deserves a spot-check. */
export interface NeedsSpotCheckResult {
  /** True when a spot-check is recommended for the question. */
  required: boolean;
  /** Confidence score on a 0-1 scale. */
  score: number;
  /** Structural categories detected in the question. */
  categories: string[];
}
35
+
36
+ // =============================================================================
37
+ // HELPERS
38
+ // =============================================================================
39
+
40
/**
 * Collect every unsigned decimal literal (digits with an optional fractional
 * part) found in the text, in order of appearance.
 *
 * @param text - Arbitrary text to scan.
 * @returns The parsed numbers; an empty array when the text contains none.
 */
function extractNumbers(text: string): number[] {
  const found: number[] = [];
  for (const token of text.match(/\d+(?:\.\d+)?/g) ?? []) {
    found.push(parseFloat(token));
  }
  return found;
}
46
+
47
/**
 * Parse the first unsigned decimal literal that appears in the text.
 *
 * @param text - Arbitrary text to scan.
 * @returns The first number found, or null when the text contains no digits.
 */
function extractFloat(text: string): number | null {
  const firstToken = /(\d+(?:\.\d+)?)/.exec(text)?.[1];
  return firstToken === undefined ? null : parseFloat(firstToken);
}
53
+
54
/**
 * Tally how many times each distinct number occurs.
 *
 * @param nums - Numbers to count.
 * @returns A map from number to occurrence count, keyed in first-seen order.
 */
function countNumbers(nums: number[]): Map<number, number> {
  return nums.reduce(
    (tally, value) => tally.set(value, (tally.get(value) ?? 0) + 1),
    new Map<number, number>(),
  );
}
62
+
63
+ // =============================================================================
64
+ // FAST STRUCTURAL DETECTION (O(n) single-pass)
65
+ // =============================================================================
66
+
67
/**
 * Fast structural pre-filter: decide whether a question looks like a known
 * trap shape and therefore deserves a spot-check.
 *
 * Each signal is a set of regular expressions that must ALL match the
 * lower-cased question. Every matching signal adds its weight to the score and
 * its category to the result. Signals describe problem STRUCTURE (e.g.
 * "total ... more than"), not problem names.
 *
 * Gaps between keywords use `[\s\S]` rather than `.`, so a question whose
 * sentences are split across several lines is still recognised.
 *
 * @param question - The raw question text.
 * @returns `required` is true when the summed weight reaches 0.6; `score` is
 *   the summed weight capped at 1; `categories` lists matched signals in
 *   evaluation order.
 */
export function needsSpotCheck(question: string): NeedsSpotCheckResult {
  const lower = question.toLowerCase();

  // Order matters: categories are reported, and weights summed, in this order.
  const signals: ReadonlyArray<{ category: string; weight: number; all: readonly RegExp[] }> = [
    {
      // x + y = T, x - y = D → trap is answering T - D
      category: "additive_system",
      weight: 0.8,
      all: [/(?:total|sum|together|cost)[\s\S]*?(?:more|less)\s*than/i],
    },
    {
      // exponential growth → trap is linear interpolation
      category: "nonlinear_growth",
      weight: 0.8,
      all: [/doubles?|triples?|exponential/i, /half|quarter|fraction|percent/i],
    },
    {
      // N machines, N minutes, N widgets → trap is scaling linearly
      category: "rate_pattern",
      weight: 0.6,
      all: [
        /\d+\s*(?:machines?|workers?|people|printers?)[\s\S]*?(?:minutes?|hours?|days?|seconds?)/i,
      ],
    },
    {
      // different speeds over the same distance → trap is arithmetic mean
      category: "harmonic_mean",
      weight: 0.9,
      all: [
        /average\s*speed|speed[\s\S]*average/i,
        /(?:round\s*trip|return|back|there and back)/i,
      ],
    },
    {
      // streak + probability of next outcome → trap is gambler's fallacy
      category: "independence",
      weight: 0.7,
      all: [/(?:row|consecutive|straight|times)/i, /(?:probability|chance|likely|odds)/i],
    },
    {
      // minimum to guarantee → trap is underestimating the worst case
      category: "pigeonhole",
      weight: 0.7,
      all: [/(?:minimum|least|fewest)/i, /(?:guarantee|ensure|certain|must)/i],
    },
    {
      // accurate test + rare condition → trap is ignoring the base rate
      category: "base_rate",
      weight: 0.75,
      all: [
        /(?:test|positive|negative)/i,
        /(?:1\s*in\s*\d+|rare|uncommon|\d+%\s*(?:of|have))/i,
      ],
    },
    {
      // n! + zeros/factors → trap is simple division
      category: "factorial_counting",
      weight: 0.7,
      all: [/\d+!/i, /(?:zero|factor|digit)/i],
    },
    {
      // clock hands overlapping → trap is assuming 12 overlaps
      category: "clock_overlap",
      weight: 0.8,
      all: [/clock/i, /(?:overlap|coincide|same position)/i],
    },
    {
      // conditional setup → trap is ignoring the conditioning
      category: "conditional_probability",
      weight: 0.6,
      all: [/(?:given|if|knowing|after)/i, /(?:probability|chance|what[\s\S]*odds)/i],
    },
    {
      // detailed description → trap is "more detail = more likely"
      category: "conjunction_fallacy",
      weight: 0.8,
      all: [
        /(?:more likely|which[\s\S]*probable|what[\s\S]*probability)/i,
        /(?:and|both|also)/i,
        /(?:bank teller|feminist|active|personality|description)/i,
      ],
    },
    {
      // revealed option → trap is thinking 50/50
      category: "monty_hall",
      weight: 0.85,
      all: [
        /(?:door|box|curtain|envelope)/i,
        /(?:switch|stay|change|keep)/i,
        /(?:reveal|open|show)/i,
      ],
    },
    {
      // number shown before an estimation task
      category: "anchoring",
      weight: 0.6,
      all: [
        /(?:estimate|guess|how (?:many|much|long))/i,
        /(?:spin|wheel|number|digit|wrote|shown)/i,
      ],
    },
    {
      // past investment + decision about future action
      category: "sunk_cost",
      weight: 0.75,
      all: [
        /(?:already|spent|invested|paid|cost)/i,
        /(?:should|continue|keep|stop|quit|abandon|walk away)/i,
      ],
    },
    {
      // gain OR loss wording, plus options labelled A/B
      category: "framing_effect",
      weight: 0.7,
      all: [
        /(?:save|saved|survive|lives?|die|death|lost|killed)/i,
        /(?:program|option|choice|treatment|plan) [ab]/i,
      ],
    },
  ];

  const categories: string[] = [];
  let score = 0;
  for (const signal of signals) {
    if (signal.all.every((pattern) => pattern.test(lower))) {
      score += signal.weight;
      categories.push(signal.category);
    }
  }

  return {
    required: score >= 0.6,
    score: Math.min(1, score),
    categories,
  };
}
223
+
224
+ // =============================================================================
225
+ // GENERALIZED SPOT-CHECK (structural analysis)
226
+ // =============================================================================
227
+
228
/**
 * Run the generalized spot-check on an answer and report whether it matches an
 * "intuitive but wrong" pattern for the structure of the question.
 *
 * Text-based checkers run first because they do not need a numeric answer.
 * Numeric checkers run whenever the answer contains a number. Each numeric
 * checker validates the question numbers it needs, so a question written
 * without digits (e.g. "five times in a row") is no longer skipped for the
 * checkers that only inspect the answer.
 *
 * @param question - The raw question text.
 * @param answer - The candidate answer text.
 * @returns The first trap detected, or a passing result when none fires.
 */
export function spotCheck(question: string, answer: string): SpotCheckResult {
  const lower = question.toLowerCase();
  const aNum = extractFloat(answer);

  // Text-based checks (don't require numbers in the answer)
  const textCheck =
    checkConjunctionFallacy(lower, answer) ||
    checkMontyHall(lower, aNum, answer) ||
    checkSunkCost(lower, answer) ||
    checkFramingEffect(lower, answer);
  if (textCheck) return textCheck;

  // Every remaining check compares against a numeric answer.
  if (aNum === null) return passed();

  const qNums = extractNumbers(question);

  // Structural checks, in order of specificity
  return (
    checkAdditiveSystem(lower, qNums, aNum) ||
    checkNonlinearGrowth(lower, qNums, aNum) ||
    checkRatePattern(lower, qNums, aNum) ||
    checkHarmonicMean(lower, qNums, aNum) ||
    checkIndependence(lower, aNum) ||
    checkPigeonhole(lower, qNums, aNum) ||
    checkBaseRate(lower, qNums, aNum) ||
    checkFactorialZeros(lower, qNums, aNum) ||
    checkClockOverlap(lower, aNum) ||
    passed()
  );
}
265
+
266
+ // =============================================================================
267
+ // STRUCTURAL CHECKERS
268
+ // =============================================================================
269
+
270
/** Build the neutral "nothing detected" result (all trap fields null, confidence 0.5). */
function passed(): SpotCheckResult {
  const clean: SpotCheckResult = {
    passed: true,
    warning: null,
    hint: null,
    trapType: null,
    confidence: 0.5,
  };
  return clean;
}
273
+
274
/**
 * Build a failing result describing a detected trap.
 *
 * @param type - Structural trap category, stored as `trapType`.
 * @param warning - Description of the suspected mistake.
 * @param hint - Guidance toward the correct reasoning.
 * @param confidence - Detection confidence (0-1).
 */
function trap(type: string, warning: string, hint: string, confidence: number): SpotCheckResult {
  const detected: SpotCheckResult = {
    passed: false,
    warning,
    hint,
    trapType: type,
    confidence,
  };
  return detected;
}
277
+
278
/**
 * ADDITIVE SYSTEM: x + y = Total, x = y + Diff.
 * Trap: answering (Total - Diff) instead of (Total - Diff) / 2.
 *
 * The two largest numbers in the question are taken as Total and Diff. The
 * answer is compared against the trap and correct values in the same unit and
 * with a x100 / ÷100 shift (dollars vs cents).
 *
 * The keyword guard tolerates line breaks between "total/sum/together/cost"
 * and "more/less than". The value shown in the hint is rounded to 12
 * significant digits so binary floating-point noise (1.1 - 1 =
 * 0.10000000000000009) does not leak into the message.
 *
 * @param q - Lower-cased question text.
 * @param nums - Numbers extracted from the question.
 * @param ans - Numeric answer under test.
 * @returns A trap result when the answer equals the un-halved difference, else null.
 */
function checkAdditiveSystem(q: string, nums: number[], ans: number): SpotCheckResult | null {
  if (!/(?:total|sum|together|cost)[\s\S]*?(?:more|less)\s*than/i.test(q)) return null;
  if (nums.length < 2) return null;

  // Total is assumed to be the largest number, the difference the runner-up.
  const [total, diff] = [...nums].sort((a, b) => b - a);
  if (total === undefined || diff === undefined) return null;

  const trapAnswer = total - diff;
  const correctAnswer = trapAnswer / 2;

  // Same unit, answer in cents vs target in dollars, or the reverse.
  const matchesInAnyUnit = (target: number): boolean =>
    Math.abs(ans - target) < 0.01 ||
    Math.abs(ans - target * 100) < 0.5 ||
    Math.abs(ans / 100 - target) < 0.01;

  if (!matchesInAnyUnit(trapAnswer) || matchesInAnyUnit(correctAnswer)) return null;

  // Strip floating-point residue for display only; comparisons above use raw values.
  const shown = parseFloat(correctAnswer.toPrecision(12));

  return trap(
    "additive_system",
    `Potential trap: ${ans} might be (${total} - ${diff}) without solving the system`,
    `This is a system: x + y = ${total}, x - y = ${diff}. Solve: y = (${total} - ${diff}) / 2 = ${shown}`,
    0.85,
  );
}
320
+
321
/**
 * NON-LINEAR GROWTH: a quantity that doubles (or triples) every period.
 * Trap: answering Time / 2 for "when was it half full" instead of Time - 1.
 *
 * @param q - Lower-cased question text.
 * @param nums - Numbers extracted from the question.
 * @param ans - Numeric answer under test.
 * @returns A trap result when the answer is within 0.5 of half the largest
 *   integer above 5 found in the question, else null.
 */
function checkNonlinearGrowth(q: string, nums: number[], ans: number): SpotCheckResult | null {
  const mentionsGrowth = /doubles?|triples?/i.test(q);
  const mentionsFraction = /half|quarter/i.test(q);
  if (!(mentionsGrowth && mentionsFraction)) return null;

  // The time value is taken to be the largest integer greater than 5.
  let time: number | null = null;
  for (const candidate of nums) {
    if (candidate > 5 && Number.isInteger(candidate)) {
      time = time === null ? candidate : Math.max(time, candidate);
    }
  }
  if (time === null) return null;

  // Linear thinking lands on exactly half the total time.
  const answeredHalfTime = Math.abs(ans - time / 2) < 0.5;
  if (!answeredHalfTime) return null;

  return trap(
    "nonlinear_growth",
    `Potential trap: ${ans} is ${time}/2, but exponential growth doesn't work linearly`,
    `If something doubles each period and is full at time ${time}, it was half-full at time ${time - 1}`,
    0.9,
  );
}
347
+
348
/**
 * RATE PATTERN: N machines, N minutes, N widgets.
 * Trap: answering M when asked how long M machines need to make M widgets.
 *
 * The time-unit guard accepts minutes, hours, days and seconds, the same units
 * `needsSpotCheck` uses for this category, so "5 workers, 5 days, 5 walls"
 * questions are checked too. The hint names the unit found in the question
 * (minutes are abbreviated to "min").
 *
 * @param q - Lower-cased question text.
 * @param nums - Numbers extracted from the question.
 * @param ans - Numeric answer under test.
 * @returns A trap result when the answer equals a repeated "target" number, else null.
 */
function checkRatePattern(q: string, nums: number[], ans: number): SpotCheckResult | null {
  if (!/machines?|workers?|people|printers?/i.test(q)) return null;
  const unitMatch = /minutes?|hours?|days?|seconds?/i.exec(q);
  if (!unitMatch) return null;

  // Occurrence count per distinct number, in first-seen order.
  const counts = new Map<number, number>();
  for (const n of nums) {
    counts.set(n, (counts.get(n) ?? 0) + 1);
  }

  // "Setup" number: the first value appearing at least 3 times (N machines, N units, N widgets).
  const setup = [...counts].find(([, count]) => count >= 3)?.[0];
  if (setup === undefined) return null;

  // "Target" numbers: a different value appearing at least twice (M machines, M widgets).
  const answeredTarget = nums.some(
    (n) => n !== setup && (counts.get(n) ?? 0) >= 2 && Math.abs(ans - n) < 0.1,
  );
  if (!answeredTarget) return null;

  const unit = /^minutes?$/i.test(unitMatch[0]) ? "min" : unitMatch[0];

  return trap(
    "rate_pattern",
    `Potential trap: ${ans} assumes time scales with quantity`,
    `If ${setup} machines make ${setup} widgets in ${setup} ${unit}, each machine makes 1 widget in ${setup} ${unit}. More machines = same time, more output.`,
    0.85,
  );
}
386
+
387
/**
 * HARMONIC MEAN: average speed over a round trip.
 * Trap: answering the arithmetic mean (S1 + S2) / 2 instead of 2*S1*S2 / (S1 + S2).
 *
 * @param q - Lower-cased question text.
 * @param nums - Numbers extracted from the question.
 * @param ans - Numeric answer under test.
 * @returns A trap result when the answer sits on the arithmetic mean of the
 *   first two plausible speeds and more than 1 away from their harmonic mean,
 *   else null.
 */
function checkHarmonicMean(q: string, nums: number[], ans: number): SpotCheckResult | null {
  const aboutAverageSpeed = /average\s*speed|speed.*average/i.test(q);
  const aboutRoundTrip = /(?:round\s*trip|return|back)/i.test(q);
  if (!aboutAverageSpeed || !aboutRoundTrip) return null;

  // Values in (0, 200) are treated as speeds; larger ones are probably distances.
  const [s1, s2] = nums.filter((n) => n > 0 && n < 200);
  if (s1 === undefined || s2 === undefined || s1 === s2) return null;

  const arithmetic = (s1 + s2) / 2;
  const harmonic = (2 * s1 * s2) / (s1 + s2);

  const onArithmeticMean = Math.abs(ans - arithmetic) < 0.5;
  const awayFromHarmonic = Math.abs(ans - harmonic) > 1;
  if (!(onArithmeticMean && awayFromHarmonic)) return null;

  return trap(
    "harmonic_mean",
    `Potential trap: ${ans} is the arithmetic mean (${s1}+${s2})/2`,
    `For round trips over fixed distance, use harmonic mean: 2×${s1}×${s2}/(${s1}+${s2}) = ${harmonic.toFixed(1)}`,
    0.9,
  );
}
419
+
420
/**
 * INDEPENDENCE: probability of the next outcome after a streak.
 * Trap: gambler's fallacy (believing the streak changes the next outcome).
 *
 * The subject guard matches "die"/"dice" as whole words. The earlier pattern
 * `dice?` matched the letters "dic" inside unrelated words such as "predict"
 * and never matched "die". For die/roll questions that do not mention a coin
 * or flip, an answer of about 1/6 (16.7% or 0.167) is accepted alongside 50%,
 * so the fair-die answer is not reported as a fallacy.
 *
 * @param q - Lower-cased question text.
 * @param ans - Numeric answer under test.
 * @returns A trap result when the answer lies strictly between 0 and 100 and
 *   is not an accepted fair probability, else null.
 */
function checkIndependence(q: string, ans: number): SpotCheckResult | null {
  const coinContext = /coin|flip/i.test(q);
  const dieContext = /\bdic?e\b|roll/i.test(q);
  if (!coinContext && !dieContext) return null;
  if (!/(?:row|consecutive|straight|times)/i.test(q)) return null;
  if (!/(?:probability|chance|likely)/i.test(q)) return null;

  // Fair coin: 50% or 0.5.
  const isHalf = Math.abs(ans - 50) < 2 || Math.abs(ans - 0.5) < 0.02;
  // Fair six-sided die: about 16.67% or 0.1667.
  const isSixth = Math.abs(ans - 100 / 6) < 1 || Math.abs(ans - 1 / 6) < 0.01;

  if (isHalf || (dieContext && !coinContext && isSixth)) return null;

  if (ans > 0 && ans < 100) {
    return trap(
      "independence",
      `Potential gambler's fallacy: previous outcomes don't affect independent events`,
      `Each flip/roll is independent. Past results don't change future probability.`,
      0.8,
    );
  }

  return null;
}
443
+
444
/**
 * PIGEONHOLE: minimum number of items that guarantees a match.
 * Trap: underestimating the worst case (answering 2), or overshooting.
 *
 * With N categories the guaranteed-match answer is N + 1. The overshoot
 * warning therefore fires only when the answer exceeds N + 1, so the correct
 * answer itself is never reported. N is approximated by the largest question
 * number below 100; when the question has no such number the overshoot check
 * is skipped.
 *
 * @param q - Lower-cased question text.
 * @param nums - Numbers extracted from the question.
 * @param ans - Numeric answer under test.
 * @returns A trap result for an answer of 2 or an overshoot, else null.
 */
function checkPigeonhole(q: string, nums: number[], ans: number): SpotCheckResult | null {
  if (!/(?:minimum|least|fewest)/i.test(q)) return null;
  if (!/(?:guarantee|ensure|certain)/i.test(q)) return null;
  if (!/(?:match|pair|same)/i.test(q)) return null;

  // Two items can always come from two different categories.
  if (ans === 2) {
    return trap(
      "pigeonhole",
      `Potential trap: 2 items could all be different`,
      `Pigeonhole principle: with N categories, you need N+1 items to guarantee a match.`,
      0.85,
    );
  }

  const candidates = nums.filter((n) => n < 100);
  if (candidates.length === 0) return null;
  const maxCategory = Math.max(...candidates);

  // Answer seems too high: more than one item beyond the category count.
  if (maxCategory > 2 && ans > maxCategory + 1) {
    return trap(
      "pigeonhole",
      `Potential trap: you don't need majority, just one more than categories`,
      `With ${maxCategory} categories (if that's the count), you need at most ${maxCategory + 1} to guarantee a match.`,
      0.7,
    );
  }

  return null;
}
478
+
479
/**
 * BASE RATE: accurate test + rare condition.
 * Trap: ignoring low prevalence (answering ~accuracy instead of the Bayes result).
 *
 * Prevalence is read from a "1 in N" / "1 out of N" phrase; the leading 1 must
 * be a standalone number, so "21 in 100" is not read as "1 in 100". Accuracy
 * is the first question number in [90, 100). 100 is excluded because it is
 * commonly the prevalence denominator ("1 in 100"), and a 100%-accurate test
 * can never produce this trap anyway (the Bayes result would be 1).
 *
 * @param q - Lower-cased question text.
 * @param nums - Numbers extracted from the question.
 * @param ans - Numeric answer under test; values <= 1 are read as fractions.
 * @returns A trap result when the answer exceeds 80% while the Bayes posterior
 *   is below 20%, else null.
 */
function checkBaseRate(q: string, nums: number[], ans: number): SpotCheckResult | null {
  if (!/(?:test|positive|negative)/i.test(q)) return null;
  if (!/(?:probability|chance)/i.test(q)) return null;

  // Base rate pattern: "1 in N"
  const rateMatch = q.match(/\b1\s*(?:in|out of)\s*(\d+)/i);
  if (!rateMatch || !rateMatch[1]) return null;

  const denominator = parseInt(rateMatch[1], 10);
  if (denominator < 1) return null; // "1 in 0" carries no usable prevalence
  const baseRate = 1 / denominator;

  // Accuracy: a high percentage that is not the "1 in 100" denominator.
  const firstPct = nums.find((n) => n >= 90 && n < 100);
  if (firstPct === undefined) return null;
  const accuracy = firstPct / 100;

  // Bayes: P(disease | positive), with sensitivity = specificity = accuracy.
  const pPosGivenDisease = accuracy;
  const pPosGivenNoDisease = 1 - accuracy;
  const pPositive = pPosGivenDisease * baseRate + pPosGivenNoDisease * (1 - baseRate);
  const bayesResult = (pPosGivenDisease * baseRate) / pPositive;

  // Normalize the answer to a percentage.
  const ansPct = ans > 1 ? ans : ans * 100;

  // Trap: the answer is close to the test accuracy, not the Bayes result.
  if (ansPct > 80 && bayesResult < 0.2) {
    return trap(
      "base_rate",
      `Potential base rate neglect: ${ansPct.toFixed(0)}% ignores the low prevalence (1 in ${denominator})`,
      `Apply Bayes: P(disease|positive) ≈ ${(bayesResult * 100).toFixed(0)}%, not ${(accuracy * 100).toFixed(0)}%`,
      0.85,
    );
  }

  return null;
}
522
+
523
/**
 * FACTORIAL ZEROS: number of trailing zeros in n!.
 * Trap: a single division (n/5 or n/10) instead of counting every factor of 5.
 *
 * The factorial argument must be a safe integer. An absurdly long digit string
 * parses to Infinity, and without this guard the power-of-5 loop would never
 * terminate.
 *
 * @param q - Lower-cased question text.
 * @param _nums - Unused; kept for signature parity with the other checkers.
 * @param ans - Numeric answer under test.
 * @returns A trap result when the answer equals floor(n/5) or floor(n/10) and
 *   that value differs from the true count, else null.
 */
function checkFactorialZeros(q: string, _nums: number[], ans: number): SpotCheckResult | null {
  if (!/trailing.*zero|zero.*trailing/i.test(q)) return null;

  // Factorial argument
  const factMatch = q.match(/(\d+)!/);
  if (!factMatch || !factMatch[1]) return null;
  const n = parseInt(factMatch[1], 10);
  if (!Number.isSafeInteger(n)) return null;

  // Legendre's formula: sum of floor(n / 5^k) over all k >= 1.
  let correct = 0;
  for (let power = 5; power <= n; power *= 5) {
    correct += Math.floor(n / power);
  }

  // Common traps: n/5 (misses higher powers of 5) or n/10.
  const simpleWrong = Math.floor(n / 5);
  const veryWrong = Math.floor(n / 10);

  if (Math.abs(ans - simpleWrong) < 0.5 && simpleWrong !== correct) {
    return trap(
      "factorial_counting",
      `Potential trap: ${ans} only counts single factors of 5`,
      `Count ALL factors of 5: ⌊n/5⌋ + ⌊n/25⌋ + ⌊n/125⌋ + ... = ${correct}`,
      0.85,
    );
  }

  if (Math.abs(ans - veryWrong) < 0.5 && veryWrong !== correct) {
    return trap(
      "factorial_counting",
      `Potential trap: trailing zeros come from factors of 5 (not 10)`,
      `Since 2s are abundant, count factors of 5: ${correct}`,
      0.8,
    );
  }

  return null;
}
568
+
569
/**
 * CLOCK OVERLAP: how many times the clock hands overlap.
 * Trap: answering 12 (or 24) instead of 11 (or 22).
 *
 * @param q - Lower-cased question text.
 * @param ans - Numeric answer under test.
 * @returns A trap result when the question names a 12- or 24-hour span and the
 *   answer is within 0.5 of that same number, else null.
 */
function checkClockOverlap(q: string, ans: number): SpotCheckResult | null {
  const applies =
    /clock/i.test(q) && /(?:overlap|coincide)/i.test(q) && /(?:how many|times)/i.test(q);
  if (!applies) return null;

  // Checked in order: the 12-hour span first, then the 24-hour span.
  const spans = [
    {
      period: /12\s*hours?/i,
      naive: 12,
      warning: `Potential trap: hands overlap 11 times in 12 hours, not 12`,
      hint: `The 12:00 overlap is shared. Hands overlap every ~65.45 minutes → 11 times per 12 hours.`,
    },
    {
      period: /24\s*hours?/i,
      naive: 24,
      warning: `Potential trap: hands overlap 22 times in 24 hours, not 24`,
      hint: `11 overlaps per 12-hour period × 2 = 22 total.`,
    },
  ];

  for (const span of spans) {
    if (span.period.test(q) && Math.abs(ans - span.naive) < 0.5) {
      return trap("clock_overlap", span.warning, span.hint, 0.9);
    }
  }

  return null;
}
600
+
601
/**
 * CONJUNCTION FALLACY: Linda-problem structure.
 * Trap: judging a specific conjunction more likely than the general case.
 *
 * Conjunction keywords are matched as whole words, so words that merely
 * contain "and" (e.g. "understand", "thousand") no longer count as choosing
 * the conjunction option.
 *
 * @param q - Lower-cased question text.
 * @param answer - Raw answer text.
 * @returns A trap result when the answer picks a standalone "B" or contains a
 *   conjunction word, else null.
 */
function checkConjunctionFallacy(q: string, answer: string): SpotCheckResult | null {
  if (!/(?:more likely|which.*probable|what.*probability)/i.test(q)) return null;

  // The question must offer an "and" option (a conjunction).
  const conjunctionWord = /\b(?:and|both|as well)\b/i;
  if (!conjunctionWord.test(q)) return null;

  // The answer chooses the conjunction: "B", "option B", "bank teller and ...", etc.
  const choosesConjunction = /\bb\b/i.test(answer) || conjunctionWord.test(answer);
  if (!choosesConjunction) return null;

  return trap(
    "conjunction_fallacy",
    `Potential conjunction fallacy: P(A and B) ≤ P(A) always`,
    `A conjunction cannot be more probable than either of its parts. The more specific option is LESS likely.`,
    0.85,
  );
}
631
+
632
/**
 * MONTY HALL: switch vs stay.
 * Trap: believing the odds are 50/50 once a door has been revealed.
 *
 * @param q - Lower-cased question text.
 * @param ans - First number found in the answer, or null when there is none.
 * @param answer - Raw answer text.
 * @returns A trap result for a ~50% probability answer or a stay/indifferent
 *   strategy answer, else null.
 */
function checkMontyHall(q: string, ans: number | null, answer: string): SpotCheckResult | null {
  // Recognised by name OR by structure (container + decision + reveal).
  const namedDirectly = /monty\s*hall/i.test(q);
  const structural =
    /(?:door|box|curtain)/i.test(q) &&
    /(?:switch|stay|change|keep)/i.test(q) &&
    /(?:reveal|open|show|goat)/i.test(q);
  if (!namedDirectly && !structural) return null;

  // Probability question answered with 50% / 0.5
  const asksProbability = /(?:probability|chance)/i.test(q);
  if (asksProbability && ans !== null) {
    const saysEvenOdds = Math.abs(ans - 50) < 2 || Math.abs(ans - 0.5) < 0.02;
    if (saysEvenOdds) {
      return trap(
        "monty_hall",
        `Potential Monty Hall trap: it's NOT 50/50 after a door is revealed`,
        `Switching wins 2/3 of the time, staying wins 1/3. The reveal gives you information.`,
        0.9,
      );
    }
  }

  // Strategy question answered with "stay" or "doesn't matter"
  const asksStrategy = /(?:should|better|strategy)/i.test(q);
  const prefersStaying = /(?:stay|keep|doesn't matter|50.?50|same|either)/i.test(
    answer.toLowerCase(),
  );
  if (asksStrategy && prefersStaying) {
    return trap(
      "monty_hall",
      `Potential Monty Hall trap: switching is actually the better strategy`,
      `Switching wins 2/3 of the time. The host's reveal changes the odds.`,
      0.85,
    );
  }

  return null;
}
674
+
675
/**
 * SUNK COST FALLACY: a decision driven by past investment.
 * Trap: continuing because of what was already spent rather than future value.
 *
 * @param q - Lower-cased question text.
 * @param answer - Raw answer text.
 * @returns A trap result when the answer leans on past investment, opens with
 *   a bare "yes/continue/keep/finish" lacking a forward-looking justification,
 *   or uses loss-lament wording; else null.
 */
function checkSunkCost(q: string, answer: string): SpotCheckResult | null {
  // Structure: past investment + a decision about the future
  const hasPastInvestment = /(?:already|spent|invested|paid|cost)/i.test(q);
  const hasDecision = /(?:should|continue|keep|stop|quit|abandon|walk away|finish)/i.test(q);
  if (!hasPastInvestment || !hasDecision) return null;

  const ansLower = answer.toLowerCase();

  // Past investment cited as the justification
  const citesPastSpend =
    /(?:already spent|already invested|can't waste|too much invested|come this far|so much into)/i.test(
      ansLower,
    );
  // "Continue because/since ... spent"
  const continuesBecauseOfSpend =
    /(?:continue|keep going|finish).*(?:because|since).*(?:spent|invested|paid)/i.test(ansLower);
  // "Yes, continue" with no forward-looking justification
  const opensWithContinue = /^(?:yes|continue|keep|finish)/i.test(ansLower.trim());
  const justifiesByFuture =
    /(?:future value|expected return|profitable going forward|worth it regardless)/i.test(
      ansLower,
    );
  // Explicit sunk-cost wording
  const lamentsLoss = /(?:wasted|thrown away|lost|for nothing)/i.test(ansLower);

  const trapped =
    citesPastSpend ||
    continuesBecauseOfSpend ||
    (opensWithContinue && !justifiesByFuture) ||
    lamentsLoss;
  if (!trapped) return null;

  return trap(
    "sunk_cost",
    `Potential sunk cost fallacy: past investment shouldn't influence future decisions`,
    `Sunk costs are gone - focus on whether FUTURE benefits justify FUTURE costs. What's already spent is irrelevant.`,
    0.8,
  );
}
715
+
716
/**
 * FRAMING EFFECT: a decision swayed by gain vs loss presentation.
 * Trap: picking an option because of how it is worded, without checking
 * whether the options are mathematically equivalent.
 *
 * Modelled on the classic Asian Disease Problem:
 * - gain frame: "200 saved" vs "1/3 chance all saved, 2/3 none saved"
 * - loss frame: "400 die" vs "1/3 none die, 2/3 all die"
 *
 * @param q - Lower-cased question text.
 * @param answer - Raw answer text.
 * @returns A trap result when the question is purely gain- or loss-framed and
 *   the answer picks a standalone A/B without acknowledging equivalence, else null.
 */
function checkFramingEffect(q: string, answer: string): SpotCheckResult | null {
  // Structure: gain or loss wording plus options labelled A/B
  const gainWords = /(?:save|saved|survive|lives?)/i.test(q);
  const lossWords = /(?:die|death|lost|killed)/i.test(q);
  const labelledOptions = /(?:program|option|choice|treatment|plan) [ab]/i.test(q);
  if (!((gainWords || lossWords) && labelledOptions)) return null;

  // Exactly one frame must be present: a mixed or frameless question is not flagged.
  const saysSaved = /(?:save|saved|survive)/i.test(q);
  const saysDie = /(?:die|death|killed)/i.test(q);
  if (saysSaved === saysDie) return null;

  const ansLower = answer.toLowerCase();

  // An answer that recognises the equivalence is fine.
  const acknowledgesFraming =
    /(?:equivalent|same|framing|mathematically|expected value|doesn't matter)/i.test(ansLower);
  if (acknowledgesFraming) return null;

  // Only a bare pick of option A or B is flagged.
  if (!/\b[ab]\b/i.test(ansLower)) return null;

  return trap(
    "framing_effect",
    `Potential framing effect: check if options are mathematically equivalent`,
    `The way choices are presented (lives saved vs lives lost) often triggers different intuitive responses to identical expected outcomes. Calculate expected values to decide rationally.`,
    0.7,
  );
}
767
+
768
+ // =============================================================================
769
+ // TRAP PRIMING (proactive guidance before reasoning)
770
+ // =============================================================================
771
+
772
/**
 * Tuning knobs for smart priming.
 *
 * Rationale recorded from the benchmark analysis:
 * - priming with a single trap produced 0 regressions
 * - priming with several traps produced 1 regression (Monty Hall confusion)
 */
export interface PrimeOptions {
  /** Detection confidence required before priming is triggered (default: 0.7). */
  minConfidence?: number;

  /**
   * Upper bound on how many traps are merged into one prompt
   * (default: 1, i.e. single-trap only).
   * Use 1 for the conservative (safest) mode and 2-3 for the aggressive mode;
   * the benchmark showed that multi-trap priming can confuse models.
   */
  maxCombined?: number;

  /** Trap types that are never primed (the model handles them well without help). */
  excludeTypes?: string[];
}
792
+
793
/**
 * Conservative preset: at most one trap per prompt, the mode the benchmark
 * found to be safe. Used as the base that caller-supplied options override.
 */
export const PRIME_DEFAULTS: Required<PrimeOptions> = {
  // Only prime on reasonably confident detections.
  minConfidence: 0.7,
  // Single-trap prompts only.
  maxCombined: 1,
  // Nothing is excluded by default.
  excludeTypes: [],
};
799
+
800
/**
 * Aggressive preset: lower confidence bar and up to two traps per prompt.
 * Use with caution, since combined prompts may cause regressions.
 */
export const PRIME_AGGRESSIVE: Required<PrimeOptions> = {
  // Accept weaker detections than the default preset.
  minConfidence: 0.6,
  // Allow two traps to be merged into one prompt.
  maxCombined: 2,
  // Nothing is excluded.
  excludeTypes: [],
};
806
+
807
/** Outcome of analysing a question for traps before any reasoning happens. */
export interface PrimeResult {
  /** True when a priming prompt should be injected. */
  shouldPrime: boolean;

  /** Every trap type that was detected, before any filtering. */
  trapTypes: string[];

  /** The trap types that were actually used for priming, after filtering. */
  primedTypes: string[];

  /** Short nudge to prepend (<20 tokens for a single trap, <50 when combined). */
  primingPrompt: string | null;

  /** The individual prompts, one per trap that contributed. */
  allPrompts: string[];

  /** Detection confidence in the range 0-1. */
  confidence: number;

  /**
   * Why priming was skipped (confidence too low, excluded type, etc.),
   * or `null` when priming was not skipped.
   */
  skippedReason: string | null;
}
823
+
824
/**
 * One-line nudges, keyed by trap type. Each is meant to stay under 20 tokens
 * so that it can be prepended to a question cheaply.
 */
const PRIMING_PROMPTS: Record<string, string> = {
  // --- Algebra, rates and growth ---
  additive_system: "⚠️ System of equations detected. Define variables x,y and solve algebraically.",
  nonlinear_growth: "⚠️ Exponential growth. Work backwards from the end state, not forwards.",
  rate_pattern: "⚠️ Rate problem. Calculate rate per unit first, then scale.",
  harmonic_mean: "⚠️ Round trip speed. Use harmonic mean: 2ab/(a+b), not arithmetic.",

  // --- Probability and counting ---
  independence: "⚠️ Independent events. Past outcomes don't affect future probability.",
  pigeonhole: "⚠️ Guarantee problem. Consider worst case: need categories + 1.",
  base_rate: "⚠️ Rare condition + test. Apply Bayes' theorem with base rate.",
  factorial_counting: "⚠️ Factorial zeros. Count factors of 5: ⌊n/5⌋ + ⌊n/25⌋ + ...",
  clock_overlap: "⚠️ Clock hands overlap 11 times per 12 hours, not 12.",
  conditional_probability: "⚠️ Conditional probability. Use P(A|B) = P(A∩B)/P(B).",
  conjunction_fallacy: "⚠️ Conjunction trap. P(A and B) ≤ P(A) always.",
  monty_hall: "⚠️ Revealed information changes odds. Switching wins 2/3.",

  // --- Judgement and decision biases ---
  anchoring: "⚠️ Ignore irrelevant numbers. Base estimate on actual data only.",
  sunk_cost: "⚠️ Sunk cost trap. Past spending is irrelevant to future decisions.",
  framing_effect: "⚠️ Check framing. Calculate expected values for both options.",
};
842
+
843
/**
 * Analyze a question BEFORE reasoning to detect potential cognitive traps and
 * build a short priming prompt that injects preventive guidance.
 *
 * Smart priming, based on the benchmark analysis:
 * - single-trap priming had 0 regressions across 41 questions
 * - multi-trap priming caused 1 regression (model confusion)
 * - default: conservative single-trap mode (maxCombined=1)
 *
 * Detection is delegated to `needsSpotCheck`; this function itself only
 * filters the detected categories and looks up / joins prompt strings.
 *
 * Option handling:
 * - a bare number is treated as `maxCombined` (backward compatibility)
 * - any option that is missing or explicitly `undefined` falls back to
 *   `PRIME_DEFAULTS`
 * - a negative `maxCombined` is treated as 0
 *
 * @param question - The question to analyze
 * @param options - Smart priming configuration (or number for backward compat)
 * @returns A result with `shouldPrime: true` and a prompt, or a result with
 *   `shouldPrime: false` and a `skippedReason` explaining why not.
 */
export function primeQuestion(question: string, options?: PrimeOptions | number): PrimeResult {
  // Backward compatibility: number = maxCombined
  const overrides: PrimeOptions = typeof options === "number" ? { maxCombined: options } : (options ?? {});

  // Resolve field by field: spreading `overrides` over the defaults would let an
  // explicit `undefined` (e.g. `{ excludeTypes: undefined }`) erase a default.
  const opts: Required<PrimeOptions> = {
    minConfidence: overrides.minConfidence ?? PRIME_DEFAULTS.minConfidence,
    maxCombined: overrides.maxCombined ?? PRIME_DEFAULTS.maxCombined,
    excludeTypes: overrides.excludeTypes ?? PRIME_DEFAULTS.excludeTypes,
  };

  const detection = needsSpotCheck(question);

  // Every "do not prime" outcome has the same shape; only the reason and the
  // reported trap types differ.
  const skipped = (skippedReason: string, trapTypes: string[]): PrimeResult => ({
    shouldPrime: false,
    trapTypes,
    primedTypes: [],
    primingPrompt: null,
    allPrompts: [],
    confidence: detection.score,
    skippedReason,
  });

  // No traps detected
  if (!detection.required || detection.categories.length === 0) {
    return skipped("no_traps_detected", []);
  }

  // Confidence below threshold
  if (detection.score < opts.minConfidence) {
    return skipped(
      `confidence_below_threshold:${detection.score.toFixed(2)}<${opts.minConfidence}`,
      detection.categories,
    );
  }

  // Filter out excluded trap types
  const filteredCategories = detection.categories.filter((cat) => !opts.excludeTypes.includes(cat));

  // All traps excluded
  if (filteredCategories.length === 0) {
    return skipped(`all_types_excluded:${detection.categories.join(",")}`, detection.categories);
  }

  // Keep at most maxCombined traps. The clamp matters because a negative end
  // index would make slice() mean "all but the last n".
  const trapsToInclude = filteredCategories.slice(0, Math.max(0, opts.maxCombined));

  // Record a trap as primed only if a prompt exists for it, so that
  // primedTypes and allPrompts always line up one-to-one.
  const primedTypes: string[] = [];
  const allPrompts: string[] = [];
  for (const trapType of trapsToInclude) {
    const prompt = PRIMING_PROMPTS[trapType];
    if (prompt) {
      primedTypes.push(trapType);
      allPrompts.push(prompt);
    }
  }

  // No prompts available for detected traps
  if (allPrompts.length === 0) {
    return skipped(`no_prompts_for_types:${trapsToInclude.join(",")}`, detection.categories);
  }

  // Single trap: use its prompt as-is. Several traps: numbered list with the
  // leading warning marker removed from each entry.
  const primingPrompt: string | null =
    allPrompts.length === 1
      ? (allPrompts[0] ?? null)
      : allPrompts.map((p, i) => `${i + 1}. ${p.replace("⚠️ ", "")}`).join("\n");

  return {
    shouldPrime: true,
    trapTypes: detection.categories,
    primedTypes,
    primingPrompt,
    allPrompts,
    confidence: detection.score,
    skippedReason: null,
  };
}
951
+
952
+ // =============================================================================
953
+ // LEGACY EXPORTS (for backwards compatibility)
954
+ // =============================================================================
955
+
956
/**
 * Legacy boolean wrapper: reports whether `needsSpotCheck` flags the question
 * as requiring a spot check.
 *
 * @deprecated Use needsSpotCheck instead
 */
export function hasTrapPatterns(question: string): boolean {
  const { required } = needsSpotCheck(question);
  return required;
}
960
+
961
/**
 * Signature of a trap detector: receives the question and the proposed answer
 * and returns a spot-check result, or `null` when it has nothing to report.
 */
export type TrapDetector = (
  question: string,
  answer: string,
) => SpotCheckResult | null;