verifiable-thinking-mcp 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +339 -0
  3. package/package.json +75 -0
  4. package/src/index.ts +38 -0
  5. package/src/lib/cache.ts +246 -0
  6. package/src/lib/compression.ts +804 -0
  7. package/src/lib/compute/cache.ts +86 -0
  8. package/src/lib/compute/classifier.ts +555 -0
  9. package/src/lib/compute/confidence.ts +79 -0
  10. package/src/lib/compute/context.ts +154 -0
  11. package/src/lib/compute/extract.ts +200 -0
  12. package/src/lib/compute/filter.ts +224 -0
  13. package/src/lib/compute/index.ts +171 -0
  14. package/src/lib/compute/math.ts +247 -0
  15. package/src/lib/compute/patterns.ts +564 -0
  16. package/src/lib/compute/registry.ts +145 -0
  17. package/src/lib/compute/solvers/arithmetic.ts +65 -0
  18. package/src/lib/compute/solvers/calculus.ts +249 -0
  19. package/src/lib/compute/solvers/derivation-core.ts +371 -0
  20. package/src/lib/compute/solvers/derivation-latex.ts +160 -0
  21. package/src/lib/compute/solvers/derivation-mistakes.ts +1046 -0
  22. package/src/lib/compute/solvers/derivation-simplify.ts +451 -0
  23. package/src/lib/compute/solvers/derivation-transform.ts +620 -0
  24. package/src/lib/compute/solvers/derivation.ts +67 -0
  25. package/src/lib/compute/solvers/facts.ts +120 -0
  26. package/src/lib/compute/solvers/formula.ts +728 -0
  27. package/src/lib/compute/solvers/index.ts +36 -0
  28. package/src/lib/compute/solvers/logic.ts +422 -0
  29. package/src/lib/compute/solvers/probability.ts +307 -0
  30. package/src/lib/compute/solvers/statistics.ts +262 -0
  31. package/src/lib/compute/solvers/word-problems.ts +408 -0
  32. package/src/lib/compute/types.ts +107 -0
  33. package/src/lib/concepts.ts +111 -0
  34. package/src/lib/domain.ts +731 -0
  35. package/src/lib/extraction.ts +912 -0
  36. package/src/lib/index.ts +122 -0
  37. package/src/lib/judge.ts +260 -0
  38. package/src/lib/math/ast.ts +842 -0
  39. package/src/lib/math/index.ts +8 -0
  40. package/src/lib/math/operators.ts +171 -0
  41. package/src/lib/math/tokenizer.ts +477 -0
  42. package/src/lib/patterns.ts +200 -0
  43. package/src/lib/session.ts +825 -0
  44. package/src/lib/think/challenge.ts +323 -0
  45. package/src/lib/think/complexity.ts +504 -0
  46. package/src/lib/think/confidence-drift.ts +507 -0
  47. package/src/lib/think/consistency.ts +347 -0
  48. package/src/lib/think/guidance.ts +188 -0
  49. package/src/lib/think/helpers.ts +568 -0
  50. package/src/lib/think/hypothesis.ts +216 -0
  51. package/src/lib/think/index.ts +127 -0
  52. package/src/lib/think/prompts.ts +262 -0
  53. package/src/lib/think/route.ts +358 -0
  54. package/src/lib/think/schema.ts +98 -0
  55. package/src/lib/think/scratchpad-schema.ts +662 -0
  56. package/src/lib/think/spot-check.ts +961 -0
  57. package/src/lib/think/types.ts +93 -0
  58. package/src/lib/think/verification.ts +260 -0
  59. package/src/lib/tokens.ts +177 -0
  60. package/src/lib/verification.ts +620 -0
  61. package/src/prompts/index.ts +10 -0
  62. package/src/prompts/templates.ts +336 -0
  63. package/src/resources/index.ts +8 -0
  64. package/src/resources/sessions.ts +196 -0
  65. package/src/tools/compress.ts +138 -0
  66. package/src/tools/index.ts +5 -0
  67. package/src/tools/scratchpad.ts +2659 -0
  68. package/src/tools/sessions.ts +144 -0
@@ -0,0 +1,912 @@
1
+ /**
2
+ * Answer Extraction Utilities
3
+ * Priority-based extraction for structured and unstructured LLM responses
4
+ * Based on HuggingFace Math-Verify patterns
5
+ */
6
+
7
+ // Import pre-compiled regex patterns from centralized module
8
+ import {
9
+ RE_AMP,
10
+ RE_ANTITHINK,
11
+ RE_APOS,
12
+ RE_ARTIFACTS,
13
+ // Model-Specific Tokens
14
+ RE_BEGIN_BOX,
15
+ RE_BLOCKQUOTE,
16
+ RE_BOLD_ASTERISK,
17
+ RE_BOLD_UNDERSCORE,
18
+ RE_BOXED,
19
+ // LaTeX
20
+ RE_BOXED_DOLLAR,
21
+ RE_BR,
22
+ // Markdown
23
+ RE_CODE_BLOCK,
24
+ RE_CONTEXT,
25
+ RE_DOCUMENT_CONTENT,
26
+ RE_END_BOX,
27
+ RE_ENDOFTEXT,
28
+ RE_GT,
29
+ RE_HEADINGS,
30
+ RE_HORIZONTAL_RULE,
31
+ RE_IM_BLOCK,
32
+ RE_IMAGES,
33
+ RE_INLINE_CODE,
34
+ RE_INLINE_MATH,
35
+ RE_INTERNAL_MONOLOGUE,
36
+ RE_ITALIC_ASTERISK,
37
+ RE_ITALIC_UNDERSCORE,
38
+ RE_LINKS,
39
+ RE_LT,
40
+ RE_MODEL_TOKENS_FAST,
41
+ // Whitespace
42
+ RE_MULTI_NEWLINE,
43
+ RE_MULTI_SPACE,
44
+ // HTML
45
+ RE_NBSP,
46
+ RE_ORDERED_LIST,
47
+ RE_PAD,
48
+ RE_PERCENTAGE,
49
+ RE_QUOT,
50
+ RE_REASONING,
51
+ RE_REFLECTION,
52
+ RE_SIMPLE_TAGS,
53
+ RE_STRIKETHROUGH,
54
+ // Thinking/Reasoning Tags
55
+ RE_THINK,
56
+ RE_THINKING,
57
+ RE_THOUGHT,
58
+ RE_THOUGHTS,
59
+ // Tool/Artifact Containers
60
+ RE_TOOL_CALL,
61
+ RE_TOOL_RESULT,
62
+ RE_TRAILING_WHITESPACE,
63
+ RE_UNORDERED_LIST,
64
+ // Answer Extraction
65
+ RE_WORD_FRACTION,
66
+ RE_WORD_FRACTION_START,
67
+ } from "./patterns.ts";
68
+
69
/**
 * Strip LLM output artifacts so the text is clean for display or comparison.
 *
 * Passes run in this order:
 * 1. Thinking/reasoning tags
 * 2. Tool/artifact containers
 * 3. Model-specific tokens
 * 4. Markdown formatting
 * 5. LaTeX wrappers (inner content is kept)
 * 6. HTML entities and simple tags
 * 7. Excess whitespace
 *
 * Every pattern is imported pre-compiled from `./patterns.ts`.
 *
 * @param text - Raw model response.
 * @returns The cleaned, trimmed text.
 */
export function stripLLMOutput(text: string): string {
  return (
    text
      // === THINKING/REASONING TAGS ===
      .replace(RE_THINK, "")
      .replace(RE_THINKING, "")
      .replace(RE_REASONING, "")
      .replace(RE_ANTITHINK, "")
      .replace(RE_THOUGHT, "")
      .replace(RE_THOUGHTS, "")
      .replace(RE_REFLECTION, "")
      .replace(RE_INTERNAL_MONOLOGUE, "")

      // === TOOL/ARTIFACT CONTAINERS ===
      .replace(RE_TOOL_CALL, "")
      .replace(RE_TOOL_RESULT, "")
      .replace(RE_ARTIFACTS, "")
      .replace(RE_DOCUMENT_CONTENT, "")
      .replace(RE_CONTEXT, "")

      // === MODEL-SPECIFIC TOKENS ===
      .replace(RE_BEGIN_BOX, "")
      .replace(RE_END_BOX, "")
      .replace(RE_IM_BLOCK, "")
      .replace(RE_ENDOFTEXT, "")
      .replace(RE_PAD, "")

      // === MARKDOWN ===
      .replace(RE_CODE_BLOCK, "")
      .replace(RE_BOLD_ASTERISK, "$1")
      .replace(RE_BOLD_UNDERSCORE, "$1")
      .replace(RE_ITALIC_ASTERISK, "$1")
      .replace(RE_ITALIC_UNDERSCORE, "$1")
      .replace(RE_INLINE_CODE, "$1")
      .replace(RE_HEADINGS, "")
      .replace(RE_STRIKETHROUGH, "$1")
      .replace(RE_IMAGES, "")
      .replace(RE_LINKS, "$1")
      .replace(RE_BLOCKQUOTE, "")
      .replace(RE_HORIZONTAL_RULE, "")
      .replace(RE_UNORDERED_LIST, "")
      .replace(RE_ORDERED_LIST, "")

      // === LATEX (keep the inner content) ===
      .replace(RE_BOXED_DOLLAR, "$1")
      .replace(RE_BOXED, "$1")
      .replace(RE_INLINE_MATH, "$1")

      // === HTML ===
      .replace(RE_NBSP, " ")
      .replace(RE_LT, "<")
      .replace(RE_GT, ">")
      .replace(RE_QUOT, '"')
      .replace(RE_APOS, "'")
      // `&amp;` must be decoded LAST. Decoding it first turns an escaped
      // entity such as `&amp;lt;` into `&lt;`, which the passes above would
      // then decode a second time (and the tag stripper could delete).
      // NOTE(review): assumes RE_AMP matches the literal `&amp;` entity.
      .replace(RE_AMP, "&")
      .replace(RE_BR, "\n")
      .replace(RE_SIMPLE_TAGS, "")

      // === WHITESPACE CLEANUP ===
      .replace(RE_MULTI_NEWLINE, "\n\n")
      .replace(RE_TRAILING_WHITESPACE, "")
      .replace(RE_MULTI_SPACE, " ")

      .trim()
  );
}
147
+
148
// Legacy export names, retained for backward compatibility.
// Both are plain aliases of stripLLMOutput, so each performs the FULL cleanup
// rather than only the step its name suggests.
export const stripThinkingTags: typeof stripLLMOutput = stripLLMOutput;
export const stripMarkdown: typeof stripLLMOutput = stripLLMOutput;
151
+
152
+ // =============================================================================
153
+ // FAST PATH: Only thinking tags + model tokens (no markdown/HTML cleanup)
154
+ // =============================================================================
155
+
156
/**
 * Lightweight variant of stripLLMOutput: removes thinking/reasoning tags and
 * model tokens, collapses repeated newlines and spaces, and trims.
 * Markdown, LaTeX and HTML are left untouched.
 *
 * @example
 * ```ts
 * // Hot path: only the visible content is needed
 * const visible = stripThinkingTagsFast(response);
 *
 * // Full cleanup when the text will be compared
 * const clean = stripLLMOutput(response);
 * ```
 *
 * @param text - Raw model response.
 * @returns The text without reasoning artifacts.
 */
export function stripThinkingTagsFast(text: string): string {
  // Patterns whose matches are deleted outright, applied in this order.
  const removals: RegExp[] = [
    RE_THINK,
    RE_THINKING,
    RE_REASONING,
    RE_ANTITHINK,
    RE_THOUGHT,
    RE_THOUGHTS,
    RE_REFLECTION,
    RE_INTERNAL_MONOLOGUE,
    RE_MODEL_TOKENS_FAST,
  ];

  let result = text;
  for (const pattern of removals) {
    result = result.replace(pattern, "");
  }

  result = result.replace(RE_MULTI_NEWLINE, "\n\n");
  result = result.replace(RE_MULTI_SPACE, " ");
  return result.trim();
}
186
+
187
+ // =============================================================================
188
+ // STREAMING: For very large responses (>100KB)
189
+ // =============================================================================
190
+
191
/** Window at the end of each chunk that is searched for a break point: 1024. */
const CHUNK_OVERLAP = 1 << 10;

/** Upper bound on the size of one processing chunk: 32 * 1024. */
const CHUNK_SIZE = 32 * CHUNK_OVERLAP;

/**
 * Inputs whose `length` exceeds this value (100 * 1024) take the chunked path.
 * The comparison is against string length, i.e. UTF-16 code units.
 */
const STREAMING_THRESHOLD = 100 * CHUNK_OVERLAP;
199
+
200
/**
 * Report whether a response is long enough that the chunked (streaming)
 * cleanup is recommended.
 *
 * @param text - The response to measure.
 * @returns `true` when `text.length` is strictly greater than STREAMING_THRESHOLD.
 */
export function shouldStreamStrip(text: string): boolean {
  const size = text.length;
  return !(size <= STREAMING_THRESHOLD);
}
206
+
207
/**
 * Choose where to cut a non-final chunk so the cut lands on whitespace.
 *
 * Looks for the last newline or space inside the final `overlap` characters
 * of `chunk` and returns the index just past it. When that window contains
 * no whitespace the whole chunk is kept (hard cut).
 */
function findChunkBreak(chunk: string, overlap: number): number {
  const windowStart = Math.max(0, chunk.length - overlap);
  // Search from the END of the chunk. Passing `windowStart` as the second
  // argument of lastIndexOf would search backward from the window start and
  // could never find a break inside the window.
  const lastBreak = Math.max(chunk.lastIndexOf("\n"), chunk.lastIndexOf(" "));
  return lastBreak >= windowStart ? lastBreak + 1 : chunk.length;
}

/**
 * Generator that yields cleaned chunks for very large responses.
 *
 * Two phases:
 * 1. Strip thinking/reasoning tags and model tokens from the whole text
 *    (these blocks can be longer than a chunk).
 * 2. Split the remainder into chunks of at most CHUNK_SIZE characters, cut on
 *    whitespace where possible, and run the same markdown / LaTeX / HTML /
 *    whitespace passes that stripLLMOutput applies.
 *
 * Inputs at or below STREAMING_THRESHOLD are delegated to stripLLMOutput and
 * yielded as a single chunk.
 *
 * Every yielded chunk is trimmed, so the whitespace at a cut is dropped:
 * join chunks with a separator. A multi-line construct that straddles a cut
 * (for example a fenced code block) is not removed, because each chunk is
 * cleaned on its own.
 *
 * @example
 * ```ts
 * if (shouldStreamStrip(hugeResponse)) {
 *   const chunks: string[] = [];
 *   for (const chunk of stripLLMOutputStreaming(hugeResponse)) {
 *     chunks.push(chunk);
 *   }
 *   const result = chunks.join(" ");
 * } else {
 *   const result = stripLLMOutput(hugeResponse);
 * }
 * ```
 *
 * @param text - Raw model response.
 * @returns A generator of non-empty cleaned chunks (a small input yields
 *          exactly one chunk, which may be empty).
 */
export function* stripLLMOutputStreaming(text: string): Generator<string, void, unknown> {
  // Small inputs: a single full-cleanup pass.
  if (text.length <= STREAMING_THRESHOLD) {
    yield stripLLMOutput(text);
    return;
  }

  // Phase 1: reasoning blocks may span chunks, so remove them up front.
  const withoutThinking = text
    .replace(RE_THINK, "")
    .replace(RE_THINKING, "")
    .replace(RE_REASONING, "")
    .replace(RE_ANTITHINK, "")
    .replace(RE_THOUGHT, "")
    .replace(RE_THOUGHTS, "")
    .replace(RE_REFLECTION, "")
    .replace(RE_INTERNAL_MONOLOGUE, "")
    .replace(RE_MODEL_TOKENS_FAST, "");

  // Phase 2: chunked cleanup of everything that is left.
  const total = withoutThinking.length;
  let pos = 0;

  while (pos < total) {
    const hardEnd = Math.min(pos + CHUNK_SIZE, total);
    let chunk = withoutThinking.slice(pos, hardEnd);

    // Non-final chunks are shortened to the last whitespace near their end so
    // that words, entities and inline markup are not split across chunks.
    if (hardEnd < total) {
      chunk = chunk.slice(0, findChunkBreak(chunk, CHUNK_OVERLAP));
    }
    // chunk is never empty here, so the loop always advances.
    pos += chunk.length;

    const processed = chunk
      // Tool/artifact containers
      .replace(RE_TOOL_CALL, "")
      .replace(RE_TOOL_RESULT, "")
      .replace(RE_ARTIFACTS, "")
      .replace(RE_DOCUMENT_CONTENT, "")
      .replace(RE_CONTEXT, "")
      // Markdown
      .replace(RE_CODE_BLOCK, "")
      .replace(RE_HEADINGS, "")
      .replace(RE_BOLD_ASTERISK, "$1")
      .replace(RE_BOLD_UNDERSCORE, "$1")
      .replace(RE_ITALIC_ASTERISK, "$1")
      .replace(RE_ITALIC_UNDERSCORE, "$1")
      .replace(RE_STRIKETHROUGH, "$1")
      .replace(RE_IMAGES, "")
      .replace(RE_LINKS, "$1")
      .replace(RE_INLINE_CODE, "$1")
      .replace(RE_BLOCKQUOTE, "")
      .replace(RE_HORIZONTAL_RULE, "")
      .replace(RE_UNORDERED_LIST, "")
      .replace(RE_ORDERED_LIST, "")
      // LaTeX (keep the inner content), as the small-input path does
      .replace(RE_BOXED_DOLLAR, "$1")
      .replace(RE_BOXED, "$1")
      .replace(RE_INLINE_MATH, "$1")
      // HTML: `&amp;` is decoded last so escaped entities such as `&amp;lt;`
      // are not decoded twice.
      .replace(RE_NBSP, " ")
      .replace(RE_LT, "<")
      .replace(RE_GT, ">")
      .replace(RE_QUOT, '"')
      .replace(RE_APOS, "'")
      .replace(RE_AMP, "&")
      .replace(RE_BR, "\n")
      .replace(RE_SIMPLE_TAGS, "")
      // Whitespace
      .replace(RE_MULTI_NEWLINE, "\n\n")
      .replace(RE_TRAILING_WHITESPACE, "")
      .replace(RE_MULTI_SPACE, " ")
      .trim();

    // Chunks that clean down to nothing are skipped.
    if (processed) {
      yield processed;
    }
  }
}
324
+
325
/**
 * Promise-returning wrapper around the chunked cleanup, for async callers.
 *
 * Inputs at or below STREAMING_THRESHOLD are cleaned in one call to
 * stripLLMOutput. Larger inputs are consumed from stripLLMOutputStreaming;
 * after every tenth chunk the function awaits a zero-delay timer so other
 * event-loop work can run. The collected chunks are joined with a single space.
 *
 * @param text - Raw model response.
 * @returns The cleaned text.
 */
export async function stripLLMOutputAsync(text: string): Promise<string> {
  const isSmall = text.length <= STREAMING_THRESHOLD;
  if (isSmall) {
    return stripLLMOutput(text);
  }

  const pieces: string[] = [];
  for (const piece of stripLLMOutputStreaming(text)) {
    pieces.push(piece);

    // Hand control back to the event loop once per ten collected chunks.
    const dueForPause = pieces.length % 10 === 0;
    if (dueForPause) {
      await new Promise((resolve) => setTimeout(resolve, 0));
    }
  }

  return pieces.join(" ");
}
349
+
350
/** Normalise a numeric string: drop every comma, then trim surrounding whitespace. */
function cleanNumber(s: string): string {
  const withoutCommas = s.split(",").join("");
  return withoutCommas.trim();
}
354
+
355
/**
 * Lower-case filler words that are never accepted as an extracted word answer.
 * Lookups are done on the lower-cased candidate.
 */
const STOPWORDS = new Set<string>([
  // verbs, articles, pronouns
  "is", "the", "a", "an", "to", "be", "it", "that", "this",
  // answer-marker vocabulary
  "answer", "result", "final", "therefore", "thus", "so",
  // conjunctions
  "and", "or", "but",
  // prepositions
  "of", "in", "for", "on", "with", "as", "at", "by", "from",
]);
385
+
386
/**
 * Pull the answer out of a short phrase such as "45 degrees" or "YES because...".
 *
 * Tried in order:
 * 0. A word fraction at the start (RE_WORD_FRACTION_START)
 * 1. A leading number, optionally a fraction and/or percent (75%, 3.14, 2/3)
 * 2. A leading all-caps token of at most 10 characters (YES, NO, A, TRUE)
 * 3. The first word, when it is 1-15 characters long and not a stopword
 *
 * @param phrase - Text that follows an answer marker.
 * @returns The extracted answer, or null when nothing usable is found.
 */
function extractFromPhrase(phrase: string): string | null {
  const trimmed = phrase.trim();

  // Priority 0: word fractions at the start ("two-thirds", "one half", "a third")
  const wordFracMatch = trimmed.match(RE_WORD_FRACTION_START);
  if (wordFracMatch?.[0]) return wordFracMatch[0];

  // Priority 1: leading number. The match must begin with a digit: a pattern
  // that accepts a bare comma would turn ", maybe" into the empty answer ""
  // and skip the lower priorities.
  const numMatch = trimmed.match(/^(-?\d[\d,]*(?:\.\d+)?(?:\/\d+)?%?)/);
  if (numMatch?.[1]) return cleanNumber(numMatch[1]);

  // Priority 2: capitalized short answer (YES, NO, A, B, TRUE, FALSE, ...)
  const capsMatch = trimmed.match(/^([A-Z][A-Z0-9]*)\b/);
  if (capsMatch?.[1] && capsMatch[1].length <= 10) return capsMatch[1];

  // Priority 3: first word if it is short and meaningful
  const firstWord = trimmed.split(/\s+/)[0];
  if (firstWord) {
    const cleaned = firstWord
      .replace(/[^a-zA-Z0-9.-]/g, "")
      // Drop sentence-final punctuation ("Paris." -> "Paris"); interior dots
      // and hyphens ("co-op", "v1.2") are kept.
      .replace(/[.-]+$/, "");
    const hasContent = /[a-zA-Z0-9]/.test(cleaned);
    if (hasContent && cleaned.length <= 15 && !STOPWORDS.has(cleaned.toLowerCase())) {
      return cleaned;
    }
  }

  return null;
}
413
+
414
/**
 * Return the last meaningful word of `text` (used for YES/NO style answers).
 *
 * Words are scanned from the end. Each is reduced to letters, digits, dots and
 * hyphens, trailing dots/hyphens are dropped, and the first one that still
 * contains a letter or digit and is not a stopword is returned, capped at
 * 20 characters.
 *
 * @param text - Text to scan.
 * @returns The chosen word; if none qualifies, the cleaned last word, or
 *          "unknown" when even that is empty.
 */
function extractLastMeaningfulWord(text: string): string {
  // Keep interior "." and "-" (decimals, hyphenated words) but strip the
  // sentence-final ones so "yes." is returned as "yes".
  const clean = (raw: string): string =>
    raw.replace(/[^a-zA-Z0-9.-]/g, "").replace(/[.-]+$/, "");

  const words = text.split(/\s+/).filter((w) => w.length > 0);

  // Search backwards for a meaningful word
  for (let i = words.length - 1; i >= 0; i--) {
    const word = clean(words[i] ?? "");
    // Tokens made only of punctuation ("-", "...") are not answers.
    if (/[a-zA-Z0-9]/.test(word) && !STOPWORDS.has(word.toLowerCase())) {
      return word.slice(0, 20);
    }
  }

  // Absolute fallback: the last word cleaned, even if it is a stopword
  const lastWord = words[words.length - 1] ?? "";
  return clean(lastWord).slice(0, 20) || "unknown";
}
433
+
434
/**
 * Look for one of the known expected answers directly in the response.
 *
 * Two passes, both case-insensitive and both anchored on token boundaries:
 * 1. The expected text exactly as written.
 * 2. The expected text with flexible whitespace around punctuation, so that
 *    "A,7" also matches "A, 7".
 *
 * A match may not be glued to a neighbouring letter, digit or underscore, and
 * a numeric edge may not be part of a longer number ("5" does not match "15",
 * "2.5" or "5,000"). Blank expected answers are ignored.
 *
 * @param cleaned - Response text after artifact stripping.
 * @param expectedAnswers - Candidate ground-truth answers, in priority order.
 * @returns The first expected answer (as passed in) that matches, or null.
 */
function matchExpectedAnswer(cleaned: string, expectedAnswers: string[]): string | null {
  const escapeRegExp = (s: string): string => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const isWordChar = (ch: string): boolean => /[A-Za-z0-9_]/.test(ch);
  const isDigit = (ch: string): boolean => /\d/.test(ch);

  // Wrap `body` in look-arounds. `\b` is not used because it does the wrong
  // thing when the answer itself starts or ends with punctuation ("75%", "-5").
  const bounded = (body: string, text: string): RegExp => {
    const first = text.charAt(0);
    const last = text.charAt(text.length - 1);
    let lead = "";
    let trail = "";
    if (isWordChar(first)) lead += "(?<![A-Za-z0-9_])";
    if (isDigit(first)) lead += "(?<!\\d[.,])";
    if (isWordChar(last)) trail += "(?![A-Za-z0-9_])";
    if (isDigit(last)) trail += "(?![.,]\\d)";
    return new RegExp(lead + body + trail, "i");
  };

  const candidates = expectedAnswers
    .map((original) => ({ original, text: original.trim() }))
    .filter((c) => c.text.length > 0);

  // Pass 1: exact text
  for (const { original, text } of candidates) {
    if (bounded(escapeRegExp(text), text).test(cleaned)) {
      return original;
    }
  }

  // Pass 2: whitespace-tolerant (e.g., "A, 7" vs "A,7"). Whitespace is
  // optional next to punctuation but stays mandatory between two words.
  for (const { original, text } of candidates) {
    const tokens = text.match(/[A-Za-z0-9_]+|[^\sA-Za-z0-9_]/g) ?? [];
    let body = "";
    tokens.forEach((token, i) => {
      if (i > 0) {
        const previous = tokens[i - 1] ?? "";
        const betweenWords = isWordChar(token.charAt(0)) && isWordChar(previous.charAt(0));
        body += betweenWords ? "\\s+" : "\\s*";
      }
      body += escapeRegExp(token);
    });
    if (body && bounded(body, text).test(cleaned)) {
      return original;
    }
  }

  return null;
}
458
+
459
/**
 * Match explicit answer markers in text (Priority 2-5).
 *
 * Markers, in the order they are tried:
 * - 2:  "Final Answer: X"
 * - 3:  "Answer: X"
 * - 4:  "The answer is X" / "answer is X"
 * - 4b: "should be X" / "must be X"
 * - 4c: "cards to flip are X" / "need to flip X" / "must flip X"
 * - 4d: "conclusion is X" / "I conclude X"
 * - 4e: "therefore X" / "thus X" / "hence X" / "so X" at the start of a line
 * - 5:  "Result: X"
 *
 * Except for 4c, the text after the marker is handed to extractFromPhrase and
 * the first marker that yields a non-empty answer wins.
 *
 * @param cleaned - Response text after artifact stripping.
 * @returns The extracted answer, or null if no marker produced one.
 */
function matchExplicitMarkers(cleaned: string): string | null {
  // Run one marker pattern and reduce its capture to an answer.
  const viaPhrase = (pattern: RegExp): string | null => {
    const captured = cleaned.match(pattern)?.[1];
    return captured ? extractFromPhrase(captured.trim()) : null;
  };

  // "Rest of sentence" that stops at a newline or a full stop, but keeps a dot
  // that is followed by a digit so "3.14" is not cut down to "3".
  const SENTENCE = "((?:[^\\n.]|\\.(?=\\d))+)";

  // The `m` flag lets `$` match at the end of ANY line; without it a marker
  // such as "Answer: 42" followed by more lines would not be recognised.
  const beforeFlip: RegExp[] = [
    // Priority 2: "Final Answer: X" (with colon, very explicit)
    /final\s+answer:\s*([^\n]+?)(?:\.\s|$)/im,
    // Priority 3: "Answer: X" (with colon)
    /(?<!final\s)answer:\s*([^\n]+?)(?:\.\s|$)/im,
    // Priority 4: "The answer is X" or "answer is X"
    /(?:the\s+)?(?:final\s+)?answer\s+is\s+([^\n]+?)(?:\.\s|$)/im,
    // Priority 4b: "should be X" or "must be X"
    new RegExp(`(?:answer\\s+)?(?:should|must)\\s+be\\s+${SENTENCE}`, "i"),
  ];
  for (const pattern of beforeFlip) {
    const answer = viaPhrase(pattern);
    if (answer) return answer;
  }

  // Priority 4c: Wason-style card lists. Only single letters and numbers are
  // accepted as card labels so that trailing prose ("... because they test
  // the rule") does not end up inside the answer.
  const flipMatch = cleaned.match(
    /(?:cards?\s+to\s+flip|need\s+to\s+flip|must\s+flip)\s+(?:are\s+)?(?:the\s+)?(?:cards?\s+)?((?:[A-Z]|\d+)\b(?:(?:\s*,\s*(?:and\s+)?|\s+and\s+|\s+)(?:[A-Z]|\d+)\b)*)/i,
  );
  if (flipMatch?.[1]) {
    const cards = flipMatch[1].match(/\b(?:[A-Z]|\d+)\b/gi) ?? [];
    if (cards.length > 0) return cards.join(",");
  }

  const afterFlip: RegExp[] = [
    // Priority 4d: "conclusion is X" or "I conclude X"
    new RegExp(`(?:conclusion|conclude|I\\s+conclude)\\s+(?:is\\s+|that\\s+)?${SENTENCE}`, "i"),
    // Priority 4e: "therefore X" or "thus X" at the start of a line
    new RegExp(`(?:^|\\n)\\s*(?:therefore|thus|hence|so)\\s+${SENTENCE}`, "im"),
    // Priority 5: "Result: X"
    /result:\s*([^\n]+?)(?:\.\s|$)/im,
  ];
  for (const pattern of afterFlip) {
    const answer = viaPhrase(pattern);
    if (answer) return answer;
  }

  return null;
}
530
+
531
/**
 * Outcome of an answer extraction: the answer text, how much it can be
 * trusted, and which strategy found it.
 */
export interface AnswerExtractionResult {
  /** The extracted answer text. */
  answer: string;
  /** Score between 0 and 1; a higher value means more confidence. */
  confidence: number;
  /** Strategy that produced the answer. */
  source:
    | "expected"
    | "boxed"
    | "explicit"
    | "equation"
    | "standalone"
    | "implicit"
    | "fallback";
}
539
+
540
/**
 * Return the content of the first non-empty `\boxed{...}` in `text`.
 *
 * Braces are counted so nested groups survive intact
 * (`\boxed{\frac{1}{2}}` yields `\frac{1}{2}`). If no balanced box is found,
 * the simple "up to the first closing brace" match is used as a fallback.
 */
function findBoxedContent(text: string): string | null {
  const opener = "\\boxed{";
  let from = text.indexOf(opener);

  while (from !== -1) {
    const start = from + opener.length;
    let depth = 1;
    for (let i = start; i < text.length; i++) {
      const ch = text[i];
      if (ch === "{") {
        depth++;
      } else if (ch === "}") {
        depth--;
        if (depth === 0) {
          if (i > start) return text.slice(start, i);
          break; // empty box: try the next one
        }
      }
    }
    from = text.indexOf(opener, start);
  }

  return text.match(/\\boxed\{([^}]+)\}/)?.[1] ?? null;
}

/**
 * Extract the answer from an LLM response together with a confidence score.
 *
 * Checks run in this order and the first hit is returned:
 * 1. LaTeX `\boxed{X}` in the RAW response (confidence 0.95). This runs before
 *    anything else because stripLLMOutput removes the `\boxed` wrapper.
 * 2. One of `expectedAnswers` found in the cleaned response (1.0).
 * 3. Explicit markers such as "Final Answer:", "Answer:", "The answer is" (0.85).
 * 4. Implicit patterns, delegated to matchImplicitPatternsWithConfidence:
 *    equation result (0.7), standalone number in the last lines (0.6),
 *    last percentage or number (0.4), word fraction or last word (0.3).
 *
 * @param response - Raw model response.
 * @param expectedAnswers - Optional ground-truth candidates to look for.
 * @returns The answer, its confidence and the strategy that found it.
 */
export function extractAnswerWithConfidence(
  response: string,
  expectedAnswers?: string[],
): AnswerExtractionResult {
  // LaTeX boxed answer, looked up before stripping.
  const boxed = findBoxedContent(response);
  if (boxed) {
    return { answer: cleanNumber(boxed), confidence: 0.95, source: "boxed" };
  }

  // Strip all LLM artifacts (thinking tags, markdown, model tokens, etc.)
  const cleaned = stripLLMOutput(response);

  // Known expected answers, looked up directly in the cleaned response.
  if (expectedAnswers && expectedAnswers.length > 0) {
    const matched = matchExpectedAnswer(cleaned, expectedAnswers);
    if (matched) return { answer: matched, confidence: 1.0, source: "expected" };
  }

  // Explicit answer markers
  const explicitMatch = matchExplicitMarkers(cleaned);
  if (explicitMatch) {
    return { answer: explicitMatch, confidence: 0.85, source: "explicit" };
  }

  // Implicit patterns (equations, standalone numbers, word fractions, last word)
  return matchImplicitPatternsWithConfidence(cleaned);
}
582
+
583
/**
 * Extract just the answer text from an LLM response.
 *
 * Convenience wrapper around extractAnswerWithConfidence that discards the
 * confidence and source. The strategies are tried in that function's order:
 * LaTeX `\boxed{X}`, then a direct match against `expectedAnswers` (when
 * provided), then explicit markers ("Final Answer:", "Answer:", "The answer
 * is"), then implicit patterns (equations, numbers, fractions, last word).
 *
 * @param response - Raw model response.
 * @param expectedAnswers - Optional ground-truth candidates to look for.
 * @returns The extracted answer text.
 */
export function extractAnswer(response: string, expectedAnswers?: string[]): string {
  const { answer } = extractAnswerWithConfidence(response, expectedAnswers);
  return answer;
}
595
+
596
/**
 * Match implicit answer patterns with confidence scoring (Priority 6-9).
 *
 * Tried in order:
 * - 6:  last equation result "= X" (0.7, "equation")
 * - 7:  "is X" or a line holding only a number, within the last five lines,
 *       scanned bottom-up (0.6, "standalone")
 * - 8:  last percentage (0.4, "implicit")
 * - 8a: last number or numeric fraction anywhere (0.4, "implicit")
 * - 8b: last word fraction (0.3, "fallback")
 * - 9:  last meaningful word of the final three lines (0.3, "fallback")
 *
 * Every numeric pattern requires a leading digit. A pattern that accepts a
 * bare comma would report the empty string as a "number" for any text that
 * merely contains ",", and priorities 8b and 9 would never be reached.
 *
 * @param cleaned - Response text after artifact stripping.
 * @returns The best implicit answer; always produces a result.
 */
function matchImplicitPatternsWithConfidence(cleaned: string): AnswerExtractionResult {
  // Priority 6: last equation result "= X"
  const eqMatches = [...cleaned.matchAll(/=\s*(-?\d[\d,]*(?:\.\d+)?(?:\/\d+)?)/g)];
  const lastEq = eqMatches[eqMatches.length - 1]?.[1];
  if (lastEq) {
    return { answer: cleanNumber(lastEq), confidence: 0.7, source: "equation" };
  }

  // Priority 7: standalone numbers in the last few lines, bottom line first
  const tailLines = cleaned.trim().split("\n").slice(-5).reverse();
  for (const line of tailLines) {
    const isNumMatch = line.match(
      /is\s+(-?\d[\d,]*(?:\.\d+)?(?:\/\d+)?%?)(?=\s|$|[.,;:!?)])/i,
    );
    if (isNumMatch?.[1]) {
      return { answer: cleanNumber(isNumMatch[1]), confidence: 0.6, source: "standalone" };
    }

    const standaloneNum = line.match(/^\s*(-?\d[\d,]*(?:\.\d+)?(?:\/\d+)?%?)\s*$/);
    if (standaloneNum?.[1]) {
      return { answer: cleanNumber(standaloneNum[1]), confidence: 0.6, source: "standalone" };
    }
  }

  // Priority 8: last percentage
  const percentMatches = [...cleaned.matchAll(RE_PERCENTAGE)];
  const lastPercent = percentMatches[percentMatches.length - 1]?.[1];
  if (lastPercent) {
    return { answer: `${cleanNumber(lastPercent)}%`, confidence: 0.4, source: "implicit" };
  }

  // Priority 8a: last number/fraction
  const allNumbers = cleaned.match(/-?\d[\d,]*(?:\.\d+)?(?:\/\d+)?/g);
  const lastNum = allNumbers?.[allNumbers.length - 1];
  if (lastNum) {
    return { answer: cleanNumber(lastNum), confidence: 0.4, source: "implicit" };
  }

  // Priority 8b: word fractions. lastIndex is reset because the shared
  // pattern object may carry state from an earlier call.
  RE_WORD_FRACTION.lastIndex = 0;
  const wordFractions = cleaned.match(RE_WORD_FRACTION);
  const lastWordFraction = wordFractions?.[wordFractions.length - 1];
  if (lastWordFraction) {
    return { answer: lastWordFraction, confidence: 0.3, source: "fallback" };
  }

  // Priority 9: last meaningful word
  const lastLines = cleaned.trim().split("\n").slice(-3).join(" ");
  return {
    answer: extractLastMeaningfulWord(lastLines),
    confidence: 0.3,
    source: "fallback",
  };
}
660
+
661
+ // =============================================================================
662
+ // FRACTION HANDLING
663
+ // =============================================================================
664
+
665
/**
 * Spelled-out cardinals "zero" through "twelve" mapped to their numeric value.
 * Each word's value is its position in the list below.
 */
const WORD_NUMBERS: Record<string, number> = Object.fromEntries(
  [
    "zero",
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "ten",
    "eleven",
    "twelve",
  ].map((word, value): [string, number] => [word, value]),
);
681
+
682
/**
 * Fraction denominator words (singular and plural) mapped to their value.
 * "fourth" and "quarter" both map to 4.
 */
const WORD_DENOMINATORS: Record<string, number> = Object.fromEntries(
  (
    [
      ["half", "halves", 2],
      ["third", "thirds", 3],
      ["fourth", "fourths", 4],
      ["quarter", "quarters", 4],
      ["fifth", "fifths", 5],
      ["sixth", "sixths", 6],
      ["seventh", "sevenths", 7],
      ["eighth", "eighths", 8],
      ["ninth", "ninths", 9],
      ["tenth", "tenths", 10],
    ] as [string, string, number][]
  ).flatMap(([singular, plural, value]): [string, number][] => [
    [singular, value],
    [plural, value],
  ]),
);
705
+
706
/**
 * Parse a fraction string into a decimal number.
 *
 * Accepted forms (case-insensitive, surrounding whitespace ignored):
 * - Numeric fractions: "2/3", "-1/2", "1.5/3"
 * - Mixed numbers: "1 1/2", "-2 3/4" (the sign applies to the whole value)
 * - Word fractions: "two-thirds", "one half", "three quarters", "a third",
 *   "an eighth"
 *
 * @param input - Candidate fraction text.
 * @returns The decimal value, or null if the input is not one of the accepted
 *          forms or has a zero denominator.
 */
export function parseFraction(input: string): number | null {
  const trimmed = input.trim().toLowerCase();

  // Pattern 1: numeric fraction "a/b"
  const numericMatch = trimmed.match(/^(-?\d+(?:\.\d+)?)\s*\/\s*(\d+(?:\.\d+)?)$/);
  if (numericMatch) {
    const numerator = Number.parseFloat(numericMatch[1] ?? "");
    const denominator = Number.parseFloat(numericMatch[2] ?? "");
    return denominator === 0 ? null : numerator / denominator;
  }

  // Pattern 2: mixed number "w a/b" (e.g., "1 1/2" or "2 3/4").
  // The sign is captured separately because the integer -0 is not < 0, which
  // would make "-0 1/2" come out positive.
  const mixedMatch = trimmed.match(/^(-?)(\d+)\s+(\d+)\s*\/\s*(\d+)$/);
  if (mixedMatch) {
    const sign = mixedMatch[1] === "-" ? -1 : 1;
    const whole = Number.parseInt(mixedMatch[2] ?? "", 10);
    const numerator = Number.parseInt(mixedMatch[3] ?? "", 10);
    const denominator = Number.parseInt(mixedMatch[4] ?? "", 10);
    if (denominator === 0) return null;
    return sign * (whole + numerator / denominator);
  }

  // Pattern 3: word fraction "one-half", "two thirds", "a third", "an eighth".
  // The separator may be a hyphen, any run of whitespace, or absent.
  const wordMatch = trimmed.match(
    /^(an?|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)(?:\s*-\s*|\s*)(half|halves|third|thirds|fourth|fourths|quarter|quarters|fifth|fifths|sixth|sixths|seventh|sevenths|eighth|eighths|ninth|ninths|tenth|tenths)$/,
  );
  if (wordMatch) {
    const numWord = wordMatch[1] ?? "";
    const denomWord = wordMatch[2] ?? "";
    const isArticle = numWord === "a" || numWord === "an";
    const numerator = isArticle ? 1 : (WORD_NUMBERS[numWord] ?? 1);
    const denominator = WORD_DENOMINATORS[denomWord];
    if (denominator) {
      return numerator / denominator;
    }
  }

  return null;
}
755
+
756
/**
 * Normalize an answer string for equality comparison.
 *
 * Steps, in order: trim, lower-case, remove one pair of surrounding square
 * brackets, remove commas, remove all whitespace, remove a trailing "%",
 * remove leading zeros (a lone "0" is kept), remove a trailing ".0" / ".00".
 *
 * Trimming comes first so brackets padded with spaces (" [42] ") are still
 * recognised, and "%" is removed before the trailing ".0" so "75.0%"
 * normalizes to "75".
 *
 * @param answer - Raw answer text.
 * @returns The normalized form.
 */
export function normalizeAnswer(answer: string): string {
  return answer
    .trim()
    .toLowerCase()
    .replace(/^\[|\]$/g, "") // Remove surrounding brackets [X] -> X
    .replace(/,/g, "") // Remove commas from numbers
    .replace(/\s+/g, "") // Remove whitespace
    .replace(/%$/, "") // Remove trailing % for percentage comparison
    .replace(/^0+(\d)/, "$1") // Remove leading zeros (but keep "0")
    .replace(/\.0+$/, ""); // Remove trailing .0
}
771
+
772
/**
 * Split an answer into comparable tokens: numbers (with an optional leading
 * minus sign, in canonical form so "45.0" equals "45") and runs of letters.
 * Commas are dropped first so "1,000" is a single number.
 */
function answerTokens(text: string): string[] {
  const raw =
    text
      .toLowerCase()
      .replace(/,/g, "")
      .match(/(?<![\p{L}\d])-?\d+(?:\.\d+)?|\d+(?:\.\d+)?|\p{L}+/gu) ?? [];
  return raw.map((token) => (/^-?\d/.test(token) ? String(Number(token)) : token));
}

/**
 * Compare two answers for equivalence.
 *
 * Checks, in order:
 * 1. Equality after normalizeAnswer (case, whitespace, commas, "%", ...).
 * 2. Numeric equality. Each side is parsed with parseFraction first and
 *    parseNumericValue second. Values match when they differ by less than
 *    max(absolute tolerance, 0.01% of the expected value); the absolute
 *    tolerance is 0.001 if either side was a fraction and 0.0001 otherwise.
 * 3. Token containment, only when at least one side contains a letter: the
 *    shorter token list must be a prefix or suffix of the longer one, so "45"
 *    matches "45 degrees" and "42" matches "answer is 42", while "4" does NOT
 *    match "45 degrees" and "no" does NOT match "north".
 *
 * @param extracted - Answer extracted from the response.
 * @param expected - Reference answer.
 * @returns true when the two answers are considered equivalent.
 */
export function answersMatch(extracted: string, expected: string): boolean {
  const normExtracted = normalizeAnswer(extracted);
  const normExpected = normalizeAnswer(expected);

  // Exact match
  if (normExtracted === normExpected) return true;

  // Fractions are tried before parseNumericValue, which is parseFloat-based
  // and would read "1/2" as 1.
  const fracExtracted = parseFraction(extracted);
  const fracExpected = parseFraction(expected);
  // A parsed fraction widens the tolerance to absorb 3-digit decimal rounding.
  const hasFraction = fracExtracted !== null || fracExpected !== null;

  const numExtracted = fracExtracted ?? parseNumericValue(extracted);
  const numExpected = fracExpected ?? parseNumericValue(expected);

  if (numExtracted !== null && numExpected !== null) {
    const absDiff = Math.abs(numExtracted - numExpected);
    const absTol = hasFraction ? 0.001 : 0.0001;
    const relTol = Math.abs(numExpected) * 0.0001;
    if (absDiff < Math.max(absTol, relTol)) return true;
  }

  // Containment as COMPLETE tokens. A character-level startsWith/endsWith
  // would accept partial tokens such as "4" inside "45".
  const hasNonNumeric = /[a-z]/i.test(normExtracted) || /[a-z]/i.test(normExpected);
  if (hasNonNumeric) {
    const tokensExtracted = answerTokens(extracted);
    const tokensExpected = answerTokens(expected);
    const [shorter, longer] =
      tokensExtracted.length <= tokensExpected.length
        ? [tokensExtracted, tokensExpected]
        : [tokensExpected, tokensExtracted];

    if (shorter.length > 0) {
      const offset = longer.length - shorter.length;
      const isPrefix = shorter.every((token, i) => token === longer[i]);
      const isSuffix = shorter.every((token, i) => token === longer[offset + i]);
      if (isPrefix || isSuffix) return true;
    }
  }

  return false;
}
861
+
862
/**
 * Turn a string into a number, understanding percentages and scientific
 * notation in addition to whatever `Number.parseFloat` accepts.
 *
 * Forms are tried in this order:
 * 1. Percentage ("75%", "75 percent", "75 pct") - returned divided by 100
 * 2. "a x 10^b" with "×", "x" or "X" and an optional caret
 * 3. "a x 10ᵇ" with Unicode superscript digits and superscript minus
 * 4. `Number.parseFloat` after removing commas (covers "1.5e6"; also accepts
 *    a numeric prefix followed by other text)
 *
 * @param input - Text to parse.
 * @returns The numeric value, or null if not parseable.
 */
function parseNumericValue(input: string): number | null {
  const text = input.trim();

  // mantissa * 10^exponent, or null when either part is not a number
  const scale = (mantissa: string, exponent: string): number | null => {
    const base = Number.parseFloat(mantissa);
    const exp = Number.parseInt(exponent, 10);
    if (Number.isNaN(base) || Number.isNaN(exp)) return null;
    return base * 10 ** exp;
  };

  // 1. Percentage (75%, 75 percent, 75 pct)
  const percent = text.match(/^(-?[\d,]+(?:\.\d+)?)\s*(%|percent|pct)$/i);
  if (percent?.[1]) {
    const value = Number.parseFloat(percent[1].replace(/,/g, ""));
    if (!Number.isNaN(value)) return value / 100;
  }

  // 2. Scientific notation with × or x (3×10^8, 3x10^8)
  const caretForm = text.match(/^(-?[\d.]+)\s*[×xX]\s*10[\^]?(-?\d+)$/);
  if (caretForm?.[1] && caretForm?.[2]) {
    const scaled = scale(caretForm[1], caretForm[2]);
    if (scaled !== null) return scaled;
  }

  // 3. Unicode superscript exponents (10⁸, 10⁻³)
  const superscriptMap: Record<string, string> = {
    "⁰": "0",
    "¹": "1",
    "²": "2",
    "³": "3",
    "⁴": "4",
    "⁵": "5",
    "⁶": "6",
    "⁷": "7",
    "⁸": "8",
    "⁹": "9",
    "⁻": "-",
  };
  const superForm = text.match(/^(-?[\d.]+)\s*[×xX]\s*10([⁰¹²³⁴⁵⁶⁷⁸⁹⁻]+)$/);
  if (superForm?.[1] && superForm?.[2]) {
    let plainExponent = "";
    for (const ch of superForm[2].split("")) {
      plainExponent += superscriptMap[ch] ?? ch;
    }
    const scaled = scale(superForm[1], plainExponent);
    if (scaled !== null) return scaled;
  }

  // 4. Plain parseFloat (handles 1.5e6 notation)
  const parsed = Number.parseFloat(text.replace(/,/g, ""));
  if (Number.isNaN(parsed)) return null;
  return parsed;
}