verifiable-thinking-mcp 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +339 -0
  3. package/package.json +75 -0
  4. package/src/index.ts +38 -0
  5. package/src/lib/cache.ts +246 -0
  6. package/src/lib/compression.ts +804 -0
  7. package/src/lib/compute/cache.ts +86 -0
  8. package/src/lib/compute/classifier.ts +555 -0
  9. package/src/lib/compute/confidence.ts +79 -0
  10. package/src/lib/compute/context.ts +154 -0
  11. package/src/lib/compute/extract.ts +200 -0
  12. package/src/lib/compute/filter.ts +224 -0
  13. package/src/lib/compute/index.ts +171 -0
  14. package/src/lib/compute/math.ts +247 -0
  15. package/src/lib/compute/patterns.ts +564 -0
  16. package/src/lib/compute/registry.ts +145 -0
  17. package/src/lib/compute/solvers/arithmetic.ts +65 -0
  18. package/src/lib/compute/solvers/calculus.ts +249 -0
  19. package/src/lib/compute/solvers/derivation-core.ts +371 -0
  20. package/src/lib/compute/solvers/derivation-latex.ts +160 -0
  21. package/src/lib/compute/solvers/derivation-mistakes.ts +1046 -0
  22. package/src/lib/compute/solvers/derivation-simplify.ts +451 -0
  23. package/src/lib/compute/solvers/derivation-transform.ts +620 -0
  24. package/src/lib/compute/solvers/derivation.ts +67 -0
  25. package/src/lib/compute/solvers/facts.ts +120 -0
  26. package/src/lib/compute/solvers/formula.ts +728 -0
  27. package/src/lib/compute/solvers/index.ts +36 -0
  28. package/src/lib/compute/solvers/logic.ts +422 -0
  29. package/src/lib/compute/solvers/probability.ts +307 -0
  30. package/src/lib/compute/solvers/statistics.ts +262 -0
  31. package/src/lib/compute/solvers/word-problems.ts +408 -0
  32. package/src/lib/compute/types.ts +107 -0
  33. package/src/lib/concepts.ts +111 -0
  34. package/src/lib/domain.ts +731 -0
  35. package/src/lib/extraction.ts +912 -0
  36. package/src/lib/index.ts +122 -0
  37. package/src/lib/judge.ts +260 -0
  38. package/src/lib/math/ast.ts +842 -0
  39. package/src/lib/math/index.ts +8 -0
  40. package/src/lib/math/operators.ts +171 -0
  41. package/src/lib/math/tokenizer.ts +477 -0
  42. package/src/lib/patterns.ts +200 -0
  43. package/src/lib/session.ts +825 -0
  44. package/src/lib/think/challenge.ts +323 -0
  45. package/src/lib/think/complexity.ts +504 -0
  46. package/src/lib/think/confidence-drift.ts +507 -0
  47. package/src/lib/think/consistency.ts +347 -0
  48. package/src/lib/think/guidance.ts +188 -0
  49. package/src/lib/think/helpers.ts +568 -0
  50. package/src/lib/think/hypothesis.ts +216 -0
  51. package/src/lib/think/index.ts +127 -0
  52. package/src/lib/think/prompts.ts +262 -0
  53. package/src/lib/think/route.ts +358 -0
  54. package/src/lib/think/schema.ts +98 -0
  55. package/src/lib/think/scratchpad-schema.ts +662 -0
  56. package/src/lib/think/spot-check.ts +961 -0
  57. package/src/lib/think/types.ts +93 -0
  58. package/src/lib/think/verification.ts +260 -0
  59. package/src/lib/tokens.ts +177 -0
  60. package/src/lib/verification.ts +620 -0
  61. package/src/prompts/index.ts +10 -0
  62. package/src/prompts/templates.ts +336 -0
  63. package/src/resources/index.ts +8 -0
  64. package/src/resources/sessions.ts +196 -0
  65. package/src/tools/compress.ts +138 -0
  66. package/src/tools/index.ts +5 -0
  67. package/src/tools/scratchpad.ts +2659 -0
  68. package/src/tools/sessions.ts +144 -0
@@ -0,0 +1,122 @@
1
+ /**
2
+ * Main barrel export for src/lib/
3
+ * Re-exports commonly used utilities for convenient imports
4
+ */
5
+
6
+ // Cache utilities
7
+ export { verificationCache } from "./cache";
8
+ // Compression utilities
9
+ export {
10
+ calculateEntropy,
11
+ cleanFillers,
12
+ compress,
13
+ computeNCD,
14
+ isMetaSentence,
15
+ jaccardSimilarity,
16
+ needsCompression,
17
+ quickCompress,
18
+ } from "./compression";
19
+ // Local compute
20
+ export {
21
+ computeConfidence,
22
+ extractAndCompute,
23
+ isLikelyComputable,
24
+ tryLocalCompute,
25
+ } from "./compute/index";
26
+ // Concept tracking
27
+ export {
28
+ ConceptTracker,
29
+ clearTracker,
30
+ getTracker,
31
+ } from "./concepts";
32
+ // Unified domain detection
33
+ export {
34
+ type DomainResult,
35
+ detectDomainFull,
36
+ detectGranularDomain,
37
+ detectMetaDomain,
38
+ detectVerificationDomain,
39
+ type GranularDomain,
40
+ getDomainWeight,
41
+ getRelevantSolvers,
42
+ isSolverRelevant,
43
+ type MetaDomain,
44
+ } from "./domain";
45
+ // Answer extraction & matching
46
+ export {
47
+ type AnswerExtractionResult,
48
+ answersMatch,
49
+ extractAnswer,
50
+ extractAnswerWithConfidence,
51
+ normalizeAnswer,
52
+ parseFraction,
53
+ shouldStreamStrip,
54
+ stripLLMOutput,
55
+ stripLLMOutputAsync,
56
+ stripLLMOutputStreaming,
57
+ stripMarkdown,
58
+ stripThinkingTagsFast,
59
+ } from "./extraction";
60
+ // LLM-as-Judge for response comparison
61
+ export {
62
+ type DimensionScores,
63
+ type JudgeInput,
64
+ type JudgeResult,
65
+ type JudgeSummary,
66
+ judgeBatch,
67
+ judgeResponses,
68
+ type LLMJudgeFunc,
69
+ summarizeJudgments,
70
+ } from "./judge";
71
+ // Math module (operators, tokenizer, AST) - also re-exported via verification
72
+ export * as math from "./math";
73
+ // Session management
74
+ export {
75
+ SessionManager,
76
+ SessionManagerImpl,
77
+ type ThoughtRecord,
78
+ } from "./session";
79
+ // Think module (comprehensive)
80
+ export * from "./think";
81
+ // Token estimation (from think/verification)
82
+ export {
83
+ estimateCodeTokens,
84
+ estimateTokens,
85
+ estimateTokensBatch,
86
+ } from "./think/verification";
87
+ // Verification (domain-specific)
88
+ export {
89
+ type ASTNode,
90
+ type ASTResult,
91
+ type BinaryNode,
92
+ buildAST,
93
+ canBeUnary,
94
+ clearVerificationCache,
95
+ compareExpressions,
96
+ compareOperatorPrecedence,
97
+ type EvalResult,
98
+ type ExpressionValidation,
99
+ evaluateExpression,
100
+ type FormatASTOptions,
101
+ type FormatOptions,
102
+ formatAST,
103
+ formatExpression,
104
+ getOperatorArity,
105
+ getOperatorArityInContext,
106
+ getOperatorPrecedence,
107
+ getVerificationCacheStats,
108
+ isMathOperator,
109
+ isRightAssociative,
110
+ MATH_OPERATOR_PATTERN,
111
+ MATH_OPERATORS,
112
+ type MathToken,
113
+ type MathTokenType,
114
+ type NumberNode,
115
+ simplifyAST,
116
+ type TokenizeResult,
117
+ tokenizeMathExpression,
118
+ type UnaryNode,
119
+ type VariableNode,
120
+ validateExpression,
121
+ verify,
122
+ } from "./verification";
@@ -0,0 +1,260 @@
1
+ /**
2
+ * LLM-as-Judge: Compare two responses and score quality
3
+ *
4
+ * Uses a separate LLM call to evaluate response quality on multiple dimensions.
5
+ * Designed for open-ended questions where exact-match verification isn't possible.
6
+ */
7
+
8
/**
 * Outcome of one pairwise comparison between response A and response B.
 */
export interface JudgeResult {
  /** Preferred response: "A", "B", or "tie" when neither is better */
  winner: "A" | "B" | "tie";
  /** How sure the judge is about the verdict (0-1) */
  confidence: number;
  /** Per-dimension ratings (1-5 scale), keyed by response label */
  scores: Record<"A" | "B", DimensionScores>;
  /** Short justification for the verdict */
  reasoning: string;
  /** Unprocessed judge output, kept for debugging */
  raw_response?: string;
}
23
+
24
/**
 * Ratings for a single response, one value per evaluation dimension.
 */
export interface DimensionScores {
  /** Factual correctness and accuracy of the response */
  accuracy: number;
  /** How logical and well-structured the reasoning is */
  reasoning_quality: number;
  /** Whether the question is fully addressed */
  completeness: number;
  /** How clear and well-written the response is */
  clarity: number;
  /** Overall quality score */
  overall: number;
}
36
+
37
/**
 * Everything the judge needs to compare one pair of responses.
 */
export interface JudgeInput {
  /** Original question or prompt both responses answer */
  question: string;
  /** First candidate (typically the baseline) */
  response_a: string;
  /** Second candidate (typically produced with the tool) */
  response_b: string;
  /** Reference answer used for grounding, when available */
  reference_answer?: string;
  /** Category label enabling domain-specific judging, when available */
  category?: string;
}
49
+
50
+ /**
51
+ * System prompt for the judge LLM.
+ *
+ * Passed as the `system` argument of the judge call in judgeResponses.
+ * It lists the five scoring dimensions (1-5 scale) and asks for a JSON-only
+ * reply with the keys scores_a, scores_b, winner, confidence and reasoning,
+ * which are the keys parseJudgeResponse reads back out of the reply.
+ * The text is sent verbatim, so any edit changes what the judge model sees.
52
+ */
53
+ const JUDGE_SYSTEM_PROMPT = `You are an expert evaluator comparing two AI responses to the same question.
54
+
55
+ Your task is to evaluate both responses on these dimensions (1-5 scale):
56
+ 1. **Accuracy**: Factual correctness, no hallucinations or errors
57
+ 2. **Reasoning Quality**: Logical flow, clear step-by-step thinking
58
+ 3. **Completeness**: Fully addresses the question, no missing parts
59
+ 4. **Clarity**: Well-written, easy to understand
60
+ 5. **Overall**: Holistic quality assessment
61
+
62
+ IMPORTANT RULES:
63
+ - Be objective and fair to both responses
64
+ - If a reference answer is provided, use it as ground truth
65
+ - Consider the question type when weighing dimensions
66
+ - Explain your reasoning briefly
67
+
68
+ OUTPUT FORMAT (JSON only, no markdown):
69
+ {
70
+ "scores_a": { "accuracy": N, "reasoning_quality": N, "completeness": N, "clarity": N, "overall": N },
71
+ "scores_b": { "accuracy": N, "reasoning_quality": N, "completeness": N, "clarity": N, "overall": N },
72
+ "winner": "A" | "B" | "tie",
73
+ "confidence": 0.0-1.0,
74
+ "reasoning": "Brief explanation"
75
+ }`;
76
+
77
/**
 * Assemble the user-facing prompt handed to the judge.
 *
 * Sections appear in a fixed order: question, optional reference answer,
 * response A, response B, optional category hint, then the closing
 * instruction. Optional sections are omitted when their field is missing
 * or an empty string.
 *
 * @param input - Question, both responses and optional grounding/category
 * @returns The complete prompt text
 */
function buildJudgePrompt(input: JudgeInput): string {
  const { question, reference_answer, response_a, response_b, category } = input;

  const sections: string[] = [`QUESTION:\n${question}\n\n`];

  if (reference_answer) {
    sections.push(`REFERENCE ANSWER:\n${reference_answer}\n\n`);
  }

  sections.push(`RESPONSE A:\n${response_a}\n\n`, `RESPONSE B:\n${response_b}\n\n`);

  if (category) {
    sections.push(`CATEGORY: ${category} (weight accuracy higher for math/logic)\n\n`);
  }

  sections.push(`Evaluate both responses and output JSON only.`);

  return sections.join("");
}
98
+
99
/**
 * Parse the judge's raw reply into a structured, validated result.
 *
 * The first "{" through the last "}" of the reply is treated as the JSON
 * payload, so surrounding prose or code fences are tolerated.
 *
 * Validation and normalization applied to the payload:
 * - `scores_a` and `scores_b` must be objects, otherwise the reply is rejected.
 * - `winner` is matched case-insensitively against "A", "B" and "tie";
 *   any other value rejects the reply instead of leaking an invalid label.
 * - Each score accepts a finite number (or numeric string), is clamped to the
 *   1-5 scale, and defaults to 3 when missing or unusable.
 * - `confidence` is clamped to 0-1 and defaults to 0.5 only when missing or
 *   unusable; an explicit 0 is preserved.
 * - `reasoning` must be a non-empty string, otherwise a placeholder is used.
 *
 * @param response - Raw text returned by the judge LLM
 * @returns The parsed judgment, or null when the reply cannot be used
 */
function parseJudgeResponse(response: string): Omit<JudgeResult, "raw_response"> | null {
  // Defensive: a misbehaving backend may hand back a non-string at runtime.
  if (typeof response !== "string") return null;

  const jsonMatch = response.match(/\{[\s\S]*\}/);
  if (!jsonMatch) return null;

  let payload: unknown;
  try {
    payload = JSON.parse(jsonMatch[0]);
  } catch {
    return null;
  }

  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  // Accept real numbers and numeric strings; everything else is "missing".
  const toFiniteNumber = (value: unknown): number | null => {
    const candidate = typeof value === "string" && value.trim() !== "" ? Number(value) : value;
    return typeof candidate === "number" && Number.isFinite(candidate) ? candidate : null;
  };

  const clampOrDefault = (value: unknown, min: number, max: number, fallback: number): number => {
    const n = toFiniteNumber(value);
    return n === null ? fallback : Math.min(max, Math.max(min, n));
  };

  const normalizeWinner = (value: unknown): "A" | "B" | "tie" | null => {
    if (typeof value !== "string") return null;
    switch (value.trim().toLowerCase()) {
      case "a":
        return "A";
      case "b":
        return "B";
      case "tie":
        return "tie";
      default:
        return null;
    }
  };

  if (!isRecord(payload)) return null;
  const { scores_a, scores_b } = payload;
  if (!isRecord(scores_a) || !isRecord(scores_b)) return null;

  const winner = normalizeWinner(payload.winner);
  if (winner === null) return null;

  const mapScores = (s: Record<string, unknown>): DimensionScores => ({
    accuracy: clampOrDefault(s.accuracy, 1, 5, 3),
    reasoning_quality: clampOrDefault(s.reasoning_quality, 1, 5, 3),
    completeness: clampOrDefault(s.completeness, 1, 5, 3),
    clarity: clampOrDefault(s.clarity, 1, 5, 3),
    overall: clampOrDefault(s.overall, 1, 5, 3),
  });

  const reasoning =
    typeof payload.reasoning === "string" && payload.reasoning !== ""
      ? payload.reasoning
      : "No reasoning provided";

  return {
    winner,
    confidence: clampOrDefault(payload.confidence, 0, 1, 0.5),
    scores: {
      A: mapScores(scores_a),
      B: mapScores(scores_b),
    },
    reasoning,
  };
}
136
+
137
/**
 * Pluggable judge backend.
 *
 * Receives the user prompt and the system prompt and resolves to the
 * model's raw text reply, so any LLM provider can be wired in.
 */
export type LLMJudgeFunc = (
  prompt: string,
  system: string,
) => Promise<string>;
141
+
142
/**
 * Run a single LLM-as-Judge comparison of two responses.
 *
 * Builds the prompt, sends it with the judge system prompt, and parses the
 * reply. When the reply cannot be parsed, a neutral verdict is returned
 * (tie, confidence 0, every score 3). The raw reply is attached in both cases.
 *
 * @param input - The question and two responses to compare
 * @param llmCall - Function to call the judge LLM
 * @returns Structured judgment result
 */
export async function judgeResponses(
  input: JudgeInput,
  llmCall: LLMJudgeFunc,
): Promise<JudgeResult> {
  const rawReply = await llmCall(buildJudgePrompt(input), JUDGE_SYSTEM_PROMPT);
  const verdict = parseJudgeResponse(rawReply);

  if (verdict) {
    return { ...verdict, raw_response: rawReply };
  }

  // Unparseable reply: report a neutral verdict instead of throwing.
  const neutralScores = (): DimensionScores => ({
    accuracy: 3,
    reasoning_quality: 3,
    completeness: 3,
    clarity: 3,
    overall: 3,
  });

  return {
    winner: "tie",
    confidence: 0,
    scores: { A: neutralScores(), B: neutralScores() },
    reasoning: "Failed to parse judge response",
    raw_response: rawReply,
  };
}
177
+
178
/**
 * Judge multiple response pairs, a fixed-size batch at a time.
 *
 * Inputs are processed in consecutive batches of `concurrency` items; the
 * items of one batch run in parallel and the next batch starts only after
 * the current one has finished. Results are returned in input order.
 *
 * `concurrency` is sanitized before use: fractional values are floored,
 * values below 1 are raised to 1, and NaN or null falls back to the default
 * of 3. Without this, a step of 0 or less would never advance the loop and
 * NaN would silently drop every input.
 *
 * @param inputs - Response pairs to judge
 * @param llmCall - Function to call the judge LLM
 * @param options - `concurrency`: maximum parallel judge calls (default 3)
 * @returns One judgment per input, in the same order
 */
export async function judgeBatch(
  inputs: JudgeInput[],
  llmCall: LLMJudgeFunc,
  options: { concurrency?: number } = {},
): Promise<JudgeResult[]> {
  const requested = options.concurrency ?? 3;
  const batchSize = Number.isNaN(requested) ? 3 : Math.max(1, Math.floor(requested));
  const results: JudgeResult[] = [];

  // Process in batches for rate limiting
  for (let i = 0; i < inputs.length; i += batchSize) {
    const batch = inputs.slice(i, i + batchSize);
    const batchResults = await Promise.all(batch.map((input) => judgeResponses(input, llmCall)));
    results.push(...batchResults);
  }

  return results;
}
198
+
199
/**
 * Summary statistics aggregated over a set of judge results.
 */
export interface JudgeSummary {
  /** Number of judgments summarized */
  total: number;
  /** Judgments whose winner is "A" */
  wins_a: number;
  /** Judgments whose winner is "B" */
  wins_b: number;
  /** Judgments whose winner is "tie" */
  ties: number;
  /** wins_a divided by total (0 when there are no judgments) */
  win_rate_a: number;
  /** wins_b divided by total (0 when there are no judgments) */
  win_rate_b: number;
  /** Mean score per dimension for response A */
  avg_scores_a: DimensionScores;
  /** Mean score per dimension for response B */
  avg_scores_b: DimensionScores;
  /** Mean judge confidence */
  avg_confidence: number;
}
213
+
214
/**
 * Aggregate judge results into summary statistics.
 *
 * Counts wins for each side and ties, and averages every score dimension
 * and the confidence over all results. An empty input yields a summary
 * with every field set to 0.
 *
 * @param results - Judgments to summarize
 * @returns Counts, win rates and averages across the judgments
 */
export function summarizeJudgments(results: JudgeResult[]): JudgeSummary {
  const zeroScores = (): DimensionScores => ({
    accuracy: 0,
    reasoning_quality: 0,
    completeness: 0,
    clarity: 0,
    overall: 0,
  });

  const total = results.length;
  if (total === 0) {
    return {
      total: 0,
      wins_a: 0,
      wins_b: 0,
      ties: 0,
      win_rate_a: 0,
      win_rate_b: 0,
      avg_scores_a: zeroScores(),
      avg_scores_b: zeroScores(),
      avg_confidence: 0,
    };
  }

  const dimensions: (keyof DimensionScores)[] = [
    "accuracy",
    "reasoning_quality",
    "completeness",
    "clarity",
    "overall",
  ];

  let wins_a = 0;
  let wins_b = 0;
  let ties = 0;
  let confidenceSum = 0;
  const sumA = zeroScores();
  const sumB = zeroScores();

  // One pass over the results collects every running total.
  for (const result of results) {
    if (result.winner === "A") {
      wins_a += 1;
    } else if (result.winner === "B") {
      wins_b += 1;
    } else if (result.winner === "tie") {
      ties += 1;
    }

    for (const dim of dimensions) {
      sumA[dim] = sumA[dim] + result.scores.A[dim];
      sumB[dim] = sumB[dim] + result.scores.B[dim];
    }

    confidenceSum = confidenceSum + result.confidence;
  }

  // Turn the running totals into per-dimension means.
  const toAverages = (sums: DimensionScores): DimensionScores => {
    const averages = zeroScores();
    for (const dim of dimensions) {
      averages[dim] = sums[dim] / total;
    }
    return averages;
  };

  return {
    total,
    wins_a,
    wins_b,
    ties,
    win_rate_a: wins_a / total,
    win_rate_b: wins_b / total,
    avg_scores_a: toAverages(sumA),
    avg_scores_b: toAverages(sumB),
    avg_confidence: confidenceSum / total,
  };
}