verifiable-thinking-mcp 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +339 -0
- package/package.json +75 -0
- package/src/index.ts +38 -0
- package/src/lib/cache.ts +246 -0
- package/src/lib/compression.ts +804 -0
- package/src/lib/compute/cache.ts +86 -0
- package/src/lib/compute/classifier.ts +555 -0
- package/src/lib/compute/confidence.ts +79 -0
- package/src/lib/compute/context.ts +154 -0
- package/src/lib/compute/extract.ts +200 -0
- package/src/lib/compute/filter.ts +224 -0
- package/src/lib/compute/index.ts +171 -0
- package/src/lib/compute/math.ts +247 -0
- package/src/lib/compute/patterns.ts +564 -0
- package/src/lib/compute/registry.ts +145 -0
- package/src/lib/compute/solvers/arithmetic.ts +65 -0
- package/src/lib/compute/solvers/calculus.ts +249 -0
- package/src/lib/compute/solvers/derivation-core.ts +371 -0
- package/src/lib/compute/solvers/derivation-latex.ts +160 -0
- package/src/lib/compute/solvers/derivation-mistakes.ts +1046 -0
- package/src/lib/compute/solvers/derivation-simplify.ts +451 -0
- package/src/lib/compute/solvers/derivation-transform.ts +620 -0
- package/src/lib/compute/solvers/derivation.ts +67 -0
- package/src/lib/compute/solvers/facts.ts +120 -0
- package/src/lib/compute/solvers/formula.ts +728 -0
- package/src/lib/compute/solvers/index.ts +36 -0
- package/src/lib/compute/solvers/logic.ts +422 -0
- package/src/lib/compute/solvers/probability.ts +307 -0
- package/src/lib/compute/solvers/statistics.ts +262 -0
- package/src/lib/compute/solvers/word-problems.ts +408 -0
- package/src/lib/compute/types.ts +107 -0
- package/src/lib/concepts.ts +111 -0
- package/src/lib/domain.ts +731 -0
- package/src/lib/extraction.ts +912 -0
- package/src/lib/index.ts +122 -0
- package/src/lib/judge.ts +260 -0
- package/src/lib/math/ast.ts +842 -0
- package/src/lib/math/index.ts +8 -0
- package/src/lib/math/operators.ts +171 -0
- package/src/lib/math/tokenizer.ts +477 -0
- package/src/lib/patterns.ts +200 -0
- package/src/lib/session.ts +825 -0
- package/src/lib/think/challenge.ts +323 -0
- package/src/lib/think/complexity.ts +504 -0
- package/src/lib/think/confidence-drift.ts +507 -0
- package/src/lib/think/consistency.ts +347 -0
- package/src/lib/think/guidance.ts +188 -0
- package/src/lib/think/helpers.ts +568 -0
- package/src/lib/think/hypothesis.ts +216 -0
- package/src/lib/think/index.ts +127 -0
- package/src/lib/think/prompts.ts +262 -0
- package/src/lib/think/route.ts +358 -0
- package/src/lib/think/schema.ts +98 -0
- package/src/lib/think/scratchpad-schema.ts +662 -0
- package/src/lib/think/spot-check.ts +961 -0
- package/src/lib/think/types.ts +93 -0
- package/src/lib/think/verification.ts +260 -0
- package/src/lib/tokens.ts +177 -0
- package/src/lib/verification.ts +620 -0
- package/src/prompts/index.ts +10 -0
- package/src/prompts/templates.ts +336 -0
- package/src/resources/index.ts +8 -0
- package/src/resources/sessions.ts +196 -0
- package/src/tools/compress.ts +138 -0
- package/src/tools/index.ts +5 -0
- package/src/tools/scratchpad.ts +2659 -0
- package/src/tools/sessions.ts +144 -0
package/src/lib/index.ts
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Main barrel export for src/lib/
|
|
3
|
+
* Re-exports commonly used utilities for convenient imports
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
// Cache utilities
|
|
7
|
+
export { verificationCache } from "./cache";
|
|
8
|
+
// Compression utilities
|
|
9
|
+
export {
|
|
10
|
+
calculateEntropy,
|
|
11
|
+
cleanFillers,
|
|
12
|
+
compress,
|
|
13
|
+
computeNCD,
|
|
14
|
+
isMetaSentence,
|
|
15
|
+
jaccardSimilarity,
|
|
16
|
+
needsCompression,
|
|
17
|
+
quickCompress,
|
|
18
|
+
} from "./compression";
|
|
19
|
+
// Local compute
|
|
20
|
+
export {
|
|
21
|
+
computeConfidence,
|
|
22
|
+
extractAndCompute,
|
|
23
|
+
isLikelyComputable,
|
|
24
|
+
tryLocalCompute,
|
|
25
|
+
} from "./compute/index";
|
|
26
|
+
// Concept tracking
|
|
27
|
+
export {
|
|
28
|
+
ConceptTracker,
|
|
29
|
+
clearTracker,
|
|
30
|
+
getTracker,
|
|
31
|
+
} from "./concepts";
|
|
32
|
+
// Unified domain detection
|
|
33
|
+
export {
|
|
34
|
+
type DomainResult,
|
|
35
|
+
detectDomainFull,
|
|
36
|
+
detectGranularDomain,
|
|
37
|
+
detectMetaDomain,
|
|
38
|
+
detectVerificationDomain,
|
|
39
|
+
type GranularDomain,
|
|
40
|
+
getDomainWeight,
|
|
41
|
+
getRelevantSolvers,
|
|
42
|
+
isSolverRelevant,
|
|
43
|
+
type MetaDomain,
|
|
44
|
+
} from "./domain";
|
|
45
|
+
// Answer extraction & matching
|
|
46
|
+
export {
|
|
47
|
+
type AnswerExtractionResult,
|
|
48
|
+
answersMatch,
|
|
49
|
+
extractAnswer,
|
|
50
|
+
extractAnswerWithConfidence,
|
|
51
|
+
normalizeAnswer,
|
|
52
|
+
parseFraction,
|
|
53
|
+
shouldStreamStrip,
|
|
54
|
+
stripLLMOutput,
|
|
55
|
+
stripLLMOutputAsync,
|
|
56
|
+
stripLLMOutputStreaming,
|
|
57
|
+
stripMarkdown,
|
|
58
|
+
stripThinkingTagsFast,
|
|
59
|
+
} from "./extraction";
|
|
60
|
+
// LLM-as-Judge for response comparison
|
|
61
|
+
export {
|
|
62
|
+
type DimensionScores,
|
|
63
|
+
type JudgeInput,
|
|
64
|
+
type JudgeResult,
|
|
65
|
+
type JudgeSummary,
|
|
66
|
+
judgeBatch,
|
|
67
|
+
judgeResponses,
|
|
68
|
+
type LLMJudgeFunc,
|
|
69
|
+
summarizeJudgments,
|
|
70
|
+
} from "./judge";
|
|
71
|
+
// Math module (operators, tokenizer, AST) - also re-exported via verification
|
|
72
|
+
export * as math from "./math";
|
|
73
|
+
// Session management
|
|
74
|
+
export {
|
|
75
|
+
SessionManager,
|
|
76
|
+
SessionManagerImpl,
|
|
77
|
+
type ThoughtRecord,
|
|
78
|
+
} from "./session";
|
|
79
|
+
// Think module (comprehensive)
|
|
80
|
+
export * from "./think";
|
|
81
|
+
// Token estimation (from think/verification)
|
|
82
|
+
export {
|
|
83
|
+
estimateCodeTokens,
|
|
84
|
+
estimateTokens,
|
|
85
|
+
estimateTokensBatch,
|
|
86
|
+
} from "./think/verification";
|
|
87
|
+
// Verification (domain-specific)
|
|
88
|
+
export {
|
|
89
|
+
type ASTNode,
|
|
90
|
+
type ASTResult,
|
|
91
|
+
type BinaryNode,
|
|
92
|
+
buildAST,
|
|
93
|
+
canBeUnary,
|
|
94
|
+
clearVerificationCache,
|
|
95
|
+
compareExpressions,
|
|
96
|
+
compareOperatorPrecedence,
|
|
97
|
+
type EvalResult,
|
|
98
|
+
type ExpressionValidation,
|
|
99
|
+
evaluateExpression,
|
|
100
|
+
type FormatASTOptions,
|
|
101
|
+
type FormatOptions,
|
|
102
|
+
formatAST,
|
|
103
|
+
formatExpression,
|
|
104
|
+
getOperatorArity,
|
|
105
|
+
getOperatorArityInContext,
|
|
106
|
+
getOperatorPrecedence,
|
|
107
|
+
getVerificationCacheStats,
|
|
108
|
+
isMathOperator,
|
|
109
|
+
isRightAssociative,
|
|
110
|
+
MATH_OPERATOR_PATTERN,
|
|
111
|
+
MATH_OPERATORS,
|
|
112
|
+
type MathToken,
|
|
113
|
+
type MathTokenType,
|
|
114
|
+
type NumberNode,
|
|
115
|
+
simplifyAST,
|
|
116
|
+
type TokenizeResult,
|
|
117
|
+
tokenizeMathExpression,
|
|
118
|
+
type UnaryNode,
|
|
119
|
+
type VariableNode,
|
|
120
|
+
validateExpression,
|
|
121
|
+
verify,
|
|
122
|
+
} from "./verification";
|
package/src/lib/judge.ts
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-as-Judge: Compare two responses and score quality
|
|
3
|
+
*
|
|
4
|
+
* Uses a separate LLM call to evaluate response quality on multiple dimensions.
|
|
5
|
+
* Designed for open-ended questions where exact-match verification isn't possible.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Outcome of one pairwise comparison between response A and response B.
 */
export interface JudgeResult {
  /** Preferred response: "A", "B", or "tie". */
  winner: "A" | "B" | "tie";
  /** How confident the judge is in the verdict, on a 0-1 scale. */
  confidence: number;
  /** Per-dimension scores (1-5 scale), keyed by response label. */
  scores: Record<"A" | "B", DimensionScores>;
  /** Short explanation backing the verdict. */
  reasoning: string;
  /** Unprocessed judge output, retained for debugging. */
  raw_response?: string;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Quality scores assigned to a single response, one number per dimension.
 */
export interface DimensionScores {
  /** Factual correctness of the response. */
  accuracy: number;
  /** How logical and well-structured the reasoning is. */
  reasoning_quality: number;
  /** Whether the response covers the whole question. */
  completeness: number;
  /** How clear and well-written the response is. */
  clarity: number;
  /** Holistic quality score. */
  overall: number;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Everything the judge needs to compare two responses to one question.
 */
export interface JudgeInput {
  /** Original question or prompt both responses answer. */
  question: string;
  /** First candidate (typically the baseline). */
  response_a: string;
  /** Second candidate (typically produced with the tool). */
  response_b: string;
  /** Optional reference answer used for grounding. */
  reference_answer?: string;
  /** Optional category enabling domain-specific judging. */
  category?: string;
}
|
|
49
|
+
|
|
50
|
+
/**
 * System prompt sent to the judge LLM.
 *
 * Describes the five scoring dimensions, the judging rules, and the JSON
 * shape the judge is asked to reply with. Assembled line by line so each
 * instruction is easy to locate and edit.
 */
const JUDGE_SYSTEM_PROMPT = [
  "You are an expert evaluator comparing two AI responses to the same question.",
  "",
  "Your task is to evaluate both responses on these dimensions (1-5 scale):",
  "1. **Accuracy**: Factual correctness, no hallucinations or errors",
  "2. **Reasoning Quality**: Logical flow, clear step-by-step thinking",
  "3. **Completeness**: Fully addresses the question, no missing parts",
  "4. **Clarity**: Well-written, easy to understand",
  "5. **Overall**: Holistic quality assessment",
  "",
  "IMPORTANT RULES:",
  "- Be objective and fair to both responses",
  "- If a reference answer is provided, use it as ground truth",
  "- Consider the question type when weighing dimensions",
  "- Explain your reasoning briefly",
  "",
  "OUTPUT FORMAT (JSON only, no markdown):",
  "{",
  '"scores_a": { "accuracy": N, "reasoning_quality": N, "completeness": N, "clarity": N, "overall": N },',
  '"scores_b": { "accuracy": N, "reasoning_quality": N, "completeness": N, "clarity": N, "overall": N },',
  '"winner": "A" | "B" | "tie",',
  '"confidence": 0.0-1.0,',
  '"reasoning": "Brief explanation"',
  "}",
].join("\n");
|
|
76
|
+
|
|
77
|
+
/**
 * Assemble the user-facing prompt handed to the judge.
 *
 * Sections appear in a fixed order: question, optional reference answer,
 * response A, response B, optional category hint, then the closing
 * instruction. Optional sections are left out when their field is missing
 * or an empty string.
 *
 * @param input - Question, the two responses, and optional grounding data
 * @returns The complete prompt text
 */
function buildJudgePrompt(input: JudgeInput): string {
  const sections: string[] = [`QUESTION:\n${input.question}\n\n`];

  if (input.reference_answer) {
    sections.push(`REFERENCE ANSWER:\n${input.reference_answer}\n\n`);
  }

  sections.push(
    `RESPONSE A:\n${input.response_a}\n\n`,
    `RESPONSE B:\n${input.response_b}\n\n`,
  );

  if (input.category) {
    sections.push(`CATEGORY: ${input.category} (weight accuracy higher for math/logic)\n\n`);
  }

  sections.push(`Evaluate both responses and output JSON only.`);

  return sections.join("");
}
|
|
98
|
+
|
|
99
|
+
/** True for non-null, non-array objects, i.e. values that can carry named fields. */
function isJudgeRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Map the judge's winner label onto the canonical "A" | "B" | "tie" union,
 * ignoring case and surrounding whitespace. Returns null for anything else.
 */
function normalizeJudgeWinner(value: unknown): "A" | "B" | "tie" | null {
  if (typeof value !== "string") return null;
  const label = value.trim().toLowerCase();
  if (label === "a") return "A";
  if (label === "b") return "B";
  if (label === "tie") return "tie";
  return null;
}

/**
 * Read a number (or numeric string) and clamp it to [min, max].
 * Missing, non-numeric, or non-finite values yield `fallback`.
 */
function coerceJudgeNumber(value: unknown, min: number, max: number, fallback: number): number {
  let numeric = Number.NaN;
  if (typeof value === "number") {
    numeric = value;
  } else if (typeof value === "string" && value.trim() !== "") {
    numeric = Number(value);
  }
  if (!Number.isFinite(numeric)) return fallback;
  return Math.min(max, Math.max(min, numeric));
}

/**
 * Parse the judge's raw reply into a structured result.
 *
 * The first `{` through the last `}` of the reply is treated as JSON, so
 * replies wrapped in prose or markdown fences are still accepted.
 *
 * Validation performed on the parsed object:
 * - `winner` must be "A", "B", or "tie" (case-insensitive); otherwise the
 *   reply is rejected.
 * - `scores_a` and `scores_b` must be objects; each dimension is coerced to a
 *   number in [1, 5], defaulting to 3 when missing or non-numeric.
 * - `confidence` is coerced to a number in [0, 1], defaulting to 0.5 only
 *   when missing or non-numeric (an explicit 0 is preserved).
 * - `reasoning` must be a non-empty string, otherwise a placeholder is used.
 *
 * @param response - Raw text returned by the judge LLM
 * @returns The structured judgment, or null when the reply cannot be parsed
 *          or fails validation
 */
function parseJudgeResponse(response: string): Omit<JudgeResult, "raw_response"> | null {
  if (typeof response !== "string") return null;

  // Greedy match: grab everything between the outermost braces.
  const jsonMatch = response.match(/\{[\s\S]*\}/);
  if (!jsonMatch) return null;

  let parsed: unknown;
  try {
    parsed = JSON.parse(jsonMatch[0]);
  } catch {
    // Malformed JSON is an expected failure mode for LLM output; the caller
    // handles null by falling back to a neutral verdict.
    return null;
  }

  if (!isJudgeRecord(parsed)) return null;

  const winner = normalizeJudgeWinner(parsed.winner);
  const rawA = parsed.scores_a;
  const rawB = parsed.scores_b;
  if (winner === null || !isJudgeRecord(rawA) || !isJudgeRecord(rawB)) {
    return null;
  }

  const toScores = (raw: Record<string, unknown>): DimensionScores => ({
    accuracy: coerceJudgeNumber(raw.accuracy, 1, 5, 3),
    reasoning_quality: coerceJudgeNumber(raw.reasoning_quality, 1, 5, 3),
    completeness: coerceJudgeNumber(raw.completeness, 1, 5, 3),
    clarity: coerceJudgeNumber(raw.clarity, 1, 5, 3),
    overall: coerceJudgeNumber(raw.overall, 1, 5, 3),
  });

  const reasoning =
    typeof parsed.reasoning === "string" && parsed.reasoning.length > 0
      ? parsed.reasoning
      : "No reasoning provided";

  return {
    winner,
    confidence: coerceJudgeNumber(parsed.confidence, 0, 1, 0.5),
    scores: {
      A: toScores(rawA),
      B: toScores(rawB),
    },
    reasoning,
  };
}
|
|
136
|
+
|
|
137
|
+
/**
 * Signature of the function that talks to the judge LLM.
 *
 * Kept as a plain callback so any backend can be plugged in: it receives the
 * user prompt and the system prompt, and resolves with the model's text reply.
 */
export type LLMJudgeFunc = (
  prompt: string,
  system: string,
) => Promise<string>;
|
|
141
|
+
|
|
142
|
+
/**
 * Run a single LLM-as-Judge comparison of two responses.
 *
 * Builds the judge prompt, sends it with the judge system prompt through
 * `llmCall`, and parses the reply. When the reply cannot be parsed, a neutral
 * verdict is returned instead: a tie with confidence 0 and every dimension
 * scored 3. The raw reply is attached to the result in both cases.
 *
 * @param input - The question and the two responses to compare
 * @param llmCall - Function that calls the judge LLM
 * @returns The structured judgment
 */
export async function judgeResponses(
  input: JudgeInput,
  llmCall: LLMJudgeFunc,
): Promise<JudgeResult> {
  const rawReply = await llmCall(buildJudgePrompt(input), JUDGE_SYSTEM_PROMPT);
  const verdict = parseJudgeResponse(rawReply);

  if (verdict) {
    return { ...verdict, raw_response: rawReply };
  }

  // Unparseable reply: report a neutral outcome rather than failing.
  const neutralScores = (): DimensionScores => ({
    accuracy: 3,
    reasoning_quality: 3,
    completeness: 3,
    clarity: 3,
    overall: 3,
  });

  return {
    winner: "tie",
    confidence: 0,
    scores: { A: neutralScores(), B: neutralScores() },
    reasoning: "Failed to parse judge response",
    raw_response: rawReply,
  };
}
|
|
177
|
+
|
|
178
|
+
/**
 * Judge many response pairs, running a limited number of comparisons at once.
 *
 * Inputs are processed in consecutive groups of `concurrency` items; every
 * comparison in a group is awaited before the next group starts, which acts
 * as simple rate limiting. Results are returned in the same order as `inputs`.
 *
 * @param inputs - Response pairs to judge
 * @param llmCall - Function that calls the judge LLM
 * @param options - `concurrency`: maximum comparisons in flight (default 3).
 *   Fractional values are rounded down; values below 1 or NaN fall back to 1
 *   (sequential processing).
 * @returns One judgment per input, in input order
 */
export async function judgeBatch(
  inputs: JudgeInput[],
  llmCall: LLMJudgeFunc,
  options: { concurrency?: number } = {},
): Promise<JudgeResult[]> {
  const { concurrency = 3 } = options;

  // The group size is also the loop step: 0 would never advance and NaN would
  // end the loop before any input is processed, so normalize it to >= 1.
  const batchSize = concurrency >= 1 ? Math.floor(concurrency) : 1;

  const results: JudgeResult[] = [];

  for (let start = 0; start < inputs.length; start += batchSize) {
    const batch = inputs.slice(start, start + batchSize);
    const batchResults = await Promise.all(batch.map((input) => judgeResponses(input, llmCall)));
    results.push(...batchResults);
  }

  return results;
}
|
|
198
|
+
|
|
199
|
+
/**
 * Summary statistics aggregated over a set of judge results.
 */
export interface JudgeSummary {
  /** Number of judgments summarized. */
  total: number;
  /** Judgments won by response A. */
  wins_a: number;
  /** Judgments won by response B. */
  wins_b: number;
  /** Judgments that ended in a tie. */
  ties: number;
  /** Share of judgments won by A (`wins_a / total`). */
  win_rate_a: number;
  /** Share of judgments won by B (`wins_b / total`). */
  win_rate_b: number;
  /** Mean score per dimension for response A. */
  avg_scores_a: DimensionScores;
  /** Mean score per dimension for response B. */
  avg_scores_b: DimensionScores;
  /** Mean judge confidence. */
  avg_confidence: number;
}
|
|
213
|
+
|
|
214
|
+
/**
 * Aggregate a list of judgments into win counts, win rates, and averages.
 *
 * An empty list yields a summary made entirely of zeros. Win rates are
 * computed against the full count, so ties lower both rates.
 *
 * @param results - Judgments to aggregate
 * @returns Summary statistics for the whole set
 */
export function summarizeJudgments(results: JudgeResult[]): JudgeSummary {
  const zeroScores = (): DimensionScores => ({
    accuracy: 0,
    reasoning_quality: 0,
    completeness: 0,
    clarity: 0,
    overall: 0,
  });

  const total = results.length;
  if (total === 0) {
    return {
      total: 0,
      wins_a: 0,
      wins_b: 0,
      ties: 0,
      win_rate_a: 0,
      win_rate_b: 0,
      avg_scores_a: zeroScores(),
      avg_scores_b: zeroScores(),
      avg_confidence: 0,
    };
  }

  const dimensions: (keyof DimensionScores)[] = [
    "accuracy",
    "reasoning_quality",
    "completeness",
    "clarity",
    "overall",
  ];

  // Running totals, all filled in a single walk over the results.
  let wins_a = 0;
  let wins_b = 0;
  let ties = 0;
  let confidenceSum = 0;
  const sumA = zeroScores();
  const sumB = zeroScores();

  for (const result of results) {
    switch (result.winner) {
      case "A":
        wins_a += 1;
        break;
      case "B":
        wins_b += 1;
        break;
      case "tie":
        ties += 1;
        break;
    }

    confidenceSum = confidenceSum + result.confidence;
    for (const dim of dimensions) {
      sumA[dim] = sumA[dim] + result.scores.A[dim];
      sumB[dim] = sumB[dim] + result.scores.B[dim];
    }
  }

  // Convert the totals into means in place.
  for (const dim of dimensions) {
    sumA[dim] = sumA[dim] / total;
    sumB[dim] = sumB[dim] / total;
  }

  return {
    total,
    wins_a,
    wins_b,
    ties,
    win_rate_a: wins_a / total,
    win_rate_b: wins_b / total,
    avg_scores_a: sumA,
    avg_scores_b: sumB,
    avg_confidence: confidenceSum / total,
  };
}
|