verifiable-thinking-mcp 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +339 -0
- package/package.json +75 -0
- package/src/index.ts +38 -0
- package/src/lib/cache.ts +246 -0
- package/src/lib/compression.ts +804 -0
- package/src/lib/compute/cache.ts +86 -0
- package/src/lib/compute/classifier.ts +555 -0
- package/src/lib/compute/confidence.ts +79 -0
- package/src/lib/compute/context.ts +154 -0
- package/src/lib/compute/extract.ts +200 -0
- package/src/lib/compute/filter.ts +224 -0
- package/src/lib/compute/index.ts +171 -0
- package/src/lib/compute/math.ts +247 -0
- package/src/lib/compute/patterns.ts +564 -0
- package/src/lib/compute/registry.ts +145 -0
- package/src/lib/compute/solvers/arithmetic.ts +65 -0
- package/src/lib/compute/solvers/calculus.ts +249 -0
- package/src/lib/compute/solvers/derivation-core.ts +371 -0
- package/src/lib/compute/solvers/derivation-latex.ts +160 -0
- package/src/lib/compute/solvers/derivation-mistakes.ts +1046 -0
- package/src/lib/compute/solvers/derivation-simplify.ts +451 -0
- package/src/lib/compute/solvers/derivation-transform.ts +620 -0
- package/src/lib/compute/solvers/derivation.ts +67 -0
- package/src/lib/compute/solvers/facts.ts +120 -0
- package/src/lib/compute/solvers/formula.ts +728 -0
- package/src/lib/compute/solvers/index.ts +36 -0
- package/src/lib/compute/solvers/logic.ts +422 -0
- package/src/lib/compute/solvers/probability.ts +307 -0
- package/src/lib/compute/solvers/statistics.ts +262 -0
- package/src/lib/compute/solvers/word-problems.ts +408 -0
- package/src/lib/compute/types.ts +107 -0
- package/src/lib/concepts.ts +111 -0
- package/src/lib/domain.ts +731 -0
- package/src/lib/extraction.ts +912 -0
- package/src/lib/index.ts +122 -0
- package/src/lib/judge.ts +260 -0
- package/src/lib/math/ast.ts +842 -0
- package/src/lib/math/index.ts +8 -0
- package/src/lib/math/operators.ts +171 -0
- package/src/lib/math/tokenizer.ts +477 -0
- package/src/lib/patterns.ts +200 -0
- package/src/lib/session.ts +825 -0
- package/src/lib/think/challenge.ts +323 -0
- package/src/lib/think/complexity.ts +504 -0
- package/src/lib/think/confidence-drift.ts +507 -0
- package/src/lib/think/consistency.ts +347 -0
- package/src/lib/think/guidance.ts +188 -0
- package/src/lib/think/helpers.ts +568 -0
- package/src/lib/think/hypothesis.ts +216 -0
- package/src/lib/think/index.ts +127 -0
- package/src/lib/think/prompts.ts +262 -0
- package/src/lib/think/route.ts +358 -0
- package/src/lib/think/schema.ts +98 -0
- package/src/lib/think/scratchpad-schema.ts +662 -0
- package/src/lib/think/spot-check.ts +961 -0
- package/src/lib/think/types.ts +93 -0
- package/src/lib/think/verification.ts +260 -0
- package/src/lib/tokens.ts +177 -0
- package/src/lib/verification.ts +620 -0
- package/src/prompts/index.ts +10 -0
- package/src/prompts/templates.ts +336 -0
- package/src/resources/index.ts +8 -0
- package/src/resources/sessions.ts +196 -0
- package/src/tools/compress.ts +138 -0
- package/src/tools/index.ts +5 -0
- package/src/tools/scratchpad.ts +2659 -0
- package/src/tools/sessions.ts +144 -0
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Challenge Operation - Adversarial Self-Check for Reasoning Quality
|
|
3
|
+
*
|
|
4
|
+
* Generates counterarguments to combat confirmation bias by:
|
|
5
|
+
* - Inverting key assumptions
|
|
6
|
+
* - Finding edge cases
|
|
7
|
+
* - Verifying premises were established
|
|
8
|
+
* - Generating steelman counterarguments
|
|
9
|
+
*
|
|
10
|
+
* O(n) complexity using regex-based claim extraction.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/**
 * Types of challenges that can be generated.
 *
 * - `assumption_inversion`: questions an absolute qualifier found in a claim
 * - `edge_case`: asks whether a claim holds at numeric boundary values
 * - `premise_check`: flags an "if P then Q" whose premise P looks unestablished
 * - `steelman_counter`: asks for the strongest argument against a claim
 */
export type ChallengeType =
  | "assumption_inversion"
  | "edge_case"
  | "premise_check"
  | "steelman_counter";
|
|
19
|
+
|
|
20
|
+
/** A single generated challenge (counterargument) to a claim. */
export interface Challenge {
  /** Which kind of challenge this is. */
  type: ChallengeType;
  /** The original claim being challenged, as text. */
  original_claim: string;
  /** The challenge/counterargument, phrased as a prompt for the reasoner. */
  challenge: string;
  /** How serious this challenge is; used to weight the robustness penalty. */
  severity: "low" | "medium" | "high";
  /** Suggested way to address this challenge. */
  suggested_response: string;
}
|
|
33
|
+
|
|
34
|
+
/** Result of running the challenge operation over a reasoning chain. */
export interface ChallengeResult {
  /** Number of challenges generated (length of `challenges`). */
  challenges_generated: number;
  /** The de-duplicated challenges. */
  challenges: Challenge[];
  /** Overall robustness score in the range 0-1 (1 = no challenges found). */
  overall_robustness: number;
  /** Human-readable summary of the findings. */
  summary: string;
}
|
|
45
|
+
|
|
46
|
+
// Patterns to extract claims/conclusions from text.
//
// Each pattern is global + case-insensitive and exposes the claim text as
// capture group 1. Callers must reset `lastIndex` before reusing a pattern
// because global regexes keep their scan position between `exec` calls.
//
// The leading `\b` on the connective alternations is deliberate: without it
// short connectives match inside longer words ("so" in "also", "hence" in
// "whence"), which produced claims that were never stated as conclusions.
const CLAIM_PATTERNS = [
  // "therefore X." / "thus X." / "so X."
  /\b(?:therefore|thus|hence|consequently|so)\s+(.{10,100}?)(?:\.|$)/gi,
  // "we conclude (that) X." / "this shows (that) X."
  /\b(?:we conclude|this means|this shows|this proves)\s+(?:that\s+)?(.{10,100}?)(?:\.|$)/gi,
  // "it follows that X."
  /\b(?:it follows that|it must be that)\s+(.{10,100}?)(?:\.|$)/gi,
  // "X is true." / "X are invalid." — the subject is the captured claim
  /(.{5,50})\s+(?:is|are)\s+(?:true|false|correct|incorrect|valid|invalid)(?:\.|$)/gi,
  // "the answer is X."
  /\b(?:the answer is|the result is|the solution is)\s+(.{5,100}?)(?:\.|$)/gi,
];
|
|
54
|
+
|
|
55
|
+
// Pattern for conditional statements (if P then Q).
// Group 1 is the premise P, group 2 the conclusion Q. Global + case-insensitive,
// so callers must reset `lastIndex` before each scan.
// `\b` guards keep "if"/"then" from matching inside other words
// (e.g. the "then" in "strengthen").
const CONDITIONAL_PATTERN = /\bif\s+(.{5,80}?)(?:,\s*)?\bthen\s+(.{5,80}?)(?:\.|,|$)/gi;
|
|
57
|
+
|
|
58
|
+
// Assumption words to invert.
// Maps an absolute qualifier (key, lowercase) to the softened wording that is
// offered as an alternative in assumption-inversion challenges (value).
const ASSUMPTION_INVERSIONS: Record<string, string> = {
  always: "sometimes not",
  never: "sometimes",
  all: "some",
  none: "some",
  every: "some",
  must: "might not",
  cannot: "might",
  impossible: "possible",
  certain: "uncertain",
  definitely: "possibly not",
  obviously: "not necessarily",
  clearly: "arguably",
};
|
|
73
|
+
|
|
74
|
+
// Numeric pattern for edge case detection.
// Matches unsigned integers and decimals ("12", "3.14") delimited by word
// boundaries; group 1 is the number text. The pattern is global, so callers
// must reset `lastIndex` before scanning a new string.
const NUMERIC_PATTERN = /\b(\d+(?:\.\d+)?)\b/g;
|
|
76
|
+
|
|
77
|
+
/**
 * Collect claim-like phrases from free text.
 *
 * Every pattern in CLAIM_PATTERNS is scanned over the whole text and the
 * trimmed capture group 1 is kept when it is longer than 10 characters and has
 * not been seen before (case-insensitive comparison). Results are ordered by
 * pattern first, then by position within the text.
 *
 * @param text - Text to scan
 * @returns Unique claims in discovery order, with their original casing
 */
function extractClaims(text: string): string[] {
  const found: string[] = [];
  const seenKeys = new Set<string>();

  for (const pattern of CLAIM_PATTERNS) {
    // Global regexes remember their position; always start from the beginning.
    pattern.lastIndex = 0;

    for (let hit = pattern.exec(text); hit !== null; hit = pattern.exec(text)) {
      const candidate = hit[1]?.trim();
      if (!candidate || candidate.length <= 10) continue;

      const key = candidate.toLowerCase();
      if (seenKeys.has(key)) continue;

      seenKeys.add(key);
      found.push(candidate);
    }
  }

  return found;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Collect "if P then Q" statements from free text.
 *
 * Scans the text with CONDITIONAL_PATTERN and records each match whose
 * premise and conclusion are both non-empty after trimming.
 *
 * @param text - Text to scan
 * @returns Premise/conclusion pairs in order of appearance
 */
function extractConditionals(text: string): Array<{ premise: string; conclusion: string }> {
  const pairs: Array<{ premise: string; conclusion: string }> = [];

  // The pattern is global, so rewind it before scanning.
  CONDITIONAL_PATTERN.lastIndex = 0;

  for (
    let hit = CONDITIONAL_PATTERN.exec(text);
    hit !== null;
    hit = CONDITIONAL_PATTERN.exec(text)
  ) {
    const premise = hit[1]?.trim();
    const conclusion = hit[2]?.trim();
    if (!premise || !conclusion) continue;
    pairs.push({ premise, conclusion });
  }

  return pairs;
}
|
|
117
|
+
|
|
118
|
+
/**
 * Generate assumption inversion challenges.
 *
 * Emits one medium-severity challenge for every absolute qualifier from
 * ASSUMPTION_INVERSIONS that occurs in the claim as a whole word
 * (case-insensitive), in the table's key order.
 *
 * Matching is done on whole words rather than substrings: a substring test
 * flags "small" and "usually" for "all", and "uncertain" for "certain", which
 * accuses already-hedged claims of assuming absolute certainty.
 *
 * @param claim - The claim text to inspect
 * @returns One challenge per qualifier found; empty when none are present
 */
function generateAssumptionInversions(claim: string): Challenge[] {
  const challenges: Challenge[] = [];

  // Tokenise once so each table lookup is a constant-time whole-word check.
  // The table keys are single lowercase ASCII words, so letter runs suffice.
  const claimWords = new Set(claim.toLowerCase().match(/[a-z]+/g) ?? []);

  for (const [word, inversion] of Object.entries(ASSUMPTION_INVERSIONS)) {
    if (!claimWords.has(word)) continue;

    challenges.push({
      type: "assumption_inversion",
      original_claim: claim,
      challenge: `What if "${word}" should be "${inversion}"? The claim assumes absolute certainty.`,
      severity: "medium",
      suggested_response: `Verify the "${word}" claim with evidence or soften to "${inversion}".`,
    });
  }

  return challenges;
}
|
|
139
|
+
|
|
140
|
+
/**
 * Generate edge case challenges from numeric values.
 *
 * Looks at the first number in the claim (as matched by NUMERIC_PATTERN) and
 * proposes boundary values around it: 0, -1, 1, n-1, n+1 and, for positive n,
 * -n. At most one low-severity challenge is produced per claim.
 *
 * The boundary list is de-duplicated and n±1 is rounded to the precision of
 * the number as written, so the suggestion never repeats a value (for "1" the
 * raw list is 0, -1, 1, 0, 2, -1) and never shows floating-point noise
 * (2.3 - 1 would otherwise print as 1.2999999999999998).
 *
 * @param claim - The claim text to inspect
 * @returns A single-element array, or an empty array when the claim has no number
 */
function generateEdgeCases(claim: string): Challenge[] {
  // The pattern is global, so rewind it before scanning.
  NUMERIC_PATTERN.lastIndex = 0;
  const literal = NUMERIC_PATTERN.exec(claim)?.[1];
  if (!literal) return [];

  const num = parseFloat(literal);
  if (Number.isNaN(num)) return [];

  // Keep as many decimals as the literal had; toFixed accepts at most 100.
  const decimals = Math.min(literal.split(".")[1]?.length ?? 0, 100);
  const tidy = (value: number): number => Number(value.toFixed(decimals));

  const candidates = [0, -1, 1, tidy(num - 1), tidy(num + 1)];
  if (num > 0) candidates.push(-num);

  // Set preserves insertion order, so the canonical 0, -1, 1 stay first.
  const edgeCases = [...new Set(candidates)];

  return [
    {
      type: "edge_case",
      original_claim: claim,
      challenge: `Does the claim hold for edge cases: ${edgeCases.slice(0, 3).join(", ")}?`,
      severity: "low",
      suggested_response: `Test the claim with boundary values: ${edgeCases.join(", ")}.`,
    },
  ];
}
|
|
170
|
+
|
|
171
|
+
/**
 * Generate premise check challenges from conditionals.
 *
 * For each "if P then Q", looks for evidence that P was established somewhere
 * in the text other than inside the conditional itself. A high-severity
 * challenge is emitted when fewer than half of the premise's significant words
 * (longer than 3 characters) appear in that remaining text. When the premise
 * has no significant words (e.g. "x > 0"), the whole premise phrase must
 * appear elsewhere instead.
 *
 * The "if P" spans are removed before measuring support because the premises
 * are extracted from this same text: searching the unmodified text always
 * finds every premise word, so the check could never fire for ordinary
 * premises.
 *
 * @param conditionals - Premise/conclusion pairs extracted from the text
 * @param allText - The full reasoning text the conditionals came from
 * @returns One challenge per conditional whose premise looks unestablished
 */
function generatePremiseChecks(
  conditionals: Array<{ premise: string; conclusion: string }>,
  allText: string,
): Challenge[] {
  const challenges: Challenge[] = [];
  const lowerText = allText.toLowerCase();

  for (const { premise, conclusion } of conditionals) {
    const lowerPremise = premise.toLowerCase();

    // Drop every "if <premise>" occurrence so the conditional cannot vouch for
    // its own premise; a plain restatement of the premise elsewhere is kept.
    const escapedPremise = lowerPremise.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const supportingText = lowerText.replace(new RegExp(`if\\s+${escapedPremise}`, "g"), " ");

    const premiseWords = lowerPremise.split(/\s+/).filter((w) => w.length > 3);

    let established: boolean;
    if (premiseWords.length > 0) {
      const covered = premiseWords.filter((w) => supportingText.includes(w)).length;
      established = covered / premiseWords.length >= 0.5;
    } else {
      // Only short tokens (symbols, single letters): require the exact phrase.
      established = lowerPremise.length > 0 && supportingText.includes(lowerPremise);
    }

    if (!established) {
      challenges.push({
        type: "premise_check",
        original_claim: `If ${premise} then ${conclusion}`,
        challenge: `The premise "${premise}" was not clearly established before concluding "${conclusion}".`,
        severity: "high",
        suggested_response: `Add a step that explicitly establishes: "${premise}".`,
      });
    }
  }

  return challenges;
}
|
|
203
|
+
|
|
204
|
+
/**
 * Build the steelman challenge for a claim.
 *
 * The challenge text quotes at most the first 50 characters of the claim and
 * appends "..." when the claim was cut short; `original_claim` always carries
 * the full text.
 *
 * @param claim - The claim to argue against
 * @returns A medium-severity `steelman_counter` challenge
 */
function generateSteelmanCounter(claim: string): Challenge {
  const preview = claim.slice(0, 50);
  const ellipsis = claim.length > 50 ? "..." : "";

  const result: Challenge = {
    type: "steelman_counter",
    original_claim: claim,
    challenge: `Steel-man opposing view: What's the strongest argument AGAINST "${preview}${ellipsis}"?`,
    severity: "medium",
    suggested_response: "Address the strongest possible counterargument before finalizing.",
  };
  return result;
}
|
|
216
|
+
|
|
217
|
+
/**
 * Score how well the reasoning withstands the generated challenges.
 *
 * Each challenge subtracts a penalty by severity (low 0.1, medium 0.25,
 * high 0.5); the summed penalty is capped at 1, so the score stays in [0, 1].
 *
 * @param challenges - Challenges raised against the reasoning
 * @returns 1.0 when there are no challenges, otherwise 1 minus the capped penalty
 */
function calculateRobustness(challenges: Challenge[]): number {
  if (challenges.length === 0) {
    return 1.0;
  }

  const penaltyBySeverity = { low: 0.1, medium: 0.25, high: 0.5 };

  let penalty = 0;
  for (const item of challenges) {
    penalty += penaltyBySeverity[item.severity];
  }

  // More (or more severe) challenges push the score towards zero.
  const cappedPenalty = Math.min(penalty, 1);
  return Math.max(0, 1 - cappedPenalty);
}
|
|
229
|
+
|
|
230
|
+
/**
 * Run adversarial challenge on reasoning steps.
 *
 * Joins all step thoughts into one text, extracts claims (or uses
 * `targetClaim` when given) and conditionals, then generates:
 * - assumption-inversion and edge-case challenges for the first 5 claims,
 * - one steelman counter for the last extracted claim,
 * - premise checks for every conditional.
 * Challenges with identical text (case-insensitive) are reported once.
 *
 * The steelman is generated outside the per-claim loop: when more than 5
 * claims are extracted, the last claim lies beyond the analysed slice, and an
 * in-loop check for it would never add a steelman.
 *
 * @param steps - Array of reasoning step texts
 * @param targetClaim - Optional specific claim to challenge instead of extracted ones
 * @returns Challenge result with generated counterarguments
 */
export function challenge(
  steps: Array<{ step: number; thought: string }>,
  targetClaim?: string,
): ChallengeResult {
  if (steps.length === 0) {
    return {
      challenges_generated: 0,
      challenges: [],
      overall_robustness: 1.0,
      summary: "No steps to challenge.",
    };
  }

  // Upper bound on claims that receive per-claim challenges, to keep output small.
  const maxClaimsAnalysed = 5;

  // Combine all text for analysis
  const allText = steps.map((s) => s.thought).join(" ");

  // Extract claims and conditionals
  const claims = targetClaim ? [targetClaim] : extractClaims(allText);
  const conditionals = extractConditionals(allText);

  const challenges: Challenge[] = [];

  // Per-claim challenges for the first few claims
  for (const claim of claims.slice(0, maxClaimsAnalysed)) {
    challenges.push(...generateAssumptionInversions(claim));
    challenges.push(...generateEdgeCases(claim));
  }

  // One steelman counter for the last extracted claim, whether or not it was
  // among the claims analysed above.
  const latestClaim = claims[claims.length - 1];
  if (latestClaim !== undefined) {
    challenges.push(generateSteelmanCounter(latestClaim));
  }

  // Premise checks from conditionals
  challenges.push(...generatePremiseChecks(conditionals, allText));

  // Dedupe by challenge text
  const seen = new Set<string>();
  const uniqueChallenges = challenges.filter((c) => {
    const key = c.challenge.toLowerCase();
    if (seen.has(key)) return false;
    seen.add(key);
    return true;
  });

  const robustness = calculateRobustness(uniqueChallenges);

  // Summary: high-severity findings take precedence over the generic count.
  const highCount = uniqueChallenges.filter((c) => c.severity === "high").length;
  const summary =
    uniqueChallenges.length === 0
      ? "No significant challenges found. Reasoning appears robust."
      : highCount > 0
        ? `⚠️ Found ${highCount} high-severity challenge(s). Address before finalizing.`
        : `Found ${uniqueChallenges.length} challenge(s). Robustness: ${(robustness * 100).toFixed(0)}%`;

  return {
    challenges_generated: uniqueChallenges.length,
    challenges: uniqueChallenges,
    overall_robustness: robustness,
    summary,
  };
}
|
|
302
|
+
|
|
303
|
+
/**
 * Decide whether a reasoning chain should be challenged automatically.
 *
 * Fires in two situations:
 * - confidence above 0.95, regardless of anything else;
 * - confidence above 0.9 on a chain of fewer than 3 steps with no verification.
 *
 * @param chainConfidence - Confidence of the reasoning chain (compared against 0.9 / 0.95)
 * @param stepCount - Number of reasoning steps so far
 * @param hasVerification - Whether any verification has been performed
 * @returns true when a challenge should be triggered
 */
export function shouldChallenge(
  chainConfidence: number,
  stepCount: number,
  hasVerification: boolean,
): boolean {
  const extremelyConfident = chainConfidence > 0.95;
  const shortAndUnverified = stepCount < 3 && !hasVerification;
  const overconfident = chainConfidence > 0.9 && shortAndUnverified;

  return overconfident || extremelyConfident;
}
|