verifiable-thinking-mcp 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +339 -0
  3. package/package.json +75 -0
  4. package/src/index.ts +38 -0
  5. package/src/lib/cache.ts +246 -0
  6. package/src/lib/compression.ts +804 -0
  7. package/src/lib/compute/cache.ts +86 -0
  8. package/src/lib/compute/classifier.ts +555 -0
  9. package/src/lib/compute/confidence.ts +79 -0
  10. package/src/lib/compute/context.ts +154 -0
  11. package/src/lib/compute/extract.ts +200 -0
  12. package/src/lib/compute/filter.ts +224 -0
  13. package/src/lib/compute/index.ts +171 -0
  14. package/src/lib/compute/math.ts +247 -0
  15. package/src/lib/compute/patterns.ts +564 -0
  16. package/src/lib/compute/registry.ts +145 -0
  17. package/src/lib/compute/solvers/arithmetic.ts +65 -0
  18. package/src/lib/compute/solvers/calculus.ts +249 -0
  19. package/src/lib/compute/solvers/derivation-core.ts +371 -0
  20. package/src/lib/compute/solvers/derivation-latex.ts +160 -0
  21. package/src/lib/compute/solvers/derivation-mistakes.ts +1046 -0
  22. package/src/lib/compute/solvers/derivation-simplify.ts +451 -0
  23. package/src/lib/compute/solvers/derivation-transform.ts +620 -0
  24. package/src/lib/compute/solvers/derivation.ts +67 -0
  25. package/src/lib/compute/solvers/facts.ts +120 -0
  26. package/src/lib/compute/solvers/formula.ts +728 -0
  27. package/src/lib/compute/solvers/index.ts +36 -0
  28. package/src/lib/compute/solvers/logic.ts +422 -0
  29. package/src/lib/compute/solvers/probability.ts +307 -0
  30. package/src/lib/compute/solvers/statistics.ts +262 -0
  31. package/src/lib/compute/solvers/word-problems.ts +408 -0
  32. package/src/lib/compute/types.ts +107 -0
  33. package/src/lib/concepts.ts +111 -0
  34. package/src/lib/domain.ts +731 -0
  35. package/src/lib/extraction.ts +912 -0
  36. package/src/lib/index.ts +122 -0
  37. package/src/lib/judge.ts +260 -0
  38. package/src/lib/math/ast.ts +842 -0
  39. package/src/lib/math/index.ts +8 -0
  40. package/src/lib/math/operators.ts +171 -0
  41. package/src/lib/math/tokenizer.ts +477 -0
  42. package/src/lib/patterns.ts +200 -0
  43. package/src/lib/session.ts +825 -0
  44. package/src/lib/think/challenge.ts +323 -0
  45. package/src/lib/think/complexity.ts +504 -0
  46. package/src/lib/think/confidence-drift.ts +507 -0
  47. package/src/lib/think/consistency.ts +347 -0
  48. package/src/lib/think/guidance.ts +188 -0
  49. package/src/lib/think/helpers.ts +568 -0
  50. package/src/lib/think/hypothesis.ts +216 -0
  51. package/src/lib/think/index.ts +127 -0
  52. package/src/lib/think/prompts.ts +262 -0
  53. package/src/lib/think/route.ts +358 -0
  54. package/src/lib/think/schema.ts +98 -0
  55. package/src/lib/think/scratchpad-schema.ts +662 -0
  56. package/src/lib/think/spot-check.ts +961 -0
  57. package/src/lib/think/types.ts +93 -0
  58. package/src/lib/think/verification.ts +260 -0
  59. package/src/lib/tokens.ts +177 -0
  60. package/src/lib/verification.ts +620 -0
  61. package/src/prompts/index.ts +10 -0
  62. package/src/prompts/templates.ts +336 -0
  63. package/src/resources/index.ts +8 -0
  64. package/src/resources/sessions.ts +196 -0
  65. package/src/tools/compress.ts +138 -0
  66. package/src/tools/index.ts +5 -0
  67. package/src/tools/scratchpad.ts +2659 -0
  68. package/src/tools/sessions.ts +144 -0
@@ -0,0 +1,2659 @@
1
+ /**
2
+ * Scratchpad Tool - Unified CRASH-style reasoning with operation-based dispatch
3
+ *
4
+ * Features:
5
+ * - Auto step increment (no manual step_number needed)
6
+ * - Confidence tracking (average across chain)
7
+ * - Threshold detection with 5-second warning
8
+ * - Navigate operation for viewing history/branches/paths
9
+ * - Branch and revise operations
10
+ * - Auto-suggest next simplification step for math derivations
11
+ * - Proactive stepping guidance based on question complexity
12
+ */
13
+
14
+ import type { Context } from "fastmcp";
15
+ import { compress, needsCompression } from "../lib/compression.ts";
16
+ import { contextAwareCompute } from "../lib/compute/context.ts";
17
+ import {
18
+ type DetectedMistake,
19
+ detectCommonMistakesFromText,
20
+ isLikelyComputable,
21
+ type SimplificationStep,
22
+ suggestNextStepFromText,
23
+ suggestSimplificationPath,
24
+ tryLocalCompute,
25
+ } from "../lib/compute/index.ts";
26
+ import { stripMarkdown } from "../lib/extraction.ts";
27
+ import { SessionManager, type ThoughtRecord } from "../lib/session.ts";
28
+ import { challenge, shouldChallenge } from "../lib/think/challenge.ts";
29
+ import { assessPromptComplexity } from "../lib/think/complexity.ts";
30
+ import { analyzeConfidenceDrift } from "../lib/think/confidence-drift.ts";
31
+ import { checkStepConsistency } from "../lib/think/consistency.ts";
32
+ import { detectDomain } from "../lib/think/guidance.ts";
33
+ import { analyzeStepForResolution } from "../lib/think/hypothesis.ts";
34
+ import {
35
+ type ScratchpadArgs,
36
+ type ScratchpadResponse,
37
+ ScratchpadSchema,
38
+ } from "../lib/think/scratchpad-schema.ts";
39
+ import { primeQuestion, spotCheck } from "../lib/think/spot-check.ts";
40
+ import { calculateTokenUsage, getSessionTokens, trackSessionTokens } from "../lib/tokens.ts";
41
+ import { verify } from "../lib/verification.ts";
42
+
43
+ type MCPContext = Context<Record<string, unknown> | undefined>;
44
+
45
+ // ============================================================================
46
+ // CONSTANTS
47
+ // ============================================================================
48
+
49
/**
 * Question-length cutoff (in characters) used to pick maxCombined for trap priming:
 * questions shorter than this get maxCombined=2, all others get maxCombined=1.
 *
 * Chosen empirically: every multi-trap question in the benchmark is ≥195 chars,
 * so a cutoff of 190 keeps all of them on the conservative setting (maxCombined=1).
 */
const ADAPTIVE_PRIMING_THRESHOLD = 190;

/**
 * Upper bound on question length accepted for trap priming (security + performance).
 * Guards against memory exhaustion and ReDoS attacks on the regex patterns.
 * 10k chars ≈ 2.5k tokens, which is enough for any reasonable question.
 */
const MAX_QUESTION_LENGTH = 10_000;
64
+
65
+ // ============================================================================
66
+ // STEPPING GUIDANCE
67
+ // ============================================================================
68
+
69
/**
 * Recommended minimum number of reasoning steps for a complexity tier.
 *
 * @param tier - Complexity tier label.
 * @returns 1, 2, 4, 6 or 8 for Low through Almost Impossible respectively.
 */
function getRecommendedSteps(
  tier: "Low" | "Moderate" | "High" | "Very Hard" | "Almost Impossible",
): number {
  const minimumStepsByTier: Record<typeof tier, number> = {
    Low: 1,
    Moderate: 2,
    High: 4,
    "Very Hard": 6,
    "Almost Impossible": 8,
  };
  return minimumStepsByTier[tier];
}
86
+
87
+ // ============================================================================
88
+ // CONFIDENCE TRACKING
89
+ // ============================================================================
90
+
91
/** Confidence snapshot for the step being processed and for the chain as a whole. */
interface ConfidenceState {
  /** Confidence supplied for the current step, or undefined when none was given. */
  stepConfidence: undefined | number;
  /** Aggregate confidence for the chain. */
  chainConfidence: number;
  /** How many confidence values contributed to the aggregate. */
  stepsWithConfidence: number;
}
96
+
97
/**
 * Compute the chain confidence for a branch as the arithmetic mean of every
 * verification confidence already recorded on its thoughts, plus the confidence
 * of the step currently being processed when one is supplied.
 *
 * @param sessionId - Session whose thoughts are read.
 * @param branchId - Branch within that session.
 * @param newConfidence - Optional confidence for the current step.
 * @returns The step confidence, the mean (0 when there are no samples) and the sample count.
 */
function calculateConfidence(
  sessionId: string,
  branchId: string,
  newConfidence?: number,
): ConfidenceState {
  const samples: number[] = [];

  // Thoughts without a verification confidence do not contribute to the mean.
  for (const { verification } of SessionManager.getThoughts(sessionId, branchId)) {
    const recorded = verification?.confidence;
    if (recorded !== undefined) {
      samples.push(recorded);
    }
  }
  if (newConfidence !== undefined) {
    samples.push(newConfidence);
  }

  let total = 0;
  for (const sample of samples) {
    total += sample;
  }

  return {
    stepConfidence: newConfidence,
    chainConfidence: samples.length > 0 ? total / samples.length : 0,
    stepsWithConfidence: samples.length,
  };
}
127
+
128
/**
 * Derive the chain status from the current confidence.
 *
 * Precedence: an explicitly completed chain wins; otherwise reaching the
 * threshold yields "threshold_reached", reaching 80% of the threshold yields
 * "review", and anything lower yields "continue".
 */
function determineStatus(
  chainConfidence: number,
  threshold: number,
  isComplete: boolean,
): ScratchpadResponse["status"] {
  return isComplete
    ? "complete"
    : chainConfidence >= threshold
      ? "threshold_reached"
      : chainConfidence >= threshold * 0.8 // within 20% of the threshold
        ? "review"
        : "continue";
}
139
+
140
/**
 * Human-readable next action for a chain status.
 *
 * @param status - Current status of the reasoning chain.
 * @param chainConfidence - Chain confidence as a fraction; rendered as a whole percent.
 */
function getSuggestedAction(status: ScratchpadResponse["status"], chainConfidence: number): string {
  const percent = (chainConfidence * 100).toFixed(0);

  const actionByStatus: Record<ScratchpadResponse["status"], string> = {
    complete: "Reasoning chain complete.",
    threshold_reached: `Confidence ${percent}% reached threshold. Consider completing or add one more verification step.`,
    review: `Confidence ${percent}% approaching threshold. Review recent steps for completeness.`,
    continue: `Continue reasoning. Chain confidence: ${percent}%`,
    verification_failed: "Verification failed. Use revise, branch, or override to continue.",
    budget_exhausted: "Token budget exhausted. Complete your reasoning or start a new session.",
  };

  return actionByStatus[status];
}
157
+
158
/**
 * Step-level confidence-drift (CDD) analysis for a branch.
 *
 * Yields nothing when the branch has fewer than 3 thoughts or when the analysis
 * reports the "insufficient" pattern. For every other pattern the drift metrics
 * are returned so clients can display the trajectory; a warning is streamed only
 * when the analysis is flagged as unresolved.
 */
async function runStepLevelCDD(
  sessionId: string,
  branchId: string,
  streamContent: MCPContext["streamContent"],
): Promise<ScratchpadResponse["confidence_drift"] | undefined> {
  const history = SessionManager.getThoughts(sessionId, branchId);
  if (history.length < 3) return undefined;

  const {
    drift_score,
    unresolved,
    min_confidence,
    min_step,
    max_drop,
    recovery,
    has_revision_after_drop,
    pattern,
    explanation,
    suggestion,
  } = analyzeConfidenceDrift(history);

  if (pattern === "insufficient") return undefined;

  // Only concerning (unresolved) drift is surfaced to the stream.
  if (unresolved) {
    const tip = suggestion ? ` 💡 ${suggestion}\n` : "";
    await streamContent({
      type: "text",
      text: `\n⚠️ **Early Drift Warning:** ${explanation}\n` + tip,
    });
  }

  return {
    drift_score,
    unresolved,
    min_confidence,
    min_step,
    max_drop,
    recovery,
    has_revision_after_drop,
    pattern,
    explanation,
    suggestion,
  };
}
206
+
207
/**
 * Adaptive spot-check: automatically runs a spot-check when CDD reports
 * unresolved drift, so trap patterns can be flagged before the chain completes.
 *
 * Runs only when all of the following hold:
 * 1. `cddResult.unresolved` is truthy
 * 2. the session has a stored question
 * 3. the current thought contains something that looks like an answer
 *
 * Answer extraction is word-boundary aware (words such as "also", "reasoning"
 * or "assume" are not mistaken for the markers "so" / "sum") and keeps decimal
 * fractions intact ("answer is 0.05" yields "0.05" rather than "0").
 *
 * @param sessionId - Session whose stored question is checked against.
 * @param thought - Text of the current step.
 * @param cddResult - Drift analysis for the current step, if any.
 * @param streamContent - Sink for the streamed warning.
 * @returns Details of the failed spot-check, or undefined when the check was
 *          not triggered, no answer was found, or the spot-check passed.
 */
async function runAdaptiveSpotCheck(
  sessionId: string,
  thought: string,
  cddResult: ScratchpadResponse["confidence_drift"] | undefined,
  streamContent: MCPContext["streamContent"],
): Promise<ScratchpadResponse["spot_check_result"] | undefined> {
  // Only trigger if CDD detected unresolved drift
  if (!cddResult?.unresolved) {
    return undefined;
  }

  // Need a stored question to spot-check against
  const question = SessionManager.getQuestion(sessionId);
  if (!question) {
    return undefined;
  }

  // Answer indicators such as "answer is X", "therefore X", "= X" at end of
  // line, or a bold value at end of line. A captured token may contain a "."
  // only when a digit follows it, so decimals survive while a sentence-ending
  // period is still excluded.
  const answerPatterns = [
    /\b(?:answer|result|solution|total|sum|value|equals?)\s*(?:is\b|:|=)\s*([^\s,.]+(?:\.\d[^\s,.]*)*)/i,
    /\b(?:therefore|thus|so|hence)\b\s*[,:]?\s*([^\s,.]+(?:\.\d[^\s,.]*)*)/i,
    /=\s*([^\s,.=]+(?:\.\d[^\s,.=]*)*)\s*$/m,
    /\*\*([^*]+)\*\*\s*$/m, // Bold answer at end
  ];

  let potentialAnswer: string | undefined;
  for (const pattern of answerPatterns) {
    const match = thought.match(pattern);
    if (match?.[1]) {
      potentialAnswer = match[1].trim();
      break;
    }
  }

  // No answer found in thought
  if (!potentialAnswer) {
    return undefined;
  }

  const result = spotCheck(question, potentialAnswer);

  // Only report if spot-check failed (found a trap)
  if (result.passed) {
    return undefined;
  }

  await streamContent({
    type: "text",
    text:
      `\n🔍 **Adaptive Spot-Check** (triggered by ${cddResult.pattern} drift)\n` +
      ` ⚠️ ${result.trapType}: ${result.warning}\n` +
      (result.hint ? ` 💡 ${result.hint}\n` : ""),
  });

  return {
    passed: result.passed,
    trap_type: result.trapType,
    warning: result.warning,
    hint: result.hint,
    confidence: result.confidence,
  };
}
283
+
284
/**
 * Attach the optional sections of a step response in place: verification,
 * local compute, compression, token usage (always set), a compression nudge,
 * augmentation, trap analysis and next-step suggestion.
 *
 * @param response - Response object that is mutated.
 * @param params - Results gathered while processing the step.
 */
function enrichStepResponse(
  response: ScratchpadResponse,
  params: {
    verificationResult: { passed: boolean; confidence: number } | null;
    domain: string;
    computeResult: { solved: boolean; result?: string | number; method?: string } | null;
    compressionResult: ScratchpadResponse["compression"] | null;
    tokenUsage: { total: number };
    tokenBudget: number;
    budgetExceeded: boolean;
    autoCompressed: boolean;
    augmentationResult: ScratchpadResponse["augmentation"] | null;
    trapAnalysis: ScratchpadResponse["trap_analysis"] | undefined;
    nextStepSuggestion: ScratchpadResponse["next_step_suggestion"] | undefined;
  },
): void {
  const verification = params.verificationResult;
  if (verification) {
    response.verification = {
      passed: verification.passed,
      confidence: verification.confidence,
      domain: params.domain,
    };
  }

  // Local compute is reported only when it both solved and produced a value.
  const computed = params.computeResult;
  if (computed?.solved && computed.result !== undefined) {
    response.local_compute = {
      solved: true,
      result: computed.result,
      method: computed.method ?? "unknown",
    };
  }

  const compression = params.compressionResult;
  if (compression) {
    response.compression = compression;
  }

  // Token usage is always reported; a non-positive budget counts as 0% used.
  const usedTokens = params.tokenUsage.total;
  const budget = params.tokenBudget;
  const percentUsed = budget > 0 ? (usedTokens / budget) * 100 : 0;
  const roundedPercent = Math.round(percentUsed);
  response.token_usage = {
    total: usedTokens,
    budget,
    exceeded: params.budgetExceeded,
    auto_compressed: params.autoCompressed,
    budget_percent: roundedPercent,
  };

  // Nudge towards compression from 60% of budget, unless this step was already compressed.
  const alreadyCompressed = params.autoCompressed || Boolean(compression);
  if (percentUsed >= 60 && !alreadyCompressed) {
    const urgency = percentUsed >= 80 ? "⚠️ " : "";
    response.compression_suggestion = {
      should_compress: true,
      current_tokens: usedTokens,
      budget,
      percent_used: roundedPercent,
      nudge: `${urgency}Session at ${roundedPercent}% of token budget (${usedTokens}/${budget}). Use compress=true on next step to reduce context size.`,
    };
  }

  if (params.augmentationResult) {
    response.augmentation = params.augmentationResult;
  }

  // Trap analysis comes from priming on the first step.
  if (params.trapAnalysis) {
    response.trap_analysis = params.trapAnalysis;
  }

  if (params.nextStepSuggestion) {
    response.next_step_suggestion = params.nextStepSuggestion;
  }
}
378
+
379
+ // ============================================================================
380
+ // HELPER FUNCTIONS
381
+ // ============================================================================
382
+
383
/**
 * Assemble the response returned when a step fails verification, including the
 * three recovery options (revise, branch, override).
 *
 * The reported `current_step` is the step before the failed one, and chain
 * confidence is computed without the failed step's confidence.
 */
function buildVerificationFailureResponse(params: {
  sessionId: string;
  branchId: string;
  stepNumber: number;
  threshold: number;
  verificationResult: {
    passed: boolean;
    confidence: number;
    suggestions: string[];
    evidence: string;
  };
  detectedMistakes: DetectedMistake[];
  domain: string;
}): ScratchpadResponse {
  const { sessionId, branchId, stepNumber, detectedMistakes } = params;
  const { suggestions, evidence, confidence } = params.verificationResult;

  const chain = calculateConfidence(sessionId, branchId);
  const topSuggestion = suggestions[0];
  const topMistake = detectedMistakes[0];

  // Prefer a concrete detected mistake over the generic verifier suggestion.
  const reviseReason = topMistake
    ? `Fix ${topMistake.type}: ${topMistake.suggestion || topMistake.explanation}`
    : topSuggestion || "Fix verification issue";

  const mistakeSummaries =
    detectedMistakes.length > 0
      ? detectedMistakes.map((mistake) => ({
          type: mistake.type,
          description: mistake.explanation,
          fix: mistake.suggestion,
          corrected_step: mistake.suggestedFix,
        }))
      : undefined;

  return {
    session_id: sessionId,
    current_step: stepNumber - 1,
    branch: branchId,
    operation: "step",
    chain_confidence: chain.chainConfidence,
    confidence_threshold: params.threshold,
    steps_with_confidence: chain.stepsWithConfidence,
    status: "verification_failed",
    suggested_action: "Verification failed. Use revise, branch, or override to continue.",
    verification_failure: {
      issue: topSuggestion || "Verification failed",
      evidence,
      suggestions,
      confidence,
      domain: params.domain,
      detected_mistakes: mistakeSummaries,
      recovery_options: {
        revise: {
          target_step: stepNumber,
          suggested_reason: reviseReason,
        },
        branch: {
          from_step: Math.max(1, stepNumber - 1),
          suggested_name: `Alternative after failed step ${stepNumber}`,
        },
        override: {
          flag: "force_continue",
          warning:
            "Only use if you're certain the heuristic is wrong. The step will be stored as-is.",
        },
      },
    },
  };
}
461
+
462
/**
 * Stream a verification-failure notice: the issue, the evidence, any detected
 * algebraic mistakes, and the three recovery options.
 *
 * The branch option never points below step 1, matching the `from_step` value
 * in the structured failure response built for the same step.
 *
 * @param streamContent - Sink for the streamed notice.
 * @param verificationResult - Confidence, suggestions and evidence from the verifier.
 * @param detectedMistakes - Mistakes detected in the step (may be empty).
 * @param stepNumber - Number of the step that failed verification.
 */
async function streamVerificationFailure(
  streamContent: MCPContext["streamContent"],
  verificationResult: { confidence: number; suggestions: string[]; evidence: string },
  detectedMistakes: DetectedMistake[],
  stepNumber: number,
): Promise<void> {
  let mistakeText = "";
  if (detectedMistakes.length > 0) {
    mistakeText = "\n**Detected algebraic mistakes:**\n";
    for (const m of detectedMistakes) {
      mistakeText += `• **${m.type}**: ${m.explanation}\n`;
      // A concrete corrected step takes precedence over a textual suggestion.
      if (m.suggestedFix) {
        mistakeText += ` **Corrected:** \`${m.suggestedFix}\`\n`;
      } else if (m.suggestion) {
        mistakeText += ` Fix: ${m.suggestion}\n`;
      }
    }
  }

  const issue = verificationResult.suggestions[0] || "Verification failed";
  // Clamp so a failure on step 1 does not advertise branching from step 0.
  const branchFromStep = Math.max(1, stepNumber - 1);

  await streamContent({
    type: "text",
    text:
      `\n⚠️ **VERIFICATION FAILED** (${Math.round(verificationResult.confidence * 100)}% confidence)\n` +
      `**Issue:** ${issue}\n` +
      `**Evidence:** ${verificationResult.evidence}\n` +
      mistakeText +
      `\n**Recovery options:**\n` +
      `1. \`revise\` - Correct this step (target_step: ${stepNumber}, reason: "${verificationResult.suggestions[0] || "fix issue"}")\n` +
      `2. \`branch\` - Try alternative approach (from_step: ${branchFromStep})\n` +
      `3. \`override\` - Proceed anyway (acknowledge: true, failed_step: ${stepNumber})\n\n` +
      `**Suggested:** revise\n`,
  });
}
497
+
498
/**
 * Create the thought record for a step that failed verification.
 *
 * The record is marked `passed: false`. When compression was applied, the
 * input-byte saving is estimated as 4 bytes per token saved.
 */
function buildPendingRecord(params: {
  sessionId: string;
  branchId: string;
  stepNumber: number;
  thought: string;
  domain: string;
  verificationConfidence: number;
  compressionResult: { original_tokens: number; compressed_tokens: number } | null;
}): ThoughtRecord {
  const { sessionId, branchId, stepNumber } = params;

  let compression: ThoughtRecord["compression"];
  if (params.compressionResult) {
    const { original_tokens, compressed_tokens } = params.compressionResult;
    compression = {
      input_bytes_saved: (original_tokens - compressed_tokens) * 4,
      output_bytes_saved: 0,
      context_bytes_saved: 0,
      original_tokens,
      compressed_tokens,
    };
  }

  return {
    id: `${sessionId}:${branchId}:${stepNumber}`,
    step_number: stepNumber,
    thought: params.thought,
    timestamp: Date.now(),
    branch_id: branchId,
    verification: {
      passed: false,
      confidence: params.verificationConfidence,
      domain: params.domain,
    },
    compression,
  };
}
536
+
537
/**
 * Optionally augment a thought with locally computed results.
 *
 * @param thought - Thought text for the step.
 * @param context - Optional context, forwarded to the computation as `systemPrompt`.
 * @param shouldAugment - When false the thought is returned untouched.
 * @param streamContent - Sink for the augmentation notice.
 * @returns The (possibly augmented) thought and a summary, or `result: null`
 *          when augmentation was disabled or found no computations.
 */
async function applyAugmentation(
  thought: string,
  context: string | undefined,
  shouldAugment: boolean,
  streamContent: MCPContext["streamContent"],
): Promise<{
  thought: string;
  result: { applied: boolean; computations: number; filtered: number; domain: string } | null;
}> {
  if (shouldAugment) {
    const outcome = contextAwareCompute({ thought, systemPrompt: context });

    if (outcome.hasComputations) {
      const computationCount = outcome.computations.length;
      await streamContent({
        type: "text",
        text: `⚡ **Augmented** ${computationCount} computations (${outcome.domain})\n`,
      });

      return {
        thought: outcome.augmented,
        result: {
          applied: true,
          computations: computationCount,
          filtered: outcome.filteredCount,
          domain: outcome.domain,
        },
      };
    }
  }

  // Augmentation disabled, or nothing computable was found.
  return { thought, result: null };
}
571
+
572
/**
 * Compress a thought when requested, when the token budget is exceeded, or
 * when a thought longer than 500 characters is judged worth compressing.
 *
 * A budget overrun uses a tighter target ratio (0.4 instead of 0.6) and is
 * reported as auto-compression unless compression was also explicitly requested.
 *
 * @returns The (possibly compressed) thought, compression statistics or null,
 *          and whether the compression was forced by the budget guard.
 */
async function applyCompression(
  thought: string,
  args: { compress?: boolean; compression_query?: string; context?: string },
  budgetExceeded: boolean,
  streamContent: MCPContext["streamContent"],
): Promise<{
  thought: string;
  result: {
    applied: boolean;
    original_tokens: number;
    compressed_tokens: number;
    ratio: number;
  } | null;
  autoCompressed: boolean;
}> {
  const explicitlyRequested = Boolean(args.compress);

  // Without a request or a budget overrun, only long thoughts are evaluated.
  if (!explicitlyRequested && !budgetExceeded) {
    const worthCompressing = thought.length > 500 && needsCompression(thought).shouldCompress;
    if (!worthCompressing) {
      return { thought, result: null, autoCompressed: false };
    }
  }

  const relevanceQuery = args.compression_query || args.context || "";
  const output = compress(thought, relevanceQuery, {
    target_ratio: budgetExceeded ? 0.4 : 0.6,
  });
  const autoCompressed = budgetExceeded && !explicitlyRequested;

  const percent = (output.ratio * 100).toFixed(0);
  const budgetTag = autoCompressed ? " [budget guard]" : "";
  await streamContent({
    type: "text",
    text: `📦 **Compressed** ${output.original_tokens}→${output.compressed_tokens} tokens (${percent}%)${budgetTag}\n`,
  });

  return {
    thought: output.compressed,
    result: {
      applied: true,
      original_tokens: output.original_tokens,
      compressed_tokens: output.compressed_tokens,
      ratio: output.ratio,
    },
    autoCompressed,
  };
}
619
+
620
+ // ============================================================================
621
+ // HELPER FUNCTIONS
622
+ // ============================================================================
623
+
624
/**
 * Trap priming for the step operation.
 *
 * Over-long questions are rejected before anything is stored. Otherwise the
 * question is stored on the session (for a later spot-check) and, on step 1
 * only, trap detection is run against it.
 *
 * maxCombined adapts to question length:
 * - shorter than ADAPTIVE_PRIMING_THRESHOLD chars: maxCombined=2
 * - otherwise: maxCombined=1 (avoids prompt bloat and multi-trap confusion)
 *
 * @returns Trap analysis when priming applies, undefined otherwise.
 */
async function handleTrapPriming(
  question: string,
  sessionId: string,
  stepNumber: number,
  streamContent: MCPContext["streamContent"],
): Promise<ScratchpadResponse["trap_analysis"]> {
  const questionLength = question.length;

  // Length guard (security: prevents memory exhaustion + ReDoS)
  if (questionLength > MAX_QUESTION_LENGTH) {
    await streamContent({
      type: "text",
      text: `⚠️ Question too long (${questionLength} chars, max ${MAX_QUESTION_LENGTH}). Skipping trap detection.\n\n`,
    });
    return undefined;
  }

  // Keep the question for the spot-check at complete (first-write-wins)
  SessionManager.setQuestion(sessionId, question);

  // A late question is stored but not primed
  const isFirstStep = stepNumber === 1;
  if (!isFirstStep) {
    await streamContent({
      type: "text",
      text: `⚠️ Question provided at step ${stepNumber}. Trap priming only runs on step 1. Stored for spot-check at complete.\n\n`,
    });
    return undefined;
  }

  const priming = primeQuestion(question, {
    maxCombined: questionLength < ADAPTIVE_PRIMING_THRESHOLD ? 2 : 1,
  });
  const { primingPrompt } = priming;
  if (!(priming.shouldPrime && primingPrompt)) return undefined;

  await streamContent({
    type: "text",
    text: `💡 **Trap Analysis:** ${primingPrompt}\n\n`,
  });

  return {
    detected: true,
    types: priming.trapTypes,
    primed_count: priming.primedTypes.length,
    note: primingPrompt,
    confidence: priming.confidence,
  };
}
678
+
679
/**
 * Periodic consistency check: on every third step (3, 6, 9, ...) compare the
 * current thought with the earlier thoughts of the branch and report any
 * contradictions found.
 *
 * @param sessionId - Session whose thoughts are read.
 * @param branchId - Branch within that session.
 * @param stepNumber - Number of the current step.
 * @param currentThought - Text of the current step.
 * @param streamContent - Sink for the streamed warning.
 * @returns A consistency warning when contradictions exist, undefined otherwise.
 *          The nudge lists each step to review once, even when several
 *          contradictions point at the same original step.
 */
async function runConsistencyCheck(
  sessionId: string,
  branchId: string,
  stepNumber: number,
  currentThought: string,
  streamContent: MCPContext["streamContent"],
): Promise<ScratchpadResponse["consistency_warning"]> {
  // Only check every 3 steps, and only if we have prior steps
  if (stepNumber < 3 || stepNumber % 3 !== 0) {
    return undefined;
  }

  const thoughts = SessionManager.getThoughts(sessionId, branchId);
  const stepData = thoughts.map((t) => ({ step: t.step_number, thought: t.thought }));
  const contradictions = checkStepConsistency(
    { step: stepNumber, thought: currentThought },
    stepData.slice(0, -1), // Exclude current step (already in thoughts)
  );

  if (contradictions.length === 0) {
    return undefined;
  }

  await streamContent({
    type: "text",
    text:
      `\n⚠️ **Consistency Warning:** ${contradictions.length} contradiction(s) detected\n` +
      contradictions.map((c) => ` - ${c.description}`).join("\n") +
      "\n",
  });

  // De-duplicate so the nudge reads "2, 1" rather than "2, 2, 1".
  const stepsToReview = [...new Set(contradictions.map((c) => c.original_step))];

  return {
    has_contradictions: true,
    count: contradictions.length,
    contradictions: contradictions.map((c) => ({
      type: c.type,
      description: c.description,
      subject: c.subject,
      original_step: c.original_step,
      conflicting_step: c.conflicting_step,
      confidence: c.confidence,
    })),
    nudge: `⚠️ Found ${contradictions.length} potential contradiction(s). Review steps ${stepsToReview.join(", ")} for consistency.`,
  };
}
728
+
729
/**
 * Hypothesis-resolution check for steps taken on a hypothesis branch.
 *
 * Looks for the non-main branch that carries a hypothesis and matches
 * `branchId`, then analyses the current thought against that hypothesis. A
 * result is returned once the hypothesis is resolved or the analysis confidence
 * is above 0.5; a merge suggestion is added when the outcome is "confirmed".
 *
 * @returns `{ resolution, mergeSuggestion }` for a reportable result, or `{}`
 *          when the session is missing or nothing reportable was found.
 */
async function runHypothesisResolution(
  sessionId: string,
  branchId: string,
  stepNumber: number,
  currentThought: string,
  streamContent: MCPContext["streamContent"],
): Promise<{
  resolution?: ScratchpadResponse["hypothesis_resolution"];
  mergeSuggestion?: ScratchpadResponse["merge_suggestion"];
}> {
  const session = SessionManager.get(sessionId);
  if (!session) return {};

  for (const candidate of session.branches.values()) {
    const { hypothesis } = candidate;

    // Only a hypothesis-bearing, non-main branch that the current step is on qualifies.
    if (!hypothesis || candidate.id === "main" || candidate.id !== branchId) {
      continue;
    }

    const resolution = analyzeStepForResolution(
      currentThought,
      hypothesis,
      candidate.success_criteria ?? null,
      stepNumber,
    );

    // Unresolved and low-confidence: nothing worth reporting for this branch.
    if (!resolution.resolved && resolution.confidence <= 0.5) {
      continue;
    }

    const { outcome } = resolution;

    if (resolution.resolved) {
      let emoji = "❓";
      if (outcome === "confirmed") {
        emoji = "✅";
      } else if (outcome === "refuted") {
        emoji = "❌";
      }
      const ellipsis = hypothesis.length > 60 ? "..." : "";

      await streamContent({
        type: "text",
        text:
          `\n${emoji} **Hypothesis ${outcome?.toUpperCase()}:** "${hypothesis.slice(0, 60)}${ellipsis}"\n` +
          ` Evidence: ${resolution.evidence}\n` +
          ` ${resolution.suggestion}\n`,
      });
    }

    // A confirmed hypothesis additionally produces a merge suggestion.
    let mergeSuggestion: ScratchpadResponse["merge_suggestion"];
    if (outcome === "confirmed") {
      mergeSuggestion = {
        should_merge: true,
        from_branch: candidate.id,
        confirmed_hypothesis: hypothesis,
        key_findings: resolution.evidence || currentThought.slice(0, 100),
        nudge: `💡 Hypothesis confirmed! Consider incorporating findings from branch "${candidate.name || candidate.id}" into your main reasoning.`,
      };

      await streamContent({
        type: "text",
        text: `\n${mergeSuggestion.nudge}\n`,
      });
    }

    return { resolution, mergeSuggestion };
  }

  return {};
}
805
+
806
/**
 * Suggest the `challenge` operation when the chain looks overconfident.
 *
 * The decision itself is delegated to `shouldChallenge`; this function only
 * picks the reason and the suggested challenge type, streams the nudge and
 * returns the structured suggestion.
 *
 * @returns A challenge suggestion, or undefined when no challenge is warranted.
 */
async function runAutoChallenge(
  chainConfidence: number,
  stepCount: number,
  hasVerification: boolean,
  streamContent: MCPContext["streamContent"],
): Promise<ScratchpadResponse["challenge_suggestion"]> {
  if (!shouldChallenge(chainConfidence, stepCount, hasVerification)) {
    return undefined;
  }

  type SuggestedType = NonNullable<ScratchpadResponse["challenge_suggestion"]>["suggested_type"];

  const percent = (chainConfidence * 100).toFixed(0);

  // First matching rule wins: extreme confidence, then thin unverified chains, then the generic case.
  const verdict: { reason: string; suggested_type: SuggestedType } =
    chainConfidence > 0.95
      ? {
          reason: `Very high confidence (${percent}%) warrants adversarial review`,
          suggested_type: "all",
        }
      : stepCount < 3 && !hasVerification
        ? {
            reason: `High confidence (${percent}%) with only ${stepCount} step(s) and no verification`,
            suggested_type: "premise_check",
          }
        : {
            reason: `Confidence pattern suggests potential overconfidence`,
            suggested_type: "assumption_inversion",
          };

  const nudge = `🎯 Consider using \`challenge\` operation: ${verdict.reason}`;

  await streamContent({
    type: "text",
    text: `\n${nudge}\n`,
  });

  return {
    should_challenge: true,
    reason: verdict.reason,
    suggested_type: verdict.suggested_type,
    nudge,
  };
}
853
+
854
/**
 * Produce up-front guidance on how many reasoning steps a question deserves.
 *
 * Guidance is only computed for the first step of a chain and only when a
 * question was supplied; every other call returns `undefined` without
 * streaming anything.
 *
 * @param question - The question being reasoned about, if the caller provided one.
 * @param stepNumber - Step number of the thought being added.
 * @param streamContent - Sink used to stream the nudge, when one is produced.
 * @returns The stepping guidance, or `undefined` when not applicable.
 */
async function calculateSteppingGuidance(
  question: string | undefined,
  stepNumber: number,
  streamContent: MCPContext["streamContent"],
): Promise<ScratchpadResponse["stepping_guidance"]> {
  if (stepNumber !== 1 || !question) {
    return undefined;
  }

  const { tier } = assessPromptComplexity(question);
  const recommendedSteps = getRecommendedSteps(tier);

  // A nudge is only worth emitting when more than two steps are recommended.
  const nudge =
    recommendedSteps > 2
      ? `⚠️ This is a ${tier} complexity question. Take ${recommendedSteps}+ reasoning steps before concluding.`
      : null;

  if (nudge) {
    await streamContent({ type: "text", text: `${nudge}\n\n` });
  }

  return {
    complexity_tier: tier,
    recommended_steps: recommendedSteps,
    current_steps: 1,
    needs_more_steps: recommendedSteps > 1,
    nudge,
  };
}
889
+
890
/**
 * Verify a thought and, on failure, park it as pending and build the failure response.
 *
 * Verification runs when the caller set `verify: true`, or automatically once
 * the branch already holds three or more stored thoughts and the caller did
 * not set `verify: false`.
 *
 * @param args - Raw tool arguments; only `verify` is read here.
 * @param sessionId - Session the thought belongs to.
 * @param branchId - Branch whose earlier thoughts are passed to the verifier as context.
 * @param stepNumber - Step number of the thought under verification.
 * @param thought - Thought text to verify.
 * @param domain - Domain passed to the verifier; math additionally triggers common-mistake detection on failure.
 * @param threshold - Confidence threshold echoed into the failure response.
 * @param compressionResult - Compression stats to keep with the pending record, or `null`.
 * @param streamContent - Sink for streamed status text.
 * @returns `{ passed: true, result }` (with `result: null` when verification was skipped),
 *   or `{ passed: false, response }` carrying the response to return to the caller.
 */
async function runVerificationCheck(
  args: ScratchpadArgs,
  sessionId: string,
  branchId: string,
  stepNumber: number,
  thought: string,
  domain: "math" | "logic" | "code" | "general",
  threshold: number,
  compressionResult: {
    applied: boolean;
    original_tokens: number;
    compressed_tokens: number;
    ratio: number;
  } | null,
  streamContent: MCPContext["streamContent"],
): Promise<
  | {
      passed: true;
      result: ReturnType<typeof verify> | null;
    }
  | {
      passed: false;
      response: ScratchpadResponse;
    }
> {
  const earlierThoughts = SessionManager.getThoughts(sessionId, branchId);

  const explicitlyRequested = args.verify === true;
  // Auto mode only applies when the caller neither forced nor forbade verification.
  const autoTriggered =
    !explicitlyRequested && earlierThoughts.length >= 3 && args.verify !== false;

  if (!explicitlyRequested && !autoTriggered) {
    return { passed: true, result: null };
  }

  const verificationResult = verify(
    thought,
    domain,
    earlierThoughts.map((t) => t.thought),
    true,
  );

  // Tell the caller when verification was triggered automatically.
  if (autoTriggered) {
    await streamContent({
      type: "text",
      text: `🔍 **Auto-verification enabled** (chain length: ${earlierThoughts.length + 1} steps)\n`,
    });
  }

  if (verificationResult.passed) {
    return { passed: true, result: verificationResult };
  }

  // Verification failed: halt the chain here.
  const mistakeScan = domain === "math" ? detectCommonMistakesFromText(thought) : null;
  const detectedMistakes = mistakeScan?.mistakes ?? [];

  // Park the thought as pending together with the error details.
  SessionManager.setPendingThought(
    sessionId,
    buildPendingRecord({
      sessionId,
      branchId,
      stepNumber,
      thought,
      domain,
      verificationConfidence: verificationResult.confidence,
      compressionResult,
    }),
    {
      issue: verificationResult.suggestions[0] || "Verification failed",
      evidence: verificationResult.evidence,
      suggestions: verificationResult.suggestions,
      confidence: verificationResult.confidence,
      domain,
    },
  );

  await streamVerificationFailure(streamContent, verificationResult, detectedMistakes, stepNumber);

  const response = buildVerificationFailureResponse({
    sessionId,
    branchId,
    stepNumber,
    threshold,
    verificationResult,
    detectedMistakes,
    domain,
  });
  return { passed: false, response };
}
988
+
989
+ // ============================================================================
990
+ // OPERATION HANDLERS
991
+ // ============================================================================
992
+
993
/**
 * Handle the `step` operation: add a new thought to the main branch.
 *
 * Pipeline, in order: size guard → trap priming / stepping guidance →
 * markdown stripping and domain detection → optional local compute →
 * augmentation → compression → verification → store → confidence and
 * response enrichment → post-store checks (confidence drift, adaptive
 * spot-check, consistency, hypothesis resolution, auto-challenge).
 *
 * If verification fails, the failure response is returned immediately and
 * the thought is NOT stored.
 *
 * @param args - Tool arguments; `thought` is required, `session_id` is generated when absent.
 * @param ctx - MCP context; only `streamContent` is used.
 * @returns The response for this step, or the verification-failure response.
 * @throws Error when `thought` is missing, when the step exceeds `max_step_tokens`
 *   (unless `force_large` is set), or when the thought cannot be stored.
 */
async function handleStep(args: ScratchpadArgs, ctx: MCPContext): Promise<ScratchpadResponse> {
  const { streamContent } = ctx;

  // Runtime validation: thought is required for step operation
  if (!args.thought) {
    throw new Error("thought is required for step operation");
  }
  const thought = args.thought;

  const sessionId = args.session_id || `s_${crypto.randomUUID()}`;
  // NOTE(review): steps always land on "main". The visible part of
  // runHypothesisResolution skips the "main" branch and any branch other than
  // the one passed in, so the hypothesis check below looks like a no-op from
  // this handler — confirm whether steps were meant to target `args.branch_id`.
  const branchId = "main";
  const threshold = args.confidence_threshold ?? 0.8;
  const tokenBudget = args.token_budget ?? 3000;

  // S3: Check max_step_tokens limit before any processing
  const maxStepTokens = args.max_step_tokens;
  if (maxStepTokens !== undefined && !args.force_large) {
    // Estimate tokens: ~4 chars per token
    const estimatedTokens = Math.ceil(thought.length / 4);
    if (estimatedTokens > maxStepTokens) {
      throw new Error(
        `Step exceeds max_step_tokens limit: ${estimatedTokens} > ${maxStepTokens}. ` +
          `Split into smaller steps or use force_large=true to override.`,
      );
    }
  }

  // Auto-increment step number
  const stepNumber = SessionManager.getNextStep(sessionId, branchId);

  // Handle trap priming if question provided
  const trapAnalysis = args.question
    ? await handleTrapPriming(args.question, sessionId, stepNumber, streamContent)
    : undefined;

  // Proactive stepping guidance: assess complexity on first step when question provided
  const steppingGuidance = await calculateSteppingGuidance(
    args.question,
    stepNumber,
    streamContent,
  );

  // Strip markdown; an explicit domain from the caller wins over detection
  let strippedThought = stripMarkdown(thought);
  const domain = args.domain || detectDomain(strippedThought);

  // Pre-compute next step suggestion for math domain (before augmentation modifies text)
  let nextStepSuggestion: ScratchpadResponse["next_step_suggestion"];
  if (domain === "math") {
    const suggestion = suggestNextStepFromText(strippedThought);
    if (suggestion) {
      nextStepSuggestion = suggestion;
    }
  }

  // Try local compute FIRST if requested (before augmentation modifies the text)
  let computeResult = null;
  if (args.local_compute && isLikelyComputable(strippedThought)) {
    computeResult = tryLocalCompute(strippedThought);
    if (computeResult?.solved) {
      await streamContent({
        type: "text",
        text: `⚡ **Local Compute** (${computeResult.method})\n**Result:** ${computeResult.result}\n\n`,
      });
    }
  }

  // S2: Run augment_compute (default: true) - inject computed values into thought
  const shouldAugment = args.augment_compute !== false;
  const augmentation = await applyAugmentation(
    strippedThought,
    args.context,
    shouldAugment,
    streamContent,
  );
  strippedThought = augmentation.thought;
  const augmentationResult = augmentation.result;

  // S1: Token budget guard - measured BEFORE this step is stored
  const tokenUsage = SessionManager.getTokenUsage(sessionId);
  const budgetExceeded = tokenUsage.total >= tokenBudget;

  // Compression - check if requested, auto-detect, OR budget exceeded
  const compression = await applyCompression(strippedThought, args, budgetExceeded, streamContent);
  strippedThought = compression.thought;
  const compressionResult = compression.result;
  const autoCompressed = compression.autoCompressed;

  // Verification; on failure, return that response without storing anything
  const verificationCheck = await runVerificationCheck(
    args,
    sessionId,
    branchId,
    stepNumber,
    strippedThought,
    domain,
    threshold,
    compressionResult,
    streamContent,
  );
  if (!verificationCheck.passed) {
    return verificationCheck.response;
  }
  const verificationResult = verificationCheck.result;

  // Stream the thought (only if verification passed or wasn't requested)
  await streamContent({
    type: "text",
    text: `**Step ${stepNumber}** [${args.purpose}]\n${strippedThought}\n`,
  });
  if (args.preconditions?.length) {
    await streamContent({
      type: "text",
      text: `📋 **Preconditions:** ${args.preconditions.join(", ")}\n`,
    });
  }
  if (args.outcome) {
    await streamContent({ type: "text", text: `**Outcome:** ${args.outcome}\n` });
  }

  // Build thought record
  const record: ThoughtRecord = {
    id: `${sessionId}:${branchId}:${stepNumber}`,
    step_number: stepNumber,
    thought: strippedThought,
    timestamp: Date.now(),
    branch_id: branchId,
    // Store preconditions if provided
    preconditions: args.preconditions,
    // A caller-supplied confidence overrides the verifier's; with no verifier
    // result, a caller-supplied confidence is recorded as a passing check.
    verification: verificationResult
      ? {
          passed: verificationResult.passed,
          confidence: args.confidence ?? verificationResult.confidence,
          domain,
        }
      : args.confidence !== undefined
        ? {
            passed: true, // Assume passed if confidence provided manually
            confidence: args.confidence,
            domain,
          }
        : undefined,
    // Track compression stats if compression was applied
    compression: compressionResult
      ? {
          // Bytes estimated from tokens at ~4 bytes per token
          input_bytes_saved:
            (compressionResult.original_tokens - compressionResult.compressed_tokens) * 4,
          output_bytes_saved: 0,
          context_bytes_saved: 0,
          original_tokens: compressionResult.original_tokens,
          compressed_tokens: compressionResult.compressed_tokens,
        }
      : undefined,
  };

  // Store thought
  const storeResult = SessionManager.addThought(sessionId, record);
  if (!storeResult.success) {
    throw new Error(storeResult.error || "Failed to store thought");
  }

  // Calculate confidence
  const confState = calculateConfidence(sessionId, branchId, args.confidence);
  const status = determineStatus(confState.chainConfidence, threshold, false);
  const suggestedAction = getSuggestedAction(status, confState.chainConfidence);

  // Build response
  const response: ScratchpadResponse = {
    session_id: sessionId,
    current_step: stepNumber,
    branch: branchId,
    operation: "step",
    step_confidence: confState.stepConfidence,
    chain_confidence: confState.chainConfidence,
    confidence_threshold: threshold,
    steps_with_confidence: confState.stepsWithConfidence,
    status,
    suggested_action: suggestedAction,
  };

  // Add 5-second warning if threshold reached
  if (status === "threshold_reached") {
    response.auto_complete_warning =
      "⏱️ Confidence threshold reached. You have 5 seconds to continue or call complete. " +
      "After 5s, the chain will auto-complete if no further action is taken.";
    // Round before display: e.g. 0.7 * 100 is 70.00000000000001 in IEEE-754,
    // which would otherwise leak into the banner. Two decimals keep
    // thresholds such as 0.875 readable as 87.5.
    const thresholdPct = Number((threshold * 100).toFixed(2));
    await streamContent({
      type: "text",
      text:
        `\n⚠️ **THRESHOLD REACHED** (${(confState.chainConfidence * 100).toFixed(0)}% ≥ ${thresholdPct}%)\n` +
        "Call `complete` operation or continue reasoning within 5 seconds.\n",
    });
  }

  // Enrich response with optional fields; token usage is re-read so it
  // includes the step that was just stored
  const updatedTokenUsage = SessionManager.getTokenUsage(sessionId);
  enrichStepResponse(response, {
    verificationResult,
    domain,
    computeResult,
    compressionResult,
    tokenUsage: updatedTokenUsage,
    tokenBudget,
    budgetExceeded,
    autoCompressed,
    augmentationResult,
    trapAnalysis,
    nextStepSuggestion,
  });

  // Stream next step suggestion if available
  if (nextStepSuggestion?.hasSuggestion) {
    await streamContent({
      type: "text",
      text: `💡 **Next step:** ${nextStepSuggestion.description}\n`,
    });
  }

  // Add stepping guidance if available (from first step complexity assessment)
  if (steppingGuidance) {
    response.stepping_guidance = steppingGuidance;
  }

  // Stream compression suggestion if present
  if (response.compression_suggestion) {
    await streamContent({
      type: "text",
      text: `📦 ${response.compression_suggestion.nudge}\n`,
    });
  }

  // S3: Step-level Confidence Drift Detection (CDD)
  const cddResult = await runStepLevelCDD(sessionId, branchId, streamContent);
  if (cddResult) {
    response.confidence_drift = cddResult;
  }

  // Adaptive spot-check: Auto-trigger when CDD detects unresolved drift
  // This catches trap patterns early, before complete() is called
  const adaptiveSpotCheck = await runAdaptiveSpotCheck(
    sessionId,
    strippedThought,
    cddResult,
    streamContent,
  );
  if (adaptiveSpotCheck) {
    response.spot_check_result = adaptiveSpotCheck;
    // A failed spot-check overrides the status and suggested action set above
    if (!adaptiveSpotCheck.passed) {
      response.status = "review";
      response.suggested_action = `Potential ${adaptiveSpotCheck.trap_type} trap detected. ${adaptiveSpotCheck.hint || "Reconsider your approach."}`;
    }
  }

  // Consistency check against earlier steps
  const consistencyWarning = await runConsistencyCheck(
    sessionId,
    branchId,
    stepNumber,
    strippedThought,
    streamContent,
  );
  if (consistencyWarning) {
    response.consistency_warning = consistencyWarning;
  }

  // Hypothesis resolution: Check if branch hypothesis has been resolved
  const { resolution, mergeSuggestion } = await runHypothesisResolution(
    sessionId,
    branchId,
    stepNumber,
    strippedThought,
    streamContent,
  );
  if (resolution) {
    response.hypothesis_resolution = resolution;
  }
  if (mergeSuggestion) {
    response.merge_suggestion = mergeSuggestion;
  }

  // Auto-challenge: Suggest adversarial review on overconfidence
  const hasVerification = !!verificationResult?.passed;
  const challengeSuggestion = await runAutoChallenge(
    confState.chainConfidence,
    stepNumber,
    hasVerification,
    streamContent,
  );
  if (challengeSuggestion) {
    response.challenge_suggestion = challengeSuggestion;
  }

  return response;
}
1289
+
1290
/**
 * Handle the `navigate` operation: read-only views over a session.
 *
 * Always returns the base confidence/status fields for the selected branch
 * (default "main"). Depending on `args.view` it additionally fills:
 * - `"history"`  → `response.history` (last `limit` thoughts, default 10)
 * - `"branches"` → `response.branches`
 * - `"step"`     → `response.step_detail` (requires `step_id`)
 * - `"path"`     → `response.path` (requires `step_id`)
 * Any other `view` value yields the base response only.
 *
 * @param args - Tool arguments; `session_id` is required.
 * @param _ctx - MCP context (unused; nothing is streamed).
 * @returns The navigation response.
 * @throws Error when `session_id` is missing, the session does not exist,
 *   `step_id` is missing for the step/path views, or the step is not found.
 */
async function handleNavigate(args: ScratchpadArgs, _ctx: MCPContext): Promise<ScratchpadResponse> {
  const sessionId = args.session_id;
  if (!sessionId) {
    throw new Error("session_id required for navigate operation");
  }

  const session = SessionManager.get(sessionId);
  if (!session) {
    throw new Error(`Session not found: ${sessionId}`);
  }

  const threshold = args.confidence_threshold ?? 0.8;
  const branchId = args.branch_id || "main";
  const confState = calculateConfidence(sessionId, branchId);
  const status = determineStatus(confState.chainConfidence, threshold, false);

  const response: ScratchpadResponse = {
    session_id: sessionId,
    current_step: SessionManager.getCurrentStep(sessionId, branchId),
    branch: branchId,
    operation: "navigate",
    chain_confidence: confState.chainConfidence,
    confidence_threshold: threshold,
    steps_with_confidence: confState.stepsWithConfidence,
    status,
    suggested_action: getSuggestedAction(status, confState.chainConfidence),
  };

  switch (args.view) {
    case "history": {
      // Uses the raw `args.branch_id` (possibly undefined), not the "main" default above.
      const thoughts = SessionManager.getThoughts(sessionId, args.branch_id);
      // Missing, zero, NaN, or negative limits all fall back to 10. A negative
      // limit would otherwise turn `slice(-limit)` into a positive start index
      // and drop the OLDEST entries instead of keeping the newest ones.
      const limit = args.limit !== undefined && args.limit > 0 ? args.limit : 10;
      const limited = thoughts.slice(-limit);
      response.history = limited.map((t) => ({
        step: t.step_number,
        branch: t.branch_id,
        purpose: "analysis", // Default since we don't store purpose currently
        thought_preview: t.thought.slice(0, 80) + (t.thought.length > 80 ? "..." : ""),
        confidence: t.verification?.confidence,
        revised_by: t.revised_by,
      }));
      break;
    }

    case "branches": {
      const branches = SessionManager.getBranches(sessionId);
      response.branches = branches.map((b) => ({
        id: b.id,
        name: b.name,
        from_step: b.from_step,
        depth: b.depth,
        hypothesis: b.hypothesis,
        success_criteria: b.success_criteria,
      }));
      break;
    }

    case "step": {
      if (!args.step_id) {
        throw new Error("step_id required for step view");
      }
      const step = SessionManager.getStep(sessionId, args.step_id);
      if (!step) {
        throw new Error(`Step not found: ${args.step_id}`);
      }
      response.step_detail = {
        step: step.step_number,
        branch: step.branch_id,
        purpose: "analysis", // Purpose is not stored; same placeholder as the history view
        thought: step.thought,
        outcome: undefined, // Not stored currently
        confidence: step.verification?.confidence,
        revises_step: step.revises_step,
        revised_by: step.revised_by,
        preconditions: step.preconditions,
        hypothesis: step.hypothesis,
        success_criteria: step.success_criteria,
      };
      break;
    }

    case "path": {
      if (!args.step_id) {
        throw new Error("step_id required for path view");
      }
      const path = SessionManager.getPath(sessionId, args.step_id);
      response.path = path.map((t) => ({
        step: t.step_number,
        branch: t.branch_id,
        thought_preview: t.thought.slice(0, 60) + (t.thought.length > 60 ? "..." : ""),
      }));
      break;
    }
  }

  return response;
}
1387
+
1388
/**
 * Handle the `branch` operation: start an alternative reasoning path.
 *
 * Creates a fresh branch id, places the supplied thought at step
 * `from_step + 1` on that branch (defaulting `from_step` to the current step
 * of "main"), and stores it with the optional hypothesis and success criteria.
 * Any pending verification-failed thought in the session is cleared first,
 * because branching abandons it.
 *
 * @param args - Tool arguments; `session_id` and `thought` are required.
 * @param ctx - MCP context; only `streamContent` is used.
 * @returns The response describing the new branch and its first step.
 * @throws Error when `session_id` or `thought` is missing, the session does
 *   not exist, or the thought cannot be stored.
 */
async function handleBranch(args: ScratchpadArgs, ctx: MCPContext): Promise<ScratchpadResponse> {
  const { streamContent } = ctx;

  // Runtime validation: session_id and thought are required for branch operation
  if (!args.session_id) {
    throw new Error("session_id required for branch operation");
  }
  if (!args.thought) {
    throw new Error("thought is required for branch operation");
  }
  const sessionId = args.session_id;
  const thought = args.thought;

  const session = SessionManager.get(sessionId);
  if (!session) {
    throw new Error(`Session not found: ${sessionId}`);
  }

  const threshold = args.confidence_threshold ?? 0.8;

  // Clear any pending verification failure (branching abandons the failed step)
  const hadPending = SessionManager.clearPendingThought(sessionId);

  // Determine branch point
  const fromStep = args.from_step ?? SessionManager.getCurrentStep(sessionId, "main");
  const branchId = `branch-${crypto.randomUUID()}`;
  const branchName = args.branch_name || `Alternative from step ${fromStep}`;

  // The branch's first step continues numbering from the branch point
  const stepNumber = fromStep + 1;

  // Strip markdown and resolve domain BEFORE augmentation. An explicit
  // `args.domain` takes precedence over detection, matching handleStep.
  let strippedThought = stripMarkdown(thought);
  const domain = args.domain || detectDomain(strippedThought);

  // Pre-compute next step suggestion for math domain (before augmentation modifies text)
  let nextStepSuggestion: ScratchpadResponse["next_step_suggestion"];
  if (domain === "math") {
    const suggestion = suggestNextStepFromText(strippedThought);
    if (suggestion) {
      nextStepSuggestion = suggestion;
    }
  }

  // Auto-augment (default: true)
  let augmentationResult: {
    applied: boolean;
    computations: number;
    filtered: number;
    domain: string;
  } | null = null;

  const shouldAugment = args.augment_compute !== false;

  if (shouldAugment) {
    const augResult = contextAwareCompute({
      thought: strippedThought,
      systemPrompt: args.context,
    });

    // Only replace the thought text when something was actually computed
    if (augResult.hasComputations) {
      strippedThought = augResult.augmented;
      augmentationResult = {
        applied: true,
        computations: augResult.computations.length,
        filtered: augResult.filteredCount,
        domain: augResult.domain,
      };
      await streamContent({
        type: "text",
        text: `⚡ **Augmented** ${augResult.computations.length} computations (${augResult.domain})\n`,
      });
    }
  }

  // Stream branch creation
  const pendingNote = hadPending ? " (abandoning failed verification step)" : "";
  const hypothesisNote = args.hypothesis ? `\n 📊 Hypothesis: ${args.hypothesis}` : "";
  const criteriaNote = args.success_criteria
    ? `\n ✅ Success criteria: ${args.success_criteria}`
    : "";
  await streamContent({
    type: "text",
    text:
      `🌿 **New Branch:** ${branchName}${pendingNote}\n` +
      ` From step ${fromStep} → Step ${stepNumber}${hypothesisNote}${criteriaNote}\n\n`,
  });

  // Stream the thought
  await streamContent({
    type: "text",
    text: `**Step ${stepNumber}** [${args.purpose}]\n${strippedThought}\n`,
  });

  // Build thought record with branch info
  const record: ThoughtRecord = {
    id: `${sessionId}:${branchId}:${stepNumber}`,
    step_number: stepNumber,
    thought: strippedThought,
    timestamp: Date.now(),
    branch_id: branchId,
    branch_from: fromStep,
    branch_name: branchName,
    // Hypothesis-driven branching
    hypothesis: args.hypothesis,
    success_criteria: args.success_criteria,
  };

  // Store thought
  const storeResult = SessionManager.addThought(sessionId, record);
  if (!storeResult.success) {
    throw new Error(storeResult.error || "Failed to store branch thought");
  }

  // Calculate confidence for new branch
  const confState = calculateConfidence(sessionId, branchId);
  const status = determineStatus(confState.chainConfidence, threshold, false);

  const response: ScratchpadResponse = {
    session_id: sessionId,
    current_step: stepNumber,
    branch: branchId,
    operation: "branch",
    chain_confidence: confState.chainConfidence,
    confidence_threshold: threshold,
    steps_with_confidence: confState.stepsWithConfidence,
    status,
    suggested_action: args.hypothesis
      ? `Branch "${branchName}" created to test: "${args.hypothesis}". Continue reasoning to prove/disprove.`
      : `Branch "${branchName}" created. Continue reasoning on this alternative path.`,
  };

  // Add augmentation info
  if (augmentationResult) {
    response.augmentation = augmentationResult;
  }

  // Add next step suggestion for math domain (computed before augmentation)
  if (nextStepSuggestion) {
    response.next_step_suggestion = nextStepSuggestion;
    if (nextStepSuggestion.hasSuggestion) {
      await streamContent({
        type: "text",
        text: `💡 **Next step:** ${nextStepSuggestion.description}\n`,
      });
    }
  }

  return response;
}
1539
+
1540
/**
 * Handle the `revise` operation: correct an earlier step on the main branch.
 *
 * Two modes:
 * - Revising the pending (verification-failed) step: the pending thought is
 *   cleared and the revision is stored under the SAME step number, without a
 *   `revises_step` link, because it replaces a step that was never stored.
 * - Revising a stored step: the target must exist; the revision is stored as
 *   the next step on "main" with `revises_step` pointing at the target.
 *
 * @param args - Tool arguments; `session_id`, `thought` and `target_step` are required.
 * @param ctx - MCP context; only `streamContent` is used.
 * @returns The response for the stored revision.
 * @throws Error when a required argument is missing, the session or target
 *   step does not exist, or the revision cannot be stored.
 */
async function handleRevise(args: ScratchpadArgs, ctx: MCPContext): Promise<ScratchpadResponse> {
  const { streamContent } = ctx;

  // Runtime validation: required fields for revise operation
  if (!args.session_id) {
    throw new Error("session_id required for revise operation");
  }
  if (!args.thought) {
    throw new Error("thought is required for revise operation");
  }
  if (args.target_step === undefined) {
    throw new Error("target_step is required for revise operation");
  }
  const sessionId = args.session_id;
  const thought = args.thought;
  const targetStep = args.target_step;

  const session = SessionManager.get(sessionId);
  if (!session) {
    throw new Error(`Session not found: ${sessionId}`);
  }

  const threshold = args.confidence_threshold ?? 0.8;
  const branchId = "main"; // Revisions go on main branch

  // Check if revising a pending (failed verification) step
  const pending = SessionManager.getPendingThought(sessionId);
  const isRevisingPending = pending && targetStep === pending.thought.step_number;

  // If not revising pending, validate target step exists in stored thoughts
  if (!isRevisingPending) {
    const existingStep = SessionManager.getStep(sessionId, targetStep);
    if (!existingStep) {
      throw new Error(`Target step not found: ${targetStep}`);
    }
  }

  // Clear pending if we're revising it (the revision replaces it)
  if (isRevisingPending) {
    SessionManager.clearPendingThought(sessionId);
  }

  // Use the same step number if revising pending, otherwise auto-increment
  const stepNumber = isRevisingPending
    ? pending.thought.step_number
    : SessionManager.getNextStep(sessionId, branchId);

  // Strip markdown and resolve domain. An explicit `args.domain` takes
  // precedence over detection, matching handleStep, so the domain stored in
  // `verification.domain` stays consistent across a session.
  let strippedThought = stripMarkdown(thought);
  const domain = args.domain || detectDomain(strippedThought);

  // Pre-compute next step suggestion for math domain (before augmentation modifies text)
  let nextStepSuggestion: ScratchpadResponse["next_step_suggestion"];
  if (domain === "math") {
    const suggestion = suggestNextStepFromText(strippedThought);
    if (suggestion) {
      nextStepSuggestion = suggestion;
    }
  }

  // Auto-augment (default: true)
  let augmentationResult: {
    applied: boolean;
    computations: number;
    filtered: number;
    domain: string;
  } | null = null;

  const shouldAugment = args.augment_compute !== false;

  if (shouldAugment) {
    const augResult = contextAwareCompute({
      thought: strippedThought,
      systemPrompt: args.context,
    });

    // Only replace the thought text when something was actually computed
    if (augResult.hasComputations) {
      strippedThought = augResult.augmented;
      augmentationResult = {
        applied: true,
        computations: augResult.computations.length,
        filtered: augResult.filteredCount,
        domain: augResult.domain,
      };
      await streamContent({
        type: "text",
        text: `⚡ **Augmented** ${augResult.computations.length} computations (${augResult.domain})\n`,
      });
    }
  }

  // Stream revision
  const revisingLabel = isRevisingPending ? " (replacing failed verification)" : "";
  await streamContent({
    type: "text",
    text:
      `📝 **Revising Step ${targetStep}**${revisingLabel}\n` +
      ` Reason: ${args.reason ?? "correction"}\n\n` +
      `**Step ${stepNumber}** [correction]\n${strippedThought}\n`,
  });

  // Build thought record with revision info
  const record: ThoughtRecord = {
    id: `${sessionId}:${branchId}:${stepNumber}`,
    step_number: stepNumber,
    thought: strippedThought,
    timestamp: Date.now(),
    branch_id: branchId,
    revises_step: isRevisingPending ? undefined : targetStep, // Don't mark as revision if replacing pending
    revision_reason: args.reason,
    // Revisions are not re-verified here; a caller-supplied confidence is
    // recorded as a passing check, otherwise no verification is attached.
    verification:
      args.confidence !== undefined
        ? {
            passed: true,
            confidence: args.confidence,
            domain,
          }
        : undefined,
  };

  // Store thought
  const storeResult = SessionManager.addThought(sessionId, record);
  if (!storeResult.success) {
    throw new Error(storeResult.error || "Failed to store revision");
  }

  // Calculate confidence
  const confState = calculateConfidence(sessionId, branchId, args.confidence);
  const status = determineStatus(confState.chainConfidence, threshold, false);

  const response: ScratchpadResponse = {
    session_id: sessionId,
    current_step: stepNumber,
    branch: branchId,
    operation: "revise",
    step_confidence: confState.stepConfidence,
    chain_confidence: confState.chainConfidence,
    confidence_threshold: threshold,
    steps_with_confidence: confState.stepsWithConfidence,
    status,
    suggested_action: `Revised step ${targetStep}. Continue reasoning with corrected understanding.`,
  };

  // Add augmentation info
  if (augmentationResult) {
    response.augmentation = augmentationResult;
  }

  // Add next step suggestion for math domain (computed before augmentation)
  if (nextStepSuggestion) {
    response.next_step_suggestion = nextStepSuggestion;
    if (nextStepSuggestion.hasSuggestion) {
      await streamContent({
        type: "text",
        text: `💡 **Next step:** ${nextStepSuggestion.description}\n`,
      });
    }
  }

  return response;
}
1702
+
1703
/**
 * Handle the `complete` operation: finalize a session's reasoning chain.
 *
 * Streams a completion banner (step count, chain confidence, optional
 * compression line, summary and answer), runs the automatic spot-check when a
 * question and a final answer are both available, analyzes confidence drift over
 * the main-branch thoughts, and assembles the final response.
 *
 * @param args - Scratchpad arguments; `session_id` is mandatory for this operation.
 * @param ctx - MCP context supplying `streamContent`.
 * @returns Response with status "review" when the spot-check failed or unresolved
 *   drift was reported, "complete" otherwise.
 * @throws Error when `session_id` is missing or no such session exists.
 */
async function handleComplete(args: ScratchpadArgs, ctx: MCPContext): Promise<ScratchpadResponse> {
  const { streamContent } = ctx;
  const say = (text: string) => streamContent({ type: "text", text });

  const sessionId = args.session_id;
  if (!sessionId) {
    throw new Error("session_id required for complete operation");
  }
  if (!SessionManager.get(sessionId)) {
    throw new Error(`Session not found: ${sessionId}`);
  }

  const threshold = args.confidence_threshold ?? 0.8;
  const branchId = "main";

  // Final stats are computed over the main branch only (thoughts without a
  // branch id count as main).
  const mainThoughts = SessionManager.getThoughts(sessionId).filter(
    (t) => !t.branch_id || t.branch_id === branchId,
  );
  const stepTotal = mainThoughts.length;
  const confidence = calculateConfidence(sessionId, branchId);
  const compression = SessionManager.getCompressionStats(sessionId);

  // Completion banner
  await say(
    [
      `✅ **Reasoning Complete**\n`,
      ` Total steps: ${stepTotal}\n`,
      ` Chain confidence: ${(confidence.chainConfidence * 100).toFixed(0)}%\n`,
    ].join(""),
  );

  if (compression && compression.totalBytesSaved > 0) {
    await say(
      ` Compression: ${compression.stepCount} steps, ${compression.totalBytesSaved} bytes saved\n`,
    );
  }

  if (args.summary) {
    await say(`\n**Summary:** ${args.summary}\n`);
  }
  if (args.final_answer) {
    await say(`**Answer:** ${args.final_answer}\n`);
  }

  // Automatic spot-check: needs a question (passed directly, or the one stored
  // by an earlier step operation) together with a final answer.
  type SpotCheckOutcome = {
    passed: boolean;
    trapType: string | null;
    warning: string | null;
    hint: string | null;
    confidence: number;
  };
  const question = args.question || SessionManager.getQuestion(sessionId);
  let spotCheckResult: SpotCheckOutcome | undefined;
  let needsReconsideration = false;

  if (question && args.final_answer) {
    spotCheckResult = spotCheck(question, args.final_answer);
    if (!spotCheckResult.passed) {
      needsReconsideration = true;

      const parts = [`\n⚠️ **Spot-check warning:** ${spotCheckResult.trapType}\n`];
      if (spotCheckResult.warning) {
        parts.push(` ${spotCheckResult.warning}\n`);
      }
      if (spotCheckResult.hint) {
        parts.push(` 💡 ${spotCheckResult.hint}\n`);
      }
      parts.push(
        `\n🔄 **Reconsideration recommended:** Your answer may have fallen for a cognitive trap.\n`,
        ` Call \`revise\` with target_step=${stepTotal} to reconsider your final reasoning.\n`,
      );
      await say(parts.join(""));
    }
  }

  // Confidence Drift Detection (CDD) over the main-branch trajectory
  const drift = analyzeConfidenceDrift(mainThoughts);
  if (drift.pattern !== "insufficient") {
    if (drift.unresolved) {
      // Unresolved drift is a warning and forces a review status
      needsReconsideration = true;

      const parts = [`\n⚠️ **Confidence Drift Warning:** ${drift.explanation}\n`];
      if (drift.suggestion) {
        parts.push(` 💡 ${drift.suggestion}\n`);
      }
      parts.push(
        ` Pattern: ${drift.pattern}, Drift score: ${(drift.drift_score * 100).toFixed(0)}%\n`,
      );
      await say(parts.join(""));
    } else if (drift.pattern !== "stable") {
      // Resolved but non-stable patterns are reported for information only
      await say(` Confidence pattern: ${drift.pattern}\n`);
    }
  }

  // Final status and guidance: drift takes precedence over the spot-check trap
  const finalStatus = needsReconsideration ? "review" : "complete";
  let suggestedAction = "Reasoning chain finalized.";
  if (needsReconsideration) {
    if (drift.unresolved) {
      suggestedAction = `Unresolved confidence drift detected (${drift.pattern} pattern). ${drift.suggestion || `Review step ${drift.min_step} where confidence dropped.`}`;
    } else if (spotCheckResult?.trapType) {
      suggestedAction = `Potential ${spotCheckResult.trapType} trap detected. Call revise(target_step=${stepTotal}, reason="${spotCheckResult.hint || "Reconsider approach"}") to fix.`;
    } else {
      suggestedAction = "Review recommended before finalizing.";
    }
  }

  const response: ScratchpadResponse = {
    session_id: sessionId,
    current_step: SessionManager.getCurrentStep(sessionId, branchId),
    branch: branchId,
    operation: "complete",
    chain_confidence: confidence.chainConfidence,
    confidence_threshold: threshold,
    steps_with_confidence: confidence.stepsWithConfidence,
    status: finalStatus,
    suggested_action: suggestedAction,
    final_summary: args.summary,
    total_steps: stepTotal,
  };

  // Spot-check details, present only when the check actually ran
  if (spotCheckResult) {
    const { passed, trapType, warning, hint } = spotCheckResult;
    response.spot_check_result = {
      passed,
      trap_type: trapType,
      warning,
      hint,
      confidence: spotCheckResult.confidence,
    };

    // A ready-made revise suggestion requires both a trap type and a hint
    if (needsReconsideration && trapType && hint) {
      response.reconsideration = {
        trap_type: trapType,
        hint,
        suggested_revise: {
          target_step: stepTotal,
          reason: `Potential ${trapType} trap: ${hint}`,
        },
      };
    }
  }

  // Compression stats, only when compression saved something
  if (compression && compression.totalBytesSaved > 0) {
    const { tokens } = compression;
    response.compression_stats = {
      total_bytes_saved: compression.totalBytesSaved,
      steps_compressed: compression.stepCount,
      tokens:
        tokens.original > 0
          ? {
              original: tokens.original,
              compressed: tokens.compressed,
              saved: tokens.saved,
            }
          : undefined,
    };
  }

  // Drift analysis is attached unless the analyzer reported "insufficient"
  if (drift.pattern !== "insufficient") {
    response.confidence_drift = {
      drift_score: drift.drift_score,
      unresolved: drift.unresolved,
      min_confidence: drift.min_confidence,
      min_step: drift.min_step,
      max_drop: drift.max_drop,
      recovery: drift.recovery,
      has_revision_after_drop: drift.has_revision_after_drop,
      pattern: drift.pattern,
      explanation: drift.explanation,
      suggestion: drift.suggestion,
    };
  }

  return response;
}
1886
+
1887
/**
 * Handle the `augment` operation: extract math expressions from `args.text`,
 * compute them, and return the text with results injected.
 *
 * The outcome is streamed first. When `args.store_as_step` is set, the augmented
 * text is additionally stored as the next step on the "main" branch.
 *
 * @param args - Scratchpad arguments; `text` is required. When `session_id` is
 *   absent a fresh `s_<uuid>` id is generated.
 * @param ctx - MCP context supplying `streamContent`.
 * @returns Response carrying the augmented text, the computations performed, the
 *   filtered count and the detected domain. `current_step` is 0 unless a step
 *   was stored.
 * @throws Error when `text` is missing, or when `store_as_step` is set and the
 *   session manager reports that the thought could not be stored.
 */
async function handleAugment(args: ScratchpadArgs, ctx: MCPContext): Promise<ScratchpadResponse> {
  const { streamContent } = ctx;

  // Runtime validation: text is required for augment operation
  if (!args.text) {
    throw new Error("text is required for augment operation");
  }
  const text = args.text;

  const sessionId = args.session_id || `s_${crypto.randomUUID()}`;
  const threshold = args.confidence_threshold ?? 0.8;
  const branchId = "main";

  // Run context-aware computation
  const computeResult = contextAwareCompute({
    thought: text,
    systemPrompt: args.system_context,
  });

  // Stream result
  if (computeResult.hasComputations) {
    await streamContent({
      type: "text",
      text:
        `⚡ **Augmented** (${computeResult.computations.length} computations, ` +
        `${computeResult.filteredCount} filtered by domain)\n` +
        `Domain: ${computeResult.domain}\n\n`,
    });
    await streamContent({
      type: "text",
      text: `**Result:**\n${computeResult.augmented}\n`,
    });
  } else {
    await streamContent({
      type: "text",
      text: "No computable expressions found.\n",
    });
  }

  // Optionally store as a step
  let stepNumber = 0;
  if (args.store_as_step) {
    stepNumber = SessionManager.getNextStep(sessionId, branchId);
    const record: ThoughtRecord = {
      id: `${sessionId}:${branchId}:${stepNumber}`,
      step_number: stepNumber,
      thought: computeResult.augmented,
      timestamp: Date.now(),
      branch_id: branchId,
    };
    // Surface store failures (as the revise handler does) instead of reporting
    // a step number for a thought that was never recorded.
    const storeResult = SessionManager.addThought(sessionId, record);
    if (!storeResult.success) {
      throw new Error(storeResult.error || "Failed to store augmented step");
    }
  }

  // Calculate confidence for session
  const confState = calculateConfidence(sessionId, branchId);
  const status = determineStatus(confState.chainConfidence, threshold, false);

  return {
    session_id: sessionId,
    current_step: stepNumber,
    branch: branchId,
    operation: "augment",
    chain_confidence: confState.chainConfidence,
    confidence_threshold: threshold,
    steps_with_confidence: confState.stepsWithConfidence,
    status,
    suggested_action: computeResult.hasComputations
      ? `Augmented ${computeResult.computations.length} expressions. Use store_as_step=true to add to reasoning chain.`
      : "No computations found. Text returned unchanged.",
    augmented_text: computeResult.augmented,
    computations: computeResult.computations.map((c) => ({
      expression: c.original,
      result: c.result,
      method: c.method,
    })),
    filtered_count: computeResult.filteredCount,
    detected_domain: computeResult.domain,
  };
}
1967
+
1968
/**
 * Handle the `override` operation: commit a step that failed verification.
 *
 * Looks up the session's pending (verification-failed) thought, checks that
 * `args.failed_step` names that same step, commits it, streams a notice and
 * returns a response whose `verification` block records the failure.
 *
 * @param args - Scratchpad arguments; `session_id` is required, `failed_step`
 *   must equal the pending step number, `reason` is optional.
 * @param ctx - MCP context supplying `streamContent`.
 * @returns Response for the committed step on the pending thought's branch.
 * @throws Error when `session_id` is missing, nothing is pending, the step
 *   numbers differ, or the commit is reported as unsuccessful.
 */
async function handleOverride(args: ScratchpadArgs, ctx: MCPContext): Promise<ScratchpadResponse> {
  const { streamContent } = ctx;
  const sessionId = args.session_id;
  if (!sessionId) {
    throw new Error("session_id required for override operation");
  }

  const threshold = args.confidence_threshold ?? 0.8;

  // Check for pending thought
  const pending = SessionManager.getPendingThought(sessionId);
  if (!pending) {
    throw new Error(
      `No pending verification failure to override. ` +
        `Use override only after a step fails verification.`,
    );
  }

  // Validate the failed_step matches
  if (args.failed_step !== pending.thought.step_number) {
    throw new Error(
      `failed_step (${args.failed_step}) doesn't match pending step (${pending.thought.step_number})`,
    );
  }

  // Commit the pending thought
  const commitResult = SessionManager.commitPendingThought(sessionId);
  if (!commitResult.success) {
    throw new Error(commitResult.error || "Failed to commit overridden step");
  }

  const branchId = pending.thought.branch_id;

  // The tool description lists `reason` as optional, so avoid streaming the
  // literal text "undefined" when the caller did not supply one.
  const reasonText = args.reason || "(no reason provided)";

  // Stream override notice
  await streamContent({
    type: "text",
    text:
      `✓ **OVERRIDE ACCEPTED**\n` +
      `Step ${args.failed_step} committed despite verification failure.\n` +
      `Reason: ${reasonText}\n\n` +
      `**Note:** This step is marked as verification-failed in the chain.\n`,
  });

  // Calculate confidence
  const confState = calculateConfidence(sessionId, branchId);
  const status = determineStatus(confState.chainConfidence, threshold, false);

  return {
    session_id: sessionId,
    current_step: pending.thought.step_number,
    branch: branchId,
    operation: "override",
    chain_confidence: confState.chainConfidence,
    confidence_threshold: threshold,
    steps_with_confidence: confState.stepsWithConfidence,
    status,
    suggested_action: `Step ${args.failed_step} committed. Continue reasoning.`,
    // Verification details are taken from the pending record captured before the commit
    verification: {
      passed: false,
      confidence: pending.verificationError.confidence,
      domain: pending.verificationError.domain,
    },
  };
}
2033
+
2034
/**
 * Handle the `hint` operation: reveal the simplification path of an expression
 * progressively, remembering progress in per-session hint state.
 *
 * Expression / reveal-count resolution:
 * - `args.expression` given and equal to the stored one: continue; reveal one
 *   more step unless `reveal_count` is specified.
 * - `args.expression` given and different (or `reset`): start at `reveal_count ?? 1`.
 * - no expression but stored state: continue the stored expression.
 * - neither: stream an error and return an unsuccessful `hint_result`.
 *
 * The reveal count is truncated to an integer and clamped to
 * `[0, totalSteps]`, so negative or fractional values cannot reach `slice`
 * or be persisted in the hint state.
 *
 * @param args - Scratchpad arguments (`expression`, `reveal_count`, `cumulative`,
 *   `reset`, `session_id`, `confidence_threshold`).
 * @param ctx - MCP context supplying `streamContent`.
 * @returns Response whose `hint_result` lists the visible steps; with
 *   `cumulative` false only the most recently revealed step is listed.
 */
async function handleHint(args: ScratchpadArgs, ctx: MCPContext): Promise<ScratchpadResponse> {
  const { streamContent } = ctx;
  const sessionId = args.session_id || `hint-${Date.now()}`;
  const threshold = args.confidence_threshold ?? 0.8;
  const { cumulative = true, reset = false } = args;

  // Check for existing hint state (ignored entirely on reset)
  const existingState = reset ? null : SessionManager.getHintState(sessionId);

  // Determine expression and reveal count
  let expression: string;
  let revealCount: number;

  if (args.expression) {
    // New expression provided - start fresh or continue if same expression
    expression = args.expression;
    if (existingState && existingState.expression === expression && !reset) {
      // Same expression - auto-increment if no reveal_count specified
      revealCount = args.reveal_count ?? existingState.revealCount + 1;
    } else {
      // Different expression or reset - start fresh
      revealCount = args.reveal_count ?? 1;
    }
  } else if (existingState) {
    // No expression but have state - continue from previous
    expression = existingState.expression;
    revealCount = args.reveal_count ?? existingState.revealCount + 1;
  } else {
    // No expression and no state - error
    await streamContent({
      type: "text",
      text: `❌ No expression provided and no previous hint state in session.\n`,
    });

    return {
      session_id: sessionId,
      current_step: 0,
      branch: "main",
      operation: "hint",
      chain_confidence: 0,
      confidence_threshold: threshold,
      steps_with_confidence: 0,
      status: "continue",
      suggested_action: "Provide an expression to get hints.",
      hint_result: {
        success: false,
        original: "",
        simplified: "",
        steps_shown: 0,
        total_steps: 0,
        steps: [],
        has_more: false,
      },
    };
  }

  // Get full simplification path
  const pathResult = suggestSimplificationPath(expression);

  if (!pathResult.success) {
    // Clear any existing state for this failed expression
    SessionManager.clearHintState(sessionId);

    await streamContent({
      type: "text",
      text: `❌ Could not parse expression: "${expression}"\n`,
    });

    return {
      session_id: sessionId,
      current_step: 0,
      branch: "main",
      operation: "hint",
      chain_confidence: 0,
      confidence_threshold: threshold,
      steps_with_confidence: 0,
      status: "continue",
      suggested_action: "Expression could not be parsed. Check syntax.",
      hint_result: {
        success: false,
        original: expression,
        simplified: expression,
        steps_shown: 0,
        total_steps: 0,
        steps: [],
        has_more: false,
      },
    };
  }

  const totalSteps = pathResult.steps.length;
  // Clamp to [0, totalSteps]: a negative count would make slice(0, n) drop steps
  // from the END of the path (revealing almost everything), and a fractional
  // count would index steps[n - 1] at a non-integer position.
  const requestedSteps = Number.isFinite(revealCount) ? Math.trunc(revealCount) : 0;
  const stepsToShow = Math.min(Math.max(requestedSteps, 0), totalSteps);

  // Store hint state for future calls
  SessionManager.setHintState(sessionId, {
    expression,
    revealCount: stepsToShow,
    totalSteps,
    simplified: pathResult.simplified,
  });

  // Build steps array: everything up to the reveal point, or just the newest step
  const visibleSteps: SimplificationStep[] = (
    cumulative
      ? pathResult.steps.slice(0, stepsToShow)
      : stepsToShow > 0
        ? [pathResult.steps[stepsToShow - 1]]
        : []
  ).filter((s: SimplificationStep | undefined): s is SimplificationStep => s !== undefined);

  // Get the result at the revealed step (the untouched expression when nothing is revealed)
  const lastStep = stepsToShow > 0 ? pathResult.steps[stepsToShow - 1] : undefined;
  const currentSimplified = lastStep?.after ?? expression;

  // Stream the hint
  const isContinuing = existingState?.expression === expression;
  if (totalSteps === 0) {
    await streamContent({
      type: "text",
      text: `✓ Expression "${expression}" is already simplified.\n`,
    });
  } else {
    const continueLabel = isContinuing ? " (continued)" : "";
    await streamContent({
      type: "text",
      text: `💡 **Simplification Hint${continueLabel}** (step ${stepsToShow}/${totalSteps})\n\n`,
    });

    for (const step of visibleSteps) {
      await streamContent({
        type: "text",
        text:
          `**Step ${step.step}:** ${step.transformation}\n` +
          ` ${step.before} → ${step.after}\n` +
          ` _${step.description}_\n\n`,
      });
    }

    if (stepsToShow < totalSteps) {
      await streamContent({
        type: "text",
        text: `_${totalSteps - stepsToShow} more step(s) available. Call hint again to reveal next step._\n`,
      });
    } else {
      await streamContent({
        type: "text",
        text: `✓ **Final simplified form:** ${pathResult.simplified}\n`,
      });
    }
  }

  return {
    session_id: sessionId,
    current_step: 0,
    branch: "main",
    operation: "hint",
    chain_confidence: 0,
    confidence_threshold: threshold,
    steps_with_confidence: 0,
    status: "continue",
    suggested_action:
      stepsToShow < totalSteps
        ? `${totalSteps - stepsToShow} more steps available. Call hint again to continue.`
        : "Expression fully simplified",
    hint_result: {
      success: true,
      original: expression,
      simplified: currentSimplified,
      steps_shown: stepsToShow,
      total_steps: totalSteps,
      steps: visibleSteps.map((s: SimplificationStep) => ({
        step_number: s.step,
        transformation: s.transformation,
        description: s.description,
        from: s.before,
        to: s.after,
      })),
      has_more: stepsToShow < totalSteps,
    },
  };
}
2217
+
2218
/**
 * Handle the `mistakes` operation: scan `args.text` for common algebraic
 * mistakes and report them.
 *
 * Findings are streamed one bullet per mistake, followed by the corrected form
 * when one is available or, failing that, the textual suggestion. The response
 * echoes at most the first 200 characters of the checked text.
 *
 * @param args - Scratchpad arguments; `text` is required.
 * @param ctx - MCP context supplying `streamContent`.
 * @returns Response with `mistakes_result`; status is always "continue".
 * @throws Error when `text` is missing.
 */
async function handleMistakes(args: ScratchpadArgs, ctx: MCPContext): Promise<ScratchpadResponse> {
  const { streamContent } = ctx;
  const say = (text: string) => streamContent({ type: "text", text });

  // Runtime validation: text is required for mistakes operation
  const text = args.text;
  if (!text) {
    throw new Error("text is required for mistakes operation");
  }

  const sessionId = args.session_id || `mistakes-${Date.now()}`;
  const threshold = args.confidence_threshold ?? 0.8;

  // A missing detection result is treated the same as "no mistakes"
  const detection = detectCommonMistakesFromText(text);
  const mistakes = detection?.mistakes ?? [];
  const mistakesFound = mistakes.length;
  const anyFound = mistakesFound !== 0;

  if (!anyFound) {
    await say(
      `✓ **No common algebraic mistakes detected**\n\n_Note: This checks for sign errors, distribution errors, exponent mistakes, etc. It doesn't guarantee correctness._\n`,
    );
  } else {
    const plural = mistakesFound > 1 ? "s" : "";
    await say(`⚠️ **Found ${mistakesFound} potential algebraic mistake${plural}:**\n\n`);

    for (const mistake of mistakes) {
      await say(`• **${mistake.type}**: ${mistake.explanation}\n`);
      // Prefer the concrete corrected form over the generic suggestion
      if (mistake.suggestedFix) {
        await say(` **Corrected:** \`${mistake.suggestedFix}\`\n`);
      } else if (mistake.suggestion) {
        await say(` _Fix: ${mistake.suggestion}_\n`);
      }
    }
  }

  const preview = text.slice(0, 200) + (text.length > 200 ? "..." : "");
  const suggestedAction =
    mistakesFound > 0
      ? `Found ${mistakesFound} potential mistake(s). Review and revise if needed.`
      : "No common mistakes detected.";

  return {
    session_id: sessionId,
    current_step: 0,
    branch: "main",
    operation: "mistakes",
    chain_confidence: 0,
    confidence_threshold: threshold,
    steps_with_confidence: 0,
    status: "continue",
    suggested_action: suggestedAction,
    mistakes_result: {
      text_checked: preview,
      mistakes_found: mistakesFound,
      mistakes: mistakes.map((m: DetectedMistake) => ({
        type: m.type,
        description: m.explanation,
        fix: m.suggestion,
        corrected_step: m.suggestedFix,
      })),
    },
  };
}
2292
+
2293
/**
 * Handle the `spot_check` operation: run trap-pattern detection for a
 * question/answer pair supplied directly by the caller.
 *
 * @param args - Scratchpad arguments; `question` and `answer` are both required.
 * @param ctx - MCP context supplying `streamContent`.
 * @returns Response with `spot_check_result`; status is "continue" when the
 *   check passed and "review" otherwise.
 * @throws Error when `question` or `answer` is missing.
 */
async function handleSpotCheck(args: ScratchpadArgs, ctx: MCPContext): Promise<ScratchpadResponse> {
  const { streamContent } = ctx;
  const say = (text: string) => streamContent({ type: "text", text });

  // Runtime validation: question and answer are required for spot_check operation
  const { question, answer } = args;
  if (!question) {
    throw new Error("question is required for spot_check operation");
  }
  if (!answer) {
    throw new Error("answer is required for spot_check operation");
  }

  const sessionId = args.session_id || `spot-check-${Date.now()}`;
  const threshold = args.confidence_threshold ?? 0.8;

  const outcome = spotCheck(question, answer);
  const { passed, trapType, warning, hint } = outcome;

  if (passed) {
    await say(
      `✓ **No trap patterns detected**\n\n_Answer "${answer}" does not match known cognitive trap patterns for this question type._\n`,
    );
  } else {
    // Headline first, then whichever details the check produced, then the closing advice
    await say(`⚠️ **Potential trap detected: ${trapType}**\n\n`);
    if (warning) {
      await say(`**Warning:** ${warning}\n`);
    }
    if (hint) {
      await say(`**Hint:** ${hint}\n`);
    }
    await say(`\n_Consider rechecking your reasoning before finalizing this answer._\n`);
  }

  return {
    session_id: sessionId,
    current_step: 0,
    branch: "main",
    operation: "spot_check",
    chain_confidence: 0,
    confidence_threshold: threshold,
    steps_with_confidence: 0,
    status: passed ? "continue" : "review",
    suggested_action: passed
      ? "No trap patterns detected. Answer appears safe."
      : `Potential ${trapType} trap detected. Review reasoning before finalizing.`,
    spot_check_result: {
      passed,
      trap_type: trapType,
      warning,
      hint,
      confidence: outcome.confidence,
    },
  };
}
2363
+
2364
/**
 * Handle the `challenge` operation: run the adversarial self-check over the
 * steps of one branch and report the generated counterarguments.
 *
 * Challenges are streamed grouped by severity (high, then medium, then low).
 * A branch with no steps yields an empty result without invoking the challenge
 * routine.
 *
 * @param args - Scratchpad arguments; `session_id` is required, `branch_id`
 *   defaults to "main", `target_claim` is forwarded to the challenge routine.
 * @param ctx - MCP context supplying `streamContent`.
 * @returns Response with `challenge_result`; status is "review" when at least
 *   one high-severity challenge exists, "continue" otherwise.
 * @throws Error when `session_id` is missing or no such session exists.
 */
async function handleChallenge(args: ScratchpadArgs, ctx: MCPContext): Promise<ScratchpadResponse> {
  const { streamContent } = ctx;
  const say = (text: string) => streamContent({ type: "text", text });

  const sessionId = args.session_id;
  if (!sessionId) {
    throw new Error("session_id required for challenge operation");
  }
  if (!SessionManager.get(sessionId)) {
    throw new Error(`Session not found: ${sessionId}`);
  }

  const threshold = args.confidence_threshold ?? 0.8;
  const branchId = args.branch_id || "main";

  const branchThoughts = SessionManager.getThoughts(sessionId, branchId);
  if (branchThoughts.length === 0) {
    await say("⚠️ No reasoning steps to challenge. Add steps first.\n");

    return {
      session_id: sessionId,
      current_step: 0,
      branch: branchId,
      operation: "challenge",
      chain_confidence: 0,
      confidence_threshold: threshold,
      steps_with_confidence: 0,
      status: "continue",
      suggested_action: "Add reasoning steps before running challenge.",
      challenge_result: {
        challenges_generated: 0,
        challenges: [],
        overall_robustness: 1.0,
        summary: "No steps to challenge.",
      },
    };
  }

  // The challenge routine takes plain { step, thought } pairs
  const stepData = branchThoughts.map((t) => ({ step: t.step_number, thought: t.thought }));
  const result = challenge(stepData, args.target_claim);

  // The high-severity count drives the banner, the status and the suggested action
  const highCount = result.challenges.filter((c) => c.severity === "high").length;
  const robustnessPct = (result.overall_robustness * 100).toFixed(0);

  if (result.challenges_generated === 0) {
    await say(
      `✓ **No significant challenges found**\n` +
        `Robustness: ${robustnessPct}%\n\n` +
        `_Reasoning appears robust against common counterarguments._\n`,
    );
  } else {
    const medCount = result.challenges.filter((c) => c.severity === "medium").length;

    await say(
      `⚡ **Adversarial Challenge Results**\n` +
        ` Challenges: ${result.challenges_generated} (${highCount} high, ${medCount} medium)\n` +
        ` Robustness: ${robustnessPct}%\n\n`,
    );

    // Group by severity for better readability
    const severityBadge = { high: "🔴", medium: "🟡", low: "🟢" } as const;
    for (const severity of ["high", "medium", "low"] as const) {
      const group = result.challenges.filter((c) => c.severity === severity);
      if (group.length === 0) continue;

      await say(`**${severityBadge[severity]} ${severity.toUpperCase()} Severity:**\n`);

      for (const item of group) {
        // Claims are shown truncated to 60 characters
        const claim = item.original_claim;
        const ellipsis = claim.length > 60 ? "..." : "";
        await say(
          `• **${item.type}**: ${item.challenge}\n` +
            ` _Claim: "${claim.slice(0, 60)}${ellipsis}"_\n` +
            ` 💡 ${item.suggested_response}\n\n`,
        );
      }
    }
  }

  // Calculate confidence for session
  const confState = calculateConfidence(sessionId, branchId);
  const status = highCount > 0 ? "review" : "continue";

  let suggestedAction: string;
  if (result.challenges_generated === 0) {
    suggestedAction = "Reasoning appears robust. Proceed to complete.";
  } else if (highCount > 0) {
    suggestedAction = `Found ${highCount} high-severity challenge(s). Address before finalizing.`;
  } else {
    suggestedAction = `Found ${result.challenges_generated} challenge(s). Consider addressing before completion.`;
  }

  return {
    session_id: sessionId,
    current_step: SessionManager.getCurrentStep(sessionId, branchId),
    branch: branchId,
    operation: "challenge",
    chain_confidence: confState.chainConfidence,
    confidence_threshold: threshold,
    steps_with_confidence: confState.stepsWithConfidence,
    status,
    suggested_action: suggestedAction,
    challenge_result: {
      challenges_generated: result.challenges_generated,
      challenges: result.challenges.map((c) => ({
        type: c.type,
        original_claim: c.original_claim,
        challenge: c.challenge,
        severity: c.severity,
        suggested_response: c.suggested_response,
      })),
      overall_robustness: result.overall_robustness,
      summary: result.summary,
    },
  };
}
2492
+
2493
+ // ============================================================================
2494
+ // SCRATCHPAD TOOL
2495
+ // ============================================================================
2496
+
2497
+ export const scratchpadTool = {
2498
+ name: "scratchpad",
2499
+ description: `Structured reasoning with verification, trap detection, and self-challenge.
2500
+
2501
+ OPS:
2502
+ step thought= [question= on 1st] → Add reasoning step. Auto-verifies at step 4+.
2503
+ complete [final_answer=] [summary=] → Finalize chain. Auto spot-checks answer.
2504
+ revise target_step= thought= [reason=] → Fix a step (after verification fail or trap warning).
2505
+ branch thought= [from_step=] [hypothesis=] → Fork reasoning path to test alternative.
2506
+ navigate view=history|branches|step|path [step_id=] → Inspect session state.
2507
+ augment text= → Compute math expressions, inject results.
2508
+ hint [expression=] → Progressive simplification hints (auto-continues).
2509
+ mistakes text= → Check for algebraic errors.
2510
+ spot_check question= answer= → Manual trap pattern detection.
2511
+ challenge [target_claim=] → Adversarial self-check. Generates counterarguments.
2512
+ override failed_step= [reason=] → Force-commit after verification fail.
2513
+
2514
+ DEFAULTS:
2515
+ confidence_threshold=0.8 token_budget=3000
2516
+
2517
+ STATUS → ACTION:
2518
+ continue → Add more steps
2519
+ threshold_reached → Consider complete or add verification step
2520
+ review → Trap/drift detected. Use reconsideration.suggested_revise
2521
+ verification_failed → revise target_step | branch from prior | override
2522
+ budget_exhausted → complete or new session
2523
+
2524
+ FLOW:
2525
+ 1. step(question=, thought=) → primes trap detection
2526
+ 2. step(thought=) × N → auto-verify at 4+
2527
+ 3. [optional] challenge() → adversarial self-check
2528
+ 4. complete(final_answer=) → spot-check, returns status
2529
+ 5. If review: revise per reconsideration.suggested_revise`,
2530
+
2531
+ parameters: ScratchpadSchema,
2532
+
2533
+ annotations: {
2534
+ streamingHint: true,
2535
+ },
2536
+
2537
/**
 * Runs one scratchpad operation and returns its JSON-serialized response.
 *
 * Flow:
 *  1. If `hard_limit_tokens` and `session_id` are both set and the session's
 *     recorded token total has reached the limit, short-circuit with a
 *     `budget_exhausted` response instead of running the operation.
 *  2. Otherwise dispatch on `args.operation` to the matching handler.
 *  3. Attach per-call token usage and the cumulative session totals, plus a
 *     `token_warning` when the session total exceeds `warn_at_tokens`.
 *
 * @param args - Validated scratchpad arguments; `operation` selects the handler.
 * @param ctx - MCP context, forwarded unchanged to the handler.
 * @returns A content array holding a single text item. Successful and
 *   budget-exhausted responses are pretty-printed JSON inside a fenced
 *   `json` block; failures are compact JSON of the form `{ error, tokens, session_tokens? }`.
 *   Errors thrown by a handler are caught and reported this way rather than rethrown.
 */
execute: async (args: ScratchpadArgs, ctx: MCPContext) => {
  try {
    // Check hard budget limit BEFORE processing operation.
    // `complete` is deliberately exempt: the budget-exhausted response below
    // instructs the caller to use the complete operation to finalize, so that
    // operation must stay reachable once the limit has been hit.
    if (args.hard_limit_tokens && args.session_id && args.operation !== "complete") {
      const existingTokens = getSessionTokens(args.session_id);
      if (existingTokens && existingTokens.total >= args.hard_limit_tokens) {
        const budgetExhaustedResponse: ScratchpadResponse = {
          session_id: args.session_id,
          current_step: 0,
          branch: "main",
          operation: args.operation,
          chain_confidence: 0,
          confidence_threshold: args.confidence_threshold,
          steps_with_confidence: 0,
          status: "budget_exhausted",
          suggested_action:
            "Token budget exhausted. Complete the reasoning chain with your current answer or start a new session.",
          session_tokens: existingTokens,
          budget_exhausted: {
            limit: args.hard_limit_tokens,
            current: existingTokens.total,
            exceeded_by: existingTokens.total - args.hard_limit_tokens,
            message: `Session has used ${existingTokens.total} tokens, exceeding hard limit of ${args.hard_limit_tokens}.`,
            recommendation:
              "Use complete operation to finalize your answer, or start a fresh session for new reasoning.",
          },
        };
        // The rejected call itself is not added to the session token totals.
        return {
          content: [
            {
              type: "text" as const,
              text: `\n\`\`\`json\n${JSON.stringify(budgetExhaustedResponse, null, 2)}\n\`\`\``,
            },
          ],
        };
      }
    }

    let response: ScratchpadResponse;

    switch (args.operation) {
      case "step":
        response = await handleStep(args, ctx);
        break;
      case "navigate":
        response = await handleNavigate(args, ctx);
        break;
      case "branch":
        response = await handleBranch(args, ctx);
        break;
      case "revise":
        response = await handleRevise(args, ctx);
        break;
      case "complete":
        response = await handleComplete(args, ctx);
        break;
      case "augment":
        response = await handleAugment(args, ctx);
        break;
      case "override":
        response = await handleOverride(args, ctx);
        break;
      case "hint":
        response = await handleHint(args, ctx);
        break;
      case "mistakes":
        response = await handleMistakes(args, ctx);
        break;
      case "spot_check":
        response = await handleSpotCheck(args, ctx);
        break;
      case "challenge":
        response = await handleChallenge(args, ctx);
        break;
      default:
        // Only reachable when the runtime value falls outside the declared
        // operation set; the cast reads the value for the error message.
        throw new Error(`Unknown operation: ${(args as { operation: string }).operation}`);
    }

    // Add token usage to response
    const tokens = calculateTokenUsage(args, response);
    response.tokens = tokens;

    // Track cumulative session tokens (keyed by the handler's session id,
    // which is used even when the caller did not pass `args.session_id`).
    const sessionTokens = trackSessionTokens(response.session_id, tokens);
    response.session_tokens = sessionTokens;

    // Check token budget warning threshold (strictly greater than, unlike the
    // hard limit above which triggers on >=).
    if (args.warn_at_tokens && sessionTokens.total > args.warn_at_tokens) {
      response.token_warning = {
        threshold: args.warn_at_tokens,
        current: sessionTokens.total,
        exceeded_by: sessionTokens.total - args.warn_at_tokens,
        message: `Session token usage (${sessionTokens.total}) exceeds threshold (${args.warn_at_tokens}). Consider completing or compressing.`,
      };
    }

    return {
      content: [
        {
          type: "text" as const,
          text: `\n\`\`\`json\n${JSON.stringify(response, null, 2)}\n\`\`\``,
        },
      ],
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : "Unknown error";
    const errorResponse: {
      error: string;
      tokens?: ReturnType<typeof calculateTokenUsage>;
      session_tokens?: ReturnType<typeof trackSessionTokens>;
    } = { error: message };
    const tokens = calculateTokenUsage(args, errorResponse);
    errorResponse.tokens = tokens;
    // Track session tokens even on error for accurate budget monitoring
    if (args.session_id) {
      errorResponse.session_tokens = trackSessionTokens(args.session_id, tokens);
    }
    // Error payload is emitted as compact JSON without the fenced block used
    // on the success path.
    return {
      content: [{ type: "text" as const, text: JSON.stringify(errorResponse) }],
    };
  }
},
2659
+ };