verifiable-thinking-mcp 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +339 -0
  3. package/package.json +75 -0
  4. package/src/index.ts +38 -0
  5. package/src/lib/cache.ts +246 -0
  6. package/src/lib/compression.ts +804 -0
  7. package/src/lib/compute/cache.ts +86 -0
  8. package/src/lib/compute/classifier.ts +555 -0
  9. package/src/lib/compute/confidence.ts +79 -0
  10. package/src/lib/compute/context.ts +154 -0
  11. package/src/lib/compute/extract.ts +200 -0
  12. package/src/lib/compute/filter.ts +224 -0
  13. package/src/lib/compute/index.ts +171 -0
  14. package/src/lib/compute/math.ts +247 -0
  15. package/src/lib/compute/patterns.ts +564 -0
  16. package/src/lib/compute/registry.ts +145 -0
  17. package/src/lib/compute/solvers/arithmetic.ts +65 -0
  18. package/src/lib/compute/solvers/calculus.ts +249 -0
  19. package/src/lib/compute/solvers/derivation-core.ts +371 -0
  20. package/src/lib/compute/solvers/derivation-latex.ts +160 -0
  21. package/src/lib/compute/solvers/derivation-mistakes.ts +1046 -0
  22. package/src/lib/compute/solvers/derivation-simplify.ts +451 -0
  23. package/src/lib/compute/solvers/derivation-transform.ts +620 -0
  24. package/src/lib/compute/solvers/derivation.ts +67 -0
  25. package/src/lib/compute/solvers/facts.ts +120 -0
  26. package/src/lib/compute/solvers/formula.ts +728 -0
  27. package/src/lib/compute/solvers/index.ts +36 -0
  28. package/src/lib/compute/solvers/logic.ts +422 -0
  29. package/src/lib/compute/solvers/probability.ts +307 -0
  30. package/src/lib/compute/solvers/statistics.ts +262 -0
  31. package/src/lib/compute/solvers/word-problems.ts +408 -0
  32. package/src/lib/compute/types.ts +107 -0
  33. package/src/lib/concepts.ts +111 -0
  34. package/src/lib/domain.ts +731 -0
  35. package/src/lib/extraction.ts +912 -0
  36. package/src/lib/index.ts +122 -0
  37. package/src/lib/judge.ts +260 -0
  38. package/src/lib/math/ast.ts +842 -0
  39. package/src/lib/math/index.ts +8 -0
  40. package/src/lib/math/operators.ts +171 -0
  41. package/src/lib/math/tokenizer.ts +477 -0
  42. package/src/lib/patterns.ts +200 -0
  43. package/src/lib/session.ts +825 -0
  44. package/src/lib/think/challenge.ts +323 -0
  45. package/src/lib/think/complexity.ts +504 -0
  46. package/src/lib/think/confidence-drift.ts +507 -0
  47. package/src/lib/think/consistency.ts +347 -0
  48. package/src/lib/think/guidance.ts +188 -0
  49. package/src/lib/think/helpers.ts +568 -0
  50. package/src/lib/think/hypothesis.ts +216 -0
  51. package/src/lib/think/index.ts +127 -0
  52. package/src/lib/think/prompts.ts +262 -0
  53. package/src/lib/think/route.ts +358 -0
  54. package/src/lib/think/schema.ts +98 -0
  55. package/src/lib/think/scratchpad-schema.ts +662 -0
  56. package/src/lib/think/spot-check.ts +961 -0
  57. package/src/lib/think/types.ts +93 -0
  58. package/src/lib/think/verification.ts +260 -0
  59. package/src/lib/tokens.ts +177 -0
  60. package/src/lib/verification.ts +620 -0
  61. package/src/prompts/index.ts +10 -0
  62. package/src/prompts/templates.ts +336 -0
  63. package/src/resources/index.ts +8 -0
  64. package/src/resources/sessions.ts +196 -0
  65. package/src/tools/compress.ts +138 -0
  66. package/src/tools/index.ts +5 -0
  67. package/src/tools/scratchpad.ts +2659 -0
  68. package/src/tools/sessions.ts +144 -0
@@ -0,0 +1,662 @@
1
+ /**
2
+ * Scratchpad Schema - Unified CRASH-style reasoning tool
3
+ *
4
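+ * Single tool with operation-based dispatch. Core operations are listed below; the schema also accepts augment, override, hint, mistakes, spot_check, and challenge: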
5
+ * - step: Add a thought (auto-increments step number)
6
+ * - navigate: View history, branches, specific step, or path
7
+ * - branch: Start alternative reasoning path
8
+ * - revise: Correct earlier step
9
+ * - complete: Finalize reasoning chain
10
+ *
11
+ * Note: Uses a flat object schema for MCP SDK compatibility.
12
+ * The MCP spec requires inputSchema.type = "object", but Zod's
13
+ * discriminatedUnion produces "oneOf" which fails validation.
14
+ */
15
+
16
+ import { z } from "zod";
17
+
18
+ // ============================================================================
19
+ // FLAT SCHEMA (MCP-compatible: type="object" at top level)
20
+ // ============================================================================
21
+
22
/** Values accepted by the `operation` discriminator. */
const SCRATCHPAD_OPERATIONS = [
  "step",
  "navigate",
  "branch",
  "revise",
  "complete",
  "augment",
  "override",
  "hint",
  "mistakes",
  "spot_check",
  "challenge",
] as const;

/** Values accepted by the `purpose` field. */
const STEP_PURPOSES = [
  "analysis",
  "action",
  "reflection",
  "decision",
  "summary",
  "validation",
  "exploration",
  "hypothesis",
  "correction",
  "planning",
] as const;

/** Values accepted by the `domain` field. */
const DOMAINS = ["math", "logic", "code", "general"] as const;

/** Values accepted by the `view` field. */
const NAVIGATE_VIEWS = ["history", "branches", "step", "path"] as const;

/** Values accepted by the `challenge_type` field. */
const CHALLENGE_TYPES = [
  "assumption_inversion",
  "edge_case",
  "premise_check",
  "steelman_counter",
  "all",
] as const;

/** Optional free-text field carrying the given description. */
const optionalText = (description: string) => z.string().optional().describe(description);

/** Boolean field that falls back to `fallback` when omitted. */
const booleanFlag = (fallback: boolean, description: string) =>
  z.boolean().default(fallback).describe(description);

/** Optional integer field that must be at least 1 (step numbers, counts). */
const optionalPositiveInt = (description: string) =>
  z.number().int().min(1).optional().describe(description);

/** Optional integer token threshold that must be at least 100. */
const optionalTokenLimit = (description: string) =>
  z.number().int().min(100).optional().describe(description);

/** Number constrained to the closed interval [0, 1]. */
const unitInterval = () => z.number().min(0).max(1);

/**
 * Flat input schema for the scratchpad tool.
 *
 * Only `operation` is required. Every other field is either optional or has a
 * default, so this schema alone does not enforce which fields belong to which
 * operation. Fields are grouped below by the operation they are meant for.
 */
export const ScratchpadSchema = z.object({
  // --- Required: operation discriminator ---
  operation: z.enum(SCRATCHPAD_OPERATIONS).describe("Operation to perform"),

  // --- Common fields (all operations) ---
  session_id: optionalText("Session ID (auto-generated if omitted)"),
  confidence_threshold: unitInterval()
    .default(0.8)
    .describe("Chain confidence threshold to suggest completion"),
  token_budget: z
    .number()
    .int()
    .min(100)
    .default(3000)
    .describe("Max tokens before auto-compressing new steps"),
  warn_at_tokens: optionalTokenLimit(
    "Warn when cumulative session tokens exceed this threshold (soft limit, cost control)",
  ),
  hard_limit_tokens: optionalTokenLimit(
    "Hard stop when cumulative session tokens exceed this threshold. Returns budget_exhausted status and blocks further operations.",
  ),

  // --- Step operation ---
  thought: optionalText("Current reasoning/analysis (step/branch/revise)"),
  purpose: z.enum(STEP_PURPOSES).optional().describe("Step category"),
  outcome: optionalText("Result or conclusion from this step"),
  confidence: unitInterval()
    .optional()
    .describe("Confidence in this step (0-1). Contributes to chain average."),
  context: optionalText("Prior context or findings"),
  verify: z
    .boolean()
    .optional()
    .describe("Run domain verification. Auto-enabled for chains >3 steps. Set to false to disable."),
  // NOTE(review): this is the only field in the schema without a .describe();
  // left as-is here so the generated JSON schema stays unchanged.
  domain: z.enum(DOMAINS).optional(),
  local_compute: booleanFlag(false, "Try local compute for math"),
  augment_compute: booleanFlag(true, "Auto-inject computed values into thought (default: true)"),
  compress: booleanFlag(false, "Compress thought before storing"),
  compression_query: optionalText("Query for context-aware compression"),
  max_step_tokens: z
    .number()
    .int()
    .min(10)
    .optional()
    .describe("Max tokens for this step. Rejects if exceeded (default: no limit)"),
  force_large: booleanFlag(false, "Allow step even if it exceeds max_step_tokens"),
  preconditions: z
    .array(z.string())
    .optional()
    .describe("Assumptions that MUST be true for this step (e.g., 'x > 0', 'file exists')"),

  // --- Navigate operation ---
  view: z
    .enum(NAVIGATE_VIEWS)
    .optional()
    .describe("What to view: history (all steps), branches (list), step (specific), path (lineage)"),
  step_id: optionalPositiveInt("Step number to view"),
  branch_id: optionalText("Filter history by branch"),
  limit: z.number().int().min(1).max(50).default(10).describe("Max steps to return"),

  // --- Branch operation ---
  from_step: optionalPositiveInt("Step to branch from (default: current)"),
  branch_name: optionalText("Human-readable branch name"),
  hypothesis: optionalText(
    "Falsifiable hypothesis this branch will test (e.g., 'Assume X is prime')",
  ),
  success_criteria: optionalText("What observation proves/disproves this hypothesis"),

  // --- Revise operation ---
  target_step: optionalPositiveInt("Step number to revise"),
  reason: optionalText("Why revising this step / Why overriding verification"),

  // --- Complete operation ---
  summary: optionalText("Final summary/conclusion"),
  final_answer: optionalText("The answer/result"),
  question: optionalText(
    "Original question. On step: enables trap priming and stores for auto spot-check. On complete: enables spot-check.",
  ),

  // --- Augment operation ---
  text: optionalText("Text containing math expressions to compute and inject (augment/mistakes)"),
  system_context: optionalText("System prompt context for domain filtering"),
  store_as_step: booleanFlag(false, "Store augmented result as a reasoning step"),

  // --- Override operation ---
  acknowledge: z
    .boolean()
    .optional()
    .describe("Confirm you understand verification failed but want to proceed"),
  failed_step: optionalPositiveInt("Step number that failed verification"),

  // --- Hint operation ---
  expression: optionalText(
    "Math expression to simplify. Omit to continue from previous hint in session.",
  ),
  reveal_count: optionalPositiveInt(
    "Number of steps to reveal. Omit to auto-increment when continuing.",
  ),
  cumulative: booleanFlag(
    true,
    "Show all steps up to reveal_count (true) or just the nth step (false)",
  ),
  reset: booleanFlag(false, "Reset hint state and start from beginning"),

  // --- Spot check operation ---
  answer: optionalText("The proposed answer to check for trap patterns"),

  // --- Challenge operation ---
  challenge_type: z
    .enum(CHALLENGE_TYPES)
    .optional()
    .describe("Type of challenge to generate (default: all)"),
  target_claim: optionalText(
    "Specific claim to challenge (optional - if omitted, extracts claims from steps)",
  ),
});
204
+
205
/** Argument object for the scratchpad tool, inferred from `ScratchpadSchema`. */
export type ScratchpadArgs = z.infer<typeof ScratchpadSchema>;

/**
 * The flat argument shape with `operation` pinned to a single literal.
 * All fields remain present on every alias; only the discriminator is narrowed.
 */
type ArgsFor<Op extends ScratchpadArgs["operation"]> = ScratchpadArgs & { operation: Op };

// Operation-specific aliases: same flat structure, narrowed discriminator, for call-site clarity.
export type StepOperation = ArgsFor<"step">;
export type NavigateOperation = ArgsFor<"navigate">;
export type BranchOperation = ArgsFor<"branch">;
export type ReviseOperation = ArgsFor<"revise">;
export type CompleteOperation = ArgsFor<"complete">;
export type AugmentOperation = ArgsFor<"augment">;
export type OverrideOperation = ArgsFor<"override">;
export type HintOperation = ArgsFor<"hint">;
export type MistakesOperation = ArgsFor<"mistakes">;
export type SpotCheckOperation = ArgsFor<"spot_check">;
export type ChallengeOperation = ArgsFor<"challenge">;
219
+
220
+ // ============================================================================
221
+ // RESPONSE TYPES
222
+ // ============================================================================
223
+
224
/**
 * Recovery actions offered when a step fails verification.
 * Each member describes one way to proceed.
 */
export interface RecoveryOptions {
  /** Revise the failed step in place. */
  revise: { target_step: number; suggested_reason: string };
  /** Start a branch to try an alternative approach. */
  branch: { from_step: number; suggested_name: string };
  /** Override and continue anyway (for when the heuristic is wrong). */
  override: { flag: "force_continue"; warning: string };
}
242
+
243
/**
 * One algebraic mistake found in a piece of reasoning.
 */
export interface DetectedMistakeInfo {
  /**
   * Kind of mistake, e.g. `sign_error` or `distribution_error`.
   */
  type: string;

  /**
   * Human-readable description of the mistake.
   */
  description: string;

  /**
   * Specific suggestion for fixing it, when one is available.
   */
  fix?: string;

  /**
   * The complete corrected step, e.g. "2x + 3x = 5x".
   */
  corrected_step?: string;
}
254
+
255
/**
 * Describes why a step failed verification and how the caller can recover.
 */
export interface VerificationFailure {
  /** Which check failed. */
  issue: string;

  /** Specific evidence of the problem. */
  evidence: string;

  /** Suggestions for fixing the step. */
  suggestions: Array<string>;

  /** Confidence in the failure detection; higher means more certain the step is wrong. */
  confidence: number;

  /** Domain that was checked. */
  domain: string;

  /** Algebraic mistakes that were detected (math domain only). */
  detected_mistakes?: Array<DetectedMistakeInfo>;

  /** Recovery actions available to the caller. */
  recovery_options: RecoveryOptions;
}
272
+
273
/** Values of `ScratchpadResponse.status`. */
type ScratchpadStatus =
  | "continue"
  | "review"
  | "threshold_reached"
  | "complete"
  | "verification_failed"
  | "budget_exhausted";

/** The individual challenge kinds (excludes the "all" selector). */
type ChallengeKind = "assumption_inversion" | "edge_case" | "premise_check" | "steelman_counter";

/** Pattern classifications reported by confidence drift detection. */
type ConfidenceDriftPattern =
  | "stable"
  | "stable_overconfident"
  | "declining"
  | "improving"
  | "v_shaped"
  | "oscillating"
  | "cliff"
  | "insufficient";

/** Complexity tiers used by stepping guidance. */
type ComplexityTier = "Low" | "Moderate" | "High" | "Very Hard" | "Almost Impossible";

/** Kinds of contradiction reported by the consistency check. */
type ContradictionKind =
  | "value_reassignment"
  | "logical_conflict"
  | "sign_flip"
  | "direction_reversal";

/**
 * Response object returned by the scratchpad tool.
 *
 * The state, confidence and status members are always present. The remaining
 * members are optional and are each tied to a particular operation or
 * condition, as noted on the member.
 */
export interface ScratchpadResponse {
  // ---- State ----
  session_id: string;
  current_step: number;
  branch: string;
  operation: string;

  // ---- Confidence tracking ----
  step_confidence?: number;
  chain_confidence: number;
  confidence_threshold: number;
  steps_with_confidence: number;

  // ---- Status & guidance ----
  status: ScratchpadStatus;
  suggested_action: string;

  /** Timer warning (when threshold reached). */
  auto_complete_warning?: string;

  /**
   * Verification failure details, present when `status === "verification_failed"`.
   * The step is NOT stored until a recovery action is taken.
   */
  verification_failure?: VerificationFailure;

  // ---- Navigate operation ----
  history?: {
    step: number;
    branch: string;
    purpose: string;
    thought_preview: string;
    confidence?: number;
    revised_by?: number;
  }[];
  branches?: {
    id: string;
    name: string;
    from_step: number;
    depth: number;
    /** Hypothesis this branch is testing (if provided). */
    hypothesis?: string;
    /** Criteria for proving/disproving the hypothesis. */
    success_criteria?: string;
  }[];
  path?: {
    step: number;
    branch: string;
    thought_preview: string;
  }[];
  step_detail?: {
    step: number;
    branch: string;
    purpose: string;
    thought: string;
    outcome?: string;
    confidence?: number;
    revises_step?: number;
    revised_by?: number;
    /** Preconditions/assumptions for this step. */
    preconditions?: string[];
    /** Hypothesis being tested (for branch steps). */
    hypothesis?: string;
    /** Success criteria for the hypothesis. */
    success_criteria?: string;
  };

  // ---- Complete operation ----
  final_summary?: string;
  total_steps?: number;

  // ---- Metadata ----
  verification?: {
    passed: boolean;
    confidence: number;
    domain: string;
  };
  local_compute?: {
    solved: boolean;
    result: unknown;
    method: string;
  };
  compression?: {
    applied: boolean;
    original_tokens: number;
    compressed_tokens: number;
    ratio: number;
  };

  /** Token budget tracking. */
  token_usage?: {
    total: number;
    budget: number;
    exceeded: boolean;
    auto_compressed: boolean;
    /** Percentage of budget consumed. */
    budget_percent: number;
  };

  /** Proactive compression suggestion (when approaching budget). */
  compression_suggestion?: {
    /** Whether compression is recommended now. */
    should_compress: boolean;
    /** Current session token total. */
    current_tokens: number;
    /** Budget threshold. */
    budget: number;
    /** Percentage consumed. */
    percent_used: number;
    /** Human-readable nudge. */
    nudge: string;
  };

  /** Augmentation results (when augment_compute=true). */
  augmentation?: {
    applied: boolean;
    computations: number;
    filtered: number;
    domain: string;
  };

  /** Session-level compression stats (for complete operation). */
  compression_stats?: {
    total_bytes_saved: number;
    steps_compressed: number;
    tokens?: {
      original: number;
      compressed: number;
      saved: number;
    };
  };

  // ---- Augment operation ----
  augmented_text?: string;
  computations?: {
    expression: string;
    result: unknown;
    method: string;
  }[];
  filtered_count?: number;
  detected_domain?: string;

  /** Next step suggestion for math derivations (auto-populated for math domain). */
  next_step_suggestion?: {
    hasSuggestion: boolean;
    transformation?: string;
    description?: string;
    currentExpression?: string;
    allApplicable?: { name: string; description: string }[];
  };

  /** Hint operation: progressive simplification hints. */
  hint_result?: {
    success: boolean;
    original: string;
    simplified: string;
    steps_shown: number;
    total_steps: number;
    steps: {
      step_number: number;
      transformation: string;
      description: string;
      from: string;
      to: string;
    }[];
    has_more: boolean;
  };

  /** Mistakes operation: proactive error checking. */
  mistakes_result?: {
    text_checked: string;
    mistakes_found: number;
    mistakes: DetectedMistakeInfo[];
  };

  /** Spot-check operation: trap pattern detection. */
  spot_check_result?: {
    passed: boolean;
    trap_type: string | null;
    warning: string | null;
    hint: string | null;
    confidence: number;
  };

  /** Reconsideration prompt (when spot-check fails during complete). */
  reconsideration?: {
    trap_type: string;
    hint: string;
    suggested_revise: {
      target_step: number;
      reason: string;
    };
  };

  /** Trap analysis (when question provided on step, informational only). */
  trap_analysis?: {
    detected: boolean;
    /** All detected trap types. */
    types: string[];
    /** How many traps were actually primed (≤ types.length). */
    primed_count: number;
    note: string | null;
    confidence: number;
  };

  /** Token usage metadata (always added by execute wrapper). */
  tokens?: {
    input_tokens: number;
    output_tokens: number;
    total_tokens: number;
  };

  /** Cumulative session token usage (always added by execute wrapper). */
  session_tokens?: {
    total_input: number;
    total_output: number;
    total: number;
    operations: number;
  };

  /** Token budget warning (when warn_at_tokens threshold exceeded). */
  token_warning?: {
    threshold: number;
    current: number;
    exceeded_by: number;
    message: string;
  };

  /** Hard budget limit (when hard_limit_tokens exceeded - operation blocked). */
  budget_exhausted?: {
    limit: number;
    current: number;
    exceeded_by: number;
    message: string;
    recommendation: string;
  };

  /**
   * Confidence Drift Detection (CDD) - novel meta-signal for reasoning quality.
   * Analyzes confidence trajectory shape to detect unresolved uncertainty.
   */
  confidence_drift?: {
    /** Overall drift score (0-1, higher = more concerning). */
    drift_score: number;
    /** Whether the drift represents unresolved uncertainty. */
    unresolved: boolean;
    /** Confidence at trajectory minimum. */
    min_confidence: number;
    /** Step number where minimum occurred. */
    min_step: number;
    /** Maximum confidence drop observed. */
    max_drop: number;
    /** Recovery magnitude from min to final. */
    recovery: number;
    /** Whether a revision step exists after the drop. */
    has_revision_after_drop: boolean;
    /** Pattern classification. */
    pattern: ConfidenceDriftPattern;
    /** Human-readable explanation. */
    explanation: string;
    /** Suggested action if unresolved. */
    suggestion: string | null;
  };

  /**
   * Proactive stepping guidance based on question complexity.
   * Provided on first step when question is supplied.
   */
  stepping_guidance?: {
    /** Complexity tier of the question. */
    complexity_tier: ComplexityTier;
    /** Recommended minimum steps for this complexity. */
    recommended_steps: number;
    /** Current step count. */
    current_steps: number;
    /** Whether more steps are recommended before completing. */
    needs_more_steps: boolean;
    /** Human-readable nudge. */
    nudge: string | null;
  };

  /**
   * Consistency check - detects contradictions across reasoning steps.
   * Checked every N steps (configurable) to catch logical inconsistencies.
   */
  consistency_warning?: {
    /** Whether contradictions were found. */
    has_contradictions: boolean;
    /** Number of contradictions detected. */
    count: number;
    /** The contradictions found. */
    contradictions: {
      /** Type of contradiction. */
      type: ContradictionKind;
      /** Human-readable description. */
      description: string;
      /** The variable/concept involved. */
      subject: string;
      /** Step where original claim was made. */
      original_step: number;
      /** Conflicting step number. */
      conflicting_step: number;
      /** Confidence in detection (0-1). */
      confidence: number;
    }[];
    /** Human-readable nudge. */
    nudge: string;
  };

  /**
   * Hypothesis resolution - detects when a branch's hypothesis is confirmed/refuted.
   * Only present for steps on branches with hypotheses.
   */
  hypothesis_resolution?: {
    /** Whether the hypothesis has been resolved. */
    resolved: boolean;
    /** Resolution outcome if resolved. */
    outcome: "confirmed" | "refuted" | "inconclusive" | null;
    /** Confidence in the resolution (0-1). */
    confidence: number;
    /** Step number where resolution was detected. */
    resolved_at_step: number | null;
    /** Evidence text that triggered resolution. */
    evidence: string | null;
    /** The original hypothesis being tested. */
    hypothesis: string;
    /** The success criteria (if provided). */
    success_criteria: string | null;
    /** Suggested action based on resolution. */
    suggestion: string;
  };

  /**
   * Challenge result - adversarial self-check for reasoning quality.
   * Only present for challenge operation.
   */
  challenge_result?: {
    /** Number of challenges generated. */
    challenges_generated: number;
    /** The challenges. */
    challenges: {
      /** Type of challenge. */
      type: ChallengeKind;
      /** The original claim being challenged. */
      original_claim: string;
      /** The challenge/counterargument. */
      challenge: string;
      /** How serious is this challenge. */
      severity: "low" | "medium" | "high";
      /** Suggested way to address this challenge. */
      suggested_response: string;
    }[];
    /** Overall robustness score (0-1). */
    overall_robustness: number;
    /** Summary of findings. */
    summary: string;
  };

  /**
   * Auto-challenge suggestion - triggered when overconfidence detected.
   * Present when shouldChallenge() returns true (high confidence with few steps).
   */
  challenge_suggestion?: {
    /** Whether a challenge is recommended. */
    should_challenge: boolean;
    /** Why challenge is suggested. */
    reason: string;
    /** Specific type of challenge recommended. */
    suggested_type: ChallengeKind | "all";
    /** Human-readable nudge. */
    nudge: string;
  };

  /**
   * Merge suggestion - triggered when branch hypothesis is confirmed.
   * Suggests merging branch findings back to main reasoning path.
   */
  merge_suggestion?: {
    /** Whether merge is recommended. */
    should_merge: boolean;
    /** Branch ID to merge from. */
    from_branch: string;
    /** The confirmed hypothesis. */
    confirmed_hypothesis: string;
    /** Key findings to incorporate. */
    key_findings: string;
    /** Human-readable suggestion. */
    nudge: string;
  };
}