verifiable-thinking-mcp 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +339 -0
- package/package.json +75 -0
- package/src/index.ts +38 -0
- package/src/lib/cache.ts +246 -0
- package/src/lib/compression.ts +804 -0
- package/src/lib/compute/cache.ts +86 -0
- package/src/lib/compute/classifier.ts +555 -0
- package/src/lib/compute/confidence.ts +79 -0
- package/src/lib/compute/context.ts +154 -0
- package/src/lib/compute/extract.ts +200 -0
- package/src/lib/compute/filter.ts +224 -0
- package/src/lib/compute/index.ts +171 -0
- package/src/lib/compute/math.ts +247 -0
- package/src/lib/compute/patterns.ts +564 -0
- package/src/lib/compute/registry.ts +145 -0
- package/src/lib/compute/solvers/arithmetic.ts +65 -0
- package/src/lib/compute/solvers/calculus.ts +249 -0
- package/src/lib/compute/solvers/derivation-core.ts +371 -0
- package/src/lib/compute/solvers/derivation-latex.ts +160 -0
- package/src/lib/compute/solvers/derivation-mistakes.ts +1046 -0
- package/src/lib/compute/solvers/derivation-simplify.ts +451 -0
- package/src/lib/compute/solvers/derivation-transform.ts +620 -0
- package/src/lib/compute/solvers/derivation.ts +67 -0
- package/src/lib/compute/solvers/facts.ts +120 -0
- package/src/lib/compute/solvers/formula.ts +728 -0
- package/src/lib/compute/solvers/index.ts +36 -0
- package/src/lib/compute/solvers/logic.ts +422 -0
- package/src/lib/compute/solvers/probability.ts +307 -0
- package/src/lib/compute/solvers/statistics.ts +262 -0
- package/src/lib/compute/solvers/word-problems.ts +408 -0
- package/src/lib/compute/types.ts +107 -0
- package/src/lib/concepts.ts +111 -0
- package/src/lib/domain.ts +731 -0
- package/src/lib/extraction.ts +912 -0
- package/src/lib/index.ts +122 -0
- package/src/lib/judge.ts +260 -0
- package/src/lib/math/ast.ts +842 -0
- package/src/lib/math/index.ts +8 -0
- package/src/lib/math/operators.ts +171 -0
- package/src/lib/math/tokenizer.ts +477 -0
- package/src/lib/patterns.ts +200 -0
- package/src/lib/session.ts +825 -0
- package/src/lib/think/challenge.ts +323 -0
- package/src/lib/think/complexity.ts +504 -0
- package/src/lib/think/confidence-drift.ts +507 -0
- package/src/lib/think/consistency.ts +347 -0
- package/src/lib/think/guidance.ts +188 -0
- package/src/lib/think/helpers.ts +568 -0
- package/src/lib/think/hypothesis.ts +216 -0
- package/src/lib/think/index.ts +127 -0
- package/src/lib/think/prompts.ts +262 -0
- package/src/lib/think/route.ts +358 -0
- package/src/lib/think/schema.ts +98 -0
- package/src/lib/think/scratchpad-schema.ts +662 -0
- package/src/lib/think/spot-check.ts +961 -0
- package/src/lib/think/types.ts +93 -0
- package/src/lib/think/verification.ts +260 -0
- package/src/lib/tokens.ts +177 -0
- package/src/lib/verification.ts +620 -0
- package/src/prompts/index.ts +10 -0
- package/src/prompts/templates.ts +336 -0
- package/src/resources/index.ts +8 -0
- package/src/resources/sessions.ts +196 -0
- package/src/tools/compress.ts +138 -0
- package/src/tools/index.ts +5 -0
- package/src/tools/scratchpad.ts +2659 -0
- package/src/tools/sessions.ts +144 -0
|
@@ -0,0 +1,662 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scratchpad Schema - Unified CRASH-style reasoning tool
|
|
3
|
+
*
|
|
4
|
+
* Single tool with operation-based dispatch:
|
|
5
|
+
* - step: Add a thought (auto-increments step number)
|
|
6
|
+
* - navigate: View history, branches, specific step, or path
|
|
7
|
+
* - branch: Start alternative reasoning path
|
|
8
|
+
* - revise: Correct earlier step
|
|
9
|
+
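* - complete: Finalize reasoning chain
* - augment: Compute math expressions in text and inject the results
* - override: Acknowledge a failed verification and proceed anyway
* - hint: Reveal progressive simplification steps for a math expression
* - mistakes: Check text for algebraic mistakes
* - spot_check: Check a proposed answer for trap patterns
* - challenge: Generate adversarial challenges to the reasoning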
|
|
10
|
+
*
|
|
11
|
+
* Note: Uses a flat object schema for MCP SDK compatibility.
|
|
12
|
+
* The MCP spec requires inputSchema.type = "object", but Zod's
|
|
13
|
+
* discriminatedUnion produces "oneOf" which fails validation.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { z } from "zod";
|
|
17
|
+
|
|
18
|
+
// ============================================================================
|
|
19
|
+
// FLAT SCHEMA (MCP-compatible: type="object" at top level)
|
|
20
|
+
// ============================================================================
|
|
21
|
+
|
|
22
|
+
/**
 * Flat Zod object schema for the scratchpad tool's input.
 *
 * `operation` is the only required field; it selects which operation runs.
 * Every other field is optional or carries a default, and the comments below
 * group the fields by the operation they belong to. The schema is kept as a
 * single `z.object` (rather than a discriminated union) so the generated
 * input schema has `type = "object"` at the top level.
 */
export const ScratchpadSchema = z.object({
  // Required: operation discriminator
  operation: z
    .enum([
      "step",
      "navigate",
      "branch",
      "revise",
      "complete",
      "augment",
      "override",
      "hint",
      "mistakes",
      "spot_check",
      "challenge",
    ])
    .describe("Operation to perform"),

  // Common fields (all operations)
  session_id: z.string().optional().describe("Session ID (auto-generated if omitted)"),
  confidence_threshold: z
    .number()
    .min(0)
    .max(1)
    .default(0.8)
    .describe("Chain confidence threshold to suggest completion"),
  token_budget: z
    .number()
    .int()
    .min(100)
    .default(3000)
    .describe("Max tokens before auto-compressing new steps"),
  warn_at_tokens: z
    .number()
    .int()
    .min(100)
    .optional()
    .describe(
      "Warn when cumulative session tokens exceed this threshold (soft limit, cost control)",
    ),
  hard_limit_tokens: z
    .number()
    .int()
    .min(100)
    .optional()
    .describe(
      "Hard stop when cumulative session tokens exceed this threshold. Returns budget_exhausted status and blocks further operations.",
    ),

  // Step operation fields
  thought: z.string().optional().describe("Current reasoning/analysis (step/branch/revise)"),
  purpose: z
    .enum([
      "analysis",
      "action",
      "reflection",
      "decision",
      "summary",
      "validation",
      "exploration",
      "hypothesis",
      "correction",
      "planning",
    ])
    .optional()
    .describe("Step category"),
  outcome: z.string().optional().describe("Result or conclusion from this step"),
  confidence: z
    .number()
    .min(0)
    .max(1)
    .optional()
    .describe("Confidence in this step (0-1). Contributes to chain average."),
  context: z.string().optional().describe("Prior context or findings"),
  verify: z
    .boolean()
    .optional()
    .describe(
      "Run domain verification. Auto-enabled for chains >3 steps. Set to false to disable.",
    ),
  // Previously the only field without a description; documented for consistency
  // so clients see it in the generated schema like every other field.
  domain: z
    .enum(["math", "logic", "code", "general"])
    .optional()
    .describe("Problem domain used for domain verification"),
  local_compute: z.boolean().default(false).describe("Try local compute for math"),
  augment_compute: z
    .boolean()
    .default(true)
    .describe("Auto-inject computed values into thought (default: true)"),
  compress: z.boolean().default(false).describe("Compress thought before storing"),
  compression_query: z.string().optional().describe("Query for context-aware compression"),
  max_step_tokens: z
    .number()
    .int()
    .min(10)
    .optional()
    .describe("Max tokens for this step. Rejects if exceeded (default: no limit)"),
  force_large: z.boolean().default(false).describe("Allow step even if it exceeds max_step_tokens"),
  preconditions: z
    .array(z.string())
    .optional()
    .describe("Assumptions that MUST be true for this step (e.g., 'x > 0', 'file exists')"),

  // Navigate operation fields
  view: z
    .enum(["history", "branches", "step", "path"])
    .optional()
    .describe(
      "What to view: history (all steps), branches (list), step (specific), path (lineage)",
    ),
  step_id: z.number().int().min(1).optional().describe("Step number to view"),
  branch_id: z.string().optional().describe("Filter history by branch"),
  limit: z.number().int().min(1).max(50).default(10).describe("Max steps to return"),

  // Branch operation fields
  from_step: z.number().int().min(1).optional().describe("Step to branch from (default: current)"),
  branch_name: z.string().optional().describe("Human-readable branch name"),
  hypothesis: z
    .string()
    .optional()
    .describe("Falsifiable hypothesis this branch will test (e.g., 'Assume X is prime')"),
  success_criteria: z
    .string()
    .optional()
    .describe("What observation proves/disproves this hypothesis"),

  // Revise operation fields
  target_step: z.number().int().min(1).optional().describe("Step number to revise"),
  reason: z.string().optional().describe("Why revising this step / Why overriding verification"),

  // Complete operation fields
  summary: z.string().optional().describe("Final summary/conclusion"),
  final_answer: z.string().optional().describe("The answer/result"),
  question: z
    .string()
    .optional()
    .describe(
      "Original question. On step: enables trap priming and stores for auto spot-check. On complete: enables spot-check.",
    ),

  // Augment operation fields
  text: z
    .string()
    .optional()
    .describe("Text containing math expressions to compute and inject (augment/mistakes)"),
  system_context: z.string().optional().describe("System prompt context for domain filtering"),
  store_as_step: z.boolean().default(false).describe("Store augmented result as a reasoning step"),

  // Override operation fields
  acknowledge: z
    .boolean()
    .optional()
    .describe("Confirm you understand verification failed but want to proceed"),
  failed_step: z.number().int().min(1).optional().describe("Step number that failed verification"),

  // Hint operation fields
  expression: z
    .string()
    .optional()
    .describe("Math expression to simplify. Omit to continue from previous hint in session."),
  reveal_count: z
    .number()
    .int()
    .min(1)
    .optional()
    .describe("Number of steps to reveal. Omit to auto-increment when continuing."),
  cumulative: z
    .boolean()
    .default(true)
    .describe("Show all steps up to reveal_count (true) or just the nth step (false)"),
  reset: z.boolean().default(false).describe("Reset hint state and start from beginning"),

  // Spot check operation fields
  answer: z.string().optional().describe("The proposed answer to check for trap patterns"),

  // Challenge operation fields
  challenge_type: z
    .enum(["assumption_inversion", "edge_case", "premise_check", "steelman_counter", "all"])
    .optional()
    .describe("Type of challenge to generate (default: all)"),
  target_claim: z
    .string()
    .optional()
    .describe("Specific claim to challenge (optional - if omitted, extracts claims from steps)"),
});
|
|
204
|
+
|
|
205
|
+
/** Argument type inferred from `ScratchpadSchema` via `z.infer`. */
export type ScratchpadArgs = z.infer<typeof ScratchpadSchema>;

/**
 * Pins the `operation` discriminator of the flat argument object to a single
 * literal. All fields other than `operation` keep their flat-schema types.
 */
type ArgsFor<Op extends ScratchpadArgs["operation"]> = ScratchpadArgs & { operation: Op };

// Per-operation aliases: same flat structure, named only for readability.
export type StepOperation = ArgsFor<"step">;
export type NavigateOperation = ArgsFor<"navigate">;
export type BranchOperation = ArgsFor<"branch">;
export type ReviseOperation = ArgsFor<"revise">;
export type CompleteOperation = ArgsFor<"complete">;
export type AugmentOperation = ArgsFor<"augment">;
export type OverrideOperation = ArgsFor<"override">;
export type HintOperation = ArgsFor<"hint">;
export type MistakesOperation = ArgsFor<"mistakes">;
export type SpotCheckOperation = ArgsFor<"spot_check">;
export type ChallengeOperation = ArgsFor<"challenge">;
|
|
219
|
+
|
|
220
|
+
// ============================================================================
|
|
221
|
+
// RESPONSE TYPES
|
|
222
|
+
// ============================================================================
|
|
223
|
+
|
|
224
|
+
/**
 * The three ways forward offered to the caller after a verification failure:
 * revise the step, branch away from it, or override the check.
 */
export interface RecoveryOptions {
  /** Option 1: correct the failed step in place. */
  revise: {
    target_step: number;
    suggested_reason: string;
  };

  /** Option 2: open a branch and attempt a different approach. */
  branch: {
    from_step: number;
    suggested_name: string;
  };

  /** Option 3: continue regardless (for cases where the heuristic is wrong). */
  override: {
    flag: "force_continue";
    warning: string;
  };
}
|
|
242
|
+
|
|
243
|
+
/** One algebraic mistake reported by mistake detection. */
export interface DetectedMistakeInfo {
  /** Mistake category, e.g. sign_error or distribution_error. */
  type: string;

  /** Explanation of the mistake in plain language. */
  description: string;

  /** Concrete suggestion for fixing it, when one is available. */
  fix?: string;

  /** Full corrected form of the step, e.g. "2x + 3x = 5x". */
  corrected_step?: string;
}
|
|
254
|
+
|
|
255
|
+
/** Describes why a step failed verification and how the caller can recover. */
export interface VerificationFailure {
  /** The check that did not pass. */
  issue: string;

  /** Concrete evidence supporting the failure. */
  evidence: string;

  /** Ideas for fixing the step. */
  suggestions: Array<string>;

  /** How certain the detector is that the step is wrong (higher = more certain). */
  confidence: number;

  /** The domain the check was run in. */
  domain: string;

  /** Algebraic mistakes that were detected (math domain only). */
  detected_mistakes?: Array<DetectedMistakeInfo>;

  /** Recovery actions the caller may take. */
  recovery_options: RecoveryOptions;
}
|
|
272
|
+
|
|
273
|
+
/**
 * Response envelope for the scratchpad tool.
 *
 * The state, confidence and status members are always present. Every other
 * member is optional and is documented with the operation or condition it
 * relates to.
 */
export interface ScratchpadResponse {
  // ---- State ----
  session_id: string;
  current_step: number;
  branch: string;
  operation: string;

  // ---- Confidence tracking ----
  step_confidence?: number;
  chain_confidence: number;
  confidence_threshold: number;
  steps_with_confidence: number;

  // ---- Status & guidance ----
  status:
    | "continue"
    | "review"
    | "threshold_reached"
    | "complete"
    | "verification_failed"
    | "budget_exhausted";
  suggested_action: string;

  /** Timer warning, set when the threshold has been reached. */
  auto_complete_warning?: string;

  /**
   * Failure details, set when status === "verification_failed".
   * The step is NOT stored until a recovery action is taken.
   */
  verification_failure?: VerificationFailure;

  // ---- Navigate operation ----
  history?: {
    step: number;
    branch: string;
    purpose: string;
    thought_preview: string;
    confidence?: number;
    revised_by?: number;
  }[];
  branches?: {
    id: string;
    name: string;
    from_step: number;
    depth: number;
    /** Hypothesis under test on this branch, if one was given. */
    hypothesis?: string;
    /** What would prove or disprove the hypothesis. */
    success_criteria?: string;
  }[];
  path?: {
    step: number;
    branch: string;
    thought_preview: string;
  }[];
  step_detail?: {
    step: number;
    branch: string;
    purpose: string;
    thought: string;
    outcome?: string;
    confidence?: number;
    revises_step?: number;
    revised_by?: number;
    /** Assumptions/preconditions attached to this step. */
    preconditions?: Array<string>;
    /** Hypothesis under test (branch steps). */
    hypothesis?: string;
    /** Success criteria for that hypothesis. */
    success_criteria?: string;
  };

  // ---- Complete operation ----
  final_summary?: string;
  total_steps?: number;

  // ---- Metadata ----
  verification?: {
    passed: boolean;
    confidence: number;
    domain: string;
  };
  local_compute?: {
    solved: boolean;
    result: unknown;
    method: string;
  };
  compression?: {
    applied: boolean;
    original_tokens: number;
    compressed_tokens: number;
    ratio: number;
  };

  /** Token budget tracking. */
  token_usage?: {
    total: number;
    budget: number;
    exceeded: boolean;
    auto_compressed: boolean;
    /** Share of the budget consumed, as a percentage. */
    budget_percent: number;
  };

  /** Proactive compression suggestion (when approaching budget). */
  compression_suggestion?: {
    /** True when compressing now is recommended. */
    should_compress: boolean;
    /** Token total for the session so far. */
    current_tokens: number;
    /** The budget threshold. */
    budget: number;
    /** Percentage of the budget consumed. */
    percent_used: number;
    /** Human-readable nudge. */
    nudge: string;
  };

  /** Augmentation results (when augment_compute=true). */
  augmentation?: {
    applied: boolean;
    computations: number;
    filtered: number;
    domain: string;
  };

  /** Session-level compression stats (for complete operation). */
  compression_stats?: {
    total_bytes_saved: number;
    steps_compressed: number;
    tokens?: {
      original: number;
      compressed: number;
      saved: number;
    };
  };

  // ---- Augment operation ----
  augmented_text?: string;
  computations?: {
    expression: string;
    result: unknown;
    method: string;
  }[];
  filtered_count?: number;
  detected_domain?: string;

  /** Next step suggestion for math derivations (auto-populated for math domain). */
  next_step_suggestion?: {
    hasSuggestion: boolean;
    transformation?: string;
    description?: string;
    currentExpression?: string;
    allApplicable?: { name: string; description: string }[];
  };

  /** Hint operation: progressive simplification hints. */
  hint_result?: {
    success: boolean;
    original: string;
    simplified: string;
    steps_shown: number;
    total_steps: number;
    steps: {
      step_number: number;
      transformation: string;
      description: string;
      from: string;
      to: string;
    }[];
    has_more: boolean;
  };

  /** Mistakes operation: proactive error checking. */
  mistakes_result?: {
    text_checked: string;
    mistakes_found: number;
    mistakes: Array<DetectedMistakeInfo>;
  };

  /** Spot_check operation: trap pattern detection. */
  spot_check_result?: {
    passed: boolean;
    trap_type: string | null;
    warning: string | null;
    hint: string | null;
    confidence: number;
  };

  /** Reconsideration prompt (when spot-check fails during complete). */
  reconsideration?: {
    trap_type: string;
    hint: string;
    suggested_revise: {
      target_step: number;
      reason: string;
    };
  };

  /** Trap analysis (when question provided on step, informational only). */
  trap_analysis?: {
    detected: boolean;
    /** All detected trap types. */
    types: Array<string>;
    /** How many traps were actually primed (≤ types.length). */
    primed_count: number;
    note: string | null;
    confidence: number;
  };

  /** Token usage metadata (always added by execute wrapper). */
  tokens?: {
    input_tokens: number;
    output_tokens: number;
    total_tokens: number;
  };

  /** Cumulative session token usage (always added by execute wrapper). */
  session_tokens?: {
    total_input: number;
    total_output: number;
    total: number;
    operations: number;
  };

  /** Token budget warning (when warn_at_tokens threshold exceeded). */
  token_warning?: {
    threshold: number;
    current: number;
    exceeded_by: number;
    message: string;
  };

  /** Hard budget limit (when hard_limit_tokens exceeded - operation blocked). */
  budget_exhausted?: {
    limit: number;
    current: number;
    exceeded_by: number;
    message: string;
    recommendation: string;
  };

  /**
   * Confidence Drift Detection (CDD): a meta-signal for reasoning quality that
   * analyzes the shape of the confidence trajectory to detect unresolved
   * uncertainty.
   */
  confidence_drift?: {
    /** Overall drift score, 0-1; higher is more concerning. */
    drift_score: number;
    /** True when the drift represents unresolved uncertainty. */
    unresolved: boolean;
    /** Confidence at the trajectory minimum. */
    min_confidence: number;
    /** Step number at which the minimum occurred. */
    min_step: number;
    /** Largest confidence drop observed. */
    max_drop: number;
    /** Size of the recovery from the minimum to the final value. */
    recovery: number;
    /** True when a revision step exists after the drop. */
    has_revision_after_drop: boolean;
    /** Pattern classification. */
    pattern:
      | "stable"
      | "stable_overconfident"
      | "declining"
      | "improving"
      | "v_shaped"
      | "oscillating"
      | "cliff"
      | "insufficient";
    /** Human-readable explanation. */
    explanation: string;
    /** Suggested action if unresolved. */
    suggestion: string | null;
  };

  /**
   * Proactive stepping guidance based on question complexity.
   * Provided on the first step when a question is supplied.
   */
  stepping_guidance?: {
    /** Complexity tier of the question. */
    complexity_tier: "Low" | "Moderate" | "High" | "Very Hard" | "Almost Impossible";
    /** Recommended minimum number of steps for this complexity. */
    recommended_steps: number;
    /** Current step count. */
    current_steps: number;
    /** True when more steps are recommended before completing. */
    needs_more_steps: boolean;
    /** Human-readable nudge. */
    nudge: string | null;
  };

  /**
   * Consistency check: detects contradictions across reasoning steps.
   * Checked every N steps (configurable) to catch logical inconsistencies.
   */
  consistency_warning?: {
    /** True when contradictions were found. */
    has_contradictions: boolean;
    /** Number of contradictions detected. */
    count: number;
    /** The contradictions themselves. */
    contradictions: {
      /** Kind of contradiction. */
      type: "value_reassignment" | "logical_conflict" | "sign_flip" | "direction_reversal";
      /** Human-readable description. */
      description: string;
      /** The variable or concept involved. */
      subject: string;
      /** Step in which the original claim was made. */
      original_step: number;
      /** Step that conflicts with it. */
      conflicting_step: number;
      /** Confidence in the detection (0-1). */
      confidence: number;
    }[];
    /** Human-readable nudge. */
    nudge: string;
  };

  /**
   * Hypothesis resolution: detects when a branch's hypothesis is
   * confirmed/refuted. Only present for steps on branches with hypotheses.
   */
  hypothesis_resolution?: {
    /** True once the hypothesis has been resolved. */
    resolved: boolean;
    /** Outcome of the resolution, if resolved. */
    outcome: "confirmed" | "refuted" | "inconclusive" | null;
    /** Confidence in the resolution (0-1). */
    confidence: number;
    /** Step number at which the resolution was detected. */
    resolved_at_step: number | null;
    /** Evidence text that triggered the resolution. */
    evidence: string | null;
    /** The original hypothesis under test. */
    hypothesis: string;
    /** The success criteria, if provided. */
    success_criteria: string | null;
    /** Suggested action based on the resolution. */
    suggestion: string;
  };

  /**
   * Challenge result: adversarial self-check for reasoning quality.
   * Only present for the challenge operation.
   */
  challenge_result?: {
    /** Number of challenges generated. */
    challenges_generated: number;
    /** The generated challenges. */
    challenges: {
      /** Kind of challenge. */
      type: "assumption_inversion" | "edge_case" | "premise_check" | "steelman_counter";
      /** The claim being challenged. */
      original_claim: string;
      /** The challenge or counterargument. */
      challenge: string;
      /** How serious the challenge is. */
      severity: "low" | "medium" | "high";
      /** Suggested way to address the challenge. */
      suggested_response: string;
    }[];
    /** Overall robustness score (0-1). */
    overall_robustness: number;
    /** Summary of the findings. */
    summary: string;
  };

  /**
   * Auto-challenge suggestion, triggered when overconfidence is detected.
   * Present when shouldChallenge() returns true (high confidence with few steps).
   */
  challenge_suggestion?: {
    /** True when a challenge is recommended. */
    should_challenge: boolean;
    /** Why a challenge is suggested. */
    reason: string;
    /** The specific type of challenge recommended. */
    suggested_type:
      | "assumption_inversion"
      | "edge_case"
      | "premise_check"
      | "steelman_counter"
      | "all";
    /** Human-readable nudge. */
    nudge: string;
  };

  /**
   * Merge suggestion, triggered when a branch hypothesis is confirmed.
   * Suggests merging the branch findings back into the main reasoning path.
   */
  merge_suggestion?: {
    /** True when a merge is recommended. */
    should_merge: boolean;
    /** ID of the branch to merge from. */
    from_branch: string;
    /** The hypothesis that was confirmed. */
    confirmed_hypothesis: string;
    /** Key findings to incorporate. */
    key_findings: string;
    /** Human-readable suggestion. */
    nudge: string;
  };
}
|