verifiable-thinking-mcp 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +339 -0
- package/package.json +75 -0
- package/src/index.ts +38 -0
- package/src/lib/cache.ts +246 -0
- package/src/lib/compression.ts +804 -0
- package/src/lib/compute/cache.ts +86 -0
- package/src/lib/compute/classifier.ts +555 -0
- package/src/lib/compute/confidence.ts +79 -0
- package/src/lib/compute/context.ts +154 -0
- package/src/lib/compute/extract.ts +200 -0
- package/src/lib/compute/filter.ts +224 -0
- package/src/lib/compute/index.ts +171 -0
- package/src/lib/compute/math.ts +247 -0
- package/src/lib/compute/patterns.ts +564 -0
- package/src/lib/compute/registry.ts +145 -0
- package/src/lib/compute/solvers/arithmetic.ts +65 -0
- package/src/lib/compute/solvers/calculus.ts +249 -0
- package/src/lib/compute/solvers/derivation-core.ts +371 -0
- package/src/lib/compute/solvers/derivation-latex.ts +160 -0
- package/src/lib/compute/solvers/derivation-mistakes.ts +1046 -0
- package/src/lib/compute/solvers/derivation-simplify.ts +451 -0
- package/src/lib/compute/solvers/derivation-transform.ts +620 -0
- package/src/lib/compute/solvers/derivation.ts +67 -0
- package/src/lib/compute/solvers/facts.ts +120 -0
- package/src/lib/compute/solvers/formula.ts +728 -0
- package/src/lib/compute/solvers/index.ts +36 -0
- package/src/lib/compute/solvers/logic.ts +422 -0
- package/src/lib/compute/solvers/probability.ts +307 -0
- package/src/lib/compute/solvers/statistics.ts +262 -0
- package/src/lib/compute/solvers/word-problems.ts +408 -0
- package/src/lib/compute/types.ts +107 -0
- package/src/lib/concepts.ts +111 -0
- package/src/lib/domain.ts +731 -0
- package/src/lib/extraction.ts +912 -0
- package/src/lib/index.ts +122 -0
- package/src/lib/judge.ts +260 -0
- package/src/lib/math/ast.ts +842 -0
- package/src/lib/math/index.ts +8 -0
- package/src/lib/math/operators.ts +171 -0
- package/src/lib/math/tokenizer.ts +477 -0
- package/src/lib/patterns.ts +200 -0
- package/src/lib/session.ts +825 -0
- package/src/lib/think/challenge.ts +323 -0
- package/src/lib/think/complexity.ts +504 -0
- package/src/lib/think/confidence-drift.ts +507 -0
- package/src/lib/think/consistency.ts +347 -0
- package/src/lib/think/guidance.ts +188 -0
- package/src/lib/think/helpers.ts +568 -0
- package/src/lib/think/hypothesis.ts +216 -0
- package/src/lib/think/index.ts +127 -0
- package/src/lib/think/prompts.ts +262 -0
- package/src/lib/think/route.ts +358 -0
- package/src/lib/think/schema.ts +98 -0
- package/src/lib/think/scratchpad-schema.ts +662 -0
- package/src/lib/think/spot-check.ts +961 -0
- package/src/lib/think/types.ts +93 -0
- package/src/lib/think/verification.ts +260 -0
- package/src/lib/tokens.ts +177 -0
- package/src/lib/verification.ts +620 -0
- package/src/prompts/index.ts +10 -0
- package/src/prompts/templates.ts +336 -0
- package/src/resources/index.ts +8 -0
- package/src/resources/sessions.ts +196 -0
- package/src/tools/compress.ts +138 -0
- package/src/tools/index.ts +5 -0
- package/src/tools/scratchpad.ts +2659 -0
- package/src/tools/sessions.ts +144 -0
|
@@ -0,0 +1,507 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Confidence Drift Detection (CDD)
|
|
3
|
+
*
|
|
4
|
+
* Novel technique: Analyzes confidence TRAJECTORY as a meta-signal for reasoning quality.
|
|
5
|
+
*
|
|
6
|
+
* Key insight: LLMs often start confident, confidence DROPS mid-chain when hitting
|
|
7
|
+
* difficulty, then "recovers" at the end without explicitly addressing the uncertainty.
|
|
8
|
+
* This V-shaped pattern without revision indicates "pushed through" uncertainty.
|
|
9
|
+
*
|
|
10
|
+
* Design principles:
|
|
11
|
+
* 1. O(n) single-pass analysis of confidence array
|
|
12
|
+
* 2. Detects structural patterns in confidence trajectory
|
|
13
|
+
* 3. Flags unresolved doubt (recovery without revision)
|
|
14
|
+
* 4. Provides actionable insights for reasoning improvement
|
|
15
|
+
*
|
|
16
|
+
* Formula:
|
|
17
|
+
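* drift_score = max(max_drop, max_drop × recovery_magnitude / steps_to_recover) for V-shaped trajectories; max_drop × 0.5 for every other pattern (result clamped to 0-1)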
|
|
18
|
+
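* unresolved = V-shaped AND drift_score ≥ threshold AND no revision step after the minimum (stable-overconfident, steep cliff, and low-ending declining trajectories are flagged as unresolved as well)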
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
import type { ThoughtRecord } from "../session.ts";
|
|
22
|
+
|
|
23
|
+
// ============================================================================
|
|
24
|
+
// TYPES
|
|
25
|
+
// ============================================================================
|
|
26
|
+
|
|
27
|
+
/** Outcome of a confidence-drift analysis over a chain of reasoning steps. */
export interface DriftAnalysis {
  /** Concern level clamped into the 0-1 range; larger means more concerning. */
  drift_score: number;

  /** True when the trajectory shows uncertainty that was never remediated. */
  unresolved: boolean;

  /** Lowest confidence value found in the trajectory. */
  min_confidence: number;

  /** `step_number` of the step at which that lowest value first occurs. */
  min_step: number;

  /** Largest fall from any earlier running peak to a later value. */
  max_drop: number;

  /** Final confidence minus the trajectory minimum. */
  recovery: number;

  /** True when some step after the minimum declares `revises_step`. */
  has_revision_after_drop: boolean;

  /** Shape label assigned to the trajectory. */
  pattern: DriftPattern;

  /** Human-readable summary of the finding. */
  explanation: string;

  /** Recommended follow-up; `null` when nothing is flagged as unresolved. */
  suggestion: string | null;
}
|
|
49
|
+
|
|
50
|
+
/** Shape labels that can be assigned to a confidence trajectory. */
export type DriftPattern =
  /** Confidence stays relatively flat. */
  | "stable"
  /** Every value sits at or above the overconfidence threshold with very little spread (trap risk). */
  | "stable_overconfident"
  /** Mostly decreasing: confidence keeps getting lower. */
  | "declining"
  /** Mostly increasing: confidence keeps getting higher. */
  | "improving"
  /** A drop followed by a recovery (the concerning pattern). */
  | "v_shaped"
  /** Several ups and downs. */
  | "oscillating"
  /** Sudden drop at the very end (an error was likely detected). */
  | "cliff"
  /** Too few steps to analyze. */
  | "insufficient";
|
|
59
|
+
|
|
60
|
+
/** Tunable thresholds for confidence-drift analysis. */
export interface DriftConfig {
  /** Smallest drop from a peak that counts as significant (default: 0.15). */
  min_significant_drop: number;

  /** Smallest rebound from the minimum that counts as a recovery (default: 0.15). */
  min_significant_recovery: number;

  /** Drift score at or above which a V-shaped trajectory is flagged as unresolved (default: 0.3). */
  unresolved_threshold: number;

  /** Fewest steps for which an analysis is attempted (default: 3). */
  min_steps: number;

  /** Every confidence must reach this value for the "overconfident" label (default: 0.85). */
  overconfident_threshold: number;

  /**
   * Largest spread tolerated for the "stable overconfident" label (default: 0.05).
   * Despite the name, the classifier compares this against max − min of the
   * trajectory rather than a statistical variance.
   */
  overconfident_max_variance: number;

  /** Size of the final-step drop at which a cliff is flagged as unresolved (default: 0.3). */
  cliff_drop_threshold: number;

  /** A declining trajectory ending below this confidence is flagged as unresolved (default: 0.5). */
  declining_final_threshold: number;
}
|
|
78
|
+
|
|
79
|
+
/** Thresholds applied when the caller does not override them. */
const DEFAULT_CONFIG: DriftConfig = {
  // Significance cut-offs for the drop and the rebound.
  min_significant_drop: 0.15,
  min_significant_recovery: 0.15,

  // Score needed before a V-shaped trajectory is reported as unresolved.
  unresolved_threshold: 0.3,

  // Shorter chains are reported as "insufficient".
  min_steps: 3,

  // "Stable overconfident" detection: floor for every value, ceiling for the spread.
  overconfident_threshold: 0.85,
  overconfident_max_variance: 0.05,

  // Escalation thresholds for the cliff and declining patterns.
  cliff_drop_threshold: 0.3,
  declining_final_threshold: 0.5,
};
|
|
89
|
+
|
|
90
|
+
// ============================================================================
|
|
91
|
+
// CORE ALGORITHM
|
|
92
|
+
// ============================================================================
|
|
93
|
+
|
|
94
|
+
/**
 * Analyze the confidence trajectory of a reasoning chain for drift patterns.
 *
 * Each step contributes `verification.confidence` (0.5 when absent). One pass
 * over those values finds the global minimum, the global maximum and the
 * largest drop from a running peak; the trajectory is then classified, scored
 * and checked for unresolved uncertainty.
 *
 * @param steps Reasoning steps in chain order.
 * @param config Optional threshold overrides. Entries whose value is
 *   `undefined` are ignored so the corresponding default stays in force.
 * @returns The drift analysis. When `steps` is empty or shorter than
 *   `min_steps` the pattern is "insufficient" and nothing is flagged.
 */
export function analyzeConfidenceDrift(
  steps: ThoughtRecord[],
  config: Partial<DriftConfig> = {},
): DriftAnalysis {
  // A plain spread would let an explicit `undefined` override replace a
  // default, after which every comparison against that threshold is false.
  const cfg: DriftConfig = { ...DEFAULT_CONFIG };
  for (const key of Object.keys(config) as (keyof DriftConfig)[]) {
    const override = config[key];
    if (override !== undefined) {
      cfg[key] = override;
    }
  }

  // Handle insufficient data. The explicit empty check keeps a `min_steps`
  // of 0 (or less) from letting an empty chain reach the index accesses below.
  if (steps.length === 0 || steps.length < cfg.min_steps) {
    return {
      drift_score: 0,
      unresolved: false,
      min_confidence: steps[0]?.verification?.confidence ?? 0.5,
      min_step: steps[0]?.step_number ?? 1,
      max_drop: 0,
      recovery: 0,
      has_revision_after_drop: false,
      pattern: "insufficient",
      explanation: `Insufficient steps for drift analysis (${steps.length} < ${cfg.min_steps})`,
      suggestion: null,
    };
  }

  // Extract confidence values (default to 0.5 if not present)
  const confidences = steps.map((s) => s.verification?.confidence ?? 0.5);
  const stepNumbers = steps.map((s) => s.step_number);

  // Single pass: global min/max plus the largest drop from any running peak.
  // `steps` is non-empty here, so index 0 always exists.
  const firstConf = confidences[0]!;
  let minConf = firstConf;
  let minIdx = 0;
  let maxConf = firstConf;
  let maxIdx = 0;
  let runningPeak = firstConf;
  let maxDropFromPeak = 0;

  for (let i = 1; i < confidences.length; i++) {
    const conf = confidences[i]!;

    // Strict comparisons keep the FIRST occurrence of each extreme.
    if (conf < minConf) {
      minConf = conf;
      minIdx = i;
    }
    if (conf > maxConf) {
      maxConf = conf;
      maxIdx = i;
    }

    if (conf > runningPeak) {
      runningPeak = conf;
    } else {
      maxDropFromPeak = Math.max(maxDropFromPeak, runningPeak - conf);
    }
  }

  // Recovery is measured from the minimum to the final step.
  const finalConf = confidences[confidences.length - 1]!;
  const recovery = finalConf - minConf;

  // A revision only counts when it comes after the minimum.
  const hasRevisionAfterDrop = steps.slice(minIdx + 1).some((s) => s.revises_step !== undefined);

  const pattern = classifyPattern(confidences, minIdx, maxIdx, maxDropFromPeak, recovery, cfg);

  // Drift score: V-shaped trajectories score at least the full drop; every
  // other pattern scores half of the drop.
  const stepsToRecover = Math.max(1, confidences.length - 1 - minIdx);
  let driftScore: number;
  if (pattern === "v_shaped") {
    const basicScore = maxDropFromPeak;
    const recoveryBonus = (maxDropFromPeak * recovery) / stepsToRecover;
    driftScore = Math.max(basicScore, recoveryBonus);
  } else {
    driftScore = maxDropFromPeak * 0.5;
  }

  // Clamp to 0-1
  const normalizedDriftScore = Math.min(1, Math.max(0, driftScore));

  const isStableOverconfident = pattern === "stable_overconfident";
  const significantDrop = maxDropFromPeak >= cfg.min_significant_drop;
  const significantRecovery = recovery >= cfg.min_significant_recovery;

  // V-shaped is unresolved if: significant drop + recovery, no revision, above threshold
  const vShapedUnresolved =
    pattern === "v_shaped" &&
    significantDrop &&
    significantRecovery &&
    !hasRevisionAfterDrop &&
    normalizedDriftScore >= cfg.unresolved_threshold;

  // Cliff is unresolved if the final-step drop reaches the cliff threshold.
  const finalStepDrop =
    confidences.length >= 2
      ? confidences[confidences.length - 2]! - confidences[confidences.length - 1]!
      : 0;
  const cliffUnresolved = pattern === "cliff" && finalStepDrop >= cfg.cliff_drop_threshold;

  // Declining is unresolved if the chain ended below the configured confidence.
  const decliningUnresolved =
    pattern === "declining" && finalConf < cfg.declining_final_threshold;

  // Stable overconfident is always flagged as unresolved (warrants review).
  const unresolved =
    vShapedUnresolved || isStableOverconfident || cliffUnresolved || decliningUnresolved;

  // Non-V concerns get a floor of 0.4 so the concern level stays visible.
  const finalDriftScore =
    isStableOverconfident || cliffUnresolved || decliningUnresolved
      ? Math.max(normalizedDriftScore, 0.4)
      : normalizedDriftScore;

  const explanation = generateExplanation(
    pattern,
    maxDropFromPeak,
    recovery,
    minIdx,
    stepNumbers,
    hasRevisionAfterDrop,
    minConf,
  );

  // The declining suggestion reports where confidence ENDED, which can differ
  // from the global minimum when the trajectory is only mostly decreasing.
  const suggestionConfidence = pattern === "declining" ? finalConf : minConf;
  const suggestion = unresolved
    ? generateSuggestion(stepNumbers[minIdx]!, maxDropFromPeak, pattern, suggestionConfidence)
    : null;

  return {
    drift_score: finalDriftScore,
    unresolved,
    min_confidence: minConf,
    min_step: stepNumbers[minIdx]!,
    max_drop: maxDropFromPeak,
    recovery,
    has_revision_after_drop: hasRevisionAfterDrop,
    pattern,
    explanation,
    suggestion,
  };
}
|
|
257
|
+
|
|
258
|
+
/**
 * Label a confidence trajectory with the first drift pattern that matches.
 *
 * Checks run in a fixed priority order: v_shaped, cliff, stable_overconfident,
 * stable (spread below 0.1), declining, improving, oscillating. Anything that
 * matches none of them is reported as "stable".
 *
 * @param confidences Per-step confidence values in chain order.
 * @param minIdx Index of the trajectory minimum.
 * @param _maxIdx Index of the trajectory maximum; accepted but not consulted.
 * @param maxDrop Largest drop from a running peak.
 * @param recovery Final confidence minus the minimum.
 * @param cfg Thresholds used by the checks.
 * @returns The matched pattern label.
 */
function classifyPattern(
  confidences: number[],
  minIdx: number,
  _maxIdx: number,
  maxDrop: number,
  recovery: number,
  cfg: DriftConfig,
): DriftPattern {
  const count = confidences.length;
  const lowest = Math.min(...confidences);
  const spread = Math.max(...confidences) - lowest;
  const dropIsSignificant = maxDrop >= cfg.min_significant_drop;

  // Highest priority: a trough strictly inside the chain, with both a
  // significant drop and a significant rebound.
  const troughIsInterior = minIdx > 0 && minIdx < count - 1;
  if (troughIsInterior && dropIsSignificant && recovery >= cfg.min_significant_recovery) {
    return "v_shaped";
  }

  // deltas[k] is the change from step k to step k + 1.
  const deltas = confidences.slice(1).map((value, k) => value - confidences[k]!);

  // Cliff: the minimum is the last step and the last move is both significant
  // and at least twice the average size of all earlier moves.
  if (minIdx >= count - 1 && dropIsSignificant && count >= 2) {
    const lastDrop = confidences[count - 2]! - confidences[count - 1]!;
    const earlierMovement = deltas.slice(0, -1).reduce((acc, d) => acc + Math.abs(d), 0);
    const typicalMove = count > 2 ? earlierMovement / (count - 2) : 0;
    if (lastDrop >= cfg.min_significant_drop && lastDrop >= typicalMove * 2) {
      return "cliff";
    }
  }

  // Uniformly high confidence with almost no spread is checked before the
  // generic "stable" label so it is not swallowed by it.
  if (lowest >= cfg.overconfident_threshold && spread <= cfg.overconfident_max_variance) {
    return "stable_overconfident";
  }

  if (spread < 0.1) {
    return "stable";
  }

  // "Mostly" means at least 70% of the step-to-step moves share a direction.
  const majority = (count - 1) * 0.7;
  const falls = deltas.filter((d) => d < 0).length;
  if (falls >= majority) {
    return "declining";
  }
  const rises = deltas.filter((d) => d > 0).length;
  if (rises >= majority) {
    return "improving";
  }

  // Count direction switches; flat moves neither count nor reset the heading.
  let turns = 0;
  let heading = 0;
  for (const delta of deltas) {
    const sign = Math.sign(delta);
    if (sign !== 0 && sign !== heading) {
      turns += 1;
      heading = sign;
    }
  }

  return turns >= 3 ? "oscillating" : "stable";
}
|
|
350
|
+
|
|
351
|
+
/**
 * Build the human-readable explanation for a classified trajectory.
 *
 * @param pattern Pattern label assigned to the trajectory.
 * @param maxDrop Largest drop from a running peak, as a fraction.
 * @param recovery Rebound from the minimum to the final step, as a fraction.
 * @param minIdx Index of the minimum within `stepNumbers`.
 * @param stepNumbers Step numbers in chain order.
 * @param hasRevision Whether a revision step follows the minimum.
 * @param minConfidence Lowest confidence seen; used by the overconfident and
 *   declining messages.
 * @returns The explanation text for the given pattern.
 */
function generateExplanation(
  pattern: DriftPattern,
  maxDrop: number,
  recovery: number,
  minIdx: number,
  stepNumbers: number[],
  hasRevision: boolean,
  minConfidence?: number,
): string {
  // Whole-number percentage rendering shared by every message.
  const asPercent = (fraction: number): string => (fraction * 100).toFixed(0);

  const dropPct = asPercent(maxDrop);
  const recoveryPct = asPercent(recovery);
  const troughStep = stepNumbers[minIdx];

  if (pattern === "stable") {
    return "Confidence remained stable throughout reasoning chain.";
  }

  if (pattern === "stable_overconfident") {
    return `⚠️ Stable high confidence (≥${asPercent(minConfidence ?? 0.85)}%) throughout chain. On complex/trap questions, consistent high confidence without doubt often correlates with incorrect answers.`;
  }

  if (pattern === "declining") {
    // The warning variant applies only when the lowest value is below 50%.
    const endedLow = minConfidence !== undefined && minConfidence < 0.5;
    return endedLow
      ? `⚠️ Confidence declined steadily to ${asPercent(minConfidence)}% (${dropPct}% total drop). Ending with low confidence suggests unresolved uncertainty.`
      : `Confidence declined steadily (${dropPct}% total drop). This may indicate increasing uncertainty or problem difficulty.`;
  }

  if (pattern === "improving") {
    return `Confidence improved throughout reasoning (${recoveryPct}% increase). Good progressive understanding.`;
  }

  if (pattern === "v_shaped") {
    return hasRevision
      ? `V-shaped confidence pattern detected: ${dropPct}% drop at step ${troughStep}, then ${recoveryPct}% recovery. Revision step present - uncertainty was addressed.`
      : `⚠️ V-shaped confidence pattern: ${dropPct}% drop at step ${troughStep}, then ${recoveryPct}% recovery WITHOUT revision. The reasoning may have "pushed through" uncertainty without addressing it.`;
  }

  if (pattern === "oscillating") {
    return `Confidence oscillated throughout reasoning. Multiple uncertainty points encountered.`;
  }

  if (pattern === "cliff") {
    return `Confidence dropped sharply at the end (${dropPct}% drop). Possible error or contradiction detected late in reasoning.`;
  }

  if (pattern === "insufficient") {
    return "Not enough steps for meaningful drift analysis.";
  }

  // Fallback for any label not handled above.
  return "Confidence pattern analyzed.";
}
|
|
403
|
+
|
|
404
|
+
/**
 * Build the follow-up advice attached to an unresolved drift finding.
 *
 * The stable_overconfident, cliff and declining patterns each have dedicated
 * wording; every other pattern gets advice keyed on the size of the drop.
 *
 * @param minStep Step number at which confidence was lowest.
 * @param dropMagnitude Largest confidence drop, as a fraction.
 * @param pattern Pattern label of the trajectory, when known.
 * @param minConfidence Confidence quoted in the overconfident and declining
 *   messages (0.85 and 0.5 respectively when omitted).
 * @returns The suggestion text.
 */
function generateSuggestion(
  minStep: number,
  dropMagnitude: number,
  pattern?: DriftPattern,
  minConfidence?: number,
): string {
  // Whole-number percentage rendering shared by the pattern-specific messages.
  const asPercent = (fraction: number): string => (fraction * 100).toFixed(0);

  switch (pattern) {
    case "stable_overconfident": {
      const confPct = asPercent(minConfidence ?? 0.85);
      return `High confidence (${confPct}%+) throughout suggests possible overconfidence. Consider: Is this a trick question? Have you verified assumptions? Adding a self-check step could help catch errors.`;
    }
    case "cliff": {
      const dropPct = asPercent(dropMagnitude);
      return `Sharp confidence drop (${dropPct}%) at the final step suggests an error or contradiction was detected late. Consider: What caused this doubt? Should you revise earlier steps before concluding?`;
    }
    case "declining": {
      const finalConfPct = asPercent(minConfidence ?? 0.5);
      return `Confidence declined to ${finalConfPct}% by the end. The reasoning chain ended with significant doubt. Consider: What's causing the uncertainty? Is the approach valid? Should you try a different method?`;
    }
    default:
      break;
  }

  // Generic advice: a drop of 30 points or more gets the stronger wording.
  return dropMagnitude >= 0.3
    ? `Consider revising from step ${minStep} where confidence dropped significantly. The recovery without explicit revision suggests the uncertainty was not properly addressed.`
    : `Review step ${minStep} where confidence was lowest. Adding explicit reasoning about why confidence recovered could strengthen the chain.`;
}
|
|
437
|
+
|
|
438
|
+
// ============================================================================
|
|
439
|
+
// CONVENIENCE FUNCTIONS
|
|
440
|
+
// ============================================================================
|
|
441
|
+
|
|
442
|
+
/**
 * Fast yes/no screen for a V-shaped confidence trajectory that was never
 * revised. Intended as a cheap filter ahead of the detailed analysis.
 *
 * A chain is reported when its lowest confidence sits strictly inside the
 * chain, some earlier step is at least 0.15 above that low point, the final
 * step is at least 0.15 above it, and no later step declares `revises_step`.
 *
 * @param steps Reasoning steps in chain order.
 * @param _threshold Accepted for interface compatibility; not consulted by
 *   the current implementation.
 * @returns True when the unrevised V-shape described above is present.
 */
export function hasConcerningDrift(steps: ThoughtRecord[], _threshold: number = 0.3): boolean {
  const total = steps.length;
  if (total < 3) {
    return false;
  }

  const trajectory = steps.map((step) => step.verification?.confidence ?? 0.5);
  const lowest = Math.min(...trajectory);
  const lowestAt = trajectory.indexOf(lowest);

  // The trough has to be an interior step, never the first or the last one.
  if (lowestAt <= 0 || lowestAt >= total - 1) {
    return false;
  }

  // Some step before the trough must sit at least 0.15 above it.
  const droppedIntoTrough = trajectory.slice(0, lowestAt).some((value) => value - lowest >= 0.15);
  if (!droppedIntoTrough) {
    return false;
  }

  // The chain must end at least 0.15 above the trough.
  const finish = trajectory[total - 1]!;
  if (finish - lowest < 0.15) {
    return false;
  }

  // A revision after the trough means the doubt was addressed.
  return steps.slice(lowestAt + 1).every((step) => step.revises_step === undefined);
}
|
|
461
|
+
|
|
462
|
+
/**
 * Project a chain of steps onto its confidence trajectory, e.g. for
 * visualization or logging.
 *
 * @param steps Reasoning steps in chain order.
 * @returns One `{ step, confidence }` point per input step, in the same
 *   order; a step without a recorded confidence is reported as 0.5.
 */
export function extractConfidenceTrajectory(
  steps: ThoughtRecord[],
): { step: number; confidence: number }[] {
  const points: { step: number; confidence: number }[] = [];
  for (const record of steps) {
    const confidence = record.verification?.confidence ?? 0.5;
    points.push({ step: record.step_number, confidence });
  }
  return points;
}
|
|
473
|
+
|
|
474
|
+
/**
 * Summarize a confidence trajectory with aggregate statistics.
 *
 * @param steps Reasoning steps in chain order; a step without a recorded
 *   confidence counts as 0.5.
 * @returns The mean, the population standard deviation, the extremes, and a
 *   trend derived from the last value minus the first ("up" above +0.1,
 *   "down" below −0.1, otherwise "flat"). An empty chain yields 0.5 for the
 *   mean and both extremes, a zero deviation and a flat trend.
 */
export function computeTrajectoryStats(steps: ThoughtRecord[]): {
  mean: number;
  stddev: number;
  min: number;
  max: number;
  trend: "up" | "down" | "flat";
} {
  if (steps.length === 0) {
    return { mean: 0.5, stddev: 0, min: 0.5, max: 0.5, trend: "flat" };
  }

  const values = steps.map((step) => step.verification?.confidence ?? 0.5);
  const count = values.length;

  // First sweep: running total and both extremes.
  let total = 0;
  let lowest = Infinity;
  let highest = -Infinity;
  for (const value of values) {
    total += value;
    lowest = Math.min(lowest, value);
    highest = Math.max(highest, value);
  }
  const mean = total / count;

  // Second sweep: squared deviations from the mean (divided by n, not n − 1).
  let squaredError = 0;
  for (const value of values) {
    squaredError += (value - mean) ** 2;
  }
  const stddev = Math.sqrt(squaredError / count);

  // The trend only compares the endpoints of the trajectory.
  const endToEnd = values[count - 1]! - values[0]!;
  let trend: "up" | "down" | "flat" = "flat";
  if (endToEnd > 0.1) {
    trend = "up";
  } else if (endToEnd < -0.1) {
    trend = "down";
  }

  return { mean, stddev, min: lowest, max: highest, trend };
}
|