verifiable-thinking-mcp 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +339 -0
- package/package.json +75 -0
- package/src/index.ts +38 -0
- package/src/lib/cache.ts +246 -0
- package/src/lib/compression.ts +804 -0
- package/src/lib/compute/cache.ts +86 -0
- package/src/lib/compute/classifier.ts +555 -0
- package/src/lib/compute/confidence.ts +79 -0
- package/src/lib/compute/context.ts +154 -0
- package/src/lib/compute/extract.ts +200 -0
- package/src/lib/compute/filter.ts +224 -0
- package/src/lib/compute/index.ts +171 -0
- package/src/lib/compute/math.ts +247 -0
- package/src/lib/compute/patterns.ts +564 -0
- package/src/lib/compute/registry.ts +145 -0
- package/src/lib/compute/solvers/arithmetic.ts +65 -0
- package/src/lib/compute/solvers/calculus.ts +249 -0
- package/src/lib/compute/solvers/derivation-core.ts +371 -0
- package/src/lib/compute/solvers/derivation-latex.ts +160 -0
- package/src/lib/compute/solvers/derivation-mistakes.ts +1046 -0
- package/src/lib/compute/solvers/derivation-simplify.ts +451 -0
- package/src/lib/compute/solvers/derivation-transform.ts +620 -0
- package/src/lib/compute/solvers/derivation.ts +67 -0
- package/src/lib/compute/solvers/facts.ts +120 -0
- package/src/lib/compute/solvers/formula.ts +728 -0
- package/src/lib/compute/solvers/index.ts +36 -0
- package/src/lib/compute/solvers/logic.ts +422 -0
- package/src/lib/compute/solvers/probability.ts +307 -0
- package/src/lib/compute/solvers/statistics.ts +262 -0
- package/src/lib/compute/solvers/word-problems.ts +408 -0
- package/src/lib/compute/types.ts +107 -0
- package/src/lib/concepts.ts +111 -0
- package/src/lib/domain.ts +731 -0
- package/src/lib/extraction.ts +912 -0
- package/src/lib/index.ts +122 -0
- package/src/lib/judge.ts +260 -0
- package/src/lib/math/ast.ts +842 -0
- package/src/lib/math/index.ts +8 -0
- package/src/lib/math/operators.ts +171 -0
- package/src/lib/math/tokenizer.ts +477 -0
- package/src/lib/patterns.ts +200 -0
- package/src/lib/session.ts +825 -0
- package/src/lib/think/challenge.ts +323 -0
- package/src/lib/think/complexity.ts +504 -0
- package/src/lib/think/confidence-drift.ts +507 -0
- package/src/lib/think/consistency.ts +347 -0
- package/src/lib/think/guidance.ts +188 -0
- package/src/lib/think/helpers.ts +568 -0
- package/src/lib/think/hypothesis.ts +216 -0
- package/src/lib/think/index.ts +127 -0
- package/src/lib/think/prompts.ts +262 -0
- package/src/lib/think/route.ts +358 -0
- package/src/lib/think/schema.ts +98 -0
- package/src/lib/think/scratchpad-schema.ts +662 -0
- package/src/lib/think/spot-check.ts +961 -0
- package/src/lib/think/types.ts +93 -0
- package/src/lib/think/verification.ts +260 -0
- package/src/lib/tokens.ts +177 -0
- package/src/lib/verification.ts +620 -0
- package/src/prompts/index.ts +10 -0
- package/src/prompts/templates.ts +336 -0
- package/src/resources/index.ts +8 -0
- package/src/resources/sessions.ts +196 -0
- package/src/tools/compress.ts +138 -0
- package/src/tools/index.ts +5 -0
- package/src/tools/scratchpad.ts +2659 -0
- package/src/tools/sessions.ts +144 -0
package/src/lib/compression.ts
@@ -0,0 +1,804 @@

/**
 * Enhanced Prompt Compression - CPC-style sentence-level compression with:
 * - TF-IDF relevance scoring
 * - NCD (Normalized Compression Distance) for query relevance
 * - Coreference constraint enforcement
 * - Causal chain preservation
 * - Filler/meta-cognition removal
 * - Repetition detection
 *
 * Research basis:
 * - CPC: "Prompt Compression with Context-Aware Sentence Encoding" (arXiv:2409.01227)
 * - CompactPrompt (2025): N-gram abbreviation
 * - Selective Context (2023): Entropy-based pruning
 * - Information Bottleneck methods: Preserve task-relevant info
 */

import { gzipSync } from "node:zlib";

// ============================================================================
// Types
// ============================================================================

export interface CompressionResult {
  compressed: string;
  original_tokens: number;
  compressed_tokens: number;
  ratio: number;
  kept_sentences: number;
  dropped_sentences: string[];
  /** Enhancement metrics (only present when enhanced features used) */
  enhancements?: {
    fillers_removed: number;
    coref_constraints_applied: number;
    causal_constraints_applied: number;
    repetitions_penalized: number;
    ncd_boost_applied: boolean;
  };
}

export interface CompressionOptions {
  /** Target compression ratio 0.1-1.0, default 0.5 (keep 50%) */
  target_ratio?: number;
  /** Minimum sentences to keep, default 1 */
  min_sentences?: number;
  /** Boost logical connectives, default true */
  boost_reasoning?: boolean;
  /** Use NCD for query relevance scoring (default: true) */
  useNCD?: boolean;
  /** Enforce coreference constraints - keep antecedents for pronouns (default: true) */
  enforceCoref?: boolean;
  /** Enforce causal chain constraints - keep premises for conclusions (default: true) */
  enforceCausalChains?: boolean;
  /** Remove filler phrases before scoring (default: true) */
  removeFillers?: boolean;
  /** Jaccard threshold for repetition detection (default: 0.8) */
  repeatThreshold?: number;
}

interface SentenceMetadata {
  index: number;
  original: string;
  cleaned: string;
  score: number;
  ncdScore: number;
  startsWithPronoun: boolean;
  hasCausalConnective: boolean;
  repeatSimilarity: number;
  requiredBy: number | null;
}
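
For orientation, an illustrative options object (a sketch, not part of the diff; the values are arbitrary, and every field is optional, falling back to DEFAULT_OPTIONS below):

const opts: CompressionOptions = {
  target_ratio: 0.4, // keep ~40% of sentences
  min_sentences: 2,
  boost_reasoning: true,
  useNCD: true,
  enforceCoref: true,
  enforceCausalChains: true,
  removeFillers: true,
  repeatThreshold: 0.8,
};
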
// ============================================================================
// Constants
// ============================================================================

const DEFAULT_OPTIONS: Required<CompressionOptions> = {
  target_ratio: 0.5,
  min_sentences: 1,
  boost_reasoning: true,
  useNCD: true,
  enforceCoref: true,
  enforceCausalChains: true,
  removeFillers: true,
  repeatThreshold: 0.8,
};

// Filler phrases to remove (research-backed)
const FILLER_PATTERNS = [
  // Meta-cognition starters
  /^(let's see|let me (think|check|see)|i think that|i believe that|okay so|well,?)\s*/gi,
  // Inline fillers
  /\b(basically|literally|actually|you know|i mean)\b/gi,
  // Hedging (keep for nuance in some cases, lighter penalty)
  /\b(really|very|quite|rather|somewhat)\b/gi,
];

// Full meta-sentences to remove entirely
const META_SENTENCE_PATTERNS = [
  /^(let me think about this|hmm+|okay|alright|so)[.!?]?$/i,
  /^(that's a good question|interesting question)[.!?]?$/i,
];

// Pronoun starters that indicate coreference dependency
const PRONOUN_START = /^(he|she|it|they|this|that|these|those|such)\b/i;

// Causal connectives that indicate dependency on previous sentence
const CAUSAL_CONNECTIVES = /^(therefore|thus|hence|consequently|as a result|so,|accordingly)/i;

// Contrastive connectives
const CONTRASTIVE_CONNECTIVES = /^(however|but|although|yet|nevertheless|on the other hand)/i;

// Reasoning keywords to boost (from Self-Correction Bench research)
const REASONING_KEYWORDS =
  /\b(therefore|because|thus|hence|consequently|result|conclude|implies|means|since|given|if|then|however|but|although|wait)\b/i;

// High-value sentence starters
const VALUE_STARTERS =
  /^(the key|importantly|note that|crucially|specifically|in summary|to summarize|finally|first|second|third)/i;

// Common stop words to filter out
const STOP_WORDS = new Set([
  "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of",
  "with", "by", "from", "as", "is", "was", "are", "were", "been", "be",
  "have", "has", "had", "do", "does", "did", "will", "would", "could",
  "should", "may", "might", "must", "shall", "can", "need", "dare", "ought",
  "used", "this", "that", "these", "those", "i", "you", "he", "she", "it",
  "we", "they", "what", "which", "who", "whom", "whose", "where", "when",
  "why", "how", "all", "each", "every", "both", "few", "more", "most",
  "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
  "than", "too", "very", "just", "also", "now", "here", "there", "then",
]);
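
An illustrative sketch of the patterns above in action, mirroring what cleanFillers (defined below) does (not part of the diff):

const noisy = "Well, basically the cache key is wrong.";
const cleaned = FILLER_PATTERNS.reduce((s, p) => s.replace(p, " "), noisy)
  .replace(/\s+/g, " ")
  .trim();
// cleaned === "the cache key is wrong."  ("Well," and "basically" removed)
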
// ============================================================================
// Core Functions
// ============================================================================

/**
 * Split text into sentences
 */
function splitSentences(text: string): string[] {
  return text
    .split(/(?<=[.!?])\s+/)
    .map((s) => s.trim())
    .filter((s) => s.length > 0);
}

/**
 * Simple tokenizer for Jaccard similarity
 */
function tokenize(text: string): string[] {
  return text
    .toLowerCase()
    .replace(/[^\w\s]/g, " ")
    .split(/\s+/)
    .filter((w) => w.length > 2);
}

/**
 * Tokenize text for term comparison (filters stop words)
 */
function tokenizeForTfIdf(text: string): string[] {
  return text
    .toLowerCase()
    .replace(/[^\w\s]/g, " ")
    .split(/\s+/)
    .filter((w) => w.length > 2 && !STOP_WORDS.has(w));
}

/**
 * Estimate token count (rough: ~4 chars per token for English)
 */
function estimateTokens(text: string): number {
  return Math.ceil(text.length / 4);
}

/**
 * Calculate TF-IDF-like relevance score for a sentence
 */
function relevanceScore(
  sentence: string,
  query: string,
  position: number,
  totalSentences: number,
  boostReasoning: boolean,
): number {
  let score = 0;

  // 1. Term overlap with query (TF-IDF-like)
  const queryTerms = tokenizeForTfIdf(query);
  const sentenceTerms = tokenizeForTfIdf(sentence);
  const sentenceTermSet = new Set(sentenceTerms);

  for (const term of queryTerms) {
    if (sentenceTermSet.has(term)) {
      // IDF-like: rarer terms in sentence = higher weight
      const termFreq = sentenceTerms.filter((t) => t === term).length;
      score += 1 / Math.log(1 + termFreq);
    }
  }

  // 2. Position bias (first and last sentences often important)
  const positionScore = position === 0 ? 0.3 : position === totalSentences - 1 ? 0.2 : 0;
  score += positionScore;

  // 3. Reasoning keyword boost
  if (boostReasoning && REASONING_KEYWORDS.test(sentence)) {
    score *= 1.5;
  }

  // 4. High-value starter boost
  if (VALUE_STARTERS.test(sentence.trim())) {
    score *= 1.3;
  }

  // 5. Length penalty for very short sentences (likely not informative)
  if (sentence.length < 20) {
    score *= 0.5;
  }

  // 6. Penalty for filler phrases
  if (/^(um|uh|well|so|okay|basically|actually|like)\b/i.test(sentence.trim())) {
    score *= 0.3;
  }

  return score;
}

/**
 * Remove filler phrases from text
 */
function cleanFillers(sentence: string): { cleaned: string; removedCount: number } {
  let cleaned = sentence;
  let removedCount = 0;

  for (const pattern of FILLER_PATTERNS) {
    const before = cleaned;
    cleaned = cleaned.replace(pattern, " ");
    if (cleaned !== before) removedCount++;
  }

  // Normalize whitespace
  cleaned = cleaned.replace(/\s+/g, " ").trim();

  return { cleaned, removedCount };
}

/**
 * Check if sentence is pure meta-cognition (should be removed entirely)
 */
function isMetaSentence(sentence: string): boolean {
  const trimmed = sentence.trim();
  return META_SENTENCE_PATTERNS.some((p) => p.test(trimmed));
}
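
Illustrative behavior of the helpers above (a sketch, not part of the diff):

splitSentences("It works. Why? Because keys match!");
// -> ["It works.", "Why?", "Because keys match!"]

tokenizeForTfIdf("The cache misses because the keys differ");
// -> ["cache", "misses", "because", "keys", "differ"]  (stop words and 1-2 char words dropped)

estimateTokens("12345678"); // -> 2  (~4 characters per token)
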
/**
 * Compute Normalized Compression Distance between two strings
 * NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
 *
 * Lower NCD = more similar (0 = identical, 1 = unrelated)
 *
 * @param a - First string
 * @param b - Second string
 * @param cachedCa - Optional pre-computed gzip size for string a (optimization)
 * @param cachedCb - Optional pre-computed gzip size for string b (optimization)
 */
export function computeNCD(a: string, b: string, cachedCa?: number, cachedCb?: number): number {
  if (a.length === 0 || b.length === 0) return 1;

  try {
    const Ca = cachedCa ?? gzipSync(Buffer.from(a)).length;
    const Cb = cachedCb ?? gzipSync(Buffer.from(b)).length;
    const Cab = gzipSync(Buffer.from(`${a} ${b}`)).length;

    const ncd = (Cab - Math.min(Ca, Cb)) / Math.max(Ca, Cb);
    return Math.min(1, Math.max(0, ncd)); // Clamp to [0, 1]
  } catch {
    return 0.5; // Default on error
  }
}
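
A quick sketch of the intuition (illustrative, not part of the package; gzip-based NCD is noisy on very short strings, but related sentences should still land closer than unrelated ones):

const near = computeNCD("the cache key includes a timestamp", "cache keys contain timestamps");
const far = computeNCD("the cache key includes a timestamp", "pelicans migrate in autumn");
// expected: near < far  (lower NCD = more similar)
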
/**
 * Compute Jaccard similarity between two token sets
 */
export function jaccardSimilarity(a: string, b: string): number {
  const tokensA = new Set(tokenize(a));
  const tokensB = new Set(tokenize(b));

  if (tokensA.size === 0 || tokensB.size === 0) return 0;

  const intersection = [...tokensA].filter((t) => tokensB.has(t)).length;
  const union = new Set([...tokensA, ...tokensB]).size;

  return intersection / union;
}
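
A worked example (illustrative, not part of the diff):

// A = {cache, misses, often}, B = {cache, misses, rarely}
// |A ∩ B| = 2, |A ∪ B| = 4, so similarity = 0.5
jaccardSimilarity("cache misses often", "cache misses rarely"); // -> 0.5
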
/**
 * Compress context by keeping sentences most relevant to the query
 *
 * Features:
 * - TF-IDF relevance scoring
 * - NCD (gzip-based) query similarity
 * - Coreference constraint enforcement
 * - Causal chain preservation
 * - Filler/meta-cognition removal
 * - Repetition detection
 */
export function compress(
  context: string,
  query: string,
  options: CompressionOptions = {},
): CompressionResult {
  const opts = { ...DEFAULT_OPTIONS, ...options };
  const rawSentences = splitSentences(context);

  // Early exit for short text
  if (rawSentences.length <= opts.min_sentences) {
    return createShortTextResult(context, rawSentences.length);
  }

  // Phase 1: Pre-processing - Build metadata
  const { metadata, fillersRemoved, repetitionsPenalized } = buildSentenceMetadata(
    rawSentences,
    query,
    opts,
  );

  // Phase 2: Compute scores
  computeSentenceScores(metadata, query, opts);

  // Phase 3: Select sentences
  const keepCount = Math.max(
    opts.min_sentences,
    Math.ceil(rawSentences.length * opts.target_ratio),
  );
  const selected = selectTopSentences(metadata, keepCount);

  // Phase 4: Enforce constraints
  const constraints = enforceConstraints(metadata, selected, opts);

  // Reconstruct in original order
  const kept = rawSentences.filter((_, i) => selected.has(i));
  const dropped = rawSentences.filter((_, i) => !selected.has(i));
  const compressed = kept.join(" ");

  return {
    compressed,
    original_tokens: estimateTokens(context),
    compressed_tokens: estimateTokens(compressed),
    ratio: compressed.length / Math.max(context.length, 1),
    kept_sentences: kept.length,
    dropped_sentences: dropped,
    enhancements: {
      fillers_removed: fillersRemoved,
      coref_constraints_applied: constraints.coref,
      causal_constraints_applied: constraints.causal,
      repetitions_penalized: repetitionsPenalized,
      ncd_boost_applied: opts.useNCD,
    },
  };
}
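
An end-to-end sketch of the four-phase pipeline (illustrative, not part of the diff):

const result = compress(
  "Let me think about this. The cache key includes a timestamp. " +
    "Therefore every lookup misses. Basically we should drop the timestamp.",
  "why does the cache miss",
  { target_ratio: 0.5 },
);
// "Let me think about this." matches META_SENTENCE_PATTERNS, scores -1000, and
// is never selected; "Therefore every lookup misses.", if kept, pulls in the
// preceding premise sentence via the causal-chain constraint.
console.log(result.compressed);
console.log(result.dropped_sentences);
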
/** Create result for text too short to compress */
function createShortTextResult(context: string, sentenceCount: number): CompressionResult {
  return {
    compressed: context,
    original_tokens: estimateTokens(context),
    compressed_tokens: estimateTokens(context),
    ratio: 1.0,
    kept_sentences: sentenceCount,
    dropped_sentences: [],
    enhancements: {
      fillers_removed: 0,
      coref_constraints_applied: 0,
      causal_constraints_applied: 0,
      repetitions_penalized: 0,
      ncd_boost_applied: false,
    },
  };
}

/** Build metadata for all sentences */
function buildSentenceMetadata(
  rawSentences: string[],
  query: string,
  opts: Required<CompressionOptions>,
): { metadata: SentenceMetadata[]; fillersRemoved: number; repetitionsPenalized: number } {
  let fillersRemoved = 0;
  let repetitionsPenalized = 0;

  // Pre-compute query's gzip size once (optimization: avoids redundant compression)
  let cachedQueryGzipSize: number | undefined;
  if (opts.useNCD && query.length > 0) {
    try {
      cachedQueryGzipSize = gzipSync(Buffer.from(query)).length;
    } catch {
      // Fallback: let computeNCD handle it
    }
  }

  const metadata: SentenceMetadata[] = rawSentences.map((sentence, index) => {
    if (opts.removeFillers && isMetaSentence(sentence)) {
      fillersRemoved++;
      return {
        index,
        original: sentence,
        cleaned: "",
        score: -1000,
        ncdScore: 1,
        startsWithPronoun: false,
        hasCausalConnective: false,
        repeatSimilarity: 0,
        requiredBy: null,
      };
    }

    const { cleaned, removedCount } = opts.removeFillers
      ? cleanFillers(sentence)
      : { cleaned: sentence, removedCount: 0 };
    fillersRemoved += removedCount;

    // Use cached query gzip size for NCD computation
    const ncdScore = opts.useNCD ? computeNCD(cleaned, query, undefined, cachedQueryGzipSize) : 0.5;
    const startsWithPronoun = PRONOUN_START.test(cleaned);
    const hasCausalConnective =
      CAUSAL_CONNECTIVES.test(cleaned) || CONTRASTIVE_CONNECTIVES.test(cleaned);

    return {
      index,
      original: sentence,
      cleaned,
      score: 0,
      ncdScore,
      startsWithPronoun,
      hasCausalConnective,
      repeatSimilarity: 0,
      requiredBy: null,
    };
  });

  // Compute repetition similarity and mark dependencies
  for (let i = 1; i < metadata.length; i++) {
    const current = metadata[i];
    const previous = metadata[i - 1];
    if (current && previous) {
      const sim = jaccardSimilarity(current.cleaned, previous.cleaned);
      current.repeatSimilarity = sim;
      if (sim > opts.repeatThreshold) repetitionsPenalized++;
      if (current.startsWithPronoun) previous.requiredBy = i;
      if (current.hasCausalConnective) previous.requiredBy = i;
    }
  }

  return { metadata, fillersRemoved, repetitionsPenalized };
}

/** Compute relevance scores for sentences */
function computeSentenceScores(
  metadata: SentenceMetadata[],
  query: string,
  opts: Required<CompressionOptions>,
): void {
  const totalSentences = metadata.filter((m) => m.cleaned.length > 0).length;

  for (const m of metadata) {
    if (m.cleaned.length === 0) continue;

    let score = relevanceScore(m.cleaned, query, m.index, totalSentences, opts.boost_reasoning);
    if (opts.useNCD) score += (1 - m.ncdScore) * 0.5;
    if (m.repeatSimilarity > opts.repeatThreshold) score *= 0.3;
    if (m.requiredBy !== null) score *= 1.2;
    m.score = score;
  }
}

/** Select top sentences by score */
function selectTopSentences(metadata: SentenceMetadata[], keepCount: number): Set<number> {
  const validMetadata = metadata.filter((m) => m.cleaned.length > 0);
  const sorted = [...validMetadata].sort((a, b) => b.score - a.score);
  return new Set(sorted.slice(0, keepCount).map((m) => m.index));
}

/** Enforce coreference and causal chain constraints */
function enforceConstraints(
  metadata: SentenceMetadata[],
  selected: Set<number>,
  opts: Required<CompressionOptions>,
): { coref: number; causal: number } {
  let corefConstraints = 0;
  let causalConstraints = 0;

  if (!opts.enforceCoref && !opts.enforceCausalChains) {
    return { coref: 0, causal: 0 };
  }

  let changed = true;
  let iterations = 0;
  const maxIterations = 10;

  while (changed && iterations < maxIterations) {
    changed = false;
    iterations++;

    for (const m of metadata) {
      if (!selected.has(m.index)) continue;

      if (opts.enforceCoref && m.startsWithPronoun && m.index > 0 && !selected.has(m.index - 1)) {
        selected.add(m.index - 1);
        corefConstraints++;
        changed = true;
      }

      if (
        opts.enforceCausalChains &&
        m.hasCausalConnective &&
        m.index > 0 &&
        !selected.has(m.index - 1)
      ) {
        selected.add(m.index - 1);
        causalConstraints++;
        changed = true;
      }
    }
  }

  return { coref: corefConstraints, causal: causalConstraints };
}
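
How the fixed-point loop in enforceConstraints behaves, as an illustrative trace (not part of the diff):

// Suppose only sentence 2 is selected, sentence 2 starts with "Therefore ..."
// and sentence 1 starts with "It ...":
//   pass 1: {2} -> {1, 2}       (causal connective pulls in the premise, sentence 1)
//   pass 2: {1, 2} -> {0, 1, 2} (pronoun start pulls in the antecedent, sentence 0)
// The loop ends when a pass adds nothing, or after maxIterations (10) passes.
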
/**
 * Quick compression for context before adding to prompt
 * Returns compressed text if compression is beneficial, otherwise original
 */
export function quickCompress(context: string, query: string, maxTokens: number = 500): string {
  const currentTokens = estimateTokens(context);

  if (currentTokens <= maxTokens) {
    return context;
  }

  const targetRatio = maxTokens / currentTokens;
  const result = compress(context, query, { target_ratio: targetRatio });

  return result.compressed;
}
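
An illustrative sketch (not part of the diff): cap retrieved context at roughly 300 tokens before splicing it into a prompt; inputs already under budget pass through unchanged.

const retrievedDoc = "..."; // placeholder for a long retrieved document
const userQuestion = "why does the cache miss";
const trimmed = quickCompress(retrievedDoc, userQuestion, 300);
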
// ============================================================================
// COMPRESSION DETECTION - Determine if text would benefit from compression
// ============================================================================

export interface CompressionAnalysis {
  /** Whether compression is recommended */
  shouldCompress: boolean;
  /** Shannon entropy in bits per character (0-8 for bytes, ~4.5 typical for English) */
  entropy: number;
  /** Ratio of unique characters to total length (0-1) */
  uniquenessRatio: number;
  /** Estimated compression ratio achievable (0-1, lower = better compression) */
  estimatedRatio: number;
  /** Estimated tokens in text */
  tokens: number;
  /** Reasons for the recommendation */
  reasons: string[];
}

/** Thresholds for compression decision */
const COMPRESSION_THRESHOLDS = {
  /** Minimum tokens before compression is worthwhile */
  MIN_TOKENS: 100,
  /** Entropy below this indicates high redundancy (good for compression) */
  LOW_ENTROPY: 4.0,
  /** Entropy above this indicates low redundancy (compression less effective) */
  HIGH_ENTROPY: 6.5,
  /** Uniqueness ratio below this suggests repetitive content */
  LOW_UNIQUENESS: 0.3,
  /** Minimum estimated savings to recommend compression */
  MIN_SAVINGS: 0.2,
} as const;
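
How these thresholds partition the entropy scale, per the decision logic in needsCompression below (illustrative summary, not part of the diff):

//   entropy < 4.0          -> redundant, compress
//   4.0 <= entropy <= 6.5  -> compress only with other signals
//                             (uniqueness < 0.3, or >500 tokens with entropy < 5.5)
//   entropy > 6.5          -> already dense, skip
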
/**
 * Calculate Shannon entropy of text (bits per character)
 *
 * Based on Shannon's source coding theorem: entropy represents the theoretical
 * lower bound for lossless compression. English text typically has entropy ~4.5
 * bits/char; random data approaches 8 bits/byte (maximum).
 *
 * @param text - Input text to analyze
 * @returns Entropy in bits per character (0 to ~8)
 */
export function calculateEntropy(text: string): number {
  if (text.length === 0) return 0;

  // Count character frequencies
  const freq = new Map<string, number>();
  for (const char of text) {
    freq.set(char, (freq.get(char) || 0) + 1);
  }

  // Calculate entropy: H = -Σ p(x) * log2(p(x))
  const len = text.length;
  let entropy = 0;

  for (const count of freq.values()) {
    const p = count / len;
    entropy -= p * Math.log2(p);
  }

  return entropy;
}
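
Worked values (illustrative, not part of the diff):

calculateEntropy("aaaa"); // -> 0  (one symbol, p = 1, -1·log2(1) = 0)
calculateEntropy("abcd"); // -> 2  (four symbols at p = 0.25, -4·0.25·log2(0.25) = 2)
calculateEntropy("aabb"); // -> 1  (two symbols at p = 0.5)
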
/**
 * Analyze text to determine if compression would be beneficial
 *
 * Uses multiple heuristics based on information theory research:
 * 1. Shannon entropy - measures information density
 * 2. Uniqueness ratio - character diversity
 * 3. Token count - minimum size for compression overhead to pay off
 * 4. Estimated compression ratio based on entropy
 *
 * @param text - Text to analyze
 * @param query - Optional query for context-aware analysis
 * @returns Analysis with recommendation and metrics
 */
export function needsCompression(text: string, query?: string): CompressionAnalysis {
  const tokens = estimateTokens(text);
  const reasons: string[] = [];

  // Short text: compression overhead not worthwhile
  if (tokens < COMPRESSION_THRESHOLDS.MIN_TOKENS) {
    return {
      shouldCompress: false,
      entropy: 0,
      uniquenessRatio: 1,
      estimatedRatio: 1,
      tokens,
      reasons: [`Text too short (${tokens} tokens < ${COMPRESSION_THRESHOLDS.MIN_TOKENS} minimum)`],
    };
  }

  const entropy = calculateEntropy(text);

  // Uniqueness ratio: unique chars / total length
  const uniqueChars = new Set(text).size;
  const uniquenessRatio = uniqueChars / text.length;

  // Estimate compression ratio based on entropy
  // Theoretical: ratio ≈ entropy / 8 (since 8 bits = 1 byte max)
  // For text compression targeting semantic content, we use a more practical estimate
  // Factor in that our sentence-level compression can achieve ~40-60% on repetitive text
  const theoreticalRatio = entropy / 8;
  const practicalRatio = Math.min(1, theoreticalRatio + 0.2); // Add overhead margin
  const estimatedRatio = practicalRatio;

  // Decision logic
  let shouldCompress = false;

  // High redundancy indicators
  if (entropy < COMPRESSION_THRESHOLDS.LOW_ENTROPY) {
    shouldCompress = true;
    reasons.push(`Low entropy (${entropy.toFixed(2)} bits/char) indicates high redundancy`);
  }

  if (uniquenessRatio < COMPRESSION_THRESHOLDS.LOW_UNIQUENESS) {
    shouldCompress = true;
    reasons.push(
      `Low uniqueness ratio (${(uniquenessRatio * 100).toFixed(1)}%) suggests repetitive content`,
    );
  }

  // Token count consideration
  if (tokens > 500) {
    // Longer text benefits more from compression
    if (entropy < 5.5) {
      shouldCompress = true;
      reasons.push(`Long text (${tokens} tokens) with moderate entropy benefits from compression`);
    }
  }

  // Check estimated savings
  const estimatedSavings = 1 - estimatedRatio;
  if (estimatedSavings < COMPRESSION_THRESHOLDS.MIN_SAVINGS && shouldCompress) {
    // Override if savings too small
    if (tokens < 300) {
      shouldCompress = false;
      reasons.length = 0;
      reasons.push(
        `Estimated savings (${(estimatedSavings * 100).toFixed(1)}%) too small for text size`,
      );
    }
  }

  // High entropy = likely already dense or random
  if (entropy > COMPRESSION_THRESHOLDS.HIGH_ENTROPY) {
    shouldCompress = false;
    reasons.length = 0;
    reasons.push(`High entropy (${entropy.toFixed(2)} bits/char) indicates already-dense content`);
  }

  // Query relevance boost: if query provided, check if compression preserves key terms
  if (query && shouldCompress) {
    const queryTerms = tokenizeForTfIdf(query);
    const textTerms = new Set(tokenizeForTfIdf(text));
    const overlap = queryTerms.filter((t) => textTerms.has(t)).length;
    const overlapRatio = queryTerms.length > 0 ? overlap / queryTerms.length : 0;

    if (overlapRatio > 0.5) {
      reasons.push(`Query terms well-represented (${(overlapRatio * 100).toFixed(0)}% overlap)`);
    }
  }

  if (reasons.length === 0) {
    reasons.push("No strong compression indicators detected");
  }

  return {
    shouldCompress,
    entropy,
    uniquenessRatio,
    estimatedRatio,
    tokens,
    reasons,
  };
}
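
An illustrative gate-then-compress sketch (not part of the diff): highly repetitive input has low entropy and a low uniqueness ratio, so the analysis recommends compressing.

const doc = "The cache misses. ".repeat(100); // ~450 tokens, very repetitive
const analysis = needsCompression(doc, "cache misses");
if (analysis.shouldCompress) {
  const { compressed } = compress(doc, "cache misses", { target_ratio: 0.3 });
}
console.log(analysis.entropy.toFixed(2), analysis.reasons);
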
// ============================================================================
// Utility Exports
// ============================================================================

export { cleanFillers, isMetaSentence };
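
The utility exports in use (illustrative sketch; the import path is assumed, not shown in the diff):

import { cleanFillers, isMetaSentence } from "./compression";

isMetaSentence("Hmm."); // -> true (matches META_SENTENCE_PATTERNS)
cleanFillers("Well, basically it works.");
// -> { cleaned: "it works.", removedCount: 2 }
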