@small-ltsc/ml 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,158 @@
1
+ # @small-ltsc/ml
2
+
3
+ Optional ML features for **Small LTSC**: pattern importance scoring, quality prediction, and adaptive region detection.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install @small-ltsc/ml @small-ltsc/sdk
9
+ ```
10
+
11
+ Note: `@small-ltsc/sdk` is a peer dependency.
12
+
13
+ ## Features
14
+
15
+ - **Pattern Importance Scoring** - Determine which patterns are semantically important
16
+ - **Quality Prediction** - Predict if compression will degrade model performance
17
+ - **Region Detection** - Identify system prompts, user input, and context for adaptive compression
18
+
19
+ ## Pattern Importance
20
+
21
+ Score patterns to preserve semantically important content:
22
+
23
+ ```typescript
24
+ import { PositionalImportanceScorer, filterByImportance } from '@small-ltsc/ml';
25
+ import { discoverPatterns } from '@small-ltsc/sdk';
26
+
27
+ const scorer = new PositionalImportanceScorer({ decayRate: 2.0 });
28
+ const patterns = await discoverPatterns(tokens);
29
+ const scores = await scorer.scorePatterns(tokens, patterns);
30
+
31
+ // Filter out high-importance patterns (preserve them)
32
+ const safeToCompress = filterByImportance(patterns, scores, 0.8);
33
+ ```
34
+
35
+ ### Embedding-Based Scoring
36
+
37
+ For more accurate importance scoring with an embedding model:
38
+
39
+ ```typescript
40
+ import { EmbeddingImportanceScorer } from '@small-ltsc/ml';
41
+
42
+ const scorer = new EmbeddingImportanceScorer(embeddingProvider, {
43
+ contextWindow: 5,
44
+ });
45
+
46
+ const scores = await scorer.scorePatterns(tokens, patterns);
47
+ ```
48
+
49
+ ## Quality Prediction
50
+
51
+ Predict if compressed output will maintain quality:
52
+
53
+ ```typescript
54
+ import { createQualityPredictor } from '@small-ltsc/ml';
55
+ import { compress } from '@small-ltsc/sdk';
56
+
57
+ const predictor = createQualityPredictor();
58
+ const result = await compress(tokens);
59
+ const prediction = await predictor.predict(result);
60
+
61
+ if (!prediction.acceptable) {
62
+ console.log(`Recommendation: ${prediction.recommendation}`);
63
+ // Here: 'retry_conservative' | 'skip_compression' ('accept' is only returned when prediction.acceptable is true)
64
+ }
65
+ ```
66
+
67
+ ### Quality Features
68
+
69
+ ```typescript
70
+ console.log(prediction.features);
71
+ // {
72
+ // compressionRatio: 0.65,
73
+ // dictionaryOverhead: 0.15,
74
+ // diversityReduction: 0.2,
75
+ // averagePatternLength: 4.5,
76
+ // patternCount: 12,
77
+ // }
78
+ ```
79
+
80
+ ## Region Detection
81
+
82
+ Detect semantic regions for adaptive compression:
83
+
84
+ ```typescript
85
+ import { detectRegions, RegionType, filterPatternsByRegion } from '@small-ltsc/ml';
86
+
87
+ const regions = detectRegions(tokens, {
88
+ systemMarkers: [[58, 71905, 60]], // [SYSTEM] tokens
89
+ retentionTargets: {
90
+ [RegionType.SYSTEM]: 0.98, // Minimal compression
91
+ [RegionType.USER]: 0.85, // Moderate
92
+ [RegionType.CONTEXT]: 0.6, // Aggressive
93
+ },
94
+ });
95
+
96
+ // Filter patterns based on region constraints
97
+ const filtered = filterPatternsByRegion(patterns, regions, tokens);
98
+ ```
99
+
100
+ ### Region Types
101
+
102
+ - `SYSTEM` - System instructions (high retention)
103
+ - `USER` - User input (moderate retention)
104
+ - `CONTEXT` - Injected context/documents (low retention)
105
+ - `CODE` - Code blocks (moderate retention)
106
+ - `UNKNOWN` - Default region
107
+
108
+ ## Custom Embedding Provider
109
+
110
+ Implement the `EmbeddingProvider` interface for your embedding model:
111
+
112
+ ```typescript
113
+ import type { EmbeddingProvider } from '@small-ltsc/ml';
114
+
115
+ class OpenAIEmbeddings implements EmbeddingProvider {
116
+ async embed(tokens: readonly number[]): Promise<Float32Array> {
117
+ const text = tokenizer.decode(tokens);
118
+ const response = await openai.embeddings.create({
119
+ model: 'text-embedding-3-small',
120
+ input: text,
121
+ });
122
+ return new Float32Array(response.data[0].embedding);
123
+ }
124
+
125
+ dimension(): number {
126
+ return 1536;
127
+ }
128
+ }
129
+
130
+ const scorer = new EmbeddingImportanceScorer(new OpenAIEmbeddings());
131
+ ```
132
+
133
+ ## API Reference
134
+
135
+ ### Importance Scoring
136
+
137
+ - `PositionalImportanceScorer` - Score by position (early = important)
138
+ - `EmbeddingImportanceScorer` - Score by context diversity
139
+ - `CombinedImportanceScorer` - Combine positional and embedding scoring
140
+ - `adjustPrioritiesByImportance(patterns, scores, threshold)` - Adjust pattern priorities
141
+ - `filterByImportance(patterns, scores, threshold)` - Keep only patterns scoring below the importance threshold (drops high-importance patterns so they are not compressed)
142
+
143
+ ### Quality Prediction
144
+
145
+ - `HeuristicQualityPredictor` - Rule-based quality prediction
146
+ - `EmbeddingQualityPredictor` - Enhanced with embedding similarity
147
+ - `createQualityPredictor(provider?, config?)` - Factory function
148
+
149
+ ### Region Detection
150
+
151
+ - `detectRegions(tokens, config?)` - Detect semantic regions
152
+ - `detectRegionsHeuristic(tokens)` - Simple heuristic detection
153
+ - `filterPatternsByRegion(patterns, regions, tokens)` - Filter by region
154
+ - `getRegionCompressionSettings(regionType)` - Get settings for region
155
+
156
+ ## License
157
+
158
+ MIT
@@ -0,0 +1,190 @@
1
+ /**
2
+ * Pattern importance scoring for ML-aware compression.
3
+ *
4
+ * Port of `small/pattern_importance.py` to TypeScript.
5
+ */
/**
 * Fallback settings for the importance scorers in this module.
 * Each scorer reads the field it needs and lets a caller-supplied
 * config override it.
 */
const DEFAULT_IMPORTANCE_CONFIG = {
    // Exponential decay rate applied to the normalized position (positional scorer).
    decayRate: 2,
    // Number of tokens taken on each side of an occurrence (embedding scorer).
    contextWindow: 5,
    // Weight of the positional score in the combined scorer; the embedding
    // score receives the remaining (1 - positionalWeight).
    positionalWeight: 0.3,
};
11
+ /**
12
+ * Positional importance scorer.
13
+ *
14
+ * Scores patterns based on their position in the sequence,
15
+ * with earlier positions receiving higher importance (useful for
16
+ * system prompts that typically appear at the start).
17
+ */
18
+ export class PositionalImportanceScorer {
19
+ decayRate;
20
+ constructor(config) {
21
+ const cfg = { ...DEFAULT_IMPORTANCE_CONFIG, ...config };
22
+ this.decayRate = cfg.decayRate;
23
+ }
24
+ async scorePatterns(tokens, patterns) {
25
+ const n = tokens.length;
26
+ if (n === 0) {
27
+ return patterns.map(() => 0);
28
+ }
29
+ return patterns.map((pattern) => {
30
+ if (pattern.positions.length === 0) {
31
+ return 0;
32
+ }
33
+ // Compute average positional importance across all occurrences
34
+ let totalImportance = 0;
35
+ for (const pos of pattern.positions) {
36
+ // Exponential decay from start of sequence
37
+ const normalizedPos = pos / n;
38
+ const importance = Math.exp(-this.decayRate * normalizedPos);
39
+ totalImportance += importance;
40
+ }
41
+ return totalImportance / pattern.positions.length;
42
+ });
43
+ }
44
+ }
45
+ /**
46
+ * Embedding-based importance scorer.
47
+ *
48
+ * Uses an embedding model to determine if a pattern appears in
49
+ * diverse semantic contexts (important, should preserve) vs.
50
+ * similar contexts (redundant, safe to compress).
51
+ */
52
+ export class EmbeddingImportanceScorer {
53
+ provider;
54
+ contextWindow;
55
+ constructor(provider, config) {
56
+ const cfg = { ...DEFAULT_IMPORTANCE_CONFIG, ...config };
57
+ this.provider = provider;
58
+ this.contextWindow = cfg.contextWindow;
59
+ }
60
+ async scorePatterns(tokens, patterns) {
61
+ const tokenArray = Array.isArray(tokens) ? tokens : Array.from(tokens);
62
+ const n = tokenArray.length;
63
+ if (n === 0) {
64
+ return patterns.map(() => 0);
65
+ }
66
+ const scores = [];
67
+ for (const pattern of patterns) {
68
+ if (pattern.positions.length <= 1) {
69
+ // Single occurrence - can't compute diversity
70
+ scores.push(0.5);
71
+ continue;
72
+ }
73
+ // Extract context windows around each occurrence
74
+ const contextEmbeddings = [];
75
+ for (const pos of pattern.positions) {
76
+ const start = Math.max(0, pos - this.contextWindow);
77
+ const end = Math.min(n, pos + pattern.length + this.contextWindow);
78
+ const context = tokenArray.slice(start, end);
79
+ const embedding = await this.provider.embed(context);
80
+ contextEmbeddings.push(embedding);
81
+ }
82
+ // Compute pairwise cosine similarities
83
+ const similarities = [];
84
+ for (let i = 0; i < contextEmbeddings.length; i++) {
85
+ for (let j = i + 1; j < contextEmbeddings.length; j++) {
86
+ const sim = cosineSimilarity(contextEmbeddings[i], contextEmbeddings[j]);
87
+ similarities.push(sim);
88
+ }
89
+ }
90
+ // Low average similarity = diverse contexts = high importance
91
+ const avgSimilarity = similarities.length > 0
92
+ ? similarities.reduce((a, b) => a + b, 0) / similarities.length
93
+ : 0;
94
+ // Convert to importance (invert similarity)
95
+ scores.push(1 - avgSimilarity);
96
+ }
97
+ return scores;
98
+ }
99
+ }
100
+ /**
101
+ * Combined importance scorer that uses both positional and embedding-based scoring.
102
+ */
103
+ export class CombinedImportanceScorer {
104
+ positionalScorer;
105
+ embeddingScorer;
106
+ positionalWeight;
107
+ constructor(provider, config) {
108
+ const cfg = { ...DEFAULT_IMPORTANCE_CONFIG, ...config };
109
+ this.positionalScorer = new PositionalImportanceScorer(config);
110
+ this.embeddingScorer = provider ? new EmbeddingImportanceScorer(provider, config) : null;
111
+ this.positionalWeight = cfg.positionalWeight;
112
+ }
113
+ async scorePatterns(tokens, patterns) {
114
+ const positionalScores = await this.positionalScorer.scorePatterns(tokens, patterns);
115
+ if (!this.embeddingScorer) {
116
+ return positionalScores;
117
+ }
118
+ const embeddingScores = await this.embeddingScorer.scorePatterns(tokens, patterns);
119
+ // Weighted combination
120
+ return positionalScores.map((posScore, i) => {
121
+ const embScore = embeddingScores[i];
122
+ return this.positionalWeight * posScore + (1 - this.positionalWeight) * embScore;
123
+ });
124
+ }
125
+ }
/**
 * Cosine similarity of two equally sized numeric vectors.
 *
 * @param a - First vector (array or typed array)
 * @param b - Second vector, same length as `a`
 * @returns `dot(a, b) / (|a| * |b|)`, or 0 when either vector has zero norm
 * @throws Error when the vectors differ in length
 */
function cosineSimilarity(a, b) {
    if (a.length !== b.length) {
        throw new Error('Vectors must have same length');
    }
    let dot = 0;
    let squaredA = 0;
    let squaredB = 0;
    let index = 0;
    while (index < a.length) {
        const x = a[index];
        const y = b[index];
        dot += x * y;
        squaredA += x * x;
        squaredB += y * y;
        index += 1;
    }
    // A zero vector has no direction; report "no similarity" instead of NaN.
    const hasZeroNorm = squaredA === 0 || squaredB === 0;
    return hasZeroNorm ? 0 : dot / (Math.sqrt(squaredA) * Math.sqrt(squaredB));
}
/**
 * Annotate patterns with a compression priority derived from importance.
 *
 * Important patterns receive a negative adjustment (compress late or never);
 * unimportant ones receive a positive adjustment (compress first).
 *
 * @param patterns - Patterns to annotate
 * @param scores - Importance scores, index-aligned with `patterns`
 * @param threshold - Importance strictly above this value counts as "preserve"
 * @returns Shallow copies of the patterns with `_importanceScore` and
 *   `_adjustedPriority` added
 */
export function adjustPrioritiesByImportance(patterns, scores, threshold = 0.7) {
    return patterns.map((pattern, index) => {
        const importance = scores[index];
        // Above the threshold: -10 * importance (preserve).
        // Otherwise: 10 * (1 - importance) (compress).
        const adjustedPriority = importance > threshold
            ? -10 * importance
            : 10 * (1 - importance);
        return {
            ...pattern,
            _importanceScore: importance,
            _adjustedPriority: adjustedPriority,
        };
    });
}
/**
 * Keep only the patterns whose importance is strictly below a threshold.
 *
 * @param patterns - Candidate patterns
 * @param scores - Importance scores, index-aligned with `patterns`
 * @param threshold - Exclusive upper bound on importance for a pattern to be kept
 * @returns The patterns considered safe to compress, in their original order
 */
export function filterByImportance(patterns, scores, threshold = 0.8) {
    const isSafeToCompress = (_pattern, index) => scores[index] < threshold;
    return patterns.filter(isSafeToCompress);
}
190
+ //# sourceMappingURL=importance.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"importance.js","sourceRoot":"","sources":["../../src/importance.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AA8CH,MAAM,yBAAyB,GAA+B;IAC5D,SAAS,EAAE,GAAG;IACd,aAAa,EAAE,CAAC;IAChB,gBAAgB,EAAE,GAAG;CACtB,CAAC;AAmBF;;;;;;GAMG;AACH,MAAM,OAAO,0BAA0B;IAC7B,SAAS,CAAS;IAE1B,YAAY,MAAyB;QACnC,MAAM,GAAG,GAAG,EAAE,GAAG,yBAAyB,EAAE,GAAG,MAAM,EAAE,CAAC;QACxD,IAAI,CAAC,SAAS,GAAG,GAAG,CAAC,SAAS,CAAC;IACjC,CAAC;IAED,KAAK,CAAC,aAAa,CAAC,MAAgB,EAAE,QAA6B;QACjE,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;QACxB,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YACZ,OAAO,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/B,CAAC;QAED,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE;YAC9B,IAAI,OAAO,CAAC,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACnC,OAAO,CAAC,CAAC;YACX,CAAC;YAED,+DAA+D;YAC/D,IAAI,eAAe,GAAG,CAAC,CAAC;YACxB,KAAK,MAAM,GAAG,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;gBACpC,2CAA2C;gBAC3C,MAAM,aAAa,GAAG,GAAG,GAAG,CAAC,CAAC;gBAC9B,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,SAAS,GAAG,aAAa,CAAC,CAAC;gBAC7D,eAAe,IAAI,UAAU,CAAC;YAChC,CAAC;YAED,OAAO,eAAe,GAAG,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC;QACpD,CAAC,CAAC,CAAC;IACL,CAAC;CACF;AAED;;;;;;GAMG;AACH,MAAM,OAAO,yBAAyB;IAC5B,QAAQ,CAAoB;IAC5B,aAAa,CAAS;IAE9B,YAAY,QAA2B,EAAE,MAAyB;QAChE,MAAM,GAAG,GAAG,EAAE,GAAG,yBAAyB,EAAE,GAAG,MAAM,EAAE,CAAC;QACxD,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,aAAa,GAAG,GAAG,CAAC,aAAa,CAAC;IACzC,CAAC;IAED,KAAK,CAAC,aAAa,CAAC,MAAgB,EAAE,QAA6B;QACjE,MAAM,UAAU,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACvE,MAAM,CAAC,GAAG,UAAU,CAAC,MAAM,CAAC;QAE5B,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YACZ,OAAO,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/B,CAAC;QAED,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,IAAI,OAAO,CAAC,SAAS,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;gBAClC,8CAA8C;gBAC9C,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACjB,SAAS;YACX,CAAC;YAED,iDAAiD;YACjD,MAAM,iBAAiB,GAAmB,EAAE,CAAC;YAE7C,KAAK,MAAM,GAAG,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;gBACpC,MAAM,KAAK,GAAG,IAAI,CAAC
,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC;gBACpD,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC;gBACnE,MAAM,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;gBAE7C,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;gBACrD,iBAAiB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACpC,CAAC;YAED,uCAAuC;YACvC,MAAM,YAAY,GAAa,EAAE,CAAC;YAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,iBAAiB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAClD,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,iBAAiB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;oBACtD,MAAM,GAAG,GAAG,gBAAgB,CAAC,iBAAiB,CAAC,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC,CAAC,CAAC;oBACzE,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACzB,CAAC;YACH,CAAC;YAED,8DAA8D;YAC9D,MAAM,aAAa,GACjB,YAAY,CAAC,MAAM,GAAG,CAAC;gBACrB,CAAC,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM;gBAC/D,CAAC,CAAC,CAAC,CAAC;YAER,4CAA4C;YAC5C,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,aAAa,CAAC,CAAC;QACjC,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AAED;;GAEG;AACH,MAAM,OAAO,wBAAwB;IAC3B,gBAAgB,CAA6B;IAC7C,eAAe,CAAmC;IAClD,gBAAgB,CAAS;IAEjC,YAAY,QAA4B,EAAE,MAAyB;QACjE,MAAM,GAAG,GAAG,EAAE,GAAG,yBAAyB,EAAE,GAAG,MAAM,EAAE,CAAC;QACxD,IAAI,CAAC,gBAAgB,GAAG,IAAI,0BAA0B,CAAC,MAAM,CAAC,CAAC;QAC/D,IAAI,CAAC,eAAe,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,yBAAyB,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QACzF,IAAI,CAAC,gBAAgB,GAAG,GAAG,CAAC,gBAAgB,CAAC;IAC/C,CAAC;IAED,KAAK,CAAC,aAAa,CAAC,MAAgB,EAAE,QAA6B;QACjE,MAAM,gBAAgB,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,aAAa,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;QAErF,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;YAC1B,OAAO,gBAAgB,CAAC;QAC1B,CAAC;QAED,MAAM,eAAe,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,aAAa,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;QAEnF,uBAAuB;QACvB,OAAO,gBAAgB,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE;YAC1C,MAAM,QAAQ,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;YACpC,OAAO,IAAI,CAAC,gBAAgB,GAAG,QAAQ,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,gBAAgB,CAAC,GAAG,QAAQ,CAAC;QACnF,CAAC,CAAC,CAAC;IACL,CAAC;C
ACF;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,CAAe,EAAE,CAAe;IACxD,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM,EAAE,CAAC;QAC1B,MAAM,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACnD,CAAC;IAED,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,UAAU,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAC1B,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACrB,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,CAAC;IAED,IAAI,KAAK,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,CAAC,CAAC;IACX,CAAC;IAED,OAAO,UAAU,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;AAC5D,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,4BAA4B,CAC1C,QAA6B,EAC7B,MAAgB,EAChB,SAAS,GAAG,GAAG;IAEf,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,EAAE,EAAE;QACjC,MAAM,UAAU,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QAE7B,4CAA4C;QAC5C,4CAA4C;QAC5C,IAAI,kBAA0B,CAAC;QAE/B,IAAI,UAAU,GAAG,SAAS,EAAE,CAAC;YAC3B,iDAAiD;YACjD,kBAAkB,GAAG,CAAC,EAAE,GAAG,UAAU,CAAC;QACxC,CAAC;aAAM,CAAC;YACN,iDAAiD;YACjD,kBAAkB,GAAG,EAAE,GAAG,CAAC,CAAC,GAAG,UAAU,CAAC,CAAC;QAC7C,CAAC;QAED,OAAO;YACL,GAAG,OAAO;YACV,sDAAsD;YACtD,gBAAgB,EAAE,UAAU;YAC5B,iBAAiB,EAAE,kBAAkB;SACyC,CAAC;IACnF,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,kBAAkB,CAChC,QAA6B,EAC7B,MAAgB,EAChB,SAAS,GAAG,GAAG;IAEf,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,SAAS,CAAC,CAAC;AAC1D,CAAC"}
@@ -0,0 +1,19 @@
1
+ /**
2
+ * Small LTSC ML - Machine Learning Features
3
+ *
4
+ * Optional ML features for enhanced compression quality:
5
+ * - Pattern importance scoring
6
+ * - Quality prediction
7
+ * - Adaptive region detection
8
+ *
9
+ * @packageDocumentation
10
+ */
11
+ // Importance scoring
12
+ export { PositionalImportanceScorer, EmbeddingImportanceScorer, CombinedImportanceScorer, adjustPrioritiesByImportance, filterByImportance, } from './importance.js';
13
+ // Quality prediction
14
+ export { HeuristicQualityPredictor, EmbeddingQualityPredictor, createQualityPredictor, } from './quality.js';
15
+ // Region detection
16
+ export { RegionType, detectRegions, detectRegionsHeuristic, filterPatternsByRegion, getRegionCompressionSettings, } from './regions.js';
/** Version string exported by this build of the package. */
export const VERSION = '0.1.0';
19
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,qBAAqB;AACrB,OAAO,EAIL,0BAA0B,EAC1B,yBAAyB,EACzB,wBAAwB,EACxB,4BAA4B,EAC5B,kBAAkB,GACnB,MAAM,iBAAiB,CAAC;AAEzB,qBAAqB;AACrB,OAAO,EAKL,yBAAyB,EACzB,yBAAyB,EACzB,sBAAsB,GACvB,MAAM,cAAc,CAAC;AAEtB,mBAAmB;AACnB,OAAO,EACL,UAAU,EAGV,aAAa,EACb,sBAAsB,EACtB,sBAAsB,EACtB,4BAA4B,GAC7B,MAAM,cAAc,CAAC;AAEtB,UAAU;AACV,MAAM,CAAC,MAAM,OAAO,GAAG,OAAO,CAAC"}
@@ -0,0 +1,235 @@
1
+ /**
2
+ * Quality prediction for compression validation.
3
+ *
4
+ * Predicts whether a compressed sequence will maintain
5
+ * sufficient quality for transformer models to learn from.
6
+ *
7
+ * Port of `small/quality_predictor.py` to TypeScript.
8
+ */
/**
 * Fallback thresholds for the quality predictors; caller-supplied config
 * objects are spread over these values.
 */
const DEFAULT_QUALITY_CONFIG = {
    // Compression ratios below this value are penalized in the quality score.
    maxCompressionRatio: 0.5,
    // Dictionary overhead above this value is penalized.
    maxDictionaryOverhead: 0.3,
    // Embedding similarity below this value is penalized.
    minEmbeddingSimilarity: 0.7,
    // Token-diversity reduction above this value is penalized.
    maxDiversityReduction: 0.4,
};
/**
 * Heuristic-based quality predictor.
 *
 * Derives a handful of features from a compression result and turns them
 * into a quality score, a degradation probability and a recommendation
 * using fixed rules and configurable thresholds.
 */
export class HeuristicQualityPredictor {
    config;
    /**
     * @param config - Optional threshold overrides, spread over the defaults
     */
    constructor(config) {
        this.config = { ...DEFAULT_QUALITY_CONFIG, ...config };
    }
    /**
     * Predict the quality of a compression result.
     *
     * @param result - Compression result exposing `compressionRatio`,
     *   `originalLength`, `originalTokens`, `serializedTokens`,
     *   `dictionaryTokens` and `dictionaryMap`
     * @returns `score` and `degradationProbability` (both clamped to [0, 1]),
     *   the extracted `features`, `acceptable` (score >= 0.6 and probability
     *   < 0.3) and a `recommendation`: 'accept' when acceptable, otherwise
     *   'retry_conservative' while the probability is below 0.5, else
     *   'skip_compression'
     */
    async predict(result) {
        const features = this.extractFeatures(result);
        const score = this.computeScore(features);
        const degradationProbability = this.computeDegradationProbability(features);
        const acceptable = score >= 0.6 && degradationProbability < 0.3;
        let recommendation;
        if (acceptable) {
            recommendation = 'accept';
        }
        else if (degradationProbability < 0.5) {
            recommendation = 'retry_conservative';
        }
        else {
            recommendation = 'skip_compression';
        }
        return {
            score,
            acceptable,
            degradationProbability,
            features,
            recommendation,
        };
    }
    /**
     * Extract quality features from a compression result.
     *
     * @param result - Compression result (see `predict`)
     * @returns Compression ratio, dictionary overhead (dictionary tokens per
     *   original token), diversity reduction (relative loss of distinct
     *   tokens), average pattern length and pattern count. Ratios fall back
     *   to 0 when their denominator is 0.
     */
    extractFeatures(result) {
        const compressionRatio = result.compressionRatio;
        const dictionaryOverhead = result.originalLength > 0
            ? result.dictionaryTokens.length / result.originalLength
            : 0;
        // Token diversity = number of distinct token ids before vs. after.
        const originalDiversity = new Set(result.originalTokens).size;
        const compressedDiversity = new Set(result.serializedTokens).size;
        const diversityReduction = originalDiversity > 0 ? 1 - compressedDiversity / originalDiversity : 0;
        // Pattern statistics over the dictionary entries.
        const patternCount = result.dictionaryMap.size;
        let totalPatternLength = 0;
        for (const [, pattern] of result.dictionaryMap) {
            totalPatternLength += pattern.length;
        }
        // 0 is a placeholder meaning "no patterns", not a measured length.
        const averagePatternLength = patternCount > 0 ? totalPatternLength / patternCount : 0;
        return {
            compressionRatio,
            dictionaryOverhead,
            diversityReduction,
            averagePatternLength,
            patternCount,
        };
    }
    /**
     * Compute the overall quality score from features.
     *
     * Starts at 1.0, subtracts a penalty proportional to how far each feature
     * is past its configured threshold, adds a small bonus for average pattern
     * lengths between 3 and 6, and clamps the result to [0, 1].
     *
     * @param features - Features from `extractFeatures`, optionally extended
     *   with `embeddingSimilarity`
     * @returns Quality score in [0, 1]
     */
    computeScore(features) {
        let score = 1.0;
        // Penalize extreme compression (ratio below the configured bound).
        if (features.compressionRatio < this.config.maxCompressionRatio) {
            const penalty = (this.config.maxCompressionRatio - features.compressionRatio) * 0.5;
            score -= penalty;
        }
        // Penalize high dictionary overhead.
        if (features.dictionaryOverhead > this.config.maxDictionaryOverhead) {
            const penalty = (features.dictionaryOverhead - this.config.maxDictionaryOverhead) * 0.3;
            score -= penalty;
        }
        // Penalize diversity loss.
        if (features.diversityReduction > this.config.maxDiversityReduction) {
            const penalty = (features.diversityReduction - this.config.maxDiversityReduction) * 0.4;
            score -= penalty;
        }
        // Bonus for reasonable pattern lengths.
        if (features.averagePatternLength >= 3 && features.averagePatternLength <= 6) {
            score += 0.1;
        }
        // Use embedding similarity if the caller supplied it.
        if (features.embeddingSimilarity !== undefined) {
            if (features.embeddingSimilarity < this.config.minEmbeddingSimilarity) {
                score -= (this.config.minEmbeddingSimilarity - features.embeddingSimilarity) * 0.5;
            }
        }
        return Math.max(0, Math.min(1, score));
    }
    /**
     * Compute the probability of quality degradation.
     *
     * Sums fixed risk increments for each warning sign and clamps the total
     * to [0, 1].
     *
     * @param features - Features from `extractFeatures`, optionally extended
     *   with `embeddingSimilarity`
     * @returns Degradation probability in [0, 1]
     */
    computeDegradationProbability(features) {
        let prob = 0;
        // Very aggressive compression increases risk.
        if (features.compressionRatio < 0.4) {
            prob += 0.3;
        }
        else if (features.compressionRatio < 0.5) {
            prob += 0.15;
        }
        // High dictionary overhead increases risk.
        if (features.dictionaryOverhead > 0.3) {
            prob += 0.2;
        }
        // Large diversity reduction increases risk.
        if (features.diversityReduction > 0.3) {
            prob += 0.25;
        }
        else if (features.diversityReduction > 0.2) {
            prob += 0.1;
        }
        // Very short patterns are risky. Only applies when patterns exist:
        // with an empty dictionary averagePatternLength is the placeholder 0,
        // which must not be read as "very short patterns".
        if (features.patternCount > 0 && features.averagePatternLength < 2.5) {
            prob += 0.15;
        }
        // Low embedding similarity is a strong signal.
        if (features.embeddingSimilarity !== undefined && features.embeddingSimilarity < 0.7) {
            prob += 0.3 * (0.7 - features.embeddingSimilarity);
        }
        return Math.max(0, Math.min(1, prob));
    }
}
144
+ /**
145
+ * Embedding-enhanced quality predictor.
146
+ *
147
+ * Adds embedding similarity comparison to the heuristic predictor
148
+ * for more accurate quality assessment.
149
+ */
150
+ export class EmbeddingQualityPredictor {
151
+ provider;
152
+ heuristicPredictor;
153
+ config;
154
+ constructor(provider, config) {
155
+ this.provider = provider;
156
+ this.heuristicPredictor = new HeuristicQualityPredictor(config);
157
+ this.config = { ...DEFAULT_QUALITY_CONFIG, ...config };
158
+ }
159
+ async predict(result) {
160
+ // Get base prediction
161
+ const basePrediction = await this.heuristicPredictor.predict(result);
162
+ // Compute embedding similarity
163
+ const originalEmbedding = await this.provider.embed(result.originalTokens);
164
+ const compressedEmbedding = await this.provider.embed(result.serializedTokens);
165
+ const similarity = this.cosineSimilarity(originalEmbedding, compressedEmbedding);
166
+ // Update features with embedding similarity
167
+ const features = {
168
+ ...basePrediction.features,
169
+ embeddingSimilarity: similarity,
170
+ };
171
+ // Recompute scores with embedding
172
+ let score = basePrediction.score;
173
+ let degradationProbability = basePrediction.degradationProbability;
174
+ if (similarity < this.config.minEmbeddingSimilarity) {
175
+ const penalty = (this.config.minEmbeddingSimilarity - similarity) * 0.3;
176
+ score -= penalty;
177
+ degradationProbability += 0.2;
178
+ }
179
+ else {
180
+ // High similarity is a good signal
181
+ score += (similarity - this.config.minEmbeddingSimilarity) * 0.1;
182
+ }
183
+ score = Math.max(0, Math.min(1, score));
184
+ degradationProbability = Math.max(0, Math.min(1, degradationProbability));
185
+ const acceptable = score >= 0.6 && degradationProbability < 0.3;
186
+ let recommendation;
187
+ if (acceptable) {
188
+ recommendation = 'accept';
189
+ }
190
+ else if (degradationProbability < 0.5) {
191
+ recommendation = 'retry_conservative';
192
+ }
193
+ else {
194
+ recommendation = 'skip_compression';
195
+ }
196
+ return {
197
+ score,
198
+ acceptable,
199
+ degradationProbability,
200
+ features,
201
+ recommendation,
202
+ };
203
+ }
204
+ cosineSimilarity(a, b) {
205
+ if (a.length !== b.length) {
206
+ throw new Error('Vectors must have same length');
207
+ }
208
+ let dotProduct = 0;
209
+ let normA = 0;
210
+ let normB = 0;
211
+ for (let i = 0; i < a.length; i++) {
212
+ dotProduct += a[i] * b[i];
213
+ normA += a[i] * a[i];
214
+ normB += b[i] * b[i];
215
+ }
216
+ if (normA === 0 || normB === 0) {
217
+ return 0;
218
+ }
219
+ return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
220
+ }
221
+ }
222
+ /**
223
+ * Create a quality predictor.
224
+ *
225
+ * @param provider - Optional embedding provider for enhanced prediction
226
+ * @param config - Quality configuration
227
+ * @returns Quality predictor instance
228
+ */
229
+ export function createQualityPredictor(provider, config) {
230
+ if (provider) {
231
+ return new EmbeddingQualityPredictor(provider, config);
232
+ }
233
+ return new HeuristicQualityPredictor(config);
234
+ }
235
+ //# sourceMappingURL=quality.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"quality.js","sourceRoot":"","sources":["../../src/quality.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAkCH,MAAM,sBAAsB,GAA4B;IACtD,mBAAmB,EAAE,GAAG;IACxB,qBAAqB,EAAE,GAAG;IAC1B,sBAAsB,EAAE,GAAG;IAC3B,qBAAqB,EAAE,GAAG;CAC3B,CAAC;AAgFF;;;;;GAKG;AACH,MAAM,OAAO,yBAAyB;IAC5B,MAAM,CAA0B;IAExC,YAAY,MAAsB;QAChC,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,sBAAsB,EAAE,GAAG,MAAM,EAAE,CAAC;IACzD,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,MAAyB;QACrC,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC;QAC9C,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;QAC1C,MAAM,sBAAsB,GAAG,IAAI,CAAC,6BAA6B,CAAC,QAAQ,CAAC,CAAC;QAC5E,MAAM,UAAU,GAAG,KAAK,IAAI,GAAG,IAAI,sBAAsB,GAAG,GAAG,CAAC;QAEhE,IAAI,cAAmD,CAAC;QACxD,IAAI,UAAU,EAAE,CAAC;YACf,cAAc,GAAG,QAAQ,CAAC;QAC5B,CAAC;aAAM,IAAI,sBAAsB,GAAG,GAAG,EAAE,CAAC;YACxC,cAAc,GAAG,oBAAoB,CAAC;QACxC,CAAC;aAAM,CAAC;YACN,cAAc,GAAG,kBAAkB,CAAC;QACtC,CAAC;QAED,OAAO;YACL,KAAK;YACL,UAAU;YACV,sBAAsB;YACtB,QAAQ;YACR,cAAc;SACf,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,eAAe,CAAC,MAAyB;QAC/C,oBAAoB;QACpB,MAAM,gBAAgB,GAAG,MAAM,CAAC,gBAAgB,CAAC;QAEjD,sBAAsB;QACtB,MAAM,kBAAkB,GACtB,MAAM,CAAC,cAAc,GAAG,CAAC;YACvB,CAAC,CAAC,MAAM,CAAC,gBAAgB,CAAC,MAAM,GAAG,MAAM,CAAC,cAAc;YACxD,CAAC,CAAC,CAAC,CAAC;QAER,kBAAkB;QAClB,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC;QAC9D,MAAM,mBAAmB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,gBAAgB,CAAC,CAAC,IAAI,CAAC;QAClE,MAAM,kBAAkB,GACtB,iBAAiB,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,mBAAmB,GAAG,iBAAiB,CAAC,CAAC,CAAC,CAAC,CAAC;QAE1E,qBAAqB;QACrB,MAAM,YAAY,GAAG,MAAM,CAAC,aAAa,CAAC,IAAI,CAAC;QAC/C,IAAI,kBAAkB,GAAG,CAAC,CAAC;QAC3B,KAAK,MAAM,CAAC,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;YAC/C,kBAAkB,IAAI,OAAO,CAAC,MAAM,CAAC;QACvC,CAAC;QACD,MAAM,oBAAoB,GAAG,YAAY,GAAG,CAAC,CAAC,CAAC,CAAC,kBAAkB,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;QAEtF,OAAO;YACL,gBAAgB;YAChB,kBAAkB;YAClB,kBAAkB;YAClB,oBAAoB;YACpB,YAAY;SACb,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,QAAyB;QAC5C,IAAI,KAAK,GAAG,GAAG,CAAC;QAEhB,+BAA+B;QAC/B,IAAI,QAAQ,CAAC,gBAAgB,GAAG,IAAI,CAAC,MAAM,CAAC,mB
AAmB,EAAE,CAAC;YAChE,MAAM,OAAO,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,mBAAmB,GAAG,QAAQ,CAAC,gBAAgB,CAAC,GAAG,GAAG,CAAC;YACpF,KAAK,IAAI,OAAO,CAAC;QACnB,CAAC;QAED,oCAAoC;QACpC,IAAI,QAAQ,CAAC,kBAAkB,GAAG,IAAI,CAAC,MAAM,CAAC,qBAAqB,EAAE,CAAC;YACpE,MAAM,OAAO,GAAG,CAAC,QAAQ,CAAC,kBAAkB,GAAG,IAAI,CAAC,MAAM,CAAC,qBAAqB,CAAC,GAAG,GAAG,CAAC;YACxF,KAAK,IAAI,OAAO,CAAC;QACnB,CAAC;QAED,0BAA0B;QAC1B,IAAI,QAAQ,CAAC,kBAAkB,GAAG,IAAI,CAAC,MAAM,CAAC,qBAAqB,EAAE,CAAC;YACpE,MAAM,OAAO,GAAG,CAAC,QAAQ,CAAC,kBAAkB,GAAG,IAAI,CAAC,MAAM,CAAC,qBAAqB,CAAC,GAAG,GAAG,CAAC;YACxF,KAAK,IAAI,OAAO,CAAC;QACnB,CAAC;QAED,uCAAuC;QACvC,IAAI,QAAQ,CAAC,oBAAoB,IAAI,CAAC,IAAI,QAAQ,CAAC,oBAAoB,IAAI,CAAC,EAAE,CAAC;YAC7E,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;QAED,wCAAwC;QACxC,IAAI,QAAQ,CAAC,mBAAmB,KAAK,SAAS,EAAE,CAAC;YAC/C,IAAI,QAAQ,CAAC,mBAAmB,GAAG,IAAI,CAAC,MAAM,CAAC,sBAAsB,EAAE,CAAC;gBACtE,KAAK,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,sBAAsB,GAAG,QAAQ,CAAC,mBAAmB,CAAC,GAAG,GAAG,CAAC;YACrF,CAAC;QACH,CAAC;QAED,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC;IACzC,CAAC;IAED;;OAEG;IACK,6BAA6B,CAAC,QAAyB;QAC7D,IAAI,IAAI,GAAG,CAAC,CAAC;QAEb,6CAA6C;QAC7C,IAAI,QAAQ,CAAC,gBAAgB,GAAG,GAAG,EAAE,CAAC;YACpC,IAAI,IAAI,GAAG,CAAC;QACd,CAAC;aAAM,IAAI,QAAQ,CAAC,gBAAgB,GAAG,GAAG,EAAE,CAAC;YAC3C,IAAI,IAAI,IAAI,CAAC;QACf,CAAC;QAED,0CAA0C;QAC1C,IAAI,QAAQ,CAAC,kBAAkB,GAAG,GAAG,EAAE,CAAC;YACtC,IAAI,IAAI,GAAG,CAAC;QACd,CAAC;QAED,2CAA2C;QAC3C,IAAI,QAAQ,CAAC,kBAAkB,GAAG,GAAG,EAAE,CAAC;YACtC,IAAI,IAAI,IAAI,CAAC;QACf,CAAC;aAAM,IAAI,QAAQ,CAAC,kBAAkB,GAAG,GAAG,EAAE,CAAC;YAC7C,IAAI,IAAI,GAAG,CAAC;QACd,CAAC;QAED,gCAAgC;QAChC,IAAI,QAAQ,CAAC,oBAAoB,GAAG,GAAG,EAAE,CAAC;YACxC,IAAI,IAAI,IAAI,CAAC;QACf,CAAC;QAED,8CAA8C;QAC9C,IAAI,QAAQ,CAAC,mBAAmB,KAAK,SAAS,IAAI,QAAQ,CAAC,mBAAmB,GAAG,GAAG,EAAE,CAAC;YACrF,IAAI,IAAI,GAAG,GAAG,CAAC,GAAG,GAAG,QAAQ,CAAC,mBAAmB,CAAC,CAAC;QACrD,CAAC;QAED,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;IACxC,CAAC;CACF;AAED;;;;;GAKG;AACH,MAAM,OAAO,yBAAyB;IAC5B,QAAQ,CAAoB;IAC5B,kBA
AkB,CAA4B;IAC9C,MAAM,CAA0B;IAExC,YAAY,QAA2B,EAAE,MAAsB;QAC7D,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,kBAAkB,GAAG,IAAI,yBAAyB,CAAC,MAAM,CAAC,CAAC;QAChE,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,sBAAsB,EAAE,GAAG,MAAM,EAAE,CAAC;IACzD,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,MAAyB;QACrC,sBAAsB;QACtB,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,kBAAkB,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;QAErE,+BAA+B;QAC/B,MAAM,iBAAiB,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC;QAC3E,MAAM,mBAAmB,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,gBAAgB,CAAC,CAAC;QAC/E,MAAM,UAAU,GAAG,IAAI,CAAC,gBAAgB,CAAC,iBAAiB,EAAE,mBAAmB,CAAC,CAAC;QAEjF,4CAA4C;QAC5C,MAAM,QAAQ,GAAoB;YAChC,GAAG,cAAc,CAAC,QAAQ;YAC1B,mBAAmB,EAAE,UAAU;SAChC,CAAC;QAEF,kCAAkC;QAClC,IAAI,KAAK,GAAG,cAAc,CAAC,KAAK,CAAC;QACjC,IAAI,sBAAsB,GAAG,cAAc,CAAC,sBAAsB,CAAC;QAEnE,IAAI,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,sBAAsB,EAAE,CAAC;YACpD,MAAM,OAAO,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,sBAAsB,GAAG,UAAU,CAAC,GAAG,GAAG,CAAC;YACxE,KAAK,IAAI,OAAO,CAAC;YACjB,sBAAsB,IAAI,GAAG,CAAC;QAChC,CAAC;aAAM,CAAC;YACN,mCAAmC;YACnC,KAAK,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,sBAAsB,CAAC,GAAG,GAAG,CAAC;QACnE,CAAC;QAED,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC;QACxC,sBAAsB,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,sBAAsB,CAAC,CAAC,CAAC;QAE1E,MAAM,UAAU,GAAG,KAAK,IAAI,GAAG,IAAI,sBAAsB,GAAG,GAAG,CAAC;QAEhE,IAAI,cAAmD,CAAC;QACxD,IAAI,UAAU,EAAE,CAAC;YACf,cAAc,GAAG,QAAQ,CAAC;QAC5B,CAAC;aAAM,IAAI,sBAAsB,GAAG,GAAG,EAAE,CAAC;YACxC,cAAc,GAAG,oBAAoB,CAAC;QACxC,CAAC;aAAM,CAAC;YACN,cAAc,GAAG,kBAAkB,CAAC;QACtC,CAAC;QAED,OAAO;YACL,KAAK;YACL,UAAU;YACV,sBAAsB;YACtB,QAAQ;YACR,cAAc;SACf,CAAC;IACJ,CAAC;IAEO,gBAAgB,CAAC,CAAe,EAAE,CAAe;QACvD,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM,EAAE,CAAC;YAC1B,MAAM,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC;QACnD,CAAC;QAED,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAClC,UAAU,IAAI,CAAC,CA
AC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAC1B,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YACrB,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACvB,CAAC;QAED,IAAI,KAAK,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,EAAE,CAAC;YAC/B,OAAO,CAAC,CAAC;QACX,CAAC;QAED,OAAO,UAAU,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;IAC5D,CAAC;CACF;AAED;;;;;;GAMG;AACH,MAAM,UAAU,sBAAsB,CACpC,QAA4B,EAC5B,MAAsB;IAEtB,IAAI,QAAQ,EAAE,CAAC;QACb,OAAO,IAAI,yBAAyB,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IACzD,CAAC;IACD,OAAO,IAAI,yBAAyB,CAAC,MAAM,CAAC,CAAC;AAC/C,CAAC"}
@@ -0,0 +1,251 @@
1
+ /**
2
+ * Adaptive region detection for context-aware compression.
3
+ *
4
+ * Detects semantic regions in token sequences (system prompts, user input,
5
+ * injected context) and applies different compression strategies.
6
+ *
7
+ * Port of `small/adaptive.py` region detection to TypeScript.
8
+ */
9
+ /**
10
+ * Region types with different compression strategies (string-valued enum: each member maps to its lowercase name).
11
+ */
12
+ export var RegionType;
13
+ (function (RegionType) {
14
+ /** System instructions - minimal compression (default retention target 0.98) */
15
+ RegionType["SYSTEM"] = "system";
16
+ /** User input - moderate compression (default retention target 0.85) */
17
+ RegionType["USER"] = "user";
18
+ /** Injected context - aggressive compression (default retention target 0.6) */
19
+ RegionType["CONTEXT"] = "context";
20
+ /** Code blocks - moderate compression (default retention target 0.8) */
21
+ RegionType["CODE"] = "code";
22
+ /** Unknown region - default compression (default retention target 0.75) */
23
+ RegionType["UNKNOWN"] = "unknown";
24
+ })(RegionType || (RegionType = {}));
25
+ const DEFAULT_RETENTION_TARGETS = { // default retention target per region type; detectRegions merges config.retentionTargets over these
26
+ [RegionType.SYSTEM]: 0.98, // the only default at or above the 0.9 cutoff used by filterPatternsByRegion
27
+ [RegionType.USER]: 0.85,
28
+ [RegionType.CONTEXT]: 0.6,
29
+ [RegionType.CODE]: 0.8,
30
+ [RegionType.UNKNOWN]: 0.75, // also used when no marker is found and for tokens before the first marker
31
+ };
32
/**
 * Detect regions in a token sequence.
 *
 * Scans the tokens for marker patterns (system, user, context, code). Each
 * marker opens a region that runs until the next marker, or to the end of
 * the sequence. Tokens before the first marker form an UNKNOWN region; when
 * no marker is found at all, the whole sequence is a single UNKNOWN region.
 *
 * When several markers match at the same position, only the last of them
 * (in system, user, context, code order) yields a region; the others would
 * be empty and are skipped.
 *
 * @param tokens - Token sequence to analyze (array or other iterable of token IDs)
 * @param config - Optional marker patterns and retention targets overriding the defaults
 * @returns Array of non-empty regions ordered by start position; empty for empty input
 */
export function detectRegions(tokens, config) {
    const tokenArray = Array.isArray(tokens) ? tokens : Array.from(tokens);
    const n = tokenArray.length;
    if (n === 0) {
        return [];
    }
    const retentionTargets = {
        ...DEFAULT_RETENTION_TARGETS,
        ...config?.retentionTargets,
    };
    // Marker pattern sets, searched in this fixed order. The order matters
    // because the sort below is stable: ties on position keep this order.
    const markerSets = [
        [RegionType.SYSTEM, config?.systemMarkers ?? DEFAULT_SYSTEM_MARKERS],
        [RegionType.USER, config?.userMarkers ?? DEFAULT_USER_MARKERS],
        [RegionType.CONTEXT, config?.contextMarkers ?? DEFAULT_CONTEXT_MARKERS],
        [RegionType.CODE, config?.codeMarkers ?? DEFAULT_CODE_MARKERS],
    ];
    // Find all marker positions
    const markers = [];
    for (const [type, patterns] of markerSets) {
        for (const pattern of patterns) {
            for (const pos of findPattern(tokenArray, pattern)) {
                markers.push({ pos, type });
            }
        }
    }
    // Sort markers by position
    markers.sort((a, b) => a.pos - b.pos);
    if (markers.length === 0) {
        // No markers found - treat entire sequence as unknown
        return [
            {
                type: RegionType.UNKNOWN,
                start: 0,
                end: n,
                retention: retentionTargets[RegionType.UNKNOWN],
            },
        ];
    }
    const regions = [];
    // Add initial region if first marker is not at start
    if (markers[0].pos > 0) {
        regions.push({
            type: RegionType.UNKNOWN,
            start: 0,
            end: markers[0].pos,
            retention: retentionTargets[RegionType.UNKNOWN],
        });
    }
    // Each marker's region extends to the next marker (or the end of input)
    markers.forEach((marker, i) => {
        const nextPos = i < markers.length - 1 ? markers[i + 1].pos : n;
        // Markers sharing a position (or sitting at the very end) would give
        // a zero-length region; skip those so every region covers >= 1 token.
        if (nextPos <= marker.pos) {
            return;
        }
        regions.push({
            type: marker.type,
            start: marker.pos,
            end: nextPos,
            retention: retentionTargets[marker.type],
        });
    });
    return regions;
}
116
/**
 * Heuristic-based region detection without explicit markers.
 *
 * Simple positional guess: the first 10% of the tokens (at least one token)
 * is treated as SYSTEM and the remainder as CONTEXT, each with its default
 * retention target.
 *
 * @param tokens - Token sequence to analyze (array or other iterable of token IDs)
 * @returns Empty array for empty input; otherwise a SYSTEM region followed by
 *          a CONTEXT region, the latter omitted when it would be empty
 *          (single-token input)
 */
export function detectRegionsHeuristic(tokens) {
    const tokenArray = Array.isArray(tokens) ? tokens : Array.from(tokens);
    const n = tokenArray.length;
    if (n === 0) {
        return [];
    }
    // First 10% is likely system, but never fewer than one token
    const boundary = Math.max(Math.floor(n * 0.1), 1);
    const regions = [
        {
            type: RegionType.SYSTEM,
            start: 0,
            end: boundary,
            retention: DEFAULT_RETENTION_TARGETS[RegionType.SYSTEM],
        },
    ];
    // With a single token the system region already covers everything;
    // do not emit an empty [1, 1) context region.
    if (boundary < n) {
        regions.push({
            type: RegionType.CONTEXT,
            start: boundary,
            end: n,
            retention: DEFAULT_RETENTION_TARGETS[RegionType.CONTEXT],
        });
    }
    return regions;
}
144
/**
 * Filter patterns based on region retention targets.
 *
 * An occurrence is dropped when its start position lies in a region whose
 * retention target is 0.9 or higher; occurrences outside every region are
 * kept. Patterns left with fewer than two occurrences are removed, and the
 * `count` of each surviving pattern is set to its remaining occurrences.
 *
 * @param patterns - Discovered patterns
 * @param regions - Detected regions
 * @param _tokens - Original token sequence (not used)
 * @returns Filtered patterns respecting region constraints; the input array
 *          itself when `regions` is empty
 */
export function filterPatternsByRegion(patterns, regions, _tokens) {
    if (regions.length === 0) {
        return patterns;
    }
    // True when the position may be compressed: either no region covers it,
    // or the first covering region has a low enough retention target.
    const mayCompress = (pos) => {
        const owner = regions.find((candidate) => pos >= candidate.start && pos < candidate.end);
        return !owner || owner.retention < 0.9;
    };
    const survivors = [];
    for (const entry of patterns) {
        const remaining = entry.positions.filter((pos) => mayCompress(pos));
        if (remaining.length >= 2) {
            survivors.push({
                ...entry,
                positions: remaining,
                count: remaining.length,
            });
        }
    }
    return survivors;
}
175
/**
 * Find the region containing a position.
 *
 * A region contains `pos` when `start <= pos < end`. The first such region
 * in array order is returned.
 *
 * @param regions - Regions to search
 * @param pos - Token position to look up
 * @returns The containing region, or `null` when none contains `pos`
 */
function findRegionAtPosition(regions, pos) {
    const hit = regions.find((candidate) => pos >= candidate.start && pos < candidate.end);
    return hit ?? null;
}
186
/**
 * Find all occurrences of a pattern in tokens.
 *
 * Elements are compared with strict equality, and overlapping occurrences
 * are all reported.
 *
 * @param tokens - Token array to search
 * @param pattern - Token subsequence to look for
 * @returns Start indices of every occurrence, in ascending order; empty when
 *          the pattern is empty or longer than `tokens`
 */
function findPattern(tokens, pattern) {
    const positions = [];
    const n = tokens.length;
    const m = pattern.length;
    // An empty pattern would vacuously match at every index 0..n, including
    // the out-of-range index n, so it is treated as matching nothing.
    if (m === 0 || m > n) {
        return positions;
    }
    for (let i = 0; i <= n - m; i++) {
        let match = true;
        for (let j = 0; j < m; j++) {
            if (tokens[i + j] !== pattern[j]) {
                match = false;
                break;
            }
        }
        if (match) {
            positions.push(i);
        }
    }
    return positions;
}
207
+ // Default marker patterns (tiktoken cl100k_base token IDs for common markers)
208
+ // NOTE(review): these IDs are approximate and tokenizer-specific - verify them against the tokenizer in use, or supply explicit markers through the detectRegions config
209
+ /** Default system region markers; used by detectRegions when config.systemMarkers is not set */
210
+ const DEFAULT_SYSTEM_MARKERS = [
211
+ // [SYSTEM], <<SYS>>, etc.
212
+ [58, 71905, 60], // [SYSTEM]
213
+ [27, 27, 71905, 2083, 2083], // <<SYS>> - NOTE(review): unverified encoding, confirm before relying on it
214
+ ];
215
+ /** Default user region markers; used by detectRegions when config.userMarkers is not set */
216
+ const DEFAULT_USER_MARKERS = [
217
+ // [USER], [INST], etc.
218
+ [58, 35295, 60], // [USER]
219
+ [58, 96746, 60], // [INST]
220
+ ];
221
+ /** Default context region markers; used by detectRegions when config.contextMarkers is not set */
222
+ const DEFAULT_CONTEXT_MARKERS = [
223
+ // [CONTEXT], [DOC], etc.
224
+ [58, 94034, 60], // [CONTEXT]
225
+ [58, 44184, 60], // [DOC]
226
+ ];
227
+ /** Default code region markers; used by detectRegions when config.codeMarkers is not set */
228
+ const DEFAULT_CODE_MARKERS = [
229
+ // ```python, ```typescript, etc. (only opening fences are listed; nothing here marks the end of a code block)
230
+ [74694, 12958], // ```python
231
+ [74694, 92459], // ```typescript
232
+ [74694, 13210], // ```javascript
233
+ ];
234
/**
 * Get compression settings for a region type.
 *
 * @param regionType - Region type to look up
 * @returns A fresh `{ maxSubsequenceLength, minOccurrences }` object:
 *          SYSTEM 4/5, USER and CODE 6/3, CONTEXT 10/2, anything else 8/3
 */
export function getRegionCompressionSettings(regionType) {
    if (regionType === RegionType.SYSTEM) {
        return { maxSubsequenceLength: 4, minOccurrences: 5 };
    }
    if (regionType === RegionType.CONTEXT) {
        return { maxSubsequenceLength: 10, minOccurrences: 2 };
    }
    // User input and code blocks share the same moderate settings
    if (regionType === RegionType.USER || regionType === RegionType.CODE) {
        return { maxSubsequenceLength: 6, minOccurrences: 3 };
    }
    // UNKNOWN and any unrecognized value fall back to the default settings
    return { maxSubsequenceLength: 8, minOccurrences: 3 };
}
251
+ //# sourceMappingURL=regions.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"regions.js","sourceRoot":"","sources":["../../src/regions.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH;;GAEG;AACH,MAAM,CAAN,IAAY,UAWX;AAXD,WAAY,UAAU;IACpB,gDAAgD;IAChD,+BAAiB,CAAA;IACjB,wCAAwC;IACxC,2BAAa,CAAA;IACb,gDAAgD;IAChD,iCAAmB,CAAA;IACnB,yCAAyC;IACzC,2BAAa,CAAA;IACb,2CAA2C;IAC3C,iCAAmB,CAAA;AACrB,CAAC,EAXW,UAAU,KAAV,UAAU,QAWrB;AAgCD,MAAM,yBAAyB,GAA+B;IAC5D,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,IAAI;IACzB,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI;IACvB,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,GAAG;IACzB,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,GAAG;IACtB,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,IAAI;CAC3B,CAAC;AAEF;;;;;;;;;GASG;AACH,MAAM,UAAU,aAAa,CAAC,MAAgB,EAAE,MAAqB;IACnE,MAAM,UAAU,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACvE,MAAM,CAAC,GAAG,UAAU,CAAC,MAAM,CAAC;IAE5B,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACZ,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,gBAAgB,GAAG;QACvB,GAAG,yBAAyB;QAC5B,GAAG,MAAM,EAAE,gBAAgB;KAC5B,CAAC;IAEF,4BAA4B;IAC5B,MAAM,OAAO,GAAwC,EAAE,CAAC;IAExD,MAAM,aAAa,GAAG,MAAM,EAAE,aAAa,IAAI,sBAAsB,CAAC;IACtE,MAAM,WAAW,GAAG,MAAM,EAAE,WAAW,IAAI,oBAAoB,CAAC;IAChE,MAAM,cAAc,GAAG,MAAM,EAAE,cAAc,IAAI,uBAAuB,CAAC;IACzE,MAAM,WAAW,GAAG,MAAM,EAAE,WAAW,IAAI,oBAAoB,CAAC;IAEhE,qBAAqB;IACrB,KAAK,MAAM,OAAO,IAAI,aAAa,EAAE,CAAC;QACpC,KAAK,MAAM,GAAG,IAAI,WAAW,CAAC,UAAU,EAAE,OAAO,CAAC,EAAE,CAAC;YACnD,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC;QACjD,CAAC;IACH,CAAC;IAED,KAAK,MAAM,OAAO,IAAI,WAAW,EAAE,CAAC;QAClC,KAAK,MAAM,GAAG,IAAI,WAAW,CAAC,UAAU,EAAE,OAAO,CAAC,EAAE,CAAC;YACnD,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU,CAAC,IAAI,EAAE,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;IAED,KAAK,MAAM,OAAO,IAAI,cAAc,EAAE,CAAC;QACrC,KAAK,MAAM,GAAG,IAAI,WAAW,CAAC,UAAU,EAAE,OAAO,CAAC,EAAE,CAAC;YACnD,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU,CAAC,OAAO,EAAE,CAAC,CAAC;QAClD,CAAC;IACH,CAAC;IAED,KAAK,MAAM,OAAO,IAAI,WAAW,EAAE,CAAC;QAClC,KAAK,MAAM,GAAG,IAAI,WAAW,CAAC,UAAU,EAAE,OAAO,CAAC,EAAE,CAAC;YACnD,OAAO,CAAC,IAAI,CAAC,EAA
E,GAAG,EAAE,IAAI,EAAE,UAAU,CAAC,IAAI,EAAE,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;IAED,2BAA2B;IAC3B,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;IAEtC,6BAA6B;IAC7B,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,sDAAsD;QACtD,OAAO,CAAC,IAAI,CAAC;YACX,IAAI,EAAE,UAAU,CAAC,OAAO;YACxB,KAAK,EAAE,CAAC;YACR,GAAG,EAAE,CAAC;YACN,SAAS,EAAE,gBAAgB,CAAC,UAAU,CAAC,OAAO,CAAC;SAChD,CAAC,CAAC;IACL,CAAC;SAAM,CAAC;QACN,qDAAqD;QACrD,IAAI,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC;YACvB,OAAO,CAAC,IAAI,CAAC;gBACX,IAAI,EAAE,UAAU,CAAC,OAAO;gBACxB,KAAK,EAAE,CAAC;gBACR,GAAG,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG;gBACnB,SAAS,EAAE,gBAAgB,CAAC,UAAU,CAAC,OAAO,CAAC;aAChD,CAAC,CAAC;QACL,CAAC;QAED,2BAA2B;QAC3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;YAC1B,MAAM,OAAO,GAAG,CAAC,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAEhE,OAAO,CAAC,IAAI,CAAC;gBACX,IAAI,EAAE,MAAM,CAAC,IAAI;gBACjB,KAAK,EAAE,MAAM,CAAC,GAAG;gBACjB,GAAG,EAAE,OAAO;gBACZ,SAAS,EAAE,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC;aACzC,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,sBAAsB,CAAC,MAAgB;IACrD,MAAM,UAAU,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACvE,MAAM,CAAC,GAAG,UAAU,CAAC,MAAM,CAAC;IAE5B,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACZ,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,gEAAgE;IAChE,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;IAEtC,OAAO;QACL;YACE,IAAI,EAAE,UAAU,CAAC,MAAM;YACvB,KAAK,EAAE,CAAC;YACR,GAAG,EAAE,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAC;YAC3B,SAAS,EAAE,yBAAyB,CAAC,UAAU,CAAC,MAAM,CAAC;SACxD;QACD;YACE,IAAI,EAAE,UAAU,CAAC,OAAO;YACxB,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAC;YAC7B,GAAG,EAAE,CAAC;YACN,SAAS,EAAE,yBAAyB,CAAC,UAAU,CAAC,OAAO,CAAC;SACzD;KACF,CAAC;AACJ,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,UAAU,s
BAAsB,CACpC,QAA6B,EAC7B,OAAiB,EACjB,OAAiB;IAEjB,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE;QAC9B,6CAA6C;QAC7C,MAAM,iBAAiB,GAAG,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE;YACzD,MAAM,MAAM,GAAG,oBAAoB,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAClD,IAAI,CAAC,MAAM;gBAAE,OAAO,IAAI,CAAC;YAEzB,6DAA6D;YAC7D,kDAAkD;YAClD,OAAO,MAAM,CAAC,SAAS,GAAG,GAAG,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,OAAO;YACL,GAAG,OAAO;YACV,SAAS,EAAE,iBAAiB;YAC5B,KAAK,EAAE,iBAAiB,CAAC,MAAM;SAChC,CAAC;IACJ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,SAAS,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC;AACxD,CAAC;AAED;;GAEG;AACH,SAAS,oBAAoB,CAAC,OAAiB,EAAE,GAAW;IAC1D,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,IAAI,GAAG,IAAI,MAAM,CAAC,KAAK,IAAI,GAAG,GAAG,MAAM,CAAC,GAAG,EAAE,CAAC;YAC5C,OAAO,MAAM,CAAC;QAChB,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,MAAgB,EAAE,OAAiB;IACtD,MAAM,SAAS,GAAa,EAAE,CAAC;IAC/B,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;IACxB,MAAM,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;IAEzB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAChC,IAAI,KAAK,GAAG,IAAI,CAAC;QACjB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3B,IAAI,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC;gBACjC,KAAK,GAAG,KAAK,CAAC;gBACd,MAAM;YACR,CAAC;QACH,CAAC;QACD,IAAI,KAAK,EAAE,CAAC;YACV,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,8EAA8E;AAC9E,mEAAmE;AAEnE,oCAAoC;AACpC,MAAM,sBAAsB,GAAe;IACzC,0BAA0B;IAC1B,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE,CAAC,EAAM,WAAW;IAChC,CAAC,EAAE,EAAE,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,CAAC,EAAE,UAAU;CACxC,CAAC;AAEF,kCAAkC;AAClC,MAAM,oBAAoB,GAAe;IACvC,uBAAuB;IACvB,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE,CAAC,EAAM,SAAS;IAC9B,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE,CAAC,EAAM,SAAS;CAC/B,CAAC;AAEF,qCAAqC;AACrC,MAAM,uBAAuB,GAAe;IAC1C,yBAAyB;IACzB,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE,CAAC,EAAM,YAAY;IACjC,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE,CAAC,EAA
M,QAAQ;CAC9B,CAAC;AAEF,kCAAkC;AAClC,MAAM,oBAAoB,GAAe;IACvC,iCAAiC;IACjC,CAAC,KAAK,EAAE,KAAK,CAAC,EAAO,YAAY;IACjC,CAAC,KAAK,EAAE,KAAK,CAAC,EAAO,gBAAgB;IACrC,CAAC,KAAK,EAAE,KAAK,CAAC,EAAO,gBAAgB;CACtC,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,4BAA4B,CAAC,UAAsB;IAIjE,QAAQ,UAAU,EAAE,CAAC;QACnB,KAAK,UAAU,CAAC,MAAM;YACpB,OAAO,EAAE,oBAAoB,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,EAAE,CAAC;QACxD,KAAK,UAAU,CAAC,IAAI;YAClB,OAAO,EAAE,oBAAoB,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,EAAE,CAAC;QACxD,KAAK,UAAU,CAAC,OAAO;YACrB,OAAO,EAAE,oBAAoB,EAAE,EAAE,EAAE,cAAc,EAAE,CAAC,EAAE,CAAC;QACzD,KAAK,UAAU,CAAC,IAAI;YAClB,OAAO,EAAE,oBAAoB,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,EAAE,CAAC;QACxD;YACE,OAAO,EAAE,oBAAoB,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,EAAE,CAAC;IAC1D,CAAC;AACH,CAAC"}
@@ -0,0 +1,116 @@
1
+ /**
2
+ * Pattern importance scoring for ML-aware compression.
3
+ *
4
+ * Port of `small/pattern_importance.py` to TypeScript.
5
+ */
6
+ import type { DiscoveredPattern, TokenSeq } from '@small-ltsc/sdk';
7
+ /**
8
+ * Interface for embedding providers.
9
+ */
10
+ export interface EmbeddingProvider {
11
+ /**
12
+ * Get embeddings for a sequence of tokens.
13
+ *
14
+ * @param tokens - Token sequence to embed
15
+ * @returns Promise resolving to embedding vector (Float32Array)
16
+ */
17
+ embed(tokens: TokenSeq): Promise<Float32Array>;
18
+ /**
19
+ * Get embedding dimension.
20
+ */
21
+ dimension(): number;
22
+ }
23
+ /**
24
+ * Configuration for importance scoring.
25
+ */
26
+ export interface ImportanceConfig {
27
+ /**
28
+ * Position decay rate for positional scoring.
29
+ * Higher values = more weight on early positions.
30
+ * @default 2.0
31
+ */
32
+ decayRate?: number;
33
+ /**
34
+ * Context window size for embedding-based scoring.
35
+ * @default 5
36
+ */
37
+ contextWindow?: number;
38
+ /**
39
+ * Weight for positional importance (vs. embedding-based).
40
+ * @default 0.3
41
+ */
42
+ positionalWeight?: number;
43
+ }
44
+ /**
45
+ * Interface for pattern importance scorers.
46
+ */
47
+ export interface ImportanceScorer {
48
+ /**
49
+ * Score patterns by importance.
50
+ *
51
+ * Higher scores indicate more important patterns that should be
52
+ * preserved (less aggressively compressed).
53
+ *
54
+ * @param tokens - Original token sequence
55
+ * @param patterns - Discovered patterns to score
56
+ * @returns Promise resolving to importance scores (0-1 range)
57
+ */
58
+ scorePatterns(tokens: TokenSeq, patterns: DiscoveredPattern[]): Promise<number[]>;
59
+ }
60
+ /**
61
+ * Positional importance scorer.
62
+ *
63
+ * Scores patterns based on their position in the sequence,
64
+ * with earlier positions receiving higher importance (useful for
65
+ * system prompts that typically appear at the start).
66
+ */
67
+ export declare class PositionalImportanceScorer implements ImportanceScorer {
68
+ private decayRate;
69
+ constructor(config?: ImportanceConfig);
70
+ scorePatterns(tokens: TokenSeq, patterns: DiscoveredPattern[]): Promise<number[]>;
71
+ }
72
+ /**
73
+ * Embedding-based importance scorer.
74
+ *
75
+ * Uses an embedding model to determine if a pattern appears in
76
+ * diverse semantic contexts (important, should preserve) vs.
77
+ * similar contexts (redundant, safe to compress).
78
+ */
79
+ export declare class EmbeddingImportanceScorer implements ImportanceScorer {
80
+ private provider;
81
+ private contextWindow;
82
+ constructor(provider: EmbeddingProvider, config?: ImportanceConfig);
83
+ scorePatterns(tokens: TokenSeq, patterns: DiscoveredPattern[]): Promise<number[]>;
84
+ }
85
+ /**
86
+ * Combined importance scorer that uses both positional and embedding-based scoring.
87
+ */
88
+ export declare class CombinedImportanceScorer implements ImportanceScorer {
89
+ private positionalScorer;
90
+ private embeddingScorer;
91
+ private positionalWeight;
92
+ constructor(provider?: EmbeddingProvider, config?: ImportanceConfig);
93
+ scorePatterns(tokens: TokenSeq, patterns: DiscoveredPattern[]): Promise<number[]>;
94
+ }
95
+ /**
96
+ * Adjust pattern priorities based on importance scores.
97
+ *
98
+ * Patterns with high importance get lower priority (compressed later),
99
+ * patterns with low importance get higher priority (compressed first).
100
+ *
101
+ * @param patterns - Patterns to adjust
102
+ * @param scores - Importance scores from an ImportanceScorer
103
+ * @param threshold - Patterns above this importance threshold get negative priority
104
+ * @returns Patterns with adjusted priorities
105
+ */
106
+ export declare function adjustPrioritiesByImportance(patterns: DiscoveredPattern[], scores: number[], threshold?: number): DiscoveredPattern[];
107
+ /**
108
+ * Filter patterns to only those below an importance threshold.
109
+ *
110
+ * @param patterns - Patterns to filter
111
+ * @param scores - Importance scores
112
+ * @param threshold - Maximum importance score to include
113
+ * @returns Filtered patterns that are safe to compress
114
+ */
115
+ export declare function filterByImportance(patterns: DiscoveredPattern[], scores: number[], threshold?: number): DiscoveredPattern[];
116
+ //# sourceMappingURL=importance.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"importance.d.ts","sourceRoot":"","sources":["../../src/importance.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,iBAAiB,CAAC;AAEnE;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC;;;;;OAKG;IACH,KAAK,CAAC,MAAM,EAAE,QAAQ,GAAG,OAAO,CAAC,YAAY,CAAC,CAAC;IAE/C;;OAEG;IACH,SAAS,IAAI,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;OAIG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB;;;OAGG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB;;;OAGG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAQD;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;;;;;OASG;IACH,aAAa,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,iBAAiB,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;CACnF;AAED;;;;;;GAMG;AACH,qBAAa,0BAA2B,YAAW,gBAAgB;IACjE,OAAO,CAAC,SAAS,CAAS;gBAEd,MAAM,CAAC,EAAE,gBAAgB;IAK/B,aAAa,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,iBAAiB,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;CAuBxF;AAED;;;;;;GAMG;AACH,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,CAAC,QAAQ,CAAoB;IACpC,OAAO,CAAC,aAAa,CAAS;gBAElB,QAAQ,EAAE,iBAAiB,EAAE,MAAM,CAAC,EAAE,gBAAgB;IAM5D,aAAa,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,iBAAiB,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;CAkDxF;AAED;;GAEG;AACH,qBAAa,wBAAyB,YAAW,gBAAgB;IAC/D,OAAO,CAAC,gBAAgB,CAA6B;IACrD,OAAO,CAAC,eAAe,CAAmC;IAC1D,OAAO,CAAC,gBAAgB,CAAS;gBAErB,QAAQ,CAAC,EAAE,iBAAiB,EAAE,MAAM,CAAC,EAAE,gBAAgB;IAO7D,aAAa,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,iBAAiB,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;CAexF;AA2BD;;;;;;;;;;GAUG;AACH,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,iBAAiB,EAAE,EAC7B,MAAM,EAAE,MAAM,EAAE,EAChB,SAAS,SAAM,GACd,iBAAiB,EAAE,CAuBrB;AAED;;;;;;;GAOG;AACH,wBAAgB,kBAAkB,CAChC,QAAQ,EAAE,iBAAiB,EAAE,EAC7B,MAAM,EAAE,MAAM,EAAE,EAChB,SAAS,SAAM,GACd,iBAAiB,EAAE,CAErB"}
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Small LTSC ML - Machine Learning Features
3
+ *
4
+ * Optional ML features for enhanced compression quality:
5
+ * - Pattern importance scoring
6
+ * - Quality prediction
7
+ * - Adaptive region detection
8
+ *
9
+ * @packageDocumentation
10
+ */
11
+ export { type EmbeddingProvider, type ImportanceConfig, type ImportanceScorer, PositionalImportanceScorer, EmbeddingImportanceScorer, CombinedImportanceScorer, adjustPrioritiesByImportance, filterByImportance, } from './importance.js';
12
+ export { type QualityConfig, type QualityPrediction, type QualityFeatures, type QualityPredictor, HeuristicQualityPredictor, EmbeddingQualityPredictor, createQualityPredictor, } from './quality.js';
13
+ export { RegionType, type Region, type RegionConfig, detectRegions, detectRegionsHeuristic, filterPatternsByRegion, getRegionCompressionSettings, } from './regions.js';
14
+ export declare const VERSION = "0.1.0";
15
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAGH,OAAO,EACL,KAAK,iBAAiB,EACtB,KAAK,gBAAgB,EACrB,KAAK,gBAAgB,EACrB,0BAA0B,EAC1B,yBAAyB,EACzB,wBAAwB,EACxB,4BAA4B,EAC5B,kBAAkB,GACnB,MAAM,iBAAiB,CAAC;AAGzB,OAAO,EACL,KAAK,aAAa,EAClB,KAAK,iBAAiB,EACtB,KAAK,eAAe,EACpB,KAAK,gBAAgB,EACrB,yBAAyB,EACzB,yBAAyB,EACzB,sBAAsB,GACvB,MAAM,cAAc,CAAC;AAGtB,OAAO,EACL,UAAU,EACV,KAAK,MAAM,EACX,KAAK,YAAY,EACjB,aAAa,EACb,sBAAsB,EACtB,sBAAsB,EACtB,4BAA4B,GAC7B,MAAM,cAAc,CAAC;AAGtB,eAAO,MAAM,OAAO,UAAU,CAAC"}
@@ -0,0 +1,147 @@
1
+ /**
2
+ * Quality prediction for compression validation.
3
+ *
4
+ * Predicts whether a compressed sequence will maintain
5
+ * sufficient quality for transformer models to learn from.
6
+ *
7
+ * Port of `small/quality_predictor.py` to TypeScript.
8
+ */
9
+ import type { CompressionResult } from '@small-ltsc/sdk';
10
+ import type { EmbeddingProvider } from './importance.js';
11
+ /**
12
+ * Configuration for quality prediction.
13
+ */
14
+ export interface QualityConfig {
15
+ /**
16
+ * Maximum acceptable compression ratio.
17
+ * @default 0.5
18
+ */
19
+ maxCompressionRatio?: number;
20
+ /**
21
+ * Maximum acceptable dictionary overhead ratio.
22
+ * @default 0.3
23
+ */
24
+ maxDictionaryOverhead?: number;
25
+ /**
26
+ * Minimum embedding similarity between original and compressed.
27
+ * @default 0.7
28
+ */
29
+ minEmbeddingSimilarity?: number;
30
+ /**
31
+ * Maximum acceptable token diversity reduction.
32
+ * @default 0.4
33
+ */
34
+ maxDiversityReduction?: number;
35
+ }
36
+ /**
37
+ * Result of quality prediction.
38
+ */
39
+ export interface QualityPrediction {
40
+ /**
41
+ * Overall quality score (0-1, higher is better).
42
+ */
43
+ score: number;
44
+ /**
45
+ * Whether the compression passes quality threshold.
46
+ */
47
+ acceptable: boolean;
48
+ /**
49
+ * Probability of quality degradation.
50
+ */
51
+ degradationProbability: number;
52
+ /**
53
+ * Detailed feature scores.
54
+ */
55
+ features: QualityFeatures;
56
+ /**
57
+ * Recommendation for how to proceed.
58
+ */
59
+ recommendation: 'accept' | 'retry_conservative' | 'skip_compression';
60
+ }
61
+ /**
62
+ * Feature scores used in quality prediction.
63
+ */
64
+ export interface QualityFeatures {
65
+ /**
66
+ * Compression ratio feature (lower is more aggressive).
67
+ */
68
+ compressionRatio: number;
69
+ /**
70
+ * Dictionary overhead ratio.
71
+ */
72
+ dictionaryOverhead: number;
73
+ /**
74
+ * Token diversity change (0-1, 0 = no change, 1 = complete loss).
75
+ */
76
+ diversityReduction: number;
77
+ /**
78
+ * Average pattern length feature.
79
+ */
80
+ averagePatternLength: number;
81
+ /**
82
+ * Pattern count feature.
83
+ */
84
+ patternCount: number;
85
+ /**
86
+ * Embedding similarity (if available).
87
+ */
88
+ embeddingSimilarity?: number;
89
+ }
90
+ /**
91
+ * Quality predictor interface.
92
+ */
93
+ export interface QualityPredictor {
94
+ /**
95
+ * Predict quality of compressed output.
96
+ *
97
+ * @param result - Compression result to evaluate
98
+ * @returns Promise resolving to quality prediction
99
+ */
100
+ predict(result: CompressionResult): Promise<QualityPrediction>;
101
+ }
102
+ /**
103
+ * Heuristic-based quality predictor.
104
+ *
105
+ * Uses a combination of handcrafted features and thresholds
106
+ * to predict compression quality.
107
+ */
108
+ export declare class HeuristicQualityPredictor implements QualityPredictor {
109
+ private config;
110
+ constructor(config?: QualityConfig);
111
+ predict(result: CompressionResult): Promise<QualityPrediction>;
112
+ /**
113
+ * Extract quality features from compression result.
114
+ */
115
+ private extractFeatures;
116
+ /**
117
+ * Compute overall quality score from features.
118
+ */
119
+ private computeScore;
120
+ /**
121
+ * Compute probability of quality degradation.
122
+ */
123
+ private computeDegradationProbability;
124
+ }
125
+ /**
126
+ * Embedding-enhanced quality predictor.
127
+ *
128
+ * Adds embedding similarity comparison to the heuristic predictor
129
+ * for more accurate quality assessment.
130
+ */
131
+ export declare class EmbeddingQualityPredictor implements QualityPredictor {
132
+ private provider;
133
+ private heuristicPredictor;
134
+ private config;
135
+ constructor(provider: EmbeddingProvider, config?: QualityConfig);
136
+ predict(result: CompressionResult): Promise<QualityPrediction>;
137
+ private cosineSimilarity;
138
+ }
139
+ /**
140
+ * Create a quality predictor.
141
+ *
142
+ * @param provider - Optional embedding provider for enhanced prediction
143
+ * @param config - Quality configuration
144
+ * @returns Quality predictor instance
145
+ */
146
+ export declare function createQualityPredictor(provider?: EmbeddingProvider, config?: QualityConfig): QualityPredictor;
147
+ //# sourceMappingURL=quality.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"quality.d.ts","sourceRoot":"","sources":["../../src/quality.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AACzD,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AAEzD;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAE7B;;;OAGG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAE/B;;;OAGG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAEhC;;;OAGG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;CAChC;AASD;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,UAAU,EAAE,OAAO,CAAC;IAEpB;;OAEG;IACH,sBAAsB,EAAE,MAAM,CAAC;IAE/B;;OAEG;IACH,QAAQ,EAAE,eAAe,CAAC;IAE1B;;OAEG;IACH,cAAc,EAAE,QAAQ,GAAG,oBAAoB,GAAG,kBAAkB,CAAC;CACtE;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B;;OAEG;IACH,gBAAgB,EAAE,MAAM,CAAC;IAEzB;;OAEG;IACH,kBAAkB,EAAE,MAAM,CAAC;IAE3B;;OAEG;IACH,kBAAkB,EAAE,MAAM,CAAC;IAE3B;;OAEG;IACH,oBAAoB,EAAE,MAAM,CAAC;IAE7B;;OAEG;IACH,YAAY,EAAE,MAAM,CAAC;IAErB;;OAEG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;OAKG;IACH,OAAO,CAAC,MAAM,EAAE,iBAAiB,GAAG,OAAO,CAAC,iBAAiB,CAAC,CAAC;CAChE;AAED;;;;;GAKG;AACH,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,CAAC,MAAM,CAA0B;gBAE5B,MAAM,CAAC,EAAE,aAAa;IAI5B,OAAO,CAAC,MAAM,EAAE,iBAAiB,GAAG,OAAO,CAAC,iBAAiB,CAAC;IAwBpE;;OAEG;IACH,OAAO,CAAC,eAAe;IAiCvB;;OAEG;IACH,OAAO,CAAC,YAAY;IAoCpB;;OAEG;IACH,OAAO,CAAC,6BAA6B;CAkCtC;AAED;;;;;GAKG;AACH,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,CAAC,QAAQ,CAAoB;IACpC,OAAO,CAAC,kBAAkB,CAA4B;IACtD,OAAO,CAAC,MAAM,CAA0B;gBAE5B,QAAQ,EAAE,iBAAiB,EAAE,MAAM,CAAC,EAAE,aAAa;IAMzD,OAAO,CAAC,MAAM,EAAE,iBAAiB,GAAG,OAAO,CAAC,iBAAiB,CAAC;IAmDpE,OAAO,CAAC,gBAAgB;CAqBzB;AAED;;;;;;GAMG;AACH,wBAAgB,sBAAsB,CACpC,QAAQ,CAAC,EAAE,iBAAiB,EAC5B,MAAM,CAAC,EAAE,aAAa,GACrB,gBAAgB,CAKlB"}
@@ -0,0 +1,88 @@
1
+ /**
2
+ * Adaptive region detection for context-aware compression.
3
+ *
4
+ * Detects semantic regions in token sequences (system prompts, user input,
5
+ * injected context) and applies different compression strategies.
6
+ *
7
+ * Port of `small/adaptive.py` region detection to TypeScript.
8
+ */
9
+ import type { TokenSeq, DiscoveredPattern } from '@small-ltsc/sdk';
10
+ /**
11
+ * Region types with different compression strategies.
12
+ */
13
+ export declare enum RegionType {
14
+ /** System instructions - minimal compression */
15
+ SYSTEM = "system",
16
+ /** User input - moderate compression */
17
+ USER = "user",
18
+ /** Injected context - aggressive compression */
19
+ CONTEXT = "context",
20
+ /** Code blocks - moderate compression */
21
+ CODE = "code",
22
+ /** Unknown region - default compression */
23
+ UNKNOWN = "unknown"
24
+ }
25
+ /**
26
+ * A detected region in the token sequence.
27
+ */
28
+ export interface Region {
29
+ /** Region type */
30
+ type: RegionType;
31
+ /** Start position (inclusive) */
32
+ start: number;
33
+ /** End position (exclusive) */
34
+ end: number;
35
+ /** Compression retention target (0-1) */
36
+ retention: number;
37
+ }
38
+ /**
39
+ * Configuration for region detection.
40
+ */
41
+ export interface RegionConfig {
42
+ /** Token patterns that mark system region start */
43
+ systemMarkers?: number[][];
44
+ /** Token patterns that mark user region start */
45
+ userMarkers?: number[][];
46
+ /** Token patterns that mark context region start */
47
+ contextMarkers?: number[][];
48
+ /** Token patterns that mark code region start */
49
+ codeMarkers?: number[][];
50
+ /** Retention targets for each region type */
51
+ retentionTargets?: Partial<Record<RegionType, number>>;
52
+ }
53
+ /**
54
+ * Detect regions in a token sequence.
55
+ *
56
+ * Uses marker patterns to identify boundaries between different
57
+ * semantic regions in the input.
58
+ *
59
+ * @param tokens - Token sequence to analyze
60
+ * @param config - Region detection configuration
61
+ * @returns Array of detected regions
62
+ */
63
+ export declare function detectRegions(tokens: TokenSeq, config?: RegionConfig): Region[];
64
+ /**
65
+ * Heuristic-based region detection without explicit markers.
66
+ *
67
+ * Uses statistical features to guess region boundaries.
68
+ */
69
+ export declare function detectRegionsHeuristic(tokens: TokenSeq): Region[];
70
+ /**
71
+ * Filter patterns based on region retention targets.
72
+ *
73
+ * Drops pattern occurrences that start in high-retention regions (retention of 0.9 or more), then removes patterns left with fewer than two occurrences.
74
+ *
75
+ * @param patterns - Discovered patterns
76
+ * @param regions - Detected regions
77
+ * @param _tokens - Original token sequence (currently unused by the implementation)
78
+ * @returns Filtered patterns respecting region constraints
79
+ */
80
+ export declare function filterPatternsByRegion(patterns: DiscoveredPattern[], regions: Region[], _tokens: TokenSeq): DiscoveredPattern[];
81
+ /**
82
+ * Get compression settings for a region type.
83
+ */
84
+ export declare function getRegionCompressionSettings(regionType: RegionType): {
85
+ maxSubsequenceLength: number;
86
+ minOccurrences: number;
87
+ };
88
+ //# sourceMappingURL=regions.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"regions.d.ts","sourceRoot":"","sources":["../../src/regions.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AAEnE;;GAEG;AACH,oBAAY,UAAU;IACpB,gDAAgD;IAChD,MAAM,WAAW;IACjB,wCAAwC;IACxC,IAAI,SAAS;IACb,gDAAgD;IAChD,OAAO,YAAY;IACnB,yCAAyC;IACzC,IAAI,SAAS;IACb,2CAA2C;IAC3C,OAAO,YAAY;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,MAAM;IACrB,kBAAkB;IAClB,IAAI,EAAE,UAAU,CAAC;IACjB,iCAAiC;IACjC,KAAK,EAAE,MAAM,CAAC;IACd,+BAA+B;IAC/B,GAAG,EAAE,MAAM,CAAC;IACZ,yCAAyC;IACzC,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,mDAAmD;IACnD,aAAa,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC;IAC3B,iDAAiD;IACjD,WAAW,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC;IACzB,oDAAoD;IACpD,cAAc,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC;IAC5B,iDAAiD;IACjD,WAAW,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC;IACzB,6CAA6C;IAC7C,gBAAgB,CAAC,EAAE,OAAO,CAAC,MAAM,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC,CAAC;CACxD;AAUD;;;;;;;;;GASG;AACH,wBAAgB,aAAa,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,EAAE,YAAY,GAAG,MAAM,EAAE,CAsF/E;AAED;;;;GAIG;AACH,wBAAgB,sBAAsB,CAAC,MAAM,EAAE,QAAQ,GAAG,MAAM,EAAE,CAyBjE;AAED;;;;;;;;;GASG;AACH,wBAAgB,sBAAsB,CACpC,QAAQ,EAAE,iBAAiB,EAAE,EAC7B,OAAO,EAAE,MAAM,EAAE,EACjB,OAAO,EAAE,QAAQ,GAChB,iBAAiB,EAAE,CAsBrB;AAsED;;GAEG;AACH,wBAAgB,4BAA4B,CAAC,UAAU,EAAE,UAAU,GAAG;IACpE,oBAAoB,EAAE,MAAM,CAAC;IAC7B,cAAc,EAAE,MAAM,CAAC;CACxB,CAaA"}
package/package.json ADDED
@@ -0,0 +1,53 @@
1
+ {
2
+ "name": "@small-ltsc/ml",
3
+ "version": "0.1.0",
4
+ "description": "ML features for Small LTSC - Pattern importance scoring and quality prediction",
5
+ "type": "module",
6
+ "main": "./dist/esm/index.js",
7
+ "module": "./dist/esm/index.js",
8
+ "types": "./dist/types/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "import": "./dist/esm/index.js",
12
+ "types": "./dist/types/index.d.ts"
13
+ }
14
+ },
15
+ "files": [
16
+ "dist"
17
+ ],
18
+ "sideEffects": false,
19
+ "scripts": {
20
+ "build": "npm run build:esm && npm run build:types",
21
+ "build:esm": "tsc -p tsconfig.esm.json",
22
+ "build:types": "tsc -p tsconfig.types.json",
23
+ "test": "vitest",
24
+ "lint": "eslint src --ext .ts",
25
+ "prepublishOnly": "npm run build"
26
+ },
27
+ "keywords": [
28
+ "compression",
29
+ "llm",
30
+ "tokens",
31
+ "transformer",
32
+ "ml",
33
+ "embeddings"
34
+ ],
35
+ "author": "",
36
+ "license": "MIT",
37
+ "repository": {
38
+ "type": "git",
39
+ "url": "https://github.com/triage-sec/small"
40
+ },
41
+ "peerDependencies": {
42
+ "@small-ltsc/sdk": "^0.1.0"
43
+ },
44
+ "devDependencies": {
45
+ "@types/node": "^20.10.0",
46
+ "eslint": "^8.55.0",
47
+ "typescript": "^5.3.0",
48
+ "vitest": "^1.0.0"
49
+ },
50
+ "engines": {
51
+ "node": ">=18.0.0"
52
+ }
53
+ }