@small-ltsc/ml 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +158 -0
- package/dist/esm/importance.js +190 -0
- package/dist/esm/importance.js.map +1 -0
- package/dist/esm/index.js +19 -0
- package/dist/esm/index.js.map +1 -0
- package/dist/esm/quality.js +235 -0
- package/dist/esm/quality.js.map +1 -0
- package/dist/esm/regions.js +251 -0
- package/dist/esm/regions.js.map +1 -0
- package/dist/types/importance.d.ts +116 -0
- package/dist/types/importance.d.ts.map +1 -0
- package/dist/types/index.d.ts +15 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/quality.d.ts +147 -0
- package/dist/types/quality.d.ts.map +1 -0
- package/dist/types/regions.d.ts +88 -0
- package/dist/types/regions.d.ts.map +1 -0
- package/package.json +53 -0
package/README.md
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# @small-ltsc/ml
|
|
2
|
+
|
|
3
|
+
Optional ML features for **Small LTSC** - Pattern importance scoring, quality prediction, and adaptive region detection.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install @small-ltsc/ml @small-ltsc/sdk
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Note: `@small-ltsc/sdk` is a peer dependency.
|
|
12
|
+
|
|
13
|
+
## Features
|
|
14
|
+
|
|
15
|
+
- **Pattern Importance Scoring** - Determine which patterns are semantically important
|
|
16
|
+
- **Quality Prediction** - Predict if compression will degrade model performance
|
|
17
|
+
- **Region Detection** - Identify system prompts, user input, and context for adaptive compression
|
|
18
|
+
|
|
19
|
+
## Pattern Importance
|
|
20
|
+
|
|
21
|
+
Score patterns to preserve semantically important content:
|
|
22
|
+
|
|
23
|
+
```typescript
|
|
24
|
+
import { PositionalImportanceScorer, filterByImportance } from '@small-ltsc/ml';
|
|
25
|
+
import { discoverPatterns } from '@small-ltsc/sdk';
|
|
26
|
+
|
|
27
|
+
const scorer = new PositionalImportanceScorer({ decayRate: 2.0 });
|
|
28
|
+
const patterns = await discoverPatterns(tokens);
|
|
29
|
+
const scores = await scorer.scorePatterns(tokens, patterns);
|
|
30
|
+
|
|
31
|
+
// Keep only patterns scoring below 0.8; higher-importance patterns are left uncompressed
|
|
32
|
+
const safeToCompress = filterByImportance(patterns, scores, 0.8);
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Embedding-Based Scoring
|
|
36
|
+
|
|
37
|
+
For more accurate importance scoring with an embedding model:
|
|
38
|
+
|
|
39
|
+
```typescript
|
|
40
|
+
import { EmbeddingImportanceScorer } from '@small-ltsc/ml';
|
|
41
|
+
|
|
42
|
+
const scorer = new EmbeddingImportanceScorer(embeddingProvider, {
|
|
43
|
+
contextWindow: 5,
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
const scores = await scorer.scorePatterns(tokens, patterns);
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Quality Prediction
|
|
50
|
+
|
|
51
|
+
Predict if compressed output will maintain quality:
|
|
52
|
+
|
|
53
|
+
```typescript
|
|
54
|
+
import { createQualityPredictor } from '@small-ltsc/ml';
|
|
55
|
+
import { compress } from '@small-ltsc/sdk';
|
|
56
|
+
|
|
57
|
+
const predictor = createQualityPredictor();
|
|
58
|
+
const result = await compress(tokens);
|
|
59
|
+
const prediction = await predictor.predict(result);
|
|
60
|
+
|
|
61
|
+
if (!prediction.acceptable) {
|
|
62
|
+
console.log(`Recommendation: ${prediction.recommendation}`);
|
|
63
|
+
// 'accept' | 'retry_conservative' | 'skip_compression'
|
|
64
|
+
}
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Quality Features
|
|
68
|
+
|
|
69
|
+
```typescript
|
|
70
|
+
console.log(prediction.features);
|
|
71
|
+
// {
|
|
72
|
+
// compressionRatio: 0.65,
|
|
73
|
+
// dictionaryOverhead: 0.15,
|
|
74
|
+
// diversityReduction: 0.2,
|
|
75
|
+
// averagePatternLength: 4.5,
|
|
76
|
+
// patternCount: 12,
|
|
77
|
+
// }
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Region Detection
|
|
81
|
+
|
|
82
|
+
Detect semantic regions for adaptive compression:
|
|
83
|
+
|
|
84
|
+
```typescript
|
|
85
|
+
import { detectRegions, RegionType, filterPatternsByRegion } from '@small-ltsc/ml';
|
|
86
|
+
|
|
87
|
+
const regions = detectRegions(tokens, {
|
|
88
|
+
systemMarkers: [[58, 71905, 60]], // [SYSTEM] tokens
|
|
89
|
+
retentionTargets: {
|
|
90
|
+
[RegionType.SYSTEM]: 0.98, // Minimal compression
|
|
91
|
+
[RegionType.USER]: 0.85, // Moderate
|
|
92
|
+
[RegionType.CONTEXT]: 0.6, // Aggressive
|
|
93
|
+
},
|
|
94
|
+
});
|
|
95
|
+
|
|
96
|
+
// Filter patterns based on region constraints
|
|
97
|
+
const filtered = filterPatternsByRegion(patterns, regions, tokens);
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Region Types
|
|
101
|
+
|
|
102
|
+
- `SYSTEM` - System instructions (high retention)
|
|
103
|
+
- `USER` - User input (moderate retention)
|
|
104
|
+
- `CONTEXT` - Injected context/documents (low retention)
|
|
105
|
+
- `CODE` - Code blocks (moderate retention)
|
|
106
|
+
- `UNKNOWN` - Default region
|
|
107
|
+
|
|
108
|
+
## Custom Embedding Provider
|
|
109
|
+
|
|
110
|
+
Implement the `EmbeddingProvider` interface for your embedding model:
|
|
111
|
+
|
|
112
|
+
```typescript
|
|
113
|
+
import type { EmbeddingProvider } from '@small-ltsc/ml';
|
|
114
|
+
|
|
115
|
+
class OpenAIEmbeddings implements EmbeddingProvider {
|
|
116
|
+
async embed(tokens: readonly number[]): Promise<Float32Array> {
|
|
117
|
+
const text = tokenizer.decode(tokens);
|
|
118
|
+
const response = await openai.embeddings.create({
|
|
119
|
+
model: 'text-embedding-3-small',
|
|
120
|
+
input: text,
|
|
121
|
+
});
|
|
122
|
+
return new Float32Array(response.data[0].embedding);
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
dimension(): number {
|
|
126
|
+
return 1536;
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
const scorer = new EmbeddingImportanceScorer(new OpenAIEmbeddings());
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## API Reference
|
|
134
|
+
|
|
135
|
+
### Importance Scoring
|
|
136
|
+
|
|
137
|
+
- `PositionalImportanceScorer` - Score by position (early = important)
|
|
138
|
+
- `EmbeddingImportanceScorer` - Score by context diversity
|
|
139
|
+
- `CombinedImportanceScorer` - Combine positional and embedding scoring
|
|
140
|
+
- `adjustPrioritiesByImportance(patterns, scores, threshold)` - Adjust pattern priorities
|
|
141
|
+
- `filterByImportance(patterns, scores, threshold)` - Keep only patterns whose importance score is below `threshold`
|
|
142
|
+
|
|
143
|
+
### Quality Prediction
|
|
144
|
+
|
|
145
|
+
- `HeuristicQualityPredictor` - Rule-based quality prediction
|
|
146
|
+
- `EmbeddingQualityPredictor` - Enhanced with embedding similarity
|
|
147
|
+
- `createQualityPredictor(provider?, config?)` - Factory function
|
|
148
|
+
|
|
149
|
+
### Region Detection
|
|
150
|
+
|
|
151
|
+
- `detectRegions(tokens, config?)` - Detect semantic regions
|
|
152
|
+
- `detectRegionsHeuristic(tokens)` - Simple heuristic detection
|
|
153
|
+
- `filterPatternsByRegion(patterns, regions, tokens)` - Filter by region
|
|
154
|
+
- `getRegionCompressionSettings(regionType)` - Get settings for region
|
|
155
|
+
|
|
156
|
+
## License
|
|
157
|
+
|
|
158
|
+
MIT
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pattern importance scoring for ML-aware compression.
|
|
3
|
+
*
|
|
4
|
+
* Port of `small/pattern_importance.py` to TypeScript.
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Baseline importance settings.
 *
 * The scorers in this module spread this object first and the caller's
 * config second, so any key the caller supplies replaces the value here.
 */
const DEFAULT_IMPORTANCE_CONFIG = {
    // Multiplier inside the positional scorer's exp(-decayRate * pos / n) term.
    decayRate: 2,
    // Tokens taken on each side of a pattern occurrence when building embedding context.
    contextWindow: 5,
    // Weight of the positional score in the combined scorer; the embedding score gets the remainder.
    positionalWeight: 0.3,
};
|
|
11
|
+
/**
 * Scores patterns by where they occur in the token sequence.
 *
 * Every occurrence contributes `exp(-decayRate * position / tokens.length)`,
 * so an occurrence at position 0 contributes 1 and later occurrences
 * contribute progressively less. A pattern's score is the mean contribution
 * over all of its occurrences.
 */
export class PositionalImportanceScorer {
    decayRate;
    /**
     * @param config - Optional overrides merged over the module defaults;
     *   only `decayRate` is read by this scorer.
     */
    constructor(config) {
        const merged = { ...DEFAULT_IMPORTANCE_CONFIG, ...config };
        this.decayRate = merged.decayRate;
    }
    /**
     * @param tokens - Token sequence; only its `length` is used.
     * @param patterns - Patterns, each exposing a `positions` array.
     * @returns One score per pattern, in input order. Every score is 0 when
     *   `tokens` is empty, and a pattern with no positions scores 0.
     */
    async scorePatterns(tokens, patterns) {
        const total = tokens.length;
        if (total === 0) {
            return patterns.map(() => 0);
        }
        const rate = this.decayRate;
        return patterns.map((pattern) => {
            const occurrences = pattern.positions;
            if (occurrences.length === 0) {
                return 0;
            }
            // Sum the exponential decay of each occurrence, then average.
            const summed = occurrences.reduce((acc, pos) => acc + Math.exp(-rate * (pos / total)), 0);
            return summed / occurrences.length;
        });
    }
}
|
|
45
|
+
/**
|
|
46
|
+
* Embedding-based importance scorer.
|
|
47
|
+
*
|
|
48
|
+
* Uses an embedding model to determine if a pattern appears in
|
|
49
|
+
* diverse semantic contexts (important, should preserve) vs.
|
|
50
|
+
* similar contexts (redundant, safe to compress).
|
|
51
|
+
*/
|
|
52
|
+
export class EmbeddingImportanceScorer {
|
|
53
|
+
provider;
|
|
54
|
+
contextWindow;
|
|
55
|
+
constructor(provider, config) {
|
|
56
|
+
const cfg = { ...DEFAULT_IMPORTANCE_CONFIG, ...config };
|
|
57
|
+
this.provider = provider;
|
|
58
|
+
this.contextWindow = cfg.contextWindow;
|
|
59
|
+
}
|
|
60
|
+
async scorePatterns(tokens, patterns) {
|
|
61
|
+
const tokenArray = Array.isArray(tokens) ? tokens : Array.from(tokens);
|
|
62
|
+
const n = tokenArray.length;
|
|
63
|
+
if (n === 0) {
|
|
64
|
+
return patterns.map(() => 0);
|
|
65
|
+
}
|
|
66
|
+
const scores = [];
|
|
67
|
+
for (const pattern of patterns) {
|
|
68
|
+
if (pattern.positions.length <= 1) {
|
|
69
|
+
// Single occurrence - can't compute diversity
|
|
70
|
+
scores.push(0.5);
|
|
71
|
+
continue;
|
|
72
|
+
}
|
|
73
|
+
// Extract context windows around each occurrence
|
|
74
|
+
const contextEmbeddings = [];
|
|
75
|
+
for (const pos of pattern.positions) {
|
|
76
|
+
const start = Math.max(0, pos - this.contextWindow);
|
|
77
|
+
const end = Math.min(n, pos + pattern.length + this.contextWindow);
|
|
78
|
+
const context = tokenArray.slice(start, end);
|
|
79
|
+
const embedding = await this.provider.embed(context);
|
|
80
|
+
contextEmbeddings.push(embedding);
|
|
81
|
+
}
|
|
82
|
+
// Compute pairwise cosine similarities
|
|
83
|
+
const similarities = [];
|
|
84
|
+
for (let i = 0; i < contextEmbeddings.length; i++) {
|
|
85
|
+
for (let j = i + 1; j < contextEmbeddings.length; j++) {
|
|
86
|
+
const sim = cosineSimilarity(contextEmbeddings[i], contextEmbeddings[j]);
|
|
87
|
+
similarities.push(sim);
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
// Low average similarity = diverse contexts = high importance
|
|
91
|
+
const avgSimilarity = similarities.length > 0
|
|
92
|
+
? similarities.reduce((a, b) => a + b, 0) / similarities.length
|
|
93
|
+
: 0;
|
|
94
|
+
// Convert to importance (invert similarity)
|
|
95
|
+
scores.push(1 - avgSimilarity);
|
|
96
|
+
}
|
|
97
|
+
return scores;
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
/**
|
|
101
|
+
* Combined importance scorer that uses both positional and embedding-based scoring.
|
|
102
|
+
*/
|
|
103
|
+
export class CombinedImportanceScorer {
|
|
104
|
+
positionalScorer;
|
|
105
|
+
embeddingScorer;
|
|
106
|
+
positionalWeight;
|
|
107
|
+
constructor(provider, config) {
|
|
108
|
+
const cfg = { ...DEFAULT_IMPORTANCE_CONFIG, ...config };
|
|
109
|
+
this.positionalScorer = new PositionalImportanceScorer(config);
|
|
110
|
+
this.embeddingScorer = provider ? new EmbeddingImportanceScorer(provider, config) : null;
|
|
111
|
+
this.positionalWeight = cfg.positionalWeight;
|
|
112
|
+
}
|
|
113
|
+
async scorePatterns(tokens, patterns) {
|
|
114
|
+
const positionalScores = await this.positionalScorer.scorePatterns(tokens, patterns);
|
|
115
|
+
if (!this.embeddingScorer) {
|
|
116
|
+
return positionalScores;
|
|
117
|
+
}
|
|
118
|
+
const embeddingScores = await this.embeddingScorer.scorePatterns(tokens, patterns);
|
|
119
|
+
// Weighted combination
|
|
120
|
+
return positionalScores.map((posScore, i) => {
|
|
121
|
+
const embScore = embeddingScores[i];
|
|
122
|
+
return this.positionalWeight * posScore + (1 - this.positionalWeight) * embScore;
|
|
123
|
+
});
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
/**
 * Cosine of the angle between two equal-length numeric vectors.
 *
 * @param a - First vector (indexable, with a `length`).
 * @param b - Second vector, same length as `a`.
 * @returns The dot product divided by the product of the Euclidean norms,
 *   or 0 when either vector has zero norm.
 * @throws Error if the vectors differ in length.
 */
function cosineSimilarity(a, b) {
    const dim = a.length;
    if (dim !== b.length) {
        throw new Error('Vectors must have same length');
    }
    let dot = 0;
    let squaredA = 0;
    let squaredB = 0;
    for (let k = 0; k < dim; k += 1) {
        const x = a[k];
        const y = b[k];
        dot += x * y;
        squaredA += x * x;
        squaredB += y * y;
    }
    // A zero vector has no direction; report 0 instead of dividing by zero.
    const degenerate = squaredA === 0 || squaredB === 0;
    return degenerate ? 0 : dot / (Math.sqrt(squaredA) * Math.sqrt(squaredB));
}
|
|
146
|
+
/**
 * Annotate each pattern with its importance score and a derived priority.
 *
 * A score strictly above `threshold` yields a negative priority
 * (`-10 * score`); any other score yields a positive priority
 * (`10 * (1 - score)`), so the least important patterns rank highest.
 *
 * @param patterns - Patterns to annotate; the inputs are not mutated.
 * @param scores - Importance scores, matched to `patterns` by index.
 * @param threshold - Scores above this value get a negative priority.
 * @returns Shallow copies of the patterns with `_importanceScore` and
 *   `_adjustedPriority` added.
 */
export function adjustPrioritiesByImportance(patterns, scores, threshold = 0.7) {
    return patterns.map((pattern, idx) => {
        const score = scores[idx];
        const adjusted = score > threshold
            ? -10 * score // preserve: pushed below zero
            : 10 * (1 - score); // compress: lower score, higher priority
        return {
            ...pattern,
            _importanceScore: score,
            _adjustedPriority: adjusted,
        };
    });
}
|
|
179
|
+
/**
 * Keep the patterns whose importance score is strictly below `threshold`.
 *
 * @param patterns - Candidate patterns.
 * @param scores - Importance scores, matched to `patterns` by index.
 * @param threshold - Exclusive upper bound on the score of a kept pattern.
 * @returns A new array holding the kept patterns in their original order.
 */
export function filterByImportance(patterns, scores, threshold = 0.8) {
    const kept = [];
    patterns.forEach((pattern, idx) => {
        if (scores[idx] < threshold) {
            kept.push(pattern);
        }
    });
    return kept;
}
|
|
190
|
+
//# sourceMappingURL=importance.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"importance.js","sourceRoot":"","sources":["../../src/importance.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AA8CH,MAAM,yBAAyB,GAA+B;IAC5D,SAAS,EAAE,GAAG;IACd,aAAa,EAAE,CAAC;IAChB,gBAAgB,EAAE,GAAG;CACtB,CAAC;AAmBF;;;;;;GAMG;AACH,MAAM,OAAO,0BAA0B;IAC7B,SAAS,CAAS;IAE1B,YAAY,MAAyB;QACnC,MAAM,GAAG,GAAG,EAAE,GAAG,yBAAyB,EAAE,GAAG,MAAM,EAAE,CAAC;QACxD,IAAI,CAAC,SAAS,GAAG,GAAG,CAAC,SAAS,CAAC;IACjC,CAAC;IAED,KAAK,CAAC,aAAa,CAAC,MAAgB,EAAE,QAA6B;QACjE,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;QACxB,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YACZ,OAAO,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/B,CAAC;QAED,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE;YAC9B,IAAI,OAAO,CAAC,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACnC,OAAO,CAAC,CAAC;YACX,CAAC;YAED,+DAA+D;YAC/D,IAAI,eAAe,GAAG,CAAC,CAAC;YACxB,KAAK,MAAM,GAAG,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;gBACpC,2CAA2C;gBAC3C,MAAM,aAAa,GAAG,GAAG,GAAG,CAAC,CAAC;gBAC9B,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,SAAS,GAAG,aAAa,CAAC,CAAC;gBAC7D,eAAe,IAAI,UAAU,CAAC;YAChC,CAAC;YAED,OAAO,eAAe,GAAG,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC;QACpD,CAAC,CAAC,CAAC;IACL,CAAC;CACF;AAED;;;;;;GAMG;AACH,MAAM,OAAO,yBAAyB;IAC5B,QAAQ,CAAoB;IAC5B,aAAa,CAAS;IAE9B,YAAY,QAA2B,EAAE,MAAyB;QAChE,MAAM,GAAG,GAAG,EAAE,GAAG,yBAAyB,EAAE,GAAG,MAAM,EAAE,CAAC;QACxD,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,aAAa,GAAG,GAAG,CAAC,aAAa,CAAC;IACzC,CAAC;IAED,KAAK,CAAC,aAAa,CAAC,MAAgB,EAAE,QAA6B;QACjE,MAAM,UAAU,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACvE,MAAM,CAAC,GAAG,UAAU,CAAC,MAAM,CAAC;QAE5B,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YACZ,OAAO,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/B,CAAC;QAED,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,IAAI,OAAO,CAAC,SAAS,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;gBAClC,8CAA8C;gBAC9C,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACjB,SAAS;YACX,CAAC;YAED,iDAAiD;YACjD,MAAM,iBAAiB,GAAmB,EAAE,CAAC;YAE7C,KAAK,MAAM,GAAG,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;gBACpC,MAAM,KAAK,GAAG,IAAI,CAAC,G
AAG,CAAC,CAAC,EAAE,GAAG,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC;gBACpD,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC;gBACnE,MAAM,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;gBAE7C,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;gBACrD,iBAAiB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACpC,CAAC;YAED,uCAAuC;YACvC,MAAM,YAAY,GAAa,EAAE,CAAC;YAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,iBAAiB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAClD,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,iBAAiB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;oBACtD,MAAM,GAAG,GAAG,gBAAgB,CAAC,iBAAiB,CAAC,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC,CAAC,CAAC;oBACzE,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACzB,CAAC;YACH,CAAC;YAED,8DAA8D;YAC9D,MAAM,aAAa,GACjB,YAAY,CAAC,MAAM,GAAG,CAAC;gBACrB,CAAC,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM;gBAC/D,CAAC,CAAC,CAAC,CAAC;YAER,4CAA4C;YAC5C,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,aAAa,CAAC,CAAC;QACjC,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AAED;;GAEG;AACH,MAAM,OAAO,wBAAwB;IAC3B,gBAAgB,CAA6B;IAC7C,eAAe,CAAmC;IAClD,gBAAgB,CAAS;IAEjC,YAAY,QAA4B,EAAE,MAAyB;QACjE,MAAM,GAAG,GAAG,EAAE,GAAG,yBAAyB,EAAE,GAAG,MAAM,EAAE,CAAC;QACxD,IAAI,CAAC,gBAAgB,GAAG,IAAI,0BAA0B,CAAC,MAAM,CAAC,CAAC;QAC/D,IAAI,CAAC,eAAe,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,yBAAyB,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QACzF,IAAI,CAAC,gBAAgB,GAAG,GAAG,CAAC,gBAAgB,CAAC;IAC/C,CAAC;IAED,KAAK,CAAC,aAAa,CAAC,MAAgB,EAAE,QAA6B;QACjE,MAAM,gBAAgB,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,aAAa,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;QAErF,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;YAC1B,OAAO,gBAAgB,CAAC;QAC1B,CAAC;QAED,MAAM,eAAe,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,aAAa,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;QAEnF,uBAAuB;QACvB,OAAO,gBAAgB,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE;YAC1C,MAAM,QAAQ,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;YACpC,OAAO,IAAI,CAAC,gBAAgB,GAAG,QAAQ,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,gBAAgB,CAAC,GAAG,QAAQ,CAAC;QACnF,CAAC,CAAC,CAAC;IACL,CAAC;CAC
F;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,CAAe,EAAE,CAAe;IACxD,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM,EAAE,CAAC;QAC1B,MAAM,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACnD,CAAC;IAED,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,UAAU,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAC1B,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACrB,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,CAAC;IAED,IAAI,KAAK,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,CAAC,CAAC;IACX,CAAC;IAED,OAAO,UAAU,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;AAC5D,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,4BAA4B,CAC1C,QAA6B,EAC7B,MAAgB,EAChB,SAAS,GAAG,GAAG;IAEf,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,EAAE,EAAE;QACjC,MAAM,UAAU,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QAE7B,4CAA4C;QAC5C,4CAA4C;QAC5C,IAAI,kBAA0B,CAAC;QAE/B,IAAI,UAAU,GAAG,SAAS,EAAE,CAAC;YAC3B,iDAAiD;YACjD,kBAAkB,GAAG,CAAC,EAAE,GAAG,UAAU,CAAC;QACxC,CAAC;aAAM,CAAC;YACN,iDAAiD;YACjD,kBAAkB,GAAG,EAAE,GAAG,CAAC,CAAC,GAAG,UAAU,CAAC,CAAC;QAC7C,CAAC;QAED,OAAO;YACL,GAAG,OAAO;YACV,sDAAsD;YACtD,gBAAgB,EAAE,UAAU;YAC5B,iBAAiB,EAAE,kBAAkB;SACyC,CAAC;IACnF,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,kBAAkB,CAChC,QAA6B,EAC7B,MAAgB,EAChB,SAAS,GAAG,GAAG;IAEf,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,SAAS,CAAC,CAAC;AAC1D,CAAC"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Small LTSC ML - Machine Learning Features
|
|
3
|
+
*
|
|
4
|
+
* Optional ML features for enhanced compression quality:
|
|
5
|
+
* - Pattern importance scoring
|
|
6
|
+
* - Quality prediction
|
|
7
|
+
* - Adaptive region detection
|
|
8
|
+
*
|
|
9
|
+
* @packageDocumentation
|
|
10
|
+
*/
|
|
11
|
+
// Importance scoring
|
|
12
|
+
export { PositionalImportanceScorer, EmbeddingImportanceScorer, CombinedImportanceScorer, adjustPrioritiesByImportance, filterByImportance, } from './importance.js';
|
|
13
|
+
// Quality prediction
|
|
14
|
+
export { HeuristicQualityPredictor, EmbeddingQualityPredictor, createQualityPredictor, } from './quality.js';
|
|
15
|
+
// Region detection
|
|
16
|
+
export { RegionType, detectRegions, detectRegionsHeuristic, filterPatternsByRegion, getRegionCompressionSettings, } from './regions.js';
|
|
17
|
+
// Version
|
|
18
|
+
/** Version string exported from the package entry point. */
export const VERSION = '0.1.0';
|
|
19
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,qBAAqB;AACrB,OAAO,EAIL,0BAA0B,EAC1B,yBAAyB,EACzB,wBAAwB,EACxB,4BAA4B,EAC5B,kBAAkB,GACnB,MAAM,iBAAiB,CAAC;AAEzB,qBAAqB;AACrB,OAAO,EAKL,yBAAyB,EACzB,yBAAyB,EACzB,sBAAsB,GACvB,MAAM,cAAc,CAAC;AAEtB,mBAAmB;AACnB,OAAO,EACL,UAAU,EAGV,aAAa,EACb,sBAAsB,EACtB,sBAAsB,EACtB,4BAA4B,GAC7B,MAAM,cAAc,CAAC;AAEtB,UAAU;AACV,MAAM,CAAC,MAAM,OAAO,GAAG,OAAO,CAAC"}
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Quality prediction for compression validation.
|
|
3
|
+
*
|
|
4
|
+
* Predicts whether a compressed sequence will maintain
|
|
5
|
+
* sufficient quality for transformer models to learn from.
|
|
6
|
+
*
|
|
7
|
+
* Port of `small/quality_predictor.py` to TypeScript.
|
|
8
|
+
*/
|
|
9
|
+
/**
 * Baseline thresholds for the quality predictors.
 *
 * The predictors spread this object first and the caller's config second,
 * so any key the caller supplies replaces the value here.
 */
const DEFAULT_QUALITY_CONFIG = {
    // Despite the name, the score is penalized when the compression ratio falls BELOW this value.
    maxCompressionRatio: 0.5,
    // Dictionary-size / original-length ratio above which the score is penalized.
    maxDictionaryOverhead: 0.3,
    // Embedding similarity below which the score is penalized.
    minEmbeddingSimilarity: 0.7,
    // Loss of distinct-token diversity above which the score is penalized.
    maxDiversityReduction: 0.4,
};
|
|
15
|
+
/**
 * Rule-based quality predictor.
 *
 * Derives a handful of features from a compression result, turns them into
 * a quality score and a degradation probability (both clamped to [0, 1]),
 * and maps those to a recommendation.
 */
export class HeuristicQualityPredictor {
    config;
    /**
     * @param config - Optional threshold overrides merged over the module defaults.
     */
    constructor(config) {
        this.config = { ...DEFAULT_QUALITY_CONFIG, ...config };
    }
    /**
     * @param result - Compression result to assess.
     * @returns `score`, `acceptable`, `degradationProbability`, `features`
     *   and `recommendation`. A result is acceptable when the score is at
     *   least 0.6 and the degradation probability is below 0.3.
     */
    async predict(result) {
        const features = this.extractFeatures(result);
        const score = this.computeScore(features);
        const degradationProbability = this.computeDegradationProbability(features);
        const acceptable = score >= 0.6 && degradationProbability < 0.3;
        const recommendation = acceptable
            ? 'accept'
            : degradationProbability < 0.5
                ? 'retry_conservative'
                : 'skip_compression';
        return { score, acceptable, degradationProbability, features, recommendation };
    }
    /**
     * Derive the feature set from a compression result.
     *
     * Reads `compressionRatio`, `originalLength`, `dictionaryTokens`,
     * `originalTokens`, `serializedTokens` and `dictionaryMap` from `result`.
     */
    extractFeatures(result) {
        const compressionRatio = result.compressionRatio;
        // Share of the original length taken up by dictionary tokens.
        let dictionaryOverhead = 0;
        if (result.originalLength > 0) {
            dictionaryOverhead = result.dictionaryTokens.length / result.originalLength;
        }
        // Relative drop in the number of distinct tokens.
        const distinctBefore = new Set(result.originalTokens).size;
        const distinctAfter = new Set(result.serializedTokens).size;
        let diversityReduction = 0;
        if (distinctBefore > 0) {
            diversityReduction = 1 - distinctAfter / distinctBefore;
        }
        // Mean length of the dictionary's patterns.
        const patternCount = result.dictionaryMap.size;
        let lengthSum = 0;
        for (const entry of result.dictionaryMap) {
            lengthSum += entry[1].length;
        }
        let averagePatternLength = 0;
        if (patternCount > 0) {
            averagePatternLength = lengthSum / patternCount;
        }
        return {
            compressionRatio,
            dictionaryOverhead,
            diversityReduction,
            averagePatternLength,
            patternCount,
        };
    }
    /**
     * Turn features into a quality score in [0, 1]: start at 1, subtract a
     * penalty for each configured threshold that is crossed, and add a small
     * bonus when the average pattern length is between 3 and 6 inclusive.
     */
    computeScore(features) {
        const limits = this.config;
        let quality = 1.0;
        // Compression ratio below the configured value
        if (features.compressionRatio < limits.maxCompressionRatio) {
            quality -= (limits.maxCompressionRatio - features.compressionRatio) * 0.5;
        }
        // Dictionary overhead above the configured limit
        if (features.dictionaryOverhead > limits.maxDictionaryOverhead) {
            quality -= (features.dictionaryOverhead - limits.maxDictionaryOverhead) * 0.3;
        }
        // Diversity loss above the configured limit
        if (features.diversityReduction > limits.maxDiversityReduction) {
            quality -= (features.diversityReduction - limits.maxDiversityReduction) * 0.4;
        }
        // Bonus for mid-length patterns
        const avgLen = features.averagePatternLength;
        if (avgLen >= 3 && avgLen <= 6) {
            quality += 0.1;
        }
        // Embedding similarity only counts when the feature is present
        const similarity = features.embeddingSimilarity;
        if (similarity !== undefined && similarity < limits.minEmbeddingSimilarity) {
            quality -= (limits.minEmbeddingSimilarity - similarity) * 0.5;
        }
        return Math.max(0, Math.min(1, quality));
    }
    /**
     * Turn features into a degradation probability in [0, 1] by summing
     * fixed risk increments. The cut-offs here are literals and do not come
     * from `this.config`.
     */
    computeDegradationProbability(features) {
        let risk = 0;
        // Aggressive compression: two tiers
        const ratio = features.compressionRatio;
        risk += ratio < 0.4 ? 0.3 : ratio < 0.5 ? 0.15 : 0;
        // High dictionary overhead
        risk += features.dictionaryOverhead > 0.3 ? 0.2 : 0;
        // Diversity reduction: two tiers
        const lost = features.diversityReduction;
        risk += lost > 0.3 ? 0.25 : lost > 0.2 ? 0.1 : 0;
        // Very short patterns
        risk += features.averagePatternLength < 2.5 ? 0.15 : 0;
        // Low embedding similarity, scaled by how far it falls under 0.7
        const similarity = features.embeddingSimilarity;
        if (similarity !== undefined && similarity < 0.7) {
            risk += 0.3 * (0.7 - similarity);
        }
        return Math.max(0, Math.min(1, risk));
    }
}
|
|
144
|
+
/**
|
|
145
|
+
* Embedding-enhanced quality predictor.
|
|
146
|
+
*
|
|
147
|
+
* Adds embedding similarity comparison to the heuristic predictor
|
|
148
|
+
* for more accurate quality assessment.
|
|
149
|
+
*/
|
|
150
|
+
export class EmbeddingQualityPredictor {
|
|
151
|
+
provider;
|
|
152
|
+
heuristicPredictor;
|
|
153
|
+
config;
|
|
154
|
+
constructor(provider, config) {
|
|
155
|
+
this.provider = provider;
|
|
156
|
+
this.heuristicPredictor = new HeuristicQualityPredictor(config);
|
|
157
|
+
this.config = { ...DEFAULT_QUALITY_CONFIG, ...config };
|
|
158
|
+
}
|
|
159
|
+
async predict(result) {
|
|
160
|
+
// Get base prediction
|
|
161
|
+
const basePrediction = await this.heuristicPredictor.predict(result);
|
|
162
|
+
// Compute embedding similarity
|
|
163
|
+
const originalEmbedding = await this.provider.embed(result.originalTokens);
|
|
164
|
+
const compressedEmbedding = await this.provider.embed(result.serializedTokens);
|
|
165
|
+
const similarity = this.cosineSimilarity(originalEmbedding, compressedEmbedding);
|
|
166
|
+
// Update features with embedding similarity
|
|
167
|
+
const features = {
|
|
168
|
+
...basePrediction.features,
|
|
169
|
+
embeddingSimilarity: similarity,
|
|
170
|
+
};
|
|
171
|
+
// Recompute scores with embedding
|
|
172
|
+
let score = basePrediction.score;
|
|
173
|
+
let degradationProbability = basePrediction.degradationProbability;
|
|
174
|
+
if (similarity < this.config.minEmbeddingSimilarity) {
|
|
175
|
+
const penalty = (this.config.minEmbeddingSimilarity - similarity) * 0.3;
|
|
176
|
+
score -= penalty;
|
|
177
|
+
degradationProbability += 0.2;
|
|
178
|
+
}
|
|
179
|
+
else {
|
|
180
|
+
// High similarity is a good signal
|
|
181
|
+
score += (similarity - this.config.minEmbeddingSimilarity) * 0.1;
|
|
182
|
+
}
|
|
183
|
+
score = Math.max(0, Math.min(1, score));
|
|
184
|
+
degradationProbability = Math.max(0, Math.min(1, degradationProbability));
|
|
185
|
+
const acceptable = score >= 0.6 && degradationProbability < 0.3;
|
|
186
|
+
let recommendation;
|
|
187
|
+
if (acceptable) {
|
|
188
|
+
recommendation = 'accept';
|
|
189
|
+
}
|
|
190
|
+
else if (degradationProbability < 0.5) {
|
|
191
|
+
recommendation = 'retry_conservative';
|
|
192
|
+
}
|
|
193
|
+
else {
|
|
194
|
+
recommendation = 'skip_compression';
|
|
195
|
+
}
|
|
196
|
+
return {
|
|
197
|
+
score,
|
|
198
|
+
acceptable,
|
|
199
|
+
degradationProbability,
|
|
200
|
+
features,
|
|
201
|
+
recommendation,
|
|
202
|
+
};
|
|
203
|
+
}
|
|
204
|
+
cosineSimilarity(a, b) {
|
|
205
|
+
if (a.length !== b.length) {
|
|
206
|
+
throw new Error('Vectors must have same length');
|
|
207
|
+
}
|
|
208
|
+
let dotProduct = 0;
|
|
209
|
+
let normA = 0;
|
|
210
|
+
let normB = 0;
|
|
211
|
+
for (let i = 0; i < a.length; i++) {
|
|
212
|
+
dotProduct += a[i] * b[i];
|
|
213
|
+
normA += a[i] * a[i];
|
|
214
|
+
normB += b[i] * b[i];
|
|
215
|
+
}
|
|
216
|
+
if (normA === 0 || normB === 0) {
|
|
217
|
+
return 0;
|
|
218
|
+
}
|
|
219
|
+
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
/**
|
|
223
|
+
* Create a quality predictor.
|
|
224
|
+
*
|
|
225
|
+
* @param provider - Optional embedding provider for enhanced prediction
|
|
226
|
+
* @param config - Quality configuration
|
|
227
|
+
* @returns Quality predictor instance
|
|
228
|
+
*/
|
|
229
|
+
export function createQualityPredictor(provider, config) {
|
|
230
|
+
if (provider) {
|
|
231
|
+
return new EmbeddingQualityPredictor(provider, config);
|
|
232
|
+
}
|
|
233
|
+
return new HeuristicQualityPredictor(config);
|
|
234
|
+
}
|
|
235
|
+
//# sourceMappingURL=quality.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"quality.js","sourceRoot":"","sources":["../../src/quality.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAkCH,MAAM,sBAAsB,GAA4B;IACtD,mBAAmB,EAAE,GAAG;IACxB,qBAAqB,EAAE,GAAG;IAC1B,sBAAsB,EAAE,GAAG;IAC3B,qBAAqB,EAAE,GAAG;CAC3B,CAAC;AAgFF;;;;;GAKG;AACH,MAAM,OAAO,yBAAyB;IAC5B,MAAM,CAA0B;IAExC,YAAY,MAAsB;QAChC,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,sBAAsB,EAAE,GAAG,MAAM,EAAE,CAAC;IACzD,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,MAAyB;QACrC,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC;QAC9C,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;QAC1C,MAAM,sBAAsB,GAAG,IAAI,CAAC,6BAA6B,CAAC,QAAQ,CAAC,CAAC;QAC5E,MAAM,UAAU,GAAG,KAAK,IAAI,GAAG,IAAI,sBAAsB,GAAG,GAAG,CAAC;QAEhE,IAAI,cAAmD,CAAC;QACxD,IAAI,UAAU,EAAE,CAAC;YACf,cAAc,GAAG,QAAQ,CAAC;QAC5B,CAAC;aAAM,IAAI,sBAAsB,GAAG,GAAG,EAAE,CAAC;YACxC,cAAc,GAAG,oBAAoB,CAAC;QACxC,CAAC;aAAM,CAAC;YACN,cAAc,GAAG,kBAAkB,CAAC;QACtC,CAAC;QAED,OAAO;YACL,KAAK;YACL,UAAU;YACV,sBAAsB;YACtB,QAAQ;YACR,cAAc;SACf,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,eAAe,CAAC,MAAyB;QAC/C,oBAAoB;QACpB,MAAM,gBAAgB,GAAG,MAAM,CAAC,gBAAgB,CAAC;QAEjD,sBAAsB;QACtB,MAAM,kBAAkB,GACtB,MAAM,CAAC,cAAc,GAAG,CAAC;YACvB,CAAC,CAAC,MAAM,CAAC,gBAAgB,CAAC,MAAM,GAAG,MAAM,CAAC,cAAc;YACxD,CAAC,CAAC,CAAC,CAAC;QAER,kBAAkB;QAClB,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC;QAC9D,MAAM,mBAAmB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,gBAAgB,CAAC,CAAC,IAAI,CAAC;QAClE,MAAM,kBAAkB,GACtB,iBAAiB,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,mBAAmB,GAAG,iBAAiB,CAAC,CAAC,CAAC,CAAC,CAAC;QAE1E,qBAAqB;QACrB,MAAM,YAAY,GAAG,MAAM,CAAC,aAAa,CAAC,IAAI,CAAC;QAC/C,IAAI,kBAAkB,GAAG,CAAC,CAAC;QAC3B,KAAK,MAAM,CAAC,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;YAC/C,kBAAkB,IAAI,OAAO,CAAC,MAAM,CAAC;QACvC,CAAC;QACD,MAAM,oBAAoB,GAAG,YAAY,GAAG,CAAC,CAAC,CAAC,CAAC,kBAAkB,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;QAEtF,OAAO;YACL,gBAAgB;YAChB,kBAAkB;YAClB,kBAAkB;YAClB,oBAAoB;YACpB,YAAY;SACb,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,QAAyB;QAC5C,IAAI,KAAK,GAAG,GAAG,CAAC;QAEhB,+BAA+B;QAC/B,IAAI,QAAQ,CAAC,gBAAgB,GAAG,IAAI,CAAC,MAAM,CAAC,mBAA
mB,EAAE,CAAC;YAChE,MAAM,OAAO,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,mBAAmB,GAAG,QAAQ,CAAC,gBAAgB,CAAC,GAAG,GAAG,CAAC;YACpF,KAAK,IAAI,OAAO,CAAC;QACnB,CAAC;QAED,oCAAoC;QACpC,IAAI,QAAQ,CAAC,kBAAkB,GAAG,IAAI,CAAC,MAAM,CAAC,qBAAqB,EAAE,CAAC;YACpE,MAAM,OAAO,GAAG,CAAC,QAAQ,CAAC,kBAAkB,GAAG,IAAI,CAAC,MAAM,CAAC,qBAAqB,CAAC,GAAG,GAAG,CAAC;YACxF,KAAK,IAAI,OAAO,CAAC;QACnB,CAAC;QAED,0BAA0B;QAC1B,IAAI,QAAQ,CAAC,kBAAkB,GAAG,IAAI,CAAC,MAAM,CAAC,qBAAqB,EAAE,CAAC;YACpE,MAAM,OAAO,GAAG,CAAC,QAAQ,CAAC,kBAAkB,GAAG,IAAI,CAAC,MAAM,CAAC,qBAAqB,CAAC,GAAG,GAAG,CAAC;YACxF,KAAK,IAAI,OAAO,CAAC;QACnB,CAAC;QAED,uCAAuC;QACvC,IAAI,QAAQ,CAAC,oBAAoB,IAAI,CAAC,IAAI,QAAQ,CAAC,oBAAoB,IAAI,CAAC,EAAE,CAAC;YAC7E,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;QAED,wCAAwC;QACxC,IAAI,QAAQ,CAAC,mBAAmB,KAAK,SAAS,EAAE,CAAC;YAC/C,IAAI,QAAQ,CAAC,mBAAmB,GAAG,IAAI,CAAC,MAAM,CAAC,sBAAsB,EAAE,CAAC;gBACtE,KAAK,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,sBAAsB,GAAG,QAAQ,CAAC,mBAAmB,CAAC,GAAG,GAAG,CAAC;YACrF,CAAC;QACH,CAAC;QAED,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC;IACzC,CAAC;IAED;;OAEG;IACK,6BAA6B,CAAC,QAAyB;QAC7D,IAAI,IAAI,GAAG,CAAC,CAAC;QAEb,6CAA6C;QAC7C,IAAI,QAAQ,CAAC,gBAAgB,GAAG,GAAG,EAAE,CAAC;YACpC,IAAI,IAAI,GAAG,CAAC;QACd,CAAC;aAAM,IAAI,QAAQ,CAAC,gBAAgB,GAAG,GAAG,EAAE,CAAC;YAC3C,IAAI,IAAI,IAAI,CAAC;QACf,CAAC;QAED,0CAA0C;QAC1C,IAAI,QAAQ,CAAC,kBAAkB,GAAG,GAAG,EAAE,CAAC;YACtC,IAAI,IAAI,GAAG,CAAC;QACd,CAAC;QAED,2CAA2C;QAC3C,IAAI,QAAQ,CAAC,kBAAkB,GAAG,GAAG,EAAE,CAAC;YACtC,IAAI,IAAI,IAAI,CAAC;QACf,CAAC;aAAM,IAAI,QAAQ,CAAC,kBAAkB,GAAG,GAAG,EAAE,CAAC;YAC7C,IAAI,IAAI,GAAG,CAAC;QACd,CAAC;QAED,gCAAgC;QAChC,IAAI,QAAQ,CAAC,oBAAoB,GAAG,GAAG,EAAE,CAAC;YACxC,IAAI,IAAI,IAAI,CAAC;QACf,CAAC;QAED,8CAA8C;QAC9C,IAAI,QAAQ,CAAC,mBAAmB,KAAK,SAAS,IAAI,QAAQ,CAAC,mBAAmB,GAAG,GAAG,EAAE,CAAC;YACrF,IAAI,IAAI,GAAG,GAAG,CAAC,GAAG,GAAG,QAAQ,CAAC,mBAAmB,CAAC,CAAC;QACrD,CAAC;QAED,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;IACxC,CAAC;CACF;AAED;;;;;GAKG;AACH,MAAM,OAAO,yBAAyB;IAC5B,QAAQ,CAAoB;IAC5B,kBAAk
B,CAA4B;IAC9C,MAAM,CAA0B;IAExC,YAAY,QAA2B,EAAE,MAAsB;QAC7D,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,kBAAkB,GAAG,IAAI,yBAAyB,CAAC,MAAM,CAAC,CAAC;QAChE,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,sBAAsB,EAAE,GAAG,MAAM,EAAE,CAAC;IACzD,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,MAAyB;QACrC,sBAAsB;QACtB,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,kBAAkB,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;QAErE,+BAA+B;QAC/B,MAAM,iBAAiB,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC;QAC3E,MAAM,mBAAmB,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,gBAAgB,CAAC,CAAC;QAC/E,MAAM,UAAU,GAAG,IAAI,CAAC,gBAAgB,CAAC,iBAAiB,EAAE,mBAAmB,CAAC,CAAC;QAEjF,4CAA4C;QAC5C,MAAM,QAAQ,GAAoB;YAChC,GAAG,cAAc,CAAC,QAAQ;YAC1B,mBAAmB,EAAE,UAAU;SAChC,CAAC;QAEF,kCAAkC;QAClC,IAAI,KAAK,GAAG,cAAc,CAAC,KAAK,CAAC;QACjC,IAAI,sBAAsB,GAAG,cAAc,CAAC,sBAAsB,CAAC;QAEnE,IAAI,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,sBAAsB,EAAE,CAAC;YACpD,MAAM,OAAO,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,sBAAsB,GAAG,UAAU,CAAC,GAAG,GAAG,CAAC;YACxE,KAAK,IAAI,OAAO,CAAC;YACjB,sBAAsB,IAAI,GAAG,CAAC;QAChC,CAAC;aAAM,CAAC;YACN,mCAAmC;YACnC,KAAK,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,sBAAsB,CAAC,GAAG,GAAG,CAAC;QACnE,CAAC;QAED,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC;QACxC,sBAAsB,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,sBAAsB,CAAC,CAAC,CAAC;QAE1E,MAAM,UAAU,GAAG,KAAK,IAAI,GAAG,IAAI,sBAAsB,GAAG,GAAG,CAAC;QAEhE,IAAI,cAAmD,CAAC;QACxD,IAAI,UAAU,EAAE,CAAC;YACf,cAAc,GAAG,QAAQ,CAAC;QAC5B,CAAC;aAAM,IAAI,sBAAsB,GAAG,GAAG,EAAE,CAAC;YACxC,cAAc,GAAG,oBAAoB,CAAC;QACxC,CAAC;aAAM,CAAC;YACN,cAAc,GAAG,kBAAkB,CAAC;QACtC,CAAC;QAED,OAAO;YACL,KAAK;YACL,UAAU;YACV,sBAAsB;YACtB,QAAQ;YACR,cAAc;SACf,CAAC;IACJ,CAAC;IAEO,gBAAgB,CAAC,CAAe,EAAE,CAAe;QACvD,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM,EAAE,CAAC;YAC1B,MAAM,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC;QACnD,CAAC;QAED,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAClC,UAAU,IAAI,CAAC,CAAC
,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAC1B,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YACrB,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACvB,CAAC;QAED,IAAI,KAAK,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,EAAE,CAAC;YAC/B,OAAO,CAAC,CAAC;QACX,CAAC;QAED,OAAO,UAAU,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;IAC5D,CAAC;CACF;AAED;;;;;;GAMG;AACH,MAAM,UAAU,sBAAsB,CACpC,QAA4B,EAC5B,MAAsB;IAEtB,IAAI,QAAQ,EAAE,CAAC;QACb,OAAO,IAAI,yBAAyB,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IACzD,CAAC;IACD,OAAO,IAAI,yBAAyB,CAAC,MAAM,CAAC,CAAC;AAC/C,CAAC"}
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Adaptive region detection for context-aware compression.
|
|
3
|
+
*
|
|
4
|
+
* Detects semantic regions in token sequences (system prompts, user input,
|
|
5
|
+
* injected context) and applies different compression strategies.
|
|
6
|
+
*
|
|
7
|
+
* Port of `small/adaptive.py` region detection to TypeScript.
|
|
8
|
+
*/
|
|
9
|
+
/**
 * Kinds of semantic region a token sequence can be divided into.
 * Each kind is paired elsewhere in this module with its own retention
 * target and compression settings.
 */
export var RegionType;
(function (kinds) {
    /** System instructions - minimal compression */
    kinds.SYSTEM = "system";
    /** User input - moderate compression */
    kinds.USER = "user";
    /** Injected context - aggressive compression */
    kinds.CONTEXT = "context";
    /** Code blocks - moderate compression */
    kinds.CODE = "code";
    /** Unknown region - default compression */
    kinds.UNKNOWN = "unknown";
})(RegionType || (RegionType = {}));
|
|
25
|
+
// Default retention target per region type. detectRegions() merges caller
// overrides on top of this table; filterPatternsByRegion() treats regions
// whose retention is 0.9 or above as protected.
const DEFAULT_RETENTION_TARGETS = {
    [RegionType.SYSTEM]: 0.98, // the only default at or above the 0.9 cut-off
    [RegionType.USER]: 0.85,
    [RegionType.CONTEXT]: 0.6,
    [RegionType.CODE]: 0.8,
    [RegionType.UNKNOWN]: 0.75,
};
|
|
32
|
+
/**
 * Detect regions in a token sequence.
 *
 * Every occurrence of a marker pattern starts a new region of the marker's
 * type; the region runs up to the next marker (or the end of the sequence).
 * Tokens before the first marker, or the whole sequence when no marker is
 * found, are reported as an UNKNOWN region.
 *
 * Zero-length regions are never emitted. When several markers start at the
 * same position, only the last one in sort order produces a region (the sort
 * keeps insertion order for ties: system, user, context, code).
 *
 * @param tokens - Token sequence to analyze (array or other iterable)
 * @param config - Optional overrides: `systemMarkers`, `userMarkers`,
 *   `contextMarkers`, `codeMarkers` (lists of token-ID patterns) and
 *   `retentionTargets` (per-region-type retention values)
 * @returns Array of detected regions ordered by start position; empty when
 *   the input is empty
 */
export function detectRegions(tokens, config) {
    const tokenArray = Array.isArray(tokens) ? tokens : Array.from(tokens);
    const n = tokenArray.length;
    if (n === 0) {
        return [];
    }
    const retentionTargets = {
        ...DEFAULT_RETENTION_TARGETS,
        ...config?.retentionTargets,
    };
    // Marker pattern lists paired with the region type they introduce.
    // The order here fixes the tie-break order for markers at the same position.
    const markerGroups = [
        [config?.systemMarkers ?? DEFAULT_SYSTEM_MARKERS, RegionType.SYSTEM],
        [config?.userMarkers ?? DEFAULT_USER_MARKERS, RegionType.USER],
        [config?.contextMarkers ?? DEFAULT_CONTEXT_MARKERS, RegionType.CONTEXT],
        [config?.codeMarkers ?? DEFAULT_CODE_MARKERS, RegionType.CODE],
    ];
    // Find all marker positions
    const markers = [];
    for (const [patterns, type] of markerGroups) {
        for (const pattern of patterns) {
            for (const pos of findPattern(tokenArray, pattern)) {
                markers.push({ pos, type });
            }
        }
    }
    // Sort markers by position
    markers.sort((a, b) => a.pos - b.pos);
    if (markers.length === 0) {
        // No markers found - treat entire sequence as unknown
        return [
            {
                type: RegionType.UNKNOWN,
                start: 0,
                end: n,
                retention: retentionTargets[RegionType.UNKNOWN],
            },
        ];
    }
    const regions = [];
    // Add initial region if first marker is not at start
    if (markers[0].pos > 0) {
        regions.push({
            type: RegionType.UNKNOWN,
            start: 0,
            end: markers[0].pos,
            retention: retentionTargets[RegionType.UNKNOWN],
        });
    }
    // Add regions from markers
    for (let i = 0; i < markers.length; i++) {
        const { pos, type } = markers[i];
        const nextPos = i < markers.length - 1 ? markers[i + 1].pos : n;
        if (nextPos <= pos) {
            // Another marker starts at the same position (or this marker sits
            // at the very end): the region would contain no tokens, so skip it.
            continue;
        }
        regions.push({
            type,
            start: pos,
            end: nextPos,
            retention: retentionTargets[type],
        });
    }
    return regions;
}
|
|
116
|
+
/**
 * Heuristic-based region detection without explicit markers.
 *
 * Simple positional guess: the first 10% of the tokens (at least one token)
 * is labelled SYSTEM and the remainder CONTEXT, each with its default
 * retention target. A CONTEXT region is only emitted when it contains at
 * least one token, so a single-token input yields just the SYSTEM region.
 *
 * @param tokens - Token sequence to analyze (array or other iterable)
 * @returns One or two regions covering the whole sequence; empty for empty input
 */
export function detectRegionsHeuristic(tokens) {
    const tokenArray = Array.isArray(tokens) ? tokens : Array.from(tokens);
    const n = tokenArray.length;
    if (n === 0) {
        return [];
    }
    // First 10% is likely system, rest is context; never let the system
    // region be empty.
    const boundary = Math.max(Math.floor(n * 0.1), 1);
    const regions = [
        {
            type: RegionType.SYSTEM,
            start: 0,
            end: boundary,
            retention: DEFAULT_RETENTION_TARGETS[RegionType.SYSTEM],
        },
    ];
    if (boundary < n) {
        regions.push({
            type: RegionType.CONTEXT,
            start: boundary,
            end: n,
            retention: DEFAULT_RETENTION_TARGETS[RegionType.CONTEXT],
        });
    }
    return regions;
}
|
|
144
|
+
/**
 * Filter patterns based on region retention targets.
 *
 * Occurrences that start inside a region whose retention is 0.9 or higher
 * are dropped; occurrences outside every region are kept. A pattern survives
 * only if at least two occurrences remain, and its `count` is updated to the
 * number of remaining positions.
 *
 * NOTE(review): only the start index of each occurrence is checked; an
 * occurrence that begins in a low-retention region and extends into a
 * high-retention one is kept. Confirm whether that is intended.
 *
 * @param patterns - Discovered patterns
 * @param regions - Detected regions
 * @param _tokens - Original token sequence (not used by this implementation)
 * @returns Filtered patterns respecting region constraints; the input array
 *   itself when `regions` is empty
 */
export function filterPatternsByRegion(patterns, regions, _tokens) {
    if (regions.length === 0) {
        return patterns;
    }
    const survivors = [];
    for (const pattern of patterns) {
        const positions = pattern.positions.filter((pos) => {
            const region = findRegionAtPosition(regions, pos);
            // Uncovered positions are always kept; covered ones only when the
            // region's retention is below the protected cut-off.
            return !region || region.retention < 0.9;
        });
        if (positions.length >= 2) {
            survivors.push({ ...pattern, positions, count: positions.length });
        }
    }
    return survivors;
}
|
|
175
|
+
/**
 * Look up the first region whose half-open range [start, end) contains `pos`.
 *
 * @returns The matching region, or null when no region covers the position
 */
function findRegionAtPosition(regions, pos) {
    const hit = regions.find((candidate) => candidate.start <= pos && pos < candidate.end);
    return hit ?? null;
}
|
|
186
|
+
/**
 * Find all occurrences of a pattern in tokens.
 *
 * Elements are compared with strict equality. Overlapping occurrences are
 * all reported. An empty pattern matches nothing (previously it "matched" at
 * every index from 0 to tokens.length inclusive).
 *
 * @param tokens - Indexable token sequence to search
 * @param pattern - Contiguous token sequence to look for
 * @returns Start indices of every occurrence, in ascending order
 */
function findPattern(tokens, pattern) {
    const positions = [];
    const m = pattern.length;
    if (m === 0) {
        return positions;
    }
    const lastStart = tokens.length - m;
    for (let i = 0; i <= lastStart; i++) {
        let j = 0;
        while (j < m && tokens[i + j] === pattern[j]) {
            j++;
        }
        // j only reaches m when every element of the pattern matched.
        if (j === m) {
            positions.push(i);
        }
    }
    return positions;
}
|
|
207
|
+
// Default marker patterns, written as token-ID sequences. They are meant to
// be tiktoken cl100k_base IDs for common textual markers, but are only
// approximate: the real IDs depend on the tokenizer in use.
// NOTE(review): the IDs below have not been checked against a tokenizer here.

/** Default system region markers: [SYSTEM], <<SYS>>, etc. */
const DEFAULT_SYSTEM_MARKERS = [
    // [SYSTEM]
    [58, 71905, 60],
    // <<SYS>>
    [27, 27, 71905, 2083, 2083],
];

/** Default user region markers: [USER], [INST], etc. */
const DEFAULT_USER_MARKERS = [
    // [USER]
    [58, 35295, 60],
    // [INST]
    [58, 96746, 60],
];

/** Default context region markers: [CONTEXT], [DOC], etc. */
const DEFAULT_CONTEXT_MARKERS = [
    // [CONTEXT]
    [58, 94034, 60],
    // [DOC]
    [58, 44184, 60],
];

/** Default code region markers: ```python, ```typescript, etc. */
const DEFAULT_CODE_MARKERS = [
    // ```python
    [74694, 12958],
    // ```typescript
    [74694, 92459],
    // ```javascript
    [74694, 13210],
];
|
|
234
|
+
/**
 * Get compression settings for a region type.
 *
 * @param regionType - One of the RegionType values
 * @returns A fresh `{ maxSubsequenceLength, minOccurrences }` object; any
 *   value other than SYSTEM, USER, CONTEXT or CODE gets the default settings
 */
export function getRegionCompressionSettings(regionType) {
    if (regionType === RegionType.SYSTEM) {
        return { maxSubsequenceLength: 4, minOccurrences: 5 };
    }
    if (regionType === RegionType.USER) {
        return { maxSubsequenceLength: 6, minOccurrences: 3 };
    }
    if (regionType === RegionType.CONTEXT) {
        return { maxSubsequenceLength: 10, minOccurrences: 2 };
    }
    if (regionType === RegionType.CODE) {
        return { maxSubsequenceLength: 6, minOccurrences: 3 };
    }
    // UNKNOWN and anything unrecognized
    return { maxSubsequenceLength: 8, minOccurrences: 3 };
}
|
|
251
|
+
//# sourceMappingURL=regions.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"regions.js","sourceRoot":"","sources":["../../src/regions.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH;;GAEG;AACH,MAAM,CAAN,IAAY,UAWX;AAXD,WAAY,UAAU;IACpB,gDAAgD;IAChD,+BAAiB,CAAA;IACjB,wCAAwC;IACxC,2BAAa,CAAA;IACb,gDAAgD;IAChD,iCAAmB,CAAA;IACnB,yCAAyC;IACzC,2BAAa,CAAA;IACb,2CAA2C;IAC3C,iCAAmB,CAAA;AACrB,CAAC,EAXW,UAAU,KAAV,UAAU,QAWrB;AAgCD,MAAM,yBAAyB,GAA+B;IAC5D,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,IAAI;IACzB,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI;IACvB,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,GAAG;IACzB,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,GAAG;IACtB,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,IAAI;CAC3B,CAAC;AAEF;;;;;;;;;GASG;AACH,MAAM,UAAU,aAAa,CAAC,MAAgB,EAAE,MAAqB;IACnE,MAAM,UAAU,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACvE,MAAM,CAAC,GAAG,UAAU,CAAC,MAAM,CAAC;IAE5B,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACZ,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,gBAAgB,GAAG;QACvB,GAAG,yBAAyB;QAC5B,GAAG,MAAM,EAAE,gBAAgB;KAC5B,CAAC;IAEF,4BAA4B;IAC5B,MAAM,OAAO,GAAwC,EAAE,CAAC;IAExD,MAAM,aAAa,GAAG,MAAM,EAAE,aAAa,IAAI,sBAAsB,CAAC;IACtE,MAAM,WAAW,GAAG,MAAM,EAAE,WAAW,IAAI,oBAAoB,CAAC;IAChE,MAAM,cAAc,GAAG,MAAM,EAAE,cAAc,IAAI,uBAAuB,CAAC;IACzE,MAAM,WAAW,GAAG,MAAM,EAAE,WAAW,IAAI,oBAAoB,CAAC;IAEhE,qBAAqB;IACrB,KAAK,MAAM,OAAO,IAAI,aAAa,EAAE,CAAC;QACpC,KAAK,MAAM,GAAG,IAAI,WAAW,CAAC,UAAU,EAAE,OAAO,CAAC,EAAE,CAAC;YACnD,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC;QACjD,CAAC;IACH,CAAC;IAED,KAAK,MAAM,OAAO,IAAI,WAAW,EAAE,CAAC;QAClC,KAAK,MAAM,GAAG,IAAI,WAAW,CAAC,UAAU,EAAE,OAAO,CAAC,EAAE,CAAC;YACnD,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU,CAAC,IAAI,EAAE,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;IAED,KAAK,MAAM,OAAO,IAAI,cAAc,EAAE,CAAC;QACrC,KAAK,MAAM,GAAG,IAAI,WAAW,CAAC,UAAU,EAAE,OAAO,CAAC,EAAE,CAAC;YACnD,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,UAAU,CAAC,OAAO,EAAE,CAAC,CAAC;QAClD,CAAC;IACH,CAAC;IAED,KAAK,MAAM,OAAO,IAAI,WAAW,EAAE,CAAC;QAClC,KAAK,MAAM,GAAG,IAAI,WAAW,CAAC,UAAU,EAAE,OAAO,CAAC,EAAE,CAAC;YACnD,OAAO,CAAC,IAAI,CAAC,EAAE,
GAAG,EAAE,IAAI,EAAE,UAAU,CAAC,IAAI,EAAE,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;IAED,2BAA2B;IAC3B,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;IAEtC,6BAA6B;IAC7B,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,sDAAsD;QACtD,OAAO,CAAC,IAAI,CAAC;YACX,IAAI,EAAE,UAAU,CAAC,OAAO;YACxB,KAAK,EAAE,CAAC;YACR,GAAG,EAAE,CAAC;YACN,SAAS,EAAE,gBAAgB,CAAC,UAAU,CAAC,OAAO,CAAC;SAChD,CAAC,CAAC;IACL,CAAC;SAAM,CAAC;QACN,qDAAqD;QACrD,IAAI,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC;YACvB,OAAO,CAAC,IAAI,CAAC;gBACX,IAAI,EAAE,UAAU,CAAC,OAAO;gBACxB,KAAK,EAAE,CAAC;gBACR,GAAG,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG;gBACnB,SAAS,EAAE,gBAAgB,CAAC,UAAU,CAAC,OAAO,CAAC;aAChD,CAAC,CAAC;QACL,CAAC;QAED,2BAA2B;QAC3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;YAC1B,MAAM,OAAO,GAAG,CAAC,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAEhE,OAAO,CAAC,IAAI,CAAC;gBACX,IAAI,EAAE,MAAM,CAAC,IAAI;gBACjB,KAAK,EAAE,MAAM,CAAC,GAAG;gBACjB,GAAG,EAAE,OAAO;gBACZ,SAAS,EAAE,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC;aACzC,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,sBAAsB,CAAC,MAAgB;IACrD,MAAM,UAAU,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACvE,MAAM,CAAC,GAAG,UAAU,CAAC,MAAM,CAAC;IAE5B,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACZ,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,gEAAgE;IAChE,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;IAEtC,OAAO;QACL;YACE,IAAI,EAAE,UAAU,CAAC,MAAM;YACvB,KAAK,EAAE,CAAC;YACR,GAAG,EAAE,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAC;YAC3B,SAAS,EAAE,yBAAyB,CAAC,UAAU,CAAC,MAAM,CAAC;SACxD;QACD;YACE,IAAI,EAAE,UAAU,CAAC,OAAO;YACxB,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAC;YAC7B,GAAG,EAAE,CAAC;YACN,SAAS,EAAE,yBAAyB,CAAC,UAAU,CAAC,OAAO,CAAC;SACzD;KACF,CAAC;AACJ,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,UAAU,sBA
AsB,CACpC,QAA6B,EAC7B,OAAiB,EACjB,OAAiB;IAEjB,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE;QAC9B,6CAA6C;QAC7C,MAAM,iBAAiB,GAAG,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE;YACzD,MAAM,MAAM,GAAG,oBAAoB,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAClD,IAAI,CAAC,MAAM;gBAAE,OAAO,IAAI,CAAC;YAEzB,6DAA6D;YAC7D,kDAAkD;YAClD,OAAO,MAAM,CAAC,SAAS,GAAG,GAAG,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,OAAO;YACL,GAAG,OAAO;YACV,SAAS,EAAE,iBAAiB;YAC5B,KAAK,EAAE,iBAAiB,CAAC,MAAM;SAChC,CAAC;IACJ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,SAAS,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC;AACxD,CAAC;AAED;;GAEG;AACH,SAAS,oBAAoB,CAAC,OAAiB,EAAE,GAAW;IAC1D,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,IAAI,GAAG,IAAI,MAAM,CAAC,KAAK,IAAI,GAAG,GAAG,MAAM,CAAC,GAAG,EAAE,CAAC;YAC5C,OAAO,MAAM,CAAC;QAChB,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,MAAgB,EAAE,OAAiB;IACtD,MAAM,SAAS,GAAa,EAAE,CAAC;IAC/B,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;IACxB,MAAM,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;IAEzB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAChC,IAAI,KAAK,GAAG,IAAI,CAAC;QACjB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3B,IAAI,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC;gBACjC,KAAK,GAAG,KAAK,CAAC;gBACd,MAAM;YACR,CAAC;QACH,CAAC;QACD,IAAI,KAAK,EAAE,CAAC;YACV,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,8EAA8E;AAC9E,mEAAmE;AAEnE,oCAAoC;AACpC,MAAM,sBAAsB,GAAe;IACzC,0BAA0B;IAC1B,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE,CAAC,EAAM,WAAW;IAChC,CAAC,EAAE,EAAE,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,CAAC,EAAE,UAAU;CACxC,CAAC;AAEF,kCAAkC;AAClC,MAAM,oBAAoB,GAAe;IACvC,uBAAuB;IACvB,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE,CAAC,EAAM,SAAS;IAC9B,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE,CAAC,EAAM,SAAS;CAC/B,CAAC;AAEF,qCAAqC;AACrC,MAAM,uBAAuB,GAAe;IAC1C,yBAAyB;IACzB,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE,CAAC,EAAM,YAAY;IACjC,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE,CAAC,EAAM,
QAAQ;CAC9B,CAAC;AAEF,kCAAkC;AAClC,MAAM,oBAAoB,GAAe;IACvC,iCAAiC;IACjC,CAAC,KAAK,EAAE,KAAK,CAAC,EAAO,YAAY;IACjC,CAAC,KAAK,EAAE,KAAK,CAAC,EAAO,gBAAgB;IACrC,CAAC,KAAK,EAAE,KAAK,CAAC,EAAO,gBAAgB;CACtC,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,4BAA4B,CAAC,UAAsB;IAIjE,QAAQ,UAAU,EAAE,CAAC;QACnB,KAAK,UAAU,CAAC,MAAM;YACpB,OAAO,EAAE,oBAAoB,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,EAAE,CAAC;QACxD,KAAK,UAAU,CAAC,IAAI;YAClB,OAAO,EAAE,oBAAoB,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,EAAE,CAAC;QACxD,KAAK,UAAU,CAAC,OAAO;YACrB,OAAO,EAAE,oBAAoB,EAAE,EAAE,EAAE,cAAc,EAAE,CAAC,EAAE,CAAC;QACzD,KAAK,UAAU,CAAC,IAAI;YAClB,OAAO,EAAE,oBAAoB,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,EAAE,CAAC;QACxD;YACE,OAAO,EAAE,oBAAoB,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,EAAE,CAAC;IAC1D,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pattern importance scoring for ML-aware compression.
|
|
3
|
+
*
|
|
4
|
+
* Port of `small/pattern_importance.py` to TypeScript.
|
|
5
|
+
*/
|
|
6
|
+
import type { DiscoveredPattern, TokenSeq } from '@small-ltsc/sdk';
|
|
7
|
+
/**
 * Contract for anything that can turn a token sequence into an embedding.
 */
export interface EmbeddingProvider {
    /**
     * Embed a sequence of tokens.
     *
     * @param tokens - Token sequence to embed
     * @returns Promise resolving to the embedding vector (Float32Array)
     */
    embed(tokens: TokenSeq): Promise<Float32Array>;
    /**
     * Report the embedding dimension.
     */
    dimension(): number;
}
|
|
23
|
+
/**
 * Options accepted by the importance scorers. Every field is optional.
 */
export interface ImportanceConfig {
    /**
     * Decay rate used for positional scoring.
     * The higher the value, the more weight early positions receive.
     * @default 2.0
     */
    decayRate?: number;
    /**
     * Size of the context window used for embedding-based scoring.
     * @default 5
     */
    contextWindow?: number;
    /**
     * Weight given to positional importance (as opposed to embedding-based
     * importance).
     * @default 0.3
     */
    positionalWeight?: number;
}
|
|
44
|
+
/**
 * Common interface implemented by every pattern importance scorer.
 */
export interface ImportanceScorer {
    /**
     * Score patterns by importance.
     *
     * A higher score marks a more important pattern, i.e. one that should
     * be preserved (compressed less aggressively).
     *
     * @param tokens - Original token sequence
     * @param patterns - Discovered patterns to score
     * @returns Promise resolving to importance scores (0-1 range)
     */
    scorePatterns(tokens: TokenSeq, patterns: DiscoveredPattern[]): Promise<number[]>;
}
|
|
60
|
+
/**
 * Importance scorer driven by position.
 *
 * Patterns are scored according to where they occur in the sequence:
 * earlier positions receive higher importance, which is useful for system
 * prompts that typically appear at the start.
 */
export declare class PositionalImportanceScorer implements ImportanceScorer {
    private decayRate;
    constructor(config?: ImportanceConfig);
    scorePatterns(tokens: TokenSeq, patterns: DiscoveredPattern[]): Promise<number[]>;
}
|
|
72
|
+
/**
 * Importance scorer driven by embeddings.
 *
 * An embedding model is used to tell whether a pattern shows up in diverse
 * semantic contexts (important, should be preserved) or in similar
 * contexts (redundant, safe to compress).
 */
export declare class EmbeddingImportanceScorer implements ImportanceScorer {
    private provider;
    private contextWindow;
    constructor(provider: EmbeddingProvider, config?: ImportanceConfig);
    scorePatterns(tokens: TokenSeq, patterns: DiscoveredPattern[]): Promise<number[]>;
}
|
|
85
|
+
/**
 * Importance scorer that combines positional and embedding-based scoring.
 * The embedding provider is optional in the constructor.
 */
export declare class CombinedImportanceScorer implements ImportanceScorer {
    private positionalScorer;
    private embeddingScorer;
    private positionalWeight;
    constructor(provider?: EmbeddingProvider, config?: ImportanceConfig);
    scorePatterns(tokens: TokenSeq, patterns: DiscoveredPattern[]): Promise<number[]>;
}
|
|
95
|
+
/**
 * Adjust pattern priorities based on importance scores.
 *
 * High-importance patterns get a lower priority (compressed later), while
 * low-importance patterns get a higher priority (compressed first).
 *
 * @param patterns - Patterns to adjust
 * @param scores - Importance scores from an ImportanceScorer
 * @param threshold - Patterns above this importance threshold get negative priority
 * @returns Patterns with adjusted priorities
 */
export declare function adjustPrioritiesByImportance(
    patterns: DiscoveredPattern[],
    scores: number[],
    threshold?: number
): DiscoveredPattern[];
|
|
107
|
+
/**
 * Keep only the patterns whose importance is below a threshold.
 *
 * @param patterns - Patterns to filter
 * @param scores - Importance scores
 * @param threshold - Maximum importance score to include
 * @returns Filtered patterns that are safe to compress
 */
export declare function filterByImportance(
    patterns: DiscoveredPattern[],
    scores: number[],
    threshold?: number
): DiscoveredPattern[];
|
|
116
|
+
//# sourceMappingURL=importance.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"importance.d.ts","sourceRoot":"","sources":["../../src/importance.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,iBAAiB,CAAC;AAEnE;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC;;;;;OAKG;IACH,KAAK,CAAC,MAAM,EAAE,QAAQ,GAAG,OAAO,CAAC,YAAY,CAAC,CAAC;IAE/C;;OAEG;IACH,SAAS,IAAI,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;OAIG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB;;;OAGG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB;;;OAGG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAQD;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;;;;;OASG;IACH,aAAa,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,iBAAiB,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;CACnF;AAED;;;;;;GAMG;AACH,qBAAa,0BAA2B,YAAW,gBAAgB;IACjE,OAAO,CAAC,SAAS,CAAS;gBAEd,MAAM,CAAC,EAAE,gBAAgB;IAK/B,aAAa,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,iBAAiB,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;CAuBxF;AAED;;;;;;GAMG;AACH,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,CAAC,QAAQ,CAAoB;IACpC,OAAO,CAAC,aAAa,CAAS;gBAElB,QAAQ,EAAE,iBAAiB,EAAE,MAAM,CAAC,EAAE,gBAAgB;IAM5D,aAAa,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,iBAAiB,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;CAkDxF;AAED;;GAEG;AACH,qBAAa,wBAAyB,YAAW,gBAAgB;IAC/D,OAAO,CAAC,gBAAgB,CAA6B;IACrD,OAAO,CAAC,eAAe,CAAmC;IAC1D,OAAO,CAAC,gBAAgB,CAAS;gBAErB,QAAQ,CAAC,EAAE,iBAAiB,EAAE,MAAM,CAAC,EAAE,gBAAgB;IAO7D,aAAa,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,iBAAiB,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;CAexF;AA2BD;;;;;;;;;;GAUG;AACH,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,iBAAiB,EAAE,EAC7B,MAAM,EAAE,MAAM,EAAE,EAChB,SAAS,SAAM,GACd,iBAAiB,EAAE,CAuBrB;AAED;;;;;;;GAOG;AACH,wBAAgB,kBAAkB,CAChC,QAAQ,EAAE,iBAAiB,EAAE,EAC7B,MAAM,EAAE,MAAM,EAAE,EAChB,SAAS,SAAM,GACd,iBAAiB,EAAE,CAErB"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Small LTSC ML - Machine Learning Features
|
|
3
|
+
*
|
|
4
|
+
* Optional ML features for enhanced compression quality:
|
|
5
|
+
* - Pattern importance scoring
|
|
6
|
+
* - Quality prediction
|
|
7
|
+
* - Adaptive region detection
|
|
8
|
+
*
|
|
9
|
+
* @packageDocumentation
|
|
10
|
+
*/
|
|
11
|
+
export { type EmbeddingProvider, type ImportanceConfig, type ImportanceScorer, PositionalImportanceScorer, EmbeddingImportanceScorer, CombinedImportanceScorer, adjustPrioritiesByImportance, filterByImportance, } from './importance.js';
|
|
12
|
+
export { type QualityConfig, type QualityPrediction, type QualityFeatures, type QualityPredictor, HeuristicQualityPredictor, EmbeddingQualityPredictor, createQualityPredictor, } from './quality.js';
|
|
13
|
+
export { RegionType, type Region, type RegionConfig, detectRegions, detectRegionsHeuristic, filterPatternsByRegion, getRegionCompressionSettings, } from './regions.js';
|
|
14
|
+
/** Version string exported by this package. */
export declare const VERSION = "0.1.0";
|
|
15
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAGH,OAAO,EACL,KAAK,iBAAiB,EACtB,KAAK,gBAAgB,EACrB,KAAK,gBAAgB,EACrB,0BAA0B,EAC1B,yBAAyB,EACzB,wBAAwB,EACxB,4BAA4B,EAC5B,kBAAkB,GACnB,MAAM,iBAAiB,CAAC;AAGzB,OAAO,EACL,KAAK,aAAa,EAClB,KAAK,iBAAiB,EACtB,KAAK,eAAe,EACpB,KAAK,gBAAgB,EACrB,yBAAyB,EACzB,yBAAyB,EACzB,sBAAsB,GACvB,MAAM,cAAc,CAAC;AAGtB,OAAO,EACL,UAAU,EACV,KAAK,MAAM,EACX,KAAK,YAAY,EACjB,aAAa,EACb,sBAAsB,EACtB,sBAAsB,EACtB,4BAA4B,GAC7B,MAAM,cAAc,CAAC;AAGtB,eAAO,MAAM,OAAO,UAAU,CAAC"}
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Quality prediction for compression validation.
|
|
3
|
+
*
|
|
4
|
+
* Predicts whether a compressed sequence will maintain
|
|
5
|
+
* sufficient quality for transformer models to learn from.
|
|
6
|
+
*
|
|
7
|
+
* Port of `small/quality_predictor.py` to TypeScript.
|
|
8
|
+
*/
|
|
9
|
+
import type { CompressionResult } from '@small-ltsc/sdk';
|
|
10
|
+
import type { EmbeddingProvider } from './importance.js';
|
|
11
|
+
/**
 * Thresholds used by quality prediction. Every field is optional.
 */
export interface QualityConfig {
    /**
     * Maximum acceptable compression ratio.
     * NOTE(review): QualityFeatures.compressionRatio is documented as
     * "lower is more aggressive" - confirm whether this value acts as an
     * upper or a lower bound.
     * @default 0.5
     */
    maxCompressionRatio?: number;
    /**
     * Maximum acceptable dictionary overhead ratio.
     * @default 0.3
     */
    maxDictionaryOverhead?: number;
    /**
     * Minimum embedding similarity between original and compressed.
     * @default 0.7
     */
    minEmbeddingSimilarity?: number;
    /**
     * Maximum acceptable token diversity reduction.
     * @default 0.4
     */
    maxDiversityReduction?: number;
}
|
|
36
|
+
/**
 * Outcome of a quality prediction.
 */
export interface QualityPrediction {
    /**
     * Overall quality score (0-1, higher is better).
     */
    score: number;
    /**
     * Whether the compression passes the quality threshold.
     */
    acceptable: boolean;
    /**
     * Probability of quality degradation.
     */
    degradationProbability: number;
    /**
     * Detailed feature scores behind the prediction.
     */
    features: QualityFeatures;
    /**
     * Recommendation for how to proceed.
     */
    recommendation: 'accept' | 'retry_conservative' | 'skip_compression';
}
|
|
61
|
+
/**
|
|
62
|
+
* Feature scores used in quality prediction.
|
|
63
|
+
*/
|
|
64
|
+
export interface QualityFeatures {
|
|
65
|
+
/**
|
|
66
|
+
* Compression ratio feature (a lower value means more aggressive compression).
|
|
67
|
+
*/
|
|
68
|
+
compressionRatio: number;
|
|
69
|
+
/**
|
|
70
|
+
* Dictionary overhead ratio.
|
|
71
|
+
*/
|
|
72
|
+
dictionaryOverhead: number;
|
|
73
|
+
/**
|
|
74
|
+
* Reduction in token diversity (0-1; 0 = no change, 1 = complete loss).
|
|
75
|
+
*/
|
|
76
|
+
diversityReduction: number;
|
|
77
|
+
/**
|
|
78
|
+
* Average pattern length feature.
* NOTE(review): presumably measured in tokens - confirm.
|
|
79
|
+
*/
|
|
80
|
+
averagePatternLength: number;
|
|
81
|
+
/**
|
|
82
|
+
* Pattern count feature.
|
|
83
|
+
*/
|
|
84
|
+
patternCount: number;
|
|
85
|
+
/**
|
|
86
|
+
* Embedding similarity (if available); the field is optional and may be absent.
|
|
87
|
+
*/
|
|
88
|
+
embeddingSimilarity?: number;
|
|
89
|
+
}
|
|
90
|
+
/**
|
|
91
|
+
* Quality predictor interface.
|
|
92
|
+
*/
|
|
93
|
+
export interface QualityPredictor {
|
|
94
|
+
/**
|
|
95
|
+
* Predict the quality of a compression result.
|
|
96
|
+
*
|
|
97
|
+
* @param result - Compression result to evaluate
|
|
98
|
+
* @returns Promise resolving to the quality prediction
|
|
99
|
+
*/
|
|
100
|
+
predict(result: CompressionResult): Promise<QualityPrediction>;
|
|
101
|
+
}
|
|
102
|
+
/**
|
|
103
|
+
* Heuristic-based quality predictor.
|
|
104
|
+
*
|
|
105
|
+
* Uses a combination of handcrafted features and thresholds
|
|
106
|
+
* to predict compression quality.
|
|
107
|
+
*/
|
|
108
|
+
export declare class HeuristicQualityPredictor implements QualityPredictor {
|
|
109
|
+
/** Private member; its type is not exposed in this declaration. */
private config;
|
|
110
|
+
/** @param config - Optional quality configuration. */
constructor(config?: QualityConfig);
|
|
111
|
+
/** Predict the quality of a compression result, as required by QualityPredictor. */
predict(result: CompressionResult): Promise<QualityPrediction>;
|
|
112
|
+
/**
|
|
113
|
+
* Extract quality features from a compression result.
* Private; the signature is not exposed in this declaration.
|
|
114
|
+
*/
|
|
115
|
+
private extractFeatures;
|
|
116
|
+
/**
|
|
117
|
+
* Compute the overall quality score from features.
* Private; the signature is not exposed in this declaration.
|
|
118
|
+
*/
|
|
119
|
+
private computeScore;
|
|
120
|
+
/**
|
|
121
|
+
* Compute the probability of quality degradation.
* Private; the signature is not exposed in this declaration.
|
|
122
|
+
*/
|
|
123
|
+
private computeDegradationProbability;
|
|
124
|
+
}
|
|
125
|
+
/**
|
|
126
|
+
* Embedding-enhanced quality predictor.
|
|
127
|
+
*
|
|
128
|
+
* Adds embedding similarity comparison to the heuristic predictor
|
|
129
|
+
* for more accurate quality assessment.
|
|
130
|
+
*/
|
|
131
|
+
export declare class EmbeddingQualityPredictor implements QualityPredictor {
|
|
132
|
+
/** Private member; presumably holds the constructor's `provider` argument - confirm. */
private provider;
|
|
133
|
+
/** Private member; presumably a HeuristicQualityPredictor instance - confirm. */
private heuristicPredictor;
|
|
134
|
+
/** Private member; its type is not exposed in this declaration. */
private config;
|
|
135
|
+
/**
* @param provider - Embedding provider (required)
* @param config - Optional quality configuration
*/
constructor(provider: EmbeddingProvider, config?: QualityConfig);
|
|
136
|
+
/** Predict the quality of a compression result, as required by QualityPredictor. */
predict(result: CompressionResult): Promise<QualityPrediction>;
|
|
137
|
+
/** Private helper; the name suggests a cosine-similarity computation - signature not exposed here. */
private cosineSimilarity;
|
|
138
|
+
}
|
|
139
|
+
/**
|
|
140
|
+
* Create a quality predictor.
* NOTE(review): presumably yields an EmbeddingQualityPredictor when `provider` is given
* and a HeuristicQualityPredictor otherwise - confirm against the implementation.
|
|
141
|
+
*
|
|
142
|
+
* @param provider - Optional embedding provider for enhanced prediction
|
|
143
|
+
* @param config - Optional quality configuration
|
|
144
|
+
* @returns Quality predictor instance
|
|
145
|
+
*/
|
|
146
|
+
export declare function createQualityPredictor(provider?: EmbeddingProvider, config?: QualityConfig): QualityPredictor;
|
|
147
|
+
//# sourceMappingURL=quality.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"quality.d.ts","sourceRoot":"","sources":["../../src/quality.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AACzD,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AAEzD;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAE7B;;;OAGG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAE/B;;;OAGG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAEhC;;;OAGG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;CAChC;AASD;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,UAAU,EAAE,OAAO,CAAC;IAEpB;;OAEG;IACH,sBAAsB,EAAE,MAAM,CAAC;IAE/B;;OAEG;IACH,QAAQ,EAAE,eAAe,CAAC;IAE1B;;OAEG;IACH,cAAc,EAAE,QAAQ,GAAG,oBAAoB,GAAG,kBAAkB,CAAC;CACtE;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B;;OAEG;IACH,gBAAgB,EAAE,MAAM,CAAC;IAEzB;;OAEG;IACH,kBAAkB,EAAE,MAAM,CAAC;IAE3B;;OAEG;IACH,kBAAkB,EAAE,MAAM,CAAC;IAE3B;;OAEG;IACH,oBAAoB,EAAE,MAAM,CAAC;IAE7B;;OAEG;IACH,YAAY,EAAE,MAAM,CAAC;IAErB;;OAEG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;OAKG;IACH,OAAO,CAAC,MAAM,EAAE,iBAAiB,GAAG,OAAO,CAAC,iBAAiB,CAAC,CAAC;CAChE;AAED;;;;;GAKG;AACH,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,CAAC,MAAM,CAA0B;gBAE5B,MAAM,CAAC,EAAE,aAAa;IAI5B,OAAO,CAAC,MAAM,EAAE,iBAAiB,GAAG,OAAO,CAAC,iBAAiB,CAAC;IAwBpE;;OAEG;IACH,OAAO,CAAC,eAAe;IAiCvB;;OAEG;IACH,OAAO,CAAC,YAAY;IAoCpB;;OAEG;IACH,OAAO,CAAC,6BAA6B;CAkCtC;AAED;;;;;GAKG;AACH,qBAAa,yBAA0B,YAAW,gBAAgB;IAChE,OAAO,CAAC,QAAQ,CAAoB;IACpC,OAAO,CAAC,kBAAkB,CAA4B;IACtD,OAAO,CAAC,MAAM,CAA0B;gBAE5B,QAAQ,EAAE,iBAAiB,EAAE,MAAM,CAAC,EAAE,aAAa;IAMzD,OAAO,CAAC,MAAM,EAAE,iBAAiB,GAAG,OAAO,CAAC,iBAAiB,CAAC;IAmDpE,OAAO,CAAC,gBAAgB;CAqBzB;AAED;;;;;;GAMG;AACH,wBAAgB,sBAAsB,CACpC,QAAQ,CAAC,EAAE,iBAAiB,EAC5B,MAAM,CAAC,EAAE,aAAa,GACrB,gBAAgB,CAKlB"}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Adaptive region detection for context-aware compression.
|
|
3
|
+
*
|
|
4
|
+
* Detects semantic regions in token sequences (system prompts, user input,
|
|
5
|
+
* injected context) and applies different compression strategies.
|
|
6
|
+
*
|
|
7
|
+
* Port of `small/adaptive.py` region detection to TypeScript.
|
|
8
|
+
*/
|
|
9
|
+
import type { TokenSeq, DiscoveredPattern } from '@small-ltsc/sdk';
|
|
10
|
+
/**
|
|
11
|
+
* Region types with different compression strategies.
|
|
12
|
+
*/
|
|
13
|
+
export declare enum RegionType {
// String enum: each member's value is the lowercase form of its name.
|
|
14
|
+
/** System instructions - minimal compression */
|
|
15
|
+
SYSTEM = "system",
|
|
16
|
+
/** User input - moderate compression */
|
|
17
|
+
USER = "user",
|
|
18
|
+
/** Injected context - aggressive compression */
|
|
19
|
+
CONTEXT = "context",
|
|
20
|
+
/** Code blocks - moderate compression */
|
|
21
|
+
CODE = "code",
|
|
22
|
+
/** Unknown region - default compression */
|
|
23
|
+
UNKNOWN = "unknown"
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* A detected region in the token sequence.
|
|
27
|
+
*/
|
|
28
|
+
export interface Region {
// `start` and `end` together describe the half-open range [start, end).
|
|
29
|
+
/** Region type */
|
|
30
|
+
type: RegionType;
|
|
31
|
+
/** Start position (inclusive) */
|
|
32
|
+
start: number;
|
|
33
|
+
/** End position (exclusive) */
|
|
34
|
+
end: number;
|
|
35
|
+
/** Compression retention target (0-1) */
|
|
36
|
+
retention: number;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Configuration for region detection.
|
|
40
|
+
*/
|
|
41
|
+
export interface RegionConfig {
// Every field is optional. Each marker list is an array of patterns, and each pattern is
// an array of numbers (presumably token IDs - confirm).
|
|
42
|
+
/** Token patterns that mark system region start */
|
|
43
|
+
systemMarkers?: number[][];
|
|
44
|
+
/** Token patterns that mark user region start */
|
|
45
|
+
userMarkers?: number[][];
|
|
46
|
+
/** Token patterns that mark context region start */
|
|
47
|
+
contextMarkers?: number[][];
|
|
48
|
+
/** Token patterns that mark code region start */
|
|
49
|
+
codeMarkers?: number[][];
|
|
50
|
+
/** Retention targets keyed by region type; any subset of region types may be specified */
|
|
51
|
+
retentionTargets?: Partial<Record<RegionType, number>>;
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Detect regions in a token sequence.
|
|
55
|
+
*
|
|
56
|
+
* Uses marker patterns to identify boundaries between different
|
|
57
|
+
* semantic regions in the input.
|
|
58
|
+
*
|
|
59
|
+
* @param tokens - Token sequence to analyze
|
|
60
|
+
* @param config - Optional region detection configuration
|
|
61
|
+
* @returns Array of detected regions
|
|
62
|
+
*/
|
|
63
|
+
export declare function detectRegions(tokens: TokenSeq, config?: RegionConfig): Region[];
|
|
64
|
+
/**
|
|
65
|
+
* Heuristic-based region detection without explicit markers.
|
|
66
|
+
*
|
|
67
|
+
* Uses statistical features to guess region boundaries.
*
* @param tokens - Token sequence to analyze
* @returns Array of detected regions
|
|
68
|
+
*/
|
|
69
|
+
export declare function detectRegionsHeuristic(tokens: TokenSeq): Region[];
|
|
70
|
+
/**
|
|
71
|
+
* Filter patterns based on region retention targets.
|
|
72
|
+
*
|
|
73
|
+
* Removes patterns that would compress high-retention regions.
|
|
74
|
+
*
|
|
75
|
+
* @param patterns - Discovered patterns
|
|
76
|
+
* @param regions - Detected regions
|
|
77
|
+
* @param _tokens - Original token sequence (the leading underscore suggests it is currently unused - confirm)
|
|
78
|
+
* @returns Filtered patterns respecting region constraints
|
|
79
|
+
*/
|
|
80
|
+
export declare function filterPatternsByRegion(patterns: DiscoveredPattern[], regions: Region[], _tokens: TokenSeq): DiscoveredPattern[];
|
|
81
|
+
/**
|
|
82
|
+
* Get compression settings for a region type.
*
* @param regionType - Region type to look up settings for
* @returns Object carrying the `maxSubsequenceLength` and `minOccurrences` settings
|
|
83
|
+
*/
|
|
84
|
+
export declare function getRegionCompressionSettings(regionType: RegionType): {
|
|
85
|
+
maxSubsequenceLength: number;
|
|
86
|
+
minOccurrences: number;
|
|
87
|
+
};
|
|
88
|
+
//# sourceMappingURL=regions.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"regions.d.ts","sourceRoot":"","sources":["../../src/regions.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AAEnE;;GAEG;AACH,oBAAY,UAAU;IACpB,gDAAgD;IAChD,MAAM,WAAW;IACjB,wCAAwC;IACxC,IAAI,SAAS;IACb,gDAAgD;IAChD,OAAO,YAAY;IACnB,yCAAyC;IACzC,IAAI,SAAS;IACb,2CAA2C;IAC3C,OAAO,YAAY;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,MAAM;IACrB,kBAAkB;IAClB,IAAI,EAAE,UAAU,CAAC;IACjB,iCAAiC;IACjC,KAAK,EAAE,MAAM,CAAC;IACd,+BAA+B;IAC/B,GAAG,EAAE,MAAM,CAAC;IACZ,yCAAyC;IACzC,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,mDAAmD;IACnD,aAAa,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC;IAC3B,iDAAiD;IACjD,WAAW,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC;IACzB,oDAAoD;IACpD,cAAc,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC;IAC5B,iDAAiD;IACjD,WAAW,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC;IACzB,6CAA6C;IAC7C,gBAAgB,CAAC,EAAE,OAAO,CAAC,MAAM,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC,CAAC;CACxD;AAUD;;;;;;;;;GASG;AACH,wBAAgB,aAAa,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,EAAE,YAAY,GAAG,MAAM,EAAE,CAsF/E;AAED;;;;GAIG;AACH,wBAAgB,sBAAsB,CAAC,MAAM,EAAE,QAAQ,GAAG,MAAM,EAAE,CAyBjE;AAED;;;;;;;;;GASG;AACH,wBAAgB,sBAAsB,CACpC,QAAQ,EAAE,iBAAiB,EAAE,EAC7B,OAAO,EAAE,MAAM,EAAE,EACjB,OAAO,EAAE,QAAQ,GAChB,iBAAiB,EAAE,CAsBrB;AAsED;;GAEG;AACH,wBAAgB,4BAA4B,CAAC,UAAU,EAAE,UAAU,GAAG;IACpE,oBAAoB,EAAE,MAAM,CAAC;IAC7B,cAAc,EAAE,MAAM,CAAC;CACxB,CAaA"}
|
package/package.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@small-ltsc/ml",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "ML features for Small LTSC - Pattern importance scoring, quality prediction, and adaptive region detection",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/esm/index.js",
|
|
7
|
+
"module": "./dist/esm/index.js",
|
|
8
|
+
"types": "./dist/types/index.d.ts",
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"import": "./dist/esm/index.js",
|
|
12
|
+
"types": "./dist/types/index.d.ts"
|
|
13
|
+
}
|
|
14
|
+
},
|
|
15
|
+
"files": [
|
|
16
|
+
"dist"
|
|
17
|
+
],
|
|
18
|
+
"sideEffects": false,
|
|
19
|
+
"scripts": {
|
|
20
|
+
"build": "npm run build:esm && npm run build:types",
|
|
21
|
+
"build:esm": "tsc -p tsconfig.esm.json",
|
|
22
|
+
"build:types": "tsc -p tsconfig.types.json",
|
|
23
|
+
"test": "vitest",
|
|
24
|
+
"lint": "eslint src --ext .ts",
|
|
25
|
+
"prepublishOnly": "npm run build"
|
|
26
|
+
},
|
|
27
|
+
"keywords": [
|
|
28
|
+
"compression",
|
|
29
|
+
"llm",
|
|
30
|
+
"tokens",
|
|
31
|
+
"transformer",
|
|
32
|
+
"ml",
|
|
33
|
+
"embeddings"
|
|
34
|
+
],
|
|
35
|
+
"author": "",
|
|
36
|
+
"license": "MIT",
|
|
37
|
+
"repository": {
|
|
38
|
+
"type": "git",
|
|
39
|
+
"url": "https://github.com/triage-sec/small"
|
|
40
|
+
},
|
|
41
|
+
"peerDependencies": {
|
|
42
|
+
"@small-ltsc/sdk": "^0.1.0"
|
|
43
|
+
},
|
|
44
|
+
"devDependencies": {
|
|
45
|
+
"@types/node": "^20.10.0",
|
|
46
|
+
"eslint": "^8.55.0",
|
|
47
|
+
"typescript": "^5.3.0",
|
|
48
|
+
"vitest": "^1.0.0"
|
|
49
|
+
},
|
|
50
|
+
"engines": {
|
|
51
|
+
"node": ">=18.0.0"
|
|
52
|
+
}
|
|
53
|
+
}
|