@small-ltsc/sdk 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +169 -0
- package/dist/esm/compress.js +160 -0
- package/dist/esm/compress.js.map +1 -0
- package/dist/esm/config.js +47 -0
- package/dist/esm/config.js.map +1 -0
- package/dist/esm/decompress.js +105 -0
- package/dist/esm/decompress.js.map +1 -0
- package/dist/esm/dictionaries/index.js +104 -0
- package/dist/esm/dictionaries/index.js.map +1 -0
- package/dist/esm/dictionaries/json.json +28 -0
- package/dist/esm/dictionaries/markdown.json +28 -0
- package/dist/esm/dictionaries/python.json +28 -0
- package/dist/esm/dictionaries/sql.json +28 -0
- package/dist/esm/dictionaries/typescript.json +28 -0
- package/dist/esm/index.js +26 -0
- package/dist/esm/index.js.map +1 -0
- package/dist/esm/streaming.js +139 -0
- package/dist/esm/streaming.js.map +1 -0
- package/dist/esm/types.js +25 -0
- package/dist/esm/types.js.map +1 -0
- package/dist/esm/wasm/index.js +5 -0
- package/dist/esm/wasm/index.js.map +1 -0
- package/dist/esm/wasm/loader.js +179 -0
- package/dist/esm/wasm/loader.js.map +1 -0
- package/dist/esm/worker.js +304 -0
- package/dist/esm/worker.js.map +1 -0
- package/dist/types/compress.d.ts +39 -0
- package/dist/types/compress.d.ts.map +1 -0
- package/dist/types/config.d.ts +113 -0
- package/dist/types/config.d.ts.map +1 -0
- package/dist/types/decompress.d.ts +53 -0
- package/dist/types/decompress.d.ts.map +1 -0
- package/dist/types/dictionaries/index.d.ts +127 -0
- package/dist/types/dictionaries/index.d.ts.map +1 -0
- package/dist/types/index.d.ts +18 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/streaming.d.ts +81 -0
- package/dist/types/streaming.d.ts.map +1 -0
- package/dist/types/types.d.ts +116 -0
- package/dist/types/types.d.ts.map +1 -0
- package/dist/types/wasm/index.d.ts +6 -0
- package/dist/types/wasm/index.d.ts.map +1 -0
- package/dist/types/wasm/loader.d.ts +71 -0
- package/dist/types/wasm/loader.d.ts.map +1 -0
- package/dist/types/worker.d.ts +106 -0
- package/dist/types/worker.d.ts.map +1 -0
- package/package.json +63 -0
- package/src/wasm/small_ltsc_core_bg.wasm +0 -0
package/README.md
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# @small-ltsc/sdk
|
|
2
|
+
|
|
3
|
+
TypeScript SDK for **Small LTSC** - Lossless Token Sequence Compression for LLMs.
|
|
4
|
+
|
|
5
|
+
Reduce LLM inference costs by compressing repetitive token patterns in prompts while maintaining perfect reconstruction. The compressed format can be understood by fine-tuned models, enabling 30-50% cost reduction on structured inputs.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Lossless compression** - Perfect round-trip reconstruction
|
|
10
|
+
- **High performance** - Rust/WASM core with O(n log n) algorithms
|
|
11
|
+
- **Cross-platform** - Works in browsers, Node.js, Deno, and edge runtimes
|
|
12
|
+
- **Streaming support** - Handle inputs of any size
|
|
13
|
+
- **Worker threads** - Non-blocking compression for large inputs
|
|
14
|
+
- **Static dictionaries** - Pre-built patterns for common domains
|
|
15
|
+
- **TypeScript-first** - Full type safety and IntelliSense
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
npm install @small-ltsc/sdk
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
```typescript
|
|
26
|
+
import { compress, decompress, initWasm } from '@small-ltsc/sdk';
|
|
27
|
+
|
|
28
|
+
// Initialize WASM up front (optional: compress/decompress auto-initialize on first call)
|
|
29
|
+
await initWasm();
|
|
30
|
+
|
|
31
|
+
// Compress tokens
|
|
32
|
+
const tokens = [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3];
|
|
33
|
+
const result = await compress(tokens);
|
|
34
|
+
|
|
35
|
+
console.log(`Compressed: ${result.originalLength} → ${result.compressedLength} tokens`);
|
|
36
|
+
console.log(`Ratio: ${(result.compressionRatio * 100).toFixed(1)}%`);
|
|
37
|
+
|
|
38
|
+
// Decompress
|
|
39
|
+
const restored = await decompress(result.serializedTokens);
|
|
40
|
+
console.assert(JSON.stringify(tokens) === JSON.stringify(restored));
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Configuration
|
|
44
|
+
|
|
45
|
+
```typescript
|
|
46
|
+
const result = await compress(tokens, {
|
|
47
|
+
// Pattern discovery
|
|
48
|
+
minSubsequenceLength: 2, // Min pattern length (default: 2)
|
|
49
|
+
maxSubsequenceLength: 8, // Max pattern length (default: 8)
|
|
50
|
+
|
|
51
|
+
// Selection algorithm
|
|
52
|
+
selectionMode: 'greedy', // 'greedy' | 'optimal' | 'beam'
|
|
53
|
+
|
|
54
|
+
// Hierarchical compression
|
|
55
|
+
hierarchicalEnabled: true, // Allow patterns of patterns
|
|
56
|
+
hierarchicalMaxDepth: 3, // Max nesting depth
|
|
57
|
+
|
|
58
|
+
// Verification
|
|
59
|
+
verify: true, // Round-trip verification
|
|
60
|
+
});
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Static Dictionaries
|
|
64
|
+
|
|
65
|
+
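Use pre-built dictionaries for better compression of domain-specific input (note: in 0.1.0 the dictionary ID is validated and recorded on the result; applying its patterns inside the WASM core is still a TODO):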
|
|
66
|
+
|
|
67
|
+
```typescript
|
|
68
|
+
const result = await compress(pythonCodeTokens, {
|
|
69
|
+
staticDictionary: 'python-v1',
|
|
70
|
+
});
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Available: `python-v1`, `typescript-v1`, `markdown-v1`, `json-v1`, `sql-v1`
|
|
74
|
+
|
|
75
|
+
## Streaming
|
|
76
|
+
|
|
77
|
+
For large inputs:
|
|
78
|
+
|
|
79
|
+
```typescript
|
|
80
|
+
import { createStreamingCompressor } from '@small-ltsc/sdk';
|
|
81
|
+
|
|
82
|
+
const compressor = await createStreamingCompressor();
|
|
83
|
+
|
|
84
|
+
for await (const chunk of tokenStream) {
|
|
85
|
+
await compressor.addChunk(chunk);
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
const result = await compressor.finish();
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Worker Threads
|
|
92
|
+
|
|
93
|
+
Non-blocking compression:
|
|
94
|
+
|
|
95
|
+
```typescript
|
|
96
|
+
import { createWorkerPool } from '@small-ltsc/sdk';
|
|
97
|
+
|
|
98
|
+
const pool = await createWorkerPool(4);
|
|
99
|
+
const result = await pool.compress(tokens);
|
|
100
|
+
pool.terminate();
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Browser Usage
|
|
104
|
+
|
|
105
|
+
```html
|
|
106
|
+
<script type="module">
|
|
107
|
+
import { compress, initWasm } from 'https://esm.sh/@small-ltsc/sdk';
|
|
108
|
+
await initWasm();
|
|
109
|
+
const result = await compress([1, 2, 3, 1, 2, 3]);
|
|
110
|
+
</script>
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## API
|
|
114
|
+
|
|
115
|
+
### Core Functions
|
|
116
|
+
|
|
117
|
+
- `compress(tokens, config?)` - Compress token sequence
|
|
118
|
+
- `decompress(tokens, config?)` - Decompress to original tokens
|
|
119
|
+
- `discoverPatterns(tokens, minLen?, maxLen?)` - Find patterns without compressing
|
|
120
|
+
|
|
121
|
+
### Streaming
|
|
122
|
+
|
|
123
|
+
- `createStreamingCompressor(config?)` - Create streaming compressor
|
|
124
|
+
- `compressStream(asyncIterable, config?)` - Compress async stream
|
|
125
|
+
|
|
126
|
+
### Workers
|
|
127
|
+
|
|
128
|
+
- `createWorkerPool(count?)` - Create worker pool
|
|
129
|
+
- `compressInWorker(tokens, config?)` - Single-use worker compression
|
|
130
|
+
|
|
131
|
+
### Dictionaries
|
|
132
|
+
|
|
133
|
+
- `loadStaticDictionary(id)` - Load built-in dictionary
|
|
134
|
+
- `createStaticDictionary(id, patterns)` - Create custom dictionary
|
|
135
|
+
|
|
136
|
+
### Utilities
|
|
137
|
+
|
|
138
|
+
- `initWasm()` - Initialize WASM module
|
|
139
|
+
- `isWasmInitialized()` - Check initialization status
|
|
140
|
+
- `extractDictionary(tokens)` - Get dictionary from compressed tokens
|
|
141
|
+
- `isCompressed(tokens)` - Check if tokens are compressed
|
|
142
|
+
|
|
143
|
+
## Documentation
|
|
144
|
+
|
|
145
|
+
- [Quick Start Guide](./docs/QUICKSTART.md)
|
|
146
|
+
- [API Reference](./docs/API.md)
|
|
147
|
+
|
|
148
|
+
## Optional ML Features
|
|
149
|
+
|
|
150
|
+
For pattern importance scoring and quality prediction:
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
npm install @small-ltsc/ml
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
```typescript
|
|
157
|
+
import { HeuristicQualityPredictor, detectRegions } from '@small-ltsc/ml';
|
|
158
|
+
|
|
159
|
+
const predictor = new HeuristicQualityPredictor();
|
|
160
|
+
const prediction = await predictor.predict(compressionResult);
|
|
161
|
+
|
|
162
|
+
if (!prediction.acceptable) {
|
|
163
|
+
// Retry with more conservative settings
|
|
164
|
+
}
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## License
|
|
168
|
+
|
|
169
|
+
MIT
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* High-level compression API.
|
|
3
|
+
*/
|
|
4
|
+
import { getWasm, isWasmInitialized, initWasm } from './wasm/loader.js';
|
|
5
|
+
import { mergeConfig, toWasmConfig, DEFAULT_CONFIG } from './config.js';
|
|
6
|
+
import { normalizeTokens, } from './types.js';
|
|
7
|
+
import { loadStaticDictionary } from './dictionaries/index.js';
|
|
8
|
+
/**
 * Compress a token sequence.
 *
 * Initializes the WASM module on first use, merges `config` with the
 * defaults, and switches to chunked streaming compression when the input
 * is longer than `streamingThreshold`.
 *
 * @param tokens - The token sequence to compress (Array, Uint32Array, or similar)
 * @param config - Optional compression configuration
 * @returns Promise resolving to compression result
 *
 * @example
 * ```typescript
 * import { compress, decompress, initWasm } from '@small-ltsc/sdk';
 *
 * await initWasm();
 *
 * const tokens = [1, 2, 3, 1, 2, 3, 1, 2, 3];
 * const result = await compress(tokens);
 *
 * console.log(`Compressed ${result.originalLength} -> ${result.compressedLength}`);
 * console.log(`Ratio: ${(result.compressionRatio * 100).toFixed(1)}%`);
 * ```
 */
export async function compress(tokens, config) {
  // Auto-initialize if not already done
  if (!isWasmInitialized()) {
    await initWasm();
  }
  const mergedConfig = mergeConfig(config);
  const inputTokens = normalizeTokens(tokens);

  // Resolve the static dictionary ID (if any) that is reported on the result.
  let staticDictId;
  const { staticDictionary } = mergedConfig;
  if (staticDictionary) {
    if (typeof staticDictionary === 'string') {
      // Load the built-in dictionary purely to validate that the ID exists.
      await loadStaticDictionary(staticDictionary);
      staticDictId = staticDictionary;
      // TODO: Apply static dictionary patterns in WASM core
    } else {
      staticDictId = staticDictionary.id;
    }
  }

  // Large inputs are fed to the WASM core in chunks.
  if (inputTokens.length > mergedConfig.streamingThreshold) {
    return compressStreaming(inputTokens, mergedConfig, staticDictId);
  }

  // Direct (single call) compression.
  const wasm = getWasm();
  const startTime = performance.now();
  const wasmConfig = toWasmConfig(mergedConfig);
  const wasmResult = wasm.compress(inputTokens, wasmConfig);
  const endTime = performance.now();
  return convertWasmResult(wasmResult, staticDictId, endTime - startTime, mergedConfig);
}

/**
 * Internal streaming compression for large inputs.
 *
 * @param tokens - Normalized tokens; must support `subarray(start, end)`
 * @param config - Merged SDK configuration
 * @param staticDictId - Static dictionary ID to report on the result, if any
 * @returns Promise resolving to compression result
 */
async function compressStreaming(tokens, config, staticDictId) {
  const wasm = getWasm();
  const wasmConfig = toWasmConfig(config);
  const startTime = performance.now();
  const compressor = new wasm.StreamingCompressor(wasmConfig);

  // Process in fixed-size chunks
  const chunkSize = 32768;
  for (let offset = 0; offset < tokens.length; offset += chunkSize) {
    const end = Math.min(offset + chunkSize, tokens.length);
    compressor.add_chunk(tokens.subarray(offset, end));
  }

  const wasmResult = compressor.finish();
  const endTime = performance.now();
  return convertWasmResult(wasmResult, staticDictId, endTime - startTime, config);
}

/**
 * Convert WASM result to SDK result format.
 *
 * @param wasmResult - Result object returned by the WASM core
 * @param staticDictId - Static dictionary ID chosen by the caller, if any
 * @param totalTimeMs - Wall-clock duration of the compression call
 * @param config - Configuration used for the compression; its dictionary
 *   marker tokens are needed to parse the dictionary section. Falls back to
 *   the default markers when omitted.
 * @returns SDK compression result
 */
function convertWasmResult(wasmResult, staticDictId, totalTimeMs, config) {
  const serializedTokens = Array.from(wasmResult.getSerializedTokens());
  const dictionaryTokens = Array.from(wasmResult.getDictionaryTokens());
  const bodyTokens = Array.from(wasmResult.getBodyTokens());
  const originalTokens = Array.from(wasmResult.getOriginalTokens());

  // The WASM core was configured with these markers (see toWasmConfig), so the
  // dictionary section has to be parsed with the same ones, not the defaults.
  const dictionaryMap = buildDictionaryMap(
    dictionaryTokens,
    config?.dictStartToken ?? DEFAULT_CONFIG.dictStartToken,
    config?.dictEndToken ?? DEFAULT_CONFIG.dictEndToken,
  );

  return {
    originalTokens,
    serializedTokens,
    dictionaryTokens,
    bodyTokens,
    originalLength: wasmResult.original_length,
    compressedLength: wasmResult.compressed_length,
    compressionRatio: wasmResult.compression_ratio,
    dictionaryMap,
    staticDictionaryId: staticDictId ?? wasmResult.getStaticDictionaryId() ?? undefined,
    metrics: {
      discoveryTimeMs: 0, // Not tracked individually in current implementation
      selectionTimeMs: 0,
      serializationTimeMs: 0,
      totalTimeMs,
      peakMemoryBytes: 0,
    },
  };
}

/**
 * Build dictionary map from serialized dictionary tokens.
 *
 * The section is parsed as: start marker, then repeated entries of
 * `metaToken, length, ...definition`, terminated by the end marker.
 * Parsing stops at the first incomplete entry.
 *
 * @param dictionaryTokens - Serialized dictionary section
 * @param dictStart - Dictionary start marker (defaults to the SDK default)
 * @param dictEnd - Dictionary end marker (defaults to the SDK default)
 * @returns Map of meta-token to its definition tokens
 */
function buildDictionaryMap(
  dictionaryTokens,
  dictStart = DEFAULT_CONFIG.dictStartToken,
  dictEnd = DEFAULT_CONFIG.dictEndToken,
) {
  const map = new Map();
  const total = dictionaryTokens.length;
  if (total === 0) {
    return map;
  }

  const startIdx = dictionaryTokens.indexOf(dictStart);
  if (startIdx === -1) {
    return map;
  }

  let pos = startIdx + 1; // Skip start token
  while (pos < total && dictionaryTokens[pos] !== dictEnd) {
    // An entry header is two tokens (meta-token + length); stop if truncated
    // instead of recording an entry with a bogus empty definition.
    if (pos + 2 > total) {
      break;
    }
    const metaToken = dictionaryTokens[pos];
    const length = dictionaryTokens[pos + 1];
    const definitionStart = pos + 2;
    const definitionEnd = definitionStart + length;
    if (definitionEnd > total) {
      break;
    }
    map.set(metaToken, dictionaryTokens.slice(definitionStart, definitionEnd));
    pos = definitionEnd;
  }
  return map;
}
|
|
139
|
+
/**
 * Run pattern discovery on a token sequence without compressing it.
 *
 * Handy for analysis, for building static dictionaries, or for seeing
 * which patterns compression would pick up. The WASM module is
 * initialized on demand.
 *
 * @param tokens - The token sequence to analyze
 * @param minLength - Minimum pattern length (default: 2)
 * @param maxLength - Maximum pattern length (default: 8)
 * @returns Array of discovered patterns sorted by potential savings
 */
export async function discoverPatterns(tokens, minLength = 2, maxLength = 8) {
  const wasmReady = isWasmInitialized();
  if (!wasmReady) {
    await initWasm();
  }

  const core = getWasm();
  const normalized = normalizeTokens(tokens);

  // The WASM call already returns a JSON-serializable value; pass it through.
  return core.discover_patterns(normalized, minLength, maxLength);
}
|
|
160
|
+
//# sourceMappingURL=compress.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compress.js","sourceRoot":"","sources":["../../src/compress.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,OAAO,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAExE,OAAO,EAA0B,WAAW,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAChG,OAAO,EAIL,eAAe,GAChB,MAAM,YAAY,CAAC;AACpB,OAAO,EAAE,oBAAoB,EAAyB,MAAM,yBAAyB,CAAC;AAEtF;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,MAAkB,EAClB,MAA0B;IAE1B,sCAAsC;IACtC,IAAI,CAAC,iBAAiB,EAAE,EAAE,CAAC;QACzB,MAAM,QAAQ,EAAE,CAAC;IACnB,CAAC;IAED,MAAM,YAAY,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC;IACzC,MAAM,WAAW,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC;IAE5C,wCAAwC;IACxC,IAAI,YAAgC,CAAC;IACrC,IAAI,YAAY,CAAC,gBAAgB,EAAE,CAAC;QAClC,IAAI,OAAO,YAAY,CAAC,gBAAgB,KAAK,QAAQ,EAAE,CAAC;YACtD,4DAA4D;YAC5D,MAAM,oBAAoB,CACxB,YAAY,CAAC,gBAA8D,CAC5E,CAAC;YACF,YAAY,GAAG,YAAY,CAAC,gBAAgB,CAAC;YAC7C,sDAAsD;QACxD,CAAC;aAAM,CAAC;YACN,YAAY,GAAI,YAAY,CAAC,gBAAqC,CAAC,EAAE,CAAC;QACxE,CAAC;IACH,CAAC;IAED,oCAAoC;IACpC,IAAI,WAAW,CAAC,MAAM,GAAG,YAAY,CAAC,kBAAkB,EAAE,CAAC;QACzD,OAAO,iBAAiB,CAAC,WAAW,EAAE,YAAY,EAAE,YAAY,CAAC,CAAC;IACpE,CAAC;IAED,qBAAqB;IACrB,MAAM,IAAI,GAAG,OAAO,EAAE,CAAC;IACvB,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;IAEpC,MAAM,UAAU,GAAG,YAAY,CAAC,YAAY,CAAC,CAAC;IAC9C,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,CAAC,WAAW,EAAE,UAAU,CAAC,CAAC;IAE1D,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;IAElC,OAAO,iBAAiB,CAAC,UAAU,EAAE,YAAY,EAAE,OAAO,GAAG,SAAS,CAAC,CAAC;AAC1E,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,iBAAiB,CAC9B,MAAmB,EACnB,MAEC,EACD,YAAqB;IAErB,MAAM,IAAI,GAAG,OAAO,EAAE,CAAC;IACvB,MAAM,UAAU,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC;IAExC,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;IAEpC,8BAA8B;IAC9B,MAAM,UAAU,GAAG,IAAI,IAAI,CAAC,mBAAmB,CAAC,UAAU,CAAC,CAAC;IAE5D,oBAAoB;IACpB,MAAM,SAAS,GAAG,KAAK,CAAC;IACxB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,IAAI,SAAS,EAAE,CAAC;QAClD,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,SAAS,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;QACnD,MAAM,KAAK,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;QACtC,UAAU,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;IAC9
B,CAAC;IAED,wBAAwB;IACxB,MAAM,UAAU,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC;IACvC,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;IAElC,OAAO,iBAAiB,CAAC,UAAU,EAAE,YAAY,EAAE,OAAO,GAAG,SAAS,CAAC,CAAC;AAC1E,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CACxB,UAAiC,EACjC,YAAgC,EAChC,WAAmB;IAEnB,MAAM,gBAAgB,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,mBAAmB,EAAE,CAAC,CAAC;IACtE,MAAM,gBAAgB,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,mBAAmB,EAAE,CAAC,CAAC;IACtE,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,aAAa,EAAE,CAAC,CAAC;IAC1D,MAAM,cAAc,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,iBAAiB,EAAE,CAAC,CAAC;IAElE,8CAA8C;IAC9C,MAAM,aAAa,GAAG,kBAAkB,CAAC,gBAAgB,CAAC,CAAC;IAE3D,OAAO;QACL,cAAc;QACd,gBAAgB;QAChB,gBAAgB;QAChB,UAAU;QACV,cAAc,EAAE,UAAU,CAAC,eAAe;QAC1C,gBAAgB,EAAE,UAAU,CAAC,iBAAiB;QAC9C,gBAAgB,EAAE,UAAU,CAAC,iBAAiB;QAC9C,aAAa;QACb,kBAAkB,EAAE,YAAY,IAAI,UAAU,CAAC,qBAAqB,EAAE,IAAI,SAAS;QACnF,OAAO,EAAE;YACP,eAAe,EAAE,CAAC,EAAE,qDAAqD;YACzE,eAAe,EAAE,CAAC;YAClB,mBAAmB,EAAE,CAAC;YACtB,WAAW;YACX,eAAe,EAAE,CAAC;SACnB;KACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB,CACzB,gBAAmC;IAEnC,MAAM,GAAG,GAAG,IAAI,GAAG,EAA6B,CAAC;IAEjD,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAClC,OAAO,GAAG,CAAC;IACb,CAAC;IAED,MAAM,SAAS,GAAG,cAAc,CAAC,cAAc,CAAC;IAChD,MAAM,OAAO,GAAG,cAAc,CAAC,YAAY,CAAC;IAE5C,IAAI,GAAG,GAAG,CAAC,CAAC;IAEZ,wBAAwB;IACxB,OAAO,GAAG,GAAG,gBAAgB,CAAC,MAAM,IAAI,gBAAgB,CAAC,GAAG,CAAC,KAAK,SAAS,EAAE,CAAC;QAC5E,GAAG,EAAE,CAAC;IACR,CAAC;IACD,GAAG,EAAE,CAAC,CAAC,mBAAmB;IAE1B,gBAAgB;IAChB,OAAO,GAAG,GAAG,gBAAgB,CAAC,MAAM,IAAI,gBAAgB,CAAC,GAAG,CAAC,KAAK,OAAO,EAAE,CAAC;QAC1E,MAAM,SAAS,GAAG,gBAAgB,CAAC,GAAG,EAAE,CAAC,CAAC;QAC1C,MAAM,MAAM,GAAG,gBAAgB,CAAC,GAAG,EAAE,CAAC,CAAC;QAEvC,IAAI,GAAG,GAAG,MAAM,GAAG,gBAAgB,CAAC,MAAM,EAAE,CAAC;YAC3C,MAAM;QACR,CAAC;QAED,MAAM,UAAU,GAAG,gBAAgB,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,GAAG,MAAM,CAAC,CAAC;QAC7D,GAAG,CAAC,GAAG,CAAC,SAAS,EAAE,UAAU,CAAC,CAAC;QAC/B,GAAG,IAAI,MAAM,CAAC;IAChB,CAAC;IAED,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,MAAkB,EAClB,SAAS,GAAG,CAAC,EACb,SAAS,GAAG,CA
AC;IAEb,IAAI,CAAC,iBAAiB,EAAE,EAAE,CAAC;QACzB,MAAM,QAAQ,EAAE,CAAC;IACnB,CAAC;IAED,MAAM,IAAI,GAAG,OAAO,EAAE,CAAC;IACvB,MAAM,WAAW,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC;IAE5C,MAAM,MAAM,GAAG,IAAI,CAAC,iBAAiB,CAAC,WAAW,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC;IAEzE,2DAA2D;IAC3D,OAAO,MAA6B,CAAC;AACvC,CAAC"}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TypeScript configuration types for Small LTSC.
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Default configuration values.
 */
export const DEFAULT_CONFIG = {
  minSubsequenceLength: 2,
  maxSubsequenceLength: 8,
  selectionMode: 'greedy',
  beamWidth: 8,
  hierarchicalEnabled: true,
  hierarchicalMaxDepth: 3,
  streamingThreshold: 50000,
  maxMemoryMb: 256,
  verify: false,
  dictStartToken: 0xfffffff0,
  dictEndToken: 0xfffffff1,
  nextMetaToken: 0xffff0000,
};

/**
 * Merge user config with defaults.
 *
 * Options whose value is `undefined` are treated as "not provided" and keep
 * their default, so forwarding optional settings such as
 * `{ beamWidth: maybeUndefined }` cannot erase a default. All other values
 * (including `0`, `false` and `null`) override the default as given.
 *
 * @param userConfig - Optional partial configuration
 * @returns A new object containing the defaults overlaid with the user's values
 */
export function mergeConfig(userConfig) {
  const definedOverrides = Object.fromEntries(
    Object.entries(userConfig ?? {}).filter(([, value]) => value !== undefined),
  );
  return {
    ...DEFAULT_CONFIG,
    ...definedOverrides,
  };
}
|
|
30
|
+
/**
 * Translate an SDK (camelCase) config into the snake_case shape handed to
 * the WASM core.
 *
 * Only the options listed below are forwarded; every other key on `config`
 * is left out of the result.
 *
 * @param config - SDK configuration object
 * @returns Plain object with snake_case keys
 */
export function toWasmConfig(config) {
  // [WASM key, SDK key] pairs, in the order they appear on the result.
  const keyPairs = [
    ['min_subsequence_length', 'minSubsequenceLength'],
    ['max_subsequence_length', 'maxSubsequenceLength'],
    ['selection_mode', 'selectionMode'],
    ['beam_width', 'beamWidth'],
    ['hierarchical_enabled', 'hierarchicalEnabled'],
    ['hierarchical_max_depth', 'hierarchicalMaxDepth'],
    ['verify', 'verify'],
    ['dict_start_token', 'dictStartToken'],
    ['dict_end_token', 'dictEndToken'],
    ['next_meta_token', 'nextMetaToken'],
  ];
  return Object.fromEntries(
    keyPairs.map(([wasmKey, sdkKey]) => [wasmKey, config[sdkKey]]),
  );
}
|
|
47
|
+
//# sourceMappingURL=config.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../../src/config.ts"],"names":[],"mappings":"AAAA;;GAEG;AAkHH;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAEvB;IACF,oBAAoB,EAAE,CAAC;IACvB,oBAAoB,EAAE,CAAC;IACvB,aAAa,EAAE,QAAQ;IACvB,SAAS,EAAE,CAAC;IACZ,mBAAmB,EAAE,IAAI;IACzB,oBAAoB,EAAE,CAAC;IACvB,kBAAkB,EAAE,KAAK;IACzB,WAAW,EAAE,GAAG;IAChB,MAAM,EAAE,KAAK;IACb,cAAc,EAAE,UAAU;IAC1B,YAAY,EAAE,UAAU;IACxB,aAAa,EAAE,UAAU;CACjB,CAAC;AAEX;;GAEG;AACH,MAAM,UAAU,WAAW,CACzB,UAA8B;IAI9B,OAAO;QACL,GAAG,cAAc;QACjB,GAAG,UAAU;KACd,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,MAAyB;IACpD,OAAO;QACL,sBAAsB,EAAE,MAAM,CAAC,oBAAoB;QACnD,sBAAsB,EAAE,MAAM,CAAC,oBAAoB;QACnD,cAAc,EAAE,MAAM,CAAC,aAAa;QACpC,UAAU,EAAE,MAAM,CAAC,SAAS;QAC5B,oBAAoB,EAAE,MAAM,CAAC,mBAAmB;QAChD,sBAAsB,EAAE,MAAM,CAAC,oBAAoB;QACnD,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,gBAAgB,EAAE,MAAM,CAAC,cAAc;QACvC,cAAc,EAAE,MAAM,CAAC,YAAY;QACnC,eAAe,EAAE,MAAM,CAAC,aAAa;KACtC,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* High-level decompression API.
|
|
3
|
+
*/
|
|
4
|
+
import { getWasm, isWasmInitialized, initWasm } from './wasm/loader.js';
|
|
5
|
+
import { DEFAULT_CONFIG } from './config.js';
|
|
6
|
+
import { normalizeTokens } from './types.js';
|
|
7
|
+
/**
 * Restore the original tokens from a compressed token sequence.
 *
 * The WASM module is initialized on demand. Only the dictionary marker
 * tokens are read from `config`; missing markers fall back to the defaults.
 *
 * @param tokens - The compressed token sequence
 * @param config - Optional decompression configuration
 * @returns Promise resolving to the original token sequence
 *
 * @example
 * ```typescript
 * import { compress, decompress, initWasm } from '@small-ltsc/sdk';
 *
 * await initWasm();
 *
 * const tokens = [1, 2, 3, 1, 2, 3, 1, 2, 3];
 * const result = await compress(tokens);
 * const restored = await decompress(result.serializedTokens);
 *
 * console.assert(JSON.stringify(tokens) === JSON.stringify(restored));
 * ```
 */
export async function decompress(tokens, config) {
  const wasmReady = isWasmInitialized();
  if (!wasmReady) {
    await initWasm();
  }

  const core = getWasm();
  const normalized = normalizeTokens(tokens);

  const startMarker = config?.dictStartToken ?? DEFAULT_CONFIG.dictStartToken;
  const endMarker = config?.dictEndToken ?? DEFAULT_CONFIG.dictEndToken;

  const restored = core.decompress(normalized, {
    dict_start_token: startMarker,
    dict_end_token: endMarker,
  });
  return Array.from(restored);
}
|
|
41
|
+
/**
 * Extract the dictionary from a compressed token sequence.
 *
 * The dictionary section is parsed as: start marker, then repeated entries
 * of `metaToken, length, ...definition`, terminated by the end marker.
 * Parsing stops at the first incomplete entry; entries read before that
 * point are still returned.
 *
 * @param tokens - The compressed token sequence
 * @param config - Optional decompression configuration (marker tokens)
 * @returns Map of meta-tokens to their definitions; empty when no
 *   dictionary start marker is present
 */
export async function extractDictionary(tokens, config) {
  const inputTokens = normalizeTokens(tokens);
  const dictStart = config?.dictStartToken ?? DEFAULT_CONFIG.dictStartToken;
  const dictEnd = config?.dictEndToken ?? DEFAULT_CONFIG.dictEndToken;
  const map = new Map();

  // Find dictionary start
  const startIdx = inputTokens.indexOf(dictStart);
  if (startIdx === -1) {
    return map;
  }

  const total = inputTokens.length;
  let pos = startIdx + 1;
  while (pos < total && inputTokens[pos] !== dictEnd) {
    // An entry header is two tokens (meta-token + length). Without this
    // check a trailing meta-token with no length would be recorded with an
    // empty definition, because `pos + undefined > total` is false.
    if (pos + 2 > total) {
      break;
    }
    const metaToken = inputTokens[pos];
    const length = inputTokens[pos + 1];
    const definitionStart = pos + 2;
    const definitionEnd = definitionStart + length;
    if (definitionEnd > total) {
      break;
    }
    map.set(metaToken, Array.from(inputTokens.slice(definitionStart, definitionEnd)));
    pos = definitionEnd;
  }
  return map;
}
|
|
72
|
+
/**
 * Return the body portion of a compressed sequence without decompressing it.
 *
 * The body is everything after the first dictionary end marker. When no end
 * marker is found, the whole sequence is returned.
 *
 * @param tokens - The compressed token sequence
 * @param config - Optional decompression configuration
 * @returns Body tokens with meta-token references
 */
export function extractBody(tokens, config) {
  const normalized = normalizeTokens(tokens);
  const endMarker = config?.dictEndToken ?? DEFAULT_CONFIG.dictEndToken;

  const markerIndex = normalized.indexOf(endMarker);
  // No marker means there is no dictionary section to strip.
  const body = markerIndex === -1 ? normalized : normalized.slice(markerIndex + 1);
  return Array.from(body);
}
|
|
91
|
+
/**
 * Tell whether a token sequence looks compressed.
 *
 * The check is simply whether the dictionary start marker occurs anywhere
 * in the sequence.
 *
 * @param tokens - The token sequence to check
 * @param config - Optional decompression configuration
 * @returns True if the sequence contains a dictionary section
 */
export function isCompressed(tokens, config) {
  const normalized = normalizeTokens(tokens);
  const startMarker = config?.dictStartToken ?? DEFAULT_CONFIG.dictStartToken;
  const hasDictionary = normalized.includes(startMarker);
  return hasDictionary;
}
|
|
105
|
+
//# sourceMappingURL=decompress.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"decompress.js","sourceRoot":"","sources":["../../src/decompress.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,OAAO,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AACxE,OAAO,EAA4B,cAAc,EAAE,MAAM,aAAa,CAAC;AACvE,OAAO,EAAmB,eAAe,EAAE,MAAM,YAAY,CAAC;AAE9D;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,MAAkB,EAClB,MAA4B;IAE5B,sCAAsC;IACtC,IAAI,CAAC,iBAAiB,EAAE,EAAE,CAAC;QACzB,MAAM,QAAQ,EAAE,CAAC;IACnB,CAAC;IAED,MAAM,IAAI,GAAG,OAAO,EAAE,CAAC;IACvB,MAAM,WAAW,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC;IAE5C,MAAM,UAAU,GAAG;QACjB,gBAAgB,EAAE,MAAM,EAAE,cAAc,IAAI,cAAc,CAAC,cAAc;QACzE,cAAc,EAAE,MAAM,EAAE,YAAY,IAAI,cAAc,CAAC,YAAY;KACpE,CAAC;IAEF,MAAM,MAAM,GAAG,IAAI,CAAC,UAAU,CAAC,WAAW,EAAE,UAAU,CAAC,CAAC;IACxD,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC5B,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,MAAkB,EAClB,MAA4B;IAE5B,MAAM,WAAW,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC;IAC5C,MAAM,SAAS,GAAG,MAAM,EAAE,cAAc,IAAI,cAAc,CAAC,cAAc,CAAC;IAC1E,MAAM,OAAO,GAAG,MAAM,EAAE,YAAY,IAAI,cAAc,CAAC,YAAY,CAAC;IAEpE,MAAM,GAAG,GAAG,IAAI,GAAG,EAA6B,CAAC;IAEjD,wBAAwB;IACxB,MAAM,QAAQ,GAAG,WAAW,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IAChD,IAAI,QAAQ,KAAK,CAAC,CAAC,EAAE,CAAC;QACpB,OAAO,GAAG,CAAC;IACb,CAAC;IAED,IAAI,GAAG,GAAG,QAAQ,GAAG,CAAC,CAAC;IAEvB,gBAAgB;IAChB,OAAO,GAAG,GAAG,WAAW,CAAC,MAAM,IAAI,WAAW,CAAC,GAAG,CAAC,KAAK,OAAO,EAAE,CAAC;QAChE,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC;QACrC,MAAM,MAAM,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC;QAElC,IAAI,GAAG,GAAG,MAAM,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC;YACtC,MAAM;QACR,CAAC;QAED,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC;QACpE,GAAG,CAAC,GAAG,CAAC,SAAS,EAAE,UAAU,CAAC,CAAC;QAC/B,GAAG,IAAI,MAAM,CAAC;IAChB,CAAC;IAED,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,WAAW,CACzB,MAAkB,EAClB,MAA4B;IAE5B,MAAM,WAAW,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC;IAC5C,MAAM,OAAO,GAAG,MAAM,EAAE,YAAY,IAAI,cAAc,CAAC,YAAY,CAAC;IAEpE,sBAAsB;IACtB,MAAM,MAAM,GAAG,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;IAC5C,IAAI,MAAM,KAAK,CAAC,CAA
C,EAAE,CAAC;QAClB,4CAA4C;QAC5C,OAAO,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IACjC,CAAC;IAED,oDAAoD;IACpD,OAAO,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;AACnD,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,YAAY,CAC1B,MAAkB,EAClB,MAA4B;IAE5B,MAAM,WAAW,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC;IAC5C,MAAM,SAAS,GAAG,MAAM,EAAE,cAAc,IAAI,cAAc,CAAC,cAAc,CAAC;IAC1E,OAAO,WAAW,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC;AACzC,CAAC"}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Static dictionary support for Small LTSC.
|
|
3
|
+
*
|
|
4
|
+
* Pre-built dictionaries for common domains that can be used
|
|
5
|
+
* to improve compression of domain-specific content.
|
|
6
|
+
*/
|
|
7
|
+
// Import dictionaries directly
|
|
8
|
+
import pythonDict from './python.json';
|
|
9
|
+
import typescriptDict from './typescript.json';
|
|
10
|
+
import markdownDict from './markdown.json';
|
|
11
|
+
import jsonDict from './json.json';
|
|
12
|
+
import sqlDict from './sql.json';
|
|
13
|
+
/**
 * Registry of the built-in static dictionaries, keyed by dictionary ID.
 * Each value is the raw JSON imported at the top of this module.
 */
export const STATIC_DICTIONARIES = {
  'python-v1': pythonDict,
  'typescript-v1': typescriptDict,
  'markdown-v1': markdownDict,
  'json-v1': jsonDict,
  'sql-v1': sqlDict,
};
|
|
23
|
+
/**
 * Load a built-in static dictionary.
 *
 * Only IDs that are own keys of `STATIC_DICTIONARIES` are accepted, so
 * inherited property names such as `'toString'` or `'constructor'` are
 * reported as unknown instead of failing later with an unrelated TypeError.
 *
 * @param id - Dictionary ID (e.g., 'python-v1', 'typescript-v1')
 * @returns Promise resolving to the static dictionary
 * @throws Error if `id` is not a built-in dictionary
 *
 * @example
 * ```typescript
 * import { loadStaticDictionary, compress } from '@small-ltsc/sdk';
 *
 * const pythonDict = await loadStaticDictionary('python-v1');
 *
 * const result = await compress(tokens, {
 *   staticDictionary: pythonDict,
 * });
 * ```
 */
export async function loadStaticDictionary(id) {
  const isKnown = Object.prototype.hasOwnProperty.call(STATIC_DICTIONARIES, id);
  const data = isKnown ? STATIC_DICTIONARIES[id] : undefined;
  if (!data) {
    throw new Error(`Unknown static dictionary: ${id}`);
  }
  return parseDictionaryJson(data);
}
|
|
47
|
+
/**
 * Turn raw dictionary JSON into a StaticDictionary object.
 *
 * Builds two lookups from `data.entries`: meta-token -> pattern, and
 * JSON-stringified pattern -> meta-token. The descriptive fields are copied
 * through unchanged.
 *
 * @param data - Parsed dictionary JSON with `id`, `version`, `name`,
 *   `description` and an iterable `entries` of `{ metaToken, pattern }`
 * @returns StaticDictionary with `entries` and `patterns` maps
 */
function parseDictionaryJson(data) {
  const byMetaToken = new Map();
  const byPatternKey = new Map();

  for (const { metaToken, pattern } of data.entries) {
    byMetaToken.set(metaToken, pattern);
    byPatternKey.set(JSON.stringify(pattern), metaToken);
  }

  const { id, version, name, description } = data;
  return {
    id,
    version,
    name,
    description,
    entries: byMetaToken,
    patterns: byPatternKey,
  };
}
|
|
66
|
+
/**
 * Build a custom static dictionary from a list of token patterns.
 *
 * Pattern `i` is assigned the meta-token `startMetaToken + i`. The result
 * carries a fixed version of '1.0.0' and uses `id` as its name.
 *
 * @param id - Unique identifier for the dictionary
 * @param patterns - Array of token patterns to include
 * @param startMetaToken - Starting meta-token ID (default: 0xFFFF8000)
 * @returns StaticDictionary ready for use
 */
export function createStaticDictionary(id, patterns, startMetaToken = 0xffff8000) {
  const byMetaToken = new Map();
  const byPatternKey = new Map();

  let index = 0;
  while (index < patterns.length) {
    const tokenPattern = patterns[index];
    const assignedToken = startMetaToken + index;
    byMetaToken.set(assignedToken, tokenPattern);
    byPatternKey.set(JSON.stringify(tokenPattern), assignedToken);
    index += 1;
  }

  const description = `Custom dictionary: ${id}`;
  return {
    id,
    version: '1.0.0',
    name: id,
    description,
    entries: byMetaToken,
    patterns: byPatternKey,
  };
}
|
|
92
|
+
/**
 * Enumerate the IDs of the built-in dictionaries.
 *
 * @returns A new array holding the keys of `STATIC_DICTIONARIES`
 */
export function listStaticDictionaries() {
    const builtinIds = Object.keys(STATIC_DICTIONARIES);
    return builtinIds;
}
|
|
98
|
+
/**
 * Check if a dictionary ID is a built-in dictionary.
 *
 * Only own keys of `STATIC_DICTIONARIES` count. The `in` operator would also
 * report names inherited from `Object.prototype` (e.g. 'toString') as built-in.
 *
 * @param id - Candidate dictionary ID
 * @returns `true` when `id` is an own key of `STATIC_DICTIONARIES`, else `false`
 */
export function isBuiltinDictionary(id) {
    return Object.prototype.hasOwnProperty.call(STATIC_DICTIONARIES, id);
}
|
|
104
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/dictionaries/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAqCH,+BAA+B;AAC/B,OAAO,UAAU,MAAM,eAAe,CAAC;AACvC,OAAO,cAAc,MAAM,mBAAmB,CAAC;AAC/C,OAAO,YAAY,MAAM,iBAAiB,CAAC;AAC3C,OAAO,QAAQ,MAAM,aAAa,CAAC;AACnC,OAAO,OAAO,MAAM,YAAY,CAAC;AAEjC;;GAEG;AACH,MAAM,CAAC,MAAM,mBAAmB,GAAG;IACjC,WAAW,EAAE,UAAU;IACvB,eAAe,EAAE,cAAc;IAC/B,aAAa,EAAE,YAAY;IAC3B,SAAS,EAAE,QAAQ;IACnB,QAAQ,EAAE,OAAO;CACT,CAAC;AAkBX;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,EAAsB;IAEtB,MAAM,IAAI,GAAG,mBAAmB,CAAC,EAAE,CAAC,CAAC;IACrC,IAAI,CAAC,IAAI,EAAE,CAAC;QACV,MAAM,IAAI,KAAK,CAAC,8BAA8B,EAAE,EAAE,CAAC,CAAC;IACtD,CAAC;IAED,OAAO,mBAAmB,CAAC,IAAsB,CAAC,CAAC;AACrD,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,IAAoB;IAC/C,MAAM,OAAO,GAAG,IAAI,GAAG,EAA6B,CAAC;IACrD,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAkB,CAAC;IAE3C,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;QACjC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;QAC5C,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,OAAO,CAAC,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IAC/D,CAAC;IAED,OAAO;QACL,EAAE,EAAE,IAAI,CAAC,EAAE;QACX,OAAO,EAAE,IAAI,CAAC,OAAO;QACrB,IAAI,EAAE,IAAI,CAAC,IAAI;QACf,WAAW,EAAE,IAAI,CAAC,WAAW;QAC7B,OAAO;QACP,QAAQ;KACT,CAAC;AACJ,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,sBAAsB,CACpC,EAAU,EACV,QAAoB,EACpB,cAAc,GAAG,UAAU;IAE3B,MAAM,OAAO,GAAG,IAAI,GAAG,EAA6B,CAAC;IACrD,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAC;IAE7C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACzC,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;QAC5B,MAAM,SAAS,GAAG,cAAc,GAAG,CAAC,CAAC;QACrC,OAAO,CAAC,GAAG,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;QAChC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,EAAE,SAAS,CAAC,CAAC;IACrD,CAAC;IAED,OAAO;QACL,EAAE;QACF,OAAO,EAAE,OAAO;QAChB,IAAI,EAAE,EAAE;QACR,WAAW,EAAE,sBAAsB,EAAE,EAAE;QACvC,OAAO;QACP,QAAQ,EAAE,UAAU;KACrB,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,sBAAsB;IACpC,OAAO,MAAM,CAAC,IAAI,CAAC,mBAAmB,CAAyB,CAAC;AAClE,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,mBAAmB,CAA
C,EAAU;IAC5C,OAAO,EAAE,IAAI,mBAAmB,CAAC;AACnC,CAAC"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "json-v1",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"name": "JSON/API Dictionary",
|
|
5
|
+
"description": "Pre-built patterns for JSON and API schema compression",
|
|
6
|
+
"entries": [
|
|
7
|
+
{ "metaToken": 4294934528, "pattern": [788, 25] },
|
|
8
|
+
{ "metaToken": 4294934529, "pattern": [330, 25] },
|
|
9
|
+
{ "metaToken": 4294934530, "pattern": [837, 220] },
|
|
10
|
+
{ "metaToken": 4294934531, "pattern": [1025, 220] },
|
|
11
|
+
{ "metaToken": 4294934532, "pattern": [498, 220] },
|
|
12
|
+
{ "metaToken": 4294934533, "pattern": [1115, 220] },
|
|
13
|
+
{ "metaToken": 4294934534, "pattern": [14196, 220] },
|
|
14
|
+
{ "metaToken": 4294934535, "pattern": [27, 198] },
|
|
15
|
+
{ "metaToken": 4294934536, "pattern": [220, 220] },
|
|
16
|
+
{ "metaToken": 4294934537, "pattern": [220, 220, 220, 220] },
|
|
17
|
+
{ "metaToken": 4294934538, "pattern": [6660, 788] },
|
|
18
|
+
{ "metaToken": 4294934539, "pattern": [828, 330] },
|
|
19
|
+
{ "metaToken": 4294934540, "pattern": [1, 330] },
|
|
20
|
+
{ "metaToken": 4294934541, "pattern": [330, 1374] },
|
|
21
|
+
{ "metaToken": 4294934542, "pattern": [14196, 2970] },
|
|
22
|
+
{ "metaToken": 4294934543, "pattern": [198, 198] },
|
|
23
|
+
{ "metaToken": 4294934544, "pattern": [220, 498] },
|
|
24
|
+
{ "metaToken": 4294934545, "pattern": [330, 788] },
|
|
25
|
+
{ "metaToken": 4294934546, "pattern": [788, 1374] },
|
|
26
|
+
{ "metaToken": 4294934547, "pattern": [330, 837] }
|
|
27
|
+
]
|
|
28
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "markdown-v1",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"name": "Markdown Documentation Dictionary",
|
|
5
|
+
"description": "Pre-built patterns for Markdown documentation compression",
|
|
6
|
+
"entries": [
|
|
7
|
+
{ "metaToken": 4294934528, "pattern": [198, 198] },
|
|
8
|
+
{ "metaToken": 4294934529, "pattern": [14196, 198] },
|
|
9
|
+
{ "metaToken": 4294934530, "pattern": [532, 279] },
|
|
10
|
+
{ "metaToken": 4294934531, "pattern": [6102, 220] },
|
|
11
|
+
{ "metaToken": 4294934532, "pattern": [279, 279] },
|
|
12
|
+
{ "metaToken": 4294934533, "pattern": [61, 220] },
|
|
13
|
+
{ "metaToken": 4294934534, "pattern": [2, 4211] },
|
|
14
|
+
{ "metaToken": 4294934535, "pattern": [7191, 74694] },
|
|
15
|
+
{ "metaToken": 4294934536, "pattern": [27, 198] },
|
|
16
|
+
{ "metaToken": 4294934537, "pattern": [532, 3384] },
|
|
17
|
+
{ "metaToken": 4294934538, "pattern": [2, 198] },
|
|
18
|
+
{ "metaToken": 4294934539, "pattern": [7, 2] },
|
|
19
|
+
{ "metaToken": 4294934540, "pattern": [8, 610] },
|
|
20
|
+
{ "metaToken": 4294934541, "pattern": [220, 220] },
|
|
21
|
+
{ "metaToken": 4294934542, "pattern": [14, 532] },
|
|
22
|
+
{ "metaToken": 4294934543, "pattern": [5765, 5765, 5765] },
|
|
23
|
+
{ "metaToken": 4294934544, "pattern": [2, 1374] },
|
|
24
|
+
{ "metaToken": 4294934545, "pattern": [220, 220, 220, 220] },
|
|
25
|
+
{ "metaToken": 4294934546, "pattern": [1374, 659] },
|
|
26
|
+
{ "metaToken": 4294934547, "pattern": [198, 14196] }
|
|
27
|
+
]
|
|
28
|
+
}
|