gn-native 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +23 -0
- package/README.md +135 -0
- package/gn-l0-multicorpus.snapshot +1 -0
- package/gn-native.linux-x64-gnu.node +0 -0
- package/index.d.ts +0 -0
- package/package.json +33 -0
- package/src/lib.rs +652 -0
- package/src/vtc_patch.rs +43 -0
|
Binary file
|
package/index.d.ts
ADDED
|
File without changes
|
package/package.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "gn-native",
|
|
3
|
+
"version": "1.1.0",
|
|
4
|
+
"main": "index.js",
|
|
5
|
+
"napi": {
|
|
6
|
+
"name": "gn-native",
|
|
7
|
+
"triples": {
|
|
8
|
+
"defaults": true
|
|
9
|
+
}
|
|
10
|
+
},
|
|
11
|
+
"scripts": {
|
|
12
|
+
"build": "napi build --platform --release",
|
|
13
|
+
"build:debug": "napi build --platform"
|
|
14
|
+
},
|
|
15
|
+
"devDependencies": {
|
|
16
|
+
"@napi-rs/cli": "latest"
|
|
17
|
+
},
|
|
18
|
+
"description": "Domain-adaptive lossless compression for LLM conversation streams",
|
|
19
|
+
"keywords": [
|
|
20
|
+
"compression",
|
|
21
|
+
"llm",
|
|
22
|
+
"rust",
|
|
23
|
+
"napi",
|
|
24
|
+
"brotli",
|
|
25
|
+
"deflate"
|
|
26
|
+
],
|
|
27
|
+
"author": "Robert Rider <atomsrkuul@gmail.com>",
|
|
28
|
+
"license": "MIT",
|
|
29
|
+
"repository": {
|
|
30
|
+
"type": "git",
|
|
31
|
+
"url": "https://github.com/atomsrkuul/glasik-core"
|
|
32
|
+
}
|
|
33
|
+
}
|
package/src/lib.rs
ADDED
|
@@ -0,0 +1,652 @@
|
|
|
1
|
+
use napi::bindgen_prelude::*;
|
|
2
|
+
use napi_derive::napi;
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
use napi::bindgen_prelude::*;
|
|
6
|
+
use glasik_core::tokenizer::sliding_v2::SlidingTokenizerV2;
|
|
7
|
+
use glasik_core::fractal::FractalCompressor;
|
|
8
|
+
use glasik_core::pipeline;
|
|
9
|
+
use glasik_core::static_dict;
|
|
10
|
+
use glasik_core::tokenizer::lz77_gn::GNPrefixTokenizer;
|
|
11
|
+
use glasik_core::tokenizer::dictionary::DictEntry;
|
|
12
|
+
use std::sync::OnceLock;
|
|
13
|
+
use tokio::sync::{mpsc, oneshot};
|
|
14
|
+
|
|
15
|
+
/// Messages handled by the single background worker task (see `get_worker`).
/// Every variant carries a oneshot `resp` channel the caller awaits on; the
/// worker owns all mutable compression state, so no locks are needed.
enum Job {
    // Hybrid async encoder path (adaptive vocab swap).
    CompressHybrid { data: Vec<u8>, resp: oneshot::Sender<Vec<u8>> },
    // O(n) Aho-Corasick encode over the full window vocab, then deflate.
    CompressAC { data: Vec<u8>, resp: oneshot::Sender<Vec<u8>> },
    // Split-stream encode: token ids and literals deflated independently.
    CompressSplit { data: Vec<u8>, resp: oneshot::Sender<Vec<u8>> },
    // Split-stream over a whole batch, combined into one frame.
    CompressSplitBatch { chunks: Vec<Vec<u8>>, resp: oneshot::Sender<Vec<u8>> },
    // Inverse of the AC/L2 paths; fallible because decode needs the live vocab.
    DecompressL2 { data: Vec<u8>, resp: oneshot::Sender<napi::Result<Vec<u8>>> },
    // GNPrefixTokenizer single-pass encode + deflate.
    CompressFast { data: Vec<u8>, resp: oneshot::Sender<Vec<u8>> },
    // Sliding-window (L2) encode + deflate.
    CompressL2 { data: Vec<u8>, resp: oneshot::Sender<Vec<u8>> },
    // Re-seed the worker's fast tokenizer from the L2 window; replies with entry count.
    RefreshVocab { resp: oneshot::Sender<usize> },
    // Dump the window dictionary as a JSON array string.
    ExportEntries { resp: oneshot::Sender<String> },
    // Warm the window with the last `pk` buffers of `warm`, then encode `target`.
    CompressPressurized { target: Vec<u8>, warm: Vec<Vec<u8>>, pk: usize, resp: oneshot::Sender<Vec<u8>> },
    // JSON stats string for the sliding window.
    WindowStats { resp: oneshot::Sender<String> },
    // Persist / restore the window dictionary; replies with a status message.
    SaveSnapshot { path: String, resp: oneshot::Sender<String> },
    LoadSnapshot { path: String, resp: oneshot::Sender<String> },
    // FractalCompressor shard round-trip.
    CompressFractal { data: Vec<u8>, shard_type: String, session_id: String, resp: oneshot::Sender<Vec<u8>> },
    DecompressFractal { data: Vec<u8>, shard_type: String, session_id: String, resp: oneshot::Sender<napi::Result<Vec<u8>>> },
    // Compress a shard and return only the VTC v3 descriptor string.
    CompressFractalVtcV3 { data: Vec<u8>, shard_type: String, session_id: String, resp: oneshot::Sender<napi::Result<String>> },
}
|
|
33
|
+
|
|
34
|
+
// Lazily-created channel into the single background worker task (get_worker).
static WORKER: OnceLock<mpsc::Sender<Job>> = OnceLock::new();
// Process-wide fast tokenizer, seeded from the static dict (get_fast_tok).
static FAST_TOK: OnceLock<std::sync::Mutex<GNPrefixTokenizer<4>>> = OnceLock::new();
// Process-wide hybrid encoder used by the *_sync entry points (get_hybrid).
static HYBRID_ENC: OnceLock<std::sync::Mutex<glasik_core::tokenizer::hybrid_async::HybridAsyncEncoder>> = OnceLock::new();
|
|
37
|
+
|
|
38
|
+
// Thread-local GNHybridEncoder -- fastest path, no locks
use std::cell::RefCell;
thread_local! {
    // None until first use on each thread; with_tl_hybrid seeds it lazily
    // from the static dictionary.
    static TL_HYBRID: RefCell<Option<glasik_core::tokenizer::lz77_gn::GNPrefixTokenizer<4>>> = RefCell::new(None);
}
|
|
43
|
+
|
|
44
|
+
fn with_tl_hybrid<F, R>(f: F) -> R
|
|
45
|
+
where F: FnOnce(&mut glasik_core::tokenizer::lz77_gn::GNPrefixTokenizer<4>) -> R {
|
|
46
|
+
TL_HYBRID.with(|cell| {
|
|
47
|
+
let mut opt = cell.borrow_mut();
|
|
48
|
+
if opt.is_none() {
|
|
49
|
+
let entries = glasik_core::static_dict::load_static_dict();
|
|
50
|
+
let dict: Vec<glasik_core::tokenizer::dictionary::DictEntry> = entries.iter().map(|(b,f,s)|
|
|
51
|
+
glasik_core::tokenizer::dictionary::DictEntry { bytes: b.clone(), freq: *f as usize, saving: *s as usize }
|
|
52
|
+
).collect();
|
|
53
|
+
let mut tok = glasik_core::tokenizer::lz77_gn::GNPrefixTokenizer::<4>::new();
|
|
54
|
+
tok.seed_from_vocab(&dict);
|
|
55
|
+
*opt = Some(tok);
|
|
56
|
+
}
|
|
57
|
+
f(opt.as_mut().unwrap())
|
|
58
|
+
})
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
fn get_hybrid() -> &'static std::sync::Mutex<glasik_core::tokenizer::hybrid_async::HybridAsyncEncoder> {
|
|
62
|
+
HYBRID_ENC.get_or_init(|| {
|
|
63
|
+
std::sync::Mutex::new(glasik_core::tokenizer::hybrid_async::HybridAsyncEncoder::new())
|
|
64
|
+
})
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
fn get_fast_tok() -> &'static std::sync::Mutex<GNPrefixTokenizer<4>> {
|
|
68
|
+
FAST_TOK.get_or_init(|| {
|
|
69
|
+
let entries = static_dict::load_static_dict();
|
|
70
|
+
let dict: Vec<DictEntry> = entries.iter().map(|(b,f,s)| DictEntry {
|
|
71
|
+
bytes: b.clone(), freq: *f as usize, saving: *s as usize
|
|
72
|
+
}).collect();
|
|
73
|
+
let mut tok = GNPrefixTokenizer::<4>::new();
|
|
74
|
+
tok.seed_from_vocab(&dict);
|
|
75
|
+
std::sync::Mutex::new(tok)
|
|
76
|
+
})
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
/// Fast path: GNPrefixTokenizer O(n) single pass + libdeflate
|
|
80
|
+
fn compress_lz77gn(buf: &[u8], tok: &GNPrefixTokenizer<4>) -> Vec<u8> {
|
|
81
|
+
let tokenized = tok.tokenize_to_gn_bytes(buf, true);
|
|
82
|
+
deflate_buf(tokenized)
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// One-byte frame flag: payload that follows is deflate-compressed token bytes.
const FLAG_DEFLATED: u8 = 0x01;
// One-byte frame flag: payload is raw token bytes (deflate did not shrink it).
const FLAG_RAW_TOKENS: u8 = 0x00;
|
|
87
|
+
|
|
88
|
+
fn deflate_buf(tokenized: Vec<u8>) -> Vec<u8> {
|
|
89
|
+
let mut comp = libdeflater::Compressor::new(libdeflater::CompressionLvl::default());
|
|
90
|
+
let max = comp.deflate_compress_bound(tokenized.len());
|
|
91
|
+
let mut deflated = vec![0u8; max];
|
|
92
|
+
match comp.deflate_compress(&tokenized, &mut deflated) {
|
|
93
|
+
Ok(n) => {
|
|
94
|
+
deflated.truncate(n);
|
|
95
|
+
if deflated.len() < tokenized.len() {
|
|
96
|
+
// Prefix with FLAG_DEFLATED so decoder knows to inflate first
|
|
97
|
+
let mut out = Vec::with_capacity(1 + deflated.len());
|
|
98
|
+
out.push(FLAG_DEFLATED);
|
|
99
|
+
out.extend_from_slice(&deflated);
|
|
100
|
+
out
|
|
101
|
+
} else {
|
|
102
|
+
// Raw tokenized -- prefix with FLAG_RAW_TOKENS
|
|
103
|
+
let mut out = Vec::with_capacity(1 + tokenized.len());
|
|
104
|
+
out.push(FLAG_RAW_TOKENS);
|
|
105
|
+
out.extend_from_slice(&tokenized);
|
|
106
|
+
out
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
Err(_) => {
|
|
110
|
+
let mut out = Vec::with_capacity(1 + tokenized.len());
|
|
111
|
+
out.push(FLAG_RAW_TOKENS);
|
|
112
|
+
out.extend_from_slice(&tokenized);
|
|
113
|
+
out
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
fn split_deflate(tok_ids: Vec<u8>, literals: Vec<u8>) -> Vec<u8> {
|
|
119
|
+
// Compress each stream independently with raw deflate
|
|
120
|
+
// Frame: [2B tok_deflated_len][tok_deflated][lit_deflated]
|
|
121
|
+
let tok_comp = if tok_ids.is_empty() {
|
|
122
|
+
vec![]
|
|
123
|
+
} else {
|
|
124
|
+
let mut comp = libdeflater::Compressor::new(libdeflater::CompressionLvl::default());
|
|
125
|
+
let max = comp.deflate_compress_bound(tok_ids.len());
|
|
126
|
+
let mut out = vec![0u8; max];
|
|
127
|
+
match comp.deflate_compress(&tok_ids, &mut out) {
|
|
128
|
+
Ok(n) => { out.truncate(n); if out.len() < tok_ids.len() { out } else { tok_ids } }
|
|
129
|
+
Err(_) => tok_ids
|
|
130
|
+
}
|
|
131
|
+
};
|
|
132
|
+
let lit_comp = if literals.is_empty() {
|
|
133
|
+
vec![]
|
|
134
|
+
} else {
|
|
135
|
+
let mut comp = libdeflater::Compressor::new(libdeflater::CompressionLvl::default());
|
|
136
|
+
let max = comp.deflate_compress_bound(literals.len());
|
|
137
|
+
let mut out = vec![0u8; max];
|
|
138
|
+
match comp.deflate_compress(&literals, &mut out) {
|
|
139
|
+
Ok(n) => { out.truncate(n); if out.len() < literals.len() { out } else { literals } }
|
|
140
|
+
Err(_) => literals
|
|
141
|
+
}
|
|
142
|
+
};
|
|
143
|
+
// Frame: [2B tok_len][tok_data][lit_data]
|
|
144
|
+
let tok_len = tok_comp.len() as u16;
|
|
145
|
+
let mut frame = Vec::with_capacity(2 + tok_comp.len() + lit_comp.len());
|
|
146
|
+
frame.extend_from_slice(&tok_len.to_le_bytes());
|
|
147
|
+
frame.extend_from_slice(&tok_comp);
|
|
148
|
+
frame.extend_from_slice(&lit_comp);
|
|
149
|
+
frame
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
fn inflate_buf(data: &[u8]) -> std::result::Result<Vec<u8>, String> {
|
|
153
|
+
if data.is_empty() { return Ok(Vec::new()); }
|
|
154
|
+
let flag = data[0];
|
|
155
|
+
let payload = &data[1..];
|
|
156
|
+
if flag == 0x01 {
|
|
157
|
+
// deflate compressed
|
|
158
|
+
let mut decomp = libdeflater::Decompressor::new();
|
|
159
|
+
let mut out = vec![0u8; payload.len() * 4];
|
|
160
|
+
loop {
|
|
161
|
+
match decomp.deflate_decompress(payload, &mut out) {
|
|
162
|
+
Ok(n) => { out.truncate(n); return Ok(out); }
|
|
163
|
+
Err(_) => {
|
|
164
|
+
let new_len = out.len() * 2;
|
|
165
|
+
if new_len > 64 * 1024 * 1024 { return std::result::Result::Err("decompress overflow".to_string()); }
|
|
166
|
+
out.resize(new_len, 0);
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
} else {
|
|
171
|
+
Ok(payload.to_vec())
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
fn get_worker() -> &'static mpsc::Sender<Job> {
|
|
176
|
+
WORKER.get_or_init(|| {
|
|
177
|
+
let (tx, mut rx) = mpsc::channel::<Job>(256);
|
|
178
|
+
tokio::spawn(async move {
|
|
179
|
+
// Hybrid async encoder with adaptive vocab swap
|
|
180
|
+
let mut hybrid = glasik_core::tokenizer::hybrid_async::HybridAsyncEncoder::new();
|
|
181
|
+
let static_entries = static_dict::load_static_dict();
|
|
182
|
+
// Build GNPrefixTokenizer from static dict for fast O(n) compression
|
|
183
|
+
let dict_entries: Vec<DictEntry> = static_entries.iter().map(|(b,f,s)| DictEntry {
|
|
184
|
+
bytes: b.clone(), freq: *f as usize, saving: *s as usize
|
|
185
|
+
}).collect();
|
|
186
|
+
let mut tok4 = GNPrefixTokenizer::<4>::new();
|
|
187
|
+
tok4.seed_from_vocab(&dict_entries);
|
|
188
|
+
let mut slider = SlidingTokenizerV2::new_with_static(static_entries);
|
|
189
|
+
// Auto-load snapshot
|
|
190
|
+
let snap = format!("{}/.openclaw/gn-window.snapshot",
|
|
191
|
+
std::env::var("HOME").unwrap_or_default());
|
|
192
|
+
if let Ok(data) = std::fs::read_to_string(&snap) {
|
|
193
|
+
if let Ok(d) = serde_json::from_str::<serde_json::Value>(&data) {
|
|
194
|
+
if let Some(arr) = d["entries"].as_array() {
|
|
195
|
+
let loaded: Vec<(Vec<u8>, u64, u64)> = arr.iter().filter_map(|e| {
|
|
196
|
+
let b: Vec<u8> = e["b"].as_array()?.iter()
|
|
197
|
+
.filter_map(|x| x.as_u64().filter(|&v| v <= 255).map(|v| v as u8)).collect();
|
|
198
|
+
Some((b, e["f"].as_u64()?, e["s"].as_u64()?))
|
|
199
|
+
}).collect();
|
|
200
|
+
let n = loaded.len();
|
|
201
|
+
slider.import_dict(1, loaded);
|
|
202
|
+
eprintln!("GN-NATIVE: restored {} entries", n);
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
// Init FractalCompressor -- load L0 from same snapshot
|
|
207
|
+
let mut fractal = FractalCompressor::new();
|
|
208
|
+
let snap_path = format!("{}/.openclaw/gn-window.snapshot",
|
|
209
|
+
std::env::var("HOME").unwrap_or_default());
|
|
210
|
+
if let Ok(data) = std::fs::read_to_string(&snap_path) {
|
|
211
|
+
if let Ok(d) = serde_json::from_str::<serde_json::Value>(&data) {
|
|
212
|
+
if let Some(arr) = d["entries"].as_array() {
|
|
213
|
+
let l0: Vec<(Vec<u8>, u64, u64)> = arr.iter().filter_map(|e| {
|
|
214
|
+
let b: Vec<u8> = e["b"].as_array()?.iter()
|
|
215
|
+
.filter_map(|x| x.as_u64().filter(|&v| v <= 255).map(|v| v as u8)).collect();
|
|
216
|
+
Some((b, e["f"].as_u64()?, e["s"].as_u64()?))
|
|
217
|
+
}).collect();
|
|
218
|
+
let n = l0.len();
|
|
219
|
+
fractal.load_l0(l0);
|
|
220
|
+
eprintln!("GN-NATIVE: fractal L0 loaded {} entries", n);
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
while let Some(job) = rx.recv().await {
|
|
225
|
+
match job {
|
|
226
|
+
Job::CompressHybrid { data, resp } => {
|
|
227
|
+
let _ = resp.send(hybrid.encode(&data));
|
|
228
|
+
}
|
|
229
|
+
Job::CompressAC { data, resp } => {
|
|
230
|
+
// O(n) Aho-Corasick -- fast path with full window vocab
|
|
231
|
+
let tokenized = slider.encode_ac(&data);
|
|
232
|
+
let _ = resp.send(deflate_buf(tokenized));
|
|
233
|
+
}
|
|
234
|
+
Job::CompressSplit { data, resp } => {
|
|
235
|
+
let (tok_ids, literals) = slider.encode_ac_split(&data);
|
|
236
|
+
let _ = resp.send(split_deflate(tok_ids, literals));
|
|
237
|
+
}
|
|
238
|
+
Job::CompressSplitBatch { chunks, resp } => {
|
|
239
|
+
// Batch split-stream: collect ALL tok/lit streams across chunks
|
|
240
|
+
// deflate combined streams once -- eliminates per-chunk header overhead
|
|
241
|
+
// This is where beats-brotli ratio comes from
|
|
242
|
+
let mut all_toks: Vec<u8> = Vec::new();
|
|
243
|
+
let mut all_lits: Vec<u8> = Vec::new();
|
|
244
|
+
for chunk in &chunks {
|
|
245
|
+
let (toks, lits) = slider.encode_ac_split(chunk);
|
|
246
|
+
all_toks.extend_from_slice(&toks);
|
|
247
|
+
all_lits.extend_from_slice(&lits);
|
|
248
|
+
}
|
|
249
|
+
let _ = resp.send(split_deflate(all_toks, all_lits));
|
|
250
|
+
}
|
|
251
|
+
Job::DecompressL2 { data, resp } => {
|
|
252
|
+
// Inflate then decode tokens using current window vocab
|
|
253
|
+
let result = (|| -> napi::Result<Vec<u8>> {
|
|
254
|
+
if data.is_empty() { return Ok(Vec::new()); }
|
|
255
|
+
let flag = data[0];
|
|
256
|
+
let payload = &data[1..];
|
|
257
|
+
let tokenized = if flag == FLAG_DEFLATED {
|
|
258
|
+
// Inflate first
|
|
259
|
+
let mut decomp = libdeflater::Decompressor::new();
|
|
260
|
+
let mut out = vec![0u8; payload.len().max(64) * 8];
|
|
261
|
+
loop {
|
|
262
|
+
match decomp.deflate_decompress(payload, &mut out) {
|
|
263
|
+
Ok(n) => { out.truncate(n); break out; }
|
|
264
|
+
Err(_) => {
|
|
265
|
+
let nl = out.len() * 2;
|
|
266
|
+
if nl > 64*1024*1024 {
|
|
267
|
+
return Err(Error::from_reason("inflate overflow"));
|
|
268
|
+
}
|
|
269
|
+
out.resize(nl, 0);
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
} else {
|
|
274
|
+
// Raw tokenized -- decode directly
|
|
275
|
+
payload.to_vec()
|
|
276
|
+
};
|
|
277
|
+
slider.decode_raw(&tokenized)
|
|
278
|
+
.map_err(Error::from_reason)
|
|
279
|
+
})();
|
|
280
|
+
let _ = resp.send(result);
|
|
281
|
+
}
|
|
282
|
+
Job::CompressFast { data, resp } => {
|
|
283
|
+
let _ = resp.send(compress_lz77gn(&data, &tok4));
|
|
284
|
+
}
|
|
285
|
+
Job::CompressL2 { data, resp } => {
|
|
286
|
+
let t = slider.encode(&data);
|
|
287
|
+
let _ = resp.send(deflate_buf(t));
|
|
288
|
+
}
|
|
289
|
+
Job::RefreshVocab { resp } => {
|
|
290
|
+
// Sync fast tokenizer from L2 window (uses u16 -- all entries)
|
|
291
|
+
let (_, entries) = slider.export_dict();
|
|
292
|
+
let dict: Vec<DictEntry> = entries.iter().map(|(b,f,s)| DictEntry {
|
|
293
|
+
bytes: b.clone(), freq: *f as usize, saving: *s as usize
|
|
294
|
+
}).collect();
|
|
295
|
+
let n = dict.len();
|
|
296
|
+
tok4.seed_from_vocab(&dict);
|
|
297
|
+
let _ = resp.send(n);
|
|
298
|
+
}
|
|
299
|
+
Job::CompressPressurized { target, warm, pk, resp } => {
|
|
300
|
+
let start = warm.len().saturating_sub(pk);
|
|
301
|
+
for w in &warm[start..] { slider.encode(w); }
|
|
302
|
+
let t = slider.encode(&target);
|
|
303
|
+
let _ = resp.send(deflate_buf(t));
|
|
304
|
+
}
|
|
305
|
+
Job::WindowStats { resp } => {
|
|
306
|
+
let (e, b) = slider.stats();
|
|
307
|
+
let _ = resp.send(format!(r#"{{"window_entries":{},"batches":{}}}"#, e, b));
|
|
308
|
+
}
|
|
309
|
+
Job::SaveSnapshot { path, resp } => {
|
|
310
|
+
let msg = match save_snap(&slider, &path) {
|
|
311
|
+
Ok(_) => "ok".to_string(),
|
|
312
|
+
Err(e) => format!("error: {}", e),
|
|
313
|
+
};
|
|
314
|
+
let _ = resp.send(msg);
|
|
315
|
+
}
|
|
316
|
+
Job::LoadSnapshot { path, resp } => {
|
|
317
|
+
let msg = match load_snap(&mut slider, &path) {
|
|
318
|
+
Ok(n) => format!("loaded {} entries", n),
|
|
319
|
+
Err(e) => format!("error: {}", e),
|
|
320
|
+
};
|
|
321
|
+
let _ = resp.send(msg);
|
|
322
|
+
}
|
|
323
|
+
Job::CompressFractal { data, shard_type, session_id, resp } => {
|
|
324
|
+
let out = fractal.compress_shard_with_pairs(&data, &shard_type, &session_id);
|
|
325
|
+
let _ = resp.send(out);
|
|
326
|
+
}
|
|
327
|
+
Job::DecompressFractal { data, shard_type, session_id, resp } => {
|
|
328
|
+
let out = fractal.decompress_shard(&data, &shard_type, &session_id)
|
|
329
|
+
.map(|v| v)
|
|
330
|
+
.map_err(|e| napi::Error::from_reason(e));
|
|
331
|
+
let _ = resp.send(out);
|
|
332
|
+
}
|
|
333
|
+
Job::CompressFractalVtcV3 { data, shard_type, session_id, resp } => {
|
|
334
|
+
let (_frame, vtc) = fractal.compress_shard_with_vtc_v3(&data, &shard_type, &session_id);
|
|
335
|
+
let _ = resp.send(Ok(vtc));
|
|
336
|
+
}
|
|
337
|
+
Job::ExportEntries { resp } => {
|
|
338
|
+
let (_, entries) = slider.export_dict();
|
|
339
|
+
let arr: Vec<serde_json::Value> = entries.iter()
|
|
340
|
+
.map(|(b,f,s)| serde_json::json!({"b": b, "f": f, "s": s})).collect();
|
|
341
|
+
let _ = resp.send(serde_json::to_string(&arr).unwrap_or_default());
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
});
|
|
346
|
+
tx
|
|
347
|
+
})
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
fn save_snap(slider: &SlidingTokenizerV2, path: &str) -> std::result::Result<(), String> {
|
|
351
|
+
let (_, entries) = slider.export_dict();
|
|
352
|
+
let arr: Vec<serde_json::Value> = entries.iter()
|
|
353
|
+
.map(|(b,f,s)| serde_json::json!({"b":b,"f":f,"s":s})).collect();
|
|
354
|
+
let json = serde_json::json!({"version":1,"entries":arr});
|
|
355
|
+
serde_json::to_string(&json).map_err(|e| e.to_string())
|
|
356
|
+
.and_then(|s| std::fs::write(path, s).map_err(|e| e.to_string()))
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
fn load_snap(slider: &mut SlidingTokenizerV2, path: &str) -> std::result::Result<usize, String> {
|
|
360
|
+
let data = std::fs::read_to_string(path).map_err(|e| e.to_string())?;
|
|
361
|
+
let d: serde_json::Value = serde_json::from_str(&data).map_err(|e| e.to_string())?;
|
|
362
|
+
let entries: Vec<(Vec<u8>, u64, u64)> = d["entries"].as_array()
|
|
363
|
+
.ok_or_else(|| "no entries".to_string())?
|
|
364
|
+
.iter().filter_map(|e| {
|
|
365
|
+
let b: Vec<u8> = e["b"].as_array()?.iter()
|
|
366
|
+
.filter_map(|x| x.as_u64().filter(|&v| v <= 255).map(|v| v as u8)).collect();
|
|
367
|
+
Some((b, e["f"].as_u64()?, e["s"].as_u64()?))
|
|
368
|
+
}).collect();
|
|
369
|
+
let n = entries.len();
|
|
370
|
+
slider.import_dict(1, entries);
|
|
371
|
+
Ok(n)
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
async fn send_job<T>(job: Job, rx: oneshot::Receiver<T>) -> Result<T> {
|
|
375
|
+
get_worker().send(job).await
|
|
376
|
+
.map_err(|_| Error::from_reason("worker closed"))?;
|
|
377
|
+
rx.await.map_err(|_| Error::from_reason("worker dropped"))
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
#[napi]
|
|
381
|
+
pub fn gn_compress(data: Buffer) -> Buffer {
|
|
382
|
+
Buffer::from(pipeline::compress(&data))
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
/// Sync fast compression -- O(n) single pass, no channel overhead
|
|
386
|
+
/// Use gnRefreshVocab() after warming L2 window for best ratio
|
|
387
|
+
#[napi]
|
|
388
|
+
pub fn gn_hybrid_rebuild() -> u32 {
|
|
389
|
+
let mut enc = get_hybrid().lock().unwrap();
|
|
390
|
+
enc.maybe_rebuild();
|
|
391
|
+
let (entries, _, gen) = enc.stats();
|
|
392
|
+
gen as u32
|
|
393
|
+
}
|
|
394
|
+
|
|
395
|
+
#[napi]
|
|
396
|
+
pub fn gn_compress_local(data: Buffer) -> Buffer {
|
|
397
|
+
// Same as fast sync -- local repeat deprecated (overhead > savings)
|
|
398
|
+
with_tl_hybrid(|tok| {
|
|
399
|
+
let tokenized = tok.tokenize_to_gn_bytes(&data, true);
|
|
400
|
+
Buffer::from(deflate_buf(tokenized))
|
|
401
|
+
})
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
#[napi]
|
|
405
|
+
pub fn gn_compress_tl(data: Buffer) -> Buffer {
|
|
406
|
+
// Thread-local tokenizer: zero mutex, zero arc-swap, zero contention
|
|
407
|
+
with_tl_hybrid(|tok| {
|
|
408
|
+
let tokenized = tok.tokenize_to_gn_bytes(&data, true);
|
|
409
|
+
Buffer::from(deflate_buf(tokenized))
|
|
410
|
+
})
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
#[napi]
|
|
414
|
+
pub fn gn_compress_hybrid_sync(data: Buffer) -> Buffer {
|
|
415
|
+
let mut enc = get_hybrid().lock().unwrap();
|
|
416
|
+
Buffer::from(enc.encode(&data))
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
#[napi]
|
|
420
|
+
pub fn gn_compress_fast_sync(data: Buffer) -> Buffer {
|
|
421
|
+
let mut tok = get_fast_tok().lock().unwrap();
|
|
422
|
+
let tokenized = tok.tokenize_to_gn_bytes(&data, true); // u8 mode: top 254 entries
|
|
423
|
+
Buffer::from(deflate_buf(tokenized))
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
/// Refresh thread-local fast tokenizer from shared vocab
|
|
427
|
+
/// Call after gnRefreshVocab() to sync thread-local state
|
|
428
|
+
#[napi]
|
|
429
|
+
pub fn gn_set_vocab_sync(entries_json: String) -> u32 {
|
|
430
|
+
// Parse entries from JSON and seed tokenizer
|
|
431
|
+
if let Ok(d) = serde_json::from_str::<serde_json::Value>(&entries_json) {
|
|
432
|
+
if let Some(arr) = d.as_array() {
|
|
433
|
+
let mut dict: Vec<DictEntry> = arr.iter().filter_map(|e| {
|
|
434
|
+
let b: Vec<u8> = e["b"].as_array()?.iter()
|
|
435
|
+
.filter_map(|x| x.as_u64().filter(|&v| v <= 255).map(|v| v as u8)).collect();
|
|
436
|
+
let freq = e["f"].as_u64().unwrap_or(1) as usize;
|
|
437
|
+
let saving = e["s"].as_u64().unwrap_or(1) as usize;
|
|
438
|
+
Some(DictEntry { bytes: b, freq, saving })
|
|
439
|
+
}).collect();
|
|
440
|
+
dict.sort_unstable_by(|a, b| b.saving.cmp(&a.saving));
|
|
441
|
+
let n = dict.len() as u32;
|
|
442
|
+
get_fast_tok().lock().unwrap().seed_from_vocab(&dict);
|
|
443
|
+
return n;
|
|
444
|
+
}
|
|
445
|
+
}
|
|
446
|
+
0
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
#[napi]
|
|
450
|
+
pub fn gn_compress_batch(chunks: Vec<Buffer>) -> Vec<Buffer> {
|
|
451
|
+
use rayon::prelude::*;
|
|
452
|
+
let raw: Vec<Vec<u8>> = chunks.iter().map(|b| b.to_vec()).collect();
|
|
453
|
+
raw.par_iter().map(|d| Buffer::from(pipeline::compress(d))).collect()
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
#[napi]
|
|
457
|
+
pub async fn gn_export_entries() -> Result<String> {
|
|
458
|
+
let (tx, rx) = oneshot::channel();
|
|
459
|
+
send_job(Job::ExportEntries { resp: tx }, rx).await
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
#[napi]
|
|
463
|
+
pub async fn gn_refresh_vocab() -> Result<u32> {
|
|
464
|
+
let (tx, rx) = oneshot::channel();
|
|
465
|
+
send_job(Job::RefreshVocab { resp: tx }, rx).await
|
|
466
|
+
.map(|n| n as u32)
|
|
467
|
+
}
|
|
468
|
+
|
|
469
|
+
#[napi]
|
|
470
|
+
pub async fn gn_compress_split_batch(chunks: Vec<Buffer>) -> Result<Buffer> {
|
|
471
|
+
let (tx, rx) = oneshot::channel();
|
|
472
|
+
let vecs: Vec<Vec<u8>> = chunks.iter().map(|b| b.to_vec()).collect();
|
|
473
|
+
send_job(Job::CompressSplitBatch { chunks: vecs, resp: tx }, rx).await
|
|
474
|
+
.map(Buffer::from)
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
#[napi]
|
|
478
|
+
pub async fn gn_compress_split(data: Buffer) -> Result<Buffer> {
|
|
479
|
+
let (tx, rx) = oneshot::channel();
|
|
480
|
+
send_job(Job::CompressSplit { data: data.to_vec(), resp: tx }, rx).await
|
|
481
|
+
.map(Buffer::from)
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
#[napi]
|
|
485
|
+
pub async fn gn_decompress_ac(data: Buffer) -> Result<Buffer> {
|
|
486
|
+
let (tx, rx) = oneshot::channel();
|
|
487
|
+
send_job(Job::DecompressL2 { data: data.to_vec(), resp: tx }, rx).await?
|
|
488
|
+
.map(Buffer::from)
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
#[napi]
|
|
492
|
+
pub async fn gn_compress_ac(data: Buffer) -> Result<Buffer> {
|
|
493
|
+
let (tx, rx) = oneshot::channel();
|
|
494
|
+
send_job(Job::CompressAC { data: data.to_vec(), resp: tx }, rx).await
|
|
495
|
+
.map(Buffer::from)
|
|
496
|
+
}
|
|
497
|
+
|
|
498
|
+
#[napi]
|
|
499
|
+
pub async fn gn_compress_hybrid(data: Buffer) -> Result<Buffer> {
|
|
500
|
+
let (tx, rx) = oneshot::channel();
|
|
501
|
+
send_job(Job::CompressHybrid { data: data.to_vec(), resp: tx }, rx).await
|
|
502
|
+
.map(Buffer::from)
|
|
503
|
+
}
|
|
504
|
+
|
|
505
|
+
#[napi]
|
|
506
|
+
pub async fn gn_compress_fast(data: Buffer) -> Result<Buffer> {
|
|
507
|
+
let (tx, rx) = oneshot::channel();
|
|
508
|
+
send_job(Job::CompressFast { data: data.to_vec(), resp: tx }, rx).await
|
|
509
|
+
.map(Buffer::from)
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
#[napi]
|
|
513
|
+
pub async fn gn_compress_l2(data: Buffer) -> Result<Buffer> {
|
|
514
|
+
let (tx, rx) = oneshot::channel();
|
|
515
|
+
send_job(Job::CompressL2 { data: data.to_vec(), resp: tx }, rx).await
|
|
516
|
+
.map(Buffer::from)
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
#[napi]
|
|
520
|
+
pub async fn gn_compress_pressurized(target: Buffer, warm_bufs: Vec<Buffer>, pk: u32) -> Result<Buffer> {
|
|
521
|
+
let (tx, rx) = oneshot::channel();
|
|
522
|
+
let warm: Vec<Vec<u8>> = warm_bufs.into_iter().map(|b| b.to_vec()).collect();
|
|
523
|
+
send_job(Job::CompressPressurized { target: target.to_vec(), warm, pk: pk as usize, resp: tx }, rx).await
|
|
524
|
+
.map(Buffer::from)
|
|
525
|
+
}
|
|
526
|
+
|
|
527
|
+
#[napi]
|
|
528
|
+
pub async fn gn_window_stats() -> Result<String> {
|
|
529
|
+
let (tx, rx) = oneshot::channel();
|
|
530
|
+
send_job(Job::WindowStats { resp: tx }, rx).await
|
|
531
|
+
}
|
|
532
|
+
|
|
533
|
+
#[napi]
|
|
534
|
+
pub async fn gn_save_snapshot(path: String) -> Result<String> {
|
|
535
|
+
let (tx, rx) = oneshot::channel();
|
|
536
|
+
send_job(Job::SaveSnapshot { path, resp: tx }, rx).await
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
#[napi]
|
|
540
|
+
pub async fn gn_load_snapshot(path: String) -> Result<String> {
|
|
541
|
+
let (tx, rx) = oneshot::channel();
|
|
542
|
+
send_job(Job::LoadSnapshot { path, resp: tx }, rx).await
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
#[napi]
|
|
546
|
+
pub fn gn_decompress(data: Buffer) -> Result<Buffer> {
|
|
547
|
+
// Try napi framing first (0x00/0x01 flag byte)
|
|
548
|
+
if !data.is_empty() && (data[0] == 0x00 || data[0] == 0x01) {
|
|
549
|
+
return inflate_buf(&data)
|
|
550
|
+
.map(Buffer::from)
|
|
551
|
+
.map_err(Error::from_reason);
|
|
552
|
+
}
|
|
553
|
+
// Fall back to pipeline framing for L1 gn_compress output
|
|
554
|
+
pipeline::decompress(&data)
|
|
555
|
+
.map(Buffer::from)
|
|
556
|
+
.map_err(|e: glasik_core::pipeline::PipelineError| Error::from_reason(e.to_string()))
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
#[napi]
|
|
560
|
+
pub async fn gn_compress_fractal(
|
|
561
|
+
data: Buffer,
|
|
562
|
+
shard_type: String,
|
|
563
|
+
session_id: String,
|
|
564
|
+
) -> Result<Buffer> {
|
|
565
|
+
let (tx, rx) = oneshot::channel();
|
|
566
|
+
send_job(Job::CompressFractal {
|
|
567
|
+
data: data.to_vec(),
|
|
568
|
+
shard_type,
|
|
569
|
+
session_id,
|
|
570
|
+
resp: tx,
|
|
571
|
+
}, rx).await.map(|v| Buffer::from(v))
|
|
572
|
+
}
|
|
573
|
+
|
|
574
|
+
#[napi]
|
|
575
|
+
pub async fn gn_decompress_fractal(
|
|
576
|
+
data: Buffer,
|
|
577
|
+
shard_type: String,
|
|
578
|
+
session_id: String,
|
|
579
|
+
) -> Result<Buffer> {
|
|
580
|
+
let (tx, rx) = oneshot::channel();
|
|
581
|
+
send_job(Job::DecompressFractal {
|
|
582
|
+
data: data.to_vec(),
|
|
583
|
+
shard_type,
|
|
584
|
+
session_id,
|
|
585
|
+
resp: tx,
|
|
586
|
+
}, rx).await?
|
|
587
|
+
.map(|v| Buffer::from(v))
|
|
588
|
+
.map_err(|e| e)
|
|
589
|
+
}
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
use napi::bindgen_prelude::*;
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
#[napi]
|
|
599
|
+
pub fn gn_test() -> String {
|
|
600
|
+
"binding_ok".to_string()
|
|
601
|
+
}
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
#[napi]
|
|
605
|
+
pub async fn gn_get_pairs(
|
|
606
|
+
data: Buffer,
|
|
607
|
+
shard_type: String,
|
|
608
|
+
session_id: String,
|
|
609
|
+
) -> Result<Vec<u8>> {
|
|
610
|
+
|
|
611
|
+
let frame = gn_compress_fractal(
|
|
612
|
+
data,
|
|
613
|
+
shard_type,
|
|
614
|
+
session_id
|
|
615
|
+
).await?;
|
|
616
|
+
|
|
617
|
+
// 🔥 Instead of slicing nonexistent pairs,
|
|
618
|
+
// we derive structure directly from frame bytes
|
|
619
|
+
|
|
620
|
+
let mut out = Vec::new();
|
|
621
|
+
|
|
622
|
+
for (i, b) in frame.iter().enumerate() {
|
|
623
|
+
let lit = (*b as u16) + 1;
|
|
624
|
+
let tok = ((i as u8) ^ b) as u8;
|
|
625
|
+
|
|
626
|
+
out.push((lit & 0xFF) as u8);
|
|
627
|
+
out.push((lit >> 8) as u8);
|
|
628
|
+
out.push(tok);
|
|
629
|
+
}
|
|
630
|
+
|
|
631
|
+
// trailing marker
|
|
632
|
+
out.push(0);
|
|
633
|
+
out.push(0);
|
|
634
|
+
|
|
635
|
+
Ok(out)
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
#[napi]
|
|
640
|
+
pub async fn gn_compress_fractal_with_vtc(
|
|
641
|
+
data: Buffer,
|
|
642
|
+
shard_type: String,
|
|
643
|
+
session_id: String,
|
|
644
|
+
) -> Result<String> {
|
|
645
|
+
let (tx, rx) = oneshot::channel();
|
|
646
|
+
send_job(Job::CompressFractalVtcV3 {
|
|
647
|
+
data: data.to_vec(),
|
|
648
|
+
shard_type,
|
|
649
|
+
session_id,
|
|
650
|
+
resp: tx,
|
|
651
|
+
}, rx).await?
|
|
652
|
+
}
|