roxify 1.13.7 → 1.13.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +1 -8
- package/dist/stub-progress.d.ts +4 -4
- package/dist/stub-progress.js +4 -4
- package/dist/utils/decoder.d.ts +46 -2
- package/dist/utils/decoder.js +248 -38
- package/dist/utils/ecc.js +0 -1
- package/dist/utils/encoder.d.ts +30 -1
- package/dist/utils/encoder.js +34 -18
- package/dist/utils/inspection.d.ts +1 -1
- package/dist/utils/inspection.js +2 -2
- package/dist/utils/robust-audio.js +0 -13
- package/dist/utils/robust-image.js +0 -26
- package/package.json +12 -29
- package/roxify_native-aarch64-apple-darwin.node +0 -0
- package/roxify_native-aarch64-pc-windows-msvc.node +0 -0
- package/roxify_native-aarch64-unknown-linux-gnu.node +0 -0
- package/roxify_native-i686-pc-windows-msvc.node +0 -0
- package/roxify_native-i686-unknown-linux-gnu.node +0 -0
- package/{dist/rox-macos-universal → roxify_native-universal-apple-darwin.node} +0 -0
- package/roxify_native-x86_64-apple-darwin.node +0 -0
- package/roxify_native-x86_64-pc-windows-msvc.node +0 -0
- package/roxify_native-x86_64-unknown-linux-gnu.node +0 -0
- package/scripts/postinstall.cjs +23 -2
- package/Cargo.toml +0 -91
- package/dist/roxify_native +0 -0
- package/dist/roxify_native-macos-arm64 +0 -0
- package/dist/roxify_native-macos-x64 +0 -0
- package/dist/roxify_native.exe +0 -0
- package/native/archive.rs +0 -220
- package/native/audio.rs +0 -151
- package/native/bench_hybrid.rs +0 -145
- package/native/bwt.rs +0 -56
- package/native/context_mixing.rs +0 -117
- package/native/core.rs +0 -378
- package/native/crypto.rs +0 -209
- package/native/encoder.rs +0 -405
- package/native/hybrid.rs +0 -297
- package/native/image_utils.rs +0 -82
- package/native/io_advice.rs +0 -43
- package/native/io_ntfs_optimized.rs +0 -99
- package/native/lib.rs +0 -480
- package/native/main.rs +0 -842
- package/native/mtf.rs +0 -106
- package/native/packer.rs +0 -604
- package/native/png_chunk_writer.rs +0 -146
- package/native/png_utils.rs +0 -554
- package/native/pool.rs +0 -101
- package/native/progress.rs +0 -142
- package/native/rans.rs +0 -149
- package/native/rans_byte.rs +0 -286
- package/native/reconstitution.rs +0 -623
- package/native/streaming.rs +0 -189
- package/native/streaming_decode.rs +0 -625
- package/native/streaming_encode.rs +0 -684
- package/native/test_small_bwt.rs +0 -31
- package/native/test_stages.rs +0 -70
package/native/bench_hybrid.rs
DELETED
|
@@ -1,145 +0,0 @@
|
|
|
1
|
-
use std::time::Instant;
|
|
2
|
-
|
|
3
|
-
mod rans_byte;
|
|
4
|
-
mod bwt;
|
|
5
|
-
mod mtf;
|
|
6
|
-
mod context_mixing;
|
|
7
|
-
mod pool;
|
|
8
|
-
mod hybrid;
|
|
9
|
-
|
|
10
|
-
fn bench_roundtrip(name: &str, data: &[u8]) {
|
|
11
|
-
let compressor = hybrid::HybridCompressor::new();
|
|
12
|
-
|
|
13
|
-
let start = Instant::now();
|
|
14
|
-
let (compressed, stats) = compressor.compress(data).unwrap();
|
|
15
|
-
let compress_time = start.elapsed();
|
|
16
|
-
|
|
17
|
-
let start = Instant::now();
|
|
18
|
-
let decompressed = compressor.decompress(&compressed).unwrap();
|
|
19
|
-
let decompress_time = start.elapsed();
|
|
20
|
-
|
|
21
|
-
let ratio = (compressed.len() as f64) / (data.len() as f64) * 100.0;
|
|
22
|
-
let compress_mbps = (data.len() as f64 / 1_048_576.0) / compress_time.as_secs_f64();
|
|
23
|
-
let decompress_mbps = (data.len() as f64 / 1_048_576.0) / decompress_time.as_secs_f64();
|
|
24
|
-
|
|
25
|
-
assert_eq!(decompressed, data, "ROUND-TRIP FAILED for {}", name);
|
|
26
|
-
|
|
27
|
-
println!("=== {} ===", name);
|
|
28
|
-
println!(" Input: {} bytes", data.len());
|
|
29
|
-
println!(" Compressed: {} bytes ({:.1}%)", compressed.len(), ratio);
|
|
30
|
-
println!(" Reduction: {:.1}%", 100.0 - ratio);
|
|
31
|
-
println!(" Compress: {:.1} ms ({:.1} MB/s)", compress_time.as_secs_f64() * 1000.0, compress_mbps);
|
|
32
|
-
println!(" Decompress: {:.1} ms ({:.1} MB/s)", decompress_time.as_secs_f64() * 1000.0, decompress_mbps);
|
|
33
|
-
println!(" Entropy: {:.2} bits/byte", stats.entropy_bits);
|
|
34
|
-
println!();
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
fn bench_zstd(name: &str, data: &[u8], level: i32) {
|
|
38
|
-
let start = Instant::now();
|
|
39
|
-
let compressed = zstd::encode_all(std::io::Cursor::new(data), level).unwrap();
|
|
40
|
-
let compress_time = start.elapsed();
|
|
41
|
-
|
|
42
|
-
let start = Instant::now();
|
|
43
|
-
let decompressed = zstd::decode_all(std::io::Cursor::new(&compressed)).unwrap();
|
|
44
|
-
let decompress_time = start.elapsed();
|
|
45
|
-
|
|
46
|
-
let ratio = (compressed.len() as f64) / (data.len() as f64) * 100.0;
|
|
47
|
-
let compress_mbps = (data.len() as f64 / 1_048_576.0) / compress_time.as_secs_f64();
|
|
48
|
-
let decompress_mbps = (data.len() as f64 / 1_048_576.0) / decompress_time.as_secs_f64();
|
|
49
|
-
|
|
50
|
-
assert_eq!(decompressed, data);
|
|
51
|
-
|
|
52
|
-
println!("=== Zstd L{} ({}) ===", level, name);
|
|
53
|
-
println!(" Compressed: {} bytes ({:.1}%)", compressed.len(), ratio);
|
|
54
|
-
println!(" Reduction: {:.1}%", 100.0 - ratio);
|
|
55
|
-
println!(" Compress: {:.1} ms ({:.1} MB/s)", compress_time.as_secs_f64() * 1000.0, compress_mbps);
|
|
56
|
-
println!(" Decompress: {:.1} ms ({:.1} MB/s)", decompress_time.as_secs_f64() * 1000.0, decompress_mbps);
|
|
57
|
-
println!();
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
/// Benchmark driver: builds several synthetic corpora (text, JSON, random,
/// mixed binary) and runs the hybrid compressor against zstd baselines.
fn main() {
    println!("╔══════════════════════════════════════════════════════════╗");
    println!("║ ROXIFY BWT-ANS COMPRESSION BENCHMARK ║");
    println!("╚══════════════════════════════════════════════════════════╝\n");

    // ~1 KiB of highly repetitive ASCII text (16 × 63-byte phrase).
    let text_1k: Vec<u8> = "Hello World! This is a test of the roxify compression engine. ".repeat(16).into_bytes();
    bench_roundtrip("Text 1KB", &text_1k);
    bench_zstd("Text 1KB", &text_1k, 3);
    bench_zstd("Text 1KB", &text_1k, 19);

    // ~100 KiB of repeated pangrams.
    let text_100k: Vec<u8> = "The quick brown fox jumps over the lazy dog. Pack my box with five dozen liquor jugs. ".repeat(1200).into_bytes();
    bench_roundtrip("Text 100KB", &text_100k);
    bench_zstd("Text 100KB", &text_100k, 3);
    bench_zstd("Text 100KB", &text_100k, 19);

    // Exactly 1 MiB built by cycling five lorem-ipsum phrases, then truncating.
    let text_1m: Vec<u8> = {
        let mut data = Vec::with_capacity(1_048_576);
        let phrases = [
            b"Lorem ipsum dolor sit amet, consectetur adipiscing elit. ".as_slice(),
            b"Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ".as_slice(),
            b"Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris. ".as_slice(),
            b"Duis aute irure dolor in reprehenderit in voluptate velit esse. ".as_slice(),
            b"Excepteur sint occaecat cupidatat non proident, sunt in culpa. ".as_slice(),
        ];
        let mut i = 0;
        while data.len() < 1_048_576 {
            data.extend_from_slice(phrases[i % phrases.len()]);
            i += 1;
        }
        data.truncate(1_048_576);
        data
    };
    bench_roundtrip("Text 1MB", &text_1m);
    bench_zstd("Text 1MB", &text_1m, 3);
    bench_zstd("Text 1MB", &text_1m, 19);

    // ~500 KiB JSON array of 5000 small synthetic user records.
    let json_data: Vec<u8> = {
        let mut data = String::with_capacity(512_000);
        data.push('[');
        for i in 0..5000 {
            if i > 0 { data.push(','); }
            data.push_str(&format!(
                r#"{{"id":{},"name":"user_{}","email":"user{}@example.com","active":{},"score":{:.2},"tags":["tag1","tag2","tag3"]}}"#,
                i, i, i, i % 2 == 0, (i as f64) * 1.337
            ));
        }
        data.push(']');
        data.into_bytes()
    };
    bench_roundtrip("JSON 500KB", &json_data);
    bench_zstd("JSON 500KB", &json_data, 3);
    bench_zstd("JSON 500KB", &json_data, 19);

    // 100 KB of pseudorandom bytes from a fixed-seed LCG (PCG multiplier) —
    // deterministic, near-incompressible worst case.
    let random_data: Vec<u8> = {
        let mut data = vec![0u8; 100_000];
        let mut state = 12345u64;
        for b in data.iter_mut() {
            state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
            *b = (state >> 33) as u8;
        }
        data
    };
    bench_roundtrip("Random 100KB", &random_data);
    bench_zstd("Random 100KB", &random_data, 3);

    // 256 KB mixed "binary-ish" pattern: constants, counters, letters, spaces.
    let binary_data: Vec<u8> = {
        let mut data = Vec::with_capacity(256_000);
        for i in 0..256_000u32 {
            match i % 7 {
                0 => data.push(0),
                1 => data.push(0xFF),
                2 => data.push((i & 0xFF) as u8),
                3 => data.push(((i >> 8) & 0xFF) as u8),
                4 => data.push(b'A' + (i % 26) as u8),
                5 => data.push(0x20),
                _ => data.push((i.wrapping_mul(37) & 0xFF) as u8),
            }
        }
        data
    };
    bench_roundtrip("Binary 256KB", &binary_data);
    bench_zstd("Binary 256KB", &binary_data, 3);
    bench_zstd("Binary 256KB", &binary_data, 19);

    println!("All round-trip tests PASSED!");
}
|
package/native/bwt.rs
DELETED
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
use anyhow::Result;
|
|
2
|
-
use libsais::bwt::Bwt;
|
|
3
|
-
use libsais::typestate::OwnedBuffer;
|
|
4
|
-
use libsais::BwtConstruction;
|
|
5
|
-
use rayon::prelude::*;
|
|
6
|
-
|
|
7
|
-
/// Result of a forward Burrows–Wheeler transform.
pub struct BwtResult {
    // BWT output bytes (same length as the input).
    pub transformed: Vec<u8>,
    // Index of the original rotation in the sorted matrix; required by
    // `bwt_decode` to invert the transform.
    pub primary_index: u32,
}

/// Forward Burrows–Wheeler transform via libsais (single-threaded, with an
/// owned 32-bit temporary suffix-array buffer).
///
/// Empty input yields an empty `transformed` with `primary_index` 0.
///
/// # Errors
/// Any libsais construction failure is surfaced as an `anyhow` error.
pub fn bwt_encode(data: &[u8]) -> Result<BwtResult> {
    let n = data.len();
    if n == 0 {
        return Ok(BwtResult { transformed: Vec::new(), primary_index: 0 });
    }

    let bwt_result = BwtConstruction::for_text(data)
        .with_owned_temporary_array_buffer32()
        .single_threaded()
        .run()
        .map_err(|e| anyhow::anyhow!("libsais BWT: {:?}", e))?;

    // NOTE(review): the 32-bit buffer implies inputs must fit 32-bit suffix
    // indices (< 4 GiB) — confirm callers never pass larger blocks.
    let primary_index = bwt_result.primary_index() as u32;
    let transformed = bwt_result.bwt().to_vec();

    Ok(BwtResult { transformed, primary_index })
}
|
|
29
|
-
|
|
30
|
-
/// Inverse Burrows–Wheeler transform via libsais.
///
/// `primary_index` must be the value produced by `bwt_encode` for this exact
/// `bwt_data`; a mismatched pair yields a wrong reconstruction.
///
/// # Errors
/// Any libsais un-BWT failure is surfaced as an `anyhow` error.
pub fn bwt_decode(bwt_data: &[u8], primary_index: u32) -> Result<Vec<u8>> {
    if bwt_data.is_empty() {
        return Ok(Vec::new());
    }

    // SAFETY: `Bwt::from_parts` trusts the caller that the buffer is a valid
    // BWT with this primary index — that invariant is established by
    // `bwt_encode`, which produced both values together.
    // NOTE(review): confirm libsais rejects (rather than UB on) an
    // out-of-range primary_index from corrupted archives.
    let bwt_obj: Bwt<'static, u8, OwnedBuffer> =
        unsafe { Bwt::from_parts(bwt_data.to_vec(), primary_index as usize) };

    let text = bwt_obj
        .unbwt()
        .with_owned_temporary_array_buffer32()
        .single_threaded()
        .run()
        .map_err(|e| anyhow::anyhow!("libsais UnBWT: {:?}", e))?;

    Ok(text.as_slice().to_vec())
}
|
|
47
|
-
|
|
48
|
-
pub fn bwt_encode_streaming(block_size: usize, data: &[u8]) -> Result<Vec<(BwtResult, usize)>> {
|
|
49
|
-
data.par_chunks(block_size)
|
|
50
|
-
.enumerate()
|
|
51
|
-
.map(|(i, chunk)| {
|
|
52
|
-
let result = bwt_encode(chunk)?;
|
|
53
|
-
Ok((result, i * block_size))
|
|
54
|
-
})
|
|
55
|
-
.collect()
|
|
56
|
-
}
|
package/native/context_mixing.rs
DELETED
|
@@ -1,117 +0,0 @@
|
|
|
1
|
-
/// Counts describing a binary (bit) probability distribution.
#[derive(Clone, Copy, Debug)]
pub struct ProbabilityEstimate {
    pub p0: u32,
    pub p1: u32,
    pub total: u32,
}

impl ProbabilityEstimate {
    /// Shannon entropy, in bits, of the two-outcome distribution implied by
    /// `p0`/`p1` over `total` observations; 0.0 when `total` is zero.
    pub fn entropy_bits(&self) -> f32 {
        if self.total == 0 {
            return 0.0;
        }
        let denom = self.total as f32;
        // Sum -p·log2(p) over the two outcomes, skipping zero probabilities
        // (whose contribution is defined as 0).
        [self.p0, self.p1]
            .iter()
            .map(|&count| count as f32 / denom)
            .filter(|&p| p > 0.0)
            .map(|p| -p * p.log2())
            .sum()
    }
}
|
|
26
|
-
|
|
27
|
-
pub struct ContextMixer {
|
|
28
|
-
contexts_order0: Vec<ProbabilityEstimate>,
|
|
29
|
-
contexts_order1: Vec<[ProbabilityEstimate; 256]>,
|
|
30
|
-
contexts_order2: Vec<[[ProbabilityEstimate; 256]; 256]>,
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
impl ContextMixer {
|
|
34
|
-
pub fn new() -> Self {
|
|
35
|
-
ContextMixer {
|
|
36
|
-
contexts_order0: vec![ProbabilityEstimate { p0: 1, p1: 1, total: 2 }; 1],
|
|
37
|
-
contexts_order1: vec![
|
|
38
|
-
[ProbabilityEstimate { p0: 1, p1: 1, total: 2 }; 256];
|
|
39
|
-
256
|
|
40
|
-
],
|
|
41
|
-
contexts_order2: vec![
|
|
42
|
-
[[ProbabilityEstimate { p0: 1, p1: 1, total: 2 }; 256]; 256];
|
|
43
|
-
256
|
|
44
|
-
],
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
pub fn predict_order0(&self) -> ProbabilityEstimate {
|
|
49
|
-
self.contexts_order0[0]
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
pub fn predict_order1(&self, context1: u8) -> ProbabilityEstimate {
|
|
53
|
-
self.contexts_order1[context1 as usize][0]
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
pub fn predict_order2(&self, context1: u8, context2: u8) -> ProbabilityEstimate {
|
|
57
|
-
self.contexts_order2[context1 as usize][context2 as usize][0]
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
pub fn update_order0(&mut self, bit: bool) {
|
|
61
|
-
let ctx = &mut self.contexts_order0[0];
|
|
62
|
-
if bit {
|
|
63
|
-
ctx.p1 += 1;
|
|
64
|
-
} else {
|
|
65
|
-
ctx.p0 += 1;
|
|
66
|
-
}
|
|
67
|
-
ctx.total += 1;
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
pub fn update_order1(&mut self, context1: u8, bit: bool) {
|
|
71
|
-
let ctx = &mut self.contexts_order1[context1 as usize][0];
|
|
72
|
-
if bit {
|
|
73
|
-
ctx.p1 += 1;
|
|
74
|
-
} else {
|
|
75
|
-
ctx.p0 += 1;
|
|
76
|
-
}
|
|
77
|
-
ctx.total += 1;
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
pub fn update_order2(&mut self, context1: u8, context2: u8, bit: bool) {
|
|
81
|
-
let ctx = &mut self.contexts_order2[context1 as usize][context2 as usize][0];
|
|
82
|
-
if bit {
|
|
83
|
-
ctx.p1 += 1;
|
|
84
|
-
} else {
|
|
85
|
-
ctx.p0 += 1;
|
|
86
|
-
}
|
|
87
|
-
ctx.total += 1;
|
|
88
|
-
}
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
/// Byte-frequency Shannon entropy of `data` in bits per byte (0.0..=8.0);
/// returns 0.0 for empty input.
pub fn analyze_entropy(data: &[u8]) -> f32 {
    if data.is_empty() {
        return 0.0;
    }

    // Histogram of byte values.
    let mut freq = [0u32; 256];
    data.iter().for_each(|&b| freq[b as usize] += 1);

    // Multiply by the reciprocal rather than dividing per symbol.
    let inv_total = 1.0 / data.len() as f32;
    let mut entropy = 0.0f32;
    for &count in freq.iter().filter(|&&c| c > 0) {
        let p = count as f32 * inv_total;
        entropy -= p * p.log2();
    }
    entropy
}
|
|
112
|
-
|
|
113
|
-
/// Estimated size reduction (percent) if `original` were coded at exactly
/// `entropy_bits` bits per byte.
///
/// NOTE(review): an empty `original` yields 0/0 = NaN, same as before —
/// confirm callers never pass empty buffers.
pub fn estimate_compression_gain(original: &[u8], entropy_bits: f32) -> f64 {
    let len = original.len() as f64;
    // Information-theoretic floor in bytes: len × bits/byte ÷ 8.
    let theoretical_min = len * f64::from(entropy_bits) / 8.0;
    (1.0 - theoretical_min / len) * 100.0
}
|
package/native/core.rs
DELETED
|
@@ -1,378 +0,0 @@
|
|
|
1
|
-
use rayon::prelude::*;
|
|
2
|
-
use std::sync::Arc;
|
|
3
|
-
use std::path::PathBuf;
|
|
4
|
-
use anyhow::Result;
|
|
5
|
-
|
|
6
|
-
pub struct PlainScanResult {
|
|
7
|
-
pub marker_positions: Vec<u32>,
|
|
8
|
-
pub magic_positions: Vec<u32>,
|
|
9
|
-
}
|
|
10
|
-
|
|
11
|
-
pub fn scan_pixels_bytes(buf: &[u8], channels: usize, marker_bytes: Option<&[u8]>) -> PlainScanResult {
|
|
12
|
-
let magic = b"ROX1";
|
|
13
|
-
|
|
14
|
-
let magic_positions: Vec<u32> = if buf.len() >= 4 {
|
|
15
|
-
(0..(buf.len() - 3))
|
|
16
|
-
.into_par_iter()
|
|
17
|
-
.filter_map(|i| if &buf[i..i + 4] == magic { Some(i as u32) } else { None })
|
|
18
|
-
.collect()
|
|
19
|
-
} else {
|
|
20
|
-
Vec::new()
|
|
21
|
-
};
|
|
22
|
-
|
|
23
|
-
let markers: Vec<[u8; 3]> = match marker_bytes {
|
|
24
|
-
Some(bytes) if !bytes.is_empty() => {
|
|
25
|
-
if bytes.len() % 3 != 0 {
|
|
26
|
-
return PlainScanResult { marker_positions: Vec::new(), magic_positions };
|
|
27
|
-
}
|
|
28
|
-
bytes.chunks(3).map(|c| [c[0], c[1], c[2]]).collect()
|
|
29
|
-
}
|
|
30
|
-
_ => Vec::new(),
|
|
31
|
-
};
|
|
32
|
-
|
|
33
|
-
let marker_positions = if markers.is_empty() {
|
|
34
|
-
Vec::new()
|
|
35
|
-
} else {
|
|
36
|
-
let markers = Arc::new(markers);
|
|
37
|
-
let ch = channels as usize;
|
|
38
|
-
if ch < 3 || buf.len() < 3 {
|
|
39
|
-
Vec::new()
|
|
40
|
-
} else {
|
|
41
|
-
let pixel_count = buf.len() / ch;
|
|
42
|
-
(0..pixel_count)
|
|
43
|
-
.into_par_iter()
|
|
44
|
-
.filter_map(|i| {
|
|
45
|
-
let base = i * ch;
|
|
46
|
-
if base + 3 > buf.len() {
|
|
47
|
-
return None;
|
|
48
|
-
}
|
|
49
|
-
for m in markers.iter() {
|
|
50
|
-
if buf[base] == m[0] && buf[base + 1] == m[1] && buf[base + 2] == m[2] {
|
|
51
|
-
return Some(i as u32);
|
|
52
|
-
}
|
|
53
|
-
}
|
|
54
|
-
None
|
|
55
|
-
})
|
|
56
|
-
.collect()
|
|
57
|
-
}
|
|
58
|
-
};
|
|
59
|
-
|
|
60
|
-
PlainScanResult { marker_positions, magic_positions }
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
/// CRC-32 (crc32fast flavor) of `buf`, computed in parallel for large inputs.
///
/// Equivalence with the sequential `crc32fast::hash` is exercised by
/// `tests::test_crc_adler` on a >4 MiB buffer.
pub fn crc32_bytes(buf: &[u8]) -> u32 {
    // parallelize checksum on large buffers, since crc32fast::hash is single-threaded
    const PAR_THRESHOLD: usize = 4 * 1024 * 1024; // 4 MiB
    if buf.len() < PAR_THRESHOLD {
        crc32fast::hash(buf)
    } else {
        // compute per-chunk hasher in parallel then combine
        // Correctness relies on `Hasher::combine` concatenating streams and on
        // rayon's ordered `reduce` preserving left-to-right chunk order —
        // NOTE(review): both hold per the crates' docs; keep the test above
        // green when bumping either dependency.
        let chunk = PAR_THRESHOLD;
        let combined = buf
            .par_chunks(chunk)
            .map(|chunk| {
                let mut h = crc32fast::Hasher::new();
                h.update(chunk);
                h
            })
            .reduce(|| crc32fast::Hasher::new(), |mut a, b| {
                a.combine(&b);
                a
            });
        combined.finalize()
    }
}
|
|
85
|
-
|
|
86
|
-
/// Adler-32 checksum of `buf` (zlib/RFC 1950), via the SIMD-accelerated
/// `simd_adler32` crate.
pub fn adler32_bytes(buf: &[u8]) -> u32 {
    let mut hasher = simd_adler32::Adler32::new();
    hasher.write(buf);
    hasher.finish()
}
|
|
91
|
-
|
|
92
|
-
/// Delta-encode `buf`: output[0] = input[0], output[i] = input[i] - input[i-1]
/// (wrapping). Inverse of `delta_decode_bytes`.
pub fn delta_encode_bytes(buf: &[u8]) -> Vec<u8> {
    let Some((&first, _)) = buf.split_first() else {
        return Vec::new();
    };
    let mut out = Vec::with_capacity(buf.len());
    out.push(first);
    // Each adjacent pair contributes one wrapped difference.
    out.extend(buf.windows(2).map(|pair| pair[1].wrapping_sub(pair[0])));
    out
}
|
|
104
|
-
|
|
105
|
-
/// Invert `delta_encode_bytes`: running wrapped sum of the deltas, seeded by
/// the first byte.
pub fn delta_decode_bytes(buf: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(buf.len());
    let mut acc = 0u8;
    for (idx, &delta) in buf.iter().enumerate() {
        acc = if idx == 0 { delta } else { acc.wrapping_add(delta) };
        out.push(acc);
    }
    out
}
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
/// Train a zstd dictionary of at most `dict_size` bytes from the given
/// sample files.
///
/// # Errors
/// Fails if any sample file cannot be read or if zstd's trainer rejects the
/// samples (e.g. too few / too uniform).
pub fn train_zstd_dictionary(sample_paths: &[PathBuf], dict_size: usize) -> Result<Vec<u8>> {
    // zstd's `from_continuous` wants one contiguous buffer plus per-sample
    // lengths, so concatenate the files while remembering each size.
    let mut lengths = Vec::with_capacity(sample_paths.len());
    let mut samples = Vec::new();
    for path in sample_paths {
        let data = std::fs::read(path)?;
        lengths.push(data.len());
        samples.extend_from_slice(&data);
    }
    Ok(zstd::dict::from_continuous(&samples, &lengths, dict_size)?)
}
|
|
132
|
-
|
|
133
|
-
/// Compress a slice with optional zstd dictionary.
///
/// When `dict` is `Some`, the dictionary is passed to the encoder (same
/// dict required for decompression). Pass `None` for normal compression.
///
/// For large buffers (>50 MiB) without a dictionary, multiple chunk sizes
/// are benchmarked on a sample and the best is selected automatically.
/// NOTE(review): no such benchmarking is visible in
/// `zstd_compress_with_prefix` (it only adapts level/window/chunking by
/// size and entropy) — confirm this paragraph is still accurate.
pub fn zstd_compress_bytes(buf: &[u8], level: i32, dict: Option<&[u8]>) -> std::result::Result<Vec<u8>, String> {
    // Thin wrapper: identical to compressing with an empty prefix.
    zstd_compress_with_prefix(buf, level, dict, &[])
}
|
|
143
|
-
|
|
144
|
-
/// Byte-frequency Shannon entropy over at most the first 16 KiB of `buf`.
/// Empty input returns a neutral mid-range guess of 4.0 bits/byte.
fn compute_entropy_sample(buf: &[u8]) -> f32 {
    let sample = &buf[..buf.len().min(16384)];
    if sample.is_empty() {
        return 4.0;
    }
    let mut freq = [0u32; 256];
    for &b in sample {
        freq[b as usize] += 1;
    }
    let len = sample.len() as f32;
    freq.iter()
        .filter(|&&c| c > 0)
        .map(|&c| {
            let p = c as f32 / len;
            -p * p.log2()
        })
        .sum()
}

/// Cap `requested_level` by a heuristic ceiling chosen from total payload
/// size and sampled entropy: larger and/or higher-entropy inputs get lower
/// caps, since extra effort buys little there and costs a lot of time.
fn compute_adaptive_level(buf: &[u8], requested_level: i32, total_len: usize) -> i32 {
    const KIB: usize = 1024;
    const MIB: usize = 1024 * KIB;
    const GIB: usize = 1024 * MIB;

    let entropy = compute_entropy_sample(buf);

    let size_cap = if total_len > 2 * GIB {
        1
    } else if total_len > GIB {
        if entropy < 3.0 { 3 } else { 1 }
    } else if total_len > 256 * MIB {
        if entropy < 3.0 { 6 } else if entropy < 5.0 { 3 } else { 1 }
    } else if total_len > 64 * MIB {
        if entropy < 3.0 { 12 } else if entropy < 5.0 { 6 } else if entropy < 7.0 { 3 } else { 1 }
    } else if total_len > 16 * MIB {
        if entropy < 3.0 { 15 } else if entropy < 5.0 { 9 } else if entropy < 7.0 { 6 } else if entropy < 7.5 { 3 } else { 1 }
    } else if total_len > MIB {
        if entropy < 3.0 { 19 } else if entropy < 5.0 { 12 } else if entropy < 6.5 { 6 } else if entropy < 7.5 { 3 } else { 1 }
    } else if total_len > 64 * KIB {
        if entropy < 4.0 { 9 } else if entropy < 6.0 { 6 } else if entropy < 7.5 { 3 } else { 1 }
    } else if total_len > 4096 {
        if entropy < 5.0 { 6 } else if entropy < 7.0 { 3 } else { 1 }
    } else if entropy < 6.0 {
        3
    } else {
        1
    };

    requested_level.min(size_cap)
}
|
|
218
|
-
|
|
219
|
-
pub fn zstd_compress_with_prefix(buf: &[u8], level: i32, dict: Option<&[u8]>, prefix: &[u8]) -> std::result::Result<Vec<u8>, String> {
|
|
220
|
-
use std::io::Write;
|
|
221
|
-
|
|
222
|
-
let actual_level = level.min(22).max(1);
|
|
223
|
-
let total_len = prefix.len() + buf.len();
|
|
224
|
-
let adaptive_level = compute_adaptive_level(buf, actual_level, total_len);
|
|
225
|
-
|
|
226
|
-
let estimated_output = if total_len < 1024 {
|
|
227
|
-
total_len
|
|
228
|
-
} else {
|
|
229
|
-
total_len * 3 / 4
|
|
230
|
-
};
|
|
231
|
-
|
|
232
|
-
if dict.is_none() && total_len < 4 * 1024 * 1024 {
|
|
233
|
-
if prefix.is_empty() {
|
|
234
|
-
return zstd::bulk::compress(buf, adaptive_level)
|
|
235
|
-
.map_err(|e| format!("zstd bulk compress error: {}", e));
|
|
236
|
-
}
|
|
237
|
-
let mut combined = Vec::with_capacity(total_len);
|
|
238
|
-
combined.extend_from_slice(prefix);
|
|
239
|
-
combined.extend_from_slice(buf);
|
|
240
|
-
return zstd::bulk::compress(&combined, adaptive_level)
|
|
241
|
-
.map_err(|e| format!("zstd bulk compress error: {}", e));
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
let mut encoder = if let Some(d) = dict {
|
|
245
|
-
zstd::stream::Encoder::with_dictionary(Vec::with_capacity(estimated_output), adaptive_level, d)
|
|
246
|
-
.map_err(|e| format!("zstd encoder init error: {}", e))?
|
|
247
|
-
} else {
|
|
248
|
-
zstd::stream::Encoder::new(Vec::with_capacity(estimated_output), adaptive_level)
|
|
249
|
-
.map_err(|e| format!("zstd encoder init error: {}", e))?
|
|
250
|
-
};
|
|
251
|
-
|
|
252
|
-
let threads = num_cpus::get() as u32;
|
|
253
|
-
if threads > 1 {
|
|
254
|
-
let max_threads = if adaptive_level >= 20 { threads.min(4) } else { threads };
|
|
255
|
-
let _ = encoder.multithread(max_threads);
|
|
256
|
-
}
|
|
257
|
-
|
|
258
|
-
if total_len > 256 * 1024 && adaptive_level >= 3 {
|
|
259
|
-
let _ = encoder.long_distance_matching(true);
|
|
260
|
-
}
|
|
261
|
-
if total_len > 256 * 1024 {
|
|
262
|
-
let wlog = if total_len > 1024 * 1024 * 1024 { 30 }
|
|
263
|
-
else if total_len > 512 * 1024 * 1024 { 29 }
|
|
264
|
-
else if total_len > 64 * 1024 * 1024 { 28 }
|
|
265
|
-
else if total_len > 8 * 1024 * 1024 { 27 }
|
|
266
|
-
else { 26 };
|
|
267
|
-
let _ = encoder.window_log(wlog);
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
let _ = encoder.set_pledged_src_size(Some(total_len as u64));
|
|
271
|
-
|
|
272
|
-
if !prefix.is_empty() {
|
|
273
|
-
encoder.write_all(prefix).map_err(|e| format!("zstd write prefix error: {}", e))?;
|
|
274
|
-
}
|
|
275
|
-
|
|
276
|
-
let chunk_size = if total_len > 256 * 1024 * 1024 { 16 * 1024 * 1024 }
|
|
277
|
-
else if total_len > 64 * 1024 * 1024 { 8 * 1024 * 1024 }
|
|
278
|
-
else { buf.len() };
|
|
279
|
-
|
|
280
|
-
for chunk in buf.chunks(chunk_size) {
|
|
281
|
-
encoder.write_all(chunk).map_err(|e| format!("zstd write error: {}", e))?;
|
|
282
|
-
}
|
|
283
|
-
|
|
284
|
-
encoder.finish().map_err(|e| format!("zstd finish error: {}", e))
|
|
285
|
-
}
|
|
286
|
-
|
|
287
|
-
pub fn zstd_decompress_bytes(buf: &[u8], dict: Option<&[u8]>) -> std::result::Result<Vec<u8>, String> {
|
|
288
|
-
use std::io::Read;
|
|
289
|
-
let estimated = buf.len().saturating_mul(3).max(4096);
|
|
290
|
-
let mut out = Vec::with_capacity(estimated);
|
|
291
|
-
if let Some(d) = dict {
|
|
292
|
-
let mut decoder = zstd::stream::Decoder::with_dictionary(std::io::Cursor::new(buf), d)
|
|
293
|
-
.map_err(|e| format!("zstd decoder init error: {}", e))?;
|
|
294
|
-
decoder.window_log_max(31).map_err(|e| format!("zstd window_log_max error: {}", e))?;
|
|
295
|
-
decoder.read_to_end(&mut out).map_err(|e| format!("zstd decompress error: {}", e))?;
|
|
296
|
-
} else {
|
|
297
|
-
let mut decoder = zstd::stream::Decoder::new(std::io::Cursor::new(buf))
|
|
298
|
-
.map_err(|e| format!("zstd decoder init error: {}", e))?;
|
|
299
|
-
decoder.window_log_max(31).map_err(|e| format!("zstd window_log_max error: {}", e))?;
|
|
300
|
-
decoder.read_to_end(&mut out).map_err(|e| format!("zstd decompress error: {}", e))?;
|
|
301
|
-
}
|
|
302
|
-
Ok(out)
|
|
303
|
-
}
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scan_magic() {
        // Two occurrences of the "ROX1" tag at byte offsets 4 and 12.
        let data = b"xxxxROX1yyyyROX1".to_vec();
        let res = scan_pixels_bytes(&data, 3, None);
        assert_eq!(res.magic_positions.len(), 2);
    }

    #[test]
    fn test_markers() {
        // Pixels 0 and 2 match the (1,2,3) RGB marker.
        let pixels = vec![1u8,2,3, 4,5,6, 1,2,3];
        let markers_vec = vec![1u8,2,3];
        let res = scan_pixels_bytes(&pixels, 3, Some(&markers_vec));
        assert_eq!(res.marker_positions, vec![0,2]);
    }

    #[test]
    fn test_train_dictionary() {
        use std::fs::{write, create_dir_all};
        let td = std::env::temp_dir().join("rox_dict_test");
        let _ = create_dir_all(&td);
        let f1 = td.join("a.bin");
        let f2 = td.join("b.bin");
        // produce 1 MiB of repeated data per file
        let big = vec![0xABu8; 1024 * 1024];
        write(&f1, &big).unwrap();
        write(&f2, &big).unwrap();
        // choose dictionary size 16 KiB (far below total sample size ≈2 MiB)
        match train_zstd_dictionary(&[f1.clone(), f2.clone()], 16 * 1024) {
            Ok(dict) => {
                assert!(dict.len() <= 16 * 1024);
                assert!(!dict.is_empty());
            }
            Err(e) => {
                // dictionary training may fail due to insufficient or unsuitable samples;
                // ensure error string is nonempty to catch panics
                assert!(!e.to_string().is_empty());
            }
        }
    }

    #[test]
    fn test_delta_roundtrip() {
        let data = vec![10u8, 20, 30, 40, 250];
        let enc = delta_encode_bytes(&data);
        let dec = delta_decode_bytes(&enc);
        assert_eq!(dec, data);
    }

    #[test]
    fn test_crc_adler() {
        let data = b"hello".to_vec();
        assert_eq!(crc32_bytes(&data), crc32fast::hash(&data));
        // Adler-32 of "hello" per RFC 1950 (previously this line compared the
        // call against itself, which could never fail).
        assert_eq!(adler32_bytes(&data), 0x062C0215);

        // also test large buffer triggers parallel branch
        let big = vec![0xAAu8; 5 * 1024 * 1024];
        assert_eq!(crc32_bytes(&big), crc32fast::hash(&big));
    }

    #[test]
    fn test_zstd_dict_roundtrip() {
        let data = b"this is some test data that repeats. ".repeat(1000);
        // simple dictionary containing a substring
        let dict = b"test data";
        let compressed = zstd_compress_bytes(&data, 3, Some(dict)).expect("compress");
        let decompressed = zstd_decompress_bytes(&compressed, Some(dict)).expect("decompress");
        assert_eq!(decompressed, data);
    }
}
|