roxify 1.12.7 → 1.12.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,220 @@
1
+ use std::io::Cursor;
2
+ use std::path::Path;
3
+ use rayon::prelude::*;
4
+ use tar::{Archive, Builder, Header};
5
+ use walkdir::WalkDir;
6
+
7
/// Result of packing a directory into an in-memory TAR archive.
pub struct TarPackResult {
    /// The complete, uncompressed TAR archive bytes.
    pub data: Vec<u8>,
    /// (relative entry path, file size in bytes) for each packed file.
    pub file_list: Vec<(String, u64)>,
}
11
+
12
/// Walk `dir_path` recursively and pack every regular file into an
/// in-memory, uncompressed TAR archive.
///
/// File contents are read in parallel (rayon); archive assembly is
/// sequential. Entry names are paths relative to `dir_path`, normalized to
/// `/` separators. Returns the archive bytes plus a (name, size) listing.
///
/// NOTE(review): files that fail to read are silently omitted from the
/// archive, and every entry is stored with mode 0o644 (executable bits are
/// not preserved) — confirm both are acceptable for callers.
pub fn tar_pack_directory_with_list(dir_path: &Path) -> Result<TarPackResult, String> {
    let base = dir_path;

    // Collect regular files only: directories are implied by entry paths,
    // and symlinks are not followed.
    let entries: Vec<_> = WalkDir::new(dir_path)
        .follow_links(false)
        .into_iter()
        .filter_map(|e| e.ok())
        .filter(|e| e.file_type().is_file())
        .collect();

    // Read all file bodies in parallel; indexed par_iter keeps the results
    // in walk order. Read errors drop the file (see note above).
    let file_data: Vec<(String, Vec<u8>)> = entries
        .par_iter()
        .filter_map(|entry| {
            let full = entry.path();
            let rel = full.strip_prefix(base).unwrap_or(full);
            // Normalize Windows separators so archives are portable.
            let rel_str = rel.to_string_lossy().replace('\\', "/");
            match std::fs::read(full) {
                Ok(data) => Some((rel_str, data)),
                Err(_) => None,
            }
        })
        .collect();

    let file_list: Vec<(String, u64)> = file_data.iter()
        .map(|(name, data)| (name.clone(), data.len() as u64))
        .collect();

    // Rough capacity estimate: one 512-byte header plus up to one block of
    // padding per file, to avoid reallocation while building the archive.
    let total_estimate: usize = file_data.iter().map(|(n, d)| 512 + d.len() + 512 + n.len()).sum();
    let mut buf = Vec::with_capacity(total_estimate + 1024);
    {
        let mut builder = Builder::new(&mut buf);
        for (rel_str, data) in &file_data {
            let mut header = Header::new_gnu();
            header.set_size(data.len() as u64);
            header.set_mode(0o644);
            // append_data fills in the path and recomputes the checksum;
            // presumably this set_cksum is redundant — verify against the
            // tar crate docs before removing it.
            header.set_cksum();
            builder
                .append_data(&mut header, rel_str, &data[..])
                .map_err(|e| format!("tar append {}: {}", rel_str, e))?;
        }
        // finish() writes the two terminating zero blocks.
        builder.finish().map_err(|e| format!("tar finish: {}", e))?;
    }
    Ok(TarPackResult { data: buf, file_list })
}
56
+
57
+ pub fn tar_pack_directory(dir_path: &Path) -> Result<Vec<u8>, String> {
58
+ tar_pack_directory_with_list(dir_path).map(|r| r.data)
59
+ }
60
+
61
/// Cheap TAR listing that walks raw 512-byte headers without the `tar`
/// crate. Returns `(entry name, entry size)` pairs.
///
/// Best-effort by design: malformed size fields are treated as 0, and
/// extension headers (pax / GNU long-name) are reported as ordinary
/// entries. POSIX ustar `prefix` fields are honored, so paths longer than
/// 100 bytes are no longer truncated, and header arithmetic is checked so
/// a hostile size field cannot overflow the cursor.
pub fn tar_file_list_fast(tar_data: &[u8]) -> Vec<(String, u64)> {
    let mut list = Vec::new();
    let mut pos = 0usize;
    while pos + 512 <= tar_data.len() {
        let header = &tar_data[pos..pos + 512];
        // An all-zero block marks end-of-archive.
        if header.iter().all(|&b| b == 0) {
            break;
        }

        // Entry name: NUL-terminated, at most 100 bytes.
        let name_end = header[..100].iter().position(|&b| b == 0).unwrap_or(100);
        let mut name = String::from_utf8_lossy(&header[..name_end]).to_string();

        // POSIX ustar (magic "ustar\0") splits long paths across the
        // `prefix` field (bytes 345..500) and `name`. GNU tar ("ustar ")
        // reuses this area for other data, so gate on the exact magic.
        if &header[257..263] == b"ustar\0" && header[345] != 0 {
            let prefix_end = 345 + header[345..500].iter().position(|&b| b == 0).unwrap_or(155);
            let prefix = String::from_utf8_lossy(&header[345..prefix_end]);
            name = format!("{}/{}", prefix, name);
        }

        // Size: 12-byte octal field, padded with spaces and/or NULs.
        let size_field = String::from_utf8_lossy(&header[124..136]);
        let size = u64::from_str_radix(size_field.trim_matches(|c| c == ' ' || c == '\0'), 8)
            .unwrap_or(0);

        if !name.is_empty() {
            list.push((name, size));
        }

        // Advance past the data area (padded to 512-byte blocks) with
        // checked arithmetic: previously `size as usize + 511` could
        // overflow on a malformed size field.
        let data_blocks = size.saturating_add(511) / 512;
        match data_blocks
            .saturating_mul(512)
            .checked_add(512)
            .and_then(|adv| (pos as u64).checked_add(adv))
        {
            Some(next) if next <= tar_data.len() as u64 => pos = next as usize,
            _ => break, // data would run past the end of the buffer
        }
    }
    list
}
81
+
82
/// Unpack a TAR archive into `output_dir`, returning the relative paths of
/// the files written.
///
/// Entry paths are sanitized by keeping only `Normal` components, which
/// strips `..`, root prefixes, and drive letters — a zip-slip style entry
/// cannot escape `output_dir` (it is flattened into it instead).
///
/// NOTE(review): individual write failures are silently dropped from the
/// returned list rather than reported — confirm callers check the list
/// against expectations.
pub fn tar_unpack(tar_data: &[u8], output_dir: &Path) -> Result<Vec<String>, String> {
    let mut archive = Archive::new(Cursor::new(tar_data));
    // Decode sequentially first (tar parsing is inherently serial), then
    // write the files in parallel below.
    let mut entries_data: Vec<(std::path::PathBuf, Vec<u8>)> = Vec::new();

    let entries = archive.entries().map_err(|e| format!("tar entries: {}", e))?;
    for entry in entries {
        let mut entry = entry.map_err(|e| format!("tar entry: {}", e))?;
        let path = entry.path().map_err(|e| format!("tar entry path: {}", e))?.to_path_buf();

        // Keep only normal components (drops "..", "/", "C:\", ".").
        let mut safe = std::path::PathBuf::new();
        for comp in path.components() {
            if let std::path::Component::Normal(osstr) = comp {
                safe.push(osstr);
            }
        }
        if safe.as_os_str().is_empty() {
            continue; // entry collapsed to nothing (e.g. "/" or "..")
        }

        let mut data = Vec::with_capacity(entry.size() as usize);
        std::io::Read::read_to_end(&mut entry, &mut data)
            .map_err(|e| format!("tar read {:?}: {}", safe, e))?;
        entries_data.push((safe, data));
    }

    // Create all parent directories up front (deduplicated via HashSet) so
    // the parallel writes below never race on mkdir.
    let dirs: std::collections::HashSet<_> = entries_data.iter()
        .filter_map(|(p, _)| {
            let dest = output_dir.join(p);
            dest.parent().map(|d| d.to_path_buf())
        })
        .collect();
    for dir in &dirs {
        std::fs::create_dir_all(dir).map_err(|e| format!("mkdir {:?}: {}", dir, e))?;
    }

    // Write file bodies in parallel; failed writes are skipped (see note).
    let written: Vec<String> = entries_data.par_iter()
        .filter_map(|(safe, data)| {
            let dest = output_dir.join(safe);
            match std::fs::write(&dest, data) {
                Ok(_) => Some(safe.to_string_lossy().to_string()),
                Err(_) => None,
            }
        })
        .collect();

    Ok(written)
}
129
+
130
/// Heuristic TAR detection: checks for the "ustar" magic at offset 257.
///
/// Matches both POSIX ("ustar\0") and GNU ("ustar ") archives, since only
/// the first five magic bytes are compared. Only offsets up to 261 are
/// read, so a 262-byte buffer suffices (the previous `len < 263` bound was
/// off by one and rejected valid input).
pub fn is_tar(data: &[u8]) -> bool {
    if data.len() < 262 {
        return false;
    }
    &data[257..262] == b"ustar"
}
136
+
137
+ pub fn tar_file_list(tar_data: &[u8]) -> Result<Vec<(String, u64)>, String> {
138
+ let mut archive = Archive::new(Cursor::new(tar_data));
139
+ let mut list = Vec::new();
140
+ let entries = archive.entries().map_err(|e| format!("tar entries: {}", e))?;
141
+ for entry in entries {
142
+ let entry = entry.map_err(|e| format!("tar entry: {}", e))?;
143
+ let path = entry
144
+ .path()
145
+ .map_err(|e| format!("tar path: {}", e))?
146
+ .to_string_lossy()
147
+ .to_string();
148
+ let size = entry.size();
149
+ list.push((path, size));
150
+ }
151
+ Ok(list)
152
+ }
153
+
154
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Pack a small directory tree, list it, and unpack it again.
    #[test]
    fn test_tar_roundtrip() {
        // Fixture: temp dir with one top-level and one nested file.
        let tmp = std::env::temp_dir().join("rox_tar_test");
        let _ = fs::remove_dir_all(&tmp);
        fs::create_dir_all(tmp.join("sub")).unwrap();
        fs::write(tmp.join("hello.txt"), b"Hello TAR").unwrap();
        fs::write(tmp.join("sub/nested.txt"), b"Nested!").unwrap();

        let tar_data = tar_pack_directory(&tmp).unwrap();
        assert!(is_tar(&tar_data));

        // Only regular files are listed, not the "sub" directory.
        let list = tar_file_list(&tar_data).unwrap();
        assert_eq!(list.len(), 2);

        let out = std::env::temp_dir().join("rox_tar_test_out");
        let _ = fs::remove_dir_all(&out);
        fs::create_dir_all(&out).unwrap();

        let written = tar_unpack(&tar_data, &out).unwrap();
        assert_eq!(written.len(), 2);
        assert_eq!(fs::read_to_string(out.join("hello.txt")).unwrap(), "Hello TAR");
        assert_eq!(fs::read_to_string(out.join("sub/nested.txt")).unwrap(), "Nested!");

        let _ = fs::remove_dir_all(&tmp);
        let _ = fs::remove_dir_all(&out);
    }

    /// Same round-trip, but with a zstd compress/decompress step between
    /// packing and unpacking (tar -> zstd -> unzstd -> untar).
    #[test]
    fn test_tar_zstd_roundtrip() {
        use std::io::Write;

        let tmp = std::env::temp_dir().join("rox_tar_zstd_test");
        let _ = fs::remove_dir_all(&tmp);
        fs::create_dir_all(tmp.join("a/b")).unwrap();
        fs::write(tmp.join("root.txt"), b"root file content").unwrap();
        fs::write(tmp.join("a/mid.txt"), b"mid level").unwrap();
        fs::write(tmp.join("a/b/deep.txt"), b"deep nested file").unwrap();

        let tar_data = tar_pack_directory(&tmp).unwrap();
        assert!(is_tar(&tar_data));

        // Compress with the zstd crate directly, decompress through the
        // crate-local helper so both code paths are exercised.
        let mut encoder = zstd::stream::Encoder::new(Vec::new(), 3).unwrap();
        encoder.write_all(&tar_data).unwrap();
        let compressed = encoder.finish().unwrap();

        let decompressed = crate::core::zstd_decompress_bytes(&compressed, None).unwrap();
        assert!(is_tar(&decompressed));

        let out = std::env::temp_dir().join("rox_tar_zstd_test_out");
        let _ = fs::remove_dir_all(&out);
        fs::create_dir_all(&out).unwrap();

        let written = tar_unpack(&decompressed, &out).unwrap();
        assert_eq!(written.len(), 3);
        assert_eq!(fs::read_to_string(out.join("root.txt")).unwrap(), "root file content");
        assert_eq!(fs::read_to_string(out.join("a/mid.txt")).unwrap(), "mid level");
        assert_eq!(fs::read_to_string(out.join("a/b/deep.txt")).unwrap(), "deep nested file");

        let _ = fs::remove_dir_all(&tmp);
        let _ = fs::remove_dir_all(&out);
    }
}
@@ -0,0 +1,151 @@
1
// WAV container for binary data.
//
// Encodes raw bytes as 8-bit unsigned PCM mono samples (44100 Hz).
// Header is exactly 44 bytes. Total overhead: 44 bytes.
//
// Compared to PNG (stored deflate): PNG overhead grows with data size
// (zlib framing, filter bytes, chunk CRCs). WAV overhead is constant.

const WAV_HEADER_SIZE: usize = 44;
const SAMPLE_RATE: u32 = 44100;
const BITS_PER_SAMPLE: u16 = 8;
const NUM_CHANNELS: u16 = 1;

/// Pack raw bytes into a WAV file (8-bit PCM, mono, 44100 Hz).
/// The bytes are stored directly as unsigned PCM samples.
/// Returns the complete WAV file as a `Vec<u8>`.
///
/// # Panics
/// Panics if `data` exceeds the RIFF 32-bit size limit (~4 GiB).
/// Previously the length was cast with `as u32`, silently truncating the
/// size fields and producing a corrupt header for oversized input.
pub fn bytes_to_wav(data: &[u8]) -> Vec<u8> {
    // Checked conversion: the RIFF chunk size (36 + data_size) must fit u32.
    let data_size = u32::try_from(data.len())
        .ok()
        .filter(|&n| n <= u32::MAX - (WAV_HEADER_SIZE as u32 - 8))
        .expect("payload too large for a RIFF/WAV container (max ~4 GiB)");
    let file_size = WAV_HEADER_SIZE as u32 - 8 + data_size; // RIFF chunk size

    let byte_rate = SAMPLE_RATE * NUM_CHANNELS as u32 * (BITS_PER_SAMPLE as u32 / 8);
    let block_align = NUM_CHANNELS * (BITS_PER_SAMPLE / 8);

    let mut wav = Vec::with_capacity(WAV_HEADER_SIZE + data.len());

    // RIFF header
    wav.extend_from_slice(b"RIFF");
    wav.extend_from_slice(&file_size.to_le_bytes());
    wav.extend_from_slice(b"WAVE");

    // fmt sub-chunk
    wav.extend_from_slice(b"fmt ");
    wav.extend_from_slice(&16u32.to_le_bytes()); // sub-chunk size (PCM = 16)
    wav.extend_from_slice(&1u16.to_le_bytes()); // audio format (1 = PCM)
    wav.extend_from_slice(&NUM_CHANNELS.to_le_bytes());
    wav.extend_from_slice(&SAMPLE_RATE.to_le_bytes());
    wav.extend_from_slice(&byte_rate.to_le_bytes());
    wav.extend_from_slice(&block_align.to_le_bytes());
    wav.extend_from_slice(&BITS_PER_SAMPLE.to_le_bytes());

    // data sub-chunk
    wav.extend_from_slice(b"data");
    wav.extend_from_slice(&data_size.to_le_bytes());
    wav.extend_from_slice(data);

    wav
}
48
+
49
/// Extract raw bytes from a WAV file.
/// Returns the PCM data (the original bytes) or an error.
pub fn wav_to_bytes(wav: &[u8]) -> Result<Vec<u8>, String> {
    const HEADER_LEN: usize = 44; // size of a canonical WAV header

    if wav.len() < HEADER_LEN {
        return Err("WAV data too short".to_string());
    }
    // RIFF/WAVE container signature checks.
    if &wav[0..4] != b"RIFF" {
        return Err("Not a RIFF file".to_string());
    }
    if &wav[8..12] != b"WAVE" {
        return Err("Not a WAVE file".to_string());
    }

    // Walk the chunk list after "RIFF" + size + "WAVE" until "data" appears.
    let mut cursor = 12usize;
    while cursor + 8 <= wav.len() {
        let id = &wav[cursor..cursor + 4];
        let mut size_bytes = [0u8; 4];
        size_bytes.copy_from_slice(&wav[cursor + 4..cursor + 8]);
        let size = u32::from_le_bytes(size_bytes) as usize;

        if id == b"data" {
            let start = cursor + 8;
            let end = start + size;
            // Tolerate a truncated data chunk: return whatever is present.
            let payload = if end > wav.len() { &wav[start..] } else { &wav[start..end] };
            return Ok(payload.to_vec());
        }

        // Advance past this chunk; RIFF pads odd-sized chunks to word alignment.
        cursor += 8 + size + (size & 1);
    }
    Err("data chunk not found".to_string())
}
95
+
96
/// Check if a buffer starts with a RIFF/WAVE header.
pub fn is_wav(buf: &[u8]) -> bool {
    match (buf.get(0..4), buf.get(8..12)) {
        (Some(riff), Some(wave)) => riff == b"RIFF" && wave == b"WAVE",
        _ => false,
    }
}
100
+
101
#[cfg(test)]
mod tests {
    use super::*;

    /// Encode -> decode returns the original bytes unchanged.
    #[test]
    fn test_wav_roundtrip() {
        let data = b"Hello, World! This is roxify audio container test data.";
        let wav = bytes_to_wav(data);

        // Check header
        assert_eq!(&wav[0..4], b"RIFF");
        assert_eq!(&wav[8..12], b"WAVE");
        assert_eq!(wav.len(), 44 + data.len());

        // Roundtrip
        let recovered = wav_to_bytes(&wav).expect("decode should succeed");
        assert_eq!(recovered, data);
    }

    /// Zero-length payload produces a bare 44-byte header.
    #[test]
    fn test_wav_empty() {
        let data: &[u8] = b"";
        let wav = bytes_to_wav(data);
        assert_eq!(wav.len(), 44);
        let recovered = wav_to_bytes(&wav).expect("decode empty");
        assert!(recovered.is_empty());
    }

    /// Overhead stays constant (44 bytes) regardless of payload size.
    #[test]
    fn test_wav_large() {
        let data = vec![0xAB_u8; 1024 * 1024]; // 1 MB
        let wav = bytes_to_wav(&data);
        assert_eq!(wav.len(), 44 + 1024 * 1024);
        let recovered = wav_to_bytes(&wav).expect("decode large");
        assert_eq!(recovered, data);
    }

    /// Signature check accepts RIFF+WAVE only.
    #[test]
    fn test_is_wav() {
        let wav = bytes_to_wav(b"test");
        assert!(is_wav(&wav));
        assert!(!is_wav(b"not a wav"));
        assert!(!is_wav(b"RIFF1234XXXX")); // RIFF but not WAVE
    }

    /// Decoder rejects short and non-RIFF input.
    #[test]
    fn test_invalid_wav() {
        assert!(wav_to_bytes(b"short").is_err());
        assert!(wav_to_bytes(b"NOT a RIFF file!").is_err());
    }
}
@@ -0,0 +1,145 @@
1
+ use std::time::Instant;
2
+
3
+ mod rans_byte;
4
+ mod bwt;
5
+ mod mtf;
6
+ mod context_mixing;
7
+ mod pool;
8
+ mod hybrid;
9
+
10
/// Compress `data` with the hybrid codec, decompress it, verify the
/// round-trip byte-for-byte, and print size/throughput figures.
fn bench_roundtrip(name: &str, data: &[u8]) {
    // Fixed configuration for comparability across inputs (presumably:
    // verbose = false, 4 worker threads — confirm against
    // HybridCompressor::new).
    let compressor = hybrid::HybridCompressor::new(false, 4);

    let start = Instant::now();
    let (compressed, stats) = compressor.compress(data).unwrap();
    let compress_time = start.elapsed();

    let start = Instant::now();
    let decompressed = compressor.decompress(&compressed).unwrap();
    let decompress_time = start.elapsed();

    // ratio: compressed size as a percentage of the input size.
    let ratio = (compressed.len() as f64) / (data.len() as f64) * 100.0;
    let compress_mbps = (data.len() as f64 / 1_048_576.0) / compress_time.as_secs_f64();
    let decompress_mbps = (data.len() as f64 / 1_048_576.0) / decompress_time.as_secs_f64();

    // Correctness gate: the whole benchmark aborts if the codec is lossy.
    assert_eq!(decompressed, data, "ROUND-TRIP FAILED for {}", name);

    println!("=== {} ===", name);
    println!(" Input: {} bytes", data.len());
    println!(" Compressed: {} bytes ({:.1}%)", compressed.len(), ratio);
    println!(" Reduction: {:.1}%", 100.0 - ratio);
    println!(" Compress: {:.1} ms ({:.1} MB/s)", compress_time.as_secs_f64() * 1000.0, compress_mbps);
    println!(" Decompress: {:.1} ms ({:.1} MB/s)", decompress_time.as_secs_f64() * 1000.0, decompress_mbps);
    println!(" Entropy: {:.2} bits/byte", stats.entropy_bits);
    println!();
}
36
+
37
/// Baseline: compress/decompress `data` with zstd at `level` and print the
/// same size/throughput figures as `bench_roundtrip` for comparison.
fn bench_zstd(name: &str, data: &[u8], level: i32) {
    let start = Instant::now();
    let compressed = zstd::encode_all(std::io::Cursor::new(data), level).unwrap();
    let compress_time = start.elapsed();

    let start = Instant::now();
    let decompressed = zstd::decode_all(std::io::Cursor::new(&compressed)).unwrap();
    let decompress_time = start.elapsed();

    let ratio = (compressed.len() as f64) / (data.len() as f64) * 100.0;
    let compress_mbps = (data.len() as f64 / 1_048_576.0) / compress_time.as_secs_f64();
    let decompress_mbps = (data.len() as f64 / 1_048_576.0) / decompress_time.as_secs_f64();

    // zstd must round-trip losslessly as well.
    assert_eq!(decompressed, data);

    println!("=== Zstd L{} ({}) ===", level, name);
    println!(" Compressed: {} bytes ({:.1}%)", compressed.len(), ratio);
    println!(" Reduction: {:.1}%", 100.0 - ratio);
    println!(" Compress: {:.1} ms ({:.1} MB/s)", compress_time.as_secs_f64() * 1000.0, compress_mbps);
    println!(" Decompress: {:.1} ms ({:.1} MB/s)", decompress_time.as_secs_f64() * 1000.0, decompress_mbps);
    println!();
}
59
+
60
/// Benchmark driver: runs the hybrid codec and zstd (levels 3 and 19)
/// over a spread of synthetic inputs — repetitive text, JSON, pseudo-random
/// (incompressible) and structured binary data.
fn main() {
    println!("╔══════════════════════════════════════════════════════════╗");
    println!("║ ROXIFY BWT-ANS COMPRESSION BENCHMARK ║");
    println!("╚══════════════════════════════════════════════════════════╝\n");

    // ~1 KB of a single repeated sentence (highly compressible).
    let text_1k: Vec<u8> = "Hello World! This is a test of the roxify compression engine. ".repeat(16).into_bytes();
    bench_roundtrip("Text 1KB", &text_1k);
    bench_zstd("Text 1KB", &text_1k, 3);
    bench_zstd("Text 1KB", &text_1k, 19);

    // ~100 KB of repeated pangrams.
    let text_100k: Vec<u8> = "The quick brown fox jumps over the lazy dog. Pack my box with five dozen liquor jugs. ".repeat(1200).into_bytes();
    bench_roundtrip("Text 100KB", &text_100k);
    bench_zstd("Text 100KB", &text_100k, 3);
    bench_zstd("Text 100KB", &text_100k, 19);

    // Exactly 1 MiB of cycled lorem-ipsum phrases.
    let text_1m: Vec<u8> = {
        let mut data = Vec::with_capacity(1_048_576);
        let phrases = [
            b"Lorem ipsum dolor sit amet, consectetur adipiscing elit. ".as_slice(),
            b"Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ".as_slice(),
            b"Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris. ".as_slice(),
            b"Duis aute irure dolor in reprehenderit in voluptate velit esse. ".as_slice(),
            b"Excepteur sint occaecat cupidatat non proident, sunt in culpa. ".as_slice(),
        ];
        let mut i = 0;
        while data.len() < 1_048_576 {
            data.extend_from_slice(phrases[i % phrases.len()]);
            i += 1;
        }
        data.truncate(1_048_576);
        data
    };
    bench_roundtrip("Text 1MB", &text_1m);
    bench_zstd("Text 1MB", &text_1m, 3);
    bench_zstd("Text 1MB", &text_1m, 19);

    // ~500 KB JSON array of 5000 synthetic user records.
    let json_data: Vec<u8> = {
        let mut data = String::with_capacity(512_000);
        data.push('[');
        for i in 0..5000 {
            if i > 0 { data.push(','); }
            data.push_str(&format!(
                r#"{{"id":{},"name":"user_{}","email":"user{}@example.com","active":{},"score":{:.2},"tags":["tag1","tag2","tag3"]}}"#,
                i, i, i, i % 2 == 0, (i as f64) * 1.337
            ));
        }
        data.push(']');
        data.into_bytes()
    };
    bench_roundtrip("JSON 500KB", &json_data);
    bench_zstd("JSON 500KB", &json_data, 3);
    bench_zstd("JSON 500KB", &json_data, 19);

    // 100 KB of deterministic pseudo-random bytes (64-bit LCG, MMIX
    // constants) — essentially incompressible worst case.
    let random_data: Vec<u8> = {
        let mut data = vec![0u8; 100_000];
        let mut state = 12345u64;
        for b in data.iter_mut() {
            state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
            *b = (state >> 33) as u8;
        }
        data
    };
    bench_roundtrip("Random 100KB", &random_data);
    bench_zstd("Random 100KB", &random_data, 3);

    // 256 KB of mixed structured binary: runs, counters, ASCII, filler.
    let binary_data: Vec<u8> = {
        let mut data = Vec::with_capacity(256_000);
        for i in 0..256_000u32 {
            match i % 7 {
                0 => data.push(0),
                1 => data.push(0xFF),
                2 => data.push((i & 0xFF) as u8),
                3 => data.push(((i >> 8) & 0xFF) as u8),
                4 => data.push(b'A' + (i % 26) as u8),
                5 => data.push(0x20),
                _ => data.push((i.wrapping_mul(37) & 0xFF) as u8),
            }
        }
        data
    };
    bench_roundtrip("Binary 256KB", &binary_data);
    bench_zstd("Binary 256KB", &binary_data, 3);
    bench_zstd("Binary 256KB", &binary_data, 19);

    println!("All round-trip tests PASSED!");
}
package/native/bwt.rs ADDED
@@ -0,0 +1,56 @@
1
+ use anyhow::Result;
2
+ use libsais::bwt::Bwt;
3
+ use libsais::typestate::OwnedBuffer;
4
+ use libsais::BwtConstruction;
5
+ use rayon::prelude::*;
6
+
7
/// Output of a forward Burrows-Wheeler transform.
pub struct BwtResult {
    /// The BWT-permuted bytes (same length as the input).
    pub transformed: Vec<u8>,
    /// Primary index returned by libsais; required by `bwt_decode`
    /// to invert the transform.
    pub primary_index: u32,
}
11
+
12
/// Forward BWT of `data` via libsais (suffix-array based construction).
///
/// Empty input yields an empty transform with `primary_index` 0.
///
/// NOTE(review): uses the 32-bit temporary-array variant, which presumably
/// caps the input size around i32::MAX bytes — confirm against the libsais
/// docs if blocks larger than 2 GiB are possible.
pub fn bwt_encode(data: &[u8]) -> Result<BwtResult> {
    let n = data.len();
    if n == 0 {
        return Ok(BwtResult { transformed: Vec::new(), primary_index: 0 });
    }

    let bwt_result = BwtConstruction::for_text(data)
        .with_owned_temporary_array_buffer32()
        .single_threaded()
        .run()
        .map_err(|e| anyhow::anyhow!("libsais BWT: {:?}", e))?;

    let primary_index = bwt_result.primary_index() as u32;
    let transformed = bwt_result.bwt().to_vec();

    Ok(BwtResult { transformed, primary_index })
}
29
+
30
/// Invert a BWT produced by [`bwt_encode`], recovering the original bytes.
///
/// `primary_index` must be the index returned alongside the transform.
pub fn bwt_decode(bwt_data: &[u8], primary_index: u32) -> Result<Vec<u8>> {
    // Empty input round-trips to empty output (mirrors bwt_encode).
    if bwt_data.is_empty() {
        return Ok(Vec::new());
    }

    // SAFETY/NOTE(review): `Bwt::from_parts` is unsafe because it trusts
    // that `primary_index` is a valid primary index for exactly this BWT
    // buffer. If `bwt_data`/`primary_index` can come from untrusted input,
    // confirm that libsais tolerates an out-of-range index rather than
    // exhibiting undefined behavior.
    let bwt_obj: Bwt<'static, u8, OwnedBuffer> =
        unsafe { Bwt::from_parts(bwt_data.to_vec(), primary_index as usize) };

    let text = bwt_obj
        .unbwt()
        .with_owned_temporary_array_buffer32()
        .single_threaded()
        .run()
        .map_err(|e| anyhow::anyhow!("libsais UnBWT: {:?}", e))?;

    Ok(text.as_slice().to_vec())
}
47
+
48
+ pub fn bwt_encode_streaming(block_size: usize, data: &[u8]) -> Result<Vec<(BwtResult, usize)>> {
49
+ data.par_chunks(block_size)
50
+ .enumerate()
51
+ .map(|(i, chunk)| {
52
+ let result = bwt_encode(chunk)?;
53
+ Ok((result, i * block_size))
54
+ })
55
+ .collect()
56
+ }
@@ -0,0 +1,117 @@
1
/// Binary symbol counts (zeros, ones, total observations).
#[derive(Clone, Copy, Debug)]
pub struct ProbabilityEstimate {
    pub p0: u32,
    pub p1: u32,
    pub total: u32,
}

impl ProbabilityEstimate {
    /// Shannon entropy (in bits) of the binary distribution implied by the
    /// counts. Returns 0.0 when there are no observations.
    pub fn entropy_bits(&self) -> f32 {
        if self.total == 0 {
            return 0.0;
        }
        let denom = self.total as f32;
        // -p*log2(p), with the p == 0 term defined as 0.
        let term = |count: u32| -> f32 {
            let p = count as f32 / denom;
            if p > 0.0 { -(p * p.log2()) } else { 0.0 }
        };
        term(self.p0) + term(self.p1)
    }
}
26
+
27
+ pub struct ContextMixer {
28
+ contexts_order0: Vec<ProbabilityEstimate>,
29
+ contexts_order1: Vec<[ProbabilityEstimate; 256]>,
30
+ contexts_order2: Vec<[[ProbabilityEstimate; 256]; 256]>,
31
+ }
32
+
33
+ impl ContextMixer {
34
+ pub fn new() -> Self {
35
+ ContextMixer {
36
+ contexts_order0: vec![ProbabilityEstimate { p0: 1, p1: 1, total: 2 }; 1],
37
+ contexts_order1: vec![
38
+ [ProbabilityEstimate { p0: 1, p1: 1, total: 2 }; 256];
39
+ 256
40
+ ],
41
+ contexts_order2: vec![
42
+ [[ProbabilityEstimate { p0: 1, p1: 1, total: 2 }; 256]; 256];
43
+ 256
44
+ ],
45
+ }
46
+ }
47
+
48
+ pub fn predict_order0(&self) -> ProbabilityEstimate {
49
+ self.contexts_order0[0]
50
+ }
51
+
52
+ pub fn predict_order1(&self, context1: u8) -> ProbabilityEstimate {
53
+ self.contexts_order1[context1 as usize][0]
54
+ }
55
+
56
+ pub fn predict_order2(&self, context1: u8, context2: u8) -> ProbabilityEstimate {
57
+ self.contexts_order2[context1 as usize][context2 as usize][0]
58
+ }
59
+
60
+ pub fn update_order0(&mut self, bit: bool) {
61
+ let ctx = &mut self.contexts_order0[0];
62
+ if bit {
63
+ ctx.p1 += 1;
64
+ } else {
65
+ ctx.p0 += 1;
66
+ }
67
+ ctx.total += 1;
68
+ }
69
+
70
+ pub fn update_order1(&mut self, context1: u8, bit: bool) {
71
+ let ctx = &mut self.contexts_order1[context1 as usize][0];
72
+ if bit {
73
+ ctx.p1 += 1;
74
+ } else {
75
+ ctx.p0 += 1;
76
+ }
77
+ ctx.total += 1;
78
+ }
79
+
80
+ pub fn update_order2(&mut self, context1: u8, context2: u8, bit: bool) {
81
+ let ctx = &mut self.contexts_order2[context1 as usize][context2 as usize][0];
82
+ if bit {
83
+ ctx.p1 += 1;
84
+ } else {
85
+ ctx.p0 += 1;
86
+ }
87
+ ctx.total += 1;
88
+ }
89
+ }
90
+
91
/// Shannon entropy of the byte-value distribution in `data`, in bits per
/// byte. Returns 0.0 for empty input.
pub fn analyze_entropy(data: &[u8]) -> f32 {
    // Histogram of byte values.
    let mut counts = [0u32; 256];
    for &byte in data {
        counts[byte as usize] += 1;
    }

    let total = data.len() as f32;
    if total == 0.0 {
        return 0.0;
    }

    // Sum -p*log2(p) over the occupied bins, reusing the reciprocal so
    // each probability costs one multiply.
    let inv_total = 1.0 / total;
    counts
        .iter()
        .filter(|&&count| count > 0)
        .map(|&count| {
            let p = count as f32 * inv_total;
            -(p * p.log2())
        })
        .sum()
}
112
+
113
+ pub fn estimate_compression_gain(original: &[u8], entropy_bits: f32) -> f64 {
114
+ let theoretical_min = (original.len() as f64) * (entropy_bits as f64) / 8.0;
115
+ let ratio = theoretical_min / (original.len() as f64);
116
+ (1.0 - ratio) * 100.0
117
+ }