rlz4 0.1.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +2 -3
- data/README.md +32 -3
- data/ext/rlz4/Cargo.toml +2 -2
- data/ext/rlz4/src/lib.rs +86 -35
- data/lib/rlz4/version.rb +1 -1
- data/lib/rlz4.rb +16 -0
- data/tmp/x86_64-linux/stage/ext/rlz4/Cargo.toml +2 -2
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8393efee154a550c5eb3889849ccb3d39867cac460c19176ea7a11d1e4be1595
|
|
4
|
+
data.tar.gz: 59cbb3f50ab09db3c9e252634da446448436581dd72767d6d9e5c4f9d164f27a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9eb27c1712c68e370697eb3edb1e92ff311769651605f04b0157ed180d0419467e9d4e0432fc4d07b80dca4d8cbcc3e349888bd81fd73321e806dac87d870736
|
|
7
|
+
data.tar.gz: 30ac184b6921cee53e17c064bddaadea96e39789b38893d68a8cfd8816531d1964fdc30ed53a28a1641a3e32670deab2c8f403d4a85584efb8fc4f66bc5a1eab
|
data/Cargo.lock
CHANGED
|
@@ -107,8 +107,7 @@ dependencies = [
|
|
|
107
107
|
[[package]]
|
|
108
108
|
name = "lz4_flex"
|
|
109
109
|
version = "0.13.0"
|
|
110
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
111
|
-
checksum = "db9a0d582c2874f68138a16ce1867e0ffde6c0bb0a0df85e1f36d04146db488a"
|
|
110
|
+
source = "git+https://github.com/paddor/lz4_flex.git?rev=dae9c784e890591e6445135ba23cacf344eafe8f#dae9c784e890591e6445135ba23cacf344eafe8f"
|
|
112
111
|
dependencies = [
|
|
113
112
|
"twox-hash",
|
|
114
113
|
]
|
|
@@ -237,7 +236,7 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
|
|
|
237
236
|
|
|
238
237
|
[[package]]
|
|
239
238
|
name = "rlz4"
|
|
240
|
-
version = "0.
|
|
239
|
+
version = "0.2.0"
|
|
241
240
|
dependencies = [
|
|
242
241
|
"lz4_flex",
|
|
243
242
|
"magnus",
|
data/README.md
CHANGED
|
@@ -55,9 +55,10 @@ end
|
|
|
55
55
|
|
|
56
56
|
For workloads where many small messages share a common prefix (e.g. ZMQ
|
|
57
57
|
messages with a fixed header), a shared dictionary massively improves the
|
|
58
|
-
compression ratio. `RLZ4::Dictionary`
|
|
59
|
-
|
|
60
|
-
`
|
|
58
|
+
compression ratio. `RLZ4::Dictionary#compress` emits a **real LZ4 frame**
|
|
59
|
+
(magic `04 22 4D 18`) with the `FLG.DictID` bit set and the dictionary's
|
|
60
|
+
`Dict_ID` written into the FrameDescriptor — interoperable with the
|
|
61
|
+
reference `lz4` CLI given the same dictionary file (`lz4 -d -D dict.bin`).
|
|
61
62
|
|
|
62
63
|
```ruby
|
|
63
64
|
dict = RLZ4::Dictionary.new("schema=v1 type=message field1=")
|
|
@@ -66,11 +67,39 @@ compressed = dict.compress("schema=v1 type=message field1=payload")
|
|
|
66
67
|
decompressed = dict.decompress(compressed)
|
|
67
68
|
|
|
68
69
|
dict.size # => 30
|
|
70
|
+
dict.id # => u32 Dict_ID
|
|
69
71
|
```
|
|
70
72
|
|
|
71
73
|
`RLZ4::Dictionary` is immutable after construction and can be shared across
|
|
72
74
|
Ractors.
|
|
73
75
|
|
|
76
|
+
## Dictionary IDs
|
|
77
|
+
|
|
78
|
+
`Dictionary#id` is a `u32` derived from the first 4 bytes of `sha256(dict_bytes)`,
|
|
79
|
+
interpreted little-endian. The LZ4 frame spec defines `Dict_ID` as
|
|
80
|
+
an application-defined field with no reserved ranges and no central
|
|
81
|
+
registrar, so the full `u32` space is usable.
|
|
82
|
+
|
|
83
|
+
The id **is on the wire**: `Dictionary#compress` sets `FLG.DictID = 1`
|
|
84
|
+
and writes the id into the FrameDescriptor. On decode, `rlz4` parses
|
|
85
|
+
the incoming frame's `Dict_ID` and asserts it matches
|
|
86
|
+
`Dictionary#id` before touching the payload. Receivers that maintain
|
|
87
|
+
multiple dictionaries can therefore route incoming frames to the
|
|
88
|
+
right one purely by parsing the frame header — no out-of-band id
|
|
89
|
+
channel needed.
|
|
90
|
+
|
|
91
|
+
LZ4 dictionaries are always raw bytes (unlike Zstd, there is no
|
|
92
|
+
dict-file header format), so there is no header to parse an id out
|
|
93
|
+
of. If you need sender and receiver to agree on an id without
|
|
94
|
+
shipping it out-of-band, deriving it deterministically from the
|
|
95
|
+
dict bytes — which is what `Dictionary.new` does — is the simplest
|
|
96
|
+
option.
|
|
97
|
+
|
|
98
|
+
Dictionary training from a sample corpus is **not supported**: LZ4
|
|
99
|
+
has no equivalent of Zstd's `ZDICT_trainFromBuffer`. Dictionaries
|
|
100
|
+
are supplied by the caller as raw bytes (typically a hand-picked
|
|
101
|
+
prefix or a representative message).
|
|
102
|
+
|
|
74
103
|
### Ractors
|
|
75
104
|
|
|
76
105
|
Both the module functions and `RLZ4::Dictionary` can be used from any
|
data/ext/rlz4/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "rlz4"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
edition = "2021"
|
|
5
5
|
|
|
6
6
|
[lib]
|
|
@@ -8,7 +8,7 @@ name = "rlz4"
|
|
|
8
8
|
crate-type = ["cdylib", "rlib"]
|
|
9
9
|
|
|
10
10
|
[dependencies]
|
|
11
|
-
lz4_flex = {
|
|
11
|
+
lz4_flex = { git = "https://github.com/paddor/lz4_flex.git", rev = "dae9c784e890591e6445135ba23cacf344eafe8f", default-features = false, features = ["frame", "std", "safe-encode", "safe-decode"] }
|
|
12
12
|
magnus = "0.8"
|
|
13
13
|
rb-sys = "0.9"
|
|
14
14
|
|
data/ext/rlz4/src/lib.rs
CHANGED
|
@@ -78,35 +78,56 @@ fn rlz4_decompress(ruby: &Ruby, rb_input: RString) -> Result<RString, Error> {
|
|
|
78
78
|
Ok(ruby.str_from_slice(&out))
|
|
79
79
|
}
|
|
80
80
|
|
|
81
|
-
// ---------- Dictionary:
|
|
81
|
+
// ---------- Dictionary: dict-bound LZ4 frame compression ----------
|
|
82
82
|
//
|
|
83
|
-
// lz4_flex's
|
|
84
|
-
//
|
|
85
|
-
//
|
|
86
|
-
//
|
|
83
|
+
// Backed by lz4_flex's `FrameEncoder::with_dictionary` /
|
|
84
|
+
// `FrameDecoder::with_dictionary` (added in our fork). Output is a real
|
|
85
|
+
// LZ4 frame with the FLG.DictID bit set and `Dict_ID` written into the
|
|
86
|
+
// FrameDescriptor — interoperable with the reference `lz4` CLI given the
|
|
87
|
+
// same dictionary file.
|
|
87
88
|
//
|
|
88
|
-
//
|
|
89
|
-
//
|
|
89
|
+
// `Dict_ID` is supplied by the caller (the Ruby wrapper in `lib/rlz4.rb`
|
|
90
|
+
// derives it from `sha256(dict_bytes)[0..4]` interpreted little-endian).
|
|
91
|
+
// Doing the digest in Ruby keeps a hash crate out of the Rust extension's
|
|
92
|
+
// dependency tree.
|
|
90
93
|
#[magnus::wrap(class = "RLZ4::Dictionary", free_immediately, size)]
|
|
91
94
|
struct Dictionary {
|
|
92
95
|
bytes: Vec<u8>,
|
|
96
|
+
id: u32,
|
|
93
97
|
}
|
|
94
98
|
|
|
95
|
-
// Safety: Dictionary is read-only after construction (just a byte buffer
|
|
96
|
-
// No interior mutability, no
|
|
99
|
+
// Safety: Dictionary is read-only after construction (just a byte buffer
|
|
100
|
+
// plus a derived id). No interior mutability, no thread-local refs.
|
|
97
101
|
unsafe impl Send for Dictionary {}
|
|
98
102
|
unsafe impl Sync for Dictionary {}
|
|
99
103
|
|
|
100
|
-
fn dict_initialize(_ruby: &Ruby, rb_dict: RString) -> Result<Dictionary, Error> {
|
|
104
|
+
fn dict_initialize(_ruby: &Ruby, rb_dict: RString, id: u32) -> Result<Dictionary, Error> {
|
|
101
105
|
// SAFETY: copy bytes into an owned Vec before any Ruby allocation.
|
|
102
106
|
let bytes: Vec<u8> = unsafe { rb_dict.as_slice().to_vec() };
|
|
103
107
|
rb_dict.freeze();
|
|
104
|
-
Ok(Dictionary { bytes })
|
|
108
|
+
Ok(Dictionary { bytes, id })
|
|
105
109
|
}
|
|
106
110
|
|
|
107
111
|
fn dict_compress(ruby: &Ruby, rb_self: &Dictionary, rb_input: RString) -> Result<RString, Error> {
|
|
108
112
|
let input: Vec<u8> = unsafe { rb_input.as_slice().to_vec() };
|
|
109
|
-
let
|
|
113
|
+
let upper = lz4_flex::block::get_maximum_output_size(input.len()) + 64;
|
|
114
|
+
let mut encoder = lz4_flex::frame::FrameEncoder::with_dictionary(
|
|
115
|
+
Vec::with_capacity(upper),
|
|
116
|
+
&rb_self.bytes,
|
|
117
|
+
rb_self.id,
|
|
118
|
+
);
|
|
119
|
+
encoder.write_all(&input).map_err(|e| {
|
|
120
|
+
Error::new(
|
|
121
|
+
ruby.exception_runtime_error(),
|
|
122
|
+
format!("lz4 dict frame encode write failed: {e}"),
|
|
123
|
+
)
|
|
124
|
+
})?;
|
|
125
|
+
let compressed = encoder.finish().map_err(|e| {
|
|
126
|
+
Error::new(
|
|
127
|
+
ruby.exception_runtime_error(),
|
|
128
|
+
format!("lz4 dict frame encode finish failed: {e}"),
|
|
129
|
+
)
|
|
130
|
+
})?;
|
|
110
131
|
Ok(ruby.str_from_slice(&compressed))
|
|
111
132
|
}
|
|
112
133
|
|
|
@@ -116,13 +137,25 @@ fn dict_decompress(
|
|
|
116
137
|
rb_input: RString,
|
|
117
138
|
) -> Result<RString, Error> {
|
|
118
139
|
let compressed: Vec<u8> = unsafe { rb_input.as_slice().to_vec() };
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
140
|
+
if compressed.len() < LZ4_FRAME_MAGIC.len() || compressed[..4] != LZ4_FRAME_MAGIC {
|
|
141
|
+
return Err(Error::new(
|
|
142
|
+
decompress_error(ruby),
|
|
143
|
+
"lz4 dict frame decode failed: bad magic (input is not an LZ4 frame)",
|
|
144
|
+
));
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
let mut decoder = lz4_flex::frame::FrameDecoder::with_dictionary(
|
|
148
|
+
&compressed[..],
|
|
149
|
+
&rb_self.bytes,
|
|
150
|
+
rb_self.id,
|
|
151
|
+
);
|
|
152
|
+
let mut out = Vec::new();
|
|
153
|
+
decoder.read_to_end(&mut out).map_err(|e| {
|
|
154
|
+
Error::new(
|
|
155
|
+
decompress_error(ruby),
|
|
156
|
+
format!("lz4 dict frame decode failed: {e}"),
|
|
157
|
+
)
|
|
158
|
+
})?;
|
|
126
159
|
Ok(ruby.str_from_slice(&out))
|
|
127
160
|
}
|
|
128
161
|
|
|
@@ -130,6 +163,10 @@ fn dict_size(rb_self: &Dictionary) -> usize {
|
|
|
130
163
|
rb_self.bytes.len()
|
|
131
164
|
}
|
|
132
165
|
|
|
166
|
+
fn dict_id(rb_self: &Dictionary) -> u32 {
|
|
167
|
+
rb_self.id
|
|
168
|
+
}
|
|
169
|
+
|
|
133
170
|
// ---------- module init ----------
|
|
134
171
|
|
|
135
172
|
#[magnus::init]
|
|
@@ -152,10 +189,13 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
152
189
|
module.define_module_function("decompress", function!(rlz4_decompress, 1))?;
|
|
153
190
|
|
|
154
191
|
let dict_class = module.define_class("Dictionary", ruby.class_object())?;
|
|
155
|
-
|
|
192
|
+
// Bound as `_native_new(bytes, id)`. Ruby's `RLZ4::Dictionary.new(bytes)`
|
|
193
|
+
// computes the id and forwards — see `lib/rlz4.rb`.
|
|
194
|
+
dict_class.define_singleton_method("_native_new", function!(dict_initialize, 2))?;
|
|
156
195
|
dict_class.define_method("compress", method!(dict_compress, 1))?;
|
|
157
196
|
dict_class.define_method("decompress", method!(dict_decompress, 1))?;
|
|
158
197
|
dict_class.define_method("size", method!(dict_size, 0))?;
|
|
198
|
+
dict_class.define_method("id", method!(dict_id, 0))?;
|
|
159
199
|
|
|
160
200
|
Ok(())
|
|
161
201
|
}
|
|
@@ -202,25 +242,36 @@ mod tests {
|
|
|
202
242
|
}
|
|
203
243
|
|
|
204
244
|
#[test]
|
|
205
|
-
fn
|
|
206
|
-
let dict = b"JSON schema version 1 field ";
|
|
207
|
-
let
|
|
208
|
-
let
|
|
209
|
-
|
|
245
|
+
fn frame_dict_round_trip() {
|
|
246
|
+
let dict = b"JSON schema version 1 field ".repeat(4);
|
|
247
|
+
let id: u32 = 0xDEAD_BEEF;
|
|
248
|
+
let msg = b"JSON schema version 1 field name=hello value=world".to_vec();
|
|
249
|
+
|
|
250
|
+
let mut enc = lz4_flex::frame::FrameEncoder::with_dictionary(Vec::new(), &dict, id);
|
|
251
|
+
enc.write_all(&msg).unwrap();
|
|
252
|
+
let ct = enc.finish().unwrap();
|
|
253
|
+
assert_eq!(&ct[..4], &[0x04, 0x22, 0x4d, 0x18]);
|
|
254
|
+
|
|
255
|
+
let mut dec = lz4_flex::frame::FrameDecoder::with_dictionary(&*ct, &dict, id);
|
|
256
|
+
let mut pt = Vec::new();
|
|
257
|
+
dec.read_to_end(&mut pt).unwrap();
|
|
210
258
|
assert_eq!(pt, msg);
|
|
211
259
|
}
|
|
212
260
|
|
|
213
261
|
#[test]
|
|
214
|
-
fn
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
let dict_b = b"common prefix BBB ";
|
|
262
|
+
fn frame_dict_id_mismatch_fails() {
|
|
263
|
+
let dict_a = b"common prefix AAA ".repeat(4);
|
|
264
|
+
let dict_b = b"common prefix BBB ".repeat(4);
|
|
265
|
+
|
|
219
266
|
let msg = b"common prefix AAA : the payload";
|
|
220
|
-
let
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
267
|
+
let mut enc =
|
|
268
|
+
lz4_flex::frame::FrameEncoder::with_dictionary(Vec::new(), &dict_a, 0xAAAA_AAAA);
|
|
269
|
+
enc.write_all(msg).unwrap();
|
|
270
|
+
let ct = enc.finish().unwrap();
|
|
271
|
+
|
|
272
|
+
let mut dec =
|
|
273
|
+
lz4_flex::frame::FrameDecoder::with_dictionary(&*ct, &dict_b, 0xBBBB_BBBB);
|
|
274
|
+
let mut out = Vec::new();
|
|
275
|
+
assert!(dec.read_to_end(&mut out).is_err());
|
|
225
276
|
}
|
|
226
277
|
}
|
data/lib/rlz4/version.rb
CHANGED
data/lib/rlz4.rb
CHANGED
|
@@ -1,4 +1,20 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "digest"
|
|
4
|
+
|
|
3
5
|
require_relative "rlz4/rlz4"
|
|
4
6
|
require_relative "rlz4/version"
|
|
7
|
+
|
|
8
|
+
module RLZ4
|
|
9
|
+
class Dictionary
|
|
10
|
+
# Public constructor. Derives the LZ4 frame `Dict_ID` from the dictionary
|
|
11
|
+
# bytes (sha256 truncated to the first 4 bytes, little-endian) and forwards
|
|
12
|
+
# to the Rust extension. The id is what gets written into every emitted
|
|
13
|
+
# frame's FrameDescriptor and what `#decompress` asserts the incoming
|
|
14
|
+
# frame declares before decoding.
|
|
15
|
+
def self.new(bytes)
|
|
16
|
+
id = Digest::SHA256.digest(bytes).byteslice(0, 4).unpack1("V")
|
|
17
|
+
_native_new(bytes, id)
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
data/tmp/x86_64-linux/stage/ext/rlz4/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "rlz4"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
edition = "2021"
|
|
5
5
|
|
|
6
6
|
[lib]
|
|
@@ -8,7 +8,7 @@ name = "rlz4"
|
|
|
8
8
|
crate-type = ["cdylib", "rlib"]
|
|
9
9
|
|
|
10
10
|
[dependencies]
|
|
11
|
-
lz4_flex = {
|
|
11
|
+
lz4_flex = { git = "https://github.com/paddor/lz4_flex.git", rev = "dae9c784e890591e6445135ba23cacf344eafe8f", default-features = false, features = ["frame", "std", "safe-encode", "safe-decode"] }
|
|
12
12
|
magnus = "0.8"
|
|
13
13
|
rb-sys = "0.9"
|
|
14
14
|
|