rlz4 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cc8e8cccbcbf8fd18231558a8e9486e630dadd9287f6a424d3565790ec2b28a4
4
- data.tar.gz: e99944e60c12edb16ced111597ee628274c5fee7f977d6d6f1a57ca2217bfefa
3
+ metadata.gz: 8393efee154a550c5eb3889849ccb3d39867cac460c19176ea7a11d1e4be1595
4
+ data.tar.gz: 59cbb3f50ab09db3c9e252634da446448436581dd72767d6d9e5c4f9d164f27a
5
5
  SHA512:
6
- metadata.gz: f548b432680ad0e5c12ae606696049f3710be1d26211973103d0607ee9241af8d786a95c8eb12e4bf5b52230387a9873bfd524913236f7be3a04672c98130b2b
7
- data.tar.gz: f745e2487bff140a571d18670b23b22b77acfcaa66174bfa72cb740efdc2c708905414c1fce3ce6e31fe634446c523c2e257cbcecddc74babdd956634f916a68
6
+ metadata.gz: 9eb27c1712c68e370697eb3edb1e92ff311769651605f04b0157ed180d0419467e9d4e0432fc4d07b80dca4d8cbcc3e349888bd81fd73321e806dac87d870736
7
+ data.tar.gz: 30ac184b6921cee53e17c064bddaadea96e39789b38893d68a8cfd8816531d1964fdc30ed53a28a1641a3e32670deab2c8f403d4a85584efb8fc4f66bc5a1eab
data/Cargo.lock CHANGED
@@ -107,8 +107,7 @@ dependencies = [
107
107
  [[package]]
108
108
  name = "lz4_flex"
109
109
  version = "0.13.0"
110
- source = "registry+https://github.com/rust-lang/crates.io-index"
111
- checksum = "db9a0d582c2874f68138a16ce1867e0ffde6c0bb0a0df85e1f36d04146db488a"
110
+ source = "git+https://github.com/paddor/lz4_flex.git?rev=dae9c784e890591e6445135ba23cacf344eafe8f#dae9c784e890591e6445135ba23cacf344eafe8f"
112
111
  dependencies = [
113
112
  "twox-hash",
114
113
  ]
@@ -237,7 +236,7 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
237
236
 
238
237
  [[package]]
239
238
  name = "rlz4"
240
- version = "0.1.0"
239
+ version = "0.2.0"
241
240
  dependencies = [
242
241
  "lz4_flex",
243
242
  "magnus",
data/README.md CHANGED
@@ -55,9 +55,10 @@ end
55
55
 
56
56
  For workloads where many small messages share a common prefix (e.g. ZMQ
57
57
  messages with a fixed header), a shared dictionary massively improves the
58
- compression ratio. `RLZ4::Dictionary` uses LZ4 **block** format with the
59
- original size prepended — this is a different wire format from
60
- `RLZ4.compress` and is not interoperable with it.
58
+ compression ratio. `RLZ4::Dictionary#compress` emits a **real LZ4 frame**
59
+ (magic `04 22 4D 18`) with the `FLG.DictID` bit set and the dictionary's
60
+ `Dict_ID` written into the FrameDescriptor — interoperable with the
61
+ reference `lz4` CLI given the same dictionary file (`lz4 -d -D dict.bin`).
61
62
 
62
63
  ```ruby
63
64
  dict = RLZ4::Dictionary.new("schema=v1 type=message field1=")
@@ -66,11 +67,39 @@ compressed = dict.compress("schema=v1 type=message field1=payload")
66
67
  decompressed = dict.decompress(compressed)
67
68
 
68
69
  dict.size # => 30
70
+ dict.id # => u32 Dict_ID
69
71
  ```
70
72
 
71
73
  `RLZ4::Dictionary` is immutable after construction and can be shared across
72
74
  Ractors.
73
75
 
76
+ ## Dictionary IDs
77
+
78
+ `Dictionary#id` is a `u32` derived from `sha256(dict_bytes)[0..4]`
79
+ interpreted little-endian. The LZ4 frame spec defines `Dict_ID` as
80
+ an application-defined field with no reserved ranges and no central
81
+ registrar, so the full `u32` space is usable.
82
+
83
+ The id **is on the wire**: `Dictionary#compress` sets `FLG.DictID = 1`
84
+ and writes the id into the FrameDescriptor. On decode, `rlz4` parses
85
+ the incoming frame's `Dict_ID` and asserts it matches
86
+ `Dictionary#id` before touching the payload. Receivers that maintain
87
+ multiple dictionaries can therefore route incoming frames to the
88
+ right one purely by parsing the frame header — no out-of-band id
89
+ channel needed.
90
+
91
+ LZ4 dictionaries are always raw bytes (unlike Zstd, there is no
92
+ dict-file header format), so there is no header to parse an id out
93
+ of. If you need sender and receiver to agree on an id without
94
+ shipping it out-of-band, deriving it deterministically from the
95
+ dict bytes — which is what `Dictionary.new` does — is the simplest
96
+ option.
97
+
98
+ Dictionary training from a sample corpus is **not supported**: LZ4
99
+ has no equivalent of Zstd's `ZDICT_trainFromBuffer`. Dictionaries
100
+ are supplied by the caller as raw bytes (typically a hand-picked
101
+ prefix or a representative message).
102
+
74
103
  ### Ractors
75
104
 
76
105
  Both the module functions and `RLZ4::Dictionary` can be used from any
data/ext/rlz4/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "rlz4"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  edition = "2021"
5
5
 
6
6
  [lib]
@@ -8,7 +8,7 @@ name = "rlz4"
8
8
  crate-type = ["cdylib", "rlib"]
9
9
 
10
10
  [dependencies]
11
- lz4_flex = { version = "0.13", default-features = false, features = ["frame", "std", "safe-encode", "safe-decode"] }
11
+ lz4_flex = { git = "https://github.com/paddor/lz4_flex.git", rev = "dae9c784e890591e6445135ba23cacf344eafe8f", default-features = false, features = ["frame", "std", "safe-encode", "safe-decode"] }
12
12
  magnus = "0.8"
13
13
  rb-sys = "0.9"
14
14
 
data/ext/rlz4/src/lib.rs CHANGED
@@ -78,35 +78,56 @@ fn rlz4_decompress(ruby: &Ruby, rb_input: RString) -> Result<RString, Error> {
78
78
  Ok(ruby.str_from_slice(&out))
79
79
  }
80
80
 
81
- // ---------- Dictionary: block-format compression with a shared dictionary ----------
81
+ // ---------- Dictionary: dict-bound LZ4 frame compression ----------
82
82
  //
83
- // lz4_flex's frame format does not implement dictionary-based compression
84
- // (FrameInfo::dict_id is metadata-only). For the small-ZMQ-message use case
85
- // that motivates this class, block format with a prepended size is a better
86
- // fit anyway: lower per-message overhead and direct dictionary support.
83
+ // Backed by lz4_flex's `FrameEncoder::with_dictionary` /
84
+ // `FrameDecoder::with_dictionary` (added in our fork). Output is a real
85
+ // LZ4 frame with the FLG.DictID bit set and `Dict_ID` written into the
86
+ // FrameDescriptor — interoperable with the reference `lz4` CLI given the
87
+ // same dictionary file.
87
88
  //
88
- // Output is a raw LZ4 block with the original (uncompressed) size prepended
89
- // as a little-endian u32, matching lz4_flex's `*_size_prepended` API.
89
+ // `Dict_ID` is supplied by the caller (the Ruby wrapper in `lib/rlz4.rb`
90
+ // derives it from `sha256(dict_bytes)[0..4]` interpreted little-endian).
91
+ // Doing the digest in Ruby keeps a hash crate out of the Rust extension's
92
+ // dependency tree.
90
93
  #[magnus::wrap(class = "RLZ4::Dictionary", free_immediately, size)]
91
94
  struct Dictionary {
92
95
  bytes: Vec<u8>,
96
+ id: u32,
93
97
  }
94
98
 
95
- // Safety: Dictionary is read-only after construction (just a byte buffer).
96
- // No interior mutability, no references to thread-local data.
99
+ // Safety: Dictionary is read-only after construction (just a byte buffer
100
+ // plus a derived id). No interior mutability, no thread-local refs.
97
101
  unsafe impl Send for Dictionary {}
98
102
  unsafe impl Sync for Dictionary {}
99
103
 
100
- fn dict_initialize(_ruby: &Ruby, rb_dict: RString) -> Result<Dictionary, Error> {
104
+ fn dict_initialize(_ruby: &Ruby, rb_dict: RString, id: u32) -> Result<Dictionary, Error> {
101
105
  // SAFETY: copy bytes into an owned Vec before any Ruby allocation.
102
106
  let bytes: Vec<u8> = unsafe { rb_dict.as_slice().to_vec() };
103
107
  rb_dict.freeze();
104
- Ok(Dictionary { bytes })
108
+ Ok(Dictionary { bytes, id })
105
109
  }
106
110
 
107
111
  fn dict_compress(ruby: &Ruby, rb_self: &Dictionary, rb_input: RString) -> Result<RString, Error> {
108
112
  let input: Vec<u8> = unsafe { rb_input.as_slice().to_vec() };
109
- let compressed = lz4_flex::block::compress_prepend_size_with_dict(&input, &rb_self.bytes);
113
+ let upper = lz4_flex::block::get_maximum_output_size(input.len()) + 64;
114
+ let mut encoder = lz4_flex::frame::FrameEncoder::with_dictionary(
115
+ Vec::with_capacity(upper),
116
+ &rb_self.bytes,
117
+ rb_self.id,
118
+ );
119
+ encoder.write_all(&input).map_err(|e| {
120
+ Error::new(
121
+ ruby.exception_runtime_error(),
122
+ format!("lz4 dict frame encode write failed: {e}"),
123
+ )
124
+ })?;
125
+ let compressed = encoder.finish().map_err(|e| {
126
+ Error::new(
127
+ ruby.exception_runtime_error(),
128
+ format!("lz4 dict frame encode finish failed: {e}"),
129
+ )
130
+ })?;
110
131
  Ok(ruby.str_from_slice(&compressed))
111
132
  }
112
133
 
@@ -116,13 +137,25 @@ fn dict_decompress(
116
137
  rb_input: RString,
117
138
  ) -> Result<RString, Error> {
118
139
  let compressed: Vec<u8> = unsafe { rb_input.as_slice().to_vec() };
119
- let out = lz4_flex::block::decompress_size_prepended_with_dict(&compressed, &rb_self.bytes)
120
- .map_err(|e| {
121
- Error::new(
122
- decompress_error(ruby),
123
- format!("lz4 block decode failed: {e}"),
124
- )
125
- })?;
140
+ if compressed.len() < LZ4_FRAME_MAGIC.len() || compressed[..4] != LZ4_FRAME_MAGIC {
141
+ return Err(Error::new(
142
+ decompress_error(ruby),
143
+ "lz4 dict frame decode failed: bad magic (input is not an LZ4 frame)",
144
+ ));
145
+ }
146
+
147
+ let mut decoder = lz4_flex::frame::FrameDecoder::with_dictionary(
148
+ &compressed[..],
149
+ &rb_self.bytes,
150
+ rb_self.id,
151
+ );
152
+ let mut out = Vec::new();
153
+ decoder.read_to_end(&mut out).map_err(|e| {
154
+ Error::new(
155
+ decompress_error(ruby),
156
+ format!("lz4 dict frame decode failed: {e}"),
157
+ )
158
+ })?;
126
159
  Ok(ruby.str_from_slice(&out))
127
160
  }
128
161
 
@@ -130,6 +163,10 @@ fn dict_size(rb_self: &Dictionary) -> usize {
130
163
  rb_self.bytes.len()
131
164
  }
132
165
 
166
+ fn dict_id(rb_self: &Dictionary) -> u32 {
167
+ rb_self.id
168
+ }
169
+
133
170
  // ---------- module init ----------
134
171
 
135
172
  #[magnus::init]
@@ -152,10 +189,13 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
152
189
  module.define_module_function("decompress", function!(rlz4_decompress, 1))?;
153
190
 
154
191
  let dict_class = module.define_class("Dictionary", ruby.class_object())?;
155
- dict_class.define_singleton_method("new", function!(dict_initialize, 1))?;
192
+ // Bound as `_native_new(bytes, id)`. Ruby's `RLZ4::Dictionary.new(bytes)`
193
+ // computes the id and forwards — see `lib/rlz4.rb`.
194
+ dict_class.define_singleton_method("_native_new", function!(dict_initialize, 2))?;
156
195
  dict_class.define_method("compress", method!(dict_compress, 1))?;
157
196
  dict_class.define_method("decompress", method!(dict_decompress, 1))?;
158
197
  dict_class.define_method("size", method!(dict_size, 0))?;
198
+ dict_class.define_method("id", method!(dict_id, 0))?;
159
199
 
160
200
  Ok(())
161
201
  }
@@ -202,25 +242,36 @@ mod tests {
202
242
  }
203
243
 
204
244
  #[test]
205
- fn block_dict_round_trip() {
206
- let dict = b"JSON schema version 1 field ";
207
- let msg = b"JSON schema version 1 field name=hello value=world";
208
- let ct = lz4_flex::block::compress_prepend_size_with_dict(msg, dict);
209
- let pt = lz4_flex::block::decompress_size_prepended_with_dict(&ct, dict).unwrap();
245
+ fn frame_dict_round_trip() {
246
+ let dict = b"JSON schema version 1 field ".repeat(4);
247
+ let id: u32 = 0xDEAD_BEEF;
248
+ let msg = b"JSON schema version 1 field name=hello value=world".to_vec();
249
+
250
+ let mut enc = lz4_flex::frame::FrameEncoder::with_dictionary(Vec::new(), &dict, id);
251
+ enc.write_all(&msg).unwrap();
252
+ let ct = enc.finish().unwrap();
253
+ assert_eq!(&ct[..4], &[0x04, 0x22, 0x4d, 0x18]);
254
+
255
+ let mut dec = lz4_flex::frame::FrameDecoder::with_dictionary(&*ct, &dict, id);
256
+ let mut pt = Vec::new();
257
+ dec.read_to_end(&mut pt).unwrap();
210
258
  assert_eq!(pt, msg);
211
259
  }
212
260
 
213
261
  #[test]
214
- fn block_dict_mismatch_fails_or_returns_wrong_data() {
215
- // With a wrong dict, decode either errors out or returns wrong bytes.
216
- // Either way it must not silently round-trip to the original.
217
- let dict_a = b"common prefix AAA ";
218
- let dict_b = b"common prefix BBB ";
262
+ fn frame_dict_id_mismatch_fails() {
263
+ let dict_a = b"common prefix AAA ".repeat(4);
264
+ let dict_b = b"common prefix BBB ".repeat(4);
265
+
219
266
  let msg = b"common prefix AAA : the payload";
220
- let ct = lz4_flex::block::compress_prepend_size_with_dict(msg, dict_a);
221
- match lz4_flex::block::decompress_size_prepended_with_dict(&ct, dict_b) {
222
- Ok(out) => assert_ne!(out, msg),
223
- Err(_) => {}
224
- }
267
+ let mut enc =
268
+ lz4_flex::frame::FrameEncoder::with_dictionary(Vec::new(), &dict_a, 0xAAAA_AAAA);
269
+ enc.write_all(msg).unwrap();
270
+ let ct = enc.finish().unwrap();
271
+
272
+ let mut dec =
273
+ lz4_flex::frame::FrameDecoder::with_dictionary(&*ct, &dict_b, 0xBBBB_BBBB);
274
+ let mut out = Vec::new();
275
+ assert!(dec.read_to_end(&mut out).is_err());
225
276
  }
226
277
  }
data/lib/rlz4/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RLZ4
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.1"
5
5
  end
data/lib/rlz4.rb CHANGED
@@ -1,4 +1,20 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "digest"
4
+
3
5
  require_relative "rlz4/rlz4"
4
6
  require_relative "rlz4/version"
7
+
8
+ module RLZ4
9
+ class Dictionary
10
+ # Public constructor. Derives the LZ4 frame `Dict_ID` from the dictionary
11
+ # bytes (sha256 truncated to the first 4 bytes, little-endian) and forwards
12
+ # to the Rust extension. The id is what gets written into every emitted
13
+ # frame's FrameDescriptor and what `#decompress` asserts the incoming
14
+ # frame declares before decoding.
15
+ def self.new(bytes)
16
+ id = Digest::SHA256.digest(bytes).byteslice(0, 4).unpack1("V")
17
+ _native_new(bytes, id)
18
+ end
19
+ end
20
+ end
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "rlz4"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  edition = "2021"
5
5
 
6
6
  [lib]
@@ -8,7 +8,7 @@ name = "rlz4"
8
8
  crate-type = ["cdylib", "rlib"]
9
9
 
10
10
  [dependencies]
11
- lz4_flex = { version = "0.13", default-features = false, features = ["frame", "std", "safe-encode", "safe-decode"] }
11
+ lz4_flex = { git = "https://github.com/paddor/lz4_flex.git", rev = "dae9c784e890591e6445135ba23cacf344eafe8f", default-features = false, features = ["frame", "std", "safe-encode", "safe-decode"] }
12
12
  magnus = "0.8"
13
13
  rb-sys = "0.9"
14
14
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rlz4
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Patrik Wenger