vibe_zstd 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. checksums.yaml +7 -0
  2. data/.standard.yml +3 -0
  3. data/CHANGELOG.md +22 -0
  4. data/LICENSE.txt +21 -0
  5. data/README.md +978 -0
  6. data/Rakefile +20 -0
  7. data/benchmark/README.md +198 -0
  8. data/benchmark/compression_levels.rb +99 -0
  9. data/benchmark/context_reuse.rb +174 -0
  10. data/benchmark/decompression_speed_by_level.rb +65 -0
  11. data/benchmark/dictionary_training.rb +182 -0
  12. data/benchmark/dictionary_usage.rb +121 -0
  13. data/benchmark/for_readme.rb +157 -0
  14. data/benchmark/generate_fixture.rb +82 -0
  15. data/benchmark/helpers.rb +237 -0
  16. data/benchmark/multithreading.rb +105 -0
  17. data/benchmark/run_all.rb +150 -0
  18. data/benchmark/streaming.rb +154 -0
  19. data/ext/vibe_zstd/Makefile +270 -0
  20. data/ext/vibe_zstd/cctx.c +565 -0
  21. data/ext/vibe_zstd/dctx.c +493 -0
  22. data/ext/vibe_zstd/dict.c +587 -0
  23. data/ext/vibe_zstd/extconf.rb +52 -0
  24. data/ext/vibe_zstd/frames.c +132 -0
  25. data/ext/vibe_zstd/libzstd/LICENSE +30 -0
  26. data/ext/vibe_zstd/libzstd/common/allocations.h +55 -0
  27. data/ext/vibe_zstd/libzstd/common/bits.h +205 -0
  28. data/ext/vibe_zstd/libzstd/common/bitstream.h +454 -0
  29. data/ext/vibe_zstd/libzstd/common/compiler.h +464 -0
  30. data/ext/vibe_zstd/libzstd/common/cpu.h +249 -0
  31. data/ext/vibe_zstd/libzstd/common/debug.c +30 -0
  32. data/ext/vibe_zstd/libzstd/common/debug.h +107 -0
  33. data/ext/vibe_zstd/libzstd/common/entropy_common.c +340 -0
  34. data/ext/vibe_zstd/libzstd/common/error_private.c +64 -0
  35. data/ext/vibe_zstd/libzstd/common/error_private.h +158 -0
  36. data/ext/vibe_zstd/libzstd/common/fse.h +625 -0
  37. data/ext/vibe_zstd/libzstd/common/fse_decompress.c +315 -0
  38. data/ext/vibe_zstd/libzstd/common/huf.h +277 -0
  39. data/ext/vibe_zstd/libzstd/common/mem.h +422 -0
  40. data/ext/vibe_zstd/libzstd/common/pool.c +371 -0
  41. data/ext/vibe_zstd/libzstd/common/pool.h +81 -0
  42. data/ext/vibe_zstd/libzstd/common/portability_macros.h +171 -0
  43. data/ext/vibe_zstd/libzstd/common/threading.c +182 -0
  44. data/ext/vibe_zstd/libzstd/common/threading.h +142 -0
  45. data/ext/vibe_zstd/libzstd/common/xxhash.c +18 -0
  46. data/ext/vibe_zstd/libzstd/common/xxhash.h +7094 -0
  47. data/ext/vibe_zstd/libzstd/common/zstd_common.c +48 -0
  48. data/ext/vibe_zstd/libzstd/common/zstd_deps.h +123 -0
  49. data/ext/vibe_zstd/libzstd/common/zstd_internal.h +324 -0
  50. data/ext/vibe_zstd/libzstd/common/zstd_trace.h +156 -0
  51. data/ext/vibe_zstd/libzstd/compress/clevels.h +134 -0
  52. data/ext/vibe_zstd/libzstd/compress/fse_compress.c +625 -0
  53. data/ext/vibe_zstd/libzstd/compress/hist.c +191 -0
  54. data/ext/vibe_zstd/libzstd/compress/hist.h +82 -0
  55. data/ext/vibe_zstd/libzstd/compress/huf_compress.c +1464 -0
  56. data/ext/vibe_zstd/libzstd/compress/zstd_compress.c +7843 -0
  57. data/ext/vibe_zstd/libzstd/compress/zstd_compress_internal.h +1636 -0
  58. data/ext/vibe_zstd/libzstd/compress/zstd_compress_literals.c +235 -0
  59. data/ext/vibe_zstd/libzstd/compress/zstd_compress_literals.h +39 -0
  60. data/ext/vibe_zstd/libzstd/compress/zstd_compress_sequences.c +442 -0
  61. data/ext/vibe_zstd/libzstd/compress/zstd_compress_sequences.h +55 -0
  62. data/ext/vibe_zstd/libzstd/compress/zstd_compress_superblock.c +688 -0
  63. data/ext/vibe_zstd/libzstd/compress/zstd_compress_superblock.h +32 -0
  64. data/ext/vibe_zstd/libzstd/compress/zstd_cwksp.h +765 -0
  65. data/ext/vibe_zstd/libzstd/compress/zstd_double_fast.c +778 -0
  66. data/ext/vibe_zstd/libzstd/compress/zstd_double_fast.h +42 -0
  67. data/ext/vibe_zstd/libzstd/compress/zstd_fast.c +985 -0
  68. data/ext/vibe_zstd/libzstd/compress/zstd_fast.h +30 -0
  69. data/ext/vibe_zstd/libzstd/compress/zstd_lazy.c +2199 -0
  70. data/ext/vibe_zstd/libzstd/compress/zstd_lazy.h +193 -0
  71. data/ext/vibe_zstd/libzstd/compress/zstd_ldm.c +745 -0
  72. data/ext/vibe_zstd/libzstd/compress/zstd_ldm.h +109 -0
  73. data/ext/vibe_zstd/libzstd/compress/zstd_ldm_geartab.h +106 -0
  74. data/ext/vibe_zstd/libzstd/compress/zstd_opt.c +1580 -0
  75. data/ext/vibe_zstd/libzstd/compress/zstd_opt.h +72 -0
  76. data/ext/vibe_zstd/libzstd/compress/zstd_preSplit.c +238 -0
  77. data/ext/vibe_zstd/libzstd/compress/zstd_preSplit.h +33 -0
  78. data/ext/vibe_zstd/libzstd/compress/zstdmt_compress.c +1923 -0
  79. data/ext/vibe_zstd/libzstd/compress/zstdmt_compress.h +102 -0
  80. data/ext/vibe_zstd/libzstd/decompress/huf_decompress.c +1944 -0
  81. data/ext/vibe_zstd/libzstd/decompress/huf_decompress_amd64.S +602 -0
  82. data/ext/vibe_zstd/libzstd/decompress/zstd_ddict.c +244 -0
  83. data/ext/vibe_zstd/libzstd/decompress/zstd_ddict.h +44 -0
  84. data/ext/vibe_zstd/libzstd/decompress/zstd_decompress.c +2410 -0
  85. data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_block.c +2209 -0
  86. data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_block.h +73 -0
  87. data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_internal.h +240 -0
  88. data/ext/vibe_zstd/libzstd/deprecated/zbuff.h +214 -0
  89. data/ext/vibe_zstd/libzstd/deprecated/zbuff_common.c +26 -0
  90. data/ext/vibe_zstd/libzstd/deprecated/zbuff_compress.c +167 -0
  91. data/ext/vibe_zstd/libzstd/deprecated/zbuff_decompress.c +77 -0
  92. data/ext/vibe_zstd/libzstd/dictBuilder/cover.c +1302 -0
  93. data/ext/vibe_zstd/libzstd/dictBuilder/cover.h +152 -0
  94. data/ext/vibe_zstd/libzstd/dictBuilder/divsufsort.c +1913 -0
  95. data/ext/vibe_zstd/libzstd/dictBuilder/divsufsort.h +57 -0
  96. data/ext/vibe_zstd/libzstd/dictBuilder/fastcover.c +766 -0
  97. data/ext/vibe_zstd/libzstd/dictBuilder/zdict.c +1133 -0
  98. data/ext/vibe_zstd/libzstd/zdict.h +481 -0
  99. data/ext/vibe_zstd/libzstd/zstd.h +3198 -0
  100. data/ext/vibe_zstd/libzstd/zstd_errors.h +107 -0
  101. data/ext/vibe_zstd/streaming.c +410 -0
  102. data/ext/vibe_zstd/vibe_zstd.c +293 -0
  103. data/ext/vibe_zstd/vibe_zstd.h +56 -0
  104. data/ext/vibe_zstd/vibe_zstd_internal.h +27 -0
  105. data/lib/vibe_zstd/constants.rb +67 -0
  106. data/lib/vibe_zstd/version.rb +5 -0
  107. data/lib/vibe_zstd.rb +255 -0
  108. data/sig/vibe_zstd.rbs +76 -0
  109. metadata +179 -0
data/README.md ADDED
@@ -0,0 +1,978 @@
1
+ # VibeZstd
2
+
3
+ Fast, high-ratio compression for Ruby using the Zstandard (Zstd) library. VibeZstd provides a native Ruby C extension with an idiomatic API for compressing and decompressing data.
4
+
5
+ ## Quick Start
6
+
7
+ ```ruby
8
+ require 'vibe_zstd'
9
+
10
+ # One-line compression/decompression
11
+ compressed = VibeZstd.compress("Hello, world!")
12
+ original = VibeZstd.decompress(compressed)
13
+
14
+ # With custom compression level (1-22, or negative for ultra-fast)
15
+ compressed = VibeZstd.compress(data, level: 9)
16
+
17
+ # Reusable contexts (recommended for multiple operations - 2.2x faster!)
18
+ cctx = VibeZstd::CCtx.new
19
+ dctx = VibeZstd::DCtx.new
20
+
21
+ # Reuse the same contexts for multiple operations
22
+ files.each do |file_data|
23
+ compressed = cctx.compress(file_data)
24
+ decompressed = dctx.decompress(compressed)
25
+ # ... process data
26
+ end
27
+ ```
28
+
29
+ ## Installation
30
+
31
+ Add to your Gemfile:
32
+
33
+ ```ruby
34
+ gem 'vibe_zstd'
35
+ ```
36
+
37
+ Then run:
38
+
39
+ ```bash
40
+ bundle install
41
+ ```
42
+
43
+ Or install directly:
44
+
45
+ ```bash
46
+ gem install vibe_zstd
47
+ ```
48
+
49
+ ## Performance & Best Practices
50
+
51
+ VibeZstd leverages Zstandard's excellent compression performance:
52
+
53
+ - **Compression ratios** comparable to or better than gzip/bzip2 at similar speeds
54
+ - **Extremely fast decompression**
55
+ - **Compression levels** from -7 (ultra-fast) to 22 (maximum compression)
56
+
57
+ ### Context Reuse (Important!)
58
+
59
+ **Always reuse contexts for multiple operations** - it's up to 2.2x faster, with the largest gains on small payloads:
60
+
61
+ | Data Size | New Context | Reused Context | Speedup |
62
+ |-----------|-------------|----------------|---------|
63
+ | 1KB | 72,610 ops/s | 159,454 ops/s | **2.2x** |
64
+ | 10KB | 34,941 ops/s | 61,171 ops/s | **1.75x** |
65
+ | 100KB | 7,675 ops/s | 9,491 ops/s | **1.24x** |
66
+
67
+ ```ruby
68
+ # ❌ Don't do this (creates new context each time)
69
+ 1000.times do
70
+ compressed = VibeZstd.compress(data)
71
+ end
72
+
73
+ # ✅ Do this instead (reuse context)
74
+ cctx = VibeZstd::CCtx.new
75
+ 1000.times do
76
+ compressed = cctx.compress(data)
77
+ end
78
+ ```
79
+
80
+ **Memory savings:** Reusing contexts avoids ~6.7GB of cumulative context allocations over 5000 operations:
81
+ - Creating new contexts: 5000 × 1.34MB = 6.70GB
82
+ - Reusing contexts: 1 × 1.34MB = 1.34MB
83
+ - **Savings: 6.69GB (99.98% reduction)**
84
+
85
+ *Note: 1.34MB = CCtx memory (~1.24MB at level 3) + DCtx memory (~128KB)*
86
+
87
+ ### Compression Level Trade-offs
88
+
89
+ Choose the right level for your use case:
90
+
91
+ | Level | Ratio | Speed (ops/sec) | Memory | Use Case |
92
+ |-------|-------|-----------------|--------|----------|
93
+ | -1 | 6.03x | 11,507 | 537KB | Ultra-fast, real-time |
94
+ | 1 | 8.2x | 10,752 | 569KB | Fast, high-throughput |
95
+ | 3 | 7.93x | 9,191 | 1.24MB | **Balanced (default)** |
96
+ | 9 | 9.17x | 987 | 12.49MB | Better compression |
97
+ | 19 | 10.3x | 35 | 81.25MB | Maximum compression |
98
+
99
+ ```ruby
100
+ cctx = VibeZstd::CCtx.new
101
+
102
+ # Ultra-fast for real-time processing
103
+ compressed = cctx.compress(data, level: -1)
104
+
105
+ # Maximum compression for archival
106
+ compressed = cctx.compress(data, level: 19)
107
+ ```
108
+
109
+ ### Dictionary Compression
110
+
111
+ For small, similar data (JSON, logs, API responses), dictionaries provide dramatic improvements:
112
+
113
+ | Method | Compressed Size | Ratio | Improvement |
114
+ |--------|----------------|-------|-------------|
115
+ | Without dictionary | 110B | 1.15x | - |
116
+ | With dictionary (16KB) | 54B | 2.33x | **50.9% smaller** |
117
+
118
+ See the [Dictionaries](#dictionaries) section below for usage examples.
119
+
120
+ ## Basic Usage
121
+
122
+ ### Simple Compression
123
+
124
+ ```ruby
125
+ require 'vibe_zstd'
126
+
127
+ data = "Hello, world! This is a test string."
128
+
129
+ # One-off compression (creates context internally)
130
+ compressed = VibeZstd.compress(data)
131
+ decompressed = VibeZstd.decompress(compressed)
132
+
133
+ # With custom level
134
+ compressed = VibeZstd.compress(data, level: 5)
135
+ ```
136
+
137
+ ### Using Contexts (Recommended)
138
+
139
+ For multiple operations, create reusable contexts:
140
+
141
+ ```ruby
142
+ # Create contexts once
143
+ cctx = VibeZstd::CCtx.new
144
+ dctx = VibeZstd::DCtx.new
145
+
146
+ # Reuse for multiple operations
147
+ files.each do |file|
148
+ data = File.read(file)
149
+ compressed = cctx.compress(data)
150
+ File.binwrite("#{file}.zst", compressed)
151
+ end
152
+ ```
153
+
154
+ ### Compression Levels
155
+
156
+ ```ruby
157
+ cctx = VibeZstd::CCtx.new
158
+
159
+ # Fast compression (level 1)
160
+ compressed = cctx.compress(data, level: 1)
161
+
162
+ # Default level (3)
163
+ compressed = cctx.compress(data)
164
+
165
+ # High compression (level 9)
166
+ compressed = cctx.compress(data, level: 9)
167
+
168
+ # Negative levels for ultra-fast compression
169
+ compressed = cctx.compress(data, level: -1)
170
+ ```
171
+
172
+ ### Frame Information
173
+
174
+ ```ruby
175
+ # Check decompressed size before decompressing
176
+ size = VibeZstd.frame_content_size(compressed_data)
177
+ puts "Will decompress to #{size} bytes" if size
178
+
179
+ # Get compression bound (maximum compressed size)
180
+ max_size = VibeZstd.compress_bound(data.bytesize)
181
+ ```
182
+
183
+ ## Advanced Features
184
+
185
+ ### Dictionaries
186
+
187
+ Dictionaries dramatically improve compression for small, similar data by pre-training on representative samples.
188
+
189
+ #### Training a Dictionary
190
+
191
+ ```ruby
192
+ # Collect representative samples
193
+ samples = [
194
+ {id: 1, name: "Alice", email: "alice@example.com"}.to_json,
195
+ {id: 2, name: "Bob", email: "bob@example.com"}.to_json,
196
+ {id: 3, name: "Charlie", email: "charlie@example.com"}.to_json
197
+ # ... more samples
198
+ ]
199
+
200
+ # Train dictionary with default algorithm (fast, good for most use cases)
201
+ dict_data = VibeZstd.train_dict(samples)
202
+
203
+ # Or specify custom size
204
+ dict_data = VibeZstd.train_dict(samples, max_dict_size: 16_384)
205
+
206
+ # Advanced: Use COVER algorithm for better dictionaries
207
+ # k: segment size (typical: 16-2048+)
208
+ # d: dmer size (typical: 6-16, must be ≤ k)
209
+ dict_data = VibeZstd.train_dict_cover(
210
+ samples,
211
+ max_dict_size: 16_384,
212
+ k: 200, # Segment size
213
+ d: 6 # Dmer size
214
+ )
215
+
216
+ # Advanced: Fast COVER for quick training
217
+ dict_data = VibeZstd.train_dict_fast_cover(
218
+ samples,
219
+ max_dict_size: 16_384,
220
+ k: 200,
221
+ d: 6,
222
+ accel: 1 # Higher = faster but less accurate (1-10)
223
+ )
224
+ ```
225
+
226
+ #### Using Dictionaries
227
+
228
+ ```ruby
229
+ # Create dictionary objects
230
+ dict_data = File.binread('my.dict')
231
+ cdict = VibeZstd::CDict.new(dict_data)
232
+ ddict = VibeZstd::DDict.new(dict_data)
233
+
234
+ # Use with contexts
235
+ cctx = VibeZstd::CCtx.new
236
+ dctx = VibeZstd::DCtx.new
237
+
238
+ compressed = cctx.compress(data, dict: cdict)
239
+ decompressed = dctx.decompress(compressed, dict: ddict)
240
+
241
+ # Or use convenience methods
242
+ compressed = VibeZstd.compress(data, dict: cdict)
243
+ decompressed = VibeZstd.decompress(compressed, dict: ddict)
244
+
245
+ # Check dictionary properties
246
+ puts "Dictionary size: #{cdict.size} bytes"
247
+ puts "Dictionary ID: #{cdict.dict_id}"
248
+
249
+ # Get dictionary ID from compressed data
250
+ dict_id = VibeZstd.get_dict_id_from_frame(compressed)
251
+ ```
252
+
253
+ #### Prefix Dictionaries (Lightweight Alternative)
254
+
255
+ For cases where training isn't practical:
256
+
257
+ ```ruby
258
+ cctx = VibeZstd::CCtx.new
259
+ dctx = VibeZstd::DCtx.new
260
+
261
+ # Use a common prefix (same for compression and decompression)
262
+ prefix = "Common data prefix that appears frequently"
263
+
264
+ compressed = cctx.use_prefix(prefix).compress(data)
265
+ decompressed = dctx.use_prefix(prefix).decompress(compressed)
266
+ ```
267
+
268
+ ### Streaming API
269
+
270
+ Process large files with constant memory usage.
271
+
272
+ #### Streaming Compression
273
+
274
+ ```ruby
275
+ # Compress to file
276
+ File.open('output.zst', 'wb') do |file|
277
+ writer = VibeZstd::CompressWriter.new(file, level: 5)
278
+
279
+ writer.write("chunk 1")
280
+ writer.write("chunk 2")
281
+ writer.flush # Optional: force output
282
+ writer.finish # Required: finalize frame
283
+ end
284
+
285
+ # Or use block form (auto-finishes)
286
+ VibeZstd::CompressWriter.open(file, level: 5) do |writer|
287
+ large_data.each_slice(65536) do |chunk|
288
+ writer.write(chunk)
289
+ end
290
+ end
291
+
292
+ # With dictionary
293
+ cdict = VibeZstd::CDict.new(dict_data)
294
+ writer = VibeZstd::CompressWriter.new(file, level: 5, dict: cdict)
295
+ ```
296
+
297
+ #### Streaming Decompression
298
+
299
+ ```ruby
300
+ # Decompress file in chunks (memory-safe for large files)
301
+ File.open('input.zst', 'rb') do |file|
302
+ reader = VibeZstd::DecompressReader.new(file)
303
+
304
+ # Read in ~128KB chunks by default
305
+ while chunk = reader.read
306
+ process(chunk)
307
+ end
308
+ end
309
+
310
+ # Custom chunk sizes
311
+ reader = VibeZstd::DecompressReader.new(file, initial_chunk_size: 1_048_576)
312
+ while chunk = reader.read # Returns up to 1MB per call
313
+ process(chunk)
314
+ end
315
+
316
+ # Or specify per-read
317
+ while chunk = reader.read(65536) # Read 64KB chunks
318
+ process(chunk)
319
+ end
320
+
321
+ # Block form
322
+ VibeZstd::DecompressReader.open(file) do |reader|
323
+ reader.each do |chunk|
324
+ process(chunk)
325
+ end
326
+ end
327
+
328
+ # HTTP streaming example
329
+ require 'net/http'
330
+ uri = URI('https://example.com/large_file.zst')
331
+ File.open('output.txt', 'wb') do |output|
332
+ Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
333
+ http.request_get(uri.path) do |response|
334
+ reader = VibeZstd::DecompressReader.new(response.body)
335
+ reader.each { |chunk| output.write(chunk) }
336
+ end
337
+ end
338
+ end
339
+ ```
340
+
341
+ **Chunk size recommendations:**
342
+ - **Small data (< 10KB)**: `initial_chunk_size: 4096` to avoid over-allocation
343
+ - **Medium data (10KB-1MB)**: Default (~128KB) for balanced performance
344
+ - **Large data (> 1MB)**: `initial_chunk_size: 1_048_576` to reduce overhead
345
+ - **Memory-constrained**: Use smaller chunks (4-8KB)
346
+ - **High throughput**: Use larger chunks (1-10MB)
347
+
348
+ #### Line-by-Line Processing
349
+
350
+ `DecompressReader` provides IO-like methods for processing compressed text files line by line:
351
+
352
+ ```ruby
353
+ # Process compressed log file line by line
354
+ File.open('app.log.zst', 'rb') do |file|
355
+ reader = VibeZstd::DecompressReader.new(file)
356
+
357
+ # Read lines one at a time
358
+ while line = reader.gets
359
+ # Process each log entry
360
+ if line.include?('ERROR')
361
+ alert_on_call(line)
362
+ end
363
+ end
364
+ end
365
+
366
+ # Or use each_line for cleaner iteration
367
+ VibeZstd::DecompressReader.open(file) do |reader|
368
+ reader.each_line do |line|
369
+ process_log_entry(line)
370
+ end
371
+ end
372
+
373
+ # Read specific number of bytes
374
+ reader.readpartial(4096) # Raises EOFError at end of stream
375
+
376
+ # Check for end of stream
377
+ reader.eof? # => true/false
378
+ ```
379
+
380
+ **Use cases:**
381
+ - **Log processing** - Parse compressed log files without decompressing the entire file
382
+ - **CSV/TSV parsing** - Read compressed data files line by line for memory-efficient ETL
383
+ - **Configuration files** - Load compressed config files with minimal memory footprint
384
+
385
+ ### Multi-threaded Compression
386
+
387
+ Enable parallel compression for large data:
388
+
389
+ ```ruby
390
+ cctx = VibeZstd::CCtx.new
391
+
392
+ # Enable 4 worker threads
393
+ cctx.workers = 4
394
+
395
+ # Or set during initialization
396
+ cctx = VibeZstd::CCtx.new(workers: 4)
397
+
398
+ large_data = File.read('big_file.txt')
399
+ compressed = cctx.compress(large_data)
400
+ ```
401
+
402
+ **Multi-threading performance** (500KB data):
403
+
404
+ | Workers | Throughput | Speedup | Efficiency |
405
+ |---------|------------|---------|------------|
406
+ | 0 (single) | 795MB/s | 1.0x | 100% |
407
+ | 2 | 784MB/s | 0.99x | 49% |
408
+ | 4 | 748MB/s | 0.94x | 24% |
409
+
410
+ **Note:** Multi-threading works best for data > 1MB. For smaller payloads, such as the 500KB benchmark above, threading overhead can outweigh the benefits.
411
+
412
+ #### Multi-threading Tuning
413
+
414
+ ```ruby
415
+ cctx = VibeZstd::CCtx.new(workers: 4)
416
+
417
+ # Tune job size (default: auto)
418
+ # Larger = better ratio but higher latency
419
+ cctx.job_size = 1_048_576 # 1MB per job
420
+
421
+ # Tune overlap (0-9)
422
+ # Higher = better ratio but slower
423
+ cctx.overlap_log = 6 # Default: auto (usually 6-9)
424
+ ```
425
+
426
+ ### Compression Parameters
427
+
428
+ Fine-tune compression behavior using property setters:
429
+
430
+ ```ruby
431
+ # Set during initialization (recommended)
432
+ cctx = VibeZstd::CCtx.new(
433
+ checksum_flag: 1, # Add checksum for integrity
434
+ content_size_flag: 1, # Include size in frame header
435
+ window_log: 20, # 1MB window (2^20)
436
+ workers: 4 # 4 threads
437
+ )
438
+
439
+ # Or set after creation
440
+ cctx = VibeZstd::CCtx.new
441
+ cctx.checksum_flag = 1
442
+ cctx.content_size_flag = 1
443
+ cctx.workers = 4
444
+
445
+ data = "Your data here"
446
+ compressed = cctx.compress(data)
447
+ ```
448
+
449
+ #### Common Parameters
450
+
451
+ **Frame parameters:**
452
+ ```ruby
453
+ cctx.checksum_flag = 1 # Enable 32-bit checksum
454
+ cctx.content_size_flag = 1 # Include decompressed size
455
+ cctx.dict_id_flag = 1 # Include dictionary ID
456
+ ```
457
+
458
+ **Compression tuning:**
459
+ ```ruby
460
+ cctx.compression_level = 9 # Same as level: argument
461
+ cctx.window_log = 20 # Window size (2^20 = 1MB)
462
+ ```
463
+
464
+ **Long Distance Matching (for large files with repeated patterns):**
465
+ ```ruby
466
+ cctx.enable_long_distance_matching = 1
467
+ cctx.ldm_hash_log = 20
468
+ cctx.ldm_min_match = 64
469
+ ```
470
+
471
+ **Multi-threading:**
472
+ ```ruby
473
+ cctx.workers = 4 # Number of threads
474
+ cctx.job_size = 1_048_576 # Size per job
475
+ cctx.overlap_log = 6 # Overlap between jobs
476
+ ```
477
+
478
+ #### Query Parameter Bounds
479
+
480
+ ```ruby
481
+ # Get valid range for a parameter
482
+ bounds = VibeZstd::CCtx.parameter_bounds(:compression_level)
483
+ puts "Level range: #{bounds[:min]} to #{bounds[:max]}"
484
+ # => Level range: -131072 to 22
485
+ # Note: Practical range is -7 to 22; -131072 is a technical limit, not a usable level
486
+
487
+ # Validate before setting
488
+ level = user_input.to_i
489
+ bounds = VibeZstd::CCtx.parameter_bounds(:compression_level)
490
+ if level >= bounds[:min] && level <= bounds[:max]
491
+ cctx.compression_level = level
492
+ else
493
+ raise "Invalid level"
494
+ end
495
+ ```
496
+
497
+ #### Get Current Parameter Values
498
+
499
+ ```ruby
500
+ cctx = VibeZstd::CCtx.new(compression_level: 9)
501
+
502
+ # Read current values
503
+ puts cctx.compression_level # => 9
504
+ puts cctx.checksum_flag # => 0
505
+ ```
506
+
507
+ ### Decompression Parameters
508
+
509
+ Control decompression behavior to prevent memory exhaustion:
510
+
511
+ ```ruby
512
+ dctx = VibeZstd::DCtx.new
513
+
514
+ # Limit maximum window size (prevents memory attacks)
515
+ dctx.window_log_max = 20 # Max 1MB window (2^20)
516
+
517
+ # Or set during initialization
518
+ dctx = VibeZstd::DCtx.new(window_log_max: 20)
519
+
520
+ compressed = File.binread('data.zst')
521
+ decompressed = dctx.decompress(compressed)
522
+ ```
523
+
524
+ #### Optimize for Unknown-Size Frames
525
+
526
+ When decompressing frames without known content size:
527
+
528
+ ```ruby
529
+ # Set globally for all new DCtx instances
530
+ VibeZstd::DCtx.default_initial_capacity = 1_048_576 # 1MB for large data
531
+
532
+ # Set per instance
533
+ dctx = VibeZstd::DCtx.new(initial_capacity: 512_000)
534
+
535
+ # Or per call (overrides instance setting)
536
+ dctx.decompress(compressed, initial_capacity: 16_384)
537
+
538
+ # Reset to default (~128KB)
539
+ VibeZstd::DCtx.default_initial_capacity = nil
540
+ ```
541
+
542
+ **When to configure:**
543
+ - **Small data (< 10KB)**: Set to between `4096` and `8192` bytes
544
+ - **Large data (> 1MB)**: Set to `1_048_576` or higher
545
+ - **Known-size frames**: Not applicable (size read from frame header)
546
+
547
+ ### Memory Estimation
548
+
549
+ Estimate memory usage before creating contexts:
550
+
551
+ ```ruby
552
+ # Compression context memory at level 5
553
+ cctx_bytes = VibeZstd::CCtx.estimate_memory(5)
554
+ puts "CCtx will use ~#{cctx_bytes} bytes"
555
+
556
+ # Decompression context
557
+ dctx_bytes = VibeZstd::DCtx.estimate_memory
558
+ puts "DCtx will use ~#{dctx_bytes} bytes"
559
+
560
+ # Dictionary memory
561
+ dict_size = 16_384
562
+ cdict_bytes = VibeZstd::CDict.estimate_memory(dict_size, 5)
563
+ ddict_bytes = VibeZstd::DDict.estimate_memory(dict_size)
564
+ puts "CDict: #{cdict_bytes} bytes, DDict: #{ddict_bytes} bytes"
565
+ ```
566
+
567
+ ## Integration Examples
568
+
569
+ Real-world examples demonstrating VibeZstd in production scenarios.
570
+
571
+ ### Rails Encrypted Columns with Thread-Local Contexts
572
+
573
+ Use VibeZstd with ActiveRecord::Encryption for high-performance compression of encrypted attributes.
574
+
575
+ #### Rails 7.1+ (Global Compressor Configuration)
576
+
577
+ ```ruby
578
+ # config/initializers/vibe_zstd_encryption.rb
579
+ module VibeZstdCompressor
580
+ # Compress using thread-local contexts (2-3x faster in multi-threaded environments)
581
+ def self.deflate(data)
582
+ VibeZstd::ThreadLocal.compress(data, level: 3)
583
+ end
584
+
585
+ def self.inflate(data)
586
+ VibeZstd::ThreadLocal.decompress(data)
587
+ end
588
+ end
589
+
590
+ ActiveSupport.on_load(:active_record) do
591
+ ActiveRecord::Encryption.config.support_unencrypted_data = true
592
+ ActiveRecord::Encryption.config.compressor = VibeZstdCompressor
593
+ end
594
+
595
+ # In your model - all encrypted attributes use VibeZstd
596
+ class User < ApplicationRecord
597
+ encrypts :preferences
598
+ encrypts :metadata
599
+ end
600
+ ```
601
+
602
+ #### Rails 8.0+ (Per-Attribute Compressor)
603
+
604
+ Rails 8 introduces a per-attribute `compressor:` option for fine-grained control:
605
+
606
+ ```ruby
607
+ # config/initializers/vibe_zstd_encryption.rb
608
+ module VibeZstdCompressor
609
+ def self.deflate(data)
610
+ VibeZstd::ThreadLocal.compress(data, level: 3)
611
+ end
612
+
613
+ def self.inflate(data)
614
+ VibeZstd::ThreadLocal.decompress(data)
615
+ end
616
+ end
617
+
618
+ # In your model - specify compressor per attribute
619
+ class User < ApplicationRecord
620
+ # Use VibeZstd for large JSON columns
621
+ encrypts :preferences, compressor: VibeZstdCompressor
622
+ encrypts :settings, compressor: VibeZstdCompressor
623
+
624
+ # Use default Zlib for small text fields
625
+ encrypts :api_key
626
+ end
627
+ ```
628
+
629
+ #### Rails 8.0+ with Per-Attribute Dictionaries
630
+
631
+ Rails 8's per-attribute compressor enables custom dictionaries for individual fields—maximum compression for structured data:
632
+
633
+ ```ruby
634
+ # config/initializers/vibe_zstd_encryption.rb
635
+
636
+ # Compressor for user preferences with custom dictionary
637
+ module UserPrefsCompressor
638
+ DICT = VibeZstd::CDict.new(
639
+ File.binread('config/dictionaries/user_preferences.dict')
640
+ )
641
+
642
+ def self.deflate(data)
643
+ VibeZstd::ThreadLocal.compress(data, dict: DICT, level: 5)
644
+ end
645
+
646
+ def self.inflate(data)
647
+ VibeZstd::ThreadLocal.decompress(data, dict: DICT.to_ddict)
648
+ end
649
+ end
650
+
651
+ # Compressor for audit logs with different dictionary
652
+ module AuditLogCompressor
653
+ DICT = VibeZstd::CDict.new(
654
+ File.binread('config/dictionaries/audit_logs.dict')
655
+ )
656
+
657
+ def self.deflate(data)
658
+ VibeZstd::ThreadLocal.compress(data, dict: DICT, level: 3)
659
+ end
660
+
661
+ def self.inflate(data)
662
+ VibeZstd::ThreadLocal.decompress(data, dict: DICT.to_ddict)
663
+ end
664
+ end
665
+
666
+ # In your models - each attribute gets optimized dictionary
667
+ class User < ApplicationRecord
668
+ encrypts :preferences, compressor: UserPrefsCompressor # 50%+ smaller with custom dict
669
+ encrypts :settings, compressor: VibeZstdCompressor # Standard VibeZstd (no dict)
670
+ encrypts :api_key # Default Zlib for small data
671
+ end
672
+
673
+ class AuditEvent < ApplicationRecord
674
+ encrypts :event_data, compressor: AuditLogCompressor # Custom dict for audit logs
675
+ end
676
+ ```
677
+
678
+ **Why per-attribute dictionaries?**
679
+ - **50-70% size reduction** for small, similar data (JSON user preferences, API responses, logs)
680
+ - **Different dictionaries** trained on different data patterns (user prefs vs audit logs)
681
+ - **ThreadLocal pooling** keeps one context per dictionary per thread—minimal memory overhead
682
+
683
+ **Why ThreadLocal?** In Puma and other multi-threaded Rails setups, `ThreadLocal` reuses contexts per thread instead of allocating new ones (~1.3MB) for every operation. Each Puma worker thread keeps its own CCtx and DCtx (one per dictionary when dictionaries are used), reducing memory and improving throughput.
684
+
685
+ **Rails 8 Advantage:** Per-attribute compressors let you optimize each field—use VibeZstd for large structured data (JSON, serialized objects) and default Zlib for small strings.
686
+
687
+ ### Dictionary Training for Encrypted Columns
688
+
689
+ For small, structured data (JSON, serialized objects), dictionaries can reduce size by 50%+:
690
+
691
+ ```ruby
692
+ # Step 1: Train dictionary from representative samples (one-time setup)
693
+ samples = User.limit(1000).pluck(:preferences).compact
694
+ dict_data = VibeZstd.train_dict(samples, max_dict_size: 16_384)
695
+ File.binwrite('config/user_prefs.dict', dict_data)
696
+
697
+ # Step 2: Load dictionary at boot (config/initializers/vibe_zstd_encryption.rb)
698
+ module UserPrefsCompressor
699
+ DICT = VibeZstd::CDict.new(File.binread('config/user_prefs.dict'))
700
+
701
+ def self.deflate(data)
702
+ VibeZstd::ThreadLocal.compress(data, dict: DICT)
703
+ end
704
+
705
+ def self.inflate(data)
706
+ VibeZstd::ThreadLocal.decompress(data, dict: DICT.to_ddict)
707
+ end
708
+ end
709
+
710
+ # Step 3: Configure in your model
711
+ # Rails 7.1+: Set as global compressor
712
+ ActiveSupport.on_load(:active_record) do
713
+ ActiveRecord::Encryption.config.compressor = UserPrefsCompressor
714
+ end
715
+
716
+ # Rails 8.0+: Set per-attribute
717
+ class User < ApplicationRecord
718
+ encrypts :preferences, compressor: UserPrefsCompressor
719
+ end
720
+ ```
721
+
722
+ **Dictionary guidelines:**
723
+ - **Samples:** 100+ representative samples, similar to production data
724
+ - **Algorithm:** `train_dict` (fast, good) or `train_dict_cover` (slower, better compression)
725
+ - **Size:** 16-64KB typical; larger doesn't always improve compression
726
+ - **Best for:** Small (< 10KB), similar data (JSON, logs, structured text)
727
+ - **Avoid for:** Large files, binary data, highly variable content
728
+
729
+ ### Stream Decompressing a Remote File
730
+
731
+ Memory-efficient decompression of large remote `.zst` files:
732
+
733
+ ```ruby
734
+ require 'net/http'
735
+ require 'vibe_zstd'
736
+
737
+ uri = URI('https://example.com/large_dataset.zst')
738
+ File.open('dataset.csv', 'wb') do |output|
739
+ Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
740
+ http.request_get(uri.path) do |response|
741
+ reader = VibeZstd::DecompressReader.new(response.body)
742
+ reader.each { |chunk| output.write(chunk) }
743
+ end
744
+ end
745
+ end
746
+ ```
747
+
748
+ **Constant memory:** Processes files of any size with ~128KB RAM (configurable via `initial_chunk_size`).
749
+
750
+ ### Stream Compressing Large Files
751
+
752
+ Compress large files without loading into memory:
753
+
754
+ ```ruby
755
+ # Compress 10GB file in chunks
756
+ File.open('large_data.txt', 'rb') do |input|
757
+ VibeZstd::CompressWriter.open('large_data.txt.zst', level: 5) do |writer|
758
+ while chunk = input.read(1_048_576) # 1MB chunks
759
+ writer.write(chunk)
760
+ end
761
+ end
762
+ end
763
+ ```
764
+
765
+ **When to stream:**
766
+ - Files > 100MB (avoids loading entire file into memory)
767
+ - Network streams, pipes, or IO objects
768
+ - Progressive compression (write data as it's generated)
769
+
770
+ ### Skippable Frame Metadata
771
+
772
+ Add metadata (version, timestamp, checksums) without affecting decompression:
773
+
774
+ ```ruby
775
+ # Write file with metadata
776
+ metadata = {version: "2.0", created_at: Time.now.to_i, schema: "users_v2"}.to_json
777
+ File.open('data.zst', 'wb') do |f|
778
+ f.write VibeZstd.write_skippable_frame(metadata, magic_number: 0)
779
+ f.write VibeZstd.compress(actual_data)
780
+ end
781
+
782
+ # Reading decompresses normally (skippable frames are skipped automatically)
783
+ data = VibeZstd.decompress(File.binread('data.zst'))
784
+
785
+ # Extract metadata without decompressing payload
786
+ File.open('data.zst', 'rb') do |f|
787
+ VibeZstd.each_skippable_frame(f.read) do |content, magic, offset|
788
+ metadata = JSON.parse(content)
789
+ puts "File version: #{metadata['version']}"
790
+ end
791
+ end
792
+ ```
793
+
794
+ **Use cases:**
795
+ - **Versioning:** Track data schema versions for migrations
796
+ - **Provenance:** Store creation timestamp, user, source system
797
+ - **Integrity:** Add checksums or signatures before compression
798
+ - **Archives:** Multi-file archives with per-file metadata (see `test_skippable_frame_archive_pattern` in the tests)
799
+
800
+ **Note:** Each skippable frame adds an 8-byte header (4-byte magic number + 4-byte content length) plus the size of the metadata itself. For small files, consider alternatives (separate metadata file, database columns).
801
+
802
+ ## API Reference
803
+
804
+ ### Module Methods
805
+
806
+ ```ruby
807
+ VibeZstd.compress(data, level: nil, dict: nil)
808
+ VibeZstd.decompress(data, dict: nil)
809
+ VibeZstd.frame_content_size(data)
810
+ VibeZstd.compress_bound(size)
811
+ VibeZstd.train_dict(samples, max_dict_size: 112640)
812
+ VibeZstd.train_dict_cover(samples, max_dict_size:, k:, d:, **opts)
813
+ VibeZstd.train_dict_fast_cover(samples, max_dict_size:, k:, d:, **opts)
814
+ VibeZstd.get_dict_id(dict_data)
815
+ VibeZstd.get_dict_id_from_frame(data)
816
+ VibeZstd.version_number # e.g., 10507
817
+ VibeZstd.version_string # e.g., "1.5.7"
818
+ VibeZstd.min_level # Minimum compression level
819
+ VibeZstd.max_level # Maximum compression level
820
+ VibeZstd.default_level # Default compression level
821
+ ```
822
+
823
+ ### CCtx (Compression Context)
824
+
825
+ ```ruby
826
+ cctx = VibeZstd::CCtx.new(**params)
827
+ cctx.compress(data, level: nil, dict: nil, pledged_size: nil)
828
+ cctx.use_prefix(prefix_data)
829
+
830
+ # Property setters (see parameters section)
831
+ cctx.checksum_flag = 1
832
+ cctx.content_size_flag = 1
833
+ cctx.compression_level = 9
834
+ cctx.window_log = 20
835
+ cctx.workers = 4
836
+ # ... and many more
837
+
838
+ # Class methods
839
+ VibeZstd::CCtx.parameter_bounds(param)
840
+ VibeZstd::CCtx.estimate_memory(level)
841
+ ```
842
+
843
+ ### DCtx (Decompression Context)
844
+
845
+ ```ruby
846
+ dctx = VibeZstd::DCtx.new(**params)
847
+ dctx.decompress(data, dict: nil, initial_capacity: nil)
848
+ dctx.use_prefix(prefix_data)
849
+ dctx.initial_capacity = 1_048_576
850
+ dctx.window_log_max = 20
851
+
852
+ # Class methods
853
+ VibeZstd::DCtx.default_initial_capacity = value
854
+ VibeZstd::DCtx.parameter_bounds(param)
855
+ VibeZstd::DCtx.frame_content_size(data)
856
+ VibeZstd::DCtx.estimate_memory
857
+ ```
858
+
859
+ ### CDict / DDict (Dictionaries)
860
+
861
+ ```ruby
862
+ cdict = VibeZstd::CDict.new(dict_data, level = nil)
863
+ cdict.size # Dictionary size in bytes
864
+ cdict.dict_id # Dictionary ID
865
+
866
+ ddict = VibeZstd::DDict.new(dict_data)
867
+ ddict.size
868
+ ddict.dict_id
869
+
870
+ # Class methods
871
+ VibeZstd::CDict.estimate_memory(dict_size, level)
872
+ VibeZstd::DDict.estimate_memory(dict_size)
873
+ ```
874
+
875
+ ### Streaming
876
+
877
+ ```ruby
878
+ # Compression
879
+ writer = VibeZstd::CompressWriter.new(io, level: 3, dict: nil, pledged_size: nil)
880
+ VibeZstd::CompressWriter.open(io, **opts) { |w| ... }
881
+ writer.write(data)
882
+ writer.flush
883
+ writer.finish # or writer.close
884
+
885
+ # Decompression
886
+ reader = VibeZstd::DecompressReader.new(io, dict: nil, initial_chunk_size: nil)
887
+ VibeZstd::DecompressReader.open(io, **opts) { |r| ... }
888
+ reader.read(size = nil)
889
+ reader.eof?
890
+ reader.each { |chunk| ... }
891
+ reader.each_line(separator = $/) { |line| ... }
892
+ reader.gets(separator = $/)
893
+ reader.readline(separator = $/)
894
+ reader.readpartial(maxlen)
895
+ reader.read_all
896
+ ```
897
+
898
+ ### ThreadLocal (Context Pooling)
899
+
900
+ ```ruby
901
+ # Thread-local context reuse (ideal for Rails/Puma applications)
902
+ VibeZstd::ThreadLocal.compress(data, level: nil, dict: nil, pledged_size: nil)
903
+ VibeZstd::ThreadLocal.decompress(data, dict: nil, initial_capacity: nil)
904
+ VibeZstd::ThreadLocal.clear_thread_cache!
905
+ VibeZstd::ThreadLocal.thread_cache_stats
906
+ ```
907
+
908
+ ## Thread Safety and Ractors
909
+
910
+ VibeZstd is designed to be thread-safe and Ractor-compatible:
911
+
912
+ - Each context/dictionary object manages its own Zstd state
913
+ - CPU-intensive operations release the GVL for concurrent execution
914
+ - Create separate instances for each thread/Ractor as needed
915
+
916
+ ```ruby
917
+ # Safe: Each thread has its own context
918
+ threads = 10.times.map do
919
+ Thread.new do
920
+ cctx = VibeZstd::CCtx.new
921
+ # ... use cctx
922
+ end
923
+ end
924
+ ```
925
+
926
+ ## Benchmarking
927
+
928
+ Run comprehensive benchmarks:
929
+
930
+ ```bash
931
+ # All benchmarks
932
+ ruby benchmark/run_all.rb
933
+
934
+ # Specific benchmarks
935
+ ruby benchmark/context_reuse.rb
936
+ ruby benchmark/dictionary_usage.rb
937
+ ruby benchmark/compression_levels.rb
938
+ ruby benchmark/streaming.rb
939
+ ruby benchmark/multithreading.rb
940
+
941
+ # Generate README benchmark output
942
+ ruby benchmark/for_readme.rb
943
+ ```
944
+
945
+ See `benchmark/README.md` for detailed documentation.
946
+
947
+ ## Development
948
+
949
+ To set up the development environment:
950
+
951
+ ```bash
952
+ bin/setup # Install dependencies
953
+ rake compile # Build C extension
954
+ rake test # Run tests
955
+ bin/console # Interactive console
956
+ bundle exec rake install # Install locally
957
+ ```
958
+
959
+ ## Contributing
960
+
961
+ Bug reports and pull requests are welcome on GitHub at https://github.com/kreynolds/vibe_zstd.
962
+
963
+ ## Vendored Libraries
964
+
965
+ This gem vendors the Zstandard (zstd) compression library to provide consistent behavior across all platforms. The vendored zstd library is located in `ext/vibe_zstd/libzstd/` and is licensed under the BSD License.
966
+
967
+ **Zstandard License:**
968
+ - Copyright (c) Meta Platforms, Inc. and affiliates
969
+ - Licensed under the BSD License (see `ext/vibe_zstd/libzstd/LICENSE`)
970
+ - Project: https://github.com/facebook/zstd
971
+
972
+ For the complete zstd license text, see the LICENSE file in the vendored library directory.
973
+
974
+ ## License
975
+
976
+ The VibeZstd gem itself is available as open source under the [MIT License](https://opensource.org/licenses/MIT).
977
+
978
+ This gem vendors the Zstandard library, which is separately licensed under the BSD License. See the [Vendored Libraries](#vendored-libraries) section above for details.