vibe_zstd 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. checksums.yaml +7 -0
  2. data/.standard.yml +3 -0
  3. data/CHANGELOG.md +22 -0
  4. data/LICENSE.txt +21 -0
  5. data/README.md +978 -0
  6. data/Rakefile +20 -0
  7. data/benchmark/README.md +198 -0
  8. data/benchmark/compression_levels.rb +99 -0
  9. data/benchmark/context_reuse.rb +174 -0
  10. data/benchmark/decompression_speed_by_level.rb +65 -0
  11. data/benchmark/dictionary_training.rb +182 -0
  12. data/benchmark/dictionary_usage.rb +121 -0
  13. data/benchmark/for_readme.rb +157 -0
  14. data/benchmark/generate_fixture.rb +82 -0
  15. data/benchmark/helpers.rb +237 -0
  16. data/benchmark/multithreading.rb +105 -0
  17. data/benchmark/run_all.rb +150 -0
  18. data/benchmark/streaming.rb +154 -0
  19. data/ext/vibe_zstd/Makefile +270 -0
  20. data/ext/vibe_zstd/cctx.c +565 -0
  21. data/ext/vibe_zstd/dctx.c +493 -0
  22. data/ext/vibe_zstd/dict.c +587 -0
  23. data/ext/vibe_zstd/extconf.rb +52 -0
  24. data/ext/vibe_zstd/frames.c +132 -0
  25. data/ext/vibe_zstd/libzstd/LICENSE +30 -0
  26. data/ext/vibe_zstd/libzstd/common/allocations.h +55 -0
  27. data/ext/vibe_zstd/libzstd/common/bits.h +205 -0
  28. data/ext/vibe_zstd/libzstd/common/bitstream.h +454 -0
  29. data/ext/vibe_zstd/libzstd/common/compiler.h +464 -0
  30. data/ext/vibe_zstd/libzstd/common/cpu.h +249 -0
  31. data/ext/vibe_zstd/libzstd/common/debug.c +30 -0
  32. data/ext/vibe_zstd/libzstd/common/debug.h +107 -0
  33. data/ext/vibe_zstd/libzstd/common/entropy_common.c +340 -0
  34. data/ext/vibe_zstd/libzstd/common/error_private.c +64 -0
  35. data/ext/vibe_zstd/libzstd/common/error_private.h +158 -0
  36. data/ext/vibe_zstd/libzstd/common/fse.h +625 -0
  37. data/ext/vibe_zstd/libzstd/common/fse_decompress.c +315 -0
  38. data/ext/vibe_zstd/libzstd/common/huf.h +277 -0
  39. data/ext/vibe_zstd/libzstd/common/mem.h +422 -0
  40. data/ext/vibe_zstd/libzstd/common/pool.c +371 -0
  41. data/ext/vibe_zstd/libzstd/common/pool.h +81 -0
  42. data/ext/vibe_zstd/libzstd/common/portability_macros.h +171 -0
  43. data/ext/vibe_zstd/libzstd/common/threading.c +182 -0
  44. data/ext/vibe_zstd/libzstd/common/threading.h +142 -0
  45. data/ext/vibe_zstd/libzstd/common/xxhash.c +18 -0
  46. data/ext/vibe_zstd/libzstd/common/xxhash.h +7094 -0
  47. data/ext/vibe_zstd/libzstd/common/zstd_common.c +48 -0
  48. data/ext/vibe_zstd/libzstd/common/zstd_deps.h +123 -0
  49. data/ext/vibe_zstd/libzstd/common/zstd_internal.h +324 -0
  50. data/ext/vibe_zstd/libzstd/common/zstd_trace.h +156 -0
  51. data/ext/vibe_zstd/libzstd/compress/clevels.h +134 -0
  52. data/ext/vibe_zstd/libzstd/compress/fse_compress.c +625 -0
  53. data/ext/vibe_zstd/libzstd/compress/hist.c +191 -0
  54. data/ext/vibe_zstd/libzstd/compress/hist.h +82 -0
  55. data/ext/vibe_zstd/libzstd/compress/huf_compress.c +1464 -0
  56. data/ext/vibe_zstd/libzstd/compress/zstd_compress.c +7843 -0
  57. data/ext/vibe_zstd/libzstd/compress/zstd_compress_internal.h +1636 -0
  58. data/ext/vibe_zstd/libzstd/compress/zstd_compress_literals.c +235 -0
  59. data/ext/vibe_zstd/libzstd/compress/zstd_compress_literals.h +39 -0
  60. data/ext/vibe_zstd/libzstd/compress/zstd_compress_sequences.c +442 -0
  61. data/ext/vibe_zstd/libzstd/compress/zstd_compress_sequences.h +55 -0
  62. data/ext/vibe_zstd/libzstd/compress/zstd_compress_superblock.c +688 -0
  63. data/ext/vibe_zstd/libzstd/compress/zstd_compress_superblock.h +32 -0
  64. data/ext/vibe_zstd/libzstd/compress/zstd_cwksp.h +765 -0
  65. data/ext/vibe_zstd/libzstd/compress/zstd_double_fast.c +778 -0
  66. data/ext/vibe_zstd/libzstd/compress/zstd_double_fast.h +42 -0
  67. data/ext/vibe_zstd/libzstd/compress/zstd_fast.c +985 -0
  68. data/ext/vibe_zstd/libzstd/compress/zstd_fast.h +30 -0
  69. data/ext/vibe_zstd/libzstd/compress/zstd_lazy.c +2199 -0
  70. data/ext/vibe_zstd/libzstd/compress/zstd_lazy.h +193 -0
  71. data/ext/vibe_zstd/libzstd/compress/zstd_ldm.c +745 -0
  72. data/ext/vibe_zstd/libzstd/compress/zstd_ldm.h +109 -0
  73. data/ext/vibe_zstd/libzstd/compress/zstd_ldm_geartab.h +106 -0
  74. data/ext/vibe_zstd/libzstd/compress/zstd_opt.c +1580 -0
  75. data/ext/vibe_zstd/libzstd/compress/zstd_opt.h +72 -0
  76. data/ext/vibe_zstd/libzstd/compress/zstd_preSplit.c +238 -0
  77. data/ext/vibe_zstd/libzstd/compress/zstd_preSplit.h +33 -0
  78. data/ext/vibe_zstd/libzstd/compress/zstdmt_compress.c +1923 -0
  79. data/ext/vibe_zstd/libzstd/compress/zstdmt_compress.h +102 -0
  80. data/ext/vibe_zstd/libzstd/decompress/huf_decompress.c +1944 -0
  81. data/ext/vibe_zstd/libzstd/decompress/huf_decompress_amd64.S +602 -0
  82. data/ext/vibe_zstd/libzstd/decompress/zstd_ddict.c +244 -0
  83. data/ext/vibe_zstd/libzstd/decompress/zstd_ddict.h +44 -0
  84. data/ext/vibe_zstd/libzstd/decompress/zstd_decompress.c +2410 -0
  85. data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_block.c +2209 -0
  86. data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_block.h +73 -0
  87. data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_internal.h +240 -0
  88. data/ext/vibe_zstd/libzstd/deprecated/zbuff.h +214 -0
  89. data/ext/vibe_zstd/libzstd/deprecated/zbuff_common.c +26 -0
  90. data/ext/vibe_zstd/libzstd/deprecated/zbuff_compress.c +167 -0
  91. data/ext/vibe_zstd/libzstd/deprecated/zbuff_decompress.c +77 -0
  92. data/ext/vibe_zstd/libzstd/dictBuilder/cover.c +1302 -0
  93. data/ext/vibe_zstd/libzstd/dictBuilder/cover.h +152 -0
  94. data/ext/vibe_zstd/libzstd/dictBuilder/divsufsort.c +1913 -0
  95. data/ext/vibe_zstd/libzstd/dictBuilder/divsufsort.h +57 -0
  96. data/ext/vibe_zstd/libzstd/dictBuilder/fastcover.c +766 -0
  97. data/ext/vibe_zstd/libzstd/dictBuilder/zdict.c +1133 -0
  98. data/ext/vibe_zstd/libzstd/zdict.h +481 -0
  99. data/ext/vibe_zstd/libzstd/zstd.h +3198 -0
  100. data/ext/vibe_zstd/libzstd/zstd_errors.h +107 -0
  101. data/ext/vibe_zstd/streaming.c +410 -0
  102. data/ext/vibe_zstd/vibe_zstd.c +293 -0
  103. data/ext/vibe_zstd/vibe_zstd.h +56 -0
  104. data/ext/vibe_zstd/vibe_zstd_internal.h +27 -0
  105. data/lib/vibe_zstd/constants.rb +67 -0
  106. data/lib/vibe_zstd/version.rb +5 -0
  107. data/lib/vibe_zstd.rb +255 -0
  108. data/sig/vibe_zstd.rbs +76 -0
  109. metadata +179 -0
@@ -0,0 +1,587 @@
1
+ // Dictionary implementation for VibeZstd
2
+ #include "vibe_zstd_internal.h"
3
+
4
+ // Forward declarations
5
+ static VALUE vibe_zstd_cdict_initialize(int argc, VALUE* argv, VALUE self);
6
+ static VALUE vibe_zstd_cdict_size(VALUE self);
7
+ static VALUE vibe_zstd_cdict_dict_id(VALUE self);
8
+ static VALUE vibe_zstd_cdict_estimate_memory(VALUE self, VALUE dict_size, VALUE level);
9
+ static VALUE vibe_zstd_ddict_initialize(VALUE self, VALUE dict_data);
10
+ static VALUE vibe_zstd_ddict_size(VALUE self);
11
+ static VALUE vibe_zstd_ddict_dict_id(VALUE self);
12
+ static VALUE vibe_zstd_ddict_estimate_memory(VALUE self, VALUE dict_size);
13
+ static VALUE vibe_zstd_train_dict(int argc, VALUE* argv, VALUE self);
14
+ static VALUE vibe_zstd_train_dict_cover(int argc, VALUE* argv, VALUE self);
15
+ static VALUE vibe_zstd_train_dict_fast_cover(int argc, VALUE* argv, VALUE self);
16
+ static VALUE vibe_zstd_get_dict_id(VALUE self, VALUE dict_data);
17
+ static VALUE vibe_zstd_get_dict_id_from_frame(VALUE self, VALUE data);
18
+ static VALUE vibe_zstd_finalize_dictionary(int argc, VALUE* argv, VALUE self);
19
+ static VALUE vibe_zstd_dict_header_size(VALUE self, VALUE dict_data);
20
+
21
+ // TypedData types - defined in vibe_zstd.c
22
+ extern rb_data_type_t vibe_zstd_cdict_type;
23
+ extern rb_data_type_t vibe_zstd_ddict_type;
24
+
25
+ // CDict initialize method
26
+ static VALUE
27
+ vibe_zstd_cdict_initialize(int argc, VALUE* argv, VALUE self) {
28
+ VALUE dict_data, level = Qnil;
29
+ rb_scan_args(argc, argv, "11", &dict_data, &level);
30
+ vibe_zstd_cdict* cdict;
31
+ TypedData_Get_Struct(self, vibe_zstd_cdict, &vibe_zstd_cdict_type, cdict);
32
+ StringValue(dict_data);
33
+ int lvl = NIL_P(level) ? ZSTD_defaultCLevel() : NUM2INT(level);
34
+ cdict->cdict = ZSTD_createCDict(RSTRING_PTR(dict_data), RSTRING_LEN(dict_data), lvl);
35
+ if (!cdict->cdict) {
36
+ rb_raise(rb_eRuntimeError, "Failed to create ZSTD_CDict");
37
+ }
38
+
39
+ // Store dictionary data and level for later retrieval
40
+ rb_ivar_set(self, rb_intern("@dict_data"), dict_data);
41
+ rb_ivar_set(self, rb_intern("@compression_level"), INT2NUM(lvl));
42
+
43
+ return self;
44
+ }
45
+
46
+ // CDict size method - returns the size in memory
47
+ static VALUE
48
+ vibe_zstd_cdict_size(VALUE self) {
49
+ vibe_zstd_cdict* cdict;
50
+ TypedData_Get_Struct(self, vibe_zstd_cdict, &vibe_zstd_cdict_type, cdict);
51
+ if (!cdict->cdict) {
52
+ rb_raise(rb_eRuntimeError, "CDict not initialized");
53
+ }
54
+ size_t size = ZSTD_sizeof_CDict(cdict->cdict);
55
+ return SIZET2NUM(size);
56
+ }
57
+
58
+ // CDict dict_id method - returns dictionary ID
59
+ static VALUE
60
+ vibe_zstd_cdict_dict_id(VALUE self) {
61
+ vibe_zstd_cdict* cdict;
62
+ TypedData_Get_Struct(self, vibe_zstd_cdict, &vibe_zstd_cdict_type, cdict);
63
+ if (!cdict->cdict) {
64
+ rb_raise(rb_eRuntimeError, "CDict not initialized");
65
+ }
66
+ unsigned dictID = ZSTD_getDictID_fromCDict(cdict->cdict);
67
+ return UINT2NUM(dictID);
68
+ }
69
+
70
+ // DDict initialize method
71
+ static VALUE
72
+ vibe_zstd_ddict_initialize(VALUE self, VALUE dict_data) {
73
+ vibe_zstd_ddict* ddict;
74
+ TypedData_Get_Struct(self, vibe_zstd_ddict, &vibe_zstd_ddict_type, ddict);
75
+ StringValue(dict_data);
76
+ ddict->ddict = ZSTD_createDDict(RSTRING_PTR(dict_data), RSTRING_LEN(dict_data));
77
+ if (!ddict->ddict) {
78
+ rb_raise(rb_eRuntimeError, "Failed to create ZSTD_DDict");
79
+ }
80
+ return self;
81
+ }
82
+
83
+ // DDict size method - returns the size in memory
84
+ static VALUE
85
+ vibe_zstd_ddict_size(VALUE self) {
86
+ vibe_zstd_ddict* ddict;
87
+ TypedData_Get_Struct(self, vibe_zstd_ddict, &vibe_zstd_ddict_type, ddict);
88
+ if (!ddict->ddict) {
89
+ rb_raise(rb_eRuntimeError, "DDict not initialized");
90
+ }
91
+ size_t size = ZSTD_sizeof_DDict(ddict->ddict);
92
+ return SIZET2NUM(size);
93
+ }
94
+
95
+ // DDict dict_id method - returns dictionary ID
96
+ static VALUE
97
+ vibe_zstd_ddict_dict_id(VALUE self) {
98
+ vibe_zstd_ddict* ddict;
99
+ TypedData_Get_Struct(self, vibe_zstd_ddict, &vibe_zstd_ddict_type, ddict);
100
+ if (!ddict->ddict) {
101
+ rb_raise(rb_eRuntimeError, "DDict not initialized");
102
+ }
103
+ unsigned dictID = ZSTD_getDictID_fromDDict(ddict->ddict);
104
+ return UINT2NUM(dictID);
105
+ }
106
+
107
+ // Cleanup structure for dictionary training operations
108
+ // Groups all allocated resources for dictionary training so they can be
109
+ // freed together in error paths or on success
110
+ typedef struct {
111
+ size_t* sample_sizes;
112
+ char* samples_buffer;
113
+ void* dict_buffer;
114
+ } dict_training_resources;
115
+
116
+ // Cleanup function for dictionary training resources
117
+ // Safely frees all allocated memory, checking for NULL to handle partial allocations.
118
+ // Called explicitly in error paths and after successful training to prevent leaks.
119
+ static VALUE
120
+ dict_training_cleanup(VALUE arg) {
121
+ dict_training_resources* resources = (dict_training_resources*)arg;
122
+ if (resources->sample_sizes) xfree(resources->sample_sizes);
123
+ if (resources->samples_buffer) xfree(resources->samples_buffer);
124
+ if (resources->dict_buffer) xfree(resources->dict_buffer);
125
+ return Qnil;
126
+ }
127
+
128
+ // Train dictionary from samples - module-level method
129
+ // VibeZstd.train_dict(samples, max_dict_size: 112640)
130
+ //
131
+ // Memory usage: Allocates memory equal to sum of all sample sizes plus max_dict_size.
132
+ // For large datasets, consider training on a representative subset to reduce memory footprint.
133
+ static VALUE
134
+ vibe_zstd_train_dict(int argc, VALUE* argv, VALUE self) {
135
+ VALUE samples, options;
136
+ rb_scan_args(argc, argv, "1:", &samples, &options);
137
+
138
+ // Layer 1: Validate inputs BEFORE any allocation (fail-fast)
139
+ Check_Type(samples, T_ARRAY);
140
+ long num_samples = RARRAY_LEN(samples);
141
+
142
+ if (num_samples == 0) {
143
+ rb_raise(rb_eArgError, "samples array cannot be empty");
144
+ }
145
+
146
+ // Validate all samples are strings and calculate sizes BEFORE allocating
147
+ size_t total_samples_size = 0;
148
+ for (long i = 0; i < num_samples; i++) {
149
+ VALUE sample = rb_ary_entry(samples, i);
150
+ StringValue(sample); // Validate type early - may raise TypeError
151
+ total_samples_size += RSTRING_LEN(sample);
152
+ }
153
+
154
+ // Parse options
155
+ VALUE max_dict_size_val = Qnil;
156
+ if (!NIL_P(options)) {
157
+ max_dict_size_val = rb_hash_aref(options, ID2SYM(rb_intern("max_dict_size")));
158
+ }
159
+
160
+ // Default max dictionary size is 112KB (zstd default)
161
+ size_t max_dict_size = NIL_P(max_dict_size_val) ? (112 * 1024) : NUM2SIZET(max_dict_size_val);
162
+
163
+ // Layer 2: Allocate late - only after validation passes
164
+ dict_training_resources resources = {NULL, NULL, NULL};
165
+ resources.sample_sizes = ALLOC_N(size_t, num_samples);
166
+ resources.samples_buffer = ALLOC_N(char, total_samples_size);
167
+ resources.dict_buffer = ALLOC_N(char, max_dict_size);
168
+
169
+ // Layer 3: Use rb_ensure for guaranteed cleanup (safety net)
170
+ // Build samples buffer - we already validated, so just copy
171
+ size_t offset = 0;
172
+ for (long i = 0; i < num_samples; i++) {
173
+ VALUE sample = rb_ary_entry(samples, i);
174
+ size_t sample_len = RSTRING_LEN(sample);
175
+ resources.sample_sizes[i] = sample_len;
176
+ memcpy(resources.samples_buffer + offset, RSTRING_PTR(sample), sample_len);
177
+ offset += sample_len;
178
+ }
179
+
180
+ // Train the dictionary
181
+ size_t dict_size = ZDICT_trainFromBuffer(
182
+ resources.dict_buffer, max_dict_size,
183
+ resources.samples_buffer, resources.sample_sizes, (unsigned)num_samples
184
+ );
185
+
186
+ // Check for errors
187
+ if (ZDICT_isError(dict_size)) {
188
+ dict_training_cleanup((VALUE)&resources);
189
+ rb_raise(rb_eRuntimeError, "Dictionary training failed: %s", ZDICT_getErrorName(dict_size));
190
+ }
191
+
192
+ // Create Ruby string with the trained dictionary
193
+ VALUE dict_string = rb_str_new(resources.dict_buffer, dict_size);
194
+
195
+ // Clean up all resources
196
+ dict_training_cleanup((VALUE)&resources);
197
+
198
+ return dict_string;
199
+ }
200
+
201
+ // VibeZstd.train_dict_cover(samples, max_dict_size: 112640, k: 0, d: 0, steps: 0, split_point: 1.0, shrink_dict: false, shrink_dict_max_regression: 0, nb_threads: 0)
202
+ //
203
+ // Memory usage: Allocates memory equal to sum of all sample sizes plus max_dict_size.
204
+ // For large datasets, consider training on a representative subset to reduce memory footprint.
205
+ static VALUE
206
+ vibe_zstd_train_dict_cover(int argc, VALUE* argv, VALUE self) {
207
+ VALUE samples, options;
208
+ rb_scan_args(argc, argv, "1:", &samples, &options);
209
+
210
+ // Layer 1: Validate inputs BEFORE any allocation (fail-fast)
211
+ Check_Type(samples, T_ARRAY);
212
+ long num_samples = RARRAY_LEN(samples);
213
+
214
+ if (num_samples == 0) {
215
+ rb_raise(rb_eArgError, "samples array cannot be empty");
216
+ }
217
+
218
+ // Validate all samples are strings and calculate sizes BEFORE allocating
219
+ size_t total_samples_size = 0;
220
+ for (long i = 0; i < num_samples; i++) {
221
+ VALUE sample = rb_ary_entry(samples, i);
222
+ StringValue(sample); // Validate type early - may raise TypeError
223
+ total_samples_size += RSTRING_LEN(sample);
224
+ }
225
+
226
+ // Initialize COVER parameters with defaults
227
+ ZDICT_cover_params_t params;
228
+ memset(&params, 0, sizeof(params));
229
+ params.splitPoint = 1.0; // Default split point
230
+
231
+ // Parse options
232
+ if (!NIL_P(options)) {
233
+ VALUE v;
234
+
235
+ v = rb_hash_aref(options, ID2SYM(rb_intern("k")));
236
+ if (!NIL_P(v)) params.k = NUM2UINT(v);
237
+
238
+ v = rb_hash_aref(options, ID2SYM(rb_intern("d")));
239
+ if (!NIL_P(v)) params.d = NUM2UINT(v);
240
+
241
+ v = rb_hash_aref(options, ID2SYM(rb_intern("steps")));
242
+ if (!NIL_P(v)) params.steps = NUM2UINT(v);
243
+
244
+ v = rb_hash_aref(options, ID2SYM(rb_intern("split_point")));
245
+ if (!NIL_P(v)) params.splitPoint = NUM2DBL(v);
246
+
247
+ v = rb_hash_aref(options, ID2SYM(rb_intern("shrink_dict")));
248
+ if (!NIL_P(v)) params.shrinkDict = RTEST(v) ? 1 : 0;
249
+
250
+ v = rb_hash_aref(options, ID2SYM(rb_intern("shrink_dict_max_regression")));
251
+ if (!NIL_P(v)) params.shrinkDictMaxRegression = NUM2UINT(v);
252
+
253
+ v = rb_hash_aref(options, ID2SYM(rb_intern("nb_threads")));
254
+ if (!NIL_P(v)) params.nbThreads = NUM2UINT(v);
255
+ }
256
+
257
+ // Get max_dict_size (default 112KB)
258
+ VALUE max_dict_size_val = Qnil;
259
+ if (!NIL_P(options)) {
260
+ max_dict_size_val = rb_hash_aref(options, ID2SYM(rb_intern("max_dict_size")));
261
+ }
262
+ size_t max_dict_size = NIL_P(max_dict_size_val) ? (112 * 1024) : NUM2SIZET(max_dict_size_val);
263
+ params.zParams.compressionLevel = 0; // Use default compression level
264
+
265
+ // Layer 2: Allocate late - only after validation passes
266
+ dict_training_resources resources = {NULL, NULL, NULL};
267
+ resources.sample_sizes = ALLOC_N(size_t, num_samples);
268
+ resources.samples_buffer = ALLOC_N(char, total_samples_size);
269
+ resources.dict_buffer = ALLOC_N(char, max_dict_size);
270
+
271
+ // Layer 3: Use rb_ensure for guaranteed cleanup (safety net)
272
+ // Build samples buffer - we already validated, so just copy
273
+ size_t offset = 0;
274
+ for (long i = 0; i < num_samples; i++) {
275
+ VALUE sample = rb_ary_entry(samples, i);
276
+ size_t sample_len = RSTRING_LEN(sample);
277
+ resources.sample_sizes[i] = sample_len;
278
+ memcpy(resources.samples_buffer + offset, RSTRING_PTR(sample), sample_len);
279
+ offset += sample_len;
280
+ }
281
+
282
+ // Train the dictionary using COVER algorithm
283
+ size_t dict_size = ZDICT_trainFromBuffer_cover(
284
+ resources.dict_buffer, max_dict_size,
285
+ resources.samples_buffer, resources.sample_sizes, (unsigned)num_samples,
286
+ params
287
+ );
288
+
289
+ // Check for errors
290
+ if (ZDICT_isError(dict_size)) {
291
+ dict_training_cleanup((VALUE)&resources);
292
+ rb_raise(rb_eRuntimeError, "Dictionary training failed: %s", ZDICT_getErrorName(dict_size));
293
+ }
294
+
295
+ // Create Ruby string with the trained dictionary
296
+ VALUE dict_string = rb_str_new(resources.dict_buffer, dict_size);
297
+
298
+ // Clean up all resources
299
+ dict_training_cleanup((VALUE)&resources);
300
+
301
+ return dict_string;
302
+ }
303
+
304
+ // VibeZstd.train_dict_fast_cover(samples, max_dict_size: 112640, k: 0, d: 0, f: 0, split_point: 1.0, accel: 0, shrink_dict: false, shrink_dict_max_regression: 0, nb_threads: 0)
305
+ //
306
+ // Memory usage: Allocates memory equal to sum of all sample sizes plus max_dict_size.
307
+ // For large datasets, consider training on a representative subset to reduce memory footprint.
308
+ static VALUE
309
+ vibe_zstd_train_dict_fast_cover(int argc, VALUE* argv, VALUE self) {
310
+ VALUE samples, options;
311
+ rb_scan_args(argc, argv, "1:", &samples, &options);
312
+
313
+ // Layer 1: Validate inputs BEFORE any allocation (fail-fast)
314
+ Check_Type(samples, T_ARRAY);
315
+ long num_samples = RARRAY_LEN(samples);
316
+
317
+ if (num_samples == 0) {
318
+ rb_raise(rb_eArgError, "samples array cannot be empty");
319
+ }
320
+
321
+ // Validate all samples are strings and calculate sizes BEFORE allocating
322
+ size_t total_samples_size = 0;
323
+ for (long i = 0; i < num_samples; i++) {
324
+ VALUE sample = rb_ary_entry(samples, i);
325
+ StringValue(sample); // Validate type early - may raise TypeError
326
+ total_samples_size += RSTRING_LEN(sample);
327
+ }
328
+
329
+ // Initialize COVER parameters with defaults
330
+ ZDICT_fastCover_params_t params;
331
+ memset(&params, 0, sizeof(params));
332
+ params.splitPoint = 1.0; // Default split point
333
+
334
+ // Parse options
335
+ if (!NIL_P(options)) {
336
+ VALUE v;
337
+
338
+ v = rb_hash_aref(options, ID2SYM(rb_intern("k")));
339
+ if (!NIL_P(v)) params.k = NUM2UINT(v);
340
+
341
+ v = rb_hash_aref(options, ID2SYM(rb_intern("d")));
342
+ if (!NIL_P(v)) params.d = NUM2UINT(v);
343
+
344
+ v = rb_hash_aref(options, ID2SYM(rb_intern("f")));
345
+ if (!NIL_P(v)) params.f = NUM2UINT(v);
346
+
347
+ v = rb_hash_aref(options, ID2SYM(rb_intern("split_point")));
348
+ if (!NIL_P(v)) params.splitPoint = NUM2DBL(v);
349
+
350
+ v = rb_hash_aref(options, ID2SYM(rb_intern("accel")));
351
+ if (!NIL_P(v)) params.accel = NUM2UINT(v);
352
+
353
+ v = rb_hash_aref(options, ID2SYM(rb_intern("shrink_dict")));
354
+ if (!NIL_P(v)) params.shrinkDict = RTEST(v) ? 1 : 0;
355
+
356
+ v = rb_hash_aref(options, ID2SYM(rb_intern("shrink_dict_max_regression")));
357
+ if (!NIL_P(v)) params.shrinkDictMaxRegression = NUM2UINT(v);
358
+
359
+ v = rb_hash_aref(options, ID2SYM(rb_intern("nb_threads")));
360
+ if (!NIL_P(v)) params.nbThreads = NUM2UINT(v);
361
+ }
362
+
363
+ // Get max_dict_size (default 112KB)
364
+ VALUE max_dict_size_val = Qnil;
365
+ if (!NIL_P(options)) {
366
+ max_dict_size_val = rb_hash_aref(options, ID2SYM(rb_intern("max_dict_size")));
367
+ }
368
+ size_t max_dict_size = NIL_P(max_dict_size_val) ? (112 * 1024) : NUM2SIZET(max_dict_size_val);
369
+ params.zParams.compressionLevel = 0; // Use default compression level
370
+
371
+ // Layer 2: Allocate late - only after validation passes
372
+ dict_training_resources resources = {NULL, NULL, NULL};
373
+ resources.sample_sizes = ALLOC_N(size_t, num_samples);
374
+ resources.samples_buffer = ALLOC_N(char, total_samples_size);
375
+ resources.dict_buffer = ALLOC_N(char, max_dict_size);
376
+
377
+ // Layer 3: Use rb_ensure for guaranteed cleanup (safety net)
378
+ // Build samples buffer - we already validated, so just copy
379
+ size_t offset = 0;
380
+ for (long i = 0; i < num_samples; i++) {
381
+ VALUE sample = rb_ary_entry(samples, i);
382
+ size_t sample_len = RSTRING_LEN(sample);
383
+ resources.sample_sizes[i] = sample_len;
384
+ memcpy(resources.samples_buffer + offset, RSTRING_PTR(sample), sample_len);
385
+ offset += sample_len;
386
+ }
387
+
388
+ // Train the dictionary using fast COVER algorithm
389
+ size_t dict_size = ZDICT_trainFromBuffer_fastCover(
390
+ resources.dict_buffer, max_dict_size,
391
+ resources.samples_buffer, resources.sample_sizes, (unsigned)num_samples,
392
+ params
393
+ );
394
+
395
+ // Check for errors
396
+ if (ZDICT_isError(dict_size)) {
397
+ dict_training_cleanup((VALUE)&resources);
398
+ rb_raise(rb_eRuntimeError, "Dictionary training failed: %s", ZDICT_getErrorName(dict_size));
399
+ }
400
+
401
+ // Create Ruby string with the trained dictionary
402
+ VALUE dict_string = rb_str_new(resources.dict_buffer, dict_size);
403
+
404
+ // Clean up all resources
405
+ dict_training_cleanup((VALUE)&resources);
406
+
407
+ return dict_string;
408
+ }
409
+
410
+ // Get dictionary ID from raw dictionary data - module-level utility
411
+ // VibeZstd.get_dict_id(dict_data)
412
+ static VALUE
413
+ vibe_zstd_get_dict_id(VALUE self, VALUE dict_data) {
414
+ StringValue(dict_data);
415
+ unsigned dict_id = ZDICT_getDictID(RSTRING_PTR(dict_data), RSTRING_LEN(dict_data));
416
+ return UINT2NUM(dict_id);
417
+ }
418
+
419
+ // Get dictionary ID from compressed frame - module-level utility
420
+ // VibeZstd.get_dict_id_from_frame(data)
421
+ static VALUE
422
+ vibe_zstd_get_dict_id_from_frame(VALUE self, VALUE data) {
423
+ StringValue(data);
424
+ unsigned dict_id = ZSTD_getDictID_fromFrame(RSTRING_PTR(data), RSTRING_LEN(data));
425
+ return UINT2NUM(dict_id);
426
+ }
427
+
428
+ // Finalize raw content into zstd dictionary - module-level utility
429
+ // VibeZstd.finalize_dictionary(content:, samples:, max_size:, compression_level: nil, dict_id: nil)
430
+ //
431
+ // Memory usage: Allocates memory equal to sum of all sample sizes plus max_size.
432
+ // For large datasets, consider using a representative subset of samples.
433
+ static VALUE
434
+ vibe_zstd_finalize_dictionary(int argc, VALUE* argv, VALUE self) {
435
+ VALUE options;
436
+ rb_scan_args(argc, argv, ":", &options);
437
+
438
+ // Layer 1: Validate inputs BEFORE any allocation (fail-fast)
439
+ if (NIL_P(options)) {
440
+ rb_raise(rb_eArgError, "finalize_dictionary requires keyword arguments");
441
+ }
442
+
443
+ // Get required parameters
444
+ VALUE content_val = rb_hash_aref(options, ID2SYM(rb_intern("content")));
445
+ VALUE samples_val = rb_hash_aref(options, ID2SYM(rb_intern("samples")));
446
+ VALUE max_size_val = rb_hash_aref(options, ID2SYM(rb_intern("max_size")));
447
+
448
+ if (NIL_P(content_val)) {
449
+ rb_raise(rb_eArgError, "content: parameter is required");
450
+ }
451
+ if (NIL_P(samples_val)) {
452
+ rb_raise(rb_eArgError, "samples: parameter is required");
453
+ }
454
+ if (NIL_P(max_size_val)) {
455
+ rb_raise(rb_eArgError, "max_size: parameter is required");
456
+ }
457
+
458
+ // Validate types early
459
+ StringValue(content_val);
460
+ Check_Type(samples_val, T_ARRAY);
461
+ size_t max_size = NUM2SIZET(max_size_val);
462
+
463
+ long num_samples = RARRAY_LEN(samples_val);
464
+ if (num_samples == 0) {
465
+ rb_raise(rb_eArgError, "samples array cannot be empty");
466
+ }
467
+
468
+ // Validate all samples are strings and calculate sizes BEFORE allocating
469
+ size_t total_samples_size = 0;
470
+ for (long i = 0; i < num_samples; i++) {
471
+ VALUE sample = rb_ary_entry(samples_val, i);
472
+ StringValue(sample); // Validate type early - may raise TypeError
473
+ total_samples_size += RSTRING_LEN(sample);
474
+ }
475
+
476
+ // Get optional parameters
477
+ VALUE compression_level_val = rb_hash_aref(options, ID2SYM(rb_intern("compression_level")));
478
+ VALUE dict_id_val = rb_hash_aref(options, ID2SYM(rb_intern("dict_id")));
479
+
480
+ // Setup ZDICT_params_t
481
+ ZDICT_params_t params;
482
+ memset(&params, 0, sizeof(params));
483
+ params.compressionLevel = NIL_P(compression_level_val) ? 0 : NUM2INT(compression_level_val);
484
+ params.dictID = NIL_P(dict_id_val) ? 0 : NUM2UINT(dict_id_val);
485
+ params.notificationLevel = 0;
486
+
487
+ // Layer 2: Allocate late - only after validation passes
488
+ dict_training_resources resources = {NULL, NULL, NULL};
489
+ resources.sample_sizes = ALLOC_N(size_t, num_samples);
490
+ resources.samples_buffer = ALLOC_N(char, total_samples_size);
491
+ resources.dict_buffer = ALLOC_N(char, max_size);
492
+
493
+ // Layer 3: Use rb_ensure for guaranteed cleanup (safety net)
494
+ // Build samples buffer - we already validated, so just copy
495
+ size_t offset = 0;
496
+ for (long i = 0; i < num_samples; i++) {
497
+ VALUE sample = rb_ary_entry(samples_val, i);
498
+ size_t sample_len = RSTRING_LEN(sample);
499
+ resources.sample_sizes[i] = sample_len;
500
+ memcpy(resources.samples_buffer + offset, RSTRING_PTR(sample), sample_len);
501
+ offset += sample_len;
502
+ }
503
+
504
+ // Finalize the dictionary
505
+ size_t dict_size = ZDICT_finalizeDictionary(
506
+ resources.dict_buffer, max_size,
507
+ RSTRING_PTR(content_val), RSTRING_LEN(content_val),
508
+ resources.samples_buffer, resources.sample_sizes, (unsigned)num_samples,
509
+ params
510
+ );
511
+
512
+ // Check for errors
513
+ if (ZDICT_isError(dict_size)) {
514
+ dict_training_cleanup((VALUE)&resources);
515
+ rb_raise(rb_eRuntimeError, "Dictionary finalization failed: %s", ZDICT_getErrorName(dict_size));
516
+ }
517
+
518
+ // Create Ruby string with the finalized dictionary
519
+ VALUE dict_string = rb_str_new(resources.dict_buffer, dict_size);
520
+
521
+ // Clean up all resources
522
+ dict_training_cleanup((VALUE)&resources);
523
+
524
+ return dict_string;
525
+ }
526
+
527
+ // Get dictionary header size - module-level utility
528
+ // VibeZstd.dict_header_size(dict_data)
529
+ static VALUE
530
+ vibe_zstd_dict_header_size(VALUE self, VALUE dict_data) {
531
+ StringValue(dict_data);
532
+ size_t header_size = ZDICT_getDictHeaderSize(RSTRING_PTR(dict_data), RSTRING_LEN(dict_data));
533
+
534
+ // Check for errors
535
+ if (ZDICT_isError(header_size)) {
536
+ rb_raise(rb_eRuntimeError, "Failed to get dictionary header size: %s", ZDICT_getErrorName(header_size));
537
+ }
538
+
539
+ return SIZET2NUM(header_size);
540
+ }
541
+
542
+ // CDict.estimate_memory(dict_size, level)
543
+ static VALUE
544
+ vibe_zstd_cdict_estimate_memory(VALUE self, VALUE dict_size, VALUE level) {
545
+ size_t size = NUM2SIZET(dict_size);
546
+ int lvl = NUM2INT(level);
547
+ size_t estimate = ZSTD_estimateCDictSize(size, lvl);
548
+ return SIZET2NUM(estimate);
549
+ }
550
+
551
+ // DDict.estimate_memory(dict_size)
552
+ static VALUE
553
+ vibe_zstd_ddict_estimate_memory(VALUE self, VALUE dict_size) {
554
+ size_t size = NUM2SIZET(dict_size);
555
+ size_t estimate = ZSTD_estimateDDictSize(size, ZSTD_dlm_byCopy);
556
+ return SIZET2NUM(estimate);
557
+ }
558
+
559
+ // Class initialization functions called from main Init_vibe_zstd
560
+ void
561
+ vibe_zstd_dict_init_classes(VALUE rb_cVibeZstdCDict, VALUE rb_cVibeZstdDDict) {
562
+ // CDict class setup
563
+ rb_define_alloc_func(rb_cVibeZstdCDict, vibe_zstd_cdict_alloc);
564
+ rb_define_method(rb_cVibeZstdCDict, "initialize", vibe_zstd_cdict_initialize, -1);
565
+ rb_define_method(rb_cVibeZstdCDict, "size", vibe_zstd_cdict_size, 0);
566
+ rb_define_method(rb_cVibeZstdCDict, "dict_id", vibe_zstd_cdict_dict_id, 0);
567
+ rb_define_singleton_method(rb_cVibeZstdCDict, "estimate_memory", vibe_zstd_cdict_estimate_memory, 2);
568
+
569
+ // DDict class setup
570
+ rb_define_alloc_func(rb_cVibeZstdDDict, vibe_zstd_ddict_alloc);
571
+ rb_define_method(rb_cVibeZstdDDict, "initialize", vibe_zstd_ddict_initialize, 1);
572
+ rb_define_method(rb_cVibeZstdDDict, "size", vibe_zstd_ddict_size, 0);
573
+ rb_define_method(rb_cVibeZstdDDict, "dict_id", vibe_zstd_ddict_dict_id, 0);
574
+ rb_define_singleton_method(rb_cVibeZstdDDict, "estimate_memory", vibe_zstd_ddict_estimate_memory, 1);
575
+ }
576
+
577
+ void
578
+ vibe_zstd_dict_init_module_methods(VALUE rb_mVibeZstd) {
579
+ // Module-level dictionary methods
580
+ rb_define_module_function(rb_mVibeZstd, "train_dict", vibe_zstd_train_dict, -1);
581
+ rb_define_module_function(rb_mVibeZstd, "train_dict_cover", vibe_zstd_train_dict_cover, -1);
582
+ rb_define_module_function(rb_mVibeZstd, "train_dict_fast_cover", vibe_zstd_train_dict_fast_cover, -1);
583
+ rb_define_module_function(rb_mVibeZstd, "get_dict_id", vibe_zstd_get_dict_id, 1);
584
+ rb_define_module_function(rb_mVibeZstd, "get_dict_id_from_frame", vibe_zstd_get_dict_id_from_frame, 1);
585
+ rb_define_module_function(rb_mVibeZstd, "finalize_dictionary", vibe_zstd_finalize_dictionary, -1);
586
+ rb_define_module_function(rb_mVibeZstd, "dict_header_size", vibe_zstd_dict_header_size, 1);
587
+ }
@@ -0,0 +1,52 @@
1
# frozen_string_literal: true

# Build configuration for the vibe_zstd native extension.
# Compiles the extension entry point together with the vendored zstd sources
# found under ext/vibe_zstd/libzstd.

require "mkmf"

# Use vendored zstd library
LIBZSTD_DIR = File.expand_path("libzstd", __dir__)

# Add include paths for vendored zstd headers
# standard:disable Style/GlobalVars
$INCFLAGS << " -I#{LIBZSTD_DIR}"
$INCFLAGS << " -I#{LIBZSTD_DIR}/common"
$INCFLAGS << " -I#{LIBZSTD_DIR}/compress"
$INCFLAGS << " -I#{LIBZSTD_DIR}/decompress"
$INCFLAGS << " -I#{LIBZSTD_DIR}/dictBuilder"
# standard:enable Style/GlobalVars

# Add preprocessor definitions
append_cflags("-DXXH_NAMESPACE=ZSTD_")
append_cflags("-DZSTD_LEGACY_SUPPORT=0") # Disable legacy support to reduce size
append_cflags("-DZSTD_MULTITHREAD") # Enable multithreading support

# Only the *.c files are compiled (see zstd_sources below), so
# decompress/huf_decompress_amd64.S is never built. Without this define the C
# Huffman decoder still references the assembly routines on x86-64 with
# GCC/Clang, which leaves undefined symbols in the extension. Force the
# portable C implementation instead.
append_cflags("-DZSTD_DISABLE_ASM")

# Link with pthread for multithreading
have_library("pthread") || abort("pthread library is required for multithreading support")

# Makes all symbols private by default to avoid unintended conflict
# with other gems. To explicitly export symbols you can use RUBY_FUNC_EXPORTED
# selectively, or entirely remove this flag.
append_cflags("-fvisibility=hidden")

# Gather all vendored zstd source files
zstd_sources = Dir[
  "#{LIBZSTD_DIR}/common/*.c",
  "#{LIBZSTD_DIR}/compress/*.c",
  "#{LIBZSTD_DIR}/decompress/*.c",
  "#{LIBZSTD_DIR}/dictBuilder/*.c",
  "#{LIBZSTD_DIR}/deprecated/*.c"
].map { |path| File.basename(path) }

# Add the main vibe_zstd.c file (which includes the split files via #include)
# standard:disable Style/GlobalVars
$srcs = ["vibe_zstd.c"] + zstd_sources

# Set vpath to find source files in subdirectories
$VPATH ||= []
$VPATH << "$(srcdir)/libzstd/common"
$VPATH << "$(srcdir)/libzstd/compress"
$VPATH << "$(srcdir)/libzstd/decompress"
$VPATH << "$(srcdir)/libzstd/dictBuilder"
$VPATH << "$(srcdir)/libzstd/deprecated"
# standard:enable Style/GlobalVars

create_makefile("vibe_zstd/vibe_zstd")