RubyGems - vibe_zstd - Versions diffs - 1.0.0 - Mend

vibe_zstd 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (109) hide show

checksums.yaml +7 -0
data/.standard.yml +3 -0
data/CHANGELOG.md +22 -0
data/LICENSE.txt +21 -0
data/README.md +978 -0
data/Rakefile +20 -0
data/benchmark/README.md +198 -0
data/benchmark/compression_levels.rb +99 -0
data/benchmark/context_reuse.rb +174 -0
data/benchmark/decompression_speed_by_level.rb +65 -0
data/benchmark/dictionary_training.rb +182 -0
data/benchmark/dictionary_usage.rb +121 -0
data/benchmark/for_readme.rb +157 -0
data/benchmark/generate_fixture.rb +82 -0
data/benchmark/helpers.rb +237 -0
data/benchmark/multithreading.rb +105 -0
data/benchmark/run_all.rb +150 -0
data/benchmark/streaming.rb +154 -0
data/ext/vibe_zstd/Makefile +270 -0
data/ext/vibe_zstd/cctx.c +565 -0
data/ext/vibe_zstd/dctx.c +493 -0
data/ext/vibe_zstd/dict.c +587 -0
data/ext/vibe_zstd/extconf.rb +52 -0
data/ext/vibe_zstd/frames.c +132 -0
data/ext/vibe_zstd/libzstd/LICENSE +30 -0
data/ext/vibe_zstd/libzstd/common/allocations.h +55 -0
data/ext/vibe_zstd/libzstd/common/bits.h +205 -0
data/ext/vibe_zstd/libzstd/common/bitstream.h +454 -0
data/ext/vibe_zstd/libzstd/common/compiler.h +464 -0
data/ext/vibe_zstd/libzstd/common/cpu.h +249 -0
data/ext/vibe_zstd/libzstd/common/debug.c +30 -0
data/ext/vibe_zstd/libzstd/common/debug.h +107 -0
data/ext/vibe_zstd/libzstd/common/entropy_common.c +340 -0
data/ext/vibe_zstd/libzstd/common/error_private.c +64 -0
data/ext/vibe_zstd/libzstd/common/error_private.h +158 -0
data/ext/vibe_zstd/libzstd/common/fse.h +625 -0
data/ext/vibe_zstd/libzstd/common/fse_decompress.c +315 -0
data/ext/vibe_zstd/libzstd/common/huf.h +277 -0
data/ext/vibe_zstd/libzstd/common/mem.h +422 -0
data/ext/vibe_zstd/libzstd/common/pool.c +371 -0
data/ext/vibe_zstd/libzstd/common/pool.h +81 -0
data/ext/vibe_zstd/libzstd/common/portability_macros.h +171 -0
data/ext/vibe_zstd/libzstd/common/threading.c +182 -0
data/ext/vibe_zstd/libzstd/common/threading.h +142 -0
data/ext/vibe_zstd/libzstd/common/xxhash.c +18 -0
data/ext/vibe_zstd/libzstd/common/xxhash.h +7094 -0
data/ext/vibe_zstd/libzstd/common/zstd_common.c +48 -0
data/ext/vibe_zstd/libzstd/common/zstd_deps.h +123 -0
data/ext/vibe_zstd/libzstd/common/zstd_internal.h +324 -0
data/ext/vibe_zstd/libzstd/common/zstd_trace.h +156 -0
data/ext/vibe_zstd/libzstd/compress/clevels.h +134 -0
data/ext/vibe_zstd/libzstd/compress/fse_compress.c +625 -0
data/ext/vibe_zstd/libzstd/compress/hist.c +191 -0
data/ext/vibe_zstd/libzstd/compress/hist.h +82 -0
data/ext/vibe_zstd/libzstd/compress/huf_compress.c +1464 -0
data/ext/vibe_zstd/libzstd/compress/zstd_compress.c +7843 -0
data/ext/vibe_zstd/libzstd/compress/zstd_compress_internal.h +1636 -0
data/ext/vibe_zstd/libzstd/compress/zstd_compress_literals.c +235 -0
data/ext/vibe_zstd/libzstd/compress/zstd_compress_literals.h +39 -0
data/ext/vibe_zstd/libzstd/compress/zstd_compress_sequences.c +442 -0
data/ext/vibe_zstd/libzstd/compress/zstd_compress_sequences.h +55 -0
data/ext/vibe_zstd/libzstd/compress/zstd_compress_superblock.c +688 -0
data/ext/vibe_zstd/libzstd/compress/zstd_compress_superblock.h +32 -0
data/ext/vibe_zstd/libzstd/compress/zstd_cwksp.h +765 -0
data/ext/vibe_zstd/libzstd/compress/zstd_double_fast.c +778 -0
data/ext/vibe_zstd/libzstd/compress/zstd_double_fast.h +42 -0
data/ext/vibe_zstd/libzstd/compress/zstd_fast.c +985 -0
data/ext/vibe_zstd/libzstd/compress/zstd_fast.h +30 -0
data/ext/vibe_zstd/libzstd/compress/zstd_lazy.c +2199 -0
data/ext/vibe_zstd/libzstd/compress/zstd_lazy.h +193 -0
data/ext/vibe_zstd/libzstd/compress/zstd_ldm.c +745 -0
data/ext/vibe_zstd/libzstd/compress/zstd_ldm.h +109 -0
data/ext/vibe_zstd/libzstd/compress/zstd_ldm_geartab.h +106 -0
data/ext/vibe_zstd/libzstd/compress/zstd_opt.c +1580 -0
data/ext/vibe_zstd/libzstd/compress/zstd_opt.h +72 -0
data/ext/vibe_zstd/libzstd/compress/zstd_preSplit.c +238 -0
data/ext/vibe_zstd/libzstd/compress/zstd_preSplit.h +33 -0
data/ext/vibe_zstd/libzstd/compress/zstdmt_compress.c +1923 -0
data/ext/vibe_zstd/libzstd/compress/zstdmt_compress.h +102 -0
data/ext/vibe_zstd/libzstd/decompress/huf_decompress.c +1944 -0
data/ext/vibe_zstd/libzstd/decompress/huf_decompress_amd64.S +602 -0
data/ext/vibe_zstd/libzstd/decompress/zstd_ddict.c +244 -0
data/ext/vibe_zstd/libzstd/decompress/zstd_ddict.h +44 -0
data/ext/vibe_zstd/libzstd/decompress/zstd_decompress.c +2410 -0
data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_block.c +2209 -0
data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_block.h +73 -0
data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_internal.h +240 -0
data/ext/vibe_zstd/libzstd/deprecated/zbuff.h +214 -0
data/ext/vibe_zstd/libzstd/deprecated/zbuff_common.c +26 -0
data/ext/vibe_zstd/libzstd/deprecated/zbuff_compress.c +167 -0
data/ext/vibe_zstd/libzstd/deprecated/zbuff_decompress.c +77 -0
data/ext/vibe_zstd/libzstd/dictBuilder/cover.c +1302 -0
data/ext/vibe_zstd/libzstd/dictBuilder/cover.h +152 -0
data/ext/vibe_zstd/libzstd/dictBuilder/divsufsort.c +1913 -0
data/ext/vibe_zstd/libzstd/dictBuilder/divsufsort.h +57 -0
data/ext/vibe_zstd/libzstd/dictBuilder/fastcover.c +766 -0
data/ext/vibe_zstd/libzstd/dictBuilder/zdict.c +1133 -0
data/ext/vibe_zstd/libzstd/zdict.h +481 -0
data/ext/vibe_zstd/libzstd/zstd.h +3198 -0
data/ext/vibe_zstd/libzstd/zstd_errors.h +107 -0
data/ext/vibe_zstd/streaming.c +410 -0
data/ext/vibe_zstd/vibe_zstd.c +293 -0
data/ext/vibe_zstd/vibe_zstd.h +56 -0
data/ext/vibe_zstd/vibe_zstd_internal.h +27 -0
data/lib/vibe_zstd/constants.rb +67 -0
data/lib/vibe_zstd/version.rb +5 -0
data/lib/vibe_zstd.rb +255 -0
data/sig/vibe_zstd.rbs +76 -0
metadata +179 -0

data/ext/vibe_zstd/dict.c ADDED Viewed

@@ -0,0 +1,587 @@
+// Dictionary implementation for VibeZstd
+#include "vibe_zstd_internal.h"
+// Forward declarations
+static VALUE vibe_zstd_cdict_initialize(int argc, VALUE* argv, VALUE self);
+static VALUE vibe_zstd_cdict_size(VALUE self);
+static VALUE vibe_zstd_cdict_dict_id(VALUE self);
+static VALUE vibe_zstd_cdict_estimate_memory(VALUE self, VALUE dict_size, VALUE level);
+static VALUE vibe_zstd_ddict_initialize(VALUE self, VALUE dict_data);
+static VALUE vibe_zstd_ddict_size(VALUE self);
+static VALUE vibe_zstd_ddict_dict_id(VALUE self);
+static VALUE vibe_zstd_ddict_estimate_memory(VALUE self, VALUE dict_size);
+static VALUE vibe_zstd_train_dict(int argc, VALUE* argv, VALUE self);
+static VALUE vibe_zstd_train_dict_cover(int argc, VALUE* argv, VALUE self);
+static VALUE vibe_zstd_train_dict_fast_cover(int argc, VALUE* argv, VALUE self);
+static VALUE vibe_zstd_get_dict_id(VALUE self, VALUE dict_data);
+static VALUE vibe_zstd_get_dict_id_from_frame(VALUE self, VALUE data);
+static VALUE vibe_zstd_finalize_dictionary(int argc, VALUE* argv, VALUE self);
+static VALUE vibe_zstd_dict_header_size(VALUE self, VALUE dict_data);
+// TypedData types - defined in vibe_zstd.c
+extern rb_data_type_t vibe_zstd_cdict_type;
+extern rb_data_type_t vibe_zstd_ddict_type;
+// CDict initialize method
+// CDict.new(dict_data, level = nil)
+//
+// Builds a ZSTD_CDict from dict_data (converted with StringValue) at the given
+// compression level, or at ZSTD_defaultCLevel() when level is nil/omitted.
+// The dictionary string and the effective level are kept in the
+// @dict_data / @compression_level instance variables.
+//
+// Raises RuntimeError if the object was already initialized or if zstd
+// fails to create the dictionary.
+static VALUE
+vibe_zstd_cdict_initialize(int argc, VALUE* argv, VALUE self) {
+    VALUE dict_data, level = Qnil;
+    rb_scan_args(argc, argv, "11", &dict_data, &level);
+    vibe_zstd_cdict* cdict;
+    TypedData_Get_Struct(self, vibe_zstd_cdict, &vibe_zstd_cdict_type, cdict);
+    // A second initialize call would overwrite the pointer below and leak the
+    // ZSTD_CDict created by the first call, so refuse to re-initialize
+    if (cdict->cdict) {
+        rb_raise(rb_eRuntimeError, "CDict already initialized");
+    }
+    StringValue(dict_data);
+    int lvl = NIL_P(level) ? ZSTD_defaultCLevel() : NUM2INT(level);
+    cdict->cdict = ZSTD_createCDict(RSTRING_PTR(dict_data), RSTRING_LEN(dict_data), lvl);
+    if (!cdict->cdict) {
+        rb_raise(rb_eRuntimeError, "Failed to create ZSTD_CDict");
+    }
+    // Store dictionary data and level for later retrieval
+    rb_ivar_set(self, rb_intern("@dict_data"), dict_data);
+    rb_ivar_set(self, rb_intern("@compression_level"), INT2NUM(lvl));
+    return self;
+}
+// CDict#size
+// Reports how many bytes the underlying ZSTD_CDict occupies in memory
+// (as given by ZSTD_sizeof_CDict). Raises RuntimeError when the wrapper
+// holds no dictionary yet.
+static VALUE
+vibe_zstd_cdict_size(VALUE self) {
+    vibe_zstd_cdict* wrapper;
+    TypedData_Get_Struct(self, vibe_zstd_cdict, &vibe_zstd_cdict_type, wrapper);
+    if (wrapper->cdict == NULL) {
+        rb_raise(rb_eRuntimeError, "CDict not initialized");
+    }
+    return SIZET2NUM(ZSTD_sizeof_CDict(wrapper->cdict));
+}
+// CDict#dict_id
+// Returns the dictionary ID that ZSTD_getDictID_fromCDict reports for the
+// wrapped dictionary. Raises RuntimeError when the wrapper holds no
+// dictionary yet.
+static VALUE
+vibe_zstd_cdict_dict_id(VALUE self) {
+    vibe_zstd_cdict* wrapper;
+    TypedData_Get_Struct(self, vibe_zstd_cdict, &vibe_zstd_cdict_type, wrapper);
+    if (wrapper->cdict == NULL) {
+        rb_raise(rb_eRuntimeError, "CDict not initialized");
+    }
+    return UINT2NUM(ZSTD_getDictID_fromCDict(wrapper->cdict));
+}
+// DDict initialize method
+// DDict.new(dict_data)
+//
+// Builds a ZSTD_DDict from dict_data (converted with StringValue).
+//
+// Raises RuntimeError if the object was already initialized or if zstd
+// fails to create the dictionary.
+static VALUE
+vibe_zstd_ddict_initialize(VALUE self, VALUE dict_data) {
+    vibe_zstd_ddict* ddict;
+    TypedData_Get_Struct(self, vibe_zstd_ddict, &vibe_zstd_ddict_type, ddict);
+    // A second initialize call would overwrite the pointer below and leak the
+    // ZSTD_DDict created by the first call, so refuse to re-initialize
+    if (ddict->ddict) {
+        rb_raise(rb_eRuntimeError, "DDict already initialized");
+    }
+    StringValue(dict_data);
+    ddict->ddict = ZSTD_createDDict(RSTRING_PTR(dict_data), RSTRING_LEN(dict_data));
+    if (!ddict->ddict) {
+        rb_raise(rb_eRuntimeError, "Failed to create ZSTD_DDict");
+    }
+    return self;
+}
+// DDict#size
+// Reports how many bytes the underlying ZSTD_DDict occupies in memory
+// (as given by ZSTD_sizeof_DDict). Raises RuntimeError when the wrapper
+// holds no dictionary yet.
+static VALUE
+vibe_zstd_ddict_size(VALUE self) {
+    vibe_zstd_ddict* wrapper;
+    TypedData_Get_Struct(self, vibe_zstd_ddict, &vibe_zstd_ddict_type, wrapper);
+    if (wrapper->ddict == NULL) {
+        rb_raise(rb_eRuntimeError, "DDict not initialized");
+    }
+    return SIZET2NUM(ZSTD_sizeof_DDict(wrapper->ddict));
+}
+// DDict#dict_id
+// Returns the dictionary ID that ZSTD_getDictID_fromDDict reports for the
+// wrapped dictionary. Raises RuntimeError when the wrapper holds no
+// dictionary yet.
+static VALUE
+vibe_zstd_ddict_dict_id(VALUE self) {
+    vibe_zstd_ddict* wrapper;
+    TypedData_Get_Struct(self, vibe_zstd_ddict, &vibe_zstd_ddict_type, wrapper);
+    if (wrapper->ddict == NULL) {
+        rb_raise(rb_eRuntimeError, "DDict not initialized");
+    }
+    return UINT2NUM(ZSTD_getDictID_fromDDict(wrapper->ddict));
+}
+// Cleanup structure for dictionary training operations
+// Groups all native buffers used by one training/finalization call so they can
+// be released together by dict_training_cleanup.
+typedef struct {
+    size_t* sample_sizes;    // length of each sample, in sample order
+    char* samples_buffer;    // all samples concatenated back to back
+    void* dict_buffer;       // output buffer the dictionary is written into
+} dict_training_resources;
+// Selects which ZDICT entry point a dict_job invokes
+typedef enum {
+    DICT_JOB_TRAIN,
+    DICT_JOB_COVER,
+    DICT_JOB_FAST_COVER,
+    DICT_JOB_FINALIZE
+} dict_job_kind;
+// Everything one dictionary-building call needs. The struct lives on the C
+// stack of the entry point, which keeps the VALUE members reachable for the
+// garbage collector while the job runs.
+typedef struct {
+    dict_training_resources resources;  // native buffers, freed by dict_training_cleanup
+    dict_job_kind kind;                 // which ZDICT function to call
+    VALUE samples;                      // private Array of frozen String snapshots
+    VALUE content;                      // frozen dictionary content (DICT_JOB_FINALIZE only)
+    size_t capacity;                    // size of the output dictionary buffer
+    const char* failure;                // prefix of the RuntimeError message on ZDICT failure
+    union {
+        ZDICT_cover_params_t cover;
+        ZDICT_fastCover_params_t fast_cover;
+        ZDICT_params_t finalize;
+    } params;                           // parameters for the selected ZDICT function
+} dict_job;
+// Cleanup function for dictionary training resources
+// Frees every buffer that was allocated and resets the pointers, so partial
+// allocations are handled and calling it twice is harmless.
+// Has the VALUE(VALUE) shape required for the ensure callback of rb_ensure.
+static VALUE
+dict_training_cleanup(VALUE arg) {
+    dict_training_resources* resources = (dict_training_resources*)arg;
+    if (resources->sample_sizes) xfree(resources->sample_sizes);
+    if (resources->samples_buffer) xfree(resources->samples_buffer);
+    if (resources->dict_buffer) xfree(resources->dict_buffer);
+    resources->sample_sizes = NULL;
+    resources->samples_buffer = NULL;
+    resources->dict_buffer = NULL;
+    return Qnil;
+}
+// Looks up the keyword `name` in the options hash; nil when no options were given
+static VALUE
+dict_job_option(VALUE options, const char* name) {
+    if (NIL_P(options)) return Qnil;
+    return rb_hash_aref(options, ID2SYM(rb_intern(name)));
+}
+// Stores the unsigned keyword `name` into *target when it is present and non-nil
+static void
+dict_job_uint_option(VALUE options, const char* name, unsigned* target) {
+    VALUE v = dict_job_option(options, name);
+    if (!NIL_P(v)) *target = NUM2UINT(v);
+}
+// Reads max_dict_size:, falling back to 112KB (zstd default)
+static size_t
+dict_job_max_size(VALUE options) {
+    VALUE requested = dict_job_option(options, "max_dict_size");
+    return NIL_P(requested) ? (size_t)(112 * 1024) : NUM2SIZET(requested);
+}
+// Validates the samples argument and returns a private Array of frozen Strings.
+//
+// StringValue may call to_str, which runs arbitrary Ruby code and only updates
+// the local variable, not the caller's array. Copying each converted, frozen
+// string into our own array guarantees that the later sizing and copy passes
+// see real Strings whose lengths can no longer change.
+//
+// Raises TypeError for a non-Array or non-String-like sample, ArgumentError
+// for an empty array or a count that does not fit the unsigned ZDICT argument.
+static VALUE
+dict_job_snapshot_samples(VALUE samples) {
+    Check_Type(samples, T_ARRAY);
+    long num_samples = RARRAY_LEN(samples);
+    if (num_samples == 0) {
+        rb_raise(rb_eArgError, "samples array cannot be empty");
+    }
+    // ZDICT takes the sample count as `unsigned`; a round trip through that
+    // type detects counts that would be silently truncated
+    if ((long)(unsigned)num_samples != num_samples) {
+        rb_raise(rb_eArgError, "too many samples");
+    }
+    VALUE snapshot = rb_ary_new_capa(num_samples);
+    for (long i = 0; i < num_samples; i++) {
+        VALUE sample = rb_ary_entry(samples, i);
+        StringValue(sample);  // may raise TypeError
+        rb_ary_push(snapshot, rb_str_new_frozen(sample));
+    }
+    return snapshot;
+}
+// Body of a dictionary job, run under rb_ensure by dict_job_execute.
+// Allocates the native buffers, concatenates the samples, calls the selected
+// ZDICT function and returns the resulting dictionary as a new Ruby String.
+// May raise (allocation failure, ZDICT error); the buffers are released by the
+// ensure callback in every case, so nothing is freed here.
+static VALUE
+dict_job_run(VALUE arg) {
+    dict_job* job = (dict_job*)arg;
+    dict_training_resources* res = &job->resources;
+    long num_samples = RARRAY_LEN(job->samples);
+    // Sum the sample sizes, refusing totals that would wrap size_t
+    size_t total_samples_size = 0;
+    for (long i = 0; i < num_samples; i++) {
+        size_t sample_len = (size_t)RSTRING_LEN(rb_ary_entry(job->samples, i));
+        if (sample_len > (size_t)-1 - total_samples_size) {
+            rb_raise(rb_eArgError, "samples are too large");
+        }
+        total_samples_size += sample_len;
+    }
+    res->sample_sizes = ALLOC_N(size_t, num_samples);
+    res->samples_buffer = ALLOC_N(char, total_samples_size);
+    res->dict_buffer = ALLOC_N(char, job->capacity);
+    // Build the flat samples buffer expected by ZDICT
+    size_t offset = 0;
+    for (long i = 0; i < num_samples; i++) {
+        VALUE sample = rb_ary_entry(job->samples, i);
+        size_t sample_len = (size_t)RSTRING_LEN(sample);
+        res->sample_sizes[i] = sample_len;
+        memcpy(res->samples_buffer + offset, RSTRING_PTR(sample), sample_len);
+        offset += sample_len;
+    }
+    size_t dict_size;
+    switch (job->kind) {
+    case DICT_JOB_COVER:
+        dict_size = ZDICT_trainFromBuffer_cover(
+            res->dict_buffer, job->capacity,
+            res->samples_buffer, res->sample_sizes, (unsigned)num_samples,
+            job->params.cover
+        );
+        break;
+    case DICT_JOB_FAST_COVER:
+        dict_size = ZDICT_trainFromBuffer_fastCover(
+            res->dict_buffer, job->capacity,
+            res->samples_buffer, res->sample_sizes, (unsigned)num_samples,
+            job->params.fast_cover
+        );
+        break;
+    case DICT_JOB_FINALIZE:
+        dict_size = ZDICT_finalizeDictionary(
+            res->dict_buffer, job->capacity,
+            RSTRING_PTR(job->content), RSTRING_LEN(job->content),
+            res->samples_buffer, res->sample_sizes, (unsigned)num_samples,
+            job->params.finalize
+        );
+        break;
+    case DICT_JOB_TRAIN:
+    default:
+        dict_size = ZDICT_trainFromBuffer(
+            res->dict_buffer, job->capacity,
+            res->samples_buffer, res->sample_sizes, (unsigned)num_samples
+        );
+        break;
+    }
+    if (ZDICT_isError(dict_size)) {
+        rb_raise(rb_eRuntimeError, "%s: %s", job->failure, ZDICT_getErrorName(dict_size));
+    }
+    return rb_str_new(res->dict_buffer, dict_size);
+}
+// Runs a prepared job; rb_ensure guarantees the native buffers are freed
+// whether dict_job_run returns normally or raises
+static VALUE
+dict_job_execute(dict_job* job) {
+    return rb_ensure(dict_job_run, (VALUE)job, dict_training_cleanup, (VALUE)&job->resources);
+}
+// Train dictionary from samples - module-level method
+// VibeZstd.train_dict(samples, max_dict_size: 112640)
+//
+// Returns the trained dictionary as a String.
+// Raises TypeError/ArgumentError for invalid samples and RuntimeError when
+// ZDICT reports a training failure.
+//
+// Memory usage: Allocates memory equal to sum of all sample sizes plus max_dict_size.
+// For large datasets, consider training on a representative subset to reduce memory footprint.
+static VALUE
+vibe_zstd_train_dict(int argc, VALUE* argv, VALUE self) {
+    VALUE samples, options;
+    rb_scan_args(argc, argv, "1:", &samples, &options);
+    dict_job job;
+    memset(&job, 0, sizeof(job));
+    job.kind = DICT_JOB_TRAIN;
+    job.failure = "Dictionary training failed";
+    // Validate inputs before parsing options or allocating (fail-fast)
+    job.samples = dict_job_snapshot_samples(samples);
+    job.capacity = dict_job_max_size(options);
+    return dict_job_execute(&job);
+}
+// VibeZstd.train_dict_cover(samples, max_dict_size: 112640, k: 0, d: 0, steps: 0, split_point: 1.0, shrink_dict: false, shrink_dict_max_regression: 0, nb_threads: 0)
+//
+// Trains a dictionary with the COVER algorithm and returns it as a String.
+// Raises TypeError/ArgumentError for invalid samples and RuntimeError when
+// ZDICT reports a training failure.
+//
+// Memory usage: Allocates memory equal to sum of all sample sizes plus max_dict_size.
+// For large datasets, consider training on a representative subset to reduce memory footprint.
+static VALUE
+vibe_zstd_train_dict_cover(int argc, VALUE* argv, VALUE self) {
+    VALUE samples, options;
+    rb_scan_args(argc, argv, "1:", &samples, &options);
+    dict_job job;
+    memset(&job, 0, sizeof(job));
+    job.kind = DICT_JOB_COVER;
+    job.failure = "Dictionary training failed";
+    // Validate inputs before parsing options or allocating (fail-fast)
+    job.samples = dict_job_snapshot_samples(samples);
+    // COVER parameters: everything zero except the split point
+    ZDICT_cover_params_t* params = &job.params.cover;
+    params->splitPoint = 1.0;  // Default split point
+    VALUE v;
+    dict_job_uint_option(options, "k", &params->k);
+    dict_job_uint_option(options, "d", &params->d);
+    dict_job_uint_option(options, "steps", &params->steps);
+    v = dict_job_option(options, "split_point");
+    if (!NIL_P(v)) params->splitPoint = NUM2DBL(v);
+    v = dict_job_option(options, "shrink_dict");
+    if (!NIL_P(v)) params->shrinkDict = RTEST(v) ? 1 : 0;
+    dict_job_uint_option(options, "shrink_dict_max_regression", &params->shrinkDictMaxRegression);
+    dict_job_uint_option(options, "nb_threads", &params->nbThreads);
+    job.capacity = dict_job_max_size(options);
+    params->zParams.compressionLevel = 0;  // Use default compression level
+    return dict_job_execute(&job);
+}
+// VibeZstd.train_dict_fast_cover(samples, max_dict_size: 112640, k: 0, d: 0, f: 0, split_point: 1.0, accel: 0, shrink_dict: false, shrink_dict_max_regression: 0, nb_threads: 0)
+//
+// Trains a dictionary with the fast COVER algorithm and returns it as a String.
+// Raises TypeError/ArgumentError for invalid samples and RuntimeError when
+// ZDICT reports a training failure.
+//
+// Memory usage: Allocates memory equal to sum of all sample sizes plus max_dict_size.
+// For large datasets, consider training on a representative subset to reduce memory footprint.
+static VALUE
+vibe_zstd_train_dict_fast_cover(int argc, VALUE* argv, VALUE self) {
+    VALUE samples, options;
+    rb_scan_args(argc, argv, "1:", &samples, &options);
+    dict_job job;
+    memset(&job, 0, sizeof(job));
+    job.kind = DICT_JOB_FAST_COVER;
+    job.failure = "Dictionary training failed";
+    // Validate inputs before parsing options or allocating (fail-fast)
+    job.samples = dict_job_snapshot_samples(samples);
+    // fast COVER parameters: everything zero except the split point
+    ZDICT_fastCover_params_t* params = &job.params.fast_cover;
+    params->splitPoint = 1.0;  // Default split point
+    VALUE v;
+    dict_job_uint_option(options, "k", &params->k);
+    dict_job_uint_option(options, "d", &params->d);
+    dict_job_uint_option(options, "f", &params->f);
+    v = dict_job_option(options, "split_point");
+    if (!NIL_P(v)) params->splitPoint = NUM2DBL(v);
+    dict_job_uint_option(options, "accel", &params->accel);
+    v = dict_job_option(options, "shrink_dict");
+    if (!NIL_P(v)) params->shrinkDict = RTEST(v) ? 1 : 0;
+    dict_job_uint_option(options, "shrink_dict_max_regression", &params->shrinkDictMaxRegression);
+    dict_job_uint_option(options, "nb_threads", &params->nbThreads);
+    job.capacity = dict_job_max_size(options);
+    params->zParams.compressionLevel = 0;  // Use default compression level
+    return dict_job_execute(&job);
+}
+// Get dictionary ID from raw dictionary data - module-level utility
+// VibeZstd.get_dict_id(dict_data)
+// Returns whatever ZDICT_getDictID reports for the given bytes, as an Integer.
+static VALUE
+vibe_zstd_get_dict_id(VALUE self, VALUE dict_data) {
+    StringValue(dict_data);
+    unsigned dict_id = ZDICT_getDictID(RSTRING_PTR(dict_data), RSTRING_LEN(dict_data));
+    return UINT2NUM(dict_id);
+}
+// Get dictionary ID from compressed frame - module-level utility
+// VibeZstd.get_dict_id_from_frame(data)
+// Returns whatever ZSTD_getDictID_fromFrame reports for the given bytes, as an Integer.
+static VALUE
+vibe_zstd_get_dict_id_from_frame(VALUE self, VALUE data) {
+    StringValue(data);
+    unsigned dict_id = ZSTD_getDictID_fromFrame(RSTRING_PTR(data), RSTRING_LEN(data));
+    return UINT2NUM(dict_id);
+}
+// Finalize raw content into zstd dictionary - module-level utility
+// VibeZstd.finalize_dictionary(content:, samples:, max_size:, compression_level: nil, dict_id: nil)
+//
+// Returns the finalized dictionary as a String.
+// Raises ArgumentError when a required keyword is missing or samples is empty,
+// TypeError for wrongly typed arguments and RuntimeError when ZDICT reports a
+// finalization failure.
+//
+// Memory usage: Allocates memory equal to sum of all sample sizes plus max_size.
+// For large datasets, consider using a representative subset of samples.
+static VALUE
+vibe_zstd_finalize_dictionary(int argc, VALUE* argv, VALUE self) {
+    VALUE options;
+    rb_scan_args(argc, argv, ":", &options);
+    // Validate inputs BEFORE any allocation (fail-fast)
+    if (NIL_P(options)) {
+        rb_raise(rb_eArgError, "finalize_dictionary requires keyword arguments");
+    }
+    // Get required parameters
+    VALUE content_val = rb_hash_aref(options, ID2SYM(rb_intern("content")));
+    VALUE samples_val = rb_hash_aref(options, ID2SYM(rb_intern("samples")));
+    VALUE max_size_val = rb_hash_aref(options, ID2SYM(rb_intern("max_size")));
+    if (NIL_P(content_val)) {
+        rb_raise(rb_eArgError, "content: parameter is required");
+    }
+    if (NIL_P(samples_val)) {
+        rb_raise(rb_eArgError, "samples: parameter is required");
+    }
+    if (NIL_P(max_size_val)) {
+        rb_raise(rb_eArgError, "max_size: parameter is required");
+    }
+    dict_job job;
+    memset(&job, 0, sizeof(job));
+    job.kind = DICT_JOB_FINALIZE;
+    job.failure = "Dictionary finalization failed";
+    // Validate types early
+    StringValue(content_val);
+    // Frozen snapshot: later conversions may run Ruby code that mutates the caller's string
+    job.content = rb_str_new_frozen(content_val);
+    Check_Type(samples_val, T_ARRAY);
+    job.capacity = NUM2SIZET(max_size_val);
+    job.samples = dict_job_snapshot_samples(samples_val);
+    // Get optional parameters
+    VALUE compression_level_val = rb_hash_aref(options, ID2SYM(rb_intern("compression_level")));
+    VALUE dict_id_val = rb_hash_aref(options, ID2SYM(rb_intern("dict_id")));
+    // Setup ZDICT_params_t (already zeroed by the memset above)
+    ZDICT_params_t* params = &job.params.finalize;
+    params->compressionLevel = NIL_P(compression_level_val) ? 0 : NUM2INT(compression_level_val);
+    params->dictID = NIL_P(dict_id_val) ? 0 : NUM2UINT(dict_id_val);
+    params->notificationLevel = 0;
+    return dict_job_execute(&job);
+}
+// VibeZstd.dict_header_size(dict_data) - module-level utility
+// Asks ZDICT_getDictHeaderSize for the header size of the given dictionary
+// bytes and returns it as an Integer. Raises RuntimeError, including the
+// ZDICT error name, when zstd reports an error.
+static VALUE
+vibe_zstd_dict_header_size(VALUE self, VALUE dict_data) {
+    StringValue(dict_data);
+    const char* bytes = RSTRING_PTR(dict_data);
+    size_t length = RSTRING_LEN(dict_data);
+    size_t result = ZDICT_getDictHeaderSize(bytes, length);
+    if (ZDICT_isError(result)) {
+        const char* reason = ZDICT_getErrorName(result);
+        rb_raise(rb_eRuntimeError, "Failed to get dictionary header size: %s", reason);
+    }
+    return SIZET2NUM(result);
+}
+// CDict.estimate_memory(dict_size, level)
+// Returns the ZSTD_estimateCDictSize estimate for a dictionary of dict_size
+// bytes at the given compression level.
+static VALUE
+vibe_zstd_cdict_estimate_memory(VALUE self, VALUE dict_size, VALUE level) {
+    size_t requested_size = NUM2SIZET(dict_size);
+    int requested_level = NUM2INT(level);
+    return SIZET2NUM(ZSTD_estimateCDictSize(requested_size, requested_level));
+}
+// DDict.estimate_memory(dict_size)
+// Returns the ZSTD_estimateDDictSize estimate for a dictionary of dict_size
+// bytes, using the ZSTD_dlm_byCopy load method.
+static VALUE
+vibe_zstd_ddict_estimate_memory(VALUE self, VALUE dict_size) {
+    size_t requested_size = NUM2SIZET(dict_size);
+    return SIZET2NUM(ZSTD_estimateDDictSize(requested_size, ZSTD_dlm_byCopy));
+}
+// Class initialization functions called from main Init_vibe_zstd
+// Registers the allocator, instance methods and the estimate_memory
+// singleton method on the given CDict and DDict classes.
+// The trailing integer of each registration is the Ruby arity; -1 means the
+// C function receives (argc, argv, self).
+void
+vibe_zstd_dict_init_classes(VALUE rb_cVibeZstdCDict, VALUE rb_cVibeZstdDDict) {
+    // CDict class setup
+    rb_define_alloc_func(rb_cVibeZstdCDict, vibe_zstd_cdict_alloc);
+    rb_define_method(rb_cVibeZstdCDict, "initialize", vibe_zstd_cdict_initialize, -1);
+    rb_define_method(rb_cVibeZstdCDict, "size", vibe_zstd_cdict_size, 0);
+    rb_define_method(rb_cVibeZstdCDict, "dict_id", vibe_zstd_cdict_dict_id, 0);
+    rb_define_singleton_method(rb_cVibeZstdCDict, "estimate_memory", vibe_zstd_cdict_estimate_memory, 2);
+    // DDict class setup
+    rb_define_alloc_func(rb_cVibeZstdDDict, vibe_zstd_ddict_alloc);
+    rb_define_method(rb_cVibeZstdDDict, "initialize", vibe_zstd_ddict_initialize, 1);
+    rb_define_method(rb_cVibeZstdDDict, "size", vibe_zstd_ddict_size, 0);
+    rb_define_method(rb_cVibeZstdDDict, "dict_id", vibe_zstd_ddict_dict_id, 0);
+    rb_define_singleton_method(rb_cVibeZstdDDict, "estimate_memory", vibe_zstd_ddict_estimate_memory, 1);
+}
+// Registers the dictionary training and inspection functions as module
+// functions on the given VibeZstd module.
+// Arity -1 entries parse their own arguments (keyword options); arity 1
+// entries take a single positional argument.
+void
+vibe_zstd_dict_init_module_methods(VALUE rb_mVibeZstd) {
+    // Module-level dictionary methods
+    rb_define_module_function(rb_mVibeZstd, "train_dict", vibe_zstd_train_dict, -1);
+    rb_define_module_function(rb_mVibeZstd, "train_dict_cover", vibe_zstd_train_dict_cover, -1);
+    rb_define_module_function(rb_mVibeZstd, "train_dict_fast_cover", vibe_zstd_train_dict_fast_cover, -1);
+    rb_define_module_function(rb_mVibeZstd, "get_dict_id", vibe_zstd_get_dict_id, 1);
+    rb_define_module_function(rb_mVibeZstd, "get_dict_id_from_frame", vibe_zstd_get_dict_id_from_frame, 1);
+    rb_define_module_function(rb_mVibeZstd, "finalize_dictionary", vibe_zstd_finalize_dictionary, -1);
+    rb_define_module_function(rb_mVibeZstd, "dict_header_size", vibe_zstd_dict_header_size, 1);
+}

data/ext/vibe_zstd/extconf.rb ADDED Viewed

@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+require "mkmf"
+# Use vendored zstd library
+LIBZSTD_DIR = File.expand_path("libzstd", __dir__)
+# Add include paths for vendored zstd headers
+# standard:disable Style/GlobalVars
+$INCFLAGS << " -I#{LIBZSTD_DIR}"
+$INCFLAGS << " -I#{LIBZSTD_DIR}/common"
+$INCFLAGS << " -I#{LIBZSTD_DIR}/compress"
+$INCFLAGS << " -I#{LIBZSTD_DIR}/decompress"
+$INCFLAGS << " -I#{LIBZSTD_DIR}/dictBuilder"
+# standard:enable Style/GlobalVars
+# Add preprocessor definitions
+append_cflags("-DXXH_NAMESPACE=ZSTD_")
+append_cflags("-DZSTD_LEGACY_SUPPORT=0")  # Disable legacy support to reduce size
+append_cflags("-DZSTD_MULTITHREAD")  # Enable multithreading support
+# Link with pthread for multithreading
+have_library("pthread") || abort("pthread library is required for multithreading support")
+# Makes all symbols private by default to avoid unintended conflict
+# with other gems. To explicitly export symbols you can use RUBY_FUNC_EXPORTED
+# selectively, or entirely remove this flag.
+append_cflags("-fvisibility=hidden")
+# Gather all vendored zstd source files
+zstd_sources = Dir[
+  "#{LIBZSTD_DIR}/common/*.c",
+  "#{LIBZSTD_DIR}/compress/*.c",
+  "#{LIBZSTD_DIR}/decompress/*.c",
+  "#{LIBZSTD_DIR}/dictBuilder/*.c",
+  "#{LIBZSTD_DIR}/deprecated/*.c"
+].map { |path| File.basename(path) }
+# Add the main vibe_zstd.c file (which includes the split files via #include)
+# standard:disable Style/GlobalVars
+$srcs = ["vibe_zstd.c"] + zstd_sources
+# Set vpath to find source files in subdirectories
+$VPATH ||= []
+$VPATH << "$(srcdir)/libzstd/common"
+$VPATH << "$(srcdir)/libzstd/compress"
+$VPATH << "$(srcdir)/libzstd/decompress"
+$VPATH << "$(srcdir)/libzstd/dictBuilder"
+$VPATH << "$(srcdir)/libzstd/deprecated"
+# standard:enable Style/GlobalVars
+create_makefile("vibe_zstd/vibe_zstd")