vibe_zstd 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/vibe_zstd/dctx.c CHANGED
@@ -8,9 +8,24 @@ extern rb_data_type_t vibe_zstd_dctx_type;
8
8
  // Class-level default for initial capacity (0 = use ZSTD_DStreamOutSize)
9
9
  static size_t default_initial_capacity = 0;
10
10
 
11
+ // Class-level default output-size limit (0 = unlimited)
12
+ static size_t default_max_decompressed_size = 0;
13
+
14
+ // VibeZstd::DecompressedSizeExceeded - raised when output exceeds the limit.
15
+ // Defined in vibe_zstd_dctx_init_class, cached here for use on the error path.
16
+ static VALUE rb_eDecompressedSizeExceeded;
17
+
11
18
  // Helper to set DCtx parameter from Ruby keyword argument
12
19
  static int
13
20
  vibe_zstd_dctx_init_param_iter(VALUE key, VALUE value, VALUE self) {
21
+ // Guard: only Symbol keys are valid. A non-Symbol key (e.g. a String like
22
+ // "format" => 1) would make SYM2ID undefined behaviour, so reject it early.
23
+ if (!SYMBOL_P(key)) {
24
+ rb_raise(rb_eArgError,
25
+ "DCtx.new option keys must be Symbols (got %"PRIsVALUE")",
26
+ rb_inspect(key));
27
+ }
28
+
14
29
  // Build the setter method name: key + "="
15
30
  const char* key_str = rb_id2name(SYM2ID(key));
16
31
  char setter[256];
@@ -51,7 +66,8 @@ typedef struct {
51
66
  } dctx_param_entry;
52
67
 
53
68
  static dctx_param_entry dctx_param_table[] = {
54
- {0, ZSTD_d_windowLogMax, "window_log_max"}
69
+ {0, ZSTD_d_windowLogMax, "window_log_max"},
70
+ {0, ZSTD_d_format, "format"}
55
71
  };
56
72
 
57
73
  #define DCTX_PARAM_TABLE_SIZE (sizeof(dctx_param_table) / sizeof(dctx_param_entry))
@@ -136,6 +152,7 @@ vibe_zstd_dctx_get_param_generic(VALUE self, ZSTD_dParameter param, const char*
136
152
 
137
153
  // Define all DCtx parameter accessors
138
154
  DEFINE_DCTX_PARAM_ACCESSORS(window_log_max, ZSTD_d_windowLogMax, "window_log_max")
155
+ DEFINE_DCTX_PARAM_ACCESSORS(format, ZSTD_d_format, "format")
139
156
 
140
157
  // DCtx parameter_bounds - query parameter bounds (class method, kept for introspection)
141
158
  static VALUE
@@ -215,6 +232,54 @@ vibe_zstd_dctx_set_initial_capacity(VALUE self, VALUE value) {
215
232
  return value;
216
233
  }
217
234
 
235
+ // DCtx default_max_decompressed_size getter (class method): returns the file-level default output-size limit as a Ruby Integer; 0 = unlimited. `self` is not used.
236
+ static VALUE
237
+ vibe_zstd_dctx_get_default_max_decompressed_size(VALUE self) {
238
+ return SIZET2NUM(default_max_decompressed_size); // size_t -> Ruby Integer
239
+ }
240
+
241
+ // DCtx default_max_decompressed_size setter (class method): nil resets the default to 0 (unlimited); any other value is converted with NUM2SIZET and stored. Returns the argument unchanged. Unlike the instance setter, an explicit 0 is accepted here and also means unlimited.
242
+ static VALUE
243
+ vibe_zstd_dctx_set_default_max_decompressed_size(VALUE self, VALUE value) {
244
+ if (NIL_P(value)) {
245
+ default_max_decompressed_size = 0; // unlimited
246
+ } else {
247
+ default_max_decompressed_size = NUM2SIZET(value); // conversion may raise for values that do not fit a size_t
248
+ }
249
+ return value; // hand back the caller's object, not the converted number
250
+ }
251
+
252
+ // DCtx max_decompressed_size getter (instance method). Returns the effective limit as a Ruby Integer:
253
+ // the instance value when it is non-zero, otherwise the class default. A result of 0 means unlimited.
254
+ static VALUE
255
+ vibe_zstd_dctx_get_max_decompressed_size(VALUE self) {
256
+ vibe_zstd_dctx* dctx;
257
+ TypedData_Get_Struct(self, vibe_zstd_dctx, &vibe_zstd_dctx_type, dctx);
258
+
259
+ if (dctx->max_decompressed_size == 0) { // 0 = no instance-level limit set
260
+ return SIZET2NUM(default_max_decompressed_size); // fall back to the class default (may itself be 0)
261
+ }
262
+ return SIZET2NUM(dctx->max_decompressed_size);
263
+ }
264
+
265
+ // DCtx max_decompressed_size setter (instance method); nil = inherit class default (stored as 0). A positive Integer sets the instance limit; 0 raises ArgumentError. Returns the argument unchanged.
266
+ static VALUE
267
+ vibe_zstd_dctx_set_max_decompressed_size(VALUE self, VALUE value) {
268
+ vibe_zstd_dctx* dctx;
269
+ TypedData_Get_Struct(self, vibe_zstd_dctx, &vibe_zstd_dctx_type, dctx);
270
+
271
+ if (NIL_P(value)) {
272
+ dctx->max_decompressed_size = 0; // inherit class default
273
+ } else {
274
+ size_t limit = NUM2SIZET(value); // conversion may raise for values that do not fit a size_t
275
+ if (limit == 0) { // 0 is reserved internally for "inherit", so it is rejected as an explicit value
276
+ rb_raise(rb_eArgError, "max_decompressed_size must be positive (or nil to inherit the class default)");
277
+ }
278
+ dctx->max_decompressed_size = limit;
279
+ }
280
+ return value;
281
+ }
282
+
218
283
  // Decompress args for GVL release
219
284
  // This structure packages all arguments needed for decompression so we can
220
285
  // call ZSTD functions without holding Ruby's Global VM Lock (GVL).
@@ -253,7 +318,10 @@ typedef struct {
253
318
  size_t dst_capacity;
254
319
  size_t dst_size;
255
320
  size_t initial_capacity;
321
+ size_t max_size; // 0 = unlimited; otherwise output must not exceed this
256
322
  int error;
323
+ int limit_exceeded; // set if output would exceed max_size
324
+ int truncated; // set if input was exhausted before the frame completed
257
325
  const char *error_name;
258
326
  } decompress_stream_nogvl_args;
259
327
 
@@ -264,9 +332,15 @@ static void*
264
332
  decompress_stream_without_gvl(void* arg) {
265
333
  decompress_stream_nogvl_args* args = arg;
266
334
  args->error = 0;
335
+ args->limit_exceeded = 0;
336
+ args->truncated = 0;
267
337
  args->error_name = NULL;
268
338
 
269
339
  args->dst_capacity = args->initial_capacity;
340
+ // Never allocate more than the configured limit up front.
341
+ if (args->max_size && args->dst_capacity > args->max_size) {
342
+ args->dst_capacity = args->max_size;
343
+ }
270
344
  args->dst = malloc(args->dst_capacity);
271
345
  if (!args->dst) {
272
346
  args->error = 1;
@@ -276,11 +350,21 @@ decompress_stream_without_gvl(void* arg) {
276
350
  args->dst_size = 0;
277
351
 
278
352
  ZSTD_inBuffer input = { args->src, args->src_size, 0 };
353
+ size_t last_ret = 1; // sentinel: non-zero = frame not yet complete
279
354
 
280
355
  while (input.pos < input.size) {
281
356
  // Ensure we have room for output
282
357
  if (args->dst_size >= args->dst_capacity) {
283
358
  size_t new_capacity = args->dst_capacity * 2;
359
+ // Clamp growth to the configured limit. If we cannot grow past the
360
+ // current capacity, the output would exceed the limit.
361
+ if (args->max_size && new_capacity > args->max_size) {
362
+ new_capacity = args->max_size;
363
+ }
364
+ if (new_capacity <= args->dst_capacity) {
365
+ args->limit_exceeded = 1;
366
+ return NULL;
367
+ }
284
368
  char* new_buf = realloc(args->dst, new_capacity);
285
369
  if (!new_buf) {
286
370
  args->error = 1;
@@ -305,14 +389,74 @@ decompress_stream_without_gvl(void* arg) {
305
389
  }
306
390
 
307
391
  args->dst_size += output.pos;
392
+ last_ret = ret;
308
393
 
309
394
  // ret == 0 means frame is complete
310
395
  if (ret == 0) break;
311
396
  }
312
397
 
398
+ // If the loop ended without any call returning 0 — the input ran out while the decoder
399
+ // still wanted more, or there was no input at all (sentinel untouched) — the frame is incomplete: flag it as truncated.
400
+ if (last_ret != 0) {
401
+ args->truncated = 1;
402
+ }
403
+
313
404
  return NULL;
314
405
  }
315
406
 
407
+ // State for the rb_ensure-wrapped unknown-size decompression path.
408
+ // Groups everything the body needs to run the no-GVL stream loop and everything
409
+ // the cleanup needs to release on any exit (raise, async exception, success).
410
+ typedef struct {
411
+ ZSTD_DCtx* dctx; // context the cleanup un-references the dictionary from
412
+ ZSTD_DDict* ddict; // non-NULL when a dictionary was supplied; cleanup then clears the reference
413
+ decompress_stream_nogvl_args* args; // stream-loop inputs and results, including the C output buffer (dst) that cleanup frees
414
+ VALUE data; // source Ruby string, passed to the locking helper while the GVL is released
415
+ size_t max_size; // resolved output limit, reported in the limit-exceeded error message
416
+ } dctx_stream_decompress_state;
417
+
418
+ // Body: run the no-GVL stream loop (source string locked), check the outcome,
419
+ // and build the result string. Raising here is safe: cleanup always runs. `p` is a dctx_stream_decompress_state* cast to VALUE; returns a new Ruby String copied from the C buffer.
420
+ static VALUE
421
+ vibe_zstd_dctx_stream_decompress_body(VALUE p) {
422
+ dctx_stream_decompress_state* state = (dctx_stream_decompress_state*)p;
423
+
424
+ // Lock the source string while the GVL is released: another Ruby thread
425
+ // holding the same string must not mutate or GC it mid-decompression.
426
+ vibe_zstd_nogvl_with_str_locked(decompress_stream_without_gvl, state->args, state->data);
427
+
428
+ if (state->args->limit_exceeded) { // checked first: the limit error takes precedence over the generic failures below
429
+ rb_raise(rb_eDecompressedSizeExceeded,
430
+ "Decompressed output exceeds limit of %zu bytes", state->max_size);
431
+ }
432
+
433
+ if (state->args->error) {
434
+ rb_raise(rb_eRuntimeError, "Decompression failed: %s", state->args->error_name); // NOTE(review): the stream loop initialises error_name to NULL; confirm every error path sets it before this %s
435
+ }
436
+
437
+ if (state->args->truncated) { // loop finished without the decoder reporting a completed frame
438
+ rb_raise(rb_eRuntimeError, "Truncated frame: incomplete zstd data");
439
+ }
440
+
441
+ // Create Ruby string from the C buffer; cleanup frees the buffer
442
+ return rb_str_new(state->args->dst, state->args->dst_size);
443
+ }
444
+
445
+ // Cleanup: free the C output buffer and return the context to no-dictionary
446
+ // mode so subsequent calls on this DCtx are not affected. `p` is a dctx_stream_decompress_state* cast to VALUE; always returns Qnil.
447
+ static VALUE
448
+ vibe_zstd_dctx_stream_decompress_cleanup(VALUE p) {
449
+ dctx_stream_decompress_state* state = (dctx_stream_decompress_state*)p;
450
+ if (state->args->dst) {
451
+ free(state->args->dst);
452
+ state->args->dst = NULL; // leave no dangling pointer behind in the args struct
453
+ }
454
+ if (state->ddict) { // only undo the dictionary reference when one was supplied for this call
455
+ ZSTD_DCtx_refDDict(state->dctx, NULL); // return value deliberately ignored on the cleanup path
456
+ }
457
+ return Qnil;
458
+ }
459
+
316
460
  // DCtx frame_content_size - class method to get frame content size
317
461
  static VALUE
318
462
  vibe_zstd_dctx_frame_content_size(VALUE self, VALUE data) {
@@ -353,35 +497,52 @@ vibe_zstd_dctx_decompress(int argc, VALUE* argv, VALUE self) {
353
497
  size_t srcSize = RSTRING_LEN(data);
354
498
  size_t offset = 0;
355
499
 
356
- // Skip any leading skippable frames
357
- while (offset < srcSize && ZSTD_isSkippableFrame(src + offset, srcSize - offset)) {
358
- size_t frameSize = ZSTD_findFrameCompressedSize(src + offset, srcSize - offset);
359
- if (ZSTD_isError(frameSize)) {
360
- rb_raise(rb_eRuntimeError, "Invalid skippable frame at offset %zu: %s", offset, ZSTD_getErrorName(frameSize));
500
+ // Magicless frames (format = ZSTD_f_zstd1_magicless) carry no magic number,
501
+ // so frame introspection (content size, dict ID, skippable detection) cannot
502
+ // be performed. Force the streaming decompress path, which honors the format
503
+ // parameter set on the context via ZSTD_decompressStream.
504
+ int dformat = 0;
505
+ (void)ZSTD_DCtx_getParameter(dctx->dctx, ZSTD_d_format, &dformat);
506
+ int magicless = (dformat == ZSTD_f_zstd1_magicless);
507
+
508
+ unsigned long long contentSize;
509
+ unsigned int frame_dict_id;
510
+
511
+ if (magicless) {
512
+ contentSize = ZSTD_CONTENTSIZE_UNKNOWN; // route to streaming path
513
+ frame_dict_id = 0; // cannot read dict ID without magic
514
+ } else {
515
+ // Skip any leading skippable frames
516
+ while (offset < srcSize && ZSTD_isSkippableFrame(src + offset, srcSize - offset)) {
517
+ size_t frameSize = ZSTD_findFrameCompressedSize(src + offset, srcSize - offset);
518
+ if (ZSTD_isError(frameSize)) {
519
+ rb_raise(rb_eRuntimeError, "Invalid skippable frame at offset %zu: %s", offset, ZSTD_getErrorName(frameSize));
520
+ }
521
+ offset += frameSize;
361
522
  }
362
- offset += frameSize;
363
- }
364
523
 
365
- // Now check the actual compressed frame
366
- if (offset >= srcSize) {
367
- rb_raise(rb_eRuntimeError, "No compressed frame found in %zu bytes (only skippable frames)", srcSize);
368
- }
524
+ // Now check the actual compressed frame
525
+ if (offset >= srcSize) {
526
+ rb_raise(rb_eRuntimeError, "No compressed frame found in %zu bytes (only skippable frames)", srcSize);
527
+ }
369
528
 
370
- src += offset;
371
- srcSize -= offset;
529
+ src += offset;
530
+ srcSize -= offset;
372
531
 
373
- unsigned long long contentSize = ZSTD_getFrameContentSize(src, srcSize);
374
- if (contentSize == ZSTD_CONTENTSIZE_ERROR) {
375
- rb_raise(rb_eRuntimeError, "Invalid compressed data: not a valid zstd frame (size: %zu bytes)", srcSize);
376
- }
532
+ contentSize = ZSTD_getFrameContentSize(src, srcSize);
533
+ if (contentSize == ZSTD_CONTENTSIZE_ERROR) {
534
+ rb_raise(rb_eRuntimeError, "Invalid compressed data: not a valid zstd frame (size: %zu bytes)", srcSize);
535
+ }
377
536
 
378
- // Check dictionary requirements from the frame
379
- unsigned int frame_dict_id = ZSTD_getDictID_fromFrame(src, srcSize);
537
+ // Check dictionary requirements from the frame
538
+ frame_dict_id = ZSTD_getDictID_fromFrame(src, srcSize);
539
+ }
380
540
 
381
541
  // Extract keyword arguments
382
542
  ZSTD_DDict* ddict = NULL;
383
543
  unsigned int provided_dict_id = 0;
384
544
  size_t initial_capacity = 0; // 0 = not specified in per-call options
545
+ size_t max_size = 0; // 0 = not specified in per-call options
385
546
 
386
547
  if (!NIL_P(options)) {
387
548
  VALUE dict_val = rb_hash_aref(options, ID2SYM(rb_intern("dict")));
@@ -399,6 +560,27 @@ vibe_zstd_dctx_decompress(int argc, VALUE* argv, VALUE self) {
399
560
  rb_raise(rb_eArgError, "initial_capacity must be positive");
400
561
  }
401
562
  }
563
+
564
+ // Per-call output-size limit; accepts :max_decompressed_size or :max_size.
565
+ VALUE max_size_val = rb_hash_aref(options, ID2SYM(rb_intern("max_decompressed_size")));
566
+ if (NIL_P(max_size_val)) {
567
+ max_size_val = rb_hash_aref(options, ID2SYM(rb_intern("max_size")));
568
+ }
569
+ if (!NIL_P(max_size_val)) {
570
+ max_size = NUM2SIZET(max_size_val);
571
+ if (max_size == 0) {
572
+ rb_raise(rb_eArgError, "max_decompressed_size must be positive");
573
+ }
574
+ }
575
+ }
576
+
577
+ // Resolve max_size fallback chain: per-call > instance > class default.
578
+ // A value of 0 at every level means unlimited.
579
+ if (max_size == 0) {
580
+ max_size = dctx->max_decompressed_size; // instance
581
+ if (max_size == 0) {
582
+ max_size = default_max_decompressed_size; // class
583
+ }
402
584
  }
403
585
 
404
586
  // Validate dictionary matches frame requirements
@@ -426,6 +608,17 @@ vibe_zstd_dctx_decompress(int argc, VALUE* argv, VALUE self) {
426
608
  // Releases GVL to allow other Ruby threads to run during decompression.
427
609
  // Uses C malloc/realloc (not Ruby allocators) since Ruby API calls are forbidden without GVL.
428
610
  if (contentSize == ZSTD_CONTENTSIZE_UNKNOWN) {
611
+ // Reference the dictionary on the context before streaming decompression.
612
+ // ZSTD_decompressStream uses whatever dict is referenced on the DCtx, so
613
+ // without this the dictionary would be ignored on the unknown-size path
614
+ // (every dict frame produced by CompressWriter has unknown content size).
615
+ if (ddict) {
616
+ size_t rd = ZSTD_DCtx_refDDict(dctx->dctx, ddict);
617
+ if (ZSTD_isError(rd)) {
618
+ rb_raise(rb_eRuntimeError, "Failed to reference dictionary: %s", ZSTD_getErrorName(rd));
619
+ }
620
+ }
621
+
429
622
  decompress_stream_nogvl_args stream_args = {
430
623
  .dctx = dctx->dctx,
431
624
  .src = src,
@@ -434,22 +627,34 @@ vibe_zstd_dctx_decompress(int argc, VALUE* argv, VALUE self) {
434
627
  .dst_capacity = 0,
435
628
  .dst_size = 0,
436
629
  .initial_capacity = initial_capacity,
630
+ .max_size = max_size,
437
631
  .error = 0,
632
+ .limit_exceeded = 0,
633
+ .truncated = 0,
438
634
  .error_name = NULL
439
635
  };
440
636
 
441
- rb_thread_call_without_gvl(decompress_stream_without_gvl, &stream_args, NULL, NULL);
442
-
443
- if (stream_args.error) {
444
- if (stream_args.dst) free(stream_args.dst);
445
- rb_raise(rb_eRuntimeError, "Decompression failed: %s", stream_args.error_name);
446
- }
447
-
448
- // Create Ruby string from the C buffer, then free the C buffer
449
- VALUE result = rb_str_new(stream_args.dst, stream_args.dst_size);
450
- free(stream_args.dst);
451
- return result;
637
+ // Run the streaming decompression and build the result under rb_ensure:
638
+ // the cleanup frees the C buffer and un-references the dictionary on
639
+ // every exit path, including the raises below and async exceptions
640
+ // delivered when the GVL is reacquired.
641
+ dctx_stream_decompress_state state = {
642
+ .dctx = dctx->dctx,
643
+ .ddict = ddict,
644
+ .args = &stream_args,
645
+ .data = data,
646
+ .max_size = max_size
647
+ };
648
+ return rb_ensure(vibe_zstd_dctx_stream_decompress_body, (VALUE)&state,
649
+ vibe_zstd_dctx_stream_decompress_cleanup, (VALUE)&state);
452
650
  }
651
+ // Reject a frame whose declared content size exceeds the limit before
652
+ // allocating the output buffer (the header is attacker-controlled).
653
+ if (max_size && contentSize > (unsigned long long)max_size) {
654
+ rb_raise(rb_eDecompressedSizeExceeded,
655
+ "Declared content size %llu exceeds limit of %zu bytes", contentSize, max_size);
656
+ }
657
+
453
658
  VALUE result = rb_str_new(NULL, contentSize);
454
659
  decompress_args args = {
455
660
  .dctx = dctx->dctx,
@@ -460,7 +665,11 @@ vibe_zstd_dctx_decompress(int argc, VALUE* argv, VALUE self) {
460
665
  .dstCapacity = contentSize,
461
666
  .result = 0
462
667
  };
463
- rb_thread_call_without_gvl(decompress_without_gvl, &args, NULL, NULL);
668
+ // Lock the source string while the GVL is released: another Ruby thread
669
+ // holding the same string must not mutate or GC it mid-decompression.
670
+ // The helper unlocks via rb_ensure so an async exception cannot leave
671
+ // the string permanently locked.
672
+ vibe_zstd_nogvl_with_str_locked(decompress_without_gvl, &args, data);
464
673
  if (ZSTD_isError(args.result)) {
465
674
  rb_raise(rb_eRuntimeError, "Decompression failed: %s", ZSTD_getErrorName(args.result));
466
675
  }
@@ -525,6 +734,13 @@ vibe_zstd_dctx_init_class(VALUE rb_cVibeZstdDCtx) {
525
734
  // Initialize parameter lookup table
526
735
  init_dctx_param_table();
527
736
 
737
+ // Define VibeZstd::Error (base) and VibeZstd::DecompressedSizeExceeded.
738
+ // Defined here (rather than only in Ruby) so the error is available even if
739
+ // the C extension is required without the Ruby wrapper. Ruby's
740
+ // `class Error < StandardError` simply reopens the same class.
741
+ VALUE rb_eVibeZstdError = rb_define_class_under(rb_mVibeZstd, "Error", rb_eStandardError);
742
+ rb_eDecompressedSizeExceeded = rb_define_class_under(rb_mVibeZstd, "DecompressedSizeExceeded", rb_eVibeZstdError);
743
+
528
744
  rb_define_alloc_func(rb_cVibeZstdDCtx, vibe_zstd_dctx_alloc);
529
745
  rb_define_method(rb_cVibeZstdDCtx, "initialize", vibe_zstd_dctx_initialize, -1);
530
746
  rb_define_method(rb_cVibeZstdDCtx, "decompress", vibe_zstd_dctx_decompress, -1);
@@ -543,8 +759,20 @@ vibe_zstd_dctx_init_class(VALUE rb_cVibeZstdDCtx) {
543
759
  rb_define_method(rb_cVibeZstdDCtx, "window_log_max", vibe_zstd_dctx_get_window_log_max, 0);
544
760
  rb_define_alias(rb_cVibeZstdDCtx, "max_window_log=", "window_log_max=");
545
761
  rb_define_alias(rb_cVibeZstdDCtx, "max_window_log", "window_log_max");
762
+ rb_define_method(rb_cVibeZstdDCtx, "format=", vibe_zstd_dctx_set_format, 1);
763
+ rb_define_method(rb_cVibeZstdDCtx, "format", vibe_zstd_dctx_get_format, 0);
546
764
 
547
765
  // Instance-level initial_capacity accessors
548
766
  rb_define_method(rb_cVibeZstdDCtx, "initial_capacity", vibe_zstd_dctx_get_initial_capacity, 0);
549
767
  rb_define_method(rb_cVibeZstdDCtx, "initial_capacity=", vibe_zstd_dctx_set_initial_capacity, 1);
768
+
769
+ // Class-level default_max_decompressed_size accessors (0 = unlimited)
770
+ rb_define_singleton_method(rb_cVibeZstdDCtx, "default_max_decompressed_size", vibe_zstd_dctx_get_default_max_decompressed_size, 0);
771
+ rb_define_singleton_method(rb_cVibeZstdDCtx, "default_max_decompressed_size=", vibe_zstd_dctx_set_default_max_decompressed_size, 1);
772
+
773
+ // Instance-level max_decompressed_size accessors (with shorter max_size alias)
774
+ rb_define_method(rb_cVibeZstdDCtx, "max_decompressed_size", vibe_zstd_dctx_get_max_decompressed_size, 0);
775
+ rb_define_method(rb_cVibeZstdDCtx, "max_decompressed_size=", vibe_zstd_dctx_set_max_decompressed_size, 1);
776
+ rb_define_alias(rb_cVibeZstdDCtx, "max_size", "max_decompressed_size");
777
+ rb_define_alias(rb_cVibeZstdDCtx, "max_size=", "max_decompressed_size=");
550
778
  }
@@ -0,0 +1,3 @@
1
+ # vibe_zstd.c textually #includes the split implementation files, so the object
2
+ # must be rebuilt when any of them (or the project headers) change.
3
+ vibe_zstd.o: cctx.c dctx.c dict.c streaming.c frames.c vibe_zstd.h vibe_zstd_internal.h
data/ext/vibe_zstd/dict.c CHANGED
@@ -125,13 +125,21 @@ dict_training_cleanup(VALUE arg) {
125
125
  return Qnil;
126
126
  }
127
127
 
128
- // Copy Ruby sample strings into contiguous C buffer for ZDICT functions
128
+ // Copy Ruby sample strings into contiguous C buffer for ZDICT functions.
129
+ // capacity is the total_samples_size measured during the validation pass.
130
+ // If any sample has grown since validation (TOCTOU mutation), we raise rather
131
+ // than overflow the buffer.
129
132
  static void
130
- copy_samples_to_buffer(dict_training_resources* resources, VALUE samples, long num_samples) {
133
+ copy_samples_to_buffer(dict_training_resources* resources, VALUE samples, long num_samples,
134
+ size_t capacity) {
131
135
  size_t offset = 0;
132
136
  for (long i = 0; i < num_samples; i++) {
133
137
  VALUE sample = rb_ary_entry(samples, i);
134
138
  size_t sample_len = RSTRING_LEN(sample);
139
+ // Guard against mutation between the validation pass and this copy.
140
+ if (sample_len > capacity - offset) {
141
+ rb_raise(rb_eRuntimeError, "sample mutated during dictionary training");
142
+ }
135
143
  resources->sample_sizes[i] = sample_len;
136
144
  memcpy(resources->samples_buffer + offset, RSTRING_PTR(sample), sample_len);
137
145
  offset += sample_len;
@@ -148,12 +156,14 @@ typedef struct {
148
156
  VALUE result;
149
157
  size_t max_dict_size;
150
158
  long num_samples;
151
- VALUE samples;
159
+ size_t total_samples_size; // measured during validation; passed to copy_samples_to_buffer
160
+ VALUE samples; // private converted-samples array built during validation
152
161
  } dict_training_ctx;
153
162
 
154
163
  static VALUE train_dict_basic_body(VALUE arg) {
155
164
  dict_training_ctx* ctx = (dict_training_ctx*)arg;
156
- copy_samples_to_buffer(ctx->resources, ctx->samples, ctx->num_samples);
165
+ copy_samples_to_buffer(ctx->resources, ctx->samples, ctx->num_samples,
166
+ ctx->total_samples_size);
157
167
  size_t dict_size = ZDICT_trainFromBuffer(
158
168
  ctx->resources->dict_buffer, ctx->max_dict_size,
159
169
  ctx->resources->samples_buffer, ctx->resources->sample_sizes, (unsigned)ctx->num_samples
@@ -172,7 +182,8 @@ typedef struct {
172
182
 
173
183
  static VALUE train_dict_cover_body(VALUE arg) {
174
184
  train_dict_cover_ctx* ctx = (train_dict_cover_ctx*)arg;
175
- copy_samples_to_buffer(ctx->base.resources, ctx->base.samples, ctx->base.num_samples);
185
+ copy_samples_to_buffer(ctx->base.resources, ctx->base.samples, ctx->base.num_samples,
186
+ ctx->base.total_samples_size);
176
187
  size_t dict_size = ZDICT_trainFromBuffer_cover(
177
188
  ctx->base.resources->dict_buffer, ctx->base.max_dict_size,
178
189
  ctx->base.resources->samples_buffer, ctx->base.resources->sample_sizes, (unsigned)ctx->base.num_samples,
@@ -192,7 +203,8 @@ typedef struct {
192
203
 
193
204
  static VALUE train_dict_fast_cover_body(VALUE arg) {
194
205
  train_dict_fast_cover_ctx* ctx = (train_dict_fast_cover_ctx*)arg;
195
- copy_samples_to_buffer(ctx->base.resources, ctx->base.samples, ctx->base.num_samples);
206
+ copy_samples_to_buffer(ctx->base.resources, ctx->base.samples, ctx->base.num_samples,
207
+ ctx->base.total_samples_size);
196
208
  size_t dict_size = ZDICT_trainFromBuffer_fastCover(
197
209
  ctx->base.resources->dict_buffer, ctx->base.max_dict_size,
198
210
  ctx->base.resources->samples_buffer, ctx->base.resources->sample_sizes, (unsigned)ctx->base.num_samples,
@@ -213,7 +225,8 @@ typedef struct {
213
225
 
214
226
  static VALUE finalize_dict_body(VALUE arg) {
215
227
  finalize_dict_ctx* ctx = (finalize_dict_ctx*)arg;
216
- copy_samples_to_buffer(ctx->base.resources, ctx->base.samples, ctx->base.num_samples);
228
+ copy_samples_to_buffer(ctx->base.resources, ctx->base.samples, ctx->base.num_samples,
229
+ ctx->base.total_samples_size);
217
230
  size_t dict_size = ZDICT_finalizeDictionary(
218
231
  ctx->base.resources->dict_buffer, ctx->base.max_dict_size,
219
232
  RSTRING_PTR(ctx->content_val), RSTRING_LEN(ctx->content_val),
@@ -245,11 +258,17 @@ vibe_zstd_train_dict(int argc, VALUE* argv, VALUE self) {
245
258
  rb_raise(rb_eArgError, "samples array cannot be empty");
246
259
  }
247
260
 
248
- // Validate all samples are strings and calculate sizes BEFORE allocating
261
+ // Validate all samples are strings and calculate sizes BEFORE allocating.
262
+ // Build a private converted-samples array so that StringValue conversions are
263
+ // retained: rb_ary_entry re-fetches the raw element, so storing the converted
264
+ // value here ensures copy_samples_to_buffer operates on real String objects
265
+ // rather than potentially-non-String objects that merely respond to to_str.
266
+ VALUE converted_samples = rb_ary_new_capa(num_samples);
249
267
  size_t total_samples_size = 0;
250
268
  for (long i = 0; i < num_samples; i++) {
251
269
  VALUE sample = rb_ary_entry(samples, i);
252
- StringValue(sample); // Validate type early - may raise TypeError
270
+ StringValue(sample); // Validate type early - may raise TypeError; updates local
271
+ rb_ary_push(converted_samples, sample);
253
272
  total_samples_size += RSTRING_LEN(sample);
254
273
  }
255
274
 
@@ -274,7 +293,8 @@ vibe_zstd_train_dict(int argc, VALUE* argv, VALUE self) {
274
293
  .result = Qnil,
275
294
  .max_dict_size = max_dict_size,
276
295
  .num_samples = num_samples,
277
- .samples = samples
296
+ .total_samples_size = total_samples_size,
297
+ .samples = converted_samples // use private array, not caller's array
278
298
  };
279
299
 
280
300
  rb_ensure(train_dict_basic_body, (VALUE)&ctx, dict_training_cleanup, (VALUE)&resources);
@@ -298,11 +318,14 @@ vibe_zstd_train_dict_cover(int argc, VALUE* argv, VALUE self) {
298
318
  rb_raise(rb_eArgError, "samples array cannot be empty");
299
319
  }
300
320
 
301
- // Validate all samples are strings and calculate sizes BEFORE allocating
321
+ // Validate all samples are strings and calculate sizes BEFORE allocating.
322
+ // Build a private converted-samples array (see vibe_zstd_train_dict for details).
323
+ VALUE converted_samples = rb_ary_new_capa(num_samples);
302
324
  size_t total_samples_size = 0;
303
325
  for (long i = 0; i < num_samples; i++) {
304
326
  VALUE sample = rb_ary_entry(samples, i);
305
- StringValue(sample); // Validate type early - may raise TypeError
327
+ StringValue(sample); // Validate type early - may raise TypeError; updates local
328
+ rb_ary_push(converted_samples, sample);
306
329
  total_samples_size += RSTRING_LEN(sample);
307
330
  }
308
331
 
@@ -358,7 +381,8 @@ vibe_zstd_train_dict_cover(int argc, VALUE* argv, VALUE self) {
358
381
  .result = Qnil,
359
382
  .max_dict_size = max_dict_size,
360
383
  .num_samples = num_samples,
361
- .samples = samples
384
+ .total_samples_size = total_samples_size,
385
+ .samples = converted_samples // use private array, not caller's array
362
386
  },
363
387
  .params = params
364
388
  };
@@ -384,11 +408,14 @@ vibe_zstd_train_dict_fast_cover(int argc, VALUE* argv, VALUE self) {
384
408
  rb_raise(rb_eArgError, "samples array cannot be empty");
385
409
  }
386
410
 
387
- // Validate all samples are strings and calculate sizes BEFORE allocating
411
+ // Validate all samples are strings and calculate sizes BEFORE allocating.
412
+ // Build a private converted-samples array (see vibe_zstd_train_dict for details).
413
+ VALUE converted_samples = rb_ary_new_capa(num_samples);
388
414
  size_t total_samples_size = 0;
389
415
  for (long i = 0; i < num_samples; i++) {
390
416
  VALUE sample = rb_ary_entry(samples, i);
391
- StringValue(sample); // Validate type early - may raise TypeError
417
+ StringValue(sample); // Validate type early - may raise TypeError; updates local
418
+ rb_ary_push(converted_samples, sample);
392
419
  total_samples_size += RSTRING_LEN(sample);
393
420
  }
394
421
 
@@ -447,7 +474,8 @@ vibe_zstd_train_dict_fast_cover(int argc, VALUE* argv, VALUE self) {
447
474
  .result = Qnil,
448
475
  .max_dict_size = max_dict_size,
449
476
  .num_samples = num_samples,
450
- .samples = samples
477
+ .total_samples_size = total_samples_size,
478
+ .samples = converted_samples // use private array, not caller's array
451
479
  },
452
480
  .params = params
453
481
  };
@@ -514,11 +542,14 @@ vibe_zstd_finalize_dictionary(int argc, VALUE* argv, VALUE self) {
514
542
  rb_raise(rb_eArgError, "samples array cannot be empty");
515
543
  }
516
544
 
517
- // Validate all samples are strings and calculate sizes BEFORE allocating
545
+ // Validate all samples are strings and calculate sizes BEFORE allocating.
546
+ // Build a private converted-samples array (see vibe_zstd_train_dict for details).
547
+ VALUE converted_samples = rb_ary_new_capa(num_samples);
518
548
  size_t total_samples_size = 0;
519
549
  for (long i = 0; i < num_samples; i++) {
520
550
  VALUE sample = rb_ary_entry(samples_val, i);
521
- StringValue(sample); // Validate type early - may raise TypeError
551
+ StringValue(sample); // Validate type early - may raise TypeError; updates local
552
+ rb_ary_push(converted_samples, sample);
522
553
  total_samples_size += RSTRING_LEN(sample);
523
554
  }
524
555
 
@@ -546,7 +577,8 @@ vibe_zstd_finalize_dictionary(int argc, VALUE* argv, VALUE self) {
546
577
  .result = Qnil,
547
578
  .max_dict_size = max_size,
548
579
  .num_samples = num_samples,
549
- .samples = samples_val
580
+ .total_samples_size = total_samples_size,
581
+ .samples = converted_samples // use private array, not caller's array
550
582
  },
551
583
  .content_val = content_val,
552
584
  .params = params
@@ -14,10 +14,13 @@ $INCFLAGS << " -I#{LIBZSTD_DIR}/decompress"
14
14
  $INCFLAGS << " -I#{LIBZSTD_DIR}/dictBuilder"
15
15
  # standard:enable Style/GlobalVars
16
16
 
17
- # Add preprocessor definitions
18
- append_cflags("-DXXH_NAMESPACE=ZSTD_")
19
- append_cflags("-DZSTD_LEGACY_SUPPORT=0") # Disable legacy support to reduce size
20
- append_cflags("-DZSTD_MULTITHREAD") # Enable multithreading support
17
+ # Add preprocessor definitions. Use $defs so they appear in DEFS in the Makefile;
18
+ # append_cflags only validates the flag and does not reliably propagate -D flags.
19
+ # standard:disable Style/GlobalVars
20
+ $defs << "-DXXH_NAMESPACE=ZSTD_"
21
+ $defs << "-DZSTD_LEGACY_SUPPORT=0" # Disable legacy support to reduce size
22
+ $defs << "-DZSTD_MULTITHREAD" # Enable multithreading support
23
+ # standard:enable Style/GlobalVars
21
24
 
22
25
  # Link with pthread for multithreading
23
26
  have_library("pthread") || abort("pthread library is required for multithreading support")
@@ -81,12 +81,23 @@ vibe_zstd_read_skippable_frame(VALUE self, VALUE data) {
81
81
  uint32_t content_size;
82
82
  memcpy(&content_size, src + 4, 4);
83
83
 
84
- VALUE result = rb_str_buf_new(content_size);
84
+ // The content size field is attacker-controlled and may claim up to ~4 GiB.
85
+ // A skippable frame's content cannot exceed the bytes actually provided
86
+ // (src_size minus the 8-byte header), so cap the allocation accordingly to
87
+ // prevent a tiny truncated input from forcing a huge allocation. A frame
88
+ // whose declared size exceeds what is present is malformed and
89
+ // ZSTD_readSkippableFrame reports the error below.
90
+ size_t capacity = content_size;
91
+ if (capacity > src_size - 8) {
92
+ capacity = src_size - 8;
93
+ }
94
+
95
+ VALUE result = rb_str_buf_new(capacity);
85
96
  unsigned magic_variant;
86
97
 
87
98
  size_t bytes_read = ZSTD_readSkippableFrame(
88
99
  RSTRING_PTR(result),
89
- content_size,
100
+ capacity,
90
101
  &magic_variant,
91
102
  src,
92
103
  src_size