vibe_zstd 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +43 -0
- data/README.md +79 -3
- data/ext/vibe_zstd/cctx.c +71 -25
- data/ext/vibe_zstd/dctx.c +260 -32
- data/ext/vibe_zstd/depend +3 -0
- data/ext/vibe_zstd/dict.c +51 -19
- data/ext/vibe_zstd/extconf.rb +7 -4
- data/ext/vibe_zstd/frames.c +13 -2
- data/ext/vibe_zstd/streaming.c +110 -16
- data/ext/vibe_zstd/vibe_zstd.c +30 -0
- data/ext/vibe_zstd/vibe_zstd.h +1 -0
- data/lib/vibe_zstd/version.rb +1 -1
- data/lib/vibe_zstd.rb +48 -23
- metadata +3 -2
data/ext/vibe_zstd/dctx.c
CHANGED
|
@@ -8,9 +8,24 @@ extern rb_data_type_t vibe_zstd_dctx_type;
|
|
|
8
8
|
// Class-level default for initial capacity (0 = use ZSTD_DStreamOutSize)
|
|
9
9
|
static size_t default_initial_capacity = 0;
|
|
10
10
|
|
|
11
|
+
// Class-level default output-size limit (0 = unlimited)
|
|
12
|
+
static size_t default_max_decompressed_size = 0;
|
|
13
|
+
|
|
14
|
+
// VibeZstd::DecompressedSizeExceeded - raised when output exceeds the limit.
|
|
15
|
+
// Defined in vibe_zstd_dctx_init_class, cached here for use on the error path.
|
|
16
|
+
static VALUE rb_eDecompressedSizeExceeded;
|
|
17
|
+
|
|
11
18
|
// Helper to set DCtx parameter from Ruby keyword argument
|
|
12
19
|
static int
|
|
13
20
|
vibe_zstd_dctx_init_param_iter(VALUE key, VALUE value, VALUE self) {
|
|
21
|
+
// Guard: only Symbol keys are valid. A non-Symbol key (e.g. a String like
|
|
22
|
+
// "format" => 1) would make SYM2ID undefined behaviour, so reject it early.
|
|
23
|
+
if (!SYMBOL_P(key)) {
|
|
24
|
+
rb_raise(rb_eArgError,
|
|
25
|
+
"DCtx.new option keys must be Symbols (got %"PRIsVALUE")",
|
|
26
|
+
rb_inspect(key));
|
|
27
|
+
}
|
|
28
|
+
|
|
14
29
|
// Build the setter method name: key + "="
|
|
15
30
|
const char* key_str = rb_id2name(SYM2ID(key));
|
|
16
31
|
char setter[256];
|
|
@@ -51,7 +66,8 @@ typedef struct {
|
|
|
51
66
|
} dctx_param_entry;
|
|
52
67
|
|
|
53
68
|
static dctx_param_entry dctx_param_table[] = {
|
|
54
|
-
{0, ZSTD_d_windowLogMax, "window_log_max"}
|
|
69
|
+
{0, ZSTD_d_windowLogMax, "window_log_max"},
|
|
70
|
+
{0, ZSTD_d_format, "format"}
|
|
55
71
|
};
|
|
56
72
|
|
|
57
73
|
#define DCTX_PARAM_TABLE_SIZE (sizeof(dctx_param_table) / sizeof(dctx_param_entry))
|
|
@@ -136,6 +152,7 @@ vibe_zstd_dctx_get_param_generic(VALUE self, ZSTD_dParameter param, const char*
|
|
|
136
152
|
|
|
137
153
|
// Define all DCtx parameter accessors
|
|
138
154
|
DEFINE_DCTX_PARAM_ACCESSORS(window_log_max, ZSTD_d_windowLogMax, "window_log_max")
|
|
155
|
+
DEFINE_DCTX_PARAM_ACCESSORS(format, ZSTD_d_format, "format")
|
|
139
156
|
|
|
140
157
|
// DCtx parameter_bounds - query parameter bounds (class method, kept for introspection)
|
|
141
158
|
static VALUE
|
|
@@ -215,6 +232,54 @@ vibe_zstd_dctx_set_initial_capacity(VALUE self, VALUE value) {
|
|
|
215
232
|
return value;
|
|
216
233
|
}
|
|
217
234
|
|
|
235
|
+
// DCtx default_max_decompressed_size getter (class method); 0 = unlimited
|
|
236
|
+
static VALUE
|
|
237
|
+
vibe_zstd_dctx_get_default_max_decompressed_size(VALUE self) {
|
|
238
|
+
return SIZET2NUM(default_max_decompressed_size);
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
// DCtx default_max_decompressed_size setter (class method)
|
|
242
|
+
static VALUE
|
|
243
|
+
vibe_zstd_dctx_set_default_max_decompressed_size(VALUE self, VALUE value) {
|
|
244
|
+
if (NIL_P(value)) {
|
|
245
|
+
default_max_decompressed_size = 0; // unlimited
|
|
246
|
+
} else {
|
|
247
|
+
default_max_decompressed_size = NUM2SIZET(value);
|
|
248
|
+
}
|
|
249
|
+
return value;
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
// DCtx max_decompressed_size getter (instance method); reports the effective
|
|
253
|
+
// limit, falling back to the class default. Returns 0 when unlimited.
|
|
254
|
+
static VALUE
|
|
255
|
+
vibe_zstd_dctx_get_max_decompressed_size(VALUE self) {
|
|
256
|
+
vibe_zstd_dctx* dctx;
|
|
257
|
+
TypedData_Get_Struct(self, vibe_zstd_dctx, &vibe_zstd_dctx_type, dctx);
|
|
258
|
+
|
|
259
|
+
if (dctx->max_decompressed_size == 0) {
|
|
260
|
+
return SIZET2NUM(default_max_decompressed_size);
|
|
261
|
+
}
|
|
262
|
+
return SIZET2NUM(dctx->max_decompressed_size);
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
// DCtx max_decompressed_size setter (instance method); nil = inherit class default
|
|
266
|
+
static VALUE
|
|
267
|
+
vibe_zstd_dctx_set_max_decompressed_size(VALUE self, VALUE value) {
|
|
268
|
+
vibe_zstd_dctx* dctx;
|
|
269
|
+
TypedData_Get_Struct(self, vibe_zstd_dctx, &vibe_zstd_dctx_type, dctx);
|
|
270
|
+
|
|
271
|
+
if (NIL_P(value)) {
|
|
272
|
+
dctx->max_decompressed_size = 0; // inherit class default
|
|
273
|
+
} else {
|
|
274
|
+
size_t limit = NUM2SIZET(value);
|
|
275
|
+
if (limit == 0) {
|
|
276
|
+
rb_raise(rb_eArgError, "max_decompressed_size must be positive (or nil to inherit the class default)");
|
|
277
|
+
}
|
|
278
|
+
dctx->max_decompressed_size = limit;
|
|
279
|
+
}
|
|
280
|
+
return value;
|
|
281
|
+
}
|
|
282
|
+
|
|
218
283
|
// Decompress args for GVL release
|
|
219
284
|
// This structure packages all arguments needed for decompression so we can
|
|
220
285
|
// call ZSTD functions without holding Ruby's Global VM Lock (GVL).
|
|
@@ -253,7 +318,10 @@ typedef struct {
|
|
|
253
318
|
size_t dst_capacity;
|
|
254
319
|
size_t dst_size;
|
|
255
320
|
size_t initial_capacity;
|
|
321
|
+
size_t max_size; // 0 = unlimited; otherwise output must not exceed this
|
|
256
322
|
int error;
|
|
323
|
+
int limit_exceeded; // set if output would exceed max_size
|
|
324
|
+
int truncated; // set if input was exhausted before the frame completed
|
|
257
325
|
const char *error_name;
|
|
258
326
|
} decompress_stream_nogvl_args;
|
|
259
327
|
|
|
@@ -264,9 +332,15 @@ static void*
|
|
|
264
332
|
decompress_stream_without_gvl(void* arg) {
|
|
265
333
|
decompress_stream_nogvl_args* args = arg;
|
|
266
334
|
args->error = 0;
|
|
335
|
+
args->limit_exceeded = 0;
|
|
336
|
+
args->truncated = 0;
|
|
267
337
|
args->error_name = NULL;
|
|
268
338
|
|
|
269
339
|
args->dst_capacity = args->initial_capacity;
|
|
340
|
+
// Never allocate more than the configured limit up front.
|
|
341
|
+
if (args->max_size && args->dst_capacity > args->max_size) {
|
|
342
|
+
args->dst_capacity = args->max_size;
|
|
343
|
+
}
|
|
270
344
|
args->dst = malloc(args->dst_capacity);
|
|
271
345
|
if (!args->dst) {
|
|
272
346
|
args->error = 1;
|
|
@@ -276,11 +350,21 @@ decompress_stream_without_gvl(void* arg) {
|
|
|
276
350
|
args->dst_size = 0;
|
|
277
351
|
|
|
278
352
|
ZSTD_inBuffer input = { args->src, args->src_size, 0 };
|
|
353
|
+
size_t last_ret = 1; // sentinel: non-zero = frame not yet complete
|
|
279
354
|
|
|
280
355
|
while (input.pos < input.size) {
|
|
281
356
|
// Ensure we have room for output
|
|
282
357
|
if (args->dst_size >= args->dst_capacity) {
|
|
283
358
|
size_t new_capacity = args->dst_capacity * 2;
|
|
359
|
+
// Clamp growth to the configured limit. If we cannot grow past the
|
|
360
|
+
// current capacity, the output would exceed the limit.
|
|
361
|
+
if (args->max_size && new_capacity > args->max_size) {
|
|
362
|
+
new_capacity = args->max_size;
|
|
363
|
+
}
|
|
364
|
+
if (new_capacity <= args->dst_capacity) {
|
|
365
|
+
args->limit_exceeded = 1;
|
|
366
|
+
return NULL;
|
|
367
|
+
}
|
|
284
368
|
char* new_buf = realloc(args->dst, new_capacity);
|
|
285
369
|
if (!new_buf) {
|
|
286
370
|
args->error = 1;
|
|
@@ -305,14 +389,74 @@ decompress_stream_without_gvl(void* arg) {
|
|
|
305
389
|
}
|
|
306
390
|
|
|
307
391
|
args->dst_size += output.pos;
|
|
392
|
+
last_ret = ret;
|
|
308
393
|
|
|
309
394
|
// ret == 0 means frame is complete
|
|
310
395
|
if (ret == 0) break;
|
|
311
396
|
}
|
|
312
397
|
|
|
398
|
+
// If we consumed all input but the last call still reported a non-zero hint
|
|
399
|
+
// (more input needed), the frame was cut short — flag it as truncated.
|
|
400
|
+
if (last_ret != 0) {
|
|
401
|
+
args->truncated = 1;
|
|
402
|
+
}
|
|
403
|
+
|
|
313
404
|
return NULL;
|
|
314
405
|
}
|
|
315
406
|
|
|
407
|
+
// State for the rb_ensure-wrapped unknown-size decompression path.
|
|
408
|
+
// Groups everything the body needs to run the no-GVL stream loop and everything
|
|
409
|
+
// the cleanup needs to release on any exit (raise, async exception, success).
|
|
410
|
+
typedef struct {
|
|
411
|
+
ZSTD_DCtx* dctx;
|
|
412
|
+
ZSTD_DDict* ddict;
|
|
413
|
+
decompress_stream_nogvl_args* args;
|
|
414
|
+
VALUE data;
|
|
415
|
+
size_t max_size;
|
|
416
|
+
} dctx_stream_decompress_state;
|
|
417
|
+
|
|
418
|
+
// Body: run the no-GVL stream loop (source string locked), check the outcome,
|
|
419
|
+
// and build the result string. Raising here is safe: cleanup always runs.
|
|
420
|
+
static VALUE
|
|
421
|
+
vibe_zstd_dctx_stream_decompress_body(VALUE p) {
|
|
422
|
+
dctx_stream_decompress_state* state = (dctx_stream_decompress_state*)p;
|
|
423
|
+
|
|
424
|
+
// Lock the source string while the GVL is released: another Ruby thread
|
|
425
|
+
// holding the same string must not mutate or GC it mid-decompression.
|
|
426
|
+
vibe_zstd_nogvl_with_str_locked(decompress_stream_without_gvl, state->args, state->data);
|
|
427
|
+
|
|
428
|
+
if (state->args->limit_exceeded) {
|
|
429
|
+
rb_raise(rb_eDecompressedSizeExceeded,
|
|
430
|
+
"Decompressed output exceeds limit of %zu bytes", state->max_size);
|
|
431
|
+
}
|
|
432
|
+
|
|
433
|
+
if (state->args->error) {
|
|
434
|
+
rb_raise(rb_eRuntimeError, "Decompression failed: %s", state->args->error_name);
|
|
435
|
+
}
|
|
436
|
+
|
|
437
|
+
if (state->args->truncated) {
|
|
438
|
+
rb_raise(rb_eRuntimeError, "Truncated frame: incomplete zstd data");
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
// Create Ruby string from the C buffer; cleanup frees the buffer
|
|
442
|
+
return rb_str_new(state->args->dst, state->args->dst_size);
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
// Cleanup: free the C output buffer and return the context to no-dictionary
|
|
446
|
+
// mode so subsequent calls on this DCtx are not affected.
|
|
447
|
+
static VALUE
|
|
448
|
+
vibe_zstd_dctx_stream_decompress_cleanup(VALUE p) {
|
|
449
|
+
dctx_stream_decompress_state* state = (dctx_stream_decompress_state*)p;
|
|
450
|
+
if (state->args->dst) {
|
|
451
|
+
free(state->args->dst);
|
|
452
|
+
state->args->dst = NULL;
|
|
453
|
+
}
|
|
454
|
+
if (state->ddict) {
|
|
455
|
+
ZSTD_DCtx_refDDict(state->dctx, NULL);
|
|
456
|
+
}
|
|
457
|
+
return Qnil;
|
|
458
|
+
}
|
|
459
|
+
|
|
316
460
|
// DCtx frame_content_size - class method to get frame content size
|
|
317
461
|
static VALUE
|
|
318
462
|
vibe_zstd_dctx_frame_content_size(VALUE self, VALUE data) {
|
|
@@ -353,35 +497,52 @@ vibe_zstd_dctx_decompress(int argc, VALUE* argv, VALUE self) {
|
|
|
353
497
|
size_t srcSize = RSTRING_LEN(data);
|
|
354
498
|
size_t offset = 0;
|
|
355
499
|
|
|
356
|
-
//
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
500
|
+
// Magicless frames (format = ZSTD_f_zstd1_magicless) carry no magic number,
|
|
501
|
+
// so frame introspection (content size, dict ID, skippable detection) cannot
|
|
502
|
+
// be performed. Force the streaming decompress path, which honors the format
|
|
503
|
+
// parameter set on the context via ZSTD_decompressStream.
|
|
504
|
+
int dformat = 0;
|
|
505
|
+
(void)ZSTD_DCtx_getParameter(dctx->dctx, ZSTD_d_format, &dformat);
|
|
506
|
+
int magicless = (dformat == ZSTD_f_zstd1_magicless);
|
|
507
|
+
|
|
508
|
+
unsigned long long contentSize;
|
|
509
|
+
unsigned int frame_dict_id;
|
|
510
|
+
|
|
511
|
+
if (magicless) {
|
|
512
|
+
contentSize = ZSTD_CONTENTSIZE_UNKNOWN; // route to streaming path
|
|
513
|
+
frame_dict_id = 0; // cannot read dict ID without magic
|
|
514
|
+
} else {
|
|
515
|
+
// Skip any leading skippable frames
|
|
516
|
+
while (offset < srcSize && ZSTD_isSkippableFrame(src + offset, srcSize - offset)) {
|
|
517
|
+
size_t frameSize = ZSTD_findFrameCompressedSize(src + offset, srcSize - offset);
|
|
518
|
+
if (ZSTD_isError(frameSize)) {
|
|
519
|
+
rb_raise(rb_eRuntimeError, "Invalid skippable frame at offset %zu: %s", offset, ZSTD_getErrorName(frameSize));
|
|
520
|
+
}
|
|
521
|
+
offset += frameSize;
|
|
361
522
|
}
|
|
362
|
-
offset += frameSize;
|
|
363
|
-
}
|
|
364
523
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
524
|
+
// Now check the actual compressed frame
|
|
525
|
+
if (offset >= srcSize) {
|
|
526
|
+
rb_raise(rb_eRuntimeError, "No compressed frame found in %zu bytes (only skippable frames)", srcSize);
|
|
527
|
+
}
|
|
369
528
|
|
|
370
|
-
|
|
371
|
-
|
|
529
|
+
src += offset;
|
|
530
|
+
srcSize -= offset;
|
|
372
531
|
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
532
|
+
contentSize = ZSTD_getFrameContentSize(src, srcSize);
|
|
533
|
+
if (contentSize == ZSTD_CONTENTSIZE_ERROR) {
|
|
534
|
+
rb_raise(rb_eRuntimeError, "Invalid compressed data: not a valid zstd frame (size: %zu bytes)", srcSize);
|
|
535
|
+
}
|
|
377
536
|
|
|
378
|
-
|
|
379
|
-
|
|
537
|
+
// Check dictionary requirements from the frame
|
|
538
|
+
frame_dict_id = ZSTD_getDictID_fromFrame(src, srcSize);
|
|
539
|
+
}
|
|
380
540
|
|
|
381
541
|
// Extract keyword arguments
|
|
382
542
|
ZSTD_DDict* ddict = NULL;
|
|
383
543
|
unsigned int provided_dict_id = 0;
|
|
384
544
|
size_t initial_capacity = 0; // 0 = not specified in per-call options
|
|
545
|
+
size_t max_size = 0; // 0 = not specified in per-call options
|
|
385
546
|
|
|
386
547
|
if (!NIL_P(options)) {
|
|
387
548
|
VALUE dict_val = rb_hash_aref(options, ID2SYM(rb_intern("dict")));
|
|
@@ -399,6 +560,27 @@ vibe_zstd_dctx_decompress(int argc, VALUE* argv, VALUE self) {
|
|
|
399
560
|
rb_raise(rb_eArgError, "initial_capacity must be positive");
|
|
400
561
|
}
|
|
401
562
|
}
|
|
563
|
+
|
|
564
|
+
// Per-call output-size limit; accepts :max_decompressed_size or :max_size.
|
|
565
|
+
VALUE max_size_val = rb_hash_aref(options, ID2SYM(rb_intern("max_decompressed_size")));
|
|
566
|
+
if (NIL_P(max_size_val)) {
|
|
567
|
+
max_size_val = rb_hash_aref(options, ID2SYM(rb_intern("max_size")));
|
|
568
|
+
}
|
|
569
|
+
if (!NIL_P(max_size_val)) {
|
|
570
|
+
max_size = NUM2SIZET(max_size_val);
|
|
571
|
+
if (max_size == 0) {
|
|
572
|
+
rb_raise(rb_eArgError, "max_decompressed_size must be positive");
|
|
573
|
+
}
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
// Resolve max_size fallback chain: per-call > instance > class default.
|
|
578
|
+
// A value of 0 at every level means unlimited.
|
|
579
|
+
if (max_size == 0) {
|
|
580
|
+
max_size = dctx->max_decompressed_size; // instance
|
|
581
|
+
if (max_size == 0) {
|
|
582
|
+
max_size = default_max_decompressed_size; // class
|
|
583
|
+
}
|
|
402
584
|
}
|
|
403
585
|
|
|
404
586
|
// Validate dictionary matches frame requirements
|
|
@@ -426,6 +608,17 @@ vibe_zstd_dctx_decompress(int argc, VALUE* argv, VALUE self) {
|
|
|
426
608
|
// Releases GVL to allow other Ruby threads to run during decompression.
|
|
427
609
|
// Uses C malloc/realloc (not Ruby allocators) since Ruby API calls are forbidden without GVL.
|
|
428
610
|
if (contentSize == ZSTD_CONTENTSIZE_UNKNOWN) {
|
|
611
|
+
// Reference the dictionary on the context before streaming decompression.
|
|
612
|
+
// ZSTD_decompressStream uses whatever dict is referenced on the DCtx, so
|
|
613
|
+
// without this the dictionary would be ignored on the unknown-size path
|
|
614
|
+
// (every dict frame produced by CompressWriter has unknown content size).
|
|
615
|
+
if (ddict) {
|
|
616
|
+
size_t rd = ZSTD_DCtx_refDDict(dctx->dctx, ddict);
|
|
617
|
+
if (ZSTD_isError(rd)) {
|
|
618
|
+
rb_raise(rb_eRuntimeError, "Failed to reference dictionary: %s", ZSTD_getErrorName(rd));
|
|
619
|
+
}
|
|
620
|
+
}
|
|
621
|
+
|
|
429
622
|
decompress_stream_nogvl_args stream_args = {
|
|
430
623
|
.dctx = dctx->dctx,
|
|
431
624
|
.src = src,
|
|
@@ -434,22 +627,34 @@ vibe_zstd_dctx_decompress(int argc, VALUE* argv, VALUE self) {
|
|
|
434
627
|
.dst_capacity = 0,
|
|
435
628
|
.dst_size = 0,
|
|
436
629
|
.initial_capacity = initial_capacity,
|
|
630
|
+
.max_size = max_size,
|
|
437
631
|
.error = 0,
|
|
632
|
+
.limit_exceeded = 0,
|
|
633
|
+
.truncated = 0,
|
|
438
634
|
.error_name = NULL
|
|
439
635
|
};
|
|
440
636
|
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
637
|
+
// Run the streaming decompression and build the result under rb_ensure:
|
|
638
|
+
// the cleanup frees the C buffer and un-references the dictionary on
|
|
639
|
+
// every exit path, including the raises below and async exceptions
|
|
640
|
+
// delivered when the GVL is reacquired.
|
|
641
|
+
dctx_stream_decompress_state state = {
|
|
642
|
+
.dctx = dctx->dctx,
|
|
643
|
+
.ddict = ddict,
|
|
644
|
+
.args = &stream_args,
|
|
645
|
+
.data = data,
|
|
646
|
+
.max_size = max_size
|
|
647
|
+
};
|
|
648
|
+
return rb_ensure(vibe_zstd_dctx_stream_decompress_body, (VALUE)&state,
|
|
649
|
+
vibe_zstd_dctx_stream_decompress_cleanup, (VALUE)&state);
|
|
452
650
|
}
|
|
651
|
+
// Reject a frame whose declared content size exceeds the limit before
|
|
652
|
+
// allocating the output buffer (the header is attacker-controlled).
|
|
653
|
+
if (max_size && contentSize > (unsigned long long)max_size) {
|
|
654
|
+
rb_raise(rb_eDecompressedSizeExceeded,
|
|
655
|
+
"Declared content size %llu exceeds limit of %zu bytes", contentSize, max_size);
|
|
656
|
+
}
|
|
657
|
+
|
|
453
658
|
VALUE result = rb_str_new(NULL, contentSize);
|
|
454
659
|
decompress_args args = {
|
|
455
660
|
.dctx = dctx->dctx,
|
|
@@ -460,7 +665,11 @@ vibe_zstd_dctx_decompress(int argc, VALUE* argv, VALUE self) {
|
|
|
460
665
|
.dstCapacity = contentSize,
|
|
461
666
|
.result = 0
|
|
462
667
|
};
|
|
463
|
-
|
|
668
|
+
// Lock the source string while the GVL is released: another Ruby thread
|
|
669
|
+
// holding the same string must not mutate or GC it mid-decompression.
|
|
670
|
+
// The helper unlocks via rb_ensure so an async exception cannot leave
|
|
671
|
+
// the string permanently locked.
|
|
672
|
+
vibe_zstd_nogvl_with_str_locked(decompress_without_gvl, &args, data);
|
|
464
673
|
if (ZSTD_isError(args.result)) {
|
|
465
674
|
rb_raise(rb_eRuntimeError, "Decompression failed: %s", ZSTD_getErrorName(args.result));
|
|
466
675
|
}
|
|
@@ -525,6 +734,13 @@ vibe_zstd_dctx_init_class(VALUE rb_cVibeZstdDCtx) {
|
|
|
525
734
|
// Initialize parameter lookup table
|
|
526
735
|
init_dctx_param_table();
|
|
527
736
|
|
|
737
|
+
// Define VibeZstd::Error (base) and VibeZstd::DecompressedSizeExceeded.
|
|
738
|
+
// Defined here (rather than only in Ruby) so the error is available even if
|
|
739
|
+
// the C extension is required without the Ruby wrapper. Ruby's
|
|
740
|
+
// `class Error < StandardError` simply reopens the same class.
|
|
741
|
+
VALUE rb_eVibeZstdError = rb_define_class_under(rb_mVibeZstd, "Error", rb_eStandardError);
|
|
742
|
+
rb_eDecompressedSizeExceeded = rb_define_class_under(rb_mVibeZstd, "DecompressedSizeExceeded", rb_eVibeZstdError);
|
|
743
|
+
|
|
528
744
|
rb_define_alloc_func(rb_cVibeZstdDCtx, vibe_zstd_dctx_alloc);
|
|
529
745
|
rb_define_method(rb_cVibeZstdDCtx, "initialize", vibe_zstd_dctx_initialize, -1);
|
|
530
746
|
rb_define_method(rb_cVibeZstdDCtx, "decompress", vibe_zstd_dctx_decompress, -1);
|
|
@@ -543,8 +759,20 @@ vibe_zstd_dctx_init_class(VALUE rb_cVibeZstdDCtx) {
|
|
|
543
759
|
rb_define_method(rb_cVibeZstdDCtx, "window_log_max", vibe_zstd_dctx_get_window_log_max, 0);
|
|
544
760
|
rb_define_alias(rb_cVibeZstdDCtx, "max_window_log=", "window_log_max=");
|
|
545
761
|
rb_define_alias(rb_cVibeZstdDCtx, "max_window_log", "window_log_max");
|
|
762
|
+
rb_define_method(rb_cVibeZstdDCtx, "format=", vibe_zstd_dctx_set_format, 1);
|
|
763
|
+
rb_define_method(rb_cVibeZstdDCtx, "format", vibe_zstd_dctx_get_format, 0);
|
|
546
764
|
|
|
547
765
|
// Instance-level initial_capacity accessors
|
|
548
766
|
rb_define_method(rb_cVibeZstdDCtx, "initial_capacity", vibe_zstd_dctx_get_initial_capacity, 0);
|
|
549
767
|
rb_define_method(rb_cVibeZstdDCtx, "initial_capacity=", vibe_zstd_dctx_set_initial_capacity, 1);
|
|
768
|
+
|
|
769
|
+
// Class-level default_max_decompressed_size accessors (0 = unlimited)
|
|
770
|
+
rb_define_singleton_method(rb_cVibeZstdDCtx, "default_max_decompressed_size", vibe_zstd_dctx_get_default_max_decompressed_size, 0);
|
|
771
|
+
rb_define_singleton_method(rb_cVibeZstdDCtx, "default_max_decompressed_size=", vibe_zstd_dctx_set_default_max_decompressed_size, 1);
|
|
772
|
+
|
|
773
|
+
// Instance-level max_decompressed_size accessors (with shorter max_size alias)
|
|
774
|
+
rb_define_method(rb_cVibeZstdDCtx, "max_decompressed_size", vibe_zstd_dctx_get_max_decompressed_size, 0);
|
|
775
|
+
rb_define_method(rb_cVibeZstdDCtx, "max_decompressed_size=", vibe_zstd_dctx_set_max_decompressed_size, 1);
|
|
776
|
+
rb_define_alias(rb_cVibeZstdDCtx, "max_size", "max_decompressed_size");
|
|
777
|
+
rb_define_alias(rb_cVibeZstdDCtx, "max_size=", "max_decompressed_size=");
|
|
550
778
|
}
|
data/ext/vibe_zstd/dict.c
CHANGED
|
@@ -125,13 +125,21 @@ dict_training_cleanup(VALUE arg) {
|
|
|
125
125
|
return Qnil;
|
|
126
126
|
}
|
|
127
127
|
|
|
128
|
-
// Copy Ruby sample strings into contiguous C buffer for ZDICT functions
|
|
128
|
+
// Copy Ruby sample strings into contiguous C buffer for ZDICT functions.
|
|
129
|
+
// capacity is the total_samples_size measured during the validation pass.
|
|
130
|
+
// If any sample has grown since validation (TOCTOU mutation), we raise rather
|
|
131
|
+
// than overflow the buffer.
|
|
129
132
|
static void
|
|
130
|
-
copy_samples_to_buffer(dict_training_resources* resources, VALUE samples, long num_samples
|
|
133
|
+
copy_samples_to_buffer(dict_training_resources* resources, VALUE samples, long num_samples,
|
|
134
|
+
size_t capacity) {
|
|
131
135
|
size_t offset = 0;
|
|
132
136
|
for (long i = 0; i < num_samples; i++) {
|
|
133
137
|
VALUE sample = rb_ary_entry(samples, i);
|
|
134
138
|
size_t sample_len = RSTRING_LEN(sample);
|
|
139
|
+
// Guard against mutation between the validation pass and this copy.
|
|
140
|
+
if (sample_len > capacity - offset) {
|
|
141
|
+
rb_raise(rb_eRuntimeError, "sample mutated during dictionary training");
|
|
142
|
+
}
|
|
135
143
|
resources->sample_sizes[i] = sample_len;
|
|
136
144
|
memcpy(resources->samples_buffer + offset, RSTRING_PTR(sample), sample_len);
|
|
137
145
|
offset += sample_len;
|
|
@@ -148,12 +156,14 @@ typedef struct {
|
|
|
148
156
|
VALUE result;
|
|
149
157
|
size_t max_dict_size;
|
|
150
158
|
long num_samples;
|
|
151
|
-
|
|
159
|
+
size_t total_samples_size; // measured during validation; passed to copy_samples_to_buffer
|
|
160
|
+
VALUE samples; // private converted-samples array built during validation
|
|
152
161
|
} dict_training_ctx;
|
|
153
162
|
|
|
154
163
|
static VALUE train_dict_basic_body(VALUE arg) {
|
|
155
164
|
dict_training_ctx* ctx = (dict_training_ctx*)arg;
|
|
156
|
-
copy_samples_to_buffer(ctx->resources, ctx->samples, ctx->num_samples
|
|
165
|
+
copy_samples_to_buffer(ctx->resources, ctx->samples, ctx->num_samples,
|
|
166
|
+
ctx->total_samples_size);
|
|
157
167
|
size_t dict_size = ZDICT_trainFromBuffer(
|
|
158
168
|
ctx->resources->dict_buffer, ctx->max_dict_size,
|
|
159
169
|
ctx->resources->samples_buffer, ctx->resources->sample_sizes, (unsigned)ctx->num_samples
|
|
@@ -172,7 +182,8 @@ typedef struct {
|
|
|
172
182
|
|
|
173
183
|
static VALUE train_dict_cover_body(VALUE arg) {
|
|
174
184
|
train_dict_cover_ctx* ctx = (train_dict_cover_ctx*)arg;
|
|
175
|
-
copy_samples_to_buffer(ctx->base.resources, ctx->base.samples, ctx->base.num_samples
|
|
185
|
+
copy_samples_to_buffer(ctx->base.resources, ctx->base.samples, ctx->base.num_samples,
|
|
186
|
+
ctx->base.total_samples_size);
|
|
176
187
|
size_t dict_size = ZDICT_trainFromBuffer_cover(
|
|
177
188
|
ctx->base.resources->dict_buffer, ctx->base.max_dict_size,
|
|
178
189
|
ctx->base.resources->samples_buffer, ctx->base.resources->sample_sizes, (unsigned)ctx->base.num_samples,
|
|
@@ -192,7 +203,8 @@ typedef struct {
|
|
|
192
203
|
|
|
193
204
|
static VALUE train_dict_fast_cover_body(VALUE arg) {
|
|
194
205
|
train_dict_fast_cover_ctx* ctx = (train_dict_fast_cover_ctx*)arg;
|
|
195
|
-
copy_samples_to_buffer(ctx->base.resources, ctx->base.samples, ctx->base.num_samples
|
|
206
|
+
copy_samples_to_buffer(ctx->base.resources, ctx->base.samples, ctx->base.num_samples,
|
|
207
|
+
ctx->base.total_samples_size);
|
|
196
208
|
size_t dict_size = ZDICT_trainFromBuffer_fastCover(
|
|
197
209
|
ctx->base.resources->dict_buffer, ctx->base.max_dict_size,
|
|
198
210
|
ctx->base.resources->samples_buffer, ctx->base.resources->sample_sizes, (unsigned)ctx->base.num_samples,
|
|
@@ -213,7 +225,8 @@ typedef struct {
|
|
|
213
225
|
|
|
214
226
|
static VALUE finalize_dict_body(VALUE arg) {
|
|
215
227
|
finalize_dict_ctx* ctx = (finalize_dict_ctx*)arg;
|
|
216
|
-
copy_samples_to_buffer(ctx->base.resources, ctx->base.samples, ctx->base.num_samples
|
|
228
|
+
copy_samples_to_buffer(ctx->base.resources, ctx->base.samples, ctx->base.num_samples,
|
|
229
|
+
ctx->base.total_samples_size);
|
|
217
230
|
size_t dict_size = ZDICT_finalizeDictionary(
|
|
218
231
|
ctx->base.resources->dict_buffer, ctx->base.max_dict_size,
|
|
219
232
|
RSTRING_PTR(ctx->content_val), RSTRING_LEN(ctx->content_val),
|
|
@@ -245,11 +258,17 @@ vibe_zstd_train_dict(int argc, VALUE* argv, VALUE self) {
|
|
|
245
258
|
rb_raise(rb_eArgError, "samples array cannot be empty");
|
|
246
259
|
}
|
|
247
260
|
|
|
248
|
-
// Validate all samples are strings and calculate sizes BEFORE allocating
|
|
261
|
+
// Validate all samples are strings and calculate sizes BEFORE allocating.
|
|
262
|
+
// Build a private converted-samples array so that StringValue conversions are
|
|
263
|
+
// retained: rb_ary_entry re-fetches the raw element, so storing the converted
|
|
264
|
+
// value here ensures copy_samples_to_buffer operates on real String objects
|
|
265
|
+
// rather than potentially-non-String objects that merely respond to to_str.
|
|
266
|
+
VALUE converted_samples = rb_ary_new_capa(num_samples);
|
|
249
267
|
size_t total_samples_size = 0;
|
|
250
268
|
for (long i = 0; i < num_samples; i++) {
|
|
251
269
|
VALUE sample = rb_ary_entry(samples, i);
|
|
252
|
-
StringValue(sample); // Validate type early - may raise TypeError
|
|
270
|
+
StringValue(sample); // Validate type early - may raise TypeError; updates local
|
|
271
|
+
rb_ary_push(converted_samples, sample);
|
|
253
272
|
total_samples_size += RSTRING_LEN(sample);
|
|
254
273
|
}
|
|
255
274
|
|
|
@@ -274,7 +293,8 @@ vibe_zstd_train_dict(int argc, VALUE* argv, VALUE self) {
|
|
|
274
293
|
.result = Qnil,
|
|
275
294
|
.max_dict_size = max_dict_size,
|
|
276
295
|
.num_samples = num_samples,
|
|
277
|
-
.
|
|
296
|
+
.total_samples_size = total_samples_size,
|
|
297
|
+
.samples = converted_samples // use private array, not caller's array
|
|
278
298
|
};
|
|
279
299
|
|
|
280
300
|
rb_ensure(train_dict_basic_body, (VALUE)&ctx, dict_training_cleanup, (VALUE)&resources);
|
|
@@ -298,11 +318,14 @@ vibe_zstd_train_dict_cover(int argc, VALUE* argv, VALUE self) {
|
|
|
298
318
|
rb_raise(rb_eArgError, "samples array cannot be empty");
|
|
299
319
|
}
|
|
300
320
|
|
|
301
|
-
// Validate all samples are strings and calculate sizes BEFORE allocating
|
|
321
|
+
// Validate all samples are strings and calculate sizes BEFORE allocating.
|
|
322
|
+
// Build a private converted-samples array (see vibe_zstd_train_dict for details).
|
|
323
|
+
VALUE converted_samples = rb_ary_new_capa(num_samples);
|
|
302
324
|
size_t total_samples_size = 0;
|
|
303
325
|
for (long i = 0; i < num_samples; i++) {
|
|
304
326
|
VALUE sample = rb_ary_entry(samples, i);
|
|
305
|
-
StringValue(sample); // Validate type early - may raise TypeError
|
|
327
|
+
StringValue(sample); // Validate type early - may raise TypeError; updates local
|
|
328
|
+
rb_ary_push(converted_samples, sample);
|
|
306
329
|
total_samples_size += RSTRING_LEN(sample);
|
|
307
330
|
}
|
|
308
331
|
|
|
@@ -358,7 +381,8 @@ vibe_zstd_train_dict_cover(int argc, VALUE* argv, VALUE self) {
|
|
|
358
381
|
.result = Qnil,
|
|
359
382
|
.max_dict_size = max_dict_size,
|
|
360
383
|
.num_samples = num_samples,
|
|
361
|
-
.
|
|
384
|
+
.total_samples_size = total_samples_size,
|
|
385
|
+
.samples = converted_samples // use private array, not caller's array
|
|
362
386
|
},
|
|
363
387
|
.params = params
|
|
364
388
|
};
|
|
@@ -384,11 +408,14 @@ vibe_zstd_train_dict_fast_cover(int argc, VALUE* argv, VALUE self) {
|
|
|
384
408
|
rb_raise(rb_eArgError, "samples array cannot be empty");
|
|
385
409
|
}
|
|
386
410
|
|
|
387
|
-
// Validate all samples are strings and calculate sizes BEFORE allocating
|
|
411
|
+
// Validate all samples are strings and calculate sizes BEFORE allocating.
|
|
412
|
+
// Build a private converted-samples array (see vibe_zstd_train_dict for details).
|
|
413
|
+
VALUE converted_samples = rb_ary_new_capa(num_samples);
|
|
388
414
|
size_t total_samples_size = 0;
|
|
389
415
|
for (long i = 0; i < num_samples; i++) {
|
|
390
416
|
VALUE sample = rb_ary_entry(samples, i);
|
|
391
|
-
StringValue(sample); // Validate type early - may raise TypeError
|
|
417
|
+
StringValue(sample); // Validate type early - may raise TypeError; updates local
|
|
418
|
+
rb_ary_push(converted_samples, sample);
|
|
392
419
|
total_samples_size += RSTRING_LEN(sample);
|
|
393
420
|
}
|
|
394
421
|
|
|
@@ -447,7 +474,8 @@ vibe_zstd_train_dict_fast_cover(int argc, VALUE* argv, VALUE self) {
|
|
|
447
474
|
.result = Qnil,
|
|
448
475
|
.max_dict_size = max_dict_size,
|
|
449
476
|
.num_samples = num_samples,
|
|
450
|
-
.
|
|
477
|
+
.total_samples_size = total_samples_size,
|
|
478
|
+
.samples = converted_samples // use private array, not caller's array
|
|
451
479
|
},
|
|
452
480
|
.params = params
|
|
453
481
|
};
|
|
@@ -514,11 +542,14 @@ vibe_zstd_finalize_dictionary(int argc, VALUE* argv, VALUE self) {
|
|
|
514
542
|
rb_raise(rb_eArgError, "samples array cannot be empty");
|
|
515
543
|
}
|
|
516
544
|
|
|
517
|
-
// Validate all samples are strings and calculate sizes BEFORE allocating
|
|
545
|
+
// Validate all samples are strings and calculate sizes BEFORE allocating.
|
|
546
|
+
// Build a private converted-samples array (see vibe_zstd_train_dict for details).
|
|
547
|
+
VALUE converted_samples = rb_ary_new_capa(num_samples);
|
|
518
548
|
size_t total_samples_size = 0;
|
|
519
549
|
for (long i = 0; i < num_samples; i++) {
|
|
520
550
|
VALUE sample = rb_ary_entry(samples_val, i);
|
|
521
|
-
StringValue(sample); // Validate type early - may raise TypeError
|
|
551
|
+
StringValue(sample); // Validate type early - may raise TypeError; updates local
|
|
552
|
+
rb_ary_push(converted_samples, sample);
|
|
522
553
|
total_samples_size += RSTRING_LEN(sample);
|
|
523
554
|
}
|
|
524
555
|
|
|
@@ -546,7 +577,8 @@ vibe_zstd_finalize_dictionary(int argc, VALUE* argv, VALUE self) {
|
|
|
546
577
|
.result = Qnil,
|
|
547
578
|
.max_dict_size = max_size,
|
|
548
579
|
.num_samples = num_samples,
|
|
549
|
-
.
|
|
580
|
+
.total_samples_size = total_samples_size,
|
|
581
|
+
.samples = converted_samples // use private array, not caller's array
|
|
550
582
|
},
|
|
551
583
|
.content_val = content_val,
|
|
552
584
|
.params = params
|
data/ext/vibe_zstd/extconf.rb
CHANGED
|
@@ -14,10 +14,13 @@ $INCFLAGS << " -I#{LIBZSTD_DIR}/decompress"
|
|
|
14
14
|
$INCFLAGS << " -I#{LIBZSTD_DIR}/dictBuilder"
|
|
15
15
|
# standard:enable Style/GlobalVars
|
|
16
16
|
|
|
17
|
-
# Add preprocessor definitions
|
|
18
|
-
append_cflags
|
|
19
|
-
|
|
20
|
-
|
|
17
|
+
# Add preprocessor definitions (use $defs so they appear in DEFS in the Makefile,
|
|
18
|
+
# append_cflags only validates the flag but doesn't reliably propagate -D flags)
|
|
19
|
+
# standard:disable Style/GlobalVars
|
|
20
|
+
$defs << "-DXXH_NAMESPACE=ZSTD_"
|
|
21
|
+
$defs << "-DZSTD_LEGACY_SUPPORT=0" # Disable legacy support to reduce size
|
|
22
|
+
$defs << "-DZSTD_MULTITHREAD" # Enable multithreading support
|
|
23
|
+
# standard:enable Style/GlobalVars
|
|
21
24
|
|
|
22
25
|
# Link with pthread for multithreading
|
|
23
26
|
have_library("pthread") || abort("pthread library is required for multithreading support")
|
data/ext/vibe_zstd/frames.c
CHANGED
|
@@ -81,12 +81,23 @@ vibe_zstd_read_skippable_frame(VALUE self, VALUE data) {
|
|
|
81
81
|
uint32_t content_size;
|
|
82
82
|
memcpy(&content_size, src + 4, 4);
|
|
83
83
|
|
|
84
|
-
|
|
84
|
+
// The content size field is attacker-controlled and may claim up to ~4 GiB.
|
|
85
|
+
// A skippable frame's content cannot exceed the bytes actually provided
|
|
86
|
+
// (src_size minus the 8-byte header), so cap the allocation accordingly to
|
|
87
|
+
// prevent a tiny truncated input from forcing a huge allocation. A frame
|
|
88
|
+
// whose declared size exceeds what is present is malformed and
|
|
89
|
+
// ZSTD_readSkippableFrame reports the error below.
|
|
90
|
+
size_t capacity = content_size;
|
|
91
|
+
if (capacity > src_size - 8) {
|
|
92
|
+
capacity = src_size - 8;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
VALUE result = rb_str_buf_new(capacity);
|
|
85
96
|
unsigned magic_variant;
|
|
86
97
|
|
|
87
98
|
size_t bytes_read = ZSTD_readSkippableFrame(
|
|
88
99
|
RSTRING_PTR(result),
|
|
89
|
-
|
|
100
|
+
capacity,
|
|
90
101
|
&magic_variant,
|
|
91
102
|
src,
|
|
92
103
|
src_size
|