zstdlib 0.12.0-x86-mingw32 → 0.13.0-x86-mingw32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGES.md +7 -0
  3. data/Rakefile +1 -1
  4. data/ext/zstdlib_c/extconf.rb +1 -1
  5. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/allocations.h +1 -1
  6. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/bitstream.h +49 -29
  7. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/compiler.h +114 -22
  8. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/cpu.h +36 -0
  9. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/debug.c +6 -0
  10. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/debug.h +20 -11
  11. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/error_private.h +45 -36
  12. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/fse.h +3 -2
  13. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/fse_decompress.c +19 -17
  14. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/huf.h +14 -1
  15. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/mem.h +0 -9
  16. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/pool.c +1 -1
  17. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/pool.h +1 -1
  18. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/portability_macros.h +2 -0
  19. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/threading.c +8 -2
  20. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/xxhash.c +5 -11
  21. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/xxhash.h +2341 -1007
  22. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/zstd_internal.h +5 -5
  23. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/fse_compress.c +8 -7
  24. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/huf_compress.c +54 -25
  25. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_compress.c +282 -161
  26. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_compress_internal.h +29 -27
  27. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_compress_superblock.c +224 -113
  28. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_cwksp.h +19 -13
  29. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_double_fast.c +17 -5
  30. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_double_fast.h +11 -0
  31. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_fast.c +14 -6
  32. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_lazy.c +129 -87
  33. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_lazy.h +103 -28
  34. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_ldm.c +8 -2
  35. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_opt.c +216 -112
  36. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_opt.h +31 -7
  37. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstdmt_compress.c +94 -79
  38. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/decompress/huf_decompress.c +188 -126
  39. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/decompress/huf_decompress_amd64.S +38 -19
  40. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/decompress/zstd_decompress.c +84 -32
  41. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/decompress/zstd_decompress_block.c +231 -208
  42. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/decompress/zstd_decompress_block.h +1 -1
  43. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/decompress/zstd_decompress_internal.h +2 -0
  44. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/zstd.h +129 -60
  45. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/zlibWrapper/gzclose.c +1 -3
  46. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/zlibWrapper/gzlib.c +20 -73
  47. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/zlibWrapper/gzread.c +17 -58
  48. data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/zlibWrapper/gzwrite.c +18 -58
  49. data/lib/2.4/zstdlib_c.so +0 -0
  50. data/lib/2.5/zstdlib_c.so +0 -0
  51. data/lib/2.6/zstdlib_c.so +0 -0
  52. data/lib/2.7/zstdlib_c.so +0 -0
  53. data/lib/3.0/zstdlib_c.so +0 -0
  54. data/lib/3.1/zstdlib_c.so +0 -0
  55. data/lib/3.2/zstdlib_c.so +0 -0
  56. data/lib/3.3/zstdlib_c.so +0 -0
  57. metadata +75 -75
  58. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/bits.h +0 -0
  59. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/entropy_common.c +0 -0
  60. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/error_private.c +0 -0
  61. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/threading.h +0 -0
  62. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/zstd_common.c +0 -0
  63. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/zstd_deps.h +0 -0
  64. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/common/zstd_trace.h +0 -0
  65. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/clevels.h +0 -0
  66. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/hist.c +0 -0
  67. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/hist.h +0 -0
  68. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_compress_literals.c +0 -0
  69. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_compress_literals.h +0 -0
  70. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_compress_sequences.c +0 -0
  71. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_compress_sequences.h +0 -0
  72. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_compress_superblock.h +0 -0
  73. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_fast.h +0 -0
  74. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_ldm.h +0 -0
  75. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstd_ldm_geartab.h +0 -0
  76. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/compress/zstdmt_compress.h +0 -0
  77. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/decompress/zstd_ddict.c +0 -0
  78. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/decompress/zstd_ddict.h +0 -0
  79. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/zdict.h +0 -0
  80. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/lib/zstd_errors.h +0 -0
  81. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/zlibWrapper/gzcompatibility.h +0 -0
  82. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/zlibWrapper/gzguts.h +0 -0
  83. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/zlibWrapper/zstd_zlibwrapper.c +0 -0
  84. /data/ext/zstdlib_c/{zstd-1.5.5 → zstd-1.5.6}/zlibWrapper/zstd_zlibwrapper.h +0 -0
@@ -34,6 +34,12 @@
34
34
  * Macros
35
35
  ****************************************************************/
36
36
 
37
+ #ifdef HUF_DISABLE_FAST_DECODE
38
+ # define HUF_ENABLE_FAST_DECODE 0
39
+ #else
40
+ # define HUF_ENABLE_FAST_DECODE 1
41
+ #endif
42
+
37
43
  /* These two optional macros force the use one way or another of the two
38
44
  * Huffman decompression implementations. You can't force in both directions
39
45
  * at the same time.
@@ -158,17 +164,18 @@ static size_t HUF_initFastDStream(BYTE const* ip) {
158
164
  * op [in/out] - The output pointers, must be updated to reflect what is written.
159
165
  * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
160
166
  * dt [in] - The decoding table.
161
- * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
167
+ * ilowest [in] - The beginning of the valid range of the input. Decoders may read
168
+ * down to this pointer. It may be below iend[0].
162
169
  * oend [in] - The end of the output stream. op[3] must not cross oend.
163
170
  * iend [in] - The end of each input stream. ip[i] may cross iend[i],
164
- * as long as it is above ilimit, but that indicates corruption.
171
+ * as long as it is above ilowest, but that indicates corruption.
165
172
  */
166
173
  typedef struct {
167
174
  BYTE const* ip[4];
168
175
  BYTE* op[4];
169
176
  U64 bits[4];
170
177
  void const* dt;
171
- BYTE const* ilimit;
178
+ BYTE const* ilowest;
172
179
  BYTE* oend;
173
180
  BYTE const* iend[4];
174
181
  } HUF_DecompressFastArgs;
@@ -186,9 +193,9 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
186
193
  void const* dt = DTable + 1;
187
194
  U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
188
195
 
189
- const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
196
+ const BYTE* const istart = (const BYTE*)src;
190
197
 
191
- BYTE* const oend = (BYTE*)dst + dstSize;
198
+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
192
199
 
193
200
  /* The fast decoding loop assumes 64-bit little-endian.
194
201
  * This condition is false on x32.
@@ -196,6 +203,11 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
196
203
  if (!MEM_isLittleEndian() || MEM_32bits())
197
204
  return 0;
198
205
 
206
+ /* Avoid nullptr addition */
207
+ if (dstSize == 0)
208
+ return 0;
209
+ assert(dst != NULL);
210
+
199
211
  /* strict minimum : jump table + 1 byte per stream */
200
212
  if (srcSize < 10)
201
213
  return ERROR(corruption_detected);
@@ -209,7 +221,6 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
209
221
 
210
222
  /* Read the jump table. */
211
223
  {
212
- const BYTE* const istart = (const BYTE*)src;
213
224
  size_t const length1 = MEM_readLE16(istart);
214
225
  size_t const length2 = MEM_readLE16(istart+2);
215
226
  size_t const length3 = MEM_readLE16(istart+4);
@@ -221,10 +232,8 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
221
232
 
222
233
  /* HUF_initFastDStream() requires this, and this small of an input
223
234
  * won't benefit from the ASM loop anyways.
224
- * length1 must be >= 16 so that ip[0] >= ilimit before the loop
225
- * starts.
226
235
  */
227
- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
236
+ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8)
228
237
  return 0;
229
238
  if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
230
239
  }
@@ -256,11 +265,12 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
256
265
  args->bits[2] = HUF_initFastDStream(args->ip[2]);
257
266
  args->bits[3] = HUF_initFastDStream(args->ip[3]);
258
267
 
259
- /* If ip[] >= ilimit, it is guaranteed to be safe to
260
- * reload bits[]. It may be beyond its section, but is
261
- * guaranteed to be valid (>= istart).
262
- */
263
- args->ilimit = ilimit;
268
+ /* The decoders must be sure to never read below ilowest.
269
+ * This is lower than iend[0], but allowing decoders to read
270
+ * down to ilowest can allow an extra iteration or two in the
271
+ * fast loop.
272
+ */
273
+ args->ilowest = istart;
264
274
 
265
275
  args->oend = oend;
266
276
  args->dt = dt;
@@ -285,13 +295,31 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArg
285
295
  assert(sizeof(size_t) == 8);
286
296
  bit->bitContainer = MEM_readLEST(args->ip[stream]);
287
297
  bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
288
- bit->start = (const char*)args->iend[0];
298
+ bit->start = (const char*)args->ilowest;
289
299
  bit->limitPtr = bit->start + sizeof(size_t);
290
300
  bit->ptr = (const char*)args->ip[stream];
291
301
 
292
302
  return 0;
293
303
  }
294
304
 
305
+ /* Calls X(N) for each stream 0, 1, 2, 3. */
306
+ #define HUF_4X_FOR_EACH_STREAM(X) \
307
+ do { \
308
+ X(0); \
309
+ X(1); \
310
+ X(2); \
311
+ X(3); \
312
+ } while (0)
313
+
314
+ /* Calls X(N, var) for each stream 0, 1, 2, 3. */
315
+ #define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \
316
+ do { \
317
+ X(0, (var)); \
318
+ X(1, (var)); \
319
+ X(2, (var)); \
320
+ X(3, (var)); \
321
+ } while (0)
322
+
295
323
 
296
324
  #ifndef HUF_FORCE_DECOMPRESS_X2
297
325
 
@@ -500,15 +528,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog
500
528
  }
501
529
 
502
530
  #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
503
- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
531
+ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0)
504
532
 
505
- #define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \
506
- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
507
- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
533
+ #define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \
534
+ do { \
535
+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
536
+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
537
+ } while (0)
508
538
 
509
- #define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
510
- if (MEM_64bits()) \
511
- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
539
+ #define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
540
+ do { \
541
+ if (MEM_64bits()) \
542
+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
543
+ } while (0)
512
544
 
513
545
  HINT_INLINE size_t
514
546
  HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
@@ -546,7 +578,7 @@ HUF_decompress1X1_usingDTable_internal_body(
546
578
  const HUF_DTable* DTable)
547
579
  {
548
580
  BYTE* op = (BYTE*)dst;
549
- BYTE* const oend = op + dstSize;
581
+ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize);
550
582
  const void* dtPtr = DTable + 1;
551
583
  const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
552
584
  BIT_DStream_t bitD;
@@ -574,6 +606,7 @@ HUF_decompress4X1_usingDTable_internal_body(
574
606
  {
575
607
  /* Check */
576
608
  if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
609
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
577
610
 
578
611
  { const BYTE* const istart = (const BYTE*) cSrc;
579
612
  BYTE* const ostart = (BYTE*) dst;
@@ -609,7 +642,7 @@ HUF_decompress4X1_usingDTable_internal_body(
609
642
 
610
643
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
611
644
  if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
612
- if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
645
+ assert(dstSize >= 6); /* validated above */
613
646
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
614
647
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
615
648
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -692,7 +725,7 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
692
725
  BYTE* op[4];
693
726
  U16 const* const dtable = (U16 const*)args->dt;
694
727
  BYTE* const oend = args->oend;
695
- BYTE const* const ilimit = args->ilimit;
728
+ BYTE const* const ilowest = args->ilowest;
696
729
 
697
730
  /* Copy the arguments to local variables */
698
731
  ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
@@ -705,13 +738,12 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
705
738
  for (;;) {
706
739
  BYTE* olimit;
707
740
  int stream;
708
- int symbol;
709
741
 
710
742
  /* Assert loop preconditions */
711
743
  #ifndef NDEBUG
712
744
  for (stream = 0; stream < 4; ++stream) {
713
745
  assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
714
- assert(ip[stream] >= ilimit);
746
+ assert(ip[stream] >= ilowest);
715
747
  }
716
748
  #endif
717
749
  /* Compute olimit */
@@ -721,7 +753,7 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
721
753
  /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
722
754
  * per stream.
723
755
  */
724
- size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
756
+ size_t const iiters = (size_t)(ip[0] - ilowest) / 7;
725
757
  /* We can safely run iters iterations before running bounds checks */
726
758
  size_t const iters = MIN(oiters, iiters);
727
759
  size_t const symbols = iters * 5;
@@ -732,8 +764,8 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
732
764
  */
733
765
  olimit = op[3] + symbols;
734
766
 
735
- /* Exit fast decoding loop once we get close to the end. */
736
- if (op[3] + 20 > olimit)
767
+ /* Exit fast decoding loop once we reach the end. */
768
+ if (op[3] == olimit)
737
769
  break;
738
770
 
739
771
  /* Exit the decoding loop if any input pointer has crossed the
@@ -752,27 +784,42 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
752
784
  }
753
785
  #endif
754
786
 
787
+ #define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \
788
+ do { \
789
+ int const index = (int)(bits[(_stream)] >> 53); \
790
+ int const entry = (int)dtable[index]; \
791
+ bits[(_stream)] <<= (entry & 0x3F); \
792
+ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
793
+ } while (0)
794
+
795
+ #define HUF_4X1_RELOAD_STREAM(_stream) \
796
+ do { \
797
+ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
798
+ int const nbBits = ctz & 7; \
799
+ int const nbBytes = ctz >> 3; \
800
+ op[(_stream)] += 5; \
801
+ ip[(_stream)] -= nbBytes; \
802
+ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
803
+ bits[(_stream)] <<= nbBits; \
804
+ } while (0)
805
+
806
+ /* Manually unroll the loop because compilers don't consistently
807
+ * unroll the inner loops, which destroys performance.
808
+ */
755
809
  do {
756
810
  /* Decode 5 symbols in each of the 4 streams */
757
- for (symbol = 0; symbol < 5; ++symbol) {
758
- for (stream = 0; stream < 4; ++stream) {
759
- int const index = (int)(bits[stream] >> 53);
760
- int const entry = (int)dtable[index];
761
- bits[stream] <<= (entry & 63);
762
- op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
763
- }
764
- }
765
- /* Reload the bitstreams */
766
- for (stream = 0; stream < 4; ++stream) {
767
- int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
768
- int const nbBits = ctz & 7;
769
- int const nbBytes = ctz >> 3;
770
- op[stream] += 5;
771
- ip[stream] -= nbBytes;
772
- bits[stream] = MEM_read64(ip[stream]) | 1;
773
- bits[stream] <<= nbBits;
774
- }
811
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0);
812
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1);
813
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2);
814
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3);
815
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
816
+
817
+ /* Reload each of the 4 bitstreams */
818
+ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
775
819
  } while (op[3] < olimit);
820
+
821
+ #undef HUF_4X1_DECODE_SYMBOL
822
+ #undef HUF_4X1_RELOAD_STREAM
776
823
  }
777
824
 
778
825
  _out:
@@ -797,8 +844,8 @@ HUF_decompress4X1_usingDTable_internal_fast(
797
844
  HUF_DecompressFastLoopFn loopFn)
798
845
  {
799
846
  void const* dt = DTable + 1;
800
- const BYTE* const iend = (const BYTE*)cSrc + 6;
801
- BYTE* const oend = (BYTE*)dst + dstSize;
847
+ BYTE const* const ilowest = (BYTE const*)cSrc;
848
+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
802
849
  HUF_DecompressFastArgs args;
803
850
  { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
804
851
  FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
@@ -806,18 +853,22 @@ HUF_decompress4X1_usingDTable_internal_fast(
806
853
  return 0;
807
854
  }
808
855
 
809
- assert(args.ip[0] >= args.ilimit);
856
+ assert(args.ip[0] >= args.ilowest);
810
857
  loopFn(&args);
811
858
 
812
- /* Our loop guarantees that ip[] >= ilimit and that we haven't
859
+ /* Our loop guarantees that ip[] >= ilowest and that we haven't
813
860
  * overwritten any op[].
814
861
  */
815
- assert(args.ip[0] >= iend);
816
- assert(args.ip[1] >= iend);
817
- assert(args.ip[2] >= iend);
818
- assert(args.ip[3] >= iend);
862
+ assert(args.ip[0] >= ilowest);
863
+ assert(args.ip[0] >= ilowest);
864
+ assert(args.ip[1] >= ilowest);
865
+ assert(args.ip[2] >= ilowest);
866
+ assert(args.ip[3] >= ilowest);
819
867
  assert(args.op[3] <= oend);
820
- (void)iend;
868
+
869
+ assert(ilowest == args.ilowest);
870
+ assert(ilowest + 6 == args.iend[0]);
871
+ (void)ilowest;
821
872
 
822
873
  /* finish bit streams one by one. */
823
874
  { size_t const segmentSize = (dstSize+3) / 4;
@@ -868,7 +919,7 @@ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize,
868
919
  }
869
920
  #endif
870
921
 
871
- if (!(flags & HUF_flags_disableFast)) {
922
+ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
872
923
  size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
873
924
  if (ret != 0)
874
925
  return ret;
@@ -1239,15 +1290,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c
1239
1290
  }
1240
1291
 
1241
1292
  #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
1242
- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
1293
+ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0)
1243
1294
 
1244
- #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
1245
- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
1246
- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
1295
+ #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
1296
+ do { \
1297
+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
1298
+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
1299
+ } while (0)
1247
1300
 
1248
- #define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
1249
- if (MEM_64bits()) \
1250
- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
1301
+ #define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
1302
+ do { \
1303
+ if (MEM_64bits()) \
1304
+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
1305
+ } while (0)
1251
1306
 
1252
1307
  HINT_INLINE size_t
1253
1308
  HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
@@ -1307,7 +1362,7 @@ HUF_decompress1X2_usingDTable_internal_body(
1307
1362
 
1308
1363
  /* decode */
1309
1364
  { BYTE* const ostart = (BYTE*) dst;
1310
- BYTE* const oend = ostart + dstSize;
1365
+ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize);
1311
1366
  const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */
1312
1367
  const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
1313
1368
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
@@ -1332,6 +1387,7 @@ HUF_decompress4X2_usingDTable_internal_body(
1332
1387
  const HUF_DTable* DTable)
1333
1388
  {
1334
1389
  if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
1390
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
1335
1391
 
1336
1392
  { const BYTE* const istart = (const BYTE*) cSrc;
1337
1393
  BYTE* const ostart = (BYTE*) dst;
@@ -1367,7 +1423,7 @@ HUF_decompress4X2_usingDTable_internal_body(
1367
1423
 
1368
1424
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1369
1425
  if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
1370
- if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
1426
+ assert(dstSize >= 6 /* validated above */);
1371
1427
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
1372
1428
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
1373
1429
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -1472,7 +1528,7 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
1472
1528
  BYTE* op[4];
1473
1529
  BYTE* oend[4];
1474
1530
  HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
1475
- BYTE const* const ilimit = args->ilimit;
1531
+ BYTE const* const ilowest = args->ilowest;
1476
1532
 
1477
1533
  /* Copy the arguments to local registers. */
1478
1534
  ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
@@ -1490,13 +1546,12 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
1490
1546
  for (;;) {
1491
1547
  BYTE* olimit;
1492
1548
  int stream;
1493
- int symbol;
1494
1549
 
1495
1550
  /* Assert loop preconditions */
1496
1551
  #ifndef NDEBUG
1497
1552
  for (stream = 0; stream < 4; ++stream) {
1498
1553
  assert(op[stream] <= oend[stream]);
1499
- assert(ip[stream] >= ilimit);
1554
+ assert(ip[stream] >= ilowest);
1500
1555
  }
1501
1556
  #endif
1502
1557
  /* Compute olimit */
@@ -1509,7 +1564,7 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
1509
1564
  * We also know that each input pointer is >= ip[0]. So we can run
1510
1565
  * iters loops before running out of input.
1511
1566
  */
1512
- size_t iters = (size_t)(ip[0] - ilimit) / 7;
1567
+ size_t iters = (size_t)(ip[0] - ilowest) / 7;
1513
1568
  /* Each iteration can produce up to 10 bytes of output per stream.
1514
1569
  * Each output stream may advance at different rates. So take the
1515
1570
  * minimum number of safe iterations among all the output streams.
@@ -1527,8 +1582,8 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
1527
1582
  */
1528
1583
  olimit = op[3] + (iters * 5);
1529
1584
 
1530
- /* Exit the fast decoding loop if we are too close to the end. */
1531
- if (op[3] + 10 > olimit)
1585
+ /* Exit the fast decoding loop once we reach the end. */
1586
+ if (op[3] == olimit)
1532
1587
  break;
1533
1588
 
1534
1589
  /* Exit the decoding loop if any input pointer has crossed the
@@ -1547,54 +1602,58 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
1547
1602
  }
1548
1603
  #endif
1549
1604
 
1605
+ #define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \
1606
+ do { \
1607
+ if ((_decode3) || (_stream) != 3) { \
1608
+ int const index = (int)(bits[(_stream)] >> 53); \
1609
+ HUF_DEltX2 const entry = dtable[index]; \
1610
+ MEM_write16(op[(_stream)], entry.sequence); \
1611
+ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \
1612
+ op[(_stream)] += (entry.length); \
1613
+ } \
1614
+ } while (0)
1615
+
1616
+ #define HUF_4X2_RELOAD_STREAM(_stream) \
1617
+ do { \
1618
+ HUF_4X2_DECODE_SYMBOL(3, 1); \
1619
+ { \
1620
+ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
1621
+ int const nbBits = ctz & 7; \
1622
+ int const nbBytes = ctz >> 3; \
1623
+ ip[(_stream)] -= nbBytes; \
1624
+ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
1625
+ bits[(_stream)] <<= nbBits; \
1626
+ } \
1627
+ } while (0)
1628
+
1629
+ /* Manually unroll the loop because compilers don't consistently
1630
+ * unroll the inner loops, which destroys performance.
1631
+ */
1550
1632
  do {
1551
- /* Do 5 table lookups for each of the first 3 streams */
1552
- for (symbol = 0; symbol < 5; ++symbol) {
1553
- for (stream = 0; stream < 3; ++stream) {
1554
- int const index = (int)(bits[stream] >> 53);
1555
- HUF_DEltX2 const entry = dtable[index];
1556
- MEM_write16(op[stream], entry.sequence);
1557
- bits[stream] <<= (entry.nbBits);
1558
- op[stream] += (entry.length);
1559
- }
1560
- }
1561
- /* Do 1 table lookup from the final stream */
1562
- {
1563
- int const index = (int)(bits[3] >> 53);
1564
- HUF_DEltX2 const entry = dtable[index];
1565
- MEM_write16(op[3], entry.sequence);
1566
- bits[3] <<= (entry.nbBits);
1567
- op[3] += (entry.length);
1568
- }
1569
- /* Do 4 table lookups from the final stream & reload bitstreams */
1570
- for (stream = 0; stream < 4; ++stream) {
1571
- /* Do a table lookup from the final stream.
1572
- * This is interleaved with the reloading to reduce register
1573
- * pressure. This shouldn't be necessary, but compilers can
1574
- * struggle with codegen with high register pressure.
1575
- */
1576
- {
1577
- int const index = (int)(bits[3] >> 53);
1578
- HUF_DEltX2 const entry = dtable[index];
1579
- MEM_write16(op[3], entry.sequence);
1580
- bits[3] <<= (entry.nbBits);
1581
- op[3] += (entry.length);
1582
- }
1583
- /* Reload the bistreams. The final bitstream must be reloaded
1584
- * after the 5th symbol was decoded.
1585
- */
1586
- {
1587
- int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
1588
- int const nbBits = ctz & 7;
1589
- int const nbBytes = ctz >> 3;
1590
- ip[stream] -= nbBytes;
1591
- bits[stream] = MEM_read64(ip[stream]) | 1;
1592
- bits[stream] <<= nbBits;
1593
- }
1594
- }
1633
+ /* Decode 5 symbols from each of the first 3 streams.
1634
+ * The final stream will be decoded during the reload phase
1635
+ * to reduce register pressure.
1636
+ */
1637
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1638
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1639
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1640
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1641
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1642
+
1643
+ /* Decode one symbol from the final stream */
1644
+ HUF_4X2_DECODE_SYMBOL(3, 1);
1645
+
1646
+ /* Decode 4 symbols from the final stream & reload bitstreams.
1647
+ * The final stream is reloaded last, meaning that all 5 symbols
1648
+ * are decoded from the final stream before it is reloaded.
1649
+ */
1650
+ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
1595
1651
  } while (op[3] < olimit);
1596
1652
  }
1597
1653
 
1654
+ #undef HUF_4X2_DECODE_SYMBOL
1655
+ #undef HUF_4X2_RELOAD_STREAM
1656
+
1598
1657
  _out:
1599
1658
 
1600
1659
  /* Save the final values of each of the state variables back to args. */
@@ -1611,8 +1670,8 @@ HUF_decompress4X2_usingDTable_internal_fast(
1611
1670
  const HUF_DTable* DTable,
1612
1671
  HUF_DecompressFastLoopFn loopFn) {
1613
1672
  void const* dt = DTable + 1;
1614
- const BYTE* const iend = (const BYTE*)cSrc + 6;
1615
- BYTE* const oend = (BYTE*)dst + dstSize;
1673
+ const BYTE* const ilowest = (const BYTE*)cSrc;
1674
+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
1616
1675
  HUF_DecompressFastArgs args;
1617
1676
  {
1618
1677
  size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
@@ -1621,16 +1680,19 @@ HUF_decompress4X2_usingDTable_internal_fast(
1621
1680
  return 0;
1622
1681
  }
1623
1682
 
1624
- assert(args.ip[0] >= args.ilimit);
1683
+ assert(args.ip[0] >= args.ilowest);
1625
1684
  loopFn(&args);
1626
1685
 
1627
1686
  /* note : op4 already verified within main loop */
1628
- assert(args.ip[0] >= iend);
1629
- assert(args.ip[1] >= iend);
1630
- assert(args.ip[2] >= iend);
1631
- assert(args.ip[3] >= iend);
1687
+ assert(args.ip[0] >= ilowest);
1688
+ assert(args.ip[1] >= ilowest);
1689
+ assert(args.ip[2] >= ilowest);
1690
+ assert(args.ip[3] >= ilowest);
1632
1691
  assert(args.op[3] <= oend);
1633
- (void)iend;
1692
+
1693
+ assert(ilowest == args.ilowest);
1694
+ assert(ilowest + 6 == args.iend[0]);
1695
+ (void)ilowest;
1634
1696
 
1635
1697
  /* finish bitStreams one by one */
1636
1698
  {
@@ -1679,7 +1741,7 @@ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize,
1679
1741
  }
1680
1742
  #endif
1681
1743
 
1682
- if (!(flags & HUF_flags_disableFast)) {
1744
+ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
1683
1745
  size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
1684
1746
  if (ret != 0)
1685
1747
  return ret;