zstd-ruby 1.5.5.1 → 1.5.6.0

Files changed (48)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/zstdruby/libzstd/common/allocations.h +1 -1
  4. data/ext/zstdruby/libzstd/common/bitstream.h +49 -29
  5. data/ext/zstdruby/libzstd/common/compiler.h +114 -22
  6. data/ext/zstdruby/libzstd/common/cpu.h +36 -0
  7. data/ext/zstdruby/libzstd/common/debug.c +6 -0
  8. data/ext/zstdruby/libzstd/common/debug.h +20 -11
  9. data/ext/zstdruby/libzstd/common/error_private.h +45 -36
  10. data/ext/zstdruby/libzstd/common/fse.h +3 -2
  11. data/ext/zstdruby/libzstd/common/fse_decompress.c +19 -17
  12. data/ext/zstdruby/libzstd/common/huf.h +14 -1
  13. data/ext/zstdruby/libzstd/common/mem.h +0 -9
  14. data/ext/zstdruby/libzstd/common/pool.c +1 -1
  15. data/ext/zstdruby/libzstd/common/pool.h +1 -1
  16. data/ext/zstdruby/libzstd/common/portability_macros.h +2 -0
  17. data/ext/zstdruby/libzstd/common/threading.c +8 -2
  18. data/ext/zstdruby/libzstd/common/xxhash.c +5 -11
  19. data/ext/zstdruby/libzstd/common/xxhash.h +2341 -1007
  20. data/ext/zstdruby/libzstd/common/zstd_internal.h +5 -5
  21. data/ext/zstdruby/libzstd/compress/fse_compress.c +8 -7
  22. data/ext/zstdruby/libzstd/compress/huf_compress.c +54 -25
  23. data/ext/zstdruby/libzstd/compress/zstd_compress.c +282 -161
  24. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +29 -27
  25. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +224 -113
  26. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +19 -13
  27. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +17 -5
  28. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +11 -0
  29. data/ext/zstdruby/libzstd/compress/zstd_fast.c +14 -6
  30. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +129 -87
  31. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +103 -28
  32. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +8 -2
  33. data/ext/zstdruby/libzstd/compress/zstd_opt.c +216 -112
  34. data/ext/zstdruby/libzstd/compress/zstd_opt.h +31 -7
  35. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +94 -79
  36. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +188 -126
  37. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +38 -19
  38. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +84 -32
  39. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +231 -208
  40. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +1 -1
  41. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +2 -0
  42. data/ext/zstdruby/libzstd/dictBuilder/cover.c +16 -12
  43. data/ext/zstdruby/libzstd/dictBuilder/cover.h +2 -8
  44. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +2 -2
  45. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +12 -6
  46. data/ext/zstdruby/libzstd/zstd.h +129 -60
  47. data/lib/zstd-ruby/version.rb +1 -1
  48. metadata +1 -1
data/ext/zstdruby/libzstd/decompress/huf_decompress.c
@@ -34,6 +34,12 @@
   *  Macros
  ****************************************************************/
 
+ #ifdef HUF_DISABLE_FAST_DECODE
+ # define HUF_ENABLE_FAST_DECODE 0
+ #else
+ # define HUF_ENABLE_FAST_DECODE 1
+ #endif
+
  /* These two optional macros force the use one way or another of the two
   * Huffman decompression implementations. You can't force in both directions
   * at the same time.
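
The HUF_ENABLE_FAST_DECODE switch added here lets an embedder compile out the fast Huffman loops entirely; the 4X decoders later in this diff gate the fast path on it. A minimal sketch of how the gate composes with the existing runtime flag (simplified from the hunks near the end of this file; the commented branches are illustrative):

    /* Compile with -DHUF_DISABLE_FAST_DECODE to force the portable decoders. */
    if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
        /* fast 4-stream loop (C or assembly) */
    } else {
        /* portable per-symbol fallback */
    }

With HUF_DISABLE_FAST_DECODE defined, the condition is constant-false and the compiler can drop the fast-loop branch as dead code.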
@@ -158,17 +164,18 @@ static size_t HUF_initFastDStream(BYTE const* ip) {
   * op [in/out] - The output pointers, must be updated to reflect what is written.
   * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
   * dt [in] - The decoding table.
-  * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
+  * ilowest [in] - The beginning of the valid range of the input. Decoders may read
+  *                down to this pointer. It may be below iend[0].
   * oend [in] - The end of the output stream. op[3] must not cross oend.
   * iend [in] - The end of each input stream. ip[i] may cross iend[i],
-  *             as long as it is above ilimit, but that indicates corruption.
+  *             as long as it is above ilowest, but that indicates corruption.
   */
  typedef struct {
      BYTE const* ip[4];
      BYTE* op[4];
      U64 bits[4];
      void const* dt;
-     BYTE const* ilimit;
+     BYTE const* ilowest;
      BYTE* oend;
      BYTE const* iend[4];
  } HUF_DecompressFastArgs;
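
For context on how this struct is consumed (inferred from the loopFn(&args) calls later in this diff, where either the C loop or the assembly loop is selected): every fast-loop implementation receives its entire state through one args block, roughly

    typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);

so renaming ilimit to ilowest changes the contract for each loop implementation, including huf_decompress_amd64.S (also updated in this release).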
@@ -186,9 +193,9 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
      void const* dt = DTable + 1;
      U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
 
-     const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
+     const BYTE* const istart = (const BYTE*)src;
 
-     BYTE* const oend = (BYTE*)dst + dstSize;
+     BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
 
      /* The fast decoding loop assumes 64-bit little-endian.
       * This condition is false on x32.
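
ZSTD_maybeNullPtrAdd is used so that a NULL dst with dstSize == 0 never forms the expression NULL + 0, which is undefined behavior in C. The helper's definition is not part of this diff; a hedged sketch of its likely shape, based only on how it is called here:

    #include <stddef.h>

    /* Add an offset to a pointer that may be NULL, but only when there is
     * something to add, so NULL + 0 is never evaluated. */
    static void* maybeNullPtrAdd_sketch(void* ptr, ptrdiff_t add)
    {
        return add > 0 ? (char*)ptr + add : ptr;
    }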
@@ -196,6 +203,11 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
      if (!MEM_isLittleEndian() || MEM_32bits())
          return 0;
 
+     /* Avoid nullptr addition */
+     if (dstSize == 0)
+         return 0;
+     assert(dst != NULL);
+
      /* strict minimum : jump table + 1 byte per stream */
      if (srcSize < 10)
          return ERROR(corruption_detected);
@@ -209,7 +221,6 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
 
      /* Read the jump table. */
      {
-         const BYTE* const istart = (const BYTE*)src;
          size_t const length1 = MEM_readLE16(istart);
          size_t const length2 = MEM_readLE16(istart+2);
          size_t const length3 = MEM_readLE16(istart+4);
@@ -221,10 +232,8 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
 
          /* HUF_initFastDStream() requires this, and this small of an input
           * won't benefit from the ASM loop anyways.
-          * length1 must be >= 16 so that ip[0] >= ilimit before the loop
-          * starts.
           */
-         if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
+         if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8)
              return 0;
          if (length4 > srcSize) return ERROR(corruption_detected);   /* overflow */
      }
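
The relaxed minimum (8 bytes per stream instead of 16 for the first) works because the fast loop's lower input bound is now the very start of the buffer rather than istart + 6 + 8. For reference, the 4-stream layout these lengths describe, as a sketch (the length4 derivation is assumed from the standard zstd 4-stream framing; only the first three reads appear in this hunk):

    /* 6-byte jump table, then four back-to-back bitstreams */
    size_t const length1 = MEM_readLE16(istart);
    size_t const length2 = MEM_readLE16(istart + 2);
    size_t const length3 = MEM_readLE16(istart + 4);
    size_t const length4 = srcSize - (length1 + length2 + length3 + 6);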
@@ -256,11 +265,12 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
      args->bits[2] = HUF_initFastDStream(args->ip[2]);
      args->bits[3] = HUF_initFastDStream(args->ip[3]);
 
-     /* If ip[] >= ilimit, it is guaranteed to be safe to
-      * reload bits[]. It may be beyond its section, but is
-      * guaranteed to be valid (>= istart).
-      */
-     args->ilimit = ilimit;
+     /* The decoders must be sure to never read beyond ilowest.
+      * This is lower than iend[0], but allowing decoders to read
+      * down to ilowest can allow an extra iteration or two in the
+      * fast loop.
+      */
+     args->ilowest = istart;
 
      args->oend = oend;
      args->dt = dt;
@@ -285,13 +295,31 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArg
      assert(sizeof(size_t) == 8);
      bit->bitContainer = MEM_readLEST(args->ip[stream]);
      bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
-     bit->start = (const char*)args->iend[0];
+     bit->start = (const char*)args->ilowest;
      bit->limitPtr = bit->start + sizeof(size_t);
      bit->ptr = (const char*)args->ip[stream];
 
      return 0;
  }
 
+ /* Calls X(N) for each stream 0, 1, 2, 3. */
+ #define HUF_4X_FOR_EACH_STREAM(X) \
+     do {                          \
+         X(0);                     \
+         X(1);                     \
+         X(2);                     \
+         X(3);                     \
+     } while (0)
+
+ /* Calls X(N, var) for each stream 0, 1, 2, 3. */
+ #define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \
+     do {                                        \
+         X(0, (var));                            \
+         X(1, (var));                            \
+         X(2, (var));                            \
+         X(3, (var));                            \
+     } while (0)
+
 
  #ifndef HUF_FORCE_DECOMPRESS_X2
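
These two helpers exist so the per-stream work can be written once and stamped out four times without relying on the compiler to unroll a for loop over the streams. As an illustration of the expansion (using the X1 decode macro that is defined further down in this diff):

    HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0);
    /* expands, modulo the do/while wrapper, to: */
    HUF_4X1_DECODE_SYMBOL(0, 0);
    HUF_4X1_DECODE_SYMBOL(1, 0);
    HUF_4X1_DECODE_SYMBOL(2, 0);
    HUF_4X1_DECODE_SYMBOL(3, 0);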
@@ -500,15 +528,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog
  }
 
  #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
-     *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
+     do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0)
 
- #define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \
-     if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
-         HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
+ #define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)      \
+     do {                                            \
+         if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+             HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
+     } while (0)
 
- #define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
-     if (MEM_64bits()) \
-         HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
+ #define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr)      \
+     do {                                            \
+         if (MEM_64bits())                           \
+             HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
+     } while (0)
 
  HINT_INLINE size_t
  HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
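
Wrapping these single-statement and if-without-else macros in do { ... } while (0) makes every invocation behave as exactly one statement, which avoids the classic dangling-else hazard. A standalone illustration (the names below are made up for the example, not taken from the diff):

    #define STEP_UNSAFE(p, s)  if (is64bit()) *(p)++ = decode_one(s)

    if (fastPath)
        STEP_UNSAFE(op, &bitD);
    else
        slow_path();   /* without the do/while wrapper, this else would bind
                        * to the macro's hidden if, not to fastPath */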
@@ -546,7 +578,7 @@ HUF_decompress1X1_usingDTable_internal_body(
      const HUF_DTable* DTable)
  {
      BYTE* op = (BYTE*)dst;
-     BYTE* const oend = op + dstSize;
+     BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize);
      const void* dtPtr = DTable + 1;
      const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
      BIT_DStream_t bitD;
@@ -574,6 +606,7 @@ HUF_decompress4X1_usingDTable_internal_body(
  {
      /* Check */
      if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
+     if (dstSize < 6) return ERROR(corruption_detected);  /* stream 4-split doesn't work */
 
      {   const BYTE* const istart = (const BYTE*) cSrc;
          BYTE* const ostart = (BYTE*) dst;
@@ -609,7 +642,7 @@ HUF_decompress4X1_usingDTable_internal_body(
 
          if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
          if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
-         if (dstSize < 6) return ERROR(corruption_detected);          /* stream 4-split doesn't work */
+         assert(dstSize >= 6);  /* validated above */
          CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
          CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
          CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -692,7 +725,7 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
      BYTE* op[4];
      U16 const* const dtable = (U16 const*)args->dt;
      BYTE* const oend = args->oend;
-     BYTE const* const ilimit = args->ilimit;
+     BYTE const* const ilowest = args->ilowest;
 
      /* Copy the arguments to local variables */
      ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
@@ -705,13 +738,12 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
      for (;;) {
          BYTE* olimit;
          int stream;
-         int symbol;
 
          /* Assert loop preconditions */
  #ifndef NDEBUG
          for (stream = 0; stream < 4; ++stream) {
              assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
-             assert(ip[stream] >= ilimit);
+             assert(ip[stream] >= ilowest);
          }
  #endif
          /* Compute olimit */
@@ -721,7 +753,7 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
              /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
               * per stream.
               */
-             size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
+             size_t const iiters = (size_t)(ip[0] - ilowest) / 7;
              /* We can safely run iters iterations before running bounds checks */
              size_t const iters = MIN(oiters, iiters);
              size_t const symbols = iters * 5;
@@ -732,8 +764,8 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
           */
          olimit = op[3] + symbols;
 
-         /* Exit fast decoding loop once we get close to the end. */
-         if (op[3] + 20 > olimit)
+         /* Exit fast decoding loop once we reach the end. */
+         if (op[3] == olimit)
              break;
 
          /* Exit the decoding loop if any input pointer has crossed the
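
A worked example of the bounds arithmetic with the new lower limit (numbers are illustrative; the output-side iteration count oiters is assumed to be derived analogously from the remaining output space):

    /* Each iteration decodes 5 symbols per stream and consumes at most
     * 11 bits * 5 = 55 bits < 7 bytes per stream. */
    /* ip[0] - ilowest = 70   ->  iiters = 70 / 7 = 10 safe iterations       */
    /* iters   = MIN(oiters, iiters)                                         */
    /* symbols = iters * 5                                                   */
    /* olimit  = op[3] + symbols   (stream 3 is the one that is bounds-checked) */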
@@ -752,27 +784,42 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
          }
  #endif
 
+ #define HUF_4X1_DECODE_SYMBOL(_stream, _symbol)                     \
+     do {                                                            \
+         int const index = (int)(bits[(_stream)] >> 53);             \
+         int const entry = (int)dtable[index];                       \
+         bits[(_stream)] <<= (entry & 0x3F);                         \
+         op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF);     \
+     } while (0)
+
+ #define HUF_4X1_RELOAD_STREAM(_stream)                              \
+     do {                                                            \
+         int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
+         int const nbBits = ctz & 7;                                 \
+         int const nbBytes = ctz >> 3;                               \
+         op[(_stream)] += 5;                                         \
+         ip[(_stream)] -= nbBytes;                                   \
+         bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;            \
+         bits[(_stream)] <<= nbBits;                                 \
+     } while (0)
+
+         /* Manually unroll the loop because compilers don't consistently
+          * unroll the inner loops, which destroys performance.
+          */
          do {
              /* Decode 5 symbols in each of the 4 streams */
-             for (symbol = 0; symbol < 5; ++symbol) {
-                 for (stream = 0; stream < 4; ++stream) {
-                     int const index = (int)(bits[stream] >> 53);
-                     int const entry = (int)dtable[index];
-                     bits[stream] <<= (entry & 63);
-                     op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
-                 }
-             }
-             /* Reload the bitstreams */
-             for (stream = 0; stream < 4; ++stream) {
-                 int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
-                 int const nbBits = ctz & 7;
-                 int const nbBytes = ctz >> 3;
-                 op[stream] += 5;
-                 ip[stream] -= nbBytes;
-                 bits[stream] = MEM_read64(ip[stream]) | 1;
-                 bits[stream] <<= nbBits;
-             }
+             HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0);
+             HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1);
+             HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2);
+             HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3);
+             HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
+
+             /* Reload each of the 4 bitstreams */
+             HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
          } while (op[3] < olimit);
+
+ #undef HUF_4X1_DECODE_SYMBOL
+ #undef HUF_4X1_RELOAD_STREAM
      }
 
  _out:
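
The reload macro relies on the sentinel bit planted by MEM_read64(ip) | 1: after the left shifts performed while decoding, the number of trailing zeros in the container equals the number of bits consumed since the last 8-byte read. A self-contained illustration of that arithmetic, assuming a 64-bit little-endian host (this is not part of the diff):

    #include <stdint.h>
    #include <string.h>

    /* Rewind the read pointer by the whole bytes consumed, re-read 8 bytes,
     * re-plant the sentinel, and shift out the leftover (partial-byte) bits. */
    static uint64_t reload_sketch(const uint8_t** ip, uint64_t bits)
    {
        int const ctz     = __builtin_ctzll(bits); /* bits consumed since last read */
        int const nbBits  = ctz & 7;               /* partial-byte remainder        */
        int const nbBytes = ctz >> 3;              /* whole bytes to step back      */
        uint64_t next;
        *ip -= nbBytes;
        memcpy(&next, *ip, sizeof(next));
        return (next | 1) << nbBits;
    }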
@@ -797,8 +844,8 @@ HUF_decompress4X1_usingDTable_internal_fast(
      HUF_DecompressFastLoopFn loopFn)
  {
      void const* dt = DTable + 1;
-     const BYTE* const iend = (const BYTE*)cSrc + 6;
-     BYTE* const oend = (BYTE*)dst + dstSize;
+     BYTE const* const ilowest = (BYTE const*)cSrc;
+     BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
      HUF_DecompressFastArgs args;
      {   size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
          FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
@@ -806,18 +853,22 @@ HUF_decompress4X1_usingDTable_internal_fast(
          return 0;
      }
 
-     assert(args.ip[0] >= args.ilimit);
+     assert(args.ip[0] >= args.ilowest);
      loopFn(&args);
 
-     /* Our loop guarantees that ip[] >= ilimit and that we haven't
+     /* Our loop guarantees that ip[] >= ilowest and that we haven't
       * overwritten any op[].
       */
-     assert(args.ip[0] >= iend);
-     assert(args.ip[1] >= iend);
-     assert(args.ip[2] >= iend);
-     assert(args.ip[3] >= iend);
+     assert(args.ip[0] >= ilowest);
+     assert(args.ip[0] >= ilowest);
+     assert(args.ip[1] >= ilowest);
+     assert(args.ip[2] >= ilowest);
+     assert(args.ip[3] >= ilowest);
      assert(args.op[3] <= oend);
-     (void)iend;
+
+     assert(ilowest == args.ilowest);
+     assert(ilowest + 6 == args.iend[0]);
+     (void)ilowest;
 
      /* finish bit streams one by one. */
      { size_t const segmentSize = (dstSize+3) / 4;
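
The two new asserts pin down the input layout that the fast path assumes: ilowest is the very start of the compressed block, and iend[0] sits 6 bytes later, immediately after the jump table, with the four bitstreams following back to back:

    ilowest                            ilowest + 6
    |  jump table (3 x LE16 lengths)   |  stream 1  |  stream 2  |  stream 3  |  stream 4  |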
@@ -868,7 +919,7 @@ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize,
      }
  #endif
 
-     if (!(flags & HUF_flags_disableFast)) {
+     if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
          size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
          if (ret != 0)
              return ret;
@@ -1239,15 +1290,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c
  }
 
  #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
-     ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+     do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0)
 
- #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
-     if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
-         ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+ #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr)                     \
+     do {                                                           \
+         if (MEM_64bits() || (HUF_TABLELOG_MAX<=12))                \
+             ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
+     } while (0)
 
- #define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
-     if (MEM_64bits()) \
-         ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+ #define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr)                     \
+     do {                                                           \
+         if (MEM_64bits())                                          \
+             ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
+     } while (0)
 
  HINT_INLINE size_t
  HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
@@ -1307,7 +1362,7 @@ HUF_decompress1X2_usingDTable_internal_body(
 
      /* decode */
      {   BYTE* const ostart = (BYTE*) dst;
-         BYTE* const oend = ostart + dstSize;
+         BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize);
          const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
          const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
          DTableDesc const dtd = HUF_getDTableDesc(DTable);
@@ -1332,6 +1387,7 @@ HUF_decompress4X2_usingDTable_internal_body(
      const HUF_DTable* DTable)
  {
      if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
+     if (dstSize < 6) return ERROR(corruption_detected);  /* stream 4-split doesn't work */
 
      {   const BYTE* const istart = (const BYTE*) cSrc;
          BYTE* const ostart = (BYTE*) dst;
@@ -1367,7 +1423,7 @@ HUF_decompress4X2_usingDTable_internal_body(
 
          if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
          if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
-         if (dstSize < 6) return ERROR(corruption_detected);          /* stream 4-split doesn't work */
+         assert(dstSize >= 6 /* validated above */);
          CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
          CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
          CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -1472,7 +1528,7 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
      BYTE* op[4];
      BYTE* oend[4];
      HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
-     BYTE const* const ilimit = args->ilimit;
+     BYTE const* const ilowest = args->ilowest;
 
      /* Copy the arguments to local registers. */
      ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
@@ -1490,13 +1546,12 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
      for (;;) {
          BYTE* olimit;
          int stream;
-         int symbol;
 
          /* Assert loop preconditions */
  #ifndef NDEBUG
          for (stream = 0; stream < 4; ++stream) {
              assert(op[stream] <= oend[stream]);
-             assert(ip[stream] >= ilimit);
+             assert(ip[stream] >= ilowest);
          }
  #endif
          /* Compute olimit */
@@ -1509,7 +1564,7 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
           * We also know that each input pointer is >= ip[0]. So we can run
           * iters loops before running out of input.
           */
-         size_t iters = (size_t)(ip[0] - ilimit) / 7;
+         size_t iters = (size_t)(ip[0] - ilowest) / 7;
          /* Each iteration can produce up to 10 bytes of output per stream.
           * Each output stream may advance at different rates. So take the
           * minimum number of safe iterations among all the output streams.
@@ -1527,8 +1582,8 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
           */
          olimit = op[3] + (iters * 5);
 
-         /* Exit the fast decoding loop if we are too close to the end. */
-         if (op[3] + 10 > olimit)
+         /* Exit the fast decoding loop once we reach the end. */
+         if (op[3] == olimit)
              break;
 
          /* Exit the decoding loop if any input pointer has crossed the
@@ -1547,54 +1602,58 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
          }
  #endif
 
+ #define HUF_4X2_DECODE_SYMBOL(_stream, _decode3)                    \
+     do {                                                            \
+         if ((_decode3) || (_stream) != 3) {                         \
+             int const index = (int)(bits[(_stream)] >> 53);         \
+             HUF_DEltX2 const entry = dtable[index];                 \
+             MEM_write16(op[(_stream)], entry.sequence);             \
+             bits[(_stream)] <<= (entry.nbBits) & 0x3F;              \
+             op[(_stream)] += (entry.length);                        \
+         }                                                           \
+     } while (0)
+
+ #define HUF_4X2_RELOAD_STREAM(_stream)                              \
+     do {                                                            \
+         HUF_4X2_DECODE_SYMBOL(3, 1);                                \
+         {                                                           \
+             int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
+             int const nbBits = ctz & 7;                             \
+             int const nbBytes = ctz >> 3;                           \
+             ip[(_stream)] -= nbBytes;                               \
+             bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;        \
+             bits[(_stream)] <<= nbBits;                             \
+         }                                                           \
+     } while (0)
+
+         /* Manually unroll the loop because compilers don't consistently
+          * unroll the inner loops, which destroys performance.
+          */
          do {
-             /* Do 5 table lookups for each of the first 3 streams */
-             for (symbol = 0; symbol < 5; ++symbol) {
-                 for (stream = 0; stream < 3; ++stream) {
-                     int const index = (int)(bits[stream] >> 53);
-                     HUF_DEltX2 const entry = dtable[index];
-                     MEM_write16(op[stream], entry.sequence);
-                     bits[stream] <<= (entry.nbBits);
-                     op[stream] += (entry.length);
-                 }
-             }
-             /* Do 1 table lookup from the final stream */
-             {
-                 int const index = (int)(bits[3] >> 53);
-                 HUF_DEltX2 const entry = dtable[index];
-                 MEM_write16(op[3], entry.sequence);
-                 bits[3] <<= (entry.nbBits);
-                 op[3] += (entry.length);
-             }
-             /* Do 4 table lookups from the final stream & reload bitstreams */
-             for (stream = 0; stream < 4; ++stream) {
-                 /* Do a table lookup from the final stream.
-                  * This is interleaved with the reloading to reduce register
-                  * pressure. This shouldn't be necessary, but compilers can
-                  * struggle with codegen with high register pressure.
-                  */
-                 {
-                     int const index = (int)(bits[3] >> 53);
-                     HUF_DEltX2 const entry = dtable[index];
-                     MEM_write16(op[3], entry.sequence);
-                     bits[3] <<= (entry.nbBits);
-                     op[3] += (entry.length);
-                 }
-                 /* Reload the bistreams. The final bitstream must be reloaded
-                  * after the 5th symbol was decoded.
-                  */
-                 {
-                     int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
-                     int const nbBits = ctz & 7;
-                     int const nbBytes = ctz >> 3;
-                     ip[stream] -= nbBytes;
-                     bits[stream] = MEM_read64(ip[stream]) | 1;
-                     bits[stream] <<= nbBits;
-                 }
-             }
+             /* Decode 5 symbols from each of the first 3 streams.
+              * The final stream will be decoded during the reload phase
+              * to reduce register pressure.
+              */
+             HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+             HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+             HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+             HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+             HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+
+             /* Decode one symbol from the final stream */
+             HUF_4X2_DECODE_SYMBOL(3, 1);
+
+             /* Decode 4 symbols from the final stream & reload bitstreams.
+              * The final stream is reloaded last, meaning that all 5 symbols
+              * are decoded from the final stream before it is reloaded.
+              */
+             HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
          } while (op[3] < olimit);
      }
 
+ #undef HUF_4X2_DECODE_SYMBOL
+ #undef HUF_4X2_RELOAD_STREAM
+
  _out:
 
      /* Save the final values of each of the state variables back to args. */
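
The double-symbol (X2) variant follows the same unrolling pattern, with one twist visible in HUF_4X2_RELOAD_STREAM: each of the four reload invocations also decodes one extra symbol from stream 3, so together with the single standalone HUF_4X2_DECODE_SYMBOL(3, 1) call, stream 3 receives all five of its symbols before it is the last stream to be reloaded. Each table entry writes two bytes but may represent only one symbol, which is why the 16-bit write and the pointer advance differ; a sketch of the entry layout assumed here (the field names come from this hunk, the exact struct definition is elsewhere in the file):

    typedef struct {
        U16  sequence;   /* up to 2 decoded bytes, stored with MEM_write16 */
        BYTE nbBits;     /* bits consumed from the bit container           */
        BYTE length;     /* how far op[] actually advances (1 or 2)        */
    } HUF_DEltX2_sketch;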
@@ -1611,8 +1670,8 @@ HUF_decompress4X2_usingDTable_internal_fast(
      const HUF_DTable* DTable,
      HUF_DecompressFastLoopFn loopFn) {
      void const* dt = DTable + 1;
-     const BYTE* const iend = (const BYTE*)cSrc + 6;
-     BYTE* const oend = (BYTE*)dst + dstSize;
+     const BYTE* const ilowest = (const BYTE*)cSrc;
+     BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
      HUF_DecompressFastArgs args;
      {
          size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
@@ -1621,16 +1680,19 @@ HUF_decompress4X2_usingDTable_internal_fast(
          return 0;
      }
 
-     assert(args.ip[0] >= args.ilimit);
+     assert(args.ip[0] >= args.ilowest);
      loopFn(&args);
 
      /* note : op4 already verified within main loop */
-     assert(args.ip[0] >= iend);
-     assert(args.ip[1] >= iend);
-     assert(args.ip[2] >= iend);
-     assert(args.ip[3] >= iend);
+     assert(args.ip[0] >= ilowest);
+     assert(args.ip[1] >= ilowest);
+     assert(args.ip[2] >= ilowest);
+     assert(args.ip[3] >= ilowest);
      assert(args.op[3] <= oend);
-     (void)iend;
+
+     assert(ilowest == args.ilowest);
+     assert(ilowest + 6 == args.iend[0]);
+     (void)ilowest;
 
      /* finish bitStreams one by one */
      {
@@ -1679,7 +1741,7 @@ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize,
      }
  #endif
 
-     if (!(flags & HUF_flags_disableFast)) {
+     if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
          size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
          if (ret != 0)
              return ret;