zstdlib 0.3.0-x64-mingw32 → 0.8.0-x64-mingw32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/CHANGES.md +30 -1
  3. data/README.md +2 -2
  4. data/Rakefile +1 -1
  5. data/ext/zstdlib/extconf.rb +3 -3
  6. data/ext/zstdlib/ruby/zlib-2.7/zstdlib.c +4895 -0
  7. data/ext/zstdlib/ruby/zlib-3.0/zstdlib.c +4994 -0
  8. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/bitstream.h +59 -51
  9. data/ext/zstdlib/zstd-1.5.0/lib/common/compiler.h +289 -0
  10. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/cpu.h +1 -3
  11. data/ext/zstdlib/zstd-1.5.0/lib/common/debug.c +24 -0
  12. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/debug.h +22 -49
  13. data/ext/zstdlib/zstd-1.5.0/lib/common/entropy_common.c +362 -0
  14. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/error_private.c +3 -1
  15. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/error_private.h +8 -4
  16. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/fse.h +50 -42
  17. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/fse_decompress.c +149 -55
  18. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/huf.h +43 -39
  19. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/mem.h +69 -25
  20. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/pool.c +30 -20
  21. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/pool.h +3 -3
  22. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/threading.c +51 -4
  23. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/threading.h +36 -4
  24. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/xxhash.c +40 -92
  25. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/xxhash.h +12 -32
  26. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/zstd_common.c +10 -10
  27. data/ext/zstdlib/zstd-1.5.0/lib/common/zstd_deps.h +111 -0
  28. data/ext/zstdlib/zstd-1.5.0/lib/common/zstd_internal.h +490 -0
  29. data/ext/zstdlib/zstd-1.5.0/lib/common/zstd_trace.h +154 -0
  30. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/fse_compress.c +47 -63
  31. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/hist.c +41 -63
  32. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/hist.h +13 -33
  33. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/huf_compress.c +332 -193
  34. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_compress.c +6393 -0
  35. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_internal.h +522 -86
  36. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_literals.c +25 -16
  37. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_literals.h +2 -2
  38. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_sequences.c +50 -24
  39. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_sequences.h +11 -4
  40. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_compress_superblock.c +572 -0
  41. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_compress_superblock.h +32 -0
  42. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_cwksp.h +662 -0
  43. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_double_fast.c +43 -41
  44. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_double_fast.h +2 -2
  45. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_fast.c +85 -80
  46. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_fast.h +2 -2
  47. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_lazy.c +2184 -0
  48. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_lazy.h +125 -0
  49. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_ldm.c +333 -208
  50. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_ldm.h +15 -3
  51. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_ldm_geartab.h +103 -0
  52. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_opt.c +228 -129
  53. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_opt.h +1 -1
  54. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstdmt_compress.c +151 -440
  55. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstdmt_compress.h +110 -0
  56. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/huf_decompress.c +395 -276
  57. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_ddict.c +20 -16
  58. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_ddict.h +3 -3
  59. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_decompress.c +628 -231
  60. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_decompress_block.c +606 -380
  61. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_decompress_block.h +8 -5
  62. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_decompress_internal.h +39 -9
  63. data/ext/zstdlib/zstd-1.5.0/lib/zdict.h +452 -0
  64. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/zstd.h +740 -153
  65. data/ext/zstdlib/{zstd-1.4.2/lib/common → zstd-1.5.0/lib}/zstd_errors.h +3 -1
  66. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzclose.c +1 -1
  67. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzcompatibility.h +1 -1
  68. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzguts.h +0 -0
  69. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzlib.c +9 -9
  70. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzread.c +16 -8
  71. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzwrite.c +8 -8
  72. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/zstd_zlibwrapper.c +131 -45
  73. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/zstd_zlibwrapper.h +1 -1
  74. data/lib/2.2/zstdlib.so +0 -0
  75. data/lib/2.3/zstdlib.so +0 -0
  76. data/lib/2.4/zstdlib.so +0 -0
  77. data/lib/2.5/zstdlib.so +0 -0
  78. data/lib/2.6/zstdlib.so +0 -0
  79. data/lib/2.7/zstdlib.so +0 -0
  80. metadata +76 -67
  81. data/ext/zstdlib/zstd-1.4.2/lib/common/compiler.h +0 -147
  82. data/ext/zstdlib/zstd-1.4.2/lib/common/debug.c +0 -44
  83. data/ext/zstdlib/zstd-1.4.2/lib/common/entropy_common.c +0 -236
  84. data/ext/zstdlib/zstd-1.4.2/lib/common/zstd_internal.h +0 -371
  85. data/ext/zstdlib/zstd-1.4.2/lib/compress/zstd_compress.c +0 -3904
  86. data/ext/zstdlib/zstd-1.4.2/lib/compress/zstd_lazy.c +0 -1111
  87. data/ext/zstdlib/zstd-1.4.2/lib/compress/zstd_lazy.h +0 -67
  88. data/ext/zstdlib/zstd-1.4.2/lib/compress/zstdmt_compress.h +0 -192
data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_decompress_block.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,15 +14,15 @@
  /*-*******************************************************
  * Dependencies
  *********************************************************/
- #include <string.h> /* memcpy, memmove, memset */
- #include "compiler.h" /* prefetch */
- #include "cpu.h" /* bmi2 */
- #include "mem.h" /* low level memory routines */
+ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
+ #include "../common/compiler.h" /* prefetch */
+ #include "../common/cpu.h" /* bmi2 */
+ #include "../common/mem.h" /* low level memory routines */
  #define FSE_STATIC_LINKING_ONLY
- #include "fse.h"
+ #include "../common/fse.h"
  #define HUF_STATIC_LINKING_ONLY
- #include "huf.h"
- #include "zstd_internal.h"
+ #include "../common/huf.h"
+ #include "../common/zstd_internal.h"
  #include "zstd_decompress_internal.h" /* ZSTD_DCtx */
  #include "zstd_ddict.h" /* ZSTD_DDictDictContent */
  #include "zstd_decompress_block.h"
@@ -44,7 +44,7 @@
  /*_*******************************************************
  * Memory operations
  **********************************************************/
- static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }
 
 
  /*-*************************************************************
@@ -56,7 +56,7 @@ static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
  size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
  blockProperties_t* bpPtr)
  {
- RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong);
+ RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");
 
  { U32 const cBlockHeader = MEM_readLE24(src);
  U32 const cSize = cBlockHeader >> 3;
@@ -64,7 +64,7 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
  bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
  bpPtr->origSize = cSize; /* only useful for RLE */
  if (bpPtr->blockType == bt_rle) return 1;
- RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected);
+ RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
  return cSize;
  }
  }
@@ -79,7 +79,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
  {
- RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected);
+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
+ RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");
 
  { const BYTE* const istart = (const BYTE*) src;
  symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
@@ -87,7 +88,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  switch(litEncType)
  {
  case set_repeat:
- RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted);
+ DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
+ RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
  /* fall-through */
 
  case set_compressed:
@@ -116,11 +118,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  /* 2 - 2 - 18 - 18 */
  lhSize = 5;
  litSize = (lhc >> 4) & 0x3FFFF;
- litCSize = (lhc >> 22) + (istart[4] << 10);
+ litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
  break;
  }
- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected);
- RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected);
+ RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+ RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
 
  /* prefetch huffman table if cold */
  if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
@@ -158,13 +160,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  }
  }
 
- RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected);
+ RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");
 
  dctx->litPtr = dctx->litBuffer;
  dctx->litSize = litSize;
  dctx->litEntropy = 1;
  if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
- memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+ ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
  return litCSize + lhSize;
  }
 
@@ -188,11 +190,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  }
 
  if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
- RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected);
- memcpy(dctx->litBuffer, istart+lhSize, litSize);
+ RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
+ ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize);
  dctx->litPtr = dctx->litBuffer;
  dctx->litSize = litSize;
- memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+ ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
  return lhSize+litSize;
  }
  /* direct reference into compressed stream */
@@ -220,8 +222,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
  break;
  }
- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected);
- memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+ RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
  dctx->litPtr = dctx->litBuffer;
  dctx->litSize = litSize;
  return lhSize+1;
@@ -234,7 +236,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
 
  /* Default FSE distribution tables.
  * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
- * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
+ * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
  * They were generated programmatically with following method :
  * - start from default distributions, present in /lib/common/zstd_internal.h
  * - generate tables normally, using ZSTD_buildFSETable()
@@ -362,23 +364,26 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
  * generate FSE decoding table for one symbol (ll, ml or off)
  * cannot fail if input is valid =>
  * all inputs are presumed validated at this stage */
- void
- ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+ FORCE_INLINE_TEMPLATE
+ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
  const short* normalizedCounter, unsigned maxSymbolValue,
  const U32* baseValue, const U32* nbAdditionalBits,
- unsigned tableLog)
+ unsigned tableLog, void* wksp, size_t wkspSize)
  {
  ZSTD_seqSymbol* const tableDecode = dt+1;
- U16 symbolNext[MaxSeq+1];
-
  U32 const maxSV1 = maxSymbolValue + 1;
  U32 const tableSize = 1 << tableLog;
- U32 highThreshold = tableSize-1;
+
+ U16* symbolNext = (U16*)wksp;
+ BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
+ U32 highThreshold = tableSize - 1;
+
 
  /* Sanity Checks */
  assert(maxSymbolValue <= MaxSeq);
  assert(tableLog <= MaxFSELog);
-
+ assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
+ (void)wkspSize;
  /* Init, lay down lowprob symbols */
  { ZSTD_seqSymbol_header DTableH;
  DTableH.tableLog = tableLog;
@@ -391,18 +396,72 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
  symbolNext[s] = 1;
  } else {
  if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
- symbolNext[s] = normalizedCounter[s];
+ assert(normalizedCounter[s]>=0);
+ symbolNext[s] = (U16)normalizedCounter[s];
  } } }
- memcpy(dt, &DTableH, sizeof(DTableH));
+ ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
  }
 
  /* Spread symbols */
- { U32 const tableMask = tableSize-1;
+ assert(tableSize <= 512);
+ /* Specialized symbol spreading for the case when there are
+ * no low probability (-1 count) symbols. When compressing
+ * small blocks we avoid low probability symbols to hit this
+ * case, since header decoding speed matters more.
+ */
+ if (highThreshold == tableSize - 1) {
+ size_t const tableMask = tableSize-1;
+ size_t const step = FSE_TABLESTEP(tableSize);
+ /* First lay down the symbols in order.
+ * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
+ * misses since small blocks generally have small table logs, so nearly
+ * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
+ * our buffer to handle the over-write.
+ */
+ {
+ U64 const add = 0x0101010101010101ull;
+ size_t pos = 0;
+ U64 sv = 0;
+ U32 s;
+ for (s=0; s<maxSV1; ++s, sv += add) {
+ int i;
+ int const n = normalizedCounter[s];
+ MEM_write64(spread + pos, sv);
+ for (i = 8; i < n; i += 8) {
+ MEM_write64(spread + pos + i, sv);
+ }
+ pos += n;
+ }
+ }
+ /* Now we spread those positions across the table.
+ * The benefit of doing it in two stages is that we avoid the the
+ * variable size inner loop, which caused lots of branch misses.
+ * Now we can run through all the positions without any branch misses.
+ * We unroll the loop twice, since that is what emperically worked best.
+ */
+ {
+ size_t position = 0;
+ size_t s;
+ size_t const unroll = 2;
+ assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+ for (s = 0; s < (size_t)tableSize; s += unroll) {
+ size_t u;
+ for (u = 0; u < unroll; ++u) {
+ size_t const uPosition = (position + (u * step)) & tableMask;
+ tableDecode[uPosition].baseValue = spread[s + u];
+ }
+ position = (position + (unroll * step)) & tableMask;
+ }
+ assert(position == 0);
+ }
+ } else {
+ U32 const tableMask = tableSize-1;
  U32 const step = FSE_TABLESTEP(tableSize);
  U32 s, position = 0;
  for (s=0; s<maxSV1; s++) {
  int i;
- for (i=0; i<normalizedCounter[s]; i++) {
+ int const n = normalizedCounter[s];
+ for (i=0; i<n; i++) {
  tableDecode[position].baseValue = s;
  position = (position + step) & tableMask;
  while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
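Note: the two-stage spread added above is easier to follow outside the diff. The sketch below is an illustrative simplification in plain C (the names, the memcpy stand-in for MEM_write64, and the dropped unrolling are mine, not the gem's); it assumes no low-probability symbols and a power-of-two tableSize of at least 32, so the FSE step is odd and visits every slot exactly once.

    #include <stdint.h>
    #include <string.h>

    #define TABLESTEP(ts) (((ts) >> 1) + ((ts) >> 3) + 3) /* usual FSE step */

    /* spread[] needs 8 bytes of slack past the total count for the wide stores */
    static void spread_two_stage(uint8_t *table, uint8_t *spread, size_t tableSize,
                                 const int16_t *counts, unsigned maxSV1)
    {
        size_t const mask = tableSize - 1;
        size_t const step = TABLESTEP(tableSize);
        /* Stage 1: lay the symbols down linearly, 8 identical bytes per store,
         * so the variable-length inner loop of the classic spread disappears. */
        uint64_t sv = 0; /* all 8 bytes hold the current symbol value */
        size_t pos = 0;
        for (unsigned s = 0; s < maxSV1; ++s, sv += 0x0101010101010101ull) {
            int const n = counts[s];
            memcpy(spread + pos, &sv, 8);
            for (int i = 8; i < n; i += 8)
                memcpy(spread + pos + i, &sv, 8);
            pos += (size_t)n;
        }
        /* Stage 2: scatter the linear run across the table with a fixed step;
         * the step is odd, hence coprime with tableSize, so no slot repeats. */
        size_t position = 0;
        for (size_t i = 0; i < tableSize; ++i) {
            table[position] = spread[i];
            position = (position + step) & mask;
        }
    }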
@@ -411,7 +470,8 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
  }
 
  /* Build Decoding table */
- { U32 u;
+ {
+ U32 u;
  for (u=0; u<tableSize; u++) {
  U32 const symbol = tableDecode[u].baseValue;
  U32 const nextState = symbolNext[symbol]++;
@@ -420,7 +480,46 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
  assert(nbAdditionalBits[symbol] < 255);
  tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
  tableDecode[u].baseValue = baseValue[symbol];
- } }
+ }
+ }
+ }
+
+ /* Avoids the FORCE_INLINE of the _body() function. */
+ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U32* nbAdditionalBits,
+ unsigned tableLog, void* wksp, size_t wkspSize)
+ {
+ ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+ }
+
+ #if DYNAMIC_BMI2
+ TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U32* nbAdditionalBits,
+ unsigned tableLog, void* wksp, size_t wkspSize)
+ {
+ ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+ }
+ #endif
+
+ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U32* nbAdditionalBits,
+ unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
+ {
+ #if DYNAMIC_BMI2
+ if (bmi2) {
+ ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+ return;
+ }
+ #endif
+ (void)bmi2;
+ ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
  }
 
 
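Note: the _body/_default/_bmi2 trio above is a recurring zstd dispatch pattern: one force-inlined body is compiled twice, plain and with the BMI2 target attribute, and a runtime flag picks the specialization. A minimal sketch of the same pattern on a toy function (the names and guard macro are mine; GCC/Clang attributes assumed):

    #if defined(__GNUC__) && defined(__x86_64__)
    #  define HAS_DYNAMIC_BMI2 1
    #else
    #  define HAS_DYNAMIC_BMI2 0
    #endif

    /* single source of truth, inlined into each specialization */
    static inline __attribute__((always_inline))
    unsigned body(unsigned x) { return x * 2654435761u; }

    static unsigned body_default(unsigned x) { return body(x); }

    #if HAS_DYNAMIC_BMI2
    __attribute__((target("bmi2"))) /* same code, BMI2 codegen enabled */
    static unsigned body_bmi2(unsigned x) { return body(x); }
    #endif

    unsigned dispatch(unsigned x, int bmi2) /* bmi2 comes from a one-time CPU probe */
    {
    #if HAS_DYNAMIC_BMI2
        if (bmi2) return body_bmi2(x);
    #endif
        (void)bmi2;
        return body_default(x);
    }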
@@ -432,13 +531,14 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
  const void* src, size_t srcSize,
  const U32* baseValue, const U32* nbAdditionalBits,
  const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
- int ddictIsCold, int nbSeq)
+ int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
+ int bmi2)
  {
  switch(type)
  {
  case set_rle :
- RETURN_ERROR_IF(!srcSize, srcSize_wrong);
- RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected);
+ RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
+ RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
  { U32 const symbol = *(const BYTE*)src;
  U32 const baseline = baseValue[symbol];
  U32 const nbBits = nbAdditionalBits[symbol];
@@ -450,7 +550,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
  *DTablePtr = defaultTable;
  return 0;
  case set_repeat:
- RETURN_ERROR_IF(!flagRepeatTable, corruption_detected);
+ RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
  /* prefetch FSE table if used */
  if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
  const void* const pStart = *DTablePtr;
@@ -462,9 +562,9 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
  { unsigned tableLog;
  S16 norm[MaxSeq+1];
  size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
- RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected);
- RETURN_ERROR_IF(tableLog > maxLog, corruption_detected);
- ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
+ RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
+ RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
+ ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
  *DTablePtr = DTableSpace;
  return headerSize;
  }
@@ -477,35 +577,36 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
  size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
  const void* src, size_t srcSize)
  {
- const BYTE* const istart = (const BYTE* const)src;
+ const BYTE* const istart = (const BYTE*)src;
  const BYTE* const iend = istart + srcSize;
  const BYTE* ip = istart;
  int nbSeq;
  DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
 
  /* check */
- RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong);
+ RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");
 
  /* SeqHead */
  nbSeq = *ip++;
  if (!nbSeq) {
  *nbSeqPtr=0;
- RETURN_ERROR_IF(srcSize != 1, srcSize_wrong);
+ RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
  return 1;
  }
  if (nbSeq > 0x7F) {
  if (nbSeq == 0xFF) {
- RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong);
- nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+ RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
+ nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
+ ip+=2;
  } else {
- RETURN_ERROR_IF(ip >= iend, srcSize_wrong);
+ RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
  nbSeq = ((nbSeq-0x80)<<8) + *ip++;
  }
  }
  *nbSeqPtr = nbSeq;
 
  /* FSE table descriptors */
- RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong); /* minimum possible size: 1 byte for symbol encoding types */
+ RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
  { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
  symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
  symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
@@ -517,8 +618,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
  ip, iend-ip,
  LL_base, LL_bits,
  LL_defaultDTable, dctx->fseEntropy,
- dctx->ddictIsCold, nbSeq);
- RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected);
+ dctx->ddictIsCold, nbSeq,
+ dctx->workspace, sizeof(dctx->workspace),
+ dctx->bmi2);
+ RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
  ip += llhSize;
  }
 
@@ -527,8 +630,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
  ip, iend-ip,
  OF_base, OF_bits,
  OF_defaultDTable, dctx->fseEntropy,
- dctx->ddictIsCold, nbSeq);
- RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected);
+ dctx->ddictIsCold, nbSeq,
+ dctx->workspace, sizeof(dctx->workspace),
+ dctx->bmi2);
+ RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
  ip += ofhSize;
  }
 
@@ -537,8 +642,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
  ip, iend-ip,
  ML_base, ML_bits,
  ML_defaultDTable, dctx->fseEntropy,
- dctx->ddictIsCold, nbSeq);
- RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected);
+ dctx->ddictIsCold, nbSeq,
+ dctx->workspace, sizeof(dctx->workspace),
+ dctx->bmi2);
+ RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
  ip += mlhSize;
  }
  }
@@ -551,7 +658,6 @@ typedef struct {
  size_t litLength;
  size_t matchLength;
  size_t offset;
- const BYTE* match;
  } seq_t;
 
  typedef struct {
@@ -565,59 +671,135 @@ typedef struct {
  ZSTD_fseState stateOffb;
  ZSTD_fseState stateML;
  size_t prevOffset[ZSTD_REP_NUM];
- const BYTE* prefixStart;
- const BYTE* dictEnd;
- size_t pos;
  } seqState_t;
 
+ /*! ZSTD_overlapCopy8() :
+ * Copies 8 bytes from ip to op and updates op and ip where ip <= op.
+ * If the offset is < 8 then the offset is spread to at least 8 bytes.
+ *
+ * Precondition: *ip <= *op
+ * Postcondition: *op - *op >= 8
+ */
+ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
+ assert(*ip <= *op);
+ if (offset < 8) {
+ /* close range match, overlap */
+ static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */
+ static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */
+ int const sub2 = dec64table[offset];
+ (*op)[0] = (*ip)[0];
+ (*op)[1] = (*ip)[1];
+ (*op)[2] = (*ip)[2];
+ (*op)[3] = (*ip)[3];
+ *ip += dec32table[offset];
+ ZSTD_copy4(*op+4, *ip);
+ *ip -= sub2;
+ } else {
+ ZSTD_copy8(*op, *ip);
+ }
+ *ip += 8;
+ *op += 8;
+ assert(*op - *ip >= 8);
+ }
+
+ /*! ZSTD_safecopy() :
+ * Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
+ * and write up to 16 bytes past oend_w (op >= oend_w is allowed).
+ * This function is only called in the uncommon case where the sequence is near the end of the block. It
+ * should be fast for a single long sequence, but can be slow for several short sequences.
+ *
+ * @param ovtype controls the overlap detection
+ * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
+ * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
+ * The src buffer must be before the dst buffer.
+ */
+ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
+ ptrdiff_t const diff = op - ip;
+ BYTE* const oend = op + length;
+
+ assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||
+ (ovtype == ZSTD_overlap_src_before_dst && diff >= 0));
+
+ if (length < 8) {
+ /* Handle short lengths. */
+ while (op < oend) *op++ = *ip++;
+ return;
+ }
+ if (ovtype == ZSTD_overlap_src_before_dst) {
+ /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
+ assert(length >= 8);
+ ZSTD_overlapCopy8(&op, &ip, diff);
+ assert(op - ip >= 8);
+ assert(op <= oend);
+ }
 
- /* ZSTD_execSequenceLast7():
- * exceptional case : decompress a match starting within last 7 bytes of output buffer.
- * requires more careful checks, to ensure there is no overflow.
- * performance does not matter though.
- * note : this case is supposed to be never generated "naturally" by reference encoder,
- * since in most cases it needs at least 8 bytes to look for a match.
- * but it's allowed by the specification. */
+ if (oend <= oend_w) {
+ /* No risk of overwrite. */
+ ZSTD_wildcopy(op, ip, length, ovtype);
+ return;
+ }
+ if (op <= oend_w) {
+ /* Wildcopy until we get close to the end. */
+ assert(oend > oend_w);
+ ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
+ ip += oend_w - op;
+ op = oend_w;
+ }
+ /* Handle the leftovers. */
+ while (op < oend) *op++ = *ip++;
+ }
+
+ /* ZSTD_execSequenceEnd():
+ * This version handles cases that are near the end of the output buffer. It requires
+ * more careful checks to make sure there is no overflow. By separating out these hard
+ * and unlikely cases, we can speed up the common cases.
+ *
+ * NOTE: This function needs to be fast for a single long sequence, but doesn't need
+ * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
+ */
  FORCE_NOINLINE
- size_t ZSTD_execSequenceLast7(BYTE* op,
- BYTE* const oend, seq_t sequence,
- const BYTE** litPtr, const BYTE* const litLimit,
- const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+ size_t ZSTD_execSequenceEnd(BYTE* op,
+ BYTE* const oend, seq_t sequence,
+ const BYTE** litPtr, const BYTE* const litLimit,
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
  {
  BYTE* const oLitEnd = op + sequence.litLength;
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
- BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
  const BYTE* const iLitEnd = *litPtr + sequence.litLength;
  const BYTE* match = oLitEnd - sequence.offset;
+ BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
 
- /* check */
- RETURN_ERROR_IF(oMatchEnd>oend, dstSize_tooSmall, "last match must fit within dstBuffer");
- RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "try to read beyond literal buffer");
+ /* bounds checks : careful of address space overflow in 32-bit mode */
+ RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
+ RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
+ assert(op < op + sequenceLength);
+ assert(oLitEnd < op + sequenceLength);
 
  /* copy literals */
- while (op < oLitEnd) *op++ = *(*litPtr)++;
+ ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
+ op = oLitEnd;
+ *litPtr = iLitEnd;
 
  /* copy Match */
- if (sequence.offset > (size_t)(oLitEnd - base)) {
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
  /* offset beyond prefix */
- RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - vBase),corruption_detected);
- match = dictEnd - (base-match);
+ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
+ match = dictEnd - (prefixStart-match);
  if (match + sequence.matchLength <= dictEnd) {
- memmove(oLitEnd, match, sequence.matchLength);
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
  return sequenceLength;
  }
  /* span extDict & currentPrefixSegment */
  { size_t const length1 = dictEnd - match;
- memmove(oLitEnd, match, length1);
+ ZSTD_memmove(oLitEnd, match, length1);
  op = oLitEnd + length1;
  sequence.matchLength -= length1;
- match = base;
+ match = prefixStart;
  } }
- while (op < oMatchEnd) *op++ = *match++;
+ ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
  return sequenceLength;
  }
 
-
  HINT_INLINE
  size_t ZSTD_execSequence(BYTE* op,
  BYTE* const oend, seq_t sequence,
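Note: ZSTD_overlapCopy8() above is the core of the new overlap handling. Restated as self-contained C (same logic and tables as in the hunk, with memcpy standing in for ZSTD_copy4/ZSTD_copy8) it can be compiled and tested in isolation:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    static void overlap_copy8(uint8_t **op, const uint8_t **ip, size_t offset)
    {
        assert(*ip <= *op);
        if (offset < 8) {
            /* close-range match: replicate the pattern 4+4 bytes, then move ip
             * back so op - ip becomes >= 8 for all later wide copies */
            static const uint32_t dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };  /* added */
            static const int      dec64table[] = { 8, 8, 8, 7, 8, 9, 10, 11 }; /* subtracted */
            int const sub2 = dec64table[offset];
            (*op)[0] = (*ip)[0];
            (*op)[1] = (*ip)[1];
            (*op)[2] = (*ip)[2];
            (*op)[3] = (*ip)[3];
            *ip += dec32table[offset];
            memcpy(*op + 4, *ip, 4);
            *ip -= sub2;
        } else {
            memcpy(*op, *ip, 8);
        }
        *ip += 8;
        *op += 8;
        assert(*op - *ip >= 8);
    }

After a single call the op/ip distance is at least 8, so even an offset-1 (RLE-style) match can continue with plain 8-byte copies, which is what ZSTD_safecopy() and ZSTD_execSequence() rely on.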
@@ -627,155 +809,85 @@ size_t ZSTD_execSequence(BYTE* op,
  BYTE* const oLitEnd = op + sequence.litLength;
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
  BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
- BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+ BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */
  const BYTE* const iLitEnd = *litPtr + sequence.litLength;
  const BYTE* match = oLitEnd - sequence.offset;
 
- /* check */
- RETURN_ERROR_IF(oMatchEnd>oend, dstSize_tooSmall, "last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend");
- RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "over-read beyond lit buffer");
- if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
-
- /* copy Literals */
- if (sequence.litLength > 8)
- ZSTD_wildcopy_16min(op, (*litPtr), sequence.litLength, ZSTD_no_overlap); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
- else
- ZSTD_copy8(op, *litPtr);
+ assert(op != NULL /* Precondition */);
+ assert(oend_w < oend /* No underflow */);
+ /* Handle edge cases in a slow path:
+ * - Read beyond end of literals
+ * - Match end is within WILDCOPY_OVERLIMIT of oend
+ * - 32-bit mode and the match length overflows
+ */
+ if (UNLIKELY(
+ iLitEnd > litLimit ||
+ oMatchEnd > oend_w ||
+ (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
+ return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
+
+ /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
+ assert(op <= oLitEnd /* No overflow */);
+ assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
+ assert(oMatchEnd <= oend /* No underflow */);
+ assert(iLitEnd <= litLimit /* Literal length is in bounds */);
+ assert(oLitEnd <= oend_w /* Can wildcopy literals */);
+ assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
+
+ /* Copy Literals:
+ * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
+ * We likely don't need the full 32-byte wildcopy.
+ */
+ assert(WILDCOPY_OVERLENGTH >= 16);
+ ZSTD_copy16(op, (*litPtr));
+ if (UNLIKELY(sequence.litLength > 16)) {
+ ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
+ }
  op = oLitEnd;
  *litPtr = iLitEnd; /* update for next sequence */
 
- /* copy Match */
+ /* Copy Match */
  if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
  /* offset beyond prefix -> go into extDict */
- RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected);
+ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
  match = dictEnd + (match - prefixStart);
  if (match + sequence.matchLength <= dictEnd) {
- memmove(oLitEnd, match, sequence.matchLength);
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
  return sequenceLength;
  }
  /* span extDict & currentPrefixSegment */
  { size_t const length1 = dictEnd - match;
- memmove(oLitEnd, match, length1);
+ ZSTD_memmove(oLitEnd, match, length1);
  op = oLitEnd + length1;
  sequence.matchLength -= length1;
  match = prefixStart;
- if (op > oend_w || sequence.matchLength < MINMATCH) {
- U32 i;
- for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i];
- return sequenceLength;
- }
  } }
- /* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */
-
- /* match within prefix */
- if (sequence.offset < 8) {
- /* close range match, overlap */
- static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */
- static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */
- int const sub2 = dec64table[sequence.offset];
- op[0] = match[0];
- op[1] = match[1];
- op[2] = match[2];
- op[3] = match[3];
- match += dec32table[sequence.offset];
- ZSTD_copy4(op+4, match);
- match -= sub2;
- } else {
- ZSTD_copy8(op, match);
- }
- op += 8; match += 8;
-
- if (oMatchEnd > oend-(16-MINMATCH)) {
- if (op < oend_w) {
- ZSTD_wildcopy(op, match, oend_w - op, ZSTD_overlap_src_before_dst);
- match += oend_w - op;
- op = oend_w;
- }
- while (op < oMatchEnd) *op++ = *match++;
- } else {
- ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); /* works even if matchLength < 8 */
+ /* Match within prefix of 1 or more bytes */
+ assert(op <= oMatchEnd);
+ assert(oMatchEnd <= oend_w);
+ assert(match >= prefixStart);
+ assert(sequence.matchLength >= 1);
+
+ /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
+ * without overlap checking.
+ */
+ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
+ /* We bet on a full wildcopy for matches, since we expect matches to be
+ * longer than literals (in general). In silesia, ~10% of matches are longer
+ * than 16 bytes.
+ */
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
+ return sequenceLength;
  }
- return sequenceLength;
- }
-
-
- HINT_INLINE
- size_t ZSTD_execSequenceLong(BYTE* op,
- BYTE* const oend, seq_t sequence,
- const BYTE** litPtr, const BYTE* const litLimit,
- const BYTE* const prefixStart, const BYTE* const dictStart, const BYTE* const dictEnd)
- {
- BYTE* const oLitEnd = op + sequence.litLength;
- size_t const sequenceLength = sequence.litLength + sequence.matchLength;
- BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
- BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
- const BYTE* const iLitEnd = *litPtr + sequence.litLength;
- const BYTE* match = sequence.match;
-
- /* check */
- RETURN_ERROR_IF(oMatchEnd > oend, dstSize_tooSmall, "last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend");
- RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "over-read beyond lit buffer");
- if (oLitEnd > oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, dictStart, dictEnd);
-
- /* copy Literals */
- if (sequence.litLength > 8)
- ZSTD_wildcopy_16min(op, *litPtr, sequence.litLength, ZSTD_no_overlap); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
- else
- ZSTD_copy8(op, *litPtr); /* note : op <= oLitEnd <= oend_w == oend - 8 */
-
- op = oLitEnd;
- *litPtr = iLitEnd; /* update for next sequence */
-
- /* copy Match */
- if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
- /* offset beyond prefix */
- RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - dictStart), corruption_detected);
- if (match + sequence.matchLength <= dictEnd) {
- memmove(oLitEnd, match, sequence.matchLength);
- return sequenceLength;
- }
- /* span extDict & currentPrefixSegment */
- { size_t const length1 = dictEnd - match;
- memmove(oLitEnd, match, length1);
- op = oLitEnd + length1;
- sequence.matchLength -= length1;
- match = prefixStart;
- if (op > oend_w || sequence.matchLength < MINMATCH) {
- U32 i;
- for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i];
- return sequenceLength;
- }
- } }
- assert(op <= oend_w);
- assert(sequence.matchLength >= MINMATCH);
+ assert(sequence.offset < WILDCOPY_VECLEN);
 
- /* match within prefix */
- if (sequence.offset < 8) {
- /* close range match, overlap */
- static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */
- static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */
- int const sub2 = dec64table[sequence.offset];
- op[0] = match[0];
- op[1] = match[1];
- op[2] = match[2];
- op[3] = match[3];
- match += dec32table[sequence.offset];
- ZSTD_copy4(op+4, match);
- match -= sub2;
- } else {
- ZSTD_copy8(op, match);
- }
- op += 8; match += 8;
+ /* Copy 8 bytes and spread the offset to be >= 8. */
+ ZSTD_overlapCopy8(&op, &match, sequence.offset);
 
- if (oMatchEnd > oend-(16-MINMATCH)) {
- if (op < oend_w) {
- ZSTD_wildcopy(op, match, oend_w - op, ZSTD_overlap_src_before_dst);
- match += oend_w - op;
- op = oend_w;
- }
- while (op < oMatchEnd) *op++ = *match++;
- } else {
- ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); /* works even if matchLength < 8 */
+ /* If the match length is > 8 bytes, then continue with the wildcopy. */
+ if (sequence.matchLength > 8) {
+ assert(op < oMatchEnd);
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);
  }
  return sequenceLength;
  }
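Note: the reworked literal copy above bets on litLength <= 16 and always issues one 16-byte copy first. In isolation the fast path looks like this (illustrative only; memcpy stands in for ZSTD_copy16/ZSTD_wildcopy, and the destination is presumed to have WILDCOPY_OVERLENGTH bytes of slack):

    #include <stdint.h>
    #include <string.h>

    static void copy_literals(uint8_t *op, const uint8_t *lit, size_t litLength)
    {
        memcpy(op, lit, 16);      /* unconditional: the common short case ends here */
        if (litLength > 16)       /* rare long run: copy the remainder wide */
            memcpy(op + 16, lit + 16, litLength - 16);
    }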
@@ -801,6 +913,14 @@ ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
  DStatePtr->state = DInfo.nextState + lowBits;
  }
 
+ FORCE_INLINE_TEMPLATE void
+ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo)
+ {
+ U32 const nbBits = DInfo.nbBits;
+ size_t const lowBits = BIT_readBits(bitD, nbBits);
+ DStatePtr->state = DInfo.nextState + lowBits;
+ }
+
  /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
  * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
  * bits before reloading. This value is the maximum number of bytes we read
@@ -813,24 +933,24 @@ ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
 
  typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
 
- #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
  FORCE_INLINE_TEMPLATE seq_t
  ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
  {
  seq_t seq;
- U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
- U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
- U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
- U32 const totalBits = llBits+mlBits+ofBits;
- U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
- U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
- U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
+ ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
+ ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state];
+ ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state];
+ U32 const llBase = llDInfo.baseValue;
+ U32 const mlBase = mlDInfo.baseValue;
+ U32 const ofBase = ofDInfo.baseValue;
+ BYTE const llBits = llDInfo.nbAdditionalBits;
+ BYTE const mlBits = mlDInfo.nbAdditionalBits;
+ BYTE const ofBits = ofDInfo.nbAdditionalBits;
+ BYTE const totalBits = llBits+mlBits+ofBits;
 
  /* sequence */
  { size_t offset;
- if (!ofBits)
- offset = 0;
- else {
+ if (ofBits > 1) {
  ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
  ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
  assert(ofBits <= MaxOff);
@@ -844,63 +964,138 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
  offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
  if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
  }
- }
-
- if (ofBits <= 1) {
- offset += (llBase==0);
- if (offset) {
- size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
- if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
- seqState->prevOffset[1] = seqState->prevOffset[0];
- seqState->prevOffset[0] = offset = temp;
- } else { /* offset == 0 */
- offset = seqState->prevOffset[0];
- }
- } else {
  seqState->prevOffset[2] = seqState->prevOffset[1];
  seqState->prevOffset[1] = seqState->prevOffset[0];
  seqState->prevOffset[0] = offset;
- }
+ } else {
+ U32 const ll0 = (llBase == 0);
+ if (LIKELY((ofBits == 0))) {
+ if (LIKELY(!ll0))
+ offset = seqState->prevOffset[0];
+ else {
+ offset = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset;
+ }
+ } else {
+ offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
+ { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+ temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
+ if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset = temp;
+ } } }
  seq.offset = offset;
  }
 
- seq.matchLength = mlBase
- + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0); /* <= 16 bits */
+ seq.matchLength = mlBase;
+ if (mlBits > 0)
+ seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
+
  if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
  BIT_reloadDStream(&seqState->DStream);
- if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+ if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
  BIT_reloadDStream(&seqState->DStream);
  /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
  ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
 
- seq.litLength = llBase
- + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0); /* <= 16 bits */
+ seq.litLength = llBase;
+ if (llBits > 0)
+ seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
+
  if (MEM_32bits())
  BIT_reloadDStream(&seqState->DStream);
 
  DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
  (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
 
- /* ANS state update */
- ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
- ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
- ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
+ /* ANS state update
+ * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
+ * clang-9.2.0 does 7% worse with ZSTD_updateFseState().
+ * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the
+ * better option, so it is the default for other compilers. But, if you
+ * measure that it is worse, please put up a pull request.
+ */
+ {
+ #if defined(__GNUC__) && !defined(__clang__)
+ const int kUseUpdateFseState = 1;
+ #else
+ const int kUseUpdateFseState = 0;
+ #endif
+ if (kUseUpdateFseState) {
+ ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
+ ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
+ ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
+ } else {
+ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */
+ }
+ }
 
  return seq;
  }
 
+ #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
+ {
+ size_t const windowSize = dctx->fParams.windowSize;
+ /* No dictionary used. */
+ if (dctx->dictContentEndForFuzzing == NULL) return 0;
+ /* Dictionary is our prefix. */
+ if (prefixStart == dctx->dictContentBeginForFuzzing) return 1;
+ /* Dictionary is not our ext-dict. */
+ if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0;
+ /* Dictionary is not within our window size. */
+ if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0;
+ /* Dictionary is active. */
+ return 1;
+ }
+
+ MEM_STATIC void ZSTD_assertValidSequence(
+ ZSTD_DCtx const* dctx,
+ BYTE const* op, BYTE const* oend,
+ seq_t const seq,
+ BYTE const* prefixStart, BYTE const* virtualStart)
+ {
+ #if DEBUGLEVEL >= 1
+ size_t const windowSize = dctx->fParams.windowSize;
+ size_t const sequenceSize = seq.litLength + seq.matchLength;
+ BYTE const* const oLitEnd = op + seq.litLength;
+ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+ assert(op <= oend);
+ assert((size_t)(oend - op) >= sequenceSize);
+ assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
+ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
+ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
+ /* Offset must be within the dictionary. */
+ assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
+ assert(seq.offset <= windowSize + dictSize);
+ } else {
+ /* Offset must be within our window. */
+ assert(seq.offset <= windowSize);
+ }
+ #else
+ (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
+ #endif
+ }
+ #endif
+
+ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
  FORCE_INLINE_TEMPLATE size_t
  DONT_VECTORIZE
  ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
  const BYTE* ip = (const BYTE*)seqStart;
  const BYTE* const iend = ip + seqSize;
- BYTE* const ostart = (BYTE* const)dst;
+ BYTE* const ostart = (BYTE*)dst;
  BYTE* const oend = ostart + maxDstSize;
  BYTE* op = ostart;
  const BYTE* litPtr = dctx->litPtr;
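Note: the repeat-offset branch above is the densest part of ZSTD_decodeSequence(). Its history update can be restated standalone (assuming `code` is the 1..3 repcode index that the hunk computes as ofBase + ll0 + one extra bit):

    /* prevOffset[0..2] is the three-deep repeat-offset history */
    static size_t resolve_repcode(size_t prevOffset[3], size_t code)
    {
        size_t temp = (code == 3) ? prevOffset[0] - 1 : prevOffset[code];
        temp += !temp; /* 0 is not valid; corrupted input is forced to offset 1 */
        if (code != 1) prevOffset[2] = prevOffset[1];
        prevOffset[1] = prevOffset[0];
        prevOffset[0] = temp;
        return temp;
    }

The ofBits == 0 path skips this entirely: with a nonzero literal length it reuses prevOffset[0] untouched, and with litLength == 0 it swaps prevOffset[0] and prevOffset[1], both exactly as in the hunk.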
@@ -909,6 +1104,7 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
  const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
  DEBUGLOG(5, "ZSTD_decompressSequences_body");
+ (void)frame;
 
  /* Regen sequences */
  if (nbSeq) {
@@ -917,38 +1113,97 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
917
1113
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
918
1114
  RETURN_ERROR_IF(
919
1115
  ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
920
- corruption_detected);
1116
+ corruption_detected, "");
921
1117
  ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
922
1118
  ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
923
1119
  ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
1120
+ assert(dst != NULL);
924
1121
 
925
1122
  ZSTD_STATIC_ASSERT(
926
1123
  BIT_DStream_unfinished < BIT_DStream_completed &&
927
1124
  BIT_DStream_endOfBuffer < BIT_DStream_completed &&
928
1125
  BIT_DStream_completed < BIT_DStream_overflow);
929
1126
 
930
- for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) {
931
- nbSeq--;
932
- { seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
933
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
934
- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
935
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
936
- op += oneSeqSize;
937
- } }
1127
+ #if defined(__GNUC__) && defined(__x86_64__)
1128
+ /* Align the decompression loop to 32 + 16 bytes.
1129
+ *
1130
+ * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
1131
+ * speed swings based on the alignment of the decompression loop. This
1132
+ * performance swing is caused by parts of the decompression loop falling
1133
+ * out of the DSB. The entire decompression loop should fit in the DSB,
1134
+ * when it can't we get much worse performance. You can measure if you've
1135
+ * hit the good case or the bad case with this perf command for some
1136
+ * compressed file test.zst:
1137
+ *
1138
+ * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
1139
+ * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
1140
+ *
1141
+ * If you see most cycles served out of the MITE you've hit the bad case.
1142
+ * If you see most cycles served out of the DSB you've hit the good case.
1143
+ * If it is pretty even then you may be in an okay case.
1144
+ *
1145
+ * This issue has been reproduced on the following CPUs:
1146
+ * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1147
+ * Use Instruments->Counters to get DSB/MITE cycles.
1148
+ * I never got performance swings, but I was able to
1149
+ * go from the good case of mostly DSB to half of the
1150
+ * cycles served from MITE.
1151
+ * - Coffeelake: Intel i9-9900k
1152
+ * - Coffeelake: Intel i7-9700k
1153
+ *
1154
+ * I haven't been able to reproduce the instability or DSB misses on any
1155
+ * of the following CPUS:
1156
+ * - Haswell
1157
+ * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
1158
+ * - Skylake
1159
+ *
1160
+ * If you are seeing performance stability this script can help test.
1161
+ * It tests on 4 commits in zstd where I saw performance change.
1162
+ *
1163
+ * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
1164
+ */
1165
+ __asm__(".p2align 6");
1166
+ __asm__("nop");
1167
+ __asm__(".p2align 5");
1168
+ __asm__("nop");
1169
+ # if __GNUC__ >= 9
1170
+ /* better for gcc-9 and gcc-10, worse for clang and gcc-8 */
1171
+ __asm__(".p2align 3");
1172
+ # else
1173
+ __asm__(".p2align 4");
1174
+ # endif
1175
+ #endif
1176
+ for ( ; ; ) {
1177
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1178
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
1179
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1180
+ assert(!ZSTD_isError(oneSeqSize));
1181
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1182
+ #endif
1183
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1184
+ return oneSeqSize;
1185
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1186
+ op += oneSeqSize;
1187
+ if (UNLIKELY(!--nbSeq))
1188
+ break;
1189
+ BIT_reloadDStream(&(seqState.DStream));
1190
+ }
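
The restructured loop decodes one sequence, executes it, and only reloads the bitstream when more sequences remain; UNLIKELY() keeps the error and exit paths off the hot path. A minimal sketch of that hint, assuming GCC/Clang's __builtin_expect (zstd defines equivalents in its compiler.h):

    /* Branch-layout hints: tell the compiler which way a branch usually goes. */
    #if defined(__GNUC__)
    #  define LIKELY(x)   (__builtin_expect(!!(x), 1))
    #  define UNLIKELY(x) (__builtin_expect(!!(x), 0))
    #else
    #  define LIKELY(x)   (x)   /* no-op fallback on other compilers */
    #  define UNLIKELY(x) (x)
    #endif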

  /* check if reached exact end */
  DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
- RETURN_ERROR_IF(nbSeq, corruption_detected);
- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected);
+ RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+ RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
  /* save reps for next block */
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
  }

  /* last literal segment */
  { size_t const lastLLSize = litEnd - litPtr;
- RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall);
- memcpy(op, litPtr, lastLLSize);
- op += lastLLSize;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
  }

  return op-ostart;
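
The .p2align padding above is easier to see in isolation. Below is a hedged toy (not zstd code) showing the same pattern: GCC-style inline asm pads the instruction stream so the hot loop entry lands on a 32-byte boundary, which helps the whole loop stay resident in the uop cache (DSB):

    #include <stddef.h>
    #include <stdint.h>

    uint64_t sum_bytes(const uint8_t* p, size_t n)
    {
        uint64_t acc = 0;
        size_t i;
    #if defined(__GNUC__) && defined(__x86_64__)
        __asm__(".p2align 5");   /* align the loop entry to 32 bytes */
        __asm__("nop");          /* nudge placement; tune per compiler version */
    #endif
        for (i = 0; i < n; i++)
            acc += p[i];         /* stand-in for the real loop body */
        return acc;
    }

The exact nop/p2align recipe is compiler- and CPU-specific, which is why the code above special-cases gcc-9 and later.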
@@ -958,103 +1213,43 @@ static size_t
  ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
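
The trailing "" argument that appears at every RETURN_ERROR_IF call site in this diff is new in zstd 1.5: the macro grew a printf-style message that debug builds log before returning the error code. A hypothetical reimplementation of the idea, not the library's actual macro:

    #include <stdio.h>

    /* Illustrative only: zstd's real macro lives in its internal headers and
     * returns a proper ZSTD error code rather than (size_t)-1. */
    #define RETURN_ERROR_IF(cond, code, ...) do {                   \
            if (cond) {                                             \
                fprintf(stderr, "error " #code ": " __VA_ARGS__);   \
                fprintf(stderr, "\n");                              \
                return (size_t)-1;                                  \
            }                                                       \
        } while (0)

Most call sites pass an empty message, but the extra slot lets a check attach context, as in the "NULL not handled" check added further down.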

-
-
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
- FORCE_INLINE_TEMPLATE seq_t
- ZSTD_decodeSequenceLong(seqState_t* seqState, ZSTD_longOffset_e const longOffsets)
- {
- seq_t seq;
- U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
- U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
- U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
- U32 const totalBits = llBits+mlBits+ofBits;
- U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
- U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
- U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
-
- /* sequence */
- { size_t offset;
- if (!ofBits)
- offset = 0;
- else {
- ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
- ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
- assert(ofBits <= MaxOff);
- if (MEM_32bits() && longOffsets) {
- U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1);
- offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
- if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream);
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
- } else {
- offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
- }
- }

- if (ofBits <= 1) {
- offset += (llBase==0);
- if (offset) {
- size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
- if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
- seqState->prevOffset[1] = seqState->prevOffset[0];
- seqState->prevOffset[0] = offset = temp;
- } else {
- offset = seqState->prevOffset[0];
- }
- } else {
- seqState->prevOffset[2] = seqState->prevOffset[1];
- seqState->prevOffset[1] = seqState->prevOffset[0];
- seqState->prevOffset[0] = offset;
- }
- seq.offset = offset;
- }
-
- seq.matchLength = mlBase + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */
- if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
- BIT_reloadDStream(&seqState->DStream);
- if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
- BIT_reloadDStream(&seqState->DStream);
- /* Verify that there is enough bits to read the rest of the data in 64-bit mode. */
- ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
-
- seq.litLength = llBase + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */
- if (MEM_32bits())
- BIT_reloadDStream(&seqState->DStream);
-
- { size_t const pos = seqState->pos + seq.litLength;
- const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
- seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
- * No consequence though : no memory access will occur, overly large offset will be detected in ZSTD_execSequenceLong() */
- seqState->pos = pos + seq.matchLength;
+ FORCE_INLINE_TEMPLATE size_t
+ ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
+ const BYTE* const prefixStart, const BYTE* const dictEnd)
+ {
+ prefetchPos += sequence.litLength;
+ { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
+ const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
+ * No consequence though : memory address is only used for prefetching, not for dereferencing */
+ PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
  }
-
- /* ANS state update */
- ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
- ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
- ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
-
- return seq;
+ return prefetchPos + sequence.matchLength;
  }

+ /* This decoding function employs prefetching
+ * to reduce the latency impact of cache misses.
+ * It's generally employed when a block contains a significant portion of long-distance matches
+ * or when coupled with a "cold" dictionary */
  FORCE_INLINE_TEMPLATE size_t
  ZSTD_decompressSequencesLong_body(
  ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
  const BYTE* ip = (const BYTE*)seqStart;
  const BYTE* const iend = ip + seqSize;
- BYTE* const ostart = (BYTE* const)dst;
+ BYTE* const ostart = (BYTE*)dst;
  BYTE* const oend = ostart + maxDstSize;
  BYTE* op = ostart;
  const BYTE* litPtr = dctx->litPtr;
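
ZSTD_prefetchMatch above replaces the old scheme in which each decoded sequence carried a precomputed seq.match pointer: the match address is now derived from a running position and used only as a prefetch hint, never dereferenced. A self-contained sketch of the same idea using GCC/Clang's __builtin_prefetch (PREFETCH_L1 and CACHELINE_SIZE are zstd internals; the 64-byte line size below is an assumption):

    #include <stddef.h>

    static size_t prefetch_match(size_t pos, size_t litLength, size_t matchLength,
                                 size_t offset,
                                 const unsigned char* prefixStart,
                                 const unsigned char* dictEnd)
    {
        pos += litLength;   /* the literals are copied before the match */
        {   const unsigned char* const base =
                (offset > pos) ? dictEnd : prefixStart;  /* match may live in the dictionary */
            const unsigned char* const match = base + pos - offset;
    #if defined(__GNUC__)
            __builtin_prefetch(match, 0, 3);       /* read access, keep in cache */
            __builtin_prefetch(match + 64, 0, 3);  /* next line, assuming 64B cache lines */
    #endif
        }
        return pos + matchLength;   /* running position for the next sequence */
    }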
@@ -1062,51 +1257,62 @@ ZSTD_decompressSequencesLong_body(
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
  const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+ (void)frame;

  /* Regen sequences */
  if (nbSeq) {
- #define STORED_SEQS 4
+ #define STORED_SEQS 8
  #define STORED_SEQS_MASK (STORED_SEQS-1)
- #define ADVANCED_SEQS 4
+ #define ADVANCED_SEQS STORED_SEQS
  seq_t sequences[STORED_SEQS];
  int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
  seqState_t seqState;
  int seqNb;
+ size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */
+
  dctx->fseEntropy = 1;
  { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
- seqState.prefixStart = prefixStart;
- seqState.pos = (size_t)(op-prefixStart);
- seqState.dictEnd = dictEnd;
+ assert(dst != NULL);
  assert(iend >= ip);
  RETURN_ERROR_IF(
  ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
- corruption_detected);
+ corruption_detected, "");
  ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
  ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
  ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);

  /* prepare in advance */
  for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
- sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
- PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+ sequences[seqNb] = sequence;
  }
- RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected);
+ RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");

  /* decode and decompress */
  for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
- seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
- size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
  if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
- PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
  sequences[seqNb & STORED_SEQS_MASK] = sequence;
  op += oneSeqSize;
  }
- RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected);
+ RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");

  /* finish queue */
  seqNb -= seqAdvance;
  for ( ; seqNb<nbSeq ; seqNb++) {
- size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
  if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
  op += oneSeqSize;
  }
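
Doubling STORED_SEQS (4 to 8) deepens the software pipeline: sequence N is decoded and its match prefetched while sequence N - ADVANCED_SEQS, whose prefetch has had eight iterations to land, is executed from the ring. A stripped-down sketch of that structure; decode_one and execute_one are hypothetical stand-ins for ZSTD_decodeSequence and ZSTD_execSequence:

    #define DEPTH 8             /* == STORED_SEQS; a power of two so '&' replaces '%' */
    #define MASK  (DEPTH - 1)

    typedef struct { size_t litLength, matchLength, offset; } seq_sketch_t;

    extern seq_sketch_t decode_one(void);      /* decode + issue prefetch */
    extern void execute_one(seq_sketch_t s);   /* copy literals + match */

    static void pipeline(int nbSeq)
    {
        seq_sketch_t ring[DEPTH];
        int const adv = (nbSeq < DEPTH) ? nbSeq : DEPTH;   /* == seqAdvance */
        int n;
        for (n = 0; n < adv; n++)        /* prime: decode ahead of execution */
            ring[n] = decode_one();
        for (; n < nbSeq; n++) {         /* steady state */
            seq_sketch_t const next = decode_one();
            execute_one(ring[(n - DEPTH) & MASK]);  /* its prefetch is DEPTH old */
            ring[n & MASK] = next;
        }
        for (n -= adv; n < nbSeq; n++)   /* drain the queue */
            execute_one(ring[n & MASK]);
    }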
@@ -1117,9 +1323,11 @@ ZSTD_decompressSequencesLong_body(

  /* last literal segment */
  { size_t const lastLLSize = litEnd - litPtr;
- RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall);
- memcpy(op, litPtr, lastLLSize);
- op += lastLLSize;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
  }

  return op-ostart;
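
The memcpy → ZSTD_memcpy change here is part of a library-wide move in 1.5.0: libc dependencies are routed through zstd_deps.h so that freestanding builds (e.g. the Linux kernel port) can substitute their own primitives. In a hosted build the indirection reduces to the libc call, roughly:

    #include <string.h>

    /* Hosted-build default; freestanding builds define their own. */
    #ifndef ZSTD_memcpy
    #  define ZSTD_memcpy(d, s, n) memcpy((d), (s), (n))
    #endif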
@@ -1129,9 +1337,10 @@ static size_t
  ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

@@ -1145,9 +1354,10 @@ DONT_VECTORIZE
  ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

@@ -1156,9 +1366,10 @@ static TARGET_ATTRIBUTE("bmi2") size_t
  ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

@@ -1168,21 +1379,23 @@ typedef size_t (*ZSTD_decompressSequences_t)(
  ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset);
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame);

  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
  static size_t
  ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
  DEBUGLOG(5, "ZSTD_decompressSequences");
  #if DYNAMIC_BMI2
  if (dctx->bmi2) {
- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif
- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1401
 
@@ -1197,15 +1410,16 @@ static size_t
1197
1410
  ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
1198
1411
  void* dst, size_t maxDstSize,
1199
1412
  const void* seqStart, size_t seqSize, int nbSeq,
1200
- const ZSTD_longOffset_e isLongOffset)
1413
+ const ZSTD_longOffset_e isLongOffset,
1414
+ const int frame)
1201
1415
  {
1202
1416
  DEBUGLOG(5, "ZSTD_decompressSequencesLong");
1203
1417
  #if DYNAMIC_BMI2
1204
1418
  if (dctx->bmi2) {
1205
- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1419
+ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1206
1420
  }
1207
1421
  #endif
1208
- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1422
+ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1209
1423
  }
1210
1424
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
1211
1425
 
@@ -1239,7 +1453,6 @@ ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
1239
1453
  }
1240
1454
  #endif
1241
1455
 
1242
-
1243
1456
  size_t
1244
1457
  ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1245
1458
  void* dst, size_t dstCapacity,
@@ -1255,7 +1468,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1255
1468
  ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
1256
1469
  DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
1257
1470
 
1258
- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong);
1471
+ RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
1259
1472
 
1260
1473
  /* Decode literals section */
1261
1474
  { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
@@ -1281,6 +1494,8 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1281
1494
  ip += seqHSize;
1282
1495
  srcSize -= seqHSize;
1283
1496
 
1497
+ RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
1498
+
1284
1499
  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
1285
1500
  !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
1286
1501
  if ( !usePrefetchDecoder
@@ -1299,23 +1514,34 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1299
1514
  if (usePrefetchDecoder)
1300
1515
  #endif
1301
1516
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
1302
- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
1517
+ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
1303
1518
  #endif
1304
1519
 
1305
1520
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1306
1521
  /* else */
1307
- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
1522
+ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
1308
1523
  #endif
1309
1524
  }
1310
1525
  }
1311
1526
 
1312
1527
 
1528
+ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
1529
+ {
1530
+ if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */
1531
+ dctx->dictEnd = dctx->previousDstEnd;
1532
+ dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
1533
+ dctx->prefixStart = dst;
1534
+ dctx->previousDstEnd = dst;
1535
+ }
1536
+ }
1537
+
1538
+
1313
1539
  size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
1314
1540
  void* dst, size_t dstCapacity,
1315
1541
  const void* src, size_t srcSize)
1316
1542
  {
1317
1543
  size_t dSize;
1318
- ZSTD_checkContinuity(dctx, dst);
1544
+ ZSTD_checkContinuity(dctx, dst, dstCapacity);
1319
1545
  dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
1320
1546
  dctx->previousDstEnd = (char*)dst + dSize;
1321
1547
  return dSize;
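
ZSTD_checkContinuity, which gains a dstSize parameter above, is what makes the block-level API usable when a caller decompresses block after block into one growing buffer: each call compares dst with the previous block's end and rebases the window pointers when the caller has moved. A hedged usage sketch of that contract; the block functions sit behind ZSTD_STATIC_LINKING_ONLY, error handling is trimmed, and ZSTD_decompressBegin() is assumed to have been called on the dctx:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Decompress nbBlocks consecutive raw blocks into one contiguous buffer. */
    static size_t decompress_blocks(ZSTD_DCtx* dctx,
                                    void* dst, size_t dstCapacity,
                                    const void* const blocks[],
                                    const size_t blockSizes[],
                                    size_t nbBlocks)
    {
        char* op = (char*)dst;
        char* const oend = (char*)dst + dstCapacity;
        size_t i;
        for (i = 0; i < nbBlocks; i++) {
            /* each call checks contiguity against the previous block's end,
             * so later blocks may reference bytes produced by earlier ones */
            size_t const dSize = ZSTD_decompressBlock(dctx, op, (size_t)(oend - op),
                                                      blocks[i], blockSizes[i]);
            if (ZSTD_isError(dSize)) return dSize;
            op += dSize;
        }
        return (size_t)(op - (char*)dst);
    }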