zstdlib 0.3.0-x64-mingw32 → 0.8.0-x64-mingw32

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGES.md +30 -1
  3. data/README.md +2 -2
  4. data/Rakefile +1 -1
  5. data/ext/zstdlib/extconf.rb +3 -3
  6. data/ext/zstdlib/ruby/zlib-2.7/zstdlib.c +4895 -0
  7. data/ext/zstdlib/ruby/zlib-3.0/zstdlib.c +4994 -0
  8. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/bitstream.h +59 -51
  9. data/ext/zstdlib/zstd-1.5.0/lib/common/compiler.h +289 -0
  10. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/cpu.h +1 -3
  11. data/ext/zstdlib/zstd-1.5.0/lib/common/debug.c +24 -0
  12. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/debug.h +22 -49
  13. data/ext/zstdlib/zstd-1.5.0/lib/common/entropy_common.c +362 -0
  14. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/error_private.c +3 -1
  15. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/error_private.h +8 -4
  16. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/fse.h +50 -42
  17. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/fse_decompress.c +149 -55
  18. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/huf.h +43 -39
  19. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/mem.h +69 -25
  20. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/pool.c +30 -20
  21. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/pool.h +3 -3
  22. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/threading.c +51 -4
  23. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/threading.h +36 -4
  24. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/xxhash.c +40 -92
  25. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/xxhash.h +12 -32
  26. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/zstd_common.c +10 -10
  27. data/ext/zstdlib/zstd-1.5.0/lib/common/zstd_deps.h +111 -0
  28. data/ext/zstdlib/zstd-1.5.0/lib/common/zstd_internal.h +490 -0
  29. data/ext/zstdlib/zstd-1.5.0/lib/common/zstd_trace.h +154 -0
  30. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/fse_compress.c +47 -63
  31. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/hist.c +41 -63
  32. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/hist.h +13 -33
  33. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/huf_compress.c +332 -193
  34. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_compress.c +6393 -0
  35. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_internal.h +522 -86
  36. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_literals.c +25 -16
  37. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_literals.h +2 -2
  38. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_sequences.c +50 -24
  39. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_sequences.h +11 -4
  40. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_compress_superblock.c +572 -0
  41. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_compress_superblock.h +32 -0
  42. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_cwksp.h +662 -0
  43. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_double_fast.c +43 -41
  44. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_double_fast.h +2 -2
  45. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_fast.c +85 -80
  46. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_fast.h +2 -2
  47. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_lazy.c +2184 -0
  48. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_lazy.h +125 -0
  49. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_ldm.c +333 -208
  50. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_ldm.h +15 -3
  51. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_ldm_geartab.h +103 -0
  52. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_opt.c +228 -129
  53. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_opt.h +1 -1
  54. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstdmt_compress.c +151 -440
  55. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstdmt_compress.h +110 -0
  56. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/huf_decompress.c +395 -276
  57. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_ddict.c +20 -16
  58. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_ddict.h +3 -3
  59. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_decompress.c +628 -231
  60. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_decompress_block.c +606 -380
  61. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_decompress_block.h +8 -5
  62. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_decompress_internal.h +39 -9
  63. data/ext/zstdlib/zstd-1.5.0/lib/zdict.h +452 -0
  64. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/zstd.h +740 -153
  65. data/ext/zstdlib/{zstd-1.4.2/lib/common → zstd-1.5.0/lib}/zstd_errors.h +3 -1
  66. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzclose.c +1 -1
  67. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzcompatibility.h +1 -1
  68. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzguts.h +0 -0
  69. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzlib.c +9 -9
  70. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzread.c +16 -8
  71. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzwrite.c +8 -8
  72. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/zstd_zlibwrapper.c +131 -45
  73. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/zstd_zlibwrapper.h +1 -1
  74. data/lib/2.2/zstdlib.so +0 -0
  75. data/lib/2.3/zstdlib.so +0 -0
  76. data/lib/2.4/zstdlib.so +0 -0
  77. data/lib/2.5/zstdlib.so +0 -0
  78. data/lib/2.6/zstdlib.so +0 -0
  79. data/lib/2.7/zstdlib.so +0 -0
  80. metadata +76 -67
  81. data/ext/zstdlib/zstd-1.4.2/lib/common/compiler.h +0 -147
  82. data/ext/zstdlib/zstd-1.4.2/lib/common/debug.c +0 -44
  83. data/ext/zstdlib/zstd-1.4.2/lib/common/entropy_common.c +0 -236
  84. data/ext/zstdlib/zstd-1.4.2/lib/common/zstd_internal.h +0 -371
  85. data/ext/zstdlib/zstd-1.4.2/lib/compress/zstd_compress.c +0 -3904
  86. data/ext/zstdlib/zstd-1.4.2/lib/compress/zstd_lazy.c +0 -1111
  87. data/ext/zstdlib/zstd-1.4.2/lib/compress/zstd_lazy.h +0 -67
  88. data/ext/zstdlib/zstd-1.4.2/lib/compress/zstdmt_compress.h +0 -192
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,15 +14,15 @@
14
14
  /*-*******************************************************
15
15
  * Dependencies
16
16
  *********************************************************/
17
- #include <string.h> /* memcpy, memmove, memset */
18
- #include "compiler.h" /* prefetch */
19
- #include "cpu.h" /* bmi2 */
20
- #include "mem.h" /* low level memory routines */
17
+ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
18
+ #include "../common/compiler.h" /* prefetch */
19
+ #include "../common/cpu.h" /* bmi2 */
20
+ #include "../common/mem.h" /* low level memory routines */
21
21
  #define FSE_STATIC_LINKING_ONLY
22
- #include "fse.h"
22
+ #include "../common/fse.h"
23
23
  #define HUF_STATIC_LINKING_ONLY
24
- #include "huf.h"
25
- #include "zstd_internal.h"
24
+ #include "../common/huf.h"
25
+ #include "../common/zstd_internal.h"
26
26
  #include "zstd_decompress_internal.h" /* ZSTD_DCtx */
27
27
  #include "zstd_ddict.h" /* ZSTD_DDictDictContent */
28
28
  #include "zstd_decompress_block.h"
@@ -44,7 +44,7 @@
44
44
  /*_*******************************************************
45
45
  * Memory operations
46
46
  **********************************************************/
47
- static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
47
+ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }
48
48
 
49
49
 
50
50
  /*-*************************************************************
@@ -56,7 +56,7 @@ static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
56
56
  size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
57
57
  blockProperties_t* bpPtr)
58
58
  {
59
- RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong);
59
+ RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");
60
60
 
61
61
  { U32 const cBlockHeader = MEM_readLE24(src);
62
62
  U32 const cSize = cBlockHeader >> 3;
@@ -64,7 +64,7 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
64
64
  bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
65
65
  bpPtr->origSize = cSize; /* only useful for RLE */
66
66
  if (bpPtr->blockType == bt_rle) return 1;
67
- RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected);
67
+ RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
68
68
  return cSize;
69
69
  }
70
70
  }
@@ -79,7 +79,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
79
79
  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
80
80
  const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
81
81
  {
82
- RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected);
82
+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
83
+ RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");
83
84
 
84
85
  { const BYTE* const istart = (const BYTE*) src;
85
86
  symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
@@ -87,7 +88,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
87
88
  switch(litEncType)
88
89
  {
89
90
  case set_repeat:
90
- RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted);
91
+ DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
92
+ RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
91
93
  /* fall-through */
92
94
 
93
95
  case set_compressed:
@@ -116,11 +118,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
116
118
  /* 2 - 2 - 18 - 18 */
117
119
  lhSize = 5;
118
120
  litSize = (lhc >> 4) & 0x3FFFF;
119
- litCSize = (lhc >> 22) + (istart[4] << 10);
121
+ litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
120
122
  break;
121
123
  }
122
- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected);
123
- RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected);
124
+ RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
125
+ RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
124
126
 
125
127
  /* prefetch huffman table if cold */
126
128
  if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
@@ -158,13 +160,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
158
160
  }
159
161
  }
160
162
 
161
- RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected);
163
+ RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");
162
164
 
163
165
  dctx->litPtr = dctx->litBuffer;
164
166
  dctx->litSize = litSize;
165
167
  dctx->litEntropy = 1;
166
168
  if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
167
- memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
169
+ ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
168
170
  return litCSize + lhSize;
169
171
  }
170
172
 
@@ -188,11 +190,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
188
190
  }
189
191
 
190
192
  if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
191
- RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected);
192
- memcpy(dctx->litBuffer, istart+lhSize, litSize);
193
+ RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
194
+ ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize);
193
195
  dctx->litPtr = dctx->litBuffer;
194
196
  dctx->litSize = litSize;
195
- memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
197
+ ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
196
198
  return lhSize+litSize;
197
199
  }
198
200
  /* direct reference into compressed stream */
@@ -220,8 +222,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
220
222
  RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
221
223
  break;
222
224
  }
223
- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected);
224
- memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
225
+ RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
226
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
225
227
  dctx->litPtr = dctx->litBuffer;
226
228
  dctx->litSize = litSize;
227
229
  return lhSize+1;
@@ -234,7 +236,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
234
236
 
235
237
  /* Default FSE distribution tables.
236
238
  * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
237
- * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
239
+ * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
238
240
  * They were generated programmatically with following method :
239
241
  * - start from default distributions, present in /lib/common/zstd_internal.h
240
242
  * - generate tables normally, using ZSTD_buildFSETable()
@@ -362,23 +364,26 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
362
364
  * generate FSE decoding table for one symbol (ll, ml or off)
363
365
  * cannot fail if input is valid =>
364
366
  * all inputs are presumed validated at this stage */
365
- void
366
- ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
367
+ FORCE_INLINE_TEMPLATE
368
+ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
367
369
  const short* normalizedCounter, unsigned maxSymbolValue,
368
370
  const U32* baseValue, const U32* nbAdditionalBits,
369
- unsigned tableLog)
371
+ unsigned tableLog, void* wksp, size_t wkspSize)
370
372
  {
371
373
  ZSTD_seqSymbol* const tableDecode = dt+1;
372
- U16 symbolNext[MaxSeq+1];
373
-
374
374
  U32 const maxSV1 = maxSymbolValue + 1;
375
375
  U32 const tableSize = 1 << tableLog;
376
- U32 highThreshold = tableSize-1;
376
+
377
+ U16* symbolNext = (U16*)wksp;
378
+ BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
379
+ U32 highThreshold = tableSize - 1;
380
+
377
381
 
378
382
  /* Sanity Checks */
379
383
  assert(maxSymbolValue <= MaxSeq);
380
384
  assert(tableLog <= MaxFSELog);
381
-
385
+ assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
386
+ (void)wkspSize;
382
387
  /* Init, lay down lowprob symbols */
383
388
  { ZSTD_seqSymbol_header DTableH;
384
389
  DTableH.tableLog = tableLog;
@@ -391,18 +396,72 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
391
396
  symbolNext[s] = 1;
392
397
  } else {
393
398
  if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
394
- symbolNext[s] = normalizedCounter[s];
399
+ assert(normalizedCounter[s]>=0);
400
+ symbolNext[s] = (U16)normalizedCounter[s];
395
401
  } } }
396
- memcpy(dt, &DTableH, sizeof(DTableH));
402
+ ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
397
403
  }
398
404
 
399
405
  /* Spread symbols */
400
- { U32 const tableMask = tableSize-1;
406
+ assert(tableSize <= 512);
407
+ /* Specialized symbol spreading for the case when there are
408
+ * no low probability (-1 count) symbols. When compressing
409
+ * small blocks we avoid low probability symbols to hit this
410
+ * case, since header decoding speed matters more.
411
+ */
412
+ if (highThreshold == tableSize - 1) {
413
+ size_t const tableMask = tableSize-1;
414
+ size_t const step = FSE_TABLESTEP(tableSize);
415
+ /* First lay down the symbols in order.
416
+ * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
417
+ * misses since small blocks generally have small table logs, so nearly
418
+ * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
419
+ * our buffer to handle the over-write.
420
+ */
421
+ {
422
+ U64 const add = 0x0101010101010101ull;
423
+ size_t pos = 0;
424
+ U64 sv = 0;
425
+ U32 s;
426
+ for (s=0; s<maxSV1; ++s, sv += add) {
427
+ int i;
428
+ int const n = normalizedCounter[s];
429
+ MEM_write64(spread + pos, sv);
430
+ for (i = 8; i < n; i += 8) {
431
+ MEM_write64(spread + pos + i, sv);
432
+ }
433
+ pos += n;
434
+ }
435
+ }
436
+ /* Now we spread those positions across the table.
437
+ * The benefit of doing it in two stages is that we avoid the
438
+ * variable size inner loop, which caused lots of branch misses.
439
+ * Now we can run through all the positions without any branch misses.
440
+ * We unroll the loop twice, since that is what empirically worked best.
441
+ */
442
+ {
443
+ size_t position = 0;
444
+ size_t s;
445
+ size_t const unroll = 2;
446
+ assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
447
+ for (s = 0; s < (size_t)tableSize; s += unroll) {
448
+ size_t u;
449
+ for (u = 0; u < unroll; ++u) {
450
+ size_t const uPosition = (position + (u * step)) & tableMask;
451
+ tableDecode[uPosition].baseValue = spread[s + u];
452
+ }
453
+ position = (position + (unroll * step)) & tableMask;
454
+ }
455
+ assert(position == 0);
456
+ }
457
+ } else {
458
+ U32 const tableMask = tableSize-1;
401
459
  U32 const step = FSE_TABLESTEP(tableSize);
402
460
  U32 s, position = 0;
403
461
  for (s=0; s<maxSV1; s++) {
404
462
  int i;
405
- for (i=0; i<normalizedCounter[s]; i++) {
463
+ int const n = normalizedCounter[s];
464
+ for (i=0; i<n; i++) {
406
465
  tableDecode[position].baseValue = s;
407
466
  position = (position + step) & tableMask;
408
467
  while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
@@ -411,7 +470,8 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
411
470
  }
412
471
 
413
472
  /* Build Decoding table */
414
- { U32 u;
473
+ {
474
+ U32 u;
415
475
  for (u=0; u<tableSize; u++) {
416
476
  U32 const symbol = tableDecode[u].baseValue;
417
477
  U32 const nextState = symbolNext[symbol]++;
@@ -420,7 +480,46 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
420
480
  assert(nbAdditionalBits[symbol] < 255);
421
481
  tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
422
482
  tableDecode[u].baseValue = baseValue[symbol];
423
- } }
483
+ }
484
+ }
485
+ }
486
+
487
+ /* Avoids the FORCE_INLINE of the _body() function. */
488
+ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
489
+ const short* normalizedCounter, unsigned maxSymbolValue,
490
+ const U32* baseValue, const U32* nbAdditionalBits,
491
+ unsigned tableLog, void* wksp, size_t wkspSize)
492
+ {
493
+ ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
494
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
495
+ }
496
+
497
+ #if DYNAMIC_BMI2
498
+ TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
499
+ const short* normalizedCounter, unsigned maxSymbolValue,
500
+ const U32* baseValue, const U32* nbAdditionalBits,
501
+ unsigned tableLog, void* wksp, size_t wkspSize)
502
+ {
503
+ ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
504
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
505
+ }
506
+ #endif
507
+
508
+ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
509
+ const short* normalizedCounter, unsigned maxSymbolValue,
510
+ const U32* baseValue, const U32* nbAdditionalBits,
511
+ unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
512
+ {
513
+ #if DYNAMIC_BMI2
514
+ if (bmi2) {
515
+ ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
516
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
517
+ return;
518
+ }
519
+ #endif
520
+ (void)bmi2;
521
+ ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
522
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
424
523
  }
425
524
 
426
525
 
@@ -432,13 +531,14 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
432
531
  const void* src, size_t srcSize,
433
532
  const U32* baseValue, const U32* nbAdditionalBits,
434
533
  const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
435
- int ddictIsCold, int nbSeq)
534
+ int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
535
+ int bmi2)
436
536
  {
437
537
  switch(type)
438
538
  {
439
539
  case set_rle :
440
- RETURN_ERROR_IF(!srcSize, srcSize_wrong);
441
- RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected);
540
+ RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
541
+ RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
442
542
  { U32 const symbol = *(const BYTE*)src;
443
543
  U32 const baseline = baseValue[symbol];
444
544
  U32 const nbBits = nbAdditionalBits[symbol];
@@ -450,7 +550,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
450
550
  *DTablePtr = defaultTable;
451
551
  return 0;
452
552
  case set_repeat:
453
- RETURN_ERROR_IF(!flagRepeatTable, corruption_detected);
553
+ RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
454
554
  /* prefetch FSE table if used */
455
555
  if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
456
556
  const void* const pStart = *DTablePtr;
@@ -462,9 +562,9 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
462
562
  { unsigned tableLog;
463
563
  S16 norm[MaxSeq+1];
464
564
  size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
465
- RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected);
466
- RETURN_ERROR_IF(tableLog > maxLog, corruption_detected);
467
- ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
565
+ RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
566
+ RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
567
+ ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
468
568
  *DTablePtr = DTableSpace;
469
569
  return headerSize;
470
570
  }
@@ -477,35 +577,36 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
477
577
  size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
478
578
  const void* src, size_t srcSize)
479
579
  {
480
- const BYTE* const istart = (const BYTE* const)src;
580
+ const BYTE* const istart = (const BYTE*)src;
481
581
  const BYTE* const iend = istart + srcSize;
482
582
  const BYTE* ip = istart;
483
583
  int nbSeq;
484
584
  DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
485
585
 
486
586
  /* check */
487
- RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong);
587
+ RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");
488
588
 
489
589
  /* SeqHead */
490
590
  nbSeq = *ip++;
491
591
  if (!nbSeq) {
492
592
  *nbSeqPtr=0;
493
- RETURN_ERROR_IF(srcSize != 1, srcSize_wrong);
593
+ RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
494
594
  return 1;
495
595
  }
496
596
  if (nbSeq > 0x7F) {
497
597
  if (nbSeq == 0xFF) {
498
- RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong);
499
- nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
598
+ RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
599
+ nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
600
+ ip+=2;
500
601
  } else {
501
- RETURN_ERROR_IF(ip >= iend, srcSize_wrong);
602
+ RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
502
603
  nbSeq = ((nbSeq-0x80)<<8) + *ip++;
503
604
  }
504
605
  }
505
606
  *nbSeqPtr = nbSeq;
506
607
 
507
608
  /* FSE table descriptors */
508
- RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong); /* minimum possible size: 1 byte for symbol encoding types */
609
+ RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
509
610
  { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
510
611
  symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
511
612
  symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
@@ -517,8 +618,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
517
618
  ip, iend-ip,
518
619
  LL_base, LL_bits,
519
620
  LL_defaultDTable, dctx->fseEntropy,
520
- dctx->ddictIsCold, nbSeq);
521
- RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected);
621
+ dctx->ddictIsCold, nbSeq,
622
+ dctx->workspace, sizeof(dctx->workspace),
623
+ dctx->bmi2);
624
+ RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
522
625
  ip += llhSize;
523
626
  }
524
627
 
@@ -527,8 +630,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
527
630
  ip, iend-ip,
528
631
  OF_base, OF_bits,
529
632
  OF_defaultDTable, dctx->fseEntropy,
530
- dctx->ddictIsCold, nbSeq);
531
- RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected);
633
+ dctx->ddictIsCold, nbSeq,
634
+ dctx->workspace, sizeof(dctx->workspace),
635
+ dctx->bmi2);
636
+ RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
532
637
  ip += ofhSize;
533
638
  }
534
639
 
@@ -537,8 +642,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
537
642
  ip, iend-ip,
538
643
  ML_base, ML_bits,
539
644
  ML_defaultDTable, dctx->fseEntropy,
540
- dctx->ddictIsCold, nbSeq);
541
- RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected);
645
+ dctx->ddictIsCold, nbSeq,
646
+ dctx->workspace, sizeof(dctx->workspace),
647
+ dctx->bmi2);
648
+ RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
542
649
  ip += mlhSize;
543
650
  }
544
651
  }
@@ -551,7 +658,6 @@ typedef struct {
551
658
  size_t litLength;
552
659
  size_t matchLength;
553
660
  size_t offset;
554
- const BYTE* match;
555
661
  } seq_t;
556
662
 
557
663
  typedef struct {
@@ -565,59 +671,135 @@ typedef struct {
565
671
  ZSTD_fseState stateOffb;
566
672
  ZSTD_fseState stateML;
567
673
  size_t prevOffset[ZSTD_REP_NUM];
568
- const BYTE* prefixStart;
569
- const BYTE* dictEnd;
570
- size_t pos;
571
674
  } seqState_t;
572
675
 
676
+ /*! ZSTD_overlapCopy8() :
677
+ * Copies 8 bytes from ip to op and updates op and ip where ip <= op.
678
+ * If the offset is < 8 then the offset is spread to at least 8 bytes.
679
+ *
680
+ * Precondition: *ip <= *op
681
+ * Postcondition: *op - *ip >= 8
682
+ */
683
+ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
684
+ assert(*ip <= *op);
685
+ if (offset < 8) {
686
+ /* close range match, overlap */
687
+ static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */
688
+ static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */
689
+ int const sub2 = dec64table[offset];
690
+ (*op)[0] = (*ip)[0];
691
+ (*op)[1] = (*ip)[1];
692
+ (*op)[2] = (*ip)[2];
693
+ (*op)[3] = (*ip)[3];
694
+ *ip += dec32table[offset];
695
+ ZSTD_copy4(*op+4, *ip);
696
+ *ip -= sub2;
697
+ } else {
698
+ ZSTD_copy8(*op, *ip);
699
+ }
700
+ *ip += 8;
701
+ *op += 8;
702
+ assert(*op - *ip >= 8);
703
+ }
704
+
705
+ /*! ZSTD_safecopy() :
706
+ * Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
707
+ * and write up to 16 bytes past oend_w (op >= oend_w is allowed).
708
+ * This function is only called in the uncommon case where the sequence is near the end of the block. It
709
+ * should be fast for a single long sequence, but can be slow for several short sequences.
710
+ *
711
+ * @param ovtype controls the overlap detection
712
+ * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
713
+ * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
714
+ * The src buffer must be before the dst buffer.
715
+ */
716
+ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
717
+ ptrdiff_t const diff = op - ip;
718
+ BYTE* const oend = op + length;
719
+
720
+ assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||
721
+ (ovtype == ZSTD_overlap_src_before_dst && diff >= 0));
722
+
723
+ if (length < 8) {
724
+ /* Handle short lengths. */
725
+ while (op < oend) *op++ = *ip++;
726
+ return;
727
+ }
728
+ if (ovtype == ZSTD_overlap_src_before_dst) {
729
+ /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
730
+ assert(length >= 8);
731
+ ZSTD_overlapCopy8(&op, &ip, diff);
732
+ assert(op - ip >= 8);
733
+ assert(op <= oend);
734
+ }
573
735
 
574
- /* ZSTD_execSequenceLast7():
575
- * exceptional case : decompress a match starting within last 7 bytes of output buffer.
576
- * requires more careful checks, to ensure there is no overflow.
577
- * performance does not matter though.
578
- * note : this case is supposed to be never generated "naturally" by reference encoder,
579
- * since in most cases it needs at least 8 bytes to look for a match.
580
- * but it's allowed by the specification. */
736
+ if (oend <= oend_w) {
737
+ /* No risk of overwrite. */
738
+ ZSTD_wildcopy(op, ip, length, ovtype);
739
+ return;
740
+ }
741
+ if (op <= oend_w) {
742
+ /* Wildcopy until we get close to the end. */
743
+ assert(oend > oend_w);
744
+ ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
745
+ ip += oend_w - op;
746
+ op = oend_w;
747
+ }
748
+ /* Handle the leftovers. */
749
+ while (op < oend) *op++ = *ip++;
750
+ }
751
+
752
+ /* ZSTD_execSequenceEnd():
753
+ * This version handles cases that are near the end of the output buffer. It requires
754
+ * more careful checks to make sure there is no overflow. By separating out these hard
755
+ * and unlikely cases, we can speed up the common cases.
756
+ *
757
+ * NOTE: This function needs to be fast for a single long sequence, but doesn't need
758
+ * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
759
+ */
581
760
  FORCE_NOINLINE
582
- size_t ZSTD_execSequenceLast7(BYTE* op,
583
- BYTE* const oend, seq_t sequence,
584
- const BYTE** litPtr, const BYTE* const litLimit,
585
- const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
761
+ size_t ZSTD_execSequenceEnd(BYTE* op,
762
+ BYTE* const oend, seq_t sequence,
763
+ const BYTE** litPtr, const BYTE* const litLimit,
764
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
586
765
  {
587
766
  BYTE* const oLitEnd = op + sequence.litLength;
588
767
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
589
- BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
590
768
  const BYTE* const iLitEnd = *litPtr + sequence.litLength;
591
769
  const BYTE* match = oLitEnd - sequence.offset;
770
+ BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
592
771
 
593
- /* check */
594
- RETURN_ERROR_IF(oMatchEnd>oend, dstSize_tooSmall, "last match must fit within dstBuffer");
595
- RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "try to read beyond literal buffer");
772
+ /* bounds checks : careful of address space overflow in 32-bit mode */
773
+ RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
774
+ RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
775
+ assert(op < op + sequenceLength);
776
+ assert(oLitEnd < op + sequenceLength);
596
777
 
597
778
  /* copy literals */
598
- while (op < oLitEnd) *op++ = *(*litPtr)++;
779
+ ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
780
+ op = oLitEnd;
781
+ *litPtr = iLitEnd;
599
782
 
600
783
  /* copy Match */
601
- if (sequence.offset > (size_t)(oLitEnd - base)) {
784
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
602
785
  /* offset beyond prefix */
603
- RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - vBase),corruption_detected);
604
- match = dictEnd - (base-match);
786
+ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
787
+ match = dictEnd - (prefixStart-match);
605
788
  if (match + sequence.matchLength <= dictEnd) {
606
- memmove(oLitEnd, match, sequence.matchLength);
789
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
607
790
  return sequenceLength;
608
791
  }
609
792
  /* span extDict & currentPrefixSegment */
610
793
  { size_t const length1 = dictEnd - match;
611
- memmove(oLitEnd, match, length1);
794
+ ZSTD_memmove(oLitEnd, match, length1);
612
795
  op = oLitEnd + length1;
613
796
  sequence.matchLength -= length1;
614
- match = base;
797
+ match = prefixStart;
615
798
  } }
616
- while (op < oMatchEnd) *op++ = *match++;
799
+ ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
617
800
  return sequenceLength;
618
801
  }
619
802
 
620
-
621
803
  HINT_INLINE
622
804
  size_t ZSTD_execSequence(BYTE* op,
623
805
  BYTE* const oend, seq_t sequence,
@@ -627,155 +809,85 @@ size_t ZSTD_execSequence(BYTE* op,
627
809
  BYTE* const oLitEnd = op + sequence.litLength;
628
810
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
629
811
  BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
630
- BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
812
+ BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */
631
813
  const BYTE* const iLitEnd = *litPtr + sequence.litLength;
632
814
  const BYTE* match = oLitEnd - sequence.offset;
633
815
 
634
- /* check */
635
- RETURN_ERROR_IF(oMatchEnd>oend, dstSize_tooSmall, "last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend");
636
- RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "over-read beyond lit buffer");
637
- if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
638
-
639
- /* copy Literals */
640
- if (sequence.litLength > 8)
641
- ZSTD_wildcopy_16min(op, (*litPtr), sequence.litLength, ZSTD_no_overlap); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
642
- else
643
- ZSTD_copy8(op, *litPtr);
816
+ assert(op != NULL /* Precondition */);
817
+ assert(oend_w < oend /* No underflow */);
818
+ /* Handle edge cases in a slow path:
819
+ * - Read beyond end of literals
820
+ * - Match end is within WILDCOPY_OVERLENGTH of oend
821
+ * - 32-bit mode and the match length overflows
822
+ */
823
+ if (UNLIKELY(
824
+ iLitEnd > litLimit ||
825
+ oMatchEnd > oend_w ||
826
+ (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
827
+ return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
828
+
829
+ /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
830
+ assert(op <= oLitEnd /* No overflow */);
831
+ assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
832
+ assert(oMatchEnd <= oend /* No underflow */);
833
+ assert(iLitEnd <= litLimit /* Literal length is in bounds */);
834
+ assert(oLitEnd <= oend_w /* Can wildcopy literals */);
835
+ assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
836
+
837
+ /* Copy Literals:
838
+ * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
839
+ * We likely don't need the full 32-byte wildcopy.
840
+ */
841
+ assert(WILDCOPY_OVERLENGTH >= 16);
842
+ ZSTD_copy16(op, (*litPtr));
843
+ if (UNLIKELY(sequence.litLength > 16)) {
844
+ ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
845
+ }
644
846
  op = oLitEnd;
645
847
  *litPtr = iLitEnd; /* update for next sequence */
646
848
 
647
- /* copy Match */
849
+ /* Copy Match */
648
850
  if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
649
851
  /* offset beyond prefix -> go into extDict */
650
- RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected);
852
+ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
651
853
  match = dictEnd + (match - prefixStart);
652
854
  if (match + sequence.matchLength <= dictEnd) {
653
- memmove(oLitEnd, match, sequence.matchLength);
855
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
654
856
  return sequenceLength;
655
857
  }
656
858
  /* span extDict & currentPrefixSegment */
657
859
  { size_t const length1 = dictEnd - match;
658
- memmove(oLitEnd, match, length1);
860
+ ZSTD_memmove(oLitEnd, match, length1);
659
861
  op = oLitEnd + length1;
660
862
  sequence.matchLength -= length1;
661
863
  match = prefixStart;
662
- if (op > oend_w || sequence.matchLength < MINMATCH) {
663
- U32 i;
664
- for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i];
665
- return sequenceLength;
666
- }
667
864
  } }
668
- /* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */
669
-
670
- /* match within prefix */
671
- if (sequence.offset < 8) {
672
- /* close range match, overlap */
673
- static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */
674
- static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */
675
- int const sub2 = dec64table[sequence.offset];
676
- op[0] = match[0];
677
- op[1] = match[1];
678
- op[2] = match[2];
679
- op[3] = match[3];
680
- match += dec32table[sequence.offset];
681
- ZSTD_copy4(op+4, match);
682
- match -= sub2;
683
- } else {
684
- ZSTD_copy8(op, match);
685
- }
686
- op += 8; match += 8;
687
-
688
- if (oMatchEnd > oend-(16-MINMATCH)) {
689
- if (op < oend_w) {
690
- ZSTD_wildcopy(op, match, oend_w - op, ZSTD_overlap_src_before_dst);
691
- match += oend_w - op;
692
- op = oend_w;
693
- }
694
- while (op < oMatchEnd) *op++ = *match++;
695
- } else {
696
- ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); /* works even if matchLength < 8 */
865
+ /* Match within prefix of 1 or more bytes */
866
+ assert(op <= oMatchEnd);
867
+ assert(oMatchEnd <= oend_w);
868
+ assert(match >= prefixStart);
869
+ assert(sequence.matchLength >= 1);
870
+
871
+ /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
872
+ * without overlap checking.
873
+ */
874
+ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
875
+ /* We bet on a full wildcopy for matches, since we expect matches to be
876
+ * longer than literals (in general). In silesia, ~10% of matches are longer
877
+ * than 16 bytes.
878
+ */
879
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
880
+ return sequenceLength;
697
881
  }
698
- return sequenceLength;
699
- }
700
-
701
-
702
- HINT_INLINE
703
- size_t ZSTD_execSequenceLong(BYTE* op,
704
- BYTE* const oend, seq_t sequence,
705
- const BYTE** litPtr, const BYTE* const litLimit,
706
- const BYTE* const prefixStart, const BYTE* const dictStart, const BYTE* const dictEnd)
707
- {
708
- BYTE* const oLitEnd = op + sequence.litLength;
709
- size_t const sequenceLength = sequence.litLength + sequence.matchLength;
710
- BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
711
- BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
712
- const BYTE* const iLitEnd = *litPtr + sequence.litLength;
713
- const BYTE* match = sequence.match;
714
-
715
- /* check */
716
- RETURN_ERROR_IF(oMatchEnd > oend, dstSize_tooSmall, "last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend");
717
- RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "over-read beyond lit buffer");
718
- if (oLitEnd > oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, dictStart, dictEnd);
719
-
720
- /* copy Literals */
721
- if (sequence.litLength > 8)
722
- ZSTD_wildcopy_16min(op, *litPtr, sequence.litLength, ZSTD_no_overlap); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
723
- else
724
- ZSTD_copy8(op, *litPtr); /* note : op <= oLitEnd <= oend_w == oend - 8 */
725
-
726
- op = oLitEnd;
727
- *litPtr = iLitEnd; /* update for next sequence */
728
-
729
- /* copy Match */
730
- if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
731
- /* offset beyond prefix */
732
- RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - dictStart), corruption_detected);
733
- if (match + sequence.matchLength <= dictEnd) {
734
- memmove(oLitEnd, match, sequence.matchLength);
735
- return sequenceLength;
736
- }
737
- /* span extDict & currentPrefixSegment */
738
- { size_t const length1 = dictEnd - match;
739
- memmove(oLitEnd, match, length1);
740
- op = oLitEnd + length1;
741
- sequence.matchLength -= length1;
742
- match = prefixStart;
743
- if (op > oend_w || sequence.matchLength < MINMATCH) {
744
- U32 i;
745
- for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i];
746
- return sequenceLength;
747
- }
748
- } }
749
- assert(op <= oend_w);
750
- assert(sequence.matchLength >= MINMATCH);
882
+ assert(sequence.offset < WILDCOPY_VECLEN);
751
883
 
752
- /* match within prefix */
753
- if (sequence.offset < 8) {
754
- /* close range match, overlap */
755
- static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */
756
- static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */
757
- int const sub2 = dec64table[sequence.offset];
758
- op[0] = match[0];
759
- op[1] = match[1];
760
- op[2] = match[2];
761
- op[3] = match[3];
762
- match += dec32table[sequence.offset];
763
- ZSTD_copy4(op+4, match);
764
- match -= sub2;
765
- } else {
766
- ZSTD_copy8(op, match);
767
- }
768
- op += 8; match += 8;
884
+ /* Copy 8 bytes and spread the offset to be >= 8. */
885
+ ZSTD_overlapCopy8(&op, &match, sequence.offset);
769
886
 
770
- if (oMatchEnd > oend-(16-MINMATCH)) {
771
- if (op < oend_w) {
772
- ZSTD_wildcopy(op, match, oend_w - op, ZSTD_overlap_src_before_dst);
773
- match += oend_w - op;
774
- op = oend_w;
775
- }
776
- while (op < oMatchEnd) *op++ = *match++;
777
- } else {
778
- ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); /* works even if matchLength < 8 */
887
+ /* If the match length is > 8 bytes, then continue with the wildcopy. */
888
+ if (sequence.matchLength > 8) {
889
+ assert(op < oMatchEnd);
890
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);
779
891
  }
780
892
  return sequenceLength;
781
893
  }
@@ -801,6 +913,14 @@ ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
801
913
  DStatePtr->state = DInfo.nextState + lowBits;
802
914
  }
803
915
 
916
+ FORCE_INLINE_TEMPLATE void
917
+ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo)
918
+ {
919
+ U32 const nbBits = DInfo.nbBits;
920
+ size_t const lowBits = BIT_readBits(bitD, nbBits);
921
+ DStatePtr->state = DInfo.nextState + lowBits;
922
+ }
923
+
804
924
  /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
805
925
  * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
806
926
  * bits before reloading. This value is the maximum number of bytes we read
@@ -813,24 +933,24 @@ ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
813
933
 
814
934
  typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
815
935
 
816
- #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
817
936
  FORCE_INLINE_TEMPLATE seq_t
818
937
  ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
819
938
  {
820
939
  seq_t seq;
821
- U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
822
- U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
823
- U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
824
- U32 const totalBits = llBits+mlBits+ofBits;
825
- U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
826
- U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
827
- U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
940
+ ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
941
+ ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state];
942
+ ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state];
943
+ U32 const llBase = llDInfo.baseValue;
944
+ U32 const mlBase = mlDInfo.baseValue;
945
+ U32 const ofBase = ofDInfo.baseValue;
946
+ BYTE const llBits = llDInfo.nbAdditionalBits;
947
+ BYTE const mlBits = mlDInfo.nbAdditionalBits;
948
+ BYTE const ofBits = ofDInfo.nbAdditionalBits;
949
+ BYTE const totalBits = llBits+mlBits+ofBits;
828
950
 
829
951
  /* sequence */
830
952
  { size_t offset;
831
- if (!ofBits)
832
- offset = 0;
833
- else {
953
+ if (ofBits > 1) {
834
954
  ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
835
955
  ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
836
956
  assert(ofBits <= MaxOff);
@@ -844,63 +964,138 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
844
964
  offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
845
965
  if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
846
966
  }
847
- }
848
-
849
- if (ofBits <= 1) {
850
- offset += (llBase==0);
851
- if (offset) {
852
- size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
853
- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
854
- if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
855
- seqState->prevOffset[1] = seqState->prevOffset[0];
856
- seqState->prevOffset[0] = offset = temp;
857
- } else { /* offset == 0 */
858
- offset = seqState->prevOffset[0];
859
- }
860
- } else {
861
967
  seqState->prevOffset[2] = seqState->prevOffset[1];
862
968
  seqState->prevOffset[1] = seqState->prevOffset[0];
863
969
  seqState->prevOffset[0] = offset;
864
- }
970
+ } else {
971
+ U32 const ll0 = (llBase == 0);
972
+ if (LIKELY((ofBits == 0))) {
973
+ if (LIKELY(!ll0))
974
+ offset = seqState->prevOffset[0];
975
+ else {
976
+ offset = seqState->prevOffset[1];
977
+ seqState->prevOffset[1] = seqState->prevOffset[0];
978
+ seqState->prevOffset[0] = offset;
979
+ }
980
+ } else {
981
+ offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
982
+ { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
983
+ temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
984
+ if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
985
+ seqState->prevOffset[1] = seqState->prevOffset[0];
986
+ seqState->prevOffset[0] = offset = temp;
987
+ } } }
865
988
  seq.offset = offset;
866
989
  }
867
990
 
868
- seq.matchLength = mlBase
869
- + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0); /* <= 16 bits */
991
+ seq.matchLength = mlBase;
992
+ if (mlBits > 0)
993
+ seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
994
+
870
995
  if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
871
996
  BIT_reloadDStream(&seqState->DStream);
872
- if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
997
+ if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
873
998
  BIT_reloadDStream(&seqState->DStream);
874
999
  /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
875
1000
  ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
876
1001
 
877
- seq.litLength = llBase
878
- + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0); /* <= 16 bits */
1002
+ seq.litLength = llBase;
1003
+ if (llBits > 0)
1004
+ seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
1005
+
879
1006
  if (MEM_32bits())
880
1007
  BIT_reloadDStream(&seqState->DStream);
881
1008
 
882
1009
  DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
883
1010
  (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
884
1011
 
885
- /* ANS state update */
886
- ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
887
- ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
888
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
889
- ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
1012
+ /* ANS state update
1013
+ * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
1014
+ * clang-9.2.0 does 7% worse with ZSTD_updateFseState().
1015
+ * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the
1016
+ * better option, so it is the default for other compilers. But, if you
1017
+ * measure that it is worse, please put up a pull request.
1018
+ */
1019
+ {
1020
+ #if defined(__GNUC__) && !defined(__clang__)
1021
+ const int kUseUpdateFseState = 1;
1022
+ #else
1023
+ const int kUseUpdateFseState = 0;
1024
+ #endif
1025
+ if (kUseUpdateFseState) {
1026
+ ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
1027
+ ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
1028
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
1029
+ ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
1030
+ } else {
1031
+ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */
1032
+ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */
1033
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
1034
+ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */
1035
+ }
1036
+ }
890
1037
 
891
1038
  return seq;
892
1039
  }
893
1040
 
1041
+ #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1042
+ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
1043
+ {
1044
+ size_t const windowSize = dctx->fParams.windowSize;
1045
+ /* No dictionary used. */
1046
+ if (dctx->dictContentEndForFuzzing == NULL) return 0;
1047
+ /* Dictionary is our prefix. */
1048
+ if (prefixStart == dctx->dictContentBeginForFuzzing) return 1;
1049
+ /* Dictionary is not our ext-dict. */
1050
+ if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0;
1051
+ /* Dictionary is not within our window size. */
1052
+ if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0;
1053
+ /* Dictionary is active. */
1054
+ return 1;
1055
+ }
1056
+
1057
+ MEM_STATIC void ZSTD_assertValidSequence(
1058
+ ZSTD_DCtx const* dctx,
1059
+ BYTE const* op, BYTE const* oend,
1060
+ seq_t const seq,
1061
+ BYTE const* prefixStart, BYTE const* virtualStart)
1062
+ {
1063
+ #if DEBUGLEVEL >= 1
1064
+ size_t const windowSize = dctx->fParams.windowSize;
1065
+ size_t const sequenceSize = seq.litLength + seq.matchLength;
1066
+ BYTE const* const oLitEnd = op + seq.litLength;
1067
+ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
1068
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
1069
+ assert(op <= oend);
1070
+ assert((size_t)(oend - op) >= sequenceSize);
1071
+ assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
1072
+ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
1073
+ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
1074
+ /* Offset must be within the dictionary. */
1075
+ assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
1076
+ assert(seq.offset <= windowSize + dictSize);
1077
+ } else {
1078
+ /* Offset must be within our window. */
1079
+ assert(seq.offset <= windowSize);
1080
+ }
1081
+ #else
1082
+ (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
1083
+ #endif
1084
+ }
1085
+ #endif
1086
+
1087
+ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
894
1088
  FORCE_INLINE_TEMPLATE size_t
895
1089
  DONT_VECTORIZE
896
1090
  ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
897
1091
  void* dst, size_t maxDstSize,
898
1092
  const void* seqStart, size_t seqSize, int nbSeq,
899
- const ZSTD_longOffset_e isLongOffset)
1093
+ const ZSTD_longOffset_e isLongOffset,
1094
+ const int frame)
900
1095
  {
901
1096
  const BYTE* ip = (const BYTE*)seqStart;
902
1097
  const BYTE* const iend = ip + seqSize;
903
- BYTE* const ostart = (BYTE* const)dst;
1098
+ BYTE* const ostart = (BYTE*)dst;
904
1099
  BYTE* const oend = ostart + maxDstSize;
905
1100
  BYTE* op = ostart;
906
1101
  const BYTE* litPtr = dctx->litPtr;
@@ -909,6 +1104,7 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
909
1104
  const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
910
1105
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
911
1106
  DEBUGLOG(5, "ZSTD_decompressSequences_body");
1107
+ (void)frame;
912
1108
 
913
1109
  /* Regen sequences */
914
1110
  if (nbSeq) {
@@ -917,38 +1113,97 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
917
1113
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
918
1114
  RETURN_ERROR_IF(
919
1115
  ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
920
- corruption_detected);
1116
+ corruption_detected, "");
921
1117
  ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
922
1118
  ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
923
1119
  ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
1120
+ assert(dst != NULL);
924
1121
 
925
1122
  ZSTD_STATIC_ASSERT(
926
1123
  BIT_DStream_unfinished < BIT_DStream_completed &&
927
1124
  BIT_DStream_endOfBuffer < BIT_DStream_completed &&
928
1125
  BIT_DStream_completed < BIT_DStream_overflow);
929
1126
 
930
- for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) {
931
- nbSeq--;
932
- { seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
933
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
934
- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
935
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
936
- op += oneSeqSize;
937
- } }
1127
+ #if defined(__GNUC__) && defined(__x86_64__)
1128
+ /* Align the decompression loop to 32 + 16 bytes.
1129
+ *
1130
+ * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
1131
+ * speed swings based on the alignment of the decompression loop. This
1132
+ * performance swing is caused by parts of the decompression loop falling
1133
+ * out of the DSB. The entire decompression loop should fit in the DSB,
1134
+ * when it can't we get much worse performance. You can measure if you've
1135
+ * hit the good case or the bad case with this perf command for some
1136
+ * compressed file test.zst:
1137
+ *
1138
+ * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
1139
+ * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
1140
+ *
1141
+ * If you see most cycles served out of the MITE you've hit the bad case.
1142
+ * If you see most cycles served out of the DSB you've hit the good case.
1143
+ * If it is pretty even then you may be in an okay case.
1144
+ *
1145
+ * This issue has been reproduced on the following CPUs:
1146
+ * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1147
+ * Use Instruments->Counters to get DSB/MITE cycles.
1148
+ * I never got performance swings, but I was able to
1149
+ * go from the good case of mostly DSB to half of the
1150
+ * cycles served from MITE.
1151
+ * - Coffeelake: Intel i9-9900k
1152
+ * - Coffeelake: Intel i7-9700k
1153
+ *
1154
+ * I haven't been able to reproduce the instability or DSB misses on any
1155
+ * of the following CPUs:
1156
+ * - Haswell
1157
+ * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
1158
+ * - Skylake
1159
+ *
1160
+ * If you are seeing performance instability this script can help test.
1161
+ * It tests on 4 commits in zstd where I saw performance change.
1162
+ *
1163
+ * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
1164
+ */
1165
+ __asm__(".p2align 6");
1166
+ __asm__("nop");
1167
+ __asm__(".p2align 5");
1168
+ __asm__("nop");
1169
+ # if __GNUC__ >= 9
1170
+ /* better for gcc-9 and gcc-10, worse for clang and gcc-8 */
1171
+ __asm__(".p2align 3");
1172
+ # else
1173
+ __asm__(".p2align 4");
1174
+ # endif
1175
+ #endif
1176
+ for ( ; ; ) {
1177
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1178
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
1179
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1180
+ assert(!ZSTD_isError(oneSeqSize));
1181
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1182
+ #endif
1183
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1184
+ return oneSeqSize;
1185
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1186
+ op += oneSeqSize;
1187
+ if (UNLIKELY(!--nbSeq))
1188
+ break;
1189
+ BIT_reloadDStream(&(seqState.DStream));
1190
+ }
938
1191
 
939
1192
  /* check if reached exact end */
940
1193
  DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
941
- RETURN_ERROR_IF(nbSeq, corruption_detected);
942
- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected);
1194
+ RETURN_ERROR_IF(nbSeq, corruption_detected, "");
1195
+ RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
943
1196
  /* save reps for next block */
944
1197
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
945
1198
  }
946
1199
 
947
1200
  /* last literal segment */
948
1201
  { size_t const lastLLSize = litEnd - litPtr;
949
- RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall);
950
- memcpy(op, litPtr, lastLLSize);
951
- op += lastLLSize;
1202
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
1203
+ if (op != NULL) {
1204
+ ZSTD_memcpy(op, litPtr, lastLLSize);
1205
+ op += lastLLSize;
1206
+ }
952
1207
  }
953
1208
 
954
1209
  return op-ostart;
@@ -958,103 +1213,43 @@ static size_t
958
1213
  ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
959
1214
  void* dst, size_t maxDstSize,
960
1215
  const void* seqStart, size_t seqSize, int nbSeq,
961
- const ZSTD_longOffset_e isLongOffset)
1216
+ const ZSTD_longOffset_e isLongOffset,
1217
+ const int frame)
962
1218
  {
963
- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1219
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
964
1220
  }
965
1221
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
966
1222
 
967
-
968
-
969
1223
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
970
- FORCE_INLINE_TEMPLATE seq_t
971
- ZSTD_decodeSequenceLong(seqState_t* seqState, ZSTD_longOffset_e const longOffsets)
972
- {
973
- seq_t seq;
974
- U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
975
- U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
976
- U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
977
- U32 const totalBits = llBits+mlBits+ofBits;
978
- U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
979
- U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
980
- U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
981
-
982
- /* sequence */
983
- { size_t offset;
984
- if (!ofBits)
985
- offset = 0;
986
- else {
987
- ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
988
- ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
989
- assert(ofBits <= MaxOff);
990
- if (MEM_32bits() && longOffsets) {
991
- U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1);
992
- offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
993
- if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream);
994
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
995
- } else {
996
- offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
997
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
998
- }
999
- }
1000
1224
 
1001
- if (ofBits <= 1) {
1002
- offset += (llBase==0);
1003
- if (offset) {
1004
- size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
1005
- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
1006
- if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
1007
- seqState->prevOffset[1] = seqState->prevOffset[0];
1008
- seqState->prevOffset[0] = offset = temp;
1009
- } else {
1010
- offset = seqState->prevOffset[0];
1011
- }
1012
- } else {
1013
- seqState->prevOffset[2] = seqState->prevOffset[1];
1014
- seqState->prevOffset[1] = seqState->prevOffset[0];
1015
- seqState->prevOffset[0] = offset;
1016
- }
1017
- seq.offset = offset;
1018
- }
1019
-
1020
- seq.matchLength = mlBase + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */
1021
- if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
1022
- BIT_reloadDStream(&seqState->DStream);
1023
- if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
1024
- BIT_reloadDStream(&seqState->DStream);
1025
- /* Verify that there is enough bits to read the rest of the data in 64-bit mode. */
1026
- ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
1027
-
1028
- seq.litLength = llBase + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */
1029
- if (MEM_32bits())
1030
- BIT_reloadDStream(&seqState->DStream);
1031
-
1032
- { size_t const pos = seqState->pos + seq.litLength;
1033
- const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
1034
- seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
1035
- * No consequence though : no memory access will occur, overly large offset will be detected in ZSTD_execSequenceLong() */
1036
- seqState->pos = pos + seq.matchLength;
1225
+ FORCE_INLINE_TEMPLATE size_t
1226
+ ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
1227
+ const BYTE* const prefixStart, const BYTE* const dictEnd)
1228
+ {
1229
+ prefetchPos += sequence.litLength;
1230
+ { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
1231
+ const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
1232
+ * No consequence though : memory address is only used for prefetching, not for dereferencing */
1233
+ PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
1037
1234
  }
1038
-
1039
- /* ANS state update */
1040
- ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
1041
- ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
1042
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
1043
- ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
1044
-
1045
- return seq;
1235
+ return prefetchPos + sequence.matchLength;
1046
1236
  }
1047
1237
 
1238
+ /* This decoding function employs prefetching
1239
+ * to reduce latency impact of cache misses.
1240
+ * It's generally employed when block contains a significant portion of long-distance matches
1241
+ * or when coupled with a "cold" dictionary */
1048
1242
  FORCE_INLINE_TEMPLATE size_t
1049
1243
  ZSTD_decompressSequencesLong_body(
1050
1244
  ZSTD_DCtx* dctx,
1051
1245
  void* dst, size_t maxDstSize,
1052
1246
  const void* seqStart, size_t seqSize, int nbSeq,
1053
- const ZSTD_longOffset_e isLongOffset)
1247
+ const ZSTD_longOffset_e isLongOffset,
1248
+ const int frame)
1054
1249
  {
1055
1250
  const BYTE* ip = (const BYTE*)seqStart;
1056
1251
  const BYTE* const iend = ip + seqSize;
1057
- BYTE* const ostart = (BYTE* const)dst;
1252
+ BYTE* const ostart = (BYTE*)dst;
1058
1253
  BYTE* const oend = ostart + maxDstSize;
1059
1254
  BYTE* op = ostart;
1060
1255
  const BYTE* litPtr = dctx->litPtr;
@@ -1062,51 +1257,62 @@ ZSTD_decompressSequencesLong_body(
1062
1257
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
1063
1258
  const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
1064
1259
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
1260
+ (void)frame;
1065
1261
 
1066
1262
  /* Regen sequences */
1067
1263
  if (nbSeq) {
1068
- #define STORED_SEQS 4
1264
+ #define STORED_SEQS 8
1069
1265
  #define STORED_SEQS_MASK (STORED_SEQS-1)
1070
- #define ADVANCED_SEQS 4
1266
+ #define ADVANCED_SEQS STORED_SEQS
1071
1267
  seq_t sequences[STORED_SEQS];
1072
1268
  int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
1073
1269
  seqState_t seqState;
1074
1270
  int seqNb;
1271
+ size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */
1272
+
1075
1273
  dctx->fseEntropy = 1;
1076
1274
  { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
1077
- seqState.prefixStart = prefixStart;
1078
- seqState.pos = (size_t)(op-prefixStart);
1079
- seqState.dictEnd = dictEnd;
1275
+ assert(dst != NULL);
1080
1276
  assert(iend >= ip);
1081
1277
  RETURN_ERROR_IF(
1082
1278
  ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
1083
- corruption_detected);
1279
+ corruption_detected, "");
1084
1280
  ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
1085
1281
  ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
1086
1282
  ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
1087
1283
 
1088
1284
  /* prepare in advance */
1089
1285
  for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
1090
- sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
1091
- PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
1286
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1287
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
1288
+ sequences[seqNb] = sequence;
1092
1289
  }
1093
- RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected);
1290
+ RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
1094
1291
 
1095
1292
  /* decode and decompress */
1096
1293
  for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
1097
- seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
1098
- size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
1294
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1295
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
1296
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1297
+ assert(!ZSTD_isError(oneSeqSize));
1298
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
1299
+ #endif
1099
1300
  if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1100
- PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
1301
+
1302
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
1101
1303
  sequences[seqNb & STORED_SEQS_MASK] = sequence;
1102
1304
  op += oneSeqSize;
1103
1305
  }
1104
- RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected);
1306
+ RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");
1105
1307
 
1106
1308
  /* finish queue */
1107
1309
  seqNb -= seqAdvance;
1108
1310
  for ( ; seqNb<nbSeq ; seqNb++) {
1109
- size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
1311
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
1312
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1313
+ assert(!ZSTD_isError(oneSeqSize));
1314
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
1315
+ #endif
1110
1316
  if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1111
1317
  op += oneSeqSize;
1112
1318
  }
@@ -1117,9 +1323,11 @@ ZSTD_decompressSequencesLong_body(
1117
1323
 
1118
1324
  /* last literal segment */
1119
1325
  { size_t const lastLLSize = litEnd - litPtr;
1120
- RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall);
1121
- memcpy(op, litPtr, lastLLSize);
1122
- op += lastLLSize;
1326
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
1327
+ if (op != NULL) {
1328
+ ZSTD_memcpy(op, litPtr, lastLLSize);
1329
+ op += lastLLSize;
1330
+ }
1123
1331
  }
1124
1332
 
1125
1333
  return op-ostart;
@@ -1129,9 +1337,10 @@ static size_t
1129
1337
  ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
1130
1338
  void* dst, size_t maxDstSize,
1131
1339
  const void* seqStart, size_t seqSize, int nbSeq,
1132
- const ZSTD_longOffset_e isLongOffset)
1340
+ const ZSTD_longOffset_e isLongOffset,
1341
+ const int frame)
1133
1342
  {
1134
- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1343
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1135
1344
  }
1136
1345
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
1137
1346
 
@@ -1145,9 +1354,10 @@ DONT_VECTORIZE
1145
1354
  ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
1146
1355
  void* dst, size_t maxDstSize,
1147
1356
  const void* seqStart, size_t seqSize, int nbSeq,
1148
- const ZSTD_longOffset_e isLongOffset)
1357
+ const ZSTD_longOffset_e isLongOffset,
1358
+ const int frame)
1149
1359
  {
1150
- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1360
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1151
1361
  }
1152
1362
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1153
1363
 
@@ -1156,9 +1366,10 @@ static TARGET_ATTRIBUTE("bmi2") size_t
1156
1366
  ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
1157
1367
  void* dst, size_t maxDstSize,
1158
1368
  const void* seqStart, size_t seqSize, int nbSeq,
1159
- const ZSTD_longOffset_e isLongOffset)
1369
+ const ZSTD_longOffset_e isLongOffset,
1370
+ const int frame)
1160
1371
  {
1161
- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1372
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1162
1373
  }
1163
1374
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
1164
1375
 
@@ -1168,21 +1379,23 @@ typedef size_t (*ZSTD_decompressSequences_t)(
1168
1379
  ZSTD_DCtx* dctx,
1169
1380
  void* dst, size_t maxDstSize,
1170
1381
  const void* seqStart, size_t seqSize, int nbSeq,
1171
- const ZSTD_longOffset_e isLongOffset);
1382
+ const ZSTD_longOffset_e isLongOffset,
1383
+ const int frame);
1172
1384
 
1173
1385
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1174
1386
  static size_t
1175
1387
  ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
1176
1388
  const void* seqStart, size_t seqSize, int nbSeq,
1177
- const ZSTD_longOffset_e isLongOffset)
1389
+ const ZSTD_longOffset_e isLongOffset,
1390
+ const int frame)
1178
1391
  {
1179
1392
  DEBUGLOG(5, "ZSTD_decompressSequences");
1180
1393
  #if DYNAMIC_BMI2
1181
1394
  if (dctx->bmi2) {
1182
- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1395
+ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1183
1396
  }
1184
1397
  #endif
1185
- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1398
+ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1186
1399
  }
1187
1400
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1188
1401
 
@@ -1197,15 +1410,16 @@ static size_t
1197
1410
  ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
1198
1411
  void* dst, size_t maxDstSize,
1199
1412
  const void* seqStart, size_t seqSize, int nbSeq,
1200
- const ZSTD_longOffset_e isLongOffset)
1413
+ const ZSTD_longOffset_e isLongOffset,
1414
+ const int frame)
1201
1415
  {
1202
1416
  DEBUGLOG(5, "ZSTD_decompressSequencesLong");
1203
1417
  #if DYNAMIC_BMI2
1204
1418
  if (dctx->bmi2) {
1205
- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1419
+ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1206
1420
  }
1207
1421
  #endif
1208
- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1422
+ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1209
1423
  }
1210
1424
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
1211
1425
 
@@ -1239,7 +1453,6 @@ ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
1239
1453
  }
1240
1454
  #endif
1241
1455
 
1242
-
1243
1456
  size_t
1244
1457
  ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1245
1458
  void* dst, size_t dstCapacity,
@@ -1255,7 +1468,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1255
1468
  ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
1256
1469
  DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
1257
1470
 
1258
- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong);
1471
+ RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
1259
1472
 
1260
1473
  /* Decode literals section */
1261
1474
  { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
@@ -1281,6 +1494,8 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1281
1494
  ip += seqHSize;
1282
1495
  srcSize -= seqHSize;
1283
1496
 
1497
+ RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
1498
+
1284
1499
  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
1285
1500
  !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
1286
1501
  if ( !usePrefetchDecoder
@@ -1299,23 +1514,34 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1299
1514
  if (usePrefetchDecoder)
1300
1515
  #endif
1301
1516
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
1302
- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
1517
+ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
1303
1518
  #endif
1304
1519
 
1305
1520
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1306
1521
  /* else */
1307
- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
1522
+ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
1308
1523
  #endif
1309
1524
  }
1310
1525
  }
1311
1526
 
1312
1527
 
1528
+ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
1529
+ {
1530
+ if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */
1531
+ dctx->dictEnd = dctx->previousDstEnd;
1532
+ dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
1533
+ dctx->prefixStart = dst;
1534
+ dctx->previousDstEnd = dst;
1535
+ }
1536
+ }
1537
+
1538
+
1313
1539
  size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
1314
1540
  void* dst, size_t dstCapacity,
1315
1541
  const void* src, size_t srcSize)
1316
1542
  {
1317
1543
  size_t dSize;
1318
- ZSTD_checkContinuity(dctx, dst);
1544
+ ZSTD_checkContinuity(dctx, dst, dstCapacity);
1319
1545
  dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
1320
1546
  dctx->previousDstEnd = (char*)dst + dSize;
1321
1547
  return dSize;