zstd-ruby 1.4.4.0 → 1.5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. checksums.yaml +4 -4
  2. data/.github/dependabot.yml +8 -0
  3. data/.github/workflows/ruby.yml +35 -0
  4. data/README.md +2 -2
  5. data/ext/zstdruby/extconf.rb +1 -0
  6. data/ext/zstdruby/libzstd/BUCK +5 -7
  7. data/ext/zstdruby/libzstd/Makefile +241 -173
  8. data/ext/zstdruby/libzstd/README.md +76 -18
  9. data/ext/zstdruby/libzstd/common/bitstream.h +75 -57
  10. data/ext/zstdruby/libzstd/common/compiler.h +196 -20
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  12. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  13. data/ext/zstdruby/libzstd/common/debug.h +22 -49
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +208 -76
  15. data/ext/zstdruby/libzstd/common/error_private.c +3 -1
  16. data/ext/zstdruby/libzstd/common/error_private.h +87 -4
  17. data/ext/zstdruby/libzstd/common/fse.h +51 -42
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +149 -57
  19. data/ext/zstdruby/libzstd/common/huf.h +60 -54
  20. data/ext/zstdruby/libzstd/common/mem.h +87 -98
  21. data/ext/zstdruby/libzstd/common/pool.c +23 -17
  22. data/ext/zstdruby/libzstd/common/pool.h +3 -3
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
  24. data/ext/zstdruby/libzstd/common/threading.c +10 -8
  25. data/ext/zstdruby/libzstd/common/threading.h +4 -3
  26. data/ext/zstdruby/libzstd/common/xxhash.c +15 -873
  27. data/ext/zstdruby/libzstd/common/xxhash.h +5572 -191
  28. data/ext/zstdruby/libzstd/common/zstd_common.c +10 -10
  29. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  30. data/ext/zstdruby/libzstd/common/zstd_internal.h +252 -108
  31. data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
  32. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  33. data/ext/zstdruby/libzstd/compress/fse_compress.c +105 -85
  34. data/ext/zstdruby/libzstd/compress/hist.c +41 -63
  35. data/ext/zstdruby/libzstd/compress/hist.h +13 -33
  36. data/ext/zstdruby/libzstd/compress/huf_compress.c +831 -259
  37. data/ext/zstdruby/libzstd/compress/zstd_compress.c +3213 -1007
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +493 -71
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +21 -16
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +4 -2
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +51 -24
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +10 -3
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +573 -0
  44. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  45. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +208 -81
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +315 -137
  47. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.c +319 -128
  49. data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1156 -171
  51. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +59 -1
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +331 -206
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
  54. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.c +403 -226
  56. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +188 -453
  58. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +32 -114
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1065 -410
  60. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +20 -16
  62. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +691 -230
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1072 -323
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +16 -7
  66. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +71 -10
  67. data/ext/zstdruby/libzstd/deprecated/zbuff.h +3 -3
  68. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
  69. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +24 -4
  70. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  71. data/ext/zstdruby/libzstd/dictBuilder/cover.c +57 -40
  72. data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
  73. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  74. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +54 -35
  75. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +151 -57
  76. data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
  77. data/ext/zstdruby/libzstd/dll/example/README.md +16 -22
  78. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +4 -4
  79. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +25 -19
  80. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +1 -1
  81. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +18 -14
  82. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +1 -1
  83. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +18 -14
  84. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +1 -1
  85. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +22 -16
  86. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +1 -1
  87. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +29 -25
  88. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +2 -2
  89. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +29 -25
  90. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +1 -1
  91. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +34 -26
  92. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +1 -1
  93. data/ext/zstdruby/libzstd/libzstd.mk +185 -0
  94. data/ext/zstdruby/libzstd/libzstd.pc.in +4 -3
  95. data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
  96. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +201 -31
  97. data/ext/zstdruby/libzstd/zstd.h +760 -234
  98. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +3 -1
  99. data/ext/zstdruby/zstdruby.c +2 -2
  100. data/lib/zstd-ruby/version.rb +1 -1
  101. metadata +20 -9
  102. data/.travis.yml +0 -14
data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1072 -323
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,15 +14,15 @@
  /*-*******************************************************
  * Dependencies
  *********************************************************/
- #include <string.h> /* memcpy, memmove, memset */
- #include "compiler.h" /* prefetch */
- #include "cpu.h" /* bmi2 */
- #include "mem.h" /* low level memory routines */
+ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
+ #include "../common/compiler.h" /* prefetch */
+ #include "../common/cpu.h" /* bmi2 */
+ #include "../common/mem.h" /* low level memory routines */
  #define FSE_STATIC_LINKING_ONLY
- #include "fse.h"
+ #include "../common/fse.h"
  #define HUF_STATIC_LINKING_ONLY
- #include "huf.h"
- #include "zstd_internal.h"
+ #include "../common/huf.h"
+ #include "../common/zstd_internal.h"
  #include "zstd_decompress_internal.h" /* ZSTD_DCtx */
  #include "zstd_ddict.h" /* ZSTD_DDictDictContent */
  #include "zstd_decompress_block.h"
@@ -44,7 +44,7 @@
  /*_*******************************************************
  * Memory operations
  **********************************************************/
- static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }


  /*-*************************************************************
@@ -56,7 +56,7 @@ static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
  size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
  blockProperties_t* bpPtr)
  {
- RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong);
+ RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");

  { U32 const cBlockHeader = MEM_readLE24(src);
  U32 const cSize = cBlockHeader >> 3;
@@ -64,23 +64,64 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
  bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
  bpPtr->origSize = cSize; /* only useful for RLE */
  if (bpPtr->blockType == bt_rle) return 1;
- RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected);
+ RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
  return cSize;
  }
  }

+ /* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
+ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
+ const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
+ {
+ if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH)
+ {
+ /* room for litbuffer to fit without read faulting */
+ dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;
+ dctx->litBufferEnd = dctx->litBuffer + litSize;
+ dctx->litBufferLocation = ZSTD_in_dst;
+ }
+ else if (litSize > ZSTD_LITBUFFEREXTRASIZE)
+ {
+ /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
+ if (splitImmediately) {
+ /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+ dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
+ }
+ else {
+ /* initially this will be stored entirely in dst during huffman decoding, it will be partially shifted to litExtraBuffer after */
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
+ dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
+ }
+ dctx->litBufferLocation = ZSTD_split;
+ }
+ else
+ {
+ /* fits entirely within litExtraBuffer, so no split is necessary */
+ dctx->litBuffer = dctx->litExtraBuffer;
+ dctx->litBufferEnd = dctx->litBuffer + litSize;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ }
+ }

  /* Hidden declaration for fullbench */
  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
- const void* src, size_t srcSize);
+ const void* src, size_t srcSize,
+ void* dst, size_t dstCapacity, const streaming_operation streaming);
  /*! ZSTD_decodeLiteralsBlock() :
+ * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
+ * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current
+ * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
+ * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
+ *
  * @return : nb of bytes read from src (< srcSize )
  * note : symbol not declared but exposed for fullbench */
  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
- const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
+ const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */
+ void* dst, size_t dstCapacity, const streaming_operation streaming)
  {
  DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
- RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected);
+ RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");

  { const BYTE* const istart = (const BYTE*) src;
  symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
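
The placement logic added above boils down to a three-way decision. A minimal standalone sketch of just that decision (the constants and the enum are illustrative stand-ins for ZSTD_BLOCKSIZE_MAX, WILDCOPY_OVERLENGTH and ZSTD_LITBUFFEREXTRASIZE; the real function also records the litBuffer/litBufferEnd pointers in the DCtx):

    #include <stddef.h>

    #define BLOCKSIZE_MAX  (128 * 1024)   /* illustrative stand-in */
    #define OVERLENGTH     32             /* illustrative stand-in */
    #define LITBUF_EXTRA   (64 * 1024)    /* illustrative stand-in */

    typedef enum { LIT_IN_DST, LIT_SPLIT, LIT_NOT_IN_DST } lit_location;

    static lit_location choose_lit_location(int streaming, size_t dstCapacity, size_t litSize)
    {
        /* enough excess dst space after the decoded block: keep literals in dst */
        if (!streaming && dstCapacity > BLOCKSIZE_MAX + OVERLENGTH + litSize + OVERLENGTH)
            return LIT_IN_DST;
        /* too large for the fixed extra buffer: split between dst tail and extra buffer */
        if (litSize > LITBUF_EXTRA)
            return LIT_SPLIT;
        /* small enough to live entirely in dctx->litExtraBuffer */
        return LIT_NOT_IN_DST;
    }
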
@@ -89,8 +130,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  {
  case set_repeat:
  DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
- RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted);
- /* fall-through */
+ RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
+ ZSTD_FALLTHROUGH;

  case set_compressed:
  RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
@@ -99,6 +140,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  U32 const lhlCode = (istart[0] >> 2) & 3;
  U32 const lhc = MEM_readLE32(istart);
  size_t hufSuccess;
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
  switch(lhlCode)
  {
  case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -121,8 +163,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
  break;
  }
- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected);
- RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected);
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+ RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+ RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
+ RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);

  /* prefetch huffman table if cold */
  if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
@@ -133,11 +178,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  if (singleStream) {
  hufSuccess = HUF_decompress1X_usingDTable_bmi2(
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
- dctx->HUFptr, dctx->bmi2);
+ dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
  } else {
  hufSuccess = HUF_decompress4X_usingDTable_bmi2(
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
- dctx->HUFptr, dctx->bmi2);
+ dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
  }
  } else {
  if (singleStream) {
@@ -150,29 +195,36 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
  istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace), dctx->bmi2);
+ sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
  #endif
  } else {
  hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
  istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace), dctx->bmi2);
+ sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
  }
  }
+ if (dctx->litBufferLocation == ZSTD_split)
+ {
+ ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
+ ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
+ dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+ dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
+ }

- RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected);
+ RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");

  dctx->litPtr = dctx->litBuffer;
  dctx->litSize = litSize;
  dctx->litEntropy = 1;
  if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
- memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
  return litCSize + lhSize;
  }

  case set_basic:
  { size_t litSize, lhSize;
  U32 const lhlCode = ((istart[0]) >> 2) & 3;
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
  switch(lhlCode)
  {
  case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -189,23 +241,36 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  break;
  }

+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+ RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
  if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
- RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected);
- memcpy(dctx->litBuffer, istart+lhSize, litSize);
+ RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
+ if (dctx->litBufferLocation == ZSTD_split)
+ {
+ ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
+ ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
+ }
+ else
+ {
+ ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
+ }
  dctx->litPtr = dctx->litBuffer;
  dctx->litSize = litSize;
- memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
  return lhSize+litSize;
  }
  /* direct reference into compressed stream */
  dctx->litPtr = istart+lhSize;
  dctx->litSize = litSize;
+ dctx->litBufferEnd = dctx->litPtr + litSize;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
  return lhSize+litSize;
  }

  case set_rle:
  { U32 const lhlCode = ((istart[0]) >> 2) & 3;
  size_t litSize, lhSize;
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
  switch(lhlCode)
  {
  case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -222,8 +287,19 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
  break;
  }
- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected);
- memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+ RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+ RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
+ if (dctx->litBufferLocation == ZSTD_split)
+ {
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
+ ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
+ }
+ else
+ {
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
+ }
  dctx->litPtr = dctx->litBuffer;
  dctx->litSize = litSize;
  return lhSize+1;
@@ -236,7 +312,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,

  /* Default FSE distribution tables.
  * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
- * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
+ * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
  * They were generated programmatically with following method :
  * - start from default distributions, present in /lib/common/zstd_internal.h
  * - generate tables normally, using ZSTD_buildFSETable()
@@ -343,7 +419,7 @@ static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
  }; /* ML_defaultDTable */


- static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
+ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
  {
  void* ptr = dt;
  ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
@@ -355,7 +431,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
  cell->nbBits = 0;
  cell->nextState = 0;
  assert(nbAddBits < 255);
- cell->nbAdditionalBits = (BYTE)nbAddBits;
+ cell->nbAdditionalBits = nbAddBits;
  cell->baseValue = baseValue;
  }

@@ -364,23 +440,26 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
  * generate FSE decoding table for one symbol (ll, ml or off)
  * cannot fail if input is valid =>
  * all inputs are presumed validated at this stage */
- void
- ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+ FORCE_INLINE_TEMPLATE
+ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
  const short* normalizedCounter, unsigned maxSymbolValue,
- const U32* baseValue, const U32* nbAdditionalBits,
- unsigned tableLog)
+ const U32* baseValue, const U8* nbAdditionalBits,
+ unsigned tableLog, void* wksp, size_t wkspSize)
  {
  ZSTD_seqSymbol* const tableDecode = dt+1;
- U16 symbolNext[MaxSeq+1];
-
  U32 const maxSV1 = maxSymbolValue + 1;
  U32 const tableSize = 1 << tableLog;
- U32 highThreshold = tableSize-1;
+
+ U16* symbolNext = (U16*)wksp;
+ BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
+ U32 highThreshold = tableSize - 1;
+

  /* Sanity Checks */
  assert(maxSymbolValue <= MaxSeq);
  assert(tableLog <= MaxFSELog);
-
+ assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
+ (void)wkspSize;
  /* Init, lay down lowprob symbols */
  { ZSTD_seqSymbol_header DTableH;
  DTableH.tableLog = tableLog;
@@ -396,16 +475,69 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
  assert(normalizedCounter[s]>=0);
  symbolNext[s] = (U16)normalizedCounter[s];
  } } }
- memcpy(dt, &DTableH, sizeof(DTableH));
+ ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
  }

  /* Spread symbols */
- { U32 const tableMask = tableSize-1;
+ assert(tableSize <= 512);
+ /* Specialized symbol spreading for the case when there are
+ * no low probability (-1 count) symbols. When compressing
+ * small blocks we avoid low probability symbols to hit this
+ * case, since header decoding speed matters more.
+ */
+ if (highThreshold == tableSize - 1) {
+ size_t const tableMask = tableSize-1;
+ size_t const step = FSE_TABLESTEP(tableSize);
+ /* First lay down the symbols in order.
+ * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
+ * misses since small blocks generally have small table logs, so nearly
+ * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
+ * our buffer to handle the over-write.
+ */
+ {
+ U64 const add = 0x0101010101010101ull;
+ size_t pos = 0;
+ U64 sv = 0;
+ U32 s;
+ for (s=0; s<maxSV1; ++s, sv += add) {
+ int i;
+ int const n = normalizedCounter[s];
+ MEM_write64(spread + pos, sv);
+ for (i = 8; i < n; i += 8) {
+ MEM_write64(spread + pos + i, sv);
+ }
+ pos += n;
+ }
+ }
+ /* Now we spread those positions across the table.
+ * The benefit of doing it in two stages is that we avoid the
+ * variable size inner loop, which caused lots of branch misses.
+ * Now we can run through all the positions without any branch misses.
+ * We unroll the loop twice, since that is what empirically worked best.
+ */
+ {
+ size_t position = 0;
+ size_t s;
+ size_t const unroll = 2;
+ assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+ for (s = 0; s < (size_t)tableSize; s += unroll) {
+ size_t u;
+ for (u = 0; u < unroll; ++u) {
+ size_t const uPosition = (position + (u * step)) & tableMask;
+ tableDecode[uPosition].baseValue = spread[s + u];
+ }
+ position = (position + (unroll * step)) & tableMask;
+ }
+ assert(position == 0);
+ }
+ } else {
+ U32 const tableMask = tableSize-1;
  U32 const step = FSE_TABLESTEP(tableSize);
  U32 s, position = 0;
  for (s=0; s<maxSV1; s++) {
  int i;
- for (i=0; i<normalizedCounter[s]; i++) {
+ int const n = normalizedCounter[s];
+ for (i=0; i<n; i++) {
  tableDecode[position].baseValue = s;
  position = (position + step) & tableMask;
  while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
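
The fast spreading path above lays each symbol's run down eight bytes at a time before distributing positions. A self-contained sketch of that first stage, assuming the caller's spread buffer has at least 8 bytes of slack past the table size to absorb the over-write:

    #include <stdint.h>
    #include <string.h>

    /* sv keeps the symbol value replicated in all 8 bytes (add bumps every
     * byte by 1 per symbol), so one 64-bit store covers counts <= 8 and the
     * result is endianness-independent. On this path all counts are >= 0,
     * because the low-probability (-1) case takes the slow branch instead. */
    static void lay_down_symbols(uint8_t* spread, const short* normalizedCounter, unsigned maxSV1)
    {
        uint64_t const add = 0x0101010101010101ULL;
        uint64_t sv = 0;
        size_t pos = 0;
        unsigned s;
        for (s = 0; s < maxSV1; ++s, sv += add) {
            int i;
            int const n = normalizedCounter[s];
            memcpy(spread + pos, &sv, 8);          /* plain-C stand-in for MEM_write64 */
            for (i = 8; i < n; i += 8)
                memcpy(spread + pos + i, &sv, 8);
            pos += (size_t)n;                      /* overshoot lands in the slack */
        }
    }
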
@@ -414,16 +546,56 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
  }

  /* Build Decoding table */
- { U32 u;
+ {
+ U32 u;
  for (u=0; u<tableSize; u++) {
  U32 const symbol = tableDecode[u].baseValue;
  U32 const nextState = symbolNext[symbol]++;
  tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
  tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
  assert(nbAdditionalBits[symbol] < 255);
- tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
+ tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
  tableDecode[u].baseValue = baseValue[symbol];
- } }
+ }
+ }
+ }
+
+ /* Avoids the FORCE_INLINE of the _body() function. */
+ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U8* nbAdditionalBits,
+ unsigned tableLog, void* wksp, size_t wkspSize)
+ {
+ ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+ }
+
+ #if DYNAMIC_BMI2
+ BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U8* nbAdditionalBits,
+ unsigned tableLog, void* wksp, size_t wkspSize)
+ {
+ ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+ }
+ #endif
+
+ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U8* nbAdditionalBits,
+ unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
+ {
+ #if DYNAMIC_BMI2
+ if (bmi2) {
+ ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+ return;
+ }
+ #endif
+ (void)bmi2;
+ ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
  }

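The _body/_default/_bmi2 trio above is the library's runtime-dispatch pattern: one force-inlined body and two thin wrappers, one of them built with the BMI2 target attribute, selected by a flag derived from a CPUID check. A minimal GCC/Clang-style sketch of the same pattern (names and the stand-in body are illustrative, not the library's):

    #if defined(__GNUC__) && defined(__x86_64__)
    # define ATTR_BMI2 __attribute__((target("bmi2")))
    #else
    # define ATTR_BMI2
    #endif

    /* The shared body is inlined into both wrappers, so the bmi2 wrapper's
     * target attribute lets the compiler emit BMI2 instructions throughout. */
    static inline unsigned body(unsigned x) { return x >> 3; }  /* stand-in for real work */

    static unsigned body_default(unsigned x) { return body(x); }
    ATTR_BMI2 static unsigned body_bmi2(unsigned x) { return body(x); }

    unsigned dispatch(unsigned x, int bmi2)   /* bmi2 flag from a CPUID probe */
    {
        return bmi2 ? body_bmi2(x) : body_default(x);
    }
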
@@ -433,18 +605,19 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
  static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
  symbolEncodingType_e type, unsigned max, U32 maxLog,
  const void* src, size_t srcSize,
- const U32* baseValue, const U32* nbAdditionalBits,
+ const U32* baseValue, const U8* nbAdditionalBits,
  const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
- int ddictIsCold, int nbSeq)
+ int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
+ int bmi2)
  {
  switch(type)
  {
  case set_rle :
- RETURN_ERROR_IF(!srcSize, srcSize_wrong);
- RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected);
+ RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
+ RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
  { U32 const symbol = *(const BYTE*)src;
  U32 const baseline = baseValue[symbol];
- U32 const nbBits = nbAdditionalBits[symbol];
+ U8 const nbBits = nbAdditionalBits[symbol];
  ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
  }
  *DTablePtr = DTableSpace;
@@ -453,7 +626,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymb
  *DTablePtr = defaultTable;
  return 0;
  case set_repeat:
- RETURN_ERROR_IF(!flagRepeatTable, corruption_detected);
+ RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
  /* prefetch FSE table if used */
  if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
  const void* const pStart = *DTablePtr;
@@ -465,9 +638,9 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymb
  { unsigned tableLog;
  S16 norm[MaxSeq+1];
  size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
- RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected);
- RETURN_ERROR_IF(tableLog > maxLog, corruption_detected);
- ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
+ RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
+ RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
+ ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
  *DTablePtr = DTableSpace;
  return headerSize;
  }
@@ -480,35 +653,36 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymb
  size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
  const void* src, size_t srcSize)
  {
- const BYTE* const istart = (const BYTE* const)src;
+ const BYTE* const istart = (const BYTE*)src;
  const BYTE* const iend = istart + srcSize;
  const BYTE* ip = istart;
  int nbSeq;
  DEBUGLOG(5, "ZSTD_decodeSeqHeaders");

  /* check */
- RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong);
+ RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");

  /* SeqHead */
  nbSeq = *ip++;
  if (!nbSeq) {
  *nbSeqPtr=0;
- RETURN_ERROR_IF(srcSize != 1, srcSize_wrong);
+ RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
  return 1;
  }
  if (nbSeq > 0x7F) {
  if (nbSeq == 0xFF) {
- RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong);
- nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+ RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
+ nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
+ ip+=2;
  } else {
- RETURN_ERROR_IF(ip >= iend, srcSize_wrong);
+ RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
  nbSeq = ((nbSeq-0x80)<<8) + *ip++;
  }
  }
  *nbSeqPtr = nbSeq;

  /* FSE table descriptors */
- RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong); /* minimum possible size: 1 byte for symbol encoding types */
+ RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
  { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
  symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
  symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
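
ZSTD_decodeSeqHeaders() above reads a 1-, 2-, or 3-byte sequence count. A standalone sketch of that header decoding (LONGNBSEQ is 0x7F00 in libzstd; the bounds checks mirror the RETURN_ERROR_IF guards):

    #include <stdint.h>

    #define LONGNBSEQ 0x7F00

    /* Returns nbSeq and advances *ipp, or -1 on truncated input.
     *   0x00..0x7F : the byte itself
     *   0x80..0xFE : ((byte0 - 0x80) << 8) + byte1
     *   0xFF       : little-endian 16 bits + LONGNBSEQ            */
    static int read_nb_seq(const uint8_t** ipp, const uint8_t* iend)
    {
        const uint8_t* ip = *ipp;
        int nbSeq;
        if (ip >= iend) return -1;
        nbSeq = *ip++;
        if (nbSeq > 0x7F) {
            if (nbSeq == 0xFF) {
                if (ip + 2 > iend) return -1;
                nbSeq = (ip[0] | (ip[1] << 8)) + LONGNBSEQ;   /* MEM_readLE16 */
                ip += 2;
            } else {
                if (ip >= iend) return -1;
                nbSeq = ((nbSeq - 0x80) << 8) + *ip++;
            }
        }
        *ipp = ip;
        return nbSeq;
    }
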
@@ -520,8 +694,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
  ip, iend-ip,
  LL_base, LL_bits,
  LL_defaultDTable, dctx->fseEntropy,
- dctx->ddictIsCold, nbSeq);
- RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected);
+ dctx->ddictIsCold, nbSeq,
+ dctx->workspace, sizeof(dctx->workspace),
+ ZSTD_DCtx_get_bmi2(dctx));
+ RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
  ip += llhSize;
  }

@@ -530,8 +706,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
  ip, iend-ip,
  OF_base, OF_bits,
  OF_defaultDTable, dctx->fseEntropy,
- dctx->ddictIsCold, nbSeq);
- RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected);
+ dctx->ddictIsCold, nbSeq,
+ dctx->workspace, sizeof(dctx->workspace),
+ ZSTD_DCtx_get_bmi2(dctx));
+ RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
  ip += ofhSize;
  }

@@ -540,8 +718,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
  ip, iend-ip,
  ML_base, ML_bits,
  ML_defaultDTable, dctx->fseEntropy,
- dctx->ddictIsCold, nbSeq);
- RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected);
+ dctx->ddictIsCold, nbSeq,
+ dctx->workspace, sizeof(dctx->workspace),
+ ZSTD_DCtx_get_bmi2(dctx));
+ RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
  ip += mlhSize;
  }
  }
@@ -554,7 +734,6 @@ typedef struct {
  size_t litLength;
  size_t matchLength;
  size_t offset;
- const BYTE* match;
  } seq_t;

  typedef struct {
@@ -568,9 +747,6 @@ typedef struct {
  ZSTD_fseState stateOffb;
  ZSTD_fseState stateML;
  size_t prevOffset[ZSTD_REP_NUM];
- const BYTE* prefixStart;
- const BYTE* dictEnd;
- size_t pos;
  } seqState_t;

  /*! ZSTD_overlapCopy8() :
@@ -580,7 +756,7 @@ typedef struct {
  * Precondition: *ip <= *op
  * Postcondition: *op - *ip >= 8
  */
- static void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
+ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
  assert(*ip <= *op);
  if (offset < 8) {
  /* close range match, overlap */
@@ -613,7 +789,7 @@ static void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
  * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
  *   The src buffer must be before the dst buffer.
  */
- static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
+ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
  ptrdiff_t const diff = op - ip;
  BYTE* const oend = op + length;

@@ -629,6 +805,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
  /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
  assert(length >= 8);
  ZSTD_overlapCopy8(&op, &ip, diff);
+ length -= 8;
  assert(op - ip >= 8);
  assert(op <= oend);
  }
@@ -643,12 +820,35 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
  assert(oend > oend_w);
  ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
  ip += oend_w - op;
- op = oend_w;
+ op += oend_w - op;
  }
  /* Handle the leftovers. */
  while (op < oend) *op++ = *ip++;
  }

+ /* ZSTD_safecopyDstBeforeSrc():
+ * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
+ * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
+ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) {
+ ptrdiff_t const diff = op - ip;
+ BYTE* const oend = op + length;
+
+ if (length < 8 || diff > -8) {
+ /* Handle short lengths, close overlaps, and dst not before src. */
+ while (op < oend) *op++ = *ip++;
+ return;
+ }
+
+ if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
+ ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
+ ip += oend - WILDCOPY_OVERLENGTH - op;
+ op += oend - WILDCOPY_OVERLENGTH - op;
+ }
+
+ /* Handle the leftovers. */
+ while (op < oend) *op++ = *ip++;
+ }
+
  /* ZSTD_execSequenceEnd():
  * This version handles cases that are near the end of the output buffer. It requires
  * more careful checks to make sure there is no overflow. By separating out these hard
@@ -659,21 +859,21 @@
  */
  FORCE_NOINLINE
  size_t ZSTD_execSequenceEnd(BYTE* op,
- BYTE* const oend, seq_t sequence,
- const BYTE** litPtr, const BYTE* const litLimit,
- const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+ BYTE* const oend, seq_t sequence,
+ const BYTE** litPtr, const BYTE* const litLimit,
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
  {
  BYTE* const oLitEnd = op + sequence.litLength;
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
- BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
  const BYTE* const iLitEnd = *litPtr + sequence.litLength;
  const BYTE* match = oLitEnd - sequence.offset;
  BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;

- /* bounds checks */
- assert(oLitEnd < oMatchEnd);
- RETURN_ERROR_IF(oMatchEnd > oend, dstSize_tooSmall, "last match must fit within dstBuffer");
- RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "try to read beyond literal buffer");
+ /* bounds checks : careful of address space overflow in 32-bit mode */
+ RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
+ RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
+ assert(op < op + sequenceLength);
+ assert(oLitEnd < op + sequenceLength);

  /* copy literals */
  ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
@@ -683,42 +883,102 @@ size_t ZSTD_execSequenceEnd(BYTE* op,
  /* copy Match */
  if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
  /* offset beyond prefix */
- RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected);
- match = dictEnd - (prefixStart-match);
+ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
+ match = dictEnd - (prefixStart - match);
  if (match + sequence.matchLength <= dictEnd) {
- memmove(oLitEnd, match, sequence.matchLength);
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
  return sequenceLength;
  }
  /* span extDict & currentPrefixSegment */
  { size_t const length1 = dictEnd - match;
- memmove(oLitEnd, match, length1);
- op = oLitEnd + length1;
- sequence.matchLength -= length1;
- match = prefixStart;
- } }
+ ZSTD_memmove(oLitEnd, match, length1);
+ op = oLitEnd + length1;
+ sequence.matchLength -= length1;
+ match = prefixStart;
+ }
+ }
+ ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
+ return sequenceLength;
+ }
+
+ /* ZSTD_execSequenceEndSplitLitBuffer():
+ * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case.
+ */
+ FORCE_NOINLINE
+ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
+ BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
+ const BYTE** litPtr, const BYTE* const litLimit,
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+ {
+ BYTE* const oLitEnd = op + sequence.litLength;
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+ const BYTE* match = oLitEnd - sequence.offset;
+
+
+ /* bounds checks : careful of address space overflow in 32-bit mode */
+ RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
+ RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
+ assert(op < op + sequenceLength);
+ assert(oLitEnd < op + sequenceLength);
+
+ /* copy literals */
+ RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
+ ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
+ op = oLitEnd;
+ *litPtr = iLitEnd;
+
+ /* copy Match */
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+ /* offset beyond prefix */
+ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
+ match = dictEnd - (prefixStart - match);
+ if (match + sequence.matchLength <= dictEnd) {
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
+ return sequenceLength;
+ }
+ /* span extDict & currentPrefixSegment */
+ { size_t const length1 = dictEnd - match;
+ ZSTD_memmove(oLitEnd, match, length1);
+ op = oLitEnd + length1;
+ sequence.matchLength -= length1;
+ match = prefixStart;
+ }
+ }
  ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
  return sequenceLength;
  }

  HINT_INLINE
  size_t ZSTD_execSequence(BYTE* op,
- BYTE* const oend, seq_t sequence,
- const BYTE** litPtr, const BYTE* const litLimit,
- const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+ BYTE* const oend, seq_t sequence,
+ const BYTE** litPtr, const BYTE* const litLimit,
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
  {
  BYTE* const oLitEnd = op + sequence.litLength;
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
  BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
- BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+ BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */
  const BYTE* const iLitEnd = *litPtr + sequence.litLength;
  const BYTE* match = oLitEnd - sequence.offset;

- /* Errors and uncommon cases handled here. */
- assert(oLitEnd < oMatchEnd);
- if (iLitEnd > litLimit || oMatchEnd > oend_w)
+ assert(op != NULL /* Precondition */);
+ assert(oend_w < oend /* No underflow */);
+ /* Handle edge cases in a slow path:
+ * - Read beyond end of literals
+ * - Match end is within WILDCOPY_OVERLENGTH of oend
+ * - 32-bit mode and the match length overflows
+ */
+ if (UNLIKELY(
+ iLitEnd > litLimit ||
+ oMatchEnd > oend_w ||
+ (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
  return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);

  /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
+ assert(op <= oLitEnd /* No overflow */);
+ assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
+ assert(oMatchEnd <= oend /* No underflow */);
  assert(iLitEnd <= litLimit /* Literal length is in bounds */);
  assert(oLitEnd <= oend_w /* Can wildcopy literals */);
  assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
@@ -729,7 +989,99 @@ size_t ZSTD_execSequence(BYTE* op,
  */
  assert(WILDCOPY_OVERLENGTH >= 16);
  ZSTD_copy16(op, (*litPtr));
- if (sequence.litLength > 16) {
+ if (UNLIKELY(sequence.litLength > 16)) {
+ ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
+ }
+ op = oLitEnd;
+ *litPtr = iLitEnd; /* update for next sequence */
+
+ /* Copy Match */
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+ /* offset beyond prefix -> go into extDict */
+ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
+ match = dictEnd + (match - prefixStart);
+ if (match + sequence.matchLength <= dictEnd) {
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
+ return sequenceLength;
+ }
+ /* span extDict & currentPrefixSegment */
+ { size_t const length1 = dictEnd - match;
+ ZSTD_memmove(oLitEnd, match, length1);
+ op = oLitEnd + length1;
+ sequence.matchLength -= length1;
+ match = prefixStart;
+ }
+ }
+ /* Match within prefix of 1 or more bytes */
+ assert(op <= oMatchEnd);
+ assert(oMatchEnd <= oend_w);
+ assert(match >= prefixStart);
+ assert(sequence.matchLength >= 1);
+
+ /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
+ * without overlap checking.
+ */
+ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
+ /* We bet on a full wildcopy for matches, since we expect matches to be
+ * longer than literals (in general). In silesia, ~10% of matches are longer
+ * than 16 bytes.
+ */
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
+ return sequenceLength;
+ }
+ assert(sequence.offset < WILDCOPY_VECLEN);
+
+ /* Copy 8 bytes and spread the offset to be >= 8. */
+ ZSTD_overlapCopy8(&op, &match, sequence.offset);
+
+ /* If the match length is > 8 bytes, then continue with the wildcopy. */
+ if (sequence.matchLength > 8) {
+ assert(op < oMatchEnd);
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
+ }
+ return sequenceLength;
+ }
+
+ HINT_INLINE
+ size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
+ BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
+ const BYTE** litPtr, const BYTE* const litLimit,
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+ {
+ BYTE* const oLitEnd = op + sequence.litLength;
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+ BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+ const BYTE* match = oLitEnd - sequence.offset;
+
+ assert(op != NULL /* Precondition */);
+ assert(oend_w < oend /* No underflow */);
+ /* Handle edge cases in a slow path:
+ * - Read beyond end of literals
+ * - Match end is within WILDCOPY_OVERLENGTH of oend
+ * - 32-bit mode and the match length overflows
+ */
+ if (UNLIKELY(
+ iLitEnd > litLimit ||
+ oMatchEnd > oend_w ||
+ (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
+ return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
+
+ /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
+ assert(op <= oLitEnd /* No overflow */);
+ assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
+ assert(oMatchEnd <= oend /* No underflow */);
+ assert(iLitEnd <= litLimit /* Literal length is in bounds */);
+ assert(oLitEnd <= oend_w /* Can wildcopy literals */);
+ assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
+
+ /* Copy Literals:
+ * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
+ * We likely don't need the full 32-byte wildcopy.
+ */
+ assert(WILDCOPY_OVERLENGTH >= 16);
+ ZSTD_copy16(op, (*litPtr));
+ if (UNLIKELY(sequence.litLength > 16)) {
  ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
  }
  op = oLitEnd;
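
The fast paths above lean on ZSTD_overlapCopy8() to handle matches whose offset is smaller than 8. A simplified byte-wise equivalent of that offset-spreading step (the real function uses fixed adjustment tables rather than a loop):

    #include <stddef.h>

    /* Emit 8 bytes of the repeating pattern (period = offset, offset >= 1),
     * then re-point the read head the smallest multiple of offset that is
     * >= 8 bytes behind the write head. After this, 8-byte block copies
     * never read bytes written by the same copy, which is the postcondition
     * ZSTD_overlapCopy8() establishes before wildcopy takes over. */
    static void overlap_copy8(unsigned char** op, const unsigned char** ip, size_t offset)
    {
        size_t k = offset;
        int n;
        for (n = 0; n < 8; n++) (*op)[n] = (*ip)[n];  /* in-order: extends the pattern */
        while (k < 8) k += offset;                    /* smallest multiple of offset >= 8 */
        *op += 8;
        *ip = *op - k;                                /* same pattern phase, distance >= 8 */
    }
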
@@ -738,15 +1090,15 @@ size_t ZSTD_execSequence(BYTE* op,
738
1090
  /* Copy Match */
739
1091
  if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
740
1092
  /* offset beyond prefix -> go into extDict */
741
- RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected);
1093
+ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
742
1094
  match = dictEnd + (match - prefixStart);
743
1095
  if (match + sequence.matchLength <= dictEnd) {
744
- memmove(oLitEnd, match, sequence.matchLength);
1096
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
745
1097
  return sequenceLength;
746
1098
  }
747
1099
  /* span extDict & currentPrefixSegment */
748
1100
  { size_t const length1 = dictEnd - match;
749
- memmove(oLitEnd, match, length1);
1101
+ ZSTD_memmove(oLitEnd, match, length1);
750
1102
  op = oLitEnd + length1;
751
1103
  sequence.matchLength -= length1;
752
1104
  match = prefixStart;
@@ -760,7 +1112,7 @@ size_t ZSTD_execSequence(BYTE* op,
760
1112
  /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
761
1113
  * without overlap checking.
762
1114
  */
763
- if (sequence.offset >= WILDCOPY_VECLEN) {
1115
+ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
764
1116
  /* We bet on a full wildcopy for matches, since we expect matches to be
765
1117
  * longer than literals (in general). In silesia, ~10% of matches are longer
766
1118
  * than 16 bytes.
@@ -781,6 +1133,7 @@ size_t ZSTD_execSequence(BYTE* op,
781
1133
  return sequenceLength;
782
1134
  }
783
1135
 
1136
+
784
1137
  static void
785
1138
  ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
786
1139
  {
@@ -794,12 +1147,10 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS
794
1147
  }
795
1148
 
796
1149
  FORCE_INLINE_TEMPLATE void
797
- ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
1150
+ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)
798
1151
  {
799
- ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state];
800
- U32 const nbBits = DInfo.nbBits;
801
1152
  size_t const lowBits = BIT_readBits(bitD, nbBits);
802
- DStatePtr->state = DInfo.nextState + lowBits;
1153
+ DStatePtr->state = nextState + lowBits;
803
1154
  }
804
1155
 
805
1156
  /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
@@ -814,102 +1165,178 @@ ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
814
1165
 
815
1166
  typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
816
1167
 
817
- #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
818
1168
  FORCE_INLINE_TEMPLATE seq_t
819
1169
  ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
820
1170
  {
821
1171
  seq_t seq;
822
- U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
823
- U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
824
- U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
825
- U32 const totalBits = llBits+mlBits+ofBits;
826
- U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
827
- U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
828
- U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
829
-
830
- /* sequence */
831
- { size_t offset;
832
- if (!ofBits)
833
- offset = 0;
834
- else {
835
- ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
836
- ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
837
- assert(ofBits <= MaxOff);
838
- if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
839
- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
840
- offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
841
- BIT_reloadDStream(&seqState->DStream);
842
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
843
- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
844
- } else {
845
- offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
846
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
847
- }
848
- }
1172
+ const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
1173
+ const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
1174
+ const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
1175
+ seq.matchLength = mlDInfo->baseValue;
1176
+ seq.litLength = llDInfo->baseValue;
1177
+ { U32 const ofBase = ofDInfo->baseValue;
1178
+ BYTE const llBits = llDInfo->nbAdditionalBits;
1179
+ BYTE const mlBits = mlDInfo->nbAdditionalBits;
1180
+ BYTE const ofBits = ofDInfo->nbAdditionalBits;
1181
+ BYTE const totalBits = llBits+mlBits+ofBits;
1182
+
1183
+ U16 const llNext = llDInfo->nextState;
1184
+ U16 const mlNext = mlDInfo->nextState;
1185
+ U16 const ofNext = ofDInfo->nextState;
1186
+ U32 const llnbBits = llDInfo->nbBits;
1187
+ U32 const mlnbBits = mlDInfo->nbBits;
1188
+ U32 const ofnbBits = ofDInfo->nbBits;
1189
+ /*
1190
+ * As gcc has better branch and block analyzers, sometimes it is only
1191
+ * valuable to mark likelyness for clang, it gives around 3-4% of
1192
+ * performance.
1193
+ */

- if (ofBits <= 1) {
- offset += (llBase==0);
- if (offset) {
- size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
- if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+ /* sequence */
+ { size_t offset;
+ #if defined(__clang__)
+ if (LIKELY(ofBits > 1)) {
+ #else
+ if (ofBits > 1) {
+ #endif
+ ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
+ ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
+ assert(ofBits <= MaxOff);
+ if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
+ U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
+ offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+ BIT_reloadDStream(&seqState->DStream);
+ if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+ assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
+ } else {
+ offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+ }
+ seqState->prevOffset[2] = seqState->prevOffset[1];
  seqState->prevOffset[1] = seqState->prevOffset[0];
- seqState->prevOffset[0] = offset = temp;
- } else { /* offset == 0 */
- offset = seqState->prevOffset[0];
- }
- } else {
- seqState->prevOffset[2] = seqState->prevOffset[1];
- seqState->prevOffset[1] = seqState->prevOffset[0];
- seqState->prevOffset[0] = offset;
+ seqState->prevOffset[0] = offset;
+ } else {
+ U32 const ll0 = (llDInfo->baseValue == 0);
+ if (LIKELY((ofBits == 0))) {
+ offset = seqState->prevOffset[ll0];
+ seqState->prevOffset[1] = seqState->prevOffset[!ll0];
+ seqState->prevOffset[0] = offset;
+ } else {
+ offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
+ { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+ temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
+ if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset = temp;
+ } } }
+ seq.offset = offset;
  }
- seq.offset = offset;
- }

- seq.matchLength = mlBase
- + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0); /* <= 16 bits */
- if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
- BIT_reloadDStream(&seqState->DStream);
- if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
- BIT_reloadDStream(&seqState->DStream);
- /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
- ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
-
- seq.litLength = llBase
- + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0); /* <= 16 bits */
- if (MEM_32bits())
- BIT_reloadDStream(&seqState->DStream);
-
- DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
-
- /* ANS state update */
- ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
- ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
- ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
+ #if defined(__clang__)
+ if (UNLIKELY(mlBits > 0))
+ #else
+ if (mlBits > 0)
+ #endif
+ seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
+
+ if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+ BIT_reloadDStream(&seqState->DStream);
+ if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+ BIT_reloadDStream(&seqState->DStream);
+ /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
+ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+ #if defined(__clang__)
+ if (UNLIKELY(llBits > 0))
+ #else
+ if (llBits > 0)
+ #endif
+ seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
+
+ if (MEM_32bits())
+ BIT_reloadDStream(&seqState->DStream);
+
+ DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+
+ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */
+ }

  return seq;
  }
 
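The rewritten offset branch above implements zstd's three-slot repeat-offset ("repcode") history: small codes select a recently used offset (shifted by one when the literal length is zero), while larger codes read a fresh offset and push the history down. As a minimal standalone sketch of that bookkeeping — illustrative names, not the library's API — assuming `v` is the decoded selector value (raw bits plus the ll0 adjustment, so 0 means "repeat the last offset"):

```c
#include <stddef.h>

/* Three-slot repeat-offset history, as updated by ZSTD_decodeSequence.
 * v == 0    : reuse rep[0], history untouched.
 * v == 1..2 : pick rep[v] and rotate it to the front.
 * v == 3    : special case meaning rep[0] - 1. */
typedef struct { size_t rep[3]; } RepHistory;   /* illustrative type */

static size_t resolveRepOffset(RepHistory* h, size_t v)
{
    size_t offset;
    if (v == 0) return h->rep[0];
    offset = (v == 3) ? h->rep[0] - 1 : h->rep[v];
    offset += (offset == 0);           /* 0 is invalid (corrupt input): force 1 */
    if (v != 1) h->rep[2] = h->rep[1];
    h->rep[1] = h->rep[0];
    h->rep[0] = offset;
    return offset;
}
```

The history persists across sequences, and the decode loops below write it back to `dctx->entropy.rep`, which is how it also carries across blocks.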
+ #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
+ {
+ size_t const windowSize = dctx->fParams.windowSize;
+ /* No dictionary used. */
+ if (dctx->dictContentEndForFuzzing == NULL) return 0;
+ /* Dictionary is our prefix. */
+ if (prefixStart == dctx->dictContentBeginForFuzzing) return 1;
+ /* Dictionary is not our ext-dict. */
+ if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0;
+ /* Dictionary is not within our window size. */
+ if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0;
+ /* Dictionary is active. */
+ return 1;
+ }
+
+ MEM_STATIC void ZSTD_assertValidSequence(
+ ZSTD_DCtx const* dctx,
+ BYTE const* op, BYTE const* oend,
+ seq_t const seq,
+ BYTE const* prefixStart, BYTE const* virtualStart)
+ {
+ #if DEBUGLEVEL >= 1
+ size_t const windowSize = dctx->fParams.windowSize;
+ size_t const sequenceSize = seq.litLength + seq.matchLength;
+ BYTE const* const oLitEnd = op + seq.litLength;
+ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+ assert(op <= oend);
+ assert((size_t)(oend - op) >= sequenceSize);
+ assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
+ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
+ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
+ /* Offset must be within the dictionary. */
+ assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
+ assert(seq.offset <= windowSize + dictSize);
+ } else {
+ /* Offset must be within our window. */
+ assert(seq.offset <= windowSize);
+ }
+ #else
+ (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
+ #endif
+ }
+ #endif
+
+ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+
+
  FORCE_INLINE_TEMPLATE size_t
  DONT_VECTORIZE
- ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
+ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
  const BYTE* ip = (const BYTE*)seqStart;
  const BYTE* const iend = ip + seqSize;
- BYTE* const ostart = (BYTE* const)dst;
+ BYTE* const ostart = (BYTE*)dst;
  BYTE* const oend = ostart + maxDstSize;
  BYTE* op = ostart;
  const BYTE* litPtr = dctx->litPtr;
- const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
  const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
- DEBUGLOG(5, "ZSTD_decompressSequences_body");
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");
+ (void)frame;

  /* Regen sequences */
  if (nbSeq) {
@@ -918,38 +1345,279 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
  RETURN_ERROR_IF(
  ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
- corruption_detected);
+ corruption_detected, "");
  ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
  ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
  ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+ assert(dst != NULL);

  ZSTD_STATIC_ASSERT(
  BIT_DStream_unfinished < BIT_DStream_completed &&
  BIT_DStream_endOfBuffer < BIT_DStream_completed &&
  BIT_DStream_completed < BIT_DStream_overflow);

- for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) {
- nbSeq--;
- { seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
+ /* decompress without overrunning litPtr begins */
+ {
+ seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ /* Align the decompression loop to 32 + 16 bytes.
+ *
+ * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
+ * speed swings based on the alignment of the decompression loop. This
+ * performance swing is caused by parts of the decompression loop falling
+ * out of the DSB. The entire decompression loop should fit in the DSB;
+ * when it can't, we get much worse performance. You can measure whether you've
+ * hit the good case or the bad case with this perf command for some
+ * compressed file test.zst:
+ *
+ * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
+ * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
+ *
+ * If you see most cycles served out of the MITE you've hit the bad case.
+ * If you see most cycles served out of the DSB you've hit the good case.
+ * If it is pretty even then you may be in an okay case.
+ *
+ * This issue has been reproduced on the following CPUs:
+ * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
+ * Use Instruments->Counters to get DSB/MITE cycles.
+ * I never got performance swings, but I was able to
+ * go from the good case of mostly DSB to half of the
+ * cycles served from MITE.
+ * - Coffeelake: Intel i9-9900k
+ * - Coffeelake: Intel i7-9700k
+ *
+ * I haven't been able to reproduce the instability or DSB misses on any
+ * of the following CPUs:
+ * - Haswell
+ * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
+ * - Skylake
+ *
+ * Alignment is done for each of the three major decompression loops:
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
+ * - ZSTD_decompressSequences_body
+ * Alignment choices are made to minimize large swings on bad cases and influence on performance
+ * from changes external to this code, rather than to overoptimize on the current commit.
+ *
+ * If you are seeing performance instability, this script can help test.
+ * It tests on 4 commits in zstd where I saw performance change.
+ *
+ * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
+ */
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __asm__(".p2align 6");
+ # if __GNUC__ >= 7
+ /* good for gcc-7, gcc-9, and gcc-11 */
+ __asm__("nop");
+ __asm__(".p2align 5");
+ __asm__("nop");
+ __asm__(".p2align 4");
+ # if __GNUC__ == 8 || __GNUC__ == 10
+ /* good for gcc-8 and gcc-10 */
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # endif
+ # endif
+ #endif
+
+ /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
+ for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) {
+ size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
  DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
  op += oneSeqSize;
- } }
+ if (UNLIKELY(!--nbSeq))
+ break;
+ BIT_reloadDStream(&(seqState.DStream));
+ sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ }
+
+ /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */
+ if (nbSeq > 0) {
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+ if (leftoverLit)
+ {
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+ sequence.litLength -= leftoverLit;
+ op += leftoverLit;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ {
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ op += oneSeqSize;
+ if (--nbSeq)
+ BIT_reloadDStream(&(seqState.DStream));
+ }
+ }
+ }
+
+ if (nbSeq > 0) /* there is remaining lit from extra buffer */
+ {
+
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __asm__(".p2align 6");
+ __asm__("nop");
+ # if __GNUC__ != 7
+ /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */
+ __asm__(".p2align 4");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # elif __GNUC__ >= 11
+ __asm__(".p2align 3");
+ # else
+ __asm__(".p2align 5");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # endif
+ #endif
+
+ for (; ; ) {
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ op += oneSeqSize;
+ if (UNLIKELY(!--nbSeq))
+ break;
+ BIT_reloadDStream(&(seqState.DStream));
+ }
+ }
+
+ /* check if reached exact end */
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
+ RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+ RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
+ /* save reps for next block */
+ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+ }
+
+ /* last literal segment */
+ if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
+ {
+ size_t const lastLLSize = litBufferEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memmove(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ }
+ { size_t const lastLLSize = litBufferEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ }
+
+ return op-ostart;
+ }
+
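Both hot loops in the function above are padded with `.p2align`/`nop` pairs, which is hard to read in diff form. A minimal sketch of the same technique, assuming GCC or Clang on x86-64 (the loop body and alignment constant here are stand-ins, not zstd's tuned values):

```c
#include <stddef.h>

/* Pad the instruction stream so the following loop entry starts on a
 * 64-byte boundary, improving the odds that the whole loop fits in the
 * uop cache (DSB) instead of being served from the legacy MITE path. */
#if defined(__GNUC__) && defined(__x86_64__)
# define ALIGN_CODE_64() __asm__(".p2align 6")
#else
# define ALIGN_CODE_64() /* no-op on other targets */
#endif

unsigned long sum_bytes(const unsigned char* p, size_t n)
{
    unsigned long total = 0;
    size_t i;
    ALIGN_CODE_64();                  /* align the loop entry point */
    for (i = 0; i < n; i++)
        total += p[i];
    return total;
}
```

The `perf stat` invocation quoted in the comment above is how to check whether a given build lands in the DSB-served good case or the MITE-served bad case.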
+ FORCE_INLINE_TEMPLATE size_t
+ DONT_VECTORIZE
+ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+ {
+ const BYTE* ip = (const BYTE*)seqStart;
+ const BYTE* const iend = ip + seqSize;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;
+ BYTE* op = ostart;
+ const BYTE* litPtr = dctx->litPtr;
+ const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
+ const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
+ const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
+ DEBUGLOG(5, "ZSTD_decompressSequences_body");
+ (void)frame;
+
+ /* Regen sequences */
+ if (nbSeq) {
+ seqState_t seqState;
+ dctx->fseEntropy = 1;
+ { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+ RETURN_ERROR_IF(
+ ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
+ corruption_detected, "");
+ ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+ ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+ ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+ assert(dst != NULL);
+
+ ZSTD_STATIC_ASSERT(
+ BIT_DStream_unfinished < BIT_DStream_completed &&
+ BIT_DStream_endOfBuffer < BIT_DStream_completed &&
+ BIT_DStream_completed < BIT_DStream_overflow);
+
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __asm__(".p2align 6");
+ __asm__("nop");
+ # if __GNUC__ >= 7
+ __asm__(".p2align 5");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # else
+ __asm__(".p2align 4");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # endif
+ #endif
+
+ for ( ; ; ) {
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ op += oneSeqSize;
+ if (UNLIKELY(!--nbSeq))
+ break;
+ BIT_reloadDStream(&(seqState.DStream));
+ }

  /* check if reached exact end */
  DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
- RETURN_ERROR_IF(nbSeq, corruption_detected);
- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected);
+ RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+ RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
  /* save reps for next block */
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
  }

  /* last literal segment */
  { size_t const lastLLSize = litEnd - litPtr;
- RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall);
- memcpy(op, litPtr, lastLLSize);
- op += lastLLSize;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
  }

  return op-ostart;
@@ -959,157 +1627,180 @@ static size_t
  ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
- #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
-

-
- #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
- FORCE_INLINE_TEMPLATE seq_t
- ZSTD_decodeSequenceLong(seqState_t* seqState, ZSTD_longOffset_e const longOffsets)
+ static size_t
+ ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- seq_t seq;
- U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
- U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
- U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
- U32 const totalBits = llBits+mlBits+ofBits;
- U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
- U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
- U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
-
- /* sequence */
- { size_t offset;
- if (!ofBits)
- offset = 0;
- else {
- ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
- ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
- assert(ofBits <= MaxOff);
- if (MEM_32bits() && longOffsets) {
- U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1);
- offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
- if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream);
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
- } else {
- offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
- }
- }
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+ #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

- if (ofBits <= 1) {
- offset += (llBase==0);
- if (offset) {
- size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
- if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
- seqState->prevOffset[1] = seqState->prevOffset[0];
- seqState->prevOffset[0] = offset = temp;
- } else {
- offset = seqState->prevOffset[0];
- }
- } else {
- seqState->prevOffset[2] = seqState->prevOffset[1];
- seqState->prevOffset[1] = seqState->prevOffset[0];
- seqState->prevOffset[0] = offset;
- }
- seq.offset = offset;
- }
+ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT

- seq.matchLength = mlBase + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */
- if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
- BIT_reloadDStream(&seqState->DStream);
- if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
- BIT_reloadDStream(&seqState->DStream);
- /* Verify that there is enough bits to read the rest of the data in 64-bit mode. */
- ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
-
- seq.litLength = llBase + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */
- if (MEM_32bits())
- BIT_reloadDStream(&seqState->DStream);
-
- { size_t const pos = seqState->pos + seq.litLength;
- const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
- seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
- * No consequence though : no memory access will occur, overly large offset will be detected in ZSTD_execSequenceLong() */
- seqState->pos = pos + seq.matchLength;
+ FORCE_INLINE_TEMPLATE size_t
+ ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
+ const BYTE* const prefixStart, const BYTE* const dictEnd)
+ {
+ prefetchPos += sequence.litLength;
+ { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
+ const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
+ * No consequence though : memory address is only used for prefetching, not for dereferencing */
+ PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
  }
-
- /* ANS state update */
- ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
- ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
- ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
-
- return seq;
+ return prefetchPos + sequence.matchLength;
  }
 
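ZSTD_prefetchMatch above touches the cache lines a future match copy will read, well before ZSTD_execSequence needs them. A standalone sketch of that pattern, assuming GCC/Clang where `__builtin_prefetch` is the intrinsic underlying PREFETCH_L1 on those compilers (names below are illustrative, not zstd's):

```c
#include <stddef.h>

#define LOOKAHEAD 8   /* how many iterations ahead to warm the cache */

/* Copy a batch of byte ranges, prefetching the source we will need
 * LOOKAHEAD iterations from now so the miss latency overlaps the
 * copies in between. */
void copy_with_prefetch(unsigned char* dst, const unsigned char* const* srcs,
                        const size_t* lens, size_t n)
{
    size_t i, j;
    for (i = 0; i < n; i++) {
        if (i + LOOKAHEAD < n)
            __builtin_prefetch(srcs[i + LOOKAHEAD], 0 /* read */, 3 /* keep in L1 */);
        for (j = 0; j < lens[i]; j++)
            *dst++ = srcs[i][j];
    }
}
```

As the comment inside ZSTD_prefetchMatch notes, the prefetched address only needs to be plausible, never dereferenced, so even a corrupt offset is harmless there.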
+ /* This decoding function employs prefetching
+ * to reduce the latency impact of cache misses.
+ * It is generally employed when a block contains a significant portion of long-distance matches
+ * or when coupled with a "cold" dictionary */
  FORCE_INLINE_TEMPLATE size_t
  ZSTD_decompressSequencesLong_body(
  ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
  const BYTE* ip = (const BYTE*)seqStart;
  const BYTE* const iend = ip + seqSize;
- BYTE* const ostart = (BYTE* const)dst;
- BYTE* const oend = ostart + maxDstSize;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;
  BYTE* op = ostart;
  const BYTE* litPtr = dctx->litPtr;
- const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
  const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+ (void)frame;

  /* Regen sequences */
  if (nbSeq) {
- #define STORED_SEQS 4
+ #define STORED_SEQS 8
  #define STORED_SEQS_MASK (STORED_SEQS-1)
- #define ADVANCED_SEQS 4
+ #define ADVANCED_SEQS STORED_SEQS
  seq_t sequences[STORED_SEQS];
  int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
  seqState_t seqState;
  int seqNb;
+ size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */
+
  dctx->fseEntropy = 1;
  { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
- seqState.prefixStart = prefixStart;
- seqState.pos = (size_t)(op-prefixStart);
- seqState.dictEnd = dictEnd;
+ assert(dst != NULL);
  assert(iend >= ip);
  RETURN_ERROR_IF(
  ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
- corruption_detected);
+ corruption_detected, "");
  ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
  ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
  ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);

  /* prepare in advance */
  for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
- sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
- PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+ sequences[seqNb] = sequence;
  }
- RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected);
-
- /* decode and decompress */
- for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
- seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
- PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
- sequences[seqNb & STORED_SEQS_MASK] = sequence;
- op += oneSeqSize;
+ RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
+
+ /* decompress without stomping litBuffer */
+ for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) {
+ seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ size_t oneSeqSize;
+
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd)
+ {
+ /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+ if (leftoverLit)
+ {
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+ sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;
+ op += leftoverLit;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
+ op += oneSeqSize;
+ }
+ else
+ {
+ /* lit buffer is either wholly contained in the first or second split, or not split at all */
+ oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+ ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
+ op += oneSeqSize;
+ }
  }
- RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected);
+ RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");

  /* finish queue */
  seqNb -= seqAdvance;
  for ( ; seqNb<nbSeq ; seqNb++) {
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
- op += oneSeqSize;
+ seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)
+ {
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+ if (leftoverLit)
+ {
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+ sequence->litLength -= leftoverLit;
+ op += leftoverLit;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ {
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ op += oneSeqSize;
+ }
+ }
+ else
+ {
+ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+ ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ op += oneSeqSize;
+ }
  }

  /* save reps for next block */
@@ -1117,10 +1808,23 @@ ZSTD_decompressSequencesLong_body(
  }

  /* last literal segment */
- { size_t const lastLLSize = litEnd - litPtr;
- RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall);
- memcpy(op, litPtr, lastLLSize);
- op += lastLLSize;
+ if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */
+ {
+ size_t const lastLLSize = litBufferEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memmove(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ }
+ { size_t const lastLLSize = litBufferEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memmove(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
  }

  return op-ostart;
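The loop structure above is a software pipeline: sequences are decoded ADVANCED_SEQS ahead of execution and parked in a power-of-two ring indexed with STORED_SEQS_MASK, so each prefetch has several iterations to land; widening STORED_SEQS from 4 to 8 simply gives the prefetches more slack. A compact sketch of that ring discipline (illustrative types and names, not zstd's):

```c
/* Decode-ahead pipeline: fill the ring, then run decode and execute
 * offset by `advance` slots, then drain. Mirrors the loop above. */
#define RING_SIZE 8                    /* must stay a power of two */
#define RING_MASK (RING_SIZE - 1)

typedef struct { size_t litLength, matchLength, offset; } item_t;

static void pipeline(item_t (*decode_next)(void), void (*execute)(item_t),
                     int nbItems)
{
    item_t ring[RING_SIZE];
    int const advance = nbItems < RING_SIZE ? nbItems : RING_SIZE;
    int i;
    for (i = 0; i < advance; i++)              /* prepare in advance */
        ring[i & RING_MASK] = decode_next();
    for (; i < nbItems; i++) {                 /* steady state */
        item_t const next = decode_next();     /* prefetch issued here */
        execute(ring[(i - advance) & RING_MASK]);
        ring[i & RING_MASK] = next;
    }
    for (i -= advance; i < nbItems; i++)       /* finish queue */
        execute(ring[i & RING_MASK]);
}
```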
@@ -1130,9 +1834,10 @@ static size_t
  ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

@@ -1141,25 +1846,37 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
  #if DYNAMIC_BMI2

  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
- static TARGET_ATTRIBUTE("bmi2") size_t
+ static BMI2_TARGET_ATTRIBUTE size_t
  DONT_VECTORIZE
  ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+ static BMI2_TARGET_ATTRIBUTE size_t
+ DONT_VECTORIZE
+ ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+ {
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
- static TARGET_ATTRIBUTE("bmi2") size_t
+ static BMI2_TARGET_ATTRIBUTE size_t
  ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

@@ -1169,21 +1886,37 @@ typedef size_t (*ZSTD_decompressSequences_t)(
  ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset);
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame);

  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
  static size_t
  ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
  DEBUGLOG(5, "ZSTD_decompressSequences");
  #if DYNAMIC_BMI2
- if (dctx->bmi2) {
- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
+ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif
- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+ static size_t
+ ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+ {
+ DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
+ #if DYNAMIC_BMI2
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
+ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+ #endif
+ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

@@ -1198,15 +1931,16 @@ static size_t
  ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
  DEBUGLOG(5, "ZSTD_decompressSequencesLong");
  #if DYNAMIC_BMI2
- if (dctx->bmi2) {
- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
+ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif
- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

@@ -1240,11 +1974,10 @@ ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
  }
  #endif

-
  size_t
  ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  void* dst, size_t dstCapacity,
- const void* src, size_t srcSize, const int frame)
+ const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
  { /* blockType == blockCompressed */
  const BYTE* ip = (const BYTE*)src;
  /* isLongOffset must be true if there are long offsets.
@@ -1256,10 +1989,10 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
  DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);

- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong);
+ RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");

  /* Decode literals section */
- { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+ { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
  DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
  if (ZSTD_isError(litCSize)) return litCSize;
  ip += litCSize;
@@ -1282,6 +2015,8 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  ip += seqHSize;
  srcSize -= seqHSize;

+ RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+
  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
  !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
  if ( !usePrefetchDecoder
@@ -1300,24 +2035,38 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  if (usePrefetchDecoder)
  #endif
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
  #endif

  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
  /* else */
- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+ if (dctx->litBufferLocation == ZSTD_split)
+ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+ else
+ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
  #endif
  }
  }

 
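The `usePrefetchDecoder` dispatch above weighs how many sequences use long-distance offsets before committing to the prefetching decoder. As a loose sketch of that kind of heuristic — both thresholds below are hypothetical placeholders, not the constants zstd actually uses:

```c
/* Hypothetical dispatch sketch: prefer the prefetching (long) decoder
 * only when long-distance matches are frequent enough that cache-miss
 * latency dominates; otherwise the plain decoder is faster. */
static int use_prefetch_decoder_sketch(unsigned longOffsetShare /* per 256 */,
                                       unsigned windowLog)
{
    unsigned const minShare = (windowLog > 23) ? 8 : 24;   /* made-up cutoffs */
    return longOffsetShare >= minShare;
}
```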
+ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
+ {
+ if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */
+ dctx->dictEnd = dctx->previousDstEnd;
+ dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
+ dctx->prefixStart = dst;
+ dctx->previousDstEnd = dst;
+ }
+ }
+
+
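ZSTD_checkContinuity exists to serve zstd's experimental block-level API: callers are expected to decompress blocks back-to-back into contiguous memory, and when they don't, the decoder re-bases its window as above. A hedged usage sketch of that API (error handling trimmed; `ZSTD_STATIC_LINKING_ONLY` must be defined to expose these declarations):

```c
#define ZSTD_STATIC_LINKING_ONLY   /* exposes the block-level declarations */
#include <zstd.h>

/* Decompress a single raw block with the entry point implemented below.
 * A realistic caller keeps dst contiguous across calls, which is the
 * contiguity ZSTD_checkContinuity verifies. */
static size_t decompress_one_raw_block(ZSTD_DCtx* dctx,
                                       void* dst, size_t dstCapacity,
                                       const void* src, size_t srcSize)
{
    size_t const initStatus = ZSTD_decompressBegin(dctx);  /* reset block-level state */
    if (ZSTD_isError(initStatus)) return initStatus;
    return ZSTD_decompressBlock(dctx, dst, dstCapacity, src, srcSize);
}
```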
  size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
  void* dst, size_t dstCapacity,
  const void* src, size_t srcSize)
  {
  size_t dSize;
- ZSTD_checkContinuity(dctx, dst);
- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
+ ZSTD_checkContinuity(dctx, dst, dstCapacity);
+ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);
  dctx->previousDstEnd = (char*)dst + dSize;
  return dSize;
  }