extzstd 0.3.2 → 0.4

Files changed (112)
  1. checksums.yaml +4 -4
  2. data/README.md +4 -3
  3. data/contrib/zstd/CHANGELOG +225 -1
  4. data/contrib/zstd/CONTRIBUTING.md +158 -75
  5. data/contrib/zstd/LICENSE +4 -4
  6. data/contrib/zstd/Makefile +106 -69
  7. data/contrib/zstd/Package.swift +36 -0
  8. data/contrib/zstd/README.md +64 -36
  9. data/contrib/zstd/SECURITY.md +15 -0
  10. data/contrib/zstd/TESTING.md +2 -3
  11. data/contrib/zstd/lib/BUCK +5 -7
  12. data/contrib/zstd/lib/Makefile +117 -199
  13. data/contrib/zstd/lib/README.md +37 -7
  14. data/contrib/zstd/lib/common/allocations.h +55 -0
  15. data/contrib/zstd/lib/common/bits.h +200 -0
  16. data/contrib/zstd/lib/common/bitstream.h +80 -86
  17. data/contrib/zstd/lib/common/compiler.h +225 -63
  18. data/contrib/zstd/lib/common/cpu.h +37 -1
  19. data/contrib/zstd/lib/common/debug.c +7 -1
  20. data/contrib/zstd/lib/common/debug.h +21 -12
  21. data/contrib/zstd/lib/common/entropy_common.c +15 -37
  22. data/contrib/zstd/lib/common/error_private.c +9 -2
  23. data/contrib/zstd/lib/common/error_private.h +93 -5
  24. data/contrib/zstd/lib/common/fse.h +12 -87
  25. data/contrib/zstd/lib/common/fse_decompress.c +37 -117
  26. data/contrib/zstd/lib/common/huf.h +97 -172
  27. data/contrib/zstd/lib/common/mem.h +58 -58
  28. data/contrib/zstd/lib/common/pool.c +38 -17
  29. data/contrib/zstd/lib/common/pool.h +10 -4
  30. data/contrib/zstd/lib/common/portability_macros.h +158 -0
  31. data/contrib/zstd/lib/common/threading.c +74 -14
  32. data/contrib/zstd/lib/common/threading.h +5 -10
  33. data/contrib/zstd/lib/common/xxhash.c +6 -814
  34. data/contrib/zstd/lib/common/xxhash.h +6930 -195
  35. data/contrib/zstd/lib/common/zstd_common.c +1 -36
  36. data/contrib/zstd/lib/common/zstd_deps.h +1 -1
  37. data/contrib/zstd/lib/common/zstd_internal.h +68 -154
  38. data/contrib/zstd/lib/common/zstd_trace.h +163 -0
  39. data/contrib/zstd/lib/compress/clevels.h +134 -0
  40. data/contrib/zstd/lib/compress/fse_compress.c +75 -155
  41. data/contrib/zstd/lib/compress/hist.c +1 -1
  42. data/contrib/zstd/lib/compress/hist.h +1 -1
  43. data/contrib/zstd/lib/compress/huf_compress.c +810 -259
  44. data/contrib/zstd/lib/compress/zstd_compress.c +2864 -919
  45. data/contrib/zstd/lib/compress/zstd_compress_internal.h +523 -192
  46. data/contrib/zstd/lib/compress/zstd_compress_literals.c +117 -40
  47. data/contrib/zstd/lib/compress/zstd_compress_literals.h +16 -6
  48. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +28 -19
  49. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +1 -1
  50. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +251 -412
  51. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +1 -1
  52. data/contrib/zstd/lib/compress/zstd_cwksp.h +284 -97
  53. data/contrib/zstd/lib/compress/zstd_double_fast.c +382 -133
  54. data/contrib/zstd/lib/compress/zstd_double_fast.h +14 -2
  55. data/contrib/zstd/lib/compress/zstd_fast.c +732 -260
  56. data/contrib/zstd/lib/compress/zstd_fast.h +3 -2
  57. data/contrib/zstd/lib/compress/zstd_lazy.c +1177 -390
  58. data/contrib/zstd/lib/compress/zstd_lazy.h +129 -14
  59. data/contrib/zstd/lib/compress/zstd_ldm.c +280 -210
  60. data/contrib/zstd/lib/compress/zstd_ldm.h +3 -2
  61. data/contrib/zstd/lib/compress/zstd_ldm_geartab.h +106 -0
  62. data/contrib/zstd/lib/compress/zstd_opt.c +516 -285
  63. data/contrib/zstd/lib/compress/zstd_opt.h +32 -8
  64. data/contrib/zstd/lib/compress/zstdmt_compress.c +202 -131
  65. data/contrib/zstd/lib/compress/zstdmt_compress.h +9 -6
  66. data/contrib/zstd/lib/decompress/huf_decompress.c +1149 -555
  67. data/contrib/zstd/lib/decompress/huf_decompress_amd64.S +595 -0
  68. data/contrib/zstd/lib/decompress/zstd_ddict.c +4 -4
  69. data/contrib/zstd/lib/decompress/zstd_ddict.h +1 -1
  70. data/contrib/zstd/lib/decompress/zstd_decompress.c +583 -106
  71. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1054 -379
  72. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +14 -3
  73. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +56 -6
  74. data/contrib/zstd/lib/deprecated/zbuff.h +1 -1
  75. data/contrib/zstd/lib/deprecated/zbuff_common.c +1 -1
  76. data/contrib/zstd/lib/deprecated/zbuff_compress.c +24 -4
  77. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +3 -1
  78. data/contrib/zstd/lib/dictBuilder/cover.c +60 -44
  79. data/contrib/zstd/lib/dictBuilder/cover.h +6 -11
  80. data/contrib/zstd/lib/dictBuilder/divsufsort.c +1 -1
  81. data/contrib/zstd/lib/dictBuilder/fastcover.c +26 -18
  82. data/contrib/zstd/lib/dictBuilder/zdict.c +100 -101
  83. data/contrib/zstd/lib/legacy/zstd_legacy.h +38 -1
  84. data/contrib/zstd/lib/legacy/zstd_v01.c +18 -53
  85. data/contrib/zstd/lib/legacy/zstd_v01.h +1 -1
  86. data/contrib/zstd/lib/legacy/zstd_v02.c +28 -85
  87. data/contrib/zstd/lib/legacy/zstd_v02.h +1 -1
  88. data/contrib/zstd/lib/legacy/zstd_v03.c +29 -88
  89. data/contrib/zstd/lib/legacy/zstd_v03.h +1 -1
  90. data/contrib/zstd/lib/legacy/zstd_v04.c +27 -80
  91. data/contrib/zstd/lib/legacy/zstd_v04.h +1 -1
  92. data/contrib/zstd/lib/legacy/zstd_v05.c +36 -85
  93. data/contrib/zstd/lib/legacy/zstd_v05.h +1 -1
  94. data/contrib/zstd/lib/legacy/zstd_v06.c +44 -96
  95. data/contrib/zstd/lib/legacy/zstd_v06.h +1 -1
  96. data/contrib/zstd/lib/legacy/zstd_v07.c +37 -92
  97. data/contrib/zstd/lib/legacy/zstd_v07.h +1 -1
  98. data/contrib/zstd/lib/libzstd.mk +237 -0
  99. data/contrib/zstd/lib/libzstd.pc.in +4 -3
  100. data/contrib/zstd/lib/module.modulemap +35 -0
  101. data/contrib/zstd/lib/{dictBuilder/zdict.h → zdict.h} +202 -33
  102. data/contrib/zstd/lib/zstd.h +1030 -332
  103. data/contrib/zstd/lib/{common/zstd_errors.h → zstd_errors.h} +27 -8
  104. data/ext/extconf.rb +26 -7
  105. data/ext/extzstd.c +51 -24
  106. data/ext/extzstd.h +33 -6
  107. data/ext/extzstd_stream.c +74 -31
  108. data/ext/libzstd_conf.h +0 -1
  109. data/ext/zstd_decompress_asm.S +1 -0
  110. metadata +17 -7
  111. data/contrib/zstd/appveyor.yml +0 -292
  112. data/ext/depend +0 -2
data/contrib/zstd/lib/decompress/zstd_decompress_block.c

@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -20,12 +20,12 @@
  #include "../common/mem.h"         /* low level memory routines */
  #define FSE_STATIC_LINKING_ONLY
  #include "../common/fse.h"
- #define HUF_STATIC_LINKING_ONLY
  #include "../common/huf.h"
  #include "../common/zstd_internal.h"
  #include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
  #include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
  #include "zstd_decompress_block.h"
+ #include "../common/bits.h"  /* ZSTD_highbit32 */

  /*_*******************************************************
  *  Macros
@@ -51,6 +51,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }
  *   Block decoding
  ***************************************************************/

+ static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx)
+ {
+     size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX;
+     assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX);
+     return blockSizeMax;
+ }
+
  /*! ZSTD_getcBlockSize() :
  *   Provides the size of compressed block from block header `src` */
  size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
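For context: the new ZSTD_blockSizeMax() caps literal and block sizes at the frame's declared limit rather than always at ZSTD_BLOCKSIZE_MAX. A minimal sketch of the underlying format rule (Block_Maximum_Size = min(Window_Size, 128 KB) per the zstd format / RFC 8878); the helper name below is illustrative, not zstd API:

    #define BLOCK_SIZE_LIMIT (128 * 1024)   /* 128 KB, i.e. ZSTD_BLOCKSIZE_MAX */

    /* Illustrative only: the per-frame block-size cap derives from the
     * frame header's window size. */
    static size_t frameBlockSizeMax(size_t windowSize)
    {
        return windowSize < BLOCK_SIZE_LIMIT ? windowSize : BLOCK_SIZE_LIMIT;
    }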
@@ -69,36 +76,90 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
      }
  }

+ /* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
+ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
+     const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
+ {
+     size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
+     assert(litSize <= blockSizeMax);
+     assert(dctx->isFrameDecompression || streaming == not_streaming);
+     assert(expectedWriteSize <= blockSizeMax);
+     if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) {
+         /* If we aren't streaming, we can just put the literals after the output
+          * of the current block. We don't need to worry about overwriting the
+          * extDict of our window, because it doesn't exist.
+          * So if we have space after the end of the block, just put it there.
+          */
+         dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH;
+         dctx->litBufferEnd = dctx->litBuffer + litSize;
+         dctx->litBufferLocation = ZSTD_in_dst;
+     } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) {
+         /* Literals fit entirely within the extra buffer, put them there to avoid
+          * having to split the literals.
+          */
+         dctx->litBuffer = dctx->litExtraBuffer;
+         dctx->litBufferEnd = dctx->litBuffer + litSize;
+         dctx->litBufferLocation = ZSTD_not_in_dst;
+     } else {
+         assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE);
+         /* Literals must be split between the output block and the extra lit
+          * buffer. We fill the extra lit buffer with the tail of the literals,
+          * and put the rest of the literals at the end of the block, with
+          * WILDCOPY_OVERLENGTH of buffer room to allow for overreads.
+          * This MUST not write more than our maxBlockSize beyond dst, because in
+          * streaming mode, that could overwrite part of our extDict window.
+          */
+         if (splitImmediately) {
+             /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
+             dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+             dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
+         } else {
+             /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
+             dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
+             dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
+         }
+         dctx->litBufferLocation = ZSTD_split;
+         assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize);
+     }
+ }

- /* Hidden declaration for fullbench */
- size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
-                                 const void* src, size_t srcSize);
  /*! ZSTD_decodeLiteralsBlock() :
+  * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
+  * in the dstBuffer.  If there is room to do so, it will be stored in full in the excess dst space after where the current
+  * block will be output.  Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
+  * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
+  *
  * @return : nb of bytes read from src (< srcSize )
  *  note : symbol not declared but exposed for fullbench */
- size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
-                                 const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+ static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+                                 const void* src, size_t srcSize,   /* note : srcSize < BLOCKSIZE */
+                                 void* dst, size_t dstCapacity, const streaming_operation streaming)
  {
      DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
      RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");

      {   const BYTE* const istart = (const BYTE*) src;
          symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
+         size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);

          switch(litEncType)
          {
          case set_repeat:
              DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
              RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
-             /* fall-through */
+             ZSTD_FALLTHROUGH;

          case set_compressed:
-             RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
+             RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
              {   size_t lhSize, litSize, litCSize;
                  U32 singleStream=0;
                  U32 const lhlCode = (istart[0] >> 2) & 3;
                  U32 const lhc = MEM_readLE32(istart);
                  size_t hufSuccess;
+                 size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
+                 int const flags = 0
+                     | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
+                     | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
                  switch(lhlCode)
                  {
                  case 0: case 1: default:   /* note : default is impossible, since lhlCode into [0..3] */
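The placement policy implemented by ZSTD_allocateLiteralsBuffer() above reduces to three outcomes. A standalone sketch under assumed names (the function and its parameters are illustrative, not zstd internals):

    typedef enum { lit_in_dst, lit_in_extra_buffer, lit_split } lit_location;

    static lit_location pickLiteralsLocation(int streaming, size_t dstCapacity,
                                             size_t blockSizeMax, size_t litSize,
                                             size_t extraSize, size_t overlength)
    {
        /* Ample room after the block's own output, and no extDict window to
         * protect: keep the whole literals run inside dst. */
        if (!streaming && dstCapacity > blockSizeMax + overlength + litSize + overlength)
            return lit_in_dst;
        /* Small enough to live entirely in the fixed side buffer. */
        if (litSize <= extraSize)
            return lit_in_extra_buffer;
        /* Otherwise: head at the end of dst, tail in the side buffer. */
        return lit_split;
    }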
@@ -121,8 +182,15 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                      litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
                      break;
                  }
-                 RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+                 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+                 RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
+                 if (!singleStream)
+                     RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
+                         "Not enough literals (%zu) for the 4-streams mode (min %u)",
+                         litSize, MIN_LITERALS_FOR_4_STREAMS);
                  RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
+                 RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+                 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);

                  /* prefetch huffman table if cold */
                  if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
@@ -131,13 +199,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,

                  if (litEncType==set_repeat) {
                      if (singleStream) {
-                         hufSuccess = HUF_decompress1X_usingDTable_bmi2(
+                         hufSuccess = HUF_decompress1X_usingDTable(
                              dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                             dctx->HUFptr, dctx->bmi2);
+                             dctx->HUFptr, flags);
                      } else {
-                         hufSuccess = HUF_decompress4X_usingDTable_bmi2(
+                         assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
+                         hufSuccess = HUF_decompress4X_usingDTable(
                              dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                             dctx->HUFptr, dctx->bmi2);
+                             dctx->HUFptr, flags);
                      }
                  } else {
                      if (singleStream) {
@@ -145,20 +214,29 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                          hufSuccess = HUF_decompress1X_DCtx_wksp(
                              dctx->entropy.hufTable, dctx->litBuffer, litSize,
                              istart+lhSize, litCSize, dctx->workspace,
-                             sizeof(dctx->workspace));
+                             sizeof(dctx->workspace), flags);
  #else
-                         hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
+                         hufSuccess = HUF_decompress1X1_DCtx_wksp(
                              dctx->entropy.hufTable, dctx->litBuffer, litSize,
                              istart+lhSize, litCSize, dctx->workspace,
-                             sizeof(dctx->workspace), dctx->bmi2);
+                             sizeof(dctx->workspace), flags);
  #endif
                      } else {
-                         hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
+                         hufSuccess = HUF_decompress4X_hufOnly_wksp(
                              dctx->entropy.hufTable, dctx->litBuffer, litSize,
                              istart+lhSize, litCSize, dctx->workspace,
-                             sizeof(dctx->workspace), dctx->bmi2);
+                             sizeof(dctx->workspace), flags);
                      }
                  }
+                 if (dctx->litBufferLocation == ZSTD_split)
+                 {
+                     assert(litSize > ZSTD_LITBUFFEREXTRASIZE);
+                     ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
+                     ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
+                     dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+                     dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
+                     assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax);
+                 }

                  RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");

@@ -166,13 +244,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                  dctx->litSize = litSize;
                  dctx->litEntropy = 1;
                  if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
-                 ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
                  return litCSize + lhSize;
              }

          case set_basic:
              {   size_t litSize, lhSize;
                  U32 const lhlCode = ((istart[0]) >> 2) & 3;
+                 size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
                  switch(lhlCode)
                  {
                  case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
@@ -185,27 +263,42 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                      break;
                  case 3:
                      lhSize = 3;
+                     RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
                      litSize = MEM_readLE24(istart) >> 4;
                      break;
                  }

+                 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+                 RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
+                 RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+                 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                  if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
                      RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
-                     ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize);
+                     if (dctx->litBufferLocation == ZSTD_split)
+                     {
+                         ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
+                         ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
+                     }
+                     else
+                     {
+                         ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
+                     }
                      dctx->litPtr = dctx->litBuffer;
                      dctx->litSize = litSize;
-                     ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
                      return lhSize+litSize;
                  }
                  /* direct reference into compressed stream */
                  dctx->litPtr = istart+lhSize;
                  dctx->litSize = litSize;
+                 dctx->litBufferEnd = dctx->litPtr + litSize;
+                 dctx->litBufferLocation = ZSTD_not_in_dst;
                  return lhSize+litSize;
              }

          case set_rle:
              {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
                  size_t litSize, lhSize;
+                 size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
                  switch(lhlCode)
                  {
                  case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
@@ -214,16 +307,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                      break;
                  case 1:
                      lhSize = 2;
+                     RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
                      litSize = MEM_readLE16(istart) >> 4;
                      break;
                  case 3:
                      lhSize = 3;
+                     RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
                      litSize = MEM_readLE24(istart) >> 4;
-                     RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
                      break;
                  }
-                 RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
-                 ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+                 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+                 RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
+                 RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+                 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
+                 if (dctx->litBufferLocation == ZSTD_split)
+                 {
+                     ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
+                     ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
+                 }
+                 else
+                 {
+                     ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
+                 }
                  dctx->litPtr = dctx->litBuffer;
                  dctx->litSize = litSize;
                  return lhSize+1;
@@ -234,9 +339,21 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
      }
  }

+ /* Hidden declaration for fullbench */
+ size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
+                                 const void* src, size_t srcSize,
+                                 void* dst, size_t dstCapacity);
+ size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
+                                 const void* src, size_t srcSize,
+                                 void* dst, size_t dstCapacity)
+ {
+     dctx->isFrameDecompression = 0;
+     return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming);
+ }
+
  /* Default FSE distribution tables.
   * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
-  * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
+  * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
   * They were generated programmatically with following method :
   * - start from default distributions, present in /lib/common/zstd_internal.h
   * - generate tables normally, using ZSTD_buildFSETable()
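The comment above describes how the default tables were produced. A hedged sketch of the dump step, assuming it is compiled inside lib/decompress with access to the private definitions used in this file (ZSTD_seqSymbol, ZSTD_buildFSETable, and the LL_* default-distribution arrays); the workspace size is an assumption:

    #include <stdio.h>

    /* Illustrative only: rebuild the literal-length default table and print
     * its cells in the initializer format used by the *_defaultDTable arrays.
     * Note dt[0] is the table header, punned into a ZSTD_seqSymbol cell. */
    static void dumpDefaultLLTable(void)
    {
        ZSTD_seqSymbol dt[(1 << LL_DEFAULTNORMLOG) + 1];
        U32 wksp[160];   /* assumed large enough for the build workspace */
        size_t u;
        ZSTD_buildFSETable(dt, LL_defaultNorm, MaxLL,
                           LL_base, LL_bits, LL_DEFAULTNORMLOG,
                           wksp, sizeof(wksp), /* bmi2 */ 0);
        for (u = 0; u < (1 << LL_DEFAULTNORMLOG) + 1; u++)
            printf("{ %u, %u, %u, %u },\n",
                   dt[u].nextState, dt[u].nbAdditionalBits,
                   dt[u].nbBits, dt[u].baseValue);
    }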
@@ -343,7 +460,7 @@ static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
  };   /* ML_defaultDTable */


- static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
+ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
  {
      void* ptr = dt;
      ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
@@ -355,7 +472,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
      cell->nbBits = 0;
      cell->nextState = 0;
      assert(nbAddBits < 255);
-     cell->nbAdditionalBits = (BYTE)nbAddBits;
+     cell->nbAdditionalBits = nbAddBits;
      cell->baseValue = baseValue;
  }

@@ -367,7 +484,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
  FORCE_INLINE_TEMPLATE
  void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
              const short* normalizedCounter, unsigned maxSymbolValue,
-             const U32* baseValue, const U32* nbAdditionalBits,
+             const U32* baseValue, const U8* nbAdditionalBits,
              unsigned tableLog, void* wksp, size_t wkspSize)
  {
      ZSTD_seqSymbol* const tableDecode = dt+1;
@@ -430,14 +547,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
              for (i = 8; i < n; i += 8) {
                  MEM_write64(spread + pos + i, sv);
              }
-             pos += n;
+             assert(n>=0);
+             pos += (size_t)n;
          }
      }
      /* Now we spread those positions across the table.
-      * The benefit of doing it in two stages is that we avoid the the
+      * The benefit of doing it in two stages is that we avoid the
       * variable size inner loop, which caused lots of branch misses.
       * Now we can run through all the positions without any branch misses.
-      * We unroll the loop twice, since that is what emperically worked best.
+      * We unroll the loop twice, since that is what empirically worked best.
       */
      {
          size_t position = 0;
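The two-stage spread described in the comment above can be seen in isolation. A minimal sketch with toy sizes — not zstd's actual table-building code; the unrolling and the low-probability/highThreshold handling are omitted, and counts are assumed to sum to the table size:

    #include <assert.h>
    #include <string.h>

    static void spreadSymbols(unsigned char* table, const unsigned* count,
                              unsigned maxSymbol)
    {
        size_t const tableSize = 64;               /* toy size, power of two */
        size_t const tableMask = tableSize - 1;
        size_t const step = (tableSize >> 1) + (tableSize >> 3) + 3;  /* odd => coprime */
        unsigned char spread[64];
        size_t pos = 0;
        unsigned s;
        /* Stage 1: lay out each symbol as one contiguous run; the inner loop
         * length no longer depends on per-position data. */
        for (s = 0; s <= maxSymbol; s++) {
            memset(spread + pos, (int)s, count[s]);
            pos += count[s];
        }
        assert(pos == tableSize);
        /* Stage 2: scatter the runs across the table with a fixed stride;
         * no data-dependent branches, so no branch misses. */
        {   size_t position = 0, u;
            for (u = 0; u < tableSize; u++) {
                table[position] = spread[u];
                position = (position + step) & tableMask;
            }
            assert(position == 0);  /* the stride visits every cell exactly once */
        }
    }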
@@ -464,7 +582,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
              for (i=0; i<n; i++) {
                  tableDecode[position].baseValue = s;
                  position = (position + step) & tableMask;
-                 while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+                 while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask;   /* lowprob area */
          }   }
          assert(position == 0);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
      }
@@ -475,10 +593,10 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
          for (u=0; u<tableSize; u++) {
              U32 const symbol = tableDecode[u].baseValue;
              U32 const nextState = symbolNext[symbol]++;
-             tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+             tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
              tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
              assert(nbAdditionalBits[symbol] < 255);
-             tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
+             tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
              tableDecode[u].baseValue = baseValue[symbol];
          }
  }
@@ -487,7 +605,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
  /* Avoids the FORCE_INLINE of the _body() function. */
  static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
              const short* normalizedCounter, unsigned maxSymbolValue,
-             const U32* baseValue, const U32* nbAdditionalBits,
+             const U32* baseValue, const U8* nbAdditionalBits,
              unsigned tableLog, void* wksp, size_t wkspSize)
  {
      ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
@@ -495,9 +613,9 @@ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
  }

  #if DYNAMIC_BMI2
- TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
+ BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
              const short* normalizedCounter, unsigned maxSymbolValue,
-             const U32* baseValue, const U32* nbAdditionalBits,
+             const U32* baseValue, const U8* nbAdditionalBits,
              unsigned tableLog, void* wksp, size_t wkspSize)
  {
      ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
@@ -507,7 +625,7 @@ TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol

  void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
              const short* normalizedCounter, unsigned maxSymbolValue,
-             const U32* baseValue, const U32* nbAdditionalBits,
+             const U32* baseValue, const U8* nbAdditionalBits,
              unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
  {
  #if DYNAMIC_BMI2
@@ -529,7 +647,7 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
  static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
                                   symbolEncodingType_e type, unsigned max, U32 maxLog,
                                   const void* src, size_t srcSize,
-                                  const U32* baseValue, const U32* nbAdditionalBits,
+                                  const U32* baseValue, const U8* nbAdditionalBits,
                                   const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
                                   int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
                                   int bmi2)
@@ -541,7 +659,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
          RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
          {   U32 const symbol = *(const BYTE*)src;
              U32 const baseline = baseValue[symbol];
-             U32 const nbBits = nbAdditionalBits[symbol];
+             U8 const nbBits = nbAdditionalBits[symbol];
              ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
          }
          *DTablePtr = DTableSpace;
@@ -577,7 +695,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
  size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                               const void* src, size_t srcSize)
  {
-     const BYTE* const istart = (const BYTE* const)src;
+     const BYTE* const istart = (const BYTE*)src;
      const BYTE* const iend = istart + srcSize;
      const BYTE* ip = istart;
      int nbSeq;
@@ -588,11 +706,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,

      /* SeqHead */
      nbSeq = *ip++;
-     if (!nbSeq) {
-         *nbSeqPtr=0;
-         RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
-         return 1;
-     }
      if (nbSeq > 0x7F) {
          if (nbSeq == 0xFF) {
              RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
605
718
  }
606
719
  *nbSeqPtr = nbSeq;
607
720
 
721
+ if (nbSeq == 0) {
722
+ /* No sequence : section ends immediately */
723
+ RETURN_ERROR_IF(ip != iend, corruption_detected,
724
+ "extraneous data present in the Sequences section");
725
+ return (size_t)(ip - istart);
726
+ }
727
+
608
728
  /* FSE table descriptors */
609
729
  RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
730
+ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */
610
731
  { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
611
732
  symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
612
733
  symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
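For reference, the sequence-count header parsed above (whose nbSeq == 0 case now ends the section immediately) is encoded in one to three bytes. A standalone sketch of the rule from the zstd format (RFC 8878), with truncation reduced to a 0 return:

    /* Decode Number_of_Sequences.
     * Returns the header size in bytes, or 0 if srcSize is too small. */
    static size_t decodeNbSeq(const unsigned char* ip, size_t srcSize, int* nbSeq)
    {
        if (srcSize < 1) return 0;
        if (ip[0] < 128) {                      /* 1 byte: 0..127 */
            *nbSeq = ip[0];
            return 1;
        }
        if (ip[0] < 255) {                      /* 2 bytes: 128..0x7EFF */
            if (srcSize < 2) return 0;
            *nbSeq = ((ip[0] - 128) << 8) + ip[1];
            return 2;
        }
        if (srcSize < 3) return 0;              /* 3 bytes: 0x7F00..0x1FEFF */
        *nbSeq = ip[1] + (ip[2] << 8) + 0x7F00; /* 0x7F00 == LONGNBSEQ */
        return 3;
    }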
@@ -620,7 +741,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                        LL_defaultDTable, dctx->fseEntropy,
                                                        dctx->ddictIsCold, nbSeq,
                                                        dctx->workspace, sizeof(dctx->workspace),
-                                                      dctx->bmi2);
+                                                      ZSTD_DCtx_get_bmi2(dctx));
              RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
              ip += llhSize;
          }
@@ -632,7 +753,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                        OF_defaultDTable, dctx->fseEntropy,
                                                        dctx->ddictIsCold, nbSeq,
                                                        dctx->workspace, sizeof(dctx->workspace),
-                                                      dctx->bmi2);
+                                                      ZSTD_DCtx_get_bmi2(dctx));
              RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
              ip += ofhSize;
          }
@@ -644,7 +765,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                        ML_defaultDTable, dctx->fseEntropy,
                                                        dctx->ddictIsCold, nbSeq,
                                                        dctx->workspace, sizeof(dctx->workspace),
-                                                      dctx->bmi2);
+                                                      ZSTD_DCtx_get_bmi2(dctx));
              RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
              ip += mlhSize;
          }
@@ -658,7 +779,6 @@ typedef struct {
      size_t litLength;
      size_t matchLength;
      size_t offset;
-     const BYTE* match;
  } seq_t;

  typedef struct {
@@ -672,9 +792,6 @@ typedef struct {
      ZSTD_fseState stateOffb;
      ZSTD_fseState stateML;
      size_t prevOffset[ZSTD_REP_NUM];
-     const BYTE* prefixStart;
-     const BYTE* dictEnd;
-     size_t pos;
  } seqState_t;

  /*! ZSTD_overlapCopy8() :
@@ -717,7 +834,7 @@ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
   *           - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
   *             The src buffer must be before the dst buffer.
   */
- static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
+ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
      ptrdiff_t const diff = op - ip;
      BYTE* const oend = op + length;

@@ -733,6 +850,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
          /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
          assert(length >= 8);
          ZSTD_overlapCopy8(&op, &ip, diff);
+         length -= 8;
          assert(op - ip >= 8);
          assert(op <= oend);
      }
@@ -747,8 +865,31 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
          assert(oend > oend_w);
          ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
          ip += oend_w - op;
-         op = oend_w;
+         op += oend_w - op;
+     }
+     /* Handle the leftovers. */
+     while (op < oend) *op++ = *ip++;
+ }
+
+ /* ZSTD_safecopyDstBeforeSrc():
+  * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
+  * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
+ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) {
+     ptrdiff_t const diff = op - ip;
+     BYTE* const oend = op + length;
+
+     if (length < 8 || diff > -8) {
+         /* Handle short lengths, close overlaps, and dst not before src. */
+         while (op < oend) *op++ = *ip++;
+         return;
+     }
+
+     if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
+         ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
+         ip += oend - WILDCOPY_OVERLENGTH - op;
+         op += oend - WILDCOPY_OVERLENGTH - op;
      }
+
      /* Handle the leftovers. */
      while (op < oend) *op++ = *ip++;
  }
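Both safecopy variants above fall back to a byte loop for close overlaps. That loop is not an inefficiency to optimize away: when dst sits just after src, a forward byte copy must propagate bytes it has just written, which is how offset-1 matches decompress into a run. A self-contained demonstration:

    #include <stdio.h>

    int main(void)
    {
        char buf[9] = "abbbbbbb";
        char* op = buf + 1;                 /* dst starts 1 byte after src */
        const char* ip = buf;
        char* const oend = buf + 8;
        while (op < oend) *op++ = *ip++;    /* reads bytes it just wrote */
        printf("%s\n", buf);                /* prints "aaaaaaaa" */
        return 0;
    }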
@@ -762,10 +903,11 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
   * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
   */
  FORCE_NOINLINE
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  size_t ZSTD_execSequenceEnd(BYTE* op,
-                     BYTE* const oend, seq_t sequence,
-                     const BYTE** litPtr, const BYTE* const litLimit,
-                     const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+                             BYTE* const oend, seq_t sequence,
+                             const BYTE** litPtr, const BYTE* const litLimit,
+                             const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
  {
      BYTE* const oLitEnd = op + sequence.litLength;
      size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -788,27 +930,78 @@ size_t ZSTD_execSequenceEnd(BYTE* op,
      if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
          /* offset beyond prefix */
          RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
-         match = dictEnd - (prefixStart-match);
+         match = dictEnd - (prefixStart - match);
          if (match + sequence.matchLength <= dictEnd) {
              ZSTD_memmove(oLitEnd, match, sequence.matchLength);
              return sequenceLength;
          }
          /* span extDict & currentPrefixSegment */
          {   size_t const length1 = dictEnd - match;
-             ZSTD_memmove(oLitEnd, match, length1);
-             op = oLitEnd + length1;
-             sequence.matchLength -= length1;
-             match = prefixStart;
-     }   }
+             ZSTD_memmove(oLitEnd, match, length1);
+             op = oLitEnd + length1;
+             sequence.matchLength -= length1;
+             match = prefixStart;
+         }
+     }
+     ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
+     return sequenceLength;
+ }
+
+ /* ZSTD_execSequenceEndSplitLitBuffer():
+  * This version is intended to be used during instances where the litBuffer is still split.  It is kept separate to avoid performance impact for the good case.
+  */
+ FORCE_NOINLINE
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
+                             BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
+                             const BYTE** litPtr, const BYTE* const litLimit,
+                             const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+ {
+     BYTE* const oLitEnd = op + sequence.litLength;
+     size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+     const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+     const BYTE* match = oLitEnd - sequence.offset;
+
+
+     /* bounds checks : careful of address space overflow in 32-bit mode */
+     RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
+     RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
+     assert(op < op + sequenceLength);
+     assert(oLitEnd < op + sequenceLength);
+
+     /* copy literals */
+     RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
+     ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
+     op = oLitEnd;
+     *litPtr = iLitEnd;
+
+     /* copy Match */
+     if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+         /* offset beyond prefix */
+         RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
+         match = dictEnd - (prefixStart - match);
+         if (match + sequence.matchLength <= dictEnd) {
+             ZSTD_memmove(oLitEnd, match, sequence.matchLength);
+             return sequenceLength;
+         }
+         /* span extDict & currentPrefixSegment */
+         {   size_t const length1 = dictEnd - match;
+             ZSTD_memmove(oLitEnd, match, length1);
+             op = oLitEnd + length1;
+             sequence.matchLength -= length1;
+             match = prefixStart;
+         }
+     }
      ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
      return sequenceLength;
  }

  HINT_INLINE
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  size_t ZSTD_execSequence(BYTE* op,
-                     BYTE* const oend, seq_t sequence,
-                     const BYTE** litPtr, const BYTE* const litLimit,
-                     const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+                          BYTE* const oend, seq_t sequence,
+                          const BYTE** litPtr, const BYTE* const litLimit,
+                          const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
  {
      BYTE* const oLitEnd = op + sequence.litLength;
      size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -819,6 +1012,104 @@ size_t ZSTD_execSequence(BYTE* op,

      assert(op != NULL /* Precondition */);
      assert(oend_w < oend /* No underflow */);
+
+ #if defined(__aarch64__)
+     /* prefetch sequence starting from match that will be used for copy later */
+     PREFETCH_L1(match);
+ #endif
+     /* Handle edge cases in a slow path:
+      * - Read beyond end of literals
+      * - Match end is within WILDCOPY_OVERLIMIT of oend
+      * - 32-bit mode and the match length overflows
+      */
+     if (UNLIKELY(
+             iLitEnd > litLimit ||
+             oMatchEnd > oend_w ||
+             (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
+         return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
+
+     /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
+     assert(op <= oLitEnd /* No overflow */);
+     assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
+     assert(oMatchEnd <= oend /* No underflow */);
+     assert(iLitEnd <= litLimit /* Literal length is in bounds */);
+     assert(oLitEnd <= oend_w /* Can wildcopy literals */);
+     assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
+
+     /* Copy Literals:
+      * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
+      * We likely don't need the full 32-byte wildcopy.
+      */
+     assert(WILDCOPY_OVERLENGTH >= 16);
+     ZSTD_copy16(op, (*litPtr));
+     if (UNLIKELY(sequence.litLength > 16)) {
+         ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
+     }
+     op = oLitEnd;
+     *litPtr = iLitEnd;   /* update for next sequence */
+
+     /* Copy Match */
+     if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+         /* offset beyond prefix -> go into extDict */
+         RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
+         match = dictEnd + (match - prefixStart);
+         if (match + sequence.matchLength <= dictEnd) {
+             ZSTD_memmove(oLitEnd, match, sequence.matchLength);
+             return sequenceLength;
+         }
+         /* span extDict & currentPrefixSegment */
+         {   size_t const length1 = dictEnd - match;
+             ZSTD_memmove(oLitEnd, match, length1);
+             op = oLitEnd + length1;
+             sequence.matchLength -= length1;
+             match = prefixStart;
+         }
+     }
+     /* Match within prefix of 1 or more bytes */
+     assert(op <= oMatchEnd);
+     assert(oMatchEnd <= oend_w);
+     assert(match >= prefixStart);
+     assert(sequence.matchLength >= 1);
+
+     /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
+      * without overlap checking.
+      */
+     if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
+         /* We bet on a full wildcopy for matches, since we expect matches to be
+          * longer than literals (in general). In silesia, ~10% of matches are longer
+          * than 16 bytes.
+          */
+         ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
+         return sequenceLength;
+     }
+     assert(sequence.offset < WILDCOPY_VECLEN);
+
+     /* Copy 8 bytes and spread the offset to be >= 8. */
+     ZSTD_overlapCopy8(&op, &match, sequence.offset);
+
+     /* If the match length is > 8 bytes, then continue with the wildcopy. */
+     if (sequence.matchLength > 8) {
+         assert(op < oMatchEnd);
+         ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
+     }
+     return sequenceLength;
+ }
+
+ HINT_INLINE
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
+                             BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
+                             const BYTE** litPtr, const BYTE* const litLimit,
+                             const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+ {
+     BYTE* const oLitEnd = op + sequence.litLength;
+     size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+     BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+     const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+     const BYTE* match = oLitEnd - sequence.offset;
+
+     assert(op != NULL /* Precondition */);
+     assert(oend_w < oend /* No underflow */);
      /* Handle edge cases in a slow path:
       * - Read beyond end of literals
       * - Match end is within WILDCOPY_OVERLIMIT of oend
@@ -828,7 +1119,7 @@ size_t ZSTD_execSequence(BYTE* op,
          iLitEnd > litLimit ||
          oMatchEnd > oend_w ||
          (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
-         return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
+         return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);

      /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
      assert(op <= oLitEnd /* No overflow */);
@@ -896,6 +1187,7 @@ size_t ZSTD_execSequence(BYTE* op,
      return sequenceLength;
  }

+
  static void
  ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
  {
@@ -909,24 +1201,14 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS
  }

  FORCE_INLINE_TEMPLATE void
- ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
- {
-     ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state];
-     U32 const nbBits = DInfo.nbBits;
-     size_t const lowBits = BIT_readBits(bitD, nbBits);
-     DStatePtr->state = DInfo.nextState + lowBits;
- }
-
- FORCE_INLINE_TEMPLATE void
- ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo)
+ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)
  {
-     U32 const nbBits = DInfo.nbBits;
      size_t const lowBits = BIT_readBits(bitD, nbBits);
-     DStatePtr->state = DInfo.nextState + lowBits;
+     DStatePtr->state = nextState + lowBits;
  }

  /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
-  * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
+  * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
   * bits before reloading. This value is the maximum number of bytes we read
   * after reloading when we are decoding long offsets.
   */
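Worked numbers behind the comment above, using zstd's 32-bit constants (assumed here: ZSTD_WINDOWLOG_MAX_32 = 30, STREAM_ACCUMULATOR_MIN_32 = 25):

    /* Up to ZSTD_WINDOWLOG_MAX_32 - 1 = 29 offset bits may be needed, but at
     * most 25 can be consumed per refill, so after one reload at most
     * 30 - 25 = 5 extra bits remain: LONG_OFFSETS_MAX_EXTRA_BITS_32. */
    enum {
        kWindowLogMax32            = 30,
        kStreamAccumulatorMin32    = 25,
        kLongOffsetsMaxExtraBits32 =
            kWindowLogMax32 > kStreamAccumulatorMin32
                ? kWindowLogMax32 - kStreamAccumulatorMin32 : 0   /* == 5 */
    };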
@@ -936,123 +1218,136 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD
936
1218
  : 0)
937
1219
 
938
1220
  typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
939
- typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e;
940
1221
 
1222
+ /**
1223
+ * ZSTD_decodeSequence():
1224
+ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets
1225
+ * only used in 32-bit mode
1226
+ * @return : Sequence (litL + matchL + offset)
1227
+ */
941
1228
  FORCE_INLINE_TEMPLATE seq_t
942
- ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch)
1229
+ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq)
943
1230
  {
944
1231
  seq_t seq;
945
- ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
946
- ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state];
947
- ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state];
948
- U32 const llBase = llDInfo.baseValue;
949
- U32 const mlBase = mlDInfo.baseValue;
950
- U32 const ofBase = ofDInfo.baseValue;
951
- BYTE const llBits = llDInfo.nbAdditionalBits;
952
- BYTE const mlBits = mlDInfo.nbAdditionalBits;
953
- BYTE const ofBits = ofDInfo.nbAdditionalBits;
954
- BYTE const totalBits = llBits+mlBits+ofBits;
955
-
956
- /* sequence */
957
- { size_t offset;
958
- if (ofBits > 1) {
959
- ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
960
- ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
961
- assert(ofBits <= MaxOff);
962
- if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
963
- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
964
- offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
965
- BIT_reloadDStream(&seqState->DStream);
966
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
967
- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
968
- } else {
969
- offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
970
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
971
- }
972
- seqState->prevOffset[2] = seqState->prevOffset[1];
973
- seqState->prevOffset[1] = seqState->prevOffset[0];
974
- seqState->prevOffset[0] = offset;
975
- } else {
976
- U32 const ll0 = (llBase == 0);
977
- if (LIKELY((ofBits == 0))) {
978
- if (LIKELY(!ll0))
979
- offset = seqState->prevOffset[0];
980
- else {
981
- offset = seqState->prevOffset[1];
982
- seqState->prevOffset[1] = seqState->prevOffset[0];
983
- seqState->prevOffset[0] = offset;
1232
+ /*
1233
+ * ZSTD_seqSymbol is a 64 bits wide structure.
1234
+ * It can be loaded in one operation
1235
+ * and its fields extracted by simply shifting or bit-extracting on aarch64.
1236
+ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
1237
+ * operations that cause performance drop. This can be avoided by using this
1238
+ * ZSTD_memcpy hack.
1239
+ */
1240
+ #if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
1241
+ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
1242
+ ZSTD_seqSymbol* const llDInfo = &llDInfoS;
1243
+ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
1244
+ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
1245
+ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
1246
+ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
1247
+ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
1248
+ #else
1249
+ const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
1250
+ const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
1251
+ const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
1252
+ #endif
1253
+ seq.matchLength = mlDInfo->baseValue;
1254
+ seq.litLength = llDInfo->baseValue;
1255
+ { U32 const ofBase = ofDInfo->baseValue;
1256
+ BYTE const llBits = llDInfo->nbAdditionalBits;
1257
+ BYTE const mlBits = mlDInfo->nbAdditionalBits;
1258
+ BYTE const ofBits = ofDInfo->nbAdditionalBits;
1259
+ BYTE const totalBits = llBits+mlBits+ofBits;
1260
+
1261
+ U16 const llNext = llDInfo->nextState;
1262
+ U16 const mlNext = mlDInfo->nextState;
1263
+ U16 const ofNext = ofDInfo->nextState;
1264
+ U32 const llnbBits = llDInfo->nbBits;
1265
+ U32 const mlnbBits = mlDInfo->nbBits;
1266
+ U32 const ofnbBits = ofDInfo->nbBits;
1267
+
1268
+ assert(llBits <= MaxLLBits);
1269
+ assert(mlBits <= MaxMLBits);
1270
+ assert(ofBits <= MaxOff);
1271
+ /*
1272
+ * As gcc has better branch and block analyzers, sometimes it is only
1273
+ * valuable to mark likeliness for clang, it gives around 3-4% of
1274
+ * performance.
1275
+ */
1276
+
1277
+ /* sequence */
1278
+ { size_t offset;
1279
+ if (ofBits > 1) {
1280
+ ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
1281
+ ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
1282
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
1283
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
1284
+ if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
1285
+ /* Always read extra bits, this keeps the logic simple,
1286
+ * avoids branches, and avoids accidentally reading 0 bits.
1287
+ */
1288
+ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
1289
+ offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
1290
+ BIT_reloadDStream(&seqState->DStream);
1291
+ offset += BIT_readBitsFast(&seqState->DStream, extraBits);
1292
+ } else {
1293
+ offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
1294
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
984
1295
  }
1296
+ seqState->prevOffset[2] = seqState->prevOffset[1];
1297
+ seqState->prevOffset[1] = seqState->prevOffset[0];
1298
+ seqState->prevOffset[0] = offset;
985
1299
  } else {
986
- offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
987
- { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
988
- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
989
- if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
990
- seqState->prevOffset[1] = seqState->prevOffset[0];
991
- seqState->prevOffset[0] = offset = temp;
992
- } } }
993
- seq.offset = offset;
994
- }
1300
+ U32 const ll0 = (llDInfo->baseValue == 0);
1301
+ if (LIKELY((ofBits == 0))) {
1302
+ offset = seqState->prevOffset[ll0];
1303
+ seqState->prevOffset[1] = seqState->prevOffset[!ll0];
1304
+ seqState->prevOffset[0] = offset;
1305
+ } else {
1306
+ offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
1307
+ { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
1308
+ temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */
1309
+ if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
1310
+ seqState->prevOffset[1] = seqState->prevOffset[0];
1311
+ seqState->prevOffset[0] = offset = temp;
1312
+ } } }
1313
+ seq.offset = offset;
1314
+ }
995
1315
 
996
- seq.matchLength = mlBase;
997
- if (mlBits > 0)
998
- seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
1316
+ if (mlBits > 0)
1317
+ seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
999
1318
 
1000
- if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
1001
- BIT_reloadDStream(&seqState->DStream);
1002
- if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
1003
- BIT_reloadDStream(&seqState->DStream);
1004
- /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
1005
- ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
1319
+ if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
1320
+ BIT_reloadDStream(&seqState->DStream);
1321
+ if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
1322
+ BIT_reloadDStream(&seqState->DStream);
1323
+ /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
1324
+ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
 
- seq.litLength = llBase;
- if (llBits > 0)
- seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
+ if (llBits > 0)
+ seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
 
- if (MEM_32bits())
- BIT_reloadDStream(&seqState->DStream);
+ if (MEM_32bits())
+ BIT_reloadDStream(&seqState->DStream);
 
- DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+ DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
 
- if (prefetch == ZSTD_p_prefetch) {
- size_t const pos = seqState->pos + seq.litLength;
- const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
- seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
- * No consequence though : no memory access will occur, offset is only used for prefetching */
- seqState->pos = pos + seq.matchLength;
- }
-
- /* ANS state update
- * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
- * clang-9.2.0 does 7% worse with ZSTD_updateFseState().
- * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the
- * better option, so it is the default for other compilers. But, if you
- * measure that it is worse, please put up a pull request.
- */
- {
- #if defined(__GNUC__) && !defined(__clang__)
- const int kUseUpdateFseState = 1;
- #else
- const int kUseUpdateFseState = 0;
- #endif
- if (kUseUpdateFseState) {
- ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
- ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
+ if (!isLastSeq) {
+ /* don't update FSE state for last Sequence */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */
  if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
- ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
- } else {
- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */
- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */
+ BIT_reloadDStream(&seqState->DStream);
  }
  }
 
  return seq;
  }
 
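The ZSTD_STATIC_ASSERT carried over above encodes a bit-budget argument: everything read between two bitstream reloads (the extra bits plus one FSE state refresh per stream, bounded by LLFSELog+MLFSELog+OffFSELog) must fit inside the bits the 64-bit accumulator is guaranteed to hold after a reload. A minimal self-contained sketch of that arithmetic, assuming upstream's header values LLFSELog=9, MLFSELog=9, OffFSELog=8 and STREAM_ACCUMULATOR_MIN_64=57 (none of these constants are defined in this diff):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed upstream values (lib/common/fse.h, bitstream.h). */
enum { LL_LOG = 9, ML_LOG = 9, OFF_LOG = 8, ACC_MIN_64 = 57 };

typedef struct { uint64_t bits; unsigned consumed; } MiniDStream;

static unsigned readBits(MiniDStream* d, unsigned n)  /* n in [1,32] */
{
    unsigned const v = (unsigned)(d->bits >> (64 - n));
    d->bits <<= n;
    d->consumed += n;
    return v;
}

int main(void)
{
    MiniDStream d = { 0xA5A5A5A5A5A5A5A5ULL, 0 };
    /* Mirrors the static assert above: 16 + 9 + 9 + 8 = 42 < 57,
     * so all four reads below are covered by a single reload. */
    assert(16 + LL_LOG + ML_LOG + OFF_LOG < ACC_MIN_64);
    (void)readBits(&d, 16);       /* worst-case extra bits */
    (void)readBits(&d, LL_LOG);   /* literal-length state refresh */
    (void)readBits(&d, ML_LOG);   /* match-length state refresh */
    (void)readBits(&d, OFF_LOG);  /* offset state refresh */
    printf("consumed %u of %u guaranteed bits\n", d.consumed, (unsigned)ACC_MIN_64);
    return 0;
}

This is also why the new code can skip the FSE state update for the last sequence: once the final sequence is decoded there is nothing left to refresh, so the remaining bits need not cover another state update.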
- #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
- MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ #if DEBUGLEVEL >= 1
+ static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
  {
  size_t const windowSize = dctx->fParams.windowSize;
  /* No dictionary used. */
@@ -1066,30 +1361,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix
  /* Dictionary is active. */
  return 1;
  }
+ #endif
 
- MEM_STATIC void ZSTD_assertValidSequence(
+ static void ZSTD_assertValidSequence(
  ZSTD_DCtx const* dctx,
  BYTE const* op, BYTE const* oend,
  seq_t const seq,
  BYTE const* prefixStart, BYTE const* virtualStart)
  {
  #if DEBUGLEVEL >= 1
- size_t const windowSize = dctx->fParams.windowSize;
- size_t const sequenceSize = seq.litLength + seq.matchLength;
- BYTE const* const oLitEnd = op + seq.litLength;
- DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
- assert(op <= oend);
- assert((size_t)(oend - op) >= sequenceSize);
- assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
- if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
- size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
- /* Offset must be within the dictionary. */
- assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
- assert(seq.offset <= windowSize + dictSize);
- } else {
- /* Offset must be within our window. */
- assert(seq.offset <= windowSize);
+ if (dctx->isFrameDecompression) {
+ size_t const windowSize = dctx->fParams.windowSize;
+ size_t const sequenceSize = seq.litLength + seq.matchLength;
+ BYTE const* const oLitEnd = op + seq.litLength;
+ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+ assert(op <= oend);
+ assert((size_t)(oend - op) >= sequenceSize);
+ assert(sequenceSize <= ZSTD_blockSizeMax(dctx));
+ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
+ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
+ /* Offset must be within the dictionary. */
+ assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
+ assert(seq.offset <= windowSize + dictSize);
+ } else {
+ /* Offset must be within our window. */
+ assert(seq.offset <= windowSize);
+ }
  }
  #else
  (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
@@ -1098,31 +1396,30 @@ MEM_STATIC void ZSTD_assertValidSequence(
  #endif
 
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+
+
  FORCE_INLINE_TEMPLATE size_t
  DONT_VECTORIZE
- ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
+ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset,
- const int frame)
+ const ZSTD_longOffset_e isLongOffset)
  {
  const BYTE* ip = (const BYTE*)seqStart;
  const BYTE* const iend = ip + seqSize;
- BYTE* const ostart = (BYTE* const)dst;
- BYTE* const oend = ostart + maxDstSize;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
  BYTE* op = ostart;
  const BYTE* litPtr = dctx->litPtr;
- const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
  const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
- DEBUGLOG(5, "ZSTD_decompressSequences_body");
- (void)frame;
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq);
 
- /* Regen sequences */
+ /* Literals are split between internal buffer & output buffer */
  if (nbSeq) {
  seqState_t seqState;
- size_t error = 0;
  dctx->fseEntropy = 1;
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
  RETURN_ERROR_IF(
@@ -1138,134 +1435,331 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
  BIT_DStream_endOfBuffer < BIT_DStream_completed &&
  BIT_DStream_completed < BIT_DStream_overflow);
 
+ /* decompress without overrunning litPtr begins */
+ { seq_t sequence = {0,0,0}; /* some static analyzers believe that @sequence is not initialized (it necessarily is, since the for(;;) loop has at least one iteration) */
+ /* Align the decompression loop to 32 + 16 bytes.
+ *
+ * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
+ * speed swings based on the alignment of the decompression loop. This
+ * performance swing is caused by parts of the decompression loop falling
+ * out of the DSB. The entire decompression loop should fit in the DSB;
+ * when it can't, we get much worse performance. You can measure if you've
+ * hit the good case or the bad case with this perf command for some
+ * compressed file test.zst:
+ *
+ * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
+ * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
+ *
+ * If you see most cycles served out of the MITE you've hit the bad case.
+ * If you see most cycles served out of the DSB you've hit the good case.
+ * If it is pretty even then you may be in an okay case.
+ *
+ * This issue has been reproduced on the following CPUs:
+ * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
+ * Use Instruments->Counters to get DSB/MITE cycles.
+ * I never got performance swings, but I was able to
+ * go from the good case of mostly DSB to half of the
+ * cycles served from MITE.
+ * - Coffeelake: Intel i9-9900k
+ * - Coffeelake: Intel i7-9700k
+ *
+ * I haven't been able to reproduce the instability or DSB misses on any
+ * of the following CPUs:
+ * - Haswell
+ * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
+ * - Skylake
+ *
+ * Alignment is done for each of the three major decompression loops:
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
+ * - ZSTD_decompressSequences_body
+ * Alignment choices are made to minimize large swings on bad cases and influence on performance
+ * from changes external to this code, rather than to overoptimize on the current commit.
+ *
+ * If you are seeing performance instability this script can help test.
+ * It tests on 4 commits in zstd where I saw performance change.
+ *
+ * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
+ */
  #if defined(__GNUC__) && defined(__x86_64__)
- /* Align the decompression loop to 32 + 16 bytes.
- *
- * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
- * speed swings based on the alignment of the decompression loop. This
- * performance swing is caused by parts of the decompression loop falling
- * out of the DSB. The entire decompression loop should fit in the DSB,
- * when it can't we get much worse performance. You can measure if you've
- * hit the good case or the bad case with this perf command for some
- * compressed file test.zst:
- *
- * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
- * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
- *
- * If you see most cycles served out of the MITE you've hit the bad case.
- * If you see most cycles served out of the DSB you've hit the good case.
- * If it is pretty even then you may be in an okay case.
- *
- * I've been able to reproduce this issue on the following CPUs:
- * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
- * Use Instruments->Counters to get DSB/MITE cycles.
- * I never got performance swings, but I was able to
- * go from the good case of mostly DSB to half of the
- * cycles served from MITE.
- * - Coffeelake: Intel i9-9900k
- *
- * I haven't been able to reproduce the instability or DSB misses on any
- * of the following CPUS:
- * - Haswell
- * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
- * - Skylake
- *
- * If you are seeing performance stability this script can help test.
- * It tests on 4 commits in zstd where I saw performance change.
- *
- * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
- */
- __asm__(".p2align 5");
- __asm__("nop");
- __asm__(".p2align 4");
+ __asm__(".p2align 6");
+ # if __GNUC__ >= 7
+ /* good for gcc-7, gcc-9, and gcc-11 */
+ __asm__("nop");
+ __asm__(".p2align 5");
+ __asm__("nop");
+ __asm__(".p2align 4");
+ # if __GNUC__ == 8 || __GNUC__ == 10
+ /* good for gcc-8 and gcc-10 */
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # endif
+ # endif
  #endif
- for ( ; ; ) {
- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch);
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
+
+ /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
+ for ( ; nbSeq; nbSeq--) {
+ sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
+ if (litPtr + sequence.litLength > dctx->litBufferEnd) break;
+ { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
- assert(!ZSTD_isError(oneSeqSize));
- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ assert(!ZSTD_isError(oneSeqSize));
+ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
  #endif
- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
- BIT_reloadDStream(&(seqState.DStream));
- op += oneSeqSize;
- /* gcc and clang both don't like early returns in this loop.
- * Instead break and check for an error at the end of the loop.
- */
- if (UNLIKELY(ZSTD_isError(oneSeqSize))) {
- error = oneSeqSize;
- break;
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ op += oneSeqSize;
+ } }
+ DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)");
+
+ /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */
+ if (nbSeq > 0) {
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+ DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength);
+ if (leftoverLit) {
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+ sequence.litLength -= leftoverLit;
+ op += leftoverLit;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ op += oneSeqSize;
+ }
+ nbSeq--;
+ }
+ }
+
+ if (nbSeq > 0) {
+ /* there is remaining lit from extra buffer */
+
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __asm__(".p2align 6");
+ __asm__("nop");
+ # if __GNUC__ != 7
+ /* worse for gcc-7, better for gcc-8, gcc-9, gcc-10, and clang */
+ __asm__(".p2align 4");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # elif __GNUC__ >= 11
+ __asm__(".p2align 3");
+ # else
+ __asm__(".p2align 5");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # endif
+ #endif
+
+ for ( ; nbSeq ; nbSeq--) {
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ op += oneSeqSize;
  }
- if (UNLIKELY(!--nbSeq)) break;
  }
 
  /* check if reached exact end */
- DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
- if (ZSTD_isError(error)) return error;
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
  RETURN_ERROR_IF(nbSeq, corruption_detected, "");
- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
+ DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed);
+ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
  /* save reps for next block */
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
  }
 
  /* last literal segment */
- { size_t const lastLLSize = litEnd - litPtr;
+ if (dctx->litBufferLocation == ZSTD_split) {
+ /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
+ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr);
+ DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize);
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memmove(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ }
+ /* copy last literals from internal buffer */
+ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr);
+ DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize);
  RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
  if (op != NULL) {
  ZSTD_memcpy(op, litPtr, lastLLSize);
  op += lastLLSize;
+ } }
+
+ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart));
+ return (size_t)(op - ostart);
+ }
+
+ FORCE_INLINE_TEMPLATE size_t
+ DONT_VECTORIZE
+ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset)
+ {
+ const BYTE* ip = (const BYTE*)seqStart;
+ const BYTE* const iend = ip + seqSize;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer;
+ BYTE* op = ostart;
+ const BYTE* litPtr = dctx->litPtr;
+ const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
+ const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
+ const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
+ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
+
+ /* Regen sequences */
+ if (nbSeq) {
+ seqState_t seqState;
+ dctx->fseEntropy = 1;
+ { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+ RETURN_ERROR_IF(
+ ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
+ corruption_detected, "");
+ ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+ ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+ ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+ assert(dst != NULL);
+
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __asm__(".p2align 6");
+ __asm__("nop");
+ # if __GNUC__ >= 7
+ __asm__(".p2align 5");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # else
+ __asm__(".p2align 4");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # endif
+ #endif
+
+ for ( ; nbSeq ; nbSeq--) {
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ op += oneSeqSize;
  }
+
+ /* check if reached exact end */
+ assert(nbSeq == 0);
+ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
+ /* save reps for next block */
+ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
  }
 
- return op-ostart;
+ /* last literal segment */
+ { size_t const lastLLSize = (size_t)(litEnd - litPtr);
+ DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize);
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ } }
+
+ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart));
+ return (size_t)(op - ostart);
  }
 
  static size_t
  ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset,
- const int frame)
+ const ZSTD_longOffset_e isLongOffset)
  {
- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ }
+
+ static size_t
+ ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset)
+ {
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
 
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+
+ FORCE_INLINE_TEMPLATE
+
+ size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
+ const BYTE* const prefixStart, const BYTE* const dictEnd)
+ {
+ prefetchPos += sequence.litLength;
+ { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
+ /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
+ * No consequence though : memory address is only used for prefetching, not for dereferencing */
+ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset);
+ PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+ }
+ return prefetchPos + sequence.matchLength;
+ }
+
+ /* This decoding function employs prefetching
+ * to reduce latency impact of cache misses.
+ * It's generally employed when a block contains a significant portion of long-distance matches
+ * or when coupled with a "cold" dictionary */
  FORCE_INLINE_TEMPLATE size_t
  ZSTD_decompressSequencesLong_body(
  ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset,
- const int frame)
+ const ZSTD_longOffset_e isLongOffset)
  {
  const BYTE* ip = (const BYTE*)seqStart;
  const BYTE* const iend = ip + seqSize;
- BYTE* const ostart = (BYTE* const)dst;
- BYTE* const oend = ostart + maxDstSize;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
  BYTE* op = ostart;
  const BYTE* litPtr = dctx->litPtr;
- const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
  const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
- (void)frame;
 
  /* Regen sequences */
  if (nbSeq) {
- #define STORED_SEQS 4
+ #define STORED_SEQS 8
  #define STORED_SEQS_MASK (STORED_SEQS-1)
- #define ADVANCED_SEQS 4
+ #define ADVANCED_SEQS STORED_SEQS
  seq_t sequences[STORED_SEQS];
  int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
  seqState_t seqState;
  int seqNb;
+ size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */
+
  dctx->fseEntropy = 1;
  { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
- seqState.prefixStart = prefixStart;
- seqState.pos = (size_t)(op-prefixStart);
- seqState.dictEnd = dictEnd;
  assert(dst != NULL);
  assert(iend >= ip);
  RETURN_ERROR_IF(
@@ -1276,37 +1770,95 @@ ZSTD_decompressSequencesLong_body(
  ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
 
  /* prepare in advance */
- for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
- sequences[seqNb] = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
- PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+ for (seqNb=0; seqNb<seqAdvance; seqNb++) {
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1);
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+ sequences[seqNb] = sequence;
  }
- RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
 
- /* decode and decompress */
- for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+ /* decompress without stomping litBuffer */
+ for (; seqNb < nbSeq; seqNb++) {
+ seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1);
+
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) {
+ /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+ if (leftoverLit)
+ {
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+ sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;
+ op += leftoverLit;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
- assert(!ZSTD_isError(oneSeqSize));
- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+ assert(!ZSTD_isError(oneSeqSize));
+ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
  #endif
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
- PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
- sequences[seqNb & STORED_SEQS_MASK] = sequence;
- op += oneSeqSize;
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
+ op += oneSeqSize;
+ } }
+ else
+ {
+ /* lit buffer is either wholly contained in first or second split, or not split at all*/
+ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+ ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
+ op += oneSeqSize;
+ }
  }
- RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");
+ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
 
  /* finish queue */
  seqNb -= seqAdvance;
  for ( ; seqNb<nbSeq ; seqNb++) {
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+ seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) {
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+ if (leftoverLit) {
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+ sequence->litLength -= leftoverLit;
+ op += leftoverLit;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
- assert(!ZSTD_isError(oneSeqSize));
- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+ assert(!ZSTD_isError(oneSeqSize));
+ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
  #endif
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
- op += oneSeqSize;
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ op += oneSeqSize;
+ }
+ }
+ else
+ {
+ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+ ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ op += oneSeqSize;
+ }
  }
 
  /* save reps for next block */
@@ -1314,25 +1866,34 @@ ZSTD_decompressSequencesLong_body(
  }
 
  /* last literal segment */
- { size_t const lastLLSize = litEnd - litPtr;
+ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */
+ size_t const lastLLSize = litBufferEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memmove(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ }
+ { size_t const lastLLSize = litBufferEnd - litPtr;
  RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
  if (op != NULL) {
- ZSTD_memcpy(op, litPtr, lastLLSize);
+ ZSTD_memmove(op, litPtr, lastLLSize);
  op += lastLLSize;
  }
  }
 
- return op-ostart;
+ return (size_t)(op - ostart);
  }
 
  static size_t
  ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset,
- const int frame)
+ const ZSTD_longOffset_e isLongOffset)
  {
- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
 
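The Long variant above pipelines decode and execute: each sequence is decoded, its match line prefetched via ZSTD_prefetchMatch, and only executed ADVANCED_SEQS iterations later, giving the prefetch time to land before ZSTD_execSequence touches the match. STORED_SEQS must stay a power of two so that (seqNb & STORED_SEQS_MASK) can replace a modulo. A toy model of the three phases (prepare in advance, steady state, finish queue), with the decode and prefetch reduced to placeholders:

#include <stdio.h>

#define STORED_SEQS 8                       /* power of 2, as in the diff */
#define STORED_SEQS_MASK (STORED_SEQS - 1)

int main(void)
{
    int pipeline[STORED_SEQS];
    int const nbSeq = 20;
    int i;
    int const advance = nbSeq < STORED_SEQS ? nbSeq : STORED_SEQS;

    for (i = 0; i < advance; i++)           /* prepare in advance */
        pipeline[i] = i;                    /* decode + prefetch would go here */
    for (; i < nbSeq; i++) {                /* steady state */
        printf("execute seq %d\n", pipeline[(i - STORED_SEQS) & STORED_SEQS_MASK]);
        pipeline[i & STORED_SEQS_MASK] = i; /* decode + prefetch the new one */
    }
    for (i -= advance; i < nbSeq; i++)      /* finish queue */
        printf("execute seq %d\n", pipeline[i & STORED_SEQS_MASK]);
    return 0;
}

Doubling STORED_SEQS from 4 to 8, as this release does, deepens the ring so each prefetch has roughly twice as many sequence executions in which to complete before its data is needed.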
@@ -1341,27 +1902,34 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
  #if DYNAMIC_BMI2
 
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
- static TARGET_ATTRIBUTE("bmi2") size_t
+ static BMI2_TARGET_ATTRIBUTE size_t
  DONT_VECTORIZE
  ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset,
- const int frame)
+ const ZSTD_longOffset_e isLongOffset)
  {
- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ }
+ static BMI2_TARGET_ATTRIBUTE size_t
+ DONT_VECTORIZE
+ ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset)
+ {
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
 
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
- static TARGET_ATTRIBUTE("bmi2") size_t
+ static BMI2_TARGET_ATTRIBUTE size_t
  ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset,
- const int frame)
+ const ZSTD_longOffset_e isLongOffset)
  {
- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
 
@@ -1371,23 +1939,34 @@ typedef size_t (*ZSTD_decompressSequences_t)(
  ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset,
- const int frame);
+ const ZSTD_longOffset_e isLongOffset);
 
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
  static size_t
  ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset,
- const int frame)
+ const ZSTD_longOffset_e isLongOffset)
  {
  DEBUGLOG(5, "ZSTD_decompressSequences");
  #if DYNAMIC_BMI2
- if (dctx->bmi2) {
- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
+ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
  }
  #endif
- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ }
+ static size_t
+ ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset)
+ {
+ DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
+ #if DYNAMIC_BMI2
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
+ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ }
+ #endif
+ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
 
@@ -1402,69 +1981,114 @@ static size_t
  ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset,
- const int frame)
+ const ZSTD_longOffset_e isLongOffset)
  {
  DEBUGLOG(5, "ZSTD_decompressSequencesLong");
  #if DYNAMIC_BMI2
- if (dctx->bmi2) {
- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
+ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
  }
  #endif
- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
 
 
+ /**
+ * @returns The total size of the history referenceable by zstd, including
+ * both the prefix and the extDict. At @p op any offset larger than this
+ * is invalid.
+ */
+ static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
+ {
+ return (size_t)(op - virtualStart);
+ }
+
+ typedef struct {
+ unsigned longOffsetShare;
+ unsigned maxNbAdditionalBits;
+ } ZSTD_OffsetInfo;
 
- #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
- /* ZSTD_getLongOffsetsShare() :
+ /* ZSTD_getOffsetInfo() :
  * condition : offTable must be valid
  * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
- * compared to maximum possible of (1<<OffFSELog) */
- static unsigned
- ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
+ * compared to maximum possible of (1<<OffFSELog),
+ * as well as the maximum number of additional bits required.
+ */
+ static ZSTD_OffsetInfo
+ ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
  {
- const void* ptr = offTable;
- U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
- const ZSTD_seqSymbol* table = offTable + 1;
- U32 const max = 1 << tableLog;
- U32 u, total = 0;
- DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
-
- assert(max <= (1 << OffFSELog)); /* max not too large */
- for (u=0; u<max; u++) {
- if (table[u].nbAdditionalBits > 22) total += 1;
+ ZSTD_OffsetInfo info = {0, 0};
+ /* If nbSeq == 0, then the offTable is uninitialized, but we have
+ * no sequences, so both values should be 0.
+ */
+ if (nbSeq != 0) {
+ const void* ptr = offTable;
+ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
+ const ZSTD_seqSymbol* table = offTable + 1;
+ U32 const max = 1 << tableLog;
+ U32 u;
+ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
+
+ assert(max <= (1 << OffFSELog)); /* max not too large */
+ for (u=0; u<max; u++) {
+ info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
+ if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
+ }
+
+ assert(tableLog <= OffFSELog);
+ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */
  }
 
- assert(tableLog <= OffFSELog);
- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */
+ return info;
+ }
 
- return total;
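Since tableLog can be smaller than OffFSELog, the count accumulated over (1 << tableLog) table cells is rescaled by the final shift so that longOffsetShare is always expressed in 1/(1<<OffFSELog) units. A quick numeric check of the minShare thresholds used further down in this diff (7 on 64-bit, 20 on 32-bit), assuming upstream's OffFSELog = 8:

#include <stdio.h>

int main(void)
{
    unsigned const offFseLog = 8;           /* assumed from upstream zstd */
    unsigned const slots = 1u << offFseLog; /* 256 */
    printf("minShare 7  -> %.2f%%\n", 100.0 * 7  / slots);  /* 2.73 */
    printf("minShare 20 -> %.2f%%\n", 100.0 * 20 / slots);  /* 7.81 */
    return 0;
}

These are exactly the "2.73% and 7.81%" quoted in the heuristic comment inside ZSTD_decompressBlock_internal below.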
+ /**
+ * @returns The maximum offset we can decode in one read of our bitstream, without
+ * reloading more bits in the middle of the offset bits read. Any offsets larger
+ * than this must use the long offset decoder.
+ */
+ static size_t ZSTD_maxShortOffset(void)
+ {
+ if (MEM_64bits()) {
+ /* We can decode any offset without reloading bits.
+ * This might change if the max window size grows.
+ */
+ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+ return (size_t)-1;
+ } else {
+ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
+ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
+ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
+ */
+ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
+ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
+ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
+ return maxOffset;
+ }
  }
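A worked example of the 32-bit branch above, assuming STREAM_ACCUMULATOR_MIN resolves to 25 on 32-bit targets and ZSTD_REP_NUM is 3 (both taken from upstream zstd headers, not defined in this diff):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    unsigned const streamAccumulatorMin = 25; /* assumed 32-bit value */
    unsigned const repNum = 3;                /* assumed ZSTD_REP_NUM */
    uint32_t const maxOffbase = (1u << (streamAccumulatorMin + 1)) - 1; /* 67108863 */
    uint32_t const maxOffset  = maxOffbase - repNum;                    /* 67108860 */
    printf("max short offset on 32-bit: %u\n", maxOffset);
    return 0;
}

So on 32-bit builds any offset up to 67,108,860 can be decoded without a mid-read reload; anything larger forces the long-offset decoder, which is what the totalHistorySize comparison below tests for.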
- #endif
 
  size_t
  ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  void* dst, size_t dstCapacity,
- const void* src, size_t srcSize, const int frame)
+ const void* src, size_t srcSize, const streaming_operation streaming)
  { /* blockType == blockCompressed */
  const BYTE* ip = (const BYTE*)src;
- /* isLongOffset must be true if there are long offsets.
- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
- * We don't expect that to be the case in 64-bit mode.
- * In block mode, window size is not known, so we have to be conservative.
- * (note: but it could be evaluated from current-lowLimit)
- */
- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
- DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
-
- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
+ DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize);
+
+ /* Note : the wording of the specification
+ * allows a compressed block to be sized exactly ZSTD_blockSizeMax(dctx).
+ * This generally does not happen, as it makes little sense,
+ * since an uncompressed block would feature the same size and have no decompression cost.
+ * Also note that decoders from reference libzstd prior to v1.5.4
+ * would treat this edge case as an error.
+ * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx)
+ * for broader compatibility with the deployed ecosystem of zstd decoders */
+ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, "");
 
  /* Decode literals section */
- { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
+ { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
  if (ZSTD_isError(litCSize)) return litCSize;
  ip += litCSize;
  srcSize -= litCSize;
@@ -1472,6 +2096,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
 
  /* Build Decoding Tables */
  {
+ /* Compute the maximum block size, which must also work when !frame and fParams are unset.
+ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
+ */
+ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx));
+ size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart);
+ /* isLongOffset must be true if there are long offsets.
+ * Offsets are long if they are larger than ZSTD_maxShortOffset().
+ * We don't expect that to be the case in 64-bit mode.
+ *
+ * We check here to see if our history is large enough to allow long offsets.
+ * If it isn't, then we can't possibly have (valid) long offsets. If the offset
+ * is invalid, then it is okay to read it incorrectly.
+ *
+ * If isLongOffset is true, then we will later check our decoding table to see
+ * if it is even possible to generate long offsets.
+ */
+ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
  /* These macros control at build-time which decompressor implementation
  * we use. If neither is defined, we do some inspection and dispatch at
  * runtime.
@@ -1479,6 +2120,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
  !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
  int usePrefetchDecoder = dctx->ddictIsCold;
+ #else
+ /* Set to 1 to avoid computing offset info if we don't need to.
+ * Otherwise this value is ignored.
+ */
+ int usePrefetchDecoder = 1;
  #endif
  int nbSeq;
  size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
@@ -1486,40 +2132,58 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  ip += seqHSize;
  srcSize -= seqHSize;
 
- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall,
+ "invalid dst");
 
- #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
- if ( !usePrefetchDecoder
- && (!frame || (dctx->fParams.windowSize > (1<<24)))
- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */
- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
- usePrefetchDecoder = (shareLongOffsets >= minShare);
+ /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
+ * compute information about the share of long offsets, and the maximum nbAdditionalBits.
+ * NOTE: could probably use a larger nbSeq limit
+ */
+ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
+ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
+ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
+ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
+ * enough, then we know it is impossible to have too long an offset in this block, so we can
+ * use the regular offset decoder.
+ */
+ isLongOffset = ZSTD_lo_isRegularOffset;
+ }
+ if (!usePrefetchDecoder) {
+ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+ usePrefetchDecoder = (info.longOffsetShare >= minShare);
+ }
  }
- #endif
 
  dctx->ddictIsCold = 0;
 
  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
  !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
- if (usePrefetchDecoder)
+ if (usePrefetchDecoder) {
+ #else
+ (void)usePrefetchDecoder;
+ {
  #endif
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
  #endif
+ }
 
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
  /* else */
- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+ if (dctx->litBufferLocation == ZSTD_split)
+ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+ else
+ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
  #endif
  }
  }
 
 
- void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
  {
- if (dst != dctx->previousDstEnd) { /* not contiguous */
+ if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */
  dctx->dictEnd = dctx->previousDstEnd;
  dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
  dctx->prefixStart = dst;
@@ -1528,13 +2192,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
  }
 
 
- size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
- void* dst, size_t dstCapacity,
- const void* src, size_t srcSize)
+ size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
  {
  size_t dSize;
- ZSTD_checkContinuity(dctx, dst);
- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
+ dctx->isFrameDecompression = 0;
+ ZSTD_checkContinuity(dctx, dst, dstCapacity);
+ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming);
+ FORWARD_IF_ERROR(dSize, "");
  dctx->previousDstEnd = (char*)dst + dSize;
  return dSize;
  }
+
+
+ /* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */
+ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+ {
+ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize);
+ }