extzstd 0.3.2 → 0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. checksums.yaml +4 -4
  2. data/README.md +4 -3
  3. data/contrib/zstd/CHANGELOG +225 -1
  4. data/contrib/zstd/CONTRIBUTING.md +158 -75
  5. data/contrib/zstd/LICENSE +4 -4
  6. data/contrib/zstd/Makefile +106 -69
  7. data/contrib/zstd/Package.swift +36 -0
  8. data/contrib/zstd/README.md +64 -36
  9. data/contrib/zstd/SECURITY.md +15 -0
  10. data/contrib/zstd/TESTING.md +2 -3
  11. data/contrib/zstd/lib/BUCK +5 -7
  12. data/contrib/zstd/lib/Makefile +117 -199
  13. data/contrib/zstd/lib/README.md +37 -7
  14. data/contrib/zstd/lib/common/allocations.h +55 -0
  15. data/contrib/zstd/lib/common/bits.h +200 -0
  16. data/contrib/zstd/lib/common/bitstream.h +80 -86
  17. data/contrib/zstd/lib/common/compiler.h +225 -63
  18. data/contrib/zstd/lib/common/cpu.h +37 -1
  19. data/contrib/zstd/lib/common/debug.c +7 -1
  20. data/contrib/zstd/lib/common/debug.h +21 -12
  21. data/contrib/zstd/lib/common/entropy_common.c +15 -37
  22. data/contrib/zstd/lib/common/error_private.c +9 -2
  23. data/contrib/zstd/lib/common/error_private.h +93 -5
  24. data/contrib/zstd/lib/common/fse.h +12 -87
  25. data/contrib/zstd/lib/common/fse_decompress.c +37 -117
  26. data/contrib/zstd/lib/common/huf.h +97 -172
  27. data/contrib/zstd/lib/common/mem.h +58 -58
  28. data/contrib/zstd/lib/common/pool.c +38 -17
  29. data/contrib/zstd/lib/common/pool.h +10 -4
  30. data/contrib/zstd/lib/common/portability_macros.h +158 -0
  31. data/contrib/zstd/lib/common/threading.c +74 -14
  32. data/contrib/zstd/lib/common/threading.h +5 -10
  33. data/contrib/zstd/lib/common/xxhash.c +6 -814
  34. data/contrib/zstd/lib/common/xxhash.h +6930 -195
  35. data/contrib/zstd/lib/common/zstd_common.c +1 -36
  36. data/contrib/zstd/lib/common/zstd_deps.h +1 -1
  37. data/contrib/zstd/lib/common/zstd_internal.h +68 -154
  38. data/contrib/zstd/lib/common/zstd_trace.h +163 -0
  39. data/contrib/zstd/lib/compress/clevels.h +134 -0
  40. data/contrib/zstd/lib/compress/fse_compress.c +75 -155
  41. data/contrib/zstd/lib/compress/hist.c +1 -1
  42. data/contrib/zstd/lib/compress/hist.h +1 -1
  43. data/contrib/zstd/lib/compress/huf_compress.c +810 -259
  44. data/contrib/zstd/lib/compress/zstd_compress.c +2864 -919
  45. data/contrib/zstd/lib/compress/zstd_compress_internal.h +523 -192
  46. data/contrib/zstd/lib/compress/zstd_compress_literals.c +117 -40
  47. data/contrib/zstd/lib/compress/zstd_compress_literals.h +16 -6
  48. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +28 -19
  49. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +1 -1
  50. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +251 -412
  51. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +1 -1
  52. data/contrib/zstd/lib/compress/zstd_cwksp.h +284 -97
  53. data/contrib/zstd/lib/compress/zstd_double_fast.c +382 -133
  54. data/contrib/zstd/lib/compress/zstd_double_fast.h +14 -2
  55. data/contrib/zstd/lib/compress/zstd_fast.c +732 -260
  56. data/contrib/zstd/lib/compress/zstd_fast.h +3 -2
  57. data/contrib/zstd/lib/compress/zstd_lazy.c +1177 -390
  58. data/contrib/zstd/lib/compress/zstd_lazy.h +129 -14
  59. data/contrib/zstd/lib/compress/zstd_ldm.c +280 -210
  60. data/contrib/zstd/lib/compress/zstd_ldm.h +3 -2
  61. data/contrib/zstd/lib/compress/zstd_ldm_geartab.h +106 -0
  62. data/contrib/zstd/lib/compress/zstd_opt.c +516 -285
  63. data/contrib/zstd/lib/compress/zstd_opt.h +32 -8
  64. data/contrib/zstd/lib/compress/zstdmt_compress.c +202 -131
  65. data/contrib/zstd/lib/compress/zstdmt_compress.h +9 -6
  66. data/contrib/zstd/lib/decompress/huf_decompress.c +1149 -555
  67. data/contrib/zstd/lib/decompress/huf_decompress_amd64.S +595 -0
  68. data/contrib/zstd/lib/decompress/zstd_ddict.c +4 -4
  69. data/contrib/zstd/lib/decompress/zstd_ddict.h +1 -1
  70. data/contrib/zstd/lib/decompress/zstd_decompress.c +583 -106
  71. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1054 -379
  72. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +14 -3
  73. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +56 -6
  74. data/contrib/zstd/lib/deprecated/zbuff.h +1 -1
  75. data/contrib/zstd/lib/deprecated/zbuff_common.c +1 -1
  76. data/contrib/zstd/lib/deprecated/zbuff_compress.c +24 -4
  77. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +3 -1
  78. data/contrib/zstd/lib/dictBuilder/cover.c +60 -44
  79. data/contrib/zstd/lib/dictBuilder/cover.h +6 -11
  80. data/contrib/zstd/lib/dictBuilder/divsufsort.c +1 -1
  81. data/contrib/zstd/lib/dictBuilder/fastcover.c +26 -18
  82. data/contrib/zstd/lib/dictBuilder/zdict.c +100 -101
  83. data/contrib/zstd/lib/legacy/zstd_legacy.h +38 -1
  84. data/contrib/zstd/lib/legacy/zstd_v01.c +18 -53
  85. data/contrib/zstd/lib/legacy/zstd_v01.h +1 -1
  86. data/contrib/zstd/lib/legacy/zstd_v02.c +28 -85
  87. data/contrib/zstd/lib/legacy/zstd_v02.h +1 -1
  88. data/contrib/zstd/lib/legacy/zstd_v03.c +29 -88
  89. data/contrib/zstd/lib/legacy/zstd_v03.h +1 -1
  90. data/contrib/zstd/lib/legacy/zstd_v04.c +27 -80
  91. data/contrib/zstd/lib/legacy/zstd_v04.h +1 -1
  92. data/contrib/zstd/lib/legacy/zstd_v05.c +36 -85
  93. data/contrib/zstd/lib/legacy/zstd_v05.h +1 -1
  94. data/contrib/zstd/lib/legacy/zstd_v06.c +44 -96
  95. data/contrib/zstd/lib/legacy/zstd_v06.h +1 -1
  96. data/contrib/zstd/lib/legacy/zstd_v07.c +37 -92
  97. data/contrib/zstd/lib/legacy/zstd_v07.h +1 -1
  98. data/contrib/zstd/lib/libzstd.mk +237 -0
  99. data/contrib/zstd/lib/libzstd.pc.in +4 -3
  100. data/contrib/zstd/lib/module.modulemap +35 -0
  101. data/contrib/zstd/lib/{dictBuilder/zdict.h → zdict.h} +202 -33
  102. data/contrib/zstd/lib/zstd.h +1030 -332
  103. data/contrib/zstd/lib/{common/zstd_errors.h → zstd_errors.h} +27 -8
  104. data/ext/extconf.rb +26 -7
  105. data/ext/extzstd.c +51 -24
  106. data/ext/extzstd.h +33 -6
  107. data/ext/extzstd_stream.c +74 -31
  108. data/ext/libzstd_conf.h +0 -1
  109. data/ext/zstd_decompress_asm.S +1 -0
  110. metadata +17 -7
  111. data/contrib/zstd/appveyor.yml +0 -292
  112. data/ext/depend +0 -2
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -20,12 +20,12 @@
20
20
  #include "../common/mem.h" /* low level memory routines */
21
21
  #define FSE_STATIC_LINKING_ONLY
22
22
  #include "../common/fse.h"
23
- #define HUF_STATIC_LINKING_ONLY
24
23
  #include "../common/huf.h"
25
24
  #include "../common/zstd_internal.h"
26
25
  #include "zstd_decompress_internal.h" /* ZSTD_DCtx */
27
26
  #include "zstd_ddict.h" /* ZSTD_DDictDictContent */
28
27
  #include "zstd_decompress_block.h"
28
+ #include "../common/bits.h" /* ZSTD_highbit32 */
29
29
 
30
30
  /*_*******************************************************
31
31
  * Macros
@@ -51,6 +51,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }
51
51
  * Block decoding
52
52
  ***************************************************************/
53
53
 
54
+ static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx)
55
+ {
56
+ size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX;
57
+ assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX);
58
+ return blockSizeMax;
59
+ }
60
+
54
61
  /*! ZSTD_getcBlockSize() :
55
62
  * Provides the size of compressed block from block header `src` */
56
63
  size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
@@ -69,36 +76,90 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
69
76
  }
70
77
  }
71
78
 
79
+ /* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
80
+ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
81
+ const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
82
+ {
83
+ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
84
+ assert(litSize <= blockSizeMax);
85
+ assert(dctx->isFrameDecompression || streaming == not_streaming);
86
+ assert(expectedWriteSize <= blockSizeMax);
87
+ if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) {
88
+ /* If we aren't streaming, we can just put the literals after the output
89
+ * of the current block. We don't need to worry about overwriting the
90
+ * extDict of our window, because it doesn't exist.
91
+ * So if we have space after the end of the block, just put it there.
92
+ */
93
+ dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH;
94
+ dctx->litBufferEnd = dctx->litBuffer + litSize;
95
+ dctx->litBufferLocation = ZSTD_in_dst;
96
+ } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) {
97
+ /* Literals fit entirely within the extra buffer, put them there to avoid
98
+ * having to split the literals.
99
+ */
100
+ dctx->litBuffer = dctx->litExtraBuffer;
101
+ dctx->litBufferEnd = dctx->litBuffer + litSize;
102
+ dctx->litBufferLocation = ZSTD_not_in_dst;
103
+ } else {
104
+ assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE);
105
+ /* Literals must be split between the output block and the extra lit
106
+ * buffer. We fill the extra lit buffer with the tail of the literals,
107
+ * and put the rest of the literals at the end of the block, with
108
+ * WILDCOPY_OVERLENGTH of buffer room to allow for overreads.
109
+ * This MUST not write more than our maxBlockSize beyond dst, because in
110
+ * streaming mode, that could overwrite part of our extDict window.
111
+ */
112
+ if (splitImmediately) {
113
+ /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
114
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
115
+ dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
116
+ } else {
117
+ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
118
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
119
+ dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
120
+ }
121
+ dctx->litBufferLocation = ZSTD_split;
122
+ assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize);
123
+ }
124
+ }
72
125
 
73
- /* Hidden declaration for fullbench */
74
- size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
75
- const void* src, size_t srcSize);
76
126
  /*! ZSTD_decodeLiteralsBlock() :
127
+ * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
128
+ * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current
129
+ * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
130
+ * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
131
+ *
77
132
  * @return : nb of bytes read from src (< srcSize )
78
133
  * note : symbol not declared but exposed for fullbench */
79
- size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
80
- const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
134
+ static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
135
+ const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */
136
+ void* dst, size_t dstCapacity, const streaming_operation streaming)
81
137
  {
82
138
  DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
83
139
  RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");
84
140
 
85
141
  { const BYTE* const istart = (const BYTE*) src;
86
142
  symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
143
+ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
87
144
 
88
145
  switch(litEncType)
89
146
  {
90
147
  case set_repeat:
91
148
  DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
92
149
  RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
93
- /* fall-through */
150
+ ZSTD_FALLTHROUGH;
94
151
 
95
152
  case set_compressed:
96
- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
153
+ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
97
154
  { size_t lhSize, litSize, litCSize;
98
155
  U32 singleStream=0;
99
156
  U32 const lhlCode = (istart[0] >> 2) & 3;
100
157
  U32 const lhc = MEM_readLE32(istart);
101
158
  size_t hufSuccess;
159
+ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
160
+ int const flags = 0
161
+ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
162
+ | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
102
163
  switch(lhlCode)
103
164
  {
104
165
  case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -121,8 +182,15 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
121
182
  litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
122
183
  break;
123
184
  }
124
- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
185
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
186
+ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
187
+ if (!singleStream)
188
+ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
189
+ "Not enough literals (%zu) for the 4-streams mode (min %u)",
190
+ litSize, MIN_LITERALS_FOR_4_STREAMS);
125
191
  RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
192
+ RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
193
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);
126
194
 
127
195
  /* prefetch huffman table if cold */
128
196
  if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
@@ -131,13 +199,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
131
199
 
132
200
  if (litEncType==set_repeat) {
133
201
  if (singleStream) {
134
- hufSuccess = HUF_decompress1X_usingDTable_bmi2(
202
+ hufSuccess = HUF_decompress1X_usingDTable(
135
203
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
136
- dctx->HUFptr, dctx->bmi2);
204
+ dctx->HUFptr, flags);
137
205
  } else {
138
- hufSuccess = HUF_decompress4X_usingDTable_bmi2(
206
+ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
207
+ hufSuccess = HUF_decompress4X_usingDTable(
139
208
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
140
- dctx->HUFptr, dctx->bmi2);
209
+ dctx->HUFptr, flags);
141
210
  }
142
211
  } else {
143
212
  if (singleStream) {
@@ -145,20 +214,29 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
145
214
  hufSuccess = HUF_decompress1X_DCtx_wksp(
146
215
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
147
216
  istart+lhSize, litCSize, dctx->workspace,
148
- sizeof(dctx->workspace));
217
+ sizeof(dctx->workspace), flags);
149
218
  #else
150
- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
219
+ hufSuccess = HUF_decompress1X1_DCtx_wksp(
151
220
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
152
221
  istart+lhSize, litCSize, dctx->workspace,
153
- sizeof(dctx->workspace), dctx->bmi2);
222
+ sizeof(dctx->workspace), flags);
154
223
  #endif
155
224
  } else {
156
- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
225
+ hufSuccess = HUF_decompress4X_hufOnly_wksp(
157
226
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
158
227
  istart+lhSize, litCSize, dctx->workspace,
159
- sizeof(dctx->workspace), dctx->bmi2);
228
+ sizeof(dctx->workspace), flags);
160
229
  }
161
230
  }
231
+ if (dctx->litBufferLocation == ZSTD_split)
232
+ {
233
+ assert(litSize > ZSTD_LITBUFFEREXTRASIZE);
234
+ ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
235
+ ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
236
+ dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
237
+ dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
238
+ assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax);
239
+ }
162
240
 
163
241
  RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");
164
242
 
@@ -166,13 +244,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
166
244
  dctx->litSize = litSize;
167
245
  dctx->litEntropy = 1;
168
246
  if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
169
- ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
170
247
  return litCSize + lhSize;
171
248
  }
172
249
 
173
250
  case set_basic:
174
251
  { size_t litSize, lhSize;
175
252
  U32 const lhlCode = ((istart[0]) >> 2) & 3;
253
+ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
176
254
  switch(lhlCode)
177
255
  {
178
256
  case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -185,27 +263,42 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
185
263
  break;
186
264
  case 3:
187
265
  lhSize = 3;
266
+ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
188
267
  litSize = MEM_readLE24(istart) >> 4;
189
268
  break;
190
269
  }
191
270
 
271
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
272
+ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
273
+ RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
274
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
192
275
  if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
193
276
  RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
194
- ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize);
277
+ if (dctx->litBufferLocation == ZSTD_split)
278
+ {
279
+ ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
280
+ ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
281
+ }
282
+ else
283
+ {
284
+ ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
285
+ }
195
286
  dctx->litPtr = dctx->litBuffer;
196
287
  dctx->litSize = litSize;
197
- ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
198
288
  return lhSize+litSize;
199
289
  }
200
290
  /* direct reference into compressed stream */
201
291
  dctx->litPtr = istart+lhSize;
202
292
  dctx->litSize = litSize;
293
+ dctx->litBufferEnd = dctx->litPtr + litSize;
294
+ dctx->litBufferLocation = ZSTD_not_in_dst;
203
295
  return lhSize+litSize;
204
296
  }
205
297
 
206
298
  case set_rle:
207
299
  { U32 const lhlCode = ((istart[0]) >> 2) & 3;
208
300
  size_t litSize, lhSize;
301
+ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
209
302
  switch(lhlCode)
210
303
  {
211
304
  case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -214,16 +307,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
214
307
  break;
215
308
  case 1:
216
309
  lhSize = 2;
310
+ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
217
311
  litSize = MEM_readLE16(istart) >> 4;
218
312
  break;
219
313
  case 3:
220
314
  lhSize = 3;
315
+ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
221
316
  litSize = MEM_readLE24(istart) >> 4;
222
- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
223
317
  break;
224
318
  }
225
- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
226
- ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
319
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
320
+ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
321
+ RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
322
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
323
+ if (dctx->litBufferLocation == ZSTD_split)
324
+ {
325
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
326
+ ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
327
+ }
328
+ else
329
+ {
330
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
331
+ }
227
332
  dctx->litPtr = dctx->litBuffer;
228
333
  dctx->litSize = litSize;
229
334
  return lhSize+1;
@@ -234,9 +339,21 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
234
339
  }
235
340
  }
236
341
 
342
+ /* Hidden declaration for fullbench */
343
+ size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
344
+ const void* src, size_t srcSize,
345
+ void* dst, size_t dstCapacity);
346
+ size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
347
+ const void* src, size_t srcSize,
348
+ void* dst, size_t dstCapacity)
349
+ {
350
+ dctx->isFrameDecompression = 0;
351
+ return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming);
352
+ }
353
+
237
354
  /* Default FSE distribution tables.
238
355
  * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
239
- * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
356
+ * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
240
357
  * They were generated programmatically with following method :
241
358
  * - start from default distributions, present in /lib/common/zstd_internal.h
242
359
  * - generate tables normally, using ZSTD_buildFSETable()
@@ -343,7 +460,7 @@ static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
343
460
  }; /* ML_defaultDTable */
344
461
 
345
462
 
346
- static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
463
+ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
347
464
  {
348
465
  void* ptr = dt;
349
466
  ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
@@ -355,7 +472,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
355
472
  cell->nbBits = 0;
356
473
  cell->nextState = 0;
357
474
  assert(nbAddBits < 255);
358
- cell->nbAdditionalBits = (BYTE)nbAddBits;
475
+ cell->nbAdditionalBits = nbAddBits;
359
476
  cell->baseValue = baseValue;
360
477
  }
361
478
 
@@ -367,7 +484,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
367
484
  FORCE_INLINE_TEMPLATE
368
485
  void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
369
486
  const short* normalizedCounter, unsigned maxSymbolValue,
370
- const U32* baseValue, const U32* nbAdditionalBits,
487
+ const U32* baseValue, const U8* nbAdditionalBits,
371
488
  unsigned tableLog, void* wksp, size_t wkspSize)
372
489
  {
373
490
  ZSTD_seqSymbol* const tableDecode = dt+1;
@@ -430,14 +547,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
430
547
  for (i = 8; i < n; i += 8) {
431
548
  MEM_write64(spread + pos + i, sv);
432
549
  }
433
- pos += n;
550
+ assert(n>=0);
551
+ pos += (size_t)n;
434
552
  }
435
553
  }
436
554
  /* Now we spread those positions across the table.
437
- * The benefit of doing it in two stages is that we avoid the the
555
+ * The benefit of doing it in two stages is that we avoid the
438
556
  * variable size inner loop, which caused lots of branch misses.
439
557
  * Now we can run through all the positions without any branch misses.
440
- * We unroll the loop twice, since that is what emperically worked best.
558
+ * We unroll the loop twice, since that is what empirically worked best.
441
559
  */
442
560
  {
443
561
  size_t position = 0;
@@ -464,7 +582,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
464
582
  for (i=0; i<n; i++) {
465
583
  tableDecode[position].baseValue = s;
466
584
  position = (position + step) & tableMask;
467
- while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
585
+ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */
468
586
  } }
469
587
  assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
470
588
  }
@@ -475,10 +593,10 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
475
593
  for (u=0; u<tableSize; u++) {
476
594
  U32 const symbol = tableDecode[u].baseValue;
477
595
  U32 const nextState = symbolNext[symbol]++;
478
- tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
596
+ tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
479
597
  tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
480
598
  assert(nbAdditionalBits[symbol] < 255);
481
- tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
599
+ tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
482
600
  tableDecode[u].baseValue = baseValue[symbol];
483
601
  }
484
602
  }
@@ -487,7 +605,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
487
605
  /* Avoids the FORCE_INLINE of the _body() function. */
488
606
  static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
489
607
  const short* normalizedCounter, unsigned maxSymbolValue,
490
- const U32* baseValue, const U32* nbAdditionalBits,
608
+ const U32* baseValue, const U8* nbAdditionalBits,
491
609
  unsigned tableLog, void* wksp, size_t wkspSize)
492
610
  {
493
611
  ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
@@ -495,9 +613,9 @@ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
495
613
  }
496
614
 
497
615
  #if DYNAMIC_BMI2
498
- TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
616
+ BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
499
617
  const short* normalizedCounter, unsigned maxSymbolValue,
500
- const U32* baseValue, const U32* nbAdditionalBits,
618
+ const U32* baseValue, const U8* nbAdditionalBits,
501
619
  unsigned tableLog, void* wksp, size_t wkspSize)
502
620
  {
503
621
  ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
@@ -507,7 +625,7 @@ TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol
507
625
 
508
626
  void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
509
627
  const short* normalizedCounter, unsigned maxSymbolValue,
510
- const U32* baseValue, const U32* nbAdditionalBits,
628
+ const U32* baseValue, const U8* nbAdditionalBits,
511
629
  unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
512
630
  {
513
631
  #if DYNAMIC_BMI2
@@ -529,7 +647,7 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
529
647
  static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
530
648
  symbolEncodingType_e type, unsigned max, U32 maxLog,
531
649
  const void* src, size_t srcSize,
532
- const U32* baseValue, const U32* nbAdditionalBits,
650
+ const U32* baseValue, const U8* nbAdditionalBits,
533
651
  const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
534
652
  int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
535
653
  int bmi2)
@@ -541,7 +659,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
541
659
  RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
542
660
  { U32 const symbol = *(const BYTE*)src;
543
661
  U32 const baseline = baseValue[symbol];
544
- U32 const nbBits = nbAdditionalBits[symbol];
662
+ U8 const nbBits = nbAdditionalBits[symbol];
545
663
  ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
546
664
  }
547
665
  *DTablePtr = DTableSpace;
@@ -577,7 +695,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
577
695
  size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
578
696
  const void* src, size_t srcSize)
579
697
  {
580
- const BYTE* const istart = (const BYTE* const)src;
698
+ const BYTE* const istart = (const BYTE*)src;
581
699
  const BYTE* const iend = istart + srcSize;
582
700
  const BYTE* ip = istart;
583
701
  int nbSeq;
@@ -588,11 +706,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
588
706
 
589
707
  /* SeqHead */
590
708
  nbSeq = *ip++;
591
- if (!nbSeq) {
592
- *nbSeqPtr=0;
593
- RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
594
- return 1;
595
- }
596
709
  if (nbSeq > 0x7F) {
597
710
  if (nbSeq == 0xFF) {
598
711
  RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
@@ -605,8 +718,16 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
605
718
  }
606
719
  *nbSeqPtr = nbSeq;
607
720
 
721
+ if (nbSeq == 0) {
722
+ /* No sequence : section ends immediately */
723
+ RETURN_ERROR_IF(ip != iend, corruption_detected,
724
+ "extraneous data present in the Sequences section");
725
+ return (size_t)(ip - istart);
726
+ }
727
+
608
728
  /* FSE table descriptors */
609
729
  RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
730
+ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */
610
731
  { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
611
732
  symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
612
733
  symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
@@ -620,7 +741,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
620
741
  LL_defaultDTable, dctx->fseEntropy,
621
742
  dctx->ddictIsCold, nbSeq,
622
743
  dctx->workspace, sizeof(dctx->workspace),
623
- dctx->bmi2);
744
+ ZSTD_DCtx_get_bmi2(dctx));
624
745
  RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
625
746
  ip += llhSize;
626
747
  }
@@ -632,7 +753,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
632
753
  OF_defaultDTable, dctx->fseEntropy,
633
754
  dctx->ddictIsCold, nbSeq,
634
755
  dctx->workspace, sizeof(dctx->workspace),
635
- dctx->bmi2);
756
+ ZSTD_DCtx_get_bmi2(dctx));
636
757
  RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
637
758
  ip += ofhSize;
638
759
  }
@@ -644,7 +765,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
644
765
  ML_defaultDTable, dctx->fseEntropy,
645
766
  dctx->ddictIsCold, nbSeq,
646
767
  dctx->workspace, sizeof(dctx->workspace),
647
- dctx->bmi2);
768
+ ZSTD_DCtx_get_bmi2(dctx));
648
769
  RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
649
770
  ip += mlhSize;
650
771
  }
@@ -658,7 +779,6 @@ typedef struct {
658
779
  size_t litLength;
659
780
  size_t matchLength;
660
781
  size_t offset;
661
- const BYTE* match;
662
782
  } seq_t;
663
783
 
664
784
  typedef struct {
@@ -672,9 +792,6 @@ typedef struct {
672
792
  ZSTD_fseState stateOffb;
673
793
  ZSTD_fseState stateML;
674
794
  size_t prevOffset[ZSTD_REP_NUM];
675
- const BYTE* prefixStart;
676
- const BYTE* dictEnd;
677
- size_t pos;
678
795
  } seqState_t;
679
796
 
680
797
  /*! ZSTD_overlapCopy8() :
@@ -717,7 +834,7 @@ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
717
834
  * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
718
835
  * The src buffer must be before the dst buffer.
719
836
  */
720
- static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
837
+ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
721
838
  ptrdiff_t const diff = op - ip;
722
839
  BYTE* const oend = op + length;
723
840
 
@@ -733,6 +850,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
733
850
  /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
734
851
  assert(length >= 8);
735
852
  ZSTD_overlapCopy8(&op, &ip, diff);
853
+ length -= 8;
736
854
  assert(op - ip >= 8);
737
855
  assert(op <= oend);
738
856
  }
@@ -747,8 +865,31 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
747
865
  assert(oend > oend_w);
748
866
  ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
749
867
  ip += oend_w - op;
750
- op = oend_w;
868
+ op += oend_w - op;
869
+ }
870
+ /* Handle the leftovers. */
871
+ while (op < oend) *op++ = *ip++;
872
+ }
873
+
874
+ /* ZSTD_safecopyDstBeforeSrc():
875
+ * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
876
+ * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
877
+ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) {
878
+ ptrdiff_t const diff = op - ip;
879
+ BYTE* const oend = op + length;
880
+
881
+ if (length < 8 || diff > -8) {
882
+ /* Handle short lengths, close overlaps, and dst not before src. */
883
+ while (op < oend) *op++ = *ip++;
884
+ return;
885
+ }
886
+
887
+ if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
888
+ ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
889
+ ip += oend - WILDCOPY_OVERLENGTH - op;
890
+ op += oend - WILDCOPY_OVERLENGTH - op;
751
891
  }
892
+
752
893
  /* Handle the leftovers. */
753
894
  while (op < oend) *op++ = *ip++;
754
895
  }
@@ -762,10 +903,11 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
762
903
  * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
763
904
  */
764
905
  FORCE_NOINLINE
906
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
765
907
  size_t ZSTD_execSequenceEnd(BYTE* op,
766
- BYTE* const oend, seq_t sequence,
767
- const BYTE** litPtr, const BYTE* const litLimit,
768
- const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
908
+ BYTE* const oend, seq_t sequence,
909
+ const BYTE** litPtr, const BYTE* const litLimit,
910
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
769
911
  {
770
912
  BYTE* const oLitEnd = op + sequence.litLength;
771
913
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -788,27 +930,78 @@ size_t ZSTD_execSequenceEnd(BYTE* op,
788
930
  if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
789
931
  /* offset beyond prefix */
790
932
  RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
791
- match = dictEnd - (prefixStart-match);
933
+ match = dictEnd - (prefixStart - match);
792
934
  if (match + sequence.matchLength <= dictEnd) {
793
935
  ZSTD_memmove(oLitEnd, match, sequence.matchLength);
794
936
  return sequenceLength;
795
937
  }
796
938
  /* span extDict & currentPrefixSegment */
797
939
  { size_t const length1 = dictEnd - match;
798
- ZSTD_memmove(oLitEnd, match, length1);
799
- op = oLitEnd + length1;
800
- sequence.matchLength -= length1;
801
- match = prefixStart;
802
- } }
940
+ ZSTD_memmove(oLitEnd, match, length1);
941
+ op = oLitEnd + length1;
942
+ sequence.matchLength -= length1;
943
+ match = prefixStart;
944
+ }
945
+ }
946
+ ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
947
+ return sequenceLength;
948
+ }
949
+
950
+ /* ZSTD_execSequenceEndSplitLitBuffer():
951
+ * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case.
952
+ */
953
+ FORCE_NOINLINE
954
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
955
+ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
956
+ BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
957
+ const BYTE** litPtr, const BYTE* const litLimit,
958
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
959
+ {
960
+ BYTE* const oLitEnd = op + sequence.litLength;
961
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
962
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
963
+ const BYTE* match = oLitEnd - sequence.offset;
964
+
965
+
966
+ /* bounds checks : careful of address space overflow in 32-bit mode */
967
+ RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
968
+ RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
969
+ assert(op < op + sequenceLength);
970
+ assert(oLitEnd < op + sequenceLength);
971
+
972
+ /* copy literals */
973
+ RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
974
+ ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
975
+ op = oLitEnd;
976
+ *litPtr = iLitEnd;
977
+
978
+ /* copy Match */
979
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
980
+ /* offset beyond prefix */
981
+ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
982
+ match = dictEnd - (prefixStart - match);
983
+ if (match + sequence.matchLength <= dictEnd) {
984
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
985
+ return sequenceLength;
986
+ }
987
+ /* span extDict & currentPrefixSegment */
988
+ { size_t const length1 = dictEnd - match;
989
+ ZSTD_memmove(oLitEnd, match, length1);
990
+ op = oLitEnd + length1;
991
+ sequence.matchLength -= length1;
992
+ match = prefixStart;
993
+ }
994
+ }
803
995
  ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
804
996
  return sequenceLength;
805
997
  }
806
998
 
807
999
  HINT_INLINE
1000
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
808
1001
  size_t ZSTD_execSequence(BYTE* op,
809
- BYTE* const oend, seq_t sequence,
810
- const BYTE** litPtr, const BYTE* const litLimit,
811
- const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
1002
+ BYTE* const oend, seq_t sequence,
1003
+ const BYTE** litPtr, const BYTE* const litLimit,
1004
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
812
1005
  {
813
1006
  BYTE* const oLitEnd = op + sequence.litLength;
814
1007
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -819,6 +1012,104 @@ size_t ZSTD_execSequence(BYTE* op,
819
1012
 
820
1013
  assert(op != NULL /* Precondition */);
821
1014
  assert(oend_w < oend /* No underflow */);
1015
+
1016
+ #if defined(__aarch64__)
1017
+ /* prefetch sequence starting from match that will be used for copy later */
1018
+ PREFETCH_L1(match);
1019
+ #endif
1020
+ /* Handle edge cases in a slow path:
1021
+ * - Read beyond end of literals
1022
+ * - Match end is within WILDCOPY_OVERLIMIT of oend
1023
+ * - 32-bit mode and the match length overflows
1024
+ */
1025
+ if (UNLIKELY(
1026
+ iLitEnd > litLimit ||
1027
+ oMatchEnd > oend_w ||
1028
+ (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
1029
+ return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
1030
+
1031
+ /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
1032
+ assert(op <= oLitEnd /* No overflow */);
1033
+ assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
1034
+ assert(oMatchEnd <= oend /* No underflow */);
1035
+ assert(iLitEnd <= litLimit /* Literal length is in bounds */);
1036
+ assert(oLitEnd <= oend_w /* Can wildcopy literals */);
1037
+ assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
1038
+
1039
+ /* Copy Literals:
1040
+ * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
1041
+ * We likely don't need the full 32-byte wildcopy.
1042
+ */
1043
+ assert(WILDCOPY_OVERLENGTH >= 16);
1044
+ ZSTD_copy16(op, (*litPtr));
1045
+ if (UNLIKELY(sequence.litLength > 16)) {
1046
+ ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
1047
+ }
1048
+ op = oLitEnd;
1049
+ *litPtr = iLitEnd; /* update for next sequence */
1050
+
1051
+ /* Copy Match */
1052
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
1053
+ /* offset beyond prefix -> go into extDict */
1054
+ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
1055
+ match = dictEnd + (match - prefixStart);
1056
+ if (match + sequence.matchLength <= dictEnd) {
1057
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
1058
+ return sequenceLength;
1059
+ }
1060
+ /* span extDict & currentPrefixSegment */
1061
+ { size_t const length1 = dictEnd - match;
1062
+ ZSTD_memmove(oLitEnd, match, length1);
1063
+ op = oLitEnd + length1;
1064
+ sequence.matchLength -= length1;
1065
+ match = prefixStart;
1066
+ }
1067
+ }
1068
+ /* Match within prefix of 1 or more bytes */
1069
+ assert(op <= oMatchEnd);
1070
+ assert(oMatchEnd <= oend_w);
1071
+ assert(match >= prefixStart);
1072
+ assert(sequence.matchLength >= 1);
1073
+
1074
+ /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
1075
+ * without overlap checking.
1076
+ */
1077
+ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
1078
+ /* We bet on a full wildcopy for matches, since we expect matches to be
1079
+ * longer than literals (in general). In silesia, ~10% of matches are longer
1080
+ * than 16 bytes.
1081
+ */
1082
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
1083
+ return sequenceLength;
1084
+ }
1085
+ assert(sequence.offset < WILDCOPY_VECLEN);
1086
+
1087
+ /* Copy 8 bytes and spread the offset to be >= 8. */
1088
+ ZSTD_overlapCopy8(&op, &match, sequence.offset);
1089
+
1090
+ /* If the match length is > 8 bytes, then continue with the wildcopy. */
1091
+ if (sequence.matchLength > 8) {
1092
+ assert(op < oMatchEnd);
1093
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
1094
+ }
1095
+ return sequenceLength;
1096
+ }
1097
+
1098
+ HINT_INLINE
1099
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
1100
+ size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
1101
+ BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
1102
+ const BYTE** litPtr, const BYTE* const litLimit,
1103
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
1104
+ {
1105
+ BYTE* const oLitEnd = op + sequence.litLength;
1106
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
1107
+ BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
1108
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
1109
+ const BYTE* match = oLitEnd - sequence.offset;
1110
+
1111
+ assert(op != NULL /* Precondition */);
1112
+ assert(oend_w < oend /* No underflow */);
822
1113
  /* Handle edge cases in a slow path:
823
1114
  * - Read beyond end of literals
824
1115
  * - Match end is within WILDCOPY_OVERLIMIT of oend
@@ -828,7 +1119,7 @@ size_t ZSTD_execSequence(BYTE* op,
828
1119
  iLitEnd > litLimit ||
829
1120
  oMatchEnd > oend_w ||
830
1121
  (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
831
- return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
1122
+ return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
832
1123
 
833
1124
  /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
834
1125
  assert(op <= oLitEnd /* No overflow */);
@@ -896,6 +1187,7 @@ size_t ZSTD_execSequence(BYTE* op,
896
1187
  return sequenceLength;
897
1188
  }
898
1189
 
1190
+
899
1191
  static void
900
1192
  ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
901
1193
  {
@@ -909,24 +1201,14 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS
909
1201
  }
910
1202
 
911
1203
  FORCE_INLINE_TEMPLATE void
912
- ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
913
- {
914
- ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state];
915
- U32 const nbBits = DInfo.nbBits;
916
- size_t const lowBits = BIT_readBits(bitD, nbBits);
917
- DStatePtr->state = DInfo.nextState + lowBits;
918
- }
919
-
920
- FORCE_INLINE_TEMPLATE void
921
- ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo)
1204
+ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)
922
1205
  {
923
- U32 const nbBits = DInfo.nbBits;
924
1206
  size_t const lowBits = BIT_readBits(bitD, nbBits);
925
- DStatePtr->state = DInfo.nextState + lowBits;
1207
+ DStatePtr->state = nextState + lowBits;
926
1208
  }
927
1209
 
928
1210
  /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
929
- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
1211
+ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
930
1212
  * bits before reloading. This value is the maximum number of bytes we read
931
1213
  * after reloading when we are decoding long offsets.
932
1214
  */
@@ -936,123 +1218,136 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD
936
1218
  : 0)
937
1219
 
938
1220
  typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
939
- typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e;
940
1221
 
1222
+ /**
1223
+ * ZSTD_decodeSequence():
1224
+ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets
1225
+ * only used in 32-bit mode
1226
+ * @return : Sequence (litL + matchL + offset)
1227
+ */
941
1228
  FORCE_INLINE_TEMPLATE seq_t
942
- ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch)
1229
+ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq)
943
1230
  {
944
1231
  seq_t seq;
945
- ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
946
- ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state];
947
- ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state];
948
- U32 const llBase = llDInfo.baseValue;
949
- U32 const mlBase = mlDInfo.baseValue;
950
- U32 const ofBase = ofDInfo.baseValue;
951
- BYTE const llBits = llDInfo.nbAdditionalBits;
952
- BYTE const mlBits = mlDInfo.nbAdditionalBits;
953
- BYTE const ofBits = ofDInfo.nbAdditionalBits;
954
- BYTE const totalBits = llBits+mlBits+ofBits;
955
-
956
- /* sequence */
957
- { size_t offset;
958
- if (ofBits > 1) {
959
- ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
960
- ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
961
- assert(ofBits <= MaxOff);
962
- if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
963
- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
964
- offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
965
- BIT_reloadDStream(&seqState->DStream);
966
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
967
- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
968
- } else {
969
- offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
970
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
971
- }
972
- seqState->prevOffset[2] = seqState->prevOffset[1];
973
- seqState->prevOffset[1] = seqState->prevOffset[0];
974
- seqState->prevOffset[0] = offset;
975
- } else {
976
- U32 const ll0 = (llBase == 0);
977
- if (LIKELY((ofBits == 0))) {
978
- if (LIKELY(!ll0))
979
- offset = seqState->prevOffset[0];
980
- else {
981
- offset = seqState->prevOffset[1];
982
- seqState->prevOffset[1] = seqState->prevOffset[0];
983
- seqState->prevOffset[0] = offset;
1232
+ /*
1233
+ * ZSTD_seqSymbol is a 64 bits wide structure.
1234
+ * It can be loaded in one operation
1235
+ * and its fields extracted by simply shifting or bit-extracting on aarch64.
1236
+ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
1237
+ * operations that cause performance drop. This can be avoided by using this
1238
+ * ZSTD_memcpy hack.
1239
+ */
1240
+ #if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
1241
+ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
1242
+ ZSTD_seqSymbol* const llDInfo = &llDInfoS;
1243
+ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
1244
+ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
1245
+ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
1246
+ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
1247
+ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
1248
+ #else
1249
+ const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
1250
+ const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
1251
+ const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
1252
+ #endif
1253
+ seq.matchLength = mlDInfo->baseValue;
1254
+ seq.litLength = llDInfo->baseValue;
1255
+ { U32 const ofBase = ofDInfo->baseValue;
1256
+ BYTE const llBits = llDInfo->nbAdditionalBits;
1257
+ BYTE const mlBits = mlDInfo->nbAdditionalBits;
1258
+ BYTE const ofBits = ofDInfo->nbAdditionalBits;
1259
+ BYTE const totalBits = llBits+mlBits+ofBits;
1260
+
1261
+ U16 const llNext = llDInfo->nextState;
1262
+ U16 const mlNext = mlDInfo->nextState;
1263
+ U16 const ofNext = ofDInfo->nextState;
1264
+ U32 const llnbBits = llDInfo->nbBits;
1265
+ U32 const mlnbBits = mlDInfo->nbBits;
1266
+ U32 const ofnbBits = ofDInfo->nbBits;
1267
+
1268
+ assert(llBits <= MaxLLBits);
1269
+ assert(mlBits <= MaxMLBits);
1270
+ assert(ofBits <= MaxOff);
1271
+ /*
1272
+ * As gcc has better branch and block analyzers, sometimes it is only
1273
+ * valuable to mark likeliness for clang, it gives around 3-4% of
1274
+ * performance.
1275
+ */
1276
+
1277
+ /* sequence */
1278
+ { size_t offset;
1279
+ if (ofBits > 1) {
1280
+ ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
1281
+ ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
1282
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
1283
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
1284
+ if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
1285
+ /* Always read extra bits, this keeps the logic simple,
1286
+ * avoids branches, and avoids accidentally reading 0 bits.
1287
+ */
1288
+ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
1289
+ offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
1290
+ BIT_reloadDStream(&seqState->DStream);
1291
+ offset += BIT_readBitsFast(&seqState->DStream, extraBits);
1292
+ } else {
1293
+ offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
1294
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
984
1295
  }
1296
+ seqState->prevOffset[2] = seqState->prevOffset[1];
1297
+ seqState->prevOffset[1] = seqState->prevOffset[0];
1298
+ seqState->prevOffset[0] = offset;
985
1299
  } else {
986
- offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
987
- { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
988
- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
989
- if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
990
- seqState->prevOffset[1] = seqState->prevOffset[0];
991
- seqState->prevOffset[0] = offset = temp;
992
- } } }
993
- seq.offset = offset;
994
- }
1300
+ U32 const ll0 = (llDInfo->baseValue == 0);
1301
+ if (LIKELY((ofBits == 0))) {
1302
+ offset = seqState->prevOffset[ll0];
1303
+ seqState->prevOffset[1] = seqState->prevOffset[!ll0];
1304
+ seqState->prevOffset[0] = offset;
1305
+ } else {
1306
+ offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
1307
+ { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
1308
+ temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */
1309
+ if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
1310
+ seqState->prevOffset[1] = seqState->prevOffset[0];
1311
+ seqState->prevOffset[0] = offset = temp;
1312
+ } } }
1313
+ seq.offset = offset;
1314
+ }
995
1315
 
996
- seq.matchLength = mlBase;
997
- if (mlBits > 0)
998
- seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
1316
+ if (mlBits > 0)
1317
+ seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
999
1318
 
1000
- if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
1001
- BIT_reloadDStream(&seqState->DStream);
1002
- if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
1003
- BIT_reloadDStream(&seqState->DStream);
1004
- /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
1005
- ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
1319
+ if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
1320
+ BIT_reloadDStream(&seqState->DStream);
1321
+ if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
1322
+ BIT_reloadDStream(&seqState->DStream);
1323
+ /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
1324
+ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
1006
1325
 
1007
- seq.litLength = llBase;
1008
- if (llBits > 0)
1009
- seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
1326
+ if (llBits > 0)
1327
+ seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
1010
1328
 
1011
- if (MEM_32bits())
1012
- BIT_reloadDStream(&seqState->DStream);
1329
+ if (MEM_32bits())
1330
+ BIT_reloadDStream(&seqState->DStream);
1013
1331
 
1014
- DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
1015
- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
1332
+ DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
1333
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
1016
1334
 
1017
- if (prefetch == ZSTD_p_prefetch) {
1018
- size_t const pos = seqState->pos + seq.litLength;
1019
- const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
1020
- seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
1021
- * No consequence though : no memory access will occur, offset is only used for prefetching */
1022
- seqState->pos = pos + seq.matchLength;
1023
- }
1024
-
1025
- /* ANS state update
1026
- * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
1027
- * clang-9.2.0 does 7% worse with ZSTD_updateFseState().
1028
- * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the
1029
- * better option, so it is the default for other compilers. But, if you
1030
- * measure that it is worse, please put up a pull request.
1031
- */
1032
- {
1033
- #if defined(__GNUC__) && !defined(__clang__)
1034
- const int kUseUpdateFseState = 1;
1035
- #else
1036
- const int kUseUpdateFseState = 0;
1037
- #endif
1038
- if (kUseUpdateFseState) {
1039
- ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
1040
- ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
1335
+ if (!isLastSeq) {
1336
+ /* don't update FSE state for last Sequence */
1337
+ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */
1338
+ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */
1041
1339
  if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
1042
- ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
1043
- } else {
1044
- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */
1045
- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */
1046
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
1047
- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */
1340
+ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */
1341
+ BIT_reloadDStream(&seqState->DStream);
1048
1342
  }
1049
1343
  }
1050
1344
 
1051
1345
  return seq;
1052
1346
  }
1053
1347
 
1054
- #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1055
- MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
1348
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1349
+ #if DEBUGLEVEL >= 1
1350
+ static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
1056
1351
  {
1057
1352
  size_t const windowSize = dctx->fParams.windowSize;
1058
1353
  /* No dictionary used. */
@@ -1066,30 +1361,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix
1066
1361
  /* Dictionary is active. */
1067
1362
  return 1;
1068
1363
  }
1364
+ #endif
1069
1365
 
1070
- MEM_STATIC void ZSTD_assertValidSequence(
1366
+ static void ZSTD_assertValidSequence(
1071
1367
  ZSTD_DCtx const* dctx,
1072
1368
  BYTE const* op, BYTE const* oend,
1073
1369
  seq_t const seq,
1074
1370
  BYTE const* prefixStart, BYTE const* virtualStart)
1075
1371
  {
1076
1372
  #if DEBUGLEVEL >= 1
1077
- size_t const windowSize = dctx->fParams.windowSize;
1078
- size_t const sequenceSize = seq.litLength + seq.matchLength;
1079
- BYTE const* const oLitEnd = op + seq.litLength;
1080
- DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
1081
- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
1082
- assert(op <= oend);
1083
- assert((size_t)(oend - op) >= sequenceSize);
1084
- assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
1085
- if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
1086
- size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
1087
- /* Offset must be within the dictionary. */
1088
- assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
1089
- assert(seq.offset <= windowSize + dictSize);
1090
- } else {
1091
- /* Offset must be within our window. */
1092
- assert(seq.offset <= windowSize);
1373
+ if (dctx->isFrameDecompression) {
1374
+ size_t const windowSize = dctx->fParams.windowSize;
1375
+ size_t const sequenceSize = seq.litLength + seq.matchLength;
1376
+ BYTE const* const oLitEnd = op + seq.litLength;
1377
+ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
1378
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
1379
+ assert(op <= oend);
1380
+ assert((size_t)(oend - op) >= sequenceSize);
1381
+ assert(sequenceSize <= ZSTD_blockSizeMax(dctx));
1382
+ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
1383
+ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
1384
+ /* Offset must be within the dictionary. */
1385
+ assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
1386
+ assert(seq.offset <= windowSize + dictSize);
1387
+ } else {
1388
+ /* Offset must be within our window. */
1389
+ assert(seq.offset <= windowSize);
1390
+ }
1093
1391
  }
1094
1392
  #else
1095
1393
  (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
@@ -1098,31 +1396,30 @@ MEM_STATIC void ZSTD_assertValidSequence(
1098
1396
  #endif
1099
1397
 
1100
1398
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1399
+
1400
+
1101
1401
  FORCE_INLINE_TEMPLATE size_t
1102
1402
  DONT_VECTORIZE
1103
- ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1403
+ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
1104
1404
  void* dst, size_t maxDstSize,
1105
1405
  const void* seqStart, size_t seqSize, int nbSeq,
1106
- const ZSTD_longOffset_e isLongOffset,
1107
- const int frame)
1406
+ const ZSTD_longOffset_e isLongOffset)
1108
1407
  {
1109
1408
  const BYTE* ip = (const BYTE*)seqStart;
1110
1409
  const BYTE* const iend = ip + seqSize;
1111
- BYTE* const ostart = (BYTE* const)dst;
1112
- BYTE* const oend = ostart + maxDstSize;
1410
+ BYTE* const ostart = (BYTE*)dst;
1411
+ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
1113
1412
  BYTE* op = ostart;
1114
1413
  const BYTE* litPtr = dctx->litPtr;
1115
- const BYTE* const litEnd = litPtr + dctx->litSize;
1414
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
1116
1415
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
1117
1416
  const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
1118
1417
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
1119
- DEBUGLOG(5, "ZSTD_decompressSequences_body");
1120
- (void)frame;
1418
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq);
1121
1419
 
1122
- /* Regen sequences */
1420
+ /* Literals are split between internal buffer & output buffer */
1123
1421
  if (nbSeq) {
1124
1422
  seqState_t seqState;
1125
- size_t error = 0;
1126
1423
  dctx->fseEntropy = 1;
1127
1424
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
1128
1425
  RETURN_ERROR_IF(
@@ -1138,134 +1435,331 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1138
1435
  BIT_DStream_endOfBuffer < BIT_DStream_completed &&
1139
1436
  BIT_DStream_completed < BIT_DStream_overflow);
1140
1437
 
1438
+ /* decompress without overrunning litPtr begins */
1439
+ { seq_t sequence = {0,0,0}; /* some static analyzers believe that @sequence is not initialized (it necessarily is, since the for loop below has at least one iteration) */
1440
+ /* Align the decompression loop to 32 + 16 bytes.
1441
+ *
1442
+ * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
1443
+ * speed swings based on the alignment of the decompression loop. This
1444
+ * performance swing is caused by parts of the decompression loop falling
1445
+ * out of the DSB. The entire decompression loop should fit in the DSB,
1446
+ * when it can't we get much worse performance. You can measure if you've
1447
+ * hit the good case or the bad case with this perf command for some
1448
+ * compressed file test.zst:
1449
+ *
1450
+ * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
1451
+ * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
1452
+ *
1453
+ * If you see most cycles served out of the MITE you've hit the bad case.
1454
+ * If you see most cycles served out of the DSB you've hit the good case.
1455
+ * If it is pretty even then you may be in an okay case.
1456
+ *
1457
+ * This issue has been reproduced on the following CPUs:
1458
+ * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1459
+ * Use Instruments->Counters to get DSB/MITE cycles.
1460
+ * I never got performance swings, but I was able to
1461
+ * go from the good case of mostly DSB to half of the
1462
+ * cycles served from MITE.
1463
+ * - Coffeelake: Intel i9-9900k
1464
+ * - Coffeelake: Intel i7-9700k
1465
+ *
1466
+ * I haven't been able to reproduce the instability or DSB misses on any
1467
+ * of the following CPUs:
1468
+ * - Haswell
1469
+ * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
1470
+ * - Skylake
1471
+ *
1472
+ * Alignment is done for each of the three major decompression loops:
1473
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
1474
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
1475
+ * - ZSTD_decompressSequences_body
1476
+ * Alignment choices are made to minimize large swings on bad cases and influence on performance
1477
+ * from changes external to this code, rather than to overoptimize on the current commit.
1478
+ *
1479
+ * If you are seeing performance instability this script can help test.
1480
+ * It tests on 4 commits in zstd where I saw performance change.
1481
+ *
1482
+ * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
1483
+ */
1141
1484
  #if defined(__GNUC__) && defined(__x86_64__)
1142
- /* Align the decompression loop to 32 + 16 bytes.
1143
- *
1144
- * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
1145
- * speed swings based on the alignment of the decompression loop. This
1146
- * performance swing is caused by parts of the decompression loop falling
1147
- * out of the DSB. The entire decompression loop should fit in the DSB,
1148
- * when it can't we get much worse performance. You can measure if you've
1149
- * hit the good case or the bad case with this perf command for some
1150
- * compressed file test.zst:
1151
- *
1152
- * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
1153
- * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
1154
- *
1155
- * If you see most cycles served out of the MITE you've hit the bad case.
1156
- * If you see most cycles served out of the DSB you've hit the good case.
1157
- * If it is pretty even then you may be in an okay case.
1158
- *
1159
- * I've been able to reproduce this issue on the following CPUs:
1160
- * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1161
- * Use Instruments->Counters to get DSB/MITE cycles.
1162
- * I never got performance swings, but I was able to
1163
- * go from the good case of mostly DSB to half of the
1164
- * cycles served from MITE.
1165
- * - Coffeelake: Intel i9-9900k
1166
- *
1167
- * I haven't been able to reproduce the instability or DSB misses on any
1168
- * of the following CPUS:
1169
- * - Haswell
1170
- * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
1171
- * - Skylake
1172
- *
1173
- * If you are seeing performance stability this script can help test.
1174
- * It tests on 4 commits in zstd where I saw performance change.
1175
- *
1176
- * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
1177
- */
1178
- __asm__(".p2align 5");
1179
- __asm__("nop");
1180
- __asm__(".p2align 4");
1485
+ __asm__(".p2align 6");
1486
+ # if __GNUC__ >= 7
1487
+ /* good for gcc-7, gcc-9, and gcc-11 */
1488
+ __asm__("nop");
1489
+ __asm__(".p2align 5");
1490
+ __asm__("nop");
1491
+ __asm__(".p2align 4");
1492
+ # if __GNUC__ == 8 || __GNUC__ == 10
1493
+ /* good for gcc-8 and gcc-10 */
1494
+ __asm__("nop");
1495
+ __asm__(".p2align 3");
1496
+ # endif
1497
+ # endif
1181
1498
  #endif
1182
- for ( ; ; ) {
1183
- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch);
1184
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
1499
+
1500
+ /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
1501
+ for ( ; nbSeq; nbSeq--) {
1502
+ sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
1503
+ if (litPtr + sequence.litLength > dctx->litBufferEnd) break;
1504
+ { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
1185
1505
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1186
- assert(!ZSTD_isError(oneSeqSize));
1187
- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1506
+ assert(!ZSTD_isError(oneSeqSize));
1507
+ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1188
1508
  #endif
1189
- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1190
- BIT_reloadDStream(&(seqState.DStream));
1191
- op += oneSeqSize;
1192
- /* gcc and clang both don't like early returns in this loop.
1193
- * Instead break and check for an error at the end of the loop.
1194
- */
1195
- if (UNLIKELY(ZSTD_isError(oneSeqSize))) {
1196
- error = oneSeqSize;
1197
- break;
1509
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1510
+ return oneSeqSize;
1511
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1512
+ op += oneSeqSize;
1513
+ } }
1514
+ DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)");
1515
+
1516
+ /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litBufferEnd */
1517
+ if (nbSeq > 0) {
1518
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
1519
+ DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength);
1520
+ if (leftoverLit) {
1521
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
1522
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
1523
+ sequence.litLength -= leftoverLit;
1524
+ op += leftoverLit;
1525
+ }
1526
+ litPtr = dctx->litExtraBuffer;
1527
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1528
+ dctx->litBufferLocation = ZSTD_not_in_dst;
1529
+ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
1530
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1531
+ assert(!ZSTD_isError(oneSeqSize));
1532
+ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1533
+ #endif
1534
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1535
+ return oneSeqSize;
1536
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1537
+ op += oneSeqSize;
1538
+ }
1539
+ nbSeq--;
1540
+ }
1541
+ }
1542
+
1543
+ if (nbSeq > 0) {
1544
+ /* there is remaining lit from extra buffer */
1545
+
1546
+ #if defined(__GNUC__) && defined(__x86_64__)
1547
+ __asm__(".p2align 6");
1548
+ __asm__("nop");
1549
+ # if __GNUC__ != 7
1550
+ /* worse for gcc-7, better for gcc-8, gcc-9, gcc-10 and clang */
1551
+ __asm__(".p2align 4");
1552
+ __asm__("nop");
1553
+ __asm__(".p2align 3");
1554
+ # elif __GNUC__ >= 11
1555
+ __asm__(".p2align 3");
1556
+ # else
1557
+ __asm__(".p2align 5");
1558
+ __asm__("nop");
1559
+ __asm__(".p2align 3");
1560
+ # endif
1561
+ #endif
1562
+
1563
+ for ( ; nbSeq ; nbSeq--) {
1564
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
1565
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
1566
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1567
+ assert(!ZSTD_isError(oneSeqSize));
1568
+ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1569
+ #endif
1570
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1571
+ return oneSeqSize;
1572
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1573
+ op += oneSeqSize;
1198
1574
  }
1199
- if (UNLIKELY(!--nbSeq)) break;
1200
1575
  }
1201
1576
 
1202
1577
  /* check if reached exact end */
1203
- DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
1204
- if (ZSTD_isError(error)) return error;
1578
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
1205
1579
  RETURN_ERROR_IF(nbSeq, corruption_detected, "");
1206
- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
1580
+ DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed);
1581
+ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
1207
1582
  /* save reps for next block */
1208
1583
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
1209
1584
  }
1210
1585
 
1211
1586
  /* last literal segment */
1212
- { size_t const lastLLSize = litEnd - litPtr;
1587
+ if (dctx->litBufferLocation == ZSTD_split) {
1588
+ /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
1589
+ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr);
1590
+ DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize);
1591
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
1592
+ if (op != NULL) {
1593
+ ZSTD_memmove(op, litPtr, lastLLSize);
1594
+ op += lastLLSize;
1595
+ }
1596
+ litPtr = dctx->litExtraBuffer;
1597
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1598
+ dctx->litBufferLocation = ZSTD_not_in_dst;
1599
+ }
1600
+ /* copy last literals from internal buffer */
1601
+ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr);
1602
+ DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize);
1213
1603
  RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
1214
1604
  if (op != NULL) {
1215
1605
  ZSTD_memcpy(op, litPtr, lastLLSize);
1216
1606
  op += lastLLSize;
1607
+ } }
1608
+
1609
+ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart));
1610
+ return (size_t)(op - ostart);
1611
+ }
1612
+
1613
+ FORCE_INLINE_TEMPLATE size_t
1614
+ DONT_VECTORIZE
1615
+ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
1616
+ void* dst, size_t maxDstSize,
1617
+ const void* seqStart, size_t seqSize, int nbSeq,
1618
+ const ZSTD_longOffset_e isLongOffset)
1619
+ {
1620
+ const BYTE* ip = (const BYTE*)seqStart;
1621
+ const BYTE* const iend = ip + seqSize;
1622
+ BYTE* const ostart = (BYTE*)dst;
1623
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer;
1624
+ BYTE* op = ostart;
1625
+ const BYTE* litPtr = dctx->litPtr;
1626
+ const BYTE* const litEnd = litPtr + dctx->litSize;
1627
+ const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
1628
+ const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
1629
+ const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
1630
+ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
1631
+
1632
+ /* Regen sequences */
1633
+ if (nbSeq) {
1634
+ seqState_t seqState;
1635
+ dctx->fseEntropy = 1;
1636
+ { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
1637
+ RETURN_ERROR_IF(
1638
+ ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
1639
+ corruption_detected, "");
1640
+ ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
1641
+ ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
1642
+ ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
1643
+ assert(dst != NULL);
1644
+
1645
+ #if defined(__GNUC__) && defined(__x86_64__)
1646
+ __asm__(".p2align 6");
1647
+ __asm__("nop");
1648
+ # if __GNUC__ >= 7
1649
+ __asm__(".p2align 5");
1650
+ __asm__("nop");
1651
+ __asm__(".p2align 3");
1652
+ # else
1653
+ __asm__(".p2align 4");
1654
+ __asm__("nop");
1655
+ __asm__(".p2align 3");
1656
+ # endif
1657
+ #endif
1658
+
1659
+ for ( ; nbSeq ; nbSeq--) {
1660
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
1661
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
1662
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1663
+ assert(!ZSTD_isError(oneSeqSize));
1664
+ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1665
+ #endif
1666
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1667
+ return oneSeqSize;
1668
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1669
+ op += oneSeqSize;
1217
1670
  }
1671
+
1672
+ /* check if reached exact end */
1673
+ assert(nbSeq == 0);
1674
+ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
1675
+ /* save reps for next block */
1676
+ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
1218
1677
  }
1219
1678
 
1220
- return op-ostart;
1679
+ /* last literal segment */
1680
+ { size_t const lastLLSize = (size_t)(litEnd - litPtr);
1681
+ DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize);
1682
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
1683
+ if (op != NULL) {
1684
+ ZSTD_memcpy(op, litPtr, lastLLSize);
1685
+ op += lastLLSize;
1686
+ } }
1687
+
1688
+ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart));
1689
+ return (size_t)(op - ostart);
1221
1690
  }
1222
1691
 
1223
1692
  static size_t
1224
1693
  ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
1225
1694
  void* dst, size_t maxDstSize,
1226
1695
  const void* seqStart, size_t seqSize, int nbSeq,
1227
- const ZSTD_longOffset_e isLongOffset,
1228
- const int frame)
1696
+ const ZSTD_longOffset_e isLongOffset)
1229
1697
  {
1230
- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1698
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1699
+ }
1700
+
1701
+ static size_t
1702
+ ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
1703
+ void* dst, size_t maxDstSize,
1704
+ const void* seqStart, size_t seqSize, int nbSeq,
1705
+ const ZSTD_longOffset_e isLongOffset)
1706
+ {
1707
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1231
1708
  }
1232
1709
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1233
1710
 
1234
1711
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
1712
+
1713
+ FORCE_INLINE_TEMPLATE
1714
+
1715
+ size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
1716
+ const BYTE* const prefixStart, const BYTE* const dictEnd)
1717
+ {
1718
+ prefetchPos += sequence.litLength;
1719
+ { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
1720
+ /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
1721
+ * No consequence though : memory address is only used for prefetching, not for dereferencing */
1722
+ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset);
1723
+ PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
1724
+ }
1725
+ return prefetchPos + sequence.matchLength;
1726
+ }
1727
+
1728
+ /* This decoding function employs prefetching
1729
+ * to reduce latency impact of cache misses.
1730
+ * It's generally employed when block contains a significant portion of long-distance matches
1731
+ * or when coupled with a "cold" dictionary */
1235
1732
  FORCE_INLINE_TEMPLATE size_t
1236
1733
  ZSTD_decompressSequencesLong_body(
1237
1734
  ZSTD_DCtx* dctx,
1238
1735
  void* dst, size_t maxDstSize,
1239
1736
  const void* seqStart, size_t seqSize, int nbSeq,
1240
- const ZSTD_longOffset_e isLongOffset,
1241
- const int frame)
1737
+ const ZSTD_longOffset_e isLongOffset)
1242
1738
  {
1243
1739
  const BYTE* ip = (const BYTE*)seqStart;
1244
1740
  const BYTE* const iend = ip + seqSize;
1245
- BYTE* const ostart = (BYTE* const)dst;
1246
- BYTE* const oend = ostart + maxDstSize;
1741
+ BYTE* const ostart = (BYTE*)dst;
1742
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
1247
1743
  BYTE* op = ostart;
1248
1744
  const BYTE* litPtr = dctx->litPtr;
1249
- const BYTE* const litEnd = litPtr + dctx->litSize;
1745
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
1250
1746
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
1251
1747
  const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
1252
1748
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
1253
- (void)frame;
1254
1749
 
1255
1750
  /* Regen sequences */
1256
1751
  if (nbSeq) {
1257
- #define STORED_SEQS 4
1752
+ #define STORED_SEQS 8
1258
1753
  #define STORED_SEQS_MASK (STORED_SEQS-1)
1259
- #define ADVANCED_SEQS 4
1754
+ #define ADVANCED_SEQS STORED_SEQS
1260
1755
  seq_t sequences[STORED_SEQS];
1261
1756
  int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
1262
1757
  seqState_t seqState;
1263
1758
  int seqNb;
1759
+ size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */
1760
+
1264
1761
  dctx->fseEntropy = 1;
1265
1762
  { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
1266
- seqState.prefixStart = prefixStart;
1267
- seqState.pos = (size_t)(op-prefixStart);
1268
- seqState.dictEnd = dictEnd;
1269
1763
  assert(dst != NULL);
1270
1764
  assert(iend >= ip);
1271
1765
  RETURN_ERROR_IF(
@@ -1276,37 +1770,95 @@ ZSTD_decompressSequencesLong_body(
1276
1770
  ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
1277
1771
 
1278
1772
  /* prepare in advance */
1279
- for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
1280
- sequences[seqNb] = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
1281
- PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
1773
+ for (seqNb=0; seqNb<seqAdvance; seqNb++) {
1774
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1);
1775
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
1776
+ sequences[seqNb] = sequence;
1282
1777
  }
1283
- RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
1284
1778
 
1285
- /* decode and decompress */
1286
- for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
1287
- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
1288
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
1779
+ /* decompress without stomping litBuffer */
1780
+ for (; seqNb < nbSeq; seqNb++) {
1781
+ seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1);
1782
+
1783
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) {
1784
+ /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
1785
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
1786
+ if (leftoverLit)
1787
+ {
1788
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
1789
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
1790
+ sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;
1791
+ op += leftoverLit;
1792
+ }
1793
+ litPtr = dctx->litExtraBuffer;
1794
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1795
+ dctx->litBufferLocation = ZSTD_not_in_dst;
1796
+ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
1289
1797
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1290
- assert(!ZSTD_isError(oneSeqSize));
1291
- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
1798
+ assert(!ZSTD_isError(oneSeqSize));
1799
+ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
1292
1800
  #endif
1293
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1294
- PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
1295
- sequences[seqNb & STORED_SEQS_MASK] = sequence;
1296
- op += oneSeqSize;
1801
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1802
+
1803
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
1804
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
1805
+ op += oneSeqSize;
1806
+ } }
1807
+ else
1808
+ {
1809
+ /* lit buffer is either wholly contained in first or second split, or not split at all */
1810
+ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
1811
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
1812
+ ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
1813
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1814
+ assert(!ZSTD_isError(oneSeqSize));
1815
+ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
1816
+ #endif
1817
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1818
+
1819
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
1820
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
1821
+ op += oneSeqSize;
1822
+ }
1297
1823
  }
1298
- RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");
1824
+ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
1299
1825
 
1300
1826
  /* finish queue */
1301
1827
  seqNb -= seqAdvance;
1302
1828
  for ( ; seqNb<nbSeq ; seqNb++) {
1303
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
1829
+ seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
1830
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) {
1831
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
1832
+ if (leftoverLit) {
1833
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
1834
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
1835
+ sequence->litLength -= leftoverLit;
1836
+ op += leftoverLit;
1837
+ }
1838
+ litPtr = dctx->litExtraBuffer;
1839
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1840
+ dctx->litBufferLocation = ZSTD_not_in_dst;
1841
+ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
1304
1842
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1305
- assert(!ZSTD_isError(oneSeqSize));
1306
- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
1843
+ assert(!ZSTD_isError(oneSeqSize));
1844
+ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
1307
1845
  #endif
1308
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1309
- op += oneSeqSize;
1846
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1847
+ op += oneSeqSize;
1848
+ }
1849
+ }
1850
+ else
1851
+ {
1852
+ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
1853
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
1854
+ ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
1855
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1856
+ assert(!ZSTD_isError(oneSeqSize));
1857
+ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
1858
+ #endif
1859
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1860
+ op += oneSeqSize;
1861
+ }
1310
1862
  }
1311
1863
 
1312
1864
  /* save reps for next block */
@@ -1314,25 +1866,34 @@ ZSTD_decompressSequencesLong_body(
1314
1866
  }
1315
1867
 
1316
1868
  /* last literal segment */
1317
- { size_t const lastLLSize = litEnd - litPtr;
1869
+ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */
1870
+ size_t const lastLLSize = litBufferEnd - litPtr;
1871
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
1872
+ if (op != NULL) {
1873
+ ZSTD_memmove(op, litPtr, lastLLSize);
1874
+ op += lastLLSize;
1875
+ }
1876
+ litPtr = dctx->litExtraBuffer;
1877
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1878
+ }
1879
+ { size_t const lastLLSize = litBufferEnd - litPtr;
1318
1880
  RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
1319
1881
  if (op != NULL) {
1320
- ZSTD_memcpy(op, litPtr, lastLLSize);
1882
+ ZSTD_memmove(op, litPtr, lastLLSize);
1321
1883
  op += lastLLSize;
1322
1884
  }
1323
1885
  }
1324
1886
 
1325
- return op-ostart;
1887
+ return (size_t)(op - ostart);
1326
1888
  }
1327
1889
 
1328
1890
  static size_t
1329
1891
  ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
1330
1892
  void* dst, size_t maxDstSize,
1331
1893
  const void* seqStart, size_t seqSize, int nbSeq,
1332
- const ZSTD_longOffset_e isLongOffset,
1333
- const int frame)
1894
+ const ZSTD_longOffset_e isLongOffset)
1334
1895
  {
1335
- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1896
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1336
1897
  }
1337
1898
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
1338
1899
 
@@ -1341,27 +1902,34 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
1341
1902
  #if DYNAMIC_BMI2
1342
1903
 
1343
1904
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1344
- static TARGET_ATTRIBUTE("bmi2") size_t
1905
+ static BMI2_TARGET_ATTRIBUTE size_t
1345
1906
  DONT_VECTORIZE
1346
1907
  ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
1347
1908
  void* dst, size_t maxDstSize,
1348
1909
  const void* seqStart, size_t seqSize, int nbSeq,
1349
- const ZSTD_longOffset_e isLongOffset,
1350
- const int frame)
1910
+ const ZSTD_longOffset_e isLongOffset)
1351
1911
  {
1352
- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1912
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1913
+ }
1914
+ static BMI2_TARGET_ATTRIBUTE size_t
1915
+ DONT_VECTORIZE
1916
+ ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
1917
+ void* dst, size_t maxDstSize,
1918
+ const void* seqStart, size_t seqSize, int nbSeq,
1919
+ const ZSTD_longOffset_e isLongOffset)
1920
+ {
1921
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1353
1922
  }
1354
1923
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1355
1924
 
1356
1925
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
1357
- static TARGET_ATTRIBUTE("bmi2") size_t
1926
+ static BMI2_TARGET_ATTRIBUTE size_t
1358
1927
  ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
1359
1928
  void* dst, size_t maxDstSize,
1360
1929
  const void* seqStart, size_t seqSize, int nbSeq,
1361
- const ZSTD_longOffset_e isLongOffset,
1362
- const int frame)
1930
+ const ZSTD_longOffset_e isLongOffset)
1363
1931
  {
1364
- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1932
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1365
1933
  }
1366
1934
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
1367
1935
 
@@ -1371,23 +1939,34 @@ typedef size_t (*ZSTD_decompressSequences_t)(
1371
1939
  ZSTD_DCtx* dctx,
1372
1940
  void* dst, size_t maxDstSize,
1373
1941
  const void* seqStart, size_t seqSize, int nbSeq,
1374
- const ZSTD_longOffset_e isLongOffset,
1375
- const int frame);
1942
+ const ZSTD_longOffset_e isLongOffset);
1376
1943
 
1377
1944
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1378
1945
  static size_t
1379
1946
  ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
1380
1947
  const void* seqStart, size_t seqSize, int nbSeq,
1381
- const ZSTD_longOffset_e isLongOffset,
1382
- const int frame)
1948
+ const ZSTD_longOffset_e isLongOffset)
1383
1949
  {
1384
1950
  DEBUGLOG(5, "ZSTD_decompressSequences");
1385
1951
  #if DYNAMIC_BMI2
1386
- if (dctx->bmi2) {
1387
- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1952
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
1953
+ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1388
1954
  }
1389
1955
  #endif
1390
- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1956
+ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1957
+ }
1958
+ static size_t
1959
+ ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
1960
+ const void* seqStart, size_t seqSize, int nbSeq,
1961
+ const ZSTD_longOffset_e isLongOffset)
1962
+ {
1963
+ DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
1964
+ #if DYNAMIC_BMI2
1965
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
1966
+ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1967
+ }
1968
+ #endif
1969
+ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1391
1970
  }
1392
1971
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1393
1972
 
@@ -1402,69 +1981,114 @@ static size_t
1402
1981
  ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
1403
1982
  void* dst, size_t maxDstSize,
1404
1983
  const void* seqStart, size_t seqSize, int nbSeq,
1405
- const ZSTD_longOffset_e isLongOffset,
1406
- const int frame)
1984
+ const ZSTD_longOffset_e isLongOffset)
1407
1985
  {
1408
1986
  DEBUGLOG(5, "ZSTD_decompressSequencesLong");
1409
1987
  #if DYNAMIC_BMI2
1410
- if (dctx->bmi2) {
1411
- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1988
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
1989
+ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1412
1990
  }
1413
1991
  #endif
1414
- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1992
+ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1415
1993
  }
1416
1994
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
1417
1995
 
1418
1996
 
1997
+ /**
1998
+ * @returns The total size of the history referenceable by zstd, including
1999
+ * both the prefix and the extDict. At @p op any offset larger than this
2000
+ * is invalid.
2001
+ */
2002
+ static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
2003
+ {
2004
+ return (size_t)(op - virtualStart);
2005
+ }
2006
+
2007
+ typedef struct {
2008
+ unsigned longOffsetShare;
2009
+ unsigned maxNbAdditionalBits;
2010
+ } ZSTD_OffsetInfo;
1419
2011
 
1420
- #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
1421
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
1422
- /* ZSTD_getLongOffsetsShare() :
2012
+ /* ZSTD_getOffsetInfo() :
1423
2013
  * condition : offTable must be valid
1424
2014
  * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
1425
- * compared to maximum possible of (1<<OffFSELog) */
1426
- static unsigned
1427
- ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
2015
+ * compared to maximum possible of (1<<OffFSELog),
2016
+ * as well as the maximum number of additional bits required.
2017
+ */
2018
+ static ZSTD_OffsetInfo
2019
+ ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
1428
2020
  {
1429
- const void* ptr = offTable;
1430
- U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
1431
- const ZSTD_seqSymbol* table = offTable + 1;
1432
- U32 const max = 1 << tableLog;
1433
- U32 u, total = 0;
1434
- DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
1435
-
1436
- assert(max <= (1 << OffFSELog)); /* max not too large */
1437
- for (u=0; u<max; u++) {
1438
- if (table[u].nbAdditionalBits > 22) total += 1;
2021
+ ZSTD_OffsetInfo info = {0, 0};
2022
+ /* If nbSeq == 0, then the offTable is uninitialized, but we have
2023
+ * no sequences, so both values should be 0.
2024
+ */
2025
+ if (nbSeq != 0) {
2026
+ const void* ptr = offTable;
2027
+ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
2028
+ const ZSTD_seqSymbol* table = offTable + 1;
2029
+ U32 const max = 1 << tableLog;
2030
+ U32 u;
2031
+ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
2032
+
2033
+ assert(max <= (1 << OffFSELog)); /* max not too large */
2034
+ for (u=0; u<max; u++) {
2035
+ info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
2036
+ if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
2037
+ }
2038
+
2039
+ assert(tableLog <= OffFSELog);
2040
+ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */
1439
2041
  }
1440
2042
 
1441
- assert(tableLog <= OffFSELog);
1442
- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */
2043
+ return info;
2044
+ }
1443
2045
 
1444
- return total;
2046
+ /**
2047
+ * @returns The maximum offset we can decode in one read of our bitstream, without
2048
+ * reloading more bits in the middle of the offset bits read. Any offsets larger
2049
+ * than this must use the long offset decoder.
2050
+ */
2051
+ static size_t ZSTD_maxShortOffset(void)
2052
+ {
2053
+ if (MEM_64bits()) {
2054
+ /* We can decode any offset without reloading bits.
2055
+ * This might change if the max window size grows.
2056
+ */
2057
+ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
2058
+ return (size_t)-1;
2059
+ } else {
2060
+ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
2061
+ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
2062
+ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
2063
+ */
2064
+ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
2065
+ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
2066
+ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
2067
+ return maxOffset;
2068
+ }
1445
2069
  }
1446
- #endif
1447
2070
 
1448
2071
  size_t
1449
2072
  ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1450
2073
  void* dst, size_t dstCapacity,
1451
- const void* src, size_t srcSize, const int frame)
2074
+ const void* src, size_t srcSize, const streaming_operation streaming)
1452
2075
  { /* blockType == blockCompressed */
1453
2076
  const BYTE* ip = (const BYTE*)src;
1454
- /* isLongOffset must be true if there are long offsets.
1455
- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
1456
- * We don't expect that to be the case in 64-bit mode.
1457
- * In block mode, window size is not known, so we have to be conservative.
1458
- * (note: but it could be evaluated from current-lowLimit)
1459
- */
1460
- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
1461
- DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
1462
-
1463
- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
2077
+ DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize);
2078
+
2079
+ /* Note : the wording of the specification
2080
+ * allows a compressed block to be sized exactly ZSTD_blockSizeMax(dctx).
2081
+ * This generally does not happen, as it makes little sense,
2082
+ * since an uncompressed block would feature the same size and have no decompression cost.
2083
+ * Also, note that decoders from reference libzstd before v1.5.4
2084
+ * would consider this edge case as an error.
2085
+ * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx)
2086
+ * for broader compatibility with the deployed ecosystem of zstd decoders */
2087
+ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, "");
1464
2088
 
1465
2089
  /* Decode literals section */
1466
- { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
1467
- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
2090
+ { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
2091
+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
1468
2092
  if (ZSTD_isError(litCSize)) return litCSize;
1469
2093
  ip += litCSize;
1470
2094
  srcSize -= litCSize;
@@ -1472,6 +2096,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1472
2096
 
1473
2097
  /* Build Decoding Tables */
1474
2098
  {
2099
+ /* Compute the maximum block size, which must also work when !frame and fParams are unset.
2100
+ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
2101
+ */
2102
+ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx));
2103
+ size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart);
2104
+ /* isLongOffset must be true if there are long offsets.
2105
+ * Offsets are long if they are larger than ZSTD_maxShortOffset().
2106
+ * We don't expect that to be the case in 64-bit mode.
2107
+ *
2108
+ * We check here to see if our history is large enough to allow long offsets.
2109
+ * If it isn't, then we can't possibly have (valid) long offsets. If the offset
2110
+ * is invalid, then it is okay to read it incorrectly.
2111
+ *
2112
+ * If isLongOffset is true, then we will later check our decoding table to see
2113
+ * if it is even possible to generate long offsets.
2114
+ */
2115
+ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
1475
2116
  /* These macros control at build-time which decompressor implementation
1476
2117
  * we use. If neither is defined, we do some inspection and dispatch at
1477
2118
  * runtime.
@@ -1479,6 +2120,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1479
2120
  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
1480
2121
  !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
1481
2122
  int usePrefetchDecoder = dctx->ddictIsCold;
2123
+ #else
2124
+ /* Set to 1 to avoid computing offset info if we don't need to.
2125
+ * Otherwise this value is ignored.
2126
+ */
2127
+ int usePrefetchDecoder = 1;
1482
2128
  #endif
1483
2129
  int nbSeq;
1484
2130
  size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
@@ -1486,40 +2132,58 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1486
2132
  ip += seqHSize;
1487
2133
  srcSize -= seqHSize;
1488
2134
 
1489
- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
2135
+ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
2136
+ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall,
2137
+ "invalid dst");
1490
2138
 
1491
- #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
1492
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
1493
- if ( !usePrefetchDecoder
1494
- && (!frame || (dctx->fParams.windowSize > (1<<24)))
1495
- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */
1496
- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
1497
- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
1498
- usePrefetchDecoder = (shareLongOffsets >= minShare);
2139
+ /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
2140
+ * compute information about the share of long offsets, and the maximum nbAdditionalBits.
2141
+ * NOTE: could probably use a larger nbSeq limit
2142
+ */
2143
+ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
2144
+ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
2145
+ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
2146
+ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
2147
+ * enough, then we know it is impossible to have too long an offset in this block, so we can
2148
+ * use the regular offset decoder.
2149
+ */
2150
+ isLongOffset = ZSTD_lo_isRegularOffset;
2151
+ }
2152
+ if (!usePrefetchDecoder) {
2153
+ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
2154
+ usePrefetchDecoder = (info.longOffsetShare >= minShare);
2155
+ }
1499
2156
  }
1500
- #endif
1501
2157
 
1502
2158
  dctx->ddictIsCold = 0;
1503
2159
 
1504
2160
  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
1505
2161
  !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
1506
- if (usePrefetchDecoder)
2162
+ if (usePrefetchDecoder) {
2163
+ #else
2164
+ (void)usePrefetchDecoder;
2165
+ {
1507
2166
  #endif
1508
2167
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
1509
- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
2168
+ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
1510
2169
  #endif
2170
+ }
1511
2171
 
1512
2172
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1513
2173
  /* else */
1514
- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
2174
+ if (dctx->litBufferLocation == ZSTD_split)
2175
+ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
2176
+ else
2177
+ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
1515
2178
  #endif
1516
2179
  }
1517
2180
  }
1518
2181
 
1519
2182
 
1520
- void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
2183
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
2184
+ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
1521
2185
  {
1522
- if (dst != dctx->previousDstEnd) { /* not contiguous */
2186
+ if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */
1523
2187
  dctx->dictEnd = dctx->previousDstEnd;
1524
2188
  dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
1525
2189
  dctx->prefixStart = dst;
@@ -1528,13 +2192,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
1528
2192
  }
1529
2193
 
1530
2194
 
1531
- size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
1532
- void* dst, size_t dstCapacity,
1533
- const void* src, size_t srcSize)
2195
+ size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
2196
+ void* dst, size_t dstCapacity,
2197
+ const void* src, size_t srcSize)
1534
2198
  {
1535
2199
  size_t dSize;
1536
- ZSTD_checkContinuity(dctx, dst);
1537
- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
2200
+ dctx->isFrameDecompression = 0;
2201
+ ZSTD_checkContinuity(dctx, dst, dstCapacity);
2202
+ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming);
2203
+ FORWARD_IF_ERROR(dSize, "");
1538
2204
  dctx->previousDstEnd = (char*)dst + dSize;
1539
2205
  return dSize;
1540
2206
  }
2207
+
2208
+
2209
+ /* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */
2210
+ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
2211
+ void* dst, size_t dstCapacity,
2212
+ const void* src, size_t srcSize)
2213
+ {
2214
+ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize);
2215
+ }