zstd-ruby 1.5.0.0 → 1.5.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +2 -2
  3. data/README.md +1 -1
  4. data/ext/zstdruby/extconf.rb +1 -0
  5. data/ext/zstdruby/libzstd/Makefile +50 -175
  6. data/ext/zstdruby/libzstd/README.md +7 -1
  7. data/ext/zstdruby/libzstd/common/bitstream.h +24 -9
  8. data/ext/zstdruby/libzstd/common/compiler.h +89 -43
  9. data/ext/zstdruby/libzstd/common/entropy_common.c +11 -5
  10. data/ext/zstdruby/libzstd/common/error_private.h +79 -0
  11. data/ext/zstdruby/libzstd/common/fse.h +2 -1
  12. data/ext/zstdruby/libzstd/common/fse_decompress.c +1 -1
  13. data/ext/zstdruby/libzstd/common/huf.h +24 -22
  14. data/ext/zstdruby/libzstd/common/mem.h +18 -0
  15. data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
  16. data/ext/zstdruby/libzstd/common/xxhash.c +5 -805
  17. data/ext/zstdruby/libzstd/common/xxhash.h +5568 -167
  18. data/ext/zstdruby/libzstd/common/zstd_internal.h +92 -88
  19. data/ext/zstdruby/libzstd/common/zstd_trace.h +12 -3
  20. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  21. data/ext/zstdruby/libzstd/compress/fse_compress.c +63 -27
  22. data/ext/zstdruby/libzstd/compress/huf_compress.c +537 -104
  23. data/ext/zstdruby/libzstd/compress/zstd_compress.c +194 -278
  24. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +102 -44
  25. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +4 -3
  26. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +3 -1
  27. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +5 -4
  28. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +3 -2
  29. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +3 -3
  30. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +289 -114
  31. data/ext/zstdruby/libzstd/compress/zstd_fast.c +302 -123
  32. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +418 -502
  33. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +4 -4
  34. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  35. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +4 -1
  36. data/ext/zstdruby/libzstd/compress/zstd_opt.c +186 -108
  37. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +59 -29
  38. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +727 -189
  39. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
  40. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +85 -22
  41. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +744 -220
  42. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +8 -2
  43. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +34 -3
  44. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +23 -3
  45. data/ext/zstdruby/libzstd/dictBuilder/cover.c +9 -2
  46. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +11 -4
  47. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +99 -28
  48. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +2 -6
  49. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +3 -7
  50. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +3 -7
  51. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +3 -7
  52. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +3 -7
  53. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +3 -7
  54. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +3 -7
  55. data/ext/zstdruby/libzstd/libzstd.mk +185 -0
  56. data/ext/zstdruby/libzstd/libzstd.pc.in +1 -0
  57. data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
  58. data/ext/zstdruby/libzstd/zdict.h +4 -4
  59. data/ext/zstdruby/libzstd/zstd.h +179 -136
  60. data/ext/zstdruby/zstdruby.c +2 -2
  61. data/lib/zstd-ruby/version.rb +1 -1
  62. metadata +8 -3
@@ -69,15 +69,56 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
69
69
  }
70
70
  }
71
71
 
72
+ /* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
73
+ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
74
+ const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
75
+ {
76
+ if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH)
77
+ {
78
+ /* room for litbuffer to fit without read faulting */
79
+ dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;
80
+ dctx->litBufferEnd = dctx->litBuffer + litSize;
81
+ dctx->litBufferLocation = ZSTD_in_dst;
82
+ }
83
+ else if (litSize > ZSTD_LITBUFFEREXTRASIZE)
84
+ {
85
+ /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
86
+ if (splitImmediately) {
87
+ /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
88
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
89
+ dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
90
+ }
91
+ else {
92
+ /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */
93
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
94
+ dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
95
+ }
96
+ dctx->litBufferLocation = ZSTD_split;
97
+ }
98
+ else
99
+ {
100
+ /* fits entirely within litExtraBuffer, so no split is necessary */
101
+ dctx->litBuffer = dctx->litExtraBuffer;
102
+ dctx->litBufferEnd = dctx->litBuffer + litSize;
103
+ dctx->litBufferLocation = ZSTD_not_in_dst;
104
+ }
105
+ }
72
106
 
73
107
  /* Hidden declaration for fullbench */
74
108
  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
75
- const void* src, size_t srcSize);
109
+ const void* src, size_t srcSize,
110
+ void* dst, size_t dstCapacity, const streaming_operation streaming);
76
111
  /*! ZSTD_decodeLiteralsBlock() :
112
+ * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
113
+ * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current
114
+ * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
115
+ * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
116
+ *
77
117
  * @return : nb of bytes read from src (< srcSize )
78
118
  * note : symbol not declared but exposed for fullbench */
79
119
  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
80
- const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
120
+ const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */
121
+ void* dst, size_t dstCapacity, const streaming_operation streaming)
81
122
  {
82
123
  DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
83
124
  RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");
@@ -90,7 +131,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
90
131
  case set_repeat:
91
132
  DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
92
133
  RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
93
- /* fall-through */
134
+ ZSTD_FALLTHROUGH;
94
135
 
95
136
  case set_compressed:
96
137
  RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
@@ -99,6 +140,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
99
140
  U32 const lhlCode = (istart[0] >> 2) & 3;
100
141
  U32 const lhc = MEM_readLE32(istart);
101
142
  size_t hufSuccess;
143
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
102
144
  switch(lhlCode)
103
145
  {
104
146
  case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -121,8 +163,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
121
163
  litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
122
164
  break;
123
165
  }
166
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
124
167
  RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
125
168
  RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
169
+ RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
170
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);
126
171
 
127
172
  /* prefetch huffman table if cold */
128
173
  if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
@@ -133,11 +178,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
133
178
  if (singleStream) {
134
179
  hufSuccess = HUF_decompress1X_usingDTable_bmi2(
135
180
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
136
- dctx->HUFptr, dctx->bmi2);
181
+ dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
137
182
  } else {
138
183
  hufSuccess = HUF_decompress4X_usingDTable_bmi2(
139
184
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
140
- dctx->HUFptr, dctx->bmi2);
185
+ dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
141
186
  }
142
187
  } else {
143
188
  if (singleStream) {
@@ -150,15 +195,22 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
150
195
  hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
151
196
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
152
197
  istart+lhSize, litCSize, dctx->workspace,
153
- sizeof(dctx->workspace), dctx->bmi2);
198
+ sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
154
199
  #endif
155
200
  } else {
156
201
  hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
157
202
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
158
203
  istart+lhSize, litCSize, dctx->workspace,
159
- sizeof(dctx->workspace), dctx->bmi2);
204
+ sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
160
205
  }
161
206
  }
207
+ if (dctx->litBufferLocation == ZSTD_split)
208
+ {
209
+ ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
210
+ ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
211
+ dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
212
+ dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
213
+ }
162
214
 
163
215
  RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");
164
216
 
@@ -166,13 +218,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
166
218
  dctx->litSize = litSize;
167
219
  dctx->litEntropy = 1;
168
220
  if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
169
- ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
170
221
  return litCSize + lhSize;
171
222
  }
172
223
 
173
224
  case set_basic:
174
225
  { size_t litSize, lhSize;
175
226
  U32 const lhlCode = ((istart[0]) >> 2) & 3;
227
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
176
228
  switch(lhlCode)
177
229
  {
178
230
  case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -189,23 +241,36 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
189
241
  break;
190
242
  }
191
243
 
244
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
245
+ RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
246
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
192
247
  if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
193
248
  RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
194
- ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize);
249
+ if (dctx->litBufferLocation == ZSTD_split)
250
+ {
251
+ ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
252
+ ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
253
+ }
254
+ else
255
+ {
256
+ ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
257
+ }
195
258
  dctx->litPtr = dctx->litBuffer;
196
259
  dctx->litSize = litSize;
197
- ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
198
260
  return lhSize+litSize;
199
261
  }
200
262
  /* direct reference into compressed stream */
201
263
  dctx->litPtr = istart+lhSize;
202
264
  dctx->litSize = litSize;
265
+ dctx->litBufferEnd = dctx->litPtr + litSize;
266
+ dctx->litBufferLocation = ZSTD_not_in_dst;
203
267
  return lhSize+litSize;
204
268
  }
205
269
 
206
270
  case set_rle:
207
271
  { U32 const lhlCode = ((istart[0]) >> 2) & 3;
208
272
  size_t litSize, lhSize;
273
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
209
274
  switch(lhlCode)
210
275
  {
211
276
  case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -222,8 +287,19 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
222
287
  RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
223
288
  break;
224
289
  }
290
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
225
291
  RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
226
- ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
292
+ RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
293
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
294
+ if (dctx->litBufferLocation == ZSTD_split)
295
+ {
296
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
297
+ ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
298
+ }
299
+ else
300
+ {
301
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
302
+ }
227
303
  dctx->litPtr = dctx->litBuffer;
228
304
  dctx->litSize = litSize;
229
305
  return lhSize+1;
@@ -343,7 +419,7 @@ static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
343
419
  }; /* ML_defaultDTable */
344
420
 
345
421
 
346
- static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
422
+ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
347
423
  {
348
424
  void* ptr = dt;
349
425
  ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
@@ -355,7 +431,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
355
431
  cell->nbBits = 0;
356
432
  cell->nextState = 0;
357
433
  assert(nbAddBits < 255);
358
- cell->nbAdditionalBits = (BYTE)nbAddBits;
434
+ cell->nbAdditionalBits = nbAddBits;
359
435
  cell->baseValue = baseValue;
360
436
  }
361
437
 
@@ -367,7 +443,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
367
443
  FORCE_INLINE_TEMPLATE
368
444
  void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
369
445
  const short* normalizedCounter, unsigned maxSymbolValue,
370
- const U32* baseValue, const U32* nbAdditionalBits,
446
+ const U32* baseValue, const U8* nbAdditionalBits,
371
447
  unsigned tableLog, void* wksp, size_t wkspSize)
372
448
  {
373
449
  ZSTD_seqSymbol* const tableDecode = dt+1;
@@ -478,7 +554,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
478
554
  tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
479
555
  tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
480
556
  assert(nbAdditionalBits[symbol] < 255);
481
- tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
557
+ tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
482
558
  tableDecode[u].baseValue = baseValue[symbol];
483
559
  }
484
560
  }
@@ -487,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
487
563
  /* Avoids the FORCE_INLINE of the _body() function. */
488
564
  static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
489
565
  const short* normalizedCounter, unsigned maxSymbolValue,
490
- const U32* baseValue, const U32* nbAdditionalBits,
566
+ const U32* baseValue, const U8* nbAdditionalBits,
491
567
  unsigned tableLog, void* wksp, size_t wkspSize)
492
568
  {
493
569
  ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
@@ -495,9 +571,9 @@ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
495
571
  }
496
572
 
497
573
  #if DYNAMIC_BMI2
498
- TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
574
+ BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
499
575
  const short* normalizedCounter, unsigned maxSymbolValue,
500
- const U32* baseValue, const U32* nbAdditionalBits,
576
+ const U32* baseValue, const U8* nbAdditionalBits,
501
577
  unsigned tableLog, void* wksp, size_t wkspSize)
502
578
  {
503
579
  ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
@@ -507,7 +583,7 @@ TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol
507
583
 
508
584
  void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
509
585
  const short* normalizedCounter, unsigned maxSymbolValue,
510
- const U32* baseValue, const U32* nbAdditionalBits,
586
+ const U32* baseValue, const U8* nbAdditionalBits,
511
587
  unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
512
588
  {
513
589
  #if DYNAMIC_BMI2
@@ -529,7 +605,7 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
529
605
  static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
530
606
  symbolEncodingType_e type, unsigned max, U32 maxLog,
531
607
  const void* src, size_t srcSize,
532
- const U32* baseValue, const U32* nbAdditionalBits,
608
+ const U32* baseValue, const U8* nbAdditionalBits,
533
609
  const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
534
610
  int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
535
611
  int bmi2)
@@ -541,7 +617,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
541
617
  RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
542
618
  { U32 const symbol = *(const BYTE*)src;
543
619
  U32 const baseline = baseValue[symbol];
544
- U32 const nbBits = nbAdditionalBits[symbol];
620
+ U8 const nbBits = nbAdditionalBits[symbol];
545
621
  ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
546
622
  }
547
623
  *DTablePtr = DTableSpace;
@@ -620,7 +696,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
620
696
  LL_defaultDTable, dctx->fseEntropy,
621
697
  dctx->ddictIsCold, nbSeq,
622
698
  dctx->workspace, sizeof(dctx->workspace),
623
- dctx->bmi2);
699
+ ZSTD_DCtx_get_bmi2(dctx));
624
700
  RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
625
701
  ip += llhSize;
626
702
  }
@@ -632,7 +708,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
632
708
  OF_defaultDTable, dctx->fseEntropy,
633
709
  dctx->ddictIsCold, nbSeq,
634
710
  dctx->workspace, sizeof(dctx->workspace),
635
- dctx->bmi2);
711
+ ZSTD_DCtx_get_bmi2(dctx));
636
712
  RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
637
713
  ip += ofhSize;
638
714
  }
@@ -644,7 +720,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
644
720
  ML_defaultDTable, dctx->fseEntropy,
645
721
  dctx->ddictIsCold, nbSeq,
646
722
  dctx->workspace, sizeof(dctx->workspace),
647
- dctx->bmi2);
723
+ ZSTD_DCtx_get_bmi2(dctx));
648
724
  RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
649
725
  ip += mlhSize;
650
726
  }
@@ -713,7 +789,7 @@ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
713
789
  * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
714
790
  * The src buffer must be before the dst buffer.
715
791
  */
716
- static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
792
+ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
717
793
  ptrdiff_t const diff = op - ip;
718
794
  BYTE* const oend = op + length;
719
795
 
@@ -729,6 +805,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
729
805
  /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
730
806
  assert(length >= 8);
731
807
  ZSTD_overlapCopy8(&op, &ip, diff);
808
+ length -= 8;
732
809
  assert(op - ip >= 8);
733
810
  assert(op <= oend);
734
811
  }
@@ -743,12 +820,35 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
743
820
  assert(oend > oend_w);
744
821
  ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
745
822
  ip += oend_w - op;
746
- op = oend_w;
823
+ op += oend_w - op;
747
824
  }
748
825
  /* Handle the leftovers. */
749
826
  while (op < oend) *op++ = *ip++;
750
827
  }
751
828
 
829
+ /* ZSTD_safecopyDstBeforeSrc():
830
+ * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
831
+ * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
832
+ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) {
833
+ ptrdiff_t const diff = op - ip;
834
+ BYTE* const oend = op + length;
835
+
836
+ if (length < 8 || diff > -8) {
837
+ /* Handle short lengths, close overlaps, and dst not before src. */
838
+ while (op < oend) *op++ = *ip++;
839
+ return;
840
+ }
841
+
842
+ if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
843
+ ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
844
+ ip += oend - WILDCOPY_OVERLENGTH - op;
845
+ op += oend - WILDCOPY_OVERLENGTH - op;
846
+ }
847
+
848
+ /* Handle the leftovers. */
849
+ while (op < oend) *op++ = *ip++;
850
+ }
851
+
752
852
  /* ZSTD_execSequenceEnd():
753
853
  * This version handles cases that are near the end of the output buffer. It requires
754
854
  * more careful checks to make sure there is no overflow. By separating out these hard
@@ -759,9 +859,9 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
759
859
  */
760
860
  FORCE_NOINLINE
761
861
  size_t ZSTD_execSequenceEnd(BYTE* op,
762
- BYTE* const oend, seq_t sequence,
763
- const BYTE** litPtr, const BYTE* const litLimit,
764
- const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
862
+ BYTE* const oend, seq_t sequence,
863
+ const BYTE** litPtr, const BYTE* const litLimit,
864
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
765
865
  {
766
866
  BYTE* const oLitEnd = op + sequence.litLength;
767
867
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -784,27 +884,76 @@ size_t ZSTD_execSequenceEnd(BYTE* op,
784
884
  if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
785
885
  /* offset beyond prefix */
786
886
  RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
787
- match = dictEnd - (prefixStart-match);
887
+ match = dictEnd - (prefixStart - match);
788
888
  if (match + sequence.matchLength <= dictEnd) {
789
889
  ZSTD_memmove(oLitEnd, match, sequence.matchLength);
790
890
  return sequenceLength;
791
891
  }
792
892
  /* span extDict & currentPrefixSegment */
793
893
  { size_t const length1 = dictEnd - match;
794
- ZSTD_memmove(oLitEnd, match, length1);
795
- op = oLitEnd + length1;
796
- sequence.matchLength -= length1;
797
- match = prefixStart;
798
- } }
894
+ ZSTD_memmove(oLitEnd, match, length1);
895
+ op = oLitEnd + length1;
896
+ sequence.matchLength -= length1;
897
+ match = prefixStart;
898
+ }
899
+ }
900
+ ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
901
+ return sequenceLength;
902
+ }
903
+
904
+ /* ZSTD_execSequenceEndSplitLitBuffer():
905
+ * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case.
906
+ */
907
+ FORCE_NOINLINE
908
+ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
909
+ BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
910
+ const BYTE** litPtr, const BYTE* const litLimit,
911
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
912
+ {
913
+ BYTE* const oLitEnd = op + sequence.litLength;
914
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
915
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
916
+ const BYTE* match = oLitEnd - sequence.offset;
917
+
918
+
919
+ /* bounds checks : careful of address space overflow in 32-bit mode */
920
+ RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
921
+ RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
922
+ assert(op < op + sequenceLength);
923
+ assert(oLitEnd < op + sequenceLength);
924
+
925
+ /* copy literals */
926
+ RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
927
+ ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
928
+ op = oLitEnd;
929
+ *litPtr = iLitEnd;
930
+
931
+ /* copy Match */
932
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
933
+ /* offset beyond prefix */
934
+ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
935
+ match = dictEnd - (prefixStart - match);
936
+ if (match + sequence.matchLength <= dictEnd) {
937
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
938
+ return sequenceLength;
939
+ }
940
+ /* span extDict & currentPrefixSegment */
941
+ { size_t const length1 = dictEnd - match;
942
+ ZSTD_memmove(oLitEnd, match, length1);
943
+ op = oLitEnd + length1;
944
+ sequence.matchLength -= length1;
945
+ match = prefixStart;
946
+ }
947
+ }
799
948
  ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
800
949
  return sequenceLength;
801
950
  }
802
951
 
803
952
  HINT_INLINE
804
953
  size_t ZSTD_execSequence(BYTE* op,
805
- BYTE* const oend, seq_t sequence,
806
- const BYTE** litPtr, const BYTE* const litLimit,
807
- const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
954
+ BYTE* const oend, seq_t sequence,
955
+ const BYTE** litPtr, const BYTE* const litLimit,
956
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
808
957
  {
809
958
  BYTE* const oLitEnd = op + sequence.litLength;
810
959
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -813,6 +962,98 @@ size_t ZSTD_execSequence(BYTE* op,
813
962
  const BYTE* const iLitEnd = *litPtr + sequence.litLength;
814
963
  const BYTE* match = oLitEnd - sequence.offset;
815
964
 
965
+ assert(op != NULL /* Precondition */);
966
+ assert(oend_w < oend /* No underflow */);
967
+ /* Handle edge cases in a slow path:
968
+ * - Read beyond end of literals
969
+ * - Match end is within WILDCOPY_OVERLIMIT of oend
970
+ * - 32-bit mode and the match length overflows
971
+ */
972
+ if (UNLIKELY(
973
+ iLitEnd > litLimit ||
974
+ oMatchEnd > oend_w ||
975
+ (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
976
+ return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
977
+
978
+ /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
979
+ assert(op <= oLitEnd /* No overflow */);
980
+ assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
981
+ assert(oMatchEnd <= oend /* No underflow */);
982
+ assert(iLitEnd <= litLimit /* Literal length is in bounds */);
983
+ assert(oLitEnd <= oend_w /* Can wildcopy literals */);
984
+ assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
985
+
986
+ /* Copy Literals:
987
+ * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
988
+ * We likely don't need the full 32-byte wildcopy.
989
+ */
990
+ assert(WILDCOPY_OVERLENGTH >= 16);
991
+ ZSTD_copy16(op, (*litPtr));
992
+ if (UNLIKELY(sequence.litLength > 16)) {
993
+ ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
994
+ }
995
+ op = oLitEnd;
996
+ *litPtr = iLitEnd; /* update for next sequence */
997
+
998
+ /* Copy Match */
999
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
1000
+ /* offset beyond prefix -> go into extDict */
1001
+ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
1002
+ match = dictEnd + (match - prefixStart);
1003
+ if (match + sequence.matchLength <= dictEnd) {
1004
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
1005
+ return sequenceLength;
1006
+ }
1007
+ /* span extDict & currentPrefixSegment */
1008
+ { size_t const length1 = dictEnd - match;
1009
+ ZSTD_memmove(oLitEnd, match, length1);
1010
+ op = oLitEnd + length1;
1011
+ sequence.matchLength -= length1;
1012
+ match = prefixStart;
1013
+ }
1014
+ }
1015
+ /* Match within prefix of 1 or more bytes */
1016
+ assert(op <= oMatchEnd);
1017
+ assert(oMatchEnd <= oend_w);
1018
+ assert(match >= prefixStart);
1019
+ assert(sequence.matchLength >= 1);
1020
+
1021
+ /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
1022
+ * without overlap checking.
1023
+ */
1024
+ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
1025
+ /* We bet on a full wildcopy for matches, since we expect matches to be
1026
+ * longer than literals (in general). In silesia, ~10% of matches are longer
1027
+ * than 16 bytes.
1028
+ */
1029
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
1030
+ return sequenceLength;
1031
+ }
1032
+ assert(sequence.offset < WILDCOPY_VECLEN);
1033
+
1034
+ /* Copy 8 bytes and spread the offset to be >= 8. */
1035
+ ZSTD_overlapCopy8(&op, &match, sequence.offset);
1036
+
1037
+ /* If the match length is > 8 bytes, then continue with the wildcopy. */
1038
+ if (sequence.matchLength > 8) {
1039
+ assert(op < oMatchEnd);
1040
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
1041
+ }
1042
+ return sequenceLength;
1043
+ }
1044
+
1045
+ HINT_INLINE
1046
+ size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
1047
+ BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
1048
+ const BYTE** litPtr, const BYTE* const litLimit,
1049
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
1050
+ {
1051
+ BYTE* const oLitEnd = op + sequence.litLength;
1052
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
1053
+ BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
1054
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
1055
+ const BYTE* match = oLitEnd - sequence.offset;
1056
+
816
1057
  assert(op != NULL /* Precondition */);
817
1058
  assert(oend_w < oend /* No underflow */);
818
1059
  /* Handle edge cases in a slow path:
@@ -824,7 +1065,7 @@ size_t ZSTD_execSequence(BYTE* op,
824
1065
  iLitEnd > litLimit ||
825
1066
  oMatchEnd > oend_w ||
826
1067
  (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
827
- return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
1068
+ return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
828
1069
 
829
1070
  /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
830
1071
  assert(op <= oLitEnd /* No overflow */);
@@ -892,6 +1133,7 @@ size_t ZSTD_execSequence(BYTE* op,
892
1133
  return sequenceLength;
893
1134
  }
894
1135
 
1136
+
895
1137
  static void
896
1138
  ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
897
1139
  {
@@ -905,20 +1147,10 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS
905
1147
  }
906
1148
 
907
1149
  FORCE_INLINE_TEMPLATE void
908
- ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
909
- {
910
- ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state];
911
- U32 const nbBits = DInfo.nbBits;
912
- size_t const lowBits = BIT_readBits(bitD, nbBits);
913
- DStatePtr->state = DInfo.nextState + lowBits;
914
- }
915
-
916
- FORCE_INLINE_TEMPLATE void
917
- ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo)
1150
+ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)
918
1151
  {
919
- U32 const nbBits = DInfo.nbBits;
920
1152
  size_t const lowBits = BIT_readBits(bitD, nbBits);
921
- DStatePtr->state = DInfo.nextState + lowBits;
1153
+ DStatePtr->state = nextState + lowBits;
922
1154
  }
923
1155
 
924
1156
  /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
@@ -937,102 +1169,100 @@ FORCE_INLINE_TEMPLATE seq_t
937
1169
  ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
938
1170
  {
939
1171
  seq_t seq;
940
- ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
941
- ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state];
942
- ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state];
943
- U32 const llBase = llDInfo.baseValue;
944
- U32 const mlBase = mlDInfo.baseValue;
945
- U32 const ofBase = ofDInfo.baseValue;
946
- BYTE const llBits = llDInfo.nbAdditionalBits;
947
- BYTE const mlBits = mlDInfo.nbAdditionalBits;
948
- BYTE const ofBits = ofDInfo.nbAdditionalBits;
949
- BYTE const totalBits = llBits+mlBits+ofBits;
950
-
951
- /* sequence */
952
- { size_t offset;
953
- if (ofBits > 1) {
954
- ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
955
- ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
956
- assert(ofBits <= MaxOff);
957
- if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
958
- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
959
- offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
960
- BIT_reloadDStream(&seqState->DStream);
961
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
962
- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
963
- } else {
964
- offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
965
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
966
- }
967
- seqState->prevOffset[2] = seqState->prevOffset[1];
968
- seqState->prevOffset[1] = seqState->prevOffset[0];
969
- seqState->prevOffset[0] = offset;
970
- } else {
971
- U32 const ll0 = (llBase == 0);
972
- if (LIKELY((ofBits == 0))) {
973
- if (LIKELY(!ll0))
974
- offset = seqState->prevOffset[0];
975
- else {
976
- offset = seqState->prevOffset[1];
977
- seqState->prevOffset[1] = seqState->prevOffset[0];
978
- seqState->prevOffset[0] = offset;
1172
+ const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
1173
+ const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
1174
+ const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
1175
+ seq.matchLength = mlDInfo->baseValue;
1176
+ seq.litLength = llDInfo->baseValue;
1177
+ { U32 const ofBase = ofDInfo->baseValue;
1178
+ BYTE const llBits = llDInfo->nbAdditionalBits;
1179
+ BYTE const mlBits = mlDInfo->nbAdditionalBits;
1180
+ BYTE const ofBits = ofDInfo->nbAdditionalBits;
1181
+ BYTE const totalBits = llBits+mlBits+ofBits;
1182
+
1183
+ U16 const llNext = llDInfo->nextState;
1184
+ U16 const mlNext = mlDInfo->nextState;
1185
+ U16 const ofNext = ofDInfo->nextState;
1186
+ U32 const llnbBits = llDInfo->nbBits;
1187
+ U32 const mlnbBits = mlDInfo->nbBits;
1188
+ U32 const ofnbBits = ofDInfo->nbBits;
1189
+ /*
1190
+ * As gcc has better branch and block analyzers, sometimes it is only
1191
+ * valuable to mark likelyness for clang, it gives around 3-4% of
1192
+ * performance.
1193
+ */
1194
+
1195
+ /* sequence */
1196
+ { size_t offset;
1197
+ #if defined(__clang__)
1198
+ if (LIKELY(ofBits > 1)) {
1199
+ #else
1200
+ if (ofBits > 1) {
1201
+ #endif
1202
+ ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
1203
+ ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
1204
+ assert(ofBits <= MaxOff);
1205
+ if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
1206
+ U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
1207
+ offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
1208
+ BIT_reloadDStream(&seqState->DStream);
1209
+ if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
1210
+ assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
1211
+ } else {
1212
+ offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
1213
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
979
1214
  }
1215
+ seqState->prevOffset[2] = seqState->prevOffset[1];
1216
+ seqState->prevOffset[1] = seqState->prevOffset[0];
1217
+ seqState->prevOffset[0] = offset;
980
1218
  } else {
981
- offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
982
- { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
983
- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
984
- if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
985
- seqState->prevOffset[1] = seqState->prevOffset[0];
986
- seqState->prevOffset[0] = offset = temp;
987
- } } }
988
- seq.offset = offset;
989
- }
990
-
991
- seq.matchLength = mlBase;
992
- if (mlBits > 0)
993
- seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
994
-
995
- if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
996
- BIT_reloadDStream(&seqState->DStream);
997
- if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
998
- BIT_reloadDStream(&seqState->DStream);
999
- /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
1000
- ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
1001
-
1002
- seq.litLength = llBase;
1003
- if (llBits > 0)
1004
- seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
1005
-
1006
- if (MEM_32bits())
1007
- BIT_reloadDStream(&seqState->DStream);
1008
-
1009
- DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
1010
- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
1011
-
1012
- /* ANS state update
1013
- * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
1014
- * clang-9.2.0 does 7% worse with ZSTD_updateFseState().
1015
- * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the
1016
- * better option, so it is the default for other compilers. But, if you
1017
- * measure that it is worse, please put up a pull request.
1018
- */
1019
- {
1020
- #if defined(__GNUC__) && !defined(__clang__)
1021
- const int kUseUpdateFseState = 1;
1022
- #else
1023
- const int kUseUpdateFseState = 0;
1024
- #endif
1025
- if (kUseUpdateFseState) {
1026
- ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
1027
- ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
1028
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
1029
- ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
1030
- } else {
1031
- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */
1032
- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */
1033
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
1034
- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */
1219
+ U32 const ll0 = (llDInfo->baseValue == 0);
1220
+ if (LIKELY((ofBits == 0))) {
1221
+ offset = seqState->prevOffset[ll0];
1222
+ seqState->prevOffset[1] = seqState->prevOffset[!ll0];
1223
+ seqState->prevOffset[0] = offset;
1224
+ } else {
1225
+ offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
1226
+ { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
1227
+ temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
1228
+ if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
1229
+ seqState->prevOffset[1] = seqState->prevOffset[0];
1230
+ seqState->prevOffset[0] = offset = temp;
1231
+ } } }
1232
+ seq.offset = offset;
1035
1233
  }
1234
+
1235
+ #if defined(__clang__)
1236
+ if (UNLIKELY(mlBits > 0))
1237
+ #else
1238
+ if (mlBits > 0)
1239
+ #endif
1240
+ seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
1241
+
1242
+ if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
1243
+ BIT_reloadDStream(&seqState->DStream);
1244
+ if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
1245
+ BIT_reloadDStream(&seqState->DStream);
1246
+ /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
1247
+ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
1248
+
1249
+ #if defined(__clang__)
1250
+ if (UNLIKELY(llBits > 0))
1251
+ #else
1252
+ if (llBits > 0)
1253
+ #endif
1254
+ seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
1255
+
1256
+ if (MEM_32bits())
1257
+ BIT_reloadDStream(&seqState->DStream);
1258
+
1259
+ DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
1260
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
1261
+
1262
+ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */
1263
+ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */
1264
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
1265
+ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */
1036
1266
  }
1037
1267
 
1038
1268
  return seq;
@@ -1085,9 +1315,11 @@ MEM_STATIC void ZSTD_assertValidSequence(
1085
1315
  #endif
1086
1316
 
1087
1317
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1318
+
1319
+
1088
1320
  FORCE_INLINE_TEMPLATE size_t
1089
1321
  DONT_VECTORIZE
1090
- ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1322
+ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
1091
1323
  void* dst, size_t maxDstSize,
1092
1324
  const void* seqStart, size_t seqSize, int nbSeq,
1093
1325
  const ZSTD_longOffset_e isLongOffset,
@@ -1099,11 +1331,11 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1099
1331
  BYTE* const oend = ostart + maxDstSize;
1100
1332
  BYTE* op = ostart;
1101
1333
  const BYTE* litPtr = dctx->litPtr;
1102
- const BYTE* const litEnd = litPtr + dctx->litSize;
1334
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
1103
1335
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
1104
1336
  const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
1105
1337
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
1106
- DEBUGLOG(5, "ZSTD_decompressSequences_body");
1338
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");
1107
1339
  (void)frame;
1108
1340
 
1109
1341
  /* Regen sequences */
@@ -1124,55 +1356,237 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1124
1356
  BIT_DStream_endOfBuffer < BIT_DStream_completed &&
1125
1357
  BIT_DStream_completed < BIT_DStream_overflow);
1126
1358
 
1359
+ /* decompress without overrunning litPtr begins */
1360
+ {
1361
+ seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1362
+ /* Align the decompression loop to 32 + 16 bytes.
1363
+ *
1364
+ * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
1365
+ * speed swings based on the alignment of the decompression loop. This
1366
+ * performance swing is caused by parts of the decompression loop falling
1367
+ * out of the DSB. The entire decompression loop should fit in the DSB,
1368
+ * when it can't we get much worse performance. You can measure if you've
1369
+ * hit the good case or the bad case with this perf command for some
1370
+ * compressed file test.zst:
1371
+ *
1372
+ * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
1373
+ * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
1374
+ *
1375
+ * If you see most cycles served out of the MITE you've hit the bad case.
1376
+ * If you see most cycles served out of the DSB you've hit the good case.
1377
+ * If it is pretty even then you may be in an okay case.
1378
+ *
1379
+ * This issue has been reproduced on the following CPUs:
1380
+ * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1381
+ * Use Instruments->Counters to get DSB/MITE cycles.
1382
+ * I never got performance swings, but I was able to
1383
+ * go from the good case of mostly DSB to half of the
1384
+ * cycles served from MITE.
1385
+ * - Coffeelake: Intel i9-9900k
1386
+ * - Coffeelake: Intel i7-9700k
1387
+ *
1388
+ * I haven't been able to reproduce the instability or DSB misses on any
1389
+ * of the following CPUS:
1390
+ * - Haswell
1391
+ * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
1392
+ * - Skylake
1393
+ *
1394
+ * Alignment is done for each of the three major decompression loops:
1395
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
1396
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
1397
+ * - ZSTD_decompressSequences_body
1398
+ * Alignment choices are made to minimize large swings on bad cases and influence on performance
1399
+ * from changes external to this code, rather than to overoptimize on the current commit.
1400
+ *
1401
+ * If you are seeing performance stability this script can help test.
1402
+ * It tests on 4 commits in zstd where I saw performance change.
1403
+ *
1404
+ * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
1405
+ */
1127
1406
  #if defined(__GNUC__) && defined(__x86_64__)
1128
- /* Align the decompression loop to 32 + 16 bytes.
1129
- *
1130
- * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
1131
- * speed swings based on the alignment of the decompression loop. This
1132
- * performance swing is caused by parts of the decompression loop falling
1133
- * out of the DSB. The entire decompression loop should fit in the DSB,
1134
- * when it can't we get much worse performance. You can measure if you've
1135
- * hit the good case or the bad case with this perf command for some
1136
- * compressed file test.zst:
1137
- *
1138
- * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
1139
- * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
1140
- *
1141
- * If you see most cycles served out of the MITE you've hit the bad case.
1142
- * If you see most cycles served out of the DSB you've hit the good case.
1143
- * If it is pretty even then you may be in an okay case.
1144
- *
1145
- * This issue has been reproduced on the following CPUs:
1146
- * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1147
- * Use Instruments->Counters to get DSB/MITE cycles.
1148
- * I never got performance swings, but I was able to
1149
- * go from the good case of mostly DSB to half of the
1150
- * cycles served from MITE.
1151
- * - Coffeelake: Intel i9-9900k
1152
- * - Coffeelake: Intel i7-9700k
1153
- *
1154
- * I haven't been able to reproduce the instability or DSB misses on any
1155
- * of the following CPUS:
1156
- * - Haswell
1157
- * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
1158
- * - Skylake
1159
- *
1160
- * If you are seeing performance stability this script can help test.
1161
- * It tests on 4 commits in zstd where I saw performance change.
1162
- *
1163
- * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
1164
- */
1165
- __asm__(".p2align 6");
1166
- __asm__("nop");
1167
- __asm__(".p2align 5");
1168
- __asm__("nop");
1169
- # if __GNUC__ >= 9
1170
- /* better for gcc-9 and gcc-10, worse for clang and gcc-8 */
1171
- __asm__(".p2align 3");
1407
+ __asm__(".p2align 6");
1408
+ # if __GNUC__ >= 7
1409
+ /* good for gcc-7, gcc-9, and gcc-11 */
1410
+ __asm__("nop");
1411
+ __asm__(".p2align 5");
1412
+ __asm__("nop");
1413
+ __asm__(".p2align 4");
1414
+ # if __GNUC__ == 8 || __GNUC__ == 10
1415
+ /* good for gcc-8 and gcc-10 */
1416
+ __asm__("nop");
1417
+ __asm__(".p2align 3");
1418
+ # endif
1419
+ # endif
1420
+ #endif
1421
+
1422
+ /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
1423
+ for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) {
1424
+ size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
1425
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1426
+ assert(!ZSTD_isError(oneSeqSize));
1427
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1428
+ #endif
1429
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1430
+ return oneSeqSize;
1431
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1432
+ op += oneSeqSize;
1433
+ if (UNLIKELY(!--nbSeq))
1434
+ break;
1435
+ BIT_reloadDStream(&(seqState.DStream));
1436
+ sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1437
+ }
1438
+
1439
+ /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */
1440
+ if (nbSeq > 0) {
1441
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
1442
+ if (leftoverLit)
1443
+ {
1444
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
1445
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
1446
+ sequence.litLength -= leftoverLit;
1447
+ op += leftoverLit;
1448
+ }
1449
+ litPtr = dctx->litExtraBuffer;
1450
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1451
+ dctx->litBufferLocation = ZSTD_not_in_dst;
1452
+ {
1453
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
1454
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1455
+ assert(!ZSTD_isError(oneSeqSize));
1456
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1457
+ #endif
1458
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1459
+ return oneSeqSize;
1460
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1461
+ op += oneSeqSize;
1462
+ if (--nbSeq)
1463
+ BIT_reloadDStream(&(seqState.DStream));
1464
+ }
1465
+ }
1466
+ }
1467
+
1468
+ if (nbSeq > 0) /* there is remaining lit from extra buffer */
1469
+ {
1470
+
1471
+ #if defined(__GNUC__) && defined(__x86_64__)
1472
+ __asm__(".p2align 6");
1473
+ __asm__("nop");
1474
+ # if __GNUC__ != 7
1475
+ /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */
1476
+ __asm__(".p2align 4");
1477
+ __asm__("nop");
1478
+ __asm__(".p2align 3");
1479
+ # elif __GNUC__ >= 11
1480
+ __asm__(".p2align 3");
1481
+ # else
1482
+ __asm__(".p2align 5");
1483
+ __asm__("nop");
1484
+ __asm__(".p2align 3");
1485
+ # endif
1486
+ #endif
1487
+
1488
+ for (; ; ) {
1489
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1490
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
1491
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1492
+ assert(!ZSTD_isError(oneSeqSize));
1493
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1494
+ #endif
1495
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1496
+ return oneSeqSize;
1497
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1498
+ op += oneSeqSize;
1499
+ if (UNLIKELY(!--nbSeq))
1500
+ break;
1501
+ BIT_reloadDStream(&(seqState.DStream));
1502
+ }
1503
+ }
1504
+
1505
+ /* check if reached exact end */
1506
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
1507
+ RETURN_ERROR_IF(nbSeq, corruption_detected, "");
1508
+ RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
1509
+ /* save reps for next block */
1510
+ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
1511
+ }
1512
+
1513
+ /* last literal segment */
1514
+ if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
1515
+ {
1516
+ size_t const lastLLSize = litBufferEnd - litPtr;
1517
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
1518
+ if (op != NULL) {
1519
+ ZSTD_memmove(op, litPtr, lastLLSize);
1520
+ op += lastLLSize;
1521
+ }
1522
+ litPtr = dctx->litExtraBuffer;
1523
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1524
+ dctx->litBufferLocation = ZSTD_not_in_dst;
1525
+ }
1526
+ { size_t const lastLLSize = litBufferEnd - litPtr;
1527
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
1528
+ if (op != NULL) {
1529
+ ZSTD_memcpy(op, litPtr, lastLLSize);
1530
+ op += lastLLSize;
1531
+ }
1532
+ }
1533
+
1534
+ return op-ostart;
1535
+ }
1536
+
1537
+ FORCE_INLINE_TEMPLATE size_t
1538
+ DONT_VECTORIZE
1539
+ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
1540
+ void* dst, size_t maxDstSize,
1541
+ const void* seqStart, size_t seqSize, int nbSeq,
1542
+ const ZSTD_longOffset_e isLongOffset,
1543
+ const int frame)
1544
+ {
1545
+ const BYTE* ip = (const BYTE*)seqStart;
1546
+ const BYTE* const iend = ip + seqSize;
1547
+ BYTE* const ostart = (BYTE*)dst;
1548
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;
1549
+ BYTE* op = ostart;
1550
+ const BYTE* litPtr = dctx->litPtr;
1551
+ const BYTE* const litEnd = litPtr + dctx->litSize;
1552
+ const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
1553
+ const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
1554
+ const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
1555
+ DEBUGLOG(5, "ZSTD_decompressSequences_body");
1556
+ (void)frame;
1557
+
1558
+ /* Regen sequences */
1559
+ if (nbSeq) {
1560
+ seqState_t seqState;
1561
+ dctx->fseEntropy = 1;
1562
+ { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
1563
+ RETURN_ERROR_IF(
1564
+ ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
1565
+ corruption_detected, "");
1566
+ ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
1567
+ ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
1568
+ ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
1569
+ assert(dst != NULL);
1570
+
1571
+ ZSTD_STATIC_ASSERT(
1572
+ BIT_DStream_unfinished < BIT_DStream_completed &&
1573
+ BIT_DStream_endOfBuffer < BIT_DStream_completed &&
1574
+ BIT_DStream_completed < BIT_DStream_overflow);
1575
+
1576
+ #if defined(__GNUC__) && defined(__x86_64__)
1577
+ __asm__(".p2align 6");
1578
+ __asm__("nop");
1579
+ # if __GNUC__ >= 7
1580
+ __asm__(".p2align 5");
1581
+ __asm__("nop");
1582
+ __asm__(".p2align 3");
1172
1583
  # else
1173
- __asm__(".p2align 4");
1584
+ __asm__(".p2align 4");
1585
+ __asm__("nop");
1586
+ __asm__(".p2align 3");
1174
1587
  # endif
1175
1588
  #endif
1589
+
1176
1590
  for ( ; ; ) {
1177
1591
  seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1178
1592
  size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
@@ -1218,6 +1632,16 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
1218
1632
  {
1219
1633
  return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1220
1634
  }
1635
+
1636
+ static size_t
1637
+ ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
1638
+ void* dst, size_t maxDstSize,
1639
+ const void* seqStart, size_t seqSize, int nbSeq,
1640
+ const ZSTD_longOffset_e isLongOffset,
1641
+ const int frame)
1642
+ {
1643
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1644
+ }
1221
1645
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1222
1646
 
1223
1647
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
@@ -1250,10 +1674,10 @@ ZSTD_decompressSequencesLong_body(
1250
1674
  const BYTE* ip = (const BYTE*)seqStart;
1251
1675
  const BYTE* const iend = ip + seqSize;
1252
1676
  BYTE* const ostart = (BYTE*)dst;
1253
- BYTE* const oend = ostart + maxDstSize;
1677
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;
1254
1678
  BYTE* op = ostart;
1255
1679
  const BYTE* litPtr = dctx->litPtr;
1256
- const BYTE* const litEnd = litPtr + dctx->litSize;
1680
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
1257
1681
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
1258
1682
  const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
1259
1683
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
@@ -1289,32 +1713,94 @@ ZSTD_decompressSequencesLong_body(
1289
1713
  }
1290
1714
  RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
1291
1715
 
1292
- /* decode and decompress */
1293
- for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
1294
- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1295
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
1716
+ /* decompress without stomping litBuffer */
1717
+ for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) {
1718
+ seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1719
+ size_t oneSeqSize;
1720
+
1721
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd)
1722
+ {
1723
+ /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
1724
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
1725
+ if (leftoverLit)
1726
+ {
1727
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
1728
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
1729
+ sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;
1730
+ op += leftoverLit;
1731
+ }
1732
+ litPtr = dctx->litExtraBuffer;
1733
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1734
+ dctx->litBufferLocation = ZSTD_not_in_dst;
1735
+ oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
1296
1736
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1297
- assert(!ZSTD_isError(oneSeqSize));
1298
- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
1737
+ assert(!ZSTD_isError(oneSeqSize));
1738
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
1299
1739
  #endif
1300
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1740
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1301
1741
 
1302
- prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
1303
- sequences[seqNb & STORED_SEQS_MASK] = sequence;
1304
- op += oneSeqSize;
1742
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
1743
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
1744
+ op += oneSeqSize;
1745
+ }
1746
+ else
1747
+ {
1748
+ /* lit buffer is either wholly contained in first or second split, or not split at all*/
1749
+ oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
1750
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
1751
+ ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
1752
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1753
+ assert(!ZSTD_isError(oneSeqSize));
1754
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
1755
+ #endif
1756
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1757
+
1758
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
1759
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
1760
+ op += oneSeqSize;
1761
+ }
1305
1762
  }
1306
1763
  RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");
1307
1764
 
1308
1765
  /* finish queue */
1309
1766
  seqNb -= seqAdvance;
1310
1767
  for ( ; seqNb<nbSeq ; seqNb++) {
1311
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
1768
+ seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
1769
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)
1770
+ {
1771
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
1772
+ if (leftoverLit)
1773
+ {
1774
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
1775
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
1776
+ sequence->litLength -= leftoverLit;
1777
+ op += leftoverLit;
1778
+ }
1779
+ litPtr = dctx->litExtraBuffer;
1780
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1781
+ dctx->litBufferLocation = ZSTD_not_in_dst;
1782
+ {
1783
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
1312
1784
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1313
- assert(!ZSTD_isError(oneSeqSize));
1314
- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
1785
+ assert(!ZSTD_isError(oneSeqSize));
1786
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
1315
1787
  #endif
1316
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1317
- op += oneSeqSize;
1788
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1789
+ op += oneSeqSize;
1790
+ }
1791
+ }
1792
+ else
1793
+ {
1794
+ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
1795
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
1796
+ ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
1797
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1798
+ assert(!ZSTD_isError(oneSeqSize));
1799
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
1800
+ #endif
1801
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1802
+ op += oneSeqSize;
1803
+ }
1318
1804
  }
1319
1805
 
1320
1806
  /* save reps for next block */
@@ -1322,10 +1808,21 @@ ZSTD_decompressSequencesLong_body(
1322
1808
  }
1323
1809
 
1324
1810
  /* last literal segment */
1325
- { size_t const lastLLSize = litEnd - litPtr;
1811
+ if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */
1812
+ {
1813
+ size_t const lastLLSize = litBufferEnd - litPtr;
1814
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
1815
+ if (op != NULL) {
1816
+ ZSTD_memmove(op, litPtr, lastLLSize);
1817
+ op += lastLLSize;
1818
+ }
1819
+ litPtr = dctx->litExtraBuffer;
1820
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1821
+ }
1822
+ { size_t const lastLLSize = litBufferEnd - litPtr;
1326
1823
  RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
1327
1824
  if (op != NULL) {
1328
- ZSTD_memcpy(op, litPtr, lastLLSize);
1825
+ ZSTD_memmove(op, litPtr, lastLLSize);
1329
1826
  op += lastLLSize;
1330
1827
  }
1331
1828
  }
@@ -1349,7 +1846,7 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
1349
1846
  #if DYNAMIC_BMI2
1350
1847
 
1351
1848
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1352
- static TARGET_ATTRIBUTE("bmi2") size_t
1849
+ static BMI2_TARGET_ATTRIBUTE size_t
1353
1850
  DONT_VECTORIZE
1354
1851
  ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
1355
1852
  void* dst, size_t maxDstSize,
@@ -1359,10 +1856,20 @@ ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
1359
1856
  {
1360
1857
  return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1361
1858
  }
1859
+ static BMI2_TARGET_ATTRIBUTE size_t
1860
+ DONT_VECTORIZE
1861
+ ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
1862
+ void* dst, size_t maxDstSize,
1863
+ const void* seqStart, size_t seqSize, int nbSeq,
1864
+ const ZSTD_longOffset_e isLongOffset,
1865
+ const int frame)
1866
+ {
1867
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1868
+ }
1362
1869
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1363
1870
 
1364
1871
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
1365
- static TARGET_ATTRIBUTE("bmi2") size_t
1872
+ static BMI2_TARGET_ATTRIBUTE size_t
1366
1873
  ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
1367
1874
  void* dst, size_t maxDstSize,
1368
1875
  const void* seqStart, size_t seqSize, int nbSeq,
@@ -1391,11 +1898,25 @@ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
1391
1898
  {
1392
1899
  DEBUGLOG(5, "ZSTD_decompressSequences");
1393
1900
  #if DYNAMIC_BMI2
1394
- if (dctx->bmi2) {
1901
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
1395
1902
  return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1396
1903
  }
1397
1904
  #endif
1398
- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1905
+ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1906
+ }
1907
+ static size_t
1908
+ ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
1909
+ const void* seqStart, size_t seqSize, int nbSeq,
1910
+ const ZSTD_longOffset_e isLongOffset,
1911
+ const int frame)
1912
+ {
1913
+ DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
1914
+ #if DYNAMIC_BMI2
1915
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
1916
+ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1917
+ }
1918
+ #endif
1919
+ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1399
1920
  }
1400
1921
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1401
1922
 
@@ -1415,7 +1936,7 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
1415
1936
  {
1416
1937
  DEBUGLOG(5, "ZSTD_decompressSequencesLong");
1417
1938
  #if DYNAMIC_BMI2
1418
- if (dctx->bmi2) {
1939
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
1419
1940
  return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1420
1941
  }
1421
1942
  #endif
@@ -1456,7 +1977,7 @@ ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
1456
1977
  size_t
1457
1978
  ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1458
1979
  void* dst, size_t dstCapacity,
1459
- const void* src, size_t srcSize, const int frame)
1980
+ const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
1460
1981
  { /* blockType == blockCompressed */
1461
1982
  const BYTE* ip = (const BYTE*)src;
1462
1983
  /* isLongOffset must be true if there are long offsets.
@@ -1471,7 +1992,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1471
1992
  RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
1472
1993
 
1473
1994
  /* Decode literals section */
1474
- { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
1995
+ { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
1475
1996
  DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
1476
1997
  if (ZSTD_isError(litCSize)) return litCSize;
1477
1998
  ip += litCSize;
@@ -1519,7 +2040,10 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1519
2040
 
1520
2041
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1521
2042
  /* else */
1522
- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
2043
+ if (dctx->litBufferLocation == ZSTD_split)
2044
+ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
2045
+ else
2046
+ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
1523
2047
  #endif
1524
2048
  }
1525
2049
  }
@@ -1542,7 +2066,7 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
1542
2066
  {
1543
2067
  size_t dSize;
1544
2068
  ZSTD_checkContinuity(dctx, dst, dstCapacity);
1545
- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
2069
+ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);
1546
2070
  dctx->previousDstEnd = (char*)dst + dSize;
1547
2071
  return dSize;
1548
2072
  }