zstd-ruby 1.4.4.0 → 1.5.5.0

Files changed (115)
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/README.md +78 -5
  4. data/Rakefile +8 -2
  5. data/ext/zstdruby/common.h +15 -0
  6. data/ext/zstdruby/extconf.rb +3 -2
  7. data/ext/zstdruby/libzstd/common/allocations.h +55 -0
  8. data/ext/zstdruby/libzstd/common/bits.h +200 -0
  9. data/ext/zstdruby/libzstd/common/bitstream.h +74 -97
  10. data/ext/zstdruby/libzstd/common/compiler.h +219 -20
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  12. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  13. data/ext/zstdruby/libzstd/common/debug.h +22 -49
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +184 -80
  15. data/ext/zstdruby/libzstd/common/error_private.c +11 -2
  16. data/ext/zstdruby/libzstd/common/error_private.h +87 -4
  17. data/ext/zstdruby/libzstd/common/fse.h +47 -116
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +127 -127
  19. data/ext/zstdruby/libzstd/common/huf.h +112 -197
  20. data/ext/zstdruby/libzstd/common/mem.h +124 -142
  21. data/ext/zstdruby/libzstd/common/pool.c +54 -27
  22. data/ext/zstdruby/libzstd/common/pool.h +11 -5
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +156 -0
  24. data/ext/zstdruby/libzstd/common/threading.c +78 -22
  25. data/ext/zstdruby/libzstd/common/threading.h +9 -13
  26. data/ext/zstdruby/libzstd/common/xxhash.c +15 -873
  27. data/ext/zstdruby/libzstd/common/xxhash.h +5572 -191
  28. data/ext/zstdruby/libzstd/common/zstd_common.c +2 -37
  29. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  30. data/ext/zstdruby/libzstd/common/zstd_internal.h +186 -144
  31. data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
  32. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  33. data/ext/zstdruby/libzstd/compress/fse_compress.c +99 -196
  34. data/ext/zstdruby/libzstd/compress/hist.c +41 -63
  35. data/ext/zstdruby/libzstd/compress/hist.h +13 -33
  36. data/ext/zstdruby/libzstd/compress/huf_compress.c +968 -331
  37. data/ext/zstdruby/libzstd/compress/zstd_compress.c +4120 -1191
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +688 -159
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +121 -40
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +16 -6
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +62 -35
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +10 -3
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +577 -0
  44. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  45. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +322 -115
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +394 -154
  47. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +4 -3
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.c +729 -253
  49. data/ext/zstdruby/libzstd/compress/zstd_fast.h +4 -3
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1289 -247
  51. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +61 -1
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +339 -212
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
  54. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.c +508 -282
  56. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +217 -466
  58. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +35 -114
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1220 -572
  60. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +576 -0
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +23 -19
  62. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +859 -273
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1244 -375
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +21 -7
  66. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +74 -11
  67. data/ext/zstdruby/libzstd/dictBuilder/cover.c +75 -54
  68. data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
  69. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  70. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +55 -36
  71. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +126 -110
  72. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +248 -56
  73. data/ext/zstdruby/libzstd/zstd.h +1277 -306
  74. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +29 -8
  75. data/ext/zstdruby/main.c +20 -0
  76. data/ext/zstdruby/skippable_frame.c +63 -0
  77. data/ext/zstdruby/streaming_compress.c +177 -0
  78. data/ext/zstdruby/streaming_compress.h +5 -0
  79. data/ext/zstdruby/streaming_decompress.c +123 -0
  80. data/ext/zstdruby/zstdruby.c +114 -32
  81. data/lib/zstd-ruby/version.rb +1 -1
  82. data/lib/zstd-ruby.rb +0 -1
  83. data/zstd-ruby.gemspec +1 -1
  84. metadata +24 -39
  85. data/.travis.yml +0 -14
  86. data/ext/zstdruby/libzstd/.gitignore +0 -3
  87. data/ext/zstdruby/libzstd/BUCK +0 -234
  88. data/ext/zstdruby/libzstd/Makefile +0 -289
  89. data/ext/zstdruby/libzstd/README.md +0 -159
  90. data/ext/zstdruby/libzstd/deprecated/zbuff.h +0 -214
  91. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +0 -26
  92. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +0 -147
  93. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +0 -75
  94. data/ext/zstdruby/libzstd/dll/example/Makefile +0 -47
  95. data/ext/zstdruby/libzstd/dll/example/README.md +0 -69
  96. data/ext/zstdruby/libzstd/dll/example/build_package.bat +0 -20
  97. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.sln +0 -25
  98. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.vcxproj +0 -181
  99. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +0 -415
  100. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +0 -2152
  101. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +0 -94
  102. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +0 -3514
  103. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +0 -93
  104. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +0 -3156
  105. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +0 -93
  106. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +0 -3641
  107. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +0 -142
  108. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +0 -4046
  109. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +0 -162
  110. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +0 -4150
  111. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +0 -172
  112. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +0 -4533
  113. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +0 -187
  114. data/ext/zstdruby/libzstd/libzstd.pc.in +0 -15
  115. data/ext/zstdruby/zstdruby.h +0 -6
data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,18 +14,18 @@
 /*-*******************************************************
 *  Dependencies
 *********************************************************/
-#include <string.h>      /* memcpy, memmove, memset */
-#include "compiler.h"    /* prefetch */
-#include "cpu.h"         /* bmi2 */
-#include "mem.h"         /* low level memory routines */
+#include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
+#include "../common/compiler.h"    /* prefetch */
+#include "../common/cpu.h"         /* bmi2 */
+#include "../common/mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
-#include "fse.h"
-#define HUF_STATIC_LINKING_ONLY
-#include "huf.h"
-#include "zstd_internal.h"
+#include "../common/fse.h"
+#include "../common/huf.h"
+#include "../common/zstd_internal.h"
 #include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
 #include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
 #include "zstd_decompress_block.h"
+#include "../common/bits.h"  /* ZSTD_highbit32 */

 /*_*******************************************************
 *  Macros
@@ -44,7 +44,7 @@
 /*_*******************************************************
 *  Memory operations
 **********************************************************/
-static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }


 /*-*************************************************************
@@ -56,7 +56,7 @@ static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
 size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
                           blockProperties_t* bpPtr)
 {
-    RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong);
+    RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");

    {   U32 const cBlockHeader = MEM_readLE24(src);
        U32 const cSize = cBlockHeader >> 3;
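The 3-byte block header parsed above is a 24-bit little-endian value: bit 0 flags the last block, bits 1-2 carry the block type (0=raw, 1=RLE, 2=compressed, 3=reserved), and the remaining 21 bits the block size. A minimal standalone sketch of that decoding, with example bytes of our own choosing (not part of the library):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint8_t src[3] = { 0x15, 0x30, 0x00 };  /* hypothetical header bytes */
        /* equivalent of MEM_readLE24 */
        uint32_t const cBlockHeader = (uint32_t)src[0]
                                    | ((uint32_t)src[1] << 8)
                                    | ((uint32_t)src[2] << 16);
        uint32_t const lastBlock = cBlockHeader & 1;         /* bit 0 */
        uint32_t const blockType = (cBlockHeader >> 1) & 3;  /* bits 1-2 */
        uint32_t const cSize     = cBlockHeader >> 3;        /* bits 3-23 */
        printf("last=%u type=%u size=%u\n", lastBlock, blockType, cSize);
        return 0;  /* prints: last=1 type=2 size=1538 */
    }
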
@@ -64,23 +64,64 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
        bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
        bpPtr->origSize = cSize;   /* only useful for RLE */
        if (bpPtr->blockType == bt_rle) return 1;
-       RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected);
+       RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
        return cSize;
    }
 }

+/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
+static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
+    const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
+{
+    if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH)
+    {
+        /* room for litbuffer to fit without read faulting */
+        dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;
+        dctx->litBufferEnd = dctx->litBuffer + litSize;
+        dctx->litBufferLocation = ZSTD_in_dst;
+    }
+    else if (litSize > ZSTD_LITBUFFEREXTRASIZE)
+    {
+        /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
+        if (splitImmediately) {
+            /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
+            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+            dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
+        }
+        else {
+            /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
+            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
+            dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
+        }
+        dctx->litBufferLocation = ZSTD_split;
+    }
+    else
+    {
+        /* fits entirely within litExtraBuffer, so no split is necessary */
+        dctx->litBuffer = dctx->litExtraBuffer;
+        dctx->litBufferEnd = dctx->litBuffer + litSize;
+        dctx->litBufferLocation = ZSTD_not_in_dst;
+    }
+}

 /* Hidden declaration for fullbench */
 size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
-                          const void* src, size_t srcSize);
+                          const void* src, size_t srcSize,
+                          void* dst, size_t dstCapacity, const streaming_operation streaming);
 /*! ZSTD_decodeLiteralsBlock() :
+ * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
+ * in the dstBuffer.  If there is room to do so, it will be stored in full in the excess dst space after where the current
+ * block will be output.  Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
+ * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
+ *
  * @return : nb of bytes read from src (< srcSize )
  *  note : symbol not declared but exposed for fullbench */
 size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
-                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+                          const void* src, size_t srcSize,   /* note : srcSize < BLOCKSIZE */
+                          void* dst, size_t dstCapacity, const streaming_operation streaming)
 {
     DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
-    RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected);
+    RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");

    {   const BYTE* const istart = (const BYTE*) src;
        symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
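The new ZSTD_allocateLiteralsBuffer above picks one of three placements for decoded literals. A hedged, standalone sketch of just that three-way decision; the constants are stand-ins for the zstd-internal ZSTD_BLOCKSIZE_MAX, WILDCOPY_OVERLENGTH and ZSTD_LITBUFFEREXTRASIZE, and only the branch structure mirrors the hunk:

    #include <stddef.h>
    #include <stdio.h>

    #define BLOCKSIZE_MAX     (128 * 1024)  /* stand-in for ZSTD_BLOCKSIZE_MAX */
    #define OVERLENGTH        32            /* stand-in for WILDCOPY_OVERLENGTH */
    #define LITBUF_EXTRA_SIZE (64 * 1024)   /* stand-in for ZSTD_LITBUFFEREXTRASIZE */

    typedef enum { LIT_IN_DST, LIT_SPLIT, LIT_NOT_IN_DST } lit_location;

    static lit_location place_literals(int streaming, size_t dstCapacity, size_t litSize) {
        if (!streaming && dstCapacity > BLOCKSIZE_MAX + OVERLENGTH + litSize + OVERLENGTH)
            return LIT_IN_DST;    /* fits in dst past the block's own output */
        if (litSize > LITBUF_EXTRA_SIZE)
            return LIT_SPLIT;     /* tail of dst plus the small litExtraBuffer */
        return LIT_NOT_IN_DST;    /* fits entirely in litExtraBuffer */
    }

    int main(void) {
        printf("%d\n", place_literals(0, 1 << 20, 1000));    /* 0: LIT_IN_DST */
        printf("%d\n", place_literals(1, 1 << 20, 100000));  /* 1: LIT_SPLIT */
        printf("%d\n", place_literals(1, 1 << 20, 1000));    /* 2: LIT_NOT_IN_DST */
        return 0;
    }
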
@@ -89,16 +130,20 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
        {
        case set_repeat:
            DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
-           RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted);
-           /* fall-through */
+           RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
+           ZSTD_FALLTHROUGH;

        case set_compressed:
-           RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
+           RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
            {   size_t lhSize, litSize, litCSize;
                U32 singleStream=0;
                U32 const lhlCode = (istart[0] >> 2) & 3;
                U32 const lhc = MEM_readLE32(istart);
                size_t hufSuccess;
+               size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
+               int const flags = 0
+                   | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
+                   | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
                switch(lhlCode)
                {
                case 0: case 1: default:   /* note : default is impossible, since lhlCode into [0..3] */
@@ -121,8 +166,15 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                    litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
                    break;
                }
-               RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected);
-               RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected);
+               RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+               RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+               if (!singleStream)
+                   RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
+                       "Not enough literals (%zu) for the 4-streams mode (min %u)",
+                       litSize, MIN_LITERALS_FOR_4_STREAMS);
+               RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
+               RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
+               ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);

                /* prefetch huffman table if cold */
                if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
@@ -131,13 +183,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,

                if (litEncType==set_repeat) {
                    if (singleStream) {
-                       hufSuccess = HUF_decompress1X_usingDTable_bmi2(
+                       hufSuccess = HUF_decompress1X_usingDTable(
                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                           dctx->HUFptr, dctx->bmi2);
+                           dctx->HUFptr, flags);
                    } else {
-                       hufSuccess = HUF_decompress4X_usingDTable_bmi2(
+                       assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
+                       hufSuccess = HUF_decompress4X_usingDTable(
                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                           dctx->HUFptr, dctx->bmi2);
+                           dctx->HUFptr, flags);
                    }
                } else {
                    if (singleStream) {
@@ -145,34 +198,41 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                        hufSuccess = HUF_decompress1X_DCtx_wksp(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
-                           sizeof(dctx->workspace));
+                           sizeof(dctx->workspace), flags);
 #else
-                       hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
+                       hufSuccess = HUF_decompress1X1_DCtx_wksp(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
-                           sizeof(dctx->workspace), dctx->bmi2);
+                           sizeof(dctx->workspace), flags);
 #endif
                    } else {
-                       hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
+                       hufSuccess = HUF_decompress4X_hufOnly_wksp(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
-                           sizeof(dctx->workspace), dctx->bmi2);
+                           sizeof(dctx->workspace), flags);
                    }
                }
+               if (dctx->litBufferLocation == ZSTD_split)
+               {
+                   ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
+                   ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
+                   dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+                   dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
+               }

-               RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected);
+               RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");

                dctx->litPtr = dctx->litBuffer;
                dctx->litSize = litSize;
                dctx->litEntropy = 1;
                if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
-               memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
                return litCSize + lhSize;
            }

        case set_basic:
            {   size_t litSize, lhSize;
                U32 const lhlCode = ((istart[0]) >> 2) & 3;
+               size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
                switch(lhlCode)
                {
                case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
@@ -185,27 +245,41 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                    break;
                case 3:
                    lhSize = 3;
+                   RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
                    litSize = MEM_readLE24(istart) >> 4;
                    break;
                }

+               RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+               RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+               ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
-                   RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected);
-                   memcpy(dctx->litBuffer, istart+lhSize, litSize);
+                   RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
+                   if (dctx->litBufferLocation == ZSTD_split)
+                   {
+                       ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
+                       ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
+                   }
+                   else
+                   {
+                       ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
+                   }
                    dctx->litPtr = dctx->litBuffer;
                    dctx->litSize = litSize;
-                   memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
                    return lhSize+litSize;
                }
                /* direct reference into compressed stream */
                dctx->litPtr = istart+lhSize;
                dctx->litSize = litSize;
+               dctx->litBufferEnd = dctx->litPtr + litSize;
+               dctx->litBufferLocation = ZSTD_not_in_dst;
                return lhSize+litSize;
            }

        case set_rle:
            {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
                size_t litSize, lhSize;
+               size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
                switch(lhlCode)
                {
                case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
@@ -214,16 +288,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                    break;
                case 1:
                    lhSize = 2;
+                   RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
                    litSize = MEM_readLE16(istart) >> 4;
                    break;
                case 3:
                    lhSize = 3;
+                   RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
                    litSize = MEM_readLE24(istart) >> 4;
-                   RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
                    break;
                }
-               RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected);
-               memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+               RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+               RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+               RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+               ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
+               if (dctx->litBufferLocation == ZSTD_split)
+               {
+                   ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
+                   ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
+               }
+               else
+               {
+                   ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
+               }
                dctx->litPtr = dctx->litBuffer;
                dctx->litSize = litSize;
                return lhSize+1;
@@ -236,7 +322,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,

 /* Default FSE distribution tables.
  * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
- * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
+ * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
  * They were generated programmatically with following method :
  * - start from default distributions, present in /lib/common/zstd_internal.h
  * - generate tables normally, using ZSTD_buildFSETable()
@@ -343,7 +429,7 @@ static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
 };   /* ML_defaultDTable */


-static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
+static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
 {
     void* ptr = dt;
     ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
@@ -355,7 +441,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
     cell->nbBits = 0;
     cell->nextState = 0;
     assert(nbAddBits < 255);
-    cell->nbAdditionalBits = (BYTE)nbAddBits;
+    cell->nbAdditionalBits = nbAddBits;
     cell->baseValue = baseValue;
 }

@@ -364,23 +450,26 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
 * generate FSE decoding table for one symbol (ll, ml or off)
 * cannot fail if input is valid =>
 * all inputs are presumed validated at this stage */
-void
-ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+FORCE_INLINE_TEMPLATE
+void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
-           const U32* baseValue, const U32* nbAdditionalBits,
-           unsigned tableLog)
+           const U32* baseValue, const U8* nbAdditionalBits,
+           unsigned tableLog, void* wksp, size_t wkspSize)
 {
     ZSTD_seqSymbol* const tableDecode = dt+1;
-    U16 symbolNext[MaxSeq+1];
-
     U32 const maxSV1 = maxSymbolValue + 1;
     U32 const tableSize = 1 << tableLog;
-    U32 highThreshold = tableSize-1;
+
+    U16* symbolNext = (U16*)wksp;
+    BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
+    U32 highThreshold = tableSize - 1;
+

     /* Sanity Checks */
     assert(maxSymbolValue <= MaxSeq);
     assert(tableLog <= MaxFSELog);
-
+    assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
+    (void)wkspSize;
     /* Init, lay down lowprob symbols */
     {   ZSTD_seqSymbol_header DTableH;
         DTableH.tableLog = tableLog;
@@ -396,34 +485,128 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
                assert(normalizedCounter[s]>=0);
                symbolNext[s] = (U16)normalizedCounter[s];
        }   }   }
-       memcpy(dt, &DTableH, sizeof(DTableH));
+       ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
     }

     /* Spread symbols */
-    {   U32 const tableMask = tableSize-1;
+    assert(tableSize <= 512);
+    /* Specialized symbol spreading for the case when there are
+     * no low probability (-1 count) symbols. When compressing
+     * small blocks we avoid low probability symbols to hit this
+     * case, since header decoding speed matters more.
+     */
+    if (highThreshold == tableSize - 1) {
+        size_t const tableMask = tableSize-1;
+        size_t const step = FSE_TABLESTEP(tableSize);
+        /* First lay down the symbols in order.
+         * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
+         * misses since small blocks generally have small table logs, so nearly
+         * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
+         * our buffer to handle the over-write.
+         */
+        {
+            U64 const add = 0x0101010101010101ull;
+            size_t pos = 0;
+            U64 sv = 0;
+            U32 s;
+            for (s=0; s<maxSV1; ++s, sv += add) {
+                int i;
+                int const n = normalizedCounter[s];
+                MEM_write64(spread + pos, sv);
+                for (i = 8; i < n; i += 8) {
+                    MEM_write64(spread + pos + i, sv);
+                }
+                assert(n>=0);
+                pos += (size_t)n;
+            }
+        }
+        /* Now we spread those positions across the table.
+         * The benefit of doing it in two stages is that we avoid the
+         * variable size inner loop, which caused lots of branch misses.
+         * Now we can run through all the positions without any branch misses.
+         * We unroll the loop twice, since that is what empirically worked best.
+         */
+        {
+            size_t position = 0;
+            size_t s;
+            size_t const unroll = 2;
+            assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+            for (s = 0; s < (size_t)tableSize; s += unroll) {
+                size_t u;
+                for (u = 0; u < unroll; ++u) {
+                    size_t const uPosition = (position + (u * step)) & tableMask;
+                    tableDecode[uPosition].baseValue = spread[s + u];
+                }
+                position = (position + (unroll * step)) & tableMask;
+            }
+            assert(position == 0);
+        }
+    } else {
+        U32 const tableMask = tableSize-1;
        U32 const step = FSE_TABLESTEP(tableSize);
        U32 s, position = 0;
        for (s=0; s<maxSV1; s++) {
            int i;
-           for (i=0; i<normalizedCounter[s]; i++) {
+           int const n = normalizedCounter[s];
+           for (i=0; i<n; i++) {
                tableDecode[position].baseValue = s;
                position = (position + step) & tableMask;
-               while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+               while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask;   /* lowprob area */
        }   }
        assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
     }

     /* Build Decoding table */
-    {   U32 u;
+    {
+       U32 u;
        for (u=0; u<tableSize; u++) {
            U32 const symbol = tableDecode[u].baseValue;
            U32 const nextState = symbolNext[symbol]++;
-           tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+           tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
            tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
            assert(nbAdditionalBits[symbol] < 255);
-           tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
+           tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
            tableDecode[u].baseValue = baseValue[symbol];
-   }   }
+       }
+    }
+}
+
+/* Avoids the FORCE_INLINE of the _body() function. */
+static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
+           const short* normalizedCounter, unsigned maxSymbolValue,
+           const U32* baseValue, const U8* nbAdditionalBits,
+           unsigned tableLog, void* wksp, size_t wkspSize)
+{
+    ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
+           baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+}
+
+#if DYNAMIC_BMI2
+BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
+           const short* normalizedCounter, unsigned maxSymbolValue,
+           const U32* baseValue, const U8* nbAdditionalBits,
+           unsigned tableLog, void* wksp, size_t wkspSize)
+{
+    ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
+           baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+}
+#endif
+
+void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+           const short* normalizedCounter, unsigned maxSymbolValue,
+           const U32* baseValue, const U8* nbAdditionalBits,
+           unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
+{
+#if DYNAMIC_BMI2
+    if (bmi2) {
+        ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
+               baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+        return;
+    }
+#endif
+    (void)bmi2;
+    ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
+           baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
 }

@@ -433,18 +616,19 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
 static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
                 symbolEncodingType_e type, unsigned max, U32 maxLog,
                 const void* src, size_t srcSize,
-                const U32* baseValue, const U32* nbAdditionalBits,
+                const U32* baseValue, const U8* nbAdditionalBits,
                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
-                int ddictIsCold, int nbSeq)
+                int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
+                int bmi2)
 {
     switch(type)
     {
     case set_rle :
-        RETURN_ERROR_IF(!srcSize, srcSize_wrong);
-        RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected);
+        RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
+        RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
        {   U32 const symbol = *(const BYTE*)src;
            U32 const baseline = baseValue[symbol];
-           U32 const nbBits = nbAdditionalBits[symbol];
+           U8 const nbBits = nbAdditionalBits[symbol];
            ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
        }
        *DTablePtr = DTableSpace;
@@ -453,7 +637,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
        *DTablePtr = defaultTable;
        return 0;
     case set_repeat:
-       RETURN_ERROR_IF(!flagRepeatTable, corruption_detected);
+       RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
        /* prefetch FSE table if used */
        if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
            const void* const pStart = *DTablePtr;
@@ -465,9 +649,9 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
        {   unsigned tableLog;
            S16 norm[MaxSeq+1];
            size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
-           RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected);
-           RETURN_ERROR_IF(tableLog > maxLog, corruption_detected);
-           ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
+           RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
+           RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
+           ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
            *DTablePtr = DTableSpace;
            return headerSize;
        }
@@ -480,35 +664,36 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
 size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                 const void* src, size_t srcSize)
 {
-    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* const istart = (const BYTE*)src;
     const BYTE* const iend = istart + srcSize;
     const BYTE* ip = istart;
     int nbSeq;
     DEBUGLOG(5, "ZSTD_decodeSeqHeaders");

     /* check */
-    RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong);
+    RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");

     /* SeqHead */
     nbSeq = *ip++;
     if (!nbSeq) {
        *nbSeqPtr=0;
-       RETURN_ERROR_IF(srcSize != 1, srcSize_wrong);
+       RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
        return 1;
     }
     if (nbSeq > 0x7F) {
        if (nbSeq == 0xFF) {
-           RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong);
-           nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+           RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
+           nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
+           ip+=2;
        } else {
-           RETURN_ERROR_IF(ip >= iend, srcSize_wrong);
+           RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
            nbSeq = ((nbSeq-0x80)<<8) + *ip++;
        }
     }
     *nbSeqPtr = nbSeq;

     /* FSE table descriptors */
-    RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong); /* minimum possible size: 1 byte for symbol encoding types */
+    RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
     {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
        symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
        symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
@@ -520,8 +705,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                      ip, iend-ip,
                      LL_base, LL_bits,
                      LL_defaultDTable, dctx->fseEntropy,
-                     dctx->ddictIsCold, nbSeq);
-       RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected);
+                     dctx->ddictIsCold, nbSeq,
+                     dctx->workspace, sizeof(dctx->workspace),
+                     ZSTD_DCtx_get_bmi2(dctx));
+       RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
        ip += llhSize;
     }

@@ -530,8 +717,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                      ip, iend-ip,
                      OF_base, OF_bits,
                      OF_defaultDTable, dctx->fseEntropy,
-                     dctx->ddictIsCold, nbSeq);
-       RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected);
+                     dctx->ddictIsCold, nbSeq,
+                     dctx->workspace, sizeof(dctx->workspace),
+                     ZSTD_DCtx_get_bmi2(dctx));
+       RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
        ip += ofhSize;
     }

@@ -540,8 +729,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                      ip, iend-ip,
                      ML_base, ML_bits,
                      ML_defaultDTable, dctx->fseEntropy,
-                     dctx->ddictIsCold, nbSeq);
-       RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected);
+                     dctx->ddictIsCold, nbSeq,
+                     dctx->workspace, sizeof(dctx->workspace),
+                     ZSTD_DCtx_get_bmi2(dctx));
+       RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
        ip += mlhSize;
     }
 }
@@ -554,7 +745,6 @@ typedef struct {
     size_t litLength;
     size_t matchLength;
     size_t offset;
-    const BYTE* match;
 } seq_t;

 typedef struct {
@@ -568,9 +758,6 @@ typedef struct {
     ZSTD_fseState stateOffb;
     ZSTD_fseState stateML;
     size_t prevOffset[ZSTD_REP_NUM];
-    const BYTE* prefixStart;
-    const BYTE* dictEnd;
-    size_t pos;
 } seqState_t;

 /*! ZSTD_overlapCopy8() :
@@ -580,7 +767,7 @@ typedef struct {
 *  Precondition: *ip <= *op
 *  Postcondition: *op - *op >= 8
 */
-static void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
+HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
     assert(*ip <= *op);
     if (offset < 8) {
        /* close range match, overlap */
@@ -613,7 +800,7 @@ static void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
 * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
 *   The src buffer must be before the dst buffer.
 */
-static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
+static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
     ptrdiff_t const diff = op - ip;
     BYTE* const oend = op + length;

@@ -629,6 +816,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
        /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
        assert(length >= 8);
        ZSTD_overlapCopy8(&op, &ip, diff);
+       length -= 8;
        assert(op - ip >= 8);
        assert(op <= oend);
     }
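The next hunk adds ZSTD_safecopyDstBeforeSrc for the split-literals case, where the output cursor can sit before an overlapping literals source. A forward copy is safe in that direction because every read stays ahead of every write; a tiny self-contained illustration (example data is ours, not the library's):

    #include <stdio.h>

    int main(void) {
        char buf[] = "0123456789";
        char* dst = buf;        /* dst before the overlapping src */
        char* src = buf + 3;
        for (int i = 0; i < 7; i++)
            dst[i] = src[i];    /* read index i+3 always stays ahead of write index i */
        printf("%s\n", buf);    /* prints: 3456789789 */
        return 0;
    }
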
@@ -643,12 +831,35 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
        assert(oend > oend_w);
        ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
        ip += oend_w - op;
-       op = oend_w;
+       op += oend_w - op;
     }
     /* Handle the leftovers. */
     while (op < oend) *op++ = *ip++;
 }

+/* ZSTD_safecopyDstBeforeSrc():
+ * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
+ * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
+static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) {
+    ptrdiff_t const diff = op - ip;
+    BYTE* const oend = op + length;
+
+    if (length < 8 || diff > -8) {
+        /* Handle short lengths, close overlaps, and dst not before src. */
+        while (op < oend) *op++ = *ip++;
+        return;
+    }
+
+    if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
+        ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
+        ip += oend - WILDCOPY_OVERLENGTH - op;
+        op += oend - WILDCOPY_OVERLENGTH - op;
+    }
+
+    /* Handle the leftovers. */
+    while (op < oend) *op++ = *ip++;
+}
+

 /* ZSTD_execSequenceEnd():
  * This version handles cases that are near the end of the output buffer. It requires
@@ -659,21 +870,21 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
  */
 FORCE_NOINLINE
 size_t ZSTD_execSequenceEnd(BYTE* op,
-                 BYTE* const oend, seq_t sequence,
-                 const BYTE** litPtr, const BYTE* const litLimit,
-                 const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+    BYTE* const oend, seq_t sequence,
+    const BYTE** litPtr, const BYTE* const litLimit,
+    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
 {
     BYTE* const oLitEnd = op + sequence.litLength;
     size_t const sequenceLength = sequence.litLength + sequence.matchLength;
-    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
     const BYTE* const iLitEnd = *litPtr + sequence.litLength;
     const BYTE* match = oLitEnd - sequence.offset;
     BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;

-    /* bounds checks */
-    assert(oLitEnd < oMatchEnd);
-    RETURN_ERROR_IF(oMatchEnd > oend, dstSize_tooSmall, "last match must fit within dstBuffer");
-    RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "try to read beyond literal buffer");
+    /* bounds checks : careful of address space overflow in 32-bit mode */
+    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
+    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
+    assert(op < op + sequenceLength);
+    assert(oLitEnd < op + sequenceLength);

     /* copy literals */
     ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
@@ -683,42 +894,199 @@ size_t ZSTD_execSequenceEnd(BYTE* op,
     /* copy Match */
     if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix */
-       RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected);
-       match = dictEnd - (prefixStart-match);
+       RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
+       match = dictEnd - (prefixStart - match);
        if (match + sequence.matchLength <= dictEnd) {
-           memmove(oLitEnd, match, sequence.matchLength);
+           ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
-           memmove(oLitEnd, match, length1);
-           op = oLitEnd + length1;
-           sequence.matchLength -= length1;
-           match = prefixStart;
-   }   }
+           ZSTD_memmove(oLitEnd, match, length1);
+           op = oLitEnd + length1;
+           sequence.matchLength -= length1;
+           match = prefixStart;
+       }
+    }
+    ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
+    return sequenceLength;
+}
+
+/* ZSTD_execSequenceEndSplitLitBuffer():
+ * This version is intended to be used during instances where the litBuffer is still split.  It is kept separate to avoid performance impact for the good case.
+ */
+FORCE_NOINLINE
+size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
+    BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
+    const BYTE** litPtr, const BYTE* const litLimit,
+    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+
+    /* bounds checks : careful of address space overflow in 32-bit mode */
+    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
+    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
+    assert(op < op + sequenceLength);
+    assert(oLitEnd < op + sequenceLength);
+
+    /* copy literals */
+    RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
+    ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
+    op = oLitEnd;
+    *litPtr = iLitEnd;
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+        /* offset beyond prefix */
+        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
+        match = dictEnd - (prefixStart - match);
+        if (match + sequence.matchLength <= dictEnd) {
+            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            ZSTD_memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = prefixStart;
+        }
+    }
     ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
     return sequenceLength;
 }

 HINT_INLINE
 size_t ZSTD_execSequence(BYTE* op,
-                BYTE* const oend, seq_t sequence,
-                const BYTE** litPtr, const BYTE* const litLimit,
-                const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+    BYTE* const oend, seq_t sequence,
+    const BYTE** litPtr, const BYTE* const litLimit,
+    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
 {
     BYTE* const oLitEnd = op + sequence.litLength;
     size_t const sequenceLength = sequence.litLength + sequence.matchLength;
     BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
-    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;   /* risk : address space underflow on oend=NULL */
     const BYTE* const iLitEnd = *litPtr + sequence.litLength;
     const BYTE* match = oLitEnd - sequence.offset;

-    /* Errors and uncommon cases handled here. */
-    assert(oLitEnd < oMatchEnd);
-    if (iLitEnd > litLimit || oMatchEnd > oend_w)
+    assert(op != NULL /* Precondition */);
+    assert(oend_w < oend /* No underflow */);
+
+#if defined(__aarch64__)
+    /* prefetch sequence starting from match that will be used for copy later */
+    PREFETCH_L1(match);
+#endif
+    /* Handle edge cases in a slow path:
+     *   - Read beyond end of literals
+     *   - Match end is within WILDCOPY_OVERLIMIT of oend
+     *   - 32-bit mode and the match length overflows
+     */
+    if (UNLIKELY(
+            iLitEnd > litLimit ||
+            oMatchEnd > oend_w ||
+            (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
        return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);

     /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
+    assert(op <= oLitEnd /* No overflow */);
+    assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
+    assert(oMatchEnd <= oend /* No underflow */);
+    assert(iLitEnd <= litLimit /* Literal length is in bounds */);
+    assert(oLitEnd <= oend_w /* Can wildcopy literals */);
+    assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
+
+    /* Copy Literals:
+     * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
+     * We likely don't need the full 32-byte wildcopy.
+     */
+    assert(WILDCOPY_OVERLENGTH >= 16);
+    ZSTD_copy16(op, (*litPtr));
+    if (UNLIKELY(sequence.litLength > 16)) {
+        ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
+    }
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* update for next sequence */
+
+    /* Copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+        /* offset beyond prefix -> go into extDict */
+        RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
+        match = dictEnd + (match - prefixStart);
+        if (match + sequence.matchLength <= dictEnd) {
+            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            ZSTD_memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = prefixStart;
+        }
+    }
+    /* Match within prefix of 1 or more bytes */
+    assert(op <= oMatchEnd);
+    assert(oMatchEnd <= oend_w);
+    assert(match >= prefixStart);
+    assert(sequence.matchLength >= 1);
+
+    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
+     * without overlap checking.
+     */
+    if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
+        /* We bet on a full wildcopy for matches, since we expect matches to be
+         * longer than literals (in general). In silesia, ~10% of matches are longer
+         * than 16 bytes.
+         */
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
+        return sequenceLength;
+    }
+    assert(sequence.offset < WILDCOPY_VECLEN);
+
+    /* Copy 8 bytes and spread the offset to be >= 8. */
+    ZSTD_overlapCopy8(&op, &match, sequence.offset);
+
+    /* If the match length is > 8 bytes, then continue with the wildcopy. */
+    if (sequence.matchLength > 8) {
+        assert(op < oMatchEnd);
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
+    }
+    return sequenceLength;
+}
+
+HINT_INLINE
+size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
+    BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
+    const BYTE** litPtr, const BYTE* const litLimit,
+    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    assert(op != NULL /* Precondition */);
+    assert(oend_w < oend /* No underflow */);
+    /* Handle edge cases in a slow path:
+     *   - Read beyond end of literals
+     *   - Match end is within WILDCOPY_OVERLIMIT of oend
+     *   - 32-bit mode and the match length overflows
+     */
+    if (UNLIKELY(
+            iLitEnd > litLimit ||
+            oMatchEnd > oend_w ||
+            (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
+        return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
+
+    /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
+    assert(op <= oLitEnd /* No overflow */);
+    assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
+    assert(oMatchEnd <= oend /* No underflow */);
     assert(iLitEnd <= litLimit /* Literal length is in bounds */);
     assert(oLitEnd <= oend_w /* Can wildcopy literals */);
     assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
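For orientation, the overlap these copy routines must respect is the LZ77 self-referential match: an offset smaller than the match length reads bytes the same copy is still producing, so a plain memcpy is illegal. The byte-by-byte forward copy below is the semantic reference that ZSTD_overlapCopy8 and ZSTD_wildcopy accelerate (example data is ours):

    #include <stdio.h>

    int main(void) {
        char buf[16] = "abc";
        size_t op = 3;                 /* current write position */
        size_t const offset = 2;       /* match starts 2 bytes back: "bc" */
        size_t const matchLength = 6;
        for (size_t i = 0; i < matchLength; i++, op++)
            buf[op] = buf[op - offset];  /* consumes bytes written by this same loop */
        buf[op] = '\0';
        printf("%s\n", buf);             /* prints: abcbcbcbc */
        return 0;
    }
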
@@ -729,7 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op,
     */
     assert(WILDCOPY_OVERLENGTH >= 16);
     ZSTD_copy16(op, (*litPtr));
-    if (sequence.litLength > 16) {
+    if (UNLIKELY(sequence.litLength > 16)) {
        ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
     }
     op = oLitEnd;
@@ -738,15 +1106,15 @@ size_t ZSTD_execSequence(BYTE* op,
     /* Copy Match */
     if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix -> go into extDict */
-       RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected);
+       RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
        match = dictEnd + (match - prefixStart);
        if (match + sequence.matchLength <= dictEnd) {
-           memmove(oLitEnd, match, sequence.matchLength);
+           ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
-           memmove(oLitEnd, match, length1);
+           ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
@@ -760,7 +1128,7 @@ size_t ZSTD_execSequence(BYTE* op,
     /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
     * without overlap checking.
     */
-    if (sequence.offset >= WILDCOPY_VECLEN) {
+    if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
        /* We bet on a full wildcopy for matches, since we expect matches to be
        * longer than literals (in general). In silesia, ~10% of matches are longer
        * than 16 bytes.
@@ -781,6 +1149,7 @@ size_t ZSTD_execSequence(BYTE* op,
     return sequenceLength;
 }

+
 static void
 ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
 {
@@ -794,16 +1163,14 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS
 }

 FORCE_INLINE_TEMPLATE void
-ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
+ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)
 {
-    ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state];
-    U32 const nbBits = DInfo.nbBits;
     size_t const lowBits = BIT_readBits(bitD, nbBits);
-    DStatePtr->state = DInfo.nextState + lowBits;
+    DStatePtr->state = nextState + lowBits;
 }

 /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
+ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
 * bits before reloading. This value is the maximum number of bytes we read
 * after reloading when we are decoding long offsets.
 */
814
1181
 
815
1182
  typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
816
1183
 
817
- #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
818
1184
  FORCE_INLINE_TEMPLATE seq_t
819
1185
  ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
820
1186
  {
821
1187
  seq_t seq;
822
- U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
823
- U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
824
- U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
825
- U32 const totalBits = llBits+mlBits+ofBits;
826
- U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
827
- U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
828
- U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
829
-
830
- /* sequence */
831
- { size_t offset;
832
- if (!ofBits)
833
- offset = 0;
834
- else {
835
- ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
836
- ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
837
- assert(ofBits <= MaxOff);
838
- if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
839
- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
840
- offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
841
- BIT_reloadDStream(&seqState->DStream);
842
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
843
- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
844
- } else {
845
- offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
846
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
847
- }
848
- }
1188
+ /*
1189
+ * ZSTD_seqSymbol is a structure with a total of 64 bits wide. So it can be
1190
+ * loaded in one operation and extracted its fields by simply shifting or
1191
+ * bit-extracting on aarch64.
1192
+ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
1193
+ * operations that cause performance drop. This can be avoided by using this
1194
+ * ZSTD_memcpy hack.
1195
+ */
1196
+ #if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
1197
+ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
1198
+ ZSTD_seqSymbol* const llDInfo = &llDInfoS;
1199
+ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
1200
+ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
1201
+ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
1202
+ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
1203
+ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
1204
+ #else
1205
+ const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
1206
+ const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
1207
+ const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
1208
+ #endif
1209
+ seq.matchLength = mlDInfo->baseValue;
1210
+ seq.litLength = llDInfo->baseValue;
1211
+ { U32 const ofBase = ofDInfo->baseValue;
1212
+ BYTE const llBits = llDInfo->nbAdditionalBits;
1213
+ BYTE const mlBits = mlDInfo->nbAdditionalBits;
1214
+ BYTE const ofBits = ofDInfo->nbAdditionalBits;
1215
+ BYTE const totalBits = llBits+mlBits+ofBits;
1216
+
1217
+ U16 const llNext = llDInfo->nextState;
1218
+ U16 const mlNext = mlDInfo->nextState;
1219
+ U16 const ofNext = ofDInfo->nextState;
1220
+ U32 const llnbBits = llDInfo->nbBits;
1221
+ U32 const mlnbBits = mlDInfo->nbBits;
1222
+ U32 const ofnbBits = ofDInfo->nbBits;
1223
+
1224
+ assert(llBits <= MaxLLBits);
1225
+ assert(mlBits <= MaxMLBits);
1226
+ assert(ofBits <= MaxOff);
1227
+ /*
1228
+ * As gcc has better branch and block analyzers, sometimes it is only
1229
+ * valuable to mark likeliness for clang, it gives around 3-4% of
1230
+ * performance.
1231
+ */
849
1232
 
850
- if (ofBits <= 1) {
851
- offset += (llBase==0);
852
- if (offset) {
853
- size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
854
- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
855
- if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
1233
+ /* sequence */
1234
+ { size_t offset;
1235
+ if (ofBits > 1) {
1236
+ ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
1237
+ ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
1238
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
1239
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
1240
+ if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
1241
+ /* Always read extra bits, this keeps the logic simple,
1242
+ * avoids branches, and avoids accidentally reading 0 bits.
1243
+ */
1244
+ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
1245
+ offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
1246
+ BIT_reloadDStream(&seqState->DStream);
1247
+ offset += BIT_readBitsFast(&seqState->DStream, extraBits);
1248
+ } else {
1249
+ offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
1250
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
1251
+ }
1252
+ seqState->prevOffset[2] = seqState->prevOffset[1];
856
1253
  seqState->prevOffset[1] = seqState->prevOffset[0];
857
- seqState->prevOffset[0] = offset = temp;
858
- } else { /* offset == 0 */
859
- offset = seqState->prevOffset[0];
860
- }
861
- } else {
862
- seqState->prevOffset[2] = seqState->prevOffset[1];
863
- seqState->prevOffset[1] = seqState->prevOffset[0];
864
- seqState->prevOffset[0] = offset;
1254
+ seqState->prevOffset[0] = offset;
1255
+ } else {
1256
+ U32 const ll0 = (llDInfo->baseValue == 0);
1257
+ if (LIKELY((ofBits == 0))) {
1258
+ offset = seqState->prevOffset[ll0];
1259
+ seqState->prevOffset[1] = seqState->prevOffset[!ll0];
1260
+ seqState->prevOffset[0] = offset;
1261
+ } else {
1262
+ offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
1263
+ { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
1264
+ temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
1265
+ if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
1266
+ seqState->prevOffset[1] = seqState->prevOffset[0];
1267
+ seqState->prevOffset[0] = offset = temp;
1268
+ } } }
1269
+ seq.offset = offset;
865
1270
  }
866
- seq.offset = offset;
867
- }
868
1271
 
869
- seq.matchLength = mlBase
870
- + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0); /* <= 16 bits */
871
- if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
872
- BIT_reloadDStream(&seqState->DStream);
873
- if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
874
- BIT_reloadDStream(&seqState->DStream);
875
- /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
876
- ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
877
-
878
- seq.litLength = llBase
879
- + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0); /* <= 16 bits */
880
- if (MEM_32bits())
881
- BIT_reloadDStream(&seqState->DStream);
882
-
883
- DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
884
- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
885
-
886
- /* ANS state update */
887
- ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
888
- ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
889
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
890
- ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
1272
+ if (mlBits > 0)
1273
+ seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
1274
+
1275
+ if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
1276
+ BIT_reloadDStream(&seqState->DStream);
1277
+ if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
1278
+ BIT_reloadDStream(&seqState->DStream);
1279
+ /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
1280
+ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
1281
+
1282
+ if (llBits > 0)
1283
+ seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
1284
+
1285
+ if (MEM_32bits())
1286
+ BIT_reloadDStream(&seqState->DStream);
1287
+
1288
+ DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
1289
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
1290
+
1291
+ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */
1292
+ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */
1293
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
1294
+ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */
1295
+ }
891
1296
 
892
1297
  return seq;
893
1298
  }
894
1299
 
1300
+ #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1301
+ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
1302
+ {
1303
+ size_t const windowSize = dctx->fParams.windowSize;
1304
+ /* No dictionary used. */
1305
+ if (dctx->dictContentEndForFuzzing == NULL) return 0;
1306
+ /* Dictionary is our prefix. */
1307
+ if (prefixStart == dctx->dictContentBeginForFuzzing) return 1;
1308
+ /* Dictionary is not our ext-dict. */
1309
+ if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0;
1310
+ /* Dictionary is not within our window size. */
1311
+ if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0;
1312
+ /* Dictionary is active. */
1313
+ return 1;
1314
+ }
1315
+
1316
+ MEM_STATIC void ZSTD_assertValidSequence(
1317
+ ZSTD_DCtx const* dctx,
1318
+ BYTE const* op, BYTE const* oend,
1319
+ seq_t const seq,
1320
+ BYTE const* prefixStart, BYTE const* virtualStart)
1321
+ {
1322
+ #if DEBUGLEVEL >= 1
1323
+ size_t const windowSize = dctx->fParams.windowSize;
1324
+ size_t const sequenceSize = seq.litLength + seq.matchLength;
1325
+ BYTE const* const oLitEnd = op + seq.litLength;
1326
+ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
1327
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
1328
+ assert(op <= oend);
1329
+ assert((size_t)(oend - op) >= sequenceSize);
1330
+ assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
1331
+ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
1332
+ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
1333
+ /* Offset must be within the dictionary. */
1334
+ assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
1335
+ assert(seq.offset <= windowSize + dictSize);
1336
+ } else {
1337
+ /* Offset must be within our window. */
1338
+ assert(seq.offset <= windowSize);
1339
+ }
1340
+ #else
1341
+ (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
1342
+ #endif
1343
+ }
1344
+ #endif
1345
+
1346
+ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1347
+
1348
+
895
1349
  FORCE_INLINE_TEMPLATE size_t
896
1350
  DONT_VECTORIZE
897
- ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1351
+ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
898
1352
  void* dst, size_t maxDstSize,
899
1353
  const void* seqStart, size_t seqSize, int nbSeq,
900
- const ZSTD_longOffset_e isLongOffset)
1354
+ const ZSTD_longOffset_e isLongOffset,
1355
+ const int frame)
901
1356
  {
902
1357
  const BYTE* ip = (const BYTE*)seqStart;
903
1358
  const BYTE* const iend = ip + seqSize;
904
- BYTE* const ostart = (BYTE* const)dst;
1359
+ BYTE* const ostart = (BYTE*)dst;
905
1360
  BYTE* const oend = ostart + maxDstSize;
906
1361
  BYTE* op = ostart;
907
1362
  const BYTE* litPtr = dctx->litPtr;
908
- const BYTE* const litEnd = litPtr + dctx->litSize;
1363
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
909
1364
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
910
1365
  const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
911
1366
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
912
- DEBUGLOG(5, "ZSTD_decompressSequences_body");
1367
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");
1368
+ (void)frame;
913
1369
 
914
1370
  /* Regen sequences */
915
1371
  if (nbSeq) {
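Two things change in the rewritten ZSTD_decodeSequence() above: each of the three table entries is loaded as a whole 64-bit ZSTD_seqSymbol (with an explicit ZSTD_memcpy on aarch64 GCC, where per-field loads were costing performance), and the repeat-offset handling is restructured so the common ofBits == 0 case takes the LIKELY path. The repeat-offset rules themselves are unchanged; here is a standalone sketch of that selection logic, with simplified types and ofBase fixed at 1 as it is on this code path:

    /* Sketch of zstd's repeat-offset resolution for offset codes with <= 1 extra
     * bit, following the hunk above. `prev` holds the three most recent offsets,
     * `ll0` is 1 iff the literal length is zero, and `extraBit` is the single
     * bit read from the stream when ofBits == 1. */
    static size_t resolveRepOffsetSketch(size_t prev[3], unsigned ll0,
                                         unsigned ofBits, unsigned extraBit)
    {
        size_t offset;
        if (ofBits == 0) {          /* reuse prev[0], or prev[1] when litLength == 0 */
            offset = prev[ll0];
            prev[1] = prev[!ll0];
            prev[0] = offset;
        } else {                    /* raw code 1..3 selects from the history */
            size_t const code = 1 /* ofBase */ + ll0 + extraBit;
            size_t temp = (code == 3) ? prev[0] - 1 : prev[code];
            temp += !temp;          /* 0 is invalid (corrupted input): force to 1 */
            if (code != 1) prev[2] = prev[1];
            prev[1] = prev[0];
            prev[0] = offset = temp;
        }
        return offset;
    }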
@@ -918,38 +1374,279 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
  RETURN_ERROR_IF(
  ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
- corruption_detected);
+ corruption_detected, "");
  ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
  ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
  ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+ assert(dst != NULL);

  ZSTD_STATIC_ASSERT(
  BIT_DStream_unfinished < BIT_DStream_completed &&
  BIT_DStream_endOfBuffer < BIT_DStream_completed &&
  BIT_DStream_completed < BIT_DStream_overflow);

- for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) {
- nbSeq--;
- { seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
+ /* decompress without overrunning litPtr begins */
+ {
+ seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ /* Align the decompression loop to 32 + 16 bytes.
+ *
+ * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
+ * speed swings based on the alignment of the decompression loop. This
+ * performance swing is caused by parts of the decompression loop falling
+ * out of the DSB. The entire decompression loop should fit in the DSB,
+ * when it can't we get much worse performance. You can measure if you've
+ * hit the good case or the bad case with this perf command for some
+ * compressed file test.zst:
+ *
+ * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
+ * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
+ *
+ * If you see most cycles served out of the MITE you've hit the bad case.
+ * If you see most cycles served out of the DSB you've hit the good case.
+ * If it is pretty even then you may be in an okay case.
+ *
+ * This issue has been reproduced on the following CPUs:
+ * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
+ * Use Instruments->Counters to get DSB/MITE cycles.
+ * I never got performance swings, but I was able to
+ * go from the good case of mostly DSB to half of the
+ * cycles served from MITE.
+ * - Coffeelake: Intel i9-9900k
+ * - Coffeelake: Intel i7-9700k
+ *
+ * I haven't been able to reproduce the instability or DSB misses on any
+ * of the following CPUS:
+ * - Haswell
+ * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
+ * - Skylake
+ *
+ * Alignment is done for each of the three major decompression loops:
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
+ * - ZSTD_decompressSequences_body
+ * Alignment choices are made to minimize large swings on bad cases and influence on performance
+ * from changes external to this code, rather than to overoptimize on the current commit.
+ *
+ * If you are seeing performance stability this script can help test.
+ * It tests on 4 commits in zstd where I saw performance change.
+ *
+ * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
+ */
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __asm__(".p2align 6");
+ # if __GNUC__ >= 7
+ /* good for gcc-7, gcc-9, and gcc-11 */
+ __asm__("nop");
+ __asm__(".p2align 5");
+ __asm__("nop");
+ __asm__(".p2align 4");
+ # if __GNUC__ == 8 || __GNUC__ == 10
+ /* good for gcc-8 and gcc-10 */
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # endif
+ # endif
+ #endif
+
+ /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
+ for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) {
+ size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
  DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
  op += oneSeqSize;
- } }
+ if (UNLIKELY(!--nbSeq))
+ break;
+ BIT_reloadDStream(&(seqState.DStream));
+ sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ }
+
+ /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */
+ if (nbSeq > 0) {
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+ if (leftoverLit)
+ {
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+ sequence.litLength -= leftoverLit;
+ op += leftoverLit;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ {
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ op += oneSeqSize;
+ if (--nbSeq)
+ BIT_reloadDStream(&(seqState.DStream));
+ }
+ }
+ }
+
+ if (nbSeq > 0) /* there is remaining lit from extra buffer */
+ {
+
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __asm__(".p2align 6");
+ __asm__("nop");
+ # if __GNUC__ != 7
+ /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */
+ __asm__(".p2align 4");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # elif __GNUC__ >= 11
+ __asm__(".p2align 3");
+ # else
+ __asm__(".p2align 5");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # endif
+ #endif
+
+ for (; ; ) {
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ op += oneSeqSize;
+ if (UNLIKELY(!--nbSeq))
+ break;
+ BIT_reloadDStream(&(seqState.DStream));
+ }
+ }
+
+ /* check if reached exact end */
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
+ RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+ RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
+ /* save reps for next block */
+ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+ }
+
+ /* last literal segment */
+ if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
+ {
+ size_t const lastLLSize = litBufferEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memmove(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ }
+ { size_t const lastLLSize = litBufferEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ }
+
+ return op-ostart;
+ }
+
+ FORCE_INLINE_TEMPLATE size_t
+ DONT_VECTORIZE
+ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+ {
+ const BYTE* ip = (const BYTE*)seqStart;
+ const BYTE* const iend = ip + seqSize;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;
+ BYTE* op = ostart;
+ const BYTE* litPtr = dctx->litPtr;
+ const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
+ const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
+ const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
+ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
+ (void)frame;
+
+ /* Regen sequences */
+ if (nbSeq) {
+ seqState_t seqState;
+ dctx->fseEntropy = 1;
+ { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+ RETURN_ERROR_IF(
+ ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
+ corruption_detected, "");
+ ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+ ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+ ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+ assert(dst != NULL);
+
+ ZSTD_STATIC_ASSERT(
+ BIT_DStream_unfinished < BIT_DStream_completed &&
+ BIT_DStream_endOfBuffer < BIT_DStream_completed &&
+ BIT_DStream_completed < BIT_DStream_overflow);
+
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __asm__(".p2align 6");
+ __asm__("nop");
+ # if __GNUC__ >= 7
+ __asm__(".p2align 5");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # else
+ __asm__(".p2align 4");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # endif
+ #endif
+
+ for ( ; ; ) {
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ op += oneSeqSize;
+ if (UNLIKELY(!--nbSeq))
+ break;
+ BIT_reloadDStream(&(seqState.DStream));
+ }

  /* check if reached exact end */
  DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
- RETURN_ERROR_IF(nbSeq, corruption_detected);
- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected);
+ RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+ RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
  /* save reps for next block */
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
  }

  /* last literal segment */
  { size_t const lastLLSize = litEnd - litPtr;
- RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall);
- memcpy(op, litPtr, lastLLSize);
- op += lastLLSize;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
  }

  return op-ostart;
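Taken together, the new ZSTD_decompressSequences_bodySplitLitBuffer() runs in three phases. While each sequence's literals still fit in the first segment (literals parked at the tail of dst), it calls ZSTD_execSequenceSplitLitBuffer() and passes litPtr + sequence.litLength - WILDCOPY_OVERLENGTH as an extra bound so the wildcopy fast path cannot overrun the not-yet-consumed literals. When a sequence would cross the split, the leftover literals are flushed into dst with ZSTD_safecopyDstBeforeSrc() and litPtr is repointed at dctx->litExtraBuffer. From there the remaining sequences run through the plain ZSTD_execSequence() loop. The companion ZSTD_decompressSequences_body() added at the end of the hunk is the never-split variant; note how it instead caps oend at dctx->litBuffer whenever the literal buffer lives inside dst.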
@@ -959,157 +1656,180 @@ static size_t
  ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
- #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

-
-
- #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
- FORCE_INLINE_TEMPLATE seq_t
- ZSTD_decodeSequenceLong(seqState_t* seqState, ZSTD_longOffset_e const longOffsets)
+ static size_t
+ ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- seq_t seq;
- U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
- U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
- U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
- U32 const totalBits = llBits+mlBits+ofBits;
- U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
- U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
- U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
-
- /* sequence */
- { size_t offset;
- if (!ofBits)
- offset = 0;
- else {
- ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
- ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
- assert(ofBits <= MaxOff);
- if (MEM_32bits() && longOffsets) {
- U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1);
- offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
- if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream);
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
- } else {
- offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
- }
- }
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+ #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

- if (ofBits <= 1) {
- offset += (llBase==0);
- if (offset) {
- size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
- if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
- seqState->prevOffset[1] = seqState->prevOffset[0];
- seqState->prevOffset[0] = offset = temp;
- } else {
- offset = seqState->prevOffset[0];
- }
- } else {
- seqState->prevOffset[2] = seqState->prevOffset[1];
- seqState->prevOffset[1] = seqState->prevOffset[0];
- seqState->prevOffset[0] = offset;
- }
- seq.offset = offset;
- }
+ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT

- seq.matchLength = mlBase + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */
- if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
- BIT_reloadDStream(&seqState->DStream);
- if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
- BIT_reloadDStream(&seqState->DStream);
- /* Verify that there is enough bits to read the rest of the data in 64-bit mode. */
- ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
-
- seq.litLength = llBase + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */
- if (MEM_32bits())
- BIT_reloadDStream(&seqState->DStream);
-
- { size_t const pos = seqState->pos + seq.litLength;
- const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
- seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
- * No consequence though : no memory access will occur, overly large offset will be detected in ZSTD_execSequenceLong() */
- seqState->pos = pos + seq.matchLength;
+ FORCE_INLINE_TEMPLATE size_t
+ ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
+ const BYTE* const prefixStart, const BYTE* const dictEnd)
+ {
+ prefetchPos += sequence.litLength;
+ { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
+ const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
+ * No consequence though : memory address is only used for prefetching, not for dereferencing */
+ PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
  }
-
- /* ANS state update */
- ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
- ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
- ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
-
- return seq;
+ return prefetchPos + sequence.matchLength;
  }

+ /* This decoding function employs prefetching
+ * to reduce latency impact of cache misses.
+ * It's generally employed when block contains a significant portion of long-distance matches
+ * or when coupled with a "cold" dictionary */
  FORCE_INLINE_TEMPLATE size_t
  ZSTD_decompressSequencesLong_body(
  ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
  const BYTE* ip = (const BYTE*)seqStart;
  const BYTE* const iend = ip + seqSize;
- BYTE* const ostart = (BYTE* const)dst;
- BYTE* const oend = ostart + maxDstSize;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;
  BYTE* op = ostart;
  const BYTE* litPtr = dctx->litPtr;
- const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
  const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+ (void)frame;

  /* Regen sequences */
  if (nbSeq) {
- #define STORED_SEQS 4
+ #define STORED_SEQS 8
  #define STORED_SEQS_MASK (STORED_SEQS-1)
- #define ADVANCED_SEQS 4
+ #define ADVANCED_SEQS STORED_SEQS
  seq_t sequences[STORED_SEQS];
  int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
  seqState_t seqState;
  int seqNb;
+ size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */
+
  dctx->fseEntropy = 1;
  { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
- seqState.prefixStart = prefixStart;
- seqState.pos = (size_t)(op-prefixStart);
- seqState.dictEnd = dictEnd;
+ assert(dst != NULL);
  assert(iend >= ip);
  RETURN_ERROR_IF(
  ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
- corruption_detected);
+ corruption_detected, "");
  ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
  ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
  ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);

  /* prepare in advance */
  for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
- sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
- PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+ sequences[seqNb] = sequence;
  }
- RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected);
-
- /* decode and decompress */
- for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
- seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
- PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
- sequences[seqNb & STORED_SEQS_MASK] = sequence;
- op += oneSeqSize;
+ RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
+
+ /* decompress without stomping litBuffer */
+ for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) {
+ seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ size_t oneSeqSize;
+
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd)
+ {
+ /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+ if (leftoverLit)
+ {
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+ sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;
+ op += leftoverLit;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
+ op += oneSeqSize;
+ }
+ else
+ {
+ /* lit buffer is either wholly contained in first or second split, or not split at all*/
+ oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+ ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
+ op += oneSeqSize;
+ }
  }
- RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected);
+ RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");

  /* finish queue */
  seqNb -= seqAdvance;
  for ( ; seqNb<nbSeq ; seqNb++) {
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
- op += oneSeqSize;
+ seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)
+ {
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+ if (leftoverLit)
+ {
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+ sequence->litLength -= leftoverLit;
+ op += leftoverLit;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ {
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ op += oneSeqSize;
+ }
+ }
+ else
+ {
+ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+ ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ op += oneSeqSize;
+ }
  }

  /* save reps for next block */
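The restructured long decoder above keeps a ring of STORED_SEQS = 8 decoded sequences (up from 4) and executes each one ADVANCED_SEQS iterations after it was decoded, so the L1 prefetches issued by ZSTD_prefetchMatch() have time to land before ZSTD_execSequence() touches the match. The pipeline shape, reduced to a sketch with hypothetical decode()/execute() stand-ins and a simplified seq_t:

    typedef struct { size_t litLength, matchLength, offset; } seq_sketch_t;
    #define RING_SEQS 8                     /* STORED_SEQS: ring size, a power of two */
    #define RING_MASK (RING_SEQS - 1)

    static void pipelineSketch(int nbSeq,
                               seq_sketch_t (*decode)(void),
                               void (*execute)(seq_sketch_t))
    {
        seq_sketch_t ring[RING_SEQS];
        int const lead = (nbSeq < RING_SEQS) ? nbSeq : RING_SEQS;  /* seqAdvance */
        int n;
        for (n = 0; n < lead; n++)
            ring[n] = decode();             /* fill: prefetches issued, nothing executed */
        for (; n < nbSeq; n++) {            /* steady state: execute lags decode by 8 */
            execute(ring[(n - RING_SEQS) & RING_MASK]);
            ring[n & RING_MASK] = decode();
        }
        for (n -= lead; n < nbSeq; n++)     /* finish queue */
            execute(ring[n & RING_MASK]);
    }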
@@ -1117,10 +1837,23 @@ ZSTD_decompressSequencesLong_body(
  }

  /* last literal segment */
- { size_t const lastLLSize = litEnd - litPtr;
- RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall);
- memcpy(op, litPtr, lastLLSize);
- op += lastLLSize;
+ if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */
+ {
+ size_t const lastLLSize = litBufferEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memmove(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ }
+ { size_t const lastLLSize = litBufferEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memmove(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
  }

  return op-ostart;
@@ -1130,9 +1863,10 @@ static size_t
  ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

@@ -1141,25 +1875,37 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
  #if DYNAMIC_BMI2

  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
- static TARGET_ATTRIBUTE("bmi2") size_t
+ static BMI2_TARGET_ATTRIBUTE size_t
  DONT_VECTORIZE
  ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+ static BMI2_TARGET_ATTRIBUTE size_t
+ DONT_VECTORIZE
+ ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+ {
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
- static TARGET_ATTRIBUTE("bmi2") size_t
+ static BMI2_TARGET_ATTRIBUTE size_t
  ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

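The bmi2 variants above now use BMI2_TARGET_ATTRIBUTE, a macro from the new common/portability_macros.h (see the file list), instead of spelling out TARGET_ATTRIBUTE("bmi2") at each site. The dispatch idiom is unchanged: compile the same body twice, once with the BMI2 target attribute, and branch once at runtime on a cached CPUID probe. A rough sketch of the pattern (the macro below is illustrative, not the library's exact definition; newer zstd may target "arch=x86-64-v3" instead of "bmi2"):

    #if defined(__GNUC__) && defined(__x86_64__)
    #  define BMI2_ATTR_SKETCH __attribute__((__target__("bmi2")))
    #else
    #  define BMI2_ATTR_SKETCH
    #endif

    static BMI2_ATTR_SKETCH size_t decodeKernel_bmi2(const void* src) { (void)src; return 0; }
    static size_t decodeKernel_default(const void* src)               { (void)src; return 0; }

    /* One runtime branch, driven by a CPUID probe cached in the context,
     * selects the specialized build of the otherwise identical body. */
    static size_t decodeKernel(int cpuHasBmi2, const void* src)
    {
        return cpuHasBmi2 ? decodeKernel_bmi2(src) : decodeKernel_default(src);
    }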
@@ -1169,21 +1915,37 @@ typedef size_t (*ZSTD_decompressSequences_t)(
  ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset);
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame);

  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
  static size_t
  ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
  DEBUGLOG(5, "ZSTD_decompressSequences");
  #if DYNAMIC_BMI2
- if (dctx->bmi2) {
- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
+ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+ #endif
+ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+ static size_t
+ ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+ {
+ DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
+ #if DYNAMIC_BMI2
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
+ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif
- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

@@ -1198,69 +1960,115 @@ static size_t
  ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
- const ZSTD_longOffset_e isLongOffset)
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
  {
  DEBUGLOG(5, "ZSTD_decompressSequencesLong");
  #if DYNAMIC_BMI2
- if (dctx->bmi2) {
- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
+ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif
- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */


+ /**
+ * @returns The total size of the history referenceable by zstd, including
+ * both the prefix and the extDict. At @p op any offset larger than this
+ * is invalid.
+ */
+ static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
+ {
+ return (size_t)(op - virtualStart);
+ }

- #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
- /* ZSTD_getLongOffsetsShare() :
+ typedef struct {
+ unsigned longOffsetShare;
+ unsigned maxNbAdditionalBits;
+ } ZSTD_OffsetInfo;
+
+ /* ZSTD_getOffsetInfo() :
  * condition : offTable must be valid
  * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
- * compared to maximum possible of (1<<OffFSELog) */
- static unsigned
- ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
+ * compared to maximum possible of (1<<OffFSELog),
+ * as well as the maximum number additional bits required.
+ */
+ static ZSTD_OffsetInfo
+ ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
  {
- const void* ptr = offTable;
- U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
- const ZSTD_seqSymbol* table = offTable + 1;
- U32 const max = 1 << tableLog;
- U32 u, total = 0;
- DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
-
- assert(max <= (1 << OffFSELog)); /* max not too large */
- for (u=0; u<max; u++) {
- if (table[u].nbAdditionalBits > 22) total += 1;
- }
+ ZSTD_OffsetInfo info = {0, 0};
+ /* If nbSeq == 0, then the offTable is uninitialized, but we have
+ * no sequences, so both values should be 0.
+ */
+ if (nbSeq != 0) {
+ const void* ptr = offTable;
+ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
+ const ZSTD_seqSymbol* table = offTable + 1;
+ U32 const max = 1 << tableLog;
+ U32 u;
+ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
+
+ assert(max <= (1 << OffFSELog)); /* max not too large */
+ for (u=0; u<max; u++) {
+ info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
+ if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
+ }

- assert(tableLog <= OffFSELog);
- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */
+ assert(tableLog <= OffFSELog);
+ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */
+ }

- return total;
+ return info;
  }
- #endif

+ /**
+ * @returns The maximum offset we can decode in one read of our bitstream, without
+ * reloading more bits in the middle of the offset bits read. Any offsets larger
+ * than this must use the long offset decoder.
+ */
+ static size_t ZSTD_maxShortOffset(void)
+ {
+ if (MEM_64bits()) {
+ /* We can decode any offset without reloading bits.
+ * This might change if the max window size grows.
+ */
+ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+ return (size_t)-1;
+ } else {
+ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
+ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
+ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
+ */
+ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
+ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
+ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
+ return maxOffset;
+ }
+ }

  size_t
  ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  void* dst, size_t dstCapacity,
- const void* src, size_t srcSize, const int frame)
+ const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
  { /* blockType == blockCompressed */
  const BYTE* ip = (const BYTE*)src;
- /* isLongOffset must be true if there are long offsets.
- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
- * We don't expect that to be the case in 64-bit mode.
- * In block mode, window size is not known, so we have to be conservative.
- * (note: but it could be evaluated from current-lowLimit)
- */
- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
  DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);

- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong);
+ /* Note : the wording of the specification
+ * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX.
+ * This generally does not happen, as it makes little sense,
+ * since an uncompressed block would feature same size and have no decompression cost.
+ * Also, note that decoder from reference libzstd before < v1.5.4
+ * would consider this edge case as an error.
+ * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX
+ * for broader compatibility with the deployed ecosystem of zstd decoders */
+ RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");

  /* Decode literals section */
- { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
+ { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
  if (ZSTD_isError(litCSize)) return litCSize;
  ip += litCSize;
  srcSize -= litCSize;
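To put numbers on the new ZSTD_maxShortOffset(): on 32-bit targets STREAM_ACCUMULATOR_MIN is STREAM_ACCUMULATOR_MIN_32, which bitstream.h has long pinned at 25 (treat that exact constant as an assumption of this note rather than something the hunk states). The formula then gives maxOffbase = (1 << 26) - 1 = 67,108,863, and subtracting ZSTD_REP_NUM = 3 yields a maximum short offset of 67,108,860, just under a 64 MiB reach. Any block whose referenceable history, as measured by ZSTD_totalHistorySize(), extends farther back than that must be decoded on the long-offset path.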
@@ -1268,6 +2076,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,

  /* Build Decoding Tables */
  {
+ /* Compute the maximum block size, which must also work when !frame and fParams are unset.
+ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
+ */
+ size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX));
+ size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart);
+ /* isLongOffset must be true if there are long offsets.
+ * Offsets are long if they are larger than ZSTD_maxShortOffset().
+ * We don't expect that to be the case in 64-bit mode.
+ *
+ * We check here to see if our history is large enough to allow long offsets.
+ * If it isn't, then we can't possible have (valid) long offsets. If the offset
+ * is invalid, then it is okay to read it incorrectly.
+ *
+ * If isLongOffsets is true, then we will later check our decoding table to see
+ * if it is even possible to generate long offsets.
+ */
+ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
  /* These macros control at build-time which decompressor implementation
  * we use. If neither is defined, we do some inspection and dispatch at
  * runtime.
@@ -1275,6 +2100,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
  !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
  int usePrefetchDecoder = dctx->ddictIsCold;
+ #else
+ /* Set to 1 to avoid computing offset info if we don't need to.
+ * Otherwise this value is ignored.
+ */
+ int usePrefetchDecoder = 1;
  #endif
  int nbSeq;
  size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
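A note on the heuristic in the next hunk: ZSTD_getOffsetInfo() scales longOffsetShare to 1 << OffFSELog = 256 table slots, so the thresholds minShare = 7 (64-bit) and minShare = 20 (32-bit) correspond to 7/256 ≈ 2.73% and 20/256 ≈ 7.81% of offset states needing more than 22 extra bits, which is exactly where the percentages quoted in the code comment come from.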
@@ -1282,42 +2112,81 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  ip += seqHSize;
  srcSize -= seqHSize;

- #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
- if ( !usePrefetchDecoder
- && (!frame || (dctx->fParams.windowSize > (1<<24)))
- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */
- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
- usePrefetchDecoder = (shareLongOffsets >= minShare);
+ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall,
+ "invalid dst");
+
+ /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
+ * compute information about the share of long offsets, and the maximum nbAdditionalBits.
+ * NOTE: could probably use a larger nbSeq limit
+ */
+ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
+ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
+ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
+ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
+ * enough, then we know it is impossible to have too long an offset in this block, so we can
+ * use the regular offset decoder.
+ */
+ isLongOffset = ZSTD_lo_isRegularOffset;
+ }
+ if (!usePrefetchDecoder) {
+ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+ usePrefetchDecoder = (info.longOffsetShare >= minShare);
+ }
  }
- #endif

  dctx->ddictIsCold = 0;

  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
  !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
- if (usePrefetchDecoder)
+ if (usePrefetchDecoder) {
+ #else
+ (void)usePrefetchDecoder;
+ {
  #endif
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
  #endif
+ }

  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
  /* else */
- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+ if (dctx->litBufferLocation == ZSTD_split)
+ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+ else
+ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
  #endif
  }
  }


- size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
- void* dst, size_t dstCapacity,
- const void* src, size_t srcSize)
+ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
+ {
+ if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */
+ dctx->dictEnd = dctx->previousDstEnd;
+ dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
+ dctx->prefixStart = dst;
+ dctx->previousDstEnd = dst;
+ }
+ }
+
+
+ size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
  {
  size_t dSize;
- ZSTD_checkContinuity(dctx, dst);
- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
+ ZSTD_checkContinuity(dctx, dst, dstCapacity);
+ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);
  dctx->previousDstEnd = (char*)dst + dSize;
  return dSize;
  }
+
+
+ /* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */
+ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+ {
+ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize);
+ }
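ZSTD_decompressBlock() itself is now a thin wrapper over ZSTD_decompressBlock_deprecated(), so the externally visible block-level contract is unchanged. For reference, a minimal caller looks roughly like this (a sketch using the block-level API from zstd.h's static-linking-only section; error handling abbreviated):

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Sketch: decompress one raw compressed block into dst. Block mode has no
     * frame header; ZSTD_decompressBegin() resets the context, and
     * ZSTD_checkContinuity() runs inside ZSTD_decompressBlock() itself. */
    size_t decompressOneBlockSketch(void* dst, size_t dstCapacity,
                                    const void* block, size_t blockSize)
    {
        ZSTD_DCtx* const dctx = ZSTD_createDCtx();
        size_t dSize = 0;
        if (dctx == NULL) return 0;
        if (!ZSTD_isError(ZSTD_decompressBegin(dctx)))
            dSize = ZSTD_decompressBlock(dctx, dst, dstCapacity, block, blockSize);
        ZSTD_freeDCtx(dctx);
        return ZSTD_isError(dSize) ? 0 : dSize;
    }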