extzstd 0.3.2 → 0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +4 -3
  3. data/contrib/zstd/CHANGELOG +225 -1
  4. data/contrib/zstd/CONTRIBUTING.md +158 -75
  5. data/contrib/zstd/LICENSE +4 -4
  6. data/contrib/zstd/Makefile +106 -69
  7. data/contrib/zstd/Package.swift +36 -0
  8. data/contrib/zstd/README.md +64 -36
  9. data/contrib/zstd/SECURITY.md +15 -0
  10. data/contrib/zstd/TESTING.md +2 -3
  11. data/contrib/zstd/lib/BUCK +5 -7
  12. data/contrib/zstd/lib/Makefile +117 -199
  13. data/contrib/zstd/lib/README.md +37 -7
  14. data/contrib/zstd/lib/common/allocations.h +55 -0
  15. data/contrib/zstd/lib/common/bits.h +200 -0
  16. data/contrib/zstd/lib/common/bitstream.h +80 -86
  17. data/contrib/zstd/lib/common/compiler.h +225 -63
  18. data/contrib/zstd/lib/common/cpu.h +37 -1
  19. data/contrib/zstd/lib/common/debug.c +7 -1
  20. data/contrib/zstd/lib/common/debug.h +21 -12
  21. data/contrib/zstd/lib/common/entropy_common.c +15 -37
  22. data/contrib/zstd/lib/common/error_private.c +9 -2
  23. data/contrib/zstd/lib/common/error_private.h +93 -5
  24. data/contrib/zstd/lib/common/fse.h +12 -87
  25. data/contrib/zstd/lib/common/fse_decompress.c +37 -117
  26. data/contrib/zstd/lib/common/huf.h +97 -172
  27. data/contrib/zstd/lib/common/mem.h +58 -58
  28. data/contrib/zstd/lib/common/pool.c +38 -17
  29. data/contrib/zstd/lib/common/pool.h +10 -4
  30. data/contrib/zstd/lib/common/portability_macros.h +158 -0
  31. data/contrib/zstd/lib/common/threading.c +74 -14
  32. data/contrib/zstd/lib/common/threading.h +5 -10
  33. data/contrib/zstd/lib/common/xxhash.c +6 -814
  34. data/contrib/zstd/lib/common/xxhash.h +6930 -195
  35. data/contrib/zstd/lib/common/zstd_common.c +1 -36
  36. data/contrib/zstd/lib/common/zstd_deps.h +1 -1
  37. data/contrib/zstd/lib/common/zstd_internal.h +68 -154
  38. data/contrib/zstd/lib/common/zstd_trace.h +163 -0
  39. data/contrib/zstd/lib/compress/clevels.h +134 -0
  40. data/contrib/zstd/lib/compress/fse_compress.c +75 -155
  41. data/contrib/zstd/lib/compress/hist.c +1 -1
  42. data/contrib/zstd/lib/compress/hist.h +1 -1
  43. data/contrib/zstd/lib/compress/huf_compress.c +810 -259
  44. data/contrib/zstd/lib/compress/zstd_compress.c +2864 -919
  45. data/contrib/zstd/lib/compress/zstd_compress_internal.h +523 -192
  46. data/contrib/zstd/lib/compress/zstd_compress_literals.c +117 -40
  47. data/contrib/zstd/lib/compress/zstd_compress_literals.h +16 -6
  48. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +28 -19
  49. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +1 -1
  50. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +251 -412
  51. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +1 -1
  52. data/contrib/zstd/lib/compress/zstd_cwksp.h +284 -97
  53. data/contrib/zstd/lib/compress/zstd_double_fast.c +382 -133
  54. data/contrib/zstd/lib/compress/zstd_double_fast.h +14 -2
  55. data/contrib/zstd/lib/compress/zstd_fast.c +732 -260
  56. data/contrib/zstd/lib/compress/zstd_fast.h +3 -2
  57. data/contrib/zstd/lib/compress/zstd_lazy.c +1177 -390
  58. data/contrib/zstd/lib/compress/zstd_lazy.h +129 -14
  59. data/contrib/zstd/lib/compress/zstd_ldm.c +280 -210
  60. data/contrib/zstd/lib/compress/zstd_ldm.h +3 -2
  61. data/contrib/zstd/lib/compress/zstd_ldm_geartab.h +106 -0
  62. data/contrib/zstd/lib/compress/zstd_opt.c +516 -285
  63. data/contrib/zstd/lib/compress/zstd_opt.h +32 -8
  64. data/contrib/zstd/lib/compress/zstdmt_compress.c +202 -131
  65. data/contrib/zstd/lib/compress/zstdmt_compress.h +9 -6
  66. data/contrib/zstd/lib/decompress/huf_decompress.c +1149 -555
  67. data/contrib/zstd/lib/decompress/huf_decompress_amd64.S +595 -0
  68. data/contrib/zstd/lib/decompress/zstd_ddict.c +4 -4
  69. data/contrib/zstd/lib/decompress/zstd_ddict.h +1 -1
  70. data/contrib/zstd/lib/decompress/zstd_decompress.c +583 -106
  71. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1054 -379
  72. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +14 -3
  73. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +56 -6
  74. data/contrib/zstd/lib/deprecated/zbuff.h +1 -1
  75. data/contrib/zstd/lib/deprecated/zbuff_common.c +1 -1
  76. data/contrib/zstd/lib/deprecated/zbuff_compress.c +24 -4
  77. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +3 -1
  78. data/contrib/zstd/lib/dictBuilder/cover.c +60 -44
  79. data/contrib/zstd/lib/dictBuilder/cover.h +6 -11
  80. data/contrib/zstd/lib/dictBuilder/divsufsort.c +1 -1
  81. data/contrib/zstd/lib/dictBuilder/fastcover.c +26 -18
  82. data/contrib/zstd/lib/dictBuilder/zdict.c +100 -101
  83. data/contrib/zstd/lib/legacy/zstd_legacy.h +38 -1
  84. data/contrib/zstd/lib/legacy/zstd_v01.c +18 -53
  85. data/contrib/zstd/lib/legacy/zstd_v01.h +1 -1
  86. data/contrib/zstd/lib/legacy/zstd_v02.c +28 -85
  87. data/contrib/zstd/lib/legacy/zstd_v02.h +1 -1
  88. data/contrib/zstd/lib/legacy/zstd_v03.c +29 -88
  89. data/contrib/zstd/lib/legacy/zstd_v03.h +1 -1
  90. data/contrib/zstd/lib/legacy/zstd_v04.c +27 -80
  91. data/contrib/zstd/lib/legacy/zstd_v04.h +1 -1
  92. data/contrib/zstd/lib/legacy/zstd_v05.c +36 -85
  93. data/contrib/zstd/lib/legacy/zstd_v05.h +1 -1
  94. data/contrib/zstd/lib/legacy/zstd_v06.c +44 -96
  95. data/contrib/zstd/lib/legacy/zstd_v06.h +1 -1
  96. data/contrib/zstd/lib/legacy/zstd_v07.c +37 -92
  97. data/contrib/zstd/lib/legacy/zstd_v07.h +1 -1
  98. data/contrib/zstd/lib/libzstd.mk +237 -0
  99. data/contrib/zstd/lib/libzstd.pc.in +4 -3
  100. data/contrib/zstd/lib/module.modulemap +35 -0
  101. data/contrib/zstd/lib/{dictBuilder/zdict.h → zdict.h} +202 -33
  102. data/contrib/zstd/lib/zstd.h +1030 -332
  103. data/contrib/zstd/lib/{common/zstd_errors.h → zstd_errors.h} +27 -8
  104. data/ext/extconf.rb +26 -7
  105. data/ext/extzstd.c +51 -24
  106. data/ext/extzstd.h +33 -6
  107. data/ext/extzstd_stream.c +74 -31
  108. data/ext/libzstd_conf.h +0 -1
  109. data/ext/zstd_decompress_asm.S +1 -0
  110. metadata +17 -7
  111. data/contrib/zstd/appveyor.yml +0 -292
  112. data/ext/depend +0 -2
@@ -1,7 +1,7 @@
1
1
  /* ******************************************************************
2
2
  * huff0 huffman decoder,
3
3
  * part of Finite State Entropy library
4
- * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
4
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
5
5
  *
6
6
  * You can contact the author at :
7
7
  * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -19,14 +19,27 @@
19
19
  #include "../common/compiler.h"
20
20
  #include "../common/bitstream.h" /* BIT_* */
21
21
  #include "../common/fse.h" /* to compress headers */
22
- #define HUF_STATIC_LINKING_ONLY
23
22
  #include "../common/huf.h"
24
23
  #include "../common/error_private.h"
24
+ #include "../common/zstd_internal.h"
25
+ #include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
26
+
27
+ /* **************************************************************
28
+ * Constants
29
+ ****************************************************************/
30
+
31
+ #define HUF_DECODER_FAST_TABLELOG 11
25
32
 
26
33
  /* **************************************************************
27
34
  * Macros
28
35
  ****************************************************************/
29
36
 
37
+ #ifdef HUF_DISABLE_FAST_DECODE
38
+ # define HUF_ENABLE_FAST_DECODE 0
39
+ #else
40
+ # define HUF_ENABLE_FAST_DECODE 1
41
+ #endif
42
+
30
43
  /* These two optional macros force the use of one or the other of the two
31
44
  * Huffman decompression implementations. You can't force in both directions
32
45
  * at the same time.
@@ -36,6 +49,28 @@
36
49
  #error "Cannot force the use of the X1 and X2 decoders at the same time!"
37
50
  #endif
38
51
 
52
+ /* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
53
+ * supported at runtime, so we can add the BMI2 target attribute.
54
+ * When it is disabled, we will still get BMI2 if it is enabled statically.
55
+ */
56
+ #if DYNAMIC_BMI2
57
+ # define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
58
+ #else
59
+ # define HUF_FAST_BMI2_ATTRS
60
+ #endif
61
+
62
+ #ifdef __cplusplus
63
+ # define HUF_EXTERN_C extern "C"
64
+ #else
65
+ # define HUF_EXTERN_C
66
+ #endif
67
+ #define HUF_ASM_DECL HUF_EXTERN_C
68
+
69
+ #if DYNAMIC_BMI2
70
+ # define HUF_NEED_BMI2_FUNCTION 1
71
+ #else
72
+ # define HUF_NEED_BMI2_FUNCTION 0
73
+ #endif
39
74
 
40
75
  /* **************************************************************
41
76
  * Error Management
@@ -53,6 +88,11 @@
53
88
  /* **************************************************************
54
89
  * BMI2 Variant Wrappers
55
90
  ****************************************************************/
91
+ typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
92
+ const void *cSrc,
93
+ size_t cSrcSize,
94
+ const HUF_DTable *DTable);
95
+
56
96
  #if DYNAMIC_BMI2
57
97
 
58
98
  #define HUF_DGEN(fn) \
@@ -65,7 +105,7 @@
65
105
  return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
66
106
  } \
67
107
  \
68
- static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \
108
+ static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \
69
109
  void* dst, size_t dstSize, \
70
110
  const void* cSrc, size_t cSrcSize, \
71
111
  const HUF_DTable* DTable) \
@@ -74,9 +114,9 @@
74
114
  } \
75
115
  \
76
116
  static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
77
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
117
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
78
118
  { \
79
- if (bmi2) { \
119
+ if (flags & HUF_flags_bmi2) { \
80
120
  return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
81
121
  } \
82
122
  return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
@@ -86,9 +126,9 @@
86
126
 
87
127
  #define HUF_DGEN(fn) \
88
128
  static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
89
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
129
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
90
130
  { \
91
- (void)bmi2; \
131
+ (void)flags; \
92
132
  return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
93
133
  }
94
134
 
@@ -107,13 +147,186 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
107
147
  return dtd;
108
148
  }
109
149
 
150
+ static size_t HUF_initFastDStream(BYTE const* ip) {
151
+ BYTE const lastByte = ip[7];
152
+ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
153
+ size_t const value = MEM_readLEST(ip) | 1;
154
+ assert(bitsConsumed <= 8);
155
+ assert(sizeof(size_t) == 8);
156
+ return value << bitsConsumed;
157
+ }
158
+
159
+
160
+ /**
161
+ * The input/output arguments to the Huffman fast decoding loop:
162
+ *
163
+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
164
+ * op [in/out] - The output pointers, must be updated to reflect what is written.
165
+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
166
+ * dt [in] - The decoding table.
167
+ * ilowest [in] - The beginning of the valid range of the input. Decoders may read
168
+ * down to this pointer. It may be below iend[0].
169
+ * oend [in] - The end of the output stream. op[3] must not cross oend.
170
+ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
171
+ * as long as it is above ilowest, but that indicates corruption.
172
+ */
173
+ typedef struct {
174
+ BYTE const* ip[4];
175
+ BYTE* op[4];
176
+ U64 bits[4];
177
+ void const* dt;
178
+ BYTE const* ilowest;
179
+ BYTE* oend;
180
+ BYTE const* iend[4];
181
+ } HUF_DecompressFastArgs;
182
+
183
+ typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
184
+
185
+ /**
186
+ * Initializes args for the fast decoding loop.
187
+ * @returns 1 on success
188
+ * 0 if the fallback implementation should be used.
189
+ * Or an error code on failure.
190
+ */
191
+ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
192
+ {
193
+ void const* dt = DTable + 1;
194
+ U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
195
+
196
+ const BYTE* const istart = (const BYTE*)src;
197
+
198
+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
199
+
200
+ /* The fast decoding loop assumes 64-bit little-endian.
201
+ * This condition is false on x32.
202
+ */
203
+ if (!MEM_isLittleEndian() || MEM_32bits())
204
+ return 0;
205
+
206
+ /* Avoid nullptr addition */
207
+ if (dstSize == 0)
208
+ return 0;
209
+ assert(dst != NULL);
210
+
211
+ /* strict minimum : jump table + 1 byte per stream */
212
+ if (srcSize < 10)
213
+ return ERROR(corruption_detected);
214
+
215
+ /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers.
216
+ * If table log is not correct at this point, fall back to the old decoder.
217
+ * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
218
+ */
219
+ if (dtLog != HUF_DECODER_FAST_TABLELOG)
220
+ return 0;
221
+
222
+ /* Read the jump table. */
223
+ {
224
+ size_t const length1 = MEM_readLE16(istart);
225
+ size_t const length2 = MEM_readLE16(istart+2);
226
+ size_t const length3 = MEM_readLE16(istart+4);
227
+ size_t const length4 = srcSize - (length1 + length2 + length3 + 6);
228
+ args->iend[0] = istart + 6; /* jumpTable */
229
+ args->iend[1] = args->iend[0] + length1;
230
+ args->iend[2] = args->iend[1] + length2;
231
+ args->iend[3] = args->iend[2] + length3;
232
+
233
+ /* HUF_initFastDStream() requires this, and this small of an input
234
+ * won't benefit from the ASM loop anyways.
235
+ */
236
+ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8)
237
+ return 0;
238
+ if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
239
+ }
240
+ /* ip[] contains the position that is currently loaded into bits[]. */
241
+ args->ip[0] = args->iend[1] - sizeof(U64);
242
+ args->ip[1] = args->iend[2] - sizeof(U64);
243
+ args->ip[2] = args->iend[3] - sizeof(U64);
244
+ args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64);
245
+
246
+ /* op[] contains the output pointers. */
247
+ args->op[0] = (BYTE*)dst;
248
+ args->op[1] = args->op[0] + (dstSize+3)/4;
249
+ args->op[2] = args->op[1] + (dstSize+3)/4;
250
+ args->op[3] = args->op[2] + (dstSize+3)/4;
251
+
252
+ /* No point in calling the ASM loop for tiny outputs. */
253
+ if (args->op[3] >= oend)
254
+ return 0;
255
+
256
+ /* bits[] is the bit container.
257
+ * It is read from the MSB down to the LSB.
258
+ * It is shifted left as it is read, and zeros are
259
+ * shifted in. After the lowest valid bit a 1 is
260
+ * set, so that CountTrailingZeros(bits[]) can be used
261
+ * to count how many bits we've consumed.
262
+ */
263
+ args->bits[0] = HUF_initFastDStream(args->ip[0]);
264
+ args->bits[1] = HUF_initFastDStream(args->ip[1]);
265
+ args->bits[2] = HUF_initFastDStream(args->ip[2]);
266
+ args->bits[3] = HUF_initFastDStream(args->ip[3]);
267
+
268
+ /* The decoders must be sure to never read beyond ilowest.
269
+ * This is lower than iend[0], but allowing decoders to read
270
+ * down to ilowest can allow an extra iteration or two in the
271
+ * fast loop.
272
+ */
273
+ args->ilowest = istart;
274
+
275
+ args->oend = oend;
276
+ args->dt = dt;
277
+
278
+ return 1;
279
+ }
280
+
281
+ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
282
+ {
283
+ /* Validate that we haven't overwritten. */
284
+ if (args->op[stream] > segmentEnd)
285
+ return ERROR(corruption_detected);
286
+ /* Validate that we haven't read beyond iend[].
287
+ * Note that ip[] may be < iend[] because the MSB is
288
+ * the next bit to read, and we may have consumed 100%
289
+ * of the stream, so down to iend[i] - 8 is valid.
290
+ */
291
+ if (args->ip[stream] < args->iend[stream] - 8)
292
+ return ERROR(corruption_detected);
293
+
294
+ /* Construct the BIT_DStream_t. */
295
+ assert(sizeof(size_t) == 8);
296
+ bit->bitContainer = MEM_readLEST(args->ip[stream]);
297
+ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
298
+ bit->start = (const char*)args->ilowest;
299
+ bit->limitPtr = bit->start + sizeof(size_t);
300
+ bit->ptr = (const char*)args->ip[stream];
301
+
302
+ return 0;
303
+ }
304
+
305
+ /* Calls X(N) for each stream 0, 1, 2, 3. */
306
+ #define HUF_4X_FOR_EACH_STREAM(X) \
307
+ do { \
308
+ X(0); \
309
+ X(1); \
310
+ X(2); \
311
+ X(3); \
312
+ } while (0)
313
+
314
+ /* Calls X(N, var) for each stream 0, 1, 2, 3. */
315
+ #define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \
316
+ do { \
317
+ X(0, (var)); \
318
+ X(1, (var)); \
319
+ X(2, (var)); \
320
+ X(3, (var)); \
321
+ } while (0)
322
+
110
323
 
111
324
  #ifndef HUF_FORCE_DECOMPRESS_X2
112
325
 
113
326
  /*-***************************/
114
327
  /* single-symbol decoding */
115
328
  /*-***************************/
116
- typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */
329
+ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */
117
330
 
118
331
  /**
119
332
  * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at
@@ -122,14 +335,45 @@ typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decodi
122
335
  static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
123
336
  U64 D4;
124
337
  if (MEM_isLittleEndian()) {
125
- D4 = symbol + (nbBits << 8);
338
+ D4 = (U64)((symbol << 8) + nbBits);
126
339
  } else {
127
- D4 = (symbol << 8) + nbBits;
340
+ D4 = (U64)(symbol + (nbBits << 8));
128
341
  }
342
+ assert(D4 < (1U << 16));
129
343
  D4 *= 0x0001000100010001ULL;
130
344
  return D4;
131
345
  }
132
346
 
347
+ /**
348
+ * Increases the tableLog to targetTableLog and rescales the stats.
349
+ * If tableLog >= targetTableLog this is a no-op.
350
+ * @returns New tableLog
351
+ */
352
+ static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog)
353
+ {
354
+ if (tableLog > targetTableLog)
355
+ return tableLog;
356
+ if (tableLog < targetTableLog) {
357
+ U32 const scale = targetTableLog - tableLog;
358
+ U32 s;
359
+ /* Increase the weight for all non-zero probability symbols by scale. */
360
+ for (s = 0; s < nbSymbols; ++s) {
361
+ huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale);
362
+ }
363
+ /* Update rankVal to reflect the new weights.
364
+ * All weights except 0 get moved to weight + scale.
365
+ * Weights [1, scale] are empty.
366
+ */
367
+ for (s = targetTableLog; s > scale; --s) {
368
+ rankVal[s] = rankVal[s - scale];
369
+ }
370
+ for (s = scale; s > 0; --s) {
371
+ rankVal[s] = 0;
372
+ }
373
+ }
374
+ return targetTableLog;
375
+ }
376
+
133
377
  typedef struct {
134
378
  U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];
135
379
  U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1];
@@ -138,13 +382,7 @@ typedef struct {
138
382
  BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
139
383
  } HUF_ReadDTableX1_Workspace;
140
384
 
141
-
142
- size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
143
- {
144
- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
145
- }
146
-
147
- size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2)
385
+ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
148
386
  {
149
387
  U32 tableLog = 0;
150
388
  U32 nbSymbols = 0;
@@ -159,11 +397,15 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
159
397
  DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
160
398
  /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzers complain ... */
161
399
 
162
- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
400
+ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
163
401
  if (HUF_isError(iSize)) return iSize;
164
402
 
403
+
165
404
  /* Table header */
166
405
  { DTableDesc dtd = HUF_getDTableDesc(DTable);
406
+ U32 const maxTableLog = dtd.maxTableLog + 1;
407
+ U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG);
408
+ tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog);
167
409
  if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */
168
410
  dtd.tableType = 0;
169
411
  dtd.tableLog = (BYTE)tableLog;
@@ -182,9 +424,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
182
424
  * rankStart[0] is not filled because there are no entries in the table for
183
425
  * weight 0.
184
426
  */
185
- {
186
- int n;
187
- int nextRankStart = 0;
427
+ { int n;
428
+ U32 nextRankStart = 0;
188
429
  int const unroll = 4;
189
430
  int const nLimit = (int)nbSymbols - unroll + 1;
190
431
  for (n=0; n<(int)tableLog+1; n++) {
@@ -207,14 +448,13 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
207
448
 
208
449
  /* fill DTable
209
450
  * We fill all entries of each weight in order.
210
- * That way length is a constant for each iteration of the outter loop.
451
+ * That way length is a constant for each iteration of the outer loop.
211
452
  * We can switch based on the length to a different inner loop which is
212
453
  * optimized for that particular case.
213
454
  */
214
- {
215
- U32 w;
216
- int symbol=wksp->rankVal[0];
217
- int rankStart=0;
455
+ { U32 w;
456
+ int symbol = wksp->rankVal[0];
457
+ int rankStart = 0;
218
458
  for (w=1; w<tableLog+1; ++w) {
219
459
  int const symbolCount = wksp->rankVal[w];
220
460
  int const length = (1 << w) >> 1;
@@ -288,15 +528,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog
288
528
  }
289
529
 
290
530
  #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
291
- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
531
+ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0)
292
532
 
293
- #define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \
294
- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
295
- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
533
+ #define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \
534
+ do { \
535
+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
536
+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
537
+ } while (0)
296
538
 
297
- #define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
298
- if (MEM_64bits()) \
299
- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
539
+ #define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
540
+ do { \
541
+ if (MEM_64bits()) \
542
+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
543
+ } while (0)
300
544
 
301
545
  HINT_INLINE size_t
302
546
  HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
@@ -304,11 +548,15 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
304
548
  BYTE* const pStart = p;
305
549
 
306
550
  /* up to 4 symbols at a time */
307
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
308
- HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
309
- HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
310
- HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
311
- HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
551
+ if ((pEnd - p) > 3) {
552
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
553
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
554
+ HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
555
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
556
+ HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
557
+ }
558
+ } else {
559
+ BIT_reloadDStream(bitDPtr);
312
560
  }
313
561
 
314
562
  /* [0-3] symbols remaining */
@@ -320,7 +568,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
320
568
  while (p < pEnd)
321
569
  HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
322
570
 
323
- return pEnd-pStart;
571
+ return (size_t)(pEnd-pStart);
324
572
  }
325
573
 
326
574
  FORCE_INLINE_TEMPLATE size_t
@@ -330,7 +578,7 @@ HUF_decompress1X1_usingDTable_internal_body(
330
578
  const HUF_DTable* DTable)
331
579
  {
332
580
  BYTE* op = (BYTE*)dst;
333
- BYTE* const oend = op + dstSize;
581
+ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize);
334
582
  const void* dtPtr = DTable + 1;
335
583
  const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
336
584
  BIT_DStream_t bitD;
@@ -346,6 +594,10 @@ HUF_decompress1X1_usingDTable_internal_body(
346
594
  return dstSize;
347
595
  }
348
596
 
597
+ /* HUF_decompress4X1_usingDTable_internal_body():
598
+ * Conditions :
599
+ * @dstSize >= 6
600
+ */
349
601
  FORCE_INLINE_TEMPLATE size_t
350
602
  HUF_decompress4X1_usingDTable_internal_body(
351
603
  void* dst, size_t dstSize,
@@ -354,6 +606,7 @@ HUF_decompress4X1_usingDTable_internal_body(
354
606
  {
355
607
  /* Check */
356
608
  if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
609
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
357
610
 
358
611
  { const BYTE* const istart = (const BYTE*) cSrc;
359
612
  BYTE* const ostart = (BYTE*) dst;
@@ -388,33 +641,37 @@ HUF_decompress4X1_usingDTable_internal_body(
388
641
  U32 endSignal = 1;
389
642
 
390
643
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
644
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
645
+ assert(dstSize >= 6); /* validated above */
391
646
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
392
647
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
393
648
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
394
649
  CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
395
650
 
396
651
  /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
397
- for ( ; (endSignal) & (op4 < olimit) ; ) {
398
- HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
399
- HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
400
- HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
401
- HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
402
- HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
403
- HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
404
- HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
405
- HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
406
- HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
407
- HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
408
- HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
409
- HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
410
- HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
411
- HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
412
- HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
413
- HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
414
- endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
415
- endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
416
- endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
417
- endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
652
+ if ((size_t)(oend - op4) >= sizeof(size_t)) {
653
+ for ( ; (endSignal) & (op4 < olimit) ; ) {
654
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
655
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
656
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
657
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
658
+ HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
659
+ HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
660
+ HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
661
+ HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
662
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
663
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
664
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
665
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
666
+ HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
667
+ HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
668
+ HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
669
+ HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
670
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
671
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
672
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
673
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
674
+ }
418
675
  }
419
676
 
420
677
  /* check corruption */
@@ -440,74 +697,250 @@ HUF_decompress4X1_usingDTable_internal_body(
440
697
  }
441
698
  }
442
699
 
700
+ #if HUF_NEED_BMI2_FUNCTION
701
+ static BMI2_TARGET_ATTRIBUTE
702
+ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
703
+ size_t cSrcSize, HUF_DTable const* DTable) {
704
+ return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
705
+ }
706
+ #endif
443
707
 
444
- typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
445
- const void *cSrc,
446
- size_t cSrcSize,
447
- const HUF_DTable *DTable);
708
+ static
709
+ size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
710
+ size_t cSrcSize, HUF_DTable const* DTable) {
711
+ return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
712
+ }
448
713
 
449
- HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
450
- HUF_DGEN(HUF_decompress4X1_usingDTable_internal)
714
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
451
715
 
716
+ HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
452
717
 
718
+ #endif
453
719
 
454
- size_t HUF_decompress1X1_usingDTable(
455
- void* dst, size_t dstSize,
456
- const void* cSrc, size_t cSrcSize,
457
- const HUF_DTable* DTable)
720
+ static HUF_FAST_BMI2_ATTRS
721
+ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
458
722
  {
459
- DTableDesc dtd = HUF_getDTableDesc(DTable);
460
- if (dtd.tableType != 0) return ERROR(GENERIC);
461
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
723
+ U64 bits[4];
724
+ BYTE const* ip[4];
725
+ BYTE* op[4];
726
+ U16 const* const dtable = (U16 const*)args->dt;
727
+ BYTE* const oend = args->oend;
728
+ BYTE const* const ilowest = args->ilowest;
729
+
730
+ /* Copy the arguments to local variables */
731
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
732
+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
733
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
734
+
735
+ assert(MEM_isLittleEndian());
736
+ assert(!MEM_32bits());
737
+
738
+ for (;;) {
739
+ BYTE* olimit;
740
+ int stream;
741
+
742
+ /* Assert loop preconditions */
743
+ #ifndef NDEBUG
744
+ for (stream = 0; stream < 4; ++stream) {
745
+ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
746
+ assert(ip[stream] >= ilowest);
747
+ }
748
+ #endif
749
+ /* Compute olimit */
750
+ {
751
+ /* Each iteration produces 5 output symbols per stream */
752
+ size_t const oiters = (size_t)(oend - op[3]) / 5;
753
+ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
754
+ * per stream.
755
+ */
756
+ size_t const iiters = (size_t)(ip[0] - ilowest) / 7;
757
+ /* We can safely run iters iterations before running bounds checks */
758
+ size_t const iters = MIN(oiters, iiters);
759
+ size_t const symbols = iters * 5;
760
+
761
+ /* We can simply check that op[3] < olimit, instead of checking all
762
+ * of our bounds, since we can't hit the other bounds until we've run
763
+ * iters iterations, which only happens when op[3] == olimit.
764
+ */
765
+ olimit = op[3] + symbols;
766
+
767
+ /* Exit fast decoding loop once we reach the end. */
768
+ if (op[3] == olimit)
769
+ break;
770
+
771
+ /* Exit the decoding loop if any input pointer has crossed the
772
+ * previous one. This indicates corruption, and a precondition
773
+ * to our loop is that ip[i] >= ip[0].
774
+ */
775
+ for (stream = 1; stream < 4; ++stream) {
776
+ if (ip[stream] < ip[stream - 1])
777
+ goto _out;
778
+ }
779
+ }
780
+
781
+ #ifndef NDEBUG
782
+ for (stream = 1; stream < 4; ++stream) {
783
+ assert(ip[stream] >= ip[stream - 1]);
784
+ }
785
+ #endif
786
+
787
+ #define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \
788
+ do { \
789
+ int const index = (int)(bits[(_stream)] >> 53); \
790
+ int const entry = (int)dtable[index]; \
791
+ bits[(_stream)] <<= (entry & 0x3F); \
792
+ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
793
+ } while (0)
794
+
795
+ #define HUF_4X1_RELOAD_STREAM(_stream) \
796
+ do { \
797
+ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
798
+ int const nbBits = ctz & 7; \
799
+ int const nbBytes = ctz >> 3; \
800
+ op[(_stream)] += 5; \
801
+ ip[(_stream)] -= nbBytes; \
802
+ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
803
+ bits[(_stream)] <<= nbBits; \
804
+ } while (0)
805
+
806
+ /* Manually unroll the loop because compilers don't consistently
807
+ * unroll the inner loops, which destroys performance.
808
+ */
809
+ do {
810
+ /* Decode 5 symbols in each of the 4 streams */
811
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0);
812
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1);
813
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2);
814
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3);
815
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
816
+
817
+ /* Reload each of the 4 the bitstreams */
818
+ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
819
+ } while (op[3] < olimit);
820
+
821
+ #undef HUF_4X1_DECODE_SYMBOL
822
+ #undef HUF_4X1_RELOAD_STREAM
823
+ }
824
+
825
+ _out:
826
+
827
+ /* Save the final values of each of the state variables back to args. */
828
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
829
+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
830
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
462
831
  }
463
832
 
464
- size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
465
- const void* cSrc, size_t cSrcSize,
466
- void* workSpace, size_t wkspSize)
833
+ /**
834
+ * @returns @p dstSize on success (>= 6)
835
+ * 0 if the fallback implementation should be used
836
+ * An error if an error occurred
837
+ */
838
+ static HUF_FAST_BMI2_ATTRS
839
+ size_t
840
+ HUF_decompress4X1_usingDTable_internal_fast(
841
+ void* dst, size_t dstSize,
842
+ const void* cSrc, size_t cSrcSize,
843
+ const HUF_DTable* DTable,
844
+ HUF_DecompressFastLoopFn loopFn)
467
845
  {
468
- const BYTE* ip = (const BYTE*) cSrc;
846
+ void const* dt = DTable + 1;
847
+ BYTE const* const ilowest = (BYTE const*)cSrc;
848
+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
849
+ HUF_DecompressFastArgs args;
850
+ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
851
+ FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
852
+ if (ret == 0)
853
+ return 0;
854
+ }
469
855
 
470
- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
471
- if (HUF_isError(hSize)) return hSize;
472
- if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
473
- ip += hSize; cSrcSize -= hSize;
856
+ assert(args.ip[0] >= args.ilowest);
857
+ loopFn(&args);
858
+
859
+ /* Our loop guarantees that ip[] >= ilowest and that we haven't
860
+ * overwritten any op[].
861
+ */
862
+ assert(args.ip[0] >= ilowest);
863
+ assert(args.ip[0] >= ilowest);
864
+ assert(args.ip[1] >= ilowest);
865
+ assert(args.ip[2] >= ilowest);
866
+ assert(args.ip[3] >= ilowest);
867
+ assert(args.op[3] <= oend);
868
+
869
+ assert(ilowest == args.ilowest);
870
+ assert(ilowest + 6 == args.iend[0]);
871
+ (void)ilowest;
872
+
873
+ /* finish bit streams one by one. */
874
+ { size_t const segmentSize = (dstSize+3) / 4;
875
+ BYTE* segmentEnd = (BYTE*)dst;
876
+ int i;
877
+ for (i = 0; i < 4; ++i) {
878
+ BIT_DStream_t bit;
879
+ if (segmentSize <= (size_t)(oend - segmentEnd))
880
+ segmentEnd += segmentSize;
881
+ else
882
+ segmentEnd = oend;
883
+ FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
884
+ /* Decompress and validate that we've produced exactly the expected length. */
885
+ args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG);
886
+ if (args.op[i] != segmentEnd) return ERROR(corruption_detected);
887
+ }
888
+ }
474
889
 
475
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
890
+ /* decoded size */
891
+ assert(dstSize != 0);
892
+ return dstSize;
476
893
  }
477
894
 
895
+ HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
478
896
 
479
- size_t HUF_decompress4X1_usingDTable(
480
- void* dst, size_t dstSize,
481
- const void* cSrc, size_t cSrcSize,
482
- const HUF_DTable* DTable)
897
+ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
898
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
483
899
  {
484
- DTableDesc dtd = HUF_getDTableDesc(DTable);
485
- if (dtd.tableType != 0) return ERROR(GENERIC);
486
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
900
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
901
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
902
+
903
+ #if DYNAMIC_BMI2
904
+ if (flags & HUF_flags_bmi2) {
905
+ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
906
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
907
+ if (!(flags & HUF_flags_disableAsm)) {
908
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
909
+ }
910
+ # endif
911
+ } else {
912
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
913
+ }
914
+ #endif
915
+
916
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
917
+ if (!(flags & HUF_flags_disableAsm)) {
918
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
919
+ }
920
+ #endif
921
+
922
+ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
923
+ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
924
+ if (ret != 0)
925
+ return ret;
926
+ }
927
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
487
928
  }
488
929
 
489
- static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
930
+ static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
490
931
  const void* cSrc, size_t cSrcSize,
491
- void* workSpace, size_t wkspSize, int bmi2)
932
+ void* workSpace, size_t wkspSize, int flags)
492
933
  {
493
934
  const BYTE* ip = (const BYTE*) cSrc;
494
935
 
495
- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
936
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
496
937
  if (HUF_isError(hSize)) return hSize;
497
938
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
498
939
  ip += hSize; cSrcSize -= hSize;
499
940
 
500
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
941
+ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
501
942
  }
502
943
 
503
- size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
504
- const void* cSrc, size_t cSrcSize,
505
- void* workSpace, size_t wkspSize)
506
- {
507
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
508
- }
509
-
510
-
511
944
  #endif /* HUF_FORCE_DECOMPRESS_X2 */
512
945
 
513
946
 
@@ -518,188 +951,308 @@ size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
518
951
  /* *************************/
519
952
 
520
953
  typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */
521
- typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
954
+ typedef struct { BYTE symbol; } sortedSymbol_t;
522
955
  typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
523
956
  typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
524
957
 
958
+ /**
959
+ * Constructs a HUF_DEltX2 in a U32.
960
+ */
961
+ static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level)
962
+ {
963
+ U32 seq;
964
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0);
965
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2);
966
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3);
967
+ DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32));
968
+ if (MEM_isLittleEndian()) {
969
+ seq = level == 1 ? symbol : (baseSeq + (symbol << 8));
970
+ return seq + (nbBits << 16) + ((U32)level << 24);
971
+ } else {
972
+ seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol);
973
+ return (seq << 16) + (nbBits << 8) + (U32)level;
974
+ }
975
+ }
525
976
 
526
- /* HUF_fillDTableX2Level2() :
527
- * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
528
- static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed,
529
- const U32* rankValOrigin, const int minWeight,
530
- const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
531
- U32 nbBitsBaseline, U16 baseSeq)
977
+ /**
978
+ * Constructs a HUF_DEltX2.
979
+ */
980
+ static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level)
532
981
  {
533
982
  HUF_DEltX2 DElt;
534
- U32 rankVal[HUF_TABLELOG_MAX + 1];
983
+ U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
984
+ DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val));
985
+ ZSTD_memcpy(&DElt, &val, sizeof(val));
986
+ return DElt;
987
+ }
988
+
989
+ /**
990
+ * Constructs 2 HUF_DEltX2s and packs them into a U64.
991
+ */
992
+ static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level)
993
+ {
994
+ U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
995
+ return (U64)DElt + ((U64)DElt << 32);
996
+ }
535
997
 
536
- /* get pre-calculated rankVal */
537
- ZSTD_memcpy(rankVal, rankValOrigin, sizeof(rankVal));
998
+ /**
999
+ * Fills the DTable rank with all the symbols from [begin, end) that are each
1000
+ * nbBits long.
1001
+ *
1002
+ * @param DTableRank The start of the rank in the DTable.
1003
+ * @param begin The first symbol to fill (inclusive).
1004
+ * @param end The last symbol to fill (exclusive).
1005
+ * @param nbBits Each symbol is nbBits long.
1006
+ * @param tableLog The table log.
1007
+ * @param baseSeq If level == 1 { 0 } else { the first level symbol }
1008
+ * @param level The level in the table. Must be 1 or 2.
1009
+ */
1010
+ static void HUF_fillDTableX2ForWeight(
1011
+ HUF_DEltX2* DTableRank,
1012
+ sortedSymbol_t const* begin, sortedSymbol_t const* end,
1013
+ U32 nbBits, U32 tableLog,
1014
+ U16 baseSeq, int const level)
1015
+ {
1016
+ U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */);
1017
+ const sortedSymbol_t* ptr;
1018
+ assert(level >= 1 && level <= 2);
1019
+ switch (length) {
1020
+ case 1:
1021
+ for (ptr = begin; ptr != end; ++ptr) {
1022
+ HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
1023
+ *DTableRank++ = DElt;
1024
+ }
1025
+ break;
1026
+ case 2:
1027
+ for (ptr = begin; ptr != end; ++ptr) {
1028
+ HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
1029
+ DTableRank[0] = DElt;
1030
+ DTableRank[1] = DElt;
1031
+ DTableRank += 2;
1032
+ }
1033
+ break;
1034
+ case 4:
1035
+ for (ptr = begin; ptr != end; ++ptr) {
1036
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
1037
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
1038
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
1039
+ DTableRank += 4;
1040
+ }
1041
+ break;
1042
+ case 8:
1043
+ for (ptr = begin; ptr != end; ++ptr) {
1044
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
1045
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
1046
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
1047
+ ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
1048
+ ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
1049
+ DTableRank += 8;
1050
+ }
1051
+ break;
1052
+ default:
1053
+ for (ptr = begin; ptr != end; ++ptr) {
1054
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
1055
+ HUF_DEltX2* const DTableRankEnd = DTableRank + length;
1056
+ for (; DTableRank != DTableRankEnd; DTableRank += 8) {
1057
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
1058
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
1059
+ ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
1060
+ ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
1061
+ }
1062
+ }
1063
+ break;
1064
+ }
1065
+ }
538
1066
 
539
- /* fill skipped values */
1067
+ /* HUF_fillDTableX2Level2() :
1068
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
1069
+ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits,
1070
+ const U32* rankVal, const int minWeight, const int maxWeight1,
1071
+ const sortedSymbol_t* sortedSymbols, U32 const* rankStart,
1072
+ U32 nbBitsBaseline, U16 baseSeq)
1073
+ {
1074
+ /* Fill skipped values (all positions up to rankVal[minWeight]).
1075
+ * These are positions only get a single symbol because the combined weight
1076
+ * is too large.
1077
+ */
540
1078
  if (minWeight>1) {
541
- U32 i, skipSize = rankVal[minWeight];
542
- MEM_writeLE16(&(DElt.sequence), baseSeq);
543
- DElt.nbBits = (BYTE)(consumed);
544
- DElt.length = 1;
545
- for (i = 0; i < skipSize; i++)
546
- DTable[i] = DElt;
1079
+ U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */);
1080
+ U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1);
1081
+ int const skipSize = rankVal[minWeight];
1082
+ assert(length > 1);
1083
+ assert((U32)skipSize < length);
1084
+ switch (length) {
1085
+ case 2:
1086
+ assert(skipSize == 1);
1087
+ ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2));
1088
+ break;
1089
+ case 4:
1090
+ assert(skipSize <= 4);
1091
+ ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2));
1092
+ ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2));
1093
+ break;
1094
+ default:
1095
+ {
1096
+ int i;
1097
+ for (i = 0; i < skipSize; i += 8) {
1098
+ ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2));
1099
+ ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2));
1100
+ ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2));
1101
+ ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2));
1102
+ }
1103
+ }
1104
+ }
547
1105
  }
548
1106
 
549
- /* fill DTable */
550
- { U32 s; for (s=0; s<sortedListSize; s++) { /* note : sortedSymbols already skipped */
551
- const U32 symbol = sortedSymbols[s].symbol;
552
- const U32 weight = sortedSymbols[s].weight;
553
- const U32 nbBits = nbBitsBaseline - weight;
554
- const U32 length = 1 << (sizeLog-nbBits);
555
- const U32 start = rankVal[weight];
556
- U32 i = start;
557
- const U32 end = start + length;
558
-
559
- MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
560
- DElt.nbBits = (BYTE)(nbBits + consumed);
561
- DElt.length = 2;
562
- do { DTable[i++] = DElt; } while (i<end); /* since length >= 1 */
563
-
564
- rankVal[weight] += length;
565
- } }
1107
+ /* Fill each of the second level symbols by weight. */
1108
+ {
1109
+ int w;
1110
+ for (w = minWeight; w < maxWeight1; ++w) {
1111
+ int const begin = rankStart[w];
1112
+ int const end = rankStart[w+1];
1113
+ U32 const nbBits = nbBitsBaseline - w;
1114
+ U32 const totalBits = nbBits + consumedBits;
1115
+ HUF_fillDTableX2ForWeight(
1116
+ DTable + rankVal[w],
1117
+ sortedSymbols + begin, sortedSymbols + end,
1118
+ totalBits, targetLog,
1119
+ baseSeq, /* level */ 2);
1120
+ }
1121
+ }
566
1122
  }
567
1123
 
568
-
569
1124
  static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
570
- const sortedSymbol_t* sortedList, const U32 sortedListSize,
571
- const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
1125
+ const sortedSymbol_t* sortedList,
1126
+ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
572
1127
  const U32 nbBitsBaseline)
573
1128
  {
574
- U32 rankVal[HUF_TABLELOG_MAX + 1];
1129
+ U32* const rankVal = rankValOrigin[0];
575
1130
  const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */
576
1131
  const U32 minBits = nbBitsBaseline - maxWeight;
577
- U32 s;
578
-
579
- ZSTD_memcpy(rankVal, rankValOrigin, sizeof(rankVal));
580
-
581
- /* fill DTable */
582
- for (s=0; s<sortedListSize; s++) {
583
- const U16 symbol = sortedList[s].symbol;
584
- const U32 weight = sortedList[s].weight;
585
- const U32 nbBits = nbBitsBaseline - weight;
586
- const U32 start = rankVal[weight];
587
- const U32 length = 1 << (targetLog-nbBits);
588
-
589
- if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */
590
- U32 sortedRank;
1132
+ int w;
1133
+ int const wEnd = (int)maxWeight + 1;
1134
+
1135
+ /* Fill DTable in order of weight. */
1136
+ for (w = 1; w < wEnd; ++w) {
1137
+ int const begin = (int)rankStart[w];
1138
+ int const end = (int)rankStart[w+1];
1139
+ U32 const nbBits = nbBitsBaseline - w;
1140
+
1141
+ if (targetLog-nbBits >= minBits) {
1142
+ /* Enough room for a second symbol. */
1143
+ int start = rankVal[w];
1144
+ U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */);
591
1145
  int minWeight = nbBits + scaleLog;
1146
+ int s;
592
1147
  if (minWeight < 1) minWeight = 1;
593
- sortedRank = rankStart[minWeight];
594
- HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits,
595
- rankValOrigin[nbBits], minWeight,
596
- sortedList+sortedRank, sortedListSize-sortedRank,
597
- nbBitsBaseline, symbol);
1148
+ /* Fill the DTable for every symbol of weight w.
1149
+ * These symbols get at least 1 second symbol.
1150
+ */
1151
+ for (s = begin; s != end; ++s) {
1152
+ HUF_fillDTableX2Level2(
1153
+ DTable + start, targetLog, nbBits,
1154
+ rankValOrigin[nbBits], minWeight, wEnd,
1155
+ sortedList, rankStart,
1156
+ nbBitsBaseline, sortedList[s].symbol);
1157
+ start += length;
1158
+ }
598
1159
  } else {
599
- HUF_DEltX2 DElt;
600
- MEM_writeLE16(&(DElt.sequence), symbol);
601
- DElt.nbBits = (BYTE)(nbBits);
602
- DElt.length = 1;
603
- { U32 const end = start + length;
604
- U32 u;
605
- for (u = start; u < end; u++) DTable[u] = DElt;
606
- } }
607
- rankVal[weight] += length;
1160
+ /* Only a single symbol. */
1161
+ HUF_fillDTableX2ForWeight(
1162
+ DTable + rankVal[w],
1163
+ sortedList + begin, sortedList + end,
1164
+ nbBits, targetLog,
1165
+ /* baseSeq */ 0, /* level */ 1);
1166
+ }
608
1167
  }
609
1168
  }
610
1169
 
1170
+ typedef struct {
1171
+ rankValCol_t rankVal[HUF_TABLELOG_MAX];
1172
+ U32 rankStats[HUF_TABLELOG_MAX + 1];
1173
+ U32 rankStart0[HUF_TABLELOG_MAX + 3];
1174
+ sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
1175
+ BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
1176
+ U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
1177
+ } HUF_ReadDTableX2_Workspace;
1178
+
611
1179
  size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
612
1180
  const void* src, size_t srcSize,
613
- void* workSpace, size_t wkspSize)
1181
+ void* workSpace, size_t wkspSize, int flags)
614
1182
  {
615
- U32 tableLog, maxW, sizeOfSort, nbSymbols;
1183
+ U32 tableLog, maxW, nbSymbols;
616
1184
  DTableDesc dtd = HUF_getDTableDesc(DTable);
617
- U32 const maxTableLog = dtd.maxTableLog;
1185
+ U32 maxTableLog = dtd.maxTableLog;
618
1186
  size_t iSize;
619
1187
  void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */
620
1188
  HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
621
1189
  U32 *rankStart;
622
1190
 
623
- rankValCol_t* rankVal;
624
- U32* rankStats;
625
- U32* rankStart0;
626
- sortedSymbol_t* sortedSymbol;
627
- BYTE* weightList;
628
- size_t spaceUsed32 = 0;
629
-
630
- rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32);
631
- spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
632
- rankStats = (U32 *)workSpace + spaceUsed32;
633
- spaceUsed32 += HUF_TABLELOG_MAX + 1;
634
- rankStart0 = (U32 *)workSpace + spaceUsed32;
635
- spaceUsed32 += HUF_TABLELOG_MAX + 2;
636
- sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t);
637
- spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
638
- weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
639
- spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
640
-
641
- if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
642
-
643
- rankStart = rankStart0 + 1;
644
- ZSTD_memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
1191
+ HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace;
1192
+
1193
+ if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC);
1194
+
1195
+ rankStart = wksp->rankStart0 + 1;
1196
+ ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats));
1197
+ ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0));
645
1198
 
646
1199
  DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */
647
1200
  if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
648
1201
  /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
649
1202
 
650
- iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
1203
+ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
651
1204
  if (HUF_isError(iSize)) return iSize;
652
1205
 
653
1206
  /* check result */
654
1207
  if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */
1208
+ if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG;
655
1209
 
656
1210
  /* find maxWeight */
657
- for (maxW = tableLog; rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
1211
+ for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
658
1212
 
659
1213
  /* Get start index of each weight */
660
1214
  { U32 w, nextRankStart = 0;
661
1215
  for (w=1; w<maxW+1; w++) {
662
1216
  U32 curr = nextRankStart;
663
- nextRankStart += rankStats[w];
1217
+ nextRankStart += wksp->rankStats[w];
664
1218
  rankStart[w] = curr;
665
1219
  }
666
1220
  rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/
667
- sizeOfSort = nextRankStart;
1221
+ rankStart[maxW+1] = nextRankStart;
668
1222
  }
669
1223
 
670
1224
  /* sort symbols by weight */
671
1225
  { U32 s;
672
1226
  for (s=0; s<nbSymbols; s++) {
673
- U32 const w = weightList[s];
1227
+ U32 const w = wksp->weightList[s];
674
1228
  U32 const r = rankStart[w]++;
675
- sortedSymbol[r].symbol = (BYTE)s;
676
- sortedSymbol[r].weight = (BYTE)w;
1229
+ wksp->sortedSymbol[r].symbol = (BYTE)s;
677
1230
  }
678
1231
  rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */
679
1232
  }
680
1233
 
681
1234
  /* Build rankVal */
682
- { U32* const rankVal0 = rankVal[0];
1235
+ { U32* const rankVal0 = wksp->rankVal[0];
683
1236
  { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */
684
1237
  U32 nextRankVal = 0;
685
1238
  U32 w;
686
1239
  for (w=1; w<maxW+1; w++) {
687
1240
  U32 curr = nextRankVal;
688
- nextRankVal += rankStats[w] << (w+rescale);
1241
+ nextRankVal += wksp->rankStats[w] << (w+rescale);
689
1242
  rankVal0[w] = curr;
690
1243
  } }
691
1244
  { U32 const minBits = tableLog+1 - maxW;
692
1245
  U32 consumed;
693
1246
  for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
694
- U32* const rankValPtr = rankVal[consumed];
1247
+ U32* const rankValPtr = wksp->rankVal[consumed];
695
1248
  U32 w;
696
1249
  for (w = 1; w < maxW+1; w++) {
697
1250
  rankValPtr[w] = rankVal0[w] >> consumed;
698
1251
  } } } }
699
1252
 
700
1253
  HUF_fillDTableX2(dt, maxTableLog,
701
- sortedSymbol, sizeOfSort,
702
- rankStart0, rankVal, maxW,
1254
+ wksp->sortedSymbol,
1255
+ wksp->rankStart0, wksp->rankVal, maxW,
703
1256
  tableLog+1);
704
1257
 
705
1258
  dtd.tableLog = (BYTE)maxTableLog;
@@ -713,7 +1266,7 @@ FORCE_INLINE_TEMPLATE U32
713
1266
  HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
714
1267
  {
715
1268
  size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
716
- ZSTD_memcpy(op, dt+val, 2);
1269
+ ZSTD_memcpy(op, &dt[val].sequence, 2);
717
1270
  BIT_skipBits(DStream, dt[val].nbBits);
718
1271
  return dt[val].length;
719
1272
  }
@@ -722,28 +1275,34 @@ FORCE_INLINE_TEMPLATE U32
722
1275
  HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
723
1276
  {
724
1277
  size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
725
- ZSTD_memcpy(op, dt+val, 1);
726
- if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
727
- else {
1278
+ ZSTD_memcpy(op, &dt[val].sequence, 1);
1279
+ if (dt[val].length==1) {
1280
+ BIT_skipBits(DStream, dt[val].nbBits);
1281
+ } else {
728
1282
  if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
729
1283
  BIT_skipBits(DStream, dt[val].nbBits);
730
1284
  if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
731
1285
  /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
732
1286
  DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
733
- } }
1287
+ }
1288
+ }
734
1289
  return 1;
735
1290
  }
736
1291
 
737
1292
  #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
738
- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
1293
+ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0)
739
1294
 
740
- #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
741
- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
742
- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
1295
+ #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
1296
+ do { \
1297
+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
1298
+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
1299
+ } while (0)
743
1300
 
744
- #define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
745
- if (MEM_64bits()) \
746
- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
1301
+ #define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
1302
+ do { \
1303
+ if (MEM_64bits()) \
1304
+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
1305
+ } while (0)
747
1306
 
748
1307
  HINT_INLINE size_t
749
1308
  HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
@@ -752,19 +1311,37 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
752
1311
  BYTE* const pStart = p;
753
1312
 
754
1313
  /* up to 8 symbols at a time */
755
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
756
- HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
757
- HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
758
- HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
759
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1314
+ if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) {
1315
+ if (dtLog <= 11 && MEM_64bits()) {
1316
+ /* up to 10 symbols at a time */
1317
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) {
1318
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1319
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1320
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1321
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1322
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1323
+ }
1324
+ } else {
1325
+ /* up to 8 symbols at a time */
1326
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
1327
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
1328
+ HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
1329
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
1330
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1331
+ }
1332
+ }
1333
+ } else {
1334
+ BIT_reloadDStream(bitDPtr);
760
1335
  }
761
1336
 
762
1337
  /* closer to end : up to 2 symbols at a time */
763
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
764
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1338
+ if ((size_t)(pEnd - p) >= 2) {
1339
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
1340
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
765
1341
 
766
- while (p <= pEnd-2)
767
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
1342
+ while (p <= pEnd-2)
1343
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
1344
+ }
768
1345
 
769
1346
  if (p < pEnd)
770
1347
  p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
@@ -785,7 +1362,7 @@ HUF_decompress1X2_usingDTable_internal_body(
785
1362
 
786
1363
  /* decode */
787
1364
  { BYTE* const ostart = (BYTE*) dst;
788
- BYTE* const oend = ostart + dstSize;
1365
+ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize);
789
1366
  const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */
790
1367
  const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
791
1368
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
@@ -799,6 +1376,10 @@ HUF_decompress1X2_usingDTable_internal_body(
799
1376
  return dstSize;
800
1377
  }
801
1378
 
1379
+ /* HUF_decompress4X2_usingDTable_internal_body():
1380
+ * Conditions:
1381
+ * @dstSize >= 6
1382
+ */
802
1383
  FORCE_INLINE_TEMPLATE size_t
803
1384
  HUF_decompress4X2_usingDTable_internal_body(
804
1385
  void* dst, size_t dstSize,
@@ -806,6 +1387,7 @@ HUF_decompress4X2_usingDTable_internal_body(
806
1387
  const HUF_DTable* DTable)
807
1388
  {
808
1389
  if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
1390
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
809
1391
 
810
1392
  { const BYTE* const istart = (const BYTE*) cSrc;
811
1393
  BYTE* const ostart = (BYTE*) dst;
@@ -839,58 +1421,62 @@ HUF_decompress4X2_usingDTable_internal_body(
839
1421
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
840
1422
  U32 const dtLog = dtd.tableLog;
841
1423
 
842
- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1424
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1425
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
1426
+ assert(dstSize >= 6 /* validated above */);
843
1427
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
844
1428
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
845
1429
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
846
1430
  CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
847
1431
 
848
1432
  /* 16-32 symbols per loop (4-8 symbols per stream) */
849
- for ( ; (endSignal) & (op4 < olimit); ) {
1433
+ if ((size_t)(oend - op4) >= sizeof(size_t)) {
1434
+ for ( ; (endSignal) & (op4 < olimit); ) {
850
1435
  #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
851
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
852
- HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
853
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
854
- HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
855
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
856
- HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
857
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
858
- HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
859
- endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
860
- endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
861
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
862
- HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
863
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
864
- HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
865
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
866
- HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
867
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
868
- HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
869
- endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
870
- endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
1436
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1437
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
1438
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1439
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
1440
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1441
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
1442
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1443
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
1444
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
1445
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
1446
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1447
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
1448
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1449
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
1450
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1451
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
1452
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1453
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
1454
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
1455
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
871
1456
  #else
872
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
873
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
874
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
875
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
876
- HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
877
- HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
878
- HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
879
- HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
880
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
881
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
882
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
883
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
884
- HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
885
- HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
886
- HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
887
- HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
888
- endSignal = (U32)LIKELY(
889
- (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
890
- & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
891
- & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
892
- & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
1457
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1458
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1459
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1460
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1461
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
1462
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
1463
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
1464
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
1465
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1466
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1467
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1468
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1469
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
1470
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
1471
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
1472
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
1473
+ endSignal = (U32)LIKELY((U32)
1474
+ (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
1475
+ & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
1476
+ & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
1477
+ & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
893
1478
  #endif
1479
+ }
894
1480
  }
895
1481
 
896
1482
  /* check corruption */
@@ -914,68 +1500,287 @@ HUF_decompress4X2_usingDTable_internal_body(
914
1500
  }
915
1501
  }
916
1502
 
917
- HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
918
- HUF_DGEN(HUF_decompress4X2_usingDTable_internal)
1503
+ #if HUF_NEED_BMI2_FUNCTION
1504
+ static BMI2_TARGET_ATTRIBUTE
1505
+ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
1506
+ size_t cSrcSize, HUF_DTable const* DTable) {
1507
+ return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
1508
+ }
1509
+ #endif
1510
+
1511
+ static
1512
+ size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
1513
+ size_t cSrcSize, HUF_DTable const* DTable) {
1514
+ return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
1515
+ }
1516
+
1517
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
1518
+
1519
+ HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
1520
+
1521
+ #endif
1522
+
1523
+ static HUF_FAST_BMI2_ATTRS
1524
+ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
1525
+ {
1526
+ U64 bits[4];
1527
+ BYTE const* ip[4];
1528
+ BYTE* op[4];
1529
+ BYTE* oend[4];
1530
+ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
1531
+ BYTE const* const ilowest = args->ilowest;
1532
+
1533
+ /* Copy the arguments to local registers. */
1534
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
1535
+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
1536
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
1537
+
1538
+ oend[0] = op[1];
1539
+ oend[1] = op[2];
1540
+ oend[2] = op[3];
1541
+ oend[3] = args->oend;
1542
+
1543
+ assert(MEM_isLittleEndian());
1544
+ assert(!MEM_32bits());
1545
+
1546
+ for (;;) {
1547
+ BYTE* olimit;
1548
+ int stream;
1549
+
1550
+ /* Assert loop preconditions */
1551
+ #ifndef NDEBUG
1552
+ for (stream = 0; stream < 4; ++stream) {
1553
+ assert(op[stream] <= oend[stream]);
1554
+ assert(ip[stream] >= ilowest);
1555
+ }
1556
+ #endif
1557
+ /* Compute olimit */
1558
+ {
1559
+ /* Each loop does 5 table lookups for each of the 4 streams.
1560
+ * Each table lookup consumes up to 11 bits of input, and produces
1561
+ * up to 2 bytes of output.
1562
+ */
1563
+ /* We can consume up to 7 bytes of input per iteration per stream.
1564
+ * We also know that each input pointer is >= ip[0]. So we can run
1565
+ * iters loops before running out of input.
1566
+ */
1567
+ size_t iters = (size_t)(ip[0] - ilowest) / 7;
1568
+ /* Each iteration can produce up to 10 bytes of output per stream.
1569
+ * Each output stream my advance at different rates. So take the
1570
+ * minimum number of safe iterations among all the output streams.
1571
+ */
1572
+ for (stream = 0; stream < 4; ++stream) {
1573
+ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
1574
+ iters = MIN(iters, oiters);
1575
+ }
1576
+
1577
+ /* Each iteration produces at least 5 output symbols. So until
1578
+ * op[3] crosses olimit, we know we haven't executed iters
1579
+ * iterations yet. This saves us maintaining an iters counter,
1580
+ * at the expense of computing the remaining # of iterations
1581
+ * more frequently.
1582
+ */
1583
+ olimit = op[3] + (iters * 5);
1584
+
1585
+ /* Exit the fast decoding loop once we reach the end. */
1586
+ if (op[3] == olimit)
1587
+ break;
1588
+
1589
+ /* Exit the decoding loop if any input pointer has crossed the
1590
+ * previous one. This indicates corruption, and a precondition
1591
+ * to our loop is that ip[i] >= ip[0].
1592
+ */
1593
+ for (stream = 1; stream < 4; ++stream) {
1594
+ if (ip[stream] < ip[stream - 1])
1595
+ goto _out;
1596
+ }
1597
+ }
1598
+
1599
+ #ifndef NDEBUG
1600
+ for (stream = 1; stream < 4; ++stream) {
1601
+ assert(ip[stream] >= ip[stream - 1]);
1602
+ }
1603
+ #endif
1604
+
1605
+ #define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \
1606
+ do { \
1607
+ if ((_decode3) || (_stream) != 3) { \
1608
+ int const index = (int)(bits[(_stream)] >> 53); \
1609
+ HUF_DEltX2 const entry = dtable[index]; \
1610
+ MEM_write16(op[(_stream)], entry.sequence); \
1611
+ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \
1612
+ op[(_stream)] += (entry.length); \
1613
+ } \
1614
+ } while (0)
1615
+
1616
+ #define HUF_4X2_RELOAD_STREAM(_stream) \
1617
+ do { \
1618
+ HUF_4X2_DECODE_SYMBOL(3, 1); \
1619
+ { \
1620
+ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
1621
+ int const nbBits = ctz & 7; \
1622
+ int const nbBytes = ctz >> 3; \
1623
+ ip[(_stream)] -= nbBytes; \
1624
+ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
1625
+ bits[(_stream)] <<= nbBits; \
1626
+ } \
1627
+ } while (0)
1628
+
1629
+ /* Manually unroll the loop because compilers don't consistently
1630
+ * unroll the inner loops, which destroys performance.
1631
+ */
1632
+ do {
1633
+ /* Decode 5 symbols from each of the first 3 streams.
1634
+ * The final stream will be decoded during the reload phase
1635
+ * to reduce register pressure.
1636
+ */
1637
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1638
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1639
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1640
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1641
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1642
+
1643
+ /* Decode one symbol from the final stream */
1644
+ HUF_4X2_DECODE_SYMBOL(3, 1);
1645
+
1646
+ /* Decode 4 symbols from the final stream & reload bitstreams.
1647
+ * The final stream is reloaded last, meaning that all 5 symbols
1648
+ * are decoded from the final stream before it is reloaded.
1649
+ */
1650
+ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
1651
+ } while (op[3] < olimit);
1652
+ }
919
1653
 
920
- size_t HUF_decompress1X2_usingDTable(
1654
+ #undef HUF_4X2_DECODE_SYMBOL
1655
+ #undef HUF_4X2_RELOAD_STREAM
1656
+
1657
+ _out:
1658
+
1659
+ /* Save the final values of each of the state variables back to args. */
1660
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
1661
+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
1662
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
1663
+ }
1664
+
1665
+
1666
+ static HUF_FAST_BMI2_ATTRS size_t
1667
+ HUF_decompress4X2_usingDTable_internal_fast(
921
1668
  void* dst, size_t dstSize,
922
1669
  const void* cSrc, size_t cSrcSize,
923
- const HUF_DTable* DTable)
1670
+ const HUF_DTable* DTable,
1671
+ HUF_DecompressFastLoopFn loopFn) {
1672
+ void const* dt = DTable + 1;
1673
+ const BYTE* const ilowest = (const BYTE*)cSrc;
1674
+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
1675
+ HUF_DecompressFastArgs args;
1676
+ {
1677
+ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
1678
+ FORWARD_IF_ERROR(ret, "Failed to init asm args");
1679
+ if (ret == 0)
1680
+ return 0;
1681
+ }
1682
+
1683
+ assert(args.ip[0] >= args.ilowest);
1684
+ loopFn(&args);
1685
+
1686
+ /* note : op4 already verified within main loop */
1687
+ assert(args.ip[0] >= ilowest);
1688
+ assert(args.ip[1] >= ilowest);
1689
+ assert(args.ip[2] >= ilowest);
1690
+ assert(args.ip[3] >= ilowest);
1691
+ assert(args.op[3] <= oend);
1692
+
1693
+ assert(ilowest == args.ilowest);
1694
+ assert(ilowest + 6 == args.iend[0]);
1695
+ (void)ilowest;
1696
+
1697
+ /* finish bitStreams one by one */
1698
+ {
1699
+ size_t const segmentSize = (dstSize+3) / 4;
1700
+ BYTE* segmentEnd = (BYTE*)dst;
1701
+ int i;
1702
+ for (i = 0; i < 4; ++i) {
1703
+ BIT_DStream_t bit;
1704
+ if (segmentSize <= (size_t)(oend - segmentEnd))
1705
+ segmentEnd += segmentSize;
1706
+ else
1707
+ segmentEnd = oend;
1708
+ FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
1709
+ args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG);
1710
+ if (args.op[i] != segmentEnd)
1711
+ return ERROR(corruption_detected);
1712
+ }
1713
+ }
1714
+
1715
+ /* decoded size */
1716
+ return dstSize;
1717
+ }
1718
+
1719
+ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
1720
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
924
1721
  {
925
- DTableDesc dtd = HUF_getDTableDesc(DTable);
926
- if (dtd.tableType != 1) return ERROR(GENERIC);
927
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1722
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
1723
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
1724
+
1725
+ #if DYNAMIC_BMI2
1726
+ if (flags & HUF_flags_bmi2) {
1727
+ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
1728
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
1729
+ if (!(flags & HUF_flags_disableAsm)) {
1730
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
1731
+ }
1732
+ # endif
1733
+ } else {
1734
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
1735
+ }
1736
+ #endif
1737
+
1738
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
1739
+ if (!(flags & HUF_flags_disableAsm)) {
1740
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
1741
+ }
1742
+ #endif
1743
+
1744
+ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
1745
+ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
1746
+ if (ret != 0)
1747
+ return ret;
1748
+ }
1749
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
928
1750
  }
929
1751
 
1752
+ HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
1753
+
930
1754
  size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
931
1755
  const void* cSrc, size_t cSrcSize,
932
- void* workSpace, size_t wkspSize)
1756
+ void* workSpace, size_t wkspSize, int flags)
933
1757
  {
934
1758
  const BYTE* ip = (const BYTE*) cSrc;
935
1759
 
936
1760
  size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
937
- workSpace, wkspSize);
1761
+ workSpace, wkspSize, flags);
938
1762
  if (HUF_isError(hSize)) return hSize;
939
1763
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
940
1764
  ip += hSize; cSrcSize -= hSize;
941
1765
 
942
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
1766
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
943
1767
  }
944
1768
 
945
-
946
- size_t HUF_decompress4X2_usingDTable(
947
- void* dst, size_t dstSize,
948
- const void* cSrc, size_t cSrcSize,
949
- const HUF_DTable* DTable)
950
- {
951
- DTableDesc dtd = HUF_getDTableDesc(DTable);
952
- if (dtd.tableType != 1) return ERROR(GENERIC);
953
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
954
- }
955
-
956
- static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
1769
+ static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
957
1770
  const void* cSrc, size_t cSrcSize,
958
- void* workSpace, size_t wkspSize, int bmi2)
1771
+ void* workSpace, size_t wkspSize, int flags)
959
1772
  {
960
1773
  const BYTE* ip = (const BYTE*) cSrc;
961
1774
 
962
1775
  size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
963
- workSpace, wkspSize);
1776
+ workSpace, wkspSize, flags);
964
1777
  if (HUF_isError(hSize)) return hSize;
965
1778
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
966
1779
  ip += hSize; cSrcSize -= hSize;
967
1780
 
968
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
969
- }
970
-
971
- size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
972
- const void* cSrc, size_t cSrcSize,
973
- void* workSpace, size_t wkspSize)
974
- {
975
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
1781
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
976
1782
  }
977
1783
 
978
-
979
1784
  #endif /* HUF_FORCE_DECOMPRESS_X1 */
980
1785
 
981
1786
 
@@ -983,66 +1788,28 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
983
1788
  /* Universal decompression selectors */
984
1789
  /* ***********************************/
985
1790
 
986
- size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
987
- const void* cSrc, size_t cSrcSize,
988
- const HUF_DTable* DTable)
989
- {
990
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
991
- #if defined(HUF_FORCE_DECOMPRESS_X1)
992
- (void)dtd;
993
- assert(dtd.tableType == 0);
994
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
995
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
996
- (void)dtd;
997
- assert(dtd.tableType == 1);
998
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
999
- #else
1000
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
1001
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1002
- #endif
1003
- }
1004
-
1005
- size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
1006
- const void* cSrc, size_t cSrcSize,
1007
- const HUF_DTable* DTable)
1008
- {
1009
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
1010
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1011
- (void)dtd;
1012
- assert(dtd.tableType == 0);
1013
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1014
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1015
- (void)dtd;
1016
- assert(dtd.tableType == 1);
1017
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1018
- #else
1019
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
1020
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1021
- #endif
1022
- }
1023
-
1024
1791
 
1025
1792
  #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1026
1793
  typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
1027
- static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
1794
+ static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] =
1028
1795
  {
1029
1796
  /* single, double, quad */
1030
- {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */
1031
- {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */
1032
- {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */
1033
- {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */
1034
- {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */
1035
- {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */
1036
- {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */
1037
- {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */
1038
- {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */
1039
- {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */
1040
- {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */
1041
- {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */
1042
- {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */
1043
- {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */
1044
- {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */
1045
- {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */
1797
+ {{0,0}, {1,1}}, /* Q==0 : impossible */
1798
+ {{0,0}, {1,1}}, /* Q==1 : impossible */
1799
+ {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */
1800
+ {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */
1801
+ {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */
1802
+ {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */
1803
+ {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */
1804
+ {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */
1805
+ {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */
1806
+ {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */
1807
+ {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */
1808
+ {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */
1809
+ {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */
1810
+ {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */
1811
+ {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */
1812
+ {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */
1046
1813
  };
1047
1814
  #endif
1048
1815
 
@@ -1069,42 +1836,15 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
1069
1836
  U32 const D256 = (U32)(dstSize >> 8);
1070
1837
  U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
1071
1838
  U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
1072
- DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */
1839
+ DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */
1073
1840
  return DTime1 < DTime0;
1074
1841
  }
1075
1842
  #endif
1076
1843
  }
1077
1844
 
1078
-
1079
- size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
1080
- size_t dstSize, const void* cSrc,
1081
- size_t cSrcSize, void* workSpace,
1082
- size_t wkspSize)
1083
- {
1084
- /* validation checks */
1085
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1086
- if (cSrcSize == 0) return ERROR(corruption_detected);
1087
-
1088
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1089
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1090
- (void)algoNb;
1091
- assert(algoNb == 0);
1092
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1093
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1094
- (void)algoNb;
1095
- assert(algoNb == 1);
1096
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1097
- #else
1098
- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1099
- cSrcSize, workSpace, wkspSize):
1100
- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1101
- #endif
1102
- }
1103
- }
1104
-
1105
1845
  size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1106
1846
  const void* cSrc, size_t cSrcSize,
1107
- void* workSpace, size_t wkspSize)
1847
+ void* workSpace, size_t wkspSize, int flags)
1108
1848
  {
1109
1849
  /* validation checks */
1110
1850
  if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1117,71 +1857,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1117
1857
  (void)algoNb;
1118
1858
  assert(algoNb == 0);
1119
1859
  return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
1120
- cSrcSize, workSpace, wkspSize);
1860
+ cSrcSize, workSpace, wkspSize, flags);
1121
1861
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1122
1862
  (void)algoNb;
1123
1863
  assert(algoNb == 1);
1124
1864
  return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1125
- cSrcSize, workSpace, wkspSize);
1865
+ cSrcSize, workSpace, wkspSize, flags);
1126
1866
  #else
1127
1867
  return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1128
- cSrcSize, workSpace, wkspSize):
1868
+ cSrcSize, workSpace, wkspSize, flags):
1129
1869
  HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
1130
- cSrcSize, workSpace, wkspSize);
1870
+ cSrcSize, workSpace, wkspSize, flags);
1131
1871
  #endif
1132
1872
  }
1133
1873
  }
1134
1874
 
1135
1875
 
1136
- size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1876
+ size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
1137
1877
  {
1138
1878
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1139
1879
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1140
1880
  (void)dtd;
1141
1881
  assert(dtd.tableType == 0);
1142
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1882
+ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1143
1883
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1144
1884
  (void)dtd;
1145
1885
  assert(dtd.tableType == 1);
1146
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1886
+ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1147
1887
  #else
1148
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1149
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1888
+ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
1889
+ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1150
1890
  #endif
1151
1891
  }
1152
1892
 
1153
1893
  #ifndef HUF_FORCE_DECOMPRESS_X2
1154
- size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1894
+ size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
1155
1895
  {
1156
1896
  const BYTE* ip = (const BYTE*) cSrc;
1157
1897
 
1158
- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1898
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
1159
1899
  if (HUF_isError(hSize)) return hSize;
1160
1900
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1161
1901
  ip += hSize; cSrcSize -= hSize;
1162
1902
 
1163
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
1903
+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
1164
1904
  }
1165
1905
  #endif
1166
1906
 
1167
- size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1907
+ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
1168
1908
  {
1169
1909
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1170
1910
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1171
1911
  (void)dtd;
1172
1912
  assert(dtd.tableType == 0);
1173
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1913
+ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1174
1914
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1175
1915
  (void)dtd;
1176
1916
  assert(dtd.tableType == 1);
1177
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1917
+ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1178
1918
  #else
1179
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1180
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1919
+ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
1920
+ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1181
1921
  #endif
1182
1922
  }
1183
1923
 
1184
- size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1924
+ size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
1185
1925
  {
1186
1926
  /* validation checks */
1187
1927
  if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1191,160 +1931,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
1191
1931
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1192
1932
  (void)algoNb;
1193
1933
  assert(algoNb == 0);
1194
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1195
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1196
- (void)algoNb;
1197
- assert(algoNb == 1);
1198
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1199
- #else
1200
- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
1201
- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1202
- #endif
1203
- }
1204
- }
1205
-
1206
- #ifndef ZSTD_NO_UNUSED_FUNCTIONS
1207
- #ifndef HUF_FORCE_DECOMPRESS_X2
1208
- size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize)
1209
- {
1210
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1211
- return HUF_readDTableX1_wksp(DTable, src, srcSize,
1212
- workSpace, sizeof(workSpace));
1213
- }
1214
-
1215
- size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
1216
- const void* cSrc, size_t cSrcSize)
1217
- {
1218
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1219
- return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
1220
- workSpace, sizeof(workSpace));
1221
- }
1222
-
1223
- size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1224
- {
1225
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
1226
- return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
1227
- }
1228
- #endif
1229
-
1230
- #ifndef HUF_FORCE_DECOMPRESS_X1
1231
- size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
1232
- {
1233
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1234
- return HUF_readDTableX2_wksp(DTable, src, srcSize,
1235
- workSpace, sizeof(workSpace));
1236
- }
1237
-
1238
- size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
1239
- const void* cSrc, size_t cSrcSize)
1240
- {
1241
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1242
- return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
1243
- workSpace, sizeof(workSpace));
1244
- }
1245
-
1246
- size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1247
- {
1248
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
1249
- return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1250
- }
1251
- #endif
1252
-
1253
- #ifndef HUF_FORCE_DECOMPRESS_X2
1254
- size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1255
- {
1256
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1257
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1258
- workSpace, sizeof(workSpace));
1259
- }
1260
- size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1261
- {
1262
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
1263
- return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1264
- }
1265
- #endif
1266
-
1267
- #ifndef HUF_FORCE_DECOMPRESS_X1
1268
- size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
1269
- const void* cSrc, size_t cSrcSize)
1270
- {
1271
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1272
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1273
- workSpace, sizeof(workSpace));
1274
- }
1275
-
1276
- size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1277
- {
1278
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
1279
- return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1280
- }
1281
- #endif
1282
-
1283
- typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
1284
-
1285
- size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1286
- {
1287
- #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1288
- static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 };
1289
- #endif
1290
-
1291
- /* validation checks */
1292
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1293
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1294
- if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1295
- if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1296
-
1297
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1298
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1299
- (void)algoNb;
1300
- assert(algoNb == 0);
1301
- return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize);
1302
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1303
- (void)algoNb;
1304
- assert(algoNb == 1);
1305
- return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);
1306
- #else
1307
- return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
1308
- #endif
1309
- }
1310
- }
1311
-
1312
- size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1313
- {
1314
- /* validation checks */
1315
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1316
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1317
- if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1318
- if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1319
-
1320
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1321
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1322
- (void)algoNb;
1323
- assert(algoNb == 0);
1324
- return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1934
+ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1325
1935
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1326
1936
  (void)algoNb;
1327
1937
  assert(algoNb == 1);
1328
- return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1938
+ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1329
1939
  #else
1330
- return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
1331
- HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
1940
+ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
1941
+ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1332
1942
  #endif
1333
1943
  }
1334
1944
  }
1335
-
1336
- size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1337
- {
1338
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1339
- return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1340
- workSpace, sizeof(workSpace));
1341
- }
1342
-
1343
- size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
1344
- const void* cSrc, size_t cSrcSize)
1345
- {
1346
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1347
- return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1348
- workSpace, sizeof(workSpace));
1349
- }
1350
- #endif