zstd-ruby 1.5.2.3 → 1.5.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +3 -3
  3. data/ext/zstdruby/libzstd/common/bits.h +175 -0
  4. data/ext/zstdruby/libzstd/common/bitstream.h +18 -59
  5. data/ext/zstdruby/libzstd/common/compiler.h +22 -3
  6. data/ext/zstdruby/libzstd/common/cpu.h +1 -1
  7. data/ext/zstdruby/libzstd/common/debug.c +1 -1
  8. data/ext/zstdruby/libzstd/common/debug.h +1 -1
  9. data/ext/zstdruby/libzstd/common/entropy_common.c +12 -40
  10. data/ext/zstdruby/libzstd/common/error_private.c +9 -2
  11. data/ext/zstdruby/libzstd/common/error_private.h +1 -1
  12. data/ext/zstdruby/libzstd/common/fse.h +5 -83
  13. data/ext/zstdruby/libzstd/common/fse_decompress.c +7 -99
  14. data/ext/zstdruby/libzstd/common/huf.h +65 -156
  15. data/ext/zstdruby/libzstd/common/mem.h +39 -46
  16. data/ext/zstdruby/libzstd/common/pool.c +26 -10
  17. data/ext/zstdruby/libzstd/common/pool.h +7 -1
  18. data/ext/zstdruby/libzstd/common/portability_macros.h +22 -3
  19. data/ext/zstdruby/libzstd/common/threading.c +68 -14
  20. data/ext/zstdruby/libzstd/common/threading.h +5 -10
  21. data/ext/zstdruby/libzstd/common/xxhash.c +2 -2
  22. data/ext/zstdruby/libzstd/common/xxhash.h +8 -8
  23. data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
  24. data/ext/zstdruby/libzstd/common/zstd_deps.h +1 -1
  25. data/ext/zstdruby/libzstd/common/zstd_internal.h +17 -113
  26. data/ext/zstdruby/libzstd/common/zstd_trace.h +3 -3
  27. data/ext/zstdruby/libzstd/compress/clevels.h +1 -1
  28. data/ext/zstdruby/libzstd/compress/fse_compress.c +7 -124
  29. data/ext/zstdruby/libzstd/compress/hist.c +1 -1
  30. data/ext/zstdruby/libzstd/compress/hist.h +1 -1
  31. data/ext/zstdruby/libzstd/compress/huf_compress.c +234 -169
  32. data/ext/zstdruby/libzstd/compress/zstd_compress.c +1055 -455
  33. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +165 -145
  34. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +115 -39
  35. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +16 -8
  36. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +3 -3
  37. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +1 -1
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +25 -21
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +1 -1
  40. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +5 -3
  41. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +95 -33
  42. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +3 -2
  43. data/ext/zstdruby/libzstd/compress/zstd_fast.c +433 -148
  44. data/ext/zstdruby/libzstd/compress/zstd_fast.h +3 -2
  45. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +306 -283
  46. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +4 -2
  47. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +5 -5
  48. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  49. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +1 -1
  50. data/ext/zstdruby/libzstd/compress/zstd_opt.c +104 -80
  51. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  52. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +12 -5
  53. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +1 -1
  54. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +434 -441
  55. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +30 -39
  56. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +3 -4
  57. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +1 -1
  58. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +164 -42
  59. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +186 -65
  60. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +1 -1
  61. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +4 -2
  62. data/ext/zstdruby/libzstd/dictBuilder/cover.c +19 -15
  63. data/ext/zstdruby/libzstd/dictBuilder/cover.h +1 -1
  64. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +2 -2
  65. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +9 -87
  66. data/ext/zstdruby/libzstd/zdict.h +53 -31
  67. data/ext/zstdruby/libzstd/zstd.h +489 -90
  68. data/ext/zstdruby/libzstd/zstd_errors.h +27 -8
  69. data/ext/zstdruby/main.c +4 -0
  70. data/lib/zstd-ruby/version.rb +1 -1
  71. metadata +7 -6
@@ -1,7 +1,7 @@
1
1
  /* ******************************************************************
2
2
  * huff0 huffman decoder,
3
3
  * part of Finite State Entropy library
4
- * Copyright (c) Yann Collet, Facebook, Inc.
4
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
5
5
  *
6
6
  * You can contact the author at :
7
7
  * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -19,10 +19,10 @@
19
19
  #include "../common/compiler.h"
20
20
  #include "../common/bitstream.h" /* BIT_* */
21
21
  #include "../common/fse.h" /* to compress headers */
22
- #define HUF_STATIC_LINKING_ONLY
23
22
  #include "../common/huf.h"
24
23
  #include "../common/error_private.h"
25
24
  #include "../common/zstd_internal.h"
25
+ #include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
26
26
 
27
27
  /* **************************************************************
28
28
  * Constants
@@ -43,10 +43,14 @@
43
43
  #error "Cannot force the use of the X1 and X2 decoders at the same time!"
44
44
  #endif
45
45
 
46
- #if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2
47
- # define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
46
+ /* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
47
+ * supported at runtime, so we can add the BMI2 target attribute.
48
+ * When it is disabled, we will still get BMI2 if it is enabled statically.
49
+ */
50
+ #if DYNAMIC_BMI2
51
+ # define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
48
52
  #else
49
- # define HUF_ASM_X86_64_BMI2_ATTRS
53
+ # define HUF_FAST_BMI2_ATTRS
50
54
  #endif
51
55
 
52
56
  #ifdef __cplusplus
@@ -56,18 +60,12 @@
56
60
  #endif
57
61
  #define HUF_ASM_DECL HUF_EXTERN_C
58
62
 
59
- #if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
63
+ #if DYNAMIC_BMI2
60
64
  # define HUF_NEED_BMI2_FUNCTION 1
61
65
  #else
62
66
  # define HUF_NEED_BMI2_FUNCTION 0
63
67
  #endif
64
68
 
65
- #if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
66
- # define HUF_NEED_DEFAULT_FUNCTION 1
67
- #else
68
- # define HUF_NEED_DEFAULT_FUNCTION 0
69
- #endif
70
-
71
69
  /* **************************************************************
72
70
  * Error Management
73
71
  ****************************************************************/
@@ -84,6 +82,11 @@
84
82
  /* **************************************************************
85
83
  * BMI2 Variant Wrappers
86
84
  ****************************************************************/
85
+ typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
86
+ const void *cSrc,
87
+ size_t cSrcSize,
88
+ const HUF_DTable *DTable);
89
+
87
90
  #if DYNAMIC_BMI2
88
91
 
89
92
  #define HUF_DGEN(fn) \
@@ -105,9 +108,9 @@
105
108
  } \
106
109
  \
107
110
  static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
108
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
111
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
109
112
  { \
110
- if (bmi2) { \
113
+ if (flags & HUF_flags_bmi2) { \
111
114
  return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
112
115
  } \
113
116
  return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
@@ -117,9 +120,9 @@
117
120
 
118
121
  #define HUF_DGEN(fn) \
119
122
  static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
120
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
123
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
121
124
  { \
122
- (void)bmi2; \
125
+ (void)flags; \
123
126
  return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
124
127
  }
125
128
 
@@ -138,15 +141,28 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
138
141
  return dtd;
139
142
  }
140
143
 
141
- #if ZSTD_ENABLE_ASM_X86_64_BMI2
142
-
143
- static size_t HUF_initDStream(BYTE const* ip) {
144
+ static size_t HUF_initFastDStream(BYTE const* ip) {
144
145
  BYTE const lastByte = ip[7];
145
- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
146
+ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
146
147
  size_t const value = MEM_readLEST(ip) | 1;
147
148
  assert(bitsConsumed <= 8);
149
+ assert(sizeof(size_t) == 8);
148
150
  return value << bitsConsumed;
149
151
  }
152
+
153
+
154
+ /**
155
+ * The input/output arguments to the Huffman fast decoding loop:
156
+ *
157
+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
158
+ * op [in/out] - The output pointers, must be updated to reflect what is written.
159
+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
160
+ * dt [in] - The decoding table.
161
+ * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
162
+ * oend [in] - The end of the output stream. op[3] must not cross oend.
163
+ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
164
+ * as long as it is above ilimit, but that indicates corruption.
165
+ */
150
166
  typedef struct {
151
167
  BYTE const* ip[4];
152
168
  BYTE* op[4];
@@ -155,15 +171,17 @@ typedef struct {
155
171
  BYTE const* ilimit;
156
172
  BYTE* oend;
157
173
  BYTE const* iend[4];
158
- } HUF_DecompressAsmArgs;
174
+ } HUF_DecompressFastArgs;
175
+
176
+ typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
159
177
 
160
178
  /**
161
- * Initializes args for the asm decoding loop.
162
- * @returns 0 on success
163
- * 1 if the fallback implementation should be used.
179
+ * Initializes args for the fast decoding loop.
180
+ * @returns 1 on success
181
+ * 0 if the fallback implementation should be used.
164
182
  * Or an error code on failure.
165
183
  */
166
- static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
184
+ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
167
185
  {
168
186
  void const* dt = DTable + 1;
169
187
  U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
@@ -172,9 +190,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
172
190
 
173
191
  BYTE* const oend = (BYTE*)dst + dstSize;
174
192
 
175
- /* The following condition is false on x32 platform,
176
- * but HUF_asm is not compatible with this ABI */
177
- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1;
193
+ /* The fast decoding loop assumes 64-bit little-endian.
194
+ * This condition is false on x32.
195
+ */
196
+ if (!MEM_isLittleEndian() || MEM_32bits())
197
+ return 0;
178
198
 
179
199
  /* strict minimum : jump table + 1 byte per stream */
180
200
  if (srcSize < 10)
@@ -185,7 +205,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
185
205
  * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
186
206
  */
187
207
  if (dtLog != HUF_DECODER_FAST_TABLELOG)
188
- return 1;
208
+ return 0;
189
209
 
190
210
  /* Read the jump table. */
191
211
  {
@@ -199,13 +219,13 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
199
219
  args->iend[2] = args->iend[1] + length2;
200
220
  args->iend[3] = args->iend[2] + length3;
201
221
 
202
- /* HUF_initDStream() requires this, and this small of an input
222
+ /* HUF_initFastDStream() requires this, and this small of an input
203
223
  * won't benefit from the ASM loop anyways.
204
224
  * length1 must be >= 16 so that ip[0] >= ilimit before the loop
205
225
  * starts.
206
226
  */
207
227
  if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
208
- return 1;
228
+ return 0;
209
229
  if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
210
230
  }
211
231
  /* ip[] contains the position that is currently loaded into bits[]. */
@@ -222,7 +242,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
222
242
 
223
243
  /* No point to call the ASM loop for tiny outputs. */
224
244
  if (args->op[3] >= oend)
225
- return 1;
245
+ return 0;
226
246
 
227
247
  /* bits[] is the bit container.
228
248
  * It is read from the MSB down to the LSB.
@@ -231,10 +251,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
231
251
  * set, so that CountTrailingZeros(bits[]) can be used
232
252
  * to count how many bits we've consumed.
233
253
  */
234
- args->bits[0] = HUF_initDStream(args->ip[0]);
235
- args->bits[1] = HUF_initDStream(args->ip[1]);
236
- args->bits[2] = HUF_initDStream(args->ip[2]);
237
- args->bits[3] = HUF_initDStream(args->ip[3]);
254
+ args->bits[0] = HUF_initFastDStream(args->ip[0]);
255
+ args->bits[1] = HUF_initFastDStream(args->ip[1]);
256
+ args->bits[2] = HUF_initFastDStream(args->ip[2]);
257
+ args->bits[3] = HUF_initFastDStream(args->ip[3]);
238
258
 
239
259
  /* If ip[] >= ilimit, it is guaranteed to be safe to
240
260
  * reload bits[]. It may be beyond its section, but is
@@ -245,10 +265,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
245
265
  args->oend = oend;
246
266
  args->dt = dt;
247
267
 
248
- return 0;
268
+ return 1;
249
269
  }
250
270
 
251
- static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd)
271
+ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
252
272
  {
253
273
  /* Validate that we haven't overwritten. */
254
274
  if (args->op[stream] > segmentEnd)
@@ -262,15 +282,15 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs
262
282
  return ERROR(corruption_detected);
263
283
 
264
284
  /* Construct the BIT_DStream_t. */
265
- bit->bitContainer = MEM_readLE64(args->ip[stream]);
266
- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
285
+ assert(sizeof(size_t) == 8);
286
+ bit->bitContainer = MEM_readLEST(args->ip[stream]);
287
+ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
267
288
  bit->start = (const char*)args->iend[0];
268
289
  bit->limitPtr = bit->start + sizeof(size_t);
269
290
  bit->ptr = (const char*)args->ip[stream];
270
291
 
271
292
  return 0;
272
293
  }
273
- #endif
274
294
 
275
295
 
276
296
  #ifndef HUF_FORCE_DECOMPRESS_X2
@@ -287,10 +307,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi
287
307
  static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
288
308
  U64 D4;
289
309
  if (MEM_isLittleEndian()) {
290
- D4 = (symbol << 8) + nbBits;
310
+ D4 = (U64)((symbol << 8) + nbBits);
291
311
  } else {
292
- D4 = symbol + (nbBits << 8);
312
+ D4 = (U64)(symbol + (nbBits << 8));
293
313
  }
314
+ assert(D4 < (1U << 16));
294
315
  D4 *= 0x0001000100010001ULL;
295
316
  return D4;
296
317
  }
@@ -333,13 +354,7 @@ typedef struct {
333
354
  BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
334
355
  } HUF_ReadDTableX1_Workspace;
335
356
 
336
-
337
- size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
338
- {
339
- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
340
- }
341
-
342
- size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2)
357
+ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
343
358
  {
344
359
  U32 tableLog = 0;
345
360
  U32 nbSymbols = 0;
@@ -354,7 +369,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
354
369
  DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
355
370
  /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
356
371
 
357
- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
372
+ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
358
373
  if (HUF_isError(iSize)) return iSize;
359
374
 
360
375
 
@@ -381,9 +396,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
381
396
  * rankStart[0] is not filled because there are no entries in the table for
382
397
  * weight 0.
383
398
  */
384
- {
385
- int n;
386
- int nextRankStart = 0;
399
+ { int n;
400
+ U32 nextRankStart = 0;
387
401
  int const unroll = 4;
388
402
  int const nLimit = (int)nbSymbols - unroll + 1;
389
403
  for (n=0; n<(int)tableLog+1; n++) {
@@ -410,10 +424,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
410
424
  * We can switch based on the length to a different inner loop which is
411
425
  * optimized for that particular case.
412
426
  */
413
- {
414
- U32 w;
415
- int symbol=wksp->rankVal[0];
416
- int rankStart=0;
427
+ { U32 w;
428
+ int symbol = wksp->rankVal[0];
429
+ int rankStart = 0;
417
430
  for (w=1; w<tableLog+1; ++w) {
418
431
  int const symbolCount = wksp->rankVal[w];
419
432
  int const length = (1 << w) >> 1;
@@ -523,7 +536,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
523
536
  while (p < pEnd)
524
537
  HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
525
538
 
526
- return pEnd-pStart;
539
+ return (size_t)(pEnd-pStart);
527
540
  }
528
541
 
529
542
  FORCE_INLINE_TEMPLATE size_t
@@ -549,6 +562,10 @@ HUF_decompress1X1_usingDTable_internal_body(
549
562
  return dstSize;
550
563
  }
551
564
 
565
+ /* HUF_decompress4X1_usingDTable_internal_body():
566
+ * Conditions :
567
+ * @dstSize >= 6
568
+ */
552
569
  FORCE_INLINE_TEMPLATE size_t
553
570
  HUF_decompress4X1_usingDTable_internal_body(
554
571
  void* dst, size_t dstSize,
@@ -592,6 +609,7 @@ HUF_decompress4X1_usingDTable_internal_body(
592
609
 
593
610
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
594
611
  if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
612
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
595
613
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
596
614
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
597
615
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -654,38 +672,142 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
654
672
  }
655
673
  #endif
656
674
 
657
- #if HUF_NEED_DEFAULT_FUNCTION
658
675
  static
659
676
  size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
660
677
  size_t cSrcSize, HUF_DTable const* DTable) {
661
678
  return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
662
679
  }
663
- #endif
664
680
 
665
681
  #if ZSTD_ENABLE_ASM_X86_64_BMI2
666
682
 
667
- HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
683
+ HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
668
684
 
669
- static HUF_ASM_X86_64_BMI2_ATTRS
685
+ #endif
686
+
687
+ static HUF_FAST_BMI2_ATTRS
688
+ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
689
+ {
690
+ U64 bits[4];
691
+ BYTE const* ip[4];
692
+ BYTE* op[4];
693
+ U16 const* const dtable = (U16 const*)args->dt;
694
+ BYTE* const oend = args->oend;
695
+ BYTE const* const ilimit = args->ilimit;
696
+
697
+ /* Copy the arguments to local variables */
698
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
699
+ ZSTD_memcpy(&ip, &args->ip, sizeof(ip));
700
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
701
+
702
+ assert(MEM_isLittleEndian());
703
+ assert(!MEM_32bits());
704
+
705
+ for (;;) {
706
+ BYTE* olimit;
707
+ int stream;
708
+ int symbol;
709
+
710
+ /* Assert loop preconditions */
711
+ #ifndef NDEBUG
712
+ for (stream = 0; stream < 4; ++stream) {
713
+ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
714
+ assert(ip[stream] >= ilimit);
715
+ }
716
+ #endif
717
+ /* Compute olimit */
718
+ {
719
+ /* Each iteration produces 5 output symbols per stream */
720
+ size_t const oiters = (size_t)(oend - op[3]) / 5;
721
+ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
722
+ * per stream.
723
+ */
724
+ size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
725
+ /* We can safely run iters iterations before running bounds checks */
726
+ size_t const iters = MIN(oiters, iiters);
727
+ size_t const symbols = iters * 5;
728
+
729
+ /* We can simply check that op[3] < olimit, instead of checking all
730
+ * of our bounds, since we can't hit the other bounds until we've run
731
+ * iters iterations, which only happens when op[3] == olimit.
732
+ */
733
+ olimit = op[3] + symbols;
734
+
735
+ /* Exit fast decoding loop once we get close to the end. */
736
+ if (op[3] + 20 > olimit)
737
+ break;
738
+
739
+ /* Exit the decoding loop if any input pointer has crossed the
740
+ * previous one. This indicates corruption, and a precondition
741
+ * to our loop is that ip[i] >= ip[0].
742
+ */
743
+ for (stream = 1; stream < 4; ++stream) {
744
+ if (ip[stream] < ip[stream - 1])
745
+ goto _out;
746
+ }
747
+ }
748
+
749
+ #ifndef NDEBUG
750
+ for (stream = 1; stream < 4; ++stream) {
751
+ assert(ip[stream] >= ip[stream - 1]);
752
+ }
753
+ #endif
754
+
755
+ do {
756
+ /* Decode 5 symbols in each of the 4 streams */
757
+ for (symbol = 0; symbol < 5; ++symbol) {
758
+ for (stream = 0; stream < 4; ++stream) {
759
+ int const index = (int)(bits[stream] >> 53);
760
+ int const entry = (int)dtable[index];
761
+ bits[stream] <<= (entry & 63);
762
+ op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
763
+ }
764
+ }
765
+ /* Reload the bitstreams */
766
+ for (stream = 0; stream < 4; ++stream) {
767
+ int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
768
+ int const nbBits = ctz & 7;
769
+ int const nbBytes = ctz >> 3;
770
+ op[stream] += 5;
771
+ ip[stream] -= nbBytes;
772
+ bits[stream] = MEM_read64(ip[stream]) | 1;
773
+ bits[stream] <<= nbBits;
774
+ }
775
+ } while (op[3] < olimit);
776
+ }
777
+
778
+ _out:
779
+
780
+ /* Save the final values of each of the state variables back to args. */
781
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
782
+ ZSTD_memcpy(&args->ip, &ip, sizeof(ip));
783
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
784
+ }
785
+
786
+ /**
787
+ * @returns @p dstSize on success (>= 6)
788
+ * 0 if the fallback implementation should be used
789
+ * An error if an error occurred
790
+ */
791
+ static HUF_FAST_BMI2_ATTRS
670
792
  size_t
671
- HUF_decompress4X1_usingDTable_internal_bmi2_asm(
793
+ HUF_decompress4X1_usingDTable_internal_fast(
672
794
  void* dst, size_t dstSize,
673
795
  const void* cSrc, size_t cSrcSize,
674
- const HUF_DTable* DTable)
796
+ const HUF_DTable* DTable,
797
+ HUF_DecompressFastLoopFn loopFn)
675
798
  {
676
799
  void const* dt = DTable + 1;
677
800
  const BYTE* const iend = (const BYTE*)cSrc + 6;
678
801
  BYTE* const oend = (BYTE*)dst + dstSize;
679
- HUF_DecompressAsmArgs args;
680
- {
681
- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
682
- FORWARD_IF_ERROR(ret, "Failed to init asm args");
683
- if (ret != 0)
684
- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
802
+ HUF_DecompressFastArgs args;
803
+ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
804
+ FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
805
+ if (ret == 0)
806
+ return 0;
685
807
  }
686
808
 
687
809
  assert(args.ip[0] >= args.ilimit);
688
- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
810
+ loopFn(&args);
689
811
 
690
812
  /* Our loop guarantees that ip[] >= ilimit and that we haven't
691
813
  * overwritten any op[].
@@ -698,8 +820,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
698
820
  (void)iend;
699
821
 
700
822
  /* finish bit streams one by one. */
701
- {
702
- size_t const segmentSize = (dstSize+3) / 4;
823
+ { size_t const segmentSize = (dstSize+3) / 4;
703
824
  BYTE* segmentEnd = (BYTE*)dst;
704
825
  int i;
705
826
  for (i = 0; i < 4; ++i) {
@@ -716,97 +837,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
716
837
  }
717
838
 
718
839
  /* decoded size */
840
+ assert(dstSize != 0);
719
841
  return dstSize;
720
842
  }
721
- #endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
722
-
723
- typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
724
- const void *cSrc,
725
- size_t cSrcSize,
726
- const HUF_DTable *DTable);
727
843
 
728
844
  HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
729
845
 
730
846
  static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
731
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
847
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
732
848
  {
849
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
850
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
851
+
733
852
  #if DYNAMIC_BMI2
734
- if (bmi2) {
853
+ if (flags & HUF_flags_bmi2) {
854
+ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
735
855
  # if ZSTD_ENABLE_ASM_X86_64_BMI2
736
- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
737
- # else
738
- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
856
+ if (!(flags & HUF_flags_disableAsm)) {
857
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
858
+ }
739
859
  # endif
860
+ } else {
861
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
740
862
  }
741
- #else
742
- (void)bmi2;
743
863
  #endif
744
864
 
745
865
  #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
746
- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
747
- #else
748
- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
866
+ if (!(flags & HUF_flags_disableAsm)) {
867
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
868
+ }
749
869
  #endif
750
- }
751
870
 
752
-
753
- size_t HUF_decompress1X1_usingDTable(
754
- void* dst, size_t dstSize,
755
- const void* cSrc, size_t cSrcSize,
756
- const HUF_DTable* DTable)
757
- {
758
- DTableDesc dtd = HUF_getDTableDesc(DTable);
759
- if (dtd.tableType != 0) return ERROR(GENERIC);
760
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
761
- }
762
-
763
- size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
764
- const void* cSrc, size_t cSrcSize,
765
- void* workSpace, size_t wkspSize)
766
- {
767
- const BYTE* ip = (const BYTE*) cSrc;
768
-
769
- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
770
- if (HUF_isError(hSize)) return hSize;
771
- if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
772
- ip += hSize; cSrcSize -= hSize;
773
-
774
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
775
- }
776
-
777
-
778
- size_t HUF_decompress4X1_usingDTable(
779
- void* dst, size_t dstSize,
780
- const void* cSrc, size_t cSrcSize,
781
- const HUF_DTable* DTable)
782
- {
783
- DTableDesc dtd = HUF_getDTableDesc(DTable);
784
- if (dtd.tableType != 0) return ERROR(GENERIC);
785
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
871
+ if (!(flags & HUF_flags_disableFast)) {
872
+ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
873
+ if (ret != 0)
874
+ return ret;
875
+ }
876
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
786
877
  }
787
878
 
788
- static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
879
+ static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
789
880
  const void* cSrc, size_t cSrcSize,
790
- void* workSpace, size_t wkspSize, int bmi2)
881
+ void* workSpace, size_t wkspSize, int flags)
791
882
  {
792
883
  const BYTE* ip = (const BYTE*) cSrc;
793
884
 
794
- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
885
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
795
886
  if (HUF_isError(hSize)) return hSize;
796
887
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
797
888
  ip += hSize; cSrcSize -= hSize;
798
889
 
799
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
800
- }
801
-
802
- size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
803
- const void* cSrc, size_t cSrcSize,
804
- void* workSpace, size_t wkspSize)
805
- {
806
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
890
+ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
807
891
  }
808
892
 
809
-
810
893
  #endif /* HUF_FORCE_DECOMPRESS_X2 */
811
894
 
812
895
 
@@ -989,7 +1072,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32
989
1072
 
990
1073
  static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
991
1074
  const sortedSymbol_t* sortedList,
992
- const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
1075
+ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
993
1076
  const U32 nbBitsBaseline)
994
1077
  {
995
1078
  U32* const rankVal = rankValOrigin[0];
@@ -1044,14 +1127,7 @@ typedef struct {
1044
1127
 
1045
1128
  size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
1046
1129
  const void* src, size_t srcSize,
1047
- void* workSpace, size_t wkspSize)
1048
- {
1049
- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
1050
- }
1051
-
1052
- size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
1053
- const void* src, size_t srcSize,
1054
- void* workSpace, size_t wkspSize, int bmi2)
1130
+ void* workSpace, size_t wkspSize, int flags)
1055
1131
  {
1056
1132
  U32 tableLog, maxW, nbSymbols;
1057
1133
  DTableDesc dtd = HUF_getDTableDesc(DTable);
@@ -1073,7 +1149,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
1073
1149
  if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
1074
1150
  /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
1075
1151
 
1076
- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
1152
+ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
1077
1153
  if (HUF_isError(iSize)) return iSize;
1078
1154
 
1079
1155
  /* check result */
@@ -1244,6 +1320,11 @@ HUF_decompress1X2_usingDTable_internal_body(
1244
1320
  /* decoded size */
1245
1321
  return dstSize;
1246
1322
  }
1323
+
1324
+ /* HUF_decompress4X2_usingDTable_internal_body():
1325
+ * Conditions:
1326
+ * @dstSize >= 6
1327
+ */
1247
1328
  FORCE_INLINE_TEMPLATE size_t
1248
1329
  HUF_decompress4X2_usingDTable_internal_body(
1249
1330
  void* dst, size_t dstSize,
@@ -1284,8 +1365,9 @@ HUF_decompress4X2_usingDTable_internal_body(
1284
1365
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1285
1366
  U32 const dtLog = dtd.tableLog;
1286
1367
 
1287
- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1288
- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
1368
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1369
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
1370
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
1289
1371
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
1290
1372
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
1291
1373
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -1370,36 +1452,177 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
1370
1452
  }
1371
1453
  #endif
1372
1454
 
1373
- #if HUF_NEED_DEFAULT_FUNCTION
1374
1455
  static
1375
1456
  size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
1376
1457
  size_t cSrcSize, HUF_DTable const* DTable) {
1377
1458
  return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
1378
1459
  }
1379
- #endif
1380
1460
 
1381
1461
  #if ZSTD_ENABLE_ASM_X86_64_BMI2
1382
1462
 
1383
- HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
1463
+ HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
1464
+
1465
+ #endif
1466
+
1467
+ static HUF_FAST_BMI2_ATTRS
1468
+ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
1469
+ {
1470
+ U64 bits[4];
1471
+ BYTE const* ip[4];
1472
+ BYTE* op[4];
1473
+ BYTE* oend[4];
1474
+ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
1475
+ BYTE const* const ilimit = args->ilimit;
1476
+
1477
+ /* Copy the arguments to local registers. */
1478
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
1479
+ ZSTD_memcpy(&ip, &args->ip, sizeof(ip));
1480
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
1481
+
1482
+ oend[0] = op[1];
1483
+ oend[1] = op[2];
1484
+ oend[2] = op[3];
1485
+ oend[3] = args->oend;
1486
+
1487
+ assert(MEM_isLittleEndian());
1488
+ assert(!MEM_32bits());
1489
+
1490
+ for (;;) {
1491
+ BYTE* olimit;
1492
+ int stream;
1493
+ int symbol;
1494
+
1495
+ /* Assert loop preconditions */
1496
+ #ifndef NDEBUG
1497
+ for (stream = 0; stream < 4; ++stream) {
1498
+ assert(op[stream] <= oend[stream]);
1499
+ assert(ip[stream] >= ilimit);
1500
+ }
1501
+ #endif
1502
+ /* Compute olimit */
1503
+ {
1504
+ /* Each loop does 5 table lookups for each of the 4 streams.
1505
+ * Each table lookup consumes up to 11 bits of input, and produces
1506
+ * up to 2 bytes of output.
1507
+ */
1508
+ /* We can consume up to 7 bytes of input per iteration per stream.
1509
+ * We also know that each input pointer is >= ip[0]. So we can run
1510
+ * iters loops before running out of input.
1511
+ */
1512
+ size_t iters = (size_t)(ip[0] - ilimit) / 7;
1513
+ /* Each iteration can produce up to 10 bytes of output per stream.
1514
+ * Each output stream may advance at different rates. So take the
1515
+ * minimum number of safe iterations among all the output streams.
1516
+ */
1517
+ for (stream = 0; stream < 4; ++stream) {
1518
+ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
1519
+ iters = MIN(iters, oiters);
1520
+ }
1521
+
1522
+ /* Each iteration produces at least 5 output symbols. So until
1523
+ * op[3] crosses olimit, we know we haven't executed iters
1524
+ * iterations yet. This saves us maintaining an iters counter,
1525
+ * at the expense of computing the remaining # of iterations
1526
+ * more frequently.
1527
+ */
1528
+ olimit = op[3] + (iters * 5);
1529
+
1530
+ /* Exit the fast decoding loop if we are too close to the end. */
1531
+ if (op[3] + 10 > olimit)
1532
+ break;
1533
+
1534
+ /* Exit the decoding loop if any input pointer has crossed the
1535
+ * previous one. This indicates corruption, and a precondition
1536
+ * to our loop is that ip[i] >= ip[0].
1537
+ */
1538
+ for (stream = 1; stream < 4; ++stream) {
1539
+ if (ip[stream] < ip[stream - 1])
1540
+ goto _out;
1541
+ }
1542
+ }
1543
+
1544
+ #ifndef NDEBUG
1545
+ for (stream = 1; stream < 4; ++stream) {
1546
+ assert(ip[stream] >= ip[stream - 1]);
1547
+ }
1548
+ #endif
1549
+
1550
+ do {
1551
+ /* Do 5 table lookups for each of the first 3 streams */
1552
+ for (symbol = 0; symbol < 5; ++symbol) {
1553
+ for (stream = 0; stream < 3; ++stream) {
1554
+ int const index = (int)(bits[stream] >> 53);
1555
+ HUF_DEltX2 const entry = dtable[index];
1556
+ MEM_write16(op[stream], entry.sequence);
1557
+ bits[stream] <<= (entry.nbBits);
1558
+ op[stream] += (entry.length);
1559
+ }
1560
+ }
1561
+ /* Do 1 table lookup from the final stream */
1562
+ {
1563
+ int const index = (int)(bits[3] >> 53);
1564
+ HUF_DEltX2 const entry = dtable[index];
1565
+ MEM_write16(op[3], entry.sequence);
1566
+ bits[3] <<= (entry.nbBits);
1567
+ op[3] += (entry.length);
1568
+ }
1569
+ /* Do 4 table lookups from the final stream & reload bitstreams */
1570
+ for (stream = 0; stream < 4; ++stream) {
1571
+ /* Do a table lookup from the final stream.
1572
+ * This is interleaved with the reloading to reduce register
1573
+ * pressure. This shouldn't be necessary, but compilers can
1574
+ * struggle with codegen with high register pressure.
1575
+ */
1576
+ {
1577
+ int const index = (int)(bits[3] >> 53);
1578
+ HUF_DEltX2 const entry = dtable[index];
1579
+ MEM_write16(op[3], entry.sequence);
1580
+ bits[3] <<= (entry.nbBits);
1581
+ op[3] += (entry.length);
1582
+ }
1583
+ /* Reload the bitstreams. The final bitstream must be reloaded
1584
+ * after the 5th symbol was decoded.
1585
+ */
1586
+ {
1587
+ int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
1588
+ int const nbBits = ctz & 7;
1589
+ int const nbBytes = ctz >> 3;
1590
+ ip[stream] -= nbBytes;
1591
+ bits[stream] = MEM_read64(ip[stream]) | 1;
1592
+ bits[stream] <<= nbBits;
1593
+ }
1594
+ }
1595
+ } while (op[3] < olimit);
1596
+ }
1384
1597
 
1385
- static HUF_ASM_X86_64_BMI2_ATTRS size_t
1386
- HUF_decompress4X2_usingDTable_internal_bmi2_asm(
1598
+ _out:
1599
+
1600
+ /* Save the final values of each of the state variables back to args. */
1601
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
1602
+ ZSTD_memcpy(&args->ip, &ip, sizeof(ip));
1603
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
1604
+ }
1605
+
1606
+
1607
+ static HUF_FAST_BMI2_ATTRS size_t
1608
+ HUF_decompress4X2_usingDTable_internal_fast(
1387
1609
  void* dst, size_t dstSize,
1388
1610
  const void* cSrc, size_t cSrcSize,
1389
- const HUF_DTable* DTable) {
1611
+ const HUF_DTable* DTable,
1612
+ HUF_DecompressFastLoopFn loopFn) {
1390
1613
  void const* dt = DTable + 1;
1391
1614
  const BYTE* const iend = (const BYTE*)cSrc + 6;
1392
1615
  BYTE* const oend = (BYTE*)dst + dstSize;
1393
- HUF_DecompressAsmArgs args;
1616
+ HUF_DecompressFastArgs args;
1394
1617
  {
1395
- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
1618
+ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
1396
1619
  FORWARD_IF_ERROR(ret, "Failed to init asm args");
1397
- if (ret != 0)
1398
- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
1620
+ if (ret == 0)
1621
+ return 0;
1399
1622
  }
1400
1623
 
1401
1624
  assert(args.ip[0] >= args.ilimit);
1402
- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args);
1625
+ loopFn(&args);
1403
1626
 
1404
1627
  /* note : op4 already verified within main loop */
1405
1628
  assert(args.ip[0] >= iend);
@@ -1430,91 +1653,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm(
1430
1653
  /* decoded size */
1431
1654
  return dstSize;
1432
1655
  }
1433
- #endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
1434
1656
 
1435
1657
  static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
1436
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
1658
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
1437
1659
  {
1660
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
1661
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
1662
+
1438
1663
  #if DYNAMIC_BMI2
1439
- if (bmi2) {
1664
+ if (flags & HUF_flags_bmi2) {
1665
+ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
1440
1666
  # if ZSTD_ENABLE_ASM_X86_64_BMI2
1441
- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
1442
- # else
1443
- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
1667
+ if (!(flags & HUF_flags_disableAsm)) {
1668
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
1669
+ }
1444
1670
  # endif
1671
+ } else {
1672
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
1445
1673
  }
1446
- #else
1447
- (void)bmi2;
1448
1674
  #endif
1449
1675
 
1450
1676
  #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
1451
- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
1452
- #else
1453
- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
1677
+ if (!(flags & HUF_flags_disableAsm)) {
1678
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
1679
+ }
1454
1680
  #endif
1681
+
1682
+ if (!(flags & HUF_flags_disableFast)) {
1683
+ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
1684
+ if (ret != 0)
1685
+ return ret;
1686
+ }
1687
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
1455
1688
  }
1456
1689
 
1457
1690
  HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
1458
1691
 
1459
- size_t HUF_decompress1X2_usingDTable(
1460
- void* dst, size_t dstSize,
1461
- const void* cSrc, size_t cSrcSize,
1462
- const HUF_DTable* DTable)
1463
- {
1464
- DTableDesc dtd = HUF_getDTableDesc(DTable);
1465
- if (dtd.tableType != 1) return ERROR(GENERIC);
1466
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1467
- }
1468
-
1469
1692
  size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
1470
1693
  const void* cSrc, size_t cSrcSize,
1471
- void* workSpace, size_t wkspSize)
1694
+ void* workSpace, size_t wkspSize, int flags)
1472
1695
  {
1473
1696
  const BYTE* ip = (const BYTE*) cSrc;
1474
1697
 
1475
1698
  size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
1476
- workSpace, wkspSize);
1699
+ workSpace, wkspSize, flags);
1477
1700
  if (HUF_isError(hSize)) return hSize;
1478
1701
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1479
1702
  ip += hSize; cSrcSize -= hSize;
1480
1703
 
1481
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
1482
- }
1483
-
1484
-
1485
- size_t HUF_decompress4X2_usingDTable(
1486
- void* dst, size_t dstSize,
1487
- const void* cSrc, size_t cSrcSize,
1488
- const HUF_DTable* DTable)
1489
- {
1490
- DTableDesc dtd = HUF_getDTableDesc(DTable);
1491
- if (dtd.tableType != 1) return ERROR(GENERIC);
1492
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1704
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
1493
1705
  }
1494
1706
 
1495
- static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
1707
+ static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1496
1708
  const void* cSrc, size_t cSrcSize,
1497
- void* workSpace, size_t wkspSize, int bmi2)
1709
+ void* workSpace, size_t wkspSize, int flags)
1498
1710
  {
1499
1711
  const BYTE* ip = (const BYTE*) cSrc;
1500
1712
 
1501
1713
  size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
1502
- workSpace, wkspSize);
1714
+ workSpace, wkspSize, flags);
1503
1715
  if (HUF_isError(hSize)) return hSize;
1504
1716
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1505
1717
  ip += hSize; cSrcSize -= hSize;
1506
1718
 
1507
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
1719
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
1508
1720
  }
1509
1721
 
1510
- size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1511
- const void* cSrc, size_t cSrcSize,
1512
- void* workSpace, size_t wkspSize)
1513
- {
1514
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
1515
- }
1516
-
1517
-
1518
1722
  #endif /* HUF_FORCE_DECOMPRESS_X1 */
1519
1723
 
1520
1724
 
@@ -1522,44 +1726,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1522
1726
  /* Universal decompression selectors */
1523
1727
  /* ***********************************/
1524
1728
 
1525
- size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
1526
- const void* cSrc, size_t cSrcSize,
1527
- const HUF_DTable* DTable)
1528
- {
1529
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
1530
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1531
- (void)dtd;
1532
- assert(dtd.tableType == 0);
1533
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1534
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1535
- (void)dtd;
1536
- assert(dtd.tableType == 1);
1537
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1538
- #else
1539
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
1540
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1541
- #endif
1542
- }
1543
-
1544
- size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
1545
- const void* cSrc, size_t cSrcSize,
1546
- const HUF_DTable* DTable)
1547
- {
1548
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
1549
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1550
- (void)dtd;
1551
- assert(dtd.tableType == 0);
1552
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1553
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1554
- (void)dtd;
1555
- assert(dtd.tableType == 1);
1556
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1557
- #else
1558
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
1559
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1560
- #endif
1561
- }
1562
-
1563
1729
 
1564
1730
  #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1565
1731
  typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
@@ -1614,36 +1780,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
1614
1780
  #endif
1615
1781
  }
1616
1782
 
1617
-
1618
- size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
1619
- size_t dstSize, const void* cSrc,
1620
- size_t cSrcSize, void* workSpace,
1621
- size_t wkspSize)
1622
- {
1623
- /* validation checks */
1624
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1625
- if (cSrcSize == 0) return ERROR(corruption_detected);
1626
-
1627
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1628
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1629
- (void)algoNb;
1630
- assert(algoNb == 0);
1631
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1632
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1633
- (void)algoNb;
1634
- assert(algoNb == 1);
1635
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1636
- #else
1637
- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1638
- cSrcSize, workSpace, wkspSize):
1639
- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1640
- #endif
1641
- }
1642
- }
1643
-
1644
1783
  size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1645
1784
  const void* cSrc, size_t cSrcSize,
1646
- void* workSpace, size_t wkspSize)
1785
+ void* workSpace, size_t wkspSize, int flags)
1647
1786
  {
1648
1787
  /* validation checks */
1649
1788
  if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1656,71 +1795,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1656
1795
  (void)algoNb;
1657
1796
  assert(algoNb == 0);
1658
1797
  return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
1659
- cSrcSize, workSpace, wkspSize);
1798
+ cSrcSize, workSpace, wkspSize, flags);
1660
1799
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1661
1800
  (void)algoNb;
1662
1801
  assert(algoNb == 1);
1663
1802
  return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1664
- cSrcSize, workSpace, wkspSize);
1803
+ cSrcSize, workSpace, wkspSize, flags);
1665
1804
  #else
1666
1805
  return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1667
- cSrcSize, workSpace, wkspSize):
1806
+ cSrcSize, workSpace, wkspSize, flags):
1668
1807
  HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
1669
- cSrcSize, workSpace, wkspSize);
1808
+ cSrcSize, workSpace, wkspSize, flags);
1670
1809
  #endif
1671
1810
  }
1672
1811
  }
1673
1812
 
1674
1813
 
1675
- size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1814
+ size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
1676
1815
  {
1677
1816
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1678
1817
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1679
1818
  (void)dtd;
1680
1819
  assert(dtd.tableType == 0);
1681
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1820
+ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1682
1821
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1683
1822
  (void)dtd;
1684
1823
  assert(dtd.tableType == 1);
1685
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1824
+ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1686
1825
  #else
1687
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1688
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1826
+ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
1827
+ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1689
1828
  #endif
1690
1829
  }
1691
1830
 
1692
1831
  #ifndef HUF_FORCE_DECOMPRESS_X2
1693
- size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1832
+ size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
1694
1833
  {
1695
1834
  const BYTE* ip = (const BYTE*) cSrc;
1696
1835
 
1697
- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1836
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
1698
1837
  if (HUF_isError(hSize)) return hSize;
1699
1838
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1700
1839
  ip += hSize; cSrcSize -= hSize;
1701
1840
 
1702
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
1841
+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
1703
1842
  }
1704
1843
  #endif
1705
1844
 
1706
- size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1845
+ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
1707
1846
  {
1708
1847
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1709
1848
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1710
1849
  (void)dtd;
1711
1850
  assert(dtd.tableType == 0);
1712
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1851
+ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1713
1852
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1714
1853
  (void)dtd;
1715
1854
  assert(dtd.tableType == 1);
1716
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1855
+ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1717
1856
  #else
1718
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1719
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1857
+ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
1858
+ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1720
1859
  #endif
1721
1860
  }
1722
1861
 
1723
- size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1862
+ size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
1724
1863
  {
1725
1864
  /* validation checks */
1726
1865
  if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1730,160 +1869,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
1730
1869
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1731
1870
  (void)algoNb;
1732
1871
  assert(algoNb == 0);
1733
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1872
+ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1734
1873
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1735
1874
  (void)algoNb;
1736
1875
  assert(algoNb == 1);
1737
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1876
+ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1738
1877
  #else
1739
- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
1740
- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1878
+ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
1879
+ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1741
1880
  #endif
1742
1881
  }
1743
1882
  }
1744
-
1745
- #ifndef ZSTD_NO_UNUSED_FUNCTIONS
1746
- #ifndef HUF_FORCE_DECOMPRESS_X2
1747
- size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize)
1748
- {
1749
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1750
- return HUF_readDTableX1_wksp(DTable, src, srcSize,
1751
- workSpace, sizeof(workSpace));
1752
- }
1753
-
1754
- size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
1755
- const void* cSrc, size_t cSrcSize)
1756
- {
1757
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1758
- return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
1759
- workSpace, sizeof(workSpace));
1760
- }
1761
-
1762
- size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1763
- {
1764
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
1765
- return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
1766
- }
1767
- #endif
1768
-
1769
- #ifndef HUF_FORCE_DECOMPRESS_X1
1770
- size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
1771
- {
1772
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1773
- return HUF_readDTableX2_wksp(DTable, src, srcSize,
1774
- workSpace, sizeof(workSpace));
1775
- }
1776
-
1777
- size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
1778
- const void* cSrc, size_t cSrcSize)
1779
- {
1780
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1781
- return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
1782
- workSpace, sizeof(workSpace));
1783
- }
1784
-
1785
- size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1786
- {
1787
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
1788
- return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1789
- }
1790
- #endif
1791
-
1792
- #ifndef HUF_FORCE_DECOMPRESS_X2
1793
- size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1794
- {
1795
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1796
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1797
- workSpace, sizeof(workSpace));
1798
- }
1799
- size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1800
- {
1801
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
1802
- return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1803
- }
1804
- #endif
1805
-
1806
- #ifndef HUF_FORCE_DECOMPRESS_X1
1807
- size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
1808
- const void* cSrc, size_t cSrcSize)
1809
- {
1810
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1811
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1812
- workSpace, sizeof(workSpace));
1813
- }
1814
-
1815
- size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1816
- {
1817
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
1818
- return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1819
- }
1820
- #endif
1821
-
1822
- typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
1823
-
1824
- size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1825
- {
1826
- #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1827
- static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 };
1828
- #endif
1829
-
1830
- /* validation checks */
1831
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1832
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1833
- if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1834
- if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1835
-
1836
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1837
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1838
- (void)algoNb;
1839
- assert(algoNb == 0);
1840
- return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize);
1841
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1842
- (void)algoNb;
1843
- assert(algoNb == 1);
1844
- return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);
1845
- #else
1846
- return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
1847
- #endif
1848
- }
1849
- }
1850
-
1851
- size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1852
- {
1853
- /* validation checks */
1854
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1855
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1856
- if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1857
- if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1858
-
1859
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1860
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1861
- (void)algoNb;
1862
- assert(algoNb == 0);
1863
- return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1864
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1865
- (void)algoNb;
1866
- assert(algoNb == 1);
1867
- return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1868
- #else
1869
- return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
1870
- HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
1871
- #endif
1872
- }
1873
- }
1874
-
1875
- size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1876
- {
1877
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1878
- return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1879
- workSpace, sizeof(workSpace));
1880
- }
1881
-
1882
- size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
1883
- const void* cSrc, size_t cSrcSize)
1884
- {
1885
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1886
- return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1887
- workSpace, sizeof(workSpace));
1888
- }
1889
- #endif