extzstd 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. checksums.yaml +4 -4
  2. data/README.md +3 -3
  3. data/contrib/zstd/CHANGELOG +188 -1
  4. data/contrib/zstd/CONTRIBUTING.md +157 -74
  5. data/contrib/zstd/LICENSE +4 -4
  6. data/contrib/zstd/Makefile +81 -58
  7. data/contrib/zstd/Package.swift +36 -0
  8. data/contrib/zstd/README.md +59 -35
  9. data/contrib/zstd/TESTING.md +2 -3
  10. data/contrib/zstd/appveyor.yml +49 -136
  11. data/contrib/zstd/lib/BUCK +5 -7
  12. data/contrib/zstd/lib/Makefile +87 -181
  13. data/contrib/zstd/lib/README.md +23 -6
  14. data/contrib/zstd/lib/common/allocations.h +55 -0
  15. data/contrib/zstd/lib/common/bits.h +200 -0
  16. data/contrib/zstd/lib/common/bitstream.h +33 -59
  17. data/contrib/zstd/lib/common/compiler.h +115 -45
  18. data/contrib/zstd/lib/common/cpu.h +1 -1
  19. data/contrib/zstd/lib/common/debug.c +1 -1
  20. data/contrib/zstd/lib/common/debug.h +1 -1
  21. data/contrib/zstd/lib/common/entropy_common.c +15 -37
  22. data/contrib/zstd/lib/common/error_private.c +9 -2
  23. data/contrib/zstd/lib/common/error_private.h +82 -3
  24. data/contrib/zstd/lib/common/fse.h +9 -85
  25. data/contrib/zstd/lib/common/fse_decompress.c +29 -111
  26. data/contrib/zstd/lib/common/huf.h +84 -172
  27. data/contrib/zstd/lib/common/mem.h +58 -49
  28. data/contrib/zstd/lib/common/pool.c +37 -16
  29. data/contrib/zstd/lib/common/pool.h +9 -3
  30. data/contrib/zstd/lib/common/portability_macros.h +156 -0
  31. data/contrib/zstd/lib/common/threading.c +68 -14
  32. data/contrib/zstd/lib/common/threading.h +5 -10
  33. data/contrib/zstd/lib/common/xxhash.c +7 -809
  34. data/contrib/zstd/lib/common/xxhash.h +5568 -167
  35. data/contrib/zstd/lib/common/zstd_common.c +1 -36
  36. data/contrib/zstd/lib/common/zstd_deps.h +1 -1
  37. data/contrib/zstd/lib/common/zstd_internal.h +64 -150
  38. data/contrib/zstd/lib/common/zstd_trace.h +163 -0
  39. data/contrib/zstd/lib/compress/clevels.h +134 -0
  40. data/contrib/zstd/lib/compress/fse_compress.c +69 -150
  41. data/contrib/zstd/lib/compress/hist.c +1 -1
  42. data/contrib/zstd/lib/compress/hist.h +1 -1
  43. data/contrib/zstd/lib/compress/huf_compress.c +773 -251
  44. data/contrib/zstd/lib/compress/zstd_compress.c +2650 -826
  45. data/contrib/zstd/lib/compress/zstd_compress_internal.h +509 -180
  46. data/contrib/zstd/lib/compress/zstd_compress_literals.c +117 -40
  47. data/contrib/zstd/lib/compress/zstd_compress_literals.h +16 -6
  48. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +28 -19
  49. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +1 -1
  50. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +33 -305
  51. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +1 -1
  52. data/contrib/zstd/lib/compress/zstd_cwksp.h +266 -85
  53. data/contrib/zstd/lib/compress/zstd_double_fast.c +369 -132
  54. data/contrib/zstd/lib/compress/zstd_double_fast.h +3 -2
  55. data/contrib/zstd/lib/compress/zstd_fast.c +722 -258
  56. data/contrib/zstd/lib/compress/zstd_fast.h +3 -2
  57. data/contrib/zstd/lib/compress/zstd_lazy.c +1105 -360
  58. data/contrib/zstd/lib/compress/zstd_lazy.h +41 -1
  59. data/contrib/zstd/lib/compress/zstd_ldm.c +272 -208
  60. data/contrib/zstd/lib/compress/zstd_ldm.h +3 -2
  61. data/contrib/zstd/lib/compress/zstd_ldm_geartab.h +106 -0
  62. data/contrib/zstd/lib/compress/zstd_opt.c +324 -197
  63. data/contrib/zstd/lib/compress/zstd_opt.h +1 -1
  64. data/contrib/zstd/lib/compress/zstdmt_compress.c +109 -53
  65. data/contrib/zstd/lib/compress/zstdmt_compress.h +9 -6
  66. data/contrib/zstd/lib/decompress/huf_decompress.c +1071 -539
  67. data/contrib/zstd/lib/decompress/huf_decompress_amd64.S +576 -0
  68. data/contrib/zstd/lib/decompress/zstd_ddict.c +4 -4
  69. data/contrib/zstd/lib/decompress/zstd_ddict.h +1 -1
  70. data/contrib/zstd/lib/decompress/zstd_decompress.c +507 -82
  71. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +962 -310
  72. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +14 -3
  73. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +54 -6
  74. data/contrib/zstd/lib/deprecated/zbuff.h +1 -1
  75. data/contrib/zstd/lib/deprecated/zbuff_common.c +1 -1
  76. data/contrib/zstd/lib/deprecated/zbuff_compress.c +24 -4
  77. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +3 -1
  78. data/contrib/zstd/lib/dictBuilder/cover.c +44 -32
  79. data/contrib/zstd/lib/dictBuilder/cover.h +6 -5
  80. data/contrib/zstd/lib/dictBuilder/divsufsort.c +1 -1
  81. data/contrib/zstd/lib/dictBuilder/fastcover.c +24 -16
  82. data/contrib/zstd/lib/dictBuilder/zdict.c +88 -95
  83. data/contrib/zstd/lib/legacy/zstd_legacy.h +8 -1
  84. data/contrib/zstd/lib/legacy/zstd_v01.c +16 -53
  85. data/contrib/zstd/lib/legacy/zstd_v01.h +1 -1
  86. data/contrib/zstd/lib/legacy/zstd_v02.c +24 -69
  87. data/contrib/zstd/lib/legacy/zstd_v02.h +1 -1
  88. data/contrib/zstd/lib/legacy/zstd_v03.c +25 -72
  89. data/contrib/zstd/lib/legacy/zstd_v03.h +1 -1
  90. data/contrib/zstd/lib/legacy/zstd_v04.c +23 -69
  91. data/contrib/zstd/lib/legacy/zstd_v04.h +1 -1
  92. data/contrib/zstd/lib/legacy/zstd_v05.c +35 -85
  93. data/contrib/zstd/lib/legacy/zstd_v05.h +1 -1
  94. data/contrib/zstd/lib/legacy/zstd_v06.c +42 -87
  95. data/contrib/zstd/lib/legacy/zstd_v06.h +1 -1
  96. data/contrib/zstd/lib/legacy/zstd_v07.c +35 -82
  97. data/contrib/zstd/lib/legacy/zstd_v07.h +1 -1
  98. data/contrib/zstd/lib/libzstd.mk +214 -0
  99. data/contrib/zstd/lib/libzstd.pc.in +4 -3
  100. data/contrib/zstd/lib/module.modulemap +35 -0
  101. data/contrib/zstd/lib/{dictBuilder/zdict.h → zdict.h} +202 -33
  102. data/contrib/zstd/lib/zstd.h +922 -293
  103. data/contrib/zstd/lib/{common/zstd_errors.h → zstd_errors.h} +27 -8
  104. data/ext/extconf.rb +7 -6
  105. data/ext/extzstd.c +13 -10
  106. data/ext/libzstd_conf.h +0 -1
  107. data/ext/zstd_decompress_asm.S +1 -0
  108. metadata +16 -5
@@ -1,7 +1,7 @@
1
1
  /* ******************************************************************
2
2
  * huff0 huffman decoder,
3
3
  * part of Finite State Entropy library
4
- * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
4
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
5
5
  *
6
6
  * You can contact the author at :
7
7
  * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -19,9 +19,16 @@
19
19
  #include "../common/compiler.h"
20
20
  #include "../common/bitstream.h" /* BIT_* */
21
21
  #include "../common/fse.h" /* to compress headers */
22
- #define HUF_STATIC_LINKING_ONLY
23
22
  #include "../common/huf.h"
24
23
  #include "../common/error_private.h"
24
+ #include "../common/zstd_internal.h"
25
+ #include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
26
+
27
+ /* **************************************************************
28
+ * Constants
29
+ ****************************************************************/
30
+
31
+ #define HUF_DECODER_FAST_TABLELOG 11
25
32
 
26
33
  /* **************************************************************
27
34
  * Macros
@@ -36,6 +43,28 @@
36
43
  #error "Cannot force the use of the X1 and X2 decoders at the same time!"
37
44
  #endif
38
45
 
46
+ /* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
47
+ * supported at runtime, so we can add the BMI2 target attribute.
48
+ * When it is disabled, we will still get BMI2 if it is enabled statically.
49
+ */
50
+ #if DYNAMIC_BMI2
51
+ # define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
52
+ #else
53
+ # define HUF_FAST_BMI2_ATTRS
54
+ #endif
55
+
56
+ #ifdef __cplusplus
57
+ # define HUF_EXTERN_C extern "C"
58
+ #else
59
+ # define HUF_EXTERN_C
60
+ #endif
61
+ #define HUF_ASM_DECL HUF_EXTERN_C
62
+
63
+ #if DYNAMIC_BMI2
64
+ # define HUF_NEED_BMI2_FUNCTION 1
65
+ #else
66
+ # define HUF_NEED_BMI2_FUNCTION 0
67
+ #endif
39
68
 
40
69
  /* **************************************************************
41
70
  * Error Management
@@ -53,6 +82,11 @@
53
82
  /* **************************************************************
54
83
  * BMI2 Variant Wrappers
55
84
  ****************************************************************/
85
+ typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
86
+ const void *cSrc,
87
+ size_t cSrcSize,
88
+ const HUF_DTable *DTable);
89
+
56
90
  #if DYNAMIC_BMI2
57
91
 
58
92
  #define HUF_DGEN(fn) \
@@ -65,7 +99,7 @@
65
99
  return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
66
100
  } \
67
101
  \
68
- static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \
102
+ static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \
69
103
  void* dst, size_t dstSize, \
70
104
  const void* cSrc, size_t cSrcSize, \
71
105
  const HUF_DTable* DTable) \
@@ -74,9 +108,9 @@
74
108
  } \
75
109
  \
76
110
  static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
77
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
111
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
78
112
  { \
79
- if (bmi2) { \
113
+ if (flags & HUF_flags_bmi2) { \
80
114
  return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
81
115
  } \
82
116
  return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
@@ -86,9 +120,9 @@
86
120
 
87
121
  #define HUF_DGEN(fn) \
88
122
  static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
89
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
123
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
90
124
  { \
91
- (void)bmi2; \
125
+ (void)flags; \
92
126
  return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
93
127
  }
94
128
 
@@ -107,13 +141,164 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
107
141
  return dtd;
108
142
  }
109
143
 
144
+ static size_t HUF_initFastDStream(BYTE const* ip) {
145
+ BYTE const lastByte = ip[7];
146
+ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
147
+ size_t const value = MEM_readLEST(ip) | 1;
148
+ assert(bitsConsumed <= 8);
149
+ assert(sizeof(size_t) == 8);
150
+ return value << bitsConsumed;
151
+ }
152
+
153
+
154
+ /**
155
+ * The input/output arguments to the Huffman fast decoding loop:
156
+ *
157
+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
158
+ * op [in/out] - The output pointers, must be updated to reflect what is written.
159
+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
160
+ * dt [in] - The decoding table.
161
+ * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
162
+ * oend [in] - The end of the output stream. op[3] must not cross oend.
163
+ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
164
+ * as long as it is above ilimit, but that indicates corruption.
165
+ */
166
+ typedef struct {
167
+ BYTE const* ip[4];
168
+ BYTE* op[4];
169
+ U64 bits[4];
170
+ void const* dt;
171
+ BYTE const* ilimit;
172
+ BYTE* oend;
173
+ BYTE const* iend[4];
174
+ } HUF_DecompressFastArgs;
175
+
176
+ typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
177
+
178
+ /**
179
+ * Initializes args for the fast decoding loop.
180
+ * @returns 1 on success
181
+ * 0 if the fallback implementation should be used.
182
+ * Or an error code on failure.
183
+ */
184
+ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
185
+ {
186
+ void const* dt = DTable + 1;
187
+ U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
188
+
189
+ const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
190
+
191
+ BYTE* const oend = (BYTE*)dst + dstSize;
192
+
193
+ /* The fast decoding loop assumes 64-bit little-endian.
194
+ * This condition is false on x32.
195
+ */
196
+ if (!MEM_isLittleEndian() || MEM_32bits())
197
+ return 0;
198
+
199
+ /* strict minimum : jump table + 1 byte per stream */
200
+ if (srcSize < 10)
201
+ return ERROR(corruption_detected);
202
+
203
+ /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers.
204
+ * If table log is not correct at this point, fallback to the old decoder.
205
+ * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
206
+ */
207
+ if (dtLog != HUF_DECODER_FAST_TABLELOG)
208
+ return 0;
209
+
210
+ /* Read the jump table. */
211
+ {
212
+ const BYTE* const istart = (const BYTE*)src;
213
+ size_t const length1 = MEM_readLE16(istart);
214
+ size_t const length2 = MEM_readLE16(istart+2);
215
+ size_t const length3 = MEM_readLE16(istart+4);
216
+ size_t const length4 = srcSize - (length1 + length2 + length3 + 6);
217
+ args->iend[0] = istart + 6; /* jumpTable */
218
+ args->iend[1] = args->iend[0] + length1;
219
+ args->iend[2] = args->iend[1] + length2;
220
+ args->iend[3] = args->iend[2] + length3;
221
+
222
+ /* HUF_initFastDStream() requires this, and this small of an input
223
+ * won't benefit from the ASM loop anyways.
224
+ * length1 must be >= 16 so that ip[0] >= ilimit before the loop
225
+ * starts.
226
+ */
227
+ if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
228
+ return 0;
229
+ if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
230
+ }
231
+ /* ip[] contains the position that is currently loaded into bits[]. */
232
+ args->ip[0] = args->iend[1] - sizeof(U64);
233
+ args->ip[1] = args->iend[2] - sizeof(U64);
234
+ args->ip[2] = args->iend[3] - sizeof(U64);
235
+ args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64);
236
+
237
+ /* op[] contains the output pointers. */
238
+ args->op[0] = (BYTE*)dst;
239
+ args->op[1] = args->op[0] + (dstSize+3)/4;
240
+ args->op[2] = args->op[1] + (dstSize+3)/4;
241
+ args->op[3] = args->op[2] + (dstSize+3)/4;
242
+
243
+ /* No point to call the ASM loop for tiny outputs. */
244
+ if (args->op[3] >= oend)
245
+ return 0;
246
+
247
+ /* bits[] is the bit container.
248
+ * It is read from the MSB down to the LSB.
249
+ * It is shifted left as it is read, and zeros are
250
+ * shifted in. After the lowest valid bit a 1 is
251
+ * set, so that CountTrailingZeros(bits[]) can be used
252
+ * to count how many bits we've consumed.
253
+ */
254
+ args->bits[0] = HUF_initFastDStream(args->ip[0]);
255
+ args->bits[1] = HUF_initFastDStream(args->ip[1]);
256
+ args->bits[2] = HUF_initFastDStream(args->ip[2]);
257
+ args->bits[3] = HUF_initFastDStream(args->ip[3]);
258
+
259
+ /* If ip[] >= ilimit, it is guaranteed to be safe to
260
+ * reload bits[]. It may be beyond its section, but is
261
+ * guaranteed to be valid (>= istart).
262
+ */
263
+ args->ilimit = ilimit;
264
+
265
+ args->oend = oend;
266
+ args->dt = dt;
267
+
268
+ return 1;
269
+ }
270
+
271
+ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
272
+ {
273
+ /* Validate that we haven't overwritten. */
274
+ if (args->op[stream] > segmentEnd)
275
+ return ERROR(corruption_detected);
276
+ /* Validate that we haven't read beyond iend[].
277
+ * Note that ip[] may be < iend[] because the MSB is
278
+ * the next bit to read, and we may have consumed 100%
279
+ * of the stream, so down to iend[i] - 8 is valid.
280
+ */
281
+ if (args->ip[stream] < args->iend[stream] - 8)
282
+ return ERROR(corruption_detected);
283
+
284
+ /* Construct the BIT_DStream_t. */
285
+ assert(sizeof(size_t) == 8);
286
+ bit->bitContainer = MEM_readLEST(args->ip[stream]);
287
+ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
288
+ bit->start = (const char*)args->iend[0];
289
+ bit->limitPtr = bit->start + sizeof(size_t);
290
+ bit->ptr = (const char*)args->ip[stream];
291
+
292
+ return 0;
293
+ }
294
+
110
295
 
111
296
  #ifndef HUF_FORCE_DECOMPRESS_X2
112
297
 
113
298
  /*-***************************/
114
299
  /* single-symbol decoding */
115
300
  /*-***************************/
116
- typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */
301
+ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */
117
302
 
118
303
  /**
119
304
  * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at
@@ -122,14 +307,45 @@ typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decodi
122
307
  static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
123
308
  U64 D4;
124
309
  if (MEM_isLittleEndian()) {
125
- D4 = symbol + (nbBits << 8);
310
+ D4 = (U64)((symbol << 8) + nbBits);
126
311
  } else {
127
- D4 = (symbol << 8) + nbBits;
312
+ D4 = (U64)(symbol + (nbBits << 8));
128
313
  }
314
+ assert(D4 < (1U << 16));
129
315
  D4 *= 0x0001000100010001ULL;
130
316
  return D4;
131
317
  }
132
318
 
319
+ /**
320
+ * Increase the tableLog to targetTableLog and rescales the stats.
321
+ * If tableLog > targetTableLog this is a no-op.
322
+ * @returns New tableLog
323
+ */
324
+ static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog)
325
+ {
326
+ if (tableLog > targetTableLog)
327
+ return tableLog;
328
+ if (tableLog < targetTableLog) {
329
+ U32 const scale = targetTableLog - tableLog;
330
+ U32 s;
331
+ /* Increase the weight for all non-zero probability symbols by scale. */
332
+ for (s = 0; s < nbSymbols; ++s) {
333
+ huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale);
334
+ }
335
+ /* Update rankVal to reflect the new weights.
336
+ * All weights except 0 get moved to weight + scale.
337
+ * Weights [1, scale] are empty.
338
+ */
339
+ for (s = targetTableLog; s > scale; --s) {
340
+ rankVal[s] = rankVal[s - scale];
341
+ }
342
+ for (s = scale; s > 0; --s) {
343
+ rankVal[s] = 0;
344
+ }
345
+ }
346
+ return targetTableLog;
347
+ }
348
+
133
349
  typedef struct {
134
350
  U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];
135
351
  U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1];
@@ -138,13 +354,7 @@ typedef struct {
138
354
  BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
139
355
  } HUF_ReadDTableX1_Workspace;
140
356
 
141
-
142
- size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
143
- {
144
- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
145
- }
146
-
147
- size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2)
357
+ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
148
358
  {
149
359
  U32 tableLog = 0;
150
360
  U32 nbSymbols = 0;
@@ -159,11 +369,15 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
159
369
  DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
160
370
  /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
161
371
 
162
- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
372
+ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
163
373
  if (HUF_isError(iSize)) return iSize;
164
374
 
375
+
165
376
  /* Table header */
166
377
  { DTableDesc dtd = HUF_getDTableDesc(DTable);
378
+ U32 const maxTableLog = dtd.maxTableLog + 1;
379
+ U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG);
380
+ tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog);
167
381
  if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */
168
382
  dtd.tableType = 0;
169
383
  dtd.tableLog = (BYTE)tableLog;
@@ -182,9 +396,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
182
396
  * rankStart[0] is not filled because there are no entries in the table for
183
397
  * weight 0.
184
398
  */
185
- {
186
- int n;
187
- int nextRankStart = 0;
399
+ { int n;
400
+ U32 nextRankStart = 0;
188
401
  int const unroll = 4;
189
402
  int const nLimit = (int)nbSymbols - unroll + 1;
190
403
  for (n=0; n<(int)tableLog+1; n++) {
@@ -207,14 +420,13 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
207
420
 
208
421
  /* fill DTable
209
422
  * We fill all entries of each weight in order.
210
- * That way length is a constant for each iteration of the outter loop.
423
+ * That way length is a constant for each iteration of the outer loop.
211
424
  * We can switch based on the length to a different inner loop which is
212
425
  * optimized for that particular case.
213
426
  */
214
- {
215
- U32 w;
216
- int symbol=wksp->rankVal[0];
217
- int rankStart=0;
427
+ { U32 w;
428
+ int symbol = wksp->rankVal[0];
429
+ int rankStart = 0;
218
430
  for (w=1; w<tableLog+1; ++w) {
219
431
  int const symbolCount = wksp->rankVal[w];
220
432
  int const length = (1 << w) >> 1;
@@ -304,11 +516,15 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
304
516
  BYTE* const pStart = p;
305
517
 
306
518
  /* up to 4 symbols at a time */
307
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
308
- HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
309
- HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
310
- HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
311
- HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
519
+ if ((pEnd - p) > 3) {
520
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
521
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
522
+ HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
523
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
524
+ HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
525
+ }
526
+ } else {
527
+ BIT_reloadDStream(bitDPtr);
312
528
  }
313
529
 
314
530
  /* [0-3] symbols remaining */
@@ -320,7 +536,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
320
536
  while (p < pEnd)
321
537
  HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
322
538
 
323
- return pEnd-pStart;
539
+ return (size_t)(pEnd-pStart);
324
540
  }
325
541
 
326
542
  FORCE_INLINE_TEMPLATE size_t
@@ -346,6 +562,10 @@ HUF_decompress1X1_usingDTable_internal_body(
346
562
  return dstSize;
347
563
  }
348
564
 
565
+ /* HUF_decompress4X1_usingDTable_internal_body():
566
+ * Conditions :
567
+ * @dstSize >= 6
568
+ */
349
569
  FORCE_INLINE_TEMPLATE size_t
350
570
  HUF_decompress4X1_usingDTable_internal_body(
351
571
  void* dst, size_t dstSize,
@@ -388,33 +608,37 @@ HUF_decompress4X1_usingDTable_internal_body(
388
608
  U32 endSignal = 1;
389
609
 
390
610
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
611
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
612
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
391
613
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
392
614
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
393
615
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
394
616
  CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
395
617
 
396
618
  /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
397
- for ( ; (endSignal) & (op4 < olimit) ; ) {
398
- HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
399
- HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
400
- HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
401
- HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
402
- HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
403
- HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
404
- HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
405
- HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
406
- HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
407
- HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
408
- HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
409
- HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
410
- HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
411
- HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
412
- HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
413
- HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
414
- endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
415
- endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
416
- endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
417
- endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
619
+ if ((size_t)(oend - op4) >= sizeof(size_t)) {
620
+ for ( ; (endSignal) & (op4 < olimit) ; ) {
621
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
622
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
623
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
624
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
625
+ HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
626
+ HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
627
+ HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
628
+ HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
629
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
630
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
631
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
632
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
633
+ HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
634
+ HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
635
+ HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
636
+ HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
637
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
638
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
639
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
640
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
641
+ }
418
642
  }
419
643
 
420
644
  /* check corruption */
@@ -440,74 +664,232 @@ HUF_decompress4X1_usingDTable_internal_body(
440
664
  }
441
665
  }
442
666
 
667
+ #if HUF_NEED_BMI2_FUNCTION
668
+ static BMI2_TARGET_ATTRIBUTE
669
+ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
670
+ size_t cSrcSize, HUF_DTable const* DTable) {
671
+ return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
672
+ }
673
+ #endif
443
674
 
444
- typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
445
- const void *cSrc,
446
- size_t cSrcSize,
447
- const HUF_DTable *DTable);
675
+ static
676
+ size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
677
+ size_t cSrcSize, HUF_DTable const* DTable) {
678
+ return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
679
+ }
448
680
 
449
- HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
450
- HUF_DGEN(HUF_decompress4X1_usingDTable_internal)
681
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
451
682
 
683
+ HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
452
684
 
685
+ #endif
453
686
 
454
- size_t HUF_decompress1X1_usingDTable(
455
- void* dst, size_t dstSize,
456
- const void* cSrc, size_t cSrcSize,
457
- const HUF_DTable* DTable)
687
+ static HUF_FAST_BMI2_ATTRS
688
+ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
458
689
  {
459
- DTableDesc dtd = HUF_getDTableDesc(DTable);
460
- if (dtd.tableType != 0) return ERROR(GENERIC);
461
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
690
+ U64 bits[4];
691
+ BYTE const* ip[4];
692
+ BYTE* op[4];
693
+ U16 const* const dtable = (U16 const*)args->dt;
694
+ BYTE* const oend = args->oend;
695
+ BYTE const* const ilimit = args->ilimit;
696
+
697
+ /* Copy the arguments to local variables */
698
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
699
+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
700
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
701
+
702
+ assert(MEM_isLittleEndian());
703
+ assert(!MEM_32bits());
704
+
705
+ for (;;) {
706
+ BYTE* olimit;
707
+ int stream;
708
+ int symbol;
709
+
710
+ /* Assert loop preconditions */
711
+ #ifndef NDEBUG
712
+ for (stream = 0; stream < 4; ++stream) {
713
+ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
714
+ assert(ip[stream] >= ilimit);
715
+ }
716
+ #endif
717
+ /* Compute olimit */
718
+ {
719
+ /* Each iteration produces 5 output symbols per stream */
720
+ size_t const oiters = (size_t)(oend - op[3]) / 5;
721
+ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
722
+ * per stream.
723
+ */
724
+ size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
725
+ /* We can safely run iters iterations before running bounds checks */
726
+ size_t const iters = MIN(oiters, iiters);
727
+ size_t const symbols = iters * 5;
728
+
729
+ /* We can simply check that op[3] < olimit, instead of checking all
730
+ * of our bounds, since we can't hit the other bounds until we've run
731
+ * iters iterations, which only happens when op[3] == olimit.
732
+ */
733
+ olimit = op[3] + symbols;
734
+
735
+ /* Exit fast decoding loop once we get close to the end. */
736
+ if (op[3] + 20 > olimit)
737
+ break;
738
+
739
+ /* Exit the decoding loop if any input pointer has crossed the
740
+ * previous one. This indicates corruption, and a precondition
741
+ * to our loop is that ip[i] >= ip[0].
742
+ */
743
+ for (stream = 1; stream < 4; ++stream) {
744
+ if (ip[stream] < ip[stream - 1])
745
+ goto _out;
746
+ }
747
+ }
748
+
749
+ #ifndef NDEBUG
750
+ for (stream = 1; stream < 4; ++stream) {
751
+ assert(ip[stream] >= ip[stream - 1]);
752
+ }
753
+ #endif
754
+
755
+ do {
756
+ /* Decode 5 symbols in each of the 4 streams */
757
+ for (symbol = 0; symbol < 5; ++symbol) {
758
+ for (stream = 0; stream < 4; ++stream) {
759
+ int const index = (int)(bits[stream] >> 53);
760
+ int const entry = (int)dtable[index];
761
+ bits[stream] <<= (entry & 63);
762
+ op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
763
+ }
764
+ }
765
+ /* Reload the bitstreams */
766
+ for (stream = 0; stream < 4; ++stream) {
767
+ int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
768
+ int const nbBits = ctz & 7;
769
+ int const nbBytes = ctz >> 3;
770
+ op[stream] += 5;
771
+ ip[stream] -= nbBytes;
772
+ bits[stream] = MEM_read64(ip[stream]) | 1;
773
+ bits[stream] <<= nbBits;
774
+ }
775
+ } while (op[3] < olimit);
776
+ }
777
+
778
+ _out:
779
+
780
+ /* Save the final values of each of the state variables back to args. */
781
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
782
+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
783
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
462
784
  }
463
785
 
464
- size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
465
- const void* cSrc, size_t cSrcSize,
466
- void* workSpace, size_t wkspSize)
786
+ /**
787
+ * @returns @p dstSize on success (>= 6)
788
+ * 0 if the fallback implementation should be used
789
+ * An error if an error occurred
790
+ */
791
+ static HUF_FAST_BMI2_ATTRS
792
+ size_t
793
+ HUF_decompress4X1_usingDTable_internal_fast(
794
+ void* dst, size_t dstSize,
795
+ const void* cSrc, size_t cSrcSize,
796
+ const HUF_DTable* DTable,
797
+ HUF_DecompressFastLoopFn loopFn)
467
798
  {
468
- const BYTE* ip = (const BYTE*) cSrc;
799
+ void const* dt = DTable + 1;
800
+ const BYTE* const iend = (const BYTE*)cSrc + 6;
801
+ BYTE* const oend = (BYTE*)dst + dstSize;
802
+ HUF_DecompressFastArgs args;
803
+ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
804
+ FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
805
+ if (ret == 0)
806
+ return 0;
807
+ }
469
808
 
470
- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
471
- if (HUF_isError(hSize)) return hSize;
472
- if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
473
- ip += hSize; cSrcSize -= hSize;
809
+ assert(args.ip[0] >= args.ilimit);
810
+ loopFn(&args);
811
+
812
+ /* Our loop guarantees that ip[] >= ilimit and that we haven't
813
+ * overwritten any op[].
814
+ */
815
+ assert(args.ip[0] >= iend);
816
+ assert(args.ip[1] >= iend);
817
+ assert(args.ip[2] >= iend);
818
+ assert(args.ip[3] >= iend);
819
+ assert(args.op[3] <= oend);
820
+ (void)iend;
821
+
822
+ /* finish bit streams one by one. */
823
+ { size_t const segmentSize = (dstSize+3) / 4;
824
+ BYTE* segmentEnd = (BYTE*)dst;
825
+ int i;
826
+ for (i = 0; i < 4; ++i) {
827
+ BIT_DStream_t bit;
828
+ if (segmentSize <= (size_t)(oend - segmentEnd))
829
+ segmentEnd += segmentSize;
830
+ else
831
+ segmentEnd = oend;
832
+ FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
833
+ /* Decompress and validate that we've produced exactly the expected length. */
834
+ args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG);
835
+ if (args.op[i] != segmentEnd) return ERROR(corruption_detected);
836
+ }
837
+ }
474
838
 
475
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
839
+ /* decoded size */
840
+ assert(dstSize != 0);
841
+ return dstSize;
476
842
  }
477
843
 
844
+ HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
478
845
 
479
- size_t HUF_decompress4X1_usingDTable(
480
- void* dst, size_t dstSize,
481
- const void* cSrc, size_t cSrcSize,
482
- const HUF_DTable* DTable)
846
+ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
847
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
483
848
  {
484
- DTableDesc dtd = HUF_getDTableDesc(DTable);
485
- if (dtd.tableType != 0) return ERROR(GENERIC);
486
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
849
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
850
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
851
+
852
+ #if DYNAMIC_BMI2
853
+ if (flags & HUF_flags_bmi2) {
854
+ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
855
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
856
+ if (!(flags & HUF_flags_disableAsm)) {
857
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
858
+ }
859
+ # endif
860
+ } else {
861
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
862
+ }
863
+ #endif
864
+
865
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
866
+ if (!(flags & HUF_flags_disableAsm)) {
867
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
868
+ }
869
+ #endif
870
+
871
+ if (!(flags & HUF_flags_disableFast)) {
872
+ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
873
+ if (ret != 0)
874
+ return ret;
875
+ }
876
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
487
877
  }
488
878
 
489
- static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
879
+ static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
490
880
  const void* cSrc, size_t cSrcSize,
491
- void* workSpace, size_t wkspSize, int bmi2)
881
+ void* workSpace, size_t wkspSize, int flags)
492
882
  {
493
883
  const BYTE* ip = (const BYTE*) cSrc;
494
884
 
495
- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
885
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
496
886
  if (HUF_isError(hSize)) return hSize;
497
887
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
498
888
  ip += hSize; cSrcSize -= hSize;
499
889
 
500
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
890
+ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
501
891
  }
502
892
 
503
- size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
504
- const void* cSrc, size_t cSrcSize,
505
- void* workSpace, size_t wkspSize)
506
- {
507
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
508
- }
509
-
510
-
511
893
  #endif /* HUF_FORCE_DECOMPRESS_X2 */
512
894
 
513
895
 
@@ -518,188 +900,308 @@ size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
518
900
  /* *************************/
519
901
 
520
902
  typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */
521
- typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
903
+ typedef struct { BYTE symbol; } sortedSymbol_t;
522
904
  typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
523
905
  typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
524
906
 
907
+ /**
908
+ * Constructs a HUF_DEltX2 in a U32.
909
+ */
910
+ static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level)
911
+ {
912
+ U32 seq;
913
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0);
914
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2);
915
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3);
916
+ DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32));
917
+ if (MEM_isLittleEndian()) {
918
+ seq = level == 1 ? symbol : (baseSeq + (symbol << 8));
919
+ return seq + (nbBits << 16) + ((U32)level << 24);
920
+ } else {
921
+ seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol);
922
+ return (seq << 16) + (nbBits << 8) + (U32)level;
923
+ }
924
+ }
525
925
 
526
- /* HUF_fillDTableX2Level2() :
527
- * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
528
- static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed,
529
- const U32* rankValOrigin, const int minWeight,
530
- const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
531
- U32 nbBitsBaseline, U16 baseSeq)
926
+ /**
927
+ * Constructs a HUF_DEltX2.
928
+ */
929
+ static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level)
532
930
  {
533
931
  HUF_DEltX2 DElt;
534
- U32 rankVal[HUF_TABLELOG_MAX + 1];
932
+ U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
933
+ DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val));
934
+ ZSTD_memcpy(&DElt, &val, sizeof(val));
935
+ return DElt;
936
+ }
535
937
 
536
- /* get pre-calculated rankVal */
537
- ZSTD_memcpy(rankVal, rankValOrigin, sizeof(rankVal));
938
+ /**
939
+ * Constructs 2 HUF_DEltX2s and packs them into a U64.
940
+ */
941
+ static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level)
942
+ {
943
+ U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
944
+ return (U64)DElt + ((U64)DElt << 32);
945
+ }
946
+
947
+ /**
948
+ * Fills the DTable rank with all the symbols from [begin, end) that are each
949
+ * nbBits long.
950
+ *
951
+ * @param DTableRank The start of the rank in the DTable.
952
+ * @param begin The first symbol to fill (inclusive).
953
+ * @param end The last symbol to fill (exclusive).
954
+ * @param nbBits Each symbol is nbBits long.
955
+ * @param tableLog The table log.
956
+ * @param baseSeq If level == 1 { 0 } else { the first level symbol }
957
+ * @param level The level in the table. Must be 1 or 2.
958
+ */
959
+ static void HUF_fillDTableX2ForWeight(
960
+ HUF_DEltX2* DTableRank,
961
+ sortedSymbol_t const* begin, sortedSymbol_t const* end,
962
+ U32 nbBits, U32 tableLog,
963
+ U16 baseSeq, int const level)
964
+ {
965
+ U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */);
966
+ const sortedSymbol_t* ptr;
967
+ assert(level >= 1 && level <= 2);
968
+ switch (length) {
969
+ case 1:
970
+ for (ptr = begin; ptr != end; ++ptr) {
971
+ HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
972
+ *DTableRank++ = DElt;
973
+ }
974
+ break;
975
+ case 2:
976
+ for (ptr = begin; ptr != end; ++ptr) {
977
+ HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
978
+ DTableRank[0] = DElt;
979
+ DTableRank[1] = DElt;
980
+ DTableRank += 2;
981
+ }
982
+ break;
983
+ case 4:
984
+ for (ptr = begin; ptr != end; ++ptr) {
985
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
986
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
987
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
988
+ DTableRank += 4;
989
+ }
990
+ break;
991
+ case 8:
992
+ for (ptr = begin; ptr != end; ++ptr) {
993
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
994
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
995
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
996
+ ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
997
+ ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
998
+ DTableRank += 8;
999
+ }
1000
+ break;
1001
+ default:
1002
+ for (ptr = begin; ptr != end; ++ptr) {
1003
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
1004
+ HUF_DEltX2* const DTableRankEnd = DTableRank + length;
1005
+ for (; DTableRank != DTableRankEnd; DTableRank += 8) {
1006
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
1007
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
1008
+ ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
1009
+ ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
1010
+ }
1011
+ }
1012
+ break;
1013
+ }
1014
+ }
538
1015
 
539
- /* fill skipped values */
1016
+ /* HUF_fillDTableX2Level2() :
1017
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
1018
+ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits,
1019
+ const U32* rankVal, const int minWeight, const int maxWeight1,
1020
+ const sortedSymbol_t* sortedSymbols, U32 const* rankStart,
1021
+ U32 nbBitsBaseline, U16 baseSeq)
1022
+ {
1023
+ /* Fill skipped values (all positions up to rankVal[minWeight]).
1024
+ * These are positions only get a single symbol because the combined weight
1025
+ * is too large.
1026
+ */
540
1027
  if (minWeight>1) {
541
- U32 i, skipSize = rankVal[minWeight];
542
- MEM_writeLE16(&(DElt.sequence), baseSeq);
543
- DElt.nbBits = (BYTE)(consumed);
544
- DElt.length = 1;
545
- for (i = 0; i < skipSize; i++)
546
- DTable[i] = DElt;
1028
+ U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */);
1029
+ U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1);
1030
+ int const skipSize = rankVal[minWeight];
1031
+ assert(length > 1);
1032
+ assert((U32)skipSize < length);
1033
+ switch (length) {
1034
+ case 2:
1035
+ assert(skipSize == 1);
1036
+ ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2));
1037
+ break;
1038
+ case 4:
1039
+ assert(skipSize <= 4);
1040
+ ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2));
1041
+ ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2));
1042
+ break;
1043
+ default:
1044
+ {
1045
+ int i;
1046
+ for (i = 0; i < skipSize; i += 8) {
1047
+ ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2));
1048
+ ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2));
1049
+ ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2));
1050
+ ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2));
1051
+ }
1052
+ }
1053
+ }
547
1054
  }
548
1055
 
549
- /* fill DTable */
550
- { U32 s; for (s=0; s<sortedListSize; s++) { /* note : sortedSymbols already skipped */
551
- const U32 symbol = sortedSymbols[s].symbol;
552
- const U32 weight = sortedSymbols[s].weight;
553
- const U32 nbBits = nbBitsBaseline - weight;
554
- const U32 length = 1 << (sizeLog-nbBits);
555
- const U32 start = rankVal[weight];
556
- U32 i = start;
557
- const U32 end = start + length;
558
-
559
- MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
560
- DElt.nbBits = (BYTE)(nbBits + consumed);
561
- DElt.length = 2;
562
- do { DTable[i++] = DElt; } while (i<end); /* since length >= 1 */
563
-
564
- rankVal[weight] += length;
565
- } }
1056
+ /* Fill each of the second level symbols by weight. */
1057
+ {
1058
+ int w;
1059
+ for (w = minWeight; w < maxWeight1; ++w) {
1060
+ int const begin = rankStart[w];
1061
+ int const end = rankStart[w+1];
1062
+ U32 const nbBits = nbBitsBaseline - w;
1063
+ U32 const totalBits = nbBits + consumedBits;
1064
+ HUF_fillDTableX2ForWeight(
1065
+ DTable + rankVal[w],
1066
+ sortedSymbols + begin, sortedSymbols + end,
1067
+ totalBits, targetLog,
1068
+ baseSeq, /* level */ 2);
1069
+ }
1070
+ }
566
1071
  }
567
1072
 
568
-
569
1073
  static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
570
- const sortedSymbol_t* sortedList, const U32 sortedListSize,
571
- const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
1074
+ const sortedSymbol_t* sortedList,
1075
+ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
572
1076
  const U32 nbBitsBaseline)
573
1077
  {
574
- U32 rankVal[HUF_TABLELOG_MAX + 1];
1078
+ U32* const rankVal = rankValOrigin[0];
575
1079
  const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */
576
1080
  const U32 minBits = nbBitsBaseline - maxWeight;
577
- U32 s;
578
-
579
- ZSTD_memcpy(rankVal, rankValOrigin, sizeof(rankVal));
580
-
581
- /* fill DTable */
582
- for (s=0; s<sortedListSize; s++) {
583
- const U16 symbol = sortedList[s].symbol;
584
- const U32 weight = sortedList[s].weight;
585
- const U32 nbBits = nbBitsBaseline - weight;
586
- const U32 start = rankVal[weight];
587
- const U32 length = 1 << (targetLog-nbBits);
588
-
589
- if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */
590
- U32 sortedRank;
1081
+ int w;
1082
+ int const wEnd = (int)maxWeight + 1;
1083
+
1084
+ /* Fill DTable in order of weight. */
1085
+ for (w = 1; w < wEnd; ++w) {
1086
+ int const begin = (int)rankStart[w];
1087
+ int const end = (int)rankStart[w+1];
1088
+ U32 const nbBits = nbBitsBaseline - w;
1089
+
1090
+ if (targetLog-nbBits >= minBits) {
1091
+ /* Enough room for a second symbol. */
1092
+ int start = rankVal[w];
1093
+ U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */);
591
1094
  int minWeight = nbBits + scaleLog;
1095
+ int s;
592
1096
  if (minWeight < 1) minWeight = 1;
593
- sortedRank = rankStart[minWeight];
594
- HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits,
595
- rankValOrigin[nbBits], minWeight,
596
- sortedList+sortedRank, sortedListSize-sortedRank,
597
- nbBitsBaseline, symbol);
1097
+ /* Fill the DTable for every symbol of weight w.
1098
+ * These symbols get at least 1 second symbol.
1099
+ */
1100
+ for (s = begin; s != end; ++s) {
1101
+ HUF_fillDTableX2Level2(
1102
+ DTable + start, targetLog, nbBits,
1103
+ rankValOrigin[nbBits], minWeight, wEnd,
1104
+ sortedList, rankStart,
1105
+ nbBitsBaseline, sortedList[s].symbol);
1106
+ start += length;
1107
+ }
598
1108
  } else {
599
- HUF_DEltX2 DElt;
600
- MEM_writeLE16(&(DElt.sequence), symbol);
601
- DElt.nbBits = (BYTE)(nbBits);
602
- DElt.length = 1;
603
- { U32 const end = start + length;
604
- U32 u;
605
- for (u = start; u < end; u++) DTable[u] = DElt;
606
- } }
607
- rankVal[weight] += length;
1109
+ /* Only a single symbol. */
1110
+ HUF_fillDTableX2ForWeight(
1111
+ DTable + rankVal[w],
1112
+ sortedList + begin, sortedList + end,
1113
+ nbBits, targetLog,
1114
+ /* baseSeq */ 0, /* level */ 1);
1115
+ }
608
1116
  }
609
1117
  }
610
1118
 
1119
+ typedef struct {
1120
+ rankValCol_t rankVal[HUF_TABLELOG_MAX];
1121
+ U32 rankStats[HUF_TABLELOG_MAX + 1];
1122
+ U32 rankStart0[HUF_TABLELOG_MAX + 3];
1123
+ sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
1124
+ BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
1125
+ U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
1126
+ } HUF_ReadDTableX2_Workspace;
1127
+
611
1128
  size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
612
1129
  const void* src, size_t srcSize,
613
- void* workSpace, size_t wkspSize)
1130
+ void* workSpace, size_t wkspSize, int flags)
614
1131
  {
615
- U32 tableLog, maxW, sizeOfSort, nbSymbols;
1132
+ U32 tableLog, maxW, nbSymbols;
616
1133
  DTableDesc dtd = HUF_getDTableDesc(DTable);
617
- U32 const maxTableLog = dtd.maxTableLog;
1134
+ U32 maxTableLog = dtd.maxTableLog;
618
1135
  size_t iSize;
619
1136
  void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */
620
1137
  HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
621
1138
  U32 *rankStart;
622
1139
 
623
- rankValCol_t* rankVal;
624
- U32* rankStats;
625
- U32* rankStart0;
626
- sortedSymbol_t* sortedSymbol;
627
- BYTE* weightList;
628
- size_t spaceUsed32 = 0;
629
-
630
- rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32);
631
- spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
632
- rankStats = (U32 *)workSpace + spaceUsed32;
633
- spaceUsed32 += HUF_TABLELOG_MAX + 1;
634
- rankStart0 = (U32 *)workSpace + spaceUsed32;
635
- spaceUsed32 += HUF_TABLELOG_MAX + 2;
636
- sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t);
637
- spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
638
- weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
639
- spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
640
-
641
- if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
642
-
643
- rankStart = rankStart0 + 1;
644
- ZSTD_memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
1140
+ HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace;
1141
+
1142
+ if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC);
1143
+
1144
+ rankStart = wksp->rankStart0 + 1;
1145
+ ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats));
1146
+ ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0));
645
1147
 
646
1148
  DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */
647
1149
  if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
648
1150
  /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
649
1151
 
650
- iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
1152
+ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
651
1153
  if (HUF_isError(iSize)) return iSize;
652
1154
 
653
1155
  /* check result */
654
1156
  if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */
1157
+ if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG;
655
1158
 
656
1159
  /* find maxWeight */
657
- for (maxW = tableLog; rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
1160
+ for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
658
1161
 
659
1162
  /* Get start index of each weight */
660
1163
  { U32 w, nextRankStart = 0;
661
1164
  for (w=1; w<maxW+1; w++) {
662
1165
  U32 curr = nextRankStart;
663
- nextRankStart += rankStats[w];
1166
+ nextRankStart += wksp->rankStats[w];
664
1167
  rankStart[w] = curr;
665
1168
  }
666
1169
  rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/
667
- sizeOfSort = nextRankStart;
1170
+ rankStart[maxW+1] = nextRankStart;
668
1171
  }
669
1172
 
670
1173
  /* sort symbols by weight */
671
1174
  { U32 s;
672
1175
  for (s=0; s<nbSymbols; s++) {
673
- U32 const w = weightList[s];
1176
+ U32 const w = wksp->weightList[s];
674
1177
  U32 const r = rankStart[w]++;
675
- sortedSymbol[r].symbol = (BYTE)s;
676
- sortedSymbol[r].weight = (BYTE)w;
1178
+ wksp->sortedSymbol[r].symbol = (BYTE)s;
677
1179
  }
678
1180
  rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */
679
1181
  }
680
1182
 
681
1183
  /* Build rankVal */
682
- { U32* const rankVal0 = rankVal[0];
1184
+ { U32* const rankVal0 = wksp->rankVal[0];
683
1185
  { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */
684
1186
  U32 nextRankVal = 0;
685
1187
  U32 w;
686
1188
  for (w=1; w<maxW+1; w++) {
687
1189
  U32 curr = nextRankVal;
688
- nextRankVal += rankStats[w] << (w+rescale);
1190
+ nextRankVal += wksp->rankStats[w] << (w+rescale);
689
1191
  rankVal0[w] = curr;
690
1192
  } }
691
1193
  { U32 const minBits = tableLog+1 - maxW;
692
1194
  U32 consumed;
693
1195
  for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
694
- U32* const rankValPtr = rankVal[consumed];
1196
+ U32* const rankValPtr = wksp->rankVal[consumed];
695
1197
  U32 w;
696
1198
  for (w = 1; w < maxW+1; w++) {
697
1199
  rankValPtr[w] = rankVal0[w] >> consumed;
698
1200
  } } } }
699
1201
 
700
1202
  HUF_fillDTableX2(dt, maxTableLog,
701
- sortedSymbol, sizeOfSort,
702
- rankStart0, rankVal, maxW,
1203
+ wksp->sortedSymbol,
1204
+ wksp->rankStart0, wksp->rankVal, maxW,
703
1205
  tableLog+1);
704
1206
 
705
1207
  dtd.tableLog = (BYTE)maxTableLog;
@@ -713,7 +1215,7 @@ FORCE_INLINE_TEMPLATE U32
713
1215
  HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
714
1216
  {
715
1217
  size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
716
- ZSTD_memcpy(op, dt+val, 2);
1218
+ ZSTD_memcpy(op, &dt[val].sequence, 2);
717
1219
  BIT_skipBits(DStream, dt[val].nbBits);
718
1220
  return dt[val].length;
719
1221
  }
@@ -722,15 +1224,17 @@ FORCE_INLINE_TEMPLATE U32
722
1224
  HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
723
1225
  {
724
1226
  size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
725
- ZSTD_memcpy(op, dt+val, 1);
726
- if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
727
- else {
1227
+ ZSTD_memcpy(op, &dt[val].sequence, 1);
1228
+ if (dt[val].length==1) {
1229
+ BIT_skipBits(DStream, dt[val].nbBits);
1230
+ } else {
728
1231
  if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
729
1232
  BIT_skipBits(DStream, dt[val].nbBits);
730
1233
  if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
731
1234
  /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
732
1235
  DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
733
- } }
1236
+ }
1237
+ }
734
1238
  return 1;
735
1239
  }
736
1240
 
@@ -752,19 +1256,37 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
752
1256
  BYTE* const pStart = p;
753
1257
 
754
1258
  /* up to 8 symbols at a time */
755
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
756
- HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
757
- HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
758
- HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
759
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1259
+ if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) {
1260
+ if (dtLog <= 11 && MEM_64bits()) {
1261
+ /* up to 10 symbols at a time */
1262
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) {
1263
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1264
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1265
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1266
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1267
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1268
+ }
1269
+ } else {
1270
+ /* up to 8 symbols at a time */
1271
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
1272
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
1273
+ HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
1274
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
1275
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1276
+ }
1277
+ }
1278
+ } else {
1279
+ BIT_reloadDStream(bitDPtr);
760
1280
  }
761
1281
 
762
1282
  /* closer to end : up to 2 symbols at a time */
763
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
764
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1283
+ if ((size_t)(pEnd - p) >= 2) {
1284
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
1285
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
765
1286
 
766
- while (p <= pEnd-2)
767
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
1287
+ while (p <= pEnd-2)
1288
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
1289
+ }
768
1290
 
769
1291
  if (p < pEnd)
770
1292
  p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
@@ -799,6 +1321,10 @@ HUF_decompress1X2_usingDTable_internal_body(
799
1321
  return dstSize;
800
1322
  }
801
1323
 
1324
+ /* HUF_decompress4X2_usingDTable_internal_body():
1325
+ * Conditions:
1326
+ * @dstSize >= 6
1327
+ */
802
1328
  FORCE_INLINE_TEMPLATE size_t
803
1329
  HUF_decompress4X2_usingDTable_internal_body(
804
1330
  void* dst, size_t dstSize,
@@ -839,58 +1365,62 @@ HUF_decompress4X2_usingDTable_internal_body(
839
1365
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
840
1366
  U32 const dtLog = dtd.tableLog;
841
1367
 
842
- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1368
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1369
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
1370
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
843
1371
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
844
1372
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
845
1373
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
846
1374
  CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
847
1375
 
848
1376
  /* 16-32 symbols per loop (4-8 symbols per stream) */
849
- for ( ; (endSignal) & (op4 < olimit); ) {
1377
+ if ((size_t)(oend - op4) >= sizeof(size_t)) {
1378
+ for ( ; (endSignal) & (op4 < olimit); ) {
850
1379
  #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
851
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
852
- HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
853
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
854
- HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
855
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
856
- HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
857
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
858
- HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
859
- endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
860
- endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
861
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
862
- HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
863
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
864
- HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
865
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
866
- HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
867
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
868
- HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
869
- endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
870
- endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
1380
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1381
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
1382
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1383
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
1384
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1385
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
1386
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1387
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
1388
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
1389
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
1390
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1391
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
1392
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1393
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
1394
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1395
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
1396
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1397
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
1398
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
1399
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
871
1400
  #else
872
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
873
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
874
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
875
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
876
- HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
877
- HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
878
- HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
879
- HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
880
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
881
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
882
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
883
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
884
- HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
885
- HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
886
- HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
887
- HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
888
- endSignal = (U32)LIKELY(
889
- (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
890
- & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
891
- & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
892
- & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
1401
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1402
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1403
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1404
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1405
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
1406
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
1407
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
1408
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
1409
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1410
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1411
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1412
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1413
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
1414
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
1415
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
1416
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
1417
+ endSignal = (U32)LIKELY((U32)
1418
+ (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
1419
+ & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
1420
+ & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
1421
+ & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
893
1422
  #endif
1423
+ }
894
1424
  }
895
1425
 
896
1426
  /* check corruption */
@@ -914,68 +1444,281 @@ HUF_decompress4X2_usingDTable_internal_body(
914
1444
  }
915
1445
  }
916
1446
 
917
- HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
918
- HUF_DGEN(HUF_decompress4X2_usingDTable_internal)
1447
+ #if HUF_NEED_BMI2_FUNCTION
1448
+ static BMI2_TARGET_ATTRIBUTE
1449
+ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
1450
+ size_t cSrcSize, HUF_DTable const* DTable) {
1451
+ return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
1452
+ }
1453
+ #endif
919
1454
 
920
- size_t HUF_decompress1X2_usingDTable(
1455
+ static
1456
+ size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
1457
+ size_t cSrcSize, HUF_DTable const* DTable) {
1458
+ return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
1459
+ }
1460
+
1461
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
1462
+
1463
+ HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
1464
+
1465
+ #endif
1466
+
1467
+ static HUF_FAST_BMI2_ATTRS
1468
+ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
1469
+ {
1470
+ U64 bits[4];
1471
+ BYTE const* ip[4];
1472
+ BYTE* op[4];
1473
+ BYTE* oend[4];
1474
+ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
1475
+ BYTE const* const ilimit = args->ilimit;
1476
+
1477
+ /* Copy the arguments to local registers. */
1478
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
1479
+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
1480
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
1481
+
1482
+ oend[0] = op[1];
1483
+ oend[1] = op[2];
1484
+ oend[2] = op[3];
1485
+ oend[3] = args->oend;
1486
+
1487
+ assert(MEM_isLittleEndian());
1488
+ assert(!MEM_32bits());
1489
+
1490
+ for (;;) {
1491
+ BYTE* olimit;
1492
+ int stream;
1493
+ int symbol;
1494
+
1495
+ /* Assert loop preconditions */
1496
+ #ifndef NDEBUG
1497
+ for (stream = 0; stream < 4; ++stream) {
1498
+ assert(op[stream] <= oend[stream]);
1499
+ assert(ip[stream] >= ilimit);
1500
+ }
1501
+ #endif
1502
+ /* Compute olimit */
1503
+ {
1504
+ /* Each loop does 5 table lookups for each of the 4 streams.
1505
+ * Each table lookup consumes up to 11 bits of input, and produces
1506
+ * up to 2 bytes of output.
1507
+ */
1508
+ /* We can consume up to 7 bytes of input per iteration per stream.
1509
+ * We also know that each input pointer is >= ip[0]. So we can run
1510
+ * iters loops before running out of input.
1511
+ */
1512
+ size_t iters = (size_t)(ip[0] - ilimit) / 7;
1513
+ /* Each iteration can produce up to 10 bytes of output per stream.
1514
+ * Each output stream may advance at different rates. So take the
1515
+ * minimum number of safe iterations among all the output streams.
1516
+ */
1517
+ for (stream = 0; stream < 4; ++stream) {
1518
+ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
1519
+ iters = MIN(iters, oiters);
1520
+ }
1521
+
1522
+ /* Each iteration produces at least 5 output symbols. So until
1523
+ * op[3] crosses olimit, we know we haven't executed iters
1524
+ * iterations yet. This saves us maintaining an iters counter,
1525
+ * at the expense of computing the remaining # of iterations
1526
+ * more frequently.
1527
+ */
1528
+ olimit = op[3] + (iters * 5);
1529
+
1530
+ /* Exit the fast decoding loop if we are too close to the end. */
1531
+ if (op[3] + 10 > olimit)
1532
+ break;
1533
+
1534
+ /* Exit the decoding loop if any input pointer has crossed the
1535
+ * previous one. This indicates corruption, and a precondition
1536
+ * to our loop is that ip[i] >= ip[0].
1537
+ */
1538
+ for (stream = 1; stream < 4; ++stream) {
1539
+ if (ip[stream] < ip[stream - 1])
1540
+ goto _out;
1541
+ }
1542
+ }
1543
+
1544
+ #ifndef NDEBUG
1545
+ for (stream = 1; stream < 4; ++stream) {
1546
+ assert(ip[stream] >= ip[stream - 1]);
1547
+ }
1548
+ #endif
1549
+
1550
+ do {
1551
+ /* Do 5 table lookups for each of the first 3 streams */
1552
+ for (symbol = 0; symbol < 5; ++symbol) {
1553
+ for (stream = 0; stream < 3; ++stream) {
1554
+ int const index = (int)(bits[stream] >> 53);
1555
+ HUF_DEltX2 const entry = dtable[index];
1556
+ MEM_write16(op[stream], entry.sequence);
1557
+ bits[stream] <<= (entry.nbBits);
1558
+ op[stream] += (entry.length);
1559
+ }
1560
+ }
1561
+ /* Do 1 table lookup from the final stream */
1562
+ {
1563
+ int const index = (int)(bits[3] >> 53);
1564
+ HUF_DEltX2 const entry = dtable[index];
1565
+ MEM_write16(op[3], entry.sequence);
1566
+ bits[3] <<= (entry.nbBits);
1567
+ op[3] += (entry.length);
1568
+ }
1569
+ /* Do 4 table lookups from the final stream & reload bitstreams */
1570
+ for (stream = 0; stream < 4; ++stream) {
1571
+ /* Do a table lookup from the final stream.
1572
+ * This is interleaved with the reloading to reduce register
1573
+ * pressure. This shouldn't be necessary, but compilers can
1574
+ * struggle with codegen with high register pressure.
1575
+ */
1576
+ {
1577
+ int const index = (int)(bits[3] >> 53);
1578
+ HUF_DEltX2 const entry = dtable[index];
1579
+ MEM_write16(op[3], entry.sequence);
1580
+ bits[3] <<= (entry.nbBits);
1581
+ op[3] += (entry.length);
1582
+ }
1583
+ /* Reload the bitstreams. The final bitstream must be reloaded
1584
+ * after the 5th symbol was decoded.
1585
+ */
1586
+ {
1587
+ int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
1588
+ int const nbBits = ctz & 7;
1589
+ int const nbBytes = ctz >> 3;
1590
+ ip[stream] -= nbBytes;
1591
+ bits[stream] = MEM_read64(ip[stream]) | 1;
1592
+ bits[stream] <<= nbBits;
1593
+ }
1594
+ }
1595
+ } while (op[3] < olimit);
1596
+ }
1597
+
1598
+ _out:
1599
+
1600
+ /* Save the final values of each of the state variables back to args. */
1601
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
1602
+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
1603
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
1604
+ }
1605
+
1606
+
1607
+ static HUF_FAST_BMI2_ATTRS size_t
1608
+ HUF_decompress4X2_usingDTable_internal_fast(
921
1609
  void* dst, size_t dstSize,
922
1610
  const void* cSrc, size_t cSrcSize,
923
- const HUF_DTable* DTable)
1611
+ const HUF_DTable* DTable,
1612
+ HUF_DecompressFastLoopFn loopFn) {
1613
+ void const* dt = DTable + 1;
1614
+ const BYTE* const iend = (const BYTE*)cSrc + 6;
1615
+ BYTE* const oend = (BYTE*)dst + dstSize;
1616
+ HUF_DecompressFastArgs args;
1617
+ {
1618
+ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
1619
+ FORWARD_IF_ERROR(ret, "Failed to init asm args");
1620
+ if (ret == 0)
1621
+ return 0;
1622
+ }
1623
+
1624
+ assert(args.ip[0] >= args.ilimit);
1625
+ loopFn(&args);
1626
+
1627
+ /* note : op4 already verified within main loop */
1628
+ assert(args.ip[0] >= iend);
1629
+ assert(args.ip[1] >= iend);
1630
+ assert(args.ip[2] >= iend);
1631
+ assert(args.ip[3] >= iend);
1632
+ assert(args.op[3] <= oend);
1633
+ (void)iend;
1634
+
1635
+ /* finish bitStreams one by one */
1636
+ {
1637
+ size_t const segmentSize = (dstSize+3) / 4;
1638
+ BYTE* segmentEnd = (BYTE*)dst;
1639
+ int i;
1640
+ for (i = 0; i < 4; ++i) {
1641
+ BIT_DStream_t bit;
1642
+ if (segmentSize <= (size_t)(oend - segmentEnd))
1643
+ segmentEnd += segmentSize;
1644
+ else
1645
+ segmentEnd = oend;
1646
+ FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
1647
+ args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG);
1648
+ if (args.op[i] != segmentEnd)
1649
+ return ERROR(corruption_detected);
1650
+ }
1651
+ }
1652
+
1653
+ /* decoded size */
1654
+ return dstSize;
1655
+ }
1656
+
1657
+ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
1658
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
924
1659
  {
925
- DTableDesc dtd = HUF_getDTableDesc(DTable);
926
- if (dtd.tableType != 1) return ERROR(GENERIC);
927
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1660
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
1661
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
1662
+
1663
+ #if DYNAMIC_BMI2
1664
+ if (flags & HUF_flags_bmi2) {
1665
+ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
1666
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
1667
+ if (!(flags & HUF_flags_disableAsm)) {
1668
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
1669
+ }
1670
+ # endif
1671
+ } else {
1672
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
1673
+ }
1674
+ #endif
1675
+
1676
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
1677
+ if (!(flags & HUF_flags_disableAsm)) {
1678
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
1679
+ }
1680
+ #endif
1681
+
1682
+ if (!(flags & HUF_flags_disableFast)) {
1683
+ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
1684
+ if (ret != 0)
1685
+ return ret;
1686
+ }
1687
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
928
1688
  }
929
1689
 
1690
+ HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
1691
+
930
1692
  size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
931
1693
  const void* cSrc, size_t cSrcSize,
932
- void* workSpace, size_t wkspSize)
1694
+ void* workSpace, size_t wkspSize, int flags)
933
1695
  {
934
1696
  const BYTE* ip = (const BYTE*) cSrc;
935
1697
 
936
1698
  size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
937
- workSpace, wkspSize);
1699
+ workSpace, wkspSize, flags);
938
1700
  if (HUF_isError(hSize)) return hSize;
939
1701
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
940
1702
  ip += hSize; cSrcSize -= hSize;
941
1703
 
942
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
1704
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
943
1705
  }
944
1706
 
945
-
946
- size_t HUF_decompress4X2_usingDTable(
947
- void* dst, size_t dstSize,
948
- const void* cSrc, size_t cSrcSize,
949
- const HUF_DTable* DTable)
950
- {
951
- DTableDesc dtd = HUF_getDTableDesc(DTable);
952
- if (dtd.tableType != 1) return ERROR(GENERIC);
953
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
954
- }
955
-
956
- static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
1707
+ static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
957
1708
  const void* cSrc, size_t cSrcSize,
958
- void* workSpace, size_t wkspSize, int bmi2)
1709
+ void* workSpace, size_t wkspSize, int flags)
959
1710
  {
960
1711
  const BYTE* ip = (const BYTE*) cSrc;
961
1712
 
962
1713
  size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
963
- workSpace, wkspSize);
1714
+ workSpace, wkspSize, flags);
964
1715
  if (HUF_isError(hSize)) return hSize;
965
1716
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
966
1717
  ip += hSize; cSrcSize -= hSize;
967
1718
 
968
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
969
- }
970
-
971
- size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
972
- const void* cSrc, size_t cSrcSize,
973
- void* workSpace, size_t wkspSize)
974
- {
975
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
1719
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
976
1720
  }
977
1721
 
978
-
979
1722
  #endif /* HUF_FORCE_DECOMPRESS_X1 */
980
1723
 
981
1724
 
@@ -983,66 +1726,28 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
983
1726
  /* Universal decompression selectors */
984
1727
  /* ***********************************/
985
1728
 
986
- size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
987
- const void* cSrc, size_t cSrcSize,
988
- const HUF_DTable* DTable)
989
- {
990
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
991
- #if defined(HUF_FORCE_DECOMPRESS_X1)
992
- (void)dtd;
993
- assert(dtd.tableType == 0);
994
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
995
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
996
- (void)dtd;
997
- assert(dtd.tableType == 1);
998
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
999
- #else
1000
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
1001
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1002
- #endif
1003
- }
1004
-
1005
- size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
1006
- const void* cSrc, size_t cSrcSize,
1007
- const HUF_DTable* DTable)
1008
- {
1009
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
1010
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1011
- (void)dtd;
1012
- assert(dtd.tableType == 0);
1013
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1014
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1015
- (void)dtd;
1016
- assert(dtd.tableType == 1);
1017
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1018
- #else
1019
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
1020
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1021
- #endif
1022
- }
1023
-
1024
1729
 
1025
1730
  #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1026
1731
  typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
1027
- static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
1732
+ static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] =
1028
1733
  {
1029
1734
  /* single, double, quad */
1030
- {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */
1031
- {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */
1032
- {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */
1033
- {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */
1034
- {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */
1035
- {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */
1036
- {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */
1037
- {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */
1038
- {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */
1039
- {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */
1040
- {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */
1041
- {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */
1042
- {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */
1043
- {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */
1044
- {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */
1045
- {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */
1735
+ {{0,0}, {1,1}}, /* Q==0 : impossible */
1736
+ {{0,0}, {1,1}}, /* Q==1 : impossible */
1737
+ {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */
1738
+ {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */
1739
+ {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */
1740
+ {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */
1741
+ {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */
1742
+ {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */
1743
+ {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */
1744
+ {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */
1745
+ {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */
1746
+ {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */
1747
+ {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */
1748
+ {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */
1749
+ {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */
1750
+ {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */
1046
1751
  };
1047
1752
  #endif
1048
1753
 
@@ -1069,42 +1774,15 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
1069
1774
  U32 const D256 = (U32)(dstSize >> 8);
1070
1775
  U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
1071
1776
  U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
1072
- DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */
1777
+ DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */
1073
1778
  return DTime1 < DTime0;
1074
1779
  }
1075
1780
  #endif
1076
1781
  }
1077
1782
 
1078
-
1079
- size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
1080
- size_t dstSize, const void* cSrc,
1081
- size_t cSrcSize, void* workSpace,
1082
- size_t wkspSize)
1083
- {
1084
- /* validation checks */
1085
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1086
- if (cSrcSize == 0) return ERROR(corruption_detected);
1087
-
1088
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1089
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1090
- (void)algoNb;
1091
- assert(algoNb == 0);
1092
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1093
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1094
- (void)algoNb;
1095
- assert(algoNb == 1);
1096
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1097
- #else
1098
- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1099
- cSrcSize, workSpace, wkspSize):
1100
- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1101
- #endif
1102
- }
1103
- }
1104
-
1105
1783
  size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1106
1784
  const void* cSrc, size_t cSrcSize,
1107
- void* workSpace, size_t wkspSize)
1785
+ void* workSpace, size_t wkspSize, int flags)
1108
1786
  {
1109
1787
  /* validation checks */
1110
1788
  if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1117,71 +1795,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1117
1795
  (void)algoNb;
1118
1796
  assert(algoNb == 0);
1119
1797
  return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
1120
- cSrcSize, workSpace, wkspSize);
1798
+ cSrcSize, workSpace, wkspSize, flags);
1121
1799
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1122
1800
  (void)algoNb;
1123
1801
  assert(algoNb == 1);
1124
1802
  return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1125
- cSrcSize, workSpace, wkspSize);
1803
+ cSrcSize, workSpace, wkspSize, flags);
1126
1804
  #else
1127
1805
  return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1128
- cSrcSize, workSpace, wkspSize):
1806
+ cSrcSize, workSpace, wkspSize, flags):
1129
1807
  HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
1130
- cSrcSize, workSpace, wkspSize);
1808
+ cSrcSize, workSpace, wkspSize, flags);
1131
1809
  #endif
1132
1810
  }
1133
1811
  }
1134
1812
 
1135
1813
 
1136
- size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1814
+ size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
1137
1815
  {
1138
1816
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1139
1817
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1140
1818
  (void)dtd;
1141
1819
  assert(dtd.tableType == 0);
1142
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1820
+ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1143
1821
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1144
1822
  (void)dtd;
1145
1823
  assert(dtd.tableType == 1);
1146
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1824
+ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1147
1825
  #else
1148
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1149
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1826
+ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
1827
+ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1150
1828
  #endif
1151
1829
  }
1152
1830
 
1153
1831
  #ifndef HUF_FORCE_DECOMPRESS_X2
1154
- size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1832
+ size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
1155
1833
  {
1156
1834
  const BYTE* ip = (const BYTE*) cSrc;
1157
1835
 
1158
- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1836
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
1159
1837
  if (HUF_isError(hSize)) return hSize;
1160
1838
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1161
1839
  ip += hSize; cSrcSize -= hSize;
1162
1840
 
1163
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
1841
+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
1164
1842
  }
1165
1843
  #endif
1166
1844
 
1167
- size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1845
+ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
1168
1846
  {
1169
1847
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1170
1848
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1171
1849
  (void)dtd;
1172
1850
  assert(dtd.tableType == 0);
1173
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1851
+ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1174
1852
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1175
1853
  (void)dtd;
1176
1854
  assert(dtd.tableType == 1);
1177
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1855
+ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1178
1856
  #else
1179
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1180
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1857
+ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
1858
+ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1181
1859
  #endif
1182
1860
  }
1183
1861
 
1184
- size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1862
+ size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
1185
1863
  {
1186
1864
  /* validation checks */
1187
1865
  if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1191,160 +1869,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
1191
1869
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1192
1870
  (void)algoNb;
1193
1871
  assert(algoNb == 0);
1194
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1195
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1196
- (void)algoNb;
1197
- assert(algoNb == 1);
1198
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1199
- #else
1200
- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
1201
- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1202
- #endif
1203
- }
1204
- }
1205
-
1206
- #ifndef ZSTD_NO_UNUSED_FUNCTIONS
1207
- #ifndef HUF_FORCE_DECOMPRESS_X2
1208
- size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize)
1209
- {
1210
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1211
- return HUF_readDTableX1_wksp(DTable, src, srcSize,
1212
- workSpace, sizeof(workSpace));
1213
- }
1214
-
1215
- size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
1216
- const void* cSrc, size_t cSrcSize)
1217
- {
1218
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1219
- return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
1220
- workSpace, sizeof(workSpace));
1221
- }
1222
-
1223
- size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1224
- {
1225
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
1226
- return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
1227
- }
1228
- #endif
1229
-
1230
- #ifndef HUF_FORCE_DECOMPRESS_X1
1231
- size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
1232
- {
1233
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1234
- return HUF_readDTableX2_wksp(DTable, src, srcSize,
1235
- workSpace, sizeof(workSpace));
1236
- }
1237
-
1238
- size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
1239
- const void* cSrc, size_t cSrcSize)
1240
- {
1241
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1242
- return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
1243
- workSpace, sizeof(workSpace));
1244
- }
1245
-
1246
- size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1247
- {
1248
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
1249
- return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1250
- }
1251
- #endif
1252
-
1253
- #ifndef HUF_FORCE_DECOMPRESS_X2
1254
- size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1255
- {
1256
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1257
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1258
- workSpace, sizeof(workSpace));
1259
- }
1260
- size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1261
- {
1262
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
1263
- return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1264
- }
1265
- #endif
1266
-
1267
- #ifndef HUF_FORCE_DECOMPRESS_X1
1268
- size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
1269
- const void* cSrc, size_t cSrcSize)
1270
- {
1271
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1272
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1273
- workSpace, sizeof(workSpace));
1274
- }
1275
-
1276
- size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1277
- {
1278
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
1279
- return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1280
- }
1281
- #endif
1282
-
1283
- typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
1284
-
1285
- size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1286
- {
1287
- #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1288
- static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 };
1289
- #endif
1290
-
1291
- /* validation checks */
1292
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1293
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1294
- if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1295
- if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1296
-
1297
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1298
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1299
- (void)algoNb;
1300
- assert(algoNb == 0);
1301
- return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize);
1302
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1303
- (void)algoNb;
1304
- assert(algoNb == 1);
1305
- return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);
1306
- #else
1307
- return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
1308
- #endif
1309
- }
1310
- }
1311
-
1312
- size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1313
- {
1314
- /* validation checks */
1315
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1316
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1317
- if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1318
- if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1319
-
1320
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1321
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1322
- (void)algoNb;
1323
- assert(algoNb == 0);
1324
- return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1872
+ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1325
1873
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1326
1874
  (void)algoNb;
1327
1875
  assert(algoNb == 1);
1328
- return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1876
+ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1329
1877
  #else
1330
- return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
1331
- HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
1878
+ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
1879
+ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1332
1880
  #endif
1333
1881
  }
1334
1882
  }
1335
-
1336
- size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1337
- {
1338
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1339
- return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1340
- workSpace, sizeof(workSpace));
1341
- }
1342
-
1343
- size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
1344
- const void* cSrc, size_t cSrcSize)
1345
- {
1346
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1347
- return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1348
- workSpace, sizeof(workSpace));
1349
- }
1350
- #endif