zstd-ruby 1.4.4.0 → 1.5.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/README.md +78 -5
  4. data/Rakefile +8 -2
  5. data/ext/zstdruby/common.h +15 -0
  6. data/ext/zstdruby/extconf.rb +3 -2
  7. data/ext/zstdruby/libzstd/common/allocations.h +55 -0
  8. data/ext/zstdruby/libzstd/common/bits.h +200 -0
  9. data/ext/zstdruby/libzstd/common/bitstream.h +74 -97
  10. data/ext/zstdruby/libzstd/common/compiler.h +219 -20
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  12. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  13. data/ext/zstdruby/libzstd/common/debug.h +22 -49
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +184 -80
  15. data/ext/zstdruby/libzstd/common/error_private.c +11 -2
  16. data/ext/zstdruby/libzstd/common/error_private.h +87 -4
  17. data/ext/zstdruby/libzstd/common/fse.h +47 -116
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +127 -127
  19. data/ext/zstdruby/libzstd/common/huf.h +112 -197
  20. data/ext/zstdruby/libzstd/common/mem.h +124 -142
  21. data/ext/zstdruby/libzstd/common/pool.c +54 -27
  22. data/ext/zstdruby/libzstd/common/pool.h +11 -5
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +156 -0
  24. data/ext/zstdruby/libzstd/common/threading.c +78 -22
  25. data/ext/zstdruby/libzstd/common/threading.h +9 -13
  26. data/ext/zstdruby/libzstd/common/xxhash.c +15 -873
  27. data/ext/zstdruby/libzstd/common/xxhash.h +5572 -191
  28. data/ext/zstdruby/libzstd/common/zstd_common.c +2 -37
  29. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  30. data/ext/zstdruby/libzstd/common/zstd_internal.h +186 -144
  31. data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
  32. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  33. data/ext/zstdruby/libzstd/compress/fse_compress.c +99 -196
  34. data/ext/zstdruby/libzstd/compress/hist.c +41 -63
  35. data/ext/zstdruby/libzstd/compress/hist.h +13 -33
  36. data/ext/zstdruby/libzstd/compress/huf_compress.c +968 -331
  37. data/ext/zstdruby/libzstd/compress/zstd_compress.c +4120 -1191
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +688 -159
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +121 -40
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +16 -6
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +62 -35
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +10 -3
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +577 -0
  44. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  45. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +322 -115
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +394 -154
  47. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +4 -3
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.c +729 -253
  49. data/ext/zstdruby/libzstd/compress/zstd_fast.h +4 -3
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1289 -247
  51. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +61 -1
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +339 -212
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
  54. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.c +508 -282
  56. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +217 -466
  58. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +35 -114
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1220 -572
  60. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +576 -0
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +23 -19
  62. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +859 -273
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1244 -375
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +21 -7
  66. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +74 -11
  67. data/ext/zstdruby/libzstd/dictBuilder/cover.c +75 -54
  68. data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
  69. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  70. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +55 -36
  71. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +126 -110
  72. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +248 -56
  73. data/ext/zstdruby/libzstd/zstd.h +1277 -306
  74. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +29 -8
  75. data/ext/zstdruby/main.c +20 -0
  76. data/ext/zstdruby/skippable_frame.c +63 -0
  77. data/ext/zstdruby/streaming_compress.c +177 -0
  78. data/ext/zstdruby/streaming_compress.h +5 -0
  79. data/ext/zstdruby/streaming_decompress.c +123 -0
  80. data/ext/zstdruby/zstdruby.c +114 -32
  81. data/lib/zstd-ruby/version.rb +1 -1
  82. data/lib/zstd-ruby.rb +0 -1
  83. data/zstd-ruby.gemspec +1 -1
  84. metadata +24 -39
  85. data/.travis.yml +0 -14
  86. data/ext/zstdruby/libzstd/.gitignore +0 -3
  87. data/ext/zstdruby/libzstd/BUCK +0 -234
  88. data/ext/zstdruby/libzstd/Makefile +0 -289
  89. data/ext/zstdruby/libzstd/README.md +0 -159
  90. data/ext/zstdruby/libzstd/deprecated/zbuff.h +0 -214
  91. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +0 -26
  92. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +0 -147
  93. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +0 -75
  94. data/ext/zstdruby/libzstd/dll/example/Makefile +0 -47
  95. data/ext/zstdruby/libzstd/dll/example/README.md +0 -69
  96. data/ext/zstdruby/libzstd/dll/example/build_package.bat +0 -20
  97. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.sln +0 -25
  98. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.vcxproj +0 -181
  99. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +0 -415
  100. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +0 -2152
  101. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +0 -94
  102. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +0 -3514
  103. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +0 -93
  104. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +0 -3156
  105. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +0 -93
  106. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +0 -3641
  107. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +0 -142
  108. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +0 -4046
  109. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +0 -162
  110. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +0 -4150
  111. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +0 -172
  112. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +0 -4533
  113. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +0 -187
  114. data/ext/zstdruby/libzstd/libzstd.pc.in +0 -15
  115. data/ext/zstdruby/zstdruby.h +0 -6
@@ -1,47 +1,34 @@
1
1
  /* ******************************************************************
2
- huff0 huffman decoder,
3
- part of Finite State Entropy library
4
- Copyright (C) 2013-present, Yann Collet.
5
-
6
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
7
-
8
- Redistribution and use in source and binary forms, with or without
9
- modification, are permitted provided that the following conditions are
10
- met:
11
-
12
- * Redistributions of source code must retain the above copyright
13
- notice, this list of conditions and the following disclaimer.
14
- * Redistributions in binary form must reproduce the above
15
- copyright notice, this list of conditions and the following disclaimer
16
- in the documentation and/or other materials provided with the
17
- distribution.
18
-
19
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
-
31
- You can contact the author at :
32
- - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
2
+ * huff0 huffman decoder,
3
+ * part of Finite State Entropy library
4
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
5
+ *
6
+ * You can contact the author at :
7
+ * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
8
+ *
9
+ * This source code is licensed under both the BSD-style license (found in the
10
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
11
+ * in the COPYING file in the root directory of this source tree).
12
+ * You may select, at your option, one of the above-listed licenses.
33
13
  ****************************************************************** */
34
14
 
35
15
  /* **************************************************************
36
16
  * Dependencies
37
17
  ****************************************************************/
38
- #include <string.h> /* memcpy, memset */
39
- #include "compiler.h"
40
- #include "bitstream.h" /* BIT_* */
41
- #include "fse.h" /* to compress headers */
42
- #define HUF_STATIC_LINKING_ONLY
43
- #include "huf.h"
44
- #include "error_private.h"
18
+ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset */
19
+ #include "../common/compiler.h"
20
+ #include "../common/bitstream.h" /* BIT_* */
21
+ #include "../common/fse.h" /* to compress headers */
22
+ #include "../common/huf.h"
23
+ #include "../common/error_private.h"
24
+ #include "../common/zstd_internal.h"
25
+ #include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
26
+
27
+ /* **************************************************************
28
+ * Constants
29
+ ****************************************************************/
30
+
31
+ #define HUF_DECODER_FAST_TABLELOG 11
45
32
 
46
33
  /* **************************************************************
47
34
  * Macros
@@ -56,14 +43,33 @@
56
43
  #error "Cannot force the use of the X1 and X2 decoders at the same time!"
57
44
  #endif
58
45
 
46
+ /* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
47
+ * supported at runtime, so we can add the BMI2 target attribute.
48
+ * When it is disabled, we will still get BMI2 if it is enabled statically.
49
+ */
50
+ #if DYNAMIC_BMI2
51
+ # define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
52
+ #else
53
+ # define HUF_FAST_BMI2_ATTRS
54
+ #endif
55
+
56
+ #ifdef __cplusplus
57
+ # define HUF_EXTERN_C extern "C"
58
+ #else
59
+ # define HUF_EXTERN_C
60
+ #endif
61
+ #define HUF_ASM_DECL HUF_EXTERN_C
62
+
63
+ #if DYNAMIC_BMI2
64
+ # define HUF_NEED_BMI2_FUNCTION 1
65
+ #else
66
+ # define HUF_NEED_BMI2_FUNCTION 0
67
+ #endif
59
68
 
60
69
  /* **************************************************************
61
70
  * Error Management
62
71
  ****************************************************************/
63
72
  #define HUF_isError ERR_isError
64
- #ifndef CHECK_F
65
- #define CHECK_F(f) { size_t const err_ = (f); if (HUF_isError(err_)) return err_; }
66
- #endif
67
73
 
68
74
 
69
75
  /* **************************************************************
@@ -76,6 +82,11 @@
76
82
  /* **************************************************************
77
83
  * BMI2 Variant Wrappers
78
84
  ****************************************************************/
85
+ typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
86
+ const void *cSrc,
87
+ size_t cSrcSize,
88
+ const HUF_DTable *DTable);
89
+
79
90
  #if DYNAMIC_BMI2
80
91
 
81
92
  #define HUF_DGEN(fn) \
@@ -88,7 +99,7 @@
88
99
  return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
89
100
  } \
90
101
  \
91
- static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \
102
+ static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \
92
103
  void* dst, size_t dstSize, \
93
104
  const void* cSrc, size_t cSrcSize, \
94
105
  const HUF_DTable* DTable) \
@@ -97,9 +108,9 @@
97
108
  } \
98
109
  \
99
110
  static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
100
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
111
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
101
112
  { \
102
- if (bmi2) { \
113
+ if (flags & HUF_flags_bmi2) { \
103
114
  return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
104
115
  } \
105
116
  return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
@@ -109,9 +120,9 @@
109
120
 
110
121
  #define HUF_DGEN(fn) \
111
122
  static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
112
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
123
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
113
124
  { \
114
- (void)bmi2; \
125
+ (void)flags; \
115
126
  return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
116
127
  }
117
128
 
@@ -126,82 +137,359 @@ typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved;
126
137
  static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
127
138
  {
128
139
  DTableDesc dtd;
129
- memcpy(&dtd, table, sizeof(dtd));
140
+ ZSTD_memcpy(&dtd, table, sizeof(dtd));
130
141
  return dtd;
131
142
  }
132
143
 
144
+ static size_t HUF_initFastDStream(BYTE const* ip) {
145
+ BYTE const lastByte = ip[7];
146
+ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
147
+ size_t const value = MEM_readLEST(ip) | 1;
148
+ assert(bitsConsumed <= 8);
149
+ assert(sizeof(size_t) == 8);
150
+ return value << bitsConsumed;
151
+ }
152
+
153
+
154
+ /**
155
+ * The input/output arguments to the Huffman fast decoding loop:
156
+ *
157
+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
158
+ * op [in/out] - The output pointers, must be updated to reflect what is written.
159
+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
160
+ * dt [in] - The decoding table.
161
+ * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
162
+ * oend [in] - The end of the output stream. op[3] must not cross oend.
163
+ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
164
+ * as long as it is above ilimit, but that indicates corruption.
165
+ */
166
+ typedef struct {
167
+ BYTE const* ip[4];
168
+ BYTE* op[4];
169
+ U64 bits[4];
170
+ void const* dt;
171
+ BYTE const* ilimit;
172
+ BYTE* oend;
173
+ BYTE const* iend[4];
174
+ } HUF_DecompressFastArgs;
175
+
176
+ typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
177
+
178
+ /**
179
+ * Initializes args for the fast decoding loop.
180
+ * @returns 1 on success
181
+ * 0 if the fallback implementation should be used.
182
+ * Or an error code on failure.
183
+ */
184
+ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
185
+ {
186
+ void const* dt = DTable + 1;
187
+ U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
188
+
189
+ const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
190
+
191
+ BYTE* const oend = (BYTE*)dst + dstSize;
192
+
193
+ /* The fast decoding loop assumes 64-bit little-endian.
194
+ * This condition is false on x32.
195
+ */
196
+ if (!MEM_isLittleEndian() || MEM_32bits())
197
+ return 0;
198
+
199
+ /* strict minimum : jump table + 1 byte per stream */
200
+ if (srcSize < 10)
201
+ return ERROR(corruption_detected);
202
+
203
+ /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers.
204
+ * If table log is not correct at this point, fallback to the old decoder.
205
+ * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
206
+ */
207
+ if (dtLog != HUF_DECODER_FAST_TABLELOG)
208
+ return 0;
209
+
210
+ /* Read the jump table. */
211
+ {
212
+ const BYTE* const istart = (const BYTE*)src;
213
+ size_t const length1 = MEM_readLE16(istart);
214
+ size_t const length2 = MEM_readLE16(istart+2);
215
+ size_t const length3 = MEM_readLE16(istart+4);
216
+ size_t const length4 = srcSize - (length1 + length2 + length3 + 6);
217
+ args->iend[0] = istart + 6; /* jumpTable */
218
+ args->iend[1] = args->iend[0] + length1;
219
+ args->iend[2] = args->iend[1] + length2;
220
+ args->iend[3] = args->iend[2] + length3;
221
+
222
+ /* HUF_initFastDStream() requires this, and this small of an input
223
+ * won't benefit from the ASM loop anyways.
224
+ * length1 must be >= 16 so that ip[0] >= ilimit before the loop
225
+ * starts.
226
+ */
227
+ if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
228
+ return 0;
229
+ if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
230
+ }
231
+ /* ip[] contains the position that is currently loaded into bits[]. */
232
+ args->ip[0] = args->iend[1] - sizeof(U64);
233
+ args->ip[1] = args->iend[2] - sizeof(U64);
234
+ args->ip[2] = args->iend[3] - sizeof(U64);
235
+ args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64);
236
+
237
+ /* op[] contains the output pointers. */
238
+ args->op[0] = (BYTE*)dst;
239
+ args->op[1] = args->op[0] + (dstSize+3)/4;
240
+ args->op[2] = args->op[1] + (dstSize+3)/4;
241
+ args->op[3] = args->op[2] + (dstSize+3)/4;
242
+
243
+ /* No point to call the ASM loop for tiny outputs. */
244
+ if (args->op[3] >= oend)
245
+ return 0;
246
+
247
+ /* bits[] is the bit container.
248
+ * It is read from the MSB down to the LSB.
249
+ * It is shifted left as it is read, and zeros are
250
+ * shifted in. After the lowest valid bit a 1 is
251
+ * set, so that CountTrailingZeros(bits[]) can be used
252
+ * to count how many bits we've consumed.
253
+ */
254
+ args->bits[0] = HUF_initFastDStream(args->ip[0]);
255
+ args->bits[1] = HUF_initFastDStream(args->ip[1]);
256
+ args->bits[2] = HUF_initFastDStream(args->ip[2]);
257
+ args->bits[3] = HUF_initFastDStream(args->ip[3]);
258
+
259
+ /* If ip[] >= ilimit, it is guaranteed to be safe to
260
+ * reload bits[]. It may be beyond its section, but is
261
+ * guaranteed to be valid (>= istart).
262
+ */
263
+ args->ilimit = ilimit;
264
+
265
+ args->oend = oend;
266
+ args->dt = dt;
267
+
268
+ return 1;
269
+ }
270
+
271
+ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
272
+ {
273
+ /* Validate that we haven't overwritten. */
274
+ if (args->op[stream] > segmentEnd)
275
+ return ERROR(corruption_detected);
276
+ /* Validate that we haven't read beyond iend[].
277
+ * Note that ip[] may be < iend[] because the MSB is
278
+ * the next bit to read, and we may have consumed 100%
279
+ * of the stream, so down to iend[i] - 8 is valid.
280
+ */
281
+ if (args->ip[stream] < args->iend[stream] - 8)
282
+ return ERROR(corruption_detected);
283
+
284
+ /* Construct the BIT_DStream_t. */
285
+ assert(sizeof(size_t) == 8);
286
+ bit->bitContainer = MEM_readLEST(args->ip[stream]);
287
+ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
288
+ bit->start = (const char*)args->iend[0];
289
+ bit->limitPtr = bit->start + sizeof(size_t);
290
+ bit->ptr = (const char*)args->ip[stream];
291
+
292
+ return 0;
293
+ }
294
+
133
295
 
134
296
  #ifndef HUF_FORCE_DECOMPRESS_X2
135
297
 
136
298
  /*-***************************/
137
299
  /* single-symbol decoding */
138
300
  /*-***************************/
139
- typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */
301
+ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */
140
302
 
141
- size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
303
+ /**
304
+ * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at
305
+ * a time.
306
+ */
307
+ static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
308
+ U64 D4;
309
+ if (MEM_isLittleEndian()) {
310
+ D4 = (U64)((symbol << 8) + nbBits);
311
+ } else {
312
+ D4 = (U64)(symbol + (nbBits << 8));
313
+ }
314
+ assert(D4 < (1U << 16));
315
+ D4 *= 0x0001000100010001ULL;
316
+ return D4;
317
+ }
318
+
319
+ /**
320
+ * Increase the tableLog to targetTableLog and rescales the stats.
321
+ * If tableLog > targetTableLog this is a no-op.
322
+ * @returns New tableLog
323
+ */
324
+ static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog)
325
+ {
326
+ if (tableLog > targetTableLog)
327
+ return tableLog;
328
+ if (tableLog < targetTableLog) {
329
+ U32 const scale = targetTableLog - tableLog;
330
+ U32 s;
331
+ /* Increase the weight for all non-zero probability symbols by scale. */
332
+ for (s = 0; s < nbSymbols; ++s) {
333
+ huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale);
334
+ }
335
+ /* Update rankVal to reflect the new weights.
336
+ * All weights except 0 get moved to weight + scale.
337
+ * Weights [1, scale] are empty.
338
+ */
339
+ for (s = targetTableLog; s > scale; --s) {
340
+ rankVal[s] = rankVal[s - scale];
341
+ }
342
+ for (s = scale; s > 0; --s) {
343
+ rankVal[s] = 0;
344
+ }
345
+ }
346
+ return targetTableLog;
347
+ }
348
+
349
+ typedef struct {
350
+ U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];
351
+ U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1];
352
+ U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
353
+ BYTE symbols[HUF_SYMBOLVALUE_MAX + 1];
354
+ BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
355
+ } HUF_ReadDTableX1_Workspace;
356
+
357
+ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
142
358
  {
143
359
  U32 tableLog = 0;
144
360
  U32 nbSymbols = 0;
145
361
  size_t iSize;
146
362
  void* const dtPtr = DTable + 1;
147
363
  HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr;
364
+ HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace;
148
365
 
149
- U32* rankVal;
150
- BYTE* huffWeight;
151
- size_t spaceUsed32 = 0;
152
-
153
- rankVal = (U32 *)workSpace + spaceUsed32;
154
- spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
155
- huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
156
- spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
157
-
158
- if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
366
+ DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp));
367
+ if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge);
159
368
 
160
369
  DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
161
- /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
370
+ /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
162
371
 
163
- iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
372
+ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
164
373
  if (HUF_isError(iSize)) return iSize;
165
374
 
375
+
166
376
  /* Table header */
167
377
  { DTableDesc dtd = HUF_getDTableDesc(DTable);
378
+ U32 const maxTableLog = dtd.maxTableLog + 1;
379
+ U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG);
380
+ tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog);
168
381
  if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */
169
382
  dtd.tableType = 0;
170
383
  dtd.tableLog = (BYTE)tableLog;
171
- memcpy(DTable, &dtd, sizeof(dtd));
384
+ ZSTD_memcpy(DTable, &dtd, sizeof(dtd));
172
385
  }
173
386
 
174
- /* Calculate starting value for each rank */
175
- { U32 n, nextRankStart = 0;
176
- for (n=1; n<tableLog+1; n++) {
177
- U32 const current = nextRankStart;
178
- nextRankStart += (rankVal[n] << (n-1));
179
- rankVal[n] = current;
180
- } }
181
-
182
- /* fill DTable */
183
- { U32 n;
184
- for (n=0; n<nbSymbols; n++) {
185
- U32 const w = huffWeight[n];
186
- U32 const length = (1 << w) >> 1;
187
- U32 u;
188
- HUF_DEltX1 D;
189
- D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
190
- for (u = rankVal[w]; u < rankVal[w] + length; u++)
191
- dt[u] = D;
192
- rankVal[w] += length;
193
- } }
387
+ /* Compute symbols and rankStart given rankVal:
388
+ *
389
+ * rankVal already contains the number of values of each weight.
390
+ *
391
+ * symbols contains the symbols ordered by weight. First are the rankVal[0]
392
+ * weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on.
393
+ * symbols[0] is filled (but unused) to avoid a branch.
394
+ *
395
+ * rankStart contains the offset where each rank belongs in the DTable.
396
+ * rankStart[0] is not filled because there are no entries in the table for
397
+ * weight 0.
398
+ */
399
+ { int n;
400
+ U32 nextRankStart = 0;
401
+ int const unroll = 4;
402
+ int const nLimit = (int)nbSymbols - unroll + 1;
403
+ for (n=0; n<(int)tableLog+1; n++) {
404
+ U32 const curr = nextRankStart;
405
+ nextRankStart += wksp->rankVal[n];
406
+ wksp->rankStart[n] = curr;
407
+ }
408
+ for (n=0; n < nLimit; n += unroll) {
409
+ int u;
410
+ for (u=0; u < unroll; ++u) {
411
+ size_t const w = wksp->huffWeight[n+u];
412
+ wksp->symbols[wksp->rankStart[w]++] = (BYTE)(n+u);
413
+ }
414
+ }
415
+ for (; n < (int)nbSymbols; ++n) {
416
+ size_t const w = wksp->huffWeight[n];
417
+ wksp->symbols[wksp->rankStart[w]++] = (BYTE)n;
418
+ }
419
+ }
194
420
 
421
+ /* fill DTable
422
+ * We fill all entries of each weight in order.
423
+ * That way length is a constant for each iteration of the outer loop.
424
+ * We can switch based on the length to a different inner loop which is
425
+ * optimized for that particular case.
426
+ */
427
+ { U32 w;
428
+ int symbol = wksp->rankVal[0];
429
+ int rankStart = 0;
430
+ for (w=1; w<tableLog+1; ++w) {
431
+ int const symbolCount = wksp->rankVal[w];
432
+ int const length = (1 << w) >> 1;
433
+ int uStart = rankStart;
434
+ BYTE const nbBits = (BYTE)(tableLog + 1 - w);
435
+ int s;
436
+ int u;
437
+ switch (length) {
438
+ case 1:
439
+ for (s=0; s<symbolCount; ++s) {
440
+ HUF_DEltX1 D;
441
+ D.byte = wksp->symbols[symbol + s];
442
+ D.nbBits = nbBits;
443
+ dt[uStart] = D;
444
+ uStart += 1;
445
+ }
446
+ break;
447
+ case 2:
448
+ for (s=0; s<symbolCount; ++s) {
449
+ HUF_DEltX1 D;
450
+ D.byte = wksp->symbols[symbol + s];
451
+ D.nbBits = nbBits;
452
+ dt[uStart+0] = D;
453
+ dt[uStart+1] = D;
454
+ uStart += 2;
455
+ }
456
+ break;
457
+ case 4:
458
+ for (s=0; s<symbolCount; ++s) {
459
+ U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
460
+ MEM_write64(dt + uStart, D4);
461
+ uStart += 4;
462
+ }
463
+ break;
464
+ case 8:
465
+ for (s=0; s<symbolCount; ++s) {
466
+ U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
467
+ MEM_write64(dt + uStart, D4);
468
+ MEM_write64(dt + uStart + 4, D4);
469
+ uStart += 8;
470
+ }
471
+ break;
472
+ default:
473
+ for (s=0; s<symbolCount; ++s) {
474
+ U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
475
+ for (u=0; u < length; u += 16) {
476
+ MEM_write64(dt + uStart + u + 0, D4);
477
+ MEM_write64(dt + uStart + u + 4, D4);
478
+ MEM_write64(dt + uStart + u + 8, D4);
479
+ MEM_write64(dt + uStart + u + 12, D4);
480
+ }
481
+ assert(u == length);
482
+ uStart += length;
483
+ }
484
+ break;
485
+ }
486
+ symbol += symbolCount;
487
+ rankStart += symbolCount * length;
488
+ }
489
+ }
195
490
  return iSize;
196
491
  }
197
492
 
198
- size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize)
199
- {
200
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
201
- return HUF_readDTableX1_wksp(DTable, src, srcSize,
202
- workSpace, sizeof(workSpace));
203
- }
204
-
205
493
  FORCE_INLINE_TEMPLATE BYTE
206
494
  HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog)
207
495
  {
@@ -228,11 +516,15 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
228
516
  BYTE* const pStart = p;
229
517
 
230
518
  /* up to 4 symbols at a time */
231
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
232
- HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
233
- HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
234
- HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
235
- HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
519
+ if ((pEnd - p) > 3) {
520
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
521
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
522
+ HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
523
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
524
+ HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
525
+ }
526
+ } else {
527
+ BIT_reloadDStream(bitDPtr);
236
528
  }
237
529
 
238
530
  /* [0-3] symbols remaining */
@@ -244,7 +536,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
244
536
  while (p < pEnd)
245
537
  HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
246
538
 
247
- return pEnd-pStart;
539
+ return (size_t)(pEnd-pStart);
248
540
  }
249
541
 
250
542
  FORCE_INLINE_TEMPLATE size_t
@@ -270,6 +562,10 @@ HUF_decompress1X1_usingDTable_internal_body(
270
562
  return dstSize;
271
563
  }
272
564
 
565
+ /* HUF_decompress4X1_usingDTable_internal_body():
566
+ * Conditions :
567
+ * @dstSize >= 6
568
+ */
273
569
  FORCE_INLINE_TEMPLATE size_t
274
570
  HUF_decompress4X1_usingDTable_internal_body(
275
571
  void* dst, size_t dstSize,
@@ -282,6 +578,7 @@ HUF_decompress4X1_usingDTable_internal_body(
282
578
  { const BYTE* const istart = (const BYTE*) cSrc;
283
579
  BYTE* const ostart = (BYTE*) dst;
284
580
  BYTE* const oend = ostart + dstSize;
581
+ BYTE* const olimit = oend - 3;
285
582
  const void* const dtPtr = DTable + 1;
286
583
  const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
287
584
 
@@ -306,39 +603,42 @@ HUF_decompress4X1_usingDTable_internal_body(
306
603
  BYTE* op2 = opStart2;
307
604
  BYTE* op3 = opStart3;
308
605
  BYTE* op4 = opStart4;
309
- U32 endSignal = BIT_DStream_unfinished;
310
606
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
311
607
  U32 const dtLog = dtd.tableLog;
608
+ U32 endSignal = 1;
312
609
 
313
610
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
611
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
612
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
314
613
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
315
614
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
316
615
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
317
616
  CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
318
617
 
319
618
  /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
320
- endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
321
- while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) {
322
- HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
323
- HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
324
- HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
325
- HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
326
- HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
327
- HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
328
- HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
329
- HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
330
- HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
331
- HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
332
- HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
333
- HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
334
- HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
335
- HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
336
- HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
337
- HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
338
- BIT_reloadDStream(&bitD1);
339
- BIT_reloadDStream(&bitD2);
340
- BIT_reloadDStream(&bitD3);
341
- BIT_reloadDStream(&bitD4);
619
+ if ((size_t)(oend - op4) >= sizeof(size_t)) {
620
+ for ( ; (endSignal) & (op4 < olimit) ; ) {
621
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
622
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
623
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
624
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
625
+ HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
626
+ HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
627
+ HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
628
+ HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
629
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
630
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
631
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
632
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
633
+ HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
634
+ HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
635
+ HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
636
+ HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
637
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
638
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
639
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
640
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
641
+ }
342
642
  }
343
643
 
344
644
  /* check corruption */
@@ -364,99 +664,230 @@ HUF_decompress4X1_usingDTable_internal_body(
364
664
  }
365
665
  }
366
666
 
667
+ #if HUF_NEED_BMI2_FUNCTION
668
+ static BMI2_TARGET_ATTRIBUTE
669
+ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
670
+ size_t cSrcSize, HUF_DTable const* DTable) {
671
+ return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
672
+ }
673
+ #endif
367
674
 
368
- typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
369
- const void *cSrc,
370
- size_t cSrcSize,
371
- const HUF_DTable *DTable);
675
+ static
676
+ size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
677
+ size_t cSrcSize, HUF_DTable const* DTable) {
678
+ return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
679
+ }
372
680
 
373
- HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
374
- HUF_DGEN(HUF_decompress4X1_usingDTable_internal)
681
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
375
682
 
683
+ HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
376
684
 
685
+ #endif
377
686
 
378
- size_t HUF_decompress1X1_usingDTable(
379
- void* dst, size_t dstSize,
380
- const void* cSrc, size_t cSrcSize,
381
- const HUF_DTable* DTable)
687
+ static HUF_FAST_BMI2_ATTRS
688
+ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
382
689
  {
383
- DTableDesc dtd = HUF_getDTableDesc(DTable);
384
- if (dtd.tableType != 0) return ERROR(GENERIC);
385
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
690
+ U64 bits[4];
691
+ BYTE const* ip[4];
692
+ BYTE* op[4];
693
+ U16 const* const dtable = (U16 const*)args->dt;
694
+ BYTE* const oend = args->oend;
695
+ BYTE const* const ilimit = args->ilimit;
696
+
697
+ /* Copy the arguments to local variables */
698
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
699
+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
700
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
701
+
702
+ assert(MEM_isLittleEndian());
703
+ assert(!MEM_32bits());
704
+
705
+ for (;;) {
706
+ BYTE* olimit;
707
+ int stream;
708
+ int symbol;
709
+
710
+ /* Assert loop preconditions */
711
+ #ifndef NDEBUG
712
+ for (stream = 0; stream < 4; ++stream) {
713
+ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
714
+ assert(ip[stream] >= ilimit);
715
+ }
716
+ #endif
717
+ /* Compute olimit */
718
+ {
719
+ /* Each iteration produces 5 output symbols per stream */
720
+ size_t const oiters = (size_t)(oend - op[3]) / 5;
721
+ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
722
+ * per stream.
723
+ */
724
+ size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
725
+ /* We can safely run iters iterations before running bounds checks */
726
+ size_t const iters = MIN(oiters, iiters);
727
+ size_t const symbols = iters * 5;
728
+
729
+ /* We can simply check that op[3] < olimit, instead of checking all
730
+ * of our bounds, since we can't hit the other bounds until we've run
731
+ * iters iterations, which only happens when op[3] == olimit.
732
+ */
733
+ olimit = op[3] + symbols;
734
+
735
+ /* Exit fast decoding loop once we get close to the end. */
736
+ if (op[3] + 20 > olimit)
737
+ break;
738
+
739
+ /* Exit the decoding loop if any input pointer has crossed the
740
+ * previous one. This indicates corruption, and a precondition
741
+ * to our loop is that ip[i] >= ip[0].
742
+ */
743
+ for (stream = 1; stream < 4; ++stream) {
744
+ if (ip[stream] < ip[stream - 1])
745
+ goto _out;
746
+ }
747
+ }
748
+
749
+ #ifndef NDEBUG
750
+ for (stream = 1; stream < 4; ++stream) {
751
+ assert(ip[stream] >= ip[stream - 1]);
752
+ }
753
+ #endif
754
+
755
+ do {
756
+ /* Decode 5 symbols in each of the 4 streams */
757
+ for (symbol = 0; symbol < 5; ++symbol) {
758
+ for (stream = 0; stream < 4; ++stream) {
759
+ int const index = (int)(bits[stream] >> 53);
760
+ int const entry = (int)dtable[index];
761
+ bits[stream] <<= (entry & 63);
762
+ op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
763
+ }
764
+ }
765
+ /* Reload the bitstreams */
766
+ for (stream = 0; stream < 4; ++stream) {
767
+ int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
768
+ int const nbBits = ctz & 7;
769
+ int const nbBytes = ctz >> 3;
770
+ op[stream] += 5;
771
+ ip[stream] -= nbBytes;
772
+ bits[stream] = MEM_read64(ip[stream]) | 1;
773
+ bits[stream] <<= nbBits;
774
+ }
775
+ } while (op[3] < olimit);
776
+ }
777
+
778
+ _out:
779
+
780
+ /* Save the final values of each of the state variables back to args. */
781
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
782
+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
783
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
386
784
  }
387
785
 
388
- size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
389
- const void* cSrc, size_t cSrcSize,
390
- void* workSpace, size_t wkspSize)
786
+ /**
787
+ * @returns @p dstSize on success (>= 6)
788
+ * 0 if the fallback implementation should be used
789
+ * An error if an error occurred
790
+ */
791
+ static HUF_FAST_BMI2_ATTRS
792
+ size_t
793
+ HUF_decompress4X1_usingDTable_internal_fast(
794
+ void* dst, size_t dstSize,
795
+ const void* cSrc, size_t cSrcSize,
796
+ const HUF_DTable* DTable,
797
+ HUF_DecompressFastLoopFn loopFn)
391
798
  {
392
- const BYTE* ip = (const BYTE*) cSrc;
799
+ void const* dt = DTable + 1;
800
+ const BYTE* const iend = (const BYTE*)cSrc + 6;
801
+ BYTE* const oend = (BYTE*)dst + dstSize;
802
+ HUF_DecompressFastArgs args;
803
+ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
804
+ FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
805
+ if (ret == 0)
806
+ return 0;
807
+ }
393
808
 
394
- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
395
- if (HUF_isError(hSize)) return hSize;
396
- if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
397
- ip += hSize; cSrcSize -= hSize;
809
+ assert(args.ip[0] >= args.ilimit);
810
+ loopFn(&args);
811
+
812
+ /* Our loop guarantees that ip[] >= ilimit and that we haven't
813
+ * overwritten any op[].
814
+ */
815
+ assert(args.ip[0] >= iend);
816
+ assert(args.ip[1] >= iend);
817
+ assert(args.ip[2] >= iend);
818
+ assert(args.ip[3] >= iend);
819
+ assert(args.op[3] <= oend);
820
+ (void)iend;
821
+
822
+ /* finish bit streams one by one. */
823
+ { size_t const segmentSize = (dstSize+3) / 4;
824
+ BYTE* segmentEnd = (BYTE*)dst;
825
+ int i;
826
+ for (i = 0; i < 4; ++i) {
827
+ BIT_DStream_t bit;
828
+ if (segmentSize <= (size_t)(oend - segmentEnd))
829
+ segmentEnd += segmentSize;
830
+ else
831
+ segmentEnd = oend;
832
+ FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
833
+ /* Decompress and validate that we've produced exactly the expected length. */
834
+ args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG);
835
+ if (args.op[i] != segmentEnd) return ERROR(corruption_detected);
836
+ }
837
+ }
398
838
 
399
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
839
+ /* decoded size */
840
+ assert(dstSize != 0);
841
+ return dstSize;
400
842
  }
401
843
 
844
+ HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
402
845
 
403
- size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
404
- const void* cSrc, size_t cSrcSize)
846
+ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
847
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
405
848
  {
406
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
407
- return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
408
- workSpace, sizeof(workSpace));
409
- }
849
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
850
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
410
851
 
411
- size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
412
- {
413
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
414
- return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
415
- }
852
+ #if DYNAMIC_BMI2
853
+ if (flags & HUF_flags_bmi2) {
854
+ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
855
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
856
+ if (!(flags & HUF_flags_disableAsm)) {
857
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
858
+ }
859
+ # endif
860
+ } else {
861
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
862
+ }
863
+ #endif
416
864
 
417
- size_t HUF_decompress4X1_usingDTable(
418
- void* dst, size_t dstSize,
419
- const void* cSrc, size_t cSrcSize,
420
- const HUF_DTable* DTable)
421
- {
422
- DTableDesc dtd = HUF_getDTableDesc(DTable);
423
- if (dtd.tableType != 0) return ERROR(GENERIC);
424
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
865
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
866
+ if (!(flags & HUF_flags_disableAsm)) {
867
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
868
+ }
869
+ #endif
870
+
871
+ if (!(flags & HUF_flags_disableFast)) {
872
+ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
873
+ if (ret != 0)
874
+ return ret;
875
+ }
876
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
425
877
  }
426
878
 
427
- static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
879
+ static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
428
880
  const void* cSrc, size_t cSrcSize,
429
- void* workSpace, size_t wkspSize, int bmi2)
881
+ void* workSpace, size_t wkspSize, int flags)
430
882
  {
431
883
  const BYTE* ip = (const BYTE*) cSrc;
432
884
 
433
- size_t const hSize = HUF_readDTableX1_wksp (dctx, cSrc, cSrcSize,
434
- workSpace, wkspSize);
885
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
435
886
  if (HUF_isError(hSize)) return hSize;
436
887
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
437
888
  ip += hSize; cSrcSize -= hSize;
438
889
 
439
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
440
- }
441
-
442
- size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
443
- const void* cSrc, size_t cSrcSize,
444
- void* workSpace, size_t wkspSize)
445
- {
446
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
447
- }
448
-
449
-
450
- size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
451
- {
452
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
453
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
454
- workSpace, sizeof(workSpace));
455
- }
456
- size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
457
- {
458
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
459
- return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
890
+ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
460
891
  }
461
892
 
462
893
  #endif /* HUF_FORCE_DECOMPRESS_X2 */
@@ -469,209 +900,322 @@ size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cS
469
900
  /* *************************/
470
901
 
471
902
  typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */
472
- typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
903
+ typedef struct { BYTE symbol; } sortedSymbol_t;
473
904
  typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
474
905
  typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
475
906
 
907
+ /**
908
+ * Constructs a HUF_DEltX2 in a U32.
909
+ */
910
+ static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level)
911
+ {
912
+ U32 seq;
913
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0);
914
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2);
915
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3);
916
+ DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32));
917
+ if (MEM_isLittleEndian()) {
918
+ seq = level == 1 ? symbol : (baseSeq + (symbol << 8));
919
+ return seq + (nbBits << 16) + ((U32)level << 24);
920
+ } else {
921
+ seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol);
922
+ return (seq << 16) + (nbBits << 8) + (U32)level;
923
+ }
924
+ }
476
925
 
477
- /* HUF_fillDTableX2Level2() :
478
- * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
479
- static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed,
480
- const U32* rankValOrigin, const int minWeight,
481
- const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
482
- U32 nbBitsBaseline, U16 baseSeq)
926
+ /**
927
+ * Constructs a HUF_DEltX2.
928
+ */
929
+ static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level)
483
930
  {
484
931
  HUF_DEltX2 DElt;
485
- U32 rankVal[HUF_TABLELOG_MAX + 1];
932
+ U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
933
+ DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val));
934
+ ZSTD_memcpy(&DElt, &val, sizeof(val));
935
+ return DElt;
936
+ }
937
+
938
+ /**
939
+ * Constructs 2 HUF_DEltX2s and packs them into a U64.
940
+ */
941
+ static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level)
942
+ {
943
+ U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
944
+ return (U64)DElt + ((U64)DElt << 32);
945
+ }
486
946
 
487
- /* get pre-calculated rankVal */
488
- memcpy(rankVal, rankValOrigin, sizeof(rankVal));
947
+ /**
948
+ * Fills the DTable rank with all the symbols from [begin, end) that are each
949
+ * nbBits long.
950
+ *
951
+ * @param DTableRank The start of the rank in the DTable.
952
+ * @param begin The first symbol to fill (inclusive).
953
+ * @param end The last symbol to fill (exclusive).
954
+ * @param nbBits Each symbol is nbBits long.
955
+ * @param tableLog The table log.
956
+ * @param baseSeq If level == 1 { 0 } else { the first level symbol }
957
+ * @param level The level in the table. Must be 1 or 2.
958
+ */
959
+ static void HUF_fillDTableX2ForWeight(
960
+ HUF_DEltX2* DTableRank,
961
+ sortedSymbol_t const* begin, sortedSymbol_t const* end,
962
+ U32 nbBits, U32 tableLog,
963
+ U16 baseSeq, int const level)
964
+ {
965
+ U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */);
966
+ const sortedSymbol_t* ptr;
967
+ assert(level >= 1 && level <= 2);
968
+ switch (length) {
969
+ case 1:
970
+ for (ptr = begin; ptr != end; ++ptr) {
971
+ HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
972
+ *DTableRank++ = DElt;
973
+ }
974
+ break;
975
+ case 2:
976
+ for (ptr = begin; ptr != end; ++ptr) {
977
+ HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
978
+ DTableRank[0] = DElt;
979
+ DTableRank[1] = DElt;
980
+ DTableRank += 2;
981
+ }
982
+ break;
983
+ case 4:
984
+ for (ptr = begin; ptr != end; ++ptr) {
985
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
986
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
987
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
988
+ DTableRank += 4;
989
+ }
990
+ break;
991
+ case 8:
992
+ for (ptr = begin; ptr != end; ++ptr) {
993
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
994
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
995
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
996
+ ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
997
+ ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
998
+ DTableRank += 8;
999
+ }
1000
+ break;
1001
+ default:
1002
+ for (ptr = begin; ptr != end; ++ptr) {
1003
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
1004
+ HUF_DEltX2* const DTableRankEnd = DTableRank + length;
1005
+ for (; DTableRank != DTableRankEnd; DTableRank += 8) {
1006
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
1007
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
1008
+ ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
1009
+ ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
1010
+ }
1011
+ }
1012
+ break;
1013
+ }
1014
+ }
489
1015
 
490
- /* fill skipped values */
1016
+ /* HUF_fillDTableX2Level2() :
1017
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
1018
+ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits,
1019
+ const U32* rankVal, const int minWeight, const int maxWeight1,
1020
+ const sortedSymbol_t* sortedSymbols, U32 const* rankStart,
1021
+ U32 nbBitsBaseline, U16 baseSeq)
1022
+ {
1023
+ /* Fill skipped values (all positions up to rankVal[minWeight]).
1024
+ * These positions only get a single symbol because the combined weight
1025
+ * is too large.
1026
+ */
491
1027
  if (minWeight>1) {
492
- U32 i, skipSize = rankVal[minWeight];
493
- MEM_writeLE16(&(DElt.sequence), baseSeq);
494
- DElt.nbBits = (BYTE)(consumed);
495
- DElt.length = 1;
496
- for (i = 0; i < skipSize; i++)
497
- DTable[i] = DElt;
1028
+ U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */);
1029
+ U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1);
1030
+ int const skipSize = rankVal[minWeight];
1031
+ assert(length > 1);
1032
+ assert((U32)skipSize < length);
1033
+ switch (length) {
1034
+ case 2:
1035
+ assert(skipSize == 1);
1036
+ ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2));
1037
+ break;
1038
+ case 4:
1039
+ assert(skipSize <= 4);
1040
+ ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2));
1041
+ ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2));
1042
+ break;
1043
+ default:
1044
+ {
1045
+ int i;
1046
+ for (i = 0; i < skipSize; i += 8) {
1047
+ ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2));
1048
+ ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2));
1049
+ ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2));
1050
+ ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2));
1051
+ }
1052
+ }
1053
+ }
498
1054
  }
499
1055
 
500
- /* fill DTable */
501
- { U32 s; for (s=0; s<sortedListSize; s++) { /* note : sortedSymbols already skipped */
502
- const U32 symbol = sortedSymbols[s].symbol;
503
- const U32 weight = sortedSymbols[s].weight;
504
- const U32 nbBits = nbBitsBaseline - weight;
505
- const U32 length = 1 << (sizeLog-nbBits);
506
- const U32 start = rankVal[weight];
507
- U32 i = start;
508
- const U32 end = start + length;
509
-
510
- MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
511
- DElt.nbBits = (BYTE)(nbBits + consumed);
512
- DElt.length = 2;
513
- do { DTable[i++] = DElt; } while (i<end); /* since length >= 1 */
514
-
515
- rankVal[weight] += length;
516
- } }
1056
+ /* Fill each of the second level symbols by weight. */
1057
+ {
1058
+ int w;
1059
+ for (w = minWeight; w < maxWeight1; ++w) {
1060
+ int const begin = rankStart[w];
1061
+ int const end = rankStart[w+1];
1062
+ U32 const nbBits = nbBitsBaseline - w;
1063
+ U32 const totalBits = nbBits + consumedBits;
1064
+ HUF_fillDTableX2ForWeight(
1065
+ DTable + rankVal[w],
1066
+ sortedSymbols + begin, sortedSymbols + end,
1067
+ totalBits, targetLog,
1068
+ baseSeq, /* level */ 2);
1069
+ }
1070
+ }
517
1071
  }
518
1072
 
519
-
520
1073
  static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
521
- const sortedSymbol_t* sortedList, const U32 sortedListSize,
522
- const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
1074
+ const sortedSymbol_t* sortedList,
1075
+ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
523
1076
  const U32 nbBitsBaseline)
524
1077
  {
525
- U32 rankVal[HUF_TABLELOG_MAX + 1];
1078
+ U32* const rankVal = rankValOrigin[0];
526
1079
  const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */
527
1080
  const U32 minBits = nbBitsBaseline - maxWeight;
528
- U32 s;
529
-
530
- memcpy(rankVal, rankValOrigin, sizeof(rankVal));
531
-
532
- /* fill DTable */
533
- for (s=0; s<sortedListSize; s++) {
534
- const U16 symbol = sortedList[s].symbol;
535
- const U32 weight = sortedList[s].weight;
536
- const U32 nbBits = nbBitsBaseline - weight;
537
- const U32 start = rankVal[weight];
538
- const U32 length = 1 << (targetLog-nbBits);
539
-
540
- if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */
541
- U32 sortedRank;
1081
+ int w;
1082
+ int const wEnd = (int)maxWeight + 1;
1083
+
1084
+ /* Fill DTable in order of weight. */
1085
+ for (w = 1; w < wEnd; ++w) {
1086
+ int const begin = (int)rankStart[w];
1087
+ int const end = (int)rankStart[w+1];
1088
+ U32 const nbBits = nbBitsBaseline - w;
1089
+
1090
+ if (targetLog-nbBits >= minBits) {
1091
+ /* Enough room for a second symbol. */
1092
+ int start = rankVal[w];
1093
+ U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */);
542
1094
  int minWeight = nbBits + scaleLog;
1095
+ int s;
543
1096
  if (minWeight < 1) minWeight = 1;
544
- sortedRank = rankStart[minWeight];
545
- HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits,
546
- rankValOrigin[nbBits], minWeight,
547
- sortedList+sortedRank, sortedListSize-sortedRank,
548
- nbBitsBaseline, symbol);
1097
+ /* Fill the DTable for every symbol of weight w.
1098
+ * These symbols get at least 1 second symbol.
1099
+ */
1100
+ for (s = begin; s != end; ++s) {
1101
+ HUF_fillDTableX2Level2(
1102
+ DTable + start, targetLog, nbBits,
1103
+ rankValOrigin[nbBits], minWeight, wEnd,
1104
+ sortedList, rankStart,
1105
+ nbBitsBaseline, sortedList[s].symbol);
1106
+ start += length;
1107
+ }
549
1108
  } else {
550
- HUF_DEltX2 DElt;
551
- MEM_writeLE16(&(DElt.sequence), symbol);
552
- DElt.nbBits = (BYTE)(nbBits);
553
- DElt.length = 1;
554
- { U32 const end = start + length;
555
- U32 u;
556
- for (u = start; u < end; u++) DTable[u] = DElt;
557
- } }
558
- rankVal[weight] += length;
1109
+ /* Only a single symbol. */
1110
+ HUF_fillDTableX2ForWeight(
1111
+ DTable + rankVal[w],
1112
+ sortedList + begin, sortedList + end,
1113
+ nbBits, targetLog,
1114
+ /* baseSeq */ 0, /* level */ 1);
1115
+ }
559
1116
  }
560
1117
  }
561
1118
 
1119
+ typedef struct {
1120
+ rankValCol_t rankVal[HUF_TABLELOG_MAX];
1121
+ U32 rankStats[HUF_TABLELOG_MAX + 1];
1122
+ U32 rankStart0[HUF_TABLELOG_MAX + 3];
1123
+ sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
1124
+ BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
1125
+ U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
1126
+ } HUF_ReadDTableX2_Workspace;
1127
+
562
1128
  size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
563
1129
  const void* src, size_t srcSize,
564
- void* workSpace, size_t wkspSize)
1130
+ void* workSpace, size_t wkspSize, int flags)
565
1131
  {
566
- U32 tableLog, maxW, sizeOfSort, nbSymbols;
1132
+ U32 tableLog, maxW, nbSymbols;
567
1133
  DTableDesc dtd = HUF_getDTableDesc(DTable);
568
- U32 const maxTableLog = dtd.maxTableLog;
1134
+ U32 maxTableLog = dtd.maxTableLog;
569
1135
  size_t iSize;
570
1136
  void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */
571
1137
  HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
572
1138
  U32 *rankStart;
573
1139
 
574
- rankValCol_t* rankVal;
575
- U32* rankStats;
576
- U32* rankStart0;
577
- sortedSymbol_t* sortedSymbol;
578
- BYTE* weightList;
579
- size_t spaceUsed32 = 0;
580
-
581
- rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32);
582
- spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
583
- rankStats = (U32 *)workSpace + spaceUsed32;
584
- spaceUsed32 += HUF_TABLELOG_MAX + 1;
585
- rankStart0 = (U32 *)workSpace + spaceUsed32;
586
- spaceUsed32 += HUF_TABLELOG_MAX + 2;
587
- sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t);
588
- spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
589
- weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
590
- spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
591
-
592
- if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
593
-
594
- rankStart = rankStart0 + 1;
595
- memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
1140
+ HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace;
1141
+
1142
+ if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC);
1143
+
1144
+ rankStart = wksp->rankStart0 + 1;
1145
+ ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats));
1146
+ ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0));
596
1147
 
597
1148
  DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */
598
1149
  if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
599
- /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
1150
+ /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzers complain ... */
600
1151
 
601
- iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
1152
+ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
602
1153
  if (HUF_isError(iSize)) return iSize;
603
1154
 
604
1155
  /* check result */
605
1156
  if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */
1157
+ if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG;
606
1158
 
607
1159
  /* find maxWeight */
608
- for (maxW = tableLog; rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
1160
+ for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
609
1161
 
610
1162
  /* Get start index of each weight */
611
1163
  { U32 w, nextRankStart = 0;
612
1164
  for (w=1; w<maxW+1; w++) {
613
- U32 current = nextRankStart;
614
- nextRankStart += rankStats[w];
615
- rankStart[w] = current;
1165
+ U32 curr = nextRankStart;
1166
+ nextRankStart += wksp->rankStats[w];
1167
+ rankStart[w] = curr;
616
1168
  }
617
1169
  rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/
618
- sizeOfSort = nextRankStart;
1170
+ rankStart[maxW+1] = nextRankStart;
619
1171
  }
620
1172
 
621
1173
  /* sort symbols by weight */
622
1174
  { U32 s;
623
1175
  for (s=0; s<nbSymbols; s++) {
624
- U32 const w = weightList[s];
1176
+ U32 const w = wksp->weightList[s];
625
1177
  U32 const r = rankStart[w]++;
626
- sortedSymbol[r].symbol = (BYTE)s;
627
- sortedSymbol[r].weight = (BYTE)w;
1178
+ wksp->sortedSymbol[r].symbol = (BYTE)s;
628
1179
  }
629
1180
  rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */
630
1181
  }
631
1182
 
632
1183
  /* Build rankVal */
633
- { U32* const rankVal0 = rankVal[0];
1184
+ { U32* const rankVal0 = wksp->rankVal[0];
634
1185
  { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */
635
1186
  U32 nextRankVal = 0;
636
1187
  U32 w;
637
1188
  for (w=1; w<maxW+1; w++) {
638
- U32 current = nextRankVal;
639
- nextRankVal += rankStats[w] << (w+rescale);
640
- rankVal0[w] = current;
1189
+ U32 curr = nextRankVal;
1190
+ nextRankVal += wksp->rankStats[w] << (w+rescale);
1191
+ rankVal0[w] = curr;
641
1192
  } }
642
1193
  { U32 const minBits = tableLog+1 - maxW;
643
1194
  U32 consumed;
644
1195
  for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
645
- U32* const rankValPtr = rankVal[consumed];
1196
+ U32* const rankValPtr = wksp->rankVal[consumed];
646
1197
  U32 w;
647
1198
  for (w = 1; w < maxW+1; w++) {
648
1199
  rankValPtr[w] = rankVal0[w] >> consumed;
649
1200
  } } } }
650
1201
 
651
1202
  HUF_fillDTableX2(dt, maxTableLog,
652
- sortedSymbol, sizeOfSort,
653
- rankStart0, rankVal, maxW,
1203
+ wksp->sortedSymbol,
1204
+ wksp->rankStart0, wksp->rankVal, maxW,
654
1205
  tableLog+1);
655
1206
 
656
1207
  dtd.tableLog = (BYTE)maxTableLog;
657
1208
  dtd.tableType = 1;
658
- memcpy(DTable, &dtd, sizeof(dtd));
1209
+ ZSTD_memcpy(DTable, &dtd, sizeof(dtd));
659
1210
  return iSize;
660
1211
  }
661
1212
 
662
- size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
663
- {
664
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
665
- return HUF_readDTableX2_wksp(DTable, src, srcSize,
666
- workSpace, sizeof(workSpace));
667
- }
668
-
669
1213
 
670
1214
  FORCE_INLINE_TEMPLATE U32
671
1215
  HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
672
1216
  {
673
1217
  size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
674
- memcpy(op, dt+val, 2);
1218
+ ZSTD_memcpy(op, &dt[val].sequence, 2);
675
1219
  BIT_skipBits(DStream, dt[val].nbBits);
676
1220
  return dt[val].length;
677
1221
  }
@@ -680,15 +1224,17 @@ FORCE_INLINE_TEMPLATE U32
680
1224
  HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
681
1225
  {
682
1226
  size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
683
- memcpy(op, dt+val, 1);
684
- if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
685
- else {
1227
+ ZSTD_memcpy(op, &dt[val].sequence, 1);
1228
+ if (dt[val].length==1) {
1229
+ BIT_skipBits(DStream, dt[val].nbBits);
1230
+ } else {
686
1231
  if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
687
1232
  BIT_skipBits(DStream, dt[val].nbBits);
688
1233
  if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
689
1234
  /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
690
1235
  DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
691
- } }
1236
+ }
1237
+ }
692
1238
  return 1;
693
1239
  }
694
1240
 
@@ -710,19 +1256,37 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
710
1256
  BYTE* const pStart = p;
711
1257
 
712
1258
  /* up to 8 symbols at a time */
713
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
714
- HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
715
- HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
716
- HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
717
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1259
+ if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) {
1260
+ if (dtLog <= 11 && MEM_64bits()) {
1261
+ /* up to 10 symbols at a time */
1262
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) {
1263
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1264
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1265
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1266
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1267
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1268
+ }
1269
+ } else {
1270
+ /* up to 8 symbols at a time */
1271
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
1272
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
1273
+ HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
1274
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
1275
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1276
+ }
1277
+ }
1278
+ } else {
1279
+ BIT_reloadDStream(bitDPtr);
718
1280
  }
719
1281
 
720
1282
  /* closer to end : up to 2 symbols at a time */
721
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
722
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1283
+ if ((size_t)(pEnd - p) >= 2) {
1284
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
1285
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
723
1286
 
724
- while (p <= pEnd-2)
725
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
1287
+ while (p <= pEnd-2)
1288
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
1289
+ }
726
1290
 
727
1291
  if (p < pEnd)
728
1292
  p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
@@ -757,7 +1321,10 @@ HUF_decompress1X2_usingDTable_internal_body(
757
1321
  return dstSize;
758
1322
  }
759
1323
 
760
-
1324
+ /* HUF_decompress4X2_usingDTable_internal_body():
1325
+ * Conditions:
1326
+ * @dstSize >= 6
1327
+ */
761
1328
  FORCE_INLINE_TEMPLATE size_t
762
1329
  HUF_decompress4X2_usingDTable_internal_body(
763
1330
  void* dst, size_t dstSize,
@@ -769,6 +1336,7 @@ HUF_decompress4X2_usingDTable_internal_body(
769
1336
  { const BYTE* const istart = (const BYTE*) cSrc;
770
1337
  BYTE* const ostart = (BYTE*) dst;
771
1338
  BYTE* const oend = ostart + dstSize;
1339
+ BYTE* const olimit = oend - (sizeof(size_t)-1);
772
1340
  const void* const dtPtr = DTable+1;
773
1341
  const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
774
1342
 
@@ -793,37 +1361,66 @@ HUF_decompress4X2_usingDTable_internal_body(
793
1361
  BYTE* op2 = opStart2;
794
1362
  BYTE* op3 = opStart3;
795
1363
  BYTE* op4 = opStart4;
796
- U32 endSignal;
1364
+ U32 endSignal = 1;
797
1365
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
798
1366
  U32 const dtLog = dtd.tableLog;
799
1367
 
800
- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1368
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1369
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
1370
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
801
1371
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
802
1372
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
803
1373
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
804
1374
  CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
805
1375
 
806
1376
  /* 16-32 symbols per loop (4-8 symbols per stream) */
807
- endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
808
- for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
809
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
810
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
811
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
812
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
813
- HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
814
- HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
815
- HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
816
- HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
817
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
818
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
819
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
820
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
821
- HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
822
- HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
823
- HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
824
- HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
825
-
826
- endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
1377
+ if ((size_t)(oend - op4) >= sizeof(size_t)) {
1378
+ for ( ; (endSignal) & (op4 < olimit); ) {
1379
+ #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
1380
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1381
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
1382
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1383
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
1384
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1385
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
1386
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1387
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
1388
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
1389
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
1390
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1391
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
1392
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1393
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
1394
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1395
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
1396
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1397
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
1398
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
1399
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
1400
+ #else
1401
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1402
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1403
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1404
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1405
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
1406
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
1407
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
1408
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
1409
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1410
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1411
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1412
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1413
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
1414
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
1415
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
1416
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
1417
+ endSignal = (U32)LIKELY((U32)
1418
+ (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
1419
+ & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
1420
+ & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
1421
+ & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
1422
+ #endif
1423
+ }
827
1424
  }
828
1425
 
829
1426
  /* check corruption */
@@ -847,94 +1444,279 @@ HUF_decompress4X2_usingDTable_internal_body(
847
1444
  }
848
1445
  }
849
1446
 
850
- HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
851
- HUF_DGEN(HUF_decompress4X2_usingDTable_internal)
1447
+ #if HUF_NEED_BMI2_FUNCTION
1448
+ static BMI2_TARGET_ATTRIBUTE
1449
+ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
1450
+ size_t cSrcSize, HUF_DTable const* DTable) {
1451
+ return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
1452
+ }
1453
+ #endif
852
1454
 
853
- size_t HUF_decompress1X2_usingDTable(
854
- void* dst, size_t dstSize,
855
- const void* cSrc, size_t cSrcSize,
856
- const HUF_DTable* DTable)
857
- {
858
- DTableDesc dtd = HUF_getDTableDesc(DTable);
859
- if (dtd.tableType != 1) return ERROR(GENERIC);
860
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1455
+ static
1456
+ size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
1457
+ size_t cSrcSize, HUF_DTable const* DTable) {
1458
+ return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
861
1459
  }
862
1460
 
863
- size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
864
- const void* cSrc, size_t cSrcSize,
865
- void* workSpace, size_t wkspSize)
1461
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
1462
+
1463
+ HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
1464
+
1465
+ #endif
1466
+
1467
+ static HUF_FAST_BMI2_ATTRS
1468
+ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
866
1469
  {
867
- const BYTE* ip = (const BYTE*) cSrc;
1470
+ U64 bits[4];
1471
+ BYTE const* ip[4];
1472
+ BYTE* op[4];
1473
+ BYTE* oend[4];
1474
+ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
1475
+ BYTE const* const ilimit = args->ilimit;
1476
+
1477
+ /* Copy the arguments to local registers. */
1478
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
1479
+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
1480
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
1481
+
1482
+ oend[0] = op[1];
1483
+ oend[1] = op[2];
1484
+ oend[2] = op[3];
1485
+ oend[3] = args->oend;
1486
+
1487
+ assert(MEM_isLittleEndian());
1488
+ assert(!MEM_32bits());
1489
+
1490
+ for (;;) {
1491
+ BYTE* olimit;
1492
+ int stream;
1493
+ int symbol;
1494
+
1495
+ /* Assert loop preconditions */
1496
+ #ifndef NDEBUG
1497
+ for (stream = 0; stream < 4; ++stream) {
1498
+ assert(op[stream] <= oend[stream]);
1499
+ assert(ip[stream] >= ilimit);
1500
+ }
1501
+ #endif
1502
+ /* Compute olimit */
1503
+ {
1504
+ /* Each loop does 5 table lookups for each of the 4 streams.
1505
+ * Each table lookup consumes up to 11 bits of input, and produces
1506
+ * up to 2 bytes of output.
1507
+ */
1508
+ /* We can consume up to 7 bytes of input per iteration per stream.
1509
+ * We also know that each input pointer is >= ip[0]. So we can run
1510
+ * iters loops before running out of input.
1511
+ */
1512
+ size_t iters = (size_t)(ip[0] - ilimit) / 7;
1513
+ /* Each iteration can produce up to 10 bytes of output per stream.
1514
+ * Each output stream my advance at different rates. So take the
1515
+ * minimum number of safe iterations among all the output streams.
1516
+ */
1517
+ for (stream = 0; stream < 4; ++stream) {
1518
+ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
1519
+ iters = MIN(iters, oiters);
1520
+ }
1521
+
1522
+ /* Each iteration produces at least 5 output symbols. So until
1523
+ * op[3] crosses olimit, we know we haven't executed iters
1524
+ * iterations yet. This saves us maintaining an iters counter,
1525
+ * at the expense of computing the remaining # of iterations
1526
+ * more frequently.
1527
+ */
1528
+ olimit = op[3] + (iters * 5);
1529
+
1530
+ /* Exit the fast decoding loop if we are too close to the end. */
1531
+ if (op[3] + 10 > olimit)
1532
+ break;
1533
+
1534
+ /* Exit the decoding loop if any input pointer has crossed the
1535
+ * previous one. This indicates corruption, and a precondition
1536
+ * to our loop is that ip[i] >= ip[0].
1537
+ */
1538
+ for (stream = 1; stream < 4; ++stream) {
1539
+ if (ip[stream] < ip[stream - 1])
1540
+ goto _out;
1541
+ }
1542
+ }
868
1543
 
869
- size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
870
- workSpace, wkspSize);
871
- if (HUF_isError(hSize)) return hSize;
872
- if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
873
- ip += hSize; cSrcSize -= hSize;
1544
+ #ifndef NDEBUG
1545
+ for (stream = 1; stream < 4; ++stream) {
1546
+ assert(ip[stream] >= ip[stream - 1]);
1547
+ }
1548
+ #endif
874
1549
 
875
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
876
- }
1550
+ do {
1551
+ /* Do 5 table lookups for each of the first 3 streams */
1552
+ for (symbol = 0; symbol < 5; ++symbol) {
1553
+ for (stream = 0; stream < 3; ++stream) {
1554
+ int const index = (int)(bits[stream] >> 53);
1555
+ HUF_DEltX2 const entry = dtable[index];
1556
+ MEM_write16(op[stream], entry.sequence);
1557
+ bits[stream] <<= (entry.nbBits);
1558
+ op[stream] += (entry.length);
1559
+ }
1560
+ }
1561
+ /* Do 1 table lookup from the final stream */
1562
+ {
1563
+ int const index = (int)(bits[3] >> 53);
1564
+ HUF_DEltX2 const entry = dtable[index];
1565
+ MEM_write16(op[3], entry.sequence);
1566
+ bits[3] <<= (entry.nbBits);
1567
+ op[3] += (entry.length);
1568
+ }
1569
+ /* Do 4 table lookups from the final stream & reload bitstreams */
1570
+ for (stream = 0; stream < 4; ++stream) {
1571
+ /* Do a table lookup from the final stream.
1572
+ * This is interleaved with the reloading to reduce register
1573
+ * pressure. This shouldn't be necessary, but compilers can
1574
+ * struggle with codegen with high register pressure.
1575
+ */
1576
+ {
1577
+ int const index = (int)(bits[3] >> 53);
1578
+ HUF_DEltX2 const entry = dtable[index];
1579
+ MEM_write16(op[3], entry.sequence);
1580
+ bits[3] <<= (entry.nbBits);
1581
+ op[3] += (entry.length);
1582
+ }
1583
+ /* Reload the bitstreams. The final bitstream must be reloaded
1584
+ * after the 5th symbol was decoded.
1585
+ */
1586
+ {
1587
+ int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
1588
+ int const nbBits = ctz & 7;
1589
+ int const nbBytes = ctz >> 3;
1590
+ ip[stream] -= nbBytes;
1591
+ bits[stream] = MEM_read64(ip[stream]) | 1;
1592
+ bits[stream] <<= nbBits;
1593
+ }
1594
+ }
1595
+ } while (op[3] < olimit);
1596
+ }
877
1597
 
1598
+ _out:
878
1599
 
879
- size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
880
- const void* cSrc, size_t cSrcSize)
881
- {
882
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
883
- return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
884
- workSpace, sizeof(workSpace));
1600
+ /* Save the final values of each of the state variables back to args. */
1601
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
1602
+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
1603
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
885
1604
  }
886
1605
 
887
- size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
888
- {
889
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
890
- return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
891
- }
892
1606
 
893
- size_t HUF_decompress4X2_usingDTable(
1607
+ static HUF_FAST_BMI2_ATTRS size_t
1608
+ HUF_decompress4X2_usingDTable_internal_fast(
894
1609
  void* dst, size_t dstSize,
895
1610
  const void* cSrc, size_t cSrcSize,
896
- const HUF_DTable* DTable)
1611
+ const HUF_DTable* DTable,
1612
+ HUF_DecompressFastLoopFn loopFn) {
1613
+ void const* dt = DTable + 1;
1614
+ const BYTE* const iend = (const BYTE*)cSrc + 6;
1615
+ BYTE* const oend = (BYTE*)dst + dstSize;
1616
+ HUF_DecompressFastArgs args;
1617
+ {
1618
+ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
1619
+ FORWARD_IF_ERROR(ret, "Failed to init asm args");
1620
+ if (ret == 0)
1621
+ return 0;
1622
+ }
1623
+
1624
+ assert(args.ip[0] >= args.ilimit);
1625
+ loopFn(&args);
1626
+
1627
+ /* note : op4 already verified within main loop */
1628
+ assert(args.ip[0] >= iend);
1629
+ assert(args.ip[1] >= iend);
1630
+ assert(args.ip[2] >= iend);
1631
+ assert(args.ip[3] >= iend);
1632
+ assert(args.op[3] <= oend);
1633
+ (void)iend;
1634
+
1635
+ /* finish bitStreams one by one */
1636
+ {
1637
+ size_t const segmentSize = (dstSize+3) / 4;
1638
+ BYTE* segmentEnd = (BYTE*)dst;
1639
+ int i;
1640
+ for (i = 0; i < 4; ++i) {
1641
+ BIT_DStream_t bit;
1642
+ if (segmentSize <= (size_t)(oend - segmentEnd))
1643
+ segmentEnd += segmentSize;
1644
+ else
1645
+ segmentEnd = oend;
1646
+ FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
1647
+ args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG);
1648
+ if (args.op[i] != segmentEnd)
1649
+ return ERROR(corruption_detected);
1650
+ }
1651
+ }
1652
+
1653
+ /* decoded size */
1654
+ return dstSize;
1655
+ }
1656
+
1657
+ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
1658
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
897
1659
  {
898
- DTableDesc dtd = HUF_getDTableDesc(DTable);
899
- if (dtd.tableType != 1) return ERROR(GENERIC);
900
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1660
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
1661
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
1662
+
1663
+ #if DYNAMIC_BMI2
1664
+ if (flags & HUF_flags_bmi2) {
1665
+ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
1666
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
1667
+ if (!(flags & HUF_flags_disableAsm)) {
1668
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
1669
+ }
1670
+ # endif
1671
+ } else {
1672
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
1673
+ }
1674
+ #endif
1675
+
1676
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
1677
+ if (!(flags & HUF_flags_disableAsm)) {
1678
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
1679
+ }
1680
+ #endif
1681
+
1682
+ if (!(flags & HUF_flags_disableFast)) {
1683
+ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
1684
+ if (ret != 0)
1685
+ return ret;
1686
+ }
1687
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
901
1688
  }
902
1689
 
903
- static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
1690
+ HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
1691
+
1692
+ size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
904
1693
  const void* cSrc, size_t cSrcSize,
905
- void* workSpace, size_t wkspSize, int bmi2)
1694
+ void* workSpace, size_t wkspSize, int flags)
906
1695
  {
907
1696
  const BYTE* ip = (const BYTE*) cSrc;
908
1697
 
909
- size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
910
- workSpace, wkspSize);
1698
+ size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
1699
+ workSpace, wkspSize, flags);
911
1700
  if (HUF_isError(hSize)) return hSize;
912
1701
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
913
1702
  ip += hSize; cSrcSize -= hSize;
914
1703
 
915
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
1704
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
916
1705
  }
917
1706
 
918
- size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1707
+ static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
919
1708
  const void* cSrc, size_t cSrcSize,
920
- void* workSpace, size_t wkspSize)
1709
+ void* workSpace, size_t wkspSize, int flags)
921
1710
  {
922
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
923
- }
924
-
1711
+ const BYTE* ip = (const BYTE*) cSrc;
925
1712
 
926
- size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
927
- const void* cSrc, size_t cSrcSize)
928
- {
929
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
930
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
931
- workSpace, sizeof(workSpace));
932
- }
1713
+ size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
1714
+ workSpace, wkspSize, flags);
1715
+ if (HUF_isError(hSize)) return hSize;
1716
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1717
+ ip += hSize; cSrcSize -= hSize;
933
1718
 
934
- size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
935
- {
936
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
937
- return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1719
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
938
1720
  }
939
1721
 
940
1722
  #endif /* HUF_FORCE_DECOMPRESS_X1 */
@@ -944,66 +1726,28 @@ size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cS
944
1726
  /* Universal decompression selectors */
945
1727
  /* ***********************************/
946
1728
 
947
- size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
948
- const void* cSrc, size_t cSrcSize,
949
- const HUF_DTable* DTable)
950
- {
951
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
952
- #if defined(HUF_FORCE_DECOMPRESS_X1)
953
- (void)dtd;
954
- assert(dtd.tableType == 0);
955
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
956
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
957
- (void)dtd;
958
- assert(dtd.tableType == 1);
959
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
960
- #else
961
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
962
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
963
- #endif
964
- }
965
-
966
- size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
967
- const void* cSrc, size_t cSrcSize,
968
- const HUF_DTable* DTable)
969
- {
970
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
971
- #if defined(HUF_FORCE_DECOMPRESS_X1)
972
- (void)dtd;
973
- assert(dtd.tableType == 0);
974
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
975
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
976
- (void)dtd;
977
- assert(dtd.tableType == 1);
978
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
979
- #else
980
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
981
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
982
- #endif
983
- }
984
-
985
1729
 
986
1730
  #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
987
1731
  typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
988
- static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
1732
+ static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] =
989
1733
  {
990
1734
  /* single, double */
991
- {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */
992
- {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */
993
- {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */
994
- {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */
995
- {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */
996
- {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */
997
- {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */
998
- {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */
999
- {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */
1000
- {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */
1001
- {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */
1002
- {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */
1003
- {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */
1004
- {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */
1005
- {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */
1006
- {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */
1735
+ {{0,0}, {1,1}}, /* Q==0 : impossible */
1736
+ {{0,0}, {1,1}}, /* Q==1 : impossible */
1737
+ {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */
1738
+ {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */
1739
+ {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */
1740
+ {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */
1741
+ {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */
1742
+ {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */
1743
+ {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */
1744
+ {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */
1745
+ {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */
1746
+ {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */
1747
+ {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */
1748
+ {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */
1749
+ {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */
1750
+ {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */
1007
1751
  };
1008
1752
  #endif
1009
1753
 
@@ -1030,188 +1774,92 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
1030
1774
  U32 const D256 = (U32)(dstSize >> 8);
1031
1775
  U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
1032
1776
  U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
1033
- DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */
1777
+ DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */
1034
1778
  return DTime1 < DTime0;
1035
1779
  }
1036
1780
  #endif
1037
1781
  }
1038
1782
 
1039
-
1040
- typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
1041
-
1042
- size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1043
- {
1044
- #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1045
- static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 };
1046
- #endif
1047
-
1048
- /* validation checks */
1049
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1050
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1051
- if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1052
- if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1053
-
1054
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1055
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1056
- (void)algoNb;
1057
- assert(algoNb == 0);
1058
- return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize);
1059
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1060
- (void)algoNb;
1061
- assert(algoNb == 1);
1062
- return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);
1063
- #else
1064
- return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
1065
- #endif
1066
- }
1067
- }
1068
-
1069
- size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1070
- {
1071
- /* validation checks */
1072
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1073
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1074
- if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1075
- if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1076
-
1077
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1078
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1079
- (void)algoNb;
1080
- assert(algoNb == 0);
1081
- return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1082
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1083
- (void)algoNb;
1084
- assert(algoNb == 1);
1085
- return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1086
- #else
1087
- return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
1088
- HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
1089
- #endif
1090
- }
1091
- }
1092
-
1093
- size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1094
- {
1095
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1096
- return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1097
- workSpace, sizeof(workSpace));
1098
- }
1099
-
1100
-
1101
- size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
1102
- size_t dstSize, const void* cSrc,
1103
- size_t cSrcSize, void* workSpace,
1104
- size_t wkspSize)
1105
- {
1106
- /* validation checks */
1107
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1108
- if (cSrcSize == 0) return ERROR(corruption_detected);
1109
-
1110
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1111
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1112
- (void)algoNb;
1113
- assert(algoNb == 0);
1114
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1115
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1116
- (void)algoNb;
1117
- assert(algoNb == 1);
1118
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1119
- #else
1120
- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1121
- cSrcSize, workSpace, wkspSize):
1122
- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1123
- #endif
1124
- }
1125
- }
1126
-
1127
1783
  size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1128
1784
  const void* cSrc, size_t cSrcSize,
1129
- void* workSpace, size_t wkspSize)
1785
+ void* workSpace, size_t wkspSize, int flags)
1130
1786
  {
1131
1787
  /* validation checks */
1132
1788
  if (dstSize == 0) return ERROR(dstSize_tooSmall);
1133
1789
  if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1134
- if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1135
- if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1790
+ if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1791
+ if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1136
1792
 
1137
1793
  { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1138
1794
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1139
1795
  (void)algoNb;
1140
1796
  assert(algoNb == 0);
1141
1797
  return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
1142
- cSrcSize, workSpace, wkspSize);
1798
+ cSrcSize, workSpace, wkspSize, flags);
1143
1799
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1144
1800
  (void)algoNb;
1145
1801
  assert(algoNb == 1);
1146
1802
  return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1147
- cSrcSize, workSpace, wkspSize);
1803
+ cSrcSize, workSpace, wkspSize, flags);
1148
1804
  #else
1149
1805
  return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1150
- cSrcSize, workSpace, wkspSize):
1806
+ cSrcSize, workSpace, wkspSize, flags):
1151
1807
  HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
1152
- cSrcSize, workSpace, wkspSize);
1808
+ cSrcSize, workSpace, wkspSize, flags);
1153
1809
  #endif
1154
1810
  }
1155
1811
  }
1156
1812
 
1157
- size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
1158
- const void* cSrc, size_t cSrcSize)
1159
- {
1160
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1161
- return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1162
- workSpace, sizeof(workSpace));
1163
- }
1164
-
1165
1813
 
1166
- size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1814
+ size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
1167
1815
  {
1168
1816
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1169
1817
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1170
1818
  (void)dtd;
1171
1819
  assert(dtd.tableType == 0);
1172
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1820
+ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1173
1821
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1174
1822
  (void)dtd;
1175
1823
  assert(dtd.tableType == 1);
1176
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1824
+ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1177
1825
  #else
1178
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1179
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1826
+ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
1827
+ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1180
1828
  #endif
1181
1829
  }
1182
1830
 
1183
1831
  #ifndef HUF_FORCE_DECOMPRESS_X2
1184
- size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1832
+ size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
1185
1833
  {
1186
1834
  const BYTE* ip = (const BYTE*) cSrc;
1187
1835
 
1188
- size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
1836
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
1189
1837
  if (HUF_isError(hSize)) return hSize;
1190
1838
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1191
1839
  ip += hSize; cSrcSize -= hSize;
1192
1840
 
1193
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
1841
+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
1194
1842
  }
1195
1843
  #endif
1196
1844
 
1197
- size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1845
+ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
1198
1846
  {
1199
1847
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1200
1848
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1201
1849
  (void)dtd;
1202
1850
  assert(dtd.tableType == 0);
1203
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1851
+ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1204
1852
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1205
1853
  (void)dtd;
1206
1854
  assert(dtd.tableType == 1);
1207
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1855
+ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1208
1856
  #else
1209
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1210
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1857
+ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
1858
+ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1211
1859
  #endif
1212
1860
  }
1213
1861
 
1214
- size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1862
+ size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
1215
1863
  {
1216
1864
  /* validation checks */
1217
1865
  if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1221,14 +1869,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
1221
1869
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1222
1870
  (void)algoNb;
1223
1871
  assert(algoNb == 0);
1224
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1872
+ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1225
1873
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1226
1874
  (void)algoNb;
1227
1875
  assert(algoNb == 1);
1228
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1876
+ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1229
1877
  #else
1230
- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
1231
- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1878
+ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
1879
+ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1232
1880
  #endif
1233
1881
  }
1234
1882
  }