zstd-ruby 1.4.4.0 → 1.5.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/README.md +78 -5
  4. data/Rakefile +8 -2
  5. data/ext/zstdruby/common.h +15 -0
  6. data/ext/zstdruby/extconf.rb +3 -2
  7. data/ext/zstdruby/libzstd/common/allocations.h +55 -0
  8. data/ext/zstdruby/libzstd/common/bits.h +200 -0
  9. data/ext/zstdruby/libzstd/common/bitstream.h +74 -97
  10. data/ext/zstdruby/libzstd/common/compiler.h +219 -20
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  12. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  13. data/ext/zstdruby/libzstd/common/debug.h +22 -49
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +184 -80
  15. data/ext/zstdruby/libzstd/common/error_private.c +11 -2
  16. data/ext/zstdruby/libzstd/common/error_private.h +87 -4
  17. data/ext/zstdruby/libzstd/common/fse.h +47 -116
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +127 -127
  19. data/ext/zstdruby/libzstd/common/huf.h +112 -197
  20. data/ext/zstdruby/libzstd/common/mem.h +124 -142
  21. data/ext/zstdruby/libzstd/common/pool.c +54 -27
  22. data/ext/zstdruby/libzstd/common/pool.h +11 -5
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +156 -0
  24. data/ext/zstdruby/libzstd/common/threading.c +78 -22
  25. data/ext/zstdruby/libzstd/common/threading.h +9 -13
  26. data/ext/zstdruby/libzstd/common/xxhash.c +15 -873
  27. data/ext/zstdruby/libzstd/common/xxhash.h +5572 -191
  28. data/ext/zstdruby/libzstd/common/zstd_common.c +2 -37
  29. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  30. data/ext/zstdruby/libzstd/common/zstd_internal.h +186 -144
  31. data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
  32. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  33. data/ext/zstdruby/libzstd/compress/fse_compress.c +99 -196
  34. data/ext/zstdruby/libzstd/compress/hist.c +41 -63
  35. data/ext/zstdruby/libzstd/compress/hist.h +13 -33
  36. data/ext/zstdruby/libzstd/compress/huf_compress.c +968 -331
  37. data/ext/zstdruby/libzstd/compress/zstd_compress.c +4120 -1191
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +688 -159
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +121 -40
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +16 -6
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +62 -35
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +10 -3
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +577 -0
  44. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  45. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +322 -115
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +394 -154
  47. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +4 -3
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.c +729 -253
  49. data/ext/zstdruby/libzstd/compress/zstd_fast.h +4 -3
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1289 -247
  51. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +61 -1
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +339 -212
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
  54. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.c +508 -282
  56. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +217 -466
  58. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +35 -114
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1220 -572
  60. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +576 -0
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +23 -19
  62. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +859 -273
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1244 -375
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +21 -7
  66. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +74 -11
  67. data/ext/zstdruby/libzstd/dictBuilder/cover.c +75 -54
  68. data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
  69. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  70. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +55 -36
  71. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +126 -110
  72. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +248 -56
  73. data/ext/zstdruby/libzstd/zstd.h +1277 -306
  74. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +29 -8
  75. data/ext/zstdruby/main.c +20 -0
  76. data/ext/zstdruby/skippable_frame.c +63 -0
  77. data/ext/zstdruby/streaming_compress.c +177 -0
  78. data/ext/zstdruby/streaming_compress.h +5 -0
  79. data/ext/zstdruby/streaming_decompress.c +123 -0
  80. data/ext/zstdruby/zstdruby.c +114 -32
  81. data/lib/zstd-ruby/version.rb +1 -1
  82. data/lib/zstd-ruby.rb +0 -1
  83. data/zstd-ruby.gemspec +1 -1
  84. metadata +24 -39
  85. data/.travis.yml +0 -14
  86. data/ext/zstdruby/libzstd/.gitignore +0 -3
  87. data/ext/zstdruby/libzstd/BUCK +0 -234
  88. data/ext/zstdruby/libzstd/Makefile +0 -289
  89. data/ext/zstdruby/libzstd/README.md +0 -159
  90. data/ext/zstdruby/libzstd/deprecated/zbuff.h +0 -214
  91. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +0 -26
  92. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +0 -147
  93. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +0 -75
  94. data/ext/zstdruby/libzstd/dll/example/Makefile +0 -47
  95. data/ext/zstdruby/libzstd/dll/example/README.md +0 -69
  96. data/ext/zstdruby/libzstd/dll/example/build_package.bat +0 -20
  97. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.sln +0 -25
  98. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.vcxproj +0 -181
  99. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +0 -415
  100. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +0 -2152
  101. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +0 -94
  102. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +0 -3514
  103. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +0 -93
  104. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +0 -3156
  105. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +0 -93
  106. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +0 -3641
  107. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +0 -142
  108. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +0 -4046
  109. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +0 -162
  110. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +0 -4150
  111. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +0 -172
  112. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +0 -4533
  113. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +0 -187
  114. data/ext/zstdruby/libzstd/libzstd.pc.in +0 -15
  115. data/ext/zstdruby/zstdruby.h +0 -6
@@ -1,47 +1,34 @@
1
1
  /* ******************************************************************
2
- huff0 huffman decoder,
3
- part of Finite State Entropy library
4
- Copyright (C) 2013-present, Yann Collet.
5
-
6
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
7
-
8
- Redistribution and use in source and binary forms, with or without
9
- modification, are permitted provided that the following conditions are
10
- met:
11
-
12
- * Redistributions of source code must retain the above copyright
13
- notice, this list of conditions and the following disclaimer.
14
- * Redistributions in binary form must reproduce the above
15
- copyright notice, this list of conditions and the following disclaimer
16
- in the documentation and/or other materials provided with the
17
- distribution.
18
-
19
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
-
31
- You can contact the author at :
32
- - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
2
+ * huff0 huffman decoder,
3
+ * part of Finite State Entropy library
4
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
5
+ *
6
+ * You can contact the author at :
7
+ * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
8
+ *
9
+ * This source code is licensed under both the BSD-style license (found in the
10
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
11
+ * in the COPYING file in the root directory of this source tree).
12
+ * You may select, at your option, one of the above-listed licenses.
33
13
  ****************************************************************** */
34
14
 
35
15
  /* **************************************************************
36
16
  * Dependencies
37
17
  ****************************************************************/
38
- #include <string.h> /* memcpy, memset */
39
- #include "compiler.h"
40
- #include "bitstream.h" /* BIT_* */
41
- #include "fse.h" /* to compress headers */
42
- #define HUF_STATIC_LINKING_ONLY
43
- #include "huf.h"
44
- #include "error_private.h"
18
+ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset */
19
+ #include "../common/compiler.h"
20
+ #include "../common/bitstream.h" /* BIT_* */
21
+ #include "../common/fse.h" /* to compress headers */
22
+ #include "../common/huf.h"
23
+ #include "../common/error_private.h"
24
+ #include "../common/zstd_internal.h"
25
+ #include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
26
+
27
+ /* **************************************************************
28
+ * Constants
29
+ ****************************************************************/
30
+
31
+ #define HUF_DECODER_FAST_TABLELOG 11
45
32
 
46
33
  /* **************************************************************
47
34
  * Macros
@@ -56,14 +43,33 @@
56
43
  #error "Cannot force the use of the X1 and X2 decoders at the same time!"
57
44
  #endif
58
45
 
46
+ /* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
47
+ * supported at runtime, so we can add the BMI2 target attribute.
48
+ * When it is disabled, we will still get BMI2 if it is enabled statically.
49
+ */
50
+ #if DYNAMIC_BMI2
51
+ # define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
52
+ #else
53
+ # define HUF_FAST_BMI2_ATTRS
54
+ #endif
55
+
56
+ #ifdef __cplusplus
57
+ # define HUF_EXTERN_C extern "C"
58
+ #else
59
+ # define HUF_EXTERN_C
60
+ #endif
61
+ #define HUF_ASM_DECL HUF_EXTERN_C
62
+
63
+ #if DYNAMIC_BMI2
64
+ # define HUF_NEED_BMI2_FUNCTION 1
65
+ #else
66
+ # define HUF_NEED_BMI2_FUNCTION 0
67
+ #endif
59
68
 
60
69
  /* **************************************************************
61
70
  * Error Management
62
71
  ****************************************************************/
63
72
  #define HUF_isError ERR_isError
64
- #ifndef CHECK_F
65
- #define CHECK_F(f) { size_t const err_ = (f); if (HUF_isError(err_)) return err_; }
66
- #endif
67
73
 
68
74
 
69
75
  /* **************************************************************
@@ -76,6 +82,11 @@
76
82
  /* **************************************************************
77
83
  * BMI2 Variant Wrappers
78
84
  ****************************************************************/
85
+ typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
86
+ const void *cSrc,
87
+ size_t cSrcSize,
88
+ const HUF_DTable *DTable);
89
+
79
90
  #if DYNAMIC_BMI2
80
91
 
81
92
  #define HUF_DGEN(fn) \
@@ -88,7 +99,7 @@
88
99
  return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
89
100
  } \
90
101
  \
91
- static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \
102
+ static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \
92
103
  void* dst, size_t dstSize, \
93
104
  const void* cSrc, size_t cSrcSize, \
94
105
  const HUF_DTable* DTable) \
@@ -97,9 +108,9 @@
97
108
  } \
98
109
  \
99
110
  static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
100
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
111
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
101
112
  { \
102
- if (bmi2) { \
113
+ if (flags & HUF_flags_bmi2) { \
103
114
  return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
104
115
  } \
105
116
  return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
@@ -109,9 +120,9 @@
109
120
 
110
121
  #define HUF_DGEN(fn) \
111
122
  static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
112
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
123
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
113
124
  { \
114
- (void)bmi2; \
125
+ (void)flags; \
115
126
  return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
116
127
  }
117
128
 
@@ -126,82 +137,359 @@ typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved;
126
137
  static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
127
138
  {
128
139
  DTableDesc dtd;
129
- memcpy(&dtd, table, sizeof(dtd));
140
+ ZSTD_memcpy(&dtd, table, sizeof(dtd));
130
141
  return dtd;
131
142
  }
132
143
 
144
+ static size_t HUF_initFastDStream(BYTE const* ip) {
145
+ BYTE const lastByte = ip[7];
146
+ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
147
+ size_t const value = MEM_readLEST(ip) | 1;
148
+ assert(bitsConsumed <= 8);
149
+ assert(sizeof(size_t) == 8);
150
+ return value << bitsConsumed;
151
+ }
152
+
153
+
154
+ /**
155
+ * The input/output arguments to the Huffman fast decoding loop:
156
+ *
157
+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
158
+ * op [in/out] - The output pointers, must be updated to reflect what is written.
159
+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
160
+ * dt [in] - The decoding table.
161
+ * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
162
+ * oend [in] - The end of the output stream. op[3] must not cross oend.
163
+ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
164
+ * as long as it is above ilimit, but that indicates corruption.
165
+ */
166
+ typedef struct {
167
+ BYTE const* ip[4];
168
+ BYTE* op[4];
169
+ U64 bits[4];
170
+ void const* dt;
171
+ BYTE const* ilimit;
172
+ BYTE* oend;
173
+ BYTE const* iend[4];
174
+ } HUF_DecompressFastArgs;
175
+
176
+ typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
177
+
178
+ /**
179
+ * Initializes args for the fast decoding loop.
180
+ * @returns 1 on success
181
+ * 0 if the fallback implementation should be used.
182
+ * Or an error code on failure.
183
+ */
184
+ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
185
+ {
186
+ void const* dt = DTable + 1;
187
+ U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
188
+
189
+ const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
190
+
191
+ BYTE* const oend = (BYTE*)dst + dstSize;
192
+
193
+ /* The fast decoding loop assumes 64-bit little-endian.
194
+ * This condition is false on x32.
195
+ */
196
+ if (!MEM_isLittleEndian() || MEM_32bits())
197
+ return 0;
198
+
199
+ /* strict minimum : jump table + 1 byte per stream */
200
+ if (srcSize < 10)
201
+ return ERROR(corruption_detected);
202
+
203
+ /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers.
204
+ * If table log is not correct at this point, fallback to the old decoder.
205
+ * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
206
+ */
207
+ if (dtLog != HUF_DECODER_FAST_TABLELOG)
208
+ return 0;
209
+
210
+ /* Read the jump table. */
211
+ {
212
+ const BYTE* const istart = (const BYTE*)src;
213
+ size_t const length1 = MEM_readLE16(istart);
214
+ size_t const length2 = MEM_readLE16(istart+2);
215
+ size_t const length3 = MEM_readLE16(istart+4);
216
+ size_t const length4 = srcSize - (length1 + length2 + length3 + 6);
217
+ args->iend[0] = istart + 6; /* jumpTable */
218
+ args->iend[1] = args->iend[0] + length1;
219
+ args->iend[2] = args->iend[1] + length2;
220
+ args->iend[3] = args->iend[2] + length3;
221
+
222
+ /* HUF_initFastDStream() requires this, and this small of an input
223
+ * won't benefit from the ASM loop anyways.
224
+ * length1 must be >= 16 so that ip[0] >= ilimit before the loop
225
+ * starts.
226
+ */
227
+ if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
228
+ return 0;
229
+ if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
230
+ }
231
+ /* ip[] contains the position that is currently loaded into bits[]. */
232
+ args->ip[0] = args->iend[1] - sizeof(U64);
233
+ args->ip[1] = args->iend[2] - sizeof(U64);
234
+ args->ip[2] = args->iend[3] - sizeof(U64);
235
+ args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64);
236
+
237
+ /* op[] contains the output pointers. */
238
+ args->op[0] = (BYTE*)dst;
239
+ args->op[1] = args->op[0] + (dstSize+3)/4;
240
+ args->op[2] = args->op[1] + (dstSize+3)/4;
241
+ args->op[3] = args->op[2] + (dstSize+3)/4;
242
+
243
+ /* No point to call the ASM loop for tiny outputs. */
244
+ if (args->op[3] >= oend)
245
+ return 0;
246
+
247
+ /* bits[] is the bit container.
248
+ * It is read from the MSB down to the LSB.
249
+ * It is shifted left as it is read, and zeros are
250
+ * shifted in. After the lowest valid bit a 1 is
251
+ * set, so that CountTrailingZeros(bits[]) can be used
252
+ * to count how many bits we've consumed.
253
+ */
254
+ args->bits[0] = HUF_initFastDStream(args->ip[0]);
255
+ args->bits[1] = HUF_initFastDStream(args->ip[1]);
256
+ args->bits[2] = HUF_initFastDStream(args->ip[2]);
257
+ args->bits[3] = HUF_initFastDStream(args->ip[3]);
258
+
259
+ /* If ip[] >= ilimit, it is guaranteed to be safe to
260
+ * reload bits[]. It may be beyond its section, but is
261
+ * guaranteed to be valid (>= istart).
262
+ */
263
+ args->ilimit = ilimit;
264
+
265
+ args->oend = oend;
266
+ args->dt = dt;
267
+
268
+ return 1;
269
+ }
270
+
271
+ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
272
+ {
273
+ /* Validate that we haven't overwritten. */
274
+ if (args->op[stream] > segmentEnd)
275
+ return ERROR(corruption_detected);
276
+ /* Validate that we haven't read beyond iend[].
277
+ * Note that ip[] may be < iend[] because the MSB is
278
+ * the next bit to read, and we may have consumed 100%
279
+ * of the stream, so down to iend[i] - 8 is valid.
280
+ */
281
+ if (args->ip[stream] < args->iend[stream] - 8)
282
+ return ERROR(corruption_detected);
283
+
284
+ /* Construct the BIT_DStream_t. */
285
+ assert(sizeof(size_t) == 8);
286
+ bit->bitContainer = MEM_readLEST(args->ip[stream]);
287
+ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
288
+ bit->start = (const char*)args->iend[0];
289
+ bit->limitPtr = bit->start + sizeof(size_t);
290
+ bit->ptr = (const char*)args->ip[stream];
291
+
292
+ return 0;
293
+ }
294
+
133
295
 
134
296
  #ifndef HUF_FORCE_DECOMPRESS_X2
135
297
 
136
298
  /*-***************************/
137
299
  /* single-symbol decoding */
138
300
  /*-***************************/
139
- typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */
301
+ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */
140
302
 
141
- size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
303
+ /**
304
+ * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at
305
+ * a time.
306
+ */
307
+ static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
308
+ U64 D4;
309
+ if (MEM_isLittleEndian()) {
310
+ D4 = (U64)((symbol << 8) + nbBits);
311
+ } else {
312
+ D4 = (U64)(symbol + (nbBits << 8));
313
+ }
314
+ assert(D4 < (1U << 16));
315
+ D4 *= 0x0001000100010001ULL;
316
+ return D4;
317
+ }
318
+
319
+ /**
320
+ * Increase the tableLog to targetTableLog and rescales the stats.
321
+ * If tableLog > targetTableLog this is a no-op.
322
+ * @returns New tableLog
323
+ */
324
+ static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog)
325
+ {
326
+ if (tableLog > targetTableLog)
327
+ return tableLog;
328
+ if (tableLog < targetTableLog) {
329
+ U32 const scale = targetTableLog - tableLog;
330
+ U32 s;
331
+ /* Increase the weight for all non-zero probability symbols by scale. */
332
+ for (s = 0; s < nbSymbols; ++s) {
333
+ huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale);
334
+ }
335
+ /* Update rankVal to reflect the new weights.
336
+ * All weights except 0 get moved to weight + scale.
337
+ * Weights [1, scale] are empty.
338
+ */
339
+ for (s = targetTableLog; s > scale; --s) {
340
+ rankVal[s] = rankVal[s - scale];
341
+ }
342
+ for (s = scale; s > 0; --s) {
343
+ rankVal[s] = 0;
344
+ }
345
+ }
346
+ return targetTableLog;
347
+ }
348
+
349
+ typedef struct {
350
+ U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];
351
+ U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1];
352
+ U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
353
+ BYTE symbols[HUF_SYMBOLVALUE_MAX + 1];
354
+ BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
355
+ } HUF_ReadDTableX1_Workspace;
356
+
357
+ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
142
358
  {
143
359
  U32 tableLog = 0;
144
360
  U32 nbSymbols = 0;
145
361
  size_t iSize;
146
362
  void* const dtPtr = DTable + 1;
147
363
  HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr;
364
+ HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace;
148
365
 
149
- U32* rankVal;
150
- BYTE* huffWeight;
151
- size_t spaceUsed32 = 0;
152
-
153
- rankVal = (U32 *)workSpace + spaceUsed32;
154
- spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
155
- huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
156
- spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
157
-
158
- if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
366
+ DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp));
367
+ if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge);
159
368
 
160
369
  DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
161
- /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
370
+ /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
162
371
 
163
- iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
372
+ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
164
373
  if (HUF_isError(iSize)) return iSize;
165
374
 
375
+
166
376
  /* Table header */
167
377
  { DTableDesc dtd = HUF_getDTableDesc(DTable);
378
+ U32 const maxTableLog = dtd.maxTableLog + 1;
379
+ U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG);
380
+ tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog);
168
381
  if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */
169
382
  dtd.tableType = 0;
170
383
  dtd.tableLog = (BYTE)tableLog;
171
- memcpy(DTable, &dtd, sizeof(dtd));
384
+ ZSTD_memcpy(DTable, &dtd, sizeof(dtd));
172
385
  }
173
386
 
174
- /* Calculate starting value for each rank */
175
- { U32 n, nextRankStart = 0;
176
- for (n=1; n<tableLog+1; n++) {
177
- U32 const current = nextRankStart;
178
- nextRankStart += (rankVal[n] << (n-1));
179
- rankVal[n] = current;
180
- } }
181
-
182
- /* fill DTable */
183
- { U32 n;
184
- for (n=0; n<nbSymbols; n++) {
185
- U32 const w = huffWeight[n];
186
- U32 const length = (1 << w) >> 1;
187
- U32 u;
188
- HUF_DEltX1 D;
189
- D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
190
- for (u = rankVal[w]; u < rankVal[w] + length; u++)
191
- dt[u] = D;
192
- rankVal[w] += length;
193
- } }
387
+ /* Compute symbols and rankStart given rankVal:
388
+ *
389
+ * rankVal already contains the number of values of each weight.
390
+ *
391
+ * symbols contains the symbols ordered by weight. First are the rankVal[0]
392
+ * weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on.
393
+ * symbols[0] is filled (but unused) to avoid a branch.
394
+ *
395
+ * rankStart contains the offset where each rank belongs in the DTable.
396
+ * rankStart[0] is not filled because there are no entries in the table for
397
+ * weight 0.
398
+ */
399
+ { int n;
400
+ U32 nextRankStart = 0;
401
+ int const unroll = 4;
402
+ int const nLimit = (int)nbSymbols - unroll + 1;
403
+ for (n=0; n<(int)tableLog+1; n++) {
404
+ U32 const curr = nextRankStart;
405
+ nextRankStart += wksp->rankVal[n];
406
+ wksp->rankStart[n] = curr;
407
+ }
408
+ for (n=0; n < nLimit; n += unroll) {
409
+ int u;
410
+ for (u=0; u < unroll; ++u) {
411
+ size_t const w = wksp->huffWeight[n+u];
412
+ wksp->symbols[wksp->rankStart[w]++] = (BYTE)(n+u);
413
+ }
414
+ }
415
+ for (; n < (int)nbSymbols; ++n) {
416
+ size_t const w = wksp->huffWeight[n];
417
+ wksp->symbols[wksp->rankStart[w]++] = (BYTE)n;
418
+ }
419
+ }
194
420
 
421
+ /* fill DTable
422
+ * We fill all entries of each weight in order.
423
+ * That way length is a constant for each iteration of the outer loop.
424
+ * We can switch based on the length to a different inner loop which is
425
+ * optimized for that particular case.
426
+ */
427
+ { U32 w;
428
+ int symbol = wksp->rankVal[0];
429
+ int rankStart = 0;
430
+ for (w=1; w<tableLog+1; ++w) {
431
+ int const symbolCount = wksp->rankVal[w];
432
+ int const length = (1 << w) >> 1;
433
+ int uStart = rankStart;
434
+ BYTE const nbBits = (BYTE)(tableLog + 1 - w);
435
+ int s;
436
+ int u;
437
+ switch (length) {
438
+ case 1:
439
+ for (s=0; s<symbolCount; ++s) {
440
+ HUF_DEltX1 D;
441
+ D.byte = wksp->symbols[symbol + s];
442
+ D.nbBits = nbBits;
443
+ dt[uStart] = D;
444
+ uStart += 1;
445
+ }
446
+ break;
447
+ case 2:
448
+ for (s=0; s<symbolCount; ++s) {
449
+ HUF_DEltX1 D;
450
+ D.byte = wksp->symbols[symbol + s];
451
+ D.nbBits = nbBits;
452
+ dt[uStart+0] = D;
453
+ dt[uStart+1] = D;
454
+ uStart += 2;
455
+ }
456
+ break;
457
+ case 4:
458
+ for (s=0; s<symbolCount; ++s) {
459
+ U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
460
+ MEM_write64(dt + uStart, D4);
461
+ uStart += 4;
462
+ }
463
+ break;
464
+ case 8:
465
+ for (s=0; s<symbolCount; ++s) {
466
+ U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
467
+ MEM_write64(dt + uStart, D4);
468
+ MEM_write64(dt + uStart + 4, D4);
469
+ uStart += 8;
470
+ }
471
+ break;
472
+ default:
473
+ for (s=0; s<symbolCount; ++s) {
474
+ U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
475
+ for (u=0; u < length; u += 16) {
476
+ MEM_write64(dt + uStart + u + 0, D4);
477
+ MEM_write64(dt + uStart + u + 4, D4);
478
+ MEM_write64(dt + uStart + u + 8, D4);
479
+ MEM_write64(dt + uStart + u + 12, D4);
480
+ }
481
+ assert(u == length);
482
+ uStart += length;
483
+ }
484
+ break;
485
+ }
486
+ symbol += symbolCount;
487
+ rankStart += symbolCount * length;
488
+ }
489
+ }
195
490
  return iSize;
196
491
  }
197
492
 
198
- size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize)
199
- {
200
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
201
- return HUF_readDTableX1_wksp(DTable, src, srcSize,
202
- workSpace, sizeof(workSpace));
203
- }
204
-
205
493
  FORCE_INLINE_TEMPLATE BYTE
206
494
  HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog)
207
495
  {
@@ -228,11 +516,15 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
228
516
  BYTE* const pStart = p;
229
517
 
230
518
  /* up to 4 symbols at a time */
231
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
232
- HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
233
- HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
234
- HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
235
- HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
519
+ if ((pEnd - p) > 3) {
520
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
521
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
522
+ HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
523
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
524
+ HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
525
+ }
526
+ } else {
527
+ BIT_reloadDStream(bitDPtr);
236
528
  }
237
529
 
238
530
  /* [0-3] symbols remaining */
@@ -244,7 +536,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
244
536
  while (p < pEnd)
245
537
  HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
246
538
 
247
- return pEnd-pStart;
539
+ return (size_t)(pEnd-pStart);
248
540
  }
249
541
 
250
542
  FORCE_INLINE_TEMPLATE size_t
@@ -270,6 +562,10 @@ HUF_decompress1X1_usingDTable_internal_body(
270
562
  return dstSize;
271
563
  }
272
564
 
565
+ /* HUF_decompress4X1_usingDTable_internal_body():
566
+ * Conditions :
567
+ * @dstSize >= 6
568
+ */
273
569
  FORCE_INLINE_TEMPLATE size_t
274
570
  HUF_decompress4X1_usingDTable_internal_body(
275
571
  void* dst, size_t dstSize,
@@ -282,6 +578,7 @@ HUF_decompress4X1_usingDTable_internal_body(
282
578
  { const BYTE* const istart = (const BYTE*) cSrc;
283
579
  BYTE* const ostart = (BYTE*) dst;
284
580
  BYTE* const oend = ostart + dstSize;
581
+ BYTE* const olimit = oend - 3;
285
582
  const void* const dtPtr = DTable + 1;
286
583
  const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
287
584
 
@@ -306,39 +603,42 @@ HUF_decompress4X1_usingDTable_internal_body(
306
603
  BYTE* op2 = opStart2;
307
604
  BYTE* op3 = opStart3;
308
605
  BYTE* op4 = opStart4;
309
- U32 endSignal = BIT_DStream_unfinished;
310
606
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
311
607
  U32 const dtLog = dtd.tableLog;
608
+ U32 endSignal = 1;
312
609
 
313
610
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
611
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
612
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
314
613
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
315
614
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
316
615
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
317
616
  CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
318
617
 
319
618
  /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
320
- endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
321
- while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) {
322
- HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
323
- HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
324
- HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
325
- HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
326
- HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
327
- HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
328
- HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
329
- HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
330
- HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
331
- HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
332
- HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
333
- HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
334
- HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
335
- HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
336
- HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
337
- HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
338
- BIT_reloadDStream(&bitD1);
339
- BIT_reloadDStream(&bitD2);
340
- BIT_reloadDStream(&bitD3);
341
- BIT_reloadDStream(&bitD4);
619
+ if ((size_t)(oend - op4) >= sizeof(size_t)) {
620
+ for ( ; (endSignal) & (op4 < olimit) ; ) {
621
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
622
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
623
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
624
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
625
+ HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
626
+ HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
627
+ HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
628
+ HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
629
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
630
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
631
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
632
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
633
+ HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
634
+ HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
635
+ HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
636
+ HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
637
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
638
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
639
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
640
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
641
+ }
342
642
  }
343
643
 
344
644
  /* check corruption */
@@ -364,99 +664,230 @@ HUF_decompress4X1_usingDTable_internal_body(
364
664
  }
365
665
  }
366
666
 
667
+ #if HUF_NEED_BMI2_FUNCTION
668
+ static BMI2_TARGET_ATTRIBUTE
669
+ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
670
+ size_t cSrcSize, HUF_DTable const* DTable) {
671
+ return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
672
+ }
673
+ #endif
367
674
 
368
- typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
369
- const void *cSrc,
370
- size_t cSrcSize,
371
- const HUF_DTable *DTable);
675
+ static
676
+ size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
677
+ size_t cSrcSize, HUF_DTable const* DTable) {
678
+ return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
679
+ }
372
680
 
373
- HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
374
- HUF_DGEN(HUF_decompress4X1_usingDTable_internal)
681
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
375
682
 
683
+ HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
376
684
 
685
+ #endif
377
686
 
378
- size_t HUF_decompress1X1_usingDTable(
379
- void* dst, size_t dstSize,
380
- const void* cSrc, size_t cSrcSize,
381
- const HUF_DTable* DTable)
687
+ static HUF_FAST_BMI2_ATTRS
688
+ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
382
689
  {
383
- DTableDesc dtd = HUF_getDTableDesc(DTable);
384
- if (dtd.tableType != 0) return ERROR(GENERIC);
385
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
690
+ U64 bits[4];
691
+ BYTE const* ip[4];
692
+ BYTE* op[4];
693
+ U16 const* const dtable = (U16 const*)args->dt;
694
+ BYTE* const oend = args->oend;
695
+ BYTE const* const ilimit = args->ilimit;
696
+
697
+ /* Copy the arguments to local variables */
698
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
699
+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
700
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
701
+
702
+ assert(MEM_isLittleEndian());
703
+ assert(!MEM_32bits());
704
+
705
+ for (;;) {
706
+ BYTE* olimit;
707
+ int stream;
708
+ int symbol;
709
+
710
+ /* Assert loop preconditions */
711
+ #ifndef NDEBUG
712
+ for (stream = 0; stream < 4; ++stream) {
713
+ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
714
+ assert(ip[stream] >= ilimit);
715
+ }
716
+ #endif
717
+ /* Compute olimit */
718
+ {
719
+ /* Each iteration produces 5 output symbols per stream */
720
+ size_t const oiters = (size_t)(oend - op[3]) / 5;
721
+ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
722
+ * per stream.
723
+ */
724
+ size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
725
+ /* We can safely run iters iterations before running bounds checks */
726
+ size_t const iters = MIN(oiters, iiters);
727
+ size_t const symbols = iters * 5;
728
+
729
+ /* We can simply check that op[3] < olimit, instead of checking all
730
+ * of our bounds, since we can't hit the other bounds until we've run
731
+ * iters iterations, which only happens when op[3] == olimit.
732
+ */
733
+ olimit = op[3] + symbols;
734
+
735
+ /* Exit fast decoding loop once we get close to the end. */
736
+ if (op[3] + 20 > olimit)
737
+ break;
738
+
739
+ /* Exit the decoding loop if any input pointer has crossed the
740
+ * previous one. This indicates corruption, and a precondition
741
+ * to our loop is that ip[i] >= ip[0].
742
+ */
743
+ for (stream = 1; stream < 4; ++stream) {
744
+ if (ip[stream] < ip[stream - 1])
745
+ goto _out;
746
+ }
747
+ }
748
+
749
+ #ifndef NDEBUG
750
+ for (stream = 1; stream < 4; ++stream) {
751
+ assert(ip[stream] >= ip[stream - 1]);
752
+ }
753
+ #endif
754
+
755
+ do {
756
+ /* Decode 5 symbols in each of the 4 streams */
757
+ for (symbol = 0; symbol < 5; ++symbol) {
758
+ for (stream = 0; stream < 4; ++stream) {
759
+ int const index = (int)(bits[stream] >> 53);
760
+ int const entry = (int)dtable[index];
761
+ bits[stream] <<= (entry & 63);
762
+ op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
763
+ }
764
+ }
765
+ /* Reload the bitstreams */
766
+ for (stream = 0; stream < 4; ++stream) {
767
+ int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
768
+ int const nbBits = ctz & 7;
769
+ int const nbBytes = ctz >> 3;
770
+ op[stream] += 5;
771
+ ip[stream] -= nbBytes;
772
+ bits[stream] = MEM_read64(ip[stream]) | 1;
773
+ bits[stream] <<= nbBits;
774
+ }
775
+ } while (op[3] < olimit);
776
+ }
777
+
778
+ _out:
779
+
780
+ /* Save the final values of each of the state variables back to args. */
781
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
782
+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
783
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
386
784
  }
387
785
 
388
- size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
389
- const void* cSrc, size_t cSrcSize,
390
- void* workSpace, size_t wkspSize)
786
+ /**
787
+ * @returns @p dstSize on success (>= 6)
788
+ * 0 if the fallback implementation should be used
789
+ * An error if an error occurred
790
+ */
791
+ static HUF_FAST_BMI2_ATTRS
792
+ size_t
793
+ HUF_decompress4X1_usingDTable_internal_fast(
794
+ void* dst, size_t dstSize,
795
+ const void* cSrc, size_t cSrcSize,
796
+ const HUF_DTable* DTable,
797
+ HUF_DecompressFastLoopFn loopFn)
391
798
  {
392
- const BYTE* ip = (const BYTE*) cSrc;
799
+ void const* dt = DTable + 1;
800
+ const BYTE* const iend = (const BYTE*)cSrc + 6;
801
+ BYTE* const oend = (BYTE*)dst + dstSize;
802
+ HUF_DecompressFastArgs args;
803
+ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
804
+ FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
805
+ if (ret == 0)
806
+ return 0;
807
+ }
393
808
 
394
- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
395
- if (HUF_isError(hSize)) return hSize;
396
- if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
397
- ip += hSize; cSrcSize -= hSize;
809
+ assert(args.ip[0] >= args.ilimit);
810
+ loopFn(&args);
811
+
812
+ /* Our loop guarantees that ip[] >= ilimit and that we haven't
813
+ * overwritten any op[].
814
+ */
815
+ assert(args.ip[0] >= iend);
816
+ assert(args.ip[1] >= iend);
817
+ assert(args.ip[2] >= iend);
818
+ assert(args.ip[3] >= iend);
819
+ assert(args.op[3] <= oend);
820
+ (void)iend;
821
+
822
+ /* finish bit streams one by one. */
823
+ { size_t const segmentSize = (dstSize+3) / 4;
824
+ BYTE* segmentEnd = (BYTE*)dst;
825
+ int i;
826
+ for (i = 0; i < 4; ++i) {
827
+ BIT_DStream_t bit;
828
+ if (segmentSize <= (size_t)(oend - segmentEnd))
829
+ segmentEnd += segmentSize;
830
+ else
831
+ segmentEnd = oend;
832
+ FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
833
+ /* Decompress and validate that we've produced exactly the expected length. */
834
+ args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG);
835
+ if (args.op[i] != segmentEnd) return ERROR(corruption_detected);
836
+ }
837
+ }
398
838
 
399
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
839
+ /* decoded size */
840
+ assert(dstSize != 0);
841
+ return dstSize;
400
842
  }
401
843
 
844
+ HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
402
845
 
403
- size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
404
- const void* cSrc, size_t cSrcSize)
846
+ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
847
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
405
848
  {
406
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
407
- return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
408
- workSpace, sizeof(workSpace));
409
- }
849
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
850
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
410
851
 
411
- size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
412
- {
413
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
414
- return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
415
- }
852
+ #if DYNAMIC_BMI2
853
+ if (flags & HUF_flags_bmi2) {
854
+ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
855
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
856
+ if (!(flags & HUF_flags_disableAsm)) {
857
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
858
+ }
859
+ # endif
860
+ } else {
861
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
862
+ }
863
+ #endif
416
864
 
417
- size_t HUF_decompress4X1_usingDTable(
418
- void* dst, size_t dstSize,
419
- const void* cSrc, size_t cSrcSize,
420
- const HUF_DTable* DTable)
421
- {
422
- DTableDesc dtd = HUF_getDTableDesc(DTable);
423
- if (dtd.tableType != 0) return ERROR(GENERIC);
424
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
865
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
866
+ if (!(flags & HUF_flags_disableAsm)) {
867
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
868
+ }
869
+ #endif
870
+
871
+ if (!(flags & HUF_flags_disableFast)) {
872
+ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
873
+ if (ret != 0)
874
+ return ret;
875
+ }
876
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
425
877
  }
426
878
 
427
- static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
879
+ static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
428
880
  const void* cSrc, size_t cSrcSize,
429
- void* workSpace, size_t wkspSize, int bmi2)
881
+ void* workSpace, size_t wkspSize, int flags)
430
882
  {
431
883
  const BYTE* ip = (const BYTE*) cSrc;
432
884
 
433
- size_t const hSize = HUF_readDTableX1_wksp (dctx, cSrc, cSrcSize,
434
- workSpace, wkspSize);
885
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
435
886
  if (HUF_isError(hSize)) return hSize;
436
887
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
437
888
  ip += hSize; cSrcSize -= hSize;
438
889
 
439
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
440
- }
441
-
442
- size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
443
- const void* cSrc, size_t cSrcSize,
444
- void* workSpace, size_t wkspSize)
445
- {
446
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
447
- }
448
-
449
-
450
- size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
451
- {
452
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
453
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
454
- workSpace, sizeof(workSpace));
455
- }
456
- size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
457
- {
458
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
459
- return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
890
+ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
460
891
  }
461
892
 
462
893
  #endif /* HUF_FORCE_DECOMPRESS_X2 */
@@ -469,209 +900,322 @@ size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cS
469
900
  /* *************************/
470
901
 
471
902
  typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */
472
- typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
903
+ typedef struct { BYTE symbol; } sortedSymbol_t;
473
904
  typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
474
905
  typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
475
906
 
907
+ /**
908
+ * Constructs a HUF_DEltX2 in a U32.
909
+ */
910
+ static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level)
911
+ {
912
+ U32 seq;
913
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0);
914
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2);
915
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3);
916
+ DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32));
917
+ if (MEM_isLittleEndian()) {
918
+ seq = level == 1 ? symbol : (baseSeq + (symbol << 8));
919
+ return seq + (nbBits << 16) + ((U32)level << 24);
920
+ } else {
921
+ seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol);
922
+ return (seq << 16) + (nbBits << 8) + (U32)level;
923
+ }
924
+ }
476
925
 
477
- /* HUF_fillDTableX2Level2() :
478
- * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
479
- static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed,
480
- const U32* rankValOrigin, const int minWeight,
481
- const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
482
- U32 nbBitsBaseline, U16 baseSeq)
926
+ /**
927
+ * Constructs a HUF_DEltX2.
928
+ */
929
+ static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level)
483
930
  {
484
931
  HUF_DEltX2 DElt;
485
- U32 rankVal[HUF_TABLELOG_MAX + 1];
932
+ U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
933
+ DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val));
934
+ ZSTD_memcpy(&DElt, &val, sizeof(val));
935
+ return DElt;
936
+ }
937
+
938
+ /**
939
+ * Constructs 2 HUF_DEltX2s and packs them into a U64.
940
+ */
941
+ static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level)
942
+ {
943
+ U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
944
+ return (U64)DElt + ((U64)DElt << 32);
945
+ }
486
946
 
487
- /* get pre-calculated rankVal */
488
- memcpy(rankVal, rankValOrigin, sizeof(rankVal));
947
+ /**
948
+ * Fills the DTable rank with all the symbols from [begin, end) that are each
949
+ * nbBits long.
950
+ *
951
+ * @param DTableRank The start of the rank in the DTable.
952
+ * @param begin The first symbol to fill (inclusive).
953
+ * @param end The last symbol to fill (exclusive).
954
+ * @param nbBits Each symbol is nbBits long.
955
+ * @param tableLog The table log.
956
+ * @param baseSeq If level == 1 { 0 } else { the first level symbol }
957
+ * @param level The level in the table. Must be 1 or 2.
958
+ */
959
+ static void HUF_fillDTableX2ForWeight(
960
+ HUF_DEltX2* DTableRank,
961
+ sortedSymbol_t const* begin, sortedSymbol_t const* end,
962
+ U32 nbBits, U32 tableLog,
963
+ U16 baseSeq, int const level)
964
+ {
965
+ U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */);
966
+ const sortedSymbol_t* ptr;
967
+ assert(level >= 1 && level <= 2);
968
+ switch (length) {
969
+ case 1:
970
+ for (ptr = begin; ptr != end; ++ptr) {
971
+ HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
972
+ *DTableRank++ = DElt;
973
+ }
974
+ break;
975
+ case 2:
976
+ for (ptr = begin; ptr != end; ++ptr) {
977
+ HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
978
+ DTableRank[0] = DElt;
979
+ DTableRank[1] = DElt;
980
+ DTableRank += 2;
981
+ }
982
+ break;
983
+ case 4:
984
+ for (ptr = begin; ptr != end; ++ptr) {
985
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
986
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
987
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
988
+ DTableRank += 4;
989
+ }
990
+ break;
991
+ case 8:
992
+ for (ptr = begin; ptr != end; ++ptr) {
993
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
994
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
995
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
996
+ ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
997
+ ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
998
+ DTableRank += 8;
999
+ }
1000
+ break;
1001
+ default:
1002
+ for (ptr = begin; ptr != end; ++ptr) {
1003
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
1004
+ HUF_DEltX2* const DTableRankEnd = DTableRank + length;
1005
+ for (; DTableRank != DTableRankEnd; DTableRank += 8) {
1006
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
1007
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
1008
+ ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
1009
+ ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
1010
+ }
1011
+ }
1012
+ break;
1013
+ }
1014
+ }
489
1015
 
490
- /* fill skipped values */
1016
+ /* HUF_fillDTableX2Level2() :
1017
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
1018
+ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits,
1019
+ const U32* rankVal, const int minWeight, const int maxWeight1,
1020
+ const sortedSymbol_t* sortedSymbols, U32 const* rankStart,
1021
+ U32 nbBitsBaseline, U16 baseSeq)
1022
+ {
1023
+ /* Fill skipped values (all positions up to rankVal[minWeight]).
1024
+ * These are positions only get a single symbol because the combined weight
1025
+ * is too large.
1026
+ */
491
1027
  if (minWeight>1) {
492
- U32 i, skipSize = rankVal[minWeight];
493
- MEM_writeLE16(&(DElt.sequence), baseSeq);
494
- DElt.nbBits = (BYTE)(consumed);
495
- DElt.length = 1;
496
- for (i = 0; i < skipSize; i++)
497
- DTable[i] = DElt;
1028
+ U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */);
1029
+ U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1);
1030
+ int const skipSize = rankVal[minWeight];
1031
+ assert(length > 1);
1032
+ assert((U32)skipSize < length);
1033
+ switch (length) {
1034
+ case 2:
1035
+ assert(skipSize == 1);
1036
+ ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2));
1037
+ break;
1038
+ case 4:
1039
+ assert(skipSize <= 4);
1040
+ ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2));
1041
+ ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2));
1042
+ break;
1043
+ default:
1044
+ {
1045
+ int i;
1046
+ for (i = 0; i < skipSize; i += 8) {
1047
+ ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2));
1048
+ ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2));
1049
+ ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2));
1050
+ ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2));
1051
+ }
1052
+ }
1053
+ }
498
1054
  }
499
1055
 
500
- /* fill DTable */
501
- { U32 s; for (s=0; s<sortedListSize; s++) { /* note : sortedSymbols already skipped */
502
- const U32 symbol = sortedSymbols[s].symbol;
503
- const U32 weight = sortedSymbols[s].weight;
504
- const U32 nbBits = nbBitsBaseline - weight;
505
- const U32 length = 1 << (sizeLog-nbBits);
506
- const U32 start = rankVal[weight];
507
- U32 i = start;
508
- const U32 end = start + length;
509
-
510
- MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
511
- DElt.nbBits = (BYTE)(nbBits + consumed);
512
- DElt.length = 2;
513
- do { DTable[i++] = DElt; } while (i<end); /* since length >= 1 */
514
-
515
- rankVal[weight] += length;
516
- } }
1056
+ /* Fill each of the second level symbols by weight. */
1057
+ {
1058
+ int w;
1059
+ for (w = minWeight; w < maxWeight1; ++w) {
1060
+ int const begin = rankStart[w];
1061
+ int const end = rankStart[w+1];
1062
+ U32 const nbBits = nbBitsBaseline - w;
1063
+ U32 const totalBits = nbBits + consumedBits;
1064
+ HUF_fillDTableX2ForWeight(
1065
+ DTable + rankVal[w],
1066
+ sortedSymbols + begin, sortedSymbols + end,
1067
+ totalBits, targetLog,
1068
+ baseSeq, /* level */ 2);
1069
+ }
1070
+ }
517
1071
  }
518
1072
 
519
-
520
1073
  static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
521
- const sortedSymbol_t* sortedList, const U32 sortedListSize,
522
- const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
1074
+ const sortedSymbol_t* sortedList,
1075
+ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
523
1076
  const U32 nbBitsBaseline)
524
1077
  {
525
- U32 rankVal[HUF_TABLELOG_MAX + 1];
1078
+ U32* const rankVal = rankValOrigin[0];
526
1079
  const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */
527
1080
  const U32 minBits = nbBitsBaseline - maxWeight;
528
- U32 s;
529
-
530
- memcpy(rankVal, rankValOrigin, sizeof(rankVal));
531
-
532
- /* fill DTable */
533
- for (s=0; s<sortedListSize; s++) {
534
- const U16 symbol = sortedList[s].symbol;
535
- const U32 weight = sortedList[s].weight;
536
- const U32 nbBits = nbBitsBaseline - weight;
537
- const U32 start = rankVal[weight];
538
- const U32 length = 1 << (targetLog-nbBits);
539
-
540
- if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */
541
- U32 sortedRank;
1081
+ int w;
1082
+ int const wEnd = (int)maxWeight + 1;
1083
+
1084
+ /* Fill DTable in order of weight. */
1085
+ for (w = 1; w < wEnd; ++w) {
1086
+ int const begin = (int)rankStart[w];
1087
+ int const end = (int)rankStart[w+1];
1088
+ U32 const nbBits = nbBitsBaseline - w;
1089
+
1090
+ if (targetLog-nbBits >= minBits) {
1091
+ /* Enough room for a second symbol. */
1092
+ int start = rankVal[w];
1093
+ U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */);
542
1094
  int minWeight = nbBits + scaleLog;
1095
+ int s;
543
1096
  if (minWeight < 1) minWeight = 1;
544
- sortedRank = rankStart[minWeight];
545
- HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits,
546
- rankValOrigin[nbBits], minWeight,
547
- sortedList+sortedRank, sortedListSize-sortedRank,
548
- nbBitsBaseline, symbol);
1097
+ /* Fill the DTable for every symbol of weight w.
1098
+ * These symbols get at least 1 second symbol.
1099
+ */
1100
+ for (s = begin; s != end; ++s) {
1101
+ HUF_fillDTableX2Level2(
1102
+ DTable + start, targetLog, nbBits,
1103
+ rankValOrigin[nbBits], minWeight, wEnd,
1104
+ sortedList, rankStart,
1105
+ nbBitsBaseline, sortedList[s].symbol);
1106
+ start += length;
1107
+ }
549
1108
  } else {
550
- HUF_DEltX2 DElt;
551
- MEM_writeLE16(&(DElt.sequence), symbol);
552
- DElt.nbBits = (BYTE)(nbBits);
553
- DElt.length = 1;
554
- { U32 const end = start + length;
555
- U32 u;
556
- for (u = start; u < end; u++) DTable[u] = DElt;
557
- } }
558
- rankVal[weight] += length;
1109
+ /* Only a single symbol. */
1110
+ HUF_fillDTableX2ForWeight(
1111
+ DTable + rankVal[w],
1112
+ sortedList + begin, sortedList + end,
1113
+ nbBits, targetLog,
1114
+ /* baseSeq */ 0, /* level */ 1);
1115
+ }
559
1116
  }
560
1117
  }
561
1118
 
1119
+ typedef struct {
1120
+ rankValCol_t rankVal[HUF_TABLELOG_MAX];
1121
+ U32 rankStats[HUF_TABLELOG_MAX + 1];
1122
+ U32 rankStart0[HUF_TABLELOG_MAX + 3];
1123
+ sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
1124
+ BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
1125
+ U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
1126
+ } HUF_ReadDTableX2_Workspace;
1127
+
562
1128
  size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
563
1129
  const void* src, size_t srcSize,
564
- void* workSpace, size_t wkspSize)
1130
+ void* workSpace, size_t wkspSize, int flags)
565
1131
  {
566
- U32 tableLog, maxW, sizeOfSort, nbSymbols;
1132
+ U32 tableLog, maxW, nbSymbols;
567
1133
  DTableDesc dtd = HUF_getDTableDesc(DTable);
568
- U32 const maxTableLog = dtd.maxTableLog;
1134
+ U32 maxTableLog = dtd.maxTableLog;
569
1135
  size_t iSize;
570
1136
  void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */
571
1137
  HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
572
1138
  U32 *rankStart;
573
1139
 
574
- rankValCol_t* rankVal;
575
- U32* rankStats;
576
- U32* rankStart0;
577
- sortedSymbol_t* sortedSymbol;
578
- BYTE* weightList;
579
- size_t spaceUsed32 = 0;
580
-
581
- rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32);
582
- spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
583
- rankStats = (U32 *)workSpace + spaceUsed32;
584
- spaceUsed32 += HUF_TABLELOG_MAX + 1;
585
- rankStart0 = (U32 *)workSpace + spaceUsed32;
586
- spaceUsed32 += HUF_TABLELOG_MAX + 2;
587
- sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t);
588
- spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
589
- weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
590
- spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
591
-
592
- if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
593
-
594
- rankStart = rankStart0 + 1;
595
- memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
1140
+ HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace;
1141
+
1142
+ if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC);
1143
+
1144
+ rankStart = wksp->rankStart0 + 1;
1145
+ ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats));
1146
+ ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0));
596
1147
 
597
1148
  DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */
598
1149
  if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
599
- /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
1150
+ /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
600
1151
 
601
- iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
1152
+ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
602
1153
  if (HUF_isError(iSize)) return iSize;
603
1154
 
604
1155
  /* check result */
605
1156
  if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */
1157
+ if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG;
606
1158
 
607
1159
  /* find maxWeight */
608
- for (maxW = tableLog; rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
1160
+ for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
609
1161
 
610
1162
  /* Get start index of each weight */
611
1163
  { U32 w, nextRankStart = 0;
612
1164
  for (w=1; w<maxW+1; w++) {
613
- U32 current = nextRankStart;
614
- nextRankStart += rankStats[w];
615
- rankStart[w] = current;
1165
+ U32 curr = nextRankStart;
1166
+ nextRankStart += wksp->rankStats[w];
1167
+ rankStart[w] = curr;
616
1168
  }
617
1169
  rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/
618
- sizeOfSort = nextRankStart;
1170
+ rankStart[maxW+1] = nextRankStart;
619
1171
  }
620
1172
 
621
1173
  /* sort symbols by weight */
622
1174
  { U32 s;
623
1175
  for (s=0; s<nbSymbols; s++) {
624
- U32 const w = weightList[s];
1176
+ U32 const w = wksp->weightList[s];
625
1177
  U32 const r = rankStart[w]++;
626
- sortedSymbol[r].symbol = (BYTE)s;
627
- sortedSymbol[r].weight = (BYTE)w;
1178
+ wksp->sortedSymbol[r].symbol = (BYTE)s;
628
1179
  }
629
1180
  rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */
630
1181
  }
631
1182
 
632
1183
  /* Build rankVal */
633
- { U32* const rankVal0 = rankVal[0];
1184
+ { U32* const rankVal0 = wksp->rankVal[0];
634
1185
  { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */
635
1186
  U32 nextRankVal = 0;
636
1187
  U32 w;
637
1188
  for (w=1; w<maxW+1; w++) {
638
- U32 current = nextRankVal;
639
- nextRankVal += rankStats[w] << (w+rescale);
640
- rankVal0[w] = current;
1189
+ U32 curr = nextRankVal;
1190
+ nextRankVal += wksp->rankStats[w] << (w+rescale);
1191
+ rankVal0[w] = curr;
641
1192
  } }
642
1193
  { U32 const minBits = tableLog+1 - maxW;
643
1194
  U32 consumed;
644
1195
  for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
645
- U32* const rankValPtr = rankVal[consumed];
1196
+ U32* const rankValPtr = wksp->rankVal[consumed];
646
1197
  U32 w;
647
1198
  for (w = 1; w < maxW+1; w++) {
648
1199
  rankValPtr[w] = rankVal0[w] >> consumed;
649
1200
  } } } }
650
1201
 
651
1202
  HUF_fillDTableX2(dt, maxTableLog,
652
- sortedSymbol, sizeOfSort,
653
- rankStart0, rankVal, maxW,
1203
+ wksp->sortedSymbol,
1204
+ wksp->rankStart0, wksp->rankVal, maxW,
654
1205
  tableLog+1);
655
1206
 
656
1207
  dtd.tableLog = (BYTE)maxTableLog;
657
1208
  dtd.tableType = 1;
658
- memcpy(DTable, &dtd, sizeof(dtd));
1209
+ ZSTD_memcpy(DTable, &dtd, sizeof(dtd));
659
1210
  return iSize;
660
1211
  }
661
1212
 
662
- size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
663
- {
664
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
665
- return HUF_readDTableX2_wksp(DTable, src, srcSize,
666
- workSpace, sizeof(workSpace));
667
- }
668
-
669
1213
 
670
1214
  FORCE_INLINE_TEMPLATE U32
671
1215
  HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
672
1216
  {
673
1217
  size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
674
- memcpy(op, dt+val, 2);
1218
+ ZSTD_memcpy(op, &dt[val].sequence, 2);
675
1219
  BIT_skipBits(DStream, dt[val].nbBits);
676
1220
  return dt[val].length;
677
1221
  }
@@ -680,15 +1224,17 @@ FORCE_INLINE_TEMPLATE U32
680
1224
  HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
681
1225
  {
682
1226
  size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
683
- memcpy(op, dt+val, 1);
684
- if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
685
- else {
1227
+ ZSTD_memcpy(op, &dt[val].sequence, 1);
1228
+ if (dt[val].length==1) {
1229
+ BIT_skipBits(DStream, dt[val].nbBits);
1230
+ } else {
686
1231
  if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
687
1232
  BIT_skipBits(DStream, dt[val].nbBits);
688
1233
  if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
689
1234
  /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
690
1235
  DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
691
- } }
1236
+ }
1237
+ }
692
1238
  return 1;
693
1239
  }
694
1240
 
@@ -710,19 +1256,37 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
710
1256
  BYTE* const pStart = p;
711
1257
 
712
1258
  /* up to 8 symbols at a time */
713
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
714
- HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
715
- HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
716
- HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
717
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1259
+ if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) {
1260
+ if (dtLog <= 11 && MEM_64bits()) {
1261
+ /* up to 10 symbols at a time */
1262
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) {
1263
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1264
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1265
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1266
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1267
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1268
+ }
1269
+ } else {
1270
+ /* up to 8 symbols at a time */
1271
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
1272
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
1273
+ HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
1274
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
1275
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1276
+ }
1277
+ }
1278
+ } else {
1279
+ BIT_reloadDStream(bitDPtr);
718
1280
  }
719
1281
 
720
1282
  /* closer to end : up to 2 symbols at a time */
721
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
722
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1283
+ if ((size_t)(pEnd - p) >= 2) {
1284
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
1285
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
723
1286
 
724
- while (p <= pEnd-2)
725
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
1287
+ while (p <= pEnd-2)
1288
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
1289
+ }
726
1290
 
727
1291
  if (p < pEnd)
728
1292
  p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
@@ -757,7 +1321,10 @@ HUF_decompress1X2_usingDTable_internal_body(
757
1321
  return dstSize;
758
1322
  }
759
1323
 
760
-
1324
+ /* HUF_decompress4X2_usingDTable_internal_body():
1325
+ * Conditions:
1326
+ * @dstSize >= 6
1327
+ */
761
1328
  FORCE_INLINE_TEMPLATE size_t
762
1329
  HUF_decompress4X2_usingDTable_internal_body(
763
1330
  void* dst, size_t dstSize,
@@ -769,6 +1336,7 @@ HUF_decompress4X2_usingDTable_internal_body(
769
1336
  { const BYTE* const istart = (const BYTE*) cSrc;
770
1337
  BYTE* const ostart = (BYTE*) dst;
771
1338
  BYTE* const oend = ostart + dstSize;
1339
+ BYTE* const olimit = oend - (sizeof(size_t)-1);
772
1340
  const void* const dtPtr = DTable+1;
773
1341
  const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
774
1342
 
@@ -793,37 +1361,66 @@ HUF_decompress4X2_usingDTable_internal_body(
793
1361
  BYTE* op2 = opStart2;
794
1362
  BYTE* op3 = opStart3;
795
1363
  BYTE* op4 = opStart4;
796
- U32 endSignal;
1364
+ U32 endSignal = 1;
797
1365
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
798
1366
  U32 const dtLog = dtd.tableLog;
799
1367
 
800
- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1368
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1369
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
1370
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
801
1371
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
802
1372
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
803
1373
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
804
1374
  CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
805
1375
 
806
1376
  /* 16-32 symbols per loop (4-8 symbols per stream) */
807
- endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
808
- for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
809
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
810
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
811
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
812
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
813
- HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
814
- HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
815
- HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
816
- HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
817
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
818
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
819
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
820
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
821
- HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
822
- HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
823
- HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
824
- HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
825
-
826
- endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
1377
+ if ((size_t)(oend - op4) >= sizeof(size_t)) {
1378
+ for ( ; (endSignal) & (op4 < olimit); ) {
1379
+ #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
1380
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1381
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
1382
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1383
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
1384
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1385
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
1386
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1387
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
1388
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
1389
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
1390
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1391
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
1392
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1393
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
1394
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1395
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
1396
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1397
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
1398
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
1399
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
1400
+ #else
1401
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1402
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1403
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1404
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1405
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
1406
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
1407
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
1408
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
1409
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1410
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1411
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1412
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1413
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
1414
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
1415
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
1416
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
1417
+ endSignal = (U32)LIKELY((U32)
1418
+ (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
1419
+ & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
1420
+ & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
1421
+ & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
1422
+ #endif
1423
+ }
827
1424
  }
828
1425
 
829
1426
  /* check corruption */
@@ -847,94 +1444,279 @@ HUF_decompress4X2_usingDTable_internal_body(
847
1444
  }
848
1445
  }
849
1446
 
850
- HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
851
- HUF_DGEN(HUF_decompress4X2_usingDTable_internal)
1447
+ #if HUF_NEED_BMI2_FUNCTION
1448
+ static BMI2_TARGET_ATTRIBUTE
1449
+ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
1450
+ size_t cSrcSize, HUF_DTable const* DTable) {
1451
+ return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
1452
+ }
1453
+ #endif
852
1454
 
853
- size_t HUF_decompress1X2_usingDTable(
854
- void* dst, size_t dstSize,
855
- const void* cSrc, size_t cSrcSize,
856
- const HUF_DTable* DTable)
857
- {
858
- DTableDesc dtd = HUF_getDTableDesc(DTable);
859
- if (dtd.tableType != 1) return ERROR(GENERIC);
860
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1455
+ static
1456
+ size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
1457
+ size_t cSrcSize, HUF_DTable const* DTable) {
1458
+ return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
861
1459
  }
862
1460
 
863
- size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
864
- const void* cSrc, size_t cSrcSize,
865
- void* workSpace, size_t wkspSize)
1461
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
1462
+
1463
+ HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
1464
+
1465
+ #endif
1466
+
1467
+ static HUF_FAST_BMI2_ATTRS
1468
+ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
866
1469
  {
867
- const BYTE* ip = (const BYTE*) cSrc;
1470
+ U64 bits[4];
1471
+ BYTE const* ip[4];
1472
+ BYTE* op[4];
1473
+ BYTE* oend[4];
1474
+ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
1475
+ BYTE const* const ilimit = args->ilimit;
1476
+
1477
+ /* Copy the arguments to local registers. */
1478
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
1479
+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
1480
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
1481
+
1482
+ oend[0] = op[1];
1483
+ oend[1] = op[2];
1484
+ oend[2] = op[3];
1485
+ oend[3] = args->oend;
1486
+
1487
+ assert(MEM_isLittleEndian());
1488
+ assert(!MEM_32bits());
1489
+
1490
+ for (;;) {
1491
+ BYTE* olimit;
1492
+ int stream;
1493
+ int symbol;
1494
+
1495
+ /* Assert loop preconditions */
1496
+ #ifndef NDEBUG
1497
+ for (stream = 0; stream < 4; ++stream) {
1498
+ assert(op[stream] <= oend[stream]);
1499
+ assert(ip[stream] >= ilimit);
1500
+ }
1501
+ #endif
1502
+ /* Compute olimit */
1503
+ {
1504
+ /* Each loop does 5 table lookups for each of the 4 streams.
1505
+ * Each table lookup consumes up to 11 bits of input, and produces
1506
+ * up to 2 bytes of output.
1507
+ */
1508
+ /* We can consume up to 7 bytes of input per iteration per stream.
1509
+ * We also know that each input pointer is >= ip[0]. So we can run
1510
+ * iters loops before running out of input.
1511
+ */
1512
+ size_t iters = (size_t)(ip[0] - ilimit) / 7;
1513
+ /* Each iteration can produce up to 10 bytes of output per stream.
1514
+ * Each output stream my advance at different rates. So take the
1515
+ * minimum number of safe iterations among all the output streams.
1516
+ */
1517
+ for (stream = 0; stream < 4; ++stream) {
1518
+ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
1519
+ iters = MIN(iters, oiters);
1520
+ }
1521
+
1522
+ /* Each iteration produces at least 5 output symbols. So until
1523
+ * op[3] crosses olimit, we know we haven't executed iters
1524
+ * iterations yet. This saves us maintaining an iters counter,
1525
+ * at the expense of computing the remaining # of iterations
1526
+ * more frequently.
1527
+ */
1528
+ olimit = op[3] + (iters * 5);
1529
+
1530
+ /* Exit the fast decoding loop if we are too close to the end. */
1531
+ if (op[3] + 10 > olimit)
1532
+ break;
1533
+
1534
+ /* Exit the decoding loop if any input pointer has crossed the
1535
+ * previous one. This indicates corruption, and a precondition
1536
+ * to our loop is that ip[i] >= ip[0].
1537
+ */
1538
+ for (stream = 1; stream < 4; ++stream) {
1539
+ if (ip[stream] < ip[stream - 1])
1540
+ goto _out;
1541
+ }
1542
+ }
868
1543
 
869
- size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
870
- workSpace, wkspSize);
871
- if (HUF_isError(hSize)) return hSize;
872
- if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
873
- ip += hSize; cSrcSize -= hSize;
1544
+ #ifndef NDEBUG
1545
+ for (stream = 1; stream < 4; ++stream) {
1546
+ assert(ip[stream] >= ip[stream - 1]);
1547
+ }
1548
+ #endif
874
1549
 
875
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
876
- }
1550
+ do {
1551
+ /* Do 5 table lookups for each of the first 3 streams */
1552
+ for (symbol = 0; symbol < 5; ++symbol) {
1553
+ for (stream = 0; stream < 3; ++stream) {
1554
+ int const index = (int)(bits[stream] >> 53);
1555
+ HUF_DEltX2 const entry = dtable[index];
1556
+ MEM_write16(op[stream], entry.sequence);
1557
+ bits[stream] <<= (entry.nbBits);
1558
+ op[stream] += (entry.length);
1559
+ }
1560
+ }
1561
+ /* Do 1 table lookup from the final stream */
1562
+ {
1563
+ int const index = (int)(bits[3] >> 53);
1564
+ HUF_DEltX2 const entry = dtable[index];
1565
+ MEM_write16(op[3], entry.sequence);
1566
+ bits[3] <<= (entry.nbBits);
1567
+ op[3] += (entry.length);
1568
+ }
1569
+ /* Do 4 table lookups from the final stream & reload bitstreams */
1570
+ for (stream = 0; stream < 4; ++stream) {
1571
+ /* Do a table lookup from the final stream.
1572
+ * This is interleaved with the reloading to reduce register
1573
+ * pressure. This shouldn't be necessary, but compilers can
1574
+ * struggle with codegen with high register pressure.
1575
+ */
1576
+ {
1577
+ int const index = (int)(bits[3] >> 53);
1578
+ HUF_DEltX2 const entry = dtable[index];
1579
+ MEM_write16(op[3], entry.sequence);
1580
+ bits[3] <<= (entry.nbBits);
1581
+ op[3] += (entry.length);
1582
+ }
1583
+ /* Reload the bistreams. The final bitstream must be reloaded
1584
+ * after the 5th symbol was decoded.
1585
+ */
1586
+ {
1587
+ int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
1588
+ int const nbBits = ctz & 7;
1589
+ int const nbBytes = ctz >> 3;
1590
+ ip[stream] -= nbBytes;
1591
+ bits[stream] = MEM_read64(ip[stream]) | 1;
1592
+ bits[stream] <<= nbBits;
1593
+ }
1594
+ }
1595
+ } while (op[3] < olimit);
1596
+ }
877
1597
 
1598
+ _out:
878
1599
 
879
- size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
880
- const void* cSrc, size_t cSrcSize)
881
- {
882
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
883
- return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
884
- workSpace, sizeof(workSpace));
1600
+ /* Save the final values of each of the state variables back to args. */
1601
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
1602
+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
1603
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
885
1604
  }
886
1605
 
887
- size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
888
- {
889
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
890
- return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
891
- }
892
1606
 
893
- size_t HUF_decompress4X2_usingDTable(
1607
+ static HUF_FAST_BMI2_ATTRS size_t
1608
+ HUF_decompress4X2_usingDTable_internal_fast(
894
1609
  void* dst, size_t dstSize,
895
1610
  const void* cSrc, size_t cSrcSize,
896
- const HUF_DTable* DTable)
1611
+ const HUF_DTable* DTable,
1612
+ HUF_DecompressFastLoopFn loopFn) {
1613
+ void const* dt = DTable + 1;
1614
+ const BYTE* const iend = (const BYTE*)cSrc + 6;
1615
+ BYTE* const oend = (BYTE*)dst + dstSize;
1616
+ HUF_DecompressFastArgs args;
1617
+ {
1618
+ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
1619
+ FORWARD_IF_ERROR(ret, "Failed to init asm args");
1620
+ if (ret == 0)
1621
+ return 0;
1622
+ }
1623
+
1624
+ assert(args.ip[0] >= args.ilimit);
1625
+ loopFn(&args);
1626
+
1627
+ /* note : op4 already verified within main loop */
1628
+ assert(args.ip[0] >= iend);
1629
+ assert(args.ip[1] >= iend);
1630
+ assert(args.ip[2] >= iend);
1631
+ assert(args.ip[3] >= iend);
1632
+ assert(args.op[3] <= oend);
1633
+ (void)iend;
1634
+
1635
+ /* finish bitStreams one by one */
1636
+ {
1637
+ size_t const segmentSize = (dstSize+3) / 4;
1638
+ BYTE* segmentEnd = (BYTE*)dst;
1639
+ int i;
1640
+ for (i = 0; i < 4; ++i) {
1641
+ BIT_DStream_t bit;
1642
+ if (segmentSize <= (size_t)(oend - segmentEnd))
1643
+ segmentEnd += segmentSize;
1644
+ else
1645
+ segmentEnd = oend;
1646
+ FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
1647
+ args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG);
1648
+ if (args.op[i] != segmentEnd)
1649
+ return ERROR(corruption_detected);
1650
+ }
1651
+ }
1652
+
1653
+ /* decoded size */
1654
+ return dstSize;
1655
+ }
1656
+
1657
+ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
1658
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
897
1659
  {
898
- DTableDesc dtd = HUF_getDTableDesc(DTable);
899
- if (dtd.tableType != 1) return ERROR(GENERIC);
900
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1660
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
1661
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
1662
+
1663
+ #if DYNAMIC_BMI2
1664
+ if (flags & HUF_flags_bmi2) {
1665
+ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
1666
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
1667
+ if (!(flags & HUF_flags_disableAsm)) {
1668
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
1669
+ }
1670
+ # endif
1671
+ } else {
1672
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
1673
+ }
1674
+ #endif
1675
+
1676
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
1677
+ if (!(flags & HUF_flags_disableAsm)) {
1678
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
1679
+ }
1680
+ #endif
1681
+
1682
+ if (!(flags & HUF_flags_disableFast)) {
1683
+ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
1684
+ if (ret != 0)
1685
+ return ret;
1686
+ }
1687
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
901
1688
  }
902
1689
 
903
- static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
1690
+ HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
1691
+
1692
+ size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
904
1693
  const void* cSrc, size_t cSrcSize,
905
- void* workSpace, size_t wkspSize, int bmi2)
1694
+ void* workSpace, size_t wkspSize, int flags)
906
1695
  {
907
1696
  const BYTE* ip = (const BYTE*) cSrc;
908
1697
 
909
- size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
910
- workSpace, wkspSize);
1698
+ size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
1699
+ workSpace, wkspSize, flags);
911
1700
  if (HUF_isError(hSize)) return hSize;
912
1701
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
913
1702
  ip += hSize; cSrcSize -= hSize;
914
1703
 
915
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
1704
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
916
1705
  }
917
1706
 
918
- size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1707
+ static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
919
1708
  const void* cSrc, size_t cSrcSize,
920
- void* workSpace, size_t wkspSize)
1709
+ void* workSpace, size_t wkspSize, int flags)
921
1710
  {
922
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
923
- }
924
-
1711
+ const BYTE* ip = (const BYTE*) cSrc;
925
1712
 
926
- size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
927
- const void* cSrc, size_t cSrcSize)
928
- {
929
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
930
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
931
- workSpace, sizeof(workSpace));
932
- }
1713
+ size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
1714
+ workSpace, wkspSize, flags);
1715
+ if (HUF_isError(hSize)) return hSize;
1716
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1717
+ ip += hSize; cSrcSize -= hSize;
933
1718
 
934
- size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
935
- {
936
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
937
- return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1719
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
938
1720
  }
939
1721
 
940
1722
  #endif /* HUF_FORCE_DECOMPRESS_X1 */
@@ -944,66 +1726,28 @@ size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cS
944
1726
  /* Universal decompression selectors */
945
1727
  /* ***********************************/
946
1728
 
947
- size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
948
- const void* cSrc, size_t cSrcSize,
949
- const HUF_DTable* DTable)
950
- {
951
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
952
- #if defined(HUF_FORCE_DECOMPRESS_X1)
953
- (void)dtd;
954
- assert(dtd.tableType == 0);
955
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
956
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
957
- (void)dtd;
958
- assert(dtd.tableType == 1);
959
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
960
- #else
961
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
962
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
963
- #endif
964
- }
965
-
966
- size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
967
- const void* cSrc, size_t cSrcSize,
968
- const HUF_DTable* DTable)
969
- {
970
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
971
- #if defined(HUF_FORCE_DECOMPRESS_X1)
972
- (void)dtd;
973
- assert(dtd.tableType == 0);
974
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
975
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
976
- (void)dtd;
977
- assert(dtd.tableType == 1);
978
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
979
- #else
980
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
981
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
982
- #endif
983
- }
984
-
985
1729
 
986
1730
  #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
987
1731
  typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
988
- static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
1732
+ static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] =
989
1733
  {
990
1734
  /* single, double */
991
- {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */
992
- {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */
993
- {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */
994
- {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */
995
- {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */
996
- {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */
997
- {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */
998
- {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */
999
- {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */
1000
- {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */
1001
- {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */
1002
- {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */
1003
- {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */
1004
- {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */
1005
- {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */
1006
- {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */
1735
+ {{0,0}, {1,1}}, /* Q==0 : impossible */
1736
+ {{0,0}, {1,1}}, /* Q==1 : impossible */
1737
+ {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */
1738
+ {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */
1739
+ {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */
1740
+ {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */
1741
+ {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */
1742
+ {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */
1743
+ {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */
1744
+ {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */
1745
+ {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */
1746
+ {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */
1747
+ {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */
1748
+ {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */
1749
+ {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */
1750
+ {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */
1007
1751
  };
1008
1752
  #endif
1009
1753
 
@@ -1030,188 +1774,92 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
1030
1774
  U32 const D256 = (U32)(dstSize >> 8);
1031
1775
  U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
1032
1776
  U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
1033
- DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */
1777
+ DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */
1034
1778
  return DTime1 < DTime0;
1035
1779
  }
1036
1780
  #endif
1037
1781
  }
1038
1782
 
1039
-
1040
- typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
1041
-
1042
- size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1043
- {
1044
- #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1045
- static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 };
1046
- #endif
1047
-
1048
- /* validation checks */
1049
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1050
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1051
- if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1052
- if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1053
-
1054
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1055
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1056
- (void)algoNb;
1057
- assert(algoNb == 0);
1058
- return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize);
1059
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1060
- (void)algoNb;
1061
- assert(algoNb == 1);
1062
- return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);
1063
- #else
1064
- return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
1065
- #endif
1066
- }
1067
- }
1068
-
1069
- size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1070
- {
1071
- /* validation checks */
1072
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1073
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1074
- if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1075
- if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1076
-
1077
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1078
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1079
- (void)algoNb;
1080
- assert(algoNb == 0);
1081
- return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1082
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1083
- (void)algoNb;
1084
- assert(algoNb == 1);
1085
- return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1086
- #else
1087
- return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
1088
- HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
1089
- #endif
1090
- }
1091
- }
1092
-
1093
- size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1094
- {
1095
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1096
- return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1097
- workSpace, sizeof(workSpace));
1098
- }
1099
-
1100
-
1101
- size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
1102
- size_t dstSize, const void* cSrc,
1103
- size_t cSrcSize, void* workSpace,
1104
- size_t wkspSize)
1105
- {
1106
- /* validation checks */
1107
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1108
- if (cSrcSize == 0) return ERROR(corruption_detected);
1109
-
1110
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1111
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1112
- (void)algoNb;
1113
- assert(algoNb == 0);
1114
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1115
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1116
- (void)algoNb;
1117
- assert(algoNb == 1);
1118
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1119
- #else
1120
- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1121
- cSrcSize, workSpace, wkspSize):
1122
- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1123
- #endif
1124
- }
1125
- }
1126
-
1127
1783
  size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1128
1784
  const void* cSrc, size_t cSrcSize,
1129
- void* workSpace, size_t wkspSize)
1785
+ void* workSpace, size_t wkspSize, int flags)
1130
1786
  {
1131
1787
  /* validation checks */
1132
1788
  if (dstSize == 0) return ERROR(dstSize_tooSmall);
1133
1789
  if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1134
- if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1135
- if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1790
+ if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1791
+ if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1136
1792
 
1137
1793
  { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1138
1794
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1139
1795
  (void)algoNb;
1140
1796
  assert(algoNb == 0);
1141
1797
  return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
1142
- cSrcSize, workSpace, wkspSize);
1798
+ cSrcSize, workSpace, wkspSize, flags);
1143
1799
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1144
1800
  (void)algoNb;
1145
1801
  assert(algoNb == 1);
1146
1802
  return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1147
- cSrcSize, workSpace, wkspSize);
1803
+ cSrcSize, workSpace, wkspSize, flags);
1148
1804
  #else
1149
1805
  return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1150
- cSrcSize, workSpace, wkspSize):
1806
+ cSrcSize, workSpace, wkspSize, flags):
1151
1807
  HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
1152
- cSrcSize, workSpace, wkspSize);
1808
+ cSrcSize, workSpace, wkspSize, flags);
1153
1809
  #endif
1154
1810
  }
1155
1811
  }
1156
1812
 
1157
- size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
1158
- const void* cSrc, size_t cSrcSize)
1159
- {
1160
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1161
- return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1162
- workSpace, sizeof(workSpace));
1163
- }
1164
-
1165
1813
 
1166
- size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1814
+ size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
1167
1815
  {
1168
1816
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1169
1817
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1170
1818
  (void)dtd;
1171
1819
  assert(dtd.tableType == 0);
1172
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1820
+ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1173
1821
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1174
1822
  (void)dtd;
1175
1823
  assert(dtd.tableType == 1);
1176
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1824
+ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1177
1825
  #else
1178
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1179
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1826
+ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
1827
+ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1180
1828
  #endif
1181
1829
  }
1182
1830
 
1183
1831
  #ifndef HUF_FORCE_DECOMPRESS_X2
1184
- size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1832
+ size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
1185
1833
  {
1186
1834
  const BYTE* ip = (const BYTE*) cSrc;
1187
1835
 
1188
- size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
1836
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
1189
1837
  if (HUF_isError(hSize)) return hSize;
1190
1838
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1191
1839
  ip += hSize; cSrcSize -= hSize;
1192
1840
 
1193
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
1841
+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
1194
1842
  }
1195
1843
  #endif
1196
1844
 
1197
- size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1845
+ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
1198
1846
  {
1199
1847
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1200
1848
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1201
1849
  (void)dtd;
1202
1850
  assert(dtd.tableType == 0);
1203
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1851
+ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1204
1852
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1205
1853
  (void)dtd;
1206
1854
  assert(dtd.tableType == 1);
1207
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1855
+ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1208
1856
  #else
1209
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1210
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1857
+ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
1858
+ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1211
1859
  #endif
1212
1860
  }
1213
1861
 
1214
- size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1862
+ size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
1215
1863
  {
1216
1864
  /* validation checks */
1217
1865
  if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1221,14 +1869,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
1221
1869
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1222
1870
  (void)algoNb;
1223
1871
  assert(algoNb == 0);
1224
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1872
+ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1225
1873
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1226
1874
  (void)algoNb;
1227
1875
  assert(algoNb == 1);
1228
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1876
+ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1229
1877
  #else
1230
- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
1231
- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1878
+ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
1879
+ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1232
1880
  #endif
1233
1881
  }
1234
1882
  }