zstdlib 0.8.0-x64-mingw32 → 0.9.0-x64-mingw32

Sign up to get free protection for your applications and to get access to all the features.
Files changed (128) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGES.md +10 -0
  3. data/README.md +7 -1
  4. data/Rakefile +38 -8
  5. data/ext/{zstdlib → zstdlib_c}/extconf.rb +10 -5
  6. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.2/zstdlib.c +2 -2
  7. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.3/zstdlib.c +2 -2
  8. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.4/zstdlib.c +2 -2
  9. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.5/zstdlib.c +2 -2
  10. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.6/zstdlib.c +2 -2
  11. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.7/zstdlib.c +2 -2
  12. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-3.0/zstdlib.c +2 -2
  13. data/ext/zstdlib_c/ruby/zlib-3.1/zstdlib.c +5076 -0
  14. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/adler32.c +0 -0
  15. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/compress.c +0 -0
  16. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/crc32.c +0 -0
  17. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/crc32.h +0 -0
  18. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/deflate.c +0 -0
  19. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/deflate.h +0 -0
  20. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzclose.c +0 -0
  21. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzguts.h +0 -0
  22. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzlib.c +0 -0
  23. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzread.c +0 -0
  24. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzwrite.c +0 -0
  25. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/infback.c +0 -0
  26. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inffast.c +0 -0
  27. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inffast.h +0 -0
  28. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inffixed.h +0 -0
  29. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inflate.c +0 -0
  30. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inflate.h +0 -0
  31. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inftrees.c +0 -0
  32. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inftrees.h +0 -0
  33. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/trees.c +0 -0
  34. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/trees.h +0 -0
  35. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/uncompr.c +0 -0
  36. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/zconf.h +0 -0
  37. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/zlib.h +0 -0
  38. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/zutil.c +0 -0
  39. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/zutil.h +0 -0
  40. data/ext/{zstdlib → zstdlib_c}/zlib.mk +0 -0
  41. data/ext/{zstdlib → zstdlib_c}/zlibwrapper/zlibwrapper.c +1 -5
  42. data/ext/{zstdlib → zstdlib_c}/zlibwrapper.mk +0 -0
  43. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/bitstream.h +24 -9
  44. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/compiler.h +89 -43
  45. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/cpu.h +0 -0
  46. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/debug.c +0 -0
  47. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/debug.h +0 -0
  48. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/entropy_common.c +11 -5
  49. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/error_private.c +0 -0
  50. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/error_private.h +79 -0
  51. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/fse.h +2 -1
  52. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/fse_decompress.c +1 -1
  53. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/huf.h +24 -22
  54. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/mem.h +18 -0
  55. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/pool.c +11 -6
  56. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/pool.h +2 -2
  57. data/ext/zstdlib_c/zstd-1.5.2/lib/common/portability_macros.h +137 -0
  58. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/threading.c +0 -0
  59. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/threading.h +0 -0
  60. data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.c +24 -0
  61. data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.h +5686 -0
  62. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_common.c +0 -0
  63. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_deps.h +0 -0
  64. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_internal.h +95 -92
  65. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_trace.h +12 -3
  66. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/clevels.h +134 -0
  67. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/fse_compress.c +63 -27
  68. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/hist.c +0 -0
  69. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/hist.h +0 -0
  70. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/huf_compress.c +537 -104
  71. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress.c +307 -373
  72. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_internal.h +174 -83
  73. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_literals.c +4 -3
  74. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_literals.h +3 -1
  75. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_sequences.c +15 -14
  76. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_sequences.h +0 -0
  77. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_superblock.c +4 -3
  78. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_superblock.h +0 -0
  79. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_cwksp.h +41 -27
  80. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_double_fast.c +295 -120
  81. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_double_fast.h +0 -0
  82. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_fast.c +309 -130
  83. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_fast.h +0 -0
  84. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_lazy.c +482 -562
  85. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_lazy.h +0 -0
  86. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_ldm.c +9 -7
  87. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_ldm.h +1 -1
  88. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_ldm_geartab.h +4 -1
  89. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_opt.c +249 -148
  90. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_opt.h +0 -0
  91. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstdmt_compress.c +76 -38
  92. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstdmt_compress.h +4 -1
  93. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/huf_decompress.c +727 -189
  94. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/huf_decompress_amd64.S +585 -0
  95. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_ddict.c +0 -0
  96. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_ddict.h +0 -0
  97. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress.c +85 -22
  98. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_block.c +744 -220
  99. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_block.h +8 -2
  100. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_internal.h +34 -3
  101. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/zdict.h +4 -4
  102. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/zstd.h +179 -136
  103. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/zstd_errors.h +0 -0
  104. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzclose.c +0 -0
  105. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzcompatibility.h +0 -0
  106. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzguts.h +0 -0
  107. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzlib.c +0 -0
  108. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzread.c +0 -0
  109. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzwrite.c +0 -0
  110. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/zstd_zlibwrapper.c +7 -0
  111. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/zstd_zlibwrapper.h +0 -0
  112. data/ext/zstdlib_c/zstd.mk +15 -0
  113. data/lib/2.4/zstdlib_c.so +0 -0
  114. data/lib/2.5/zstdlib_c.so +0 -0
  115. data/lib/2.6/zstdlib_c.so +0 -0
  116. data/lib/2.7/zstdlib_c.so +0 -0
  117. data/lib/3.0/zstdlib_c.so +0 -0
  118. data/lib/zstdlib.rb +2 -2
  119. metadata +124 -121
  120. data/ext/zstdlib/zstd-1.5.0/lib/common/xxhash.c +0 -824
  121. data/ext/zstdlib/zstd-1.5.0/lib/common/xxhash.h +0 -285
  122. data/ext/zstdlib/zstd.mk +0 -14
  123. data/lib/2.2/zstdlib.so +0 -0
  124. data/lib/2.3/zstdlib.so +0 -0
  125. data/lib/2.4/zstdlib.so +0 -0
  126. data/lib/2.5/zstdlib.so +0 -0
  127. data/lib/2.6/zstdlib.so +0 -0
  128. data/lib/2.7/zstdlib.so +0 -0
@@ -22,6 +22,13 @@
22
22
  #define HUF_STATIC_LINKING_ONLY
23
23
  #include "../common/huf.h"
24
24
  #include "../common/error_private.h"
25
+ #include "../common/zstd_internal.h"
26
+
27
+ /* **************************************************************
28
+ * Constants
29
+ ****************************************************************/
30
+
31
+ #define HUF_DECODER_FAST_TABLELOG 11
25
32
 
26
33
  /* **************************************************************
27
34
  * Macros
@@ -36,6 +43,30 @@
36
43
  #error "Cannot force the use of the X1 and X2 decoders at the same time!"
37
44
  #endif
38
45
 
46
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2
47
+ # define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
48
+ #else
49
+ # define HUF_ASM_X86_64_BMI2_ATTRS
50
+ #endif
51
+
52
+ #ifdef __cplusplus
53
+ # define HUF_EXTERN_C extern "C"
54
+ #else
55
+ # define HUF_EXTERN_C
56
+ #endif
57
+ #define HUF_ASM_DECL HUF_EXTERN_C
58
+
59
+ #if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
60
+ # define HUF_NEED_BMI2_FUNCTION 1
61
+ #else
62
+ # define HUF_NEED_BMI2_FUNCTION 0
63
+ #endif
64
+
65
+ #if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
66
+ # define HUF_NEED_DEFAULT_FUNCTION 1
67
+ #else
68
+ # define HUF_NEED_DEFAULT_FUNCTION 0
69
+ #endif
39
70
 
40
71
  /* **************************************************************
41
72
  * Error Management
@@ -65,7 +96,7 @@
65
96
  return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
66
97
  } \
67
98
  \
68
- static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \
99
+ static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \
69
100
  void* dst, size_t dstSize, \
70
101
  const void* cSrc, size_t cSrcSize, \
71
102
  const HUF_DTable* DTable) \
@@ -107,13 +138,147 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
107
138
  return dtd;
108
139
  }
109
140
 
141
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
142
+
143
+ static size_t HUF_initDStream(BYTE const* ip) {
144
+ BYTE const lastByte = ip[7];
145
+ size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
146
+ size_t const value = MEM_readLEST(ip) | 1;
147
+ assert(bitsConsumed <= 8);
148
+ return value << bitsConsumed;
149
+ }
150
+ typedef struct {
151
+ BYTE const* ip[4];
152
+ BYTE* op[4];
153
+ U64 bits[4];
154
+ void const* dt;
155
+ BYTE const* ilimit;
156
+ BYTE* oend;
157
+ BYTE const* iend[4];
158
+ } HUF_DecompressAsmArgs;
159
+
160
+ /**
161
+ * Initializes args for the asm decoding loop.
162
+ * @returns 0 on success
163
+ * 1 if the fallback implementation should be used.
164
+ * Or an error code on failure.
165
+ */
166
+ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
167
+ {
168
+ void const* dt = DTable + 1;
169
+ U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
170
+
171
+ const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
172
+
173
+ BYTE* const oend = (BYTE*)dst + dstSize;
174
+
175
+ /* The following condition is false on x32 platform,
176
+ * but HUF_asm is not compatible with this ABI */
177
+ if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1;
178
+
179
+ /* strict minimum : jump table + 1 byte per stream */
180
+ if (srcSize < 10)
181
+ return ERROR(corruption_detected);
182
+
183
+ /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers.
184
+ * If table log is not correct at this point, fallback to the old decoder.
185
+ * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
186
+ */
187
+ if (dtLog != HUF_DECODER_FAST_TABLELOG)
188
+ return 1;
189
+
190
+ /* Read the jump table. */
191
+ {
192
+ const BYTE* const istart = (const BYTE*)src;
193
+ size_t const length1 = MEM_readLE16(istart);
194
+ size_t const length2 = MEM_readLE16(istart+2);
195
+ size_t const length3 = MEM_readLE16(istart+4);
196
+ size_t const length4 = srcSize - (length1 + length2 + length3 + 6);
197
+ args->iend[0] = istart + 6; /* jumpTable */
198
+ args->iend[1] = args->iend[0] + length1;
199
+ args->iend[2] = args->iend[1] + length2;
200
+ args->iend[3] = args->iend[2] + length3;
201
+
202
+ /* HUF_initDStream() requires this, and this small of an input
203
+ * won't benefit from the ASM loop anyways.
204
+ * length1 must be >= 16 so that ip[0] >= ilimit before the loop
205
+ * starts.
206
+ */
207
+ if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
208
+ return 1;
209
+ if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
210
+ }
211
+ /* ip[] contains the position that is currently loaded into bits[]. */
212
+ args->ip[0] = args->iend[1] - sizeof(U64);
213
+ args->ip[1] = args->iend[2] - sizeof(U64);
214
+ args->ip[2] = args->iend[3] - sizeof(U64);
215
+ args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64);
216
+
217
+ /* op[] contains the output pointers. */
218
+ args->op[0] = (BYTE*)dst;
219
+ args->op[1] = args->op[0] + (dstSize+3)/4;
220
+ args->op[2] = args->op[1] + (dstSize+3)/4;
221
+ args->op[3] = args->op[2] + (dstSize+3)/4;
222
+
223
+ /* No point to call the ASM loop for tiny outputs. */
224
+ if (args->op[3] >= oend)
225
+ return 1;
226
+
227
+ /* bits[] is the bit container.
228
+ * It is read from the MSB down to the LSB.
229
+ * It is shifted left as it is read, and zeros are
230
+ * shifted in. After the lowest valid bit a 1 is
231
+ * set, so that CountTrailingZeros(bits[]) can be used
232
+ * to count how many bits we've consumed.
233
+ */
234
+ args->bits[0] = HUF_initDStream(args->ip[0]);
235
+ args->bits[1] = HUF_initDStream(args->ip[1]);
236
+ args->bits[2] = HUF_initDStream(args->ip[2]);
237
+ args->bits[3] = HUF_initDStream(args->ip[3]);
238
+
239
+ /* If ip[] >= ilimit, it is guaranteed to be safe to
240
+ * reload bits[]. It may be beyond its section, but is
241
+ * guaranteed to be valid (>= istart).
242
+ */
243
+ args->ilimit = ilimit;
244
+
245
+ args->oend = oend;
246
+ args->dt = dt;
247
+
248
+ return 0;
249
+ }
250
+
251
+ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd)
252
+ {
253
+ /* Validate that we haven't overwritten. */
254
+ if (args->op[stream] > segmentEnd)
255
+ return ERROR(corruption_detected);
256
+ /* Validate that we haven't read beyond iend[].
257
+ * Note that ip[] may be < iend[] because the MSB is
258
+ * the next bit to read, and we may have consumed 100%
259
+ * of the stream, so down to iend[i] - 8 is valid.
260
+ */
261
+ if (args->ip[stream] < args->iend[stream] - 8)
262
+ return ERROR(corruption_detected);
263
+
264
+ /* Construct the BIT_DStream_t. */
265
+ bit->bitContainer = MEM_readLE64(args->ip[stream]);
266
+ bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
267
+ bit->start = (const char*)args->iend[0];
268
+ bit->limitPtr = bit->start + sizeof(size_t);
269
+ bit->ptr = (const char*)args->ip[stream];
270
+
271
+ return 0;
272
+ }
273
+ #endif
274
+
110
275
 
111
276
  #ifndef HUF_FORCE_DECOMPRESS_X2
112
277
 
113
278
  /*-***************************/
114
279
  /* single-symbol decoding */
115
280
  /*-***************************/
116
- typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */
281
+ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */
117
282
 
118
283
  /**
119
284
  * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at
@@ -122,14 +287,44 @@ typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decodi
122
287
  static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
123
288
  U64 D4;
124
289
  if (MEM_isLittleEndian()) {
125
- D4 = symbol + (nbBits << 8);
126
- } else {
127
290
  D4 = (symbol << 8) + nbBits;
291
+ } else {
292
+ D4 = symbol + (nbBits << 8);
128
293
  }
129
294
  D4 *= 0x0001000100010001ULL;
130
295
  return D4;
131
296
  }
132
297
 
298
+ /**
299
+ * Increase the tableLog to targetTableLog and rescales the stats.
300
+ * If tableLog > targetTableLog this is a no-op.
301
+ * @returns New tableLog
302
+ */
303
+ static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog)
304
+ {
305
+ if (tableLog > targetTableLog)
306
+ return tableLog;
307
+ if (tableLog < targetTableLog) {
308
+ U32 const scale = targetTableLog - tableLog;
309
+ U32 s;
310
+ /* Increase the weight for all non-zero probability symbols by scale. */
311
+ for (s = 0; s < nbSymbols; ++s) {
312
+ huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale);
313
+ }
314
+ /* Update rankVal to reflect the new weights.
315
+ * All weights except 0 get moved to weight + scale.
316
+ * Weights [1, scale] are empty.
317
+ */
318
+ for (s = targetTableLog; s > scale; --s) {
319
+ rankVal[s] = rankVal[s - scale];
320
+ }
321
+ for (s = scale; s > 0; --s) {
322
+ rankVal[s] = 0;
323
+ }
324
+ }
325
+ return targetTableLog;
326
+ }
327
+
133
328
  typedef struct {
134
329
  U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];
135
330
  U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1];
@@ -162,8 +357,12 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
162
357
  iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
163
358
  if (HUF_isError(iSize)) return iSize;
164
359
 
360
+
165
361
  /* Table header */
166
362
  { DTableDesc dtd = HUF_getDTableDesc(DTable);
363
+ U32 const maxTableLog = dtd.maxTableLog + 1;
364
+ U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG);
365
+ tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog);
167
366
  if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */
168
367
  dtd.tableType = 0;
169
368
  dtd.tableLog = (BYTE)tableLog;
@@ -207,7 +406,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
207
406
 
208
407
  /* fill DTable
209
408
  * We fill all entries of each weight in order.
210
- * That way length is a constant for each iteration of the outter loop.
409
+ * That way length is a constant for each iteration of the outer loop.
211
410
  * We can switch based on the length to a different inner loop which is
212
411
  * optimized for that particular case.
213
412
  */
@@ -304,11 +503,15 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
304
503
  BYTE* const pStart = p;
305
504
 
306
505
  /* up to 4 symbols at a time */
307
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
308
- HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
309
- HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
310
- HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
311
- HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
506
+ if ((pEnd - p) > 3) {
507
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
508
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
509
+ HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
510
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
511
+ HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
512
+ }
513
+ } else {
514
+ BIT_reloadDStream(bitDPtr);
312
515
  }
313
516
 
314
517
  /* [0-3] symbols remaining */
@@ -388,33 +591,36 @@ HUF_decompress4X1_usingDTable_internal_body(
388
591
  U32 endSignal = 1;
389
592
 
390
593
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
594
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
391
595
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
392
596
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
393
597
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
394
598
  CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
395
599
 
396
600
  /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
397
- for ( ; (endSignal) & (op4 < olimit) ; ) {
398
- HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
399
- HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
400
- HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
401
- HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
402
- HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
403
- HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
404
- HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
405
- HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
406
- HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
407
- HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
408
- HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
409
- HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
410
- HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
411
- HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
412
- HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
413
- HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
414
- endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
415
- endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
416
- endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
417
- endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
601
+ if ((size_t)(oend - op4) >= sizeof(size_t)) {
602
+ for ( ; (endSignal) & (op4 < olimit) ; ) {
603
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
604
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
605
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
606
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
607
+ HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
608
+ HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
609
+ HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
610
+ HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
611
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
612
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
613
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
614
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
615
+ HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
616
+ HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
617
+ HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
618
+ HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
619
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
620
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
621
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
622
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
623
+ }
418
624
  }
419
625
 
420
626
  /* check corruption */
@@ -440,6 +646,79 @@ HUF_decompress4X1_usingDTable_internal_body(
440
646
  }
441
647
  }
442
648
 
649
+ #if HUF_NEED_BMI2_FUNCTION
650
+ static BMI2_TARGET_ATTRIBUTE
651
+ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
652
+ size_t cSrcSize, HUF_DTable const* DTable) {
653
+ return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
654
+ }
655
+ #endif
656
+
657
+ #if HUF_NEED_DEFAULT_FUNCTION
658
+ static
659
+ size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
660
+ size_t cSrcSize, HUF_DTable const* DTable) {
661
+ return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
662
+ }
663
+ #endif
664
+
665
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
666
+
667
+ HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
668
+
669
+ static HUF_ASM_X86_64_BMI2_ATTRS
670
+ size_t
671
+ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
672
+ void* dst, size_t dstSize,
673
+ const void* cSrc, size_t cSrcSize,
674
+ const HUF_DTable* DTable)
675
+ {
676
+ void const* dt = DTable + 1;
677
+ const BYTE* const iend = (const BYTE*)cSrc + 6;
678
+ BYTE* const oend = (BYTE*)dst + dstSize;
679
+ HUF_DecompressAsmArgs args;
680
+ {
681
+ size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
682
+ FORWARD_IF_ERROR(ret, "Failed to init asm args");
683
+ if (ret != 0)
684
+ return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
685
+ }
686
+
687
+ assert(args.ip[0] >= args.ilimit);
688
+ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
689
+
690
+ /* Our loop guarantees that ip[] >= ilimit and that we haven't
691
+ * overwritten any op[].
692
+ */
693
+ assert(args.ip[0] >= iend);
694
+ assert(args.ip[1] >= iend);
695
+ assert(args.ip[2] >= iend);
696
+ assert(args.ip[3] >= iend);
697
+ assert(args.op[3] <= oend);
698
+ (void)iend;
699
+
700
+ /* finish bit streams one by one. */
701
+ {
702
+ size_t const segmentSize = (dstSize+3) / 4;
703
+ BYTE* segmentEnd = (BYTE*)dst;
704
+ int i;
705
+ for (i = 0; i < 4; ++i) {
706
+ BIT_DStream_t bit;
707
+ if (segmentSize <= (size_t)(oend - segmentEnd))
708
+ segmentEnd += segmentSize;
709
+ else
710
+ segmentEnd = oend;
711
+ FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
712
+ /* Decompress and validate that we've produced exactly the expected length. */
713
+ args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG);
714
+ if (args.op[i] != segmentEnd) return ERROR(corruption_detected);
715
+ }
716
+ }
717
+
718
+ /* decoded size */
719
+ return dstSize;
720
+ }
721
+ #endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
443
722
 
444
723
  typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
445
724
  const void *cSrc,
@@ -447,8 +726,28 @@ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
447
726
  const HUF_DTable *DTable);
448
727
 
449
728
  HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
450
- HUF_DGEN(HUF_decompress4X1_usingDTable_internal)
451
729
 
730
+ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
731
+ size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
732
+ {
733
+ #if DYNAMIC_BMI2
734
+ if (bmi2) {
735
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
736
+ return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
737
+ # else
738
+ return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
739
+ # endif
740
+ }
741
+ #else
742
+ (void)bmi2;
743
+ #endif
744
+
745
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
746
+ return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
747
+ #else
748
+ return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
749
+ #endif
750
+ }
452
751
 
453
752
 
454
753
  size_t HUF_decompress1X1_usingDTable(
@@ -518,106 +817,226 @@ size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
518
817
  /* *************************/
519
818
 
520
819
  typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */
521
- typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
820
+ typedef struct { BYTE symbol; } sortedSymbol_t;
522
821
  typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
523
822
  typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
524
823
 
824
+ /**
825
+ * Constructs a HUF_DEltX2 in a U32.
826
+ */
827
+ static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level)
828
+ {
829
+ U32 seq;
830
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0);
831
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2);
832
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3);
833
+ DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32));
834
+ if (MEM_isLittleEndian()) {
835
+ seq = level == 1 ? symbol : (baseSeq + (symbol << 8));
836
+ return seq + (nbBits << 16) + ((U32)level << 24);
837
+ } else {
838
+ seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol);
839
+ return (seq << 16) + (nbBits << 8) + (U32)level;
840
+ }
841
+ }
525
842
 
526
- /* HUF_fillDTableX2Level2() :
527
- * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
528
- static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed,
529
- const U32* rankValOrigin, const int minWeight,
530
- const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
531
- U32 nbBitsBaseline, U16 baseSeq, U32* wksp, size_t wkspSize)
843
+ /**
844
+ * Constructs a HUF_DEltX2.
845
+ */
846
+ static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level)
532
847
  {
533
848
  HUF_DEltX2 DElt;
534
- U32* rankVal = wksp;
849
+ U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
850
+ DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val));
851
+ ZSTD_memcpy(&DElt, &val, sizeof(val));
852
+ return DElt;
853
+ }
535
854
 
536
- assert(wkspSize >= HUF_TABLELOG_MAX + 1);
537
- (void)wkspSize;
538
- /* get pre-calculated rankVal */
539
- ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1));
855
+ /**
856
+ * Constructs 2 HUF_DEltX2s and packs them into a U64.
857
+ */
858
+ static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level)
859
+ {
860
+ U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
861
+ return (U64)DElt + ((U64)DElt << 32);
862
+ }
540
863
 
541
- /* fill skipped values */
542
- if (minWeight>1) {
543
- U32 i, skipSize = rankVal[minWeight];
544
- MEM_writeLE16(&(DElt.sequence), baseSeq);
545
- DElt.nbBits = (BYTE)(consumed);
546
- DElt.length = 1;
547
- for (i = 0; i < skipSize; i++)
548
- DTable[i] = DElt;
864
+ /**
865
+ * Fills the DTable rank with all the symbols from [begin, end) that are each
866
+ * nbBits long.
867
+ *
868
+ * @param DTableRank The start of the rank in the DTable.
869
+ * @param begin The first symbol to fill (inclusive).
870
+ * @param end The last symbol to fill (exclusive).
871
+ * @param nbBits Each symbol is nbBits long.
872
+ * @param tableLog The table log.
873
+ * @param baseSeq If level == 1 { 0 } else { the first level symbol }
874
+ * @param level The level in the table. Must be 1 or 2.
875
+ */
876
+ static void HUF_fillDTableX2ForWeight(
877
+ HUF_DEltX2* DTableRank,
878
+ sortedSymbol_t const* begin, sortedSymbol_t const* end,
879
+ U32 nbBits, U32 tableLog,
880
+ U16 baseSeq, int const level)
881
+ {
882
+ U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */);
883
+ const sortedSymbol_t* ptr;
884
+ assert(level >= 1 && level <= 2);
885
+ switch (length) {
886
+ case 1:
887
+ for (ptr = begin; ptr != end; ++ptr) {
888
+ HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
889
+ *DTableRank++ = DElt;
890
+ }
891
+ break;
892
+ case 2:
893
+ for (ptr = begin; ptr != end; ++ptr) {
894
+ HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
895
+ DTableRank[0] = DElt;
896
+ DTableRank[1] = DElt;
897
+ DTableRank += 2;
898
+ }
899
+ break;
900
+ case 4:
901
+ for (ptr = begin; ptr != end; ++ptr) {
902
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
903
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
904
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
905
+ DTableRank += 4;
906
+ }
907
+ break;
908
+ case 8:
909
+ for (ptr = begin; ptr != end; ++ptr) {
910
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
911
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
912
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
913
+ ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
914
+ ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
915
+ DTableRank += 8;
916
+ }
917
+ break;
918
+ default:
919
+ for (ptr = begin; ptr != end; ++ptr) {
920
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
921
+ HUF_DEltX2* const DTableRankEnd = DTableRank + length;
922
+ for (; DTableRank != DTableRankEnd; DTableRank += 8) {
923
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
924
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
925
+ ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
926
+ ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
927
+ }
928
+ }
929
+ break;
549
930
  }
931
+ }
550
932
 
551
- /* fill DTable */
552
- { U32 s; for (s=0; s<sortedListSize; s++) { /* note : sortedSymbols already skipped */
553
- const U32 symbol = sortedSymbols[s].symbol;
554
- const U32 weight = sortedSymbols[s].weight;
555
- const U32 nbBits = nbBitsBaseline - weight;
556
- const U32 length = 1 << (sizeLog-nbBits);
557
- const U32 start = rankVal[weight];
558
- U32 i = start;
559
- const U32 end = start + length;
560
-
561
- MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
562
- DElt.nbBits = (BYTE)(nbBits + consumed);
563
- DElt.length = 2;
564
- do { DTable[i++] = DElt; } while (i<end); /* since length >= 1 */
933
+ /* HUF_fillDTableX2Level2() :
934
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
935
+ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits,
936
+ const U32* rankVal, const int minWeight, const int maxWeight1,
937
+ const sortedSymbol_t* sortedSymbols, U32 const* rankStart,
938
+ U32 nbBitsBaseline, U16 baseSeq)
939
+ {
940
+ /* Fill skipped values (all positions up to rankVal[minWeight]).
941
+ * These are positions only get a single symbol because the combined weight
942
+ * is too large.
943
+ */
944
+ if (minWeight>1) {
945
+ U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */);
946
+ U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1);
947
+ int const skipSize = rankVal[minWeight];
948
+ assert(length > 1);
949
+ assert((U32)skipSize < length);
950
+ switch (length) {
951
+ case 2:
952
+ assert(skipSize == 1);
953
+ ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2));
954
+ break;
955
+ case 4:
956
+ assert(skipSize <= 4);
957
+ ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2));
958
+ ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2));
959
+ break;
960
+ default:
961
+ {
962
+ int i;
963
+ for (i = 0; i < skipSize; i += 8) {
964
+ ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2));
965
+ ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2));
966
+ ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2));
967
+ ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2));
968
+ }
969
+ }
970
+ }
971
+ }
565
972
 
566
- rankVal[weight] += length;
567
- } }
973
+ /* Fill each of the second level symbols by weight. */
974
+ {
975
+ int w;
976
+ for (w = minWeight; w < maxWeight1; ++w) {
977
+ int const begin = rankStart[w];
978
+ int const end = rankStart[w+1];
979
+ U32 const nbBits = nbBitsBaseline - w;
980
+ U32 const totalBits = nbBits + consumedBits;
981
+ HUF_fillDTableX2ForWeight(
982
+ DTable + rankVal[w],
983
+ sortedSymbols + begin, sortedSymbols + end,
984
+ totalBits, targetLog,
985
+ baseSeq, /* level */ 2);
986
+ }
987
+ }
568
988
  }
569
989
 
570
-
571
990
  static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
572
- const sortedSymbol_t* sortedList, const U32 sortedListSize,
991
+ const sortedSymbol_t* sortedList,
573
992
  const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
574
- const U32 nbBitsBaseline, U32* wksp, size_t wkspSize)
993
+ const U32 nbBitsBaseline)
575
994
  {
576
- U32* rankVal = wksp;
995
+ U32* const rankVal = rankValOrigin[0];
577
996
  const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */
578
997
  const U32 minBits = nbBitsBaseline - maxWeight;
579
- U32 s;
580
-
581
- assert(wkspSize >= HUF_TABLELOG_MAX + 1);
582
- wksp += HUF_TABLELOG_MAX + 1;
583
- wkspSize -= HUF_TABLELOG_MAX + 1;
584
-
585
- ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1));
586
-
587
- /* fill DTable */
588
- for (s=0; s<sortedListSize; s++) {
589
- const U16 symbol = sortedList[s].symbol;
590
- const U32 weight = sortedList[s].weight;
591
- const U32 nbBits = nbBitsBaseline - weight;
592
- const U32 start = rankVal[weight];
593
- const U32 length = 1 << (targetLog-nbBits);
594
-
595
- if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */
596
- U32 sortedRank;
998
+ int w;
999
+ int const wEnd = (int)maxWeight + 1;
1000
+
1001
+ /* Fill DTable in order of weight. */
1002
+ for (w = 1; w < wEnd; ++w) {
1003
+ int const begin = (int)rankStart[w];
1004
+ int const end = (int)rankStart[w+1];
1005
+ U32 const nbBits = nbBitsBaseline - w;
1006
+
1007
+ if (targetLog-nbBits >= minBits) {
1008
+ /* Enough room for a second symbol. */
1009
+ int start = rankVal[w];
1010
+ U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */);
597
1011
  int minWeight = nbBits + scaleLog;
1012
+ int s;
598
1013
  if (minWeight < 1) minWeight = 1;
599
- sortedRank = rankStart[minWeight];
600
- HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits,
601
- rankValOrigin[nbBits], minWeight,
602
- sortedList+sortedRank, sortedListSize-sortedRank,
603
- nbBitsBaseline, symbol, wksp, wkspSize);
1014
+ /* Fill the DTable for every symbol of weight w.
1015
+ * These symbols get at least 1 second symbol.
1016
+ */
1017
+ for (s = begin; s != end; ++s) {
1018
+ HUF_fillDTableX2Level2(
1019
+ DTable + start, targetLog, nbBits,
1020
+ rankValOrigin[nbBits], minWeight, wEnd,
1021
+ sortedList, rankStart,
1022
+ nbBitsBaseline, sortedList[s].symbol);
1023
+ start += length;
1024
+ }
604
1025
  } else {
605
- HUF_DEltX2 DElt;
606
- MEM_writeLE16(&(DElt.sequence), symbol);
607
- DElt.nbBits = (BYTE)(nbBits);
608
- DElt.length = 1;
609
- { U32 const end = start + length;
610
- U32 u;
611
- for (u = start; u < end; u++) DTable[u] = DElt;
612
- } }
613
- rankVal[weight] += length;
1026
+ /* Only a single symbol. */
1027
+ HUF_fillDTableX2ForWeight(
1028
+ DTable + rankVal[w],
1029
+ sortedList + begin, sortedList + end,
1030
+ nbBits, targetLog,
1031
+ /* baseSeq */ 0, /* level */ 1);
1032
+ }
614
1033
  }
615
1034
  }
616
1035
 
617
1036
  typedef struct {
618
1037
  rankValCol_t rankVal[HUF_TABLELOG_MAX];
619
1038
  U32 rankStats[HUF_TABLELOG_MAX + 1];
620
- U32 rankStart0[HUF_TABLELOG_MAX + 2];
1039
+ U32 rankStart0[HUF_TABLELOG_MAX + 3];
621
1040
  sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
622
1041
  BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
623
1042
  U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
@@ -627,9 +1046,16 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
627
1046
  const void* src, size_t srcSize,
628
1047
  void* workSpace, size_t wkspSize)
629
1048
  {
630
- U32 tableLog, maxW, sizeOfSort, nbSymbols;
1049
+ return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
1050
+ }
1051
+
1052
+ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
1053
+ const void* src, size_t srcSize,
1054
+ void* workSpace, size_t wkspSize, int bmi2)
1055
+ {
1056
+ U32 tableLog, maxW, nbSymbols;
631
1057
  DTableDesc dtd = HUF_getDTableDesc(DTable);
632
- U32 const maxTableLog = dtd.maxTableLog;
1058
+ U32 maxTableLog = dtd.maxTableLog;
633
1059
  size_t iSize;
634
1060
  void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */
635
1061
  HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
@@ -647,11 +1073,12 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
647
1073
  if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
648
1074
  /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
649
1075
 
650
- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), /* bmi2 */ 0);
1076
+ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
651
1077
  if (HUF_isError(iSize)) return iSize;
652
1078
 
653
1079
  /* check result */
654
1080
  if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */
1081
+ if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG;
655
1082
 
656
1083
  /* find maxWeight */
657
1084
  for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
@@ -664,7 +1091,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
664
1091
  rankStart[w] = curr;
665
1092
  }
666
1093
  rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/
667
- sizeOfSort = nextRankStart;
1094
+ rankStart[maxW+1] = nextRankStart;
668
1095
  }
669
1096
 
670
1097
  /* sort symbols by weight */
@@ -673,7 +1100,6 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
673
1100
  U32 const w = wksp->weightList[s];
674
1101
  U32 const r = rankStart[w]++;
675
1102
  wksp->sortedSymbol[r].symbol = (BYTE)s;
676
- wksp->sortedSymbol[r].weight = (BYTE)w;
677
1103
  }
678
1104
  rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */
679
1105
  }
@@ -698,10 +1124,9 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
698
1124
  } } } }
699
1125
 
700
1126
  HUF_fillDTableX2(dt, maxTableLog,
701
- wksp->sortedSymbol, sizeOfSort,
1127
+ wksp->sortedSymbol,
702
1128
  wksp->rankStart0, wksp->rankVal, maxW,
703
- tableLog+1,
704
- wksp->calleeWksp, sizeof(wksp->calleeWksp) / sizeof(U32));
1129
+ tableLog+1);
705
1130
 
706
1131
  dtd.tableLog = (BYTE)maxTableLog;
707
1132
  dtd.tableType = 1;
@@ -714,7 +1139,7 @@ FORCE_INLINE_TEMPLATE U32
714
1139
  HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
715
1140
  {
716
1141
  size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
717
- ZSTD_memcpy(op, dt+val, 2);
1142
+ ZSTD_memcpy(op, &dt[val].sequence, 2);
718
1143
  BIT_skipBits(DStream, dt[val].nbBits);
719
1144
  return dt[val].length;
720
1145
  }
@@ -723,15 +1148,17 @@ FORCE_INLINE_TEMPLATE U32
723
1148
  HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
724
1149
  {
725
1150
  size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
726
- ZSTD_memcpy(op, dt+val, 1);
727
- if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
728
- else {
1151
+ ZSTD_memcpy(op, &dt[val].sequence, 1);
1152
+ if (dt[val].length==1) {
1153
+ BIT_skipBits(DStream, dt[val].nbBits);
1154
+ } else {
729
1155
  if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
730
1156
  BIT_skipBits(DStream, dt[val].nbBits);
731
1157
  if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
732
1158
  /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
733
1159
  DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
734
- } }
1160
+ }
1161
+ }
735
1162
  return 1;
736
1163
  }
737
1164
 
@@ -753,19 +1180,37 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
753
1180
  BYTE* const pStart = p;
754
1181
 
755
1182
  /* up to 8 symbols at a time */
756
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
757
- HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
758
- HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
759
- HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
760
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1183
+ if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) {
1184
+ if (dtLog <= 11 && MEM_64bits()) {
1185
+ /* up to 10 symbols at a time */
1186
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) {
1187
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1188
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1189
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1190
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1191
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1192
+ }
1193
+ } else {
1194
+ /* up to 8 symbols at a time */
1195
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
1196
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
1197
+ HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
1198
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
1199
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1200
+ }
1201
+ }
1202
+ } else {
1203
+ BIT_reloadDStream(bitDPtr);
761
1204
  }
762
1205
 
763
1206
  /* closer to end : up to 2 symbols at a time */
764
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
765
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1207
+ if ((size_t)(pEnd - p) >= 2) {
1208
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
1209
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
766
1210
 
767
- while (p <= pEnd-2)
768
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
1211
+ while (p <= pEnd-2)
1212
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
1213
+ }
769
1214
 
770
1215
  if (p < pEnd)
771
1216
  p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
@@ -799,7 +1244,6 @@ HUF_decompress1X2_usingDTable_internal_body(
799
1244
  /* decoded size */
800
1245
  return dstSize;
801
1246
  }
802
-
803
1247
  FORCE_INLINE_TEMPLATE size_t
804
1248
  HUF_decompress4X2_usingDTable_internal_body(
805
1249
  void* dst, size_t dstSize,
@@ -841,57 +1285,60 @@ HUF_decompress4X2_usingDTable_internal_body(
841
1285
  U32 const dtLog = dtd.tableLog;
842
1286
 
843
1287
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1288
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
844
1289
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
845
1290
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
846
1291
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
847
1292
  CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
848
1293
 
849
1294
  /* 16-32 symbols per loop (4-8 symbols per stream) */
850
- for ( ; (endSignal) & (op4 < olimit); ) {
1295
+ if ((size_t)(oend - op4) >= sizeof(size_t)) {
1296
+ for ( ; (endSignal) & (op4 < olimit); ) {
851
1297
  #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
852
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
853
- HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
854
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
855
- HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
856
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
857
- HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
858
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
859
- HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
860
- endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
861
- endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
862
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
863
- HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
864
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
865
- HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
866
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
867
- HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
868
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
869
- HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
870
- endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
871
- endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
1298
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1299
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
1300
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1301
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
1302
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1303
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
1304
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1305
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
1306
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
1307
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
1308
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1309
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
1310
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1311
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
1312
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1313
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
1314
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1315
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
1316
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
1317
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
872
1318
  #else
873
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
874
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
875
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
876
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
877
- HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
878
- HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
879
- HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
880
- HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
881
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
882
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
883
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
884
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
885
- HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
886
- HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
887
- HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
888
- HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
889
- endSignal = (U32)LIKELY(
890
- (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
891
- & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
892
- & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
893
- & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
1319
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1320
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1321
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1322
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1323
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
1324
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
1325
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
1326
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
1327
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1328
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1329
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1330
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1331
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
1332
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
1333
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
1334
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
1335
+ endSignal = (U32)LIKELY((U32)
1336
+ (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
1337
+ & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
1338
+ & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
1339
+ & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
894
1340
  #endif
1341
+ }
895
1342
  }
896
1343
 
897
1344
  /* check corruption */
@@ -915,8 +1362,99 @@ HUF_decompress4X2_usingDTable_internal_body(
915
1362
  }
916
1363
  }
917
1364
 
1365
+ #if HUF_NEED_BMI2_FUNCTION
1366
+ static BMI2_TARGET_ATTRIBUTE
1367
+ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
1368
+ size_t cSrcSize, HUF_DTable const* DTable) {
1369
+ return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
1370
+ }
1371
+ #endif
1372
+
1373
+ #if HUF_NEED_DEFAULT_FUNCTION
1374
+ static
1375
+ size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
1376
+ size_t cSrcSize, HUF_DTable const* DTable) {
1377
+ return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
1378
+ }
1379
+ #endif
1380
+
1381
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
1382
+
1383
+ HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
1384
+
1385
+ static HUF_ASM_X86_64_BMI2_ATTRS size_t
1386
+ HUF_decompress4X2_usingDTable_internal_bmi2_asm(
1387
+ void* dst, size_t dstSize,
1388
+ const void* cSrc, size_t cSrcSize,
1389
+ const HUF_DTable* DTable) {
1390
+ void const* dt = DTable + 1;
1391
+ const BYTE* const iend = (const BYTE*)cSrc + 6;
1392
+ BYTE* const oend = (BYTE*)dst + dstSize;
1393
+ HUF_DecompressAsmArgs args;
1394
+ {
1395
+ size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
1396
+ FORWARD_IF_ERROR(ret, "Failed to init asm args");
1397
+ if (ret != 0)
1398
+ return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
1399
+ }
1400
+
1401
+ assert(args.ip[0] >= args.ilimit);
1402
+ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args);
1403
+
1404
+ /* note : op4 already verified within main loop */
1405
+ assert(args.ip[0] >= iend);
1406
+ assert(args.ip[1] >= iend);
1407
+ assert(args.ip[2] >= iend);
1408
+ assert(args.ip[3] >= iend);
1409
+ assert(args.op[3] <= oend);
1410
+ (void)iend;
1411
+
1412
+ /* finish bitStreams one by one */
1413
+ {
1414
+ size_t const segmentSize = (dstSize+3) / 4;
1415
+ BYTE* segmentEnd = (BYTE*)dst;
1416
+ int i;
1417
+ for (i = 0; i < 4; ++i) {
1418
+ BIT_DStream_t bit;
1419
+ if (segmentSize <= (size_t)(oend - segmentEnd))
1420
+ segmentEnd += segmentSize;
1421
+ else
1422
+ segmentEnd = oend;
1423
+ FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
1424
+ args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG);
1425
+ if (args.op[i] != segmentEnd)
1426
+ return ERROR(corruption_detected);
1427
+ }
1428
+ }
1429
+
1430
+ /* decoded size */
1431
+ return dstSize;
1432
+ }
1433
+ #endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
1434
+
1435
+ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
1436
+ size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
1437
+ {
1438
+ #if DYNAMIC_BMI2
1439
+ if (bmi2) {
1440
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
1441
+ return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
1442
+ # else
1443
+ return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
1444
+ # endif
1445
+ }
1446
+ #else
1447
+ (void)bmi2;
1448
+ #endif
1449
+
1450
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
1451
+ return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
1452
+ #else
1453
+ return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
1454
+ #endif
1455
+ }
1456
+
918
1457
  HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
919
- HUF_DGEN(HUF_decompress4X2_usingDTable_internal)
920
1458
 
921
1459
  size_t HUF_decompress1X2_usingDTable(
922
1460
  void* dst, size_t dstSize,
@@ -1025,25 +1563,25 @@ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
1025
1563
 
1026
1564
  #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1027
1565
  typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
1028
- static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
1566
+ static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] =
1029
1567
  {
1030
1568
  /* single, double, quad */
1031
- {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */
1032
- {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */
1033
- {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */
1034
- {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */
1035
- {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */
1036
- {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */
1037
- {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */
1038
- {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */
1039
- {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */
1040
- {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */
1041
- {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */
1042
- {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */
1043
- {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */
1044
- {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */
1045
- {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */
1046
- {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */
1569
+ {{0,0}, {1,1}}, /* Q==0 : impossible */
1570
+ {{0,0}, {1,1}}, /* Q==1 : impossible */
1571
+ {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */
1572
+ {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */
1573
+ {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */
1574
+ {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */
1575
+ {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */
1576
+ {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */
1577
+ {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */
1578
+ {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */
1579
+ {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */
1580
+ {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */
1581
+ {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */
1582
+ {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */
1583
+ {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */
1584
+ {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */
1047
1585
  };
1048
1586
  #endif
1049
1587
 
@@ -1070,7 +1608,7 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
1070
1608
  U32 const D256 = (U32)(dstSize >> 8);
1071
1609
  U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
1072
1610
  U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
1073
- DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */
1611
+ DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */
1074
1612
  return DTime1 < DTime0;
1075
1613
  }
1076
1614
  #endif