zstdlib 0.7.0-x86-mingw32 → 0.10.0-x86-mingw32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. checksums.yaml +4 -4
  2. data/CHANGES.md +20 -0
  3. data/README.md +7 -1
  4. data/Rakefile +38 -8
  5. data/ext/{zstdlib → zstdlib_c}/extconf.rb +11 -6
  6. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.2/zstdlib.c +2 -2
  7. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.3/zstdlib.c +2 -2
  8. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.4/zstdlib.c +2 -2
  9. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.5/zstdlib.c +2 -2
  10. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.6/zstdlib.c +2 -2
  11. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.7/zstdlib.c +2 -2
  12. data/ext/zstdlib_c/ruby/zlib-3.0/zstdlib.c +4994 -0
  13. data/ext/zstdlib_c/ruby/zlib-3.1/zstdlib.c +5076 -0
  14. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/adler32.c +0 -0
  15. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/compress.c +0 -0
  16. data/ext/zstdlib_c/zlib-1.2.12/crc32.c +1116 -0
  17. data/ext/zstdlib_c/zlib-1.2.12/crc32.h +9446 -0
  18. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/deflate.c +78 -30
  19. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/deflate.h +12 -15
  20. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/gzclose.c +0 -0
  21. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/gzguts.h +3 -2
  22. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/gzlib.c +5 -3
  23. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/gzread.c +5 -7
  24. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/gzwrite.c +25 -13
  25. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/infback.c +2 -1
  26. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inffast.c +14 -14
  27. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inffast.h +0 -0
  28. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inffixed.h +0 -0
  29. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inflate.c +39 -8
  30. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inflate.h +3 -2
  31. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inftrees.c +3 -3
  32. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inftrees.h +0 -0
  33. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/trees.c +27 -48
  34. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/trees.h +0 -0
  35. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/uncompr.c +0 -0
  36. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/zconf.h +0 -0
  37. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/zlib.h +123 -100
  38. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/zutil.c +2 -2
  39. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/zutil.h +12 -9
  40. data/ext/{zstdlib → zstdlib_c}/zlib.mk +0 -0
  41. data/ext/{zstdlib → zstdlib_c}/zlibwrapper/zlibwrapper.c +1 -5
  42. data/ext/{zstdlib → zstdlib_c}/zlibwrapper.mk +0 -0
  43. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/bitstream.h +46 -22
  44. data/ext/zstdlib_c/zstd-1.5.2/lib/common/compiler.h +335 -0
  45. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/cpu.h +1 -3
  46. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/debug.c +1 -1
  47. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/debug.h +12 -19
  48. data/ext/zstdlib_c/zstd-1.5.2/lib/common/entropy_common.c +368 -0
  49. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/error_private.c +2 -1
  50. data/ext/zstdlib_c/zstd-1.5.2/lib/common/error_private.h +159 -0
  51. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/fse.h +41 -12
  52. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/fse_decompress.c +139 -22
  53. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/huf.h +47 -23
  54. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/mem.h +87 -98
  55. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/pool.c +34 -23
  56. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/pool.h +4 -4
  57. data/ext/zstdlib_c/zstd-1.5.2/lib/common/portability_macros.h +137 -0
  58. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/threading.c +6 -5
  59. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/threading.h +0 -0
  60. data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.c +24 -0
  61. data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.h +5686 -0
  62. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_common.c +10 -10
  63. data/ext/zstdlib_c/zstd-1.5.2/lib/common/zstd_deps.h +111 -0
  64. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_internal.h +191 -145
  65. data/ext/zstdlib_c/zstd-1.5.2/lib/common/zstd_trace.h +163 -0
  66. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/clevels.h +134 -0
  67. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/fse_compress.c +89 -46
  68. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/hist.c +27 -29
  69. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/hist.h +2 -2
  70. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/huf_compress.c +1370 -0
  71. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress.c +2917 -868
  72. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_internal.h +458 -125
  73. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_literals.c +12 -11
  74. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_literals.h +4 -2
  75. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_sequences.c +41 -18
  76. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_sequences.h +1 -1
  77. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_superblock.c +26 -298
  78. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_superblock.h +1 -1
  79. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_cwksp.h +234 -83
  80. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_double_fast.c +313 -138
  81. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_double_fast.h +1 -1
  82. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_fast.c +329 -150
  83. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_fast.h +1 -1
  84. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_lazy.c +2104 -0
  85. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_lazy.h +125 -0
  86. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_ldm.c +321 -216
  87. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_ldm.h +9 -2
  88. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_ldm_geartab.h +106 -0
  89. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_opt.c +412 -166
  90. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_opt.h +1 -1
  91. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstdmt_compress.c +169 -453
  92. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstdmt_compress.h +113 -0
  93. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/huf_decompress.c +1044 -403
  94. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/huf_decompress_amd64.S +585 -0
  95. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_ddict.c +9 -9
  96. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_ddict.h +2 -2
  97. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress.c +450 -105
  98. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_block.c +913 -273
  99. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_block.h +14 -5
  100. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_internal.h +59 -12
  101. data/ext/zstdlib_c/zstd-1.5.2/lib/zdict.h +452 -0
  102. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/zstd.h +699 -214
  103. data/ext/{zstdlib/zstd-1.4.5/lib/common → zstdlib_c/zstd-1.5.2/lib}/zstd_errors.h +2 -1
  104. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzclose.c +0 -0
  105. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzcompatibility.h +1 -1
  106. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzguts.h +0 -0
  107. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzlib.c +0 -0
  108. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzread.c +0 -0
  109. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzwrite.c +0 -0
  110. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/zstd_zlibwrapper.c +133 -44
  111. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/zstd_zlibwrapper.h +1 -1
  112. data/ext/zstdlib_c/zstd.mk +15 -0
  113. data/lib/2.4/zstdlib_c.so +0 -0
  114. data/lib/2.5/zstdlib_c.so +0 -0
  115. data/lib/2.6/zstdlib_c.so +0 -0
  116. data/lib/2.7/zstdlib_c.so +0 -0
  117. data/lib/3.0/zstdlib_c.so +0 -0
  118. data/lib/3.1/zstdlib_c.so +0 -0
  119. data/lib/zstdlib.rb +2 -2
  120. metadata +125 -116
  121. data/ext/zstdlib/zlib-1.2.11/crc32.c +0 -442
  122. data/ext/zstdlib/zlib-1.2.11/crc32.h +0 -441
  123. data/ext/zstdlib/zstd-1.4.5/lib/common/compiler.h +0 -175
  124. data/ext/zstdlib/zstd-1.4.5/lib/common/entropy_common.c +0 -216
  125. data/ext/zstdlib/zstd-1.4.5/lib/common/error_private.h +0 -80
  126. data/ext/zstdlib/zstd-1.4.5/lib/common/xxhash.c +0 -864
  127. data/ext/zstdlib/zstd-1.4.5/lib/common/xxhash.h +0 -285
  128. data/ext/zstdlib/zstd-1.4.5/lib/compress/huf_compress.c +0 -798
  129. data/ext/zstdlib/zstd-1.4.5/lib/compress/zstd_lazy.c +0 -1138
  130. data/ext/zstdlib/zstd-1.4.5/lib/compress/zstd_lazy.h +0 -67
  131. data/ext/zstdlib/zstd-1.4.5/lib/compress/zstdmt_compress.h +0 -192
  132. data/ext/zstdlib/zstd.mk +0 -14
  133. data/lib/2.2/zstdlib.so +0 -0
  134. data/lib/2.3/zstdlib.so +0 -0
  135. data/lib/2.4/zstdlib.so +0 -0
  136. data/lib/2.5/zstdlib.so +0 -0
  137. data/lib/2.6/zstdlib.so +0 -0
  138. data/lib/2.7/zstdlib.so +0 -0
@@ -1,7 +1,7 @@
1
1
  /* ******************************************************************
2
2
  * huff0 huffman decoder,
3
3
  * part of Finite State Entropy library
4
- * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
4
+ * Copyright (c) Yann Collet, Facebook, Inc.
5
5
  *
6
6
  * You can contact the author at :
7
7
  * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -15,13 +15,20 @@
15
15
  /* **************************************************************
16
16
  * Dependencies
17
17
  ****************************************************************/
18
- #include <string.h> /* memcpy, memset */
18
+ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset */
19
19
  #include "../common/compiler.h"
20
20
  #include "../common/bitstream.h" /* BIT_* */
21
21
  #include "../common/fse.h" /* to compress headers */
22
22
  #define HUF_STATIC_LINKING_ONLY
23
23
  #include "../common/huf.h"
24
24
  #include "../common/error_private.h"
25
+ #include "../common/zstd_internal.h"
26
+
27
+ /* **************************************************************
28
+ * Constants
29
+ ****************************************************************/
30
+
31
+ #define HUF_DECODER_FAST_TABLELOG 11
25
32
 
26
33
  /* **************************************************************
27
34
  * Macros
@@ -36,6 +43,30 @@
36
43
  #error "Cannot force the use of the X1 and X2 decoders at the same time!"
37
44
  #endif
38
45
 
46
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2
47
+ # define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
48
+ #else
49
+ # define HUF_ASM_X86_64_BMI2_ATTRS
50
+ #endif
51
+
52
+ #ifdef __cplusplus
53
+ # define HUF_EXTERN_C extern "C"
54
+ #else
55
+ # define HUF_EXTERN_C
56
+ #endif
57
+ #define HUF_ASM_DECL HUF_EXTERN_C
58
+
59
+ #if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
60
+ # define HUF_NEED_BMI2_FUNCTION 1
61
+ #else
62
+ # define HUF_NEED_BMI2_FUNCTION 0
63
+ #endif
64
+
65
+ #if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
66
+ # define HUF_NEED_DEFAULT_FUNCTION 1
67
+ #else
68
+ # define HUF_NEED_DEFAULT_FUNCTION 0
69
+ #endif
39
70
 
40
71
  /* **************************************************************
41
72
  * Error Management
@@ -65,7 +96,7 @@
65
96
  return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
66
97
  } \
67
98
  \
68
- static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \
99
+ static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \
69
100
  void* dst, size_t dstSize, \
70
101
  const void* cSrc, size_t cSrcSize, \
71
102
  const HUF_DTable* DTable) \
@@ -103,92 +134,347 @@ typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved;
103
134
  static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
104
135
  {
105
136
  DTableDesc dtd;
106
- memcpy(&dtd, table, sizeof(dtd));
137
+ ZSTD_memcpy(&dtd, table, sizeof(dtd));
107
138
  return dtd;
108
139
  }
109
140
 
141
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
142
+
143
+ static size_t HUF_initDStream(BYTE const* ip) {
144
+ BYTE const lastByte = ip[7];
145
+ size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
146
+ size_t const value = MEM_readLEST(ip) | 1;
147
+ assert(bitsConsumed <= 8);
148
+ return value << bitsConsumed;
149
+ }
150
+ typedef struct {
151
+ BYTE const* ip[4];
152
+ BYTE* op[4];
153
+ U64 bits[4];
154
+ void const* dt;
155
+ BYTE const* ilimit;
156
+ BYTE* oend;
157
+ BYTE const* iend[4];
158
+ } HUF_DecompressAsmArgs;
159
+
160
+ /**
161
+ * Initializes args for the asm decoding loop.
162
+ * @returns 0 on success
163
+ * 1 if the fallback implementation should be used.
164
+ * Or an error code on failure.
165
+ */
166
+ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
167
+ {
168
+ void const* dt = DTable + 1;
169
+ U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
170
+
171
+ const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
172
+
173
+ BYTE* const oend = (BYTE*)dst + dstSize;
174
+
175
+ /* The following condition is false on x32 platform,
176
+ * but HUF_asm is not compatible with this ABI */
177
+ if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1;
178
+
179
+ /* strict minimum : jump table + 1 byte per stream */
180
+ if (srcSize < 10)
181
+ return ERROR(corruption_detected);
182
+
183
+ /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers.
184
+ * If table log is not correct at this point, fallback to the old decoder.
185
+ * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
186
+ */
187
+ if (dtLog != HUF_DECODER_FAST_TABLELOG)
188
+ return 1;
189
+
190
+ /* Read the jump table. */
191
+ {
192
+ const BYTE* const istart = (const BYTE*)src;
193
+ size_t const length1 = MEM_readLE16(istart);
194
+ size_t const length2 = MEM_readLE16(istart+2);
195
+ size_t const length3 = MEM_readLE16(istart+4);
196
+ size_t const length4 = srcSize - (length1 + length2 + length3 + 6);
197
+ args->iend[0] = istart + 6; /* jumpTable */
198
+ args->iend[1] = args->iend[0] + length1;
199
+ args->iend[2] = args->iend[1] + length2;
200
+ args->iend[3] = args->iend[2] + length3;
201
+
202
+ /* HUF_initDStream() requires this, and this small of an input
203
+ * won't benefit from the ASM loop anyways.
204
+ * length1 must be >= 16 so that ip[0] >= ilimit before the loop
205
+ * starts.
206
+ */
207
+ if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
208
+ return 1;
209
+ if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
210
+ }
211
+ /* ip[] contains the position that is currently loaded into bits[]. */
212
+ args->ip[0] = args->iend[1] - sizeof(U64);
213
+ args->ip[1] = args->iend[2] - sizeof(U64);
214
+ args->ip[2] = args->iend[3] - sizeof(U64);
215
+ args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64);
216
+
217
+ /* op[] contains the output pointers. */
218
+ args->op[0] = (BYTE*)dst;
219
+ args->op[1] = args->op[0] + (dstSize+3)/4;
220
+ args->op[2] = args->op[1] + (dstSize+3)/4;
221
+ args->op[3] = args->op[2] + (dstSize+3)/4;
222
+
223
+ /* No point to call the ASM loop for tiny outputs. */
224
+ if (args->op[3] >= oend)
225
+ return 1;
226
+
227
+ /* bits[] is the bit container.
228
+ * It is read from the MSB down to the LSB.
229
+ * It is shifted left as it is read, and zeros are
230
+ * shifted in. After the lowest valid bit a 1 is
231
+ * set, so that CountTrailingZeros(bits[]) can be used
232
+ * to count how many bits we've consumed.
233
+ */
234
+ args->bits[0] = HUF_initDStream(args->ip[0]);
235
+ args->bits[1] = HUF_initDStream(args->ip[1]);
236
+ args->bits[2] = HUF_initDStream(args->ip[2]);
237
+ args->bits[3] = HUF_initDStream(args->ip[3]);
238
+
239
+ /* If ip[] >= ilimit, it is guaranteed to be safe to
240
+ * reload bits[]. It may be beyond its section, but is
241
+ * guaranteed to be valid (>= istart).
242
+ */
243
+ args->ilimit = ilimit;
244
+
245
+ args->oend = oend;
246
+ args->dt = dt;
247
+
248
+ return 0;
249
+ }
250
+
251
+ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd)
252
+ {
253
+ /* Validate that we haven't overwritten. */
254
+ if (args->op[stream] > segmentEnd)
255
+ return ERROR(corruption_detected);
256
+ /* Validate that we haven't read beyond iend[].
257
+ * Note that ip[] may be < iend[] because the MSB is
258
+ * the next bit to read, and we may have consumed 100%
259
+ * of the stream, so down to iend[i] - 8 is valid.
260
+ */
261
+ if (args->ip[stream] < args->iend[stream] - 8)
262
+ return ERROR(corruption_detected);
263
+
264
+ /* Construct the BIT_DStream_t. */
265
+ bit->bitContainer = MEM_readLE64(args->ip[stream]);
266
+ bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
267
+ bit->start = (const char*)args->iend[0];
268
+ bit->limitPtr = bit->start + sizeof(size_t);
269
+ bit->ptr = (const char*)args->ip[stream];
270
+
271
+ return 0;
272
+ }
273
+ #endif
274
+
110
275
 
111
276
  #ifndef HUF_FORCE_DECOMPRESS_X2
112
277
 
113
278
  /*-***************************/
114
279
  /* single-symbol decoding */
115
280
  /*-***************************/
116
- typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */
281
+ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */
282
+
283
+ /**
284
+ * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at
285
+ * a time.
286
+ */
287
+ static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
288
+ U64 D4;
289
+ if (MEM_isLittleEndian()) {
290
+ D4 = (symbol << 8) + nbBits;
291
+ } else {
292
+ D4 = symbol + (nbBits << 8);
293
+ }
294
+ D4 *= 0x0001000100010001ULL;
295
+ return D4;
296
+ }
297
+
298
+ /**
299
+ * Increase the tableLog to targetTableLog and rescales the stats.
300
+ * If tableLog > targetTableLog this is a no-op.
301
+ * @returns New tableLog
302
+ */
303
+ static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog)
304
+ {
305
+ if (tableLog > targetTableLog)
306
+ return tableLog;
307
+ if (tableLog < targetTableLog) {
308
+ U32 const scale = targetTableLog - tableLog;
309
+ U32 s;
310
+ /* Increase the weight for all non-zero probability symbols by scale. */
311
+ for (s = 0; s < nbSymbols; ++s) {
312
+ huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale);
313
+ }
314
+ /* Update rankVal to reflect the new weights.
315
+ * All weights except 0 get moved to weight + scale.
316
+ * Weights [1, scale] are empty.
317
+ */
318
+ for (s = targetTableLog; s > scale; --s) {
319
+ rankVal[s] = rankVal[s - scale];
320
+ }
321
+ for (s = scale; s > 0; --s) {
322
+ rankVal[s] = 0;
323
+ }
324
+ }
325
+ return targetTableLog;
326
+ }
327
+
328
+ typedef struct {
329
+ U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];
330
+ U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1];
331
+ U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
332
+ BYTE symbols[HUF_SYMBOLVALUE_MAX + 1];
333
+ BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
334
+ } HUF_ReadDTableX1_Workspace;
335
+
117
336
 
118
337
  size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
338
+ {
339
+ return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
340
+ }
341
+
342
+ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2)
119
343
  {
120
344
  U32 tableLog = 0;
121
345
  U32 nbSymbols = 0;
122
346
  size_t iSize;
123
347
  void* const dtPtr = DTable + 1;
124
348
  HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr;
349
+ HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace;
125
350
 
126
- U32* rankVal;
127
- BYTE* huffWeight;
128
- size_t spaceUsed32 = 0;
129
-
130
- rankVal = (U32 *)workSpace + spaceUsed32;
131
- spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
132
- huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
133
- spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
134
-
135
- if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
351
+ DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp));
352
+ if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge);
136
353
 
137
354
  DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
138
- /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
355
+ /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
139
356
 
140
- iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
357
+ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
141
358
  if (HUF_isError(iSize)) return iSize;
142
359
 
360
+
143
361
  /* Table header */
144
362
  { DTableDesc dtd = HUF_getDTableDesc(DTable);
363
+ U32 const maxTableLog = dtd.maxTableLog + 1;
364
+ U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG);
365
+ tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog);
145
366
  if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */
146
367
  dtd.tableType = 0;
147
368
  dtd.tableLog = (BYTE)tableLog;
148
- memcpy(DTable, &dtd, sizeof(dtd));
369
+ ZSTD_memcpy(DTable, &dtd, sizeof(dtd));
149
370
  }
150
371
 
151
- /* Calculate starting value for each rank */
152
- { U32 n, nextRankStart = 0;
153
- for (n=1; n<tableLog+1; n++) {
154
- U32 const current = nextRankStart;
155
- nextRankStart += (rankVal[n] << (n-1));
156
- rankVal[n] = current;
157
- } }
158
-
159
- /* fill DTable */
160
- { U32 n;
161
- size_t const nEnd = nbSymbols;
162
- for (n=0; n<nEnd; n++) {
163
- size_t const w = huffWeight[n];
164
- size_t const length = (1 << w) >> 1;
165
- size_t const uStart = rankVal[w];
166
- size_t const uEnd = uStart + length;
167
- size_t u;
168
- HUF_DEltX1 D;
169
- D.byte = (BYTE)n;
170
- D.nbBits = (BYTE)(tableLog + 1 - w);
171
- rankVal[w] = (U32)uEnd;
172
- if (length < 4) {
173
- /* Use length in the loop bound so the compiler knows it is short. */
174
- for (u = 0; u < length; ++u)
175
- dt[uStart + u] = D;
176
- } else {
177
- /* Unroll the loop 4 times, we know it is a power of 2. */
178
- for (u = uStart; u < uEnd; u += 4) {
179
- dt[u + 0] = D;
180
- dt[u + 1] = D;
181
- dt[u + 2] = D;
182
- dt[u + 3] = D;
183
- } } } }
184
- return iSize;
185
- }
372
+ /* Compute symbols and rankStart given rankVal:
373
+ *
374
+ * rankVal already contains the number of values of each weight.
375
+ *
376
+ * symbols contains the symbols ordered by weight. First are the rankVal[0]
377
+ * weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on.
378
+ * symbols[0] is filled (but unused) to avoid a branch.
379
+ *
380
+ * rankStart contains the offset where each rank belongs in the DTable.
381
+ * rankStart[0] is not filled because there are no entries in the table for
382
+ * weight 0.
383
+ */
384
+ {
385
+ int n;
386
+ int nextRankStart = 0;
387
+ int const unroll = 4;
388
+ int const nLimit = (int)nbSymbols - unroll + 1;
389
+ for (n=0; n<(int)tableLog+1; n++) {
390
+ U32 const curr = nextRankStart;
391
+ nextRankStart += wksp->rankVal[n];
392
+ wksp->rankStart[n] = curr;
393
+ }
394
+ for (n=0; n < nLimit; n += unroll) {
395
+ int u;
396
+ for (u=0; u < unroll; ++u) {
397
+ size_t const w = wksp->huffWeight[n+u];
398
+ wksp->symbols[wksp->rankStart[w]++] = (BYTE)(n+u);
399
+ }
400
+ }
401
+ for (; n < (int)nbSymbols; ++n) {
402
+ size_t const w = wksp->huffWeight[n];
403
+ wksp->symbols[wksp->rankStart[w]++] = (BYTE)n;
404
+ }
405
+ }
186
406
 
187
- size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize)
188
- {
189
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
190
- return HUF_readDTableX1_wksp(DTable, src, srcSize,
191
- workSpace, sizeof(workSpace));
407
+ /* fill DTable
408
+ * We fill all entries of each weight in order.
409
+ * That way length is a constant for each iteration of the outer loop.
410
+ * We can switch based on the length to a different inner loop which is
411
+ * optimized for that particular case.
412
+ */
413
+ {
414
+ U32 w;
415
+ int symbol=wksp->rankVal[0];
416
+ int rankStart=0;
417
+ for (w=1; w<tableLog+1; ++w) {
418
+ int const symbolCount = wksp->rankVal[w];
419
+ int const length = (1 << w) >> 1;
420
+ int uStart = rankStart;
421
+ BYTE const nbBits = (BYTE)(tableLog + 1 - w);
422
+ int s;
423
+ int u;
424
+ switch (length) {
425
+ case 1:
426
+ for (s=0; s<symbolCount; ++s) {
427
+ HUF_DEltX1 D;
428
+ D.byte = wksp->symbols[symbol + s];
429
+ D.nbBits = nbBits;
430
+ dt[uStart] = D;
431
+ uStart += 1;
432
+ }
433
+ break;
434
+ case 2:
435
+ for (s=0; s<symbolCount; ++s) {
436
+ HUF_DEltX1 D;
437
+ D.byte = wksp->symbols[symbol + s];
438
+ D.nbBits = nbBits;
439
+ dt[uStart+0] = D;
440
+ dt[uStart+1] = D;
441
+ uStart += 2;
442
+ }
443
+ break;
444
+ case 4:
445
+ for (s=0; s<symbolCount; ++s) {
446
+ U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
447
+ MEM_write64(dt + uStart, D4);
448
+ uStart += 4;
449
+ }
450
+ break;
451
+ case 8:
452
+ for (s=0; s<symbolCount; ++s) {
453
+ U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
454
+ MEM_write64(dt + uStart, D4);
455
+ MEM_write64(dt + uStart + 4, D4);
456
+ uStart += 8;
457
+ }
458
+ break;
459
+ default:
460
+ for (s=0; s<symbolCount; ++s) {
461
+ U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
462
+ for (u=0; u < length; u += 16) {
463
+ MEM_write64(dt + uStart + u + 0, D4);
464
+ MEM_write64(dt + uStart + u + 4, D4);
465
+ MEM_write64(dt + uStart + u + 8, D4);
466
+ MEM_write64(dt + uStart + u + 12, D4);
467
+ }
468
+ assert(u == length);
469
+ uStart += length;
470
+ }
471
+ break;
472
+ }
473
+ symbol += symbolCount;
474
+ rankStart += symbolCount * length;
475
+ }
476
+ }
477
+ return iSize;
192
478
  }
193
479
 
194
480
  FORCE_INLINE_TEMPLATE BYTE
@@ -217,11 +503,15 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
217
503
  BYTE* const pStart = p;
218
504
 
219
505
  /* up to 4 symbols at a time */
220
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
221
- HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
222
- HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
223
- HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
224
- HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
506
+ if ((pEnd - p) > 3) {
507
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
508
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
509
+ HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
510
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
511
+ HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
512
+ }
513
+ } else {
514
+ BIT_reloadDStream(bitDPtr);
225
515
  }
226
516
 
227
517
  /* [0-3] symbols remaining */
@@ -301,33 +591,36 @@ HUF_decompress4X1_usingDTable_internal_body(
301
591
  U32 endSignal = 1;
302
592
 
303
593
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
594
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
304
595
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
305
596
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
306
597
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
307
598
  CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
308
599
 
309
600
  /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
310
- for ( ; (endSignal) & (op4 < olimit) ; ) {
311
- HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
312
- HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
313
- HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
314
- HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
315
- HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
316
- HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
317
- HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
318
- HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
319
- HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
320
- HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
321
- HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
322
- HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
323
- HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
324
- HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
325
- HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
326
- HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
327
- endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
328
- endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
329
- endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
330
- endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
601
+ if ((size_t)(oend - op4) >= sizeof(size_t)) {
602
+ for ( ; (endSignal) & (op4 < olimit) ; ) {
603
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
604
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
605
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
606
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
607
+ HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
608
+ HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
609
+ HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
610
+ HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
611
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
612
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
613
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
614
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
615
+ HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
616
+ HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
617
+ HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
618
+ HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
619
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
620
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
621
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
622
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
623
+ }
331
624
  }
332
625
 
333
626
  /* check corruption */
@@ -353,6 +646,79 @@ HUF_decompress4X1_usingDTable_internal_body(
353
646
  }
354
647
  }
355
648
 
649
+ #if HUF_NEED_BMI2_FUNCTION
650
+ static BMI2_TARGET_ATTRIBUTE
651
+ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
652
+ size_t cSrcSize, HUF_DTable const* DTable) {
653
+ return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
654
+ }
655
+ #endif
656
+
657
+ #if HUF_NEED_DEFAULT_FUNCTION
658
+ static
659
+ size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
660
+ size_t cSrcSize, HUF_DTable const* DTable) {
661
+ return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
662
+ }
663
+ #endif
664
+
665
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
666
+
667
+ HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
668
+
669
+ static HUF_ASM_X86_64_BMI2_ATTRS
670
+ size_t
671
+ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
672
+ void* dst, size_t dstSize,
673
+ const void* cSrc, size_t cSrcSize,
674
+ const HUF_DTable* DTable)
675
+ {
676
+ void const* dt = DTable + 1;
677
+ const BYTE* const iend = (const BYTE*)cSrc + 6;
678
+ BYTE* const oend = (BYTE*)dst + dstSize;
679
+ HUF_DecompressAsmArgs args;
680
+ {
681
+ size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
682
+ FORWARD_IF_ERROR(ret, "Failed to init asm args");
683
+ if (ret != 0)
684
+ return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
685
+ }
686
+
687
+ assert(args.ip[0] >= args.ilimit);
688
+ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
689
+
690
+ /* Our loop guarantees that ip[] >= ilimit and that we haven't
691
+ * overwritten any op[].
692
+ */
693
+ assert(args.ip[0] >= iend);
694
+ assert(args.ip[1] >= iend);
695
+ assert(args.ip[2] >= iend);
696
+ assert(args.ip[3] >= iend);
697
+ assert(args.op[3] <= oend);
698
+ (void)iend;
699
+
700
+ /* finish bit streams one by one. */
701
+ {
702
+ size_t const segmentSize = (dstSize+3) / 4;
703
+ BYTE* segmentEnd = (BYTE*)dst;
704
+ int i;
705
+ for (i = 0; i < 4; ++i) {
706
+ BIT_DStream_t bit;
707
+ if (segmentSize <= (size_t)(oend - segmentEnd))
708
+ segmentEnd += segmentSize;
709
+ else
710
+ segmentEnd = oend;
711
+ FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
712
+ /* Decompress and validate that we've produced exactly the expected length. */
713
+ args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG);
714
+ if (args.op[i] != segmentEnd) return ERROR(corruption_detected);
715
+ }
716
+ }
717
+
718
+ /* decoded size */
719
+ return dstSize;
720
+ }
721
+ #endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
356
722
 
357
723
  typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
358
724
  const void *cSrc,
@@ -360,8 +726,28 @@ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
360
726
  const HUF_DTable *DTable);
361
727
 
362
728
  HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
363
- HUF_DGEN(HUF_decompress4X1_usingDTable_internal)
364
729
 
730
+ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
731
+ size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
732
+ {
733
+ #if DYNAMIC_BMI2
734
+ if (bmi2) {
735
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
736
+ return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
737
+ # else
738
+ return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
739
+ # endif
740
+ }
741
+ #else
742
+ (void)bmi2;
743
+ #endif
744
+
745
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
746
+ return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
747
+ #else
748
+ return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
749
+ #endif
750
+ }
365
751
 
366
752
 
367
753
  size_t HUF_decompress1X1_usingDTable(
@@ -389,20 +775,6 @@ size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
389
775
  }
390
776
 
391
777
 
392
- size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
393
- const void* cSrc, size_t cSrcSize)
394
- {
395
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
396
- return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
397
- workSpace, sizeof(workSpace));
398
- }
399
-
400
- size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
401
- {
402
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
403
- return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
404
- }
405
-
406
778
  size_t HUF_decompress4X1_usingDTable(
407
779
  void* dst, size_t dstSize,
408
780
  const void* cSrc, size_t cSrcSize,
@@ -419,8 +791,7 @@ static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size
419
791
  {
420
792
  const BYTE* ip = (const BYTE*) cSrc;
421
793
 
422
- size_t const hSize = HUF_readDTableX1_wksp (dctx, cSrc, cSrcSize,
423
- workSpace, wkspSize);
794
+ size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
424
795
  if (HUF_isError(hSize)) return hSize;
425
796
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
426
797
  ip += hSize; cSrcSize -= hSize;
@@ -436,18 +807,6 @@ size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
436
807
  }
437
808
 
438
809
 
439
- size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
440
- {
441
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
442
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
443
- workSpace, sizeof(workSpace));
444
- }
445
- size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
446
- {
447
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
448
- return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
449
- }
450
-
451
810
  #endif /* HUF_FORCE_DECOMPRESS_X2 */
452
811
 
453
812
 
@@ -458,209 +817,329 @@ size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cS
458
817
  /* *************************/
459
818
 
460
819
  typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */
461
- typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
820
+ typedef struct { BYTE symbol; } sortedSymbol_t;
462
821
  typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
463
822
  typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
464
823
 
824
+ /**
825
+ * Constructs a HUF_DEltX2 in a U32.
826
+ */
827
+ static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level)
828
+ {
829
+ U32 seq;
830
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0);
831
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2);
832
+ DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3);
833
+ DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32));
834
+ if (MEM_isLittleEndian()) {
835
+ seq = level == 1 ? symbol : (baseSeq + (symbol << 8));
836
+ return seq + (nbBits << 16) + ((U32)level << 24);
837
+ } else {
838
+ seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol);
839
+ return (seq << 16) + (nbBits << 8) + (U32)level;
840
+ }
841
+ }
465
842
 
466
- /* HUF_fillDTableX2Level2() :
467
- * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
468
- static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed,
469
- const U32* rankValOrigin, const int minWeight,
470
- const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
471
- U32 nbBitsBaseline, U16 baseSeq)
843
+ /**
844
+ * Constructs a HUF_DEltX2.
845
+ */
846
+ static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level)
472
847
  {
473
848
  HUF_DEltX2 DElt;
474
- U32 rankVal[HUF_TABLELOG_MAX + 1];
849
+ U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
850
+ DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val));
851
+ ZSTD_memcpy(&DElt, &val, sizeof(val));
852
+ return DElt;
853
+ }
475
854
 
476
- /* get pre-calculated rankVal */
477
- memcpy(rankVal, rankValOrigin, sizeof(rankVal));
855
+ /**
856
+ * Constructs 2 HUF_DEltX2s and packs them into a U64.
857
+ */
858
+ static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level)
859
+ {
860
+ U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
861
+ return (U64)DElt + ((U64)DElt << 32);
862
+ }
478
863
 
479
- /* fill skipped values */
480
- if (minWeight>1) {
481
- U32 i, skipSize = rankVal[minWeight];
482
- MEM_writeLE16(&(DElt.sequence), baseSeq);
483
- DElt.nbBits = (BYTE)(consumed);
484
- DElt.length = 1;
485
- for (i = 0; i < skipSize; i++)
486
- DTable[i] = DElt;
864
+ /**
865
+ * Fills the DTable rank with all the symbols from [begin, end) that are each
866
+ * nbBits long.
867
+ *
868
+ * @param DTableRank The start of the rank in the DTable.
869
+ * @param begin The first symbol to fill (inclusive).
870
+ * @param end The last symbol to fill (exclusive).
871
+ * @param nbBits Each symbol is nbBits long.
872
+ * @param tableLog The table log.
873
+ * @param baseSeq If level == 1 { 0 } else { the first level symbol }
874
+ * @param level The level in the table. Must be 1 or 2.
875
+ */
876
+ static void HUF_fillDTableX2ForWeight(
877
+ HUF_DEltX2* DTableRank,
878
+ sortedSymbol_t const* begin, sortedSymbol_t const* end,
879
+ U32 nbBits, U32 tableLog,
880
+ U16 baseSeq, int const level)
881
+ {
882
+ U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */);
883
+ const sortedSymbol_t* ptr;
884
+ assert(level >= 1 && level <= 2);
885
+ switch (length) {
886
+ case 1:
887
+ for (ptr = begin; ptr != end; ++ptr) {
888
+ HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
889
+ *DTableRank++ = DElt;
890
+ }
891
+ break;
892
+ case 2:
893
+ for (ptr = begin; ptr != end; ++ptr) {
894
+ HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
895
+ DTableRank[0] = DElt;
896
+ DTableRank[1] = DElt;
897
+ DTableRank += 2;
898
+ }
899
+ break;
900
+ case 4:
901
+ for (ptr = begin; ptr != end; ++ptr) {
902
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
903
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
904
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
905
+ DTableRank += 4;
906
+ }
907
+ break;
908
+ case 8:
909
+ for (ptr = begin; ptr != end; ++ptr) {
910
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
911
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
912
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
913
+ ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
914
+ ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
915
+ DTableRank += 8;
916
+ }
917
+ break;
918
+ default:
919
+ for (ptr = begin; ptr != end; ++ptr) {
920
+ U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
921
+ HUF_DEltX2* const DTableRankEnd = DTableRank + length;
922
+ for (; DTableRank != DTableRankEnd; DTableRank += 8) {
923
+ ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
924
+ ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
925
+ ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
926
+ ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
927
+ }
928
+ }
929
+ break;
487
930
  }
931
+ }
488
932
 
489
- /* fill DTable */
490
- { U32 s; for (s=0; s<sortedListSize; s++) { /* note : sortedSymbols already skipped */
491
- const U32 symbol = sortedSymbols[s].symbol;
492
- const U32 weight = sortedSymbols[s].weight;
493
- const U32 nbBits = nbBitsBaseline - weight;
494
- const U32 length = 1 << (sizeLog-nbBits);
495
- const U32 start = rankVal[weight];
496
- U32 i = start;
497
- const U32 end = start + length;
498
-
499
- MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
500
- DElt.nbBits = (BYTE)(nbBits + consumed);
501
- DElt.length = 2;
502
- do { DTable[i++] = DElt; } while (i<end); /* since length >= 1 */
933
+ /* HUF_fillDTableX2Level2() :
934
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
935
+ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits,
936
+ const U32* rankVal, const int minWeight, const int maxWeight1,
937
+ const sortedSymbol_t* sortedSymbols, U32 const* rankStart,
938
+ U32 nbBitsBaseline, U16 baseSeq)
939
+ {
940
+ /* Fill skipped values (all positions up to rankVal[minWeight]).
941
+ * These are positions only get a single symbol because the combined weight
942
+ * is too large.
943
+ */
944
+ if (minWeight>1) {
945
+ U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */);
946
+ U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1);
947
+ int const skipSize = rankVal[minWeight];
948
+ assert(length > 1);
949
+ assert((U32)skipSize < length);
950
+ switch (length) {
951
+ case 2:
952
+ assert(skipSize == 1);
953
+ ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2));
954
+ break;
955
+ case 4:
956
+ assert(skipSize <= 4);
957
+ ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2));
958
+ ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2));
959
+ break;
960
+ default:
961
+ {
962
+ int i;
963
+ for (i = 0; i < skipSize; i += 8) {
964
+ ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2));
965
+ ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2));
966
+ ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2));
967
+ ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2));
968
+ }
969
+ }
970
+ }
971
+ }
503
972
 
504
- rankVal[weight] += length;
505
- } }
973
+ /* Fill each of the second level symbols by weight. */
974
+ {
975
+ int w;
976
+ for (w = minWeight; w < maxWeight1; ++w) {
977
+ int const begin = rankStart[w];
978
+ int const end = rankStart[w+1];
979
+ U32 const nbBits = nbBitsBaseline - w;
980
+ U32 const totalBits = nbBits + consumedBits;
981
+ HUF_fillDTableX2ForWeight(
982
+ DTable + rankVal[w],
983
+ sortedSymbols + begin, sortedSymbols + end,
984
+ totalBits, targetLog,
985
+ baseSeq, /* level */ 2);
986
+ }
987
+ }
506
988
  }
507
989
 
508
-
509
990
  static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
510
- const sortedSymbol_t* sortedList, const U32 sortedListSize,
991
+ const sortedSymbol_t* sortedList,
511
992
  const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
512
993
  const U32 nbBitsBaseline)
513
994
  {
514
- U32 rankVal[HUF_TABLELOG_MAX + 1];
995
+ U32* const rankVal = rankValOrigin[0];
515
996
  const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */
516
997
  const U32 minBits = nbBitsBaseline - maxWeight;
517
- U32 s;
518
-
519
- memcpy(rankVal, rankValOrigin, sizeof(rankVal));
520
-
521
- /* fill DTable */
522
- for (s=0; s<sortedListSize; s++) {
523
- const U16 symbol = sortedList[s].symbol;
524
- const U32 weight = sortedList[s].weight;
525
- const U32 nbBits = nbBitsBaseline - weight;
526
- const U32 start = rankVal[weight];
527
- const U32 length = 1 << (targetLog-nbBits);
528
-
529
- if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */
530
- U32 sortedRank;
998
+ int w;
999
+ int const wEnd = (int)maxWeight + 1;
1000
+
1001
+ /* Fill DTable in order of weight. */
1002
+ for (w = 1; w < wEnd; ++w) {
1003
+ int const begin = (int)rankStart[w];
1004
+ int const end = (int)rankStart[w+1];
1005
+ U32 const nbBits = nbBitsBaseline - w;
1006
+
1007
+ if (targetLog-nbBits >= minBits) {
1008
+ /* Enough room for a second symbol. */
1009
+ int start = rankVal[w];
1010
+ U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */);
531
1011
  int minWeight = nbBits + scaleLog;
1012
+ int s;
532
1013
  if (minWeight < 1) minWeight = 1;
533
- sortedRank = rankStart[minWeight];
534
- HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits,
535
- rankValOrigin[nbBits], minWeight,
536
- sortedList+sortedRank, sortedListSize-sortedRank,
537
- nbBitsBaseline, symbol);
1014
+ /* Fill the DTable for every symbol of weight w.
1015
+ * These symbols get at least 1 second symbol.
1016
+ */
1017
+ for (s = begin; s != end; ++s) {
1018
+ HUF_fillDTableX2Level2(
1019
+ DTable + start, targetLog, nbBits,
1020
+ rankValOrigin[nbBits], minWeight, wEnd,
1021
+ sortedList, rankStart,
1022
+ nbBitsBaseline, sortedList[s].symbol);
1023
+ start += length;
1024
+ }
538
1025
  } else {
539
- HUF_DEltX2 DElt;
540
- MEM_writeLE16(&(DElt.sequence), symbol);
541
- DElt.nbBits = (BYTE)(nbBits);
542
- DElt.length = 1;
543
- { U32 const end = start + length;
544
- U32 u;
545
- for (u = start; u < end; u++) DTable[u] = DElt;
546
- } }
547
- rankVal[weight] += length;
1026
+ /* Only a single symbol. */
1027
+ HUF_fillDTableX2ForWeight(
1028
+ DTable + rankVal[w],
1029
+ sortedList + begin, sortedList + end,
1030
+ nbBits, targetLog,
1031
+ /* baseSeq */ 0, /* level */ 1);
1032
+ }
548
1033
  }
549
1034
  }
550
1035
 
1036
+ typedef struct {
1037
+ rankValCol_t rankVal[HUF_TABLELOG_MAX];
1038
+ U32 rankStats[HUF_TABLELOG_MAX + 1];
1039
+ U32 rankStart0[HUF_TABLELOG_MAX + 3];
1040
+ sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
1041
+ BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
1042
+ U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
1043
+ } HUF_ReadDTableX2_Workspace;
1044
+
551
1045
  size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
552
1046
  const void* src, size_t srcSize,
553
1047
  void* workSpace, size_t wkspSize)
554
1048
  {
555
- U32 tableLog, maxW, sizeOfSort, nbSymbols;
1049
+ return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
1050
+ }
1051
+
1052
+ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
1053
+ const void* src, size_t srcSize,
1054
+ void* workSpace, size_t wkspSize, int bmi2)
1055
+ {
1056
+ U32 tableLog, maxW, nbSymbols;
556
1057
  DTableDesc dtd = HUF_getDTableDesc(DTable);
557
- U32 const maxTableLog = dtd.maxTableLog;
1058
+ U32 maxTableLog = dtd.maxTableLog;
558
1059
  size_t iSize;
559
1060
  void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */
560
1061
  HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
561
1062
  U32 *rankStart;
562
1063
 
563
- rankValCol_t* rankVal;
564
- U32* rankStats;
565
- U32* rankStart0;
566
- sortedSymbol_t* sortedSymbol;
567
- BYTE* weightList;
568
- size_t spaceUsed32 = 0;
569
-
570
- rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32);
571
- spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
572
- rankStats = (U32 *)workSpace + spaceUsed32;
573
- spaceUsed32 += HUF_TABLELOG_MAX + 1;
574
- rankStart0 = (U32 *)workSpace + spaceUsed32;
575
- spaceUsed32 += HUF_TABLELOG_MAX + 2;
576
- sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t);
577
- spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
578
- weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
579
- spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
580
-
581
- if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
582
-
583
- rankStart = rankStart0 + 1;
584
- memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
1064
+ HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace;
1065
+
1066
+ if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC);
1067
+
1068
+ rankStart = wksp->rankStart0 + 1;
1069
+ ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats));
1070
+ ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0));
585
1071
 
586
1072
  DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */
587
1073
  if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
588
- /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
1074
+ /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
589
1075
 
590
- iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
1076
+ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
591
1077
  if (HUF_isError(iSize)) return iSize;
592
1078
 
593
1079
  /* check result */
594
1080
  if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */
1081
+ if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG;
595
1082
 
596
1083
  /* find maxWeight */
597
- for (maxW = tableLog; rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
1084
+ for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
598
1085
 
599
1086
  /* Get start index of each weight */
600
1087
  { U32 w, nextRankStart = 0;
601
1088
  for (w=1; w<maxW+1; w++) {
602
- U32 current = nextRankStart;
603
- nextRankStart += rankStats[w];
604
- rankStart[w] = current;
1089
+ U32 curr = nextRankStart;
1090
+ nextRankStart += wksp->rankStats[w];
1091
+ rankStart[w] = curr;
605
1092
  }
606
1093
  rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/
607
- sizeOfSort = nextRankStart;
1094
+ rankStart[maxW+1] = nextRankStart;
608
1095
  }
609
1096
 
610
1097
  /* sort symbols by weight */
611
1098
  { U32 s;
612
1099
  for (s=0; s<nbSymbols; s++) {
613
- U32 const w = weightList[s];
1100
+ U32 const w = wksp->weightList[s];
614
1101
  U32 const r = rankStart[w]++;
615
- sortedSymbol[r].symbol = (BYTE)s;
616
- sortedSymbol[r].weight = (BYTE)w;
1102
+ wksp->sortedSymbol[r].symbol = (BYTE)s;
617
1103
  }
618
1104
  rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */
619
1105
  }
620
1106
 
621
1107
  /* Build rankVal */
622
- { U32* const rankVal0 = rankVal[0];
1108
+ { U32* const rankVal0 = wksp->rankVal[0];
623
1109
  { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */
624
1110
  U32 nextRankVal = 0;
625
1111
  U32 w;
626
1112
  for (w=1; w<maxW+1; w++) {
627
- U32 current = nextRankVal;
628
- nextRankVal += rankStats[w] << (w+rescale);
629
- rankVal0[w] = current;
1113
+ U32 curr = nextRankVal;
1114
+ nextRankVal += wksp->rankStats[w] << (w+rescale);
1115
+ rankVal0[w] = curr;
630
1116
  } }
631
1117
  { U32 const minBits = tableLog+1 - maxW;
632
1118
  U32 consumed;
633
1119
  for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
634
- U32* const rankValPtr = rankVal[consumed];
1120
+ U32* const rankValPtr = wksp->rankVal[consumed];
635
1121
  U32 w;
636
1122
  for (w = 1; w < maxW+1; w++) {
637
1123
  rankValPtr[w] = rankVal0[w] >> consumed;
638
1124
  } } } }
639
1125
 
640
1126
  HUF_fillDTableX2(dt, maxTableLog,
641
- sortedSymbol, sizeOfSort,
642
- rankStart0, rankVal, maxW,
1127
+ wksp->sortedSymbol,
1128
+ wksp->rankStart0, wksp->rankVal, maxW,
643
1129
  tableLog+1);
644
1130
 
645
1131
  dtd.tableLog = (BYTE)maxTableLog;
646
1132
  dtd.tableType = 1;
647
- memcpy(DTable, &dtd, sizeof(dtd));
1133
+ ZSTD_memcpy(DTable, &dtd, sizeof(dtd));
648
1134
  return iSize;
649
1135
  }
650
1136
 
651
- size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
652
- {
653
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
654
- return HUF_readDTableX2_wksp(DTable, src, srcSize,
655
- workSpace, sizeof(workSpace));
656
- }
657
-
658
1137
 
659
1138
  FORCE_INLINE_TEMPLATE U32
660
1139
  HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
661
1140
  {
662
1141
  size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
663
- memcpy(op, dt+val, 2);
1142
+ ZSTD_memcpy(op, &dt[val].sequence, 2);
664
1143
  BIT_skipBits(DStream, dt[val].nbBits);
665
1144
  return dt[val].length;
666
1145
  }
@@ -669,15 +1148,17 @@ FORCE_INLINE_TEMPLATE U32
669
1148
  HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
670
1149
  {
671
1150
  size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
672
- memcpy(op, dt+val, 1);
673
- if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
674
- else {
1151
+ ZSTD_memcpy(op, &dt[val].sequence, 1);
1152
+ if (dt[val].length==1) {
1153
+ BIT_skipBits(DStream, dt[val].nbBits);
1154
+ } else {
675
1155
  if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
676
1156
  BIT_skipBits(DStream, dt[val].nbBits);
677
1157
  if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
678
1158
  /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
679
1159
  DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
680
- } }
1160
+ }
1161
+ }
681
1162
  return 1;
682
1163
  }
683
1164
 
@@ -699,19 +1180,37 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
699
1180
  BYTE* const pStart = p;
700
1181
 
701
1182
  /* up to 8 symbols at a time */
702
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
703
- HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
704
- HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
705
- HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
706
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1183
+ if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) {
1184
+ if (dtLog <= 11 && MEM_64bits()) {
1185
+ /* up to 10 symbols at a time */
1186
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) {
1187
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1188
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1189
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1190
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1191
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1192
+ }
1193
+ } else {
1194
+ /* up to 8 symbols at a time */
1195
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
1196
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
1197
+ HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
1198
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
1199
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1200
+ }
1201
+ }
1202
+ } else {
1203
+ BIT_reloadDStream(bitDPtr);
707
1204
  }
708
1205
 
709
1206
  /* closer to end : up to 2 symbols at a time */
710
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
711
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
1207
+ if ((size_t)(pEnd - p) >= 2) {
1208
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
1209
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
712
1210
 
713
- while (p <= pEnd-2)
714
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
1211
+ while (p <= pEnd-2)
1212
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
1213
+ }
715
1214
 
716
1215
  if (p < pEnd)
717
1216
  p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
@@ -745,7 +1244,6 @@ HUF_decompress1X2_usingDTable_internal_body(
745
1244
  /* decoded size */
746
1245
  return dstSize;
747
1246
  }
748
-
749
1247
  FORCE_INLINE_TEMPLATE size_t
750
1248
  HUF_decompress4X2_usingDTable_internal_body(
751
1249
  void* dst, size_t dstSize,
@@ -787,57 +1285,60 @@ HUF_decompress4X2_usingDTable_internal_body(
787
1285
  U32 const dtLog = dtd.tableLog;
788
1286
 
789
1287
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1288
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
790
1289
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
791
1290
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
792
1291
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
793
1292
  CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
794
1293
 
795
1294
  /* 16-32 symbols per loop (4-8 symbols per stream) */
796
- for ( ; (endSignal) & (op4 < olimit); ) {
1295
+ if ((size_t)(oend - op4) >= sizeof(size_t)) {
1296
+ for ( ; (endSignal) & (op4 < olimit); ) {
797
1297
  #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
798
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
799
- HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
800
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
801
- HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
802
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
803
- HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
804
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
805
- HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
806
- endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
807
- endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
808
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
809
- HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
810
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
811
- HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
812
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
813
- HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
814
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
815
- HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
816
- endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
817
- endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
1298
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1299
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
1300
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1301
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
1302
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1303
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
1304
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1305
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
1306
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
1307
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
1308
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1309
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
1310
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1311
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
1312
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1313
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
1314
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1315
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
1316
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
1317
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
818
1318
  #else
819
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
820
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
821
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
822
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
823
- HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
824
- HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
825
- HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
826
- HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
827
- HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
828
- HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
829
- HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
830
- HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
831
- HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
832
- HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
833
- HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
834
- HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
835
- endSignal = (U32)LIKELY(
836
- (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
837
- & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
838
- & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
839
- & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
1319
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1320
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1321
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1322
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1323
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
1324
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
1325
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
1326
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
1327
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
1328
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
1329
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
1330
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
1331
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
1332
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
1333
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
1334
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
1335
+ endSignal = (U32)LIKELY((U32)
1336
+ (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
1337
+ & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
1338
+ & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
1339
+ & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
840
1340
  #endif
1341
+ }
841
1342
  }
842
1343
 
843
1344
  /* check corruption */
@@ -861,8 +1362,99 @@ HUF_decompress4X2_usingDTable_internal_body(
861
1362
  }
862
1363
  }
863
1364
 
1365
+ #if HUF_NEED_BMI2_FUNCTION
1366
+ static BMI2_TARGET_ATTRIBUTE
1367
+ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
1368
+ size_t cSrcSize, HUF_DTable const* DTable) {
1369
+ return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
1370
+ }
1371
+ #endif
1372
+
1373
+ #if HUF_NEED_DEFAULT_FUNCTION
1374
+ static
1375
+ size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
1376
+ size_t cSrcSize, HUF_DTable const* DTable) {
1377
+ return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
1378
+ }
1379
+ #endif
1380
+
1381
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
1382
+
1383
+ HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
1384
+
1385
+ static HUF_ASM_X86_64_BMI2_ATTRS size_t
1386
+ HUF_decompress4X2_usingDTable_internal_bmi2_asm(
1387
+ void* dst, size_t dstSize,
1388
+ const void* cSrc, size_t cSrcSize,
1389
+ const HUF_DTable* DTable) {
1390
+ void const* dt = DTable + 1;
1391
+ const BYTE* const iend = (const BYTE*)cSrc + 6;
1392
+ BYTE* const oend = (BYTE*)dst + dstSize;
1393
+ HUF_DecompressAsmArgs args;
1394
+ {
1395
+ size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
1396
+ FORWARD_IF_ERROR(ret, "Failed to init asm args");
1397
+ if (ret != 0)
1398
+ return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
1399
+ }
1400
+
1401
+ assert(args.ip[0] >= args.ilimit);
1402
+ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args);
1403
+
1404
+ /* note : op4 already verified within main loop */
1405
+ assert(args.ip[0] >= iend);
1406
+ assert(args.ip[1] >= iend);
1407
+ assert(args.ip[2] >= iend);
1408
+ assert(args.ip[3] >= iend);
1409
+ assert(args.op[3] <= oend);
1410
+ (void)iend;
1411
+
1412
+ /* finish bitStreams one by one */
1413
+ {
1414
+ size_t const segmentSize = (dstSize+3) / 4;
1415
+ BYTE* segmentEnd = (BYTE*)dst;
1416
+ int i;
1417
+ for (i = 0; i < 4; ++i) {
1418
+ BIT_DStream_t bit;
1419
+ if (segmentSize <= (size_t)(oend - segmentEnd))
1420
+ segmentEnd += segmentSize;
1421
+ else
1422
+ segmentEnd = oend;
1423
+ FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
1424
+ args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG);
1425
+ if (args.op[i] != segmentEnd)
1426
+ return ERROR(corruption_detected);
1427
+ }
1428
+ }
1429
+
1430
+ /* decoded size */
1431
+ return dstSize;
1432
+ }
1433
+ #endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
1434
+
1435
+ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
1436
+ size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
1437
+ {
1438
+ #if DYNAMIC_BMI2
1439
+ if (bmi2) {
1440
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
1441
+ return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
1442
+ # else
1443
+ return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
1444
+ # endif
1445
+ }
1446
+ #else
1447
+ (void)bmi2;
1448
+ #endif
1449
+
1450
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
1451
+ return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
1452
+ #else
1453
+ return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
1454
+ #endif
1455
+ }
1456
+
864
1457
  HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
865
- HUF_DGEN(HUF_decompress4X2_usingDTable_internal)
866
1458
 
867
1459
  size_t HUF_decompress1X2_usingDTable(
868
1460
  void* dst, size_t dstSize,
@@ -890,20 +1482,6 @@ size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
890
1482
  }
891
1483
 
892
1484
 
893
- size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
894
- const void* cSrc, size_t cSrcSize)
895
- {
896
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
897
- return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
898
- workSpace, sizeof(workSpace));
899
- }
900
-
901
- size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
902
- {
903
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
904
- return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
905
- }
906
-
907
1485
  size_t HUF_decompress4X2_usingDTable(
908
1486
  void* dst, size_t dstSize,
909
1487
  const void* cSrc, size_t cSrcSize,
@@ -937,20 +1515,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
937
1515
  }
938
1516
 
939
1517
 
940
- size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
941
- const void* cSrc, size_t cSrcSize)
942
- {
943
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
944
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
945
- workSpace, sizeof(workSpace));
946
- }
947
-
948
- size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
949
- {
950
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
951
- return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
952
- }
953
-
954
1518
  #endif /* HUF_FORCE_DECOMPRESS_X1 */
955
1519
 
956
1520
 
@@ -999,25 +1563,25 @@ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
999
1563
 
1000
1564
  #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1001
1565
  typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
1002
- static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
1566
+ static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] =
1003
1567
  {
1004
1568
  /* single, double, quad */
1005
- {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */
1006
- {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */
1007
- {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */
1008
- {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */
1009
- {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */
1010
- {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */
1011
- {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */
1012
- {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */
1013
- {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */
1014
- {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */
1015
- {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */
1016
- {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */
1017
- {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */
1018
- {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */
1019
- {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */
1020
- {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */
1569
+ {{0,0}, {1,1}}, /* Q==0 : impossible */
1570
+ {{0,0}, {1,1}}, /* Q==1 : impossible */
1571
+ {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */
1572
+ {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */
1573
+ {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */
1574
+ {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */
1575
+ {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */
1576
+ {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */
1577
+ {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */
1578
+ {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */
1579
+ {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */
1580
+ {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */
1581
+ {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */
1582
+ {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */
1583
+ {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */
1584
+ {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */
1021
1585
  };
1022
1586
  #endif
1023
1587
 
@@ -1044,74 +1608,13 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
1044
1608
  U32 const D256 = (U32)(dstSize >> 8);
1045
1609
  U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
1046
1610
  U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
1047
- DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */
1611
+ DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */
1048
1612
  return DTime1 < DTime0;
1049
1613
  }
1050
1614
  #endif
1051
1615
  }
1052
1616
 
1053
1617
 
1054
- typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
1055
-
1056
- size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1057
- {
1058
- #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1059
- static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 };
1060
- #endif
1061
-
1062
- /* validation checks */
1063
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1064
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1065
- if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1066
- if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1067
-
1068
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1069
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1070
- (void)algoNb;
1071
- assert(algoNb == 0);
1072
- return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize);
1073
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1074
- (void)algoNb;
1075
- assert(algoNb == 1);
1076
- return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);
1077
- #else
1078
- return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
1079
- #endif
1080
- }
1081
- }
1082
-
1083
- size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1084
- {
1085
- /* validation checks */
1086
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1087
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1088
- if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1089
- if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1090
-
1091
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1092
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1093
- (void)algoNb;
1094
- assert(algoNb == 0);
1095
- return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1096
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1097
- (void)algoNb;
1098
- assert(algoNb == 1);
1099
- return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1100
- #else
1101
- return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
1102
- HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
1103
- #endif
1104
- }
1105
- }
1106
-
1107
- size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1108
- {
1109
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1110
- return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1111
- workSpace, sizeof(workSpace));
1112
- }
1113
-
1114
-
1115
1618
  size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
1116
1619
  size_t dstSize, const void* cSrc,
1117
1620
  size_t cSrcSize, void* workSpace,
@@ -1145,8 +1648,8 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1145
1648
  /* validation checks */
1146
1649
  if (dstSize == 0) return ERROR(dstSize_tooSmall);
1147
1650
  if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1148
- if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1149
- if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1651
+ if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1652
+ if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1150
1653
 
1151
1654
  { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1152
1655
  #if defined(HUF_FORCE_DECOMPRESS_X1)
@@ -1168,14 +1671,6 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1168
1671
  }
1169
1672
  }
1170
1673
 
1171
- size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
1172
- const void* cSrc, size_t cSrcSize)
1173
- {
1174
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1175
- return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1176
- workSpace, sizeof(workSpace));
1177
- }
1178
-
1179
1674
 
1180
1675
  size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1181
1676
  {
@@ -1199,7 +1694,7 @@ size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstS
1199
1694
  {
1200
1695
  const BYTE* ip = (const BYTE*) cSrc;
1201
1696
 
1202
- size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
1697
+ size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1203
1698
  if (HUF_isError(hSize)) return hSize;
1204
1699
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1205
1700
  ip += hSize; cSrcSize -= hSize;
@@ -1246,3 +1741,149 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
1246
1741
  #endif
1247
1742
  }
1248
1743
  }
1744
+
1745
+ #ifndef ZSTD_NO_UNUSED_FUNCTIONS
1746
+ #ifndef HUF_FORCE_DECOMPRESS_X2
1747
+ size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize)
1748
+ {
1749
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1750
+ return HUF_readDTableX1_wksp(DTable, src, srcSize,
1751
+ workSpace, sizeof(workSpace));
1752
+ }
1753
+
1754
+ size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
1755
+ const void* cSrc, size_t cSrcSize)
1756
+ {
1757
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1758
+ return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
1759
+ workSpace, sizeof(workSpace));
1760
+ }
1761
+
1762
+ size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1763
+ {
1764
+ HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
1765
+ return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
1766
+ }
1767
+ #endif
1768
+
1769
+ #ifndef HUF_FORCE_DECOMPRESS_X1
1770
+ size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
1771
+ {
1772
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1773
+ return HUF_readDTableX2_wksp(DTable, src, srcSize,
1774
+ workSpace, sizeof(workSpace));
1775
+ }
1776
+
1777
+ size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
1778
+ const void* cSrc, size_t cSrcSize)
1779
+ {
1780
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1781
+ return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
1782
+ workSpace, sizeof(workSpace));
1783
+ }
1784
+
1785
+ size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1786
+ {
1787
+ HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
1788
+ return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1789
+ }
1790
+ #endif
1791
+
1792
+ #ifndef HUF_FORCE_DECOMPRESS_X2
1793
+ size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1794
+ {
1795
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1796
+ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1797
+ workSpace, sizeof(workSpace));
1798
+ }
1799
+ size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1800
+ {
1801
+ HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
1802
+ return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1803
+ }
1804
+ #endif
1805
+
1806
+ #ifndef HUF_FORCE_DECOMPRESS_X1
1807
+ size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
1808
+ const void* cSrc, size_t cSrcSize)
1809
+ {
1810
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1811
+ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1812
+ workSpace, sizeof(workSpace));
1813
+ }
1814
+
1815
+ size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1816
+ {
1817
+ HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
1818
+ return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1819
+ }
1820
+ #endif
1821
+
1822
+ typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
1823
+
1824
+ size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1825
+ {
1826
+ #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1827
+ static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 };
1828
+ #endif
1829
+
1830
+ /* validation checks */
1831
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
1832
+ if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1833
+ if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1834
+ if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1835
+
1836
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1837
+ #if defined(HUF_FORCE_DECOMPRESS_X1)
1838
+ (void)algoNb;
1839
+ assert(algoNb == 0);
1840
+ return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize);
1841
+ #elif defined(HUF_FORCE_DECOMPRESS_X2)
1842
+ (void)algoNb;
1843
+ assert(algoNb == 1);
1844
+ return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);
1845
+ #else
1846
+ return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
1847
+ #endif
1848
+ }
1849
+ }
1850
+
1851
+ size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1852
+ {
1853
+ /* validation checks */
1854
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
1855
+ if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1856
+ if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1857
+ if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1858
+
1859
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1860
+ #if defined(HUF_FORCE_DECOMPRESS_X1)
1861
+ (void)algoNb;
1862
+ assert(algoNb == 0);
1863
+ return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1864
+ #elif defined(HUF_FORCE_DECOMPRESS_X2)
1865
+ (void)algoNb;
1866
+ assert(algoNb == 1);
1867
+ return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1868
+ #else
1869
+ return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
1870
+ HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
1871
+ #endif
1872
+ }
1873
+ }
1874
+
1875
+ size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1876
+ {
1877
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1878
+ return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1879
+ workSpace, sizeof(workSpace));
1880
+ }
1881
+
1882
+ size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
1883
+ const void* cSrc, size_t cSrcSize)
1884
+ {
1885
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1886
+ return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1887
+ workSpace, sizeof(workSpace));
1888
+ }
1889
+ #endif