multi_compress 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +17 -3
  3. data/GET_STARTED.md +3 -3
  4. data/README.md +75 -66
  5. data/THIRD_PARTY_NOTICES.md +24 -0
  6. data/ext/multi_compress/brotli_dec_static_init.c +3 -0
  7. data/ext/multi_compress/brotli_enc_static_init.c +3 -0
  8. data/ext/multi_compress/extconf.rb +79 -3
  9. data/ext/multi_compress/multi_compress.c +199 -120
  10. data/ext/multi_compress/vendor/.vendored +2 -2
  11. data/ext/multi_compress/vendor/brotli/LICENSE +19 -0
  12. data/ext/multi_compress/vendor/brotli/c/common/constants.c +7 -7
  13. data/ext/multi_compress/vendor/brotli/c/common/constants.h +2 -5
  14. data/ext/multi_compress/vendor/brotli/c/common/context.c +2 -2
  15. data/ext/multi_compress/vendor/brotli/c/common/context.h +1 -2
  16. data/ext/multi_compress/vendor/brotli/c/common/dictionary.c +4 -5856
  17. data/ext/multi_compress/vendor/brotli/c/common/dictionary.h +1 -2
  18. data/ext/multi_compress/vendor/brotli/c/common/dictionary_inc.h +5847 -0
  19. data/ext/multi_compress/vendor/brotli/c/common/platform.c +0 -4
  20. data/ext/multi_compress/vendor/brotli/c/common/platform.h +182 -43
  21. data/ext/multi_compress/vendor/brotli/c/common/shared_dictionary.c +3 -7
  22. data/ext/multi_compress/vendor/brotli/c/common/shared_dictionary_internal.h +1 -1
  23. data/ext/multi_compress/vendor/brotli/c/common/static_init.h +56 -0
  24. data/ext/multi_compress/vendor/brotli/c/common/transform.c +6 -4
  25. data/ext/multi_compress/vendor/brotli/c/common/transform.h +1 -2
  26. data/ext/multi_compress/vendor/brotli/c/common/version.h +3 -3
  27. data/ext/multi_compress/vendor/brotli/c/dec/bit_reader.c +2 -3
  28. data/ext/multi_compress/vendor/brotli/c/dec/bit_reader.h +0 -4
  29. data/ext/multi_compress/vendor/brotli/c/dec/decode.c +128 -39
  30. data/ext/multi_compress/vendor/brotli/c/dec/huffman.c +2 -5
  31. data/ext/multi_compress/vendor/brotli/c/dec/huffman.h +0 -2
  32. data/ext/multi_compress/vendor/brotli/c/dec/prefix.c +67 -0
  33. data/ext/multi_compress/vendor/brotli/c/dec/prefix.h +18 -708
  34. data/ext/multi_compress/vendor/brotli/c/dec/prefix_inc.h +707 -0
  35. data/ext/multi_compress/vendor/brotli/c/dec/state.c +18 -15
  36. data/ext/multi_compress/vendor/brotli/c/dec/state.h +2 -6
  37. data/ext/multi_compress/vendor/brotli/c/dec/static_init.c +53 -0
  38. data/ext/multi_compress/vendor/brotli/c/dec/static_init.h +30 -0
  39. data/ext/multi_compress/vendor/brotli/c/enc/backward_references.c +32 -8
  40. data/ext/multi_compress/vendor/brotli/c/enc/backward_references.h +1 -5
  41. data/ext/multi_compress/vendor/brotli/c/enc/backward_references_hq.c +15 -15
  42. data/ext/multi_compress/vendor/brotli/c/enc/backward_references_hq.h +1 -5
  43. data/ext/multi_compress/vendor/brotli/c/enc/bit_cost.c +28 -4
  44. data/ext/multi_compress/vendor/brotli/c/enc/bit_cost.h +8 -40
  45. data/ext/multi_compress/vendor/brotli/c/enc/bit_cost_inc.h +1 -1
  46. data/ext/multi_compress/vendor/brotli/c/enc/block_splitter.c +9 -12
  47. data/ext/multi_compress/vendor/brotli/c/enc/block_splitter.h +0 -3
  48. data/ext/multi_compress/vendor/brotli/c/enc/block_splitter_inc.h +14 -8
  49. data/ext/multi_compress/vendor/brotli/c/enc/brotli_bit_stream.c +10 -9
  50. data/ext/multi_compress/vendor/brotli/c/enc/brotli_bit_stream.h +0 -6
  51. data/ext/multi_compress/vendor/brotli/c/enc/cluster.c +0 -2
  52. data/ext/multi_compress/vendor/brotli/c/enc/cluster.h +0 -2
  53. data/ext/multi_compress/vendor/brotli/c/enc/command.c +1 -1
  54. data/ext/multi_compress/vendor/brotli/c/enc/command.h +8 -10
  55. data/ext/multi_compress/vendor/brotli/c/enc/compound_dictionary.c +3 -5
  56. data/ext/multi_compress/vendor/brotli/c/enc/compound_dictionary.h +1 -4
  57. data/ext/multi_compress/vendor/brotli/c/enc/compress_fragment.c +3 -13
  58. data/ext/multi_compress/vendor/brotli/c/enc/compress_fragment.h +0 -2
  59. data/ext/multi_compress/vendor/brotli/c/enc/compress_fragment_two_pass.c +5 -15
  60. data/ext/multi_compress/vendor/brotli/c/enc/compress_fragment_two_pass.h +0 -2
  61. data/ext/multi_compress/vendor/brotli/c/enc/dictionary_hash.c +127 -1830
  62. data/ext/multi_compress/vendor/brotli/c/enc/dictionary_hash.h +23 -3
  63. data/ext/multi_compress/vendor/brotli/c/enc/dictionary_hash_inc.h +1829 -0
  64. data/ext/multi_compress/vendor/brotli/c/enc/encode.c +77 -52
  65. data/ext/multi_compress/vendor/brotli/c/enc/encoder_dict.c +9 -7
  66. data/ext/multi_compress/vendor/brotli/c/enc/encoder_dict.h +2 -4
  67. data/ext/multi_compress/vendor/brotli/c/enc/entropy_encode.c +3 -6
  68. data/ext/multi_compress/vendor/brotli/c/enc/entropy_encode.h +2 -4
  69. data/ext/multi_compress/vendor/brotli/c/enc/entropy_encode_static.h +18 -12
  70. data/ext/multi_compress/vendor/brotli/c/enc/fast_log.c +1 -1
  71. data/ext/multi_compress/vendor/brotli/c/enc/fast_log.h +2 -3
  72. data/ext/multi_compress/vendor/brotli/c/enc/find_match_length.h +0 -2
  73. data/ext/multi_compress/vendor/brotli/c/enc/hash.h +38 -31
  74. data/ext/multi_compress/vendor/brotli/c/enc/hash_base.h +38 -0
  75. data/ext/multi_compress/vendor/brotli/c/enc/hash_forgetful_chain_inc.h +11 -1
  76. data/ext/multi_compress/vendor/brotli/c/enc/hash_longest_match64_inc.h +24 -7
  77. data/ext/multi_compress/vendor/brotli/c/enc/hash_longest_match64_simd_inc.h +304 -0
  78. data/ext/multi_compress/vendor/brotli/c/enc/hash_longest_match_inc.h +30 -11
  79. data/ext/multi_compress/vendor/brotli/c/enc/hash_longest_match_quickly_inc.h +4 -0
  80. data/ext/multi_compress/vendor/brotli/c/enc/hash_longest_match_simd_inc.h +278 -0
  81. data/ext/multi_compress/vendor/brotli/c/enc/histogram.c +1 -0
  82. data/ext/multi_compress/vendor/brotli/c/enc/histogram.h +0 -4
  83. data/ext/multi_compress/vendor/brotli/c/enc/literal_cost.c +4 -6
  84. data/ext/multi_compress/vendor/brotli/c/enc/literal_cost.h +0 -2
  85. data/ext/multi_compress/vendor/brotli/c/enc/matching_tag_mask.h +69 -0
  86. data/ext/multi_compress/vendor/brotli/c/enc/memory.c +0 -5
  87. data/ext/multi_compress/vendor/brotli/c/enc/memory.h +0 -4
  88. data/ext/multi_compress/vendor/brotli/c/enc/metablock.c +7 -9
  89. data/ext/multi_compress/vendor/brotli/c/enc/metablock.h +3 -3
  90. data/ext/multi_compress/vendor/brotli/c/enc/metablock_inc.h +4 -4
  91. data/ext/multi_compress/vendor/brotli/c/enc/params.h +0 -1
  92. data/ext/multi_compress/vendor/brotli/c/enc/prefix.h +0 -2
  93. data/ext/multi_compress/vendor/brotli/c/enc/quality.h +17 -10
  94. data/ext/multi_compress/vendor/brotli/c/enc/ringbuffer.h +1 -4
  95. data/ext/multi_compress/vendor/brotli/c/enc/state.h +2 -2
  96. data/ext/multi_compress/vendor/brotli/c/enc/static_dict.c +5 -11
  97. data/ext/multi_compress/vendor/brotli/c/enc/static_dict.h +1 -3
  98. data/ext/multi_compress/vendor/brotli/c/enc/static_dict_lut.c +224 -0
  99. data/ext/multi_compress/vendor/brotli/c/enc/static_dict_lut.h +20 -5837
  100. data/ext/multi_compress/vendor/brotli/c/enc/static_dict_lut_inc.h +5830 -0
  101. data/ext/multi_compress/vendor/brotli/c/enc/static_init.c +59 -0
  102. data/ext/multi_compress/vendor/brotli/c/enc/static_init.h +30 -0
  103. data/ext/multi_compress/vendor/brotli/c/enc/static_init_lazy.cc +26 -0
  104. data/ext/multi_compress/vendor/brotli/c/enc/utf8_util.c +1 -1
  105. data/ext/multi_compress/vendor/brotli/c/enc/utf8_util.h +0 -2
  106. data/ext/multi_compress/vendor/brotli/c/enc/write_bits.h +0 -2
  107. data/ext/multi_compress/vendor/brotli/c/include/brotli/decode.h +1 -1
  108. data/ext/multi_compress/vendor/brotli/c/include/brotli/encode.h +5 -1
  109. data/ext/multi_compress/vendor/brotli/c/include/brotli/port.h +4 -7
  110. data/ext/multi_compress/vendor/brotli/c/include/brotli/types.h +2 -2
  111. data/ext/multi_compress/vendor/lz4/LICENSE +12 -0
  112. data/ext/multi_compress/vendor/zstd/COPYING +339 -0
  113. data/ext/multi_compress/vendor/zstd/LICENSE +30 -0
  114. data/ext/multi_compress/vendor/zstd/lib/Makefile +67 -35
  115. data/ext/multi_compress/vendor/zstd/lib/README.md +33 -2
  116. data/ext/multi_compress/vendor/zstd/lib/common/allocations.h +55 -0
  117. data/ext/multi_compress/vendor/zstd/lib/common/bits.h +205 -0
  118. data/ext/multi_compress/vendor/zstd/lib/common/bitstream.h +84 -108
  119. data/ext/multi_compress/vendor/zstd/lib/common/compiler.h +170 -41
  120. data/ext/multi_compress/vendor/zstd/lib/common/cpu.h +37 -1
  121. data/ext/multi_compress/vendor/zstd/lib/common/debug.c +7 -1
  122. data/ext/multi_compress/vendor/zstd/lib/common/debug.h +21 -21
  123. data/ext/multi_compress/vendor/zstd/lib/common/entropy_common.c +12 -40
  124. data/ext/multi_compress/vendor/zstd/lib/common/error_private.c +10 -2
  125. data/ext/multi_compress/vendor/zstd/lib/common/error_private.h +46 -47
  126. data/ext/multi_compress/vendor/zstd/lib/common/fse.h +8 -100
  127. data/ext/multi_compress/vendor/zstd/lib/common/fse_decompress.c +28 -116
  128. data/ext/multi_compress/vendor/zstd/lib/common/huf.h +79 -166
  129. data/ext/multi_compress/vendor/zstd/lib/common/mem.h +46 -66
  130. data/ext/multi_compress/vendor/zstd/lib/common/pool.c +27 -11
  131. data/ext/multi_compress/vendor/zstd/lib/common/pool.h +8 -11
  132. data/ext/multi_compress/vendor/zstd/lib/common/portability_macros.h +45 -11
  133. data/ext/multi_compress/vendor/zstd/lib/common/threading.c +74 -14
  134. data/ext/multi_compress/vendor/zstd/lib/common/threading.h +5 -18
  135. data/ext/multi_compress/vendor/zstd/lib/common/xxhash.c +5 -11
  136. data/ext/multi_compress/vendor/zstd/lib/common/xxhash.h +2411 -1003
  137. data/ext/multi_compress/vendor/zstd/lib/common/zstd_common.c +1 -36
  138. data/ext/multi_compress/vendor/zstd/lib/common/zstd_deps.h +13 -1
  139. data/ext/multi_compress/vendor/zstd/lib/common/zstd_internal.h +13 -182
  140. data/ext/multi_compress/vendor/zstd/lib/common/zstd_trace.h +6 -13
  141. data/ext/multi_compress/vendor/zstd/lib/compress/clevels.h +1 -1
  142. data/ext/multi_compress/vendor/zstd/lib/compress/fse_compress.c +15 -131
  143. data/ext/multi_compress/vendor/zstd/lib/compress/hist.c +11 -1
  144. data/ext/multi_compress/vendor/zstd/lib/compress/hist.h +8 -1
  145. data/ext/multi_compress/vendor/zstd/lib/compress/huf_compress.c +283 -189
  146. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_compress.c +2419 -903
  147. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_compress_internal.h +423 -245
  148. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_compress_literals.c +116 -40
  149. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_compress_literals.h +16 -8
  150. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_compress_sequences.c +10 -10
  151. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_compress_sequences.h +8 -7
  152. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_compress_superblock.c +254 -139
  153. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_compress_superblock.h +1 -1
  154. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_cwksp.h +184 -95
  155. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_double_fast.c +163 -81
  156. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_double_fast.h +18 -14
  157. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_fast.c +507 -197
  158. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_fast.h +7 -14
  159. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_lazy.c +579 -484
  160. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_lazy.h +133 -65
  161. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_ldm.c +61 -40
  162. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_ldm.h +7 -15
  163. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_ldm_geartab.h +1 -1
  164. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_opt.c +352 -218
  165. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_opt.h +37 -21
  166. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_preSplit.c +238 -0
  167. data/ext/multi_compress/vendor/zstd/lib/compress/zstd_preSplit.h +33 -0
  168. data/ext/multi_compress/vendor/zstd/lib/compress/zstdmt_compress.c +239 -175
  169. data/ext/multi_compress/vendor/zstd/lib/compress/zstdmt_compress.h +5 -16
  170. data/ext/multi_compress/vendor/zstd/lib/decompress/huf_decompress.c +543 -488
  171. data/ext/multi_compress/vendor/zstd/lib/decompress/huf_decompress_amd64.S +78 -61
  172. data/ext/multi_compress/vendor/zstd/lib/decompress/zstd_ddict.c +4 -4
  173. data/ext/multi_compress/vendor/zstd/lib/decompress/zstd_ddict.h +1 -1
  174. data/ext/multi_compress/vendor/zstd/lib/decompress/zstd_decompress.c +295 -115
  175. data/ext/multi_compress/vendor/zstd/lib/decompress/zstd_decompress_block.c +430 -293
  176. data/ext/multi_compress/vendor/zstd/lib/decompress/zstd_decompress_block.h +7 -2
  177. data/ext/multi_compress/vendor/zstd/lib/decompress/zstd_decompress_internal.h +11 -7
  178. data/ext/multi_compress/vendor/zstd/lib/deprecated/zbuff.h +1 -1
  179. data/ext/multi_compress/vendor/zstd/lib/deprecated/zbuff_common.c +1 -1
  180. data/ext/multi_compress/vendor/zstd/lib/deprecated/zbuff_compress.c +1 -1
  181. data/ext/multi_compress/vendor/zstd/lib/deprecated/zbuff_decompress.c +3 -1
  182. data/ext/multi_compress/vendor/zstd/lib/dictBuilder/cover.c +95 -46
  183. data/ext/multi_compress/vendor/zstd/lib/dictBuilder/cover.h +3 -9
  184. data/ext/multi_compress/vendor/zstd/lib/dictBuilder/divsufsort.h +0 -10
  185. data/ext/multi_compress/vendor/zstd/lib/dictBuilder/fastcover.c +4 -4
  186. data/ext/multi_compress/vendor/zstd/lib/dictBuilder/zdict.c +25 -97
  187. data/ext/multi_compress/vendor/zstd/lib/dll/example/Makefile +1 -1
  188. data/ext/multi_compress/vendor/zstd/lib/dll/example/README.md +1 -1
  189. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_legacy.h +38 -1
  190. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v01.c +19 -50
  191. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v01.h +1 -1
  192. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v02.c +27 -80
  193. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v02.h +1 -1
  194. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v03.c +28 -83
  195. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v03.h +1 -1
  196. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v04.c +25 -74
  197. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v04.h +1 -1
  198. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v05.c +31 -76
  199. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v05.h +1 -1
  200. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v06.c +44 -88
  201. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v06.h +1 -1
  202. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v07.c +33 -84
  203. data/ext/multi_compress/vendor/zstd/lib/legacy/zstd_v07.h +1 -1
  204. data/ext/multi_compress/vendor/zstd/lib/libzstd.mk +65 -33
  205. data/ext/multi_compress/vendor/zstd/lib/libzstd.pc.in +5 -5
  206. data/ext/multi_compress/vendor/zstd/lib/module.modulemap +13 -3
  207. data/ext/multi_compress/vendor/zstd/lib/zdict.h +65 -36
  208. data/ext/multi_compress/vendor/zstd/lib/zstd.h +890 -267
  209. data/ext/multi_compress/vendor/zstd/lib/zstd_errors.h +28 -16
  210. data/lib/multi_compress/version.rb +1 -1
  211. data/lib/multi_compress.rb +80 -41
  212. metadata +29 -2
@@ -1,7 +1,7 @@
1
1
  /* ******************************************************************
2
2
  * huff0 huffman decoder,
3
3
  * part of Finite State Entropy library
4
- * Copyright (c) Yann Collet, Facebook, Inc.
4
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
5
5
  *
6
6
  * You can contact the author at :
7
7
  * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -19,10 +19,10 @@
19
19
  #include "../common/compiler.h"
20
20
  #include "../common/bitstream.h" /* BIT_* */
21
21
  #include "../common/fse.h" /* to compress headers */
22
- #define HUF_STATIC_LINKING_ONLY
23
22
  #include "../common/huf.h"
24
23
  #include "../common/error_private.h"
25
24
  #include "../common/zstd_internal.h"
25
+ #include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
26
26
 
27
27
  /* **************************************************************
28
28
  * Constants
@@ -34,6 +34,12 @@
34
34
  * Macros
35
35
  ****************************************************************/
36
36
 
37
+ #ifdef HUF_DISABLE_FAST_DECODE
38
+ # define HUF_ENABLE_FAST_DECODE 0
39
+ #else
40
+ # define HUF_ENABLE_FAST_DECODE 1
41
+ #endif
42
+
37
43
  /* These two optional macros force the use one way or another of the two
38
44
  * Huffman decompression implementations. You can't force in both directions
39
45
  * at the same time.
@@ -43,10 +49,14 @@
43
49
  #error "Cannot force the use of the X1 and X2 decoders at the same time!"
44
50
  #endif
45
51
 
46
- #if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2
47
- # define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
52
+ /* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
53
+ * supported at runtime, so we can add the BMI2 target attribute.
54
+ * When it is disabled, we will still get BMI2 if it is enabled statically.
55
+ */
56
+ #if DYNAMIC_BMI2
57
+ # define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
48
58
  #else
49
- # define HUF_ASM_X86_64_BMI2_ATTRS
59
+ # define HUF_FAST_BMI2_ATTRS
50
60
  #endif
51
61
 
52
62
  #ifdef __cplusplus
@@ -56,18 +66,12 @@
56
66
  #endif
57
67
  #define HUF_ASM_DECL HUF_EXTERN_C
58
68
 
59
- #if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
69
+ #if DYNAMIC_BMI2
60
70
  # define HUF_NEED_BMI2_FUNCTION 1
61
71
  #else
62
72
  # define HUF_NEED_BMI2_FUNCTION 0
63
73
  #endif
64
74
 
65
- #if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
66
- # define HUF_NEED_DEFAULT_FUNCTION 1
67
- #else
68
- # define HUF_NEED_DEFAULT_FUNCTION 0
69
- #endif
70
-
71
75
  /* **************************************************************
72
76
  * Error Management
73
77
  ****************************************************************/
@@ -84,6 +88,11 @@
84
88
  /* **************************************************************
85
89
  * BMI2 Variant Wrappers
86
90
  ****************************************************************/
91
+ typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
92
+ const void *cSrc,
93
+ size_t cSrcSize,
94
+ const HUF_DTable *DTable);
95
+
87
96
  #if DYNAMIC_BMI2
88
97
 
89
98
  #define HUF_DGEN(fn) \
@@ -105,9 +114,9 @@
105
114
  } \
106
115
  \
107
116
  static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
108
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
117
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
109
118
  { \
110
- if (bmi2) { \
119
+ if (flags & HUF_flags_bmi2) { \
111
120
  return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
112
121
  } \
113
122
  return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
@@ -117,9 +126,9 @@
117
126
 
118
127
  #define HUF_DGEN(fn) \
119
128
  static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
120
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
129
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
121
130
  { \
122
- (void)bmi2; \
131
+ (void)flags; \
123
132
  return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
124
133
  }
125
134
 
@@ -138,43 +147,66 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
138
147
  return dtd;
139
148
  }
140
149
 
141
- #if ZSTD_ENABLE_ASM_X86_64_BMI2
142
-
143
- static size_t HUF_initDStream(BYTE const* ip) {
150
+ static size_t HUF_initFastDStream(BYTE const* ip) {
144
151
  BYTE const lastByte = ip[7];
145
- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
152
+ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
146
153
  size_t const value = MEM_readLEST(ip) | 1;
147
154
  assert(bitsConsumed <= 8);
155
+ assert(sizeof(size_t) == 8);
148
156
  return value << bitsConsumed;
149
157
  }
158
+
159
+
160
+ /**
161
+ * The input/output arguments to the Huffman fast decoding loop:
162
+ *
163
+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
164
+ * op [in/out] - The output pointers, must be updated to reflect what is written.
165
+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
166
+ * dt [in] - The decoding table.
167
+ * ilowest [in] - The beginning of the valid range of the input. Decoders may read
168
+ * down to this pointer. It may be below iend[0].
169
+ * oend [in] - The end of the output stream. op[3] must not cross oend.
170
+ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
171
+ * as long as it is above ilowest, but that indicates corruption.
172
+ */
150
173
  typedef struct {
151
174
  BYTE const* ip[4];
152
175
  BYTE* op[4];
153
176
  U64 bits[4];
154
177
  void const* dt;
155
- BYTE const* ilimit;
178
+ BYTE const* ilowest;
156
179
  BYTE* oend;
157
180
  BYTE const* iend[4];
158
- } HUF_DecompressAsmArgs;
181
+ } HUF_DecompressFastArgs;
182
+
183
+ typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
159
184
 
160
185
  /**
161
- * Initializes args for the asm decoding loop.
162
- * @returns 0 on success
163
- * 1 if the fallback implementation should be used.
186
+ * Initializes args for the fast decoding loop.
187
+ * @returns 1 on success
188
+ * 0 if the fallback implementation should be used.
164
189
  * Or an error code on failure.
165
190
  */
166
- static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
191
+ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
167
192
  {
168
193
  void const* dt = DTable + 1;
169
194
  U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
170
195
 
171
- const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
196
+ const BYTE* const istart = (const BYTE*)src;
172
197
 
173
- BYTE* const oend = (BYTE*)dst + dstSize;
198
+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
199
+
200
+ /* The fast decoding loop assumes 64-bit little-endian.
201
+ * This condition is false on x32.
202
+ */
203
+ if (!MEM_isLittleEndian() || MEM_32bits())
204
+ return 0;
174
205
 
175
- /* The following condition is false on x32 platform,
176
- * but HUF_asm is not compatible with this ABI */
177
- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1;
206
+ /* Avoid nullptr addition */
207
+ if (dstSize == 0)
208
+ return 0;
209
+ assert(dst != NULL);
178
210
 
179
211
  /* strict minimum : jump table + 1 byte per stream */
180
212
  if (srcSize < 10)
@@ -185,11 +217,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
185
217
  * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
186
218
  */
187
219
  if (dtLog != HUF_DECODER_FAST_TABLELOG)
188
- return 1;
220
+ return 0;
189
221
 
190
222
  /* Read the jump table. */
191
223
  {
192
- const BYTE* const istart = (const BYTE*)src;
193
224
  size_t const length1 = MEM_readLE16(istart);
194
225
  size_t const length2 = MEM_readLE16(istart+2);
195
226
  size_t const length3 = MEM_readLE16(istart+4);
@@ -199,13 +230,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
199
230
  args->iend[2] = args->iend[1] + length2;
200
231
  args->iend[3] = args->iend[2] + length3;
201
232
 
202
- /* HUF_initDStream() requires this, and this small of an input
233
+ /* HUF_initFastDStream() requires this, and this small of an input
203
234
  * won't benefit from the ASM loop anyways.
204
- * length1 must be >= 16 so that ip[0] >= ilimit before the loop
205
- * starts.
206
235
  */
207
- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
208
- return 1;
236
+ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8)
237
+ return 0;
209
238
  if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
210
239
  }
211
240
  /* ip[] contains the position that is currently loaded into bits[]. */
@@ -222,7 +251,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
222
251
 
223
252
  /* No point to call the ASM loop for tiny outputs. */
224
253
  if (args->op[3] >= oend)
225
- return 1;
254
+ return 0;
226
255
 
227
256
  /* bits[] is the bit container.
228
257
  * It is read from the MSB down to the LSB.
@@ -231,24 +260,25 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
231
260
  * set, so that CountTrailingZeros(bits[]) can be used
232
261
  * to count how many bits we've consumed.
233
262
  */
234
- args->bits[0] = HUF_initDStream(args->ip[0]);
235
- args->bits[1] = HUF_initDStream(args->ip[1]);
236
- args->bits[2] = HUF_initDStream(args->ip[2]);
237
- args->bits[3] = HUF_initDStream(args->ip[3]);
238
-
239
- /* If ip[] >= ilimit, it is guaranteed to be safe to
240
- * reload bits[]. It may be beyond its section, but is
241
- * guaranteed to be valid (>= istart).
242
- */
243
- args->ilimit = ilimit;
263
+ args->bits[0] = HUF_initFastDStream(args->ip[0]);
264
+ args->bits[1] = HUF_initFastDStream(args->ip[1]);
265
+ args->bits[2] = HUF_initFastDStream(args->ip[2]);
266
+ args->bits[3] = HUF_initFastDStream(args->ip[3]);
267
+
268
+ /* The decoders must be sure to never read beyond ilowest.
269
+ * This is lower than iend[0], but allowing decoders to read
270
+ * down to ilowest can allow an extra iteration or two in the
271
+ * fast loop.
272
+ */
273
+ args->ilowest = istart;
244
274
 
245
275
  args->oend = oend;
246
276
  args->dt = dt;
247
277
 
248
- return 0;
278
+ return 1;
249
279
  }
250
280
 
251
- static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd)
281
+ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
252
282
  {
253
283
  /* Validate that we haven't overwritten. */
254
284
  if (args->op[stream] > segmentEnd)
@@ -262,15 +292,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs
262
292
  return ERROR(corruption_detected);
263
293
 
264
294
  /* Construct the BIT_DStream_t. */
265
- bit->bitContainer = MEM_readLE64(args->ip[stream]);
266
- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
267
- bit->start = (const char*)args->iend[0];
295
+ assert(sizeof(size_t) == 8);
296
+ bit->bitContainer = MEM_readLEST(args->ip[stream]);
297
+ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
298
+ bit->start = (const char*)args->ilowest;
268
299
  bit->limitPtr = bit->start + sizeof(size_t);
269
300
  bit->ptr = (const char*)args->ip[stream];
270
301
 
271
302
  return 0;
272
303
  }
273
- #endif
304
+
305
+ /* Calls X(N) for each stream 0, 1, 2, 3. */
306
+ #define HUF_4X_FOR_EACH_STREAM(X) \
307
+ do { \
308
+ X(0); \
309
+ X(1); \
310
+ X(2); \
311
+ X(3); \
312
+ } while (0)
313
+
314
+ /* Calls X(N, var) for each stream 0, 1, 2, 3. */
315
+ #define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \
316
+ do { \
317
+ X(0, (var)); \
318
+ X(1, (var)); \
319
+ X(2, (var)); \
320
+ X(3, (var)); \
321
+ } while (0)
274
322
 
275
323
 
276
324
  #ifndef HUF_FORCE_DECOMPRESS_X2
@@ -287,10 +335,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi
287
335
  static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
288
336
  U64 D4;
289
337
  if (MEM_isLittleEndian()) {
290
- D4 = (symbol << 8) + nbBits;
338
+ D4 = (U64)((symbol << 8) + nbBits);
291
339
  } else {
292
- D4 = symbol + (nbBits << 8);
340
+ D4 = (U64)(symbol + (nbBits << 8));
293
341
  }
342
+ assert(D4 < (1U << 16));
294
343
  D4 *= 0x0001000100010001ULL;
295
344
  return D4;
296
345
  }
@@ -333,13 +382,7 @@ typedef struct {
333
382
  BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
334
383
  } HUF_ReadDTableX1_Workspace;
335
384
 
336
-
337
- size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
338
- {
339
- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
340
- }
341
-
342
- size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2)
385
+ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
343
386
  {
344
387
  U32 tableLog = 0;
345
388
  U32 nbSymbols = 0;
@@ -354,7 +397,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
354
397
  DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
355
398
  /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
356
399
 
357
- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
400
+ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
358
401
  if (HUF_isError(iSize)) return iSize;
359
402
 
360
403
 
@@ -381,9 +424,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
381
424
  * rankStart[0] is not filled because there are no entries in the table for
382
425
  * weight 0.
383
426
  */
384
- {
385
- int n;
386
- int nextRankStart = 0;
427
+ { int n;
428
+ U32 nextRankStart = 0;
387
429
  int const unroll = 4;
388
430
  int const nLimit = (int)nbSymbols - unroll + 1;
389
431
  for (n=0; n<(int)tableLog+1; n++) {
@@ -410,10 +452,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
410
452
  * We can switch based on the length to a different inner loop which is
411
453
  * optimized for that particular case.
412
454
  */
413
- {
414
- U32 w;
415
- int symbol=wksp->rankVal[0];
416
- int rankStart=0;
455
+ { U32 w;
456
+ int symbol = wksp->rankVal[0];
457
+ int rankStart = 0;
417
458
  for (w=1; w<tableLog+1; ++w) {
418
459
  int const symbolCount = wksp->rankVal[w];
419
460
  int const length = (1 << w) >> 1;
@@ -487,15 +528,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog
487
528
  }
488
529
 
489
530
  #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
490
- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
531
+ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0)
491
532
 
492
- #define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \
493
- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
494
- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
533
+ #define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \
534
+ do { \
535
+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
536
+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
537
+ } while (0)
495
538
 
496
- #define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
497
- if (MEM_64bits()) \
498
- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
539
+ #define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
540
+ do { \
541
+ if (MEM_64bits()) \
542
+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
543
+ } while (0)
499
544
 
500
545
  HINT_INLINE size_t
501
546
  HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
@@ -523,7 +568,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
523
568
  while (p < pEnd)
524
569
  HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
525
570
 
526
- return pEnd-pStart;
571
+ return (size_t)(pEnd-pStart);
527
572
  }
528
573
 
529
574
  FORCE_INLINE_TEMPLATE size_t
@@ -533,7 +578,7 @@ HUF_decompress1X1_usingDTable_internal_body(
533
578
  const HUF_DTable* DTable)
534
579
  {
535
580
  BYTE* op = (BYTE*)dst;
536
- BYTE* const oend = op + dstSize;
581
+ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize);
537
582
  const void* dtPtr = DTable + 1;
538
583
  const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
539
584
  BIT_DStream_t bitD;
@@ -549,6 +594,10 @@ HUF_decompress1X1_usingDTable_internal_body(
549
594
  return dstSize;
550
595
  }
551
596
 
597
+ /* HUF_decompress4X1_usingDTable_internal_body():
598
+ * Conditions :
599
+ * @dstSize >= 6
600
+ */
552
601
  FORCE_INLINE_TEMPLATE size_t
553
602
  HUF_decompress4X1_usingDTable_internal_body(
554
603
  void* dst, size_t dstSize,
@@ -557,6 +606,7 @@ HUF_decompress4X1_usingDTable_internal_body(
557
606
  {
558
607
  /* Check */
559
608
  if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
609
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
560
610
 
561
611
  { const BYTE* const istart = (const BYTE*) cSrc;
562
612
  BYTE* const ostart = (BYTE*) dst;
@@ -592,6 +642,7 @@ HUF_decompress4X1_usingDTable_internal_body(
592
642
 
593
643
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
594
644
  if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
645
+ assert(dstSize >= 6); /* validated above */
595
646
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
596
647
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
597
648
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -654,52 +705,173 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
654
705
  }
655
706
  #endif
656
707
 
657
- #if HUF_NEED_DEFAULT_FUNCTION
658
708
  static
659
709
  size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
660
710
  size_t cSrcSize, HUF_DTable const* DTable) {
661
711
  return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
662
712
  }
663
- #endif
664
713
 
665
714
  #if ZSTD_ENABLE_ASM_X86_64_BMI2
666
715
 
667
- HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
716
+ HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
717
+
718
+ #endif
719
+
720
+ static HUF_FAST_BMI2_ATTRS
721
+ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
722
+ {
723
+ U64 bits[4];
724
+ BYTE const* ip[4];
725
+ BYTE* op[4];
726
+ U16 const* const dtable = (U16 const*)args->dt;
727
+ BYTE* const oend = args->oend;
728
+ BYTE const* const ilowest = args->ilowest;
729
+
730
+ /* Copy the arguments to local variables */
731
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
732
+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
733
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
734
+
735
+ assert(MEM_isLittleEndian());
736
+ assert(!MEM_32bits());
737
+
738
+ for (;;) {
739
+ BYTE* olimit;
740
+ int stream;
741
+
742
+ /* Assert loop preconditions */
743
+ #ifndef NDEBUG
744
+ for (stream = 0; stream < 4; ++stream) {
745
+ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
746
+ assert(ip[stream] >= ilowest);
747
+ }
748
+ #endif
749
+ /* Compute olimit */
750
+ {
751
+ /* Each iteration produces 5 output symbols per stream */
752
+ size_t const oiters = (size_t)(oend - op[3]) / 5;
753
+ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
754
+ * per stream.
755
+ */
756
+ size_t const iiters = (size_t)(ip[0] - ilowest) / 7;
757
+ /* We can safely run iters iterations before running bounds checks */
758
+ size_t const iters = MIN(oiters, iiters);
759
+ size_t const symbols = iters * 5;
760
+
761
+ /* We can simply check that op[3] < olimit, instead of checking all
762
+ * of our bounds, since we can't hit the other bounds until we've run
763
+ * iters iterations, which only happens when op[3] == olimit.
764
+ */
765
+ olimit = op[3] + symbols;
766
+
767
+ /* Exit fast decoding loop once we reach the end. */
768
+ if (op[3] == olimit)
769
+ break;
770
+
771
+ /* Exit the decoding loop if any input pointer has crossed the
772
+ * previous one. This indicates corruption, and a precondition
773
+ * to our loop is that ip[i] >= ip[0].
774
+ */
775
+ for (stream = 1; stream < 4; ++stream) {
776
+ if (ip[stream] < ip[stream - 1])
777
+ goto _out;
778
+ }
779
+ }
780
+
781
+ #ifndef NDEBUG
782
+ for (stream = 1; stream < 4; ++stream) {
783
+ assert(ip[stream] >= ip[stream - 1]);
784
+ }
785
+ #endif
786
+
787
+ #define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \
788
+ do { \
789
+ int const index = (int)(bits[(_stream)] >> 53); \
790
+ int const entry = (int)dtable[index]; \
791
+ bits[(_stream)] <<= (entry & 0x3F); \
792
+ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
793
+ } while (0)
794
+
795
+ #define HUF_4X1_RELOAD_STREAM(_stream) \
796
+ do { \
797
+ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
798
+ int const nbBits = ctz & 7; \
799
+ int const nbBytes = ctz >> 3; \
800
+ op[(_stream)] += 5; \
801
+ ip[(_stream)] -= nbBytes; \
802
+ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
803
+ bits[(_stream)] <<= nbBits; \
804
+ } while (0)
805
+
806
+ /* Manually unroll the loop because compilers don't consistently
807
+ * unroll the inner loops, which destroys performance.
808
+ */
809
+ do {
810
+ /* Decode 5 symbols in each of the 4 streams */
811
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0);
812
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1);
813
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2);
814
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3);
815
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
816
+
817
+ /* Reload each of the 4 bitstreams */
818
+ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
819
+ } while (op[3] < olimit);
820
+
821
+ #undef HUF_4X1_DECODE_SYMBOL
822
+ #undef HUF_4X1_RELOAD_STREAM
823
+ }
824
+
825
+ _out:
668
826
 
669
- static HUF_ASM_X86_64_BMI2_ATTRS
827
+ /* Save the final values of each of the state variables back to args. */
828
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
829
+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
830
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
831
+ }
832
+
833
+ /**
834
+ * @returns @p dstSize on success (>= 6)
835
+ * 0 if the fallback implementation should be used
836
+ * An error if an error occurred
837
+ */
838
+ static HUF_FAST_BMI2_ATTRS
670
839
  size_t
671
- HUF_decompress4X1_usingDTable_internal_bmi2_asm(
840
+ HUF_decompress4X1_usingDTable_internal_fast(
672
841
  void* dst, size_t dstSize,
673
842
  const void* cSrc, size_t cSrcSize,
674
- const HUF_DTable* DTable)
843
+ const HUF_DTable* DTable,
844
+ HUF_DecompressFastLoopFn loopFn)
675
845
  {
676
846
  void const* dt = DTable + 1;
677
- const BYTE* const iend = (const BYTE*)cSrc + 6;
678
- BYTE* const oend = (BYTE*)dst + dstSize;
679
- HUF_DecompressAsmArgs args;
680
- {
681
- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
682
- FORWARD_IF_ERROR(ret, "Failed to init asm args");
683
- if (ret != 0)
684
- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
847
+ BYTE const* const ilowest = (BYTE const*)cSrc;
848
+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
849
+ HUF_DecompressFastArgs args;
850
+ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
851
+ FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
852
+ if (ret == 0)
853
+ return 0;
685
854
  }
686
855
 
687
- assert(args.ip[0] >= args.ilimit);
688
- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
856
+ assert(args.ip[0] >= args.ilowest);
857
+ loopFn(&args);
689
858
 
690
- /* Our loop guarantees that ip[] >= ilimit and that we haven't
859
+ /* Our loop guarantees that ip[] >= ilowest and that we haven't
691
860
  * overwritten any op[].
692
861
  */
693
- assert(args.ip[0] >= iend);
694
- assert(args.ip[1] >= iend);
695
- assert(args.ip[2] >= iend);
696
- assert(args.ip[3] >= iend);
862
+ assert(args.ip[0] >= ilowest);
863
+ assert(args.ip[0] >= ilowest);
864
+ assert(args.ip[1] >= ilowest);
865
+ assert(args.ip[2] >= ilowest);
866
+ assert(args.ip[3] >= ilowest);
697
867
  assert(args.op[3] <= oend);
698
- (void)iend;
868
+
869
+ assert(ilowest == args.ilowest);
870
+ assert(ilowest + 6 == args.iend[0]);
871
+ (void)ilowest;
699
872
 
700
873
  /* finish bit streams one by one. */
701
- {
702
- size_t const segmentSize = (dstSize+3) / 4;
874
+ { size_t const segmentSize = (dstSize+3) / 4;
703
875
  BYTE* segmentEnd = (BYTE*)dst;
704
876
  int i;
705
877
  for (i = 0; i < 4; ++i) {
@@ -716,97 +888,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
716
888
  }
717
889
 
718
890
  /* decoded size */
891
+ assert(dstSize != 0);
719
892
  return dstSize;
720
893
  }
721
- #endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
722
-
723
- typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
724
- const void *cSrc,
725
- size_t cSrcSize,
726
- const HUF_DTable *DTable);
727
894
 
728
895
  HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
729
896
 
730
897
  static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
731
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
898
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
732
899
  {
900
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
901
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
902
+
733
903
  #if DYNAMIC_BMI2
734
- if (bmi2) {
904
+ if (flags & HUF_flags_bmi2) {
905
+ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
735
906
  # if ZSTD_ENABLE_ASM_X86_64_BMI2
736
- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
737
- # else
738
- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
907
+ if (!(flags & HUF_flags_disableAsm)) {
908
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
909
+ }
739
910
  # endif
911
+ } else {
912
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
740
913
  }
741
- #else
742
- (void)bmi2;
743
914
  #endif
744
915
 
745
916
  #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
746
- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
747
- #else
748
- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
917
+ if (!(flags & HUF_flags_disableAsm)) {
918
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
919
+ }
749
920
  #endif
750
- }
751
921
 
752
-
753
- size_t HUF_decompress1X1_usingDTable(
754
- void* dst, size_t dstSize,
755
- const void* cSrc, size_t cSrcSize,
756
- const HUF_DTable* DTable)
757
- {
758
- DTableDesc dtd = HUF_getDTableDesc(DTable);
759
- if (dtd.tableType != 0) return ERROR(GENERIC);
760
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
761
- }
762
-
763
- size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
764
- const void* cSrc, size_t cSrcSize,
765
- void* workSpace, size_t wkspSize)
766
- {
767
- const BYTE* ip = (const BYTE*) cSrc;
768
-
769
- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
770
- if (HUF_isError(hSize)) return hSize;
771
- if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
772
- ip += hSize; cSrcSize -= hSize;
773
-
774
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
775
- }
776
-
777
-
778
- size_t HUF_decompress4X1_usingDTable(
779
- void* dst, size_t dstSize,
780
- const void* cSrc, size_t cSrcSize,
781
- const HUF_DTable* DTable)
782
- {
783
- DTableDesc dtd = HUF_getDTableDesc(DTable);
784
- if (dtd.tableType != 0) return ERROR(GENERIC);
785
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
922
+ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
923
+ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
924
+ if (ret != 0)
925
+ return ret;
926
+ }
927
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
786
928
  }
787
929
 
788
- static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
930
+ static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
789
931
  const void* cSrc, size_t cSrcSize,
790
- void* workSpace, size_t wkspSize, int bmi2)
932
+ void* workSpace, size_t wkspSize, int flags)
791
933
  {
792
934
  const BYTE* ip = (const BYTE*) cSrc;
793
935
 
794
- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
936
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
795
937
  if (HUF_isError(hSize)) return hSize;
796
938
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
797
939
  ip += hSize; cSrcSize -= hSize;
798
940
 
799
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
800
- }
801
-
802
- size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
803
- const void* cSrc, size_t cSrcSize,
804
- void* workSpace, size_t wkspSize)
805
- {
806
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
941
+ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
807
942
  }
808
943
 
809
-
810
944
  #endif /* HUF_FORCE_DECOMPRESS_X2 */
811
945
 
812
946
 
@@ -989,7 +1123,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32
989
1123
 
990
1124
  static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
991
1125
  const sortedSymbol_t* sortedList,
992
- const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
1126
+ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
993
1127
  const U32 nbBitsBaseline)
994
1128
  {
995
1129
  U32* const rankVal = rankValOrigin[0];
@@ -1044,14 +1178,7 @@ typedef struct {
1044
1178
 
1045
1179
  size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
1046
1180
  const void* src, size_t srcSize,
1047
- void* workSpace, size_t wkspSize)
1048
- {
1049
- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
1050
- }
1051
-
1052
- size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
1053
- const void* src, size_t srcSize,
1054
- void* workSpace, size_t wkspSize, int bmi2)
1181
+ void* workSpace, size_t wkspSize, int flags)
1055
1182
  {
1056
1183
  U32 tableLog, maxW, nbSymbols;
1057
1184
  DTableDesc dtd = HUF_getDTableDesc(DTable);
@@ -1073,7 +1200,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
1073
1200
  if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
1074
1201
  /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzers complain ... */
1075
1202
 
1076
- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
1203
+ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
1077
1204
  if (HUF_isError(iSize)) return iSize;
1078
1205
 
1079
1206
  /* check result */
@@ -1163,15 +1290,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c
1163
1290
  }
1164
1291
 
1165
1292
  #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
1166
- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
1293
+ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0)
1167
1294
 
1168
- #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
1169
- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
1170
- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
1295
+ #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
1296
+ do { \
1297
+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
1298
+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
1299
+ } while (0)
1171
1300
 
1172
- #define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
1173
- if (MEM_64bits()) \
1174
- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
1301
+ #define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
1302
+ do { \
1303
+ if (MEM_64bits()) \
1304
+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
1305
+ } while (0)
1175
1306
 
1176
1307
  HINT_INLINE size_t
1177
1308
  HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
@@ -1231,7 +1362,7 @@ HUF_decompress1X2_usingDTable_internal_body(
1231
1362
 
1232
1363
  /* decode */
1233
1364
  { BYTE* const ostart = (BYTE*) dst;
1234
- BYTE* const oend = ostart + dstSize;
1365
+ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize);
1235
1366
  const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */
1236
1367
  const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
1237
1368
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
@@ -1244,6 +1375,11 @@ HUF_decompress1X2_usingDTable_internal_body(
1244
1375
  /* decoded size */
1245
1376
  return dstSize;
1246
1377
  }
1378
+
1379
+ /* HUF_decompress4X2_usingDTable_internal_body():
1380
+ * Conditions:
1381
+ * @dstSize >= 6
1382
+ */
1247
1383
  FORCE_INLINE_TEMPLATE size_t
1248
1384
  HUF_decompress4X2_usingDTable_internal_body(
1249
1385
  void* dst, size_t dstSize,
@@ -1251,6 +1387,7 @@ HUF_decompress4X2_usingDTable_internal_body(
1251
1387
  const HUF_DTable* DTable)
1252
1388
  {
1253
1389
  if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
1390
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
1254
1391
 
1255
1392
  { const BYTE* const istart = (const BYTE*) cSrc;
1256
1393
  BYTE* const ostart = (BYTE*) dst;
@@ -1284,8 +1421,9 @@ HUF_decompress4X2_usingDTable_internal_body(
1284
1421
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1285
1422
  U32 const dtLog = dtd.tableLog;
1286
1423
 
1287
- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1288
- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
1424
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
1425
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
1426
+ assert(dstSize >= 6 /* validated above */);
1289
1427
  CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
1290
1428
  CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
1291
1429
  CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -1370,44 +1508,191 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
1370
1508
  }
1371
1509
  #endif
1372
1510
 
1373
- #if HUF_NEED_DEFAULT_FUNCTION
1374
1511
  static
1375
1512
  size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
1376
1513
  size_t cSrcSize, HUF_DTable const* DTable) {
1377
1514
  return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
1378
1515
  }
1379
- #endif
1380
1516
 
1381
1517
  #if ZSTD_ENABLE_ASM_X86_64_BMI2
1382
1518
 
1383
- HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
1519
+ HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
1520
+
1521
+ #endif
1522
+
1523
+ static HUF_FAST_BMI2_ATTRS
1524
+ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
1525
+ {
1526
+ U64 bits[4];
1527
+ BYTE const* ip[4];
1528
+ BYTE* op[4];
1529
+ BYTE* oend[4];
1530
+ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
1531
+ BYTE const* const ilowest = args->ilowest;
1532
+
1533
+ /* Copy the arguments to local registers. */
1534
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
1535
+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
1536
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
1537
+
1538
+ oend[0] = op[1];
1539
+ oend[1] = op[2];
1540
+ oend[2] = op[3];
1541
+ oend[3] = args->oend;
1542
+
1543
+ assert(MEM_isLittleEndian());
1544
+ assert(!MEM_32bits());
1545
+
1546
+ for (;;) {
1547
+ BYTE* olimit;
1548
+ int stream;
1549
+
1550
+ /* Assert loop preconditions */
1551
+ #ifndef NDEBUG
1552
+ for (stream = 0; stream < 4; ++stream) {
1553
+ assert(op[stream] <= oend[stream]);
1554
+ assert(ip[stream] >= ilowest);
1555
+ }
1556
+ #endif
1557
+ /* Compute olimit */
1558
+ {
1559
+ /* Each loop does 5 table lookups for each of the 4 streams.
1560
+ * Each table lookup consumes up to 11 bits of input, and produces
1561
+ * up to 2 bytes of output.
1562
+ */
1563
+ /* We can consume up to 7 bytes of input per iteration per stream.
1564
+ * We also know that each input pointer is >= ip[0]. So we can run
1565
+ * iters loops before running out of input.
1566
+ */
1567
+ size_t iters = (size_t)(ip[0] - ilowest) / 7;
1568
+ /* Each iteration can produce up to 10 bytes of output per stream.
1569
+ * Each output stream my advance at different rates. So take the
1570
+ * minimum number of safe iterations among all the output streams.
1571
+ */
1572
+ for (stream = 0; stream < 4; ++stream) {
1573
+ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
1574
+ iters = MIN(iters, oiters);
1575
+ }
1576
+
1577
+ /* Each iteration produces at least 5 output symbols. So until
1578
+ * op[3] crosses olimit, we know we haven't executed iters
1579
+ * iterations yet. This saves us maintaining an iters counter,
1580
+ * at the expense of computing the remaining # of iterations
1581
+ * more frequently.
1582
+ */
1583
+ olimit = op[3] + (iters * 5);
1384
1584
 
1385
- static HUF_ASM_X86_64_BMI2_ATTRS size_t
1386
- HUF_decompress4X2_usingDTable_internal_bmi2_asm(
1585
+ /* Exit the fast decoding loop once we reach the end. */
1586
+ if (op[3] == olimit)
1587
+ break;
1588
+
1589
+ /* Exit the decoding loop if any input pointer has crossed the
1590
+ * previous one. This indicates corruption, and a precondition
1591
+ * to our loop is that ip[i] >= ip[0].
1592
+ */
1593
+ for (stream = 1; stream < 4; ++stream) {
1594
+ if (ip[stream] < ip[stream - 1])
1595
+ goto _out;
1596
+ }
1597
+ }
1598
+
1599
+ #ifndef NDEBUG
1600
+ for (stream = 1; stream < 4; ++stream) {
1601
+ assert(ip[stream] >= ip[stream - 1]);
1602
+ }
1603
+ #endif
1604
+
1605
+ #define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \
1606
+ do { \
1607
+ if ((_decode3) || (_stream) != 3) { \
1608
+ int const index = (int)(bits[(_stream)] >> 53); \
1609
+ HUF_DEltX2 const entry = dtable[index]; \
1610
+ MEM_write16(op[(_stream)], entry.sequence); \
1611
+ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \
1612
+ op[(_stream)] += (entry.length); \
1613
+ } \
1614
+ } while (0)
1615
+
1616
+ #define HUF_4X2_RELOAD_STREAM(_stream) \
1617
+ do { \
1618
+ HUF_4X2_DECODE_SYMBOL(3, 1); \
1619
+ { \
1620
+ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
1621
+ int const nbBits = ctz & 7; \
1622
+ int const nbBytes = ctz >> 3; \
1623
+ ip[(_stream)] -= nbBytes; \
1624
+ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
1625
+ bits[(_stream)] <<= nbBits; \
1626
+ } \
1627
+ } while (0)
1628
+
1629
+ /* Manually unroll the loop because compilers don't consistently
1630
+ * unroll the inner loops, which destroys performance.
1631
+ */
1632
+ do {
1633
+ /* Decode 5 symbols from each of the first 3 streams.
1634
+ * The final stream will be decoded during the reload phase
1635
+ * to reduce register pressure.
1636
+ */
1637
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1638
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1639
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1640
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1641
+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
1642
+
1643
+ /* Decode one symbol from the final stream */
1644
+ HUF_4X2_DECODE_SYMBOL(3, 1);
1645
+
1646
+ /* Decode 4 symbols from the final stream & reload bitstreams.
1647
+ * The final stream is reloaded last, meaning that all 5 symbols
1648
+ * are decoded from the final stream before it is reloaded.
1649
+ */
1650
+ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
1651
+ } while (op[3] < olimit);
1652
+ }
1653
+
1654
+ #undef HUF_4X2_DECODE_SYMBOL
1655
+ #undef HUF_4X2_RELOAD_STREAM
1656
+
1657
+ _out:
1658
+
1659
+ /* Save the final values of each of the state variables back to args. */
1660
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
1661
+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
1662
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
1663
+ }
1664
+
1665
+
1666
+ static HUF_FAST_BMI2_ATTRS size_t
1667
+ HUF_decompress4X2_usingDTable_internal_fast(
1387
1668
  void* dst, size_t dstSize,
1388
1669
  const void* cSrc, size_t cSrcSize,
1389
- const HUF_DTable* DTable) {
1670
+ const HUF_DTable* DTable,
1671
+ HUF_DecompressFastLoopFn loopFn) {
1390
1672
  void const* dt = DTable + 1;
1391
- const BYTE* const iend = (const BYTE*)cSrc + 6;
1392
- BYTE* const oend = (BYTE*)dst + dstSize;
1393
- HUF_DecompressAsmArgs args;
1673
+ const BYTE* const ilowest = (const BYTE*)cSrc;
1674
+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
1675
+ HUF_DecompressFastArgs args;
1394
1676
  {
1395
- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
1677
+ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
1396
1678
  FORWARD_IF_ERROR(ret, "Failed to init asm args");
1397
- if (ret != 0)
1398
- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
1679
+ if (ret == 0)
1680
+ return 0;
1399
1681
  }
1400
1682
 
1401
- assert(args.ip[0] >= args.ilimit);
1402
- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args);
1683
+ assert(args.ip[0] >= args.ilowest);
1684
+ loopFn(&args);
1403
1685
 
1404
1686
  /* note : op4 already verified within main loop */
1405
- assert(args.ip[0] >= iend);
1406
- assert(args.ip[1] >= iend);
1407
- assert(args.ip[2] >= iend);
1408
- assert(args.ip[3] >= iend);
1687
+ assert(args.ip[0] >= ilowest);
1688
+ assert(args.ip[1] >= ilowest);
1689
+ assert(args.ip[2] >= ilowest);
1690
+ assert(args.ip[3] >= ilowest);
1409
1691
  assert(args.op[3] <= oend);
1410
- (void)iend;
1692
+
1693
+ assert(ilowest == args.ilowest);
1694
+ assert(ilowest + 6 == args.iend[0]);
1695
+ (void)ilowest;
1411
1696
 
1412
1697
  /* finish bitStreams one by one */
1413
1698
  {
@@ -1430,91 +1715,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm(
1430
1715
  /* decoded size */
1431
1716
  return dstSize;
1432
1717
  }
1433
- #endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
1434
1718
 
1435
1719
  static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
1436
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
1720
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
1437
1721
  {
1722
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
1723
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
1724
+
1438
1725
  #if DYNAMIC_BMI2
1439
- if (bmi2) {
1726
+ if (flags & HUF_flags_bmi2) {
1727
+ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
1440
1728
  # if ZSTD_ENABLE_ASM_X86_64_BMI2
1441
- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
1442
- # else
1443
- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
1729
+ if (!(flags & HUF_flags_disableAsm)) {
1730
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
1731
+ }
1444
1732
  # endif
1733
+ } else {
1734
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
1445
1735
  }
1446
- #else
1447
- (void)bmi2;
1448
1736
  #endif
1449
1737
 
1450
1738
  #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
1451
- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
1452
- #else
1453
- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
1739
+ if (!(flags & HUF_flags_disableAsm)) {
1740
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
1741
+ }
1454
1742
  #endif
1743
+
1744
+ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
1745
+ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
1746
+ if (ret != 0)
1747
+ return ret;
1748
+ }
1749
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
1455
1750
  }
1456
1751
 
1457
1752
  HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
1458
1753
 
1459
- size_t HUF_decompress1X2_usingDTable(
1460
- void* dst, size_t dstSize,
1461
- const void* cSrc, size_t cSrcSize,
1462
- const HUF_DTable* DTable)
1463
- {
1464
- DTableDesc dtd = HUF_getDTableDesc(DTable);
1465
- if (dtd.tableType != 1) return ERROR(GENERIC);
1466
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1467
- }
1468
-
1469
1754
  size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
1470
1755
  const void* cSrc, size_t cSrcSize,
1471
- void* workSpace, size_t wkspSize)
1756
+ void* workSpace, size_t wkspSize, int flags)
1472
1757
  {
1473
1758
  const BYTE* ip = (const BYTE*) cSrc;
1474
1759
 
1475
1760
  size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
1476
- workSpace, wkspSize);
1761
+ workSpace, wkspSize, flags);
1477
1762
  if (HUF_isError(hSize)) return hSize;
1478
1763
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1479
1764
  ip += hSize; cSrcSize -= hSize;
1480
1765
 
1481
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
1766
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
1482
1767
  }
1483
1768
 
1484
-
1485
- size_t HUF_decompress4X2_usingDTable(
1486
- void* dst, size_t dstSize,
1487
- const void* cSrc, size_t cSrcSize,
1488
- const HUF_DTable* DTable)
1489
- {
1490
- DTableDesc dtd = HUF_getDTableDesc(DTable);
1491
- if (dtd.tableType != 1) return ERROR(GENERIC);
1492
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1493
- }
1494
-
1495
- static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
1769
+ static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1496
1770
  const void* cSrc, size_t cSrcSize,
1497
- void* workSpace, size_t wkspSize, int bmi2)
1771
+ void* workSpace, size_t wkspSize, int flags)
1498
1772
  {
1499
1773
  const BYTE* ip = (const BYTE*) cSrc;
1500
1774
 
1501
1775
  size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
1502
- workSpace, wkspSize);
1776
+ workSpace, wkspSize, flags);
1503
1777
  if (HUF_isError(hSize)) return hSize;
1504
1778
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1505
1779
  ip += hSize; cSrcSize -= hSize;
1506
1780
 
1507
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
1508
- }
1509
-
1510
- size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1511
- const void* cSrc, size_t cSrcSize,
1512
- void* workSpace, size_t wkspSize)
1513
- {
1514
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
1781
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
1515
1782
  }
1516
1783
 
1517
-
1518
1784
  #endif /* HUF_FORCE_DECOMPRESS_X1 */
1519
1785
 
1520
1786
 
@@ -1522,44 +1788,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1522
1788
  /* Universal decompression selectors */
1523
1789
  /* ***********************************/
1524
1790
 
1525
- size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
1526
- const void* cSrc, size_t cSrcSize,
1527
- const HUF_DTable* DTable)
1528
- {
1529
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
1530
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1531
- (void)dtd;
1532
- assert(dtd.tableType == 0);
1533
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1534
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1535
- (void)dtd;
1536
- assert(dtd.tableType == 1);
1537
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1538
- #else
1539
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
1540
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1541
- #endif
1542
- }
1543
-
1544
- size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
1545
- const void* cSrc, size_t cSrcSize,
1546
- const HUF_DTable* DTable)
1547
- {
1548
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
1549
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1550
- (void)dtd;
1551
- assert(dtd.tableType == 0);
1552
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1553
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1554
- (void)dtd;
1555
- assert(dtd.tableType == 1);
1556
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1557
- #else
1558
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
1559
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
1560
- #endif
1561
- }
1562
-
1563
1791
 
1564
1792
  #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1565
1793
  typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
@@ -1614,36 +1842,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
1614
1842
  #endif
1615
1843
  }
1616
1844
 
1617
-
1618
- size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
1619
- size_t dstSize, const void* cSrc,
1620
- size_t cSrcSize, void* workSpace,
1621
- size_t wkspSize)
1622
- {
1623
- /* validation checks */
1624
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1625
- if (cSrcSize == 0) return ERROR(corruption_detected);
1626
-
1627
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1628
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1629
- (void)algoNb;
1630
- assert(algoNb == 0);
1631
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1632
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1633
- (void)algoNb;
1634
- assert(algoNb == 1);
1635
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1636
- #else
1637
- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1638
- cSrcSize, workSpace, wkspSize):
1639
- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
1640
- #endif
1641
- }
1642
- }
1643
-
1644
1845
  size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1645
1846
  const void* cSrc, size_t cSrcSize,
1646
- void* workSpace, size_t wkspSize)
1847
+ void* workSpace, size_t wkspSize, int flags)
1647
1848
  {
1648
1849
  /* validation checks */
1649
1850
  if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1656,71 +1857,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
1656
1857
  (void)algoNb;
1657
1858
  assert(algoNb == 0);
1658
1859
  return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
1659
- cSrcSize, workSpace, wkspSize);
1860
+ cSrcSize, workSpace, wkspSize, flags);
1660
1861
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1661
1862
  (void)algoNb;
1662
1863
  assert(algoNb == 1);
1663
1864
  return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1664
- cSrcSize, workSpace, wkspSize);
1865
+ cSrcSize, workSpace, wkspSize, flags);
1665
1866
  #else
1666
1867
  return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
1667
- cSrcSize, workSpace, wkspSize):
1868
+ cSrcSize, workSpace, wkspSize, flags):
1668
1869
  HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
1669
- cSrcSize, workSpace, wkspSize);
1870
+ cSrcSize, workSpace, wkspSize, flags);
1670
1871
  #endif
1671
1872
  }
1672
1873
  }
1673
1874
 
1674
1875
 
1675
- size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1876
+ size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
1676
1877
  {
1677
1878
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1678
1879
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1679
1880
  (void)dtd;
1680
1881
  assert(dtd.tableType == 0);
1681
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1882
+ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1682
1883
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1683
1884
  (void)dtd;
1684
1885
  assert(dtd.tableType == 1);
1685
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1886
+ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1686
1887
  #else
1687
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1688
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1888
+ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
1889
+ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1689
1890
  #endif
1690
1891
  }
1691
1892
 
1692
1893
  #ifndef HUF_FORCE_DECOMPRESS_X2
1693
- size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1894
+ size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
1694
1895
  {
1695
1896
  const BYTE* ip = (const BYTE*) cSrc;
1696
1897
 
1697
- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1898
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
1698
1899
  if (HUF_isError(hSize)) return hSize;
1699
1900
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1700
1901
  ip += hSize; cSrcSize -= hSize;
1701
1902
 
1702
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
1903
+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
1703
1904
  }
1704
1905
  #endif
1705
1906
 
1706
- size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1907
+ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
1707
1908
  {
1708
1909
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
1709
1910
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1710
1911
  (void)dtd;
1711
1912
  assert(dtd.tableType == 0);
1712
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1913
+ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1713
1914
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1714
1915
  (void)dtd;
1715
1916
  assert(dtd.tableType == 1);
1716
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1917
+ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1717
1918
  #else
1718
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1719
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1919
+ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
1920
+ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
1720
1921
  #endif
1721
1922
  }
1722
1923
 
1723
- size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1924
+ size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
1724
1925
  {
1725
1926
  /* validation checks */
1726
1927
  if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1730,160 +1931,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
1730
1931
  #if defined(HUF_FORCE_DECOMPRESS_X1)
1731
1932
  (void)algoNb;
1732
1933
  assert(algoNb == 0);
1733
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1734
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1735
- (void)algoNb;
1736
- assert(algoNb == 1);
1737
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1738
- #else
1739
- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
1740
- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1741
- #endif
1742
- }
1743
- }
1744
-
1745
- #ifndef ZSTD_NO_UNUSED_FUNCTIONS
1746
- #ifndef HUF_FORCE_DECOMPRESS_X2
1747
- size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize)
1748
- {
1749
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1750
- return HUF_readDTableX1_wksp(DTable, src, srcSize,
1751
- workSpace, sizeof(workSpace));
1752
- }
1753
-
1754
- size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
1755
- const void* cSrc, size_t cSrcSize)
1756
- {
1757
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1758
- return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
1759
- workSpace, sizeof(workSpace));
1760
- }
1761
-
1762
- size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1763
- {
1764
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
1765
- return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
1766
- }
1767
- #endif
1768
-
1769
- #ifndef HUF_FORCE_DECOMPRESS_X1
1770
- size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
1771
- {
1772
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1773
- return HUF_readDTableX2_wksp(DTable, src, srcSize,
1774
- workSpace, sizeof(workSpace));
1775
- }
1776
-
1777
- size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
1778
- const void* cSrc, size_t cSrcSize)
1779
- {
1780
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1781
- return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
1782
- workSpace, sizeof(workSpace));
1783
- }
1784
-
1785
- size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1786
- {
1787
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
1788
- return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1789
- }
1790
- #endif
1791
-
1792
- #ifndef HUF_FORCE_DECOMPRESS_X2
1793
- size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1794
- {
1795
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1796
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1797
- workSpace, sizeof(workSpace));
1798
- }
1799
- size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1800
- {
1801
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
1802
- return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1803
- }
1804
- #endif
1805
-
1806
- #ifndef HUF_FORCE_DECOMPRESS_X1
1807
- size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
1808
- const void* cSrc, size_t cSrcSize)
1809
- {
1810
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1811
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1812
- workSpace, sizeof(workSpace));
1813
- }
1814
-
1815
- size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1816
- {
1817
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
1818
- return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
1819
- }
1820
- #endif
1821
-
1822
- typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
1823
-
1824
- size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1825
- {
1826
- #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
1827
- static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 };
1828
- #endif
1829
-
1830
- /* validation checks */
1831
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1832
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1833
- if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1834
- if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1835
-
1836
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1837
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1838
- (void)algoNb;
1839
- assert(algoNb == 0);
1840
- return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize);
1934
+ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1841
1935
  #elif defined(HUF_FORCE_DECOMPRESS_X2)
1842
1936
  (void)algoNb;
1843
1937
  assert(algoNb == 1);
1844
- return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);
1938
+ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1845
1939
  #else
1846
- return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
1940
+ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
1941
+ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
1847
1942
  #endif
1848
1943
  }
1849
1944
  }
1850
-
1851
- size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1852
- {
1853
- /* validation checks */
1854
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
1855
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
1856
- if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
1857
- if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
1858
-
1859
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1860
- #if defined(HUF_FORCE_DECOMPRESS_X1)
1861
- (void)algoNb;
1862
- assert(algoNb == 0);
1863
- return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1864
- #elif defined(HUF_FORCE_DECOMPRESS_X2)
1865
- (void)algoNb;
1866
- assert(algoNb == 1);
1867
- return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
1868
- #else
1869
- return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
1870
- HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
1871
- #endif
1872
- }
1873
- }
1874
-
1875
- size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
1876
- {
1877
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1878
- return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1879
- workSpace, sizeof(workSpace));
1880
- }
1881
-
1882
- size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
1883
- const void* cSrc, size_t cSrcSize)
1884
- {
1885
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
1886
- return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
1887
- workSpace, sizeof(workSpace));
1888
- }
1889
- #endif