deflate-ruby 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. checksums.yaml +4 -4
  2. data/CLAUDE.md +95 -92
  3. data/LICENSE.txt +6 -6
  4. data/README.md +87 -65
  5. data/Rakefile +23 -0
  6. data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_impl.h → adler32_impl.h} +8 -7
  7. data/ext/deflate_ruby/common_defs.h +748 -0
  8. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.c → cpu_features.c} +46 -16
  9. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.h → cpu_features.h} +2 -1
  10. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_impl.h → crc32_impl.h} +22 -23
  11. data/ext/deflate_ruby/{libdeflate/lib/crc32_multipliers.h → crc32_multipliers.h} +2 -4
  12. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_pclmul_template.h → crc32_pclmul_template.h} +23 -94
  13. data/ext/deflate_ruby/{libdeflate/lib/crc32_tables.h → crc32_tables.h} +1 -1
  14. data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.c → deflate_compress.c} +59 -60
  15. data/ext/deflate_ruby/deflate_ruby.c +392 -218
  16. data/ext/deflate_ruby/deflate_ruby.h +6 -0
  17. data/ext/deflate_ruby/extconf.rb +35 -25
  18. data/ext/deflate_ruby/libdeflate/adler32.c +162 -0
  19. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/adler32_impl.h +14 -7
  20. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/crc32_impl.h +25 -31
  21. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_helpers.h +156 -0
  22. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_wide.h +226 -0
  23. data/ext/deflate_ruby/libdeflate/bt_matchfinder.h +342 -0
  24. data/ext/deflate_ruby/libdeflate/common_defs.h +2 -1
  25. data/ext/deflate_ruby/libdeflate/cpu_features_common.h +93 -0
  26. data/ext/deflate_ruby/libdeflate/crc32.c +262 -0
  27. data/ext/deflate_ruby/libdeflate/crc32_multipliers.h +375 -0
  28. data/ext/deflate_ruby/libdeflate/crc32_tables.h +587 -0
  29. data/ext/deflate_ruby/libdeflate/decompress_template.h +777 -0
  30. data/ext/deflate_ruby/libdeflate/deflate_compress.c +4128 -0
  31. data/ext/deflate_ruby/libdeflate/deflate_compress.h +15 -0
  32. data/ext/deflate_ruby/libdeflate/deflate_constants.h +56 -0
  33. data/ext/deflate_ruby/libdeflate/deflate_decompress.c +1208 -0
  34. data/ext/deflate_ruby/libdeflate/gzip_compress.c +90 -0
  35. data/ext/deflate_ruby/libdeflate/gzip_constants.h +45 -0
  36. data/ext/deflate_ruby/libdeflate/gzip_decompress.c +144 -0
  37. data/ext/deflate_ruby/libdeflate/hc_matchfinder.h +401 -0
  38. data/ext/deflate_ruby/libdeflate/ht_matchfinder.h +234 -0
  39. data/ext/deflate_ruby/libdeflate/lib_common.h +106 -0
  40. data/ext/deflate_ruby/libdeflate/libdeflate.h +2 -2
  41. data/ext/deflate_ruby/libdeflate/{lib/matchfinder_common.h → matchfinder_common.h} +3 -3
  42. data/ext/deflate_ruby/libdeflate/x86/adler32_impl.h +135 -0
  43. data/ext/deflate_ruby/libdeflate/x86/adler32_template.h +518 -0
  44. data/ext/deflate_ruby/libdeflate/x86/cpu_features.c +213 -0
  45. data/ext/deflate_ruby/libdeflate/x86/cpu_features.h +170 -0
  46. data/ext/deflate_ruby/libdeflate/x86/crc32_impl.h +159 -0
  47. data/ext/deflate_ruby/libdeflate/x86/crc32_pclmul_template.h +424 -0
  48. data/ext/deflate_ruby/libdeflate/x86/decompress_impl.h +57 -0
  49. data/ext/deflate_ruby/libdeflate.h +411 -0
  50. data/ext/deflate_ruby/matchfinder_common.h +224 -0
  51. data/ext/deflate_ruby/matchfinder_impl.h +122 -0
  52. data/ext/deflate_ruby/utils.c +141 -0
  53. data/ext/deflate_ruby/zlib_compress.c +82 -0
  54. data/ext/deflate_ruby/zlib_constants.h +21 -0
  55. data/ext/deflate_ruby/zlib_decompress.c +104 -0
  56. data/lib/deflate_ruby/version.rb +1 -1
  57. data/lib/deflate_ruby.rb +1 -63
  58. data/sig/deflate_ruby.rbs +4 -0
  59. data/test/test_deflate_ruby.rb +220 -0
  60. data/test/test_helper.rb +6 -0
  61. metadata +89 -144
  62. data/ext/deflate_ruby/libdeflate/CMakeLists.txt +0 -270
  63. data/ext/deflate_ruby/libdeflate/NEWS.md +0 -494
  64. data/ext/deflate_ruby/libdeflate/README.md +0 -228
  65. data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +0 -3
  66. data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +0 -18
  67. data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +0 -105
  68. data/ext/deflate_ruby/libdeflate/programs/benchmark.c +0 -696
  69. data/ext/deflate_ruby/libdeflate/programs/checksum.c +0 -218
  70. data/ext/deflate_ruby/libdeflate/programs/config.h.in +0 -19
  71. data/ext/deflate_ruby/libdeflate/programs/gzip.c +0 -688
  72. data/ext/deflate_ruby/libdeflate/programs/prog_util.c +0 -521
  73. data/ext/deflate_ruby/libdeflate/programs/prog_util.h +0 -225
  74. data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +0 -200
  75. data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +0 -155
  76. data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +0 -385
  77. data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +0 -130
  78. data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +0 -72
  79. data/ext/deflate_ruby/libdeflate/programs/test_overread.c +0 -95
  80. data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +0 -472
  81. data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +0 -151
  82. data/ext/deflate_ruby/libdeflate/programs/test_util.c +0 -237
  83. data/ext/deflate_ruby/libdeflate/programs/test_util.h +0 -61
  84. data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +0 -118
  85. data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +0 -118
  86. data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +0 -69
  87. data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +0 -10
  88. data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +0 -10
  89. data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +0 -253
  90. data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +0 -17
  91. data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +0 -119
  92. data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +0 -38
  93. data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +0 -37
  94. data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +0 -19
  95. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +0 -199
  96. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +0 -105
  97. data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +0 -44
  98. data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +0 -29
  99. data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +0 -523
  100. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
  101. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +0 -95
  102. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +0 -3
  103. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +0 -62
  104. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +0 -108
  105. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
  106. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +0 -19
  107. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +0 -3
  108. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +0 -19
  109. data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +0 -416
  110. data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +0 -8
  111. data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +0 -8
  112. data/ext/deflate_ruby/{libdeflate/lib/adler32.c → adler32.c} +0 -0
  113. data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_template.h → adler32_template.h} +0 -0
  114. data/ext/deflate_ruby/{libdeflate/lib/bt_matchfinder.h → bt_matchfinder.h} +0 -0
  115. data/ext/deflate_ruby/{libdeflate/lib/cpu_features_common.h → cpu_features_common.h} +0 -0
  116. data/ext/deflate_ruby/{libdeflate/lib/crc32.c → crc32.c} +0 -0
  117. data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_helpers.h → crc32_pmull_helpers.h} +0 -0
  118. data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_wide.h → crc32_pmull_wide.h} +0 -0
  119. data/ext/deflate_ruby/{libdeflate/lib/x86/decompress_impl.h → decompress_impl.h} +0 -0
  120. data/ext/deflate_ruby/{libdeflate/lib/decompress_template.h → decompress_template.h} +0 -0
  121. data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.h → deflate_compress.h} +0 -0
  122. data/ext/deflate_ruby/{libdeflate/lib/deflate_constants.h → deflate_constants.h} +0 -0
  123. data/ext/deflate_ruby/{libdeflate/lib/deflate_decompress.c → deflate_decompress.c} +0 -0
  124. data/ext/deflate_ruby/{libdeflate/lib/gzip_compress.c → gzip_compress.c} +0 -0
  125. data/ext/deflate_ruby/{libdeflate/lib/gzip_constants.h → gzip_constants.h} +0 -0
  126. data/ext/deflate_ruby/{libdeflate/lib/gzip_decompress.c → gzip_decompress.c} +0 -0
  127. data/ext/deflate_ruby/{libdeflate/lib/hc_matchfinder.h → hc_matchfinder.h} +0 -0
  128. data/ext/deflate_ruby/{libdeflate/lib/ht_matchfinder.h → ht_matchfinder.h} +0 -0
  129. data/ext/deflate_ruby/{libdeflate/lib/lib_common.h → lib_common.h} +0 -0
  130. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.c +0 -0
  131. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.h +0 -0
  132. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/matchfinder_impl.h +0 -0
  133. data/ext/deflate_ruby/libdeflate/{lib/riscv → riscv}/matchfinder_impl.h +0 -0
  134. data/ext/deflate_ruby/libdeflate/{lib/utils.c → utils.c} +0 -0
  135. data/ext/deflate_ruby/libdeflate/{lib/x86 → x86}/matchfinder_impl.h +0 -0
  136. data/ext/deflate_ruby/libdeflate/{lib/zlib_compress.c → zlib_compress.c} +0 -0
  137. data/ext/deflate_ruby/libdeflate/{lib/zlib_constants.h → zlib_constants.h} +0 -0
  138. data/ext/deflate_ruby/libdeflate/{lib/zlib_decompress.c → zlib_decompress.c} +0 -0
data/ext/deflate_ruby/libdeflate/x86/crc32_pclmul_template.h (new file)
@@ -0,0 +1,424 @@
+ /*
+  * x86/crc32_pclmul_template.h - gzip CRC-32 with PCLMULQDQ instructions
+  *
+  * Copyright 2016 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+ /*
+  * This file is a "template" for instantiating PCLMULQDQ-based crc32_x86
+  * functions. The "parameters" are:
+  *
+  * SUFFIX:
+  *    Name suffix to append to all instantiated functions.
+  * ATTRIBUTES:
+  *    Target function attributes to use. Must satisfy the dependencies of the
+  *    other parameters as follows:
+  *        VL=16 && USE_AVX512=0: at least pclmul,sse4.1
+  *        VL=32 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
+  *        VL=32 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+  *        VL=64 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+  *    (Other combinations are not useful and have not been tested.)
+  * VL:
+  *    Vector length in bytes. Must be 16, 32, or 64.
+  * USE_AVX512:
+  *    If 1, take advantage of AVX-512 features such as masking and the
+  *    vpternlog instruction. This doesn't enable the use of 512-bit vectors;
+  *    the vector length is controlled by VL. If 0, assume that the CPU might
+  *    not support AVX-512.
+  *
+  * The overall algorithm used is CRC folding with carryless multiplication
+  * instructions. Note that the x86 crc32 instruction cannot be used, as it is
+  * for a different polynomial, not the gzip one. For an explanation of CRC
+  * folding with carryless multiplication instructions, see
+  * scripts/gen-crc32-consts.py and the following blog posts and papers:
+  *
+  * "An alternative exposition of crc32_4k_pclmulqdq"
+  * https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
+  *
+  * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+  * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+  *
+  * The original pclmulqdq instruction does one 64x64 to 128-bit carryless
+  * multiplication. The VPCLMULQDQ feature added instructions that do two
+  * parallel 64x64 to 128-bit carryless multiplications in combination with AVX
+  * or AVX512VL, or four in combination with AVX512F.
+  */
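
For orientation: this header is not compiled on its own. An includer such as x86/crc32_impl.h defines the parameters above and then includes the template, once per target. A minimal sketch of one instantiation follows; the suffix, attribute string, and ADD_SUFFIX definition are illustrative assumptions, not quoted from this diff.

/* Editorial sketch of an instantiation site (names assumed). */
#define ADD_SUFFIX2(name, suffix)  name##suffix
#define ADD_SUFFIX(name)           ADD_SUFFIX2(name, SUFFIX)

#define SUFFIX       _pclmulqdq
#define ATTRIBUTES   _target_attribute("pclmul,sse4.1")
#define VL           16
#define USE_AVX512   0
#include "crc32_pclmul_template.h"
/* This defines ADD_SUFFIX(crc32_x86), i.e. crc32_x86_pclmulqdq(). The
 * template's trailing #undefs clear SUFFIX/ATTRIBUTES/VL/USE_AVX512, so the
 * includer can immediately instantiate again, e.g. with VL=32 for AVX2. */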
+
+ #if VL == 16
+ # define vec_t __m128i
+ # define fold_vec fold_vec128
+ # define VLOADU(p) _mm_loadu_si128((const void *)(p))
+ # define VXOR(a, b) _mm_xor_si128((a), (b))
+ # define M128I_TO_VEC(a) a
+ # define MULTS_8V _mm_set_epi64x(CRC32_X991_MODG, CRC32_X1055_MODG)
+ # define MULTS_4V _mm_set_epi64x(CRC32_X479_MODG, CRC32_X543_MODG)
+ # define MULTS_2V _mm_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG)
+ # define MULTS_1V _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG)
+ #elif VL == 32
+ # define vec_t __m256i
+ # define fold_vec fold_vec256
+ # define VLOADU(p) _mm256_loadu_si256((const void *)(p))
+ # define VXOR(a, b) _mm256_xor_si256((a), (b))
+ # define M128I_TO_VEC(a) _mm256_zextsi128_si256(a)
+ # define MULTS(a, b) _mm256_set_epi64x(a, b, a, b)
+ # define MULTS_8V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
+ # define MULTS_4V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
+ # define MULTS_2V MULTS(CRC32_X479_MODG, CRC32_X543_MODG)
+ # define MULTS_1V MULTS(CRC32_X223_MODG, CRC32_X287_MODG)
+ #elif VL == 64
+ # define vec_t __m512i
+ # define fold_vec fold_vec512
+ # define VLOADU(p) _mm512_loadu_si512((const void *)(p))
+ # define VXOR(a, b) _mm512_xor_si512((a), (b))
+ # define M128I_TO_VEC(a) _mm512_zextsi128_si512(a)
+ # define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b)
+ # define MULTS_8V MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG)
+ # define MULTS_4V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
+ # define MULTS_2V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
+ # define MULTS_1V MULTS(CRC32_X479_MODG, CRC32_X543_MODG)
+ #else
+ # error "unsupported vector length"
+ #endif
+
+ #undef fold_vec128
+ static forceinline ATTRIBUTES __m128i
+ ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i /* __v2du */ mults)
+ {
+     dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, mults, 0x00));
+     dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, mults, 0x11));
+     return dst;
+ }
+ #define fold_vec128 ADD_SUFFIX(fold_vec128)
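
A note on the math (editorial, consistent with this file's constant naming CRC32_Xk_MODG = x^k mod G(x)): writing src = s_hi·x^64 ⊕ s_lo per 64-bit lane, fold_vec128() computes, over GF(2)[x],

$$
\mathrm{fold}(\mathrm{src}, \mathrm{dst})
  = \mathrm{dst} \oplus s_{\mathrm{lo}} \cdot (x^{159} \bmod G)
                 \oplus s_{\mathrm{hi}} \cdot (x^{95} \bmod G)
  \equiv \mathrm{src} \cdot x^{128} \oplus \mathrm{dst} \pmod{G(x)}
$$

so each fold advances src by one 128-bit block. The same pattern holds for every entry in the MULTS_* tables above: folding a distance of d bits pairs x^(d+31) mod G with x^(d-33) mod G (e.g. d=256 gives the 287/223 pair), where the +31/-33 offsets come from the lsb-first, bit-reflected convention; see scripts/gen-crc32-consts.py in upstream libdeflate for the derivation.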
+
+ #if VL >= 32
+ #undef fold_vec256
+ static forceinline ATTRIBUTES __m256i
+ ADD_SUFFIX(fold_vec256)(__m256i src, __m256i dst, __m256i /* __v4du */ mults)
+ {
+ #if USE_AVX512
+     /* vpternlog with immediate 0x96 is a three-argument XOR. */
+     return _mm256_ternarylogic_epi32(
+         _mm256_clmulepi64_epi128(src, mults, 0x00),
+         _mm256_clmulepi64_epi128(src, mults, 0x11),
+         dst,
+         0x96);
+ #else
+     return _mm256_xor_si256(
+         _mm256_xor_si256(dst,
+                          _mm256_clmulepi64_epi128(src, mults, 0x00)),
+         _mm256_clmulepi64_epi128(src, mults, 0x11));
+ #endif
+ }
+ #define fold_vec256 ADD_SUFFIX(fold_vec256)
+ #endif /* VL >= 32 */
+
+ #if VL >= 64
+ #undef fold_vec512
+ static forceinline ATTRIBUTES __m512i
+ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults)
+ {
+     /* vpternlog with immediate 0x96 is a three-argument XOR. */
+     return _mm512_ternarylogic_epi32(
+         _mm512_clmulepi64_epi128(src, mults, 0x00),
+         _mm512_clmulepi64_epi128(src, mults, 0x11),
+         dst,
+         0x96);
+ }
+ #define fold_vec512 ADD_SUFFIX(fold_vec512)
+ #endif /* VL >= 64 */
+
+ /*
+  * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to
+  * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and
+  * the data into vectors x0 and x1 that contain 'len' bytes and 16 bytes,
+  * respectively. Then fold x0 into x1 and return the result.
+  * Assumes that 'p + len - 16' is in-bounds.
+  */
+ #undef fold_lessthan16bytes
+ static forceinline ATTRIBUTES __m128i
+ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
+                                  __m128i /* __v2du */ mults_128b)
+ {
+     __m128i lshift = _mm_loadu_si128((const void *)&shift_tab[len]);
+     __m128i rshift = _mm_loadu_si128((const void *)&shift_tab[len + 16]);
+     __m128i x0, x1;
+
+     /* x0 = x left-shifted by '16 - len' bytes */
+     x0 = _mm_shuffle_epi8(x, lshift);
+
+     /*
+      * x1 = the last '16 - len' bytes from x (i.e. x right-shifted by 'len'
+      * bytes) followed by the remaining data.
+      */
+     x1 = _mm_blendv_epi8(_mm_shuffle_epi8(x, rshift),
+                          _mm_loadu_si128((const void *)(p + len - 16)),
+                          /* msb 0/1 of each byte selects byte from arg1/2 */
+                          rshift);
+
+     return fold_vec128(x0, x1, mults_128b);
+ }
+ #define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes)
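
The shift_tab indexing is the subtle part. Below is a small scalar model (editorial; it assumes the conventional 0x80-padded pshufb table layout for shift_tab, which is defined by the includer and not shown in this diff) demonstrating how one 48-byte table provides both shift masks and, via the high bits of its entries, the blend mask:

#include <stdint.h>
#include <stdio.h>

/* Assumed shift_tab layout: sixteen 0x80 "zero" selectors, the identity
 * selectors 0..15, then sixteen more 0x80s. Reading 16 bytes starting at
 * &shift_tab[len] left-shifts a vector by 16 - len bytes;
 * &shift_tab[len + 16] right-shifts it by len bytes. */
static const uint8_t shift_tab[48] = {
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
};

/* Scalar model of pshufb: a selector with its msb set yields 0, otherwise
 * it picks in[selector & 15]. */
static void pshufb(uint8_t out[16], const uint8_t in[16], const uint8_t sel[16])
{
    for (int i = 0; i < 16; i++)
        out[i] = (sel[i] & 0x80) ? 0 : in[sel[i] & 0x0f];
}

static void dump(const char *name, const uint8_t v[16])
{
    printf("%s:", name);
    for (int i = 0; i < 16; i++)
        printf(" %02x", v[i]);
    printf("\n");
}

int main(void)
{
    uint8_t x[16], x0[16], x1[16];
    size_t len = 5; /* pretend 5 trailing data bytes remain */

    for (int i = 0; i < 16; i++)
        x[i] = (uint8_t)(0xa0 + i); /* recognizable test pattern */

    pshufb(x0, x, &shift_tab[len]);      /* x << (16 - len) bytes */
    pshufb(x1, x, &shift_tab[len + 16]); /* x >> len bytes */
    dump("x0", x0); /* a0..a4 land in the top 5 bytes */
    dump("x1", x1); /* a5..af in the low 11 bytes; in the real code, blendv
                       then overwrites the top 5 bytes (where the rshift
                       selectors have their msb set) with the trailing data */
    return 0;
}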
182
+
183
+ static ATTRIBUTES u32
184
+ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
185
+ {
186
+ /*
187
+ * mults_{N}v are the vectors of multipliers for folding across N vec_t
188
+ * vectors, i.e. N*VL*8 bits. mults_128b are the two multipliers for
189
+ * folding across 128 bits. mults_128b differs from mults_1v when
190
+ * VL != 16. All multipliers are 64-bit, to match what pclmulqdq needs,
191
+ * but since this is for CRC-32 only their low 32 bits are nonzero.
192
+ * For more details, see scripts/gen-crc32-consts.py.
193
+ */
194
+ const vec_t mults_8v = MULTS_8V;
195
+ const vec_t mults_4v = MULTS_4V;
196
+ const vec_t mults_2v = MULTS_2V;
197
+ const vec_t mults_1v = MULTS_1V;
198
+ const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
199
+ const __m128i barrett_reduction_constants =
200
+ _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
201
+ vec_t v0, v1, v2, v3, v4, v5, v6, v7;
202
+ __m128i x0 = _mm_cvtsi32_si128(crc);
203
+ __m128i x1;
204
+
205
+ if (len < 8*VL) {
206
+ if (len < VL) {
207
+ STATIC_ASSERT(VL == 16 || VL == 32 || VL == 64);
208
+ if (len < 16) {
209
+ #if USE_AVX512
210
+ if (len < 4)
211
+ return crc32_slice1(crc, p, len);
212
+ /*
213
+ * Handle 4 <= len <= 15 bytes by doing a masked
214
+ * load, XOR'ing the current CRC with the first
215
+ * 4 bytes, left-shifting by '16 - len' bytes to
216
+ * align the result to the end of x0 (so that it
217
+ * becomes the low-order coefficients of a
218
+ * 128-bit polynomial), and then doing the usual
219
+ * reduction from 128 bits to 32 bits.
220
+ */
221
+ x0 = _mm_xor_si128(
222
+ x0, _mm_maskz_loadu_epi8((1 << len) - 1, p));
223
+ x0 = _mm_shuffle_epi8(
224
+ x0, _mm_loadu_si128((const void *)&shift_tab[len]));
225
+ goto reduce_x0;
226
+ #else
227
+ return crc32_slice1(crc, p, len);
228
+ #endif
229
+ }
230
+ /*
231
+ * Handle 16 <= len < VL bytes where VL is 32 or 64.
232
+ * Use 128-bit instructions so that these lengths aren't
233
+ * slower with VL > 16 than with VL=16.
234
+ */
235
+ x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
236
+ if (len >= 32) {
237
+ x0 = fold_vec128(x0, _mm_loadu_si128((const void *)(p + 16)),
238
+ mults_128b);
239
+ if (len >= 48)
240
+ x0 = fold_vec128(x0, _mm_loadu_si128((const void *)(p + 32)),
241
+ mults_128b);
242
+ }
243
+ p += len & ~15;
244
+ goto less_than_16_remaining;
245
+ }
246
+ v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
247
+ if (len < 2*VL) {
248
+ p += VL;
249
+ goto less_than_vl_remaining;
250
+ }
251
+ v1 = VLOADU(p + 1*VL);
252
+ if (len < 4*VL) {
253
+ p += 2*VL;
254
+ goto less_than_2vl_remaining;
255
+ }
256
+ v2 = VLOADU(p + 2*VL);
257
+ v3 = VLOADU(p + 3*VL);
258
+ p += 4*VL;
259
+ } else {
260
+ /*
261
+ * If the length is large and the pointer is misaligned, align
262
+ * it. For smaller lengths, just take the misaligned load
263
+ * penalty. Note that on recent x86 CPUs, vmovdqu with an
264
+ * aligned address is just as fast as vmovdqa, so there's no
265
+ * need to use vmovdqa in the main loop.
266
+ */
267
+ if (len > 65536 && ((uintptr_t)p & (VL-1))) {
268
+ size_t align = -(uintptr_t)p & (VL-1);
269
+
270
+ len -= align;
271
+ x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
272
+ p += 16;
273
+ if (align & 15) {
274
+ x0 = fold_lessthan16bytes(x0, p, align & 15,
275
+ mults_128b);
276
+ p += align & 15;
277
+ align &= ~15;
278
+ }
279
+ while (align) {
280
+ x0 = fold_vec128(x0, *(const __m128i *)p,
281
+ mults_128b);
282
+ p += 16;
283
+ align -= 16;
284
+ }
285
+ v0 = M128I_TO_VEC(x0);
286
+ # if VL == 32
287
+ v0 = _mm256_inserti128_si256(v0, *(const __m128i *)p, 1);
288
+ # elif VL == 64
289
+ v0 = _mm512_inserti32x4(v0, *(const __m128i *)p, 1);
290
+ v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
291
+ # endif
292
+ p -= 16;
293
+ } else {
294
+ v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
295
+ }
296
+ v1 = VLOADU(p + 1*VL);
297
+ v2 = VLOADU(p + 2*VL);
298
+ v3 = VLOADU(p + 3*VL);
299
+ v4 = VLOADU(p + 4*VL);
300
+ v5 = VLOADU(p + 5*VL);
301
+ v6 = VLOADU(p + 6*VL);
302
+ v7 = VLOADU(p + 7*VL);
303
+ p += 8*VL;
304
+
305
+ /*
306
+ * This is the main loop, processing 8*VL bytes per iteration.
307
+ * 4*VL is usually enough and would result in smaller code, but
308
+ * Skylake and Cascade Lake need 8*VL to get full performance.
309
+ */
310
+ while (len >= 16*VL) {
311
+ v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_8v);
312
+ v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_8v);
313
+ v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_8v);
314
+ v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_8v);
315
+ v4 = fold_vec(v4, VLOADU(p + 4*VL), mults_8v);
316
+ v5 = fold_vec(v5, VLOADU(p + 5*VL), mults_8v);
317
+ v6 = fold_vec(v6, VLOADU(p + 6*VL), mults_8v);
318
+ v7 = fold_vec(v7, VLOADU(p + 7*VL), mults_8v);
319
+ p += 8*VL;
320
+ len -= 8*VL;
321
+ }
322
+
323
+ /* Fewer than 8*VL bytes remain. */
324
+ v0 = fold_vec(v0, v4, mults_4v);
325
+ v1 = fold_vec(v1, v5, mults_4v);
326
+ v2 = fold_vec(v2, v6, mults_4v);
327
+ v3 = fold_vec(v3, v7, mults_4v);
328
+ if (len & (4*VL)) {
329
+ v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_4v);
330
+ v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_4v);
331
+ v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_4v);
332
+ v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_4v);
333
+ p += 4*VL;
334
+ }
335
+ }
336
+ /* Fewer than 4*VL bytes remain. */
337
+ v0 = fold_vec(v0, v2, mults_2v);
338
+ v1 = fold_vec(v1, v3, mults_2v);
339
+ if (len & (2*VL)) {
340
+ v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_2v);
341
+ v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_2v);
342
+ p += 2*VL;
343
+ }
344
+ less_than_2vl_remaining:
345
+ /* Fewer than 2*VL bytes remain. */
346
+ v0 = fold_vec(v0, v1, mults_1v);
347
+ if (len & VL) {
348
+ v0 = fold_vec(v0, VLOADU(p), mults_1v);
349
+ p += VL;
350
+ }
351
+ less_than_vl_remaining:
352
+ /*
353
+ * Fewer than VL bytes remain. Reduce v0 (length VL bytes) to x0
354
+ * (length 16 bytes) and fold in any 16-byte data segments that remain.
355
+ */
356
+ #if VL == 16
357
+ x0 = v0;
358
+ #else
359
+ {
360
+ #if VL == 32
361
+ __m256i y0 = v0;
362
+ #else
363
+ const __m256i mults_256b =
364
+ _mm256_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG,
365
+ CRC32_X223_MODG, CRC32_X287_MODG);
366
+ __m256i y0 = fold_vec256(_mm512_extracti64x4_epi64(v0, 0),
367
+ _mm512_extracti64x4_epi64(v0, 1),
368
+ mults_256b);
369
+ if (len & 32) {
370
+ y0 = fold_vec256(y0, _mm256_loadu_si256((const void *)p),
371
+ mults_256b);
372
+ p += 32;
373
+ }
374
+ #endif
375
+ x0 = fold_vec128(_mm256_extracti128_si256(y0, 0),
376
+ _mm256_extracti128_si256(y0, 1), mults_128b);
377
+ }
378
+ if (len & 16) {
379
+ x0 = fold_vec128(x0, _mm_loadu_si128((const void *)p),
380
+ mults_128b);
381
+ p += 16;
382
+ }
383
+ #endif
384
+ less_than_16_remaining:
385
+ len &= 15;
386
+
387
+ /* Handle any remainder of 1 to 15 bytes. */
388
+ if (len)
389
+ x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
390
+ #if USE_AVX512
391
+ reduce_x0:
392
+ #endif
393
+ /*
394
+ * Multiply the remaining 128-bit message polynomial 'x0' by x^32, then
395
+ * reduce it modulo the generator polynomial G. This gives the CRC.
396
+ *
397
+ * This implementation matches that used in crc-pclmul-template.S from
398
+ * https://lore.kernel.org/r/20250210174540.161705-4-ebiggers@kernel.org/
399
+ * with the parameters n=32 and LSB_CRC=1 (what the gzip CRC uses). See
400
+ * there for a detailed explanation of the math used here.
401
+ */
402
+ x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, mults_128b, 0x10),
403
+ _mm_bsrli_si128(x0, 8));
404
+ x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
405
+ x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);
406
+ x0 = _mm_xor_si128(x0, x1);
407
+ return _mm_extract_epi32(x0, 2);
408
+ }
409
+
410
+ #undef vec_t
411
+ #undef fold_vec
412
+ #undef VLOADU
413
+ #undef VXOR
414
+ #undef M128I_TO_VEC
415
+ #undef MULTS
416
+ #undef MULTS_8V
417
+ #undef MULTS_4V
418
+ #undef MULTS_2V
419
+ #undef MULTS_1V
420
+
421
+ #undef SUFFIX
422
+ #undef ATTRIBUTES
423
+ #undef VL
424
+ #undef USE_AVX512
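
None of this machinery is called directly; it sits behind libdeflate's public checksum entry point. A minimal editorial example against the documented libdeflate_crc32() API (an initial value of 0 starts a fresh gzip CRC-32):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "libdeflate.h"

int main(void)
{
    const char *msg = "hello, world";
    /* Pass a previous return value back in as the first argument to
     * checksum a stream incrementally. */
    uint32_t crc = libdeflate_crc32(0, msg, strlen(msg));

    printf("crc32 = 0x%08x\n", (unsigned)crc);
    return 0;
}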
data/ext/deflate_ruby/libdeflate/x86/decompress_impl.h (new file)
@@ -0,0 +1,57 @@
+ #ifndef LIB_X86_DECOMPRESS_IMPL_H
+ #define LIB_X86_DECOMPRESS_IMPL_H
+
+ #include "cpu_features.h"
+
+ /*
+  * BMI2 optimized decompression function.
+  *
+  * With gcc and clang we just compile the whole function with
+  * __attribute__((target("bmi2"))), and the compiler uses bmi2 automatically.
+  *
+  * With MSVC, there is no target function attribute, but it's still possible to
+  * use bmi2 intrinsics explicitly. Currently we mostly don't, but there's a
+  * case in which we do (see below), so we at least take advantage of that.
+  * However, MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*()
+  * intrinsics. It seems to be fixed in VS2022. Hence, use MSVC_PREREQ(1930).
+  */
+ #if defined(__GNUC__) || defined(__clang__) || MSVC_PREREQ(1930)
+ # define deflate_decompress_bmi2 deflate_decompress_bmi2
+ # define FUNCNAME deflate_decompress_bmi2
+ # define ATTRIBUTES _target_attribute("bmi2")
+ /*
+  * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the
+  * bzhi instruction for 'word & BITMASK(count)'. So use the bzhi intrinsic
+  * explicitly. EXTRACT_VARBITS() is equivalent to 'word & BITMASK(count)';
+  * EXTRACT_VARBITS8() is equivalent to 'word & BITMASK((u8)count)'.
+  * Nevertheless, their implementation using the bzhi intrinsic is identical,
+  * as the bzhi instruction truncates the count to 8 bits implicitly.
+  */
+ # ifndef __clang__
+ #  ifdef ARCH_X86_64
+ #   define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count))
+ #   define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count))
+ #  else
+ #   define EXTRACT_VARBITS(word, count) _bzhi_u32((word), (count))
+ #   define EXTRACT_VARBITS8(word, count) _bzhi_u32((word), (count))
+ #  endif
+ # endif
+ # include "../decompress_template.h"
+ #endif
+
+ #if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE
+ #define DEFAULT_IMPL deflate_decompress_bmi2
+ #else
+ static inline decompress_func_t
+ arch_select_decompress_func(void)
+ {
+ #ifdef deflate_decompress_bmi2
+     if (HAVE_BMI2(get_x86_cpu_features()))
+         return deflate_decompress_bmi2;
+ #endif
+     return NULL;
+ }
+ #define arch_select_decompress_func arch_select_decompress_func
+ #endif
+
+ #endif /* LIB_X86_DECOMPRESS_IMPL_H */
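
For context: when HAVE_BMI2_NATIVE holds, DEFAULT_IMPL resolves the choice at compile time; otherwise libdeflate's generic code calls arch_select_decompress_func() once at runtime and caches the result. A simplified, self-contained model of that "select once, then cache" pattern (names and the reduced signature are assumptions; the real wiring lives in libdeflate's deflate_decompress.c):

#include <stddef.h>
#include <stdio.h>

typedef int (*decompress_func_t)(const void *in, size_t in_nbytes);

/* Stand-in for the portable implementation. */
static int decompress_default(const void *in, size_t in_nbytes)
{
    (void)in;
    (void)in_nbytes;
    return 0;
}

/* Stand-in for the header above: return a better impl, or NULL if the
 * CPU lacks the needed features. */
static decompress_func_t arch_select_decompress_func(void)
{
    return NULL;
}

static decompress_func_t chosen_impl; /* resolved on first use */

static int decompress(const void *in, size_t in_nbytes)
{
    decompress_func_t f = chosen_impl;

    if (f == NULL) {
        f = arch_select_decompress_func();
        if (f == NULL)
            f = decompress_default;
        chosen_impl = f; /* benign race: every writer stores the same value */
    }
    return f(in, in_nbytes);
}

int main(void)
{
    char buf[4] = {0};

    printf("result = %d\n", decompress(buf, sizeof(buf)));
    return 0;
}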