deflate-ruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +138 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +117 -0
  5. data/ext/deflate_ruby/deflate_ruby.c +301 -0
  6. data/ext/deflate_ruby/extconf.rb +34 -0
  7. data/ext/deflate_ruby/libdeflate/CMakeLists.txt +270 -0
  8. data/ext/deflate_ruby/libdeflate/COPYING +22 -0
  9. data/ext/deflate_ruby/libdeflate/NEWS.md +494 -0
  10. data/ext/deflate_ruby/libdeflate/README.md +228 -0
  11. data/ext/deflate_ruby/libdeflate/common_defs.h +747 -0
  12. data/ext/deflate_ruby/libdeflate/lib/adler32.c +162 -0
  13. data/ext/deflate_ruby/libdeflate/lib/arm/adler32_impl.h +358 -0
  14. data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.c +230 -0
  15. data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.h +214 -0
  16. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_impl.h +600 -0
  17. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_helpers.h +156 -0
  18. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_wide.h +226 -0
  19. data/ext/deflate_ruby/libdeflate/lib/arm/matchfinder_impl.h +78 -0
  20. data/ext/deflate_ruby/libdeflate/lib/bt_matchfinder.h +342 -0
  21. data/ext/deflate_ruby/libdeflate/lib/cpu_features_common.h +93 -0
  22. data/ext/deflate_ruby/libdeflate/lib/crc32.c +262 -0
  23. data/ext/deflate_ruby/libdeflate/lib/crc32_multipliers.h +377 -0
  24. data/ext/deflate_ruby/libdeflate/lib/crc32_tables.h +587 -0
  25. data/ext/deflate_ruby/libdeflate/lib/decompress_template.h +777 -0
  26. data/ext/deflate_ruby/libdeflate/lib/deflate_compress.c +4129 -0
  27. data/ext/deflate_ruby/libdeflate/lib/deflate_compress.h +15 -0
  28. data/ext/deflate_ruby/libdeflate/lib/deflate_constants.h +56 -0
  29. data/ext/deflate_ruby/libdeflate/lib/deflate_decompress.c +1208 -0
  30. data/ext/deflate_ruby/libdeflate/lib/gzip_compress.c +90 -0
  31. data/ext/deflate_ruby/libdeflate/lib/gzip_constants.h +45 -0
  32. data/ext/deflate_ruby/libdeflate/lib/gzip_decompress.c +144 -0
  33. data/ext/deflate_ruby/libdeflate/lib/hc_matchfinder.h +401 -0
  34. data/ext/deflate_ruby/libdeflate/lib/ht_matchfinder.h +234 -0
  35. data/ext/deflate_ruby/libdeflate/lib/lib_common.h +106 -0
  36. data/ext/deflate_ruby/libdeflate/lib/matchfinder_common.h +224 -0
  37. data/ext/deflate_ruby/libdeflate/lib/riscv/matchfinder_impl.h +97 -0
  38. data/ext/deflate_ruby/libdeflate/lib/utils.c +141 -0
  39. data/ext/deflate_ruby/libdeflate/lib/x86/adler32_impl.h +134 -0
  40. data/ext/deflate_ruby/libdeflate/lib/x86/adler32_template.h +518 -0
  41. data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.c +183 -0
  42. data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.h +169 -0
  43. data/ext/deflate_ruby/libdeflate/lib/x86/crc32_impl.h +160 -0
  44. data/ext/deflate_ruby/libdeflate/lib/x86/crc32_pclmul_template.h +495 -0
  45. data/ext/deflate_ruby/libdeflate/lib/x86/decompress_impl.h +57 -0
  46. data/ext/deflate_ruby/libdeflate/lib/x86/matchfinder_impl.h +122 -0
  47. data/ext/deflate_ruby/libdeflate/lib/zlib_compress.c +82 -0
  48. data/ext/deflate_ruby/libdeflate/lib/zlib_constants.h +21 -0
  49. data/ext/deflate_ruby/libdeflate/lib/zlib_decompress.c +104 -0
  50. data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +3 -0
  51. data/ext/deflate_ruby/libdeflate/libdeflate.h +411 -0
  52. data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +18 -0
  53. data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +105 -0
  54. data/ext/deflate_ruby/libdeflate/programs/benchmark.c +696 -0
  55. data/ext/deflate_ruby/libdeflate/programs/checksum.c +218 -0
  56. data/ext/deflate_ruby/libdeflate/programs/config.h.in +19 -0
  57. data/ext/deflate_ruby/libdeflate/programs/gzip.c +688 -0
  58. data/ext/deflate_ruby/libdeflate/programs/prog_util.c +521 -0
  59. data/ext/deflate_ruby/libdeflate/programs/prog_util.h +225 -0
  60. data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +200 -0
  61. data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +155 -0
  62. data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +385 -0
  63. data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +130 -0
  64. data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +72 -0
  65. data/ext/deflate_ruby/libdeflate/programs/test_overread.c +95 -0
  66. data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +472 -0
  67. data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +151 -0
  68. data/ext/deflate_ruby/libdeflate/programs/test_util.c +237 -0
  69. data/ext/deflate_ruby/libdeflate/programs/test_util.h +61 -0
  70. data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +118 -0
  71. data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +118 -0
  72. data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +69 -0
  73. data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +10 -0
  74. data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +10 -0
  75. data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +253 -0
  76. data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +17 -0
  77. data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +119 -0
  78. data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +38 -0
  79. data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +37 -0
  80. data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +19 -0
  81. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +199 -0
  82. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +105 -0
  83. data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +44 -0
  84. data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +29 -0
  85. data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +523 -0
  86. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
  87. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +95 -0
  88. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +3 -0
  89. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +62 -0
  90. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +108 -0
  91. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
  92. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +19 -0
  93. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +3 -0
  94. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +19 -0
  95. data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +416 -0
  96. data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +8 -0
  97. data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +8 -0
  98. data/lib/deflate_ruby/version.rb +5 -0
  99. data/lib/deflate_ruby.rb +71 -0
  100. metadata +191 -0
data/ext/deflate_ruby/libdeflate/lib/x86/crc32_pclmul_template.h
@@ -0,0 +1,495 @@
+ /*
+ * x86/crc32_pclmul_template.h - gzip CRC-32 with PCLMULQDQ instructions
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ /*
+ * This file is a "template" for instantiating PCLMULQDQ-based crc32_x86
+ * functions. The "parameters" are:
+ *
+ * SUFFIX:
+ * Name suffix to append to all instantiated functions.
+ * ATTRIBUTES:
+ * Target function attributes to use. Must satisfy the dependencies of the
+ * other parameters as follows:
+ * VL=16 && USE_SSE4_1=0 && USE_AVX512=0: at least pclmul
+ * VL=16 && USE_SSE4_1=1 && USE_AVX512=0: at least pclmul,sse4.1
+ * VL=32 && USE_SSE4_1=1 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
+ * VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ * VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ * (Other combinations are not useful and have not been tested.)
+ * VL:
+ * Vector length in bytes. Must be 16, 32, or 64.
+ * USE_SSE4_1:
+ * If 1, take advantage of SSE4.1 instructions such as pblendvb.
+ * If 0, assume that the CPU might not support SSE4.1.
+ * USE_AVX512:
+ * If 1, take advantage of AVX-512 features such as masking and the
+ * vpternlog instruction. This doesn't enable the use of 512-bit vectors;
+ * the vector length is controlled by VL. If 0, assume that the CPU might
+ * not support AVX-512.
+ *
+ * The overall algorithm used is CRC folding with carryless multiplication
+ * instructions. Note that the x86 crc32 instruction cannot be used, as it is
+ * for a different polynomial, not the gzip one. For an explanation of CRC
+ * folding with carryless multiplication instructions, see
+ * scripts/gen_crc32_multipliers.c and the following paper:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+ *
+ * The original pclmulqdq instruction does one 64x64 to 128-bit carryless
+ * multiplication. The VPCLMULQDQ feature added instructions that do two
+ * parallel 64x64 to 128-bit carryless multiplications in combination with AVX
+ * or AVX512VL, or four in combination with AVX512F.
+ */
+
+ #if VL == 16
+ # define vec_t __m128i
+ # define fold_vec fold_vec128
+ # define VLOADU(p) _mm_loadu_si128((const void *)(p))
+ # define VXOR(a, b) _mm_xor_si128((a), (b))
+ # define M128I_TO_VEC(a) a
+ # define MULTS_8V _mm_set_epi64x(CRC32_X991_MODG, CRC32_X1055_MODG)
+ # define MULTS_4V _mm_set_epi64x(CRC32_X479_MODG, CRC32_X543_MODG)
+ # define MULTS_2V _mm_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG)
+ # define MULTS_1V _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG)
+ #elif VL == 32
+ # define vec_t __m256i
+ # define fold_vec fold_vec256
+ # define VLOADU(p) _mm256_loadu_si256((const void *)(p))
+ # define VXOR(a, b) _mm256_xor_si256((a), (b))
+ # define M128I_TO_VEC(a) _mm256_castsi128_si256(a)
+ # define MULTS(a, b) _mm256_set_epi64x(a, b, a, b)
+ # define MULTS_8V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
+ # define MULTS_4V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
+ # define MULTS_2V MULTS(CRC32_X479_MODG, CRC32_X543_MODG)
+ # define MULTS_1V MULTS(CRC32_X223_MODG, CRC32_X287_MODG)
+ #elif VL == 64
+ # define vec_t __m512i
+ # define fold_vec fold_vec512
+ # define VLOADU(p) _mm512_loadu_si512((const void *)(p))
+ # define VXOR(a, b) _mm512_xor_si512((a), (b))
+ # define M128I_TO_VEC(a) _mm512_castsi128_si512(a)
+ # define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b)
+ # define MULTS_8V MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG)
+ # define MULTS_4V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
+ # define MULTS_2V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
+ # define MULTS_1V MULTS(CRC32_X479_MODG, CRC32_X543_MODG)
+ #else
+ # error "unsupported vector length"
+ #endif
+
+ #undef fold_vec128
+ static forceinline ATTRIBUTES __m128i
+ ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i /* __v2du */ mults)
+ {
+ dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, mults, 0x00));
+ dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, mults, 0x11));
+ return dst;
+ }
+ #define fold_vec128 ADD_SUFFIX(fold_vec128)
+
+ #if VL >= 32
+ #undef fold_vec256
+ static forceinline ATTRIBUTES __m256i
+ ADD_SUFFIX(fold_vec256)(__m256i src, __m256i dst, __m256i /* __v4du */ mults)
+ {
+ #if USE_AVX512
+ /* vpternlog with immediate 0x96 is a three-argument XOR. */
+ return _mm256_ternarylogic_epi32(
+ _mm256_clmulepi64_epi128(src, mults, 0x00),
+ _mm256_clmulepi64_epi128(src, mults, 0x11),
+ dst,
+ 0x96);
+ #else
+ return _mm256_xor_si256(
+ _mm256_xor_si256(dst,
+ _mm256_clmulepi64_epi128(src, mults, 0x00)),
+ _mm256_clmulepi64_epi128(src, mults, 0x11));
+ #endif
+ }
+ #define fold_vec256 ADD_SUFFIX(fold_vec256)
+ #endif /* VL >= 32 */
+
+ #if VL >= 64
+ #undef fold_vec512
+ static forceinline ATTRIBUTES __m512i
+ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults)
+ {
+ /* vpternlog with immediate 0x96 is a three-argument XOR. */
+ return _mm512_ternarylogic_epi32(
+ _mm512_clmulepi64_epi128(src, mults, 0x00),
+ _mm512_clmulepi64_epi128(src, mults, 0x11),
+ dst,
+ 0x96);
+ }
+ #define fold_vec512 ADD_SUFFIX(fold_vec512)
+ #endif /* VL >= 64 */
+
+ #if USE_SSE4_1
+ /*
+ * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to
+ * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and
+ * the data into vectors x0 and x1 that contain 'len' bytes and 16 bytes,
+ * respectively. Then fold x0 into x1 and return the result.
+ * Assumes that 'p + len - 16' is in-bounds.
+ */
+ #undef fold_lessthan16bytes
+ static forceinline ATTRIBUTES __m128i
+ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
+ __m128i /* __v2du */ mults_128b)
+ {
+ __m128i lshift = _mm_loadu_si128((const void *)&shift_tab[len]);
+ __m128i rshift = _mm_loadu_si128((const void *)&shift_tab[len + 16]);
+ __m128i x0, x1;
+
+ /* x0 = x left-shifted by '16 - len' bytes */
+ x0 = _mm_shuffle_epi8(x, lshift);
+
+ /*
+ * x1 = the last '16 - len' bytes from x (i.e. x right-shifted by 'len'
+ * bytes) followed by the remaining data.
+ */
+ x1 = _mm_blendv_epi8(_mm_shuffle_epi8(x, rshift),
+ _mm_loadu_si128((const void *)(p + len - 16)),
+ /* msb 0/1 of each byte selects byte from arg1/2 */
+ rshift);
+
+ return fold_vec128(x0, x1, mults_128b);
+ }
+ #define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes)
+ #endif /* USE_SSE4_1 */
+
+ static ATTRIBUTES u32
+ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
+ {
+ /*
+ * mults_{N}v are the vectors of multipliers for folding across N vec_t
+ * vectors, i.e. N*VL*8 bits. mults_128b are the two multipliers for
+ * folding across 128 bits. mults_128b differs from mults_1v when
+ * VL != 16. All multipliers are 64-bit, to match what pclmulqdq needs,
+ * but since this is for CRC-32 only their low 32 bits are nonzero.
+ * For more details, see scripts/gen_crc32_multipliers.c.
+ */
+ const vec_t mults_8v = MULTS_8V;
+ const vec_t mults_4v = MULTS_4V;
+ const vec_t mults_2v = MULTS_2V;
+ const vec_t mults_1v = MULTS_1V;
+ const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
+ const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG);
+ const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
+ const __m128i barrett_reduction_constants =
+ _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
+ vec_t v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x0 = _mm_cvtsi32_si128(crc);
+ __m128i x1;
+
+ if (len < 8*VL) {
+ if (len < VL) {
+ STATIC_ASSERT(VL == 16 || VL == 32 || VL == 64);
+ if (len < 16) {
+ #if USE_AVX512
+ if (len < 4)
+ return crc32_slice1(crc, p, len);
+ /*
+ * Handle 4 <= len <= 15 bytes by doing a masked
+ * load, XOR'ing the current CRC with the first
+ * 4 bytes, left-shifting by '16 - len' bytes to
+ * align the result to the end of x0 (so that it
+ * becomes the low-order coefficients of a
+ * 128-bit polynomial), and then doing the usual
+ * reduction from 128 bits to 32 bits.
+ */
+ x0 = _mm_xor_si128(
+ x0, _mm_maskz_loadu_epi8((1 << len) - 1, p));
+ x0 = _mm_shuffle_epi8(
+ x0, _mm_loadu_si128((const void *)&shift_tab[len]));
+ goto reduce_x0;
+ #else
+ return crc32_slice1(crc, p, len);
+ #endif
+ }
+ /*
+ * Handle 16 <= len < VL bytes where VL is 32 or 64.
+ * Use 128-bit instructions so that these lengths aren't
+ * slower with VL > 16 than with VL=16.
+ */
+ x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
+ if (len >= 32) {
+ x0 = fold_vec128(x0, _mm_loadu_si128((const void *)(p + 16)),
+ mults_128b);
+ if (len >= 48)
+ x0 = fold_vec128(x0, _mm_loadu_si128((const void *)(p + 32)),
+ mults_128b);
+ }
+ p += len & ~15;
+ goto less_than_16_remaining;
+ }
+ v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
+ if (len < 2*VL) {
+ p += VL;
+ goto less_than_vl_remaining;
+ }
+ v1 = VLOADU(p + 1*VL);
+ if (len < 4*VL) {
+ p += 2*VL;
+ goto less_than_2vl_remaining;
+ }
+ v2 = VLOADU(p + 2*VL);
+ v3 = VLOADU(p + 3*VL);
+ p += 4*VL;
+ } else {
+ /*
+ * If the length is large and the pointer is misaligned, align
+ * it. For smaller lengths, just take the misaligned load
+ * penalty. Note that on recent x86 CPUs, vmovdqu with an
+ * aligned address is just as fast as vmovdqa, so there's no
+ * need to use vmovdqa in the main loop.
+ */
+ if (len > 65536 && ((uintptr_t)p & (VL-1))) {
+ size_t align = -(uintptr_t)p & (VL-1);
+
+ len -= align;
+ #if USE_SSE4_1
+ x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
+ p += 16;
+ if (align & 15) {
+ x0 = fold_lessthan16bytes(x0, p, align & 15,
+ mults_128b);
+ p += align & 15;
+ align &= ~15;
+ }
+ while (align) {
+ x0 = fold_vec128(x0, *(const __m128i *)p,
+ mults_128b);
+ p += 16;
+ align -= 16;
+ }
+ v0 = M128I_TO_VEC(x0);
+ # if VL == 32
+ v0 = _mm256_inserti128_si256(v0, *(const __m128i *)p, 1);
+ # elif VL == 64
+ v0 = _mm512_inserti32x4(v0, *(const __m128i *)p, 1);
+ v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
+ # endif
+ p -= 16;
+ #else
+ crc = crc32_slice1(crc, p, align);
+ p += align;
+ v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
+ #endif
+ } else {
+ v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
+ }
+ v1 = VLOADU(p + 1*VL);
+ v2 = VLOADU(p + 2*VL);
+ v3 = VLOADU(p + 3*VL);
+ v4 = VLOADU(p + 4*VL);
+ v5 = VLOADU(p + 5*VL);
+ v6 = VLOADU(p + 6*VL);
+ v7 = VLOADU(p + 7*VL);
+ p += 8*VL;
+
+ /*
+ * This is the main loop, processing 8*VL bytes per iteration.
+ * 4*VL is usually enough and would result in smaller code, but
+ * Skylake and Cascade Lake need 8*VL to get full performance.
+ */
+ while (len >= 16*VL) {
+ v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_8v);
+ v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_8v);
+ v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_8v);
+ v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_8v);
+ v4 = fold_vec(v4, VLOADU(p + 4*VL), mults_8v);
+ v5 = fold_vec(v5, VLOADU(p + 5*VL), mults_8v);
+ v6 = fold_vec(v6, VLOADU(p + 6*VL), mults_8v);
+ v7 = fold_vec(v7, VLOADU(p + 7*VL), mults_8v);
+ p += 8*VL;
+ len -= 8*VL;
+ }
+
+ /* Fewer than 8*VL bytes remain. */
+ v0 = fold_vec(v0, v4, mults_4v);
+ v1 = fold_vec(v1, v5, mults_4v);
+ v2 = fold_vec(v2, v6, mults_4v);
+ v3 = fold_vec(v3, v7, mults_4v);
+ if (len & (4*VL)) {
+ v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_4v);
+ v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_4v);
+ v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_4v);
+ v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_4v);
+ p += 4*VL;
+ }
+ }
+ /* Fewer than 4*VL bytes remain. */
+ v0 = fold_vec(v0, v2, mults_2v);
+ v1 = fold_vec(v1, v3, mults_2v);
+ if (len & (2*VL)) {
+ v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_2v);
+ v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_2v);
+ p += 2*VL;
+ }
+ less_than_2vl_remaining:
+ /* Fewer than 2*VL bytes remain. */
+ v0 = fold_vec(v0, v1, mults_1v);
+ if (len & VL) {
+ v0 = fold_vec(v0, VLOADU(p), mults_1v);
+ p += VL;
+ }
+ less_than_vl_remaining:
+ /*
+ * Fewer than VL bytes remain. Reduce v0 (length VL bytes) to x0
+ * (length 16 bytes) and fold in any 16-byte data segments that remain.
+ */
+ #if VL == 16
+ x0 = v0;
+ #else
+ {
+ #if VL == 32
+ __m256i y0 = v0;
+ #else
+ const __m256i mults_256b =
+ _mm256_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG,
+ CRC32_X223_MODG, CRC32_X287_MODG);
+ __m256i y0 = fold_vec256(_mm512_extracti64x4_epi64(v0, 0),
+ _mm512_extracti64x4_epi64(v0, 1),
+ mults_256b);
+ if (len & 32) {
+ y0 = fold_vec256(y0, _mm256_loadu_si256((const void *)p),
+ mults_256b);
+ p += 32;
+ }
+ #endif
+ x0 = fold_vec128(_mm256_extracti128_si256(y0, 0),
+ _mm256_extracti128_si256(y0, 1), mults_128b);
+ }
+ if (len & 16) {
+ x0 = fold_vec128(x0, _mm_loadu_si128((const void *)p),
+ mults_128b);
+ p += 16;
+ }
+ #endif
+ less_than_16_remaining:
+ len &= 15;
+
+ /*
+ * If fold_lessthan16bytes() is available, handle any remainder
+ * of 1 to 15 bytes now, before reducing to 32 bits.
+ */
+ #if USE_SSE4_1
+ if (len)
+ x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
+ #endif
+ #if USE_AVX512
+ reduce_x0:
+ #endif
+
+ /*
+ * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
+ * which is equivalent to multiplying by x^32. This is needed because
+ * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
+ */
+ x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
+ _mm_clmulepi64_si128(x0, mults_128b, 0x10));
+
+ /* Fold 96 => 64 bits. */
+ x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
+ _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
+ final_mult, 0x00));
+
+ /*
+ * Reduce 64 => 32 bits using Barrett reduction.
+ *
+ * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
+ * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
+ *
+ * R(x) = (A(x)*x^32 + B(x)) mod G(x)
+ * = (A(x)*x^32) mod G(x) + B(x)
+ *
+ * Then, by the Division Algorithm there exists a unique q(x) such that:
+ *
+ * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
+ *
+ * Since the left-hand side is of maximum degree 31, the right-hand side
+ * must be too. This implies that we can apply 'mod x^32' to the
+ * right-hand side without changing its value:
+ *
+ * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
+ *
+ * Note that '+' is equivalent to '-' in polynomials over GF(2).
+ *
+ * We also know that:
+ *
+ *              / A(x)*x^32 \
+ * q(x) = floor ( --------- )
+ *              \   G(x)    /
+ *
+ * To compute this efficiently, we can multiply the top and bottom by
+ * x^32 and move the division by G(x) to the top:
+ *
+ *              / A(x) * floor(x^64 / G(x)) \
+ * q(x) = floor ( ------------------------- )
+ *              \           x^32            /
+ *
+ * Note that floor(x^64 / G(x)) is a constant.
+ *
+ * So finally we have:
+ *
+ *                          / A(x) * floor(x^64 / G(x)) \
+ * R(x) = B(x) + G(x)*floor ( ------------------------- )
+ *                          \           x^32            /
+ */
+ x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
+ barrett_reduction_constants, 0x00);
+ x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
+ barrett_reduction_constants, 0x10);
+ x0 = _mm_xor_si128(x0, x1);
+ #if USE_SSE4_1
+ crc = _mm_extract_epi32(x0, 1);
+ #else
+ crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01));
+ /* Process up to 15 bytes left over at the end. */
+ crc = crc32_slice1(crc, p, len);
+ #endif
+ return crc;
+ }
+
+ #undef vec_t
+ #undef fold_vec
+ #undef VLOADU
+ #undef VXOR
+ #undef M128I_TO_VEC
+ #undef MULTS
+ #undef MULTS_8V
+ #undef MULTS_4V
+ #undef MULTS_2V
+ #undef MULTS_1V
+
+ #undef SUFFIX
+ #undef ATTRIBUTES
+ #undef VL
+ #undef USE_SSE4_1
+ #undef USE_AVX512
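
Aside (editorial note, not part of the packaged file): the template above relies on the gzip CRC-32 being defined as M(x)*x^32 mod G(x) over GF(2), computed with the reflected polynomial. A minimal bit-at-a-time reference sketch is given below; an instantiated crc32_x86() (or the public libdeflate_crc32()) should agree with it on arbitrary buffers. The file name and main() harness are illustrative assumptions, not code from the gem.

/* reference_crc32.c - standalone bit-at-a-time reference for the gzip CRC-32.
 * Illustrative sketch only; uses the reflected polynomial 0xEDB88320 and the
 * standard check value for "123456789".
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32_reference(uint32_t crc, const uint8_t *p, size_t len)
{
	crc = ~crc;                     /* standard pre-inversion */
	while (len--) {
		crc ^= *p++;
		for (int bit = 0; bit < 8; bit++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320 : 0);
	}
	return ~crc;                    /* standard post-inversion */
}

int main(void)
{
	const char *msg = "123456789";
	uint32_t crc = crc32_reference(0, (const uint8_t *)msg, strlen(msg));

	/* The well-known CRC-32 check value for "123456789" is 0xCBF43926. */
	printf("crc32(\"123456789\") = 0x%08lX (%s)\n", (unsigned long)crc,
	       crc == 0xCBF43926 ? "ok" : "MISMATCH");
	return 0;
}
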
data/ext/deflate_ruby/libdeflate/lib/x86/decompress_impl.h
@@ -0,0 +1,57 @@
+ #ifndef LIB_X86_DECOMPRESS_IMPL_H
+ #define LIB_X86_DECOMPRESS_IMPL_H
+
+ #include "cpu_features.h"
+
+ /*
+ * BMI2 optimized decompression function.
+ *
+ * With gcc and clang we just compile the whole function with
+ * __attribute__((target("bmi2"))), and the compiler uses bmi2 automatically.
+ *
+ * With MSVC, there is no target function attribute, but it's still possible to
+ * use bmi2 intrinsics explicitly. Currently we mostly don't, but there's a
+ * case in which we do (see below), so we at least take advantage of that.
+ * However, MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*()
+ * intrinsics. It seems to be fixed in VS2022. Hence, use MSVC_PREREQ(1930).
+ */
+ #if defined(__GNUC__) || defined(__clang__) || MSVC_PREREQ(1930)
+ # define deflate_decompress_bmi2 deflate_decompress_bmi2
+ # define FUNCNAME deflate_decompress_bmi2
+ # define ATTRIBUTES _target_attribute("bmi2")
+ /*
+ * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the
+ * bzhi instruction for 'word & BITMASK(count)'. So use the bzhi intrinsic
+ * explicitly. EXTRACT_VARBITS() is equivalent to 'word & BITMASK(count)';
+ * EXTRACT_VARBITS8() is equivalent to 'word & BITMASK((u8)count)'.
+ * Nevertheless, their implementation using the bzhi intrinsic is identical,
+ * as the bzhi instruction truncates the count to 8 bits implicitly.
+ */
+ # ifndef __clang__
+ # ifdef ARCH_X86_64
+ # define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count))
+ # define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count))
+ # else
+ # define EXTRACT_VARBITS(word, count) _bzhi_u32((word), (count))
+ # define EXTRACT_VARBITS8(word, count) _bzhi_u32((word), (count))
+ # endif
+ # endif
+ # include "../decompress_template.h"
+ #endif
+
+ #if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE
+ #define DEFAULT_IMPL deflate_decompress_bmi2
+ #else
+ static inline decompress_func_t
+ arch_select_decompress_func(void)
+ {
+ #ifdef deflate_decompress_bmi2
+ if (HAVE_BMI2(get_x86_cpu_features()))
+ return deflate_decompress_bmi2;
+ #endif
+ return NULL;
+ }
+ #define arch_select_decompress_func arch_select_decompress_func
+ #endif
+
+ #endif /* LIB_X86_DECOMPRESS_IMPL_H */
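
Aside (editorial note, not part of the packaged file): the comment above states that EXTRACT_VARBITS(word, count) behaves like 'word & BITMASK(count)', with bzhi truncating the count to 8 bits. A small standalone check of that equivalence is sketched below; it assumes an x86-64 build with -mbmi2 and a BMI2-capable CPU, and the file name, mask_version() helper, and harness are illustrative, not libdeflate code.

/* bzhi_check.c - verifies _bzhi_u64(word, count) == word & ((1 << count) - 1)
 * for counts below 64. Build: gcc -O2 -mbmi2 bzhi_check.c
 */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t mask_version(uint64_t word, unsigned count)
{
	/* Portable 'word & BITMASK(count)': keep the low 'count' bits. */
	return word & ((count >= 64) ? ~0ULL : ((1ULL << count) - 1));
}

int main(void)
{
	uint64_t word = 0x0123456789ABCDEFULL;

	for (unsigned count = 0; count < 64; count++) {
		if (_bzhi_u64(word, count) != mask_version(word, count)) {
			printf("mismatch at count=%u\n", count);
			return 1;
		}
	}
	printf("bzhi matches word & BITMASK(count) for all counts < 64\n");
	return 0;
}
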
data/ext/deflate_ruby/libdeflate/lib/x86/matchfinder_impl.h
@@ -0,0 +1,122 @@
+ /*
+ * x86/matchfinder_impl.h - x86 implementations of matchfinder functions
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ #ifndef LIB_X86_MATCHFINDER_IMPL_H
+ #define LIB_X86_MATCHFINDER_IMPL_H
+
+ #include "cpu_features.h"
+
+ #ifdef __AVX2__
+ static forceinline void
+ matchfinder_init_avx2(mf_pos_t *data, size_t size)
+ {
+ __m256i *p = (__m256i *)data;
+ __m256i v = _mm256_set1_epi16(MATCHFINDER_INITVAL);
+
+ STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+ STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+ STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+ do {
+ p[0] = v;
+ p[1] = v;
+ p[2] = v;
+ p[3] = v;
+ p += 4;
+ size -= 4 * sizeof(*p);
+ } while (size != 0);
+ }
+ #define matchfinder_init matchfinder_init_avx2
+
+ static forceinline void
+ matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
+ {
+ __m256i *p = (__m256i *)data;
+ __m256i v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
+
+ STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+ STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+ STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+ do {
+ /* PADDSW: Add Packed Signed Integers With Signed Saturation */
+ p[0] = _mm256_adds_epi16(p[0], v);
+ p[1] = _mm256_adds_epi16(p[1], v);
+ p[2] = _mm256_adds_epi16(p[2], v);
+ p[3] = _mm256_adds_epi16(p[3], v);
+ p += 4;
+ size -= 4 * sizeof(*p);
+ } while (size != 0);
+ }
+ #define matchfinder_rebase matchfinder_rebase_avx2
+
+ #elif HAVE_SSE2_NATIVE
+ static forceinline void
+ matchfinder_init_sse2(mf_pos_t *data, size_t size)
+ {
+ __m128i *p = (__m128i *)data;
+ __m128i v = _mm_set1_epi16(MATCHFINDER_INITVAL);
+
+ STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+ STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+ STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+ do {
+ p[0] = v;
+ p[1] = v;
+ p[2] = v;
+ p[3] = v;
+ p += 4;
+ size -= 4 * sizeof(*p);
+ } while (size != 0);
+ }
+ #define matchfinder_init matchfinder_init_sse2
+
+ static forceinline void
+ matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
+ {
+ __m128i *p = (__m128i *)data;
+ __m128i v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
+
+ STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+ STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+ STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+ do {
+ /* PADDSW: Add Packed Signed Integers With Signed Saturation */
+ p[0] = _mm_adds_epi16(p[0], v);
+ p[1] = _mm_adds_epi16(p[1], v);
+ p[2] = _mm_adds_epi16(p[2], v);
+ p[3] = _mm_adds_epi16(p[3], v);
+ p += 4;
+ size -= 4 * sizeof(*p);
+ } while (size != 0);
+ }
+ #define matchfinder_rebase matchfinder_rebase_sse2
+ #endif /* HAVE_SSE2_NATIVE */
+
+ #endif /* LIB_X86_MATCHFINDER_IMPL_H */
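
Aside (editorial note, not part of the packaged file): a portable scalar sketch of what the AVX2/SSE2 loops above compute may make the intent clearer: init fills the table with a sentinel position, and rebase slides the window by subtracting the window size with signed saturation (the paddsw-based loops). WINDOW_SIZE and INITVAL below are illustrative stand-ins for MATCHFINDER_WINDOW_SIZE and MATCHFINDER_INITVAL, not the gem's definitions.

/* matchfinder_scalar_sketch.c - scalar equivalent of the SIMD init/rebase
 * loops, for illustration only.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

typedef int16_t mf_pos_t;

#define WINDOW_SIZE 32768                      /* stand-in value */
#define INITVAL     ((mf_pos_t)-WINDOW_SIZE)   /* stand-in value */

/* Fill the table so every entry reads as "far outside the window". */
static void matchfinder_init_scalar(mf_pos_t *data, size_t size)
{
	for (size_t i = 0; i < size / sizeof(*data); i++)
		data[i] = INITVAL;
}

/* Slide the window: subtract WINDOW_SIZE from every entry with signed
 * saturation, which is what adding (u16)-WINDOW_SIZE via paddsw does. */
static void matchfinder_rebase_scalar(mf_pos_t *data, size_t size)
{
	for (size_t i = 0; i < size / sizeof(*data); i++) {
		int32_t v = (int32_t)data[i] - WINDOW_SIZE;
		data[i] = (v < INT16_MIN) ? INT16_MIN : (mf_pos_t)v;
	}
}

int main(void)
{
	mf_pos_t table[8];

	matchfinder_init_scalar(table, sizeof(table));
	matchfinder_rebase_scalar(table, sizeof(table));
	/* Entries stay saturated at INT16_MIN rather than wrapping around. */
	printf("entry after init+rebase: %d\n", table[0]);
	return 0;
}
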