deflate-ruby 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +95 -92
- data/GEM_VERIFICATION_REPORT.md +140 -0
- data/LICENSE.txt +6 -6
- data/README.md +87 -65
- data/Rakefile +23 -0
- data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_impl.h → adler32_impl.h} +8 -7
- data/ext/deflate_ruby/common_defs.h +748 -0
- data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.c → cpu_features.c} +46 -16
- data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.h → cpu_features.h} +2 -1
- data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_impl.h → crc32_impl.h} +22 -23
- data/ext/deflate_ruby/{libdeflate/lib/crc32_multipliers.h → crc32_multipliers.h} +2 -4
- data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_pclmul_template.h → crc32_pclmul_template.h} +23 -94
- data/ext/deflate_ruby/{libdeflate/lib/crc32_tables.h → crc32_tables.h} +1 -1
- data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.c → deflate_compress.c} +59 -60
- data/ext/deflate_ruby/deflate_ruby.c +392 -218
- data/ext/deflate_ruby/deflate_ruby.h +6 -0
- data/ext/deflate_ruby/extconf.rb +35 -25
- data/ext/deflate_ruby/libdeflate/adler32.c +162 -0
- data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/adler32_impl.h +14 -7
- data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/crc32_impl.h +25 -31
- data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_helpers.h +156 -0
- data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_wide.h +226 -0
- data/ext/deflate_ruby/libdeflate/bt_matchfinder.h +342 -0
- data/ext/deflate_ruby/libdeflate/common_defs.h +2 -1
- data/ext/deflate_ruby/libdeflate/cpu_features_common.h +93 -0
- data/ext/deflate_ruby/libdeflate/crc32.c +262 -0
- data/ext/deflate_ruby/libdeflate/crc32_multipliers.h +375 -0
- data/ext/deflate_ruby/libdeflate/crc32_tables.h +587 -0
- data/ext/deflate_ruby/libdeflate/decompress_template.h +777 -0
- data/ext/deflate_ruby/libdeflate/deflate_compress.c +4128 -0
- data/ext/deflate_ruby/libdeflate/deflate_compress.h +15 -0
- data/ext/deflate_ruby/libdeflate/deflate_constants.h +56 -0
- data/ext/deflate_ruby/libdeflate/deflate_decompress.c +1208 -0
- data/ext/deflate_ruby/libdeflate/gzip_compress.c +90 -0
- data/ext/deflate_ruby/libdeflate/gzip_constants.h +45 -0
- data/ext/deflate_ruby/libdeflate/gzip_decompress.c +144 -0
- data/ext/deflate_ruby/libdeflate/hc_matchfinder.h +401 -0
- data/ext/deflate_ruby/libdeflate/ht_matchfinder.h +234 -0
- data/ext/deflate_ruby/libdeflate/lib_common.h +106 -0
- data/ext/deflate_ruby/libdeflate/libdeflate.h +2 -2
- data/ext/deflate_ruby/libdeflate/{lib/matchfinder_common.h → matchfinder_common.h} +3 -3
- data/ext/deflate_ruby/libdeflate/x86/adler32_impl.h +135 -0
- data/ext/deflate_ruby/libdeflate/x86/adler32_template.h +518 -0
- data/ext/deflate_ruby/libdeflate/x86/cpu_features.c +213 -0
- data/ext/deflate_ruby/libdeflate/x86/cpu_features.h +170 -0
- data/ext/deflate_ruby/libdeflate/x86/crc32_impl.h +159 -0
- data/ext/deflate_ruby/libdeflate/x86/crc32_pclmul_template.h +424 -0
- data/ext/deflate_ruby/libdeflate/x86/decompress_impl.h +57 -0
- data/ext/deflate_ruby/libdeflate.h +411 -0
- data/ext/deflate_ruby/matchfinder_common.h +224 -0
- data/ext/deflate_ruby/matchfinder_impl.h +122 -0
- data/ext/deflate_ruby/utils.c +141 -0
- data/ext/deflate_ruby/zlib_compress.c +82 -0
- data/ext/deflate_ruby/zlib_constants.h +21 -0
- data/ext/deflate_ruby/zlib_decompress.c +104 -0
- data/lib/deflate_ruby/version.rb +1 -1
- data/lib/deflate_ruby.rb +1 -63
- data/sig/deflate_ruby.rbs +4 -0
- data/test/test_deflate_ruby.rb +220 -0
- data/test/test_helper.rb +6 -0
- metadata +90 -144
- data/ext/deflate_ruby/libdeflate/CMakeLists.txt +0 -270
- data/ext/deflate_ruby/libdeflate/NEWS.md +0 -494
- data/ext/deflate_ruby/libdeflate/README.md +0 -228
- data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +0 -3
- data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +0 -18
- data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +0 -105
- data/ext/deflate_ruby/libdeflate/programs/benchmark.c +0 -696
- data/ext/deflate_ruby/libdeflate/programs/checksum.c +0 -218
- data/ext/deflate_ruby/libdeflate/programs/config.h.in +0 -19
- data/ext/deflate_ruby/libdeflate/programs/gzip.c +0 -688
- data/ext/deflate_ruby/libdeflate/programs/prog_util.c +0 -521
- data/ext/deflate_ruby/libdeflate/programs/prog_util.h +0 -225
- data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +0 -200
- data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +0 -155
- data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +0 -385
- data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +0 -130
- data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +0 -72
- data/ext/deflate_ruby/libdeflate/programs/test_overread.c +0 -95
- data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +0 -472
- data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +0 -151
- data/ext/deflate_ruby/libdeflate/programs/test_util.c +0 -237
- data/ext/deflate_ruby/libdeflate/programs/test_util.h +0 -61
- data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +0 -118
- data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +0 -118
- data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +0 -69
- data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +0 -10
- data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +0 -10
- data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +0 -253
- data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +0 -17
- data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +0 -119
- data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +0 -38
- data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +0 -37
- data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +0 -19
- data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +0 -199
- data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +0 -105
- data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +0 -44
- data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +0 -29
- data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +0 -523
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +0 -95
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +0 -3
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +0 -62
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +0 -108
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +0 -19
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +0 -3
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +0 -19
- data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +0 -416
- data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +0 -8
- data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +0 -8
- /data/ext/deflate_ruby/{libdeflate/lib/adler32.c → adler32.c} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_template.h → adler32_template.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/bt_matchfinder.h → bt_matchfinder.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/cpu_features_common.h → cpu_features_common.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/crc32.c → crc32.c} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_helpers.h → crc32_pmull_helpers.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_wide.h → crc32_pmull_wide.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/x86/decompress_impl.h → decompress_impl.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/decompress_template.h → decompress_template.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.h → deflate_compress.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/deflate_constants.h → deflate_constants.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/deflate_decompress.c → deflate_decompress.c} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/gzip_compress.c → gzip_compress.c} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/gzip_constants.h → gzip_constants.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/gzip_decompress.c → gzip_decompress.c} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/hc_matchfinder.h → hc_matchfinder.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/ht_matchfinder.h → ht_matchfinder.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/lib_common.h → lib_common.h} +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.c +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.h +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/matchfinder_impl.h +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/riscv → riscv}/matchfinder_impl.h +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/utils.c → utils.c} +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/x86 → x86}/matchfinder_impl.h +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/zlib_compress.c → zlib_compress.c} +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/zlib_constants.h → zlib_constants.h} +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/zlib_decompress.c → zlib_decompress.c} +0 -0
|
@@ -88,9 +88,31 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
|
|
|
88
88
|
|
|
89
89
|
volatile u32 libdeflate_x86_cpu_features = 0;
|
|
90
90
|
|
|
91
|
+
static inline bool
|
|
92
|
+
os_supports_avx512(u64 xcr0)
|
|
93
|
+
{
|
|
94
|
+
#ifdef __APPLE__
|
|
95
|
+
/*
|
|
96
|
+
* The Darwin kernel had a bug where it could corrupt the opmask
|
|
97
|
+
* registers. See
|
|
98
|
+
* https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259
|
|
99
|
+
* Darwin also does not initially set the XCR0 bits for AVX512, but they
|
|
100
|
+
* are set if the thread tries to use AVX512 anyway. Thus, to safely
|
|
101
|
+
* and consistently use AVX512 on macOS we'd need to check the kernel
|
|
102
|
+
* version as well as detect AVX512 support using a macOS-specific
|
|
103
|
+
* method. We don't bother with this, especially given Apple's
|
|
104
|
+
* transition to arm64.
|
|
105
|
+
*/
|
|
106
|
+
return false;
|
|
107
|
+
#else
|
|
108
|
+
return (xcr0 & 0xe6) == 0xe6;
|
|
109
|
+
#endif
|
|
110
|
+
}
|
|
111
|
+
|
|
91
112
|
/*
|
|
92
|
-
* Don't use 512-bit vectors on Intel CPUs before Rocket Lake and Sapphire
|
|
93
|
-
* Rapids, due to the downclocking penalty.
|
|
113
|
+
* Don't use 512-bit vectors (ZMM registers) on Intel CPUs before Rocket Lake
|
|
114
|
+
* and Sapphire Rapids, due to the overly-eager downclocking which can reduce
|
|
115
|
+
* the performance of workloads that use ZMM registers only occasionally.
|
|
94
116
|
*/
|
|
95
117
|
static inline bool
|
|
96
118
|
allow_512bit_vectors(const u32 manufacturer[3], u32 family, u32 model)
|
|
@@ -140,7 +162,12 @@ void libdeflate_init_x86_cpu_features(void)
|
|
|
140
162
|
family += (a >> 20) & 0xff;
|
|
141
163
|
if (d & (1 << 26))
|
|
142
164
|
features |= X86_CPU_FEATURE_SSE2;
|
|
143
|
-
|
|
165
|
+
/*
|
|
166
|
+
* No known CPUs have pclmulqdq without sse4.1, so in practice code
|
|
167
|
+
* targeting pclmulqdq can use sse4.1 instructions. But to be safe,
|
|
168
|
+
* explicitly check for both the pclmulqdq and sse4.1 bits.
|
|
169
|
+
*/
|
|
170
|
+
if ((c & (1 << 1)) && (c & (1 << 19)))
|
|
144
171
|
features |= X86_CPU_FEATURE_PCLMULQDQ;
|
|
145
172
|
if (c & (1 << 27))
|
|
146
173
|
xcr0 = read_xcr(0);
|
|
@@ -152,21 +179,24 @@ void libdeflate_init_x86_cpu_features(void)
|
|
|
152
179
|
|
|
153
180
|
/* EAX=7, ECX=0: Extended Features */
|
|
154
181
|
cpuid(7, 0, &a, &b, &c, &d);
|
|
155
|
-
if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6))
|
|
156
|
-
features |= X86_CPU_FEATURE_AVX2;
|
|
157
182
|
if (b & (1 << 8))
|
|
158
183
|
features |= X86_CPU_FEATURE_BMI2;
|
|
159
|
-
if ((
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
184
|
+
if ((xcr0 & 0x6) == 0x6) {
|
|
185
|
+
if (b & (1 << 5))
|
|
186
|
+
features |= X86_CPU_FEATURE_AVX2;
|
|
187
|
+
if (c & (1 << 10))
|
|
188
|
+
features |= X86_CPU_FEATURE_VPCLMULQDQ;
|
|
189
|
+
}
|
|
190
|
+
if (os_supports_avx512(xcr0)) {
|
|
191
|
+
if (allow_512bit_vectors(manufacturer, family, model))
|
|
192
|
+
features |= X86_CPU_FEATURE_ZMM;
|
|
193
|
+
if (b & (1 << 30))
|
|
194
|
+
features |= X86_CPU_FEATURE_AVX512BW;
|
|
195
|
+
if (b & (1U << 31))
|
|
196
|
+
features |= X86_CPU_FEATURE_AVX512VL;
|
|
197
|
+
if (c & (1 << 11))
|
|
198
|
+
features |= X86_CPU_FEATURE_AVX512VNNI;
|
|
199
|
+
}
|
|
170
200
|
|
|
171
201
|
/* EAX=7, ECX=1: Extended Features */
|
|
172
202
|
cpuid(7, 1, &a, &b, &c, &d);
|
|
@@ -108,7 +108,8 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
|
|
|
108
108
|
# define HAVE_SSE2_NATIVE 0
|
|
109
109
|
#endif
|
|
110
110
|
|
|
111
|
-
#if defined(__PCLMUL__)
|
|
111
|
+
#if (defined(__PCLMUL__) && defined(__SSE4_1__)) || \
|
|
112
|
+
(defined(_MSC_VER) && defined(__AVX2__))
|
|
112
113
|
# define HAVE_PCLMULQDQ(features) 1
|
|
113
114
|
#else
|
|
114
115
|
# define HAVE_PCLMULQDQ(features) ((features) & X86_CPU_FEATURE_PCLMULQDQ)
|
|
@@ -44,31 +44,26 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
|
|
|
44
44
|
};
|
|
45
45
|
|
|
46
46
|
#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
|
|
47
|
-
/*
|
|
47
|
+
/*
|
|
48
|
+
* PCLMULQDQ implementation. This targets PCLMULQDQ+SSE4.1, since in practice
|
|
49
|
+
* all CPUs that support PCLMULQDQ also support SSE4.1.
|
|
50
|
+
*/
|
|
48
51
|
# define crc32_x86_pclmulqdq crc32_x86_pclmulqdq
|
|
49
52
|
# define SUFFIX _pclmulqdq
|
|
50
|
-
# define ATTRIBUTES _target_attribute("pclmul")
|
|
53
|
+
# define ATTRIBUTES _target_attribute("pclmul,sse4.1")
|
|
51
54
|
# define VL 16
|
|
52
|
-
# define USE_SSE4_1 0
|
|
53
55
|
# define USE_AVX512 0
|
|
54
56
|
# include "crc32_pclmul_template.h"
|
|
55
57
|
|
|
56
58
|
/*
|
|
57
|
-
* PCLMULQDQ/AVX implementation.
|
|
58
|
-
*
|
|
59
|
-
*
|
|
60
|
-
* performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) without
|
|
61
|
-
* actually using any AVX intrinsics, probably due to the availability of
|
|
62
|
-
* non-destructive VEX-encoded instructions. Second, AVX support implies SSSE3
|
|
63
|
-
* and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient
|
|
64
|
-
* handling of partial blocks. (We *could* compile a variant with
|
|
65
|
-
* PCLMULQDQ+SSE4.1 without AVX, but for simplicity we currently don't bother.)
|
|
59
|
+
* PCLMULQDQ/AVX implementation. Same as above, but this is compiled with AVX
|
|
60
|
+
* enabled so that the compiler can generate VEX-coded instructions which can be
|
|
61
|
+
* slightly more efficient. It still uses 128-bit vectors.
|
|
66
62
|
*/
|
|
67
63
|
# define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx
|
|
68
64
|
# define SUFFIX _pclmulqdq_avx
|
|
69
65
|
# define ATTRIBUTES _target_attribute("pclmul,avx")
|
|
70
66
|
# define VL 16
|
|
71
|
-
# define USE_SSE4_1 1
|
|
72
67
|
# define USE_AVX512 0
|
|
73
68
|
# include "crc32_pclmul_template.h"
|
|
74
69
|
#endif
|
|
@@ -83,43 +78,47 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
|
|
|
83
78
|
*
|
|
84
79
|
* gcc 8.1 and 8.2 had a similar bug where they assumed that
|
|
85
80
|
* _mm256_clmulepi64_epi128() always needed AVX512. It's fixed in gcc 8.3.
|
|
81
|
+
*
|
|
82
|
+
* _mm256_zextsi128_si256() requires gcc 10.
|
|
86
83
|
*/
|
|
87
|
-
#if GCC_PREREQ(
|
|
84
|
+
#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000)) && \
|
|
85
|
+
!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
|
|
88
86
|
# define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2
|
|
89
87
|
# define SUFFIX _vpclmulqdq_avx2
|
|
90
88
|
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2")
|
|
91
89
|
# define VL 32
|
|
92
|
-
# define USE_SSE4_1 1
|
|
93
90
|
# define USE_AVX512 0
|
|
94
91
|
# include "crc32_pclmul_template.h"
|
|
95
92
|
#endif
|
|
96
93
|
|
|
97
|
-
#if GCC_PREREQ(
|
|
94
|
+
#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
|
|
95
|
+
!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
|
|
98
96
|
/*
|
|
99
97
|
* VPCLMULQDQ/AVX512 implementation using 256-bit vectors. This is very similar
|
|
100
98
|
* to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog
|
|
101
|
-
* instruction and more registers. This is used on
|
|
102
|
-
*
|
|
103
|
-
*
|
|
99
|
+
* instruction and more registers. This is used on certain older Intel CPUs,
|
|
100
|
+
* specifically Ice Lake and Tiger Lake, which support VPCLMULQDQ and AVX512 but
|
|
101
|
+
* downclock a bit too eagerly when ZMM registers are used.
|
|
102
|
+
*
|
|
103
|
+
* _mm256_zextsi128_si256() requires gcc 10.
|
|
104
104
|
*/
|
|
105
105
|
# define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256
|
|
106
106
|
# define SUFFIX _vpclmulqdq_avx512_vl256
|
|
107
107
|
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
|
|
108
108
|
# define VL 32
|
|
109
|
-
# define USE_SSE4_1 1
|
|
110
109
|
# define USE_AVX512 1
|
|
111
110
|
# include "crc32_pclmul_template.h"
|
|
112
111
|
|
|
113
112
|
/*
|
|
114
113
|
* VPCLMULQDQ/AVX512 implementation using 512-bit vectors. This is used on CPUs
|
|
115
|
-
* that have a good AVX-512 implementation including VPCLMULQDQ.
|
|
116
|
-
*
|
|
114
|
+
* that have a good AVX-512 implementation including VPCLMULQDQ.
|
|
115
|
+
*
|
|
116
|
+
* _mm512_zextsi128_si512() requires gcc 10.
|
|
117
117
|
*/
|
|
118
118
|
# define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512
|
|
119
119
|
# define SUFFIX _vpclmulqdq_avx512_vl512
|
|
120
120
|
# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
|
|
121
121
|
# define VL 64
|
|
122
|
-
# define USE_SSE4_1 1
|
|
123
122
|
# define USE_AVX512 1
|
|
124
123
|
# include "crc32_pclmul_template.h"
|
|
125
124
|
#endif
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/*
|
|
2
2
|
* crc32_multipliers.h - constants for CRC-32 folding
|
|
3
3
|
*
|
|
4
|
-
* THIS FILE WAS GENERATED BY gen_crc32_multipliers.c. DO NOT EDIT.
|
|
4
|
+
* THIS FILE WAS GENERATED BY gen-crc32-consts.py. DO NOT EDIT.
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
7
|
#define CRC32_X159_MODG 0xae689191 /* x^159 mod G(x) */
|
|
@@ -100,10 +100,8 @@
|
|
|
100
100
|
#define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */
|
|
101
101
|
#define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */
|
|
102
102
|
|
|
103
|
-
#define
|
|
104
|
-
#define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
|
|
103
|
+
#define CRC32_BARRETT_CONSTANT_1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)) */
|
|
105
104
|
#define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
|
|
106
|
-
#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }
|
|
107
105
|
|
|
108
106
|
#define CRC32_NUM_CHUNKS 4
|
|
109
107
|
#define CRC32_MIN_VARIABLE_CHUNK_LEN 128UL
|
data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_pclmul_template.h → crc32_pclmul_template.h}
RENAMED
|
@@ -34,17 +34,13 @@
|
|
|
34
34
|
* ATTRIBUTES:
|
|
35
35
|
* Target function attributes to use. Must satisfy the dependencies of the
|
|
36
36
|
* other parameters as follows:
|
|
37
|
-
* VL=16 &&
|
|
38
|
-
* VL=
|
|
39
|
-
* VL=32 &&
|
|
40
|
-
* VL=
|
|
41
|
-
* VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
|
|
37
|
+
* VL=16 && USE_AVX512=0: at least pclmul,sse4.1
|
|
38
|
+
* VL=32 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
|
|
39
|
+
* VL=32 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
|
|
40
|
+
* VL=64 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
|
|
42
41
|
* (Other combinations are not useful and have not been tested.)
|
|
43
42
|
* VL:
|
|
44
43
|
* Vector length in bytes. Must be 16, 32, or 64.
|
|
45
|
-
* USE_SSE4_1:
|
|
46
|
-
* If 1, take advantage of SSE4.1 instructions such as pblendvb.
|
|
47
|
-
* If 0, assume that the CPU might not support SSE4.1.
|
|
48
44
|
* USE_AVX512:
|
|
49
45
|
* If 1, take advantage of AVX-512 features such as masking and the
|
|
50
46
|
* vpternlog instruction. This doesn't enable the use of 512-bit vectors;
|
|
@@ -55,7 +51,10 @@
|
|
|
55
51
|
* instructions. Note that the x86 crc32 instruction cannot be used, as it is
|
|
56
52
|
* for a different polynomial, not the gzip one. For an explanation of CRC
|
|
57
53
|
* folding with carryless multiplication instructions, see
|
|
58
|
-
* scripts/gen_crc32_multipliers.c and the following paper:
|
|
54
|
+
* scripts/gen-crc32-consts.py and the following blog posts and papers:
|
|
55
|
+
*
|
|
56
|
+
* "An alternative exposition of crc32_4k_pclmulqdq"
|
|
57
|
+
* https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
|
|
59
58
|
*
|
|
60
59
|
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
|
|
61
60
|
* https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
|
|
@@ -81,7 +80,7 @@
|
|
|
81
80
|
# define fold_vec fold_vec256
|
|
82
81
|
# define VLOADU(p) _mm256_loadu_si256((const void *)(p))
|
|
83
82
|
# define VXOR(a, b) _mm256_xor_si256((a), (b))
|
|
84
|
-
# define M128I_TO_VEC(a)
|
|
83
|
+
# define M128I_TO_VEC(a) _mm256_zextsi128_si256(a)
|
|
85
84
|
# define MULTS(a, b) _mm256_set_epi64x(a, b, a, b)
|
|
86
85
|
# define MULTS_8V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
|
|
87
86
|
# define MULTS_4V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
|
|
@@ -92,7 +91,7 @@
|
|
|
92
91
|
# define fold_vec fold_vec512
|
|
93
92
|
# define VLOADU(p) _mm512_loadu_si512((const void *)(p))
|
|
94
93
|
# define VXOR(a, b) _mm512_xor_si512((a), (b))
|
|
95
|
-
# define M128I_TO_VEC(a)
|
|
94
|
+
# define M128I_TO_VEC(a) _mm512_zextsi128_si512(a)
|
|
96
95
|
# define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b)
|
|
97
96
|
# define MULTS_8V MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG)
|
|
98
97
|
# define MULTS_4V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
|
|
@@ -149,7 +148,6 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults)
|
|
|
149
148
|
#define fold_vec512 ADD_SUFFIX(fold_vec512)
|
|
150
149
|
#endif /* VL >= 64 */
|
|
151
150
|
|
|
152
|
-
#if USE_SSE4_1
|
|
153
151
|
/*
|
|
154
152
|
* Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to
|
|
155
153
|
* the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and
|
|
@@ -181,7 +179,6 @@ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
|
|
|
181
179
|
return fold_vec128(x0, x1, mults_128b);
|
|
182
180
|
}
|
|
183
181
|
#define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes)
|
|
184
|
-
#endif /* USE_SSE4_1 */
|
|
185
182
|
|
|
186
183
|
static ATTRIBUTES u32
|
|
187
184
|
ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
|
|
@@ -192,15 +189,13 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
|
|
|
192
189
|
* folding across 128 bits. mults_128b differs from mults_1v when
|
|
193
190
|
* VL != 16. All multipliers are 64-bit, to match what pclmulqdq needs,
|
|
194
191
|
* but since this is for CRC-32 only their low 32 bits are nonzero.
|
|
195
|
-
* For more details, see scripts/gen_crc32_multipliers.c.
|
|
192
|
+
* For more details, see scripts/gen-crc32-consts.py.
|
|
196
193
|
*/
|
|
197
194
|
const vec_t mults_8v = MULTS_8V;
|
|
198
195
|
const vec_t mults_4v = MULTS_4V;
|
|
199
196
|
const vec_t mults_2v = MULTS_2V;
|
|
200
197
|
const vec_t mults_1v = MULTS_1V;
|
|
201
198
|
const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
|
|
202
|
-
const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG);
|
|
203
|
-
const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
|
|
204
199
|
const __m128i barrett_reduction_constants =
|
|
205
200
|
_mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
|
|
206
201
|
vec_t v0, v1, v2, v3, v4, v5, v6, v7;
|
|
@@ -273,7 +268,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
|
|
|
273
268
|
size_t align = -(uintptr_t)p & (VL-1);
|
|
274
269
|
|
|
275
270
|
len -= align;
|
|
276
|
-
#if USE_SSE4_1
|
|
277
271
|
x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
|
|
278
272
|
p += 16;
|
|
279
273
|
if (align & 15) {
|
|
@@ -296,11 +290,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
|
|
|
296
290
|
v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
|
|
297
291
|
# endif
|
|
298
292
|
p -= 16;
|
|
299
|
-
#else
|
|
300
|
-
crc = crc32_slice1(crc, p, align);
|
|
301
|
-
p += align;
|
|
302
|
-
v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
|
|
303
|
-
#endif
|
|
304
293
|
} else {
|
|
305
294
|
v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
|
|
306
295
|
}
|
|
@@ -395,86 +384,27 @@ less_than_vl_remaining:
|
|
|
395
384
|
less_than_16_remaining:
|
|
396
385
|
len &= 15;
|
|
397
386
|
|
|
398
|
-
/*
|
|
399
|
-
* If fold_lessthan16bytes() is available, handle any remainder
|
|
400
|
-
* of 1 to 15 bytes now, before reducing to 32 bits.
|
|
401
|
-
*/
|
|
402
|
-
#if USE_SSE4_1
|
|
387
|
+
/* Handle any remainder of 1 to 15 bytes. */
|
|
403
388
|
if (len)
|
|
404
389
|
x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
|
|
405
|
-
#endif
|
|
406
390
|
#if USE_AVX512
|
|
407
391
|
reduce_x0:
|
|
408
392
|
#endif
|
|
409
|
-
|
|
410
|
-
/*
|
|
411
|
-
* Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
|
|
412
|
-
* which is equivalent to multiplying by x^32. This is needed because
|
|
413
|
-
* the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
|
|
414
|
-
*/
|
|
415
|
-
x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
|
|
416
|
-
_mm_clmulepi64_si128(x0, mults_128b, 0x10));
|
|
417
|
-
|
|
418
|
-
/* Fold 96 => 64 bits. */
|
|
419
|
-
x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
|
|
420
|
-
_mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
|
|
421
|
-
final_mult, 0x00));
|
|
422
|
-
|
|
423
393
|
/*
|
|
424
|
-
*
|
|
425
|
-
*
|
|
426
|
-
* Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
|
|
427
|
-
* compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
|
|
428
|
-
*
|
|
429
|
-
* R(x) = (A(x)*x^32 + B(x)) mod G(x)
|
|
430
|
-
* = (A(x)*x^32) mod G(x) + B(x)
|
|
431
|
-
*
|
|
432
|
-
* Then, by the Division Algorithm there exists a unique q(x) such that:
|
|
394
|
+
* Multiply the remaining 128-bit message polynomial 'x0' by x^32, then
|
|
395
|
+
* reduce it modulo the generator polynomial G. This gives the CRC.
|
|
433
396
|
*
|
|
434
|
-
*
|
|
435
|
-
*
|
|
436
|
-
*
|
|
437
|
-
*
|
|
438
|
-
* right-hand side without changing its value:
|
|
439
|
-
*
|
|
440
|
-
* (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
|
|
441
|
-
*
|
|
442
|
-
* Note that '+' is equivalent to '-' in polynomials over GF(2).
|
|
443
|
-
*
|
|
444
|
-
* We also know that:
|
|
445
|
-
*
|
|
446
|
-
* / A(x)*x^32 \
|
|
447
|
-
* q(x) = floor ( --------- )
|
|
448
|
-
* \ G(x) /
|
|
449
|
-
*
|
|
450
|
-
* To compute this efficiently, we can multiply the top and bottom by
|
|
451
|
-
* x^32 and move the division by G(x) to the top:
|
|
452
|
-
*
|
|
453
|
-
* / A(x) * floor(x^64 / G(x)) \
|
|
454
|
-
* q(x) = floor ( ------------------------- )
|
|
455
|
-
* \ x^32 /
|
|
456
|
-
*
|
|
457
|
-
* Note that floor(x^64 / G(x)) is a constant.
|
|
458
|
-
*
|
|
459
|
-
* So finally we have:
|
|
460
|
-
*
|
|
461
|
-
* / A(x) * floor(x^64 / G(x)) \
|
|
462
|
-
* R(x) = B(x) + G(x)*floor ( ------------------------- )
|
|
463
|
-
* \ x^32 /
|
|
397
|
+
* This implementation matches that used in crc-pclmul-template.S from
|
|
398
|
+
* https://lore.kernel.org/r/20250210174540.161705-4-ebiggers@kernel.org/
|
|
399
|
+
* with the parameters n=32 and LSB_CRC=1 (what the gzip CRC uses). See
|
|
400
|
+
* there for a detailed explanation of the math used here.
|
|
464
401
|
*/
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
x1 = _mm_clmulepi64_si128(
|
|
468
|
-
|
|
402
|
+
x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, mults_128b, 0x10),
|
|
403
|
+
_mm_bsrli_si128(x0, 8));
|
|
404
|
+
x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
|
|
405
|
+
x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);
|
|
469
406
|
x0 = _mm_xor_si128(x0, x1);
|
|
470
|
-
|
|
471
|
-
crc = _mm_extract_epi32(x0, 1);
|
|
472
|
-
#else
|
|
473
|
-
crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01));
|
|
474
|
-
/* Process up to 15 bytes left over at the end. */
|
|
475
|
-
crc = crc32_slice1(crc, p, len);
|
|
476
|
-
#endif
|
|
477
|
-
return crc;
|
|
407
|
+
return _mm_extract_epi32(x0, 2);
|
|
478
408
|
}
|
|
479
409
|
|
|
480
410
|
#undef vec_t
|
|
@@ -491,5 +421,4 @@ reduce_x0:
|
|
|
491
421
|
#undef SUFFIX
|
|
492
422
|
#undef ATTRIBUTES
|
|
493
423
|
#undef VL
|
|
494
|
-
#undef USE_SSE4_1
|
|
495
424
|
#undef USE_AVX512
|