deflate-ruby 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. checksums.yaml +4 -4
  2. data/CLAUDE.md +95 -92
  3. data/LICENSE.txt +6 -6
  4. data/README.md +87 -65
  5. data/Rakefile +23 -0
  6. data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_impl.h → adler32_impl.h} +8 -7
  7. data/ext/deflate_ruby/common_defs.h +748 -0
  8. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.c → cpu_features.c} +46 -16
  9. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.h → cpu_features.h} +2 -1
  10. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_impl.h → crc32_impl.h} +22 -23
  11. data/ext/deflate_ruby/{libdeflate/lib/crc32_multipliers.h → crc32_multipliers.h} +2 -4
  12. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_pclmul_template.h → crc32_pclmul_template.h} +23 -94
  13. data/ext/deflate_ruby/{libdeflate/lib/crc32_tables.h → crc32_tables.h} +1 -1
  14. data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.c → deflate_compress.c} +59 -60
  15. data/ext/deflate_ruby/deflate_ruby.c +392 -218
  16. data/ext/deflate_ruby/deflate_ruby.h +6 -0
  17. data/ext/deflate_ruby/extconf.rb +35 -25
  18. data/ext/deflate_ruby/libdeflate/adler32.c +162 -0
  19. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/adler32_impl.h +14 -7
  20. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/crc32_impl.h +25 -31
  21. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_helpers.h +156 -0
  22. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_wide.h +226 -0
  23. data/ext/deflate_ruby/libdeflate/bt_matchfinder.h +342 -0
  24. data/ext/deflate_ruby/libdeflate/common_defs.h +2 -1
  25. data/ext/deflate_ruby/libdeflate/cpu_features_common.h +93 -0
  26. data/ext/deflate_ruby/libdeflate/crc32.c +262 -0
  27. data/ext/deflate_ruby/libdeflate/crc32_multipliers.h +375 -0
  28. data/ext/deflate_ruby/libdeflate/crc32_tables.h +587 -0
  29. data/ext/deflate_ruby/libdeflate/decompress_template.h +777 -0
  30. data/ext/deflate_ruby/libdeflate/deflate_compress.c +4128 -0
  31. data/ext/deflate_ruby/libdeflate/deflate_compress.h +15 -0
  32. data/ext/deflate_ruby/libdeflate/deflate_constants.h +56 -0
  33. data/ext/deflate_ruby/libdeflate/deflate_decompress.c +1208 -0
  34. data/ext/deflate_ruby/libdeflate/gzip_compress.c +90 -0
  35. data/ext/deflate_ruby/libdeflate/gzip_constants.h +45 -0
  36. data/ext/deflate_ruby/libdeflate/gzip_decompress.c +144 -0
  37. data/ext/deflate_ruby/libdeflate/hc_matchfinder.h +401 -0
  38. data/ext/deflate_ruby/libdeflate/ht_matchfinder.h +234 -0
  39. data/ext/deflate_ruby/libdeflate/lib_common.h +106 -0
  40. data/ext/deflate_ruby/libdeflate/libdeflate.h +2 -2
  41. data/ext/deflate_ruby/libdeflate/{lib/matchfinder_common.h → matchfinder_common.h} +3 -3
  42. data/ext/deflate_ruby/libdeflate/x86/adler32_impl.h +135 -0
  43. data/ext/deflate_ruby/libdeflate/x86/adler32_template.h +518 -0
  44. data/ext/deflate_ruby/libdeflate/x86/cpu_features.c +213 -0
  45. data/ext/deflate_ruby/libdeflate/x86/cpu_features.h +170 -0
  46. data/ext/deflate_ruby/libdeflate/x86/crc32_impl.h +159 -0
  47. data/ext/deflate_ruby/libdeflate/x86/crc32_pclmul_template.h +424 -0
  48. data/ext/deflate_ruby/libdeflate/x86/decompress_impl.h +57 -0
  49. data/ext/deflate_ruby/libdeflate.h +411 -0
  50. data/ext/deflate_ruby/matchfinder_common.h +224 -0
  51. data/ext/deflate_ruby/matchfinder_impl.h +122 -0
  52. data/ext/deflate_ruby/utils.c +141 -0
  53. data/ext/deflate_ruby/zlib_compress.c +82 -0
  54. data/ext/deflate_ruby/zlib_constants.h +21 -0
  55. data/ext/deflate_ruby/zlib_decompress.c +104 -0
  56. data/lib/deflate_ruby/version.rb +1 -1
  57. data/lib/deflate_ruby.rb +1 -63
  58. data/sig/deflate_ruby.rbs +4 -0
  59. data/test/test_deflate_ruby.rb +220 -0
  60. data/test/test_helper.rb +6 -0
  61. metadata +89 -144
  62. data/ext/deflate_ruby/libdeflate/CMakeLists.txt +0 -270
  63. data/ext/deflate_ruby/libdeflate/NEWS.md +0 -494
  64. data/ext/deflate_ruby/libdeflate/README.md +0 -228
  65. data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +0 -3
  66. data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +0 -18
  67. data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +0 -105
  68. data/ext/deflate_ruby/libdeflate/programs/benchmark.c +0 -696
  69. data/ext/deflate_ruby/libdeflate/programs/checksum.c +0 -218
  70. data/ext/deflate_ruby/libdeflate/programs/config.h.in +0 -19
  71. data/ext/deflate_ruby/libdeflate/programs/gzip.c +0 -688
  72. data/ext/deflate_ruby/libdeflate/programs/prog_util.c +0 -521
  73. data/ext/deflate_ruby/libdeflate/programs/prog_util.h +0 -225
  74. data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +0 -200
  75. data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +0 -155
  76. data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +0 -385
  77. data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +0 -130
  78. data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +0 -72
  79. data/ext/deflate_ruby/libdeflate/programs/test_overread.c +0 -95
  80. data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +0 -472
  81. data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +0 -151
  82. data/ext/deflate_ruby/libdeflate/programs/test_util.c +0 -237
  83. data/ext/deflate_ruby/libdeflate/programs/test_util.h +0 -61
  84. data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +0 -118
  85. data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +0 -118
  86. data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +0 -69
  87. data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +0 -10
  88. data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +0 -10
  89. data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +0 -253
  90. data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +0 -17
  91. data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +0 -119
  92. data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +0 -38
  93. data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +0 -37
  94. data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +0 -19
  95. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +0 -199
  96. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +0 -105
  97. data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +0 -44
  98. data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +0 -29
  99. data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +0 -523
  100. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
  101. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +0 -95
  102. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +0 -3
  103. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +0 -62
  104. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +0 -108
  105. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
  106. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +0 -19
  107. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +0 -3
  108. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +0 -19
  109. data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +0 -416
  110. data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +0 -8
  111. data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +0 -8
  112. /data/ext/deflate_ruby/{libdeflate/lib/adler32.c → adler32.c} +0 -0
  113. /data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_template.h → adler32_template.h} +0 -0
  114. /data/ext/deflate_ruby/{libdeflate/lib/bt_matchfinder.h → bt_matchfinder.h} +0 -0
  115. /data/ext/deflate_ruby/{libdeflate/lib/cpu_features_common.h → cpu_features_common.h} +0 -0
  116. /data/ext/deflate_ruby/{libdeflate/lib/crc32.c → crc32.c} +0 -0
  117. /data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_helpers.h → crc32_pmull_helpers.h} +0 -0
  118. /data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_wide.h → crc32_pmull_wide.h} +0 -0
  119. /data/ext/deflate_ruby/{libdeflate/lib/x86/decompress_impl.h → decompress_impl.h} +0 -0
  120. /data/ext/deflate_ruby/{libdeflate/lib/decompress_template.h → decompress_template.h} +0 -0
  121. /data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.h → deflate_compress.h} +0 -0
  122. /data/ext/deflate_ruby/{libdeflate/lib/deflate_constants.h → deflate_constants.h} +0 -0
  123. /data/ext/deflate_ruby/{libdeflate/lib/deflate_decompress.c → deflate_decompress.c} +0 -0
  124. /data/ext/deflate_ruby/{libdeflate/lib/gzip_compress.c → gzip_compress.c} +0 -0
  125. /data/ext/deflate_ruby/{libdeflate/lib/gzip_constants.h → gzip_constants.h} +0 -0
  126. /data/ext/deflate_ruby/{libdeflate/lib/gzip_decompress.c → gzip_decompress.c} +0 -0
  127. /data/ext/deflate_ruby/{libdeflate/lib/hc_matchfinder.h → hc_matchfinder.h} +0 -0
  128. /data/ext/deflate_ruby/{libdeflate/lib/ht_matchfinder.h → ht_matchfinder.h} +0 -0
  129. /data/ext/deflate_ruby/{libdeflate/lib/lib_common.h → lib_common.h} +0 -0
  130. /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.c +0 -0
  131. /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.h +0 -0
  132. /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/matchfinder_impl.h +0 -0
  133. /data/ext/deflate_ruby/libdeflate/{lib/riscv → riscv}/matchfinder_impl.h +0 -0
  134. /data/ext/deflate_ruby/libdeflate/{lib/utils.c → utils.c} +0 -0
  135. /data/ext/deflate_ruby/libdeflate/{lib/x86 → x86}/matchfinder_impl.h +0 -0
  136. /data/ext/deflate_ruby/libdeflate/{lib/zlib_compress.c → zlib_compress.c} +0 -0
  137. /data/ext/deflate_ruby/libdeflate/{lib/zlib_constants.h → zlib_constants.h} +0 -0
  138. /data/ext/deflate_ruby/libdeflate/{lib/zlib_decompress.c → zlib_decompress.c} +0 -0
data/ext/deflate_ruby/cpu_features.c:

@@ -88,9 +88,31 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
 
 volatile u32 libdeflate_x86_cpu_features = 0;
 
+static inline bool
+os_supports_avx512(u64 xcr0)
+{
+#ifdef __APPLE__
+	/*
+	 * The Darwin kernel had a bug where it could corrupt the opmask
+	 * registers. See
+	 * https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259
+	 * Darwin also does not initially set the XCR0 bits for AVX512, but they
+	 * are set if the thread tries to use AVX512 anyway.  Thus, to safely
+	 * and consistently use AVX512 on macOS we'd need to check the kernel
+	 * version as well as detect AVX512 support using a macOS-specific
+	 * method.  We don't bother with this, especially given Apple's
+	 * transition to arm64.
+	 */
+	return false;
+#else
+	return (xcr0 & 0xe6) == 0xe6;
+#endif
+}
+
 /*
- * Don't use 512-bit vectors on Intel CPUs before Rocket Lake and Sapphire
- * Rapids, due to the downclocking penalty.
+ * Don't use 512-bit vectors (ZMM registers) on Intel CPUs before Rocket Lake
+ * and Sapphire Rapids, due to the overly-eager downclocking which can reduce
+ * the performance of workloads that use ZMM registers only occasionally.
  */
 static inline bool
 allow_512bit_vectors(const u32 manufacturer[3], u32 family, u32 model)
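Note on the (xcr0 & 0xe6) == 0xe6 test: XCR0 is the register through which the OS declares which register states it saves and restores across context switches, so CPUID feature bits alone are not sufficient. A minimal standalone sketch of the same check (the helper names here are illustrative, and the inline asm assumes GCC/Clang; the gem's own read_xcr() helper may differ):

#include <stdbool.h>
#include <stdint.h>

/*
 * Read XCR0 via the xgetbv instruction.  Only valid to execute if CPUID
 * leaf 1 reported OSXSAVE (ECX bit 27), which the code below checks
 * before calling read_xcr(0).
 */
static uint64_t
read_xcr0(void)
{
	uint32_t lo, hi;

	__asm__ volatile(".byte 0x0f, 0x01, 0xd0" /* xgetbv, ECX=0 */
			 : "=a" (lo), "=d" (hi) : "c" (0));
	return ((uint64_t)hi << 32) | lo;
}

/*
 * 0xe6 covers XCR0 bits 1 (SSE/XMM state), 2 (AVX/YMM state), 5 (AVX-512
 * opmask), 6 (ZMM_Hi256), and 7 (Hi16_ZMM): every state AVX-512 code uses.
 */
static bool
os_saves_avx512_state(void)
{
	return (read_xcr0() & 0xe6) == 0xe6;
}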
@@ -140,7 +162,12 @@ void libdeflate_init_x86_cpu_features(void)
 	family += (a >> 20) & 0xff;
 	if (d & (1 << 26))
 		features |= X86_CPU_FEATURE_SSE2;
-	if (c & (1 << 1))
+	/*
+	 * No known CPUs have pclmulqdq without sse4.1, so in practice code
+	 * targeting pclmulqdq can use sse4.1 instructions.  But to be safe,
+	 * explicitly check for both the pclmulqdq and sse4.1 bits.
+	 */
+	if ((c & (1 << 1)) && (c & (1 << 19)))
 		features |= X86_CPU_FEATURE_PCLMULQDQ;
 	if (c & (1 << 27))
 		xcr0 = read_xcr(0);
@@ -152,21 +179,24 @@ void libdeflate_init_x86_cpu_features(void)
 
 	/* EAX=7, ECX=0: Extended Features */
 	cpuid(7, 0, &a, &b, &c, &d);
-	if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6))
-		features |= X86_CPU_FEATURE_AVX2;
 	if (b & (1 << 8))
 		features |= X86_CPU_FEATURE_BMI2;
-	if (((xcr0 & 0xe6) == 0xe6) &&
-	    allow_512bit_vectors(manufacturer, family, model))
-		features |= X86_CPU_FEATURE_ZMM;
-	if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
-		features |= X86_CPU_FEATURE_AVX512BW;
-	if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
-		features |= X86_CPU_FEATURE_AVX512VL;
-	if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
-		features |= X86_CPU_FEATURE_VPCLMULQDQ;
-	if ((c & (1 << 11)) && ((xcr0 & 0xe6) == 0xe6))
-		features |= X86_CPU_FEATURE_AVX512VNNI;
+	if ((xcr0 & 0x6) == 0x6) {
+		if (b & (1 << 5))
+			features |= X86_CPU_FEATURE_AVX2;
+		if (c & (1 << 10))
+			features |= X86_CPU_FEATURE_VPCLMULQDQ;
+	}
+	if (os_supports_avx512(xcr0)) {
+		if (allow_512bit_vectors(manufacturer, family, model))
+			features |= X86_CPU_FEATURE_ZMM;
+		if (b & (1 << 30))
+			features |= X86_CPU_FEATURE_AVX512BW;
+		if (b & (1U << 31))
+			features |= X86_CPU_FEATURE_AVX512VL;
+		if (c & (1 << 11))
+			features |= X86_CPU_FEATURE_AVX512VNNI;
+	}
 
 	/* EAX=7, ECX=1: Extended Features */
 	cpuid(7, 1, &a, &b, &c, &d);
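The bit positions tested above come from CPUID leaf 1's ECX register: bit 1 is PCLMULQDQ and bit 19 is SSE4.1. A standalone sketch of the same check using GCC/Clang's <cpuid.h> (the function name is illustrative, not taken from the gem):

#include <cpuid.h>
#include <stdbool.h>

static bool
cpu_has_pclmulqdq_with_sse41(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 1: processor info and feature bits. */
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return false;
	/* ECX bit 1 = pclmulqdq, ECX bit 19 = sse4.1. */
	return (ecx & (1 << 1)) && (ecx & (1 << 19));
}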
data/ext/deflate_ruby/cpu_features.h:

@@ -108,7 +108,8 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 # define HAVE_SSE2_NATIVE 0
 #endif
 
-#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__))
+#if (defined(__PCLMUL__) && defined(__SSE4_1__)) || \
+	(defined(_MSC_VER) && defined(__AVX2__))
 # define HAVE_PCLMULQDQ(features) 1
 #else
 # define HAVE_PCLMULQDQ(features) ((features) & X86_CPU_FEATURE_PCLMULQDQ)
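The point of this macro pair is that a build already targeting PCLMULQDQ and SSE4.1 compiles the runtime feature check away entirely. A hedged sketch of how such a macro is typically consumed (the dispatcher below is illustrative, not the gem's actual code; the named functions exist elsewhere in this diff):

#include <stddef.h>
#include <stdint.h>

typedef uint32_t u32;
typedef uint8_t u8;

/* Declarations of functions defined elsewhere in the extension. */
u32 get_x86_cpu_features(void);
u32 crc32_x86_pclmulqdq(u32 crc, const u8 *p, size_t len);
u32 crc32_slice1(u32 crc, const u8 *p, size_t len);

static u32
crc32_dispatch(u32 crc, const u8 *p, size_t len)
{
	/*
	 * When the translation unit is built with -mpclmul -msse4.1,
	 * HAVE_PCLMULQDQ(...) expands to the constant 1 and the compiler
	 * discards both the feature query and the fallback branch.
	 */
	if (HAVE_PCLMULQDQ(get_x86_cpu_features()))
		return crc32_x86_pclmulqdq(crc, p, len);
	return crc32_slice1(crc, p, len);
}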
data/ext/deflate_ruby/crc32_impl.h:

@@ -44,31 +44,26 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 };
 
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
-/* PCLMULQDQ implementation */
+/*
+ * PCLMULQDQ implementation.  This targets PCLMULQDQ+SSE4.1, since in practice
+ * all CPUs that support PCLMULQDQ also support SSE4.1.
+ */
 # define crc32_x86_pclmulqdq crc32_x86_pclmulqdq
 # define SUFFIX _pclmulqdq
-# define ATTRIBUTES _target_attribute("pclmul")
+# define ATTRIBUTES _target_attribute("pclmul,sse4.1")
 # define VL 16
-# define USE_SSE4_1 0
 # define USE_AVX512 0
 # include "crc32_pclmul_template.h"
 
 /*
- * PCLMULQDQ/AVX implementation.  Compared to the regular PCLMULQDQ
- * implementation, this still uses 128-bit vectors, but it has two potential
- * benefits.  First, simply compiling against the AVX target can improve
- * performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) without
- * actually using any AVX intrinsics, probably due to the availability of
- * non-destructive VEX-encoded instructions.  Second, AVX support implies SSSE3
- * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient
- * handling of partial blocks.  (We *could* compile a variant with
- * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we currently don't bother.)
+ * PCLMULQDQ/AVX implementation.  Same as above, but this is compiled with AVX
+ * enabled so that the compiler can generate VEX-coded instructions which can be
+ * slightly more efficient.  It still uses 128-bit vectors.
  */
 # define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx
 # define SUFFIX _pclmulqdq_avx
 # define ATTRIBUTES _target_attribute("pclmul,avx")
 # define VL 16
-# define USE_SSE4_1 1
 # define USE_AVX512 0
 # include "crc32_pclmul_template.h"
 #endif
@@ -83,43 +78,47 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
  *
  * gcc 8.1 and 8.2 had a similar bug where they assumed that
  * _mm256_clmulepi64_epi128() always needed AVX512.  It's fixed in gcc 8.3.
+ *
+ * _mm256_zextsi128_si256() requires gcc 10.
  */
-#if GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000)
+#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000)) && \
+	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
 # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2
 # define SUFFIX _vpclmulqdq_avx2
 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2")
 # define VL 32
-# define USE_SSE4_1 1
 # define USE_AVX512 0
 # include "crc32_pclmul_template.h"
 #endif
 
-#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)
+#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
+	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
 /*
  * VPCLMULQDQ/AVX512 implementation using 256-bit vectors.  This is very similar
  * to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog
- * instruction and more registers.  This is used on CPUs that support AVX-512
- * but where using 512-bit vectors causes downclocking.  This should also be the
- * optimal implementation on CPUs that support AVX10/256 but not AVX10/512.
+ * instruction and more registers.  This is used on certain older Intel CPUs,
+ * specifically Ice Lake and Tiger Lake, which support VPCLMULQDQ and AVX512 but
+ * downclock a bit too eagerly when ZMM registers are used.
+ *
+ * _mm256_zextsi128_si256() requires gcc 10.
  */
 # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256
 # define SUFFIX _vpclmulqdq_avx512_vl256
 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
 # define VL 32
-# define USE_SSE4_1 1
 # define USE_AVX512 1
 # include "crc32_pclmul_template.h"
 
 /*
  * VPCLMULQDQ/AVX512 implementation using 512-bit vectors.  This is used on CPUs
- * that have a good AVX-512 implementation including VPCLMULQDQ.  This should
- * also be the optimal implementation on CPUs that support AVX10/512.
+ * that have a good AVX-512 implementation including VPCLMULQDQ.
+ *
+ * _mm512_zextsi128_si512() requires gcc 10.
  */
 # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512
 # define SUFFIX _vpclmulqdq_avx512_vl512
 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
 # define VL 64
-# define USE_SSE4_1 1
 # define USE_AVX512 1
 # include "crc32_pclmul_template.h"
 #endif
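All of these variants rely on per-function target attributes (the values passed as ATTRIBUTES above): GCC and Clang allow an individual function to be compiled for a higher ISA level than the rest of the translation unit, with the runtime dispatcher choosing among the variants. A self-contained sketch of the underlying mechanism (the function name is illustrative):

#include <stdint.h>
#include <immintrin.h>

/*
 * Compiled for pclmul+sse4.1 even if the file as a whole targets baseline
 * x86-64.  Callers must verify CPU support first, as the cpu_features.c
 * changes above do.
 */
__attribute__((target("pclmul,sse4.1")))
static uint64_t
clmul_low64(uint64_t a, uint64_t b)
{
	__m128i r = _mm_clmulepi64_si128(_mm_cvtsi64_si128((long long)a),
					 _mm_cvtsi64_si128((long long)b),
					 0x00);
	/* The carryless product of two 64-bit polynomials is up to 127 bits;
	 * return the low 64 bits. */
	return (uint64_t)_mm_cvtsi128_si64(r);
}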
data/ext/deflate_ruby/crc32_multipliers.h:

@@ -1,7 +1,7 @@
 /*
  * crc32_multipliers.h - constants for CRC-32 folding
  *
- * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c.  DO NOT EDIT.
+ * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.
  */
 
 #define CRC32_X159_MODG 0xae689191 /* x^159 mod G(x) */

@@ -100,10 +100,8 @@
 #define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */
 #define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */
 
-#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */
-#define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
+#define CRC32_BARRETT_CONSTANT_1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)) */
 #define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
-#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }
 
 #define CRC32_NUM_CHUNKS 4
 #define CRC32_MIN_VARIABLE_CHUNK_LEN 128UL
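Aside: this constant upgrade pairs with the rewritten final reduction in crc32_pclmul_template.h later in this diff. Version 1.0.1 folded 128 => 96 => 64 bits (using x^63 mod G(x)) and then did a 64 => 32 bit Barrett reduction with floor(x^64 / G(x)); version 1.0.2 keeps a wider remainder and uses floor(x^95 / G(x)) directly, so the extra folding steps and the CRC32_X63_MODG constant disappear. The Barrett identity itself, as spelled out by the comment this diff removes from crc32_pclmul_template.h, for a remaining message M(x) = A(x)*x^32 + B(x) over GF(2):

\[
q(x) \;=\; \left\lfloor \frac{A(x)\,x^{32}}{G(x)} \right\rfloor
      \;=\; \left\lfloor \frac{A(x)\cdot\left\lfloor x^{64}/G(x)\right\rfloor}{x^{32}} \right\rfloor,
\qquad
R(x) \;=\; M(x) \bmod G(x) \;=\; B(x) + \bigl(q(x)\,G(x) \bmod x^{32}\bigr)
\]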
data/ext/deflate_ruby/crc32_pclmul_template.h:

@@ -34,17 +34,13 @@
  * ATTRIBUTES:
  *	Target function attributes to use.  Must satisfy the dependencies of the
  *	other parameters as follows:
- *	   VL=16 && USE_SSE4_1=0 && USE_AVX512=0: at least pclmul
- *	   VL=16 && USE_SSE4_1=1 && USE_AVX512=0: at least pclmul,sse4.1
- *	   VL=32 && USE_SSE4_1=1 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
- *	   VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
- *	   VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ *	   VL=16 && USE_AVX512=0: at least pclmul,sse4.1
+ *	   VL=32 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
+ *	   VL=32 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ *	   VL=64 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
  *	(Other combinations are not useful and have not been tested.)
  * VL:
  *	Vector length in bytes.  Must be 16, 32, or 64.
- * USE_SSE4_1:
- *	If 1, take advantage of SSE4.1 instructions such as pblendvb.
- *	If 0, assume that the CPU might not support SSE4.1.
 * USE_AVX512:
  *	If 1, take advantage of AVX-512 features such as masking and the
  *	vpternlog instruction.  This doesn't enable the use of 512-bit vectors;
@@ -55,7 +51,10 @@
  * instructions.  Note that the x86 crc32 instruction cannot be used, as it is
  * for a different polynomial, not the gzip one.  For an explanation of CRC
  * folding with carryless multiplication instructions, see
- * scripts/gen_crc32_multipliers.c and the following paper:
+ * scripts/gen-crc32-consts.py and the following blog posts and papers:
+ *
+ * "An alternative exposition of crc32_4k_pclmulqdq"
+ * https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
  *
  * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
@@ -81,7 +80,7 @@
 # define fold_vec fold_vec256
 # define VLOADU(p) _mm256_loadu_si256((const void *)(p))
 # define VXOR(a, b) _mm256_xor_si256((a), (b))
-# define M128I_TO_VEC(a) _mm256_castsi128_si256(a)
+# define M128I_TO_VEC(a) _mm256_zextsi128_si256(a)
 # define MULTS(a, b) _mm256_set_epi64x(a, b, a, b)
 # define MULTS_8V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
 # define MULTS_4V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)

@@ -92,7 +91,7 @@
 # define fold_vec fold_vec512
 # define VLOADU(p) _mm512_loadu_si512((const void *)(p))
 # define VXOR(a, b) _mm512_xor_si512((a), (b))
-# define M128I_TO_VEC(a) _mm512_castsi128_si512(a)
+# define M128I_TO_VEC(a) _mm512_zextsi128_si512(a)
 # define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b)
 # define MULTS_8V MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG)
 # define MULTS_4V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
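The cast-to-zext change matters because _mm256_castsi128_si256() leaves the upper 128 bits of the result undefined, whereas _mm256_zextsi128_si256() guarantees they are zero; the folding math XORs the widened value into freshly loaded data, so it assumes the upper lanes are zero. It also explains the GCC version bump to 10 noted in crc32_impl.h above. A minimal illustration (illustrative helper names; the zext intrinsics need gcc >= 10 or a recent clang):

#include <immintrin.h>

__attribute__((target("avx2")))
static __m256i
widen_cast(__m128i x)
{
	/* Upper 128 bits are undefined: cheap, but unsafe whenever the
	 * surrounding math assumes zeros there. */
	return _mm256_castsi128_si256(x);
}

__attribute__((target("avx2")))
static __m256i
widen_zext(__m128i x)
{
	/* Upper 128 bits are guaranteed zero. */
	return _mm256_zextsi128_si256(x);
}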
@@ -149,7 +148,6 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults)
 #define fold_vec512 ADD_SUFFIX(fold_vec512)
 #endif /* VL >= 64 */
 
-#if USE_SSE4_1
 /*
  * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to
  * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and

@@ -181,7 +179,6 @@ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
 	return fold_vec128(x0, x1, mults_128b);
 }
 #define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes)
-#endif /* USE_SSE4_1 */
 
 static ATTRIBUTES u32
 ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
@@ -192,15 +189,13 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 	 * folding across 128 bits.  mults_128b differs from mults_1v when
 	 * VL != 16.  All multipliers are 64-bit, to match what pclmulqdq needs,
 	 * but since this is for CRC-32 only their low 32 bits are nonzero.
-	 * For more details, see scripts/gen_crc32_multipliers.c.
+	 * For more details, see scripts/gen-crc32-consts.py.
 	 */
 	const vec_t mults_8v = MULTS_8V;
 	const vec_t mults_4v = MULTS_4V;
 	const vec_t mults_2v = MULTS_2V;
 	const vec_t mults_1v = MULTS_1V;
 	const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
-	const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG);
-	const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
 	const __m128i barrett_reduction_constants =
 		_mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
 	vec_t v0, v1, v2, v3, v4, v5, v6, v7;
@@ -273,7 +268,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 		size_t align = -(uintptr_t)p & (VL-1);
 
 		len -= align;
-#if USE_SSE4_1
 		x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
 		p += 16;
 		if (align & 15) {

@@ -296,11 +290,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 		v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
 # endif
 		p -= 16;
-#else
-		crc = crc32_slice1(crc, p, align);
-		p += align;
-		v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
-#endif
 	} else {
 		v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
 	}
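Aside: in the context lines above, -(uintptr_t)p & (VL - 1) is the standard power-of-two alignment idiom; it yields the number of bytes from p to the next VL-byte boundary (0 if p is already aligned). A quick standalone illustration:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	for (uintptr_t addr = 64; addr < 72; addr++) {
		/* Bytes from 'addr' up to the next 64-byte boundary. */
		size_t align = (size_t)(-addr & (64 - 1));

		printf("offset %2ju -> align %2zu\n",
		       (uintmax_t)(addr % 64), align);
	}
	return 0;  /* prints align 0, 63, 62, ..., 57 */
}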
@@ -395,86 +384,27 @@ less_than_vl_remaining:
 less_than_16_remaining:
 	len &= 15;
 
-	/*
-	 * If fold_lessthan16bytes() is available, handle any remainder
-	 * of 1 to 15 bytes now, before reducing to 32 bits.
-	 */
-#if USE_SSE4_1
+	/* Handle any remainder of 1 to 15 bytes. */
 	if (len)
 		x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
-#endif
 #if USE_AVX512
 reduce_x0:
 #endif
-
-	/*
-	 * Fold 128 => 96 bits.  This also implicitly appends 32 zero bits,
-	 * which is equivalent to multiplying by x^32.  This is needed because
-	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
-	 */
-	x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
-			   _mm_clmulepi64_si128(x0, mults_128b, 0x10));
-
-	/* Fold 96 => 64 bits. */
-	x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
-			   _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
-						final_mult, 0x00));
-
 	/*
-	 * Reduce 64 => 32 bits using Barrett reduction.
-	 *
-	 * Let M(x) = A(x)*x^32 + B(x) be the remaining message.  The goal is to
-	 * compute R(x) = M(x) mod G(x).  Since degree(B(x)) < degree(G(x)):
-	 *
-	 *	R(x) = (A(x)*x^32 + B(x)) mod G(x)
-	 *	     = (A(x)*x^32) mod G(x) + B(x)
-	 *
-	 * Then, by the Division Algorithm there exists a unique q(x) such that:
+	 * Multiply the remaining 128-bit message polynomial 'x0' by x^32, then
+	 * reduce it modulo the generator polynomial G.  This gives the CRC.
 	 *
-	 *	A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
-	 *
-	 * Since the left-hand side is of maximum degree 31, the right-hand side
-	 * must be too.  This implies that we can apply 'mod x^32' to the
-	 * right-hand side without changing its value:
-	 *
-	 *	(A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
-	 *
-	 * Note that '+' is equivalent to '-' in polynomials over GF(2).
-	 *
-	 * We also know that:
-	 *
-	 *	              / A(x)*x^32 \
-	 *	q(x) = floor ( ----------- )
-	 *	              \    G(x)   /
-	 *
-	 * To compute this efficiently, we can multiply the top and bottom by
-	 * x^32 and move the division by G(x) to the top:
-	 *
-	 *	              / A(x) * floor(x^64 / G(x)) \
-	 *	q(x) = floor ( --------------------------- )
-	 *	              \           x^32            /
-	 *
-	 * Note that floor(x^64 / G(x)) is a constant.
-	 *
-	 * So finally we have:
-	 *
-	 *	                          / A(x) * floor(x^64 / G(x)) \
-	 *	R(x) = B(x) + G(x)*floor ( --------------------------- )
-	 *	                          \           x^32            /
+	 * This implementation matches that used in crc-pclmul-template.S from
+	 * https://lore.kernel.org/r/20250210174540.161705-4-ebiggers@kernel.org/
+	 * with the parameters n=32 and LSB_CRC=1 (what the gzip CRC uses).  See
+	 * there for a detailed explanation of the math used here.
 	 */
-	x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
-				  barrett_reduction_constants, 0x00);
-	x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
-				  barrett_reduction_constants, 0x10);
+	x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, mults_128b, 0x10),
+			   _mm_bsrli_si128(x0, 8));
+	x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
+	x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);
 	x0 = _mm_xor_si128(x0, x1);
-#if USE_SSE4_1
-	crc = _mm_extract_epi32(x0, 1);
-#else
-	crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01));
-	/* Process up to 15 bytes left over at the end. */
-	crc = crc32_slice1(crc, p, len);
-#endif
-	return crc;
+	return _mm_extract_epi32(x0, 2);
 }
 
 #undef vec_t
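The new tail implements, in a handful of carryless multiplies, exactly what the gzip CRC definition requires: multiply the remaining message polynomial by x^32 and reduce modulo G. A slow bit-at-a-time reference implementation of the same checksum is handy for validating the vectorized paths (a hedged sketch; the gem's actual generic fallback, crc32_slice1(), is table-based):

#include <stddef.h>
#include <stdint.h>

/*
 * Bitwise gzip CRC-32.  0xedb88320 is the usual right-shift form of the
 * bit-reflected generator polynomial, which appears in the constants
 * above as 0x1db710641 (i.e. (0xedb88320 << 1) | 1).
 */
static uint32_t
crc32_bitwise(uint32_t crc, const uint8_t *p, size_t len)
{
	crc = ~crc;
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320 : 0);
	}
	return ~crc;
}

/* Sanity check: crc32_bitwise(0, (const uint8_t *)"123456789", 9)
 * should return 0xcbf43926, the standard CRC-32 check value. */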
@@ -491,5 +421,4 @@ reduce_x0:
 #undef SUFFIX
 #undef ATTRIBUTES
 #undef VL
-#undef USE_SSE4_1
 #undef USE_AVX512
data/ext/deflate_ruby/crc32_tables.h:

@@ -1,7 +1,7 @@
 /*
  * crc32_tables.h - data tables for CRC-32 computation
  *
- * THIS FILE WAS GENERATED BY gen_crc32_tables.c.  DO NOT EDIT.
+ * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.
  */
 
 static const u32 crc32_slice1_table[] MAYBE_UNUSED = {