deflate-ruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +138 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +117 -0
  5. data/ext/deflate_ruby/deflate_ruby.c +301 -0
  6. data/ext/deflate_ruby/extconf.rb +34 -0
  7. data/ext/deflate_ruby/libdeflate/CMakeLists.txt +270 -0
  8. data/ext/deflate_ruby/libdeflate/COPYING +22 -0
  9. data/ext/deflate_ruby/libdeflate/NEWS.md +494 -0
  10. data/ext/deflate_ruby/libdeflate/README.md +228 -0
  11. data/ext/deflate_ruby/libdeflate/common_defs.h +747 -0
  12. data/ext/deflate_ruby/libdeflate/lib/adler32.c +162 -0
  13. data/ext/deflate_ruby/libdeflate/lib/arm/adler32_impl.h +358 -0
  14. data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.c +230 -0
  15. data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.h +214 -0
  16. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_impl.h +600 -0
  17. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_helpers.h +156 -0
  18. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_wide.h +226 -0
  19. data/ext/deflate_ruby/libdeflate/lib/arm/matchfinder_impl.h +78 -0
  20. data/ext/deflate_ruby/libdeflate/lib/bt_matchfinder.h +342 -0
  21. data/ext/deflate_ruby/libdeflate/lib/cpu_features_common.h +93 -0
  22. data/ext/deflate_ruby/libdeflate/lib/crc32.c +262 -0
  23. data/ext/deflate_ruby/libdeflate/lib/crc32_multipliers.h +377 -0
  24. data/ext/deflate_ruby/libdeflate/lib/crc32_tables.h +587 -0
  25. data/ext/deflate_ruby/libdeflate/lib/decompress_template.h +777 -0
  26. data/ext/deflate_ruby/libdeflate/lib/deflate_compress.c +4129 -0
  27. data/ext/deflate_ruby/libdeflate/lib/deflate_compress.h +15 -0
  28. data/ext/deflate_ruby/libdeflate/lib/deflate_constants.h +56 -0
  29. data/ext/deflate_ruby/libdeflate/lib/deflate_decompress.c +1208 -0
  30. data/ext/deflate_ruby/libdeflate/lib/gzip_compress.c +90 -0
  31. data/ext/deflate_ruby/libdeflate/lib/gzip_constants.h +45 -0
  32. data/ext/deflate_ruby/libdeflate/lib/gzip_decompress.c +144 -0
  33. data/ext/deflate_ruby/libdeflate/lib/hc_matchfinder.h +401 -0
  34. data/ext/deflate_ruby/libdeflate/lib/ht_matchfinder.h +234 -0
  35. data/ext/deflate_ruby/libdeflate/lib/lib_common.h +106 -0
  36. data/ext/deflate_ruby/libdeflate/lib/matchfinder_common.h +224 -0
  37. data/ext/deflate_ruby/libdeflate/lib/riscv/matchfinder_impl.h +97 -0
  38. data/ext/deflate_ruby/libdeflate/lib/utils.c +141 -0
  39. data/ext/deflate_ruby/libdeflate/lib/x86/adler32_impl.h +134 -0
  40. data/ext/deflate_ruby/libdeflate/lib/x86/adler32_template.h +518 -0
  41. data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.c +183 -0
  42. data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.h +169 -0
  43. data/ext/deflate_ruby/libdeflate/lib/x86/crc32_impl.h +160 -0
  44. data/ext/deflate_ruby/libdeflate/lib/x86/crc32_pclmul_template.h +495 -0
  45. data/ext/deflate_ruby/libdeflate/lib/x86/decompress_impl.h +57 -0
  46. data/ext/deflate_ruby/libdeflate/lib/x86/matchfinder_impl.h +122 -0
  47. data/ext/deflate_ruby/libdeflate/lib/zlib_compress.c +82 -0
  48. data/ext/deflate_ruby/libdeflate/lib/zlib_constants.h +21 -0
  49. data/ext/deflate_ruby/libdeflate/lib/zlib_decompress.c +104 -0
  50. data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +3 -0
  51. data/ext/deflate_ruby/libdeflate/libdeflate.h +411 -0
  52. data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +18 -0
  53. data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +105 -0
  54. data/ext/deflate_ruby/libdeflate/programs/benchmark.c +696 -0
  55. data/ext/deflate_ruby/libdeflate/programs/checksum.c +218 -0
  56. data/ext/deflate_ruby/libdeflate/programs/config.h.in +19 -0
  57. data/ext/deflate_ruby/libdeflate/programs/gzip.c +688 -0
  58. data/ext/deflate_ruby/libdeflate/programs/prog_util.c +521 -0
  59. data/ext/deflate_ruby/libdeflate/programs/prog_util.h +225 -0
  60. data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +200 -0
  61. data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +155 -0
  62. data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +385 -0
  63. data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +130 -0
  64. data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +72 -0
  65. data/ext/deflate_ruby/libdeflate/programs/test_overread.c +95 -0
  66. data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +472 -0
  67. data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +151 -0
  68. data/ext/deflate_ruby/libdeflate/programs/test_util.c +237 -0
  69. data/ext/deflate_ruby/libdeflate/programs/test_util.h +61 -0
  70. data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +118 -0
  71. data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +118 -0
  72. data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +69 -0
  73. data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +10 -0
  74. data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +10 -0
  75. data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +253 -0
  76. data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +17 -0
  77. data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +119 -0
  78. data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +38 -0
  79. data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +37 -0
  80. data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +19 -0
  81. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +199 -0
  82. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +105 -0
  83. data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +44 -0
  84. data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +29 -0
  85. data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +523 -0
  86. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
  87. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +95 -0
  88. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +3 -0
  89. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +62 -0
  90. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +108 -0
  91. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
  92. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +19 -0
  93. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +3 -0
  94. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +19 -0
  95. data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +416 -0
  96. data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +8 -0
  97. data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +8 -0
  98. data/lib/deflate_ruby/version.rb +5 -0
  99. data/lib/deflate_ruby.rb +71 -0
  100. metadata +191 -0
data/ext/deflate_ruby/libdeflate/lib/arm/crc32_impl.h
@@ -0,0 +1,600 @@
+ /*
+  * arm/crc32_impl.h - ARM implementations of the gzip CRC-32 algorithm
+  *
+  * Copyright 2022 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+ #ifndef LIB_ARM_CRC32_IMPL_H
+ #define LIB_ARM_CRC32_IMPL_H
+
+ #include "cpu_features.h"
+
+ /*
+  * crc32_arm_crc() - implementation using crc32 instructions (only)
+  *
+  * In general this implementation is straightforward. However, naive use of the
+  * crc32 instructions is serial: one of the two inputs to each crc32 instruction
+  * is the output of the previous one. To take advantage of CPUs that can
+  * execute multiple crc32 instructions in parallel, when possible we interleave
+  * the checksumming of several adjacent chunks, then combine their CRCs.
+  *
+  * However, without pmull, combining CRCs is fairly slow. So in this pmull-less
+  * version, we only use a large chunk length, and thus we only do chunked
+  * processing if there is a lot of data to checksum. This also means that a
+  * variable chunk length wouldn't help much, so we just support a fixed length.
+  */
+ #if HAVE_CRC32_INTRIN
+ # ifdef __clang__
+ # define ATTRIBUTES _target_attribute("crc")
+ # else
+ # define ATTRIBUTES _target_attribute("+crc")
+ # endif
+
+ /*
+  * Combine the CRCs for 4 adjacent chunks of length L = CRC32_FIXED_CHUNK_LEN
+  * bytes each by computing:
+  *
+  *     [ crc0*x^(3*8*L) + crc1*x^(2*8*L) + crc2*x^(1*8*L) + crc3 ] mod G(x)
+  *
+  * This has been optimized in several ways:
+  *
+  *    - The needed multipliers (x to some power, reduced mod G(x)) were
+  *      precomputed.
+  *
+  *    - The 3 multiplications are interleaved.
+  *
+  *    - The reduction mod G(x) is delayed to the end and done using __crc32d.
+  *      Note that the use of __crc32d introduces an extra factor of x^32. To
+  *      cancel that out along with the extra factor of x^1 that gets introduced
+  *      because of how the 63-bit products are aligned in their 64-bit integers,
+  *      the multipliers are actually x^(j*8*L - 33) instead of x^(j*8*L).
+  */
+ static forceinline ATTRIBUTES u32
+ combine_crcs_slow(u32 crc0, u32 crc1, u32 crc2, u32 crc3)
+ {
+         u64 res0 = 0, res1 = 0, res2 = 0;
+         int i;
+
+         /* Multiply crc{0,1,2} by CRC32_FIXED_CHUNK_MULT_{3,2,1}. */
+         for (i = 0; i < 32; i++) {
+                 if (CRC32_FIXED_CHUNK_MULT_3 & (1U << i))
+                         res0 ^= (u64)crc0 << i;
+                 if (CRC32_FIXED_CHUNK_MULT_2 & (1U << i))
+                         res1 ^= (u64)crc1 << i;
+                 if (CRC32_FIXED_CHUNK_MULT_1 & (1U << i))
+                         res2 ^= (u64)crc2 << i;
+         }
+         /* Add the different parts and reduce mod G(x). */
+         return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3;
+ }
+
+ #define crc32_arm_crc crc32_arm_crc
+ static ATTRIBUTES u32
+ crc32_arm_crc(u32 crc, const u8 *p, size_t len)
+ {
+         if (len >= 64) {
+                 const size_t align = -(uintptr_t)p & 7;
+
+                 /* Align p to the next 8-byte boundary. */
+                 if (align) {
+                         if (align & 1)
+                                 crc = __crc32b(crc, *p++);
+                         if (align & 2) {
+                                 crc = __crc32h(crc, le16_bswap(*(u16 *)p));
+                                 p += 2;
+                         }
+                         if (align & 4) {
+                                 crc = __crc32w(crc, le32_bswap(*(u32 *)p));
+                                 p += 4;
+                         }
+                         len -= align;
+                 }
+                 /*
+                  * Interleave the processing of multiple adjacent data chunks to
+                  * take advantage of instruction-level parallelism.
+                  *
+                  * Some CPUs don't prefetch the data if it's being fetched in
+                  * multiple interleaved streams, so do explicit prefetching.
+                  */
+                 while (len >= CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN) {
+                         const u64 *wp0 = (const u64 *)p;
+                         const u64 * const wp0_end =
+                                 (const u64 *)(p + CRC32_FIXED_CHUNK_LEN);
+                         u32 crc1 = 0, crc2 = 0, crc3 = 0;
+
+                         STATIC_ASSERT(CRC32_NUM_CHUNKS == 4);
+                         STATIC_ASSERT(CRC32_FIXED_CHUNK_LEN % (4 * 8) == 0);
+                         do {
+                                 prefetchr(&wp0[64 + 0*CRC32_FIXED_CHUNK_LEN/8]);
+                                 prefetchr(&wp0[64 + 1*CRC32_FIXED_CHUNK_LEN/8]);
+                                 prefetchr(&wp0[64 + 2*CRC32_FIXED_CHUNK_LEN/8]);
+                                 prefetchr(&wp0[64 + 3*CRC32_FIXED_CHUNK_LEN/8]);
+                                 crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8]));
+                                 crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8]));
+                                 crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8]));
+                                 crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8]));
+                                 wp0++;
+                                 crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8]));
+                                 crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8]));
+                                 crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8]));
+                                 crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8]));
+                                 wp0++;
+                                 crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8]));
+                                 crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8]));
+                                 crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8]));
+                                 crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8]));
+                                 wp0++;
+                                 crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8]));
+                                 crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8]));
+                                 crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8]));
+                                 crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8]));
+                                 wp0++;
+                         } while (wp0 != wp0_end);
+                         crc = combine_crcs_slow(crc, crc1, crc2, crc3);
+                         p += CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN;
+                         len -= CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN;
+                 }
+                 /*
+                  * Due to the large fixed chunk length used above, there might
+                  * still be a lot of data left. So use a 64-byte loop here,
+                  * instead of a loop that is less unrolled.
+                  */
+                 while (len >= 64) {
+                         crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 0)));
+                         crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 8)));
+                         crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 16)));
+                         crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 24)));
+                         crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 32)));
+                         crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 40)));
+                         crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 48)));
+                         crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 56)));
+                         p += 64;
+                         len -= 64;
+                 }
+         }
+         if (len & 32) {
+                 crc = __crc32d(crc, get_unaligned_le64(p + 0));
+                 crc = __crc32d(crc, get_unaligned_le64(p + 8));
+                 crc = __crc32d(crc, get_unaligned_le64(p + 16));
+                 crc = __crc32d(crc, get_unaligned_le64(p + 24));
+                 p += 32;
+         }
+         if (len & 16) {
+                 crc = __crc32d(crc, get_unaligned_le64(p + 0));
+                 crc = __crc32d(crc, get_unaligned_le64(p + 8));
+                 p += 16;
+         }
+         if (len & 8) {
+                 crc = __crc32d(crc, get_unaligned_le64(p));
+                 p += 8;
+         }
+         if (len & 4) {
+                 crc = __crc32w(crc, get_unaligned_le32(p));
+                 p += 4;
+         }
+         if (len & 2) {
+                 crc = __crc32h(crc, get_unaligned_le16(p));
+                 p += 2;
+         }
+         if (len & 1)
+                 crc = __crc32b(crc, *p);
+         return crc;
+ }
+ #undef ATTRIBUTES
+ #endif /* crc32_arm_crc() */
+
+ /*
+  * crc32_arm_crc_pmullcombine() - implementation using crc32 instructions, plus
+  * pmull instructions for CRC combining
+  *
+  * This is similar to crc32_arm_crc(), but it enables the use of pmull
+  * (carryless multiplication) instructions for the steps where the CRCs of
+  * adjacent data chunks are combined. As this greatly speeds up CRC
+  * combination, this implementation also differs from crc32_arm_crc() in that it
+  * uses a variable chunk length which can get fairly small. The precomputed
+  * multipliers needed for the selected chunk length are loaded from a table.
+  *
+  * Note that pmull is used here only for combining the CRCs of separately
+  * checksummed chunks, not for folding the data itself. See crc32_arm_pmull*()
+  * for implementations that use pmull for folding the data itself.
+  */
+ #if HAVE_CRC32_INTRIN && HAVE_PMULL_INTRIN
+ # ifdef __clang__
+ # define ATTRIBUTES _target_attribute("crc,aes")
+ # else
+ # define ATTRIBUTES _target_attribute("+crc,+crypto")
+ # endif
+
+ /* Do carryless multiplication of two 32-bit values. */
+ static forceinline ATTRIBUTES u64
+ clmul_u32(u32 a, u32 b)
+ {
+         uint64x2_t res = vreinterpretq_u64_p128(
+                         compat_vmull_p64((poly64_t)a, (poly64_t)b));
+
+         return vgetq_lane_u64(res, 0);
+ }
+
+ /*
+  * Like combine_crcs_slow(), but uses vmull_p64 to do the multiplications more
+  * quickly, and supports a variable chunk length. The chunk length is
+  * 'i * CRC32_MIN_VARIABLE_CHUNK_LEN'
+  * where 1 <= i < ARRAY_LEN(crc32_mults_for_chunklen).
+  */
+ static forceinline ATTRIBUTES u32
+ combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i)
+ {
+         u64 res0 = clmul_u32(crc0, crc32_mults_for_chunklen[i][0]);
+         u64 res1 = clmul_u32(crc1, crc32_mults_for_chunklen[i][1]);
+         u64 res2 = clmul_u32(crc2, crc32_mults_for_chunklen[i][2]);
+
+         return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3;
+ }
+
+ #define crc32_arm_crc_pmullcombine crc32_arm_crc_pmullcombine
+ static ATTRIBUTES u32
+ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len)
+ {
+         const size_t align = -(uintptr_t)p & 7;
+
+         if (len >= align + CRC32_NUM_CHUNKS * CRC32_MIN_VARIABLE_CHUNK_LEN) {
+                 /* Align p to the next 8-byte boundary. */
+                 if (align) {
+                         if (align & 1)
+                                 crc = __crc32b(crc, *p++);
+                         if (align & 2) {
+                                 crc = __crc32h(crc, le16_bswap(*(u16 *)p));
+                                 p += 2;
+                         }
+                         if (align & 4) {
+                                 crc = __crc32w(crc, le32_bswap(*(u32 *)p));
+                                 p += 4;
+                         }
+                         len -= align;
+                 }
+                 /*
+                  * Handle CRC32_MAX_VARIABLE_CHUNK_LEN specially, so that better
+                  * code is generated for it.
+                  */
+                 while (len >= CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN) {
+                         const u64 *wp0 = (const u64 *)p;
+                         const u64 * const wp0_end =
+                                 (const u64 *)(p + CRC32_MAX_VARIABLE_CHUNK_LEN);
+                         u32 crc1 = 0, crc2 = 0, crc3 = 0;
+
+                         STATIC_ASSERT(CRC32_NUM_CHUNKS == 4);
+                         STATIC_ASSERT(CRC32_MAX_VARIABLE_CHUNK_LEN % (4 * 8) == 0);
+                         do {
+                                 prefetchr(&wp0[64 + 0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]);
+                                 prefetchr(&wp0[64 + 1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]);
+                                 prefetchr(&wp0[64 + 2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]);
+                                 prefetchr(&wp0[64 + 3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]);
+                                 crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 wp0++;
+                                 crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 wp0++;
+                                 crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 wp0++;
+                                 crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+                                 wp0++;
+                         } while (wp0 != wp0_end);
+                         crc = combine_crcs_fast(crc, crc1, crc2, crc3,
+                                 ARRAY_LEN(crc32_mults_for_chunklen) - 1);
+                         p += CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN;
+                         len -= CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN;
+                 }
+                 /* Handle up to one variable-length chunk. */
+                 if (len >= CRC32_NUM_CHUNKS * CRC32_MIN_VARIABLE_CHUNK_LEN) {
+                         const size_t i = len / (CRC32_NUM_CHUNKS *
+                                 CRC32_MIN_VARIABLE_CHUNK_LEN);
+                         const size_t chunk_len =
+                                 i * CRC32_MIN_VARIABLE_CHUNK_LEN;
+                         const u64 *wp0 = (const u64 *)(p + 0*chunk_len);
+                         const u64 *wp1 = (const u64 *)(p + 1*chunk_len);
+                         const u64 *wp2 = (const u64 *)(p + 2*chunk_len);
+                         const u64 *wp3 = (const u64 *)(p + 3*chunk_len);
+                         const u64 * const wp0_end = wp1;
+                         u32 crc1 = 0, crc2 = 0, crc3 = 0;
+
+                         STATIC_ASSERT(CRC32_NUM_CHUNKS == 4);
+                         STATIC_ASSERT(CRC32_MIN_VARIABLE_CHUNK_LEN % (4 * 8) == 0);
+                         do {
+                                 prefetchr(wp0 + 64);
+                                 prefetchr(wp1 + 64);
+                                 prefetchr(wp2 + 64);
+                                 prefetchr(wp3 + 64);
+                                 crc = __crc32d(crc, le64_bswap(*wp0++));
+                                 crc1 = __crc32d(crc1, le64_bswap(*wp1++));
+                                 crc2 = __crc32d(crc2, le64_bswap(*wp2++));
+                                 crc3 = __crc32d(crc3, le64_bswap(*wp3++));
+                                 crc = __crc32d(crc, le64_bswap(*wp0++));
+                                 crc1 = __crc32d(crc1, le64_bswap(*wp1++));
+                                 crc2 = __crc32d(crc2, le64_bswap(*wp2++));
+                                 crc3 = __crc32d(crc3, le64_bswap(*wp3++));
+                                 crc = __crc32d(crc, le64_bswap(*wp0++));
+                                 crc1 = __crc32d(crc1, le64_bswap(*wp1++));
+                                 crc2 = __crc32d(crc2, le64_bswap(*wp2++));
+                                 crc3 = __crc32d(crc3, le64_bswap(*wp3++));
+                                 crc = __crc32d(crc, le64_bswap(*wp0++));
+                                 crc1 = __crc32d(crc1, le64_bswap(*wp1++));
+                                 crc2 = __crc32d(crc2, le64_bswap(*wp2++));
+                                 crc3 = __crc32d(crc3, le64_bswap(*wp3++));
+                         } while (wp0 != wp0_end);
+                         crc = combine_crcs_fast(crc, crc1, crc2, crc3, i);
+                         p += CRC32_NUM_CHUNKS * chunk_len;
+                         len -= CRC32_NUM_CHUNKS * chunk_len;
+                 }
+
+                 while (len >= 32) {
+                         crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 0)));
+                         crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 8)));
+                         crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 16)));
+                         crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 24)));
+                         p += 32;
+                         len -= 32;
+                 }
+         } else {
+                 while (len >= 32) {
+                         crc = __crc32d(crc, get_unaligned_le64(p + 0));
+                         crc = __crc32d(crc, get_unaligned_le64(p + 8));
+                         crc = __crc32d(crc, get_unaligned_le64(p + 16));
+                         crc = __crc32d(crc, get_unaligned_le64(p + 24));
+                         p += 32;
+                         len -= 32;
+                 }
+         }
+         if (len & 16) {
+                 crc = __crc32d(crc, get_unaligned_le64(p + 0));
+                 crc = __crc32d(crc, get_unaligned_le64(p + 8));
+                 p += 16;
+         }
+         if (len & 8) {
+                 crc = __crc32d(crc, get_unaligned_le64(p));
+                 p += 8;
+         }
+         if (len & 4) {
+                 crc = __crc32w(crc, get_unaligned_le32(p));
+                 p += 4;
+         }
+         if (len & 2) {
+                 crc = __crc32h(crc, get_unaligned_le16(p));
+                 p += 2;
+         }
+         if (len & 1)
+                 crc = __crc32b(crc, *p);
+         return crc;
+ }
+ #undef ATTRIBUTES
+ #endif /* crc32_arm_crc_pmullcombine() */
+
+ /*
+  * crc32_arm_pmullx4() - implementation using "folding" with pmull instructions
+  *
+  * This implementation is intended for CPUs that support pmull instructions but
+  * not crc32 instructions.
+  */
+ #if HAVE_PMULL_INTRIN
+ # define crc32_arm_pmullx4 crc32_arm_pmullx4
+ # define SUFFIX _pmullx4
+ # ifdef __clang__
+ /*
+  * This used to use "crypto", but that stopped working with clang 16.
+  * Now only "aes" works. "aes" works with older versions too, so use
+  * that. No "+" prefix; clang 15 and earlier doesn't accept that.
+  */
+ # define ATTRIBUTES _target_attribute("aes")
+ # else
+ /*
+  * With gcc, only "+crypto" works. Both the "+" prefix and the
+  * "crypto" (not "aes") are essential...
+  */
+ # define ATTRIBUTES _target_attribute("+crypto")
+ # endif
+ # define ENABLE_EOR3 0
+ # include "crc32_pmull_helpers.h"
+
+ static ATTRIBUTES u32
+ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
+ {
+         static const u64 _aligned_attribute(16) mults[3][2] = {
+                 { CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
+                 { CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
+                 { CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
+         };
+         static const u64 _aligned_attribute(16) final_mults[3][2] = {
+                 { CRC32_X63_MODG, 0 },
+                 { CRC32_BARRETT_CONSTANT_1, 0 },
+                 { CRC32_BARRETT_CONSTANT_2, 0 },
+         };
+         const uint8x16_t zeroes = vdupq_n_u8(0);
+         const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF));
+         const poly64x2_t multipliers_1 = load_multipliers(mults[0]);
+         uint8x16_t v0, v1, v2, v3;
+
+         if (len < 64 + 15) {
+                 if (len < 16)
+                         return crc32_slice1(crc, p, len);
+                 v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc));
+                 p += 16;
+                 len -= 16;
+                 while (len >= 16) {
+                         v0 = fold_vec(v0, vld1q_u8(p), multipliers_1);
+                         p += 16;
+                         len -= 16;
+                 }
+         } else {
+                 const poly64x2_t multipliers_4 = load_multipliers(mults[1]);
+                 const poly64x2_t multipliers_2 = load_multipliers(mults[2]);
+                 const size_t align = -(uintptr_t)p & 15;
+                 const uint8x16_t *vp;
+
+                 v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc));
+                 p += 16;
+                 /* Align p to the next 16-byte boundary. */
+                 if (align) {
+                         v0 = fold_partial_vec(v0, p, align, multipliers_1);
+                         p += align;
+                         len -= align;
+                 }
+                 vp = (const uint8x16_t *)p;
+                 v1 = *vp++;
+                 v2 = *vp++;
+                 v3 = *vp++;
+                 while (len >= 64 + 64) {
+                         v0 = fold_vec(v0, *vp++, multipliers_4);
+                         v1 = fold_vec(v1, *vp++, multipliers_4);
+                         v2 = fold_vec(v2, *vp++, multipliers_4);
+                         v3 = fold_vec(v3, *vp++, multipliers_4);
+                         len -= 64;
+                 }
+                 v0 = fold_vec(v0, v2, multipliers_2);
+                 v1 = fold_vec(v1, v3, multipliers_2);
+                 if (len & 32) {
+                         v0 = fold_vec(v0, *vp++, multipliers_2);
+                         v1 = fold_vec(v1, *vp++, multipliers_2);
+                 }
+                 v0 = fold_vec(v0, v1, multipliers_1);
+                 if (len & 16)
+                         v0 = fold_vec(v0, *vp++, multipliers_1);
+                 p = (const u8 *)vp;
+                 len &= 15;
+         }
+
+         /* Handle any remaining partial block now before reducing to 32 bits. */
+         if (len)
+                 v0 = fold_partial_vec(v0, p, len, multipliers_1);
+
+         /*
+          * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
+          * which is equivalent to multiplying by x^32. This is needed because
+          * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
+          */
+
+         v0 = veorq_u8(vextq_u8(v0, zeroes, 8),
+                       clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1));
+
+         /* Fold 96 => 64 bits. */
+         v0 = veorq_u8(vextq_u8(v0, zeroes, 4),
+                       clmul_low(vandq_u8(v0, mask32),
+                                 load_multipliers(final_mults[0])));
+
+         /* Reduce 64 => 32 bits using Barrett reduction. */
+         v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1]));
+         v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2]));
+         return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1);
+ }
+ #undef SUFFIX
+ #undef ATTRIBUTES
+ #undef ENABLE_EOR3
+ #endif /* crc32_arm_pmullx4() */
+
+ /*
+  * crc32_arm_pmullx12_crc() - large-stride implementation using "folding" with
+  * pmull instructions, where crc32 instructions are also available
+  *
+  * See crc32_pmull_wide.h for explanation.
+  */
+ #if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN
+ # define crc32_arm_pmullx12_crc crc32_arm_pmullx12_crc
+ # define SUFFIX _pmullx12_crc
+ # ifdef __clang__
+ # define ATTRIBUTES _target_attribute("aes,crc")
+ # else
+ # define ATTRIBUTES _target_attribute("+crypto,+crc")
+ # endif
+ # define ENABLE_EOR3 0
+ # include "crc32_pmull_wide.h"
+ #endif
+
+ /*
+  * crc32_arm_pmullx12_crc_eor3()
+  *
+  * This like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from
+  * the sha3 extension) for even better performance.
+  */
+ #if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN
+ # define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3
+ # define SUFFIX _pmullx12_crc_eor3
+ # ifdef __clang__
+ # define ATTRIBUTES _target_attribute("aes,crc,sha3")
+ /*
+  * With gcc 13.1 and earlier (before gcc commit 73d3bc348190 or 9aac37ab8a7b,
+  * "aarch64: Remove architecture dependencies from intrinsics"),
+  * arch=armv8.2-a is needed for the sha3 intrinsics, unless the default
+  * target is armv8.3-a or later in which case it must be omitted. armv8.3-a
+  * or later can be detected by checking for __ARM_FEATURE_JCVT.
+  */
+ # elif GCC_PREREQ(13, 2) || defined(__ARM_FEATURE_JCVT)
+ # define ATTRIBUTES _target_attribute("+crypto,+crc,+sha3")
+ # else
+ # define ATTRIBUTES _target_attribute("arch=armv8.2-a+crypto+crc+sha3")
+ # endif
+ # define ENABLE_EOR3 1
+ # include "crc32_pmull_wide.h"
+ #endif
+
+ static inline crc32_func_t
+ arch_select_crc32_func(void)
+ {
+         const u32 features MAYBE_UNUSED = get_arm_cpu_features();
+
+ #ifdef crc32_arm_pmullx12_crc_eor3
+         if ((features & ARM_CPU_FEATURE_PREFER_PMULL) &&
+             HAVE_PMULL(features) && HAVE_CRC32(features) && HAVE_SHA3(features))
+                 return crc32_arm_pmullx12_crc_eor3;
+ #endif
+ #ifdef crc32_arm_pmullx12_crc
+         if ((features & ARM_CPU_FEATURE_PREFER_PMULL) &&
+             HAVE_PMULL(features) && HAVE_CRC32(features))
+                 return crc32_arm_pmullx12_crc;
+ #endif
+ #ifdef crc32_arm_crc_pmullcombine
+         if (HAVE_CRC32(features) && HAVE_PMULL(features))
+                 return crc32_arm_crc_pmullcombine;
+ #endif
+ #ifdef crc32_arm_crc
+         if (HAVE_CRC32(features))
+                 return crc32_arm_crc;
+ #endif
+ #ifdef crc32_arm_pmullx4
+         if (HAVE_PMULL(features))
+                 return crc32_arm_pmullx4;
+ #endif
+         return NULL;
+ }
+ #define arch_select_crc32_func arch_select_crc32_func
+
+ #endif /* LIB_ARM_CRC32_IMPL_H */
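
Note: the functions in this header are internal to the vendored libdeflate; on ARM they are selected at runtime by arch_select_crc32_func() above based on CPU features and are reached through libdeflate's public checksum entry point. As a minimal sketch (not part of the package diff), the following hypothetical C program computes a gzip CRC-32 via that public API, assuming the bundled libdeflate.h header and a built libdeflate to link against:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include "libdeflate.h"

    int main(void)
    {
            const char *msg = "hello, world";
            /* Pass 0 to start a fresh CRC; the returned value can be fed
             * back in to checksum additional data incrementally. */
            uint32_t crc = libdeflate_crc32(0, msg, strlen(msg));

            printf("crc32 = 0x%08lx\n", (unsigned long)crc);
            return 0;
    }

On an ARM CPU, this call ends up in one of the crc32_arm_* variants defined in the header above, depending on which of the crc32, pmull, and sha3 extensions the dispatcher detects.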