libdeflate 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (79) hide show
  1. checksums.yaml +5 -5
  2. data/.github/workflows/test.yml +34 -0
  3. data/README.md +1 -6
  4. data/ext/libdeflate/extconf.rb +18 -7
  5. data/ext/libdeflate/libdeflate_ext.c +17 -17
  6. data/lib/libdeflate/version.rb +1 -1
  7. data/libdeflate.gemspec +2 -1
  8. metadata +13 -84
  9. data/.gitmodules +0 -3
  10. data/.travis.yml +0 -5
  11. data/ext/libdeflate/libdeflate/.gitignore +0 -19
  12. data/ext/libdeflate/libdeflate/COPYING +0 -21
  13. data/ext/libdeflate/libdeflate/Makefile +0 -231
  14. data/ext/libdeflate/libdeflate/Makefile.msc +0 -64
  15. data/ext/libdeflate/libdeflate/NEWS +0 -57
  16. data/ext/libdeflate/libdeflate/README.md +0 -170
  17. data/ext/libdeflate/libdeflate/common/common_defs.h +0 -351
  18. data/ext/libdeflate/libdeflate/common/compiler_gcc.h +0 -134
  19. data/ext/libdeflate/libdeflate/common/compiler_msc.h +0 -95
  20. data/ext/libdeflate/libdeflate/lib/adler32.c +0 -213
  21. data/ext/libdeflate/libdeflate/lib/adler32_impl.h +0 -281
  22. data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +0 -57
  23. data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +0 -13
  24. data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +0 -357
  25. data/ext/libdeflate/libdeflate/lib/crc32.c +0 -368
  26. data/ext/libdeflate/libdeflate/lib/crc32_impl.h +0 -286
  27. data/ext/libdeflate/libdeflate/lib/crc32_table.h +0 -526
  28. data/ext/libdeflate/libdeflate/lib/decompress_impl.h +0 -404
  29. data/ext/libdeflate/libdeflate/lib/deflate_compress.c +0 -2817
  30. data/ext/libdeflate/libdeflate/lib/deflate_compress.h +0 -14
  31. data/ext/libdeflate/libdeflate/lib/deflate_constants.h +0 -66
  32. data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +0 -889
  33. data/ext/libdeflate/libdeflate/lib/gzip_compress.c +0 -95
  34. data/ext/libdeflate/libdeflate/lib/gzip_constants.h +0 -45
  35. data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +0 -130
  36. data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +0 -405
  37. data/ext/libdeflate/libdeflate/lib/lib_common.h +0 -35
  38. data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +0 -53
  39. data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +0 -205
  40. data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +0 -61
  41. data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +0 -53
  42. data/ext/libdeflate/libdeflate/lib/unaligned.h +0 -202
  43. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +0 -169
  44. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +0 -48
  45. data/ext/libdeflate/libdeflate/lib/zlib_compress.c +0 -87
  46. data/ext/libdeflate/libdeflate/lib/zlib_constants.h +0 -21
  47. data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +0 -91
  48. data/ext/libdeflate/libdeflate/libdeflate.h +0 -274
  49. data/ext/libdeflate/libdeflate/programs/benchmark.c +0 -558
  50. data/ext/libdeflate/libdeflate/programs/checksum.c +0 -197
  51. data/ext/libdeflate/libdeflate/programs/detect.sh +0 -62
  52. data/ext/libdeflate/libdeflate/programs/gzip.c +0 -603
  53. data/ext/libdeflate/libdeflate/programs/prog_util.c +0 -530
  54. data/ext/libdeflate/libdeflate/programs/prog_util.h +0 -162
  55. data/ext/libdeflate/libdeflate/programs/test_checksums.c +0 -135
  56. data/ext/libdeflate/libdeflate/programs/tgetopt.c +0 -118
  57. data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +0 -12
  58. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +0 -40
  59. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
  60. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +0 -28
  61. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +0 -3
  62. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +0 -28
  63. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
  64. data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +0 -14
  65. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +0 -28
  66. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +0 -3
  67. data/ext/libdeflate/libdeflate/tools/android_build.sh +0 -104
  68. data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +0 -76
  69. data/ext/libdeflate/libdeflate/tools/exec_tests.sh +0 -30
  70. data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +0 -108
  71. data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +0 -100
  72. data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +0 -412
  73. data/ext/libdeflate/libdeflate/tools/make-windows-releases +0 -21
  74. data/ext/libdeflate/libdeflate/tools/mips_build.sh +0 -9
  75. data/ext/libdeflate/libdeflate/tools/msc_test.bat +0 -3
  76. data/ext/libdeflate/libdeflate/tools/pgo_build.sh +0 -23
  77. data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +0 -37
  78. data/ext/libdeflate/libdeflate/tools/run_tests.sh +0 -305
  79. data/ext/libdeflate/libdeflate/tools/windows_build.sh +0 -10
@@ -1,368 +0,0 @@
1
- /*
2
- * crc32.c - CRC-32 checksum algorithm for the gzip format
3
- *
4
- * Originally public domain; changes after 2016-09-07 are copyrighted.
5
- *
6
- * Copyright 2016 Eric Biggers
7
- *
8
- * Permission is hereby granted, free of charge, to any person
9
- * obtaining a copy of this software and associated documentation
10
- * files (the "Software"), to deal in the Software without
11
- * restriction, including without limitation the rights to use,
12
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
13
- * copies of the Software, and to permit persons to whom the
14
- * Software is furnished to do so, subject to the following
15
- * conditions:
16
- *
17
- * The above copyright notice and this permission notice shall be
18
- * included in all copies or substantial portions of the Software.
19
- *
20
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27
- * OTHER DEALINGS IN THE SOFTWARE.
28
- */
29
-
30
- /*
31
- * High-level description of CRC
32
- * =============================
33
- *
34
- * Consider a bit sequence 'bits[1...len]'. Interpret 'bits' as the "message"
35
- * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2),
36
- * where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute:
37
- *
38
- * R(x) = M(x)*x^n mod G(x)
39
- *
40
- * where G(x) is a selected "generator" polynomial of degree 'n'. The remainder
41
- * R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x)
42
- * interpreted as a bitstring of length 'n'.
43
- *
44
- * CRC used in gzip
45
- * ================
46
- *
47
- * In the gzip format (RFC 1952):
48
- *
49
- * - The bitstring to checksum is formed from the bytes of the uncompressed
50
- * data by concatenating the bits from the bytes in order, proceeding
51
- * from the low-order bit to the high-order bit within each byte.
52
- *
53
- * - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 +
54
- * x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1.
55
- * Consequently, the CRC length is 32 bits ("CRC-32").
56
- *
57
- * - The highest order 32 coefficients of M(x)*x^n are inverted.
58
- *
59
- * - All 32 coefficients of R(x) are inverted.
60
- *
61
- * The two inversions cause added leading and trailing zero bits to affect the
62
- * resulting CRC, whereas with a regular CRC such bits would have no effect on
63
- * the CRC.
64
- *
65
- * Computation and optimizations
66
- * =============================
67
- *
68
- * We can compute R(x) through "long division", maintaining only 32 bits of
69
- * state at any given time. Multiplication by 'x' can be implemented as
70
- * right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the
71
- * highest order bit represents the coefficient of x^0), and both addition and
72
- * subtraction can be implemented as bitwise exclusive OR (since we are working
73
- * in GF(2)). Here is an unoptimized implementation:
74
- *
75
- * static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
76
- * {
77
- * u32 remainder = 0;
78
- * const u32 divisor = 0xEDB88320;
79
- *
80
- * for (size_t i = 0; i < nbytes * 8 + 32; i++) {
81
- * int bit;
82
- * u32 multiple;
83
- *
84
- * if (i < nbytes * 8)
85
- * bit = (buffer[i / 8] >> (i % 8)) & 1;
86
- * else
87
- * bit = 0; // one of the 32 appended 0 bits
88
- *
89
- * if (i < 32) // the first 32 bits are inverted
90
- * bit ^= 1;
91
- *
92
- * if (remainder & 1)
93
- * multiple = divisor;
94
- * else
95
- * multiple = 0;
96
- *
97
- * remainder >>= 1;
98
- * remainder |= (u32)bit << 31;
99
- * remainder ^= multiple;
100
- * }
101
- *
102
- * return ~remainder;
103
- * }
104
- *
105
- * In this implementation, the 32-bit integer 'remainder' maintains the
106
- * remainder of the currently processed portion of the message (with 32 zero
107
- * bits appended) when divided by the generator polynomial. 'remainder' is the
108
- * representation of R(x), and 'divisor' is the representation of G(x) excluding
109
- * the x^32 coefficient. For each bit to process, we multiply R(x) by 'x^1',
110
- * then add 'x^0' if the new bit is a 1. If this causes R(x) to gain a nonzero
111
- * x^32 term, then we subtract G(x) from R(x).
112
- *
113
- * We can speed this up by taking advantage of the fact that XOR is commutative
114
- * and associative, so the order in which we combine the inputs into 'remainder'
115
- * is unimportant. And since each message bit we add doesn't affect the choice
116
- * of 'multiple' until 32 bits later, we need not actually add each message bit
117
- * until that point:
118
- *
119
- * static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
120
- * {
121
- * u32 remainder = ~0;
122
- * const u32 divisor = 0xEDB88320;
123
- *
124
- * for (size_t i = 0; i < nbytes * 8; i++) {
125
- * int bit;
126
- * u32 multiple;
127
- *
128
- * bit = (buffer[i / 8] >> (i % 8)) & 1;
129
- * remainder ^= bit;
130
- * if (remainder & 1)
131
- * multiple = divisor;
132
- * else
133
- * multiple = 0;
134
- * remainder >>= 1;
135
- * remainder ^= multiple;
136
- * }
137
- *
138
- * return ~remainder;
139
- * }
140
- *
141
- * With the above implementation we get the effect of 32 appended 0 bits for
142
- * free; they never affect the choice of a divisor, nor would they change the
143
- * value of 'remainder' if they were to be actually XOR'ed in. And by starting
144
- * with a remainder of all 1 bits, we get the effect of complementing the first
145
- * 32 message bits.
146
- *
147
- * The next optimization is to process the input in multi-bit units. Suppose
148
- * that we insert the next 'n' message bits into the remainder. Then we get an
149
- * intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n'
150
- * bits is the amount by which the low 32 bits of the remainder will change as a
151
- * result of cancelling out those 'n' bits. Taking n=8 (one byte) and
152
- * precomputing a table containing the CRC of each possible byte, we get
153
- * crc32_slice1() defined below.
154
- *
155
- * As a further optimization, we could increase the multi-bit unit size to 16.
156
- * However, that is inefficient because the table size explodes from 256 entries
157
- * (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't
158
- * fit in L1 cache on typical processors.
159
- *
160
- * However, we can actually process 4 bytes at a time using 4 different tables
161
- * with 256 entries each. Logically, we form a 64-bit intermediate remainder
162
- * and cancel out the high 32 bits in 8-bit chunks. Bits 32-39 are cancelled
163
- * out by the CRC of those bits, whereas bits 40-47 are cancelled out by the
164
- * CRC of those bits with 8 zero bits appended, and so on. This method is
165
- * implemented in crc32_slice4(), defined below.
166
- *
167
- * In crc32_slice8(), this method is extended to 8 bytes at a time. The
168
- * intermediate remainder (which we never actually store explicitly) is 96 bits.
169
- *
170
- * On CPUs that support fast carryless multiplication, CRCs can be computed even
171
- * more quickly via "folding". See crc32_pclmul() for an example.
172
- */
173
-
174
- #include "x86_cpu_features.h"
175
-
176
- #include "libdeflate.h"
177
-
178
- /* Select the implementations to compile in. */
179
-
180
- #define NEED_GENERIC_IMPL 1 /* include generic impl unless overridden */
181
- #define DEFAULT_IMPL crc32_slice8
182
-
183
- /* Include the PCLMUL implementation? */
184
- #define NEED_PCLMUL_IMPL 0
185
- #if defined(__PCLMUL__) || \
186
- (X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_PCLMUL_TARGET && \
187
- COMPILER_SUPPORTS_TARGET_INTRINSICS)
188
- # include <wmmintrin.h>
189
- # undef NEED_PCLMUL_IMPL
190
- # define NEED_PCLMUL_IMPL 1
191
- # ifdef __PCLMUL__ /* compiling for PCLMUL, i.e. can we assume it's there? */
192
- # undef NEED_GENERIC_IMPL
193
- # define NEED_GENERIC_IMPL 0 /* generic impl not needed */
194
- # undef DEFAULT_IMPL
195
- # define DEFAULT_IMPL crc32_pclmul
196
- # endif /* otherwise, we can build a PCLMUL version, but we won't know whether
197
- we can use it until runtime */
198
- #endif
199
-
200
- /*
201
- * Include the PCLMUL/AVX implementation? Although our PCLMUL-optimized CRC-32
202
- * function doesn't use any AVX intrinsics specifically, it can benefit a lot
203
- * from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100
204
- * MB/s. I expect this is related to the PCLMULQDQ instructions being assembled
205
- * in the newer three-operand form rather than the older two-operand form.
206
- *
207
- * Note: this is only needed if __AVX__ is *not* defined, since otherwise the
208
- * "regular" PCLMUL implementation would already be AVX enabled.
209
- */
210
- #define NEED_PCLMUL_AVX_IMPL 0
211
- #if NEED_PCLMUL_IMPL && !defined(__AVX__) && \
212
- X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET
213
- # undef NEED_PCLMUL_AVX_IMPL
214
- # define NEED_PCLMUL_AVX_IMPL 1
215
- #endif
216
-
217
- #define NUM_IMPLS (NEED_GENERIC_IMPL + NEED_PCLMUL_IMPL + NEED_PCLMUL_AVX_IMPL)
218
-
219
- /* Define the CRC-32 table */
220
- #if NEED_GENERIC_IMPL
221
- # define CRC32_SLICE8
222
- #else
223
- # define CRC32_SLICE1 /* only need short table for unaligned ends */
224
- #endif
225
- #include "crc32_table.h"
226
-
227
- static forceinline u32
228
- crc32_update_byte(u32 remainder, u8 next_byte)
229
- {
230
- return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte];
231
- }
232
-
233
- #if defined(CRC32_SLICE1) || (NUM_IMPLS > NEED_GENERIC_IMPL)
234
- static u32
235
- crc32_slice1(u32 remainder, const u8 *buffer, size_t nbytes)
236
- {
237
- size_t i;
238
-
239
- STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100);
240
-
241
- for (i = 0; i < nbytes; i++)
242
- remainder = crc32_update_byte(remainder, buffer[i]);
243
- return remainder;
244
- }
245
- #endif
246
-
247
- #ifdef CRC32_SLICE4
248
- static u32
249
- crc32_slice4(u32 remainder, const u8 *buffer, size_t nbytes)
250
- {
251
- const u8 *p = buffer;
252
- const u8 *end = buffer + nbytes;
253
- const u8 *end32;
254
-
255
- STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400);
256
-
257
- for (; ((uintptr_t)p & 3) && p != end; p++)
258
- remainder = crc32_update_byte(remainder, *p);
259
-
260
- end32 = p + ((end - p) & ~3);
261
- for (; p != end32; p += 4) {
262
- u32 v = le32_bswap(*(const u32 *)p);
263
- remainder =
264
- crc32_table[0x300 + (u8)((remainder ^ v) >> 0)] ^
265
- crc32_table[0x200 + (u8)((remainder ^ v) >> 8)] ^
266
- crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^
267
- crc32_table[0x000 + (u8)((remainder ^ v) >> 24)];
268
- }
269
-
270
- for (; p != end; p++)
271
- remainder = crc32_update_byte(remainder, *p);
272
-
273
- return remainder;
274
- }
275
- #endif
276
-
277
- #ifdef CRC32_SLICE8
278
- static u32
279
- crc32_slice8(u32 remainder, const u8 *buffer, size_t nbytes)
280
- {
281
- const u8 *p = buffer;
282
- const u8 *end = buffer + nbytes;
283
- const u8 *end64;
284
-
285
- STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800);
286
-
287
- for (; ((uintptr_t)p & 7) && p != end; p++)
288
- remainder = crc32_update_byte(remainder, *p);
289
-
290
- end64 = p + ((end - p) & ~7);
291
- for (; p != end64; p += 8) {
292
- u32 v1 = le32_bswap(*(const u32 *)(p + 0));
293
- u32 v2 = le32_bswap(*(const u32 *)(p + 4));
294
- remainder =
295
- crc32_table[0x700 + (u8)((remainder ^ v1) >> 0)] ^
296
- crc32_table[0x600 + (u8)((remainder ^ v1) >> 8)] ^
297
- crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^
298
- crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^
299
- crc32_table[0x300 + (u8)(v2 >> 0)] ^
300
- crc32_table[0x200 + (u8)(v2 >> 8)] ^
301
- crc32_table[0x100 + (u8)(v2 >> 16)] ^
302
- crc32_table[0x000 + (u8)(v2 >> 24)];
303
- }
304
-
305
- for (; p != end; p++)
306
- remainder = crc32_update_byte(remainder, *p);
307
-
308
- return remainder;
309
- }
310
- #endif
311
-
312
- /* Define the PCLMUL implementation if needed. */
313
- #if NEED_PCLMUL_IMPL
314
- # define FUNCNAME crc32_pclmul
315
- # define FUNCNAME_ALIGNED crc32_pclmul_aligned
316
- # ifdef __PCLMUL__
317
- # define ATTRIBUTES
318
- # else
319
- # define ATTRIBUTES __attribute__((target("pclmul")))
320
- # endif
321
- # include "crc32_impl.h"
322
- #endif
323
-
324
- /* Define the PCLMUL/AVX implementation if needed. */
325
- #if NEED_PCLMUL_AVX_IMPL
326
- # define FUNCNAME crc32_pclmul_avx
327
- # define FUNCNAME_ALIGNED crc32_pclmul_avx_aligned
328
- # define ATTRIBUTES __attribute__((target("pclmul,avx")))
329
- # include "crc32_impl.h"
330
- #endif
331
-
332
- typedef u32 (*crc32_func_t)(u32, const u8 *, size_t);
333
-
334
- /*
335
- * If multiple implementations are available, then dispatch among them based on
336
- * CPU features at runtime. Otherwise just call the single one directly.
337
- */
338
- #if NUM_IMPLS == 1
339
- # define crc32_impl DEFAULT_IMPL
340
- #else
341
- static u32 dispatch(u32, const u8 *, size_t);
342
-
343
- static crc32_func_t crc32_impl = dispatch;
344
-
345
- static u32 dispatch(u32 remainder, const u8 *buffer, size_t nbytes)
346
- {
347
- crc32_func_t f = DEFAULT_IMPL;
348
- #if NEED_PCLMUL_IMPL && !defined(__PCLMUL__)
349
- if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ))
350
- f = crc32_pclmul;
351
- #endif
352
- #if NEED_PCLMUL_AVX_IMPL
353
- if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ |
354
- X86_CPU_FEATURE_AVX))
355
- f = crc32_pclmul_avx;
356
- #endif
357
- crc32_impl = f;
358
- return crc32_impl(remainder, buffer, nbytes);
359
- }
360
- #endif /* NUM_IMPLS != 1 */
361
-
362
- LIBDEFLATEAPI u32
363
- libdeflate_crc32(u32 remainder, const void *buffer, size_t nbytes)
364
- {
365
- if (buffer == NULL) /* return initial value */
366
- return 0;
367
- return ~crc32_impl(~remainder, buffer, nbytes);
368
- }
@@ -1,286 +0,0 @@
1
- /*
2
- * crc32_impl.h
3
- *
4
- * Copyright 2016 Eric Biggers
5
- *
6
- * Permission is hereby granted, free of charge, to any person
7
- * obtaining a copy of this software and associated documentation
8
- * files (the "Software"), to deal in the Software without
9
- * restriction, including without limitation the rights to use,
10
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
11
- * copies of the Software, and to permit persons to whom the
12
- * Software is furnished to do so, subject to the following
13
- * conditions:
14
- *
15
- * The above copyright notice and this permission notice shall be
16
- * included in all copies or substantial portions of the Software.
17
- *
18
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
- * OTHER DEALINGS IN THE SOFTWARE.
26
- */
27
-
28
- /*
29
- * CRC-32 folding with PCLMULQDQ.
30
- *
31
- * The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits,
32
- * producing an abbreviated message which is congruent to the original message
33
- * modulo the generator polynomial G(x).
34
- *
35
- * Folding each 512 bits is implemented as eight 64-bit folds, each of which
36
- * uses one carryless multiplication instruction. It's expected that CPUs may
37
- * be able to execute some of these multiplications in parallel.
38
- *
39
- * Explanation of "folding": let A(x) be 64 bits from the message, and let B(x)
40
- * be 95 bits from a constant distance D later in the message. The relevant
41
- * portion of the message can be written as:
42
- *
43
- * M(x) = A(x)*x^D + B(x)
44
- *
45
- * ... where + and * represent addition and multiplication, respectively, of
46
- * polynomials over GF(2). Note that when implemented on a computer, these
47
- * operations are equivalent to XOR and carryless multiplication, respectively.
48
- *
49
- * For the purpose of CRC calculation, only the remainder modulo the generator
50
- * polynomial G(x) matters:
51
- *
52
- * M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
53
- *
54
- * Since the modulo operation can be applied anywhere in a sequence of additions
55
- * and multiplications without affecting the result, this is equivalent to:
56
- *
57
- * M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
58
- *
59
- * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e.
60
- * a 32-bit quantity. So 'A(x) * (x^D mod G(x))' is equivalent to a carryless
61
- * multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit
62
- * product. Then, adding (XOR-ing) the product to B(x) produces a polynomial
63
- * with the same length as B(x) but with the same remainder as 'A(x)*x^D +
64
- * B(x)'. This is the basic fold operation with 64 bits.
65
- *
66
- * Note that the carryless multiplication instruction PCLMULQDQ actually takes
67
- * two 64-bit inputs and produces a 127-bit product in the low-order bits of a
68
- * 128-bit XMM register. This works fine, but care must be taken to account for
69
- * "bit endianness". With the CRC version implemented here, bits are always
70
- * ordered such that the lowest-order bit represents the coefficient of the highest
71
- * power of x and the highest-order bit represents the coefficient of the lowest
72
- * power of x. This is backwards from the more intuitive order. Still,
73
- * carryless multiplication works essentially the same either way. It just must
74
- * be accounted for that when we XOR the 95-bit product in the low-order 95 bits
75
- * of a 128-bit XMM register into 128-bits of later data held in another XMM
76
- * register, we'll really be XOR-ing the product into the mathematically higher
77
- * degree end of those later bits, not the lower degree end as may be expected.
78
- *
79
- * So given that caveat and the fact that we process 512 bits per iteration, the
80
- * 'D' values we need for the two 64-bit halves of each 128 bits of data are:
81
- *
82
- * D = (512 + 95) - 64 for the higher-degree half of each 128 bits,
83
- * i.e. the lower order bits in the XMM register
84
- *
85
- * D = (512 + 95) - 128 for the lower-degree half of each 128 bits,
86
- * i.e. the higher order bits in the XMM register
87
- *
88
- * The required 'x^D mod G(x)' values were precomputed.
89
- *
90
- * When <= 512 bits remain in the message, we finish up by folding across
91
- * smaller distances. This works similarly; the distance D is just different,
92
- * so different constant multipliers must be used. Finally, once the remaining
93
- * message is just 64 bits, it is reduced to the CRC-32 using Barrett
94
- * reduction (explained later).
95
- *
96
- * For more information see the original paper from Intel:
97
- * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
98
- * December 2009
99
- * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
100
- */
101
- static u32 ATTRIBUTES
102
- FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t vec_count)
103
- {
104
- /* Constants precomputed by gen_crc32_multipliers.c. Do not edit! */
105
- const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 };
106
- const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 };
107
- const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E };
108
- const __v2di final_multiplier = (__v2di){ 0xB8BC6765 };
109
- const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF };
110
- const __v2di barrett_reduction_constants =
111
- (__v2di){ 0x00000001F7011641, 0x00000001DB710641 };
112
-
113
- const __m128i * const end = p + vec_count;
114
- const __m128i * const end512 = p + (vec_count & ~3);
115
- __m128i x0, x1, x2, x3;
116
-
117
- /*
118
- * Account for the current 'remainder', i.e. the CRC of the part of the
119
- * message already processed. Explanation: rewrite the message
120
- * polynomial M(x) in terms of the first part A(x), the second part
121
- * B(x), and the length of the second part in bits |B(x)| >= 32:
122
- *
123
- * M(x) = A(x)*x^|B(x)| + B(x)
124
- *
125
- * Then the CRC of M(x) is:
126
- *
127
- * CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
128
- * = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
129
- * = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
130
- *
131
- * Note: all arithmetic is modulo G(x), the generator polynomial; that's
132
- * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
133
- *
134
- * So the CRC of the full message is the CRC of the second part of the
135
- * message where the first 32 bits of the second part of the message
136
- * have been XOR'ed with the CRC of the first part of the message.
137
- */
138
- x0 = *p++;
139
- x0 ^= (__m128i)(__v4si){ remainder };
140
-
141
- if (p > end512) /* only 128, 256, or 384 bits of input? */
142
- goto _128_bits_at_a_time;
143
- x1 = *p++;
144
- x2 = *p++;
145
- x3 = *p++;
146
-
147
- /* Fold 512 bits at a time */
148
- for (; p != end512; p += 4) {
149
- __m128i y0, y1, y2, y3;
150
-
151
- y0 = p[0];
152
- y1 = p[1];
153
- y2 = p[2];
154
- y3 = p[3];
155
-
156
- /*
157
- * Note: the immediate constant for PCLMULQDQ specifies which
158
- * 64-bit halves of the 128-bit vectors to multiply:
159
- *
160
- * 0x00 means low halves (higher degree polynomial terms for us)
161
- * 0x11 means high halves (lower degree polynomial terms for us)
162
- */
163
- y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00);
164
- y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00);
165
- y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00);
166
- y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00);
167
- y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11);
168
- y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11);
169
- y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11);
170
- y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11);
171
-
172
- x0 = y0;
173
- x1 = y1;
174
- x2 = y2;
175
- x3 = y3;
176
- }
177
-
178
- /* Fold 512 bits => 128 bits */
179
- x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00);
180
- x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00);
181
- x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11);
182
- x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11);
183
- x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00);
184
- x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11);
185
- x0 = x3;
186
-
187
- _128_bits_at_a_time:
188
- while (p != end) {
189
- /* Fold 128 bits into next 128 bits */
190
- x1 = *p++;
191
- x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00);
192
- x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11);
193
- x0 = x1;
194
- }
195
-
196
- /* Now there are just 128 bits left, stored in 'x0'. */
197
-
198
- /*
199
- * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
200
- * which is equivalent to multiplying by x^32. This is needed because
201
- * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
202
- */
203
- x0 = _mm_srli_si128(x0, 8) ^
204
- _mm_clmulepi64_si128(x0, multipliers_1, 0x10);
205
-
206
- /* Fold 96 => 64 bits */
207
- x0 = _mm_srli_si128(x0, 4) ^
208
- _mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00);
209
-
210
- /*
211
- * Finally, reduce 64 => 32 bits using Barrett reduction.
212
- *
213
- * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
214
- * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
215
- *
216
- * R(x) = (A(x)*x^32 + B(x)) mod G(x)
217
- * = (A(x)*x^32) mod G(x) + B(x)
218
- *
219
- * Then, by the Division Algorithm there exists a unique q(x) such that:
220
- *
221
- * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
222
- *
223
- * Since the left-hand side is of maximum degree 31, the right-hand side
224
- * must be too. This implies that we can apply 'mod x^32' to the
225
- * right-hand side without changing its value:
226
- *
227
- * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
228
- *
229
- * Note that '+' is equivalent to '-' in polynomials over GF(2).
230
- *
231
- * We also know that:
232
- *
233
- * / A(x)*x^32 \
234
- * q(x) = floor ( --------- )
235
- * \ G(x) /
236
- *
237
- * To compute this efficiently, we can multiply the top and bottom by
238
- * x^32 and move the division by G(x) to the top:
239
- *
240
- * / A(x) * floor(x^64 / G(x)) \
241
- * q(x) = floor ( ------------------------- )
242
- * \ x^32 /
243
- *
244
- * Note that floor(x^64 / G(x)) is a constant.
245
- *
246
- * So finally we have:
247
- *
248
- * / A(x) * floor(x^64 / G(x)) \
249
- * R(x) = B(x) + G(x)*floor ( ------------------------- )
250
- * \ x^32 /
251
- */
252
- x1 = x0;
253
- x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00);
254
- x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10);
255
- return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4));
256
- }
257
-
258
- /*
259
- * Fast CRC-32 implementation for x86_64 processors that have the carryless
260
- * multiplication extension (PCLMUL).
261
- *
262
- * Note: on unaligned ends of the buffer, we fall back to crc32_slice1() instead
263
- * of crc32_slice8() because only a few bytes need to be processed, so a smaller
264
- * table is preferable.
265
- */
266
- static u32 ATTRIBUTES
267
- FUNCNAME(u32 remainder, const u8 *buffer, size_t nbytes)
268
- {
269
- if ((uintptr_t)buffer & 15) {
270
- size_t n = MIN(nbytes, -(uintptr_t)buffer & 15);
271
- remainder = crc32_slice1(remainder, buffer, n);
272
- buffer += n;
273
- nbytes -= n;
274
- }
275
- if (nbytes >= 16) {
276
- remainder = FUNCNAME_ALIGNED(remainder, (const __m128i *)buffer,
277
- nbytes / 16);
278
- buffer += nbytes & ~15;
279
- nbytes &= 15;
280
- }
281
- return crc32_slice1(remainder, buffer, nbytes);
282
- }
283
-
284
- #undef FUNCNAME
285
- #undef FUNCNAME_ALIGNED
286
- #undef ATTRIBUTES