libdeflate 0.1.1 → 0.2.0
- checksums.yaml +5 -5
- data/.github/workflows/test.yml +34 -0
- data/README.md +1 -6
- data/ext/libdeflate/extconf.rb +18 -7
- data/ext/libdeflate/libdeflate_ext.c +17 -17
- data/lib/libdeflate/version.rb +1 -1
- data/libdeflate.gemspec +2 -1
- metadata +13 -84
- data/.gitmodules +0 -3
- data/.travis.yml +0 -5
- data/ext/libdeflate/libdeflate/.gitignore +0 -19
- data/ext/libdeflate/libdeflate/COPYING +0 -21
- data/ext/libdeflate/libdeflate/Makefile +0 -231
- data/ext/libdeflate/libdeflate/Makefile.msc +0 -64
- data/ext/libdeflate/libdeflate/NEWS +0 -57
- data/ext/libdeflate/libdeflate/README.md +0 -170
- data/ext/libdeflate/libdeflate/common/common_defs.h +0 -351
- data/ext/libdeflate/libdeflate/common/compiler_gcc.h +0 -134
- data/ext/libdeflate/libdeflate/common/compiler_msc.h +0 -95
- data/ext/libdeflate/libdeflate/lib/adler32.c +0 -213
- data/ext/libdeflate/libdeflate/lib/adler32_impl.h +0 -281
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +0 -57
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +0 -13
- data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +0 -357
- data/ext/libdeflate/libdeflate/lib/crc32.c +0 -368
- data/ext/libdeflate/libdeflate/lib/crc32_impl.h +0 -286
- data/ext/libdeflate/libdeflate/lib/crc32_table.h +0 -526
- data/ext/libdeflate/libdeflate/lib/decompress_impl.h +0 -404
- data/ext/libdeflate/libdeflate/lib/deflate_compress.c +0 -2817
- data/ext/libdeflate/libdeflate/lib/deflate_compress.h +0 -14
- data/ext/libdeflate/libdeflate/lib/deflate_constants.h +0 -66
- data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +0 -889
- data/ext/libdeflate/libdeflate/lib/gzip_compress.c +0 -95
- data/ext/libdeflate/libdeflate/lib/gzip_constants.h +0 -45
- data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +0 -130
- data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +0 -405
- data/ext/libdeflate/libdeflate/lib/lib_common.h +0 -35
- data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +0 -53
- data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +0 -205
- data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +0 -61
- data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +0 -53
- data/ext/libdeflate/libdeflate/lib/unaligned.h +0 -202
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +0 -169
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +0 -48
- data/ext/libdeflate/libdeflate/lib/zlib_compress.c +0 -87
- data/ext/libdeflate/libdeflate/lib/zlib_constants.h +0 -21
- data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +0 -91
- data/ext/libdeflate/libdeflate/libdeflate.h +0 -274
- data/ext/libdeflate/libdeflate/programs/benchmark.c +0 -558
- data/ext/libdeflate/libdeflate/programs/checksum.c +0 -197
- data/ext/libdeflate/libdeflate/programs/detect.sh +0 -62
- data/ext/libdeflate/libdeflate/programs/gzip.c +0 -603
- data/ext/libdeflate/libdeflate/programs/prog_util.c +0 -530
- data/ext/libdeflate/libdeflate/programs/prog_util.h +0 -162
- data/ext/libdeflate/libdeflate/programs/test_checksums.c +0 -135
- data/ext/libdeflate/libdeflate/programs/tgetopt.c +0 -118
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +0 -12
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +0 -40
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +0 -28
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +0 -3
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +0 -28
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +0 -14
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +0 -28
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +0 -3
- data/ext/libdeflate/libdeflate/tools/android_build.sh +0 -104
- data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +0 -76
- data/ext/libdeflate/libdeflate/tools/exec_tests.sh +0 -30
- data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +0 -108
- data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +0 -100
- data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +0 -412
- data/ext/libdeflate/libdeflate/tools/make-windows-releases +0 -21
- data/ext/libdeflate/libdeflate/tools/mips_build.sh +0 -9
- data/ext/libdeflate/libdeflate/tools/msc_test.bat +0 -3
- data/ext/libdeflate/libdeflate/tools/pgo_build.sh +0 -23
- data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +0 -37
- data/ext/libdeflate/libdeflate/tools/run_tests.sh +0 -305
- data/ext/libdeflate/libdeflate/tools/windows_build.sh +0 -10
--- a/data/ext/libdeflate/libdeflate/lib/crc32.c
+++ /dev/null
@@ -1,368 +0,0 @@
-/*
- * crc32.c - CRC-32 checksum algorithm for the gzip format
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * High-level description of CRC
- * =============================
- *
- * Consider a bit sequence 'bits[1...len]'.  Interpret 'bits' as the "message"
- * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2),
- * where the coefficient of 'x^i' is 'bits[len - i]'.  Then, compute:
- *
- *	R(x) = M(x)*x^n mod G(x)
- *
- * where G(x) is a selected "generator" polynomial of degree 'n'.  The remainder
- * R(x) is a polynomial of max degree 'n - 1'.  The CRC of 'bits' is R(x)
- * interpreted as a bitstring of length 'n'.
- *
- * CRC used in gzip
- * ================
- *
- * In the gzip format (RFC 1952):
- *
- *	- The bitstring to checksum is formed from the bytes of the uncompressed
- *	  data by concatenating the bits from the bytes in order, proceeding
- *	  from the low-order bit to the high-order bit within each byte.
- *
- *	- The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 +
- *	  x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1.
- *	  Consequently, the CRC length is 32 bits ("CRC-32").
- *
- *	- The highest order 32 coefficients of M(x)*x^n are inverted.
- *
- *	- All 32 coefficients of R(x) are inverted.
- *
- * The two inversions cause added leading and trailing zero bits to affect the
- * resulting CRC, whereas with a regular CRC such bits would have no effect on
- * the CRC.
- *
- * Computation and optimizations
- * =============================
- *
- * We can compute R(x) through "long division", maintaining only 32 bits of
- * state at any given time.  Multiplication by 'x' can be implemented as
- * right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the
- * highest order bit represents the coefficient of x^0), and both addition and
- * subtraction can be implemented as bitwise exclusive OR (since we are working
- * in GF(2)).  Here is an unoptimized implementation:
- *
- *	static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
- *	{
- *		u32 remainder = 0;
- *		const u32 divisor = 0xEDB88320;
- *
- *		for (size_t i = 0; i < nbytes * 8 + 32; i++) {
- *			int bit;
- *			u32 multiple;
- *
- *			if (i < nbytes * 8)
- *				bit = (buffer[i / 8] >> (i % 8)) & 1;
- *			else
- *				bit = 0; // one of the 32 appended 0 bits
- *
- *			if (i < 32) // the first 32 bits are inverted
- *				bit ^= 1;
- *
- *			if (remainder & 1)
- *				multiple = divisor;
- *			else
- *				multiple = 0;
- *
- *			remainder >>= 1;
- *			remainder |= (u32)bit << 31;
- *			remainder ^= multiple;
- *		}
- *
- *		return ~remainder;
- *	}
- *
- * In this implementation, the 32-bit integer 'remainder' maintains the
- * remainder of the currently processed portion of the message (with 32 zero
- * bits appended) when divided by the generator polynomial.  'remainder' is the
- * representation of R(x), and 'divisor' is the representation of G(x) excluding
- * the x^32 coefficient.  For each bit to process, we multiply R(x) by 'x^1',
- * then add 'x^0' if the new bit is a 1.  If this causes R(x) to gain a nonzero
- * x^32 term, then we subtract G(x) from R(x).
- *
- * We can speed this up by taking advantage of the fact that XOR is commutative
- * and associative, so the order in which we combine the inputs into 'remainder'
- * is unimportant.  And since each message bit we add doesn't affect the choice
- * of 'multiple' until 32 bits later, we need not actually add each message bit
- * until that point:
- *
- *	static u32 crc32_gzip(const u8 *buffer, size_t nbytes)
- *	{
- *		u32 remainder = ~0;
- *		const u32 divisor = 0xEDB88320;
- *
- *		for (size_t i = 0; i < nbytes * 8; i++) {
- *			int bit;
- *			u32 multiple;
- *
- *			bit = (buffer[i / 8] >> (i % 8)) & 1;
- *			remainder ^= bit;
- *			if (remainder & 1)
- *				multiple = divisor;
- *			else
- *				multiple = 0;
- *			remainder >>= 1;
- *			remainder ^= multiple;
- *		}
- *
- *		return ~remainder;
- *	}
- *
- * With the above implementation we get the effect of 32 appended 0 bits for
- * free; they never affect the choice of a divisor, nor would they change the
- * value of 'remainder' if they were to be actually XOR'ed in.  And by starting
- * with a remainder of all 1 bits, we get the effect of complementing the first
- * 32 message bits.
- *
- * The next optimization is to process the input in multi-bit units.  Suppose
- * that we insert the next 'n' message bits into the remainder.  Then we get an
- * intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n'
- * bits is the amount by which the low 32 bits of the remainder will change as a
- * result of cancelling out those 'n' bits.  Taking n=8 (one byte) and
- * precomputing a table containing the CRC of each possible byte, we get
- * crc32_slice1() defined below.
- *
- * As a further optimization, we could increase the multi-bit unit size to 16.
- * However, that is inefficient because the table size explodes from 256 entries
- * (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't
- * fit in L1 cache on typical processors.
- *
- * However, we can actually process 4 bytes at a time using 4 different tables
- * with 256 entries each.  Logically, we form a 64-bit intermediate remainder
- * and cancel out the high 32 bits in 8-bit chunks.  Bits 32-39 are cancelled
- * out by the CRC of those bits, whereas bits 40-47 are cancelled out by the
- * CRC of those bits with 8 zero bits appended, and so on.  This method is
- * implemented in crc32_slice4(), defined below.
- *
- * In crc32_slice8(), this method is extended to 8 bytes at a time.  The
- * intermediate remainder (which we never actually store explicitly) is 96 bits.
- *
- * On CPUs that support fast carryless multiplication, CRCs can be computed even
- * more quickly via "folding".  See crc32_pclmul() for an example.
- */
-
-#include "x86_cpu_features.h"
-
-#include "libdeflate.h"
-
-/* Select the implementations to compile in. */
-
-#define NEED_GENERIC_IMPL 1 /* include generic impl unless overridden */
-#define DEFAULT_IMPL crc32_slice8
-
-/* Include the PCLMUL implementation? */
-#define NEED_PCLMUL_IMPL 0
-#if defined(__PCLMUL__) || \
-	(X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_PCLMUL_TARGET && \
-	 COMPILER_SUPPORTS_TARGET_INTRINSICS)
-#  include <wmmintrin.h>
-#  undef NEED_PCLMUL_IMPL
-#  define NEED_PCLMUL_IMPL 1
-#  ifdef __PCLMUL__ /* compiling for PCLMUL, i.e. can we assume it's there? */
-#    undef NEED_GENERIC_IMPL
-#    define NEED_GENERIC_IMPL 0 /* generic impl not needed */
-#    undef DEFAULT_IMPL
-#    define DEFAULT_IMPL crc32_pclmul
-#  endif /* otherwise, we can build a PCLMUL version, but we won't know whether
-	    we can use it until runtime */
-#endif
-
-/*
- * Include the PCLMUL/AVX implementation?  Although our PCLMUL-optimized CRC-32
- * function doesn't use any AVX intrinsics specifically, it can benefit a lot
- * from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100
- * MB/s.  I expect this is related to the PCLMULQDQ instructions being assembled
- * in the newer three-operand form rather than the older two-operand form.
- *
- * Note: this is only needed if __AVX__ is *not* defined, since otherwise the
- * "regular" PCLMUL implementation would already be AVX enabled.
- */
-#define NEED_PCLMUL_AVX_IMPL 0
-#if NEED_PCLMUL_IMPL && !defined(__AVX__) && \
-	X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET
-#  undef NEED_PCLMUL_AVX_IMPL
-#  define NEED_PCLMUL_AVX_IMPL 1
-#endif
-
-#define NUM_IMPLS (NEED_GENERIC_IMPL + NEED_PCLMUL_IMPL + NEED_PCLMUL_AVX_IMPL)
-
-/* Define the CRC-32 table */
-#if NEED_GENERIC_IMPL
-#  define CRC32_SLICE8
-#else
-#  define CRC32_SLICE1 /* only need short table for unaligned ends */
-#endif
-#include "crc32_table.h"
-
-static forceinline u32
-crc32_update_byte(u32 remainder, u8 next_byte)
-{
-	return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte];
-}
-
-#if defined(CRC32_SLICE1) || (NUM_IMPLS > NEED_GENERIC_IMPL)
-static u32
-crc32_slice1(u32 remainder, const u8 *buffer, size_t nbytes)
-{
-	size_t i;
-
-	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100);
-
-	for (i = 0; i < nbytes; i++)
-		remainder = crc32_update_byte(remainder, buffer[i]);
-	return remainder;
-}
-#endif
-
-#ifdef CRC32_SLICE4
-static u32
-crc32_slice4(u32 remainder, const u8 *buffer, size_t nbytes)
-{
-	const u8 *p = buffer;
-	const u8 *end = buffer + nbytes;
-	const u8 *end32;
-
-	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400);
-
-	for (; ((uintptr_t)p & 3) && p != end; p++)
-		remainder = crc32_update_byte(remainder, *p);
-
-	end32 = p + ((end - p) & ~3);
-	for (; p != end32; p += 4) {
-		u32 v = le32_bswap(*(const u32 *)p);
-		remainder =
-		    crc32_table[0x300 + (u8)((remainder ^ v) >> 0)] ^
-		    crc32_table[0x200 + (u8)((remainder ^ v) >> 8)] ^
-		    crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^
-		    crc32_table[0x000 + (u8)((remainder ^ v) >> 24)];
-	}
-
-	for (; p != end; p++)
-		remainder = crc32_update_byte(remainder, *p);
-
-	return remainder;
-}
-#endif
-
-#ifdef CRC32_SLICE8
-static u32
-crc32_slice8(u32 remainder, const u8 *buffer, size_t nbytes)
-{
-	const u8 *p = buffer;
-	const u8 *end = buffer + nbytes;
-	const u8 *end64;
-
-	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800);
-
-	for (; ((uintptr_t)p & 7) && p != end; p++)
-		remainder = crc32_update_byte(remainder, *p);
-
-	end64 = p + ((end - p) & ~7);
-	for (; p != end64; p += 8) {
-		u32 v1 = le32_bswap(*(const u32 *)(p + 0));
-		u32 v2 = le32_bswap(*(const u32 *)(p + 4));
-		remainder =
-		    crc32_table[0x700 + (u8)((remainder ^ v1) >> 0)] ^
-		    crc32_table[0x600 + (u8)((remainder ^ v1) >> 8)] ^
-		    crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^
-		    crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^
-		    crc32_table[0x300 + (u8)(v2 >> 0)] ^
-		    crc32_table[0x200 + (u8)(v2 >> 8)] ^
-		    crc32_table[0x100 + (u8)(v2 >> 16)] ^
-		    crc32_table[0x000 + (u8)(v2 >> 24)];
-	}
-
-	for (; p != end; p++)
-		remainder = crc32_update_byte(remainder, *p);
-
-	return remainder;
-}
-#endif
-
-/* Define the PCLMUL implementation if needed. */
-#if NEED_PCLMUL_IMPL
-#  define FUNCNAME		crc32_pclmul
-#  define FUNCNAME_ALIGNED	crc32_pclmul_aligned
-#  ifdef __PCLMUL__
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		__attribute__((target("pclmul")))
-#  endif
-#  include "crc32_impl.h"
-#endif
-
-/* Define the PCLMUL/AVX implementation if needed. */
-#if NEED_PCLMUL_AVX_IMPL
-#  define FUNCNAME		crc32_pclmul_avx
-#  define FUNCNAME_ALIGNED	crc32_pclmul_avx_aligned
-#  define ATTRIBUTES		__attribute__((target("pclmul,avx")))
-#  include "crc32_impl.h"
-#endif
-
-typedef u32 (*crc32_func_t)(u32, const u8 *, size_t);
-
-/*
- * If multiple implementations are available, then dispatch among them based on
- * CPU features at runtime.  Otherwise just call the single one directly.
- */
-#if NUM_IMPLS == 1
-#  define crc32_impl DEFAULT_IMPL
-#else
-static u32 dispatch(u32, const u8 *, size_t);
-
-static crc32_func_t crc32_impl = dispatch;
-
-static u32 dispatch(u32 remainder, const u8 *buffer, size_t nbytes)
-{
-	crc32_func_t f = DEFAULT_IMPL;
-#if NEED_PCLMUL_IMPL && !defined(__PCLMUL__)
-	if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ))
-		f = crc32_pclmul;
-#endif
-#if NEED_PCLMUL_AVX_IMPL
-	if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ |
-				  X86_CPU_FEATURE_AVX))
-		f = crc32_pclmul_avx;
-#endif
-	crc32_impl = f;
-	return crc32_impl(remainder, buffer, nbytes);
-}
-#endif /* NUM_IMPLS != 1 */
-
-LIBDEFLATEAPI u32
-libdeflate_crc32(u32 remainder, const void *buffer, size_t nbytes)
-{
-	if (buffer == NULL) /* return initial value */
-		return 0;
-	return ~crc32_impl(~remainder, buffer, nbytes);
-}
--- a/data/ext/libdeflate/libdeflate/lib/crc32_impl.h
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * crc32_impl.h
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * CRC-32 folding with PCLMULQDQ.
- *
- * The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits,
- * producing an abbreviated message which is congruent to the original message
- * modulo the generator polynomial G(x).
- *
- * Folding each 512 bits is implemented as eight 64-bit folds, each of which
- * uses one carryless multiplication instruction.  It's expected that CPUs may
- * be able to execute some of these multiplications in parallel.
- *
- * Explanation of "folding": let A(x) be 64 bits from the message, and let B(x)
- * be 95 bits from a constant distance D later in the message.  The relevant
- * portion of the message can be written as:
- *
- *	M(x) = A(x)*x^D + B(x)
- *
- * ... where + and * represent addition and multiplication, respectively, of
- * polynomials over GF(2).  Note that when implemented on a computer, these
- * operations are equivalent to XOR and carryless multiplication, respectively.
- *
- * For the purpose of CRC calculation, only the remainder modulo the generator
- * polynomial G(x) matters:
- *
- *	M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
- *
- * Since the modulo operation can be applied anywhere in a sequence of additions
- * and multiplications without affecting the result, this is equivalent to:
- *
- *	M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
- *
- * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e.
- * a 32-bit quantity.  So 'A(x) * (x^D mod G(x))' is equivalent to a carryless
- * multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit
- * product.  Then, adding (XOR-ing) the product to B(x) produces a polynomial
- * with the same length as B(x) but with the same remainder as 'A(x)*x^D +
- * B(x)'.  This is the basic fold operation with 64 bits.
- *
- * Note that the carryless multiplication instruction PCLMULQDQ actually takes
- * two 64-bit inputs and produces a 127-bit product in the low-order bits of a
- * 128-bit XMM register.  This works fine, but care must be taken to account for
- * "bit endianness".  With the CRC version implemented here, bits are always
- * ordered such that the lowest-order bit represents the coefficient of highest
- * power of x and the highest-order bit represents the coefficient of the lowest
- * power of x.  This is backwards from the more intuitive order.  Still,
- * carryless multiplication works essentially the same either way.  It just must
- * be accounted for that when we XOR the 95-bit product in the low-order 95 bits
- * of a 128-bit XMM register into 128-bits of later data held in another XMM
- * register, we'll really be XOR-ing the product into the mathematically higher
- * degree end of those later bits, not the lower degree end as may be expected.
- *
- * So given that caveat and the fact that we process 512 bits per iteration, the
- * 'D' values we need for the two 64-bit halves of each 128 bits of data are:
- *
- *	D = (512 + 95) - 64 for the higher-degree half of each 128 bits,
- *	    i.e. the lower order bits in the XMM register
- *
- *	D = (512 + 95) - 128 for the lower-degree half of each 128 bits,
- *	    i.e. the higher order bits in the XMM register
- *
- * The required 'x^D mod G(x)' values were precomputed.
- *
- * When <= 512 bits remain in the message, we finish up by folding across
- * smaller distances.  This works similarly; the distance D is just different,
- * so different constant multipliers must be used.  Finally, once the remaining
- * message is just 64 bits, it is reduced to the CRC-32 using Barrett reduction
- * (explained later).
- *
- * For more information see the original paper from Intel:
- *	"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
- *	December 2009
- *	http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
- */
-static u32 ATTRIBUTES
-FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t vec_count)
-{
-	/* Constants precomputed by gen_crc32_multipliers.c.  Do not edit! */
-	const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 };
-	const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 };
-	const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E };
-	const __v2di final_multiplier = (__v2di){ 0xB8BC6765 };
-	const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF };
-	const __v2di barrett_reduction_constants =
-			(__v2di){ 0x00000001F7011641, 0x00000001DB710641 };
-
-	const __m128i * const end = p + vec_count;
-	const __m128i * const end512 = p + (vec_count & ~3);
-	__m128i x0, x1, x2, x3;
-
-	/*
-	 * Account for the current 'remainder', i.e. the CRC of the part of the
-	 * message already processed.  Explanation: rewrite the message
-	 * polynomial M(x) in terms of the first part A(x), the second part
-	 * B(x), and the length of the second part in bits |B(x)| >= 32:
-	 *
-	 *	M(x) = A(x)*x^|B(x)| + B(x)
-	 *
-	 * Then the CRC of M(x) is:
-	 *
-	 *	CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
-	 *	          = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
-	 *	          = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
-	 *
-	 * Note: all arithmetic is modulo G(x), the generator polynomial; that's
-	 * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
-	 *
-	 * So the CRC of the full message is the CRC of the second part of the
-	 * message where the first 32 bits of the second part of the message
-	 * have been XOR'ed with the CRC of the first part of the message.
-	 */
-	x0 = *p++;
-	x0 ^= (__m128i)(__v4si){ remainder };
-
-	if (p > end512) /* only 128, 256, or 384 bits of input? */
-		goto _128_bits_at_a_time;
-	x1 = *p++;
-	x2 = *p++;
-	x3 = *p++;
-
-	/* Fold 512 bits at a time */
-	for (; p != end512; p += 4) {
-		__m128i y0, y1, y2, y3;
-
-		y0 = p[0];
-		y1 = p[1];
-		y2 = p[2];
-		y3 = p[3];
-
-		/*
-		 * Note: the immediate constant for PCLMULQDQ specifies which
-		 * 64-bit halves of the 128-bit vectors to multiply:
-		 *
-		 * 0x00 means low halves (higher degree polynomial terms for us)
-		 * 0x11 means high halves (lower degree polynomial terms for us)
-		 */
-		y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00);
-		y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00);
-		y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00);
-		y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00);
-		y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11);
-		y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11);
-		y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11);
-		y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11);
-
-		x0 = y0;
-		x1 = y1;
-		x2 = y2;
-		x3 = y3;
-	}
-
-	/* Fold 512 bits => 128 bits */
-	x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00);
-	x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00);
-	x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11);
-	x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11);
-	x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00);
-	x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11);
-	x0 = x3;
-
-_128_bits_at_a_time:
-	while (p != end) {
-		/* Fold 128 bits into next 128 bits */
-		x1 = *p++;
-		x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00);
-		x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11);
-		x0 = x1;
-	}
-
-	/* Now there are just 128 bits left, stored in 'x0'. */
-
-	/*
-	 * Fold 128 => 96 bits.  This also implicitly appends 32 zero bits,
-	 * which is equivalent to multiplying by x^32.  This is needed because
-	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
-	 */
-	x0 = _mm_srli_si128(x0, 8) ^
-	     _mm_clmulepi64_si128(x0, multipliers_1, 0x10);
-
-	/* Fold 96 => 64 bits */
-	x0 = _mm_srli_si128(x0, 4) ^
-	     _mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00);
-
-	/*
-	 * Finally, reduce 64 => 32 bits using Barrett reduction.
-	 *
-	 * Let M(x) = A(x)*x^32 + B(x) be the remaining message.  The goal is to
-	 * compute R(x) = M(x) mod G(x).  Since degree(B(x)) < degree(G(x)):
-	 *
-	 *	R(x) = (A(x)*x^32 + B(x)) mod G(x)
-	 *	     = (A(x)*x^32) mod G(x) + B(x)
-	 *
-	 * Then, by the Division Algorithm there exists a unique q(x) such that:
-	 *
-	 *	A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
-	 *
-	 * Since the left-hand side is of maximum degree 31, the right-hand side
-	 * must be too.  This implies that we can apply 'mod x^32' to the
-	 * right-hand side without changing its value:
-	 *
-	 *	(A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
-	 *
-	 * Note that '+' is equivalent to '-' in polynomials over GF(2).
-	 *
-	 * We also know that:
-	 *
-	 *	              / A(x)*x^32 \
-	 *	q(x) = floor ( ----------- )
-	 *	              \    G(x)   /
-	 *
-	 * To compute this efficiently, we can multiply the top and bottom by
-	 * x^32 and move the division by G(x) to the top:
-	 *
-	 *	              / A(x) * floor(x^64 / G(x)) \
-	 *	q(x) = floor ( --------------------------- )
-	 *	              \            x^32           /
-	 *
-	 * Note that floor(x^64 / G(x)) is a constant.
-	 *
-	 * So finally we have:
-	 *
-	 *	                          / A(x) * floor(x^64 / G(x)) \
-	 *	R(x) = B(x) + G(x)*floor ( --------------------------- )
-	 *	                          \            x^32           /
-	 */
-	x1 = x0;
-	x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00);
-	x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10);
-	return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4));
-}
-
-/*
- * Fast CRC-32 implementation for x86_64 processors that have the carryless
- * multiplication extension (PCLMUL).
- *
- * Note: on unaligned ends of the buffer, we fall back to crc32_slice1() instead
- * of crc32_slice8() because only a few bytes need to be processed, so a smaller
- * table is preferable.
- */
-static u32 ATTRIBUTES
-FUNCNAME(u32 remainder, const u8 *buffer, size_t nbytes)
-{
-	if ((uintptr_t)buffer & 15) {
-		size_t n = MIN(nbytes, -(uintptr_t)buffer & 15);
-		remainder = crc32_slice1(remainder, buffer, n);
-		buffer += n;
-		nbytes -= n;
-	}
-	if (nbytes >= 16) {
-		remainder = FUNCNAME_ALIGNED(remainder, (const __m128i *)buffer,
-					     nbytes / 16);
-		buffer += nbytes & ~15;
-		nbytes &= 15;
-	}
-	return crc32_slice1(remainder, buffer, nbytes);
-}
-
-#undef FUNCNAME
-#undef FUNCNAME_ALIGNED
-#undef ATTRIBUTES