deflate-ruby 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CLAUDE.md +138 -0
- data/LICENSE.txt +21 -0
- data/README.md +117 -0
- data/ext/deflate_ruby/deflate_ruby.c +301 -0
- data/ext/deflate_ruby/extconf.rb +34 -0
- data/ext/deflate_ruby/libdeflate/CMakeLists.txt +270 -0
- data/ext/deflate_ruby/libdeflate/COPYING +22 -0
- data/ext/deflate_ruby/libdeflate/NEWS.md +494 -0
- data/ext/deflate_ruby/libdeflate/README.md +228 -0
- data/ext/deflate_ruby/libdeflate/common_defs.h +747 -0
- data/ext/deflate_ruby/libdeflate/lib/adler32.c +162 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/adler32_impl.h +358 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.c +230 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.h +214 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/crc32_impl.h +600 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_helpers.h +156 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_wide.h +226 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/matchfinder_impl.h +78 -0
- data/ext/deflate_ruby/libdeflate/lib/bt_matchfinder.h +342 -0
- data/ext/deflate_ruby/libdeflate/lib/cpu_features_common.h +93 -0
- data/ext/deflate_ruby/libdeflate/lib/crc32.c +262 -0
- data/ext/deflate_ruby/libdeflate/lib/crc32_multipliers.h +377 -0
- data/ext/deflate_ruby/libdeflate/lib/crc32_tables.h +587 -0
- data/ext/deflate_ruby/libdeflate/lib/decompress_template.h +777 -0
- data/ext/deflate_ruby/libdeflate/lib/deflate_compress.c +4129 -0
- data/ext/deflate_ruby/libdeflate/lib/deflate_compress.h +15 -0
- data/ext/deflate_ruby/libdeflate/lib/deflate_constants.h +56 -0
- data/ext/deflate_ruby/libdeflate/lib/deflate_decompress.c +1208 -0
- data/ext/deflate_ruby/libdeflate/lib/gzip_compress.c +90 -0
- data/ext/deflate_ruby/libdeflate/lib/gzip_constants.h +45 -0
- data/ext/deflate_ruby/libdeflate/lib/gzip_decompress.c +144 -0
- data/ext/deflate_ruby/libdeflate/lib/hc_matchfinder.h +401 -0
- data/ext/deflate_ruby/libdeflate/lib/ht_matchfinder.h +234 -0
- data/ext/deflate_ruby/libdeflate/lib/lib_common.h +106 -0
- data/ext/deflate_ruby/libdeflate/lib/matchfinder_common.h +224 -0
- data/ext/deflate_ruby/libdeflate/lib/riscv/matchfinder_impl.h +97 -0
- data/ext/deflate_ruby/libdeflate/lib/utils.c +141 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/adler32_impl.h +134 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/adler32_template.h +518 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.c +183 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.h +169 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/crc32_impl.h +160 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/crc32_pclmul_template.h +495 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/decompress_impl.h +57 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/matchfinder_impl.h +122 -0
- data/ext/deflate_ruby/libdeflate/lib/zlib_compress.c +82 -0
- data/ext/deflate_ruby/libdeflate/lib/zlib_constants.h +21 -0
- data/ext/deflate_ruby/libdeflate/lib/zlib_decompress.c +104 -0
- data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +3 -0
- data/ext/deflate_ruby/libdeflate/libdeflate.h +411 -0
- data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +18 -0
- data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +105 -0
- data/ext/deflate_ruby/libdeflate/programs/benchmark.c +696 -0
- data/ext/deflate_ruby/libdeflate/programs/checksum.c +218 -0
- data/ext/deflate_ruby/libdeflate/programs/config.h.in +19 -0
- data/ext/deflate_ruby/libdeflate/programs/gzip.c +688 -0
- data/ext/deflate_ruby/libdeflate/programs/prog_util.c +521 -0
- data/ext/deflate_ruby/libdeflate/programs/prog_util.h +225 -0
- data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +200 -0
- data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +155 -0
- data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +385 -0
- data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +130 -0
- data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +72 -0
- data/ext/deflate_ruby/libdeflate/programs/test_overread.c +95 -0
- data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +472 -0
- data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +151 -0
- data/ext/deflate_ruby/libdeflate/programs/test_util.c +237 -0
- data/ext/deflate_ruby/libdeflate/programs/test_util.h +61 -0
- data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +118 -0
- data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +118 -0
- data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +69 -0
- data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +10 -0
- data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +10 -0
- data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +253 -0
- data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +17 -0
- data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +119 -0
- data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +38 -0
- data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +37 -0
- data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +19 -0
- data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +199 -0
- data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +105 -0
- data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +44 -0
- data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +29 -0
- data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +523 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +95 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +3 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +62 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +108 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +19 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +3 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +19 -0
- data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +416 -0
- data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +8 -0
- data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +8 -0
- data/lib/deflate_ruby/version.rb +5 -0
- data/lib/deflate_ruby.rb +71 -0
- metadata +191 -0
data/ext/deflate_ruby/libdeflate/lib/adler32.c
@@ -0,0 +1,162 @@
+/*
+ * adler32.c - Adler-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "lib_common.h"
+
+/* The Adler-32 divisor, or "base", value */
+#define DIVISOR 65521
+
+/*
+ * MAX_CHUNK_LEN is the most bytes that can be processed without the possibility
+ * of s2 overflowing when it is represented as an unsigned 32-bit integer. This
+ * value was computed using the following Python script:
+ *
+ *	divisor = 65521
+ *	count = 0
+ *	s1 = divisor - 1
+ *	s2 = divisor - 1
+ *	while True:
+ *		s1 += 0xFF
+ *		s2 += s1
+ *		if s2 > 0xFFFFFFFF:
+ *			break
+ *		count += 1
+ *	print(count)
+ *
+ * Note that to get the correct worst-case value, we must assume that every byte
+ * has value 0xFF and that s1 and s2 started with the highest possible values
+ * modulo the divisor.
+ */
+#define MAX_CHUNK_LEN	5552
+
+/*
+ * Update the Adler-32 values s1 and s2 using n bytes from p, update p to p + n,
+ * update n to 0, and reduce s1 and s2 mod DIVISOR. It is assumed that neither
+ * s1 nor s2 can overflow before the reduction at the end, i.e. n plus any bytes
+ * already processed after the last reduction must not exceed MAX_CHUNK_LEN.
+ *
+ * This uses only portable C code. This is used as a fallback when a vectorized
+ * implementation of Adler-32 (e.g. AVX2) is unavailable on the platform.
+ *
+ * Some of the vectorized implementations also use this to handle the end of the
+ * data when the data isn't evenly divisible by the length the vectorized code
+ * works on. To avoid compiler errors about target-specific option mismatches
+ * when this is used in that way, this is a macro rather than a function.
+ *
+ * Although this is unvectorized, this does include an optimization where the
+ * main loop processes four bytes at a time using a strategy similar to that
+ * used by vectorized implementations. This provides increased instruction-
+ * level parallelism compared to the traditional 's1 += *p++; s2 += s1;'.
+ */
+#define ADLER32_CHUNK(s1, s2, p, n) \
+do { \
+	if (n >= 4) { \
+		u32 s1_sum = 0; \
+		u32 byte_0_sum = 0; \
+		u32 byte_1_sum = 0; \
+		u32 byte_2_sum = 0; \
+		u32 byte_3_sum = 0; \
+		\
+		do { \
+			s1_sum += s1; \
+			s1 += p[0] + p[1] + p[2] + p[3]; \
+			byte_0_sum += p[0]; \
+			byte_1_sum += p[1]; \
+			byte_2_sum += p[2]; \
+			byte_3_sum += p[3]; \
+			p += 4; \
+			n -= 4; \
+		} while (n >= 4); \
+		s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) + \
+		      (2 * byte_2_sum) + byte_3_sum; \
+	} \
+	for (; n; n--, p++) { \
+		s1 += *p; \
+		s2 += s1; \
+	} \
+	s1 %= DIVISOR; \
+	s2 %= DIVISOR; \
+} while (0)
+
+static u32 MAYBE_UNUSED
+adler32_generic(u32 adler, const u8 *p, size_t len)
+{
+	u32 s1 = adler & 0xFFFF;
+	u32 s2 = adler >> 16;
+
+	while (len) {
+		size_t n = MIN(len, MAX_CHUNK_LEN & ~3);
+
+		len -= n;
+		ADLER32_CHUNK(s1, s2, p, n);
+	}
+
+	return (s2 << 16) | s1;
+}
+
+/* Include architecture-specific implementation(s) if available. */
+#undef DEFAULT_IMPL
+#undef arch_select_adler32_func
+typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len);
+#if defined(ARCH_ARM32) || defined(ARCH_ARM64)
+# include "arm/adler32_impl.h"
+#elif defined(ARCH_X86_32) || defined(ARCH_X86_64)
+# include "x86/adler32_impl.h"
+#endif
+
+#ifndef DEFAULT_IMPL
+# define DEFAULT_IMPL adler32_generic
+#endif
+
+#ifdef arch_select_adler32_func
+static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len);
+
+static volatile adler32_func_t adler32_impl = dispatch_adler32;
+
+/* Choose the best implementation at runtime. */
+static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len)
+{
+	adler32_func_t f = arch_select_adler32_func();
+
+	if (f == NULL)
+		f = DEFAULT_IMPL;
+
+	adler32_impl = f;
+	return f(adler, p, len);
+}
+#else
+/* The best implementation is statically known, so call it directly. */
+#define adler32_impl DEFAULT_IMPL
+#endif
+
+LIBDEFLATEAPI u32
+libdeflate_adler32(u32 adler, const void *buffer, size_t len)
+{
+	if (buffer == NULL) /* Return initial value. */
+		return 1;
+	return adler32_impl(adler, buffer, len);
+}
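The hunk above is the complete lib/adler32.c: the portable ADLER32_CHUNK kernel, an optional architecture-specific override pulled in through DEFAULT_IMPL and arch_select_adler32_func, and the exported entry point libdeflate_adler32(), which returns the initial value 1 when buffer is NULL and otherwise continues a running checksum. As a rough caller sketch (not part of this package; it assumes a program compiled and linked against libdeflate, with libdeflate.h on the include path), the checksum can be built up incrementally:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <libdeflate.h>

    int main(void)
    {
            const char *msg = "hello world";
            /* A NULL buffer returns the initial value (1), per adler32.c above. */
            uint32_t adler = libdeflate_adler32(0, NULL, 0);

            /* Feed the running value back in to checksum the data in pieces. */
            adler = libdeflate_adler32(adler, msg, 5);                   /* "hello"  */
            adler = libdeflate_adler32(adler, msg + 5, strlen(msg) - 5); /* " world" */

            printf("adler32 = 0x%08x\n", (unsigned)adler); /* 0x1a0b045d */
            return 0;
    }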
data/ext/deflate_ruby/libdeflate/lib/arm/adler32_impl.h
@@ -0,0 +1,358 @@
+/*
+ * arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_ARM_ADLER32_IMPL_H
+#define LIB_ARM_ADLER32_IMPL_H
+
+#include "cpu_features.h"
+
+/* Regular NEON implementation */
+#if HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN()
+# define adler32_arm_neon adler32_arm_neon
+# if HAVE_NEON_NATIVE
+/*
+ * Use no attributes if none are needed, to support old versions of clang
+ * that don't accept the simd target attribute.
+ */
+# define ATTRIBUTES
+# elif defined(ARCH_ARM32)
+# define ATTRIBUTES _target_attribute("fpu=neon")
+# elif defined(__clang__)
+# define ATTRIBUTES _target_attribute("simd")
+# else
+# define ATTRIBUTES _target_attribute("+simd")
+# endif
+static ATTRIBUTES MAYBE_UNUSED u32
+adler32_arm_neon(u32 adler, const u8 *p, size_t len)
+{
+	static const u16 _aligned_attribute(16) mults[64] = {
+		64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+		48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+		32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+		16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+	};
+	const uint16x8_t mults_a = vld1q_u16(&mults[0]);
+	const uint16x8_t mults_b = vld1q_u16(&mults[8]);
+	const uint16x8_t mults_c = vld1q_u16(&mults[16]);
+	const uint16x8_t mults_d = vld1q_u16(&mults[24]);
+	const uint16x8_t mults_e = vld1q_u16(&mults[32]);
+	const uint16x8_t mults_f = vld1q_u16(&mults[40]);
+	const uint16x8_t mults_g = vld1q_u16(&mults[48]);
+	const uint16x8_t mults_h = vld1q_u16(&mults[56]);
+	u32 s1 = adler & 0xFFFF;
+	u32 s2 = adler >> 16;
+
+	/*
+	 * If the length is large and the pointer is misaligned, align it.
+	 * For smaller lengths, just take the misaligned load penalty.
+	 */
+	if (unlikely(len > 32768 && ((uintptr_t)p & 15))) {
+		do {
+			s1 += *p++;
+			s2 += s1;
+			len--;
+		} while ((uintptr_t)p & 15);
+		s1 %= DIVISOR;
+		s2 %= DIVISOR;
+	}
+
+	while (len) {
+		/*
+		 * Calculate the length of the next data chunk such that s1 and
+		 * s2 are guaranteed to not exceed UINT32_MAX.
+		 */
+		size_t n = MIN(len, MAX_CHUNK_LEN & ~63);
+
+		len -= n;
+
+		if (n >= 64) {
+			uint32x4_t v_s1 = vdupq_n_u32(0);
+			uint32x4_t v_s2 = vdupq_n_u32(0);
+			/*
+			 * v_byte_sums_* contain the sum of the bytes at index i
+			 * across all 64-byte segments, for each index 0..63.
+			 */
+			uint16x8_t v_byte_sums_a = vdupq_n_u16(0);
+			uint16x8_t v_byte_sums_b = vdupq_n_u16(0);
+			uint16x8_t v_byte_sums_c = vdupq_n_u16(0);
+			uint16x8_t v_byte_sums_d = vdupq_n_u16(0);
+			uint16x8_t v_byte_sums_e = vdupq_n_u16(0);
+			uint16x8_t v_byte_sums_f = vdupq_n_u16(0);
+			uint16x8_t v_byte_sums_g = vdupq_n_u16(0);
+			uint16x8_t v_byte_sums_h = vdupq_n_u16(0);
+
+			s2 += s1 * (n & ~63);
+
+			do {
+				/* Load the next 64 data bytes. */
+				const uint8x16_t data_a = vld1q_u8(p + 0);
+				const uint8x16_t data_b = vld1q_u8(p + 16);
+				const uint8x16_t data_c = vld1q_u8(p + 32);
+				const uint8x16_t data_d = vld1q_u8(p + 48);
+				uint16x8_t tmp;
+
+				/*
+				 * Accumulate the previous s1 counters into the
+				 * s2 counters. The needed multiplication by 64
+				 * is delayed to later.
+				 */
+				v_s2 = vaddq_u32(v_s2, v_s1);
+
+				/*
+				 * Add the 64 data bytes to their v_byte_sums
+				 * counters, while also accumulating the sums of
+				 * each adjacent set of 4 bytes into v_s1.
+				 */
+				tmp = vpaddlq_u8(data_a);
+				v_byte_sums_a = vaddw_u8(v_byte_sums_a,
+							 vget_low_u8(data_a));
+				v_byte_sums_b = vaddw_u8(v_byte_sums_b,
+							 vget_high_u8(data_a));
+				tmp = vpadalq_u8(tmp, data_b);
+				v_byte_sums_c = vaddw_u8(v_byte_sums_c,
+							 vget_low_u8(data_b));
+				v_byte_sums_d = vaddw_u8(v_byte_sums_d,
+							 vget_high_u8(data_b));
+				tmp = vpadalq_u8(tmp, data_c);
+				v_byte_sums_e = vaddw_u8(v_byte_sums_e,
+							 vget_low_u8(data_c));
+				v_byte_sums_f = vaddw_u8(v_byte_sums_f,
+							 vget_high_u8(data_c));
+				tmp = vpadalq_u8(tmp, data_d);
+				v_byte_sums_g = vaddw_u8(v_byte_sums_g,
+							 vget_low_u8(data_d));
+				v_byte_sums_h = vaddw_u8(v_byte_sums_h,
+							 vget_high_u8(data_d));
+				v_s1 = vpadalq_u16(v_s1, tmp);
+
+				p += 64;
+				n -= 64;
+			} while (n >= 64);
+
+			/* s2 = 64*s2 + (64*bytesum0 + 63*bytesum1 + ... + 1*bytesum63) */
+#ifdef ARCH_ARM32
+# define umlal2(a, b, c) vmlal_u16((a), vget_high_u16(b), vget_high_u16(c))
+#else
+# define umlal2 vmlal_high_u16
+#endif
+			v_s2 = vqshlq_n_u32(v_s2, 6);
+			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a),
+					 vget_low_u16(mults_a));
+			v_s2 = umlal2(v_s2, v_byte_sums_a, mults_a);
+			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b),
+					 vget_low_u16(mults_b));
+			v_s2 = umlal2(v_s2, v_byte_sums_b, mults_b);
+			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c),
+					 vget_low_u16(mults_c));
+			v_s2 = umlal2(v_s2, v_byte_sums_c, mults_c);
+			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d),
+					 vget_low_u16(mults_d));
+			v_s2 = umlal2(v_s2, v_byte_sums_d, mults_d);
+			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_e),
+					 vget_low_u16(mults_e));
+			v_s2 = umlal2(v_s2, v_byte_sums_e, mults_e);
+			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_f),
+					 vget_low_u16(mults_f));
+			v_s2 = umlal2(v_s2, v_byte_sums_f, mults_f);
+			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_g),
+					 vget_low_u16(mults_g));
+			v_s2 = umlal2(v_s2, v_byte_sums_g, mults_g);
+			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_h),
+					 vget_low_u16(mults_h));
+			v_s2 = umlal2(v_s2, v_byte_sums_h, mults_h);
+#undef umlal2
+
+			/* Horizontal sum to finish up */
+#ifdef ARCH_ARM32
+			s1 += vgetq_lane_u32(v_s1, 0) + vgetq_lane_u32(v_s1, 1) +
+			      vgetq_lane_u32(v_s1, 2) + vgetq_lane_u32(v_s1, 3);
+			s2 += vgetq_lane_u32(v_s2, 0) + vgetq_lane_u32(v_s2, 1) +
+			      vgetq_lane_u32(v_s2, 2) + vgetq_lane_u32(v_s2, 3);
+#else
+			s1 += vaddvq_u32(v_s1);
+			s2 += vaddvq_u32(v_s2);
+#endif
+		}
+		/*
+		 * Process the last 0 <= n < 64 bytes of the chunk using
+		 * scalar instructions and reduce s1 and s2 mod DIVISOR.
+		 */
+		ADLER32_CHUNK(s1, s2, p, n);
+	}
+	return (s2 << 16) | s1;
+}
+#undef ATTRIBUTES
+#endif /* Regular NEON implementation */
+
+/* NEON+dotprod implementation */
+#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN()
+# define adler32_arm_neon_dotprod adler32_arm_neon_dotprod
+# ifdef __clang__
+# define ATTRIBUTES _target_attribute("dotprod")
+/*
+ * With gcc 13.1 and earlier (before gcc commit 73d3bc348190 or 9aac37ab8a7b,
+ * "aarch64: Remove architecture dependencies from intrinsics"),
+ * arch=armv8.2-a is needed for the dotprod intrinsics, unless the default
+ * target is armv8.3-a or later in which case it must be omitted. armv8.3-a
+ * or later can be detected by checking for __ARM_FEATURE_JCVT.
+ */
+# elif GCC_PREREQ(13, 2) || defined(__ARM_FEATURE_JCVT)
+# define ATTRIBUTES _target_attribute("+dotprod")
+# else
+# define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod")
+# endif
+static ATTRIBUTES u32
+adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len)
+{
+	static const u8 _aligned_attribute(16) mults[64] = {
+		64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+		48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+		32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+		16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+	};
+	const uint8x16_t mults_a = vld1q_u8(&mults[0]);
+	const uint8x16_t mults_b = vld1q_u8(&mults[16]);
+	const uint8x16_t mults_c = vld1q_u8(&mults[32]);
+	const uint8x16_t mults_d = vld1q_u8(&mults[48]);
+	const uint8x16_t ones = vdupq_n_u8(1);
+	u32 s1 = adler & 0xFFFF;
+	u32 s2 = adler >> 16;
+
+	/*
+	 * If the length is large and the pointer is misaligned, align it.
+	 * For smaller lengths, just take the misaligned load penalty.
+	 */
+	if (unlikely(len > 32768 && ((uintptr_t)p & 15))) {
+		do {
+			s1 += *p++;
+			s2 += s1;
+			len--;
+		} while ((uintptr_t)p & 15);
+		s1 %= DIVISOR;
+		s2 %= DIVISOR;
+	}
+
+	while (len) {
+		/*
+		 * Calculate the length of the next data chunk such that s1 and
+		 * s2 are guaranteed to not exceed UINT32_MAX.
+		 */
+		size_t n = MIN(len, MAX_CHUNK_LEN & ~63);
+
+		len -= n;
+
+		if (n >= 64) {
+			uint32x4_t v_s1_a = vdupq_n_u32(0);
+			uint32x4_t v_s1_b = vdupq_n_u32(0);
+			uint32x4_t v_s1_c = vdupq_n_u32(0);
+			uint32x4_t v_s1_d = vdupq_n_u32(0);
+			uint32x4_t v_s2_a = vdupq_n_u32(0);
+			uint32x4_t v_s2_b = vdupq_n_u32(0);
+			uint32x4_t v_s2_c = vdupq_n_u32(0);
+			uint32x4_t v_s2_d = vdupq_n_u32(0);
+			uint32x4_t v_s1_sums_a = vdupq_n_u32(0);
+			uint32x4_t v_s1_sums_b = vdupq_n_u32(0);
+			uint32x4_t v_s1_sums_c = vdupq_n_u32(0);
+			uint32x4_t v_s1_sums_d = vdupq_n_u32(0);
+			uint32x4_t v_s1;
+			uint32x4_t v_s2;
+			uint32x4_t v_s1_sums;
+
+			s2 += s1 * (n & ~63);
+
+			do {
+				uint8x16_t data_a = vld1q_u8(p + 0);
+				uint8x16_t data_b = vld1q_u8(p + 16);
+				uint8x16_t data_c = vld1q_u8(p + 32);
+				uint8x16_t data_d = vld1q_u8(p + 48);
+
+				v_s1_sums_a = vaddq_u32(v_s1_sums_a, v_s1_a);
+				v_s1_a = vdotq_u32(v_s1_a, data_a, ones);
+				v_s2_a = vdotq_u32(v_s2_a, data_a, mults_a);
+
+				v_s1_sums_b = vaddq_u32(v_s1_sums_b, v_s1_b);
+				v_s1_b = vdotq_u32(v_s1_b, data_b, ones);
+				v_s2_b = vdotq_u32(v_s2_b, data_b, mults_b);
+
+				v_s1_sums_c = vaddq_u32(v_s1_sums_c, v_s1_c);
+				v_s1_c = vdotq_u32(v_s1_c, data_c, ones);
+				v_s2_c = vdotq_u32(v_s2_c, data_c, mults_c);
+
+				v_s1_sums_d = vaddq_u32(v_s1_sums_d, v_s1_d);
+				v_s1_d = vdotq_u32(v_s1_d, data_d, ones);
+				v_s2_d = vdotq_u32(v_s2_d, data_d, mults_d);
+
+				p += 64;
+				n -= 64;
+			} while (n >= 64);
+
+			v_s1 = vaddq_u32(vaddq_u32(v_s1_a, v_s1_b),
+					 vaddq_u32(v_s1_c, v_s1_d));
+			v_s2 = vaddq_u32(vaddq_u32(v_s2_a, v_s2_b),
+					 vaddq_u32(v_s2_c, v_s2_d));
+			v_s1_sums = vaddq_u32(vaddq_u32(v_s1_sums_a,
+							v_s1_sums_b),
+					      vaddq_u32(v_s1_sums_c,
+							v_s1_sums_d));
+			v_s2 = vaddq_u32(v_s2, vqshlq_n_u32(v_s1_sums, 6));
+
+			s1 += vaddvq_u32(v_s1);
+			s2 += vaddvq_u32(v_s2);
+		}
+		/*
+		 * Process the last 0 <= n < 64 bytes of the chunk using
+		 * scalar instructions and reduce s1 and s2 mod DIVISOR.
+		 */
+		ADLER32_CHUNK(s1, s2, p, n);
+	}
+	return (s2 << 16) | s1;
+}
+#undef ATTRIBUTES
+#endif /* NEON+dotprod implementation */
+
+#if defined(adler32_arm_neon_dotprod) && defined(__ARM_FEATURE_DOTPROD)
+#define DEFAULT_IMPL adler32_arm_neon_dotprod
+#else
+static inline adler32_func_t
+arch_select_adler32_func(void)
+{
+	const u32 features MAYBE_UNUSED = get_arm_cpu_features();
+
+#ifdef adler32_arm_neon_dotprod
+	if (HAVE_NEON(features) && HAVE_DOTPROD(features))
+		return adler32_arm_neon_dotprod;
+#endif
+#ifdef adler32_arm_neon
+	if (HAVE_NEON(features))
+		return adler32_arm_neon;
+#endif
+	return NULL;
+}
+#define arch_select_adler32_func arch_select_adler32_func
+#endif
+
+#endif /* LIB_ARM_ADLER32_IMPL_H */
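Together with adler32.c above, this header either fixes DEFAULT_IMPL at compile time (when dotprod support is guaranteed by __ARM_FEATURE_DOTPROD) or defines arch_select_adler32_func(), which dispatch_adler32() in adler32.c calls exactly once to probe the CPU, cache the chosen kernel in a volatile function pointer, and call it. A self-contained sketch of that dispatch-once pattern, using illustrative names rather than the library's own:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t (*checksum_func_t)(uint32_t start, const uint8_t *p, size_t len);

    /* Portable fallback kernel (stands in for adler32_generic). */
    static uint32_t checksum_portable(uint32_t start, const uint8_t *p, size_t len)
    {
            while (len--)
                    start += *p++;
            return start;
    }

    /* Would return a SIMD kernel if the CPU supports one; always NULL in this sketch. */
    static checksum_func_t select_best_checksum(void)
    {
            return NULL;
    }

    static uint32_t dispatch_checksum(uint32_t start, const uint8_t *p, size_t len);

    /* The first call lands in the resolver; later calls use the cached choice. */
    static volatile checksum_func_t checksum_impl = dispatch_checksum;

    static uint32_t dispatch_checksum(uint32_t start, const uint8_t *p, size_t len)
    {
            checksum_func_t f = select_best_checksum();

            if (f == NULL)
                    f = checksum_portable;
            checksum_impl = f; /* cache the decision for subsequent calls */
            return f(start, p, len);
    }

    int main(void)
    {
            const uint8_t data[] = { 1, 2, 3, 4 };

            /* Both calls print 10; only the first goes through the resolver. */
            printf("%u\n", (unsigned)checksum_impl(0, data, sizeof(data)));
            printf("%u\n", (unsigned)checksum_impl(0, data, sizeof(data)));
            return 0;
    }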