deflate-ruby 0.1.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +138 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +117 -0
  5. data/ext/deflate_ruby/deflate_ruby.c +301 -0
  6. data/ext/deflate_ruby/extconf.rb +34 -0
  7. data/ext/deflate_ruby/libdeflate/CMakeLists.txt +270 -0
  8. data/ext/deflate_ruby/libdeflate/COPYING +22 -0
  9. data/ext/deflate_ruby/libdeflate/NEWS.md +494 -0
  10. data/ext/deflate_ruby/libdeflate/README.md +228 -0
  11. data/ext/deflate_ruby/libdeflate/common_defs.h +747 -0
  12. data/ext/deflate_ruby/libdeflate/lib/adler32.c +162 -0
  13. data/ext/deflate_ruby/libdeflate/lib/arm/adler32_impl.h +358 -0
  14. data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.c +230 -0
  15. data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.h +214 -0
  16. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_impl.h +600 -0
  17. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_helpers.h +156 -0
  18. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_wide.h +226 -0
  19. data/ext/deflate_ruby/libdeflate/lib/arm/matchfinder_impl.h +78 -0
  20. data/ext/deflate_ruby/libdeflate/lib/bt_matchfinder.h +342 -0
  21. data/ext/deflate_ruby/libdeflate/lib/cpu_features_common.h +93 -0
  22. data/ext/deflate_ruby/libdeflate/lib/crc32.c +262 -0
  23. data/ext/deflate_ruby/libdeflate/lib/crc32_multipliers.h +377 -0
  24. data/ext/deflate_ruby/libdeflate/lib/crc32_tables.h +587 -0
  25. data/ext/deflate_ruby/libdeflate/lib/decompress_template.h +777 -0
  26. data/ext/deflate_ruby/libdeflate/lib/deflate_compress.c +4129 -0
  27. data/ext/deflate_ruby/libdeflate/lib/deflate_compress.h +15 -0
  28. data/ext/deflate_ruby/libdeflate/lib/deflate_constants.h +56 -0
  29. data/ext/deflate_ruby/libdeflate/lib/deflate_decompress.c +1208 -0
  30. data/ext/deflate_ruby/libdeflate/lib/gzip_compress.c +90 -0
  31. data/ext/deflate_ruby/libdeflate/lib/gzip_constants.h +45 -0
  32. data/ext/deflate_ruby/libdeflate/lib/gzip_decompress.c +144 -0
  33. data/ext/deflate_ruby/libdeflate/lib/hc_matchfinder.h +401 -0
  34. data/ext/deflate_ruby/libdeflate/lib/ht_matchfinder.h +234 -0
  35. data/ext/deflate_ruby/libdeflate/lib/lib_common.h +106 -0
  36. data/ext/deflate_ruby/libdeflate/lib/matchfinder_common.h +224 -0
  37. data/ext/deflate_ruby/libdeflate/lib/riscv/matchfinder_impl.h +97 -0
  38. data/ext/deflate_ruby/libdeflate/lib/utils.c +141 -0
  39. data/ext/deflate_ruby/libdeflate/lib/x86/adler32_impl.h +134 -0
  40. data/ext/deflate_ruby/libdeflate/lib/x86/adler32_template.h +518 -0
  41. data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.c +183 -0
  42. data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.h +169 -0
  43. data/ext/deflate_ruby/libdeflate/lib/x86/crc32_impl.h +160 -0
  44. data/ext/deflate_ruby/libdeflate/lib/x86/crc32_pclmul_template.h +495 -0
  45. data/ext/deflate_ruby/libdeflate/lib/x86/decompress_impl.h +57 -0
  46. data/ext/deflate_ruby/libdeflate/lib/x86/matchfinder_impl.h +122 -0
  47. data/ext/deflate_ruby/libdeflate/lib/zlib_compress.c +82 -0
  48. data/ext/deflate_ruby/libdeflate/lib/zlib_constants.h +21 -0
  49. data/ext/deflate_ruby/libdeflate/lib/zlib_decompress.c +104 -0
  50. data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +3 -0
  51. data/ext/deflate_ruby/libdeflate/libdeflate.h +411 -0
  52. data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +18 -0
  53. data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +105 -0
  54. data/ext/deflate_ruby/libdeflate/programs/benchmark.c +696 -0
  55. data/ext/deflate_ruby/libdeflate/programs/checksum.c +218 -0
  56. data/ext/deflate_ruby/libdeflate/programs/config.h.in +19 -0
  57. data/ext/deflate_ruby/libdeflate/programs/gzip.c +688 -0
  58. data/ext/deflate_ruby/libdeflate/programs/prog_util.c +521 -0
  59. data/ext/deflate_ruby/libdeflate/programs/prog_util.h +225 -0
  60. data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +200 -0
  61. data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +155 -0
  62. data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +385 -0
  63. data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +130 -0
  64. data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +72 -0
  65. data/ext/deflate_ruby/libdeflate/programs/test_overread.c +95 -0
  66. data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +472 -0
  67. data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +151 -0
  68. data/ext/deflate_ruby/libdeflate/programs/test_util.c +237 -0
  69. data/ext/deflate_ruby/libdeflate/programs/test_util.h +61 -0
  70. data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +118 -0
  71. data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +118 -0
  72. data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +69 -0
  73. data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +10 -0
  74. data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +10 -0
  75. data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +253 -0
  76. data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +17 -0
  77. data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +119 -0
  78. data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +38 -0
  79. data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +37 -0
  80. data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +19 -0
  81. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +199 -0
  82. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +105 -0
  83. data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +44 -0
  84. data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +29 -0
  85. data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +523 -0
  86. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
  87. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +95 -0
  88. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +3 -0
  89. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +62 -0
  90. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +108 -0
  91. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
  92. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +19 -0
  93. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +3 -0
  94. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +19 -0
  95. data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +416 -0
  96. data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +8 -0
  97. data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +8 -0
  98. data/lib/deflate_ruby/version.rb +5 -0
  99. data/lib/deflate_ruby.rb +71 -0
  100. metadata +191 -0
data/ext/deflate_ruby/libdeflate/lib/adler32.c
@@ -0,0 +1,162 @@
+ /*
+ * adler32.c - Adler-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ #include "lib_common.h"
+
+ /* The Adler-32 divisor, or "base", value */
+ #define DIVISOR 65521
+
+ /*
+ * MAX_CHUNK_LEN is the most bytes that can be processed without the possibility
+ * of s2 overflowing when it is represented as an unsigned 32-bit integer. This
+ * value was computed using the following Python script:
+ *
+ * divisor = 65521
+ * count = 0
+ * s1 = divisor - 1
+ * s2 = divisor - 1
+ * while True:
+ * s1 += 0xFF
+ * s2 += s1
+ * if s2 > 0xFFFFFFFF:
+ * break
+ * count += 1
+ * print(count)
+ *
+ * Note that to get the correct worst-case value, we must assume that every byte
+ * has value 0xFF and that s1 and s2 started with the highest possible values
+ * modulo the divisor.
+ */
+ #define MAX_CHUNK_LEN 5552
+
+ /*
+ * Update the Adler-32 values s1 and s2 using n bytes from p, update p to p + n,
+ * update n to 0, and reduce s1 and s2 mod DIVISOR. It is assumed that neither
+ * s1 nor s2 can overflow before the reduction at the end, i.e. n plus any bytes
+ * already processed after the last reduction must not exceed MAX_CHUNK_LEN.
+ *
+ * This uses only portable C code. This is used as a fallback when a vectorized
+ * implementation of Adler-32 (e.g. AVX2) is unavailable on the platform.
+ *
+ * Some of the vectorized implementations also use this to handle the end of the
+ * data when the data isn't evenly divisible by the length the vectorized code
+ * works on. To avoid compiler errors about target-specific option mismatches
+ * when this is used in that way, this is a macro rather than a function.
+ *
+ * Although this is unvectorized, this does include an optimization where the
+ * main loop processes four bytes at a time using a strategy similar to that
+ * used by vectorized implementations. This provides increased instruction-
+ * level parallelism compared to the traditional 's1 += *p++; s2 += s1;'.
+ */
+ #define ADLER32_CHUNK(s1, s2, p, n) \
+ do { \
+ if (n >= 4) { \
+ u32 s1_sum = 0; \
+ u32 byte_0_sum = 0; \
+ u32 byte_1_sum = 0; \
+ u32 byte_2_sum = 0; \
+ u32 byte_3_sum = 0; \
+ \
+ do { \
+ s1_sum += s1; \
+ s1 += p[0] + p[1] + p[2] + p[3]; \
+ byte_0_sum += p[0]; \
+ byte_1_sum += p[1]; \
+ byte_2_sum += p[2]; \
+ byte_3_sum += p[3]; \
+ p += 4; \
+ n -= 4; \
+ } while (n >= 4); \
+ s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) + \
+ (2 * byte_2_sum) + byte_3_sum; \
+ } \
+ for (; n; n--, p++) { \
+ s1 += *p; \
+ s2 += s1; \
+ } \
+ s1 %= DIVISOR; \
+ s2 %= DIVISOR; \
+ } while (0)
+
+ static u32 MAYBE_UNUSED
+ adler32_generic(u32 adler, const u8 *p, size_t len)
+ {
+ u32 s1 = adler & 0xFFFF;
+ u32 s2 = adler >> 16;
+
+ while (len) {
+ size_t n = MIN(len, MAX_CHUNK_LEN & ~3);
+
+ len -= n;
+ ADLER32_CHUNK(s1, s2, p, n);
+ }
+
+ return (s2 << 16) | s1;
+ }
+
+ /* Include architecture-specific implementation(s) if available. */
+ #undef DEFAULT_IMPL
+ #undef arch_select_adler32_func
+ typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len);
+ #if defined(ARCH_ARM32) || defined(ARCH_ARM64)
+ # include "arm/adler32_impl.h"
+ #elif defined(ARCH_X86_32) || defined(ARCH_X86_64)
+ # include "x86/adler32_impl.h"
+ #endif
+
+ #ifndef DEFAULT_IMPL
+ # define DEFAULT_IMPL adler32_generic
+ #endif
+
+ #ifdef arch_select_adler32_func
+ static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len);
+
+ static volatile adler32_func_t adler32_impl = dispatch_adler32;
+
+ /* Choose the best implementation at runtime. */
+ static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len)
+ {
+ adler32_func_t f = arch_select_adler32_func();
+
+ if (f == NULL)
+ f = DEFAULT_IMPL;
+
+ adler32_impl = f;
+ return f(adler, p, len);
+ }
+ #else
+ /* The best implementation is statically known, so call it directly. */
+ #define adler32_impl DEFAULT_IMPL
+ #endif
+
+ LIBDEFLATEAPI u32
+ libdeflate_adler32(u32 adler, const void *buffer, size_t len)
+ {
+ if (buffer == NULL) /* Return initial value. */
+ return 1;
+ return adler32_impl(adler, buffer, len);
+ }
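
The exported entry point above, libdeflate_adler32(), is the only part of this file a caller touches: passing a NULL buffer returns the Adler-32 initial value (1), which is then fed back in to checksum data incrementally. As a minimal sketch (not part of the packaged sources), assuming the bundled libdeflate is built and linked like a normal C library with libdeflate.h on the include path, a caller could look like this:

```c
/* Hedged sketch: checksums a buffer with libdeflate_adler32() as declared in
 * libdeflate.h. Assumes the library is linked (e.g. -ldeflate); this file is
 * illustrative and not part of the packaged sources. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "libdeflate.h"

int main(void)
{
	const char *msg = "Wikipedia";                    /* classic Adler-32 test string */
	uint32_t adler = libdeflate_adler32(0, NULL, 0);  /* NULL buffer -> initial value 1 */

	/* Data may be fed in one or more pieces; the running value is
	 * passed back in as the first argument. */
	adler = libdeflate_adler32(adler, msg, strlen(msg));

	printf("adler32(\"%s\") = 0x%08x\n", msg, adler); /* expect 0x11e60398 */
	return 0;
}
```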
data/ext/deflate_ruby/libdeflate/lib/arm/adler32_impl.h
@@ -0,0 +1,358 @@
+ /*
+ * arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ #ifndef LIB_ARM_ADLER32_IMPL_H
+ #define LIB_ARM_ADLER32_IMPL_H
+
+ #include "cpu_features.h"
+
+ /* Regular NEON implementation */
+ #if HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN()
+ # define adler32_arm_neon adler32_arm_neon
+ # if HAVE_NEON_NATIVE
+ /*
+ * Use no attributes if none are needed, to support old versions of clang
+ * that don't accept the simd target attribute.
+ */
+ # define ATTRIBUTES
+ # elif defined(ARCH_ARM32)
+ # define ATTRIBUTES _target_attribute("fpu=neon")
+ # elif defined(__clang__)
+ # define ATTRIBUTES _target_attribute("simd")
+ # else
+ # define ATTRIBUTES _target_attribute("+simd")
+ # endif
+ static ATTRIBUTES MAYBE_UNUSED u32
+ adler32_arm_neon(u32 adler, const u8 *p, size_t len)
+ {
+ static const u16 _aligned_attribute(16) mults[64] = {
+ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+ 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ };
+ const uint16x8_t mults_a = vld1q_u16(&mults[0]);
+ const uint16x8_t mults_b = vld1q_u16(&mults[8]);
+ const uint16x8_t mults_c = vld1q_u16(&mults[16]);
+ const uint16x8_t mults_d = vld1q_u16(&mults[24]);
+ const uint16x8_t mults_e = vld1q_u16(&mults[32]);
+ const uint16x8_t mults_f = vld1q_u16(&mults[40]);
+ const uint16x8_t mults_g = vld1q_u16(&mults[48]);
+ const uint16x8_t mults_h = vld1q_u16(&mults[56]);
+ u32 s1 = adler & 0xFFFF;
+ u32 s2 = adler >> 16;
+
+ /*
+ * If the length is large and the pointer is misaligned, align it.
+ * For smaller lengths, just take the misaligned load penalty.
+ */
+ if (unlikely(len > 32768 && ((uintptr_t)p & 15))) {
+ do {
+ s1 += *p++;
+ s2 += s1;
+ len--;
+ } while ((uintptr_t)p & 15);
+ s1 %= DIVISOR;
+ s2 %= DIVISOR;
+ }
+
+ while (len) {
+ /*
+ * Calculate the length of the next data chunk such that s1 and
+ * s2 are guaranteed to not exceed UINT32_MAX.
+ */
+ size_t n = MIN(len, MAX_CHUNK_LEN & ~63);
+
+ len -= n;
+
+ if (n >= 64) {
+ uint32x4_t v_s1 = vdupq_n_u32(0);
+ uint32x4_t v_s2 = vdupq_n_u32(0);
+ /*
+ * v_byte_sums_* contain the sum of the bytes at index i
+ * across all 64-byte segments, for each index 0..63.
+ */
+ uint16x8_t v_byte_sums_a = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_b = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_c = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_d = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_e = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_f = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_g = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_h = vdupq_n_u16(0);
+
+ s2 += s1 * (n & ~63);
+
+ do {
+ /* Load the next 64 data bytes. */
+ const uint8x16_t data_a = vld1q_u8(p + 0);
+ const uint8x16_t data_b = vld1q_u8(p + 16);
+ const uint8x16_t data_c = vld1q_u8(p + 32);
+ const uint8x16_t data_d = vld1q_u8(p + 48);
+ uint16x8_t tmp;
+
+ /*
+ * Accumulate the previous s1 counters into the
+ * s2 counters. The needed multiplication by 64
+ * is delayed to later.
+ */
+ v_s2 = vaddq_u32(v_s2, v_s1);
+
+ /*
+ * Add the 64 data bytes to their v_byte_sums
+ * counters, while also accumulating the sums of
+ * each adjacent set of 4 bytes into v_s1.
+ */
+ tmp = vpaddlq_u8(data_a);
+ v_byte_sums_a = vaddw_u8(v_byte_sums_a,
+ vget_low_u8(data_a));
+ v_byte_sums_b = vaddw_u8(v_byte_sums_b,
+ vget_high_u8(data_a));
+ tmp = vpadalq_u8(tmp, data_b);
+ v_byte_sums_c = vaddw_u8(v_byte_sums_c,
+ vget_low_u8(data_b));
+ v_byte_sums_d = vaddw_u8(v_byte_sums_d,
+ vget_high_u8(data_b));
+ tmp = vpadalq_u8(tmp, data_c);
+ v_byte_sums_e = vaddw_u8(v_byte_sums_e,
+ vget_low_u8(data_c));
+ v_byte_sums_f = vaddw_u8(v_byte_sums_f,
+ vget_high_u8(data_c));
+ tmp = vpadalq_u8(tmp, data_d);
+ v_byte_sums_g = vaddw_u8(v_byte_sums_g,
+ vget_low_u8(data_d));
+ v_byte_sums_h = vaddw_u8(v_byte_sums_h,
+ vget_high_u8(data_d));
+ v_s1 = vpadalq_u16(v_s1, tmp);
+
+ p += 64;
+ n -= 64;
+ } while (n >= 64);
+
+ /* s2 = 64*s2 + (64*bytesum0 + 63*bytesum1 + ... + 1*bytesum63) */
+ #ifdef ARCH_ARM32
+ # define umlal2(a, b, c) vmlal_u16((a), vget_high_u16(b), vget_high_u16(c))
+ #else
+ # define umlal2 vmlal_high_u16
+ #endif
+ v_s2 = vqshlq_n_u32(v_s2, 6);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a),
+ vget_low_u16(mults_a));
+ v_s2 = umlal2(v_s2, v_byte_sums_a, mults_a);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b),
+ vget_low_u16(mults_b));
+ v_s2 = umlal2(v_s2, v_byte_sums_b, mults_b);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c),
+ vget_low_u16(mults_c));
+ v_s2 = umlal2(v_s2, v_byte_sums_c, mults_c);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d),
+ vget_low_u16(mults_d));
+ v_s2 = umlal2(v_s2, v_byte_sums_d, mults_d);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_e),
+ vget_low_u16(mults_e));
+ v_s2 = umlal2(v_s2, v_byte_sums_e, mults_e);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_f),
+ vget_low_u16(mults_f));
+ v_s2 = umlal2(v_s2, v_byte_sums_f, mults_f);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_g),
+ vget_low_u16(mults_g));
+ v_s2 = umlal2(v_s2, v_byte_sums_g, mults_g);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_h),
+ vget_low_u16(mults_h));
+ v_s2 = umlal2(v_s2, v_byte_sums_h, mults_h);
+ #undef umlal2
+
+ /* Horizontal sum to finish up */
+ #ifdef ARCH_ARM32
+ s1 += vgetq_lane_u32(v_s1, 0) + vgetq_lane_u32(v_s1, 1) +
+ vgetq_lane_u32(v_s1, 2) + vgetq_lane_u32(v_s1, 3);
+ s2 += vgetq_lane_u32(v_s2, 0) + vgetq_lane_u32(v_s2, 1) +
+ vgetq_lane_u32(v_s2, 2) + vgetq_lane_u32(v_s2, 3);
+ #else
+ s1 += vaddvq_u32(v_s1);
+ s2 += vaddvq_u32(v_s2);
+ #endif
+ }
+ /*
+ * Process the last 0 <= n < 64 bytes of the chunk using
+ * scalar instructions and reduce s1 and s2 mod DIVISOR.
+ */
+ ADLER32_CHUNK(s1, s2, p, n);
+ }
+ return (s2 << 16) | s1;
+ }
+ #undef ATTRIBUTES
+ #endif /* Regular NEON implementation */
+
+ /* NEON+dotprod implementation */
+ #if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN()
+ # define adler32_arm_neon_dotprod adler32_arm_neon_dotprod
+ # ifdef __clang__
+ # define ATTRIBUTES _target_attribute("dotprod")
+ /*
+ * With gcc 13.1 and earlier (before gcc commit 73d3bc348190 or 9aac37ab8a7b,
+ * "aarch64: Remove architecture dependencies from intrinsics"),
+ * arch=armv8.2-a is needed for the dotprod intrinsics, unless the default
+ * target is armv8.3-a or later in which case it must be omitted. armv8.3-a
+ * or later can be detected by checking for __ARM_FEATURE_JCVT.
+ */
+ # elif GCC_PREREQ(13, 2) || defined(__ARM_FEATURE_JCVT)
+ # define ATTRIBUTES _target_attribute("+dotprod")
+ # else
+ # define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod")
+ # endif
+ static ATTRIBUTES u32
+ adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len)
+ {
+ static const u8 _aligned_attribute(16) mults[64] = {
+ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+ 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ };
+ const uint8x16_t mults_a = vld1q_u8(&mults[0]);
+ const uint8x16_t mults_b = vld1q_u8(&mults[16]);
+ const uint8x16_t mults_c = vld1q_u8(&mults[32]);
+ const uint8x16_t mults_d = vld1q_u8(&mults[48]);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ u32 s1 = adler & 0xFFFF;
+ u32 s2 = adler >> 16;
+
+ /*
+ * If the length is large and the pointer is misaligned, align it.
+ * For smaller lengths, just take the misaligned load penalty.
+ */
+ if (unlikely(len > 32768 && ((uintptr_t)p & 15))) {
+ do {
+ s1 += *p++;
+ s2 += s1;
+ len--;
+ } while ((uintptr_t)p & 15);
+ s1 %= DIVISOR;
+ s2 %= DIVISOR;
+ }
+
+ while (len) {
+ /*
+ * Calculate the length of the next data chunk such that s1 and
+ * s2 are guaranteed to not exceed UINT32_MAX.
+ */
+ size_t n = MIN(len, MAX_CHUNK_LEN & ~63);
+
+ len -= n;
+
+ if (n >= 64) {
+ uint32x4_t v_s1_a = vdupq_n_u32(0);
+ uint32x4_t v_s1_b = vdupq_n_u32(0);
+ uint32x4_t v_s1_c = vdupq_n_u32(0);
+ uint32x4_t v_s1_d = vdupq_n_u32(0);
+ uint32x4_t v_s2_a = vdupq_n_u32(0);
+ uint32x4_t v_s2_b = vdupq_n_u32(0);
+ uint32x4_t v_s2_c = vdupq_n_u32(0);
+ uint32x4_t v_s2_d = vdupq_n_u32(0);
+ uint32x4_t v_s1_sums_a = vdupq_n_u32(0);
+ uint32x4_t v_s1_sums_b = vdupq_n_u32(0);
+ uint32x4_t v_s1_sums_c = vdupq_n_u32(0);
+ uint32x4_t v_s1_sums_d = vdupq_n_u32(0);
+ uint32x4_t v_s1;
+ uint32x4_t v_s2;
+ uint32x4_t v_s1_sums;
+
+ s2 += s1 * (n & ~63);
+
+ do {
+ uint8x16_t data_a = vld1q_u8(p + 0);
+ uint8x16_t data_b = vld1q_u8(p + 16);
+ uint8x16_t data_c = vld1q_u8(p + 32);
+ uint8x16_t data_d = vld1q_u8(p + 48);
+
+ v_s1_sums_a = vaddq_u32(v_s1_sums_a, v_s1_a);
+ v_s1_a = vdotq_u32(v_s1_a, data_a, ones);
+ v_s2_a = vdotq_u32(v_s2_a, data_a, mults_a);
+
+ v_s1_sums_b = vaddq_u32(v_s1_sums_b, v_s1_b);
+ v_s1_b = vdotq_u32(v_s1_b, data_b, ones);
+ v_s2_b = vdotq_u32(v_s2_b, data_b, mults_b);
+
+ v_s1_sums_c = vaddq_u32(v_s1_sums_c, v_s1_c);
+ v_s1_c = vdotq_u32(v_s1_c, data_c, ones);
+ v_s2_c = vdotq_u32(v_s2_c, data_c, mults_c);
+
+ v_s1_sums_d = vaddq_u32(v_s1_sums_d, v_s1_d);
+ v_s1_d = vdotq_u32(v_s1_d, data_d, ones);
+ v_s2_d = vdotq_u32(v_s2_d, data_d, mults_d);
+
+ p += 64;
+ n -= 64;
+ } while (n >= 64);
+
+ v_s1 = vaddq_u32(vaddq_u32(v_s1_a, v_s1_b),
+ vaddq_u32(v_s1_c, v_s1_d));
+ v_s2 = vaddq_u32(vaddq_u32(v_s2_a, v_s2_b),
+ vaddq_u32(v_s2_c, v_s2_d));
+ v_s1_sums = vaddq_u32(vaddq_u32(v_s1_sums_a,
+ v_s1_sums_b),
+ vaddq_u32(v_s1_sums_c,
+ v_s1_sums_d));
+ v_s2 = vaddq_u32(v_s2, vqshlq_n_u32(v_s1_sums, 6));
+
+ s1 += vaddvq_u32(v_s1);
+ s2 += vaddvq_u32(v_s2);
+ }
+ /*
+ * Process the last 0 <= n < 64 bytes of the chunk using
+ * scalar instructions and reduce s1 and s2 mod DIVISOR.
+ */
+ ADLER32_CHUNK(s1, s2, p, n);
+ }
+ return (s2 << 16) | s1;
+ }
+ #undef ATTRIBUTES
+ #endif /* NEON+dotprod implementation */
+
+ #if defined(adler32_arm_neon_dotprod) && defined(__ARM_FEATURE_DOTPROD)
+ #define DEFAULT_IMPL adler32_arm_neon_dotprod
+ #else
+ static inline adler32_func_t
+ arch_select_adler32_func(void)
+ {
+ const u32 features MAYBE_UNUSED = get_arm_cpu_features();
+
+ #ifdef adler32_arm_neon_dotprod
+ if (HAVE_NEON(features) && HAVE_DOTPROD(features))
+ return adler32_arm_neon_dotprod;
+ #endif
+ #ifdef adler32_arm_neon
+ if (HAVE_NEON(features))
+ return adler32_arm_neon;
+ #endif
+ return NULL;
+ }
+ #define arch_select_adler32_func arch_select_adler32_func
+ #endif
+
+ #endif /* LIB_ARM_ADLER32_IMPL_H */
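
Both NEON paths above rely on the same algebraic identity: after a 64-byte block, s1 grows by the plain sum of the bytes, while s2 grows by 64 times the s1 value at the start of the block plus the weighted sum 64*b[0] + 63*b[1] + ... + 1*b[63]. That is what the descending mults tables, the `s2 += s1 * (n & ~63)` step, and the shift by 6 (multiply by 64) encode. A small portable check of that identity, written as a standalone sketch that is not part of the packaged sources, might look like this:

```c
/* Hedged sketch: verifies the 64-byte block identity used by the NEON code
 * above against the byte-at-a-time Adler-32 definition. The variable names
 * are illustrative only. */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int main(void)
{
	uint8_t block[64];
	uint32_t s1 = 123, s2 = 45678;          /* arbitrary running state */
	uint32_t r1 = s1, r2 = s2;              /* reference: byte-at-a-time */
	uint32_t byte_sum = 0, weighted_sum = 0;
	int i;

	for (i = 0; i < 64; i++)
		block[i] = (uint8_t)rand();

	for (i = 0; i < 64; i++) {              /* definition: s1 += b; s2 += s1 */
		r1 += block[i];
		r2 += r1;
	}

	for (i = 0; i < 64; i++) {              /* block form used by the SIMD code */
		byte_sum += block[i];
		weighted_sum += (64 - i) * block[i];
	}
	s2 += 64 * s1 + weighted_sum;           /* the s1*(n & ~63) plus mults step */
	s1 += byte_sum;

	assert(s1 == r1 && s2 == r2);           /* both routes agree */
	return 0;
}
```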