deflate-ruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +138 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +117 -0
  5. data/ext/deflate_ruby/deflate_ruby.c +301 -0
  6. data/ext/deflate_ruby/extconf.rb +34 -0
  7. data/ext/deflate_ruby/libdeflate/CMakeLists.txt +270 -0
  8. data/ext/deflate_ruby/libdeflate/COPYING +22 -0
  9. data/ext/deflate_ruby/libdeflate/NEWS.md +494 -0
  10. data/ext/deflate_ruby/libdeflate/README.md +228 -0
  11. data/ext/deflate_ruby/libdeflate/common_defs.h +747 -0
  12. data/ext/deflate_ruby/libdeflate/lib/adler32.c +162 -0
  13. data/ext/deflate_ruby/libdeflate/lib/arm/adler32_impl.h +358 -0
  14. data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.c +230 -0
  15. data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.h +214 -0
  16. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_impl.h +600 -0
  17. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_helpers.h +156 -0
  18. data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_wide.h +226 -0
  19. data/ext/deflate_ruby/libdeflate/lib/arm/matchfinder_impl.h +78 -0
  20. data/ext/deflate_ruby/libdeflate/lib/bt_matchfinder.h +342 -0
  21. data/ext/deflate_ruby/libdeflate/lib/cpu_features_common.h +93 -0
  22. data/ext/deflate_ruby/libdeflate/lib/crc32.c +262 -0
  23. data/ext/deflate_ruby/libdeflate/lib/crc32_multipliers.h +377 -0
  24. data/ext/deflate_ruby/libdeflate/lib/crc32_tables.h +587 -0
  25. data/ext/deflate_ruby/libdeflate/lib/decompress_template.h +777 -0
  26. data/ext/deflate_ruby/libdeflate/lib/deflate_compress.c +4129 -0
  27. data/ext/deflate_ruby/libdeflate/lib/deflate_compress.h +15 -0
  28. data/ext/deflate_ruby/libdeflate/lib/deflate_constants.h +56 -0
  29. data/ext/deflate_ruby/libdeflate/lib/deflate_decompress.c +1208 -0
  30. data/ext/deflate_ruby/libdeflate/lib/gzip_compress.c +90 -0
  31. data/ext/deflate_ruby/libdeflate/lib/gzip_constants.h +45 -0
  32. data/ext/deflate_ruby/libdeflate/lib/gzip_decompress.c +144 -0
  33. data/ext/deflate_ruby/libdeflate/lib/hc_matchfinder.h +401 -0
  34. data/ext/deflate_ruby/libdeflate/lib/ht_matchfinder.h +234 -0
  35. data/ext/deflate_ruby/libdeflate/lib/lib_common.h +106 -0
  36. data/ext/deflate_ruby/libdeflate/lib/matchfinder_common.h +224 -0
  37. data/ext/deflate_ruby/libdeflate/lib/riscv/matchfinder_impl.h +97 -0
  38. data/ext/deflate_ruby/libdeflate/lib/utils.c +141 -0
  39. data/ext/deflate_ruby/libdeflate/lib/x86/adler32_impl.h +134 -0
  40. data/ext/deflate_ruby/libdeflate/lib/x86/adler32_template.h +518 -0
  41. data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.c +183 -0
  42. data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.h +169 -0
  43. data/ext/deflate_ruby/libdeflate/lib/x86/crc32_impl.h +160 -0
  44. data/ext/deflate_ruby/libdeflate/lib/x86/crc32_pclmul_template.h +495 -0
  45. data/ext/deflate_ruby/libdeflate/lib/x86/decompress_impl.h +57 -0
  46. data/ext/deflate_ruby/libdeflate/lib/x86/matchfinder_impl.h +122 -0
  47. data/ext/deflate_ruby/libdeflate/lib/zlib_compress.c +82 -0
  48. data/ext/deflate_ruby/libdeflate/lib/zlib_constants.h +21 -0
  49. data/ext/deflate_ruby/libdeflate/lib/zlib_decompress.c +104 -0
  50. data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +3 -0
  51. data/ext/deflate_ruby/libdeflate/libdeflate.h +411 -0
  52. data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +18 -0
  53. data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +105 -0
  54. data/ext/deflate_ruby/libdeflate/programs/benchmark.c +696 -0
  55. data/ext/deflate_ruby/libdeflate/programs/checksum.c +218 -0
  56. data/ext/deflate_ruby/libdeflate/programs/config.h.in +19 -0
  57. data/ext/deflate_ruby/libdeflate/programs/gzip.c +688 -0
  58. data/ext/deflate_ruby/libdeflate/programs/prog_util.c +521 -0
  59. data/ext/deflate_ruby/libdeflate/programs/prog_util.h +225 -0
  60. data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +200 -0
  61. data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +155 -0
  62. data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +385 -0
  63. data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +130 -0
  64. data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +72 -0
  65. data/ext/deflate_ruby/libdeflate/programs/test_overread.c +95 -0
  66. data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +472 -0
  67. data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +151 -0
  68. data/ext/deflate_ruby/libdeflate/programs/test_util.c +237 -0
  69. data/ext/deflate_ruby/libdeflate/programs/test_util.h +61 -0
  70. data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +118 -0
  71. data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +118 -0
  72. data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +69 -0
  73. data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +10 -0
  74. data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +10 -0
  75. data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +253 -0
  76. data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +17 -0
  77. data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +119 -0
  78. data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +38 -0
  79. data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +37 -0
  80. data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +19 -0
  81. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +199 -0
  82. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +105 -0
  83. data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +44 -0
  84. data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +29 -0
  85. data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +523 -0
  86. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
  87. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +95 -0
  88. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +3 -0
  89. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +62 -0
  90. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +108 -0
  91. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
  92. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +19 -0
  93. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +3 -0
  94. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +19 -0
  95. data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +416 -0
  96. data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +8 -0
  97. data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +8 -0
  98. data/lib/deflate_ruby/version.rb +5 -0
  99. data/lib/deflate_ruby.rb +71 -0
  100. metadata +191 -0
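
The gem packages a Ruby C extension (data/ext/deflate_ruby/deflate_ruby.c, built by extconf.rb) that links against the vendored libdeflate sources listed above, plus a thin Ruby layer in data/lib/deflate_ruby.rb. The actual binding code is not reproduced in this excerpt, so the following is only an illustrative sketch of what such a binding typically looks like: the module and method names are assumptions, while the libdeflate_* calls are the public API declared in libdeflate.h.

/*
 * Hypothetical sketch of a libdeflate-backed Ruby extension entry point.
 * Module/method names are illustrative; only the libdeflate calls are real.
 */
#include <ruby.h>
#include "libdeflate.h"

static VALUE
rb_deflate_gzip_compress(VALUE self, VALUE input)
{
	struct libdeflate_compressor *c;
	size_t bound, out_len;
	VALUE out;

	StringValue(input);
	c = libdeflate_alloc_compressor(6);	/* default compression level */
	if (c == NULL)
		rb_raise(rb_eNoMemError, "libdeflate_alloc_compressor failed");

	/* Worst-case output size for this input length. */
	bound = libdeflate_gzip_compress_bound(c, RSTRING_LEN(input));
	out = rb_str_new(NULL, (long)bound);
	out_len = libdeflate_gzip_compress(c, RSTRING_PTR(input),
					   RSTRING_LEN(input),
					   RSTRING_PTR(out), bound);
	libdeflate_free_compressor(c);
	if (out_len == 0)
		rb_raise(rb_eRuntimeError, "gzip compression failed");
	rb_str_set_len(out, (long)out_len);
	return out;
}

void
Init_deflate_ruby(void)
{
	VALUE mod = rb_define_module("DeflateRuby");	/* name assumed */
	rb_define_singleton_method(mod, "gzip_compress",
				   rb_deflate_gzip_compress, 1);
}
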
data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_helpers.h
@@ -0,0 +1,156 @@
+ /*
+ * arm/crc32_pmull_helpers.h - helper functions for CRC-32 folding with PMULL
+ *
+ * Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ /*
+ * This file is a "template" for instantiating helper functions for CRC folding
+ * with pmull instructions. It accepts the following parameters:
+ *
+ * SUFFIX:
+ * Name suffix to append to all instantiated functions.
+ * ATTRIBUTES:
+ * Target function attributes to use.
+ * ENABLE_EOR3:
+ * Use the eor3 instruction (from the sha3 extension).
+ */
+
+ /* Create a vector with 'a' in the first 4 bytes, and the rest zeroed out. */
+ #undef u32_to_bytevec
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(u32_to_bytevec)(u32 a)
+ {
+ return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0));
+ }
+ #define u32_to_bytevec ADD_SUFFIX(u32_to_bytevec)
+
+ /* Load two 64-bit values into a vector. */
+ #undef load_multipliers
+ static forceinline ATTRIBUTES poly64x2_t
+ ADD_SUFFIX(load_multipliers)(const u64 p[2])
+ {
+ return vreinterpretq_p64_u64(vld1q_u64(p));
+ }
+ #define load_multipliers ADD_SUFFIX(load_multipliers)
+
+ /* Do carryless multiplication of the low halves of two vectors. */
+ #undef clmul_low
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(clmul_low)(uint8x16_t a, poly64x2_t b)
+ {
+ return vreinterpretq_u8_p128(
+ compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0),
+ vgetq_lane_p64(b, 0)));
+ }
+ #define clmul_low ADD_SUFFIX(clmul_low)
+
+ /* Do carryless multiplication of the high halves of two vectors. */
+ #undef clmul_high
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(clmul_high)(uint8x16_t a, poly64x2_t b)
+ {
+ #ifdef __clang__
+ /*
+ * Use inline asm to ensure that pmull2 is really used. This works
+ * around clang bug https://github.com/llvm/llvm-project/issues/52868.
+ */
+ uint8x16_t res;
+
+ __asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b));
+ return res;
+ #else
+ return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b));
+ #endif
+ }
+ #define clmul_high ADD_SUFFIX(clmul_high)
+
+ #undef eor3
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c)
+ {
+ #if ENABLE_EOR3
+ return veor3q_u8(a, b, c);
+ #else
+ return veorq_u8(veorq_u8(a, b), c);
+ #endif
+ }
+ #define eor3 ADD_SUFFIX(eor3)
+
+ #undef fold_vec
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers)
+ {
+ uint8x16_t a = clmul_low(src, multipliers);
+ uint8x16_t b = clmul_high(src, multipliers);
+
+ return eor3(a, b, dst);
+ }
+ #define fold_vec ADD_SUFFIX(fold_vec)
+
+ /*
+ * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the
+ * next '1 <= len <= 15' data bytes, rearrange the concatenation of v and the
+ * data into vectors x0 and x1 that contain 'len' bytes and 16 bytes,
+ * respectively. Then fold x0 into x1 and return the result. Assumes that
+ * 'p + len - 16' is in-bounds.
+ */
+ #undef fold_partial_vec
+ static forceinline ATTRIBUTES MAYBE_UNUSED uint8x16_t
+ ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len,
+ poly64x2_t multipliers_1)
+ {
+ /*
+ * vqtbl1q_u8(v, shift_tab[len..len+15]) left shifts v by 16-len bytes.
+ * vqtbl1q_u8(v, shift_tab[len+16..len+31]) right shifts v by len bytes.
+ */
+ static const u8 shift_tab[48] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ };
+ const uint8x16_t lshift = vld1q_u8(&shift_tab[len]);
+ const uint8x16_t rshift = vld1q_u8(&shift_tab[len + 16]);
+ uint8x16_t x0, x1, bsl_mask;
+
+ /* x0 = v left-shifted by '16 - len' bytes */
+ x0 = vqtbl1q_u8(v, lshift);
+
+ /* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */
+ bsl_mask = vreinterpretq_u8_s8(
+ vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7));
+
+ /*
+ * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len'
+ * bytes) followed by the remaining data.
+ */
+ x1 = vbslq_u8(bsl_mask /* 0 bits select from arg3, 1 bits from arg2 */,
+ vld1q_u8(p + len - 16), vqtbl1q_u8(v, rshift));
+
+ return fold_vec(x0, x1, multipliers_1);
+ }
+ #define fold_partial_vec ADD_SUFFIX(fold_partial_vec)
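
As the header comment above says, this file is a template: it expects SUFFIX, ATTRIBUTES, ENABLE_EOR3 and an ADD_SUFFIX() name-pasting helper to be defined by the including file (lib/arm/crc32_impl.h in this package) before inclusion. The snippet below is only an illustrative sketch of that instantiation pattern; the concrete suffix, attribute string, and helper-macro names used by libdeflate may differ.

/* Hypothetical instantiation sketch -- not part of this diff. */
#define CRC32_CONCAT2(a, b)	a##b
#define CRC32_CONCAT(a, b)	CRC32_CONCAT2(a, b)
#define ADD_SUFFIX(name)	CRC32_CONCAT(name, SUFFIX)

#define SUFFIX		_pmull_sha3		/* name suffix (assumed) */
#define ATTRIBUTES	_target_attribute("aes,sha3")	/* target attrs (assumed) */
#define ENABLE_EOR3	1
#include "crc32_pmull_wide.h"	/* instantiates crc32_arm_pmull_sha3() */
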
data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_wide.h
@@ -0,0 +1,226 @@
+ /*
+ * arm/crc32_pmull_wide.h - gzip CRC-32 with PMULL (extra-wide version)
+ *
+ * Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ /*
+ * This file is a "template" for instantiating PMULL-based crc32_arm functions.
+ * The "parameters" are:
+ *
+ * SUFFIX:
+ * Name suffix to append to all instantiated functions.
+ * ATTRIBUTES:
+ * Target function attributes to use.
+ * ENABLE_EOR3:
+ * Use the eor3 instruction (from the sha3 extension).
+ *
+ * This is the extra-wide version; it uses an unusually large stride length of
+ * 12, and it assumes that crc32 instructions are available too. It's intended
+ * for powerful CPUs that support both pmull and crc32 instructions, but where
+ * throughput of pmull and xor (given enough instructions issued in parallel) is
+ * significantly higher than that of crc32, thus making the crc32 instructions
+ * (counterintuitively) not actually the fastest way to compute the CRC-32. The
+ * Apple M1 processor is an example of such a CPU.
+ */
+
+ #include "crc32_pmull_helpers.h"
+
+ static ATTRIBUTES u32
+ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
+ {
+ uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
+
+ if (len < 3 * 192) {
+ static const u64 _aligned_attribute(16) mults[3][2] = {
+ { CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
+ { CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
+ { CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
+ };
+ poly64x2_t multipliers_4, multipliers_2, multipliers_1;
+
+ if (len < 64)
+ goto tail;
+ multipliers_4 = load_multipliers(mults[0]);
+ multipliers_2 = load_multipliers(mults[1]);
+ multipliers_1 = load_multipliers(mults[2]);
+ /*
+ * Short length; don't bother aligning the pointer, and fold
+ * 64 bytes (4 vectors) at a time, at most.
+ */
+ v0 = veorq_u8(vld1q_u8(p + 0), u32_to_bytevec(crc));
+ v1 = vld1q_u8(p + 16);
+ v2 = vld1q_u8(p + 32);
+ v3 = vld1q_u8(p + 48);
+ p += 64;
+ len -= 64;
+ while (len >= 64) {
+ v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_4);
+ v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_4);
+ v2 = fold_vec(v2, vld1q_u8(p + 32), multipliers_4);
+ v3 = fold_vec(v3, vld1q_u8(p + 48), multipliers_4);
+ p += 64;
+ len -= 64;
+ }
+ v0 = fold_vec(v0, v2, multipliers_2);
+ v1 = fold_vec(v1, v3, multipliers_2);
+ if (len >= 32) {
+ v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_2);
+ v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_2);
+ p += 32;
+ len -= 32;
+ }
+ v0 = fold_vec(v0, v1, multipliers_1);
+ } else {
+ static const u64 _aligned_attribute(16) mults[4][2] = {
+ { CRC32_X1567_MODG, CRC32_X1503_MODG }, /* 12 vecs */
+ { CRC32_X799_MODG, CRC32_X735_MODG }, /* 6 vecs */
+ { CRC32_X415_MODG, CRC32_X351_MODG }, /* 3 vecs */
+ { CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
+ };
+ const poly64x2_t multipliers_12 = load_multipliers(mults[0]);
+ const poly64x2_t multipliers_6 = load_multipliers(mults[1]);
+ const poly64x2_t multipliers_3 = load_multipliers(mults[2]);
+ const poly64x2_t multipliers_1 = load_multipliers(mults[3]);
+ const size_t align = -(uintptr_t)p & 15;
+ const uint8x16_t *vp;
+
+ /* Align p to the next 16-byte boundary. */
+ if (align) {
+ if (align & 1)
+ crc = __crc32b(crc, *p++);
+ if (align & 2) {
+ crc = __crc32h(crc, le16_bswap(*(u16 *)p));
+ p += 2;
+ }
+ if (align & 4) {
+ crc = __crc32w(crc, le32_bswap(*(u32 *)p));
+ p += 4;
+ }
+ if (align & 8) {
+ crc = __crc32d(crc, le64_bswap(*(u64 *)p));
+ p += 8;
+ }
+ len -= align;
+ }
+ vp = (const uint8x16_t *)p;
+ v0 = veorq_u8(*vp++, u32_to_bytevec(crc));
+ v1 = *vp++;
+ v2 = *vp++;
+ v3 = *vp++;
+ v4 = *vp++;
+ v5 = *vp++;
+ v6 = *vp++;
+ v7 = *vp++;
+ v8 = *vp++;
+ v9 = *vp++;
+ v10 = *vp++;
+ v11 = *vp++;
+ len -= 192;
+ /* Fold 192 bytes (12 vectors) at a time. */
+ do {
+ v0 = fold_vec(v0, *vp++, multipliers_12);
+ v1 = fold_vec(v1, *vp++, multipliers_12);
+ v2 = fold_vec(v2, *vp++, multipliers_12);
+ v3 = fold_vec(v3, *vp++, multipliers_12);
+ v4 = fold_vec(v4, *vp++, multipliers_12);
+ v5 = fold_vec(v5, *vp++, multipliers_12);
+ v6 = fold_vec(v6, *vp++, multipliers_12);
+ v7 = fold_vec(v7, *vp++, multipliers_12);
+ v8 = fold_vec(v8, *vp++, multipliers_12);
+ v9 = fold_vec(v9, *vp++, multipliers_12);
+ v10 = fold_vec(v10, *vp++, multipliers_12);
+ v11 = fold_vec(v11, *vp++, multipliers_12);
+ len -= 192;
+ } while (len >= 192);
+
+ /*
+ * Fewer than 192 bytes left. Fold v0-v11 down to just v0,
+ * while processing up to 144 more bytes.
+ */
+ v0 = fold_vec(v0, v6, multipliers_6);
+ v1 = fold_vec(v1, v7, multipliers_6);
+ v2 = fold_vec(v2, v8, multipliers_6);
+ v3 = fold_vec(v3, v9, multipliers_6);
+ v4 = fold_vec(v4, v10, multipliers_6);
+ v5 = fold_vec(v5, v11, multipliers_6);
+ if (len >= 96) {
+ v0 = fold_vec(v0, *vp++, multipliers_6);
+ v1 = fold_vec(v1, *vp++, multipliers_6);
+ v2 = fold_vec(v2, *vp++, multipliers_6);
+ v3 = fold_vec(v3, *vp++, multipliers_6);
+ v4 = fold_vec(v4, *vp++, multipliers_6);
+ v5 = fold_vec(v5, *vp++, multipliers_6);
+ len -= 96;
+ }
+ v0 = fold_vec(v0, v3, multipliers_3);
+ v1 = fold_vec(v1, v4, multipliers_3);
+ v2 = fold_vec(v2, v5, multipliers_3);
+ if (len >= 48) {
+ v0 = fold_vec(v0, *vp++, multipliers_3);
+ v1 = fold_vec(v1, *vp++, multipliers_3);
+ v2 = fold_vec(v2, *vp++, multipliers_3);
+ len -= 48;
+ }
+ v0 = fold_vec(v0, v1, multipliers_1);
+ v0 = fold_vec(v0, v2, multipliers_1);
+ p = (const u8 *)vp;
+ }
+ /* Reduce 128 to 32 bits using crc32 instructions. */
+ crc = __crc32d(0, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 0));
+ crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 1));
+ tail:
+ /* Finish up the remainder using crc32 instructions. */
+ if (len & 32) {
+ crc = __crc32d(crc, get_unaligned_le64(p + 0));
+ crc = __crc32d(crc, get_unaligned_le64(p + 8));
+ crc = __crc32d(crc, get_unaligned_le64(p + 16));
+ crc = __crc32d(crc, get_unaligned_le64(p + 24));
+ p += 32;
+ }
+ if (len & 16) {
+ crc = __crc32d(crc, get_unaligned_le64(p + 0));
+ crc = __crc32d(crc, get_unaligned_le64(p + 8));
+ p += 16;
+ }
+ if (len & 8) {
+ crc = __crc32d(crc, get_unaligned_le64(p));
+ p += 8;
+ }
+ if (len & 4) {
+ crc = __crc32w(crc, get_unaligned_le32(p));
+ p += 4;
+ }
+ if (len & 2) {
+ crc = __crc32h(crc, get_unaligned_le16(p));
+ p += 2;
+ }
+ if (len & 1)
+ crc = __crc32b(crc, *p);
+ return crc;
+ }
+
+ #undef SUFFIX
+ #undef ATTRIBUTES
+ #undef ENABLE_EOR3
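
The hunk above computes the gzip/zlib CRC-32 by folding 12 vectors (192 bytes) per iteration with pmull and then reducing the last 128 bits with the crc32 instructions. For orientation only, the standard bit-at-a-time reference for that same checksum (reflected polynomial 0xEDB88320, initial value and final XOR of 0xFFFFFFFF) is sketched below; it is not part of this package, but any accelerated path, including this one, must produce identical results.

/* Minimal reference CRC-32 (gzip flavor); illustrative, not from the diff. */
#include <stddef.h>
#include <stdint.h>

static uint32_t
crc32_reference(uint32_t crc, const unsigned char *p, size_t len)
{
	crc = ~crc;				/* pre-invert */
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)	/* one bit at a time */
			crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
	}
	return ~crc;				/* post-invert */
}
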
data/ext/deflate_ruby/libdeflate/lib/arm/matchfinder_impl.h
@@ -0,0 +1,78 @@
+ /*
+ * arm/matchfinder_impl.h - ARM implementations of matchfinder functions
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ #ifndef LIB_ARM_MATCHFINDER_IMPL_H
+ #define LIB_ARM_MATCHFINDER_IMPL_H
+
+ #include "cpu_features.h"
+
+ #if HAVE_NEON_NATIVE
+ static forceinline void
+ matchfinder_init_neon(mf_pos_t *data, size_t size)
+ {
+ int16x8_t *p = (int16x8_t *)data;
+ int16x8_t v = vdupq_n_s16(MATCHFINDER_INITVAL);
+
+ STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+ STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+ STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+ do {
+ p[0] = v;
+ p[1] = v;
+ p[2] = v;
+ p[3] = v;
+ p += 4;
+ size -= 4 * sizeof(*p);
+ } while (size != 0);
+ }
+ #define matchfinder_init matchfinder_init_neon
+
+ static forceinline void
+ matchfinder_rebase_neon(mf_pos_t *data, size_t size)
+ {
+ int16x8_t *p = (int16x8_t *)data;
+ int16x8_t v = vdupq_n_s16((u16)-MATCHFINDER_WINDOW_SIZE);
+
+ STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+ STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+ STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+ do {
+ p[0] = vqaddq_s16(p[0], v);
+ p[1] = vqaddq_s16(p[1], v);
+ p[2] = vqaddq_s16(p[2], v);
+ p[3] = vqaddq_s16(p[3], v);
+ p += 4;
+ size -= 4 * sizeof(*p);
+ } while (size != 0);
+ }
+ #define matchfinder_rebase matchfinder_rebase_neon
+
+ #endif /* HAVE_NEON_NATIVE */
+
+ #endif /* LIB_ARM_MATCHFINDER_IMPL_H */
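
The NEON routines in this last hunk do two simple bulk operations on the matchfinder's table of 16-bit positions: initialization fills every entry with MATCHFINDER_INITVAL, and rebasing subtracts the window size from every entry with signed 16-bit saturation (vqaddq_s16 of a negative constant). The portable sketch below restates that in plain C purely for illustration; it is not taken from the package, and it assumes MATCHFINDER_INITVAL and MATCHFINDER_WINDOW_SIZE are supplied by the library's lib/matchfinder_common.h.

/* Plain-C sketch of the NEON matchfinder helpers above; illustrative only. */
#include <stddef.h>
#include <stdint.h>

static void
matchfinder_init_generic(int16_t *data, size_t size)
{
	/* Fill the whole table with the "no match yet" sentinel. */
	for (size_t i = 0; i < size / sizeof(data[0]); i++)
		data[i] = MATCHFINDER_INITVAL;	/* from matchfinder_common.h (assumed) */
}

static void
matchfinder_rebase_generic(int16_t *data, size_t size)
{
	/* Shift every stored position back by one window, saturating at INT16_MIN. */
	for (size_t i = 0; i < size / sizeof(data[0]); i++) {
		int32_t v = (int32_t)data[i] - MATCHFINDER_WINDOW_SIZE;
		data[i] = (int16_t)(v < INT16_MIN ? INT16_MIN : v);
	}
}
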