deflate-ruby 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +95 -92
- data/LICENSE.txt +6 -6
- data/README.md +87 -65
- data/Rakefile +23 -0
- data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_impl.h → adler32_impl.h} +8 -7
- data/ext/deflate_ruby/common_defs.h +748 -0
- data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.c → cpu_features.c} +46 -16
- data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.h → cpu_features.h} +2 -1
- data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_impl.h → crc32_impl.h} +22 -23
- data/ext/deflate_ruby/{libdeflate/lib/crc32_multipliers.h → crc32_multipliers.h} +2 -4
- data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_pclmul_template.h → crc32_pclmul_template.h} +23 -94
- data/ext/deflate_ruby/{libdeflate/lib/crc32_tables.h → crc32_tables.h} +1 -1
- data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.c → deflate_compress.c} +59 -60
- data/ext/deflate_ruby/deflate_ruby.c +392 -218
- data/ext/deflate_ruby/deflate_ruby.h +6 -0
- data/ext/deflate_ruby/extconf.rb +35 -25
- data/ext/deflate_ruby/libdeflate/adler32.c +162 -0
- data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/adler32_impl.h +14 -7
- data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/crc32_impl.h +25 -31
- data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_helpers.h +156 -0
- data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_wide.h +226 -0
- data/ext/deflate_ruby/libdeflate/bt_matchfinder.h +342 -0
- data/ext/deflate_ruby/libdeflate/common_defs.h +2 -1
- data/ext/deflate_ruby/libdeflate/cpu_features_common.h +93 -0
- data/ext/deflate_ruby/libdeflate/crc32.c +262 -0
- data/ext/deflate_ruby/libdeflate/crc32_multipliers.h +375 -0
- data/ext/deflate_ruby/libdeflate/crc32_tables.h +587 -0
- data/ext/deflate_ruby/libdeflate/decompress_template.h +777 -0
- data/ext/deflate_ruby/libdeflate/deflate_compress.c +4128 -0
- data/ext/deflate_ruby/libdeflate/deflate_compress.h +15 -0
- data/ext/deflate_ruby/libdeflate/deflate_constants.h +56 -0
- data/ext/deflate_ruby/libdeflate/deflate_decompress.c +1208 -0
- data/ext/deflate_ruby/libdeflate/gzip_compress.c +90 -0
- data/ext/deflate_ruby/libdeflate/gzip_constants.h +45 -0
- data/ext/deflate_ruby/libdeflate/gzip_decompress.c +144 -0
- data/ext/deflate_ruby/libdeflate/hc_matchfinder.h +401 -0
- data/ext/deflate_ruby/libdeflate/ht_matchfinder.h +234 -0
- data/ext/deflate_ruby/libdeflate/lib_common.h +106 -0
- data/ext/deflate_ruby/libdeflate/libdeflate.h +2 -2
- data/ext/deflate_ruby/libdeflate/{lib/matchfinder_common.h → matchfinder_common.h} +3 -3
- data/ext/deflate_ruby/libdeflate/x86/adler32_impl.h +135 -0
- data/ext/deflate_ruby/libdeflate/x86/adler32_template.h +518 -0
- data/ext/deflate_ruby/libdeflate/x86/cpu_features.c +213 -0
- data/ext/deflate_ruby/libdeflate/x86/cpu_features.h +170 -0
- data/ext/deflate_ruby/libdeflate/x86/crc32_impl.h +159 -0
- data/ext/deflate_ruby/libdeflate/x86/crc32_pclmul_template.h +424 -0
- data/ext/deflate_ruby/libdeflate/x86/decompress_impl.h +57 -0
- data/ext/deflate_ruby/libdeflate.h +411 -0
- data/ext/deflate_ruby/matchfinder_common.h +224 -0
- data/ext/deflate_ruby/matchfinder_impl.h +122 -0
- data/ext/deflate_ruby/utils.c +141 -0
- data/ext/deflate_ruby/zlib_compress.c +82 -0
- data/ext/deflate_ruby/zlib_constants.h +21 -0
- data/ext/deflate_ruby/zlib_decompress.c +104 -0
- data/lib/deflate_ruby/version.rb +1 -1
- data/lib/deflate_ruby.rb +1 -63
- data/sig/deflate_ruby.rbs +4 -0
- data/test/test_deflate_ruby.rb +220 -0
- data/test/test_helper.rb +6 -0
- metadata +89 -144
- data/ext/deflate_ruby/libdeflate/CMakeLists.txt +0 -270
- data/ext/deflate_ruby/libdeflate/NEWS.md +0 -494
- data/ext/deflate_ruby/libdeflate/README.md +0 -228
- data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +0 -3
- data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +0 -18
- data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +0 -105
- data/ext/deflate_ruby/libdeflate/programs/benchmark.c +0 -696
- data/ext/deflate_ruby/libdeflate/programs/checksum.c +0 -218
- data/ext/deflate_ruby/libdeflate/programs/config.h.in +0 -19
- data/ext/deflate_ruby/libdeflate/programs/gzip.c +0 -688
- data/ext/deflate_ruby/libdeflate/programs/prog_util.c +0 -521
- data/ext/deflate_ruby/libdeflate/programs/prog_util.h +0 -225
- data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +0 -200
- data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +0 -155
- data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +0 -385
- data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +0 -130
- data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +0 -72
- data/ext/deflate_ruby/libdeflate/programs/test_overread.c +0 -95
- data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +0 -472
- data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +0 -151
- data/ext/deflate_ruby/libdeflate/programs/test_util.c +0 -237
- data/ext/deflate_ruby/libdeflate/programs/test_util.h +0 -61
- data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +0 -118
- data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +0 -118
- data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +0 -69
- data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +0 -10
- data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +0 -10
- data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +0 -253
- data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +0 -17
- data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +0 -119
- data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +0 -38
- data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +0 -37
- data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +0 -19
- data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +0 -199
- data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +0 -105
- data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +0 -44
- data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +0 -29
- data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +0 -523
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +0 -95
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +0 -3
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +0 -62
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +0 -108
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +0 -19
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +0 -3
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +0 -19
- data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +0 -416
- data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +0 -8
- data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +0 -8
- /data/ext/deflate_ruby/{libdeflate/lib/adler32.c → adler32.c} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_template.h → adler32_template.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/bt_matchfinder.h → bt_matchfinder.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/cpu_features_common.h → cpu_features_common.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/crc32.c → crc32.c} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_helpers.h → crc32_pmull_helpers.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_wide.h → crc32_pmull_wide.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/x86/decompress_impl.h → decompress_impl.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/decompress_template.h → decompress_template.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.h → deflate_compress.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/deflate_constants.h → deflate_constants.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/deflate_decompress.c → deflate_decompress.c} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/gzip_compress.c → gzip_compress.c} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/gzip_constants.h → gzip_constants.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/gzip_decompress.c → gzip_decompress.c} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/hc_matchfinder.h → hc_matchfinder.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/ht_matchfinder.h → ht_matchfinder.h} +0 -0
- /data/ext/deflate_ruby/{libdeflate/lib/lib_common.h → lib_common.h} +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.c +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.h +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/matchfinder_impl.h +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/riscv → riscv}/matchfinder_impl.h +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/utils.c → utils.c} +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/x86 → x86}/matchfinder_impl.h +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/zlib_compress.c → zlib_compress.c} +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/zlib_constants.h → zlib_constants.h} +0 -0
- /data/ext/deflate_ruby/libdeflate/{lib/zlib_decompress.c → zlib_decompress.c} +0 -0
data/ext/deflate_ruby/extconf.rb
CHANGED
@@ -1,34 +1,44 @@
 # frozen_string_literal: true
 
 require "mkmf"
+require "rbconfig"
 
-#
+# Makes all symbols private by default to avoid unintended conflict
+# with other gems. To explicitly export symbols you can use RUBY_FUNC_EXPORTED
+# selectively, or entirely remove this flag.
+append_cflags("-fvisibility=hidden")
+
+# Add libdeflate source directory and subdirectories to include path
 $INCFLAGS << " -I$(srcdir)/libdeflate"
-$
-
-
-
-
-
-
-
-
-
-
-
-libdeflate/
-
-
-
-
-arch_dirs.each do |dir|
-  Dir.glob("#{dir}*.c").each do |source|
-    libdeflate_sources << source
-  end
+$INCFLAGS << " -I$(srcdir)/libdeflate/arm"
+$INCFLAGS << " -I$(srcdir)/libdeflate/x86"
+$INCFLAGS << " -I$(srcdir)/libdeflate/riscv"
+
+# Detect CPU architecture
+arch = RbConfig::CONFIG['host_cpu']
+
+# Get base libdeflate C files (not in subdirectories)
+libdeflate_sources = Dir.glob("#{__dir__}/libdeflate/*.c")
+
+# Add architecture-specific files
+if arch =~ /arm|aarch64/
+  libdeflate_sources += Dir.glob("#{__dir__}/libdeflate/arm/*.c")
+elsif arch =~ /x86_64|i686|i386/
+  libdeflate_sources += Dir.glob("#{__dir__}/libdeflate/x86/*.c")
+elsif arch =~ /riscv/
+  libdeflate_sources += Dir.glob("#{__dir__}/libdeflate/riscv/*.c")
 end
 
-#
-$
-
+# Build source file list for mkmf
+$srcs = ["deflate_ruby.c"] + libdeflate_sources.map { |f| File.basename(f) }
+
+# Optimization flags for better performance
+append_cflags("-O3")
+
+# Platform-specific optimizations
+if arch =~ /x86_64|i686|i386/
+  # Enable SSE2 on x86 (generally available on x86_64)
+  have_func("__builtin_cpu_supports")
+end
 
 create_makefile("deflate_ruby/deflate_ruby")
data/ext/deflate_ruby/libdeflate/adler32.c
ADDED
@@ -0,0 +1,162 @@
+/*
+ * adler32.c - Adler-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "lib_common.h"
+
+/* The Adler-32 divisor, or "base", value */
+#define DIVISOR 65521
+
+/*
+ * MAX_CHUNK_LEN is the most bytes that can be processed without the possibility
+ * of s2 overflowing when it is represented as an unsigned 32-bit integer. This
+ * value was computed using the following Python script:
+ *
+ *        divisor = 65521
+ *        count = 0
+ *        s1 = divisor - 1
+ *        s2 = divisor - 1
+ *        while True:
+ *            s1 += 0xFF
+ *            s2 += s1
+ *            if s2 > 0xFFFFFFFF:
+ *                break
+ *            count += 1
+ *        print(count)
+ *
+ * Note that to get the correct worst-case value, we must assume that every byte
+ * has value 0xFF and that s1 and s2 started with the highest possible values
+ * modulo the divisor.
+ */
+#define MAX_CHUNK_LEN 5552
+
+/*
+ * Update the Adler-32 values s1 and s2 using n bytes from p, update p to p + n,
+ * update n to 0, and reduce s1 and s2 mod DIVISOR. It is assumed that neither
+ * s1 nor s2 can overflow before the reduction at the end, i.e. n plus any bytes
+ * already processed after the last reduction must not exceed MAX_CHUNK_LEN.
+ *
+ * This uses only portable C code. This is used as a fallback when a vectorized
+ * implementation of Adler-32 (e.g. AVX2) is unavailable on the platform.
+ *
+ * Some of the vectorized implementations also use this to handle the end of the
+ * data when the data isn't evenly divisible by the length the vectorized code
+ * works on. To avoid compiler errors about target-specific option mismatches
+ * when this is used in that way, this is a macro rather than a function.
+ *
+ * Although this is unvectorized, this does include an optimization where the
+ * main loop processes four bytes at a time using a strategy similar to that
+ * used by vectorized implementations. This provides increased instruction-
+ * level parallelism compared to the traditional 's1 += *p++; s2 += s1;'.
+ */
+#define ADLER32_CHUNK(s1, s2, p, n) \
+do { \
+        if (n >= 4) { \
+                u32 s1_sum = 0; \
+                u32 byte_0_sum = 0; \
+                u32 byte_1_sum = 0; \
+                u32 byte_2_sum = 0; \
+                u32 byte_3_sum = 0; \
+ \
+                do { \
+                        s1_sum += s1; \
+                        s1 += p[0] + p[1] + p[2] + p[3]; \
+                        byte_0_sum += p[0]; \
+                        byte_1_sum += p[1]; \
+                        byte_2_sum += p[2]; \
+                        byte_3_sum += p[3]; \
+                        p += 4; \
+                        n -= 4; \
+                } while (n >= 4); \
+                s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) + \
+                      (2 * byte_2_sum) + byte_3_sum; \
+        } \
+        for (; n; n--, p++) { \
+                s1 += *p; \
+                s2 += s1; \
+        } \
+        s1 %= DIVISOR; \
+        s2 %= DIVISOR; \
+} while (0)
+
+static u32 MAYBE_UNUSED
+adler32_generic(u32 adler, const u8 *p, size_t len)
+{
+        u32 s1 = adler & 0xFFFF;
+        u32 s2 = adler >> 16;
+
+        while (len) {
+                size_t n = MIN(len, MAX_CHUNK_LEN & ~3);
+
+                len -= n;
+                ADLER32_CHUNK(s1, s2, p, n);
+        }
+
+        return (s2 << 16) | s1;
+}
+
+/* Include architecture-specific implementation(s) if available. */
+#undef DEFAULT_IMPL
+#undef arch_select_adler32_func
+typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len);
+#if defined(ARCH_ARM32) || defined(ARCH_ARM64)
+# include "arm/adler32_impl.h"
+#elif defined(ARCH_X86_32) || defined(ARCH_X86_64)
+# include "x86/adler32_impl.h"
+#endif
+
+#ifndef DEFAULT_IMPL
+# define DEFAULT_IMPL adler32_generic
+#endif
+
+#ifdef arch_select_adler32_func
+static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len);
+
+static volatile adler32_func_t adler32_impl = dispatch_adler32;
+
+/* Choose the best implementation at runtime. */
+static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len)
+{
+        adler32_func_t f = arch_select_adler32_func();
+
+        if (f == NULL)
+                f = DEFAULT_IMPL;
+
+        adler32_impl = f;
+        return f(adler, p, len);
+}
+#else
+/* The best implementation is statically known, so call it directly. */
+#define adler32_impl DEFAULT_IMPL
+#endif
+
+LIBDEFLATEAPI u32
+libdeflate_adler32(u32 adler, const void *buffer, size_t len)
+{
+        if (buffer == NULL) /* Return initial value. */
+                return 1;
+        return adler32_impl(adler, buffer, len);
+}
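The four-byte unrolling in ADLER32_CHUNK rests on a small identity: running the serial update 's1 += c; s2 += s1;' over bytes c0..c3 adds exactly 4*s1 + 4*c0 + 3*c1 + 2*c2 + c3 to s2, which is what the deferred 's2 += 4*(s1_sum + byte_0_sum) + 3*byte_1_sum + 2*byte_2_sum + byte_3_sum' accumulates per chunk. A minimal standalone C sketch (an illustration, not part of the gem) that checks the unrolled form against the serial definition:

/* Illustration only: verify the 4-byte unrolled Adler-32 update against the
 * serial definition. Mirrors ADLER32_CHUNK above; not part of deflate-ruby. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DIVISOR 65521
#define MAX_CHUNK_LEN 5552

static uint32_t adler32_serial(uint32_t adler, const uint8_t *p, size_t len)
{
        uint32_t s1 = adler & 0xFFFF, s2 = adler >> 16;

        while (len--) {
                s1 = (s1 + *p++) % DIVISOR;
                s2 = (s2 + s1) % DIVISOR;
        }
        return (s2 << 16) | s1;
}

static uint32_t adler32_unrolled(uint32_t adler, const uint8_t *p, size_t len)
{
        uint32_t s1 = adler & 0xFFFF, s2 = adler >> 16;

        while (len) {
                /* Cap the chunk so the 32-bit sums cannot overflow. */
                size_t n = len < MAX_CHUNK_LEN ? len : (MAX_CHUNK_LEN & ~3);
                uint32_t s1_sum = 0, b0 = 0, b1 = 0, b2 = 0, b3 = 0;

                len -= n;
                for (; n >= 4; p += 4, n -= 4) {
                        s1_sum += s1; /* s1 as it was before these 4 bytes */
                        s1 += p[0] + p[1] + p[2] + p[3];
                        b0 += p[0]; b1 += p[1]; b2 += p[2]; b3 += p[3];
                }
                /* Equivalent to four serial 's2 += s1' steps per iteration. */
                s2 += 4 * (s1_sum + b0) + 3 * b1 + 2 * b2 + b3;
                for (; n; n--, p++) {
                        s1 += *p;
                        s2 += s1;
                }
                s1 %= DIVISOR;
                s2 %= DIVISOR;
        }
        return (s2 << 16) | s1;
}

int main(void)
{
        static uint8_t buf[100000];

        for (size_t i = 0; i < sizeof(buf); i++)
                buf[i] = rand() & 0xFF;
        printf("serial=%08x unrolled=%08x\n",
               (unsigned)adler32_serial(1, buf, sizeof(buf)),
               (unsigned)adler32_unrolled(1, buf, sizeof(buf)));
        return 0;
}

Both functions print the same checksum for any input; the chunk cap mirrors MAX_CHUNK_LEN, which is why neither sum can overflow before the reduction.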
data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/adler32_impl.h
CHANGED
@@ -209,18 +209,25 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)
 #endif /* Regular NEON implementation */
 
 /* NEON+dotprod implementation */
-#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN()
+#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN() && \
+    !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_DOTPROD)
 # define adler32_arm_neon_dotprod adler32_arm_neon_dotprod
 # ifdef __clang__
 # define ATTRIBUTES _target_attribute("dotprod")
 /*
- *
- *
- *
- *
- *
+ * Both gcc and binutils originally considered dotprod to depend on
+ * arch=armv8.2-a or later. This was fixed in gcc 13.2 by commit
+ * 9aac37ab8a7b ("aarch64: Remove architecture dependencies from intrinsics")
+ * and in binutils 2.41 by commit 205e4380c800 ("aarch64: Remove version
+ * dependencies from features"). Unfortunately, always using arch=armv8.2-a
+ * causes build errors with some compiler options because it may reduce the
+ * arch rather than increase it. Therefore we try to omit the arch whenever
+ * possible. If gcc is 14 or later, then both gcc and binutils are probably
+ * fixed, so we omit the arch. We also omit the arch if a feature that
+ * depends on armv8.2-a or later (in gcc 13.1 and earlier) is present.
 */
-# elif GCC_PREREQ(
+# elif GCC_PREREQ(14, 0) || defined(__ARM_FEATURE_JCVT) \
+       || defined(__ARM_FEATURE_DOTPROD)
 # define ATTRIBUTES _target_attribute("+dotprod")
 # else
 # define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod")
data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/crc32_impl.h
CHANGED
@@ -434,13 +434,11 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 		{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
 		{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
 	};
-	static const u64 _aligned_attribute(16)
-	{
-		{ CRC32_BARRETT_CONSTANT_1,
-		{ CRC32_BARRETT_CONSTANT_2,
+	static const u64 _aligned_attribute(16) barrett_consts[3][2] = {
+		{ CRC32_X95_MODG, },
+		{ CRC32_BARRETT_CONSTANT_1, },
+		{ CRC32_BARRETT_CONSTANT_2, },
 	};
-	const uint8x16_t zeroes = vdupq_n_u8(0);
-	const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF));
 	const poly64x2_t multipliers_1 = load_multipliers(mults[0]);
 	uint8x16_t v0, v1, v2, v3;
 
@@ -497,24 +495,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 	if (len)
 		v0 = fold_partial_vec(v0, p, len, multipliers_1);
 
-	/*
-
-
-
-
-
-		      clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1));
-
-	/* Fold 96 => 64 bits. */
-	v0 = veorq_u8(vextq_u8(v0, zeroes, 4),
-		      clmul_low(vandq_u8(v0, mask32),
-				load_multipliers(final_mults[0])));
-
-	/* Reduce 64 => 32 bits using Barrett reduction. */
-	v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1]));
-	v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2]));
-	return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1);
+	/* Reduce to 32 bits, following lib/x86/crc32_pclmul_template.h */
+	v0 = veorq_u8(clmul_low(v0, load_multipliers(barrett_consts[0])),
+		      vextq_u8(v0, vdupq_n_u8(0), 8));
+	v1 = clmul_low(v0, load_multipliers(barrett_consts[1]));
+	v1 = clmul_low(v1, load_multipliers(barrett_consts[2]));
+	v0 = veorq_u8(v0, v1);
+	return vgetq_lane_u32(vreinterpretq_u32_u8(v0), 2);
 }
 #undef SUFFIX
 #undef ATTRIBUTES
@@ -545,19 +532,26 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
  * This like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from
  * the sha3 extension) for even better performance.
  */
-#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN
+#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN && \
+    !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_SHA3)
 # define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3
 # define SUFFIX _pmullx12_crc_eor3
 # ifdef __clang__
 # define ATTRIBUTES _target_attribute("aes,crc,sha3")
 /*
- *
- *
- *
- *
- *
+ * Both gcc and binutils originally considered sha3 to depend on
+ * arch=armv8.2-a or later. This was fixed in gcc 13.2 by commit
+ * 9aac37ab8a7b ("aarch64: Remove architecture dependencies from intrinsics")
+ * and in binutils 2.41 by commit 205e4380c800 ("aarch64: Remove version
+ * dependencies from features"). Unfortunately, always using arch=armv8.2-a
+ * causes build errors with some compiler options because it may reduce the
+ * arch rather than increase it. Therefore we try to omit the arch whenever
+ * possible. If gcc is 14 or later, then both gcc and binutils are probably
+ * fixed, so we omit the arch. We also omit the arch if a feature that
+ * depends on armv8.2-a or later (in gcc 13.1 and earlier) is present.
 */
-# elif GCC_PREREQ(
+# elif GCC_PREREQ(14, 0) || defined(__ARM_FEATURE_JCVT) \
+       || defined(__ARM_FEATURE_DOTPROD)
 # define ATTRIBUTES _target_attribute("+crypto,+crc,+sha3")
 # else
 # define ATTRIBUTES _target_attribute("arch=armv8.2-a+crypto+crc+sha3")
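With barrett_consts in place, the tail of crc32_arm_pmullx4() now folds the final 128-bit remainder straight down to 32 bits, following the same sequence as lib/x86/crc32_pclmul_template.h, instead of the old 128 => 96 => 64-bit chain. For sanity-checking any folded implementation, a bit-at-a-time CRC-32 over the reflected polynomial 0xEDB88320 (the standard gzip/zlib CRC that libdeflate computes) makes a convenient reference; a minimal sketch, not taken from the package:

/* Reference bit-at-a-time CRC-32 (reflected polynomial 0xEDB88320), handy
 * for validating vectorized/folded implementations on random inputs. */
#include <stddef.h>
#include <stdint.h>

static uint32_t crc32_bitwise(uint32_t crc, const uint8_t *p, size_t len)
{
        crc = ~crc;
        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320 : 0);
        }
        return ~crc;
}

Comparing against this on buffer lengths that are not multiples of 16 also exercises the fold_partial_vec() path, not just the full-vector loops.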
data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_helpers.h
ADDED
@@ -0,0 +1,156 @@
+/*
+ * arm/crc32_pmull_helpers.h - helper functions for CRC-32 folding with PMULL
+ *
+ * Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This file is a "template" for instantiating helper functions for CRC folding
+ * with pmull instructions. It accepts the following parameters:
+ *
+ * SUFFIX:
+ *        Name suffix to append to all instantiated functions.
+ * ATTRIBUTES:
+ *        Target function attributes to use.
+ * ENABLE_EOR3:
+ *        Use the eor3 instruction (from the sha3 extension).
+ */
+
+/* Create a vector with 'a' in the first 4 bytes, and the rest zeroed out. */
+#undef u32_to_bytevec
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(u32_to_bytevec)(u32 a)
+{
+        return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0));
+}
+#define u32_to_bytevec ADD_SUFFIX(u32_to_bytevec)
+
+/* Load two 64-bit values into a vector. */
+#undef load_multipliers
+static forceinline ATTRIBUTES poly64x2_t
+ADD_SUFFIX(load_multipliers)(const u64 p[2])
+{
+        return vreinterpretq_p64_u64(vld1q_u64(p));
+}
+#define load_multipliers ADD_SUFFIX(load_multipliers)
+
+/* Do carryless multiplication of the low halves of two vectors. */
+#undef clmul_low
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(clmul_low)(uint8x16_t a, poly64x2_t b)
+{
+        return vreinterpretq_u8_p128(
+                compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0),
+                                 vgetq_lane_p64(b, 0)));
+}
+#define clmul_low ADD_SUFFIX(clmul_low)
+
+/* Do carryless multiplication of the high halves of two vectors. */
+#undef clmul_high
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(clmul_high)(uint8x16_t a, poly64x2_t b)
+{
+#ifdef __clang__
+        /*
+         * Use inline asm to ensure that pmull2 is really used. This works
+         * around clang bug https://github.com/llvm/llvm-project/issues/52868.
+         */
+        uint8x16_t res;
+
+        __asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b));
+        return res;
+#else
+        return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b));
+#endif
+}
+#define clmul_high ADD_SUFFIX(clmul_high)
+
+#undef eor3
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c)
+{
+#if ENABLE_EOR3
+        return veor3q_u8(a, b, c);
+#else
+        return veorq_u8(veorq_u8(a, b), c);
+#endif
+}
+#define eor3 ADD_SUFFIX(eor3)
+
+#undef fold_vec
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers)
+{
+        uint8x16_t a = clmul_low(src, multipliers);
+        uint8x16_t b = clmul_high(src, multipliers);
+
+        return eor3(a, b, dst);
+}
+#define fold_vec ADD_SUFFIX(fold_vec)
+
+/*
+ * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the
+ * next '1 <= len <= 15' data bytes, rearrange the concatenation of v and the
+ * data into vectors x0 and x1 that contain 'len' bytes and 16 bytes,
+ * respectively. Then fold x0 into x1 and return the result. Assumes that
+ * 'p + len - 16' is in-bounds.
+ */
+#undef fold_partial_vec
+static forceinline ATTRIBUTES MAYBE_UNUSED uint8x16_t
+ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len,
+                             poly64x2_t multipliers_1)
+{
+        /*
+         * vqtbl1q_u8(v, shift_tab[len..len+15]) left shifts v by 16-len bytes.
+         * vqtbl1q_u8(v, shift_tab[len+16..len+31]) right shifts v by len bytes.
+         */
+        static const u8 shift_tab[48] = {
+                0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+                0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+                0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+                0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+                0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+        };
+        const uint8x16_t lshift = vld1q_u8(&shift_tab[len]);
+        const uint8x16_t rshift = vld1q_u8(&shift_tab[len + 16]);
+        uint8x16_t x0, x1, bsl_mask;
+
+        /* x0 = v left-shifted by '16 - len' bytes */
+        x0 = vqtbl1q_u8(v, lshift);
+
+        /* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */
+        bsl_mask = vreinterpretq_u8_s8(
+                vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7));
+
+        /*
+         * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len'
+         * bytes) followed by the remaining data.
+         */
+        x1 = vbslq_u8(bsl_mask /* 0 bits select from arg3, 1 bits from arg2 */,
+                      vld1q_u8(p + len - 16), vqtbl1q_u8(v, rshift));
+
+        return fold_vec(x0, x1, multipliers_1);
+}
+#define fold_partial_vec ADD_SUFFIX(fold_partial_vec)