libdeflate 0.1.0

Files changed (89)
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +1 -0
  6. data/.rubocop_todo.yml +9 -0
  7. data/.travis.yml +5 -0
  8. data/Gemfile +4 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +52 -0
  11. data/Rakefile +15 -0
  12. data/bin/console +14 -0
  13. data/bin/setup +8 -0
  14. data/ext/libdeflate/extconf.rb +14 -0
  15. data/ext/libdeflate/libdeflate/.gitignore +19 -0
  16. data/ext/libdeflate/libdeflate/COPYING +21 -0
  17. data/ext/libdeflate/libdeflate/Makefile +231 -0
  18. data/ext/libdeflate/libdeflate/Makefile.msc +64 -0
  19. data/ext/libdeflate/libdeflate/NEWS +57 -0
  20. data/ext/libdeflate/libdeflate/README.md +170 -0
  21. data/ext/libdeflate/libdeflate/common/common_defs.h +351 -0
  22. data/ext/libdeflate/libdeflate/common/compiler_gcc.h +134 -0
  23. data/ext/libdeflate/libdeflate/common/compiler_msc.h +95 -0
  24. data/ext/libdeflate/libdeflate/lib/adler32.c +213 -0
  25. data/ext/libdeflate/libdeflate/lib/adler32_impl.h +281 -0
  26. data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +57 -0
  27. data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +13 -0
  28. data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +357 -0
  29. data/ext/libdeflate/libdeflate/lib/crc32.c +368 -0
  30. data/ext/libdeflate/libdeflate/lib/crc32_impl.h +286 -0
  31. data/ext/libdeflate/libdeflate/lib/crc32_table.h +526 -0
  32. data/ext/libdeflate/libdeflate/lib/decompress_impl.h +404 -0
  33. data/ext/libdeflate/libdeflate/lib/deflate_compress.c +2817 -0
  34. data/ext/libdeflate/libdeflate/lib/deflate_compress.h +14 -0
  35. data/ext/libdeflate/libdeflate/lib/deflate_constants.h +66 -0
  36. data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +889 -0
  37. data/ext/libdeflate/libdeflate/lib/gzip_compress.c +95 -0
  38. data/ext/libdeflate/libdeflate/lib/gzip_constants.h +45 -0
  39. data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +130 -0
  40. data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +405 -0
  41. data/ext/libdeflate/libdeflate/lib/lib_common.h +35 -0
  42. data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +53 -0
  43. data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +205 -0
  44. data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +61 -0
  45. data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +53 -0
  46. data/ext/libdeflate/libdeflate/lib/unaligned.h +202 -0
  47. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +169 -0
  48. data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +48 -0
  49. data/ext/libdeflate/libdeflate/lib/zlib_compress.c +87 -0
  50. data/ext/libdeflate/libdeflate/lib/zlib_constants.h +21 -0
  51. data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +91 -0
  52. data/ext/libdeflate/libdeflate/libdeflate.h +274 -0
  53. data/ext/libdeflate/libdeflate/programs/benchmark.c +558 -0
  54. data/ext/libdeflate/libdeflate/programs/checksum.c +197 -0
  55. data/ext/libdeflate/libdeflate/programs/detect.sh +62 -0
  56. data/ext/libdeflate/libdeflate/programs/gzip.c +603 -0
  57. data/ext/libdeflate/libdeflate/programs/prog_util.c +530 -0
  58. data/ext/libdeflate/libdeflate/programs/prog_util.h +162 -0
  59. data/ext/libdeflate/libdeflate/programs/test_checksums.c +135 -0
  60. data/ext/libdeflate/libdeflate/programs/tgetopt.c +118 -0
  61. data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +12 -0
  62. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +40 -0
  63. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
  64. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +28 -0
  65. data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +3 -0
  66. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +28 -0
  67. data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
  68. data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +14 -0
  69. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +28 -0
  70. data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +3 -0
  71. data/ext/libdeflate/libdeflate/tools/android_build.sh +104 -0
  72. data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +76 -0
  73. data/ext/libdeflate/libdeflate/tools/exec_tests.sh +30 -0
  74. data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +108 -0
  75. data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +100 -0
  76. data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +412 -0
  77. data/ext/libdeflate/libdeflate/tools/make-windows-releases +21 -0
  78. data/ext/libdeflate/libdeflate/tools/mips_build.sh +9 -0
  79. data/ext/libdeflate/libdeflate/tools/msc_test.bat +3 -0
  80. data/ext/libdeflate/libdeflate/tools/pgo_build.sh +23 -0
  81. data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +37 -0
  82. data/ext/libdeflate/libdeflate/tools/run_tests.sh +305 -0
  83. data/ext/libdeflate/libdeflate/tools/windows_build.sh +10 -0
  84. data/ext/libdeflate/libdeflate_ext.c +389 -0
  85. data/ext/libdeflate/libdeflate_ext.h +8 -0
  86. data/lib/libdeflate.rb +2 -0
  87. data/lib/libdeflate/version.rb +3 -0
  88. data/libdeflate.gemspec +33 -0
  89. metadata +230 -0
data/ext/libdeflate/libdeflate/common/compiler_gcc.h
@@ -0,0 +1,134 @@
+ /*
+  * compiler_gcc.h - definitions for the GNU C Compiler. This also handles clang
+  * and the Intel C Compiler (icc).
+  *
+  * TODO: icc is not well tested, so some things are currently disabled even
+  * though they maybe can be enabled on some icc versions.
+  */
+
+ #if !defined(__clang__) && !defined(__INTEL_COMPILER)
+ # define GCC_PREREQ(major, minor) \
+         (__GNUC__ > (major) || \
+          (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+ #else
+ # define GCC_PREREQ(major, minor) 0
+ #endif
+
+ /* Note: only check the clang version when absolutely necessary!
+  * "Vendors" such as Apple can use different version numbers. */
+ #ifdef __clang__
+ # ifdef __apple_build_version__
+ # define CLANG_PREREQ(major, minor, apple_version) \
+         (__apple_build_version__ >= (apple_version))
+ # else
+ # define CLANG_PREREQ(major, minor, apple_version) \
+         (__clang_major__ > (major) || \
+          (__clang_major__ == (major) && __clang_minor__ >= (minor)))
+ # endif
+ #else
+ # define CLANG_PREREQ(major, minor, apple_version) 0
+ #endif
+
+ #ifndef __has_attribute
+ # define __has_attribute(attribute) 0
+ #endif
+ #ifndef __has_feature
+ # define __has_feature(feature) 0
+ #endif
+ #ifndef __has_builtin
+ # define __has_builtin(builtin) 0
+ #endif
+
+ #ifdef _WIN32
+ # define LIBEXPORT __declspec(dllexport)
+ #else
+ # define LIBEXPORT __attribute__((visibility("default")))
+ #endif
+
+ #define inline inline
+ #define forceinline inline __attribute__((always_inline))
+ #define restrict __restrict__
+ #define likely(expr) __builtin_expect(!!(expr), 1)
+ #define unlikely(expr) __builtin_expect(!!(expr), 0)
+ #define prefetchr(addr) __builtin_prefetch((addr), 0)
+ #define prefetchw(addr) __builtin_prefetch((addr), 1)
+ #define _aligned_attribute(n) __attribute__((aligned(n)))
+
+ /*
+  * Support for the following x86 instruction set extensions was introduced by
+  * the following gcc versions:
+  *
+  *    PCLMUL  4.4
+  *    AVX     4.6
+  *    BMI2    4.7
+  *    AVX2    4.7
+  *
+  * With clang, __has_builtin() can be used to detect the presence of one of the
+  * associated builtins.
+  *
+  * Additionally, gcc 4.4 introduced the 'target' function attribute. With
+  * clang, support for this can be detected with __has_attribute(target).
+  *
+  * However, prior to gcc 4.9 and clang 3.8, x86 intrinsics not available in the
+  * main target could not be used in 'target' attribute functions. Unfortunately,
+  * clang has no feature test macro for this, so we have to check its version.
+  */
+ #define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \
+         (GCC_PREREQ(4, 4) || __has_attribute(target))
+ #if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
+ # define COMPILER_SUPPORTS_TARGET_INTRINSICS \
+         (GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000))
+ # define COMPILER_SUPPORTS_PCLMUL_TARGET \
+         (GCC_PREREQ(4, 4) || __has_builtin(__builtin_ia32_pclmulqdq128))
+ # define COMPILER_SUPPORTS_AVX_TARGET \
+         (GCC_PREREQ(4, 6) || __has_builtin(__builtin_ia32_maxps256))
+ # define COMPILER_SUPPORTS_BMI2_TARGET \
+         (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di))
+ # define COMPILER_SUPPORTS_AVX2_TARGET \
+         (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pmaddwd256))
+ #endif
+
+ /* Newer gcc supports __BYTE_ORDER__. Older gcc doesn't. */
+ #ifdef __BYTE_ORDER__
+ # define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+ #endif
+
+ #if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+ # define bswap16 __builtin_bswap16
+ #endif
+
+ #if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
+ # define bswap32 __builtin_bswap32
+ #endif
+
+ #if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+ # define bswap64 __builtin_bswap64
+ #endif
+
+ #if defined(__x86_64__) || defined(__i386__) || defined(__ARM_FEATURE_UNALIGNED)
+ # define UNALIGNED_ACCESS_IS_FAST 1
+ #endif
+
+ /* With gcc, we can access unaligned memory through 'packed' structures. */
+ #define DEFINE_UNALIGNED_TYPE(type) \
+ \
+ struct type##unaligned { \
+         type v; \
+ } __attribute__((packed)); \
+ \
+ static forceinline type \
+ load_##type##_unaligned(const void *p) \
+ { \
+         return ((const struct type##unaligned *)p)->v; \
+ } \
+ \
+ static forceinline void \
+ store_##type##_unaligned(type v, void *p) \
+ { \
+         ((struct type##unaligned *)p)->v = v; \
+ }
+
+ #define bsr32(n) (31 - __builtin_clz(n))
+ #define bsr64(n) (63 - __builtin_clzll(n))
+ #define bsf32(n) __builtin_ctz(n)
+ #define bsf64(n) __builtin_ctzll(n)
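
DEFINE_UNALIGNED_TYPE works by wrapping the value in a packed struct, which tells gcc/clang not to assume natural alignment and therefore to emit a load or store that is safe at any address. As a rough sketch (not part of the gem), DEFINE_UNALIGNED_TYPE(u32) expands to approximately the following, with u32 spelled as uint32_t and forceinline simplified to plain inline so the snippet is self-contained:

#include <stdint.h>

/* Approximate expansion of DEFINE_UNALIGNED_TYPE(u32): the packed attribute
 * makes the compiler treat 'v' as potentially misaligned, so the generated
 * access is valid even when p is not 4-byte aligned. */
struct u32unaligned {
        uint32_t v;
} __attribute__((packed));

static inline uint32_t load_u32_unaligned(const void *p)
{
        return ((const struct u32unaligned *)p)->v;
}

static inline void store_u32_unaligned(uint32_t v, void *p)
{
        ((struct u32unaligned *)p)->v = v;
}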
data/ext/libdeflate/libdeflate/common/compiler_msc.h
@@ -0,0 +1,95 @@
+ /*
+  * compiler_msc.h - definitions for the Microsoft C Compiler
+  */
+
+ #define LIBEXPORT __declspec(dllexport)
+
+ /*
+  * Old versions (e.g. VS2010) of MSC don't have the C99 header stdbool.h.
+  * Beware: the below replacement isn't fully standard, since normally any value
+  * != 0 should be implicitly cast to a bool with value 1... but that doesn't
+  * happen if bool is really just an 'int'.
+  */
+ typedef int bool;
+ #define true 1
+ #define false 0
+ #define __bool_true_false_are_defined 1
+
+ /* Define ssize_t */
+ #ifdef _WIN64
+ typedef long long ssize_t;
+ #else
+ typedef int ssize_t;
+ #endif
+
+ /*
+  * Old versions (e.g. VS2010) of MSC have stdint.h but not the C99 header
+  * inttypes.h. Work around this by defining the PRI* macros ourselves.
+  */
+ #include <stdint.h>
+ #define PRIu8  "hhu"
+ #define PRIu16 "hu"
+ #define PRIu32 "u"
+ #define PRIu64 "llu"
+ #define PRIi8  "hhi"
+ #define PRIi16 "hi"
+ #define PRIi32 "i"
+ #define PRIi64 "lli"
+ #define PRIx8  "hhx"
+ #define PRIx16 "hx"
+ #define PRIx32 "x"
+ #define PRIx64 "llx"
+
+ /* Assume a little endian architecture with fast unaligned access */
+ #define CPU_IS_LITTLE_ENDIAN() 1
+ #define UNALIGNED_ACCESS_IS_FAST 1
+
+ /* __restrict has nonstandard behavior; don't use it */
+ #define restrict
+
+ /* ... but we can use __inline and __forceinline */
+ #define inline __inline
+ #define forceinline __forceinline
+
+ /* Byte swap functions */
+ #define bswap16 _byteswap_ushort
+ #define bswap32 _byteswap_ulong
+ #define bswap64 _byteswap_uint64
+
+ /* Bit scan functions (32-bit) */
+
+ static forceinline unsigned
+ bsr32(uint32_t n)
+ {
+         _BitScanReverse(&n, n);
+         return n;
+ }
+ #define bsr32 bsr32
+
+ static forceinline unsigned
+ bsf32(uint32_t n)
+ {
+         _BitScanForward(&n, n);
+         return n;
+ }
+ #define bsf32 bsf32
+
+ #ifdef _M_X64 /* Bit scan functions (64-bit) */
+
+ static forceinline unsigned
+ bsr64(uint64_t n)
+ {
+         _BitScanReverse64(&n, n);
+         return n;
+ }
+ #define bsr64 bsr64
+
+ static forceinline unsigned
+ bsf64(uint64_t n)
+ {
+         _BitScanForward64(&n, n);
+         return n;
+ }
+ #define bsf64 bsf64
+
+ #endif /* _M_X64 */
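
Both the gcc and the MSC definitions implement the same contract: for a nonzero n, bsr32(n) is the index of the highest set bit and bsf32(n) the index of the lowest set bit. A small illustrative check, written against the gcc builtin forms shown earlier (nothing beyond those two macros is assumed):

#include <assert.h>

/* Same contract as the wrappers above, expressed with the gcc builtins. */
#define bsr32(n) (31 - __builtin_clz(n))
#define bsf32(n) __builtin_ctz(n)

int main(void)
{
        assert(bsr32(1) == 0);            /* only bit 0 is set */
        assert(bsr32(0x80000000u) == 31); /* only bit 31 is set */
        assert(bsr32(12) == 3);           /* 12 = 0b1100: highest set bit is bit 3 */
        assert(bsf32(12) == 2);           /* lowest set bit is bit 2 */
        return 0;
}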
data/ext/libdeflate/libdeflate/lib/adler32.c
@@ -0,0 +1,213 @@
+ /*
+  * adler32.c - Adler-32 checksum algorithm
+  *
+  * Originally public domain; changes after 2016-09-07 are copyrighted.
+  *
+  * Copyright 2016 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+ #include "x86_cpu_features.h"
+
+ #include "libdeflate.h"
+
+ /* The Adler-32 divisor, or "base", value. */
+ #define DIVISOR 65521
+
+ /*
+  * MAX_BYTES_PER_CHUNK is the most bytes that can be processed without the
+  * possibility of s2 overflowing when it is represented as an unsigned 32-bit
+  * integer. This value was computed using the following Python script:
+  *
+  *    divisor = 65521
+  *    count = 0
+  *    s1 = divisor - 1
+  *    s2 = divisor - 1
+  *    while True:
+  *        s1 += 0xFF
+  *        s2 += s1
+  *        if s2 > 0xFFFFFFFF:
+  *            break
+  *        count += 1
+  *    print(count)
+  *
+  * Note that to get the correct worst-case value, we must assume that every byte
+  * has value 0xFF and that s1 and s2 started with the highest possible values
+  * modulo the divisor.
+  */
+ #define MAX_BYTES_PER_CHUNK 5552
+
+ /* Select the implementations to compile in. */
+
+ #define NEED_GENERIC_IMPL 1 /* include generic impl unless overridden */
+
+ /* Include the SSE2 implementation? */
+ #define NEED_SSE2_IMPL 0
+ #ifdef __SSE2__
+ # include <emmintrin.h>
+ # undef NEED_SSE2_IMPL
+ # define NEED_SSE2_IMPL 1
+ # undef NEED_GENERIC_IMPL
+ # define NEED_GENERIC_IMPL 0 /* generic impl not needed */
+ #endif
+
+ /* Include the AVX2 implementation? */
+ #define NEED_AVX2_IMPL 0
+ #if defined(__AVX2__) || \
+         (X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX2_TARGET && \
+          COMPILER_SUPPORTS_TARGET_INTRINSICS)
+ # include <immintrin.h>
+ # undef NEED_AVX2_IMPL
+ # define NEED_AVX2_IMPL 1
+ # ifdef __AVX2__ /* compiling for AVX2, i.e. can we assume it's there? */
+ # undef NEED_GENERIC_IMPL
+ # define NEED_GENERIC_IMPL 0 /* generic impl not needed */
+ # undef NEED_SSE2_IMPL
+ # define NEED_SSE2_IMPL 0 /* SSE2 impl not needed */
+ # endif /* otherwise, we can build an AVX2 version, but we won't know whether
+           we can use it until runtime */
+ #endif
+
+ /* Include the NEON implementation? */
+ #define NEED_NEON_IMPL 0
+ #ifdef __ARM_NEON
+ # include <arm_neon.h>
+ # undef NEED_NEON_IMPL
+ # define NEED_NEON_IMPL 1
+ # undef NEED_GENERIC_IMPL
+ # define NEED_GENERIC_IMPL 0 /* generic impl not needed */
+ #endif
+
+ #define NUM_IMPLS (NEED_GENERIC_IMPL + NEED_SSE2_IMPL + NEED_AVX2_IMPL + \
+                    NEED_NEON_IMPL)
+
+ /* Define the generic implementation if needed. */
+ #if NEED_GENERIC_IMPL
+ static u32 adler32_generic(u32 adler, const void *buffer, size_t size)
+ {
+         u32 s1 = adler & 0xFFFF;
+         u32 s2 = adler >> 16;
+         const u8 *p = buffer;
+         const u8 * const end = p + size;
+
+         while (p != end) {
+                 size_t chunk_size = MIN(end - p, MAX_BYTES_PER_CHUNK);
+                 const u8 *chunk_end = p + chunk_size;
+                 size_t num_unrolled_iterations = chunk_size / 4;
+
+                 while (num_unrolled_iterations--) {
+                         s1 += *p++;
+                         s2 += s1;
+                         s1 += *p++;
+                         s2 += s1;
+                         s1 += *p++;
+                         s2 += s1;
+                         s1 += *p++;
+                         s2 += s1;
+                 }
+                 while (p != chunk_end) {
+                         s1 += *p++;
+                         s2 += s1;
+                 }
+                 s1 %= DIVISOR;
+                 s2 %= DIVISOR;
+         }
+
+         return (s2 << 16) | s1;
+ }
+ #define DEFAULT_IMPL adler32_generic
+ #endif /* NEED_GENERIC_IMPL */
+
+ #define TARGET_SSE2 100
+ #define TARGET_AVX2 200
+ #define TARGET_NEON 300
+
+ /* Define the SSE2 implementation if needed. */
+ #if NEED_SSE2_IMPL
+ # define FUNCNAME adler32_sse2
+ # define TARGET TARGET_SSE2
+ # define ALIGNMENT_REQUIRED 16
+ # define BYTES_PER_ITERATION 32
+ # define ATTRIBUTES
+ # define DEFAULT_IMPL adler32_sse2
+ # include "adler32_impl.h"
+ #endif
+
+ /* Define the AVX2 implementation if needed. */
+ #if NEED_AVX2_IMPL
+ # define FUNCNAME adler32_avx2
+ # define TARGET TARGET_AVX2
+ # define ALIGNMENT_REQUIRED 32
+ # define BYTES_PER_ITERATION 32
+ # ifdef __AVX2__
+ # define ATTRIBUTES
+ # define DEFAULT_IMPL adler32_avx2
+ # else
+ # define ATTRIBUTES __attribute__((target("avx2")))
+ # endif
+ # include "adler32_impl.h"
+ #endif
+
+ /* Define the NEON implementation if needed. */
+ #if NEED_NEON_IMPL
+ # define FUNCNAME adler32_neon
+ # define TARGET TARGET_NEON
+ # define ALIGNMENT_REQUIRED 16
+ # define BYTES_PER_ITERATION 32
+ # define ATTRIBUTES
+ # define DEFAULT_IMPL adler32_neon
+ # include "adler32_impl.h"
+ #endif
+
+ typedef u32 (*adler32_func_t)(u32, const void *, size_t);
+
+ /*
+  * If multiple implementations are available, then dispatch among them based on
+  * CPU features at runtime. Otherwise just call the single one directly.
+  */
+ #if NUM_IMPLS == 1
+ # define adler32_impl DEFAULT_IMPL
+ #else
+ static u32 dispatch(u32, const void *, size_t);
+
+ static adler32_func_t adler32_impl = dispatch;
+
+ static u32 dispatch(u32 adler, const void *buffer, size_t size)
+ {
+         adler32_func_t f = DEFAULT_IMPL;
+ #if NEED_AVX2_IMPL && !defined(__AVX2__)
+         if (x86_have_cpu_features(X86_CPU_FEATURE_AVX2))
+                 f = adler32_avx2;
+ #endif
+         adler32_impl = f;
+         return adler32_impl(adler, buffer, size);
+ }
+ #endif /* NUM_IMPLS != 1 */
+
+ LIBDEFLATEAPI u32
+ libdeflate_adler32(u32 adler, const void *buffer, size_t size)
+ {
+         if (buffer == NULL) /* return initial value */
+                 return 1;
+         return adler32_impl(adler, buffer, size);
+ }
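
Note the dispatch pattern: adler32_impl starts out pointing at dispatch(), which on its first call probes the CPU features, stores the chosen implementation back into the pointer, and forwards the call, so subsequent calls bypass the feature check entirely. From the caller's side only the libdeflate_adler32() entry point above matters; a minimal usage sketch (buffer contents arbitrary, error handling omitted):

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "libdeflate.h"

int main(void)
{
        const char *msg = "hello world";
        /* Passing a NULL buffer returns the required initial value, 1. */
        uint32_t adler = libdeflate_adler32(0, NULL, 0);

        /* The checksum can be updated incrementally by feeding the running
         * value back in. */
        adler = libdeflate_adler32(adler, msg, 5);
        adler = libdeflate_adler32(adler, msg + 5, strlen(msg) - 5);
        printf("adler32(\"%s\") = 0x%08x\n", msg, (unsigned)adler);
        return 0;
}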
data/ext/libdeflate/libdeflate/lib/adler32_impl.h
@@ -0,0 +1,281 @@
+ /*
+  * adler32_impl.h
+  *
+  * Originally public domain; changes after 2016-09-07 are copyrighted.
+  *
+  * Copyright 2016 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+ /*
+  * This file contains a template for vectorized Adler-32 implementations.
+  *
+  * The inner loop between reductions modulo 65521 of an unvectorized Adler-32
+  * implementation looks something like this:
+  *
+  *    do {
+  *        s1 += *p;
+  *        s2 += s1;
+  *    } while (++p != chunk_end);
+  *
+  * For vectorized calculation of s1, we only need to sum the input bytes. They
+  * can be accumulated into multiple counters which are eventually summed
+  * together.
+  *
+  * For vectorized calculation of s2, the basic idea is that for each iteration
+  * that processes N bytes, we can perform the following vectorizable
+  * calculation:
+  *
+  *    s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
+  *
+  * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
+  * separate counters, then do the multiplications by N...1 just once at the end
+  * rather than once per iteration.
+  *
+  * Also, we must account for how previous bytes will affect s2 by doing the
+  * following at the beginning of each iteration:
+  *
+  *    s2 += s1 * N
+  *
+  * Furthermore, like s1, "s2" can actually be multiple counters which are
+  * eventually summed together.
+  */
+
+ static u32 ATTRIBUTES
+ FUNCNAME(u32 adler, const void *buffer, size_t size)
+ {
+         u32 s1 = adler & 0xFFFF;
+         u32 s2 = adler >> 16;
+         const u8 *p = buffer;
+         const u8 * const end = p + size;
+         const u8 *vend;
+
+         /* Process a byte at a time until the required alignment is reached. */
+         if (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED) {
+                 do {
+                         s1 += *p++;
+                         s2 += s1;
+                 } while (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED);
+                 s1 %= DIVISOR;
+                 s2 %= DIVISOR;
+         }
+
+         /*
+          * Process "chunks" of bytes using vector instructions. Chunk sizes are
+          * limited to MAX_BYTES_PER_CHUNK, which guarantees that s1 and s2 never
+          * overflow before being reduced modulo DIVISOR. For vector processing,
+          * chunk sizes are also made evenly divisible by BYTES_PER_ITERATION.
+          */
+         STATIC_ASSERT(BYTES_PER_ITERATION % ALIGNMENT_REQUIRED == 0);
+         vend = end - ((size_t)(end - p) % BYTES_PER_ITERATION);
+         while (p != vend) {
+                 size_t chunk_size;
+                 const u8 *chunk_end;
+
+                 chunk_size = MIN((size_t)(vend - p), MAX_BYTES_PER_CHUNK);
+ #if TARGET == TARGET_SSE2
+                 /* SSE2: the 16-bit precision byte counters must not undergo
+                  * *signed* overflow, otherwise the signed multiplication at the
+                  * end will not behave as desired. */
+                 chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0x7FFF / 0xFF));
+ #elif TARGET == TARGET_NEON
+                 /* NEON: the 16-bit precision counters must not undergo
+                  * *unsigned* overflow. */
+                 chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0xFFFF / 0xFF));
+ #endif
+                 chunk_size -= chunk_size % BYTES_PER_ITERATION;
+
+                 chunk_end = p + chunk_size;
+
+                 s2 += s1 * chunk_size;
+                 {
+ #if TARGET == TARGET_AVX2
+                         /* AVX2 implementation */
+                         const __m256i zeroes = _mm256_setzero_si256();
+                         const __v32qi multipliers = (__v32qi) { 32, 31, 30, 29, 28, 27, 26, 25,
+                                                                 24, 23, 22, 21, 20, 19, 18, 17,
+                                                                 16, 15, 14, 13, 12, 11, 10, 9,
+                                                                 8, 7, 6, 5, 4, 3, 2, 1 };
+                         const __v16hi ones = (__v16hi)_mm256_set1_epi16(1);
+                         __v8si v_s1 = (__v8si)zeroes;
+                         __v8si v_s1_sums = (__v8si)zeroes;
+                         __v8si v_s2 = (__v8si)zeroes;
+                         STATIC_ASSERT(ALIGNMENT_REQUIRED == 32 && BYTES_PER_ITERATION == 32);
+                         do {
+                                 __m256i bytes = *(const __m256i *)p;
+                                 __v16hi sums = (__v16hi)_mm256_maddubs_epi16(
+                                                         bytes, (__m256i)multipliers);
+                                 v_s1_sums += v_s1;
+                                 v_s1 += (__v8si)_mm256_sad_epu8(bytes, zeroes);
+                                 v_s2 += (__v8si)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
+                         } while ((p += BYTES_PER_ITERATION) != chunk_end);
+
+                         v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
+                         v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
+                         s1 += v_s1[0] + v_s1[4];
+
+                         v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
+                         v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
+                         v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
+                         s2 += v_s2[0] + v_s2[4];
+
+ #elif TARGET == TARGET_SSE2
+                         /* SSE2 implementation */
+                         const __m128i zeroes = _mm_setzero_si128();
+
+                         /* s1 counters: 32-bit, sum of bytes */
+                         __v4si v_s1 = (__v4si)zeroes;
+
+                         /* s2 counters: 32-bit, sum of s1 values */
+                         __v4si v_s2 = (__v4si)zeroes;
+
+                         /*
+                          * Thirty-two 16-bit counters for byte sums. Each accumulates
+                          * the bytes that eventually need to be multiplied by a number
+                          * 32...1 for addition into s2.
+                          */
+                         __v8hi v_byte_sums_a = (__v8hi)zeroes;
+                         __v8hi v_byte_sums_b = (__v8hi)zeroes;
+                         __v8hi v_byte_sums_c = (__v8hi)zeroes;
+                         __v8hi v_byte_sums_d = (__v8hi)zeroes;
+
+                         STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
+                         do {
+                                 /* Load the next 32 bytes. */
+                                 const __m128i bytes1 = *(const __m128i *)p;
+                                 const __m128i bytes2 = *(const __m128i *)(p + 16);
+
+                                 /*
+                                  * Accumulate the previous s1 counters into the s2
+                                  * counters. Logically, this really should be
+                                  * v_s2 += v_s1 * BYTES_PER_ITERATION, but we can do the
+                                  * multiplication (or left shift) later.
+                                  */
+                                 v_s2 += v_s1;
+
+                                 /*
+                                  * s1 update: use "Packed Sum of Absolute Differences"
+                                  * to add the bytes horizontally with 8 bytes per sum.
+                                  * Then add the sums to the s1 counters.
+                                  */
+                                 v_s1 += (__v4si)_mm_sad_epu8(bytes1, zeroes);
+                                 v_s1 += (__v4si)_mm_sad_epu8(bytes2, zeroes);
+
+                                 /*
+                                  * Also accumulate the bytes into 32 separate counters
+                                  * that have 16-bit precision.
+                                  */
+                                 v_byte_sums_a += (__v8hi)_mm_unpacklo_epi8(bytes1, zeroes);
+                                 v_byte_sums_b += (__v8hi)_mm_unpackhi_epi8(bytes1, zeroes);
+                                 v_byte_sums_c += (__v8hi)_mm_unpacklo_epi8(bytes2, zeroes);
+                                 v_byte_sums_d += (__v8hi)_mm_unpackhi_epi8(bytes2, zeroes);
+
+                         } while ((p += BYTES_PER_ITERATION) != chunk_end);
+
+                         /* Finish calculating the s2 counters. */
+                         v_s2 = (__v4si)_mm_slli_epi32((__m128i)v_s2, 5);
+                         v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_a,
+                                         (__m128i)(__v8hi){ 32, 31, 30, 29, 28, 27, 26, 25 });
+                         v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_b,
+                                         (__m128i)(__v8hi){ 24, 23, 22, 21, 20, 19, 18, 17 });
+                         v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_c,
+                                         (__m128i)(__v8hi){ 16, 15, 14, 13, 12, 11, 10, 9 });
+                         v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_d,
+                                         (__m128i)(__v8hi){ 8, 7, 6, 5, 4, 3, 2, 1 });
+
+                         /* Now accumulate what we computed into the real s1 and s2. */
+                         v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x31);
+                         v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x02);
+                         s1 += _mm_cvtsi128_si32((__m128i)v_s1);
+
+                         v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x31);
+                         v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x02);
+                         s2 += _mm_cvtsi128_si32((__m128i)v_s2);
+
+ #elif TARGET == TARGET_NEON
+                         /* ARM NEON (Advanced SIMD) implementation */
+                         uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
+                         uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 };
+                         uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+                         uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+                         uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+                         uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+                         STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
+                         do {
+                                 const uint8x16_t bytes1 = *(const uint8x16_t *)p;
+                                 const uint8x16_t bytes2 = *(const uint8x16_t *)(p + 16);
+                                 uint16x8_t tmp;
+
+                                 v_s2 += v_s1;
+
+                                 tmp = vpaddlq_u8(bytes1);
+                                 tmp = vpadalq_u8(tmp, bytes2);
+                                 v_s1 = vpadalq_u16(v_s1, tmp);
+
+                                 v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1));
+                                 v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1));
+                                 v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2));
+                                 v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2));
+
+                         } while ((p += BYTES_PER_ITERATION) != chunk_end);
+
+                         v_s2 = vqshlq_n_u32(v_s2, 5);
+                         v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), (uint16x4_t) { 32, 31, 30, 29 });
+                         v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 });
+                         v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), (uint16x4_t) { 24, 23, 22, 21 });
+                         v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 });
+                         v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), (uint16x4_t) { 16, 15, 14, 13 });
+                         v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10, 9 });
+                         v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d), (uint16x4_t) { 8, 7, 6, 5 });
+                         v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) { 4, 3, 2, 1 });
+
+                         s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
+                         s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
+ #else
+ # error "BUG: unknown target"
+ #endif
+                 }
+
+                 s1 %= DIVISOR;
+                 s2 %= DIVISOR;
+         }
+
+         /* Process any remaining bytes. */
+         if (p != end) {
+                 do {
+                         s1 += *p++;
+                         s2 += s1;
+                 } while (p != end);
+                 s1 %= DIVISOR;
+                 s2 %= DIVISOR;
+         }
+
+         return (s2 << 16) | s1;
+ }
+
+ #undef FUNCNAME
+ #undef TARGET
+ #undef ALIGNMENT_REQUIRED
+ #undef BYTES_PER_ITERATION
+ #undef ATTRIBUTES
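
The template above relies on the identity described in its header comment: processing N bytes at once contributes s1*N plus a weighted byte sum to s2. A scalar model of one chunk (N, the byte values, and the helper name here are illustrative, not taken from the library) shows that the rearrangement is exact:

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

/* Scalar model of one vector iteration over n bytes, as described above:
 * s2 += s1*n + n*b[0] + (n-1)*b[1] + ... + 1*b[n-1], which equals the
 * result of the plain byte-at-a-time loop over the same bytes. */
static void one_chunk(uint32_t *s1, uint32_t *s2, const uint8_t *b, size_t n)
{
        uint32_t byte_total = 0, weighted = 0;
        for (size_t i = 0; i < n; i++) {
                byte_total += b[i];
                weighted   += (uint32_t)(n - i) * b[i];
        }
        *s2 += *s1 * (uint32_t)n + weighted;
        *s1 += byte_total;
}

int main(void)
{
        const uint8_t b[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
        uint32_t s1a = 1, s2a = 0, s1b = 1, s2b = 0;

        /* Reference: the unvectorized inner loop. */
        for (size_t i = 0; i < 8; i++) {
                s1a += b[i];
                s2a += s1a;
        }
        /* Rearranged form exploited by the vector implementations. */
        one_chunk(&s1b, &s2b, b, 8);

        assert(s1a == s1b && s2a == s2b);
        return 0;
}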