libdeflate 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +9 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +52 -0
- data/Rakefile +15 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/ext/libdeflate/extconf.rb +14 -0
- data/ext/libdeflate/libdeflate/.gitignore +19 -0
- data/ext/libdeflate/libdeflate/COPYING +21 -0
- data/ext/libdeflate/libdeflate/Makefile +231 -0
- data/ext/libdeflate/libdeflate/Makefile.msc +64 -0
- data/ext/libdeflate/libdeflate/NEWS +57 -0
- data/ext/libdeflate/libdeflate/README.md +170 -0
- data/ext/libdeflate/libdeflate/common/common_defs.h +351 -0
- data/ext/libdeflate/libdeflate/common/compiler_gcc.h +134 -0
- data/ext/libdeflate/libdeflate/common/compiler_msc.h +95 -0
- data/ext/libdeflate/libdeflate/lib/adler32.c +213 -0
- data/ext/libdeflate/libdeflate/lib/adler32_impl.h +281 -0
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +57 -0
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +13 -0
- data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +357 -0
- data/ext/libdeflate/libdeflate/lib/crc32.c +368 -0
- data/ext/libdeflate/libdeflate/lib/crc32_impl.h +286 -0
- data/ext/libdeflate/libdeflate/lib/crc32_table.h +526 -0
- data/ext/libdeflate/libdeflate/lib/decompress_impl.h +404 -0
- data/ext/libdeflate/libdeflate/lib/deflate_compress.c +2817 -0
- data/ext/libdeflate/libdeflate/lib/deflate_compress.h +14 -0
- data/ext/libdeflate/libdeflate/lib/deflate_constants.h +66 -0
- data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +889 -0
- data/ext/libdeflate/libdeflate/lib/gzip_compress.c +95 -0
- data/ext/libdeflate/libdeflate/lib/gzip_constants.h +45 -0
- data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +130 -0
- data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +405 -0
- data/ext/libdeflate/libdeflate/lib/lib_common.h +35 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +53 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +205 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +61 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +53 -0
- data/ext/libdeflate/libdeflate/lib/unaligned.h +202 -0
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +169 -0
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +48 -0
- data/ext/libdeflate/libdeflate/lib/zlib_compress.c +87 -0
- data/ext/libdeflate/libdeflate/lib/zlib_constants.h +21 -0
- data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +91 -0
- data/ext/libdeflate/libdeflate/libdeflate.h +274 -0
- data/ext/libdeflate/libdeflate/programs/benchmark.c +558 -0
- data/ext/libdeflate/libdeflate/programs/checksum.c +197 -0
- data/ext/libdeflate/libdeflate/programs/detect.sh +62 -0
- data/ext/libdeflate/libdeflate/programs/gzip.c +603 -0
- data/ext/libdeflate/libdeflate/programs/prog_util.c +530 -0
- data/ext/libdeflate/libdeflate/programs/prog_util.h +162 -0
- data/ext/libdeflate/libdeflate/programs/test_checksums.c +135 -0
- data/ext/libdeflate/libdeflate/programs/tgetopt.c +118 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +12 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +40 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +3 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +14 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +3 -0
- data/ext/libdeflate/libdeflate/tools/android_build.sh +104 -0
- data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +76 -0
- data/ext/libdeflate/libdeflate/tools/exec_tests.sh +30 -0
- data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +108 -0
- data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +100 -0
- data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +412 -0
- data/ext/libdeflate/libdeflate/tools/make-windows-releases +21 -0
- data/ext/libdeflate/libdeflate/tools/mips_build.sh +9 -0
- data/ext/libdeflate/libdeflate/tools/msc_test.bat +3 -0
- data/ext/libdeflate/libdeflate/tools/pgo_build.sh +23 -0
- data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +37 -0
- data/ext/libdeflate/libdeflate/tools/run_tests.sh +305 -0
- data/ext/libdeflate/libdeflate/tools/windows_build.sh +10 -0
- data/ext/libdeflate/libdeflate_ext.c +389 -0
- data/ext/libdeflate/libdeflate_ext.h +8 -0
- data/lib/libdeflate.rb +2 -0
- data/lib/libdeflate/version.rb +3 -0
- data/libdeflate.gemspec +33 -0
- metadata +230 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/*
 * lib_common.h - internal header included by all library code
 */

#ifndef LIB_LIB_COMMON_H
#define LIB_LIB_COMMON_H

#ifdef LIBDEFLATE_H
# error "lib_common.h must always be included before libdeflate.h"
  /* because BUILDING_LIBDEFLATE must be set first */
#endif

/* Tell libdeflate.h (and common_defs.h) that the library itself is being
 * compiled, as opposed to a program that merely uses the library. */
#define BUILDING_LIBDEFLATE

#include "common_defs.h"

/*
 * Prefix with "_libdeflate_" all global symbols which are not part of the API.
 * This avoids exposing overly generic names when libdeflate is built as a
 * static library.
 *
 * Note that the chosen prefix is not really important and can be changed
 * without breaking library users.  It was just chosen so that the resulting
 * symbol names are unlikely to conflict with those from any other software.
 * Also note that this fixup has no useful effect when libdeflate is built as a
 * shared library, since these symbols are not exported.
 */
#define SYM_FIXUP(sym)			_libdeflate_##sym
#define aligned_malloc			SYM_FIXUP(aligned_malloc)
#define aligned_free			SYM_FIXUP(aligned_free)
#define deflate_get_compression_level	SYM_FIXUP(deflate_get_compression_level)
#define _x86_cpu_features		SYM_FIXUP(_x86_cpu_features)
#define x86_setup_cpu_features		SYM_FIXUP(x86_setup_cpu_features)

#endif /* LIB_LIB_COMMON_H */
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* matchfinder_avx2.h - matchfinding routines optimized for Intel AVX2 (Advanced
|
|
3
|
+
* Vector Extensions)
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
#include <immintrin.h>
|
|
7
|
+
|
|
8
|
+
static forceinline bool
|
|
9
|
+
matchfinder_init_avx2(mf_pos_t *data, size_t size)
|
|
10
|
+
{
|
|
11
|
+
__m256i v, *p;
|
|
12
|
+
size_t n;
|
|
13
|
+
|
|
14
|
+
if (size % sizeof(__m256i) * 4)
|
|
15
|
+
return false;
|
|
16
|
+
|
|
17
|
+
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
|
|
18
|
+
v = _mm256_set1_epi16(MATCHFINDER_INITVAL);
|
|
19
|
+
p = (__m256i *)data;
|
|
20
|
+
n = size / (sizeof(__m256i) * 4);
|
|
21
|
+
do {
|
|
22
|
+
p[0] = v;
|
|
23
|
+
p[1] = v;
|
|
24
|
+
p[2] = v;
|
|
25
|
+
p[3] = v;
|
|
26
|
+
p += 4;
|
|
27
|
+
} while (--n);
|
|
28
|
+
return true;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
static forceinline bool
|
|
32
|
+
matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
|
|
33
|
+
{
|
|
34
|
+
__m256i v, *p;
|
|
35
|
+
size_t n;
|
|
36
|
+
|
|
37
|
+
if ((size % sizeof(__m256i) * 4 != 0))
|
|
38
|
+
return false;
|
|
39
|
+
|
|
40
|
+
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
|
|
41
|
+
v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
|
|
42
|
+
p = (__m256i *)data;
|
|
43
|
+
n = size / (sizeof(__m256i) * 4);
|
|
44
|
+
do {
|
|
45
|
+
/* PADDSW: Add Packed Signed Integers With Signed Saturation */
|
|
46
|
+
p[0] = _mm256_adds_epi16(p[0], v);
|
|
47
|
+
p[1] = _mm256_adds_epi16(p[1], v);
|
|
48
|
+
p[2] = _mm256_adds_epi16(p[2], v);
|
|
49
|
+
p[3] = _mm256_adds_epi16(p[3], v);
|
|
50
|
+
p += 4;
|
|
51
|
+
} while (--n);
|
|
52
|
+
return true;
|
|
53
|
+
}
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
/*
 * matchfinder_common.h - common code for Lempel-Ziv matchfinding
 */

#ifndef LIB_MATCHFINDER_COMMON_H
#define LIB_MATCHFINDER_COMMON_H

#include "lib_common.h"
#include "unaligned.h"

/* Each translation unit that includes this header must choose its window
 * size (as a power-of-two exponent) first. */
#ifndef MATCHFINDER_WINDOW_ORDER
# error "MATCHFINDER_WINDOW_ORDER must be defined!"
#endif

/* Sliding-window size in bytes (a power of two). */
#define MATCHFINDER_WINDOW_SIZE (1UL << MATCHFINDER_WINDOW_ORDER)

/* Type of a matchfinder position entry: a signed 16-bit value. */
typedef s16 mf_pos_t;

/* Initial value for position entries: far enough in the "past" that every
 * fresh entry reads as out of the window. */
#define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE)

/* Minimum required alignment of matchfinder buffers; raised below whenever
 * a SIMD implementation with stricter requirements is compiled in. */
#define MATCHFINDER_ALIGNMENT 8

#ifdef __AVX2__
# include "matchfinder_avx2.h"
# if MATCHFINDER_ALIGNMENT < 32
# undef MATCHFINDER_ALIGNMENT
# define MATCHFINDER_ALIGNMENT 32
# endif
#endif

#ifdef __SSE2__
# include "matchfinder_sse2.h"
# if MATCHFINDER_ALIGNMENT < 16
# undef MATCHFINDER_ALIGNMENT
# define MATCHFINDER_ALIGNMENT 16
# endif
#endif

#ifdef __ARM_NEON
# include "matchfinder_neon.h"
# if MATCHFINDER_ALIGNMENT < 16
# undef MATCHFINDER_ALIGNMENT
# define MATCHFINDER_ALIGNMENT 16
# endif
#endif
|
|
46
|
+
|
|
47
|
+
/*
|
|
48
|
+
* Initialize the hash table portion of the matchfinder.
|
|
49
|
+
*
|
|
50
|
+
* Essentially, this is an optimized memset().
|
|
51
|
+
*
|
|
52
|
+
* 'data' must be aligned to a MATCHFINDER_ALIGNMENT boundary.
|
|
53
|
+
*/
|
|
54
|
+
static forceinline void
|
|
55
|
+
matchfinder_init(mf_pos_t *data, size_t num_entries)
|
|
56
|
+
{
|
|
57
|
+
size_t i;
|
|
58
|
+
|
|
59
|
+
#if defined(__AVX2__) && defined(_aligned_attribute)
|
|
60
|
+
if (matchfinder_init_avx2(data, num_entries * sizeof(data[0])))
|
|
61
|
+
return;
|
|
62
|
+
#endif
|
|
63
|
+
|
|
64
|
+
#if defined(__SSE2__) && defined(_aligned_attribute)
|
|
65
|
+
if (matchfinder_init_sse2(data, num_entries * sizeof(data[0])))
|
|
66
|
+
return;
|
|
67
|
+
#endif
|
|
68
|
+
|
|
69
|
+
#if defined(__ARM_NEON) && defined(_aligned_attribute)
|
|
70
|
+
if (matchfinder_init_neon(data, num_entries * sizeof(data[0])))
|
|
71
|
+
return;
|
|
72
|
+
#endif
|
|
73
|
+
|
|
74
|
+
for (i = 0; i < num_entries; i++)
|
|
75
|
+
data[i] = MATCHFINDER_INITVAL;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/*
|
|
79
|
+
* Slide the matchfinder by WINDOW_SIZE bytes.
|
|
80
|
+
*
|
|
81
|
+
* This must be called just after each WINDOW_SIZE bytes have been run through
|
|
82
|
+
* the matchfinder.
|
|
83
|
+
*
|
|
84
|
+
* This will subtract WINDOW_SIZE bytes from each entry in the array specified.
|
|
85
|
+
* The effect is that all entries are updated to be relative to the current
|
|
86
|
+
* position, rather than the position WINDOW_SIZE bytes prior.
|
|
87
|
+
*
|
|
88
|
+
* Underflow is detected and replaced with signed saturation. This ensures that
|
|
89
|
+
* once the sliding window has passed over a position, that position forever
|
|
90
|
+
* remains out of bounds.
|
|
91
|
+
*
|
|
92
|
+
* The array passed in must contain all matchfinder data that is
|
|
93
|
+
* position-relative. Concretely, this will include the hash table as well as
|
|
94
|
+
* the table of positions that is used to link together the sequences in each
|
|
95
|
+
* hash bucket. Note that in the latter table, the links are 1-ary in the case
|
|
96
|
+
* of "hash chains", and 2-ary in the case of "binary trees". In either case,
|
|
97
|
+
* the links need to be rebased in the same way.
|
|
98
|
+
*/
|
|
99
|
+
static forceinline void
matchfinder_rebase(mf_pos_t *data, size_t num_entries)
{
	size_t i;

	/* Prefer a SIMD implementation when available; each one validates the
	 * buffer size itself and returns false to request the generic path. */
#if defined(__AVX2__) && defined(_aligned_attribute)
	if (matchfinder_rebase_avx2(data, num_entries * sizeof(data[0])))
		return;
#endif

#if defined(__SSE2__) && defined(_aligned_attribute)
	if (matchfinder_rebase_sse2(data, num_entries * sizeof(data[0])))
		return;
#endif

#if defined(__ARM_NEON) && defined(_aligned_attribute)
	if (matchfinder_rebase_neon(data, num_entries * sizeof(data[0])))
		return;
#endif

	if (MATCHFINDER_WINDOW_SIZE == 32768) {
		/* Branchless version for 32768 byte windows.  If the value
		 * was already negative, clear all bits except the sign bit;
		 * this changes the value to -32768.  Otherwise, set the sign
		 * bit; this is equivalent to subtracting 32768. */
		for (i = 0; i < num_entries; i++) {
			u16 v = data[i];
			u16 sign_bit = v & 0x8000;
			/* Mask is 0x8000 when the sign bit was set (keep only
			 * the sign bit), else 0xFFFF (keep everything). */
			v &= sign_bit - ((sign_bit >> 15) ^ 1);
			v |= 0x8000;
			data[i] = v;
		}
		return;
	}

	/* Generic fallback: saturate out-of-window entries to the minimum,
	 * otherwise rebase the entry by the window size.
	 * NOTE(review): '-= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE' adds
	 * WINDOW_SIZE; modulo 2^16 that equals subtracting WINDOW_SIZE only
	 * when WINDOW_SIZE == 32768, and that case is handled above —
	 * confirm intent for other window orders. */
	for (i = 0; i < num_entries; i++) {
		if (data[i] >= 0)
			data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
		else
			data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
	}
}
|
|
141
|
+
|
|
142
|
+
/*
|
|
143
|
+
* The hash function: given a sequence prefix held in the low-order bits of a
|
|
144
|
+
* 32-bit value, multiply by a carefully-chosen large constant. Discard any
|
|
145
|
+
* bits of the product that don't fit in a 32-bit value, but take the
|
|
146
|
+
* next-highest @num_bits bits of the product as the hash value, as those have
|
|
147
|
+
* the most randomness.
|
|
148
|
+
*/
|
|
149
|
+
static forceinline u32
|
|
150
|
+
lz_hash(u32 seq, unsigned num_bits)
|
|
151
|
+
{
|
|
152
|
+
return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits);
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
/*
|
|
156
|
+
* Return the number of bytes at @matchptr that match the bytes at @strptr, up
|
|
157
|
+
* to a maximum of @max_len. Initially, @start_len bytes are matched.
|
|
158
|
+
*/
|
|
159
|
+
static forceinline unsigned
lz_extend(const u8 * const strptr, const u8 * const matchptr,
	  const unsigned start_len, const unsigned max_len)
{
	unsigned len = start_len;
	machine_word_t v_word;	/* XOR of the two words being compared; 0 iff equal */

	if (UNALIGNED_ACCESS_IS_FAST) {

		/* Unrolled fast path: compare 4 machine words at a time while
		 * at least 4 * WORDBYTES bytes remain to be matched. */
		if (likely(max_len - len >= 4 * WORDBYTES)) {

#define COMPARE_WORD_STEP				\
	v_word = load_word_unaligned(&matchptr[len]) ^	\
		 load_word_unaligned(&strptr[len]);	\
	if (v_word != 0)				\
		goto word_differs;			\
	len += WORDBYTES;				\

			COMPARE_WORD_STEP
			COMPARE_WORD_STEP
			COMPARE_WORD_STEP
			COMPARE_WORD_STEP
#undef COMPARE_WORD_STEP
		}

		/* Continue word-at-a-time while a full word fits. */
		while (len + WORDBYTES <= max_len) {
			v_word = load_word_unaligned(&matchptr[len]) ^
				 load_word_unaligned(&strptr[len]);
			if (v_word != 0)
				goto word_differs;
			len += WORDBYTES;
		}
	}

	/* Byte-at-a-time tail (also the whole loop when unaligned access is
	 * slow on this platform). */
	while (len < max_len && matchptr[len] == strptr[len])
		len++;
	return len;

word_differs:
	/* The XOR's lowest-addressed nonzero byte marks the first mismatch:
	 * least-significant byte on little-endian, most-significant on
	 * big-endian.  Convert its bit index to a byte count. */
	if (CPU_IS_LITTLE_ENDIAN())
		len += (bsfw(v_word) >> 3);
	else
		len += (WORDBITS - 1 - bsrw(v_word)) >> 3;
	return len;
}
|
|
204
|
+
|
|
205
|
+
#endif /* LIB_MATCHFINDER_COMMON_H */
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* matchfinder_neon.h - matchfinding routines optimized for ARM NEON (Advanced
|
|
3
|
+
* SIMD) instructions
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
#include <arm_neon.h>
|
|
7
|
+
|
|
8
|
+
/*
 * NEON fast path for matchfinder_init(): fill the buffer at @data
 * (@size bytes, 16-byte aligned) with MATCHFINDER_INITVAL.
 *
 * Returns true if the buffer was handled here, or false if the caller must
 * fall back to the generic scalar code.
 */
static forceinline bool
matchfinder_init_neon(mf_pos_t *data, size_t size)
{
	int16x8_t v, *p;
	size_t n;

	/*
	 * Bug fix: '%' and '*' have equal precedence, so the previous test
	 * 'size % sizeof(int16x8_t) * 4' computed '(size % 16) * 4' and
	 * accepted sizes that are multiples of 16 but not of the 64-byte
	 * unrolled step, leaving the buffer's tail uninitialized.  It also
	 * accepted size == 0, which made the do/while underflow 'n'.
	 */
	if (size == 0 || size % (sizeof(int16x8_t) * 4) != 0)
		return false;

	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
	v = (int16x8_t) {
		MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
		MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
		MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
	};
	p = (int16x8_t *)data;
	n = size / (sizeof(int16x8_t) * 4);
	do {
		/* Four 16-byte stores (64 bytes) per iteration. */
		p[0] = v;
		p[1] = v;
		p[2] = v;
		p[3] = v;
		p += 4;
	} while (--n);
	return true;
}
|
|
34
|
+
|
|
35
|
+
/*
 * NEON fast path for matchfinder_rebase(): subtract MATCHFINDER_WINDOW_SIZE,
 * with signed saturation, from every 16-bit entry in the buffer at @data
 * (@size bytes, 16-byte aligned).
 *
 * Returns true if the buffer was handled here, or false if the caller must
 * fall back to the generic scalar code.
 */
static forceinline bool
matchfinder_rebase_neon(mf_pos_t *data, size_t size)
{
	int16x8_t v, *p;
	size_t n;

	/*
	 * Bug fix: '(size % sizeof(int16x8_t) * 4 != 0)' parsed as
	 * '((size % 16) * 4) != 0' because '%' and '*' have equal precedence,
	 * so sizes that are multiples of 16 but not of 64 slipped through and
	 * left the buffer's tail unrebased.  Also reject size == 0, which
	 * would underflow 'n' in the do/while below.
	 */
	if (size == 0 || size % (sizeof(int16x8_t) * 4) != 0)
		return false;

	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
	v = (int16x8_t) {
		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
	};
	p = (int16x8_t *)data;
	n = size / (sizeof(int16x8_t) * 4);
	do {
		/* vqaddq_s16: saturating signed add, so entries already at
		 * the minimum stay clamped instead of wrapping around. */
		p[0] = vqaddq_s16(p[0], v);
		p[1] = vqaddq_s16(p[1], v);
		p[2] = vqaddq_s16(p[2], v);
		p[3] = vqaddq_s16(p[3], v);
		p += 4;
	} while (--n);
	return true;
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* matchfinder_sse2.h - matchfinding routines optimized for Intel SSE2
|
|
3
|
+
* (Streaming SIMD Extensions).
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
#include <emmintrin.h>
|
|
7
|
+
|
|
8
|
+
static forceinline bool
|
|
9
|
+
matchfinder_init_sse2(mf_pos_t *data, size_t size)
|
|
10
|
+
{
|
|
11
|
+
__m128i v, *p;
|
|
12
|
+
size_t n;
|
|
13
|
+
|
|
14
|
+
if (size % sizeof(__m128i) * 4)
|
|
15
|
+
return false;
|
|
16
|
+
|
|
17
|
+
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
|
|
18
|
+
v = _mm_set1_epi16(MATCHFINDER_INITVAL);
|
|
19
|
+
p = (__m128i *)data;
|
|
20
|
+
n = size / (sizeof(__m128i) * 4);
|
|
21
|
+
do {
|
|
22
|
+
p[0] = v;
|
|
23
|
+
p[1] = v;
|
|
24
|
+
p[2] = v;
|
|
25
|
+
p[3] = v;
|
|
26
|
+
p += 4;
|
|
27
|
+
} while (--n);
|
|
28
|
+
return true;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
static forceinline bool
|
|
32
|
+
matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
|
|
33
|
+
{
|
|
34
|
+
__m128i v, *p;
|
|
35
|
+
size_t n;
|
|
36
|
+
|
|
37
|
+
if ((size % sizeof(__m128i) * 4 != 0))
|
|
38
|
+
return false;
|
|
39
|
+
|
|
40
|
+
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
|
|
41
|
+
v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
|
|
42
|
+
p = (__m128i *)data;
|
|
43
|
+
n = size / (sizeof(__m128i) * 4);
|
|
44
|
+
do {
|
|
45
|
+
/* PADDSW: Add Packed Signed Integers With Signed Saturation */
|
|
46
|
+
p[0] = _mm_adds_epi16(p[0], v);
|
|
47
|
+
p[1] = _mm_adds_epi16(p[1], v);
|
|
48
|
+
p[2] = _mm_adds_epi16(p[2], v);
|
|
49
|
+
p[3] = _mm_adds_epi16(p[3], v);
|
|
50
|
+
p += 4;
|
|
51
|
+
} while (--n);
|
|
52
|
+
return true;
|
|
53
|
+
}
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
/*
 * unaligned.h - inline functions for unaligned memory accesses
 */

#ifndef LIB_UNALIGNED_H
#define LIB_UNALIGNED_H

#include "lib_common.h"

/*
 * Naming note:
 *
 * {load,store}_*_unaligned() deal with raw bytes without endianness conversion.
 * {get,put}_unaligned_*() deal with a specific endianness.
 */

/* Instantiate load_<type>_unaligned()/store_<type>_unaligned() for each
 * integer width.  (DEFINE_UNALIGNED_TYPE presumably comes in via
 * lib_common.h -> common_defs.h — confirm against that header.) */
DEFINE_UNALIGNED_TYPE(u16)
DEFINE_UNALIGNED_TYPE(u32)
DEFINE_UNALIGNED_TYPE(u64)
DEFINE_UNALIGNED_TYPE(machine_word_t)

/* Convenience aliases for native machine-word-sized accesses. */
#define load_word_unaligned	load_machine_word_t_unaligned
#define store_word_unaligned	store_machine_word_t_unaligned
|
|
24
|
+
|
|
25
|
+
/***** Unaligned loads *****/
|
|
26
|
+
|
|
27
|
+
static forceinline u16
|
|
28
|
+
get_unaligned_le16(const u8 *p)
|
|
29
|
+
{
|
|
30
|
+
if (UNALIGNED_ACCESS_IS_FAST)
|
|
31
|
+
return le16_bswap(load_u16_unaligned(p));
|
|
32
|
+
else
|
|
33
|
+
return ((u16)p[1] << 8) | p[0];
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
static forceinline u16
|
|
37
|
+
get_unaligned_be16(const u8 *p)
|
|
38
|
+
{
|
|
39
|
+
if (UNALIGNED_ACCESS_IS_FAST)
|
|
40
|
+
return be16_bswap(load_u16_unaligned(p));
|
|
41
|
+
else
|
|
42
|
+
return ((u16)p[0] << 8) | p[1];
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
static forceinline u32
|
|
46
|
+
get_unaligned_le32(const u8 *p)
|
|
47
|
+
{
|
|
48
|
+
if (UNALIGNED_ACCESS_IS_FAST)
|
|
49
|
+
return le32_bswap(load_u32_unaligned(p));
|
|
50
|
+
else
|
|
51
|
+
return ((u32)p[3] << 24) | ((u32)p[2] << 16) |
|
|
52
|
+
((u32)p[1] << 8) | p[0];
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
static forceinline u32
|
|
56
|
+
get_unaligned_be32(const u8 *p)
|
|
57
|
+
{
|
|
58
|
+
if (UNALIGNED_ACCESS_IS_FAST)
|
|
59
|
+
return be32_bswap(load_u32_unaligned(p));
|
|
60
|
+
else
|
|
61
|
+
return ((u32)p[0] << 24) | ((u32)p[1] << 16) |
|
|
62
|
+
((u32)p[2] << 8) | p[3];
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
static forceinline u64
|
|
66
|
+
get_unaligned_le64(const u8 *p)
|
|
67
|
+
{
|
|
68
|
+
if (UNALIGNED_ACCESS_IS_FAST)
|
|
69
|
+
return le64_bswap(load_u64_unaligned(p));
|
|
70
|
+
else
|
|
71
|
+
return ((u64)p[7] << 56) | ((u64)p[6] << 48) |
|
|
72
|
+
((u64)p[5] << 40) | ((u64)p[4] << 32) |
|
|
73
|
+
((u64)p[3] << 24) | ((u64)p[2] << 16) |
|
|
74
|
+
((u64)p[1] << 8) | p[0];
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
static forceinline machine_word_t
|
|
78
|
+
get_unaligned_leword(const u8 *p)
|
|
79
|
+
{
|
|
80
|
+
STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
|
|
81
|
+
if (WORDBITS == 32)
|
|
82
|
+
return get_unaligned_le32(p);
|
|
83
|
+
else
|
|
84
|
+
return get_unaligned_le64(p);
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
/***** Unaligned stores *****/
|
|
88
|
+
|
|
89
|
+
static forceinline void
|
|
90
|
+
put_unaligned_le16(u16 v, u8 *p)
|
|
91
|
+
{
|
|
92
|
+
if (UNALIGNED_ACCESS_IS_FAST) {
|
|
93
|
+
store_u16_unaligned(le16_bswap(v), p);
|
|
94
|
+
} else {
|
|
95
|
+
p[0] = (u8)(v >> 0);
|
|
96
|
+
p[1] = (u8)(v >> 8);
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
static forceinline void
|
|
101
|
+
put_unaligned_be16(u16 v, u8 *p)
|
|
102
|
+
{
|
|
103
|
+
if (UNALIGNED_ACCESS_IS_FAST) {
|
|
104
|
+
store_u16_unaligned(be16_bswap(v), p);
|
|
105
|
+
} else {
|
|
106
|
+
p[0] = (u8)(v >> 8);
|
|
107
|
+
p[1] = (u8)(v >> 0);
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
static forceinline void
|
|
112
|
+
put_unaligned_le32(u32 v, u8 *p)
|
|
113
|
+
{
|
|
114
|
+
if (UNALIGNED_ACCESS_IS_FAST) {
|
|
115
|
+
store_u32_unaligned(le32_bswap(v), p);
|
|
116
|
+
} else {
|
|
117
|
+
p[0] = (u8)(v >> 0);
|
|
118
|
+
p[1] = (u8)(v >> 8);
|
|
119
|
+
p[2] = (u8)(v >> 16);
|
|
120
|
+
p[3] = (u8)(v >> 24);
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
static forceinline void
|
|
125
|
+
put_unaligned_be32(u32 v, u8 *p)
|
|
126
|
+
{
|
|
127
|
+
if (UNALIGNED_ACCESS_IS_FAST) {
|
|
128
|
+
store_u32_unaligned(be32_bswap(v), p);
|
|
129
|
+
} else {
|
|
130
|
+
p[0] = (u8)(v >> 24);
|
|
131
|
+
p[1] = (u8)(v >> 16);
|
|
132
|
+
p[2] = (u8)(v >> 8);
|
|
133
|
+
p[3] = (u8)(v >> 0);
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
static forceinline void
|
|
138
|
+
put_unaligned_le64(u64 v, u8 *p)
|
|
139
|
+
{
|
|
140
|
+
if (UNALIGNED_ACCESS_IS_FAST) {
|
|
141
|
+
store_u64_unaligned(le64_bswap(v), p);
|
|
142
|
+
} else {
|
|
143
|
+
p[0] = (u8)(v >> 0);
|
|
144
|
+
p[1] = (u8)(v >> 8);
|
|
145
|
+
p[2] = (u8)(v >> 16);
|
|
146
|
+
p[3] = (u8)(v >> 24);
|
|
147
|
+
p[4] = (u8)(v >> 32);
|
|
148
|
+
p[5] = (u8)(v >> 40);
|
|
149
|
+
p[6] = (u8)(v >> 48);
|
|
150
|
+
p[7] = (u8)(v >> 56);
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
static forceinline void
|
|
155
|
+
put_unaligned_leword(machine_word_t v, u8 *p)
|
|
156
|
+
{
|
|
157
|
+
STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
|
|
158
|
+
if (WORDBITS == 32)
|
|
159
|
+
put_unaligned_le32(v, p);
|
|
160
|
+
else
|
|
161
|
+
put_unaligned_le64(v, p);
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
/***** 24-bit loads *****/
|
|
165
|
+
|
|
166
|
+
/*
|
|
167
|
+
* Given a 32-bit value that was loaded with the platform's native endianness,
|
|
168
|
+
* return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24
|
|
169
|
+
* bits contain the first 3 bytes, arranged in octets in a platform-dependent
|
|
170
|
+
* order, at the memory location from which the input 32-bit value was loaded.
|
|
171
|
+
*/
|
|
172
|
+
static forceinline u32
|
|
173
|
+
loaded_u32_to_u24(u32 v)
|
|
174
|
+
{
|
|
175
|
+
if (CPU_IS_LITTLE_ENDIAN())
|
|
176
|
+
return v & 0xFFFFFF;
|
|
177
|
+
else
|
|
178
|
+
return v >> 8;
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
/*
|
|
182
|
+
* Load the next 3 bytes from the memory location @p into the 24 low-order bits
|
|
183
|
+
* of a 32-bit value. The order in which the 3 bytes will be arranged as octets
|
|
184
|
+
* in the 24 bits is platform-dependent. At least LOAD_U24_REQUIRED_NBYTES
|
|
185
|
+
* bytes must be available at @p; note that this may be more than 3.
|
|
186
|
+
*/
|
|
187
|
+
static forceinline u32
load_u24_unaligned(const u8 *p)
{
#if UNALIGNED_ACCESS_IS_FAST
# define LOAD_U24_REQUIRED_NBYTES 4
	/* Over-read one byte (hence the 4-byte requirement) and discard it;
	 * a single unaligned 32-bit load is cheaper than three byte loads. */
	return loaded_u32_to_u24(load_u32_unaligned(p));
#else
# define LOAD_U24_REQUIRED_NBYTES 3
	/* Byte-at-a-time assembly, ordered to match what the fast path
	 * produces on each endianness. */
	if (CPU_IS_LITTLE_ENDIAN())
		return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16);
	else
		return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16);
#endif
}
|
|
201
|
+
|
|
202
|
+
#endif /* LIB_UNALIGNED_H */
|