libdeflate 0.1.0
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +9 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +52 -0
- data/Rakefile +15 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/ext/libdeflate/extconf.rb +14 -0
- data/ext/libdeflate/libdeflate/.gitignore +19 -0
- data/ext/libdeflate/libdeflate/COPYING +21 -0
- data/ext/libdeflate/libdeflate/Makefile +231 -0
- data/ext/libdeflate/libdeflate/Makefile.msc +64 -0
- data/ext/libdeflate/libdeflate/NEWS +57 -0
- data/ext/libdeflate/libdeflate/README.md +170 -0
- data/ext/libdeflate/libdeflate/common/common_defs.h +351 -0
- data/ext/libdeflate/libdeflate/common/compiler_gcc.h +134 -0
- data/ext/libdeflate/libdeflate/common/compiler_msc.h +95 -0
- data/ext/libdeflate/libdeflate/lib/adler32.c +213 -0
- data/ext/libdeflate/libdeflate/lib/adler32_impl.h +281 -0
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +57 -0
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +13 -0
- data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +357 -0
- data/ext/libdeflate/libdeflate/lib/crc32.c +368 -0
- data/ext/libdeflate/libdeflate/lib/crc32_impl.h +286 -0
- data/ext/libdeflate/libdeflate/lib/crc32_table.h +526 -0
- data/ext/libdeflate/libdeflate/lib/decompress_impl.h +404 -0
- data/ext/libdeflate/libdeflate/lib/deflate_compress.c +2817 -0
- data/ext/libdeflate/libdeflate/lib/deflate_compress.h +14 -0
- data/ext/libdeflate/libdeflate/lib/deflate_constants.h +66 -0
- data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +889 -0
- data/ext/libdeflate/libdeflate/lib/gzip_compress.c +95 -0
- data/ext/libdeflate/libdeflate/lib/gzip_constants.h +45 -0
- data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +130 -0
- data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +405 -0
- data/ext/libdeflate/libdeflate/lib/lib_common.h +35 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +53 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +205 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +61 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +53 -0
- data/ext/libdeflate/libdeflate/lib/unaligned.h +202 -0
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +169 -0
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +48 -0
- data/ext/libdeflate/libdeflate/lib/zlib_compress.c +87 -0
- data/ext/libdeflate/libdeflate/lib/zlib_constants.h +21 -0
- data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +91 -0
- data/ext/libdeflate/libdeflate/libdeflate.h +274 -0
- data/ext/libdeflate/libdeflate/programs/benchmark.c +558 -0
- data/ext/libdeflate/libdeflate/programs/checksum.c +197 -0
- data/ext/libdeflate/libdeflate/programs/detect.sh +62 -0
- data/ext/libdeflate/libdeflate/programs/gzip.c +603 -0
- data/ext/libdeflate/libdeflate/programs/prog_util.c +530 -0
- data/ext/libdeflate/libdeflate/programs/prog_util.h +162 -0
- data/ext/libdeflate/libdeflate/programs/test_checksums.c +135 -0
- data/ext/libdeflate/libdeflate/programs/tgetopt.c +118 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +12 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +40 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +3 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +14 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +3 -0
- data/ext/libdeflate/libdeflate/tools/android_build.sh +104 -0
- data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +76 -0
- data/ext/libdeflate/libdeflate/tools/exec_tests.sh +30 -0
- data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +108 -0
- data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +100 -0
- data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +412 -0
- data/ext/libdeflate/libdeflate/tools/make-windows-releases +21 -0
- data/ext/libdeflate/libdeflate/tools/mips_build.sh +9 -0
- data/ext/libdeflate/libdeflate/tools/msc_test.bat +3 -0
- data/ext/libdeflate/libdeflate/tools/pgo_build.sh +23 -0
- data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +37 -0
- data/ext/libdeflate/libdeflate/tools/run_tests.sh +305 -0
- data/ext/libdeflate/libdeflate/tools/windows_build.sh +10 -0
- data/ext/libdeflate/libdeflate_ext.c +389 -0
- data/ext/libdeflate/libdeflate_ext.h +8 -0
- data/lib/libdeflate.rb +2 -0
- data/lib/libdeflate/version.rb +3 -0
- data/libdeflate.gemspec +33 -0
- metadata +230 -0
data/ext/libdeflate/libdeflate/lib/lib_common.h
@@ -0,0 +1,35 @@
/*
 * lib_common.h - internal header included by all library code
 */

#ifndef LIB_LIB_COMMON_H
#define LIB_LIB_COMMON_H

#ifdef LIBDEFLATE_H
#  error "lib_common.h must always be included before libdeflate.h"
   /* because BUILDING_LIBDEFLATE must be set first */
#endif

#define BUILDING_LIBDEFLATE

#include "common_defs.h"

/*
 * Prefix with "_libdeflate_" all global symbols which are not part of the API.
 * This avoids exposing overly generic names when libdeflate is built as a
 * static library.
 *
 * Note that the chosen prefix is not really important and can be changed
 * without breaking library users.  It was just chosen so that the resulting
 * symbol names are unlikely to conflict with those from any other software.
 * Also note that this fixup has no useful effect when libdeflate is built as a
 * shared library, since these symbols are not exported.
 */
#define SYM_FIXUP(sym)                  _libdeflate_##sym
#define aligned_malloc                  SYM_FIXUP(aligned_malloc)
#define aligned_free                    SYM_FIXUP(aligned_free)
#define deflate_get_compression_level   SYM_FIXUP(deflate_get_compression_level)
#define _x86_cpu_features               SYM_FIXUP(_x86_cpu_features)
#define x86_setup_cpu_features          SYM_FIXUP(x86_setup_cpu_features)

#endif /* LIB_LIB_COMMON_H */
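As a quick illustration of the token-pasting fixup above, here is a minimal sketch; the internal function name do_work is hypothetical and not one of the symbols listed in the header:

/* sketch: how the prefixing plays out for a hypothetical internal symbol */
#define SYM_FIXUP(sym)   _libdeflate_##sym
#define do_work          SYM_FIXUP(do_work)

void do_work(void) { }   /* compiles to a symbol named "_libdeflate_do_work" */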
data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h
@@ -0,0 +1,53 @@
/*
 * matchfinder_avx2.h - matchfinding routines optimized for Intel AVX2 (Advanced
 * Vector Extensions)
 */

#include <immintrin.h>

static forceinline bool
matchfinder_init_avx2(mf_pos_t *data, size_t size)
{
	__m256i v, *p;
	size_t n;

	if ((size % (sizeof(__m256i) * 4)) != 0)
		return false;

	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
	v = _mm256_set1_epi16(MATCHFINDER_INITVAL);
	p = (__m256i *)data;
	n = size / (sizeof(__m256i) * 4);
	do {
		p[0] = v;
		p[1] = v;
		p[2] = v;
		p[3] = v;
		p += 4;
	} while (--n);
	return true;
}

static forceinline bool
matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
{
	__m256i v, *p;
	size_t n;

	if ((size % (sizeof(__m256i) * 4)) != 0)
		return false;

	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
	v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
	p = (__m256i *)data;
	n = size / (sizeof(__m256i) * 4);
	do {
		/* PADDSW: Add Packed Signed Integers With Signed Saturation */
		p[0] = _mm256_adds_epi16(p[0], v);
		p[1] = _mm256_adds_epi16(p[1], v);
		p[2] = _mm256_adds_epi16(p[2], v);
		p[3] = _mm256_adds_epi16(p[3], v);
		p += 4;
	} while (--n);
	return true;
}
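The saturating add used by the rebase (PADDSW / _mm256_adds_epi16) has a simple per-lane meaning. A scalar sketch, assuming the usual 32768-byte DEFLATE window rather than any particular MATCHFINDER_WINDOW_ORDER setting:

#include <stdint.h>

/* What one 16-bit lane of the rebase computes: add -32768 with signed
 * saturation, so valid positions (>= 0) become pos - 32768, while entries
 * that were already negative stick at INT16_MIN, i.e. stay out of range. */
static int16_t rebase_one_lane(int16_t pos)
{
	int32_t sum = (int32_t)pos - 32768;
	return (int16_t)(sum < INT16_MIN ? INT16_MIN : sum);
}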
data/ext/libdeflate/libdeflate/lib/matchfinder_common.h
@@ -0,0 +1,205 @@
/*
 * matchfinder_common.h - common code for Lempel-Ziv matchfinding
 */

#ifndef LIB_MATCHFINDER_COMMON_H
#define LIB_MATCHFINDER_COMMON_H

#include "lib_common.h"
#include "unaligned.h"

#ifndef MATCHFINDER_WINDOW_ORDER
#  error "MATCHFINDER_WINDOW_ORDER must be defined!"
#endif

#define MATCHFINDER_WINDOW_SIZE (1UL << MATCHFINDER_WINDOW_ORDER)

typedef s16 mf_pos_t;

#define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE)

#define MATCHFINDER_ALIGNMENT 8

#ifdef __AVX2__
#  include "matchfinder_avx2.h"
#  if MATCHFINDER_ALIGNMENT < 32
#    undef MATCHFINDER_ALIGNMENT
#    define MATCHFINDER_ALIGNMENT 32
#  endif
#endif

#ifdef __SSE2__
#  include "matchfinder_sse2.h"
#  if MATCHFINDER_ALIGNMENT < 16
#    undef MATCHFINDER_ALIGNMENT
#    define MATCHFINDER_ALIGNMENT 16
#  endif
#endif

#ifdef __ARM_NEON
#  include "matchfinder_neon.h"
#  if MATCHFINDER_ALIGNMENT < 16
#    undef MATCHFINDER_ALIGNMENT
#    define MATCHFINDER_ALIGNMENT 16
#  endif
#endif

/*
 * Initialize the hash table portion of the matchfinder.
 *
 * Essentially, this is an optimized memset().
 *
 * 'data' must be aligned to a MATCHFINDER_ALIGNMENT boundary.
 */
static forceinline void
matchfinder_init(mf_pos_t *data, size_t num_entries)
{
	size_t i;

#if defined(__AVX2__) && defined(_aligned_attribute)
	if (matchfinder_init_avx2(data, num_entries * sizeof(data[0])))
		return;
#endif

#if defined(__SSE2__) && defined(_aligned_attribute)
	if (matchfinder_init_sse2(data, num_entries * sizeof(data[0])))
		return;
#endif

#if defined(__ARM_NEON) && defined(_aligned_attribute)
	if (matchfinder_init_neon(data, num_entries * sizeof(data[0])))
		return;
#endif

	for (i = 0; i < num_entries; i++)
		data[i] = MATCHFINDER_INITVAL;
}

/*
 * Slide the matchfinder by WINDOW_SIZE bytes.
 *
 * This must be called just after each WINDOW_SIZE bytes have been run through
 * the matchfinder.
 *
 * This will subtract WINDOW_SIZE bytes from each entry in the array specified.
 * The effect is that all entries are updated to be relative to the current
 * position, rather than the position WINDOW_SIZE bytes prior.
 *
 * Underflow is detected and replaced with signed saturation.  This ensures
 * that once the sliding window has passed over a position, that position
 * forever remains out of bounds.
 *
 * The array passed in must contain all matchfinder data that is
 * position-relative.  Concretely, this will include the hash table as well as
 * the table of positions that is used to link together the sequences in each
 * hash bucket.  Note that in the latter table, the links are 1-ary in the case
 * of "hash chains", and 2-ary in the case of "binary trees".  In either case,
 * the links need to be rebased in the same way.
 */
static forceinline void
matchfinder_rebase(mf_pos_t *data, size_t num_entries)
{
	size_t i;

#if defined(__AVX2__) && defined(_aligned_attribute)
	if (matchfinder_rebase_avx2(data, num_entries * sizeof(data[0])))
		return;
#endif

#if defined(__SSE2__) && defined(_aligned_attribute)
	if (matchfinder_rebase_sse2(data, num_entries * sizeof(data[0])))
		return;
#endif

#if defined(__ARM_NEON) && defined(_aligned_attribute)
	if (matchfinder_rebase_neon(data, num_entries * sizeof(data[0])))
		return;
#endif

	if (MATCHFINDER_WINDOW_SIZE == 32768) {
		/* Branchless version for 32768 byte windows.  If the value was
		 * already negative, clear all bits except the sign bit; this
		 * changes the value to -32768.  Otherwise, set the sign bit;
		 * this is equivalent to subtracting 32768.  */
		for (i = 0; i < num_entries; i++) {
			u16 v = data[i];
			u16 sign_bit = v & 0x8000;
			v &= sign_bit - ((sign_bit >> 15) ^ 1);
			v |= 0x8000;
			data[i] = v;
		}
		return;
	}

	for (i = 0; i < num_entries; i++) {
		if (data[i] >= 0)
			data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
		else
			data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
	}
}

/*
 * The hash function: given a sequence prefix held in the low-order bits of a
 * 32-bit value, multiply by a carefully-chosen large constant.  Discard any
 * bits of the product that don't fit in a 32-bit value, but take the
 * next-highest @num_bits bits of the product as the hash value, as those have
 * the most randomness.
 */
static forceinline u32
lz_hash(u32 seq, unsigned num_bits)
{
	return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits);
}

/*
 * Return the number of bytes at @matchptr that match the bytes at @strptr, up
 * to a maximum of @max_len.  Initially, @start_len bytes are matched.
 */
static forceinline unsigned
lz_extend(const u8 * const strptr, const u8 * const matchptr,
	  const unsigned start_len, const unsigned max_len)
{
	unsigned len = start_len;
	machine_word_t v_word;

	if (UNALIGNED_ACCESS_IS_FAST) {

		if (likely(max_len - len >= 4 * WORDBYTES)) {

		#define COMPARE_WORD_STEP				\
			v_word = load_word_unaligned(&matchptr[len]) ^	\
				 load_word_unaligned(&strptr[len]);	\
			if (v_word != 0)				\
				goto word_differs;			\
			len += WORDBYTES;				\

			COMPARE_WORD_STEP
			COMPARE_WORD_STEP
			COMPARE_WORD_STEP
			COMPARE_WORD_STEP
		#undef COMPARE_WORD_STEP
		}

		while (len + WORDBYTES <= max_len) {
			v_word = load_word_unaligned(&matchptr[len]) ^
				 load_word_unaligned(&strptr[len]);
			if (v_word != 0)
				goto word_differs;
			len += WORDBYTES;
		}
	}

	while (len < max_len && matchptr[len] == strptr[len])
		len++;
	return len;

word_differs:
	if (CPU_IS_LITTLE_ENDIAN())
		len += (bsfw(v_word) >> 3);
	else
		len += (WORDBITS - 1 - bsrw(v_word)) >> 3;
	return len;
}

#endif /* LIB_MATCHFINDER_COMMON_H */
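The multiplicative hash above can be exercised on its own; in the sketch below the packed 3-byte sequence and the 15-bit table size are illustrative choices, not constants taken from the compressor:

#include <stdint.h>
#include <stdio.h>

/* Same computation as lz_hash(): multiply by the constant and keep the
 * top num_bits bits of the low 32 bits of the product. */
static uint32_t lz_hash_demo(uint32_t seq, unsigned num_bits)
{
	return (uint32_t)(seq * 0x1E35A7BD) >> (32 - num_bits);
}

int main(void)
{
	uint32_t seq = 0x616263;                 /* bytes "abc" packed little-endian */
	printf("%u\n", lz_hash_demo(seq, 15));   /* bucket index in a 2^15-entry table */
	return 0;
}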
data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h
@@ -0,0 +1,61 @@
/*
 * matchfinder_neon.h - matchfinding routines optimized for ARM NEON (Advanced
 * SIMD) instructions
 */

#include <arm_neon.h>

static forceinline bool
matchfinder_init_neon(mf_pos_t *data, size_t size)
{
	int16x8_t v, *p;
	size_t n;

	if ((size % (sizeof(int16x8_t) * 4)) != 0)
		return false;

	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
	v = (int16x8_t) {
		MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
		MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
		MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
	};
	p = (int16x8_t *)data;
	n = size / (sizeof(int16x8_t) * 4);
	do {
		p[0] = v;
		p[1] = v;
		p[2] = v;
		p[3] = v;
		p += 4;
	} while (--n);
	return true;
}

static forceinline bool
matchfinder_rebase_neon(mf_pos_t *data, size_t size)
{
	int16x8_t v, *p;
	size_t n;

	if ((size % (sizeof(int16x8_t) * 4)) != 0)
		return false;

	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
	v = (int16x8_t) {
		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
	};
	p = (int16x8_t *)data;
	n = size / (sizeof(int16x8_t) * 4);
	do {
		p[0] = vqaddq_s16(p[0], v);
		p[1] = vqaddq_s16(p[1], v);
		p[2] = vqaddq_s16(p[2], v);
		p[3] = vqaddq_s16(p[3], v);
		p += 4;
	} while (--n);
	return true;
}
data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h
@@ -0,0 +1,53 @@
/*
 * matchfinder_sse2.h - matchfinding routines optimized for Intel SSE2
 * (Streaming SIMD Extensions).
 */

#include <emmintrin.h>

static forceinline bool
matchfinder_init_sse2(mf_pos_t *data, size_t size)
{
	__m128i v, *p;
	size_t n;

	if ((size % (sizeof(__m128i) * 4)) != 0)
		return false;

	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
	v = _mm_set1_epi16(MATCHFINDER_INITVAL);
	p = (__m128i *)data;
	n = size / (sizeof(__m128i) * 4);
	do {
		p[0] = v;
		p[1] = v;
		p[2] = v;
		p[3] = v;
		p += 4;
	} while (--n);
	return true;
}

static forceinline bool
matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
{
	__m128i v, *p;
	size_t n;

	if ((size % (sizeof(__m128i) * 4)) != 0)
		return false;

	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
	v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
	p = (__m128i *)data;
	n = size / (sizeof(__m128i) * 4);
	do {
		/* PADDSW: Add Packed Signed Integers With Signed Saturation */
		p[0] = _mm_adds_epi16(p[0], v);
		p[1] = _mm_adds_epi16(p[1], v);
		p[2] = _mm_adds_epi16(p[2], v);
		p[3] = _mm_adds_epi16(p[3], v);
		p += 4;
	} while (--n);
	return true;
}
data/ext/libdeflate/libdeflate/lib/unaligned.h
@@ -0,0 +1,202 @@
/*
 * unaligned.h - inline functions for unaligned memory accesses
 */

#ifndef LIB_UNALIGNED_H
#define LIB_UNALIGNED_H

#include "lib_common.h"

/*
 * Naming note:
 *
 * {load,store}_*_unaligned() deal with raw bytes without endianness conversion.
 * {get,put}_unaligned_*() deal with a specific endianness.
 */

DEFINE_UNALIGNED_TYPE(u16)
DEFINE_UNALIGNED_TYPE(u32)
DEFINE_UNALIGNED_TYPE(u64)
DEFINE_UNALIGNED_TYPE(machine_word_t)

#define load_word_unaligned	load_machine_word_t_unaligned
#define store_word_unaligned	store_machine_word_t_unaligned

/***** Unaligned loads *****/

static forceinline u16
get_unaligned_le16(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return le16_bswap(load_u16_unaligned(p));
	else
		return ((u16)p[1] << 8) | p[0];
}

static forceinline u16
get_unaligned_be16(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return be16_bswap(load_u16_unaligned(p));
	else
		return ((u16)p[0] << 8) | p[1];
}

static forceinline u32
get_unaligned_le32(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return le32_bswap(load_u32_unaligned(p));
	else
		return ((u32)p[3] << 24) | ((u32)p[2] << 16) |
			((u32)p[1] << 8) | p[0];
}

static forceinline u32
get_unaligned_be32(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return be32_bswap(load_u32_unaligned(p));
	else
		return ((u32)p[0] << 24) | ((u32)p[1] << 16) |
			((u32)p[2] << 8) | p[3];
}

static forceinline u64
get_unaligned_le64(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return le64_bswap(load_u64_unaligned(p));
	else
		return ((u64)p[7] << 56) | ((u64)p[6] << 48) |
			((u64)p[5] << 40) | ((u64)p[4] << 32) |
			((u64)p[3] << 24) | ((u64)p[2] << 16) |
			((u64)p[1] << 8) | p[0];
}

static forceinline machine_word_t
get_unaligned_leword(const u8 *p)
{
	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
	if (WORDBITS == 32)
		return get_unaligned_le32(p);
	else
		return get_unaligned_le64(p);
}

/***** Unaligned stores *****/

static forceinline void
put_unaligned_le16(u16 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u16_unaligned(le16_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 0);
		p[1] = (u8)(v >> 8);
	}
}

static forceinline void
put_unaligned_be16(u16 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u16_unaligned(be16_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 8);
		p[1] = (u8)(v >> 0);
	}
}

static forceinline void
put_unaligned_le32(u32 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u32_unaligned(le32_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 0);
		p[1] = (u8)(v >> 8);
		p[2] = (u8)(v >> 16);
		p[3] = (u8)(v >> 24);
	}
}

static forceinline void
put_unaligned_be32(u32 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u32_unaligned(be32_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 24);
		p[1] = (u8)(v >> 16);
		p[2] = (u8)(v >> 8);
		p[3] = (u8)(v >> 0);
	}
}

static forceinline void
put_unaligned_le64(u64 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u64_unaligned(le64_bswap(v), p);
	} else {
		p[0] = (u8)(v >> 0);
		p[1] = (u8)(v >> 8);
		p[2] = (u8)(v >> 16);
		p[3] = (u8)(v >> 24);
		p[4] = (u8)(v >> 32);
		p[5] = (u8)(v >> 40);
		p[6] = (u8)(v >> 48);
		p[7] = (u8)(v >> 56);
	}
}

static forceinline void
put_unaligned_leword(machine_word_t v, u8 *p)
{
	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
	if (WORDBITS == 32)
		put_unaligned_le32(v, p);
	else
		put_unaligned_le64(v, p);
}

/***** 24-bit loads *****/

/*
 * Given a 32-bit value that was loaded with the platform's native endianness,
 * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24
 * bits contain the first 3 bytes, arranged in octets in a platform-dependent
 * order, at the memory location from which the input 32-bit value was loaded.
 */
static forceinline u32
loaded_u32_to_u24(u32 v)
{
	if (CPU_IS_LITTLE_ENDIAN())
		return v & 0xFFFFFF;
	else
		return v >> 8;
}

/*
 * Load the next 3 bytes from the memory location @p into the 24 low-order bits
 * of a 32-bit value.  The order in which the 3 bytes will be arranged as octets
 * in the 24 bits is platform-dependent.  At least LOAD_U24_REQUIRED_NBYTES
 * bytes must be available at @p; note that this may be more than 3.
 */
static forceinline u32
load_u24_unaligned(const u8 *p)
{
#if UNALIGNED_ACCESS_IS_FAST
#  define LOAD_U24_REQUIRED_NBYTES 4
	return loaded_u32_to_u24(load_u32_unaligned(p));
#else
#  define LOAD_U24_REQUIRED_NBYTES 3
	if (CPU_IS_LITTLE_ENDIAN())
		return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16);
	else
		return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16);
#endif
}

#endif /* LIB_UNALIGNED_H */
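A small usage sketch of the little-endian helper pattern above, written with plain stdint types rather than the library's internal u8/u32 typedefs:

#include <stdint.h>
#include <assert.h>

/* Portable fallback path of get_unaligned_le32(): assemble a little-endian
 * 32-bit value byte by byte, regardless of host endianness or alignment. */
static uint32_t le32_from_bytes(const uint8_t *p)
{
	return ((uint32_t)p[3] << 24) | ((uint32_t)p[2] << 16) |
	       ((uint32_t)p[1] << 8) | p[0];
}

int main(void)
{
	const uint8_t gzip_header_start[4] = { 0x1f, 0x8b, 0x08, 0x00 };
	assert(le32_from_bytes(gzip_header_start) == 0x00088b1f);
	return 0;
}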