libdeflate 0.1.1 → 0.2.0
- checksums.yaml +5 -5
- data/.github/workflows/test.yml +34 -0
- data/README.md +1 -6
- data/ext/libdeflate/extconf.rb +18 -7
- data/ext/libdeflate/libdeflate_ext.c +17 -17
- data/lib/libdeflate/version.rb +1 -1
- data/libdeflate.gemspec +2 -1
- metadata +13 -84
- data/.gitmodules +0 -3
- data/.travis.yml +0 -5
- data/ext/libdeflate/libdeflate/.gitignore +0 -19
- data/ext/libdeflate/libdeflate/COPYING +0 -21
- data/ext/libdeflate/libdeflate/Makefile +0 -231
- data/ext/libdeflate/libdeflate/Makefile.msc +0 -64
- data/ext/libdeflate/libdeflate/NEWS +0 -57
- data/ext/libdeflate/libdeflate/README.md +0 -170
- data/ext/libdeflate/libdeflate/common/common_defs.h +0 -351
- data/ext/libdeflate/libdeflate/common/compiler_gcc.h +0 -134
- data/ext/libdeflate/libdeflate/common/compiler_msc.h +0 -95
- data/ext/libdeflate/libdeflate/lib/adler32.c +0 -213
- data/ext/libdeflate/libdeflate/lib/adler32_impl.h +0 -281
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +0 -57
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +0 -13
- data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +0 -357
- data/ext/libdeflate/libdeflate/lib/crc32.c +0 -368
- data/ext/libdeflate/libdeflate/lib/crc32_impl.h +0 -286
- data/ext/libdeflate/libdeflate/lib/crc32_table.h +0 -526
- data/ext/libdeflate/libdeflate/lib/decompress_impl.h +0 -404
- data/ext/libdeflate/libdeflate/lib/deflate_compress.c +0 -2817
- data/ext/libdeflate/libdeflate/lib/deflate_compress.h +0 -14
- data/ext/libdeflate/libdeflate/lib/deflate_constants.h +0 -66
- data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +0 -889
- data/ext/libdeflate/libdeflate/lib/gzip_compress.c +0 -95
- data/ext/libdeflate/libdeflate/lib/gzip_constants.h +0 -45
- data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +0 -130
- data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +0 -405
- data/ext/libdeflate/libdeflate/lib/lib_common.h +0 -35
- data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +0 -53
- data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +0 -205
- data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +0 -61
- data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +0 -53
- data/ext/libdeflate/libdeflate/lib/unaligned.h +0 -202
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +0 -169
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +0 -48
- data/ext/libdeflate/libdeflate/lib/zlib_compress.c +0 -87
- data/ext/libdeflate/libdeflate/lib/zlib_constants.h +0 -21
- data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +0 -91
- data/ext/libdeflate/libdeflate/libdeflate.h +0 -274
- data/ext/libdeflate/libdeflate/programs/benchmark.c +0 -558
- data/ext/libdeflate/libdeflate/programs/checksum.c +0 -197
- data/ext/libdeflate/libdeflate/programs/detect.sh +0 -62
- data/ext/libdeflate/libdeflate/programs/gzip.c +0 -603
- data/ext/libdeflate/libdeflate/programs/prog_util.c +0 -530
- data/ext/libdeflate/libdeflate/programs/prog_util.h +0 -162
- data/ext/libdeflate/libdeflate/programs/test_checksums.c +0 -135
- data/ext/libdeflate/libdeflate/programs/tgetopt.c +0 -118
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +0 -12
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +0 -40
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +0 -28
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +0 -3
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +0 -28
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +0 -14
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +0 -28
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +0 -3
- data/ext/libdeflate/libdeflate/tools/android_build.sh +0 -104
- data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +0 -76
- data/ext/libdeflate/libdeflate/tools/exec_tests.sh +0 -30
- data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +0 -108
- data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +0 -100
- data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +0 -412
- data/ext/libdeflate/libdeflate/tools/make-windows-releases +0 -21
- data/ext/libdeflate/libdeflate/tools/mips_build.sh +0 -9
- data/ext/libdeflate/libdeflate/tools/msc_test.bat +0 -3
- data/ext/libdeflate/libdeflate/tools/pgo_build.sh +0 -23
- data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +0 -37
- data/ext/libdeflate/libdeflate/tools/run_tests.sh +0 -305
- data/ext/libdeflate/libdeflate/tools/windows_build.sh +0 -10
--- data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * matchfinder_avx2.h - matchfinding routines optimized for Intel AVX2 (Advanced
- * Vector Extensions)
- */
-
-#include <immintrin.h>
-
-static forceinline bool
-matchfinder_init_avx2(mf_pos_t *data, size_t size)
-{
-        __m256i v, *p;
-        size_t n;
-
-        if (size % (sizeof(__m256i) * 4) != 0)
-                return false;
-
-        STATIC_ASSERT(sizeof(mf_pos_t) == 2);
-        v = _mm256_set1_epi16(MATCHFINDER_INITVAL);
-        p = (__m256i *)data;
-        n = size / (sizeof(__m256i) * 4);
-        do {
-                p[0] = v;
-                p[1] = v;
-                p[2] = v;
-                p[3] = v;
-                p += 4;
-        } while (--n);
-        return true;
-}
-
-static forceinline bool
-matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
-{
-        __m256i v, *p;
-        size_t n;
-
-        if (size % (sizeof(__m256i) * 4) != 0)
-                return false;
-
-        STATIC_ASSERT(sizeof(mf_pos_t) == 2);
-        v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
-        p = (__m256i *)data;
-        n = size / (sizeof(__m256i) * 4);
-        do {
-                /* PADDSW: Add Packed Signed Integers With Signed Saturation */
-                p[0] = _mm256_adds_epi16(p[0], v);
-                p[1] = _mm256_adds_epi16(p[1], v);
-                p[2] = _mm256_adds_epi16(p[2], v);
-                p[3] = _mm256_adds_epi16(p[3], v);
-                p += 4;
-        } while (--n);
-        return true;
-}
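Note on the rebase loop above: _mm256_adds_epi16 is a saturating add, so an entry already pinned at the s16 minimum cannot wrap back into the valid position range. A minimal scalar model of one 16-bit lane (illustrative sketch, not part of the gem):

    #include <stdint.h>

    /* One lane of PADDSW: signed 16-bit add that clamps instead of
     * wrapping, so a stale entry at -32768 stays at -32768 forever. */
    static int16_t adds_s16(int16_t a, int16_t b)
    {
            int32_t sum = (int32_t)a + b;   /* widen so the add cannot overflow */
            if (sum > INT16_MAX)
                    return INT16_MAX;
            if (sum < INT16_MIN)
                    return INT16_MIN;
            return (int16_t)sum;
    }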
--- data/ext/libdeflate/libdeflate/lib/matchfinder_common.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * matchfinder_common.h - common code for Lempel-Ziv matchfinding
- */
-
-#ifndef LIB_MATCHFINDER_COMMON_H
-#define LIB_MATCHFINDER_COMMON_H
-
-#include "lib_common.h"
-#include "unaligned.h"
-
-#ifndef MATCHFINDER_WINDOW_ORDER
-#  error "MATCHFINDER_WINDOW_ORDER must be defined!"
-#endif
-
-#define MATCHFINDER_WINDOW_SIZE (1UL << MATCHFINDER_WINDOW_ORDER)
-
-typedef s16 mf_pos_t;
-
-#define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE)
-
-#define MATCHFINDER_ALIGNMENT 8
-
-#ifdef __AVX2__
-#  include "matchfinder_avx2.h"
-#  if MATCHFINDER_ALIGNMENT < 32
-#    undef MATCHFINDER_ALIGNMENT
-#    define MATCHFINDER_ALIGNMENT 32
-#  endif
-#endif
-
-#ifdef __SSE2__
-#  include "matchfinder_sse2.h"
-#  if MATCHFINDER_ALIGNMENT < 16
-#    undef MATCHFINDER_ALIGNMENT
-#    define MATCHFINDER_ALIGNMENT 16
-#  endif
-#endif
-
-#ifdef __ARM_NEON
-#  include "matchfinder_neon.h"
-#  if MATCHFINDER_ALIGNMENT < 16
-#    undef MATCHFINDER_ALIGNMENT
-#    define MATCHFINDER_ALIGNMENT 16
-#  endif
-#endif
-
-/*
- * Initialize the hash table portion of the matchfinder.
- *
- * Essentially, this is an optimized memset().
- *
- * 'data' must be aligned to a MATCHFINDER_ALIGNMENT boundary.
- */
-static forceinline void
-matchfinder_init(mf_pos_t *data, size_t num_entries)
-{
-        size_t i;
-
-#if defined(__AVX2__) && defined(_aligned_attribute)
-        if (matchfinder_init_avx2(data, num_entries * sizeof(data[0])))
-                return;
-#endif
-
-#if defined(__SSE2__) && defined(_aligned_attribute)
-        if (matchfinder_init_sse2(data, num_entries * sizeof(data[0])))
-                return;
-#endif
-
-#if defined(__ARM_NEON) && defined(_aligned_attribute)
-        if (matchfinder_init_neon(data, num_entries * sizeof(data[0])))
-                return;
-#endif
-
-        for (i = 0; i < num_entries; i++)
-                data[i] = MATCHFINDER_INITVAL;
-}
-
-/*
- * Slide the matchfinder by WINDOW_SIZE bytes.
- *
- * This must be called just after each WINDOW_SIZE bytes have been run through
- * the matchfinder.
- *
- * This will subtract WINDOW_SIZE bytes from each entry in the array specified.
- * The effect is that all entries are updated to be relative to the current
- * position, rather than the position WINDOW_SIZE bytes prior.
- *
- * Underflow is detected and replaced with signed saturation. This ensures that
- * once the sliding window has passed over a position, that position forever
- * remains out of bounds.
- *
- * The array passed in must contain all matchfinder data that is
- * position-relative. Concretely, this will include the hash table as well as
- * the table of positions that is used to link together the sequences in each
- * hash bucket. Note that in the latter table, the links are 1-ary in the case
- * of "hash chains", and 2-ary in the case of "binary trees". In either case,
- * the links need to be rebased in the same way.
- */
-static forceinline void
-matchfinder_rebase(mf_pos_t *data, size_t num_entries)
-{
-        size_t i;
-
-#if defined(__AVX2__) && defined(_aligned_attribute)
-        if (matchfinder_rebase_avx2(data, num_entries * sizeof(data[0])))
-                return;
-#endif
-
-#if defined(__SSE2__) && defined(_aligned_attribute)
-        if (matchfinder_rebase_sse2(data, num_entries * sizeof(data[0])))
-                return;
-#endif
-
-#if defined(__ARM_NEON) && defined(_aligned_attribute)
-        if (matchfinder_rebase_neon(data, num_entries * sizeof(data[0])))
-                return;
-#endif
-
-        if (MATCHFINDER_WINDOW_SIZE == 32768) {
-                /* Branchless version for 32768 byte windows. If the value was
-                 * already negative, clear all bits except the sign bit; this
-                 * changes the value to -32768. Otherwise, set the sign bit;
-                 * this is equivalent to subtracting 32768. */
-                for (i = 0; i < num_entries; i++) {
-                        u16 v = data[i];
-                        u16 sign_bit = v & 0x8000;
-                        v &= sign_bit - ((sign_bit >> 15) ^ 1);
-                        v |= 0x8000;
-                        data[i] = v;
-                }
-                return;
-        }
-
-        for (i = 0; i < num_entries; i++) {
-                if (data[i] >= 0)
-                        data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
-                else
-                        data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
-        }
-}
-
-/*
- * The hash function: given a sequence prefix held in the low-order bits of a
- * 32-bit value, multiply by a carefully-chosen large constant. Discard any
- * bits of the product that don't fit in a 32-bit value, but take the
- * next-highest @num_bits bits of the product as the hash value, as those have
- * the most randomness.
- */
-static forceinline u32
-lz_hash(u32 seq, unsigned num_bits)
-{
-        return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits);
-}
-
-/*
- * Return the number of bytes at @matchptr that match the bytes at @strptr, up
- * to a maximum of @max_len. Initially, @start_len bytes are matched.
- */
-static forceinline unsigned
-lz_extend(const u8 * const strptr, const u8 * const matchptr,
-          const unsigned start_len, const unsigned max_len)
-{
-        unsigned len = start_len;
-        machine_word_t v_word;
-
-        if (UNALIGNED_ACCESS_IS_FAST) {
-
-                if (likely(max_len - len >= 4 * WORDBYTES)) {
-
-#define COMPARE_WORD_STEP                                        \
-                        v_word = load_word_unaligned(&matchptr[len]) ^ \
-                                 load_word_unaligned(&strptr[len]); \
-                        if (v_word != 0)                         \
-                                goto word_differs;               \
-                        len += WORDBYTES;                        \
-
-                        COMPARE_WORD_STEP
-                        COMPARE_WORD_STEP
-                        COMPARE_WORD_STEP
-                        COMPARE_WORD_STEP
-#undef COMPARE_WORD_STEP
-                }
-
-                while (len + WORDBYTES <= max_len) {
-                        v_word = load_word_unaligned(&matchptr[len]) ^
-                                 load_word_unaligned(&strptr[len]);
-                        if (v_word != 0)
-                                goto word_differs;
-                        len += WORDBYTES;
-                }
-        }
-
-        while (len < max_len && matchptr[len] == strptr[len])
-                len++;
-        return len;
-
-word_differs:
-        if (CPU_IS_LITTLE_ENDIAN())
-                len += (bsfw(v_word) >> 3);
-        else
-                len += (WORDBITS - 1 - bsrw(v_word)) >> 3;
-        return len;
-}
-
-#endif /* LIB_MATCHFINDER_COMMON_H */
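The lz_hash() comment above describes classic multiplicative hashing: the high bits of the truncated 32-bit product are the best-mixed, so they become the bucket index. A self-contained sketch of the same computation (demo names are hypothetical; 15 bits assumed for a 2^15-entry table):

    #include <stdint.h>
    #include <stdio.h>

    /* Same arithmetic as lz_hash(): multiply by a large odd constant,
     * keep the top num_bits bits of the low 32 bits of the product. */
    static uint32_t hash_demo(uint32_t seq, unsigned num_bits)
    {
            return (uint32_t)(seq * 0x1E35A7BD) >> (32 - num_bits);
    }

    int main(void)
    {
            /* Hash the 3-byte prefix "abc", packed little-endian. */
            uint32_t seq = 'a' | ('b' << 8) | ((uint32_t)'c' << 16);
            printf("bucket = %u\n", hash_demo(seq, 15));  /* always < 32768 */
            return 0;
    }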
--- data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * matchfinder_neon.h - matchfinding routines optimized for ARM NEON (Advanced
- * SIMD) instructions
- */
-
-#include <arm_neon.h>
-
-static forceinline bool
-matchfinder_init_neon(mf_pos_t *data, size_t size)
-{
-        int16x8_t v, *p;
-        size_t n;
-
-        if (size % (sizeof(int16x8_t) * 4) != 0)
-                return false;
-
-        STATIC_ASSERT(sizeof(mf_pos_t) == 2);
-        v = (int16x8_t) {
-                MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
-                MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
-                MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
-        };
-        p = (int16x8_t *)data;
-        n = size / (sizeof(int16x8_t) * 4);
-        do {
-                p[0] = v;
-                p[1] = v;
-                p[2] = v;
-                p[3] = v;
-                p += 4;
-        } while (--n);
-        return true;
-}
-
-static forceinline bool
-matchfinder_rebase_neon(mf_pos_t *data, size_t size)
-{
-        int16x8_t v, *p;
-        size_t n;
-
-        if (size % (sizeof(int16x8_t) * 4) != 0)
-                return false;
-
-        STATIC_ASSERT(sizeof(mf_pos_t) == 2);
-        v = (int16x8_t) {
-                (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
-                (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
-                (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
-                (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
-        };
-        p = (int16x8_t *)data;
-        n = size / (sizeof(int16x8_t) * 4);
-        do {
-                p[0] = vqaddq_s16(p[0], v);
-                p[1] = vqaddq_s16(p[1], v);
-                p[2] = vqaddq_s16(p[2], v);
-                p[3] = vqaddq_s16(p[3], v);
-                p += 4;
-        } while (--n);
-        return true;
-}
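The NEON variant broadcasts its constant with an eight-element compound literal rather than an intrinsic; vdupq_n_s16() is the equivalent single-call spelling. A sketch of the idiom only (not the library's code):

    #include <arm_neon.h>

    /* Equivalent to the compound literals above: duplicate one s16
     * constant into all eight lanes of a 128-bit vector. */
    static inline int16x8_t broadcast_s16(int16_t value)
    {
            return vdupq_n_s16(value);
    }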
--- data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * matchfinder_sse2.h - matchfinding routines optimized for Intel SSE2
- * (Streaming SIMD Extensions).
- */
-
-#include <emmintrin.h>
-
-static forceinline bool
-matchfinder_init_sse2(mf_pos_t *data, size_t size)
-{
-        __m128i v, *p;
-        size_t n;
-
-        if (size % (sizeof(__m128i) * 4) != 0)
-                return false;
-
-        STATIC_ASSERT(sizeof(mf_pos_t) == 2);
-        v = _mm_set1_epi16(MATCHFINDER_INITVAL);
-        p = (__m128i *)data;
-        n = size / (sizeof(__m128i) * 4);
-        do {
-                p[0] = v;
-                p[1] = v;
-                p[2] = v;
-                p[3] = v;
-                p += 4;
-        } while (--n);
-        return true;
-}
-
-static forceinline bool
-matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
-{
-        __m128i v, *p;
-        size_t n;
-
-        if (size % (sizeof(__m128i) * 4) != 0)
-                return false;
-
-        STATIC_ASSERT(sizeof(mf_pos_t) == 2);
-        v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
-        p = (__m128i *)data;
-        n = size / (sizeof(__m128i) * 4);
-        do {
-                /* PADDSW: Add Packed Signed Integers With Signed Saturation */
-                p[0] = _mm_adds_epi16(p[0], v);
-                p[1] = _mm_adds_epi16(p[1], v);
-                p[2] = _mm_adds_epi16(p[2], v);
-                p[3] = _mm_adds_epi16(p[3], v);
-                p += 4;
-        } while (--n);
-        return true;
-}
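All three SIMD variants decline buffers that are not a whole number of 4-vector groups, so the caller falls back to the scalar loops in matchfinder_init()/matchfinder_rebase(). A generic form of that guard (illustrative sketch; the zero-size check is an addition of this sketch, since the do/while loops above assume at least one group):

    #include <stdbool.h>
    #include <stddef.h>

    /* True if `size` bytes can be processed as whole groups of four
     * `vec_bytes`-sized vectors (vec_bytes = 16 for SSE2/NEON, 32 for AVX2). */
    static bool bulk_size_ok(size_t size, size_t vec_bytes)
    {
            return size != 0 && size % (vec_bytes * 4) == 0;
    }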
--- data/ext/libdeflate/libdeflate/lib/unaligned.h
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * unaligned.h - inline functions for unaligned memory accesses
- */
-
-#ifndef LIB_UNALIGNED_H
-#define LIB_UNALIGNED_H
-
-#include "lib_common.h"
-
-/*
- * Naming note:
- *
- * {load,store}_*_unaligned() deal with raw bytes without endianness conversion.
- * {get,put}_unaligned_*() deal with a specific endianness.
- */
-
-DEFINE_UNALIGNED_TYPE(u16)
-DEFINE_UNALIGNED_TYPE(u32)
-DEFINE_UNALIGNED_TYPE(u64)
-DEFINE_UNALIGNED_TYPE(machine_word_t)
-
-#define load_word_unaligned load_machine_word_t_unaligned
-#define store_word_unaligned store_machine_word_t_unaligned
-
-/***** Unaligned loads *****/
-
-static forceinline u16
-get_unaligned_le16(const u8 *p)
-{
-        if (UNALIGNED_ACCESS_IS_FAST)
-                return le16_bswap(load_u16_unaligned(p));
-        else
-                return ((u16)p[1] << 8) | p[0];
-}
-
-static forceinline u16
-get_unaligned_be16(const u8 *p)
-{
-        if (UNALIGNED_ACCESS_IS_FAST)
-                return be16_bswap(load_u16_unaligned(p));
-        else
-                return ((u16)p[0] << 8) | p[1];
-}
-
-static forceinline u32
-get_unaligned_le32(const u8 *p)
-{
-        if (UNALIGNED_ACCESS_IS_FAST)
-                return le32_bswap(load_u32_unaligned(p));
-        else
-                return ((u32)p[3] << 24) | ((u32)p[2] << 16) |
-                       ((u32)p[1] << 8) | p[0];
-}
-
-static forceinline u32
-get_unaligned_be32(const u8 *p)
-{
-        if (UNALIGNED_ACCESS_IS_FAST)
-                return be32_bswap(load_u32_unaligned(p));
-        else
-                return ((u32)p[0] << 24) | ((u32)p[1] << 16) |
-                       ((u32)p[2] << 8) | p[3];
-}
-
-static forceinline u64
-get_unaligned_le64(const u8 *p)
-{
-        if (UNALIGNED_ACCESS_IS_FAST)
-                return le64_bswap(load_u64_unaligned(p));
-        else
-                return ((u64)p[7] << 56) | ((u64)p[6] << 48) |
-                       ((u64)p[5] << 40) | ((u64)p[4] << 32) |
-                       ((u64)p[3] << 24) | ((u64)p[2] << 16) |
-                       ((u64)p[1] << 8) | p[0];
-}
-
-static forceinline machine_word_t
-get_unaligned_leword(const u8 *p)
-{
-        STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
-        if (WORDBITS == 32)
-                return get_unaligned_le32(p);
-        else
-                return get_unaligned_le64(p);
-}
-
-/***** Unaligned stores *****/
-
-static forceinline void
-put_unaligned_le16(u16 v, u8 *p)
-{
-        if (UNALIGNED_ACCESS_IS_FAST) {
-                store_u16_unaligned(le16_bswap(v), p);
-        } else {
-                p[0] = (u8)(v >> 0);
-                p[1] = (u8)(v >> 8);
-        }
-}
-
-static forceinline void
-put_unaligned_be16(u16 v, u8 *p)
-{
-        if (UNALIGNED_ACCESS_IS_FAST) {
-                store_u16_unaligned(be16_bswap(v), p);
-        } else {
-                p[0] = (u8)(v >> 8);
-                p[1] = (u8)(v >> 0);
-        }
-}
-
-static forceinline void
-put_unaligned_le32(u32 v, u8 *p)
-{
-        if (UNALIGNED_ACCESS_IS_FAST) {
-                store_u32_unaligned(le32_bswap(v), p);
-        } else {
-                p[0] = (u8)(v >> 0);
-                p[1] = (u8)(v >> 8);
-                p[2] = (u8)(v >> 16);
-                p[3] = (u8)(v >> 24);
-        }
-}
-
-static forceinline void
-put_unaligned_be32(u32 v, u8 *p)
-{
-        if (UNALIGNED_ACCESS_IS_FAST) {
-                store_u32_unaligned(be32_bswap(v), p);
-        } else {
-                p[0] = (u8)(v >> 24);
-                p[1] = (u8)(v >> 16);
-                p[2] = (u8)(v >> 8);
-                p[3] = (u8)(v >> 0);
-        }
-}
-
-static forceinline void
-put_unaligned_le64(u64 v, u8 *p)
-{
-        if (UNALIGNED_ACCESS_IS_FAST) {
-                store_u64_unaligned(le64_bswap(v), p);
-        } else {
-                p[0] = (u8)(v >> 0);
-                p[1] = (u8)(v >> 8);
-                p[2] = (u8)(v >> 16);
-                p[3] = (u8)(v >> 24);
-                p[4] = (u8)(v >> 32);
-                p[5] = (u8)(v >> 40);
-                p[6] = (u8)(v >> 48);
-                p[7] = (u8)(v >> 56);
-        }
-}
-
-static forceinline void
-put_unaligned_leword(machine_word_t v, u8 *p)
-{
-        STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
-        if (WORDBITS == 32)
-                put_unaligned_le32(v, p);
-        else
-                put_unaligned_le64(v, p);
-}
-
-/***** 24-bit loads *****/
-
-/*
- * Given a 32-bit value that was loaded with the platform's native endianness,
- * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24
- * bits contain the first 3 bytes, arranged in octets in a platform-dependent
- * order, at the memory location from which the input 32-bit value was loaded.
- */
-static forceinline u32
-loaded_u32_to_u24(u32 v)
-{
-        if (CPU_IS_LITTLE_ENDIAN())
-                return v & 0xFFFFFF;
-        else
-                return v >> 8;
-}
-
-/*
- * Load the next 3 bytes from the memory location @p into the 24 low-order bits
- * of a 32-bit value. The order in which the 3 bytes will be arranged as octets
- * in the 24 bits is platform-dependent. At least LOAD_U24_REQUIRED_NBYTES
- * bytes must be available at @p; note that this may be more than 3.
- */
-static forceinline u32
-load_u24_unaligned(const u8 *p)
-{
-#if UNALIGNED_ACCESS_IS_FAST
-#  define LOAD_U24_REQUIRED_NBYTES 4
-        return loaded_u32_to_u24(load_u32_unaligned(p));
-#else
-#  define LOAD_U24_REQUIRED_NBYTES 3
-        if (CPU_IS_LITTLE_ENDIAN())
-                return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16);
-        else
-                return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16);
-#endif
-}
-
-#endif /* LIB_UNALIGNED_H */
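These unaligned word loads are what make the word-at-a-time comparison in lz_extend() (matchfinder_common.h above) possible. A standalone sketch of that trick, using memcpy for a well-defined unaligned load and a GCC/Clang builtin in place of bsfw() (demo names are hypothetical):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Count matching leading bytes of two 8-byte regions: XOR the words;
     * on little-endian, the trailing zero bits of the difference locate
     * the first mismatching byte. */
    static unsigned matching_prefix(const uint8_t *a, const uint8_t *b)
    {
            uint64_t wa, wb, diff;

            memcpy(&wa, a, sizeof(wa));  /* unaligned-safe load */
            memcpy(&wb, b, sizeof(wb));
            diff = wa ^ wb;
            if (diff == 0)
                    return 8;            /* all 8 bytes equal */
            return (unsigned)__builtin_ctzll(diff) >> 3;
    }

    int main(void)
    {
            const uint8_t s1[] = "abcdefgh";
            const uint8_t s2[] = "abcdXfgh";
            printf("%u\n", matching_prefix(s1, s2));  /* prints 4 on little-endian */
            return 0;
    }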