sjpeg 0.1.0
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +40 -0
- data/LICENSE.txt +21 -0
- data/README.md +40 -0
- data/Rakefile +11 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/ext/sjpeg/bit_writer.cc +122 -0
- data/ext/sjpeg/bit_writer.h +169 -0
- data/ext/sjpeg/colors_rgb.cc +691 -0
- data/ext/sjpeg/dichotomy.cc +290 -0
- data/ext/sjpeg/enc.cc +2132 -0
- data/ext/sjpeg/extconf.rb +3 -0
- data/ext/sjpeg/fdct.cc +627 -0
- data/ext/sjpeg/headers.cc +218 -0
- data/ext/sjpeg/jpeg_tools.cc +274 -0
- data/ext/sjpeg/libsjpeg.pc.in +11 -0
- data/ext/sjpeg/score_7.cc +6220 -0
- data/ext/sjpeg/sjpeg.h +353 -0
- data/ext/sjpeg/sjpegi.h +427 -0
- data/ext/sjpeg/yuv_convert.cc +698 -0
- data/lib/sjpeg/version.rb +3 -0
- data/lib/sjpeg.rb +35 -0
- data/sjpeg.gemspec +36 -0
- metadata +143 -0
data/ext/sjpeg/fdct.cc
ADDED
@@ -0,0 +1,627 @@
// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// forward DCT
//
// fdct output is kept scaled by 16, to retain maximum 16bit precision
//
// Author: Skal (pascal.massimino@gmail.com)

#define SJPEG_NEED_ASM_HEADERS
#include "sjpegi.h"

namespace sjpeg {

///////////////////////////////////////////////////////////////////////////////
// Cosine table: C(k) = cos(k.pi/16)/sqrt(2), k = 1..7 using 15 bits signed
const int16_t kTable04[7] = { 22725, 21407, 19266, 16384, 12873, 8867, 4520 };
// rows #1 and #7 are pre-multiplied by 2.C(1) before the 2nd pass.
// This multiply is merged in the table of constants used during the 1st pass:
const int16_t kTable17[7] = { 31521, 29692, 26722, 22725, 17855, 12299, 6270 };
// rows #2 and #6 are pre-multiplied by 2.C(2):
const int16_t kTable26[7] = { 29692, 27969, 25172, 21407, 16819, 11585, 5906 };
// rows #3 and #5 are pre-multiplied by 2.C(3):
const int16_t kTable35[7] = { 26722, 25172, 22654, 19266, 15137, 10426, 5315 };
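// (The entries are round(cos(k.pi/16)/sqrt(2) * 32768): for instance
// kTable04[0] = round(0.980785 / 1.414214 * 32768) = 22725 and
// kTable04[3] = round(0.5 * 32768) = 16384. The pre-multiplied tables are the
// same values scaled by 2.C(1) = 1.38704, 2.C(2) = 1.30656 and
// 2.C(3) = 1.17588, e.g. kTable17[3] = round(16384 * 1.38704) = 22725.)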

///////////////////////////////////////////////////////////////////////////////
// Constants and C/SSE2 macros for the DCT vertical pass

#define kTan1   (13036)   // = tan(pi/16)
#define kTan2   (27146)   // = tan(2.pi/16) = sqrt(2) - 1.
#define kTan3m1 (-21746)  // = tan(3.pi/16) - 1
#define k2Sqrt2 (23170)   // = 1 / 2.sqrt(2)
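// (These are Q16 fixed-point values, consumed by the MULT macros below,
// which all reduce to (a * b) >> 16:
//   kTan1   = round(tan(pi/16) * 65536)       = round(0.198912 * 65536) = 13036
//   kTan2   = round((sqrt(2) - 1) * 65536)    = round(0.414214 * 65536) = 27146
//   kTan3m1 = round((tan(3.pi/16) - 1) * 65536) = -21746
//   k2Sqrt2 = round(65536 / (2 * sqrt(2)))    = 23170.)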

// performs: {a,b} <- {a-b, a+b}, without saturation
#define BUTTERFLY(a, b) do { \
  SUB((a), (b));             \
  ADD((b), (b));             \
  ADD((b), (a));             \
} while (0)
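// (Trace, for a = 3 and b = 2: SUB gives a = 1, the first ADD gives b = 4,
// the second ADD gives b = 4 + 1 = 5, i.e. {a,b} = {3-2, 3+2}, computed
// in place with no temporary.)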

///////////////////////////////////////////////////////////////////////////////
// Constants for DCT horizontal pass

// Note about the CORRECT_LSB macro:
// using 16bit fixed-point constants, we often compute products like:
// p = (A*x + B*y + 32768) >> 16 by adding two sub-terms q = (A*x) >> 16
// and r = (B*y) >> 16 together. Statistically, we have p = q + r + 1
// in 3/4 of the cases. This can be easily seen from the relation:
//    (a + b + 1) >> 1 = (a >> 1) + (b >> 1) + ((a|b)&1)
// The approximation we are doing is replacing ((a|b)&1) by 1.
// In practice, this is slightly more involved because the constants A and B
// have also been rounded compared to their exact floating point value.
// However, all in all the correction is quite small, and CORRECT_LSB can
// be defined empty if needed.
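// (Example of the relation above: for a = 5, b = 6 we get
// (5 + 6 + 1) >> 1 = 6 and (5 >> 1) + (6 >> 1) + ((5|6)&1) = 2 + 3 + 1 = 6;
// for a = 4, b = 6 the correction bit is 0: (4 + 6 + 1) >> 1 = 5 and
// (4 >> 1) + (6 >> 1) = 5. The bit is 1 whenever a or b is odd, i.e. in
// 3 out of 4 cases.)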

#define COLUMN_DCT8(in) do { \
  LOAD(m0, (in)[0 * 8]);     \
  LOAD(m2, (in)[2 * 8]);     \
  LOAD(m7, (in)[7 * 8]);     \
  LOAD(m5, (in)[5 * 8]);     \
                             \
  BUTTERFLY(m0, m7);         \
  BUTTERFLY(m2, m5);         \
                             \
  LOAD(m3, (in)[3 * 8]);     \
  LOAD(m4, (in)[4 * 8]);     \
  BUTTERFLY(m3, m4);         \
                             \
  LOAD(m6, (in)[6 * 8]);     \
  LOAD(m1, (in)[1 * 8]);     \
  BUTTERFLY(m1, m6);         \
  BUTTERFLY(m7, m4);         \
  BUTTERFLY(m6, m5);         \
                             \
  /* RowDct() needs 15bits fixed-point input, while the output from   */ \
  /* ColumnDct() would be 12bits. We are better off doing the shift   */ \
  /* by 3 now instead of in RowDct(), because we have some multiplies */ \
  /* to perform, that can take advantage of the extra 3bits precision.*/ \
  LSHIFT(m4, 3);             \
  LSHIFT(m5, 3);             \
  BUTTERFLY(m4, m5);         \
  STORE16((in)[0 * 8], m5);  \
  STORE16((in)[4 * 8], m4);  \
                             \
  LSHIFT(m7, 3);             \
  LSHIFT(m6, 3);             \
  LSHIFT(m3, 3);             \
  LSHIFT(m0, 3);             \
                             \
  LOAD_CST(m4, kTan2);       \
  m5 = m4;                   \
  MULT(m4, m7);              \
  MULT(m5, m6);              \
  SUB(m4, m6);               \
  ADD(m5, m7);               \
  STORE16((in)[2 * 8], m5);  \
  STORE16((in)[6 * 8], m4);  \
                             \
  /* We should be multiplying m6 by C4 = 1/sqrt(2) here, but we only have */ \
  /* the k2Sqrt2 = 1/(2.sqrt(2)) constant that fits into 15bits. So we    */ \
  /* shift by 4 instead of 3 to compensate for the additional 1/2 factor. */ \
  LOAD_CST(m6, k2Sqrt2);     \
  LSHIFT(m2, 3 + 1);         \
  LSHIFT(m1, 3 + 1);         \
  BUTTERFLY(m1, m2);         \
  MULT(m2, m6);              \
  MULT(m1, m6);              \
  BUTTERFLY(m3, m1);         \
  BUTTERFLY(m0, m2);         \
                             \
  LOAD_CST(m4, kTan3m1);     \
  LOAD_CST(m5, kTan1);       \
  m7 = m3;                   \
  m6 = m1;                   \
  MULT(m3, m4);              \
  MULT(m1, m5);              \
                             \
  ADD(m3, m7);               \
  ADD(m1, m2);               \
  CORRECT_LSB(m1);           \
  CORRECT_LSB(m3);           \
  MULT(m4, m0);              \
  MULT(m5, m2);              \
  ADD(m4, m0);               \
  SUB(m0, m3);               \
  ADD(m7, m4);               \
  SUB(m5, m6);               \
                             \
  STORE16((in)[1 * 8], m1);  \
  STORE16((in)[3 * 8], m0);  \
  STORE16((in)[5 * 8], m7);  \
  STORE16((in)[7 * 8], m5);  \
} while (0)

///////////////////////////////////////////////////////////////////////////////
// Plain-C implementation, bit-wise equivalent to the SSE2 version

// these are the macros required by COLUMN_*
#define LOAD_CST(dst, src) (dst) = (src)
#define LOAD(dst, src) (dst) = (src)
#define MULT(a, b) (a) = (((a) * (b)) >> 16)
#define ADD(a, b) (a) = (a) + (b)
#define SUB(a, b) (a) = (a) - (b)
#define LSHIFT(a, n) (a) = ((a) << (n))
#define STORE16(a, b) (a) = (b)
#define CORRECT_LSB(a) (a) += 1

// DCT vertical pass

void ColumnDct(int16_t* in) {
  for (int i = 0; i < 8; ++i) {
    int32_t m0, m1, m2, m3, m4, m5, m6, m7;
    COLUMN_DCT8(in + i);
  }
}

// DCT horizontal pass

// We don't really need to round before descaling, since we
// still have 4 bits of precision left as final scaled output.
#define DESCALE(a) static_cast<int16_t>((a) >> 16)

static void RowDct(int16_t* in, const int16_t* table) {
  // The Fourier transform is a unitary operator, so we're basically
  // doing the transpose of RowIdct()
  const int a0 = in[0] + in[7];
  const int b0 = in[0] - in[7];
  const int a1 = in[1] + in[6];
  const int b1 = in[1] - in[6];
  const int a2 = in[2] + in[5];
  const int b2 = in[2] - in[5];
  const int a3 = in[3] + in[4];
  const int b3 = in[3] - in[4];

  // even part
  const int C2 = table[1];
  const int C4 = table[3];
  const int C6 = table[5];
  const int c0 = a0 + a3;
  const int c1 = a0 - a3;
  const int c2 = a1 + a2;
  const int c3 = a1 - a2;

  in[0] = DESCALE(C4 * (c0 + c2));
  in[4] = DESCALE(C4 * (c0 - c2));
  in[2] = DESCALE(C2 * c1 + C6 * c3);
  in[6] = DESCALE(C6 * c1 - C2 * c3);

  // odd part
  const int C1 = table[0];
  const int C3 = table[2];
  const int C5 = table[4];
  const int C7 = table[6];
  in[1] = DESCALE(C1 * b0 + C3 * b1 + C5 * b2 + C7 * b3);
  in[3] = DESCALE(C3 * b0 - C7 * b1 - C1 * b2 - C5 * b3);
  in[5] = DESCALE(C5 * b0 - C1 * b1 + C7 * b2 + C3 * b3);
  in[7] = DESCALE(C7 * b0 - C5 * b1 + C3 * b2 - C1 * b3);
}
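// (This is a straight 8-point DCT-II evaluated with the fixed-point tables:
// e.g. in[1] = sum over n = 0..3 of (x[n] - x[7-n]).C(2n+1), which works
// because cos((2n+1).k.pi/16) is antisymmetric under n -> 7-n for odd k.)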
#undef DESCALE

#undef LOAD_CST
#undef LOAD
#undef MULT
#undef ADD
#undef SUB
#undef LSHIFT
#undef STORE16
#undef CORRECT_LSB

///////////////////////////////////////////////////////////////////////////////
// SSE2 implementation

#if defined(SJPEG_USE_SSE2)

// Tables and macros

#define CST(v) { { v, v, v, v, v, v, v, v } }
static const union {
  const int16_t s[8];
  const __m128i m;
} CST_kTan1 = CST(kTan1),
  CST_kTan2 = CST(kTan2),
  CST_kTan3m1 = CST(kTan3m1),
  CST_k2Sqrt2 = CST(k2Sqrt2),
  CST_kfRounder1 = CST(1);  // rounders for fdct
#undef CST

static const union {
  const uint16_t s[4 * 8];
  const __m128i m[4];
} kfTables_SSE2[4] = {
  // Tables for fdct, roughly the transpose of the above, shuffled
  { { 0x4000, 0x4000, 0x58c5, 0x4b42, 0xdd5d, 0xac61, 0xa73b, 0xcdb7,
      0x4000, 0x4000, 0x3249, 0x11a8, 0x539f, 0x22a3, 0x4b42, 0xee58,
      0x4000, 0xc000, 0x3249, 0xa73b, 0x539f, 0xdd5d, 0x4b42, 0xa73b,
      0xc000, 0x4000, 0x11a8, 0x4b42, 0x22a3, 0xac61, 0x11a8, 0xcdb7 } },
  { { 0x58c5, 0x58c5, 0x7b21, 0x6862, 0xcff5, 0x8c04, 0x84df, 0xba41,
      0x58c5, 0x58c5, 0x45bf, 0x187e, 0x73fc, 0x300b, 0x6862, 0xe782,
      0x58c5, 0xa73b, 0x45bf, 0x84df, 0x73fc, 0xcff5, 0x6862, 0x84df,
      0xa73b, 0x58c5, 0x187e, 0x6862, 0x300b, 0x8c04, 0x187e, 0xba41 } },
  { { 0x539f, 0x539f, 0x73fc, 0x6254, 0xd2bf, 0x92bf, 0x8c04, 0xbe4d,
      0x539f, 0x539f, 0x41b3, 0x1712, 0x6d41, 0x2d41, 0x6254, 0xe8ee,
      0x539f, 0xac61, 0x41b3, 0x8c04, 0x6d41, 0xd2bf, 0x6254, 0x8c04,
      0xac61, 0x539f, 0x1712, 0x6254, 0x2d41, 0x92bf, 0x1712, 0xbe4d } },
  { { 0x4b42, 0x4b42, 0x6862, 0x587e, 0xd746, 0x9dac, 0x979e, 0xc4df,
      0x4b42, 0x4b42, 0x3b21, 0x14c3, 0x6254, 0x28ba, 0x587e, 0xeb3d,
      0x4b42, 0xb4be, 0x3b21, 0x979e, 0x6254, 0xd746, 0x587e, 0x979e,
      0xb4be, 0x4b42, 0x14c3, 0x587e, 0x28ba, 0x9dac, 0x14c3, 0xc4df } } };

#define LOAD_CST(x, y) (x) = (CST_ ## y).m
#define LOAD(x, y) \
  (x) = _mm_load_si128(reinterpret_cast<const __m128i*>(&(y)))
#define MULT(x, y) (x) = _mm_mulhi_epi16((x), (y))
#define ADD(x, y) (x) = _mm_add_epi16((x), (y))
#define SUB(x, y) (x) = _mm_sub_epi16((x), (y))
#define LSHIFT(x, n) (x) = _mm_slli_epi16((x), (n))
#define STORE16(a, b) _mm_store_si128(reinterpret_cast<__m128i*>(&(a)), (b))
#define CORRECT_LSB(a) (a) = _mm_adds_epi16((a), CST_kfRounder1.m)

// DCT vertical pass

static void ColumnDct_SSE2(int16_t* in) {
  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
  COLUMN_DCT8(in);
}
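// (Unlike the C version, no loop is needed here: each __m128i register holds
// one 16-bit coefficient per column, so a single COLUMN_DCT8 pass transforms
// all eight columns of the block at once.)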

// DCT horizontal pass

static void RowDct_SSE2(int16_t* in, const __m128i* table1,
                        const __m128i* table2) {
  // load row [0123|4567] as [0123|7654]
  __m128i m0 =
      _mm_shufflehi_epi16(*reinterpret_cast<__m128i*>(in + 0 * 8), 0x1b);
  __m128i m2 =
      _mm_shufflehi_epi16(*reinterpret_cast<__m128i*>(in + 1 * 8), 0x1b);

  // we process two rows in parallel
  __m128i m4 = m0;
  // => x0 x1 x2 x3 | x0' x1' x2' x3'
  m0 = (__m128i)_mm_shuffle_ps((__m128)m0, (__m128)m2, 0x44);
  // => x7 x6 x5 x4 | x7' x6' x5' x4'
  m4 = (__m128i)_mm_shuffle_ps((__m128)m4, (__m128)m2, 0xee);

  // initial butterfly
  m2 = m0;
  m0 = _mm_add_epi16(m0, m4);  // a0=x0+x7 | a1=x1+x6 | a2=x2+x5 | a3=x3+x4
  m2 = _mm_sub_epi16(m2, m4);  // b0=x0-x7 | b1=x1-x6 | b2=x2-x5 | b3=x3-x4

  // prepare for scalar products which are performed using four madd_epi16
  __m128i m6;
  m4 = m0;
  m0 = _mm_unpacklo_epi32(m0, m2);   // a0 a1 | b0 b1 | a2 a3 | b2 b3
  m4 = _mm_unpackhi_epi32(m4, m2);
  m2 = _mm_shuffle_epi32(m0, 0x4e);  // a2 a3 | b2 b3 | a0 a1 | b0 b1
  m6 = _mm_shuffle_epi32(m4, 0x4e);

  __m128i m1, m3, m5, m7;
  m1 = _mm_madd_epi16(m2, table1[1]);
  m3 = _mm_madd_epi16(m0, table1[2]);
  m5 = _mm_madd_epi16(m6, table2[1]);
  m7 = _mm_madd_epi16(m4, table2[2]);

  m2 = _mm_madd_epi16(m2, table1[3]);
  m0 = _mm_madd_epi16(m0, table1[0]);
  m6 = _mm_madd_epi16(m6, table2[3]);
  m4 = _mm_madd_epi16(m4, table2[0]);

  // add the sub-terms
  m0 = _mm_add_epi32(m0, m1);
  m4 = _mm_add_epi32(m4, m5);
  m2 = _mm_add_epi32(m2, m3);
  m6 = _mm_add_epi32(m6, m7);

  // descale
  m0 = _mm_srai_epi32(m0, 16);
  m4 = _mm_srai_epi32(m4, 16);
  m2 = _mm_srai_epi32(m2, 16);
  m6 = _mm_srai_epi32(m6, 16);

  m0 = _mm_packs_epi32(m0, m2);
  m4 = _mm_packs_epi32(m4, m6);

  _mm_store_si128(reinterpret_cast<__m128i*>(in + 0 * 8), m0);
  _mm_store_si128(reinterpret_cast<__m128i*>(in + 1 * 8), m4);
}

#undef LOAD_CST
#undef LOAD
#undef MULT
#undef ADD
#undef SUB
#undef LSHIFT
#undef STORE16
#undef CORRECT_LSB

#endif  // SJPEG_USE_SSE2

// done with the macros

#undef BUTTERFLY
#undef COLUMN_DCT8

///////////////////////////////////////////////////////////////////////////////
// NEON implementation

#if defined(SJPEG_USE_NEON)

// multiply by scalar
#define MULT(A, kC) (vqdmulhq_n_s16((A), (kC) >> 1))
// V0 = r0 - r1, V1 = r0 + r1
#define BUTTERFLY(V0, V1, r0, r1)              \
  const int16x8_t V0 = vsubq_s16((r0), (r1));  \
  const int16x8_t V1 = vaddq_s16((r0), (r1))

// collect the 16b hi-words of 32bit words into a packed 16b one
static int16x8_t PackS32(const int32x4_t lo, const int32x4_t hi) {
  return vuzpq_s16(vreinterpretq_s16_s32(lo),
                   vreinterpretq_s16_s32(hi)).val[1];
}
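// (Keeping the odd-numbered 16-bit lanes of little-endian 32-bit words keeps
// their high halves, so PackS32 is the NEON counterpart of the ">> 16"
// descaling step used elsewhere.)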

#define MULT_DCL_32(LO, HI, A, CST)                              \
  int32x4_t LO = vmull_s16(vget_low_s16(A), vget_low_s16(CST));  \
  int32x4_t HI = vmull_s16(vget_high_s16(A), vget_high_s16(CST))
#define MULT_ADD_32(LO, HI, A, CST) do {                     \
  LO = vmlal_s16(LO, vget_low_s16(A), vget_low_s16(CST));    \
  HI = vmlal_s16(HI, vget_high_s16(A), vget_high_s16(CST));  \
} while (0)
#define MULT_SUB_32(LO, HI, A, CST) do {                     \
  LO = vmlsl_s16(LO, vget_low_s16(A), vget_low_s16(CST));    \
  HI = vmlsl_s16(HI, vget_high_s16(A), vget_high_s16(CST));  \
} while (0)

#define MK_TABLE_CST(A, B, C, D) { (A), (B), (C), (D), (A), (D), (C), (B) }

// s64 transposing helper:
//   *out0 = lo(v0) | lo(v1)
//   *out1 = hi(v0) | hi(v1)
static void vtrn_s64(const int32x4_t v0, const int32x4_t v1,
                     int16x8_t* out0, int16x8_t* out1) {
  *out0 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(vget_low_s32(v0)),
                   vreinterpret_s64_s32(vget_low_s32(v1))));
  *out1 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(vget_high_s32(v0)),
                   vreinterpret_s64_s32(vget_high_s32(v1))));
}

void Transpose8x8(int16x8_t* const A0, int16x8_t* const A1,
                  int16x8_t* const A2, int16x8_t* const A3,
                  int16x8_t* const A4, int16x8_t* const A5,
                  int16x8_t* const A6, int16x8_t* const A7) {
  const int16x8x2_t row01 = vtrnq_s16(*A0, *A1);
  const int16x8x2_t row23 = vtrnq_s16(*A2, *A3);
  const int16x8x2_t row45 = vtrnq_s16(*A4, *A5);
  const int16x8x2_t row67 = vtrnq_s16(*A6, *A7);

  const int32x4x2_t row02 = vtrnq_s32(vreinterpretq_s32_s16(row01.val[0]),
                                      vreinterpretq_s32_s16(row23.val[0]));
  const int32x4x2_t row13 = vtrnq_s32(vreinterpretq_s32_s16(row01.val[1]),
                                      vreinterpretq_s32_s16(row23.val[1]));
  const int32x4x2_t row46 = vtrnq_s32(vreinterpretq_s32_s16(row45.val[0]),
                                      vreinterpretq_s32_s16(row67.val[0]));
  const int32x4x2_t row57 = vtrnq_s32(vreinterpretq_s32_s16(row45.val[1]),
                                      vreinterpretq_s32_s16(row67.val[1]));

  vtrn_s64(row02.val[0], row46.val[0], A0, A4);
  vtrn_s64(row02.val[1], row46.val[1], A2, A6);
  vtrn_s64(row13.val[0], row57.val[0], A1, A5);
  vtrn_s64(row13.val[1], row57.val[1], A3, A7);
}
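// (This is the usual three-stage 8x8 transpose: vtrnq_s16 interleaves 16-bit
// neighbors, vtrnq_s32 swaps 32-bit pairs, and vtrn_s64 exchanges the 64-bit
// halves, i.e. log2(8) = 3 interleaving rounds in total.)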

static void Dct_NEON(int16_t* in) {
  ////////////////////
  // vertical pass
  ////////////////////
  const int16x8_t m0 = vld1q_s16(in + 0 * 8);
  const int16x8_t m1 = vld1q_s16(in + 1 * 8);
  const int16x8_t m2 = vld1q_s16(in + 2 * 8);
  const int16x8_t m3 = vld1q_s16(in + 3 * 8);
  const int16x8_t m4 = vld1q_s16(in + 4 * 8);
  const int16x8_t m5 = vld1q_s16(in + 5 * 8);
  const int16x8_t m6 = vld1q_s16(in + 6 * 8);
  const int16x8_t m7 = vld1q_s16(in + 7 * 8);

  BUTTERFLY(A0, A7, m0, m7);
  BUTTERFLY(A2, A5, m2, m5);
  BUTTERFLY(A3, A4, m3, m4);
  BUTTERFLY(A1, A6, m1, m6);

  BUTTERFLY(B7, B4, A7, A4);
  BUTTERFLY(B6, B5, A6, A5);

  // see comment in COLUMN_DCT8
  const int16x8_t C4 = vshlq_n_s16(B4, 3);
  const int16x8_t C5 = vshlq_n_s16(B5, 3);
  const int16x8_t C7 = vshlq_n_s16(B7, 3);
  const int16x8_t C6 = vshlq_n_s16(B6, 3);
  const int16x8_t C3 = vshlq_n_s16(A3, 3);
  const int16x8_t C0 = vshlq_n_s16(A0, 3);

  // BUTTERFLY(tmp4, tmp0, C4, C5)
  int16x8_t tmp0 = vaddq_s16(C4, C5);
  int16x8_t tmp4 = vsubq_s16(C4, C5);
  int16x8_t tmp6 = vsubq_s16(MULT(C7, kTan2), C6);
  int16x8_t tmp2 = vaddq_s16(MULT(C6, kTan2), C7);

  // see comment in COLUMN_DCT8
  const int16x8_t E2 = vshlq_n_s16(A2, 3 + 1);
  const int16x8_t E1 = vshlq_n_s16(A1, 3 + 1);
  BUTTERFLY(F1, F2, E1, E2);
  const int16x8_t G2 = MULT(F2, k2Sqrt2);
  const int16x8_t G1 = MULT(F1, k2Sqrt2);
  BUTTERFLY(H3, H1, C3, G1);
  BUTTERFLY(H0, H2, C0, G2);

  const int16x8_t G3 = vaddq_s16(MULT(H3, kTan3m1), H3);
  const int16x8_t G6 = vaddq_s16(MULT(H1, kTan1), H2);

  // CORRECT_LSB
  const int16x8_t kOne = vdupq_n_s16(1);
  const int16x8_t I3 = vaddq_s16(G3, kOne);
  const int16x8_t G4 = vaddq_s16(MULT(H0, kTan3m1), H0);

  int16x8_t tmp1 = vaddq_s16(G6, kOne);
  int16x8_t tmp3 = vsubq_s16(H0, I3);
  int16x8_t tmp5 = vaddq_s16(H3, G4);
  int16x8_t tmp7 = vsubq_s16(MULT(H2, kTan1), H1);

  Transpose8x8(&tmp0, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &tmp6, &tmp7);

  ////////////////////
  // Horizontal pass
  ////////////////////
  BUTTERFLY(b0, a0, tmp0, tmp7);
  BUTTERFLY(b1, a1, tmp1, tmp6);
  BUTTERFLY(b2, a2, tmp2, tmp5);
  BUTTERFLY(b3, a3, tmp3, tmp4);

  BUTTERFLY(c1, c0, a0, a3);
  BUTTERFLY(c3, c2, a1, a2);

  const int16_t kTable0[] = MK_TABLE_CST(22725, 31521, 29692, 26722);
  const int16_t kTable1[] = MK_TABLE_CST(21407, 29692, 27969, 25172);
  const int16_t kTable2[] = MK_TABLE_CST(19266, 26722, 25172, 22654);
  const int16_t kTable3[] = MK_TABLE_CST(16384, 22725, 21407, 19266);
  const int16_t kTable4[] = MK_TABLE_CST(12873, 17855, 16819, 15137);
  const int16_t kTable5[] = MK_TABLE_CST(8867, 12299, 11585, 10426);
  const int16_t kTable6[] = MK_TABLE_CST(4520, 6270, 5906, 5315);

  // even part
  const int16x8_t kC2 = vld1q_s16(kTable1);
  const int16x8_t kC4 = vld1q_s16(kTable3);
  const int16x8_t kC6 = vld1q_s16(kTable5);

  MULT_DCL_32(out0_lo, out0_hi, c0, kC4);
  MULT_DCL_32(out4_lo, out4_hi, c0, kC4);
  MULT_ADD_32(out0_lo, out0_hi, c2, kC4);
  MULT_SUB_32(out4_lo, out4_hi, c2, kC4);
  MULT_DCL_32(out2_lo, out2_hi, c1, kC2);
  MULT_DCL_32(out6_lo, out6_hi, c1, kC6);
  MULT_ADD_32(out2_lo, out2_hi, c3, kC6);
  MULT_SUB_32(out6_lo, out6_hi, c3, kC2);

  int16x8_t out0 = PackS32(out0_lo, out0_hi);
  int16x8_t out4 = PackS32(out4_lo, out4_hi);
  int16x8_t out2 = PackS32(out2_lo, out2_hi);
  int16x8_t out6 = PackS32(out6_lo, out6_hi);

  // odd part
  const int16x8_t kC1 = vld1q_s16(kTable0);
  const int16x8_t kC3 = vld1q_s16(kTable2);
  const int16x8_t kC5 = vld1q_s16(kTable4);
  const int16x8_t kC7 = vld1q_s16(kTable6);

  MULT_DCL_32(out1_lo, out1_hi, b0, kC1);
  MULT_DCL_32(out3_lo, out3_hi, b0, kC3);
  MULT_DCL_32(out5_lo, out5_hi, b0, kC5);
  MULT_DCL_32(out7_lo, out7_hi, b0, kC7);

  MULT_ADD_32(out1_lo, out1_hi, b1, kC3);
  MULT_SUB_32(out3_lo, out3_hi, b1, kC7);
  MULT_SUB_32(out5_lo, out5_hi, b1, kC1);
  MULT_SUB_32(out7_lo, out7_hi, b1, kC5);

  MULT_ADD_32(out1_lo, out1_hi, b2, kC5);
  MULT_SUB_32(out3_lo, out3_hi, b2, kC1);
  MULT_ADD_32(out5_lo, out5_hi, b2, kC7);
  MULT_ADD_32(out7_lo, out7_hi, b2, kC3);

  MULT_ADD_32(out1_lo, out1_hi, b3, kC7);
  MULT_SUB_32(out3_lo, out3_hi, b3, kC5);
  MULT_ADD_32(out5_lo, out5_hi, b3, kC3);
  MULT_SUB_32(out7_lo, out7_hi, b3, kC1);

  int16x8_t out1 = PackS32(out1_lo, out1_hi);
  int16x8_t out3 = PackS32(out3_lo, out3_hi);
  int16x8_t out5 = PackS32(out5_lo, out5_hi);
  int16x8_t out7 = PackS32(out7_lo, out7_hi);

  // final transpose
  Transpose8x8(&out0, &out1, &out2, &out3, &out4, &out5, &out6, &out7);

  // and storage.
  vst1q_s16(&in[0 * 8], out0);
  vst1q_s16(&in[1 * 8], out1);
  vst1q_s16(&in[2 * 8], out2);
  vst1q_s16(&in[3 * 8], out3);
  vst1q_s16(&in[4 * 8], out4);
  vst1q_s16(&in[5 * 8], out5);
  vst1q_s16(&in[6 * 8], out6);
  vst1q_s16(&in[7 * 8], out7);
}

static void FdctNEON(int16_t* coeffs, int num_blocks) {
  while (num_blocks-- > 0) {
    Dct_NEON(coeffs);
    coeffs += 64;
  }
}
#undef MULT
#undef BUTTERFLY
#undef MULT_DCL_32
#undef MULT_ADD_32
#undef MULT_SUB_32
#undef MK_TABLE_CST

#endif  // SJPEG_USE_NEON

#undef kTan1
#undef kTan2
#undef kTan3m1
#undef k2Sqrt2

///////////////////////////////////////////////////////////////////////////////
// visible FDCT callable functions

static void FdctC(int16_t* coeffs, int num_blocks) {
  while (num_blocks-- > 0) {
    ColumnDct(coeffs);
    RowDct(coeffs + 0 * 8, kTable04);
    RowDct(coeffs + 1 * 8, kTable17);
    RowDct(coeffs + 2 * 8, kTable26);
    RowDct(coeffs + 3 * 8, kTable35);
    RowDct(coeffs + 4 * 8, kTable04);
    RowDct(coeffs + 5 * 8, kTable35);
    RowDct(coeffs + 6 * 8, kTable26);
    RowDct(coeffs + 7 * 8, kTable17);
    coeffs += 64;
  }
}
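// (The table order mirrors the pre-multiplications listed at the top of the
// file: rows 0 and 4 use the plain kTable04, rows 1/7 the 2.C(1)-scaled
// kTable17, rows 2/6 kTable26, and rows 3/5 kTable35.)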

#if defined(SJPEG_USE_SSE2)
static void FdctSSE2(int16_t* coeffs, int num_blocks) {
  while (num_blocks-- > 0) {
    ColumnDct_SSE2(coeffs);
    RowDct_SSE2(coeffs + 0 * 8, kfTables_SSE2[0].m, kfTables_SSE2[1].m);
    RowDct_SSE2(coeffs + 2 * 8, kfTables_SSE2[2].m, kfTables_SSE2[3].m);
    RowDct_SSE2(coeffs + 4 * 8, kfTables_SSE2[0].m, kfTables_SSE2[3].m);
    RowDct_SSE2(coeffs + 6 * 8, kfTables_SSE2[2].m, kfTables_SSE2[1].m);
    coeffs += 64;
  }
}
#endif  // SJPEG_USE_SSE2

FdctFunc GetFdct() {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return FdctSSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return FdctNEON;
#endif
  return FdctC;  // default
}
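// (Hypothetical caller sketch: fetch the dispatched implementation once, then
// run it over 64-coefficient blocks:
//   const FdctFunc fdct = GetFdct();
//   fdct(coeffs, num_blocks);  // coeffs: int16_t[64 * num_blocks], 16-byte
//                              // aligned for the SSE2 path's _mm_store_si128
// )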

///////////////////////////////////////////////////////////////////////////////

}   // namespace sjpeg