sjpeg 0.1.0

@@ -0,0 +1,3 @@
+ require "mkmf"
+
+ create_makefile "sjpeg"
data/ext/sjpeg/fdct.cc ADDED
@@ -0,0 +1,627 @@
+ // Copyright 2017 Google Inc.
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ // http://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+ //
+ // forward DCT
+ //
+ // fdct output is kept scaled by 16, to retain maximum 16bit precision
+ //
+ // Author: Skal (pascal.massimino@gmail.com)
+
+ #define SJPEG_NEED_ASM_HEADERS
+ #include "sjpegi.h"
+
+ namespace sjpeg {
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Cosine table: C(k) = cos(k.pi/16)/sqrt(2), k = 1..7 using 15 bits signed
+ const int16_t kTable04[7] = { 22725, 21407, 19266, 16384, 12873, 8867, 4520 };
+ // rows #1 and #7 are pre-multiplied by 2.C(1) before the 2nd pass.
+ // This multiply is merged into the table of constants used during the first pass:
+ const int16_t kTable17[7] = { 31521, 29692, 26722, 22725, 17855, 12299, 6270 };
+ // rows #2 and #6 are pre-multiplied by 2.C(2):
+ const int16_t kTable26[7] = { 29692, 27969, 25172, 21407, 16819, 11585, 5906 };
+ // rows #3 and #5 are pre-multiplied by 2.C(3):
+ const int16_t kTable35[7] = { 26722, 25172, 22654, 19266, 15137, 10426, 5315 };
+
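For reference, the tables above follow the definition in the comment; a minimal standalone sketch that regenerates them (illustrative only, not part of the gem — the shipped constants carry their own legacy rounding, so a few entries differ by one or two units in the last place):

#include <cmath>
#include <cstdio>

int main() {
  const double kPi = 3.14159265358979323846;
  // Row n = 0 prints kTable04; rows n = 1..3 print the 2.C(n)-pre-multiplied
  // tables kTable17, kTable26 and kTable35, all in 15-bit signed precision.
  for (int n = 0; n <= 3; ++n) {
    const double Cn = std::cos(n * kPi / 16.) / std::sqrt(2.);
    const double scale = (n == 0) ? 1. : 2. * Cn;
    for (int k = 1; k <= 7; ++k) {
      const double Ck = std::cos(k * kPi / 16.) / std::sqrt(2.);
      std::printf("%5d ", static_cast<int>(std::lround(32768. * scale * Ck)));
    }
    std::printf("\n");
  }
  return 0;
}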
+ ///////////////////////////////////////////////////////////////////////////////
+ // Constants and C/SSE2 macros for DCT vertical pass
+
+ #define kTan1 (13036) // = tan(pi/16)
+ #define kTan2 (27146) // = tan(2.pi/16) = sqrt(2) - 1.
+ #define kTan3m1 (-21746) // = tan(3.pi/16) - 1
+ #define k2Sqrt2 (23170) // = 1 / 2.sqrt(2)
+
+ // performs: {a,b} <- {a-b, a+b}, without saturation
+ #define BUTTERFLY(a, b) do { \
+ SUB((a), (b)); \
+ ADD((b), (b)); \
+ ADD((b), (a)); \
+ } while (0)
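A concrete trace of this macro with the plain-C ADD/SUB definitions further down in the file, to make the in-place update order clear (illustrative only):

// Start from a = 5, b = 3:
//   SUB(a, b)  ->  a = a - b = 2
//   ADD(b, b)  ->  b = b + b = 6
//   ADD(b, a)  ->  b = b + a = 8
// Net effect: {a, b} <- {5 - 3, 5 + 3}, i.e. {a - b, a + b} as stated above.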
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Constants for DCT horizontal pass
+
+ // Note about the CORRECT_LSB macro:
+ // using 16bit fixed-point constants, we often compute products like:
+ // p = (A*x + B*y + 32768) >> 16 by adding two sub-terms q = (A*x) >> 16
+ // and r = (B*y) >> 16 together. Statistically, we have p = q + r + 1
+ // in 3/4 of the cases. This can be easily seen from the relation:
+ // (a + b + 1) >> 1 = (a >> 1) + (b >> 1) + ((a|b)&1)
+ // The approximation we are doing is replacing ((a|b)&1) by 1.
+ // In practice, this is slightly more involved because the constants A and B
+ // have also been rounded compared to their exact floating-point values.
+ // However, all in all the correction is quite small, and CORRECT_LSB can
+ // be defined empty if needed.
+
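The quoted relation and the "3/4 of the cases" figure are easy to confirm; a minimal standalone check (illustrative only, not part of the gem):

#include <cassert>
#include <cstdio>

int main() {
  int ones = 0, total = 0;
  for (int a = 0; a < 256; ++a) {
    for (int b = 0; b < 256; ++b) {
      // exact identity quoted in the comment above
      assert(((a + b + 1) >> 1) == (a >> 1) + (b >> 1) + ((a | b) & 1));
      ones += (a | b) & 1;
      ++total;
    }
  }
  std::printf("(a|b)&1 == 1 in %.1f%% of cases\n", 100. * ones / total);  // ~75%
  return 0;
}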
+ #define COLUMN_DCT8(in) do { \
+ LOAD(m0, (in)[0 * 8]); \
+ LOAD(m2, (in)[2 * 8]); \
+ LOAD(m7, (in)[7 * 8]); \
+ LOAD(m5, (in)[5 * 8]); \
+ \
+ BUTTERFLY(m0, m7); \
+ BUTTERFLY(m2, m5); \
+ \
+ LOAD(m3, (in)[3 * 8]); \
+ LOAD(m4, (in)[4 * 8]); \
+ BUTTERFLY(m3, m4); \
+ \
+ LOAD(m6, (in)[6 * 8]); \
+ LOAD(m1, (in)[1 * 8]); \
+ BUTTERFLY(m1, m6); \
+ BUTTERFLY(m7, m4); \
+ BUTTERFLY(m6, m5); \
+ \
+ /* RowIdct() needs 15bits fixed-point input, when the output from */ \
+ /* ColumnIdct() would be 12bits. We are better doing the shift by 3 */ \
+ /* now instead of in RowIdct(), because we have some multiplies to */ \
+ /* perform, that can take advantage of the extra 3bits precision. */ \
+ LSHIFT(m4, 3); \
+ LSHIFT(m5, 3); \
+ BUTTERFLY(m4, m5); \
+ STORE16((in)[0 * 8], m5); \
+ STORE16((in)[4 * 8], m4); \
+ \
+ LSHIFT(m7, 3); \
+ LSHIFT(m6, 3); \
+ LSHIFT(m3, 3); \
+ LSHIFT(m0, 3); \
+ \
+ LOAD_CST(m4, kTan2); \
+ m5 = m4; \
+ MULT(m4, m7); \
+ MULT(m5, m6); \
+ SUB(m4, m6); \
+ ADD(m5, m7); \
+ STORE16((in)[2 * 8], m5); \
+ STORE16((in)[6 * 8], m4); \
+ \
+ /* We should be multiplying m6 by C4 = 1/sqrt(2) here, but we only have */ \
+ /* the k2Sqrt2 = 1/(2.sqrt(2)) constant that fits into 15bits. So we */ \
+ /* shift by 4 instead of 3 to compensate for the additional 1/2 factor. */ \
+ LOAD_CST(m6, k2Sqrt2); \
+ LSHIFT(m2, 3 + 1); \
+ LSHIFT(m1, 3 + 1); \
+ BUTTERFLY(m1, m2); \
+ MULT(m2, m6); \
+ MULT(m1, m6); \
+ BUTTERFLY(m3, m1); \
+ BUTTERFLY(m0, m2); \
+ \
+ LOAD_CST(m4, kTan3m1); \
+ LOAD_CST(m5, kTan1); \
+ m7 = m3; \
+ m6 = m1; \
+ MULT(m3, m4); \
+ MULT(m1, m5); \
+ \
+ ADD(m3, m7); \
+ ADD(m1, m2); \
+ CORRECT_LSB(m1); \
+ CORRECT_LSB(m3); \
+ MULT(m4, m0); \
+ MULT(m5, m2); \
+ ADD(m4, m0); \
+ SUB(m0, m3); \
+ ADD(m7, m4); \
+ SUB(m5, m6); \
+ \
+ STORE16((in)[1 * 8], m1); \
+ STORE16((in)[3 * 8], m0); \
+ STORE16((in)[5 * 8], m7); \
+ STORE16((in)[7 * 8], m5); \
+ } while (0)
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Plain-C implementation, bit-wise equivalent to the SSE2 version
+
+ // these are the macros required by COLUMN_*
+ #define LOAD_CST(dst, src) (dst) = (src)
+ #define LOAD(dst, src) (dst) = (src)
+ #define MULT(a, b) (a) = (((a) * (b)) >> 16)
+ #define ADD(a, b) (a) = (a) + (b)
+ #define SUB(a, b) (a) = (a) - (b)
+ #define LSHIFT(a, n) (a) = ((a) << (n))
+ #define STORE16(a, b) (a) = (b)
+ #define CORRECT_LSB(a) (a) += 1
+
+ // DCT vertical pass
+
+ void ColumnDct(int16_t* in) {
+ for (int i = 0; i < 8; ++i) {
+ int32_t m0, m1, m2, m3, m4, m5, m6, m7;
+ COLUMN_DCT8(in + i);
+ }
+ }
+
+ // DCT horizontal pass
+
+ // We don't really need to round before descaling, since we
+ // still have 4 bits of precision left as final scaled output.
+ #define DESCALE(a) static_cast<int16_t>((a) >> 16)
+
+ static void RowDct(int16_t* in, const int16_t* table) {
+ // The Fourier transform is a unitary operator, so we're basically
+ // doing the transpose of RowIdct()
+ const int a0 = in[0] + in[7];
+ const int b0 = in[0] - in[7];
+ const int a1 = in[1] + in[6];
+ const int b1 = in[1] - in[6];
+ const int a2 = in[2] + in[5];
+ const int b2 = in[2] - in[5];
+ const int a3 = in[3] + in[4];
+ const int b3 = in[3] - in[4];
+
+ // even part
+ const int C2 = table[1];
+ const int C4 = table[3];
+ const int C6 = table[5];
+ const int c0 = a0 + a3;
+ const int c1 = a0 - a3;
+ const int c2 = a1 + a2;
+ const int c3 = a1 - a2;
+
+ in[0] = DESCALE(C4 * (c0 + c2));
+ in[4] = DESCALE(C4 * (c0 - c2));
+ in[2] = DESCALE(C2 * c1 + C6 * c3);
+ in[6] = DESCALE(C6 * c1 - C2 * c3);
+
+ // odd part
+ const int C1 = table[0];
+ const int C3 = table[2];
+ const int C5 = table[4];
+ const int C7 = table[6];
+ in[1] = DESCALE(C1 * b0 + C3 * b1 + C5 * b2 + C7 * b3);
+ in[3] = DESCALE(C3 * b0 - C7 * b1 - C1 * b2 - C5 * b3);
+ in[5] = DESCALE(C5 * b0 - C1 * b1 + C7 * b2 + C3 * b3);
+ in[7] = DESCALE(C7 * b0 - C5 * b1 + C3 * b2 - C1 * b3);
+ }
+ #undef DESCALE
+
+ #undef LOAD_CST
+ #undef LOAD
+ #undef MULT
+ #undef ADD
+ #undef SUB
+ #undef LSHIFT
+ #undef STORE16
+ #undef CORRECT_LSB
+
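As a cross-check for the fixed-point pipeline above, a naive double-precision forward DCT can be written against the conventional JPEG DCT-II normalization, with the output multiplied by 16 to follow the "scaled by 16" note in the file header (an illustrative sketch under that assumed convention, not part of the gem):

#include <cmath>
#include <cstdint>

// Reference 8x8 forward DCT-II:
//   out[v*8+u] = 16 * 1/4 * C(u) * C(v) *
//     sum_{x,y} in[y*8+x] * cos((2x+1)u*pi/16) * cos((2y+1)v*pi/16),
// with C(0) = 1/sqrt(2) and C(k) = 1 otherwise.
static void ReferenceFdct(const int16_t in[64], double out[64]) {
  const double kPi = 3.14159265358979323846;
  for (int v = 0; v < 8; ++v) {
    for (int u = 0; u < 8; ++u) {
      double sum = 0.;
      for (int y = 0; y < 8; ++y) {
        for (int x = 0; x < 8; ++x) {
          sum += in[y * 8 + x] * std::cos((2 * x + 1) * u * kPi / 16.)
                               * std::cos((2 * y + 1) * v * kPi / 16.);
        }
      }
      const double cu = (u == 0) ? 1. / std::sqrt(2.) : 1.;
      const double cv = (v == 0) ? 1. / std::sqrt(2.) : 1.;
      out[v * 8 + u] = 16. * 0.25 * cu * cv * sum;
    }
  }
}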
+ ///////////////////////////////////////////////////////////////////////////////
+ // SSE2 implementation
+
+ #if defined(SJPEG_USE_SSE2)
+
+ // Tables and macros
+
+ #define CST(v) { { v, v, v, v, v, v, v, v } }
+ static const union {
+ const int16_t s[8];
+ const __m128i m;
+ } CST_kTan1 = CST(kTan1),
+ CST_kTan2 = CST(kTan2),
+ CST_kTan3m1 = CST(kTan3m1),
+ CST_k2Sqrt2 = CST(k2Sqrt2),
+ CST_kfRounder1 = CST(1); // rounders for fdct
+ #undef CST
+
+ static const union {
+ const uint16_t s[4 * 8];
+ const __m128i m[4];
+ } kfTables_SSE2[4] = {
+ // Tables for fdct, roughly the transpose of the above, shuffled
+ { { 0x4000, 0x4000, 0x58c5, 0x4b42, 0xdd5d, 0xac61, 0xa73b, 0xcdb7,
+ 0x4000, 0x4000, 0x3249, 0x11a8, 0x539f, 0x22a3, 0x4b42, 0xee58,
+ 0x4000, 0xc000, 0x3249, 0xa73b, 0x539f, 0xdd5d, 0x4b42, 0xa73b,
+ 0xc000, 0x4000, 0x11a8, 0x4b42, 0x22a3, 0xac61, 0x11a8, 0xcdb7 } },
+ { { 0x58c5, 0x58c5, 0x7b21, 0x6862, 0xcff5, 0x8c04, 0x84df, 0xba41,
+ 0x58c5, 0x58c5, 0x45bf, 0x187e, 0x73fc, 0x300b, 0x6862, 0xe782,
+ 0x58c5, 0xa73b, 0x45bf, 0x84df, 0x73fc, 0xcff5, 0x6862, 0x84df,
+ 0xa73b, 0x58c5, 0x187e, 0x6862, 0x300b, 0x8c04, 0x187e, 0xba41 } },
+ { { 0x539f, 0x539f, 0x73fc, 0x6254, 0xd2bf, 0x92bf, 0x8c04, 0xbe4d,
+ 0x539f, 0x539f, 0x41b3, 0x1712, 0x6d41, 0x2d41, 0x6254, 0xe8ee,
+ 0x539f, 0xac61, 0x41b3, 0x8c04, 0x6d41, 0xd2bf, 0x6254, 0x8c04,
+ 0xac61, 0x539f, 0x1712, 0x6254, 0x2d41, 0x92bf, 0x1712, 0xbe4d } },
+ { { 0x4b42, 0x4b42, 0x6862, 0x587e, 0xd746, 0x9dac, 0x979e, 0xc4df,
+ 0x4b42, 0x4b42, 0x3b21, 0x14c3, 0x6254, 0x28ba, 0x587e, 0xeb3d,
+ 0x4b42, 0xb4be, 0x3b21, 0x979e, 0x6254, 0xd746, 0x587e, 0x979e,
+ 0xb4be, 0x4b42, 0x14c3, 0x587e, 0x28ba, 0x9dac, 0x14c3, 0xc4df } } };
+
+ #define LOAD_CST(x, y) (x) = (CST_ ## y).m
+ #define LOAD(x, y) \
+ (x) = _mm_load_si128(reinterpret_cast<const __m128i*>(&(y)))
+ #define MULT(x, y) (x) = _mm_mulhi_epi16((x), (y))
+ #define ADD(x, y) (x) = _mm_add_epi16((x), (y))
+ #define SUB(x, y) (x) = _mm_sub_epi16((x), (y))
+ #define LSHIFT(x, n) (x) = _mm_slli_epi16((x), (n))
+ #define STORE16(a, b) _mm_store_si128(reinterpret_cast<__m128i*>(&(a)), (b))
+ #define CORRECT_LSB(a) (a) = _mm_adds_epi16((a), CST_kfRounder1.m)
+
+ // DCT vertical pass
+
+ static void ColumnDct_SSE2(int16_t* in) {
+ __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+ COLUMN_DCT8(in);
+ }
+
+ // DCT horizontal pass
+
+ static void RowDct_SSE2(int16_t* in, const __m128i* table1,
+ const __m128i* table2) {
+ // load row [0123|4567] as [0123|7654]
+ __m128i m0 =
+ _mm_shufflehi_epi16(*reinterpret_cast<__m128i*>(in + 0 * 8), 0x1b);
+ __m128i m2 =
+ _mm_shufflehi_epi16(*reinterpret_cast<__m128i*>(in + 1 * 8), 0x1b);
+
+ // we process two rows in parallel
+ __m128i m4 = m0;
+ // => x0 x1 x2 x3 | x0' x1' x2' x3'
+ m0 = (__m128i)_mm_shuffle_ps((__m128)m0, (__m128)m2, 0x44);
+ // => x7 x6 x5 x4 | x7' x6' x5' x4'
+ m4 = (__m128i)_mm_shuffle_ps((__m128)m4, (__m128)m2, 0xee);
+
+ // initial butterfly
+ m2 = m0;
+ m0 = _mm_add_epi16(m0, m4); // a0=x0+x7 | a1=x1+x6 | a2=x2+x5 | a3=x3+x4
+ m2 = _mm_sub_epi16(m2, m4); // b0=x0-x7 | b1=x1-x6 | b2=x2-x5 | b3=x3-x4
+
+ // prepare for scalar products which are performed using four madd_epi16
+ __m128i m6;
+ m4 = m0;
+ m0 = _mm_unpacklo_epi32(m0, m2); // a0 a1 | b0 b1 | a2 a3 | b2 b3
+ m4 = _mm_unpackhi_epi32(m4, m2);
+ m2 = _mm_shuffle_epi32(m0, 0x4e); // a2 a3 | b2 b3 | a0 a1 | b0 b1
+ m6 = _mm_shuffle_epi32(m4, 0x4e);
+
+ __m128i m1, m3, m5, m7;
+ m1 = _mm_madd_epi16(m2, table1[1]);
+ m3 = _mm_madd_epi16(m0, table1[2]);
+ m5 = _mm_madd_epi16(m6, table2[1]);
+ m7 = _mm_madd_epi16(m4, table2[2]);
+
+ m2 = _mm_madd_epi16(m2, table1[3]);
+ m0 = _mm_madd_epi16(m0, table1[0]);
+ m6 = _mm_madd_epi16(m6, table2[3]);
+ m4 = _mm_madd_epi16(m4, table2[0]);
+
+ // add the sub-terms
+ m0 = _mm_add_epi32(m0, m1);
+ m4 = _mm_add_epi32(m4, m5);
+ m2 = _mm_add_epi32(m2, m3);
+ m6 = _mm_add_epi32(m6, m7);
+
+ // descale
+ m0 = _mm_srai_epi32(m0, 16);
+ m4 = _mm_srai_epi32(m4, 16);
+ m2 = _mm_srai_epi32(m2, 16);
+ m6 = _mm_srai_epi32(m6, 16);
+
+ m0 = _mm_packs_epi32(m0, m2);
+ m4 = _mm_packs_epi32(m4, m6);
+
+ _mm_store_si128(reinterpret_cast<__m128i*>(in + 0 * 8), m0);
+ _mm_store_si128(reinterpret_cast<__m128i*>(in + 1 * 8), m4);
+ }
+
+ #undef LOAD_CST
+ #undef LOAD
+ #undef MULT
+ #undef ADD
+ #undef SUB
+ #undef LSHIFT
+ #undef STORE16
+ #undef CORRECT_LSB
+
+ #endif // SJPEG_USE_SSE2
+
+ // done with the macros
+
+ #undef BUTTERFLY
+ #undef COLUMN_DCT8
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // NEON implementation
+
+ #if defined(SJPEG_USE_NEON)
+
+ // multiply by scalar
+ #define MULT(A, kC) (vqdmulhq_n_s16((A), (kC) >> 1))
+ // V0 = r0 - r1, V1 = r0 + r1
+ #define BUTTERFLY(V0, V1, r0, r1) \
+ const int16x8_t V0 = vsubq_s16((r0), (r1)); \
+ const int16x8_t V1 = vaddq_s16((r0), (r1))
+
+ // collect the 16b hi-words of 32bit words into a packed 16b one
+ static int16x8_t PackS32(const int32x4_t lo, const int32x4_t hi) {
+ return vuzpq_s16(vreinterpretq_s16_s32(lo),
+ vreinterpretq_s16_s32(hi)).val[1];
+ }
+
+ #define MULT_DCL_32(LO, HI, A, CST) \
+ int32x4_t LO = vmull_s16(vget_low_s16(A), vget_low_s16(CST)); \
+ int32x4_t HI = vmull_s16(vget_high_s16(A), vget_high_s16(CST))
+ #define MULT_ADD_32(LO, HI, A, CST) do { \
+ LO = vmlal_s16(LO, vget_low_s16(A), vget_low_s16(CST)); \
+ HI = vmlal_s16(HI, vget_high_s16(A), vget_high_s16(CST)); \
+ } while (0)
+ #define MULT_SUB_32(LO, HI, A, CST) do { \
+ LO = vmlsl_s16(LO, vget_low_s16(A), vget_low_s16(CST)); \
+ HI = vmlsl_s16(HI, vget_high_s16(A), vget_high_s16(CST)); \
+ } while (0)
+
+ #define MK_TABLE_CST(A, B, C, D) { (A), (B), (C), (D), (A), (D), (C), (B) }
+
+ // s64 transposing helper:
+ // *out0 = lo(v0) | lo(v1)
+ // *out1 = hi(v0) | hi(v1)
+ static void vtrn_s64(const int32x4_t v0, const int32x4_t v1,
+ int16x8_t* out0, int16x8_t* out1) {
+ *out0 = vreinterpretq_s16_s64(
+ vcombine_s64(vreinterpret_s64_s32(vget_low_s32(v0)),
+ vreinterpret_s64_s32(vget_low_s32(v1))));
+ *out1 = vreinterpretq_s16_s64(
+ vcombine_s64(vreinterpret_s64_s32(vget_high_s32(v0)),
+ vreinterpret_s64_s32(vget_high_s32(v1))));
+ }
+
+ void Transpose8x8(int16x8_t* const A0, int16x8_t* const A1,
+ int16x8_t* const A2, int16x8_t* const A3,
+ int16x8_t* const A4, int16x8_t* const A5,
+ int16x8_t* const A6, int16x8_t* const A7) {
+ const int16x8x2_t row01 = vtrnq_s16(*A0, *A1);
+ const int16x8x2_t row23 = vtrnq_s16(*A2, *A3);
+ const int16x8x2_t row45 = vtrnq_s16(*A4, *A5);
+ const int16x8x2_t row67 = vtrnq_s16(*A6, *A7);
+
+ const int32x4x2_t row02 = vtrnq_s32(vreinterpretq_s32_s16(row01.val[0]),
+ vreinterpretq_s32_s16(row23.val[0]));
+ const int32x4x2_t row13 = vtrnq_s32(vreinterpretq_s32_s16(row01.val[1]),
+ vreinterpretq_s32_s16(row23.val[1]));
+ const int32x4x2_t row46 = vtrnq_s32(vreinterpretq_s32_s16(row45.val[0]),
+ vreinterpretq_s32_s16(row67.val[0]));
+ const int32x4x2_t row57 = vtrnq_s32(vreinterpretq_s32_s16(row45.val[1]),
+ vreinterpretq_s32_s16(row67.val[1]));
+
+ vtrn_s64(row02.val[0], row46.val[0], A0, A4);
+ vtrn_s64(row02.val[1], row46.val[1], A2, A6);
+ vtrn_s64(row13.val[0], row57.val[0], A1, A5);
+ vtrn_s64(row13.val[1], row57.val[1], A3, A7);
+ }
+
+ static void Dct_NEON(int16_t* in) {
+ ////////////////////
+ // vertical pass
+ ////////////////////
+ const int16x8_t m0 = vld1q_s16(in + 0 * 8);
+ const int16x8_t m1 = vld1q_s16(in + 1 * 8);
+ const int16x8_t m2 = vld1q_s16(in + 2 * 8);
+ const int16x8_t m3 = vld1q_s16(in + 3 * 8);
+ const int16x8_t m4 = vld1q_s16(in + 4 * 8);
+ const int16x8_t m5 = vld1q_s16(in + 5 * 8);
+ const int16x8_t m6 = vld1q_s16(in + 6 * 8);
+ const int16x8_t m7 = vld1q_s16(in + 7 * 8);
+
+ BUTTERFLY(A0, A7, m0, m7);
+ BUTTERFLY(A2, A5, m2, m5);
+ BUTTERFLY(A3, A4, m3, m4);
+ BUTTERFLY(A1, A6, m1, m6);
+
+ BUTTERFLY(B7, B4, A7, A4);
+ BUTTERFLY(B6, B5, A6, A5);
+
+ // see comment in COLUMN_DCT8
+ const int16x8_t C4 = vshlq_n_s16(B4, 3);
+ const int16x8_t C5 = vshlq_n_s16(B5, 3);
+ const int16x8_t C7 = vshlq_n_s16(B7, 3);
+ const int16x8_t C6 = vshlq_n_s16(B6, 3);
+ const int16x8_t C3 = vshlq_n_s16(A3, 3);
+ const int16x8_t C0 = vshlq_n_s16(A0, 3);
+
+ // BUTTERFLY(tmp4, tmp0, C4, C5)
+ int16x8_t tmp0 = vaddq_s16(C4, C5);
+ int16x8_t tmp4 = vsubq_s16(C4, C5);
+ int16x8_t tmp6 = vsubq_s16(MULT(C7, kTan2), C6);
+ int16x8_t tmp2 = vaddq_s16(MULT(C6, kTan2), C7);
+
+ // see comment in COLUMN_DCT8
+ const int16x8_t E2 = vshlq_n_s16(A2, 3 + 1);
+ const int16x8_t E1 = vshlq_n_s16(A1, 3 + 1);
+ BUTTERFLY(F1, F2, E1, E2);
+ const int16x8_t G2 = MULT(F2, k2Sqrt2);
+ const int16x8_t G1 = MULT(F1, k2Sqrt2);
+ BUTTERFLY(H3, H1, C3, G1);
+ BUTTERFLY(H0, H2, C0, G2);
+
+ const int16x8_t G3 = vaddq_s16(MULT(H3, kTan3m1), H3);
+ const int16x8_t G6 = vaddq_s16(MULT(H1, kTan1), H2);
+
+ // CORRECT_LSB
+ const int16x8_t kOne = vdupq_n_s16(1);
+ const int16x8_t I3 = vaddq_s16(G3, kOne);
+ const int16x8_t G4 = vaddq_s16(MULT(H0, kTan3m1), H0);
+
+ int16x8_t tmp1 = vaddq_s16(G6, kOne);
+ int16x8_t tmp3 = vsubq_s16(H0, I3);
+ int16x8_t tmp5 = vaddq_s16(H3, G4);
+ int16x8_t tmp7 = vsubq_s16(MULT(H2, kTan1), H1);
+
+ Transpose8x8(&tmp0, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &tmp6, &tmp7);
+
+ ////////////////////
+ // Horizontal pass
+ ////////////////////
+ BUTTERFLY(b0, a0, tmp0, tmp7);
+ BUTTERFLY(b1, a1, tmp1, tmp6);
+ BUTTERFLY(b2, a2, tmp2, tmp5);
+ BUTTERFLY(b3, a3, tmp3, tmp4);
+
+ BUTTERFLY(c1, c0, a0, a3);
+ BUTTERFLY(c3, c2, a1, a2);
+
+ const int16_t kTable0[] = MK_TABLE_CST(22725, 31521, 29692, 26722);
+ const int16_t kTable1[] = MK_TABLE_CST(21407, 29692, 27969, 25172);
+ const int16_t kTable2[] = MK_TABLE_CST(19266, 26722, 25172, 22654);
+ const int16_t kTable3[] = MK_TABLE_CST(16384, 22725, 21407, 19266);
+ const int16_t kTable4[] = MK_TABLE_CST(12873, 17855, 16819, 15137);
+ const int16_t kTable5[] = MK_TABLE_CST(8867, 12299, 11585, 10426);
+ const int16_t kTable6[] = MK_TABLE_CST(4520, 6270, 5906, 5315);
+
+ // even part
+ const int16x8_t kC2 = vld1q_s16(kTable1);
+ const int16x8_t kC4 = vld1q_s16(kTable3);
+ const int16x8_t kC6 = vld1q_s16(kTable5);
+
+ MULT_DCL_32(out0_lo, out0_hi, c0, kC4);
+ MULT_DCL_32(out4_lo, out4_hi, c0, kC4);
+ MULT_ADD_32(out0_lo, out0_hi, c2, kC4);
+ MULT_SUB_32(out4_lo, out4_hi, c2, kC4);
+ MULT_DCL_32(out2_lo, out2_hi, c1, kC2);
+ MULT_DCL_32(out6_lo, out6_hi, c1, kC6);
+ MULT_ADD_32(out2_lo, out2_hi, c3, kC6);
+ MULT_SUB_32(out6_lo, out6_hi, c3, kC2);
+
+ int16x8_t out0 = PackS32(out0_lo, out0_hi);
+ int16x8_t out4 = PackS32(out4_lo, out4_hi);
+ int16x8_t out2 = PackS32(out2_lo, out2_hi);
+ int16x8_t out6 = PackS32(out6_lo, out6_hi);
+
+ // odd part
+ const int16x8_t kC1 = vld1q_s16(kTable0);
+ const int16x8_t kC3 = vld1q_s16(kTable2);
+ const int16x8_t kC5 = vld1q_s16(kTable4);
+ const int16x8_t kC7 = vld1q_s16(kTable6);
+
+ MULT_DCL_32(out1_lo, out1_hi, b0, kC1);
+ MULT_DCL_32(out3_lo, out3_hi, b0, kC3);
+ MULT_DCL_32(out5_lo, out5_hi, b0, kC5);
+ MULT_DCL_32(out7_lo, out7_hi, b0, kC7);
+
+ MULT_ADD_32(out1_lo, out1_hi, b1, kC3);
+ MULT_SUB_32(out3_lo, out3_hi, b1, kC7);
+ MULT_SUB_32(out5_lo, out5_hi, b1, kC1);
+ MULT_SUB_32(out7_lo, out7_hi, b1, kC5);
+
+ MULT_ADD_32(out1_lo, out1_hi, b2, kC5);
+ MULT_SUB_32(out3_lo, out3_hi, b2, kC1);
+ MULT_ADD_32(out5_lo, out5_hi, b2, kC7);
+ MULT_ADD_32(out7_lo, out7_hi, b2, kC3);
+
+ MULT_ADD_32(out1_lo, out1_hi, b3, kC7);
+ MULT_SUB_32(out3_lo, out3_hi, b3, kC5);
+ MULT_ADD_32(out5_lo, out5_hi, b3, kC3);
+ MULT_SUB_32(out7_lo, out7_hi, b3, kC1);
+
+ int16x8_t out1 = PackS32(out1_lo, out1_hi);
+ int16x8_t out3 = PackS32(out3_lo, out3_hi);
+ int16x8_t out5 = PackS32(out5_lo, out5_hi);
+ int16x8_t out7 = PackS32(out7_lo, out7_hi);
+
+ // final transpose
+ Transpose8x8(&out0, &out1, &out2, &out3, &out4, &out5, &out6, &out7);
+
+ // and storage.
+ vst1q_s16(&in[0 * 8], out0);
+ vst1q_s16(&in[1 * 8], out1);
+ vst1q_s16(&in[2 * 8], out2);
+ vst1q_s16(&in[3 * 8], out3);
+ vst1q_s16(&in[4 * 8], out4);
+ vst1q_s16(&in[5 * 8], out5);
+ vst1q_s16(&in[6 * 8], out6);
+ vst1q_s16(&in[7 * 8], out7);
+ }
+
+ static void FdctNEON(int16_t* coeffs, int num_blocks) {
+ while (num_blocks-- > 0) {
+ Dct_NEON(coeffs);
+ coeffs += 64;
+ }
+ }
+ #undef MULT
+ #undef BUTTERFLY
+ #undef MULT_DCL_32
+ #undef MULT_ADD_32
+ #undef MULT_SUB_32
+ #undef MK_TABLE_CST
+
+ #endif // SJPEG_USE_NEON
+
+ #undef kTan1
+ #undef kTan2
+ #undef kTan3m1
+ #undef k2Sqrt2
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // visible FDCT callable functions
+
+ static void FdctC(int16_t* coeffs, int num_blocks) {
+ while (num_blocks-- > 0) {
+ ColumnDct(coeffs);
+ RowDct(coeffs + 0 * 8, kTable04);
+ RowDct(coeffs + 1 * 8, kTable17);
+ RowDct(coeffs + 2 * 8, kTable26);
+ RowDct(coeffs + 3 * 8, kTable35);
+ RowDct(coeffs + 4 * 8, kTable04);
+ RowDct(coeffs + 5 * 8, kTable35);
+ RowDct(coeffs + 6 * 8, kTable26);
+ RowDct(coeffs + 7 * 8, kTable17);
+ coeffs += 64;
+ }
+ }
+
+ #if defined(SJPEG_USE_SSE2)
+ static void FdctSSE2(int16_t* coeffs, int num_blocks) {
+ while (num_blocks-- > 0) {
+ ColumnDct_SSE2(coeffs);
+ RowDct_SSE2(coeffs + 0 * 8, kfTables_SSE2[0].m, kfTables_SSE2[1].m);
+ RowDct_SSE2(coeffs + 2 * 8, kfTables_SSE2[2].m, kfTables_SSE2[3].m);
+ RowDct_SSE2(coeffs + 4 * 8, kfTables_SSE2[0].m, kfTables_SSE2[3].m);
+ RowDct_SSE2(coeffs + 6 * 8, kfTables_SSE2[2].m, kfTables_SSE2[1].m);
+ coeffs += 64;
+ }
+ }
+ #endif // SJPEG_USE_SSE2
+
+ FdctFunc GetFdct() {
+ #if defined(SJPEG_USE_SSE2)
+ if (SupportsSSE2()) return FdctSSE2;
+ #elif defined(SJPEG_USE_NEON)
+ if (SupportsNEON()) return FdctNEON;
+ #endif
+ return FdctC; // default
+ }
+
+ ///////////////////////////////////////////////////////////////////////////////
+
+ } // namespace sjpeg
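For completeness, a minimal usage sketch of the dispatcher above (assuming the FdctFunc typedef and GetFdct() are declared in sjpegi.h, as this file implies; illustrative only, not part of the gem):

#include <cstdint>
#include "sjpegi.h"

// Transforms 'num_blocks' consecutive 8x8 blocks laid out in raster order.
// Per the header comment, the output coefficients are kept scaled by 16.
void TransformBlocks(int16_t* coeffs, int num_blocks) {
  const sjpeg::FdctFunc fdct = sjpeg::GetFdct();  // SSE2/NEON/C, per build & CPU
  fdct(coeffs, num_blocks);
}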