sjpeg 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/sjpeg/extconf.rb ADDED
@@ -0,0 +1,3 @@
+ require "mkmf"
+
+ create_makefile "sjpeg"
data/ext/sjpeg/fdct.cc ADDED
@@ -0,0 +1,627 @@
+ // Copyright 2017 Google Inc.
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ //     http://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+ //
+ // forward DCT
+ //
+ // fdct output is kept scaled by 16, to retain maximum 16bit precision
+ //
+ // Author: Skal (pascal.massimino@gmail.com)
+
+ #define SJPEG_NEED_ASM_HEADERS
+ #include "sjpegi.h"
+
+ namespace sjpeg {
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Cosine table: C(k) = cos(k.pi/16)/sqrt(2), k = 1..7 using 15 bits signed
+ const int16_t kTable04[7] = { 22725, 21407, 19266, 16384, 12873, 8867, 4520 };
+ // rows #1 and #7 are pre-multiplied by 2.C(1) before the 2nd pass.
+ // This multiply is merged in the table of constants used during the first pass:
+ const int16_t kTable17[7] = { 31521, 29692, 26722, 22725, 17855, 12299, 6270 };
+ // rows #2 and #6 are pre-multiplied by 2.C(2):
+ const int16_t kTable26[7] = { 29692, 27969, 25172, 21407, 16819, 11585, 5906 };
+ // rows #3 and #5 are pre-multiplied by 2.C(3):
+ const int16_t kTable35[7] = { 26722, 25172, 22654, 19266, 15137, 10426, 5315 };
+
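The fixed-point values above follow directly from the stated definition of C(k). A stand-alone sketch (not part of the package) that reproduces all four tables:

    // Prints kTable04, kTable17, kTable26 and kTable35: C(k) at 15-bit scale,
    // with the last three rows pre-multiplied by 2.C(1), 2.C(2) and 2.C(3).
    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const double kPi = 3.14159265358979323846;
      double C[8];
      for (int k = 1; k <= 7; ++k) C[k] = cos(k * kPi / 16.) / sqrt(2.);
      const double premult[4] = { 1., 2. * C[1], 2. * C[2], 2. * C[3] };
      for (int t = 0; t < 4; ++t) {
        for (int k = 1; k <= 7; ++k) {
          printf("%6d,", (int)floor(32768. * premult[t] * C[k] + .5));
        }
        printf("\n");
      }
      return 0;
    }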
+ ///////////////////////////////////////////////////////////////////////////////
+ // Constants and C/SSE2 macros for DCT vertical pass
+
+ #define kTan1 (13036)     // = tan(pi/16)
+ #define kTan2 (27146)     // = tan(2.pi/16) = sqrt(2) - 1.
+ #define kTan3m1 (-21746)  // = tan(3.pi/16) - 1
+ #define k2Sqrt2 (23170)   // = 1 / 2.sqrt(2)
+
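These four constants use a 16-bit scale (1 << 16), unlike the 15-bit tables above. A quick stand-alone check of the rounded values (not part of the package):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const double kPi = 3.14159265358979323846;
      const double kScale = 65536.;  // 1 << 16
      printf("%d\n", (int)floor(kScale * tan(kPi / 16.) + .5));              // 13036
      printf("%d\n", (int)floor(kScale * (sqrt(2.) - 1.) + .5));             // 27146
      printf("%d\n", (int)floor(kScale * (tan(3. * kPi / 16.) - 1.) + .5));  // -21746
      printf("%d\n", (int)floor(kScale * .5 / sqrt(2.) + .5));               // 23170
      return 0;
    }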
+ // performs: {a,b} <- {a-b, a+b}, without saturation
+ #define BUTTERFLY(a, b) do { \
+   SUB((a), (b));  /* a <- a - b */ \
+   ADD((b), (b));  /* b <- 2.b   */ \
+   ADD((b), (a));  /* b <- a + b */ \
+ } while (0)
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Constants for DCT horizontal pass
+
+ // Note about the CORRECT_LSB macro:
+ // using 16bit fixed-point constants, we often compute products like:
+ //   p = (A*x + B*y + 32768) >> 16
+ // by adding two sub-terms q = (A*x) >> 16 and r = (B*y) >> 16 together.
+ // Statistically, we have p = q + r + 1 in 3/4 of the cases. This can be
+ // easily seen from the relation:
+ //   (a + b + 1) >> 1 = (a >> 1) + (b >> 1) + ((a|b)&1)
+ // The approximation we are doing is replacing ((a|b)&1) by 1.
+ // In practice, this is slightly more involved because the constants A and B
+ // have also been rounded compared to their exact floating point value.
+ // However, all in all the correction is quite small, and CORRECT_LSB can
+ // be defined empty if needed.
+
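The bit identity quoted in the note, and the "3/4 of the cases" figure, can be verified exhaustively; a minimal stand-alone check (not part of the package):

    #include <assert.h>
    #include <stdio.h>

    int main(void) {
      int ones = 0;
      for (int a = 0; a < 512; ++a) {
        for (int b = 0; b < 512; ++b) {
          // identity used by the CORRECT_LSB reasoning (non-negative a, b):
          assert(((a + b + 1) >> 1) == (a >> 1) + (b >> 1) + ((a | b) & 1));
          ones += (a | b) & 1;
        }
      }
      printf("correction term is 1 in %.0f%% of cases\n",
             100. * ones / (512. * 512.));  // prints 75%
      return 0;
    }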
+ #define COLUMN_DCT8(in) do { \
+   LOAD(m0, (in)[0 * 8]); \
+   LOAD(m2, (in)[2 * 8]); \
+   LOAD(m7, (in)[7 * 8]); \
+   LOAD(m5, (in)[5 * 8]); \
+ \
+   BUTTERFLY(m0, m7); \
+   BUTTERFLY(m2, m5); \
+ \
+   LOAD(m3, (in)[3 * 8]); \
+   LOAD(m4, (in)[4 * 8]); \
+   BUTTERFLY(m3, m4); \
+ \
+   LOAD(m6, (in)[6 * 8]); \
+   LOAD(m1, (in)[1 * 8]); \
+   BUTTERFLY(m1, m6); \
+   BUTTERFLY(m7, m4); \
+   BUTTERFLY(m6, m5); \
+ \
+   /* RowDct() needs 15bit fixed-point input, whereas the output from  */ \
+   /* ColumnDct() would only be 12bit. We are better off doing the     */ \
+   /* shift by 3 now rather than in RowDct(), because we have some     */ \
+   /* multiplies to perform that can use the extra 3 bits of precision. */ \
+   LSHIFT(m4, 3); \
+   LSHIFT(m5, 3); \
+   BUTTERFLY(m4, m5); \
+   STORE16((in)[0 * 8], m5); \
+   STORE16((in)[4 * 8], m4); \
+ \
+   LSHIFT(m7, 3); \
+   LSHIFT(m6, 3); \
+   LSHIFT(m3, 3); \
+   LSHIFT(m0, 3); \
+ \
+   LOAD_CST(m4, kTan2); \
+   m5 = m4; \
+   MULT(m4, m7); \
+   MULT(m5, m6); \
+   SUB(m4, m6); \
+   ADD(m5, m7); \
+   STORE16((in)[2 * 8], m5); \
+   STORE16((in)[6 * 8], m4); \
+ \
+   /* We should be multiplying m6 by C4 = 1/sqrt(2) here, but we only have */ \
+   /* the k2Sqrt2 = 1/(2.sqrt(2)) constant that fits into 15bits. So we    */ \
+   /* shift by 4 instead of 3 to compensate for the additional 1/2 factor. */ \
+   LOAD_CST(m6, k2Sqrt2); \
+   LSHIFT(m2, 3 + 1); \
+   LSHIFT(m1, 3 + 1); \
+   BUTTERFLY(m1, m2); \
+   MULT(m2, m6); \
+   MULT(m1, m6); \
+   BUTTERFLY(m3, m1); \
+   BUTTERFLY(m0, m2); \
+ \
+   LOAD_CST(m4, kTan3m1); \
+   LOAD_CST(m5, kTan1); \
+   m7 = m3; \
+   m6 = m1; \
+   MULT(m3, m4); \
+   MULT(m1, m5); \
+ \
+   ADD(m3, m7); \
+   ADD(m1, m2); \
+   CORRECT_LSB(m1); \
+   CORRECT_LSB(m3); \
+   MULT(m4, m0); \
+   MULT(m5, m2); \
+   ADD(m4, m0); \
+   SUB(m0, m3); \
+   ADD(m7, m4); \
+   SUB(m5, m6); \
+ \
+   STORE16((in)[1 * 8], m1); \
+   STORE16((in)[3 * 8], m0); \
+   STORE16((in)[5 * 8], m7); \
+   STORE16((in)[7 * 8], m5); \
+ } while (0)
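The compensation described in the macro's second comment block is the identity x * C4 = x / sqrt(2) = (2.x) * (1 / (2.sqrt(2))) = (2.x) * k2Sqrt2: doubling the operand with one extra left shift makes the 15-bit k2Sqrt2 constant act exactly like the out-of-range C4 multiplier.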
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Plain-C implementation, bit-wise equivalent to the SSE2 version
+
+ // these are the macros required by COLUMN_*
+ #define LOAD_CST(dst, src) (dst) = (src)
+ #define LOAD(dst, src) (dst) = (src)
+ #define MULT(a, b) (a) = (((a) * (b)) >> 16)
+ #define ADD(a, b) (a) = (a) + (b)
+ #define SUB(a, b) (a) = (a) - (b)
+ #define LSHIFT(a, n) (a) = ((a) << (n))
+ #define STORE16(a, b) (a) = (b)
+ #define CORRECT_LSB(a) (a) += 1
+
+ // DCT vertical pass
+
+ void ColumnDct(int16_t* in) {
+   for (int i = 0; i < 8; ++i) {
+     int32_t m0, m1, m2, m3, m4, m5, m6, m7;
+     COLUMN_DCT8(in + i);
+   }
+ }
+
+ // DCT horizontal pass
+
+ // We don't really need to round before descaling, since we
+ // still have 4 bits of precision left as final scaled output.
+ #define DESCALE(a) static_cast<int16_t>((a) >> 16)
+
+ static void RowDct(int16_t* in, const int16_t* table) {
+   // The Fourier transform is a unitary operator, so we're basically
+   // doing the transpose of RowIdct()
+   const int a0 = in[0] + in[7];
+   const int b0 = in[0] - in[7];
+   const int a1 = in[1] + in[6];
+   const int b1 = in[1] - in[6];
+   const int a2 = in[2] + in[5];
+   const int b2 = in[2] - in[5];
+   const int a3 = in[3] + in[4];
+   const int b3 = in[3] - in[4];
+
+   // even part
+   const int C2 = table[1];
+   const int C4 = table[3];
+   const int C6 = table[5];
+   const int c0 = a0 + a3;
+   const int c1 = a0 - a3;
+   const int c2 = a1 + a2;
+   const int c3 = a1 - a2;
+
+   in[0] = DESCALE(C4 * (c0 + c2));
+   in[4] = DESCALE(C4 * (c0 - c2));
+   in[2] = DESCALE(C2 * c1 + C6 * c3);
+   in[6] = DESCALE(C6 * c1 - C2 * c3);
+
+   // odd part
+   const int C1 = table[0];
+   const int C3 = table[2];
+   const int C5 = table[4];
+   const int C7 = table[6];
+   in[1] = DESCALE(C1 * b0 + C3 * b1 + C5 * b2 + C7 * b3);
+   in[3] = DESCALE(C3 * b0 - C7 * b1 - C1 * b2 - C5 * b3);
+   in[5] = DESCALE(C5 * b0 - C1 * b1 + C7 * b2 + C3 * b3);
+   in[7] = DESCALE(C7 * b0 - C5 * b1 + C3 * b2 - C1 * b3);
+ }
+ #undef DESCALE
+
+ #undef LOAD_CST
+ #undef LOAD
+ #undef MULT
+ #undef ADD
+ #undef SUB
+ #undef LSHIFT
+ #undef STORE16
+ #undef CORRECT_LSB
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // SSE2 implementation
+
+ #if defined(SJPEG_USE_SSE2)
+
+ // Tables and macros
+
+ #define CST(v) { { v, v, v, v, v, v, v, v } }
+ static const union {
+   const int16_t s[8];
+   const __m128i m;
+ } CST_kTan1 = CST(kTan1),
+   CST_kTan2 = CST(kTan2),
+   CST_kTan3m1 = CST(kTan3m1),
+   CST_k2Sqrt2 = CST(k2Sqrt2),
+   CST_kfRounder1 = CST(1);  // rounder for fdct
+ #undef CST
+
+ static const union {
+   const uint16_t s[4 * 8];
+   const __m128i m[4];
+ } kfTables_SSE2[4] = {
+   // Tables for fdct, roughly the transpose of the ones above, shuffled
+   { { 0x4000, 0x4000, 0x58c5, 0x4b42, 0xdd5d, 0xac61, 0xa73b, 0xcdb7,
+       0x4000, 0x4000, 0x3249, 0x11a8, 0x539f, 0x22a3, 0x4b42, 0xee58,
+       0x4000, 0xc000, 0x3249, 0xa73b, 0x539f, 0xdd5d, 0x4b42, 0xa73b,
+       0xc000, 0x4000, 0x11a8, 0x4b42, 0x22a3, 0xac61, 0x11a8, 0xcdb7 } },
+   { { 0x58c5, 0x58c5, 0x7b21, 0x6862, 0xcff5, 0x8c04, 0x84df, 0xba41,
+       0x58c5, 0x58c5, 0x45bf, 0x187e, 0x73fc, 0x300b, 0x6862, 0xe782,
+       0x58c5, 0xa73b, 0x45bf, 0x84df, 0x73fc, 0xcff5, 0x6862, 0x84df,
+       0xa73b, 0x58c5, 0x187e, 0x6862, 0x300b, 0x8c04, 0x187e, 0xba41 } },
+   { { 0x539f, 0x539f, 0x73fc, 0x6254, 0xd2bf, 0x92bf, 0x8c04, 0xbe4d,
+       0x539f, 0x539f, 0x41b3, 0x1712, 0x6d41, 0x2d41, 0x6254, 0xe8ee,
+       0x539f, 0xac61, 0x41b3, 0x8c04, 0x6d41, 0xd2bf, 0x6254, 0x8c04,
+       0xac61, 0x539f, 0x1712, 0x6254, 0x2d41, 0x92bf, 0x1712, 0xbe4d } },
+   { { 0x4b42, 0x4b42, 0x6862, 0x587e, 0xd746, 0x9dac, 0x979e, 0xc4df,
+       0x4b42, 0x4b42, 0x3b21, 0x14c3, 0x6254, 0x28ba, 0x587e, 0xeb3d,
+       0x4b42, 0xb4be, 0x3b21, 0x979e, 0x6254, 0xd746, 0x587e, 0x979e,
+       0xb4be, 0x4b42, 0x14c3, 0x587e, 0x28ba, 0x9dac, 0x14c3, 0xc4df } } };
+
+ #define LOAD_CST(x, y) (x) = (CST_ ## y).m
+ #define LOAD(x, y) \
+   (x) = _mm_load_si128(reinterpret_cast<const __m128i*>(&(y)))
+ #define MULT(x, y) (x) = _mm_mulhi_epi16((x), (y))
+ #define ADD(x, y) (x) = _mm_add_epi16((x), (y))
+ #define SUB(x, y) (x) = _mm_sub_epi16((x), (y))
+ #define LSHIFT(x, n) (x) = _mm_slli_epi16((x), (n))
+ #define STORE16(a, b) _mm_store_si128(reinterpret_cast<__m128i*>(&(a)), (b))
+ #define CORRECT_LSB(a) (a) = _mm_adds_epi16((a), CST_kfRounder1.m)
+
+ // DCT vertical pass
+
+ static void ColumnDct_SSE2(int16_t* in) {
+   __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+   COLUMN_DCT8(in);
+ }
+
+ // DCT horizontal pass
+
+ static void RowDct_SSE2(int16_t* in, const __m128i* table1,
+                         const __m128i* table2) {
+   // load row [0123|4567] as [0123|7654]
+   __m128i m0 =
+       _mm_shufflehi_epi16(*reinterpret_cast<__m128i*>(in + 0 * 8), 0x1b);
+   __m128i m2 =
+       _mm_shufflehi_epi16(*reinterpret_cast<__m128i*>(in + 1 * 8), 0x1b);
+
+   // we process two rows in parallel
+   __m128i m4 = m0;
+   // => x0 x1 x2 x3 | x0' x1' x2' x3'
+   m0 = _mm_castps_si128(
+       _mm_shuffle_ps(_mm_castsi128_ps(m0), _mm_castsi128_ps(m2), 0x44));
+   // => x7 x6 x5 x4 | x7' x6' x5' x4'
+   m4 = _mm_castps_si128(
+       _mm_shuffle_ps(_mm_castsi128_ps(m4), _mm_castsi128_ps(m2), 0xee));
+
+   // initial butterfly
+   m2 = m0;
+   m0 = _mm_add_epi16(m0, m4);  // a0=x0+x7 | a1=x1+x6 | a2=x2+x5 | a3=x3+x4
+   m2 = _mm_sub_epi16(m2, m4);  // b0=x0-x7 | b1=x1-x6 | b2=x2-x5 | b3=x3-x4
+
+   // prepare for the scalar products, which are performed using four
+   // madd_epi16's
+   __m128i m6;
+   m4 = m0;
+   m0 = _mm_unpacklo_epi32(m0, m2);   // a0 a1 | b0 b1 | a2 a3 | b2 b3
+   m4 = _mm_unpackhi_epi32(m4, m2);
+   m2 = _mm_shuffle_epi32(m0, 0x4e);  // a2 a3 | b2 b3 | a0 a1 | b0 b1
+   m6 = _mm_shuffle_epi32(m4, 0x4e);
+
+   __m128i m1, m3, m5, m7;
+   m1 = _mm_madd_epi16(m2, table1[1]);
+   m3 = _mm_madd_epi16(m0, table1[2]);
+   m5 = _mm_madd_epi16(m6, table2[1]);
+   m7 = _mm_madd_epi16(m4, table2[2]);
+
+   m2 = _mm_madd_epi16(m2, table1[3]);
+   m0 = _mm_madd_epi16(m0, table1[0]);
+   m6 = _mm_madd_epi16(m6, table2[3]);
+   m4 = _mm_madd_epi16(m4, table2[0]);
+
+   // add the sub-terms
+   m0 = _mm_add_epi32(m0, m1);
+   m4 = _mm_add_epi32(m4, m5);
+   m2 = _mm_add_epi32(m2, m3);
+   m6 = _mm_add_epi32(m6, m7);
+
+   // descale
+   m0 = _mm_srai_epi32(m0, 16);
+   m4 = _mm_srai_epi32(m4, 16);
+   m2 = _mm_srai_epi32(m2, 16);
+   m6 = _mm_srai_epi32(m6, 16);
+
+   m0 = _mm_packs_epi32(m0, m2);
+   m4 = _mm_packs_epi32(m4, m6);
+
+   _mm_store_si128(reinterpret_cast<__m128i*>(in + 0 * 8), m0);
+   _mm_store_si128(reinterpret_cast<__m128i*>(in + 1 * 8), m4);
+ }
+
+ #undef LOAD_CST
+ #undef LOAD
+ #undef MULT
+ #undef ADD
+ #undef SUB
+ #undef LSHIFT
+ #undef STORE16
+ #undef CORRECT_LSB
+
+ #endif  // SJPEG_USE_SSE2
348
+
349
+ // done with the macros
350
+
351
+ #undef BUTTERFLY
352
+ #undef COLUMN_DCT8
353
+
354
+ ///////////////////////////////////////////////////////////////////////////////
355
+ // NEON implementation
356
+
357
+ #if defined(SJPEG_USE_NEON)
358
+
359
+ // multiply by scalar
360
+ #define MULT(A, kC) (vqdmulhq_n_s16((A), (kC) >> 1))
361
+ // V0 = r0 - r1, V1 = r0 + r1
362
+ #define BUTTERFLY(V0, V1, r0, r1) \
363
+ const int16x8_t V0 = vsubq_s16((r0), (r1)); \
364
+ const int16x8_t V1 = vaddq_s16((r0), (r1))
365
+
366
+ // collect the 16b hi-words of 32bit words into a packed 16b one
367
+ static int16x8_t PackS32(const int32x4_t lo, const int32x4_t hi) {
368
+ return vuzpq_s16(vreinterpretq_s16_s32(lo),
369
+ vreinterpretq_s16_s32(hi)).val[1];
370
+ }
371
+
372
+ #define MULT_DCL_32(LO, HI, A, CST) \
373
+ int32x4_t LO = vmull_s16(vget_low_s16(A), vget_low_s16(CST)); \
374
+ int32x4_t HI = vmull_s16(vget_high_s16(A), vget_high_s16(CST))
375
+ #define MULT_ADD_32(LO, HI, A, CST) do { \
376
+ LO = vmlal_s16(LO, vget_low_s16(A), vget_low_s16(CST)); \
377
+ HI = vmlal_s16(HI, vget_high_s16(A), vget_high_s16(CST)); \
378
+ } while (0)
379
+ #define MULT_SUB_32(LO, HI, A, CST) do { \
380
+ LO = vmlsl_s16(LO, vget_low_s16(A), vget_low_s16(CST)); \
381
+ HI = vmlsl_s16(HI, vget_high_s16(A), vget_high_s16(CST)); \
382
+ } while (0)
383
+
384
+ #define MK_TABLE_CST(A, B, C, D) { (A), (B), (C), (D), (A), (D), (C), (B) }
385
+
386
+ // s64 transposing helper:
387
+ // *out0 = lo(v0) | hi(v1)
388
+ // *out1 = lo(v1) | hi(v0)
389
+ static void vtrn_s64(const int32x4_t v0, const int32x4_t v1,
390
+ int16x8_t* out0, int16x8_t* out1) {
391
+ *out0 = vreinterpretq_s16_s64(
392
+ vcombine_s64(vreinterpret_s64_s32(vget_low_s32(v0)),
393
+ vreinterpret_s64_s32(vget_low_s32(v1))));
394
+ *out1 = vreinterpretq_s16_s64(
395
+ vcombine_s64(vreinterpret_s64_s32(vget_high_s32(v0)),
396
+ vreinterpret_s64_s32(vget_high_s32(v1))));
397
+ }
398
+
399
+ void Transpose8x8(int16x8_t* const A0, int16x8_t* const A1,
400
+ int16x8_t* const A2, int16x8_t* const A3,
401
+ int16x8_t* const A4, int16x8_t* const A5,
402
+ int16x8_t* const A6, int16x8_t* const A7) {
403
+ const int16x8x2_t row01 = vtrnq_s16(*A0, *A1);
404
+ const int16x8x2_t row23 = vtrnq_s16(*A2, *A3);
405
+ const int16x8x2_t row45 = vtrnq_s16(*A4, *A5);
406
+ const int16x8x2_t row67 = vtrnq_s16(*A6, *A7);
407
+
408
+ const int32x4x2_t row02 = vtrnq_s32(vreinterpretq_s32_s16(row01.val[0]),
409
+ vreinterpretq_s32_s16(row23.val[0]));
410
+ const int32x4x2_t row13 = vtrnq_s32(vreinterpretq_s32_s16(row01.val[1]),
411
+ vreinterpretq_s32_s16(row23.val[1]));
412
+ const int32x4x2_t row46 = vtrnq_s32(vreinterpretq_s32_s16(row45.val[0]),
413
+ vreinterpretq_s32_s16(row67.val[0]));
414
+ const int32x4x2_t row57 = vtrnq_s32(vreinterpretq_s32_s16(row45.val[1]),
415
+ vreinterpretq_s32_s16(row67.val[1]));
416
+
417
+ vtrn_s64(row02.val[0], row46.val[0], A0, A4);
418
+ vtrn_s64(row02.val[1], row46.val[1], A2, A6);
419
+ vtrn_s64(row13.val[0], row57.val[0], A1, A5);
420
+ vtrn_s64(row13.val[1], row57.val[1], A3, A7);
421
+ }
422
+
423
+ static void Dct_NEON(int16_t* in) {
424
+ ////////////////////
425
+ // vertical pass
426
+ ////////////////////
427
+ const int16x8_t m0 = vld1q_s16(in + 0 * 8);
428
+ const int16x8_t m1 = vld1q_s16(in + 1 * 8);
429
+ const int16x8_t m2 = vld1q_s16(in + 2 * 8);
430
+ const int16x8_t m3 = vld1q_s16(in + 3 * 8);
431
+ const int16x8_t m4 = vld1q_s16(in + 4 * 8);
432
+ const int16x8_t m5 = vld1q_s16(in + 5 * 8);
433
+ const int16x8_t m6 = vld1q_s16(in + 6 * 8);
434
+ const int16x8_t m7 = vld1q_s16(in + 7 * 8);
435
+
436
+ BUTTERFLY(A0, A7, m0, m7);
437
+ BUTTERFLY(A2, A5, m2, m5);
438
+ BUTTERFLY(A3, A4, m3, m4);
439
+ BUTTERFLY(A1, A6, m1, m6);
440
+
441
+ BUTTERFLY(B7, B4, A7, A4);
442
+ BUTTERFLY(B6, B5, A6, A5);
443
+
444
+ // see comment in COLUMN_DCT8
445
+ const int16x8_t C4 = vshlq_n_s16(B4, 3);
446
+ const int16x8_t C5 = vshlq_n_s16(B5, 3);
447
+ const int16x8_t C7 = vshlq_n_s16(B7, 3);
448
+ const int16x8_t C6 = vshlq_n_s16(B6, 3);
449
+ const int16x8_t C3 = vshlq_n_s16(A3, 3);
450
+ const int16x8_t C0 = vshlq_n_s16(A0, 3);
451
+
452
+ // BUTTERFLY(tmp4, tmp0, C4, C5)
453
+ int16x8_t tmp0 = vaddq_s16(C4, C5);
454
+ int16x8_t tmp4 = vsubq_s16(C4, C5);
455
+ int16x8_t tmp6 = vsubq_s16(MULT(C7, kTan2), C6);
456
+ int16x8_t tmp2 = vaddq_s16(MULT(C6, kTan2), C7);
457
+
458
+ // see comment in COLUMN_DCT8
459
+ const int16x8_t E2 = vshlq_n_s16(A2, 3 + 1);
460
+ const int16x8_t E1 = vshlq_n_s16(A1, 3 + 1);
461
+ BUTTERFLY(F1, F2, E1, E2);
462
+ const int16x8_t G2 = MULT(F2, k2Sqrt2);
463
+ const int16x8_t G1 = MULT(F1, k2Sqrt2);
464
+ BUTTERFLY(H3, H1, C3, G1);
465
+ BUTTERFLY(H0, H2, C0, G2);
466
+
467
+ const int16x8_t G3 = vaddq_s16(MULT(H3, kTan3m1), H3);
468
+ const int16x8_t G6 = vaddq_s16(MULT(H1, kTan1), H2);
469
+
470
+ // CORRECT_LSB
471
+ const int16x8_t kOne = vdupq_n_s16(1);
472
+ const int16x8_t I3 = vaddq_s16(G3, kOne);
473
+ const int16x8_t G4 = vaddq_s16(MULT(H0, kTan3m1), H0);
474
+
475
+ int16x8_t tmp1 = vaddq_s16(G6, kOne);
476
+ int16x8_t tmp3 = vsubq_s16(H0, I3);
477
+ int16x8_t tmp5 = vaddq_s16(H3, G4);
478
+ int16x8_t tmp7 = vsubq_s16(MULT(H2, kTan1), H1);
479
+
480
+ Transpose8x8(&tmp0, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &tmp6, &tmp7);
481
+
482
+ ////////////////////
483
+ // Horizontal pass
484
+ ////////////////////
485
+ BUTTERFLY(b0, a0, tmp0, tmp7);
486
+ BUTTERFLY(b1, a1, tmp1, tmp6);
487
+ BUTTERFLY(b2, a2, tmp2, tmp5);
488
+ BUTTERFLY(b3, a3, tmp3, tmp4);
489
+
490
+ BUTTERFLY(c1, c0, a0, a3);
491
+ BUTTERFLY(c3, c2, a1, a2);
492
+
493
+ const int16_t kTable0[] = MK_TABLE_CST(22725, 31521, 29692, 26722);
494
+ const int16_t kTable1[] = MK_TABLE_CST(21407, 29692, 27969, 25172);
495
+ const int16_t kTable2[] = MK_TABLE_CST(19266, 26722, 25172, 22654);
496
+ const int16_t kTable3[] = MK_TABLE_CST(16384, 22725, 21407, 19266);
497
+ const int16_t kTable4[] = MK_TABLE_CST(12873, 17855, 16819, 15137);
498
+ const int16_t kTable5[] = MK_TABLE_CST(8867, 12299, 11585, 10426);
499
+ const int16_t kTable6[] = MK_TABLE_CST(4520, 6270, 5906, 5315);
500
+
501
+ // even part
502
+ const int16x8_t kC2 = vld1q_s16(kTable1);
503
+ const int16x8_t kC4 = vld1q_s16(kTable3);
504
+ const int16x8_t kC6 = vld1q_s16(kTable5);
505
+
506
+ MULT_DCL_32(out0_lo, out0_hi, c0, kC4);
507
+ MULT_DCL_32(out4_lo, out4_hi, c0, kC4);
508
+ MULT_ADD_32(out0_lo, out0_hi, c2, kC4);
509
+ MULT_SUB_32(out4_lo, out4_hi, c2, kC4);
510
+ MULT_DCL_32(out2_lo, out2_hi, c1, kC2);
511
+ MULT_DCL_32(out6_lo, out6_hi, c1, kC6);
512
+ MULT_ADD_32(out2_lo, out2_hi, c3, kC6);
513
+ MULT_SUB_32(out6_lo, out6_hi, c3, kC2);
514
+
515
+ int16x8_t out0 = PackS32(out0_lo, out0_hi);
516
+ int16x8_t out4 = PackS32(out4_lo, out4_hi);
517
+ int16x8_t out2 = PackS32(out2_lo, out2_hi);
518
+ int16x8_t out6 = PackS32(out6_lo, out6_hi);
519
+
520
+ // odd part
521
+ const int16x8_t kC1 = vld1q_s16(kTable0);
522
+ const int16x8_t kC3 = vld1q_s16(kTable2);
523
+ const int16x8_t kC5 = vld1q_s16(kTable4);
524
+ const int16x8_t kC7 = vld1q_s16(kTable6);
525
+
526
+ MULT_DCL_32(out1_lo, out1_hi, b0, kC1);
527
+ MULT_DCL_32(out3_lo, out3_hi, b0, kC3);
528
+ MULT_DCL_32(out5_lo, out5_hi, b0, kC5);
529
+ MULT_DCL_32(out7_lo, out7_hi, b0, kC7);
530
+
531
+ MULT_ADD_32(out1_lo, out1_hi, b1, kC3);
532
+ MULT_SUB_32(out3_lo, out3_hi, b1, kC7);
533
+ MULT_SUB_32(out5_lo, out5_hi, b1, kC1);
534
+ MULT_SUB_32(out7_lo, out7_hi, b1, kC5);
535
+
536
+ MULT_ADD_32(out1_lo, out1_hi, b2, kC5);
537
+ MULT_SUB_32(out3_lo, out3_hi, b2, kC1);
538
+ MULT_ADD_32(out5_lo, out5_hi, b2, kC7);
539
+ MULT_ADD_32(out7_lo, out7_hi, b2, kC3);
540
+
541
+ MULT_ADD_32(out1_lo, out1_hi, b3, kC7);
542
+ MULT_SUB_32(out3_lo, out3_hi, b3, kC5);
543
+ MULT_ADD_32(out5_lo, out5_hi, b3, kC3);
544
+ MULT_SUB_32(out7_lo, out7_hi, b3, kC1);
545
+
546
+ int16x8_t out1 = PackS32(out1_lo, out1_hi);
547
+ int16x8_t out3 = PackS32(out3_lo, out3_hi);
548
+ int16x8_t out5 = PackS32(out5_lo, out5_hi);
549
+ int16x8_t out7 = PackS32(out7_lo, out7_hi);
550
+
551
+ // final transpose
552
+ Transpose8x8(&out0, &out1, &out2, &out3, &out4, &out5, &out6, &out7);
553
+
554
+ // and storage.
555
+ vst1q_s16(&in[0 * 8], out0);
556
+ vst1q_s16(&in[1 * 8], out1);
557
+ vst1q_s16(&in[2 * 8], out2);
558
+ vst1q_s16(&in[3 * 8], out3);
559
+ vst1q_s16(&in[4 * 8], out4);
560
+ vst1q_s16(&in[5 * 8], out5);
561
+ vst1q_s16(&in[6 * 8], out6);
562
+ vst1q_s16(&in[7 * 8], out7);
563
+ }
564
+
565
+ static void FdctNEON(int16_t* coeffs, int num_blocks) {
566
+ while (num_blocks-- > 0) {
567
+ Dct_NEON(coeffs);
568
+ coeffs += 64;
569
+ }
570
+ }
571
+ #undef MULT
572
+ #undef BUTTERFLY
573
+ #undef MULT_DCL_32
574
+ #undef MULT_ADD_32
575
+ #undef MULT_SUB_32
576
+ #undef MK_TABLE_CST
577
+
578
+ #endif // SJPEG_USE_NEON
+
+ #undef kTan1
+ #undef kTan2
+ #undef kTan3m1
+ #undef k2Sqrt2
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // visible FDCT callable functions
+
+ static void FdctC(int16_t* coeffs, int num_blocks) {
+   while (num_blocks-- > 0) {
+     ColumnDct(coeffs);
+     RowDct(coeffs + 0 * 8, kTable04);
+     RowDct(coeffs + 1 * 8, kTable17);
+     RowDct(coeffs + 2 * 8, kTable26);
+     RowDct(coeffs + 3 * 8, kTable35);
+     RowDct(coeffs + 4 * 8, kTable04);
+     RowDct(coeffs + 5 * 8, kTable35);
+     RowDct(coeffs + 6 * 8, kTable26);
+     RowDct(coeffs + 7 * 8, kTable17);
+     coeffs += 64;
+   }
+ }
+
+ #if defined(SJPEG_USE_SSE2)
+ static void FdctSSE2(int16_t* coeffs, int num_blocks) {
+   while (num_blocks-- > 0) {
+     ColumnDct_SSE2(coeffs);
+     RowDct_SSE2(coeffs + 0 * 8, kfTables_SSE2[0].m, kfTables_SSE2[1].m);
+     RowDct_SSE2(coeffs + 2 * 8, kfTables_SSE2[2].m, kfTables_SSE2[3].m);
+     RowDct_SSE2(coeffs + 4 * 8, kfTables_SSE2[0].m, kfTables_SSE2[3].m);
+     RowDct_SSE2(coeffs + 6 * 8, kfTables_SSE2[2].m, kfTables_SSE2[1].m);
+     coeffs += 64;
+   }
+ }
+ #endif  // SJPEG_USE_SSE2
+
+ FdctFunc GetFdct() {
+ #if defined(SJPEG_USE_SSE2)
+   if (SupportsSSE2()) return FdctSSE2;
+ #elif defined(SJPEG_USE_NEON)
+   if (SupportsNEON()) return FdctNEON;
+ #endif
+   return FdctC;  // default
+ }
+
+ ///////////////////////////////////////////////////////////////////////////////
+
+ }   // namespace sjpeg
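For callers, GetFdct() is the only entry point needed: it is resolved once and the returned pointer is applied to batches of contiguous 8x8 blocks. A hypothetical usage sketch (assuming, from the signatures above, that FdctFunc is declared in sjpegi.h as void (*)(int16_t*, int)):

    #include <stdint.h>
    #include "sjpegi.h"  // assumed to declare sjpeg::FdctFunc and sjpeg::GetFdct()

    // Transforms two consecutive 8x8 blocks in place. The buffer should be
    // 16-byte aligned, since the SSE2 path loads it with _mm_load_si128().
    void TransformTwoBlocks(int16_t coeffs[2 * 64]) {
      const sjpeg::FdctFunc fdct = sjpeg::GetFdct();  // dispatched once
      fdct(coeffs, 2);  // output stays scaled by 16 (see file header)
    }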