sjpeg 0.1.0

// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// RGB -> YUV conversion
//
// Y       = ( 19595 * r + 38469 * g +  7471 * b + HALF) >> FRAC
// U - 128 = (-11059 * r - 21709 * g + 32768 * b + HALF) >> FRAC
// V - 128 = ( 32768 * r - 27439 * g -  5329 * b + HALF) >> FRAC
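//
// The integer weights above are the usual JFIF (full-range BT.601)
// coefficients scaled by 2^FRAC = 65536, e.g. 19595 ~ 0.299 * 65536,
// 38469 ~ 0.587 * 65536, 7471 ~ 0.114 * 65536 and 32768 = 0.5 * 65536.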
//
// Author: Skal (pascal.massimino@gmail.com)

#include <string.h>

#define SJPEG_NEED_ASM_HEADERS
#include "sjpegi.h"

namespace sjpeg {

// global fixed-point precision
enum { FRAC = 16, HALF = 1 << FRAC >> 1,
       ROUND_UV = (HALF << 2), ROUND_Y = HALF - (128 << FRAC) };
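// ROUND_Y folds the -128 level shift applied to luma before the DCT into the
// rounding bias; ROUND_UV (= 1 << (FRAC + 1)) is the rounding bias for the
// FRAC + 2 descale used when U/V are computed from a sum of four samples.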

#if defined(SJPEG_USE_SSE2)

// Load eight 16b-words from *src.
#define LOAD_16(src) _mm_loadu_si128(reinterpret_cast<const __m128i*>(src))
// Store eight 16b-words into *dst
#define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))

// Convert 8 packed RGB samples to r[], g[], b[]
static inline void RGB24PackedToPlanar(const uint8_t* const rgb,
                                       __m128i* const r,
                                       __m128i* const g,
                                       __m128i* const b) {
  const __m128i zero = _mm_setzero_si128();
  // in0: r0 g0 b0 r1 | g1 b1 r2 g2 | b2 r3 g3 b3 | r4 g4 b4 r5
  // in1: b2 r3 g3 b3 | r4 g4 b4 r5 | g5 b5 r6 g6 | b6 r7 g7 b7
  const __m128i in0 = LOAD_16(rgb + 0);
  const __m128i in1 = LOAD_16(rgb + 8);
  // A0: | r2 g2 b2 r3 | g3 b3 r4 g4 | b4 r5 ...
  // A1: ... b2 r3 | g3 b3 r4 g4 | b4 r5 g5 b5 |
  const __m128i A0 = _mm_srli_si128(in0, 6);
  const __m128i A1 = _mm_slli_si128(in1, 6);
  // B0: r0 r2 g0 g2 | b0 b2 r1 r3 | g1 g3 b1 b3 | r2 r4 b2 b4
  // B1: g3 g5 b3 b5 | r4 r6 g4 g6 | b4 b6 r5 r7 | g5 g7 b5 b7
  const __m128i B0 = _mm_unpacklo_epi8(in0, A0);
  const __m128i B1 = _mm_unpackhi_epi8(A1, in1);
  // C0: r1 r3 g1 g3 | b1 b3 r2 r4 | b2 b4 ...
  // C1: ... g3 g5 | b3 b5 r4 r6 | g4 g6 b4 b6
  const __m128i C0 = _mm_srli_si128(B0, 6);
  const __m128i C1 = _mm_slli_si128(B1, 6);
  // D0: r0 r1 r2 r3 | g0 g1 g2 g3 | b0 b1 b2 b3 | r1 r2 r3 r4
  // D1: b3 b4 b5 b6 | r4 r5 r6 r7 | g4 g5 g6 g7 | b4 b5 b6 b7 |
  const __m128i D0 = _mm_unpacklo_epi8(B0, C0);
  const __m128i D1 = _mm_unpackhi_epi8(C1, B1);
  // r4 r5 r6 r7 | g4 g5 g6 g7 | b4 b5 b6 b7 | 0
  const __m128i D2 = _mm_srli_si128(D1, 4);
  // r0 r1 r2 r3 | r4 r5 r6 r7 | g0 g1 g2 g3 | g4 g5 g6 g7
  const __m128i E0 = _mm_unpacklo_epi32(D0, D2);
  // b0 b1 b2 b3 | b4 b5 b6 b7 | r1 r2 r3 r4 | 0
  const __m128i E1 = _mm_unpackhi_epi32(D0, D2);
  // g0 g1 g2 g3 | g4 g5 g6 g7 | 0
  const __m128i E2 = _mm_srli_si128(E0, 8);
  const __m128i F0 = _mm_unpacklo_epi8(E0, zero);  // -> R
  const __m128i F1 = _mm_unpacklo_epi8(E1, zero);  // -> B
  const __m128i F2 = _mm_unpacklo_epi8(E2, zero);  // -> G
  *r = F0;
  *b = F1;
  *g = F2;
}

// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX.
// It is a macro and not a function because _mm_srai_epi32() requires an
// immediate (compile-time constant) shift amount.
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
                  ROUNDER, DESCALE_FIX, ADD_OR_SUB, OUT) do { \
  const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \
  const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \
  const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \
  const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB); \
  const __m128i V2_lo = ADD_OR_SUB(V0_lo, V1_lo); \
  const __m128i V2_hi = ADD_OR_SUB(V0_hi, V1_hi); \
  const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER); \
  const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER); \
  const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX); \
  const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX); \
  (OUT) = _mm_packs_epi32(V5_lo, V5_hi); \
} while (0)

#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))

inline void ConvertRGBToY(const __m128i* const R,
                          const __m128i* const G,
                          const __m128i* const B,
                          int offset,
                          __m128i* const Y) {
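  // Note: 38469 does not fit in a signed 16-bit lane for _mm_madd_epi16(), so
  // the green weight is split between the two dot products below:
  //   19595 * r + (38469 - 16384) * g  from the R/G pairs, plus
  //   16384 * g + 7471 * b             from the G/B pairs.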
  const __m128i kRG_y = MK_CST_16(19595, 38469 - 16384);
  const __m128i kGB_y = MK_CST_16(16384, 7471);
  const __m128i kROUND_Y = _mm_set1_epi32(HALF + (offset << FRAC));
  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kROUND_Y, FRAC,
            _mm_add_epi32, *Y);
}

inline void ConvertRGBToUV(const __m128i* const R,
                           const __m128i* const G,
                           const __m128i* const B,
                           int offset,
                           __m128i* const U, __m128i* const V) {
  // Warning! 32768 is overflowing int16, so we're actually multiplying
  // by -32768 instead of 32768. We compensate by subtracting the result
  // instead of adding, thus restoring the sign.
  const __m128i kRG_u = MK_CST_16(-11059, -21709);
  const __m128i kGB_u = MK_CST_16(0, -32768);
  const __m128i kRG_v = MK_CST_16(-32768, 0);
  const __m128i kGB_v = MK_CST_16(-27439, -5329);
  const __m128i kRound = _mm_set1_epi32((offset << FRAC) + HALF);

  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);

  // _mm_sub_epi32 -> sign restore!
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u, kRound, FRAC,
            _mm_sub_epi32, *U);
  // note! GB and RG are inverted, for sign-restoration
  TRANSFORM(GB_lo, GB_hi, RG_lo, RG_hi, kGB_v, kRG_v, kRound, FRAC,
            _mm_sub_epi32, *V);
}

// This version takes four accumulated R/G/B samples. Hence, the
// descaling factor is FRAC + 2.
inline void ConvertRGBToUVAccumulated(const __m128i* const R,
                                      const __m128i* const G,
                                      const __m128i* const B,
                                      __m128i* const U, __m128i* const V) {
  // Warning! 32768 is overflowing int16, so we're actually multiplying
  // by -32768 instead of 32768. We compensate by subtracting the result
  // instead of adding, thus restoring the sign.
  const __m128i kRG_u = MK_CST_16(-11059, -21709);
  const __m128i kGB_u = MK_CST_16(0, -32768);
  const __m128i kRG_v = MK_CST_16(-32768, 0);
  const __m128i kGB_v = MK_CST_16(-27439, -5329);
  const __m128i kRound = _mm_set1_epi32(ROUND_UV);

  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);

  // _mm_sub_epi32 -> sign restore!
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
            kRound, FRAC + 2, _mm_sub_epi32, *U);
  // note! GB and RG are inverted, for sign-restoration
  TRANSFORM(GB_lo, GB_hi, RG_lo, RG_hi, kGB_v, kRG_v,
            kRound, FRAC + 2, _mm_sub_epi32, *V);
}

#undef MK_CST_16
#undef TRANSFORM

// Convert 8 RGB samples to YUV. out[] points to a 3*64 data block.
static inline void ToYUV_8(const __m128i* const r,
                           const __m128i* const g,
                           const __m128i* const b,
                           int16_t* const out) {
  __m128i Y, U, V;
  ConvertRGBToY(r, g, b, -128, &Y);
  ConvertRGBToUV(r, g, b, 0, &U, &V);
  STORE_16(Y, out + 0 * 64);
  STORE_16(U, out + 1 * 64);
  STORE_16(V, out + 2 * 64);
}

static void Get8x8Block_SSE2(const uint8_t* data, int step, int16_t* out) {
  for (int y = 8; y > 0; --y) {
    __m128i r, g, b;
    RGB24PackedToPlanar(data, &r, &g, &b);
    ToYUV_8(&r, &g, &b, out);
    out += 8;
    data += step;
  }
}

// Convert 16x16 RGB samples to YUV420
static inline void ToY_16x16(const __m128i* const r,
                             const __m128i* const g,
                             const __m128i* const b,
                             int16_t* const y_out,
                             __m128i* const R_acc,
                             __m128i* const G_acc,
                             __m128i* const B_acc,
                             bool do_add) {
  __m128i Y;
  ConvertRGBToY(r, g, b, -128, &Y);
  STORE_16(Y, y_out);
  if (do_add) {
    *R_acc = _mm_add_epi16(*R_acc, *r);
    *G_acc = _mm_add_epi16(*G_acc, *g);
    *B_acc = _mm_add_epi16(*B_acc, *b);
  } else {   // just store
    *R_acc = *r;
    *G_acc = *g;
    *B_acc = *b;
  }
}

static inline void ToUV_8x8(const __m128i* const R,
                            const __m128i* const G,
                            const __m128i* const B,
                            int16_t* const uv_out) {
  __m128i U, V;
  ConvertRGBToUVAccumulated(R, G, B, &U, &V);
  STORE_16(U, uv_out + 0 * 64);
  STORE_16(V, uv_out + 1 * 64);
}

static void Condense16To8(const __m128i* const acc1, __m128i* const acc2) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i tmp1 = _mm_madd_epi16(*acc1, one);
  const __m128i tmp2 = _mm_madd_epi16(*acc2, one);
  *acc2 = _mm_packs_epi32(tmp1, tmp2);
}

// convert a 16x8 RGB strip into two 8x8 luma blocks and the top halves of the
// 8x8 U and V blocks
static void Get16x8_SSE2(const uint8_t* src1, int src_stride,
                         int16_t y[4 * 64], int16_t uv[2 * 64]) {
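  // Each loop iteration consumes two source rows: pixels 0..7 feed the first
  // luma block and pixels 8..15 (byte offset 3 * 8) the second, while the
  // per-column R/G/B sums of each row pair are kept in the *_acc registers
  // for the chroma (U/V) pass.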
  for (int i = 4; i > 0; --i, src1 += 2 * src_stride) {
    __m128i r_acc1, r_acc2, g_acc1, g_acc2, b_acc1, b_acc2;
    __m128i r, g, b;
    const uint8_t* const src2 = src1 + src_stride;
    RGB24PackedToPlanar(src1 + 0 * 8, &r, &g, &b);
    ToY_16x16(&r, &g, &b, y + 0 * 64 + 0, &r_acc1, &g_acc1, &b_acc1, false);
    RGB24PackedToPlanar(src1 + 3 * 8, &r, &g, &b);
    ToY_16x16(&r, &g, &b, y + 1 * 64 + 0, &r_acc2, &g_acc2, &b_acc2, false);
    RGB24PackedToPlanar(src2 + 0 * 8, &r, &g, &b);
    ToY_16x16(&r, &g, &b, y + 0 * 64 + 8, &r_acc1, &g_acc1, &b_acc1, true);
    RGB24PackedToPlanar(src2 + 3 * 8, &r, &g, &b);
    ToY_16x16(&r, &g, &b, y + 1 * 64 + 8, &r_acc2, &g_acc2, &b_acc2, true);
    Condense16To8(&r_acc1, &r_acc2);
    Condense16To8(&g_acc1, &g_acc2);
    Condense16To8(&b_acc1, &b_acc2);
    ToUV_8x8(&r_acc2, &g_acc2, &b_acc2, uv);
    y += 2 * 8;
    uv += 8;
  }
}

static void Get16x16Block_SSE2(const uint8_t* data, int step, int16_t* blocks) {
  Get16x8_SSE2(data + 0 * step, step, blocks + 0 * 64, blocks + 4 * 64 + 0 * 8);
  Get16x8_SSE2(data + 8 * step, step, blocks + 2 * 64, blocks + 4 * 64 + 4 * 8);
}

#undef LOAD_16
#undef STORE_16

#endif   // SJPEG_USE_SSE2

///////////////////////////////////////////////////////////////////////////////
// NEON-version for 8x8 and 16x16 blocks

#if defined(SJPEG_USE_NEON)

static const int16_t kCoeff1[4] = { (int16_t)38469, 19595, 7471, 0 };
static const int16_t kCoeff2[4] = { 21709, 11059, 27439, 5329 };
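// Note: 38469 wraps to a negative value when stored as int16_t; the widening
// multiply in MULT_S32_S16_LARGE below reinterprets that lane as unsigned, so
// the full 16-bit magnitude is still applied.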

// Convert 8 packed RGB or BGR samples to r[], g[], b[]
static void RGB24PackedToPlanar(const uint8_t* const rgb,
                                int16x8_t* const r,
                                int16x8_t* const g,
                                int16x8_t* const b) {
  const uint8x8x3_t in = vld3_u8(rgb);
  *r = vreinterpretq_s16_u16(vmovl_u8(in.val[0]));
  *g = vreinterpretq_s16_u16(vmovl_u8(in.val[1]));
  *b = vreinterpretq_s16_u16(vmovl_u8(in.val[2]));
}

// s16->s32 widening multiply with large (>=32768) coeff requires special care:
#define MULT_S32_S16_LARGE(S16, COEFF, LANE) \
  vreinterpretq_s32_u32(vmull_lane_u16(vreinterpret_u16_s16(S16), \
                                       vreinterpret_u16_s16(COEFF), (LANE)))

static inline void ConvertRGBToY(const int16x8_t R,
                                 const int16x8_t G,
                                 const int16x8_t B,
                                 const int16x4_t coeffs,
                                 int16x8_t* const Y) {
  int32x4_t lo = MULT_S32_S16_LARGE(vget_low_s16(G), coeffs, 0);
  int32x4_t hi = MULT_S32_S16_LARGE(vget_high_s16(G), coeffs, 0);
  lo = vmlal_lane_s16(lo, vget_low_s16(R), coeffs, 1);
  hi = vmlal_lane_s16(hi, vget_high_s16(R), coeffs, 1);
  lo = vmlal_lane_s16(lo, vget_low_s16(B), coeffs, 2);
  hi = vmlal_lane_s16(hi, vget_high_s16(B), coeffs, 2);
  const int16x8_t V0 = vcombine_s16(vqmovn_s32(vrshrq_n_s32(lo, FRAC)),
                                    vqmovn_s32(vrshrq_n_s32(hi, FRAC)));
  *Y = vsubq_s16(V0, vdupq_n_s16(128));
}

// Compute ((V0<<15) - V1 * C1 - V2 * C2 + round) >> SHIFT
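// (the 0.5 weight, 32768 = 1 << 15, is applied with vshll_n_s16(., 15); the
// two negative weights come from the kCoeff2[] lanes via multiply-subtract)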
#define DOT_PROD_PREAMBLE(V0, V1, V2, COEFF, LANE1, LANE2) \
  int32x4_t lo, hi; \
  do { \
    lo = vshll_n_s16(vget_low_s16(V0), 15); \
    hi = vshll_n_s16(vget_high_s16(V0), 15); \
    lo = vmlsl_lane_s16(lo, vget_low_s16(V1), COEFF, LANE1); \
    hi = vmlsl_lane_s16(hi, vget_high_s16(V1), COEFF, LANE1); \
    lo = vmlsl_lane_s16(lo, vget_low_s16(V2), COEFF, LANE2); \
    hi = vmlsl_lane_s16(hi, vget_high_s16(V2), COEFF, LANE2); \
  } while (0)

// This version assumes SHIFT <= 16
#define DOT_PROD1(V0, V1, V2, COEFF, LANE1, LANE2, SHIFT, OUT) do { \
  assert(SHIFT <= 16); \
  DOT_PROD_PREAMBLE(V0, V1, V2, COEFF, LANE1, LANE2); \
  (OUT) = vcombine_s16(vrshrn_n_s32(lo, SHIFT), vrshrn_n_s32(hi, SHIFT)); \
} while (0)

// alternate version for SHIFT > 16
#define DOT_PROD2(V0, V1, V2, COEFF, LANE1, LANE2, SHIFT, OUT) do { \
  assert(SHIFT > 16); \
  DOT_PROD_PREAMBLE(V0, V1, V2, COEFF, LANE1, LANE2); \
  (OUT) = vcombine_s16(vqmovn_s32(vrshrq_n_s32(lo, SHIFT)), \
                       vqmovn_s32(vrshrq_n_s32(hi, SHIFT))); \
} while (0)

static inline void ConvertRGBToUV(const int16x8_t R,
                                  const int16x8_t G,
                                  const int16x8_t B,
                                  const int16x4_t coeffs,
                                  int16x8_t* const U, int16x8_t* const V) {
  DOT_PROD1(B, G, R, coeffs, 0, 1, FRAC, *U);
  DOT_PROD1(R, G, B, coeffs, 2, 3, FRAC, *V);
}

static inline void ConvertRGBToUVAccumulated(const int16x8_t R,
                                             const int16x8_t G,
                                             const int16x8_t B,
                                             const int16x4_t coeffs,
                                             int16x8_t* const U,
                                             int16x8_t* const V) {
  DOT_PROD2(B, G, R, coeffs, 0, 1, FRAC + 2, *U);
  DOT_PROD2(R, G, B, coeffs, 2, 3, FRAC + 2, *V);
}

// Convert 8 RGB samples to YUV. out[] points to a 3*64 data block.
static void ToYUV_8(const int16x8_t r, const int16x8_t g, const int16x8_t b,
                    const int16x4_t coeffs1, const int16x4_t coeffs2,
                    int16_t* const out) {
  int16x8_t Y, U, V;
  ConvertRGBToY(r, g, b, coeffs1, &Y);
  ConvertRGBToUV(r, g, b, coeffs2, &U, &V);
  vst1q_s16(out + 0 * 64, Y);
  vst1q_s16(out + 1 * 64, U);
  vst1q_s16(out + 2 * 64, V);
}

static void Get8x8Block_NEON(const uint8_t* data, int step, int16_t* out) {
  const int16x4_t kC1 = vld1_s16(kCoeff1);
  const int16x4_t kC2 = vld1_s16(kCoeff2);
  for (int y = 8; y > 0; --y) {
    int16x8_t r, g, b;
    RGB24PackedToPlanar(data, &r, &g, &b);
    ToYUV_8(r, g, b, kC1, kC2, out);
    out += 8;
    data += step;
  }
}

// Convert 16x16 RGB samples to YUV420
static inline void ToY_16x16(const int16x8_t r,
                             const int16x8_t g,
                             const int16x8_t b,
                             int16_t* const y_out,
                             int16x8_t* const R_acc,
                             int16x8_t* const G_acc,
                             int16x8_t* const B_acc,
                             const int16x4_t coeffs,
                             bool do_add) {
  int16x8_t Y;
  ConvertRGBToY(r, g, b, coeffs, &Y);
  vst1q_s16(y_out, Y);
  if (do_add) {
    *R_acc = vaddq_s16(*R_acc, r);
    *G_acc = vaddq_s16(*G_acc, g);
    *B_acc = vaddq_s16(*B_acc, b);
  } else {   // just store
    *R_acc = r;
    *G_acc = g;
    *B_acc = b;
  }
}

static inline void ToUV_8x8(const int16x8_t R,
                            const int16x8_t G,
                            const int16x8_t B,
                            const int16x4_t coeffs,
                            int16_t* const uv_out) {
  int16x8_t U, V;
  ConvertRGBToUVAccumulated(R, G, B, coeffs, &U, &V);
  vst1q_s16(uv_out + 0 * 64, U);
  vst1q_s16(uv_out + 1 * 64, V);
}

static void Condense16To8(const int16x8_t acc1, int16x8_t* const acc2) {
  const int32x4_t lo = vpaddlq_s16(acc1);
  const int32x4_t hi = vpaddlq_s16(*acc2);
  *acc2 = vcombine_s16(vqmovn_s32(lo), vqmovn_s32(hi));   // pack-saturate
}

// convert a 16x8 RGB strip into two 8x8 luma blocks and the top halves of the
// 8x8 U and V blocks
static void Get16x8_NEON(const uint8_t* src1, int src_stride,
                         int16_t y[4 * 64], int16_t uv[2 * 64]) {
  const int16x4_t kC1 = vld1_s16(kCoeff1);
  const int16x4_t kC2 = vld1_s16(kCoeff2);
  for (int i = 4; i > 0; --i, src1 += 2 * src_stride) {
    int16x8_t r_acc1, r_acc2, g_acc1, g_acc2, b_acc1, b_acc2;
    int16x8_t r, g, b;
    const uint8_t* const src2 = src1 + src_stride;
    RGB24PackedToPlanar(src1 + 0 * 8, &r, &g, &b);
    ToY_16x16(r, g, b, y + 0 * 64 + 0, &r_acc1, &g_acc1, &b_acc1, kC1, false);
    RGB24PackedToPlanar(src1 + 3 * 8, &r, &g, &b);
    ToY_16x16(r, g, b, y + 1 * 64 + 0, &r_acc2, &g_acc2, &b_acc2, kC1, false);
    RGB24PackedToPlanar(src2 + 0 * 8, &r, &g, &b);
    ToY_16x16(r, g, b, y + 0 * 64 + 8, &r_acc1, &g_acc1, &b_acc1, kC1, true);
    RGB24PackedToPlanar(src2 + 3 * 8, &r, &g, &b);
    ToY_16x16(r, g, b, y + 1 * 64 + 8, &r_acc2, &g_acc2, &b_acc2, kC1, true);
    Condense16To8(r_acc1, &r_acc2);
    Condense16To8(g_acc1, &g_acc2);
    Condense16To8(b_acc1, &b_acc2);
    ToUV_8x8(r_acc2, g_acc2, b_acc2, kC2, uv);
    y += 2 * 8;
    uv += 8;
  }
}

static void Get16x16Block_NEON(const uint8_t* data, int step, int16_t* yuv) {
  int16_t* const uv = yuv + 4 * 64;
  Get16x8_NEON(data + 0 * step, step, yuv + 0 * 64, uv + 0 * 8);
  Get16x8_NEON(data + 8 * step, step, yuv + 2 * 64, uv + 4 * 8);
}

#undef MULT_S32_S16_LARGE
#undef DOT_PROD_PREAMBLE
#undef DOT_PROD1
#undef DOT_PROD2

#endif   // SJPEG_USE_NEON

///////////////////////////////////////////////////////////////////////////////
// C-version

// convert rgb_in[3] RGB triplet to Y, and accumulate in *rgb_sum
static inline int16_t ToY(const uint8_t* const rgb_in, int* const rgb_sum) {
  const int r = rgb_in[0];
  const int g = rgb_in[1];
  const int b = rgb_in[2];
  rgb_sum[0] += r;
  rgb_sum[1] += g;
  rgb_sum[2] += b;
  const int y = 19595 * r + 38469 * g + 7471 * b + ROUND_Y;
  return static_cast<int16_t>(y >> FRAC);
}

// convert sum of four rgb triplets to U
static inline int16_t ToU(const int* const rgb) {
  const int u = -11059 * rgb[0] - 21709 * rgb[1] + 32768 * rgb[2] + ROUND_UV;
  return static_cast<int16_t>(u >> (FRAC + 2));
}

// convert sum of four rgb triplets to V
static inline int16_t ToV(const int* const rgb) {
  const int v = 32768 * rgb[0] - 27439 * rgb[1] - 5329 * rgb[2] + ROUND_UV;
  return static_cast<int16_t>(v >> (FRAC + 2));
}

// for 4:4:4 conversion: convert rgb[3] to yuv
static inline void ToYUV(const uint8_t* const rgb, int16_t* const out) {
  const int r = rgb[0];
  const int g = rgb[1];
  const int b = rgb[2];
  const int y = 19595 * r + 38469 * g + 7471 * b + ROUND_Y;
  const int u = -11059 * r - 21709 * g + 32768 * b + HALF;
  const int v = 32768 * r - 27439 * g - 5329 * b + HALF;
  out[0 * 64] = static_cast<int16_t>(y >> FRAC);
  out[1 * 64] = static_cast<int16_t>(u >> FRAC);
  out[2 * 64] = static_cast<int16_t>(v >> FRAC);
}

static void Get8x8Block_C(const uint8_t* data, int step, int16_t* out) {
  for (int y = 8; y > 0; --y) {
    for (int x = 0; x < 8; ++x) {
      ToYUV(data + 3 * x, out + x);
    }
    out += 8;
    data += step;
  }
}

void Get16x8Block_C(const uint8_t* src1, int src_stride,
                    int16_t yblock[4 * 64], int16_t uvblock[2 * 64]) {
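  // Each 2x2 quad of RGB samples is summed into rgb[0] (left 8x8 block) or
  // rgb[1] (right 8x8 block, at byte offset 3 * 8) while its four luma values
  // are written; ToU() / ToV() then reduce the accumulated quad to one chroma
  // sample.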
  for (int y = 8; y > 0; y -= 2) {
    const uint8_t* const src2 = src1 + src_stride;
    for (int x = 0; x < 4; ++x) {
      int rgb[2][3];
      memset(rgb, 0, sizeof(rgb));
      yblock[2 * x    ] = ToY(src1 + 6 * x,     rgb[0]);
      yblock[2 * x + 1] = ToY(src1 + 6 * x + 3, rgb[0]);
      yblock[2 * x + 8] = ToY(src2 + 6 * x,     rgb[0]);
      yblock[2 * x + 9] = ToY(src2 + 6 * x + 3, rgb[0]);
      uvblock[0 * 64 + x] = ToU(rgb[0]);
      uvblock[1 * 64 + x] = ToV(rgb[0]);
      yblock[2 * x     + 64] = ToY(src1 + 3 * 8 + 6 * x,     rgb[1]);
      yblock[2 * x + 1 + 64] = ToY(src1 + 3 * 8 + 6 * x + 3, rgb[1]);
      yblock[2 * x + 8 + 64] = ToY(src2 + 3 * 8 + 6 * x,     rgb[1]);
      yblock[2 * x + 9 + 64] = ToY(src2 + 3 * 8 + 6 * x + 3, rgb[1]);
      uvblock[0 * 64 + x + 4] = ToU(rgb[1]);
      uvblock[1 * 64 + x + 4] = ToV(rgb[1]);
    }
    yblock += 2 * 8;
    uvblock += 8;
    src1 += 2 * src_stride;
  }
}

static void Get16x16Block_C(const uint8_t* rgb, int step, int16_t* yuv) {
  Get16x8Block_C(rgb + 0 * step, step, yuv + 0 * 64, yuv + 4 * 64 + 0 * 8);
  Get16x8Block_C(rgb + 8 * step, step, yuv + 2 * 64, yuv + 4 * 64 + 4 * 8);
}

///////////////////////////////////////////////////////////////////////////////

RGBToYUVBlockFunc GetBlockFunc(bool use_444) {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return use_444 ? Get8x8Block_SSE2
                                     : Get16x16Block_SSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return use_444 ? Get8x8Block_NEON
                                     : Get16x16Block_NEON;
#endif
  return use_444 ? Get8x8Block_C : Get16x16Block_C;   // default
}
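//
// Typical call pattern (sketch only; the real call sites live elsewhere in
// the library, and the names rgb_row, stride and coeffs below are purely
// illustrative):
//
//   const RGBToYUVBlockFunc get_block = GetBlockFunc(use_444);
//   // 4:4:4 -> 3 x 64 int16 coeffs per 8x8 area,
//   // 4:2:0 -> 6 x 64 int16 coeffs per 16x16 area
//   get_block(rgb_row + 3 * x, stride, coeffs);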

///////////////////////////////////////////////////////////////////////////////

namespace {

uint8_t clip_8b(int v) {
  return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
}

int ToY(int r, int g, int b) {
  const int luma = 19595 * r + 38469 * g + 7471 * b;
  return (luma + HALF) >> FRAC;   // no need to clip
}

uint32_t clip_uv(int v) {
  return clip_8b(128 + ((v + HALF) >> FRAC));
}

uint32_t ToU(int r, int g, int b) {
  const int u = -11059 * r - 21709 * g + 32768 * b;
  return clip_uv(u);
}

uint32_t ToV(int r, int g, int b) {
  const int v = 32768 * r - 27439 * g - 5329 * b;
  return clip_uv(v);
}

// (X * 0x0101 >> 16) ~= X / 255
uint32_t Convert(uint32_t v) {
  return (v * (0x0101u * (sjpeg::kRGBSize - 1))) >> 16;
}
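
// Maps an RGB triplet to a linear index in a kRGBSize^3 YUV cube (presumably
// a reduced-resolution lookup/statistics table defined elsewhere in the
// library): each of Y/U/V is first scaled from [0, 255] down to
// [0, kRGBSize - 1] by Convert().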
int ConvertToYUVIndex(const uint8_t* const rgb) {
  const int r = rgb[0];
  const int g = rgb[1];
  const int b = rgb[2];
  const uint32_t y = Convert(ToY(r, g, b));
  const uint32_t u = Convert(ToU(r, g, b));
  const uint32_t v = Convert(ToV(r, g, b));
  return (y + u * sjpeg::kRGBSize + v * sjpeg::kRGBSize * sjpeg::kRGBSize);
}

void RowToIndexC(const uint8_t* rgb, int width, uint16_t* dst) {
  for (int i = 0; i < width; ++i, rgb += 3) {
    dst[i] = ConvertToYUVIndex(rgb);
  }
}

#if defined(SJPEG_USE_SSE2)
void RowToIndexSSE2(const uint8_t* rgb, int width, uint16_t* dst) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult = _mm_set1_epi16(0x0101u * (sjpeg::kRGBSize - 1));
  const __m128i mult1 = _mm_set1_epi16(sjpeg::kRGBSize);
  const __m128i mult2 = _mm_set1_epi16(sjpeg::kRGBSize * sjpeg::kRGBSize);
  const __m128i k255 = _mm_set1_epi16(255);
  while (width >= 8) {
    __m128i r, g, b;
    __m128i Y, U, V;
    RGB24PackedToPlanar(rgb, &r, &g, &b);
    ConvertRGBToY(&r, &g, &b, 0, &Y);
    ConvertRGBToUV(&r, &g, &b, 128, &U, &V);
    // clamping to [0, 255]
    const __m128i y1 = _mm_min_epi16(_mm_max_epi16(Y, zero), k255);
    const __m128i u1 = _mm_min_epi16(_mm_max_epi16(U, zero), k255);
    const __m128i v1 = _mm_min_epi16(_mm_max_epi16(V, zero), k255);
    // convert to idx
    const __m128i y2 = _mm_mulhi_epi16(y1, mult);
    const __m128i u2 = _mm_mulhi_epi16(u1, mult);
    const __m128i v2 = _mm_mulhi_epi16(v1, mult);
    // store final idx
    const __m128i u3 = _mm_mullo_epi16(u2, mult1);
    const __m128i v3 = _mm_mullo_epi16(v2, mult2);
    const __m128i tmp = _mm_add_epi16(y2, u3);
    const __m128i idx = _mm_add_epi16(tmp, v3);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), idx);

    rgb += 3 * 8;
    dst += 8;
    width -= 8;
  }
  if (width > 0) RowToIndexC(rgb, width, dst);
}
#elif defined(SJPEG_USE_NEON)
void RowToIndexNEON(const uint8_t* rgb, int width, uint16_t* dst) {
  const int16x8_t k128 = vdupq_n_s16(128);
  const uint8x8_t mult = vdup_n_u8(sjpeg::kRGBSize - 1);
  const uint16x8_t mult1 = vdupq_n_u16(sjpeg::kRGBSize);
  const uint16x8_t mult2 = vdupq_n_u16(sjpeg::kRGBSize * sjpeg::kRGBSize);
  const int16x4_t coeffs1 = vld1_s16(kCoeff1);
  const int16x4_t coeffs2 = vld1_s16(kCoeff2);
  while (width >= 8) {
    int16x8_t r, g, b;
    int16x8_t Y, U, V;
    RGB24PackedToPlanar(rgb, &r, &g, &b);
    ConvertRGBToY(r, g, b, coeffs1, &Y);
    ConvertRGBToUV(r, g, b, coeffs2, &U, &V);
    // clamping to [0, 255]
    const uint8x8_t y1 = vqmovun_s16(vaddq_s16(Y, k128));
    const uint8x8_t u1 = vqmovun_s16(vaddq_s16(U, k128));
    const uint8x8_t v1 = vqmovun_s16(vaddq_s16(V, k128));
    // convert to idx
    const uint16x8_t y2 = vmull_u8(y1, mult);
    const uint16x8_t u2 = vmull_u8(u1, mult);
    const uint16x8_t v2 = vmull_u8(v1, mult);
    // divide by 255 using v/255 = (v * 0x0101) >> 16
    const uint16x8_t y3 = vshrq_n_u16(vsraq_n_u16(y2, y2, 8), 8);
    const uint16x8_t u3 = vshrq_n_u16(vsraq_n_u16(u2, u2, 8), 8);
    const uint16x8_t v3 = vshrq_n_u16(vsraq_n_u16(v2, v2, 8), 8);
    // store final idx
    const uint16x8_t tmp = vmlaq_u16(y3, u3, mult1);
    const uint16x8_t idx = vmlaq_u16(tmp, v3, mult2);
    vst1q_u16(dst, idx);

    rgb += 3 * 8;
    dst += 8;
    width -= 8;
  }
  if (width > 0) RowToIndexC(rgb, width, dst);
}
#endif   // SJPEG_USE_NEON

}   // namespace


RGBToIndexRowFunc GetRowFunc() {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return RowToIndexSSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return RowToIndexNEON;
#endif
  return RowToIndexC;
}

}   // namespace sjpeg