sjpeg 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,698 @@
1
+ // Copyright 2017 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Enhanced RGB->YUV conversion functions
16
+ //
17
+ // Author: Skal (pascal.massimino@gmail.com)
18
+
19
+ #include <math.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <memory>
23
+ #include <vector>
24
+ using std::vector;
25
+
26
+ #define SJPEG_NEED_ASM_HEADERS
27
+ #include "sjpegi.h"
28
+
29
+ namespace sjpeg {
30
+
31
+ // We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
32
+ // banding sometimes. Better use extra precision.
33
// Number of extra fractional bits carried by the R/G/B and Y/W samples.
#define SFIX 2
// Half of one unit at SFIX precision.
#define SHALF ((1 << SFIX) >> 1)
// Largest sample value: 8 integer bits followed by SFIX fractional bits.
#define MAX_Y_T ((256 << SFIX) - 1)
// Signed 16-bit sample type (SFIX extra precision), used for UV.
typedef int16_t fixed_t;
// Unsigned 16-bit sample type (SFIX extra precision), used for W.
typedef uint16_t fixed_y_t;
38
+
39
+ static fixed_y_t clip_y(int y) {
40
+ return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
41
+ }
42
+
43
+ ////////////////////////////////////////////////////////////////////////////////
44
+ // Helper functions for Y/U/V fixed-point calculations.
45
+
46
+ // The following functions convert r/g/b values in SFIX fixed-point precision
47
+ // to 8b values, clipped:
48
// Fractional bits of the integer RGB->YUV matrix coefficients.
#define YUV_FIX 16
// Total bits to drop when the inputs also carry SFIX fractional bits.
#define TFIX (YUV_FIX + SFIX)
// Rounding constant (one half) for a right shift by TFIX.
#define TROUNDER ((1 << TFIX) >> 1)
51
+
52
// Saturates 'v' to [0, 255] and returns it as an 8-bit sample.
static uint8_t clip_8b(int v) {
  if (v < 0) return 0u;
  if (v > 0xff) return 255u;
  return static_cast<uint8_t>(v);
}
55
+
56
+ static uint8_t ConvertRGBToY(int r, int g, int b) {
57
+ const int luma = 19595 * r + 38469 * g + 7471 * b + TROUNDER;
58
+ return clip_8b(luma >> TFIX);
59
+ }
60
+
61
+ static uint8_t ConvertRGBToU(int r, int g, int b) {
62
+ const int u = -11058 * r - 21709 * g + 32768 * b + TROUNDER;
63
+ return clip_8b(128 + (u >> TFIX));
64
+ }
65
+
66
+ static uint8_t ConvertRGBToV(int r, int g, int b) {
67
+ const int v = +32768 * r - 27439 * g - 5328 * b + TROUNDER;
68
+ return clip_8b(128 + (v >> TFIX));
69
+ }
70
+
71
+ // convert to luma using 16b precision:
72
+ static void ConvertRowToY(const uint8_t* row, int w, uint8_t* const dst) {
73
+ for (int i = 0; i < w; i += 1, row += 3) {
74
+ const int r = row[0], g = row[1], b = row[2];
75
+ const int y = 19595 * r + 38469 * g + 7471 * b;
76
+ dst[i] = (y + (1 << YUV_FIX >> 1)) >> YUV_FIX;
77
+ }
78
+ }
79
+
80
+ static void ConvertRowToUV(const uint8_t* row1, const uint8_t* row2,
81
+ int w, uint8_t* u, uint8_t* v) {
82
+ for (int i = 0; i < (w & ~1); i += 2, row1 += 6, row2 += 6) {
83
+ const int r = row1[0] + row1[3] + row2[0] + row2[3];
84
+ const int g = row1[1] + row1[4] + row2[1] + row2[4];
85
+ const int b = row1[2] + row1[5] + row2[2] + row2[5];
86
+ *u++ = ConvertRGBToU(r, g, b);
87
+ *v++ = ConvertRGBToV(r, g, b);
88
+ }
89
+ if (w & 1) {
90
+ const int r = 2 * (row1[0] + row2[0]);
91
+ const int g = 2 * (row1[1] + row2[1]);
92
+ const int b = 2 * (row1[2] + row2[2]);
93
+ *u++ = ConvertRGBToU(r, g, b);
94
+ *v++ = ConvertRGBToV(r, g, b);
95
+ }
96
+ }
97
+
98
// The helpers above are the only users of these macros. TROUNDER expands to
// an expression using TFIX, so both are retired together.
#undef TFIX
#undef TROUNDER
100
+
101
+ ////////////////////////////////////////////////////////////////////////////////
102
+ // Sharp RGB->YUV conversion
103
+
104
+ static const int kNumIterations = 4;
105
+ static const int kMinDimensionIterativeConversion = 4;
106
+
107
+ // size of the interpolation table for linear-to-gamma
108
+ #define GAMMA_TABLE_SIZE 32
109
+ static uint32_t kLinearToGammaTab[GAMMA_TABLE_SIZE + 2];
110
+ #define GAMMA_TO_LINEAR_BITS 14
111
+ static uint32_t kGammaToLinearTab[MAX_Y_T + 1]; // size scales with Y_FIX
112
+
113
+ static void InitGammaTablesF(void) {
114
+ static bool done = false;
115
+ assert(2 * GAMMA_TO_LINEAR_BITS < 32); // we use uint32_t intermediate values
116
+ if (!done) {
117
+ int v;
118
+ const double norm = 1. / MAX_Y_T;
119
+ const double scale = 1. / GAMMA_TABLE_SIZE;
120
+ const double a = 0.099;
121
+ const double thresh = 0.018;
122
+ const double gamma = 1. / 0.45;
123
+ const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
124
+ for (v = 0; v <= MAX_Y_T; ++v) {
125
+ const double g = norm * v;
126
+ double value;
127
+ if (g <= thresh * 4.5) {
128
+ value = g / 4.5;
129
+ } else {
130
+ const double a_rec = 1. / (1. + a);
131
+ value = pow(a_rec * (g + a), gamma);
132
+ }
133
+ kGammaToLinearTab[v] = static_cast<uint32_t>(value * final_scale + .5);
134
+ }
135
+ for (v = 0; v <= GAMMA_TABLE_SIZE; ++v) {
136
+ const double g = scale * v;
137
+ double value;
138
+ if (g <= thresh) {
139
+ value = 4.5 * g;
140
+ } else {
141
+ value = (1. + a) * pow(g, 1. / gamma) - a;
142
+ }
143
+ // we already incorporate the 1/2 rounding constant here
144
+ kLinearToGammaTab[v] =
145
+ static_cast<uint32_t>(MAX_Y_T * value)
146
+ + (1 << GAMMA_TO_LINEAR_BITS >> 1);
147
+ }
148
+ // to prevent small rounding errors to cause read-overflow:
149
+ kLinearToGammaTab[GAMMA_TABLE_SIZE + 1] =
150
+ kLinearToGammaTab[GAMMA_TABLE_SIZE];
151
+ done = true;
152
+ }
153
+ }
154
+
155
+ // return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
156
+ static uint32_t GammaToLinear(int v) { return kGammaToLinearTab[v]; }
157
+
158
+ static uint32_t LinearToGamma(uint32_t value) {
159
+ // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
160
+ const uint32_t v = value * GAMMA_TABLE_SIZE;
161
+ const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
162
+ // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
163
+ const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS); // fractional part
164
+ // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
165
+ const uint32_t v0 = kLinearToGammaTab[tab_pos + 0];
166
+ const uint32_t v1 = kLinearToGammaTab[tab_pos + 1];
167
+ // Final interpolation. Note that rounding is already included.
168
+ const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0.
169
+ const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
170
+ return result;
171
+ }
172
+
173
+ //------------------------------------------------------------------------------
174
+
175
+ static uint64_t SharpUpdateY_C(const uint16_t* ref, const uint16_t* src,
176
+ uint16_t* dst, int len) {
177
+ uint64_t diff = 0;
178
+ for (int i = 0; i < len; ++i) {
179
+ const int diff_y = ref[i] - src[i];
180
+ const int new_y = static_cast<int>(dst[i]) + diff_y;
181
+ dst[i] = clip_y(new_y);
182
+ diff += (uint64_t)abs(diff_y);
183
+ }
184
+ return diff;
185
+ }
186
+
187
// Adds (ref[k] - src[k]) to dst[k] for k in [0, len). No clipping is applied;
// the result is stored back as a 16-bit value.
static void SharpUpdateRGB_C(const int16_t* ref, const int16_t* src,
                             int16_t* dst, int len) {
  for (int k = 0; k < len; ++k) {
    const int delta = ref[k] - src[k];
    dst[k] = static_cast<int16_t>(dst[k] + delta);
  }
}
194
+
195
+ static void SharpFilterRow_C(const int16_t* A, const int16_t* B, int len,
196
+ const uint16_t* best_y, uint16_t* out) {
197
+ for (int i = 0; i < len; ++i, ++A, ++B) {
198
+ const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
199
+ const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
200
+ out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
201
+ out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
202
+ }
203
+ }
204
+
205
+ #if defined(SJPEG_USE_SSE2)
206
+
207
// Unaligned 128-bit load from pointer P.
#define LOAD_16(P) \
  (_mm_loadu_si128(reinterpret_cast<const __m128i*>(P)))
// Unaligned 128-bit store of V to pointer P.
#define STORE_16(P, V) \
  (_mm_storeu_si128(reinterpret_cast<__m128i*>(P), (V)))
209
+
210
+ static uint64_t SharpUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
211
+ uint16_t* dst, int len) {
212
+ uint64_t diff = 0;
213
+ uint32_t tmp[4];
214
+ int i;
215
+ const __m128i zero = _mm_setzero_si128();
216
+ const __m128i max = _mm_set1_epi16(MAX_Y_T);
217
+ const __m128i one = _mm_set1_epi16(1);
218
+ __m128i sum = zero;
219
+
220
+ for (i = 0; i + 8 <= len; i += 8) {
221
+ const __m128i A = LOAD_16(ref + i);
222
+ const __m128i B = LOAD_16(src + i);
223
+ const __m128i C = LOAD_16(dst + i);
224
+ const __m128i D = _mm_sub_epi16(A, B); // diff_y
225
+ const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
226
+ const __m128i F = _mm_add_epi16(C, D); // new_y
227
+ const __m128i G = _mm_or_si128(E, one); // -1 or 1
228
+ const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
229
+ const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
230
+ STORE_16(dst + i, H);
231
+ sum = _mm_add_epi32(sum, I);
232
+ }
233
+ STORE_16(tmp, sum);
234
+ diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
235
+ for (; i < len; ++i) {
236
+ const int diff_y = ref[i] - src[i];
237
+ const int new_y = static_cast<int>(dst[i]) + diff_y;
238
+ dst[i] = clip_y(new_y);
239
+ diff += (uint64_t)abs(diff_y);
240
+ }
241
+ return diff;
242
+ }
243
+
244
+ static void SharpUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
245
+ int16_t* dst, int len) {
246
+ int i = 0;
247
+ for (i = 0; i + 8 <= len; i += 8) {
248
+ const __m128i A = LOAD_16(ref + i);
249
+ const __m128i B = LOAD_16(src + i);
250
+ const __m128i C = LOAD_16(dst + i);
251
+ const __m128i D = _mm_sub_epi16(A, B); // diff_uv
252
+ const __m128i E = _mm_add_epi16(C, D); // new_uv
253
+ STORE_16(dst + i, E);
254
+ }
255
+ for (; i < len; ++i) {
256
+ const int diff_uv = ref[i] - src[i];
257
+ dst[i] += diff_uv;
258
+ }
259
+ }
260
+
261
+ static void SharpFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
262
+ const uint16_t* best_y, uint16_t* out) {
263
+ int i;
264
+ const __m128i kCst8 = _mm_set1_epi16(8);
265
+ const __m128i max = _mm_set1_epi16(MAX_Y_T);
266
+ const __m128i zero = _mm_setzero_si128();
267
+ for (i = 0; i + 8 <= len; i += 8) {
268
+ const __m128i a0 = LOAD_16(A + i + 0);
269
+ const __m128i a1 = LOAD_16(A + i + 1);
270
+ const __m128i b0 = LOAD_16(B + i + 0);
271
+ const __m128i b1 = LOAD_16(B + i + 1);
272
+ const __m128i a0b1 = _mm_add_epi16(a0, b1);
273
+ const __m128i a1b0 = _mm_add_epi16(a1, b0);
274
+ const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
275
+ const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
276
+ const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
277
+ const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
278
+ const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
279
+ const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
280
+ const __m128i d0 = _mm_add_epi16(c1, a0);
281
+ const __m128i d1 = _mm_add_epi16(c0, a1);
282
+ const __m128i e0 = _mm_srai_epi16(d0, 1);
283
+ const __m128i e1 = _mm_srai_epi16(d1, 1);
284
+ const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
285
+ const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
286
+ const __m128i g0 = LOAD_16(best_y + 2 * i + 0);
287
+ const __m128i g1 = LOAD_16(best_y + 2 * i + 8);
288
+ const __m128i h0 = _mm_add_epi16(g0, f0);
289
+ const __m128i h1 = _mm_add_epi16(g1, f1);
290
+ const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
291
+ const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
292
+ STORE_16(out + 2 * i + 0, i0);
293
+ STORE_16(out + 2 * i + 8, i1);
294
+ }
295
+ for (; i < len; ++i) {
296
+ // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
297
+ // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
298
+ // We reuse the common sub-expressions.
299
+ const int a0b1 = A[i + 0] + B[i + 1];
300
+ const int a1b0 = A[i + 1] + B[i + 0];
301
+ const int a0a1b0b1 = a0b1 + a1b0 + 8;
302
+ const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
303
+ const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
304
+ out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
305
+ out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
306
+ }
307
+ }
308
+ #undef STORE_16
309
+ #undef LOAD_16
310
+
311
+ #elif defined(SJPEG_USE_NEON)
312
+
313
+ static uint64_t SharpUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
314
+ uint16_t* dst, int len) {
315
+ int i;
316
+ const int16x8_t zero = vdupq_n_s16(0);
317
+ const int16x8_t max = vdupq_n_s16(MAX_Y_T);
318
+ uint64x2_t sum = vdupq_n_u64(0);
319
+
320
+ for (i = 0; i + 8 <= len; i += 8) {
321
+ const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
322
+ const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
323
+ const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
324
+ const int16x8_t D = vsubq_s16(A, B); // diff_y
325
+ const int16x8_t F = vaddq_s16(C, D); // new_y
326
+ const uint16x8_t H =
327
+ vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
328
+ const int16x8_t I = vabsq_s16(D); // abs(diff_y)
329
+ vst1q_u16(dst + i, H);
330
+ sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
331
+ }
332
+ uint64_t diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
333
+ for (; i < len; ++i) {
334
+ const int diff_y = ref[i] - src[i];
335
+ const int new_y = static_cast<int>(dst[i]) + diff_y;
336
+ dst[i] = clip_y(new_y);
337
+ diff += static_cast<uint64_t>(abs(diff_y));
338
+ }
339
+ return diff;
340
+ }
341
+
342
+ static void SharpUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
343
+ int16_t* dst, int len) {
344
+ int i;
345
+ for (i = 0; i + 8 <= len; i += 8) {
346
+ const int16x8_t A = vld1q_s16(ref + i);
347
+ const int16x8_t B = vld1q_s16(src + i);
348
+ const int16x8_t C = vld1q_s16(dst + i);
349
+ const int16x8_t D = vsubq_s16(A, B); // diff_uv
350
+ const int16x8_t E = vaddq_s16(C, D); // new_uv
351
+ vst1q_s16(dst + i, E);
352
+ }
353
+ for (; i < len; ++i) {
354
+ const int diff_uv = ref[i] - src[i];
355
+ dst[i] += diff_uv;
356
+ }
357
+ }
358
+
359
+ static void SharpFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
360
+ const uint16_t* best_y, uint16_t* out) {
361
+ int i;
362
+ const int16x8_t max = vdupq_n_s16(MAX_Y_T);
363
+ const int16x8_t zero = vdupq_n_s16(0);
364
+ for (i = 0; i + 8 <= len; i += 8) {
365
+ const int16x8_t a0 = vld1q_s16(A + i + 0);
366
+ const int16x8_t a1 = vld1q_s16(A + i + 1);
367
+ const int16x8_t b0 = vld1q_s16(B + i + 0);
368
+ const int16x8_t b1 = vld1q_s16(B + i + 1);
369
+ const int16x8_t a0b1 = vaddq_s16(a0, b1);
370
+ const int16x8_t a1b0 = vaddq_s16(a1, b0);
371
+ const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
372
+ const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
373
+ const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
374
+ const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
375
+ const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
376
+ const int16x8_t d0 = vaddq_s16(c1, a0);
377
+ const int16x8_t d1 = vaddq_s16(c0, a1);
378
+ const int16x8_t e0 = vrshrq_n_s16(d0, 1);
379
+ const int16x8_t e1 = vrshrq_n_s16(d1, 1);
380
+ const int16x8x2_t f = vzipq_s16(e0, e1);
381
+ const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
382
+ const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
383
+ const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
384
+ const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
385
+ const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
386
+ const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
387
+ vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
388
+ vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
389
+ }
390
+ for (; i < len; ++i) {
391
+ const int a0b1 = A[i + 0] + B[i + 1];
392
+ const int a1b0 = A[i + 1] + B[i + 0];
393
+ const int a0a1b0b1 = a0b1 + a1b0 + 8;
394
+ const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
395
+ const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
396
+ out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
397
+ out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
398
+ }
399
+ }
400
+
401
+ #endif // SJPEG_USE_NEON
402
+
403
// Entry points for the three row kernels; they stay null until assigned.
// Parameter names follow the implementations: 'ref' is the target the result
// should move towards, 'src' is the current reconstruction, and 'dst' is
// updated in place by (ref - src).

// Updates 'len' luma samples; returns the sum of |ref[i] - src[i]|.
static uint64_t (*kSharpUpdateY)(const uint16_t* ref, const uint16_t* src,
                                 uint16_t* dst, int len);
// Updates 'len' chroma-like samples (no clipping).
static void (*kSharpUpdateRGB)(const int16_t* ref, const int16_t* src,
                               int16_t* dst, int len);
// Filters rows A and B (9/3/3/1 weights), adds best_y and writes 2 * len
// clipped samples to 'out'.
static void (*kSharpFilterRow)(const int16_t* A, const int16_t* B,
                               int len, const uint16_t* best_y, uint16_t* out);
409
+
410
+ static void InitFunctionPointers() {
411
+ static bool done = false;
412
+ if (!done) {
413
+ kSharpUpdateY = SharpUpdateY_C;
414
+ kSharpUpdateRGB = SharpUpdateRGB_C;
415
+ kSharpFilterRow = SharpFilterRow_C;
416
+ #if defined(SJPEG_USE_SSE2)
417
+ if (sjpeg::SupportsSSE2()) {
418
+ kSharpUpdateY = SharpUpdateY_SSE2;
419
+ kSharpUpdateRGB = SharpUpdateRGB_SSE2;
420
+ kSharpFilterRow = SharpFilterRow_SSE2;
421
+ }
422
+ #endif
423
+ #if defined(SJPEG_USE_NEON)
424
+ if (sjpeg::SupportsNEON()) {
425
+ kSharpUpdateY = SharpUpdateY_NEON;
426
+ kSharpUpdateRGB = SharpUpdateRGB_NEON;
427
+ kSharpFilterRow = SharpFilterRow_NEON;
428
+ }
429
+ #endif
430
+ done = true;
431
+ }
432
+ }
433
+
434
+ //------------------------------------------------------------------------------
435
+
436
+ static uint32_t RGBToGray(uint32_t r, uint32_t g, uint32_t b) {
437
+ const uint32_t luma = 13933 * r + 46871 * g + 4732 * b + (1u << YUV_FIX >> 1);
438
+ return (luma >> YUV_FIX);
439
+ }
440
+
441
+ static uint32_t ScaleDown(int a, int b, int c, int d) {
442
+ const uint32_t A = GammaToLinear(a);
443
+ const uint32_t B = GammaToLinear(b);
444
+ const uint32_t C = GammaToLinear(c);
445
+ const uint32_t D = GammaToLinear(d);
446
+ return LinearToGamma((A + B + C + D + 2) >> 2);
447
+ }
448
+
449
+ static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
450
+ fixed_t* dst, size_t uv_w) {
451
+ for (size_t i = 0; i < uv_w; ++i) {
452
+ const uint32_t r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
453
+ src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
454
+ const uint32_t g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
455
+ src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
456
+ const uint32_t b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
457
+ src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
458
+ const int W = RGBToGray(r, g, b);
459
+ dst[0 * uv_w] = (fixed_t)(r - W);
460
+ dst[1 * uv_w] = (fixed_t)(g - W);
461
+ dst[2 * uv_w] = (fixed_t)(b - W);
462
+ dst += 1;
463
+ src1 += 2;
464
+ src2 += 2;
465
+ }
466
+ }
467
+
468
+ static void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
469
+ for (int i = 0; i < w; ++i) {
470
+ const uint32_t R = GammaToLinear(src[0 * w + i]);
471
+ const uint32_t G = GammaToLinear(src[1 * w + i]);
472
+ const uint32_t B = GammaToLinear(src[2 * w + i]);
473
+ const uint32_t Y = RGBToGray(R, G, B);
474
+ dst[i] = (fixed_y_t)LinearToGamma(Y);
475
+ }
476
+ }
477
+
478
+ static void StoreGray(const fixed_y_t* const rgb, fixed_y_t* const y, int w) {
479
+ for (int i = 0; i < w; ++i) {
480
+ y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
481
+ }
482
+ }
483
+
484
+ //------------------------------------------------------------------------------
485
+
486
+ static fixed_y_t Filter2(int A, int B, int W0) {
487
+ const int v0 = (A * 3 + B + 2) >> 2;
488
+ return clip_y(v0 + W0);
489
+ }
490
+
491
+ //------------------------------------------------------------------------------
492
+
493
+ static fixed_y_t UpLift(uint8_t a) { // 8bit -> SFIX
494
+ return ((fixed_y_t)a << SFIX) | SHALF;
495
+ }
496
+
497
+ static void ImportOneRow(const uint8_t* const rgb, int pic_width,
498
+ fixed_y_t* const dst) {
499
+ const int w = (pic_width + 1) & ~1;
500
+ for (int i = 0; i < pic_width; ++i) {
501
+ const int off = i * 3;
502
+ dst[i + 0 * w] = UpLift(rgb[off + 0]);
503
+ dst[i + 1 * w] = UpLift(rgb[off + 1]);
504
+ dst[i + 2 * w] = UpLift(rgb[off + 2]);
505
+ }
506
+ if (pic_width & 1) { // replicate rightmost pixel
507
+ dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
508
+ dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
509
+ dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
510
+ }
511
+ }
512
+
513
+ static void InterpolateTwoRows(const fixed_y_t* const best_y,
514
+ const fixed_t* prev_uv,
515
+ const fixed_t* cur_uv,
516
+ const fixed_t* next_uv,
517
+ int w,
518
+ fixed_y_t* out1, fixed_y_t* out2) {
519
+ const int uv_w = w >> 1;
520
+ const int len = (w - 1) >> 1; // length to filter
521
+ for (int k = 3; k > 0; --k) { // process each R/G/B segments in turn
522
+ // special boundary case for i==0
523
+ out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
524
+ out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
525
+
526
+ kSharpFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
527
+ kSharpFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
528
+
529
+ // special boundary case for i == w - 1 when w is even
530
+ if (!(w & 1)) {
531
+ out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
532
+ best_y[w - 1 + 0]);
533
+ out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
534
+ best_y[w - 1 + w]);
535
+ }
536
+ out1 += w;
537
+ out2 += w;
538
+ prev_uv += uv_w;
539
+ cur_uv += uv_w;
540
+ next_uv += uv_w;
541
+ }
542
+ }
543
+
544
+ static void ConvertWRGBToYUV(const fixed_y_t* best_y,
545
+ const fixed_t* best_uv,
546
+ int width, int height,
547
+ uint8_t* y_plane,
548
+ uint8_t* u_plane, uint8_t* v_plane) {
549
+ const int w = (width + 1) & ~1;
550
+ const int h = (height + 1) & ~1;
551
+ const int uv_w = w >> 1;
552
+ const int uv_h = h >> 1;
553
+ for (int j = 0; j < height; ++j) {
554
+ const int off = (j >> 1) * 3 * uv_w;
555
+ for (int i = 0; i < width; ++i) {
556
+ const int W = best_y[i + j * w];
557
+ const int r = best_uv[off + (i >> 1) + 0 * uv_w] + W;
558
+ const int g = best_uv[off + (i >> 1) + 1 * uv_w] + W;
559
+ const int b = best_uv[off + (i >> 1) + 2 * uv_w] + W;
560
+ y_plane[i] = ConvertRGBToY(r, g, b);
561
+ }
562
+ y_plane += width;
563
+ }
564
+ for (int j = 0; j < uv_h; ++j) {
565
+ for (int i = 0; i < uv_w; ++i) {
566
+ const int off = i + j * 3 * uv_w;
567
+ const int r = best_uv[off + 0 * uv_w];
568
+ const int g = best_uv[off + 1 * uv_w];
569
+ const int b = best_uv[off + 2 * uv_w];
570
+ u_plane[i] = ConvertRGBToU(r, g, b);
571
+ v_plane[i] = ConvertRGBToV(r, g, b);
572
+ }
573
+ u_plane += uv_w;
574
+ v_plane += uv_w;
575
+ }
576
+ }
577
+
578
+ //------------------------------------------------------------------------------
579
+ // Main function
580
+
581
+ static void PreprocessARGB(const uint8_t* const rgb,
582
+ int width, int height, size_t stride,
583
+ uint8_t* y_plane,
584
+ uint8_t* u_plane, uint8_t* v_plane) {
585
+ // we expand the right/bottom border if needed
586
+ const int w = (width + 1) & ~1;
587
+ const int h = (height + 1) & ~1;
588
+ const int uv_w = w >> 1;
589
+ const int uv_h = h >> 1;
590
+ uint64_t prev_diff_y_sum = ~0;
591
+
592
+ InitGammaTablesF();
593
+ InitFunctionPointers();
594
+
595
+ // TODO(skal): allocate one big memory chunk instead.
596
+ vector<fixed_y_t> tmp_buffer(w * 3 * 2);
597
+ vector<fixed_y_t> best_y(w * h);
598
+ vector<fixed_y_t> target_y(w * h);
599
+ vector<fixed_y_t> best_rgb_y(w * 2);
600
+ vector<fixed_t> best_uv(uv_w * 3 * uv_h);
601
+ vector<fixed_t> target_uv(uv_w * 3 * uv_h);
602
+ vector<fixed_t> best_rgb_uv(uv_w * 3 * 1);
603
+ const uint64_t diff_y_threshold = static_cast<uint64_t>(3.0 * w * h);
604
+
605
+ assert(width >= kMinDimensionIterativeConversion);
606
+ assert(height >= kMinDimensionIterativeConversion);
607
+
608
+ // Import RGB samples to W/RGB representation.
609
+ for (int j = 0; j < height; j += 2) {
610
+ const int is_last_row = (j == height - 1);
611
+ fixed_y_t* const src1 = &tmp_buffer[0 * w];
612
+ fixed_y_t* const src2 = &tmp_buffer[3 * w];
613
+ const int rgb_off = j * stride;
614
+ const int y_off = j * w;
615
+ const int uv_off = (j >> 1) * 3 * uv_w;
616
+
617
+ // prepare two rows of input
618
+ ImportOneRow(rgb + rgb_off, width, src1);
619
+ if (!is_last_row) {
620
+ ImportOneRow(rgb + rgb_off + stride, width, src2);
621
+ } else {
622
+ memcpy(src2, src1, 3 * w * sizeof(*src2));
623
+ }
624
+ StoreGray(src1, &best_y[y_off + 0], w);
625
+ StoreGray(src2, &best_y[y_off + w], w);
626
+ UpdateW(src1, &target_y[y_off + 0], w);
627
+ UpdateW(src2, &target_y[y_off + w], w);
628
+ UpdateChroma(src1, src2, &target_uv[uv_off], uv_w);
629
+ memcpy(&best_uv[uv_off], &target_uv[uv_off], 3 * uv_w * sizeof(best_uv[0]));
630
+ }
631
+
632
+ // Iterate and resolve clipping conflicts.
633
+ for (int iter = 0; iter < kNumIterations; ++iter) {
634
+ const fixed_t* cur_uv = &best_uv[0];
635
+ const fixed_t* prev_uv = &best_uv[0];
636
+ uint64_t diff_y_sum = 0;
637
+
638
+ for (int j = 0; j < h; j += 2) {
639
+ const int uv_off = (j >> 1) * 3 * uv_w;
640
+ fixed_y_t* const src1 = &tmp_buffer[0 * w];
641
+ fixed_y_t* const src2 = &tmp_buffer[3 * w];
642
+ const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
643
+ InterpolateTwoRows(&best_y[j * w], prev_uv, cur_uv, next_uv,
644
+ w, src1, src2);
645
+ prev_uv = cur_uv;
646
+ cur_uv = next_uv;
647
+
648
+ UpdateW(src1, &best_rgb_y[0 * w], w);
649
+ UpdateW(src2, &best_rgb_y[1 * w], w);
650
+ UpdateChroma(src1, src2, &best_rgb_uv[0], uv_w);
651
+
652
+ // update two rows of Y and one row of RGB
653
+ diff_y_sum += kSharpUpdateY(&target_y[j * w],
654
+ &best_rgb_y[0], &best_y[j * w], 2 * w);
655
+ kSharpUpdateRGB(&target_uv[uv_off],
656
+ &best_rgb_uv[0], &best_uv[uv_off], 3 * uv_w);
657
+ }
658
+ // test exit condition
659
+ if (iter > 0) {
660
+ if (diff_y_sum < diff_y_threshold) break;
661
+ if (diff_y_sum > prev_diff_y_sum) break;
662
+ }
663
+ prev_diff_y_sum = diff_y_sum;
664
+ }
665
+ // final reconstruction
666
+ ConvertWRGBToYUV(&best_y[0], &best_uv[0], width, height,
667
+ y_plane, u_plane, v_plane);
668
+ }
669
+
670
+ } // namespace sjpeg
671
+
672
+ ////////////////////////////////////////////////////////////////////////////////
673
+ // Entry point
674
+
675
+ void sjpeg::ApplySharpYUVConversion(const uint8_t* const rgb,
676
+ int W, int H, int stride,
677
+ uint8_t* y_plane,
678
+ uint8_t* u_plane, uint8_t* v_plane) {
679
+ if (W <= kMinDimensionIterativeConversion ||
680
+ H <= kMinDimensionIterativeConversion) {
681
+ const int uv_w = (W + 1) >> 1;
682
+ for (int y = 0; y < H; y += 2) {
683
+ const uint8_t* const rgb1 = rgb + y * stride;
684
+ const uint8_t* const rgb2 = (y < H - 1) ? rgb1 + stride : rgb1;
685
+ ConvertRowToY(rgb1, W, &y_plane[y * W]);
686
+ if (y < H - 1) {
687
+ ConvertRowToY(rgb2, W, &y_plane[(y + 1) * W]);
688
+ }
689
+ ConvertRowToUV(rgb1, rgb2, W,
690
+ &u_plane[(y >> 1) * uv_w],
691
+ &v_plane[(y >> 1) * uv_w]);
692
+ }
693
+ } else {
694
+ PreprocessARGB(rgb, W, H, stride, y_plane, u_plane, v_plane);
695
+ }
696
+ }
697
+
698
+ ////////////////////////////////////////////////////////////////////////////////
@@ -0,0 +1,3 @@
1
# Namespace of the sjpeg gem.
module Sjpeg
  # Version string of the gem.
  VERSION = '0.1.0'
end