sjpeg 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,698 @@
1
+ // Copyright 2017 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Enhanced RGB->YUV conversion functions
16
+ //
17
+ // Author: Skal (pascal.massimino@gmail.com)
18
+
19
+ #include <math.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <memory>
23
+ #include <vector>
24
+ using std::vector;
25
+
26
+ #define SJPEG_NEED_ASM_HEADERS
27
+ #include "sjpegi.h"
28
+
29
+ namespace sjpeg {
30
+
31
+ // We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
32
+ // banding sometimes. Better use extra precision.
33
+ #define SFIX 2 // fixed-point precision of RGB and Y/W
34
+ #define SHALF (1 << SFIX >> 1)
35
+ #define MAX_Y_T ((256 << SFIX) - 1)
36
+ typedef int16_t fixed_t; // signed type with extra SFIX precision for UV
37
+ typedef uint16_t fixed_y_t; // unsigned type with extra SFIX precision for W
38
+
39
+ static fixed_y_t clip_y(int y) {
40
+ return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
41
+ }
42
+
43
+ ////////////////////////////////////////////////////////////////////////////////
44
+ // Helper functions for Y/U/V fixed-point calculations.
45
+
46
+ // The following functions convert r/g/b values in SFIX fixed-point precision
47
+ // to 8b values, clipped:
48
+ #define YUV_FIX 16
49
+ #define TFIX (YUV_FIX + SFIX)
50
+ #define TROUNDER (1 << TFIX >> 1)
51
+
52
// Clamps 'v' to the 8-bit range [0, 255].
static uint8_t clip_8b(int v) {
  if (v < 0) return 0u;
  if (v > 255) return 255u;
  return static_cast<uint8_t>(v);
}
55
+
56
+ static uint8_t ConvertRGBToY(int r, int g, int b) {
57
+ const int luma = 19595 * r + 38469 * g + 7471 * b + TROUNDER;
58
+ return clip_8b(luma >> TFIX);
59
+ }
60
+
61
+ static uint8_t ConvertRGBToU(int r, int g, int b) {
62
+ const int u = -11058 * r - 21709 * g + 32768 * b + TROUNDER;
63
+ return clip_8b(128 + (u >> TFIX));
64
+ }
65
+
66
+ static uint8_t ConvertRGBToV(int r, int g, int b) {
67
+ const int v = +32768 * r - 27439 * g - 5328 * b + TROUNDER;
68
+ return clip_8b(128 + (v >> TFIX));
69
+ }
70
+
71
+ // convert to luma using 16b precision:
72
+ static void ConvertRowToY(const uint8_t* row, int w, uint8_t* const dst) {
73
+ for (int i = 0; i < w; i += 1, row += 3) {
74
+ const int r = row[0], g = row[1], b = row[2];
75
+ const int y = 19595 * r + 38469 * g + 7471 * b;
76
+ dst[i] = (y + (1 << YUV_FIX >> 1)) >> YUV_FIX;
77
+ }
78
+ }
79
+
80
+ static void ConvertRowToUV(const uint8_t* row1, const uint8_t* row2,
81
+ int w, uint8_t* u, uint8_t* v) {
82
+ for (int i = 0; i < (w & ~1); i += 2, row1 += 6, row2 += 6) {
83
+ const int r = row1[0] + row1[3] + row2[0] + row2[3];
84
+ const int g = row1[1] + row1[4] + row2[1] + row2[4];
85
+ const int b = row1[2] + row1[5] + row2[2] + row2[5];
86
+ *u++ = ConvertRGBToU(r, g, b);
87
+ *v++ = ConvertRGBToV(r, g, b);
88
+ }
89
+ if (w & 1) {
90
+ const int r = 2 * (row1[0] + row2[0]);
91
+ const int g = 2 * (row1[1] + row2[1]);
92
+ const int b = 2 * (row1[2] + row2[2]);
93
+ *u++ = ConvertRGBToU(r, g, b);
94
+ *v++ = ConvertRGBToV(r, g, b);
95
+ }
96
+ }
97
+
98
// The helper macros above are only needed by the conversion functions that
// precede this point. The rounding macro is named TROUNDER (not ROUNDER).
#undef TFIX
#undef TROUNDER
100
+
101
+ ////////////////////////////////////////////////////////////////////////////////
102
+ // Sharp RGB->YUV conversion
103
+
104
+ static const int kNumIterations = 4;
105
+ static const int kMinDimensionIterativeConversion = 4;
106
+
107
+ // size of the interpolation table for linear-to-gamma
108
+ #define GAMMA_TABLE_SIZE 32
109
+ static uint32_t kLinearToGammaTab[GAMMA_TABLE_SIZE + 2];
110
+ #define GAMMA_TO_LINEAR_BITS 14
111
+ static uint32_t kGammaToLinearTab[MAX_Y_T + 1]; // size scales with SFIX
112
+
113
+ static void InitGammaTablesF(void) {
114
+ static bool done = false;
115
+ assert(2 * GAMMA_TO_LINEAR_BITS < 32); // we use uint32_t intermediate values
116
+ if (!done) {
117
+ int v;
118
+ const double norm = 1. / MAX_Y_T;
119
+ const double scale = 1. / GAMMA_TABLE_SIZE;
120
+ const double a = 0.099;
121
+ const double thresh = 0.018;
122
+ const double gamma = 1. / 0.45;
123
+ const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
124
+ for (v = 0; v <= MAX_Y_T; ++v) {
125
+ const double g = norm * v;
126
+ double value;
127
+ if (g <= thresh * 4.5) {
128
+ value = g / 4.5;
129
+ } else {
130
+ const double a_rec = 1. / (1. + a);
131
+ value = pow(a_rec * (g + a), gamma);
132
+ }
133
+ kGammaToLinearTab[v] = static_cast<uint32_t>(value * final_scale + .5);
134
+ }
135
+ for (v = 0; v <= GAMMA_TABLE_SIZE; ++v) {
136
+ const double g = scale * v;
137
+ double value;
138
+ if (g <= thresh) {
139
+ value = 4.5 * g;
140
+ } else {
141
+ value = (1. + a) * pow(g, 1. / gamma) - a;
142
+ }
143
+ // we already incorporate the 1/2 rounding constant here
144
+ kLinearToGammaTab[v] =
145
+ static_cast<uint32_t>(MAX_Y_T * value)
146
+ + (1 << GAMMA_TO_LINEAR_BITS >> 1);
147
+ }
148
+ // to prevent small rounding errors to cause read-overflow:
149
+ kLinearToGammaTab[GAMMA_TABLE_SIZE + 1] =
150
+ kLinearToGammaTab[GAMMA_TABLE_SIZE];
151
+ done = true;
152
+ }
153
+ }
154
+
155
+ // return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
156
+ static uint32_t GammaToLinear(int v) { return kGammaToLinearTab[v]; }
157
+
158
+ static uint32_t LinearToGamma(uint32_t value) {
159
+ // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
160
+ const uint32_t v = value * GAMMA_TABLE_SIZE;
161
+ const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
162
+ // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
163
+ const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS); // fractional part
164
+ // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
165
+ const uint32_t v0 = kLinearToGammaTab[tab_pos + 0];
166
+ const uint32_t v1 = kLinearToGammaTab[tab_pos + 1];
167
+ // Final interpolation. Note that rounding is already included.
168
+ const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0.
169
+ const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
170
+ return result;
171
+ }
172
+
173
+ //------------------------------------------------------------------------------
174
+
175
+ static uint64_t SharpUpdateY_C(const uint16_t* ref, const uint16_t* src,
176
+ uint16_t* dst, int len) {
177
+ uint64_t diff = 0;
178
+ for (int i = 0; i < len; ++i) {
179
+ const int diff_y = ref[i] - src[i];
180
+ const int new_y = static_cast<int>(dst[i]) + diff_y;
181
+ dst[i] = clip_y(new_y);
182
+ diff += (uint64_t)abs(diff_y);
183
+ }
184
+ return diff;
185
+ }
186
+
187
// Adds the per-sample difference (ref[i] - src[i]) to dst[i] for 'len'
// samples. The sum is stored back as int16_t without any clipping.
static void SharpUpdateRGB_C(const int16_t* ref, const int16_t* src,
                             int16_t* dst, int len) {
  int k = 0;
  while (k < len) {
    const int delta = ref[k] - src[k];
    dst[k] = static_cast<int16_t>(dst[k] + delta);
    ++k;
  }
}
194
+
195
+ static void SharpFilterRow_C(const int16_t* A, const int16_t* B, int len,
196
+ const uint16_t* best_y, uint16_t* out) {
197
+ for (int i = 0; i < len; ++i, ++A, ++B) {
198
+ const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
199
+ const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
200
+ out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
201
+ out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
202
+ }
203
+ }
204
+
205
+ #if defined(SJPEG_USE_SSE2)
206
+
207
+ #define LOAD_16(P) (_mm_loadu_si128(reinterpret_cast<const __m128i*>(P)))
208
+ #define STORE_16(P, V) (_mm_storeu_si128(reinterpret_cast<__m128i*>(P), (V)))
209
+
210
+ static uint64_t SharpUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
211
+ uint16_t* dst, int len) {
212
+ uint64_t diff = 0;
213
+ uint32_t tmp[4];
214
+ int i;
215
+ const __m128i zero = _mm_setzero_si128();
216
+ const __m128i max = _mm_set1_epi16(MAX_Y_T);
217
+ const __m128i one = _mm_set1_epi16(1);
218
+ __m128i sum = zero;
219
+
220
+ for (i = 0; i + 8 <= len; i += 8) {
221
+ const __m128i A = LOAD_16(ref + i);
222
+ const __m128i B = LOAD_16(src + i);
223
+ const __m128i C = LOAD_16(dst + i);
224
+ const __m128i D = _mm_sub_epi16(A, B); // diff_y
225
+ const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
226
+ const __m128i F = _mm_add_epi16(C, D); // new_y
227
+ const __m128i G = _mm_or_si128(E, one); // -1 or 1
228
+ const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
229
+ const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
230
+ STORE_16(dst + i, H);
231
+ sum = _mm_add_epi32(sum, I);
232
+ }
233
+ STORE_16(tmp, sum);
234
+ diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
235
+ for (; i < len; ++i) {
236
+ const int diff_y = ref[i] - src[i];
237
+ const int new_y = static_cast<int>(dst[i]) + diff_y;
238
+ dst[i] = clip_y(new_y);
239
+ diff += (uint64_t)abs(diff_y);
240
+ }
241
+ return diff;
242
+ }
243
+
244
+ static void SharpUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
245
+ int16_t* dst, int len) {
246
+ int i = 0;
247
+ for (i = 0; i + 8 <= len; i += 8) {
248
+ const __m128i A = LOAD_16(ref + i);
249
+ const __m128i B = LOAD_16(src + i);
250
+ const __m128i C = LOAD_16(dst + i);
251
+ const __m128i D = _mm_sub_epi16(A, B); // diff_uv
252
+ const __m128i E = _mm_add_epi16(C, D); // new_uv
253
+ STORE_16(dst + i, E);
254
+ }
255
+ for (; i < len; ++i) {
256
+ const int diff_uv = ref[i] - src[i];
257
+ dst[i] += diff_uv;
258
+ }
259
+ }
260
+
261
+ static void SharpFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
262
+ const uint16_t* best_y, uint16_t* out) {
263
+ int i;
264
+ const __m128i kCst8 = _mm_set1_epi16(8);
265
+ const __m128i max = _mm_set1_epi16(MAX_Y_T);
266
+ const __m128i zero = _mm_setzero_si128();
267
+ for (i = 0; i + 8 <= len; i += 8) {
268
+ const __m128i a0 = LOAD_16(A + i + 0);
269
+ const __m128i a1 = LOAD_16(A + i + 1);
270
+ const __m128i b0 = LOAD_16(B + i + 0);
271
+ const __m128i b1 = LOAD_16(B + i + 1);
272
+ const __m128i a0b1 = _mm_add_epi16(a0, b1);
273
+ const __m128i a1b0 = _mm_add_epi16(a1, b0);
274
+ const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
275
+ const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
276
+ const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
277
+ const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
278
+ const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
279
+ const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
280
+ const __m128i d0 = _mm_add_epi16(c1, a0);
281
+ const __m128i d1 = _mm_add_epi16(c0, a1);
282
+ const __m128i e0 = _mm_srai_epi16(d0, 1);
283
+ const __m128i e1 = _mm_srai_epi16(d1, 1);
284
+ const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
285
+ const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
286
+ const __m128i g0 = LOAD_16(best_y + 2 * i + 0);
287
+ const __m128i g1 = LOAD_16(best_y + 2 * i + 8);
288
+ const __m128i h0 = _mm_add_epi16(g0, f0);
289
+ const __m128i h1 = _mm_add_epi16(g1, f1);
290
+ const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
291
+ const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
292
+ STORE_16(out + 2 * i + 0, i0);
293
+ STORE_16(out + 2 * i + 8, i1);
294
+ }
295
+ for (; i < len; ++i) {
296
+ // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
297
+ // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
298
+ // We reuse the common sub-expressions.
299
+ const int a0b1 = A[i + 0] + B[i + 1];
300
+ const int a1b0 = A[i + 1] + B[i + 0];
301
+ const int a0a1b0b1 = a0b1 + a1b0 + 8;
302
+ const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
303
+ const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
304
+ out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
305
+ out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
306
+ }
307
+ }
308
+ #undef STORE_16
309
+ #undef LOAD_16
310
+
311
+ #elif defined(SJPEG_USE_NEON)
312
+
313
// NEON variant of SharpUpdateY_C: handles 8 samples per iteration, then
// finishes the remaining samples with scalar code.
// Returns the sum of |ref[i] - src[i]| over all samples.
static uint64_t SharpUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
                                  uint16_t* dst, int len) {
  const int16x8_t kZero = vdupq_n_s16(0);
  const int16x8_t kMaxY = vdupq_n_s16(MAX_Y_T);
  uint64x2_t acc = vdupq_n_u64(0);   // two 64-bit partial sums
  int pos = 0;

  for (; pos + 8 <= len; pos += 8) {
    const int16x8_t target = vreinterpretq_s16_u16(vld1q_u16(ref + pos));
    const int16x8_t current = vreinterpretq_s16_u16(vld1q_u16(src + pos));
    const int16x8_t old_y = vreinterpretq_s16_u16(vld1q_u16(dst + pos));
    const int16x8_t delta = vsubq_s16(target, current);   // diff_y
    const int16x8_t new_y = vaddq_s16(old_y, delta);
    const uint16x8_t clipped =
        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(new_y, kMaxY), kZero));
    const int16x8_t abs_delta = vabsq_s16(delta);
    vst1q_u16(dst + pos, clipped);
    // widen 16 -> 32 bits pairwise, then accumulate pairwise into 64 bits
    acc = vpadalq_u32(acc, vpaddlq_u16(vreinterpretq_u16_s16(abs_delta)));
  }

  uint64_t diff = vgetq_lane_u64(acc, 0) + vgetq_lane_u64(acc, 1);

  // scalar left-over
  for (; pos < len; ++pos) {
    const int delta = ref[pos] - src[pos];
    const int updated = static_cast<int>(dst[pos]) + delta;
    dst[pos] = clip_y(updated);
    diff += static_cast<uint64_t>(abs(delta));
  }
  return diff;
}
341
+
342
// NEON variant of SharpUpdateRGB_C: dst[i] += ref[i] - src[i], 8 samples at a
// time, followed by a scalar loop for the remainder.
static void SharpUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
                                int16_t* dst, int len) {
  int pos = 0;
  while (pos + 8 <= len) {
    const int16x8_t target = vld1q_s16(ref + pos);
    const int16x8_t current = vld1q_s16(src + pos);
    const int16x8_t old_uv = vld1q_s16(dst + pos);
    const int16x8_t delta = vsubq_s16(target, current);   // diff_uv
    const int16x8_t new_uv = vaddq_s16(old_uv, delta);
    vst1q_s16(dst + pos, new_uv);
    pos += 8;
  }
  while (pos < len) {
    const int delta = ref[pos] - src[pos];
    dst[pos] += delta;
    ++pos;
  }
}
358
+
359
// NEON variant of SharpFilterRow_C. Same decomposition as the scalar tail
// below, except that the '+ 8' rounding term is supplied by the final
// rounding shift (vrshrq_n_s16) instead of being added up front.
// Eight input positions (16 outputs) are handled per iteration.
static void SharpFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
                                const uint16_t* best_y, uint16_t* out) {
  const int16x8_t kMaxY = vdupq_n_s16(MAX_Y_T);
  const int16x8_t kZero = vdupq_n_s16(0);
  int pos = 0;

  for (; pos + 8 <= len; pos += 8) {
    const int16x8_t cur_l = vld1q_s16(A + pos + 0);
    const int16x8_t cur_r = vld1q_s16(A + pos + 1);
    const int16x8_t adj_l = vld1q_s16(B + pos + 0);
    const int16x8_t adj_r = vld1q_s16(B + pos + 1);
    const int16x8_t diag0 = vaddq_s16(cur_l, adj_r);       // A0+B1
    const int16x8_t diag1 = vaddq_s16(cur_r, adj_l);       // A1+B0
    const int16x8_t sum4 = vaddq_s16(diag0, diag1);        // A0+A1+B0+B1
    const int16x8_t diag0_x2 = vaddq_s16(diag0, diag0);    // 2*(A0+B1)
    const int16x8_t diag1_x2 = vaddq_s16(diag1, diag1);    // 2*(A1+B0)
    const int16x8_t part0 = vshrq_n_s16(vaddq_s16(diag0_x2, sum4), 3);
    const int16x8_t part1 = vshrq_n_s16(vaddq_s16(diag1_x2, sum4), 3);
    const int16x8_t even = vrshrq_n_s16(vaddq_s16(part1, cur_l), 1);
    const int16x8_t odd = vrshrq_n_s16(vaddq_s16(part0, cur_r), 1);
    // interleave even/odd results into output order
    const int16x8x2_t mixed = vzipq_s16(even, odd);
    const int16x8_t y_lo =
        vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * pos + 0));
    const int16x8_t y_hi =
        vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * pos + 8));
    const int16x8_t sum_lo = vaddq_s16(y_lo, mixed.val[0]);
    const int16x8_t sum_hi = vaddq_s16(y_hi, mixed.val[1]);
    const int16x8_t res_lo = vmaxq_s16(vminq_s16(sum_lo, kMaxY), kZero);
    const int16x8_t res_hi = vmaxq_s16(vminq_s16(sum_hi, kMaxY), kZero);
    vst1q_u16(out + 2 * pos + 0, vreinterpretq_u16_s16(res_lo));
    vst1q_u16(out + 2 * pos + 8, vreinterpretq_u16_s16(res_hi));
  }

  // scalar left-over:
  //   (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
  for (; pos < len; ++pos) {
    const int diag0 = A[pos + 0] + B[pos + 1];
    const int diag1 = A[pos + 1] + B[pos + 0];
    const int sum4_8 = diag0 + diag1 + 8;
    const int first = (8 * A[pos + 0] + 2 * diag1 + sum4_8) >> 4;
    const int second = (8 * A[pos + 1] + 2 * diag0 + sum4_8) >> 4;
    out[2 * pos + 0] = clip_y(best_y[2 * pos + 0] + first);
    out[2 * pos + 1] = clip_y(best_y[2 * pos + 1] + second);
  }
}
400
+
401
+ #endif // SJPEG_USE_NEON
402
+
403
// Dispatch pointers for the three inner-loop kernels. They are assigned by
// InitFunctionPointers() and are null until it has run.
// Parameter names follow the implementations: 'ref' is the value to reach,
// 'src' the current estimate, and 'dst' is updated by (ref - src).
static uint64_t (*kSharpUpdateY)(const uint16_t* ref, const uint16_t* src,
                                 uint16_t* dst, int len);
static void (*kSharpUpdateRGB)(const int16_t* ref, const int16_t* src,
                               int16_t* dst, int len);
static void (*kSharpFilterRow)(const int16_t* A, const int16_t* B,
                               int len, const uint16_t* best_y, uint16_t* out);
409
+
410
+ static void InitFunctionPointers() {
411
+ static bool done = false;
412
+ if (!done) {
413
+ kSharpUpdateY = SharpUpdateY_C;
414
+ kSharpUpdateRGB = SharpUpdateRGB_C;
415
+ kSharpFilterRow = SharpFilterRow_C;
416
+ #if defined(SJPEG_USE_SSE2)
417
+ if (sjpeg::SupportsSSE2()) {
418
+ kSharpUpdateY = SharpUpdateY_SSE2;
419
+ kSharpUpdateRGB = SharpUpdateRGB_SSE2;
420
+ kSharpFilterRow = SharpFilterRow_SSE2;
421
+ }
422
+ #endif
423
+ #if defined(SJPEG_USE_NEON)
424
+ if (sjpeg::SupportsNEON()) {
425
+ kSharpUpdateY = SharpUpdateY_NEON;
426
+ kSharpUpdateRGB = SharpUpdateRGB_NEON;
427
+ kSharpFilterRow = SharpFilterRow_NEON;
428
+ }
429
+ #endif
430
+ done = true;
431
+ }
432
+ }
433
+
434
+ //------------------------------------------------------------------------------
435
+
436
+ static uint32_t RGBToGray(uint32_t r, uint32_t g, uint32_t b) {
437
+ const uint32_t luma = 13933 * r + 46871 * g + 4732 * b + (1u << YUV_FIX >> 1);
438
+ return (luma >> YUV_FIX);
439
+ }
440
+
441
+ static uint32_t ScaleDown(int a, int b, int c, int d) {
442
+ const uint32_t A = GammaToLinear(a);
443
+ const uint32_t B = GammaToLinear(b);
444
+ const uint32_t C = GammaToLinear(c);
445
+ const uint32_t D = GammaToLinear(d);
446
+ return LinearToGamma((A + B + C + D + 2) >> 2);
447
+ }
448
+
449
+ static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
450
+ fixed_t* dst, size_t uv_w) {
451
+ for (size_t i = 0; i < uv_w; ++i) {
452
+ const uint32_t r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
453
+ src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
454
+ const uint32_t g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
455
+ src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
456
+ const uint32_t b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
457
+ src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
458
+ const int W = RGBToGray(r, g, b);
459
+ dst[0 * uv_w] = (fixed_t)(r - W);
460
+ dst[1 * uv_w] = (fixed_t)(g - W);
461
+ dst[2 * uv_w] = (fixed_t)(b - W);
462
+ dst += 1;
463
+ src1 += 2;
464
+ src2 += 2;
465
+ }
466
+ }
467
+
468
+ static void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
469
+ for (int i = 0; i < w; ++i) {
470
+ const uint32_t R = GammaToLinear(src[0 * w + i]);
471
+ const uint32_t G = GammaToLinear(src[1 * w + i]);
472
+ const uint32_t B = GammaToLinear(src[2 * w + i]);
473
+ const uint32_t Y = RGBToGray(R, G, B);
474
+ dst[i] = (fixed_y_t)LinearToGamma(Y);
475
+ }
476
+ }
477
+
478
+ static void StoreGray(const fixed_y_t* const rgb, fixed_y_t* const y, int w) {
479
+ for (int i = 0; i < w; ++i) {
480
+ y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
481
+ }
482
+ }
483
+
484
+ //------------------------------------------------------------------------------
485
+
486
+ static fixed_y_t Filter2(int A, int B, int W0) {
487
+ const int v0 = (A * 3 + B + 2) >> 2;
488
+ return clip_y(v0 + W0);
489
+ }
490
+
491
+ //------------------------------------------------------------------------------
492
+
493
+ static fixed_y_t UpLift(uint8_t a) { // 8bit -> SFIX
494
+ return ((fixed_y_t)a << SFIX) | SHALF;
495
+ }
496
+
497
+ static void ImportOneRow(const uint8_t* const rgb, int pic_width,
498
+ fixed_y_t* const dst) {
499
+ const int w = (pic_width + 1) & ~1;
500
+ for (int i = 0; i < pic_width; ++i) {
501
+ const int off = i * 3;
502
+ dst[i + 0 * w] = UpLift(rgb[off + 0]);
503
+ dst[i + 1 * w] = UpLift(rgb[off + 1]);
504
+ dst[i + 2 * w] = UpLift(rgb[off + 2]);
505
+ }
506
+ if (pic_width & 1) { // replicate rightmost pixel
507
+ dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
508
+ dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
509
+ dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
510
+ }
511
+ }
512
+
513
+ static void InterpolateTwoRows(const fixed_y_t* const best_y,
514
+ const fixed_t* prev_uv,
515
+ const fixed_t* cur_uv,
516
+ const fixed_t* next_uv,
517
+ int w,
518
+ fixed_y_t* out1, fixed_y_t* out2) {
519
+ const int uv_w = w >> 1;
520
+ const int len = (w - 1) >> 1; // length to filter
521
+ for (int k = 3; k > 0; --k) { // process each R/G/B segments in turn
522
+ // special boundary case for i==0
523
+ out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
524
+ out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
525
+
526
+ kSharpFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
527
+ kSharpFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
528
+
529
+ // special boundary case for i == w - 1 when w is even
530
+ if (!(w & 1)) {
531
+ out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
532
+ best_y[w - 1 + 0]);
533
+ out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
534
+ best_y[w - 1 + w]);
535
+ }
536
+ out1 += w;
537
+ out2 += w;
538
+ prev_uv += uv_w;
539
+ cur_uv += uv_w;
540
+ next_uv += uv_w;
541
+ }
542
+ }
543
+
544
+ static void ConvertWRGBToYUV(const fixed_y_t* best_y,
545
+ const fixed_t* best_uv,
546
+ int width, int height,
547
+ uint8_t* y_plane,
548
+ uint8_t* u_plane, uint8_t* v_plane) {
549
+ const int w = (width + 1) & ~1;
550
+ const int h = (height + 1) & ~1;
551
+ const int uv_w = w >> 1;
552
+ const int uv_h = h >> 1;
553
+ for (int j = 0; j < height; ++j) {
554
+ const int off = (j >> 1) * 3 * uv_w;
555
+ for (int i = 0; i < width; ++i) {
556
+ const int W = best_y[i + j * w];
557
+ const int r = best_uv[off + (i >> 1) + 0 * uv_w] + W;
558
+ const int g = best_uv[off + (i >> 1) + 1 * uv_w] + W;
559
+ const int b = best_uv[off + (i >> 1) + 2 * uv_w] + W;
560
+ y_plane[i] = ConvertRGBToY(r, g, b);
561
+ }
562
+ y_plane += width;
563
+ }
564
+ for (int j = 0; j < uv_h; ++j) {
565
+ for (int i = 0; i < uv_w; ++i) {
566
+ const int off = i + j * 3 * uv_w;
567
+ const int r = best_uv[off + 0 * uv_w];
568
+ const int g = best_uv[off + 1 * uv_w];
569
+ const int b = best_uv[off + 2 * uv_w];
570
+ u_plane[i] = ConvertRGBToU(r, g, b);
571
+ v_plane[i] = ConvertRGBToV(r, g, b);
572
+ }
573
+ u_plane += uv_w;
574
+ v_plane += uv_w;
575
+ }
576
+ }
577
+
578
+ //------------------------------------------------------------------------------
579
+ // Main function
580
+
581
+ static void PreprocessARGB(const uint8_t* const rgb,
582
+ int width, int height, size_t stride,
583
+ uint8_t* y_plane,
584
+ uint8_t* u_plane, uint8_t* v_plane) {
585
+ // we expand the right/bottom border if needed
586
+ const int w = (width + 1) & ~1;
587
+ const int h = (height + 1) & ~1;
588
+ const int uv_w = w >> 1;
589
+ const int uv_h = h >> 1;
590
+ uint64_t prev_diff_y_sum = ~0;
591
+
592
+ InitGammaTablesF();
593
+ InitFunctionPointers();
594
+
595
+ // TODO(skal): allocate one big memory chunk instead.
596
+ vector<fixed_y_t> tmp_buffer(w * 3 * 2);
597
+ vector<fixed_y_t> best_y(w * h);
598
+ vector<fixed_y_t> target_y(w * h);
599
+ vector<fixed_y_t> best_rgb_y(w * 2);
600
+ vector<fixed_t> best_uv(uv_w * 3 * uv_h);
601
+ vector<fixed_t> target_uv(uv_w * 3 * uv_h);
602
+ vector<fixed_t> best_rgb_uv(uv_w * 3 * 1);
603
+ const uint64_t diff_y_threshold = static_cast<uint64_t>(3.0 * w * h);
604
+
605
+ assert(width >= kMinDimensionIterativeConversion);
606
+ assert(height >= kMinDimensionIterativeConversion);
607
+
608
+ // Import RGB samples to W/RGB representation.
609
+ for (int j = 0; j < height; j += 2) {
610
+ const int is_last_row = (j == height - 1);
611
+ fixed_y_t* const src1 = &tmp_buffer[0 * w];
612
+ fixed_y_t* const src2 = &tmp_buffer[3 * w];
613
+ const int rgb_off = j * stride;
614
+ const int y_off = j * w;
615
+ const int uv_off = (j >> 1) * 3 * uv_w;
616
+
617
+ // prepare two rows of input
618
+ ImportOneRow(rgb + rgb_off, width, src1);
619
+ if (!is_last_row) {
620
+ ImportOneRow(rgb + rgb_off + stride, width, src2);
621
+ } else {
622
+ memcpy(src2, src1, 3 * w * sizeof(*src2));
623
+ }
624
+ StoreGray(src1, &best_y[y_off + 0], w);
625
+ StoreGray(src2, &best_y[y_off + w], w);
626
+ UpdateW(src1, &target_y[y_off + 0], w);
627
+ UpdateW(src2, &target_y[y_off + w], w);
628
+ UpdateChroma(src1, src2, &target_uv[uv_off], uv_w);
629
+ memcpy(&best_uv[uv_off], &target_uv[uv_off], 3 * uv_w * sizeof(best_uv[0]));
630
+ }
631
+
632
+ // Iterate and resolve clipping conflicts.
633
+ for (int iter = 0; iter < kNumIterations; ++iter) {
634
+ const fixed_t* cur_uv = &best_uv[0];
635
+ const fixed_t* prev_uv = &best_uv[0];
636
+ uint64_t diff_y_sum = 0;
637
+
638
+ for (int j = 0; j < h; j += 2) {
639
+ const int uv_off = (j >> 1) * 3 * uv_w;
640
+ fixed_y_t* const src1 = &tmp_buffer[0 * w];
641
+ fixed_y_t* const src2 = &tmp_buffer[3 * w];
642
+ const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
643
+ InterpolateTwoRows(&best_y[j * w], prev_uv, cur_uv, next_uv,
644
+ w, src1, src2);
645
+ prev_uv = cur_uv;
646
+ cur_uv = next_uv;
647
+
648
+ UpdateW(src1, &best_rgb_y[0 * w], w);
649
+ UpdateW(src2, &best_rgb_y[1 * w], w);
650
+ UpdateChroma(src1, src2, &best_rgb_uv[0], uv_w);
651
+
652
+ // update two rows of Y and one row of RGB
653
+ diff_y_sum += kSharpUpdateY(&target_y[j * w],
654
+ &best_rgb_y[0], &best_y[j * w], 2 * w);
655
+ kSharpUpdateRGB(&target_uv[uv_off],
656
+ &best_rgb_uv[0], &best_uv[uv_off], 3 * uv_w);
657
+ }
658
+ // test exit condition
659
+ if (iter > 0) {
660
+ if (diff_y_sum < diff_y_threshold) break;
661
+ if (diff_y_sum > prev_diff_y_sum) break;
662
+ }
663
+ prev_diff_y_sum = diff_y_sum;
664
+ }
665
+ // final reconstruction
666
+ ConvertWRGBToYUV(&best_y[0], &best_uv[0], width, height,
667
+ y_plane, u_plane, v_plane);
668
+ }
669
+
670
+ } // namespace sjpeg
671
+
672
+ ////////////////////////////////////////////////////////////////////////////////
673
+ // Entry point
674
+
675
+ void sjpeg::ApplySharpYUVConversion(const uint8_t* const rgb,
676
+ int W, int H, int stride,
677
+ uint8_t* y_plane,
678
+ uint8_t* u_plane, uint8_t* v_plane) {
679
+ if (W <= kMinDimensionIterativeConversion ||
680
+ H <= kMinDimensionIterativeConversion) {
681
+ const int uv_w = (W + 1) >> 1;
682
+ for (int y = 0; y < H; y += 2) {
683
+ const uint8_t* const rgb1 = rgb + y * stride;
684
+ const uint8_t* const rgb2 = (y < H - 1) ? rgb1 + stride : rgb1;
685
+ ConvertRowToY(rgb1, W, &y_plane[y * W]);
686
+ if (y < H - 1) {
687
+ ConvertRowToY(rgb2, W, &y_plane[(y + 1) * W]);
688
+ }
689
+ ConvertRowToUV(rgb1, rgb2, W,
690
+ &u_plane[(y >> 1) * uv_w],
691
+ &v_plane[(y >> 1) * uv_w]);
692
+ }
693
+ } else {
694
+ PreprocessARGB(rgb, W, H, stride, y_plane, u_plane, v_plane);
695
+ }
696
+ }
697
+
698
+ ////////////////////////////////////////////////////////////////////////////////
@@ -0,0 +1,3 @@
1
# Top-level namespace of the sjpeg package.
module Sjpeg
  # Version string of this release.
  VERSION = "0.1.0"
end