sjpeg 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +40 -0
- data/LICENSE.txt +21 -0
- data/README.md +40 -0
- data/Rakefile +11 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/ext/sjpeg/bit_writer.cc +122 -0
- data/ext/sjpeg/bit_writer.h +169 -0
- data/ext/sjpeg/colors_rgb.cc +691 -0
- data/ext/sjpeg/dichotomy.cc +290 -0
- data/ext/sjpeg/enc.cc +2132 -0
- data/ext/sjpeg/extconf.rb +3 -0
- data/ext/sjpeg/fdct.cc +627 -0
- data/ext/sjpeg/headers.cc +218 -0
- data/ext/sjpeg/jpeg_tools.cc +274 -0
- data/ext/sjpeg/libsjpeg.pc.in +11 -0
- data/ext/sjpeg/score_7.cc +6220 -0
- data/ext/sjpeg/sjpeg.h +353 -0
- data/ext/sjpeg/sjpegi.h +427 -0
- data/ext/sjpeg/yuv_convert.cc +698 -0
- data/lib/sjpeg/version.rb +3 -0
- data/lib/sjpeg.rb +35 -0
- data/sjpeg.gemspec +36 -0
- metadata +143 -0
@@ -0,0 +1,698 @@
|
|
1
|
+
// Copyright 2017 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Enhanced RGB->YUV conversion functions
|
16
|
+
//
|
17
|
+
// Author: Skal (pascal.massimino@gmail.com)
|
18
|
+
|
19
|
+
#include <math.h>
|
20
|
+
#include <stdlib.h>
|
21
|
+
#include <string.h>
|
22
|
+
#include <memory>
|
23
|
+
#include <vector>
|
24
|
+
using std::vector;
|
25
|
+
|
26
|
+
#define SJPEG_NEED_ASM_HEADERS
|
27
|
+
#include "sjpegi.h"
|
28
|
+
|
29
|
+
namespace sjpeg {
|
30
|
+
|
31
|
+
// We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
// banding sometimes. Better use extra precision.
#define SFIX 2                       // fixed-point precision of RGB and Y/W
#define SHALF (1 << SFIX >> 1)       // half of one SFIX unit (rounding/centering)
#define MAX_Y_T ((256 << SFIX) - 1)  // maximum luma value at SFIX precision
typedef int16_t fixed_t;      // signed type with extra SFIX precision for UV
typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
|
38
|
+
|
39
|
+
// Clamp 'y' to the valid SFIX-precision luma range [0, MAX_Y_T].
static fixed_y_t clip_y(int y) {
  if (y < 0) return 0;
  if (y > MAX_Y_T) return MAX_Y_T;
  return (fixed_y_t)y;
}
|
42
|
+
|
43
|
+
////////////////////////////////////////////////////////////////////////////////
|
44
|
+
// Helper functions for Y/U/V fixed-point calculations.
|
45
|
+
|
46
|
+
// The following functions convert r/g/b values in SFIX fixed-point precision
|
47
|
+
// to 8b values, clipped:
|
48
|
+
#define YUV_FIX 16                 // fixed-point precision of the RGB->YUV weights
#define TFIX (YUV_FIX + SFIX)      // total shift when inputs carry SFIX precision
#define TROUNDER (1 << TFIX >> 1)  // rounding constant for a TFIX right-shift
|
51
|
+
|
52
|
+
// Clamp 'v' to the 8-bit range [0, 255].
static uint8_t clip_8b(int v) {
  if (v < 0) return 0u;
  if (v > 255) return 255u;
  return (uint8_t)v;
}
|
55
|
+
|
56
|
+
static uint8_t ConvertRGBToY(int r, int g, int b) {
|
57
|
+
const int luma = 19595 * r + 38469 * g + 7471 * b + TROUNDER;
|
58
|
+
return clip_8b(luma >> TFIX);
|
59
|
+
}
|
60
|
+
|
61
|
+
static uint8_t ConvertRGBToU(int r, int g, int b) {
|
62
|
+
const int u = -11058 * r - 21709 * g + 32768 * b + TROUNDER;
|
63
|
+
return clip_8b(128 + (u >> TFIX));
|
64
|
+
}
|
65
|
+
|
66
|
+
static uint8_t ConvertRGBToV(int r, int g, int b) {
|
67
|
+
const int v = +32768 * r - 27439 * g - 5328 * b + TROUNDER;
|
68
|
+
return clip_8b(128 + (v >> TFIX));
|
69
|
+
}
|
70
|
+
|
71
|
+
// convert to luma using 16b precision:
|
72
|
+
static void ConvertRowToY(const uint8_t* row, int w, uint8_t* const dst) {
|
73
|
+
for (int i = 0; i < w; i += 1, row += 3) {
|
74
|
+
const int r = row[0], g = row[1], b = row[2];
|
75
|
+
const int y = 19595 * r + 38469 * g + 7471 * b;
|
76
|
+
dst[i] = (y + (1 << YUV_FIX >> 1)) >> YUV_FIX;
|
77
|
+
}
|
78
|
+
}
|
79
|
+
|
80
|
+
static void ConvertRowToUV(const uint8_t* row1, const uint8_t* row2,
|
81
|
+
int w, uint8_t* u, uint8_t* v) {
|
82
|
+
for (int i = 0; i < (w & ~1); i += 2, row1 += 6, row2 += 6) {
|
83
|
+
const int r = row1[0] + row1[3] + row2[0] + row2[3];
|
84
|
+
const int g = row1[1] + row1[4] + row2[1] + row2[4];
|
85
|
+
const int b = row1[2] + row1[5] + row2[2] + row2[5];
|
86
|
+
*u++ = ConvertRGBToU(r, g, b);
|
87
|
+
*v++ = ConvertRGBToV(r, g, b);
|
88
|
+
}
|
89
|
+
if (w & 1) {
|
90
|
+
const int r = 2 * (row1[0] + row2[0]);
|
91
|
+
const int g = 2 * (row1[1] + row2[1]);
|
92
|
+
const int b = 2 * (row1[2] + row2[2]);
|
93
|
+
*u++ = ConvertRGBToU(r, g, b);
|
94
|
+
*v++ = ConvertRGBToV(r, g, b);
|
95
|
+
}
|
96
|
+
}
|
97
|
+
|
98
|
+
#undef TFIX
// Fix: the macro defined above is TROUNDER, not ROUNDER; '#undef ROUNDER'
// was a no-op and left TROUNDER leaking into the rest of the file.
#undef TROUNDER
|
100
|
+
|
101
|
+
////////////////////////////////////////////////////////////////////////////////
|
102
|
+
// Sharp RGB->YUV conversion
|
103
|
+
|
104
|
+
static const int kNumIterations = 4;   // max refinement passes in PreprocessARGB
static const int kMinDimensionIterativeConversion = 4;

// size of the interpolation table for linear-to-gamma
#define GAMMA_TABLE_SIZE 32
static uint32_t kLinearToGammaTab[GAMMA_TABLE_SIZE + 2];  // +2: interpolation guard
#define GAMMA_TO_LINEAR_BITS 14
static uint32_t kGammaToLinearTab[MAX_Y_T + 1];  // size scales with Y_FIX
|
112
|
+
|
113
|
+
// Lazily builds the two gamma lookup tables:
//  - kGammaToLinearTab: gamma-encoded sample (SFIX precision, [0..MAX_Y_T])
//    -> linear value in GAMMA_TO_LINEAR_BITS fixed-point.
//  - kLinearToGammaTab: coarse linear -> gamma table (GAMMA_TABLE_SIZE + 1
//    entries plus one guard), with the 1/2 rounding constant pre-added.
// The constants (a = 0.099, thresh = 0.018, gamma = 1/0.45) match the
// ITU-R BT.709-style transfer function.
// NOTE(review): the 'done' latch is not thread-safe — assumes the first call
// completes before any concurrent use.
static void InitGammaTablesF(void) {
  static bool done = false;
  assert(2 * GAMMA_TO_LINEAR_BITS < 32);  // we use uint32_t intermediate values
  if (!done) {
    int v;
    const double norm = 1. / MAX_Y_T;
    const double scale = 1. / GAMMA_TABLE_SIZE;
    const double a = 0.099;
    const double thresh = 0.018;
    const double gamma = 1. / 0.45;
    const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
    // gamma -> linear table (exact, one entry per possible sample value)
    for (v = 0; v <= MAX_Y_T; ++v) {
      const double g = norm * v;
      double value;
      if (g <= thresh * 4.5) {
        value = g / 4.5;            // linear segment near black
      } else {
        const double a_rec = 1. / (1. + a);
        value = pow(a_rec * (g + a), gamma);  // power-law segment
      }
      kGammaToLinearTab[v] = static_cast<uint32_t>(value * final_scale + .5);
    }
    // linear -> gamma table (coarse; interpolated by LinearToGamma())
    for (v = 0; v <= GAMMA_TABLE_SIZE; ++v) {
      const double g = scale * v;
      double value;
      if (g <= thresh) {
        value = 4.5 * g;
      } else {
        value = (1. + a) * pow(g, 1. / gamma) - a;
      }
      // we already incorporate the 1/2 rounding constant here
      kLinearToGammaTab[v] =
          static_cast<uint32_t>(MAX_Y_T * value)
              + (1 << GAMMA_TO_LINEAR_BITS >> 1);
    }
    // to prevent small rounding errors to cause read-overflow:
    kLinearToGammaTab[GAMMA_TABLE_SIZE + 1] =
        kLinearToGammaTab[GAMMA_TABLE_SIZE];
    done = true;
  }
}
|
154
|
+
|
155
|
+
// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
|
156
|
+
// Table lookup: gamma-encoded sample ([0..MAX_Y_T]) -> linear value.
// Result has a fixed-point precision of GAMMA_TO_LINEAR_BITS.
static uint32_t GammaToLinear(int v) {
  return kGammaToLinearTab[v];
}
|
157
|
+
|
158
|
+
// Linear value (GAMMA_TO_LINEAR_BITS fixed-point, range [0..1]) -> gamma-
// encoded value in [0..MAX_Y_T], by linear interpolation in kLinearToGammaTab.
static uint32_t LinearToGamma(uint32_t value) {
  // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
  const uint32_t v = value * GAMMA_TABLE_SIZE;
  const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
  // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
  const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS);  // fractional part
  // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
  const uint32_t v0 = kLinearToGammaTab[tab_pos + 0];
  const uint32_t v1 = kLinearToGammaTab[tab_pos + 1];  // guard entry makes +1 safe
  // Final interpolation. Note that rounding is already included.
  const uint32_t v2 = (v1 - v0) * x;  // note: v1 >= v0.
  const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
  return result;
}
|
172
|
+
|
173
|
+
//------------------------------------------------------------------------------
|
174
|
+
|
175
|
+
static uint64_t SharpUpdateY_C(const uint16_t* ref, const uint16_t* src,
|
176
|
+
uint16_t* dst, int len) {
|
177
|
+
uint64_t diff = 0;
|
178
|
+
for (int i = 0; i < len; ++i) {
|
179
|
+
const int diff_y = ref[i] - src[i];
|
180
|
+
const int new_y = static_cast<int>(dst[i]) + diff_y;
|
181
|
+
dst[i] = clip_y(new_y);
|
182
|
+
diff += (uint64_t)abs(diff_y);
|
183
|
+
}
|
184
|
+
return diff;
|
185
|
+
}
|
186
|
+
|
187
|
+
// Accumulate the residual (ref - src) into dst[], without clipping.
static void SharpUpdateRGB_C(const int16_t* ref, const int16_t* src,
                             int16_t* dst, int len) {
  for (int k = 0; k < len; ++k) {
    dst[k] = (int16_t)(dst[k] + (ref[k] - src[k]));
  }
}
|
194
|
+
|
195
|
+
static void SharpFilterRow_C(const int16_t* A, const int16_t* B, int len,
|
196
|
+
const uint16_t* best_y, uint16_t* out) {
|
197
|
+
for (int i = 0; i < len; ++i, ++A, ++B) {
|
198
|
+
const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
|
199
|
+
const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
|
200
|
+
out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
|
201
|
+
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
|
202
|
+
}
|
203
|
+
}
|
204
|
+
|
205
|
+
#if defined(SJPEG_USE_SSE2)
|
206
|
+
|
207
|
+
#define LOAD_16(P) (_mm_loadu_si128(reinterpret_cast<const __m128i*>(P)))
|
208
|
+
#define STORE_16(P, V) (_mm_storeu_si128(reinterpret_cast<__m128i*>(P), (V)))
|
209
|
+
|
210
|
+
// SSE2 version of SharpUpdateY_C: processes 8 samples per iteration, with a
// scalar tail loop. abs(diff_y) is computed as madd(D, sign(D)) so the
// per-lane products accumulate into 32-bit sums.
static uint64_t SharpUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
                                  uint16_t* dst, int len) {
  uint64_t diff = 0;
  uint32_t tmp[4];
  int i;
  const __m128i zero = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16(MAX_Y_T);
  const __m128i one = _mm_set1_epi16(1);
  __m128i sum = zero;

  for (i = 0; i + 8 <= len; i += 8) {
    const __m128i A = LOAD_16(ref + i);
    const __m128i B = LOAD_16(src + i);
    const __m128i C = LOAD_16(dst + i);
    const __m128i D = _mm_sub_epi16(A, B);        // diff_y
    const __m128i E = _mm_cmpgt_epi16(zero, D);   // sign (-1 or 0)
    const __m128i F = _mm_add_epi16(C, D);        // new_y
    const __m128i G = _mm_or_si128(E, one);       // -1 or 1
    const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);  // clip_y
    const __m128i I = _mm_madd_epi16(D, G);       // sum(abs(...))
    STORE_16(dst + i, H);
    sum = _mm_add_epi32(sum, I);
  }
  STORE_16(tmp, sum);
  diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];  // horizontal reduction
  // scalar tail for the remaining (len % 8) samples
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = static_cast<int>(dst[i]) + diff_y;
    dst[i] = clip_y(new_y);
    diff += (uint64_t)abs(diff_y);
  }
  return diff;
}
|
243
|
+
|
244
|
+
// SSE2 version of SharpUpdateRGB_C: dst[] += (ref[] - src[]), 8 lanes at a
// time (no clipping, matching the C version), with a scalar tail loop.
static void SharpUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
                                int16_t* dst, int len) {
  int i = 0;
  for (i = 0; i + 8 <= len; i += 8) {
    const __m128i A = LOAD_16(ref + i);
    const __m128i B = LOAD_16(src + i);
    const __m128i C = LOAD_16(dst + i);
    const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
    const __m128i E = _mm_add_epi16(C, D);   // new_uv
    STORE_16(dst + i, E);
  }
  for (; i < len; ++i) {
    const int diff_uv = ref[i] - src[i];
    dst[i] += diff_uv;
  }
}
|
260
|
+
|
261
|
+
// SSE2 version of SharpFilterRow_C. The (9,3,3,1)/16 filter is decomposed
// (see the comment in the tail loop) so it can be computed with 16-bit adds
// and shifts; e0/e1 are interleaved to produce the even/odd output pairs.
static void SharpFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
                                const uint16_t* best_y, uint16_t* out) {
  int i;
  const __m128i kCst8 = _mm_set1_epi16(8);
  const __m128i max = _mm_set1_epi16(MAX_Y_T);
  const __m128i zero = _mm_setzero_si128();
  for (i = 0; i + 8 <= len; i += 8) {
    const __m128i a0 = LOAD_16(A + i + 0);
    const __m128i a1 = LOAD_16(A + i + 1);   // unaligned neighbor load
    const __m128i b0 = LOAD_16(B + i + 0);
    const __m128i b1 = LOAD_16(B + i + 1);
    const __m128i a0b1 = _mm_add_epi16(a0, b1);
    const __m128i a1b0 = _mm_add_epi16(a1, b0);
    const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
    const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
    const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
    const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
    const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
    const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
    const __m128i d0 = _mm_add_epi16(c1, a0);
    const __m128i d1 = _mm_add_epi16(c0, a1);
    const __m128i e0 = _mm_srai_epi16(d0, 1);
    const __m128i e1 = _mm_srai_epi16(d1, 1);
    const __m128i f0 = _mm_unpacklo_epi16(e0, e1);  // interleave even/odd
    const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
    const __m128i g0 = LOAD_16(best_y + 2 * i + 0);
    const __m128i g1 = LOAD_16(best_y + 2 * i + 8);
    const __m128i h0 = _mm_add_epi16(g0, f0);
    const __m128i h1 = _mm_add_epi16(g1, f1);
    const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);  // clip_y
    const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
    STORE_16(out + 2 * i + 0, i0);
    STORE_16(out + 2 * i + 8, i1);
  }
  // scalar tail, mirroring the vector decomposition
  for (; i < len; ++i) {
    // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
    // We reuse the common sub-expressions.
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
  }
}
|
308
|
+
#undef STORE_16
|
309
|
+
#undef LOAD_16
|
310
|
+
|
311
|
+
#elif defined(SJPEG_USE_NEON)
|
312
|
+
|
313
|
+
// NEON version of SharpUpdateY_C: 8 samples per iteration plus scalar tail.
// abs(diff_y) lanes are widened and accumulated pairwise into a u64x2 sum.
static uint64_t SharpUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
                                  uint16_t* dst, int len) {
  int i;
  const int16x8_t zero = vdupq_n_s16(0);
  const int16x8_t max = vdupq_n_s16(MAX_Y_T);
  uint64x2_t sum = vdupq_n_u64(0);

  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
    const int16x8_t D = vsubq_s16(A, B);   // diff_y
    const int16x8_t F = vaddq_s16(C, D);   // new_y
    const uint16x8_t H =
        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));  // clip_y
    const int16x8_t I = vabsq_s16(D);      // abs(diff_y)
    vst1q_u16(dst + i, H);
    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
  }
  uint64_t diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
  // scalar tail for the remaining (len % 8) samples
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = static_cast<int>(dst[i]) + diff_y;
    dst[i] = clip_y(new_y);
    diff += static_cast<uint64_t>(abs(diff_y));
  }
  return diff;
}
|
341
|
+
|
342
|
+
// NEON version of SharpUpdateRGB_C: dst[] += (ref[] - src[]), 8 lanes at a
// time (no clipping, matching the C version), with a scalar tail loop.
static void SharpUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
                                int16_t* dst, int len) {
  int i;
  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vld1q_s16(ref + i);
    const int16x8_t B = vld1q_s16(src + i);
    const int16x8_t C = vld1q_s16(dst + i);
    const int16x8_t D = vsubq_s16(A, B);   // diff_uv
    const int16x8_t E = vaddq_s16(C, D);   // new_uv
    vst1q_s16(dst + i, E);
  }
  for (; i < len; ++i) {
    const int diff_uv = ref[i] - src[i];
    dst[i] += diff_uv;
  }
}
|
358
|
+
|
359
|
+
// NEON version of SharpFilterRow_C, using the same decomposition as the
// SSE2 path; the rounding '+8' is folded into vrshrq_n_s16 (rounded shift),
// and vzipq interleaves the even/odd results.
static void SharpFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
                                const uint16_t* best_y, uint16_t* out) {
  int i;
  const int16x8_t max = vdupq_n_s16(MAX_Y_T);
  const int16x8_t zero = vdupq_n_s16(0);
  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t a0 = vld1q_s16(A + i + 0);
    const int16x8_t a1 = vld1q_s16(A + i + 1);   // unaligned neighbor load
    const int16x8_t b0 = vld1q_s16(B + i + 0);
    const int16x8_t b1 = vld1q_s16(B + i + 1);
    const int16x8_t a0b1 = vaddq_s16(a0, b1);
    const int16x8_t a1b0 = vaddq_s16(a1, b0);
    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
    const int16x8_t d0 = vaddq_s16(c1, a0);
    const int16x8_t d1 = vaddq_s16(c0, a1);
    const int16x8_t e0 = vrshrq_n_s16(d0, 1);  // rounded >> 1
    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
    const int16x8x2_t f = vzipq_s16(e0, e1);   // interleave even/odd
    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);  // clip_y
    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
  }
  // scalar tail, mirroring the vector decomposition
  for (; i < len; ++i) {
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
  }
}
|
400
|
+
|
401
|
+
#endif // SJPEG_USE_NEON
|
402
|
+
|
403
|
+
// Per-platform implementations, selected once by InitFunctionPointers().
// Fix: parameter names now match the *_C/SSE2/NEON definitions (ref, src);
// they were previously declared swapped as (src, ref), which misdocumented
// the calling contract.
static uint64_t (*kSharpUpdateY)(const uint16_t* ref, const uint16_t* src,
                                 uint16_t* dst, int len);
static void (*kSharpUpdateRGB)(const int16_t* ref, const int16_t* src,
                               int16_t* dst, int len);
static void (*kSharpFilterRow)(const int16_t* A, const int16_t* B,
                               int len, const uint16_t* best_y, uint16_t* out);
|
409
|
+
|
410
|
+
// One-time lazy selection of the kernel implementations: C fallback by
// default, upgraded to SSE2/NEON when compiled in and supported at runtime.
// NOTE(review): the 'done' latch is not thread-safe — assumes the first call
// completes before any concurrent use.
static void InitFunctionPointers() {
  static bool done = false;
  if (!done) {
    kSharpUpdateY = SharpUpdateY_C;
    kSharpUpdateRGB = SharpUpdateRGB_C;
    kSharpFilterRow = SharpFilterRow_C;
#if defined(SJPEG_USE_SSE2)
    if (sjpeg::SupportsSSE2()) {
      kSharpUpdateY = SharpUpdateY_SSE2;
      kSharpUpdateRGB = SharpUpdateRGB_SSE2;
      kSharpFilterRow = SharpFilterRow_SSE2;
    }
#endif
#if defined(SJPEG_USE_NEON)
    if (sjpeg::SupportsNEON()) {
      kSharpUpdateY = SharpUpdateY_NEON;
      kSharpUpdateRGB = SharpUpdateRGB_NEON;
      kSharpFilterRow = SharpFilterRow_NEON;
    }
#endif
    done = true;
  }
}
|
433
|
+
|
434
|
+
//------------------------------------------------------------------------------
|
435
|
+
|
436
|
+
static uint32_t RGBToGray(uint32_t r, uint32_t g, uint32_t b) {
|
437
|
+
const uint32_t luma = 13933 * r + 46871 * g + 4732 * b + (1u << YUV_FIX >> 1);
|
438
|
+
return (luma >> YUV_FIX);
|
439
|
+
}
|
440
|
+
|
441
|
+
static uint32_t ScaleDown(int a, int b, int c, int d) {
|
442
|
+
const uint32_t A = GammaToLinear(a);
|
443
|
+
const uint32_t B = GammaToLinear(b);
|
444
|
+
const uint32_t C = GammaToLinear(c);
|
445
|
+
const uint32_t D = GammaToLinear(d);
|
446
|
+
return LinearToGamma((A + B + C + D + 2) >> 2);
|
447
|
+
}
|
448
|
+
|
449
|
+
// Compute one half-resolution row of chroma residuals (R-W, G-W, B-W) from
// two full-resolution rows 'src1'/'src2'. Each source row holds three
// stacked planes of width 2*uv_w; each 2x2 block is averaged in linear
// light (ScaleDown) before subtracting the grayscale value W.
static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
                         fixed_t* dst, size_t uv_w) {
  for (size_t i = 0; i < uv_w; ++i) {
    // source plane stride is 2*uv_w, hence the 0/2/4 * uv_w offsets
    const uint32_t r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
                                 src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
    const uint32_t g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
                                 src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
    const uint32_t b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
                                 src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
    const int W = RGBToGray(r, g, b);
    // destination plane stride is uv_w
    dst[0 * uv_w] = (fixed_t)(r - W);
    dst[1 * uv_w] = (fixed_t)(g - W);
    dst[2 * uv_w] = (fixed_t)(b - W);
    dst += 1;
    src1 += 2;
    src2 += 2;
  }
}
|
467
|
+
|
468
|
+
static void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
|
469
|
+
for (int i = 0; i < w; ++i) {
|
470
|
+
const uint32_t R = GammaToLinear(src[0 * w + i]);
|
471
|
+
const uint32_t G = GammaToLinear(src[1 * w + i]);
|
472
|
+
const uint32_t B = GammaToLinear(src[2 * w + i]);
|
473
|
+
const uint32_t Y = RGBToGray(R, G, B);
|
474
|
+
dst[i] = (fixed_y_t)LinearToGamma(Y);
|
475
|
+
}
|
476
|
+
}
|
477
|
+
|
478
|
+
static void StoreGray(const fixed_y_t* const rgb, fixed_y_t* const y, int w) {
|
479
|
+
for (int i = 0; i < w; ++i) {
|
480
|
+
y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
|
481
|
+
}
|
482
|
+
}
|
483
|
+
|
484
|
+
//------------------------------------------------------------------------------
|
485
|
+
|
486
|
+
// Boundary filter: rounded (3*A + B) / 4 added to the luma W0, clipped.
static fixed_y_t Filter2(int A, int B, int W0) {
  return clip_y(W0 + ((3 * A + B + 2) >> 2));
}
|
490
|
+
|
491
|
+
//------------------------------------------------------------------------------
|
492
|
+
|
493
|
+
// 8bit -> SFIX precision, centered within the quantization bin. The low
// SFIX bits are zero after the shift, so '+' and '|' are interchangeable.
static fixed_y_t UpLift(uint8_t a) {
  return (fixed_y_t)((a << SFIX) + SHALF);
}
|
496
|
+
|
497
|
+
static void ImportOneRow(const uint8_t* const rgb, int pic_width,
|
498
|
+
fixed_y_t* const dst) {
|
499
|
+
const int w = (pic_width + 1) & ~1;
|
500
|
+
for (int i = 0; i < pic_width; ++i) {
|
501
|
+
const int off = i * 3;
|
502
|
+
dst[i + 0 * w] = UpLift(rgb[off + 0]);
|
503
|
+
dst[i + 1 * w] = UpLift(rgb[off + 1]);
|
504
|
+
dst[i + 2 * w] = UpLift(rgb[off + 2]);
|
505
|
+
}
|
506
|
+
if (pic_width & 1) { // replicate rightmost pixel
|
507
|
+
dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
|
508
|
+
dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
|
509
|
+
dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
|
510
|
+
}
|
511
|
+
}
|
512
|
+
|
513
|
+
// Upsample the chroma-residual rows (prev/cur/next, each three stacked
// planes of width uv_w) to two full-resolution rows out1/out2, adding the
// current best_y luma. Column 0 and (for even w) the last column use the
// simpler 2-tap Filter2; interior columns use kSharpFilterRow.
static void InterpolateTwoRows(const fixed_y_t* const best_y,
                               const fixed_t* prev_uv,
                               const fixed_t* cur_uv,
                               const fixed_t* next_uv,
                               int w,
                               fixed_y_t* out1, fixed_y_t* out2) {
  const int uv_w = w >> 1;
  const int len = (w - 1) >> 1;  // length to filter
  for (int k = 3; k > 0; --k) {  // process each R/G/B segments in turn
    // special boundary case for i==0
    out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
    out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);

    kSharpFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
    kSharpFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);

    // special boundary case for i == w - 1 when w is even
    if (!(w & 1)) {
      out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
                            best_y[w - 1 + 0]);
      out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
                            best_y[w - 1 + w]);
    }
    // advance to the next color plane
    out1 += w;
    out2 += w;
    prev_uv += uv_w;
    cur_uv += uv_w;
    next_uv += uv_w;
  }
}
|
543
|
+
|
544
|
+
// Final conversion of the optimized W (luma) + RGB-residual representation
// into standard Y/U/V planes. best_y is w x h (w/h rounded up to even);
// best_uv holds uv_h rows of three stacked uv_w-wide residual planes.
static void ConvertWRGBToYUV(const fixed_y_t* best_y,
                             const fixed_t* best_uv,
                             int width, int height,
                             uint8_t* y_plane,
                             uint8_t* u_plane, uint8_t* v_plane) {
  const int w = (width + 1) & ~1;
  const int h = (height + 1) & ~1;
  const int uv_w = w >> 1;
  const int uv_h = h >> 1;
  // Y plane: reconstruct r/g/b per pixel (residual + W), then convert.
  for (int j = 0; j < height; ++j) {
    const int off = (j >> 1) * 3 * uv_w;   // start of this row's uv triplet
    for (int i = 0; i < width; ++i) {
      const int W = best_y[i + j * w];
      const int r = best_uv[off + (i >> 1) + 0 * uv_w] + W;
      const int g = best_uv[off + (i >> 1) + 1 * uv_w] + W;
      const int b = best_uv[off + (i >> 1) + 2 * uv_w] + W;
      y_plane[i] = ConvertRGBToY(r, g, b);
    }
    y_plane += width;
  }
  // U/V planes: the residuals alone determine chroma (W cancels out of U/V).
  for (int j = 0; j < uv_h; ++j) {
    for (int i = 0; i < uv_w; ++i) {
      const int off = i + j * 3 * uv_w;
      const int r = best_uv[off + 0 * uv_w];
      const int g = best_uv[off + 1 * uv_w];
      const int b = best_uv[off + 2 * uv_w];
      u_plane[i] = ConvertRGBToU(r, g, b);
      v_plane[i] = ConvertRGBToV(r, g, b);
    }
    u_plane += uv_w;
    v_plane += uv_w;
  }
}
|
577
|
+
|
578
|
+
//------------------------------------------------------------------------------
|
579
|
+
// Main function
|
580
|
+
|
581
|
+
// Iterative "sharp" RGB->YUV420 conversion. Imports the picture into a
// W (grayscale) + RGB-residual representation, then iterates: upsample the
// half-resolution residuals, re-derive W/chroma, and fold the error back
// into best_y/best_uv until the total luma error stops improving (or
// kNumIterations is reached). Finally reconstructs the Y/U/V planes.
static void PreprocessARGB(const uint8_t* const rgb,
                           int width, int height, size_t stride,
                           uint8_t* y_plane,
                           uint8_t* u_plane, uint8_t* v_plane) {
  // we expand the right/bottom border if needed
  const int w = (width + 1) & ~1;
  const int h = (height + 1) & ~1;
  const int uv_w = w >> 1;
  const int uv_h = h >> 1;
  uint64_t prev_diff_y_sum = ~0;   // "infinite" previous error

  InitGammaTablesF();
  InitFunctionPointers();

  // TODO(skal): allocate one big memory chunk instead.
  vector<fixed_y_t> tmp_buffer(w * 3 * 2);        // two rows of 3 planes
  vector<fixed_y_t> best_y(w * h);
  vector<fixed_y_t> target_y(w * h);
  vector<fixed_y_t> best_rgb_y(w * 2);
  vector<fixed_t> best_uv(uv_w * 3 * uv_h);
  vector<fixed_t> target_uv(uv_w * 3 * uv_h);
  vector<fixed_t> best_rgb_uv(uv_w * 3 * 1);
  const uint64_t diff_y_threshold = static_cast<uint64_t>(3.0 * w * h);

  assert(width >= kMinDimensionIterativeConversion);
  assert(height >= kMinDimensionIterativeConversion);

  // Import RGB samples to W/RGB representation.
  for (int j = 0; j < height; j += 2) {
    const int is_last_row = (j == height - 1);
    fixed_y_t* const src1 = &tmp_buffer[0 * w];
    fixed_y_t* const src2 = &tmp_buffer[3 * w];
    const int rgb_off = j * stride;
    const int y_off = j * w;
    const int uv_off = (j >> 1) * 3 * uv_w;

    // prepare two rows of input
    ImportOneRow(rgb + rgb_off, width, src1);
    if (!is_last_row) {
      ImportOneRow(rgb + rgb_off + stride, width, src2);
    } else {
      // odd height: duplicate the last row
      memcpy(src2, src1, 3 * w * sizeof(*src2));
    }
    StoreGray(src1, &best_y[y_off + 0], w);
    StoreGray(src2, &best_y[y_off + w], w);
    UpdateW(src1, &target_y[y_off + 0], w);
    UpdateW(src2, &target_y[y_off + w], w);
    UpdateChroma(src1, src2, &target_uv[uv_off], uv_w);
    memcpy(&best_uv[uv_off], &target_uv[uv_off], 3 * uv_w * sizeof(best_uv[0]));
  }

  // Iterate and resolve clipping conflicts.
  for (int iter = 0; iter < kNumIterations; ++iter) {
    const fixed_t* cur_uv = &best_uv[0];
    const fixed_t* prev_uv = &best_uv[0];   // row 0 is its own predecessor
    uint64_t diff_y_sum = 0;

    for (int j = 0; j < h; j += 2) {
      const int uv_off = (j >> 1) * 3 * uv_w;
      fixed_y_t* const src1 = &tmp_buffer[0 * w];
      fixed_y_t* const src2 = &tmp_buffer[3 * w];
      // last uv row is its own successor
      const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
      InterpolateTwoRows(&best_y[j * w], prev_uv, cur_uv, next_uv,
                         w, src1, src2);
      prev_uv = cur_uv;
      cur_uv = next_uv;

      UpdateW(src1, &best_rgb_y[0 * w], w);
      UpdateW(src2, &best_rgb_y[1 * w], w);
      UpdateChroma(src1, src2, &best_rgb_uv[0], uv_w);

      // update two rows of Y and one row of RGB
      diff_y_sum += kSharpUpdateY(&target_y[j * w],
                                  &best_rgb_y[0], &best_y[j * w], 2 * w);
      kSharpUpdateRGB(&target_uv[uv_off],
                      &best_rgb_uv[0], &best_uv[uv_off], 3 * uv_w);
    }
    // test exit condition
    if (iter > 0) {
      if (diff_y_sum < diff_y_threshold) break;   // good enough
      if (diff_y_sum > prev_diff_y_sum) break;    // diverging
    }
    prev_diff_y_sum = diff_y_sum;
  }
  // final reconstruction
  ConvertWRGBToYUV(&best_y[0], &best_uv[0], width, height,
                   y_plane, u_plane, v_plane);
}
|
669
|
+
|
670
|
+
} // namespace sjpeg
|
671
|
+
|
672
|
+
////////////////////////////////////////////////////////////////////////////////
|
673
|
+
// Entry point
|
674
|
+
|
675
|
+
// Public entry point: RGB (packed, 'stride' bytes per row) -> YUV420 planes.
// Small pictures use the direct per-row averaging conversion; larger ones
// use the iterative sharp conversion (PreprocessARGB).
void sjpeg::ApplySharpYUVConversion(const uint8_t* const rgb,
                                    int W, int H, int stride,
                                    uint8_t* y_plane,
                                    uint8_t* u_plane, uint8_t* v_plane) {
  if (W <= kMinDimensionIterativeConversion ||
      H <= kMinDimensionIterativeConversion) {
    const int uv_w = (W + 1) >> 1;
    for (int y = 0; y < H; y += 2) {
      const uint8_t* const rgb1 = rgb + y * stride;
      // odd height: reuse the last row for the chroma pair
      const uint8_t* const rgb2 = (y < H - 1) ? rgb1 + stride : rgb1;
      ConvertRowToY(rgb1, W, &y_plane[y * W]);
      if (y < H - 1) {
        ConvertRowToY(rgb2, W, &y_plane[(y + 1) * W]);
      }
      ConvertRowToUV(rgb1, rgb2, W,
                     &u_plane[(y >> 1) * uv_w],
                     &v_plane[(y >> 1) * uv_w]);
    }
  } else {
    PreprocessARGB(rgb, W, H, stride, y_plane, u_plane, v_plane);
  }
}
|
697
|
+
|
698
|
+
////////////////////////////////////////////////////////////////////////////////
|