sjpeg 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +40 -0
- data/LICENSE.txt +21 -0
- data/README.md +40 -0
- data/Rakefile +11 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/ext/sjpeg/bit_writer.cc +122 -0
- data/ext/sjpeg/bit_writer.h +169 -0
- data/ext/sjpeg/colors_rgb.cc +691 -0
- data/ext/sjpeg/dichotomy.cc +290 -0
- data/ext/sjpeg/enc.cc +2132 -0
- data/ext/sjpeg/extconf.rb +3 -0
- data/ext/sjpeg/fdct.cc +627 -0
- data/ext/sjpeg/headers.cc +218 -0
- data/ext/sjpeg/jpeg_tools.cc +274 -0
- data/ext/sjpeg/libsjpeg.pc.in +11 -0
- data/ext/sjpeg/score_7.cc +6220 -0
- data/ext/sjpeg/sjpeg.h +353 -0
- data/ext/sjpeg/sjpegi.h +427 -0
- data/ext/sjpeg/yuv_convert.cc +698 -0
- data/lib/sjpeg/version.rb +3 -0
- data/lib/sjpeg.rb +35 -0
- data/sjpeg.gemspec +36 -0
- metadata +143 -0
@@ -0,0 +1,698 @@
|
|
1
|
+
// Copyright 2017 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Enhanced RGB->YUV conversion functions
|
16
|
+
//
|
17
|
+
// Author: Skal (pascal.massimino@gmail.com)
|
18
|
+
|
19
|
+
#include <math.h>
|
20
|
+
#include <stdlib.h>
|
21
|
+
#include <string.h>
|
22
|
+
#include <memory>
|
23
|
+
#include <vector>
|
24
|
+
using std::vector;
|
25
|
+
|
26
|
+
#define SJPEG_NEED_ASM_HEADERS
|
27
|
+
#include "sjpegi.h"
|
28
|
+
|
29
|
+
namespace sjpeg {
|
30
|
+
|
31
|
+
// We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
|
32
|
+
// banding sometimes. Better use extra precision.
|
33
|
+
#define SFIX 2 // fixed-point precision of RGB and Y/W
|
34
|
+
#define SHALF (1 << SFIX >> 1)
|
35
|
+
#define MAX_Y_T ((256 << SFIX) - 1)
|
36
|
+
typedef int16_t fixed_t; // signed type with extra SFIX precision for UV
|
37
|
+
typedef uint16_t fixed_y_t; // unsigned type with extra SFIX precision for W
|
38
|
+
|
39
|
+
// Clamp 'y' into the valid fixed-point luma range [0, MAX_Y_T].
static fixed_y_t clip_y(int y) {
  if (y < 0) return 0;
  if (y > MAX_Y_T) return MAX_Y_T;
  return (fixed_y_t)y;
}
|
42
|
+
|
43
|
+
////////////////////////////////////////////////////////////////////////////////
|
44
|
+
// Helper functions for Y/U/V fixed-point calculations.
|
45
|
+
|
46
|
+
// The following functions convert r/g/b values in SFIX fixed-point precision
// to 8b values, clipped:
#define YUV_FIX 16                  // fixed-point precision of the RGB->YUV weights
#define TFIX (YUV_FIX + SFIX)       // total precision of the weighted sums
#define TROUNDER (1 << TFIX >> 1)   // rounding constant at TFIX precision
|
51
|
+
|
52
|
+
// Clamp 'v' into the 8-bit range [0, 255].
static uint8_t clip_8b(int v) {
  if (v < 0) return 0u;
  if (v > 255) return 255u;
  return (uint8_t)v;
}
|
55
|
+
|
56
|
+
// RGB (SFIX precision) -> 8-bit luma, weighted sum at TFIX precision.
static uint8_t ConvertRGBToY(int r, int g, int b) {
  int luma = 19595 * r;
  luma += 38469 * g;
  luma += 7471 * b;
  return clip_8b((luma + TROUNDER) >> TFIX);
}
|
60
|
+
|
61
|
+
// RGB (SFIX precision) -> 8-bit U (Cb), centered at 128.
static uint8_t ConvertRGBToU(int r, int g, int b) {
  int u = 32768 * b;
  u -= 11058 * r;
  u -= 21709 * g;
  return clip_8b(128 + ((u + TROUNDER) >> TFIX));
}
|
65
|
+
|
66
|
+
// RGB (SFIX precision) -> 8-bit V (Cr), centered at 128.
static uint8_t ConvertRGBToV(int r, int g, int b) {
  int v = 32768 * r;
  v -= 27439 * g;
  v -= 5328 * b;
  return clip_8b(128 + ((v + TROUNDER) >> TFIX));
}
|
70
|
+
|
71
|
+
// convert to luma using 16b precision:
|
72
|
+
static void ConvertRowToY(const uint8_t* row, int w, uint8_t* const dst) {
|
73
|
+
for (int i = 0; i < w; i += 1, row += 3) {
|
74
|
+
const int r = row[0], g = row[1], b = row[2];
|
75
|
+
const int y = 19595 * r + 38469 * g + 7471 * b;
|
76
|
+
dst[i] = (y + (1 << YUV_FIX >> 1)) >> YUV_FIX;
|
77
|
+
}
|
78
|
+
}
|
79
|
+
|
80
|
+
static void ConvertRowToUV(const uint8_t* row1, const uint8_t* row2,
|
81
|
+
int w, uint8_t* u, uint8_t* v) {
|
82
|
+
for (int i = 0; i < (w & ~1); i += 2, row1 += 6, row2 += 6) {
|
83
|
+
const int r = row1[0] + row1[3] + row2[0] + row2[3];
|
84
|
+
const int g = row1[1] + row1[4] + row2[1] + row2[4];
|
85
|
+
const int b = row1[2] + row1[5] + row2[2] + row2[5];
|
86
|
+
*u++ = ConvertRGBToU(r, g, b);
|
87
|
+
*v++ = ConvertRGBToV(r, g, b);
|
88
|
+
}
|
89
|
+
if (w & 1) {
|
90
|
+
const int r = 2 * (row1[0] + row2[0]);
|
91
|
+
const int g = 2 * (row1[1] + row2[1]);
|
92
|
+
const int b = 2 * (row1[2] + row2[2]);
|
93
|
+
*u++ = ConvertRGBToU(r, g, b);
|
94
|
+
*v++ = ConvertRGBToV(r, g, b);
|
95
|
+
}
|
96
|
+
}
|
97
|
+
|
98
|
+
#undef TFIX
// Fix: the original wrote '#undef ROUNDER', but no macro named ROUNDER was
// ever defined — the rounding constant above is TROUNDER, which was being
// leaked past this point.
#undef TROUNDER
|
100
|
+
|
101
|
+
////////////////////////////////////////////////////////////////////////////////
|
102
|
+
// Sharp RGB->YUV conversion
|
103
|
+
|
104
|
+
// Maximum number of refinement passes for the iterative conversion, and the
// minimum image dimension below which the direct conversion is used instead.
static const int kNumIterations = 4;
static const int kMinDimensionIterativeConversion = 4;
|
106
|
+
|
107
|
+
// size of the interpolation table for linear-to-gamma
|
108
|
+
#define GAMMA_TABLE_SIZE 32
|
109
|
+
static uint32_t kLinearToGammaTab[GAMMA_TABLE_SIZE + 2];
|
110
|
+
#define GAMMA_TO_LINEAR_BITS 14
|
111
|
+
static uint32_t kGammaToLinearTab[MAX_Y_T + 1]; // size scales with Y_FIX
|
112
|
+
|
113
|
+
static void InitGammaTablesF(void) {
|
114
|
+
static bool done = false;
|
115
|
+
assert(2 * GAMMA_TO_LINEAR_BITS < 32); // we use uint32_t intermediate values
|
116
|
+
if (!done) {
|
117
|
+
int v;
|
118
|
+
const double norm = 1. / MAX_Y_T;
|
119
|
+
const double scale = 1. / GAMMA_TABLE_SIZE;
|
120
|
+
const double a = 0.099;
|
121
|
+
const double thresh = 0.018;
|
122
|
+
const double gamma = 1. / 0.45;
|
123
|
+
const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
|
124
|
+
for (v = 0; v <= MAX_Y_T; ++v) {
|
125
|
+
const double g = norm * v;
|
126
|
+
double value;
|
127
|
+
if (g <= thresh * 4.5) {
|
128
|
+
value = g / 4.5;
|
129
|
+
} else {
|
130
|
+
const double a_rec = 1. / (1. + a);
|
131
|
+
value = pow(a_rec * (g + a), gamma);
|
132
|
+
}
|
133
|
+
kGammaToLinearTab[v] = static_cast<uint32_t>(value * final_scale + .5);
|
134
|
+
}
|
135
|
+
for (v = 0; v <= GAMMA_TABLE_SIZE; ++v) {
|
136
|
+
const double g = scale * v;
|
137
|
+
double value;
|
138
|
+
if (g <= thresh) {
|
139
|
+
value = 4.5 * g;
|
140
|
+
} else {
|
141
|
+
value = (1. + a) * pow(g, 1. / gamma) - a;
|
142
|
+
}
|
143
|
+
// we already incorporate the 1/2 rounding constant here
|
144
|
+
kLinearToGammaTab[v] =
|
145
|
+
static_cast<uint32_t>(MAX_Y_T * value)
|
146
|
+
+ (1 << GAMMA_TO_LINEAR_BITS >> 1);
|
147
|
+
}
|
148
|
+
// to prevent small rounding errors to cause read-overflow:
|
149
|
+
kLinearToGammaTab[GAMMA_TABLE_SIZE + 1] =
|
150
|
+
kLinearToGammaTab[GAMMA_TABLE_SIZE];
|
151
|
+
done = true;
|
152
|
+
}
|
153
|
+
}
|
154
|
+
|
155
|
+
// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
|
156
|
+
static uint32_t GammaToLinear(int v) { return kGammaToLinearTab[v]; }
|
157
|
+
|
158
|
+
// Linear value (GAMMA_TO_LINEAR_BITS fractional precision) -> gamma-encoded,
// by linear interpolation between two kLinearToGammaTab entries.
static uint32_t LinearToGamma(uint32_t value) {
  const uint32_t pos = value * GAMMA_TABLE_SIZE;
  const uint32_t idx = pos >> GAMMA_TO_LINEAR_BITS;             // table slot
  const uint32_t frac = pos & ((1u << GAMMA_TO_LINEAR_BITS) - 1);  // remainder
  const uint32_t lo = kLinearToGammaTab[idx + 0];
  const uint32_t hi = kLinearToGammaTab[idx + 1];  // note: hi >= lo
  // rounding is already baked into the table entries
  return lo + (((hi - lo) * frac) >> GAMMA_TO_LINEAR_BITS);
}
|
172
|
+
|
173
|
+
//------------------------------------------------------------------------------
|
174
|
+
|
175
|
+
// Move each dst luma sample by the (ref - src) delta, clamped to the valid
// range. Returns the sum of absolute deltas, used as a convergence measure.
static uint64_t SharpUpdateY_C(const uint16_t* ref, const uint16_t* src,
                               uint16_t* dst, int len) {
  uint64_t total = 0;
  for (int i = 0; i < len; ++i) {
    const int delta = ref[i] - src[i];
    dst[i] = clip_y(static_cast<int>(dst[i]) + delta);
    total += static_cast<uint64_t>(abs(delta));
  }
  return total;
}
|
186
|
+
|
187
|
+
static void SharpUpdateRGB_C(const int16_t* ref, const int16_t* src,
|
188
|
+
int16_t* dst, int len) {
|
189
|
+
for (int i = 0; i < len; ++i) {
|
190
|
+
const int diff_uv = ref[i] - src[i];
|
191
|
+
dst[i] += diff_uv;
|
192
|
+
}
|
193
|
+
}
|
194
|
+
|
195
|
+
static void SharpFilterRow_C(const int16_t* A, const int16_t* B, int len,
|
196
|
+
const uint16_t* best_y, uint16_t* out) {
|
197
|
+
for (int i = 0; i < len; ++i, ++A, ++B) {
|
198
|
+
const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
|
199
|
+
const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
|
200
|
+
out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
|
201
|
+
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
|
202
|
+
}
|
203
|
+
}
|
204
|
+
|
205
|
+
#if defined(SJPEG_USE_SSE2)

#define LOAD_16(P) (_mm_loadu_si128(reinterpret_cast<const __m128i*>(P)))
#define STORE_16(P, V) (_mm_storeu_si128(reinterpret_cast<__m128i*>(P), (V)))

// SSE2 version of SharpUpdateY_C: 8 samples per iteration, scalar tail.
static uint64_t SharpUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
                                  uint16_t* dst, int len) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i max_y = _mm_set1_epi16(MAX_Y_T);
  const __m128i one = _mm_set1_epi16(1);
  __m128i acc = zero;
  int i = 0;
  for (; i + 8 <= len; i += 8) {
    const __m128i ref_v = LOAD_16(ref + i);
    const __m128i src_v = LOAD_16(src + i);
    const __m128i dst_v = LOAD_16(dst + i);
    const __m128i delta = _mm_sub_epi16(ref_v, src_v);   // diff_y
    const __m128i neg = _mm_cmpgt_epi16(zero, delta);    // -1 where delta < 0
    const __m128i new_y = _mm_add_epi16(dst_v, delta);
    const __m128i sign = _mm_or_si128(neg, one);         // -1 or +1
    const __m128i clipped = _mm_max_epi16(_mm_min_epi16(new_y, max_y), zero);
    // madd(delta, sign) == pairwise sums of |delta|, widened to 32 bits
    const __m128i abs_d = _mm_madd_epi16(delta, sign);
    STORE_16(dst + i, clipped);
    acc = _mm_add_epi32(acc, abs_d);
  }
  uint32_t lanes[4];
  STORE_16(lanes, acc);
  uint64_t diff = lanes[3] + lanes[2] + lanes[1] + lanes[0];
  for (; i < len; ++i) {  // leftover samples
    const int diff_y = ref[i] - src[i];
    dst[i] = clip_y(static_cast<int>(dst[i]) + diff_y);
    diff += (uint64_t)abs(diff_y);
  }
  return diff;
}

// SSE2 version of SharpUpdateRGB_C.
static void SharpUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
                                int16_t* dst, int len) {
  int i = 0;
  for (; i + 8 <= len; i += 8) {
    const __m128i ref_v = LOAD_16(ref + i);
    const __m128i src_v = LOAD_16(src + i);
    const __m128i dst_v = LOAD_16(dst + i);
    const __m128i delta = _mm_sub_epi16(ref_v, src_v);   // diff_uv
    STORE_16(dst + i, _mm_add_epi16(dst_v, delta));      // new_uv
  }
  for (; i < len; ++i) {
    dst[i] += ref[i] - src[i];
  }
}

// SSE2 version of SharpFilterRow_C, using a two-stage shift so every
// intermediate fits in 16 bits:
//   (9*A0 + 3*A1 + 3*B0 + B1 + 8) >> 4
//     ~= (A0 + ((2*(A1+B0) + (A0+A1+B0+B1+8)) >> 3)) >> 1
// NOTE(review): the staged shifts can differ from the exact one-shot formula
// (used in the scalar tail) in the last bit — behavior preserved as-is.
static void SharpFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
                                const uint16_t* best_y, uint16_t* out) {
  const __m128i kEight = _mm_set1_epi16(8);
  const __m128i max_y = _mm_set1_epi16(MAX_Y_T);
  const __m128i zero = _mm_setzero_si128();
  int i = 0;
  for (; i + 8 <= len; i += 8) {
    const __m128i a0 = LOAD_16(A + i + 0);
    const __m128i a1 = LOAD_16(A + i + 1);
    const __m128i b0 = LOAD_16(B + i + 0);
    const __m128i b1 = LOAD_16(B + i + 1);
    const __m128i a0b1 = _mm_add_epi16(a0, b1);
    const __m128i a1b0 = _mm_add_epi16(a1, b0);
    const __m128i sum = _mm_add_epi16(a0b1, a1b0);           // A0+A1+B0+B1
    const __m128i sum8 = _mm_add_epi16(sum, kEight);
    const __m128i a0b1x2 = _mm_add_epi16(a0b1, a0b1);        // 2*(A0+B1)
    const __m128i a1b0x2 = _mm_add_epi16(a1b0, a1b0);        // 2*(A1+B0)
    const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1x2, sum8), 3);
    const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0x2, sum8), 3);
    const __m128i v0 = _mm_srai_epi16(_mm_add_epi16(c1, a0), 1);
    const __m128i v1 = _mm_srai_epi16(_mm_add_epi16(c0, a1), 1);
    // interleave even/odd outputs and add the luma
    const __m128i lo = _mm_unpacklo_epi16(v0, v1);
    const __m128i hi = _mm_unpackhi_epi16(v0, v1);
    const __m128i y_lo = LOAD_16(best_y + 2 * i + 0);
    const __m128i y_hi = LOAD_16(best_y + 2 * i + 8);
    const __m128i out_lo = _mm_add_epi16(y_lo, lo);
    const __m128i out_hi = _mm_add_epi16(y_hi, hi);
    STORE_16(out + 2 * i + 0,
             _mm_max_epi16(_mm_min_epi16(out_lo, max_y), zero));
    STORE_16(out + 2 * i + 8,
             _mm_max_epi16(_mm_min_epi16(out_hi, max_y), zero));
  }
  for (; i < len; ++i) {  // scalar tail, exact formula with shared terms
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int sum8 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + sum8) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + sum8) >> 4;
    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
  }
}
#undef STORE_16
#undef LOAD_16
|
310
|
+
|
311
|
+
#elif defined(SJPEG_USE_NEON)
|
312
|
+
|
313
|
+
static uint64_t SharpUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
|
314
|
+
uint16_t* dst, int len) {
|
315
|
+
int i;
|
316
|
+
const int16x8_t zero = vdupq_n_s16(0);
|
317
|
+
const int16x8_t max = vdupq_n_s16(MAX_Y_T);
|
318
|
+
uint64x2_t sum = vdupq_n_u64(0);
|
319
|
+
|
320
|
+
for (i = 0; i + 8 <= len; i += 8) {
|
321
|
+
const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
|
322
|
+
const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
|
323
|
+
const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
|
324
|
+
const int16x8_t D = vsubq_s16(A, B); // diff_y
|
325
|
+
const int16x8_t F = vaddq_s16(C, D); // new_y
|
326
|
+
const uint16x8_t H =
|
327
|
+
vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
|
328
|
+
const int16x8_t I = vabsq_s16(D); // abs(diff_y)
|
329
|
+
vst1q_u16(dst + i, H);
|
330
|
+
sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
|
331
|
+
}
|
332
|
+
uint64_t diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
|
333
|
+
for (; i < len; ++i) {
|
334
|
+
const int diff_y = ref[i] - src[i];
|
335
|
+
const int new_y = static_cast<int>(dst[i]) + diff_y;
|
336
|
+
dst[i] = clip_y(new_y);
|
337
|
+
diff += static_cast<uint64_t>(abs(diff_y));
|
338
|
+
}
|
339
|
+
return diff;
|
340
|
+
}
|
341
|
+
|
342
|
+
static void SharpUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
|
343
|
+
int16_t* dst, int len) {
|
344
|
+
int i;
|
345
|
+
for (i = 0; i + 8 <= len; i += 8) {
|
346
|
+
const int16x8_t A = vld1q_s16(ref + i);
|
347
|
+
const int16x8_t B = vld1q_s16(src + i);
|
348
|
+
const int16x8_t C = vld1q_s16(dst + i);
|
349
|
+
const int16x8_t D = vsubq_s16(A, B); // diff_uv
|
350
|
+
const int16x8_t E = vaddq_s16(C, D); // new_uv
|
351
|
+
vst1q_s16(dst + i, E);
|
352
|
+
}
|
353
|
+
for (; i < len; ++i) {
|
354
|
+
const int diff_uv = ref[i] - src[i];
|
355
|
+
dst[i] += diff_uv;
|
356
|
+
}
|
357
|
+
}
|
358
|
+
|
359
|
+
static void SharpFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
|
360
|
+
const uint16_t* best_y, uint16_t* out) {
|
361
|
+
int i;
|
362
|
+
const int16x8_t max = vdupq_n_s16(MAX_Y_T);
|
363
|
+
const int16x8_t zero = vdupq_n_s16(0);
|
364
|
+
for (i = 0; i + 8 <= len; i += 8) {
|
365
|
+
const int16x8_t a0 = vld1q_s16(A + i + 0);
|
366
|
+
const int16x8_t a1 = vld1q_s16(A + i + 1);
|
367
|
+
const int16x8_t b0 = vld1q_s16(B + i + 0);
|
368
|
+
const int16x8_t b1 = vld1q_s16(B + i + 1);
|
369
|
+
const int16x8_t a0b1 = vaddq_s16(a0, b1);
|
370
|
+
const int16x8_t a1b0 = vaddq_s16(a1, b0);
|
371
|
+
const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
|
372
|
+
const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
|
373
|
+
const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
|
374
|
+
const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
|
375
|
+
const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
|
376
|
+
const int16x8_t d0 = vaddq_s16(c1, a0);
|
377
|
+
const int16x8_t d1 = vaddq_s16(c0, a1);
|
378
|
+
const int16x8_t e0 = vrshrq_n_s16(d0, 1);
|
379
|
+
const int16x8_t e1 = vrshrq_n_s16(d1, 1);
|
380
|
+
const int16x8x2_t f = vzipq_s16(e0, e1);
|
381
|
+
const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
|
382
|
+
const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
|
383
|
+
const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
|
384
|
+
const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
|
385
|
+
const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
|
386
|
+
const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
|
387
|
+
vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
|
388
|
+
vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
|
389
|
+
}
|
390
|
+
for (; i < len; ++i) {
|
391
|
+
const int a0b1 = A[i + 0] + B[i + 1];
|
392
|
+
const int a1b0 = A[i + 1] + B[i + 0];
|
393
|
+
const int a0a1b0b1 = a0b1 + a1b0 + 8;
|
394
|
+
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
|
395
|
+
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
|
396
|
+
out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
|
397
|
+
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
|
398
|
+
}
|
399
|
+
}
|
400
|
+
|
401
|
+
#endif // SJPEG_USE_NEON
|
402
|
+
|
403
|
+
static uint64_t (*kSharpUpdateY)(const uint16_t* src, const uint16_t* ref,
|
404
|
+
uint16_t* dst, int len);
|
405
|
+
static void (*kSharpUpdateRGB)(const int16_t* src, const int16_t* ref,
|
406
|
+
int16_t* dst, int len);
|
407
|
+
static void (*kSharpFilterRow)(const int16_t* A, const int16_t* B,
|
408
|
+
int len, const uint16_t* best_y, uint16_t* out);
|
409
|
+
|
410
|
+
static void InitFunctionPointers() {
|
411
|
+
static bool done = false;
|
412
|
+
if (!done) {
|
413
|
+
kSharpUpdateY = SharpUpdateY_C;
|
414
|
+
kSharpUpdateRGB = SharpUpdateRGB_C;
|
415
|
+
kSharpFilterRow = SharpFilterRow_C;
|
416
|
+
#if defined(SJPEG_USE_SSE2)
|
417
|
+
if (sjpeg::SupportsSSE2()) {
|
418
|
+
kSharpUpdateY = SharpUpdateY_SSE2;
|
419
|
+
kSharpUpdateRGB = SharpUpdateRGB_SSE2;
|
420
|
+
kSharpFilterRow = SharpFilterRow_SSE2;
|
421
|
+
}
|
422
|
+
#endif
|
423
|
+
#if defined(SJPEG_USE_NEON)
|
424
|
+
if (sjpeg::SupportsNEON()) {
|
425
|
+
kSharpUpdateY = SharpUpdateY_NEON;
|
426
|
+
kSharpUpdateRGB = SharpUpdateRGB_NEON;
|
427
|
+
kSharpFilterRow = SharpFilterRow_NEON;
|
428
|
+
}
|
429
|
+
#endif
|
430
|
+
done = true;
|
431
|
+
}
|
432
|
+
}
|
433
|
+
|
434
|
+
//------------------------------------------------------------------------------
|
435
|
+
|
436
|
+
// Weighted gray value at YUV_FIX precision, rounded. The weights
// (13933/46871/4732 out of 65536) correspond to the Rec. 709 luma weights.
static uint32_t RGBToGray(uint32_t r, uint32_t g, uint32_t b) {
  const uint32_t gray =
      13933 * r + 46871 * g + 4732 * b + (1u << YUV_FIX >> 1);
  return gray >> YUV_FIX;
}
|
440
|
+
|
441
|
+
// Average four gamma-encoded samples in linear space, then re-encode.
static uint32_t ScaleDown(int a, int b, int c, int d) {
  const uint32_t sum = GammaToLinear(a) + GammaToLinear(b)
                     + GammaToLinear(c) + GammaToLinear(d);
  return LinearToGamma((sum + 2) >> 2);
}
|
448
|
+
|
449
|
+
static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
|
450
|
+
fixed_t* dst, size_t uv_w) {
|
451
|
+
for (size_t i = 0; i < uv_w; ++i) {
|
452
|
+
const uint32_t r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
|
453
|
+
src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
|
454
|
+
const uint32_t g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
|
455
|
+
src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
|
456
|
+
const uint32_t b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
|
457
|
+
src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
|
458
|
+
const int W = RGBToGray(r, g, b);
|
459
|
+
dst[0 * uv_w] = (fixed_t)(r - W);
|
460
|
+
dst[1 * uv_w] = (fixed_t)(g - W);
|
461
|
+
dst[2 * uv_w] = (fixed_t)(b - W);
|
462
|
+
dst += 1;
|
463
|
+
src1 += 2;
|
464
|
+
src2 += 2;
|
465
|
+
}
|
466
|
+
}
|
467
|
+
|
468
|
+
static void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
|
469
|
+
for (int i = 0; i < w; ++i) {
|
470
|
+
const uint32_t R = GammaToLinear(src[0 * w + i]);
|
471
|
+
const uint32_t G = GammaToLinear(src[1 * w + i]);
|
472
|
+
const uint32_t B = GammaToLinear(src[2 * w + i]);
|
473
|
+
const uint32_t Y = RGBToGray(R, G, B);
|
474
|
+
dst[i] = (fixed_y_t)LinearToGamma(Y);
|
475
|
+
}
|
476
|
+
}
|
477
|
+
|
478
|
+
static void StoreGray(const fixed_y_t* const rgb, fixed_y_t* const y, int w) {
|
479
|
+
for (int i = 0; i < w; ++i) {
|
480
|
+
y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
|
481
|
+
}
|
482
|
+
}
|
483
|
+
|
484
|
+
//------------------------------------------------------------------------------
|
485
|
+
|
486
|
+
// 3:1 rounded blend of two chroma samples, added to the luma W0, clipped.
static fixed_y_t Filter2(int A, int B, int W0) {
  const int blended = (3 * A + B + 2) >> 2;
  return clip_y(blended + W0);
}
|
490
|
+
|
491
|
+
//------------------------------------------------------------------------------
|
492
|
+
|
493
|
+
// Promote an 8-bit sample to SFIX fixed-point precision, with the half-unit
// bias OR'ed into the low bits.
static fixed_y_t UpLift(uint8_t a) {  // 8bit -> SFIX
  return (fixed_y_t)(((fixed_y_t)a << SFIX) | SHALF);
}
|
496
|
+
|
497
|
+
static void ImportOneRow(const uint8_t* const rgb, int pic_width,
|
498
|
+
fixed_y_t* const dst) {
|
499
|
+
const int w = (pic_width + 1) & ~1;
|
500
|
+
for (int i = 0; i < pic_width; ++i) {
|
501
|
+
const int off = i * 3;
|
502
|
+
dst[i + 0 * w] = UpLift(rgb[off + 0]);
|
503
|
+
dst[i + 1 * w] = UpLift(rgb[off + 1]);
|
504
|
+
dst[i + 2 * w] = UpLift(rgb[off + 2]);
|
505
|
+
}
|
506
|
+
if (pic_width & 1) { // replicate rightmost pixel
|
507
|
+
dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
|
508
|
+
dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
|
509
|
+
dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
|
510
|
+
}
|
511
|
+
}
|
512
|
+
|
513
|
+
static void InterpolateTwoRows(const fixed_y_t* const best_y,
|
514
|
+
const fixed_t* prev_uv,
|
515
|
+
const fixed_t* cur_uv,
|
516
|
+
const fixed_t* next_uv,
|
517
|
+
int w,
|
518
|
+
fixed_y_t* out1, fixed_y_t* out2) {
|
519
|
+
const int uv_w = w >> 1;
|
520
|
+
const int len = (w - 1) >> 1; // length to filter
|
521
|
+
for (int k = 3; k > 0; --k) { // process each R/G/B segments in turn
|
522
|
+
// special boundary case for i==0
|
523
|
+
out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
|
524
|
+
out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
|
525
|
+
|
526
|
+
kSharpFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
|
527
|
+
kSharpFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
|
528
|
+
|
529
|
+
// special boundary case for i == w - 1 when w is even
|
530
|
+
if (!(w & 1)) {
|
531
|
+
out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
|
532
|
+
best_y[w - 1 + 0]);
|
533
|
+
out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
|
534
|
+
best_y[w - 1 + w]);
|
535
|
+
}
|
536
|
+
out1 += w;
|
537
|
+
out2 += w;
|
538
|
+
prev_uv += uv_w;
|
539
|
+
cur_uv += uv_w;
|
540
|
+
next_uv += uv_w;
|
541
|
+
}
|
542
|
+
}
|
543
|
+
|
544
|
+
static void ConvertWRGBToYUV(const fixed_y_t* best_y,
|
545
|
+
const fixed_t* best_uv,
|
546
|
+
int width, int height,
|
547
|
+
uint8_t* y_plane,
|
548
|
+
uint8_t* u_plane, uint8_t* v_plane) {
|
549
|
+
const int w = (width + 1) & ~1;
|
550
|
+
const int h = (height + 1) & ~1;
|
551
|
+
const int uv_w = w >> 1;
|
552
|
+
const int uv_h = h >> 1;
|
553
|
+
for (int j = 0; j < height; ++j) {
|
554
|
+
const int off = (j >> 1) * 3 * uv_w;
|
555
|
+
for (int i = 0; i < width; ++i) {
|
556
|
+
const int W = best_y[i + j * w];
|
557
|
+
const int r = best_uv[off + (i >> 1) + 0 * uv_w] + W;
|
558
|
+
const int g = best_uv[off + (i >> 1) + 1 * uv_w] + W;
|
559
|
+
const int b = best_uv[off + (i >> 1) + 2 * uv_w] + W;
|
560
|
+
y_plane[i] = ConvertRGBToY(r, g, b);
|
561
|
+
}
|
562
|
+
y_plane += width;
|
563
|
+
}
|
564
|
+
for (int j = 0; j < uv_h; ++j) {
|
565
|
+
for (int i = 0; i < uv_w; ++i) {
|
566
|
+
const int off = i + j * 3 * uv_w;
|
567
|
+
const int r = best_uv[off + 0 * uv_w];
|
568
|
+
const int g = best_uv[off + 1 * uv_w];
|
569
|
+
const int b = best_uv[off + 2 * uv_w];
|
570
|
+
u_plane[i] = ConvertRGBToU(r, g, b);
|
571
|
+
v_plane[i] = ConvertRGBToV(r, g, b);
|
572
|
+
}
|
573
|
+
u_plane += uv_w;
|
574
|
+
v_plane += uv_w;
|
575
|
+
}
|
576
|
+
}
|
577
|
+
|
578
|
+
//------------------------------------------------------------------------------
|
579
|
+
// Main function
|
580
|
+
|
581
|
+
static void PreprocessARGB(const uint8_t* const rgb,
|
582
|
+
int width, int height, size_t stride,
|
583
|
+
uint8_t* y_plane,
|
584
|
+
uint8_t* u_plane, uint8_t* v_plane) {
|
585
|
+
// we expand the right/bottom border if needed
|
586
|
+
const int w = (width + 1) & ~1;
|
587
|
+
const int h = (height + 1) & ~1;
|
588
|
+
const int uv_w = w >> 1;
|
589
|
+
const int uv_h = h >> 1;
|
590
|
+
uint64_t prev_diff_y_sum = ~0;
|
591
|
+
|
592
|
+
InitGammaTablesF();
|
593
|
+
InitFunctionPointers();
|
594
|
+
|
595
|
+
// TODO(skal): allocate one big memory chunk instead.
|
596
|
+
vector<fixed_y_t> tmp_buffer(w * 3 * 2);
|
597
|
+
vector<fixed_y_t> best_y(w * h);
|
598
|
+
vector<fixed_y_t> target_y(w * h);
|
599
|
+
vector<fixed_y_t> best_rgb_y(w * 2);
|
600
|
+
vector<fixed_t> best_uv(uv_w * 3 * uv_h);
|
601
|
+
vector<fixed_t> target_uv(uv_w * 3 * uv_h);
|
602
|
+
vector<fixed_t> best_rgb_uv(uv_w * 3 * 1);
|
603
|
+
const uint64_t diff_y_threshold = static_cast<uint64_t>(3.0 * w * h);
|
604
|
+
|
605
|
+
assert(width >= kMinDimensionIterativeConversion);
|
606
|
+
assert(height >= kMinDimensionIterativeConversion);
|
607
|
+
|
608
|
+
// Import RGB samples to W/RGB representation.
|
609
|
+
for (int j = 0; j < height; j += 2) {
|
610
|
+
const int is_last_row = (j == height - 1);
|
611
|
+
fixed_y_t* const src1 = &tmp_buffer[0 * w];
|
612
|
+
fixed_y_t* const src2 = &tmp_buffer[3 * w];
|
613
|
+
const int rgb_off = j * stride;
|
614
|
+
const int y_off = j * w;
|
615
|
+
const int uv_off = (j >> 1) * 3 * uv_w;
|
616
|
+
|
617
|
+
// prepare two rows of input
|
618
|
+
ImportOneRow(rgb + rgb_off, width, src1);
|
619
|
+
if (!is_last_row) {
|
620
|
+
ImportOneRow(rgb + rgb_off + stride, width, src2);
|
621
|
+
} else {
|
622
|
+
memcpy(src2, src1, 3 * w * sizeof(*src2));
|
623
|
+
}
|
624
|
+
StoreGray(src1, &best_y[y_off + 0], w);
|
625
|
+
StoreGray(src2, &best_y[y_off + w], w);
|
626
|
+
UpdateW(src1, &target_y[y_off + 0], w);
|
627
|
+
UpdateW(src2, &target_y[y_off + w], w);
|
628
|
+
UpdateChroma(src1, src2, &target_uv[uv_off], uv_w);
|
629
|
+
memcpy(&best_uv[uv_off], &target_uv[uv_off], 3 * uv_w * sizeof(best_uv[0]));
|
630
|
+
}
|
631
|
+
|
632
|
+
// Iterate and resolve clipping conflicts.
|
633
|
+
for (int iter = 0; iter < kNumIterations; ++iter) {
|
634
|
+
const fixed_t* cur_uv = &best_uv[0];
|
635
|
+
const fixed_t* prev_uv = &best_uv[0];
|
636
|
+
uint64_t diff_y_sum = 0;
|
637
|
+
|
638
|
+
for (int j = 0; j < h; j += 2) {
|
639
|
+
const int uv_off = (j >> 1) * 3 * uv_w;
|
640
|
+
fixed_y_t* const src1 = &tmp_buffer[0 * w];
|
641
|
+
fixed_y_t* const src2 = &tmp_buffer[3 * w];
|
642
|
+
const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
|
643
|
+
InterpolateTwoRows(&best_y[j * w], prev_uv, cur_uv, next_uv,
|
644
|
+
w, src1, src2);
|
645
|
+
prev_uv = cur_uv;
|
646
|
+
cur_uv = next_uv;
|
647
|
+
|
648
|
+
UpdateW(src1, &best_rgb_y[0 * w], w);
|
649
|
+
UpdateW(src2, &best_rgb_y[1 * w], w);
|
650
|
+
UpdateChroma(src1, src2, &best_rgb_uv[0], uv_w);
|
651
|
+
|
652
|
+
// update two rows of Y and one row of RGB
|
653
|
+
diff_y_sum += kSharpUpdateY(&target_y[j * w],
|
654
|
+
&best_rgb_y[0], &best_y[j * w], 2 * w);
|
655
|
+
kSharpUpdateRGB(&target_uv[uv_off],
|
656
|
+
&best_rgb_uv[0], &best_uv[uv_off], 3 * uv_w);
|
657
|
+
}
|
658
|
+
// test exit condition
|
659
|
+
if (iter > 0) {
|
660
|
+
if (diff_y_sum < diff_y_threshold) break;
|
661
|
+
if (diff_y_sum > prev_diff_y_sum) break;
|
662
|
+
}
|
663
|
+
prev_diff_y_sum = diff_y_sum;
|
664
|
+
}
|
665
|
+
// final reconstruction
|
666
|
+
ConvertWRGBToYUV(&best_y[0], &best_uv[0], width, height,
|
667
|
+
y_plane, u_plane, v_plane);
|
668
|
+
}
|
669
|
+
|
670
|
+
} // namespace sjpeg
|
671
|
+
|
672
|
+
////////////////////////////////////////////////////////////////////////////////
|
673
|
+
// Entry point
|
674
|
+
|
675
|
+
void sjpeg::ApplySharpYUVConversion(const uint8_t* const rgb,
|
676
|
+
int W, int H, int stride,
|
677
|
+
uint8_t* y_plane,
|
678
|
+
uint8_t* u_plane, uint8_t* v_plane) {
|
679
|
+
if (W <= kMinDimensionIterativeConversion ||
|
680
|
+
H <= kMinDimensionIterativeConversion) {
|
681
|
+
const int uv_w = (W + 1) >> 1;
|
682
|
+
for (int y = 0; y < H; y += 2) {
|
683
|
+
const uint8_t* const rgb1 = rgb + y * stride;
|
684
|
+
const uint8_t* const rgb2 = (y < H - 1) ? rgb1 + stride : rgb1;
|
685
|
+
ConvertRowToY(rgb1, W, &y_plane[y * W]);
|
686
|
+
if (y < H - 1) {
|
687
|
+
ConvertRowToY(rgb2, W, &y_plane[(y + 1) * W]);
|
688
|
+
}
|
689
|
+
ConvertRowToUV(rgb1, rgb2, W,
|
690
|
+
&u_plane[(y >> 1) * uv_w],
|
691
|
+
&v_plane[(y >> 1) * uv_w]);
|
692
|
+
}
|
693
|
+
} else {
|
694
|
+
PreprocessARGB(rgb, W, H, stride, y_plane, u_plane, v_plane);
|
695
|
+
}
|
696
|
+
}
|
697
|
+
|
698
|
+
////////////////////////////////////////////////////////////////////////////////
|