sjpeg 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +40 -0
- data/LICENSE.txt +21 -0
- data/README.md +40 -0
- data/Rakefile +11 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/ext/sjpeg/bit_writer.cc +122 -0
- data/ext/sjpeg/bit_writer.h +169 -0
- data/ext/sjpeg/colors_rgb.cc +691 -0
- data/ext/sjpeg/dichotomy.cc +290 -0
- data/ext/sjpeg/enc.cc +2132 -0
- data/ext/sjpeg/extconf.rb +3 -0
- data/ext/sjpeg/fdct.cc +627 -0
- data/ext/sjpeg/headers.cc +218 -0
- data/ext/sjpeg/jpeg_tools.cc +274 -0
- data/ext/sjpeg/libsjpeg.pc.in +11 -0
- data/ext/sjpeg/score_7.cc +6220 -0
- data/ext/sjpeg/sjpeg.h +353 -0
- data/ext/sjpeg/sjpegi.h +427 -0
- data/ext/sjpeg/yuv_convert.cc +698 -0
- data/lib/sjpeg/version.rb +3 -0
- data/lib/sjpeg.rb +35 -0
- data/sjpeg.gemspec +36 -0
- metadata +143 -0
data/ext/sjpeg/enc.cc
ADDED
@@ -0,0 +1,2132 @@
// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Fast and simple JPEG encoder
//
// Author: Skal (pascal.massimino@gmail.com)

#include <stdlib.h>
#include <math.h>
#include <float.h>    // for FLT_MAX
#include <stdint.h>

#define SJPEG_NEED_ASM_HEADERS
#include "sjpegi.h"

using namespace sjpeg;

// Some general default values:
static const float kDefaultQuality = 75.f;
static const int kDefaultMethod = 4;
// Rounding bias for AC coefficients, as 8bit fixed point.
// A default value 0x78 leans toward filesize reduction.
static const int32_t kDefaultBias = 0x78;
// for adaptive quantization:
static const int kDefaultDeltaMaxLuma = 12;
static const int kDefaultDeltaMaxChroma = 1;

// finer tuning of perceptual optimizations:

// Minimum average number of entries per bin required for performing histogram-
// based optimization. Below this limit, the channel's histogram is declared
// under-populated and the corresponding optimization is skipped.
static double kDensityThreshold = 0.5;
// Rejection limit on the correlation factor when extrapolating the distortion
// from histograms. If the least-square fit has a squared correlation factor
// less than this threshold, the corresponding quantization scale will be
// kept unchanged.
static double kCorrelationThreshold = 0.5;
// Bit-map of channels to omit during quantization matrix optimization.
// If the bit 'i + 8 * j' is set in this bit field, the matrix entry at
// position (i,j) will be kept unchanged during optimization.
// The default value is 0x103 = 1 + 2 + 256: the 3 entries in the top-left
// corner (with lowest frequency) are not optimized, since it can lead to
// visual degradation of smooth gradients.
static const uint64_t kOmittedChannels = 0x0000000000000103ULL;

////////////////////////////////////////////////////////////////////////////////

namespace sjpeg {

const uint8_t kZigzag[64] = {
   0,  1,  8, 16,  9,  2,  3, 10,
  17, 24, 32, 25, 18, 11,  4,  5,
  12, 19, 26, 33, 40, 48, 41, 34,
  27, 20, 13,  6,  7, 14, 21, 28,
  35, 42, 49, 56, 57, 50, 43, 36,
  29, 22, 15, 23, 30, 37, 44, 51,
  58, 59, 52, 45, 38, 31, 39, 46,
  53, 60, 61, 54, 47, 55, 62, 63,
};

const uint8_t kDefaultMatrices[2][64] = {
  // these are the default luma/chroma matrices (JPEG spec section K.1)
  { 16,  11,  10,  16,  24,  40,  51,  61,
    12,  12,  14,  19,  26,  58,  60,  55,
    14,  13,  16,  24,  40,  57,  69,  56,
    14,  17,  22,  29,  51,  87,  80,  62,
    18,  22,  37,  56,  68, 109, 103,  77,
    24,  35,  55,  64,  81, 104, 113,  92,
    49,  64,  78,  87, 103, 121, 120, 101,
    72,  92,  95,  98, 112, 100, 103,  99 },
  { 17,  18,  24,  47,  99,  99,  99,  99,
    18,  21,  26,  66,  99,  99,  99,  99,
    24,  26,  56,  99,  99,  99,  99,  99,
    47,  66,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99 }
};

float GetQFactor(float q) {
  // we use the same mapping as jpeg-6b, for coherency
  q = (q <= 0) ? 5000 : (q < 50) ? 5000 / q : (q < 100) ? 2 * (100 - q) : 0;
  // We floor-round to integer here just to preserve compatibility with jpeg6b.
  return floorf(q);
}
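The quality-to-scale mapping above follows jpeg-6b: quality 50 maps to factor 100 (the K.1 matrices are used as-is), qualities above 50 shrink the matrices linearly via `2 * (100 - q)`, and qualities below 50 inflate them as `5000 / q`. A minimal standalone sketch of the same mapping (illustration only, not part of the package):

```cpp
#include <cstdio>
#include <cmath>

// Same jpeg-6b-style mapping as GetQFactor() above.
static float QFactor(float q) {
  q = (q <= 0) ? 5000 : (q < 50) ? 5000 / q : (q < 100) ? 2 * (100 - q) : 0;
  return floorf(q);
}

int main() {
  // quality 50 -> 100% (matrices unchanged), 75 -> 50%, 90 -> 20%, 100 -> 0%
  printf("%.0f %.0f %.0f %.0f\n",
         QFactor(50), QFactor(75), QFactor(90), QFactor(100));  // 100 50 20 0
}
```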

void CopyQuantMatrix(const uint8_t in[64], uint8_t out[64]) {
  memcpy(out, in, 64 * sizeof(out[0]));
}

void SetQuantMatrix(const uint8_t in[64], float q_factor, uint8_t out[64]) {
  if (in == nullptr || out == nullptr) return;
  q_factor /= 100.f;
  for (size_t i = 0; i < 64; ++i) {
    const int v = static_cast<int>(in[i] * q_factor + .5f);
    // clamp to prevent illegal quantizer values
    out[i] = (v < 1) ? 1 : (v > 255) ? 255u : v;
  }
}

void SetMinQuantMatrix(const uint8_t m[64], uint8_t out[64], int tolerance) {
  assert(out != nullptr && m != nullptr);
  for (size_t i = 0; i < 64; ++i) {
    const int v = static_cast<int>(m[i] * (256 - tolerance) >> 8);
    out[i] = (v < 1) ? 1u : (v > 255) ? 255u : v;
  }
}

void SetDefaultMinQuantMatrix(uint8_t out[64]) {
  assert(out != nullptr);
  for (size_t i = 0; i < 64; ++i) out[i] = 1u;
}
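Taken together, these helpers derive the per-quality quantization matrices: scale the K.1 matrix by `GetQFactor(q) / 100`, rounding and clamping each entry to [1, 255]. A hedged usage sketch (assuming the `sjpeg` namespace exports these helpers, as the file suggests; `luma` is a hypothetical local):

```cpp
// Sketch: build the luma quant matrix for quality 80, i.e. the K.1 matrix
// scaled by GetQFactor(80) == 40 percent, each entry clamped to [1, 255].
uint8_t luma[64];
sjpeg::SetQuantMatrix(sjpeg::kDefaultMatrices[0], sjpeg::GetQFactor(80.f), luma);
// e.g. the DC entry becomes static_cast<int>(16 * 0.40f + .5f) == 6
```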

////////////////////////////////////////////////////////////////////////////////
// Default memory manager (singleton)

static struct DefaultMemory : public MemoryManager {
 public:
  virtual ~DefaultMemory() {}
  virtual void* Alloc(size_t size) { return malloc(size); }
  virtual void Free(void* const ptr) { free(ptr); }
} kDefaultMemory;
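The `MemoryManager` interface used here (virtual `Alloc`/`Free`, presumably declared in `sjpegi.h`) is the hook point for callers who want to track or pool the encoder's allocations. A minimal sketch of an alternative manager, modeled only on the `DefaultMemory` shape visible above (an assumption, not a documented extension point):

```cpp
// Sketch only: a MemoryManager that counts outstanding allocations.
struct CountingMemory : public sjpeg::MemoryManager {
  size_t live = 0;   // number of blocks currently allocated
  virtual ~CountingMemory() {}
  virtual void* Alloc(size_t size) { ++live; return malloc(size); }
  virtual void Free(void* const ptr) {
    if (ptr != nullptr) --live;   // Free(nullptr) must stay a no-op
    free(ptr);
  }
};
```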

////////////////////////////////////////////////////////////////////////////////
// Encoder main class

Encoder::Encoder(int W, int H, int step, const uint8_t* const rgb,
                 ByteSink* const sink)
    : W_(W), H_(H), step_(step),
      rgb_(rgb),
      ok_(true),
      bw_(sink),
      in_blocks_base_(nullptr),
      in_blocks_(nullptr),
      have_coeffs_(false),
      all_run_levels_(nullptr),
      nb_run_levels_(0),
      max_run_levels_(0),
      qdelta_max_luma_(kDefaultDeltaMaxLuma),
      qdelta_max_chroma_(kDefaultDeltaMaxChroma),
      passes_(1),
      search_hook_(nullptr),
      memory_hook_(&kDefaultMemory) {
  SetCompressionMethod(kDefaultMethod);
  SetQuality(kDefaultQuality);
  SetYUVFormat(false);
  SetQuantizationBias(kDefaultBias, false);
  SetDefaultMinQuantMatrices();
  InitializeStaticPointers();
  memset(dc_codes_, 0, sizeof(dc_codes_));  // safety
  memset(ac_codes_, 0, sizeof(ac_codes_));
}

Encoder::~Encoder() {
  Free(all_run_levels_);
  DesallocateBlocks();   // clean up leftovers in case we had an error
}

////////////////////////////////////////////////////////////////////////////////

void Encoder::SetQuality(float q) {
  q = GetQFactor(q);
  SetQuantMatrix(kDefaultMatrices[0], q, quants_[0].quant_);
  SetQuantMatrix(kDefaultMatrices[1], q, quants_[1].quant_);
}

void Encoder::SetQuantMatrices(const uint8_t m[2][64]) {
  SetQuantMatrix(m[0], 100, quants_[0].quant_);
  SetQuantMatrix(m[1], 100, quants_[1].quant_);
}

void Encoder::SetMinQuantMatrices(const uint8_t m[2][64], int tolerance) {
  SetMinQuantMatrix(m[0], quants_[0].min_quant_, tolerance);
  SetMinQuantMatrix(m[1], quants_[1].min_quant_, tolerance);
}

void Encoder::SetDefaultMinQuantMatrices() {
  SetDefaultMinQuantMatrix(quants_[0].min_quant_);
  SetDefaultMinQuantMatrix(quants_[1].min_quant_);
}

void Encoder::SetCompressionMethod(int method) {
  assert(method >= 0 && method <= 8);
  use_adaptive_quant_ = (method >= 3);
  optimize_size_ = (method != 0) && (method != 3);
  use_extra_memory_ = (method == 3) || (method == 4) || (method == 7);
  reuse_run_levels_ = (method == 1) || (method == 4) || (method == 5)
                   || (method >= 7);
  use_trellis_ = (method >= 7);
}
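Reading the flag logic above for the default method (`kDefaultMethod == 4`): adaptive quantization on, size optimization on, extra coefficient memory on, run/level reuse on, trellis off. A sketch that merely mirrors that decision table for inspection (`MethodFlags`/`FlagsFor` are hypothetical names, not part of the library):

```cpp
// Sketch: the per-method feature flags implied by SetCompressionMethod().
struct MethodFlags { bool adaptive, optimize, extra_mem, reuse_rl, trellis; };
static MethodFlags FlagsFor(int method) {
  return MethodFlags{
    method >= 3,                                         // adaptive quant
    (method != 0) && (method != 3),                      // size optimization
    (method == 3) || (method == 4) || (method == 7),     // extra memory
    (method == 1) || (method == 4) || (method == 5) || (method >= 7),
    method >= 7                                          // trellis quant
  };
}
// FlagsFor(4) -> {true, true, true, true, false}
```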

void Encoder::SetMetadata(const std::string& data, MetadataType type) {
  switch (type) {
    case ICC: iccp_ = data; break;
    case EXIF: exif_ = data; break;
    case XMP: xmp_ = data; break;
    default:
    case MARKERS: app_markers_ = data; break;
  }
}

void Encoder::SetQuantizationBias(int bias, bool use_adaptive) {
  assert(bias >= 0 && bias <= 255);
  q_bias_ = bias;
  adaptive_bias_ = use_adaptive;
}

void Encoder::SetQuantizationDeltas(int qdelta_luma, int qdelta_chroma) {
  assert(qdelta_luma >= 0 && qdelta_luma <= 255);
  assert(qdelta_chroma >= 0 && qdelta_chroma <= 255);
  qdelta_max_luma_ = qdelta_luma;
  qdelta_max_chroma_ = qdelta_chroma;
}

////////////////////////////////////////////////////////////////////////////////
// CPU support

extern bool ForceSlowCImplementation;
bool ForceSlowCImplementation = false;   // undocumented! for tests.

bool SupportsSSE2() {
  if (ForceSlowCImplementation) return false;
#if defined(SJPEG_USE_SSE2)
  return true;
#endif
  return false;
}

bool SupportsNEON() {
  if (ForceSlowCImplementation) return false;
#if defined(SJPEG_USE_NEON)
  return true;
#endif
  return false;
}

////////////////////////////////////////////////////////////////////////////////
// static pointers to architecture-dependent implementations

Encoder::QuantizeErrorFunc Encoder::quantize_error_ = nullptr;
Encoder::QuantizeBlockFunc Encoder::quantize_block_ = nullptr;
void (*Encoder::fDCT_)(int16_t* in, int num_blocks) = nullptr;
Encoder::StoreHistoFunc Encoder::store_histo_ = nullptr;
RGBToYUVBlockFunc Encoder::get_yuv444_block_ = nullptr;

void Encoder::InitializeStaticPointers() {
  if (fDCT_ == nullptr) {
    store_histo_ = GetStoreHistoFunc();
    quantize_block_ = GetQuantizeBlockFunc();
    quantize_error_ = GetQuantizeErrorFunc();
    fDCT_ = GetFdct();
    get_yuv444_block_ = GetBlockFunc(true);
  }
}

////////////////////////////////////////////////////////////////////////////////
// memory and internal buffers management. We grow on demand.

bool Encoder::SetError() {
  ok_ = false;
  return false;
}

bool Encoder::CheckBuffers() {
  // maximum macroblock size, worst-case, is 24bits*64*6 coeffs = 1152bytes
  ok_ = ok_ && bw_.Reserve(2048);
  if (!ok_) return false;

  if (reuse_run_levels_) {
    if (nb_run_levels_ + 6*64 > max_run_levels_) {
      // need to grow storage for run/levels
      const size_t new_size = max_run_levels_ ? max_run_levels_ * 2 : 8192;
      RunLevel* const new_rl = Alloc<RunLevel>(new_size);
      if (new_rl == nullptr) return false;
      if (nb_run_levels_ > 0) {
        memcpy(new_rl, all_run_levels_,
               nb_run_levels_ * sizeof(new_rl[0]));
      }
      Free(all_run_levels_);
      all_run_levels_ = new_rl;
      max_run_levels_ = new_size;
      assert(nb_run_levels_ + 6 * 64 <= max_run_levels_);
    }
  }
  return true;
}

bool Encoder::AllocateBlocks(size_t num_blocks) {
  assert(in_blocks_ == nullptr);
  have_coeffs_ = false;
  const size_t size = num_blocks * 64 * sizeof(*in_blocks_);
  in_blocks_base_ = Alloc<uint8_t>(size + ALIGN_CST);
  if (in_blocks_base_ == nullptr) return false;
  in_blocks_ = reinterpret_cast<int16_t*>(
      (ALIGN_CST + reinterpret_cast<uintptr_t>(in_blocks_base_)) & ~ALIGN_CST);
  return true;
}

void Encoder::DesallocateBlocks() {
  Free(in_blocks_base_);
  in_blocks_base_ = nullptr;
  in_blocks_ = nullptr;   // sanity
}

////////////////////////////////////////////////////////////////////////////////

#define FP_BITS 16     // fractional precision for fixed-point divisors
#define AC_BITS 4      // extra precision bits from fdct's scaling
#define BIAS_DC 0x80   // neutral bias for DC (mandatory!)

// divide-by-multiply helper macros
#define MAKE_INV_QUANT(Q) (((1u << FP_BITS) + (Q) / 2) / (Q))
#define DIV_BY_MULT(A, M) (((A) * (M)) >> FP_BITS)
#define QUANTIZE(A, M, B) (DIV_BY_MULT((A) + (B), (M)) >> AC_BITS)
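These macros replace a division by quantizer Q with a multiplication by the rounded 16-bit fixed-point reciprocal `(1 << 16) / Q`, which is much cheaper per coefficient. A small worked sketch (illustration, not part of the file):

```cpp
// Sketch: quantize coefficient 200 with Q = 16 using the reciprocal trick.
// MAKE_INV_QUANT(16) = (65536 + 8) / 16 = 4096, i.e. ~1/16 in 16.16 fixed point.
// With a zero bias and ignoring the extra AC_BITS shift:
const uint32_t iq = ((1u << 16) + 16 / 2) / 16;   // 4096
const int q = (200 * iq) >> 16;                   // 12 == 200 / 16 (integer div)
```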

void Encoder::FinalizeQuantMatrix(Quantizer* const q, int q_bias) {
  // first, clamp the quant matrix:
  for (size_t i = 0; i < 64; ++i) {
    if (q->quant_[i] < q->min_quant_[i]) q->quant_[i] = q->min_quant_[i];
  }
  // Special case! for v=1 we can't represent the multiplier with 16b precision.
  // So, instead we max out the multiplier to 0xffffu, and twist the bias to the
  // value 0x80. The overall precision isn't affected: it's bit-exact the same
  // for our working range.
  // Note that quant=1 can start appearing at quality as low as 93.
  const uint16_t bias_1 = 0x80;
  const uint16_t iquant_1 = 0xffffu;
  for (size_t i = 0; i < 64; ++i) {
    const uint16_t v = q->quant_[i];
    const uint16_t iquant = (v == 1) ? iquant_1 : MAKE_INV_QUANT(v);
    const uint16_t bias = (v == 1) ? bias_1 : (i == 0) ? BIAS_DC : q_bias;
    const uint16_t ibias = (((bias * v) << AC_BITS) + 128) >> 8;
    const uint16_t qthresh =
        ((1 << (FP_BITS + AC_BITS)) + iquant - 1) / iquant - ibias;
    q->bias_[i] = ibias;
    q->iquant_[i] = iquant;
    q->qthresh_[i] = qthresh;
    assert(QUANTIZE(qthresh, iquant, ibias) > 0);
    assert(QUANTIZE(qthresh - 1, iquant, ibias) == 0);
  }
}

void Encoder::SetCostCodes(int idx) {
  quants_[idx].codes_ = ac_codes_[idx];
}

////////////////////////////////////////////////////////////////////////////////
// standard Huffman tables, as per JPEG standard section K.3.

static const uint8_t kDCSyms[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
static const uint8_t kACSyms[2][162] = {
  { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
    0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
    0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
    0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
    0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
    0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
    0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
    0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
    0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
    0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
    0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
    0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
    0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
    0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
    0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
    0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
    0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
    0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
    0xf9, 0xfa },
  { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
    0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
    0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
    0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
    0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
    0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
    0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
    0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
    0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
    0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
    0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
    0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
    0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
    0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
    0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
    0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
    0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
    0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
    0xf9, 0xfa }
};

static const HuffmanTable kHuffmanTables[4] = {
  { { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 }, kDCSyms, 12 },
  { { 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }, kDCSyms, 12 },
  { { 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 125 }, kACSyms[0], 162 },
  { { 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 119 }, kACSyms[1], 162 }
};

////////////////////////////////////////////////////////////////////////////////
// This function generates a map from symbols to code + len stored in a packed
// way (the lower 16 bits hold the length, the upper 16 bits the VLC).
// The input is a JPEG-like description of the symbols:
// - bits[i] stores the number of codes having length i + 1.
// - symbols[] contains the symbols' map, in increasing bit-length order.
// There is no check performed on the validity of symbols[]'s content.
// The values of tab[] not referring to an actual symbol will remain unchanged.
// Returns the number of symbols used (that is: sum{bits[i]})

static int BuildHuffmanTable(const uint8_t bits[16], const uint8_t* symbols,
                             uint32_t* const tab) {
  uint32_t code = 0;
  int nb = 0;
  for (int nb_bits = 1; nb_bits <= 16; ++nb_bits, code <<= 1) {
    int n = bits[nb_bits - 1];   // number of codes for that given nb_bits
    nb += n;
    while (n-- > 0) {
      const int symbol = *symbols++;
      tab[symbol] = (code << 16) | nb_bits;
      ++code;
    }
  }
  return nb;
}
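BuildHuffmanTable assigns canonical codes: codes of a given length are consecutive integers, and the running code is doubled when moving to the next length. A standalone sketch replaying that loop on the K.3 DC-luma bit counts (illustration only; works because kDCSyms is the identity map):

```cpp
#include <cstdio>

int main() {
  // K.3 DC-luma description: one 2-bit code, five 3-bit codes, then one code
  // per length up to 9 bits (remaining entries are zero).
  const unsigned char bits[16] = { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0 };
  unsigned code = 0, sym = 0;
  for (int nb_bits = 1; nb_bits <= 16; ++nb_bits, code <<= 1) {
    for (int n = bits[nb_bits - 1]; n-- > 0; ++code, ++sym) {
      printf("symbol %u -> code %u (%d bits)\n", sym, code, nb_bits);
    }
  }
  // symbol 0 -> 0b00 (2 bits), symbols 1..5 -> 0b010..0b110 (3 bits),
  // symbol 6 -> 0b1110 (4 bits), symbol 7 -> 0b11110 (5 bits), ...
  // Packed as in tab[] above, symbol 6 would be stored as (0xE << 16) | 4.
}
```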

////////////////////////////////////////////////////////////////////////////////

void Encoder::InitCodes(bool only_ac) {
  const int nb_tables = (nb_comps_ == 1 ? 1 : 2);
  for (int c = 0; c < nb_tables; ++c) {   // luma, chroma
    for (int type = (only_ac ? 1 : 0); type <= 1; ++type) {
      const HuffmanTable* const h = Huffman_tables_[type * 2 + c];
      const int nb_syms = BuildHuffmanTable(h->bits_, h->syms_,
                                            type == 1 ? ac_codes_[c]
                                                      : dc_codes_[c]);
      assert(nb_syms == h->nb_syms_);
      (void)nb_syms;
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
// Quantize coefficients and pseudo-code coefficients

static int CalcLog2(int v) {
#if defined(__GNUC__) && \
    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
  return 32 - __builtin_clz(v);
#else
  const int kLog2[16] = {
    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
  assert(v > 0 && v < (1 << 12));
  return (v & ~0xff) ? 8 + kLog2[v >> 8] :
         (v & ~0x0f) ? 4 + kLog2[v >> 4] :
                       0 + kLog2[v];
#endif
}

uint16_t Encoder::GenerateDCDiffCode(int DC, int* const DC_predictor) {
  const int diff = DC - *DC_predictor;
  *DC_predictor = DC;
  if (diff == 0) {
    return 0;
  }
  int suff, n;
  if (diff < 0) {
    n = CalcLog2(-diff);
    suff = (diff - 1) & ((1 << n) - 1);
  } else {
    n = CalcLog2(diff);
    suff = diff;
  }
  assert((suff & 0xf000) == 0);
  assert(n < 12);
  return n | (suff << 4);
}
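The returned value packs the JPEG DC category `n` in the low 4 bits and the `n` suffix bits above it, with negative differences encoded by the usual one's-complement convention. A worked sketch (illustration only):

```cpp
// Sketch: a DC difference of -3 through the scheme above.
const int diff = -3;
const int n = 2;                               // CalcLog2(3) == 2 (category 2)
const int suff = (diff - 1) & ((1 << n) - 1);  // (-4) & 3 == 0 -> bits "00"
const uint16_t dc_code = n | (suff << 4);      // 0x02
// For diff == +3: suff == 3 (bits "11"), dc_code == 0x32.
```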

////////////////////////////////////////////////////////////////////////////////
// various implementations of histogram collection

#if defined(SJPEG_USE_SSE2)
// Load eight 16b-words from *src.
#define LOAD_16(src) _mm_loadu_si128(reinterpret_cast<const __m128i*>(src))
// Store eight 16b-words into *dst
#define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))

static int QuantizeBlockSSE2(const int16_t in[64], int idx,
                             const Quantizer* const Q,
                             DCTCoeffs* const out, RunLevel* const rl) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  int prev = 1;
  int nb = 0;
  int16_t tmp[64], masked[64];
  for (int i = 0; i < 64; i += 8) {
    const __m128i m_bias = LOAD_16(bias + i);
    const __m128i m_mult = LOAD_16(iquant + i);
    const __m128i A = LOAD_16(in + i);                        // A = in[i]
    const __m128i B = _mm_srai_epi16(A, 15);                  // sign extract
    const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B);  // abs(A)
    const __m128i D = _mm_adds_epi16(C, m_bias);              // v' = v + bias
    const __m128i E = _mm_mulhi_epu16(D, m_mult);             // (v' * iq) >> 16
    const __m128i F = _mm_srli_epi16(E, AC_BITS);             // = QUANTIZE(...)
    const __m128i G = _mm_xor_si128(F, B);                    // v ^ mask
    STORE_16(F, tmp + i);
    STORE_16(G, masked + i);
  }
  for (int i = 1; i < 64; ++i) {
    const int j = kZigzag[i];
    const int v = tmp[j];
    if (v > 0) {
      const int n = CalcLog2(v);
      const uint16_t code = masked[j] & ((1 << n) - 1);
      rl[nb].level_ = (code << 4) | n;
      rl[nb].run_ = i - prev;
      prev = i + 1;
      ++nb;
    }
  }
  const int dc = (in[0] < 0) ? -tmp[0] : tmp[0];
  out->idx_ = idx;
  out->last_ = prev - 1;
  out->nb_coeffs_ = nb;
  return dc;
}
#undef LOAD_16
#undef STORE_16

#elif defined(SJPEG_USE_NEON)
static int QuantizeBlockNEON(const int16_t in[64], int idx,
                             const Quantizer* const Q,
                             DCTCoeffs* const out, RunLevel* const rl) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  int prev = 1;
  int nb = 0;
  uint16_t tmp[64], masked[64];
  for (int i = 0; i < 64; i += 8) {
    const uint16x8_t m_bias = vld1q_u16(bias + i);
    const uint16x8_t m_mult = vld1q_u16(iquant + i);
    const int16x8_t A = vld1q_s16(in + i);                     // in[i]
    const uint16x8_t B = vreinterpretq_u16_s16(vabsq_s16(A));  // abs(in)
    const int16x8_t sign = vshrq_n_s16(A, 15);                 // sign
    const uint16x8_t C = vaddq_u16(B, m_bias);                 // + bias
    const uint32x4_t D0 = vmull_u16(vget_low_u16(C), vget_low_u16(m_mult));
    const uint32x4_t D1 = vmull_u16(vget_high_u16(C), vget_high_u16(m_mult));
    // collect hi-words of the 32b mult result using 'unzip'
    const uint16x8x2_t E = vuzpq_u16(vreinterpretq_u16_u32(D0),
                                     vreinterpretq_u16_u32(D1));
    const uint16x8_t F = vshrq_n_u16(E.val[1], AC_BITS);
    const uint16x8_t G = veorq_u16(F, vreinterpretq_u16_s16(sign));  // v ^ mask
    vst1q_u16(tmp + i, F);
    vst1q_u16(masked + i, G);
  }
  for (int i = 1; i < 64; ++i) {
    const int j = kZigzag[i];
    const int v = tmp[j];
    if (v > 0) {
      const int n = CalcLog2(v);
      const uint16_t code = masked[j] & ((1 << n) - 1);
      rl[nb].level_ = (code << 4) | n;
      rl[nb].run_ = i - prev;
      prev = i + 1;
      ++nb;
    }
  }
  const int dc = (in[0] < 0) ? -tmp[0] : tmp[0];
  out->idx_ = idx;
  out->last_ = prev - 1;
  out->nb_coeffs_ = nb;
  return dc;
}
#endif    // SJPEG_USE_NEON

static int QuantizeBlock(const int16_t in[64], int idx,
                         const Quantizer* const Q,
                         DCTCoeffs* const out, RunLevel* const rl) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  int prev = 1;
  int nb = 0;
  // This function is speed-critical, so we're using some bit masks
  // to extract absolute values, instead of sign tests.
  const uint16_t* const qthresh = Q->qthresh_;
  for (int i = 1; i < 64; ++i) {
    const int j = kZigzag[i];
    int v = in[j];
    const int32_t mask = v >> 31;
    v = (v ^ mask) - mask;
    if (v >= qthresh[j]) {
      v = QUANTIZE(v, iquant[j], bias[j]);
      assert(v > 0);
      const int n = CalcLog2(v);
      const uint16_t code = (v ^ mask) & ((1 << n) - 1);
      rl[nb].level_ = (code << 4) | n;
      rl[nb].run_ = i - prev;
      prev = i + 1;
      ++nb;
    }
  }
  const int dc = (in[0] < 0) ? -QUANTIZE(-in[0], iquant[0], bias[0])
                             : QUANTIZE(in[0], iquant[0], bias[0]);
  out->idx_ = idx;
  out->last_ = prev - 1;
  out->nb_coeffs_ = nb;
  return dc;
}
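The `mask = v >> 31; v = (v ^ mask) - mask` idiom above computes |v| branch-free, and re-XORing the quantized magnitude with the same mask yields the JPEG suffix bits for negative levels. A tiny sketch (illustration only):

```cpp
// Sketch: branch-free abs and suffix-bit generation for v = -5.
int v = -5;
const int32_t mask = v >> 31;         // arithmetic shift: -1 if negative, else 0
const int abs_v = (v ^ mask) - mask;  // (-5 ^ -1) - (-1) == 4 + 1 == 5
// After quantizing abs_v to some level q with bit-length n,
// (q ^ mask) & ((1 << n) - 1) is the one's-complement suffix JPEG expects
// for a negative coefficient.
```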

////////////////////////////////////////////////////////////////////////////////
// Trellis-based quantization

typedef uint32_t score_t;
static const score_t kMaxScore = 0xffffffffu;

struct TrellisNode {
  uint32_t code;
  int nbits;
  score_t score;
  uint32_t disto;
  uint32_t bits;
  uint32_t run;
  const TrellisNode* best_prev;
  int pos;
  int rank;

  TrellisNode() : score(kMaxScore), best_prev(nullptr) {}
  void InitSink() {
    score = 0u;
    disto = 0;
    pos = 0;
    rank = 0;
    nbits = 0;
    bits = 0;
  }
};

static bool SearchBestPrev(const TrellisNode* const nodes0, TrellisNode* node,
                           const uint32_t disto0[], const uint32_t codes[],
                           uint32_t lambda) {
  bool found = false;
  assert(codes[0xf0] != 0);
  const uint32_t base_disto = node->disto + disto0[node->pos - 1];
  for (const TrellisNode* cur = node - 1; cur >= nodes0; --cur) {
    const int run = node->pos - 1 - cur->pos;
    if (run < 0) continue;
    uint32_t bits = node->nbits;
    bits += (run >> 4) * (codes[0xf0] & 0xff);
    const uint32_t sym = ((run & 15) << 4) | node->nbits;
    assert(codes[sym] != 0);
    bits += codes[sym] & 0xff;
    const uint32_t disto = base_disto - disto0[cur->pos];
    const score_t score = disto + lambda * bits + cur->score;
    if (score < node->score) {
      node->score = score;
      node->disto = disto;
      node->bits = bits;
      node->best_prev = cur;
      node->rank = cur->rank + 1;
      node->run = run;
      found = true;
    }
  }
  return found;
}

// number of alternate levels to investigate
#define NUM_TRELLIS_NODES 2

int Encoder::TrellisQuantizeBlock(const int16_t in[64], int idx,
                                  const Quantizer* const Q,
                                  DCTCoeffs* const out,
                                  RunLevel* const rl) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  TrellisNode nodes[1 + NUM_TRELLIS_NODES * 63];   // 1 sink + n channels
  nodes[0].InitSink();
  const uint32_t* const codes = Q->codes_;
  TrellisNode* cur_node = &nodes[1];
  uint32_t disto0[64];   // disto0[i] = sum of distortions up to i (inclusive)
  disto0[0] = 0;
  for (int i = 1; i < 64; ++i) {
    const int j = kZigzag[i];
    const uint32_t q = Q->quant_[j] << AC_BITS;
    const uint32_t lambda = q * q / 32u;
    int V = in[j];
    const int32_t mask = V >> 31;
    V = (V ^ mask) - mask;
    disto0[i] = V * V + disto0[i - 1];
    int v = QUANTIZE(V, iquant[j], bias[j]);
    if (v == 0) continue;
    int nbits = CalcLog2(v);
    for (int k = 0; k < NUM_TRELLIS_NODES; ++k) {
      const int err = V - v * q;
      cur_node->code = (v ^ mask) & ((1 << nbits) - 1);
      cur_node->pos = i;
      cur_node->disto = err * err;
      cur_node->nbits = nbits;
      cur_node->score = kMaxScore;
      if (SearchBestPrev(&nodes[0], cur_node, disto0, codes, lambda)) {
        ++cur_node;
      }
      --nbits;
      if (nbits <= 0) break;
      v = (1 << nbits) - 1;
    }
  }
  // search best entry point backward
  const TrellisNode* nz = &nodes[0];
  if (cur_node != nz) {
    score_t best_score = kMaxScore;
    while (cur_node-- != &nodes[0]) {
      const uint32_t disto = disto0[63] - disto0[cur_node->pos];
      // No need to incorporate EOB's bit cost (codes[0x00]), since
      // it's the same for all coeffs except the last one (#63).
      cur_node->disto += disto;
      cur_node->score += disto;
      if (cur_node->score < best_score) {
        nz = cur_node;
        best_score = cur_node->score;
      }
    }
  }
  int nb = nz->rank;
  out->idx_ = idx;
  out->last_ = nz->pos;
  out->nb_coeffs_ = nb;

  while (nb-- > 0) {
    const int32_t code = nz->code;
    const int n = nz->nbits;
    rl[nb].level_ = (code << 4) | n;
    rl[nb].run_ = nz->run;
    nz = nz->best_prev;
  }
  const int dc = (in[0] < 0) ? -QUANTIZE(-in[0], iquant[0], bias[0])
                             : QUANTIZE(in[0], iquant[0], bias[0]);
  return dc;
}
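Each trellis node weighs distortion against rate through the score `disto + lambda * bits`, with lambda derived from the quantizer step (`q * q / 32` above). A hedged sketch of that comparison in isolation (hypothetical names, not the library's API):

```cpp
// Sketch: comparing two candidate encodings of a coefficient, RD-style.
// The trellis keeps whichever choice minimizes disto + lambda * bits.
struct Candidate { uint32_t disto, bits; };
static uint32_t Score(const Candidate& c, uint32_t lambda) {
  return c.disto + lambda * c.bits;
}
// e.g. with lambda == 8: {disto: 100, bits: 5} scores 140 and beats
// {disto: 36, bits: 20}, which scores 196 -- the cheaper-rate choice wins.
```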

Encoder::QuantizeBlockFunc Encoder::GetQuantizeBlockFunc() {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return QuantizeBlockSSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return QuantizeBlockNEON;
#endif
  return QuantizeBlock;   // default
}

////////////////////////////////////////////////////////////////////////////////

#if defined(SJPEG_USE_SSE2)
// Load eight 16b-words from *src.
#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
#define LOAD_64(src) _mm_loadl_epi64((const __m128i*)(src))
// Store eight 16b-words into *dst
#define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))

static uint32_t QuantizeErrorSSE2(const int16_t in[64],
                                  const Quantizer* const Q) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  const uint8_t* const quant = Q->quant_;
  const __m128i zero = _mm_setzero_si128();
  uint32_t tmp[32];
  for (int i = 0; i < 64; i += 8) {
    const __m128i m_bias = LOAD_16(bias + i);
    const __m128i m_iquant = LOAD_16(iquant + i);
    const __m128i m_quant = _mm_unpacklo_epi8(LOAD_64(quant + i), zero);
    const __m128i A = LOAD_16(in + i);                        // v0 = in[i]
    const __m128i B = _mm_srai_epi16(A, 15);                  // sign extract
    const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B);  // abs(v0)
    const __m128i D = _mm_adds_epi16(C, m_bias);              // v' = v0 + bias
    const __m128i E = _mm_mulhi_epu16(D, m_iquant);           // (v' * iq) >> 16
    const __m128i F = _mm_srai_epi16(E, AC_BITS);
    const __m128i G = _mm_srai_epi16(C, AC_BITS);
    const __m128i H = _mm_mullo_epi16(F, m_quant);            // *= quant[j]
    const __m128i I = _mm_sub_epi16(G, H);
    const __m128i J = _mm_madd_epi16(I, I);                   // (v0-v) ^ 2
    STORE_16(J, tmp + i / 2);
  }
  uint32_t err = 0;
  for (int i = 0; i < 32; ++i) err += tmp[i];
  return err;
}
#undef LOAD_16
#undef LOAD_64
#undef STORE_16

#elif defined(SJPEG_USE_NEON)

static uint32_t QuantizeErrorNEON(const int16_t in[64],
                                  const Quantizer* const Q) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  const uint8_t* const quant = Q->quant_;
  uint32x4_t sum1 = vdupq_n_u32(0);
  uint32x4_t sum2 = vdupq_n_u32(0);
  for (int i = 0; i < 64; i += 8) {
    const uint16x8_t m_bias = vld1q_u16(bias + i);
    const uint16x8_t m_mult = vld1q_u16(iquant + i);
    const uint16x8_t m_quant = vmovl_u8(vld1_u8(quant + i));
    const uint16x8_t A = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(in + i)));
    const uint16x8_t B = vaddq_u16(A, m_bias);
    const uint32x4_t C0 = vmull_u16(vget_low_u16(B), vget_low_u16(m_mult));
    const uint32x4_t C1 = vmull_u16(vget_high_u16(B), vget_high_u16(m_mult));
    // collect hi-words of the 32b mult result using 'unzip'
    const uint16x8x2_t D = vuzpq_u16(vreinterpretq_u16_u32(C0),
                                     vreinterpretq_u16_u32(C1));
    const uint16x8_t E = vshrq_n_u16(D.val[1], AC_BITS);
    const uint16x8_t F = vmulq_u16(E, m_quant);        // dequantized coeff
    const uint16x8_t G = vabdq_u16(F, vshrq_n_u16(A, AC_BITS));
    sum1 = vmlal_u16(sum1, vget_low_u16(G), vget_low_u16(G));
    sum2 = vmlal_u16(sum2, vget_high_u16(G), vget_high_u16(G));
  }
  const uint32x4_t sum3 = vaddq_u32(sum1, sum2);
  const uint64x2_t sum4 = vpaddlq_u32(sum3);
  const uint64_t sum5 = vgetq_lane_u64(sum4, 0) + vgetq_lane_u64(sum4, 1);
  const uint32_t err = (uint32_t)sum5;
  return err;
}

#endif   // SJPEG_USE_NEON

static uint32_t QuantizeError(const int16_t in[64], const Quantizer* const Q) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  const uint8_t* const quant = Q->quant_;
  uint32_t err = 0;
  for (int j = 0; j < 64; ++j) {
    int32_t v0 = (in[j] < 0) ? -in[j] : in[j];
    const uint32_t v = quant[j] * QUANTIZE(v0, iquant[j], bias[j]);
    v0 >>= AC_BITS;
    err += (v0 - v) * (v0 - v);
  }
  return err;
}

Encoder::QuantizeErrorFunc Encoder::GetQuantizeErrorFunc() {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return QuantizeErrorSSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return QuantizeErrorNEON;
#endif
  return QuantizeError;   // default
}

////////////////////////////////////////////////////////////////////////////////
// Code bitstream

void Encoder::ResetDCs() {
  for (int c = 0; c < nb_comps_; ++c) {
    DCs_[c] = 0;
  }
}

void Encoder::CodeBlock(const DCTCoeffs* const coeffs,
                        const RunLevel* const rl) {
  const int idx = coeffs->idx_;
  const int q_idx = quant_idx_[idx];

  // DC coefficient symbol
  const int dc_len = coeffs->dc_code_ & 0x0f;
  const uint32_t code = dc_codes_[q_idx][dc_len];
  bw_.PutPackedCode(code);
  if (dc_len > 0) {
    bw_.PutBits(coeffs->dc_code_ >> 4, dc_len);
  }

  // AC coeffs
  const uint32_t* const codes = ac_codes_[q_idx];
  for (int i = 0; i < coeffs->nb_coeffs_; ++i) {
    int run = rl[i].run_;
    while (run & ~15) {        // escapes
      bw_.PutPackedCode(codes[0xf0]);
      run -= 16;
    }
    const uint32_t suffix = rl[i].level_;
    const int n = suffix & 0x0f;
    const int sym = (run << 4) | n;
    bw_.PutPackedCode(codes[sym]);
    bw_.PutBits(suffix >> 4, n);
  }
  if (coeffs->last_ < 63) {    // EOB
    bw_.PutPackedCode(codes[0x00]);
  }
}
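Runs of 16 or more zeros cannot fit in the 4-bit run field, so CodeBlock emits the ZRL escape symbol 0xF0 once per 16 zeros before the final (run, size) symbol. A worked sketch of the split (illustration only; `ShowRunSplit` is a hypothetical stand-in for the emit calls):

```cpp
#include <cstdio>

// Sketch: how a run of zeros is split, mirroring the escape loop above.
static void ShowRunSplit(int run, int n) {
  while (run & ~15) {                 // same test as in CodeBlock()
    printf("ZRL symbol 0xF0\n");      // stands for sixteen zeros
    run -= 16;
  }
  printf("symbol 0x%02X + %d suffix bits\n", (run << 4) | n, n);
}
// ShowRunSplit(20, 3) -> "ZRL symbol 0xF0", then "symbol 0x43 + 3 suffix bits"
```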

////////////////////////////////////////////////////////////////////////////////
// Histogram

void Encoder::ResetHisto() {
  memset(histos_, 0, sizeof(histos_));
}

#if defined(SJPEG_USE_SSE2)
void StoreHistoSSE2(const int16_t in[64], Histo* const histos, int nb_blocks) {
  const __m128i kMaxHisto = _mm_set1_epi16(MAX_HISTO_DCT_COEFF);
  for (int n = 0; n < nb_blocks; ++n, in += 64) {
    uint16_t tmp[64];
    for (int i = 0; i < 64; i += 8) {
      const __m128i A =
          _mm_loadu_si128(reinterpret_cast<const __m128i*>(in + i));
      const __m128i B = _mm_srai_epi16(A, 15);                  // sign extract
      const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B);  // abs(A)
      const __m128i D = _mm_srli_epi16(C, HSHIFT);              // >>= HSHIFT
      const __m128i E = _mm_min_epi16(D, kMaxHisto);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp + i), E);
    }
    for (int j = 0; j < 64; ++j) {
      const int k = tmp[j];
      ++histos->counts_[j][k];
    }
  }
}
#elif defined(SJPEG_USE_NEON)
void StoreHistoNEON(const int16_t in[64], Histo* const histos, int nb_blocks) {
  const uint16x8_t kMaxHisto = vdupq_n_u16(MAX_HISTO_DCT_COEFF);
  for (int n = 0; n < nb_blocks; ++n, in += 64) {
    uint16_t tmp[64];
    for (int i = 0; i < 64; i += 8) {
      const int16x8_t A = vld1q_s16(in + i);
      const int16x8_t B = vabsq_s16(A);                // abs(in)
      const uint16x8_t C = vreinterpretq_u16_s16(B);   // signed->unsigned
      const uint16x8_t D = vshrq_n_u16(C, HSHIFT);     // >>= HSHIFT
      const uint16x8_t E = vminq_u16(D, kMaxHisto);    // min(., kMaxHisto)
      vst1q_u16(tmp + i, E);
    }
    for (int j = 0; j < 64; ++j) {
      const int k = tmp[j];
      ++histos->counts_[j][k];
    }
  }
}
#endif

// This C-version does not produce the same counts_[] output as the
// assembly above. But the extra entry counts_[MAX_HISTO_DCT_COEFF] is
// not used for the final computation, and the global result is unchanged.
void StoreHisto(const int16_t in[64], Histo* const histos, int nb_blocks) {
  for (int n = 0; n < nb_blocks; ++n, in += 64) {
    for (int i = 0; i < 64; ++i) {
      const int k = (in[i] < 0 ? -in[i] : in[i]) >> HSHIFT;
      if (k < MAX_HISTO_DCT_COEFF) {
        ++histos->counts_[i][k];
      }
    }
  }
}

Encoder::StoreHistoFunc Encoder::GetStoreHistoFunc() {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return StoreHistoSSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return StoreHistoNEON;
#endif
  return StoreHisto;   // default
}

const float Encoder::kHistoWeight[QSIZE] = {
  // Gaussian with sigma ~= 3
  0, 0, 0, 0, 0,
  1, 5, 16, 43, 94, 164, 228, 255, 228, 164, 94, 43, 16, 5, 1,
  0, 0, 0, 0, 0
};

void Encoder::AnalyseHisto() {
  // A bit of theory and background: for each sub-band i in [0..63], we pick a
  // quantization scale New_Qi close to the initial one Qi. We evaluate a cost
  // function associated with F({New_Qi}) = distortion + lambda . rate,
  // where rate and distortion depend on the quantizers set in a complex non-
  // analytic way. Just, for well-behaved regular histograms, we expect the
  // rate to scale as -log(Q), and the distortion as Q^2.
  // We want the cost function to be stationary around the initial {Qi} set,
  // in order to achieve the best transfer between distortion and rate when we
  // displace the Qi values a little. Mainly we want to use bits as efficiently
  // as possible, where every bit we use has maximal impact in lowering
  // distortion (and vice versa: if we spend an extra bit of coding, we want to
  // have the best bang for this buck. The optimization works up-hill too).
  //
  // Hence, lambda is picked to minimize F around {Qi}, as:
  //    lambda = -d(distortion) / d(rate)
  // where the derivatives are evaluated using a double least-square fit on
  // both the clouds of {delta, distortion} and {delta, size} points.
  //
  // Note1: The least-square fitted slope of a {x,y} cloud is expressed as:
  //    slope = (<xy> - <x><y>) / (<xx> - <x><x>) = Cov(x,y) / Cov(x,x)
  // where <.> is our gaussian-averaging operator.
  // But since we are eventually computing a quotient of such slopes, we can
  // factor out the common (<xx> - <x><x>) denominator (which is strictly
  // positive).
  // Note2: we use a Gaussian-weighted average around the center value Qi
  // instead of averaging over the whole [QDELTA_MIN, QDELTA_MAX] range.
  // This rules out fringe samples on noisy cases (like: when the source is
  // already JPEG-compressed!).
  // Note3: We fall back to some sane value HLAMBDA in case of
  // ill-conditioning.
  //
  // We use the correlation coefficient
  //     r = Cov(x,y) / sqrt(Cov(x,x) * Cov(y,y))
  // to detect bad cases with poorly extrapolated distortion. In such
  // occurrences, we skip the channel. This is particularly important for
  // already-compressed JPEG sources that give treacherous comb-like
  // histograms.
  //
  // Once this particular lambda has been picked, we loop over each channel
  // and optimize them separately, locally picking the best New_Qi for each.
  // The choice of lambda ensures a good balance between size and distortion,
  // and prevents being too aggressive on file-size reduction for instance.
  //
  const double r_limit = kCorrelationThreshold;
  for (int c = (nb_comps_ > 1 ? 1 : 0); c >= 0; --c) {
    const int idx = quant_idx_[c];
    const Histo* const histo = &histos_[idx];
    // For chrominance, it can be visually damaging to be too
    // aggressive on the filesize. So with the default settings we
    // restrict the algorithm to mainly try to *increase* the bitrate
    // (and quality) by using a smaller qdelta_max_chroma_.
    // delta_max is only used during the second phase, but not during
    // the first phase of deriving an optimal lambda.
    assert(QDELTA_MAX >= qdelta_max_luma_);
    assert(QDELTA_MAX >= qdelta_max_chroma_);
    const int delta_max =
        ((idx == 0) ? qdelta_max_luma_ : qdelta_max_chroma_) - QDELTA_MIN;
    assert(delta_max < QSIZE);
    float sizes[64][QSIZE];
    float distortions[64][QSIZE];
    double num = 0.;   // accumulate d(distortion) around delta_q = 0
    double den = 0.;   // accumulate d(size) around delta_q = 0
    uint64_t omit_channels = kOmittedChannels;
    for (int pos = 0; pos < 64; ++pos) {
      if (omit_channels & (1ULL << pos)) {
        continue;
      }
      const int dq0 = quants_[idx].quant_[pos];
      const int min_dq0 = quants_[idx].min_quant_[pos];
      // We should be using the exact bias:
      //    const int bias = quants_[idx].bias_[pos] << (FP_BITS - AC_BITS);
      // but this value is too precise considering the other approximations
      // we're using (namely: HSHIFT). So we better use a mid value of 0.5
      // for the bias. This has the advantage of making it possible to
      // use pre-calculated look-up tables for every quantity in the loop.
      // This is still a TODO(skal) below, though. Not sure the gain is big.
      const int bias = 1 << FP_BITS >> 1;
      const int* const h = histo->counts_[pos];
      int total = 0;
      int last = 0;
      for (int i = 0; i < MAX_HISTO_DCT_COEFF; ++i) {
        total += h[i];
        if (h[i]) last = i + 1;
      }
      if (total < kDensityThreshold * last) {
        omit_channels |= 1ULL << pos;
        continue;
      }
      // accumulators for averaged values.
      double sw = 0., sx = 0.;
      double sxx = 0., syy1 = 0.;
      double sy1 = 0., sxy1 = 0.;   // accumulators for distortion cloud
      double sy2 = 0., sxy2 = 0.;   // accumulators for size cloud
      for (int delta = 0; delta < QSIZE; ++delta) {
        double bsum = 0., dsum = 0.;
        const int dq = dq0 + (delta + QDELTA_MIN);
        if (dq >= min_dq0 && dq <= 255) {
          // TODO(skal): pre-compute idq and use it in FinalizeQuantMatrix too
          const int idq = ((1 << FP_BITS) + dq - 1) / dq;
          for (int i = 0; i < last; ++i) {
            if (h[i]) {
              // v    = current bin's centroid in the histogram
              // qv   = quantized value for the bin's representant 'v'
              // dqv  = dequantized qv, to be compared against v (=> 'error')
              // bits = approximate bit-cost of quantized representant
              // h[i] = this bin's weight
              const int v = (i << HSHIFT) + HHALF;
              const int qv = (v * idq + bias) >> FP_BITS;
              // TODO(skal): for a given 'last' value, we know the upper limit
              // on dq that will make *all* quantized 'qv' values be zero.
              // => We can restrict the loop on 'dq' using 'last'.
              if (qv) {
                const int bits = CalcLog2(qv);
                const int dqv = qv * dq;
                const int error = (v - dqv) * (v - dqv);
                bsum += h[i] * bits;
                dsum += h[i] * error;
              } else {
                dsum += h[i] * v * v;
              }
            }
          }   // end of 'i' loop
          distortions[pos][delta] = static_cast<float>(dsum);
          sizes[pos][delta] = static_cast<float>(bsum);
          const double w = kHistoWeight[delta];   // Gaussian weight
          if (w > 0.) {
            const double x = static_cast<double>(delta + QDELTA_MIN);
            sw   += w;
            sx   += w * x;
            sxx  += w * x * x;
            sy1  += w * dsum;
            syy1 += w * dsum * dsum;
            sy2  += w * bsum;
            sxy1 += w * dsum * x;
            sxy2 += w * bsum * x;
          }
        } else {   // the new quantizer is out-of-range.
          distortions[pos][delta] = FLT_MAX;
          sizes[pos][delta] = 0;
        }
      }
      // filter channels according to correlation factor.
      const double cov_xy1 = sw * sxy1 - sx * sy1;
      if (cov_xy1 * cov_xy1 < r_limit *
          (sw * sxx - sx * sx) * (sw * syy1 - sy1 * sy1)) {
        omit_channels |= 1ULL << pos;
        continue;
      }
      // accumulate numerator and denominator for the derivative calculation
      num += cov_xy1;
      den += sw * sxy2 - sx * sy2;
    }

    // we evaluate lambda =~ -d(distortion)/d(size) at dq=0
    double lambda = HLAMBDA;
    // When increasing Q, size should significantly decrease and distortion
    // increase. If they don't, we are ill-conditioned and should fall back
    // to a safe value HLAMBDA.
    if (num > 1000. && den < -10.) {
      // This is our approximation of -d(Distortion) / d(Rate)
      // We limit it to 1. below, to avoid degenerate cases
      lambda = -num / den;
      if (lambda < 1.) {
        lambda = 1.;
      }
    }
    // now, optimize each channel using the optimal lambda selection
    for (int pos = 0; pos < 64; ++pos) {
      if (omit_channels & (1ULL << pos)) {
        continue;
      }
      float best_score = FLT_MAX;
      int best_dq = 0;
      for (int delta = 0; delta <= delta_max; ++delta) {
        if (distortions[pos][delta] < FLT_MAX) {
          const float score = distortions[pos][delta]
                            + lambda * sizes[pos][delta];
          if (score < best_score) {
            best_score = score;
            best_dq = delta + QDELTA_MIN;
          }
        }
      }
      quants_[idx].quant_[pos] += best_dq;
      assert(quants_[idx].quant_[pos] >= 1);
    }
    FinalizeQuantMatrix(&quants_[idx], q_bias_);
    SetCostCodes(idx);
  }
}
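The lambda estimate above is a ratio of two Gaussian-weighted least-squares slopes with the common positive denominator factored out: lambda ≈ -(sw·sxy1 - sx·sy1) / (sw·sxy2 - sx·sy2). A compact sketch of that estimator on its own (illustration only; `EstimateLambda` is hypothetical, mirroring the accumulators in the code):

```cpp
// Sketch: weighted least-squares slope ratio, as used for lambda above.
// xs = quantizer deltas, d = distortions, b = bit costs, w = Gaussian weights.
static double EstimateLambda(const double* xs, const double* d,
                             const double* b, const double* w, int n) {
  double sw = 0, sx = 0, sy1 = 0, sxy1 = 0, sy2 = 0, sxy2 = 0;
  for (int i = 0; i < n; ++i) {
    sw += w[i];  sx += w[i] * xs[i];
    sy1 += w[i] * d[i];  sxy1 += w[i] * d[i] * xs[i];
    sy2 += w[i] * b[i];  sxy2 += w[i] * b[i] * xs[i];
  }
  const double num = sw * sxy1 - sx * sy1;   // ~ Cov(x, distortion)
  const double den = sw * sxy2 - sx * sy2;   // ~ Cov(x, size)
  return (den < 0.) ? -num / den : 0.;       // ill-condition fallback elided
}
```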

void Encoder::CollectHistograms() {
  ResetHisto();
  int16_t* in = in_blocks_;
  const int mb_x_max = W_ / block_w_;
  const int mb_y_max = H_ / block_h_;
  for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
    const bool yclip = (mb_y == mb_y_max);
    for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
      if (!use_extra_memory_) {
        in = in_blocks_;
      }
      GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
      fDCT_(in, mcu_blocks_);
      for (int c = 0; c < nb_comps_; ++c) {
        const int num_blocks = nb_blocks_[c];
        store_histo_(in, &histos_[quant_idx_[c]], num_blocks);
        in += 64 * num_blocks;
      }
    }
  }
  have_coeffs_ = use_extra_memory_;
}

////////////////////////////////////////////////////////////////////////////////
// Perform YUV conversion and fDCT, and store the unquantized coeffs

void Encoder::CollectCoeffs() {
  assert(use_extra_memory_);
  int16_t* in = in_blocks_;
  const int mb_x_max = W_ / block_w_;
  const int mb_y_max = H_ / block_h_;
  for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
    const bool yclip = (mb_y == mb_y_max);
    for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
      GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
      fDCT_(in, mcu_blocks_);
      in += 64 * mcu_blocks_;
    }
  }
  have_coeffs_ = true;
}

////////////////////////////////////////////////////////////////////////////////
// 1-pass Scan

void Encoder::SinglePassScan() {
  ResetDCs();

  RunLevel base_run_levels[64];
  int16_t* in = in_blocks_;
  const int mb_x_max = W_ / block_w_;
  const int mb_y_max = H_ / block_h_;
  const QuantizeBlockFunc quantize_block = use_trellis_ ? TrellisQuantizeBlock
                                                        : quantize_block_;
  for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
    const bool yclip = (mb_y == mb_y_max);
    for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
      if (!CheckBuffers()) return;
      if (!have_coeffs_) {
        in = in_blocks_;
        GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
        fDCT_(in, mcu_blocks_);
      }
      for (int c = 0; c < nb_comps_; ++c) {
        DCTCoeffs base_coeffs;
        for (int i = 0; i < nb_blocks_[c]; ++i) {
          const int dc = quantize_block(in, c, &quants_[quant_idx_[c]],
                                        &base_coeffs, base_run_levels);
          base_coeffs.dc_code_ = GenerateDCDiffCode(dc, &DCs_[c]);
          CodeBlock(&base_coeffs, base_run_levels);
          in += 64;
        }
      }
    }
  }
}

void Encoder::FinalPassScan(size_t nb_mbs, const DCTCoeffs* coeffs) {
  DesallocateBlocks();   // we can free up some coeffs memory at this point
  if (!CheckBuffers()) return;   // call needed to finalize all_run_levels_
  assert(reuse_run_levels_);
  const RunLevel* run_levels = all_run_levels_;
  for (size_t n = 0; n < nb_mbs; ++n) {
    if (!CheckBuffers()) return;
    CodeBlock(&coeffs[n], run_levels);
    run_levels += coeffs[n].nb_coeffs_;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Huffman tables optimization

void Encoder::ResetEntropyStats() {
  memset(freq_ac_, 0, sizeof(freq_ac_));
  memset(freq_dc_, 0, sizeof(freq_dc_));
}

void Encoder::AddEntropyStats(const DCTCoeffs* const coeffs,
                              const RunLevel* const run_levels) {
  // freq_ac_[] and freq_dc_[] cannot overflow 32bits, since the maximum
  // resolution allowed is 65535 * 65535. The sum of all frequencies cannot
  // be greater than 32bits, either.
  const int idx = coeffs->idx_;
  const int q_idx = quant_idx_[idx];
  for (int i = 0; i < coeffs->nb_coeffs_; ++i) {
    const int run = run_levels[i].run_;
    const int tmp = (run >> 4);
    if (tmp) freq_ac_[q_idx][0xf0] += tmp;   // count escapes (all at once)
    const int suffix = run_levels[i].level_;
    const int sym = ((run & 0x0f) << 4) | (suffix & 0x0f);
    ++freq_ac_[q_idx][sym];
  }
  if (coeffs->last_ < 63) {    // EOB
    ++freq_ac_[q_idx][0x00];
  }
  ++freq_dc_[q_idx][coeffs->dc_code_ & 0x0f];
}

static int cmp(const void *pa, const void *pb) {
  const uint64_t a = *reinterpret_cast<const uint64_t*>(pa);
  const uint64_t b = *reinterpret_cast<const uint64_t*>(pb);
  assert(a != b);   // tie-breaks can't happen
  return (a < b) ? 1 : -1;
}
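`cmp` can assert that ties never happen because each entry handed to `qsort` packs a frequency above a 9-bit symbol index, so no two keys are ever equal and the descending order is total. A small sketch of the packing (illustration only):

```cpp
// Sketch: pack/unpack of the sort keys used by BuildOptimalTable() below.
const uint32_t freq = 1234;
const int sym = 0x2a;                         // 9 bits cover symbols 0..256
const uint64_t key = (uint64_t(freq) << 9) | sym;
const int unpacked_sym = int(key & 0x1ff);    // 0x2a again
const uint32_t unpacked_freq = uint32_t(key >> 9);
```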

static void BuildOptimalTable(HuffmanTable* const t,
                              const uint32_t* const freq, int size) {
  enum { MAX_BITS = 32, MAX_CODE_SIZE = 16 };
  assert(size <= 256);
  assert(t != nullptr);

  // The celebrated merging algorithm from Huffman, with some restrictions:
  // * codes with all '1' are forbidden, to avoid trailing marker emulation
  // * codes should be shorter than 16 bits. So we re-allocate over-long codes
  //   to shorter ones, even if it means being suboptimal for extremely rare
  //   symbols that would eat a lot of bits.
  // This function will not touch the content of freq[].
  int codesizes[256 + 1];
  // chain[i] will hold the index of the next element in the subtree below
  // element 'i', or -1 if there's no sub-tree.
  // We use and maintain this list in order to efficiently increase the
  // codesizes by one when merging two sub-trees into one.
  // To ease the merging (by avoiding 1 loop) we store the address of the last
  // element in the chain for each symbol. This makes the process O(1).
  // It's probably better to keep the arrays separated instead of making
  // a struct, since we touch chain_end[] only once per merging, whereas
  // chain[] and codesizes[] are modified O(k) times per merging.
  int chain[256 + 1];
  int* chain_end[256 + 1];
  // sorted_freq[] remains sorted by decreasing frequencies along the process.
  uint64_t sorted_freq[256 + 1];

  // Count the symbols effectively used, and pack them at the beginning of
  // the table.
  int nb_syms = 0;
  for (int i = 0; i < size; ++i) {
    const uint64_t v = freq[i];
    if (v > 0) {
      // we pack the sorted key (32 bits) and index (9 bits) into a single
      // uint64_t, so we don't have to resort to structs (and we avoid
      // tie-breaks, too)
      sorted_freq[nb_syms++] = (v << 9) | i;
    }
    codesizes[i] = 0;
    chain[i] = -1;
    chain_end[i] = &chain[i];
  }
  t->nb_syms_ = nb_syms;   // Record how many final symbols we'll have.

  // initial sort
  // TODO(skal): replace by counting-sort?? (merged with previous loop?)
  qsort(sorted_freq, nb_syms, sizeof(sorted_freq[0]), cmp);

  // fake last symbol, with lowest frequency: will be assigned to the forbidden
  // code '1111...1', but will eventually be discarded.
  sorted_freq[nb_syms++] = (1ULL << 9) | size;
  codesizes[size] = 0;
  chain[size] = -1;
  chain_end[size] = &chain[size];

  // Merging phase
  // Recursively merge the two symbols with lowest frequency. The resulting
  // super-symbol will be represented by a longer (by 1bit) code, since
  // it's the least frequent one.
  int nb = nb_syms;
  while (nb-- > 1) {
    // First, link the two sub-trees.
    const uint64_t s1 = sorted_freq[nb - 1];   // first symbol
    const uint64_t s2 = sorted_freq[nb];       // second symbol, appended
    // The 0x1ff masking is for taking only the symbol, discarding the
    // frequency that we stored in the upper bits for sorting.
    int i = s1 & 0x1ff;
    const int j = s2 & 0x1ff;
    assert(i <= size && j <= size);
    *chain_end[i] = j;
    chain_end[i] = chain_end[j];

    // Then, following the chain, increase the whole sub-tree's weight by 1bit.
    do {
      ++codesizes[i];
      i = chain[i];
    } while (i >= 0);

    // Create new symbol, with merged frequencies. Will take s1's spot.
    // We must use 64bit here to prevent overflow in the sum. Both s1 and
    // s2 are originally 32 + 9 bits wide.
    const uint64_t new_symbol = s1 + (s2 & ~0x1ff);
    // Perform insertion sort to find the new spot of the merged symbol.
    int k = nb - 1;
    while (k > 0) {
      if (sorted_freq[k - 1] < new_symbol) {
        sorted_freq[k] = sorted_freq[k - 1];
        --k;
      } else {
        break;
      }
    }
    sorted_freq[k] = new_symbol;
  }

  // Count bit distribution.
  uint8_t bits[MAX_BITS];
  memset(bits, 0, sizeof(bits));
  int max_bit_size = 0;
  for (int i = 0; i <= size; ++i) {
    int s = codesizes[i];
    assert(s <= codesizes[size]);   // symbol #size is the biggest one.
    if (s > 0) {
      // This is slightly penalizing, but only for ultra-rare symbols.
      if (s > MAX_BITS) {
        s = MAX_BITS;
        codesizes[i] = MAX_BITS;   // clamp code-size
      }
      ++bits[s - 1];
      if (s > max_bit_size) {
        max_bit_size = s;
      }
    }
  }

  // We sort symbols by slices of increasing bitsizes, using counting sort.
  // This will generate a partition of symbols in the final syms_[] array.
  int start[MAX_BITS];   // start[i] is the first code with length i+1
  int position = 0;
  for (int i = 0; i < max_bit_size; ++i) {
    start[i] = position;
    position += bits[i];
  }
  assert(position == nb_syms);

  // Now we can dispatch the symbols directly to their final slice in the
  // partitioning, according to their bit-length.
  uint8_t* const syms = const_cast<uint8_t*>(t->syms_);
  // Note that we loop up to symbol = size-1, hence omitting the last (fake)
  // symbol.
  for (int symbol = 0; symbol < size; ++symbol) {
    const int s = codesizes[symbol];
    if (s > 0) {
      assert(s <= MAX_BITS);
      syms[start[s - 1]++] = symbol;
    }
  }
  assert(start[max_bit_size - 1] == nb_syms - 1);

  // Fix codes with length greater than 16 bits. We move too-long codes up,
  // and one shorter code down, making the tree a little sub-optimal.
  for (int l = max_bit_size - 1; l >= MAX_CODE_SIZE; --l) {
    while (bits[l] > 0) {
      int k = l - 2;
      while (bits[k] == 0) {   // Search for a level with a leaf to split.
        --k;
      }
      /* Move up 2 symbols from bottom-most level l, and sink down one from
         level k, like this:
             Before:                    After:
                /  ..                      /  ..
      k bits-> c  \                       /\    \
                  /\                     c  b   /\
                 .. /\                         ..  a
      l bits->     a  b
         Note that by the very construction of the optimal tree, the least
         probable symbols always come by pair with same bit-length.
         So there's always a pair of 'a' and 'b' to find.
      */
      bits[l    ] -= 2;    // remove 'a' and 'b'
      bits[l - 1] += 1;    // put 'a' one level up.
      bits[k    ] -= 1;    // remove 'c'
      bits[k + 1] += 2;    // put 'c' and 'b' one level down.
    }
  }

  // remove last pseudo-symbol
  max_bit_size = MAX_CODE_SIZE;
  while (bits[--max_bit_size] == 0) {
    assert(max_bit_size > 0);
  }
  --bits[max_bit_size];

  // update table with final book
  for (int i = 0; i < MAX_CODE_SIZE; ++i) {
    t->bits_[i] = bits[i];
  }
}
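
// Key-packing example (illustrative): a symbol 0x53 seen 1000 times is stored
// as (1000 << 9) | 0x53 = 512083. qsort() therefore orders primarily by
// frequency (the upper bits); since the low 9 bits hold distinct symbol
// indices, no two keys can compare equal, which is what the assert in cmp()
// relies on.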

void Encoder::CompileEntropyStats() {
  // plug and build new tables
  for (int q_idx = 0; q_idx < (nb_comps_ == 1 ? 1 : 2); ++q_idx) {
    // DC tables
    Huffman_tables_[q_idx] = &opt_tables_dc_[q_idx];
    opt_tables_dc_[q_idx].syms_ = opt_syms_dc_[q_idx];
    BuildOptimalTable(&opt_tables_dc_[q_idx], freq_dc_[q_idx], 12);
    // AC tables
    Huffman_tables_[2 + q_idx] = &opt_tables_ac_[q_idx];
    opt_tables_ac_[q_idx].syms_ = opt_syms_ac_[q_idx];
    BuildOptimalTable(&opt_tables_ac_[q_idx], freq_ac_[q_idx], 256);
  }
}

void Encoder::StoreOptimalHuffmanTables(size_t nb_mbs,
                                        const DCTCoeffs* coeffs) {
  // optimize Huffman tables
  ResetEntropyStats();
  const RunLevel* run_levels = all_run_levels_;
  for (size_t n = 0; n < nb_mbs; ++n) {
    AddEntropyStats(&coeffs[n], run_levels);
    run_levels += coeffs[n].nb_coeffs_;
  }
  CompileEntropyStats();
}

////////////////////////////////////////////////////////////////////////////////

void Encoder::SinglePassScanOptimized() {
  const size_t nb_mbs = mb_w_ * mb_h_ * mcu_blocks_;
  DCTCoeffs* const base_coeffs =
      Alloc<DCTCoeffs>(reuse_run_levels_ ? nb_mbs : 1);
  if (base_coeffs == nullptr) return;
  DCTCoeffs* coeffs = base_coeffs;
  RunLevel base_run_levels[64];
  const QuantizeBlockFunc quantize_block = use_trellis_ ? TrellisQuantizeBlock
                                                        : quantize_block_;

  // We use the default Huffman tables as a basis for bit-rate evaluation
  if (use_trellis_) InitCodes(true);

  ResetEntropyStats();
  ResetDCs();
  nb_run_levels_ = 0;
  int16_t* in = in_blocks_;
  const int mb_x_max = W_ / block_w_;
  const int mb_y_max = H_ / block_h_;
  for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
    const bool yclip = (mb_y == mb_y_max);
    for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
      if (!have_coeffs_) {
        in = in_blocks_;
        GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
        fDCT_(in, mcu_blocks_);
      }
      if (!CheckBuffers()) goto End;
      for (int c = 0; c < nb_comps_; ++c) {
        for (int i = 0; i < nb_blocks_[c]; ++i) {
          RunLevel* const run_levels =
              reuse_run_levels_ ? all_run_levels_ + nb_run_levels_
                                : base_run_levels;
          const int dc = quantize_block(in, c, &quants_[quant_idx_[c]],
                                        coeffs, run_levels);
          coeffs->dc_code_ = GenerateDCDiffCode(dc, &DCs_[c]);
          AddEntropyStats(coeffs, run_levels);
          if (reuse_run_levels_) {
            nb_run_levels_ += coeffs->nb_coeffs_;
            ++coeffs;
            assert(coeffs <= &base_coeffs[nb_mbs]);
          }
          in += 64;
          assert(nb_run_levels_ <= max_run_levels_);
        }
      }
    }
  }

  CompileEntropyStats();
  WriteDHT();
  WriteSOS();

  if (!reuse_run_levels_) {
    SinglePassScan();   // redo everything, but with optimal tables now.
  } else {
    // Re-use the saved run/levels for a fast 2nd pass.
    FinalPassScan(nb_mbs, base_coeffs);
  }
 End:
  Free(base_coeffs);
}

////////////////////////////////////////////////////////////////////////////////
// main call

bool Encoder::Encode() {
  if (!ok_) return false;

  FinalizeQuantMatrix(&quants_[0], q_bias_);
  FinalizeQuantMatrix(&quants_[1], q_bias_);
  SetCostCodes(0);
  SetCostCodes(1);

  // default tables
  for (int i = 0; i < 4; ++i) Huffman_tables_[i] = &kHuffmanTables[i];

  // colorspace init
  InitComponents();
  assert(nb_comps_ <= MAX_COMP);
  assert(mcu_blocks_ <= 6);
  // validate some input parameters
  if (W_ <= 0 || H_ <= 0 || rgb_ == nullptr) return false;

  mb_w_ = (W_ + (block_w_ - 1)) / block_w_;
  mb_h_ = (H_ + (block_h_ - 1)) / block_h_;
  const size_t nb_blocks = use_extra_memory_ ? mb_w_ * mb_h_ : 1;
  if (!AllocateBlocks(nb_blocks * mcu_blocks_)) return false;

  WriteAPP0();

  // custom markers written 'as is'
  if (!WriteAPPMarkers(app_markers_)) return false;

  // metadata
  if (!WriteEXIF(exif_) || !WriteICCP(iccp_) || !WriteXMP(xmp_)) return false;

  if (passes_ > 1) {
    LoopScan();
  } else {
    if (use_adaptive_quant_) {
      // Histogram analysis + derive optimal quant matrices
      CollectHistograms();
      AnalyseHisto();
    }

    WriteDQT();
    WriteSOF();

    if (optimize_size_) {
      SinglePassScanOptimized();
    } else {
      WriteDHT();
      WriteSOS();
      SinglePassScan();
    }
  }
  WriteEOI();
  ok_ = ok_ && bw_.Finalize();

  DesallocateBlocks();
  return ok_;
}
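
// For reference (illustrative summary), the emitted stream follows the
// standard baseline JFIF layout: SOI + APP0 (emitted together by
// WriteAPP0()), optional raw APPn markers, EXIF/ICC/XMP metadata, then
// DQT (0xffdb), SOF0 (0xffc0), DHT (0xffc4), SOS (0xffda), the
// entropy-coded scan, and finally EOI (0xffd9).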

////////////////////////////////////////////////////////////////////////////////
// Edge replication

namespace {

int GetAverage(const int16_t* const out) {
  int DC = 0;
  for (int i = 0; i < 64; ++i) DC += out[i];
  return (DC + 32) >> 6;
}

void SetAverage(int DC, int16_t* const out) {
  for (int i = 0; i < 64; ++i) out[i] = DC;
}

}  // anonymous namespace

void Encoder::AverageExtraLuma(int sub_w, int sub_h, int16_t* out) {
  // out[] points to four 8x8 blocks. When one of these blocks is totally
  // outside of the frame, we set it flat to the average value of the previous
  // block ("DC"), in order to help compressibility.
  int DC = GetAverage(out);
  if (sub_w <= 8) {   // set block #1 to block #0's average value
    SetAverage(DC, out + 1 * 64);
  }
  if (sub_h <= 8) {   // need to flatten blocks #2 and #3
    if (sub_w > 8) {   // block #1 was not flattened, so get its real DC
      DC = GetAverage(out + 1 * 64);
    }
    SetAverage(DC, out + 2 * 64);
    SetAverage(DC, out + 3 * 64);
  } else if (sub_w <= 8) {   // set block #3 to block #2's average value
    DC = GetAverage(out + 2 * 64);
    SetAverage(DC, out + 3 * 64);
  }
}
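
// Example (illustrative): a macroblock overlapping the right border with only
// 5 valid pixel columns (sub_w = 5, sub_h = 16) gets block #1 flattened to
// block #0's DC, and block #3 flattened to block #2's average; blocks #0 and
// #2 keep their (edge-replicated) samples.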

const uint8_t* Encoder::GetReplicatedSamples(const uint8_t* rgb,
                                             int rgb_step,
                                             int sub_w, int sub_h,
                                             int w, int h) {
  assert(sub_w > 0 && sub_h > 0);
  if (sub_w > w) {
    sub_w = w;
  }
  if (sub_h > h) {
    sub_h = h;
  }
  uint8_t* dst = replicated_buffer_;
  for (int y = 0; y < sub_h; ++y) {
    memcpy(dst, rgb, 3 * sub_w);
    const uint8_t* const src0 = &dst[3 * (sub_w - 1)];
    for (int x = 3 * sub_w; x < 3 * w; x += 3) {
      memcpy(dst + x, src0, 3);
    }
    dst += 3 * w;
    rgb += rgb_step;
  }
  const uint8_t* dst0 = dst - 3 * w;
  for (int y = sub_h; y < h; ++y) {
    memcpy(dst, dst0, 3 * w);
    dst += 3 * w;
  }
  return replicated_buffer_;
}
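
// Example (illustrative): with sub_w = 5 and w = 16, each valid row keeps its
// five real RGB pixels and repeats pixel #4 eleven more times; the last valid
// row is then copied verbatim to fill rows sub_h .. h-1.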

// TODO(skal): merge with above function? Probably slower...
const uint8_t* Encoder::GetReplicatedYUVSamples(const uint8_t* in,
                                                int step,
                                                int sub_w, int sub_h,
                                                int w, int h) {
  assert(sub_w > 0 && sub_h > 0);
  if (sub_w > w) {
    sub_w = w;
  }
  if (sub_h > h) {
    sub_h = h;
  }
  uint8_t* out = replicated_buffer_;
  for (int y = 0; y < sub_h; ++y) {
    int x;
    for (x = 0; x < sub_w; ++x)
      out[x] = in[x];
    for (; x < w; ++x) {
      out[x] = out[sub_w - 1];
    }
    out += w;
    in += step;
  }
  const uint8_t* const out0 = out - w;
  for (int y = sub_h; y < h; ++y) {
    memcpy(out, out0, w);
    out += w;
  }
  return replicated_buffer_;
}

////////////////////////////////////////////////////////////////////////////////
// sub-class for YUV 4:2:0 version

class Encoder420 : public Encoder {
 public:
  Encoder420(int W, int H, int step, const uint8_t* const rgb,
             ByteSink* const sink)
      : Encoder(W, H, step, rgb, sink) {}
  virtual ~Encoder420() {}
  virtual void InitComponents() {
    nb_comps_ = 3;

    quant_idx_[0] = 0;
    quant_idx_[1] = 1;
    quant_idx_[2] = 1;

    nb_blocks_[0] = 4;
    nb_blocks_[1] = 1;
    nb_blocks_[2] = 1;
    mcu_blocks_ = 6;

    block_w_ = 16;
    block_h_ = 16;
    block_dims_[0] = 0x22;
    block_dims_[1] = 0x11;
    block_dims_[2] = 0x11;
  }
  virtual void GetSamples(int mb_x, int mb_y, bool clipped,
                          int16_t* out_blocks) {
    const uint8_t* data = rgb_ + (3 * mb_x + mb_y * step_) * 16;
    int step = step_;
    if (clipped) {
      data = GetReplicatedSamples(data, step,
                                  W_ - mb_x * 16, H_ - mb_y * 16, 16, 16);
      step = 3 * 16;
    }
    get_yuv_block_(data, step, out_blocks);
    if (clipped) {
      AverageExtraLuma(W_ - mb_x * 16, H_ - mb_y * 16, out_blocks);
    }
  }
};
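
// Note (illustrative): each block_dims_ entry packs the component's
// horizontal and vertical sampling factors as two 4-bit nibbles, as they
// appear in the SOF0 header: 0x22 means 2x2 sampling for luma (four 8x8
// blocks per 16x16 MCU), 0x11 means 1x1 for each chroma plane.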

////////////////////////////////////////////////////////////////////////////////
// sub-class for YUV 4:4:4 version

class Encoder444 : public Encoder {
 public:
  Encoder444(int W, int H, int step, const uint8_t* const rgb,
             ByteSink* const sink)
      : Encoder(W, H, step, rgb, sink) {
    SetYUVFormat(true);
  }
  virtual ~Encoder444() {}
  virtual void InitComponents() {
    nb_comps_ = 3;

    quant_idx_[0] = 0;
    quant_idx_[1] = 1;
    quant_idx_[2] = 1;

    nb_blocks_[0] = 1;
    nb_blocks_[1] = 1;
    nb_blocks_[2] = 1;
    mcu_blocks_ = 3;

    block_w_ = 8;
    block_h_ = 8;
    block_dims_[0] = 0x11;
    block_dims_[1] = 0x11;
    block_dims_[2] = 0x11;
  }
  virtual void GetSamples(int mb_x, int mb_y, bool clipped, int16_t* out) {
    const uint8_t* data = rgb_ + (3 * mb_x + mb_y * step_) * 8;
    int step = step_;
    if (clipped) {
      data = GetReplicatedSamples(data, step,
                                  W_ - mb_x * 8, H_ - mb_y * 8, 8, 8);
      step = 3 * 8;
    }
    get_yuv_block_(data, step, out);
  }
};

////////////////////////////////////////////////////////////////////////////////
// sub-class for the sharp YUV 4:2:0 version

class EncoderSharp420 : public Encoder420 {
 public:
  EncoderSharp420(int W, int H, int step, const uint8_t* const rgb,
                  ByteSink* const sink)
      : Encoder420(W, H, step, rgb, sink), yuv_memory_(nullptr) {
    const int uv_w = (W + 1) >> 1;
    const int uv_h = (H + 1) >> 1;
    yuv_memory_ = Alloc<uint8_t>(W * H + 2 * uv_w * uv_h);
    if (yuv_memory_ == nullptr) return;
    y_plane_ = yuv_memory_;
    y_step_ = W;
    u_plane_ = yuv_memory_ + W * H;
    v_plane_ = u_plane_ + uv_w * uv_h;
    uv_step_ = uv_w;
    ApplySharpYUVConversion(rgb, W, H, step, y_plane_, u_plane_, v_plane_);
  }
  virtual ~EncoderSharp420() { Free(yuv_memory_); }
  virtual void GetSamples(int mb_x, int mb_y, bool clipped, int16_t* out);

 protected:
  void GetLumaSamples(int mb_x, int mb_y, bool clipped, int16_t* out) {
    int step = y_step_;
    const uint8_t* Y1 = y_plane_ + (mb_x + mb_y * step) * 16;
    if (clipped) {
      Y1 = GetReplicatedYUVSamples(Y1, step,
                                   W_ - mb_x * 16, H_ - mb_y * 16, 16, 16);
      step = 16;
    }
    const uint8_t* Y2 = Y1 + 8 * step;
    for (int y = 8, n = 0; y > 0; --y) {
      for (int x = 0; x < 8; ++x, ++n) {
        out[n + 0 * 64] = Y1[x] - 128;
        out[n + 1 * 64] = Y1[x + 8] - 128;
        out[n + 2 * 64] = Y2[x] - 128;
        out[n + 3 * 64] = Y2[x + 8] - 128;
      }
      Y1 += step;
      Y2 += step;
    }
    if (clipped) {
      AverageExtraLuma(W_ - mb_x * 16, H_ - mb_y * 16, out);
    }
  }

 private:
  uint8_t* y_plane_;
  int y_step_;
  uint8_t* u_plane_;
  uint8_t* v_plane_;
  int uv_step_;
  uint8_t* yuv_memory_;
};

void EncoderSharp420::GetSamples(int mb_x, int mb_y,
                                 bool clipped, int16_t* out) {
  GetLumaSamples(mb_x, mb_y, clipped, out);

  // Chroma
  const uint8_t* U = u_plane_ + (mb_x + mb_y * uv_step_) * 8;
  int step = uv_step_;
  if (clipped) {
    U = GetReplicatedYUVSamples(U, step,
                                ((W_ + 1) >> 1) - mb_x * 8,
                                ((H_ + 1) >> 1) - mb_y * 8, 8, 8);
    step = 8;
  }
  for (int y = 8, n = 0; y > 0; --y, U += step) {
    for (int x = 0; x < 8; ++x, ++n) {
      out[n + 4 * 64] = U[x] - 128;
    }
  }
  const uint8_t* V = v_plane_ + (mb_x + mb_y * uv_step_) * 8;
  step = uv_step_;
  if (clipped) {
    V = GetReplicatedYUVSamples(V, step,
                                ((W_ + 1) >> 1) - mb_x * 8,
                                ((H_ + 1) >> 1) - mb_y * 8, 8, 8);
    step = 8;
  }
  for (int y = 8, n = 0; y > 0; --y, V += step) {
    for (int x = 0; x < 8; ++x, ++n) {
      out[n + 5 * 64] = V[x] - 128;
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
// all-in-one factory to pick up the right encoder instance

Encoder* EncoderFactory(const uint8_t* rgb,
                        int W, int H, int stride, SjpegYUVMode yuv_mode,
                        ByteSink* const sink) {
  if (yuv_mode == SJPEG_YUV_AUTO) {
    yuv_mode = SjpegRiskiness(rgb, W, H, stride, nullptr);
  }

  Encoder* enc = nullptr;
  if (yuv_mode == SJPEG_YUV_420) {
    enc = new (std::nothrow) Encoder420(W, H, stride, rgb, sink);
  } else if (yuv_mode == SJPEG_YUV_SHARP) {
    enc = new (std::nothrow) EncoderSharp420(W, H, stride, rgb, sink);
  } else {
    enc = new (std::nothrow) Encoder444(W, H, stride, rgb, sink);
  }
  if (enc == nullptr || !enc->Ok()) {
    delete enc;
    enc = nullptr;
  }
  return enc;

}  // namespace sjpeg

////////////////////////////////////////////////////////////////////////////////
// public plain-C functions

size_t SjpegEncode(const uint8_t* rgb, int width, int height, int stride,
                   uint8_t** out_data, float quality, int method,
                   SjpegYUVMode yuv_mode) {
  if (rgb == nullptr || out_data == nullptr) return 0;
  if (width <= 0 || height <= 0 || stride < 3 * width) return 0;
  *out_data = nullptr;   // safety

  MemorySink sink(width * height / 4);
  Encoder* const enc = EncoderFactory(rgb, width, height, stride, yuv_mode,
                                      &sink);
  if (enc == nullptr) return 0;   // guard against allocation failure
  enc->SetQuality(quality);
  enc->SetCompressionMethod(method);
  size_t size = 0;
  if (enc->Encode()) sink.Release(out_data, &size);
  delete enc;
  return size;
}

////////////////////////////////////////////////////////////////////////////////

size_t SjpegCompress(const uint8_t* rgb, int width, int height, float quality,
                     uint8_t** out_data) {
  return SjpegEncode(rgb, width, height, 3 * width, out_data,
                     quality, 4, SJPEG_YUV_AUTO);
}

void SjpegFreeBuffer(const uint8_t* buffer) {
  delete[] buffer;
}
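
// Typical usage of the plain-C interface (sketch; rgb / width / height are
// caller-supplied):
//
//   uint8_t* jpeg_data = nullptr;
//   const size_t jpeg_size = SjpegCompress(rgb, width, height, 75.f,
//                                          &jpeg_data);
//   if (jpeg_size > 0) {
//     // ... use jpeg_data[0 .. jpeg_size - 1] ...
//     SjpegFreeBuffer(jpeg_data);   // the buffer was allocated with new[]
//   }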

////////////////////////////////////////////////////////////////////////////////

uint32_t SjpegVersion() {
  return SJPEG_VERSION;
}

////////////////////////////////////////////////////////////////////////////////
// Parametrized call

EncoderParam::EncoderParam() : search_hook(nullptr), memory(nullptr) {
  Init(kDefaultQuality);
}

EncoderParam::EncoderParam(float quality_factor)
    : search_hook(nullptr), memory(nullptr) {
  Init(quality_factor);
}

void EncoderParam::Init(float quality_factor) {
  Huffman_compress = true;
  adaptive_quantization = true;
  use_trellis = false;
  yuv_mode = SJPEG_YUV_AUTO;
  quantization_bias = kDefaultBias;
  qdelta_max_luma = kDefaultDeltaMaxLuma;
  qdelta_max_chroma = kDefaultDeltaMaxChroma;
  adaptive_bias = false;
  SetLimitQuantization(false);
  min_quant_tolerance_ = 0;
  SetQuality(quality_factor);
  target_mode = TARGET_NONE;
  target_value = 0;
  passes = 1;
  tolerance = 1.;
  qmin = 0.;
  qmax = 100.;
}

void EncoderParam::SetQuality(float quality_factor) {
  const float q = GetQFactor(quality_factor);
  sjpeg::SetQuantMatrix(kDefaultMatrices[0], q, quant_[0]);
  sjpeg::SetQuantMatrix(kDefaultMatrices[1], q, quant_[1]);
}

void EncoderParam::SetQuantization(const uint8_t m[2][64],
                                   float reduction) {
  if (reduction <= 1.f) reduction = 1.f;
  if (m == nullptr) return;
  for (int c = 0; c < 2; ++c) {
    for (size_t i = 0; i < 64; ++i) {
      const int v = static_cast<int>(m[c][i] * 100. / reduction + .5);
      quant_[c][i] = (v > 255) ? 255u : (v < 1) ? 1u : v;
    }
  }
}
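
// Example (illustrative): with reduction = 200., an entry m[c][i] = 16 maps
// to v = 16 * 100. / 200. + .5 = 8.5, truncated to 8, i.e. a twice-finer
// quantizer; reduction = 100. leaves the matrix unchanged. Results are
// always clamped to the valid JPEG range [1, 255].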

void EncoderParam::SetLimitQuantization(bool limit_quantization,
                                        int min_quant_tolerance) {
  use_min_quant_ = limit_quantization;
  if (limit_quantization) SetMinQuantization(quant_, min_quant_tolerance);
}

void EncoderParam::SetMinQuantization(const uint8_t m[2][64],
                                      int min_quant_tolerance) {
  use_min_quant_ = true;
  CopyQuantMatrix(m[0], min_quant_[0]);
  CopyQuantMatrix(m[1], min_quant_[1]);
  min_quant_tolerance_ = (min_quant_tolerance < 0) ? 0
                       : (min_quant_tolerance > 100) ? 100
                       : min_quant_tolerance;
}

void EncoderParam::ResetMetadata() {
  iccp.clear();
  exif.clear();
  xmp.clear();
  app_markers.clear();
}

bool Encoder::InitFromParam(const EncoderParam& param) {
  SetQuantMatrices(param.quant_);
  if (param.use_min_quant_) {
    SetMinQuantMatrices(param.min_quant_, param.min_quant_tolerance_);
  } else {
    SetDefaultMinQuantMatrices();
  }

  int method = param.Huffman_compress ? 1 : 0;
  if (param.adaptive_quantization) method += 3;
  if (param.use_trellis) {
    method = (method == 4) ? 7 : (method == 6) ? 8 : method;
  }

  SetCompressionMethod(method);
  SetQuantizationBias(param.quantization_bias, param.adaptive_bias);
  SetQuantizationDeltas(param.qdelta_max_luma, param.qdelta_max_chroma);

  SetMetadata(param.iccp, Encoder::ICC);
  SetMetadata(param.exif, Encoder::EXIF);
  SetMetadata(param.xmp, Encoder::XMP);
  SetMetadata(param.app_markers, Encoder::MARKERS);

  passes_ = (param.passes < 1) ? 1 : (param.passes > 20) ? 20 : param.passes;
  if (passes_ > 1) {
    use_extra_memory_ = true;
    reuse_run_levels_ = true;
    search_hook_ = (param.search_hook == nullptr) ? &default_hook_
                                                  : param.search_hook;
    if (!search_hook_->Setup(param)) return false;
  }

  memory_hook_ = (param.memory == nullptr) ? &kDefaultMemory : param.memory;
  return true;
}
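
// For reference, the flags above map to these compression methods:
//   Huffman_compress=false, adaptive_quantization=false  -> method 0
//   Huffman_compress=true,  adaptive_quantization=false  -> method 1
//   Huffman_compress=false, adaptive_quantization=true   -> method 3
//   Huffman_compress=true,  adaptive_quantization=true   -> method 4
// use_trellis then promotes method 4 to 7; the '6 -> 8' branch is not
// reachable from these flags alone.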

bool sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
                   const EncoderParam& param, ByteSink* sink) {
  if (rgb == nullptr || sink == nullptr) return false;
  if (width <= 0 || height <= 0 || stride < 3 * width) return false;

  Encoder* const enc = EncoderFactory(rgb, width, height, stride,
                                      param.yuv_mode, sink);
  const bool ok = (enc != nullptr) &&
                  enc->InitFromParam(param) &&
                  enc->Encode();
  delete enc;
  return ok;
}

size_t sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
                     const EncoderParam& param, uint8_t** out_data) {
  MemorySink sink(width * height / 4);   // estimation of output size
  if (!sjpeg::Encode(rgb, width, height, stride, param, &sink)) return 0;
  size_t size;
  sink.Release(out_data, &size);
  return size;
}

////////////////////////////////////////////////////////////////////////////////
// std::string variants

bool sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
                   const EncoderParam& param, std::string* output) {
  if (output == nullptr) return false;
  output->clear();
  output->reserve(width * height / 4);
  StringSink sink(output);
  return Encode(rgb, width, height, stride, param, &sink);
}

bool SjpegCompress(const uint8_t* rgb, int width, int height,
                   float quality, std::string* output) {
  EncoderParam param;
  param.SetQuality(quality);
  return Encode(rgb, width, height, 3 * width, param, output);
}
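
// Sketch of the C++ interface with explicit parameters (the quality value
// and the use_trellis choice are illustrative):
//
//   sjpeg::EncoderParam param(75.f);
//   param.use_trellis = true;   // stronger (but slower) compression
//   std::string jpeg;
//   if (sjpeg::Encode(rgb, width, height, 3 * width, param, &jpeg)) {
//     // 'jpeg' now holds the complete JFIF bitstream.
//   }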

////////////////////////////////////////////////////////////////////////////////

bool SjpegDimensions(const std::string& jpeg_data,
                     int* width, int* height, int* is_yuv420) {
  return SjpegDimensions(
      reinterpret_cast<const uint8_t*>(jpeg_data.data()),
      jpeg_data.size(), width, height, is_yuv420);
}

int SjpegFindQuantizer(const std::string& jpeg_data,
                       uint8_t quant[2][64]) {
  return SjpegFindQuantizer(
      reinterpret_cast<const uint8_t*>(jpeg_data.data()), jpeg_data.size(),
      quant);
}

////////////////////////////////////////////////////////////////////////////////