sjpeg 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/sjpeg/enc.cc ADDED
@@ -0,0 +1,2132 @@
1
+ // Copyright 2017 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Fast and simple JPEG encoder
16
+ //
17
+ // Author: Skal (pascal.massimino@gmail.com)
18
+
19
+ #include <stdlib.h>
20
+ #include <math.h>
21
+ #include <float.h> // for FLT_MAX
22
+ #include <stdint.h>
23
+
24
+ #define SJPEG_NEED_ASM_HEADERS
25
+ #include "sjpegi.h"
26
+
27
+ using namespace sjpeg;
28
+
29
+ // Some general default values:
30
+ static const float kDefaultQuality = 75.f;
31
+ static const int kDefaultMethod = 4;
32
+ // Rounding bias for AC coefficients, as 8bit fixed point.
33
+ // A default value 0x78 leans toward filesize reduction.
34
+ static const int32_t kDefaultBias = 0x78;
35
+ // for adaptive quantization:
36
+ static const int kDefaultDeltaMaxLuma = 12;
37
+ static const int kDefaultDeltaMaxChroma = 1;
38
+
39
+ // finer tuning of perceptual optimizations:
40
+
41
+ // Minimum average number of entries per bin required for performing histogram-
42
+ // based optimization. Below this limit, the channel's histogram is declared
43
+ // under-populated and the corresponding optimization skipped.
44
+ static double kDensityThreshold = 0.5;
45
+ // Rejection limit on the correlation factor when extrapolating the distortion
46
+ // from histograms. If the least-square fit has a squared correlation factor
47
+ // less than this threshold, the corresponding quantization scale will be
48
+ // kept unchanged.
49
+ static double kCorrelationThreshold = 0.5;
50
+ // Bit-map of channels to omit during quantization matrix optimization.
51
+ // If the bit 'i + 8 * j' is set in this bit field, the matrix entry at
52
+ // position (i,j) will be kept unchanged during optimization.
53
+ // The default value is 0x103 = 1 + 2 + 256: the 3 entries in the top-left
54
+ // corner (with the lowest frequencies) are not optimized, since doing so can lead to
55
+ // visual degradation of smooth gradients.
56
+ static const uint64_t kOmittedChannels = 0x0000000000000103ULL;
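// (Illustration, not part of the original source: 0x103 has bits 0, 1 and 8
// set, i.e. 'i + 8 * j' for (i, j) = (0, 0), (1, 0) and (0, 1) -- the DC
// entry and its two immediate low-frequency neighbors in the 8x8 matrix.)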
57
+
58
+ ////////////////////////////////////////////////////////////////////////////////
59
+
60
+ namespace sjpeg {
61
+
62
+ const uint8_t kZigzag[64] = {
63
+ 0, 1, 8, 16, 9, 2, 3, 10,
64
+ 17, 24, 32, 25, 18, 11, 4, 5,
65
+ 12, 19, 26, 33, 40, 48, 41, 34,
66
+ 27, 20, 13, 6, 7, 14, 21, 28,
67
+ 35, 42, 49, 56, 57, 50, 43, 36,
68
+ 29, 22, 15, 23, 30, 37, 44, 51,
69
+ 58, 59, 52, 45, 38, 31, 39, 46,
70
+ 53, 60, 61, 54, 47, 55, 62, 63,
71
+ };
72
+
73
+ const uint8_t kDefaultMatrices[2][64] = {
74
+ // these are the default luma/chroma matrices (JPEG spec section K.1)
75
+ { 16, 11, 10, 16, 24, 40, 51, 61,
76
+ 12, 12, 14, 19, 26, 58, 60, 55,
77
+ 14, 13, 16, 24, 40, 57, 69, 56,
78
+ 14, 17, 22, 29, 51, 87, 80, 62,
79
+ 18, 22, 37, 56, 68, 109, 103, 77,
80
+ 24, 35, 55, 64, 81, 104, 113, 92,
81
+ 49, 64, 78, 87, 103, 121, 120, 101,
82
+ 72, 92, 95, 98, 112, 100, 103, 99 },
83
+ { 17, 18, 24, 47, 99, 99, 99, 99,
84
+ 18, 21, 26, 66, 99, 99, 99, 99,
85
+ 24, 26, 56, 99, 99, 99, 99, 99,
86
+ 47, 66, 99, 99, 99, 99, 99, 99,
87
+ 99, 99, 99, 99, 99, 99, 99, 99,
88
+ 99, 99, 99, 99, 99, 99, 99, 99,
89
+ 99, 99, 99, 99, 99, 99, 99, 99,
90
+ 99, 99, 99, 99, 99, 99, 99, 99 }
91
+ };
92
+
93
+ float GetQFactor(float q) {
94
+ // we use the same mapping as jpeg-6b, for consistency
95
+ q = (q <= 0) ? 5000 : (q < 50) ? 5000 / q : (q < 100) ? 2 * (100 - q) : 0;
96
+ // We floor-round to integer here just to preserve compatibility with jpeg6b.
97
+ return floorf(q);
98
+ }
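// (Worked examples of the mapping above, for illustration only -- not part of
// the original source: quality 75 -> 2 * (100 - 75) = 50, quality 50 ->
// 2 * (100 - 50) = 100 (the unscaled spec matrices), quality 10 ->
// 5000 / 10 = 500, and quality >= 100 -> 0, in which case SetQuantMatrix()
// below clamps every entry to 1.)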
99
+
100
+ void CopyQuantMatrix(const uint8_t in[64], uint8_t out[64]) {
101
+ memcpy(out, in, 64 * sizeof(out[0]));
102
+ }
103
+
104
+ void SetQuantMatrix(const uint8_t in[64], float q_factor, uint8_t out[64]) {
105
+ if (in == nullptr || out == nullptr) return;
106
+ q_factor /= 100.f;
107
+ for (size_t i = 0; i < 64; ++i) {
108
+ const int v = static_cast<int>(in[i] * q_factor + .5f);
109
+ // clamp to prevent illegal quantizer values
110
+ out[i] = (v < 1) ? 1 : (v > 255) ? 255u : v;
111
+ }
112
+ }
113
+
114
+ void SetMinQuantMatrix(const uint8_t m[64], uint8_t out[64], int tolerance) {
115
+ assert(out != nullptr && m != nullptr);
116
+ for (size_t i = 0; i < 64; ++i) {
117
+ const int v = static_cast<int>(m[i] * (256 - tolerance) >> 8);
118
+ out[i] = (v < 1) ? 1u : (v > 255) ? 255u : v;
119
+ }
120
+ }
121
+
122
+ void SetDefaultMinQuantMatrix(uint8_t out[64]) {
123
+ assert(out != nullptr);
124
+ for (size_t i = 0; i < 64; ++i) out[i] = 1u;
125
+ }
126
+
127
+ ////////////////////////////////////////////////////////////////////////////////
128
+ // Default memory manager (singleton)
129
+
130
+ static struct DefaultMemory : public MemoryManager {
131
+ public:
132
+ virtual ~DefaultMemory() {}
133
+ virtual void* Alloc(size_t size) { return malloc(size); }
134
+ virtual void Free(void* const ptr) { free(ptr); }
135
+ } kDefaultMemory;
136
+
137
+ ////////////////////////////////////////////////////////////////////////////////
138
+ // Encoder main class
139
+
140
+ Encoder::Encoder(int W, int H, int step, const uint8_t* const rgb,
141
+ ByteSink* const sink)
142
+ : W_(W), H_(H), step_(step),
143
+ rgb_(rgb),
144
+ ok_(true),
145
+ bw_(sink),
146
+ in_blocks_base_(nullptr),
147
+ in_blocks_(nullptr),
148
+ have_coeffs_(false),
149
+ all_run_levels_(nullptr),
150
+ nb_run_levels_(0),
151
+ max_run_levels_(0),
152
+ qdelta_max_luma_(kDefaultDeltaMaxLuma),
153
+ qdelta_max_chroma_(kDefaultDeltaMaxChroma),
154
+ passes_(1),
155
+ search_hook_(nullptr),
156
+ memory_hook_(&kDefaultMemory) {
157
+ SetCompressionMethod(kDefaultMethod);
158
+ SetQuality(kDefaultQuality);
159
+ SetYUVFormat(false);
160
+ SetQuantizationBias(kDefaultBias, false);
161
+ SetDefaultMinQuantMatrices();
162
+ InitializeStaticPointers();
163
+ memset(dc_codes_, 0, sizeof(dc_codes_)); // safety
164
+ memset(ac_codes_, 0, sizeof(ac_codes_));
165
+ }
166
+
167
+ Encoder::~Encoder() {
168
+ Free(all_run_levels_);
169
+ DesallocateBlocks(); // clean up leftovers in case we had an error
170
+ }
171
+
172
+ ////////////////////////////////////////////////////////////////////////////////
173
+
174
+ void Encoder::SetQuality(float q) {
175
+ q = GetQFactor(q);
176
+ SetQuantMatrix(kDefaultMatrices[0], q, quants_[0].quant_);
177
+ SetQuantMatrix(kDefaultMatrices[1], q, quants_[1].quant_);
178
+ }
179
+
180
+ void Encoder::SetQuantMatrices(const uint8_t m[2][64]) {
181
+ SetQuantMatrix(m[0], 100, quants_[0].quant_);
182
+ SetQuantMatrix(m[1], 100, quants_[1].quant_);
183
+ }
184
+
185
+ void Encoder::SetMinQuantMatrices(const uint8_t m[2][64], int tolerance) {
186
+ SetMinQuantMatrix(m[0], quants_[0].min_quant_, tolerance);
187
+ SetMinQuantMatrix(m[1], quants_[1].min_quant_, tolerance);
188
+ }
189
+
190
+ void Encoder::SetDefaultMinQuantMatrices() {
191
+ SetDefaultMinQuantMatrix(quants_[0].min_quant_);
192
+ SetDefaultMinQuantMatrix(quants_[1].min_quant_);
193
+ }
194
+
195
+ void Encoder::SetCompressionMethod(int method) {
196
+ assert(method >= 0 && method <= 8);
197
+ use_adaptive_quant_ = (method >= 3);
198
+ optimize_size_ = (method != 0) && (method != 3);
199
+ use_extra_memory_ = (method == 3) || (method == 4) || (method == 7);
200
+ reuse_run_levels_ = (method == 1) || (method == 4) || (method == 5)
201
+ || (method >= 7);
202
+ use_trellis_ = (method >= 7);
203
+ }
204
+
205
+ void Encoder::SetMetadata(const std::string& data, MetadataType type) {
206
+ switch (type) {
207
+ case ICC: iccp_ = data; break;
208
+ case EXIF: exif_ = data; break;
209
+ case XMP: xmp_ = data; break;
210
+ default:
211
+ case MARKERS: app_markers_ = data; break;
212
+ }
213
+ }
214
+
215
+ void Encoder::SetQuantizationBias(int bias, bool use_adaptive) {
216
+ assert(bias >= 0 && bias <= 255);
217
+ q_bias_ = bias;
218
+ adaptive_bias_ = use_adaptive;
219
+ }
220
+
221
+ void Encoder::SetQuantizationDeltas(int qdelta_luma, int qdelta_chroma) {
222
+ assert(qdelta_luma >= 0 && qdelta_luma <= 255);
223
+ assert(qdelta_chroma >= 0 && qdelta_chroma <= 255);
224
+ qdelta_max_luma_ = qdelta_luma;
225
+ qdelta_max_chroma_ = qdelta_chroma;
226
+ }
227
+
228
+ ////////////////////////////////////////////////////////////////////////////////
229
+ // CPU support
230
+
231
+ extern bool ForceSlowCImplementation;
232
+ bool ForceSlowCImplementation = false; // undocumented! for tests.
233
+
234
+ bool SupportsSSE2() {
235
+ if (ForceSlowCImplementation) return false;
236
+ #if defined(SJPEG_USE_SSE2)
237
+ return true;
238
+ #endif
239
+ return false;
240
+ }
241
+
242
+ bool SupportsNEON() {
243
+ if (ForceSlowCImplementation) return false;
244
+ #if defined(SJPEG_USE_NEON)
245
+ return true;
246
+ #endif
247
+ return false;
248
+ }
249
+
250
+ ////////////////////////////////////////////////////////////////////////////////
251
+ // static pointers to architecture-dependent implementations
252
+
253
+ Encoder::QuantizeErrorFunc Encoder::quantize_error_ = nullptr;
254
+ Encoder::QuantizeBlockFunc Encoder::quantize_block_ = nullptr;
255
+ void (*Encoder::fDCT_)(int16_t* in, int num_blocks) = nullptr;
256
+ Encoder::StoreHistoFunc Encoder::store_histo_ = nullptr;
257
+ RGBToYUVBlockFunc Encoder::get_yuv444_block_ = nullptr;
258
+
259
+ void Encoder::InitializeStaticPointers() {
260
+ if (fDCT_ == nullptr) {
261
+ store_histo_ = GetStoreHistoFunc();
262
+ quantize_block_ = GetQuantizeBlockFunc();
263
+ quantize_error_ = GetQuantizeErrorFunc();
264
+ fDCT_ = GetFdct();
265
+ get_yuv444_block_ = GetBlockFunc(true);
266
+ }
267
+ }
268
+
269
+ ////////////////////////////////////////////////////////////////////////////////
270
+ // memory and internal buffer management. We grow on demand.
271
+
272
+ bool Encoder::SetError() {
273
+ ok_ = false;
274
+ return false;
275
+ }
276
+
277
+ bool Encoder::CheckBuffers() {
278
+ // maximum macroblock size, worst-case, is 24bits*64*6 coeffs = 1152bytes
279
+ ok_ = ok_ && bw_.Reserve(2048);
280
+ if (!ok_) return false;
281
+
282
+ if (reuse_run_levels_) {
283
+ if (nb_run_levels_ + 6*64 > max_run_levels_) {
284
+ // need to grow storage for run/levels
285
+ const size_t new_size = max_run_levels_ ? max_run_levels_ * 2 : 8192;
286
+ RunLevel* const new_rl = Alloc<RunLevel>(new_size);
287
+ if (new_rl == nullptr) return false;
288
+ if (nb_run_levels_ > 0) {
289
+ memcpy(new_rl, all_run_levels_,
290
+ nb_run_levels_ * sizeof(new_rl[0]));
291
+ }
292
+ Free(all_run_levels_);
293
+ all_run_levels_ = new_rl;
294
+ max_run_levels_ = new_size;
295
+ assert(nb_run_levels_ + 6 * 64 <= max_run_levels_);
296
+ }
297
+ }
298
+ return true;
299
+ }
300
+
301
+ bool Encoder::AllocateBlocks(size_t num_blocks) {
302
+ assert(in_blocks_ == nullptr);
303
+ have_coeffs_ = false;
304
+ const size_t size = num_blocks * 64 * sizeof(*in_blocks_);
305
+ in_blocks_base_ = Alloc<uint8_t>(size + ALIGN_CST);
306
+ if (in_blocks_base_ == nullptr) return false;
307
+ in_blocks_ = reinterpret_cast<int16_t*>(
308
+ (ALIGN_CST + reinterpret_cast<uintptr_t>(in_blocks_base_)) & ~ALIGN_CST);
309
+ return true;
310
+ }
311
+
312
+ void Encoder::DesallocateBlocks() {
313
+ Free(in_blocks_base_);
314
+ in_blocks_base_ = nullptr;
315
+ in_blocks_ = nullptr; // sanity
316
+ }
317
+
318
+ ////////////////////////////////////////////////////////////////////////////////
319
+
320
+ #define FP_BITS 16 // fractional precision for fixed-point divisors
321
+ #define AC_BITS 4 // extra precision bits from fdct's scaling
322
+ #define BIAS_DC 0x80 // neutral bias for DC (mandatory!)
323
+
324
+ // divide-by-multiply helper macros
325
+ #define MAKE_INV_QUANT(Q) (((1u << FP_BITS) + (Q) / 2) / (Q))
326
+ #define DIV_BY_MULT(A, M) (((A) * (M)) >> FP_BITS)
327
+ #define QUANTIZE(A, M, B) (DIV_BY_MULT((A) + (B), (M)) >> AC_BITS)
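// (Numeric sanity check of the macros above -- illustrative only, not part of
// the original source: with FP_BITS = 16 and q = 16, MAKE_INV_QUANT(16) =
// (65536 + 8) / 16 = 4096; for a scaled fdct output A = 1600 (100 << AC_BITS)
// and a zero bias, QUANTIZE(1600, 4096, 0) = ((1600 * 4096) >> 16) >> 4 =
// 100 >> 4 = 6, matching the exact integer result (1600 / 16) >> AC_BITS.)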
328
+
329
+ void Encoder::FinalizeQuantMatrix(Quantizer* const q, int q_bias) {
330
+ // first, clamp the quant matrix:
331
+ for (size_t i = 0; i < 64; ++i) {
332
+ if (q->quant_[i] < q->min_quant_[i]) q->quant_[i] = q->min_quant_[i];
333
+ }
334
+ // Special case! for v=1 we can't represent the multiplier with 16b precision.
335
+ // So, instead we max out the multiplier to 0xffffu, and twist the bias to the
336
+ // value 0x80. The overall precision isn't affected: the result is bit-exact
337
+ // over our working range.
338
+ // Note that quant=1 can start appearing at quality as low as 93.
339
+ const uint16_t bias_1 = 0x80;
340
+ const uint16_t iquant_1 = 0xffffu;
341
+ for (size_t i = 0; i < 64; ++i) {
342
+ const uint16_t v = q->quant_[i];
343
+ const uint16_t iquant = (v == 1) ? iquant_1 : MAKE_INV_QUANT(v);
344
+ const uint16_t bias = (v == 1) ? bias_1 : (i == 0) ? BIAS_DC : q_bias;
345
+ const uint16_t ibias = (((bias * v) << AC_BITS) + 128) >> 8;
346
+ const uint16_t qthresh =
347
+ ((1 << (FP_BITS + AC_BITS)) + iquant - 1) / iquant - ibias;
348
+ q->bias_[i] = ibias;
349
+ q->iquant_[i] = iquant;
350
+ q->qthresh_[i] = qthresh;
351
+ assert(QUANTIZE(qthresh, iquant, ibias) > 0);
352
+ assert(QUANTIZE(qthresh - 1, iquant, ibias) == 0);
353
+ }
354
+ }
355
+
356
+ void Encoder::SetCostCodes(int idx) {
357
+ quants_[idx].codes_ = ac_codes_[idx];
358
+ }
359
+
360
+ ////////////////////////////////////////////////////////////////////////////////
361
+ // standard Huffman tables, as per JPEG standard section K.3.
362
+
363
+ static const uint8_t kDCSyms[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
364
+ static const uint8_t kACSyms[2][162] = {
365
+ { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
366
+ 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
367
+ 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
368
+ 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
369
+ 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
370
+ 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
371
+ 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
372
+ 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
373
+ 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
374
+ 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
375
+ 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
376
+ 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
377
+ 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
378
+ 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
379
+ 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
380
+ 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
381
+ 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
382
+ 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
383
+ 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
384
+ 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
385
+ 0xf9, 0xfa },
386
+ { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
387
+ 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
388
+ 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
389
+ 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
390
+ 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
391
+ 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
392
+ 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
393
+ 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
394
+ 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
395
+ 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
396
+ 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
397
+ 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
398
+ 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
399
+ 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
400
+ 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
401
+ 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
402
+ 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
403
+ 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
404
+ 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
405
+ 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
406
+ 0xf9, 0xfa }
407
+ };
408
+
409
+ static const HuffmanTable kHuffmanTables[4] = {
410
+ { { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 }, kDCSyms, 12 },
411
+ { { 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }, kDCSyms, 12 },
412
+ { { 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 125 }, kACSyms[0], 162 },
413
+ { { 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 119 }, kACSyms[1], 162 }
414
+ };
415
+
416
+ ////////////////////////////////////////////////////////////////////////////////
417
+ // This function generates a map from symbols to code + len stored in a packed
418
+ // way (lower 16bit is the length, upper 16bit is the VLC).
419
+ // The input is a JPEG-like description of the symbols:
420
+ // - bits[i] stores the number of codes having length i + 1.
421
+ // - symbols[] contains the symbols' map, in increasing bit-length order.
422
+ // No check is performed on the validity of symbols[]'s content.
423
+ // The values of tab[] not referring to an actual symbol will remain unchanged.
424
+ // Returns the number of symbols used (that is: sum{bits[i]})
425
+
426
+ static int BuildHuffmanTable(const uint8_t bits[16], const uint8_t* symbols,
427
+ uint32_t* const tab) {
428
+ uint32_t code = 0;
429
+ int nb = 0;
430
+ for (int nb_bits = 1; nb_bits <= 16; ++nb_bits, code <<= 1) {
431
+ int n = bits[nb_bits - 1]; // number of codes for that given nb_bits
432
+ nb += n;
433
+ while (n-- > 0) {
434
+ const int symbol = *symbols++;
435
+ tab[symbol] = (code << 16) | nb_bits;
436
+ ++code;
437
+ }
438
+ }
439
+ return nb;
440
+ }
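// (Illustration, not part of the original source: with the standard luma DC
// table above -- bits[] = { 0, 1, 5, ... } -- symbol 0 receives the single
// 2-bit code '00', so tab[0] ends up as (0x0000 << 16) | 2; such packed
// entries are later written out via bw_.PutPackedCode() in CodeBlock().)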
441
+
442
+ ////////////////////////////////////////////////////////////////////////////////
443
+
444
+ void Encoder::InitCodes(bool only_ac) {
445
+ const int nb_tables = (nb_comps_ == 1 ? 1 : 2);
446
+ for (int c = 0; c < nb_tables; ++c) { // luma, chroma
447
+ for (int type = (only_ac ? 1 : 0); type <= 1; ++type) {
448
+ const HuffmanTable* const h = Huffman_tables_[type * 2 + c];
449
+ const int nb_syms = BuildHuffmanTable(h->bits_, h->syms_,
450
+ type == 1 ? ac_codes_[c]
451
+ : dc_codes_[c]);
452
+ assert(nb_syms == h->nb_syms_);
453
+ (void)nb_syms;
454
+ }
455
+ }
456
+ }
457
+
458
+ ////////////////////////////////////////////////////////////////////////////////
459
+ // Quantize coefficients and pseudo-code coefficients
460
+
461
+ static int CalcLog2(int v) {
462
+ #if defined(__GNUC__) && \
463
+ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
464
+ return 32 - __builtin_clz(v);
465
+ #else
466
+ const int kLog2[16] = {
467
+ 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
468
+ assert(v > 0 && v < (1 << 12));
469
+ return (v & ~0xff) ? 8 + kLog2[v >> 8] :
470
+ (v & ~0x0f) ? 4 + kLog2[v >> 4] :
471
+ 0 + kLog2[v];
472
+ #endif
473
+ }
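// (Illustration, not part of the original source: CalcLog2() returns the
// number of significant bits rather than the plain log2, e.g. CalcLog2(1) = 1,
// CalcLog2(3) = 2, CalcLog2(255) = 8 -- exactly the JPEG size category of a
// non-zero magnitude.)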
474
+
475
+ uint16_t Encoder::GenerateDCDiffCode(int DC, int* const DC_predictor) {
476
+ const int diff = DC - *DC_predictor;
477
+ *DC_predictor = DC;
478
+ if (diff == 0) {
479
+ return 0;
480
+ }
481
+ int suff, n;
482
+ if (diff < 0) {
483
+ n = CalcLog2(-diff);
484
+ suff = (diff - 1) & ((1 << n) - 1);
485
+ } else {
486
+ n = CalcLog2(diff);
487
+ suff = diff;
488
+ }
489
+ assert((suff & 0xf000) == 0);
490
+ assert(n < 12);
491
+ return n | (suff << 4);
492
+ }
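// (Worked example, not part of the original source: a DC difference of -3
// falls in size category n = CalcLog2(3) = 2, and the suffix is
// (diff - 1) & 3 = (-4) & 3 = 0, so the function returns 2 | (0 << 4) = 2:
// the category in the low nibble, and the appended bits '00' (JPEG's
// encoding of -3 in category 2) above it.)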
493
+
494
+ ////////////////////////////////////////////////////////////////////////////////
495
+ // various implementation of histogram collection
496
+
497
+ #if defined(SJPEG_USE_SSE2)
498
+ // Load eight 16b-words from *src.
499
+ #define LOAD_16(src) _mm_loadu_si128(reinterpret_cast<const __m128i*>(src))
500
+ // Store eight 16b-words into *dst
501
+ #define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))
502
+
503
+ static int QuantizeBlockSSE2(const int16_t in[64], int idx,
504
+ const Quantizer* const Q,
505
+ DCTCoeffs* const out, RunLevel* const rl) {
506
+ const uint16_t* const bias = Q->bias_;
507
+ const uint16_t* const iquant = Q->iquant_;
508
+ int prev = 1;
509
+ int nb = 0;
510
+ int16_t tmp[64], masked[64];
511
+ for (int i = 0; i < 64; i += 8) {
512
+ const __m128i m_bias = LOAD_16(bias + i);
513
+ const __m128i m_mult = LOAD_16(iquant + i);
514
+ const __m128i A = LOAD_16(in + i); // A = in[i]
515
+ const __m128i B = _mm_srai_epi16(A, 15); // sign extract
516
+ const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B); // abs(A)
517
+ const __m128i D = _mm_adds_epi16(C, m_bias); // v' = v + bias
518
+ const __m128i E = _mm_mulhi_epu16(D, m_mult); // (v' * iq) >> 16
519
+ const __m128i F = _mm_srli_epi16(E, AC_BITS); // = QUANTIZE(...)
520
+ const __m128i G = _mm_xor_si128(F, B); // v ^ mask
521
+ STORE_16(F, tmp + i);
522
+ STORE_16(G, masked + i);
523
+ }
524
+ for (int i = 1; i < 64; ++i) {
525
+ const int j = kZigzag[i];
526
+ const int v = tmp[j];
527
+ if (v > 0) {
528
+ const int n = CalcLog2(v);
529
+ const uint16_t code = masked[j] & ((1 << n) - 1);
530
+ rl[nb].level_ = (code << 4) | n;
531
+ rl[nb].run_ = i - prev;
532
+ prev = i + 1;
533
+ ++nb;
534
+ }
535
+ }
536
+ const int dc = (in[0] < 0) ? -tmp[0] : tmp[0];
537
+ out->idx_ = idx;
538
+ out->last_ = prev - 1;
539
+ out->nb_coeffs_ = nb;
540
+ return dc;
541
+ }
542
+ #undef LOAD_16
543
+ #undef STORE_16
544
+
545
+ #elif defined(SJPEG_USE_NEON)
546
+ static int QuantizeBlockNEON(const int16_t in[64], int idx,
547
+ const Quantizer* const Q,
548
+ DCTCoeffs* const out, RunLevel* const rl) {
549
+ const uint16_t* const bias = Q->bias_;
550
+ const uint16_t* const iquant = Q->iquant_;
551
+ int prev = 1;
552
+ int nb = 0;
553
+ uint16_t tmp[64], masked[64];
554
+ for (int i = 0; i < 64; i += 8) {
555
+ const uint16x8_t m_bias = vld1q_u16(bias + i);
556
+ const uint16x8_t m_mult = vld1q_u16(iquant + i);
557
+ const int16x8_t A = vld1q_s16(in + i); // in[i]
558
+ const uint16x8_t B = vreinterpretq_u16_s16(vabsq_s16(A)); // abs(in)
559
+ const int16x8_t sign = vshrq_n_s16(A, 15); // sign
560
+ const uint16x8_t C = vaddq_u16(B, m_bias); // + bias
561
+ const uint32x4_t D0 = vmull_u16(vget_low_u16(C), vget_low_u16(m_mult));
562
+ const uint32x4_t D1 = vmull_u16(vget_high_u16(C), vget_high_u16(m_mult));
563
+ // collect hi-words of the 32b mult result using 'unzip'
564
+ const uint16x8x2_t E = vuzpq_u16(vreinterpretq_u16_u32(D0),
565
+ vreinterpretq_u16_u32(D1));
566
+ const uint16x8_t F = vshrq_n_u16(E.val[1], AC_BITS);
567
+ const uint16x8_t G = veorq_u16(F, vreinterpretq_u16_s16(sign)); // v ^ mask
568
+ vst1q_u16(tmp + i, F);
569
+ vst1q_u16(masked + i, G);
570
+ }
571
+ for (int i = 1; i < 64; ++i) {
572
+ const int j = kZigzag[i];
573
+ const int v = tmp[j];
574
+ if (v > 0) {
575
+ const int n = CalcLog2(v);
576
+ const uint16_t code = masked[j] & ((1 << n) - 1);
577
+ rl[nb].level_ = (code << 4) | n;
578
+ rl[nb].run_ = i - prev;
579
+ prev = i + 1;
580
+ ++nb;
581
+ }
582
+ }
583
+ const int dc = (in[0] < 0) ? -tmp[0] : tmp[0];
584
+ out->idx_ = idx;
585
+ out->last_ = prev - 1;
586
+ out->nb_coeffs_ = nb;
587
+ return dc;
588
+ }
589
+ #endif // SJPEG_USE_NEON
590
+
591
+ static int QuantizeBlock(const int16_t in[64], int idx,
592
+ const Quantizer* const Q,
593
+ DCTCoeffs* const out, RunLevel* const rl) {
594
+ const uint16_t* const bias = Q->bias_;
595
+ const uint16_t* const iquant = Q->iquant_;
596
+ int prev = 1;
597
+ int nb = 0;
598
+ // This function is speed-critical, so we're using bit masks
599
+ // to extract absolute values, instead of sign tests.
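// (Reminder of the identity used below, added for clarity: for a 32-bit v,
// mask = v >> 31 is 0 when v >= 0 and -1 when v < 0, so (v ^ mask) - mask
// yields abs(v) without a branch; the same mask is reused as 'v ^ mask' to
// form the one's-complement pattern JPEG expects for negative coefficients.)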
600
+ const uint16_t* const qthresh = Q->qthresh_;
601
+ for (int i = 1; i < 64; ++i) {
602
+ const int j = kZigzag[i];
603
+ int v = in[j];
604
+ const int32_t mask = v >> 31;
605
+ v = (v ^ mask) - mask;
606
+ if (v >= qthresh[j]) {
607
+ v = QUANTIZE(v, iquant[j], bias[j]);
608
+ assert(v > 0);
609
+ const int n = CalcLog2(v);
610
+ const uint16_t code = (v ^ mask) & ((1 << n) - 1);
611
+ rl[nb].level_ = (code << 4) | n;
612
+ rl[nb].run_ = i - prev;
613
+ prev = i + 1;
614
+ ++nb;
615
+ }
616
+ }
617
+ const int dc = (in[0] < 0) ? -QUANTIZE(-in[0], iquant[0], bias[0])
618
+ : QUANTIZE(in[0], iquant[0], bias[0]);
619
+ out->idx_ = idx;
620
+ out->last_ = prev - 1;
621
+ out->nb_coeffs_ = nb;
622
+ return dc;
623
+ }
624
+
625
+ ////////////////////////////////////////////////////////////////////////////////
626
+ // Trellis-based quantization
627
+
628
+ typedef uint32_t score_t;
629
+ static const score_t kMaxScore = 0xffffffffu;
630
+
631
+ struct TrellisNode {
632
+ uint32_t code;
633
+ int nbits;
634
+ score_t score;
635
+ uint32_t disto;
636
+ uint32_t bits;
637
+ uint32_t run;
638
+ const TrellisNode* best_prev;
639
+ int pos;
640
+ int rank;
641
+
642
+ TrellisNode() : score(kMaxScore), best_prev(nullptr) {}
643
+ void InitSink() {
644
+ score = 0u;
645
+ disto = 0;
646
+ pos = 0;
647
+ rank = 0;
648
+ nbits = 0;
649
+ bits = 0;
650
+ }
651
+ };
652
+
653
+ static bool SearchBestPrev(const TrellisNode* const nodes0, TrellisNode* node,
654
+ const uint32_t disto0[], const uint32_t codes[],
655
+ uint32_t lambda) {
656
+ bool found = false;
657
+ assert(codes[0xf0] != 0);
658
+ const uint32_t base_disto = node->disto + disto0[node->pos - 1];
659
+ for (const TrellisNode* cur = node - 1; cur >= nodes0; --cur) {
660
+ const int run = node->pos - 1 - cur->pos;
661
+ if (run < 0) continue;
662
+ uint32_t bits = node->nbits;
663
+ bits += (run >> 4) * (codes[0xf0] & 0xff);
664
+ const uint32_t sym = ((run & 15) << 4) | node->nbits;
665
+ assert(codes[sym] != 0);
666
+ bits += codes[sym] & 0xff;
667
+ const uint32_t disto = base_disto - disto0[cur->pos];
668
+ const score_t score = disto + lambda * bits + cur->score;
669
+ if (score < node->score) {
670
+ node->score = score;
671
+ node->disto = disto;
672
+ node->bits = bits;
673
+ node->best_prev = cur;
674
+ node->rank = cur->rank + 1;
675
+ node->run = run;
676
+ found = true;
677
+ }
678
+ }
679
+ return found;
680
+ }
681
+
682
+ // number of alternate levels to investigate
683
+ #define NUM_TRELLIS_NODES 2
684
+
685
+ int Encoder::TrellisQuantizeBlock(const int16_t in[64], int idx,
686
+ const Quantizer* const Q,
687
+ DCTCoeffs* const out,
688
+ RunLevel* const rl) {
689
+ const uint16_t* const bias = Q->bias_;
690
+ const uint16_t* const iquant = Q->iquant_;
691
+ TrellisNode nodes[1 + NUM_TRELLIS_NODES * 63]; // 1 sink + n channels
692
+ nodes[0].InitSink();
693
+ const uint32_t* const codes = Q->codes_;
694
+ TrellisNode* cur_node = &nodes[1];
695
+ uint32_t disto0[64]; // disto0[i] = sum of distortions up to i (inclusive)
696
+ disto0[0] = 0;
697
+ for (int i = 1; i < 64; ++i) {
698
+ const int j = kZigzag[i];
699
+ const uint32_t q = Q->quant_[j] << AC_BITS;
700
+ const uint32_t lambda = q * q / 32u;
701
+ int V = in[j];
702
+ const int32_t mask = V >> 31;
703
+ V = (V ^ mask) - mask;
704
+ disto0[i] = V * V + disto0[i - 1];
705
+ int v = QUANTIZE(V, iquant[j], bias[j]);
706
+ if (v == 0) continue;
707
+ int nbits = CalcLog2(v);
708
+ for (int k = 0; k < NUM_TRELLIS_NODES; ++k) {
709
+ const int err = V - v * q;
710
+ cur_node->code = (v ^ mask) & ((1 << nbits) - 1);
711
+ cur_node->pos = i;
712
+ cur_node->disto = err * err;
713
+ cur_node->nbits = nbits;
714
+ cur_node->score = kMaxScore;
715
+ if (SearchBestPrev(&nodes[0], cur_node, disto0, codes, lambda)) {
716
+ ++cur_node;
717
+ }
718
+ --nbits;
719
+ if (nbits <= 0) break;
720
+ v = (1 << nbits) - 1;
721
+ }
722
+ }
723
+ // search best entry point backward
724
+ const TrellisNode* nz = &nodes[0];
725
+ if (cur_node != nz) {
726
+ score_t best_score = kMaxScore;
727
+ while (cur_node-- != &nodes[0]) {
728
+ const uint32_t disto = disto0[63] - disto0[cur_node->pos];
729
+ // No need to incorporate EOB's bit cost (codes[0x00]), since
730
+ // it's the same for all coeffs except the last one (#63).
731
+ cur_node->disto += disto;
732
+ cur_node->score += disto;
733
+ if (cur_node->score < best_score) {
734
+ nz = cur_node;
735
+ best_score = cur_node->score;
736
+ }
737
+ }
738
+ }
739
+ int nb = nz->rank;
740
+ out->idx_ = idx;
741
+ out->last_ = nz->pos;
742
+ out->nb_coeffs_ = nb;
743
+
744
+ while (nb-- > 0) {
745
+ const int32_t code = nz->code;
746
+ const int n = nz->nbits;
747
+ rl[nb].level_ = (code << 4) | n;
748
+ rl[nb].run_ = nz->run;
749
+ nz = nz->best_prev;
750
+ }
751
+ const int dc = (in[0] < 0) ? -QUANTIZE(-in[0], iquant[0], bias[0])
752
+ : QUANTIZE(in[0], iquant[0], bias[0]);
753
+ return dc;
754
+ }
755
+
756
+ Encoder::QuantizeBlockFunc Encoder::GetQuantizeBlockFunc() {
757
+ #if defined(SJPEG_USE_SSE2)
758
+ if (SupportsSSE2()) return QuantizeBlockSSE2;
759
+ #elif defined(SJPEG_USE_NEON)
760
+ if (SupportsNEON()) return QuantizeBlockNEON;
761
+ #endif
762
+ return QuantizeBlock; // default
763
+ }
764
+
765
+ ////////////////////////////////////////////////////////////////////////////////
766
+
767
+ #if defined(SJPEG_USE_SSE2)
768
+ // Load eight 16b-words from *src.
769
+ #define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
770
+ #define LOAD_64(src) _mm_loadl_epi64((const __m128i*)(src))
771
+ // Store eight 16b-words into *dst
772
+ #define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))
773
+
774
+ static uint32_t QuantizeErrorSSE2(const int16_t in[64],
775
+ const Quantizer* const Q) {
776
+ const uint16_t* const bias = Q->bias_;
777
+ const uint16_t* const iquant = Q->iquant_;
778
+ const uint8_t* const quant = Q->quant_;
779
+ const __m128i zero = _mm_setzero_si128();
780
+ uint32_t tmp[32];
781
+ for (int i = 0; i < 64; i += 8) {
782
+ const __m128i m_bias = LOAD_16(bias + i);
783
+ const __m128i m_iquant = LOAD_16(iquant + i);
784
+ const __m128i m_quant = _mm_unpacklo_epi8(LOAD_64(quant + i), zero);
785
+ const __m128i A = LOAD_16(in + i); // v0 = in[i]
786
+ const __m128i B = _mm_srai_epi16(A, 15); // sign extract
787
+ const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B); // abs(v0)
788
+ const __m128i D = _mm_adds_epi16(C, m_bias); // v' = v0 + bias
789
+ const __m128i E = _mm_mulhi_epu16(D, m_iquant); // (v' * iq) >> 16
790
+ const __m128i F = _mm_srai_epi16(E, AC_BITS);
791
+ const __m128i G = _mm_srai_epi16(C, AC_BITS);
792
+ const __m128i H = _mm_mullo_epi16(F, m_quant); // *= quant[j]
793
+ const __m128i I = _mm_sub_epi16(G, H);
794
+ const __m128i J = _mm_madd_epi16(I, I); // (v0-v) ^ 2
795
+ STORE_16(J, tmp + i / 2);
796
+ }
797
+ uint32_t err = 0;
798
+ for (int i = 0; i < 32; ++i) err += tmp[i];
799
+ return err;
800
+ }
801
+ #undef LOAD_16
802
+ #undef LOAD_64
803
+ #undef STORE_16
804
+
805
+ #elif defined(SJPEG_USE_NEON)
806
+
807
+ static uint32_t QuantizeErrorNEON(const int16_t in[64],
808
+ const Quantizer* const Q) {
809
+ const uint16_t* const bias = Q->bias_;
810
+ const uint16_t* const iquant = Q->iquant_;
811
+ const uint8_t* const quant = Q->quant_;
812
+ uint32x4_t sum1 = vdupq_n_u32(0);
813
+ uint32x4_t sum2 = vdupq_n_u32(0);
814
+ for (int i = 0; i < 64; i += 8) {
815
+ const uint16x8_t m_bias = vld1q_u16(bias + i);
816
+ const uint16x8_t m_mult = vld1q_u16(iquant + i);
817
+ const uint16x8_t m_quant = vmovl_u8(vld1_u8(quant + i));
818
+ const uint16x8_t A = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(in + i)));
819
+ const uint16x8_t B = vaddq_u16(A, m_bias);
820
+ const uint32x4_t C0 = vmull_u16(vget_low_u16(B), vget_low_u16(m_mult));
821
+ const uint32x4_t C1 = vmull_u16(vget_high_u16(B), vget_high_u16(m_mult));
822
+ // collect hi-words of the 32b mult result using 'unzip'
823
+ const uint16x8x2_t D = vuzpq_u16(vreinterpretq_u16_u32(C0),
824
+ vreinterpretq_u16_u32(C1));
825
+ const uint16x8_t E = vshrq_n_u16(D.val[1], AC_BITS);
826
+ const uint16x8_t F = vmulq_u16(E, m_quant); // dequantized coeff
827
+ const uint16x8_t G = vabdq_u16(F, vshrq_n_u16(A, AC_BITS));
828
+ sum1 = vmlal_u16(sum1, vget_low_u16(G), vget_low_u16(G));
829
+ sum2 = vmlal_u16(sum2, vget_high_u16(G), vget_high_u16(G));
830
+ }
831
+ const uint32x4_t sum3 = vaddq_u32(sum1, sum2);
832
+ const uint64x2_t sum4 = vpaddlq_u32(sum3);
833
+ const uint64_t sum5 = vgetq_lane_u64(sum4, 0) + vgetq_lane_u64(sum4, 1);
834
+ const uint32_t err = (uint32_t)sum5;
835
+ return err;
836
+ }
837
+
838
+ #endif // SJPEG_USE_NEON
839
+
840
+ static uint32_t QuantizeError(const int16_t in[64], const Quantizer* const Q) {
841
+ const uint16_t* const bias = Q->bias_;
842
+ const uint16_t* const iquant = Q->iquant_;
843
+ const uint8_t* const quant = Q->quant_;
844
+ uint32_t err = 0;
845
+ for (int j = 0; j < 64; ++j) {
846
+ int32_t v0 = (in[j] < 0) ? -in[j] : in[j];
847
+ const uint32_t v = quant[j] * QUANTIZE(v0, iquant[j], bias[j]);
848
+ v0 >>= AC_BITS;
849
+ err += (v0 - v) * (v0 - v);
850
+ }
851
+ return err;
852
+ }
853
+
854
+ Encoder::QuantizeErrorFunc Encoder::GetQuantizeErrorFunc() {
855
+ #if defined(SJPEG_USE_SSE2)
856
+ if (SupportsSSE2()) return QuantizeErrorSSE2;
857
+ #elif defined(SJPEG_USE_NEON)
858
+ if (SupportsNEON()) return QuantizeErrorNEON;
859
+ #endif
860
+ return QuantizeError; // default
861
+ }
862
+
863
+ ////////////////////////////////////////////////////////////////////////////////
864
+ // Code bitstream
865
+
866
+ void Encoder::ResetDCs() {
867
+ for (int c = 0; c < nb_comps_; ++c) {
868
+ DCs_[c] = 0;
869
+ }
870
+ }
871
+
872
+ void Encoder::CodeBlock(const DCTCoeffs* const coeffs,
873
+ const RunLevel* const rl) {
874
+ const int idx = coeffs->idx_;
875
+ const int q_idx = quant_idx_[idx];
876
+
877
+ // DC coefficient symbol
878
+ const int dc_len = coeffs->dc_code_ & 0x0f;
879
+ const uint32_t code = dc_codes_[q_idx][dc_len];
880
+ bw_.PutPackedCode(code);
881
+ if (dc_len > 0) {
882
+ bw_.PutBits(coeffs->dc_code_ >> 4, dc_len);
883
+ }
884
+
885
+ // AC coeffs
886
+ const uint32_t* const codes = ac_codes_[q_idx];
887
+ for (int i = 0; i < coeffs->nb_coeffs_; ++i) {
888
+ int run = rl[i].run_;
889
+ while (run & ~15) { // escapes
890
+ bw_.PutPackedCode(codes[0xf0]);
891
+ run -= 16;
892
+ }
893
+ const uint32_t suffix = rl[i].level_;
894
+ const int n = suffix & 0x0f;
895
+ const int sym = (run << 4) | n;
896
+ bw_.PutPackedCode(codes[sym]);
897
+ bw_.PutBits(suffix >> 4, n);
898
+ }
899
+ if (coeffs->last_ < 63) { // EOB
900
+ bw_.PutPackedCode(codes[0x00]);
901
+ }
902
+ }
903
+
904
+ ////////////////////////////////////////////////////////////////////////////////
905
+ // Histogram
906
+
907
+ void Encoder::ResetHisto() {
908
+ memset(histos_, 0, sizeof(histos_));
909
+ }
910
+
911
+ #if defined(SJPEG_USE_SSE2)
912
+ void StoreHistoSSE2(const int16_t in[64], Histo* const histos, int nb_blocks) {
913
+ const __m128i kMaxHisto = _mm_set1_epi16(MAX_HISTO_DCT_COEFF);
914
+ for (int n = 0; n < nb_blocks; ++n, in += 64) {
915
+ uint16_t tmp[64];
916
+ for (int i = 0; i < 64; i += 8) {
917
+ const __m128i A =
918
+ _mm_loadu_si128(reinterpret_cast<const __m128i*>(in + i));
919
+ const __m128i B = _mm_srai_epi16(A, 15); // sign extract
920
+ const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B); // abs(A)
921
+ const __m128i D = _mm_srli_epi16(C, HSHIFT); // >>= HSHIFT
922
+ const __m128i E = _mm_min_epi16(D, kMaxHisto);
923
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp + i), E);
924
+ }
925
+ for (int j = 0; j < 64; ++j) {
926
+ const int k = tmp[j];
927
+ ++histos->counts_[j][k];
928
+ }
929
+ }
930
+ }
931
+ #elif defined(SJPEG_USE_NEON)
932
+ void StoreHistoNEON(const int16_t in[64], Histo* const histos, int nb_blocks) {
933
+ const uint16x8_t kMaxHisto = vdupq_n_u16(MAX_HISTO_DCT_COEFF);
934
+ for (int n = 0; n < nb_blocks; ++n, in += 64) {
935
+ uint16_t tmp[64];
936
+ for (int i = 0; i < 64; i += 8) {
937
+ const int16x8_t A = vld1q_s16(in + i);
938
+ const int16x8_t B = vabsq_s16(A); // abs(in)
939
+ const uint16x8_t C = vreinterpretq_u16_s16(B); // signed->unsigned
940
+ const uint16x8_t D = vshrq_n_u16(C, HSHIFT); // >>= HSHIFT
941
+ const uint16x8_t E = vminq_u16(D, kMaxHisto); // min(.,kMaxHisto)
942
+ vst1q_u16(tmp + i, E);
943
+ }
944
+ for (int j = 0; j < 64; ++j) {
945
+ const int k = tmp[j];
946
+ ++histos->counts_[j][k];
947
+ }
948
+ }
949
+ }
950
+ #endif
951
+
952
+ // This C version does not produce the same counts_[] output as the
953
+ // assembly above. But the extra entry counts_[MAX_HISTO_DCT_COEFF] is
954
+ // not used for the final computation, and the global result is unchanged.
955
+ void StoreHisto(const int16_t in[64], Histo* const histos, int nb_blocks) {
956
+ for (int n = 0; n < nb_blocks; ++n, in += 64) {
957
+ for (int i = 0; i < 64; ++i) {
958
+ const int k = (in[i] < 0 ? -in[i] : in[i]) >> HSHIFT;
959
+ if (k < MAX_HISTO_DCT_COEFF) {
960
+ ++histos->counts_[i][k];
961
+ }
962
+ }
963
+ }
964
+ }
965
+
966
+ Encoder::StoreHistoFunc Encoder::GetStoreHistoFunc() {
967
+ #if defined(SJPEG_USE_SSE2)
968
+ if (SupportsSSE2()) return StoreHistoSSE2;
969
+ #elif defined(SJPEG_USE_NEON)
970
+ if (SupportsNEON()) return StoreHistoNEON;
971
+ #endif
972
+ return StoreHisto; // default
973
+ }
974
+
975
+ const float Encoder::kHistoWeight[QSIZE] = {
976
+ // Gaussian with sigma ~= 3
977
+ 0, 0, 0, 0, 0,
978
+ 1, 5, 16, 43, 94, 164, 228, 255, 228, 164, 94, 43, 16, 5, 1,
979
+ 0, 0, 0, 0, 0
980
+ };
981
+
982
+ void Encoder::AnalyseHisto() {
983
+ // A bit of theory and background: for each sub-band i in [0..63], we pick a
984
+ // quantization scale New_Qi close to the initial one Qi. We evaluate a cost
985
+ // function associated with F({New_Qi}) = distortion + lambda . rate,
986
+ // where rate and distortion depend on the quantizers set in a complex non-
987
+ // analytic way. Just, for well-behaved regular histograms, we expect the
988
+ // rate to scale as -log(Q), and the distortion as Q^2.
989
+ // We want the cost function to be stationary around the initial {Qi} set,
990
+ // in order to achieve the best transfer between distortion and rate when we
991
+ // displace the Qi values a little. Mainly we want to use bits as efficiently
992
+ // as possible, where every bit we use has maximal impact in lowering
993
+ // distortion (and vice versa: if we spend an extra bit of coding, we want to
994
+ // have the best bang for this buck. The optimization works up-hill too).
995
+ //
996
+ // Hence, lambda is picked to minimize F around {Qi}, as:
997
+ // lambda = -d(distortion) / d(rate)
998
+ // where the derivatives are evaluated using a double least-square fit on both
999
+ // the clouds of {delta, distortion} and {delta, size} points.
1000
+ //
1001
+ // Note1: The least-square fitted slope of a {x,y} cloud is expressed as:
1002
+ // slope = (<xy> - <x><y>) / (<xx> - <x><x>) = Cov(x,y) / Cov(x,x)
1003
+ // where <.> is our gaussian-averaging operator.
1004
+ // But since we are eventually computing a quotient of such slopes, we can
1005
+ // factor out the common (<xx> - <x><x>) denominator (which is strictly
1006
+ // positive).
1007
+ // Note2: we use a Gaussian-weighted average around the center value Qi
1008
+ // instead of averaging over the whole [QDELTA_MIN, QDELTA_MAX] range.
1009
+ // This rules out fringe samples on noisy cases (like: when the source is
1010
+ // already JPEG-compressed!).
1011
+ // Note3: We fall back to some sane value HLAMBDA in case of ill-condition.
1012
+ //
1013
+ // We use the correlation coefficient
1014
+ // r = Cov(x,y) / sqrt(Cov(x,x) * Cov(y,y))
1015
+ // to detect bad cases with poorly extrapolated distortion. In such
1016
+ // occurrence, we skip the channel. This is particularly important for
1017
+ // already-compressed JPEG sources that give treacherous comb-like
1018
+ // histograms.
1019
+ //
1020
+ // Once this particular lambda has been picked, we loop over each channel
1021
+ // and optimize them separately, locally picking the best New_Qi for each.
1022
+ // The choice of lambda ensures a good balance between size and distortion,
1023
+ // and prevents being too aggressive on file-size reduction, for instance.
1024
+ //
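// (Restated in formula form, for reference only -- not part of the original
// source: with Gaussian weights w over the sampled deltas x, the code below
// accumulates the unnormalized covariance Cov(x, distortion) =
// sw * sxy1 - sx * sy1 into 'num' and Cov(x, size) = sw * sxy2 - sx * sy2
// into 'den', then picks lambda = -num / den; channels failing
// Cov(x, d)^2 >= r_limit * Cov(x, x) * Cov(d, d) are dropped beforehand.)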
1025
+ const double r_limit = kCorrelationThreshold;
1026
+ for (int c = (nb_comps_ > 1 ? 1 : 0); c >= 0; --c) {
1027
+ const int idx = quant_idx_[c];
1028
+ const Histo* const histo = &histos_[idx];
1029
+ // For chrominance, it can be visually damaging to be too
1030
+ // aggressive on the filesize. So with the default settings we
1031
+ // restrict the algorithm to mainly try to *increase* the bitrate
1032
+ // (and quality) by using a smaller qdelta_max_chroma_.
1033
+ // delta_max is only used during the second phase, but not during
1034
+ // the first phase of deriving an optimal lambda.
1035
+ assert(QDELTA_MAX >= qdelta_max_luma_);
1036
+ assert(QDELTA_MAX >= qdelta_max_chroma_);
1037
+ const int delta_max =
1038
+ ((idx == 0) ? qdelta_max_luma_ : qdelta_max_chroma_) - QDELTA_MIN;
1039
+ assert(delta_max < QSIZE);
1040
+ float sizes[64][QSIZE];
1041
+ float distortions[64][QSIZE];
1042
+ double num = 0.; // accumulate d(distortion) around delta_q = 0
1043
+ double den = 0.; // accumulate d(size) around delta_q = 0
1044
+ uint64_t omit_channels = kOmittedChannels;
1045
+ for (int pos = 0; pos < 64; ++pos) {
1046
+ if (omit_channels & (1ULL << pos)) {
1047
+ continue;
1048
+ }
1049
+ const int dq0 = quants_[idx].quant_[pos];
1050
+ const int min_dq0 = quants_[idx].min_quant_[pos];
1051
+ // We should be using the exact bias:
1052
+ // const int bias = quants_[idx].bias_[pos] << (FP_BITS - AC_BITS);
1053
+ // but this value is too precise considering the other approximations
1054
+ // we're using (namely: HSHIFT). So we'd better use a mid value of 0.5
1055
+ // for the bias. This has the advantage of making it possible to
1056
+ // use pre-calculated look-up tables for every quantity in the loop.
1057
+ // This is still a TODO(skal) below, though. Not sure the gain is big.
1058
+ const int bias = 1 << FP_BITS >> 1;
1059
+ const int* const h = histo->counts_[pos];
1060
+ int total = 0;
1061
+ int last = 0;
1062
+ for (int i = 0; i < MAX_HISTO_DCT_COEFF; ++i) {
1063
+ total += h[i];
1064
+ if (h[i]) last = i + 1;
1065
+ }
1066
+ if (total < kDensityThreshold * last) {
1067
+ omit_channels |= 1ULL << pos;
1068
+ continue;
1069
+ }
1070
+ // accumulators for averaged values.
1071
+ double sw = 0., sx = 0.;
1072
+ double sxx = 0., syy1 = 0.;
1073
+ double sy1 = 0., sxy1 = 0.; // accumulators for distortion cloud
1074
+ double sy2 = 0., sxy2 = 0.; // accumulators for size cloud
1075
+ for (int delta = 0; delta < QSIZE; ++delta) {
1076
+ double bsum = 0., dsum = 0.;
1077
+ const int dq = dq0 + (delta + QDELTA_MIN);
1078
+ if (dq >= min_dq0 && dq <= 255) {
1079
+ // TODO(skal): pre-compute idq and use it in FinalizeQuantMatrix too
1080
+ const int idq = ((1 << FP_BITS) + dq - 1) / dq;
1081
+ for (int i = 0; i < last; ++i) {
1082
+ if (h[i]) {
1083
+ // v = current bin's centroid in the histogram
1084
+ // qv = quantized value for the bin's representative 'v'
1085
+ // dqv = dequantized qv, to be compared against v (=> 'error')
1086
+ // bits = approximate bit-cost of the quantized representative
1087
+ // h[i] = this bin's weight
1088
+ const int v = (i << HSHIFT) + HHALF;
1089
+ const int qv = (v * idq + bias) >> FP_BITS;
1090
+ // TODO(skal): for a given 'last' value, we know the upper limit
1091
+ // on dq that will make *all* quantized 'qv' values be zero.
1092
+ // => We can restrict the loop on 'dq' using 'last'.
1093
+ if (qv) {
1094
+ const int bits = CalcLog2(qv);
1095
+ const int dqv = qv * dq;
1096
+ const int error = (v - dqv) * (v - dqv);
1097
+ bsum += h[i] * bits;
1098
+ dsum += h[i] * error;
1099
+ } else {
1100
+ dsum += h[i] * v * v;
1101
+ }
1102
+ }
1103
+ } // end of 'i' loop
1104
+ distortions[pos][delta] = static_cast<float>(dsum);
1105
+ sizes[pos][delta] = static_cast<float>(bsum);
1106
+ const double w = kHistoWeight[delta]; // Gaussian weight
1107
+ if (w > 0.) {
1108
+ const double x = static_cast<double>(delta + QDELTA_MIN);
1109
+ sw += w;
1110
+ sx += w * x;
1111
+ sxx += w * x * x;
1112
+ sy1 += w * dsum;
1113
+ syy1 += w * dsum * dsum;
1114
+ sy2 += w * bsum;
1115
+ sxy1 += w * dsum * x;
1116
+ sxy2 += w * bsum * x;
1117
+ }
1118
+ } else { // the new quantizer is out-of-range.
1119
+ distortions[pos][delta] = FLT_MAX;
1120
+ sizes[pos][delta] = 0;
1121
+ }
1122
+ }
1123
+ // filter channels according to correlation factor.
1124
+ const double cov_xy1 = sw * sxy1 - sx * sy1;
1125
+ if (cov_xy1 * cov_xy1 < r_limit *
1126
+ (sw * sxx - sx * sx) * (sw * syy1 - sy1 * sy1)) {
1127
+ omit_channels |= 1ULL << pos;
1128
+ continue;
1129
+ }
1130
+ // accumulate numerator and denominator for the derivate calculation
1131
+ num += cov_xy1;
1132
+ den += sw * sxy2 - sx * sy2;
1133
+ }
1134
+
1135
+ // we evaluate lambda =~ -d(distortion)/d(size) at dq=0
1136
+ double lambda = HLAMBDA;
1137
+ // When increasing Q, size should significantly decrease and distortion
1138
+ // increase. If they don't, we are ill-conditioned and should fall back
1139
+ // to a safe value HLAMBDA.
1140
+ if (num > 1000. && den < -10.) {
1141
+ // This is our approximation of -d(Distortion) / d(Rate)
1142
+ // We limit it to 1. below, to avoid degenerate cases
1143
+ lambda = -num / den;
1144
+ if (lambda < 1.) {
1145
+ lambda = 1.;
1146
+ }
1147
+ }
1148
+ // now, optimize each channel using the optimal lambda selection
1149
+ for (int pos = 0; pos < 64; ++pos) {
1150
+ if (omit_channels & (1ULL << pos)) {
1151
+ continue;
1152
+ }
1153
+ float best_score = FLT_MAX;
1154
+ int best_dq = 0;
1155
+ for (int delta = 0; delta <= delta_max; ++delta) {
1156
+ if (distortions[pos][delta] < FLT_MAX) {
1157
+ const float score = distortions[pos][delta]
1158
+ + lambda * sizes[pos][delta];
1159
+ if (score < best_score) {
1160
+ best_score = score;
1161
+ best_dq = delta + QDELTA_MIN;
1162
+ }
1163
+ }
1164
+ }
1165
+ quants_[idx].quant_[pos] += best_dq;
1166
+ assert(quants_[idx].quant_[pos] >= 1);
1167
+ }
1168
+ FinalizeQuantMatrix(&quants_[idx], q_bias_);
1169
+ SetCostCodes(idx);
1170
+ }
1171
+ }
1172
+
1173
+ void Encoder::CollectHistograms() {
1174
+ ResetHisto();
1175
+ int16_t* in = in_blocks_;
1176
+ const int mb_x_max = W_ / block_w_;
1177
+ const int mb_y_max = H_ / block_h_;
1178
+ for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
1179
+ const bool yclip = (mb_y == mb_y_max);
1180
+ for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
1181
+ if (!use_extra_memory_) {
1182
+ in = in_blocks_;
1183
+ }
1184
+ GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
1185
+ fDCT_(in, mcu_blocks_);
1186
+ for (int c = 0; c < nb_comps_; ++c) {
1187
+ const int num_blocks = nb_blocks_[c];
1188
+ store_histo_(in, &histos_[quant_idx_[c]], num_blocks);
1189
+ in += 64 * num_blocks;
1190
+ }
1191
+ }
1192
+ }
1193
+ have_coeffs_ = use_extra_memory_;
1194
+ }
1195
+
1196
+ ////////////////////////////////////////////////////////////////////////////////
1197
+ // Perform YUV conversion and fDCT, and store the unquantized coeffs
1198
+
1199
+ void Encoder::CollectCoeffs() {
1200
+ assert(use_extra_memory_);
1201
+ int16_t* in = in_blocks_;
1202
+ const int mb_x_max = W_ / block_w_;
1203
+ const int mb_y_max = H_ / block_h_;
1204
+ for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
1205
+ const bool yclip = (mb_y == mb_y_max);
1206
+ for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
1207
+ GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
1208
+ fDCT_(in, mcu_blocks_);
1209
+ in += 64 * mcu_blocks_;
1210
+ }
1211
+ }
1212
+ have_coeffs_ = true;
1213
+ }
1214
+
1215
+ ////////////////////////////////////////////////////////////////////////////////
1216
+ // 1-pass Scan
1217
+
1218
+ void Encoder::SinglePassScan() {
1219
+ ResetDCs();
1220
+
1221
+ RunLevel base_run_levels[64];
1222
+ int16_t* in = in_blocks_;
1223
+ const int mb_x_max = W_ / block_w_;
1224
+ const int mb_y_max = H_ / block_h_;
1225
+ const QuantizeBlockFunc quantize_block = use_trellis_ ? TrellisQuantizeBlock
1226
+ : quantize_block_;
1227
+ for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
1228
+ const bool yclip = (mb_y == mb_y_max);
1229
+ for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
1230
+ if (!CheckBuffers()) return;
1231
+ if (!have_coeffs_) {
1232
+ in = in_blocks_;
1233
+ GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
1234
+ fDCT_(in, mcu_blocks_);
1235
+ }
1236
+ for (int c = 0; c < nb_comps_; ++c) {
1237
+ DCTCoeffs base_coeffs;
1238
+ for (int i = 0; i < nb_blocks_[c]; ++i) {
1239
+ const int dc = quantize_block(in, c, &quants_[quant_idx_[c]],
1240
+ &base_coeffs, base_run_levels);
1241
+ base_coeffs.dc_code_ = GenerateDCDiffCode(dc, &DCs_[c]);
1242
+ CodeBlock(&base_coeffs, base_run_levels);
1243
+ in += 64;
1244
+ }
1245
+ }
1246
+ }
1247
+ }
1248
+ }
1249
+
1250
+ void Encoder::FinalPassScan(size_t nb_mbs, const DCTCoeffs* coeffs) {
1251
+ DesallocateBlocks(); // we can free up some coeffs memory at this point
1252
+ if (!CheckBuffers()) return; // call needed to finalize all_run_levels_
1253
+ assert(reuse_run_levels_);
1254
+ const RunLevel* run_levels = all_run_levels_;
1255
+ for (size_t n = 0; n < nb_mbs; ++n) {
1256
+ if (!CheckBuffers()) return;
1257
+ CodeBlock(&coeffs[n], run_levels);
1258
+ run_levels += coeffs[n].nb_coeffs_;
1259
+ }
1260
+ }
1261
+
1262
+ ////////////////////////////////////////////////////////////////////////////////
1263
+ // Huffman tables optimization
1264
+
1265
+ void Encoder::ResetEntropyStats() {
1266
+ memset(freq_ac_, 0, sizeof(freq_ac_));
1267
+ memset(freq_dc_, 0, sizeof(freq_dc_));
1268
+ }
1269
+
1270
+ void Encoder::AddEntropyStats(const DCTCoeffs* const coeffs,
1271
+ const RunLevel* const run_levels) {
1272
+ // freq_ac_[] and freq_dc_[] cannot overflow 32bits, since the maximum
1273
+ // resolution allowed is 65535 * 65535. The sum of all frequencies cannot
1274
+ // be greater than 32bits, either.
1275
+ const int idx = coeffs->idx_;
1276
+ const int q_idx = quant_idx_[idx];
1277
+ for (int i = 0; i < coeffs->nb_coeffs_; ++i) {
1278
+ const int run = run_levels[i].run_;
1279
+ const int tmp = (run >> 4);
1280
+ if (tmp) freq_ac_[q_idx][0xf0] += tmp; // count escapes (all at once)
1281
+ const int suffix = run_levels[i].level_;
1282
+ const int sym = ((run & 0x0f) << 4) | (suffix & 0x0f);
1283
+ ++freq_ac_[q_idx][sym];
1284
+ }
1285
+ if (coeffs->last_ < 63) { // EOB
1286
+ ++freq_ac_[q_idx][0x00];
1287
+ }
1288
+ ++freq_dc_[q_idx][coeffs->dc_code_ & 0x0f];
1289
+ }
1290
+
1291
+ static int cmp(const void *pa, const void *pb) {
1292
+ const uint64_t a = *reinterpret_cast<const uint64_t*>(pa);
1293
+ const uint64_t b = *reinterpret_cast<const uint64_t*>(pb);
1294
+ assert(a != b); // tie-breaks can't happen
1295
+ return (a < b) ? 1 : -1;
1296
+ }
1297
+
1298
+ static void BuildOptimalTable(HuffmanTable* const t,
1299
+ const uint32_t* const freq, int size) {
1300
+ enum { MAX_BITS = 32, MAX_CODE_SIZE = 16 };
1301
+ assert(size <= 256);
1302
+ assert(t != nullptr);
1303
+
1304
+ // The celebrated merging algorithm from Huffman, with some restrictions:
1305
+ // * codes with all '1' are forbidden, to avoid trailing marker emulation
1306
+ // * codes should be less than 16 bits. So we're re-allocating them to shorter
1307
+ // codes, even if it means being suboptimal for extremely rare symbols that
1308
+ // would eat a lot of bits.
1309
+ // This function will not touch the content of freq[].
1310
+ int codesizes[256 + 1];
1311
+ // chain[i] will hold the index of the next element in the subtree below
1312
+ // element 'i', or -1 if there's no sub-tree.
1313
+ // We use and maintain this list in order to efficiently increase the
1314
+ // codesizes by one when merging two sub-trees into one.
1315
+ // To ease the merging (by avoiding 1 loop) we store the address of the last
1316
+ // element in the chain for each symbol. This makes the process O(1).
1317
+ // It's probably better to keep the arrays separated instead of making
1318
+ // a struct, since we touch chain_end[] only once per merging, whereas
1319
+ // chain[] and codesizes[] are modified O(k) times per merge.
1320
+ int chain[256 + 1];
1321
+ int* chain_end[256 + 1];
1322
+ // sorted_freq[] remains sorted by decreasing frequencies along the process.
1323
+ uint64_t sorted_freq[256 + 1];
1324
+
1325
+ // Count the symbols effectively used and put them at the beginning of the table.
1326
+ int nb_syms = 0;
1327
+ for (int i = 0; i < size; ++i) {
1328
+ const uint64_t v = freq[i];
1329
+ if (v > 0) {
1330
+ // we pack the sorted key (32bits) and index (9bits) into a single
1331
+ // uint64_t, so we don't have to resort to structs (and we avoid
1332
+ // tie-breaks, too)
1333
+ sorted_freq[nb_syms++] = (v << 9) | i;
1334
+ }
1335
+ codesizes[i] = 0;
1336
+ chain[i] = -1;
1337
+ chain_end[i] = &chain[i];
1338
+ }
1339
+ t->nb_syms_ = nb_syms; // Record how many final symbols we'll have.
1340
+
1341
+ // initial sort
1342
+ // TODO(skal): replace by counting-sort?? (merged with previous loop?)
1343
+ qsort(sorted_freq, nb_syms, sizeof(sorted_freq[0]), cmp);
1344
+
1345
+ // fake last symbol, with lowest frequency: will be assigned to the forbidden
1346
+ // code '1111...1', but will eventually be discarded.
1347
+ sorted_freq[nb_syms++] = (1ULL << 9) | size;
1348
+ codesizes[size] = 0;
1349
+ chain[size] = -1;
1350
+ chain_end[size] = &chain[size];
1351
+
1352
+ // Merging phase
1353
+ // Recursively merge the two symbols with lowest frequency. The resulting
1354
+ // super-symbol will be represented by a longer (by 1bit) code, since
1355
+ // it's the least frequent one.
1356
+ int nb = nb_syms;
1357
+ while (nb-- > 1) {
1358
+ // First, link the two sub-trees.
1359
+ const uint64_t s1 = sorted_freq[nb - 1]; // first symbol
1360
+ const uint64_t s2 = sorted_freq[nb]; // second symbol, appended
1361
+ // The 0x1ff masking is for taking only the symbol, discarding the
1362
+ // frequency that we stored in the upper bits for sorting.
1363
+ int i = s1 & 0x1ff;
1364
+ const int j = s2 & 0x1ff;
1365
+ assert(i <= size && j <= size);
1366
+ *chain_end[i] = j;
1367
+ chain_end[i] = chain_end[j];
1368
+
1369
+ // Then, following the chain, increase the whole sub-tree's weight by 1bit.
1370
+ do {
1371
+ ++codesizes[i];
1372
+ i = chain[i];
1373
+ } while (i >= 0);
1374
+
1375
+ // Create new symbol, with merged frequencies. Will take s1's spot.
1376
+ // We must use 64bit here to prevent overflow in the sum. Both s1 and
1377
+ // s2 are originally 32 + 9 bits wide.
1378
+ const uint64_t new_symbol = s1 + (s2 & ~0x1ff);
1379
+ // Perform insertion sort to find the new spot of the merged symbol.
1380
+ int k = nb - 1;
1381
+ while (k > 0) {
1382
+ if (sorted_freq[k - 1] < new_symbol) {
1383
+ sorted_freq[k] = sorted_freq[k - 1];
1384
+ --k;
1385
+ } else {
1386
+ break;
1387
+ }
1388
+ }
1389
+ sorted_freq[k] = new_symbol;
1390
+ }
1391
+
1392
+ // Count bit distribution.
1393
+ uint8_t bits[MAX_BITS];
1394
+ memset(bits, 0, sizeof(bits));
1395
+ int max_bit_size = 0;
1396
+ for (int i = 0; i <= size; ++i) {
1397
+ int s = codesizes[i];
1398
+ assert(s <= codesizes[size]); // symbol #size is the biggest one.
1399
+ if (s > 0) {
1400
+ // This is slightly penalizing, but only for ultra-rare symbols
1401
+ if (s > MAX_BITS) {
1402
+ s = MAX_BITS;
1403
+ codesizes[i] = MAX_BITS; // clamp code-size
1404
+ }
1405
+ ++bits[s - 1];
1406
+ if (s > max_bit_size) {
1407
+ max_bit_size = s;
1408
+ }
1409
+ }
1410
+ }
1411
+
1412
+ // We sort symbols by slices of increasing bitsizes, using counting sort.
1413
+ // This will generate a partition of symbols in the final syms_[] array.
1414
+ int start[MAX_BITS]; // start[i] is the first code with length i+1
1415
+ int position = 0;
1416
+ for (int i = 0; i < max_bit_size; ++i) {
1417
+ start[i] = position;
1418
+ position += bits[i];
1419
+ }
1420
+ assert(position == nb_syms);
1421
+
1422
+ // Now, we can dispatch the symbols directly to their final slice in the
1423
+ // partitioning, according to their bit-length.
1424
+ // Note: we omit the last symbol, which is fake.
1425
+ uint8_t* const syms = const_cast<uint8_t*>(t->syms_);
1426
+ // Note that we loop until symbol = size-1, hence omitting the last fake symbol.
1427
+ for (int symbol = 0; symbol < size; ++symbol) {
1428
+ const int s = codesizes[symbol];
1429
+ if (s > 0) {
1430
+ assert(s <= MAX_BITS);
1431
+ syms[start[s - 1]++] = symbol;
1432
+ }
1433
+ }
1434
+ assert(start[max_bit_size - 1] == nb_syms - 1);
1435
+
1436
+ // Fix codes with length greater than 16 bits. We move too-long
1437
+ // codes up, and one short code down, making the tree a little sub-optimal.
1438
+ for (int l = max_bit_size - 1; l >= MAX_CODE_SIZE; --l) {
1439
+ while (bits[l] > 0) {
1440
+ int k = l - 2;
1441
+ while (bits[k] == 0) { // Search for a level with a leaf to split.
1442
+ --k;
1443
+ }
1444
+ /* Move up 2 symbols from bottom-most level l, and sink down one from
1445
+ level k, like this:
1446
+ Before: 'c' is a leaf at level k, and 'a' and 'b' are the two
1447
+ deepest leaves, at level l.
1448
+ After: 'c' becomes an internal node whose children are 'c' and 'b'
1449
+ (both one level below k), while 'a' moves up to take the place of
1450
+ the old (a, b) pair, one level above l. The histogram thus stays
1451
+ realizable as a prefix code, and the deepest level loses two leaves.
1452
+ Note that by the very construction of the optimal tree, the least
1453
+ probable symbols always come in pairs with the same bit-length.
1454
+ So there's always a pair of 'a' and 'b' to find.
1455
+ */
1456
+ bits[l ] -= 2; // remove 'a' and 'b'
1457
+ bits[l - 1] += 1; // put 'a' one level up.
1458
+ bits[k ] -= 1; // remove 'c'
1459
+ bits[k + 1] += 2; // put 'c' and 'b' one level down.
1460
+ }
1461
+ }
1462
+
1463
+ // remove last pseudo-symbol
1464
+ max_bit_size = MAX_CODE_SIZE;
1465
+ while (bits[--max_bit_size] == 0) {
1466
+ assert(max_bit_size > 0);
1467
+ }
1468
+ --bits[max_bit_size];
1469
+
1470
+ // update table with final book
1471
+ for (int i = 0; i < MAX_CODE_SIZE; ++i) {
1472
+ t->bits_[i] = bits[i];
1473
+ }
1474
+ }
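The length-limiting pass above rebalances the bits[] histogram so that no code exceeds MAX_CODE_SIZE while keeping it realizable as a prefix code. Realizability is exactly the Kraft inequality; a small self-contained checker, purely illustrative and not part of the encoder, could look like this (assuming at most 16 length entries):

#include <stdint.h>
#include <vector>

// counts[i] = number of codes of length (i + 1) bits. The histogram can be
// realized by a prefix code iff sum(counts[i] * 2^-(i+1)) <= 1; the sum is
// scaled by 2^max_len to stay in integer arithmetic (max_len <= 16 here).
static bool IsValidLengthHistogram(const std::vector<int>& counts) {
  const int max_len = static_cast<int>(counts.size());
  uint64_t kraft = 0;
  for (int i = 0; i < max_len; ++i) {
    kraft += static_cast<uint64_t>(counts[i]) << (max_len - 1 - i);
  }
  return kraft <= (1ull << max_len);
}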
1475
+
1476
+ void Encoder::CompileEntropyStats() {
1477
+ // plug and build new tables
1478
+ for (int q_idx = 0; q_idx < (nb_comps_ == 1 ? 1 : 2); ++q_idx) {
1479
+ // DC tables
1480
+ Huffman_tables_[q_idx] = &opt_tables_dc_[q_idx];
1481
+ opt_tables_dc_[q_idx].syms_ = opt_syms_dc_[q_idx];
1482
+ BuildOptimalTable(&opt_tables_dc_[q_idx], freq_dc_[q_idx], 12);
1483
+ // AC tables
1484
+ Huffman_tables_[2 + q_idx] = &opt_tables_ac_[q_idx];
1485
+ opt_tables_ac_[q_idx].syms_ = opt_syms_ac_[q_idx];
1486
+ BuildOptimalTable(&opt_tables_ac_[q_idx], freq_ac_[q_idx], 256);
1487
+ }
1488
+ }
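The four Huffman_tables_ slots filled above follow the usual JPEG layout: DC tables first, then AC tables, luma before chroma. An illustrative (non-authoritative) naming of those slot indices:

enum HuffmanTableSlot {
  kLumaDC = 0,    // Huffman_tables_[0]
  kChromaDC = 1,  // Huffman_tables_[1]
  kLumaAC = 2,    // Huffman_tables_[2]
  kChromaAC = 3   // Huffman_tables_[3]
};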
1489
+
1490
+ void Encoder::StoreOptimalHuffmanTables(size_t nb_mbs,
1491
+ const DCTCoeffs* coeffs) {
1492
+ // optimize Huffman tables
1493
+ ResetEntropyStats();
1494
+ const RunLevel* run_levels = all_run_levels_;
1495
+ for (size_t n = 0; n < nb_mbs; ++n) {
1496
+ AddEntropyStats(&coeffs[n], run_levels);
1497
+ run_levels += coeffs[n].nb_coeffs_;
1498
+ }
1499
+ CompileEntropyStats();
1500
+ }
1501
+
1502
+ ////////////////////////////////////////////////////////////////////////////////
1503
+
1504
+ void Encoder::SinglePassScanOptimized() {
1505
+ const size_t nb_mbs = mb_w_ * mb_h_ * mcu_blocks_;
1506
+ DCTCoeffs* const base_coeffs =
1507
+ Alloc<DCTCoeffs>(reuse_run_levels_ ? nb_mbs : 1);
1508
+ if (base_coeffs == nullptr) return;
1509
+ DCTCoeffs* coeffs = base_coeffs;
1510
+ RunLevel base_run_levels[64];
1511
+ const QuantizeBlockFunc quantize_block = use_trellis_ ? TrellisQuantizeBlock
1512
+ : quantize_block_;
1513
+
1514
+ // We use the default Huffman tables as a basis for bit-rate evaluation.
1515
+ if (use_trellis_) InitCodes(true);
1516
+
1517
+ ResetEntropyStats();
1518
+ ResetDCs();
1519
+ nb_run_levels_ = 0;
1520
+ int16_t* in = in_blocks_;
1521
+ const int mb_x_max = W_ / block_w_;
1522
+ const int mb_y_max = H_ / block_h_;
1523
+ for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
1524
+ const bool yclip = (mb_y == mb_y_max);
1525
+ for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
1526
+ if (!have_coeffs_) {
1527
+ in = in_blocks_;
1528
+ GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
1529
+ fDCT_(in, mcu_blocks_);
1530
+ }
1531
+ if (!CheckBuffers()) goto End;
1532
+ for (int c = 0; c < nb_comps_; ++c) {
1533
+ for (int i = 0; i < nb_blocks_[c]; ++i) {
1534
+ RunLevel* const run_levels =
1535
+ reuse_run_levels_ ? all_run_levels_ + nb_run_levels_
1536
+ : base_run_levels;
1537
+ const int dc = quantize_block(in, c, &quants_[quant_idx_[c]],
1538
+ coeffs, run_levels);
1539
+ coeffs->dc_code_ = GenerateDCDiffCode(dc, &DCs_[c]);
1540
+ AddEntropyStats(coeffs, run_levels);
1541
+ if (reuse_run_levels_) {
1542
+ nb_run_levels_ += coeffs->nb_coeffs_;
1543
+ ++coeffs;
1544
+ assert(coeffs <= &base_coeffs[nb_mbs]);
1545
+ }
1546
+ in += 64;
1547
+ assert(nb_run_levels_ <= max_run_levels_);
1548
+ }
1549
+ }
1550
+ }
1551
+ }
1552
+
1553
+ CompileEntropyStats();
1554
+ WriteDHT();
1555
+ WriteSOS();
1556
+
1557
+ if (!reuse_run_levels_) {
1558
+ SinglePassScan(); // redo everything, but with optimal tables now.
1559
+ } else {
1560
+ // Re-use the saved run/levels for fast 2nd-pass.
1561
+ FinalPassScan(nb_mbs, base_coeffs);
1562
+ }
1563
+ End:
1564
+ Free(base_coeffs);
1565
+ }
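GenerateDCDiffCode() above codes each block's DC coefficient differentially against the previous DC of the same component, with DCs_[] as the running state (reset by ResetDCs()). A minimal standalone sketch of that prediction step, with illustrative names and without the size-category derivation done by the real code:

struct DCPredictor { int last_dc[3] = { 0, 0, 0 }; };  // one predictor per component, zeroed per scan

// Returns the value to entropy-code for this block's DC, and updates the
// predictor so the next block of the same component is coded against it.
static int DCDiff(DCPredictor* const p, int comp, int dc) {
  const int diff = dc - p->last_dc[comp];
  p->last_dc[comp] = dc;
  return diff;
}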
1566
+
1567
+ ////////////////////////////////////////////////////////////////////////////////
1568
+ // main call
1569
+
1570
+ bool Encoder::Encode() {
1571
+ if (!ok_) return false;
1572
+
1573
+ FinalizeQuantMatrix(&quants_[0], q_bias_);
1574
+ FinalizeQuantMatrix(&quants_[1], q_bias_);
1575
+ SetCostCodes(0);
1576
+ SetCostCodes(1);
1577
+
1578
+ // default tables
1579
+ for (int i = 0; i < 4; ++i) Huffman_tables_[i] = &kHuffmanTables[i];
1580
+
1581
+ // colorspace init
1582
+ InitComponents();
1583
+ assert(nb_comps_ <= MAX_COMP);
1584
+ assert(mcu_blocks_ <= 6);
1585
+ // validate some input parameters
1586
+ if (W_ <= 0 || H_ <= 0 || rgb_ == nullptr) return false;
1587
+
1588
+ mb_w_ = (W_ + (block_w_ - 1)) / block_w_;
1589
+ mb_h_ = (H_ + (block_h_ - 1)) / block_h_;
1590
+ const size_t nb_blocks = use_extra_memory_ ? mb_w_ * mb_h_ : 1;
1591
+ if (!AllocateBlocks(nb_blocks * mcu_blocks_)) return false;
1592
+
1593
+ WriteAPP0();
1594
+
1595
+ // custom markers written 'as is'
1596
+ if (!WriteAPPMarkers(app_markers_)) return false;
1597
+
1598
+ // metadata
1599
+ if (!WriteEXIF(exif_) || !WriteICCP(iccp_) || !WriteXMP(xmp_)) return false;
1600
+
1601
+ if (passes_ > 1) {
1602
+ LoopScan();
1603
+ } else {
1604
+ if (use_adaptive_quant_) {
1605
+ // Histogram analysis + derive optimal quant matrices
1606
+ CollectHistograms();
1607
+ AnalyseHisto();
1608
+ }
1609
+
1610
+ WriteDQT();
1611
+ WriteSOF();
1612
+
1613
+ if (optimize_size_) {
1614
+ SinglePassScanOptimized();
1615
+ } else {
1616
+ WriteDHT();
1617
+ WriteSOS();
1618
+ SinglePassScan();
1619
+ }
1620
+ }
1621
+ WriteEOI();
1622
+ ok_ = ok_ && bw_.Finalize();
1623
+
1624
+ DesallocateBlocks();
1625
+ return ok_;
1626
+ }
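For orientation, the single-pass path of Encode() emits the stream in the order of the calls above; whether the initial SOI marker is written inside WriteAPP0() is not visible in this file, so that detail is an assumption:

// Illustrative stream layout for the single-pass case:
//   [SOI] APP0 (JFIF) -> custom APPn markers -> EXIF / ICCP / XMP
//   -> DQT -> SOF -> DHT -> SOS -> entropy-coded scan -> EOI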
1627
+
1628
+ ////////////////////////////////////////////////////////////////////////////////
1629
+ // Edge replication
1630
+
1631
+ namespace {
1632
+
1633
+ int GetAverage(const int16_t* const out) {
1634
+ int DC = 0;
1635
+ for (int i = 0; i < 64; ++i) DC += out[i];
1636
+ return (DC + 32) >> 6;
1637
+ }
1638
+
1639
+ void SetAverage(int DC, int16_t* const out) {
1640
+ for (int i = 0; i < 64; ++i) out[i] = DC;
1641
+ }
1642
+
1643
+ } // anonymous namespace
1644
+
1645
+ void Encoder::AverageExtraLuma(int sub_w, int sub_h, int16_t* out) {
1646
+ // out[] points to four 8x8 blocks. When one of these blocks lies entirely
1647
+ // outside of the frame, we set it flat to the average value ("DC") of the
1648
+ // previous block, in order to help compressibility.
1649
+ int DC = GetAverage(out);
1650
+ if (sub_w <= 8) { // set block #1 to block #0's average value
1651
+ SetAverage(DC, out + 1 * 64);
1652
+ }
1653
+ if (sub_h <= 8) { // Need to flatten block #2 and #3
1654
+ if (sub_w > 8) { // block #1 was not flattened, so get its real DC
1655
+ DC = GetAverage(out + 1 * 64);
1656
+ }
1657
+ SetAverage(DC, out + 2 * 64);
1658
+ SetAverage(DC, out + 3 * 64);
1659
+ } else if (sub_w <= 8) { // set block #3 to block #2's average value
1660
+ DC = GetAverage(out + 2 * 64);
1661
+ SetAverage(DC, out + 3 * 64);
1662
+ }
1663
+ }
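The four luma blocks are laid out as #0 = top-left, #1 = top-right, #2 = bottom-left, #3 = bottom-right of the 16x16 macroblock. A tiny hypothetical helper mirroring the sub_w/sub_h tests above, returning which blocks contain only replicated (out-of-frame) samples:

// Returns a bitmask of the 8x8 luma blocks that lie entirely outside the
// visible sub_w x sub_h area of a 16x16 macroblock.
static int OutsideLumaBlocks(int sub_w, int sub_h) {
  int mask = 0;
  if (sub_w <= 8) mask |= (1 << 1) | (1 << 3);  // right column is padding
  if (sub_h <= 8) mask |= (1 << 2) | (1 << 3);  // bottom row is padding
  return mask;
}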
1664
+
1665
+ const uint8_t* Encoder::GetReplicatedSamples(const uint8_t* rgb,
1666
+ int rgb_step,
1667
+ int sub_w, int sub_h,
1668
+ int w, int h) {
1669
+ assert(sub_w > 0 && sub_h > 0);
1670
+ if (sub_w > w) {
1671
+ sub_w = w;
1672
+ }
1673
+ if (sub_h > h) {
1674
+ sub_h = h;
1675
+ }
1676
+ uint8_t* dst = replicated_buffer_;
1677
+ for (int y = 0; y < sub_h; ++y) {
1678
+ memcpy(dst, rgb, 3 * sub_w);
1679
+ const uint8_t* const src0 = &dst[3 * (sub_w - 1)];
1680
+ for (int x = 3 * sub_w; x < 3 * w; x += 3) {
1681
+ memcpy(dst + x, src0, 3);
1682
+ }
1683
+ dst += 3 * w;
1684
+ rgb += rgb_step;
1685
+ }
1686
+ const uint8_t* dst0 = dst - 3 * w;
1687
+ for (int y = sub_h; y < h; ++y) {
1688
+ memcpy(dst, dst0, 3 * w);
1689
+ dst += 3 * w;
1690
+ }
1691
+ return replicated_buffer_;
1692
+ }
1693
+
1694
+ // TODO(skal): merge with above function? Probably slower...
1695
+ const uint8_t* Encoder::GetReplicatedYUVSamples(const uint8_t* in,
1696
+ int step,
1697
+ int sub_w, int sub_h,
1698
+ int w, int h) {
1699
+ assert(sub_w > 0 && sub_h > 0);
1700
+ if (sub_w > w) {
1701
+ sub_w = w;
1702
+ }
1703
+ if (sub_h > h) {
1704
+ sub_h = h;
1705
+ }
1706
+ uint8_t* out = replicated_buffer_;
1707
+ for (int y = 0; y < sub_h; ++y) {
1708
+ int x;
1709
+ for (x = 0; x < sub_w; ++x)
1710
+ out[x] = in[x];
1711
+ for (; x < w; ++x) {
1712
+ out[x] = out[sub_w - 1];
1713
+ }
1714
+ out += w;
1715
+ in += step;
1716
+ }
1717
+ const uint8_t* const out0 = out - w;
1718
+ for (int y = sub_h; y < h; ++y) {
1719
+ memcpy(out, out0, w);
1720
+ out += w;
1721
+ }
1722
+ return replicated_buffer_;
1723
+ }
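Both replication helpers implement the same idea: repeat the last valid column to the right, then repeat the last (now complete) row downward. A self-contained single-plane sketch, assuming the valid src_w x src_h samples have already been copied into the top-left corner of the w x h tile:

#include <string.h>
#include <stdint.h>

static void ReplicateEdges(uint8_t* const tile, int w, int h,
                           int src_w, int src_h) {
  for (int y = 0; y < src_h; ++y) {
    uint8_t* const row = tile + y * w;
    for (int x = src_w; x < w; ++x) row[x] = row[src_w - 1];  // extend right
  }
  for (int y = src_h; y < h; ++y) {
    memcpy(tile + y * w, tile + (src_h - 1) * w, w);          // extend down
  }
}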
1724
+
1725
+ ////////////////////////////////////////////////////////////////////////////////
1726
+ // sub-class for YUV 4:2:0 version
1727
+
1728
+ class Encoder420 : public Encoder {
1729
+ public:
1730
+ Encoder420(int W, int H, int step, const uint8_t* const rgb,
1731
+ ByteSink* const sink)
1732
+ : Encoder(W, H, step, rgb, sink) {}
1733
+ virtual ~Encoder420() {}
1734
+ virtual void InitComponents() {
1735
+ nb_comps_ = 3;
1736
+
1737
+ quant_idx_[0] = 0;
1738
+ quant_idx_[1] = 1;
1739
+ quant_idx_[2] = 1;
1740
+
1741
+ nb_blocks_[0] = 4;
1742
+ nb_blocks_[1] = 1;
1743
+ nb_blocks_[2] = 1;
1744
+ mcu_blocks_ = 6;
1745
+
1746
+ block_w_ = 16;
1747
+ block_h_ = 16;
1748
+ block_dims_[0] = 0x22;
1749
+ block_dims_[1] = 0x11;
1750
+ block_dims_[2] = 0x11;
1751
+ }
1752
+ virtual void GetSamples(int mb_x, int mb_y, bool clipped,
1753
+ int16_t* out_blocks) {
1754
+ const uint8_t* data = rgb_ + (3 * mb_x + mb_y * step_) * 16;
1755
+ int step = step_;
1756
+ if (clipped) {
1757
+ data = GetReplicatedSamples(data, step,
1758
+ W_ - mb_x * 16, H_ - mb_y * 16, 16, 16);
1759
+ step = 3 * 16;
1760
+ }
1761
+ get_yuv_block_(data, step, out_blocks);
1762
+ if (clipped) {
1763
+ AverageExtraLuma(W_ - mb_x * 16, H_ - mb_y * 16, out_blocks);
1764
+ }
1765
+ }
1766
+ };
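With the settings above, one 4:2:0 MCU covers a 16x16 area and holds six 8x8 DCT blocks (four Y, one Cb, one Cr). A small illustrative sketch of the resulting block count for a W x H image, using the same ceiling division as Encode():

static size_t NumBlocks420(int W, int H) {
  const size_t mcu_w = (W + 15) / 16;   // number of MCU columns
  const size_t mcu_h = (H + 15) / 16;   // number of MCU rows
  return mcu_w * mcu_h * 6;             // 4 Y + 1 Cb + 1 Cr blocks per MCU
}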
1767
+
1768
+ ////////////////////////////////////////////////////////////////////////////////
1769
+ // sub-class for YUV 4:4:4 version
1770
+
1771
+ class Encoder444 : public Encoder {
1772
+ public:
1773
+ Encoder444(int W, int H, int step, const uint8_t* const rgb,
1774
+ ByteSink* const sink)
1775
+ : Encoder(W, H, step, rgb, sink) {
1776
+ SetYUVFormat(true);
1777
+ }
1778
+ virtual ~Encoder444() {}
1779
+ virtual void InitComponents() {
1780
+ nb_comps_ = 3;
1781
+
1782
+ quant_idx_[0] = 0;
1783
+ quant_idx_[1] = 1;
1784
+ quant_idx_[2] = 1;
1785
+
1786
+ nb_blocks_[0] = 1;
1787
+ nb_blocks_[1] = 1;
1788
+ nb_blocks_[2] = 1;
1789
+ mcu_blocks_ = 3;
1790
+
1791
+ block_w_ = 8;
1792
+ block_h_ = 8;
1793
+ block_dims_[0] = 0x11;
1794
+ block_dims_[1] = 0x11;
1795
+ block_dims_[2] = 0x11;
1796
+ }
1797
+ virtual void GetSamples(int mb_x, int mb_y, bool clipped, int16_t* out) {
1798
+ const uint8_t* data = rgb_ + (3 * mb_x + mb_y * step_) * 8;
1799
+ int step = step_;
1800
+ if (clipped) {
1801
+ data = GetReplicatedSamples(data, step,
1802
+ W_ - mb_x * 8, H_ - mb_y * 8, 8, 8);
1803
+ step = 3 * 8;
1804
+ }
1805
+ get_yuv_block_(data, step, out);
1806
+ }
1807
+ };
1808
+
1809
+ ////////////////////////////////////////////////////////////////////////////////
1810
+ // sub-class for the sharp YUV 4:2:0 version
1811
+
1812
+ class EncoderSharp420 : public Encoder420 {
1813
+ public:
1814
+ EncoderSharp420(int W, int H, int step, const uint8_t* const rgb,
1815
+ ByteSink* const sink)
1816
+ : Encoder420(W, H, step, rgb, sink), yuv_memory_(nullptr) {
1817
+ const int uv_w = (W + 1) >> 1;
1818
+ const int uv_h = (H + 1) >> 1;
1819
+ yuv_memory_ = Alloc<uint8_t>(W * H + 2 * uv_w * uv_h);
1820
+ if (yuv_memory_ == nullptr) return;
1821
+ y_plane_ = yuv_memory_;
1822
+ y_step_ = W;
1823
+ u_plane_ = yuv_memory_ + W * H;
1824
+ v_plane_ = u_plane_ + uv_w * uv_h;
1825
+ uv_step_ = uv_w;
1826
+ ApplySharpYUVConversion(rgb, W, H, step, y_plane_, u_plane_, v_plane_);
1827
+ }
1828
+ virtual ~EncoderSharp420() { Free(yuv_memory_); }
1829
+ virtual void GetSamples(int mb_x, int mb_y, bool clipped, int16_t* out);
1830
+
1831
+ protected:
1832
+ void GetLumaSamples(int mb_x, int mb_y, bool clipped, int16_t* out) {
1833
+ int step = y_step_;
1834
+ const uint8_t* Y1 = y_plane_ + (mb_x + mb_y * step) * 16;
1835
+ if (clipped) {
1836
+ Y1 = GetReplicatedYUVSamples(Y1, step,
1837
+ W_ - mb_x * 16, H_ - mb_y * 16, 16, 16);
1838
+ step = 16;
1839
+ }
1840
+ const uint8_t* Y2 = Y1 + 8 * step;
1841
+ for (int y = 8, n = 0; y > 0; --y) {
1842
+ for (int x = 0; x < 8; ++x, ++n) {
1843
+ out[n + 0 * 64] = Y1[x] - 128;
1844
+ out[n + 1 * 64] = Y1[x + 8] - 128;
1845
+ out[n + 2 * 64] = Y2[x] - 128;
1846
+ out[n + 3 * 64] = Y2[x + 8] - 128;
1847
+ }
1848
+ Y1 += step;
1849
+ Y2 += step;
1850
+ }
1851
+ if (clipped) {
1852
+ AverageExtraLuma(W_ - mb_x * 16, H_ - mb_y * 16, out);
1853
+ }
1854
+ }
1855
+
1856
+ private:
1857
+ uint8_t* y_plane_;
1858
+ int y_step_;
1859
+ uint8_t* u_plane_;
1860
+ uint8_t* v_plane_;
1861
+ int uv_step_;
1862
+ uint8_t* yuv_memory_;
1863
+ };
1864
+
1865
+ void EncoderSharp420::GetSamples(int mb_x, int mb_y,
1866
+ bool clipped, int16_t* out) {
1867
+ GetLumaSamples(mb_x, mb_y, clipped, out);
1868
+
1869
+ // Chroma
1870
+ const uint8_t* U = u_plane_ + (mb_x + mb_y * uv_step_) * 8;
1871
+ int step = uv_step_;
1872
+ if (clipped) {
1873
+ U = GetReplicatedYUVSamples(U, step,
1874
+ ((W_ + 1) >> 1) - mb_x * 8,
1875
+ ((H_ + 1) >> 1) - mb_y * 8, 8, 8);
1876
+ step = 8;
1877
+ }
1878
+ for (int y = 8, n = 0; y > 0; --y, U += step) {
1879
+ for (int x = 0; x < 8; ++x, ++n) {
1880
+ out[n + 4 * 64] = U[x] - 128;
1881
+ }
1882
+ }
1883
+ const uint8_t* V = v_plane_ + (mb_x + mb_y * uv_step_) * 8;
1884
+ step = uv_step_;
1885
+ if (clipped) {
1886
+ V = GetReplicatedYUVSamples(V, step,
1887
+ ((W_ + 1) >> 1) - mb_x * 8,
1888
+ ((H_ + 1) >> 1) - mb_y * 8, 8, 8);
1889
+ step = 8;
1890
+ }
1891
+ for (int y = 8, n = 0; y > 0; --y, V += step) {
1892
+ for (int x = 0; x < 8; ++x, ++n) {
1893
+ out[n + 5 * 64] = V[x] - 128;
1894
+ }
1895
+ }
1896
+ }
1897
+
1898
+ ////////////////////////////////////////////////////////////////////////////////
1899
+ // all-in-one factory to pick up the right encoder instance
1900
+
1901
+ Encoder* EncoderFactory(const uint8_t* rgb,
1902
+ int W, int H, int stride, SjpegYUVMode yuv_mode,
1903
+ ByteSink* const sink) {
1904
+ if (yuv_mode == SJPEG_YUV_AUTO) {
1905
+ yuv_mode = SjpegRiskiness(rgb, W, H, stride, nullptr);
1906
+ }
1907
+
1908
+ Encoder* enc = nullptr;
1909
+ if (yuv_mode == SJPEG_YUV_420) {
1910
+ enc = new (std::nothrow) Encoder420(W, H, stride, rgb, sink);
1911
+ } else if (yuv_mode == SJPEG_YUV_SHARP) {
1912
+ enc = new (std::nothrow) EncoderSharp420(W, H, stride, rgb, sink);
1913
+ } else {
1914
+ enc = new (std::nothrow) Encoder444(W, H, stride, rgb, sink);
1915
+ }
1916
+ if (enc == nullptr || !enc->Ok()) {
1917
+ delete enc;
1918
+ enc = nullptr;
1919
+ }
1920
+ return enc;
1921
+ }
1922
+
1923
+ } // namespace sjpeg
1924
+
1925
+ ////////////////////////////////////////////////////////////////////////////////
1926
+ // public plain-C functions
1927
+
1928
+ size_t SjpegEncode(const uint8_t* rgb, int width, int height, int stride,
1929
+ uint8_t** out_data, float quality, int method,
1930
+ SjpegYUVMode yuv_mode) {
1931
+ if (rgb == nullptr || out_data == nullptr) return 0;
1932
+ if (width <= 0 || height <= 0 || stride < 3 * width) return 0;
1933
+ *out_data = nullptr; // safety
1934
+
1935
+ MemorySink sink(width * height / 4);
1936
+ Encoder* const enc = EncoderFactory(rgb, width, height, stride, yuv_mode,
1937
+ &sink);
1938
+ if (enc == nullptr) return 0; // EncoderFactory may fail
+ enc->SetQuality(quality);
1939
+ enc->SetCompressionMethod(method);
1940
+ size_t size = 0;
1941
+ *out_data = nullptr;
1942
+ if (enc->Encode()) sink.Release(out_data, &size);
1943
+ delete enc;
1944
+ return size;
1945
+ }
1946
+
1947
+ ////////////////////////////////////////////////////////////////////////////////
1948
+
1949
+ size_t SjpegCompress(const uint8_t* rgb, int width, int height, float quality,
1950
+ uint8_t** out_data) {
1951
+ return SjpegEncode(rgb, width, height, 3 * width, out_data,
1952
+ quality, 4, SJPEG_YUV_AUTO);
1953
+ }
1954
+
1955
+ void SjpegFreeBuffer(const uint8_t* buffer) {
1956
+ delete[] buffer;
1957
+ }
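A hedged usage sketch of the plain-C entry points above; the header name sjpeg.h and the flat test image are assumptions made for illustration only:

#include <stdint.h>
#include <vector>
#include "sjpeg.h"   // assumed public header

size_t CompressExample() {
  const int w = 64, h = 48;
  std::vector<uint8_t> rgb(3 * w * h, 128);  // flat gray RGB image
  uint8_t* jpeg = nullptr;
  const size_t size = SjpegCompress(rgb.data(), w, h, 75.f, &jpeg);
  // ... use jpeg[0 .. size) ...
  SjpegFreeBuffer(jpeg);  // jpeg stays nullptr on failure; delete[] nullptr is a no-op
  return size;
}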
1958
+
1959
+ ////////////////////////////////////////////////////////////////////////////////
1960
+
1961
+ uint32_t SjpegVersion() {
1962
+ return SJPEG_VERSION;
1963
+ }
1964
+
1965
+ ////////////////////////////////////////////////////////////////////////////////
1966
+ // Parametrized call
1967
+
1968
+ EncoderParam::EncoderParam() : search_hook(nullptr), memory(nullptr) {
1969
+ Init(kDefaultQuality);
1970
+ }
1971
+
1972
+ EncoderParam::EncoderParam(float quality_factor)
1973
+ : search_hook(nullptr), memory(nullptr) {
1974
+ Init(quality_factor);
1975
+ }
1976
+
1977
+ void EncoderParam::Init(float quality_factor) {
1978
+ Huffman_compress = true;
1979
+ adaptive_quantization = true;
1980
+ use_trellis = false;
1981
+ yuv_mode = SJPEG_YUV_AUTO;
1982
+ quantization_bias = kDefaultBias;
1983
+ qdelta_max_luma = kDefaultDeltaMaxLuma;
1984
+ qdelta_max_chroma = kDefaultDeltaMaxChroma;
1985
+ adaptive_bias = false;
1986
+ SetLimitQuantization(false);
1987
+ min_quant_tolerance_ = 0;
1988
+ SetQuality(quality_factor);
1989
+ target_mode = TARGET_NONE;
1990
+ target_value = 0;
1991
+ passes = 1;
1992
+ tolerance = 1.;
1993
+ qmin = 0.;
1994
+ qmax = 100.;
1995
+ }
1996
+
1997
+ void EncoderParam::SetQuality(float quality_factor) {
1998
+ const float q = GetQFactor(quality_factor);
1999
+ sjpeg::SetQuantMatrix(kDefaultMatrices[0], q, quant_[0]);
2000
+ sjpeg::SetQuantMatrix(kDefaultMatrices[1], q, quant_[1]);
2001
+ }
2002
+
2003
+ void EncoderParam::SetQuantization(const uint8_t m[2][64],
2004
+ float reduction) {
2005
+ if (reduction <= 1.f) reduction = 1.f;
2006
+ if (m == nullptr) return;
2007
+ for (int c = 0; c < 2; ++c) {
2008
+ for (size_t i = 0; i < 64; ++i) {
2009
+ const int v = static_cast<int>(m[c][i] * 100. / reduction + .5);
2010
+ quant_[c][i] = (v > 255) ? 255u : (v < 1) ? 1u : v;
2011
+ }
2012
+ }
2013
+ }
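A quick numeric check of the scaling above, where reduction acts as a percentage (100 leaves the matrix unchanged):

// Example: an entry of 16 with reduction = 200 gives
//   static_cast<int>(16 * 100. / 200. + .5) = static_cast<int>(8.5) = 8,
// i.e. the quantizer is halved; results are clamped to the range [1, 255].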
2014
+
2015
+ void EncoderParam::SetLimitQuantization(bool limit_quantization,
2016
+ int min_quant_tolerance) {
2017
+ use_min_quant_ = limit_quantization;
2018
+ if (limit_quantization) SetMinQuantization(quant_, min_quant_tolerance);
2019
+ }
2020
+
2021
+ void EncoderParam::SetMinQuantization(const uint8_t m[2][64],
2022
+ int min_quant_tolerance) {
2023
+ use_min_quant_ = true;
2024
+ CopyQuantMatrix(m[0], min_quant_[0]);
2025
+ CopyQuantMatrix(m[1], min_quant_[1]);
2026
+ min_quant_tolerance_ = (min_quant_tolerance < 0) ? 0
2027
+ : (min_quant_tolerance > 100) ? 100
2028
+ : min_quant_tolerance;
2029
+ }
2030
+
2031
+ void EncoderParam::ResetMetadata() {
2032
+ iccp.clear();
2033
+ exif.clear();
2034
+ xmp.clear();
2035
+ app_markers.clear();
2036
+ }
2037
+
2038
+ bool Encoder::InitFromParam(const EncoderParam& param) {
2039
+ SetQuantMatrices(param.quant_);
2040
+ if (param.use_min_quant_) {
2041
+ SetMinQuantMatrices(param.min_quant_, param.min_quant_tolerance_);
2042
+ } else {
2043
+ SetDefaultMinQuantMatrices();
2044
+ }
2045
+
2046
+ int method = param.Huffman_compress ? 1 : 0;
2047
+ if (param.adaptive_quantization) method += 3;
2048
+ if (param.use_trellis) {
2049
+ method = (method == 4) ? 7 : (method == 6) ? 8 : method;
2050
+ }
2051
+
2052
+ SetCompressionMethod(method);
2053
+ SetQuantizationBias(param.quantization_bias, param.adaptive_bias);
2054
+ SetQuantizationDeltas(param.qdelta_max_luma, param.qdelta_max_chroma);
2055
+
2056
+ SetMetadata(param.iccp, Encoder::ICC);
2057
+ SetMetadata(param.exif, Encoder::EXIF);
2058
+ SetMetadata(param.xmp, Encoder::XMP);
2059
+ SetMetadata(param.app_markers, Encoder::MARKERS);
2060
+
2061
+ passes_ = (param.passes < 1) ? 1 : (param.passes > 20) ? 20 : param.passes;
2062
+ if (passes_ > 1) {
2063
+ use_extra_memory_ = true;
2064
+ reuse_run_levels_ = true;
2065
+ search_hook_ = (param.search_hook == nullptr) ? &default_hook_
2066
+ : param.search_hook;
2067
+ if (!search_hook_->Setup(param)) return false;
2068
+ }
2069
+
2070
+ memory_hook_ = (param.memory == nullptr) ? &kDefaultMemory : param.memory;
2071
+ return true;
2072
+ }
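Read directly off the code above, the reachable method values map to the option flags as follows (the method == 6 branch cannot be produced by these three flags):

//  Huffman_compress  adaptive_quantization  use_trellis   resulting method
//        false              false              any               0
//        true               false              any               1
//        false              true               any               3
//        true               true               false             4
//        true               true               true              7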
2073
+
2074
+ bool sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
2075
+ const EncoderParam& param, ByteSink* sink) {
2076
+ if (rgb == nullptr || sink == nullptr) return false;
2077
+ if (width <= 0 || height <= 0 || stride < 3 * width) return false;
2078
+
2079
+ Encoder* const enc = EncoderFactory(rgb, width, height, stride,
2080
+ param.yuv_mode, sink);
2081
+ const bool ok = (enc != nullptr) &&
2082
+ enc->InitFromParam(param) &&
2083
+ enc->Encode();
2084
+ delete enc;
2085
+ return ok;
2086
+ }
2087
+
2088
+ size_t sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
2089
+ const EncoderParam& param, uint8_t** out_data) {
2090
+ MemorySink sink(width * height / 4); // estimation of output size
2091
+ if (!sjpeg::Encode(rgb, width, height, stride, param, &sink)) return 0;
2092
+ size_t size;
2093
+ sink.Release(out_data, &size);
2094
+ return size;
2095
+ }
2096
+
2097
+ ////////////////////////////////////////////////////////////////////////////////
2098
+ // std::string variants
2099
+
2100
+ bool sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
2101
+ const EncoderParam& param, std::string* output) {
2102
+ if (output == nullptr) return false;
2103
+ output->clear();
2104
+ output->reserve(width * height / 4);
2105
+ StringSink sink(output);
2106
+ return Encode(rgb, width, height, stride, param, &sink);
2107
+ }
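A hedged usage sketch of the C++ API above; EncoderParam is assumed to live in namespace sjpeg (consistent with the using-directive at the top of this file), and the header name and test image are illustrative:

#include <stdint.h>
#include <string>
#include <vector>
#include "sjpeg.h"   // assumed public header

bool EncodeToString(std::string* const out) {
  const int w = 128, h = 96;
  std::vector<uint8_t> rgb(3 * w * h, 200);    // flat test image
  sjpeg::EncoderParam param(80.f);             // quality factor
  param.use_trellis = true;                    // turn on trellis quantization
  return sjpeg::Encode(rgb.data(), w, h, 3 * w, param, out);
}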
2108
+
2109
+ bool SjpegCompress(const uint8_t* rgb, int width, int height,
2110
+ float quality, std::string* output) {
2111
+ EncoderParam param;
2112
+ param.SetQuality(quality);
2113
+ return Encode(rgb, width, height, 3 * width, param, output);
2114
+ }
2115
+
2116
+ ////////////////////////////////////////////////////////////////////////////////
2117
+
2118
+ bool SjpegDimensions(const std::string& jpeg_data,
2119
+ int* width, int* height, int* is_yuv420) {
2120
+ return SjpegDimensions(
2121
+ reinterpret_cast<const uint8_t*>(jpeg_data.data()),
2122
+ jpeg_data.size(), width, height, is_yuv420);
2123
+ }
2124
+
2125
+ int SjpegFindQuantizer(const std::string& jpeg_data,
2126
+ uint8_t quant[2][64]) {
2127
+ return SjpegFindQuantizer(
2128
+ reinterpret_cast<const uint8_t*>(jpeg_data.data()), jpeg_data.size(),
2129
+ quant);
2130
+ }
2131
+
2132
+ ////////////////////////////////////////////////////////////////////////////////