sjpeg 0.1.0

data/ext/sjpeg/enc.cc ADDED
@@ -0,0 +1,2132 @@
1
+ // Copyright 2017 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Fast and simple JPEG encoder
16
+ //
17
+ // Author: Skal (pascal.massimino@gmail.com)
18
+
19
+ #include <stdlib.h>
20
+ #include <math.h>
21
+ #include <float.h> // for FLT_MAX
22
+ #include <stdint.h>
23
+
24
+ #define SJPEG_NEED_ASM_HEADERS
25
+ #include "sjpegi.h"
26
+
27
+ using namespace sjpeg;
28
+
29
+ // Some general default values:
30
+ static const float kDefaultQuality = 75.f;
31
+ static const int kDefaultMethod = 4;
32
+ // Rounding bias for AC coefficients, as 8bit fixed point.
33
+ // A default value 0x78 leans toward filesize reduction.
34
+ static const int32_t kDefaultBias = 0x78;
35
+ // for adaptive quantization:
36
+ static const int kDefaultDeltaMaxLuma = 12;
37
+ static const int kDefaultDeltaMaxChroma = 1;
38
+
39
+ // finer tuning of perceptual optimizations:
40
+
41
+ // Minimum average number of entries per bin required for performing histogram-
42
+ // based optimization. Below this limit, the channel's histogram is declared
43
+ // under-populated and the corresponding optimization skipped.
44
+ static double kDensityThreshold = 0.5;
45
+ // Rejection limit on the correlation factor when extrapolating the distortion
46
+ // from histograms. If the least-square fit has a squared correlation factor
47
+ // less than this threshold, the corresponding quantization scale will be
48
+ // kept unchanged.
49
+ static double kCorrelationThreshold = 0.5;
50
+ // Bit-map of channels to omit during quantization matrix optimization.
51
+ // If the bit 'i + 8 * j' is set in this bit field, the matrix entry at
52
+ // position (i,j) will be kept unchanged during optimization.
53
+ // The default value is 0x103 = 1 + 2 + 256: the 3 entries in the top-left
54
+ // corner (with the lowest frequencies) are not optimized, since doing so can lead to
55
+ // visual degradation of smooth gradients.
56
+ static const uint64_t kOmittedChannels = 0x0000000000000103ULL;
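// (Worked reading of the bit-map above, for clarity: 0x103 sets bits {0, 1, 8},
// i.e. the matrix entries (0,0), (1,0) and (0,1) -- the DC term and its two
// neighbouring lowest-frequency AC terms are left untouched by the optimizer.)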
57
+
58
+ ////////////////////////////////////////////////////////////////////////////////
59
+
60
+ namespace sjpeg {
61
+
62
+ const uint8_t kZigzag[64] = {
63
+ 0, 1, 8, 16, 9, 2, 3, 10,
64
+ 17, 24, 32, 25, 18, 11, 4, 5,
65
+ 12, 19, 26, 33, 40, 48, 41, 34,
66
+ 27, 20, 13, 6, 7, 14, 21, 28,
67
+ 35, 42, 49, 56, 57, 50, 43, 36,
68
+ 29, 22, 15, 23, 30, 37, 44, 51,
69
+ 58, 59, 52, 45, 38, 31, 39, 46,
70
+ 53, 60, 61, 54, 47, 55, 62, 63,
71
+ };
72
+
73
+ const uint8_t kDefaultMatrices[2][64] = {
74
+ // these are the default luma/chroma matrices (JPEG spec section K.1)
75
+ { 16, 11, 10, 16, 24, 40, 51, 61,
76
+ 12, 12, 14, 19, 26, 58, 60, 55,
77
+ 14, 13, 16, 24, 40, 57, 69, 56,
78
+ 14, 17, 22, 29, 51, 87, 80, 62,
79
+ 18, 22, 37, 56, 68, 109, 103, 77,
80
+ 24, 35, 55, 64, 81, 104, 113, 92,
81
+ 49, 64, 78, 87, 103, 121, 120, 101,
82
+ 72, 92, 95, 98, 112, 100, 103, 99 },
83
+ { 17, 18, 24, 47, 99, 99, 99, 99,
84
+ 18, 21, 26, 66, 99, 99, 99, 99,
85
+ 24, 26, 56, 99, 99, 99, 99, 99,
86
+ 47, 66, 99, 99, 99, 99, 99, 99,
87
+ 99, 99, 99, 99, 99, 99, 99, 99,
88
+ 99, 99, 99, 99, 99, 99, 99, 99,
89
+ 99, 99, 99, 99, 99, 99, 99, 99,
90
+ 99, 99, 99, 99, 99, 99, 99, 99 }
91
+ };
92
+
93
+ float GetQFactor(float q) {
94
+ // we use the same mapping as jpeg-6b, for consistency
95
+ q = (q <= 0) ? 5000 : (q < 50) ? 5000 / q : (q < 100) ? 2 * (100 - q) : 0;
96
+ // We floor-round to integer here just to preserve compatibility with jpeg6b.
97
+ return floorf(q);
98
+ }
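// Sample values of the mapping above (illustrative): q=10 -> 500, q=25 -> 200,
// q=50 -> 100, q=75 -> 50, q=90 -> 20, q=100 -> 0. The returned factor is later
// divided by 100 in SetQuantMatrix(), so e.g. q=75 scales the matrices by 0.5.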
99
+
100
+ void CopyQuantMatrix(const uint8_t in[64], uint8_t out[64]) {
101
+ memcpy(out, in, 64 * sizeof(out[0]));
102
+ }
103
+
104
+ void SetQuantMatrix(const uint8_t in[64], float q_factor, uint8_t out[64]) {
105
+ if (in == nullptr || out == nullptr) return;
106
+ q_factor /= 100.f;
107
+ for (size_t i = 0; i < 64; ++i) {
108
+ const int v = static_cast<int>(in[i] * q_factor + .5f);
109
+ // clamp to prevent illegal quantizer values
110
+ out[i] = (v < 1) ? 1 : (v > 255) ? 255u : v;
111
+ }
112
+ }
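// Minimal usage sketch of the two helpers above (a hypothetical example, not
// part of the sjpeg API): derive a clamped luma quantization matrix for
// quality 75 from the default JPEG matrix.
static void ExampleBuildLumaMatrix(uint8_t luma_quant[64]) {
  const float q_factor = GetQFactor(75.f);                    // maps 75 -> 50.f
  SetQuantMatrix(kDefaultMatrices[0], q_factor, luma_quant);  // scale by 0.5
  // e.g. luma_quant[0] == static_cast<uint8_t>(16 * 0.5f + .5f) == 8
}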
113
+
114
+ void SetMinQuantMatrix(const uint8_t m[64], uint8_t out[64], int tolerance) {
115
+ assert(out != nullptr && m != nullptr);
116
+ for (size_t i = 0; i < 64; ++i) {
117
+ const int v = static_cast<int>(m[i] * (256 - tolerance) >> 8);
118
+ out[i] = (v < 1) ? 1u : (v > 255) ? 255u : v;
119
+ }
120
+ }
121
+
122
+ void SetDefaultMinQuantMatrix(uint8_t out[64]) {
123
+ assert(out != nullptr);
124
+ for (size_t i = 0; i < 64; ++i) out[i] = 1u;
125
+ }
126
+
127
+ ////////////////////////////////////////////////////////////////////////////////
128
+ // Default memory manager (singleton)
129
+
130
+ static struct DefaultMemory : public MemoryManager {
131
+ public:
132
+ virtual ~DefaultMemory() {}
133
+ virtual void* Alloc(size_t size) { return malloc(size); }
134
+ virtual void Free(void* const ptr) { free(ptr); }
135
+ } kDefaultMemory;
136
+
137
+ ////////////////////////////////////////////////////////////////////////////////
138
+ // Encoder main class
139
+
140
+ Encoder::Encoder(int W, int H, int step, const uint8_t* const rgb,
141
+ ByteSink* const sink)
142
+ : W_(W), H_(H), step_(step),
143
+ rgb_(rgb),
144
+ ok_(true),
145
+ bw_(sink),
146
+ in_blocks_base_(nullptr),
147
+ in_blocks_(nullptr),
148
+ have_coeffs_(false),
149
+ all_run_levels_(nullptr),
150
+ nb_run_levels_(0),
151
+ max_run_levels_(0),
152
+ qdelta_max_luma_(kDefaultDeltaMaxLuma),
153
+ qdelta_max_chroma_(kDefaultDeltaMaxChroma),
154
+ passes_(1),
155
+ search_hook_(nullptr),
156
+ memory_hook_(&kDefaultMemory) {
157
+ SetCompressionMethod(kDefaultMethod);
158
+ SetQuality(kDefaultQuality);
159
+ SetYUVFormat(false);
160
+ SetQuantizationBias(kDefaultBias, false);
161
+ SetDefaultMinQuantMatrices();
162
+ InitializeStaticPointers();
163
+ memset(dc_codes_, 0, sizeof(dc_codes_)); // safety
164
+ memset(ac_codes_, 0, sizeof(ac_codes_));
165
+ }
166
+
167
+ Encoder::~Encoder() {
168
+ Free(all_run_levels_);
169
+ DesallocateBlocks(); // clean up leftovers in case we had an error
170
+ }
171
+
172
+ ////////////////////////////////////////////////////////////////////////////////
173
+
174
+ void Encoder::SetQuality(float q) {
175
+ q = GetQFactor(q);
176
+ SetQuantMatrix(kDefaultMatrices[0], q, quants_[0].quant_);
177
+ SetQuantMatrix(kDefaultMatrices[1], q, quants_[1].quant_);
178
+ }
179
+
180
+ void Encoder::SetQuantMatrices(const uint8_t m[2][64]) {
181
+ SetQuantMatrix(m[0], 100, quants_[0].quant_);
182
+ SetQuantMatrix(m[1], 100, quants_[1].quant_);
183
+ }
184
+
185
+ void Encoder::SetMinQuantMatrices(const uint8_t m[2][64], int tolerance) {
186
+ SetMinQuantMatrix(m[0], quants_[0].min_quant_, tolerance);
187
+ SetMinQuantMatrix(m[1], quants_[1].min_quant_, tolerance);
188
+ }
189
+
190
+ void Encoder::SetDefaultMinQuantMatrices() {
191
+ SetDefaultMinQuantMatrix(quants_[0].min_quant_);
192
+ SetDefaultMinQuantMatrix(quants_[1].min_quant_);
193
+ }
194
+
195
+ void Encoder::SetCompressionMethod(int method) {
196
+ assert(method >= 0 && method <= 8);
197
+ use_adaptive_quant_ = (method >= 3);
198
+ optimize_size_ = (method != 0) && (method != 3);
199
+ use_extra_memory_ = (method == 3) || (method == 4) || (method == 7);
200
+ reuse_run_levels_ = (method == 1) || (method == 4) || (method == 5)
201
+ || (method >= 7);
202
+ use_trellis_ = (method >= 7);
203
+ }
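// Summary of the flag combinations above, per method (derived from the code):
//   0: no optimization (fastest)
//   1: size optimization + reuse of run/levels
//   2: size optimization
//   3: adaptive quantization + extra memory
//   4: adaptive quantization + size optimization + extra memory + reuse
//   5: adaptive quantization + size optimization + reuse
//   6: adaptive quantization + size optimization
//   7: method 4 + trellis-based quantization
//   8: method 5 + trellis-based quantization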
204
+
205
+ void Encoder::SetMetadata(const std::string& data, MetadataType type) {
206
+ switch (type) {
207
+ case ICC: iccp_ = data; break;
208
+ case EXIF: exif_ = data; break;
209
+ case XMP: xmp_ = data; break;
210
+ default:
211
+ case MARKERS: app_markers_ = data; break;
212
+ }
213
+ }
214
+
215
+ void Encoder::SetQuantizationBias(int bias, bool use_adaptive) {
216
+ assert(bias >= 0 && bias <= 255);
217
+ q_bias_ = bias;
218
+ adaptive_bias_ = use_adaptive;
219
+ }
220
+
221
+ void Encoder::SetQuantizationDeltas(int qdelta_luma, int qdelta_chroma) {
222
+ assert(qdelta_luma >= 0 && qdelta_luma <= 255);
223
+ assert(qdelta_chroma >= 0 && qdelta_chroma <= 255);
224
+ qdelta_max_luma_ = qdelta_luma;
225
+ qdelta_max_chroma_ = qdelta_chroma;
226
+ }
227
+
228
+ ////////////////////////////////////////////////////////////////////////////////
229
+ // CPU support
230
+
231
+ extern bool ForceSlowCImplementation;
232
+ bool ForceSlowCImplementation = false; // undocumented! for tests.
233
+
234
+ bool SupportsSSE2() {
235
+ if (ForceSlowCImplementation) return false;
236
+ #if defined(SJPEG_USE_SSE2)
237
+ return true;
238
+ #endif
239
+ return false;
240
+ }
241
+
242
+ bool SupportsNEON() {
243
+ if (ForceSlowCImplementation) return false;
244
+ #if defined(SJPEG_USE_NEON)
245
+ return true;
246
+ #endif
247
+ return false;
248
+ }
249
+
250
+ ////////////////////////////////////////////////////////////////////////////////
251
+ // static pointers to architecture-dependent implementations
252
+
253
+ Encoder::QuantizeErrorFunc Encoder::quantize_error_ = nullptr;
254
+ Encoder::QuantizeBlockFunc Encoder::quantize_block_ = nullptr;
255
+ void (*Encoder::fDCT_)(int16_t* in, int num_blocks) = nullptr;
256
+ Encoder::StoreHistoFunc Encoder::store_histo_ = nullptr;
257
+ RGBToYUVBlockFunc Encoder::get_yuv444_block_ = nullptr;
258
+
259
+ void Encoder::InitializeStaticPointers() {
260
+ if (fDCT_ == nullptr) {
261
+ store_histo_ = GetStoreHistoFunc();
262
+ quantize_block_ = GetQuantizeBlockFunc();
263
+ quantize_error_ = GetQuantizeErrorFunc();
264
+ fDCT_ = GetFdct();
265
+ get_yuv444_block_ = GetBlockFunc(true);
266
+ }
267
+ }
268
+
269
+ ////////////////////////////////////////////////////////////////////////////////
270
+ // memory and internal buffers management. We grow on demand.
271
+
272
+ bool Encoder::SetError() {
273
+ ok_ = false;
274
+ return false;
275
+ }
276
+
277
+ bool Encoder::CheckBuffers() {
278
+ // maximum macroblock size, worst-case, is 24bits*64*6 coeffs = 1152bytes
279
+ ok_ = ok_ && bw_.Reserve(2048);
280
+ if (!ok_) return false;
281
+
282
+ if (reuse_run_levels_) {
283
+ if (nb_run_levels_ + 6*64 > max_run_levels_) {
284
+ // need to grow storage for run/levels
285
+ const size_t new_size = max_run_levels_ ? max_run_levels_ * 2 : 8192;
286
+ RunLevel* const new_rl = Alloc<RunLevel>(new_size);
287
+ if (new_rl == nullptr) return false;
288
+ if (nb_run_levels_ > 0) {
289
+ memcpy(new_rl, all_run_levels_,
290
+ nb_run_levels_ * sizeof(new_rl[0]));
291
+ }
292
+ Free(all_run_levels_);
293
+ all_run_levels_ = new_rl;
294
+ max_run_levels_ = new_size;
295
+ assert(nb_run_levels_ + 6 * 64 <= max_run_levels_);
296
+ }
297
+ }
298
+ return true;
299
+ }
300
+
301
+ bool Encoder::AllocateBlocks(size_t num_blocks) {
302
+ assert(in_blocks_ == nullptr);
303
+ have_coeffs_ = false;
304
+ const size_t size = num_blocks * 64 * sizeof(*in_blocks_);
305
+ in_blocks_base_ = Alloc<uint8_t>(size + ALIGN_CST);
306
+ if (in_blocks_base_ == nullptr) return false;
307
+ in_blocks_ = reinterpret_cast<int16_t*>(
308
+ (ALIGN_CST + reinterpret_cast<uintptr_t>(in_blocks_base_)) & ~ALIGN_CST);
309
+ return true;
310
+ }
311
+
312
+ void Encoder::DesallocateBlocks() {
313
+ Free(in_blocks_base_);
314
+ in_blocks_base_ = nullptr;
315
+ in_blocks_ = nullptr; // sanity
316
+ }
317
+
318
+ ////////////////////////////////////////////////////////////////////////////////
319
+
320
+ #define FP_BITS 16 // fractional precision for fixed-point divisors
321
+ #define AC_BITS 4 // extra precision bits from fdct's scaling
322
+ #define BIAS_DC 0x80 // neutral bias for DC (mandatory!)
323
+
324
+ // divide-by-multiply helper macros
325
+ #define MAKE_INV_QUANT(Q) (((1u << FP_BITS) + (Q) / 2) / (Q))
326
+ #define DIV_BY_MULT(A, M) (((A) * (M)) >> FP_BITS)
327
+ #define QUANTIZE(A, M, B) (DIV_BY_MULT((A) + (B), (M)) >> AC_BITS)
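// Worked example of the fixed-point scheme above (illustrative): for a
// quantizer Q = 16, MAKE_INV_QUANT(16) = (65536 + 8) / 16 = 4096, so that
// DIV_BY_MULT(A, 4096) = (A * 4096) >> 16 equals A / 16. Since the fDCT output
// carries AC_BITS = 4 extra precision bits, QUANTIZE(A, M, B) shifts the result
// down by 4 more bits, i.e. it computes roughly (A + B) / (16 << AC_BITS).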
328
+
329
+ void Encoder::FinalizeQuantMatrix(Quantizer* const q, int q_bias) {
330
+ // first, clamp the quant matrix:
331
+ for (size_t i = 0; i < 64; ++i) {
332
+ if (q->quant_[i] < q->min_quant_[i]) q->quant_[i] = q->min_quant_[i];
333
+ }
334
+ // Special case! for v=1 we can't represent the multiplier with 16b precision.
335
+ // So, instead we max out the multiplier to 0xffffu, and twist the bias to the
336
+ // value 0x80. The overall precision isn't affected: the results are bit-exact
337
+ // over our working range.
338
+ // Note that quant=1 can start appearing at quality as low as 93.
339
+ const uint16_t bias_1 = 0x80;
340
+ const uint16_t iquant_1 = 0xffffu;
341
+ for (size_t i = 0; i < 64; ++i) {
342
+ const uint16_t v = q->quant_[i];
343
+ const uint16_t iquant = (v == 1) ? iquant_1 : MAKE_INV_QUANT(v);
344
+ const uint16_t bias = (v == 1) ? bias_1 : (i == 0) ? BIAS_DC : q_bias;
345
+ const uint16_t ibias = (((bias * v) << AC_BITS) + 128) >> 8;
346
+ const uint16_t qthresh =
347
+ ((1 << (FP_BITS + AC_BITS)) + iquant - 1) / iquant - ibias;
348
+ q->bias_[i] = ibias;
349
+ q->iquant_[i] = iquant;
350
+ q->qthresh_[i] = qthresh;
351
+ assert(QUANTIZE(qthresh, iquant, ibias) > 0);
352
+ assert(QUANTIZE(qthresh - 1, iquant, ibias) == 0);
353
+ }
354
+ }
355
+
356
+ void Encoder::SetCostCodes(int idx) {
357
+ quants_[idx].codes_ = ac_codes_[idx];
358
+ }
359
+
360
+ ////////////////////////////////////////////////////////////////////////////////
361
+ // standard Huffman tables, as per JPEG standard section K.3.
362
+
363
+ static const uint8_t kDCSyms[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
364
+ static const uint8_t kACSyms[2][162] = {
365
+ { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
366
+ 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
367
+ 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
368
+ 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
369
+ 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
370
+ 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
371
+ 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
372
+ 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
373
+ 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
374
+ 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
375
+ 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
376
+ 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
377
+ 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
378
+ 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
379
+ 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
380
+ 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
381
+ 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
382
+ 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
383
+ 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
384
+ 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
385
+ 0xf9, 0xfa },
386
+ { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
387
+ 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
388
+ 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
389
+ 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
390
+ 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
391
+ 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
392
+ 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
393
+ 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
394
+ 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
395
+ 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
396
+ 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
397
+ 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
398
+ 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
399
+ 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
400
+ 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
401
+ 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
402
+ 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
403
+ 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
404
+ 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
405
+ 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
406
+ 0xf9, 0xfa }
407
+ };
408
+
409
+ static const HuffmanTable kHuffmanTables[4] = {
410
+ { { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 }, kDCSyms, 12 },
411
+ { { 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }, kDCSyms, 12 },
412
+ { { 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 125 }, kACSyms[0], 162 },
413
+ { { 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 119 }, kACSyms[1], 162 }
414
+ };
415
+
416
+ ////////////////////////////////////////////////////////////////////////////////
417
+ // This function generates a map from symbols to code + len stored in a packed
418
+ // way (lower 16 bits hold the length, upper 16 bits hold the VLC).
419
+ // The input is a JPEG-like description of the symbols:
420
+ // - bits[i] stores the number of codes having length i + 1.
421
+ // - symbols[] contains the symbols' map, in increasing bit-length order.
422
+ // No check is performed on the validity of symbols[]'s content.
423
+ // The values of tab[] not referring to an actual symbol will remain unchanged.
424
+ // Returns the number of symbols used (that is: sum{bits[i]})
425
+
426
+ static int BuildHuffmanTable(const uint8_t bits[16], const uint8_t* symbols,
427
+ uint32_t* const tab) {
428
+ uint32_t code = 0;
429
+ int nb = 0;
430
+ for (int nb_bits = 1; nb_bits <= 16; ++nb_bits, code <<= 1) {
431
+ int n = bits[nb_bits - 1]; // number of codes of length nb_bits
432
+ nb += n;
433
+ while (n-- > 0) {
434
+ const int symbol = *symbols++;
435
+ tab[symbol] = (code << 16) | nb_bits;
436
+ ++code;
437
+ }
438
+ }
439
+ return nb;
440
+ }
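// Minimal sketch of how the packed entries built above are consumed
// (hypothetical helper; the encoder itself passes them to PutPackedCode()):
static inline void UnpackHuffmanCode(uint32_t packed,
                                     uint32_t* const vlc, int* const nbits) {
  *nbits = static_cast<int>(packed & 0xffff);  // lower 16 bits: code length
  *vlc = packed >> 16;                         // upper 16 bits: the VLC itself
}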
441
+
442
+ ////////////////////////////////////////////////////////////////////////////////
443
+
444
+ void Encoder::InitCodes(bool only_ac) {
445
+ const int nb_tables = (nb_comps_ == 1 ? 1 : 2);
446
+ for (int c = 0; c < nb_tables; ++c) { // luma, chroma
447
+ for (int type = (only_ac ? 1 : 0); type <= 1; ++type) {
448
+ const HuffmanTable* const h = Huffman_tables_[type * 2 + c];
449
+ const int nb_syms = BuildHuffmanTable(h->bits_, h->syms_,
450
+ type == 1 ? ac_codes_[c]
451
+ : dc_codes_[c]);
452
+ assert(nb_syms == h->nb_syms_);
453
+ (void)nb_syms;
454
+ }
455
+ }
456
+ }
457
+
458
+ ////////////////////////////////////////////////////////////////////////////////
459
+ // Quantize coefficients and pseudo-code coefficients
460
+
461
+ static int CalcLog2(int v) {
462
+ #if defined(__GNUC__) && \
463
+ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
464
+ return 32 - __builtin_clz(v);
465
+ #else
466
+ const int kLog2[16] = {
467
+ 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
468
+ assert(v > 0 && v < (1 << 12));
469
+ return (v & ~0xff) ? 8 + kLog2[v >> 8] :
470
+ (v & ~0x0f) ? 4 + kLog2[v >> 4] :
471
+ 0 + kLog2[v];
472
+ #endif
473
+ }
474
+
475
+ uint16_t Encoder::GenerateDCDiffCode(int DC, int* const DC_predictor) {
476
+ const int diff = DC - *DC_predictor;
477
+ *DC_predictor = DC;
478
+ if (diff == 0) {
479
+ return 0;
480
+ }
481
+ int suff, n;
482
+ if (diff < 0) {
483
+ n = CalcLog2(-diff);
484
+ suff = (diff - 1) & ((1 << n) - 1);
485
+ } else {
486
+ n = CalcLog2(diff);
487
+ suff = diff;
488
+ }
489
+ assert((suff & 0xf000) == 0);
490
+ assert(n < 12);
491
+ return n | (suff << 4);
492
+ }
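// Worked example of the packing above (illustrative): for diff = +3, n = 2 and
// suff = 3, so the function returns 2 | (3 << 4) = 0x32. For diff = -3, the
// suffix is (-3 - 1) & 3 = 0, giving 2 | (0 << 4) = 0x02 -- the usual JPEG
// convention of coding a negative value as (value - 1) truncated to n bits.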
493
+
494
+ ////////////////////////////////////////////////////////////////////////////////
495
+ // various implementation of histogram collection
496
+
497
+ #if defined(SJPEG_USE_SSE2)
498
+ // Load eight 16b-words from *src.
499
+ #define LOAD_16(src) _mm_loadu_si128(reinterpret_cast<const __m128i*>(src))
500
+ // Store eight 16b-words into *dst
501
+ #define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))
502
+
503
+ static int QuantizeBlockSSE2(const int16_t in[64], int idx,
504
+ const Quantizer* const Q,
505
+ DCTCoeffs* const out, RunLevel* const rl) {
506
+ const uint16_t* const bias = Q->bias_;
507
+ const uint16_t* const iquant = Q->iquant_;
508
+ int prev = 1;
509
+ int nb = 0;
510
+ int16_t tmp[64], masked[64];
511
+ for (int i = 0; i < 64; i += 8) {
512
+ const __m128i m_bias = LOAD_16(bias + i);
513
+ const __m128i m_mult = LOAD_16(iquant + i);
514
+ const __m128i A = LOAD_16(in + i); // A = in[i]
515
+ const __m128i B = _mm_srai_epi16(A, 15); // sign extract
516
+ const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B); // abs(A)
517
+ const __m128i D = _mm_adds_epi16(C, m_bias); // v' = v + bias
518
+ const __m128i E = _mm_mulhi_epu16(D, m_mult); // (v' * iq) >> 16
519
+ const __m128i F = _mm_srli_epi16(E, AC_BITS); // = QUANTIZE(...)
520
+ const __m128i G = _mm_xor_si128(F, B); // v ^ mask
521
+ STORE_16(F, tmp + i);
522
+ STORE_16(G, masked + i);
523
+ }
524
+ for (int i = 1; i < 64; ++i) {
525
+ const int j = kZigzag[i];
526
+ const int v = tmp[j];
527
+ if (v > 0) {
528
+ const int n = CalcLog2(v);
529
+ const uint16_t code = masked[j] & ((1 << n) - 1);
530
+ rl[nb].level_ = (code << 4) | n;
531
+ rl[nb].run_ = i - prev;
532
+ prev = i + 1;
533
+ ++nb;
534
+ }
535
+ }
536
+ const int dc = (in[0] < 0) ? -tmp[0] : tmp[0];
537
+ out->idx_ = idx;
538
+ out->last_ = prev - 1;
539
+ out->nb_coeffs_ = nb;
540
+ return dc;
541
+ }
542
+ #undef LOAD_16
543
+ #undef STORE_16
544
+
545
+ #elif defined(SJPEG_USE_NEON)
546
+ static int QuantizeBlockNEON(const int16_t in[64], int idx,
547
+ const Quantizer* const Q,
548
+ DCTCoeffs* const out, RunLevel* const rl) {
549
+ const uint16_t* const bias = Q->bias_;
550
+ const uint16_t* const iquant = Q->iquant_;
551
+ int prev = 1;
552
+ int nb = 0;
553
+ uint16_t tmp[64], masked[64];
554
+ for (int i = 0; i < 64; i += 8) {
555
+ const uint16x8_t m_bias = vld1q_u16(bias + i);
556
+ const uint16x8_t m_mult = vld1q_u16(iquant + i);
557
+ const int16x8_t A = vld1q_s16(in + i); // in[i]
558
+ const uint16x8_t B = vreinterpretq_u16_s16(vabsq_s16(A)); // abs(in)
559
+ const int16x8_t sign = vshrq_n_s16(A, 15); // sign
560
+ const uint16x8_t C = vaddq_u16(B, m_bias); // + bias
561
+ const uint32x4_t D0 = vmull_u16(vget_low_u16(C), vget_low_u16(m_mult));
562
+ const uint32x4_t D1 = vmull_u16(vget_high_u16(C), vget_high_u16(m_mult));
563
+ // collect hi-words of the 32b mult result using 'unzip'
564
+ const uint16x8x2_t E = vuzpq_u16(vreinterpretq_u16_u32(D0),
565
+ vreinterpretq_u16_u32(D1));
566
+ const uint16x8_t F = vshrq_n_u16(E.val[1], AC_BITS);
567
+ const uint16x8_t G = veorq_u16(F, vreinterpretq_u16_s16(sign)); // v ^ mask
568
+ vst1q_u16(tmp + i, F);
569
+ vst1q_u16(masked + i, G);
570
+ }
571
+ for (int i = 1; i < 64; ++i) {
572
+ const int j = kZigzag[i];
573
+ const int v = tmp[j];
574
+ if (v > 0) {
575
+ const int n = CalcLog2(v);
576
+ const uint16_t code = masked[j] & ((1 << n) - 1);
577
+ rl[nb].level_ = (code << 4) | n;
578
+ rl[nb].run_ = i - prev;
579
+ prev = i + 1;
580
+ ++nb;
581
+ }
582
+ }
583
+ const int dc = (in[0] < 0) ? -tmp[0] : tmp[0];
584
+ out->idx_ = idx;
585
+ out->last_ = prev - 1;
586
+ out->nb_coeffs_ = nb;
587
+ return dc;
588
+ }
589
+ #endif // SJPEG_USE_NEON
590
+
591
+ static int QuantizeBlock(const int16_t in[64], int idx,
592
+ const Quantizer* const Q,
593
+ DCTCoeffs* const out, RunLevel* const rl) {
594
+ const uint16_t* const bias = Q->bias_;
595
+ const uint16_t* const iquant = Q->iquant_;
596
+ int prev = 1;
597
+ int nb = 0;
598
+ // This function is speed-critical, so we're using some bit mask
599
+ // to extract absolute values, instead of sign tests.
600
+ const uint16_t* const qthresh = Q->qthresh_;
601
+ for (int i = 1; i < 64; ++i) {
602
+ const int j = kZigzag[i];
603
+ int v = in[j];
604
+ const int32_t mask = v >> 31;
605
+ v = (v ^ mask) - mask;
606
+ if (v >= qthresh[j]) {
607
+ v = QUANTIZE(v, iquant[j], bias[j]);
608
+ assert(v > 0);
609
+ const int n = CalcLog2(v);
610
+ const uint16_t code = (v ^ mask) & ((1 << n) - 1);
611
+ rl[nb].level_ = (code << 4) | n;
612
+ rl[nb].run_ = i - prev;
613
+ prev = i + 1;
614
+ ++nb;
615
+ }
616
+ }
617
+ const int dc = (in[0] < 0) ? -QUANTIZE(-in[0], iquant[0], bias[0])
618
+ : QUANTIZE(in[0], iquant[0], bias[0]);
619
+ out->idx_ = idx;
620
+ out->last_ = prev - 1;
621
+ out->nb_coeffs_ = nb;
622
+ return dc;
623
+ }
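// Worked run/level example for the function above (illustrative): if the only
// non-zero quantized AC coefficients, in zigzag order, are +5 at position 1 and
// -1 at position 4, then rl[0] = {run_ = 0, level_ = (5 << 4) | 3} and
// rl[1] = {run_ = 2, level_ = (0 << 4) | 1}, with out->last_ = 4 and
// out->nb_coeffs_ = 2 (an EOB will be emitted later, since last_ < 63).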
624
+
625
+ ////////////////////////////////////////////////////////////////////////////////
626
+ // Trellis-based quantization
627
+
628
+ typedef uint32_t score_t;
629
+ static const score_t kMaxScore = 0xffffffffu;
630
+
631
+ struct TrellisNode {
632
+ uint32_t code;
633
+ int nbits;
634
+ score_t score;
635
+ uint32_t disto;
636
+ uint32_t bits;
637
+ uint32_t run;
638
+ const TrellisNode* best_prev;
639
+ int pos;
640
+ int rank;
641
+
642
+ TrellisNode() : score(kMaxScore), best_prev(nullptr) {}
643
+ void InitSink() {
644
+ score = 0u;
645
+ disto = 0;
646
+ pos = 0;
647
+ rank = 0;
648
+ nbits = 0;
649
+ bits = 0;
650
+ }
651
+ };
652
+
653
+ static bool SearchBestPrev(const TrellisNode* const nodes0, TrellisNode* node,
654
+ const uint32_t disto0[], const uint32_t codes[],
655
+ uint32_t lambda) {
656
+ bool found = false;
657
+ assert(codes[0xf0] != 0);
658
+ const uint32_t base_disto = node->disto + disto0[node->pos - 1];
659
+ for (const TrellisNode* cur = node - 1; cur >= nodes0; --cur) {
660
+ const int run = node->pos - 1 - cur->pos;
661
+ if (run < 0) continue;
662
+ uint32_t bits = node->nbits;
663
+ bits += (run >> 4) * (codes[0xf0] & 0xff);
664
+ const uint32_t sym = ((run & 15) << 4) | node->nbits;
665
+ assert(codes[sym] != 0);
666
+ bits += codes[sym] & 0xff;
667
+ const uint32_t disto = base_disto - disto0[cur->pos];
668
+ const score_t score = disto + lambda * bits + cur->score;
669
+ if (score < node->score) {
670
+ node->score = score;
671
+ node->disto = disto;
672
+ node->bits = bits;
673
+ node->best_prev = cur;
674
+ node->rank = cur->rank + 1;
675
+ node->run = run;
676
+ found = true;
677
+ }
678
+ }
679
+ return found;
680
+ }
681
+
682
+ // number of alternate levels to investigate
683
+ #define NUM_TRELLIS_NODES 2
684
+
685
+ int Encoder::TrellisQuantizeBlock(const int16_t in[64], int idx,
686
+ const Quantizer* const Q,
687
+ DCTCoeffs* const out,
688
+ RunLevel* const rl) {
689
+ const uint16_t* const bias = Q->bias_;
690
+ const uint16_t* const iquant = Q->iquant_;
691
+ TrellisNode nodes[1 + NUM_TRELLIS_NODES * 63]; // 1 sink + n channels
692
+ nodes[0].InitSink();
693
+ const uint32_t* const codes = Q->codes_;
694
+ TrellisNode* cur_node = &nodes[1];
695
+ uint32_t disto0[64]; // disto0[i] = sum of distortions up to i (inclusive)
696
+ disto0[0] = 0;
697
+ for (int i = 1; i < 64; ++i) {
698
+ const int j = kZigzag[i];
699
+ const uint32_t q = Q->quant_[j] << AC_BITS;
700
+ const uint32_t lambda = q * q / 32u;
701
+ int V = in[j];
702
+ const int32_t mask = V >> 31;
703
+ V = (V ^ mask) - mask;
704
+ disto0[i] = V * V + disto0[i - 1];
705
+ int v = QUANTIZE(V, iquant[j], bias[j]);
706
+ if (v == 0) continue;
707
+ int nbits = CalcLog2(v);
708
+ for (int k = 0; k < NUM_TRELLIS_NODES; ++k) {
709
+ const int err = V - v * q;
710
+ cur_node->code = (v ^ mask) & ((1 << nbits) - 1);
711
+ cur_node->pos = i;
712
+ cur_node->disto = err * err;
713
+ cur_node->nbits = nbits;
714
+ cur_node->score = kMaxScore;
715
+ if (SearchBestPrev(&nodes[0], cur_node, disto0, codes, lambda)) {
716
+ ++cur_node;
717
+ }
718
+ --nbits;
719
+ if (nbits <= 0) break;
720
+ v = (1 << nbits) - 1;
721
+ }
722
+ }
723
+ // search best entry point backward
724
+ const TrellisNode* nz = &nodes[0];
725
+ if (cur_node != nz) {
726
+ score_t best_score = kMaxScore;
727
+ while (cur_node-- != &nodes[0]) {
728
+ const uint32_t disto = disto0[63] - disto0[cur_node->pos];
729
+ // No need to incorporate EOB's bit cost (codes[0x00]), since
730
+ // it's the same for all coeff except the last one #63.
731
+ cur_node->disto += disto;
732
+ cur_node->score += disto;
733
+ if (cur_node->score < best_score) {
734
+ nz = cur_node;
735
+ best_score = cur_node->score;
736
+ }
737
+ }
738
+ }
739
+ int nb = nz->rank;
740
+ out->idx_ = idx;
741
+ out->last_ = nz->pos;
742
+ out->nb_coeffs_ = nb;
743
+
744
+ while (nb-- > 0) {
745
+ const int32_t code = nz->code;
746
+ const int n = nz->nbits;
747
+ rl[nb].level_ = (code << 4) | n;
748
+ rl[nb].run_ = nz->run;
749
+ nz = nz->best_prev;
750
+ }
751
+ const int dc = (in[0] < 0) ? -QUANTIZE(-in[0], iquant[0], bias[0])
752
+ : QUANTIZE(in[0], iquant[0], bias[0]);
753
+ return dc;
754
+ }
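// Rate-distortion recap of the trellis above (worked from the code): each node
// carries score = sum of squared quantization errors + lambda * bits, with a
// per-band lambda = (quant_[j] << AC_BITS)^2 / 32. For every non-zero
// coefficient, NUM_TRELLIS_NODES = 2 candidate levels are examined: the plain
// quantized value v, and the largest value representable with one bit less,
// (1 << (nbits - 1)) - 1.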
755
+
756
+ Encoder::QuantizeBlockFunc Encoder::GetQuantizeBlockFunc() {
757
+ #if defined(SJPEG_USE_SSE2)
758
+ if (SupportsSSE2()) return QuantizeBlockSSE2;
759
+ #elif defined(SJPEG_USE_NEON)
760
+ if (SupportsNEON()) return QuantizeBlockNEON;
761
+ #endif
762
+ return QuantizeBlock; // default
763
+ }
764
+
765
+ ////////////////////////////////////////////////////////////////////////////////
766
+
767
+ #if defined(SJPEG_USE_SSE2)
768
+ // Load eight 16b-words from *src.
769
+ #define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
770
+ #define LOAD_64(src) _mm_loadl_epi64((const __m128i*)(src))
771
+ // Store eight 16b-words into *dst
772
+ #define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))
773
+
774
+ static uint32_t QuantizeErrorSSE2(const int16_t in[64],
775
+ const Quantizer* const Q) {
776
+ const uint16_t* const bias = Q->bias_;
777
+ const uint16_t* const iquant = Q->iquant_;
778
+ const uint8_t* const quant = Q->quant_;
779
+ const __m128i zero = _mm_setzero_si128();
780
+ uint32_t tmp[32];
781
+ for (int i = 0; i < 64; i += 8) {
782
+ const __m128i m_bias = LOAD_16(bias + i);
783
+ const __m128i m_iquant = LOAD_16(iquant + i);
784
+ const __m128i m_quant = _mm_unpacklo_epi8(LOAD_64(quant + i), zero);
785
+ const __m128i A = LOAD_16(in + i); // v0 = in[i]
786
+ const __m128i B = _mm_srai_epi16(A, 15); // sign extract
787
+ const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B); // abs(v0)
788
+ const __m128i D = _mm_adds_epi16(C, m_bias); // v' = v0 + bias
789
+ const __m128i E = _mm_mulhi_epu16(D, m_iquant); // (v' * iq) >> 16
790
+ const __m128i F = _mm_srai_epi16(E, AC_BITS);
791
+ const __m128i G = _mm_srai_epi16(C, AC_BITS);
792
+ const __m128i H = _mm_mullo_epi16(F, m_quant); // *= quant[j]
793
+ const __m128i I = _mm_sub_epi16(G, H);
794
+ const __m128i J = _mm_madd_epi16(I, I); // (v0-v) ^ 2
795
+ STORE_16(J, tmp + i / 2);
796
+ }
797
+ uint32_t err = 0;
798
+ for (int i = 0; i < 32; ++i) err += tmp[i];
799
+ return err;
800
+ }
801
+ #undef LOAD_16
802
+ #undef LOAD_64
803
+ #undef STORE_16
804
+
805
+ #elif defined(SJPEG_USE_NEON)
806
+
807
+ static uint32_t QuantizeErrorNEON(const int16_t in[64],
808
+ const Quantizer* const Q) {
809
+ const uint16_t* const bias = Q->bias_;
810
+ const uint16_t* const iquant = Q->iquant_;
811
+ const uint8_t* const quant = Q->quant_;
812
+ uint32x4_t sum1 = vdupq_n_u32(0);
813
+ uint32x4_t sum2 = vdupq_n_u32(0);
814
+ for (int i = 0; i < 64; i += 8) {
815
+ const uint16x8_t m_bias = vld1q_u16(bias + i);
816
+ const uint16x8_t m_mult = vld1q_u16(iquant + i);
817
+ const uint16x8_t m_quant = vmovl_u8(vld1_u8(quant + i));
818
+ const uint16x8_t A = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(in + i)));
819
+ const uint16x8_t B = vaddq_u16(A, m_bias);
820
+ const uint32x4_t C0 = vmull_u16(vget_low_u16(B), vget_low_u16(m_mult));
821
+ const uint32x4_t C1 = vmull_u16(vget_high_u16(B), vget_high_u16(m_mult));
822
+ // collect hi-words of the 32b mult result using 'unzip'
823
+ const uint16x8x2_t D = vuzpq_u16(vreinterpretq_u16_u32(C0),
824
+ vreinterpretq_u16_u32(C1));
825
+ const uint16x8_t E = vshrq_n_u16(D.val[1], AC_BITS);
826
+ const uint16x8_t F = vmulq_u16(E, m_quant); // dequantized coeff
827
+ const uint16x8_t G = vabdq_u16(F, vshrq_n_u16(A, AC_BITS));
828
+ sum1 = vmlal_u16(sum1, vget_low_u16(G), vget_low_u16(G));
829
+ sum2 = vmlal_u16(sum2, vget_high_u16(G), vget_high_u16(G));
830
+ }
831
+ const uint32x4_t sum3 = vaddq_u32(sum1, sum2);
832
+ const uint64x2_t sum4 = vpaddlq_u32(sum3);
833
+ const uint64_t sum5 = vgetq_lane_u64(sum4, 0) + vgetq_lane_u64(sum4, 1);
834
+ const uint32_t err = (uint32_t)sum5;
835
+ return err;
836
+ }
837
+
838
+ #endif // SJPEG_USE_NEON
839
+
840
+ static uint32_t QuantizeError(const int16_t in[64], const Quantizer* const Q) {
841
+ const uint16_t* const bias = Q->bias_;
842
+ const uint16_t* const iquant = Q->iquant_;
843
+ const uint8_t* const quant = Q->quant_;
844
+ uint32_t err = 0;
845
+ for (int j = 0; j < 64; ++j) {
846
+ int32_t v0 = (in[j] < 0) ? -in[j] : in[j];
847
+ const uint32_t v = quant[j] * QUANTIZE(v0, iquant[j], bias[j]);
848
+ v0 >>= AC_BITS;
849
+ err += (v0 - v) * (v0 - v);
850
+ }
851
+ return err;
852
+ }
853
+
854
+ Encoder::QuantizeErrorFunc Encoder::GetQuantizeErrorFunc() {
855
+ #if defined(SJPEG_USE_SSE2)
856
+ if (SupportsSSE2()) return QuantizeErrorSSE2;
857
+ #elif defined(SJPEG_USE_NEON)
858
+ if (SupportsNEON()) return QuantizeErrorNEON;
859
+ #endif
860
+ return QuantizeError; // default
861
+ }
862
+
863
+ ////////////////////////////////////////////////////////////////////////////////
864
+ // Code bitstream
865
+
866
+ void Encoder::ResetDCs() {
867
+ for (int c = 0; c < nb_comps_; ++c) {
868
+ DCs_[c] = 0;
869
+ }
870
+ }
871
+
872
+ void Encoder::CodeBlock(const DCTCoeffs* const coeffs,
873
+ const RunLevel* const rl) {
874
+ const int idx = coeffs->idx_;
875
+ const int q_idx = quant_idx_[idx];
876
+
877
+ // DC coefficient symbol
878
+ const int dc_len = coeffs->dc_code_ & 0x0f;
879
+ const uint32_t code = dc_codes_[q_idx][dc_len];
880
+ bw_.PutPackedCode(code);
881
+ if (dc_len > 0) {
882
+ bw_.PutBits(coeffs->dc_code_ >> 4, dc_len);
883
+ }
884
+
885
+ // AC coeffs
886
+ const uint32_t* const codes = ac_codes_[q_idx];
887
+ for (int i = 0; i < coeffs->nb_coeffs_; ++i) {
888
+ int run = rl[i].run_;
889
+ while (run & ~15) { // escapes
890
+ bw_.PutPackedCode(codes[0xf0]);
891
+ run -= 16;
892
+ }
893
+ const uint32_t suffix = rl[i].level_;
894
+ const int n = suffix & 0x0f;
895
+ const int sym = (run << 4) | n;
896
+ bw_.PutPackedCode(codes[sym]);
897
+ bw_.PutBits(suffix >> 4, n);
898
+ }
899
+ if (coeffs->last_ < 63) { // EOB
900
+ bw_.PutPackedCode(codes[0x00]);
901
+ }
902
+ }
903
+
904
+ ////////////////////////////////////////////////////////////////////////////////
905
+ // Histogram
906
+
907
+ void Encoder::ResetHisto() {
908
+ memset(histos_, 0, sizeof(histos_));
909
+ }
910
+
911
+ #if defined(SJPEG_USE_SSE2)
912
+ void StoreHistoSSE2(const int16_t in[64], Histo* const histos, int nb_blocks) {
913
+ const __m128i kMaxHisto = _mm_set1_epi16(MAX_HISTO_DCT_COEFF);
914
+ for (int n = 0; n < nb_blocks; ++n, in += 64) {
915
+ uint16_t tmp[64];
916
+ for (int i = 0; i < 64; i += 8) {
917
+ const __m128i A =
918
+ _mm_loadu_si128(reinterpret_cast<const __m128i*>(in + i));
919
+ const __m128i B = _mm_srai_epi16(A, 15); // sign extract
920
+ const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B); // abs(A)
921
+ const __m128i D = _mm_srli_epi16(C, HSHIFT); // >>= HSHIFT
922
+ const __m128i E = _mm_min_epi16(D, kMaxHisto);
923
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp + i), E);
924
+ }
925
+ for (int j = 0; j < 64; ++j) {
926
+ const int k = tmp[j];
927
+ ++histos->counts_[j][k];
928
+ }
929
+ }
930
+ }
931
+ #elif defined(SJPEG_USE_NEON)
932
+ void StoreHistoNEON(const int16_t in[64], Histo* const histos, int nb_blocks) {
933
+ const uint16x8_t kMaxHisto = vdupq_n_u16(MAX_HISTO_DCT_COEFF);
934
+ for (int n = 0; n < nb_blocks; ++n, in += 64) {
935
+ uint16_t tmp[64];
936
+ for (int i = 0; i < 64; i += 8) {
937
+ const int16x8_t A = vld1q_s16(in + i);
938
+ const int16x8_t B = vabsq_s16(A); // abs(in)
939
+ const uint16x8_t C = vreinterpretq_u16_s16(B); // signed->unsigned
940
+ const uint16x8_t D = vshrq_n_u16(C, HSHIFT); // >>= HSHIFT
941
+ const uint16x8_t E = vminq_u16(D, kMaxHisto); // min(.,kMaxHisto)
942
+ vst1q_u16(tmp + i, E);
943
+ }
944
+ for (int j = 0; j < 64; ++j) {
945
+ const int k = tmp[j];
946
+ ++histos->counts_[j][k];
947
+ }
948
+ }
949
+ }
950
+ #endif
951
+
952
+ // This C version does not produce the same counts_[] output as the
953
+ // assembly above. But the extra entry counts_[MAX_HISTO_DCT_COEFF] is
954
+ // not used for the final computation, and the global result is unchanged.
955
+ void StoreHisto(const int16_t in[64], Histo* const histos, int nb_blocks) {
956
+ for (int n = 0; n < nb_blocks; ++n, in += 64) {
957
+ for (int i = 0; i < 64; ++i) {
958
+ const int k = (in[i] < 0 ? -in[i] : in[i]) >> HSHIFT;
959
+ if (k < MAX_HISTO_DCT_COEFF) {
960
+ ++histos->counts_[i][k];
961
+ }
962
+ }
963
+ }
964
+ }
965
+
966
+ Encoder::StoreHistoFunc Encoder::GetStoreHistoFunc() {
967
+ #if defined(SJPEG_USE_SSE2)
968
+ if (SupportsSSE2()) return StoreHistoSSE2;
969
+ #elif defined(SJPEG_USE_NEON)
970
+ if (SupportsNEON()) return StoreHistoNEON;
971
+ #endif
972
+ return StoreHisto; // default
973
+ }
974
+
975
+ const float Encoder::kHistoWeight[QSIZE] = {
976
+ // Gaussian with sigma ~= 3
977
+ 0, 0, 0, 0, 0,
978
+ 1, 5, 16, 43, 94, 164, 228, 255, 228, 164, 94, 43, 16, 5, 1,
979
+ 0, 0, 0, 0, 0
980
+ };
981
+
982
+ void Encoder::AnalyseHisto() {
983
+ // A bit of theory and background: for each sub-band i in [0..63], we pick a
984
+ // quantization scale New_Qi close to the initial one Qi. We evaluate a cost
985
+ // function associated with F({New_Qi}) = distortion + lambda . rate,
986
+ // where rate and distortion depend on the quantizers set in a complex non-
987
+ // analytic way. Just, for well-behaved regular histograms, we expect the
988
+ // rate to scale as -log(Q), and the distortion as Q^2.
989
+ // We want the cost function to be stationary around the initial {Qi} set,
990
+ // in order to achieve the best transfer between distortion and rate when we
991
+ // slightly displace the Qi values. Mainly, we want to use bits as efficiently
992
+ // as possible, where every bit we use has maximal impact in lowering
993
+ // distortion (and vice versa: if we spend an extra bit of coding, we want to
994
+ // have the best bang for this buck. The optimization works up-hill too).
995
+ //
996
+ // Hence, lambda is picked to minimize F around {Qi}, as:
997
+ // lambda = -d(distortion) / d(rate)
998
+ // where the derivates are evaluated using a double least-square fit on both
999
+ // the clouds of {delta, distortion} and {delta, size} points.
1000
+ //
1001
+ // Note1: The least-square fitted slope of a {x,y} cloud is expressed as:
1002
+ // slope = (<xy> - <x><y>) / (<xx> - <x><x>) = Cov(x,y) / Cov(x,x)
1003
+ // where <.> is our gaussian-averaging operator.
1004
+ // But since we are eventually computing a quotient of such slopes, we can
1005
+ // factor out the common (<xx> - <x><x>) denominator (which is strictly
1006
+ // positive).
1007
+ // Note2: we use a Gaussian-weighted average around the center value Qi
1008
+ // instead of averaging over the whole [QDELTA_MIN, QDELTA_MAX] range.
1009
+ // This rules out fringe samples on noisy cases (like: when the source is
1010
+ // already JPEG-compressed!).
1011
+ // Note3: We fall back to some sane value HLAMBDA in case of ill-condition.
1012
+ //
1013
+ // We use the correlation coefficient
1014
+ // r = Cov(x,y) / sqrt(Cov(x,x) * Cov(y,y))
1015
+ // to detect bad cases with poorly extrapolated distortion. In such
1016
+ // cases, we skip the channel. This is particularly important for
1017
+ // already-compressed JPEG sources that give treacherous comb-like
1018
+ // histograms.
1019
+ //
1020
+ // Once this particular lambda has been picked, we loop over each channel
1021
+ // and optimize them separately, locally picking the best New_Qi for each.
1022
+ // The choice of lambda ensures a good balance between size and distortion,
1023
+ // and prevents being overly aggressive on file-size reduction, for instance.
1024
+ //
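// In terms of the weighted sums accumulated below (sw, sx, sxx, sy1, sxy1 for
// the distortion cloud and sy2, sxy2 for the size cloud), both fitted slopes
// share the denominator (sw*sxx - sx*sx), which cancels in the ratio, so:
//   lambda ~= - sum_over_bands(sw*sxy1 - sx*sy1) / sum_over_bands(sw*sxy2 - sx*sy2)
// This is exactly the num / den computation performed in the loop below.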
1025
+ const double r_limit = kCorrelationThreshold;
1026
+ for (int c = (nb_comps_ > 1 ? 1 : 0); c >= 0; --c) {
1027
+ const int idx = quant_idx_[c];
1028
+ const Histo* const histo = &histos_[idx];
1029
+ // For chrominance, it can be visually damaging to be too
1030
+ // aggressive on the filesize. So with the default settings we
1031
+ // restrict the algorithm to mainly try to *increase* the bitrate
1032
+ // (and quality) by using a smaller qdelta_max_chroma_.
1033
+ // delta_max is only used during the second phase, but not during
1034
+ // the first phase of deriving an optimal lambda.
1035
+ assert(QDELTA_MAX >= qdelta_max_luma_);
1036
+ assert(QDELTA_MAX >= qdelta_max_chroma_);
1037
+ const int delta_max =
1038
+ ((idx == 0) ? qdelta_max_luma_ : qdelta_max_chroma_) - QDELTA_MIN;
1039
+ assert(delta_max < QSIZE);
1040
+ float sizes[64][QSIZE];
1041
+ float distortions[64][QSIZE];
1042
+ double num = 0.; // accumulate d(distortion) around delta_q = 0
1043
+ double den = 0.; // accumulate d(size) around delta_q = 0
1044
+ uint64_t omit_channels = kOmittedChannels;
1045
+ for (int pos = 0; pos < 64; ++pos) {
1046
+ if (omit_channels & (1ULL << pos)) {
1047
+ continue;
1048
+ }
1049
+ const int dq0 = quants_[idx].quant_[pos];
1050
+ const int min_dq0 = quants_[idx].min_quant_[pos];
1051
+ // We should be using the exact bias:
1052
+ // const int bias = quants_[idx].bias_[pos] << (FP_BITS - AC_BITS);
1053
+ // but this value is too precise considering the other approximations
1054
+ // we're using (namely: HSHIFT). So we'd better use a mid value of 0.5
1055
+ // for the bias. This has the advantage of making it possible to
1056
+ // use pre-calculated look-up tables for every quantity in the loop.
1057
+ // This is still a TODO(skal) below, though. Not sure the gain is big.
1058
+ const int bias = 1 << FP_BITS >> 1;
1059
+ const int* const h = histo->counts_[pos];
1060
+ int total = 0;
1061
+ int last = 0;
1062
+ for (int i = 0; i < MAX_HISTO_DCT_COEFF; ++i) {
1063
+ total += h[i];
1064
+ if (h[i]) last = i + 1;
1065
+ }
1066
+ if (total < kDensityThreshold * last) {
1067
+ omit_channels |= 1ULL << pos;
1068
+ continue;
1069
+ }
1070
+ // accumulators for averaged values.
1071
+ double sw = 0., sx = 0.;
1072
+ double sxx = 0., syy1 = 0.;
1073
+ double sy1 = 0., sxy1 = 0.; // accumulators for distortion cloud
1074
+ double sy2 = 0., sxy2 = 0.; // accumulators for size cloud
1075
+ for (int delta = 0; delta < QSIZE; ++delta) {
1076
+ double bsum = 0., dsum = 0.;
1077
+ const int dq = dq0 + (delta + QDELTA_MIN);
1078
+ if (dq >= min_dq0 && dq <= 255) {
1079
+ // TODO(skal): pre-compute idq and use it in FinalizeQuantMatrix too
1080
+ const int idq = ((1 << FP_BITS) + dq - 1) / dq;
1081
+ for (int i = 0; i < last; ++i) {
1082
+ if (h[i]) {
1083
+ // v = current bin's centroid in the histogram
1084
+ // qv = quantized value for the bin's representant 'v'
1085
+ // dqv = dequantized qv, to be compared against v (=> 'error')
1086
+ // bits = approximate bit-cost of quantized representant
1087
+ // h[i] = this bin's weight
1088
+ const int v = (i << HSHIFT) + HHALF;
1089
+ const int qv = (v * idq + bias) >> FP_BITS;
1090
+ // TODO(skal): for a given 'last' value, we know the upper limit
1091
+ // on dq that will make *all* quantized 'qv' values be zero.
1092
+ // => We can restrict the loop on 'dq' using 'last'.
1093
+ if (qv) {
1094
+ const int bits = CalcLog2(qv);
1095
+ const int dqv = qv * dq;
1096
+ const int error = (v - dqv) * (v - dqv);
1097
+ bsum += h[i] * bits;
1098
+ dsum += h[i] * error;
1099
+ } else {
1100
+ dsum += h[i] * v * v;
1101
+ }
1102
+ }
1103
+ } // end of 'i' loop
1104
+ distortions[pos][delta] = static_cast<float>(dsum);
1105
+ sizes[pos][delta] = static_cast<float>(bsum);
1106
+ const double w = kHistoWeight[delta]; // Gaussian weight
1107
+ if (w > 0.) {
1108
+ const double x = static_cast<double>(delta + QDELTA_MIN);
1109
+ sw += w;
1110
+ sx += w * x;
1111
+ sxx += w * x * x;
1112
+ sy1 += w * dsum;
1113
+ syy1 += w * dsum * dsum;
1114
+ sy2 += w * bsum;
1115
+ sxy1 += w * dsum * x;
1116
+ sxy2 += w * bsum * x;
1117
+ }
1118
+ } else { // the new quantizer is out-of-range.
1119
+ distortions[pos][delta] = FLT_MAX;
1120
+ sizes[pos][delta] = 0;
1121
+ }
1122
+ }
1123
+ // filter channels according to correlation factor.
1124
+ const double cov_xy1 = sw * sxy1 - sx * sy1;
1125
+ if (cov_xy1 * cov_xy1 < r_limit *
1126
+ (sw * sxx - sx * sx) * (sw * syy1 - sy1 * sy1)) {
1127
+ omit_channels |= 1ULL << pos;
1128
+ continue;
1129
+ }
1130
+ // accumulate numerator and denominator for the derivate calculation
1131
+ num += cov_xy1;
1132
+ den += sw * sxy2 - sx * sy2;
1133
+ }
1134
+
1135
+ // we evaluate lambda =~ -d(distortion)/d(size) at dq=0
1136
+ double lambda = HLAMBDA;
1137
+ // When increasing Q, size should significantly decrease and distortion
1138
+ // increase. If they don't, we are ill-conditioned and should fall back
1139
+ // to a safe value HLAMBDA.
1140
+ if (num > 1000. && den < -10.) {
1141
+ // This is our approximation of -d(Distortion) / d(Rate)
1142
+ // We limit it to 1. below, to avoid degenerated cases
1143
+ lambda = -num / den;
1144
+ if (lambda < 1.) {
1145
+ lambda = 1.;
1146
+ }
1147
+ }
1148
+ // now, optimize each channel using the optimal lambda selection
1149
+ for (int pos = 0; pos < 64; ++pos) {
1150
+ if (omit_channels & (1ULL << pos)) {
1151
+ continue;
1152
+ }
1153
+ float best_score = FLT_MAX;
1154
+ int best_dq = 0;
1155
+ for (int delta = 0; delta <= delta_max; ++delta) {
1156
+ if (distortions[pos][delta] < FLT_MAX) {
1157
+ const float score = distortions[pos][delta]
1158
+ + lambda * sizes[pos][delta];
1159
+ if (score < best_score) {
1160
+ best_score = score;
1161
+ best_dq = delta + QDELTA_MIN;
1162
+ }
1163
+ }
1164
+ }
1165
+ quants_[idx].quant_[pos] += best_dq;
1166
+ assert(quants_[idx].quant_[pos] >= 1);
1167
+ }
1168
+ FinalizeQuantMatrix(&quants_[idx], q_bias_);
1169
+ SetCostCodes(idx);
1170
+ }
1171
+ }
1172
+
1173
+ void Encoder::CollectHistograms() {
1174
+ ResetHisto();
1175
+ int16_t* in = in_blocks_;
1176
+ const int mb_x_max = W_ / block_w_;
1177
+ const int mb_y_max = H_ / block_h_;
1178
+ for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
1179
+ const bool yclip = (mb_y == mb_y_max);
1180
+ for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
1181
+ if (!use_extra_memory_) {
1182
+ in = in_blocks_;
1183
+ }
1184
+ GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
1185
+ fDCT_(in, mcu_blocks_);
1186
+ for (int c = 0; c < nb_comps_; ++c) {
1187
+ const int num_blocks = nb_blocks_[c];
1188
+ store_histo_(in, &histos_[quant_idx_[c]], num_blocks);
1189
+ in += 64 * num_blocks;
1190
+ }
1191
+ }
1192
+ }
1193
+ have_coeffs_ = use_extra_memory_;
1194
+ }
1195
+
1196
+ ////////////////////////////////////////////////////////////////////////////////
1197
+ // Perform YUV conversion and fDCT, and store the unquantized coeffs
1198
+
1199
+ void Encoder::CollectCoeffs() {
1200
+ assert(use_extra_memory_);
1201
+ int16_t* in = in_blocks_;
1202
+ const int mb_x_max = W_ / block_w_;
1203
+ const int mb_y_max = H_ / block_h_;
1204
+ for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
1205
+ const bool yclip = (mb_y == mb_y_max);
1206
+ for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
1207
+ GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
1208
+ fDCT_(in, mcu_blocks_);
1209
+ in += 64 * mcu_blocks_;
1210
+ }
1211
+ }
1212
+ have_coeffs_ = true;
1213
+ }
1214
+
1215
+ ////////////////////////////////////////////////////////////////////////////////
1216
+ // 1-pass Scan
1217
+
1218
+ void Encoder::SinglePassScan() {
1219
+ ResetDCs();
1220
+
1221
+ RunLevel base_run_levels[64];
1222
+ int16_t* in = in_blocks_;
1223
+ const int mb_x_max = W_ / block_w_;
1224
+ const int mb_y_max = H_ / block_h_;
1225
+ const QuantizeBlockFunc quantize_block = use_trellis_ ? TrellisQuantizeBlock
1226
+ : quantize_block_;
1227
+ for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
1228
+ const bool yclip = (mb_y == mb_y_max);
1229
+ for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
1230
+ if (!CheckBuffers()) return;
1231
+ if (!have_coeffs_) {
1232
+ in = in_blocks_;
1233
+ GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
1234
+ fDCT_(in, mcu_blocks_);
1235
+ }
1236
+ for (int c = 0; c < nb_comps_; ++c) {
1237
+ DCTCoeffs base_coeffs;
1238
+ for (int i = 0; i < nb_blocks_[c]; ++i) {
1239
+ const int dc = quantize_block(in, c, &quants_[quant_idx_[c]],
1240
+ &base_coeffs, base_run_levels);
1241
+ base_coeffs.dc_code_ = GenerateDCDiffCode(dc, &DCs_[c]);
1242
+ CodeBlock(&base_coeffs, base_run_levels);
1243
+ in += 64;
1244
+ }
1245
+ }
1246
+ }
1247
+ }
1248
+ }
1249
+
1250
+ void Encoder::FinalPassScan(size_t nb_mbs, const DCTCoeffs* coeffs) {
1251
+ DesallocateBlocks(); // we can free up some coeffs memory at this point
1252
+ if (!CheckBuffers()) return; // call needed to finalize all_run_levels_
1253
+ assert(reuse_run_levels_);
1254
+ const RunLevel* run_levels = all_run_levels_;
1255
+ for (size_t n = 0; n < nb_mbs; ++n) {
1256
+ if (!CheckBuffers()) return;
1257
+ CodeBlock(&coeffs[n], run_levels);
1258
+ run_levels += coeffs[n].nb_coeffs_;
1259
+ }
1260
+ }
1261
+
1262
+ ////////////////////////////////////////////////////////////////////////////////
1263
+ // Huffman tables optimization
1264
+
1265
+ void Encoder::ResetEntropyStats() {
1266
+ memset(freq_ac_, 0, sizeof(freq_ac_));
1267
+ memset(freq_dc_, 0, sizeof(freq_dc_));
1268
+ }
1269
+
1270
+ void Encoder::AddEntropyStats(const DCTCoeffs* const coeffs,
1271
+ const RunLevel* const run_levels) {
1272
+ // freq_ac_[] and freq_dc_[] cannot overflow 32bits, since the maximum
1273
+ // resolution allowed is 65535 * 65535. The sum of all frequencies cannot
1274
+ // be greater than 32bits, either.
1275
+ const int idx = coeffs->idx_;
1276
+ const int q_idx = quant_idx_[idx];
1277
+ for (int i = 0; i < coeffs->nb_coeffs_; ++i) {
1278
+ const int run = run_levels[i].run_;
1279
+ const int tmp = (run >> 4);
1280
+ if (tmp) freq_ac_[q_idx][0xf0] += tmp; // count escapes (all at once)
1281
+ const int suffix = run_levels[i].level_;
1282
+ const int sym = ((run & 0x0f) << 4) | (suffix & 0x0f);
1283
+ ++freq_ac_[q_idx][sym];
1284
+ }
1285
+ if (coeffs->last_ < 63) { // EOB
1286
+ ++freq_ac_[q_idx][0x00];
1287
+ }
1288
+ ++freq_dc_[q_idx][coeffs->dc_code_ & 0x0f];
1289
+ }
1290
+
1291
+ static int cmp(const void *pa, const void *pb) {
1292
+ const uint64_t a = *reinterpret_cast<const uint64_t*>(pa);
1293
+ const uint64_t b = *reinterpret_cast<const uint64_t*>(pb);
1294
+ assert(a != b); // tie-breaks can't happen
1295
+ return (a < b) ? 1 : -1;
1296
+ }
1297
+
1298
+ static void BuildOptimalTable(HuffmanTable* const t,
1299
+ const uint32_t* const freq, int size) {
1300
+ enum { MAX_BITS = 32, MAX_CODE_SIZE = 16 };
1301
+ assert(size <= 256);
1302
+ assert(t != nullptr);
1303
+
1304
+ // The celebrated merging algorithm from Huffman, with some restrictions:
1305
+ // * codes with all '1' are forbidden, to avoid trailing marker emulation
1306
+ // * codes should be at most 16 bits long. So we re-allocate them to shorter
1307
+ // codes, even if it means being suboptimal for extremely rare symbols that
1308
+ // would eat a lot of bits.
1309
+ // This function will not touch the content of freq[].
1310
+ int codesizes[256 + 1];
1311
+ // chain[i] will hold the index of the next element in the subtree below
1312
+ // element 'i', or -1 if there's no sub-tree.
1313
+ // We use and maintain this list in order to efficiently increase the
1314
+ // codesizes by one when merging two sub-trees into one.
1315
+ // To ease the merging (by avoiding 1 loop) we store the address of the last
1316
+ // element in the chain for each symbol. This makes the process O(1).
1317
+ // It's probably better to keep the arrays separated instead of making
1318
+ // a struct, since we touch chain_end[] only once per merging, whereas
1319
+ // chain[] and codesizes[] are modified O(k) time per merging.
1320
+ int chain[256 + 1];
1321
+ int* chain_end[256 + 1];
1322
+ // sorted_freq[] remains sorted by decreasing frequencies along the process.
1323
+ uint64_t sorted_freq[256 + 1];
1324
+
1325
+ // Count the symbols actually used and put them at the beginning of the table.
1326
+ int nb_syms = 0;
1327
+ for (int i = 0; i < size; ++i) {
1328
+ const uint64_t v = freq[i];
1329
+ if (v > 0) {
1330
+ // we pack the sorted key (32bits) and index (9bits) into a single
1331
+ // uint64_t, so we don't have to resort to structs (and we avoid
1332
+ // tie-breaks, too)
1333
+ sorted_freq[nb_syms++] = (v << 9) | i;
1334
+ }
1335
+ codesizes[i] = 0;
1336
+ chain[i] = -1;
1337
+ chain_end[i] = &chain[i];
1338
+ }
1339
+ t->nb_syms_ = nb_syms; // Record how many final symbols we'll have.
1340
+
1341
+ // initial sort
1342
+ // TODO(skal): replace by counting-sort?? (merged with previous loop?)
1343
+ qsort(sorted_freq, nb_syms, sizeof(sorted_freq[0]), cmp);
1344
+
1345
+ // fake last symbol, with lowest frequency: will be assigned to the forbidden
1346
+ // code '1111...1', but will eventually be discarded.
1347
+ sorted_freq[nb_syms++] = (1ULL << 9) | size;
1348
+ codesizes[size] = 0;
1349
+ chain[size] = -1;
1350
+ chain_end[size] = &chain[size];
1351
+
1352
+ // Merging phase
1353
+ // Repeatedly merge the two symbols with the lowest frequencies. The resulting
1354
+ // super-symbol will be represented by a longer (by 1bit) code, since
1355
+ // it's the least frequent one.
1356
+ int nb = nb_syms;
1357
+ while (nb-- > 1) {
1358
+ // First, link the two sub-trees.
1359
+ const uint64_t s1 = sorted_freq[nb - 1]; // first symbol
1360
+ const uint64_t s2 = sorted_freq[nb]; // second symbol, appended
1361
+ // The 0x1ff masking is for taking only the symbol, discarding the
1362
+ // frequency that we stored in the upper bits for sorting.
1363
+ int i = s1 & 0x1ff;
1364
+ const int j = s2 & 0x1ff;
1365
+ assert(i <= size && j <= size);
1366
+ *chain_end[i] = j;
1367
+ chain_end[i] = chain_end[j];
1368
+
1369
+ // Then, following the chain, increase the whole sub-tree's weight by 1bit.
1370
+ do {
1371
+ ++codesizes[i];
1372
+ i = chain[i];
1373
+ } while (i >= 0);
1374
+
1375
+ // Create new symbol, with merged frequencies. Will take s1's spot.
1376
+ // We must use 64bit here to prevent overflow in the sum. Both s1 and
1377
+ // s2 are originally 32 + 9 bits wide.
1378
+ const uint64_t new_symbol = s1 + (s2 & ~0x1ff);
1379
+ // Perform insertion sort to find the new spot of the merged symbol.
1380
+ int k = nb - 1;
1381
+ while (k > 0) {
1382
+ if (sorted_freq[k - 1] < new_symbol) {
1383
+ sorted_freq[k] = sorted_freq[k - 1];
1384
+ --k;
1385
+ } else {
1386
+ break;
1387
+ }
1388
+ }
1389
+ sorted_freq[k] = new_symbol;
1390
+ }
1391
+
1392
+ // Count bit distribution.
1393
+ uint8_t bits[MAX_BITS];
1394
+ memset(bits, 0, sizeof(bits));
1395
+ int max_bit_size = 0;
1396
+ for (int i = 0; i <= size; ++i) {
1397
+ int s = codesizes[i];
1398
+ assert(s <= codesizes[size]); // symbol #size is the biggest one.
1399
+ if (s > 0) {
1400
+ // This is slightly penalizing, but only for ultra-rare symbols
1401
+ if (s > MAX_BITS) {
1402
+ s = MAX_BITS;
1403
+ codesizes[i] = MAX_BITS; // clamp code-size
1404
+ }
1405
+ ++bits[s - 1];
1406
+ if (s > max_bit_size) {
1407
+ max_bit_size = s;
1408
+ }
1409
+ }
1410
+ }
1411
+
1412
+ // We sort symbols by slices of increasing bitsizes, using counting sort.
1413
+ // This will generate a partition of symbols in the final syms_[] array.
1414
+ int start[MAX_BITS]; // start[i] is the first code with length i+1
1415
+ int position = 0;
1416
+ for (int i = 0; i < max_bit_size; ++i) {
1417
+ start[i] = position;
1418
+ position += bits[i];
1419
+ }
1420
+ assert(position == nb_syms);
1421
+
1422
+ // Now, we can distribute the symbols directly to their final slice in the
1423
+ // partitioning, according to their bit-length.
1424
+ // Note: we omit the last symbol, which is fake.
1425
+ uint8_t* const syms = const_cast<uint8_t*>(t->syms_);
1426
+ // Note that we loop until symbol = size-1, hence omitting the last fake symbol.
1427
+ for (int symbol = 0; symbol < size; ++symbol) {
1428
+ const int s = codesizes[symbol];
1429
+ if (s > 0) {
1430
+ assert(s <= MAX_BITS);
1431
+ syms[start[s - 1]++] = symbol;
1432
+ }
1433
+ }
1434
+ assert(start[max_bit_size - 1] == nb_syms - 1);
1435
+
1436
+ // Fix codes with length greater than 16 bits. We move overly long
1437
+ // codes up, and one shorter code down, making the tree slightly sub-optimal.
1438
+ for (int l = max_bit_size - 1; l >= MAX_CODE_SIZE; --l) {
1439
+ while (bits[l] > 0) {
1440
+ int k = l - 2;
1441
+ while (bits[k] == 0) { // Search for a level with a leaf to split.
1442
+ --k;
1443
+ }
1444
+ /* Move up 2 symbols from bottom-most level l, and sink down one from
1445
+ level k, like this:
1446
+ Before: After:
1447
+ / .. / ..
1448
+ k bits-> c \ /\ \
1449
+ /\ c b /\
1450
+ .. /\ .. a
1451
+ l bits-> a b
1452
+ Note that by the very construction of the optimal tree, the least
1453
+ probable symbols always come by pair with same bit-length.
1454
+ So there's always a pair of 'a' and 'b' to find.
1455
+ */
1456
+ bits[l ] -= 2; // remove 'a' and 'b'
1457
+ bits[l - 1] += 1; // put 'a' one level up.
1458
+ bits[k ] -= 1; // remove 'c'
1459
+ bits[k + 1] += 2; // put 'c' and 'b' one level down.
1460
+ }
1461
+ }
1462
+
1463
+ // remove last pseudo-symbol
1464
+ max_bit_size = MAX_CODE_SIZE;
1465
+ while (bits[--max_bit_size] == 0) {
1466
+ assert(max_bit_size > 0);
1467
+ }
1468
+ --bits[max_bit_size];
1469
+
1470
+ // update table with final book
1471
+ for (int i = 0; i < MAX_CODE_SIZE; ++i) {
1472
+ t->bits_[i] = bits[i];
1473
+ }
1474
+ }
1475
+
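BuildOptimalTable() above grows the code lengths by repeatedly merging the two least frequent sub-trees, then redistributes any code longer than 16 bits. As a rough, self-contained sketch of those two ideas (not the library's implementation: the helper name CodeLengthHistogram is made up, and the pseudo-symbol trick used above is omitted), the same result can be obtained with a priority queue:

#include <cstdint>
#include <functional>
#include <queue>
#include <utility>
#include <vector>

// Returns a histogram h where h[i] is the number of codes of length i+1,
// with no code longer than max_bits (16 for JPEG).
std::vector<int> CodeLengthHistogram(const std::vector<uint32_t>& freq,
                                     int max_bits) {
  const size_t n = freq.size();
  std::vector<int> parent(2 * n, -1);
  typedef std::pair<uint64_t, size_t> Node;   // (weight, node index)
  std::priority_queue<Node, std::vector<Node>, std::greater<Node> > heap;
  for (size_t i = 0; i < n; ++i) {
    if (freq[i] > 0) heap.push(Node(freq[i], i));
  }
  size_t next = n;                  // internal nodes get indices n, n+1, ...
  while (heap.size() > 1) {         // merge the two least frequent sub-trees
    const Node a = heap.top(); heap.pop();
    const Node b = heap.top(); heap.pop();
    parent[a.second] = parent[b.second] = static_cast<int>(next);
    heap.push(Node(a.first + b.first, next));
    ++next;
  }
  std::vector<int> bits(2 * n + 1, 0);  // bits[d-1] = number of leaves at depth d
  for (size_t i = 0; i < n; ++i) {
    if (freq[i] == 0) continue;
    int depth = 0;
    for (int p = static_cast<int>(i); parent[p] >= 0; p = parent[p]) ++depth;
    ++bits[(depth > 0 ? depth : 1) - 1];
  }
  // Length-limiting pass, as described in the comments above: drop the two
  // deepest leaves, move one of them up, and split a shallower leaf to host
  // the other.
  for (int l = static_cast<int>(bits.size()) - 1; l >= max_bits; --l) {
    while (bits[l] > 0) {
      int k = l - 2;
      while (bits[k] == 0) --k;
      bits[l] -= 2;
      bits[l - 1] += 1;
      bits[k] -= 1;
      bits[k + 1] += 2;
    }
  }
  bits.resize(max_bits);
  return bits;
}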
1476
+ void Encoder::CompileEntropyStats() {
1477
+ // plug and build new tables
1478
+ for (int q_idx = 0; q_idx < (nb_comps_ == 1 ? 1 : 2); ++q_idx) {
1479
+ // DC tables
1480
+ Huffman_tables_[q_idx] = &opt_tables_dc_[q_idx];
1481
+ opt_tables_dc_[q_idx].syms_ = opt_syms_dc_[q_idx];
1482
+ BuildOptimalTable(&opt_tables_dc_[q_idx], freq_dc_[q_idx], 12);
1483
+ // AC tables
1484
+ Huffman_tables_[2 + q_idx] = &opt_tables_ac_[q_idx];
1485
+ opt_tables_ac_[q_idx].syms_ = opt_syms_ac_[q_idx];
1486
+ BuildOptimalTable(&opt_tables_ac_[q_idx], freq_ac_[q_idx], 256);
1487
+ }
1488
+ }
1489
+
1490
+ void Encoder::StoreOptimalHuffmanTables(size_t nb_mbs,
1491
+ const DCTCoeffs* coeffs) {
1492
+ // optimize Huffman tables
1493
+ ResetEntropyStats();
1494
+ const RunLevel* run_levels = all_run_levels_;
1495
+ for (size_t n = 0; n < nb_mbs; ++n) {
1496
+ AddEntropyStats(&coeffs[n], run_levels);
1497
+ run_levels += coeffs[n].nb_coeffs_;
1498
+ }
1499
+ CompileEntropyStats();
1500
+ }
1501
+
1502
+ ////////////////////////////////////////////////////////////////////////////////
1503
+
1504
+ void Encoder::SinglePassScanOptimized() {
1505
+ const size_t nb_mbs = mb_w_ * mb_h_ * mcu_blocks_;
1506
+ DCTCoeffs* const base_coeffs =
1507
+ Alloc<DCTCoeffs>(reuse_run_levels_ ? nb_mbs : 1);
1508
+ if (base_coeffs == nullptr) return;
1509
+ DCTCoeffs* coeffs = base_coeffs;
1510
+ RunLevel base_run_levels[64];
1511
+ const QuantizeBlockFunc quantize_block = use_trellis_ ? TrellisQuantizeBlock
1512
+ : quantize_block_;
1513
+
1514
+ // We use the default Huffman tables as the basis for bit-rate evaluation
1515
+ if (use_trellis_) InitCodes(true);
1516
+
1517
+ ResetEntropyStats();
1518
+ ResetDCs();
1519
+ nb_run_levels_ = 0;
1520
+ int16_t* in = in_blocks_;
1521
+ const int mb_x_max = W_ / block_w_;
1522
+ const int mb_y_max = H_ / block_h_;
1523
+ for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
1524
+ const bool yclip = (mb_y == mb_y_max);
1525
+ for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
1526
+ if (!have_coeffs_) {
1527
+ in = in_blocks_;
1528
+ GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
1529
+ fDCT_(in, mcu_blocks_);
1530
+ }
1531
+ if (!CheckBuffers()) goto End;
1532
+ for (int c = 0; c < nb_comps_; ++c) {
1533
+ for (int i = 0; i < nb_blocks_[c]; ++i) {
1534
+ RunLevel* const run_levels =
1535
+ reuse_run_levels_ ? all_run_levels_ + nb_run_levels_
1536
+ : base_run_levels;
1537
+ const int dc = quantize_block(in, c, &quants_[quant_idx_[c]],
1538
+ coeffs, run_levels);
1539
+ coeffs->dc_code_ = GenerateDCDiffCode(dc, &DCs_[c]);
1540
+ AddEntropyStats(coeffs, run_levels);
1541
+ if (reuse_run_levels_) {
1542
+ nb_run_levels_ += coeffs->nb_coeffs_;
1543
+ ++coeffs;
1544
+ assert(coeffs <= &base_coeffs[nb_mbs]);
1545
+ }
1546
+ in += 64;
1547
+ assert(nb_run_levels_ <= max_run_levels_);
1548
+ }
1549
+ }
1550
+ }
1551
+ }
1552
+
1553
+ CompileEntropyStats();
1554
+ WriteDHT();
1555
+ WriteSOS();
1556
+
1557
+ if (!reuse_run_levels_) {
1558
+ SinglePassScan(); // redo everything, but with optimal tables now.
1559
+ } else {
1560
+ // Re-use the saved run/levels for fast 2nd-pass.
1561
+ FinalPassScan(nb_mbs, base_coeffs);
1562
+ }
1563
+ End:
1564
+ Free(base_coeffs);
1565
+ }
1566
+
1567
+ ////////////////////////////////////////////////////////////////////////////////
1568
+ // main call
1569
+
1570
+ bool Encoder::Encode() {
1571
+ if (!ok_) return false;
1572
+
1573
+ FinalizeQuantMatrix(&quants_[0], q_bias_);
1574
+ FinalizeQuantMatrix(&quants_[1], q_bias_);
1575
+ SetCostCodes(0);
1576
+ SetCostCodes(1);
1577
+
1578
+ // default tables
1579
+ for (int i = 0; i < 4; ++i) Huffman_tables_[i] = &kHuffmanTables[i];
1580
+
1581
+ // colorspace init
1582
+ InitComponents();
1583
+ assert(nb_comps_ <= MAX_COMP);
1584
+ assert(mcu_blocks_ <= 6);
1585
+ // validate some input parameters
1586
+ if (W_ <= 0 || H_ <= 0 || rgb_ == nullptr) return false;
1587
+
1588
+ mb_w_ = (W_ + (block_w_ - 1)) / block_w_;
1589
+ mb_h_ = (H_ + (block_h_ - 1)) / block_h_;
1590
+ const size_t nb_blocks = use_extra_memory_ ? mb_w_ * mb_h_ : 1;
1591
+ if (!AllocateBlocks(nb_blocks * mcu_blocks_)) return false;
1592
+
1593
+ WriteAPP0();
1594
+
1595
+ // custom markers written 'as is'
1596
+ if (!WriteAPPMarkers(app_markers_)) return false;
1597
+
1598
+ // metadata
1599
+ if (!WriteEXIF(exif_) || !WriteICCP(iccp_) || !WriteXMP(xmp_)) return false;
1600
+
1601
+ if (passes_ > 1) {
1602
+ LoopScan();
1603
+ } else {
1604
+ if (use_adaptive_quant_) {
1605
+ // Histogram analysis + derive optimal quant matrices
1606
+ CollectHistograms();
1607
+ AnalyseHisto();
1608
+ }
1609
+
1610
+ WriteDQT();
1611
+ WriteSOF();
1612
+
1613
+ if (optimize_size_) {
1614
+ SinglePassScanOptimized();
1615
+ } else {
1616
+ WriteDHT();
1617
+ WriteSOS();
1618
+ SinglePassScan();
1619
+ }
1620
+ }
1621
+ WriteEOI();
1622
+ ok_ = ok_ && bw_.Finalize();
1623
+
1624
+ DesallocateBlocks();
1625
+ return ok_;
1626
+ }
1627
+
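Encode() writes the headers in the order shown above (APP0, application markers, metadata, DQT, SOF, then DHT/SOS and the scan) and closes the stream with WriteEOI(). Any valid JPEG stream is bracketed by the SOI (0xFF 0xD8) and EOI (0xFF 0xD9) markers, so a quick, hedged way to sanity-check an encoded buffer is a small helper like this (not part of the library):

#include <cstddef>
#include <cstdint>

// Minimal structural check: the buffer starts with SOI and ends with EOI.
bool LooksLikeJpeg(const uint8_t* data, size_t size) {
  return size >= 4 &&
         data[0] == 0xFF && data[1] == 0xD8 &&               // SOI
         data[size - 2] == 0xFF && data[size - 1] == 0xD9;   // EOI
}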
1628
+ ////////////////////////////////////////////////////////////////////////////////
1629
+ // Edge replication
1630
+
1631
+ namespace {
1632
+
1633
+ int GetAverage(const int16_t* const out) {
1634
+ int DC = 0;
1635
+ for (int i = 0; i < 64; ++i) DC += out[i];
1636
+ return (DC + 32) >> 6;
1637
+ }
1638
+
1639
+ void SetAverage(int DC, int16_t* const out) {
1640
+ for (int i = 0; i < 64; ++i) out[i] = DC;
1641
+ }
1642
+
1643
+ } // anonymous namespace
1644
+
1645
+ void Encoder::AverageExtraLuma(int sub_w, int sub_h, int16_t* out) {
1646
+ // out[] points to four 8x8 blocks. When one of these blocks lies entirely
1647
+ // outside of the frame, we set it flat to the average value ("DC") of the
1648
+ // previous block, in order to help compressibility.
1649
+ int DC = GetAverage(out);
1650
+ if (sub_w <= 8) { // set block #1 to block #0's average value
1651
+ SetAverage(DC, out + 1 * 64);
1652
+ }
1653
+ if (sub_h <= 8) { // Need to flatten block #2 and #3
1654
+ if (sub_w > 8) { // block #1 was not flattened, so get its real DC
1655
+ DC = GetAverage(out + 1 * 64);
1656
+ }
1657
+ SetAverage(DC, out + 2 * 64);
1658
+ SetAverage(DC, out + 3 * 64);
1659
+ } else if (sub_w <= 8) { // set block #3 to block #2's average value
1660
+ DC = GetAverage(out + 2 * 64);
1661
+ SetAverage(DC, out + 3 * 64);
1662
+ }
1663
+ }
1664
+
1665
+ const uint8_t* Encoder::GetReplicatedSamples(const uint8_t* rgb,
1666
+ int rgb_step,
1667
+ int sub_w, int sub_h,
1668
+ int w, int h) {
1669
+ assert(sub_w > 0 && sub_h > 0);
1670
+ if (sub_w > w) {
1671
+ sub_w = w;
1672
+ }
1673
+ if (sub_h > h) {
1674
+ sub_h = h;
1675
+ }
1676
+ uint8_t* dst = replicated_buffer_;
1677
+ for (int y = 0; y < sub_h; ++y) {
1678
+ memcpy(dst, rgb, 3 * sub_w);
1679
+ const uint8_t* const src0 = &dst[3 * (sub_w - 1)];
1680
+ for (int x = 3 * sub_w; x < 3 * w; x += 3) {
1681
+ memcpy(dst + x, src0, 3);
1682
+ }
1683
+ dst += 3 * w;
1684
+ rgb += rgb_step;
1685
+ }
1686
+ const uint8_t* dst0 = dst - 3 * w;
1687
+ for (int y = sub_h; y < h; ++y) {
1688
+ memcpy(dst, dst0, 3 * w);
1689
+ dst += 3 * w;
1690
+ }
1691
+ return replicated_buffer_;
1692
+ }
1693
+
1694
+ // TODO(skal): merge with above function? Probably slower...
1695
+ const uint8_t* Encoder::GetReplicatedYUVSamples(const uint8_t* in,
1696
+ int step,
1697
+ int sub_w, int sub_h,
1698
+ int w, int h) {
1699
+ assert(sub_w > 0 && sub_h > 0);
1700
+ if (sub_w > w) {
1701
+ sub_w = w;
1702
+ }
1703
+ if (sub_h > h) {
1704
+ sub_h = h;
1705
+ }
1706
+ uint8_t* out = replicated_buffer_;
1707
+ for (int y = 0; y < sub_h; ++y) {
1708
+ int x;
1709
+ for (x = 0; x < sub_w; ++x)
1710
+ out[x] = in[x];
1711
+ for (; x < w; ++x) {
1712
+ out[x] = out[sub_w - 1];
1713
+ }
1714
+ out += w;
1715
+ in += step;
1716
+ }
1717
+ const uint8_t* const out0 = out - w;
1718
+ for (int y = sub_h; y < h; ++y) {
1719
+ memcpy(out, out0, w);
1720
+ out += w;
1721
+ }
1722
+ return replicated_buffer_;
1723
+ }
1724
+
1725
+ ////////////////////////////////////////////////////////////////////////////////
1726
+ // sub-class for YUV 4:2:0 version
1727
+
1728
+ class Encoder420 : public Encoder {
1729
+ public:
1730
+ Encoder420(int W, int H, int step, const uint8_t* const rgb,
1731
+ ByteSink* const sink)
1732
+ : Encoder(W, H, step, rgb, sink) {}
1733
+ virtual ~Encoder420() {}
1734
+ virtual void InitComponents() {
1735
+ nb_comps_ = 3;
1736
+
1737
+ quant_idx_[0] = 0;
1738
+ quant_idx_[1] = 1;
1739
+ quant_idx_[2] = 1;
1740
+
1741
+ nb_blocks_[0] = 4;
1742
+ nb_blocks_[1] = 1;
1743
+ nb_blocks_[2] = 1;
1744
+ mcu_blocks_ = 6;
1745
+
1746
+ block_w_ = 16;
1747
+ block_h_ = 16;
1748
+ block_dims_[0] = 0x22;
1749
+ block_dims_[1] = 0x11;
1750
+ block_dims_[2] = 0x11;
1751
+ }
1752
+ virtual void GetSamples(int mb_x, int mb_y, bool clipped,
1753
+ int16_t* out_blocks) {
1754
+ const uint8_t* data = rgb_ + (3 * mb_x + mb_y * step_) * 16;
1755
+ int step = step_;
1756
+ if (clipped) {
1757
+ data = GetReplicatedSamples(data, step,
1758
+ W_ - mb_x * 16, H_ - mb_y * 16, 16, 16);
1759
+ step = 3 * 16;
1760
+ }
1761
+ get_yuv_block_(data, step, out_blocks);
1762
+ if (clipped) {
1763
+ AverageExtraLuma(W_ - mb_x * 16, H_ - mb_y * 16, out_blocks);
1764
+ }
1765
+ }
1766
+ };
1767
+
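Encoder420 above encodes 16x16 macroblocks of 6 blocks each (four luma, one U, one V). Using the same rounding as Encoder::Encode(), the resulting geometry can be sketched as follows (the helper name is made up):

#include <cstdio>

// Prints how many MCUs and 8x8 blocks a W x H image needs in 4:2:0 mode.
void Print420Geometry(int W, int H) {
  const int mcu_w = (W + 15) / 16;   // same as (W_ + block_w_ - 1) / block_w_
  const int mcu_h = (H + 15) / 16;
  const int blocks_per_mcu = 6;      // mcu_blocks_ for 4:2:0
  std::printf("%dx%d -> %d x %d MCUs, %d 8x8 blocks\n",
              W, H, mcu_w, mcu_h, mcu_w * mcu_h * blocks_per_mcu);
}

For instance, a 1920x1080 image maps to 120 x 68 MCUs, i.e. 48960 blocks of 8x8 coefficients.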
1768
+ ////////////////////////////////////////////////////////////////////////////////
1769
+ // sub-class for YUV 4:4:4 version
1770
+
1771
+ class Encoder444 : public Encoder {
1772
+ public:
1773
+ Encoder444(int W, int H, int step, const uint8_t* const rgb,
1774
+ ByteSink* const sink)
1775
+ : Encoder(W, H, step, rgb, sink) {
1776
+ SetYUVFormat(true);
1777
+ }
1778
+ virtual ~Encoder444() {}
1779
+ virtual void InitComponents() {
1780
+ nb_comps_ = 3;
1781
+
1782
+ quant_idx_[0] = 0;
1783
+ quant_idx_[1] = 1;
1784
+ quant_idx_[2] = 1;
1785
+
1786
+ nb_blocks_[0] = 1;
1787
+ nb_blocks_[1] = 1;
1788
+ nb_blocks_[2] = 1;
1789
+ mcu_blocks_ = 3;
1790
+
1791
+ block_w_ = 8;
1792
+ block_h_ = 8;
1793
+ block_dims_[0] = 0x11;
1794
+ block_dims_[1] = 0x11;
1795
+ block_dims_[2] = 0x11;
1796
+ }
1797
+ virtual void GetSamples(int mb_x, int mb_y, bool clipped, int16_t* out) {
1798
+ const uint8_t* data = rgb_ + (3 * mb_x + mb_y * step_) * 8;
1799
+ int step = step_;
1800
+ if (clipped) {
1801
+ data = GetReplicatedSamples(data, step,
1802
+ W_ - mb_x * 8, H_ - mb_y * 8, 8, 8);
1803
+ step = 3 * 8;
1804
+ }
1805
+ get_yuv_block_(data, step, out);
1806
+ }
1807
+ };
1808
+
1809
+ ////////////////////////////////////////////////////////////////////////////////
1810
+ // sub-class for the sharp YUV 4:2:0 version
1811
+
1812
+ class EncoderSharp420 : public Encoder420 {
1813
+ public:
1814
+ EncoderSharp420(int W, int H, int step, const uint8_t* const rgb,
1815
+ ByteSink* const sink)
1816
+ : Encoder420(W, H, step, rgb, sink), yuv_memory_(nullptr) {
1817
+ const int uv_w = (W + 1) >> 1;
1818
+ const int uv_h = (H + 1) >> 1;
1819
+ yuv_memory_ = Alloc<uint8_t>(W * H + 2 * uv_w * uv_h);
1820
+ if (yuv_memory_ == nullptr) return;
1821
+ y_plane_ = yuv_memory_;
1822
+ y_step_ = W;
1823
+ u_plane_ = yuv_memory_ + W * H;
1824
+ v_plane_ = u_plane_ + uv_w * uv_h;
1825
+ uv_step_ = uv_w;
1826
+ ApplySharpYUVConversion(rgb, W, H, step, y_plane_, u_plane_, v_plane_);
1827
+ }
1828
+ virtual ~EncoderSharp420() { Free(yuv_memory_); }
1829
+ virtual void GetSamples(int mb_x, int mb_y, bool clipped, int16_t* out);
1830
+
1831
+ protected:
1832
+ void GetLumaSamples(int mb_x, int mb_y, bool clipped, int16_t* out) {
1833
+ int step = y_step_;
1834
+ const uint8_t* Y1 = y_plane_ + (mb_x + mb_y * step) * 16;
1835
+ if (clipped) {
1836
+ Y1 = GetReplicatedYUVSamples(Y1, step,
1837
+ W_ - mb_x * 16, H_ - mb_y * 16, 16, 16);
1838
+ step = 16;
1839
+ }
1840
+ const uint8_t* Y2 = Y1 + 8 * step;
1841
+ for (int y = 8, n = 0; y > 0; --y) {
1842
+ for (int x = 0; x < 8; ++x, ++n) {
1843
+ out[n + 0 * 64] = Y1[x] - 128;
1844
+ out[n + 1 * 64] = Y1[x + 8] - 128;
1845
+ out[n + 2 * 64] = Y2[x] - 128;
1846
+ out[n + 3 * 64] = Y2[x + 8] - 128;
1847
+ }
1848
+ Y1 += step;
1849
+ Y2 += step;
1850
+ }
1851
+ if (clipped) {
1852
+ AverageExtraLuma(W_ - mb_x * 16, H_ - mb_y * 16, out);
1853
+ }
1854
+ }
1855
+
1856
+ private:
1857
+ uint8_t* y_plane_;
1858
+ int y_step_;
1859
+ uint8_t* u_plane_;
1860
+ uint8_t* v_plane_;
1861
+ int uv_step_;
1862
+ uint8_t* yuv_memory_;
1863
+ };
1864
+
1865
+ void EncoderSharp420::GetSamples(int mb_x, int mb_y,
1866
+ bool clipped, int16_t* out) {
1867
+ GetLumaSamples(mb_x, mb_y, clipped, out);
1868
+
1869
+ // Chroma
1870
+ const uint8_t* U = u_plane_ + (mb_x + mb_y * uv_step_) * 8;
1871
+ int step = uv_step_;
1872
+ if (clipped) {
1873
+ U = GetReplicatedYUVSamples(U, step,
1874
+ ((W_ + 1) >> 1) - mb_x * 8,
1875
+ ((H_ + 1) >> 1) - mb_y * 8, 8, 8);
1876
+ step = 8;
1877
+ }
1878
+ for (int y = 8, n = 0; y > 0; --y, U += step) {
1879
+ for (int x = 0; x < 8; ++x, ++n) {
1880
+ out[n + 4 * 64] = U[x] - 128;
1881
+ }
1882
+ }
1883
+ const uint8_t* V = v_plane_ + (mb_x + mb_y * uv_step_) * 8;
1884
+ step = uv_step_;
1885
+ if (clipped) {
1886
+ V = GetReplicatedYUVSamples(V, step,
1887
+ ((W_ + 1) >> 1) - mb_x * 8,
1888
+ ((H_ + 1) >> 1) - mb_y * 8, 8, 8);
1889
+ step = 8;
1890
+ }
1891
+ for (int y = 8, n = 0; y > 0; --y, V += step) {
1892
+ for (int x = 0; x < 8; ++x, ++n) {
1893
+ out[n + 5 * 64] = V[x] - 128;
1894
+ }
1895
+ }
1896
+ }
1897
+
1898
+ ////////////////////////////////////////////////////////////////////////////////
1899
+ // all-in-one factory to pick up the right encoder instance
1900
+
1901
+ Encoder* EncoderFactory(const uint8_t* rgb,
1902
+ int W, int H, int stride, SjpegYUVMode yuv_mode,
1903
+ ByteSink* const sink) {
1904
+ if (yuv_mode == SJPEG_YUV_AUTO) {
1905
+ yuv_mode = SjpegRiskiness(rgb, W, H, stride, nullptr);
1906
+ }
1907
+
1908
+ Encoder* enc = nullptr;
1909
+ if (yuv_mode == SJPEG_YUV_420) {
1910
+ enc = new (std::nothrow) Encoder420(W, H, stride, rgb, sink);
1911
+ } else if (yuv_mode == SJPEG_YUV_SHARP) {
1912
+ enc = new (std::nothrow) EncoderSharp420(W, H, stride, rgb, sink);
1913
+ } else {
1914
+ enc = new (std::nothrow) Encoder444(W, H, stride, rgb, sink);
1915
+ }
1916
+ if (enc == nullptr || !enc->Ok()) {
1917
+ delete enc;
1918
+ enc = nullptr;
1919
+ }
1920
+ return enc;
1921
+ }
1922
+
1923
+ } // namespace sjpeg
1924
+
1925
+ ////////////////////////////////////////////////////////////////////////////////
1926
+ // public plain-C functions
1927
+
1928
+ size_t SjpegEncode(const uint8_t* rgb, int width, int height, int stride,
1929
+ uint8_t** out_data, float quality, int method,
1930
+ SjpegYUVMode yuv_mode) {
1931
+ if (rgb == nullptr || out_data == nullptr) return 0;
1932
+ if (width <= 0 || height <= 0 || stride < 3 * width) return 0;
1933
+ *out_data = nullptr; // safety
1934
+
1935
+ MemorySink sink(width * height / 4);
1936
+ Encoder* const enc = EncoderFactory(rgb, width, height, stride, yuv_mode,
1937
+ &sink);
+ if (enc == nullptr) return 0; // EncoderFactory() may fail (e.g. out of memory)
1938
+ enc->SetQuality(quality);
1939
+ enc->SetCompressionMethod(method);
1940
+ size_t size = 0;
1941
+ *out_data = nullptr;
1942
+ if (enc->Encode()) sink.Release(out_data, &size);
1943
+ delete enc;
1944
+ return size;
1945
+ }
1946
+
1947
+ ////////////////////////////////////////////////////////////////////////////////
1948
+
1949
+ size_t SjpegCompress(const uint8_t* rgb, int width, int height, float quality,
1950
+ uint8_t** out_data) {
1951
+ return SjpegEncode(rgb, width, height, 3 * width, out_data,
1952
+ quality, 4, SJPEG_YUV_AUTO);
1953
+ }
1954
+
1955
+ void SjpegFreeBuffer(const uint8_t* buffer) {
1956
+ delete[] buffer;
1957
+ }
1958
+
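A hedged usage sketch for the plain functions above: compress a packed-RGB buffer (3 bytes per pixel, stride of 3 * width, as SjpegCompress() implies) and release the returned memory with SjpegFreeBuffer(). The public header name sjpeg.h is an assumption.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include "sjpeg.h"   // public API header (name assumed)

size_t CompressExample(const uint8_t* rgb, int width, int height) {
  uint8_t* jpeg_data = nullptr;
  const size_t jpeg_size = SjpegCompress(rgb, width, height, 75.f, &jpeg_data);
  if (jpeg_size == 0) return 0;      // encoding failed
  std::printf("encoded %zu bytes\n", jpeg_size);
  SjpegFreeBuffer(jpeg_data);        // releases the buffer allocated above
  return jpeg_size;
}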
1959
+ ////////////////////////////////////////////////////////////////////////////////
1960
+
1961
+ uint32_t SjpegVersion() {
1962
+ return SJPEG_VERSION;
1963
+ }
1964
+
1965
+ ////////////////////////////////////////////////////////////////////////////////
1966
+ // Parametrized call
1967
+
1968
+ EncoderParam::EncoderParam() : search_hook(nullptr), memory(nullptr) {
1969
+ Init(kDefaultQuality);
1970
+ }
1971
+
1972
+ EncoderParam::EncoderParam(float quality_factor)
1973
+ : search_hook(nullptr), memory(nullptr) {
1974
+ Init(quality_factor);
1975
+ }
1976
+
1977
+ void EncoderParam::Init(float quality_factor) {
1978
+ Huffman_compress = true;
1979
+ adaptive_quantization = true;
1980
+ use_trellis = false;
1981
+ yuv_mode = SJPEG_YUV_AUTO;
1982
+ quantization_bias = kDefaultBias;
1983
+ qdelta_max_luma = kDefaultDeltaMaxLuma;
1984
+ qdelta_max_chroma = kDefaultDeltaMaxChroma;
1985
+ adaptive_bias = false;
1986
+ SetLimitQuantization(false);
1987
+ min_quant_tolerance_ = 0;
1988
+ SetQuality(quality_factor);
1989
+ target_mode = TARGET_NONE;
1990
+ target_value = 0;
1991
+ passes = 1;
1992
+ tolerance = 1.;
1993
+ qmin = 0.;
1994
+ qmax = 100.;
1995
+ }
1996
+
1997
+ void EncoderParam::SetQuality(float quality_factor) {
1998
+ const float q = GetQFactor(quality_factor);
1999
+ sjpeg::SetQuantMatrix(kDefaultMatrices[0], q, quant_[0]);
2000
+ sjpeg::SetQuantMatrix(kDefaultMatrices[1], q, quant_[1]);
2001
+ }
2002
+
2003
+ void EncoderParam::SetQuantization(const uint8_t m[2][64],
2004
+ float reduction) {
2005
+ if (reduction <= 1.f) reduction = 1.f;
2006
+ if (m == nullptr) return;
2007
+ for (int c = 0; c < 2; ++c) {
2008
+ for (size_t i = 0; i < 64; ++i) {
2009
+ const int v = static_cast<int>(m[c][i] * 100. / reduction + .5);
2010
+ quant_[c][i] = (v > 255) ? 255u : (v < 1) ? 1u : v;
2011
+ }
2012
+ }
2013
+ }
2014
+
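For instance, with reduction = 50 an input entry of 16 becomes round(16 * 100 / 50) = 32 (coarser quantization), while reduction = 100 leaves the matrix unchanged; every result is clamped to the [1, 255] range.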
2015
+ void EncoderParam::SetLimitQuantization(bool limit_quantization,
2016
+ int min_quant_tolerance) {
2017
+ use_min_quant_ = limit_quantization;
2018
+ if (limit_quantization) SetMinQuantization(quant_, min_quant_tolerance);
2019
+ }
2020
+
2021
+ void EncoderParam::SetMinQuantization(const uint8_t m[2][64],
2022
+ int min_quant_tolerance) {
2023
+ use_min_quant_ = true;
2024
+ CopyQuantMatrix(m[0], min_quant_[0]);
2025
+ CopyQuantMatrix(m[1], min_quant_[1]);
2026
+ min_quant_tolerance_ = (min_quant_tolerance < 0) ? 0
2027
+ : (min_quant_tolerance > 100) ? 100
2028
+ : min_quant_tolerance;
2029
+ }
2030
+
2031
+ void EncoderParam::ResetMetadata() {
2032
+ iccp.clear();
2033
+ exif.clear();
2034
+ xmp.clear();
2035
+ app_markers.clear();
2036
+ }
2037
+
2038
+ bool Encoder::InitFromParam(const EncoderParam& param) {
2039
+ SetQuantMatrices(param.quant_);
2040
+ if (param.use_min_quant_) {
2041
+ SetMinQuantMatrices(param.min_quant_, param.min_quant_tolerance_);
2042
+ } else {
2043
+ SetDefaultMinQuantMatrices();
2044
+ }
2045
+
2046
+ int method = param.Huffman_compress ? 1 : 0;
2047
+ if (param.adaptive_quantization) method += 3;
2048
+ if (param.use_trellis) {
2049
+ method = (method == 4) ? 7 : (method == 6) ? 8 : method;
2050
+ }
2051
+
2052
+ SetCompressionMethod(method);
2053
+ SetQuantizationBias(param.quantization_bias, param.adaptive_bias);
2054
+ SetQuantizationDeltas(param.qdelta_max_luma, param.qdelta_max_chroma);
2055
+
2056
+ SetMetadata(param.iccp, Encoder::ICC);
2057
+ SetMetadata(param.exif, Encoder::EXIF);
2058
+ SetMetadata(param.xmp, Encoder::XMP);
2059
+ SetMetadata(param.app_markers, Encoder::MARKERS);
2060
+
2061
+ passes_ = (param.passes < 1) ? 1 : (param.passes > 20) ? 20 : param.passes;
2062
+ if (passes_ > 1) {
2063
+ use_extra_memory_ = true;
2064
+ reuse_run_levels_ = true;
2065
+ search_hook_ = (param.search_hook == nullptr) ? &default_hook_
2066
+ : param.search_hook;
2067
+ if (!search_hook_->Setup(param)) return false;
2068
+ }
2069
+
2070
+ memory_hook_ = (param.memory == nullptr) ? &kDefaultMemory : param.memory;
2071
+ return true;
2072
+ }
2073
+
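For reference, the mapping from the three boolean parameters to the integer 'method' chosen by InitFromParam() above can be restated as a small pure function (illustration only, not part of the API):

int MethodFromFlags(bool huffman_compress, bool adaptive_quant, bool trellis) {
  int method = huffman_compress ? 1 : 0;
  if (adaptive_quant) method += 3;
  if (trellis) method = (method == 4) ? 7 : (method == 6) ? 8 : method;
  return method;
}

So the defaults from EncoderParam::Init() (Huffman_compress and adaptive_quantization on, trellis off) select method 4, and enabling use_trellis on top of them selects method 7.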
2074
+ bool sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
2075
+ const EncoderParam& param, ByteSink* sink) {
2076
+ if (rgb == nullptr || sink == nullptr) return false;
2077
+ if (width <= 0 || height <= 0 || stride < 3 * width) return false;
2078
+
2079
+ Encoder* const enc = EncoderFactory(rgb, width, height, stride,
2080
+ param.yuv_mode, sink);
2081
+ const bool ok = (enc != nullptr) &&
2082
+ enc->InitFromParam(param) &&
2083
+ enc->Encode();
2084
+ delete enc;
2085
+ return ok;
2086
+ }
2087
+
2088
+ size_t sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
2089
+ const EncoderParam& param, uint8_t** out_data) {
2090
+ MemorySink sink(width * height / 4); // estimation of output size
2091
+ if (!sjpeg::Encode(rgb, width, height, stride, param, &sink)) return 0;
2092
+ size_t size;
2093
+ sink.Release(out_data, &size);
2094
+ return size;
2095
+ }
2096
+
2097
+ ////////////////////////////////////////////////////////////////////////////////
2098
+ // std::string variants
2099
+
2100
+ bool sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
2101
+ const EncoderParam& param, std::string* output) {
2102
+ if (output == nullptr) return false;
2103
+ output->clear();
2104
+ output->reserve(width * height / 4);
2105
+ StringSink sink(output);
2106
+ return Encode(rgb, width, height, stride, param, &sink);
2107
+ }
2108
+
2109
+ bool SjpegCompress(const uint8_t* rgb, int width, int height,
2110
+ float quality, std::string* output) {
2111
+ EncoderParam param;
2112
+ param.SetQuality(quality);
2113
+ return Encode(rgb, width, height, 3 * width, param, output);
2114
+ }
2115
+
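A hedged usage sketch of the parametrized C++ entry points: configure an EncoderParam and encode into a std::string. As in this file, the public sjpeg.h header and a using-directive for namespace sjpeg are assumed; the fields touched here are the ones initialized by EncoderParam::Init() above.

#include <cstdint>
#include <string>
#include "sjpeg.h"           // public API header (name assumed)
using namespace sjpeg;       // as at the top of this file

bool EncodeWithParams(const uint8_t* rgb, int width, int height,
                      std::string* out) {
  EncoderParam param(80.f);          // quality factor 80
  param.use_trellis = true;          // slower search, smaller output
  param.yuv_mode = SJPEG_YUV_420;    // force 4:2:0 chroma subsampling
  return sjpeg::Encode(rgb, width, height, 3 * width, param, out);
}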
2116
+ ////////////////////////////////////////////////////////////////////////////////
2117
+
2118
+ bool SjpegDimensions(const std::string& jpeg_data,
2119
+ int* width, int* height, int* is_yuv420) {
2120
+ return SjpegDimensions(
2121
+ reinterpret_cast<const uint8_t*>(jpeg_data.data()),
2122
+ jpeg_data.size(), width, height, is_yuv420);
2123
+ }
2124
+
2125
+ int SjpegFindQuantizer(const std::string& jpeg_data,
2126
+ uint8_t quant[2][64]) {
2127
+ return SjpegFindQuantizer(
2128
+ reinterpret_cast<const uint8_t*>(jpeg_data.data()), jpeg_data.size(),
2129
+ quant);
2130
+ }
2131
+
2132
+ ////////////////////////////////////////////////////////////////////////////////
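Finally, a short sketch exercising the std::string helpers above on an already-encoded stream. Reading SjpegFindQuantizer()'s return value as the number of matrices found is an assumption here.

#include <cstdint>
#include <cstdio>
#include <string>

void InspectJpeg(const std::string& jpeg_data) {
  int width = 0, height = 0, is_yuv420 = 0;
  if (SjpegDimensions(jpeg_data, &width, &height, &is_yuv420)) {
    std::printf("%d x %d, 4:2:0: %s\n", width, height,
                is_yuv420 ? "yes" : "no");
  }
  uint8_t quant[2][64];
  const int nb_found = SjpegFindQuantizer(jpeg_data, quant);
  std::printf("quantization matrices found: %d\n", nb_found);
}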