sjpeg 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/sjpeg/enc.cc ADDED
@@ -0,0 +1,2132 @@
1
+ // Copyright 2017 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Fast and simple JPEG encoder
16
+ //
17
+ // Author: Skal (pascal.massimino@gmail.com)
18
+
19
+ #include <stdlib.h>
20
+ #include <math.h>
21
+ #include <float.h> // for FLT_MAX
22
+ #include <stdint.h>
23
+
24
+ #define SJPEG_NEED_ASM_HEADERS
25
+ #include "sjpegi.h"
26
+
27
+ using namespace sjpeg;
28
+
29
+ // Some general default values:
30
+ static const float kDefaultQuality = 75.f;
31
+ static const int kDefaultMethod = 4;
32
+ // Rounding bias for AC coefficients, as 8bit fixed point.
33
+ // A default value 0x78 leans toward filesize reduction.
34
+ static const int32_t kDefaultBias = 0x78;
35
+ // for adaptive quantization:
36
+ static const int kDefaultDeltaMaxLuma = 12;
37
+ static const int kDefaultDeltaMaxChroma = 1;
38
+
39
+ // finer tuning of perceptual optimizations:
40
+
41
+ // Minimum average number of entries per bin required for performing histogram-
42
+ // based optimization. Below this limit, the channel's histogram is declared
43
+ // under-populated and the corresponding optimization skipped.
44
+ static double kDensityThreshold = 0.5;
45
+ // Rejection limit on the correlation factor when extrapolating the distortion
46
+ // from histograms. If the least-square fit has a squared correlation factor
47
+ // less than this threshold, the corresponding quantization scale will be
48
+ // kept unchanged.
49
+ static double kCorrelationThreshold = 0.5;
50
+ // Bit-map of channels to omit during quantization matrix optimization.
51
+ // If the bit 'i + 8 * j' is set in this bit field, the matrix entry at
52
+ // position (i,j) will be kept unchanged during optimization.
53
+ // The default value is 0x103 = 1 + 2 + 256: the 3 entries in the top-left
54
+ // corner (with the lowest frequencies) are not optimized, since doing so can lead to
55
+ // visual degradation of smooth gradients.
56
+ static const uint64_t kOmittedChannels = 0x0000000000000103ULL;
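// (Illustration, not part of the original source: 0x103 has bits 0, 1 and 8
// set, i.e. 'i + 8 * j' for (i, j) = (0, 0), (1, 0) and (0, 1) -- the DC
// entry and its two immediate low-frequency neighbors in the 8x8 matrix.)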
57
+
58
+ ////////////////////////////////////////////////////////////////////////////////
59
+
60
+ namespace sjpeg {
61
+
62
+ const uint8_t kZigzag[64] = {
63
+ 0, 1, 8, 16, 9, 2, 3, 10,
64
+ 17, 24, 32, 25, 18, 11, 4, 5,
65
+ 12, 19, 26, 33, 40, 48, 41, 34,
66
+ 27, 20, 13, 6, 7, 14, 21, 28,
67
+ 35, 42, 49, 56, 57, 50, 43, 36,
68
+ 29, 22, 15, 23, 30, 37, 44, 51,
69
+ 58, 59, 52, 45, 38, 31, 39, 46,
70
+ 53, 60, 61, 54, 47, 55, 62, 63,
71
+ };
72
+
73
+ const uint8_t kDefaultMatrices[2][64] = {
74
+ // these are the default luma/chroma matrices (JPEG spec section K.1)
75
+ { 16, 11, 10, 16, 24, 40, 51, 61,
76
+ 12, 12, 14, 19, 26, 58, 60, 55,
77
+ 14, 13, 16, 24, 40, 57, 69, 56,
78
+ 14, 17, 22, 29, 51, 87, 80, 62,
79
+ 18, 22, 37, 56, 68, 109, 103, 77,
80
+ 24, 35, 55, 64, 81, 104, 113, 92,
81
+ 49, 64, 78, 87, 103, 121, 120, 101,
82
+ 72, 92, 95, 98, 112, 100, 103, 99 },
83
+ { 17, 18, 24, 47, 99, 99, 99, 99,
84
+ 18, 21, 26, 66, 99, 99, 99, 99,
85
+ 24, 26, 56, 99, 99, 99, 99, 99,
86
+ 47, 66, 99, 99, 99, 99, 99, 99,
87
+ 99, 99, 99, 99, 99, 99, 99, 99,
88
+ 99, 99, 99, 99, 99, 99, 99, 99,
89
+ 99, 99, 99, 99, 99, 99, 99, 99,
90
+ 99, 99, 99, 99, 99, 99, 99, 99 }
91
+ };
92
+
93
+ float GetQFactor(float q) {
94
+ // we use the same mapping as jpeg-6b, for consistency
95
+ q = (q <= 0) ? 5000 : (q < 50) ? 5000 / q : (q < 100) ? 2 * (100 - q) : 0;
96
+ // We floor-round to integer here just to preserve compatibility with jpeg6b.
97
+ return floorf(q);
98
+ }
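// (Worked examples of the mapping above, for illustration only -- not part of
// the original source: quality 75 -> 2 * (100 - 75) = 50, quality 50 ->
// 2 * (100 - 50) = 100 (the unscaled spec matrices), quality 10 ->
// 5000 / 10 = 500, and quality >= 100 -> 0, in which case SetQuantMatrix()
// below clamps every entry to 1.)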
99
+
100
+ void CopyQuantMatrix(const uint8_t in[64], uint8_t out[64]) {
101
+ memcpy(out, in, 64 * sizeof(out[0]));
102
+ }
103
+
104
+ void SetQuantMatrix(const uint8_t in[64], float q_factor, uint8_t out[64]) {
105
+ if (in == nullptr || out == nullptr) return;
106
+ q_factor /= 100.f;
107
+ for (size_t i = 0; i < 64; ++i) {
108
+ const int v = static_cast<int>(in[i] * q_factor + .5f);
109
+ // clamp to prevent illegal quantizer values
110
+ out[i] = (v < 1) ? 1 : (v > 255) ? 255u : v;
111
+ }
112
+ }
113
+
114
+ void SetMinQuantMatrix(const uint8_t m[64], uint8_t out[64], int tolerance) {
115
+ assert(out != nullptr && m != nullptr);
116
+ for (size_t i = 0; i < 64; ++i) {
117
+ const int v = static_cast<int>(m[i] * (256 - tolerance) >> 8);
118
+ out[i] = (v < 1) ? 1u : (v > 255) ? 255u : v;
119
+ }
120
+ }
121
+
122
+ void SetDefaultMinQuantMatrix(uint8_t out[64]) {
123
+ assert(out != nullptr);
124
+ for (size_t i = 0; i < 64; ++i) out[i] = 1u;
125
+ }
126
+
127
+ ////////////////////////////////////////////////////////////////////////////////
128
+ // Default memory manager (singleton)
129
+
130
+ static struct DefaultMemory : public MemoryManager {
131
+ public:
132
+ virtual ~DefaultMemory() {}
133
+ virtual void* Alloc(size_t size) { return malloc(size); }
134
+ virtual void Free(void* const ptr) { free(ptr); }
135
+ } kDefaultMemory;
136
+
137
+ ////////////////////////////////////////////////////////////////////////////////
138
+ // Encoder main class
139
+
140
+ Encoder::Encoder(int W, int H, int step, const uint8_t* const rgb,
141
+ ByteSink* const sink)
142
+ : W_(W), H_(H), step_(step),
143
+ rgb_(rgb),
144
+ ok_(true),
145
+ bw_(sink),
146
+ in_blocks_base_(nullptr),
147
+ in_blocks_(nullptr),
148
+ have_coeffs_(false),
149
+ all_run_levels_(nullptr),
150
+ nb_run_levels_(0),
151
+ max_run_levels_(0),
152
+ qdelta_max_luma_(kDefaultDeltaMaxLuma),
153
+ qdelta_max_chroma_(kDefaultDeltaMaxChroma),
154
+ passes_(1),
155
+ search_hook_(nullptr),
156
+ memory_hook_(&kDefaultMemory) {
157
+ SetCompressionMethod(kDefaultMethod);
158
+ SetQuality(kDefaultQuality);
159
+ SetYUVFormat(false);
160
+ SetQuantizationBias(kDefaultBias, false);
161
+ SetDefaultMinQuantMatrices();
162
+ InitializeStaticPointers();
163
+ memset(dc_codes_, 0, sizeof(dc_codes_)); // safety
164
+ memset(ac_codes_, 0, sizeof(ac_codes_));
165
+ }
166
+
167
+ Encoder::~Encoder() {
168
+ Free(all_run_levels_);
169
+ DesallocateBlocks(); // clean up leftovers in case we had an error
170
+ }
171
+
172
+ ////////////////////////////////////////////////////////////////////////////////
173
+
174
+ void Encoder::SetQuality(float q) {
175
+ q = GetQFactor(q);
176
+ SetQuantMatrix(kDefaultMatrices[0], q, quants_[0].quant_);
177
+ SetQuantMatrix(kDefaultMatrices[1], q, quants_[1].quant_);
178
+ }
179
+
180
+ void Encoder::SetQuantMatrices(const uint8_t m[2][64]) {
181
+ SetQuantMatrix(m[0], 100, quants_[0].quant_);
182
+ SetQuantMatrix(m[1], 100, quants_[1].quant_);
183
+ }
184
+
185
+ void Encoder::SetMinQuantMatrices(const uint8_t m[2][64], int tolerance) {
186
+ SetMinQuantMatrix(m[0], quants_[0].min_quant_, tolerance);
187
+ SetMinQuantMatrix(m[1], quants_[1].min_quant_, tolerance);
188
+ }
189
+
190
+ void Encoder::SetDefaultMinQuantMatrices() {
191
+ SetDefaultMinQuantMatrix(quants_[0].min_quant_);
192
+ SetDefaultMinQuantMatrix(quants_[1].min_quant_);
193
+ }
194
+
195
+ void Encoder::SetCompressionMethod(int method) {
196
+ assert(method >= 0 && method <= 8);
197
+ use_adaptive_quant_ = (method >= 3);
198
+ optimize_size_ = (method != 0) && (method != 3);
199
+ use_extra_memory_ = (method == 3) || (method == 4) || (method == 7);
200
+ reuse_run_levels_ = (method == 1) || (method == 4) || (method == 5)
201
+ || (method >= 7);
202
+ use_trellis_ = (method >= 7);
203
+ }
204
+
205
+ void Encoder::SetMetadata(const std::string& data, MetadataType type) {
206
+ switch (type) {
207
+ case ICC: iccp_ = data; break;
208
+ case EXIF: exif_ = data; break;
209
+ case XMP: xmp_ = data; break;
210
+ default:
211
+ case MARKERS: app_markers_ = data; break;
212
+ }
213
+ }
214
+
215
+ void Encoder::SetQuantizationBias(int bias, bool use_adaptive) {
216
+ assert(bias >= 0 && bias <= 255);
217
+ q_bias_ = bias;
218
+ adaptive_bias_ = use_adaptive;
219
+ }
220
+
221
+ void Encoder::SetQuantizationDeltas(int qdelta_luma, int qdelta_chroma) {
222
+ assert(qdelta_luma >= 0 && qdelta_luma <= 255);
223
+ assert(qdelta_chroma >= 0 && qdelta_chroma <= 255);
224
+ qdelta_max_luma_ = qdelta_luma;
225
+ qdelta_max_chroma_ = qdelta_chroma;
226
+ }
227
+
228
+ ////////////////////////////////////////////////////////////////////////////////
229
+ // CPU support
230
+
231
+ extern bool ForceSlowCImplementation;
232
+ bool ForceSlowCImplementation = false; // undocumented! for tests.
233
+
234
+ bool SupportsSSE2() {
235
+ if (ForceSlowCImplementation) return false;
236
+ #if defined(SJPEG_USE_SSE2)
237
+ return true;
238
+ #endif
239
+ return false;
240
+ }
241
+
242
+ bool SupportsNEON() {
243
+ if (ForceSlowCImplementation) return false;
244
+ #if defined(SJPEG_USE_NEON)
245
+ return true;
246
+ #endif
247
+ return false;
248
+ }
249
+
250
+ ////////////////////////////////////////////////////////////////////////////////
251
+ // static pointers to architecture-dependent implementations
252
+
253
+ Encoder::QuantizeErrorFunc Encoder::quantize_error_ = nullptr;
254
+ Encoder::QuantizeBlockFunc Encoder::quantize_block_ = nullptr;
255
+ void (*Encoder::fDCT_)(int16_t* in, int num_blocks) = nullptr;
256
+ Encoder::StoreHistoFunc Encoder::store_histo_ = nullptr;
257
+ RGBToYUVBlockFunc Encoder::get_yuv444_block_ = nullptr;
258
+
259
+ void Encoder::InitializeStaticPointers() {
260
+ if (fDCT_ == nullptr) {
261
+ store_histo_ = GetStoreHistoFunc();
262
+ quantize_block_ = GetQuantizeBlockFunc();
263
+ quantize_error_ = GetQuantizeErrorFunc();
264
+ fDCT_ = GetFdct();
265
+ get_yuv444_block_ = GetBlockFunc(true);
266
+ }
267
+ }
268
+
269
+ ////////////////////////////////////////////////////////////////////////////////
270
+ // memory and internal buffer management. We grow on demand.
271
+
272
+ bool Encoder::SetError() {
273
+ ok_ = false;
274
+ return false;
275
+ }
276
+
277
+ bool Encoder::CheckBuffers() {
278
+ // maximum macroblock size, worst-case, is 24bits*64*6 coeffs = 1152bytes
279
+ ok_ = ok_ && bw_.Reserve(2048);
280
+ if (!ok_) return false;
281
+
282
+ if (reuse_run_levels_) {
283
+ if (nb_run_levels_ + 6*64 > max_run_levels_) {
284
+ // need to grow storage for run/levels
285
+ const size_t new_size = max_run_levels_ ? max_run_levels_ * 2 : 8192;
286
+ RunLevel* const new_rl = Alloc<RunLevel>(new_size);
287
+ if (new_rl == nullptr) return false;
288
+ if (nb_run_levels_ > 0) {
289
+ memcpy(new_rl, all_run_levels_,
290
+ nb_run_levels_ * sizeof(new_rl[0]));
291
+ }
292
+ Free(all_run_levels_);
293
+ all_run_levels_ = new_rl;
294
+ max_run_levels_ = new_size;
295
+ assert(nb_run_levels_ + 6 * 64 <= max_run_levels_);
296
+ }
297
+ }
298
+ return true;
299
+ }
300
+
301
+ bool Encoder::AllocateBlocks(size_t num_blocks) {
302
+ assert(in_blocks_ == nullptr);
303
+ have_coeffs_ = false;
304
+ const size_t size = num_blocks * 64 * sizeof(*in_blocks_);
305
+ in_blocks_base_ = Alloc<uint8_t>(size + ALIGN_CST);
306
+ if (in_blocks_base_ == nullptr) return false;
307
+ in_blocks_ = reinterpret_cast<int16_t*>(
308
+ (ALIGN_CST + reinterpret_cast<uintptr_t>(in_blocks_base_)) & ~ALIGN_CST);
309
+ return true;
310
+ }
311
+
312
+ void Encoder::DesallocateBlocks() {
313
+ Free(in_blocks_base_);
314
+ in_blocks_base_ = nullptr;
315
+ in_blocks_ = nullptr; // sanity
316
+ }
317
+
318
+ ////////////////////////////////////////////////////////////////////////////////
319
+
320
+ #define FP_BITS 16 // fractional precision for fixed-point divisors
321
+ #define AC_BITS 4 // extra precision bits from fdct's scaling
322
+ #define BIAS_DC 0x80 // neutral bias for DC (mandatory!)
323
+
324
+ // divide-by-multiply helper macros
325
+ #define MAKE_INV_QUANT(Q) (((1u << FP_BITS) + (Q) / 2) / (Q))
326
+ #define DIV_BY_MULT(A, M) (((A) * (M)) >> FP_BITS)
327
+ #define QUANTIZE(A, M, B) (DIV_BY_MULT((A) + (B), (M)) >> AC_BITS)
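// (Numeric sanity check of the macros above -- illustrative only, not part of
// the original source: with FP_BITS = 16 and q = 16, MAKE_INV_QUANT(16) =
// (65536 + 8) / 16 = 4096; for a scaled fdct output A = 1600 (100 << AC_BITS)
// and a zero bias, QUANTIZE(1600, 4096, 0) = ((1600 * 4096) >> 16) >> 4 =
// 100 >> 4 = 6, matching the exact integer result (1600 / 16) >> AC_BITS.)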
328
+
329
+ void Encoder::FinalizeQuantMatrix(Quantizer* const q, int q_bias) {
330
+ // first, clamp the quant matrix:
331
+ for (size_t i = 0; i < 64; ++i) {
332
+ if (q->quant_[i] < q->min_quant_[i]) q->quant_[i] = q->min_quant_[i];
333
+ }
334
+ // Special case! for v=1 we can't represent the multiplier with 16b precision.
335
+ // So, instead we max out the multiplier to 0xffffu, and twist the bias to the
336
+ // value 0x80. The overall precision isn't affected: the result is bit-exact
337
+ // over our working range.
338
+ // Note that quant=1 can start appearing at quality as low as 93.
339
+ const uint16_t bias_1 = 0x80;
340
+ const uint16_t iquant_1 = 0xffffu;
341
+ for (size_t i = 0; i < 64; ++i) {
342
+ const uint16_t v = q->quant_[i];
343
+ const uint16_t iquant = (v == 1) ? iquant_1 : MAKE_INV_QUANT(v);
344
+ const uint16_t bias = (v == 1) ? bias_1 : (i == 0) ? BIAS_DC : q_bias;
345
+ const uint16_t ibias = (((bias * v) << AC_BITS) + 128) >> 8;
346
+ const uint16_t qthresh =
347
+ ((1 << (FP_BITS + AC_BITS)) + iquant - 1) / iquant - ibias;
348
+ q->bias_[i] = ibias;
349
+ q->iquant_[i] = iquant;
350
+ q->qthresh_[i] = qthresh;
351
+ assert(QUANTIZE(qthresh, iquant, ibias) > 0);
352
+ assert(QUANTIZE(qthresh - 1, iquant, ibias) == 0);
353
+ }
354
+ }
355
+
356
+ void Encoder::SetCostCodes(int idx) {
357
+ quants_[idx].codes_ = ac_codes_[idx];
358
+ }
359
+
360
+ ////////////////////////////////////////////////////////////////////////////////
361
+ // standard Huffman tables, as per JPEG standard section K.3.
362
+
363
+ static const uint8_t kDCSyms[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
364
+ static const uint8_t kACSyms[2][162] = {
365
+ { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
366
+ 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
367
+ 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
368
+ 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
369
+ 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
370
+ 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
371
+ 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
372
+ 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
373
+ 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
374
+ 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
375
+ 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
376
+ 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
377
+ 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
378
+ 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
379
+ 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
380
+ 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
381
+ 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
382
+ 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
383
+ 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
384
+ 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
385
+ 0xf9, 0xfa },
386
+ { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
387
+ 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
388
+ 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
389
+ 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
390
+ 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
391
+ 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
392
+ 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
393
+ 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
394
+ 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
395
+ 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
396
+ 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
397
+ 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
398
+ 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
399
+ 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
400
+ 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
401
+ 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
402
+ 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
403
+ 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
404
+ 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
405
+ 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
406
+ 0xf9, 0xfa }
407
+ };
408
+
409
+ static const HuffmanTable kHuffmanTables[4] = {
410
+ { { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 }, kDCSyms, 12 },
411
+ { { 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }, kDCSyms, 12 },
412
+ { { 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 125 }, kACSyms[0], 162 },
413
+ { { 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 119 }, kACSyms[1], 162 }
414
+ };
415
+
416
+ ////////////////////////////////////////////////////////////////////////////////
417
+ // This function generates a map from symbols to code + len stored in a packed
418
+ // way (lower 16bit is the length, upper 16bit is the VLC).
419
+ // The input is a JPEG-like description of the symbols:
420
+ // - bits[i] stores the number of codes having length i + 1.
421
+ // - symbols[] contains the symbols' map, in increasing bit-length order.
422
+ // No check is performed on the validity of symbols[]'s content.
423
+ // The values of tab[] not referring to an actual symbol will remain unchanged.
424
+ // Returns the number of symbols used (that is: sum{bits[i]})
425
+
426
+ static int BuildHuffmanTable(const uint8_t bits[16], const uint8_t* symbols,
427
+ uint32_t* const tab) {
428
+ uint32_t code = 0;
429
+ int nb = 0;
430
+ for (int nb_bits = 1; nb_bits <= 16; ++nb_bits, code <<= 1) {
431
+ int n = bits[nb_bits - 1]; // number of codes for that given nb_bits
432
+ nb += n;
433
+ while (n-- > 0) {
434
+ const int symbol = *symbols++;
435
+ tab[symbol] = (code << 16) | nb_bits;
436
+ ++code;
437
+ }
438
+ }
439
+ return nb;
440
+ }
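// (Illustration, not part of the original source: with the standard luma DC
// table above -- bits[] = { 0, 1, 5, ... } -- symbol 0 receives the single
// 2-bit code '00', so tab[0] ends up as (0x0000 << 16) | 2; such packed
// entries are later written out via bw_.PutPackedCode() in CodeBlock().)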
441
+
442
+ ////////////////////////////////////////////////////////////////////////////////
443
+
444
+ void Encoder::InitCodes(bool only_ac) {
445
+ const int nb_tables = (nb_comps_ == 1 ? 1 : 2);
446
+ for (int c = 0; c < nb_tables; ++c) { // luma, chroma
447
+ for (int type = (only_ac ? 1 : 0); type <= 1; ++type) {
448
+ const HuffmanTable* const h = Huffman_tables_[type * 2 + c];
449
+ const int nb_syms = BuildHuffmanTable(h->bits_, h->syms_,
450
+ type == 1 ? ac_codes_[c]
451
+ : dc_codes_[c]);
452
+ assert(nb_syms == h->nb_syms_);
453
+ (void)nb_syms;
454
+ }
455
+ }
456
+ }
457
+
458
+ ////////////////////////////////////////////////////////////////////////////////
459
+ // Quantize coefficients and pseudo-code coefficients
460
+
461
+ static int CalcLog2(int v) {
462
+ #if defined(__GNUC__) && \
463
+ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
464
+ return 32 - __builtin_clz(v);
465
+ #else
466
+ const int kLog2[16] = {
467
+ 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
468
+ assert(v > 0 && v < (1 << 12));
469
+ return (v & ~0xff) ? 8 + kLog2[v >> 8] :
470
+ (v & ~0x0f) ? 4 + kLog2[v >> 4] :
471
+ 0 + kLog2[v];
472
+ #endif
473
+ }
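// (Illustration, not part of the original source: CalcLog2() returns the
// number of significant bits rather than the plain log2, e.g. CalcLog2(1) = 1,
// CalcLog2(3) = 2, CalcLog2(255) = 8 -- exactly the JPEG size category of a
// non-zero magnitude.)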
474
+
475
+ uint16_t Encoder::GenerateDCDiffCode(int DC, int* const DC_predictor) {
476
+ const int diff = DC - *DC_predictor;
477
+ *DC_predictor = DC;
478
+ if (diff == 0) {
479
+ return 0;
480
+ }
481
+ int suff, n;
482
+ if (diff < 0) {
483
+ n = CalcLog2(-diff);
484
+ suff = (diff - 1) & ((1 << n) - 1);
485
+ } else {
486
+ n = CalcLog2(diff);
487
+ suff = diff;
488
+ }
489
+ assert((suff & 0xf000) == 0);
490
+ assert(n < 12);
491
+ return n | (suff << 4);
492
+ }
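// (Worked example, not part of the original source: a DC difference of -3
// falls in size category n = CalcLog2(3) = 2, and the suffix is
// (diff - 1) & 3 = (-4) & 3 = 0, so the function returns 2 | (0 << 4) = 2:
// the category in the low nibble, and the appended bits '00' (JPEG's
// encoding of -3 in category 2) above it.)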
493
+
494
+ ////////////////////////////////////////////////////////////////////////////////
495
+ // various implementation of histogram collection
496
+
497
+ #if defined(SJPEG_USE_SSE2)
498
+ // Load eight 16b-words from *src.
499
+ #define LOAD_16(src) _mm_loadu_si128(reinterpret_cast<const __m128i*>(src))
500
+ // Store eight 16b-words into *dst
501
+ #define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))
502
+
503
+ static int QuantizeBlockSSE2(const int16_t in[64], int idx,
504
+ const Quantizer* const Q,
505
+ DCTCoeffs* const out, RunLevel* const rl) {
506
+ const uint16_t* const bias = Q->bias_;
507
+ const uint16_t* const iquant = Q->iquant_;
508
+ int prev = 1;
509
+ int nb = 0;
510
+ int16_t tmp[64], masked[64];
511
+ for (int i = 0; i < 64; i += 8) {
512
+ const __m128i m_bias = LOAD_16(bias + i);
513
+ const __m128i m_mult = LOAD_16(iquant + i);
514
+ const __m128i A = LOAD_16(in + i); // A = in[i]
515
+ const __m128i B = _mm_srai_epi16(A, 15); // sign extract
516
+ const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B); // abs(A)
517
+ const __m128i D = _mm_adds_epi16(C, m_bias); // v' = v + bias
518
+ const __m128i E = _mm_mulhi_epu16(D, m_mult); // (v' * iq) >> 16
519
+ const __m128i F = _mm_srli_epi16(E, AC_BITS); // = QUANTIZE(...)
520
+ const __m128i G = _mm_xor_si128(F, B); // v ^ mask
521
+ STORE_16(F, tmp + i);
522
+ STORE_16(G, masked + i);
523
+ }
524
+ for (int i = 1; i < 64; ++i) {
525
+ const int j = kZigzag[i];
526
+ const int v = tmp[j];
527
+ if (v > 0) {
528
+ const int n = CalcLog2(v);
529
+ const uint16_t code = masked[j] & ((1 << n) - 1);
530
+ rl[nb].level_ = (code << 4) | n;
531
+ rl[nb].run_ = i - prev;
532
+ prev = i + 1;
533
+ ++nb;
534
+ }
535
+ }
536
+ const int dc = (in[0] < 0) ? -tmp[0] : tmp[0];
537
+ out->idx_ = idx;
538
+ out->last_ = prev - 1;
539
+ out->nb_coeffs_ = nb;
540
+ return dc;
541
+ }
542
+ #undef LOAD_16
543
+ #undef STORE_16
544
+
545
+ #elif defined(SJPEG_USE_NEON)
546
+ static int QuantizeBlockNEON(const int16_t in[64], int idx,
547
+ const Quantizer* const Q,
548
+ DCTCoeffs* const out, RunLevel* const rl) {
549
+ const uint16_t* const bias = Q->bias_;
550
+ const uint16_t* const iquant = Q->iquant_;
551
+ int prev = 1;
552
+ int nb = 0;
553
+ uint16_t tmp[64], masked[64];
554
+ for (int i = 0; i < 64; i += 8) {
555
+ const uint16x8_t m_bias = vld1q_u16(bias + i);
556
+ const uint16x8_t m_mult = vld1q_u16(iquant + i);
557
+ const int16x8_t A = vld1q_s16(in + i); // in[i]
558
+ const uint16x8_t B = vreinterpretq_u16_s16(vabsq_s16(A)); // abs(in)
559
+ const int16x8_t sign = vshrq_n_s16(A, 15); // sign
560
+ const uint16x8_t C = vaddq_u16(B, m_bias); // + bias
561
+ const uint32x4_t D0 = vmull_u16(vget_low_u16(C), vget_low_u16(m_mult));
562
+ const uint32x4_t D1 = vmull_u16(vget_high_u16(C), vget_high_u16(m_mult));
563
+ // collect hi-words of the 32b mult result using 'unzip'
564
+ const uint16x8x2_t E = vuzpq_u16(vreinterpretq_u16_u32(D0),
565
+ vreinterpretq_u16_u32(D1));
566
+ const uint16x8_t F = vshrq_n_u16(E.val[1], AC_BITS);
567
+ const uint16x8_t G = veorq_u16(F, vreinterpretq_u16_s16(sign)); // v ^ mask
568
+ vst1q_u16(tmp + i, F);
569
+ vst1q_u16(masked + i, G);
570
+ }
571
+ for (int i = 1; i < 64; ++i) {
572
+ const int j = kZigzag[i];
573
+ const int v = tmp[j];
574
+ if (v > 0) {
575
+ const int n = CalcLog2(v);
576
+ const uint16_t code = masked[j] & ((1 << n) - 1);
577
+ rl[nb].level_ = (code << 4) | n;
578
+ rl[nb].run_ = i - prev;
579
+ prev = i + 1;
580
+ ++nb;
581
+ }
582
+ }
583
+ const int dc = (in[0] < 0) ? -tmp[0] : tmp[0];
584
+ out->idx_ = idx;
585
+ out->last_ = prev - 1;
586
+ out->nb_coeffs_ = nb;
587
+ return dc;
588
+ }
589
+ #endif // SJPEG_USE_NEON
590
+
591
+ static int QuantizeBlock(const int16_t in[64], int idx,
592
+ const Quantizer* const Q,
593
+ DCTCoeffs* const out, RunLevel* const rl) {
594
+ const uint16_t* const bias = Q->bias_;
595
+ const uint16_t* const iquant = Q->iquant_;
596
+ int prev = 1;
597
+ int nb = 0;
598
+ // This function is speed-critical, so we're using bit masks
599
+ // to extract absolute values, instead of sign tests.
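// (Reminder of the identity used below, added for clarity: for a 32-bit v,
// mask = v >> 31 is 0 when v >= 0 and -1 when v < 0, so (v ^ mask) - mask
// yields abs(v) without a branch; the same mask is reused as 'v ^ mask' to
// form the one's-complement pattern JPEG expects for negative coefficients.)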
600
+ const uint16_t* const qthresh = Q->qthresh_;
601
+ for (int i = 1; i < 64; ++i) {
602
+ const int j = kZigzag[i];
603
+ int v = in[j];
604
+ const int32_t mask = v >> 31;
605
+ v = (v ^ mask) - mask;
606
+ if (v >= qthresh[j]) {
607
+ v = QUANTIZE(v, iquant[j], bias[j]);
608
+ assert(v > 0);
609
+ const int n = CalcLog2(v);
610
+ const uint16_t code = (v ^ mask) & ((1 << n) - 1);
611
+ rl[nb].level_ = (code << 4) | n;
612
+ rl[nb].run_ = i - prev;
613
+ prev = i + 1;
614
+ ++nb;
615
+ }
616
+ }
617
+ const int dc = (in[0] < 0) ? -QUANTIZE(-in[0], iquant[0], bias[0])
618
+ : QUANTIZE(in[0], iquant[0], bias[0]);
619
+ out->idx_ = idx;
620
+ out->last_ = prev - 1;
621
+ out->nb_coeffs_ = nb;
622
+ return dc;
623
+ }
624
+
625
+ ////////////////////////////////////////////////////////////////////////////////
626
+ // Trellis-based quantization
627
+
628
+ typedef uint32_t score_t;
629
+ static const score_t kMaxScore = 0xffffffffu;
630
+
631
+ struct TrellisNode {
632
+ uint32_t code;
633
+ int nbits;
634
+ score_t score;
635
+ uint32_t disto;
636
+ uint32_t bits;
637
+ uint32_t run;
638
+ const TrellisNode* best_prev;
639
+ int pos;
640
+ int rank;
641
+
642
+ TrellisNode() : score(kMaxScore), best_prev(nullptr) {}
643
+ void InitSink() {
644
+ score = 0u;
645
+ disto = 0;
646
+ pos = 0;
647
+ rank = 0;
648
+ nbits = 0;
649
+ bits = 0;
650
+ }
651
+ };
652
+
653
+ static bool SearchBestPrev(const TrellisNode* const nodes0, TrellisNode* node,
654
+ const uint32_t disto0[], const uint32_t codes[],
655
+ uint32_t lambda) {
656
+ bool found = false;
657
+ assert(codes[0xf0] != 0);
658
+ const uint32_t base_disto = node->disto + disto0[node->pos - 1];
659
+ for (const TrellisNode* cur = node - 1; cur >= nodes0; --cur) {
660
+ const int run = node->pos - 1 - cur->pos;
661
+ if (run < 0) continue;
662
+ uint32_t bits = node->nbits;
663
+ bits += (run >> 4) * (codes[0xf0] & 0xff);
664
+ const uint32_t sym = ((run & 15) << 4) | node->nbits;
665
+ assert(codes[sym] != 0);
666
+ bits += codes[sym] & 0xff;
667
+ const uint32_t disto = base_disto - disto0[cur->pos];
668
+ const score_t score = disto + lambda * bits + cur->score;
669
+ if (score < node->score) {
670
+ node->score = score;
671
+ node->disto = disto;
672
+ node->bits = bits;
673
+ node->best_prev = cur;
674
+ node->rank = cur->rank + 1;
675
+ node->run = run;
676
+ found = true;
677
+ }
678
+ }
679
+ return found;
680
+ }
681
+
682
+ // number of alternate levels to investigate
683
+ #define NUM_TRELLIS_NODES 2
684
+
685
+ int Encoder::TrellisQuantizeBlock(const int16_t in[64], int idx,
686
+ const Quantizer* const Q,
687
+ DCTCoeffs* const out,
688
+ RunLevel* const rl) {
689
+ const uint16_t* const bias = Q->bias_;
690
+ const uint16_t* const iquant = Q->iquant_;
691
+ TrellisNode nodes[1 + NUM_TRELLIS_NODES * 63]; // 1 sink + n channels
692
+ nodes[0].InitSink();
693
+ const uint32_t* const codes = Q->codes_;
694
+ TrellisNode* cur_node = &nodes[1];
695
+ uint32_t disto0[64]; // disto0[i] = sum of distortions up to i (inclusive)
696
+ disto0[0] = 0;
697
+ for (int i = 1; i < 64; ++i) {
698
+ const int j = kZigzag[i];
699
+ const uint32_t q = Q->quant_[j] << AC_BITS;
700
+ const uint32_t lambda = q * q / 32u;
701
+ int V = in[j];
702
+ const int32_t mask = V >> 31;
703
+ V = (V ^ mask) - mask;
704
+ disto0[i] = V * V + disto0[i - 1];
705
+ int v = QUANTIZE(V, iquant[j], bias[j]);
706
+ if (v == 0) continue;
707
+ int nbits = CalcLog2(v);
708
+ for (int k = 0; k < NUM_TRELLIS_NODES; ++k) {
709
+ const int err = V - v * q;
710
+ cur_node->code = (v ^ mask) & ((1 << nbits) - 1);
711
+ cur_node->pos = i;
712
+ cur_node->disto = err * err;
713
+ cur_node->nbits = nbits;
714
+ cur_node->score = kMaxScore;
715
+ if (SearchBestPrev(&nodes[0], cur_node, disto0, codes, lambda)) {
716
+ ++cur_node;
717
+ }
718
+ --nbits;
719
+ if (nbits <= 0) break;
720
+ v = (1 << nbits) - 1;
721
+ }
722
+ }
723
+ // search best entry point backward
724
+ const TrellisNode* nz = &nodes[0];
725
+ if (cur_node != nz) {
726
+ score_t best_score = kMaxScore;
727
+ while (cur_node-- != &nodes[0]) {
728
+ const uint32_t disto = disto0[63] - disto0[cur_node->pos];
729
+ // No need to incorporate EOB's bit cost (codes[0x00]), since
730
+ // it's the same for all coeffs except the last one (#63).
731
+ cur_node->disto += disto;
732
+ cur_node->score += disto;
733
+ if (cur_node->score < best_score) {
734
+ nz = cur_node;
735
+ best_score = cur_node->score;
736
+ }
737
+ }
738
+ }
739
+ int nb = nz->rank;
740
+ out->idx_ = idx;
741
+ out->last_ = nz->pos;
742
+ out->nb_coeffs_ = nb;
743
+
744
+ while (nb-- > 0) {
745
+ const int32_t code = nz->code;
746
+ const int n = nz->nbits;
747
+ rl[nb].level_ = (code << 4) | n;
748
+ rl[nb].run_ = nz->run;
749
+ nz = nz->best_prev;
750
+ }
751
+ const int dc = (in[0] < 0) ? -QUANTIZE(-in[0], iquant[0], bias[0])
752
+ : QUANTIZE(in[0], iquant[0], bias[0]);
753
+ return dc;
754
+ }
755
+
756
+ Encoder::QuantizeBlockFunc Encoder::GetQuantizeBlockFunc() {
757
+ #if defined(SJPEG_USE_SSE2)
758
+ if (SupportsSSE2()) return QuantizeBlockSSE2;
759
+ #elif defined(SJPEG_USE_NEON)
760
+ if (SupportsNEON()) return QuantizeBlockNEON;
761
+ #endif
762
+ return QuantizeBlock; // default
763
+ }
764
+
765
+ ////////////////////////////////////////////////////////////////////////////////
766
+
767
+ #if defined(SJPEG_USE_SSE2)
768
+ // Load eight 16b-words from *src.
769
+ #define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
770
+ #define LOAD_64(src) _mm_loadl_epi64((const __m128i*)(src))
771
+ // Store eight 16b-words into *dst
772
+ #define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))
773
+
774
+ static uint32_t QuantizeErrorSSE2(const int16_t in[64],
775
+ const Quantizer* const Q) {
776
+ const uint16_t* const bias = Q->bias_;
777
+ const uint16_t* const iquant = Q->iquant_;
778
+ const uint8_t* const quant = Q->quant_;
779
+ const __m128i zero = _mm_setzero_si128();
780
+ uint32_t tmp[32];
781
+ for (int i = 0; i < 64; i += 8) {
782
+ const __m128i m_bias = LOAD_16(bias + i);
783
+ const __m128i m_iquant = LOAD_16(iquant + i);
784
+ const __m128i m_quant = _mm_unpacklo_epi8(LOAD_64(quant + i), zero);
785
+ const __m128i A = LOAD_16(in + i); // v0 = in[i]
786
+ const __m128i B = _mm_srai_epi16(A, 15); // sign extract
787
+ const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B); // abs(v0)
788
+ const __m128i D = _mm_adds_epi16(C, m_bias); // v' = v0 + bias
789
+ const __m128i E = _mm_mulhi_epu16(D, m_iquant); // (v' * iq) >> 16
790
+ const __m128i F = _mm_srai_epi16(E, AC_BITS);
791
+ const __m128i G = _mm_srai_epi16(C, AC_BITS);
792
+ const __m128i H = _mm_mullo_epi16(F, m_quant); // *= quant[j]
793
+ const __m128i I = _mm_sub_epi16(G, H);
794
+ const __m128i J = _mm_madd_epi16(I, I); // (v0-v) ^ 2
795
+ STORE_16(J, tmp + i / 2);
796
+ }
797
+ uint32_t err = 0;
798
+ for (int i = 0; i < 32; ++i) err += tmp[i];
799
+ return err;
800
+ }
801
+ #undef LOAD_16
802
+ #undef LOAD_64
803
+ #undef STORE_16
804
+
805
+ #elif defined(SJPEG_USE_NEON)
806
+
807
+ static uint32_t QuantizeErrorNEON(const int16_t in[64],
808
+ const Quantizer* const Q) {
809
+ const uint16_t* const bias = Q->bias_;
810
+ const uint16_t* const iquant = Q->iquant_;
811
+ const uint8_t* const quant = Q->quant_;
812
+ uint32x4_t sum1 = vdupq_n_u32(0);
813
+ uint32x4_t sum2 = vdupq_n_u32(0);
814
+ for (int i = 0; i < 64; i += 8) {
815
+ const uint16x8_t m_bias = vld1q_u16(bias + i);
816
+ const uint16x8_t m_mult = vld1q_u16(iquant + i);
817
+ const uint16x8_t m_quant = vmovl_u8(vld1_u8(quant + i));
818
+ const uint16x8_t A = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(in + i)));
819
+ const uint16x8_t B = vaddq_u16(A, m_bias);
820
+ const uint32x4_t C0 = vmull_u16(vget_low_u16(B), vget_low_u16(m_mult));
821
+ const uint32x4_t C1 = vmull_u16(vget_high_u16(B), vget_high_u16(m_mult));
822
+ // collect hi-words of the 32b mult result using 'unzip'
823
+ const uint16x8x2_t D = vuzpq_u16(vreinterpretq_u16_u32(C0),
824
+ vreinterpretq_u16_u32(C1));
825
+ const uint16x8_t E = vshrq_n_u16(D.val[1], AC_BITS);
826
+ const uint16x8_t F = vmulq_u16(E, m_quant); // dequantized coeff
827
+ const uint16x8_t G = vabdq_u16(F, vshrq_n_u16(A, AC_BITS));
828
+ sum1 = vmlal_u16(sum1, vget_low_u16(G), vget_low_u16(G));
829
+ sum2 = vmlal_u16(sum2, vget_high_u16(G), vget_high_u16(G));
830
+ }
831
+ const uint32x4_t sum3 = vaddq_u32(sum1, sum2);
832
+ const uint64x2_t sum4 = vpaddlq_u32(sum3);
833
+ const uint64_t sum5 = vgetq_lane_u64(sum4, 0) + vgetq_lane_u64(sum4, 1);
834
+ const uint32_t err = (uint32_t)sum5;
835
+ return err;
836
+ }
837
+
838
+ #endif // SJPEG_USE_NEON
839
+
840
+ static uint32_t QuantizeError(const int16_t in[64], const Quantizer* const Q) {
841
+ const uint16_t* const bias = Q->bias_;
842
+ const uint16_t* const iquant = Q->iquant_;
843
+ const uint8_t* const quant = Q->quant_;
844
+ uint32_t err = 0;
845
+ for (int j = 0; j < 64; ++j) {
846
+ int32_t v0 = (in[j] < 0) ? -in[j] : in[j];
847
+ const uint32_t v = quant[j] * QUANTIZE(v0, iquant[j], bias[j]);
848
+ v0 >>= AC_BITS;
849
+ err += (v0 - v) * (v0 - v);
850
+ }
851
+ return err;
852
+ }
853
+
854
+ Encoder::QuantizeErrorFunc Encoder::GetQuantizeErrorFunc() {
855
+ #if defined(SJPEG_USE_SSE2)
856
+ if (SupportsSSE2()) return QuantizeErrorSSE2;
857
+ #elif defined(SJPEG_USE_NEON)
858
+ if (SupportsNEON()) return QuantizeErrorNEON;
859
+ #endif
860
+ return QuantizeError; // default
861
+ }
862
+
863
+ ////////////////////////////////////////////////////////////////////////////////
864
+ // Code bitstream
865
+
866
+ void Encoder::ResetDCs() {
867
+ for (int c = 0; c < nb_comps_; ++c) {
868
+ DCs_[c] = 0;
869
+ }
870
+ }
871
+
872
+ void Encoder::CodeBlock(const DCTCoeffs* const coeffs,
873
+ const RunLevel* const rl) {
874
+ const int idx = coeffs->idx_;
875
+ const int q_idx = quant_idx_[idx];
876
+
877
+ // DC coefficient symbol
878
+ const int dc_len = coeffs->dc_code_ & 0x0f;
879
+ const uint32_t code = dc_codes_[q_idx][dc_len];
880
+ bw_.PutPackedCode(code);
881
+ if (dc_len > 0) {
882
+ bw_.PutBits(coeffs->dc_code_ >> 4, dc_len);
883
+ }
884
+
885
+ // AC coeffs
886
+ const uint32_t* const codes = ac_codes_[q_idx];
887
+ for (int i = 0; i < coeffs->nb_coeffs_; ++i) {
888
+ int run = rl[i].run_;
889
+ while (run & ~15) { // escapes
890
+ bw_.PutPackedCode(codes[0xf0]);
891
+ run -= 16;
892
+ }
893
+ const uint32_t suffix = rl[i].level_;
894
+ const int n = suffix & 0x0f;
895
+ const int sym = (run << 4) | n;
896
+ bw_.PutPackedCode(codes[sym]);
897
+ bw_.PutBits(suffix >> 4, n);
898
+ }
899
+ if (coeffs->last_ < 63) { // EOB
900
+ bw_.PutPackedCode(codes[0x00]);
901
+ }
902
+ }
903
+
904
+ ////////////////////////////////////////////////////////////////////////////////
905
+ // Histogram
906
+
907
+ void Encoder::ResetHisto() {
908
+ memset(histos_, 0, sizeof(histos_));
909
+ }
910
+
911
+ #if defined(SJPEG_USE_SSE2)
912
+ void StoreHistoSSE2(const int16_t in[64], Histo* const histos, int nb_blocks) {
913
+ const __m128i kMaxHisto = _mm_set1_epi16(MAX_HISTO_DCT_COEFF);
914
+ for (int n = 0; n < nb_blocks; ++n, in += 64) {
915
+ uint16_t tmp[64];
916
+ for (int i = 0; i < 64; i += 8) {
917
+ const __m128i A =
918
+ _mm_loadu_si128(reinterpret_cast<const __m128i*>(in + i));
919
+ const __m128i B = _mm_srai_epi16(A, 15); // sign extract
920
+ const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B); // abs(A)
921
+ const __m128i D = _mm_srli_epi16(C, HSHIFT); // >>= HSHIFT
922
+ const __m128i E = _mm_min_epi16(D, kMaxHisto);
923
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp + i), E);
924
+ }
925
+ for (int j = 0; j < 64; ++j) {
926
+ const int k = tmp[j];
927
+ ++histos->counts_[j][k];
928
+ }
929
+ }
930
+ }
931
+ #elif defined(SJPEG_USE_NEON)
932
+ void StoreHistoNEON(const int16_t in[64], Histo* const histos, int nb_blocks) {
933
+ const uint16x8_t kMaxHisto = vdupq_n_u16(MAX_HISTO_DCT_COEFF);
934
+ for (int n = 0; n < nb_blocks; ++n, in += 64) {
935
+ uint16_t tmp[64];
936
+ for (int i = 0; i < 64; i += 8) {
937
+ const int16x8_t A = vld1q_s16(in + i);
938
+ const int16x8_t B = vabsq_s16(A); // abs(in)
939
+ const uint16x8_t C = vreinterpretq_u16_s16(B); // signed->unsigned
940
+ const uint16x8_t D = vshrq_n_u16(C, HSHIFT); // >>= HSHIFT
941
+ const uint16x8_t E = vminq_u16(D, kMaxHisto); // min(.,kMaxHisto)
942
+ vst1q_u16(tmp + i, E);
943
+ }
944
+ for (int j = 0; j < 64; ++j) {
945
+ const int k = tmp[j];
946
+ ++histos->counts_[j][k];
947
+ }
948
+ }
949
+ }
950
+ #endif
951
+
952
+ // This C version does not produce the same counts_[] output as the
953
+ // assembly above. But the extra entry counts_[MAX_HISTO_DCT_COEFF] is
954
+ // not used for the final computation, and the global result is unchanged.
955
+ void StoreHisto(const int16_t in[64], Histo* const histos, int nb_blocks) {
956
+ for (int n = 0; n < nb_blocks; ++n, in += 64) {
957
+ for (int i = 0; i < 64; ++i) {
958
+ const int k = (in[i] < 0 ? -in[i] : in[i]) >> HSHIFT;
959
+ if (k < MAX_HISTO_DCT_COEFF) {
960
+ ++histos->counts_[i][k];
961
+ }
962
+ }
963
+ }
964
+ }
965
+
966
+ Encoder::StoreHistoFunc Encoder::GetStoreHistoFunc() {
967
+ #if defined(SJPEG_USE_SSE2)
968
+ if (SupportsSSE2()) return StoreHistoSSE2;
969
+ #elif defined(SJPEG_USE_NEON)
970
+ if (SupportsNEON()) return StoreHistoNEON;
971
+ #endif
972
+ return StoreHisto; // default
973
+ }
974
+
975
+ const float Encoder::kHistoWeight[QSIZE] = {
976
+ // Gaussian with sigma ~= 3
977
+ 0, 0, 0, 0, 0,
978
+ 1, 5, 16, 43, 94, 164, 228, 255, 228, 164, 94, 43, 16, 5, 1,
979
+ 0, 0, 0, 0, 0
980
+ };
981
+
982
+ void Encoder::AnalyseHisto() {
983
+ // A bit of theory and background: for each sub-band i in [0..63], we pick a
984
+ // quantization scale New_Qi close to the initial one Qi. We evaluate a cost
985
+ // function associated with F({New_Qi}) = distortion + lambda . rate,
986
+ // where rate and distortion depend on the quantizers set in a complex non-
987
+ // analytic way. Just, for well-behaved regular histograms, we expect the
988
+ // rate to scale as -log(Q), and the distortion as Q^2.
989
+ // We want the cost function to be stationary around the initial {Qi} set,
990
+ // in order to achieve the best transfer between distortion and rate when we
991
+ // displace the Qi values a little. Mainly we want to use bits as efficiently
992
+ // as possible, where every bit we use has maximal impact in lowering
993
+ // distortion (and vice versa: if we spend an extra bit of coding, we want to
994
+ // have the best bang for this buck. The optimization works up-hill too).
995
+ //
996
+ // Hence, lambda is picked to minimize F around {Qi}, as:
997
+ // lambda = -d(distortion) / d(rate)
998
+ // where the derivatives are evaluated using a double least-square fit on both
999
+ // the clouds of {delta, distortion} and {delta, size} points.
1000
+ //
1001
+ // Note1: The least-square fitted slope of a {x,y} cloud is expressed as:
1002
+ // slope = (<xy> - <x><y>) / (<xx> - <x><x>) = Cov(x,y) / Cov(x,x)
1003
+ // where <.> is our gaussian-averaging operator.
1004
+ // But since we are eventually computing a quotient of such slopes, we can
1005
+ // factor out the common (<xx> - <x><x>) denominator (which is strictly
1006
+ // positive).
1007
+ // Note2: we use a Gaussian-weighted average around the center value Qi
1008
+ // instead of averaging over the whole [QDELTA_MIN, QDELTA_MAX] range.
1009
+ // This rules out fringe samples on noisy cases (like: when the source is
1010
+ // already JPEG-compressed!).
1011
+ // Note3: We fall back to some sane value HLAMBDA in case of ill-condition.
1012
+ //
1013
+ // We use the correlation coefficient
1014
+ // r = Cov(x,y) / sqrt(Cov(x,x) * Cov(y,y))
1015
+ // to detect bad cases with poorly extrapolated distortion. In such
1016
+ // occurrence, we skip the channel. This is particularly important for
1017
+ // already-compressed JPEG sources that give treacherous comb-like
1018
+ // histograms.
1019
+ //
1020
+ // Once this particular lambda has been picked, we loop over each channel
1021
+ // and optimize them separately, locally picking the best New_Qi for each.
1022
+ // The choice of lambda ensures a good balance between size and distortion,
1023
+ // and prevents being too aggressive on file-size reduction, for instance.
1024
+ //
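// (Restated in formula form, for reference only -- not part of the original
// source: with Gaussian weights w over the sampled deltas x, the code below
// accumulates the unnormalized covariance Cov(x, distortion) =
// sw * sxy1 - sx * sy1 into 'num' and Cov(x, size) = sw * sxy2 - sx * sy2
// into 'den', then picks lambda = -num / den; channels failing
// Cov(x, d)^2 >= r_limit * Cov(x, x) * Cov(d, d) are dropped beforehand.)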
1025
+ const double r_limit = kCorrelationThreshold;
1026
+ for (int c = (nb_comps_ > 1 ? 1 : 0); c >= 0; --c) {
1027
+ const int idx = quant_idx_[c];
1028
+ const Histo* const histo = &histos_[idx];
1029
+ // For chrominance, it can be visually damaging to be too
1030
+ // aggressive on the filesize. So with the default settings we
1031
+ // restrict the algorithm to mainly try to *increase* the bitrate
1032
+ // (and quality) by using a smaller qdelta_max_chroma_.
1033
+ // delta_max is only used during the second phase, but not during
1034
+ // the first phase of deriving an optimal lambda.
1035
+ assert(QDELTA_MAX >= qdelta_max_luma_);
1036
+ assert(QDELTA_MAX >= qdelta_max_chroma_);
1037
+ const int delta_max =
1038
+ ((idx == 0) ? qdelta_max_luma_ : qdelta_max_chroma_) - QDELTA_MIN;
1039
+ assert(delta_max < QSIZE);
1040
+ float sizes[64][QSIZE];
1041
+ float distortions[64][QSIZE];
1042
+ double num = 0.; // accumulate d(distortion) around delta_q = 0
1043
+ double den = 0.; // accumulate d(size) around delta_q = 0
1044
+ uint64_t omit_channels = kOmittedChannels;
1045
+ for (int pos = 0; pos < 64; ++pos) {
1046
+ if (omit_channels & (1ULL << pos)) {
1047
+ continue;
1048
+ }
1049
+ const int dq0 = quants_[idx].quant_[pos];
1050
+ const int min_dq0 = quants_[idx].min_quant_[pos];
1051
+ // We should be using the exact bias:
1052
+ // const int bias = quants_[idx].bias_[pos] << (FP_BITS - AC_BITS);
1053
+ // but this value is too precise considering the other approximations
1054
+ // we're using (namely: HSHIFT). So we'd better use a mid value of 0.5
1055
+ // for the bias. This has the advantage of making it possible to
1056
+ // use pre-calculated look-up tables for every quantity in the loop.
1057
+ // This is still a TODO(skal) below, though. Not sure the gain is big.
1058
+ const int bias = 1 << FP_BITS >> 1;
1059
+ const int* const h = histo->counts_[pos];
1060
+ int total = 0;
1061
+ int last = 0;
1062
+ for (int i = 0; i < MAX_HISTO_DCT_COEFF; ++i) {
1063
+ total += h[i];
1064
+ if (h[i]) last = i + 1;
1065
+ }
1066
+ if (total < kDensityThreshold * last) {
1067
+ omit_channels |= 1ULL << pos;
1068
+ continue;
1069
+ }
1070
+ // accumulators for averaged values.
1071
+ double sw = 0., sx = 0.;
1072
+ double sxx = 0., syy1 = 0.;
1073
+ double sy1 = 0., sxy1 = 0.; // accumulators for distortion cloud
1074
+ double sy2 = 0., sxy2 = 0.; // accumulators for size cloud
1075
+ for (int delta = 0; delta < QSIZE; ++delta) {
1076
+ double bsum = 0., dsum = 0.;
1077
+ const int dq = dq0 + (delta + QDELTA_MIN);
1078
+ if (dq >= min_dq0 && dq <= 255) {
1079
+ // TODO(skal): pre-compute idq and use it in FinalizeQuantMatrix too
1080
+ const int idq = ((1 << FP_BITS) + dq - 1) / dq;
1081
+ for (int i = 0; i < last; ++i) {
1082
+ if (h[i]) {
1083
+ // v = current bin's centroid in the histogram
1084
+ // qv = quantized value for the bin's representative 'v'
1085
+ // dqv = dequantized qv, to be compared against v (=> 'error')
1086
+ // bits = approximate bit-cost of the quantized representative
1087
+ // h[i] = this bin's weight
1088
+ const int v = (i << HSHIFT) + HHALF;
1089
+ const int qv = (v * idq + bias) >> FP_BITS;
1090
+ // TODO(skal): for a given 'last' value, we know the upper limit
1091
+ // on dq that will make *all* quantized 'qv' values be zero.
1092
+ // => We can restrict the loop on 'dq' using 'last'.
1093
+ if (qv) {
1094
+ const int bits = CalcLog2(qv);
1095
+ const int dqv = qv * dq;
1096
+ const int error = (v - dqv) * (v - dqv);
1097
+ bsum += h[i] * bits;
1098
+ dsum += h[i] * error;
1099
+ } else {
1100
+ dsum += h[i] * v * v;
1101
+ }
1102
+ }
1103
+ } // end of 'i' loop
1104
+ distortions[pos][delta] = static_cast<float>(dsum);
1105
+ sizes[pos][delta] = static_cast<float>(bsum);
1106
+ const double w = kHistoWeight[delta]; // Gaussian weight
1107
+ if (w > 0.) {
1108
+ const double x = static_cast<double>(delta + QDELTA_MIN);
1109
+ sw += w;
1110
+ sx += w * x;
1111
+ sxx += w * x * x;
1112
+ sy1 += w * dsum;
1113
+ syy1 += w * dsum * dsum;
1114
+ sy2 += w * bsum;
1115
+ sxy1 += w * dsum * x;
1116
+ sxy2 += w * bsum * x;
1117
+ }
1118
+ } else { // the new quantizer is out-of-range.
1119
+ distortions[pos][delta] = FLT_MAX;
1120
+ sizes[pos][delta] = 0;
1121
+ }
1122
+ }
1123
+ // filter channels according to correlation factor.
1124
+ const double cov_xy1 = sw * sxy1 - sx * sy1;
1125
+ if (cov_xy1 * cov_xy1 < r_limit *
1126
+ (sw * sxx - sx * sx) * (sw * syy1 - sy1 * sy1)) {
1127
+ omit_channels |= 1ULL << pos;
1128
+ continue;
1129
+ }
1130
+ // accumulate numerator and denominator for the derivate calculation
1131
+ num += cov_xy1;
1132
+ den += sw * sxy2 - sx * sy2;
1133
+ }
1134
+
1135
+ // we evaluate lambda =~ -d(distortion)/d(size) at dq=0
1136
+ double lambda = HLAMBDA;
1137
+ // When increasing Q, size should significantly decrease and distortion
1138
+ // increase. If they don't, we are ill-conditioned and should fall back
1139
+ // to a safe value HLAMBDA.
1140
+ if (num > 1000. && den < -10.) {
1141
+ // This is our approximation of -d(Distortion) / d(Rate)
1142
+ // We limit it to 1. below, to avoid degenerate cases
1143
+ lambda = -num / den;
1144
+ if (lambda < 1.) {
1145
+ lambda = 1.;
1146
+ }
1147
+ }
1148
+ // now, optimize each channel using the optimal lambda selection
1149
+ for (int pos = 0; pos < 64; ++pos) {
1150
+ if (omit_channels & (1ULL << pos)) {
1151
+ continue;
1152
+ }
1153
+ float best_score = FLT_MAX;
1154
+ int best_dq = 0;
1155
+ for (int delta = 0; delta <= delta_max; ++delta) {
1156
+ if (distortions[pos][delta] < FLT_MAX) {
1157
+ const float score = distortions[pos][delta]
1158
+ + lambda * sizes[pos][delta];
1159
+ if (score < best_score) {
1160
+ best_score = score;
1161
+ best_dq = delta + QDELTA_MIN;
1162
+ }
1163
+ }
1164
+ }
1165
+ quants_[idx].quant_[pos] += best_dq;
1166
+ assert(quants_[idx].quant_[pos] >= 1);
1167
+ }
1168
+ FinalizeQuantMatrix(&quants_[idx], q_bias_);
1169
+ SetCostCodes(idx);
1170
+ }
1171
+ }
1172
+
1173
+ void Encoder::CollectHistograms() {
1174
+ ResetHisto();
1175
+ int16_t* in = in_blocks_;
1176
+ const int mb_x_max = W_ / block_w_;
1177
+ const int mb_y_max = H_ / block_h_;
1178
+ for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
1179
+ const bool yclip = (mb_y == mb_y_max);
1180
+ for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
1181
+ if (!use_extra_memory_) {
1182
+ in = in_blocks_;
1183
+ }
1184
+ GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
1185
+ fDCT_(in, mcu_blocks_);
1186
+ for (int c = 0; c < nb_comps_; ++c) {
1187
+ const int num_blocks = nb_blocks_[c];
1188
+ store_histo_(in, &histos_[quant_idx_[c]], num_blocks);
1189
+ in += 64 * num_blocks;
1190
+ }
1191
+ }
1192
+ }
1193
+ have_coeffs_ = use_extra_memory_;
1194
+ }
1195
+
1196
+ ////////////////////////////////////////////////////////////////////////////////
1197
+ // Perform YUV conversion and fDCT, and store the unquantized coeffs
1198
+
1199
+ void Encoder::CollectCoeffs() {
1200
+ assert(use_extra_memory_);
1201
+ int16_t* in = in_blocks_;
1202
+ const int mb_x_max = W_ / block_w_;
1203
+ const int mb_y_max = H_ / block_h_;
1204
+ for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
1205
+ const bool yclip = (mb_y == mb_y_max);
1206
+ for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
1207
+ GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
1208
+ fDCT_(in, mcu_blocks_);
1209
+ in += 64 * mcu_blocks_;
1210
+ }
1211
+ }
1212
+ have_coeffs_ = true;
1213
+ }
1214
+
1215
+ ////////////////////////////////////////////////////////////////////////////////
1216
+ // 1-pass Scan
1217
+
1218
+ void Encoder::SinglePassScan() {
1219
+ ResetDCs();
1220
+
1221
+ RunLevel base_run_levels[64];
1222
+ int16_t* in = in_blocks_;
1223
+ const int mb_x_max = W_ / block_w_;
1224
+ const int mb_y_max = H_ / block_h_;
1225
+ const QuantizeBlockFunc quantize_block = use_trellis_ ? TrellisQuantizeBlock
1226
+ : quantize_block_;
1227
+ for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
1228
+ const bool yclip = (mb_y == mb_y_max);
1229
+ for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
1230
+ if (!CheckBuffers()) return;
1231
+ if (!have_coeffs_) {
1232
+ in = in_blocks_;
1233
+ GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
1234
+ fDCT_(in, mcu_blocks_);
1235
+ }
1236
+ for (int c = 0; c < nb_comps_; ++c) {
1237
+ DCTCoeffs base_coeffs;
1238
+ for (int i = 0; i < nb_blocks_[c]; ++i) {
1239
+ const int dc = quantize_block(in, c, &quants_[quant_idx_[c]],
1240
+ &base_coeffs, base_run_levels);
1241
+ base_coeffs.dc_code_ = GenerateDCDiffCode(dc, &DCs_[c]);
1242
+ CodeBlock(&base_coeffs, base_run_levels);
1243
+ in += 64;
1244
+ }
1245
+ }
1246
+ }
1247
+ }
1248
+ }
1249
+
1250
+ void Encoder::FinalPassScan(size_t nb_mbs, const DCTCoeffs* coeffs) {
1251
+ DesallocateBlocks(); // we can free up some coeffs memory at this point
1252
+ if (!CheckBuffers()) return; // call needed to finalize all_run_levels_
1253
+ assert(reuse_run_levels_);
1254
+ const RunLevel* run_levels = all_run_levels_;
1255
+ for (size_t n = 0; n < nb_mbs; ++n) {
1256
+ if (!CheckBuffers()) return;
1257
+ CodeBlock(&coeffs[n], run_levels);
1258
+ run_levels += coeffs[n].nb_coeffs_;
1259
+ }
1260
+ }
1261
+
1262
+ ////////////////////////////////////////////////////////////////////////////////
1263
+ // Huffman tables optimization
1264
+
1265
+ void Encoder::ResetEntropyStats() {
1266
+ memset(freq_ac_, 0, sizeof(freq_ac_));
1267
+ memset(freq_dc_, 0, sizeof(freq_dc_));
1268
+ }
1269
+
1270
+ void Encoder::AddEntropyStats(const DCTCoeffs* const coeffs,
1271
+ const RunLevel* const run_levels) {
1272
+ // freq_ac_[] and freq_dc_[] cannot overflow 32bits, since the maximum
1273
+ // resolution allowed is 65535 * 65535. The sum of all frequencies cannot
1274
+ // be greater than 32bits, either.
1275
+ const int idx = coeffs->idx_;
1276
+ const int q_idx = quant_idx_[idx];
1277
+ for (int i = 0; i < coeffs->nb_coeffs_; ++i) {
1278
+ const int run = run_levels[i].run_;
1279
+ const int tmp = (run >> 4);
1280
+ if (tmp) freq_ac_[q_idx][0xf0] += tmp; // count escapes (all at once)
1281
+ const int suffix = run_levels[i].level_;
1282
+ const int sym = ((run & 0x0f) << 4) | (suffix & 0x0f);
1283
+ ++freq_ac_[q_idx][sym];
1284
+ }
1285
+ if (coeffs->last_ < 63) { // EOB
1286
+ ++freq_ac_[q_idx][0x00];
1287
+ }
1288
+ ++freq_dc_[q_idx][coeffs->dc_code_ & 0x0f];
1289
+ }
1290
+
1291
+ static int cmp(const void *pa, const void *pb) {
1292
+ const uint64_t a = *reinterpret_cast<const uint64_t*>(pa);
1293
+ const uint64_t b = *reinterpret_cast<const uint64_t*>(pb);
1294
+ assert(a != b); // tie-breaks can't happen
1295
+ return (a < b) ? 1 : -1;
1296
+ }
1297
+
1298
+ static void BuildOptimalTable(HuffmanTable* const t,
1299
+ const uint32_t* const freq, int size) {
1300
+ enum { MAX_BITS = 32, MAX_CODE_SIZE = 16 };
1301
+ assert(size <= 256);
1302
+ assert(t != nullptr);
1303
+
1304
+ // The celebrated merging algorithm from Huffman, with some restrictions:
1305
+ // * codes with all '1' are forbidden, to avoid trailing marker emulation
1306
+ // * codes should be less than 16 bits. So we're re-allocating them to shorter
1307
+ // codes, even if it means being suboptimal for extremely rare symbols that
1308
+ // would eat a lot of bits.
1309
+ // This function will not touch the content of freq[].
1310
+ int codesizes[256 + 1];
1311
+ // chain[i] will hold the index of the next element in the subtree below
1312
+ // element 'i', or -1 if there's no sub-tree.
1313
+ // We use and maintain this list in order to efficiently increase the
1314
+ // codesizes by one when merging two sub-trees into one.
1315
+ // To ease the merging (by avoiding 1 loop) we store the address of the last
1316
+ // element in the chain for each symbol. This makes the process O(1).
1317
+ // It's probably better to keep the arrays separated instead of making
1318
+ // a struct, since we touch chain_end[] only once per merging, whereas
1319
+ // chain[] and codesizes[] are modified O(k) times per merge.
1320
+ int chain[256 + 1];
1321
+ int* chain_end[256 + 1];
1322
+ // sorted_freq[] remains sorted by decreasing frequencies along the process.
1323
+ uint64_t sorted_freq[256 + 1];
1324
+
1325
+ // Count the symbols effectively used and put them at the beginning of the table.
1326
+ int nb_syms = 0;
1327
+ for (int i = 0; i < size; ++i) {
1328
+ const uint64_t v = freq[i];
1329
+ if (v > 0) {
1330
+ // we pack the sorted key (32bits) and index (9bits) into a single
1331
+ // uint64_t, so we don't have to resort to structs (and we avoid
1332
+ // tie-breaks, too)
1333
+ sorted_freq[nb_syms++] = (v << 9) | i;
1334
+ }
1335
+ codesizes[i] = 0;
1336
+ chain[i] = -1;
1337
+ chain_end[i] = &chain[i];
1338
+ }
1339
+ t->nb_syms_ = nb_syms; // Record how many final symbols we'll have.
1340
+
1341
+ // initial sort
1342
+ // TODO(skal): replace by counting-sort?? (merged with previous loop?)
1343
+ qsort(sorted_freq, nb_syms, sizeof(sorted_freq[0]), cmp);
1344
+
1345
+ // fake last symbol, with lowest frequency: will be assigned to the forbidden
1346
+ // code '1111...1', but will eventually be discarded.
1347
+ sorted_freq[nb_syms++] = (1ULL << 9) | size;
1348
+ codesizes[size] = 0;
1349
+ chain[size] = -1;
1350
+ chain_end[size] = &chain[size];
1351
+
1352
+ // Merging phase
1353
+ // Recursively merge the two symbols with lowest frequency. The resulting
1354
+ // super-symbol will be represented by a longer (by 1bit) code, since
1355
+ // it's the least frequent one.
1356
+ int nb = nb_syms;
1357
+ while (nb-- > 1) {
1358
+ // First, link the two sub-trees.
1359
+ const uint64_t s1 = sorted_freq[nb - 1]; // first symbol
1360
+ const uint64_t s2 = sorted_freq[nb]; // second symbol, appended
1361
+ // The 0x1ff masking is for taking only the symbol, discarding the
1362
+ // frequency that we stored in the upper bits for sorting.
1363
+ int i = s1 & 0x1ff;
1364
+ const int j = s2 & 0x1ff;
1365
+ assert(i <= size && j <= size);
1366
+ *chain_end[i] = j;
1367
+ chain_end[i] = chain_end[j];
1368
+
1369
+ // Then, following the chain, increase the whole sub-tree's weight by 1bit.
1370
+ do {
1371
+ ++codesizes[i];
1372
+ i = chain[i];
1373
+ } while (i >= 0);
1374
+
1375
+ // Create new symbol, with merged frequencies. Will take s1's spot.
1376
+ // We must use 64bit here to prevent overflow in the sum. Both s1 and
1377
+ // s2 are originally 32 + 9 bits wide.
1378
+ const uint64_t new_symbol = s1 + (s2 & ~0x1ff);
1379
+ // Perform insertion sort to find the new spot of the merged symbol.
1380
+ int k = nb - 1;
1381
+ while (k > 0) {
1382
+ if (sorted_freq[k - 1] < new_symbol) {
1383
+ sorted_freq[k] = sorted_freq[k - 1];
1384
+ --k;
1385
+ } else {
1386
+ break;
1387
+ }
1388
+ }
1389
+ sorted_freq[k] = new_symbol;
1390
+ }
1391
+
1392
+ // Count bit distribution.
1393
+ uint8_t bits[MAX_BITS];
1394
+ memset(bits, 0, sizeof(bits));
1395
+ int max_bit_size = 0;
1396
+ for (int i = 0; i <= size; ++i) {
1397
+ int s = codesizes[i];
1398
+ assert(s <= codesizes[size]); // symbol #size is the biggest one.
1399
+ if (s > 0) {
1400
+ // This is slightly penalizing, but only for ultra-rare symbols
1401
+ if (s > MAX_BITS) {
1402
+ s = MAX_BITS;
1403
+ codesizes[i] = MAX_BITS; // clamp code-size
1404
+ }
1405
+ ++bits[s - 1];
1406
+ if (s > max_bit_size) {
1407
+ max_bit_size = s;
1408
+ }
1409
+ }
1410
+ }
1411
+
1412
+ // We sort symbols by slices of increasing bitsizes, using counting sort.
1413
+ // This will generate a partition of symbols in the final syms_[] array.
1414
+ int start[MAX_BITS]; // start[i] is the first code with length i+1
1415
+ int position = 0;
1416
+ for (int i = 0; i < max_bit_size; ++i) {
1417
+ start[i] = position;
1418
+ position += bits[i];
1419
+ }
1420
+ assert(position == nb_syms);
1421
+
1422
+ // Now, we can dispatch the symbols directly to their final slice in the
1423
+ // partitioning, according to their bit-length.
1424
+ // Note: we omit the last symbol, which is fake.
1425
+ uint8_t* const syms = const_cast<uint8_t*>(t->syms_);
1426
+ // Note that we loop until symbol = size-1, hence omitting the last fake symbol.
1427
+ for (int symbol = 0; symbol < size; ++symbol) {
1428
+ const int s = codesizes[symbol];
1429
+ if (s > 0) {
1430
+ assert(s <= MAX_BITS);
1431
+ syms[start[s - 1]++] = symbol;
1432
+ }
1433
+ }
1434
+ assert(start[max_bit_size - 1] == nb_syms - 1);
1435
+
1436
+ // Fix codes with length greater than 16 bits. We move too-long
1437
+ // codes up, and one short code down, making the tree a little sub-optimal.
1438
+ for (int l = max_bit_size - 1; l >= MAX_CODE_SIZE; --l) {
1439
+ while (bits[l] > 0) {
1440
+ int k = l - 2;
1441
+ while (bits[k] == 0) { // Search for a level with a leaf to split.
1442
+ --k;
1443
+ }
1444
+ /* Move up 2 symbols from bottom-most level l, and sink down one from
1445
+ level k, like this:
1446
+ Before: 'c' is a leaf at level k, and 'a' and 'b' are the two
1447
+ deepest leaves, at level l.
1448
+ After: 'c' becomes an internal node whose children are 'c' and 'b'
1449
+ (both one level below k), while 'a' moves up to take the place of
1450
+ the old (a, b) pair, one level above l. The histogram thus stays
1451
+ realizable as a prefix code, and the deepest level loses two leaves.
1452
+ Note that by the very construction of the optimal tree, the least
1453
+ probable symbols always come in pairs with the same bit-length.
1454
+ So there's always a pair of 'a' and 'b' to find.
1455
+ */
1456
+ bits[l ] -= 2; // remove 'a' and 'b'
1457
+ bits[l - 1] += 1; // put 'a' one level up.
1458
+ bits[k ] -= 1; // remove 'c'
1459
+ bits[k + 1] += 2; // put 'c' and 'b' one level down.
1460
+ }
1461
+ }
1462
+
1463
+ // remove last pseudo-symbol
1464
+ max_bit_size = MAX_CODE_SIZE;
1465
+ while (bits[--max_bit_size] == 0) {
1466
+ assert(max_bit_size > 0);
1467
+ }
1468
+ --bits[max_bit_size];
1469
+
1470
+ // update table with final book
1471
+ for (int i = 0; i < MAX_CODE_SIZE; ++i) {
1472
+ t->bits_[i] = bits[i];
1473
+ }
1474
+ }
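The length-limiting pass above rebalances the bits[] histogram so that no code exceeds MAX_CODE_SIZE while keeping it realizable as a prefix code. Realizability is exactly the Kraft inequality; a small self-contained checker, purely illustrative and not part of the encoder, could look like this (assuming at most 16 length entries):

#include <stdint.h>
#include <vector>

// counts[i] = number of codes of length (i + 1) bits. The histogram can be
// realized by a prefix code iff sum(counts[i] * 2^-(i+1)) <= 1; the sum is
// scaled by 2^max_len to stay in integer arithmetic (max_len <= 16 here).
static bool IsValidLengthHistogram(const std::vector<int>& counts) {
  const int max_len = static_cast<int>(counts.size());
  uint64_t kraft = 0;
  for (int i = 0; i < max_len; ++i) {
    kraft += static_cast<uint64_t>(counts[i]) << (max_len - 1 - i);
  }
  return kraft <= (1ull << max_len);
}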
1475
+
1476
+ void Encoder::CompileEntropyStats() {
1477
+ // plug and build new tables
1478
+ for (int q_idx = 0; q_idx < (nb_comps_ == 1 ? 1 : 2); ++q_idx) {
1479
+ // DC tables
1480
+ Huffman_tables_[q_idx] = &opt_tables_dc_[q_idx];
1481
+ opt_tables_dc_[q_idx].syms_ = opt_syms_dc_[q_idx];
1482
+ BuildOptimalTable(&opt_tables_dc_[q_idx], freq_dc_[q_idx], 12);
1483
+ // AC tables
1484
+ Huffman_tables_[2 + q_idx] = &opt_tables_ac_[q_idx];
1485
+ opt_tables_ac_[q_idx].syms_ = opt_syms_ac_[q_idx];
1486
+ BuildOptimalTable(&opt_tables_ac_[q_idx], freq_ac_[q_idx], 256);
1487
+ }
1488
+ }
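The four Huffman_tables_ slots filled above follow the usual JPEG layout: DC tables first, then AC tables, luma before chroma. An illustrative (non-authoritative) naming of those slot indices:

enum HuffmanTableSlot {
  kLumaDC = 0,    // Huffman_tables_[0]
  kChromaDC = 1,  // Huffman_tables_[1]
  kLumaAC = 2,    // Huffman_tables_[2]
  kChromaAC = 3   // Huffman_tables_[3]
};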
1489
+
1490
+ void Encoder::StoreOptimalHuffmanTables(size_t nb_mbs,
1491
+ const DCTCoeffs* coeffs) {
1492
+ // optimize Huffman tables
1493
+ ResetEntropyStats();
1494
+ const RunLevel* run_levels = all_run_levels_;
1495
+ for (size_t n = 0; n < nb_mbs; ++n) {
1496
+ AddEntropyStats(&coeffs[n], run_levels);
1497
+ run_levels += coeffs[n].nb_coeffs_;
1498
+ }
1499
+ CompileEntropyStats();
1500
+ }
1501
+
1502
+ ////////////////////////////////////////////////////////////////////////////////
1503
+
1504
+ void Encoder::SinglePassScanOptimized() {
1505
+ const size_t nb_mbs = mb_w_ * mb_h_ * mcu_blocks_;
1506
+ DCTCoeffs* const base_coeffs =
1507
+ Alloc<DCTCoeffs>(reuse_run_levels_ ? nb_mbs : 1);
1508
+ if (base_coeffs == nullptr) return;
1509
+ DCTCoeffs* coeffs = base_coeffs;
1510
+ RunLevel base_run_levels[64];
1511
+ const QuantizeBlockFunc quantize_block = use_trellis_ ? TrellisQuantizeBlock
1512
+ : quantize_block_;
1513
+
1514
+ // We use the default Huffman tables as a basis for bit-rate evaluation.
1515
+ if (use_trellis_) InitCodes(true);
1516
+
1517
+ ResetEntropyStats();
1518
+ ResetDCs();
1519
+ nb_run_levels_ = 0;
1520
+ int16_t* in = in_blocks_;
1521
+ const int mb_x_max = W_ / block_w_;
1522
+ const int mb_y_max = H_ / block_h_;
1523
+ for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
1524
+ const bool yclip = (mb_y == mb_y_max);
1525
+ for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
1526
+ if (!have_coeffs_) {
1527
+ in = in_blocks_;
1528
+ GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
1529
+ fDCT_(in, mcu_blocks_);
1530
+ }
1531
+ if (!CheckBuffers()) goto End;
1532
+ for (int c = 0; c < nb_comps_; ++c) {
1533
+ for (int i = 0; i < nb_blocks_[c]; ++i) {
1534
+ RunLevel* const run_levels =
1535
+ reuse_run_levels_ ? all_run_levels_ + nb_run_levels_
1536
+ : base_run_levels;
1537
+ const int dc = quantize_block(in, c, &quants_[quant_idx_[c]],
1538
+ coeffs, run_levels);
1539
+ coeffs->dc_code_ = GenerateDCDiffCode(dc, &DCs_[c]);
1540
+ AddEntropyStats(coeffs, run_levels);
1541
+ if (reuse_run_levels_) {
1542
+ nb_run_levels_ += coeffs->nb_coeffs_;
1543
+ ++coeffs;
1544
+ assert(coeffs <= &base_coeffs[nb_mbs]);
1545
+ }
1546
+ in += 64;
1547
+ assert(nb_run_levels_ <= max_run_levels_);
1548
+ }
1549
+ }
1550
+ }
1551
+ }
1552
+
1553
+ CompileEntropyStats();
1554
+ WriteDHT();
1555
+ WriteSOS();
1556
+
1557
+ if (!reuse_run_levels_) {
1558
+ SinglePassScan(); // redo everything, but with optimal tables now.
1559
+ } else {
1560
+ // Re-use the saved run/levels for fast 2nd-pass.
1561
+ FinalPassScan(nb_mbs, base_coeffs);
1562
+ }
1563
+ End:
1564
+ Free(base_coeffs);
1565
+ }
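GenerateDCDiffCode() above codes each block's DC coefficient differentially against the previous DC of the same component, with DCs_[] as the running state (reset by ResetDCs()). A minimal standalone sketch of that prediction step, with illustrative names and without the size-category derivation done by the real code:

struct DCPredictor { int last_dc[3] = { 0, 0, 0 }; };  // one predictor per component, zeroed per scan

// Returns the value to entropy-code for this block's DC, and updates the
// predictor so the next block of the same component is coded against it.
static int DCDiff(DCPredictor* const p, int comp, int dc) {
  const int diff = dc - p->last_dc[comp];
  p->last_dc[comp] = dc;
  return diff;
}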
1566
+
1567
+ ////////////////////////////////////////////////////////////////////////////////
1568
+ // main call
1569
+
1570
+ bool Encoder::Encode() {
1571
+ if (!ok_) return false;
1572
+
1573
+ FinalizeQuantMatrix(&quants_[0], q_bias_);
1574
+ FinalizeQuantMatrix(&quants_[1], q_bias_);
1575
+ SetCostCodes(0);
1576
+ SetCostCodes(1);
1577
+
1578
+ // default tables
1579
+ for (int i = 0; i < 4; ++i) Huffman_tables_[i] = &kHuffmanTables[i];
1580
+
1581
+ // colorspace init
1582
+ InitComponents();
1583
+ assert(nb_comps_ <= MAX_COMP);
1584
+ assert(mcu_blocks_ <= 6);
1585
+ // validate some input parameters
1586
+ if (W_ <= 0 || H_ <= 0 || rgb_ == nullptr) return false;
1587
+
1588
+ mb_w_ = (W_ + (block_w_ - 1)) / block_w_;
1589
+ mb_h_ = (H_ + (block_h_ - 1)) / block_h_;
1590
+ const size_t nb_blocks = use_extra_memory_ ? mb_w_ * mb_h_ : 1;
1591
+ if (!AllocateBlocks(nb_blocks * mcu_blocks_)) return false;
1592
+
1593
+ WriteAPP0();
1594
+
1595
+ // custom markers written 'as is'
1596
+ if (!WriteAPPMarkers(app_markers_)) return false;
1597
+
1598
+ // metadata
1599
+ if (!WriteEXIF(exif_) || !WriteICCP(iccp_) || !WriteXMP(xmp_)) return false;
1600
+
1601
+ if (passes_ > 1) {
1602
+ LoopScan();
1603
+ } else {
1604
+ if (use_adaptive_quant_) {
1605
+ // Histogram analysis + derive optimal quant matrices
1606
+ CollectHistograms();
1607
+ AnalyseHisto();
1608
+ }
1609
+
1610
+ WriteDQT();
1611
+ WriteSOF();
1612
+
1613
+ if (optimize_size_) {
1614
+ SinglePassScanOptimized();
1615
+ } else {
1616
+ WriteDHT();
1617
+ WriteSOS();
1618
+ SinglePassScan();
1619
+ }
1620
+ }
1621
+ WriteEOI();
1622
+ ok_ = ok_ && bw_.Finalize();
1623
+
1624
+ DesallocateBlocks();
1625
+ return ok_;
1626
+ }
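For orientation, the single-pass path of Encode() emits the stream in the order of the calls above; whether the initial SOI marker is written inside WriteAPP0() is not visible in this file, so that detail is an assumption:

// Illustrative stream layout for the single-pass case:
//   [SOI] APP0 (JFIF) -> custom APPn markers -> EXIF / ICCP / XMP
//   -> DQT -> SOF -> DHT -> SOS -> entropy-coded scan -> EOI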
1627
+
1628
+ ////////////////////////////////////////////////////////////////////////////////
1629
+ // Edge replication
1630
+
1631
+ namespace {
1632
+
1633
+ int GetAverage(const int16_t* const out) {
1634
+ int DC = 0;
1635
+ for (int i = 0; i < 64; ++i) DC += out[i];
1636
+ return (DC + 32) >> 6;
1637
+ }
1638
+
1639
+ void SetAverage(int DC, int16_t* const out) {
1640
+ for (int i = 0; i < 64; ++i) out[i] = DC;
1641
+ }
1642
+
1643
+ } // anonymous namespace
1644
+
1645
+ void Encoder::AverageExtraLuma(int sub_w, int sub_h, int16_t* out) {
1646
+ // out[] points to four 8x8 blocks. When one of these blocks lies entirely
1647
+ // outside of the frame, we set it flat to the average value ("DC") of the
1648
+ // previous block, in order to help compressibility.
1649
+ int DC = GetAverage(out);
1650
+ if (sub_w <= 8) { // set block #1 to block #0's average value
1651
+ SetAverage(DC, out + 1 * 64);
1652
+ }
1653
+ if (sub_h <= 8) { // Need to flatten block #2 and #3
1654
+ if (sub_w > 8) { // block #1 was not flattened, so get its real DC
1655
+ DC = GetAverage(out + 1 * 64);
1656
+ }
1657
+ SetAverage(DC, out + 2 * 64);
1658
+ SetAverage(DC, out + 3 * 64);
1659
+ } else if (sub_w <= 8) { // set block #3 to block #2's average value
1660
+ DC = GetAverage(out + 2 * 64);
1661
+ SetAverage(DC, out + 3 * 64);
1662
+ }
1663
+ }
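The four luma blocks are laid out as #0 = top-left, #1 = top-right, #2 = bottom-left, #3 = bottom-right of the 16x16 macroblock. A tiny hypothetical helper mirroring the sub_w/sub_h tests above, returning which blocks contain only replicated (out-of-frame) samples:

// Returns a bitmask of the 8x8 luma blocks that lie entirely outside the
// visible sub_w x sub_h area of a 16x16 macroblock.
static int OutsideLumaBlocks(int sub_w, int sub_h) {
  int mask = 0;
  if (sub_w <= 8) mask |= (1 << 1) | (1 << 3);  // right column is padding
  if (sub_h <= 8) mask |= (1 << 2) | (1 << 3);  // bottom row is padding
  return mask;
}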
1664
+
1665
+ const uint8_t* Encoder::GetReplicatedSamples(const uint8_t* rgb,
1666
+ int rgb_step,
1667
+ int sub_w, int sub_h,
1668
+ int w, int h) {
1669
+ assert(sub_w > 0 && sub_h > 0);
1670
+ if (sub_w > w) {
1671
+ sub_w = w;
1672
+ }
1673
+ if (sub_h > h) {
1674
+ sub_h = h;
1675
+ }
1676
+ uint8_t* dst = replicated_buffer_;
1677
+ for (int y = 0; y < sub_h; ++y) {
1678
+ memcpy(dst, rgb, 3 * sub_w);
1679
+ const uint8_t* const src0 = &dst[3 * (sub_w - 1)];
1680
+ for (int x = 3 * sub_w; x < 3 * w; x += 3) {
1681
+ memcpy(dst + x, src0, 3);
1682
+ }
1683
+ dst += 3 * w;
1684
+ rgb += rgb_step;
1685
+ }
1686
+ const uint8_t* dst0 = dst - 3 * w;
1687
+ for (int y = sub_h; y < h; ++y) {
1688
+ memcpy(dst, dst0, 3 * w);
1689
+ dst += 3 * w;
1690
+ }
1691
+ return replicated_buffer_;
1692
+ }
1693
+
1694
+ // TODO(skal): merge with above function? Probably slower...
1695
+ const uint8_t* Encoder::GetReplicatedYUVSamples(const uint8_t* in,
1696
+ int step,
1697
+ int sub_w, int sub_h,
1698
+ int w, int h) {
1699
+ assert(sub_w > 0 && sub_h > 0);
1700
+ if (sub_w > w) {
1701
+ sub_w = w;
1702
+ }
1703
+ if (sub_h > h) {
1704
+ sub_h = h;
1705
+ }
1706
+ uint8_t* out = replicated_buffer_;
1707
+ for (int y = 0; y < sub_h; ++y) {
1708
+ int x;
1709
+ for (x = 0; x < sub_w; ++x)
1710
+ out[x] = in[x];
1711
+ for (; x < w; ++x) {
1712
+ out[x] = out[sub_w - 1];
1713
+ }
1714
+ out += w;
1715
+ in += step;
1716
+ }
1717
+ const uint8_t* const out0 = out - w;
1718
+ for (int y = sub_h; y < h; ++y) {
1719
+ memcpy(out, out0, w);
1720
+ out += w;
1721
+ }
1722
+ return replicated_buffer_;
1723
+ }
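Both replication helpers implement the same idea: repeat the last valid column to the right, then repeat the last (now complete) row downward. A self-contained single-plane sketch, assuming the valid src_w x src_h samples have already been copied into the top-left corner of the w x h tile:

#include <string.h>
#include <stdint.h>

static void ReplicateEdges(uint8_t* const tile, int w, int h,
                           int src_w, int src_h) {
  for (int y = 0; y < src_h; ++y) {
    uint8_t* const row = tile + y * w;
    for (int x = src_w; x < w; ++x) row[x] = row[src_w - 1];  // extend right
  }
  for (int y = src_h; y < h; ++y) {
    memcpy(tile + y * w, tile + (src_h - 1) * w, w);          // extend down
  }
}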
1724
+
1725
+ ////////////////////////////////////////////////////////////////////////////////
1726
+ // sub-class for YUV 4:2:0 version
1727
+
1728
+ class Encoder420 : public Encoder {
1729
+ public:
1730
+ Encoder420(int W, int H, int step, const uint8_t* const rgb,
1731
+ ByteSink* const sink)
1732
+ : Encoder(W, H, step, rgb, sink) {}
1733
+ virtual ~Encoder420() {}
1734
+ virtual void InitComponents() {
1735
+ nb_comps_ = 3;
1736
+
1737
+ quant_idx_[0] = 0;
1738
+ quant_idx_[1] = 1;
1739
+ quant_idx_[2] = 1;
1740
+
1741
+ nb_blocks_[0] = 4;
1742
+ nb_blocks_[1] = 1;
1743
+ nb_blocks_[2] = 1;
1744
+ mcu_blocks_ = 6;
1745
+
1746
+ block_w_ = 16;
1747
+ block_h_ = 16;
1748
+ block_dims_[0] = 0x22;
1749
+ block_dims_[1] = 0x11;
1750
+ block_dims_[2] = 0x11;
1751
+ }
1752
+ virtual void GetSamples(int mb_x, int mb_y, bool clipped,
1753
+ int16_t* out_blocks) {
1754
+ const uint8_t* data = rgb_ + (3 * mb_x + mb_y * step_) * 16;
1755
+ int step = step_;
1756
+ if (clipped) {
1757
+ data = GetReplicatedSamples(data, step,
1758
+ W_ - mb_x * 16, H_ - mb_y * 16, 16, 16);
1759
+ step = 3 * 16;
1760
+ }
1761
+ get_yuv_block_(data, step, out_blocks);
1762
+ if (clipped) {
1763
+ AverageExtraLuma(W_ - mb_x * 16, H_ - mb_y * 16, out_blocks);
1764
+ }
1765
+ }
1766
+ };
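With the settings above, one 4:2:0 MCU covers a 16x16 area and holds six 8x8 DCT blocks (four Y, one Cb, one Cr). A small illustrative sketch of the resulting block count for a W x H image, using the same ceiling division as Encode():

static size_t NumBlocks420(int W, int H) {
  const size_t mcu_w = (W + 15) / 16;   // number of MCU columns
  const size_t mcu_h = (H + 15) / 16;   // number of MCU rows
  return mcu_w * mcu_h * 6;             // 4 Y + 1 Cb + 1 Cr blocks per MCU
}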
1767
+
1768
+ ////////////////////////////////////////////////////////////////////////////////
1769
+ // sub-class for YUV 4:4:4 version
1770
+
1771
+ class Encoder444 : public Encoder {
1772
+ public:
1773
+ Encoder444(int W, int H, int step, const uint8_t* const rgb,
1774
+ ByteSink* const sink)
1775
+ : Encoder(W, H, step, rgb, sink) {
1776
+ SetYUVFormat(true);
1777
+ }
1778
+ virtual ~Encoder444() {}
1779
+ virtual void InitComponents() {
1780
+ nb_comps_ = 3;
1781
+
1782
+ quant_idx_[0] = 0;
1783
+ quant_idx_[1] = 1;
1784
+ quant_idx_[2] = 1;
1785
+
1786
+ nb_blocks_[0] = 1;
1787
+ nb_blocks_[1] = 1;
1788
+ nb_blocks_[2] = 1;
1789
+ mcu_blocks_ = 3;
1790
+
1791
+ block_w_ = 8;
1792
+ block_h_ = 8;
1793
+ block_dims_[0] = 0x11;
1794
+ block_dims_[1] = 0x11;
1795
+ block_dims_[2] = 0x11;
1796
+ }
1797
+ virtual void GetSamples(int mb_x, int mb_y, bool clipped, int16_t* out) {
1798
+ const uint8_t* data = rgb_ + (3 * mb_x + mb_y * step_) * 8;
1799
+ int step = step_;
1800
+ if (clipped) {
1801
+ data = GetReplicatedSamples(data, step,
1802
+ W_ - mb_x * 8, H_ - mb_y * 8, 8, 8);
1803
+ step = 3 * 8;
1804
+ }
1805
+ get_yuv_block_(data, step, out);
1806
+ }
1807
+ };
1808
+
1809
+ ////////////////////////////////////////////////////////////////////////////////
1810
+ // sub-class for the sharp YUV 4:2:0 version
1811
+
1812
+ class EncoderSharp420 : public Encoder420 {
1813
+ public:
1814
+ EncoderSharp420(int W, int H, int step, const uint8_t* const rgb,
1815
+ ByteSink* const sink)
1816
+ : Encoder420(W, H, step, rgb, sink), yuv_memory_(nullptr) {
1817
+ const int uv_w = (W + 1) >> 1;
1818
+ const int uv_h = (H + 1) >> 1;
1819
+ yuv_memory_ = Alloc<uint8_t>(W * H + 2 * uv_w * uv_h);
1820
+ if (yuv_memory_ == nullptr) return;
1821
+ y_plane_ = yuv_memory_;
1822
+ y_step_ = W;
1823
+ u_plane_ = yuv_memory_ + W * H;
1824
+ v_plane_ = u_plane_ + uv_w * uv_h;
1825
+ uv_step_ = uv_w;
1826
+ ApplySharpYUVConversion(rgb, W, H, step, y_plane_, u_plane_, v_plane_);
1827
+ }
1828
+ virtual ~EncoderSharp420() { Free(yuv_memory_); }
1829
+ virtual void GetSamples(int mb_x, int mb_y, bool clipped, int16_t* out);
1830
+
1831
+ protected:
1832
+ void GetLumaSamples(int mb_x, int mb_y, bool clipped, int16_t* out) {
1833
+ int step = y_step_;
1834
+ const uint8_t* Y1 = y_plane_ + (mb_x + mb_y * step) * 16;
1835
+ if (clipped) {
1836
+ Y1 = GetReplicatedYUVSamples(Y1, step,
1837
+ W_ - mb_x * 16, H_ - mb_y * 16, 16, 16);
1838
+ step = 16;
1839
+ }
1840
+ const uint8_t* Y2 = Y1 + 8 * step;
1841
+ for (int y = 8, n = 0; y > 0; --y) {
1842
+ for (int x = 0; x < 8; ++x, ++n) {
1843
+ out[n + 0 * 64] = Y1[x] - 128;
1844
+ out[n + 1 * 64] = Y1[x + 8] - 128;
1845
+ out[n + 2 * 64] = Y2[x] - 128;
1846
+ out[n + 3 * 64] = Y2[x + 8] - 128;
1847
+ }
1848
+ Y1 += step;
1849
+ Y2 += step;
1850
+ }
1851
+ if (clipped) {
1852
+ AverageExtraLuma(W_ - mb_x * 16, H_ - mb_y * 16, out);
1853
+ }
1854
+ }
1855
+
1856
+ private:
1857
+ uint8_t* y_plane_;
1858
+ int y_step_;
1859
+ uint8_t* u_plane_;
1860
+ uint8_t* v_plane_;
1861
+ int uv_step_;
1862
+ uint8_t* yuv_memory_;
1863
+ };
1864
+
1865
+ void EncoderSharp420::GetSamples(int mb_x, int mb_y,
1866
+ bool clipped, int16_t* out) {
1867
+ GetLumaSamples(mb_x, mb_y, clipped, out);
1868
+
1869
+ // Chroma
1870
+ const uint8_t* U = u_plane_ + (mb_x + mb_y * uv_step_) * 8;
1871
+ int step = uv_step_;
1872
+ if (clipped) {
1873
+ U = GetReplicatedYUVSamples(U, step,
1874
+ ((W_ + 1) >> 1) - mb_x * 8,
1875
+ ((H_ + 1) >> 1) - mb_y * 8, 8, 8);
1876
+ step = 8;
1877
+ }
1878
+ for (int y = 8, n = 0; y > 0; --y, U += step) {
1879
+ for (int x = 0; x < 8; ++x, ++n) {
1880
+ out[n + 4 * 64] = U[x] - 128;
1881
+ }
1882
+ }
1883
+ const uint8_t* V = v_plane_ + (mb_x + mb_y * uv_step_) * 8;
1884
+ step = uv_step_;
1885
+ if (clipped) {
1886
+ V = GetReplicatedYUVSamples(V, step,
1887
+ ((W_ + 1) >> 1) - mb_x * 8,
1888
+ ((H_ + 1) >> 1) - mb_y * 8, 8, 8);
1889
+ step = 8;
1890
+ }
1891
+ for (int y = 8, n = 0; y > 0; --y, V += step) {
1892
+ for (int x = 0; x < 8; ++x, ++n) {
1893
+ out[n + 5 * 64] = V[x] - 128;
1894
+ }
1895
+ }
1896
+ }
1897
+
1898
+ ////////////////////////////////////////////////////////////////////////////////
1899
+ // all-in-one factory to pick up the right encoder instance
1900
+
1901
+ Encoder* EncoderFactory(const uint8_t* rgb,
1902
+ int W, int H, int stride, SjpegYUVMode yuv_mode,
1903
+ ByteSink* const sink) {
1904
+ if (yuv_mode == SJPEG_YUV_AUTO) {
1905
+ yuv_mode = SjpegRiskiness(rgb, W, H, stride, nullptr);
1906
+ }
1907
+
1908
+ Encoder* enc = nullptr;
1909
+ if (yuv_mode == SJPEG_YUV_420) {
1910
+ enc = new (std::nothrow) Encoder420(W, H, stride, rgb, sink);
1911
+ } else if (yuv_mode == SJPEG_YUV_SHARP) {
1912
+ enc = new (std::nothrow) EncoderSharp420(W, H, stride, rgb, sink);
1913
+ } else {
1914
+ enc = new (std::nothrow) Encoder444(W, H, stride, rgb, sink);
1915
+ }
1916
+ if (enc == nullptr || !enc->Ok()) {
1917
+ delete enc;
1918
+ enc = nullptr;
1919
+ }
1920
+ return enc;
1921
+ }
1922
+
1923
+ } // namespace sjpeg
1924
+
1925
+ ////////////////////////////////////////////////////////////////////////////////
1926
+ // public plain-C functions
1927
+
1928
+ size_t SjpegEncode(const uint8_t* rgb, int width, int height, int stride,
1929
+ uint8_t** out_data, float quality, int method,
1930
+ SjpegYUVMode yuv_mode) {
1931
+ if (rgb == nullptr || out_data == nullptr) return 0;
1932
+ if (width <= 0 || height <= 0 || stride < 3 * width) return 0;
1933
+ *out_data = nullptr; // safety
1934
+
1935
+ MemorySink sink(width * height / 4);
1936
+ Encoder* const enc = EncoderFactory(rgb, width, height, stride, yuv_mode,
1937
+ &sink);
1938
+ if (enc == nullptr) return 0; // EncoderFactory may fail
+ enc->SetQuality(quality);
1939
+ enc->SetCompressionMethod(method);
1940
+ size_t size = 0;
1941
+ *out_data = nullptr;
1942
+ if (enc->Encode()) sink.Release(out_data, &size);
1943
+ delete enc;
1944
+ return size;
1945
+ }
1946
+
1947
+ ////////////////////////////////////////////////////////////////////////////////
1948
+
1949
+ size_t SjpegCompress(const uint8_t* rgb, int width, int height, float quality,
1950
+ uint8_t** out_data) {
1951
+ return SjpegEncode(rgb, width, height, 3 * width, out_data,
1952
+ quality, 4, SJPEG_YUV_AUTO);
1953
+ }
1954
+
1955
+ void SjpegFreeBuffer(const uint8_t* buffer) {
1956
+ delete[] buffer;
1957
+ }
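A hedged usage sketch of the plain-C entry points above; the header name sjpeg.h and the flat test image are assumptions made for illustration only:

#include <stdint.h>
#include <vector>
#include "sjpeg.h"   // assumed public header

size_t CompressExample() {
  const int w = 64, h = 48;
  std::vector<uint8_t> rgb(3 * w * h, 128);  // flat gray RGB image
  uint8_t* jpeg = nullptr;
  const size_t size = SjpegCompress(rgb.data(), w, h, 75.f, &jpeg);
  // ... use jpeg[0 .. size) ...
  SjpegFreeBuffer(jpeg);  // jpeg stays nullptr on failure; delete[] nullptr is a no-op
  return size;
}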
1958
+
1959
+ ////////////////////////////////////////////////////////////////////////////////
1960
+
1961
+ uint32_t SjpegVersion() {
1962
+ return SJPEG_VERSION;
1963
+ }
1964
+
1965
+ ////////////////////////////////////////////////////////////////////////////////
1966
+ // Parametrized call
1967
+
1968
+ EncoderParam::EncoderParam() : search_hook(nullptr), memory(nullptr) {
1969
+ Init(kDefaultQuality);
1970
+ }
1971
+
1972
+ EncoderParam::EncoderParam(float quality_factor)
1973
+ : search_hook(nullptr), memory(nullptr) {
1974
+ Init(quality_factor);
1975
+ }
1976
+
1977
+ void EncoderParam::Init(float quality_factor) {
1978
+ Huffman_compress = true;
1979
+ adaptive_quantization = true;
1980
+ use_trellis = false;
1981
+ yuv_mode = SJPEG_YUV_AUTO;
1982
+ quantization_bias = kDefaultBias;
1983
+ qdelta_max_luma = kDefaultDeltaMaxLuma;
1984
+ qdelta_max_chroma = kDefaultDeltaMaxChroma;
1985
+ adaptive_bias = false;
1986
+ SetLimitQuantization(false);
1987
+ min_quant_tolerance_ = 0;
1988
+ SetQuality(quality_factor);
1989
+ target_mode = TARGET_NONE;
1990
+ target_value = 0;
1991
+ passes = 1;
1992
+ tolerance = 1.;
1993
+ qmin = 0.;
1994
+ qmax = 100.;
1995
+ }
1996
+
1997
+ void EncoderParam::SetQuality(float quality_factor) {
1998
+ const float q = GetQFactor(quality_factor);
1999
+ sjpeg::SetQuantMatrix(kDefaultMatrices[0], q, quant_[0]);
2000
+ sjpeg::SetQuantMatrix(kDefaultMatrices[1], q, quant_[1]);
2001
+ }
2002
+
2003
+ void EncoderParam::SetQuantization(const uint8_t m[2][64],
2004
+ float reduction) {
2005
+ if (reduction <= 1.f) reduction = 1.f;
2006
+ if (m == nullptr) return;
2007
+ for (int c = 0; c < 2; ++c) {
2008
+ for (size_t i = 0; i < 64; ++i) {
2009
+ const int v = static_cast<int>(m[c][i] * 100. / reduction + .5);
2010
+ quant_[c][i] = (v > 255) ? 255u : (v < 1) ? 1u : v;
2011
+ }
2012
+ }
2013
+ }
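A quick numeric check of the scaling above, where reduction acts as a percentage (100 leaves the matrix unchanged):

// Example: an entry of 16 with reduction = 200 gives
//   static_cast<int>(16 * 100. / 200. + .5) = static_cast<int>(8.5) = 8,
// i.e. the quantizer is halved; results are clamped to the range [1, 255].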
2014
+
2015
+ void EncoderParam::SetLimitQuantization(bool limit_quantization,
2016
+ int min_quant_tolerance) {
2017
+ use_min_quant_ = limit_quantization;
2018
+ if (limit_quantization) SetMinQuantization(quant_, min_quant_tolerance);
2019
+ }
2020
+
2021
+ void EncoderParam::SetMinQuantization(const uint8_t m[2][64],
2022
+ int min_quant_tolerance) {
2023
+ use_min_quant_ = true;
2024
+ CopyQuantMatrix(m[0], min_quant_[0]);
2025
+ CopyQuantMatrix(m[1], min_quant_[1]);
2026
+ min_quant_tolerance_ = (min_quant_tolerance < 0) ? 0
2027
+ : (min_quant_tolerance > 100) ? 100
2028
+ : min_quant_tolerance;
2029
+ }
2030
+
2031
+ void EncoderParam::ResetMetadata() {
2032
+ iccp.clear();
2033
+ exif.clear();
2034
+ xmp.clear();
2035
+ app_markers.clear();
2036
+ }
2037
+
2038
+ bool Encoder::InitFromParam(const EncoderParam& param) {
2039
+ SetQuantMatrices(param.quant_);
2040
+ if (param.use_min_quant_) {
2041
+ SetMinQuantMatrices(param.min_quant_, param.min_quant_tolerance_);
2042
+ } else {
2043
+ SetDefaultMinQuantMatrices();
2044
+ }
2045
+
2046
+ int method = param.Huffman_compress ? 1 : 0;
2047
+ if (param.adaptive_quantization) method += 3;
2048
+ if (param.use_trellis) {
2049
+ method = (method == 4) ? 7 : (method == 6) ? 8 : method;
2050
+ }
2051
+
2052
+ SetCompressionMethod(method);
2053
+ SetQuantizationBias(param.quantization_bias, param.adaptive_bias);
2054
+ SetQuantizationDeltas(param.qdelta_max_luma, param.qdelta_max_chroma);
2055
+
2056
+ SetMetadata(param.iccp, Encoder::ICC);
2057
+ SetMetadata(param.exif, Encoder::EXIF);
2058
+ SetMetadata(param.xmp, Encoder::XMP);
2059
+ SetMetadata(param.app_markers, Encoder::MARKERS);
2060
+
2061
+ passes_ = (param.passes < 1) ? 1 : (param.passes > 20) ? 20 : param.passes;
2062
+ if (passes_ > 1) {
2063
+ use_extra_memory_ = true;
2064
+ reuse_run_levels_ = true;
2065
+ search_hook_ = (param.search_hook == nullptr) ? &default_hook_
2066
+ : param.search_hook;
2067
+ if (!search_hook_->Setup(param)) return false;
2068
+ }
2069
+
2070
+ memory_hook_ = (param.memory == nullptr) ? &kDefaultMemory : param.memory;
2071
+ return true;
2072
+ }
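Read directly off the code above, the reachable method values map to the option flags as follows (the method == 6 branch cannot be produced by these three flags):

//  Huffman_compress  adaptive_quantization  use_trellis   resulting method
//        false              false              any               0
//        true               false              any               1
//        false              true               any               3
//        true               true               false             4
//        true               true               true              7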
2073
+
2074
+ bool sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
2075
+ const EncoderParam& param, ByteSink* sink) {
2076
+ if (rgb == nullptr || sink == nullptr) return false;
2077
+ if (width <= 0 || height <= 0 || stride < 3 * width) return false;
2078
+
2079
+ Encoder* const enc = EncoderFactory(rgb, width, height, stride,
2080
+ param.yuv_mode, sink);
2081
+ const bool ok = (enc != nullptr) &&
2082
+ enc->InitFromParam(param) &&
2083
+ enc->Encode();
2084
+ delete enc;
2085
+ return ok;
2086
+ }
2087
+
2088
+ size_t sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
2089
+ const EncoderParam& param, uint8_t** out_data) {
2090
+ MemorySink sink(width * height / 4); // estimation of output size
2091
+ if (!sjpeg::Encode(rgb, width, height, stride, param, &sink)) return 0;
2092
+ size_t size;
2093
+ sink.Release(out_data, &size);
2094
+ return size;
2095
+ }
2096
+
2097
+ ////////////////////////////////////////////////////////////////////////////////
2098
+ // std::string variants
2099
+
2100
+ bool sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
2101
+ const EncoderParam& param, std::string* output) {
2102
+ if (output == nullptr) return false;
2103
+ output->clear();
2104
+ output->reserve(width * height / 4);
2105
+ StringSink sink(output);
2106
+ return Encode(rgb, width, height, stride, param, &sink);
2107
+ }
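A hedged usage sketch of the C++ API above; EncoderParam is assumed to live in namespace sjpeg (consistent with the using-directive at the top of this file), and the header name and test image are illustrative:

#include <stdint.h>
#include <string>
#include <vector>
#include "sjpeg.h"   // assumed public header

bool EncodeToString(std::string* const out) {
  const int w = 128, h = 96;
  std::vector<uint8_t> rgb(3 * w * h, 200);    // flat test image
  sjpeg::EncoderParam param(80.f);             // quality factor
  param.use_trellis = true;                    // turn on trellis quantization
  return sjpeg::Encode(rgb.data(), w, h, 3 * w, param, out);
}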
2108
+
2109
+ bool SjpegCompress(const uint8_t* rgb, int width, int height,
2110
+ float quality, std::string* output) {
2111
+ EncoderParam param;
2112
+ param.SetQuality(quality);
2113
+ return Encode(rgb, width, height, 3 * width, param, output);
2114
+ }
2115
+
2116
+ ////////////////////////////////////////////////////////////////////////////////
2117
+
2118
+ bool SjpegDimensions(const std::string& jpeg_data,
2119
+ int* width, int* height, int* is_yuv420) {
2120
+ return SjpegDimensions(
2121
+ reinterpret_cast<const uint8_t*>(jpeg_data.data()),
2122
+ jpeg_data.size(), width, height, is_yuv420);
2123
+ }
2124
+
2125
+ int SjpegFindQuantizer(const std::string& jpeg_data,
2126
+ uint8_t quant[2][64]) {
2127
+ return SjpegFindQuantizer(
2128
+ reinterpret_cast<const uint8_t*>(jpeg_data.data()), jpeg_data.size(),
2129
+ quant);
2130
+ }
2131
+
2132
+ ////////////////////////////////////////////////////////////////////////////////