snappy 0.0.17 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. checksums.yaml +5 -5
  2. data/.dockerignore +2 -0
  3. data/.github/workflows/main.yml +34 -0
  4. data/.github/workflows/publish.yml +34 -0
  5. data/.gitignore +2 -1
  6. data/.gitmodules +1 -1
  7. data/Dockerfile +13 -0
  8. data/Gemfile +4 -0
  9. data/README.md +29 -5
  10. data/Rakefile +32 -29
  11. data/ext/api.c +6 -1
  12. data/ext/extconf.rb +23 -16
  13. data/lib/snappy/hadoop/reader.rb +62 -0
  14. data/lib/snappy/hadoop/writer.rb +51 -0
  15. data/lib/snappy/hadoop.rb +22 -0
  16. data/lib/snappy/reader.rb +14 -10
  17. data/lib/snappy/shim.rb +1 -1
  18. data/lib/snappy/version.rb +1 -1
  19. data/lib/snappy.rb +5 -4
  20. data/snappy.gemspec +13 -13
  21. data/test/hadoop/snappy_hadoop_reader_test.rb +115 -0
  22. data/test/hadoop/snappy_hadoop_writer_test.rb +48 -0
  23. data/test/snappy_hadoop_test.rb +26 -0
  24. data/test/snappy_reader_test.rb +148 -0
  25. data/test/snappy_test.rb +95 -0
  26. data/test/snappy_writer_test.rb +55 -0
  27. data/test/test_helper.rb +7 -0
  28. data/test.sh +3 -0
  29. data/vendor/snappy/CMakeLists.txt +297 -0
  30. data/vendor/snappy/CONTRIBUTING.md +26 -0
  31. data/vendor/snappy/NEWS +40 -0
  32. data/vendor/snappy/{README → README.md} +27 -18
  33. data/vendor/snappy/cmake/SnappyConfig.cmake.in +33 -0
  34. data/vendor/snappy/cmake/config.h.in +62 -0
  35. data/vendor/snappy/docs/README.md +72 -0
  36. data/vendor/snappy/snappy-internal.h +22 -18
  37. data/vendor/snappy/snappy-stubs-internal.cc +1 -1
  38. data/vendor/snappy/snappy-stubs-internal.h +116 -38
  39. data/vendor/snappy/snappy-stubs-public.h.in +20 -46
  40. data/vendor/snappy/snappy-test.cc +26 -22
  41. data/vendor/snappy/snappy-test.h +24 -98
  42. data/vendor/snappy/snappy.cc +380 -183
  43. data/vendor/snappy/snappy.h +14 -10
  44. data/vendor/snappy/snappy_compress_fuzzer.cc +59 -0
  45. data/vendor/snappy/snappy_uncompress_fuzzer.cc +57 -0
  46. data/vendor/snappy/snappy_unittest.cc +236 -261
  47. metadata +37 -92
  48. data/.travis.yml +0 -26
  49. data/smoke.sh +0 -8
  50. data/test/test-snappy-reader.rb +0 -129
  51. data/test/test-snappy-writer.rb +0 -55
  52. data/test/test-snappy.rb +0 -58
  53. data/vendor/snappy/ChangeLog +0 -2468
  54. data/vendor/snappy/INSTALL +0 -370
  55. data/vendor/snappy/Makefile +0 -982
  56. data/vendor/snappy/Makefile.am +0 -26
  57. data/vendor/snappy/Makefile.in +0 -982
  58. data/vendor/snappy/aclocal.m4 +0 -9738
  59. data/vendor/snappy/autogen.sh +0 -12
  60. data/vendor/snappy/autom4te.cache/output.0 +0 -18856
  61. data/vendor/snappy/autom4te.cache/output.1 +0 -18852
  62. data/vendor/snappy/autom4te.cache/requests +0 -297
  63. data/vendor/snappy/autom4te.cache/traces.0 +0 -2689
  64. data/vendor/snappy/autom4te.cache/traces.1 +0 -714
  65. data/vendor/snappy/config.guess +0 -1530
  66. data/vendor/snappy/config.h +0 -135
  67. data/vendor/snappy/config.h.in +0 -134
  68. data/vendor/snappy/config.log +0 -1640
  69. data/vendor/snappy/config.status +0 -2318
  70. data/vendor/snappy/config.sub +0 -1773
  71. data/vendor/snappy/configure +0 -18852
  72. data/vendor/snappy/configure.ac +0 -134
  73. data/vendor/snappy/depcomp +0 -688
  74. data/vendor/snappy/install-sh +0 -527
  75. data/vendor/snappy/libtool +0 -10246
  76. data/vendor/snappy/ltmain.sh +0 -9661
  77. data/vendor/snappy/m4/gtest.m4 +0 -74
  78. data/vendor/snappy/m4/libtool.m4 +0 -8001
  79. data/vendor/snappy/m4/ltoptions.m4 +0 -384
  80. data/vendor/snappy/m4/ltsugar.m4 +0 -123
  81. data/vendor/snappy/m4/ltversion.m4 +0 -23
  82. data/vendor/snappy/m4/lt~obsolete.m4 +0 -98
  83. data/vendor/snappy/missing +0 -331
  84. data/vendor/snappy/snappy-stubs-public.h +0 -100
  85. data/vendor/snappy/snappy.pc +0 -10
  86. data/vendor/snappy/snappy.pc.in +0 -10
  87. data/vendor/snappy/stamp-h1 +0 -1
@@ -30,16 +30,50 @@
  #include "snappy-internal.h"
  #include "snappy-sinksource.h"

- #if defined(__x86_64__) || defined(_M_X64)
- #include <emmintrin.h>
+ #if !defined(SNAPPY_HAVE_SSSE3)
+ // __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD
+ // support between SSE2 and AVX (so SSSE3 instructions require AVX support), and
+ // defines __AVX__ when AVX support is available.
+ #if defined(__SSSE3__) || defined(__AVX__)
+ #define SNAPPY_HAVE_SSSE3 1
+ #else
+ #define SNAPPY_HAVE_SSSE3 0
+ #endif
+ #endif // !defined(SNAPPY_HAVE_SSSE3)
+
+ #if !defined(SNAPPY_HAVE_BMI2)
+ // __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
+ // specifically, but it does define __AVX2__ when AVX2 support is available.
+ // Fortunately, AVX2 was introduced in Haswell, just like BMI2.
+ //
+ // BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
+ // GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
+ // case issuing BMI2 instructions results in a compiler error.
+ #if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
+ #define SNAPPY_HAVE_BMI2 1
+ #else
+ #define SNAPPY_HAVE_BMI2 0
  #endif
+ #endif // !defined(SNAPPY_HAVE_BMI2)
+
+ #if SNAPPY_HAVE_SSSE3
+ // Please do not replace with <x86intrin.h>. or with headers that assume more
+ // advanced SSE versions without checking with all the OWNERS.
+ #include <tmmintrin.h>
+ #endif
+
+ #if SNAPPY_HAVE_BMI2
+ // Please do not replace with <x86intrin.h>. or with headers that assume more
+ // advanced SSE versions without checking with all the OWNERS.
+ #include <immintrin.h>
+ #endif
+
  #include <stdio.h>

  #include <algorithm>
  #include <string>
  #include <vector>

-
  namespace snappy {

  using internal::COPY_1_BYTE_OFFSET;
@@ -47,7 +81,6 @@ using internal::COPY_2_BYTE_OFFSET;
  using internal::LITERAL;
  using internal::char_table;
  using internal::kMaximumTagLength;
- using internal::wordmask;

  // Any hash function will produce a valid compressed bitstream, but a good
  // hash function reduces the number of collisions and thus yields better
@@ -89,18 +122,18 @@ size_t MaxCompressedLength(size_t source_len) {
  namespace {

  void UnalignedCopy64(const void* src, void* dst) {
- memcpy(dst, src, 8);
+ char tmp[8];
+ memcpy(tmp, src, 8);
+ memcpy(dst, tmp, 8);
  }

  void UnalignedCopy128(const void* src, void* dst) {
- // TODO(alkis): Remove this when we upgrade to a recent compiler that emits
- // SSE2 moves for memcpy(dst, src, 16).
- #ifdef __SSE2__
- __m128i x = _mm_loadu_si128(static_cast<const __m128i*>(src));
- _mm_storeu_si128(static_cast<__m128i*>(dst), x);
- #else
- memcpy(dst, src, 16);
- #endif
+ // memcpy gets vectorized when the appropriate compiler options are used.
+ // For example, x86 compilers targeting SSE2+ will optimize to an SSE2 load
+ // and store.
+ char tmp[16];
+ memcpy(tmp, src, 16);
+ memcpy(dst, tmp, 16);
  }

  // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
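
The hunk above replaces the SSE2 intrinsics in UnalignedCopy64/UnalignedCopy128 with a fixed-size memcpy through a small stack buffer and leaves vectorization to the compiler. A minimal standalone sketch of the same pattern (not part of the diff; names are illustrative):

    // Fixed-size memcpy through a bounce buffer: compilers turn these calls
    // into single unaligned load/store instructions, no intrinsics needed.
    #include <cstdio>
    #include <cstring>

    static void CopyUnaligned64(const void* src, void* dst) {
      char tmp[8];                // bounce buffer keeps the two copies simple
      std::memcpy(tmp, src, 8);   // one 8-byte load
      std::memcpy(dst, tmp, 8);   // one 8-byte store
    }

    int main() {
      char buf[16] = "abcdefgh";
      CopyUnaligned64(buf, buf + 8);   // copy 8 bytes within the same buffer
      std::printf("%.16s\n", buf);     // prints "abcdefghabcdefgh"
      return 0;
    }
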
@@ -115,12 +148,35 @@ void UnalignedCopy128(const void* src, void* dst) {
  // Note that this does not match the semantics of either memcpy() or memmove().
  inline char* IncrementalCopySlow(const char* src, char* op,
  char* const op_limit) {
+ // TODO: Remove pragma when LLVM is aware this
+ // function is only called in cold regions and when cold regions don't get
+ // vectorized or unrolled.
+ #ifdef __clang__
+ #pragma clang loop unroll(disable)
+ #endif
  while (op < op_limit) {
  *op++ = *src++;
  }
  return op_limit;
  }

+ #if SNAPPY_HAVE_SSSE3
+
+ // This is a table of shuffle control masks that can be used as the source
+ // operand for PSHUFB to permute the contents of the destination XMM register
+ // into a repeating byte pattern.
+ alignas(16) const char pshufb_fill_patterns[7][16] = {
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
+ {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
+ {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
+ {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
+ };
+
+ #endif // SNAPPY_HAVE_SSSE3
+
  // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than
  // IncrementalCopySlow. buf_limit is the address past the end of the writable
  // region of the buffer.
@@ -132,9 +188,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
  // pat = op - src
  // len = limit - op
  assert(src < op);
+ assert(op <= op_limit);
  assert(op_limit <= buf_limit);
  // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
- // to optimize this function but we have to also handle these cases in case
+ // to optimize this function but we have to also handle other cases in case
  // the input does not satisfy these conditions.

  size_t pattern_size = op - src;
@@ -163,26 +220,56 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
  // copying 2x 8 bytes at a time.

  // Handle the uncommon case where pattern is less than 8 bytes.
- if (PREDICT_FALSE(pattern_size < 8)) {
- // Expand pattern to at least 8 bytes. The worse case scenario in terms of
- // buffer usage is when the pattern is size 3. ^ is the original position
- // of op. x are irrelevant bytes copied by the last UnalignedCopy64.
+ if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
+ #if SNAPPY_HAVE_SSSE3
+ // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
+ // to permute the register's contents in-place into a repeating sequence of
+ // the first "pattern_size" bytes.
+ // For example, suppose:
+ // src == "abc"
+ // op == op + 3
+ // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
+ // followed by one byte of slop: abcabcabcabcabca.
  //
- // abc
- // abcabcxxxxx
- // abcabcabcabcxxxxx
- // ^
- // The last x is 14 bytes after ^.
- if (PREDICT_TRUE(op <= buf_limit - 14)) {
+ // The non-SSE fallback implementation suffers from store-forwarding stalls
+ // because its loads and stores partly overlap. By expanding the pattern
+ // in-place, we avoid the penalty.
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
+ const __m128i shuffle_mask = _mm_load_si128(
+ reinterpret_cast<const __m128i*>(pshufb_fill_patterns)
+ + pattern_size - 1);
+ const __m128i pattern = _mm_shuffle_epi8(
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
+ // Uninitialized bytes are masked out by the shuffle mask.
+ // TODO: remove annotation and macro defs once MSan is fixed.
+ SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
+ pattern_size *= 16 / pattern_size;
+ char* op_end = std::min(op_limit, buf_limit - 15);
+ while (op < op_end) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+ op += pattern_size;
+ }
+ if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
+ }
+ return IncrementalCopySlow(src, op, op_limit);
+ #else // !SNAPPY_HAVE_SSSE3
+ // If plenty of buffer space remains, expand the pattern to at least 8
+ // bytes. The way the following loop is written, we need 8 bytes of buffer
+ // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
+ // bytes if pattern_size is 2. Precisely encoding that is probably not
+ // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
+ // (because 11 are required in the worst case).
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
  while (pattern_size < 8) {
  UnalignedCopy64(src, op);
  op += pattern_size;
  pattern_size *= 2;
  }
- if (PREDICT_TRUE(op >= op_limit)) return op_limit;
+ if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
  } else {
  return IncrementalCopySlow(src, op, op_limit);
  }
+ #endif // SNAPPY_HAVE_SSSE3
  }
  assert(pattern_size >= 8);
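
The PSHUFB path above expands a short repeating pattern into a full 16-byte block so it can be written with single unaligned 128-bit stores. A scalar sketch of what that shuffle computes (not from the diff; a reference model, not the shipped code):

    // Byte i of the shuffled result is src[i % pattern_size], i.e. the first
    // pattern_size bytes repeated end to end across 16 bytes.
    #include <cstdio>

    static void FillPattern16(const char* src, size_t pattern_size, char out[16]) {
      for (size_t i = 0; i < 16; ++i) out[i] = src[i % pattern_size];
    }

    int main() {
      char block[16];
      FillPattern16("abc", 3, block);
      std::printf("%.16s\n", block);  // abcabcabcabcabca, as in the comment above
      return 0;
    }

After each 16-byte store, op advances by the largest multiple of pattern_size that fits in 16 bytes (the `pattern_size *= 16 / pattern_size;` line), so consecutive stores stay aligned to the pattern.
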

@@ -190,16 +277,51 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
  // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
  // because expanding the pattern to at least 8 bytes guarantees that
  // op - src >= 8.
- while (op <= buf_limit - 16) {
+ //
+ // Typically, the op_limit is the gating factor so try to simplify the loop
+ // based on that.
+ if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
+ // There is at least one, and at most four 16-byte blocks. Writing four
+ // conditionals instead of a loop allows FDO to layout the code with respect
+ // to the actual probabilities of each length.
+ // TODO: Replace with loop with trip count hint.
+ UnalignedCopy64(src, op);
+ UnalignedCopy64(src + 8, op + 8);
+
+ if (op + 16 < op_limit) {
+ UnalignedCopy64(src + 16, op + 16);
+ UnalignedCopy64(src + 24, op + 24);
+ }
+ if (op + 32 < op_limit) {
+ UnalignedCopy64(src + 32, op + 32);
+ UnalignedCopy64(src + 40, op + 40);
+ }
+ if (op + 48 < op_limit) {
+ UnalignedCopy64(src + 48, op + 48);
+ UnalignedCopy64(src + 56, op + 56);
+ }
+ return op_limit;
+ }
+
+ // Fall back to doing as much as we can with the available slop in the
+ // buffer. This code path is relatively cold however so we save code size by
+ // avoiding unrolling and vectorizing.
+ //
+ // TODO: Remove pragma when when cold regions don't get vectorized
+ // or unrolled.
+ #ifdef __clang__
+ #pragma clang loop unroll(disable)
+ #endif
+ for (char *op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
  UnalignedCopy64(src, op);
  UnalignedCopy64(src + 8, op + 8);
- src += 16;
- op += 16;
- if (PREDICT_TRUE(op >= op_limit)) return op_limit;
  }
+ if (op >= op_limit)
+ return op_limit;
+
  // We only take this branch if we didn't have enough slop and we can do a
  // single 8 byte copy.
- if (PREDICT_FALSE(op <= buf_limit - 8)) {
+ if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
  UnalignedCopy64(src, op);
  src += 8;
  op += 8;
@@ -209,10 +331,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,

  } // namespace

+ template <bool allow_fast_path>
  static inline char* EmitLiteral(char* op,
  const char* literal,
- int len,
- bool allow_fast_path) {
+ int len) {
  // The vast majority of copies are below 16 bytes, for which a
  // call to memcpy is overkill. This fast path can sometimes
  // copy up to 15 bytes too much, but that is okay in the
@@ -237,31 +359,29 @@ static inline char* EmitLiteral(char* op,
  // Fits in tag byte
  *op++ = LITERAL | (n << 2);
  } else {
- // Encode in upcoming bytes
- char* base = op;
- int count = 0;
- op++;
- while (n > 0) {
- *op++ = n & 0xff;
- n >>= 8;
- count++;
- }
+ int count = (Bits::Log2Floor(n) >> 3) + 1;
  assert(count >= 1);
  assert(count <= 4);
- *base = LITERAL | ((59+count) << 2);
+ *op++ = LITERAL | ((59 + count) << 2);
+ // Encode in upcoming bytes.
+ // Write 4 bytes, though we may care about only 1 of them. The output buffer
+ // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds
+ // here and there is a memcpy of size 'len' below.
+ LittleEndian::Store32(op, n);
+ op += count;
  }
  memcpy(op, literal, len);
  return op + len;
  }

- static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
- bool len_less_than_12) {
+ template <bool len_less_than_12>
+ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
  assert(len <= 64);
  assert(len >= 4);
  assert(offset < 65536);
  assert(len_less_than_12 == (len < 12));

- if (len_less_than_12 && PREDICT_TRUE(offset < 2048)) {
+ if (len_less_than_12 && SNAPPY_PREDICT_TRUE(offset < 2048)) {
  // offset fits in 11 bits. The 3 highest go in the top of the first byte,
  // and the rest go in the second byte.
  *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0);
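
The new EmitLiteral replaces the old byte-stripping loop with a closed-form byte count, `(Bits::Log2Floor(n) >> 3) + 1`, then writes a whole little-endian 32-bit word and advances by that count. A small sketch checking the two computations agree (not from the diff; Log2Floor below is a portable stand-in for snappy's Bits::Log2Floor):

    #include <cassert>
    #include <cstdint>

    static int Log2Floor(uint32_t n) {           // index of the highest set bit
      int log = 0;
      while (n >>= 1) ++log;
      return log;
    }

    static int CountBytesLoop(uint32_t n) {      // old approach: strip 8 bits at a time
      int count = 0;
      while (n > 0) { n >>= 8; ++count; }
      return count;
    }

    int main() {
      // EmitLiteral only takes this path for lengths >= 61, so n = len - 1 >= 60.
      for (uint32_t n = 60; n <= (1u << 24); n += 997) {
        assert(((Log2Floor(n) >> 3) + 1) == CountBytesLoop(n));
      }
      return 0;
    }
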
@@ -276,29 +396,33 @@ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
  return op;
  }

- static inline char* EmitCopy(char* op, size_t offset, size_t len,
- bool len_less_than_12) {
+ template <bool len_less_than_12>
+ static inline char* EmitCopy(char* op, size_t offset, size_t len) {
  assert(len_less_than_12 == (len < 12));
  if (len_less_than_12) {
- return EmitCopyAtMost64(op, offset, len, true);
+ return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
  } else {
  // A special case for len <= 64 might help, but so far measurements suggest
  // it's in the noise.

  // Emit 64 byte copies but make sure to keep at least four bytes reserved.
- while (PREDICT_FALSE(len >= 68)) {
- op = EmitCopyAtMost64(op, offset, 64, false);
+ while (SNAPPY_PREDICT_FALSE(len >= 68)) {
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
  len -= 64;
  }

  // One or two copies will now finish the job.
  if (len > 64) {
- op = EmitCopyAtMost64(op, offset, 60, false);
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
  len -= 60;
  }

  // Emit remainder.
- op = EmitCopyAtMost64(op, offset, len, len < 12);
+ if (len < 12) {
+ op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
+ } else {
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
+ }
  return op;
  }
  }
@@ -314,31 +438,45 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
  }
  }

- namespace internal {
- uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
- // Use smaller hash table when input.size() is smaller, since we
- // fill the table, incurring O(hash table size) overhead for
- // compression, and if the input is short, we won't need that
- // many hash table entries anyway.
- assert(kMaxHashTableSize >= 256);
- size_t htsize = 256;
- while (htsize < kMaxHashTableSize && htsize < input_size) {
- htsize <<= 1;
+ namespace {
+ uint32 CalculateTableSize(uint32 input_size) {
+ static_assert(
+ kMaxHashTableSize >= kMinHashTableSize,
+ "kMaxHashTableSize should be greater or equal to kMinHashTableSize.");
+ if (input_size > kMaxHashTableSize) {
+ return kMaxHashTableSize;
  }
-
- uint16* table;
- if (htsize <= ARRAYSIZE(small_table_)) {
- table = small_table_;
- } else {
- if (large_table_ == NULL) {
- large_table_ = new uint16[kMaxHashTableSize];
- }
- table = large_table_;
+ if (input_size < kMinHashTableSize) {
+ return kMinHashTableSize;
  }
+ // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
+ // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
+ return 2u << Bits::Log2Floor(input_size - 1);
+ }
+ } // namespace
+
+ namespace internal {
+ WorkingMemory::WorkingMemory(size_t input_size) {
+ const size_t max_fragment_size = std::min(input_size, kBlockSize);
+ const size_t table_size = CalculateTableSize(max_fragment_size);
+ size_ = table_size * sizeof(*table_) + max_fragment_size +
+ MaxCompressedLength(max_fragment_size);
+ mem_ = std::allocator<char>().allocate(size_);
+ table_ = reinterpret_cast<uint16*>(mem_);
+ input_ = mem_ + table_size * sizeof(*table_);
+ output_ = input_ + max_fragment_size;
+ }
+
+ WorkingMemory::~WorkingMemory() {
+ std::allocator<char>().deallocate(mem_, size_);
+ }

+ uint16* WorkingMemory::GetHashTable(size_t fragment_size,
+ int* table_size) const {
+ const size_t htsize = CalculateTableSize(fragment_size);
+ memset(table_, 0, htsize * sizeof(*table_));
  *table_size = htsize;
- memset(table, 0, htsize * sizeof(*table));
- return table;
+ return table_;
  }
  } // end namespace internal
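
The reworked WorkingMemory sizes everything up front from the fragment size (CalculateTableSize clamps to the min/max and otherwise rounds up to the next power of two) and carves a single allocation into hash table, input scratch and output scratch. A rough sketch of that layout (not from the diff; the sizes are illustrative, with kBlockSize = 64 KiB and MaxCompressedLength(n) = 32 + n + n/6 as in snappy):

    #include <cstdint>
    #include <cstdio>
    #include <memory>

    int main() {
      const size_t kBlockSize = 1 << 16;                              // one compression fragment
      const size_t table_entries = 1 << 14;                           // power of two
      const size_t max_compressed = 32 + kBlockSize + kBlockSize / 6; // MaxCompressedLength(64 KiB)

      const size_t total = table_entries * sizeof(uint16_t) + kBlockSize + max_compressed;
      std::unique_ptr<char[]> mem(new char[total]);

      // Same carving order as the constructor above: table, then input, then output.
      uint16_t* table = reinterpret_cast<uint16_t*>(mem.get());
      char* input_scratch = mem.get() + table_entries * sizeof(uint16_t);
      char* output_scratch = input_scratch + kBlockSize;

      std::printf("one allocation of %zu bytes: table=%p input=%p output=%p\n", total,
                  static_cast<void*>(table), static_cast<void*>(input_scratch),
                  static_cast<void*>(output_scratch));
      return 0;
    }

This is what lets the new Compress() below drop its lazily allocated `scratch` and `scratch_output` buffers and call wmem.GetScratchInput()/GetScratchOutput() instead.
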

@@ -405,7 +543,7 @@ char* CompressFragment(const char* input,
  // "ip" is the input pointer, and "op" is the output pointer.
  const char* ip = input;
  assert(input_size <= kBlockSize);
- assert((table_size & (table_size - 1)) == 0); // table must be power of two
+ assert((table_size & (table_size - 1)) == 0);  // table must be power of two
  const int shift = 32 - Bits::Log2Floor(table_size);
  assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
  const char* ip_end = input + input_size;
@@ -415,7 +553,7 @@ char* CompressFragment(const char* input,
  const char* next_emit = ip;

  const size_t kInputMarginBytes = 15;
- if (PREDICT_TRUE(input_size >= kInputMarginBytes)) {
+ if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
  const char* ip_limit = input + input_size - kInputMarginBytes;

  for (uint32 next_hash = Hash(++ip, shift); ; ) {
@@ -456,7 +594,7 @@ char* CompressFragment(const char* input,
  uint32 bytes_between_hash_lookups = skip >> 5;
  skip += bytes_between_hash_lookups;
  next_ip = ip + bytes_between_hash_lookups;
- if (PREDICT_FALSE(next_ip > ip_limit)) {
+ if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
  goto emit_remainder;
  }
  next_hash = Hash(next_ip, shift);
@@ -465,14 +603,14 @@ char* CompressFragment(const char* input,
  assert(candidate < ip);

  table[hash] = ip - base_ip;
- } while (PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
- UNALIGNED_LOAD32(candidate)));
+ } while (SNAPPY_PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
+ UNALIGNED_LOAD32(candidate)));

  // Step 2: A 4-byte match has been found. We'll later see if more
  // than 4 bytes match. But, prior to the match, input
  // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
  assert(next_emit + 16 <= ip_end);
- op = EmitLiteral(op, next_emit, ip - next_emit, true);
+ op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);

  // Step 3: Call EmitCopy, and then see if another EmitCopy could
  // be our next move. Repeat until we find no match for the
@@ -495,9 +633,13 @@ char* CompressFragment(const char* input,
  ip += matched;
  size_t offset = base - candidate;
  assert(0 == memcmp(base, candidate, matched));
- op = EmitCopy(op, offset, matched, p.second);
+ if (p.second) {
+ op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
+ } else {
+ op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
+ }
  next_emit = ip;
- if (PREDICT_FALSE(ip >= ip_limit)) {
+ if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
  goto emit_remainder;
  }
  // We are now looking for a 4-byte match again. We read
@@ -520,13 +662,18 @@ char* CompressFragment(const char* input,
  emit_remainder:
  // Emit the remaining bytes as a literal
  if (next_emit < ip_end) {
- op = EmitLiteral(op, next_emit, ip_end - next_emit, false);
+ op = EmitLiteral</*allow_fast_path=*/false>(op, next_emit,
+ ip_end - next_emit);
  }

  return op;
  }
  } // end namespace internal

+ // Called back at avery compression call to trace parameters and sizes.
+ static inline void Report(const char *algorithm, size_t compressed_size,
+ size_t uncompressed_size) {}
+
  // Signature of output types needed by decompression code.
  // The decompression code is templatized on a type that obeys this
  // signature so that we do not pay virtual function call overhead in
@@ -567,6 +714,28 @@ char* CompressFragment(const char* input,
  // bool TryFastAppend(const char* ip, size_t available, size_t length);
  // };

+ static inline uint32 ExtractLowBytes(uint32 v, int n) {
+ assert(n >= 0);
+ assert(n <= 4);
+ #if SNAPPY_HAVE_BMI2
+ return _bzhi_u32(v, 8 * n);
+ #else
+ // This needs to be wider than uint32 otherwise `mask << 32` will be
+ // undefined.
+ uint64 mask = 0xffffffff;
+ return v & ~(mask << (8 * n));
+ #endif
+ }
+
+ static inline bool LeftShiftOverflows(uint8 value, uint32 shift) {
+ assert(shift < 32);
+ static const uint8 masks[] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
+ return (value & masks[shift]) != 0;
+ }

  // Helper class for decompression
  class SnappyDecompressor {
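
ExtractLowBytes above replaces the old global `wordmask` table: it keeps only the low n bytes of a 32-bit word, mapping to a single BZHI instruction (`_bzhi_u32(v, 8 * n)`) when BMI2 is available and otherwise using a 64-bit mask so the `mask << 32` case for n == 4 stays defined. A quick sketch checking the portable fallback against a wordmask-style table (not from the diff):

    #include <cassert>
    #include <cstdint>

    static uint32_t ExtractLowBytesPortable(uint32_t v, int n) {  // n in [0, 4]
      uint64_t mask = 0xffffffff;          // 64-bit so that mask << 32 is defined
      return static_cast<uint32_t>(v & ~(mask << (8 * n)));
    }

    int main() {
      const uint32_t wordmask[] = {0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu};
      const uint32_t v = 0x12345678u;
      for (int n = 0; n <= 4; ++n) {
        assert(ExtractLowBytesPortable(v, n) == (v & wordmask[n]));
      }
      return 0;
    }
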
@@ -605,7 +774,7 @@ class SnappyDecompressor {
  }

  // Read the uncompressed length stored at the start of the compressed data.
- // On succcess, stores the length in *result and returns true.
+ // On success, stores the length in *result and returns true.
  // On failure, returns false.
  bool ReadUncompressedLength(uint32* result) {
  assert(ip_ == NULL); // Must not have read anything yet
@@ -620,7 +789,7 @@ class SnappyDecompressor {
  const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
  reader_->Skip(1);
  uint32 val = c & 0x7f;
- if (((val << shift) >> shift) != val) return false;
+ if (LeftShiftOverflows(static_cast<uint8>(val), shift)) return false;
  *result |= val << shift;
  if (c < 128) {
  break;
@@ -633,13 +802,27 @@ class SnappyDecompressor {
  // Process the next item found in the input.
  // Returns true if successful, false on error or end of input.
  template <class Writer>
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __attribute__((aligned(32)))
+ #endif
  void DecompressAllTags(Writer* writer) {
- const char* ip = ip_;
- // For position-independent executables, accessing global arrays can be
- // slow. Move wordmask array onto the stack to mitigate this.
- uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)];
- memcpy(wordmask, internal::wordmask, sizeof(wordmask));
+ // In x86, pad the function body to start 16 bytes later. This function has
+ // a couple of hotspots that are highly sensitive to alignment: we have
+ // observed regressions by more than 20% in some metrics just by moving the
+ // exact same code to a different position in the benchmark binary.
+ //
+ // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit
+ // the "lucky" case consistently. Unfortunately, this is a very brittle
+ // workaround, and future differences in code generation may reintroduce
+ // this regression. If you experience a big, difficult to explain, benchmark
+ // performance regression here, first try removing this hack.
+ #if defined(__GNUC__) && defined(__x86_64__)
+ // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions.
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+ #endif

+ const char* ip = ip_;
  // We could have put this refill fragment only at the beginning of the loop.
  // However, duplicating it at the end of each branch gives the compiler more
  // scope to optimize the <ip_limit_ - ip> expression based on the local
@@ -667,21 +850,22 @@ class SnappyDecompressor {
  // txt[1-4] 25% 75%
  // pb 24% 76%
  // bin 24% 76%
- if (PREDICT_FALSE((c & 0x3) == LITERAL)) {
+ if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) {
  size_t literal_length = (c >> 2) + 1u;
  if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
  assert(literal_length < 61);
  ip += literal_length;
- // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
+ // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
  // will not return true unless there's already at least five spare
  // bytes in addition to the literal.
  continue;
  }
- if (PREDICT_FALSE(literal_length >= 61)) {
+ if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) {
  // Long literal.
  const size_t literal_length_length = literal_length - 60;
  literal_length =
- (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
+ ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +
+ 1;
  ip += literal_length_length;
  }

@@ -704,7 +888,8 @@ class SnappyDecompressor {
  MAYBE_REFILL();
  } else {
  const size_t entry = char_table[c];
- const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+ const size_t trailer =
+ ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11);
  const size_t length = entry & 0xff;
  ip += entry >> 11;

@@ -757,7 +942,7 @@ bool SnappyDecompressor::RefillTag() {
  size_t length;
  const char* src = reader_->Peek(&length);
  if (length == 0) return false;
- uint32 to_add = min<uint32>(needed - nbuf, length);
+ uint32 to_add = std::min<uint32>(needed - nbuf, length);
  memcpy(scratch_ + nbuf, src, to_add);
  nbuf += to_add;
  reader_->Skip(to_add);
@@ -786,13 +971,18 @@ static bool InternalUncompress(Source* r, Writer* writer) {
  SnappyDecompressor decompressor(r);
  uint32 uncompressed_len = 0;
  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;
- return InternalUncompressAllTags(&decompressor, writer, uncompressed_len);
+
+ return InternalUncompressAllTags(&decompressor, writer, r->Available(),
+ uncompressed_len);
  }

  template <typename Writer>
  static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
  Writer* writer,
+ uint32 compressed_len,
  uint32 uncompressed_len) {
+ Report("snappy_uncompress", compressed_len, uncompressed_len);
+
  writer->SetExpectedLength(uncompressed_len);

  // Process the entire input
@@ -809,21 +999,20 @@ bool GetUncompressedLength(Source* source, uint32* result) {
  size_t Compress(Source* reader, Sink* writer) {
  size_t written = 0;
  size_t N = reader->Available();
+ const size_t uncompressed_size = N;
  char ulength[Varint::kMax32];
  char* p = Varint::Encode32(ulength, N);
  writer->Append(ulength, p-ulength);
  written += (p - ulength);

- internal::WorkingMemory wmem;
- char* scratch = NULL;
- char* scratch_output = NULL;
+ internal::WorkingMemory wmem(N);

  while (N > 0) {
  // Get next block to compress (without copying if possible)
  size_t fragment_size;
  const char* fragment = reader->Peek(&fragment_size);
  assert(fragment_size != 0); // premature end of input
- const size_t num_to_read = min(N, kBlockSize);
+ const size_t num_to_read = std::min(N, kBlockSize);
  size_t bytes_read = fragment_size;

  size_t pending_advance = 0;
@@ -832,19 +1021,13 @@ size_t Compress(Source* reader, Sink* writer) {
  pending_advance = num_to_read;
  fragment_size = num_to_read;
  } else {
- // Read into scratch buffer
- if (scratch == NULL) {
- // If this is the last iteration, we want to allocate N bytes
- // of space, otherwise the max possible kBlockSize space.
- // num_to_read contains exactly the correct value
- scratch = new char[num_to_read];
- }
+ char* scratch = wmem.GetScratchInput();
  memcpy(scratch, fragment, bytes_read);
  reader->Skip(bytes_read);

  while (bytes_read < num_to_read) {
  fragment = reader->Peek(&fragment_size);
- size_t n = min<size_t>(fragment_size, num_to_read - bytes_read);
+ size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
  memcpy(scratch + bytes_read, fragment, n);
  bytes_read += n;
  reader->Skip(n);
@@ -864,16 +1047,13 @@ size_t Compress(Source* reader, Sink* writer) {

  // Need a scratch buffer for the output, in case the byte sink doesn't
  // have room for us directly.
- if (scratch_output == NULL) {
- scratch_output = new char[max_output];
- } else {
- // Since we encode kBlockSize regions followed by a region
- // which is <= kBlockSize in length, a previously allocated
- // scratch_output[] region is big enough for this iteration.
- }
- char* dest = writer->GetAppendBuffer(max_output, scratch_output);
- char* end = internal::CompressFragment(fragment, fragment_size,
- dest, table, table_size);
+
+ // Since we encode kBlockSize regions followed by a region
+ // which is <= kBlockSize in length, a previously allocated
+ // scratch_output[] region is big enough for this iteration.
+ char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
+ char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
+ table_size);
  writer->Append(dest, end - dest);
  written += (end - dest);

@@ -881,8 +1061,7 @@ size_t Compress(Source* reader, Sink* writer) {
  reader->Skip(pending_advance);
  }

- delete[] scratch;
- delete[] scratch_output;
+ Report("snappy_compress", written, uncompressed_size);

  return written;
  }
@@ -896,14 +1075,22 @@ size_t Compress(Source* reader, Sink* writer) {
  // Writer template argument to SnappyDecompressor::DecompressAllTags().
  class SnappyIOVecWriter {
  private:
+ // output_iov_end_ is set to iov + count and used to determine when
+ // the end of the iovs is reached.
+ const struct iovec* output_iov_end_;
+
+ #if !defined(NDEBUG)
  const struct iovec* output_iov_;
- const size_t output_iov_count_;
+ #endif // !defined(NDEBUG)
+
+ // Current iov that is being written into.
+ const struct iovec* curr_iov_;

- // We are currently writing into output_iov_[curr_iov_index_].
- size_t curr_iov_index_;
+ // Pointer to current iov's write location.
+ char* curr_iov_output_;

- // Bytes written to output_iov_[curr_iov_index_] so far.
- size_t curr_iov_written_;
+ // Remaining bytes to write into curr_iov_output.
+ size_t curr_iov_remaining_;

  // Total bytes decompressed into output_iov_ so far.
  size_t total_written_;
@@ -911,22 +1098,24 @@ class SnappyIOVecWriter {
  // Maximum number of bytes that will be decompressed into output_iov_.
  size_t output_limit_;

- inline char* GetIOVecPointer(size_t index, size_t offset) {
- return reinterpret_cast<char*>(output_iov_[index].iov_base) +
- offset;
+ static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
+ return reinterpret_cast<char*>(iov->iov_base) + offset;
  }

  public:
  // Does not take ownership of iov. iov must be valid during the
  // entire lifetime of the SnappyIOVecWriter.
  inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
- : output_iov_(iov),
- output_iov_count_(iov_count),
- curr_iov_index_(0),
- curr_iov_written_(0),
+ : output_iov_end_(iov + iov_count),
+ #if !defined(NDEBUG)
+ output_iov_(iov),
+ #endif // !defined(NDEBUG)
+ curr_iov_(iov),
+ curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
+ : nullptr),
+ curr_iov_remaining_(iov_count ? iov->iov_len : 0),
  total_written_(0),
- output_limit_(-1) {
- }
+ output_limit_(-1) {}

  inline void SetExpectedLength(size_t len) {
  output_limit_ = len;
@@ -941,23 +1130,25 @@ class SnappyIOVecWriter {
  return false;
  }

+ return AppendNoCheck(ip, len);
+ }
+
+ inline bool AppendNoCheck(const char* ip, size_t len) {
  while (len > 0) {
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
- if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
+ if (curr_iov_remaining_ == 0) {
  // This iovec is full. Go to the next one.
- if (curr_iov_index_ + 1 >= output_iov_count_) {
+ if (curr_iov_ + 1 >= output_iov_end_) {
  return false;
  }
- curr_iov_written_ = 0;
- ++curr_iov_index_;
+ ++curr_iov_;
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+ curr_iov_remaining_ = curr_iov_->iov_len;
  }

- const size_t to_write = std::min(
- len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
- memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- ip,
- to_write);
- curr_iov_written_ += to_write;
+ const size_t to_write = std::min(len, curr_iov_remaining_);
+ memcpy(curr_iov_output_, ip, to_write);
+ curr_iov_output_ += to_write;
+ curr_iov_remaining_ -= to_write;
  total_written_ += to_write;
  ip += to_write;
  len -= to_write;
@@ -969,11 +1160,11 @@ class SnappyIOVecWriter {
  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
  const size_t space_left = output_limit_ - total_written_;
  if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
- output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
+ curr_iov_remaining_ >= 16) {
  // Fast path, used for the majority (about 95%) of invocations.
- char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
- UnalignedCopy128(ip, ptr);
- curr_iov_written_ += len;
+ UnalignedCopy128(ip, curr_iov_output_);
+ curr_iov_output_ += len;
+ curr_iov_remaining_ -= len;
  total_written_ += len;
  return true;
  }
@@ -982,7 +1173,9 @@
  }

  inline bool AppendFromSelf(size_t offset, size_t len) {
- if (offset > total_written_ || offset == 0) {
+ // See SnappyArrayWriter::AppendFromSelf for an explanation of
+ // the "offset - 1u" trick.
+ if (offset - 1u >= total_written_) {
  return false;
  }
  const size_t space_left = output_limit_ - total_written_;
@@ -991,8 +1184,8 @@
  }

  // Locate the iovec from which we need to start the copy.
- size_t from_iov_index = curr_iov_index_;
- size_t from_iov_offset = curr_iov_written_;
+ const iovec* from_iov = curr_iov_;
+ size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
  while (offset > 0) {
  if (from_iov_offset >= offset) {
  from_iov_offset -= offset;
@@ -1000,47 +1193,47 @@
  }

  offset -= from_iov_offset;
- assert(from_iov_index > 0);
- --from_iov_index;
- from_iov_offset = output_iov_[from_iov_index].iov_len;
+ --from_iov;
+ #if !defined(NDEBUG)
+ assert(from_iov >= output_iov_);
+ #endif // !defined(NDEBUG)
+ from_iov_offset = from_iov->iov_len;
  }

  // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
  // the current iovec.
  while (len > 0) {
- assert(from_iov_index <= curr_iov_index_);
- if (from_iov_index != curr_iov_index_) {
- const size_t to_copy = std::min(
- output_iov_[from_iov_index].iov_len - from_iov_offset,
- len);
- Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
+ assert(from_iov <= curr_iov_);
+ if (from_iov != curr_iov_) {
+ const size_t to_copy =
+ std::min(from_iov->iov_len - from_iov_offset, len);
+ AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
  len -= to_copy;
  if (len > 0) {
- ++from_iov_index;
+ ++from_iov;
  from_iov_offset = 0;
  }
  } else {
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
- size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
- curr_iov_written_,
- len);
+ size_t to_copy = curr_iov_remaining_;
  if (to_copy == 0) {
  // This iovec is full. Go to the next one.
- if (curr_iov_index_ + 1 >= output_iov_count_) {
+ if (curr_iov_ + 1 >= output_iov_end_) {
  return false;
  }
- ++curr_iov_index_;
- curr_iov_written_ = 0;
+ ++curr_iov_;
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+ curr_iov_remaining_ = curr_iov_->iov_len;
  continue;
  }
  if (to_copy > len) {
  to_copy = len;
  }
- IncrementalCopySlow(
- GetIOVecPointer(from_iov_index, from_iov_offset),
- GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- GetIOVecPointer(curr_iov_index_, curr_iov_written_) + to_copy);
- curr_iov_written_ += to_copy;
+
+ IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
+ curr_iov_output_, curr_iov_output_ + to_copy,
+ curr_iov_output_ + curr_iov_remaining_);
+ curr_iov_output_ += to_copy;
+ curr_iov_remaining_ -= to_copy;
  from_iov_offset += to_copy;
  total_written_ += to_copy;
  len -= to_copy;
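
The SnappyIOVecWriter hunks above replace index-and-offset bookkeeping with direct pointers into the current iovec; this class backs snappy's scatter-output decompression entry point. A usage sketch of that entry point (not from the diff; POSIX-only because of <sys/uio.h>, and the buffer sizes are illustrative):

    #include <sys/uio.h>
    #include <cassert>
    #include <string>
    #include "snappy.h"

    int main() {
      const std::string input(1000, 'x');
      std::string compressed;
      snappy::Compress(input.data(), input.size(), &compressed);

      char part1[256], part2[1024];                       // scattered output buffers
      struct iovec iov[2] = {{part1, sizeof(part1)}, {part2, sizeof(part2)}};
      // Fills the iovecs in order: 256 bytes into part1, the remaining 744 into part2.
      assert(snappy::RawUncompressToIOVec(compressed.data(), compressed.size(), iov, 2));
      assert(std::string(part1, 256) + std::string(part2, 744) == input);
      return 0;
    }
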
@@ -1149,7 +1342,7 @@ bool RawUncompress(Source* compressed, char* uncompressed) {
  return InternalUncompress(compressed, &output);
  }

- bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
+ bool Uncompress(const char* compressed, size_t n, std::string* uncompressed) {
  size_t ulength;
  if (!GetUncompressedLength(compressed, n, &ulength)) {
  return false;
@@ -1217,7 +1410,8 @@ void RawCompress(const char* input,
  *compressed_length = (writer.CurrentDestination() - compressed);
  }

- size_t Compress(const char* input, size_t input_length, string* compressed) {
+ size_t Compress(const char* input, size_t input_length,
+ std::string* compressed) {
  // Pre-grow the buffer to the max length of the compressed output
  STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
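
The two hunks above change the string overloads to take std::string explicitly instead of the old unqualified `string` typedef; call sites are otherwise unaffected. A minimal round-trip sketch through these overloads (not from the diff):

    #include <cassert>
    #include <string>
    #include "snappy.h"

    int main() {
      const std::string input = "snappy snappy snappy snappy snappy";
      std::string compressed, restored;
      snappy::Compress(input.data(), input.size(), &compressed);
      assert(snappy::Uncompress(compressed.data(), compressed.size(), &restored));
      assert(restored == input);
      return 0;
    }
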

@@ -1313,7 +1507,8 @@ class SnappyScatteredWriter {
  char* const op_end = op_ptr_ + len;
  // See SnappyArrayWriter::AppendFromSelf for an explanation of
  // the "offset - 1u" trick.
- if (PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ && op_end <= op_limit_)) {
+ if (SNAPPY_PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ &&
+ op_end <= op_limit_)) {
  // Fast path: src and dst in current block.
  op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_);
  return true;
@@ -1344,7 +1539,7 @@ bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) {
  }

  // Make new block
- size_t bsize = min<size_t>(kBlockSize, expected_ - full_size_);
+ size_t bsize = std::min<size_t>(kBlockSize, expected_ - full_size_);
  op_base_ = allocator_.Allocate(bsize);
  op_ptr_ = op_base_;
  op_limit_ = op_base_ + bsize;
@@ -1401,7 +1596,7 @@ class SnappySinkAllocator {
  size_t size_written = 0;
  size_t block_size;
  for (int i = 0; i < blocks_.size(); ++i) {
- block_size = min<size_t>(blocks_[i].size, size - size_written);
+ block_size = std::min<size_t>(blocks_[i].size, size - size_written);
  dest_->AppendAndTakeOwnership(blocks_[i].data, block_size,
  &SnappySinkAllocator::Deleter, NULL);
  size_written += block_size;
@@ -1446,19 +1641,21 @@ bool Uncompress(Source* compressed, Sink* uncompressed) {
  char* buf = uncompressed->GetAppendBufferVariable(
  1, uncompressed_len, &c, 1, &allocated_size);

+ const size_t compressed_len = compressed->Available();
  // If we can get a flat buffer, then use it, otherwise do block by block
  // uncompression
  if (allocated_size >= uncompressed_len) {
  SnappyArrayWriter writer(buf);
- bool result = InternalUncompressAllTags(
- &decompressor, &writer, uncompressed_len);
+ bool result = InternalUncompressAllTags(&decompressor, &writer,
+ compressed_len, uncompressed_len);
  uncompressed->Append(buf, writer.Produced());
  return result;
  } else {
  SnappySinkAllocator allocator(uncompressed);
  SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
- return InternalUncompressAllTags(&decompressor, &writer, uncompressed_len);
+ return InternalUncompressAllTags(&decompressor, &writer, compressed_len,
+ uncompressed_len);
  }
  }

- } // end namespace snappy
+ } // namespace snappy