snappy 0.1.0-java → 0.2.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/main.yml +34 -0
  3. data/.github/workflows/publish.yml +34 -0
  4. data/Gemfile +3 -4
  5. data/Rakefile +32 -30
  6. data/ext/api.c +6 -1
  7. data/lib/snappy.rb +5 -5
  8. data/lib/snappy/hadoop/reader.rb +6 -2
  9. data/lib/snappy/reader.rb +11 -7
  10. data/lib/snappy/shim.rb +1 -1
  11. data/lib/snappy/version.rb +1 -1
  12. data/snappy.gemspec +13 -9
  13. data/test/hadoop/snappy_hadoop_reader_test.rb +115 -0
  14. data/test/hadoop/snappy_hadoop_writer_test.rb +48 -0
  15. data/test/snappy_hadoop_test.rb +26 -0
  16. data/test/snappy_reader_test.rb +148 -0
  17. data/test/snappy_test.rb +95 -0
  18. data/test/snappy_writer_test.rb +55 -0
  19. data/test/test_helper.rb +7 -0
  20. data/vendor/snappy/CMakeLists.txt +177 -54
  21. data/vendor/snappy/NEWS +8 -0
  22. data/vendor/snappy/README.md +19 -20
  23. data/vendor/snappy/cmake/SnappyConfig.cmake.in +33 -0
  24. data/vendor/snappy/cmake/config.h.in +6 -6
  25. data/vendor/snappy/docs/README.md +72 -0
  26. data/vendor/snappy/snappy-internal.h +12 -5
  27. data/vendor/snappy/snappy-stubs-internal.cc +1 -1
  28. data/vendor/snappy/snappy-stubs-internal.h +60 -15
  29. data/vendor/snappy/snappy-stubs-public.h.in +16 -36
  30. data/vendor/snappy/snappy-test.cc +16 -15
  31. data/vendor/snappy/snappy-test.h +12 -60
  32. data/vendor/snappy/snappy.cc +333 -187
  33. data/vendor/snappy/snappy.h +14 -10
  34. data/vendor/snappy/snappy_compress_fuzzer.cc +59 -0
  35. data/vendor/snappy/snappy_uncompress_fuzzer.cc +57 -0
  36. data/vendor/snappy/snappy_unittest.cc +220 -124
  37. metadata +25 -18
  38. data/.travis.yml +0 -31
  39. data/smoke.sh +0 -8
  40. data/test/hadoop/test-snappy-hadoop-reader.rb +0 -103
  41. data/test/hadoop/test-snappy-hadoop-writer.rb +0 -48
  42. data/test/test-snappy-hadoop.rb +0 -22
  43. data/test/test-snappy-reader.rb +0 -129
  44. data/test/test-snappy-writer.rb +0 -55
  45. data/test/test-snappy.rb +0 -58
  46. data/vendor/snappy/cmake/SnappyConfig.cmake +0 -1
@@ -48,12 +48,12 @@ DEFINE_bool(run_microbenchmarks, true,
 
 namespace snappy {
 
- string ReadTestDataFile(const string& base, size_t size_limit) {
- string contents;
+ std::string ReadTestDataFile(const std::string& base, size_t size_limit) {
+ std::string contents;
 const char* srcdir = getenv("srcdir"); // This is set by Automake.
- string prefix;
+ std::string prefix;
 
 if (srcdir) {
- prefix = string(srcdir) + "/";
+ prefix = std::string(srcdir) + "/";
 }
 
 file::GetContents(prefix + "testdata/" + base, &contents, file::Defaults()
 ).CheckSuccess();
@@ -63,11 +63,11 @@ string ReadTestDataFile(const string& base, size_t size_limit) {
 return contents;
 }
 
- string ReadTestDataFile(const string& base) {
+ std::string ReadTestDataFile(const std::string& base) {
 return ReadTestDataFile(base, 0);
 }
 
- string StringPrintf(const char* format, ...) {
+ std::string StrFormat(const char* format, ...) {
 char buf[4096];
 va_list ap;
 va_start(ap, format);
@@ -79,7 +79,7 @@ string StringPrintf(const char* format, ...) {
 bool benchmark_running = false;
 int64 benchmark_real_time_us = 0;
 int64 benchmark_cpu_time_us = 0;
- string *benchmark_label = NULL;
+ std::string* benchmark_label = nullptr;
 int64 benchmark_bytes_processed = 0;
 
 void ResetBenchmarkTiming() {
@@ -163,11 +163,11 @@ void StopBenchmarkTiming() {
 benchmark_running = false;
 }
 
- void SetBenchmarkLabel(const string& str) {
+ void SetBenchmarkLabel(const std::string& str) {
 if (benchmark_label) {
 delete benchmark_label;
 }
- benchmark_label = new string(str);
+ benchmark_label = new std::string(str);
 }
 
 void SetBenchmarkBytesProcessed(int64 bytes) {
@@ -217,8 +217,8 @@ void Benchmark::Run() {
 benchmark_runs[run].cpu_time_us = benchmark_cpu_time_us;
 }
 
- string heading = StringPrintf("%s/%d", name_.c_str(), test_case_num);
- string human_readable_speed;
+ std::string heading = StrFormat("%s/%d", name_.c_str(), test_case_num);
+ std::string human_readable_speed;
 
 std::nth_element(benchmark_runs,
 benchmark_runs + kMedianPos,
@@ -232,15 +232,16 @@ void Benchmark::Run() {
 int64 bytes_per_second =
 benchmark_bytes_processed * 1000000 / cpu_time_us;
 if (bytes_per_second < 1024) {
- human_readable_speed = StringPrintf("%dB/s", bytes_per_second);
+ human_readable_speed =
+ StrFormat("%dB/s", static_cast<int>(bytes_per_second));
 } else if (bytes_per_second < 1024 * 1024) {
- human_readable_speed = StringPrintf(
+ human_readable_speed = StrFormat(
 "%.1fkB/s", bytes_per_second / 1024.0f);
 } else if (bytes_per_second < 1024 * 1024 * 1024) {
- human_readable_speed = StringPrintf(
+ human_readable_speed = StrFormat(
 "%.1fMB/s", bytes_per_second / (1024.0f * 1024.0f));
 } else {
- human_readable_speed = StringPrintf(
+ human_readable_speed = StrFormat(
 "%.1fGB/s", bytes_per_second / (1024.0f * 1024.0f * 1024.0f));
 }
 }
@@ -55,8 +55,6 @@
 #include <windows.h>
 #endif
 
- #include <string>
-
 #ifdef HAVE_GTEST
 
 #include <gtest/gtest.h>
@@ -169,7 +167,7 @@ namespace file {
 namespace snappy {
 
 #define FLAGS_test_random_seed 301
- typedef string TypeParam;
+ using TypeParam = std::string;
 
 void Test_CorruptedTest_VerifyCorrupted();
 void Test_Snappy_SimpleTests();
@@ -183,63 +181,13 @@ void Test_Snappy_ReadPastEndOfBuffer();
 void Test_Snappy_FindMatchLength();
 void Test_Snappy_FindMatchLengthRandom();
 
- string ReadTestDataFile(const string& base, size_t size_limit);
+ std::string ReadTestDataFile(const std::string& base, size_t size_limit);
 
- string ReadTestDataFile(const string& base);
+ std::string ReadTestDataFile(const std::string& base);
 
 // A sprintf() variant that returns a std::string.
 // Not safe for general use due to truncation issues.
- string StringPrintf(const char* format, ...);
-
- // A simple, non-cryptographically-secure random generator.
- class ACMRandom {
- public:
- explicit ACMRandom(uint32 seed) : seed_(seed) {}
-
- int32 Next();
-
- int32 Uniform(int32 n) {
- return Next() % n;
- }
- uint8 Rand8() {
- return static_cast<uint8>((Next() >> 1) & 0x000000ff);
- }
- bool OneIn(int X) { return Uniform(X) == 0; }
-
- // Skewed: pick "base" uniformly from range [0,max_log] and then
- // return "base" random bits. The effect is to pick a number in the
- // range [0,2^max_log-1] with bias towards smaller numbers.
- int32 Skewed(int max_log);
-
- private:
- static const uint32 M = 2147483647L; // 2^31-1
- uint32 seed_;
- };
-
- inline int32 ACMRandom::Next() {
- static const uint64 A = 16807; // bits 14, 8, 7, 5, 2, 1, 0
- // We are computing
- // seed_ = (seed_ * A) % M, where M = 2^31-1
- //
- // seed_ must not be zero or M, or else all subsequent computed values
- // will be zero or M respectively. For all other values, seed_ will end
- // up cycling through every number in [1,M-1]
- uint64 product = seed_ * A;
-
- // Compute (product % M) using the fact that ((x << 31) % M) == x.
- seed_ = (product >> 31) + (product & M);
- // The first reduction may overflow by 1 bit, so we may need to repeat.
- // mod == M is not possible; using > allows the faster sign-bit-based test.
- if (seed_ > M) {
- seed_ -= M;
- }
- return seed_;
- }
-
- inline int32 ACMRandom::Skewed(int max_log) {
- const int32 base = (Next() - 1) % (max_log+1);
- return (Next() - 1) & ((1u << base)-1);
- }
+ std::string StrFormat(const char* format, ...);
 
 // A wall-time clock. This stub is not super-accurate, nor resistant to the
 // system time changing.
@@ -293,8 +241,8 @@ typedef void (*BenchmarkFunction)(int, int);
 
 class Benchmark {
 public:
- Benchmark(const string& name, BenchmarkFunction function) :
- name_(name), function_(function) {}
+ Benchmark(const std::string& name, BenchmarkFunction function)
+ : name_(name), function_(function) {}
 
 Benchmark* DenseRange(int start, int stop) {
 start_ = start;
@@ -305,7 +253,7 @@ class Benchmark {
 void Run();
 
 private:
- const string name_;
+ const std::string name_;
 const BenchmarkFunction function_;
 int start_, stop_;
 };
@@ -317,11 +265,13 @@ extern Benchmark* Benchmark_BM_UFlat;
 extern Benchmark* Benchmark_BM_UIOVec;
 extern Benchmark* Benchmark_BM_UValidate;
 extern Benchmark* Benchmark_BM_ZFlat;
+ extern Benchmark* Benchmark_BM_ZFlatAll;
+ extern Benchmark* Benchmark_BM_ZFlatIncreasingTableSize;
 
 void ResetBenchmarkTiming();
 void StartBenchmarkTiming();
 void StopBenchmarkTiming();
- void SetBenchmarkLabel(const string& str);
+ void SetBenchmarkLabel(const std::string& str);
 void SetBenchmarkBytesProcessed(int64 bytes);
 
 #ifdef HAVE_LIBZ
@@ -468,6 +418,8 @@ static inline void RunSpecifiedBenchmarks() {
 snappy::Benchmark_BM_UIOVec->Run();
 snappy::Benchmark_BM_UValidate->Run();
 snappy::Benchmark_BM_ZFlat->Run();
+ snappy::Benchmark_BM_ZFlatAll->Run();
+ snappy::Benchmark_BM_ZFlatIncreasingTableSize->Run();
 
 fprintf(stderr, "\n");
 }
@@ -30,25 +30,50 @@
 #include "snappy-internal.h"
 #include "snappy-sinksource.h"
 
- #ifndef SNAPPY_HAVE_SSE2
- #if defined(__SSE2__) || defined(_M_X64) || \
- (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
- #define SNAPPY_HAVE_SSE2 1
+ #if !defined(SNAPPY_HAVE_SSSE3)
+ // __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD
+ // support between SSE2 and AVX (so SSSE3 instructions require AVX support), and
+ // defines __AVX__ when AVX support is available.
+ #if defined(__SSSE3__) || defined(__AVX__)
+ #define SNAPPY_HAVE_SSSE3 1
 #else
- #define SNAPPY_HAVE_SSE2 0
+ #define SNAPPY_HAVE_SSSE3 0
 #endif
+ #endif // !defined(SNAPPY_HAVE_SSSE3)
+
+ #if !defined(SNAPPY_HAVE_BMI2)
+ // __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
+ // specifically, but it does define __AVX2__ when AVX2 support is available.
+ // Fortunately, AVX2 was introduced in Haswell, just like BMI2.
+ //
+ // BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
+ // GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
+ // case issuing BMI2 instructions results in a compiler error.
+ #if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
+ #define SNAPPY_HAVE_BMI2 1
+ #else
+ #define SNAPPY_HAVE_BMI2 0
+ #endif
+ #endif // !defined(SNAPPY_HAVE_BMI2)
+
+ #if SNAPPY_HAVE_SSSE3
+ // Please do not replace with <x86intrin.h>. or with headers that assume more
+ // advanced SSE versions without checking with all the OWNERS.
+ #include <tmmintrin.h>
 #endif
 
- #if SNAPPY_HAVE_SSE2
- #include <emmintrin.h>
+ #if SNAPPY_HAVE_BMI2
+ // Please do not replace with <x86intrin.h>. or with headers that assume more
+ // advanced SSE versions without checking with all the OWNERS.
+ #include <immintrin.h>
 #endif
+
 #include <stdio.h>
 
 #include <algorithm>
 #include <string>
 #include <vector>
 
-
 namespace snappy {
 
 using internal::COPY_1_BYTE_OFFSET;
@@ -103,16 +128,12 @@ void UnalignedCopy64(const void* src, void* dst) {
 }
 
 void UnalignedCopy128(const void* src, void* dst) {
- // TODO(alkis): Remove this when we upgrade to a recent compiler that emits
- // SSE2 moves for memcpy(dst, src, 16).
- #if SNAPPY_HAVE_SSE2
- __m128i x = _mm_loadu_si128(static_cast<const __m128i*>(src));
- _mm_storeu_si128(static_cast<__m128i*>(dst), x);
- #else
+ // memcpy gets vectorized when the appropriate compiler options are used.
+ // For example, x86 compilers targeting SSE2+ will optimize to an SSE2 load
+ // and store.
 char tmp[16];
 memcpy(tmp, src, 16);
 memcpy(dst, tmp, 16);
- #endif
 }
 
 // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
@@ -127,12 +148,35 @@ void UnalignedCopy128(const void* src, void* dst) {
 // Note that this does not match the semantics of either memcpy() or memmove().
 inline char* IncrementalCopySlow(const char* src, char* op,
 char* const op_limit) {
+ // TODO: Remove pragma when LLVM is aware this
+ // function is only called in cold regions and when cold regions don't get
+ // vectorized or unrolled.
+ #ifdef __clang__
+ #pragma clang loop unroll(disable)
+ #endif
 while (op < op_limit) {
 *op++ = *src++;
 }
 return op_limit;
 }
 
+ #if SNAPPY_HAVE_SSSE3
+
+ // This is a table of shuffle control masks that can be used as the source
+ // operand for PSHUFB to permute the contents of the destination XMM register
+ // into a repeating byte pattern.
+ alignas(16) const char pshufb_fill_patterns[7][16] = {
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
+ {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
+ {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
+ {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
+ };
+
+ #endif // SNAPPY_HAVE_SSSE3
+
 // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than
 // IncrementalCopySlow. buf_limit is the address past the end of the writable
 // region of the buffer.
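The shuffle-mask table above exists so that PSHUFB can replicate a short repeat pattern across a whole XMM register. As an illustrative aside (not part of the diff), here is a scalar sketch of the overlapping-copy semantics that the SSSE3 path implements 16 bytes at a time; the 16-byte example mirrors the "abcabcabcabcabca" comment quoted in the snappy.cc hunks below.

```cpp
#include <cassert>
#include <cstddef>
#include <string>

// Scalar reference: bytes [op, op_limit) become a repetition of the
// pattern_size bytes immediately before op.
static void PatternFillReference(const char* src, char* op, char* op_limit) {
  const std::size_t pattern_size = static_cast<std::size_t>(op - src);
  for (char* p = op; p < op_limit; ++p) {
    *p = src[static_cast<std::size_t>(p - op) % pattern_size];
  }
}

int main() {
  // "abc" followed by an offset-3 copy filling the rest of a 16-byte buffer.
  char buf[16] = {'a', 'b', 'c'};
  PatternFillReference(buf, buf + 3, buf + 16);
  assert(std::string(buf, 16) == "abcabcabcabcabca");
  return 0;
}
```

The SSSE3 path produces the same tiling with one shuffle and 16-byte stores instead of a byte-at-a-time loop.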
@@ -144,9 +188,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 // pat = op - src
 // len = limit - op
 assert(src < op);
+ assert(op <= op_limit);
 assert(op_limit <= buf_limit);
 // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
- // to optimize this function but we have to also handle these cases in case
+ // to optimize this function but we have to also handle other cases in case
 // the input does not satisfy these conditions.
 
 size_t pattern_size = op - src;
@@ -176,16 +221,45 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 
 // Handle the uncommon case where pattern is less than 8 bytes.
 if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
- // Expand pattern to at least 8 bytes. The worse case scenario in terms of
- // buffer usage is when the pattern is size 3. ^ is the original position
- // of op. x are irrelevant bytes copied by the last UnalignedCopy64.
+ #if SNAPPY_HAVE_SSSE3
+ // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
+ // to permute the register's contents in-place into a repeating sequence of
+ // the first "pattern_size" bytes.
+ // For example, suppose:
+ // src == "abc"
+ // op == op + 3
+ // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
+ // followed by one byte of slop: abcabcabcabcabca.
 //
- // abc
- // abcabcxxxxx
- // abcabcabcabcxxxxx
- // ^
- // The last x is 14 bytes after ^.
- if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 14)) {
+ // The non-SSE fallback implementation suffers from store-forwarding stalls
+ // because its loads and stores partly overlap. By expanding the pattern
+ // in-place, we avoid the penalty.
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
+ const __m128i shuffle_mask = _mm_load_si128(
+ reinterpret_cast<const __m128i*>(pshufb_fill_patterns)
+ + pattern_size - 1);
+ const __m128i pattern = _mm_shuffle_epi8(
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
+ // Uninitialized bytes are masked out by the shuffle mask.
+ // TODO: remove annotation and macro defs once MSan is fixed.
+ SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
+ pattern_size *= 16 / pattern_size;
+ char* op_end = std::min(op_limit, buf_limit - 15);
+ while (op < op_end) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+ op += pattern_size;
+ }
+ if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
+ }
+ return IncrementalCopySlow(src, op, op_limit);
+ #else // !SNAPPY_HAVE_SSSE3
+ // If plenty of buffer space remains, expand the pattern to at least 8
+ // bytes. The way the following loop is written, we need 8 bytes of buffer
+ // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
+ // bytes if pattern_size is 2. Precisely encoding that is probably not
+ // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
+ // (because 11 are required in the worst case).
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
 while (pattern_size < 8) {
 UnalignedCopy64(src, op);
 op += pattern_size;
@@ -195,6 +269,7 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 } else {
 return IncrementalCopySlow(src, op, op_limit);
 }
+ #endif // SNAPPY_HAVE_SSSE3
 }
 assert(pattern_size >= 8);
 
@@ -202,13 +277,48 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
 // because expanding the pattern to at least 8 bytes guarantees that
 // op - src >= 8.
- while (op <= buf_limit - 16) {
+ //
+ // Typically, the op_limit is the gating factor so try to simplify the loop
+ // based on that.
+ if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
+ // There is at least one, and at most four 16-byte blocks. Writing four
+ // conditionals instead of a loop allows FDO to layout the code with respect
+ // to the actual probabilities of each length.
+ // TODO: Replace with loop with trip count hint.
+ UnalignedCopy64(src, op);
+ UnalignedCopy64(src + 8, op + 8);
+
+ if (op + 16 < op_limit) {
+ UnalignedCopy64(src + 16, op + 16);
+ UnalignedCopy64(src + 24, op + 24);
+ }
+ if (op + 32 < op_limit) {
+ UnalignedCopy64(src + 32, op + 32);
+ UnalignedCopy64(src + 40, op + 40);
+ }
+ if (op + 48 < op_limit) {
+ UnalignedCopy64(src + 48, op + 48);
+ UnalignedCopy64(src + 56, op + 56);
+ }
+ return op_limit;
+ }
+
+ // Fall back to doing as much as we can with the available slop in the
+ // buffer. This code path is relatively cold however so we save code size by
+ // avoiding unrolling and vectorizing.
+ //
+ // TODO: Remove pragma when when cold regions don't get vectorized
+ // or unrolled.
+ #ifdef __clang__
+ #pragma clang loop unroll(disable)
+ #endif
+ for (char *op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
 UnalignedCopy64(src, op);
 UnalignedCopy64(src + 8, op + 8);
- src += 16;
- op += 16;
- if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
 }
+ if (op >= op_limit)
+ return op_limit;
+
 // We only take this branch if we didn't have enough slop and we can do a
 // single 8 byte copy.
 if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
@@ -221,10 +331,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 
 } // namespace
 
+ template <bool allow_fast_path>
 static inline char* EmitLiteral(char* op,
 const char* literal,
- int len,
- bool allow_fast_path) {
+ int len) {
 // The vast majority of copies are below 16 bytes, for which a
 // call to memcpy is overkill. This fast path can sometimes
 // copy up to 15 bytes too much, but that is okay in the
@@ -249,25 +359,23 @@ static inline char* EmitLiteral(char* op,
 // Fits in tag byte
 *op++ = LITERAL | (n << 2);
 } else {
- // Encode in upcoming bytes
- char* base = op;
- int count = 0;
- op++;
- while (n > 0) {
- *op++ = n & 0xff;
- n >>= 8;
- count++;
- }
+ int count = (Bits::Log2Floor(n) >> 3) + 1;
 assert(count >= 1);
 assert(count <= 4);
- *base = LITERAL | ((59+count) << 2);
+ *op++ = LITERAL | ((59 + count) << 2);
+ // Encode in upcoming bytes.
+ // Write 4 bytes, though we may care about only 1 of them. The output buffer
+ // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds
+ // here and there is a memcpy of size 'len' below.
+ LittleEndian::Store32(op, n);
+ op += count;
 }
 memcpy(op, literal, len);
 return op + len;
 }
 
- static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
- bool len_less_than_12) {
+ template <bool len_less_than_12>
+ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
 assert(len <= 64);
 assert(len >= 4);
 assert(offset < 65536);
@@ -288,29 +396,33 @@ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
 return op;
 }
 
- static inline char* EmitCopy(char* op, size_t offset, size_t len,
- bool len_less_than_12) {
+ template <bool len_less_than_12>
+ static inline char* EmitCopy(char* op, size_t offset, size_t len) {
 assert(len_less_than_12 == (len < 12));
 if (len_less_than_12) {
- return EmitCopyAtMost64(op, offset, len, true);
+ return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
 } else {
 // A special case for len <= 64 might help, but so far measurements suggest
 // it's in the noise.
 
 // Emit 64 byte copies but make sure to keep at least four bytes reserved.
 while (SNAPPY_PREDICT_FALSE(len >= 68)) {
- op = EmitCopyAtMost64(op, offset, 64, false);
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
 len -= 64;
 }
 
 // One or two copies will now finish the job.
 if (len > 64) {
- op = EmitCopyAtMost64(op, offset, 60, false);
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
 len -= 60;
 }
 
 // Emit remainder.
- op = EmitCopyAtMost64(op, offset, len, len < 12);
+ if (len < 12) {
+ op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
+ } else {
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
+ }
 return op;
 }
 }
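For context on the EmitLiteral change above: the removed byte-at-a-time loop and the new `(Bits::Log2Floor(n) >> 3) + 1` expression both compute how many bytes are needed to encode the literal length. A small self-contained check of that equivalence (a sketch only; `Log2Floor` below is a plain stand-in for snappy's `Bits::Log2Floor`):

```cpp
#include <cassert>
#include <cstdint>

// Stand-in for Bits::Log2Floor; assumes n >= 1.
static int Log2Floor(uint32_t n) {
  int log = 0;
  while (n >>= 1) ++log;
  return log;
}

// The byte-at-a-time loop removed from EmitLiteral, kept here only to count bytes.
static int CountBytesByLoop(uint32_t n) {
  int count = 0;
  while (n > 0) {
    n >>= 8;
    ++count;
  }
  return count;
}

int main() {
  const uint32_t samples[] = {60, 255, 256, 65535, 65536, 1u << 24, 0xffffffffu};
  for (uint32_t n : samples) {
    // (Log2Floor(n) >> 3) + 1 == number of bytes needed to store n.
    assert((Log2Floor(n) >> 3) + 1 == CountBytesByLoop(n));
  }
  return 0;
}
```

Writing the full 32-bit value with `Store32` and then advancing `op` by `count` is what lets the new code drop the loop entirely.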
@@ -326,31 +438,45 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
 }
 }
 
- namespace internal {
- uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
- // Use smaller hash table when input.size() is smaller, since we
- // fill the table, incurring O(hash table size) overhead for
- // compression, and if the input is short, we won't need that
- // many hash table entries anyway.
- assert(kMaxHashTableSize >= 256);
- size_t htsize = 256;
- while (htsize < kMaxHashTableSize && htsize < input_size) {
- htsize <<= 1;
+ namespace {
+ uint32 CalculateTableSize(uint32 input_size) {
+ static_assert(
+ kMaxHashTableSize >= kMinHashTableSize,
+ "kMaxHashTableSize should be greater or equal to kMinHashTableSize.");
+ if (input_size > kMaxHashTableSize) {
+ return kMaxHashTableSize;
 }
-
- uint16* table;
- if (htsize <= ARRAYSIZE(small_table_)) {
- table = small_table_;
- } else {
- if (large_table_ == NULL) {
- large_table_ = new uint16[kMaxHashTableSize];
- }
- table = large_table_;
+ if (input_size < kMinHashTableSize) {
+ return kMinHashTableSize;
 }
+ // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
+ // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
+ return 2u << Bits::Log2Floor(input_size - 1);
+ }
+ } // namespace
 
+ namespace internal {
+ WorkingMemory::WorkingMemory(size_t input_size) {
+ const size_t max_fragment_size = std::min(input_size, kBlockSize);
+ const size_t table_size = CalculateTableSize(max_fragment_size);
+ size_ = table_size * sizeof(*table_) + max_fragment_size +
+ MaxCompressedLength(max_fragment_size);
+ mem_ = std::allocator<char>().allocate(size_);
+ table_ = reinterpret_cast<uint16*>(mem_);
+ input_ = mem_ + table_size * sizeof(*table_);
+ output_ = input_ + max_fragment_size;
+ }
+
+ WorkingMemory::~WorkingMemory() {
+ std::allocator<char>().deallocate(mem_, size_);
+ }
+
+ uint16* WorkingMemory::GetHashTable(size_t fragment_size,
+ int* table_size) const {
+ const size_t htsize = CalculateTableSize(fragment_size);
+ memset(table_, 0, htsize * sizeof(*table_));
 *table_size = htsize;
- memset(table, 0, htsize * sizeof(*table));
- return table;
+ return table_;
 }
 } // end namespace internal
 
@@ -417,7 +543,7 @@ char* CompressFragment(const char* input,
 // "ip" is the input pointer, and "op" is the output pointer.
 const char* ip = input;
 assert(input_size <= kBlockSize);
- assert((table_size & (table_size - 1)) == 0); // table must be power of two
+ assert((table_size & (table_size - 1)) == 0);  // table must be power of two
 const int shift = 32 - Bits::Log2Floor(table_size);
 assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
 const char* ip_end = input + input_size;
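The new `CalculateTableSize` clamps the fragment size into `[kMinHashTableSize, kMaxHashTableSize]` and rounds it up to a power of two with `2 << Log2Floor(x - 1)`, which is exactly what the power-of-two assert in `CompressFragment` above relies on. A standalone sketch of that identity (the min/max constants below are illustrative assumptions, not necessarily snappy's actual values):

```cpp
#include <cassert>
#include <cstdint>

// Stand-in for Bits::Log2Floor; assumes n >= 1.
static int Log2Floor(uint32_t n) {
  int log = 0;
  while (n >>= 1) ++log;
  return log;
}

static const uint32_t kAssumedMinHashTableSize = 256;
static const uint32_t kAssumedMaxHashTableSize = 1u << 14;

// Mirrors the shape of CalculateTableSize: clamp, then round up to the next
// power of two via 2 << Log2Floor(x - 1), i.e. 1 << Log2Ceiling(x).
static uint32_t TableSizeSketch(uint32_t input_size) {
  if (input_size > kAssumedMaxHashTableSize) return kAssumedMaxHashTableSize;
  if (input_size < kAssumedMinHashTableSize) return kAssumedMinHashTableSize;
  return 2u << Log2Floor(input_size - 1);
}

int main() {
  assert(TableSizeSketch(100) == 256);    // clamped up to the minimum
  assert(TableSizeSketch(256) == 256);    // already a power of two
  assert(TableSizeSketch(257) == 512);    // rounded up: 2 << Log2Floor(256)
  assert(TableSizeSketch(5000) == 8192);  // 2 << Log2Floor(4999) = 2 << 12
  assert(TableSizeSketch(1u << 20) == kAssumedMaxHashTableSize);  // clamped down
  return 0;
}
```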
@@ -484,7 +610,7 @@ char* CompressFragment(const char* input,
 // than 4 bytes match. But, prior to the match, input
 // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
 assert(next_emit + 16 <= ip_end);
- op = EmitLiteral(op, next_emit, ip - next_emit, true);
+ op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);
 
 // Step 3: Call EmitCopy, and then see if another EmitCopy could
 // be our next move. Repeat until we find no match for the
@@ -507,7 +633,11 @@ char* CompressFragment(const char* input,
 ip += matched;
 size_t offset = base - candidate;
 assert(0 == memcmp(base, candidate, matched));
- op = EmitCopy(op, offset, matched, p.second);
+ if (p.second) {
+ op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
+ } else {
+ op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
+ }
 next_emit = ip;
 if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
 goto emit_remainder;
@@ -532,7 +662,8 @@ char* CompressFragment(const char* input,
 emit_remainder:
 // Emit the remaining bytes as a literal
 if (next_emit < ip_end) {
- op = EmitLiteral(op, next_emit, ip_end - next_emit, false);
+ op = EmitLiteral</*allow_fast_path=*/false>(op, next_emit,
+ ip_end - next_emit);
 }
 
 return op;
@@ -583,14 +714,28 @@ static inline void Report(const char *algorithm, size_t compressed_size,
 // bool TryFastAppend(const char* ip, size_t available, size_t length);
 // };
 
- namespace internal {
-
- // Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
- static const uint32 wordmask[] = {
- 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
- };
+ static inline uint32 ExtractLowBytes(uint32 v, int n) {
+ assert(n >= 0);
+ assert(n <= 4);
+ #if SNAPPY_HAVE_BMI2
+ return _bzhi_u32(v, 8 * n);
+ #else
+ // This needs to be wider than uint32 otherwise `mask << 32` will be
+ // undefined.
+ uint64 mask = 0xffffffff;
+ return v & ~(mask << (8 * n));
+ #endif
+ }
 
- } // end namespace internal
+ static inline bool LeftShiftOverflows(uint8 value, uint32 shift) {
+ assert(shift < 32);
+ static const uint8 masks[] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
+ return (value & masks[shift]) != 0;
+ }
 
 // Helper class for decompression
 class SnappyDecompressor {
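`ExtractLowBytes` replaces the old `wordmask[]` lookup: it keeps the low `8*n` bits of a 32-bit load, using BMI2 `_bzhi_u32` when available and a 64-bit mask otherwise (64-bit so that `n == 4` never shifts a 32-bit value by 32, which would be undefined). A portable sketch of the fallback path and the values it must reproduce (illustrative code, not part of the diff):

```cpp
#include <cassert>
#include <cstdint>

// Portable equivalent of the non-BMI2 branch above: keep the low 8*n bits of v,
// for n in [0, 4].
static uint32_t ExtractLowBytesSketch(uint32_t v, int n) {
  uint64_t mask = 0xffffffff;  // 64-bit so the n == 4 shift stays defined
  return static_cast<uint32_t>(v & ~(mask << (8 * n)));
}

int main() {
  // Reproduces the removed wordmask[] table: {0, 0xff, 0xffff, 0xffffff, 0xffffffff}.
  assert(ExtractLowBytesSketch(0xdeadbeef, 0) == 0x00000000u);
  assert(ExtractLowBytesSketch(0xdeadbeef, 1) == 0x000000efu);
  assert(ExtractLowBytesSketch(0xdeadbeef, 2) == 0x0000beefu);
  assert(ExtractLowBytesSketch(0xdeadbeef, 3) == 0x00adbeefu);
  assert(ExtractLowBytesSketch(0xdeadbeef, 4) == 0xdeadbeefu);
  return 0;
}
```

`LeftShiftOverflows` plays a similar role in the varint reader further down: instead of shifting and un-shifting to detect lost high bits, it consults a small mask table indexed by the shift amount.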
@@ -629,7 +774,7 @@ class SnappyDecompressor {
 }
 
 // Read the uncompressed length stored at the start of the compressed data.
- // On succcess, stores the length in *result and returns true.
+ // On success, stores the length in *result and returns true.
 // On failure, returns false.
 bool ReadUncompressedLength(uint32* result) {
 assert(ip_ == NULL); // Must not have read anything yet
@@ -644,7 +789,7 @@ class SnappyDecompressor {
 const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
 reader_->Skip(1);
 uint32 val = c & 0x7f;
- if (((val << shift) >> shift) != val) return false;
+ if (LeftShiftOverflows(static_cast<uint8>(val), shift)) return false;
 *result |= val << shift;
 if (c < 128) {
 break;
@@ -657,22 +802,27 @@ class SnappyDecompressor {
 // Process the next item found in the input.
 // Returns true if successful, false on error or end of input.
 template <class Writer>
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __attribute__((aligned(32)))
+ #endif
 void DecompressAllTags(Writer* writer) {
- const char* ip = ip_;
- // For position-independent executables, accessing global arrays can be
- // slow. Move wordmask array onto the stack to mitigate this.
- uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)];
- // Do not use memcpy to copy internal::wordmask to
- // wordmask. LLVM converts stack arrays to global arrays if it detects
- // const stack arrays and this hurts the performance of position
- // independent code. This change is temporary and can be reverted when
- // https://reviews.llvm.org/D30759 is approved.
- wordmask[0] = internal::wordmask[0];
- wordmask[1] = internal::wordmask[1];
- wordmask[2] = internal::wordmask[2];
- wordmask[3] = internal::wordmask[3];
- wordmask[4] = internal::wordmask[4];
+ // In x86, pad the function body to start 16 bytes later. This function has
+ // a couple of hotspots that are highly sensitive to alignment: we have
+ // observed regressions by more than 20% in some metrics just by moving the
+ // exact same code to a different position in the benchmark binary.
+ //
+ // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit
+ // the "lucky" case consistently. Unfortunately, this is a very brittle
+ // workaround, and future differences in code generation may reintroduce
+ // this regression. If you experience a big, difficult to explain, benchmark
+ // performance regression here, first try removing this hack.
+ #if defined(__GNUC__) && defined(__x86_64__)
+ // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions.
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+ #endif
 
+ const char* ip = ip_;
 // We could have put this refill fragment only at the beginning of the loop.
 // However, duplicating it at the end of each branch gives the compiler more
 // scope to optimize the <ip_limit_ - ip> expression based on the local
@@ -685,13 +835,6 @@ class SnappyDecompressor {
 }
 
 MAYBE_REFILL();
- // Add loop alignment directive. Without this directive, we observed
- // significant performance degradation on several intel architectures
- // in snappy benchmark built with LLVM. The degradation was caused by
- // increased branch miss prediction.
- #if defined(__clang__) && defined(__x86_64__)
- asm volatile (".p2align 5");
- #endif
 for ( ;; ) {
 const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
 
@@ -712,7 +855,7 @@ class SnappyDecompressor {
 if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
 assert(literal_length < 61);
 ip += literal_length;
- // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
+ // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
 // will not return true unless there's already at least five spare
 // bytes in addition to the literal.
 continue;
@@ -721,7 +864,8 @@ class SnappyDecompressor {
 // Long literal.
 const size_t literal_length_length = literal_length - 60;
 literal_length =
- (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
+ ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +
+ 1;
 ip += literal_length_length;
 }
 
@@ -744,7 +888,8 @@ class SnappyDecompressor {
 MAYBE_REFILL();
 } else {
 const size_t entry = char_table[c];
- const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+ const size_t trailer =
+ ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11);
 const size_t length = entry & 0xff;
 ip += entry >> 11;
 
@@ -860,9 +1005,7 @@ size_t Compress(Source* reader, Sink* writer) {
 writer->Append(ulength, p-ulength);
 written += (p - ulength);
 
- internal::WorkingMemory wmem;
- char* scratch = NULL;
- char* scratch_output = NULL;
+ internal::WorkingMemory wmem(N);
 
 while (N > 0) {
 // Get next block to compress (without copying if possible)
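The lazily allocated `scratch`/`scratch_output` buffers that used to live in this loop are gone; the `WorkingMemory` constructor shown earlier carves the hash table, a scratch input fragment, and a scratch output region out of a single allocation. A rough sketch of that layout with assumed, illustrative sizes (the real constants come from snappy's `CalculateTableSize()` and `MaxCompressedLength()`):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>

// One allocation, partitioned into hash table / scratch input / scratch output.
struct ScratchArenaSketch {
  explicit ScratchArenaSketch(std::size_t input_size) {
    const std::size_t kBlockSize = 64 * 1024;                 // snappy block size
    const std::size_t fragment = std::min(input_size, kBlockSize);
    const std::size_t table_bytes = 16384 * sizeof(uint16_t); // assumed max table
    const std::size_t output_bytes = 32 + fragment + fragment / 6;  // assumed bound
    size_ = table_bytes + fragment + output_bytes;
    mem_.reset(new char[size_]);
    table_ = reinterpret_cast<uint16_t*>(mem_.get());
    input_ = mem_.get() + table_bytes;  // scratch copy of the input fragment
    output_ = input_ + fragment;        // scratch compressed output
  }
  std::unique_ptr<char[]> mem_;
  std::size_t size_;
  uint16_t* table_;
  char* input_;
  char* output_;
};

int main() {
  ScratchArenaSketch arena(100 * 1024);  // a single allocation serves all three regions
  return arena.size_ > 0 ? 0 : 1;
}
```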
@@ -878,13 +1021,7 @@ size_t Compress(Source* reader, Sink* writer) {
 pending_advance = num_to_read;
 fragment_size = num_to_read;
 } else {
- // Read into scratch buffer
- if (scratch == NULL) {
- // If this is the last iteration, we want to allocate N bytes
- // of space, otherwise the max possible kBlockSize space.
- // num_to_read contains exactly the correct value
- scratch = new char[num_to_read];
- }
+ char* scratch = wmem.GetScratchInput();
 memcpy(scratch, fragment, bytes_read);
 reader->Skip(bytes_read);
 
@@ -910,16 +1047,13 @@ size_t Compress(Source* reader, Sink* writer) {
 
 // Need a scratch buffer for the output, in case the byte sink doesn't
 // have room for us directly.
- if (scratch_output == NULL) {
- scratch_output = new char[max_output];
- } else {
- // Since we encode kBlockSize regions followed by a region
- // which is <= kBlockSize in length, a previously allocated
- // scratch_output[] region is big enough for this iteration.
- }
- char* dest = writer->GetAppendBuffer(max_output, scratch_output);
- char* end = internal::CompressFragment(fragment, fragment_size,
- dest, table, table_size);
+
+ // Since we encode kBlockSize regions followed by a region
+ // which is <= kBlockSize in length, a previously allocated
+ // scratch_output[] region is big enough for this iteration.
+ char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
+ char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
+ table_size);
 writer->Append(dest, end - dest);
 written += (end - dest);
 
@@ -929,9 +1063,6 @@ size_t Compress(Source* reader, Sink* writer) {
 
 Report("snappy_compress", written, uncompressed_size);
 
- delete[] scratch;
- delete[] scratch_output;
-
 return written;
 }
 
@@ -944,14 +1075,22 @@ size_t Compress(Source* reader, Sink* writer) {
 // Writer template argument to SnappyDecompressor::DecompressAllTags().
 class SnappyIOVecWriter {
 private:
+ // output_iov_end_ is set to iov + count and used to determine when
+ // the end of the iovs is reached.
+ const struct iovec* output_iov_end_;
+
+ #if !defined(NDEBUG)
 const struct iovec* output_iov_;
- const size_t output_iov_count_;
+ #endif // !defined(NDEBUG)
+
+ // Current iov that is being written into.
+ const struct iovec* curr_iov_;
 
- // We are currently writing into output_iov_[curr_iov_index_].
- size_t curr_iov_index_;
+ // Pointer to current iov's write location.
+ char* curr_iov_output_;
 
- // Bytes written to output_iov_[curr_iov_index_] so far.
- size_t curr_iov_written_;
+ // Remaining bytes to write into curr_iov_output.
+ size_t curr_iov_remaining_;
 
 // Total bytes decompressed into output_iov_ so far.
 size_t total_written_;
@@ -959,22 +1098,24 @@ class SnappyIOVecWriter {
 // Maximum number of bytes that will be decompressed into output_iov_.
 size_t output_limit_;
 
- inline char* GetIOVecPointer(size_t index, size_t offset) {
- return reinterpret_cast<char*>(output_iov_[index].iov_base) +
- offset;
+ static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
+ return reinterpret_cast<char*>(iov->iov_base) + offset;
 }
 
 public:
 // Does not take ownership of iov. iov must be valid during the
 // entire lifetime of the SnappyIOVecWriter.
 inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
- : output_iov_(iov),
- output_iov_count_(iov_count),
- curr_iov_index_(0),
- curr_iov_written_(0),
+ : output_iov_end_(iov + iov_count),
+ #if !defined(NDEBUG)
+ output_iov_(iov),
+ #endif // !defined(NDEBUG)
+ curr_iov_(iov),
+ curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
+ : nullptr),
+ curr_iov_remaining_(iov_count ? iov->iov_len : 0),
 total_written_(0),
- output_limit_(-1) {
- }
+ output_limit_(-1) {}
 
 inline void SetExpectedLength(size_t len) {
 output_limit_ = len;
@@ -989,23 +1130,25 @@ class SnappyIOVecWriter {
 return false;
 }
 
+ return AppendNoCheck(ip, len);
+ }
+
+ inline bool AppendNoCheck(const char* ip, size_t len) {
 while (len > 0) {
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
- if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
+ if (curr_iov_remaining_ == 0) {
 // This iovec is full. Go to the next one.
- if (curr_iov_index_ + 1 >= output_iov_count_) {
+ if (curr_iov_ + 1 >= output_iov_end_) {
 return false;
 }
- curr_iov_written_ = 0;
- ++curr_iov_index_;
+ ++curr_iov_;
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+ curr_iov_remaining_ = curr_iov_->iov_len;
 }
 
- const size_t to_write = std::min(
- len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
- memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- ip,
- to_write);
- curr_iov_written_ += to_write;
+ const size_t to_write = std::min(len, curr_iov_remaining_);
+ memcpy(curr_iov_output_, ip, to_write);
+ curr_iov_output_ += to_write;
+ curr_iov_remaining_ -= to_write;
 total_written_ += to_write;
 ip += to_write;
 len -= to_write;
@@ -1017,11 +1160,11 @@ class SnappyIOVecWriter {
 inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
 const size_t space_left = output_limit_ - total_written_;
 if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
- output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
+ curr_iov_remaining_ >= 16) {
 // Fast path, used for the majority (about 95%) of invocations.
- char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
- UnalignedCopy128(ip, ptr);
- curr_iov_written_ += len;
+ UnalignedCopy128(ip, curr_iov_output_);
+ curr_iov_output_ += len;
+ curr_iov_remaining_ -= len;
 total_written_ += len;
 return true;
 }
@@ -1030,7 +1173,9 @@ class SnappyIOVecWriter {
 }
 
 inline bool AppendFromSelf(size_t offset, size_t len) {
- if (offset > total_written_ || offset == 0) {
+ // See SnappyArrayWriter::AppendFromSelf for an explanation of
+ // the "offset - 1u" trick.
+ if (offset - 1u >= total_written_) {
 return false;
 }
 const size_t space_left = output_limit_ - total_written_;
@@ -1039,8 +1184,8 @@ class SnappyIOVecWriter {
 }
 
 // Locate the iovec from which we need to start the copy.
- size_t from_iov_index = curr_iov_index_;
- size_t from_iov_offset = curr_iov_written_;
+ const iovec* from_iov = curr_iov_;
+ size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
 while (offset > 0) {
 if (from_iov_offset >= offset) {
 from_iov_offset -= offset;
@@ -1048,47 +1193,47 @@ class SnappyIOVecWriter {
 }
 
 offset -= from_iov_offset;
- assert(from_iov_index > 0);
- --from_iov_index;
- from_iov_offset = output_iov_[from_iov_index].iov_len;
+ --from_iov;
+ #if !defined(NDEBUG)
+ assert(from_iov >= output_iov_);
+ #endif // !defined(NDEBUG)
+ from_iov_offset = from_iov->iov_len;
 }
 
 // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
 // the current iovec.
 while (len > 0) {
- assert(from_iov_index <= curr_iov_index_);
- if (from_iov_index != curr_iov_index_) {
- const size_t to_copy = std::min(
- output_iov_[from_iov_index].iov_len - from_iov_offset,
- len);
- Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
+ assert(from_iov <= curr_iov_);
+ if (from_iov != curr_iov_) {
+ const size_t to_copy =
+ std::min(from_iov->iov_len - from_iov_offset, len);
+ AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
 len -= to_copy;
 if (len > 0) {
- ++from_iov_index;
+ ++from_iov;
 from_iov_offset = 0;
 }
 } else {
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
- size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
- curr_iov_written_,
- len);
+ size_t to_copy = curr_iov_remaining_;
 if (to_copy == 0) {
 // This iovec is full. Go to the next one.
- if (curr_iov_index_ + 1 >= output_iov_count_) {
+ if (curr_iov_ + 1 >= output_iov_end_) {
 return false;
 }
- ++curr_iov_index_;
- curr_iov_written_ = 0;
+ ++curr_iov_;
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+ curr_iov_remaining_ = curr_iov_->iov_len;
 continue;
 }
 if (to_copy > len) {
 to_copy = len;
 }
- IncrementalCopySlow(
- GetIOVecPointer(from_iov_index, from_iov_offset),
- GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- GetIOVecPointer(curr_iov_index_, curr_iov_written_) + to_copy);
- curr_iov_written_ += to_copy;
+
+ IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
+ curr_iov_output_, curr_iov_output_ + to_copy,
+ curr_iov_output_ + curr_iov_remaining_);
+ curr_iov_output_ += to_copy;
+ curr_iov_remaining_ -= to_copy;
 from_iov_offset += to_copy;
 total_written_ += to_copy;
 len -= to_copy;
@@ -1197,7 +1342,7 @@ bool RawUncompress(Source* compressed, char* uncompressed) {
 return InternalUncompress(compressed, &output);
 }
 
- bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
+ bool Uncompress(const char* compressed, size_t n, std::string* uncompressed) {
 size_t ulength;
 if (!GetUncompressedLength(compressed, n, &ulength)) {
 return false;
@@ -1265,7 +1410,8 @@ void RawCompress(const char* input,
 *compressed_length = (writer.CurrentDestination() - compressed);
 }
 
- size_t Compress(const char* input, size_t input_length, string* compressed) {
+ size_t Compress(const char* input, size_t input_length,
+ std::string* compressed) {
 // Pre-grow the buffer to the max length of the compressed output
 STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
 
@@ -1512,4 +1658,4 @@ bool Uncompress(Source* compressed, Sink* uncompressed) {
 }
 }
 
- } // end namespace snappy
+ } // namespace snappy
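Finally, the public `Compress`/`Uncompress` overloads now spell out `std::string` instead of the old `string` typedef. A minimal caller against the vendored C++ API, reflecting the signatures shown above (a usage sketch, not part of the gem's test suite):

```cpp
#include <string>

#include "snappy.h"  // vendored in this gem under data/vendor/snappy

int main() {
  const std::string original(10000, 'x');
  std::string compressed, restored;

  // Both overloads now take std::string* rather than the old `string` typedef.
  snappy::Compress(original.data(), original.size(), &compressed);
  const bool ok =
      snappy::Uncompress(compressed.data(), compressed.size(), &restored);

  return (ok && restored == original) ? 0 : 1;
}
```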