snappy 0.1.0-java → 0.2.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/main.yml +34 -0
  3. data/.github/workflows/publish.yml +34 -0
  4. data/Gemfile +3 -4
  5. data/Rakefile +32 -30
  6. data/ext/api.c +6 -1
  7. data/lib/snappy.rb +5 -5
  8. data/lib/snappy/hadoop/reader.rb +6 -2
  9. data/lib/snappy/reader.rb +11 -7
  10. data/lib/snappy/shim.rb +1 -1
  11. data/lib/snappy/version.rb +1 -1
  12. data/snappy.gemspec +13 -9
  13. data/test/hadoop/snappy_hadoop_reader_test.rb +115 -0
  14. data/test/hadoop/snappy_hadoop_writer_test.rb +48 -0
  15. data/test/snappy_hadoop_test.rb +26 -0
  16. data/test/snappy_reader_test.rb +148 -0
  17. data/test/snappy_test.rb +95 -0
  18. data/test/snappy_writer_test.rb +55 -0
  19. data/test/test_helper.rb +7 -0
  20. data/vendor/snappy/CMakeLists.txt +177 -54
  21. data/vendor/snappy/NEWS +8 -0
  22. data/vendor/snappy/README.md +19 -20
  23. data/vendor/snappy/cmake/SnappyConfig.cmake.in +33 -0
  24. data/vendor/snappy/cmake/config.h.in +6 -6
  25. data/vendor/snappy/docs/README.md +72 -0
  26. data/vendor/snappy/snappy-internal.h +12 -5
  27. data/vendor/snappy/snappy-stubs-internal.cc +1 -1
  28. data/vendor/snappy/snappy-stubs-internal.h +60 -15
  29. data/vendor/snappy/snappy-stubs-public.h.in +16 -36
  30. data/vendor/snappy/snappy-test.cc +16 -15
  31. data/vendor/snappy/snappy-test.h +12 -60
  32. data/vendor/snappy/snappy.cc +333 -187
  33. data/vendor/snappy/snappy.h +14 -10
  34. data/vendor/snappy/snappy_compress_fuzzer.cc +59 -0
  35. data/vendor/snappy/snappy_uncompress_fuzzer.cc +57 -0
  36. data/vendor/snappy/snappy_unittest.cc +220 -124
  37. metadata +25 -18
  38. data/.travis.yml +0 -31
  39. data/smoke.sh +0 -8
  40. data/test/hadoop/test-snappy-hadoop-reader.rb +0 -103
  41. data/test/hadoop/test-snappy-hadoop-writer.rb +0 -48
  42. data/test/test-snappy-hadoop.rb +0 -22
  43. data/test/test-snappy-reader.rb +0 -129
  44. data/test/test-snappy-writer.rb +0 -55
  45. data/test/test-snappy.rb +0 -58
  46. data/vendor/snappy/cmake/SnappyConfig.cmake +0 -1
@@ -48,12 +48,12 @@ DEFINE_bool(run_microbenchmarks, true,
 
 namespace snappy {
 
- string ReadTestDataFile(const string& base, size_t size_limit) {
- string contents;
+ std::string ReadTestDataFile(const std::string& base, size_t size_limit) {
+ std::string contents;
 const char* srcdir = getenv("srcdir"); // This is set by Automake.
- string prefix;
+ std::string prefix;
 
 if (srcdir) {
- prefix = string(srcdir) + "/";
+ prefix = std::string(srcdir) + "/";
 }
 
 file::GetContents(prefix + "testdata/" + base, &contents, file::Defaults()
 ).CheckSuccess();
@@ -63,11 +63,11 @@ string ReadTestDataFile(const string& base, size_t size_limit) {
 return contents;
 }
 
- string ReadTestDataFile(const string& base) {
+ std::string ReadTestDataFile(const std::string& base) {
 return ReadTestDataFile(base, 0);
 }
 
- string StringPrintf(const char* format, ...) {
+ std::string StrFormat(const char* format, ...) {
 char buf[4096];
 va_list ap;
 va_start(ap, format);
@@ -79,7 +79,7 @@ string StringPrintf(const char* format, ...) {
 bool benchmark_running = false;
 int64 benchmark_real_time_us = 0;
 int64 benchmark_cpu_time_us = 0;
- string *benchmark_label = NULL;
+ std::string* benchmark_label = nullptr;
 int64 benchmark_bytes_processed = 0;
 
 void ResetBenchmarkTiming() {
@@ -163,11 +163,11 @@ void StopBenchmarkTiming() {
 benchmark_running = false;
 }
 
- void SetBenchmarkLabel(const string& str) {
+ void SetBenchmarkLabel(const std::string& str) {
 if (benchmark_label) {
 delete benchmark_label;
 }
- benchmark_label = new string(str);
+ benchmark_label = new std::string(str);
 }
 
 void SetBenchmarkBytesProcessed(int64 bytes) {
@@ -217,8 +217,8 @@ void Benchmark::Run() {
 benchmark_runs[run].cpu_time_us = benchmark_cpu_time_us;
 }
 
- string heading = StringPrintf("%s/%d", name_.c_str(), test_case_num);
- string human_readable_speed;
+ std::string heading = StrFormat("%s/%d", name_.c_str(), test_case_num);
+ std::string human_readable_speed;
 
 std::nth_element(benchmark_runs,
 benchmark_runs + kMedianPos,
@@ -232,15 +232,16 @@ void Benchmark::Run() {
 int64 bytes_per_second =
 benchmark_bytes_processed * 1000000 / cpu_time_us;
 if (bytes_per_second < 1024) {
- human_readable_speed = StringPrintf("%dB/s", bytes_per_second);
+ human_readable_speed =
+ StrFormat("%dB/s", static_cast<int>(bytes_per_second));
 } else if (bytes_per_second < 1024 * 1024) {
- human_readable_speed = StringPrintf(
+ human_readable_speed = StrFormat(
 "%.1fkB/s", bytes_per_second / 1024.0f);
 } else if (bytes_per_second < 1024 * 1024 * 1024) {
- human_readable_speed = StringPrintf(
+ human_readable_speed = StrFormat(
 "%.1fMB/s", bytes_per_second / (1024.0f * 1024.0f));
 } else {
- human_readable_speed = StringPrintf(
+ human_readable_speed = StrFormat(
 "%.1fGB/s", bytes_per_second / (1024.0f * 1024.0f * 1024.0f));
 }
 }
@@ -55,8 +55,6 @@
 #include <windows.h>
 #endif
 
- #include <string>
-
 #ifdef HAVE_GTEST
 
 #include <gtest/gtest.h>
@@ -169,7 +167,7 @@ namespace file {
 namespace snappy {
 
 #define FLAGS_test_random_seed 301
- typedef string TypeParam;
+ using TypeParam = std::string;
 
 void Test_CorruptedTest_VerifyCorrupted();
 void Test_Snappy_SimpleTests();
@@ -183,63 +181,13 @@ void Test_Snappy_ReadPastEndOfBuffer();
 void Test_Snappy_FindMatchLength();
 void Test_Snappy_FindMatchLengthRandom();
 
- string ReadTestDataFile(const string& base, size_t size_limit);
+ std::string ReadTestDataFile(const std::string& base, size_t size_limit);
 
- string ReadTestDataFile(const string& base);
+ std::string ReadTestDataFile(const std::string& base);
 
 // A sprintf() variant that returns a std::string.
 // Not safe for general use due to truncation issues.
- string StringPrintf(const char* format, ...);
-
- // A simple, non-cryptographically-secure random generator.
- class ACMRandom {
- public:
- explicit ACMRandom(uint32 seed) : seed_(seed) {}
-
- int32 Next();
-
- int32 Uniform(int32 n) {
- return Next() % n;
- }
- uint8 Rand8() {
- return static_cast<uint8>((Next() >> 1) & 0x000000ff);
- }
- bool OneIn(int X) { return Uniform(X) == 0; }
-
- // Skewed: pick "base" uniformly from range [0,max_log] and then
- // return "base" random bits. The effect is to pick a number in the
- // range [0,2^max_log-1] with bias towards smaller numbers.
- int32 Skewed(int max_log);
-
- private:
- static const uint32 M = 2147483647L; // 2^31-1
- uint32 seed_;
- };
-
- inline int32 ACMRandom::Next() {
- static const uint64 A = 16807; // bits 14, 8, 7, 5, 2, 1, 0
- // We are computing
- // seed_ = (seed_ * A) % M, where M = 2^31-1
- //
- // seed_ must not be zero or M, or else all subsequent computed values
- // will be zero or M respectively. For all other values, seed_ will end
- // up cycling through every number in [1,M-1]
- uint64 product = seed_ * A;
-
- // Compute (product % M) using the fact that ((x << 31) % M) == x.
- seed_ = (product >> 31) + (product & M);
- // The first reduction may overflow by 1 bit, so we may need to repeat.
- // mod == M is not possible; using > allows the faster sign-bit-based test.
- if (seed_ > M) {
- seed_ -= M;
- }
- return seed_;
- }
-
- inline int32 ACMRandom::Skewed(int max_log) {
- const int32 base = (Next() - 1) % (max_log+1);
- return (Next() - 1) & ((1u << base)-1);
- }
+ std::string StrFormat(const char* format, ...);
 
 // A wall-time clock. This stub is not super-accurate, nor resistant to the
 // system time changing.
@@ -293,8 +241,8 @@ typedef void (*BenchmarkFunction)(int, int);
 
 class Benchmark {
 public:
- Benchmark(const string& name, BenchmarkFunction function) :
- name_(name), function_(function) {}
+ Benchmark(const std::string& name, BenchmarkFunction function)
+ : name_(name), function_(function) {}
 
 Benchmark* DenseRange(int start, int stop) {
 start_ = start;
@@ -305,7 +253,7 @@ class Benchmark {
 void Run();
 
 private:
- const string name_;
+ const std::string name_;
 const BenchmarkFunction function_;
 int start_, stop_;
 };
@@ -317,11 +265,13 @@ extern Benchmark* Benchmark_BM_UFlat;
 extern Benchmark* Benchmark_BM_UIOVec;
 extern Benchmark* Benchmark_BM_UValidate;
 extern Benchmark* Benchmark_BM_ZFlat;
+ extern Benchmark* Benchmark_BM_ZFlatAll;
+ extern Benchmark* Benchmark_BM_ZFlatIncreasingTableSize;
 
 void ResetBenchmarkTiming();
 void StartBenchmarkTiming();
 void StopBenchmarkTiming();
- void SetBenchmarkLabel(const string& str);
+ void SetBenchmarkLabel(const std::string& str);
 void SetBenchmarkBytesProcessed(int64 bytes);
 
 #ifdef HAVE_LIBZ
@@ -468,6 +418,8 @@ static inline void RunSpecifiedBenchmarks() {
 snappy::Benchmark_BM_UIOVec->Run();
 snappy::Benchmark_BM_UValidate->Run();
 snappy::Benchmark_BM_ZFlat->Run();
+ snappy::Benchmark_BM_ZFlatAll->Run();
+ snappy::Benchmark_BM_ZFlatIncreasingTableSize->Run();
 
 fprintf(stderr, "\n");
 }
@@ -30,25 +30,50 @@
 #include "snappy-internal.h"
 #include "snappy-sinksource.h"
 
- #ifndef SNAPPY_HAVE_SSE2
- #if defined(__SSE2__) || defined(_M_X64) || \
- (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
- #define SNAPPY_HAVE_SSE2 1
+ #if !defined(SNAPPY_HAVE_SSSE3)
+ // __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD
+ // support between SSE2 and AVX (so SSSE3 instructions require AVX support), and
+ // defines __AVX__ when AVX support is available.
+ #if defined(__SSSE3__) || defined(__AVX__)
+ #define SNAPPY_HAVE_SSSE3 1
 #else
- #define SNAPPY_HAVE_SSE2 0
+ #define SNAPPY_HAVE_SSSE3 0
 #endif
+ #endif // !defined(SNAPPY_HAVE_SSSE3)
+
+ #if !defined(SNAPPY_HAVE_BMI2)
+ // __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
+ // specifically, but it does define __AVX2__ when AVX2 support is available.
+ // Fortunately, AVX2 was introduced in Haswell, just like BMI2.
+ //
+ // BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
+ // GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
+ // case issuing BMI2 instructions results in a compiler error.
+ #if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
+ #define SNAPPY_HAVE_BMI2 1
+ #else
+ #define SNAPPY_HAVE_BMI2 0
+ #endif
+ #endif // !defined(SNAPPY_HAVE_BMI2)
+
+ #if SNAPPY_HAVE_SSSE3
+ // Please do not replace with <x86intrin.h>. or with headers that assume more
+ // advanced SSE versions without checking with all the OWNERS.
+ #include <tmmintrin.h>
 #endif
 
- #if SNAPPY_HAVE_SSE2
- #include <emmintrin.h>
+ #if SNAPPY_HAVE_BMI2
+ // Please do not replace with <x86intrin.h>. or with headers that assume more
+ // advanced SSE versions without checking with all the OWNERS.
+ #include <immintrin.h>
 #endif
+
 #include <stdio.h>
 
 #include <algorithm>
 #include <string>
 #include <vector>
 
-
 namespace snappy {
 
 using internal::COPY_1_BYTE_OFFSET;
@@ -103,16 +128,12 @@ void UnalignedCopy64(const void* src, void* dst) {
 }
 
 void UnalignedCopy128(const void* src, void* dst) {
- // TODO(alkis): Remove this when we upgrade to a recent compiler that emits
- // SSE2 moves for memcpy(dst, src, 16).
- #if SNAPPY_HAVE_SSE2
- __m128i x = _mm_loadu_si128(static_cast<const __m128i*>(src));
- _mm_storeu_si128(static_cast<__m128i*>(dst), x);
- #else
+ // memcpy gets vectorized when the appropriate compiler options are used.
+ // For example, x86 compilers targeting SSE2+ will optimize to an SSE2 load
+ // and store.
 char tmp[16];
 memcpy(tmp, src, 16);
 memcpy(dst, tmp, 16);
- #endif
 }
 
 // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
@@ -127,12 +148,35 @@ void UnalignedCopy128(const void* src, void* dst) {
 // Note that this does not match the semantics of either memcpy() or memmove().
 inline char* IncrementalCopySlow(const char* src, char* op,
 char* const op_limit) {
+ // TODO: Remove pragma when LLVM is aware this
+ // function is only called in cold regions and when cold regions don't get
+ // vectorized or unrolled.
+ #ifdef __clang__
+ #pragma clang loop unroll(disable)
+ #endif
 while (op < op_limit) {
 *op++ = *src++;
 }
 return op_limit;
 }
 
+ #if SNAPPY_HAVE_SSSE3
+
+ // This is a table of shuffle control masks that can be used as the source
+ // operand for PSHUFB to permute the contents of the destination XMM register
+ // into a repeating byte pattern.
+ alignas(16) const char pshufb_fill_patterns[7][16] = {
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
+ {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
+ {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
+ {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
+ };
+
+ #endif // SNAPPY_HAVE_SSSE3
+
 // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than
 // IncrementalCopySlow. buf_limit is the address past the end of the writable
 // region of the buffer.
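The shuffle-mask table above exists so that PSHUFB can replicate a short repeat pattern across a whole XMM register. As an illustrative aside (not part of the diff), here is a scalar sketch of the overlapping-copy semantics that the SSSE3 path implements 16 bytes at a time; the 16-byte example mirrors the "abcabcabcabcabca" comment quoted in the snappy.cc hunks below.

```cpp
#include <cassert>
#include <cstddef>
#include <string>

// Scalar reference: bytes [op, op_limit) become a repetition of the
// pattern_size bytes immediately before op.
static void PatternFillReference(const char* src, char* op, char* op_limit) {
  const std::size_t pattern_size = static_cast<std::size_t>(op - src);
  for (char* p = op; p < op_limit; ++p) {
    *p = src[static_cast<std::size_t>(p - op) % pattern_size];
  }
}

int main() {
  // "abc" followed by an offset-3 copy filling the rest of a 16-byte buffer.
  char buf[16] = {'a', 'b', 'c'};
  PatternFillReference(buf, buf + 3, buf + 16);
  assert(std::string(buf, 16) == "abcabcabcabcabca");
  return 0;
}
```

The SSSE3 path produces the same tiling with one shuffle and 16-byte stores instead of a byte-at-a-time loop.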
@@ -144,9 +188,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 // pat = op - src
 // len = limit - op
 assert(src < op);
+ assert(op <= op_limit);
 assert(op_limit <= buf_limit);
 // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
- // to optimize this function but we have to also handle these cases in case
+ // to optimize this function but we have to also handle other cases in case
 // the input does not satisfy these conditions.
 
 size_t pattern_size = op - src;
@@ -176,16 +221,45 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 
 // Handle the uncommon case where pattern is less than 8 bytes.
 if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
- // Expand pattern to at least 8 bytes. The worse case scenario in terms of
- // buffer usage is when the pattern is size 3. ^ is the original position
- // of op. x are irrelevant bytes copied by the last UnalignedCopy64.
+ #if SNAPPY_HAVE_SSSE3
+ // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
+ // to permute the register's contents in-place into a repeating sequence of
+ // the first "pattern_size" bytes.
+ // For example, suppose:
+ // src == "abc"
+ // op == op + 3
+ // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
+ // followed by one byte of slop: abcabcabcabcabca.
 //
- // abc
- // abcabcxxxxx
- // abcabcabcabcxxxxx
- // ^
- // The last x is 14 bytes after ^.
- if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 14)) {
+ // The non-SSE fallback implementation suffers from store-forwarding stalls
+ // because its loads and stores partly overlap. By expanding the pattern
+ // in-place, we avoid the penalty.
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
+ const __m128i shuffle_mask = _mm_load_si128(
+ reinterpret_cast<const __m128i*>(pshufb_fill_patterns)
+ + pattern_size - 1);
+ const __m128i pattern = _mm_shuffle_epi8(
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
+ // Uninitialized bytes are masked out by the shuffle mask.
+ // TODO: remove annotation and macro defs once MSan is fixed.
+ SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
+ pattern_size *= 16 / pattern_size;
+ char* op_end = std::min(op_limit, buf_limit - 15);
+ while (op < op_end) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+ op += pattern_size;
+ }
+ if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
+ }
+ return IncrementalCopySlow(src, op, op_limit);
+ #else // !SNAPPY_HAVE_SSSE3
+ // If plenty of buffer space remains, expand the pattern to at least 8
+ // bytes. The way the following loop is written, we need 8 bytes of buffer
+ // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
+ // bytes if pattern_size is 2. Precisely encoding that is probably not
+ // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
+ // (because 11 are required in the worst case).
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
 while (pattern_size < 8) {
 UnalignedCopy64(src, op);
 op += pattern_size;
@@ -195,6 +269,7 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 } else {
 return IncrementalCopySlow(src, op, op_limit);
 }
+ #endif // SNAPPY_HAVE_SSSE3
 }
 assert(pattern_size >= 8);
 
@@ -202,13 +277,48 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
 // because expanding the pattern to at least 8 bytes guarantees that
 // op - src >= 8.
- while (op <= buf_limit - 16) {
+ //
+ // Typically, the op_limit is the gating factor so try to simplify the loop
+ // based on that.
+ if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
+ // There is at least one, and at most four 16-byte blocks. Writing four
+ // conditionals instead of a loop allows FDO to layout the code with respect
+ // to the actual probabilities of each length.
+ // TODO: Replace with loop with trip count hint.
+ UnalignedCopy64(src, op);
+ UnalignedCopy64(src + 8, op + 8);
+
+ if (op + 16 < op_limit) {
+ UnalignedCopy64(src + 16, op + 16);
+ UnalignedCopy64(src + 24, op + 24);
+ }
+ if (op + 32 < op_limit) {
+ UnalignedCopy64(src + 32, op + 32);
+ UnalignedCopy64(src + 40, op + 40);
+ }
+ if (op + 48 < op_limit) {
+ UnalignedCopy64(src + 48, op + 48);
+ UnalignedCopy64(src + 56, op + 56);
+ }
+ return op_limit;
+ }
+
+ // Fall back to doing as much as we can with the available slop in the
+ // buffer. This code path is relatively cold however so we save code size by
+ // avoiding unrolling and vectorizing.
+ //
+ // TODO: Remove pragma when when cold regions don't get vectorized
+ // or unrolled.
+ #ifdef __clang__
+ #pragma clang loop unroll(disable)
+ #endif
+ for (char *op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
 UnalignedCopy64(src, op);
 UnalignedCopy64(src + 8, op + 8);
- src += 16;
- op += 16;
- if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
 }
+ if (op >= op_limit)
+ return op_limit;
+
 // We only take this branch if we didn't have enough slop and we can do a
 // single 8 byte copy.
 if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
@@ -221,10 +331,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 
 } // namespace
 
+ template <bool allow_fast_path>
 static inline char* EmitLiteral(char* op,
 const char* literal,
- int len,
- bool allow_fast_path) {
+ int len) {
 // The vast majority of copies are below 16 bytes, for which a
 // call to memcpy is overkill. This fast path can sometimes
 // copy up to 15 bytes too much, but that is okay in the
@@ -249,25 +359,23 @@ static inline char* EmitLiteral(char* op,
 // Fits in tag byte
 *op++ = LITERAL | (n << 2);
 } else {
- // Encode in upcoming bytes
- char* base = op;
- int count = 0;
- op++;
- while (n > 0) {
- *op++ = n & 0xff;
- n >>= 8;
- count++;
- }
+ int count = (Bits::Log2Floor(n) >> 3) + 1;
 assert(count >= 1);
 assert(count <= 4);
- *base = LITERAL | ((59+count) << 2);
+ *op++ = LITERAL | ((59 + count) << 2);
+ // Encode in upcoming bytes.
+ // Write 4 bytes, though we may care about only 1 of them. The output buffer
+ // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds
+ // here and there is a memcpy of size 'len' below.
+ LittleEndian::Store32(op, n);
+ op += count;
 }
 memcpy(op, literal, len);
 return op + len;
 }
 
- static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
- bool len_less_than_12) {
+ template <bool len_less_than_12>
+ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
 assert(len <= 64);
 assert(len >= 4);
 assert(offset < 65536);
@@ -288,29 +396,33 @@ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
 return op;
 }
 
- static inline char* EmitCopy(char* op, size_t offset, size_t len,
- bool len_less_than_12) {
+ template <bool len_less_than_12>
+ static inline char* EmitCopy(char* op, size_t offset, size_t len) {
 assert(len_less_than_12 == (len < 12));
 if (len_less_than_12) {
- return EmitCopyAtMost64(op, offset, len, true);
+ return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
 } else {
 // A special case for len <= 64 might help, but so far measurements suggest
 // it's in the noise.
 
 // Emit 64 byte copies but make sure to keep at least four bytes reserved.
 while (SNAPPY_PREDICT_FALSE(len >= 68)) {
- op = EmitCopyAtMost64(op, offset, 64, false);
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
 len -= 64;
 }
 
 // One or two copies will now finish the job.
 if (len > 64) {
- op = EmitCopyAtMost64(op, offset, 60, false);
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
 len -= 60;
 }
 
 // Emit remainder.
- op = EmitCopyAtMost64(op, offset, len, len < 12);
+ if (len < 12) {
+ op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
+ } else {
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
+ }
 return op;
 }
 }
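For context on the EmitLiteral change above: the removed byte-at-a-time loop and the new `(Bits::Log2Floor(n) >> 3) + 1` expression both compute how many bytes are needed to encode the literal length. A small self-contained check of that equivalence (a sketch only; `Log2Floor` below is a plain stand-in for snappy's `Bits::Log2Floor`):

```cpp
#include <cassert>
#include <cstdint>

// Stand-in for Bits::Log2Floor; assumes n >= 1.
static int Log2Floor(uint32_t n) {
  int log = 0;
  while (n >>= 1) ++log;
  return log;
}

// The byte-at-a-time loop removed from EmitLiteral, kept here only to count bytes.
static int CountBytesByLoop(uint32_t n) {
  int count = 0;
  while (n > 0) {
    n >>= 8;
    ++count;
  }
  return count;
}

int main() {
  const uint32_t samples[] = {60, 255, 256, 65535, 65536, 1u << 24, 0xffffffffu};
  for (uint32_t n : samples) {
    // (Log2Floor(n) >> 3) + 1 == number of bytes needed to store n.
    assert((Log2Floor(n) >> 3) + 1 == CountBytesByLoop(n));
  }
  return 0;
}
```

Writing the full 32-bit value with `Store32` and then advancing `op` by `count` is what lets the new code drop the loop entirely.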
@@ -326,31 +438,45 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
 }
 }
 
- namespace internal {
- uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
- // Use smaller hash table when input.size() is smaller, since we
- // fill the table, incurring O(hash table size) overhead for
- // compression, and if the input is short, we won't need that
- // many hash table entries anyway.
- assert(kMaxHashTableSize >= 256);
- size_t htsize = 256;
- while (htsize < kMaxHashTableSize && htsize < input_size) {
- htsize <<= 1;
+ namespace {
+ uint32 CalculateTableSize(uint32 input_size) {
+ static_assert(
+ kMaxHashTableSize >= kMinHashTableSize,
+ "kMaxHashTableSize should be greater or equal to kMinHashTableSize.");
+ if (input_size > kMaxHashTableSize) {
+ return kMaxHashTableSize;
 }
-
- uint16* table;
- if (htsize <= ARRAYSIZE(small_table_)) {
- table = small_table_;
- } else {
- if (large_table_ == NULL) {
- large_table_ = new uint16[kMaxHashTableSize];
- }
- table = large_table_;
+ if (input_size < kMinHashTableSize) {
+ return kMinHashTableSize;
 }
+ // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
+ // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
+ return 2u << Bits::Log2Floor(input_size - 1);
+ }
+ } // namespace
 
+ namespace internal {
+ WorkingMemory::WorkingMemory(size_t input_size) {
+ const size_t max_fragment_size = std::min(input_size, kBlockSize);
+ const size_t table_size = CalculateTableSize(max_fragment_size);
+ size_ = table_size * sizeof(*table_) + max_fragment_size +
+ MaxCompressedLength(max_fragment_size);
+ mem_ = std::allocator<char>().allocate(size_);
+ table_ = reinterpret_cast<uint16*>(mem_);
+ input_ = mem_ + table_size * sizeof(*table_);
+ output_ = input_ + max_fragment_size;
+ }
+
+ WorkingMemory::~WorkingMemory() {
+ std::allocator<char>().deallocate(mem_, size_);
+ }
+
+ uint16* WorkingMemory::GetHashTable(size_t fragment_size,
+ int* table_size) const {
+ const size_t htsize = CalculateTableSize(fragment_size);
+ memset(table_, 0, htsize * sizeof(*table_));
 *table_size = htsize;
- memset(table, 0, htsize * sizeof(*table));
- return table;
+ return table_;
 }
 } // end namespace internal
 
@@ -417,7 +543,7 @@ char* CompressFragment(const char* input,
 // "ip" is the input pointer, and "op" is the output pointer.
 const char* ip = input;
 assert(input_size <= kBlockSize);
- assert((table_size & (table_size - 1)) == 0); // table must be power of two
+ assert((table_size & (table_size - 1)) == 0);  // table must be power of two
 const int shift = 32 - Bits::Log2Floor(table_size);
 assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
 const char* ip_end = input + input_size;
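The new `CalculateTableSize` clamps the fragment size into `[kMinHashTableSize, kMaxHashTableSize]` and rounds it up to a power of two with `2 << Log2Floor(x - 1)`, which is exactly what the power-of-two assert in `CompressFragment` above relies on. A standalone sketch of that identity (the min/max constants below are illustrative assumptions, not necessarily snappy's actual values):

```cpp
#include <cassert>
#include <cstdint>

// Stand-in for Bits::Log2Floor; assumes n >= 1.
static int Log2Floor(uint32_t n) {
  int log = 0;
  while (n >>= 1) ++log;
  return log;
}

static const uint32_t kAssumedMinHashTableSize = 256;
static const uint32_t kAssumedMaxHashTableSize = 1u << 14;

// Mirrors the shape of CalculateTableSize: clamp, then round up to the next
// power of two via 2 << Log2Floor(x - 1), i.e. 1 << Log2Ceiling(x).
static uint32_t TableSizeSketch(uint32_t input_size) {
  if (input_size > kAssumedMaxHashTableSize) return kAssumedMaxHashTableSize;
  if (input_size < kAssumedMinHashTableSize) return kAssumedMinHashTableSize;
  return 2u << Log2Floor(input_size - 1);
}

int main() {
  assert(TableSizeSketch(100) == 256);    // clamped up to the minimum
  assert(TableSizeSketch(256) == 256);    // already a power of two
  assert(TableSizeSketch(257) == 512);    // rounded up: 2 << Log2Floor(256)
  assert(TableSizeSketch(5000) == 8192);  // 2 << Log2Floor(4999) = 2 << 12
  assert(TableSizeSketch(1u << 20) == kAssumedMaxHashTableSize);  // clamped down
  return 0;
}
```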
@@ -484,7 +610,7 @@ char* CompressFragment(const char* input,
 // than 4 bytes match. But, prior to the match, input
 // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
 assert(next_emit + 16 <= ip_end);
- op = EmitLiteral(op, next_emit, ip - next_emit, true);
+ op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);
 
 // Step 3: Call EmitCopy, and then see if another EmitCopy could
 // be our next move. Repeat until we find no match for the
@@ -507,7 +633,11 @@ char* CompressFragment(const char* input,
 ip += matched;
 size_t offset = base - candidate;
 assert(0 == memcmp(base, candidate, matched));
- op = EmitCopy(op, offset, matched, p.second);
+ if (p.second) {
+ op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
+ } else {
+ op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
+ }
 next_emit = ip;
 if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
 goto emit_remainder;
@@ -532,7 +662,8 @@ char* CompressFragment(const char* input,
 emit_remainder:
 // Emit the remaining bytes as a literal
 if (next_emit < ip_end) {
- op = EmitLiteral(op, next_emit, ip_end - next_emit, false);
+ op = EmitLiteral</*allow_fast_path=*/false>(op, next_emit,
+ ip_end - next_emit);
 }
 
 return op;
@@ -583,14 +714,28 @@ static inline void Report(const char *algorithm, size_t compressed_size,
 // bool TryFastAppend(const char* ip, size_t available, size_t length);
 // };
 
- namespace internal {
-
- // Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
- static const uint32 wordmask[] = {
- 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
- };
+ static inline uint32 ExtractLowBytes(uint32 v, int n) {
+ assert(n >= 0);
+ assert(n <= 4);
+ #if SNAPPY_HAVE_BMI2
+ return _bzhi_u32(v, 8 * n);
+ #else
+ // This needs to be wider than uint32 otherwise `mask << 32` will be
+ // undefined.
+ uint64 mask = 0xffffffff;
+ return v & ~(mask << (8 * n));
+ #endif
+ }
 
- } // end namespace internal
+ static inline bool LeftShiftOverflows(uint8 value, uint32 shift) {
+ assert(shift < 32);
+ static const uint8 masks[] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
+ return (value & masks[shift]) != 0;
+ }
 
 // Helper class for decompression
 class SnappyDecompressor {
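`ExtractLowBytes` replaces the old `wordmask[]` lookup: it keeps the low `8*n` bits of a 32-bit load, using BMI2 `_bzhi_u32` when available and a 64-bit mask otherwise (64-bit so that `n == 4` never shifts a 32-bit value by 32, which would be undefined). A portable sketch of the fallback path and the values it must reproduce (illustrative code, not part of the diff):

```cpp
#include <cassert>
#include <cstdint>

// Portable equivalent of the non-BMI2 branch above: keep the low 8*n bits of v,
// for n in [0, 4].
static uint32_t ExtractLowBytesSketch(uint32_t v, int n) {
  uint64_t mask = 0xffffffff;  // 64-bit so the n == 4 shift stays defined
  return static_cast<uint32_t>(v & ~(mask << (8 * n)));
}

int main() {
  // Reproduces the removed wordmask[] table: {0, 0xff, 0xffff, 0xffffff, 0xffffffff}.
  assert(ExtractLowBytesSketch(0xdeadbeef, 0) == 0x00000000u);
  assert(ExtractLowBytesSketch(0xdeadbeef, 1) == 0x000000efu);
  assert(ExtractLowBytesSketch(0xdeadbeef, 2) == 0x0000beefu);
  assert(ExtractLowBytesSketch(0xdeadbeef, 3) == 0x00adbeefu);
  assert(ExtractLowBytesSketch(0xdeadbeef, 4) == 0xdeadbeefu);
  return 0;
}
```

`LeftShiftOverflows` plays a similar role in the varint reader further down: instead of shifting and un-shifting to detect lost high bits, it consults a small mask table indexed by the shift amount.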
@@ -629,7 +774,7 @@ class SnappyDecompressor {
 }
 
 // Read the uncompressed length stored at the start of the compressed data.
- // On succcess, stores the length in *result and returns true.
+ // On success, stores the length in *result and returns true.
 // On failure, returns false.
 bool ReadUncompressedLength(uint32* result) {
 assert(ip_ == NULL); // Must not have read anything yet
@@ -644,7 +789,7 @@ class SnappyDecompressor {
 const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
 reader_->Skip(1);
 uint32 val = c & 0x7f;
- if (((val << shift) >> shift) != val) return false;
+ if (LeftShiftOverflows(static_cast<uint8>(val), shift)) return false;
 *result |= val << shift;
 if (c < 128) {
 break;
@@ -657,22 +802,27 @@ class SnappyDecompressor {
 // Process the next item found in the input.
 // Returns true if successful, false on error or end of input.
 template <class Writer>
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __attribute__((aligned(32)))
+ #endif
 void DecompressAllTags(Writer* writer) {
- const char* ip = ip_;
- // For position-independent executables, accessing global arrays can be
- // slow. Move wordmask array onto the stack to mitigate this.
- uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)];
- // Do not use memcpy to copy internal::wordmask to
- // wordmask. LLVM converts stack arrays to global arrays if it detects
- // const stack arrays and this hurts the performance of position
- // independent code. This change is temporary and can be reverted when
- // https://reviews.llvm.org/D30759 is approved.
- wordmask[0] = internal::wordmask[0];
- wordmask[1] = internal::wordmask[1];
- wordmask[2] = internal::wordmask[2];
- wordmask[3] = internal::wordmask[3];
- wordmask[4] = internal::wordmask[4];
+ // In x86, pad the function body to start 16 bytes later. This function has
+ // a couple of hotspots that are highly sensitive to alignment: we have
+ // observed regressions by more than 20% in some metrics just by moving the
+ // exact same code to a different position in the benchmark binary.
+ //
+ // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit
+ // the "lucky" case consistently. Unfortunately, this is a very brittle
+ // workaround, and future differences in code generation may reintroduce
+ // this regression. If you experience a big, difficult to explain, benchmark
+ // performance regression here, first try removing this hack.
+ #if defined(__GNUC__) && defined(__x86_64__)
+ // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions.
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+ #endif
 
+ const char* ip = ip_;
 // We could have put this refill fragment only at the beginning of the loop.
 // However, duplicating it at the end of each branch gives the compiler more
 // scope to optimize the <ip_limit_ - ip> expression based on the local
@@ -685,13 +835,6 @@ class SnappyDecompressor {
 }
 
 MAYBE_REFILL();
- // Add loop alignment directive. Without this directive, we observed
- // significant performance degradation on several intel architectures
- // in snappy benchmark built with LLVM. The degradation was caused by
- // increased branch miss prediction.
- #if defined(__clang__) && defined(__x86_64__)
- asm volatile (".p2align 5");
- #endif
 for ( ;; ) {
 const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
 
@@ -712,7 +855,7 @@ class SnappyDecompressor {
 if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
 assert(literal_length < 61);
 ip += literal_length;
- // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
+ // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
 // will not return true unless there's already at least five spare
 // bytes in addition to the literal.
 continue;
@@ -721,7 +864,8 @@ class SnappyDecompressor {
 // Long literal.
 const size_t literal_length_length = literal_length - 60;
 literal_length =
- (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
+ ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +
+ 1;
 ip += literal_length_length;
 }
 
@@ -744,7 +888,8 @@ class SnappyDecompressor {
 MAYBE_REFILL();
 } else {
 const size_t entry = char_table[c];
- const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+ const size_t trailer =
+ ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11);
 const size_t length = entry & 0xff;
 ip += entry >> 11;
 
@@ -860,9 +1005,7 @@ size_t Compress(Source* reader, Sink* writer) {
 writer->Append(ulength, p-ulength);
 written += (p - ulength);
 
- internal::WorkingMemory wmem;
- char* scratch = NULL;
- char* scratch_output = NULL;
+ internal::WorkingMemory wmem(N);
 
 while (N > 0) {
 // Get next block to compress (without copying if possible)
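The lazily allocated `scratch`/`scratch_output` buffers that used to live in this loop are gone; the `WorkingMemory` constructor shown earlier carves the hash table, a scratch input fragment, and a scratch output region out of a single allocation. A rough sketch of that layout with assumed, illustrative sizes (the real constants come from snappy's `CalculateTableSize()` and `MaxCompressedLength()`):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>

// One allocation, partitioned into hash table / scratch input / scratch output.
struct ScratchArenaSketch {
  explicit ScratchArenaSketch(std::size_t input_size) {
    const std::size_t kBlockSize = 64 * 1024;                 // snappy block size
    const std::size_t fragment = std::min(input_size, kBlockSize);
    const std::size_t table_bytes = 16384 * sizeof(uint16_t); // assumed max table
    const std::size_t output_bytes = 32 + fragment + fragment / 6;  // assumed bound
    size_ = table_bytes + fragment + output_bytes;
    mem_.reset(new char[size_]);
    table_ = reinterpret_cast<uint16_t*>(mem_.get());
    input_ = mem_.get() + table_bytes;  // scratch copy of the input fragment
    output_ = input_ + fragment;        // scratch compressed output
  }
  std::unique_ptr<char[]> mem_;
  std::size_t size_;
  uint16_t* table_;
  char* input_;
  char* output_;
};

int main() {
  ScratchArenaSketch arena(100 * 1024);  // a single allocation serves all three regions
  return arena.size_ > 0 ? 0 : 1;
}
```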
@@ -878,13 +1021,7 @@ size_t Compress(Source* reader, Sink* writer) {
 pending_advance = num_to_read;
 fragment_size = num_to_read;
 } else {
- // Read into scratch buffer
- if (scratch == NULL) {
- // If this is the last iteration, we want to allocate N bytes
- // of space, otherwise the max possible kBlockSize space.
- // num_to_read contains exactly the correct value
- scratch = new char[num_to_read];
- }
+ char* scratch = wmem.GetScratchInput();
 memcpy(scratch, fragment, bytes_read);
 reader->Skip(bytes_read);
 
@@ -910,16 +1047,13 @@ size_t Compress(Source* reader, Sink* writer) {
 
 // Need a scratch buffer for the output, in case the byte sink doesn't
 // have room for us directly.
- if (scratch_output == NULL) {
- scratch_output = new char[max_output];
- } else {
- // Since we encode kBlockSize regions followed by a region
- // which is <= kBlockSize in length, a previously allocated
- // scratch_output[] region is big enough for this iteration.
- }
- char* dest = writer->GetAppendBuffer(max_output, scratch_output);
- char* end = internal::CompressFragment(fragment, fragment_size,
- dest, table, table_size);
+
+ // Since we encode kBlockSize regions followed by a region
+ // which is <= kBlockSize in length, a previously allocated
+ // scratch_output[] region is big enough for this iteration.
+ char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
+ char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
+ table_size);
 writer->Append(dest, end - dest);
 written += (end - dest);
 
@@ -929,9 +1063,6 @@ size_t Compress(Source* reader, Sink* writer) {
 
 Report("snappy_compress", written, uncompressed_size);
 
- delete[] scratch;
- delete[] scratch_output;
-
 return written;
 }
 
@@ -944,14 +1075,22 @@ size_t Compress(Source* reader, Sink* writer) {
 // Writer template argument to SnappyDecompressor::DecompressAllTags().
 class SnappyIOVecWriter {
 private:
+ // output_iov_end_ is set to iov + count and used to determine when
+ // the end of the iovs is reached.
+ const struct iovec* output_iov_end_;
+
+ #if !defined(NDEBUG)
 const struct iovec* output_iov_;
- const size_t output_iov_count_;
+ #endif // !defined(NDEBUG)
+
+ // Current iov that is being written into.
+ const struct iovec* curr_iov_;
 
- // We are currently writing into output_iov_[curr_iov_index_].
- size_t curr_iov_index_;
+ // Pointer to current iov's write location.
+ char* curr_iov_output_;
 
- // Bytes written to output_iov_[curr_iov_index_] so far.
- size_t curr_iov_written_;
+ // Remaining bytes to write into curr_iov_output.
+ size_t curr_iov_remaining_;
 
 // Total bytes decompressed into output_iov_ so far.
 size_t total_written_;
@@ -959,22 +1098,24 @@ class SnappyIOVecWriter {
 // Maximum number of bytes that will be decompressed into output_iov_.
 size_t output_limit_;
 
- inline char* GetIOVecPointer(size_t index, size_t offset) {
- return reinterpret_cast<char*>(output_iov_[index].iov_base) +
- offset;
+ static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
+ return reinterpret_cast<char*>(iov->iov_base) + offset;
 }
 
 public:
 // Does not take ownership of iov. iov must be valid during the
 // entire lifetime of the SnappyIOVecWriter.
 inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
- : output_iov_(iov),
- output_iov_count_(iov_count),
- curr_iov_index_(0),
- curr_iov_written_(0),
+ : output_iov_end_(iov + iov_count),
+ #if !defined(NDEBUG)
+ output_iov_(iov),
+ #endif // !defined(NDEBUG)
+ curr_iov_(iov),
+ curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
+ : nullptr),
+ curr_iov_remaining_(iov_count ? iov->iov_len : 0),
 total_written_(0),
- output_limit_(-1) {
- }
+ output_limit_(-1) {}
 
 inline void SetExpectedLength(size_t len) {
 output_limit_ = len;
@@ -989,23 +1130,25 @@ class SnappyIOVecWriter {
 return false;
 }
 
+ return AppendNoCheck(ip, len);
+ }
+
+ inline bool AppendNoCheck(const char* ip, size_t len) {
 while (len > 0) {
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
- if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
+ if (curr_iov_remaining_ == 0) {
 // This iovec is full. Go to the next one.
- if (curr_iov_index_ + 1 >= output_iov_count_) {
+ if (curr_iov_ + 1 >= output_iov_end_) {
 return false;
 }
- curr_iov_written_ = 0;
- ++curr_iov_index_;
+ ++curr_iov_;
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+ curr_iov_remaining_ = curr_iov_->iov_len;
 }
 
- const size_t to_write = std::min(
- len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
- memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- ip,
- to_write);
- curr_iov_written_ += to_write;
+ const size_t to_write = std::min(len, curr_iov_remaining_);
+ memcpy(curr_iov_output_, ip, to_write);
+ curr_iov_output_ += to_write;
+ curr_iov_remaining_ -= to_write;
 total_written_ += to_write;
 ip += to_write;
 len -= to_write;
@@ -1017,11 +1160,11 @@ class SnappyIOVecWriter {
 inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
 const size_t space_left = output_limit_ - total_written_;
 if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
- output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
+ curr_iov_remaining_ >= 16) {
 // Fast path, used for the majority (about 95%) of invocations.
- char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
- UnalignedCopy128(ip, ptr);
- curr_iov_written_ += len;
+ UnalignedCopy128(ip, curr_iov_output_);
+ curr_iov_output_ += len;
+ curr_iov_remaining_ -= len;
 total_written_ += len;
 return true;
 }
@@ -1030,7 +1173,9 @@ class SnappyIOVecWriter {
 }
 
 inline bool AppendFromSelf(size_t offset, size_t len) {
- if (offset > total_written_ || offset == 0) {
+ // See SnappyArrayWriter::AppendFromSelf for an explanation of
+ // the "offset - 1u" trick.
+ if (offset - 1u >= total_written_) {
 return false;
 }
 const size_t space_left = output_limit_ - total_written_;
@@ -1039,8 +1184,8 @@ class SnappyIOVecWriter {
 }
 
 // Locate the iovec from which we need to start the copy.
- size_t from_iov_index = curr_iov_index_;
- size_t from_iov_offset = curr_iov_written_;
+ const iovec* from_iov = curr_iov_;
+ size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
 while (offset > 0) {
 if (from_iov_offset >= offset) {
 from_iov_offset -= offset;
@@ -1048,47 +1193,47 @@ class SnappyIOVecWriter {
 }
 
 offset -= from_iov_offset;
- assert(from_iov_index > 0);
- --from_iov_index;
- from_iov_offset = output_iov_[from_iov_index].iov_len;
+ --from_iov;
+ #if !defined(NDEBUG)
+ assert(from_iov >= output_iov_);
+ #endif // !defined(NDEBUG)
+ from_iov_offset = from_iov->iov_len;
 }
 
 // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
 // the current iovec.
 while (len > 0) {
- assert(from_iov_index <= curr_iov_index_);
- if (from_iov_index != curr_iov_index_) {
- const size_t to_copy = std::min(
- output_iov_[from_iov_index].iov_len - from_iov_offset,
- len);
- Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
+ assert(from_iov <= curr_iov_);
+ if (from_iov != curr_iov_) {
+ const size_t to_copy =
+ std::min(from_iov->iov_len - from_iov_offset, len);
+ AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
 len -= to_copy;
 if (len > 0) {
- ++from_iov_index;
+ ++from_iov;
 from_iov_offset = 0;
 }
 } else {
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
- size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
- curr_iov_written_,
- len);
+ size_t to_copy = curr_iov_remaining_;
 if (to_copy == 0) {
 // This iovec is full. Go to the next one.
- if (curr_iov_index_ + 1 >= output_iov_count_) {
+ if (curr_iov_ + 1 >= output_iov_end_) {
 return false;
 }
- ++curr_iov_index_;
- curr_iov_written_ = 0;
+ ++curr_iov_;
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+ curr_iov_remaining_ = curr_iov_->iov_len;
 continue;
 }
 if (to_copy > len) {
 to_copy = len;
 }
- IncrementalCopySlow(
- GetIOVecPointer(from_iov_index, from_iov_offset),
- GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- GetIOVecPointer(curr_iov_index_, curr_iov_written_) + to_copy);
- curr_iov_written_ += to_copy;
+
+ IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
+ curr_iov_output_, curr_iov_output_ + to_copy,
+ curr_iov_output_ + curr_iov_remaining_);
+ curr_iov_output_ += to_copy;
+ curr_iov_remaining_ -= to_copy;
 from_iov_offset += to_copy;
 total_written_ += to_copy;
 len -= to_copy;
@@ -1197,7 +1342,7 @@ bool RawUncompress(Source* compressed, char* uncompressed) {
 return InternalUncompress(compressed, &output);
 }
 
- bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
+ bool Uncompress(const char* compressed, size_t n, std::string* uncompressed) {
 size_t ulength;
 if (!GetUncompressedLength(compressed, n, &ulength)) {
 return false;
@@ -1265,7 +1410,8 @@ void RawCompress(const char* input,
 *compressed_length = (writer.CurrentDestination() - compressed);
 }
 
- size_t Compress(const char* input, size_t input_length, string* compressed) {
+ size_t Compress(const char* input, size_t input_length,
+ std::string* compressed) {
 // Pre-grow the buffer to the max length of the compressed output
 STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
 
@@ -1512,4 +1658,4 @@ bool Uncompress(Source* compressed, Sink* uncompressed) {
 }
 }
 
- } // end namespace snappy
+ } // namespace snappy
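Finally, the public `Compress`/`Uncompress` overloads now spell out `std::string` instead of the old `string` typedef. A minimal caller against the vendored C++ API, reflecting the signatures shown above (a usage sketch, not part of the gem's test suite):

```cpp
#include <string>

#include "snappy.h"  // vendored in this gem under data/vendor/snappy

int main() {
  const std::string original(10000, 'x');
  std::string compressed, restored;

  // Both overloads now take std::string* rather than the old `string` typedef.
  snappy::Compress(original.data(), original.size(), &compressed);
  const bool ok =
      snappy::Uncompress(compressed.data(), compressed.size(), &restored);

  return (ok && restored == original) ? 0 : 1;
}
```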