snappy 0.1.0 → 0.2.0

Files changed (46)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/main.yml +34 -0
  3. data/.github/workflows/publish.yml +34 -0
  4. data/Gemfile +3 -4
  5. data/Rakefile +32 -30
  6. data/ext/api.c +6 -1
  7. data/lib/snappy.rb +5 -5
  8. data/lib/snappy/hadoop/reader.rb +6 -2
  9. data/lib/snappy/reader.rb +11 -7
  10. data/lib/snappy/shim.rb +1 -1
  11. data/lib/snappy/version.rb +1 -1
  12. data/snappy.gemspec +13 -9
  13. data/test/hadoop/snappy_hadoop_reader_test.rb +115 -0
  14. data/test/hadoop/snappy_hadoop_writer_test.rb +48 -0
  15. data/test/snappy_hadoop_test.rb +26 -0
  16. data/test/snappy_reader_test.rb +148 -0
  17. data/test/snappy_test.rb +95 -0
  18. data/test/snappy_writer_test.rb +55 -0
  19. data/test/test_helper.rb +7 -0
  20. data/vendor/snappy/CMakeLists.txt +177 -54
  21. data/vendor/snappy/NEWS +8 -0
  22. data/vendor/snappy/README.md +19 -20
  23. data/vendor/snappy/cmake/SnappyConfig.cmake.in +33 -0
  24. data/vendor/snappy/cmake/config.h.in +6 -6
  25. data/vendor/snappy/docs/README.md +72 -0
  26. data/vendor/snappy/snappy-internal.h +12 -5
  27. data/vendor/snappy/snappy-stubs-internal.cc +1 -1
  28. data/vendor/snappy/snappy-stubs-internal.h +60 -15
  29. data/vendor/snappy/snappy-stubs-public.h.in +16 -36
  30. data/vendor/snappy/snappy-test.cc +16 -15
  31. data/vendor/snappy/snappy-test.h +12 -60
  32. data/vendor/snappy/snappy.cc +333 -187
  33. data/vendor/snappy/snappy.h +14 -10
  34. data/vendor/snappy/snappy_compress_fuzzer.cc +59 -0
  35. data/vendor/snappy/snappy_uncompress_fuzzer.cc +57 -0
  36. data/vendor/snappy/snappy_unittest.cc +220 -124
  37. metadata +26 -20
  38. data/.travis.yml +0 -31
  39. data/smoke.sh +0 -8
  40. data/test/hadoop/test-snappy-hadoop-reader.rb +0 -103
  41. data/test/hadoop/test-snappy-hadoop-writer.rb +0 -48
  42. data/test/test-snappy-hadoop.rb +0 -22
  43. data/test/test-snappy-reader.rb +0 -129
  44. data/test/test-snappy-writer.rb +0 -55
  45. data/test/test-snappy.rb +0 -58
  46. data/vendor/snappy/cmake/SnappyConfig.cmake +0 -1
data/vendor/snappy/snappy-test.cc

@@ -48,12 +48,12 @@ DEFINE_bool(run_microbenchmarks, true,
 
 namespace snappy {
 
-string ReadTestDataFile(const string& base, size_t size_limit) {
-  string contents;
+std::string ReadTestDataFile(const std::string& base, size_t size_limit) {
+  std::string contents;
   const char* srcdir = getenv("srcdir");  // This is set by Automake.
-  string prefix;
+  std::string prefix;
   if (srcdir) {
-    prefix = string(srcdir) + "/";
+    prefix = std::string(srcdir) + "/";
   }
   file::GetContents(prefix + "testdata/" + base, &contents, file::Defaults()
                     ).CheckSuccess();
@@ -63,11 +63,11 @@ string ReadTestDataFile(const string& base, size_t size_limit) {
   return contents;
 }
 
-string ReadTestDataFile(const string& base) {
+std::string ReadTestDataFile(const std::string& base) {
   return ReadTestDataFile(base, 0);
 }
 
-string StringPrintf(const char* format, ...) {
+std::string StrFormat(const char* format, ...) {
   char buf[4096];
   va_list ap;
   va_start(ap, format);
@@ -79,7 +79,7 @@ string StringPrintf(const char* format, ...) {
 bool benchmark_running = false;
 int64 benchmark_real_time_us = 0;
 int64 benchmark_cpu_time_us = 0;
-string *benchmark_label = NULL;
+std::string* benchmark_label = nullptr;
 int64 benchmark_bytes_processed = 0;
 
 void ResetBenchmarkTiming() {
@@ -163,11 +163,11 @@ void StopBenchmarkTiming() {
   benchmark_running = false;
 }
 
-void SetBenchmarkLabel(const string& str) {
+void SetBenchmarkLabel(const std::string& str) {
   if (benchmark_label) {
     delete benchmark_label;
   }
-  benchmark_label = new string(str);
+  benchmark_label = new std::string(str);
 }
 
 void SetBenchmarkBytesProcessed(int64 bytes) {
@@ -217,8 +217,8 @@ void Benchmark::Run() {
       benchmark_runs[run].cpu_time_us = benchmark_cpu_time_us;
     }
 
-    string heading = StringPrintf("%s/%d", name_.c_str(), test_case_num);
-    string human_readable_speed;
+    std::string heading = StrFormat("%s/%d", name_.c_str(), test_case_num);
+    std::string human_readable_speed;
 
     std::nth_element(benchmark_runs,
                      benchmark_runs + kMedianPos,
@@ -232,15 +232,16 @@ void Benchmark::Run() {
        int64 bytes_per_second =
            benchmark_bytes_processed * 1000000 / cpu_time_us;
        if (bytes_per_second < 1024) {
-          human_readable_speed = StringPrintf("%dB/s", bytes_per_second);
+          human_readable_speed =
+              StrFormat("%dB/s", static_cast<int>(bytes_per_second));
        } else if (bytes_per_second < 1024 * 1024) {
-          human_readable_speed = StringPrintf(
+          human_readable_speed = StrFormat(
              "%.1fkB/s", bytes_per_second / 1024.0f);
        } else if (bytes_per_second < 1024 * 1024 * 1024) {
-          human_readable_speed = StringPrintf(
+          human_readable_speed = StrFormat(
              "%.1fMB/s", bytes_per_second / (1024.0f * 1024.0f));
        } else {
-          human_readable_speed = StringPrintf(
+          human_readable_speed = StrFormat(
              "%.1fGB/s", bytes_per_second / (1024.0f * 1024.0f * 1024.0f));
        }
      }
data/vendor/snappy/snappy-test.h

@@ -55,8 +55,6 @@
 #include <windows.h>
 #endif
 
-#include <string>
-
 #ifdef HAVE_GTEST
 
 #include <gtest/gtest.h>
@@ -169,7 +167,7 @@ namespace file {
 namespace snappy {
 
 #define FLAGS_test_random_seed 301
-typedef string TypeParam;
+using TypeParam = std::string;
 
 void Test_CorruptedTest_VerifyCorrupted();
 void Test_Snappy_SimpleTests();
@@ -183,63 +181,13 @@ void Test_Snappy_ReadPastEndOfBuffer();
 void Test_Snappy_FindMatchLength();
 void Test_Snappy_FindMatchLengthRandom();
 
-string ReadTestDataFile(const string& base, size_t size_limit);
+std::string ReadTestDataFile(const std::string& base, size_t size_limit);
 
-string ReadTestDataFile(const string& base);
+std::string ReadTestDataFile(const std::string& base);
 
 // A sprintf() variant that returns a std::string.
 // Not safe for general use due to truncation issues.
-string StringPrintf(const char* format, ...);
-
-// A simple, non-cryptographically-secure random generator.
-class ACMRandom {
- public:
-  explicit ACMRandom(uint32 seed) : seed_(seed) {}
-
-  int32 Next();
-
-  int32 Uniform(int32 n) {
-    return Next() % n;
-  }
-  uint8 Rand8() {
-    return static_cast<uint8>((Next() >> 1) & 0x000000ff);
-  }
-  bool OneIn(int X) { return Uniform(X) == 0; }
-
-  // Skewed: pick "base" uniformly from range [0,max_log] and then
-  // return "base" random bits. The effect is to pick a number in the
-  // range [0,2^max_log-1] with bias towards smaller numbers.
-  int32 Skewed(int max_log);
-
- private:
-  static const uint32 M = 2147483647L;  // 2^31-1
-  uint32 seed_;
-};
-
-inline int32 ACMRandom::Next() {
-  static const uint64 A = 16807;  // bits 14, 8, 7, 5, 2, 1, 0
-  // We are computing
-  //       seed_ = (seed_ * A) % M,    where M = 2^31-1
-  //
-  // seed_ must not be zero or M, or else all subsequent computed values
-  // will be zero or M respectively. For all other values, seed_ will end
-  // up cycling through every number in [1,M-1]
-  uint64 product = seed_ * A;
-
-  // Compute (product % M) using the fact that ((x << 31) % M) == x.
-  seed_ = (product >> 31) + (product & M);
-  // The first reduction may overflow by 1 bit, so we may need to repeat.
-  // mod == M is not possible; using > allows the faster sign-bit-based test.
-  if (seed_ > M) {
-    seed_ -= M;
-  }
-  return seed_;
-}
-
-inline int32 ACMRandom::Skewed(int max_log) {
-  const int32 base = (Next() - 1) % (max_log+1);
-  return (Next() - 1) & ((1u << base)-1);
-}
+std::string StrFormat(const char* format, ...);
 
 // A wall-time clock. This stub is not super-accurate, nor resistant to the
 // system time changing.
@@ -293,8 +241,8 @@ typedef void (*BenchmarkFunction)(int, int);
 
 class Benchmark {
  public:
-  Benchmark(const string& name, BenchmarkFunction function) :
-      name_(name), function_(function) {}
+  Benchmark(const std::string& name, BenchmarkFunction function)
+      : name_(name), function_(function) {}
 
   Benchmark* DenseRange(int start, int stop) {
     start_ = start;
@@ -305,7 +253,7 @@ class Benchmark {
   void Run();
 
  private:
-  const string name_;
+  const std::string name_;
   const BenchmarkFunction function_;
   int start_, stop_;
 };
@@ -317,11 +265,13 @@ extern Benchmark* Benchmark_BM_UFlat;
 extern Benchmark* Benchmark_BM_UIOVec;
 extern Benchmark* Benchmark_BM_UValidate;
 extern Benchmark* Benchmark_BM_ZFlat;
+extern Benchmark* Benchmark_BM_ZFlatAll;
+extern Benchmark* Benchmark_BM_ZFlatIncreasingTableSize;
 
 void ResetBenchmarkTiming();
 void StartBenchmarkTiming();
 void StopBenchmarkTiming();
-void SetBenchmarkLabel(const string& str);
+void SetBenchmarkLabel(const std::string& str);
 void SetBenchmarkBytesProcessed(int64 bytes);
 
 #ifdef HAVE_LIBZ
@@ -468,6 +418,8 @@ static inline void RunSpecifiedBenchmarks() {
   snappy::Benchmark_BM_UIOVec->Run();
   snappy::Benchmark_BM_UValidate->Run();
   snappy::Benchmark_BM_ZFlat->Run();
+  snappy::Benchmark_BM_ZFlatAll->Run();
+  snappy::Benchmark_BM_ZFlatIncreasingTableSize->Run();
 
   fprintf(stderr, "\n");
 }
data/vendor/snappy/snappy.cc

@@ -30,25 +30,50 @@
 #include "snappy-internal.h"
 #include "snappy-sinksource.h"
 
-#ifndef SNAPPY_HAVE_SSE2
-#if defined(__SSE2__) || defined(_M_X64) || \
-    (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
-#define SNAPPY_HAVE_SSE2 1
+#if !defined(SNAPPY_HAVE_SSSE3)
+// __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD
+// support between SSE2 and AVX (so SSSE3 instructions require AVX support), and
+// defines __AVX__ when AVX support is available.
+#if defined(__SSSE3__) || defined(__AVX__)
+#define SNAPPY_HAVE_SSSE3 1
 #else
-#define SNAPPY_HAVE_SSE2 0
+#define SNAPPY_HAVE_SSSE3 0
 #endif
+#endif  // !defined(SNAPPY_HAVE_SSSE3)
+
+#if !defined(SNAPPY_HAVE_BMI2)
+// __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
+// specifically, but it does define __AVX2__ when AVX2 support is available.
+// Fortunately, AVX2 was introduced in Haswell, just like BMI2.
+//
+// BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
+// GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
+// case issuing BMI2 instructions results in a compiler error.
+#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
+#define SNAPPY_HAVE_BMI2 1
+#else
+#define SNAPPY_HAVE_BMI2 0
+#endif
+#endif  // !defined(SNAPPY_HAVE_BMI2)
+
+#if SNAPPY_HAVE_SSSE3
+// Please do not replace with <x86intrin.h>. or with headers that assume more
+// advanced SSE versions without checking with all the OWNERS.
+#include <tmmintrin.h>
 #endif
 
-#if SNAPPY_HAVE_SSE2
-#include <emmintrin.h>
+#if SNAPPY_HAVE_BMI2
+// Please do not replace with <x86intrin.h>. or with headers that assume more
+// advanced SSE versions without checking with all the OWNERS.
+#include <immintrin.h>
 #endif
+
 #include <stdio.h>
 
 #include <algorithm>
 #include <string>
 #include <vector>
 
-
 namespace snappy {
 
 using internal::COPY_1_BYTE_OFFSET;
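For readers wondering which of these paths their own toolchain takes, the following standalone sketch simply mirrors the detection conditions from the hunk above. It is a diagnostic program written for this summary, not part of snappy; the printed text only describes what the vendored macros would evaluate to.

```cpp
#include <cstdio>

// Mirrors the detection logic above: report which of the vendored
// SNAPPY_HAVE_* macros this compiler invocation would enable.
int main() {
#if defined(__SSSE3__) || defined(__AVX__)
  std::puts("SNAPPY_HAVE_SSSE3 would be 1");
#else
  std::puts("SNAPPY_HAVE_SSSE3 would be 0");
#endif
#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
  std::puts("SNAPPY_HAVE_BMI2 would be 1");
#else
  std::puts("SNAPPY_HAVE_BMI2 would be 0");
#endif
  return 0;
}
```

Building with, say, `g++ -mssse3` versus plain `g++` flips the first line, which is exactly the knob the new guards react to.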
@@ -103,16 +128,12 @@ void UnalignedCopy64(const void* src, void* dst) {
 }
 
 void UnalignedCopy128(const void* src, void* dst) {
-  // TODO(alkis): Remove this when we upgrade to a recent compiler that emits
-  // SSE2 moves for memcpy(dst, src, 16).
-#if SNAPPY_HAVE_SSE2
-  __m128i x = _mm_loadu_si128(static_cast<const __m128i*>(src));
-  _mm_storeu_si128(static_cast<__m128i*>(dst), x);
-#else
+  // memcpy gets vectorized when the appropriate compiler options are used.
+  // For example, x86 compilers targeting SSE2+ will optimize to an SSE2 load
+  // and store.
   char tmp[16];
   memcpy(tmp, src, 16);
   memcpy(dst, tmp, 16);
-#endif
 }
 
 // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
@@ -127,12 +148,35 @@ void UnalignedCopy128(const void* src, void* dst) {
 // Note that this does not match the semantics of either memcpy() or memmove().
 inline char* IncrementalCopySlow(const char* src, char* op,
                                  char* const op_limit) {
+  // TODO: Remove pragma when LLVM is aware this
+  // function is only called in cold regions and when cold regions don't get
+  // vectorized or unrolled.
+#ifdef __clang__
+#pragma clang loop unroll(disable)
+#endif
   while (op < op_limit) {
     *op++ = *src++;
   }
   return op_limit;
 }
 
+#if SNAPPY_HAVE_SSSE3
+
+// This is a table of shuffle control masks that can be used as the source
+// operand for PSHUFB to permute the contents of the destination XMM register
+// into a repeating byte pattern.
+alignas(16) const char pshufb_fill_patterns[7][16] = {
+  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+  {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
+  {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
+  {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
+  {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
+  {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
+  {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
+};
+
+#endif  // SNAPPY_HAVE_SSSE3
+
 // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than
 // IncrementalCopySlow. buf_limit is the address past the end of the writable
 // region of the buffer.
@@ -144,9 +188,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   // pat = op - src
   // len = limit - op
   assert(src < op);
+  assert(op <= op_limit);
   assert(op_limit <= buf_limit);
   // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
-  // to optimize this function but we have to also handle these cases in case
+  // to optimize this function but we have to also handle other cases in case
   // the input does not satisfy these conditions.
 
   size_t pattern_size = op - src;
@@ -176,16 +221,45 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 
   // Handle the uncommon case where pattern is less than 8 bytes.
   if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
-    // Expand pattern to at least 8 bytes. The worse case scenario in terms of
-    // buffer usage is when the pattern is size 3. ^ is the original position
-    // of op. x are irrelevant bytes copied by the last UnalignedCopy64.
+#if SNAPPY_HAVE_SSSE3
+    // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
+    // to permute the register's contents in-place into a repeating sequence of
+    // the first "pattern_size" bytes.
+    // For example, suppose:
+    //    src       == "abc"
+    //    op        == op + 3
+    // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
+    // followed by one byte of slop: abcabcabcabcabca.
     //
-    // abc
-    // abcabcxxxxx
-    // abcabcabcabcxxxxx
-    //    ^
-    // The last x is 14 bytes after ^.
-    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 14)) {
+    // The non-SSE fallback implementation suffers from store-forwarding stalls
+    // because its loads and stores partly overlap. By expanding the pattern
+    // in-place, we avoid the penalty.
+    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
+      const __m128i shuffle_mask = _mm_load_si128(
+          reinterpret_cast<const __m128i*>(pshufb_fill_patterns)
+          + pattern_size - 1);
+      const __m128i pattern = _mm_shuffle_epi8(
+          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
+      // Uninitialized bytes are masked out by the shuffle mask.
+      // TODO: remove annotation and macro defs once MSan is fixed.
+      SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
+      pattern_size *= 16 / pattern_size;
+      char* op_end = std::min(op_limit, buf_limit - 15);
+      while (op < op_end) {
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+        op += pattern_size;
+      }
+      if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
+    }
+    return IncrementalCopySlow(src, op, op_limit);
+#else  // !SNAPPY_HAVE_SSSE3
+    // If plenty of buffer space remains, expand the pattern to at least 8
+    // bytes. The way the following loop is written, we need 8 bytes of buffer
+    // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
+    // bytes if pattern_size is 2. Precisely encoding that is probably not
+    // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
+    // (because 11 are required in the worst case).
+    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
      while (pattern_size < 8) {
        UnalignedCopy64(src, op);
        op += pattern_size;
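The PSHUFB trick above can be watched in isolation with a minimal sketch, assuming an SSSE3-capable build (e.g. `g++ -mssse3`); the mask below is simply the `pattern_size == 3` row of `pshufb_fill_patterns`, and the program only illustrates the shuffle, it is not snappy code.

```cpp
#include <cstdio>
#include <tmmintrin.h>  // SSSE3: _mm_shuffle_epi8

int main() {
  // A 3-byte pattern, zero-padded so the 8-byte load is well defined.
  const char src[8] = {'a', 'b', 'c'};
  // Row pattern_size - 1 == 2 of pshufb_fill_patterns above.
  alignas(16) const char mask3[16] = {0, 1, 2, 0, 1, 2, 0, 1,
                                      2, 0, 1, 2, 0, 1, 2, 0};
  const __m128i pattern = _mm_shuffle_epi8(
      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)),
      _mm_load_si128(reinterpret_cast<const __m128i*>(mask3)));
  char out[16];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), pattern);
  std::printf("%.16s\n", out);  // prints "abcabcabcabcabca"
  return 0;
}
```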
@@ -195,6 +269,7 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
    } else {
      return IncrementalCopySlow(src, op, op_limit);
    }
+#endif  // SNAPPY_HAVE_SSSE3
  }
  assert(pattern_size >= 8);
 
@@ -202,13 +277,48 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
   // because expanding the pattern to at least 8 bytes guarantees that
   // op - src >= 8.
-  while (op <= buf_limit - 16) {
+  //
+  // Typically, the op_limit is the gating factor so try to simplify the loop
+  // based on that.
+  if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
+    // There is at least one, and at most four 16-byte blocks. Writing four
+    // conditionals instead of a loop allows FDO to layout the code with respect
+    // to the actual probabilities of each length.
+    // TODO: Replace with loop with trip count hint.
+    UnalignedCopy64(src, op);
+    UnalignedCopy64(src + 8, op + 8);
+
+    if (op + 16 < op_limit) {
+      UnalignedCopy64(src + 16, op + 16);
+      UnalignedCopy64(src + 24, op + 24);
+    }
+    if (op + 32 < op_limit) {
+      UnalignedCopy64(src + 32, op + 32);
+      UnalignedCopy64(src + 40, op + 40);
+    }
+    if (op + 48 < op_limit) {
+      UnalignedCopy64(src + 48, op + 48);
+      UnalignedCopy64(src + 56, op + 56);
+    }
+    return op_limit;
+  }
+
+  // Fall back to doing as much as we can with the available slop in the
+  // buffer. This code path is relatively cold however so we save code size by
+  // avoiding unrolling and vectorizing.
+  //
+  // TODO: Remove pragma when when cold regions don't get vectorized
+  // or unrolled.
+#ifdef __clang__
+#pragma clang loop unroll(disable)
+#endif
+  for (char *op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
     UnalignedCopy64(src, op);
     UnalignedCopy64(src + 8, op + 8);
-    src += 16;
-    op += 16;
-    if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
   }
+  if (op >= op_limit)
+    return op_limit;
+
 
   // We only take this branch if we didn't have enough slop and we can do a
   // single 8 byte copy.
@@ -221,10 +331,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 
 }  // namespace
 
+template <bool allow_fast_path>
 static inline char* EmitLiteral(char* op,
                                 const char* literal,
-                                int len,
-                                bool allow_fast_path) {
+                                int len) {
   // The vast majority of copies are below 16 bytes, for which a
   // call to memcpy is overkill. This fast path can sometimes
   // copy up to 15 bytes too much, but that is okay in the
@@ -249,25 +359,23 @@ static inline char* EmitLiteral(char* op,
     // Fits in tag byte
     *op++ = LITERAL | (n << 2);
   } else {
-    // Encode in upcoming bytes
-    char* base = op;
-    int count = 0;
-    op++;
-    while (n > 0) {
-      *op++ = n & 0xff;
-      n >>= 8;
-      count++;
-    }
+    int count = (Bits::Log2Floor(n) >> 3) + 1;
     assert(count >= 1);
     assert(count <= 4);
-    *base = LITERAL | ((59+count) << 2);
+    *op++ = LITERAL | ((59 + count) << 2);
+    // Encode in upcoming bytes.
+    // Write 4 bytes, though we may care about only 1 of them. The output buffer
+    // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds
+    // here and there is a memcpy of size 'len' below.
+    LittleEndian::Store32(op, n);
+    op += count;
   }
   memcpy(op, literal, len);
   return op + len;
 }
 
-static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
-                                     bool len_less_than_12) {
+template <bool len_less_than_12>
+static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
   assert(len <= 64);
   assert(len >= 4);
   assert(offset < 65536);
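The new `count` expression in `EmitLiteral` replaces the old byte-by-byte loop with a direct computation of how many bytes are needed to store `n`. A minimal sketch of that arithmetic, using `__builtin_clz` as a stand-in for `Bits::Log2Floor` (an assumption about the reader's toolchain, not snappy's own helper):

```cpp
#include <cassert>
#include <cstdint>

// Stand-in for Bits::Log2Floor; valid for n > 0. EmitLiteral only reaches
// this branch with n = len - 1 >= 60, so n is always positive there.
static int Log2Floor(uint32_t n) { return 31 - __builtin_clz(n); }

// Number of bytes needed to store n, as computed in EmitLiteral above.
static int TagByteCount(uint32_t n) { return (Log2Floor(n) >> 3) + 1; }

int main() {
  assert(TagByteCount(60) == 1);
  assert(TagByteCount(255) == 1);          // still fits in one byte
  assert(TagByteCount(256) == 2);          // needs a second byte
  assert(TagByteCount(65536) == 3);
  assert(TagByteCount(0xFFFFFFFFu) == 4);
  return 0;
}
```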
@@ -288,29 +396,33 @@ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
   return op;
 }
 
-static inline char* EmitCopy(char* op, size_t offset, size_t len,
-                             bool len_less_than_12) {
+template <bool len_less_than_12>
+static inline char* EmitCopy(char* op, size_t offset, size_t len) {
   assert(len_less_than_12 == (len < 12));
   if (len_less_than_12) {
-    return EmitCopyAtMost64(op, offset, len, true);
+    return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
   } else {
     // A special case for len <= 64 might help, but so far measurements suggest
     // it's in the noise.
 
     // Emit 64 byte copies but make sure to keep at least four bytes reserved.
     while (SNAPPY_PREDICT_FALSE(len >= 68)) {
-      op = EmitCopyAtMost64(op, offset, 64, false);
+      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
       len -= 64;
     }
 
     // One or two copies will now finish the job.
     if (len > 64) {
-      op = EmitCopyAtMost64(op, offset, 60, false);
+      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
       len -= 60;
     }
 
     // Emit remainder.
-    op = EmitCopyAtMost64(op, offset, len, len < 12);
+    if (len < 12) {
+      op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
+    } else {
+      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
+    }
     return op;
   }
 }
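As a sketch of the splitting policy in `EmitCopy` above (the 68-byte threshold keeps at least four bytes for the final copy, since every emitted copy must be at least 4 bytes long), the hypothetical helper below models only the length arithmetic, not the tag encoding:

```cpp
#include <cassert>
#include <vector>

// Models how EmitCopy splits a long match length into <= 64-byte copies
// while keeping at least 4 bytes in reserve for the final copy.
static std::vector<int> SplitCopyLength(int len) {
  std::vector<int> pieces;
  while (len >= 68) { pieces.push_back(64); len -= 64; }
  if (len > 64)     { pieces.push_back(60); len -= 60; }
  pieces.push_back(len);  // 4 <= len <= 64 holds here
  return pieces;
}

int main() {
  assert(SplitCopyLength(64) == std::vector<int>({64}));
  assert(SplitCopyLength(67) == std::vector<int>({60, 7}));   // a 64 would leave < 4
  assert(SplitCopyLength(68) == std::vector<int>({64, 4}));
  assert(SplitCopyLength(200) == std::vector<int>({64, 64, 64, 8}));
  return 0;
}
```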
@@ -326,31 +438,45 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
   }
 }
 
-namespace internal {
-uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
-  // Use smaller hash table when input.size() is smaller, since we
-  // fill the table, incurring O(hash table size) overhead for
-  // compression, and if the input is short, we won't need that
-  // many hash table entries anyway.
-  assert(kMaxHashTableSize >= 256);
-  size_t htsize = 256;
-  while (htsize < kMaxHashTableSize && htsize < input_size) {
-    htsize <<= 1;
+namespace {
+uint32 CalculateTableSize(uint32 input_size) {
+  static_assert(
+      kMaxHashTableSize >= kMinHashTableSize,
+      "kMaxHashTableSize should be greater or equal to kMinHashTableSize.");
+  if (input_size > kMaxHashTableSize) {
+    return kMaxHashTableSize;
   }
-
-  uint16* table;
-  if (htsize <= ARRAYSIZE(small_table_)) {
-    table = small_table_;
-  } else {
-    if (large_table_ == NULL) {
-      large_table_ = new uint16[kMaxHashTableSize];
-    }
-    table = large_table_;
+  if (input_size < kMinHashTableSize) {
+    return kMinHashTableSize;
   }
+  // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
+  // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
+  return 2u << Bits::Log2Floor(input_size - 1);
+}
+}  // namespace
 
+namespace internal {
+WorkingMemory::WorkingMemory(size_t input_size) {
+  const size_t max_fragment_size = std::min(input_size, kBlockSize);
+  const size_t table_size = CalculateTableSize(max_fragment_size);
+  size_ = table_size * sizeof(*table_) + max_fragment_size +
+          MaxCompressedLength(max_fragment_size);
+  mem_ = std::allocator<char>().allocate(size_);
+  table_ = reinterpret_cast<uint16*>(mem_);
+  input_ = mem_ + table_size * sizeof(*table_);
+  output_ = input_ + max_fragment_size;
+}
+
+WorkingMemory::~WorkingMemory() {
+  std::allocator<char>().deallocate(mem_, size_);
+}
+
+uint16* WorkingMemory::GetHashTable(size_t fragment_size,
+                                    int* table_size) const {
+  const size_t htsize = CalculateTableSize(fragment_size);
+  memset(table_, 0, htsize * sizeof(*table_));
   *table_size = htsize;
-  memset(table, 0, htsize * sizeof(*table));
-  return table;
+  return table_;
 }
 }  // end namespace internal
 
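The `2u << Bits::Log2Floor(input_size - 1)` expression in `CalculateTableSize` rounds the clamped input size up to the next power of two. A minimal check of that identity, again with `__builtin_clz` standing in for `Bits::Log2Floor` (an assumption for the sake of a self-contained sketch):

```cpp
#include <cassert>
#include <cstdint>

static int Log2Floor(uint32_t n) { return 31 - __builtin_clz(n); }

// Round x up to a power of two, as CalculateTableSize does for x > 1.
static uint32_t RoundUpToPowerOfTwo(uint32_t x) {
  return 2u << Log2Floor(x - 1);
}

int main() {
  assert(RoundUpToPowerOfTwo(256) == 256);    // already a power of two
  assert(RoundUpToPowerOfTwo(257) == 512);
  assert(RoundUpToPowerOfTwo(1000) == 1024);
  return 0;
}
```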
@@ -417,7 +543,7 @@ char* CompressFragment(const char* input,
   // "ip" is the input pointer, and "op" is the output pointer.
   const char* ip = input;
   assert(input_size <= kBlockSize);
-  assert((table_size & (table_size - 1)) == 0); // table must be power of two
+  assert((table_size & (table_size - 1)) == 0);  // table must be power of two
   const int shift = 32 - Bits::Log2Floor(table_size);
   assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
   const char* ip_end = input + input_size;
@@ -484,7 +610,7 @@ char* CompressFragment(const char* input,
       // than 4 bytes match.  But, prior to the match, input
       // bytes [next_emit, ip) are unmatched.  Emit them as "literal bytes."
       assert(next_emit + 16 <= ip_end);
-      op = EmitLiteral(op, next_emit, ip - next_emit, true);
+      op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);
 
       // Step 3: Call EmitCopy, and then see if another EmitCopy could
       // be our next move.  Repeat until we find no match for the
@@ -507,7 +633,11 @@ char* CompressFragment(const char* input,
         ip += matched;
         size_t offset = base - candidate;
         assert(0 == memcmp(base, candidate, matched));
-        op = EmitCopy(op, offset, matched, p.second);
+        if (p.second) {
+          op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
+        } else {
+          op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
+        }
         next_emit = ip;
         if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
           goto emit_remainder;
@@ -532,7 +662,8 @@ char* CompressFragment(const char* input,
  emit_remainder:
   // Emit the remaining bytes as a literal
   if (next_emit < ip_end) {
-    op = EmitLiteral(op, next_emit, ip_end - next_emit, false);
+    op = EmitLiteral</*allow_fast_path=*/false>(op, next_emit,
+                                                ip_end - next_emit);
   }
 
   return op;
@@ -583,14 +714,28 @@ static inline void Report(const char *algorithm, size_t compressed_size,
 //   bool TryFastAppend(const char* ip, size_t available, size_t length);
 // };
 
-namespace internal {
-
-// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
-static const uint32 wordmask[] = {
-  0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
-};
+static inline uint32 ExtractLowBytes(uint32 v, int n) {
+  assert(n >= 0);
+  assert(n <= 4);
+#if SNAPPY_HAVE_BMI2
+  return _bzhi_u32(v, 8 * n);
+#else
+  // This needs to be wider than uint32 otherwise `mask << 32` will be
+  // undefined.
+  uint64 mask = 0xffffffff;
+  return v & ~(mask << (8 * n));
+#endif
+}
 
-}  // end namespace internal
+static inline bool LeftShiftOverflows(uint8 value, uint32 shift) {
+  assert(shift < 32);
+  static const uint8 masks[] = {
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
+      0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
+  return (value & masks[shift]) != 0;
+}
 
 // Helper class for decompression
 class SnappyDecompressor {
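The portable branch of `ExtractLowBytes` is easy to check on its own. The sketch below restates that branch outside snappy and exercises it for every allowed `n`; the BMI2 branch computes the same value with a single `_bzhi_u32`.

```cpp
#include <cassert>
#include <cstdint>

// The non-BMI2 branch of ExtractLowBytes above: keep only the low 8*n bits.
// The mask is 64-bit so that `mask << 32` (the n == 4 case) is well defined.
static uint32_t ExtractLowBytes(uint32_t v, int n) {
  assert(n >= 0 && n <= 4);
  uint64_t mask = 0xffffffff;
  return static_cast<uint32_t>(v & ~(mask << (8 * n)));
}

int main() {
  assert(ExtractLowBytes(0xAABBCCDD, 0) == 0x00000000);
  assert(ExtractLowBytes(0xAABBCCDD, 1) == 0x000000DD);
  assert(ExtractLowBytes(0xAABBCCDD, 2) == 0x0000CCDD);
  assert(ExtractLowBytes(0xAABBCCDD, 3) == 0x00BBCCDD);
  assert(ExtractLowBytes(0xAABBCCDD, 4) == 0xAABBCCDD);
  return 0;
}
```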
@@ -629,7 +774,7 @@ class SnappyDecompressor {
   }
 
   // Read the uncompressed length stored at the start of the compressed data.
-  // On succcess, stores the length in *result and returns true.
+  // On success, stores the length in *result and returns true.
   // On failure, returns false.
   bool ReadUncompressedLength(uint32* result) {
     assert(ip_ == NULL);  // Must not have read anything yet
@@ -644,7 +789,7 @@ class SnappyDecompressor {
       const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
       reader_->Skip(1);
       uint32 val = c & 0x7f;
-      if (((val << shift) >> shift) != val) return false;
+      if (LeftShiftOverflows(static_cast<uint8>(val), shift)) return false;
       *result |= val << shift;
      if (c < 128) {
        break;
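The new overflow test is a table lookup instead of the old shift-and-compare. For the 7-bit varint payload bytes that `ReadUncompressedLength` feeds it, the two are interchangeable, which the standalone cross-check below verifies exhaustively (a sketch written for this summary, not snappy code):

```cpp
#include <cassert>
#include <cstdint>

// Same table as LeftShiftOverflows above: does (value << shift) drop bits of
// an 8-bit value when the result must fit in 32 bits?
static bool LeftShiftOverflows(uint8_t value, uint32_t shift) {
  static const uint8_t masks[] = {
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
  return (value & masks[shift]) != 0;
}

int main() {
  // Cross-check against the old "((val << shift) >> shift) != val" test for
  // every 7-bit payload value and every shift below 32.
  for (uint32_t shift = 0; shift < 32; ++shift) {
    for (uint32_t val = 0; val < 128; ++val) {
      const bool old_test = ((val << shift) >> shift) != val;
      assert(LeftShiftOverflows(static_cast<uint8_t>(val), shift) == old_test);
    }
  }
  return 0;
}
```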
@@ -657,22 +802,27 @@ class SnappyDecompressor {
   // Process the next item found in the input.
   // Returns true if successful, false on error or end of input.
   template <class Writer>
+#if defined(__GNUC__) && defined(__x86_64__)
+  __attribute__((aligned(32)))
+#endif
   void DecompressAllTags(Writer* writer) {
-    const char* ip = ip_;
-    // For position-independent executables, accessing global arrays can be
-    // slow. Move wordmask array onto the stack to mitigate this.
-    uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)];
-    // Do not use memcpy to copy internal::wordmask to
-    // wordmask. LLVM converts stack arrays to global arrays if it detects
-    // const stack arrays and this hurts the performance of position
-    // independent code. This change is temporary and can be reverted when
-    // https://reviews.llvm.org/D30759 is approved.
-    wordmask[0] = internal::wordmask[0];
-    wordmask[1] = internal::wordmask[1];
-    wordmask[2] = internal::wordmask[2];
-    wordmask[3] = internal::wordmask[3];
-    wordmask[4] = internal::wordmask[4];
+    // In x86, pad the function body to start 16 bytes later. This function has
+    // a couple of hotspots that are highly sensitive to alignment: we have
+    // observed regressions by more than 20% in some metrics just by moving the
+    // exact same code to a different position in the benchmark binary.
+    //
+    // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit
+    // the "lucky" case consistently. Unfortunately, this is a very brittle
+    // workaround, and future differences in code generation may reintroduce
+    // this regression. If you experience a big, difficult to explain, benchmark
+    // performance regression here, first try removing this hack.
+#if defined(__GNUC__) && defined(__x86_64__)
+    // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions.
+    asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+    asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+#endif
 
+    const char* ip = ip_;
     // We could have put this refill fragment only at the beginning of the loop.
     // However, duplicating it at the end of each branch gives the compiler more
     // scope to optimize the <ip_limit_ - ip> expression based on the local
@@ -685,13 +835,6 @@ class SnappyDecompressor {
     }
 
     MAYBE_REFILL();
-    // Add loop alignment directive. Without this directive, we observed
-    // significant performance degradation on several intel architectures
-    // in snappy benchmark built with LLVM. The degradation was caused by
-    // increased branch miss prediction.
-#if defined(__clang__) && defined(__x86_64__)
-    asm volatile (".p2align 5");
-#endif
     for ( ;; ) {
       const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
 
@@ -712,7 +855,7 @@ class SnappyDecompressor {
         if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
           assert(literal_length < 61);
           ip += literal_length;
-          // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
+          // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
           // will not return true unless there's already at least five spare
           // bytes in addition to the literal.
           continue;
@@ -721,7 +864,8 @@ class SnappyDecompressor {
         // Long literal.
         const size_t literal_length_length = literal_length - 60;
         literal_length =
-            (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
+            ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +
+            1;
         ip += literal_length_length;
       }
 
@@ -744,7 +888,8 @@ class SnappyDecompressor {
         MAYBE_REFILL();
       } else {
        const size_t entry = char_table[c];
-        const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+        const size_t trailer =
+            ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11);
        const size_t length = entry & 0xff;
        ip += entry >> 11;
 
@@ -860,9 +1005,7 @@ size_t Compress(Source* reader, Sink* writer) {
   writer->Append(ulength, p-ulength);
   written += (p - ulength);
 
-  internal::WorkingMemory wmem;
-  char* scratch = NULL;
-  char* scratch_output = NULL;
+  internal::WorkingMemory wmem(N);
 
   while (N > 0) {
     // Get next block to compress (without copying if possible)
@@ -878,13 +1021,7 @@ size_t Compress(Source* reader, Sink* writer) {
       pending_advance = num_to_read;
       fragment_size = num_to_read;
     } else {
-      // Read into scratch buffer
-      if (scratch == NULL) {
-        // If this is the last iteration, we want to allocate N bytes
-        // of space, otherwise the max possible kBlockSize space.
-        // num_to_read contains exactly the correct value
-        scratch = new char[num_to_read];
-      }
+      char* scratch = wmem.GetScratchInput();
       memcpy(scratch, fragment, bytes_read);
       reader->Skip(bytes_read);
 
@@ -910,16 +1047,13 @@ size_t Compress(Source* reader, Sink* writer) {
 
     // Need a scratch buffer for the output, in case the byte sink doesn't
     // have room for us directly.
-    if (scratch_output == NULL) {
-      scratch_output = new char[max_output];
-    } else {
-      // Since we encode kBlockSize regions followed by a region
-      // which is <= kBlockSize in length, a previously allocated
-      // scratch_output[] region is big enough for this iteration.
-    }
-    char* dest = writer->GetAppendBuffer(max_output, scratch_output);
-    char* end = internal::CompressFragment(fragment, fragment_size,
-                                           dest, table, table_size);
+
+    // Since we encode kBlockSize regions followed by a region
+    // which is <= kBlockSize in length, a previously allocated
+    // scratch_output[] region is big enough for this iteration.
+    char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
+    char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
+                                           table_size);
     writer->Append(dest, end - dest);
     written += (end - dest);
 
@@ -929,9 +1063,6 @@ size_t Compress(Source* reader, Sink* writer) {
 
   Report("snappy_compress", written, uncompressed_size);
 
-  delete[] scratch;
-  delete[] scratch_output;
-
   return written;
 }
 
@@ -944,14 +1075,22 @@ size_t Compress(Source* reader, Sink* writer) {
 // Writer template argument to SnappyDecompressor::DecompressAllTags().
 class SnappyIOVecWriter {
  private:
+  // output_iov_end_ is set to iov + count and used to determine when
+  // the end of the iovs is reached.
+  const struct iovec* output_iov_end_;
+
+#if !defined(NDEBUG)
   const struct iovec* output_iov_;
-  const size_t output_iov_count_;
+#endif  // !defined(NDEBUG)
+
+  // Current iov that is being written into.
+  const struct iovec* curr_iov_;
 
-  // We are currently writing into output_iov_[curr_iov_index_].
-  size_t curr_iov_index_;
+  // Pointer to current iov's write location.
+  char* curr_iov_output_;
 
-  // Bytes written to output_iov_[curr_iov_index_] so far.
-  size_t curr_iov_written_;
+  // Remaining bytes to write into curr_iov_output.
+  size_t curr_iov_remaining_;
 
   // Total bytes decompressed into output_iov_ so far.
   size_t total_written_;
@@ -959,22 +1098,24 @@ class SnappyIOVecWriter {
   // Maximum number of bytes that will be decompressed into output_iov_.
   size_t output_limit_;
 
-  inline char* GetIOVecPointer(size_t index, size_t offset) {
-    return reinterpret_cast<char*>(output_iov_[index].iov_base) +
-        offset;
+  static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
+    return reinterpret_cast<char*>(iov->iov_base) + offset;
   }
 
  public:
   // Does not take ownership of iov. iov must be valid during the
   // entire lifetime of the SnappyIOVecWriter.
   inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
-      : output_iov_(iov),
-        output_iov_count_(iov_count),
-        curr_iov_index_(0),
-        curr_iov_written_(0),
+      : output_iov_end_(iov + iov_count),
+#if !defined(NDEBUG)
+        output_iov_(iov),
+#endif  // !defined(NDEBUG)
+        curr_iov_(iov),
+        curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
+                                   : nullptr),
+        curr_iov_remaining_(iov_count ? iov->iov_len : 0),
         total_written_(0),
-        output_limit_(-1) {
-  }
+        output_limit_(-1) {}
 
   inline void SetExpectedLength(size_t len) {
     output_limit_ = len;
@@ -989,23 +1130,25 @@ class SnappyIOVecWriter {
       return false;
     }
 
+    return AppendNoCheck(ip, len);
+  }
+
+  inline bool AppendNoCheck(const char* ip, size_t len) {
     while (len > 0) {
-      assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
-      if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
+      if (curr_iov_remaining_ == 0) {
        // This iovec is full. Go to the next one.
-        if (curr_iov_index_ + 1 >= output_iov_count_) {
+        if (curr_iov_ + 1 >= output_iov_end_) {
          return false;
        }
-        curr_iov_written_ = 0;
-        ++curr_iov_index_;
+        ++curr_iov_;
+        curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+        curr_iov_remaining_ = curr_iov_->iov_len;
      }
 
-      const size_t to_write = std::min(
-          len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
-      memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
-             ip,
-             to_write);
-      curr_iov_written_ += to_write;
+      const size_t to_write = std::min(len, curr_iov_remaining_);
+      memcpy(curr_iov_output_, ip, to_write);
+      curr_iov_output_ += to_write;
+      curr_iov_remaining_ -= to_write;
      total_written_ += to_write;
      ip += to_write;
      len -= to_write;
@@ -1017,11 +1160,11 @@ class SnappyIOVecWriter {
   inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
     const size_t space_left = output_limit_ - total_written_;
     if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
-        output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
+        curr_iov_remaining_ >= 16) {
       // Fast path, used for the majority (about 95%) of invocations.
-      char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
-      UnalignedCopy128(ip, ptr);
-      curr_iov_written_ += len;
+      UnalignedCopy128(ip, curr_iov_output_);
+      curr_iov_output_ += len;
+      curr_iov_remaining_ -= len;
       total_written_ += len;
       return true;
     }
@@ -1030,7 +1173,9 @@ class SnappyIOVecWriter {
   }
 
   inline bool AppendFromSelf(size_t offset, size_t len) {
-    if (offset > total_written_ || offset == 0) {
+    // See SnappyArrayWriter::AppendFromSelf for an explanation of
+    // the "offset - 1u" trick.
+    if (offset - 1u >= total_written_) {
       return false;
     }
     const size_t space_left = output_limit_ - total_written_;
@@ -1039,8 +1184,8 @@ class SnappyIOVecWriter {
     }
 
     // Locate the iovec from which we need to start the copy.
-    size_t from_iov_index = curr_iov_index_;
-    size_t from_iov_offset = curr_iov_written_;
+    const iovec* from_iov = curr_iov_;
+    size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
     while (offset > 0) {
      if (from_iov_offset >= offset) {
        from_iov_offset -= offset;
@@ -1048,47 +1193,47 @@ class SnappyIOVecWriter {
      }
 
      offset -= from_iov_offset;
-      assert(from_iov_index > 0);
-      --from_iov_index;
-      from_iov_offset = output_iov_[from_iov_index].iov_len;
+      --from_iov;
+#if !defined(NDEBUG)
+      assert(from_iov >= output_iov_);
+#endif  // !defined(NDEBUG)
+      from_iov_offset = from_iov->iov_len;
    }
 
    // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
    // the current iovec.
    while (len > 0) {
-      assert(from_iov_index <= curr_iov_index_);
-      if (from_iov_index != curr_iov_index_) {
-        const size_t to_copy = std::min(
-            output_iov_[from_iov_index].iov_len - from_iov_offset,
-            len);
-        Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
+      assert(from_iov <= curr_iov_);
+      if (from_iov != curr_iov_) {
+        const size_t to_copy =
+            std::min(from_iov->iov_len - from_iov_offset, len);
+        AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
        len -= to_copy;
        if (len > 0) {
-          ++from_iov_index;
+          ++from_iov;
          from_iov_offset = 0;
        }
      } else {
-        assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
-        size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
-                                  curr_iov_written_,
-                                  len);
+        size_t to_copy = curr_iov_remaining_;
        if (to_copy == 0) {
          // This iovec is full. Go to the next one.
-          if (curr_iov_index_ + 1 >= output_iov_count_) {
+          if (curr_iov_ + 1 >= output_iov_end_) {
            return false;
          }
-          ++curr_iov_index_;
-          curr_iov_written_ = 0;
+          ++curr_iov_;
+          curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+          curr_iov_remaining_ = curr_iov_->iov_len;
          continue;
        }
        if (to_copy > len) {
          to_copy = len;
        }
-        IncrementalCopySlow(
-            GetIOVecPointer(from_iov_index, from_iov_offset),
-            GetIOVecPointer(curr_iov_index_, curr_iov_written_),
-            GetIOVecPointer(curr_iov_index_, curr_iov_written_) + to_copy);
-        curr_iov_written_ += to_copy;
+
+        IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
+                        curr_iov_output_, curr_iov_output_ + to_copy,
+                        curr_iov_output_ + curr_iov_remaining_);
+        curr_iov_output_ += to_copy;
+        curr_iov_remaining_ -= to_copy;
        from_iov_offset += to_copy;
        total_written_ += to_copy;
        len -= to_copy;
@@ -1197,7 +1342,7 @@ bool RawUncompress(Source* compressed, char* uncompressed) {
   return InternalUncompress(compressed, &output);
 }
 
-bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
+bool Uncompress(const char* compressed, size_t n, std::string* uncompressed) {
   size_t ulength;
   if (!GetUncompressedLength(compressed, n, &ulength)) {
     return false;
@@ -1265,7 +1410,8 @@ void RawCompress(const char* input,
   *compressed_length = (writer.CurrentDestination() - compressed);
 }
 
-size_t Compress(const char* input, size_t input_length, string* compressed) {
+size_t Compress(const char* input, size_t input_length,
+                std::string* compressed) {
   // Pre-grow the buffer to the max length of the compressed output
   STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
 
@@ -1512,4 +1658,4 @@ bool Uncompress(Source* compressed, Sink* uncompressed) {
 }
 }
 
-} // end namespace snappy
+}  // namespace snappy