snappy 0.0.17 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. checksums.yaml +5 -5
  2. data/.dockerignore +2 -0
  3. data/.github/workflows/main.yml +34 -0
  4. data/.github/workflows/publish.yml +34 -0
  5. data/.gitignore +2 -1
  6. data/.gitmodules +1 -1
  7. data/Dockerfile +13 -0
  8. data/Gemfile +4 -0
  9. data/README.md +29 -5
  10. data/Rakefile +32 -29
  11. data/ext/api.c +6 -1
  12. data/ext/extconf.rb +23 -16
  13. data/lib/snappy/hadoop/reader.rb +62 -0
  14. data/lib/snappy/hadoop/writer.rb +51 -0
  15. data/lib/snappy/hadoop.rb +22 -0
  16. data/lib/snappy/reader.rb +14 -10
  17. data/lib/snappy/shim.rb +1 -1
  18. data/lib/snappy/version.rb +1 -1
  19. data/lib/snappy.rb +5 -4
  20. data/snappy.gemspec +13 -13
  21. data/test/hadoop/snappy_hadoop_reader_test.rb +115 -0
  22. data/test/hadoop/snappy_hadoop_writer_test.rb +48 -0
  23. data/test/snappy_hadoop_test.rb +26 -0
  24. data/test/snappy_reader_test.rb +148 -0
  25. data/test/snappy_test.rb +95 -0
  26. data/test/snappy_writer_test.rb +55 -0
  27. data/test/test_helper.rb +7 -0
  28. data/test.sh +3 -0
  29. data/vendor/snappy/CMakeLists.txt +297 -0
  30. data/vendor/snappy/CONTRIBUTING.md +26 -0
  31. data/vendor/snappy/NEWS +40 -0
  32. data/vendor/snappy/{README → README.md} +27 -18
  33. data/vendor/snappy/cmake/SnappyConfig.cmake.in +33 -0
  34. data/vendor/snappy/cmake/config.h.in +62 -0
  35. data/vendor/snappy/docs/README.md +72 -0
  36. data/vendor/snappy/snappy-internal.h +22 -18
  37. data/vendor/snappy/snappy-stubs-internal.cc +1 -1
  38. data/vendor/snappy/snappy-stubs-internal.h +116 -38
  39. data/vendor/snappy/snappy-stubs-public.h.in +20 -46
  40. data/vendor/snappy/snappy-test.cc +26 -22
  41. data/vendor/snappy/snappy-test.h +24 -98
  42. data/vendor/snappy/snappy.cc +380 -183
  43. data/vendor/snappy/snappy.h +14 -10
  44. data/vendor/snappy/snappy_compress_fuzzer.cc +59 -0
  45. data/vendor/snappy/snappy_uncompress_fuzzer.cc +57 -0
  46. data/vendor/snappy/snappy_unittest.cc +236 -261
  47. metadata +37 -92
  48. data/.travis.yml +0 -26
  49. data/smoke.sh +0 -8
  50. data/test/test-snappy-reader.rb +0 -129
  51. data/test/test-snappy-writer.rb +0 -55
  52. data/test/test-snappy.rb +0 -58
  53. data/vendor/snappy/ChangeLog +0 -2468
  54. data/vendor/snappy/INSTALL +0 -370
  55. data/vendor/snappy/Makefile +0 -982
  56. data/vendor/snappy/Makefile.am +0 -26
  57. data/vendor/snappy/Makefile.in +0 -982
  58. data/vendor/snappy/aclocal.m4 +0 -9738
  59. data/vendor/snappy/autogen.sh +0 -12
  60. data/vendor/snappy/autom4te.cache/output.0 +0 -18856
  61. data/vendor/snappy/autom4te.cache/output.1 +0 -18852
  62. data/vendor/snappy/autom4te.cache/requests +0 -297
  63. data/vendor/snappy/autom4te.cache/traces.0 +0 -2689
  64. data/vendor/snappy/autom4te.cache/traces.1 +0 -714
  65. data/vendor/snappy/config.guess +0 -1530
  66. data/vendor/snappy/config.h +0 -135
  67. data/vendor/snappy/config.h.in +0 -134
  68. data/vendor/snappy/config.log +0 -1640
  69. data/vendor/snappy/config.status +0 -2318
  70. data/vendor/snappy/config.sub +0 -1773
  71. data/vendor/snappy/configure +0 -18852
  72. data/vendor/snappy/configure.ac +0 -134
  73. data/vendor/snappy/depcomp +0 -688
  74. data/vendor/snappy/install-sh +0 -527
  75. data/vendor/snappy/libtool +0 -10246
  76. data/vendor/snappy/ltmain.sh +0 -9661
  77. data/vendor/snappy/m4/gtest.m4 +0 -74
  78. data/vendor/snappy/m4/libtool.m4 +0 -8001
  79. data/vendor/snappy/m4/ltoptions.m4 +0 -384
  80. data/vendor/snappy/m4/ltsugar.m4 +0 -123
  81. data/vendor/snappy/m4/ltversion.m4 +0 -23
  82. data/vendor/snappy/m4/lt~obsolete.m4 +0 -98
  83. data/vendor/snappy/missing +0 -331
  84. data/vendor/snappy/snappy-stubs-public.h +0 -100
  85. data/vendor/snappy/snappy.pc +0 -10
  86. data/vendor/snappy/snappy.pc.in +0 -10
  87. data/vendor/snappy/stamp-h1 +0 -1
@@ -30,16 +30,50 @@
  #include "snappy-internal.h"
  #include "snappy-sinksource.h"

- #if defined(__x86_64__) || defined(_M_X64)
- #include <emmintrin.h>
+ #if !defined(SNAPPY_HAVE_SSSE3)
+ // __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD
+ // support between SSE2 and AVX (so SSSE3 instructions require AVX support), and
+ // defines __AVX__ when AVX support is available.
+ #if defined(__SSSE3__) || defined(__AVX__)
+ #define SNAPPY_HAVE_SSSE3 1
+ #else
+ #define SNAPPY_HAVE_SSSE3 0
+ #endif
+ #endif // !defined(SNAPPY_HAVE_SSSE3)
+
+ #if !defined(SNAPPY_HAVE_BMI2)
+ // __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
+ // specifically, but it does define __AVX2__ when AVX2 support is available.
+ // Fortunately, AVX2 was introduced in Haswell, just like BMI2.
+ //
+ // BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
+ // GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
+ // case issuing BMI2 instructions results in a compiler error.
+ #if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
+ #define SNAPPY_HAVE_BMI2 1
+ #else
+ #define SNAPPY_HAVE_BMI2 0
  #endif
+ #endif // !defined(SNAPPY_HAVE_BMI2)
+
+ #if SNAPPY_HAVE_SSSE3
+ // Please do not replace with <x86intrin.h>. or with headers that assume more
+ // advanced SSE versions without checking with all the OWNERS.
+ #include <tmmintrin.h>
+ #endif
+
+ #if SNAPPY_HAVE_BMI2
+ // Please do not replace with <x86intrin.h>. or with headers that assume more
+ // advanced SSE versions without checking with all the OWNERS.
+ #include <immintrin.h>
+ #endif
+
  #include <stdio.h>

  #include <algorithm>
  #include <string>
  #include <vector>

-
  namespace snappy {

  using internal::COPY_1_BYTE_OFFSET;
@@ -47,7 +81,6 @@ using internal::COPY_2_BYTE_OFFSET;
  using internal::LITERAL;
  using internal::char_table;
  using internal::kMaximumTagLength;
- using internal::wordmask;

  // Any hash function will produce a valid compressed bitstream, but a good
  // hash function reduces the number of collisions and thus yields better
@@ -89,18 +122,18 @@ size_t MaxCompressedLength(size_t source_len) {
  namespace {

  void UnalignedCopy64(const void* src, void* dst) {
- memcpy(dst, src, 8);
+ char tmp[8];
+ memcpy(tmp, src, 8);
+ memcpy(dst, tmp, 8);
  }

  void UnalignedCopy128(const void* src, void* dst) {
- // TODO(alkis): Remove this when we upgrade to a recent compiler that emits
- // SSE2 moves for memcpy(dst, src, 16).
- #ifdef __SSE2__
- __m128i x = _mm_loadu_si128(static_cast<const __m128i*>(src));
- _mm_storeu_si128(static_cast<__m128i*>(dst), x);
- #else
- memcpy(dst, src, 16);
- #endif
+ // memcpy gets vectorized when the appropriate compiler options are used.
+ // For example, x86 compilers targeting SSE2+ will optimize to an SSE2 load
+ // and store.
+ char tmp[16];
+ memcpy(tmp, src, 16);
+ memcpy(dst, tmp, 16);
  }

  // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
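
The hunk above replaces the SSE2 intrinsics in UnalignedCopy64/UnalignedCopy128 with a fixed-size memcpy through a small stack buffer and leaves vectorization to the compiler. A minimal standalone sketch of the same pattern (not part of the diff; names are illustrative):

    // Fixed-size memcpy through a bounce buffer: compilers turn these calls
    // into single unaligned load/store instructions, no intrinsics needed.
    #include <cstdio>
    #include <cstring>

    static void CopyUnaligned64(const void* src, void* dst) {
      char tmp[8];                // bounce buffer keeps the two copies simple
      std::memcpy(tmp, src, 8);   // one 8-byte load
      std::memcpy(dst, tmp, 8);   // one 8-byte store
    }

    int main() {
      char buf[16] = "abcdefgh";
      CopyUnaligned64(buf, buf + 8);   // copy 8 bytes within the same buffer
      std::printf("%.16s\n", buf);     // prints "abcdefghabcdefgh"
      return 0;
    }
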
@@ -115,12 +148,35 @@ void UnalignedCopy128(const void* src, void* dst) {
  // Note that this does not match the semantics of either memcpy() or memmove().
  inline char* IncrementalCopySlow(const char* src, char* op,
  char* const op_limit) {
+ // TODO: Remove pragma when LLVM is aware this
+ // function is only called in cold regions and when cold regions don't get
+ // vectorized or unrolled.
+ #ifdef __clang__
+ #pragma clang loop unroll(disable)
+ #endif
  while (op < op_limit) {
  *op++ = *src++;
  }
  return op_limit;
  }

+ #if SNAPPY_HAVE_SSSE3
+
+ // This is a table of shuffle control masks that can be used as the source
+ // operand for PSHUFB to permute the contents of the destination XMM register
+ // into a repeating byte pattern.
+ alignas(16) const char pshufb_fill_patterns[7][16] = {
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
+ {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
+ {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
+ {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
+ };
+
+ #endif // SNAPPY_HAVE_SSSE3
+
  // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than
  // IncrementalCopySlow. buf_limit is the address past the end of the writable
  // region of the buffer.
@@ -132,9 +188,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
  // pat = op - src
  // len = limit - op
  assert(src < op);
+ assert(op <= op_limit);
  assert(op_limit <= buf_limit);
  // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
- // to optimize this function but we have to also handle these cases in case
+ // to optimize this function but we have to also handle other cases in case
  // the input does not satisfy these conditions.

  size_t pattern_size = op - src;
@@ -163,26 +220,56 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
  // copying 2x 8 bytes at a time.

  // Handle the uncommon case where pattern is less than 8 bytes.
- if (PREDICT_FALSE(pattern_size < 8)) {
- // Expand pattern to at least 8 bytes. The worse case scenario in terms of
- // buffer usage is when the pattern is size 3. ^ is the original position
- // of op. x are irrelevant bytes copied by the last UnalignedCopy64.
+ if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
+ #if SNAPPY_HAVE_SSSE3
+ // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
+ // to permute the register's contents in-place into a repeating sequence of
+ // the first "pattern_size" bytes.
+ // For example, suppose:
+ // src == "abc"
+ // op == op + 3
+ // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
+ // followed by one byte of slop: abcabcabcabcabca.
  //
- // abc
- // abcabcxxxxx
- // abcabcabcabcxxxxx
- // ^
- // The last x is 14 bytes after ^.
- if (PREDICT_TRUE(op <= buf_limit - 14)) {
+ // The non-SSE fallback implementation suffers from store-forwarding stalls
+ // because its loads and stores partly overlap. By expanding the pattern
+ // in-place, we avoid the penalty.
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
+ const __m128i shuffle_mask = _mm_load_si128(
+ reinterpret_cast<const __m128i*>(pshufb_fill_patterns)
+ + pattern_size - 1);
+ const __m128i pattern = _mm_shuffle_epi8(
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
+ // Uninitialized bytes are masked out by the shuffle mask.
+ // TODO: remove annotation and macro defs once MSan is fixed.
+ SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
+ pattern_size *= 16 / pattern_size;
+ char* op_end = std::min(op_limit, buf_limit - 15);
+ while (op < op_end) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+ op += pattern_size;
+ }
+ if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
+ }
+ return IncrementalCopySlow(src, op, op_limit);
+ #else // !SNAPPY_HAVE_SSSE3
+ // If plenty of buffer space remains, expand the pattern to at least 8
+ // bytes. The way the following loop is written, we need 8 bytes of buffer
+ // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
+ // bytes if pattern_size is 2. Precisely encoding that is probably not
+ // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
+ // (because 11 are required in the worst case).
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
  while (pattern_size < 8) {
  UnalignedCopy64(src, op);
  op += pattern_size;
  pattern_size *= 2;
  }
- if (PREDICT_TRUE(op >= op_limit)) return op_limit;
+ if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
  } else {
  return IncrementalCopySlow(src, op, op_limit);
  }
+ #endif // SNAPPY_HAVE_SSSE3
  }
  assert(pattern_size >= 8);
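
The PSHUFB path above expands a short repeating pattern into a full 16-byte block so it can be written with single unaligned 128-bit stores. A scalar sketch of what that shuffle computes (not from the diff; a reference model, not the shipped code):

    // Byte i of the shuffled result is src[i % pattern_size], i.e. the first
    // pattern_size bytes repeated end to end across 16 bytes.
    #include <cstdio>

    static void FillPattern16(const char* src, size_t pattern_size, char out[16]) {
      for (size_t i = 0; i < 16; ++i) out[i] = src[i % pattern_size];
    }

    int main() {
      char block[16];
      FillPattern16("abc", 3, block);
      std::printf("%.16s\n", block);  // abcabcabcabcabca, as in the comment above
      return 0;
    }

After each 16-byte store, op advances by the largest multiple of pattern_size that fits in 16 bytes (the `pattern_size *= 16 / pattern_size;` line), so consecutive stores stay aligned to the pattern.
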

@@ -190,16 +277,51 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
  // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
  // because expanding the pattern to at least 8 bytes guarantees that
  // op - src >= 8.
- while (op <= buf_limit - 16) {
+ //
+ // Typically, the op_limit is the gating factor so try to simplify the loop
+ // based on that.
+ if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
+ // There is at least one, and at most four 16-byte blocks. Writing four
+ // conditionals instead of a loop allows FDO to layout the code with respect
+ // to the actual probabilities of each length.
+ // TODO: Replace with loop with trip count hint.
+ UnalignedCopy64(src, op);
+ UnalignedCopy64(src + 8, op + 8);
+
+ if (op + 16 < op_limit) {
+ UnalignedCopy64(src + 16, op + 16);
+ UnalignedCopy64(src + 24, op + 24);
+ }
+ if (op + 32 < op_limit) {
+ UnalignedCopy64(src + 32, op + 32);
+ UnalignedCopy64(src + 40, op + 40);
+ }
+ if (op + 48 < op_limit) {
+ UnalignedCopy64(src + 48, op + 48);
+ UnalignedCopy64(src + 56, op + 56);
+ }
+ return op_limit;
+ }
+
+ // Fall back to doing as much as we can with the available slop in the
+ // buffer. This code path is relatively cold however so we save code size by
+ // avoiding unrolling and vectorizing.
+ //
+ // TODO: Remove pragma when when cold regions don't get vectorized
+ // or unrolled.
+ #ifdef __clang__
+ #pragma clang loop unroll(disable)
+ #endif
+ for (char *op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
  UnalignedCopy64(src, op);
  UnalignedCopy64(src + 8, op + 8);
- src += 16;
- op += 16;
- if (PREDICT_TRUE(op >= op_limit)) return op_limit;
  }
+ if (op >= op_limit)
+ return op_limit;
+
  // We only take this branch if we didn't have enough slop and we can do a
  // single 8 byte copy.
- if (PREDICT_FALSE(op <= buf_limit - 8)) {
+ if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
  UnalignedCopy64(src, op);
  src += 8;
  op += 8;
@@ -209,10 +331,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,

  } // namespace

+ template <bool allow_fast_path>
  static inline char* EmitLiteral(char* op,
  const char* literal,
- int len,
- bool allow_fast_path) {
+ int len) {
  // The vast majority of copies are below 16 bytes, for which a
  // call to memcpy is overkill. This fast path can sometimes
  // copy up to 15 bytes too much, but that is okay in the
@@ -237,31 +359,29 @@ static inline char* EmitLiteral(char* op,
  // Fits in tag byte
  *op++ = LITERAL | (n << 2);
  } else {
- // Encode in upcoming bytes
- char* base = op;
- int count = 0;
- op++;
- while (n > 0) {
- *op++ = n & 0xff;
- n >>= 8;
- count++;
- }
+ int count = (Bits::Log2Floor(n) >> 3) + 1;
  assert(count >= 1);
  assert(count <= 4);
- *base = LITERAL | ((59+count) << 2);
+ *op++ = LITERAL | ((59 + count) << 2);
+ // Encode in upcoming bytes.
+ // Write 4 bytes, though we may care about only 1 of them. The output buffer
+ // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds
+ // here and there is a memcpy of size 'len' below.
+ LittleEndian::Store32(op, n);
+ op += count;
  }
  memcpy(op, literal, len);
  return op + len;
  }

- static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
- bool len_less_than_12) {
+ template <bool len_less_than_12>
+ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
  assert(len <= 64);
  assert(len >= 4);
  assert(offset < 65536);
  assert(len_less_than_12 == (len < 12));

- if (len_less_than_12 && PREDICT_TRUE(offset < 2048)) {
+ if (len_less_than_12 && SNAPPY_PREDICT_TRUE(offset < 2048)) {
  // offset fits in 11 bits. The 3 highest go in the top of the first byte,
  // and the rest go in the second byte.
  *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0);
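
The new EmitLiteral replaces the old byte-stripping loop with a closed-form byte count, `(Bits::Log2Floor(n) >> 3) + 1`, then writes a whole little-endian 32-bit word and advances by that count. A small sketch checking the two computations agree (not from the diff; Log2Floor below is a portable stand-in for snappy's Bits::Log2Floor):

    #include <cassert>
    #include <cstdint>

    static int Log2Floor(uint32_t n) {           // index of the highest set bit
      int log = 0;
      while (n >>= 1) ++log;
      return log;
    }

    static int CountBytesLoop(uint32_t n) {      // old approach: strip 8 bits at a time
      int count = 0;
      while (n > 0) { n >>= 8; ++count; }
      return count;
    }

    int main() {
      // EmitLiteral only takes this path for lengths >= 61, so n = len - 1 >= 60.
      for (uint32_t n = 60; n <= (1u << 24); n += 997) {
        assert(((Log2Floor(n) >> 3) + 1) == CountBytesLoop(n));
      }
      return 0;
    }
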
@@ -276,29 +396,33 @@ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
  return op;
  }

- static inline char* EmitCopy(char* op, size_t offset, size_t len,
- bool len_less_than_12) {
+ template <bool len_less_than_12>
+ static inline char* EmitCopy(char* op, size_t offset, size_t len) {
  assert(len_less_than_12 == (len < 12));
  if (len_less_than_12) {
- return EmitCopyAtMost64(op, offset, len, true);
+ return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
  } else {
  // A special case for len <= 64 might help, but so far measurements suggest
  // it's in the noise.

  // Emit 64 byte copies but make sure to keep at least four bytes reserved.
- while (PREDICT_FALSE(len >= 68)) {
- op = EmitCopyAtMost64(op, offset, 64, false);
+ while (SNAPPY_PREDICT_FALSE(len >= 68)) {
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
  len -= 64;
  }

  // One or two copies will now finish the job.
  if (len > 64) {
- op = EmitCopyAtMost64(op, offset, 60, false);
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
  len -= 60;
  }

  // Emit remainder.
- op = EmitCopyAtMost64(op, offset, len, len < 12);
+ if (len < 12) {
+ op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
+ } else {
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
+ }
  return op;
  }
  }
@@ -314,31 +438,45 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
  }
  }

- namespace internal {
- uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
- // Use smaller hash table when input.size() is smaller, since we
- // fill the table, incurring O(hash table size) overhead for
- // compression, and if the input is short, we won't need that
- // many hash table entries anyway.
- assert(kMaxHashTableSize >= 256);
- size_t htsize = 256;
- while (htsize < kMaxHashTableSize && htsize < input_size) {
- htsize <<= 1;
+ namespace {
+ uint32 CalculateTableSize(uint32 input_size) {
+ static_assert(
+ kMaxHashTableSize >= kMinHashTableSize,
+ "kMaxHashTableSize should be greater or equal to kMinHashTableSize.");
+ if (input_size > kMaxHashTableSize) {
+ return kMaxHashTableSize;
  }
-
- uint16* table;
- if (htsize <= ARRAYSIZE(small_table_)) {
- table = small_table_;
- } else {
- if (large_table_ == NULL) {
- large_table_ = new uint16[kMaxHashTableSize];
- }
- table = large_table_;
+ if (input_size < kMinHashTableSize) {
+ return kMinHashTableSize;
  }
+ // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
+ // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
+ return 2u << Bits::Log2Floor(input_size - 1);
+ }
+ } // namespace
+
+ namespace internal {
+ WorkingMemory::WorkingMemory(size_t input_size) {
+ const size_t max_fragment_size = std::min(input_size, kBlockSize);
+ const size_t table_size = CalculateTableSize(max_fragment_size);
+ size_ = table_size * sizeof(*table_) + max_fragment_size +
+ MaxCompressedLength(max_fragment_size);
+ mem_ = std::allocator<char>().allocate(size_);
+ table_ = reinterpret_cast<uint16*>(mem_);
+ input_ = mem_ + table_size * sizeof(*table_);
+ output_ = input_ + max_fragment_size;
+ }
+
+ WorkingMemory::~WorkingMemory() {
+ std::allocator<char>().deallocate(mem_, size_);
+ }

+ uint16* WorkingMemory::GetHashTable(size_t fragment_size,
+ int* table_size) const {
+ const size_t htsize = CalculateTableSize(fragment_size);
+ memset(table_, 0, htsize * sizeof(*table_));
  *table_size = htsize;
- memset(table, 0, htsize * sizeof(*table));
- return table;
+ return table_;
  }
  } // end namespace internal
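
The reworked WorkingMemory sizes everything up front from the fragment size (CalculateTableSize clamps to the min/max and otherwise rounds up to the next power of two) and carves a single allocation into hash table, input scratch and output scratch. A rough sketch of that layout (not from the diff; the sizes are illustrative, with kBlockSize = 64 KiB and MaxCompressedLength(n) = 32 + n + n/6 as in snappy):

    #include <cstdint>
    #include <cstdio>
    #include <memory>

    int main() {
      const size_t kBlockSize = 1 << 16;                              // one compression fragment
      const size_t table_entries = 1 << 14;                           // power of two
      const size_t max_compressed = 32 + kBlockSize + kBlockSize / 6; // MaxCompressedLength(64 KiB)

      const size_t total = table_entries * sizeof(uint16_t) + kBlockSize + max_compressed;
      std::unique_ptr<char[]> mem(new char[total]);

      // Same carving order as the constructor above: table, then input, then output.
      uint16_t* table = reinterpret_cast<uint16_t*>(mem.get());
      char* input_scratch = mem.get() + table_entries * sizeof(uint16_t);
      char* output_scratch = input_scratch + kBlockSize;

      std::printf("one allocation of %zu bytes: table=%p input=%p output=%p\n", total,
                  static_cast<void*>(table), static_cast<void*>(input_scratch),
                  static_cast<void*>(output_scratch));
      return 0;
    }

This is what lets the new Compress() below drop its lazily allocated `scratch` and `scratch_output` buffers and call wmem.GetScratchInput()/GetScratchOutput() instead.
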

@@ -405,7 +543,7 @@ char* CompressFragment(const char* input,
  // "ip" is the input pointer, and "op" is the output pointer.
  const char* ip = input;
  assert(input_size <= kBlockSize);
- assert((table_size & (table_size - 1)) == 0); // table must be power of two
+ assert((table_size & (table_size - 1)) == 0);  // table must be power of two
  const int shift = 32 - Bits::Log2Floor(table_size);
  assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
  const char* ip_end = input + input_size;
@@ -415,7 +553,7 @@ char* CompressFragment(const char* input,
  const char* next_emit = ip;

  const size_t kInputMarginBytes = 15;
- if (PREDICT_TRUE(input_size >= kInputMarginBytes)) {
+ if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
  const char* ip_limit = input + input_size - kInputMarginBytes;

  for (uint32 next_hash = Hash(++ip, shift); ; ) {
@@ -456,7 +594,7 @@ char* CompressFragment(const char* input,
  uint32 bytes_between_hash_lookups = skip >> 5;
  skip += bytes_between_hash_lookups;
  next_ip = ip + bytes_between_hash_lookups;
- if (PREDICT_FALSE(next_ip > ip_limit)) {
+ if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
  goto emit_remainder;
  }
  next_hash = Hash(next_ip, shift);
@@ -465,14 +603,14 @@ char* CompressFragment(const char* input,
  assert(candidate < ip);

  table[hash] = ip - base_ip;
- } while (PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
- UNALIGNED_LOAD32(candidate)));
+ } while (SNAPPY_PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
+ UNALIGNED_LOAD32(candidate)));

  // Step 2: A 4-byte match has been found. We'll later see if more
  // than 4 bytes match. But, prior to the match, input
  // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
  assert(next_emit + 16 <= ip_end);
- op = EmitLiteral(op, next_emit, ip - next_emit, true);
+ op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);

  // Step 3: Call EmitCopy, and then see if another EmitCopy could
  // be our next move. Repeat until we find no match for the
@@ -495,9 +633,13 @@ char* CompressFragment(const char* input,
  ip += matched;
  size_t offset = base - candidate;
  assert(0 == memcmp(base, candidate, matched));
- op = EmitCopy(op, offset, matched, p.second);
+ if (p.second) {
+ op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
+ } else {
+ op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
+ }
  next_emit = ip;
- if (PREDICT_FALSE(ip >= ip_limit)) {
+ if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
  goto emit_remainder;
  }
  // We are now looking for a 4-byte match again. We read
@@ -520,13 +662,18 @@ char* CompressFragment(const char* input,
  emit_remainder:
  // Emit the remaining bytes as a literal
  if (next_emit < ip_end) {
- op = EmitLiteral(op, next_emit, ip_end - next_emit, false);
+ op = EmitLiteral</*allow_fast_path=*/false>(op, next_emit,
+ ip_end - next_emit);
  }

  return op;
  }
  } // end namespace internal

+ // Called back at avery compression call to trace parameters and sizes.
+ static inline void Report(const char *algorithm, size_t compressed_size,
+ size_t uncompressed_size) {}
+
  // Signature of output types needed by decompression code.
  // The decompression code is templatized on a type that obeys this
  // signature so that we do not pay virtual function call overhead in
@@ -567,6 +714,28 @@ char* CompressFragment(const char* input,
  // bool TryFastAppend(const char* ip, size_t available, size_t length);
  // };

+ static inline uint32 ExtractLowBytes(uint32 v, int n) {
+ assert(n >= 0);
+ assert(n <= 4);
+ #if SNAPPY_HAVE_BMI2
+ return _bzhi_u32(v, 8 * n);
+ #else
+ // This needs to be wider than uint32 otherwise `mask << 32` will be
+ // undefined.
+ uint64 mask = 0xffffffff;
+ return v & ~(mask << (8 * n));
+ #endif
+ }
+
+ static inline bool LeftShiftOverflows(uint8 value, uint32 shift) {
+ assert(shift < 32);
+ static const uint8 masks[] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
+ return (value & masks[shift]) != 0;
+ }

  // Helper class for decompression
  class SnappyDecompressor {
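
ExtractLowBytes above replaces the old global `wordmask` table: it keeps only the low n bytes of a 32-bit word, mapping to a single BZHI instruction (`_bzhi_u32(v, 8 * n)`) when BMI2 is available and otherwise using a 64-bit mask so the `mask << 32` case for n == 4 stays defined. A quick sketch checking the portable fallback against a wordmask-style table (not from the diff):

    #include <cassert>
    #include <cstdint>

    static uint32_t ExtractLowBytesPortable(uint32_t v, int n) {  // n in [0, 4]
      uint64_t mask = 0xffffffff;          // 64-bit so that mask << 32 is defined
      return static_cast<uint32_t>(v & ~(mask << (8 * n)));
    }

    int main() {
      const uint32_t wordmask[] = {0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu};
      const uint32_t v = 0x12345678u;
      for (int n = 0; n <= 4; ++n) {
        assert(ExtractLowBytesPortable(v, n) == (v & wordmask[n]));
      }
      return 0;
    }
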
@@ -605,7 +774,7 @@ class SnappyDecompressor {
  }

  // Read the uncompressed length stored at the start of the compressed data.
- // On succcess, stores the length in *result and returns true.
+ // On success, stores the length in *result and returns true.
  // On failure, returns false.
  bool ReadUncompressedLength(uint32* result) {
  assert(ip_ == NULL); // Must not have read anything yet
@@ -620,7 +789,7 @@ class SnappyDecompressor {
  const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
  reader_->Skip(1);
  uint32 val = c & 0x7f;
- if (((val << shift) >> shift) != val) return false;
+ if (LeftShiftOverflows(static_cast<uint8>(val), shift)) return false;
  *result |= val << shift;
  if (c < 128) {
  break;
@@ -633,13 +802,27 @@ class SnappyDecompressor {
  // Process the next item found in the input.
  // Returns true if successful, false on error or end of input.
  template <class Writer>
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __attribute__((aligned(32)))
+ #endif
  void DecompressAllTags(Writer* writer) {
- const char* ip = ip_;
- // For position-independent executables, accessing global arrays can be
- // slow. Move wordmask array onto the stack to mitigate this.
- uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)];
- memcpy(wordmask, internal::wordmask, sizeof(wordmask));
+ // In x86, pad the function body to start 16 bytes later. This function has
+ // a couple of hotspots that are highly sensitive to alignment: we have
+ // observed regressions by more than 20% in some metrics just by moving the
+ // exact same code to a different position in the benchmark binary.
+ //
+ // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit
+ // the "lucky" case consistently. Unfortunately, this is a very brittle
+ // workaround, and future differences in code generation may reintroduce
+ // this regression. If you experience a big, difficult to explain, benchmark
+ // performance regression here, first try removing this hack.
+ #if defined(__GNUC__) && defined(__x86_64__)
+ // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions.
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+ #endif

+ const char* ip = ip_;
  // We could have put this refill fragment only at the beginning of the loop.
  // However, duplicating it at the end of each branch gives the compiler more
  // scope to optimize the <ip_limit_ - ip> expression based on the local
@@ -667,21 +850,22 @@ class SnappyDecompressor {
  // txt[1-4] 25% 75%
  // pb 24% 76%
  // bin 24% 76%
- if (PREDICT_FALSE((c & 0x3) == LITERAL)) {
+ if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) {
  size_t literal_length = (c >> 2) + 1u;
  if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
  assert(literal_length < 61);
  ip += literal_length;
- // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
+ // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
  // will not return true unless there's already at least five spare
  // bytes in addition to the literal.
  continue;
  }
- if (PREDICT_FALSE(literal_length >= 61)) {
+ if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) {
  // Long literal.
  const size_t literal_length_length = literal_length - 60;
  literal_length =
- (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
+ ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +
+ 1;
  ip += literal_length_length;
  }

@@ -704,7 +888,8 @@ class SnappyDecompressor {
  MAYBE_REFILL();
  } else {
  const size_t entry = char_table[c];
- const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+ const size_t trailer =
+ ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11);
  const size_t length = entry & 0xff;
  ip += entry >> 11;

@@ -757,7 +942,7 @@ bool SnappyDecompressor::RefillTag() {
  size_t length;
  const char* src = reader_->Peek(&length);
  if (length == 0) return false;
- uint32 to_add = min<uint32>(needed - nbuf, length);
+ uint32 to_add = std::min<uint32>(needed - nbuf, length);
  memcpy(scratch_ + nbuf, src, to_add);
  nbuf += to_add;
  reader_->Skip(to_add);
@@ -786,13 +971,18 @@ static bool InternalUncompress(Source* r, Writer* writer) {
  SnappyDecompressor decompressor(r);
  uint32 uncompressed_len = 0;
  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;
- return InternalUncompressAllTags(&decompressor, writer, uncompressed_len);
+
+ return InternalUncompressAllTags(&decompressor, writer, r->Available(),
+ uncompressed_len);
  }

  template <typename Writer>
  static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
  Writer* writer,
+ uint32 compressed_len,
  uint32 uncompressed_len) {
+ Report("snappy_uncompress", compressed_len, uncompressed_len);
+
  writer->SetExpectedLength(uncompressed_len);

  // Process the entire input
@@ -809,21 +999,20 @@ bool GetUncompressedLength(Source* source, uint32* result) {
  size_t Compress(Source* reader, Sink* writer) {
  size_t written = 0;
  size_t N = reader->Available();
+ const size_t uncompressed_size = N;
  char ulength[Varint::kMax32];
  char* p = Varint::Encode32(ulength, N);
  writer->Append(ulength, p-ulength);
  written += (p - ulength);

- internal::WorkingMemory wmem;
- char* scratch = NULL;
- char* scratch_output = NULL;
+ internal::WorkingMemory wmem(N);

  while (N > 0) {
  // Get next block to compress (without copying if possible)
  size_t fragment_size;
  const char* fragment = reader->Peek(&fragment_size);
  assert(fragment_size != 0); // premature end of input
- const size_t num_to_read = min(N, kBlockSize);
+ const size_t num_to_read = std::min(N, kBlockSize);
  size_t bytes_read = fragment_size;

  size_t pending_advance = 0;
@@ -832,19 +1021,13 @@ size_t Compress(Source* reader, Sink* writer) {
  pending_advance = num_to_read;
  fragment_size = num_to_read;
  } else {
- // Read into scratch buffer
- if (scratch == NULL) {
- // If this is the last iteration, we want to allocate N bytes
- // of space, otherwise the max possible kBlockSize space.
- // num_to_read contains exactly the correct value
- scratch = new char[num_to_read];
- }
+ char* scratch = wmem.GetScratchInput();
  memcpy(scratch, fragment, bytes_read);
  reader->Skip(bytes_read);

  while (bytes_read < num_to_read) {
  fragment = reader->Peek(&fragment_size);
- size_t n = min<size_t>(fragment_size, num_to_read - bytes_read);
+ size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
  memcpy(scratch + bytes_read, fragment, n);
  bytes_read += n;
  reader->Skip(n);
@@ -864,16 +1047,13 @@ size_t Compress(Source* reader, Sink* writer) {

  // Need a scratch buffer for the output, in case the byte sink doesn't
  // have room for us directly.
- if (scratch_output == NULL) {
- scratch_output = new char[max_output];
- } else {
- // Since we encode kBlockSize regions followed by a region
- // which is <= kBlockSize in length, a previously allocated
- // scratch_output[] region is big enough for this iteration.
- }
- char* dest = writer->GetAppendBuffer(max_output, scratch_output);
- char* end = internal::CompressFragment(fragment, fragment_size,
- dest, table, table_size);
+
+ // Since we encode kBlockSize regions followed by a region
+ // which is <= kBlockSize in length, a previously allocated
+ // scratch_output[] region is big enough for this iteration.
+ char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
+ char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
+ table_size);
  writer->Append(dest, end - dest);
  written += (end - dest);

@@ -881,8 +1061,7 @@ size_t Compress(Source* reader, Sink* writer) {
  reader->Skip(pending_advance);
  }

- delete[] scratch;
- delete[] scratch_output;
+ Report("snappy_compress", written, uncompressed_size);

  return written;
  }
@@ -896,14 +1075,22 @@ size_t Compress(Source* reader, Sink* writer) {
  // Writer template argument to SnappyDecompressor::DecompressAllTags().
  class SnappyIOVecWriter {
  private:
+ // output_iov_end_ is set to iov + count and used to determine when
+ // the end of the iovs is reached.
+ const struct iovec* output_iov_end_;
+
+ #if !defined(NDEBUG)
  const struct iovec* output_iov_;
- const size_t output_iov_count_;
+ #endif // !defined(NDEBUG)
+
+ // Current iov that is being written into.
+ const struct iovec* curr_iov_;

- // We are currently writing into output_iov_[curr_iov_index_].
- size_t curr_iov_index_;
+ // Pointer to current iov's write location.
+ char* curr_iov_output_;

- // Bytes written to output_iov_[curr_iov_index_] so far.
- size_t curr_iov_written_;
+ // Remaining bytes to write into curr_iov_output.
+ size_t curr_iov_remaining_;

  // Total bytes decompressed into output_iov_ so far.
  size_t total_written_;
@@ -911,22 +1098,24 @@ class SnappyIOVecWriter {
  // Maximum number of bytes that will be decompressed into output_iov_.
  size_t output_limit_;

- inline char* GetIOVecPointer(size_t index, size_t offset) {
- return reinterpret_cast<char*>(output_iov_[index].iov_base) +
- offset;
+ static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
+ return reinterpret_cast<char*>(iov->iov_base) + offset;
  }

  public:
  // Does not take ownership of iov. iov must be valid during the
  // entire lifetime of the SnappyIOVecWriter.
  inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
- : output_iov_(iov),
- output_iov_count_(iov_count),
- curr_iov_index_(0),
- curr_iov_written_(0),
+ : output_iov_end_(iov + iov_count),
+ #if !defined(NDEBUG)
+ output_iov_(iov),
+ #endif // !defined(NDEBUG)
+ curr_iov_(iov),
+ curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
+ : nullptr),
+ curr_iov_remaining_(iov_count ? iov->iov_len : 0),
  total_written_(0),
- output_limit_(-1) {
- }
+ output_limit_(-1) {}

  inline void SetExpectedLength(size_t len) {
  output_limit_ = len;
@@ -941,23 +1130,25 @@ class SnappyIOVecWriter {
  return false;
  }

+ return AppendNoCheck(ip, len);
+ }
+
+ inline bool AppendNoCheck(const char* ip, size_t len) {
  while (len > 0) {
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
- if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
+ if (curr_iov_remaining_ == 0) {
  // This iovec is full. Go to the next one.
- if (curr_iov_index_ + 1 >= output_iov_count_) {
+ if (curr_iov_ + 1 >= output_iov_end_) {
  return false;
  }
- curr_iov_written_ = 0;
- ++curr_iov_index_;
+ ++curr_iov_;
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+ curr_iov_remaining_ = curr_iov_->iov_len;
  }

- const size_t to_write = std::min(
- len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
- memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- ip,
- to_write);
- curr_iov_written_ += to_write;
+ const size_t to_write = std::min(len, curr_iov_remaining_);
+ memcpy(curr_iov_output_, ip, to_write);
+ curr_iov_output_ += to_write;
+ curr_iov_remaining_ -= to_write;
  total_written_ += to_write;
  ip += to_write;
  len -= to_write;
@@ -969,11 +1160,11 @@ class SnappyIOVecWriter {
  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
  const size_t space_left = output_limit_ - total_written_;
  if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
- output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
+ curr_iov_remaining_ >= 16) {
  // Fast path, used for the majority (about 95%) of invocations.
- char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
- UnalignedCopy128(ip, ptr);
- curr_iov_written_ += len;
+ UnalignedCopy128(ip, curr_iov_output_);
+ curr_iov_output_ += len;
+ curr_iov_remaining_ -= len;
  total_written_ += len;
  return true;
  }
@@ -982,7 +1173,9 @@
  }

  inline bool AppendFromSelf(size_t offset, size_t len) {
- if (offset > total_written_ || offset == 0) {
+ // See SnappyArrayWriter::AppendFromSelf for an explanation of
+ // the "offset - 1u" trick.
+ if (offset - 1u >= total_written_) {
  return false;
  }
  const size_t space_left = output_limit_ - total_written_;
@@ -991,8 +1184,8 @@
  }

  // Locate the iovec from which we need to start the copy.
- size_t from_iov_index = curr_iov_index_;
- size_t from_iov_offset = curr_iov_written_;
+ const iovec* from_iov = curr_iov_;
+ size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
  while (offset > 0) {
  if (from_iov_offset >= offset) {
  from_iov_offset -= offset;
@@ -1000,47 +1193,47 @@
  }

  offset -= from_iov_offset;
- assert(from_iov_index > 0);
- --from_iov_index;
- from_iov_offset = output_iov_[from_iov_index].iov_len;
+ --from_iov;
+ #if !defined(NDEBUG)
+ assert(from_iov >= output_iov_);
+ #endif // !defined(NDEBUG)
+ from_iov_offset = from_iov->iov_len;
  }

  // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
  // the current iovec.
  while (len > 0) {
- assert(from_iov_index <= curr_iov_index_);
- if (from_iov_index != curr_iov_index_) {
- const size_t to_copy = std::min(
- output_iov_[from_iov_index].iov_len - from_iov_offset,
- len);
- Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
+ assert(from_iov <= curr_iov_);
+ if (from_iov != curr_iov_) {
+ const size_t to_copy =
+ std::min(from_iov->iov_len - from_iov_offset, len);
+ AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
  len -= to_copy;
  if (len > 0) {
- ++from_iov_index;
+ ++from_iov;
  from_iov_offset = 0;
  }
  } else {
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
- size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
- curr_iov_written_,
- len);
+ size_t to_copy = curr_iov_remaining_;
  if (to_copy == 0) {
  // This iovec is full. Go to the next one.
- if (curr_iov_index_ + 1 >= output_iov_count_) {
+ if (curr_iov_ + 1 >= output_iov_end_) {
  return false;
  }
- ++curr_iov_index_;
- curr_iov_written_ = 0;
+ ++curr_iov_;
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+ curr_iov_remaining_ = curr_iov_->iov_len;
  continue;
  }
  if (to_copy > len) {
  to_copy = len;
  }
- IncrementalCopySlow(
- GetIOVecPointer(from_iov_index, from_iov_offset),
- GetIOVecPointer(curr_iov_index_, curr_iov_written_),
- GetIOVecPointer(curr_iov_index_, curr_iov_written_) + to_copy);
- curr_iov_written_ += to_copy;
+
+ IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
+ curr_iov_output_, curr_iov_output_ + to_copy,
+ curr_iov_output_ + curr_iov_remaining_);
+ curr_iov_output_ += to_copy;
+ curr_iov_remaining_ -= to_copy;
  from_iov_offset += to_copy;
  total_written_ += to_copy;
  len -= to_copy;
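
The SnappyIOVecWriter hunks above replace index-and-offset bookkeeping with direct pointers into the current iovec; this class backs snappy's scatter-output decompression entry point. A usage sketch of that entry point (not from the diff; POSIX-only because of <sys/uio.h>, and the buffer sizes are illustrative):

    #include <sys/uio.h>
    #include <cassert>
    #include <string>
    #include "snappy.h"

    int main() {
      const std::string input(1000, 'x');
      std::string compressed;
      snappy::Compress(input.data(), input.size(), &compressed);

      char part1[256], part2[1024];                       // scattered output buffers
      struct iovec iov[2] = {{part1, sizeof(part1)}, {part2, sizeof(part2)}};
      // Fills the iovecs in order: 256 bytes into part1, the remaining 744 into part2.
      assert(snappy::RawUncompressToIOVec(compressed.data(), compressed.size(), iov, 2));
      assert(std::string(part1, 256) + std::string(part2, 744) == input);
      return 0;
    }
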
@@ -1149,7 +1342,7 @@ bool RawUncompress(Source* compressed, char* uncompressed) {
  return InternalUncompress(compressed, &output);
  }

- bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
+ bool Uncompress(const char* compressed, size_t n, std::string* uncompressed) {
  size_t ulength;
  if (!GetUncompressedLength(compressed, n, &ulength)) {
  return false;
@@ -1217,7 +1410,8 @@ void RawCompress(const char* input,
  *compressed_length = (writer.CurrentDestination() - compressed);
  }

- size_t Compress(const char* input, size_t input_length, string* compressed) {
+ size_t Compress(const char* input, size_t input_length,
+ std::string* compressed) {
  // Pre-grow the buffer to the max length of the compressed output
  STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
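
The two hunks above change the string overloads to take std::string explicitly instead of the old unqualified `string` typedef; call sites are otherwise unaffected. A minimal round-trip sketch through these overloads (not from the diff):

    #include <cassert>
    #include <string>
    #include "snappy.h"

    int main() {
      const std::string input = "snappy snappy snappy snappy snappy";
      std::string compressed, restored;
      snappy::Compress(input.data(), input.size(), &compressed);
      assert(snappy::Uncompress(compressed.data(), compressed.size(), &restored));
      assert(restored == input);
      return 0;
    }
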

@@ -1313,7 +1507,8 @@ class SnappyScatteredWriter {
  char* const op_end = op_ptr_ + len;
  // See SnappyArrayWriter::AppendFromSelf for an explanation of
  // the "offset - 1u" trick.
- if (PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ && op_end <= op_limit_)) {
+ if (SNAPPY_PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ &&
+ op_end <= op_limit_)) {
  // Fast path: src and dst in current block.
  op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_);
  return true;
@@ -1344,7 +1539,7 @@ bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) {
  }

  // Make new block
- size_t bsize = min<size_t>(kBlockSize, expected_ - full_size_);
+ size_t bsize = std::min<size_t>(kBlockSize, expected_ - full_size_);
  op_base_ = allocator_.Allocate(bsize);
  op_ptr_ = op_base_;
  op_limit_ = op_base_ + bsize;
@@ -1401,7 +1596,7 @@ class SnappySinkAllocator {
  size_t size_written = 0;
  size_t block_size;
  for (int i = 0; i < blocks_.size(); ++i) {
- block_size = min<size_t>(blocks_[i].size, size - size_written);
+ block_size = std::min<size_t>(blocks_[i].size, size - size_written);
  dest_->AppendAndTakeOwnership(blocks_[i].data, block_size,
  &SnappySinkAllocator::Deleter, NULL);
  size_written += block_size;
@@ -1446,19 +1641,21 @@ bool Uncompress(Source* compressed, Sink* uncompressed) {
  char* buf = uncompressed->GetAppendBufferVariable(
  1, uncompressed_len, &c, 1, &allocated_size);

+ const size_t compressed_len = compressed->Available();
  // If we can get a flat buffer, then use it, otherwise do block by block
  // uncompression
  if (allocated_size >= uncompressed_len) {
  SnappyArrayWriter writer(buf);
- bool result = InternalUncompressAllTags(
- &decompressor, &writer, uncompressed_len);
+ bool result = InternalUncompressAllTags(&decompressor, &writer,
+ compressed_len, uncompressed_len);
  uncompressed->Append(buf, writer.Produced());
  return result;
  } else {
  SnappySinkAllocator allocator(uncompressed);
  SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
- return InternalUncompressAllTags(&decompressor, &writer, uncompressed_len);
+ return InternalUncompressAllTags(&decompressor, &writer, compressed_len,
+ uncompressed_len);
  }
  }

- } // end namespace snappy
+ } // namespace snappy