snappy 0.0.14-java → 0.2.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +5 -5
  2. data/.github/workflows/main.yml +34 -0
  3. data/.github/workflows/publish.yml +34 -0
  4. data/Gemfile +4 -0
  5. data/README.md +28 -4
  6. data/Rakefile +32 -29
  7. data/ext/api.c +6 -1
  8. data/ext/extconf.rb +21 -24
  9. data/lib/snappy.rb +6 -4
  10. data/lib/snappy/hadoop.rb +22 -0
  11. data/lib/snappy/hadoop/reader.rb +62 -0
  12. data/lib/snappy/hadoop/writer.rb +51 -0
  13. data/lib/snappy/reader.rb +19 -11
  14. data/lib/snappy/shim.rb +30 -0
  15. data/lib/snappy/version.rb +3 -1
  16. data/lib/snappy/writer.rb +8 -9
  17. data/snappy.gemspec +17 -37
  18. data/test/hadoop/snappy_hadoop_reader_test.rb +115 -0
  19. data/test/hadoop/snappy_hadoop_writer_test.rb +48 -0
  20. data/test/snappy_hadoop_test.rb +26 -0
  21. data/test/snappy_reader_test.rb +148 -0
  22. data/test/snappy_test.rb +95 -0
  23. data/test/snappy_writer_test.rb +55 -0
  24. data/test/test_helper.rb +7 -0
  25. data/vendor/snappy/CMakeLists.txt +297 -0
  26. data/vendor/snappy/CONTRIBUTING.md +26 -0
  27. data/vendor/snappy/COPYING +1 -1
  28. data/vendor/snappy/NEWS +60 -0
  29. data/vendor/snappy/{README → README.md} +29 -16
  30. data/vendor/snappy/cmake/SnappyConfig.cmake.in +33 -0
  31. data/vendor/snappy/cmake/config.h.in +62 -0
  32. data/vendor/snappy/docs/README.md +72 -0
  33. data/vendor/snappy/snappy-c.h +3 -3
  34. data/vendor/snappy/snappy-internal.h +113 -32
  35. data/vendor/snappy/snappy-sinksource.cc +33 -0
  36. data/vendor/snappy/snappy-sinksource.h +51 -6
  37. data/vendor/snappy/snappy-stubs-internal.cc +1 -1
  38. data/vendor/snappy/snappy-stubs-internal.h +160 -45
  39. data/vendor/snappy/snappy-stubs-public.h.in +23 -47
  40. data/vendor/snappy/snappy-test.cc +31 -24
  41. data/vendor/snappy/snappy-test.h +46 -103
  42. data/vendor/snappy/snappy.cc +786 -431
  43. data/vendor/snappy/snappy.h +37 -14
  44. data/vendor/snappy/snappy_compress_fuzzer.cc +59 -0
  45. data/vendor/snappy/snappy_uncompress_fuzzer.cc +57 -0
  46. data/vendor/snappy/snappy_unittest.cc +441 -290
  47. metadata +35 -75
  48. data/.travis.yml +0 -4
  49. data/test/test-snappy-reader.rb +0 -129
  50. data/test/test-snappy-writer.rb +0 -55
  51. data/test/test-snappy.rb +0 -58
  52. data/vendor/snappy/ChangeLog +0 -1916
  53. data/vendor/snappy/Makefile.am +0 -23
  54. data/vendor/snappy/autogen.sh +0 -7
  55. data/vendor/snappy/configure.ac +0 -133
  56. data/vendor/snappy/m4/gtest.m4 +0 -74
  57. data/vendor/snappy/testdata/alice29.txt +0 -3609
  58. data/vendor/snappy/testdata/asyoulik.txt +0 -4122
  59. data/vendor/snappy/testdata/baddata1.snappy +0 -0
  60. data/vendor/snappy/testdata/baddata2.snappy +0 -0
  61. data/vendor/snappy/testdata/baddata3.snappy +0 -0
  62. data/vendor/snappy/testdata/fireworks.jpeg +0 -0
  63. data/vendor/snappy/testdata/geo.protodata +0 -0
  64. data/vendor/snappy/testdata/html +0 -1
  65. data/vendor/snappy/testdata/html_x_4 +0 -1
  66. data/vendor/snappy/testdata/kppkn.gtb +0 -0
  67. data/vendor/snappy/testdata/lcet10.txt +0 -7519
  68. data/vendor/snappy/testdata/paper-100k.pdf +2 -600
  69. data/vendor/snappy/testdata/plrabn12.txt +0 -10699
  70. data/vendor/snappy/testdata/urls.10K +0 -10000
@@ -30,15 +30,58 @@
30
30
  #include "snappy-internal.h"
31
31
  #include "snappy-sinksource.h"
32
32
 
33
+ #if !defined(SNAPPY_HAVE_SSSE3)
34
+ // __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD
35
+ // support between SSE2 and AVX (so SSSE3 instructions require AVX support), and
36
+ // defines __AVX__ when AVX support is available.
37
+ #if defined(__SSSE3__) || defined(__AVX__)
38
+ #define SNAPPY_HAVE_SSSE3 1
39
+ #else
40
+ #define SNAPPY_HAVE_SSSE3 0
41
+ #endif
42
+ #endif // !defined(SNAPPY_HAVE_SSSE3)
43
+
44
+ #if !defined(SNAPPY_HAVE_BMI2)
45
+ // __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
46
+ // specifically, but it does define __AVX2__ when AVX2 support is available.
47
+ // Fortunately, AVX2 was introduced in Haswell, just like BMI2.
48
+ //
49
+ // BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
50
+ // GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
51
+ // case issuing BMI2 instructions results in a compiler error.
52
+ #if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
53
+ #define SNAPPY_HAVE_BMI2 1
54
+ #else
55
+ #define SNAPPY_HAVE_BMI2 0
56
+ #endif
57
+ #endif // !defined(SNAPPY_HAVE_BMI2)
58
+
59
+ #if SNAPPY_HAVE_SSSE3
60
+ // Please do not replace with <x86intrin.h>. or with headers that assume more
61
+ // advanced SSE versions without checking with all the OWNERS.
62
+ #include <tmmintrin.h>
63
+ #endif
64
+
65
+ #if SNAPPY_HAVE_BMI2
66
+ // Please do not replace with <x86intrin.h>. or with headers that assume more
67
+ // advanced SSE versions without checking with all the OWNERS.
68
+ #include <immintrin.h>
69
+ #endif
70
+
33
71
  #include <stdio.h>
34
72
 
35
73
  #include <algorithm>
36
74
  #include <string>
37
75
  #include <vector>
38
76
 
39
-
40
77
  namespace snappy {
41
78
 
79
+ using internal::COPY_1_BYTE_OFFSET;
80
+ using internal::COPY_2_BYTE_OFFSET;
81
+ using internal::LITERAL;
82
+ using internal::char_table;
83
+ using internal::kMaximumTagLength;
84
+
42
85
  // Any hash function will produce a valid compressed bitstream, but a good
43
86
  // hash function reduces the number of collisions and thus yields better
44
87
  // compression for compressible input, and more speed for incompressible
@@ -76,161 +119,313 @@ size_t MaxCompressedLength(size_t source_len) {
76
119
  return 32 + source_len + source_len/6;
77
120
  }
78
121
 
79
- enum {
80
- LITERAL = 0,
81
- COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode
82
- COPY_2_BYTE_OFFSET = 2,
83
- COPY_4_BYTE_OFFSET = 3
84
- };
85
- static const int kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the actual offset.
86
-
87
- // Copy "len" bytes from "src" to "op", one byte at a time. Used for
88
- // handling COPY operations where the input and output regions may
89
- // overlap. For example, suppose:
90
- // src == "ab"
91
- // op == src + 2
92
- // len == 20
93
- // After IncrementalCopy(src, op, len), the result will have
94
- // eleven copies of "ab"
122
+ namespace {
123
+
124
+ void UnalignedCopy64(const void* src, void* dst) {
125
+ char tmp[8];
126
+ memcpy(tmp, src, 8);
127
+ memcpy(dst, tmp, 8);
128
+ }
129
+
130
+ void UnalignedCopy128(const void* src, void* dst) {
131
+ // memcpy gets vectorized when the appropriate compiler options are used.
132
+ // For example, x86 compilers targeting SSE2+ will optimize to an SSE2 load
133
+ // and store.
134
+ char tmp[16];
135
+ memcpy(tmp, src, 16);
136
+ memcpy(dst, tmp, 16);
137
+ }
138
+
139
+ // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
140
+ // for handling COPY operations where the input and output regions may overlap.
141
+ // For example, suppose:
142
+ // src == "ab"
143
+ // op == src + 2
144
+ // op_limit == op + 20
145
+ // After IncrementalCopySlow(src, op, op_limit), the result will have eleven
146
+ // copies of "ab"
95
147
  // ababababababababababab
96
- // Note that this does not match the semantics of either memcpy()
97
- // or memmove().
98
- static inline void IncrementalCopy(const char* src, char* op, ssize_t len) {
99
- assert(len > 0);
100
- do {
148
+ // Note that this does not match the semantics of either memcpy() or memmove().
149
+ inline char* IncrementalCopySlow(const char* src, char* op,
150
+ char* const op_limit) {
151
+ // TODO: Remove pragma when LLVM is aware this
152
+ // function is only called in cold regions and when cold regions don't get
153
+ // vectorized or unrolled.
154
+ #ifdef __clang__
155
+ #pragma clang loop unroll(disable)
156
+ #endif
157
+ while (op < op_limit) {
101
158
  *op++ = *src++;
102
- } while (--len > 0);
159
+ }
160
+ return op_limit;
103
161
  }
104
162
 
105
- // Equivalent to IncrementalCopy except that it can write up to ten extra
106
- // bytes after the end of the copy, and that it is faster.
107
- //
108
- // The main part of this loop is a simple copy of eight bytes at a time until
109
- // we've copied (at least) the requested amount of bytes. However, if op and
110
- // src are less than eight bytes apart (indicating a repeating pattern of
111
- // length < 8), we first need to expand the pattern in order to get the correct
112
- // results. For instance, if the buffer looks like this, with the eight-byte
113
- // <src> and <op> patterns marked as intervals:
114
- //
115
- // abxxxxxxxxxxxx
116
- // [------] src
117
- // [------] op
118
- //
119
- // a single eight-byte copy from <src> to <op> will repeat the pattern once,
120
- // after which we can move <op> two bytes without moving <src>:
121
- //
122
- // ababxxxxxxxxxx
123
- // [------] src
124
- // [------] op
125
- //
126
- // and repeat the exercise until the two no longer overlap.
127
- //
128
- // This allows us to do very well in the special case of one single byte
129
- // repeated many times, without taking a big hit for more general cases.
130
- //
131
- // The worst case of extra writing past the end of the match occurs when
132
- // op - src == 1 and len == 1; the last copy will read from byte positions
133
- // [0..7] and write to [4..11], whereas it was only supposed to write to
134
- // position 1. Thus, ten excess bytes.
163
+ #if SNAPPY_HAVE_SSSE3
164
+
165
+ // This is a table of shuffle control masks that can be used as the source
166
+ // operand for PSHUFB to permute the contents of the destination XMM register
167
+ // into a repeating byte pattern.
168
+ alignas(16) const char pshufb_fill_patterns[7][16] = {
169
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
170
+ {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
171
+ {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
172
+ {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
173
+ {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
174
+ {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
175
+ {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
176
+ };
135
177
 
136
- namespace {
178
+ #endif // SNAPPY_HAVE_SSSE3
179
+
180
+ // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than
181
+ // IncrementalCopySlow. buf_limit is the address past the end of the writable
182
+ // region of the buffer.
183
+ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
184
+ char* const buf_limit) {
185
+ // Terminology:
186
+ //
187
+ // slop = buf_limit - op
188
+ // pat = op - src
189
+ // len = limit - op
190
+ assert(src < op);
191
+ assert(op <= op_limit);
192
+ assert(op_limit <= buf_limit);
193
+ // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
194
+ // to optimize this function but we have to also handle other cases in case
195
+ // the input does not satisfy these conditions.
196
+
197
+ size_t pattern_size = op - src;
198
+ // The cases are split into different branches to allow the branch predictor,
199
+ // FDO, and static prediction hints to work better. For each input we list the
200
+ // ratio of invocations that match each condition.
201
+ //
202
+ // input slop < 16 pat < 8 len > 16
203
+ // ------------------------------------------
204
+ // html|html4|cp 0% 1.01% 27.73%
205
+ // urls 0% 0.88% 14.79%
206
+ // jpg 0% 64.29% 7.14%
207
+ // pdf 0% 2.56% 58.06%
208
+ // txt[1-4] 0% 0.23% 0.97%
209
+ // pb 0% 0.96% 13.88%
210
+ // bin 0.01% 22.27% 41.17%
211
+ //
212
+ // It is very rare that we don't have enough slop for doing block copies. It
213
+ // is also rare that we need to expand a pattern. Small patterns are common
214
+ // for incompressible formats and for those we are plenty fast already.
215
+ // Lengths are normally not greater than 16 but they vary depending on the
216
+ // input. In general if we always predict len <= 16 it would be an ok
217
+ // prediction.
218
+ //
219
+ // In order to be fast we want a pattern >= 8 bytes and an unrolled loop
220
+ // copying 2x 8 bytes at a time.
221
+
222
+ // Handle the uncommon case where pattern is less than 8 bytes.
223
+ if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
224
+ #if SNAPPY_HAVE_SSSE3
225
+ // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
226
+ // to permute the register's contents in-place into a repeating sequence of
227
+ // the first "pattern_size" bytes.
228
+ // For example, suppose:
229
+ // src == "abc"
230
+ // op == op + 3
231
+ // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
232
+ // followed by one byte of slop: abcabcabcabcabca.
233
+ //
234
+ // The non-SSE fallback implementation suffers from store-forwarding stalls
235
+ // because its loads and stores partly overlap. By expanding the pattern
236
+ // in-place, we avoid the penalty.
237
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
238
+ const __m128i shuffle_mask = _mm_load_si128(
239
+ reinterpret_cast<const __m128i*>(pshufb_fill_patterns)
240
+ + pattern_size - 1);
241
+ const __m128i pattern = _mm_shuffle_epi8(
242
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
243
+ // Uninitialized bytes are masked out by the shuffle mask.
244
+ // TODO: remove annotation and macro defs once MSan is fixed.
245
+ SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
246
+ pattern_size *= 16 / pattern_size;
247
+ char* op_end = std::min(op_limit, buf_limit - 15);
248
+ while (op < op_end) {
249
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
250
+ op += pattern_size;
251
+ }
252
+ if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
253
+ }
254
+ return IncrementalCopySlow(src, op, op_limit);
255
+ #else // !SNAPPY_HAVE_SSSE3
256
+ // If plenty of buffer space remains, expand the pattern to at least 8
257
+ // bytes. The way the following loop is written, we need 8 bytes of buffer
258
+ // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
259
+ // bytes if pattern_size is 2. Precisely encoding that is probably not
260
+ // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
261
+ // (because 11 are required in the worst case).
262
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
263
+ while (pattern_size < 8) {
264
+ UnalignedCopy64(src, op);
265
+ op += pattern_size;
266
+ pattern_size *= 2;
267
+ }
268
+ if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
269
+ } else {
270
+ return IncrementalCopySlow(src, op, op_limit);
271
+ }
272
+ #endif // SNAPPY_HAVE_SSSE3
273
+ }
274
+ assert(pattern_size >= 8);
275
+
276
+ // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single
277
+ // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
278
+ // because expanding the pattern to at least 8 bytes guarantees that
279
+ // op - src >= 8.
280
+ //
281
+ // Typically, the op_limit is the gating factor so try to simplify the loop
282
+ // based on that.
283
+ if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
284
+ // There is at least one, and at most four 16-byte blocks. Writing four
285
+ // conditionals instead of a loop allows FDO to layout the code with respect
286
+ // to the actual probabilities of each length.
287
+ // TODO: Replace with loop with trip count hint.
288
+ UnalignedCopy64(src, op);
289
+ UnalignedCopy64(src + 8, op + 8);
137
290
 
138
- const int kMaxIncrementCopyOverflow = 10;
291
+ if (op + 16 < op_limit) {
292
+ UnalignedCopy64(src + 16, op + 16);
293
+ UnalignedCopy64(src + 24, op + 24);
294
+ }
295
+ if (op + 32 < op_limit) {
296
+ UnalignedCopy64(src + 32, op + 32);
297
+ UnalignedCopy64(src + 40, op + 40);
298
+ }
299
+ if (op + 48 < op_limit) {
300
+ UnalignedCopy64(src + 48, op + 48);
301
+ UnalignedCopy64(src + 56, op + 56);
302
+ }
303
+ return op_limit;
304
+ }
139
305
 
140
- inline void IncrementalCopyFastPath(const char* src, char* op, ssize_t len) {
141
- while (op - src < 8) {
306
+ // Fall back to doing as much as we can with the available slop in the
307
+ // buffer. This code path is relatively cold however so we save code size by
308
+ // avoiding unrolling and vectorizing.
309
+ //
310
+ // TODO: Remove pragma when when cold regions don't get vectorized
311
+ // or unrolled.
312
+ #ifdef __clang__
313
+ #pragma clang loop unroll(disable)
314
+ #endif
315
+ for (char *op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
142
316
  UnalignedCopy64(src, op);
143
- len -= op - src;
144
- op += op - src;
317
+ UnalignedCopy64(src + 8, op + 8);
145
318
  }
146
- while (len > 0) {
319
+ if (op >= op_limit)
320
+ return op_limit;
321
+
322
+ // We only take this branch if we didn't have enough slop and we can do a
323
+ // single 8 byte copy.
324
+ if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
147
325
  UnalignedCopy64(src, op);
148
326
  src += 8;
149
327
  op += 8;
150
- len -= 8;
151
328
  }
329
+ return IncrementalCopySlow(src, op, op_limit);
152
330
  }
153
331
 
154
332
  } // namespace
155
333
 
334
+ template <bool allow_fast_path>
156
335
  static inline char* EmitLiteral(char* op,
157
336
  const char* literal,
158
- int len,
159
- bool allow_fast_path) {
160
- int n = len - 1; // Zero-length literals are disallowed
161
- if (n < 60) {
337
+ int len) {
338
+ // The vast majority of copies are below 16 bytes, for which a
339
+ // call to memcpy is overkill. This fast path can sometimes
340
+ // copy up to 15 bytes too much, but that is okay in the
341
+ // main loop, since we have a bit to go on for both sides:
342
+ //
343
+ // - The input will always have kInputMarginBytes = 15 extra
344
+ // available bytes, as long as we're in the main loop, and
345
+ // if not, allow_fast_path = false.
346
+ // - The output will always have 32 spare bytes (see
347
+ // MaxCompressedLength).
348
+ assert(len > 0); // Zero-length literals are disallowed
349
+ int n = len - 1;
350
+ if (allow_fast_path && len <= 16) {
162
351
  // Fits in tag byte
163
352
  *op++ = LITERAL | (n << 2);
164
353
 
165
- // The vast majority of copies are below 16 bytes, for which a
166
- // call to memcpy is overkill. This fast path can sometimes
167
- // copy up to 15 bytes too much, but that is okay in the
168
- // main loop, since we have a bit to go on for both sides:
169
- //
170
- // - The input will always have kInputMarginBytes = 15 extra
171
- // available bytes, as long as we're in the main loop, and
172
- // if not, allow_fast_path = false.
173
- // - The output will always have 32 spare bytes (see
174
- // MaxCompressedLength).
175
- if (allow_fast_path && len <= 16) {
176
- UnalignedCopy64(literal, op);
177
- UnalignedCopy64(literal + 8, op + 8);
178
- return op + len;
179
- }
354
+ UnalignedCopy128(literal, op);
355
+ return op + len;
356
+ }
357
+
358
+ if (n < 60) {
359
+ // Fits in tag byte
360
+ *op++ = LITERAL | (n << 2);
180
361
  } else {
181
- // Encode in upcoming bytes
182
- char* base = op;
183
- int count = 0;
184
- op++;
185
- while (n > 0) {
186
- *op++ = n & 0xff;
187
- n >>= 8;
188
- count++;
189
- }
362
+ int count = (Bits::Log2Floor(n) >> 3) + 1;
190
363
  assert(count >= 1);
191
364
  assert(count <= 4);
192
- *base = LITERAL | ((59+count) << 2);
365
+ *op++ = LITERAL | ((59 + count) << 2);
366
+ // Encode in upcoming bytes.
367
+ // Write 4 bytes, though we may care about only 1 of them. The output buffer
368
+ // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds
369
+ // here and there is a memcpy of size 'len' below.
370
+ LittleEndian::Store32(op, n);
371
+ op += count;
193
372
  }
194
373
  memcpy(op, literal, len);
195
374
  return op + len;
196
375
  }
197
376
 
198
- static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) {
377
+ template <bool len_less_than_12>
378
+ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
199
379
  assert(len <= 64);
200
380
  assert(len >= 4);
201
381
  assert(offset < 65536);
382
+ assert(len_less_than_12 == (len < 12));
202
383
 
203
- if ((len < 12) && (offset < 2048)) {
204
- size_t len_minus_4 = len - 4;
205
- assert(len_minus_4 < 8); // Must fit in 3 bits
206
- *op++ = COPY_1_BYTE_OFFSET + ((len_minus_4) << 2) + ((offset >> 8) << 5);
384
+ if (len_less_than_12 && SNAPPY_PREDICT_TRUE(offset < 2048)) {
385
+ // offset fits in 11 bits. The 3 highest go in the top of the first byte,
386
+ // and the rest go in the second byte.
387
+ *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0);
207
388
  *op++ = offset & 0xff;
208
389
  } else {
209
- *op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2);
210
- LittleEndian::Store16(op, offset);
211
- op += 2;
390
+ // Write 4 bytes, though we only care about 3 of them. The output buffer
391
+ // is required to have some slack, so the extra byte won't overrun it.
392
+ uint32 u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8);
393
+ LittleEndian::Store32(op, u);
394
+ op += 3;
212
395
  }
213
396
  return op;
214
397
  }
215
398
 
216
- static inline char* EmitCopy(char* op, size_t offset, int len) {
217
- // Emit 64 byte copies but make sure to keep at least four bytes reserved
218
- while (len >= 68) {
219
- op = EmitCopyLessThan64(op, offset, 64);
220
- len -= 64;
221
- }
399
+ template <bool len_less_than_12>
400
+ static inline char* EmitCopy(char* op, size_t offset, size_t len) {
401
+ assert(len_less_than_12 == (len < 12));
402
+ if (len_less_than_12) {
403
+ return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
404
+ } else {
405
+ // A special case for len <= 64 might help, but so far measurements suggest
406
+ // it's in the noise.
222
407
 
223
- // Emit an extra 60 byte copy if have too much data to fit in one copy
224
- if (len > 64) {
225
- op = EmitCopyLessThan64(op, offset, 60);
226
- len -= 60;
227
- }
408
+ // Emit 64 byte copies but make sure to keep at least four bytes reserved.
409
+ while (SNAPPY_PREDICT_FALSE(len >= 68)) {
410
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
411
+ len -= 64;
412
+ }
228
413
 
229
- // Emit remainder
230
- op = EmitCopyLessThan64(op, offset, len);
231
- return op;
232
- }
414
+ // One or two copies will now finish the job.
415
+ if (len > 64) {
416
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
417
+ len -= 60;
418
+ }
233
419
 
420
+ // Emit remainder.
421
+ if (len < 12) {
422
+ op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
423
+ } else {
424
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
425
+ }
426
+ return op;
427
+ }
428
+ }
234
429
 
235
430
  bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
236
431
  uint32 v = 0;
@@ -243,31 +438,45 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
243
438
  }
244
439
  }
245
440
 
246
- namespace internal {
247
- uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
248
- // Use smaller hash table when input.size() is smaller, since we
249
- // fill the table, incurring O(hash table size) overhead for
250
- // compression, and if the input is short, we won't need that
251
- // many hash table entries anyway.
252
- assert(kMaxHashTableSize >= 256);
253
- size_t htsize = 256;
254
- while (htsize < kMaxHashTableSize && htsize < input_size) {
255
- htsize <<= 1;
441
+ namespace {
442
+ uint32 CalculateTableSize(uint32 input_size) {
443
+ static_assert(
444
+ kMaxHashTableSize >= kMinHashTableSize,
445
+ "kMaxHashTableSize should be greater or equal to kMinHashTableSize.");
446
+ if (input_size > kMaxHashTableSize) {
447
+ return kMaxHashTableSize;
256
448
  }
257
-
258
- uint16* table;
259
- if (htsize <= ARRAYSIZE(small_table_)) {
260
- table = small_table_;
261
- } else {
262
- if (large_table_ == NULL) {
263
- large_table_ = new uint16[kMaxHashTableSize];
264
- }
265
- table = large_table_;
449
+ if (input_size < kMinHashTableSize) {
450
+ return kMinHashTableSize;
266
451
  }
452
+ // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
453
+ // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
454
+ return 2u << Bits::Log2Floor(input_size - 1);
455
+ }
456
+ } // namespace
267
457
 
458
+ namespace internal {
459
+ WorkingMemory::WorkingMemory(size_t input_size) {
460
+ const size_t max_fragment_size = std::min(input_size, kBlockSize);
461
+ const size_t table_size = CalculateTableSize(max_fragment_size);
462
+ size_ = table_size * sizeof(*table_) + max_fragment_size +
463
+ MaxCompressedLength(max_fragment_size);
464
+ mem_ = std::allocator<char>().allocate(size_);
465
+ table_ = reinterpret_cast<uint16*>(mem_);
466
+ input_ = mem_ + table_size * sizeof(*table_);
467
+ output_ = input_ + max_fragment_size;
468
+ }
469
+
470
+ WorkingMemory::~WorkingMemory() {
471
+ std::allocator<char>().deallocate(mem_, size_);
472
+ }
473
+
474
+ uint16* WorkingMemory::GetHashTable(size_t fragment_size,
475
+ int* table_size) const {
476
+ const size_t htsize = CalculateTableSize(fragment_size);
477
+ memset(table_, 0, htsize * sizeof(*table_));
268
478
  *table_size = htsize;
269
- memset(table, 0, htsize * sizeof(*table));
270
- return table;
479
+ return table_;
271
480
  }
272
481
  } // end namespace internal
273
482
 
@@ -334,7 +543,7 @@ char* CompressFragment(const char* input,
334
543
  // "ip" is the input pointer, and "op" is the output pointer.
335
544
  const char* ip = input;
336
545
  assert(input_size <= kBlockSize);
337
- assert((table_size & (table_size - 1)) == 0); // table must be power of two
546
+ assert((table_size & (table_size - 1)) == 0); // table must be power of two
338
547
  const int shift = 32 - Bits::Log2Floor(table_size);
339
548
  assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
340
549
  const char* ip_end = input + input_size;
@@ -344,7 +553,7 @@ char* CompressFragment(const char* input,
344
553
  const char* next_emit = ip;
345
554
 
346
555
  const size_t kInputMarginBytes = 15;
347
- if (PREDICT_TRUE(input_size >= kInputMarginBytes)) {
556
+ if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
348
557
  const char* ip_limit = input + input_size - kInputMarginBytes;
349
558
 
350
559
  for (uint32 next_hash = Hash(++ip, shift); ; ) {
@@ -364,9 +573,9 @@ char* CompressFragment(const char* input,
364
573
  //
365
574
  // Heuristic match skipping: If 32 bytes are scanned with no matches
366
575
  // found, start looking only at every other byte. If 32 more bytes are
367
- // scanned, look at every third byte, etc.. When a match is found,
368
- // immediately go back to looking at every byte. This is a small loss
369
- // (~5% performance, ~0.1% density) for compressible data due to more
576
+ // scanned (or skipped), look at every third byte, etc.. When a match is
577
+ // found, immediately go back to looking at every byte. This is a small
578
+ // loss (~5% performance, ~0.1% density) for compressible data due to more
370
579
  // bookkeeping, but for non-compressible data (such as JPEG) it's a huge
371
580
  // win since the compressor quickly "realizes" the data is incompressible
372
581
  // and doesn't bother looking for matches everywhere.
@@ -382,9 +591,10 @@ char* CompressFragment(const char* input,
382
591
  ip = next_ip;
383
592
  uint32 hash = next_hash;
384
593
  assert(hash == Hash(ip, shift));
385
- uint32 bytes_between_hash_lookups = skip++ >> 5;
594
+ uint32 bytes_between_hash_lookups = skip >> 5;
595
+ skip += bytes_between_hash_lookups;
386
596
  next_ip = ip + bytes_between_hash_lookups;
387
- if (PREDICT_FALSE(next_ip > ip_limit)) {
597
+ if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
388
598
  goto emit_remainder;
389
599
  }
390
600
  next_hash = Hash(next_ip, shift);
@@ -393,14 +603,14 @@ char* CompressFragment(const char* input,
393
603
  assert(candidate < ip);
394
604
 
395
605
  table[hash] = ip - base_ip;
396
- } while (PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
397
- UNALIGNED_LOAD32(candidate)));
606
+ } while (SNAPPY_PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
607
+ UNALIGNED_LOAD32(candidate)));
398
608
 
399
609
  // Step 2: A 4-byte match has been found. We'll later see if more
400
610
  // than 4 bytes match. But, prior to the match, input
401
611
  // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
402
612
  assert(next_emit + 16 <= ip_end);
403
- op = EmitLiteral(op, next_emit, ip - next_emit, true);
613
+ op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);
404
614
 
405
615
  // Step 3: Call EmitCopy, and then see if another EmitCopy could
406
616
  // be our next move. Repeat until we find no match for the
@@ -417,19 +627,25 @@ char* CompressFragment(const char* input,
417
627
  // We have a 4-byte match at ip, and no need to emit any
418
628
  // "literal bytes" prior to ip.
419
629
  const char* base = ip;
420
- int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
630
+ std::pair<size_t, bool> p =
631
+ FindMatchLength(candidate + 4, ip + 4, ip_end);
632
+ size_t matched = 4 + p.first;
421
633
  ip += matched;
422
634
  size_t offset = base - candidate;
423
635
  assert(0 == memcmp(base, candidate, matched));
424
- op = EmitCopy(op, offset, matched);
425
- // We could immediately start working at ip now, but to improve
426
- // compression we first update table[Hash(ip - 1, ...)].
427
- const char* insert_tail = ip - 1;
636
+ if (p.second) {
637
+ op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
638
+ } else {
639
+ op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
640
+ }
428
641
  next_emit = ip;
429
- if (PREDICT_FALSE(ip >= ip_limit)) {
642
+ if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
430
643
  goto emit_remainder;
431
644
  }
432
- input_bytes = GetEightBytesAt(insert_tail);
645
+ // We are now looking for a 4-byte match again. We read
646
+ // table[Hash(ip, shift)] for that. To improve compression,
647
+ // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
648
+ input_bytes = GetEightBytesAt(ip - 1);
433
649
  uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
434
650
  table[prev_hash] = ip - base_ip - 1;
435
651
  uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
@@ -446,13 +662,18 @@ char* CompressFragment(const char* input,
446
662
  emit_remainder:
447
663
  // Emit the remaining bytes as a literal
448
664
  if (next_emit < ip_end) {
449
- op = EmitLiteral(op, next_emit, ip_end - next_emit, false);
665
+ op = EmitLiteral</*allow_fast_path=*/false>(op, next_emit,
666
+ ip_end - next_emit);
450
667
  }
451
668
 
452
669
  return op;
453
670
  }
454
671
  } // end namespace internal
455
672
 
673
+ // Called back at avery compression call to trace parameters and sizes.
674
+ static inline void Report(const char *algorithm, size_t compressed_size,
675
+ size_t uncompressed_size) {}
676
+
456
677
  // Signature of output types needed by decompression code.
457
678
  // The decompression code is templatized on a type that obeys this
458
679
  // signature so that we do not pay virtual function call overhead in
@@ -493,162 +714,28 @@ char* CompressFragment(const char* input,
493
714
  // bool TryFastAppend(const char* ip, size_t available, size_t length);
494
715
  // };
495
716
 
496
- // -----------------------------------------------------------------------
497
- // Lookup table for decompression code. Generated by ComputeTable() below.
498
- // -----------------------------------------------------------------------
499
-
500
- // Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
501
- static const uint32 wordmask[] = {
502
- 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
503
- };
504
-
505
- // Data stored per entry in lookup table:
506
- // Range Bits-used Description
507
- // ------------------------------------
508
- // 1..64 0..7 Literal/copy length encoded in opcode byte
509
- // 0..7 8..10 Copy offset encoded in opcode byte / 256
510
- // 0..4 11..13 Extra bytes after opcode
511
- //
512
- // We use eight bits for the length even though 7 would have sufficed
513
- // because of efficiency reasons:
514
- // (1) Extracting a byte is faster than a bit-field
515
- // (2) It properly aligns copy offset so we do not need a <<8
516
- static const uint16 char_table[256] = {
517
- 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
518
- 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
519
- 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
520
- 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
521
- 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
522
- 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
523
- 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
524
- 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
525
- 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
526
- 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
527
- 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
528
- 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
529
- 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
530
- 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
531
- 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
532
- 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
533
- 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
534
- 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
535
- 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
536
- 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
537
- 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
538
- 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
539
- 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
540
- 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
541
- 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
542
- 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
543
- 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
544
- 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
545
- 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
546
- 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
547
- 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
548
- 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
549
- };
550
-
551
- // In debug mode, allow optional computation of the table at startup.
552
- // Also, check that the decompression table is correct.
553
- #ifndef NDEBUG
554
- DEFINE_bool(snappy_dump_decompression_table, false,
555
- "If true, we print the decompression table at startup.");
556
-
557
- static uint16 MakeEntry(unsigned int extra,
558
- unsigned int len,
559
- unsigned int copy_offset) {
560
- // Check that all of the fields fit within the allocated space
561
- assert(extra == (extra & 0x7)); // At most 3 bits
562
- assert(copy_offset == (copy_offset & 0x7)); // At most 3 bits
563
- assert(len == (len & 0x7f)); // At most 7 bits
564
- return len | (copy_offset << 8) | (extra << 11);
717
+ static inline uint32 ExtractLowBytes(uint32 v, int n) {
718
+ assert(n >= 0);
719
+ assert(n <= 4);
720
+ #if SNAPPY_HAVE_BMI2
721
+ return _bzhi_u32(v, 8 * n);
722
+ #else
723
+ // This needs to be wider than uint32 otherwise `mask << 32` will be
724
+ // undefined.
725
+ uint64 mask = 0xffffffff;
726
+ return v & ~(mask << (8 * n));
727
+ #endif
565
728
  }
566
729
 
567
- static void ComputeTable() {
568
- uint16 dst[256];
569
-
570
- // Place invalid entries in all places to detect missing initialization
571
- int assigned = 0;
572
- for (int i = 0; i < 256; i++) {
573
- dst[i] = 0xffff;
574
- }
575
-
576
- // Small LITERAL entries. We store (len-1) in the top 6 bits.
577
- for (unsigned int len = 1; len <= 60; len++) {
578
- dst[LITERAL | ((len-1) << 2)] = MakeEntry(0, len, 0);
579
- assigned++;
580
- }
581
-
582
- // Large LITERAL entries. We use 60..63 in the high 6 bits to
583
- // encode the number of bytes of length info that follow the opcode.
584
- for (unsigned int extra_bytes = 1; extra_bytes <= 4; extra_bytes++) {
585
- // We set the length field in the lookup table to 1 because extra
586
- // bytes encode len-1.
587
- dst[LITERAL | ((extra_bytes+59) << 2)] = MakeEntry(extra_bytes, 1, 0);
588
- assigned++;
589
- }
590
-
591
- // COPY_1_BYTE_OFFSET.
592
- //
593
- // The tag byte in the compressed data stores len-4 in 3 bits, and
594
- // offset/256 in 5 bits. offset%256 is stored in the next byte.
595
- //
596
- // This format is used for length in range [4..11] and offset in
597
- // range [0..2047]
598
- for (unsigned int len = 4; len < 12; len++) {
599
- for (unsigned int offset = 0; offset < 2048; offset += 256) {
600
- dst[COPY_1_BYTE_OFFSET | ((len-4)<<2) | ((offset>>8)<<5)] =
601
- MakeEntry(1, len, offset>>8);
602
- assigned++;
603
- }
604
- }
605
-
606
- // COPY_2_BYTE_OFFSET.
607
- // Tag contains len-1 in top 6 bits, and offset in next two bytes.
608
- for (unsigned int len = 1; len <= 64; len++) {
609
- dst[COPY_2_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(2, len, 0);
610
- assigned++;
611
- }
612
-
613
- // COPY_4_BYTE_OFFSET.
614
- // Tag contents len-1 in top 6 bits, and offset in next four bytes.
615
- for (unsigned int len = 1; len <= 64; len++) {
616
- dst[COPY_4_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(4, len, 0);
617
- assigned++;
618
- }
619
-
620
- // Check that each entry was initialized exactly once.
621
- if (assigned != 256) {
622
- fprintf(stderr, "ComputeTable: assigned only %d of 256\n", assigned);
623
- abort();
624
- }
625
- for (int i = 0; i < 256; i++) {
626
- if (dst[i] == 0xffff) {
627
- fprintf(stderr, "ComputeTable: did not assign byte %d\n", i);
628
- abort();
629
- }
630
- }
631
-
632
- if (FLAGS_snappy_dump_decompression_table) {
633
- printf("static const uint16 char_table[256] = {\n ");
634
- for (int i = 0; i < 256; i++) {
635
- printf("0x%04x%s",
636
- dst[i],
637
- ((i == 255) ? "\n" : (((i%8) == 7) ? ",\n " : ", ")));
638
- }
639
- printf("};\n");
640
- }
641
-
642
- // Check that computed table matched recorded table
643
- for (int i = 0; i < 256; i++) {
644
- if (dst[i] != char_table[i]) {
645
- fprintf(stderr, "ComputeTable: byte %d: computed (%x), expect (%x)\n",
646
- i, static_cast<int>(dst[i]), static_cast<int>(char_table[i]));
647
- abort();
648
- }
649
- }
730
+ static inline bool LeftShiftOverflows(uint8 value, uint32 shift) {
731
+ assert(shift < 32);
732
+ static const uint8 masks[] = {
733
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
734
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
735
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
736
+ 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
737
+ return (value & masks[shift]) != 0;
650
738
  }
651
- #endif /* !NDEBUG */
652
739
 
653
740
  // Helper class for decompression
654
741
  class SnappyDecompressor {
@@ -687,7 +774,7 @@ class SnappyDecompressor {
687
774
  }
688
775
 
689
776
  // Read the uncompressed length stored at the start of the compressed data.
690
- // On succcess, stores the length in *result and returns true.
777
+ // On success, stores the length in *result and returns true.
691
778
  // On failure, returns false.
692
779
  bool ReadUncompressedLength(uint32* result) {
693
780
  assert(ip_ == NULL); // Must not have read anything yet
@@ -701,7 +788,9 @@ class SnappyDecompressor {
701
788
  if (n == 0) return false;
702
789
  const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
703
790
  reader_->Skip(1);
704
- *result |= static_cast<uint32>(c & 0x7f) << shift;
791
+ uint32 val = c & 0x7f;
792
+ if (LeftShiftOverflows(static_cast<uint8>(val), shift)) return false;
793
+ *result |= val << shift;
705
794
  if (c < 128) {
706
795
  break;
707
796
  }
@@ -713,9 +802,27 @@ class SnappyDecompressor {
713
802
  // Process the next item found in the input.
714
803
  // Returns true if successful, false on error or end of input.
715
804
  template <class Writer>
805
+ #if defined(__GNUC__) && defined(__x86_64__)
806
+ __attribute__((aligned(32)))
807
+ #endif
716
808
  void DecompressAllTags(Writer* writer) {
717
- const char* ip = ip_;
809
+ // In x86, pad the function body to start 16 bytes later. This function has
810
+ // a couple of hotspots that are highly sensitive to alignment: we have
811
+ // observed regressions by more than 20% in some metrics just by moving the
812
+ // exact same code to a different position in the benchmark binary.
813
+ //
814
+ // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit
815
+ // the "lucky" case consistently. Unfortunately, this is a very brittle
816
+ // workaround, and future differences in code generation may reintroduce
817
+ // this regression. If you experience a big, difficult to explain, benchmark
818
+ // performance regression here, first try removing this hack.
819
+ #if defined(__GNUC__) && defined(__x86_64__)
820
+ // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions.
821
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
822
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
823
+ #endif
718
824
 
825
+ const char* ip = ip_;
719
826
  // We could have put this refill fragment only at the beginning of the loop.
720
827
  // However, duplicating it at the end of each branch gives the compiler more
721
828
  // scope to optimize the <ip_limit_ - ip> expression based on the local
@@ -731,21 +838,34 @@ class SnappyDecompressor {
731
838
  for ( ;; ) {
732
839
  const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
733
840
 
734
- if ((c & 0x3) == LITERAL) {
841
+ // Ratio of iterations that have LITERAL vs non-LITERAL for different
842
+ // inputs.
843
+ //
844
+ // input LITERAL NON_LITERAL
845
+ // -----------------------------------
846
+ // html|html4|cp 23% 77%
847
+ // urls 36% 64%
848
+ // jpg 47% 53%
849
+ // pdf 19% 81%
850
+ // txt[1-4] 25% 75%
851
+ // pb 24% 76%
852
+ // bin 24% 76%
853
+ if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) {
735
854
  size_t literal_length = (c >> 2) + 1u;
736
855
  if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
737
856
  assert(literal_length < 61);
738
857
  ip += literal_length;
739
- // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
858
+ // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
740
859
  // will not return true unless there's already at least five spare
741
860
  // bytes in addition to the literal.
742
861
  continue;
743
862
  }
744
- if (PREDICT_FALSE(literal_length >= 61)) {
863
+ if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) {
745
864
  // Long literal.
746
865
  const size_t literal_length_length = literal_length - 60;
747
866
  literal_length =
748
- (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
867
+ ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +
868
+ 1;
749
869
  ip += literal_length_length;
750
870
  }
751
871
 
@@ -767,15 +887,16 @@ class SnappyDecompressor {
767
887
  ip += literal_length;
768
888
  MAYBE_REFILL();
769
889
  } else {
770
- const uint32 entry = char_table[c];
771
- const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
772
- const uint32 length = entry & 0xff;
890
+ const size_t entry = char_table[c];
891
+ const size_t trailer =
892
+ ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11);
893
+ const size_t length = entry & 0xff;
773
894
  ip += entry >> 11;
774
895
 
775
896
  // copy_offset/256 is encoded in bits 8..10. By just fetching
776
897
  // those bits, we get copy_offset (since the bit-field starts at
777
898
  // bit 8).
778
- const uint32 copy_offset = entry & 0x700;
899
+ const size_t copy_offset = entry & 0x700;
779
900
  if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
780
901
  return;
781
902
  }
@@ -795,10 +916,8 @@ bool SnappyDecompressor::RefillTag() {
795
916
  size_t n;
796
917
  ip = reader_->Peek(&n);
797
918
  peeked_ = n;
798
- if (n == 0) {
799
- eof_ = true;
800
- return false;
801
- }
919
+ eof_ = (n == 0);
920
+ if (eof_) return false;
802
921
  ip_limit_ = ip + n;
803
922
  }
804
923
 
@@ -823,7 +942,7 @@ bool SnappyDecompressor::RefillTag() {
823
942
  size_t length;
824
943
  const char* src = reader_->Peek(&length);
825
944
  if (length == 0) return false;
826
- uint32 to_add = min<uint32>(needed - nbuf, length);
945
+ uint32 to_add = std::min<uint32>(needed - nbuf, length);
827
946
  memcpy(scratch_ + nbuf, src, to_add);
828
947
  nbuf += to_add;
829
948
  reader_->Skip(to_add);
@@ -852,17 +971,23 @@ static bool InternalUncompress(Source* r, Writer* writer) {
852
971
  SnappyDecompressor decompressor(r);
853
972
  uint32 uncompressed_len = 0;
854
973
  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;
855
- return InternalUncompressAllTags(&decompressor, writer, uncompressed_len);
974
+
975
+ return InternalUncompressAllTags(&decompressor, writer, r->Available(),
976
+ uncompressed_len);
856
977
  }
857
978
 
858
979
  template <typename Writer>
859
980
  static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
860
981
  Writer* writer,
982
+ uint32 compressed_len,
861
983
  uint32 uncompressed_len) {
984
+ Report("snappy_uncompress", compressed_len, uncompressed_len);
985
+
862
986
  writer->SetExpectedLength(uncompressed_len);
863
987
 
864
988
  // Process the entire input
865
989
  decompressor->DecompressAllTags(writer);
990
+ writer->Flush();
866
991
  return (decompressor->eof() && writer->CheckLength());
867
992
  }
868
993
 
@@ -874,21 +999,20 @@ bool GetUncompressedLength(Source* source, uint32* result) {
874
999
  size_t Compress(Source* reader, Sink* writer) {
875
1000
  size_t written = 0;
876
1001
  size_t N = reader->Available();
1002
+ const size_t uncompressed_size = N;
877
1003
  char ulength[Varint::kMax32];
878
1004
  char* p = Varint::Encode32(ulength, N);
879
1005
  writer->Append(ulength, p-ulength);
880
1006
  written += (p - ulength);
881
1007
 
882
- internal::WorkingMemory wmem;
883
- char* scratch = NULL;
884
- char* scratch_output = NULL;
1008
+ internal::WorkingMemory wmem(N);
885
1009
 
886
1010
  while (N > 0) {
887
1011
  // Get next block to compress (without copying if possible)
888
1012
  size_t fragment_size;
889
1013
  const char* fragment = reader->Peek(&fragment_size);
890
1014
  assert(fragment_size != 0); // premature end of input
891
- const size_t num_to_read = min(N, kBlockSize);
1015
+ const size_t num_to_read = std::min(N, kBlockSize);
892
1016
  size_t bytes_read = fragment_size;
893
1017
 
894
1018
  size_t pending_advance = 0;
@@ -897,19 +1021,13 @@ size_t Compress(Source* reader, Sink* writer) {
897
1021
  pending_advance = num_to_read;
898
1022
  fragment_size = num_to_read;
899
1023
  } else {
900
- // Read into scratch buffer
901
- if (scratch == NULL) {
902
- // If this is the last iteration, we want to allocate N bytes
903
- // of space, otherwise the max possible kBlockSize space.
904
- // num_to_read contains exactly the correct value
905
- scratch = new char[num_to_read];
906
- }
1024
+ char* scratch = wmem.GetScratchInput();
907
1025
  memcpy(scratch, fragment, bytes_read);
908
1026
  reader->Skip(bytes_read);
909
1027
 
910
1028
  while (bytes_read < num_to_read) {
911
1029
  fragment = reader->Peek(&fragment_size);
912
- size_t n = min<size_t>(fragment_size, num_to_read - bytes_read);
1030
+ size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
913
1031
  memcpy(scratch + bytes_read, fragment, n);
914
1032
  bytes_read += n;
915
1033
  reader->Skip(n);
@@ -929,16 +1047,13 @@ size_t Compress(Source* reader, Sink* writer) {
929
1047
 
930
1048
  // Need a scratch buffer for the output, in case the byte sink doesn't
931
1049
  // have room for us directly.
932
- if (scratch_output == NULL) {
933
- scratch_output = new char[max_output];
934
- } else {
935
- // Since we encode kBlockSize regions followed by a region
936
- // which is <= kBlockSize in length, a previously allocated
937
- // scratch_output[] region is big enough for this iteration.
938
- }
939
- char* dest = writer->GetAppendBuffer(max_output, scratch_output);
940
- char* end = internal::CompressFragment(fragment, fragment_size,
941
- dest, table, table_size);
1050
+
1051
+ // Since we encode kBlockSize regions followed by a region
1052
+ // which is <= kBlockSize in length, a previously allocated
1053
+ // scratch_output[] region is big enough for this iteration.
1054
+ char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
1055
+ char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
1056
+ table_size);
942
1057
  writer->Append(dest, end - dest);
943
1058
  written += (end - dest);
944
1059
 
@@ -946,8 +1061,7 @@ size_t Compress(Source* reader, Sink* writer) {
946
1061
  reader->Skip(pending_advance);
947
1062
  }
948
1063
 
949
- delete[] scratch;
950
- delete[] scratch_output;
1064
+ Report("snappy_compress", written, uncompressed_size);
951
1065
 
952
1066
  return written;
953
1067
  }
@@ -961,14 +1075,22 @@ size_t Compress(Source* reader, Sink* writer) {
961
1075
  // Writer template argument to SnappyDecompressor::DecompressAllTags().
962
1076
  class SnappyIOVecWriter {
963
1077
  private:
1078
+ // output_iov_end_ is set to iov + count and used to determine when
1079
+ // the end of the iovs is reached.
1080
+ const struct iovec* output_iov_end_;
1081
+
1082
+ #if !defined(NDEBUG)
964
1083
  const struct iovec* output_iov_;
965
- const size_t output_iov_count_;
1084
+ #endif // !defined(NDEBUG)
1085
+
1086
+ // Current iov that is being written into.
1087
+ const struct iovec* curr_iov_;
966
1088
 
967
- // We are currently writing into output_iov_[curr_iov_index_].
968
- int curr_iov_index_;
1089
+ // Pointer to current iov's write location.
1090
+ char* curr_iov_output_;
969
1091
 
970
- // Bytes written to output_iov_[curr_iov_index_] so far.
971
- size_t curr_iov_written_;
1092
+ // Remaining bytes to write into curr_iov_output.
1093
+ size_t curr_iov_remaining_;
972
1094
 
973
1095
  // Total bytes decompressed into output_iov_ so far.
974
1096
  size_t total_written_;
@@ -976,22 +1098,24 @@ class SnappyIOVecWriter {
976
1098
  // Maximum number of bytes that will be decompressed into output_iov_.
977
1099
  size_t output_limit_;
978
1100
 
979
- inline char* GetIOVecPointer(int index, size_t offset) {
980
- return reinterpret_cast<char*>(output_iov_[index].iov_base) +
981
- offset;
1101
+ static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
1102
+ return reinterpret_cast<char*>(iov->iov_base) + offset;
982
1103
  }
983
1104
 
984
1105
  public:
985
1106
  // Does not take ownership of iov. iov must be valid during the
986
1107
  // entire lifetime of the SnappyIOVecWriter.
987
1108
  inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
988
- : output_iov_(iov),
989
- output_iov_count_(iov_count),
990
- curr_iov_index_(0),
991
- curr_iov_written_(0),
1109
+ : output_iov_end_(iov + iov_count),
1110
+ #if !defined(NDEBUG)
1111
+ output_iov_(iov),
1112
+ #endif // !defined(NDEBUG)
1113
+ curr_iov_(iov),
1114
+ curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
1115
+ : nullptr),
1116
+ curr_iov_remaining_(iov_count ? iov->iov_len : 0),
992
1117
  total_written_(0),
993
- output_limit_(-1) {
994
- }
1118
+ output_limit_(-1) {}
995
1119
 
996
1120
  inline void SetExpectedLength(size_t len) {
997
1121
  output_limit_ = len;
@@ -1006,23 +1130,25 @@ class SnappyIOVecWriter {
1006
1130
  return false;
1007
1131
  }
1008
1132
 
1133
+ return AppendNoCheck(ip, len);
1134
+ }
1135
+
1136
+ inline bool AppendNoCheck(const char* ip, size_t len) {
1009
1137
  while (len > 0) {
1010
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
1011
- if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
1138
+ if (curr_iov_remaining_ == 0) {
1012
1139
  // This iovec is full. Go to the next one.
1013
- if (curr_iov_index_ + 1 >= output_iov_count_) {
1140
+ if (curr_iov_ + 1 >= output_iov_end_) {
1014
1141
  return false;
1015
1142
  }
1016
- curr_iov_written_ = 0;
1017
- ++curr_iov_index_;
1143
+ ++curr_iov_;
1144
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
1145
+ curr_iov_remaining_ = curr_iov_->iov_len;
1018
1146
  }
1019
1147
 
1020
- const size_t to_write = std::min(
1021
- len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
1022
- memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
1023
- ip,
1024
- to_write);
1025
- curr_iov_written_ += to_write;
1148
+ const size_t to_write = std::min(len, curr_iov_remaining_);
1149
+ memcpy(curr_iov_output_, ip, to_write);
1150
+ curr_iov_output_ += to_write;
1151
+ curr_iov_remaining_ -= to_write;
1026
1152
  total_written_ += to_write;
1027
1153
  ip += to_write;
1028
1154
  len -= to_write;
@@ -1034,12 +1160,11 @@ class SnappyIOVecWriter {
1034
1160
  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
1035
1161
  const size_t space_left = output_limit_ - total_written_;
1036
1162
  if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
1037
- output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
1163
+ curr_iov_remaining_ >= 16) {
1038
1164
  // Fast path, used for the majority (about 95%) of invocations.
1039
- char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
1040
- UnalignedCopy64(ip, ptr);
1041
- UnalignedCopy64(ip + 8, ptr + 8);
1042
- curr_iov_written_ += len;
1165
+ UnalignedCopy128(ip, curr_iov_output_);
1166
+ curr_iov_output_ += len;
1167
+ curr_iov_remaining_ -= len;
1043
1168
  total_written_ += len;
1044
1169
  return true;
1045
1170
  }
@@ -1048,7 +1173,9 @@ class SnappyIOVecWriter {
   }
 
   inline bool AppendFromSelf(size_t offset, size_t len) {
-    if (offset > total_written_ || offset == 0) {
+    // See SnappyArrayWriter::AppendFromSelf for an explanation of
+    // the "offset - 1u" trick.
+    if (offset - 1u >= total_written_) {
       return false;
     }
     const size_t space_left = output_limit_ - total_written_;
@@ -1057,8 +1184,8 @@ class SnappyIOVecWriter {
     }
 
     // Locate the iovec from which we need to start the copy.
-    int from_iov_index = curr_iov_index_;
-    size_t from_iov_offset = curr_iov_written_;
+    const iovec* from_iov = curr_iov_;
+    size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
     while (offset > 0) {
       if (from_iov_offset >= offset) {
         from_iov_offset -= offset;
@@ -1066,46 +1193,47 @@ class SnappyIOVecWriter {
       }
 
       offset -= from_iov_offset;
-      --from_iov_index;
-      assert(from_iov_index >= 0);
-      from_iov_offset = output_iov_[from_iov_index].iov_len;
+      --from_iov;
+#if !defined(NDEBUG)
+      assert(from_iov >= output_iov_);
+#endif  // !defined(NDEBUG)
+      from_iov_offset = from_iov->iov_len;
     }
 
     // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
     // the current iovec.
     while (len > 0) {
-      assert(from_iov_index <= curr_iov_index_);
-      if (from_iov_index != curr_iov_index_) {
-        const size_t to_copy = std::min(
-            output_iov_[from_iov_index].iov_len - from_iov_offset,
-            len);
-        Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
+      assert(from_iov <= curr_iov_);
+      if (from_iov != curr_iov_) {
+        const size_t to_copy =
+            std::min(from_iov->iov_len - from_iov_offset, len);
+        AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
         len -= to_copy;
         if (len > 0) {
-          ++from_iov_index;
+          ++from_iov;
           from_iov_offset = 0;
         }
       } else {
-        assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
-        size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
-                                      curr_iov_written_,
-                                  len);
+        size_t to_copy = curr_iov_remaining_;
         if (to_copy == 0) {
           // This iovec is full. Go to the next one.
-          if (curr_iov_index_ + 1 >= output_iov_count_) {
+          if (curr_iov_ + 1 >= output_iov_end_) {
             return false;
           }
-          ++curr_iov_index_;
-          curr_iov_written_ = 0;
+          ++curr_iov_;
+          curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+          curr_iov_remaining_ = curr_iov_->iov_len;
           continue;
         }
         if (to_copy > len) {
           to_copy = len;
         }
-        IncrementalCopy(GetIOVecPointer(from_iov_index, from_iov_offset),
-                        GetIOVecPointer(curr_iov_index_, curr_iov_written_),
-                        to_copy);
-        curr_iov_written_ += to_copy;
+
+        IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
+                        curr_iov_output_, curr_iov_output_ + to_copy,
+                        curr_iov_output_ + curr_iov_remaining_);
+        curr_iov_output_ += to_copy;
+        curr_iov_remaining_ -= to_copy;
         from_iov_offset += to_copy;
         total_written_ += to_copy;
         len -= to_copy;
@@ -1115,6 +1243,7 @@ class SnappyIOVecWriter {
     return true;
   }
 
+  inline void Flush() {}
 };
 
 bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
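The reworked SnappyIOVecWriter above backs the public RawUncompressToIOVec entry point declared in snappy.h. A usage sketch (the function and buffer names are illustrative; the iovecs together must be at least as large as the uncompressed output for the call to succeed):

#include <string>
#include <sys/uio.h>  // struct iovec (POSIX)
#include "snappy.h"

// Illustrative helper: decompress into two scattered output buffers.
bool UncompressIntoTwoBuffers(const std::string& compressed,
                              char* first, size_t first_len,
                              char* second, size_t second_len) {
  struct iovec iov[2];
  iov[0].iov_base = first;
  iov[0].iov_len = first_len;
  iov[1].iov_base = second;
  iov[1].iov_len = second_len;
  return snappy::RawUncompressToIOVec(compressed.data(), compressed.size(),
                                      iov, 2);
}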
@@ -1145,7 +1274,8 @@ class SnappyArrayWriter {
  public:
   inline explicit SnappyArrayWriter(char* dst)
       : base_(dst),
-        op_(dst) {
+        op_(dst),
+        op_limit_(dst) {
   }
 
   inline void SetExpectedLength(size_t len) {
@@ -1172,8 +1302,7 @@ class SnappyArrayWriter {
     const size_t space_left = op_limit_ - op;
     if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
       // Fast path, used for the majority (about 95%) of invocations.
-      UnalignedCopy64(ip, op);
-      UnalignedCopy64(ip + 8, op + 8);
+      UnalignedCopy128(ip, op);
       op_ = op + len;
       return true;
     } else {
@@ -1182,8 +1311,7 @@ class SnappyArrayWriter {
   }
 
   inline bool AppendFromSelf(size_t offset, size_t len) {
-    char* op = op_;
-    const size_t space_left = op_limit_ - op;
+    char* const op_end = op_ + len;
 
     // Check if we try to append from before the start of the buffer.
     // Normally this would just be a check for "produced < offset",
@@ -1192,29 +1320,16 @@ class SnappyArrayWriter {
     // to a very big number. This is convenient, as offset==0 is another
     // invalid case that we also want to catch, so that we do not go
     // into an infinite loop.
-    assert(op >= base_);
-    size_t produced = op - base_;
-    if (produced <= offset - 1u) {
-      return false;
-    }
-    if (len <= 16 && offset >= 8 && space_left >= 16) {
-      // Fast path, used for the majority (70-80%) of dynamic invocations.
-      UnalignedCopy64(op - offset, op);
-      UnalignedCopy64(op - offset + 8, op + 8);
-    } else {
-      if (space_left >= len + kMaxIncrementCopyOverflow) {
-        IncrementalCopyFastPath(op - offset, op, len);
-      } else {
-        if (space_left < len) {
-          return false;
-        }
-        IncrementalCopy(op - offset, op, len);
-      }
-    }
+    if (Produced() <= offset - 1u || op_end > op_limit_) return false;
+    op_ = IncrementalCopy(op_ - offset, op_, op_end, op_limit_);
 
-    op_ = op + len;
     return true;
   }
+  inline size_t Produced() const {
+    assert(op_ >= base_);
+    return op_ - base_;
+  }
+  inline void Flush() {}
 };
 
 bool RawUncompress(const char* compressed, size_t n, char* uncompressed) {
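The comment kept in the hunk above explains the "offset - 1u" trick that the new single-comparison bounds checks rely on: the subtraction is unsigned, so offset == 0 wraps around to a huge value and one comparison rejects both invalid cases. A standalone illustration (names are illustrative, not part of the patch):

#include <cassert>
#include <cstddef>

int main() {
  const size_t produced = 10;  // bytes already written to the output
  // Mirrors the rejection condition "offset - 1u >= produced".
  auto rejected = [&](size_t offset) { return offset - 1u >= produced; };
  assert(rejected(0));    // 0 - 1u wraps to SIZE_MAX, so offset == 0 is caught
  assert(rejected(11));   // offset reaches before the start of the buffer
  assert(!rejected(10));  // copying from the very first produced byte is fine
  assert(!rejected(1));   // copying the previous byte is fine
  return 0;
}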
@@ -1227,7 +1342,7 @@ bool RawUncompress(Source* compressed, char* uncompressed) {
   return InternalUncompress(compressed, &output);
 }
 
-bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
+bool Uncompress(const char* compressed, size_t n, std::string* uncompressed) {
   size_t ulength;
   if (!GetUncompressedLength(compressed, n, &ulength)) {
     return false;
@@ -1241,7 +1356,6 @@ bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
   return RawUncompress(compressed, n, string_as_array(uncompressed));
 }
 
-
 // A Writer that drops everything on the floor and just does validation
 class SnappyDecompressionValidator {
  private:
@@ -1249,7 +1363,7 @@ class SnappyDecompressionValidator {
   size_t produced_;
 
  public:
-  inline SnappyDecompressionValidator() : produced_(0) { }
+  inline SnappyDecompressionValidator() : expected_(0), produced_(0) { }
   inline void SetExpectedLength(size_t len) {
     expected_ = len;
   }
@@ -1270,6 +1384,7 @@ class SnappyDecompressionValidator {
     produced_ += len;
     return produced_ <= expected_;
   }
+  inline void Flush() {}
 };
 
 bool IsValidCompressedBuffer(const char* compressed, size_t n) {
@@ -1278,6 +1393,11 @@ bool IsValidCompressedBuffer(const char* compressed, size_t n) {
   return InternalUncompress(&reader, &writer);
 }
 
+bool IsValidCompressed(Source* compressed) {
+  SnappyDecompressionValidator writer;
+  return InternalUncompress(compressed, &writer);
+}
+
 void RawCompress(const char* input,
                  size_t input_length,
                  char* compressed,
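The IsValidCompressed(Source*) overload added in the hunk above mirrors the existing flat-buffer check but reads through the Source abstraction. A usage sketch (the helper name is illustrative; ByteArraySource comes from snappy-sinksource.h):

#include <string>
#include "snappy.h"
#include "snappy-sinksource.h"

// Illustrative helper: validate the same data both ways.
bool LooksLikeSnappy(const std::string& data) {
  if (!snappy::IsValidCompressedBuffer(data.data(), data.size())) return false;
  snappy::ByteArraySource source(data.data(), data.size());
  return snappy::IsValidCompressed(&source);
}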
@@ -1290,9 +1410,10 @@ void RawCompress(const char* input,
   *compressed_length = (writer.CurrentDestination() - compressed);
 }
 
-size_t Compress(const char* input, size_t input_length, string* compressed) {
+size_t Compress(const char* input, size_t input_length,
+                std::string* compressed) {
   // Pre-grow the buffer to the max length of the compressed output
-  compressed->resize(MaxCompressedLength(input_length));
+  STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
 
   size_t compressed_length;
   RawCompress(input, input_length, string_as_array(compressed),
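With Compress and Uncompress now spelled in terms of std::string, a round trip through the string overloads looks like this sketch, using only the public snappy.h API:

#include <cassert>
#include <string>
#include "snappy.h"

int main() {
  const std::string input(1000, 'a');  // highly compressible sample input
  std::string compressed;
  snappy::Compress(input.data(), input.size(), &compressed);

  std::string restored;
  bool ok = snappy::Uncompress(compressed.data(), compressed.size(), &restored);
  assert(ok && restored == input);
  return 0;
}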
@@ -1301,6 +1422,240 @@ size_t Compress(const char* input, size_t input_length, string* compressed) {
   return compressed_length;
 }
 
+// -----------------------------------------------------------------------
+// Sink interface
+// -----------------------------------------------------------------------
+
+// A type that decompresses into a Sink. The template parameter
+// Allocator must export one method "char* Allocate(int size);", which
+// allocates a buffer of "size" and appends that to the destination.
+template <typename Allocator>
+class SnappyScatteredWriter {
+  Allocator allocator_;
+
+  // We need random access into the data generated so far. Therefore
+  // we keep track of all of the generated data as an array of blocks.
+  // All of the blocks except the last have length kBlockSize.
+  std::vector<char*> blocks_;
+  size_t expected_;
+
+  // Total size of all fully generated blocks so far
+  size_t full_size_;
+
+  // Pointer into current output block
+  char* op_base_;   // Base of output block
+  char* op_ptr_;    // Pointer to next unfilled byte in block
+  char* op_limit_;  // Pointer just past block
+
+  inline size_t Size() const {
+    return full_size_ + (op_ptr_ - op_base_);
+  }
+
+  bool SlowAppend(const char* ip, size_t len);
+  bool SlowAppendFromSelf(size_t offset, size_t len);
+
+ public:
+  inline explicit SnappyScatteredWriter(const Allocator& allocator)
+      : allocator_(allocator),
+        full_size_(0),
+        op_base_(NULL),
+        op_ptr_(NULL),
+        op_limit_(NULL) {
+  }
+
+  inline void SetExpectedLength(size_t len) {
+    assert(blocks_.empty());
+    expected_ = len;
+  }
+
+  inline bool CheckLength() const {
+    return Size() == expected_;
+  }
+
+  // Return the number of bytes actually uncompressed so far
+  inline size_t Produced() const {
+    return Size();
+  }
+
+  inline bool Append(const char* ip, size_t len) {
+    size_t avail = op_limit_ - op_ptr_;
+    if (len <= avail) {
+      // Fast path
+      memcpy(op_ptr_, ip, len);
+      op_ptr_ += len;
+      return true;
+    } else {
+      return SlowAppend(ip, len);
+    }
+  }
+
+  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
+    char* op = op_ptr_;
+    const int space_left = op_limit_ - op;
+    if (length <= 16 && available >= 16 + kMaximumTagLength &&
+        space_left >= 16) {
+      // Fast path, used for the majority (about 95%) of invocations.
+      UnalignedCopy128(ip, op);
+      op_ptr_ = op + length;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  inline bool AppendFromSelf(size_t offset, size_t len) {
+    char* const op_end = op_ptr_ + len;
+    // See SnappyArrayWriter::AppendFromSelf for an explanation of
+    // the "offset - 1u" trick.
+    if (SNAPPY_PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ &&
+                            op_end <= op_limit_)) {
+      // Fast path: src and dst in current block.
+      op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_);
+      return true;
+    }
+    return SlowAppendFromSelf(offset, len);
+  }
+
+  // Called at the end of the decompress. We ask the allocator
+  // write all blocks to the sink.
+  inline void Flush() { allocator_.Flush(Produced()); }
+};
+
+template<typename Allocator>
+bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) {
+  size_t avail = op_limit_ - op_ptr_;
+  while (len > avail) {
+    // Completely fill this block
+    memcpy(op_ptr_, ip, avail);
+    op_ptr_ += avail;
+    assert(op_limit_ - op_ptr_ == 0);
+    full_size_ += (op_ptr_ - op_base_);
+    len -= avail;
+    ip += avail;
+
+    // Bounds check
+    if (full_size_ + len > expected_) {
+      return false;
+    }
+
+    // Make new block
+    size_t bsize = std::min<size_t>(kBlockSize, expected_ - full_size_);
+    op_base_ = allocator_.Allocate(bsize);
+    op_ptr_ = op_base_;
+    op_limit_ = op_base_ + bsize;
+    blocks_.push_back(op_base_);
+    avail = bsize;
+  }
+
+  memcpy(op_ptr_, ip, len);
+  op_ptr_ += len;
+  return true;
+}
+
+template<typename Allocator>
+bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset,
+                                                          size_t len) {
+  // Overflow check
+  // See SnappyArrayWriter::AppendFromSelf for an explanation of
+  // the "offset - 1u" trick.
+  const size_t cur = Size();
+  if (offset - 1u >= cur) return false;
+  if (expected_ - cur < len) return false;
+
+  // Currently we shouldn't ever hit this path because Compress() chops the
+  // input into blocks and does not create cross-block copies. However, it is
+  // nice if we do not rely on that, since we can get better compression if we
+  // allow cross-block copies and thus might want to change the compressor in
+  // the future.
+  size_t src = cur - offset;
+  while (len-- > 0) {
+    char c = blocks_[src >> kBlockLog][src & (kBlockSize-1)];
+    Append(&c, 1);
+    src++;
+  }
+  return true;
+}
+
+class SnappySinkAllocator {
+ public:
+  explicit SnappySinkAllocator(Sink* dest): dest_(dest) {}
+  ~SnappySinkAllocator() {}
+
+  char* Allocate(int size) {
+    Datablock block(new char[size], size);
+    blocks_.push_back(block);
+    return block.data;
+  }
+
+  // We flush only at the end, because the writer wants
+  // random access to the blocks and once we hand the
+  // block over to the sink, we can't access it anymore.
+  // Also we don't write more than has been actually written
+  // to the blocks.
+  void Flush(size_t size) {
+    size_t size_written = 0;
+    size_t block_size;
+    for (int i = 0; i < blocks_.size(); ++i) {
+      block_size = std::min<size_t>(blocks_[i].size, size - size_written);
+      dest_->AppendAndTakeOwnership(blocks_[i].data, block_size,
+                                    &SnappySinkAllocator::Deleter, NULL);
+      size_written += block_size;
+    }
+    blocks_.clear();
+  }
+
+ private:
+  struct Datablock {
+    char* data;
+    size_t size;
+    Datablock(char* p, size_t s) : data(p), size(s) {}
+  };
+
+  static void Deleter(void* arg, const char* bytes, size_t size) {
+    delete[] bytes;
+  }
+
+  Sink* dest_;
+  std::vector<Datablock> blocks_;
+
+  // Note: copying this object is allowed
+};
+
+size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed) {
+  SnappySinkAllocator allocator(uncompressed);
+  SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
+  InternalUncompress(compressed, &writer);
+  return writer.Produced();
+}
+
+bool Uncompress(Source* compressed, Sink* uncompressed) {
+  // Read the uncompressed length from the front of the compressed input
+  SnappyDecompressor decompressor(compressed);
+  uint32 uncompressed_len = 0;
+  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) {
+    return false;
+  }
 
-}  // end namespace snappy
+  char c;
+  size_t allocated_size;
+  char* buf = uncompressed->GetAppendBufferVariable(
+      1, uncompressed_len, &c, 1, &allocated_size);
+
+  const size_t compressed_len = compressed->Available();
+  // If we can get a flat buffer, then use it, otherwise do block by block
+  // uncompression
+  if (allocated_size >= uncompressed_len) {
+    SnappyArrayWriter writer(buf);
+    bool result = InternalUncompressAllTags(&decompressor, &writer,
+                                            compressed_len, uncompressed_len);
+    uncompressed->Append(buf, writer.Produced());
+    return result;
+  } else {
+    SnappySinkAllocator allocator(uncompressed);
+    SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
+    return InternalUncompressAllTags(&decompressor, &writer, compressed_len,
+                                     uncompressed_len);
+  }
+}
 
+}  // namespace snappy
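The Sink-based path added above (SnappyScatteredWriter, SnappySinkAllocator and the Source/Sink Uncompress overload) is reachable through the public Source and Sink types from snappy-sinksource.h. A sketch of that entry point (the helper name is illustrative, not part of the patch):

#include <string>
#include <vector>
#include "snappy.h"
#include "snappy-sinksource.h"

// Illustrative helper: decompress via the Source/Sink overload into a
// caller-managed buffer, then hand the result back as a std::string.
bool UncompressViaSink(const std::string& compressed, std::string* out) {
  size_t ulength = 0;
  if (!snappy::GetUncompressedLength(compressed.data(), compressed.size(),
                                     &ulength)) {
    return false;
  }
  std::vector<char> buffer(ulength);
  snappy::ByteArraySource source(compressed.data(), compressed.size());
  snappy::UncheckedByteArraySink sink(buffer.data());
  if (!snappy::Uncompress(&source, &sink)) return false;
  out->assign(buffer.data(), ulength);
  return true;
}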