snappy 0.0.14-java → 0.2.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. checksums.yaml +5 -5
  2. data/.github/workflows/main.yml +34 -0
  3. data/.github/workflows/publish.yml +34 -0
  4. data/Gemfile +4 -0
  5. data/README.md +28 -4
  6. data/Rakefile +32 -29
  7. data/ext/api.c +6 -1
  8. data/ext/extconf.rb +21 -24
  9. data/lib/snappy.rb +6 -4
  10. data/lib/snappy/hadoop.rb +22 -0
  11. data/lib/snappy/hadoop/reader.rb +62 -0
  12. data/lib/snappy/hadoop/writer.rb +51 -0
  13. data/lib/snappy/reader.rb +19 -11
  14. data/lib/snappy/shim.rb +30 -0
  15. data/lib/snappy/version.rb +3 -1
  16. data/lib/snappy/writer.rb +8 -9
  17. data/snappy.gemspec +17 -37
  18. data/test/hadoop/snappy_hadoop_reader_test.rb +115 -0
  19. data/test/hadoop/snappy_hadoop_writer_test.rb +48 -0
  20. data/test/snappy_hadoop_test.rb +26 -0
  21. data/test/snappy_reader_test.rb +148 -0
  22. data/test/snappy_test.rb +95 -0
  23. data/test/snappy_writer_test.rb +55 -0
  24. data/test/test_helper.rb +7 -0
  25. data/vendor/snappy/CMakeLists.txt +297 -0
  26. data/vendor/snappy/CONTRIBUTING.md +26 -0
  27. data/vendor/snappy/COPYING +1 -1
  28. data/vendor/snappy/NEWS +60 -0
  29. data/vendor/snappy/{README → README.md} +29 -16
  30. data/vendor/snappy/cmake/SnappyConfig.cmake.in +33 -0
  31. data/vendor/snappy/cmake/config.h.in +62 -0
  32. data/vendor/snappy/docs/README.md +72 -0
  33. data/vendor/snappy/snappy-c.h +3 -3
  34. data/vendor/snappy/snappy-internal.h +113 -32
  35. data/vendor/snappy/snappy-sinksource.cc +33 -0
  36. data/vendor/snappy/snappy-sinksource.h +51 -6
  37. data/vendor/snappy/snappy-stubs-internal.cc +1 -1
  38. data/vendor/snappy/snappy-stubs-internal.h +160 -45
  39. data/vendor/snappy/snappy-stubs-public.h.in +23 -47
  40. data/vendor/snappy/snappy-test.cc +31 -24
  41. data/vendor/snappy/snappy-test.h +46 -103
  42. data/vendor/snappy/snappy.cc +786 -431
  43. data/vendor/snappy/snappy.h +37 -14
  44. data/vendor/snappy/snappy_compress_fuzzer.cc +59 -0
  45. data/vendor/snappy/snappy_uncompress_fuzzer.cc +57 -0
  46. data/vendor/snappy/snappy_unittest.cc +441 -290
  47. metadata +35 -75
  48. data/.travis.yml +0 -4
  49. data/test/test-snappy-reader.rb +0 -129
  50. data/test/test-snappy-writer.rb +0 -55
  51. data/test/test-snappy.rb +0 -58
  52. data/vendor/snappy/ChangeLog +0 -1916
  53. data/vendor/snappy/Makefile.am +0 -23
  54. data/vendor/snappy/autogen.sh +0 -7
  55. data/vendor/snappy/configure.ac +0 -133
  56. data/vendor/snappy/m4/gtest.m4 +0 -74
  57. data/vendor/snappy/testdata/alice29.txt +0 -3609
  58. data/vendor/snappy/testdata/asyoulik.txt +0 -4122
  59. data/vendor/snappy/testdata/baddata1.snappy +0 -0
  60. data/vendor/snappy/testdata/baddata2.snappy +0 -0
  61. data/vendor/snappy/testdata/baddata3.snappy +0 -0
  62. data/vendor/snappy/testdata/fireworks.jpeg +0 -0
  63. data/vendor/snappy/testdata/geo.protodata +0 -0
  64. data/vendor/snappy/testdata/html +0 -1
  65. data/vendor/snappy/testdata/html_x_4 +0 -1
  66. data/vendor/snappy/testdata/kppkn.gtb +0 -0
  67. data/vendor/snappy/testdata/lcet10.txt +0 -7519
  68. data/vendor/snappy/testdata/paper-100k.pdf +2 -600
  69. data/vendor/snappy/testdata/plrabn12.txt +0 -10699
  70. data/vendor/snappy/testdata/urls.10K +0 -10000
@@ -30,15 +30,58 @@
  #include "snappy-internal.h"
  #include "snappy-sinksource.h"

+ #if !defined(SNAPPY_HAVE_SSSE3)
+ // __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD
+ // support between SSE2 and AVX (so SSSE3 instructions require AVX support), and
+ // defines __AVX__ when AVX support is available.
+ #if defined(__SSSE3__) || defined(__AVX__)
+ #define SNAPPY_HAVE_SSSE3 1
+ #else
+ #define SNAPPY_HAVE_SSSE3 0
+ #endif
+ #endif // !defined(SNAPPY_HAVE_SSSE3)
+
+ #if !defined(SNAPPY_HAVE_BMI2)
+ // __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
+ // specifically, but it does define __AVX2__ when AVX2 support is available.
+ // Fortunately, AVX2 was introduced in Haswell, just like BMI2.
+ //
+ // BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
+ // GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
+ // case issuing BMI2 instructions results in a compiler error.
+ #if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
+ #define SNAPPY_HAVE_BMI2 1
+ #else
+ #define SNAPPY_HAVE_BMI2 0
+ #endif
+ #endif // !defined(SNAPPY_HAVE_BMI2)
+
+ #if SNAPPY_HAVE_SSSE3
+ // Please do not replace with <x86intrin.h>. or with headers that assume more
+ // advanced SSE versions without checking with all the OWNERS.
+ #include <tmmintrin.h>
+ #endif
+
+ #if SNAPPY_HAVE_BMI2
+ // Please do not replace with <x86intrin.h>. or with headers that assume more
+ // advanced SSE versions without checking with all the OWNERS.
+ #include <immintrin.h>
+ #endif
+
  #include <stdio.h>

  #include <algorithm>
  #include <string>
  #include <vector>

-
  namespace snappy {

+ using internal::COPY_1_BYTE_OFFSET;
+ using internal::COPY_2_BYTE_OFFSET;
+ using internal::LITERAL;
+ using internal::char_table;
+ using internal::kMaximumTagLength;
+
  // Any hash function will produce a valid compressed bitstream, but a good
  // hash function reduces the number of collisions and thus yields better
  // compression for compressible input, and more speed for incompressible
@@ -76,161 +119,313 @@ size_t MaxCompressedLength(size_t source_len) {
76
119
  return 32 + source_len + source_len/6;
77
120
  }
78
121
 
79
- enum {
80
- LITERAL = 0,
81
- COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode
82
- COPY_2_BYTE_OFFSET = 2,
83
- COPY_4_BYTE_OFFSET = 3
84
- };
85
- static const int kMaximumTagLength = 5; // COPY_4_BYTE_OFFSET plus the actual offset.
86
-
87
- // Copy "len" bytes from "src" to "op", one byte at a time. Used for
88
- // handling COPY operations where the input and output regions may
89
- // overlap. For example, suppose:
90
- // src == "ab"
91
- // op == src + 2
92
- // len == 20
93
- // After IncrementalCopy(src, op, len), the result will have
94
- // eleven copies of "ab"
122
+ namespace {
123
+
124
+ void UnalignedCopy64(const void* src, void* dst) {
125
+ char tmp[8];
126
+ memcpy(tmp, src, 8);
127
+ memcpy(dst, tmp, 8);
128
+ }
129
+
130
+ void UnalignedCopy128(const void* src, void* dst) {
131
+ // memcpy gets vectorized when the appropriate compiler options are used.
132
+ // For example, x86 compilers targeting SSE2+ will optimize to an SSE2 load
133
+ // and store.
134
+ char tmp[16];
135
+ memcpy(tmp, src, 16);
136
+ memcpy(dst, tmp, 16);
137
+ }
138
+
139
+ // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
140
+ // for handling COPY operations where the input and output regions may overlap.
141
+ // For example, suppose:
142
+ // src == "ab"
143
+ // op == src + 2
144
+ // op_limit == op + 20
145
+ // After IncrementalCopySlow(src, op, op_limit), the result will have eleven
146
+ // copies of "ab"
95
147
  // ababababababababababab
96
- // Note that this does not match the semantics of either memcpy()
97
- // or memmove().
98
- static inline void IncrementalCopy(const char* src, char* op, ssize_t len) {
99
- assert(len > 0);
100
- do {
148
+ // Note that this does not match the semantics of either memcpy() or memmove().
149
+ inline char* IncrementalCopySlow(const char* src, char* op,
150
+ char* const op_limit) {
151
+ // TODO: Remove pragma when LLVM is aware this
152
+ // function is only called in cold regions and when cold regions don't get
153
+ // vectorized or unrolled.
154
+ #ifdef __clang__
155
+ #pragma clang loop unroll(disable)
156
+ #endif
157
+ while (op < op_limit) {
101
158
  *op++ = *src++;
102
- } while (--len > 0);
159
+ }
160
+ return op_limit;
103
161
  }
104
162
 
105
- // Equivalent to IncrementalCopy except that it can write up to ten extra
106
- // bytes after the end of the copy, and that it is faster.
107
- //
108
- // The main part of this loop is a simple copy of eight bytes at a time until
109
- // we've copied (at least) the requested amount of bytes. However, if op and
110
- // src are less than eight bytes apart (indicating a repeating pattern of
111
- // length < 8), we first need to expand the pattern in order to get the correct
112
- // results. For instance, if the buffer looks like this, with the eight-byte
113
- // <src> and <op> patterns marked as intervals:
114
- //
115
- // abxxxxxxxxxxxx
116
- // [------] src
117
- // [------] op
118
- //
119
- // a single eight-byte copy from <src> to <op> will repeat the pattern once,
120
- // after which we can move <op> two bytes without moving <src>:
121
- //
122
- // ababxxxxxxxxxx
123
- // [------] src
124
- // [------] op
125
- //
126
- // and repeat the exercise until the two no longer overlap.
127
- //
128
- // This allows us to do very well in the special case of one single byte
129
- // repeated many times, without taking a big hit for more general cases.
130
- //
131
- // The worst case of extra writing past the end of the match occurs when
132
- // op - src == 1 and len == 1; the last copy will read from byte positions
133
- // [0..7] and write to [4..11], whereas it was only supposed to write to
134
- // position 1. Thus, ten excess bytes.
163
+ #if SNAPPY_HAVE_SSSE3
164
+
165
+ // This is a table of shuffle control masks that can be used as the source
166
+ // operand for PSHUFB to permute the contents of the destination XMM register
167
+ // into a repeating byte pattern.
168
+ alignas(16) const char pshufb_fill_patterns[7][16] = {
169
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
170
+ {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
171
+ {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
172
+ {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
173
+ {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
174
+ {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
175
+ {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
176
+ };
135
177
 
136
- namespace {
178
+ #endif // SNAPPY_HAVE_SSSE3
179
+
180
+ // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than
181
+ // IncrementalCopySlow. buf_limit is the address past the end of the writable
182
+ // region of the buffer.
183
+ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
184
+ char* const buf_limit) {
185
+ // Terminology:
186
+ //
187
+ // slop = buf_limit - op
188
+ // pat = op - src
189
+ // len = limit - op
190
+ assert(src < op);
191
+ assert(op <= op_limit);
192
+ assert(op_limit <= buf_limit);
193
+ // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
194
+ // to optimize this function but we have to also handle other cases in case
195
+ // the input does not satisfy these conditions.
196
+
197
+ size_t pattern_size = op - src;
198
+ // The cases are split into different branches to allow the branch predictor,
199
+ // FDO, and static prediction hints to work better. For each input we list the
200
+ // ratio of invocations that match each condition.
201
+ //
202
+ // input slop < 16 pat < 8 len > 16
203
+ // ------------------------------------------
204
+ // html|html4|cp 0% 1.01% 27.73%
205
+ // urls 0% 0.88% 14.79%
206
+ // jpg 0% 64.29% 7.14%
207
+ // pdf 0% 2.56% 58.06%
208
+ // txt[1-4] 0% 0.23% 0.97%
209
+ // pb 0% 0.96% 13.88%
210
+ // bin 0.01% 22.27% 41.17%
211
+ //
212
+ // It is very rare that we don't have enough slop for doing block copies. It
213
+ // is also rare that we need to expand a pattern. Small patterns are common
214
+ // for incompressible formats and for those we are plenty fast already.
215
+ // Lengths are normally not greater than 16 but they vary depending on the
216
+ // input. In general if we always predict len <= 16 it would be an ok
217
+ // prediction.
218
+ //
219
+ // In order to be fast we want a pattern >= 8 bytes and an unrolled loop
220
+ // copying 2x 8 bytes at a time.
221
+
222
+ // Handle the uncommon case where pattern is less than 8 bytes.
223
+ if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
224
+ #if SNAPPY_HAVE_SSSE3
225
+ // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
226
+ // to permute the register's contents in-place into a repeating sequence of
227
+ // the first "pattern_size" bytes.
228
+ // For example, suppose:
229
+ // src == "abc"
230
+ // op == src + 3
231
+ // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
232
+ // followed by one byte of slop: abcabcabcabcabca.
233
+ //
234
+ // The non-SSE fallback implementation suffers from store-forwarding stalls
235
+ // because its loads and stores partly overlap. By expanding the pattern
236
+ // in-place, we avoid the penalty.
237
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
238
+ const __m128i shuffle_mask = _mm_load_si128(
239
+ reinterpret_cast<const __m128i*>(pshufb_fill_patterns)
240
+ + pattern_size - 1);
241
+ const __m128i pattern = _mm_shuffle_epi8(
242
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
243
+ // Uninitialized bytes are masked out by the shuffle mask.
244
+ // TODO: remove annotation and macro defs once MSan is fixed.
245
+ SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
246
+ pattern_size *= 16 / pattern_size;
247
+ char* op_end = std::min(op_limit, buf_limit - 15);
248
+ while (op < op_end) {
249
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
250
+ op += pattern_size;
251
+ }
252
+ if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
253
+ }
254
+ return IncrementalCopySlow(src, op, op_limit);
255
+ #else // !SNAPPY_HAVE_SSSE3
256
+ // If plenty of buffer space remains, expand the pattern to at least 8
257
+ // bytes. The way the following loop is written, we need 8 bytes of buffer
258
+ // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
259
+ // bytes if pattern_size is 2. Precisely encoding that is probably not
260
+ // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
261
+ // (because 11 are required in the worst case).
262
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
263
+ while (pattern_size < 8) {
264
+ UnalignedCopy64(src, op);
265
+ op += pattern_size;
266
+ pattern_size *= 2;
267
+ }
268
+ if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
269
+ } else {
270
+ return IncrementalCopySlow(src, op, op_limit);
271
+ }
272
+ #endif // SNAPPY_HAVE_SSSE3
273
+ }
274
+ assert(pattern_size >= 8);
275
+
276
+ // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single
277
+ // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
278
+ // because expanding the pattern to at least 8 bytes guarantees that
279
+ // op - src >= 8.
280
+ //
281
+ // Typically, the op_limit is the gating factor so try to simplify the loop
282
+ // based on that.
283
+ if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
284
+ // There is at least one, and at most four 16-byte blocks. Writing four
285
+ // conditionals instead of a loop allows FDO to layout the code with respect
286
+ // to the actual probabilities of each length.
287
+ // TODO: Replace with loop with trip count hint.
288
+ UnalignedCopy64(src, op);
289
+ UnalignedCopy64(src + 8, op + 8);
137
290
 
138
- const int kMaxIncrementCopyOverflow = 10;
291
+ if (op + 16 < op_limit) {
292
+ UnalignedCopy64(src + 16, op + 16);
293
+ UnalignedCopy64(src + 24, op + 24);
294
+ }
295
+ if (op + 32 < op_limit) {
296
+ UnalignedCopy64(src + 32, op + 32);
297
+ UnalignedCopy64(src + 40, op + 40);
298
+ }
299
+ if (op + 48 < op_limit) {
300
+ UnalignedCopy64(src + 48, op + 48);
301
+ UnalignedCopy64(src + 56, op + 56);
302
+ }
303
+ return op_limit;
304
+ }
139
305
 
140
- inline void IncrementalCopyFastPath(const char* src, char* op, ssize_t len) {
141
- while (op - src < 8) {
306
+ // Fall back to doing as much as we can with the available slop in the
307
+ // buffer. This code path is relatively cold however so we save code size by
308
+ // avoiding unrolling and vectorizing.
309
+ //
310
+ // TODO: Remove pragma when cold regions don't get vectorized
311
+ // or unrolled.
312
+ #ifdef __clang__
313
+ #pragma clang loop unroll(disable)
314
+ #endif
315
+ for (char *op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
142
316
  UnalignedCopy64(src, op);
143
- len -= op - src;
144
- op += op - src;
317
+ UnalignedCopy64(src + 8, op + 8);
145
318
  }
146
- while (len > 0) {
319
+ if (op >= op_limit)
320
+ return op_limit;
321
+
322
+ // We only take this branch if we didn't have enough slop and we can do a
323
+ // single 8 byte copy.
324
+ if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
147
325
  UnalignedCopy64(src, op);
148
326
  src += 8;
149
327
  op += 8;
150
- len -= 8;
151
328
  }
329
+ return IncrementalCopySlow(src, op, op_limit);
152
330
  }
153
331
 
154
332
  } // namespace
155
333
 
334
+ template <bool allow_fast_path>
156
335
  static inline char* EmitLiteral(char* op,
157
336
  const char* literal,
158
- int len,
159
- bool allow_fast_path) {
160
- int n = len - 1; // Zero-length literals are disallowed
161
- if (n < 60) {
337
+ int len) {
338
+ // The vast majority of copies are below 16 bytes, for which a
339
+ // call to memcpy is overkill. This fast path can sometimes
340
+ // copy up to 15 bytes too much, but that is okay in the
341
+ // main loop, since we have a bit to go on for both sides:
342
+ //
343
+ // - The input will always have kInputMarginBytes = 15 extra
344
+ // available bytes, as long as we're in the main loop, and
345
+ // if not, allow_fast_path = false.
346
+ // - The output will always have 32 spare bytes (see
347
+ // MaxCompressedLength).
348
+ assert(len > 0); // Zero-length literals are disallowed
349
+ int n = len - 1;
350
+ if (allow_fast_path && len <= 16) {
162
351
  // Fits in tag byte
163
352
  *op++ = LITERAL | (n << 2);
164
353
 
165
- // The vast majority of copies are below 16 bytes, for which a
166
- // call to memcpy is overkill. This fast path can sometimes
167
- // copy up to 15 bytes too much, but that is okay in the
168
- // main loop, since we have a bit to go on for both sides:
169
- //
170
- // - The input will always have kInputMarginBytes = 15 extra
171
- // available bytes, as long as we're in the main loop, and
172
- // if not, allow_fast_path = false.
173
- // - The output will always have 32 spare bytes (see
174
- // MaxCompressedLength).
175
- if (allow_fast_path && len <= 16) {
176
- UnalignedCopy64(literal, op);
177
- UnalignedCopy64(literal + 8, op + 8);
178
- return op + len;
179
- }
354
+ UnalignedCopy128(literal, op);
355
+ return op + len;
356
+ }
357
+
358
+ if (n < 60) {
359
+ // Fits in tag byte
360
+ *op++ = LITERAL | (n << 2);
180
361
  } else {
181
- // Encode in upcoming bytes
182
- char* base = op;
183
- int count = 0;
184
- op++;
185
- while (n > 0) {
186
- *op++ = n & 0xff;
187
- n >>= 8;
188
- count++;
189
- }
362
+ int count = (Bits::Log2Floor(n) >> 3) + 1;
190
363
  assert(count >= 1);
191
364
  assert(count <= 4);
192
- *base = LITERAL | ((59+count) << 2);
365
+ *op++ = LITERAL | ((59 + count) << 2);
366
+ // Encode in upcoming bytes.
367
+ // Write 4 bytes, though we may care about only 1 of them. The output buffer
368
+ // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds
369
+ // here and there is a memcpy of size 'len' below.
370
+ LittleEndian::Store32(op, n);
371
+ op += count;
193
372
  }
194
373
  memcpy(op, literal, len);
195
374
  return op + len;
196
375
  }
197
376
 
198
- static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) {
377
+ template <bool len_less_than_12>
378
+ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
199
379
  assert(len <= 64);
200
380
  assert(len >= 4);
201
381
  assert(offset < 65536);
382
+ assert(len_less_than_12 == (len < 12));
202
383
 
203
- if ((len < 12) && (offset < 2048)) {
204
- size_t len_minus_4 = len - 4;
205
- assert(len_minus_4 < 8); // Must fit in 3 bits
206
- *op++ = COPY_1_BYTE_OFFSET + ((len_minus_4) << 2) + ((offset >> 8) << 5);
384
+ if (len_less_than_12 && SNAPPY_PREDICT_TRUE(offset < 2048)) {
385
+ // offset fits in 11 bits. The 3 highest go in the top of the first byte,
386
+ // and the rest go in the second byte.
387
+ *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0);
207
388
  *op++ = offset & 0xff;
208
389
  } else {
209
- *op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2);
210
- LittleEndian::Store16(op, offset);
211
- op += 2;
390
+ // Write 4 bytes, though we only care about 3 of them. The output buffer
391
+ // is required to have some slack, so the extra byte won't overrun it.
392
+ uint32 u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8);
393
+ LittleEndian::Store32(op, u);
394
+ op += 3;
212
395
  }
213
396
  return op;
214
397
  }
215
398
 
216
- static inline char* EmitCopy(char* op, size_t offset, int len) {
217
- // Emit 64 byte copies but make sure to keep at least four bytes reserved
218
- while (len >= 68) {
219
- op = EmitCopyLessThan64(op, offset, 64);
220
- len -= 64;
221
- }
399
+ template <bool len_less_than_12>
400
+ static inline char* EmitCopy(char* op, size_t offset, size_t len) {
401
+ assert(len_less_than_12 == (len < 12));
402
+ if (len_less_than_12) {
403
+ return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
404
+ } else {
405
+ // A special case for len <= 64 might help, but so far measurements suggest
406
+ // it's in the noise.
222
407
 
223
- // Emit an extra 60 byte copy if have too much data to fit in one copy
224
- if (len > 64) {
225
- op = EmitCopyLessThan64(op, offset, 60);
226
- len -= 60;
227
- }
408
+ // Emit 64 byte copies but make sure to keep at least four bytes reserved.
409
+ while (SNAPPY_PREDICT_FALSE(len >= 68)) {
410
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
411
+ len -= 64;
412
+ }
228
413
 
229
- // Emit remainder
230
- op = EmitCopyLessThan64(op, offset, len);
231
- return op;
232
- }
414
+ // One or two copies will now finish the job.
415
+ if (len > 64) {
416
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
417
+ len -= 60;
418
+ }
233
419
 
420
+ // Emit remainder.
421
+ if (len < 12) {
422
+ op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
423
+ } else {
424
+ op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
425
+ }
426
+ return op;
427
+ }
428
+ }
234
429
 
235
430
  bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
236
431
  uint32 v = 0;
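The EmitCopyAtMost64/EmitCopy changes above pack a copy into a tag byte exactly as the comments describe: for lengths 4–11 with offsets below 2048, the two-bit tag, three length bits, and the top three offset bits share the first byte, with the low offset byte following; otherwise the 2-byte-offset form is used. A minimal standalone sketch of that packing (illustrative only, not the gem's or libsnappy's API; the 4-byte-offset form and all buffer management are omitted):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Tag types from the snappy format (low two bits of the tag byte).
constexpr uint8_t kCopy1ByteOffset = 1;
constexpr uint8_t kCopy2ByteOffset = 2;

// Pack one copy of length 4..64 at offset < 65536 into "out"; returns the
// number of bytes written (2 or 3).
inline size_t EmitCopySketch(uint8_t* out, size_t offset, size_t len) {
  assert(len >= 4 && len <= 64 && offset < 65536);
  if (len < 12 && offset < 2048) {
    // 3 length bits (len - 4) in bits 2..4, offset bits 8..10 in bits 5..7,
    // low eight offset bits in the second byte.
    out[0] = static_cast<uint8_t>(kCopy1ByteOffset | ((len - 4) << 2) |
                                  ((offset >> 8) << 5));
    out[1] = static_cast<uint8_t>(offset & 0xff);
    return 2;
  }
  // 6 length bits (len - 1) in the tag byte, 16-bit little-endian offset next.
  out[0] = static_cast<uint8_t>(kCopy2ByteOffset | ((len - 1) << 2));
  out[1] = static_cast<uint8_t>(offset & 0xff);
  out[2] = static_cast<uint8_t>((offset >> 8) & 0xff);
  return 3;
}

For example, a copy of length 5 at offset 10 packs to the two bytes 0x05 0x0a.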
@@ -243,31 +438,45 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
  }
  }

- namespace internal {
- uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
- // Use smaller hash table when input.size() is smaller, since we
- // fill the table, incurring O(hash table size) overhead for
- // compression, and if the input is short, we won't need that
- // many hash table entries anyway.
- assert(kMaxHashTableSize >= 256);
- size_t htsize = 256;
- while (htsize < kMaxHashTableSize && htsize < input_size) {
- htsize <<= 1;
+ namespace {
+ uint32 CalculateTableSize(uint32 input_size) {
+ static_assert(
+ kMaxHashTableSize >= kMinHashTableSize,
+ "kMaxHashTableSize should be greater or equal to kMinHashTableSize.");
+ if (input_size > kMaxHashTableSize) {
+ return kMaxHashTableSize;
  }
-
- uint16* table;
- if (htsize <= ARRAYSIZE(small_table_)) {
- table = small_table_;
- } else {
- if (large_table_ == NULL) {
- large_table_ = new uint16[kMaxHashTableSize];
- }
- table = large_table_;
+ if (input_size < kMinHashTableSize) {
+ return kMinHashTableSize;
  }
+ // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
+ // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
+ return 2u << Bits::Log2Floor(input_size - 1);
+ }
+ } // namespace

+ namespace internal {
+ WorkingMemory::WorkingMemory(size_t input_size) {
+ const size_t max_fragment_size = std::min(input_size, kBlockSize);
+ const size_t table_size = CalculateTableSize(max_fragment_size);
+ size_ = table_size * sizeof(*table_) + max_fragment_size +
+ MaxCompressedLength(max_fragment_size);
+ mem_ = std::allocator<char>().allocate(size_);
+ table_ = reinterpret_cast<uint16*>(mem_);
+ input_ = mem_ + table_size * sizeof(*table_);
+ output_ = input_ + max_fragment_size;
+ }
+
+ WorkingMemory::~WorkingMemory() {
+ std::allocator<char>().deallocate(mem_, size_);
+ }
+
+ uint16* WorkingMemory::GetHashTable(size_t fragment_size,
+ int* table_size) const {
+ const size_t htsize = CalculateTableSize(fragment_size);
+ memset(table_, 0, htsize * sizeof(*table_));
  *table_size = htsize;
- memset(table, 0, htsize * sizeof(*table));
- return table;
+ return table_;
  }
  } // end namespace internal

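CalculateTableSize above clamps the size to [kMinHashTableSize, kMaxHashTableSize] and otherwise rounds the fragment size up to the next power of two; the expression 2 << Log2Floor(input_size - 1) is the Log2Ceiling trick its comment mentions. A small self-contained check of that identity (hypothetical helper names, not part of the library):

#include <cstdint>

// Portable stand-in for Bits::Log2Floor (index of the highest set bit).
inline int Log2FloorSketch(uint32_t n) {
  int log = -1;
  while (n != 0) {
    n >>= 1;
    ++log;
  }
  return log;
}

// Round n up to the next power of two, assuming n > 1 and no clamping.
// Matches the diff's "2 << Log2Floor(input_size - 1)" expression.
inline uint32_t RoundUpToPow2Sketch(uint32_t n) {
  return 2u << Log2FloorSketch(n - 1);
}

// e.g. RoundUpToPow2Sketch(1000) == 1024 and RoundUpToPow2Sketch(1024) == 1024,
// so a 1000-byte fragment still gets a 1024-entry hash table.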
@@ -334,7 +543,7 @@ char* CompressFragment(const char* input,
334
543
  // "ip" is the input pointer, and "op" is the output pointer.
335
544
  const char* ip = input;
336
545
  assert(input_size <= kBlockSize);
337
- assert((table_size & (table_size - 1)) == 0); // table must be power of two
546
+ assert((table_size & (table_size - 1)) == 0); // table must be power of two
338
547
  const int shift = 32 - Bits::Log2Floor(table_size);
339
548
  assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
340
549
  const char* ip_end = input + input_size;
@@ -344,7 +553,7 @@ char* CompressFragment(const char* input,
344
553
  const char* next_emit = ip;
345
554
 
346
555
  const size_t kInputMarginBytes = 15;
347
- if (PREDICT_TRUE(input_size >= kInputMarginBytes)) {
556
+ if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
348
557
  const char* ip_limit = input + input_size - kInputMarginBytes;
349
558
 
350
559
  for (uint32 next_hash = Hash(++ip, shift); ; ) {
@@ -364,9 +573,9 @@ char* CompressFragment(const char* input,
364
573
  //
365
574
  // Heuristic match skipping: If 32 bytes are scanned with no matches
366
575
  // found, start looking only at every other byte. If 32 more bytes are
367
- // scanned, look at every third byte, etc.. When a match is found,
368
- // immediately go back to looking at every byte. This is a small loss
369
- // (~5% performance, ~0.1% density) for compressible data due to more
576
+ // scanned (or skipped), look at every third byte, etc.. When a match is
577
+ // found, immediately go back to looking at every byte. This is a small
578
+ // loss (~5% performance, ~0.1% density) for compressible data due to more
370
579
  // bookkeeping, but for non-compressible data (such as JPEG) it's a huge
371
580
  // win since the compressor quickly "realizes" the data is incompressible
372
581
  // and doesn't bother looking for matches everywhere.
@@ -382,9 +591,10 @@ char* CompressFragment(const char* input,
382
591
  ip = next_ip;
383
592
  uint32 hash = next_hash;
384
593
  assert(hash == Hash(ip, shift));
385
- uint32 bytes_between_hash_lookups = skip++ >> 5;
594
+ uint32 bytes_between_hash_lookups = skip >> 5;
595
+ skip += bytes_between_hash_lookups;
386
596
  next_ip = ip + bytes_between_hash_lookups;
387
- if (PREDICT_FALSE(next_ip > ip_limit)) {
597
+ if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
388
598
  goto emit_remainder;
389
599
  }
390
600
  next_hash = Hash(next_ip, shift);
@@ -393,14 +603,14 @@ char* CompressFragment(const char* input,
393
603
  assert(candidate < ip);
394
604
 
395
605
  table[hash] = ip - base_ip;
396
- } while (PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
397
- UNALIGNED_LOAD32(candidate)));
606
+ } while (SNAPPY_PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
607
+ UNALIGNED_LOAD32(candidate)));
398
608
 
399
609
  // Step 2: A 4-byte match has been found. We'll later see if more
400
610
  // than 4 bytes match. But, prior to the match, input
401
611
  // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
402
612
  assert(next_emit + 16 <= ip_end);
403
- op = EmitLiteral(op, next_emit, ip - next_emit, true);
613
+ op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);
404
614
 
405
615
  // Step 3: Call EmitCopy, and then see if another EmitCopy could
406
616
  // be our next move. Repeat until we find no match for the
@@ -417,19 +627,25 @@ char* CompressFragment(const char* input,
417
627
  // We have a 4-byte match at ip, and no need to emit any
418
628
  // "literal bytes" prior to ip.
419
629
  const char* base = ip;
420
- int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
630
+ std::pair<size_t, bool> p =
631
+ FindMatchLength(candidate + 4, ip + 4, ip_end);
632
+ size_t matched = 4 + p.first;
421
633
  ip += matched;
422
634
  size_t offset = base - candidate;
423
635
  assert(0 == memcmp(base, candidate, matched));
424
- op = EmitCopy(op, offset, matched);
425
- // We could immediately start working at ip now, but to improve
426
- // compression we first update table[Hash(ip - 1, ...)].
427
- const char* insert_tail = ip - 1;
636
+ if (p.second) {
637
+ op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
638
+ } else {
639
+ op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
640
+ }
428
641
  next_emit = ip;
429
- if (PREDICT_FALSE(ip >= ip_limit)) {
642
+ if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
430
643
  goto emit_remainder;
431
644
  }
432
- input_bytes = GetEightBytesAt(insert_tail);
645
+ // We are now looking for a 4-byte match again. We read
646
+ // table[Hash(ip, shift)] for that. To improve compression,
647
+ // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
648
+ input_bytes = GetEightBytesAt(ip - 1);
433
649
  uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
434
650
  table[prev_hash] = ip - base_ip - 1;
435
651
  uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
@@ -446,13 +662,18 @@ char* CompressFragment(const char* input,
446
662
  emit_remainder:
447
663
  // Emit the remaining bytes as a literal
448
664
  if (next_emit < ip_end) {
449
- op = EmitLiteral(op, next_emit, ip_end - next_emit, false);
665
+ op = EmitLiteral</*allow_fast_path=*/false>(op, next_emit,
666
+ ip_end - next_emit);
450
667
  }
451
668
 
452
669
  return op;
453
670
  }
454
671
  } // end namespace internal
455
672
 
673
+ // Called back at every compression call to trace parameters and sizes.
674
+ static inline void Report(const char *algorithm, size_t compressed_size,
675
+ size_t uncompressed_size) {}
676
+
456
677
  // Signature of output types needed by decompression code.
457
678
  // The decompression code is templatized on a type that obeys this
458
679
  // signature so that we do not pay virtual function call overhead in
@@ -493,162 +714,28 @@ char* CompressFragment(const char* input,
493
714
  // bool TryFastAppend(const char* ip, size_t available, size_t length);
494
715
  // };
495
716
 
496
- // -----------------------------------------------------------------------
497
- // Lookup table for decompression code. Generated by ComputeTable() below.
498
- // -----------------------------------------------------------------------
499
-
500
- // Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
501
- static const uint32 wordmask[] = {
502
- 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
503
- };
504
-
505
- // Data stored per entry in lookup table:
506
- // Range Bits-used Description
507
- // ------------------------------------
508
- // 1..64 0..7 Literal/copy length encoded in opcode byte
509
- // 0..7 8..10 Copy offset encoded in opcode byte / 256
510
- // 0..4 11..13 Extra bytes after opcode
511
- //
512
- // We use eight bits for the length even though 7 would have sufficed
513
- // because of efficiency reasons:
514
- // (1) Extracting a byte is faster than a bit-field
515
- // (2) It properly aligns copy offset so we do not need a <<8
516
- static const uint16 char_table[256] = {
517
- 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
518
- 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
519
- 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
520
- 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
521
- 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
522
- 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
523
- 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
524
- 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
525
- 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
526
- 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
527
- 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
528
- 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
529
- 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
530
- 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
531
- 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
532
- 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
533
- 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
534
- 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
535
- 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
536
- 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
537
- 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
538
- 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
539
- 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
540
- 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
541
- 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
542
- 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
543
- 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
544
- 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
545
- 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
546
- 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
547
- 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
548
- 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
549
- };
550
-
551
- // In debug mode, allow optional computation of the table at startup.
552
- // Also, check that the decompression table is correct.
553
- #ifndef NDEBUG
554
- DEFINE_bool(snappy_dump_decompression_table, false,
555
- "If true, we print the decompression table at startup.");
556
-
557
- static uint16 MakeEntry(unsigned int extra,
558
- unsigned int len,
559
- unsigned int copy_offset) {
560
- // Check that all of the fields fit within the allocated space
561
- assert(extra == (extra & 0x7)); // At most 3 bits
562
- assert(copy_offset == (copy_offset & 0x7)); // At most 3 bits
563
- assert(len == (len & 0x7f)); // At most 7 bits
564
- return len | (copy_offset << 8) | (extra << 11);
717
+ static inline uint32 ExtractLowBytes(uint32 v, int n) {
718
+ assert(n >= 0);
719
+ assert(n <= 4);
720
+ #if SNAPPY_HAVE_BMI2
721
+ return _bzhi_u32(v, 8 * n);
722
+ #else
723
+ // This needs to be wider than uint32 otherwise `mask << 32` will be
724
+ // undefined.
725
+ uint64 mask = 0xffffffff;
726
+ return v & ~(mask << (8 * n));
727
+ #endif
565
728
  }
566
729
 
567
- static void ComputeTable() {
568
- uint16 dst[256];
569
-
570
- // Place invalid entries in all places to detect missing initialization
571
- int assigned = 0;
572
- for (int i = 0; i < 256; i++) {
573
- dst[i] = 0xffff;
574
- }
575
-
576
- // Small LITERAL entries. We store (len-1) in the top 6 bits.
577
- for (unsigned int len = 1; len <= 60; len++) {
578
- dst[LITERAL | ((len-1) << 2)] = MakeEntry(0, len, 0);
579
- assigned++;
580
- }
581
-
582
- // Large LITERAL entries. We use 60..63 in the high 6 bits to
583
- // encode the number of bytes of length info that follow the opcode.
584
- for (unsigned int extra_bytes = 1; extra_bytes <= 4; extra_bytes++) {
585
- // We set the length field in the lookup table to 1 because extra
586
- // bytes encode len-1.
587
- dst[LITERAL | ((extra_bytes+59) << 2)] = MakeEntry(extra_bytes, 1, 0);
588
- assigned++;
589
- }
590
-
591
- // COPY_1_BYTE_OFFSET.
592
- //
593
- // The tag byte in the compressed data stores len-4 in 3 bits, and
594
- // offset/256 in 5 bits. offset%256 is stored in the next byte.
595
- //
596
- // This format is used for length in range [4..11] and offset in
597
- // range [0..2047]
598
- for (unsigned int len = 4; len < 12; len++) {
599
- for (unsigned int offset = 0; offset < 2048; offset += 256) {
600
- dst[COPY_1_BYTE_OFFSET | ((len-4)<<2) | ((offset>>8)<<5)] =
601
- MakeEntry(1, len, offset>>8);
602
- assigned++;
603
- }
604
- }
605
-
606
- // COPY_2_BYTE_OFFSET.
607
- // Tag contains len-1 in top 6 bits, and offset in next two bytes.
608
- for (unsigned int len = 1; len <= 64; len++) {
609
- dst[COPY_2_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(2, len, 0);
610
- assigned++;
611
- }
612
-
613
- // COPY_4_BYTE_OFFSET.
614
- // Tag contents len-1 in top 6 bits, and offset in next four bytes.
615
- for (unsigned int len = 1; len <= 64; len++) {
616
- dst[COPY_4_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(4, len, 0);
617
- assigned++;
618
- }
619
-
620
- // Check that each entry was initialized exactly once.
621
- if (assigned != 256) {
622
- fprintf(stderr, "ComputeTable: assigned only %d of 256\n", assigned);
623
- abort();
624
- }
625
- for (int i = 0; i < 256; i++) {
626
- if (dst[i] == 0xffff) {
627
- fprintf(stderr, "ComputeTable: did not assign byte %d\n", i);
628
- abort();
629
- }
630
- }
631
-
632
- if (FLAGS_snappy_dump_decompression_table) {
633
- printf("static const uint16 char_table[256] = {\n ");
634
- for (int i = 0; i < 256; i++) {
635
- printf("0x%04x%s",
636
- dst[i],
637
- ((i == 255) ? "\n" : (((i%8) == 7) ? ",\n " : ", ")));
638
- }
639
- printf("};\n");
640
- }
641
-
642
- // Check that computed table matched recorded table
643
- for (int i = 0; i < 256; i++) {
644
- if (dst[i] != char_table[i]) {
645
- fprintf(stderr, "ComputeTable: byte %d: computed (%x), expect (%x)\n",
646
- i, static_cast<int>(dst[i]), static_cast<int>(char_table[i]));
647
- abort();
648
- }
649
- }
730
+ static inline bool LeftShiftOverflows(uint8 value, uint32 shift) {
731
+ assert(shift < 32);
732
+ static const uint8 masks[] = {
733
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
734
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
735
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
736
+ 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
737
+ return (value & masks[shift]) != 0;
650
738
  }
651
- #endif /* !NDEBUG */
652
739
 
653
740
  // Helper class for decompression
654
741
  class SnappyDecompressor {
@@ -687,7 +774,7 @@ class SnappyDecompressor {
687
774
  }
688
775
 
689
776
  // Read the uncompressed length stored at the start of the compressed data.
690
- // On succcess, stores the length in *result and returns true.
777
+ // On success, stores the length in *result and returns true.
691
778
  // On failure, returns false.
692
779
  bool ReadUncompressedLength(uint32* result) {
693
780
  assert(ip_ == NULL); // Must not have read anything yet
@@ -701,7 +788,9 @@ class SnappyDecompressor {
  if (n == 0) return false;
  const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
  reader_->Skip(1);
- *result |= static_cast<uint32>(c & 0x7f) << shift;
+ uint32 val = c & 0x7f;
+ if (LeftShiftOverflows(static_cast<uint8>(val), shift)) return false;
+ *result |= val << shift;
  if (c < 128) {
  break;
  }
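The ReadUncompressedLength change above adds a LeftShiftOverflows guard so that a malformed varint preamble cannot silently wrap the 32-bit length. As a rough illustration only (not the class in the diff, which reads through a Source), the same little-endian base-128 decode over a plain byte span might look like:

#include <cstddef>
#include <cstdint>

// Decode the uncompressed-length varint at the start of a snappy stream from
// a plain buffer. Returns false on truncation, overlong input, or a value
// that would not fit in 32 bits (the case LeftShiftOverflows rejects).
inline bool ReadLengthSketch(const uint8_t* p, size_t n, uint32_t* result) {
  uint32_t value = 0;
  for (uint32_t shift = 0; shift <= 28; shift += 7) {
    if (n == 0) return false;            // ran out of input
    const uint32_t byte = *p++;
    --n;
    const uint32_t bits = byte & 0x7f;
    if (shift == 28 && bits > 0x0f) return false;  // would overflow 32 bits
    value |= bits << shift;
    if (byte < 128) {                    // continuation bit clear: done
      *result = value;
      return true;
    }
  }
  return false;                          // more than five length bytes
}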
@@ -713,9 +802,27 @@ class SnappyDecompressor {
713
802
  // Process the next item found in the input.
714
803
  // Returns true if successful, false on error or end of input.
715
804
  template <class Writer>
805
+ #if defined(__GNUC__) && defined(__x86_64__)
806
+ __attribute__((aligned(32)))
807
+ #endif
716
808
  void DecompressAllTags(Writer* writer) {
717
- const char* ip = ip_;
809
+ // In x86, pad the function body to start 16 bytes later. This function has
810
+ // a couple of hotspots that are highly sensitive to alignment: we have
811
+ // observed regressions by more than 20% in some metrics just by moving the
812
+ // exact same code to a different position in the benchmark binary.
813
+ //
814
+ // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit
815
+ // the "lucky" case consistently. Unfortunately, this is a very brittle
816
+ // workaround, and future differences in code generation may reintroduce
817
+ // this regression. If you experience a big, difficult to explain, benchmark
818
+ // performance regression here, first try removing this hack.
819
+ #if defined(__GNUC__) && defined(__x86_64__)
820
+ // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions.
821
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
822
+ asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
823
+ #endif
718
824
 
825
+ const char* ip = ip_;
719
826
  // We could have put this refill fragment only at the beginning of the loop.
720
827
  // However, duplicating it at the end of each branch gives the compiler more
721
828
  // scope to optimize the <ip_limit_ - ip> expression based on the local
@@ -731,21 +838,34 @@ class SnappyDecompressor {
731
838
  for ( ;; ) {
732
839
  const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
733
840
 
734
- if ((c & 0x3) == LITERAL) {
841
+ // Ratio of iterations that have LITERAL vs non-LITERAL for different
842
+ // inputs.
843
+ //
844
+ // input LITERAL NON_LITERAL
845
+ // -----------------------------------
846
+ // html|html4|cp 23% 77%
847
+ // urls 36% 64%
848
+ // jpg 47% 53%
849
+ // pdf 19% 81%
850
+ // txt[1-4] 25% 75%
851
+ // pb 24% 76%
852
+ // bin 24% 76%
853
+ if (SNAPPY_PREDICT_FALSE((c & 0x3) == LITERAL)) {
735
854
  size_t literal_length = (c >> 2) + 1u;
736
855
  if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
737
856
  assert(literal_length < 61);
738
857
  ip += literal_length;
739
- // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
858
+ // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
740
859
  // will not return true unless there's already at least five spare
741
860
  // bytes in addition to the literal.
742
861
  continue;
743
862
  }
744
- if (PREDICT_FALSE(literal_length >= 61)) {
863
+ if (SNAPPY_PREDICT_FALSE(literal_length >= 61)) {
745
864
  // Long literal.
746
865
  const size_t literal_length_length = literal_length - 60;
747
866
  literal_length =
748
- (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
867
+ ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +
868
+ 1;
749
869
  ip += literal_length_length;
750
870
  }
751
871
 
@@ -767,15 +887,16 @@ class SnappyDecompressor {
767
887
  ip += literal_length;
768
888
  MAYBE_REFILL();
769
889
  } else {
770
- const uint32 entry = char_table[c];
771
- const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
772
- const uint32 length = entry & 0xff;
890
+ const size_t entry = char_table[c];
891
+ const size_t trailer =
892
+ ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11);
893
+ const size_t length = entry & 0xff;
773
894
  ip += entry >> 11;
774
895
 
775
896
  // copy_offset/256 is encoded in bits 8..10. By just fetching
776
897
  // those bits, we get copy_offset (since the bit-field starts at
777
898
  // bit 8).
778
- const uint32 copy_offset = entry & 0x700;
899
+ const size_t copy_offset = entry & 0x700;
779
900
  if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
780
901
  return;
781
902
  }
@@ -795,10 +916,8 @@ bool SnappyDecompressor::RefillTag() {
795
916
  size_t n;
796
917
  ip = reader_->Peek(&n);
797
918
  peeked_ = n;
798
- if (n == 0) {
799
- eof_ = true;
800
- return false;
801
- }
919
+ eof_ = (n == 0);
920
+ if (eof_) return false;
802
921
  ip_limit_ = ip + n;
803
922
  }
804
923
 
@@ -823,7 +942,7 @@ bool SnappyDecompressor::RefillTag() {
823
942
  size_t length;
824
943
  const char* src = reader_->Peek(&length);
825
944
  if (length == 0) return false;
826
- uint32 to_add = min<uint32>(needed - nbuf, length);
945
+ uint32 to_add = std::min<uint32>(needed - nbuf, length);
827
946
  memcpy(scratch_ + nbuf, src, to_add);
828
947
  nbuf += to_add;
829
948
  reader_->Skip(to_add);
@@ -852,17 +971,23 @@ static bool InternalUncompress(Source* r, Writer* writer) {
852
971
  SnappyDecompressor decompressor(r);
853
972
  uint32 uncompressed_len = 0;
854
973
  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;
855
- return InternalUncompressAllTags(&decompressor, writer, uncompressed_len);
974
+
975
+ return InternalUncompressAllTags(&decompressor, writer, r->Available(),
976
+ uncompressed_len);
856
977
  }
857
978
 
858
979
  template <typename Writer>
859
980
  static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
860
981
  Writer* writer,
982
+ uint32 compressed_len,
861
983
  uint32 uncompressed_len) {
984
+ Report("snappy_uncompress", compressed_len, uncompressed_len);
985
+
862
986
  writer->SetExpectedLength(uncompressed_len);
863
987
 
864
988
  // Process the entire input
865
989
  decompressor->DecompressAllTags(writer);
990
+ writer->Flush();
866
991
  return (decompressor->eof() && writer->CheckLength());
867
992
  }
868
993
 
@@ -874,21 +999,20 @@ bool GetUncompressedLength(Source* source, uint32* result) {
874
999
  size_t Compress(Source* reader, Sink* writer) {
875
1000
  size_t written = 0;
876
1001
  size_t N = reader->Available();
1002
+ const size_t uncompressed_size = N;
877
1003
  char ulength[Varint::kMax32];
878
1004
  char* p = Varint::Encode32(ulength, N);
879
1005
  writer->Append(ulength, p-ulength);
880
1006
  written += (p - ulength);
881
1007
 
882
- internal::WorkingMemory wmem;
883
- char* scratch = NULL;
884
- char* scratch_output = NULL;
1008
+ internal::WorkingMemory wmem(N);
885
1009
 
886
1010
  while (N > 0) {
887
1011
  // Get next block to compress (without copying if possible)
888
1012
  size_t fragment_size;
889
1013
  const char* fragment = reader->Peek(&fragment_size);
890
1014
  assert(fragment_size != 0); // premature end of input
891
- const size_t num_to_read = min(N, kBlockSize);
1015
+ const size_t num_to_read = std::min(N, kBlockSize);
892
1016
  size_t bytes_read = fragment_size;
893
1017
 
894
1018
  size_t pending_advance = 0;
@@ -897,19 +1021,13 @@ size_t Compress(Source* reader, Sink* writer) {
897
1021
  pending_advance = num_to_read;
898
1022
  fragment_size = num_to_read;
899
1023
  } else {
900
- // Read into scratch buffer
901
- if (scratch == NULL) {
902
- // If this is the last iteration, we want to allocate N bytes
903
- // of space, otherwise the max possible kBlockSize space.
904
- // num_to_read contains exactly the correct value
905
- scratch = new char[num_to_read];
906
- }
1024
+ char* scratch = wmem.GetScratchInput();
907
1025
  memcpy(scratch, fragment, bytes_read);
908
1026
  reader->Skip(bytes_read);
909
1027
 
910
1028
  while (bytes_read < num_to_read) {
911
1029
  fragment = reader->Peek(&fragment_size);
912
- size_t n = min<size_t>(fragment_size, num_to_read - bytes_read);
1030
+ size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
913
1031
  memcpy(scratch + bytes_read, fragment, n);
914
1032
  bytes_read += n;
915
1033
  reader->Skip(n);
@@ -929,16 +1047,13 @@ size_t Compress(Source* reader, Sink* writer) {
929
1047
 
930
1048
  // Need a scratch buffer for the output, in case the byte sink doesn't
931
1049
  // have room for us directly.
932
- if (scratch_output == NULL) {
933
- scratch_output = new char[max_output];
934
- } else {
935
- // Since we encode kBlockSize regions followed by a region
936
- // which is <= kBlockSize in length, a previously allocated
937
- // scratch_output[] region is big enough for this iteration.
938
- }
939
- char* dest = writer->GetAppendBuffer(max_output, scratch_output);
940
- char* end = internal::CompressFragment(fragment, fragment_size,
941
- dest, table, table_size);
1050
+
1051
+ // Since we encode kBlockSize regions followed by a region
1052
+ // which is <= kBlockSize in length, a previously allocated
1053
+ // scratch_output[] region is big enough for this iteration.
1054
+ char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
1055
+ char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
1056
+ table_size);
942
1057
  writer->Append(dest, end - dest);
943
1058
  written += (end - dest);
944
1059
 
@@ -946,8 +1061,7 @@ size_t Compress(Source* reader, Sink* writer) {
946
1061
  reader->Skip(pending_advance);
947
1062
  }
948
1063
 
949
- delete[] scratch;
950
- delete[] scratch_output;
1064
+ Report("snappy_compress", written, uncompressed_size);
951
1065
 
952
1066
  return written;
953
1067
  }
@@ -961,14 +1075,22 @@ size_t Compress(Source* reader, Sink* writer) {
961
1075
  // Writer template argument to SnappyDecompressor::DecompressAllTags().
962
1076
  class SnappyIOVecWriter {
963
1077
  private:
1078
+ // output_iov_end_ is set to iov + count and used to determine when
1079
+ // the end of the iovs is reached.
1080
+ const struct iovec* output_iov_end_;
1081
+
1082
+ #if !defined(NDEBUG)
964
1083
  const struct iovec* output_iov_;
965
- const size_t output_iov_count_;
1084
+ #endif // !defined(NDEBUG)
1085
+
1086
+ // Current iov that is being written into.
1087
+ const struct iovec* curr_iov_;
966
1088
 
967
- // We are currently writing into output_iov_[curr_iov_index_].
968
- int curr_iov_index_;
1089
+ // Pointer to current iov's write location.
1090
+ char* curr_iov_output_;
969
1091
 
970
- // Bytes written to output_iov_[curr_iov_index_] so far.
971
- size_t curr_iov_written_;
1092
+ // Remaining bytes to write into curr_iov_output.
1093
+ size_t curr_iov_remaining_;
972
1094
 
973
1095
  // Total bytes decompressed into output_iov_ so far.
974
1096
  size_t total_written_;
@@ -976,22 +1098,24 @@ class SnappyIOVecWriter {
976
1098
  // Maximum number of bytes that will be decompressed into output_iov_.
977
1099
  size_t output_limit_;
978
1100
 
979
- inline char* GetIOVecPointer(int index, size_t offset) {
980
- return reinterpret_cast<char*>(output_iov_[index].iov_base) +
981
- offset;
1101
+ static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
1102
+ return reinterpret_cast<char*>(iov->iov_base) + offset;
982
1103
  }
983
1104
 
984
1105
  public:
985
1106
  // Does not take ownership of iov. iov must be valid during the
986
1107
  // entire lifetime of the SnappyIOVecWriter.
987
1108
  inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
988
- : output_iov_(iov),
989
- output_iov_count_(iov_count),
990
- curr_iov_index_(0),
991
- curr_iov_written_(0),
1109
+ : output_iov_end_(iov + iov_count),
1110
+ #if !defined(NDEBUG)
1111
+ output_iov_(iov),
1112
+ #endif // !defined(NDEBUG)
1113
+ curr_iov_(iov),
1114
+ curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
1115
+ : nullptr),
1116
+ curr_iov_remaining_(iov_count ? iov->iov_len : 0),
992
1117
  total_written_(0),
993
- output_limit_(-1) {
994
- }
1118
+ output_limit_(-1) {}
995
1119
 
996
1120
  inline void SetExpectedLength(size_t len) {
997
1121
  output_limit_ = len;
@@ -1006,23 +1130,25 @@ class SnappyIOVecWriter {
1006
1130
  return false;
1007
1131
  }
1008
1132
 
1133
+ return AppendNoCheck(ip, len);
1134
+ }
1135
+
1136
+ inline bool AppendNoCheck(const char* ip, size_t len) {
1009
1137
  while (len > 0) {
1010
- assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
1011
- if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
1138
+ if (curr_iov_remaining_ == 0) {
1012
1139
  // This iovec is full. Go to the next one.
1013
- if (curr_iov_index_ + 1 >= output_iov_count_) {
1140
+ if (curr_iov_ + 1 >= output_iov_end_) {
1014
1141
  return false;
1015
1142
  }
1016
- curr_iov_written_ = 0;
1017
- ++curr_iov_index_;
1143
+ ++curr_iov_;
1144
+ curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
1145
+ curr_iov_remaining_ = curr_iov_->iov_len;
1018
1146
  }
1019
1147
 
1020
- const size_t to_write = std::min(
1021
- len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
1022
- memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
1023
- ip,
1024
- to_write);
1025
- curr_iov_written_ += to_write;
1148
+ const size_t to_write = std::min(len, curr_iov_remaining_);
1149
+ memcpy(curr_iov_output_, ip, to_write);
1150
+ curr_iov_output_ += to_write;
1151
+ curr_iov_remaining_ -= to_write;
1026
1152
  total_written_ += to_write;
1027
1153
  ip += to_write;
1028
1154
  len -= to_write;
@@ -1034,12 +1160,11 @@ class SnappyIOVecWriter {
1034
1160
  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
1035
1161
  const size_t space_left = output_limit_ - total_written_;
1036
1162
  if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
1037
- output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
1163
+ curr_iov_remaining_ >= 16) {
1038
1164
  // Fast path, used for the majority (about 95%) of invocations.
1039
- char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
1040
- UnalignedCopy64(ip, ptr);
1041
- UnalignedCopy64(ip + 8, ptr + 8);
1042
- curr_iov_written_ += len;
1165
+ UnalignedCopy128(ip, curr_iov_output_);
1166
+ curr_iov_output_ += len;
1167
+ curr_iov_remaining_ -= len;
1043
1168
  total_written_ += len;
1044
1169
  return true;
1045
1170
  }
@@ -1048,7 +1173,9 @@ class SnappyIOVecWriter {
1048
1173
  }
1049
1174
 
1050
1175
  inline bool AppendFromSelf(size_t offset, size_t len) {
1051
- if (offset > total_written_ || offset == 0) {
1176
+ // See SnappyArrayWriter::AppendFromSelf for an explanation of
1177
+ // the "offset - 1u" trick.
1178
+ if (offset - 1u >= total_written_) {
1052
1179
  return false;
1053
1180
  }
1054
1181
  const size_t space_left = output_limit_ - total_written_;
@@ -1057,8 +1184,8 @@ class SnappyIOVecWriter {
1057
1184
  }
1058
1185
 
1059
1186
  // Locate the iovec from which we need to start the copy.
1060
- int from_iov_index = curr_iov_index_;
1061
- size_t from_iov_offset = curr_iov_written_;
1187
+ const iovec* from_iov = curr_iov_;
1188
+ size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
1062
1189
  while (offset > 0) {
1063
1190
  if (from_iov_offset >= offset) {
1064
1191
  from_iov_offset -= offset;
@@ -1066,46 +1193,47 @@ class SnappyIOVecWriter {
       }

       offset -= from_iov_offset;
-      --from_iov_index;
-      assert(from_iov_index >= 0);
-      from_iov_offset = output_iov_[from_iov_index].iov_len;
+      --from_iov;
+#if !defined(NDEBUG)
+      assert(from_iov >= output_iov_);
+#endif  // !defined(NDEBUG)
+      from_iov_offset = from_iov->iov_len;
     }

     // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
     // the current iovec.
     while (len > 0) {
-      assert(from_iov_index <= curr_iov_index_);
-      if (from_iov_index != curr_iov_index_) {
-        const size_t to_copy = std::min(
-            output_iov_[from_iov_index].iov_len - from_iov_offset,
-            len);
-        Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
+      assert(from_iov <= curr_iov_);
+      if (from_iov != curr_iov_) {
+        const size_t to_copy =
+            std::min(from_iov->iov_len - from_iov_offset, len);
+        AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
         len -= to_copy;
         if (len > 0) {
-          ++from_iov_index;
+          ++from_iov;
           from_iov_offset = 0;
         }
       } else {
-        assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
-        size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
-                                      curr_iov_written_,
-                                  len);
+        size_t to_copy = curr_iov_remaining_;
         if (to_copy == 0) {
           // This iovec is full. Go to the next one.
-          if (curr_iov_index_ + 1 >= output_iov_count_) {
+          if (curr_iov_ + 1 >= output_iov_end_) {
             return false;
           }
-          ++curr_iov_index_;
-          curr_iov_written_ = 0;
+          ++curr_iov_;
+          curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+          curr_iov_remaining_ = curr_iov_->iov_len;
           continue;
         }
         if (to_copy > len) {
           to_copy = len;
         }
-        IncrementalCopy(GetIOVecPointer(from_iov_index, from_iov_offset),
-                        GetIOVecPointer(curr_iov_index_, curr_iov_written_),
-                        to_copy);
-        curr_iov_written_ += to_copy;
+
+        IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
+                        curr_iov_output_, curr_iov_output_ + to_copy,
+                        curr_iov_output_ + curr_iov_remaining_);
+        curr_iov_output_ += to_copy;
+        curr_iov_remaining_ -= to_copy;
         from_iov_offset += to_copy;
         total_written_ += to_copy;
         len -= to_copy;
@@ -1115,6 +1243,7 @@ class SnappyIOVecWriter {
     return true;
   }

+  inline void Flush() {}
 };

 bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
@@ -1145,7 +1274,8 @@ class SnappyArrayWriter {
  public:
   inline explicit SnappyArrayWriter(char* dst)
       : base_(dst),
-        op_(dst) {
+        op_(dst),
+        op_limit_(dst) {
   }

   inline void SetExpectedLength(size_t len) {
@@ -1172,8 +1302,7 @@ class SnappyArrayWriter {
     const size_t space_left = op_limit_ - op;
     if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16) {
       // Fast path, used for the majority (about 95%) of invocations.
-      UnalignedCopy64(ip, op);
-      UnalignedCopy64(ip + 8, op + 8);
+      UnalignedCopy128(ip, op);
       op_ = op + len;
       return true;
     } else {
@@ -1182,8 +1311,7 @@ class SnappyArrayWriter {
   }

   inline bool AppendFromSelf(size_t offset, size_t len) {
-    char* op = op_;
-    const size_t space_left = op_limit_ - op;
+    char* const op_end = op_ + len;

     // Check if we try to append from before the start of the buffer.
     // Normally this would just be a check for "produced < offset",
@@ -1192,29 +1320,16 @@ class SnappyArrayWriter {
     // to a very big number. This is convenient, as offset==0 is another
     // invalid case that we also want to catch, so that we do not go
     // into an infinite loop.
-    assert(op >= base_);
-    size_t produced = op - base_;
-    if (produced <= offset - 1u) {
-      return false;
-    }
-    if (len <= 16 && offset >= 8 && space_left >= 16) {
-      // Fast path, used for the majority (70-80%) of dynamic invocations.
-      UnalignedCopy64(op - offset, op);
-      UnalignedCopy64(op - offset + 8, op + 8);
-    } else {
-      if (space_left >= len + kMaxIncrementCopyOverflow) {
-        IncrementalCopyFastPath(op - offset, op, len);
-      } else {
-        if (space_left < len) {
-          return false;
-        }
-        IncrementalCopy(op - offset, op, len);
-      }
-    }
+    if (Produced() <= offset - 1u || op_end > op_limit_) return false;
+    op_ = IncrementalCopy(op_ - offset, op_, op_end, op_limit_);

-    op_ = op + len;
     return true;
   }
+  inline size_t Produced() const {
+    assert(op_ >= base_);
+    return op_ - base_;
+  }
+  inline void Flush() {}
 };

 bool RawUncompress(const char* compressed, size_t n, char* uncompressed) {
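The hunk above collapses SnappyArrayWriter::AppendFromSelf's explicit offset checks into the single "offset - 1u" comparison described in the surviving comment. The trick relies on unsigned wraparound: when offset is 0, offset - 1u becomes the largest size_t, so one comparison rejects both offset == 0 and offset greater than the bytes produced so far. A standalone C++ sketch (illustrative, not part of the diff):

    #include <cassert>
    #include <cstddef>

    // True when a back-reference of `offset` bytes is invalid after `produced`
    // bytes of output. Equivalent to `offset == 0 || offset > produced`,
    // but done with a single comparison.
    static bool InvalidCopy(std::size_t offset, std::size_t produced) {
      return offset - 1u >= produced;  // offset == 0 wraps to SIZE_MAX
    }

    int main() {
      assert(InvalidCopy(0, 10));    // offset == 0 is always invalid
      assert(InvalidCopy(11, 10));   // reaches before the start of the output
      assert(!InvalidCopy(10, 10));  // copying from the first output byte is fine
      assert(!InvalidCopy(1, 10));
    }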
@@ -1227,7 +1342,7 @@ bool RawUncompress(Source* compressed, char* uncompressed) {
   return InternalUncompress(compressed, &output);
 }

-bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
+bool Uncompress(const char* compressed, size_t n, std::string* uncompressed) {
   size_t ulength;
   if (!GetUncompressedLength(compressed, n, &ulength)) {
     return false;
@@ -1241,7 +1356,6 @@ bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
   return RawUncompress(compressed, n, string_as_array(uncompressed));
 }

-
 // A Writer that drops everything on the floor and just does validation
 class SnappyDecompressionValidator {
  private:
@@ -1249,7 +1363,7 @@ class SnappyDecompressionValidator {
   size_t produced_;

  public:
-  inline SnappyDecompressionValidator() : produced_(0) { }
+  inline SnappyDecompressionValidator() : expected_(0), produced_(0) { }
   inline void SetExpectedLength(size_t len) {
     expected_ = len;
   }
@@ -1270,6 +1384,7 @@ class SnappyDecompressionValidator {
     produced_ += len;
     return produced_ <= expected_;
   }
+  inline void Flush() {}
 };

 bool IsValidCompressedBuffer(const char* compressed, size_t n) {
@@ -1278,6 +1393,11 @@ bool IsValidCompressedBuffer(const char* compressed, size_t n) {
   return InternalUncompress(&reader, &writer);
 }

+bool IsValidCompressed(Source* compressed) {
+  SnappyDecompressionValidator writer;
+  return InternalUncompress(compressed, &writer);
+}
+
 void RawCompress(const char* input,
                  size_t input_length,
                  char* compressed,
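The hunk above adds a Source-based validation entry point, IsValidCompressed, next to the existing flat-buffer IsValidCompressedBuffer. A minimal usage sketch (illustrative, not part of the diff; it assumes the vendored snappy.h and snappy-sinksource.h are on the include path):

    #include <iostream>
    #include <string>
    #include "snappy.h"
    #include "snappy-sinksource.h"

    int main() {
      std::string compressed;
      snappy::Compress("hello hello hello", 17, &compressed);

      // Validate through the Source interface instead of a flat buffer.
      snappy::ByteArraySource source(compressed.data(), compressed.size());
      std::cout << std::boolalpha << snappy::IsValidCompressed(&source) << "\n";
    }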
@@ -1290,9 +1410,10 @@ void RawCompress(const char* input,
   *compressed_length = (writer.CurrentDestination() - compressed);
 }

-size_t Compress(const char* input, size_t input_length, string* compressed) {
+size_t Compress(const char* input, size_t input_length,
+                std::string* compressed) {
   // Pre-grow the buffer to the max length of the compressed output
-  compressed->resize(MaxCompressedLength(input_length));
+  STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));

   size_t compressed_length;
   RawCompress(input, input_length, string_as_array(compressed),
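In the hunk above, Compress now takes std::string* explicitly (the bare string typedef is gone) and pre-sizes the output with STLStringResizeUninitialized instead of a zero-initializing resize. A round-trip sketch against the public API (illustrative, not part of the diff):

    #include <cassert>
    #include <string>
    #include "snappy.h"

    int main() {
      const std::string input(10000, 'a');  // highly compressible input

      std::string compressed;
      const size_t n = snappy::Compress(input.data(), input.size(), &compressed);
      assert(n == compressed.size());
      assert(n <= snappy::MaxCompressedLength(input.size()));

      std::string uncompressed;
      assert(snappy::Uncompress(compressed.data(), compressed.size(),
                                &uncompressed));
      assert(uncompressed == input);
    }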
@@ -1301,6 +1422,240 @@ size_t Compress(const char* input, size_t input_length, string* compressed) {
   return compressed_length;
 }

+// -----------------------------------------------------------------------
+// Sink interface
+// -----------------------------------------------------------------------
+
+// A type that decompresses into a Sink. The template parameter
+// Allocator must export one method "char* Allocate(int size);", which
+// allocates a buffer of "size" and appends that to the destination.
+template <typename Allocator>
+class SnappyScatteredWriter {
+  Allocator allocator_;
+
+  // We need random access into the data generated so far. Therefore
+  // we keep track of all of the generated data as an array of blocks.
+  // All of the blocks except the last have length kBlockSize.
+  std::vector<char*> blocks_;
+  size_t expected_;
+
+  // Total size of all fully generated blocks so far
+  size_t full_size_;
+
+  // Pointer into current output block
+  char* op_base_;       // Base of output block
+  char* op_ptr_;        // Pointer to next unfilled byte in block
+  char* op_limit_;      // Pointer just past block
+
+  inline size_t Size() const {
+    return full_size_ + (op_ptr_ - op_base_);
+  }
+
+  bool SlowAppend(const char* ip, size_t len);
+  bool SlowAppendFromSelf(size_t offset, size_t len);
+
+ public:
+  inline explicit SnappyScatteredWriter(const Allocator& allocator)
+      : allocator_(allocator),
+        full_size_(0),
+        op_base_(NULL),
+        op_ptr_(NULL),
+        op_limit_(NULL) {
+  }
+
+  inline void SetExpectedLength(size_t len) {
+    assert(blocks_.empty());
+    expected_ = len;
+  }
+
+  inline bool CheckLength() const {
+    return Size() == expected_;
+  }
+
+  // Return the number of bytes actually uncompressed so far
+  inline size_t Produced() const {
+    return Size();
+  }
+
+  inline bool Append(const char* ip, size_t len) {
+    size_t avail = op_limit_ - op_ptr_;
+    if (len <= avail) {
+      // Fast path
+      memcpy(op_ptr_, ip, len);
+      op_ptr_ += len;
+      return true;
+    } else {
+      return SlowAppend(ip, len);
+    }
+  }
+
+  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
+    char* op = op_ptr_;
+    const int space_left = op_limit_ - op;
+    if (length <= 16 && available >= 16 + kMaximumTagLength &&
+        space_left >= 16) {
+      // Fast path, used for the majority (about 95%) of invocations.
+      UnalignedCopy128(ip, op);
+      op_ptr_ = op + length;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  inline bool AppendFromSelf(size_t offset, size_t len) {
+    char* const op_end = op_ptr_ + len;
+    // See SnappyArrayWriter::AppendFromSelf for an explanation of
+    // the "offset - 1u" trick.
+    if (SNAPPY_PREDICT_TRUE(offset - 1u < op_ptr_ - op_base_ &&
+                            op_end <= op_limit_)) {
+      // Fast path: src and dst in current block.
+      op_ptr_ = IncrementalCopy(op_ptr_ - offset, op_ptr_, op_end, op_limit_);
+      return true;
+    }
+    return SlowAppendFromSelf(offset, len);
+  }
+
+  // Called at the end of the decompress. We ask the allocator
+  // write all blocks to the sink.
+  inline void Flush() { allocator_.Flush(Produced()); }
+};
+
+template<typename Allocator>
+bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip, size_t len) {
+  size_t avail = op_limit_ - op_ptr_;
+  while (len > avail) {
+    // Completely fill this block
+    memcpy(op_ptr_, ip, avail);
+    op_ptr_ += avail;
+    assert(op_limit_ - op_ptr_ == 0);
+    full_size_ += (op_ptr_ - op_base_);
+    len -= avail;
+    ip += avail;
+
+    // Bounds check
+    if (full_size_ + len > expected_) {
+      return false;
+    }
+
+    // Make new block
+    size_t bsize = std::min<size_t>(kBlockSize, expected_ - full_size_);
+    op_base_ = allocator_.Allocate(bsize);
+    op_ptr_ = op_base_;
+    op_limit_ = op_base_ + bsize;
+    blocks_.push_back(op_base_);
+    avail = bsize;
+  }
+
+  memcpy(op_ptr_, ip, len);
+  op_ptr_ += len;
+  return true;
+}
+
+template<typename Allocator>
+bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset,
+                                                          size_t len) {
+  // Overflow check
+  // See SnappyArrayWriter::AppendFromSelf for an explanation of
+  // the "offset - 1u" trick.
+  const size_t cur = Size();
+  if (offset - 1u >= cur) return false;
+  if (expected_ - cur < len) return false;
+
+  // Currently we shouldn't ever hit this path because Compress() chops the
+  // input into blocks and does not create cross-block copies. However, it is
+  // nice if we do not rely on that, since we can get better compression if we
+  // allow cross-block copies and thus might want to change the compressor in
+  // the future.
+  size_t src = cur - offset;
+  while (len-- > 0) {
+    char c = blocks_[src >> kBlockLog][src & (kBlockSize-1)];
+    Append(&c, 1);
+    src++;
+  }
+  return true;
+}
+
+class SnappySinkAllocator {
+ public:
+  explicit SnappySinkAllocator(Sink* dest): dest_(dest) {}
+  ~SnappySinkAllocator() {}
+
+  char* Allocate(int size) {
+    Datablock block(new char[size], size);
+    blocks_.push_back(block);
+    return block.data;
+  }
+
+  // We flush only at the end, because the writer wants
+  // random access to the blocks and once we hand the
+  // block over to the sink, we can't access it anymore.
+  // Also we don't write more than has been actually written
+  // to the blocks.
+  void Flush(size_t size) {
+    size_t size_written = 0;
+    size_t block_size;
+    for (int i = 0; i < blocks_.size(); ++i) {
+      block_size = std::min<size_t>(blocks_[i].size, size - size_written);
+      dest_->AppendAndTakeOwnership(blocks_[i].data, block_size,
+                                    &SnappySinkAllocator::Deleter, NULL);
+      size_written += block_size;
+    }
+    blocks_.clear();
+  }
+
+ private:
+  struct Datablock {
+    char* data;
+    size_t size;
+    Datablock(char* p, size_t s) : data(p), size(s) {}
+  };
+
+  static void Deleter(void* arg, const char* bytes, size_t size) {
+    delete[] bytes;
+  }
+
+  Sink* dest_;
+  std::vector<Datablock> blocks_;
+
+  // Note: copying this object is allowed
+};
+
+size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed) {
+  SnappySinkAllocator allocator(uncompressed);
+  SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
+  InternalUncompress(compressed, &writer);
+  return writer.Produced();
+}
+
+bool Uncompress(Source* compressed, Sink* uncompressed) {
+  // Read the uncompressed length from the front of the compressed input
+  SnappyDecompressor decompressor(compressed);
+  uint32 uncompressed_len = 0;
+  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) {
+    return false;
+  }

-}  // end namespace snappy
+  char c;
+  size_t allocated_size;
+  char* buf = uncompressed->GetAppendBufferVariable(
+      1, uncompressed_len, &c, 1, &allocated_size);
+
+  const size_t compressed_len = compressed->Available();
+  // If we can get a flat buffer, then use it, otherwise do block by block
+  // uncompression
+  if (allocated_size >= uncompressed_len) {
+    SnappyArrayWriter writer(buf);
+    bool result = InternalUncompressAllTags(&decompressor, &writer,
+                                            compressed_len, uncompressed_len);
+    uncompressed->Append(buf, writer.Produced());
+    return result;
+  } else {
+    SnappySinkAllocator allocator(uncompressed);
+    SnappyScatteredWriter<SnappySinkAllocator> writer(allocator);
+    return InternalUncompressAllTags(&decompressor, &writer, compressed_len,
+                                     uncompressed_len);
+  }
+}

+}  // namespace snappy
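The final hunk adds the Sink-based entry points (UncompressAsMuchAsPossible and the Uncompress overload taking Source*/Sink*) before the closing namespace brace. A minimal sketch of the new overload, using the ByteArraySource and UncheckedByteArraySink helpers from snappy-sinksource.h (illustrative, not part of the diff; the destination buffer must already hold at least the uncompressed length):

    #include <cassert>
    #include <string>
    #include <vector>
    #include "snappy.h"
    #include "snappy-sinksource.h"

    int main() {
      const std::string input(4096, 'x');
      std::string compressed;
      snappy::Compress(input.data(), input.size(), &compressed);

      size_t ulength = 0;
      assert(snappy::GetUncompressedLength(compressed.data(), compressed.size(),
                                           &ulength));

      // Decompress through the Source/Sink interface instead of flat buffers.
      std::vector<char> output(ulength);
      snappy::ByteArraySource source(compressed.data(), compressed.size());
      snappy::UncheckedByteArraySink sink(output.data());
      assert(snappy::Uncompress(&source, &sink));
      assert(std::string(output.begin(), output.end()) == input);
    }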