snappy 0.0.12-java → 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +28 -1
- data/Gemfile +6 -1
- data/README.md +28 -4
- data/Rakefile +1 -0
- data/ext/extconf.rb +21 -24
- data/lib/snappy.rb +3 -1
- data/lib/snappy/hadoop.rb +22 -0
- data/lib/snappy/hadoop/reader.rb +58 -0
- data/lib/snappy/hadoop/writer.rb +51 -0
- data/lib/snappy/reader.rb +11 -7
- data/lib/snappy/shim.rb +30 -0
- data/lib/snappy/version.rb +3 -1
- data/lib/snappy/writer.rb +14 -9
- data/smoke.sh +8 -0
- data/snappy.gemspec +6 -30
- data/test/hadoop/test-snappy-hadoop-reader.rb +103 -0
- data/test/hadoop/test-snappy-hadoop-writer.rb +48 -0
- data/test/test-snappy-hadoop.rb +22 -0
- data/vendor/snappy/CMakeLists.txt +174 -0
- data/vendor/snappy/CONTRIBUTING.md +26 -0
- data/vendor/snappy/COPYING +1 -1
- data/vendor/snappy/NEWS +52 -0
- data/vendor/snappy/{README → README.md} +23 -9
- data/vendor/snappy/cmake/SnappyConfig.cmake +1 -0
- data/vendor/snappy/cmake/config.h.in +62 -0
- data/vendor/snappy/snappy-c.h +3 -3
- data/vendor/snappy/snappy-internal.h +101 -27
- data/vendor/snappy/snappy-sinksource.cc +33 -0
- data/vendor/snappy/snappy-sinksource.h +51 -6
- data/vendor/snappy/snappy-stubs-internal.h +107 -37
- data/vendor/snappy/snappy-stubs-public.h.in +16 -20
- data/vendor/snappy/snappy-test.cc +15 -9
- data/vendor/snappy/snappy-test.h +34 -43
- data/vendor/snappy/snappy.cc +529 -320
- data/vendor/snappy/snappy.h +23 -4
- data/vendor/snappy/snappy_unittest.cc +240 -185
- metadata +27 -74
- data/vendor/snappy/ChangeLog +0 -1916
- data/vendor/snappy/Makefile.am +0 -23
- data/vendor/snappy/autogen.sh +0 -7
- data/vendor/snappy/configure.ac +0 -133
- data/vendor/snappy/m4/gtest.m4 +0 -74
- data/vendor/snappy/testdata/alice29.txt +0 -3609
- data/vendor/snappy/testdata/asyoulik.txt +0 -4122
- data/vendor/snappy/testdata/baddata1.snappy +0 -0
- data/vendor/snappy/testdata/baddata2.snappy +0 -0
- data/vendor/snappy/testdata/baddata3.snappy +0 -0
- data/vendor/snappy/testdata/fireworks.jpeg +0 -0
- data/vendor/snappy/testdata/geo.protodata +0 -0
- data/vendor/snappy/testdata/html +0 -1
- data/vendor/snappy/testdata/html_x_4 +0 -1
- data/vendor/snappy/testdata/kppkn.gtb +0 -0
- data/vendor/snappy/testdata/lcet10.txt +0 -7519
- data/vendor/snappy/testdata/paper-100k.pdf +2 -600
- data/vendor/snappy/testdata/plrabn12.txt +0 -10699
- data/vendor/snappy/testdata/urls.10K +0 -10000
data/vendor/snappy/snappy.h
CHANGED
@@ -36,8 +36,8 @@
|
|
36
36
|
// using BMDiff and then compressing the output of BMDiff with
|
37
37
|
// Snappy.
|
38
38
|
|
39
|
-
#ifndef
|
40
|
-
#define
|
39
|
+
#ifndef THIRD_PARTY_SNAPPY_SNAPPY_H__
|
40
|
+
#define THIRD_PARTY_SNAPPY_SNAPPY_H__
|
41
41
|
|
42
42
|
#include <stddef.h>
|
43
43
|
#include <string>
|
@@ -84,6 +84,18 @@ namespace snappy {
|
|
84
84
|
bool Uncompress(const char* compressed, size_t compressed_length,
|
85
85
|
string* uncompressed);
|
86
86
|
|
87
|
+
// Decompresses "compressed" to "*uncompressed".
|
88
|
+
//
|
89
|
+
// Returns false if the message is corrupted and could not be decompressed.
|
90
|
+
bool Uncompress(Source* compressed, Sink* uncompressed);
|
91
|
+
|
92
|
+
// This routine uncompresses as much of the "compressed" as possible
|
93
|
+
// into sink. It returns the number of valid bytes added to sink
|
94
|
+
// (extra invalid bytes may have been added due to errors; the caller
|
95
|
+
// should ignore those). The emitted data typically has length
|
96
|
+
// GetUncompressedLength(), but may be shorter if an error is
|
97
|
+
// encountered.
|
98
|
+
size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed);
|
87
99
|
|
88
100
|
// ------------------------------------------------------------------------
|
89
101
|
// Lower-level character array based routines. May be useful for
|
@@ -164,6 +176,14 @@ namespace snappy {
|
|
164
176
|
bool IsValidCompressedBuffer(const char* compressed,
|
165
177
|
size_t compressed_length);
|
166
178
|
|
179
|
+
// Returns true iff the contents of "compressed" can be uncompressed
|
180
|
+
// successfully. Does not return the uncompressed data. Takes
|
181
|
+
// time proportional to *compressed length, but is usually at least
|
182
|
+
// a factor of four faster than actual decompression.
|
183
|
+
// On success, consumes all of *compressed. On failure, consumes an
|
184
|
+
// unspecified prefix of *compressed.
|
185
|
+
bool IsValidCompressed(Source* compressed);
|
186
|
+
|
167
187
|
// The size of a compression block. Note that many parts of the compression
|
168
188
|
// code assumes that kBlockSize <= 65536; in particular, the hash table
|
169
189
|
// can only store 16-bit offsets, and EmitCopy() also assumes the offset
|
@@ -180,5 +200,4 @@ namespace snappy {
|
|
180
200
|
static const size_t kMaxHashTableSize = 1 << kMaxHashTableBits;
|
181
201
|
} // end namespace snappy
|
182
202
|
|
183
|
-
|
184
|
-
#endif // UTIL_SNAPPY_SNAPPY_H__
|
203
|
+
#endif // THIRD_PARTY_SNAPPY_SNAPPY_H__
|
@@ -32,6 +32,7 @@
|
|
32
32
|
|
33
33
|
#include <algorithm>
|
34
34
|
#include <string>
|
35
|
+
#include <utility>
|
35
36
|
#include <vector>
|
36
37
|
|
37
38
|
#include "snappy.h"
|
@@ -50,25 +51,19 @@ DEFINE_bool(zlib, false,
|
|
50
51
|
"Run zlib compression (http://www.zlib.net)");
|
51
52
|
DEFINE_bool(lzo, false,
|
52
53
|
"Run LZO compression (http://www.oberhumer.com/opensource/lzo/)");
|
53
|
-
DEFINE_bool(quicklz, false,
|
54
|
-
"Run quickLZ compression (http://www.quicklz.com/)");
|
55
|
-
DEFINE_bool(liblzf, false,
|
56
|
-
"Run libLZF compression "
|
57
|
-
"(http://www.goof.com/pcg/marc/liblzf.html)");
|
58
|
-
DEFINE_bool(fastlz, false,
|
59
|
-
"Run FastLZ compression (http://www.fastlz.org/");
|
60
54
|
DEFINE_bool(snappy, true, "Run snappy compression");
|
61
55
|
|
62
|
-
|
63
56
|
DEFINE_bool(write_compressed, false,
|
64
57
|
"Write compressed versions of each file to <file>.comp");
|
65
58
|
DEFINE_bool(write_uncompressed, false,
|
66
59
|
"Write uncompressed versions of each file to <file>.uncomp");
|
67
60
|
|
68
|
-
|
61
|
+
DEFINE_bool(snappy_dump_decompression_table, false,
|
62
|
+
"If true, we print the decompression table during tests.");
|
69
63
|
|
64
|
+
namespace snappy {
|
70
65
|
|
71
|
-
#
|
66
|
+
#if defined(HAVE_FUNC_MMAP) && defined(HAVE_FUNC_SYSCONF)
|
72
67
|
|
73
68
|
// To test against code that reads beyond its input, this class copies a
|
74
69
|
// string to a newly allocated group of pages, the last of which
|
@@ -79,7 +74,7 @@ namespace snappy {
|
|
79
74
|
class DataEndingAtUnreadablePage {
|
80
75
|
public:
|
81
76
|
explicit DataEndingAtUnreadablePage(const string& s) {
|
82
|
-
const size_t page_size =
|
77
|
+
const size_t page_size = sysconf(_SC_PAGESIZE);
|
83
78
|
const size_t size = s.size();
|
84
79
|
// Round up space for string to a multiple of page_size.
|
85
80
|
size_t space_for_string = (size + page_size - 1) & ~(page_size - 1);
|
@@ -97,8 +92,9 @@ class DataEndingAtUnreadablePage {
|
|
97
92
|
}
|
98
93
|
|
99
94
|
~DataEndingAtUnreadablePage() {
|
95
|
+
const size_t page_size = sysconf(_SC_PAGESIZE);
|
100
96
|
// Undo the mprotect.
|
101
|
-
CHECK_EQ(0, mprotect(protected_page_,
|
97
|
+
CHECK_EQ(0, mprotect(protected_page_, page_size, PROT_READ|PROT_WRITE));
|
102
98
|
CHECK_EQ(0, munmap(mem_, alloc_size_));
|
103
99
|
}
|
104
100
|
|
@@ -113,7 +109,7 @@ class DataEndingAtUnreadablePage {
|
|
113
109
|
size_t size_;
|
114
110
|
};
|
115
111
|
|
116
|
-
#else // HAVE_FUNC_MMAP
|
112
|
+
#else // defined(HAVE_FUNC_MMAP) && defined(HAVE_FUNC_SYSCONF)
|
117
113
|
|
118
114
|
// Fallback for systems without mmap.
|
119
115
|
typedef string DataEndingAtUnreadablePage;
|
@@ -121,11 +117,11 @@ typedef string DataEndingAtUnreadablePage;
|
|
121
117
|
#endif
|
122
118
|
|
123
119
|
enum CompressorType {
|
124
|
-
ZLIB, LZO,
|
120
|
+
ZLIB, LZO, SNAPPY
|
125
121
|
};
|
126
122
|
|
127
123
|
const char* names[] = {
|
128
|
-
"ZLIB", "LZO", "
|
124
|
+
"ZLIB", "LZO", "SNAPPY"
|
129
125
|
};
|
130
126
|
|
131
127
|
static size_t MinimumRequiredOutputSpace(size_t input_size,
|
@@ -141,26 +137,12 @@ static size_t MinimumRequiredOutputSpace(size_t input_size,
|
|
141
137
|
return input_size + input_size/64 + 16 + 3;
|
142
138
|
#endif // LZO_VERSION
|
143
139
|
|
144
|
-
#ifdef LZF_VERSION
|
145
|
-
case LIBLZF:
|
146
|
-
return input_size;
|
147
|
-
#endif // LZF_VERSION
|
148
|
-
|
149
|
-
#ifdef QLZ_VERSION_MAJOR
|
150
|
-
case QUICKLZ:
|
151
|
-
return input_size + 36000; // 36000 is used for scratch.
|
152
|
-
#endif // QLZ_VERSION_MAJOR
|
153
|
-
|
154
|
-
#ifdef FASTLZ_VERSION
|
155
|
-
case FASTLZ:
|
156
|
-
return max(static_cast<int>(ceil(input_size * 1.05)), 66);
|
157
|
-
#endif // FASTLZ_VERSION
|
158
|
-
|
159
140
|
case SNAPPY:
|
160
141
|
return snappy::MaxCompressedLength(input_size);
|
161
142
|
|
162
143
|
default:
|
163
144
|
LOG(FATAL) << "Unknown compression type number " << comp;
|
145
|
+
return 0;
|
164
146
|
}
|
165
147
|
}
|
166
148
|
|
@@ -214,58 +196,6 @@ static bool Compress(const char* input, size_t input_size, CompressorType comp,
|
|
214
196
|
}
|
215
197
|
#endif // LZO_VERSION
|
216
198
|
|
217
|
-
#ifdef LZF_VERSION
|
218
|
-
case LIBLZF: {
|
219
|
-
int destlen = lzf_compress(input,
|
220
|
-
input_size,
|
221
|
-
string_as_array(compressed),
|
222
|
-
input_size);
|
223
|
-
if (destlen == 0) {
|
224
|
-
// lzf *can* cause lots of blowup when compressing, so they
|
225
|
-
// recommend to limit outsize to insize, and just not compress
|
226
|
-
// if it's bigger. Ideally, we'd just swap input and output.
|
227
|
-
compressed->assign(input, input_size);
|
228
|
-
destlen = input_size;
|
229
|
-
}
|
230
|
-
if (!compressed_is_preallocated) {
|
231
|
-
compressed->resize(destlen);
|
232
|
-
}
|
233
|
-
break;
|
234
|
-
}
|
235
|
-
#endif // LZF_VERSION
|
236
|
-
|
237
|
-
#ifdef QLZ_VERSION_MAJOR
|
238
|
-
case QUICKLZ: {
|
239
|
-
qlz_state_compress *state_compress = new qlz_state_compress;
|
240
|
-
int destlen = qlz_compress(input,
|
241
|
-
string_as_array(compressed),
|
242
|
-
input_size,
|
243
|
-
state_compress);
|
244
|
-
delete state_compress;
|
245
|
-
CHECK_NE(0, destlen);
|
246
|
-
if (!compressed_is_preallocated) {
|
247
|
-
compressed->resize(destlen);
|
248
|
-
}
|
249
|
-
break;
|
250
|
-
}
|
251
|
-
#endif // QLZ_VERSION_MAJOR
|
252
|
-
|
253
|
-
#ifdef FASTLZ_VERSION
|
254
|
-
case FASTLZ: {
|
255
|
-
// Use level 1 compression since we mostly care about speed.
|
256
|
-
int destlen = fastlz_compress_level(
|
257
|
-
1,
|
258
|
-
input,
|
259
|
-
input_size,
|
260
|
-
string_as_array(compressed));
|
261
|
-
if (!compressed_is_preallocated) {
|
262
|
-
compressed->resize(destlen);
|
263
|
-
}
|
264
|
-
CHECK_NE(destlen, 0);
|
265
|
-
break;
|
266
|
-
}
|
267
|
-
#endif // FASTLZ_VERSION
|
268
|
-
|
269
199
|
case SNAPPY: {
|
270
200
|
size_t destlen;
|
271
201
|
snappy::RawCompress(input, input_size,
|
@@ -278,7 +208,6 @@ static bool Compress(const char* input, size_t input_size, CompressorType comp,
|
|
278
208
|
break;
|
279
209
|
}
|
280
210
|
|
281
|
-
|
282
211
|
default: {
|
283
212
|
return false; // the asked-for library wasn't compiled in
|
284
213
|
}
|
@@ -321,56 +250,12 @@ static bool Uncompress(const string& compressed, CompressorType comp,
|
|
321
250
|
}
|
322
251
|
#endif // LZO_VERSION
|
323
252
|
|
324
|
-
#ifdef LZF_VERSION
|
325
|
-
case LIBLZF: {
|
326
|
-
output->resize(size);
|
327
|
-
int destlen = lzf_decompress(compressed.data(),
|
328
|
-
compressed.size(),
|
329
|
-
string_as_array(output),
|
330
|
-
output->size());
|
331
|
-
if (destlen == 0) {
|
332
|
-
// This error probably means we had decided not to compress,
|
333
|
-
// and thus have stored input in output directly.
|
334
|
-
output->assign(compressed.data(), compressed.size());
|
335
|
-
destlen = compressed.size();
|
336
|
-
}
|
337
|
-
CHECK_EQ(destlen, size);
|
338
|
-
break;
|
339
|
-
}
|
340
|
-
#endif // LZF_VERSION
|
341
|
-
|
342
|
-
#ifdef QLZ_VERSION_MAJOR
|
343
|
-
case QUICKLZ: {
|
344
|
-
output->resize(size);
|
345
|
-
qlz_state_decompress *state_decompress = new qlz_state_decompress;
|
346
|
-
int destlen = qlz_decompress(compressed.data(),
|
347
|
-
string_as_array(output),
|
348
|
-
state_decompress);
|
349
|
-
delete state_decompress;
|
350
|
-
CHECK_EQ(destlen, size);
|
351
|
-
break;
|
352
|
-
}
|
353
|
-
#endif // QLZ_VERSION_MAJOR
|
354
|
-
|
355
|
-
#ifdef FASTLZ_VERSION
|
356
|
-
case FASTLZ: {
|
357
|
-
output->resize(size);
|
358
|
-
int destlen = fastlz_decompress(compressed.data(),
|
359
|
-
compressed.length(),
|
360
|
-
string_as_array(output),
|
361
|
-
size);
|
362
|
-
CHECK_EQ(destlen, size);
|
363
|
-
break;
|
364
|
-
}
|
365
|
-
#endif // FASTLZ_VERSION
|
366
|
-
|
367
253
|
case SNAPPY: {
|
368
254
|
snappy::RawUncompress(compressed.data(), compressed.size(),
|
369
255
|
string_as_array(output));
|
370
256
|
break;
|
371
257
|
}
|
372
258
|
|
373
|
-
|
374
259
|
default: {
|
375
260
|
return false; // the asked-for library wasn't compiled in
|
376
261
|
}
|
@@ -392,13 +277,13 @@ static void Measure(const char* data,
|
|
392
277
|
{
|
393
278
|
// Chop the input into blocks
|
394
279
|
int num_blocks = (length + block_size - 1) / block_size;
|
395
|
-
vector<const char*> input(num_blocks);
|
396
|
-
vector<size_t> input_length(num_blocks);
|
397
|
-
vector<string> compressed(num_blocks);
|
398
|
-
vector<string> output(num_blocks);
|
280
|
+
std::vector<const char*> input(num_blocks);
|
281
|
+
std::vector<size_t> input_length(num_blocks);
|
282
|
+
std::vector<string> compressed(num_blocks);
|
283
|
+
std::vector<string> output(num_blocks);
|
399
284
|
for (int b = 0; b < num_blocks; b++) {
|
400
285
|
int input_start = b * block_size;
|
401
|
-
int input_limit = min<int>((b+1)*block_size, length);
|
286
|
+
int input_limit = std::min<int>((b+1)*block_size, length);
|
402
287
|
input[b] = data+input_start;
|
403
288
|
input_length[b] = input_limit-input_start;
|
404
289
|
|
@@ -448,13 +333,13 @@ static void Measure(const char* data,
|
|
448
333
|
}
|
449
334
|
|
450
335
|
compressed_size = 0;
|
451
|
-
for (
|
336
|
+
for (size_t i = 0; i < compressed.size(); i++) {
|
452
337
|
compressed_size += compressed[i].size();
|
453
338
|
}
|
454
339
|
}
|
455
340
|
|
456
|
-
sort(ctime, ctime + kRuns);
|
457
|
-
sort(utime, utime + kRuns);
|
341
|
+
std::sort(ctime, ctime + kRuns);
|
342
|
+
std::sort(utime, utime + kRuns);
|
458
343
|
const int med = kRuns/2;
|
459
344
|
|
460
345
|
float comp_rate = (length / ctime[med]) * repeats / 1048576.0;
|
@@ -469,12 +354,11 @@ static void Measure(const char* data,
|
|
469
354
|
x.c_str(),
|
470
355
|
block_size/(1<<20),
|
471
356
|
static_cast<int>(length), static_cast<uint32>(compressed_size),
|
472
|
-
(compressed_size * 100.0) / max<int>(1, length),
|
357
|
+
(compressed_size * 100.0) / std::max<int>(1, length),
|
473
358
|
comp_rate,
|
474
359
|
urate.c_str());
|
475
360
|
}
|
476
361
|
|
477
|
-
|
478
362
|
static int VerifyString(const string& input) {
|
479
363
|
string compressed;
|
480
364
|
DataEndingAtUnreadablePage i(input);
|
@@ -491,6 +375,23 @@ static int VerifyString(const string& input) {
|
|
491
375
|
return uncompressed.size();
|
492
376
|
}
|
493
377
|
|
378
|
+
static void VerifyStringSink(const string& input) {
|
379
|
+
string compressed;
|
380
|
+
DataEndingAtUnreadablePage i(input);
|
381
|
+
const size_t written = snappy::Compress(i.data(), i.size(), &compressed);
|
382
|
+
CHECK_EQ(written, compressed.size());
|
383
|
+
CHECK_LE(compressed.size(),
|
384
|
+
snappy::MaxCompressedLength(input.size()));
|
385
|
+
CHECK(snappy::IsValidCompressedBuffer(compressed.data(), compressed.size()));
|
386
|
+
|
387
|
+
string uncompressed;
|
388
|
+
uncompressed.resize(input.size());
|
389
|
+
snappy::UncheckedByteArraySink sink(string_as_array(&uncompressed));
|
390
|
+
DataEndingAtUnreadablePage c(compressed);
|
391
|
+
snappy::ByteArraySource source(c.data(), c.size());
|
392
|
+
CHECK(snappy::Uncompress(&source, &sink));
|
393
|
+
CHECK_EQ(uncompressed, input);
|
394
|
+
}
|
494
395
|
|
495
396
|
static void VerifyIOVec(const string& input) {
|
496
397
|
string compressed;
|
@@ -505,13 +406,13 @@ static void VerifyIOVec(const string& input) {
|
|
505
406
|
// ranging from 1 to 10.
|
506
407
|
char* buf = new char[input.size()];
|
507
408
|
ACMRandom rnd(input.size());
|
508
|
-
|
409
|
+
size_t num = rnd.Next() % 10 + 1;
|
509
410
|
if (input.size() < num) {
|
510
411
|
num = input.size();
|
511
412
|
}
|
512
413
|
struct iovec* iov = new iovec[num];
|
513
414
|
int used_so_far = 0;
|
514
|
-
for (
|
415
|
+
for (size_t i = 0; i < num; ++i) {
|
515
416
|
iov[i].iov_base = buf + used_so_far;
|
516
417
|
if (i == num - 1) {
|
517
418
|
iov[i].iov_len = input.size() - used_so_far;
|
@@ -562,6 +463,28 @@ static void VerifyNonBlockedCompression(const string& input) {
|
|
562
463
|
CHECK(snappy::Uncompress(compressed.data(), compressed.size(), &uncomp_str));
|
563
464
|
CHECK_EQ(uncomp_str, input);
|
564
465
|
|
466
|
+
// Uncompress using source/sink
|
467
|
+
string uncomp_str2;
|
468
|
+
uncomp_str2.resize(input.size());
|
469
|
+
snappy::UncheckedByteArraySink sink(string_as_array(&uncomp_str2));
|
470
|
+
snappy::ByteArraySource source(compressed.data(), compressed.size());
|
471
|
+
CHECK(snappy::Uncompress(&source, &sink));
|
472
|
+
CHECK_EQ(uncomp_str2, input);
|
473
|
+
|
474
|
+
// Uncompress into iovec
|
475
|
+
{
|
476
|
+
static const int kNumBlocks = 10;
|
477
|
+
struct iovec vec[kNumBlocks];
|
478
|
+
const int block_size = 1 + input.size() / kNumBlocks;
|
479
|
+
string iovec_data(block_size * kNumBlocks, 'x');
|
480
|
+
for (int i = 0; i < kNumBlocks; i++) {
|
481
|
+
vec[i].iov_base = string_as_array(&iovec_data) + i * block_size;
|
482
|
+
vec[i].iov_len = block_size;
|
483
|
+
}
|
484
|
+
CHECK(snappy::RawUncompressToIOVec(compressed.data(), compressed.size(),
|
485
|
+
vec, kNumBlocks));
|
486
|
+
CHECK_EQ(string(iovec_data.data(), input.size()), input);
|
487
|
+
}
|
565
488
|
}
|
566
489
|
|
567
490
|
// Expand the input so that it is at least K times as big as block size
|
@@ -580,6 +503,8 @@ static int Verify(const string& input) {
|
|
580
503
|
// Compress using string based routines
|
581
504
|
const int result = VerifyString(input);
|
582
505
|
|
506
|
+
// Verify using sink based routines
|
507
|
+
VerifyStringSink(input);
|
583
508
|
|
584
509
|
VerifyNonBlockedCompression(input);
|
585
510
|
VerifyIOVec(input);
|
@@ -589,12 +514,9 @@ static int Verify(const string& input) {
|
|
589
514
|
VerifyIOVec(input);
|
590
515
|
}
|
591
516
|
|
592
|
-
|
593
517
|
return result;
|
594
518
|
}
|
595
519
|
|
596
|
-
// This test checks to ensure that snappy doesn't coredump if it gets
|
597
|
-
// corrupted data.
|
598
520
|
|
599
521
|
static bool IsValidCompressedBuffer(const string& c) {
|
600
522
|
return snappy::IsValidCompressedBuffer(c.data(), c.size());
|
@@ -603,11 +525,13 @@ static bool Uncompress(const string& c, string* u) {
|
|
603
525
|
return snappy::Uncompress(c.data(), c.size(), u);
|
604
526
|
}
|
605
527
|
|
606
|
-
|
528
|
+
// This test checks to ensure that snappy doesn't coredump if it gets
|
529
|
+
// corrupted data.
|
530
|
+
TEST(CorruptedTest, VerifyCorrupted) {
|
607
531
|
string source = "making sure we don't crash with corrupted input";
|
608
532
|
VLOG(1) << source;
|
609
533
|
string dest;
|
610
|
-
|
534
|
+
string uncmp;
|
611
535
|
snappy::Compress(source.data(), source.size(), &dest);
|
612
536
|
|
613
537
|
// Mess around with the data. It's hard to simulate all possible
|
@@ -616,19 +540,19 @@ TYPED_TEST(CorruptedTest, VerifyCorrupted) {
|
|
616
540
|
dest[1]--;
|
617
541
|
dest[3]++;
|
618
542
|
// this really ought to fail.
|
619
|
-
CHECK(!IsValidCompressedBuffer(
|
620
|
-
CHECK(!Uncompress(
|
543
|
+
CHECK(!IsValidCompressedBuffer(dest));
|
544
|
+
CHECK(!Uncompress(dest, &uncmp));
|
621
545
|
|
622
546
|
// This is testing for a security bug - a buffer that decompresses to 100k
|
623
547
|
// but we lie in the snappy header and only reserve 0 bytes of memory :)
|
624
548
|
source.resize(100000);
|
625
|
-
for (
|
549
|
+
for (size_t i = 0; i < source.length(); ++i) {
|
626
550
|
source[i] = 'A';
|
627
551
|
}
|
628
552
|
snappy::Compress(source.data(), source.size(), &dest);
|
629
553
|
dest[0] = dest[1] = dest[2] = dest[3] = 0;
|
630
|
-
CHECK(!IsValidCompressedBuffer(
|
631
|
-
CHECK(!Uncompress(
|
554
|
+
CHECK(!IsValidCompressedBuffer(dest));
|
555
|
+
CHECK(!Uncompress(dest, &uncmp));
|
632
556
|
|
633
557
|
if (sizeof(void *) == 4) {
|
634
558
|
// Another security check; check a crazy big length can't DoS us with an
|
@@ -637,20 +561,20 @@ TYPED_TEST(CorruptedTest, VerifyCorrupted) {
|
|
637
561
|
// where 3 GB might be an acceptable allocation size, Uncompress()
|
638
562
|
// attempts to decompress, and sometimes causes the test to run out of
|
639
563
|
// memory.
|
640
|
-
dest[0] = dest[1] = dest[2] = dest[3] =
|
564
|
+
dest[0] = dest[1] = dest[2] = dest[3] = '\xff';
|
641
565
|
// This decodes to a really large size, i.e., about 3 GB.
|
642
566
|
dest[4] = 'k';
|
643
|
-
CHECK(!IsValidCompressedBuffer(
|
644
|
-
CHECK(!Uncompress(
|
567
|
+
CHECK(!IsValidCompressedBuffer(dest));
|
568
|
+
CHECK(!Uncompress(dest, &uncmp));
|
645
569
|
} else {
|
646
570
|
LOG(WARNING) << "Crazy decompression lengths not checked on 64-bit build";
|
647
571
|
}
|
648
572
|
|
649
573
|
// This decodes to about 2 MB; much smaller, but should still fail.
|
650
|
-
dest[0] = dest[1] = dest[2] =
|
574
|
+
dest[0] = dest[1] = dest[2] = '\xff';
|
651
575
|
dest[3] = 0x00;
|
652
|
-
CHECK(!IsValidCompressedBuffer(
|
653
|
-
CHECK(!Uncompress(
|
576
|
+
CHECK(!IsValidCompressedBuffer(dest));
|
577
|
+
CHECK(!Uncompress(dest, &uncmp));
|
654
578
|
|
655
579
|
// try reading stuff in from a bad file.
|
656
580
|
for (int i = 1; i <= 3; ++i) {
|
@@ -665,8 +589,8 @@ TYPED_TEST(CorruptedTest, VerifyCorrupted) {
|
|
665
589
|
snappy::ByteArraySource source(data.data(), data.size());
|
666
590
|
CHECK(!snappy::GetUncompressedLength(&source, &ulen2) ||
|
667
591
|
(ulen2 < (1<<20)));
|
668
|
-
CHECK(!IsValidCompressedBuffer(
|
669
|
-
CHECK(!Uncompress(
|
592
|
+
CHECK(!IsValidCompressedBuffer(data));
|
593
|
+
CHECK(!Uncompress(data, &uncmp));
|
670
594
|
}
|
671
595
|
}
|
672
596
|
|
@@ -764,7 +688,7 @@ TEST(Snappy, RandomData) {
|
|
764
688
|
}
|
765
689
|
|
766
690
|
string x;
|
767
|
-
|
691
|
+
size_t len = rnd.Uniform(4096);
|
768
692
|
if (i < 100) {
|
769
693
|
len = 65536 + rnd.Uniform(65536);
|
770
694
|
}
|
@@ -929,7 +853,6 @@ TEST(Snappy, IOVecCopyOverflow) {
|
|
929
853
|
}
|
930
854
|
}
|
931
855
|
|
932
|
-
|
933
856
|
static bool CheckUncompressedLength(const string& compressed,
|
934
857
|
size_t* ulength) {
|
935
858
|
const bool result1 = snappy::GetUncompressedLength(compressed.data(),
|
@@ -956,11 +879,11 @@ TEST(SnappyCorruption, TruncatedVarint) {
|
|
956
879
|
TEST(SnappyCorruption, UnterminatedVarint) {
|
957
880
|
string compressed, uncompressed;
|
958
881
|
size_t ulength;
|
959
|
-
compressed.push_back(
|
960
|
-
compressed.push_back(
|
961
|
-
compressed.push_back(
|
962
|
-
compressed.push_back(
|
963
|
-
compressed.push_back(
|
882
|
+
compressed.push_back('\x80');
|
883
|
+
compressed.push_back('\x80');
|
884
|
+
compressed.push_back('\x80');
|
885
|
+
compressed.push_back('\x80');
|
886
|
+
compressed.push_back('\x80');
|
964
887
|
compressed.push_back(10);
|
965
888
|
CHECK(!CheckUncompressedLength(compressed, &ulength));
|
966
889
|
CHECK(!snappy::IsValidCompressedBuffer(compressed.data(), compressed.size()));
|
@@ -968,6 +891,20 @@ TEST(SnappyCorruption, UnterminatedVarint) {
|
|
968
891
|
&uncompressed));
|
969
892
|
}
|
970
893
|
|
894
|
+
TEST(SnappyCorruption, OverflowingVarint) {
|
895
|
+
string compressed, uncompressed;
|
896
|
+
size_t ulength;
|
897
|
+
compressed.push_back('\xfb');
|
898
|
+
compressed.push_back('\xff');
|
899
|
+
compressed.push_back('\xff');
|
900
|
+
compressed.push_back('\xff');
|
901
|
+
compressed.push_back('\x7f');
|
902
|
+
CHECK(!CheckUncompressedLength(compressed, &ulength));
|
903
|
+
CHECK(!snappy::IsValidCompressedBuffer(compressed.data(), compressed.size()));
|
904
|
+
CHECK(!snappy::Uncompress(compressed.data(), compressed.size(),
|
905
|
+
&uncompressed));
|
906
|
+
}
|
907
|
+
|
971
908
|
TEST(Snappy, ReadPastEndOfBuffer) {
|
972
909
|
// Check that we do not read past end of input
|
973
910
|
|
@@ -998,11 +935,13 @@ TEST(Snappy, ZeroOffsetCopyValidation) {
|
|
998
935
|
EXPECT_FALSE(snappy::IsValidCompressedBuffer(compressed, 4));
|
999
936
|
}
|
1000
937
|
|
1001
|
-
|
1002
938
|
namespace {
|
1003
939
|
|
1004
940
|
int TestFindMatchLength(const char* s1, const char *s2, unsigned length) {
|
1005
|
-
|
941
|
+
std::pair<size_t, bool> p =
|
942
|
+
snappy::internal::FindMatchLength(s1, s2, s2 + length);
|
943
|
+
CHECK_EQ(p.first < 8, p.second);
|
944
|
+
return p.first;
|
1006
945
|
}
|
1007
946
|
|
1008
947
|
} // namespace
|
@@ -1112,8 +1051,7 @@ TEST(Snappy, FindMatchLengthRandom) {
|
|
1112
1051
|
}
|
1113
1052
|
DataEndingAtUnreadablePage u(s);
|
1114
1053
|
DataEndingAtUnreadablePage v(t);
|
1115
|
-
int matched =
|
1116
|
-
u.data(), v.data(), v.data() + t.size());
|
1054
|
+
int matched = TestFindMatchLength(u.data(), v.data(), t.size());
|
1117
1055
|
if (matched == t.size()) {
|
1118
1056
|
EXPECT_EQ(s, t);
|
1119
1057
|
} else {
|
@@ -1125,21 +1063,113 @@ TEST(Snappy, FindMatchLengthRandom) {
|
|
1125
1063
|
}
|
1126
1064
|
}
|
1127
1065
|
|
1066
|
+
static uint16 MakeEntry(unsigned int extra,
|
1067
|
+
unsigned int len,
|
1068
|
+
unsigned int copy_offset) {
|
1069
|
+
// Check that all of the fields fit within the allocated space
|
1070
|
+
assert(extra == (extra & 0x7)); // At most 3 bits
|
1071
|
+
assert(copy_offset == (copy_offset & 0x7)); // At most 3 bits
|
1072
|
+
assert(len == (len & 0x7f)); // At most 7 bits
|
1073
|
+
return len | (copy_offset << 8) | (extra << 11);
|
1074
|
+
}
|
1075
|
+
|
1076
|
+
// Check that the decompression table is correct, and optionally print out
|
1077
|
+
// the computed one.
|
1078
|
+
TEST(Snappy, VerifyCharTable) {
|
1079
|
+
using snappy::internal::LITERAL;
|
1080
|
+
using snappy::internal::COPY_1_BYTE_OFFSET;
|
1081
|
+
using snappy::internal::COPY_2_BYTE_OFFSET;
|
1082
|
+
using snappy::internal::COPY_4_BYTE_OFFSET;
|
1083
|
+
using snappy::internal::char_table;
|
1084
|
+
|
1085
|
+
uint16 dst[256];
|
1086
|
+
|
1087
|
+
// Place invalid entries in all places to detect missing initialization
|
1088
|
+
int assigned = 0;
|
1089
|
+
for (int i = 0; i < 256; i++) {
|
1090
|
+
dst[i] = 0xffff;
|
1091
|
+
}
|
1092
|
+
|
1093
|
+
// Small LITERAL entries. We store (len-1) in the top 6 bits.
|
1094
|
+
for (unsigned int len = 1; len <= 60; len++) {
|
1095
|
+
dst[LITERAL | ((len-1) << 2)] = MakeEntry(0, len, 0);
|
1096
|
+
assigned++;
|
1097
|
+
}
|
1098
|
+
|
1099
|
+
// Large LITERAL entries. We use 60..63 in the high 6 bits to
|
1100
|
+
// encode the number of bytes of length info that follow the opcode.
|
1101
|
+
for (unsigned int extra_bytes = 1; extra_bytes <= 4; extra_bytes++) {
|
1102
|
+
// We set the length field in the lookup table to 1 because extra
|
1103
|
+
// bytes encode len-1.
|
1104
|
+
dst[LITERAL | ((extra_bytes+59) << 2)] = MakeEntry(extra_bytes, 1, 0);
|
1105
|
+
assigned++;
|
1106
|
+
}
|
1107
|
+
|
1108
|
+
// COPY_1_BYTE_OFFSET.
|
1109
|
+
//
|
1110
|
+
// The tag byte in the compressed data stores len-4 in 3 bits, and
|
1111
|
+
// offset/256 in 5 bits. offset%256 is stored in the next byte.
|
1112
|
+
//
|
1113
|
+
// This format is used for length in range [4..11] and offset in
|
1114
|
+
// range [0..2047]
|
1115
|
+
for (unsigned int len = 4; len < 12; len++) {
|
1116
|
+
for (unsigned int offset = 0; offset < 2048; offset += 256) {
|
1117
|
+
dst[COPY_1_BYTE_OFFSET | ((len-4)<<2) | ((offset>>8)<<5)] =
|
1118
|
+
MakeEntry(1, len, offset>>8);
|
1119
|
+
assigned++;
|
1120
|
+
}
|
1121
|
+
}
|
1122
|
+
|
1123
|
+
// COPY_2_BYTE_OFFSET.
|
1124
|
+
// Tag contains len-1 in top 6 bits, and offset in next two bytes.
|
1125
|
+
for (unsigned int len = 1; len <= 64; len++) {
|
1126
|
+
dst[COPY_2_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(2, len, 0);
|
1127
|
+
assigned++;
|
1128
|
+
}
|
1129
|
+
|
1130
|
+
// COPY_4_BYTE_OFFSET.
|
1131
|
+
// Tag contains len-1 in top 6 bits, and offset in next four bytes.
|
1132
|
+
for (unsigned int len = 1; len <= 64; len++) {
|
1133
|
+
dst[COPY_4_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(4, len, 0);
|
1134
|
+
assigned++;
|
1135
|
+
}
|
1136
|
+
|
1137
|
+
// Check that each entry was initialized exactly once.
|
1138
|
+
EXPECT_EQ(256, assigned) << "Assigned only " << assigned << " of 256";
|
1139
|
+
for (int i = 0; i < 256; i++) {
|
1140
|
+
EXPECT_NE(0xffff, dst[i]) << "Did not assign byte " << i;
|
1141
|
+
}
|
1142
|
+
|
1143
|
+
if (FLAGS_snappy_dump_decompression_table) {
|
1144
|
+
printf("static const uint16 char_table[256] = {\n ");
|
1145
|
+
for (int i = 0; i < 256; i++) {
|
1146
|
+
printf("0x%04x%s",
|
1147
|
+
dst[i],
|
1148
|
+
((i == 255) ? "\n" : (((i%8) == 7) ? ",\n " : ", ")));
|
1149
|
+
}
|
1150
|
+
printf("};\n");
|
1151
|
+
}
|
1152
|
+
|
1153
|
+
// Check that computed table matched recorded table.
|
1154
|
+
for (int i = 0; i < 256; i++) {
|
1155
|
+
EXPECT_EQ(dst[i], char_table[i]) << "Mismatch in byte " << i;
|
1156
|
+
}
|
1157
|
+
}
|
1128
1158
|
|
1129
1159
|
static void CompressFile(const char* fname) {
|
1130
1160
|
string fullinput;
|
1131
|
-
file::GetContents(fname, &fullinput, file::Defaults())
|
1161
|
+
CHECK_OK(file::GetContents(fname, &fullinput, file::Defaults()));
|
1132
1162
|
|
1133
1163
|
string compressed;
|
1134
1164
|
Compress(fullinput.data(), fullinput.size(), SNAPPY, &compressed, false);
|
1135
1165
|
|
1136
|
-
file::SetContents(string(fname).append(".comp"), compressed,
|
1137
|
-
|
1166
|
+
CHECK_OK(file::SetContents(string(fname).append(".comp"), compressed,
|
1167
|
+
file::Defaults()));
|
1138
1168
|
}
|
1139
1169
|
|
1140
1170
|
static void UncompressFile(const char* fname) {
|
1141
1171
|
string fullinput;
|
1142
|
-
file::GetContents(fname, &fullinput, file::Defaults())
|
1172
|
+
CHECK_OK(file::GetContents(fname, &fullinput, file::Defaults()));
|
1143
1173
|
|
1144
1174
|
size_t uncompLength;
|
1145
1175
|
CHECK(CheckUncompressedLength(fullinput, &uncompLength));
|
@@ -1148,28 +1178,25 @@ static void UncompressFile(const char* fname) {
|
|
1148
1178
|
uncompressed.resize(uncompLength);
|
1149
1179
|
CHECK(snappy::Uncompress(fullinput.data(), fullinput.size(), &uncompressed));
|
1150
1180
|
|
1151
|
-
file::SetContents(string(fname).append(".uncomp"), uncompressed,
|
1152
|
-
|
1181
|
+
CHECK_OK(file::SetContents(string(fname).append(".uncomp"), uncompressed,
|
1182
|
+
file::Defaults()));
|
1153
1183
|
}
|
1154
1184
|
|
1155
1185
|
static void MeasureFile(const char* fname) {
|
1156
1186
|
string fullinput;
|
1157
|
-
file::GetContents(fname, &fullinput, file::Defaults())
|
1187
|
+
CHECK_OK(file::GetContents(fname, &fullinput, file::Defaults()));
|
1158
1188
|
printf("%-40s :\n", fname);
|
1159
1189
|
|
1160
1190
|
int start_len = (FLAGS_start_len < 0) ? fullinput.size() : FLAGS_start_len;
|
1161
1191
|
int end_len = fullinput.size();
|
1162
1192
|
if (FLAGS_end_len >= 0) {
|
1163
|
-
end_len = min<int>(fullinput.size(), FLAGS_end_len);
|
1193
|
+
end_len = std::min<int>(fullinput.size(), FLAGS_end_len);
|
1164
1194
|
}
|
1165
1195
|
for (int len = start_len; len <= end_len; len++) {
|
1166
1196
|
const char* const input = fullinput.data();
|
1167
1197
|
int repeats = (FLAGS_bytes + len) / (len + 1);
|
1168
1198
|
if (FLAGS_zlib) Measure(input, len, ZLIB, repeats, 1024<<10);
|
1169
1199
|
if (FLAGS_lzo) Measure(input, len, LZO, repeats, 1024<<10);
|
1170
|
-
if (FLAGS_liblzf) Measure(input, len, LIBLZF, repeats, 1024<<10);
|
1171
|
-
if (FLAGS_quicklz) Measure(input, len, QUICKLZ, repeats, 1024<<10);
|
1172
|
-
if (FLAGS_fastlz) Measure(input, len, FASTLZ, repeats, 1024<<10);
|
1173
1200
|
if (FLAGS_snappy) Measure(input, len, SNAPPY, repeats, 4096<<10);
|
1174
1201
|
|
1175
1202
|
// For block-size based measurements
|
@@ -1298,6 +1325,37 @@ static void BM_UIOVec(int iters, int arg) {
|
|
1298
1325
|
}
|
1299
1326
|
BENCHMARK(BM_UIOVec)->DenseRange(0, 4);
|
1300
1327
|
|
1328
|
+
static void BM_UFlatSink(int iters, int arg) {
|
1329
|
+
StopBenchmarkTiming();
|
1330
|
+
|
1331
|
+
// Pick file to process based on "arg"
|
1332
|
+
CHECK_GE(arg, 0);
|
1333
|
+
CHECK_LT(arg, ARRAYSIZE(files));
|
1334
|
+
string contents = ReadTestDataFile(files[arg].filename,
|
1335
|
+
files[arg].size_limit);
|
1336
|
+
|
1337
|
+
string zcontents;
|
1338
|
+
snappy::Compress(contents.data(), contents.size(), &zcontents);
|
1339
|
+
char* dst = new char[contents.size()];
|
1340
|
+
|
1341
|
+
SetBenchmarkBytesProcessed(static_cast<int64>(iters) *
|
1342
|
+
static_cast<int64>(contents.size()));
|
1343
|
+
SetBenchmarkLabel(files[arg].label);
|
1344
|
+
StartBenchmarkTiming();
|
1345
|
+
while (iters-- > 0) {
|
1346
|
+
snappy::ByteArraySource source(zcontents.data(), zcontents.size());
|
1347
|
+
snappy::UncheckedByteArraySink sink(dst);
|
1348
|
+
CHECK(snappy::Uncompress(&source, &sink));
|
1349
|
+
}
|
1350
|
+
StopBenchmarkTiming();
|
1351
|
+
|
1352
|
+
string s(dst, contents.size());
|
1353
|
+
CHECK_EQ(contents, s);
|
1354
|
+
|
1355
|
+
delete[] dst;
|
1356
|
+
}
|
1357
|
+
|
1358
|
+
BENCHMARK(BM_UFlatSink)->DenseRange(0, ARRAYSIZE(files) - 1);
|
1301
1359
|
|
1302
1360
|
static void BM_ZFlat(int iters, int arg) {
|
1303
1361
|
StopBenchmarkTiming();
|
@@ -1329,23 +1387,20 @@ static void BM_ZFlat(int iters, int arg) {
|
|
1329
1387
|
}
|
1330
1388
|
BENCHMARK(BM_ZFlat)->DenseRange(0, ARRAYSIZE(files) - 1);
|
1331
1389
|
|
1332
|
-
|
1333
1390
|
} // namespace snappy
|
1334
1391
|
|
1335
|
-
|
1336
1392
|
int main(int argc, char** argv) {
|
1337
1393
|
InitGoogle(argv[0], &argc, &argv, true);
|
1338
1394
|
RunSpecifiedBenchmarks();
|
1339
1395
|
|
1340
|
-
|
1341
1396
|
if (argc >= 2) {
|
1342
1397
|
for (int arg = 1; arg < argc; arg++) {
|
1343
1398
|
if (FLAGS_write_compressed) {
|
1344
|
-
CompressFile(argv[arg]);
|
1399
|
+
snappy::CompressFile(argv[arg]);
|
1345
1400
|
} else if (FLAGS_write_uncompressed) {
|
1346
|
-
UncompressFile(argv[arg]);
|
1401
|
+
snappy::UncompressFile(argv[arg]);
|
1347
1402
|
} else {
|
1348
|
-
MeasureFile(argv[arg]);
|
1403
|
+
snappy::MeasureFile(argv[arg]);
|
1349
1404
|
}
|
1350
1405
|
}
|
1351
1406
|
return 0;
|