snappy 0.1.0-java → 0.2.0-java
- checksums.yaml +4 -4
- data/.github/workflows/main.yml +34 -0
- data/.github/workflows/publish.yml +34 -0
- data/Gemfile +3 -4
- data/Rakefile +32 -30
- data/ext/api.c +6 -1
- data/lib/snappy.rb +5 -5
- data/lib/snappy/hadoop/reader.rb +6 -2
- data/lib/snappy/reader.rb +11 -7
- data/lib/snappy/shim.rb +1 -1
- data/lib/snappy/version.rb +1 -1
- data/snappy.gemspec +13 -9
- data/test/hadoop/snappy_hadoop_reader_test.rb +115 -0
- data/test/hadoop/snappy_hadoop_writer_test.rb +48 -0
- data/test/snappy_hadoop_test.rb +26 -0
- data/test/snappy_reader_test.rb +148 -0
- data/test/snappy_test.rb +95 -0
- data/test/snappy_writer_test.rb +55 -0
- data/test/test_helper.rb +7 -0
- data/vendor/snappy/CMakeLists.txt +177 -54
- data/vendor/snappy/NEWS +8 -0
- data/vendor/snappy/README.md +19 -20
- data/vendor/snappy/cmake/SnappyConfig.cmake.in +33 -0
- data/vendor/snappy/cmake/config.h.in +6 -6
- data/vendor/snappy/docs/README.md +72 -0
- data/vendor/snappy/snappy-internal.h +12 -5
- data/vendor/snappy/snappy-stubs-internal.cc +1 -1
- data/vendor/snappy/snappy-stubs-internal.h +60 -15
- data/vendor/snappy/snappy-stubs-public.h.in +16 -36
- data/vendor/snappy/snappy-test.cc +16 -15
- data/vendor/snappy/snappy-test.h +12 -60
- data/vendor/snappy/snappy.cc +333 -187
- data/vendor/snappy/snappy.h +14 -10
- data/vendor/snappy/snappy_compress_fuzzer.cc +59 -0
- data/vendor/snappy/snappy_uncompress_fuzzer.cc +57 -0
- data/vendor/snappy/snappy_unittest.cc +220 -124
- metadata +25 -18
- data/.travis.yml +0 -31
- data/smoke.sh +0 -8
- data/test/hadoop/test-snappy-hadoop-reader.rb +0 -103
- data/test/hadoop/test-snappy-hadoop-writer.rb +0 -48
- data/test/test-snappy-hadoop.rb +0 -22
- data/test/test-snappy-reader.rb +0 -129
- data/test/test-snappy-writer.rb +0 -55
- data/test/test-snappy.rb +0 -58
- data/vendor/snappy/cmake/SnappyConfig.cmake +0 -1
data/vendor/snappy/snappy-test.cc
CHANGED
@@ -48,12 +48,12 @@ DEFINE_bool(run_microbenchmarks, true,
 
 namespace snappy {
 
-string ReadTestDataFile(const string& base, size_t size_limit) {
-  string contents;
+std::string ReadTestDataFile(const std::string& base, size_t size_limit) {
+  std::string contents;
   const char* srcdir = getenv("srcdir");  // This is set by Automake.
-  string prefix;
+  std::string prefix;
   if (srcdir) {
-    prefix = string(srcdir) + "/";
+    prefix = std::string(srcdir) + "/";
   }
   file::GetContents(prefix + "testdata/" + base, &contents, file::Defaults()
                     ).CheckSuccess();
@@ -63,11 +63,11 @@ string ReadTestDataFile(const string& base, size_t size_limit) {
   return contents;
 }
 
-string ReadTestDataFile(const string& base) {
+std::string ReadTestDataFile(const std::string& base) {
   return ReadTestDataFile(base, 0);
 }
 
-string StringPrintf(const char* format, ...) {
+std::string StrFormat(const char* format, ...) {
   char buf[4096];
   va_list ap;
   va_start(ap, format);
@@ -79,7 +79,7 @@ string StringPrintf(const char* format, ...) {
 bool benchmark_running = false;
 int64 benchmark_real_time_us = 0;
 int64 benchmark_cpu_time_us = 0;
-string
+std::string* benchmark_label = nullptr;
 int64 benchmark_bytes_processed = 0;
 
 void ResetBenchmarkTiming() {
@@ -163,11 +163,11 @@ void StopBenchmarkTiming() {
   benchmark_running = false;
 }
 
-void SetBenchmarkLabel(const string& str) {
+void SetBenchmarkLabel(const std::string& str) {
   if (benchmark_label) {
     delete benchmark_label;
   }
-  benchmark_label = new string(str);
+  benchmark_label = new std::string(str);
 }
 
 void SetBenchmarkBytesProcessed(int64 bytes) {
@@ -217,8 +217,8 @@ void Benchmark::Run() {
       benchmark_runs[run].cpu_time_us = benchmark_cpu_time_us;
     }
 
-    string heading =
-    string human_readable_speed;
+    std::string heading = StrFormat("%s/%d", name_.c_str(), test_case_num);
+    std::string human_readable_speed;
 
     std::nth_element(benchmark_runs,
                      benchmark_runs + kMedianPos,
@@ -232,15 +232,16 @@ void Benchmark::Run() {
       int64 bytes_per_second =
          benchmark_bytes_processed * 1000000 / cpu_time_us;
      if (bytes_per_second < 1024) {
-        human_readable_speed =
+        human_readable_speed =
+            StrFormat("%dB/s", static_cast<int>(bytes_per_second));
      } else if (bytes_per_second < 1024 * 1024) {
-        human_readable_speed =
+        human_readable_speed = StrFormat(
            "%.1fkB/s", bytes_per_second / 1024.0f);
      } else if (bytes_per_second < 1024 * 1024 * 1024) {
-        human_readable_speed =
+        human_readable_speed = StrFormat(
            "%.1fMB/s", bytes_per_second / (1024.0f * 1024.0f));
      } else {
-        human_readable_speed =
+        human_readable_speed = StrFormat(
            "%.1fGB/s", bytes_per_second / (1024.0f * 1024.0f * 1024.0f));
      }
    }
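The hunks above are mechanical string → std::string and StringPrintf → StrFormat renames; the formatting helper itself still writes into a fixed 4096-byte buffer via vsnprintf, as the surviving context lines show. For readers following the diff, here is a minimal, self-contained sketch of that pattern; the function name and buffer size are illustrative, not part of the gem:

    #include <cstdarg>
    #include <cstdio>
    #include <string>

    // Sketch of a StrFormat-style helper: format into a fixed buffer and return
    // a std::string. Output longer than the buffer is silently truncated, which
    // is why the header below warns it is "not safe for general use".
    std::string FormatToString(const char* format, ...) {
      char buf[4096];
      va_list ap;
      va_start(ap, format);
      vsnprintf(buf, sizeof(buf), format, ap);
      va_end(ap);
      return std::string(buf);
    }

    int main() {
      std::printf("%s\n",
                  FormatToString("%.1fMB/s",
                                 123456789 / (1024.0f * 1024.0f)).c_str());
      return 0;
    }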
data/vendor/snappy/snappy-test.h
CHANGED
@@ -55,8 +55,6 @@
 #include <windows.h>
 #endif
 
-#include <string>
-
 #ifdef HAVE_GTEST
 
 #include <gtest/gtest.h>
@@ -169,7 +167,7 @@ namespace file {
 namespace snappy {
 
 #define FLAGS_test_random_seed 301
-
+using TypeParam = std::string;
 
 void Test_CorruptedTest_VerifyCorrupted();
 void Test_Snappy_SimpleTests();
@@ -183,63 +181,13 @@ void Test_Snappy_ReadPastEndOfBuffer();
 void Test_Snappy_FindMatchLength();
 void Test_Snappy_FindMatchLengthRandom();
 
-string ReadTestDataFile(const string& base, size_t size_limit);
+std::string ReadTestDataFile(const std::string& base, size_t size_limit);
 
-string ReadTestDataFile(const string& base);
+std::string ReadTestDataFile(const std::string& base);
 
 // A sprintf() variant that returns a std::string.
 // Not safe for general use due to truncation issues.
-string
-
-// A simple, non-cryptographically-secure random generator.
-class ACMRandom {
- public:
-  explicit ACMRandom(uint32 seed) : seed_(seed) {}
-
-  int32 Next();
-
-  int32 Uniform(int32 n) {
-    return Next() % n;
-  }
-  uint8 Rand8() {
-    return static_cast<uint8>((Next() >> 1) & 0x000000ff);
-  }
-  bool OneIn(int X) { return Uniform(X) == 0; }
-
-  // Skewed: pick "base" uniformly from range [0,max_log] and then
-  // return "base" random bits. The effect is to pick a number in the
-  // range [0,2^max_log-1] with bias towards smaller numbers.
-  int32 Skewed(int max_log);
-
- private:
-  static const uint32 M = 2147483647L;   // 2^31-1
-  uint32 seed_;
-};
-
-inline int32 ACMRandom::Next() {
-  static const uint64 A = 16807;  // bits 14, 8, 7, 5, 2, 1, 0
-  // We are computing
-  //       seed_ = (seed_ * A) % M,    where M = 2^31-1
-  //
-  // seed_ must not be zero or M, or else all subsequent computed values
-  // will be zero or M respectively.  For all other values, seed_ will end
-  // up cycling through every number in [1,M-1]
-  uint64 product = seed_ * A;
-
-  // Compute (product % M) using the fact that ((x << 31) % M) == x.
-  seed_ = (product >> 31) + (product & M);
-  // The first reduction may overflow by 1 bit, so we may need to repeat.
-  // mod == M is not possible; using > allows the faster sign-bit-based test.
-  if (seed_ > M) {
-    seed_ -= M;
-  }
-  return seed_;
-}
-
-inline int32 ACMRandom::Skewed(int max_log) {
-  const int32 base = (Next() - 1) % (max_log+1);
-  return (Next() - 1) & ((1u << base)-1);
-}
+std::string StrFormat(const char* format, ...);
 
 // A wall-time clock. This stub is not super-accurate, nor resistant to the
 // system time changing.
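The hunk above deletes the test-only ACMRandom class. Its comments describe a Lehmer generator that computes seed = (seed * 16807) % (2^31 - 1) without a division, exploiting the fact that ((x << 31) % M) == x. A standalone sketch of that reduction, assuming nothing from the gem:

    #include <cstdint>
    #include <cstdio>

    // Minimal Lehmer generator mirroring the reduction described in the removed
    // ACMRandom comments: fold the modulo by adding the high and low halves of
    // the 64-bit product instead of dividing.
    class LehmerRandom {
     public:
      explicit LehmerRandom(uint32_t seed) : seed_(seed) {}

      int32_t Next() {
        static const uint64_t A = 16807;
        uint64_t product = seed_ * A;
        // ((x << 31) % M) == x, so high and low halves can simply be added.
        seed_ = static_cast<uint32_t>((product >> 31) + (product & M));
        // The first reduction may overflow by one bit, so reduce once more.
        if (seed_ > M) seed_ -= M;
        return static_cast<int32_t>(seed_);
      }

     private:
      static constexpr uint32_t M = 2147483647u;  // 2^31 - 1
      uint32_t seed_;
    };

    int main() {
      LehmerRandom rng(301);  // Same value as FLAGS_test_random_seed above.
      for (int i = 0; i < 3; ++i) std::printf("%d\n", rng.Next());
      return 0;
    }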
@@ -293,8 +241,8 @@ typedef void (*BenchmarkFunction)(int, int);
 
 class Benchmark {
  public:
-  Benchmark(const string& name, BenchmarkFunction function)
-      name_(name), function_(function) {}
+  Benchmark(const std::string& name, BenchmarkFunction function)
+      : name_(name), function_(function) {}
 
   Benchmark* DenseRange(int start, int stop) {
     start_ = start;
@@ -305,7 +253,7 @@ class Benchmark {
   void Run();
 
  private:
-  const string name_;
+  const std::string name_;
   const BenchmarkFunction function_;
   int start_, stop_;
 };
@@ -317,11 +265,13 @@ extern Benchmark* Benchmark_BM_UFlat;
 extern Benchmark* Benchmark_BM_UIOVec;
 extern Benchmark* Benchmark_BM_UValidate;
 extern Benchmark* Benchmark_BM_ZFlat;
+extern Benchmark* Benchmark_BM_ZFlatAll;
+extern Benchmark* Benchmark_BM_ZFlatIncreasingTableSize;
 
 void ResetBenchmarkTiming();
 void StartBenchmarkTiming();
 void StopBenchmarkTiming();
-void SetBenchmarkLabel(const string& str);
+void SetBenchmarkLabel(const std::string& str);
 void SetBenchmarkBytesProcessed(int64 bytes);
 
 #ifdef HAVE_LIBZ
@@ -468,6 +418,8 @@ static inline void RunSpecifiedBenchmarks() {
   snappy::Benchmark_BM_UIOVec->Run();
   snappy::Benchmark_BM_UValidate->Run();
   snappy::Benchmark_BM_ZFlat->Run();
+  snappy::Benchmark_BM_ZFlatAll->Run();
+  snappy::Benchmark_BM_ZFlatIncreasingTableSize->Run();
 
   fprintf(stderr, "\n");
 }
data/vendor/snappy/snappy.cc
CHANGED
@@ -30,25 +30,50 @@
 #include "snappy-internal.h"
 #include "snappy-sinksource.h"
 
-#
-
-
-
+#if !defined(SNAPPY_HAVE_SSSE3)
+// __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD
+// support between SSE2 and AVX (so SSSE3 instructions require AVX support), and
+// defines __AVX__ when AVX support is available.
+#if defined(__SSSE3__) || defined(__AVX__)
+#define SNAPPY_HAVE_SSSE3 1
 #else
-#define
+#define SNAPPY_HAVE_SSSE3 0
 #endif
+#endif  // !defined(SNAPPY_HAVE_SSSE3)
+
+#if !defined(SNAPPY_HAVE_BMI2)
+// __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
+// specifically, but it does define __AVX2__ when AVX2 support is available.
+// Fortunately, AVX2 was introduced in Haswell, just like BMI2.
+//
+// BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
+// GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
+// case issuing BMI2 instructions results in a compiler error.
+#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
+#define SNAPPY_HAVE_BMI2 1
+#else
+#define SNAPPY_HAVE_BMI2 0
+#endif
+#endif  // !defined(SNAPPY_HAVE_BMI2)
+
+#if SNAPPY_HAVE_SSSE3
+// Please do not replace with <x86intrin.h>. or with headers that assume more
+// advanced SSE versions without checking with all the OWNERS.
+#include <tmmintrin.h>
 #endif
 
-#if
-
+#if SNAPPY_HAVE_BMI2
+// Please do not replace with <x86intrin.h>. or with headers that assume more
+// advanced SSE versions without checking with all the OWNERS.
+#include <immintrin.h>
 #endif
+
 #include <stdio.h>
 
 #include <algorithm>
 #include <string>
 #include <vector>
 
-
 namespace snappy {
 
 using internal::COPY_1_BYTE_OFFSET;
@@ -103,16 +128,12 @@ void UnalignedCopy64(const void* src, void* dst) {
 }
 
 void UnalignedCopy128(const void* src, void* dst) {
-  //
-  // SSE2
-
-  __m128i x = _mm_loadu_si128(static_cast<const __m128i*>(src));
-  _mm_storeu_si128(static_cast<__m128i*>(dst), x);
-#else
+  // memcpy gets vectorized when the appropriate compiler options are used.
+  // For example, x86 compilers targeting SSE2+ will optimize to an SSE2 load
+  // and store.
   char tmp[16];
   memcpy(tmp, src, 16);
   memcpy(dst, tmp, 16);
-#endif
 }
 
 // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
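The rewritten UnalignedCopy128 above leans on memcpy through a small temporary, which optimizing compilers lower to plain (vector) loads and stores; no intrinsics remain. A tiny standalone illustration of the same idiom, with helper names of our own choosing:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // memcpy-based unaligned access: well-defined for any alignment, and modern
    // compilers turn each call into a single load or store at -O2.
    static inline uint64_t UnalignedLoad64(const void* p) {
      uint64_t v;
      std::memcpy(&v, p, sizeof(v));
      return v;
    }

    static inline void UnalignedStore64(void* p, uint64_t v) {
      std::memcpy(p, &v, sizeof(v));
    }

    int main() {
      char buf[16] = "0123456789abcde";
      // Copy 8 bytes from an odd offset to another odd offset.
      UnalignedStore64(buf + 1, UnalignedLoad64(buf + 3));
      std::printf("%.15s\n", buf);
      return 0;
    }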
@@ -127,12 +148,35 @@ void UnalignedCopy128(const void* src, void* dst) {
 // Note that this does not match the semantics of either memcpy() or memmove().
 inline char* IncrementalCopySlow(const char* src, char* op,
                                  char* const op_limit) {
+  // TODO: Remove pragma when LLVM is aware this
+  // function is only called in cold regions and when cold regions don't get
+  // vectorized or unrolled.
+#ifdef __clang__
+#pragma clang loop unroll(disable)
+#endif
   while (op < op_limit) {
     *op++ = *src++;
   }
   return op_limit;
 }
 
+#if SNAPPY_HAVE_SSSE3
+
+// This is a table of shuffle control masks that can be used as the source
+// operand for PSHUFB to permute the contents of the destination XMM register
+// into a repeating byte pattern.
+alignas(16) const char pshufb_fill_patterns[7][16] = {
+  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+  {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
+  {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
+  {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
+  {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
+  {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
+  {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
+};
+
+#endif  // SNAPPY_HAVE_SSSE3
+
 // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than
 // IncrementalCopySlow. buf_limit is the address past the end of the writable
 // region of the buffer.
@@ -144,9 +188,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   // pat = op - src
   // len = limit - op
   assert(src < op);
+  assert(op <= op_limit);
   assert(op_limit <= buf_limit);
   // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
-  // to optimize this function but we have to also handle
+  // to optimize this function but we have to also handle other cases in case
   // the input does not satisfy these conditions.
 
   size_t pattern_size = op - src;
@@ -176,16 +221,45 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 
   // Handle the uncommon case where pattern is less than 8 bytes.
   if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
-
-    //
-    //
+#if SNAPPY_HAVE_SSSE3
+    // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
+    // to permute the register's contents in-place into a repeating sequence of
+    // the first "pattern_size" bytes.
+    // For example, suppose:
+    //    src       == "abc"
+    //    op        == op + 3
+    // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
+    // followed by one byte of slop: abcabcabcabcabca.
     //
-    //
-    //
-    //
-
-
-
+    // The non-SSE fallback implementation suffers from store-forwarding stalls
+    // because its loads and stores partly overlap. By expanding the pattern
+    // in-place, we avoid the penalty.
+    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
+      const __m128i shuffle_mask = _mm_load_si128(
+          reinterpret_cast<const __m128i*>(pshufb_fill_patterns)
+          + pattern_size - 1);
+      const __m128i pattern = _mm_shuffle_epi8(
+          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
+      // Uninitialized bytes are masked out by the shuffle mask.
+      // TODO: remove annotation and macro defs once MSan is fixed.
+      SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
+      pattern_size *= 16 / pattern_size;
+      char* op_end = std::min(op_limit, buf_limit - 15);
+      while (op < op_end) {
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+        op += pattern_size;
+      }
+      if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
+    }
+    return IncrementalCopySlow(src, op, op_limit);
+#else  // !SNAPPY_HAVE_SSSE3
+    // If plenty of buffer space remains, expand the pattern to at least 8
+    // bytes. The way the following loop is written, we need 8 bytes of buffer
+    // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
+    // bytes if pattern_size is 2. Precisely encoding that is probably not
+    // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
+    // (because 11 are required in the worst case).
+    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
       while (pattern_size < 8) {
         UnalignedCopy64(src, op);
         op += pattern_size;
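The new SSSE3 path above expands a short repeating pattern in place with PSHUFB instead of issuing overlapping 8-byte copies. A stripped-down, standalone sketch of that trick, guarded so it only compiles where SSSE3 is enabled (all names here are illustrative, not the library's):

    // Compile with: g++ -mssse3 pshufb_demo.cc
    #include <cstdio>
    #if defined(__SSSE3__)
    #include <tmmintrin.h>

    int main() {
      // Shuffle mask that repeats the first three source bytes: 0,1,2,0,1,2,...
      alignas(16) const char mask3[16] = {0, 1, 2, 0, 1, 2, 0, 1,
                                          2, 0, 1, 2, 0, 1, 2, 0};
      const char src[8] = {'a', 'b', 'c', 0, 0, 0, 0, 0};

      // Load 8 source bytes, then let PSHUFB replicate the 3-byte pattern
      // across the whole XMM register: "abcabcabcabcabca".
      const __m128i pattern = _mm_shuffle_epi8(
          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)),
          _mm_load_si128(reinterpret_cast<const __m128i*>(mask3)));

      char out[17] = {0};
      _mm_storeu_si128(reinterpret_cast<__m128i*>(out), pattern);
      std::printf("%s\n", out);  // prints abcabcabcabcabca
      return 0;
    }
    #else
    int main() { std::puts("SSSE3 not enabled"); return 0; }
    #endif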
@@ -195,6 +269,7 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     } else {
       return IncrementalCopySlow(src, op, op_limit);
     }
+#endif  // SNAPPY_HAVE_SSSE3
   }
   assert(pattern_size >= 8);
 
@@ -202,13 +277,48 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
   // because expanding the pattern to at least 8 bytes guarantees that
   // op - src >= 8.
-
+  //
+  // Typically, the op_limit is the gating factor so try to simplify the loop
+  // based on that.
+  if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
+    // There is at least one, and at most four 16-byte blocks. Writing four
+    // conditionals instead of a loop allows FDO to layout the code with respect
+    // to the actual probabilities of each length.
+    // TODO: Replace with loop with trip count hint.
+    UnalignedCopy64(src, op);
+    UnalignedCopy64(src + 8, op + 8);
+
+    if (op + 16 < op_limit) {
+      UnalignedCopy64(src + 16, op + 16);
+      UnalignedCopy64(src + 24, op + 24);
+    }
+    if (op + 32 < op_limit) {
+      UnalignedCopy64(src + 32, op + 32);
+      UnalignedCopy64(src + 40, op + 40);
+    }
+    if (op + 48 < op_limit) {
+      UnalignedCopy64(src + 48, op + 48);
+      UnalignedCopy64(src + 56, op + 56);
+    }
+    return op_limit;
+  }
+
+  // Fall back to doing as much as we can with the available slop in the
+  // buffer. This code path is relatively cold however so we save code size by
+  // avoiding unrolling and vectorizing.
+  //
+  // TODO: Remove pragma when when cold regions don't get vectorized
+  // or unrolled.
+#ifdef __clang__
+#pragma clang loop unroll(disable)
+#endif
+  for (char *op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
     UnalignedCopy64(src, op);
     UnalignedCopy64(src + 8, op + 8);
-    src += 16;
-    op += 16;
-    if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
   }
+  if (op >= op_limit)
+    return op_limit;
+
   // We only take this branch if we didn't have enough slop and we can do a
   // single 8 byte copy.
   if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
@@ -221,10 +331,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 
 }  // namespace
 
+template <bool allow_fast_path>
 static inline char* EmitLiteral(char* op,
                                 const char* literal,
-                                int len
-                                bool allow_fast_path) {
+                                int len) {
   // The vast majority of copies are below 16 bytes, for which a
   // call to memcpy is overkill. This fast path can sometimes
   // copy up to 15 bytes too much, but that is okay in the
@@ -249,25 +359,23 @@ static inline char* EmitLiteral(char* op,
     // Fits in tag byte
     *op++ = LITERAL | (n << 2);
   } else {
-
-    char* base = op;
-    int count = 0;
-    op++;
-    while (n > 0) {
-      *op++ = n & 0xff;
-      n >>= 8;
-      count++;
-    }
+    int count = (Bits::Log2Floor(n) >> 3) + 1;
     assert(count >= 1);
     assert(count <= 4);
-    *
+    *op++ = LITERAL | ((59 + count) << 2);
+    // Encode in upcoming bytes.
+    // Write 4 bytes, though we may care about only 1 of them. The output buffer
+    // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds
+    // here and there is a memcpy of size 'len' below.
+    LittleEndian::Store32(op, n);
+    op += count;
   }
   memcpy(op, literal, len);
   return op + len;
 }
 
-
-
+template <bool len_less_than_12>
+static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
   assert(len <= 64);
   assert(len >= 4);
   assert(offset < 65536);
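The new long-literal path above derives the number of length bytes as (Log2Floor(n) >> 3) + 1 and then emits them with a single 32-bit little-endian store, replacing the old byte-at-a-time loop. A quick standalone check of that arithmetic, substituting a compiler builtin for the library's Bits::Log2Floor:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Number of bytes needed to encode the value n (n > 0), as computed in the
    // new EmitLiteral: index of the highest set bit divided by 8, plus one.
    static int BytesNeeded(uint32_t n) {
      int log2_floor = 31 - __builtin_clz(n);  // stand-in for Bits::Log2Floor(n)
      return (log2_floor >> 3) + 1;
    }

    int main() {
      assert(BytesNeeded(60) == 1);
      assert(BytesNeeded(255) == 1);
      assert(BytesNeeded(256) == 2);
      assert(BytesNeeded(65535) == 2);
      assert(BytesNeeded(65536) == 3);
      assert(BytesNeeded(0xffffffffu) == 4);
      std::puts("length-byte counts match the loop-based encoding");
      return 0;
    }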
@@ -288,29 +396,33 @@ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
   return op;
 }
 
-
-
+template <bool len_less_than_12>
+static inline char* EmitCopy(char* op, size_t offset, size_t len) {
   assert(len_less_than_12 == (len < 12));
   if (len_less_than_12) {
-    return EmitCopyAtMost64(op, offset, len
+    return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
   } else {
     // A special case for len <= 64 might help, but so far measurements suggest
     // it's in the noise.
 
     // Emit 64 byte copies but make sure to keep at least four bytes reserved.
     while (SNAPPY_PREDICT_FALSE(len >= 68)) {
-      op = EmitCopyAtMost64(op, offset, 64
+      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
       len -= 64;
     }
 
     // One or two copies will now finish the job.
     if (len > 64) {
-      op = EmitCopyAtMost64(op, offset, 60
+      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
       len -= 60;
     }
 
     // Emit remainder.
-
+    if (len < 12) {
+      op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
+    } else {
+      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
+    }
     return op;
   }
 }
@@ -326,31 +438,45 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
   }
 }
 
-namespace
-
-
-
-
-
-
-  size_t htsize = 256;
-  while (htsize < kMaxHashTableSize && htsize < input_size) {
-    htsize <<= 1;
+namespace {
+uint32 CalculateTableSize(uint32 input_size) {
+  static_assert(
+      kMaxHashTableSize >= kMinHashTableSize,
+      "kMaxHashTableSize should be greater or equal to kMinHashTableSize.");
+  if (input_size > kMaxHashTableSize) {
+    return kMaxHashTableSize;
   }
-
-
-  if (htsize <= ARRAYSIZE(small_table_)) {
-    table = small_table_;
-  } else {
-    if (large_table_ == NULL) {
-      large_table_ = new uint16[kMaxHashTableSize];
-    }
-    table = large_table_;
+  if (input_size < kMinHashTableSize) {
+    return kMinHashTableSize;
   }
+  // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
+  // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
+  return 2u << Bits::Log2Floor(input_size - 1);
+}
+}  // namespace
 
+namespace internal {
+WorkingMemory::WorkingMemory(size_t input_size) {
+  const size_t max_fragment_size = std::min(input_size, kBlockSize);
+  const size_t table_size = CalculateTableSize(max_fragment_size);
+  size_ = table_size * sizeof(*table_) + max_fragment_size +
+          MaxCompressedLength(max_fragment_size);
+  mem_ = std::allocator<char>().allocate(size_);
+  table_ = reinterpret_cast<uint16*>(mem_);
+  input_ = mem_ + table_size * sizeof(*table_);
+  output_ = input_ + max_fragment_size;
+}
+
+WorkingMemory::~WorkingMemory() {
+  std::allocator<char>().deallocate(mem_, size_);
+}
+
+uint16* WorkingMemory::GetHashTable(size_t fragment_size,
+                                    int* table_size) const {
+  const size_t htsize = CalculateTableSize(fragment_size);
+  memset(table_, 0, htsize * sizeof(*table_));
   *table_size = htsize;
-
-  return table;
+  return table_;
 }
 }  // end namespace internal
 
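CalculateTableSize above clamps the value to [kMinHashTableSize, kMaxHashTableSize] and otherwise rounds the fragment size up to a power of two with 2 << Log2Floor(input_size - 1). A small standalone check of that identity (again using a compiler builtin in place of Bits::Log2Floor):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Round v (v >= 2) up to the next power of two, the same way the new
    // CalculateTableSize does: 2 << Log2Floor(v - 1) == 1 << (1 + Log2Floor(v - 1)).
    static uint32_t RoundUpToPowerOfTwo(uint32_t v) {
      int log2_floor = 31 - __builtin_clz(v - 1);  // stand-in for Bits::Log2Floor
      return 2u << log2_floor;
    }

    int main() {
      assert(RoundUpToPowerOfTwo(2) == 2);
      assert(RoundUpToPowerOfTwo(3) == 4);
      assert(RoundUpToPowerOfTwo(256) == 256);   // already a power of two
      assert(RoundUpToPowerOfTwo(257) == 512);
      assert(RoundUpToPowerOfTwo(65535) == 65536);
      std::puts("2 << Log2Floor(v - 1) rounds up to a power of two");
      return 0;
    }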
@@ -417,7 +543,7 @@ char* CompressFragment(const char* input,
   // "ip" is the input pointer, and "op" is the output pointer.
   const char* ip = input;
   assert(input_size <= kBlockSize);
-  assert((table_size & (table_size - 1)) == 0);
+  assert((table_size & (table_size - 1)) == 0);  // table must be power of two
   const int shift = 32 - Bits::Log2Floor(table_size);
   assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
   const char* ip_end = input + input_size;
@@ -484,7 +610,7 @@ char* CompressFragment(const char* input,
       // than 4 bytes match. But, prior to the match, input
       // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
       assert(next_emit + 16 <= ip_end);
-      op = EmitLiteral(op, next_emit, ip - next_emit
+      op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);
 
       // Step 3: Call EmitCopy, and then see if another EmitCopy could
       // be our next move. Repeat until we find no match for the
@@ -507,7 +633,11 @@ char* CompressFragment(const char* input,
       ip += matched;
       size_t offset = base - candidate;
       assert(0 == memcmp(base, candidate, matched));
-
+      if (p.second) {
+        op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
+      } else {
+        op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
+      }
       next_emit = ip;
       if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
         goto emit_remainder;
@@ -532,7 +662,8 @@ char* CompressFragment(const char* input,
 emit_remainder:
   // Emit the remaining bytes as a literal
   if (next_emit < ip_end) {
-    op = EmitLiteral(op, next_emit,
+    op = EmitLiteral</*allow_fast_path=*/false>(op, next_emit,
+                                                ip_end - next_emit);
   }
 
   return op;
@@ -583,14 +714,28 @@ static inline void Report(const char *algorithm, size_t compressed_size,
 //   bool TryFastAppend(const char* ip, size_t available, size_t length);
 // };
 
-
-
-
-
-
-
+static inline uint32 ExtractLowBytes(uint32 v, int n) {
+  assert(n >= 0);
+  assert(n <= 4);
+#if SNAPPY_HAVE_BMI2
+  return _bzhi_u32(v, 8 * n);
+#else
+  // This needs to be wider than uint32 otherwise `mask << 32` will be
+  // undefined.
+  uint64 mask = 0xffffffff;
+  return v & ~(mask << (8 * n));
+#endif
+}
 
-
+static inline bool LeftShiftOverflows(uint8 value, uint32 shift) {
+  assert(shift < 32);
+  static const uint8 masks[] = {
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
+      0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
+  return (value & masks[shift]) != 0;
+}
 
 // Helper class for decompression
 class SnappyDecompressor {
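ExtractLowBytes above keeps only the low n bytes of a 32-bit load, via BMI2's _bzhi_u32 where available and otherwise via a 64-bit mask (64-bit so that mask << 32 stays defined when n == 4). A standalone version of the portable branch:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Portable equivalent of the new ExtractLowBytes: keep the low n bytes of v.
    // The mask is 64-bit so that shifting it by 32 (n == 4) is well defined.
    static uint32_t KeepLowBytes(uint32_t v, int n) {
      assert(n >= 0 && n <= 4);
      uint64_t mask = 0xffffffff;
      return v & ~(mask << (8 * n));
    }

    int main() {
      assert(KeepLowBytes(0xaabbccdd, 0) == 0);
      assert(KeepLowBytes(0xaabbccdd, 1) == 0xdd);
      assert(KeepLowBytes(0xaabbccdd, 2) == 0xccdd);
      assert(KeepLowBytes(0xaabbccdd, 3) == 0xbbccdd);
      assert(KeepLowBytes(0xaabbccdd, 4) == 0xaabbccdd);
      std::puts("low-byte extraction matches the masking formula");
      return 0;
    }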
@@ -629,7 +774,7 @@ class SnappyDecompressor {
   }
 
   // Read the uncompressed length stored at the start of the compressed data.
-  // On
+  // On success, stores the length in *result and returns true.
   // On failure, returns false.
   bool ReadUncompressedLength(uint32* result) {
     assert(ip_ == NULL);       // Must not have read anything yet
@@ -644,7 +789,7 @@ class SnappyDecompressor {
       const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
       reader_->Skip(1);
      uint32 val = c & 0x7f;
-      if (((val
+      if (LeftShiftOverflows(static_cast<uint8>(val), shift)) return false;
      *result |= val << shift;
      if (c < 128) {
        break;
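The hunk above swaps a hand-rolled overflow test for LeftShiftOverflows() inside the loop that reads the varint-encoded uncompressed length. For reference, a self-contained little-endian varint32 decoder with the same shape; this is a sketch, not the gem's reader API:

    #include <cstdint>
    #include <cstdio>

    // Decode a little-endian base-128 varint32, as used for the uncompressed
    // length header. Returns the number of bytes consumed, or 0 on overflow or
    // truncated input. This mirrors the loop in ReadUncompressedLength.
    static size_t DecodeVarint32(const uint8_t* p, size_t n, uint32_t* result) {
      uint32_t value = 0;
      for (size_t i = 0; i < n && i < 5; ++i) {
        uint32_t shift = 7 * static_cast<uint32_t>(i);
        uint32_t val = p[i] & 0x7f;
        // In the fifth byte only 4 bits may be used, otherwise the shift
        // would push set bits past 32 bits.
        if (shift == 28 && val > 0x0f) return 0;
        value |= val << shift;
        if (p[i] < 128) {
          *result = value;
          return i + 1;
        }
      }
      return 0;  // truncated or longer than 5 bytes
    }

    int main() {
      const uint8_t encoded[] = {0xfe, 0xff, 0x07};  // encodes 131070
      uint32_t length = 0;
      size_t used = DecodeVarint32(encoded, sizeof(encoded), &length);
      std::printf("consumed %zu bytes, length = %u\n", used, length);
      return 0;
    }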
@@ -657,22 +802,27 @@ class SnappyDecompressor {
   // Process the next item found in the input.
   // Returns true if successful, false on error or end of input.
   template <class Writer>
+#if defined(__GNUC__) && defined(__x86_64__)
+  __attribute__((aligned(32)))
+#endif
   void DecompressAllTags(Writer* writer) {
-
-    //
-    //
-
-    //
-    //
-    //
-    //
-    //
-
-
-
-
-
+    // In x86, pad the function body to start 16 bytes later. This function has
+    // a couple of hotspots that are highly sensitive to alignment: we have
+    // observed regressions by more than 20% in some metrics just by moving the
+    // exact same code to a different position in the benchmark binary.
+    //
+    // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit
+    // the "lucky" case consistently. Unfortunately, this is a very brittle
+    // workaround, and future differences in code generation may reintroduce
+    // this regression. If you experience a big, difficult to explain, benchmark
+    // performance regression here, first try removing this hack.
+#if defined(__GNUC__) && defined(__x86_64__)
+    // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions.
+    asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+    asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+#endif
 
+    const char* ip = ip_;
     // We could have put this refill fragment only at the beginning of the loop.
     // However, duplicating it at the end of each branch gives the compiler more
     // scope to optimize the <ip_limit_ - ip> expression based on the local
@@ -685,13 +835,6 @@ class SnappyDecompressor {
     }
 
     MAYBE_REFILL();
-    // Add loop alignment directive. Without this directive, we observed
-    // significant performance degradation on several intel architectures
-    // in snappy benchmark built with LLVM. The degradation was caused by
-    // increased branch miss prediction.
-#if defined(__clang__) && defined(__x86_64__)
-    asm volatile (".p2align 5");
-#endif
     for ( ;; ) {
       const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
 
@@ -712,7 +855,7 @@ class SnappyDecompressor {
         if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
           assert(literal_length < 61);
           ip += literal_length;
-          // NOTE
+          // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
          // will not return true unless there's already at least five spare
          // bytes in addition to the literal.
          continue;
@@ -721,7 +864,8 @@ class SnappyDecompressor {
         // Long literal.
         const size_t literal_length_length = literal_length - 60;
         literal_length =
-            (LittleEndian::Load32(ip)
+            ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +
+            1;
         ip += literal_length_length;
       }
 
@@ -744,7 +888,8 @@ class SnappyDecompressor {
         MAYBE_REFILL();
       } else {
         const size_t entry = char_table[c];
-        const size_t trailer =
+        const size_t trailer =
+            ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11);
         const size_t length = entry & 0xff;
         ip += entry >> 11;
@@ -860,9 +1005,7 @@ size_t Compress(Source* reader, Sink* writer) {
   writer->Append(ulength, p-ulength);
   written += (p - ulength);
 
-  internal::WorkingMemory wmem;
-  char* scratch = NULL;
-  char* scratch_output = NULL;
+  internal::WorkingMemory wmem(N);
 
   while (N > 0) {
     // Get next block to compress (without copying if possible)
@@ -878,13 +1021,7 @@ size_t Compress(Source* reader, Sink* writer) {
       pending_advance = num_to_read;
       fragment_size = num_to_read;
     } else {
-
-      if (scratch == NULL) {
-        // If this is the last iteration, we want to allocate N bytes
-        // of space, otherwise the max possible kBlockSize space.
-        // num_to_read contains exactly the correct value
-        scratch = new char[num_to_read];
-      }
+      char* scratch = wmem.GetScratchInput();
       memcpy(scratch, fragment, bytes_read);
       reader->Skip(bytes_read);
 
@@ -910,16 +1047,13 @@ size_t Compress(Source* reader, Sink* writer) {
 
     // Need a scratch buffer for the output, in case the byte sink doesn't
     // have room for us directly.
-
-
-
-
-
-
-
-    char* dest = writer->GetAppendBuffer(max_output, scratch_output);
-    char* end = internal::CompressFragment(fragment, fragment_size,
-                                           dest, table, table_size);
+
+    // Since we encode kBlockSize regions followed by a region
+    // which is <= kBlockSize in length, a previously allocated
+    // scratch_output[] region is big enough for this iteration.
+    char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
+    char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
+                                           table_size);
     writer->Append(dest, end - dest);
     written += (end - dest);
 
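The hunks above replace the lazily allocated scratch and scratch_output buffers with a single WorkingMemory block that is carved into the hash table, the input scratch and the output scratch. A minimal sketch of that carve-one-allocation-into-regions pattern (sizes and names here are purely illustrative):

    #include <cstdint>
    #include <cstdio>
    #include <memory>

    // One allocation, three regions: a uint16 hash table, an input scratch area
    // and an output scratch area, laid out back to back as in WorkingMemory.
    struct ScratchArena {
      ScratchArena(size_t table_entries, size_t input_size, size_t output_size)
          : size_(table_entries * sizeof(uint16_t) + input_size + output_size),
            mem_(std::allocator<char>().allocate(size_)),
            table_(reinterpret_cast<uint16_t*>(mem_)),
            input_(mem_ + table_entries * sizeof(uint16_t)),
            output_(input_ + input_size) {}

      ~ScratchArena() { std::allocator<char>().deallocate(mem_, size_); }

      size_t size_;
      char* mem_;
      uint16_t* table_;
      char* input_;
      char* output_;
    };

    int main() {
      ScratchArena arena(/*table_entries=*/256, /*input_size=*/65536,
                         /*output_size=*/76490);
      std::printf("one block of %zu bytes, input at +%td, output at +%td\n",
                  arena.size_, arena.input_ - arena.mem_,
                  arena.output_ - arena.mem_);
      return 0;
    }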
@@ -929,9 +1063,6 @@ size_t Compress(Source* reader, Sink* writer) {
 
   Report("snappy_compress", written, uncompressed_size);
 
-  delete[] scratch;
-  delete[] scratch_output;
-
   return written;
 }
 
@@ -944,14 +1075,22 @@ size_t Compress(Source* reader, Sink* writer) {
 // Writer template argument to SnappyDecompressor::DecompressAllTags().
 class SnappyIOVecWriter {
  private:
+  // output_iov_end_ is set to iov + count and used to determine when
+  // the end of the iovs is reached.
+  const struct iovec* output_iov_end_;
+
+#if !defined(NDEBUG)
   const struct iovec* output_iov_;
-
+#endif  // !defined(NDEBUG)
+
+  // Current iov that is being written into.
+  const struct iovec* curr_iov_;
 
-  //
-
+  // Pointer to current iov's write location.
+  char* curr_iov_output_;
 
-  //
-  size_t
+  // Remaining bytes to write into curr_iov_output.
+  size_t curr_iov_remaining_;
 
   // Total bytes decompressed into output_iov_ so far.
   size_t total_written_;
@@ -959,22 +1098,24 @@ class SnappyIOVecWriter {
   // Maximum number of bytes that will be decompressed into output_iov_.
   size_t output_limit_;
 
-  inline char* GetIOVecPointer(
-    return reinterpret_cast<char*>(
-        offset;
+  static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
+    return reinterpret_cast<char*>(iov->iov_base) + offset;
   }
 
  public:
   // Does not take ownership of iov. iov must be valid during the
   // entire lifetime of the SnappyIOVecWriter.
   inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
-      :
-
-
-
+      : output_iov_end_(iov + iov_count),
+#if !defined(NDEBUG)
+        output_iov_(iov),
+#endif  // !defined(NDEBUG)
+        curr_iov_(iov),
+        curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
+                                   : nullptr),
+        curr_iov_remaining_(iov_count ? iov->iov_len : 0),
        total_written_(0),
-        output_limit_(-1) {
-  }
+        output_limit_(-1) {}
 
   inline void SetExpectedLength(size_t len) {
     output_limit_ = len;
@@ -989,23 +1130,25 @@ class SnappyIOVecWriter {
       return false;
     }
 
+    return AppendNoCheck(ip, len);
+  }
+
+  inline bool AppendNoCheck(const char* ip, size_t len) {
     while (len > 0) {
-
-      if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
+      if (curr_iov_remaining_ == 0) {
         // This iovec is full. Go to the next one.
-        if (
+        if (curr_iov_ + 1 >= output_iov_end_) {
           return false;
         }
-
-
+        ++curr_iov_;
+        curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+        curr_iov_remaining_ = curr_iov_->iov_len;
       }
 
-      const size_t to_write = std::min(
-
-
-
-          to_write);
-      curr_iov_written_ += to_write;
+      const size_t to_write = std::min(len, curr_iov_remaining_);
+      memcpy(curr_iov_output_, ip, to_write);
+      curr_iov_output_ += to_write;
+      curr_iov_remaining_ -= to_write;
       total_written_ += to_write;
      ip += to_write;
      len -= to_write;
@@ -1017,11 +1160,11 @@ class SnappyIOVecWriter {
   inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
     const size_t space_left = output_limit_ - total_written_;
     if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
-
+        curr_iov_remaining_ >= 16) {
       // Fast path, used for the majority (about 95%) of invocations.
-
-
-
+      UnalignedCopy128(ip, curr_iov_output_);
+      curr_iov_output_ += len;
+      curr_iov_remaining_ -= len;
       total_written_ += len;
       return true;
     }
@@ -1030,7 +1173,9 @@ class SnappyIOVecWriter {
   }
 
   inline bool AppendFromSelf(size_t offset, size_t len) {
-
+    // See SnappyArrayWriter::AppendFromSelf for an explanation of
+    // the "offset - 1u" trick.
+    if (offset - 1u >= total_written_) {
       return false;
     }
     const size_t space_left = output_limit_ - total_written_;
@@ -1039,8 +1184,8 @@ class SnappyIOVecWriter {
     }
 
     // Locate the iovec from which we need to start the copy.
-
-    size_t from_iov_offset =
+    const iovec* from_iov = curr_iov_;
+    size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
     while (offset > 0) {
       if (from_iov_offset >= offset) {
         from_iov_offset -= offset;
@@ -1048,47 +1193,47 @@ class SnappyIOVecWriter {
       }
 
       offset -= from_iov_offset;
-
-
-
+      --from_iov;
+#if !defined(NDEBUG)
+      assert(from_iov >= output_iov_);
+#endif  // !defined(NDEBUG)
+      from_iov_offset = from_iov->iov_len;
     }
 
     // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
     // the current iovec.
     while (len > 0) {
-      assert(
-      if (
-        const size_t to_copy =
-
-
-        Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
+      assert(from_iov <= curr_iov_);
+      if (from_iov != curr_iov_) {
+        const size_t to_copy =
+            std::min(from_iov->iov_len - from_iov_offset, len);
+        AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
         len -= to_copy;
         if (len > 0) {
-          ++
+          ++from_iov;
          from_iov_offset = 0;
        }
      } else {
-
-        size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
-                                  curr_iov_written_,
-                                  len);
+        size_t to_copy = curr_iov_remaining_;
        if (to_copy == 0) {
          // This iovec is full. Go to the next one.
-          if (
+          if (curr_iov_ + 1 >= output_iov_end_) {
            return false;
          }
-          ++
-
+          ++curr_iov_;
+          curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+          curr_iov_remaining_ = curr_iov_->iov_len;
          continue;
        }
        if (to_copy > len) {
          to_copy = len;
        }
-
-
-
-
-
+
+        IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
+                        curr_iov_output_, curr_iov_output_ + to_copy,
+                        curr_iov_output_ + curr_iov_remaining_);
+        curr_iov_output_ += to_copy;
+        curr_iov_remaining_ -= to_copy;
        from_iov_offset += to_copy;
        total_written_ += to_copy;
        len -= to_copy;
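The SnappyIOVecWriter changes above drop the index/offset pair in favour of a pointer into the current iovec plus a remaining-byte count. A small standalone sketch of appending a byte stream across a POSIX iovec array with that representation (error handling reduced to a bool; names are ours):

    #include <sys/uio.h>
    #include <algorithm>
    #include <cstdio>
    #include <cstring>

    // Append len bytes across an array of iovecs, tracking only a pointer to the
    // current write position and the bytes remaining in the current iovec.
    static bool AppendToIovecs(const char* ip, size_t len,
                               const struct iovec* iov, size_t iov_count) {
      const struct iovec* end = iov + iov_count;
      char* out = iov_count ? static_cast<char*>(iov->iov_base) : nullptr;
      size_t remaining = iov_count ? iov->iov_len : 0;
      while (len > 0) {
        if (remaining == 0) {
          if (iov + 1 >= end) return false;  // ran out of output space
          ++iov;
          out = static_cast<char*>(iov->iov_base);
          remaining = iov->iov_len;
        }
        const size_t to_write = std::min(len, remaining);
        std::memcpy(out, ip, to_write);
        out += to_write;
        remaining -= to_write;
        ip += to_write;
        len -= to_write;
      }
      return true;
    }

    int main() {
      char a[4], b[8];
      struct iovec iov[2] = {{a, sizeof(a)}, {b, sizeof(b)}};
      const char msg[] = "hello snappy";  // 12 bytes spread over both buffers
      bool ok = AppendToIovecs(msg, sizeof(msg) - 1, iov, 2);
      std::printf("%s: %.4s|%.8s\n", ok ? "ok" : "overflow", a, b);
      return 0;
    }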
@@ -1197,7 +1342,7 @@ bool RawUncompress(Source* compressed, char* uncompressed) {
   return InternalUncompress(compressed, &output);
 }
 
-bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
+bool Uncompress(const char* compressed, size_t n, std::string* uncompressed) {
   size_t ulength;
   if (!GetUncompressedLength(compressed, n, &ulength)) {
     return false;
@@ -1265,7 +1410,8 @@ void RawCompress(const char* input,
   *compressed_length = (writer.CurrentDestination() - compressed);
 }
 
-size_t Compress(const char* input, size_t input_length,
+size_t Compress(const char* input, size_t input_length,
+                std::string* compressed) {
   // Pre-grow the buffer to the max length of the compressed output
   STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
 
@@ -1512,4 +1658,4 @@ bool Uncompress(Source* compressed, Sink* uncompressed) {
 }
 }
 
-}
+}  // namespace snappy