snappy 0.1.0-java → 0.2.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/main.yml +34 -0
- data/.github/workflows/publish.yml +34 -0
- data/Gemfile +3 -4
- data/Rakefile +32 -30
- data/ext/api.c +6 -1
- data/lib/snappy.rb +5 -5
- data/lib/snappy/hadoop/reader.rb +6 -2
- data/lib/snappy/reader.rb +11 -7
- data/lib/snappy/shim.rb +1 -1
- data/lib/snappy/version.rb +1 -1
- data/snappy.gemspec +13 -9
- data/test/hadoop/snappy_hadoop_reader_test.rb +115 -0
- data/test/hadoop/snappy_hadoop_writer_test.rb +48 -0
- data/test/snappy_hadoop_test.rb +26 -0
- data/test/snappy_reader_test.rb +148 -0
- data/test/snappy_test.rb +95 -0
- data/test/snappy_writer_test.rb +55 -0
- data/test/test_helper.rb +7 -0
- data/vendor/snappy/CMakeLists.txt +177 -54
- data/vendor/snappy/NEWS +8 -0
- data/vendor/snappy/README.md +19 -20
- data/vendor/snappy/cmake/SnappyConfig.cmake.in +33 -0
- data/vendor/snappy/cmake/config.h.in +6 -6
- data/vendor/snappy/docs/README.md +72 -0
- data/vendor/snappy/snappy-internal.h +12 -5
- data/vendor/snappy/snappy-stubs-internal.cc +1 -1
- data/vendor/snappy/snappy-stubs-internal.h +60 -15
- data/vendor/snappy/snappy-stubs-public.h.in +16 -36
- data/vendor/snappy/snappy-test.cc +16 -15
- data/vendor/snappy/snappy-test.h +12 -60
- data/vendor/snappy/snappy.cc +333 -187
- data/vendor/snappy/snappy.h +14 -10
- data/vendor/snappy/snappy_compress_fuzzer.cc +59 -0
- data/vendor/snappy/snappy_uncompress_fuzzer.cc +57 -0
- data/vendor/snappy/snappy_unittest.cc +220 -124
- metadata +25 -18
- data/.travis.yml +0 -31
- data/smoke.sh +0 -8
- data/test/hadoop/test-snappy-hadoop-reader.rb +0 -103
- data/test/hadoop/test-snappy-hadoop-writer.rb +0 -48
- data/test/test-snappy-hadoop.rb +0 -22
- data/test/test-snappy-reader.rb +0 -129
- data/test/test-snappy-writer.rb +0 -55
- data/test/test-snappy.rb +0 -58
- data/vendor/snappy/cmake/SnappyConfig.cmake +0 -1
data/vendor/snappy/snappy-test.cc
CHANGED
@@ -48,12 +48,12 @@ DEFINE_bool(run_microbenchmarks, true,
 
 namespace snappy {
 
-string ReadTestDataFile(const string& base, size_t size_limit) {
-  string contents;
+std::string ReadTestDataFile(const std::string& base, size_t size_limit) {
+  std::string contents;
   const char* srcdir = getenv("srcdir");  // This is set by Automake.
-  string prefix;
+  std::string prefix;
   if (srcdir) {
-    prefix = string(srcdir) + "/";
+    prefix = std::string(srcdir) + "/";
   }
   file::GetContents(prefix + "testdata/" + base, &contents, file::Defaults()
                     ).CheckSuccess();
@@ -63,11 +63,11 @@ string ReadTestDataFile(const string& base, size_t size_limit) {
   return contents;
 }
 
-string ReadTestDataFile(const string& base) {
+std::string ReadTestDataFile(const std::string& base) {
   return ReadTestDataFile(base, 0);
 }
 
-string StringPrintf(const char* format, ...) {
+std::string StrFormat(const char* format, ...) {
   char buf[4096];
   va_list ap;
   va_start(ap, format);
@@ -79,7 +79,7 @@ string StringPrintf(const char* format, ...) {
 bool benchmark_running = false;
 int64 benchmark_real_time_us = 0;
 int64 benchmark_cpu_time_us = 0;
-string* benchmark_label = NULL;
+std::string* benchmark_label = nullptr;
 int64 benchmark_bytes_processed = 0;
 
 void ResetBenchmarkTiming() {
@@ -163,11 +163,11 @@ void StopBenchmarkTiming() {
   benchmark_running = false;
 }
 
-void SetBenchmarkLabel(const string& str) {
+void SetBenchmarkLabel(const std::string& str) {
   if (benchmark_label) {
     delete benchmark_label;
   }
-  benchmark_label = new string(str);
+  benchmark_label = new std::string(str);
 }
 
 void SetBenchmarkBytesProcessed(int64 bytes) {
@@ -217,8 +217,8 @@ void Benchmark::Run() {
     benchmark_runs[run].cpu_time_us = benchmark_cpu_time_us;
   }
 
-  string heading = StringPrintf("%s/%d", name_.c_str(), test_case_num);
-  string human_readable_speed;
+  std::string heading = StrFormat("%s/%d", name_.c_str(), test_case_num);
+  std::string human_readable_speed;
 
   std::nth_element(benchmark_runs,
                    benchmark_runs + kMedianPos,
@@ -232,15 +232,16 @@ void Benchmark::Run() {
     int64 bytes_per_second =
         benchmark_bytes_processed * 1000000 / cpu_time_us;
     if (bytes_per_second < 1024) {
-      human_readable_speed = StringPrintf("%dB/s", bytes_per_second);
+      human_readable_speed =
+          StrFormat("%dB/s", static_cast<int>(bytes_per_second));
     } else if (bytes_per_second < 1024 * 1024) {
-      human_readable_speed = StringPrintf(
+      human_readable_speed = StrFormat(
           "%.1fkB/s", bytes_per_second / 1024.0f);
     } else if (bytes_per_second < 1024 * 1024 * 1024) {
-      human_readable_speed = StringPrintf(
+      human_readable_speed = StrFormat(
           "%.1fMB/s", bytes_per_second / (1024.0f * 1024.0f));
     } else {
-      human_readable_speed = StringPrintf(
+      human_readable_speed = StrFormat(
          "%.1fGB/s", bytes_per_second / (1024.0f * 1024.0f * 1024.0f));
     }
   }
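The hunk above only renames the formatting helper; the thresholds themselves are unchanged. As a point of reference, here is a standalone sketch of the same power-of-1024 formatting rule. The `HumanReadableSpeed` name and the `main()` driver are illustrative, not part of snappy.

```cpp
// Illustrative re-implementation of the B/s -> kB/s -> MB/s -> GB/s thresholds
// used by Benchmark::Run() above; only the formatting rule is mirrored here.
#include <cstdint>
#include <cstdio>
#include <string>

std::string HumanReadableSpeed(int64_t bytes_per_second) {
  char buf[64];
  if (bytes_per_second < 1024) {
    std::snprintf(buf, sizeof(buf), "%dB/s", static_cast<int>(bytes_per_second));
  } else if (bytes_per_second < 1024 * 1024) {
    std::snprintf(buf, sizeof(buf), "%.1fkB/s", bytes_per_second / 1024.0f);
  } else if (bytes_per_second < 1024 * 1024 * 1024) {
    std::snprintf(buf, sizeof(buf), "%.1fMB/s",
                  bytes_per_second / (1024.0f * 1024.0f));
  } else {
    std::snprintf(buf, sizeof(buf), "%.1fGB/s",
                  bytes_per_second / (1024.0f * 1024.0f * 1024.0f));
  }
  return buf;
}

int main() {
  std::printf("%s\n", HumanReadableSpeed(3 << 20).c_str());  // prints "3.0MB/s"
  return 0;
}
```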
data/vendor/snappy/snappy-test.h
CHANGED
@@ -55,8 +55,6 @@
 #include <windows.h>
 #endif
 
-#include <string>
-
 #ifdef HAVE_GTEST
 
 #include <gtest/gtest.h>
@@ -169,7 +167,7 @@ namespace file {
 namespace snappy {
 
 #define FLAGS_test_random_seed 301
-typedef string TypeParam;
+using TypeParam = std::string;
 
 void Test_CorruptedTest_VerifyCorrupted();
 void Test_Snappy_SimpleTests();
@@ -183,63 +181,13 @@ void Test_Snappy_ReadPastEndOfBuffer();
 void Test_Snappy_FindMatchLength();
 void Test_Snappy_FindMatchLengthRandom();
 
-string ReadTestDataFile(const string& base, size_t size_limit);
+std::string ReadTestDataFile(const std::string& base, size_t size_limit);
 
-string ReadTestDataFile(const string& base);
+std::string ReadTestDataFile(const std::string& base);
 
 // A sprintf() variant that returns a std::string.
 // Not safe for general use due to truncation issues.
-string StringPrintf(const char* format, ...);
-
-// A simple, non-cryptographically-secure random generator.
-class ACMRandom {
- public:
-  explicit ACMRandom(uint32 seed) : seed_(seed) {}
-
-  int32 Next();
-
-  int32 Uniform(int32 n) {
-    return Next() % n;
-  }
-  uint8 Rand8() {
-    return static_cast<uint8>((Next() >> 1) & 0x000000ff);
-  }
-  bool OneIn(int X) { return Uniform(X) == 0; }
-
-  // Skewed: pick "base" uniformly from range [0,max_log] and then
-  // return "base" random bits. The effect is to pick a number in the
-  // range [0,2^max_log-1] with bias towards smaller numbers.
-  int32 Skewed(int max_log);
-
- private:
-  static const uint32 M = 2147483647L;   // 2^31-1
-  uint32 seed_;
-};
-
-inline int32 ACMRandom::Next() {
-  static const uint64 A = 16807;  // bits 14, 8, 7, 5, 2, 1, 0
-  // We are computing
-  //       seed_ = (seed_ * A) % M,    where M = 2^31-1
-  //
-  // seed_ must not be zero or M, or else all subsequent computed values
-  // will be zero or M respectively.  For all other values, seed_ will end
-  // up cycling through every number in [1,M-1]
-  uint64 product = seed_ * A;
-
-  // Compute (product % M) using the fact that ((x << 31) % M) == x.
-  seed_ = (product >> 31) + (product & M);
-  // The first reduction may overflow by 1 bit, so we may need to repeat.
-  // mod == M is not possible; using > allows the faster sign-bit-based test.
-  if (seed_ > M) {
-    seed_ -= M;
-  }
-  return seed_;
-}
-
-inline int32 ACMRandom::Skewed(int max_log) {
-  const int32 base = (Next() - 1) % (max_log+1);
-  return (Next() - 1) & ((1u << base)-1);
-}
+std::string StrFormat(const char* format, ...);
 
 // A wall-time clock. This stub is not super-accurate, nor resistant to the
 // system time changing.
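The ACMRandom class deleted above is a Park–Miller ("minimal standard") Lehmer generator: x → 16807·x mod (2³¹ − 1). The standard library ships the same recurrence as std::minstd_rand0, so equivalent behaviour can be rebuilt on top of it if a test still needs it. The wrapper below is a hedged sketch; the class and method names are illustrative, not from snappy.

```cpp
// Hedged sketch: std::minstd_rand0 implements the same 16807 * x mod (2^31 - 1)
// recurrence as the removed ACMRandom, so helpers such as Uniform()/OneIn()
// can be layered on top of it.
#include <cstdint>
#include <cstdio>
#include <random>

class LehmerRandom {
 public:
  explicit LehmerRandom(uint32_t seed) : rng_(seed) {}
  int32_t Next() { return static_cast<int32_t>(rng_()); }
  int32_t Uniform(int32_t n) { return Next() % n; }
  bool OneIn(int x) { return Uniform(x) == 0; }

 private:
  std::minstd_rand0 rng_;  // multiplier 16807, modulus 2^31 - 1
};

int main() {
  LehmerRandom rng(301);  // 301 matches FLAGS_test_random_seed in snappy-test.h
  std::printf("%d %d\n", rng.Uniform(100), rng.OneIn(2));
  return 0;
}
```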
@@ -293,8 +241,8 @@ typedef void (*BenchmarkFunction)(int, int);
 
 class Benchmark {
  public:
-  Benchmark(const string& name, BenchmarkFunction function) :
-      name_(name), function_(function) {}
+  Benchmark(const std::string& name, BenchmarkFunction function)
+      : name_(name), function_(function) {}
 
   Benchmark* DenseRange(int start, int stop) {
     start_ = start;
@@ -305,7 +253,7 @@ class Benchmark {
   void Run();
 
  private:
-  const string name_;
+  const std::string name_;
   const BenchmarkFunction function_;
   int start_, stop_;
 };
@@ -317,11 +265,13 @@ extern Benchmark* Benchmark_BM_UFlat;
 extern Benchmark* Benchmark_BM_UIOVec;
 extern Benchmark* Benchmark_BM_UValidate;
 extern Benchmark* Benchmark_BM_ZFlat;
+extern Benchmark* Benchmark_BM_ZFlatAll;
+extern Benchmark* Benchmark_BM_ZFlatIncreasingTableSize;
 
 void ResetBenchmarkTiming();
 void StartBenchmarkTiming();
 void StopBenchmarkTiming();
-void SetBenchmarkLabel(const string& str);
+void SetBenchmarkLabel(const std::string& str);
 void SetBenchmarkBytesProcessed(int64 bytes);
 
 #ifdef HAVE_LIBZ
@@ -468,6 +418,8 @@ static inline void RunSpecifiedBenchmarks() {
   snappy::Benchmark_BM_UIOVec->Run();
   snappy::Benchmark_BM_UValidate->Run();
   snappy::Benchmark_BM_ZFlat->Run();
+  snappy::Benchmark_BM_ZFlatAll->Run();
+  snappy::Benchmark_BM_ZFlatIncreasingTableSize->Run();
 
   fprintf(stderr, "\n");
 }
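For context on how the Benchmark declarations above fit together (name plus function, a DenseRange of arguments, then Run), here is a self-contained sketch of that registration pattern. It is an inference from the header, not snappy's implementation, and MiniBenchmark/BM_Example are made-up names.

```cpp
// Self-contained sketch (illustrative only) of the pattern implied by the
// header above: a benchmark owns a name and a BenchmarkFunction, DenseRange()
// selects the argument range, Run() invokes the function once per argument.
#include <cstdio>
#include <string>

typedef void (*BenchmarkFunction)(int, int);

class MiniBenchmark {
 public:
  MiniBenchmark(const std::string& name, BenchmarkFunction function)
      : name_(name), function_(function) {}
  MiniBenchmark* DenseRange(int start, int stop) {
    start_ = start;
    stop_ = stop;
    return this;
  }
  void Run() {
    for (int arg = start_; arg <= stop_; ++arg) function_(/*iters=*/100, arg);
  }

 private:
  const std::string name_;
  const BenchmarkFunction function_;
  int start_ = 0, stop_ = 0;
};

static void BM_Example(int iters, int arg) {
  std::printf("run %d iters with arg %d\n", iters, arg);
}

int main() {
  MiniBenchmark("BM_Example", BM_Example).DenseRange(0, 2)->Run();
  return 0;
}
```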
data/vendor/snappy/snappy.cc
CHANGED
@@ -30,25 +30,50 @@
 #include "snappy-internal.h"
 #include "snappy-sinksource.h"
 
-#ifndef SNAPPY_HAVE_SSE2
-#if defined(__SSE2__) || defined(_M_X64) || \
-    (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
-#define SNAPPY_HAVE_SSE2 1
+#if !defined(SNAPPY_HAVE_SSSE3)
+// __SSSE3__ is defined by GCC and Clang. Visual Studio doesn't target SIMD
+// support between SSE2 and AVX (so SSSE3 instructions require AVX support), and
+// defines __AVX__ when AVX support is available.
+#if defined(__SSSE3__) || defined(__AVX__)
+#define SNAPPY_HAVE_SSSE3 1
 #else
-#define SNAPPY_HAVE_SSE2 0
+#define SNAPPY_HAVE_SSSE3 0
 #endif
+#endif  // !defined(SNAPPY_HAVE_SSSE3)
+
+#if !defined(SNAPPY_HAVE_BMI2)
+// __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
+// specifically, but it does define __AVX2__ when AVX2 support is available.
+// Fortunately, AVX2 was introduced in Haswell, just like BMI2.
+//
+// BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
+// GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
+// case issuing BMI2 instructions results in a compiler error.
+#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
+#define SNAPPY_HAVE_BMI2 1
+#else
+#define SNAPPY_HAVE_BMI2 0
+#endif
+#endif  // !defined(SNAPPY_HAVE_BMI2)
+
+#if SNAPPY_HAVE_SSSE3
+// Please do not replace with <x86intrin.h>. or with headers that assume more
+// advanced SSE versions without checking with all the OWNERS.
+#include <tmmintrin.h>
 #endif
 
-#if SNAPPY_HAVE_SSE2
-#include <emmintrin.h>
+#if SNAPPY_HAVE_BMI2
+// Please do not replace with <x86intrin.h>. or with headers that assume more
+// advanced SSE versions without checking with all the OWNERS.
+#include <immintrin.h>
 #endif
+
 #include <stdio.h>
 
 #include <algorithm>
 #include <string>
 #include <vector>
 
-
 namespace snappy {
 
 using internal::COPY_1_BYTE_OFFSET;
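The new preprocessor block always leaves SNAPPY_HAVE_SSSE3 and SNAPPY_HAVE_BMI2 defined to 0 or 1, so downstream code can use plain `#if` rather than `#ifdef`. The translation unit below is a hedged sketch of that same detection pattern in isolation; the DEMO_* macro names are illustrative stand-ins, not snappy's.

```cpp
// Standalone sketch of the compile-time feature dispatch used above: each
// macro ends up defined to 0 or 1, so plain #if works everywhere downstream.
#include <cstdio>

#if !defined(DEMO_HAVE_SSSE3)
#if defined(__SSSE3__) || defined(__AVX__)
#define DEMO_HAVE_SSSE3 1
#else
#define DEMO_HAVE_SSSE3 0
#endif
#endif

#if !defined(DEMO_HAVE_BMI2)
#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
#define DEMO_HAVE_BMI2 1
#else
#define DEMO_HAVE_BMI2 0
#endif
#endif

int main() {
  // Reports which code paths this compiler invocation would enable.
  std::printf("SSSE3 path: %d, BMI2 path: %d\n", DEMO_HAVE_SSSE3, DEMO_HAVE_BMI2);
  return 0;
}
```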
@@ -103,16 +128,12 @@ void UnalignedCopy64(const void* src, void* dst) {
 }
 
 void UnalignedCopy128(const void* src, void* dst) {
-  // TODO(alkis): Remove this when we upgrade to a recent compiler that emits
-  // SSE2 moves for memcpy(dst, src, 16).
-#if SNAPPY_HAVE_SSE2
-  __m128i x = _mm_loadu_si128(static_cast<const __m128i*>(src));
-  _mm_storeu_si128(static_cast<__m128i*>(dst), x);
-#else
+  // memcpy gets vectorized when the appropriate compiler options are used.
+  // For example, x86 compilers targeting SSE2+ will optimize to an SSE2 load
+  // and store.
   char tmp[16];
   memcpy(tmp, src, 16);
   memcpy(dst, tmp, 16);
-#endif
 }
 
 // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
@@ -127,12 +148,35 @@ void UnalignedCopy128(const void* src, void* dst) {
 // Note that this does not match the semantics of either memcpy() or memmove().
 inline char* IncrementalCopySlow(const char* src, char* op,
                                  char* const op_limit) {
+  // TODO: Remove pragma when LLVM is aware this
+  // function is only called in cold regions and when cold regions don't get
+  // vectorized or unrolled.
+#ifdef __clang__
+#pragma clang loop unroll(disable)
+#endif
   while (op < op_limit) {
     *op++ = *src++;
   }
   return op_limit;
 }
 
+#if SNAPPY_HAVE_SSSE3
+
+// This is a table of shuffle control masks that can be used as the source
+// operand for PSHUFB to permute the contents of the destination XMM register
+// into a repeating byte pattern.
+alignas(16) const char pshufb_fill_patterns[7][16] = {
+  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+  {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
+  {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
+  {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
+  {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
+  {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
+  {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
+};
+
+#endif  // SNAPPY_HAVE_SSSE3
+
 // Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) but faster than
 // IncrementalCopySlow. buf_limit is the address past the end of the writable
 // region of the buffer.
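Each row of the new pshufb_fill_patterns table is simply the sequence i % k for pattern size k, so a PSHUFB with that mask replicates the first k source bytes across a 16-byte register. A portable scalar sketch of the same expansion (function and names are illustrative, not snappy's):

```cpp
// Portable sketch of what the pshufb_fill_patterns rows encode: entry i of the
// row for pattern size k is i % k, so shuffling the first k source bytes with
// that mask repeats the k-byte pattern across a 16-byte block.
#include <cstddef>
#include <cstdio>

void ExpandPattern(const char* src, size_t pattern_size, char out[16]) {
  for (size_t i = 0; i < 16; ++i) {
    out[i] = src[i % pattern_size];  // scalar equivalent of the PSHUFB step
  }
}

int main() {
  char block[17] = {0};
  ExpandPattern("abc", 3, block);
  std::printf("%s\n", block);  // prints "abcabcabcabcabca", as in the comment above
  return 0;
}
```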
@@ -144,9 +188,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   // pat  = op - src
   // len  = limit - op
   assert(src < op);
+  assert(op <= op_limit);
   assert(op_limit <= buf_limit);
   // NOTE: The compressor always emits 4 <= len <= 64. It is ok to assume that
-  // to optimize this function but we have to also handle
+  // to optimize this function but we have to also handle other cases in case
   // the input does not satisfy these conditions.
 
   size_t pattern_size = op - src;
@@ -176,16 +221,45 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 
   // Handle the uncommon case where pattern is less than 8 bytes.
   if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
-
-    //
-    //
+#if SNAPPY_HAVE_SSSE3
+    // Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
+    // to permute the register's contents in-place into a repeating sequence of
+    // the first "pattern_size" bytes.
+    // For example, suppose:
+    //    src       == "abc"
+    //    op        == op + 3
+    // After _mm_shuffle_epi8(), "pattern" will have five copies of "abc"
+    // followed by one byte of slop: abcabcabcabcabca.
     //
-    //
-    //
-    //
-
-
-
+    // The non-SSE fallback implementation suffers from store-forwarding stalls
+    // because its loads and stores partly overlap. By expanding the pattern
+    // in-place, we avoid the penalty.
+    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
+      const __m128i shuffle_mask = _mm_load_si128(
+          reinterpret_cast<const __m128i*>(pshufb_fill_patterns)
+          + pattern_size - 1);
+      const __m128i pattern = _mm_shuffle_epi8(
+          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
+      // Uninitialized bytes are masked out by the shuffle mask.
+      // TODO: remove annotation and macro defs once MSan is fixed.
+      SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
+      pattern_size *= 16 / pattern_size;
+      char* op_end = std::min(op_limit, buf_limit - 15);
+      while (op < op_end) {
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+        op += pattern_size;
+      }
+      if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
+    }
+    return IncrementalCopySlow(src, op, op_limit);
+#else   // !SNAPPY_HAVE_SSSE3
+    // If plenty of buffer space remains, expand the pattern to at least 8
+    // bytes. The way the following loop is written, we need 8 bytes of buffer
+    // space if pattern_size >= 4, 11 bytes if pattern_size is 1 or 3, and 10
+    // bytes if pattern_size is 2.  Precisely encoding that is probably not
+    // worthwhile; instead, invoke the slow path if we cannot write 11 bytes
+    // (because 11 are required in the worst case).
+    if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 11)) {
      while (pattern_size < 8) {
        UnalignedCopy64(src, op);
        op += pattern_size;
@@ -195,6 +269,7 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     } else {
       return IncrementalCopySlow(src, op, op_limit);
     }
+#endif  // SNAPPY_HAVE_SSSE3
   }
   assert(pattern_size >= 8);
 
@@ -202,13 +277,48 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
   // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
   // because expanding the pattern to at least 8 bytes guarantees that
   // op - src >= 8.
-
+  //
+  // Typically, the op_limit is the gating factor so try to simplify the loop
+  // based on that.
+  if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
+    // There is at least one, and at most four 16-byte blocks. Writing four
+    // conditionals instead of a loop allows FDO to layout the code with respect
+    // to the actual probabilities of each length.
+    // TODO: Replace with loop with trip count hint.
+    UnalignedCopy64(src, op);
+    UnalignedCopy64(src + 8, op + 8);
+
+    if (op + 16 < op_limit) {
+      UnalignedCopy64(src + 16, op + 16);
+      UnalignedCopy64(src + 24, op + 24);
+    }
+    if (op + 32 < op_limit) {
+      UnalignedCopy64(src + 32, op + 32);
+      UnalignedCopy64(src + 40, op + 40);
+    }
+    if (op + 48 < op_limit) {
+      UnalignedCopy64(src + 48, op + 48);
+      UnalignedCopy64(src + 56, op + 56);
+    }
+    return op_limit;
+  }
+
+  // Fall back to doing as much as we can with the available slop in the
+  // buffer. This code path is relatively cold however so we save code size by
+  // avoiding unrolling and vectorizing.
+  //
+  // TODO: Remove pragma when when cold regions don't get vectorized
+  // or unrolled.
+#ifdef __clang__
+#pragma clang loop unroll(disable)
+#endif
+  for (char *op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
     UnalignedCopy64(src, op);
     UnalignedCopy64(src + 8, op + 8);
-    src += 16;
-    op += 16;
-    if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
   }
+  if (op >= op_limit)
+    return op_limit;
+
   // We only take this branch if we didn't have enough slop and we can do a
   // single 8 byte copy.
   if (SNAPPY_PREDICT_FALSE(op <= buf_limit - 8)) {
@@ -221,10 +331,10 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
 
 }  // namespace
 
+template <bool allow_fast_path>
 static inline char* EmitLiteral(char* op,
                                 const char* literal,
-                                int len,
-                                bool allow_fast_path) {
+                                int len) {
   // The vast majority of copies are below 16 bytes, for which a
   // call to memcpy is overkill. This fast path can sometimes
   // copy up to 15 bytes too much, but that is okay in the
@@ -249,25 +359,23 @@ static inline char* EmitLiteral(char* op,
     // Fits in tag byte
     *op++ = LITERAL | (n << 2);
   } else {
-
-    char* base = op;
-    int count = 0;
-    op++;
-    while (n > 0) {
-      *op++ = n & 0xff;
-      n >>= 8;
-      count++;
-    }
+    int count = (Bits::Log2Floor(n) >> 3) + 1;
     assert(count >= 1);
     assert(count <= 4);
-    *base = LITERAL | ((59 + count) << 2);
+    *op++ = LITERAL | ((59 + count) << 2);
+    // Encode in upcoming bytes.
+    // Write 4 bytes, though we may care about only 1 of them. The output buffer
+    // is guaranteed to have at least 3 more spaces left as 'len >= 61' holds
+    // here and there is a memcpy of size 'len' below.
+    LittleEndian::Store32(op, n);
+    op += count;
   }
   memcpy(op, literal, len);
   return op + len;
 }
 
-static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
-                                     bool len_less_than_12) {
+template <bool len_less_than_12>
+static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) {
   assert(len <= 64);
   assert(len >= 4);
   assert(offset < 65536);
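In the new long-literal branch of EmitLiteral, `count` is just the number of bytes needed to hold n = len − 1, and the tag byte stores 59 + count in its upper six bits. A small sketch of that arithmetic (the tiny Log2Floor stand-in and the driver are illustrative, not snappy code):

```cpp
// Sketch of the long-literal tag math used above: for n = len - 1 >= 60,
// count = (Log2Floor(n) >> 3) + 1 is the number of bytes needed to hold n,
// and the tag byte encodes 59 + count (LITERAL == 0 in the snappy format).
#include <cassert>
#include <cstdint>
#include <cstdio>

int Log2Floor(uint32_t v) {  // stand-in for snappy's Bits::Log2Floor
  int r = -1;
  while (v) { ++r; v >>= 1; }
  return r;
}

int main() {
  const uint8_t kLiteral = 0x00;
  for (uint32_t n : {60u, 255u, 256u, 65535u, 1u << 20}) {
    int count = (Log2Floor(n) >> 3) + 1;  // 1..4 bytes needed for n
    assert(count >= 1 && count <= 4);
    uint8_t tag = kLiteral | ((59 + count) << 2);
    std::printf("n=%u -> count=%d, tag=0x%02x\n", n, count, tag);
  }
  return 0;
}
```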
@@ -288,29 +396,33 @@ static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
   return op;
 }
 
-static inline char* EmitCopy(char* op, size_t offset, size_t len,
-                             bool len_less_than_12) {
+template <bool len_less_than_12>
+static inline char* EmitCopy(char* op, size_t offset, size_t len) {
   assert(len_less_than_12 == (len < 12));
   if (len_less_than_12) {
-    return EmitCopyAtMost64(op, offset, len, true);
+    return EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
   } else {
     // A special case for len <= 64 might help, but so far measurements suggest
     // it's in the noise.
 
     // Emit 64 byte copies but make sure to keep at least four bytes reserved.
     while (SNAPPY_PREDICT_FALSE(len >= 68)) {
-      op = EmitCopyAtMost64(op, offset, 64, false);
+      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 64);
       len -= 64;
     }
 
     // One or two copies will now finish the job.
     if (len > 64) {
-      op = EmitCopyAtMost64(op, offset, 60, false);
+      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, 60);
       len -= 60;
     }
 
     // Emit remainder.
-    op = EmitCopyAtMost64(op, offset, len, len < 12);
+    if (len < 12) {
+      op = EmitCopyAtMost64</*len_less_than_12=*/true>(op, offset, len);
+    } else {
+      op = EmitCopyAtMost64</*len_less_than_12=*/false>(op, offset, len);
+    }
     return op;
   }
 }
@@ -326,31 +438,45 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
   }
 }
 
-namespace internal {
-uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
-  // Use smaller hash table when input.size() is smaller, since we
-  // fill the table, incurring O(hash table size) overhead for
-  // compression, and if the input is short, we won't need that
-  // many hash table entries anyway.
-  assert(kMaxHashTableSize >= 256);
-  size_t htsize = 256;
-  while (htsize < kMaxHashTableSize && htsize < input_size) {
-    htsize <<= 1;
+namespace {
+uint32 CalculateTableSize(uint32 input_size) {
+  static_assert(
+      kMaxHashTableSize >= kMinHashTableSize,
+      "kMaxHashTableSize should be greater or equal to kMinHashTableSize.");
+  if (input_size > kMaxHashTableSize) {
+    return kMaxHashTableSize;
   }
-
-  uint16* table;
-  if (htsize <= ARRAYSIZE(small_table_)) {
-    table = small_table_;
-  } else {
-    if (large_table_ == NULL) {
-      large_table_ = new uint16[kMaxHashTableSize];
-    }
-    table = large_table_;
+  if (input_size < kMinHashTableSize) {
+    return kMinHashTableSize;
   }
+  // This is equivalent to Log2Ceiling(input_size), assuming input_size > 1.
+  // 2 << Log2Floor(x - 1) is equivalent to 1 << (1 + Log2Floor(x - 1)).
+  return 2u << Bits::Log2Floor(input_size - 1);
+}
+}  // namespace
 
+namespace internal {
+WorkingMemory::WorkingMemory(size_t input_size) {
+  const size_t max_fragment_size = std::min(input_size, kBlockSize);
+  const size_t table_size = CalculateTableSize(max_fragment_size);
+  size_ = table_size * sizeof(*table_) + max_fragment_size +
+          MaxCompressedLength(max_fragment_size);
+  mem_ = std::allocator<char>().allocate(size_);
+  table_ = reinterpret_cast<uint16*>(mem_);
+  input_ = mem_ + table_size * sizeof(*table_);
+  output_ = input_ + max_fragment_size;
+}
+
+WorkingMemory::~WorkingMemory() {
+  std::allocator<char>().deallocate(mem_, size_);
+}
+
+uint16* WorkingMemory::GetHashTable(size_t fragment_size,
+                                    int* table_size) const {
+  const size_t htsize = CalculateTableSize(fragment_size);
+  memset(table_, 0, htsize * sizeof(*table_));
   *table_size = htsize;
-  memset(table, 0, htsize * sizeof(*table));
-  return table;
+  return table_;
 }
 }  // end namespace internal
 
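The new CalculateTableSize clamps the input size into [kMinHashTableSize, kMaxHashTableSize] and otherwise rounds it up to the next power of two via `2 << Log2Floor(x - 1)`. A standalone sketch of that rule, assuming illustrative constants (256 and 1 << 14) rather than snappy's actual values:

```cpp
// Sketch of the table-size rule used by CalculateTableSize() above: clamp to
// [kMinHashTableSize, kMaxHashTableSize], else round up to a power of two.
// The constants here are assumptions for the example, not copied from snappy.
#include <cassert>
#include <cstdint>
#include <cstdio>

constexpr uint32_t kMinHashTableSize = 1 << 8;
constexpr uint32_t kMaxHashTableSize = 1 << 14;

int Log2Floor(uint32_t v) {
  int r = -1;
  while (v) { ++r; v >>= 1; }
  return r;
}

uint32_t CalculateTableSize(uint32_t input_size) {
  if (input_size > kMaxHashTableSize) return kMaxHashTableSize;
  if (input_size < kMinHashTableSize) return kMinHashTableSize;
  // 2 << Log2Floor(x - 1) is the smallest power of two >= x (for x > 1).
  return 2u << Log2Floor(input_size - 1);
}

int main() {
  assert(CalculateTableSize(300) == 512);
  assert(CalculateTableSize(4096) == 4096);
  std::printf("%u %u\n", CalculateTableSize(100), CalculateTableSize(1u << 20));
  return 0;
}
```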
@@ -417,7 +543,7 @@ char* CompressFragment(const char* input,
   // "ip" is the input pointer, and "op" is the output pointer.
   const char* ip = input;
   assert(input_size <= kBlockSize);
-  assert((table_size & (table_size - 1)) == 0);
+  assert((table_size & (table_size - 1)) == 0);  // table must be power of two
   const int shift = 32 - Bits::Log2Floor(table_size);
   assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
   const char* ip_end = input + input_size;
@@ -484,7 +610,7 @@ char* CompressFragment(const char* input,
       // than 4 bytes match. But, prior to the match, input
       // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
       assert(next_emit + 16 <= ip_end);
-      op = EmitLiteral(op, next_emit, ip - next_emit, true);
+      op = EmitLiteral</*allow_fast_path=*/true>(op, next_emit, ip - next_emit);
 
       // Step 3: Call EmitCopy, and then see if another EmitCopy could
       // be our next move. Repeat until we find no match for the
@@ -507,7 +633,11 @@ char* CompressFragment(const char* input,
         ip += matched;
         size_t offset = base - candidate;
         assert(0 == memcmp(base, candidate, matched));
-        op = EmitCopy(op, offset, matched, p.second);
+        if (p.second) {
+          op = EmitCopy</*len_less_than_12=*/true>(op, offset, matched);
+        } else {
+          op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
+        }
         next_emit = ip;
         if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
           goto emit_remainder;
@@ -532,7 +662,8 @@ char* CompressFragment(const char* input,
 emit_remainder:
   // Emit the remaining bytes as a literal
   if (next_emit < ip_end) {
-    op = EmitLiteral(op, next_emit, ip_end - next_emit, false);
+    op = EmitLiteral</*allow_fast_path=*/false>(op, next_emit,
+                                                ip_end - next_emit);
   }
 
   return op;
@@ -583,14 +714,28 @@ static inline void Report(const char *algorithm, size_t compressed_size,
 //   bool TryFastAppend(const char* ip, size_t available, size_t length);
 // };
 
-namespace internal {
-
-// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
-static const uint32 wordmask[] = {
-  0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
-};
+static inline uint32 ExtractLowBytes(uint32 v, int n) {
+  assert(n >= 0);
+  assert(n <= 4);
+#if SNAPPY_HAVE_BMI2
+  return _bzhi_u32(v, 8 * n);
+#else
+  // This needs to be wider than uint32 otherwise `mask << 32` will be
+  // undefined.
+  uint64 mask = 0xffffffff;
+  return v & ~(mask << (8 * n));
+#endif
+}
 
-}  // end namespace internal
+static inline bool LeftShiftOverflows(uint8 value, uint32 shift) {
+  assert(shift < 32);
+  static const uint8 masks[] = {
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
+      0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
+  return (value & masks[shift]) != 0;
+}
 
 // Helper class for decompression
 class SnappyDecompressor {
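ExtractLowBytes keeps only the low n bytes of a 32-bit word (the BMI2 path does the same thing in a single _bzhi_u32 instruction), and LeftShiftOverflows asks whether shifting an 8-bit value left by `shift` would drop set bits. The sketch below restates both helpers portably and checks them; it is illustrative and not the vendored code.

```cpp
// Portable sketch of the two helpers added above, plus a couple of checks.
#include <cassert>
#include <cstdint>

uint32_t ExtractLowBytes(uint32_t v, int n) {
  assert(n >= 0 && n <= 4);
  uint64_t mask = 0xffffffffu;  // wider than 32 bits so mask << 32 is defined
  return v & ~(mask << (8 * n));
}

bool LeftShiftOverflows(uint8_t value, uint32_t shift) {
  assert(shift < 32);
  // Equivalent to the mask-table test: only shifts above 24 can lose bits.
  return shift > 24 && (value >> (32 - shift)) != 0;
}

int main() {
  assert(ExtractLowBytes(0xdeadbeef, 2) == 0xbeef);
  assert(ExtractLowBytes(0xdeadbeef, 0) == 0);
  assert(!LeftShiftOverflows(0x7f, 24));
  assert(LeftShiftOverflows(0x80, 25));
  return 0;
}
```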
@@ -629,7 +774,7 @@ class SnappyDecompressor {
   }
 
   // Read the uncompressed length stored at the start of the compressed data.
-  // On succcess, stores the length in *result and returns true.
+  // On success, stores the length in *result and returns true.
   // On failure, returns false.
   bool ReadUncompressedLength(uint32* result) {
     assert(ip_ == NULL);       // Must not have read anything yet
@@ -644,7 +789,7 @@ class SnappyDecompressor {
       const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
       reader_->Skip(1);
       uint32 val = c & 0x7f;
-      if (((val << shift) >> shift) != val) return false;
+      if (LeftShiftOverflows(static_cast<uint8>(val), shift)) return false;
       *result |= val << shift;
       if (c < 128) {
         break;
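ReadUncompressedLength is a little-endian base-128 varint decoder with an overflow guard; the change above only swaps the guard for the LeftShiftOverflows helper. A hedged standalone sketch of the same loop (names and the 5-byte limit below are assumptions for the example):

```cpp
// Sketch of the varint loop ReadUncompressedLength performs: 7 bits per byte,
// little-endian, rejecting values that would not fit in 32 bits.
#include <cstddef>
#include <cstdint>
#include <cstdio>

bool ParseUncompressedLength(const uint8_t* p, size_t n, uint32_t* result) {
  *result = 0;
  for (uint32_t shift = 0; shift <= 28 && n > 0; shift += 7, ++p, --n) {
    uint32_t val = *p & 0x7f;
    if (((val << shift) >> shift) != val) return false;  // dropped bits: overflow
    *result |= val << shift;
    if (*p < 128) return true;  // high bit clear: final byte of the varint
  }
  return false;  // ran out of input or more than five bytes
}

int main() {
  const uint8_t encoded[] = {0xE5, 0x8E, 0x26};  // classic varint for 624485
  uint32_t len = 0;
  if (ParseUncompressedLength(encoded, sizeof(encoded), &len))
    std::printf("uncompressed length = %u\n", len);
  return 0;
}
```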
@@ -657,22 +802,27 @@ class SnappyDecompressor {
   // Process the next item found in the input.
   // Returns true if successful, false on error or end of input.
   template <class Writer>
+#if defined(__GNUC__) && defined(__x86_64__)
+  __attribute__((aligned(32)))
+#endif
   void DecompressAllTags(Writer* writer) {
-    const char* ip = ip_;
-    // For position-independent executables, accessing global arrays can be
-    // slow. Move wordmask array onto the stack to mitigate this.
-    uint32 wordmask[sizeof(internal::wordmask)/sizeof(uint32)];
-    // Do not use memcpy to copy internal::wordmask to
-    // wordmask. LLVM converts stack arrays to global arrays if it detects
-    // const stack arrays and this hurts the performance of position
-    // independent code. This change is temporary and can be reverted when
-    // https://reviews.llvm.org/D30759 is approved.
-    wordmask[0] = internal::wordmask[0];
-    wordmask[1] = internal::wordmask[1];
-    wordmask[2] = internal::wordmask[2];
-    wordmask[3] = internal::wordmask[3];
-    wordmask[4] = internal::wordmask[4];
+    // In x86, pad the function body to start 16 bytes later. This function has
+    // a couple of hotspots that are highly sensitive to alignment: we have
+    // observed regressions by more than 20% in some metrics just by moving the
+    // exact same code to a different position in the benchmark binary.
+    //
+    // Putting this code on a 32-byte-aligned boundary + 16 bytes makes us hit
+    // the "lucky" case consistently. Unfortunately, this is a very brittle
+    // workaround, and future differences in code generation may reintroduce
+    // this regression. If you experience a big, difficult to explain, benchmark
+    // performance regression here, first try removing this hack.
+#if defined(__GNUC__) && defined(__x86_64__)
+    // Two 8-byte "NOP DWORD ptr [EAX + EAX*1 + 00000000H]" instructions.
+    asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+    asm(".byte 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
+#endif
 
+    const char* ip = ip_;
     // We could have put this refill fragment only at the beginning of the loop.
     // However, duplicating it at the end of each branch gives the compiler more
     // scope to optimize the <ip_limit_ - ip> expression based on the local
@@ -685,13 +835,6 @@ class SnappyDecompressor {
     }
 
     MAYBE_REFILL();
-    // Add loop alignment directive. Without this directive, we observed
-    // significant performance degradation on several intel architectures
-    // in snappy benchmark built with LLVM. The degradation was caused by
-    // increased branch miss prediction.
-#if defined(__clang__) && defined(__x86_64__)
-    asm volatile (".p2align 5");
-#endif
     for ( ;; ) {
       const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
 
@@ -712,7 +855,7 @@ class SnappyDecompressor {
       if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
         assert(literal_length < 61);
         ip += literal_length;
-        // NOTE(user): There is no MAYBE_REFILL() here, as TryFastAppend()
+        // NOTE: There is no MAYBE_REFILL() here, as TryFastAppend()
         // will not return true unless there's already at least five spare
         // bytes in addition to the literal.
         continue;
@@ -721,7 +864,8 @@ class SnappyDecompressor {
         // Long literal.
         const size_t literal_length_length = literal_length - 60;
         literal_length =
-            (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
+            ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +
+            1;
         ip += literal_length_length;
       }
 
@@ -744,7 +888,8 @@ class SnappyDecompressor {
       MAYBE_REFILL();
     } else {
       const size_t entry = char_table[c];
-      const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+      const size_t trailer =
+          ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11);
       const size_t length = entry & 0xff;
       ip += entry >> 11;
 
@@ -860,9 +1005,7 @@ size_t Compress(Source* reader, Sink* writer) {
   writer->Append(ulength, p-ulength);
   written += (p - ulength);
 
-  internal::WorkingMemory wmem;
-  char* scratch = NULL;
-  char* scratch_output = NULL;
+  internal::WorkingMemory wmem(N);
 
   while (N > 0) {
     // Get next block to compress (without copying if possible)
@@ -878,13 +1021,7 @@ size_t Compress(Source* reader, Sink* writer) {
       pending_advance = num_to_read;
       fragment_size = num_to_read;
     } else {
-
-      if (scratch == NULL) {
-        // If this is the last iteration, we want to allocate N bytes
-        // of space, otherwise the max possible kBlockSize space.
-        // num_to_read contains exactly the correct value
-        scratch = new char[num_to_read];
-      }
+      char* scratch = wmem.GetScratchInput();
       memcpy(scratch, fragment, bytes_read);
       reader->Skip(bytes_read);
 
@@ -910,16 +1047,13 @@ size_t Compress(Source* reader, Sink* writer) {
 
     // Need a scratch buffer for the output, in case the byte sink doesn't
    // have room for us directly.
-    if (scratch_output == NULL) {
-      scratch_output = new char[max_output];
-    } else {
-      // Since we encode kBlockSize regions followed by a region
-      // which is <= kBlockSize in length, a previously allocated
-      // scratch_output[] region is big enough for this iteration.
-    }
-    char* dest = writer->GetAppendBuffer(max_output, scratch_output);
-    char* end = internal::CompressFragment(fragment, fragment_size,
-                                           dest, table, table_size);
+
+    // Since we encode kBlockSize regions followed by a region
+    // which is <= kBlockSize in length, a previously allocated
+    // scratch_output[] region is big enough for this iteration.
+    char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
+    char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
+                                           table_size);
     writer->Append(dest, end - dest);
     written += (end - dest);
 
@@ -929,9 +1063,6 @@ size_t Compress(Source* reader, Sink* writer) {
 
   Report("snappy_compress", written, uncompressed_size);
 
-  delete[] scratch;
-  delete[] scratch_output;
-
   return written;
 }
 
@@ -944,14 +1075,22 @@ size_t Compress(Source* reader, Sink* writer) {
 // Writer template argument to SnappyDecompressor::DecompressAllTags().
 class SnappyIOVecWriter {
  private:
+  // output_iov_end_ is set to iov + count and used to determine when
+  // the end of the iovs is reached.
+  const struct iovec* output_iov_end_;
+
+#if !defined(NDEBUG)
   const struct iovec* output_iov_;
-  const size_t output_iov_count_;
+#endif  // !defined(NDEBUG)
+
+  // Current iov that is being written into.
+  const struct iovec* curr_iov_;
 
-  // We are currently writing into output_iov_[curr_iov_index_].
-  size_t curr_iov_index_;
+  // Pointer to current iov's write location.
+  char* curr_iov_output_;
 
-  // Bytes written to output_iov_[curr_iov_index_] so far.
-  size_t curr_iov_written_;
+  // Remaining bytes to write into curr_iov_output.
+  size_t curr_iov_remaining_;
 
   // Total bytes decompressed into output_iov_ so far.
   size_t total_written_;
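The reworked SnappyIOVecWriter replaces index/offset bookkeeping with a current-iovec pointer, a write cursor, and a bytes-remaining counter. The sketch below shows that bookkeeping in isolation: scatter a buffer across an iovec array, advancing to the next iovec whenever the current one fills up. It is a minimal illustration (POSIX `<sys/uio.h>`), not the class from the diff.

```cpp
// Minimal sketch of the curr-iov / write-pointer / remaining-bytes pattern.
#include <sys/uio.h>
#include <algorithm>
#include <cstdio>
#include <cstring>

bool ScatterAppend(const char* src, size_t len, const iovec* iov, size_t iov_count) {
  const iovec* end = iov + iov_count;
  const iovec* curr = iov;
  char* out = iov_count ? static_cast<char*>(curr->iov_base) : nullptr;
  size_t remaining = iov_count ? curr->iov_len : 0;
  while (len > 0) {
    if (remaining == 0) {                 // current iovec is full, advance
      if (curr + 1 >= end) return false;  // ran out of output space
      ++curr;
      out = static_cast<char*>(curr->iov_base);
      remaining = curr->iov_len;
    }
    const size_t n = std::min(len, remaining);
    std::memcpy(out, src, n);
    out += n; remaining -= n; src += n; len -= n;
  }
  return true;
}

int main() {
  char a[4], b[8];
  iovec iov[2] = {{a, sizeof(a)}, {b, sizeof(b)}};
  if (ScatterAppend("snappy-0.2.0", 12, iov, 2))
    std::printf("%.4s|%.8s\n", a, b);  // prints "snap|py-0.2.0"
  return 0;
}
```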
@@ -959,22 +1098,24 @@ class SnappyIOVecWriter {
   // Maximum number of bytes that will be decompressed into output_iov_.
   size_t output_limit_;
 
-  inline char* GetIOVecPointer(size_t index, size_t offset) {
-    return reinterpret_cast<char*>(output_iov_[index].iov_base) +
-        offset;
+  static inline char* GetIOVecPointer(const struct iovec* iov, size_t offset) {
+    return reinterpret_cast<char*>(iov->iov_base) + offset;
   }
 
  public:
   // Does not take ownership of iov. iov must be valid during the
   // entire lifetime of the SnappyIOVecWriter.
   inline SnappyIOVecWriter(const struct iovec* iov, size_t iov_count)
-      : output_iov_(iov),
-        output_iov_count_(iov_count),
-        curr_iov_index_(0),
-        curr_iov_written_(0),
+      : output_iov_end_(iov + iov_count),
+#if !defined(NDEBUG)
+        output_iov_(iov),
+#endif  // !defined(NDEBUG)
+        curr_iov_(iov),
+        curr_iov_output_(iov_count ? reinterpret_cast<char*>(iov->iov_base)
+                                   : nullptr),
+        curr_iov_remaining_(iov_count ? iov->iov_len : 0),
         total_written_(0),
-        output_limit_(-1) {
-  }
+        output_limit_(-1) {}
 
   inline void SetExpectedLength(size_t len) {
     output_limit_ = len;
@@ -989,23 +1130,25 @@ class SnappyIOVecWriter {
       return false;
     }
 
+    return AppendNoCheck(ip, len);
+  }
+
+  inline bool AppendNoCheck(const char* ip, size_t len) {
     while (len > 0) {
-      assert(curr_iov_written_ <= output_iov_[curr_iov_index_].iov_len);
-      if (curr_iov_written_ >= output_iov_[curr_iov_index_].iov_len) {
+      if (curr_iov_remaining_ == 0) {
         // This iovec is full. Go to the next one.
-        if (curr_iov_index_ + 1 >= output_iov_count_) {
+        if (curr_iov_ + 1 >= output_iov_end_) {
           return false;
         }
-        curr_iov_written_ = 0;
-        ++curr_iov_index_;
+        ++curr_iov_;
+        curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+        curr_iov_remaining_ = curr_iov_->iov_len;
       }
 
-      const size_t to_write = std::min(
-          len, output_iov_[curr_iov_index_].iov_len - curr_iov_written_);
-      memcpy(GetIOVecPointer(curr_iov_index_, curr_iov_written_),
-             ip,
-             to_write);
-      curr_iov_written_ += to_write;
+      const size_t to_write = std::min(len, curr_iov_remaining_);
+      memcpy(curr_iov_output_, ip, to_write);
+      curr_iov_output_ += to_write;
+      curr_iov_remaining_ -= to_write;
       total_written_ += to_write;
       ip += to_write;
       len -= to_write;
@@ -1017,11 +1160,11 @@ class SnappyIOVecWriter {
   inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
     const size_t space_left = output_limit_ - total_written_;
     if (len <= 16 && available >= 16 + kMaximumTagLength && space_left >= 16 &&
-        output_iov_[curr_iov_index_].iov_len - curr_iov_written_ >= 16) {
+        curr_iov_remaining_ >= 16) {
       // Fast path, used for the majority (about 95%) of invocations.
-      char* ptr = GetIOVecPointer(curr_iov_index_, curr_iov_written_);
-      UnalignedCopy128(ip, ptr);
-      curr_iov_written_ += len;
+      UnalignedCopy128(ip, curr_iov_output_);
+      curr_iov_output_ += len;
+      curr_iov_remaining_ -= len;
       total_written_ += len;
       return true;
     }
@@ -1030,7 +1173,9 @@ class SnappyIOVecWriter {
   }
 
   inline bool AppendFromSelf(size_t offset, size_t len) {
-    if (offset > total_written_ || offset == 0) {
+    // See SnappyArrayWriter::AppendFromSelf for an explanation of
+    // the "offset - 1u" trick.
+    if (offset - 1u >= total_written_) {
      return false;
    }
     const size_t space_left = output_limit_ - total_written_;
@@ -1039,8 +1184,8 @@ class SnappyIOVecWriter {
     }
 
     // Locate the iovec from which we need to start the copy.
-    size_t from_iov_index = curr_iov_index_;
-    size_t from_iov_offset = curr_iov_written_;
+    const iovec* from_iov = curr_iov_;
+    size_t from_iov_offset = curr_iov_->iov_len - curr_iov_remaining_;
     while (offset > 0) {
       if (from_iov_offset >= offset) {
         from_iov_offset -= offset;
@@ -1048,47 +1193,47 @@ class SnappyIOVecWriter {
       }
 
       offset -= from_iov_offset;
-      assert(from_iov_index > 0);
-      --from_iov_index;
-      from_iov_offset = output_iov_[from_iov_index].iov_len;
+      --from_iov;
+#if !defined(NDEBUG)
+      assert(from_iov >= output_iov_);
+#endif  // !defined(NDEBUG)
+      from_iov_offset = from_iov->iov_len;
     }
 
     // Copy <len> bytes starting from the iovec pointed to by from_iov_index to
     // the current iovec.
     while (len > 0) {
-      assert(from_iov_index <= curr_iov_index_);
-      if (from_iov_index != curr_iov_index_) {
-        const size_t to_copy = std::min(
-            output_iov_[from_iov_index].iov_len - from_iov_offset,
-            len);
-        Append(GetIOVecPointer(from_iov_index, from_iov_offset), to_copy);
+      assert(from_iov <= curr_iov_);
+      if (from_iov != curr_iov_) {
+        const size_t to_copy =
+            std::min(from_iov->iov_len - from_iov_offset, len);
+        AppendNoCheck(GetIOVecPointer(from_iov, from_iov_offset), to_copy);
         len -= to_copy;
         if (len > 0) {
-          ++from_iov_index;
+          ++from_iov;
          from_iov_offset = 0;
        }
      } else {
-
-        size_t to_copy = std::min(output_iov_[curr_iov_index_].iov_len -
-                                      curr_iov_written_,
-                                  len);
+        size_t to_copy = curr_iov_remaining_;
        if (to_copy == 0) {
          // This iovec is full. Go to the next one.
-          if (curr_iov_index_ + 1 >= output_iov_count_) {
+          if (curr_iov_ + 1 >= output_iov_end_) {
            return false;
          }
-          ++curr_iov_index_;
-          curr_iov_written_ = 0;
+          ++curr_iov_;
+          curr_iov_output_ = reinterpret_cast<char*>(curr_iov_->iov_base);
+          curr_iov_remaining_ = curr_iov_->iov_len;
          continue;
        }
        if (to_copy > len) {
          to_copy = len;
        }
-
-
-
-
-
+
+        IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
+                        curr_iov_output_, curr_iov_output_ + to_copy,
+                        curr_iov_output_ + curr_iov_remaining_);
+        curr_iov_output_ += to_copy;
+        curr_iov_remaining_ -= to_copy;
        from_iov_offset += to_copy;
        total_written_ += to_copy;
        len -= to_copy;
@@ -1197,7 +1342,7 @@ bool RawUncompress(Source* compressed, char* uncompressed) {
   return InternalUncompress(compressed, &output);
 }
 
-bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
+bool Uncompress(const char* compressed, size_t n, std::string* uncompressed) {
   size_t ulength;
   if (!GetUncompressedLength(compressed, n, &ulength)) {
     return false;
@@ -1265,7 +1410,8 @@ void RawCompress(const char* input,
   *compressed_length = (writer.CurrentDestination() - compressed);
 }
 
-size_t Compress(const char* input, size_t input_length, string* compressed) {
+size_t Compress(const char* input, size_t input_length,
+                std::string* compressed) {
   // Pre-grow the buffer to the max length of the compressed output
   STLStringResizeUninitialized(compressed, MaxCompressedLength(input_length));
 
@@ -1512,4 +1658,4 @@ bool Uncompress(Source* compressed, Sink* uncompressed) {
   }
 }
 
-}
+}  // namespace snappy