snappy 0.0.17 → 0.4.0
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in those registries.
- checksums.yaml +5 -5
- data/.dockerignore +2 -0
- data/.github/workflows/main.yml +34 -0
- data/.github/workflows/publish.yml +34 -0
- data/.gitignore +2 -1
- data/.gitmodules +1 -1
- data/Dockerfile +13 -0
- data/Gemfile +4 -0
- data/README.md +45 -5
- data/Rakefile +32 -29
- data/ext/api.c +6 -1
- data/ext/extconf.rb +31 -22
- data/lib/snappy/hadoop/reader.rb +62 -0
- data/lib/snappy/hadoop/writer.rb +51 -0
- data/lib/snappy/hadoop.rb +22 -0
- data/lib/snappy/reader.rb +14 -10
- data/lib/snappy/shim.rb +1 -1
- data/lib/snappy/version.rb +1 -1
- data/lib/snappy.rb +5 -4
- data/snappy.gemspec +14 -13
- data/test/hadoop/snappy_hadoop_reader_test.rb +115 -0
- data/test/hadoop/snappy_hadoop_writer_test.rb +48 -0
- data/test/snappy_hadoop_test.rb +26 -0
- data/test/snappy_reader_test.rb +148 -0
- data/test/snappy_test.rb +95 -0
- data/test/snappy_writer_test.rb +55 -0
- data/test/test_helper.rb +7 -0
- data/test.sh +3 -0
- data/vendor/snappy/CMakeLists.txt +420 -0
- data/vendor/snappy/CONTRIBUTING.md +31 -0
- data/vendor/snappy/NEWS +52 -0
- data/vendor/snappy/{README → README.md} +75 -49
- data/vendor/snappy/cmake/SnappyConfig.cmake.in +33 -0
- data/vendor/snappy/cmake/config.h.in +66 -0
- data/vendor/snappy/docs/README.md +72 -0
- data/vendor/snappy/snappy-internal.h +200 -32
- data/vendor/snappy/snappy-sinksource.cc +26 -9
- data/vendor/snappy/snappy-sinksource.h +11 -11
- data/vendor/snappy/snappy-stubs-internal.cc +1 -1
- data/vendor/snappy/snappy-stubs-internal.h +299 -302
- data/vendor/snappy/snappy-stubs-public.h.in +10 -47
- data/vendor/snappy/snappy-test.cc +94 -200
- data/vendor/snappy/snappy-test.h +101 -358
- data/vendor/snappy/snappy.cc +1437 -474
- data/vendor/snappy/snappy.h +31 -12
- data/vendor/snappy/snappy_benchmark.cc +378 -0
- data/vendor/snappy/snappy_compress_fuzzer.cc +60 -0
- data/vendor/snappy/snappy_test_data.cc +57 -0
- data/vendor/snappy/snappy_test_data.h +68 -0
- data/vendor/snappy/snappy_test_tool.cc +471 -0
- data/vendor/snappy/snappy_uncompress_fuzzer.cc +58 -0
- data/vendor/snappy/snappy_unittest.cc +271 -792
- metadata +42 -92
- data/.travis.yml +0 -26
- data/smoke.sh +0 -8
- data/test/test-snappy-reader.rb +0 -129
- data/test/test-snappy-writer.rb +0 -55
- data/test/test-snappy.rb +0 -58
- data/vendor/snappy/ChangeLog +0 -2468
- data/vendor/snappy/INSTALL +0 -370
- data/vendor/snappy/Makefile +0 -982
- data/vendor/snappy/Makefile.am +0 -26
- data/vendor/snappy/Makefile.in +0 -982
- data/vendor/snappy/aclocal.m4 +0 -9738
- data/vendor/snappy/autogen.sh +0 -12
- data/vendor/snappy/autom4te.cache/output.0 +0 -18856
- data/vendor/snappy/autom4te.cache/output.1 +0 -18852
- data/vendor/snappy/autom4te.cache/requests +0 -297
- data/vendor/snappy/autom4te.cache/traces.0 +0 -2689
- data/vendor/snappy/autom4te.cache/traces.1 +0 -714
- data/vendor/snappy/config.guess +0 -1530
- data/vendor/snappy/config.h +0 -135
- data/vendor/snappy/config.h.in +0 -134
- data/vendor/snappy/config.log +0 -1640
- data/vendor/snappy/config.status +0 -2318
- data/vendor/snappy/config.sub +0 -1773
- data/vendor/snappy/configure +0 -18852
- data/vendor/snappy/configure.ac +0 -134
- data/vendor/snappy/depcomp +0 -688
- data/vendor/snappy/install-sh +0 -527
- data/vendor/snappy/libtool +0 -10246
- data/vendor/snappy/ltmain.sh +0 -9661
- data/vendor/snappy/m4/gtest.m4 +0 -74
- data/vendor/snappy/m4/libtool.m4 +0 -8001
- data/vendor/snappy/m4/ltoptions.m4 +0 -384
- data/vendor/snappy/m4/ltsugar.m4 +0 -123
- data/vendor/snappy/m4/ltversion.m4 +0 -23
- data/vendor/snappy/m4/lt~obsolete.m4 +0 -98
- data/vendor/snappy/missing +0 -331
- data/vendor/snappy/snappy-stubs-public.h +0 -100
- data/vendor/snappy/snappy.pc +0 -10
- data/vendor/snappy/snappy.pc.in +0 -10
- data/vendor/snappy/stamp-h1 +0 -1
data/vendor/snappy/cmake/SnappyConfig.cmake.in
@@ -0,0 +1,33 @@
+# Copyright 2019 Google Inc. All Rights Reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+@PACKAGE_INIT@
+
+include("${CMAKE_CURRENT_LIST_DIR}/SnappyTargets.cmake")
+
+check_required_components(Snappy)
data/vendor/snappy/cmake/config.h.in
@@ -0,0 +1,66 @@
+#ifndef THIRD_PARTY_SNAPPY_OPENSOURCE_CMAKE_CONFIG_H_
+#define THIRD_PARTY_SNAPPY_OPENSOURCE_CMAKE_CONFIG_H_
+
+/* Define to 1 if the compiler supports __attribute__((always_inline)). */
+#cmakedefine01 HAVE_ATTRIBUTE_ALWAYS_INLINE
+
+/* Define to 1 if the compiler supports __builtin_ctz and friends. */
+#cmakedefine01 HAVE_BUILTIN_CTZ
+
+/* Define to 1 if the compiler supports __builtin_expect. */
+#cmakedefine01 HAVE_BUILTIN_EXPECT
+
+/* Define to 1 if you have a definition for mmap() in <sys/mman.h>. */
+#cmakedefine01 HAVE_FUNC_MMAP
+
+/* Define to 1 if you have a definition for sysconf() in <unistd.h>. */
+#cmakedefine01 HAVE_FUNC_SYSCONF
+
+/* Define to 1 if you have the `lzo2' library (-llzo2). */
+#cmakedefine01 HAVE_LIBLZO2
+
+/* Define to 1 if you have the `z' library (-lz). */
+#cmakedefine01 HAVE_LIBZ
+
+/* Define to 1 if you have the `lz4' library (-llz4). */
+#cmakedefine01 HAVE_LIBLZ4
+
+/* Define to 1 if you have the <sys/mman.h> header file. */
+#cmakedefine01 HAVE_SYS_MMAN_H
+
+/* Define to 1 if you have the <sys/resource.h> header file. */
+#cmakedefine01 HAVE_SYS_RESOURCE_H
+
+/* Define to 1 if you have the <sys/time.h> header file. */
+#cmakedefine01 HAVE_SYS_TIME_H
+
+/* Define to 1 if you have the <sys/uio.h> header file. */
+#cmakedefine01 HAVE_SYS_UIO_H
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#cmakedefine01 HAVE_UNISTD_H
+
+/* Define to 1 if you have the <windows.h> header file. */
+#cmakedefine01 HAVE_WINDOWS_H
+
+/* Define to 1 if you target processors with SSSE3+ and have <tmmintrin.h>. */
+#cmakedefine01 SNAPPY_HAVE_SSSE3
+
+/* Define to 1 if you target processors with SSE4.2 and have <crc32intrin.h>. */
+#cmakedefine01 SNAPPY_HAVE_X86_CRC32
+
+/* Define to 1 if you target processors with BMI2+ and have <bmi2intrin.h>. */
+#cmakedefine01 SNAPPY_HAVE_BMI2
+
+/* Define to 1 if you target processors with NEON and have <arm_neon.h>. */
+#cmakedefine01 SNAPPY_HAVE_NEON
+
+/* Define to 1 if you have <arm_neon.h> and <arm_acle.h> and want to optimize
+   compression speed by using __crc32cw from <arm_acle.h>. */
+#cmakedefine01 SNAPPY_HAVE_NEON_CRC32
+
+/* Define to 1 if your processor stores words with the most significant byte
+   first (like Motorola and SPARC, unlike Intel and VAX). */
+#cmakedefine01 SNAPPY_IS_BIG_ENDIAN
+
+#endif  // THIRD_PARTY_SNAPPY_OPENSOURCE_CMAKE_CONFIG_H_
data/vendor/snappy/docs/README.md
@@ -0,0 +1,72 @@
+Snappy is a compression/decompression library. It does not aim for maximum
+compression, or compatibility with any other compression library; instead, it
+aims for very high speeds and reasonable compression. For instance, compared
+to the fastest mode of zlib, Snappy is an order of magnitude faster for most
+inputs, but the resulting compressed files are anywhere from 20% to 100%
+bigger. On a single core of a Core i7 processor in 64-bit mode, Snappy
+compresses at about 250 MB/sec or more and decompresses at about 500 MB/sec
+or more.
+
+Snappy is widely used inside Google, in everything from BigTable and MapReduce
+to our internal RPC systems. (Snappy has previously been referred to as "Zippy"
+in some presentations and the likes.)
+
+For more information, please see the [README](../README.md). Benchmarks against
+a few other compression libraries (zlib, LZO, LZF, FastLZ, and QuickLZ) are
+included in the source code distribution. The source code also contains a
+[formal format specification](../format_description.txt), as well
+as a specification for a [framing format](../framing_format.txt) useful for
+higher-level framing and encapsulation of Snappy data, e.g. for transporting
+Snappy-compressed data across HTTP in a streaming fashion. Note that the Snappy
+distribution currently has no code implementing the latter, but some of the
+ports do (see below).
+
+Snappy is written in C++, but C bindings are included, and several bindings to
+other languages are maintained by third parties:
+
+* C#: [Snappy for .NET](http://snappy4net.codeplex.com/) (P/Invoke wrapper),
+  [Snappy.NET](http://snappy.angeloflogic.com/) (P/Invoke wrapper),
+  [Snappy.Sharp](https://github.com/jeffesp/Snappy.Sharp) (native
+  reimplementation)
+* [C port](http://github.com/andikleen/snappy-c)
+* [C++ MSVC packaging](http://snappy.angeloflogic.com/) (plus Windows binaries,
+  NuGet packages and command-line tool)
+* Common Lisp: [Library bindings](http://flambard.github.com/thnappy/),
+  [native reimplementation](https://github.com/brown/snappy)
+* Erlang: [esnappy](https://github.com/thekvs/esnappy),
+  [snappy-erlang-nif](https://github.com/fdmanana/snappy-erlang-nif)
+* [Go](https://github.com/golang/snappy/)
+* [Haskell](http://hackage.haskell.org/package/snappy)
+* [Haxe](https://github.com/MaddinXx/hxsnappy) (C++/Neko)
+* [iOS packaging](https://github.com/ideawu/snappy-ios)
+* Java: [JNI wrapper](https://github.com/xerial/snappy-java) (including the
+  framing format), [native reimplementation](http://code.google.com/p/jsnappy/),
+  [other native reimplementation](https://github.com/dain/snappy) (including
+  the framing format)
+* [Lua](https://github.com/forhappy/lua-snappy)
+* [Node.js](https://github.com/kesla/node-snappy) (including the [framing
+  format](https://github.com/kesla/node-snappy-stream))
+* [Perl](http://search.cpan.org/dist/Compress-Snappy/)
+* [PHP](https://github.com/kjdev/php-ext-snappy)
+* [Python](http://pypi.python.org/pypi/python-snappy) (including a command-line
+  tool for the framing format)
+* [R](https://github.com/lulyon/R-snappy)
+* [Ruby](https://github.com/miyucy/snappy)
+* [Rust](https://github.com/BurntSushi/rust-snappy)
+* [Smalltalk](https://github.com/mumez/sqnappy) (including the framing format)
+
+Snappy is used or is available as an alternative in software such as
+
+* [MongoDB](https://www.mongodb.com/)
+* [Cassandra](http://cassandra.apache.org/)
+* [Couchbase](http://www.couchbase.com/)
+* [Hadoop](http://hadoop.apache.org/)
+* [LessFS](http://www.lessfs.com/wordpress/)
+* [LevelDB](https://github.com/google/leveldb) (which is in turn used by
+  [Google Chrome](http://chrome.google.com/))
+* [Lucene](http://lucene.apache.org/)
+* [VoltDB](http://voltdb.com/)
+
+If you know of more, do not hesitate to let us know. The easiest way to get in
+touch is via the
+[Snappy discussion mailing list](http://groups.google.com/group/snappy-compression).
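The Ruby gem wraps this C++ library, and the flat-array calls declared in data/vendor/snappy/snappy.h are the core of that surface. A minimal round-trip sketch (standard upstream API; the repeated-byte input is just an illustration):

```cpp
#include <cassert>
#include <string>

#include "snappy.h"

int main() {
  // A highly compressible input: 10,000 repeated bytes.
  const std::string original(10000, 'a');

  // Compress() sizes the output string itself and returns the compressed size.
  std::string compressed;
  snappy::Compress(original.data(), original.size(), &compressed);

  // Uncompress() validates the stream and returns false on corrupt input.
  std::string restored;
  const bool ok =
      snappy::Uncompress(compressed.data(), compressed.size(), &restored);

  assert(ok && restored == original);
  return 0;
}
```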
data/vendor/snappy/snappy-internal.h
@@ -33,24 +33,108 @@
 
 #include "snappy-stubs-internal.h"
 
+#if SNAPPY_HAVE_SSSE3
+// Please do not replace with <x86intrin.h> or with headers that assume more
+// advanced SSE versions without checking with all the OWNERS.
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#endif
+
+#if SNAPPY_HAVE_NEON
+#include <arm_neon.h>
+#endif
+
+#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
+#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
+#else
+#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
+#endif
+
 namespace snappy {
 namespace internal {
 
+#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
+#if SNAPPY_HAVE_SSSE3
+using V128 = __m128i;
+#elif SNAPPY_HAVE_NEON
+using V128 = uint8x16_t;
+#endif
+
+// Load 128 bits of integer data. `src` must be 16-byte aligned.
+inline V128 V128_Load(const V128* src);
+
+// Load 128 bits of integer data. `src` does not need to be aligned.
+inline V128 V128_LoadU(const V128* src);
+
+// Store 128 bits of integer data. `dst` does not need to be aligned.
+inline void V128_StoreU(V128* dst, V128 val);
+
+// Shuffle packed 8-bit integers using a shuffle mask.
+// Each packed integer in the shuffle mask must be in [0,16).
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);
+
+// Constructs V128 with 16 chars |c|.
+inline V128 V128_DupChar(char c);
+
+#if SNAPPY_HAVE_SSSE3
+inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
+
+inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
+
+inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }
+
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
+  return _mm_shuffle_epi8(input, shuffle_mask);
+}
+
+inline V128 V128_DupChar(char c) { return _mm_set1_epi8(c); }
+
+#elif SNAPPY_HAVE_NEON
+inline V128 V128_Load(const V128* src) {
+  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
+}
+
+inline V128 V128_LoadU(const V128* src) {
+  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
+}
+
+inline void V128_StoreU(V128* dst, V128 val) {
+  vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
+}
+
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
+  assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15);
+  return vqtbl1q_u8(input, shuffle_mask);
+}
+
+inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
+#endif
+#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
+
+// Working memory performs a single allocation to hold all scratch space
+// required for compression.
 class WorkingMemory {
  public:
-  WorkingMemory() : large_table_(NULL) { }
-  ~WorkingMemory() { delete[] large_table_; }
+  explicit WorkingMemory(size_t input_size);
+  ~WorkingMemory();
 
   // Allocates and clears a hash table using memory in "*this",
   // stores the number of buckets in "*table_size" and returns a pointer to
   // the base of the hash table.
-  uint16* GetHashTable(size_t input_size, int* table_size);
+  uint16_t* GetHashTable(size_t fragment_size, int* table_size) const;
+  char* GetScratchInput() const { return input_; }
+  char* GetScratchOutput() const { return output_; }
 
 private:
-  uint16 small_table_[1 << 10];  // 2KB
-  uint16* large_table_;          // Allocated only when needed
+  char* mem_;        // the allocated memory, never nullptr
+  size_t size_;      // the size of the allocated memory, never 0
+  uint16_t* table_;  // the pointer to the hashtable
+  char* input_;      // the pointer to the input scratch buffer
+  char* output_;     // the pointer to the output scratch buffer
 
-  DISALLOW_COPY_AND_ASSIGN(WorkingMemory);
+  // No copying
+  WorkingMemory(const WorkingMemory&);
+  void operator=(const WorkingMemory&);
 };
 
 // Flat array compression that does not emit the "uncompressed length"
@@ -67,7 +151,7 @@ class WorkingMemory {
 char* CompressFragment(const char* input,
                        size_t input_length,
                        char* op,
-                       uint16* table,
+                       uint16_t* table,
                        const int table_size);
 
 // Find the largest n such that
@@ -80,12 +164,19 @@ char* CompressFragment(const char* input,
 // Does not read *(s1 + (s2_limit - s2)) or beyond.
 // Requires that s2_limit >= s2.
 //
-// Separate implementation for x86_64, for speed.  Uses the fact that
-// x86_64 is little endian.
-#if defined(ARCH_K8)
+// In addition populate *data with the next 5 bytes from the end of the match.
+// This is only done if 8 bytes are available (s2_limit - s2 >= 8). The point is
+// that on some arch's this can be done faster in this routine than subsequent
+// loading from s2 + n.
+//
+// Separate implementation for 64-bit, little-endian cpus.
+#if !SNAPPY_IS_BIG_ENDIAN && \
+    (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
+     defined(ARCH_ARM))
 static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                       const char* s2,
-                                                      const char* s2_limit) {
+                                                      const char* s2_limit,
+                                                      uint64_t* data) {
   assert(s2_limit >= s2);
   size_t matched = 0;
 
@@ -94,12 +185,72 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
   // uncommon code paths that determine, without extra effort, whether the match
   // length is less than 8. In short, we are hoping to avoid a conditional
   // branch, and perhaps get better code layout from the C++ compiler.
-  if (PREDICT_TRUE(s2 <= s2_limit - 8)) {
-    uint64 a1 = UNALIGNED_LOAD64(s1);
-    uint64 a2 = UNALIGNED_LOAD64(s2);
-    if (a1 != a2) {
-      return std::pair<size_t, bool>(Bits::FindLSBSetNonZero64(a1 ^ a2) >> 3,
-                                     true);
+  if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
+    uint64_t a1 = UNALIGNED_LOAD64(s1);
+    uint64_t a2 = UNALIGNED_LOAD64(s2);
+    if (SNAPPY_PREDICT_TRUE(a1 != a2)) {
+      // This code is critical for performance. The reason is that it determines
+      // how much to advance `ip` (s2). This obviously depends on both the loads
+      // from the `candidate` (s1) and `ip`. Furthermore the next `candidate`
+      // depends on the advanced `ip` calculated here through a load, hash and
+      // new candidate hash lookup (a lot of cycles). This makes s1 (ie.
+      // `candidate`) the variable that limits throughput. This is the reason we
+      // go through hoops to have this function update `data` for the next iter.
+      // The straightforward code would use *data, given by
+      //
+      // *data = UNALIGNED_LOAD64(s2 + matched_bytes) (Latency of 5 cycles),
+      //
+      // as input for the hash table lookup to find next candidate. However
+      // this forces the load on the data dependency chain of s1, because
+      // matched_bytes directly depends on s1. However matched_bytes is 0..7, so
+      // we can also calculate *data by
+      //
+      // *data = AlignRight(UNALIGNED_LOAD64(s2), UNALIGNED_LOAD64(s2 + 8),
+      //                    matched_bytes);
+      //
+      // The loads do not depend on s1 anymore and are thus off the bottleneck.
+      // The straightforward implementation on x86_64 would be to use
+      //
+      // shrd rax, rdx, cl  (cl being matched_bytes * 8)
+      //
+      // unfortunately shrd with a variable shift has a 4 cycle latency. So this
+      // only wins 1 cycle. The BMI2 shrx instruction is a 1 cycle variable
+      // shift instruction but can only shift 64 bits. If we focus on just
+      // obtaining the least significant 4 bytes, we can obtain this by
+      //
+      // *data = ConditionalMove(matched_bytes < 4, UNALIGNED_LOAD64(s2),
+      //                         UNALIGNED_LOAD64(s2 + 4) >> ((matched_bytes & 3) * 8);
+      //
+      // Writen like above this is not a big win, the conditional move would be
+      // a cmp followed by a cmov (2 cycles) followed by a shift (1 cycle).
+      // However matched_bytes < 4 is equal to
+      // static_cast<uint32_t>(xorval) != 0. Writen that way, the conditional
+      // move (2 cycles) can execute in parallel with FindLSBSetNonZero64
+      // (tzcnt), which takes 3 cycles.
+      uint64_t xorval = a1 ^ a2;
+      int shift = Bits::FindLSBSetNonZero64(xorval);
+      size_t matched_bytes = shift >> 3;
+      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
+#ifndef __x86_64__
+      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
+#else
+      // Ideally this would just be
+      //
+      // a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
+      //
+      // However clang correctly infers that the above statement participates on
+      // a critical data dependency chain and thus, unfortunately, refuses to
+      // use a conditional move (it's tuned to cut data dependencies). In this
+      // case there is a longer parallel chain anyway AND this will be fairly
+      // unpredictable.
+      asm("testl %k2, %k2\n\t"
+          "cmovzq %1, %0\n\t"
+          : "+r"(a2)
+          : "r"(a3), "r"(xorval)
+          : "cc");
+#endif
+      *data = a2 >> (shift & (3 * 8));
+      return std::pair<size_t, bool>(matched_bytes, true);
     } else {
       matched = 8;
       s2 += 8;
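The xorval/FindLSBSetNonZero64 sequence above is easier to follow in isolation: on a little-endian machine the first mismatching byte owns the lowest set bit of a1 ^ a2, so counting trailing zero bits and dividing by eight yields the match length without a byte-by-byte loop. A portable C++20 sketch of just that step (std::countr_zero standing in for Bits::FindLSBSetNonZero64; little-endian assumed, as in the guarded code above):

```cpp
#include <bit>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Number of leading bytes at which two 8-byte blocks agree. On little-endian
// layouts, byte i of the loaded word occupies bits [8*i, 8*i+8), so the
// trailing-zero count of the XOR, shifted right by 3, is the matched length.
std::size_t MatchedBytes(const char* s1, const char* s2) {
  uint64_t a1, a2;
  std::memcpy(&a1, s1, 8);  // stands in for UNALIGNED_LOAD64
  std::memcpy(&a2, s2, 8);
  const uint64_t xorval = a1 ^ a2;
  assert(xorval != 0);  // the all-equal case is handled by the caller
  return static_cast<std::size_t>(std::countr_zero(xorval)) >> 3;
}

int main() {
  const char a[] = "abcdeXYZ";
  const char b[] = "abcdWXYZ";
  assert(MatchedBytes(a, b) == 4);  // "abcd" matches; 'e' vs 'W' differs
  return 0;
}
```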
@@ -110,23 +261,40 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
   // time until we find a 64-bit block that doesn't match; then we find
   // the first non-matching bit and use that to calculate the total
   // length of the match.
-  while (PREDICT_TRUE(s2 <= s2_limit - 8)) {
-    if (UNALIGNED_LOAD64(s2) == UNALIGNED_LOAD64(s1 + matched)) {
+  while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
+    uint64_t a1 = UNALIGNED_LOAD64(s1 + matched);
+    uint64_t a2 = UNALIGNED_LOAD64(s2);
+    if (a1 == a2) {
       s2 += 8;
       matched += 8;
     } else {
-      uint64 x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched);
-      int matching_bits = Bits::FindLSBSetNonZero64(x);
-      matched += matching_bits >> 3;
+      uint64_t xorval = a1 ^ a2;
+      int shift = Bits::FindLSBSetNonZero64(xorval);
+      size_t matched_bytes = shift >> 3;
+      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
+#ifndef __x86_64__
+      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
+#else
+      asm("testl %k2, %k2\n\t"
+          "cmovzq %1, %0\n\t"
+          : "+r"(a2)
+          : "r"(a3), "r"(xorval)
+          : "cc");
+#endif
+      *data = a2 >> (shift & (3 * 8));
+      matched += matched_bytes;
       assert(matched >= 8);
       return std::pair<size_t, bool>(matched, false);
     }
   }
-  while (s2 < s2_limit) {
+  while (SNAPPY_PREDICT_TRUE(s2 < s2_limit)) {
     if (s1[matched] == *s2) {
       ++s2;
       ++matched;
     } else {
+      if (s2 <= s2_limit - 8) {
+        *data = UNALIGNED_LOAD64(s2);
+      }
       return std::pair<size_t, bool>(matched, matched < 8);
     }
   }
@@ -135,7 +303,8 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
 #else
 static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                       const char* s2,
-                                                      const char* s2_limit) {
+                                                      const char* s2_limit,
+                                                      uint64_t* data) {
   // Implementation based on the x86-64 version, above.
   assert(s2_limit >= s2);
   int matched = 0;
@@ -146,15 +315,17 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
     matched += 4;
   }
   if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 4) {
-    uint32 x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
+    uint32_t x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
     int matching_bits = Bits::FindLSBSetNonZero(x);
     matched += matching_bits >> 3;
+    s2 += matching_bits >> 3;
   } else {
     while ((s2 < s2_limit) && (s1[matched] == *s2)) {
       ++s2;
       ++matched;
     }
   }
+  if (s2 <= s2_limit - 8) *data = LittleEndian::Load64(s2);
   return std::pair<size_t, bool>(matched, matched < 8);
 }
 #endif
@@ -170,11 +341,6 @@ enum {
 };
 static const int kMaximumTagLength = 5;  // COPY_4_BYTE_OFFSET plus the actual offset.
 
-// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
-static const uint32 wordmask[] = {
-  0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
-};
-
 // Data stored per entry in lookup table:
 //      Range   Bits-used       Description
 //      ------------------------------------
@@ -186,7 +352,8 @@ static const uint32 wordmask[] = {
 // because of efficiency reasons:
 //      (1) Extracting a byte is faster than a bit-field
 //      (2) It properly aligns copy offset so we do not need a <<8
-static const uint16 char_table[256] = {
+static constexpr uint16_t char_table[256] = {
+  // clang-format off
   0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
   0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
   0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
@@ -218,7 +385,8 @@ static const uint16 char_table[256] = {
   0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
   0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
   0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
-  0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
+  0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040,
+  // clang-format on
 };
 
 }  // end namespace internal
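The char_table context above only hints at the entry layout; the upstream snappy-internal.h comments document it as length in bits 0-7, pre-shifted copy-offset high bits in bits 8-10, and the count of extra tag bytes in bits 11-13 (that layout is taken from the upstream source, not from this excerpt). A sketch of the byte extraction the "(1)/(2)" comment refers to:

```cpp
#include <cstdint>
#include <cstdio>

// Field extraction for one char_table entry, per the upstream layout
// (the 256-entry table itself is elided here):
//   bits 0..7   literal/copy length encoded in the opcode byte
//   bits 8..10  copy offset high bits, already aligned (no <<8 needed)
//   bits 11..13 number of extra bytes after the opcode
void DescribeEntry(uint16_t entry) {
  const unsigned length = entry & 0xff;        // plain byte extract, no bit-field
  const unsigned offset_high = entry & 0x700;  // already shifted into place
  const unsigned extra_bytes = entry >> 11;
  std::printf("length=%u offset_high=%u extra_bytes=%u\n",
              length, offset_high, extra_bytes);
}

int main() {
  DescribeEntry(0x0001);  // tag 0x00: 1-byte literal, no trailer
  DescribeEntry(0x0804);  // tag 0x01: 1-byte-offset copy, length 4, 1 extra byte
  return 0;
}
```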
data/vendor/snappy/snappy-sinksource.cc
@@ -26,23 +26,31 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include <string.h>
+#include <stddef.h>
+#include <cstring>
 
 #include "snappy-sinksource.h"
 
 namespace snappy {
 
-Source::~Source() { }
+Source::~Source() = default;
 
-Sink::~Sink() { }
+Sink::~Sink() = default;
 
 char* Sink::GetAppendBuffer(size_t length, char* scratch) {
+  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
+  (void)length;
+
   return scratch;
 }
 
 char* Sink::GetAppendBufferVariable(
     size_t min_size, size_t desired_size_hint, char* scratch,
     size_t scratch_size, size_t* allocated_size) {
+  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
+  (void)min_size;
+  (void)desired_size_hint;
+
   *allocated_size = scratch_size;
   return scratch;
 }
@@ -55,7 +63,7 @@ void Sink::AppendAndTakeOwnership(
   (*deleter)(deleter_arg, bytes, n);
 }
 
-ByteArraySource::~ByteArraySource() { }
+ByteArraySource::~ByteArraySource() = default;
 
 size_t ByteArraySource::Available() const { return left_; }
 
@@ -74,22 +82,26 @@ UncheckedByteArraySink::~UncheckedByteArraySink() { }
 void UncheckedByteArraySink::Append(const char* data, size_t n) {
   // Do no copying if the caller filled in the result of GetAppendBuffer()
   if (data != dest_) {
-    memcpy(dest_, data, n);
+    std::memcpy(dest_, data, n);
   }
   dest_ += n;
 }
 
 char* UncheckedByteArraySink::GetAppendBuffer(size_t len, char* scratch) {
+  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
+  (void)len;
+  (void)scratch;
+
   return dest_;
 }
 
 void UncheckedByteArraySink::AppendAndTakeOwnership(
-    char* data, size_t n,
+    char* bytes, size_t n,
     void (*deleter)(void*, const char*, size_t),
     void *deleter_arg) {
-  if (data != dest_) {
-    memcpy(dest_, data, n);
-    (*deleter)(deleter_arg, data, n);
+  if (bytes != dest_) {
+    std::memcpy(dest_, bytes, n);
+    (*deleter)(deleter_arg, bytes, n);
   }
   dest_ += n;
 }
@@ -97,6 +109,11 @@ void UncheckedByteArraySink::AppendAndTakeOwnership(
 char* UncheckedByteArraySink::GetAppendBufferVariable(
     size_t min_size, size_t desired_size_hint, char* scratch,
     size_t scratch_size, size_t* allocated_size) {
+  // TODO: Switch to [[maybe_unused]] when we can assume C++17.
+  (void)min_size;
+  (void)scratch;
+  (void)scratch_size;
+
   *allocated_size = desired_size_hint;
   return dest_;
 }
data/vendor/snappy/snappy-sinksource.h
@@ -146,10 +146,10 @@ class Source {
 class ByteArraySource : public Source {
  public:
   ByteArraySource(const char* p, size_t n) : ptr_(p), left_(n) { }
-  virtual ~ByteArraySource();
-  virtual size_t Available() const;
-  virtual const char* Peek(size_t* len);
-  virtual void Skip(size_t n);
+  ~ByteArraySource() override;
+  size_t Available() const override;
+  const char* Peek(size_t* len) override;
+  void Skip(size_t n) override;
  private:
   const char* ptr_;
   size_t left_;
@@ -159,15 +159,15 @@ class ByteArraySource : public Source {
 class UncheckedByteArraySink : public Sink {
  public:
   explicit UncheckedByteArraySink(char* dest) : dest_(dest) { }
-  virtual ~UncheckedByteArraySink();
-  virtual void Append(const char* data, size_t n);
-  virtual char* GetAppendBuffer(size_t len, char* scratch);
-  virtual char* GetAppendBufferVariable(
+  ~UncheckedByteArraySink() override;
+  void Append(const char* data, size_t n) override;
+  char* GetAppendBuffer(size_t len, char* scratch) override;
+  char* GetAppendBufferVariable(
       size_t min_size, size_t desired_size_hint, char* scratch,
-      size_t scratch_size, size_t* allocated_size);
-  virtual void AppendAndTakeOwnership(
+      size_t scratch_size, size_t* allocated_size) override;
+  void AppendAndTakeOwnership(
       char* bytes, size_t n, void (*deleter)(void*, const char*, size_t),
-      void *deleter_arg);
+      void *deleter_arg) override;
 
   // Return the current output pointer so that a caller can see how
   // many bytes were produced.
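The override cleanups above do not change how these adapters are used. For reference, a small sketch driving decompression through the Source/Sink interface instead of the flat-array calls (standard upstream API; the input string is illustrative):

```cpp
#include <cassert>
#include <string>

#include "snappy.h"
#include "snappy-sinksource.h"

int main() {
  const std::string original(4096, 'x');
  std::string compressed;
  snappy::Compress(original.data(), original.size(), &compressed);

  // Size the output buffer from the stored uncompressed length.
  size_t uncompressed_len = 0;
  snappy::GetUncompressedLength(compressed.data(), compressed.size(),
                                &uncompressed_len);
  std::string output(uncompressed_len, '\0');

  // ByteArraySource walks the compressed bytes; UncheckedByteArraySink
  // writes straight into `output` without bounds checks, which is why the
  // buffer must be sized up front.
  snappy::ByteArraySource source(compressed.data(), compressed.size());
  snappy::UncheckedByteArraySink sink(&output[0]);
  const bool ok = snappy::Uncompress(&source, &sink);

  assert(ok && output == original);
  return 0;
}
```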
|