libdeflate 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +9 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +52 -0
- data/Rakefile +15 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/ext/libdeflate/extconf.rb +14 -0
- data/ext/libdeflate/libdeflate/.gitignore +19 -0
- data/ext/libdeflate/libdeflate/COPYING +21 -0
- data/ext/libdeflate/libdeflate/Makefile +231 -0
- data/ext/libdeflate/libdeflate/Makefile.msc +64 -0
- data/ext/libdeflate/libdeflate/NEWS +57 -0
- data/ext/libdeflate/libdeflate/README.md +170 -0
- data/ext/libdeflate/libdeflate/common/common_defs.h +351 -0
- data/ext/libdeflate/libdeflate/common/compiler_gcc.h +134 -0
- data/ext/libdeflate/libdeflate/common/compiler_msc.h +95 -0
- data/ext/libdeflate/libdeflate/lib/adler32.c +213 -0
- data/ext/libdeflate/libdeflate/lib/adler32_impl.h +281 -0
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +57 -0
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +13 -0
- data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +357 -0
- data/ext/libdeflate/libdeflate/lib/crc32.c +368 -0
- data/ext/libdeflate/libdeflate/lib/crc32_impl.h +286 -0
- data/ext/libdeflate/libdeflate/lib/crc32_table.h +526 -0
- data/ext/libdeflate/libdeflate/lib/decompress_impl.h +404 -0
- data/ext/libdeflate/libdeflate/lib/deflate_compress.c +2817 -0
- data/ext/libdeflate/libdeflate/lib/deflate_compress.h +14 -0
- data/ext/libdeflate/libdeflate/lib/deflate_constants.h +66 -0
- data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +889 -0
- data/ext/libdeflate/libdeflate/lib/gzip_compress.c +95 -0
- data/ext/libdeflate/libdeflate/lib/gzip_constants.h +45 -0
- data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +130 -0
- data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +405 -0
- data/ext/libdeflate/libdeflate/lib/lib_common.h +35 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +53 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +205 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +61 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +53 -0
- data/ext/libdeflate/libdeflate/lib/unaligned.h +202 -0
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +169 -0
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +48 -0
- data/ext/libdeflate/libdeflate/lib/zlib_compress.c +87 -0
- data/ext/libdeflate/libdeflate/lib/zlib_constants.h +21 -0
- data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +91 -0
- data/ext/libdeflate/libdeflate/libdeflate.h +274 -0
- data/ext/libdeflate/libdeflate/programs/benchmark.c +558 -0
- data/ext/libdeflate/libdeflate/programs/checksum.c +197 -0
- data/ext/libdeflate/libdeflate/programs/detect.sh +62 -0
- data/ext/libdeflate/libdeflate/programs/gzip.c +603 -0
- data/ext/libdeflate/libdeflate/programs/prog_util.c +530 -0
- data/ext/libdeflate/libdeflate/programs/prog_util.h +162 -0
- data/ext/libdeflate/libdeflate/programs/test_checksums.c +135 -0
- data/ext/libdeflate/libdeflate/programs/tgetopt.c +118 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +12 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +40 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +3 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +14 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +3 -0
- data/ext/libdeflate/libdeflate/tools/android_build.sh +104 -0
- data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +76 -0
- data/ext/libdeflate/libdeflate/tools/exec_tests.sh +30 -0
- data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +108 -0
- data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +100 -0
- data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +412 -0
- data/ext/libdeflate/libdeflate/tools/make-windows-releases +21 -0
- data/ext/libdeflate/libdeflate/tools/mips_build.sh +9 -0
- data/ext/libdeflate/libdeflate/tools/msc_test.bat +3 -0
- data/ext/libdeflate/libdeflate/tools/pgo_build.sh +23 -0
- data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +37 -0
- data/ext/libdeflate/libdeflate/tools/run_tests.sh +305 -0
- data/ext/libdeflate/libdeflate/tools/windows_build.sh +10 -0
- data/ext/libdeflate/libdeflate_ext.c +389 -0
- data/ext/libdeflate/libdeflate_ext.h +8 -0
- data/lib/libdeflate.rb +2 -0
- data/lib/libdeflate/version.rb +3 -0
- data/libdeflate.gemspec +33 -0
- metadata +230 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Makefile for the Microsoft toolchain
|
|
3
|
+
#
|
|
4
|
+
# Usage:
|
|
5
|
+
# nmake /f Makefile.msc
|
|
6
|
+
#
|
|
7
|
+
|
|
8
|
+
.SUFFIXES: .c .obj .dllobj
|
|
9
|
+
|
|
10
|
+
CC = cl
|
|
11
|
+
LD = link
|
|
12
|
+
AR = lib
|
|
13
|
+
CFLAGS = /MD /O2 -I. -Icommon
|
|
14
|
+
LDFLAGS =
|
|
15
|
+
|
|
16
|
+
STATIC_LIB = libdeflatestatic.lib
|
|
17
|
+
SHARED_LIB = libdeflate.dll
|
|
18
|
+
IMPORT_LIB = libdeflate.lib
|
|
19
|
+
|
|
20
|
+
STATIC_LIB_OBJ = \
|
|
21
|
+
lib/aligned_malloc.obj \
|
|
22
|
+
lib/adler32.obj \
|
|
23
|
+
lib/crc32.obj \
|
|
24
|
+
lib/deflate_compress.obj \
|
|
25
|
+
lib/deflate_decompress.obj \
|
|
26
|
+
lib/gzip_compress.obj \
|
|
27
|
+
lib/gzip_decompress.obj \
|
|
28
|
+
lib/x86_cpu_features.obj \
|
|
29
|
+
lib/zlib_compress.obj \
|
|
30
|
+
lib/zlib_decompress.obj
|
|
31
|
+
|
|
32
|
+
SHARED_LIB_OBJ = $(STATIC_LIB_OBJ:.obj=.dllobj)
|
|
33
|
+
|
|
34
|
+
PROG_COMMON_OBJ = programs/prog_util.obj \
|
|
35
|
+
programs/tgetopt.obj \
|
|
36
|
+
$(STATIC_LIB)
|
|
37
|
+
|
|
38
|
+
PROG_CFLAGS = $(CFLAGS) -Iprograms
|
|
39
|
+
|
|
40
|
+
all: $(STATIC_LIB) $(SHARED_LIB) $(IMPORT_LIB) gzip.exe gunzip.exe
|
|
41
|
+
|
|
42
|
+
.c.obj:
|
|
43
|
+
$(CC) -c /Fo$@ $(CFLAGS) $**
|
|
44
|
+
|
|
45
|
+
.c.dllobj:
|
|
46
|
+
$(CC) -c /Fo$@ $(CFLAGS) /DLIBDEFLATE_DLL $**
|
|
47
|
+
|
|
48
|
+
$(STATIC_LIB): $(STATIC_LIB_OBJ)
|
|
49
|
+
$(AR) $(ARFLAGS) -out:$@ $(STATIC_LIB_OBJ)
|
|
50
|
+
|
|
51
|
+
$(SHARED_LIB): $(SHARED_LIB_OBJ)
|
|
52
|
+
$(LD) $(LDFLAGS) -out:$@ -dll -implib:$(IMPORT_LIB) $(SHARED_LIB_OBJ)
|
|
53
|
+
|
|
54
|
+
$(IMPORT_LIB): $(SHARED_LIB)
|
|
55
|
+
|
|
56
|
+
gzip.exe:programs/gzip.obj $(PROG_COMMON_OBJ)
|
|
57
|
+
$(LD) $(LDFLAGS) -out:$@ $**
|
|
58
|
+
|
|
59
|
+
gunzip.exe:gzip.exe
|
|
60
|
+
copy $** $@
|
|
61
|
+
|
|
62
|
+
clean:
|
|
63
|
+
-del *.dll *.exe *.exp libdeflate.lib libdeflatestatic.lib gzip.lib \
|
|
64
|
+
lib\*.obj lib\*.dllobj programs\*.obj 2>nul
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
Version 0.7:
|
|
2
|
+
Fixed a very rare bug that caused data to be compressed incorrectly.
|
|
3
|
+
The bug affected compression levels 7 and below since libdeflate v0.2.
|
|
4
|
+
Although there have been no user reports of the bug, and I believe it
|
|
5
|
+
would have been highly unlikely to encounter on realistic data, it could
|
|
6
|
+
occur on data specially crafted to reproduce it.
|
|
7
|
+
|
|
8
|
+
Fixed a compilation error when building with clang 3.7.
|
|
9
|
+
|
|
10
|
+
Version 0.6:
|
|
11
|
+
Various improvements to the gzip program's behavior.
|
|
12
|
+
|
|
13
|
+
Faster CRC-32 on AVX-capable processors.
|
|
14
|
+
|
|
15
|
+
Other minor changes.
|
|
16
|
+
|
|
17
|
+
Version 0.5:
|
|
18
|
+
The CRC-32 checksum algorithm has been optimized with carryless
|
|
19
|
+
multiplication instructions for x86_64 (PCLMUL). This speeds up gzip
|
|
20
|
+
compression and decompression.
|
|
21
|
+
|
|
22
|
+
Build fixes for certain platforms and compilers.
|
|
23
|
+
|
|
24
|
+
Added more test programs and scripts.
|
|
25
|
+
|
|
26
|
+
libdeflate is now entirely MIT-licensed.
|
|
27
|
+
|
|
28
|
+
Version 0.4:
|
|
29
|
+
The Adler-32 checksum algorithm has been optimized with vector
|
|
30
|
+
instructions for x86_64 (SSE2 and AVX2) and ARM (NEON). This speeds up
|
|
31
|
+
zlib compression and decompression.
|
|
32
|
+
|
|
33
|
+
To avoid naming collisions, functions and definitions in libdeflate's
|
|
34
|
+
API have been renamed to be prefixed with "libdeflate_" or
|
|
35
|
+
"LIBDEFLATE_". Programs using the old API will need to be updated.
|
|
36
|
+
|
|
37
|
+
Various bug fixes and other improvements.
|
|
38
|
+
|
|
39
|
+
Version 0.3:
|
|
40
|
+
Some bug fixes and other minor changes.
|
|
41
|
+
|
|
42
|
+
Version 0.2:
|
|
43
|
+
Implemented a new block splitting algorithm which typically improves the
|
|
44
|
+
compression ratio slightly at all compression levels.
|
|
45
|
+
|
|
46
|
+
The compressor now outputs each block using the cheapest type (dynamic
|
|
47
|
+
Huffman, static Huffman, or uncompressed).
|
|
48
|
+
|
|
49
|
+
The gzip program has received an overhaul and now behaves more like the
|
|
50
|
+
standard version.
|
|
51
|
+
|
|
52
|
+
Build system updates, including: some build options were changed and
|
|
53
|
+
some build options were removed, and the default 'make' target now
|
|
54
|
+
includes the gzip program as well as the library.
|
|
55
|
+
|
|
56
|
+
Version 0.1:
|
|
57
|
+
Initial official release.
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# Overview
|
|
2
|
+
|
|
3
|
+
libdeflate is a library for fast, whole-buffer DEFLATE-based compression and
|
|
4
|
+
decompression.
|
|
5
|
+
|
|
6
|
+
The supported formats are:
|
|
7
|
+
|
|
8
|
+
- DEFLATE (raw)
|
|
9
|
+
- zlib (a.k.a. DEFLATE with a zlib wrapper)
|
|
10
|
+
- gzip (a.k.a. DEFLATE with a gzip wrapper)
|
|
11
|
+
|
|
12
|
+
libdeflate is heavily optimized. It is significantly faster than the zlib
|
|
13
|
+
library, both for compression and decompression, and especially on x86
|
|
14
|
+
processors. In addition, libdeflate provides optional high compression modes
|
|
15
|
+
that provide a better compression ratio than zlib's "level 9".
|
|
16
|
+
|
|
17
|
+
libdeflate itself is a library, but the following command-line programs which
|
|
18
|
+
use this library are also provided:
|
|
19
|
+
|
|
20
|
+
* gzip (or gunzip), a program which mostly behaves like the standard equivalent,
|
|
21
|
+
except that it does not yet have good streaming support and therefore does not
|
|
22
|
+
yet support very large files
|
|
23
|
+
* benchmark, a program for benchmarking in-memory compression and decompression
|
|
24
|
+
|
|
25
|
+
# Building
|
|
26
|
+
|
|
27
|
+
## For UNIX
|
|
28
|
+
|
|
29
|
+
Just run `make`. You need GNU Make and either GCC or Clang. GCC is recommended
|
|
30
|
+
because it builds slightly faster binaries. There is no `make install` yet;
|
|
31
|
+
just copy the file(s) to where you want.
|
|
32
|
+
|
|
33
|
+
By default, all targets are built, including the library and programs, with the
|
|
34
|
+
exception of the `benchmark` program. `make help` shows the available targets.
|
|
35
|
+
There are also several options which can be set on the `make` command line. See
|
|
36
|
+
the Makefile for details.
|
|
37
|
+
|
|
38
|
+
## For Windows
|
|
39
|
+
|
|
40
|
+
MinGW (GCC) is the recommended compiler to use when building binaries for
|
|
41
|
+
Windows. MinGW can be used on either Windows or Linux. On Windows, you'll need
|
|
42
|
+
the compiler as well as GNU Make and basic UNIX tools such as `sh`. This is
|
|
43
|
+
most easily set up with Cygwin, but some standalone MinGW distributions for
|
|
44
|
+
Windows also work. Or, on Linux, you'll need to install the `mingw-w64-gcc` or
|
|
45
|
+
similarly-named package. Once ready, do the build using a command like:
|
|
46
|
+
|
|
47
|
+
$ make CC=x86_64-w64-mingw32-gcc
|
|
48
|
+
|
|
49
|
+
Some MinGW distributions for Windows may require `CC=gcc` instead.
|
|
50
|
+
|
|
51
|
+
Windows binaries prebuilt with MinGW may also be downloaded from
|
|
52
|
+
https://github.com/ebiggers/libdeflate/releases.
|
|
53
|
+
|
|
54
|
+
Alternatively, a separate Makefile, `Makefile.msc`, is provided for the tools
|
|
55
|
+
that come with Visual Studio, for those who strongly prefer that toolchain.
|
|
56
|
+
|
|
57
|
+
As usual, 64-bit binaries are faster than 32-bit binaries and should be
|
|
58
|
+
preferred whenever possible.
|
|
59
|
+
|
|
60
|
+
# API
|
|
61
|
+
|
|
62
|
+
libdeflate has a simple API that is not zlib-compatible. You can create
|
|
63
|
+
compressors and decompressors and use them to compress or decompress buffers.
|
|
64
|
+
See libdeflate.h for details.
|
|
65
|
+
|
|
66
|
+
There is currently no support for streaming. This has been considered, but it
|
|
67
|
+
always significantly increases complexity and slows down fast paths.
|
|
68
|
+
Unfortunately, at this point it remains a future TODO. So: if your application
|
|
69
|
+
compresses data in "chunks", say, less than 1 MB in size, then libdeflate is a
|
|
70
|
+
great choice for you; that's what it's designed to do. This is perfect for
|
|
71
|
+
certain use cases such as transparent filesystem compression. But if your
|
|
72
|
+
application compresses large files as a single compressed stream, similarly to
|
|
73
|
+
the `gzip` program, then libdeflate isn't for you.
|
|
74
|
+
|
|
75
|
+
Note that with chunk-based compression, you generally should have the
|
|
76
|
+
uncompressed size of each chunk stored outside of the compressed data itself.
|
|
77
|
+
This enables you to allocate an output buffer of the correct size without
|
|
78
|
+
guessing. However, libdeflate's decompression routines do optionally provide
|
|
79
|
+
the actual number of output bytes in case you need it.
|
|
80
|
+
|
|
81
|
+
# DEFLATE vs. zlib vs. gzip
|
|
82
|
+
|
|
83
|
+
The DEFLATE format ([rfc1951](https://www.ietf.org/rfc/rfc1951.txt)), the zlib
|
|
84
|
+
format ([rfc1950](https://www.ietf.org/rfc/rfc1950.txt)), and the gzip format
|
|
85
|
+
([rfc1952](https://www.ietf.org/rfc/rfc1952.txt)) are commonly confused with
|
|
86
|
+
each other as well as with the [zlib software library](http://zlib.net), which
|
|
87
|
+
actually supports all three formats. libdeflate (this library) also supports
|
|
88
|
+
all three formats.
|
|
89
|
+
|
|
90
|
+
Briefly, DEFLATE is a raw compressed stream, whereas zlib and gzip are different
|
|
91
|
+
wrappers for this stream. Both zlib and gzip include checksums, but gzip can
|
|
92
|
+
include extra information such as the original filename. Generally, you should
|
|
93
|
+
choose a format as follows:
|
|
94
|
+
|
|
95
|
+
- If you are compressing whole files with no subdivisions, similar to the `gzip`
|
|
96
|
+
program, you probably should use the gzip format.
|
|
97
|
+
- Otherwise, if you don't need the features of the gzip header and footer but do
|
|
98
|
+
still want a checksum for corruption detection, you probably should use the
|
|
99
|
+
zlib format.
|
|
100
|
+
- Otherwise, you probably should use raw DEFLATE. This is ideal if you don't
|
|
101
|
+
need checksums, e.g. because they're simply not needed for your use case or
|
|
102
|
+
because you already compute your own checksums that are stored separately from
|
|
103
|
+
the compressed stream.
|
|
104
|
+
|
|
105
|
+
Note that gzip and zlib streams can be distinguished from each other based on
|
|
106
|
+
their starting bytes, but this is not necessarily true of raw DEFLATE streams.
|
|
107
|
+
|
|
108
|
+
# Compression levels
|
|
109
|
+
|
|
110
|
+
An often-underappreciated fact of compression formats such as DEFLATE is that
|
|
111
|
+
there are an enormous number of different ways that a given input could be
|
|
112
|
+
compressed. Different algorithms and different amounts of computation time will
|
|
113
|
+
result in different compression ratios, while remaining equally compatible with
|
|
114
|
+
the decompressor.
|
|
115
|
+
|
|
116
|
+
For this reason, the commonly used zlib library provides nine compression
|
|
117
|
+
levels. Level 1 is the fastest but provides the worst compression; level 9
|
|
118
|
+
provides the best compression but is the slowest. It defaults to level 6.
|
|
119
|
+
libdeflate uses this same design but is designed to improve on both zlib's
|
|
120
|
+
performance *and* compression ratio at every compression level. In addition,
|
|
121
|
+
libdeflate's levels go [up to 12](https://xkcd.com/670/) to make room for a
|
|
122
|
+
minimum-cost-path based algorithm (sometimes called "optimal parsing") that can
|
|
123
|
+
significantly improve on zlib's compression ratio.
|
|
124
|
+
|
|
125
|
+
If you are using DEFLATE (or zlib, or gzip) in your application, you should test
|
|
126
|
+
different levels to see which works best for your application.
|
|
127
|
+
|
|
128
|
+
# Motivation
|
|
129
|
+
|
|
130
|
+
Despite DEFLATE's widespread use mainly through the zlib library, in the
|
|
131
|
+
compression community this format from the early 1990s is often considered
|
|
132
|
+
obsolete. And in a few significant ways, it is.
|
|
133
|
+
|
|
134
|
+
So why implement DEFLATE at all, instead of focusing entirely on
|
|
135
|
+
bzip2/LZMA/xz/LZ4/LZX/ZSTD/Brotli/LZHAM/LZFSE/[insert cool new format here]?
|
|
136
|
+
|
|
137
|
+
To do something better, you need to understand what came before. And it turns
|
|
138
|
+
out that most ideas from DEFLATE are still relevant. Many of the newer formats
|
|
139
|
+
share a structure similar to DEFLATE's, with different tweaks. The effects of
|
|
140
|
+
trivial but very useful tweaks, such as increasing the sliding window size, are
|
|
141
|
+
often confused with the effects of nontrivial but less useful tweaks. And
|
|
142
|
+
actually, many of these formats are similar enough that common algorithms and
|
|
143
|
+
optimizations (e.g. those dealing with LZ77 matchfinding) can be reused.
|
|
144
|
+
|
|
145
|
+
In addition, comparing compressors fairly is difficult because the performance
|
|
146
|
+
of a compressor depends heavily on optimizations which are not intrinsic to the
|
|
147
|
+
compression format itself. In this respect, the zlib library sometimes compares
|
|
148
|
+
poorly to certain newer code because zlib is not well optimized for modern
|
|
149
|
+
processors. libdeflate addresses this by providing an optimized DEFLATE
|
|
150
|
+
implementation which can be used for benchmarking purposes. And, of course,
|
|
151
|
+
real applications can use it as well.
|
|
152
|
+
|
|
153
|
+
That being said, I have also started [a separate
|
|
154
|
+
project](https://github.com/ebiggers/xpack) for an experimental, more modern
|
|
155
|
+
compression format.
|
|
156
|
+
|
|
157
|
+
# License
|
|
158
|
+
|
|
159
|
+
libdeflate is [MIT-licensed](COPYING).
|
|
160
|
+
|
|
161
|
+
Additional notes (informational only):
|
|
162
|
+
|
|
163
|
+
- I am not aware of any patents covering libdeflate.
|
|
164
|
+
|
|
165
|
+
- Old versions of libdeflate were public domain; I only started copyrighting
|
|
166
|
+
changes in newer versions. Portions of the source code that have not been
|
|
167
|
+
changed since being released in a public domain version can theoretically
|
|
168
|
+
still be used as public domain if you want to. But for practical purposes, it
|
|
169
|
+
probably would be easier to just take the MIT license option, which is nearly
|
|
170
|
+
the same anyway.
|
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* common_defs.h
|
|
3
|
+
*
|
|
4
|
+
* Copyright 2016 Eric Biggers
|
|
5
|
+
*
|
|
6
|
+
* Permission is hereby granted, free of charge, to any person
|
|
7
|
+
* obtaining a copy of this software and associated documentation
|
|
8
|
+
* files (the "Software"), to deal in the Software without
|
|
9
|
+
* restriction, including without limitation the rights to use,
|
|
10
|
+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11
|
+
* copies of the Software, and to permit persons to whom the
|
|
12
|
+
* Software is furnished to do so, subject to the following
|
|
13
|
+
* conditions:
|
|
14
|
+
*
|
|
15
|
+
* The above copyright notice and this permission notice shall be
|
|
16
|
+
* included in all copies or substantial portions of the Software.
|
|
17
|
+
*
|
|
18
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
19
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
20
|
+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
21
|
+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
22
|
+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
23
|
+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
24
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
25
|
+
* OTHER DEALINGS IN THE SOFTWARE.
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
#ifndef COMMON_COMMON_DEFS_H
|
|
29
|
+
#define COMMON_COMMON_DEFS_H
|
|
30
|
+
|
|
31
|
+
#ifdef __GNUC__
|
|
32
|
+
# include "compiler_gcc.h"
|
|
33
|
+
#elif defined(_MSC_VER)
|
|
34
|
+
# include "compiler_msc.h"
|
|
35
|
+
#else
|
|
36
|
+
# pragma message("Unrecognized compiler. Please add a header file for your compiler. Compilation will proceed, but performance may suffer!")
|
|
37
|
+
#endif
|
|
38
|
+
|
|
39
|
+
/* ========================================================================== */
|
|
40
|
+
/* Type definitions */
|
|
41
|
+
/* ========================================================================== */
|
|
42
|
+
|
|
43
|
+
#include <stddef.h> /* size_t */
|
|
44
|
+
|
|
45
|
+
#ifndef __bool_true_false_are_defined
|
|
46
|
+
# include <stdbool.h> /* bool */
|
|
47
|
+
#endif
|
|
48
|
+
|
|
49
|
+
/* Fixed-width integer types */
|
|
50
|
+
#ifndef PRIu32
|
|
51
|
+
# include <inttypes.h>
|
|
52
|
+
#endif
|
|
53
|
+
typedef uint8_t u8;
|
|
54
|
+
typedef uint16_t u16;
|
|
55
|
+
typedef uint32_t u32;
|
|
56
|
+
typedef uint64_t u64;
|
|
57
|
+
typedef int8_t s8;
|
|
58
|
+
typedef int16_t s16;
|
|
59
|
+
typedef int32_t s32;
|
|
60
|
+
typedef int64_t s64;
|
|
61
|
+
|
|
62
|
+
/*
|
|
63
|
+
* Word type of the target architecture. Use 'size_t' instead of 'unsigned
|
|
64
|
+
* long' to account for platforms such as Windows that use 32-bit 'unsigned
|
|
65
|
+
* long' on 64-bit architectures.
|
|
66
|
+
*/
|
|
67
|
+
typedef size_t machine_word_t;
|
|
68
|
+
|
|
69
|
+
/* Number of bytes in a word */
|
|
70
|
+
#define WORDBYTES ((int)sizeof(machine_word_t))
|
|
71
|
+
|
|
72
|
+
/* Number of bits in a word */
|
|
73
|
+
#define WORDBITS (8 * WORDBYTES)
|
|
74
|
+
|
|
75
|
+
/* ========================================================================== */
|
|
76
|
+
/* Optional compiler features */
|
|
77
|
+
/* ========================================================================== */
|
|
78
|
+
|
|
79
|
+
/* LIBEXPORT - export a function from a shared library */
|
|
80
|
+
#ifndef LIBEXPORT
|
|
81
|
+
# define LIBEXPORT
|
|
82
|
+
#endif
|
|
83
|
+
|
|
84
|
+
/* inline - suggest that a function be inlined */
|
|
85
|
+
#ifndef inline
|
|
86
|
+
# define inline
|
|
87
|
+
#endif
|
|
88
|
+
|
|
89
|
+
/* forceinline - force a function to be inlined, if possible */
|
|
90
|
+
#ifndef forceinline
|
|
91
|
+
# define forceinline inline
|
|
92
|
+
#endif
|
|
93
|
+
|
|
94
|
+
/* restrict - annotate a non-aliased pointer */
|
|
95
|
+
#ifndef restrict
|
|
96
|
+
# define restrict
|
|
97
|
+
#endif
|
|
98
|
+
|
|
99
|
+
/* likely(expr) - hint that an expression is usually true */
|
|
100
|
+
#ifndef likely
|
|
101
|
+
# define likely(expr) (expr)
|
|
102
|
+
#endif
|
|
103
|
+
|
|
104
|
+
/* unlikely(expr) - hint that an expression is usually false */
|
|
105
|
+
#ifndef unlikely
|
|
106
|
+
# define unlikely(expr) (expr)
|
|
107
|
+
#endif
|
|
108
|
+
|
|
109
|
+
/* prefetchr(addr) - prefetch into L1 cache for read */
|
|
110
|
+
#ifndef prefetchr
|
|
111
|
+
# define prefetchr(addr)
|
|
112
|
+
#endif
|
|
113
|
+
|
|
114
|
+
/* prefetchw(addr) - prefetch into L1 cache for write */
|
|
115
|
+
#ifndef prefetchw
|
|
116
|
+
# define prefetchw(addr)
|
|
117
|
+
#endif
|
|
118
|
+
|
|
119
|
+
/* Does the compiler support the 'target' function attribute? */
|
|
120
|
+
#ifndef COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
|
|
121
|
+
# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0
|
|
122
|
+
#endif
|
|
123
|
+
|
|
124
|
+
/* Are target-specific intrinsics supported in 'target' attribute functions? */
|
|
125
|
+
#ifndef COMPILER_SUPPORTS_TARGET_INTRINSICS
|
|
126
|
+
# define COMPILER_SUPPORTS_TARGET_INTRINSICS 0
|
|
127
|
+
#endif
|
|
128
|
+
|
|
129
|
+
/* Which targets are supported with the 'target' function attribute? */
|
|
130
|
+
#ifndef COMPILER_SUPPORTS_PCLMUL_TARGET
|
|
131
|
+
# define COMPILER_SUPPORTS_PCLMUL_TARGET 0
|
|
132
|
+
#endif
|
|
133
|
+
#ifndef COMPILER_SUPPORTS_BMI2_TARGET
|
|
134
|
+
# define COMPILER_SUPPORTS_BMI2_TARGET 0
|
|
135
|
+
#endif
|
|
136
|
+
#ifndef COMPILER_SUPPORTS_AVX_TARGET
|
|
137
|
+
# define COMPILER_SUPPORTS_AVX_TARGET 0
|
|
138
|
+
#endif
|
|
139
|
+
#ifndef COMPILER_SUPPORTS_AVX2_TARGET
|
|
140
|
+
# define COMPILER_SUPPORTS_AVX2_TARGET 0
|
|
141
|
+
#endif
|
|
142
|
+
|
|
143
|
+
/* _aligned_attribute(n) - declare that the annotated variable, or variables of
|
|
144
|
+
* the annotated type, are to be aligned on n-byte boundaries */
|
|
145
|
+
#ifndef _aligned_attribute
|
|
146
|
+
#endif
|
|
147
|
+
|
|
148
|
+
/* ========================================================================== */
|
|
149
|
+
/* Miscellaneous macros */
|
|
150
|
+
/* ========================================================================== */
|
|
151
|
+
|
|
152
|
+
#define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0]))
|
|
153
|
+
#define MIN(a, b) ((a) <= (b) ? (a) : (b))
|
|
154
|
+
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
|
|
155
|
+
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
|
|
156
|
+
#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)]))
|
|
157
|
+
#define ALIGN(n, a) (((n) + (a) - 1) & ~((a) - 1))
|
|
158
|
+
|
|
159
|
+
/* ========================================================================== */
|
|
160
|
+
/* Endianness handling */
|
|
161
|
+
/* ========================================================================== */
|
|
162
|
+
|
|
163
|
+
/*
|
|
164
|
+
* CPU_IS_LITTLE_ENDIAN() - a macro which evaluates to 1 if the CPU is little
|
|
165
|
+
* endian or 0 if it is big endian. The macro should be defined in a way such
|
|
166
|
+
* that the compiler can evaluate it at compilation time. If not defined, a
|
|
167
|
+
* fallback is used.
|
|
168
|
+
*/
|
|
169
|
+
#ifndef CPU_IS_LITTLE_ENDIAN
|
|
170
|
+
/*
 * Fallback endianness probe.  The value 1 is stored in an 'unsigned int' and
 * the byte at its lowest address is returned: that byte is 1 when the least
 * significant byte comes first (little endian) and 0 otherwise.
 */
static forceinline int CPU_IS_LITTLE_ENDIAN(void)
{
	const unsigned int probe = 1;

	/* Inspecting an object through 'unsigned char' is always permitted. */
	return *(const unsigned char *)&probe;
}
|
|
179
|
+
#endif
|
|
180
|
+
|
|
181
|
+
/* bswap16(n) - swap the bytes of a 16-bit integer */
|
|
182
|
+
#ifndef bswap16
|
|
183
|
+
/*
 * Fallback 16-bit byte swap: the low byte becomes the high byte and vice
 * versa.  The casts make the truncation back to 16 bits explicit.
 */
static forceinline u16 bswap16(u16 n)
{
	const u16 upper = (u16)(n << 8);	/* old low byte, moved up */
	const u16 lower = (u16)(n >> 8);	/* old high byte, moved down */

	return (u16)(upper | lower);
}
|
|
187
|
+
#endif
|
|
188
|
+
|
|
189
|
+
/* bswap32(n) - swap the bytes of a 32-bit integer */
|
|
190
|
+
#ifndef bswap32
|
|
191
|
+
/*
 * Fallback 32-bit byte swap: byte 0 (least significant) is exchanged with
 * byte 3, and byte 1 with byte 2.
 */
static forceinline u32 bswap32(u32 n)
{
	const u32 byte0 = n & 0xFF;
	const u32 byte1 = (n >> 8) & 0xFF;
	const u32 byte2 = (n >> 16) & 0xFF;
	const u32 byte3 = (n >> 24) & 0xFF;

	return (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;
}
|
|
198
|
+
#endif
|
|
199
|
+
|
|
200
|
+
/* bswap64(n) - swap the bytes of a 64-bit integer */
|
|
201
|
+
#ifndef bswap64
|
|
202
|
+
/*
 * Fallback 64-bit byte swap: the eight bytes of 'n' are returned in reverse
 * order.  Each term shifts one source byte to its mirrored position and then
 * masks off everything else.
 */
static forceinline u64 bswap64(u64 n)
{
	return  (n << 56) |
		((n << 40) & 0x00FF000000000000) |
		((n << 24) & 0x0000FF0000000000) |
		((n <<  8) & 0x000000FF00000000) |
		((n >>  8) & 0x00000000FF000000) |
		((n >> 24) & 0x0000000000FF0000) |
		((n >> 40) & 0x000000000000FF00) |
		(n >> 56);
}
|
|
213
|
+
#endif
|
|
214
|
+
|
|
215
|
+
#define le16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap16(n))
|
|
216
|
+
#define le32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap32(n))
|
|
217
|
+
#define le64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap64(n))
|
|
218
|
+
#define be16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap16(n) : (n))
|
|
219
|
+
#define be32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap32(n) : (n))
|
|
220
|
+
#define be64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap64(n) : (n))
|
|
221
|
+
|
|
222
|
+
/* ========================================================================== */
|
|
223
|
+
/* Unaligned memory accesses */
|
|
224
|
+
/* ========================================================================== */
|
|
225
|
+
|
|
226
|
+
/*
|
|
227
|
+
* UNALIGNED_ACCESS_IS_FAST should be defined to 1 if unaligned memory accesses
|
|
228
|
+
* can be performed efficiently on the target platform.
|
|
229
|
+
*/
|
|
230
|
+
#ifndef UNALIGNED_ACCESS_IS_FAST
|
|
231
|
+
# define UNALIGNED_ACCESS_IS_FAST 0
|
|
232
|
+
#endif
|
|
233
|
+
|
|
234
|
+
/*
|
|
235
|
+
* DEFINE_UNALIGNED_TYPE(type) - a macro that, given an integer type 'type',
|
|
236
|
+
* defines load_type_unaligned(addr) and store_type_unaligned(v, addr) functions
|
|
237
|
+
* which load and store variables of type 'type' from/to unaligned memory
|
|
238
|
+
* addresses. If not defined, a fallback is used.
|
|
239
|
+
*/
|
|
240
|
+
#ifndef DEFINE_UNALIGNED_TYPE
|
|
241
|
+
|
|
242
|
+
/*
|
|
243
|
+
* Although memcpy() may seem inefficient, it *usually* gets optimized
|
|
244
|
+
* appropriately by modern compilers. It's portable and may be the best we can
|
|
245
|
+
* do for a fallback...
|
|
246
|
+
*/
|
|
247
|
+
#include <string.h>
|
|
248
|
+
|
|
249
|
+
/*
 * Fallback DEFINE_UNALIGNED_TYPE(type): expands to two static functions,
 * load_<type>_unaligned(p) and store_<type>_unaligned(v, p), which copy
 * sizeof(type) bytes from/to the address 'p' with memcpy() so that 'p' does
 * not need to be suitably aligned for 'type'.
 */
#define DEFINE_UNALIGNED_TYPE(type)				\
								\
static forceinline type						\
load_##type##_unaligned(const void *p)				\
{								\
	type value;						\
								\
	memcpy(&value, p, sizeof(type));			\
	return value;						\
}								\
								\
static forceinline void						\
store_##type##_unaligned(type v, void *p)			\
{								\
	memcpy(p, &v, sizeof(type));				\
}
|
|
264
|
+
|
|
265
|
+
#endif /* !DEFINE_UNALIGNED_TYPE */
|
|
266
|
+
|
|
267
|
+
/* ========================================================================== */
|
|
268
|
+
/* Bit scan functions */
|
|
269
|
+
/* ========================================================================== */
|
|
270
|
+
|
|
271
|
+
/*
|
|
272
|
+
* Bit Scan Reverse (BSR) - find the 0-based index (relative to the least
|
|
273
|
+
* significant end) of the *most* significant 1 bit in the input value. The
|
|
274
|
+
* input value must be nonzero!
|
|
275
|
+
*/
|
|
276
|
+
|
|
277
|
+
#ifndef bsr32
|
|
278
|
+
static forceinline unsigned
|
|
279
|
+
bsr32(u32 n)
|
|
280
|
+
{
|
|
281
|
+
unsigned i = 0;
|
|
282
|
+
while ((n >>= 1) != 0)
|
|
283
|
+
i++;
|
|
284
|
+
return i;
|
|
285
|
+
}
|
|
286
|
+
#endif
|
|
287
|
+
|
|
288
|
+
#ifndef bsr64
|
|
289
|
+
static forceinline unsigned
|
|
290
|
+
bsr64(u64 n)
|
|
291
|
+
{
|
|
292
|
+
unsigned i = 0;
|
|
293
|
+
while ((n >>= 1) != 0)
|
|
294
|
+
i++;
|
|
295
|
+
return i;
|
|
296
|
+
}
|
|
297
|
+
#endif
|
|
298
|
+
|
|
299
|
+
static forceinline unsigned
|
|
300
|
+
bsrw(machine_word_t n)
|
|
301
|
+
{
|
|
302
|
+
STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
|
|
303
|
+
if (WORDBITS == 32)
|
|
304
|
+
return bsr32(n);
|
|
305
|
+
else
|
|
306
|
+
return bsr64(n);
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
/*
|
|
310
|
+
* Bit Scan Forward (BSF) - find the 0-based index (relative to the least
|
|
311
|
+
* significant end) of the *least* significant 1 bit in the input value. The
|
|
312
|
+
* input value must be nonzero!
|
|
313
|
+
*/
|
|
314
|
+
|
|
315
|
+
#ifndef bsf32
|
|
316
|
+
static forceinline unsigned
|
|
317
|
+
bsf32(u32 n)
|
|
318
|
+
{
|
|
319
|
+
unsigned i = 0;
|
|
320
|
+
while ((n & 1) == 0) {
|
|
321
|
+
i++;
|
|
322
|
+
n >>= 1;
|
|
323
|
+
}
|
|
324
|
+
return i;
|
|
325
|
+
}
|
|
326
|
+
#endif
|
|
327
|
+
|
|
328
|
+
#ifndef bsf64
|
|
329
|
+
static forceinline unsigned
|
|
330
|
+
bsf64(u64 n)
|
|
331
|
+
{
|
|
332
|
+
unsigned i = 0;
|
|
333
|
+
while ((n & 1) == 0) {
|
|
334
|
+
i++;
|
|
335
|
+
n >>= 1;
|
|
336
|
+
}
|
|
337
|
+
return i;
|
|
338
|
+
}
|
|
339
|
+
#endif
|
|
340
|
+
|
|
341
|
+
static forceinline unsigned
|
|
342
|
+
bsfw(machine_word_t n)
|
|
343
|
+
{
|
|
344
|
+
STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
|
|
345
|
+
if (WORDBITS == 32)
|
|
346
|
+
return bsf32(n);
|
|
347
|
+
else
|
|
348
|
+
return bsf64(n);
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
#endif /* COMMON_COMMON_DEFS_H */
|