zstd-ruby 1.4.4.0 → 1.5.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/dependabot.yml +8 -0
- data/.github/workflows/ruby.yml +35 -0
- data/README.md +2 -2
- data/ext/zstdruby/extconf.rb +1 -0
- data/ext/zstdruby/libzstd/BUCK +5 -7
- data/ext/zstdruby/libzstd/Makefile +241 -173
- data/ext/zstdruby/libzstd/README.md +76 -18
- data/ext/zstdruby/libzstd/common/bitstream.h +75 -57
- data/ext/zstdruby/libzstd/common/compiler.h +196 -20
- data/ext/zstdruby/libzstd/common/cpu.h +1 -3
- data/ext/zstdruby/libzstd/common/debug.c +11 -31
- data/ext/zstdruby/libzstd/common/debug.h +22 -49
- data/ext/zstdruby/libzstd/common/entropy_common.c +208 -76
- data/ext/zstdruby/libzstd/common/error_private.c +3 -1
- data/ext/zstdruby/libzstd/common/error_private.h +87 -4
- data/ext/zstdruby/libzstd/common/fse.h +51 -42
- data/ext/zstdruby/libzstd/common/fse_decompress.c +149 -57
- data/ext/zstdruby/libzstd/common/huf.h +60 -54
- data/ext/zstdruby/libzstd/common/mem.h +87 -98
- data/ext/zstdruby/libzstd/common/pool.c +23 -17
- data/ext/zstdruby/libzstd/common/pool.h +3 -3
- data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
- data/ext/zstdruby/libzstd/common/threading.c +10 -8
- data/ext/zstdruby/libzstd/common/threading.h +4 -3
- data/ext/zstdruby/libzstd/common/xxhash.c +15 -873
- data/ext/zstdruby/libzstd/common/xxhash.h +5572 -191
- data/ext/zstdruby/libzstd/common/zstd_common.c +10 -10
- data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
- data/ext/zstdruby/libzstd/common/zstd_internal.h +252 -108
- data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
- data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
- data/ext/zstdruby/libzstd/compress/fse_compress.c +105 -85
- data/ext/zstdruby/libzstd/compress/hist.c +41 -63
- data/ext/zstdruby/libzstd/compress/hist.h +13 -33
- data/ext/zstdruby/libzstd/compress/huf_compress.c +831 -259
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +3213 -1007
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +493 -71
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +21 -16
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +4 -2
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +51 -24
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +10 -3
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +573 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
- data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +208 -81
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +315 -137
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +319 -128
- data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1156 -171
- data/ext/zstdruby/libzstd/compress/zstd_lazy.h +59 -1
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +331 -206
- data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
- data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +403 -226
- data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +188 -453
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +32 -114
- data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1065 -410
- data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +20 -16
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +691 -230
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1072 -323
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +16 -7
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +71 -10
- data/ext/zstdruby/libzstd/deprecated/zbuff.h +3 -3
- data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
- data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +24 -4
- data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +57 -40
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
- data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +54 -35
- data/ext/zstdruby/libzstd/dictBuilder/zdict.c +151 -57
- data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
- data/ext/zstdruby/libzstd/dll/example/README.md +16 -22
- data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +4 -4
- data/ext/zstdruby/libzstd/legacy/zstd_v01.c +25 -19
- data/ext/zstdruby/libzstd/legacy/zstd_v01.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v02.c +18 -14
- data/ext/zstdruby/libzstd/legacy/zstd_v02.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v03.c +18 -14
- data/ext/zstdruby/libzstd/legacy/zstd_v03.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v04.c +22 -16
- data/ext/zstdruby/libzstd/legacy/zstd_v04.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v05.c +29 -25
- data/ext/zstdruby/libzstd/legacy/zstd_v05.h +2 -2
- data/ext/zstdruby/libzstd/legacy/zstd_v06.c +29 -25
- data/ext/zstdruby/libzstd/legacy/zstd_v06.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v07.c +34 -26
- data/ext/zstdruby/libzstd/legacy/zstd_v07.h +1 -1
- data/ext/zstdruby/libzstd/libzstd.mk +185 -0
- data/ext/zstdruby/libzstd/libzstd.pc.in +4 -3
- data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
- data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +201 -31
- data/ext/zstdruby/libzstd/zstd.h +760 -234
- data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +3 -1
- data/ext/zstdruby/zstdruby.c +2 -2
- data/lib/zstd-ruby/version.rb +1 -1
- metadata +20 -9
- data/.travis.yml +0 -14
@@ -0,0 +1,185 @@
|
|
1
|
+
# ################################################################
|
2
|
+
# Copyright (c) Yann Collet, Facebook, Inc.
|
3
|
+
# All rights reserved.
|
4
|
+
#
|
5
|
+
# This source code is licensed under both the BSD-style license (found in the
|
6
|
+
# LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
7
|
+
# in the COPYING file in the root directory of this source tree).
|
8
|
+
# You may select, at your option, one of the above-listed licenses.
|
9
|
+
# ################################################################
|
10
|
+
|
11
|
+
##################################################################
|
12
|
+
# Input Variables
|
13
|
+
##################################################################
|
14
|
+
|
15
|
+
# Zstd lib directory
|
16
|
+
LIBZSTD ?= ./
|
17
|
+
|
18
|
+
# Legacy support
|
19
|
+
ZSTD_LEGACY_SUPPORT ?= 5
|
20
|
+
ZSTD_LEGACY_MULTITHREADED_API ?= 0
|
21
|
+
|
22
|
+
# Build size optimizations
|
23
|
+
HUF_FORCE_DECOMPRESS_X1 ?= 0
|
24
|
+
HUF_FORCE_DECOMPRESS_X2 ?= 0
|
25
|
+
ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT ?= 0
|
26
|
+
ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG ?= 0
|
27
|
+
ZSTD_NO_INLINE ?= 0
|
28
|
+
ZSTD_STRIP_ERROR_STRINGS ?= 0
|
29
|
+
|
30
|
+
# Assembly support
|
31
|
+
ZSTD_NO_ASM ?= 0
|
32
|
+
|
33
|
+
##################################################################
|
34
|
+
# libzstd helpers
|
35
|
+
##################################################################
|
36
|
+
|
37
|
+
# Make 4.3 doesn't support '\#' anymore (https://lwn.net/Articles/810071/)
|
38
|
+
NUM_SYMBOL := \#
|
39
|
+
|
40
|
+
# define silent mode as default (verbose mode with V=1 or VERBOSE=1)
|
41
|
+
$(V)$(VERBOSE).SILENT:
|
42
|
+
|
43
|
+
# When cross-compiling from linux to windows,
|
44
|
+
# one might need to specify TARGET_SYSTEM as "Windows."
|
45
|
+
# Building from Fedora fails without it.
|
46
|
+
# (but Ubuntu and Debian don't need to set anything)
|
47
|
+
TARGET_SYSTEM ?= $(OS)
|
48
|
+
|
49
|
+
# Version numbers
|
50
|
+
LIBVER_SRC := $(LIBZSTD)/zstd.h
|
51
|
+
LIBVER_MAJOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MAJOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
|
52
|
+
LIBVER_MINOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MINOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
|
53
|
+
LIBVER_PATCH_SCRIPT:=`sed -n '/define ZSTD_VERSION_RELEASE/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
|
54
|
+
LIBVER_SCRIPT:= $(LIBVER_MAJOR_SCRIPT).$(LIBVER_MINOR_SCRIPT).$(LIBVER_PATCH_SCRIPT)
|
55
|
+
LIBVER_MAJOR := $(shell echo $(LIBVER_MAJOR_SCRIPT))
|
56
|
+
LIBVER_MINOR := $(shell echo $(LIBVER_MINOR_SCRIPT))
|
57
|
+
LIBVER_PATCH := $(shell echo $(LIBVER_PATCH_SCRIPT))
|
58
|
+
LIBVER := $(shell echo $(LIBVER_SCRIPT))
|
59
|
+
CCVER := $(shell $(CC) --version)
|
60
|
+
ZSTD_VERSION?= $(LIBVER)
|
61
|
+
|
62
|
+
# ZSTD_LIB_MINIFY is a helper variable that
|
63
|
+
# configures a bunch of other variables to space-optimized defaults.
|
64
|
+
ZSTD_LIB_MINIFY ?= 0
|
65
|
+
ifneq ($(ZSTD_LIB_MINIFY), 0)
|
66
|
+
HAVE_CC_OZ ?= $(shell echo "" | $(CC) -Oz -x c -c - -o /dev/null 2> /dev/null && echo 1 || echo 0)
|
67
|
+
ZSTD_LEGACY_SUPPORT ?= 0
|
68
|
+
ZSTD_LIB_DEPRECATED ?= 0
|
69
|
+
HUF_FORCE_DECOMPRESS_X1 ?= 1
|
70
|
+
ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT ?= 1
|
71
|
+
ZSTD_NO_INLINE ?= 1
|
72
|
+
ZSTD_STRIP_ERROR_STRINGS ?= 1
|
73
|
+
ifneq ($(HAVE_CC_OZ), 0)
|
74
|
+
# Some compilers (clang) support an even more space-optimized setting.
|
75
|
+
CFLAGS += -Oz
|
76
|
+
else
|
77
|
+
CFLAGS += -Os
|
78
|
+
endif
|
79
|
+
CFLAGS += -fno-stack-protector -fomit-frame-pointer -fno-ident \
|
80
|
+
-DDYNAMIC_BMI2=0 -DNDEBUG
|
81
|
+
else
|
82
|
+
CFLAGS += -O3
|
83
|
+
endif
|
84
|
+
|
85
|
+
DEBUGLEVEL ?= 0
|
86
|
+
CPPFLAGS += -DXXH_NAMESPACE=ZSTD_ -DDEBUGLEVEL=$(DEBUGLEVEL)
|
87
|
+
ifeq ($(TARGET_SYSTEM),Windows_NT) # MinGW assumed
|
88
|
+
CPPFLAGS += -D__USE_MINGW_ANSI_STDIO # compatibility with %zu formatting
|
89
|
+
endif
|
90
|
+
DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
|
91
|
+
-Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
|
92
|
+
-Wstrict-prototypes -Wundef -Wpointer-arith \
|
93
|
+
-Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
|
94
|
+
-Wredundant-decls -Wmissing-prototypes -Wc++-compat
|
95
|
+
CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS)
|
96
|
+
LDFLAGS += $(MOREFLAGS)
|
97
|
+
FLAGS = $(CPPFLAGS) $(CFLAGS) $(LDFLAGS)
|
98
|
+
|
99
|
+
HAVE_COLORNEVER = $(shell echo a | grep --color=never a > /dev/null 2> /dev/null && echo 1 || echo 0)
|
100
|
+
GREP_OPTIONS ?=
|
101
|
+
ifeq ($HAVE_COLORNEVER, 1)
|
102
|
+
GREP_OPTIONS += --color=never
|
103
|
+
endif
|
104
|
+
GREP = grep $(GREP_OPTIONS)
|
105
|
+
SED_ERE_OPT ?= -E
|
106
|
+
|
107
|
+
ZSTD_COMMON_FILES := $(sort $(wildcard $(LIBZSTD)/common/*.c))
|
108
|
+
ZSTD_COMPRESS_FILES := $(sort $(wildcard $(LIBZSTD)/compress/*.c))
|
109
|
+
ZSTD_DECOMPRESS_FILES := $(sort $(wildcard $(LIBZSTD)/decompress/*.c))
|
110
|
+
ZSTD_DICTBUILDER_FILES := $(sort $(wildcard $(LIBZSTD)/dictBuilder/*.c))
|
111
|
+
ZSTD_DEPRECATED_FILES := $(sort $(wildcard $(LIBZSTD)/deprecated/*.c))
|
112
|
+
ZSTD_LEGACY_FILES :=
|
113
|
+
|
114
|
+
ZSTD_DECOMPRESS_AMD64_ASM_FILES := $(sort $(wildcard $(LIBZSTD)/decompress/*_amd64.S))
|
115
|
+
|
116
|
+
ifneq ($(ZSTD_NO_ASM), 0)
|
117
|
+
CPPFLAGS += -DZSTD_DISABLE_ASM
|
118
|
+
else
|
119
|
+
# Unconditionally add the ASM files; they are disabled by
|
120
|
+
# macros in the .S file.
|
121
|
+
ZSTD_DECOMPRESS_FILES += $(ZSTD_DECOMPRESS_AMD64_ASM_FILES)
|
122
|
+
endif
|
123
|
+
|
124
|
+
ifneq ($(HUF_FORCE_DECOMPRESS_X1), 0)
|
125
|
+
CFLAGS += -DHUF_FORCE_DECOMPRESS_X1
|
126
|
+
endif
|
127
|
+
|
128
|
+
ifneq ($(HUF_FORCE_DECOMPRESS_X2), 0)
|
129
|
+
CFLAGS += -DHUF_FORCE_DECOMPRESS_X2
|
130
|
+
endif
|
131
|
+
|
132
|
+
ifneq ($(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT), 0)
|
133
|
+
CFLAGS += -DZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
|
134
|
+
endif
|
135
|
+
|
136
|
+
ifneq ($(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG), 0)
|
137
|
+
CFLAGS += -DZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
|
138
|
+
endif
|
139
|
+
|
140
|
+
ifneq ($(ZSTD_NO_INLINE), 0)
|
141
|
+
CFLAGS += -DZSTD_NO_INLINE
|
142
|
+
endif
|
143
|
+
|
144
|
+
ifneq ($(ZSTD_STRIP_ERROR_STRINGS), 0)
|
145
|
+
CFLAGS += -DZSTD_STRIP_ERROR_STRINGS
|
146
|
+
endif
|
147
|
+
|
148
|
+
ifneq ($(ZSTD_LEGACY_MULTITHREADED_API), 0)
|
149
|
+
CFLAGS += -DZSTD_LEGACY_MULTITHREADED_API
|
150
|
+
endif
|
151
|
+
|
152
|
+
ifneq ($(ZSTD_LEGACY_SUPPORT), 0)
|
153
|
+
ifeq ($(shell test $(ZSTD_LEGACY_SUPPORT) -lt 8; echo $$?), 0)
|
154
|
+
ZSTD_LEGACY_FILES += $(shell ls $(LIBZSTD)/legacy/*.c | $(GREP) 'v0[$(ZSTD_LEGACY_SUPPORT)-7]')
|
155
|
+
endif
|
156
|
+
endif
|
157
|
+
CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
|
158
|
+
|
159
|
+
UNAME := $(shell uname)
|
160
|
+
|
161
|
+
ifndef BUILD_DIR
|
162
|
+
ifeq ($(UNAME), Darwin)
|
163
|
+
ifeq ($(shell md5 < /dev/null > /dev/null; echo $$?), 0)
|
164
|
+
HASH ?= md5
|
165
|
+
endif
|
166
|
+
else ifeq ($(UNAME), FreeBSD)
|
167
|
+
HASH ?= gmd5sum
|
168
|
+
else ifeq ($(UNAME), NetBSD)
|
169
|
+
HASH ?= md5 -n
|
170
|
+
else ifeq ($(UNAME), OpenBSD)
|
171
|
+
HASH ?= md5
|
172
|
+
endif
|
173
|
+
HASH ?= md5sum
|
174
|
+
|
175
|
+
HASH_DIR = conf_$(shell echo $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $(ZSTD_FILES) | $(HASH) | cut -f 1 -d " " )
|
176
|
+
HAVE_HASH :=$(shell echo 1 | $(HASH) > /dev/null && echo 1 || echo 0)
|
177
|
+
ifeq ($(HAVE_HASH),0)
|
178
|
+
$(info warning : could not find HASH ($(HASH)), needed to differentiate builds using different flags)
|
179
|
+
BUILD_DIR := obj/generic_noconf
|
180
|
+
endif
|
181
|
+
endif # BUILD_DIR
|
182
|
+
|
183
|
+
ZSTD_SUBDIR := $(LIBZSTD)/common $(LIBZSTD)/compress $(LIBZSTD)/decompress $(LIBZSTD)/dictBuilder $(LIBZSTD)/legacy $(LIBZSTD)/deprecated
|
184
|
+
vpath %.c $(ZSTD_SUBDIR)
|
185
|
+
vpath %.S $(ZSTD_SUBDIR)
|
@@ -3,13 +3,14 @@
|
|
3
3
|
# BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
|
4
4
|
|
5
5
|
prefix=@PREFIX@
|
6
|
-
exec_prefix
|
7
|
-
includedir
|
8
|
-
libdir
|
6
|
+
exec_prefix=@EXEC_PREFIX@
|
7
|
+
includedir=@INCLUDEDIR@
|
8
|
+
libdir=@LIBDIR@
|
9
9
|
|
10
10
|
Name: zstd
|
11
11
|
Description: fast lossless compression algorithm library
|
12
12
|
URL: http://www.zstd.net/
|
13
13
|
Version: @VERSION@
|
14
14
|
Libs: -L${libdir} -lzstd
|
15
|
+
Libs.private: @LIBS_PRIVATE@
|
15
16
|
Cflags: -I${includedir}
|
@@ -1,5 +1,5 @@
|
|
1
1
|
/*
|
2
|
-
* Copyright (c)
|
2
|
+
* Copyright (c) Yann Collet, Facebook, Inc.
|
3
3
|
* All rights reserved.
|
4
4
|
*
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
@@ -36,6 +36,145 @@ extern "C" {
|
|
36
36
|
# define ZDICTLIB_API ZDICTLIB_VISIBILITY
|
37
37
|
#endif
|
38
38
|
|
39
|
+
/*******************************************************************************
|
40
|
+
* Zstd dictionary builder
|
41
|
+
*
|
42
|
+
* FAQ
|
43
|
+
* ===
|
44
|
+
* Why should I use a dictionary?
|
45
|
+
* ------------------------------
|
46
|
+
*
|
47
|
+
* Zstd can use dictionaries to improve compression ratio of small data.
|
48
|
+
* Traditionally small files don't compress well because there is very little
|
49
|
+
* repetition in a single sample, since it is small. But, if you are compressing
|
50
|
+
* many similar files, like a bunch of JSON records that share the same
|
51
|
+
* structure, you can train a dictionary ahead of time on some samples of
|
52
|
+
* these files. Then, zstd can use the dictionary to find repetitions that are
|
53
|
+
* present across samples. This can vastly improve compression ratio.
|
54
|
+
*
|
55
|
+
* When is a dictionary useful?
|
56
|
+
* ----------------------------
|
57
|
+
*
|
58
|
+
* Dictionaries are useful when compressing many small files that are similar.
|
59
|
+
* The larger a file is, the less benefit a dictionary will have. Generally,
|
60
|
+
* we don't expect dictionary compression to be effective past 100KB. And the
|
61
|
+
* smaller a file is, the more we would expect the dictionary to help.
|
62
|
+
*
|
63
|
+
* How do I use a dictionary?
|
64
|
+
* --------------------------
|
65
|
+
*
|
66
|
+
* Simply pass the dictionary to the zstd compressor with
|
67
|
+
* `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to
|
68
|
+
* the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other
|
69
|
+
* more advanced functions that allow selecting some options, see zstd.h for
|
70
|
+
* complete documentation.
|
71
|
+
*
|
72
|
+
* What is a zstd dictionary?
|
73
|
+
* --------------------------
|
74
|
+
*
|
75
|
+
* A zstd dictionary has two pieces: Its header, and its content. The header
|
76
|
+
* contains a magic number, the dictionary ID, and entropy tables. These
|
77
|
+
* entropy tables allow zstd to save on header costs in the compressed file,
|
78
|
+
* which really matters for small data. The content is just bytes, which are
|
79
|
+
* repeated content that is common across many samples.
|
80
|
+
*
|
81
|
+
* What is a raw content dictionary?
|
82
|
+
* ---------------------------------
|
83
|
+
*
|
84
|
+
* A raw content dictionary is just bytes. It doesn't have a zstd dictionary
|
85
|
+
* header, a dictionary ID, or entropy tables. Any buffer is a valid raw
|
86
|
+
* content dictionary.
|
87
|
+
*
|
88
|
+
* How do I train a dictionary?
|
89
|
+
* ----------------------------
|
90
|
+
*
|
91
|
+
* Gather samples from your use case. These samples should be similar to each
|
92
|
+
* other. If you have several use cases, you could try to train one dictionary
|
93
|
+
* per use case.
|
94
|
+
*
|
95
|
+
* Pass those samples to `ZDICT_trainFromBuffer()` and that will train your
|
96
|
+
* dictionary. There are a few advanced versions of this function, but this
|
97
|
+
* is a great starting point. If you want to further tune your dictionary
|
98
|
+
* you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow
|
99
|
+
* you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`.
|
100
|
+
*
|
101
|
+
* If the dictionary training function fails, that is likely because you
|
102
|
+
* either passed too few samples, or a dictionary would not be effective
|
103
|
+
* for your data. Look at the messages that the dictionary trainer printed,
|
104
|
+
* if it doesn't say too few samples, then a dictionary would not be effective.
|
105
|
+
*
|
106
|
+
* How large should my dictionary be?
|
107
|
+
* ----------------------------------
|
108
|
+
*
|
109
|
+
* A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB.
|
110
|
+
* The zstd CLI defaults to a 110KB dictionary. You likely don't need a
|
111
|
+
* dictionary larger than that. But, most use cases can get away with a
|
112
|
+
* smaller dictionary. The advanced dictionary builders can automatically
|
113
|
+
* shrink the dictionary for you, and select the smallest size that
|
114
|
+
* doesn't hurt compression ratio too much. See the `shrinkDict` parameter.
|
115
|
+
* A smaller dictionary can save memory, and potentially speed up
|
116
|
+
* compression.
|
117
|
+
*
|
118
|
+
* How many samples should I provide to the dictionary builder?
|
119
|
+
* ------------------------------------------------------------
|
120
|
+
*
|
121
|
+
* We generally recommend passing ~100x the size of the dictionary
|
122
|
+
* in samples. A few thousand should suffice. Having too few samples
|
123
|
+
* can hurt the dictionary's effectiveness. Having more samples will
|
124
|
+
* only improve the dictionary's effectiveness. But having too many
|
125
|
+
* samples can slow down the dictionary builder.
|
126
|
+
*
|
127
|
+
* How do I determine if a dictionary will be effective?
|
128
|
+
* -----------------------------------------------------
|
129
|
+
*
|
130
|
+
* Simply train a dictionary and try it out. You can use zstd's built-in
|
131
|
+
* benchmarking tool to test the dictionary effectiveness.
|
132
|
+
*
|
133
|
+
* # Benchmark levels 1-3 without a dictionary
|
134
|
+
* zstd -b1e3 -r /path/to/my/files
|
135
|
+
* # Benchmark levels 1-3 with a dictionary
|
136
|
+
* zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary
|
137
|
+
*
|
138
|
+
* When should I retrain a dictionary?
|
139
|
+
* -----------------------------------
|
140
|
+
*
|
141
|
+
* You should retrain a dictionary when its effectiveness drops. Dictionary
|
142
|
+
* effectiveness drops as the data you are compressing changes. Generally, we do
|
143
|
+
* expect dictionaries to "decay" over time, as your data changes, but the rate
|
144
|
+
* at which they decay depends on your use case. Internally, we regularly
|
145
|
+
* retrain dictionaries, and if the new dictionary performs significantly
|
146
|
+
* better than the old dictionary, we will ship the new dictionary.
|
147
|
+
*
|
148
|
+
* I have a raw content dictionary, how do I turn it into a zstd dictionary?
|
149
|
+
* -------------------------------------------------------------------------
|
150
|
+
*
|
151
|
+
* If you have a raw content dictionary, e.g. by manually constructing it, or
|
152
|
+
* using a third-party dictionary builder, you can turn it into a zstd
|
153
|
+
* dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to
|
154
|
+
* provide some samples of the data. It will add the zstd header to the
|
155
|
+
* raw content, which contains a dictionary ID and entropy tables, which
|
156
|
+
* will improve compression ratio, and allow zstd to write the dictionary ID
|
157
|
+
* into the frame, if you so choose.
|
158
|
+
*
|
159
|
+
* Do I have to use zstd's dictionary builder?
|
160
|
+
* -------------------------------------------
|
161
|
+
*
|
162
|
+
* No! You can construct dictionary content however you please, it is just
|
163
|
+
* bytes. It will always be valid as a raw content dictionary. If you want
|
164
|
+
* a zstd dictionary, which can improve compression ratio, use
|
165
|
+
* `ZDICT_finalizeDictionary()`.
|
166
|
+
*
|
167
|
+
* What is the attack surface of a zstd dictionary?
|
168
|
+
* ------------------------------------------------
|
169
|
+
*
|
170
|
+
* Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so
|
171
|
+
* zstd should never crash, or access out-of-bounds memory no matter what
|
172
|
+
* the dictionary is. However, if an attacker can control the dictionary
|
173
|
+
* during decompression, they can cause zstd to generate arbitrary bytes,
|
174
|
+
* just like if they controlled the compressed data.
|
175
|
+
*
|
176
|
+
******************************************************************************/
|
177
|
+
|
39
178
|
|
40
179
|
/*! ZDICT_trainFromBuffer():
|
41
180
|
* Train a dictionary from an array of samples.
|
@@ -61,9 +200,63 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCap
|
|
61
200
|
const void* samplesBuffer,
|
62
201
|
const size_t* samplesSizes, unsigned nbSamples);
|
63
202
|
|
203
|
+
typedef struct {
|
204
|
+
int compressionLevel; /*< optimize for a specific zstd compression level; 0 means default */
|
205
|
+
unsigned notificationLevel; /*< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
|
206
|
+
unsigned dictID; /*< force dictID value; 0 means auto mode (32-bits random value)
|
207
|
+
* NOTE: The zstd format reserves some dictionary IDs for future use.
|
208
|
+
* You may use them in private settings, but be warned that they
|
209
|
+
* may be used by zstd in a public dictionary registry in the future.
|
210
|
+
* These dictionary IDs are:
|
211
|
+
* - low range : <= 32767
|
212
|
+
* - high range : >= (2^31)
|
213
|
+
*/
|
214
|
+
} ZDICT_params_t;
|
215
|
+
|
216
|
+
/*! ZDICT_finalizeDictionary():
|
217
|
+
* Given a custom content as a basis for dictionary, and a set of samples,
|
218
|
+
* finalize dictionary by adding headers and statistics according to the zstd
|
219
|
+
* dictionary format.
|
220
|
+
*
|
221
|
+
* Samples must be stored concatenated in a flat buffer `samplesBuffer`,
|
222
|
+
* supplied with an array of sizes `samplesSizes`, providing the size of each
|
223
|
+
* sample in order. The samples are used to construct the statistics, so they
|
224
|
+
* should be representative of what you will compress with this dictionary.
|
225
|
+
*
|
226
|
+
* The compression level can be set in `parameters`. You should pass the
|
227
|
+
* compression level you expect to use in production. The statistics for each
|
228
|
+
* compression level differ, so tuning the dictionary for the compression level
|
229
|
+
* can help quite a bit.
|
230
|
+
*
|
231
|
+
* You can set an explicit dictionary ID in `parameters`, or allow us to pick
|
232
|
+
* a random dictionary ID for you, but we can't guarantee no collisions.
|
233
|
+
*
|
234
|
+
* The dstDictBuffer and the dictContent may overlap, and the content will be
|
235
|
+
* appended to the end of the header. If the header + the content doesn't fit in
|
236
|
+
* maxDictSize, the beginning of the content is truncated to make room, since it
|
237
|
+
* is presumed that the most profitable content is at the end of the dictionary,
|
238
|
+
* since that is the cheapest to reference.
|
239
|
+
*
|
240
|
+
* `maxDictSize` must be >= max(dictContentSize, ZDICT_DICTSIZE_MIN).
|
241
|
+
*
|
242
|
+
* @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`),
|
243
|
+
* or an error code, which can be tested by ZDICT_isError().
|
244
|
+
* Note: ZDICT_finalizeDictionary() will push notifications into stderr if
|
245
|
+
* instructed to, using notificationLevel>0.
|
246
|
+
* NOTE: This function currently may fail in several edge cases including:
|
247
|
+
* * Not enough samples
|
248
|
+
* * Samples are uncompressible
|
249
|
+
* * Samples are all exactly the same
|
250
|
+
*/
|
251
|
+
ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize,
|
252
|
+
const void* dictContent, size_t dictContentSize,
|
253
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
254
|
+
ZDICT_params_t parameters);
|
255
|
+
|
64
256
|
|
65
257
|
/*====== Helper functions ======*/
|
66
258
|
ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID; @return zero if error (not a valid dictionary) */
|
259
|
+
ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /* returns dict header size; returns a ZSTD error code on failure */
|
67
260
|
ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode);
|
68
261
|
ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
|
69
262
|
|
@@ -78,11 +271,9 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
|
|
78
271
|
* Use them only in association with static linking.
|
79
272
|
* ==================================================================================== */
|
80
273
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
unsigned dictID; /* force dictID value; 0 means auto mode (32-bits random value) */
|
85
|
-
} ZDICT_params_t;
|
274
|
+
#define ZDICT_DICTSIZE_MIN 256
|
275
|
+
/* Deprecated: Remove in v1.6.0 */
|
276
|
+
#define ZDICT_CONTENTSIZE_MIN 128
|
86
277
|
|
87
278
|
/*! ZDICT_cover_params_t:
|
88
279
|
* k and d are the only required parameters.
|
@@ -198,28 +389,6 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
|
|
198
389
|
const size_t* samplesSizes, unsigned nbSamples,
|
199
390
|
ZDICT_fastCover_params_t* parameters);
|
200
391
|
|
201
|
-
/*! ZDICT_finalizeDictionary():
|
202
|
-
* Given a custom content as a basis for dictionary, and a set of samples,
|
203
|
-
* finalize dictionary by adding headers and statistics.
|
204
|
-
*
|
205
|
-
* Samples must be stored concatenated in a flat buffer `samplesBuffer`,
|
206
|
-
* supplied with an array of sizes `samplesSizes`, providing the size of each sample in order.
|
207
|
-
*
|
208
|
-
* dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes.
|
209
|
-
* maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
|
210
|
-
*
|
211
|
-
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
|
212
|
-
* or an error code, which can be tested by ZDICT_isError().
|
213
|
-
* Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
|
214
|
-
* Note 2: dictBuffer and dictContent can overlap
|
215
|
-
*/
|
216
|
-
#define ZDICT_CONTENTSIZE_MIN 128
|
217
|
-
#define ZDICT_DICTSIZE_MIN 256
|
218
|
-
ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
|
219
|
-
const void* dictContent, size_t dictContentSize,
|
220
|
-
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
221
|
-
ZDICT_params_t parameters);
|
222
|
-
|
223
392
|
typedef struct {
|
224
393
|
unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */
|
225
394
|
ZDICT_params_t zParams;
|
@@ -241,10 +410,11 @@ typedef struct {
|
|
241
410
|
* Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
|
242
411
|
*/
|
243
412
|
ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
|
244
|
-
void
|
245
|
-
const void
|
413
|
+
void* dictBuffer, size_t dictBufferCapacity,
|
414
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
246
415
|
ZDICT_legacy_params_t parameters);
|
247
416
|
|
417
|
+
|
248
418
|
/* Deprecation warnings */
|
249
419
|
/* It is generally possible to disable deprecation warnings from compiler,
|
250
420
|
for example with -Wno-deprecated-declarations for gcc
|
@@ -256,7 +426,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
|
|
256
426
|
# define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
|
257
427
|
# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
|
258
428
|
# define ZDICT_DEPRECATED(message) [[deprecated(message)]] ZDICTLIB_API
|
259
|
-
# elif (ZDICT_GCC_VERSION >= 405)
|
429
|
+
# elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405)
|
260
430
|
# define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message)))
|
261
431
|
# elif (ZDICT_GCC_VERSION >= 301)
|
262
432
|
# define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated))
|