zstd-ruby 1.4.4.0 → 1.5.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/.github/dependabot.yml +8 -0
  3. data/.github/workflows/ruby.yml +35 -0
  4. data/README.md +2 -2
  5. data/ext/zstdruby/extconf.rb +1 -0
  6. data/ext/zstdruby/libzstd/BUCK +5 -7
  7. data/ext/zstdruby/libzstd/Makefile +241 -173
  8. data/ext/zstdruby/libzstd/README.md +76 -18
  9. data/ext/zstdruby/libzstd/common/bitstream.h +75 -57
  10. data/ext/zstdruby/libzstd/common/compiler.h +196 -20
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  12. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  13. data/ext/zstdruby/libzstd/common/debug.h +22 -49
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +208 -76
  15. data/ext/zstdruby/libzstd/common/error_private.c +3 -1
  16. data/ext/zstdruby/libzstd/common/error_private.h +87 -4
  17. data/ext/zstdruby/libzstd/common/fse.h +51 -42
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +149 -57
  19. data/ext/zstdruby/libzstd/common/huf.h +60 -54
  20. data/ext/zstdruby/libzstd/common/mem.h +87 -98
  21. data/ext/zstdruby/libzstd/common/pool.c +23 -17
  22. data/ext/zstdruby/libzstd/common/pool.h +3 -3
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
  24. data/ext/zstdruby/libzstd/common/threading.c +10 -8
  25. data/ext/zstdruby/libzstd/common/threading.h +4 -3
  26. data/ext/zstdruby/libzstd/common/xxhash.c +15 -873
  27. data/ext/zstdruby/libzstd/common/xxhash.h +5572 -191
  28. data/ext/zstdruby/libzstd/common/zstd_common.c +10 -10
  29. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  30. data/ext/zstdruby/libzstd/common/zstd_internal.h +252 -108
  31. data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
  32. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  33. data/ext/zstdruby/libzstd/compress/fse_compress.c +105 -85
  34. data/ext/zstdruby/libzstd/compress/hist.c +41 -63
  35. data/ext/zstdruby/libzstd/compress/hist.h +13 -33
  36. data/ext/zstdruby/libzstd/compress/huf_compress.c +831 -259
  37. data/ext/zstdruby/libzstd/compress/zstd_compress.c +3213 -1007
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +493 -71
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +21 -16
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +4 -2
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +51 -24
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +10 -3
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +573 -0
  44. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  45. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +208 -81
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +315 -137
  47. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.c +319 -128
  49. data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1156 -171
  51. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +59 -1
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +331 -206
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
  54. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.c +403 -226
  56. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +188 -453
  58. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +32 -114
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1065 -410
  60. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +20 -16
  62. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +691 -230
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1072 -323
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +16 -7
  66. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +71 -10
  67. data/ext/zstdruby/libzstd/deprecated/zbuff.h +3 -3
  68. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
  69. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +24 -4
  70. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  71. data/ext/zstdruby/libzstd/dictBuilder/cover.c +57 -40
  72. data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
  73. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  74. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +54 -35
  75. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +151 -57
  76. data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
  77. data/ext/zstdruby/libzstd/dll/example/README.md +16 -22
  78. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +4 -4
  79. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +25 -19
  80. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +1 -1
  81. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +18 -14
  82. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +1 -1
  83. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +18 -14
  84. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +1 -1
  85. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +22 -16
  86. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +1 -1
  87. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +29 -25
  88. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +2 -2
  89. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +29 -25
  90. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +1 -1
  91. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +34 -26
  92. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +1 -1
  93. data/ext/zstdruby/libzstd/libzstd.mk +185 -0
  94. data/ext/zstdruby/libzstd/libzstd.pc.in +4 -3
  95. data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
  96. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +201 -31
  97. data/ext/zstdruby/libzstd/zstd.h +760 -234
  98. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +3 -1
  99. data/ext/zstdruby/zstdruby.c +2 -2
  100. data/lib/zstd-ruby/version.rb +1 -1
  101. metadata +20 -9
  102. data/.travis.yml +0 -14
@@ -0,0 +1,185 @@
1
+ # ################################################################
2
+ # Copyright (c) Yann Collet, Facebook, Inc.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under both the BSD-style license (found in the
6
+ # LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ # in the COPYING file in the root directory of this source tree).
8
+ # You may select, at your option, one of the above-listed licenses.
9
+ # ################################################################
10
+
11
+ ##################################################################
12
+ # Input Variables
13
+ ##################################################################
14
+
15
+ # Zstd lib directory
16
+ LIBZSTD ?= ./
17
+
18
+ # Legacy support
19
+ ZSTD_LEGACY_SUPPORT ?= 5
20
+ ZSTD_LEGACY_MULTITHREADED_API ?= 0
21
+
22
+ # Build size optimizations
23
+ HUF_FORCE_DECOMPRESS_X1 ?= 0
24
+ HUF_FORCE_DECOMPRESS_X2 ?= 0
25
+ ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT ?= 0
26
+ ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG ?= 0
27
+ ZSTD_NO_INLINE ?= 0
28
+ ZSTD_STRIP_ERROR_STRINGS ?= 0
29
+
30
+ # Assembly support
31
+ ZSTD_NO_ASM ?= 0
32
+
33
+ ##################################################################
34
+ # libzstd helpers
35
+ ##################################################################
36
+
37
+ # Make 4.3 doesn't support '\#' anymore (https://lwn.net/Articles/810071/)
38
+ NUM_SYMBOL := \#
39
+
40
+ # define silent mode as default (verbose mode with V=1 or VERBOSE=1)
41
+ $(V)$(VERBOSE).SILENT:
42
+
43
+ # When cross-compiling from linux to windows,
44
+ # one might need to specify TARGET_SYSTEM as "Windows."
45
+ # Building from Fedora fails without it.
46
+ # (but Ubuntu and Debian don't need to set anything)
47
+ TARGET_SYSTEM ?= $(OS)
48
+
49
+ # Version numbers
50
+ LIBVER_SRC := $(LIBZSTD)/zstd.h
51
+ LIBVER_MAJOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MAJOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
52
+ LIBVER_MINOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MINOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
53
+ LIBVER_PATCH_SCRIPT:=`sed -n '/define ZSTD_VERSION_RELEASE/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
54
+ LIBVER_SCRIPT:= $(LIBVER_MAJOR_SCRIPT).$(LIBVER_MINOR_SCRIPT).$(LIBVER_PATCH_SCRIPT)
55
+ LIBVER_MAJOR := $(shell echo $(LIBVER_MAJOR_SCRIPT))
56
+ LIBVER_MINOR := $(shell echo $(LIBVER_MINOR_SCRIPT))
57
+ LIBVER_PATCH := $(shell echo $(LIBVER_PATCH_SCRIPT))
58
+ LIBVER := $(shell echo $(LIBVER_SCRIPT))
59
+ CCVER := $(shell $(CC) --version)
60
+ ZSTD_VERSION?= $(LIBVER)
61
+
62
+ # ZSTD_LIB_MINIFY is a helper variable that
63
+ # configures a bunch of other variables to space-optimized defaults.
64
+ ZSTD_LIB_MINIFY ?= 0
65
+ ifneq ($(ZSTD_LIB_MINIFY), 0)
66
+ HAVE_CC_OZ ?= $(shell echo "" | $(CC) -Oz -x c -c - -o /dev/null 2> /dev/null && echo 1 || echo 0)
67
+ ZSTD_LEGACY_SUPPORT ?= 0
68
+ ZSTD_LIB_DEPRECATED ?= 0
69
+ HUF_FORCE_DECOMPRESS_X1 ?= 1
70
+ ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT ?= 1
71
+ ZSTD_NO_INLINE ?= 1
72
+ ZSTD_STRIP_ERROR_STRINGS ?= 1
73
+ ifneq ($(HAVE_CC_OZ), 0)
74
+ # Some compilers (clang) support an even more space-optimized setting.
75
+ CFLAGS += -Oz
76
+ else
77
+ CFLAGS += -Os
78
+ endif
79
+ CFLAGS += -fno-stack-protector -fomit-frame-pointer -fno-ident \
80
+ -DDYNAMIC_BMI2=0 -DNDEBUG
81
+ else
82
+ CFLAGS += -O3
83
+ endif
84
+
85
+ DEBUGLEVEL ?= 0
86
+ CPPFLAGS += -DXXH_NAMESPACE=ZSTD_ -DDEBUGLEVEL=$(DEBUGLEVEL)
87
+ ifeq ($(TARGET_SYSTEM),Windows_NT) # MinGW assumed
88
+ CPPFLAGS += -D__USE_MINGW_ANSI_STDIO # compatibility with %zu formatting
89
+ endif
90
+ DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
91
+ -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
92
+ -Wstrict-prototypes -Wundef -Wpointer-arith \
93
+ -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
94
+ -Wredundant-decls -Wmissing-prototypes -Wc++-compat
95
+ CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS)
96
+ LDFLAGS += $(MOREFLAGS)
97
+ FLAGS = $(CPPFLAGS) $(CFLAGS) $(LDFLAGS)
98
+
99
+ HAVE_COLORNEVER = $(shell echo a | grep --color=never a > /dev/null 2> /dev/null && echo 1 || echo 0)
100
+ GREP_OPTIONS ?=
101
+ ifeq ($HAVE_COLORNEVER, 1)
102
+ GREP_OPTIONS += --color=never
103
+ endif
104
+ GREP = grep $(GREP_OPTIONS)
105
+ SED_ERE_OPT ?= -E
106
+
107
+ ZSTD_COMMON_FILES := $(sort $(wildcard $(LIBZSTD)/common/*.c))
108
+ ZSTD_COMPRESS_FILES := $(sort $(wildcard $(LIBZSTD)/compress/*.c))
109
+ ZSTD_DECOMPRESS_FILES := $(sort $(wildcard $(LIBZSTD)/decompress/*.c))
110
+ ZSTD_DICTBUILDER_FILES := $(sort $(wildcard $(LIBZSTD)/dictBuilder/*.c))
111
+ ZSTD_DEPRECATED_FILES := $(sort $(wildcard $(LIBZSTD)/deprecated/*.c))
112
+ ZSTD_LEGACY_FILES :=
113
+
114
+ ZSTD_DECOMPRESS_AMD64_ASM_FILES := $(sort $(wildcard $(LIBZSTD)/decompress/*_amd64.S))
115
+
116
+ ifneq ($(ZSTD_NO_ASM), 0)
117
+ CPPFLAGS += -DZSTD_DISABLE_ASM
118
+ else
119
+ # Unconditionally add the ASM files; they are disabled by
120
+ # macros in the .S file.
121
+ ZSTD_DECOMPRESS_FILES += $(ZSTD_DECOMPRESS_AMD64_ASM_FILES)
122
+ endif
123
+
124
+ ifneq ($(HUF_FORCE_DECOMPRESS_X1), 0)
125
+ CFLAGS += -DHUF_FORCE_DECOMPRESS_X1
126
+ endif
127
+
128
+ ifneq ($(HUF_FORCE_DECOMPRESS_X2), 0)
129
+ CFLAGS += -DHUF_FORCE_DECOMPRESS_X2
130
+ endif
131
+
132
+ ifneq ($(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT), 0)
133
+ CFLAGS += -DZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
134
+ endif
135
+
136
+ ifneq ($(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG), 0)
137
+ CFLAGS += -DZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
138
+ endif
139
+
140
+ ifneq ($(ZSTD_NO_INLINE), 0)
141
+ CFLAGS += -DZSTD_NO_INLINE
142
+ endif
143
+
144
+ ifneq ($(ZSTD_STRIP_ERROR_STRINGS), 0)
145
+ CFLAGS += -DZSTD_STRIP_ERROR_STRINGS
146
+ endif
147
+
148
+ ifneq ($(ZSTD_LEGACY_MULTITHREADED_API), 0)
149
+ CFLAGS += -DZSTD_LEGACY_MULTITHREADED_API
150
+ endif
151
+
152
+ ifneq ($(ZSTD_LEGACY_SUPPORT), 0)
153
+ ifeq ($(shell test $(ZSTD_LEGACY_SUPPORT) -lt 8; echo $$?), 0)
154
+ ZSTD_LEGACY_FILES += $(shell ls $(LIBZSTD)/legacy/*.c | $(GREP) 'v0[$(ZSTD_LEGACY_SUPPORT)-7]')
155
+ endif
156
+ endif
157
+ CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
158
+
159
+ UNAME := $(shell uname)
160
+
161
+ ifndef BUILD_DIR
162
+ ifeq ($(UNAME), Darwin)
163
+ ifeq ($(shell md5 < /dev/null > /dev/null; echo $$?), 0)
164
+ HASH ?= md5
165
+ endif
166
+ else ifeq ($(UNAME), FreeBSD)
167
+ HASH ?= gmd5sum
168
+ else ifeq ($(UNAME), NetBSD)
169
+ HASH ?= md5 -n
170
+ else ifeq ($(UNAME), OpenBSD)
171
+ HASH ?= md5
172
+ endif
173
+ HASH ?= md5sum
174
+
175
+ HASH_DIR = conf_$(shell echo $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $(ZSTD_FILES) | $(HASH) | cut -f 1 -d " " )
176
+ HAVE_HASH :=$(shell echo 1 | $(HASH) > /dev/null && echo 1 || echo 0)
177
+ ifeq ($(HAVE_HASH),0)
178
+ $(info warning : could not find HASH ($(HASH)), needed to differentiate builds using different flags)
179
+ BUILD_DIR := obj/generic_noconf
180
+ endif
181
+ endif # BUILD_DIR
182
+
183
+ ZSTD_SUBDIR := $(LIBZSTD)/common $(LIBZSTD)/compress $(LIBZSTD)/decompress $(LIBZSTD)/dictBuilder $(LIBZSTD)/legacy $(LIBZSTD)/deprecated
184
+ vpath %.c $(ZSTD_SUBDIR)
185
+ vpath %.S $(ZSTD_SUBDIR)
@@ -3,13 +3,14 @@
3
3
  # BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
4
4
 
5
5
  prefix=@PREFIX@
6
- exec_prefix=${prefix}
7
- includedir=${prefix}/include
8
- libdir=${exec_prefix}/lib
6
+ exec_prefix=@EXEC_PREFIX@
7
+ includedir=@INCLUDEDIR@
8
+ libdir=@LIBDIR@
9
9
 
10
10
  Name: zstd
11
11
  Description: fast lossless compression algorithm library
12
12
  URL: http://www.zstd.net/
13
13
  Version: @VERSION@
14
14
  Libs: -L${libdir} -lzstd
15
+ Libs.private: @LIBS_PRIVATE@
15
16
  Cflags: -I${includedir}
@@ -0,0 +1,4 @@
1
+ module libzstd [extern_c] {
2
+ header "../zstd.h"
3
+ export *
4
+ }
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -36,6 +36,145 @@ extern "C" {
36
36
  # define ZDICTLIB_API ZDICTLIB_VISIBILITY
37
37
  #endif
38
38
 
39
+ /*******************************************************************************
40
+ * Zstd dictionary builder
41
+ *
42
+ * FAQ
43
+ * ===
44
+ * Why should I use a dictionary?
45
+ * ------------------------------
46
+ *
47
+ * Zstd can use dictionaries to improve compression ratio of small data.
48
+ * Traditionally small files don't compress well because there is very little
49
+ * repetition in a single sample, since it is small. But, if you are compressing
50
+ * many similar files, like a bunch of JSON records that share the same
51
+ * structure, you can train a dictionary ahead of time on some samples of
52
+ * these files. Then, zstd can use the dictionary to find repetitions that are
53
+ * present across samples. This can vastly improve compression ratio.
54
+ *
55
+ * When is a dictionary useful?
56
+ * ----------------------------
57
+ *
58
+ * Dictionaries are useful when compressing many small files that are similar.
59
+ * The larger a file is, the less benefit a dictionary will have. Generally,
60
+ * we don't expect dictionary compression to be effective past 100KB. And the
61
+ * smaller a file is, the more we would expect the dictionary to help.
62
+ *
63
+ * How do I use a dictionary?
64
+ * --------------------------
65
+ *
66
+ * Simply pass the dictionary to the zstd compressor with
67
+ * `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to
68
+ * the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other
69
+ * more advanced functions that allow selecting some options, see zstd.h for
70
+ * complete documentation.
71
+ *
72
+ * What is a zstd dictionary?
73
+ * --------------------------
74
+ *
75
+ * A zstd dictionary has two pieces: Its header, and its content. The header
76
+ * contains a magic number, the dictionary ID, and entropy tables. These
77
+ * entropy tables allow zstd to save on header costs in the compressed file,
78
+ * which really matters for small data. The content is just bytes, which are
79
+ * repeated content that is common across many samples.
80
+ *
81
+ * What is a raw content dictionary?
82
+ * ---------------------------------
83
+ *
84
+ * A raw content dictionary is just bytes. It doesn't have a zstd dictionary
85
+ * header, a dictionary ID, or entropy tables. Any buffer is a valid raw
86
+ * content dictionary.
87
+ *
88
+ * How do I train a dictionary?
89
+ * ----------------------------
90
+ *
91
+ * Gather samples from your use case. These samples should be similar to each
92
+ * other. If you have several use cases, you could try to train one dictionary
93
+ * per use case.
94
+ *
95
+ * Pass those samples to `ZDICT_trainFromBuffer()` and that will train your
96
+ * dictionary. There are a few advanced versions of this function, but this
97
+ * is a great starting point. If you want to further tune your dictionary
98
+ * you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow
99
+ * you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`.
100
+ *
101
+ * If the dictionary training function fails, that is likely because you
102
+ * either passed too few samples, or a dictionary would not be effective
103
+ * for your data. Look at the messages that the dictionary trainer printed;
104
+ * if it doesn't say too few samples, then a dictionary would not be effective.
105
+ *
106
+ * How large should my dictionary be?
107
+ * ----------------------------------
108
+ *
109
+ * A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB.
110
+ * The zstd CLI defaults to a 110KB dictionary. You likely don't need a
111
+ * dictionary larger than that. But, most use cases can get away with a
112
+ * smaller dictionary. The advanced dictionary builders can automatically
113
+ * shrink the dictionary for you, and select the smallest size that
114
+ * doesn't hurt compression ratio too much. See the `shrinkDict` parameter.
115
+ * A smaller dictionary can save memory, and potentially speed up
116
+ * compression.
117
+ *
118
+ * How many samples should I provide to the dictionary builder?
119
+ * ------------------------------------------------------------
120
+ *
121
+ * We generally recommend passing ~100x the size of the dictionary
122
+ * in samples. A few thousand should suffice. Having too few samples
123
+ * can hurt the dictionary's effectiveness. Having more samples will
124
+ * only improve the dictionary's effectiveness. But having too many
125
+ * samples can slow down the dictionary builder.
126
+ *
127
+ * How do I determine if a dictionary will be effective?
128
+ * -----------------------------------------------------
129
+ *
130
+ * Simply train a dictionary and try it out. You can use zstd's built-in
131
+ * benchmarking tool to test the dictionary effectiveness.
132
+ *
133
+ * # Benchmark levels 1-3 without a dictionary
134
+ * zstd -b1e3 -r /path/to/my/files
135
+ * # Benchmark levels 1-3 with a dictionary
136
+ * zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary
137
+ *
138
+ * When should I retrain a dictionary?
139
+ * -----------------------------------
140
+ *
141
+ * You should retrain a dictionary when its effectiveness drops. Dictionary
142
+ * effectiveness drops as the data you are compressing changes. Generally, we do
143
+ * expect dictionaries to "decay" over time, as your data changes, but the rate
144
+ * at which they decay depends on your use case. Internally, we regularly
145
+ * retrain dictionaries, and if the new dictionary performs significantly
146
+ * better than the old dictionary, we will ship the new dictionary.
147
+ *
148
+ * I have a raw content dictionary, how do I turn it into a zstd dictionary?
149
+ * -------------------------------------------------------------------------
150
+ *
151
+ * If you have a raw content dictionary, e.g. by manually constructing it, or
152
+ * using a third-party dictionary builder, you can turn it into a zstd
153
+ * dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to
154
+ * provide some samples of the data. It will add the zstd header to the
155
+ * raw content, which contains a dictionary ID and entropy tables, which
156
+ * will improve compression ratio, and allow zstd to write the dictionary ID
157
+ * into the frame, if you so choose.
158
+ *
159
+ * Do I have to use zstd's dictionary builder?
160
+ * -------------------------------------------
161
+ *
162
+ * No! You can construct dictionary content however you please; it is just
163
+ * bytes. It will always be valid as a raw content dictionary. If you want
164
+ * a zstd dictionary, which can improve compression ratio, use
165
+ * `ZDICT_finalizeDictionary()`.
166
+ *
167
+ * What is the attack surface of a zstd dictionary?
168
+ * ------------------------------------------------
169
+ *
170
+ * Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so
171
+ * zstd should never crash, or access out-of-bounds memory no matter what
172
+ * the dictionary is. However, if an attacker can control the dictionary
173
+ * during decompression, they can cause zstd to generate arbitrary bytes,
174
+ * just like if they controlled the compressed data.
175
+ *
176
+ ******************************************************************************/
177
+
39
178
 
40
179
  /*! ZDICT_trainFromBuffer():
41
180
  * Train a dictionary from an array of samples.
@@ -61,9 +200,63 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCap
61
200
  const void* samplesBuffer,
62
201
  const size_t* samplesSizes, unsigned nbSamples);
63
202
 
203
+ typedef struct {
204
+ int compressionLevel; /*< optimize for a specific zstd compression level; 0 means default */
205
+ unsigned notificationLevel; /*< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
206
+ unsigned dictID; /*< force dictID value; 0 means auto mode (32-bits random value)
207
+ * NOTE: The zstd format reserves some dictionary IDs for future use.
208
+ * You may use them in private settings, but be warned that they
209
+ * may be used by zstd in a public dictionary registry in the future.
210
+ * These dictionary IDs are:
211
+ * - low range : <= 32767
212
+ * - high range : >= (2^31)
213
+ */
214
+ } ZDICT_params_t;
215
+
216
+ /*! ZDICT_finalizeDictionary():
217
+ * Given a custom content as a basis for dictionary, and a set of samples,
218
+ * finalize dictionary by adding headers and statistics according to the zstd
219
+ * dictionary format.
220
+ *
221
+ * Samples must be stored concatenated in a flat buffer `samplesBuffer`,
222
+ * supplied with an array of sizes `samplesSizes`, providing the size of each
223
+ * sample in order. The samples are used to construct the statistics, so they
224
+ * should be representative of what you will compress with this dictionary.
225
+ *
226
+ * The compression level can be set in `parameters`. You should pass the
227
+ * compression level you expect to use in production. The statistics for each
228
+ * compression level differ, so tuning the dictionary for the compression level
229
+ * can help quite a bit.
230
+ *
231
+ * You can set an explicit dictionary ID in `parameters`, or allow us to pick
232
+ * a random dictionary ID for you, but we can't guarantee no collisions.
233
+ *
234
+ * The dstDictBuffer and the dictContent may overlap, and the content will be
235
+ * appended to the end of the header. If the header + the content doesn't fit in
236
+ * maxDictSize the beginning of the content is truncated to make room, since it
237
+ * is presumed that the most profitable content is at the end of the dictionary,
238
+ * since that is the cheapest to reference.
239
+ *
240
+ * `maxDictSize` must be >= max(dictContentSize, ZSTD_DICTSIZE_MIN).
241
+ *
242
+ * @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`),
243
+ * or an error code, which can be tested by ZDICT_isError().
244
+ * Note: ZDICT_finalizeDictionary() will push notifications into stderr if
245
+ * instructed to, using notificationLevel>0.
246
+ * NOTE: This function currently may fail in several edge cases including:
247
+ * * Not enough samples
248
+ * * Samples are uncompressible
249
+ * * Samples are all exactly the same
250
+ */
251
+ ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize,
252
+ const void* dictContent, size_t dictContentSize,
253
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
254
+ ZDICT_params_t parameters);
255
+
64
256
 
65
257
  /*====== Helper functions ======*/
66
258
  ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID; @return zero if error (not a valid dictionary) */
259
+ ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /* returns dict header size; returns a ZSTD error code on failure */
67
260
  ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode);
68
261
  ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
69
262
 
@@ -78,11 +271,9 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
78
271
  * Use them only in association with static linking.
79
272
  * ==================================================================================== */
80
273
 
81
- typedef struct {
82
- int compressionLevel; /* optimize for a specific zstd compression level; 0 means default */
83
- unsigned notificationLevel; /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
84
- unsigned dictID; /* force dictID value; 0 means auto mode (32-bits random value) */
85
- } ZDICT_params_t;
274
+ #define ZDICT_DICTSIZE_MIN 256
275
+ /* Deprecated: Remove in v1.6.0 */
276
+ #define ZDICT_CONTENTSIZE_MIN 128
86
277
 
87
278
  /*! ZDICT_cover_params_t:
88
279
  * k and d are the only required parameters.
@@ -198,28 +389,6 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
198
389
  const size_t* samplesSizes, unsigned nbSamples,
199
390
  ZDICT_fastCover_params_t* parameters);
200
391
 
201
- /*! ZDICT_finalizeDictionary():
202
- * Given a custom content as a basis for dictionary, and a set of samples,
203
- * finalize dictionary by adding headers and statistics.
204
- *
205
- * Samples must be stored concatenated in a flat buffer `samplesBuffer`,
206
- * supplied with an array of sizes `samplesSizes`, providing the size of each sample in order.
207
- *
208
- * dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes.
209
- * maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
210
- *
211
- * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
212
- * or an error code, which can be tested by ZDICT_isError().
213
- * Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
214
- * Note 2: dictBuffer and dictContent can overlap
215
- */
216
- #define ZDICT_CONTENTSIZE_MIN 128
217
- #define ZDICT_DICTSIZE_MIN 256
218
- ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
219
- const void* dictContent, size_t dictContentSize,
220
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
221
- ZDICT_params_t parameters);
222
-
223
392
  typedef struct {
224
393
  unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */
225
394
  ZDICT_params_t zParams;
@@ -241,10 +410,11 @@ typedef struct {
241
410
  * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
242
411
  */
243
412
  ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
244
- void *dictBuffer, size_t dictBufferCapacity,
245
- const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
413
+ void* dictBuffer, size_t dictBufferCapacity,
414
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
246
415
  ZDICT_legacy_params_t parameters);
247
416
 
417
+
248
418
  /* Deprecation warnings */
249
419
  /* It is generally possible to disable deprecation warnings from compiler,
250
420
  for example with -Wno-deprecated-declarations for gcc
@@ -256,7 +426,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
256
426
  # define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
257
427
  # if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
258
428
  # define ZDICT_DEPRECATED(message) [[deprecated(message)]] ZDICTLIB_API
259
- # elif (ZDICT_GCC_VERSION >= 405) || defined(__clang__)
429
+ # elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405)
260
430
  # define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message)))
261
431
  # elif (ZDICT_GCC_VERSION >= 301)
262
432
  # define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated))