zstd-ruby 1.4.4.0 → 1.5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/.github/dependabot.yml +8 -0
  3. data/.github/workflows/ruby.yml +35 -0
  4. data/README.md +2 -2
  5. data/ext/zstdruby/extconf.rb +1 -0
  6. data/ext/zstdruby/libzstd/BUCK +5 -7
  7. data/ext/zstdruby/libzstd/Makefile +241 -173
  8. data/ext/zstdruby/libzstd/README.md +76 -18
  9. data/ext/zstdruby/libzstd/common/bitstream.h +75 -57
  10. data/ext/zstdruby/libzstd/common/compiler.h +196 -20
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  12. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  13. data/ext/zstdruby/libzstd/common/debug.h +22 -49
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +208 -76
  15. data/ext/zstdruby/libzstd/common/error_private.c +3 -1
  16. data/ext/zstdruby/libzstd/common/error_private.h +87 -4
  17. data/ext/zstdruby/libzstd/common/fse.h +51 -42
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +149 -57
  19. data/ext/zstdruby/libzstd/common/huf.h +60 -54
  20. data/ext/zstdruby/libzstd/common/mem.h +87 -98
  21. data/ext/zstdruby/libzstd/common/pool.c +23 -17
  22. data/ext/zstdruby/libzstd/common/pool.h +3 -3
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
  24. data/ext/zstdruby/libzstd/common/threading.c +10 -8
  25. data/ext/zstdruby/libzstd/common/threading.h +4 -3
  26. data/ext/zstdruby/libzstd/common/xxhash.c +15 -873
  27. data/ext/zstdruby/libzstd/common/xxhash.h +5572 -191
  28. data/ext/zstdruby/libzstd/common/zstd_common.c +10 -10
  29. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  30. data/ext/zstdruby/libzstd/common/zstd_internal.h +252 -108
  31. data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
  32. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  33. data/ext/zstdruby/libzstd/compress/fse_compress.c +105 -85
  34. data/ext/zstdruby/libzstd/compress/hist.c +41 -63
  35. data/ext/zstdruby/libzstd/compress/hist.h +13 -33
  36. data/ext/zstdruby/libzstd/compress/huf_compress.c +831 -259
  37. data/ext/zstdruby/libzstd/compress/zstd_compress.c +3213 -1007
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +493 -71
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +21 -16
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +4 -2
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +51 -24
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +10 -3
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +573 -0
  44. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  45. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +208 -81
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +315 -137
  47. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.c +319 -128
  49. data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1156 -171
  51. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +59 -1
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +331 -206
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
  54. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.c +403 -226
  56. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +188 -453
  58. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +32 -114
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1065 -410
  60. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +20 -16
  62. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +691 -230
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1072 -323
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +16 -7
  66. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +71 -10
  67. data/ext/zstdruby/libzstd/deprecated/zbuff.h +3 -3
  68. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
  69. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +24 -4
  70. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  71. data/ext/zstdruby/libzstd/dictBuilder/cover.c +57 -40
  72. data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
  73. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  74. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +54 -35
  75. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +151 -57
  76. data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
  77. data/ext/zstdruby/libzstd/dll/example/README.md +16 -22
  78. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +4 -4
  79. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +25 -19
  80. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +1 -1
  81. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +18 -14
  82. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +1 -1
  83. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +18 -14
  84. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +1 -1
  85. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +22 -16
  86. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +1 -1
  87. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +29 -25
  88. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +2 -2
  89. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +29 -25
  90. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +1 -1
  91. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +34 -26
  92. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +1 -1
  93. data/ext/zstdruby/libzstd/libzstd.mk +185 -0
  94. data/ext/zstdruby/libzstd/libzstd.pc.in +4 -3
  95. data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
  96. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +201 -31
  97. data/ext/zstdruby/libzstd/zstd.h +760 -234
  98. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +3 -1
  99. data/ext/zstdruby/zstdruby.c +2 -2
  100. data/lib/zstd-ruby/version.rb +1 -1
  101. metadata +20 -9
  102. data/.travis.yml +0 -14
@@ -0,0 +1,185 @@
1
+ # ################################################################
2
+ # Copyright (c) Yann Collet, Facebook, Inc.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under both the BSD-style license (found in the
6
+ # LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ # in the COPYING file in the root directory of this source tree).
8
+ # You may select, at your option, one of the above-listed licenses.
9
+ # ################################################################
10
+
11
+ ##################################################################
12
+ # Input Variables
13
+ ##################################################################
14
+
15
+ # Zstd lib directory
16
+ LIBZSTD ?= ./
17
+
18
+ # Legacy support
19
+ ZSTD_LEGACY_SUPPORT ?= 5
20
+ ZSTD_LEGACY_MULTITHREADED_API ?= 0
21
+
22
+ # Build size optimizations
23
+ HUF_FORCE_DECOMPRESS_X1 ?= 0
24
+ HUF_FORCE_DECOMPRESS_X2 ?= 0
25
+ ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT ?= 0
26
+ ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG ?= 0
27
+ ZSTD_NO_INLINE ?= 0
28
+ ZSTD_STRIP_ERROR_STRINGS ?= 0
29
+
30
+ # Assembly support
31
+ ZSTD_NO_ASM ?= 0
32
+
33
+ ##################################################################
34
+ # libzstd helpers
35
+ ##################################################################
36
+
37
+ # Make 4.3 doesn't support '\#' anymore (https://lwn.net/Articles/810071/)
38
+ NUM_SYMBOL := \#
39
+
40
+ # define silent mode as default (verbose mode with V=1 or VERBOSE=1)
41
+ $(V)$(VERBOSE).SILENT:
42
+
43
+ # When cross-compiling from linux to windows,
44
+ # one might need to specify TARGET_SYSTEM as "Windows_NT".
45
+ # Building from Fedora fails without it.
46
+ # (but Ubuntu and Debian don't need to set anything)
47
+ TARGET_SYSTEM ?= $(OS)
48
+
49
+ # Version numbers
50
+ LIBVER_SRC := $(LIBZSTD)/zstd.h
51
+ LIBVER_MAJOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MAJOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
52
+ LIBVER_MINOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MINOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
53
+ LIBVER_PATCH_SCRIPT:=`sed -n '/define ZSTD_VERSION_RELEASE/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
54
+ LIBVER_SCRIPT:= $(LIBVER_MAJOR_SCRIPT).$(LIBVER_MINOR_SCRIPT).$(LIBVER_PATCH_SCRIPT)
55
+ LIBVER_MAJOR := $(shell echo $(LIBVER_MAJOR_SCRIPT))
56
+ LIBVER_MINOR := $(shell echo $(LIBVER_MINOR_SCRIPT))
57
+ LIBVER_PATCH := $(shell echo $(LIBVER_PATCH_SCRIPT))
58
+ LIBVER := $(shell echo $(LIBVER_SCRIPT))
59
+ CCVER := $(shell $(CC) --version)
60
+ ZSTD_VERSION?= $(LIBVER)
61
+
62
+ # ZSTD_LIB_MINIFY is a helper variable that
63
+ # configures a bunch of other variables to space-optimized defaults.
64
+ ZSTD_LIB_MINIFY ?= 0
65
+ ifneq ($(ZSTD_LIB_MINIFY), 0)
66
+ HAVE_CC_OZ ?= $(shell echo "" | $(CC) -Oz -x c -c - -o /dev/null 2> /dev/null && echo 1 || echo 0)
67
+ ZSTD_LEGACY_SUPPORT ?= 0
68
+ ZSTD_LIB_DEPRECATED ?= 0
69
+ HUF_FORCE_DECOMPRESS_X1 ?= 1
70
+ ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT ?= 1
71
+ ZSTD_NO_INLINE ?= 1
72
+ ZSTD_STRIP_ERROR_STRINGS ?= 1
73
+ ifneq ($(HAVE_CC_OZ), 0)
74
+ # Some compilers (clang) support an even more space-optimized setting.
75
+ CFLAGS += -Oz
76
+ else
77
+ CFLAGS += -Os
78
+ endif
79
+ CFLAGS += -fno-stack-protector -fomit-frame-pointer -fno-ident \
80
+ -DDYNAMIC_BMI2=0 -DNDEBUG
81
+ else
82
+ CFLAGS += -O3
83
+ endif
84
+
85
+ DEBUGLEVEL ?= 0
86
+ CPPFLAGS += -DXXH_NAMESPACE=ZSTD_ -DDEBUGLEVEL=$(DEBUGLEVEL)
87
+ ifeq ($(TARGET_SYSTEM),Windows_NT) # MinGW assumed
88
+ CPPFLAGS += -D__USE_MINGW_ANSI_STDIO # compatibility with %zu formatting
89
+ endif
90
+ DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
91
+ -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
92
+ -Wstrict-prototypes -Wundef -Wpointer-arith \
93
+ -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
94
+ -Wredundant-decls -Wmissing-prototypes -Wc++-compat
95
+ CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS)
96
+ LDFLAGS += $(MOREFLAGS)
97
+ FLAGS = $(CPPFLAGS) $(CFLAGS) $(LDFLAGS)
98
+
99
+ HAVE_COLORNEVER = $(shell echo a | grep --color=never a > /dev/null 2> /dev/null && echo 1 || echo 0)
100
+ GREP_OPTIONS ?=
101
+ ifeq ($(HAVE_COLORNEVER), 1) # grep accepts --color=never (probed above)
102
+ GREP_OPTIONS += --color=never
103
+ endif
104
+ GREP = grep $(GREP_OPTIONS)
105
+ SED_ERE_OPT ?= -E
106
+
107
+ ZSTD_COMMON_FILES := $(sort $(wildcard $(LIBZSTD)/common/*.c))
108
+ ZSTD_COMPRESS_FILES := $(sort $(wildcard $(LIBZSTD)/compress/*.c))
109
+ ZSTD_DECOMPRESS_FILES := $(sort $(wildcard $(LIBZSTD)/decompress/*.c))
110
+ ZSTD_DICTBUILDER_FILES := $(sort $(wildcard $(LIBZSTD)/dictBuilder/*.c))
111
+ ZSTD_DEPRECATED_FILES := $(sort $(wildcard $(LIBZSTD)/deprecated/*.c))
112
+ ZSTD_LEGACY_FILES :=
113
+
114
+ ZSTD_DECOMPRESS_AMD64_ASM_FILES := $(sort $(wildcard $(LIBZSTD)/decompress/*_amd64.S))
115
+
116
+ ifneq ($(ZSTD_NO_ASM), 0)
117
+ CPPFLAGS += -DZSTD_DISABLE_ASM
118
+ else
119
+ # Unconditionally add the ASM files; they are disabled by
120
+ # macros in the .S file.
121
+ ZSTD_DECOMPRESS_FILES += $(ZSTD_DECOMPRESS_AMD64_ASM_FILES)
122
+ endif
123
+
124
+ ifneq ($(HUF_FORCE_DECOMPRESS_X1), 0)
125
+ CFLAGS += -DHUF_FORCE_DECOMPRESS_X1
126
+ endif
127
+
128
+ ifneq ($(HUF_FORCE_DECOMPRESS_X2), 0)
129
+ CFLAGS += -DHUF_FORCE_DECOMPRESS_X2
130
+ endif
131
+
132
+ ifneq ($(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT), 0)
133
+ CFLAGS += -DZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
134
+ endif
135
+
136
+ ifneq ($(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG), 0)
137
+ CFLAGS += -DZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
138
+ endif
139
+
140
+ ifneq ($(ZSTD_NO_INLINE), 0)
141
+ CFLAGS += -DZSTD_NO_INLINE
142
+ endif
143
+
144
+ ifneq ($(ZSTD_STRIP_ERROR_STRINGS), 0)
145
+ CFLAGS += -DZSTD_STRIP_ERROR_STRINGS
146
+ endif
147
+
148
+ ifneq ($(ZSTD_LEGACY_MULTITHREADED_API), 0)
149
+ CFLAGS += -DZSTD_LEGACY_MULTITHREADED_API
150
+ endif
151
+
152
+ ifneq ($(ZSTD_LEGACY_SUPPORT), 0)
153
+ ifeq ($(shell test $(ZSTD_LEGACY_SUPPORT) -lt 8; echo $$?), 0)
154
+ ZSTD_LEGACY_FILES += $(shell ls $(LIBZSTD)/legacy/*.c | $(GREP) 'v0[$(ZSTD_LEGACY_SUPPORT)-7]')
155
+ endif
156
+ endif
157
+ CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
158
+
159
+ UNAME := $(shell uname)
160
+
161
+ ifndef BUILD_DIR
162
+ ifeq ($(UNAME), Darwin)
163
+ ifeq ($(shell md5 < /dev/null > /dev/null; echo $$?), 0)
164
+ HASH ?= md5
165
+ endif
166
+ else ifeq ($(UNAME), FreeBSD)
167
+ HASH ?= gmd5sum
168
+ else ifeq ($(UNAME), NetBSD)
169
+ HASH ?= md5 -n
170
+ else ifeq ($(UNAME), OpenBSD)
171
+ HASH ?= md5
172
+ endif
173
+ HASH ?= md5sum
174
+
175
+ HASH_DIR = conf_$(shell echo $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $(ZSTD_FILES) | $(HASH) | cut -f 1 -d " " )
176
+ HAVE_HASH :=$(shell echo 1 | $(HASH) > /dev/null && echo 1 || echo 0)
177
+ ifeq ($(HAVE_HASH),0)
178
+ $(info warning : could not find HASH ($(HASH)), needed to differentiate builds using different flags)
179
+ BUILD_DIR := obj/generic_noconf
180
+ endif
181
+ endif # BUILD_DIR
182
+
183
+ ZSTD_SUBDIR := $(LIBZSTD)/common $(LIBZSTD)/compress $(LIBZSTD)/decompress $(LIBZSTD)/dictBuilder $(LIBZSTD)/legacy $(LIBZSTD)/deprecated
184
+ vpath %.c $(ZSTD_SUBDIR)
185
+ vpath %.S $(ZSTD_SUBDIR)
@@ -3,13 +3,14 @@
3
3
  # BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
4
4
 
5
5
  prefix=@PREFIX@
6
- exec_prefix=${prefix}
7
- includedir=${prefix}/include
8
- libdir=${exec_prefix}/lib
6
+ exec_prefix=@EXEC_PREFIX@
7
+ includedir=@INCLUDEDIR@
8
+ libdir=@LIBDIR@
9
9
 
10
10
  Name: zstd
11
11
  Description: fast lossless compression algorithm library
12
12
  URL: http://www.zstd.net/
13
13
  Version: @VERSION@
14
14
  Libs: -L${libdir} -lzstd
15
+ Libs.private: @LIBS_PRIVATE@
15
16
  Cflags: -I${includedir}
@@ -0,0 +1,4 @@
1
+ module libzstd [extern_c] {
2
+ header "../zstd.h"
3
+ export *
4
+ }
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -36,6 +36,145 @@ extern "C" {
36
36
  # define ZDICTLIB_API ZDICTLIB_VISIBILITY
37
37
  #endif
38
38
 
39
+ /*******************************************************************************
40
+ * Zstd dictionary builder
41
+ *
42
+ * FAQ
43
+ * ===
44
+ * Why should I use a dictionary?
45
+ * ------------------------------
46
+ *
47
+ * Zstd can use dictionaries to improve compression ratio of small data.
48
+ * Traditionally small files don't compress well because there is very little
49
+ * repetition in a single sample, since it is small. But, if you are compressing
50
+ * many similar files, like a bunch of JSON records that share the same
51
+ * structure, you can train a dictionary ahead of time on some samples of
52
+ * these files. Then, zstd can use the dictionary to find repetitions that are
53
+ * present across samples. This can vastly improve compression ratio.
54
+ *
55
+ * When is a dictionary useful?
56
+ * ----------------------------
57
+ *
58
+ * Dictionaries are useful when compressing many small files that are similar.
59
+ * The larger a file is, the less benefit a dictionary will have. Generally,
60
+ * we don't expect dictionary compression to be effective past 100KB. And the
61
+ * smaller a file is, the more we would expect the dictionary to help.
62
+ *
63
+ * How do I use a dictionary?
64
+ * --------------------------
65
+ *
66
+ * Simply pass the dictionary to the zstd compressor with
67
+ * `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to
68
+ * the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other
69
+ * more advanced functions that allow selecting some options, see zstd.h for
70
+ * complete documentation.
71
+ *
72
+ * What is a zstd dictionary?
73
+ * --------------------------
74
+ *
75
+ * A zstd dictionary has two pieces: Its header, and its content. The header
76
+ * contains a magic number, the dictionary ID, and entropy tables. These
77
+ * entropy tables allow zstd to save on header costs in the compressed file,
78
+ * which really matters for small data. The content is just bytes, which are
79
+ * repeated content that is common across many samples.
80
+ *
81
+ * What is a raw content dictionary?
82
+ * ---------------------------------
83
+ *
84
+ * A raw content dictionary is just bytes. It doesn't have a zstd dictionary
85
+ * header, a dictionary ID, or entropy tables. Any buffer is a valid raw
86
+ * content dictionary.
87
+ *
88
+ * How do I train a dictionary?
89
+ * ----------------------------
90
+ *
91
+ * Gather samples from your use case. These samples should be similar to each
92
+ * other. If you have several use cases, you could try to train one dictionary
93
+ * per use case.
94
+ *
95
+ * Pass those samples to `ZDICT_trainFromBuffer()` and that will train your
96
+ * dictionary. There are a few advanced versions of this function, but this
97
+ * is a great starting point. If you want to further tune your dictionary
98
+ * you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow
99
+ * you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`.
100
+ *
101
+ * If the dictionary training function fails, that is likely because you
102
+ * either passed too few samples, or a dictionary would not be effective
103
+ * for your data. Look at the messages that the dictionary trainer printed;
104
+ * if it doesn't say too few samples, then a dictionary would not be effective.
105
+ *
106
+ * How large should my dictionary be?
107
+ * ----------------------------------
108
+ *
109
+ * A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB.
110
+ * The zstd CLI defaults to a 110KB dictionary. You likely don't need a
111
+ * dictionary larger than that. But, most use cases can get away with a
112
+ * smaller dictionary. The advanced dictionary builders can automatically
113
+ * shrink the dictionary for you, and select the smallest size that
114
+ * doesn't hurt compression ratio too much. See the `shrinkDict` parameter.
115
+ * A smaller dictionary can save memory, and potentially speed up
116
+ * compression.
117
+ *
118
+ * How many samples should I provide to the dictionary builder?
119
+ * ------------------------------------------------------------
120
+ *
121
+ * We generally recommend passing ~100x the size of the dictionary
122
+ * in samples. A few thousand should suffice. Having too few samples
123
+ * can hurt the dictionary's effectiveness. Having more samples will
124
+ * only improve the dictionary's effectiveness. But having too many
125
+ * samples can slow down the dictionary builder.
126
+ *
127
+ * How do I determine if a dictionary will be effective?
128
+ * -----------------------------------------------------
129
+ *
130
+ * Simply train a dictionary and try it out. You can use zstd's built-in
131
+ * benchmarking tool to test the dictionary effectiveness.
132
+ *
133
+ * # Benchmark levels 1-3 without a dictionary
134
+ * zstd -b1e3 -r /path/to/my/files
135
+ * # Benchmark levels 1-3 with a dictionary
136
+ * zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary
137
+ *
138
+ * When should I retrain a dictionary?
139
+ * -----------------------------------
140
+ *
141
+ * You should retrain a dictionary when its effectiveness drops. Dictionary
142
+ * effectiveness drops as the data you are compressing changes. Generally, we do
143
+ * expect dictionaries to "decay" over time, as your data changes, but the rate
144
+ * at which they decay depends on your use case. Internally, we regularly
145
+ * retrain dictionaries, and if the new dictionary performs significantly
146
+ * better than the old dictionary, we will ship the new dictionary.
147
+ *
148
+ * I have a raw content dictionary, how do I turn it into a zstd dictionary?
149
+ * -------------------------------------------------------------------------
150
+ *
151
+ * If you have a raw content dictionary, e.g. by manually constructing it, or
152
+ * using a third-party dictionary builder, you can turn it into a zstd
153
+ * dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to
154
+ * provide some samples of the data. It will add the zstd header to the
155
+ * raw content, which contains a dictionary ID and entropy tables, which
156
+ * will improve compression ratio, and allow zstd to write the dictionary ID
157
+ * into the frame, if you so choose.
158
+ *
159
+ * Do I have to use zstd's dictionary builder?
160
+ * -------------------------------------------
161
+ *
162
+ * No! You can construct dictionary content however you please, it is just
163
+ * bytes. It will always be valid as a raw content dictionary. If you want
164
+ * a zstd dictionary, which can improve compression ratio, use
165
+ * `ZDICT_finalizeDictionary()`.
166
+ *
167
+ * What is the attack surface of a zstd dictionary?
168
+ * ------------------------------------------------
169
+ *
170
+ * Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so
171
+ * zstd should never crash, or access out-of-bounds memory no matter what
172
+ * the dictionary is. However, if an attacker can control the dictionary
173
+ * during decompression, they can cause zstd to generate arbitrary bytes,
174
+ * just like if they controlled the compressed data.
175
+ *
176
+ ******************************************************************************/
177
+
39
178
 
40
179
  /*! ZDICT_trainFromBuffer():
41
180
  * Train a dictionary from an array of samples.
@@ -61,9 +200,63 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCap
61
200
  const void* samplesBuffer,
62
201
  const size_t* samplesSizes, unsigned nbSamples);
63
202
 
203
+ typedef struct {
204
+ int compressionLevel; /*< optimize for a specific zstd compression level; 0 means default */
205
+ unsigned notificationLevel; /*< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
206
+ unsigned dictID; /*< force dictID value; 0 means auto mode (32-bits random value)
207
+ * NOTE: The zstd format reserves some dictionary IDs for future use.
208
+ * You may use them in private settings, but be warned that they
209
+ * may be used by zstd in a public dictionary registry in the future.
210
+ * These dictionary IDs are:
211
+ * - low range : <= 32767
212
+ * - high range : >= (2^31)
213
+ */
214
+ } ZDICT_params_t;
215
+
216
+ /*! ZDICT_finalizeDictionary():
217
+ * Given a custom content as a basis for dictionary, and a set of samples,
218
+ * finalize dictionary by adding headers and statistics according to the zstd
219
+ * dictionary format.
220
+ *
221
+ * Samples must be stored concatenated in a flat buffer `samplesBuffer`,
222
+ * supplied with an array of sizes `samplesSizes`, providing the size of each
223
+ * sample in order. The samples are used to construct the statistics, so they
224
+ * should be representative of what you will compress with this dictionary.
225
+ *
226
+ * The compression level can be set in `parameters`. You should pass the
227
+ * compression level you expect to use in production. The statistics for each
228
+ * compression level differ, so tuning the dictionary for the compression level
229
+ * can help quite a bit.
230
+ *
231
+ * You can set an explicit dictionary ID in `parameters`, or allow us to pick
232
+ * a random dictionary ID for you, but we can't guarantee no collisions.
233
+ *
234
+ * The dstDictBuffer and the dictContent may overlap, and the content will be
235
+ * appended to the end of the header. If the header + the content doesn't fit in
236
+ * maxDictSize the beginning of the content is truncated to make room, since it
237
+ * is presumed that the most profitable content is at the end of the dictionary,
238
+ * since that is the cheapest to reference.
239
+ *
240
+ * `maxDictSize` must be >= max(dictContentSize, ZSTD_DICTSIZE_MIN).
241
+ *
242
+ * @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`),
243
+ * or an error code, which can be tested by ZDICT_isError().
244
+ * Note: ZDICT_finalizeDictionary() will push notifications into stderr if
245
+ * instructed to, using notificationLevel>0.
246
+ * NOTE: This function currently may fail in several edge cases including:
247
+ * * Not enough samples
248
+ * * Samples are uncompressible
249
+ * * Samples are all exactly the same
250
+ */
251
+ ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize,
252
+ const void* dictContent, size_t dictContentSize,
253
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
254
+ ZDICT_params_t parameters);
255
+
64
256
 
65
257
  /*====== Helper functions ======*/
66
258
  ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID; @return zero if error (not a valid dictionary) */
259
+ ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /* returns dict header size; returns a ZSTD error code on failure */
67
260
  ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode);
68
261
  ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
69
262
 
@@ -78,11 +271,9 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
78
271
  * Use them only in association with static linking.
79
272
  * ==================================================================================== */
80
273
 
81
- typedef struct {
82
- int compressionLevel; /* optimize for a specific zstd compression level; 0 means default */
83
- unsigned notificationLevel; /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
84
- unsigned dictID; /* force dictID value; 0 means auto mode (32-bits random value) */
85
- } ZDICT_params_t;
274
+ #define ZDICT_DICTSIZE_MIN 256
275
+ /* Deprecated: Remove in v1.6.0 */
276
+ #define ZDICT_CONTENTSIZE_MIN 128
86
277
 
87
278
  /*! ZDICT_cover_params_t:
88
279
  * k and d are the only required parameters.
@@ -198,28 +389,6 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
198
389
  const size_t* samplesSizes, unsigned nbSamples,
199
390
  ZDICT_fastCover_params_t* parameters);
200
391
 
201
- /*! ZDICT_finalizeDictionary():
202
- * Given a custom content as a basis for dictionary, and a set of samples,
203
- * finalize dictionary by adding headers and statistics.
204
- *
205
- * Samples must be stored concatenated in a flat buffer `samplesBuffer`,
206
- * supplied with an array of sizes `samplesSizes`, providing the size of each sample in order.
207
- *
208
- * dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes.
209
- * maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
210
- *
211
- * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
212
- * or an error code, which can be tested by ZDICT_isError().
213
- * Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
214
- * Note 2: dictBuffer and dictContent can overlap
215
- */
216
- #define ZDICT_CONTENTSIZE_MIN 128
217
- #define ZDICT_DICTSIZE_MIN 256
218
- ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
219
- const void* dictContent, size_t dictContentSize,
220
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
221
- ZDICT_params_t parameters);
222
-
223
392
  typedef struct {
224
393
  unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */
225
394
  ZDICT_params_t zParams;
@@ -241,10 +410,11 @@ typedef struct {
241
410
  * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
242
411
  */
243
412
  ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
244
- void *dictBuffer, size_t dictBufferCapacity,
245
- const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
413
+ void* dictBuffer, size_t dictBufferCapacity,
414
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
246
415
  ZDICT_legacy_params_t parameters);
247
416
 
417
+
248
418
  /* Deprecation warnings */
249
419
  /* It is generally possible to disable deprecation warnings from compiler,
250
420
  for example with -Wno-deprecated-declarations for gcc
@@ -256,7 +426,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
256
426
  # define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
257
427
  # if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
258
428
  # define ZDICT_DEPRECATED(message) [[deprecated(message)]] ZDICTLIB_API
259
- # elif (ZDICT_GCC_VERSION >= 405) || defined(__clang__)
429
+ # elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405)
260
430
  # define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message)))
261
431
  # elif (ZDICT_GCC_VERSION >= 301)
262
432
  # define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated))