zstdlib 0.13.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
Files changed (129) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +6 -0
  3. data/CHANGES.md +107 -0
  4. data/Gemfile +3 -0
  5. data/README.md +107 -0
  6. data/Rakefile +59 -0
  7. data/ext/zstdlib_c/extconf.rb +59 -0
  8. data/ext/zstdlib_c/ruby/zlib-2.2/zstdlib.c +4675 -0
  9. data/ext/zstdlib_c/ruby/zlib-2.3/zstdlib.c +4702 -0
  10. data/ext/zstdlib_c/ruby/zlib-2.4/zstdlib.c +4859 -0
  11. data/ext/zstdlib_c/ruby/zlib-2.5/zstdlib.c +4864 -0
  12. data/ext/zstdlib_c/ruby/zlib-2.6/zstdlib.c +4906 -0
  13. data/ext/zstdlib_c/ruby/zlib-2.7/zstdlib.c +4895 -0
  14. data/ext/zstdlib_c/ruby/zlib-3.0/zstdlib.c +4994 -0
  15. data/ext/zstdlib_c/ruby/zlib-3.1/zstdlib.c +5076 -0
  16. data/ext/zstdlib_c/ruby/zlib-3.2/zstdlib.c +5090 -0
  17. data/ext/zstdlib_c/ruby/zlib-3.3/zstdlib.c +5090 -0
  18. data/ext/zstdlib_c/zlib-1.3.1/adler32.c +164 -0
  19. data/ext/zstdlib_c/zlib-1.3.1/compress.c +75 -0
  20. data/ext/zstdlib_c/zlib-1.3.1/crc32.c +1049 -0
  21. data/ext/zstdlib_c/zlib-1.3.1/crc32.h +9446 -0
  22. data/ext/zstdlib_c/zlib-1.3.1/deflate.c +2139 -0
  23. data/ext/zstdlib_c/zlib-1.3.1/deflate.h +377 -0
  24. data/ext/zstdlib_c/zlib-1.3.1/gzclose.c +23 -0
  25. data/ext/zstdlib_c/zlib-1.3.1/gzguts.h +214 -0
  26. data/ext/zstdlib_c/zlib-1.3.1/gzlib.c +582 -0
  27. data/ext/zstdlib_c/zlib-1.3.1/gzread.c +602 -0
  28. data/ext/zstdlib_c/zlib-1.3.1/gzwrite.c +631 -0
  29. data/ext/zstdlib_c/zlib-1.3.1/infback.c +628 -0
  30. data/ext/zstdlib_c/zlib-1.3.1/inffast.c +320 -0
  31. data/ext/zstdlib_c/zlib-1.3.1/inffast.h +11 -0
  32. data/ext/zstdlib_c/zlib-1.3.1/inffixed.h +94 -0
  33. data/ext/zstdlib_c/zlib-1.3.1/inflate.c +1526 -0
  34. data/ext/zstdlib_c/zlib-1.3.1/inflate.h +126 -0
  35. data/ext/zstdlib_c/zlib-1.3.1/inftrees.c +299 -0
  36. data/ext/zstdlib_c/zlib-1.3.1/inftrees.h +62 -0
  37. data/ext/zstdlib_c/zlib-1.3.1/trees.c +1117 -0
  38. data/ext/zstdlib_c/zlib-1.3.1/trees.h +128 -0
  39. data/ext/zstdlib_c/zlib-1.3.1/uncompr.c +85 -0
  40. data/ext/zstdlib_c/zlib-1.3.1/zconf.h +543 -0
  41. data/ext/zstdlib_c/zlib-1.3.1/zlib.h +1938 -0
  42. data/ext/zstdlib_c/zlib-1.3.1/zutil.c +299 -0
  43. data/ext/zstdlib_c/zlib-1.3.1/zutil.h +254 -0
  44. data/ext/zstdlib_c/zlib.mk +14 -0
  45. data/ext/zstdlib_c/zlibwrapper/zlibwrapper.c +10 -0
  46. data/ext/zstdlib_c/zlibwrapper.mk +14 -0
  47. data/ext/zstdlib_c/zstd-1.5.6/lib/common/allocations.h +55 -0
  48. data/ext/zstdlib_c/zstd-1.5.6/lib/common/bits.h +200 -0
  49. data/ext/zstdlib_c/zstd-1.5.6/lib/common/bitstream.h +457 -0
  50. data/ext/zstdlib_c/zstd-1.5.6/lib/common/compiler.h +450 -0
  51. data/ext/zstdlib_c/zstd-1.5.6/lib/common/cpu.h +249 -0
  52. data/ext/zstdlib_c/zstd-1.5.6/lib/common/debug.c +30 -0
  53. data/ext/zstdlib_c/zstd-1.5.6/lib/common/debug.h +116 -0
  54. data/ext/zstdlib_c/zstd-1.5.6/lib/common/entropy_common.c +340 -0
  55. data/ext/zstdlib_c/zstd-1.5.6/lib/common/error_private.c +63 -0
  56. data/ext/zstdlib_c/zstd-1.5.6/lib/common/error_private.h +168 -0
  57. data/ext/zstdlib_c/zstd-1.5.6/lib/common/fse.h +640 -0
  58. data/ext/zstdlib_c/zstd-1.5.6/lib/common/fse_decompress.c +313 -0
  59. data/ext/zstdlib_c/zstd-1.5.6/lib/common/huf.h +286 -0
  60. data/ext/zstdlib_c/zstd-1.5.6/lib/common/mem.h +426 -0
  61. data/ext/zstdlib_c/zstd-1.5.6/lib/common/pool.c +371 -0
  62. data/ext/zstdlib_c/zstd-1.5.6/lib/common/pool.h +90 -0
  63. data/ext/zstdlib_c/zstd-1.5.6/lib/common/portability_macros.h +158 -0
  64. data/ext/zstdlib_c/zstd-1.5.6/lib/common/threading.c +182 -0
  65. data/ext/zstdlib_c/zstd-1.5.6/lib/common/threading.h +150 -0
  66. data/ext/zstdlib_c/zstd-1.5.6/lib/common/xxhash.c +18 -0
  67. data/ext/zstdlib_c/zstd-1.5.6/lib/common/xxhash.h +7020 -0
  68. data/ext/zstdlib_c/zstd-1.5.6/lib/common/zstd_common.c +48 -0
  69. data/ext/zstdlib_c/zstd-1.5.6/lib/common/zstd_deps.h +111 -0
  70. data/ext/zstdlib_c/zstd-1.5.6/lib/common/zstd_internal.h +392 -0
  71. data/ext/zstdlib_c/zstd-1.5.6/lib/common/zstd_trace.h +163 -0
  72. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/clevels.h +134 -0
  73. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/fse_compress.c +625 -0
  74. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/hist.c +181 -0
  75. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/hist.h +75 -0
  76. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/huf_compress.c +1464 -0
  77. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress.c +7153 -0
  78. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_internal.h +1534 -0
  79. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_literals.c +235 -0
  80. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_literals.h +39 -0
  81. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_sequences.c +442 -0
  82. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_sequences.h +54 -0
  83. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_superblock.c +688 -0
  84. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_superblock.h +32 -0
  85. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_cwksp.h +748 -0
  86. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_double_fast.c +770 -0
  87. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_double_fast.h +50 -0
  88. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_fast.c +968 -0
  89. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_fast.h +38 -0
  90. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_lazy.c +2199 -0
  91. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_lazy.h +202 -0
  92. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_ldm.c +730 -0
  93. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_ldm.h +117 -0
  94. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_ldm_geartab.h +106 -0
  95. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_opt.c +1576 -0
  96. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_opt.h +80 -0
  97. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstdmt_compress.c +1882 -0
  98. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstdmt_compress.h +113 -0
  99. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/huf_decompress.c +1944 -0
  100. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/huf_decompress_amd64.S +595 -0
  101. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_ddict.c +244 -0
  102. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_ddict.h +44 -0
  103. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_decompress.c +2407 -0
  104. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_decompress_block.c +2215 -0
  105. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_decompress_block.h +73 -0
  106. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_decompress_internal.h +240 -0
  107. data/ext/zstdlib_c/zstd-1.5.6/lib/zdict.h +474 -0
  108. data/ext/zstdlib_c/zstd-1.5.6/lib/zstd.h +3089 -0
  109. data/ext/zstdlib_c/zstd-1.5.6/lib/zstd_errors.h +114 -0
  110. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzclose.c +26 -0
  111. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzcompatibility.h +68 -0
  112. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzguts.h +229 -0
  113. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzlib.c +587 -0
  114. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzread.c +637 -0
  115. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzwrite.c +631 -0
  116. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/zstd_zlibwrapper.c +1200 -0
  117. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/zstd_zlibwrapper.h +91 -0
  118. data/ext/zstdlib_c/zstd.mk +15 -0
  119. data/lib/2.4/zstdlib_c.so +0 -0
  120. data/lib/2.5/zstdlib_c.so +0 -0
  121. data/lib/2.6/zstdlib_c.so +0 -0
  122. data/lib/2.7/zstdlib_c.so +0 -0
  123. data/lib/3.0/zstdlib_c.so +0 -0
  124. data/lib/3.1/zstdlib_c.so +0 -0
  125. data/lib/3.2/zstdlib_c.so +0 -0
  126. data/lib/3.3/zstdlib_c.so +0 -0
  127. data/lib/zstdlib.rb +6 -0
  128. data/test/zstdlib_test.rb +21 -0
  129. metadata +243 -0
@@ -0,0 +1,474 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
11
+ #if defined (__cplusplus)
12
+ extern "C" {
13
+ #endif
14
+
15
+ #ifndef ZSTD_ZDICT_H
16
+ #define ZSTD_ZDICT_H
17
+
18
+ /*====== Dependencies ======*/
19
+ #include <stddef.h> /* size_t */
20
+
21
+
22
+ /* ===== ZDICTLIB_API : control library symbols visibility ===== */
23
+ #ifndef ZDICTLIB_VISIBLE
24
+ /* Backwards compatibility with old macro name */
25
+ # ifdef ZDICTLIB_VISIBILITY
26
+ # define ZDICTLIB_VISIBLE ZDICTLIB_VISIBILITY
27
+ # elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
28
+ # define ZDICTLIB_VISIBLE __attribute__ ((visibility ("default")))
29
+ # else
30
+ # define ZDICTLIB_VISIBLE
31
+ # endif
32
+ #endif
33
+
34
+ #ifndef ZDICTLIB_HIDDEN
35
+ # if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
36
+ # define ZDICTLIB_HIDDEN __attribute__ ((visibility ("hidden")))
37
+ # else
38
+ # define ZDICTLIB_HIDDEN
39
+ # endif
40
+ #endif
41
+
42
+ #if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
43
+ # define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBLE
44
+ #elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
45
+ # define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBLE /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
46
+ #else
47
+ # define ZDICTLIB_API ZDICTLIB_VISIBLE
48
+ #endif
49
+
50
+ /*******************************************************************************
51
+ * Zstd dictionary builder
52
+ *
53
+ * FAQ
54
+ * ===
55
+ * Why should I use a dictionary?
56
+ * ------------------------------
57
+ *
58
+ * Zstd can use dictionaries to improve compression ratio of small data.
59
+ * Traditionally small files don't compress well because there is very little
60
+ * repetition in a single sample, since it is small. But, if you are compressing
61
+ * many similar files, like a bunch of JSON records that share the same
62
+ * structure, you can train a dictionary ahead of time on some samples of
63
+ * these files. Then, zstd can use the dictionary to find repetitions that are
64
+ * present across samples. This can vastly improve compression ratio.
65
+ *
66
+ * When is a dictionary useful?
67
+ * ----------------------------
68
+ *
69
+ * Dictionaries are useful when compressing many small files that are similar.
70
+ * The larger a file is, the less benefit a dictionary will have. Generally,
71
+ * we don't expect dictionary compression to be effective past 100KB. And the
72
+ * smaller a file is, the more we would expect the dictionary to help.
73
+ *
74
+ * How do I use a dictionary?
75
+ * --------------------------
76
+ *
77
+ * Simply pass the dictionary to the zstd compressor with
78
+ * `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to
79
+ * the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other
80
+ * more advanced functions that allow selecting some options, see zstd.h for
81
+ * complete documentation.
82
+ *
83
+ * What is a zstd dictionary?
84
+ * --------------------------
85
+ *
86
+ * A zstd dictionary has two pieces: Its header, and its content. The header
87
+ * contains a magic number, the dictionary ID, and entropy tables. These
88
+ * entropy tables allow zstd to save on header costs in the compressed file,
89
+ * which really matters for small data. The content is just bytes, which are
90
+ * repeated content that is common across many samples.
91
+ *
92
+ * What is a raw content dictionary?
93
+ * ---------------------------------
94
+ *
95
+ * A raw content dictionary is just bytes. It doesn't have a zstd dictionary
96
+ * header, a dictionary ID, or entropy tables. Any buffer is a valid raw
97
+ * content dictionary.
98
+ *
99
+ * How do I train a dictionary?
100
+ * ----------------------------
101
+ *
102
+ * Gather samples from your use case. These samples should be similar to each
103
+ * other. If you have several use cases, you could try to train one dictionary
104
+ * per use case.
105
+ *
106
+ * Pass those samples to `ZDICT_trainFromBuffer()` and that will train your
107
+ * dictionary. There are a few advanced versions of this function, but this
108
+ * is a great starting point. If you want to further tune your dictionary
109
+ * you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow
110
+ * you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`.
111
+ *
112
+ * If the dictionary training function fails, that is likely because you
113
+ * either passed too few samples, or a dictionary would not be effective
114
+ * for your data. Look at the messages that the dictionary trainer printed,
115
+ * if it doesn't say too few samples, then a dictionary would not be effective.
116
+ *
117
+ * How large should my dictionary be?
118
+ * ----------------------------------
119
+ *
120
+ * A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB.
121
+ * The zstd CLI defaults to a 110KB dictionary. You likely don't need a
122
+ * dictionary larger than that. But, most use cases can get away with a
123
+ * smaller dictionary. The advanced dictionary builders can automatically
124
+ * shrink the dictionary for you, and select the smallest size that doesn't
125
+ * hurt compression ratio too much. See the `shrinkDict` parameter.
126
+ * A smaller dictionary can save memory, and potentially speed up
127
+ * compression.
128
+ *
129
+ * How many samples should I provide to the dictionary builder?
130
+ * ------------------------------------------------------------
131
+ *
132
+ * We generally recommend passing ~100x the size of the dictionary
133
+ * in samples. A few thousand should suffice. Having too few samples
134
+ * can hurt the dictionary's effectiveness. Having more samples will
135
+ * only improve the dictionary's effectiveness. But having too many
136
+ * samples can slow down the dictionary builder.
137
+ *
138
+ * How do I determine if a dictionary will be effective?
139
+ * -----------------------------------------------------
140
+ *
141
+ * Simply train a dictionary and try it out. You can use zstd's built-in
142
+ * benchmarking tool to test the dictionary effectiveness.
143
+ *
144
+ * # Benchmark levels 1-3 without a dictionary
145
+ * zstd -b1e3 -r /path/to/my/files
146
+ * # Benchmark levels 1-3 with a dictionary
147
+ * zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary
148
+ *
149
+ * When should I retrain a dictionary?
150
+ * -----------------------------------
151
+ *
152
+ * You should retrain a dictionary when its effectiveness drops. Dictionary
153
+ * effectiveness drops as the data you are compressing changes. Generally, we do
154
+ * expect dictionaries to "decay" over time, as your data changes, but the rate
155
+ * at which they decay depends on your use case. Internally, we regularly
156
+ * retrain dictionaries, and if the new dictionary performs significantly
157
+ * better than the old dictionary, we will ship the new dictionary.
158
+ *
159
+ * I have a raw content dictionary, how do I turn it into a zstd dictionary?
160
+ * -------------------------------------------------------------------------
161
+ *
162
+ * If you have a raw content dictionary, e.g. by manually constructing it, or
163
+ * using a third-party dictionary builder, you can turn it into a zstd
164
+ * dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to
165
+ * provide some samples of the data. It will add the zstd header to the
166
+ * raw content, which contains a dictionary ID and entropy tables, which
167
+ * will improve compression ratio, and allow zstd to write the dictionary ID
168
+ * into the frame, if you so choose.
169
+ *
170
+ * Do I have to use zstd's dictionary builder?
171
+ * -------------------------------------------
172
+ *
173
+ * No! You can construct dictionary content however you please, it is just
174
+ * bytes. It will always be valid as a raw content dictionary. If you want
175
+ * a zstd dictionary, which can improve compression ratio, use
176
+ * `ZDICT_finalizeDictionary()`.
177
+ *
178
+ * What is the attack surface of a zstd dictionary?
179
+ * ------------------------------------------------
180
+ *
181
+ * Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so
182
+ * zstd should never crash, or access out-of-bounds memory no matter what
183
+ * the dictionary is. However, if an attacker can control the dictionary
184
+ * during decompression, they can cause zstd to generate arbitrary bytes,
185
+ * just like if they controlled the compressed data.
186
+ *
187
+ ******************************************************************************/
188
+
189
+
190
+ /*! ZDICT_trainFromBuffer():
191
+ * Train a dictionary from an array of samples.
192
+ * Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4,
193
+ * f=20, and accel=1.
194
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
195
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
196
+ * The resulting dictionary will be saved into `dictBuffer`.
197
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
198
+ * or an error code, which can be tested with ZDICT_isError().
199
+ * Note: Dictionary training will fail if there are not enough samples to construct a
200
+ * dictionary, or if most of the samples are too small (< 8 bytes being the lower limit).
201
+ * If dictionary training fails, you should use zstd without a dictionary, as the dictionary
202
+ * would've been ineffective anyways. If you believe your samples would benefit from a dictionary
203
+ * please open an issue with details, and we can look into it.
204
+ * Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB.
205
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
206
+ * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
207
+ * In general, it's recommended to provide a few thousands samples, though this can vary a lot.
208
+ * It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
209
+ */
210
+ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
211
+ const void* samplesBuffer,
212
+ const size_t* samplesSizes, unsigned nbSamples);
213
+
214
+ typedef struct {
215
+ int compressionLevel; /**< optimize for a specific zstd compression level; 0 means default */
216
+ unsigned notificationLevel; /**< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
217
+ unsigned dictID; /**< force dictID value; 0 means auto mode (32-bits random value)
218
+ * NOTE: The zstd format reserves some dictionary IDs for future use.
219
+ * You may use them in private settings, but be warned that they
220
+ * may be used by zstd in a public dictionary registry in the future.
221
+ * These dictionary IDs are:
222
+ * - low range : <= 32767
223
+ * - high range : >= (2^31)
224
+ */
225
+ } ZDICT_params_t;
226
+
227
+ /*! ZDICT_finalizeDictionary():
228
+ * Given a custom content as a basis for dictionary, and a set of samples,
229
+ * finalize dictionary by adding headers and statistics according to the zstd
230
+ * dictionary format.
231
+ *
232
+ * Samples must be stored concatenated in a flat buffer `samplesBuffer`,
233
+ * supplied with an array of sizes `samplesSizes`, providing the size of each
234
+ * sample in order. The samples are used to construct the statistics, so they
235
+ * should be representative of what you will compress with this dictionary.
236
+ *
237
+ * The compression level can be set in `parameters`. You should pass the
238
+ * compression level you expect to use in production. The statistics for each
239
+ * compression level differ, so tuning the dictionary for the compression level
240
+ * can help quite a bit.
241
+ *
242
+ * You can set an explicit dictionary ID in `parameters`, or allow us to pick
243
+ * a random dictionary ID for you, but we can't guarantee no collisions.
244
+ *
245
+ * The dstDictBuffer and the dictContent may overlap, and the content will be
246
+ * appended to the end of the header. If the header + the content doesn't fit in
247
+ * maxDictSize the beginning of the content is truncated to make room, since it
248
+ * is presumed that the most profitable content is at the end of the dictionary,
249
+ * since that is the cheapest to reference.
250
+ *
251
+ * `maxDictSize` must be >= max(dictContentSize, ZDICT_DICTSIZE_MIN).
252
+ *
253
+ * @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`),
254
+ * or an error code, which can be tested by ZDICT_isError().
255
+ * Note: ZDICT_finalizeDictionary() will push notifications into stderr if
256
+ * instructed to, using notificationLevel>0.
257
+ * NOTE: This function currently may fail in several edge cases including:
258
+ * * Not enough samples
259
+ * * Samples are uncompressible
260
+ * * Samples are all exactly the same
261
+ */
262
+ ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize,
263
+ const void* dictContent, size_t dictContentSize,
264
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
265
+ ZDICT_params_t parameters);
266
+
267
+
268
+ /*====== Helper functions ======*/
269
+ ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID; @return zero if error (not a valid dictionary) */
270
+ ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /* returns dict header size; returns a ZSTD error code on failure */
271
+ ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode);
272
+ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
273
+
274
+ #endif /* ZSTD_ZDICT_H */
275
+
276
+ #if defined(ZDICT_STATIC_LINKING_ONLY) && !defined(ZSTD_ZDICT_H_STATIC)
277
+ #define ZSTD_ZDICT_H_STATIC
278
+
279
+ /* This can be overridden externally to hide static symbols. */
280
+ #ifndef ZDICTLIB_STATIC_API
281
+ # if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
282
+ # define ZDICTLIB_STATIC_API __declspec(dllexport) ZDICTLIB_VISIBLE
283
+ # elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
284
+ # define ZDICTLIB_STATIC_API __declspec(dllimport) ZDICTLIB_VISIBLE
285
+ # else
286
+ # define ZDICTLIB_STATIC_API ZDICTLIB_VISIBLE
287
+ # endif
288
+ #endif
289
+
290
+ /* ====================================================================================
291
+ * The definitions in this section are considered experimental.
292
+ * They should never be used with a dynamic library, as they may change in the future.
293
+ * They are provided for advanced usages.
294
+ * Use them only in association with static linking.
295
+ * ==================================================================================== */
296
+
297
+ #define ZDICT_DICTSIZE_MIN 256
298
+ /* Deprecated: Remove in v1.6.0 */
299
+ #define ZDICT_CONTENTSIZE_MIN 128
300
+
301
+ /*! ZDICT_cover_params_t:
302
+ * k and d are the only required parameters.
303
+ * For others, value 0 means default.
304
+ */
305
+ typedef struct {
306
+ unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
307
+ unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
308
+ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
309
+ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
310
+ double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
311
+ unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
312
+ unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worse shrinkDictMaxRegression% worse than the max dict size dictionary. */
313
+ ZDICT_params_t zParams;
314
+ } ZDICT_cover_params_t;
315
+
316
+ typedef struct {
317
+ unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
318
+ unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
319
+ unsigned f; /* log of size of frequency array : constraint: 0 < f <= 31 : 0 means default(20)*/
320
+ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
321
+ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
322
+ double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
323
+ unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
324
+ unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
325
+ unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worse shrinkDictMaxRegression% worse than the max dict size dictionary. */
326
+
327
+ ZDICT_params_t zParams;
328
+ } ZDICT_fastCover_params_t;
329
+
330
+ /*! ZDICT_trainFromBuffer_cover():
331
+ * Train a dictionary from an array of samples using the COVER algorithm.
332
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
333
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
334
+ * The resulting dictionary will be saved into `dictBuffer`.
335
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
336
+ * or an error code, which can be tested with ZDICT_isError().
337
+ * See ZDICT_trainFromBuffer() for details on failure modes.
338
+ * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
339
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
340
+ * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
341
+ * In general, it's recommended to provide a few thousands samples, though this can vary a lot.
342
+ * It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
343
+ */
344
+ ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover(
345
+ void *dictBuffer, size_t dictBufferCapacity,
346
+ const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
347
+ ZDICT_cover_params_t parameters);
348
+
349
+ /*! ZDICT_optimizeTrainFromBuffer_cover():
350
+ * The same requirements as above hold for all the parameters except `parameters`.
351
+ * This function tries many parameter combinations and picks the best parameters.
352
+ * `*parameters` is filled with the best parameters found,
353
+ * dictionary constructed with those parameters is stored in `dictBuffer`.
354
+ *
355
+ * All of the parameters d, k, steps are optional.
356
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
357
+ * If steps is zero it defaults to its default value.
358
+ * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000].
359
+ *
360
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
361
+ * or an error code, which can be tested with ZDICT_isError().
362
+ * On success `*parameters` contains the parameters selected.
363
+ * See ZDICT_trainFromBuffer() for details on failure modes.
364
+ * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread.
365
+ */
366
+ ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover(
367
+ void* dictBuffer, size_t dictBufferCapacity,
368
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
369
+ ZDICT_cover_params_t* parameters);
370
+
371
+ /*! ZDICT_trainFromBuffer_fastCover():
372
+ * Train a dictionary from an array of samples using a modified version of COVER algorithm.
373
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
374
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
375
+ * d and k are required.
376
+ * All other parameters are optional, will use default values if not provided
377
+ * The resulting dictionary will be saved into `dictBuffer`.
378
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
379
+ * or an error code, which can be tested with ZDICT_isError().
380
+ * See ZDICT_trainFromBuffer() for details on failure modes.
381
+ * Note: ZDICT_trainFromBuffer_fastCover() requires 6 * 2^f bytes of memory.
382
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
383
+ * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
384
+ * In general, it's recommended to provide a few thousands samples, though this can vary a lot.
385
+ * It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
386
+ */
387
+ ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer,
388
+ size_t dictBufferCapacity, const void *samplesBuffer,
389
+ const size_t *samplesSizes, unsigned nbSamples,
390
+ ZDICT_fastCover_params_t parameters);
391
+
392
+ /*! ZDICT_optimizeTrainFromBuffer_fastCover():
393
+ * The same requirements as above hold for all the parameters except `parameters`.
394
+ * This function tries many parameter combinations (specifically, k and d combinations)
395
+ * and picks the best parameters. `*parameters` is filled with the best parameters found,
396
+ * dictionary constructed with those parameters is stored in `dictBuffer`.
397
+ * All of the parameters d, k, steps, f, and accel are optional.
398
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
399
+ * If steps is zero it defaults to its default value.
400
+ * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000].
401
+ * If f is zero, default value of 20 is used.
402
+ * If accel is zero, default value of 1 is used.
403
+ *
404
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
405
+ * or an error code, which can be tested with ZDICT_isError().
406
+ * On success `*parameters` contains the parameters selected.
407
+ * See ZDICT_trainFromBuffer() for details on failure modes.
408
+ * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 6 * 2^f bytes of memory for each thread.
409
+ */
410
+ ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
411
+ size_t dictBufferCapacity, const void* samplesBuffer,
412
+ const size_t* samplesSizes, unsigned nbSamples,
413
+ ZDICT_fastCover_params_t* parameters);
414
+
415
+ typedef struct {
416
+ unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */
417
+ ZDICT_params_t zParams;
418
+ } ZDICT_legacy_params_t;
419
+
420
+ /*! ZDICT_trainFromBuffer_legacy():
421
+ * Train a dictionary from an array of samples.
422
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
423
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
424
+ * The resulting dictionary will be saved into `dictBuffer`.
425
+ * `parameters` is optional and can be provided with values set to 0 to mean "default".
426
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
427
+ * or an error code, which can be tested with ZDICT_isError().
428
+ * See ZDICT_trainFromBuffer() for details on failure modes.
429
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
430
+ * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
431
+ * In general, it's recommended to provide a few thousands samples, though this can vary a lot.
432
+ * It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
433
+ * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
434
+ */
435
+ ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_legacy(
436
+ void* dictBuffer, size_t dictBufferCapacity,
437
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
438
+ ZDICT_legacy_params_t parameters);
439
+
440
+
441
+ /* Deprecation warnings */
442
+ /* It is generally possible to disable deprecation warnings from compiler,
443
+ for example with -Wno-deprecated-declarations for gcc
444
+ or _CRT_SECURE_NO_WARNINGS in Visual.
445
+ Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */
446
+ #ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS
447
+ # define ZDICT_DEPRECATED(message) /* disable deprecation warnings */
448
+ #else
449
+ # define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
450
+ # if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
451
+ # define ZDICT_DEPRECATED(message) [[deprecated(message)]]
452
+ # elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405)
453
+ # define ZDICT_DEPRECATED(message) __attribute__((deprecated(message)))
454
+ # elif (ZDICT_GCC_VERSION >= 301)
455
+ # define ZDICT_DEPRECATED(message) __attribute__((deprecated))
456
+ # elif defined(_MSC_VER)
457
+ # define ZDICT_DEPRECATED(message) __declspec(deprecated(message))
458
+ # else
459
+ # pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler")
460
+ # define ZDICT_DEPRECATED(message)
461
+ # endif
462
+ #endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */
463
+
464
+ ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead")
465
+ ZDICTLIB_STATIC_API
466
+ size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
467
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
468
+
469
+
470
+ #endif /* ZSTD_ZDICT_H_STATIC */
471
+
472
+ #if defined (__cplusplus)
473
+ }
474
+ #endif