zstdlib 0.9.0-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +6 -0
  3. data/CHANGES.md +78 -0
  4. data/Gemfile +3 -0
  5. data/README.md +107 -0
  6. data/Rakefile +59 -0
  7. data/ext/zstdlib_c/extconf.rb +54 -0
  8. data/ext/zstdlib_c/ruby/zlib-2.2/zstdlib.c +4675 -0
  9. data/ext/zstdlib_c/ruby/zlib-2.3/zstdlib.c +4702 -0
  10. data/ext/zstdlib_c/ruby/zlib-2.4/zstdlib.c +4859 -0
  11. data/ext/zstdlib_c/ruby/zlib-2.5/zstdlib.c +4864 -0
  12. data/ext/zstdlib_c/ruby/zlib-2.6/zstdlib.c +4906 -0
  13. data/ext/zstdlib_c/ruby/zlib-2.7/zstdlib.c +4895 -0
  14. data/ext/zstdlib_c/ruby/zlib-3.0/zstdlib.c +4994 -0
  15. data/ext/zstdlib_c/ruby/zlib-3.1/zstdlib.c +5076 -0
  16. data/ext/zstdlib_c/zlib-1.2.11/adler32.c +186 -0
  17. data/ext/zstdlib_c/zlib-1.2.11/compress.c +86 -0
  18. data/ext/zstdlib_c/zlib-1.2.11/crc32.c +442 -0
  19. data/ext/zstdlib_c/zlib-1.2.11/crc32.h +441 -0
  20. data/ext/zstdlib_c/zlib-1.2.11/deflate.c +2163 -0
  21. data/ext/zstdlib_c/zlib-1.2.11/deflate.h +349 -0
  22. data/ext/zstdlib_c/zlib-1.2.11/gzclose.c +25 -0
  23. data/ext/zstdlib_c/zlib-1.2.11/gzguts.h +218 -0
  24. data/ext/zstdlib_c/zlib-1.2.11/gzlib.c +637 -0
  25. data/ext/zstdlib_c/zlib-1.2.11/gzread.c +654 -0
  26. data/ext/zstdlib_c/zlib-1.2.11/gzwrite.c +665 -0
  27. data/ext/zstdlib_c/zlib-1.2.11/infback.c +640 -0
  28. data/ext/zstdlib_c/zlib-1.2.11/inffast.c +323 -0
  29. data/ext/zstdlib_c/zlib-1.2.11/inffast.h +11 -0
  30. data/ext/zstdlib_c/zlib-1.2.11/inffixed.h +94 -0
  31. data/ext/zstdlib_c/zlib-1.2.11/inflate.c +1561 -0
  32. data/ext/zstdlib_c/zlib-1.2.11/inflate.h +125 -0
  33. data/ext/zstdlib_c/zlib-1.2.11/inftrees.c +304 -0
  34. data/ext/zstdlib_c/zlib-1.2.11/inftrees.h +62 -0
  35. data/ext/zstdlib_c/zlib-1.2.11/trees.c +1203 -0
  36. data/ext/zstdlib_c/zlib-1.2.11/trees.h +128 -0
  37. data/ext/zstdlib_c/zlib-1.2.11/uncompr.c +93 -0
  38. data/ext/zstdlib_c/zlib-1.2.11/zconf.h +534 -0
  39. data/ext/zstdlib_c/zlib-1.2.11/zlib.h +1912 -0
  40. data/ext/zstdlib_c/zlib-1.2.11/zutil.c +325 -0
  41. data/ext/zstdlib_c/zlib-1.2.11/zutil.h +271 -0
  42. data/ext/zstdlib_c/zlib.mk +14 -0
  43. data/ext/zstdlib_c/zlibwrapper/zlibwrapper.c +10 -0
  44. data/ext/zstdlib_c/zlibwrapper.mk +14 -0
  45. data/ext/zstdlib_c/zstd-1.5.2/lib/common/bitstream.h +478 -0
  46. data/ext/zstdlib_c/zstd-1.5.2/lib/common/compiler.h +335 -0
  47. data/ext/zstdlib_c/zstd-1.5.2/lib/common/cpu.h +213 -0
  48. data/ext/zstdlib_c/zstd-1.5.2/lib/common/debug.c +24 -0
  49. data/ext/zstdlib_c/zstd-1.5.2/lib/common/debug.h +107 -0
  50. data/ext/zstdlib_c/zstd-1.5.2/lib/common/entropy_common.c +368 -0
  51. data/ext/zstdlib_c/zstd-1.5.2/lib/common/error_private.c +56 -0
  52. data/ext/zstdlib_c/zstd-1.5.2/lib/common/error_private.h +159 -0
  53. data/ext/zstdlib_c/zstd-1.5.2/lib/common/fse.h +717 -0
  54. data/ext/zstdlib_c/zstd-1.5.2/lib/common/fse_decompress.c +403 -0
  55. data/ext/zstdlib_c/zstd-1.5.2/lib/common/huf.h +364 -0
  56. data/ext/zstdlib_c/zstd-1.5.2/lib/common/mem.h +442 -0
  57. data/ext/zstdlib_c/zstd-1.5.2/lib/common/pool.c +355 -0
  58. data/ext/zstdlib_c/zstd-1.5.2/lib/common/pool.h +84 -0
  59. data/ext/zstdlib_c/zstd-1.5.2/lib/common/portability_macros.h +137 -0
  60. data/ext/zstdlib_c/zstd-1.5.2/lib/common/threading.c +122 -0
  61. data/ext/zstdlib_c/zstd-1.5.2/lib/common/threading.h +155 -0
  62. data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.c +24 -0
  63. data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.h +5686 -0
  64. data/ext/zstdlib_c/zstd-1.5.2/lib/common/zstd_common.c +83 -0
  65. data/ext/zstdlib_c/zstd-1.5.2/lib/common/zstd_deps.h +111 -0
  66. data/ext/zstdlib_c/zstd-1.5.2/lib/common/zstd_internal.h +493 -0
  67. data/ext/zstdlib_c/zstd-1.5.2/lib/common/zstd_trace.h +163 -0
  68. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/clevels.h +134 -0
  69. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/fse_compress.c +741 -0
  70. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/hist.c +181 -0
  71. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/hist.h +75 -0
  72. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/huf_compress.c +1370 -0
  73. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress.c +6327 -0
  74. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_internal.h +1458 -0
  75. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_literals.c +159 -0
  76. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_literals.h +31 -0
  77. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_sequences.c +442 -0
  78. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_sequences.h +54 -0
  79. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_superblock.c +573 -0
  80. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_superblock.h +32 -0
  81. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_cwksp.h +676 -0
  82. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_double_fast.c +696 -0
  83. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_double_fast.h +38 -0
  84. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_fast.c +675 -0
  85. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_fast.h +37 -0
  86. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_lazy.c +2104 -0
  87. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_lazy.h +125 -0
  88. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_ldm.c +724 -0
  89. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_ldm.h +117 -0
  90. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_ldm_geartab.h +106 -0
  91. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_opt.c +1446 -0
  92. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_opt.h +56 -0
  93. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstdmt_compress.c +1859 -0
  94. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstdmt_compress.h +113 -0
  95. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/huf_decompress.c +1889 -0
  96. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/huf_decompress_amd64.S +585 -0
  97. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/zstd_ddict.c +244 -0
  98. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/zstd_ddict.h +44 -0
  99. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/zstd_decompress.c +2230 -0
  100. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/zstd_decompress_block.c +2072 -0
  101. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/zstd_decompress_block.h +68 -0
  102. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/zstd_decompress_internal.h +236 -0
  103. data/ext/zstdlib_c/zstd-1.5.2/lib/zdict.h +452 -0
  104. data/ext/zstdlib_c/zstd-1.5.2/lib/zstd.h +2575 -0
  105. data/ext/zstdlib_c/zstd-1.5.2/lib/zstd_errors.h +95 -0
  106. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/gzclose.c +28 -0
  107. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/gzcompatibility.h +68 -0
  108. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/gzguts.h +229 -0
  109. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/gzlib.c +640 -0
  110. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/gzread.c +678 -0
  111. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/gzwrite.c +671 -0
  112. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/zstd_zlibwrapper.c +1198 -0
  113. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/zstd_zlibwrapper.h +88 -0
  114. data/ext/zstdlib_c/zstd.mk +15 -0
  115. data/lib/3.1/zstdlib_c.so +0 -0
  116. data/lib/zstdlib.rb +6 -0
  117. data/test/zstdlib_test.rb +21 -0
  118. metadata +232 -0
@@ -0,0 +1,452 @@
1
+ /*
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
11
+ #ifndef DICTBUILDER_H_001
12
+ #define DICTBUILDER_H_001
13
+
14
+ #if defined (__cplusplus)
15
+ extern "C" {
16
+ #endif
17
+
18
+
19
+ /*====== Dependencies ======*/
20
+ #include <stddef.h> /* size_t */
21
+
22
+
23
+ /* ===== ZDICTLIB_API : control library symbols visibility ===== */
24
+ #ifndef ZDICTLIB_VISIBILITY
25
+ # if defined(__GNUC__) && (__GNUC__ >= 4)
26
+ # define ZDICTLIB_VISIBILITY __attribute__ ((visibility ("default")))
27
+ # else
28
+ # define ZDICTLIB_VISIBILITY
29
+ # endif
30
+ #endif
31
+ #if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
32
+ # define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBILITY
33
+ #elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
34
+ # define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
35
+ #else
36
+ # define ZDICTLIB_API ZDICTLIB_VISIBILITY
37
+ #endif
38
+
39
+ /*******************************************************************************
40
+ * Zstd dictionary builder
41
+ *
42
+ * FAQ
43
+ * ===
44
+ * Why should I use a dictionary?
45
+ * ------------------------------
46
+ *
47
+ * Zstd can use dictionaries to improve compression ratio of small data.
48
+ * Traditionally small files don't compress well because there is very little
49
+ * repetition in a single sample, since it is small. But, if you are compressing
50
+ * many similar files, like a bunch of JSON records that share the same
51
+ * structure, you can train a dictionary ahead of time on some samples of
52
+ * these files. Then, zstd can use the dictionary to find repetitions that are
53
+ * present across samples. This can vastly improve compression ratio.
54
+ *
55
+ * When is a dictionary useful?
56
+ * ----------------------------
57
+ *
58
+ * Dictionaries are useful when compressing many small files that are similar.
59
+ * The larger a file is, the less benefit a dictionary will have. Generally,
60
+ * we don't expect dictionary compression to be effective past 100KB. And the
61
+ * smaller a file is, the more we would expect the dictionary to help.
62
+ *
63
+ * How do I use a dictionary?
64
+ * --------------------------
65
+ *
66
+ * Simply pass the dictionary to the zstd compressor with
67
+ * `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to
68
+ * the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other
69
+ * more advanced functions that allow selecting some options, see zstd.h for
70
+ * complete documentation.
71
+ *
72
+ * What is a zstd dictionary?
73
+ * --------------------------
74
+ *
75
+ * A zstd dictionary has two pieces: Its header, and its content. The header
76
+ * contains a magic number, the dictionary ID, and entropy tables. These
77
+ * entropy tables allow zstd to save on header costs in the compressed file,
78
+ * which really matters for small data. The content is just bytes, which are
79
+ * repeated content that is common across many samples.
80
+ *
81
+ * What is a raw content dictionary?
82
+ * ---------------------------------
83
+ *
84
+ * A raw content dictionary is just bytes. It doesn't have a zstd dictionary
85
+ * header, a dictionary ID, or entropy tables. Any buffer is a valid raw
86
+ * content dictionary.
87
+ *
88
+ * How do I train a dictionary?
89
+ * ----------------------------
90
+ *
91
+ * Gather samples from your use case. These samples should be similar to each
92
+ * other. If you have several use cases, you could try to train one dictionary
93
+ * per use case.
94
+ *
95
+ * Pass those samples to `ZDICT_trainFromBuffer()` and that will train your
96
+ * dictionary. There are a few advanced versions of this function, but this
97
+ * is a great starting point. If you want to further tune your dictionary
98
+ * you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow
99
+ * you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`.
100
+ *
101
+ * If the dictionary training function fails, that is likely because you
102
+ * either passed too few samples, or a dictionary would not be effective
103
+ * for your data. Look at the messages that the dictionary trainer printed;
104
+ * if they don't mention too few samples, then a dictionary would not be effective.
105
+ *
106
+ * How large should my dictionary be?
107
+ * ----------------------------------
108
+ *
109
+ * A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB.
110
+ * The zstd CLI defaults to a 110KB dictionary. You likely don't need a
111
+ * dictionary larger than that. But, most use cases can get away with a
112
+ * smaller dictionary. The advanced dictionary builders can automatically
113
+ * shrink the dictionary for you, and select the smallest size that
114
+ * doesn't hurt compression ratio too much. See the `shrinkDict` parameter.
115
+ * A smaller dictionary can save memory, and potentially speed up
116
+ * compression.
117
+ *
118
+ * How many samples should I provide to the dictionary builder?
119
+ * ------------------------------------------------------------
120
+ *
121
+ * We generally recommend passing ~100x the size of the dictionary
122
+ * in samples. A few thousand should suffice. Having too few samples
123
+ * can hurt the dictionary's effectiveness. Having more samples will
123
+ * only improve the dictionary's effectiveness. But having too many
125
+ * samples can slow down the dictionary builder.
126
+ *
127
+ * How do I determine if a dictionary will be effective?
128
+ * -----------------------------------------------------
129
+ *
130
+ * Simply train a dictionary and try it out. You can use zstd's built in
131
+ * benchmarking tool to test the dictionary effectiveness.
132
+ *
133
+ * # Benchmark levels 1-3 without a dictionary
134
+ * zstd -b1e3 -r /path/to/my/files
135
+ * # Benchmark levels 1-3 with a dictionary
136
+ * zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary
137
+ *
138
+ * When should I retrain a dictionary?
139
+ * -----------------------------------
140
+ *
141
+ * You should retrain a dictionary when its effectiveness drops. Dictionary
142
+ * effectiveness drops as the data you are compressing changes. Generally, we do
143
+ * expect dictionaries to "decay" over time, as your data changes, but the rate
144
+ * at which they decay depends on your use case. Internally, we regularly
145
+ * retrain dictionaries, and if the new dictionary performs significantly
146
+ * better than the old dictionary, we will ship the new dictionary.
147
+ *
148
+ * I have a raw content dictionary, how do I turn it into a zstd dictionary?
149
+ * -------------------------------------------------------------------------
150
+ *
151
+ * If you have a raw content dictionary, e.g. by manually constructing it, or
152
+ * using a third-party dictionary builder, you can turn it into a zstd
153
+ * dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to
154
+ * provide some samples of the data. It will add the zstd header to the
155
+ * raw content, which contains a dictionary ID and entropy tables, which
156
+ * will improve compression ratio, and allow zstd to write the dictionary ID
157
+ * into the frame, if you so choose.
158
+ *
159
+ * Do I have to use zstd's dictionary builder?
160
+ * -------------------------------------------
161
+ *
162
+ * No! You can construct dictionary content however you please, it is just
163
+ * bytes. It will always be valid as a raw content dictionary. If you want
164
+ * a zstd dictionary, which can improve compression ratio, use
165
+ * `ZDICT_finalizeDictionary()`.
166
+ *
167
+ * What is the attack surface of a zstd dictionary?
168
+ * ------------------------------------------------
169
+ *
170
+ * Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so
171
+ * zstd should never crash, or access out-of-bounds memory no matter what
172
+ * the dictionary is. However, if an attacker can control the dictionary
173
+ * during decompression, they can cause zstd to generate arbitrary bytes,
174
+ * just like if they controlled the compressed data.
175
+ *
176
+ ******************************************************************************/
177
+
178
+
179
+ /*! ZDICT_trainFromBuffer():
180
+ * Train a dictionary from an array of samples.
181
+ * Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4,
182
+ * f=20, and accel=1.
183
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
184
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
185
+ * The resulting dictionary will be saved into `dictBuffer`.
186
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
187
+ * or an error code, which can be tested with ZDICT_isError().
188
+ * Note: Dictionary training will fail if there are not enough samples to construct a
189
+ * dictionary, or if most of the samples are too small (< 8 bytes being the lower limit).
190
+ * If dictionary training fails, you should use zstd without a dictionary, as the dictionary
191
+ * would've been ineffective anyways. If you believe your samples would benefit from a dictionary
192
+ * please open an issue with details, and we can look into it.
193
+ * Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB.
194
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
195
+ * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
196
+ * In general, it's recommended to provide a few thousand samples, though this can vary a lot.
197
+ * It's recommended that the total size of all samples be about 100 times the target size of the dictionary.
198
+ */
199
+ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
200
+ const void* samplesBuffer,
201
+ const size_t* samplesSizes, unsigned nbSamples);
202
+
203
+ typedef struct {
204
+ int compressionLevel; /**< optimize for a specific zstd compression level; 0 means default */
205
+ unsigned notificationLevel; /**< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
206
+ unsigned dictID; /**< force dictID value; 0 means auto mode (32-bit random value)
207
+ * NOTE: The zstd format reserves some dictionary IDs for future use.
208
+ * You may use them in private settings, but be warned that they
209
+ * may be used by zstd in a public dictionary registry in the future.
210
+ * These dictionary IDs are:
211
+ * - low range : <= 32767
212
+ * - high range : >= (2^31)
213
+ */
214
+ } ZDICT_params_t;
215
+
216
+ /*! ZDICT_finalizeDictionary():
217
+ * Given a custom content as a basis for dictionary, and a set of samples,
218
+ * finalize dictionary by adding headers and statistics according to the zstd
219
+ * dictionary format.
220
+ *
221
+ * Samples must be stored concatenated in a flat buffer `samplesBuffer`,
222
+ * supplied with an array of sizes `samplesSizes`, providing the size of each
223
+ * sample in order. The samples are used to construct the statistics, so they
224
+ * should be representative of what you will compress with this dictionary.
225
+ *
226
+ * The compression level can be set in `parameters`. You should pass the
227
+ * compression level you expect to use in production. The statistics for each
228
+ * compression level differ, so tuning the dictionary for the compression level
229
+ * can help quite a bit.
230
+ *
231
+ * You can set an explicit dictionary ID in `parameters`, or allow us to pick
232
+ * a random dictionary ID for you, but we can't guarantee no collisions.
233
+ *
234
+ * The dstDictBuffer and the dictContent may overlap, and the content will be
235
+ * appended to the end of the header. If the header + the content doesn't fit in
236
+ * maxDictSize the beginning of the content is truncated to make room, since it
237
+ * is presumed that the most profitable content is at the end of the dictionary,
238
+ * since that is the cheapest to reference.
239
+ *
240
+ * `maxDictSize` must be >= max(dictContentSize, ZDICT_DICTSIZE_MIN).
241
+ *
242
+ * @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`),
243
+ * or an error code, which can be tested by ZDICT_isError().
244
+ * Note: ZDICT_finalizeDictionary() will push notifications into stderr if
245
+ * instructed to, using notificationLevel>0.
246
+ * NOTE: This function currently may fail in several edge cases including:
247
+ * * Not enough samples
248
+ * * Samples are uncompressible
249
+ * * Samples are all exactly the same
250
+ */
251
+ ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize,
252
+ const void* dictContent, size_t dictContentSize,
253
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
254
+ ZDICT_params_t parameters);
255
+
256
+
257
+ /*====== Helper functions ======*/
258
+ ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID; @return zero if error (not a valid dictionary) */
259
+ ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /**< returns dict header size; returns a ZSTD error code on failure */
260
+ ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode); /**< tests whether a size_t value returned by a ZDICT function is an error code */
261
+ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode); /**< NOTE(review): presumably a readable name for `errorCode` - confirm against the implementation */
262
+
263
+
264
+
265
+ #ifdef ZDICT_STATIC_LINKING_ONLY
266
+
267
+ /* ====================================================================================
268
+ * The definitions in this section are considered experimental.
269
+ * They should never be used with a dynamic library, as they may change in the future.
270
+ * They are provided for advanced usages.
271
+ * Use them only in association with static linking.
272
+ * ==================================================================================== */
273
+
274
+ #define ZDICT_DICTSIZE_MIN 256
275
+ /* Deprecated: Remove in v1.6.0 */
276
+ #define ZDICT_CONTENTSIZE_MIN 128
277
+
278
+ /*! ZDICT_cover_params_t:
279
+ * k and d are the only required parameters.
280
+ * For others, value 0 means default.
281
+ */
282
+ typedef struct {
283
+ unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
284
+ unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
285
+ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
286
+ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
287
+ double splitPoint; /* Percentage of samples used for training : Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing : 0 means default (1.0) : 1.0 means all samples are used for both training and testing */
288
+ unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
289
+ unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
290
+ ZDICT_params_t zParams;
291
+ } ZDICT_cover_params_t;
292
+
293
+ typedef struct {
294
+ unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
295
+ unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
296
+ unsigned f; /* log of size of frequency array : constraint: 0 < f <= 31 : 0 means default(20) */
297
+ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
298
+ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
299
+ double splitPoint; /* Percentage of samples used for training : Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing : 0 means default (0.75) : 1.0 means all samples are used for both training and testing */
300
+ unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
301
+ unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
302
+ unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
303
+
304
+ ZDICT_params_t zParams;
305
+ } ZDICT_fastCover_params_t;
306
+
307
+ /*! ZDICT_trainFromBuffer_cover():
308
+ * Train a dictionary from an array of samples using the COVER algorithm.
309
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
310
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
311
+ * The resulting dictionary will be saved into `dictBuffer`.
312
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
313
+ * or an error code, which can be tested with ZDICT_isError().
314
+ * See ZDICT_trainFromBuffer() for details on failure modes.
315
+ * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
316
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
317
+ * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
318
+ * In general, it's recommended to provide a few thousand samples, though this can vary a lot.
319
+ * It's recommended that the total size of all samples be about 100 times the target size of the dictionary.
320
+ */
321
+ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
322
+ void *dictBuffer, size_t dictBufferCapacity,
323
+ const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
324
+ ZDICT_cover_params_t parameters);
325
+
326
+ /*! ZDICT_optimizeTrainFromBuffer_cover():
327
+ * The same requirements as above hold for all the parameters except `parameters`.
328
+ * This function tries many parameter combinations and picks the best parameters.
329
+ * `*parameters` is filled with the best parameters found,
330
+ * dictionary constructed with those parameters is stored in `dictBuffer`.
331
+ *
332
+ * All of the parameters d, k, steps are optional.
333
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
334
+ * If steps is zero it defaults to its default value.
335
+ * If k is non-zero then we don't check multiple values of k, otherwise we check `steps` values of k in [50, 2000].
336
+ *
337
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
338
+ * or an error code, which can be tested with ZDICT_isError().
339
+ * On success `*parameters` contains the parameters selected.
340
+ * See ZDICT_trainFromBuffer() for details on failure modes.
341
+ * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread.
342
+ */
343
+ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
344
+ void* dictBuffer, size_t dictBufferCapacity,
345
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
346
+ ZDICT_cover_params_t* parameters);
347
+
348
+ /*! ZDICT_trainFromBuffer_fastCover():
349
+ * Train a dictionary from an array of samples using a modified version of COVER algorithm.
350
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
351
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
352
+ * d and k are required.
353
+ * All other parameters are optional, will use default values if not provided
354
+ * The resulting dictionary will be saved into `dictBuffer`.
355
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
356
+ * or an error code, which can be tested with ZDICT_isError().
357
+ * See ZDICT_trainFromBuffer() for details on failure modes.
358
+ * Note: ZDICT_trainFromBuffer_fastCover() requires 6 * 2^f bytes of memory.
359
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
360
+ * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
361
+ * In general, it's recommended to provide a few thousand samples, though this can vary a lot.
362
+ * It's recommended that the total size of all samples be about 100 times the target size of the dictionary.
363
+ */
364
+ ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer,
365
+ size_t dictBufferCapacity, const void *samplesBuffer,
366
+ const size_t *samplesSizes, unsigned nbSamples,
367
+ ZDICT_fastCover_params_t parameters);
368
+
369
+ /*! ZDICT_optimizeTrainFromBuffer_fastCover():
370
+ * The same requirements as above hold for all the parameters except `parameters`.
371
+ * This function tries many parameter combinations (specifically, k and d combinations)
372
+ * and picks the best parameters. `*parameters` is filled with the best parameters found,
373
+ * dictionary constructed with those parameters is stored in `dictBuffer`.
374
+ * All of the parameters d, k, steps, f, and accel are optional.
375
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
376
+ * If steps is zero it defaults to its default value.
377
+ * If k is non-zero then we don't check multiple values of k, otherwise we check `steps` values of k in [50, 2000].
378
+ * If f is zero, default value of 20 is used.
379
+ * If accel is zero, default value of 1 is used.
380
+ *
381
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
382
+ * or an error code, which can be tested with ZDICT_isError().
383
+ * On success `*parameters` contains the parameters selected.
384
+ * See ZDICT_trainFromBuffer() for details on failure modes.
385
+ * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 6 * 2^f bytes of memory for each thread.
386
+ */
387
+ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
388
+ size_t dictBufferCapacity, const void* samplesBuffer,
389
+ const size_t* samplesSizes, unsigned nbSamples,
390
+ ZDICT_fastCover_params_t* parameters);
391
+
392
+ typedef struct {
393
+ unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */
394
+ ZDICT_params_t zParams;
395
+ } ZDICT_legacy_params_t;
396
+
397
+ /*! ZDICT_trainFromBuffer_legacy():
398
+ * Train a dictionary from an array of samples.
399
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
400
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
401
+ * The resulting dictionary will be saved into `dictBuffer`.
402
+ * `parameters` is optional and can be provided with values set to 0 to mean "default".
403
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
404
+ * or an error code, which can be tested with ZDICT_isError().
405
+ * See ZDICT_trainFromBuffer() for details on failure modes.
406
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
407
+ * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
408
+ * In general, it's recommended to provide a few thousand samples, though this can vary a lot.
409
+ * It's recommended that the total size of all samples be about 100 times the target size of the dictionary.
410
+ * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
411
+ */
412
+ ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
413
+ void* dictBuffer, size_t dictBufferCapacity,
414
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
415
+ ZDICT_legacy_params_t parameters);
416
+
417
+
418
+ /* Deprecation warnings */
419
+ /* It is generally possible to disable deprecation warnings from compiler,
420
+ for example with -Wno-deprecated-declarations for gcc
421
+ or _CRT_SECURE_NO_WARNINGS in Visual.
422
+ Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */
423
+ #ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS
424
+ # define ZDICT_DEPRECATED(message) ZDICTLIB_API /* disable deprecation warnings */
425
+ #else
426
+ # define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
427
+ # if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
428
+ # define ZDICT_DEPRECATED(message) [[deprecated(message)]] ZDICTLIB_API
429
+ # elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405)
430
+ # define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message)))
431
+ # elif (ZDICT_GCC_VERSION >= 301)
432
+ # define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated))
433
+ # elif defined(_MSC_VER)
434
+ # define ZDICT_DEPRECATED(message) ZDICTLIB_API __declspec(deprecated(message))
435
+ # else
436
+ # pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler")
437
+ # define ZDICT_DEPRECATED(message) ZDICTLIB_API
438
+ # endif
439
+ #endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */
440
+
441
+ ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead")
442
+ size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
443
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
444
+
445
+
446
+ #endif /* ZDICT_STATIC_LINKING_ONLY */
447
+
448
+ #if defined (__cplusplus)
449
+ }
450
+ #endif
451
+
452
+ #endif /* DICTBUILDER_H_001 */