zstdlib 0.7.0-x86-mingw32 → 0.8.0-x86-mingw32

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGES.md +5 -0
  3. data/ext/zstdlib/extconf.rb +1 -1
  4. data/ext/zstdlib/ruby/zlib-3.0/zstdlib.c +4994 -0
  5. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/bitstream.h +25 -16
  6. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/compiler.h +118 -4
  7. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/cpu.h +1 -3
  8. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/debug.c +1 -1
  9. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/debug.h +12 -19
  10. data/ext/zstdlib/zstd-1.5.0/lib/common/entropy_common.c +362 -0
  11. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/error_private.c +2 -1
  12. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/error_private.h +3 -3
  13. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/fse.h +40 -12
  14. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/fse_decompress.c +139 -22
  15. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/huf.h +29 -7
  16. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/mem.h +69 -98
  17. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/pool.c +23 -17
  18. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/pool.h +2 -2
  19. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/threading.c +6 -5
  20. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/threading.h +0 -0
  21. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/xxhash.c +20 -60
  22. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/xxhash.h +2 -2
  23. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/zstd_common.c +10 -10
  24. data/ext/zstdlib/zstd-1.5.0/lib/common/zstd_deps.h +111 -0
  25. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/zstd_internal.h +105 -62
  26. data/ext/zstdlib/zstd-1.5.0/lib/common/zstd_trace.h +154 -0
  27. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/fse_compress.c +31 -24
  28. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/hist.c +27 -29
  29. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/hist.h +2 -2
  30. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/huf_compress.c +265 -126
  31. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress.c +2843 -728
  32. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_internal.h +305 -63
  33. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_literals.c +8 -8
  34. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_literals.h +1 -1
  35. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_sequences.c +29 -7
  36. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_sequences.h +1 -1
  37. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_superblock.c +22 -295
  38. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_superblock.h +1 -1
  39. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_cwksp.h +204 -67
  40. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_double_fast.c +25 -25
  41. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_double_fast.h +1 -1
  42. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_fast.c +23 -23
  43. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_fast.h +1 -1
  44. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_lazy.c +2184 -0
  45. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_lazy.h +125 -0
  46. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_ldm.c +314 -211
  47. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_ldm.h +9 -2
  48. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_ldm_geartab.h +103 -0
  49. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_opt.c +191 -46
  50. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_opt.h +1 -1
  51. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstdmt_compress.c +93 -415
  52. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstdmt_compress.h +110 -0
  53. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/huf_decompress.c +342 -239
  54. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/zstd_ddict.c +9 -9
  55. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/zstd_ddict.h +2 -2
  56. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/zstd_decompress.c +369 -87
  57. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/zstd_decompress_block.c +191 -75
  58. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/zstd_decompress_block.h +6 -3
  59. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/zstd_decompress_internal.h +27 -11
  60. data/ext/zstdlib/zstd-1.5.0/lib/zdict.h +452 -0
  61. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/zstd.h +568 -126
  62. data/ext/zstdlib/{zstd-1.4.5/lib/common → zstd-1.5.0/lib}/zstd_errors.h +2 -1
  63. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/gzclose.c +0 -0
  64. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/gzcompatibility.h +1 -1
  65. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/gzguts.h +0 -0
  66. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/gzlib.c +0 -0
  67. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/gzread.c +0 -0
  68. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/gzwrite.c +0 -0
  69. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/zstd_zlibwrapper.c +126 -44
  70. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/zstd_zlibwrapper.h +1 -1
  71. data/lib/2.2/zstdlib.so +0 -0
  72. data/lib/2.3/zstdlib.so +0 -0
  73. data/lib/2.4/zstdlib.so +0 -0
  74. data/lib/2.5/zstdlib.so +0 -0
  75. data/lib/2.6/zstdlib.so +0 -0
  76. data/lib/2.7/zstdlib.so +0 -0
  77. metadata +69 -64
  78. data/ext/zstdlib/zstd-1.4.5/lib/common/entropy_common.c +0 -216
  79. data/ext/zstdlib/zstd-1.4.5/lib/compress/zstd_lazy.c +0 -1138
  80. data/ext/zstdlib/zstd-1.4.5/lib/compress/zstd_lazy.h +0 -67
  81. data/ext/zstdlib/zstd-1.4.5/lib/compress/zstdmt_compress.h +0 -192
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -15,7 +15,7 @@
15
15
  /*-*******************************************************
16
16
  * Dependencies
17
17
  *********************************************************/
18
- #include <stddef.h> /* size_t */
18
+ #include "../common/zstd_deps.h" /* size_t */
19
19
  #include "../zstd.h" /* DCtx, and some public functions */
20
20
  #include "../common/zstd_internal.h" /* blockProperties_t, and some public functions */
21
21
  #include "zstd_decompress_internal.h" /* ZSTD_seqSymbol */
@@ -48,12 +48,15 @@ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
48
48
  * this function must be called with valid parameters only
49
49
  * (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.)
50
50
  * in which case it cannot fail.
51
+ * The workspace must be 4-byte aligned and at least ZSTD_BUILD_FSE_TABLE_WKSP_SIZE bytes, which is
52
+ * defined in zstd_decompress_internal.h.
51
53
  * Internal use only.
52
54
  */
53
55
  void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
54
56
  const short* normalizedCounter, unsigned maxSymbolValue,
55
57
  const U32* baseValue, const U32* nbAdditionalBits,
56
- unsigned tableLog);
58
+ unsigned tableLog, void* wksp, size_t wkspSize,
59
+ int bmi2);
57
60
 
58
61
 
59
62
  #endif /* ZSTD_DEC_BLOCK_H */
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -27,26 +27,26 @@
27
27
  /*-*******************************************************
28
28
  * Constants
29
29
  *********************************************************/
30
- static const U32 LL_base[MaxLL+1] = {
30
+ static UNUSED_ATTR const U32 LL_base[MaxLL+1] = {
31
31
  0, 1, 2, 3, 4, 5, 6, 7,
32
32
  8, 9, 10, 11, 12, 13, 14, 15,
33
33
  16, 18, 20, 22, 24, 28, 32, 40,
34
34
  48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
35
35
  0x2000, 0x4000, 0x8000, 0x10000 };
36
36
 
37
- static const U32 OF_base[MaxOff+1] = {
37
+ static UNUSED_ATTR const U32 OF_base[MaxOff+1] = {
38
38
  0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D,
39
39
  0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD,
40
40
  0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
41
41
  0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD };
42
42
 
43
- static const U32 OF_bits[MaxOff+1] = {
43
+ static UNUSED_ATTR const U32 OF_bits[MaxOff+1] = {
44
44
  0, 1, 2, 3, 4, 5, 6, 7,
45
45
  8, 9, 10, 11, 12, 13, 14, 15,
46
46
  16, 17, 18, 19, 20, 21, 22, 23,
47
47
  24, 25, 26, 27, 28, 29, 30, 31 };
48
48
 
49
- static const U32 ML_base[MaxML+1] = {
49
+ static UNUSED_ATTR const U32 ML_base[MaxML+1] = {
50
50
  3, 4, 5, 6, 7, 8, 9, 10,
51
51
  11, 12, 13, 14, 15, 16, 17, 18,
52
52
  19, 20, 21, 22, 23, 24, 25, 26,
@@ -73,12 +73,16 @@ static const U32 ML_base[MaxML+1] = {
73
73
 
74
74
  #define SEQSYMBOL_TABLE_SIZE(log) (1 + (1 << (log)))
75
75
 
76
+ #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
77
+ #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32))
78
+
76
79
  typedef struct {
77
80
  ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */
78
81
  ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */
79
82
  ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
80
83
  HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
81
84
  U32 rep[ZSTD_REP_NUM];
85
+ U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32];
82
86
  } ZSTD_entropyDTables_t;
83
87
 
84
88
  typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
@@ -95,10 +99,12 @@ typedef enum {
95
99
  ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */
96
100
  } ZSTD_dictUses_e;
97
101
 
98
- typedef enum {
99
- ZSTD_obm_buffered = 0, /* Buffer the output */
100
- ZSTD_obm_stable = 1 /* ZSTD_outBuffer is stable */
101
- } ZSTD_outBufferMode_e;
102
+ /* Hashset for storing references to multiple ZSTD_DDict within ZSTD_DCtx */
103
+ typedef struct {
104
+ const ZSTD_DDict** ddictPtrTable;
105
+ size_t ddictPtrTableSize;
106
+ size_t ddictPtrCount;
107
+ } ZSTD_DDictHashSet;
102
108
 
103
109
  struct ZSTD_DCtx_s
104
110
  {
@@ -114,6 +120,7 @@ struct ZSTD_DCtx_s
114
120
  const void* dictEnd; /* end of previous segment */
115
121
  size_t expected;
116
122
  ZSTD_frameHeader fParams;
123
+ U64 processedCSize;
117
124
  U64 decodedSize;
118
125
  blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */
119
126
  ZSTD_dStage stage;
@@ -122,6 +129,8 @@ struct ZSTD_DCtx_s
122
129
  XXH64_state_t xxhState;
123
130
  size_t headerSize;
124
131
  ZSTD_format_e format;
132
+ ZSTD_forceIgnoreChecksum_e forceIgnoreChecksum; /* User specified: if == 1, will ignore checksums in compressed frame. Default == 0 */
133
+ U32 validateChecksum; /* if == 1, will validate checksum. Is == 1 if (fParams.checksumFlag == 1) and (forceIgnoreChecksum == 0). */
125
134
  const BYTE* litPtr;
126
135
  ZSTD_customMem customMem;
127
136
  size_t litSize;
@@ -135,6 +144,8 @@ struct ZSTD_DCtx_s
135
144
  U32 dictID;
136
145
  int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
137
146
  ZSTD_dictUses_e dictUses;
147
+ ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */
148
+ ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
138
149
 
139
150
  /* streaming */
140
151
  ZSTD_dStreamStage streamStage;
@@ -152,7 +163,7 @@ struct ZSTD_DCtx_s
152
163
  U32 legacyVersion;
153
164
  U32 hostageByte;
154
165
  int noForwardProgress;
155
- ZSTD_outBufferMode_e outBufferMode;
166
+ ZSTD_bufferMode_e outBufferMode;
156
167
  ZSTD_outBuffer expectedOutBuffer;
157
168
 
158
169
  /* workspace */
@@ -165,6 +176,11 @@ struct ZSTD_DCtx_s
165
176
  void const* dictContentBeginForFuzzing;
166
177
  void const* dictContentEndForFuzzing;
167
178
  #endif
179
+
180
+ /* Tracing */
181
+ #if ZSTD_TRACE
182
+ ZSTD_TraceCtx traceCtx;
183
+ #endif
168
184
  }; /* typedef'd to ZSTD_DCtx within "zstd.h" */
169
185
 
170
186
 
@@ -183,7 +199,7 @@ size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
183
199
  * If yes, do nothing (continue on current segment).
184
200
  * If not, classify previous segment as "external dictionary", and start a new segment.
185
201
  * This function cannot fail. */
186
- void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst);
202
+ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize);
187
203
 
188
204
 
189
205
  #endif /* ZSTD_DECOMPRESS_INTERNAL_H */
@@ -0,0 +1,452 @@
1
+ /*
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
11
+ #ifndef DICTBUILDER_H_001
12
+ #define DICTBUILDER_H_001
13
+
14
+ #if defined (__cplusplus)
15
+ extern "C" {
16
+ #endif
17
+
18
+
19
+ /*====== Dependencies ======*/
20
+ #include <stddef.h> /* size_t */
21
+
22
+
23
+ /* ===== ZDICTLIB_API : control library symbols visibility ===== */
24
+ #ifndef ZDICTLIB_VISIBILITY
25
+ # if defined(__GNUC__) && (__GNUC__ >= 4)
26
+ # define ZDICTLIB_VISIBILITY __attribute__ ((visibility ("default")))
27
+ # else
28
+ # define ZDICTLIB_VISIBILITY
29
+ # endif
30
+ #endif
31
+ #if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
32
+ # define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBILITY
33
+ #elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
34
+ # define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
35
+ #else
36
+ # define ZDICTLIB_API ZDICTLIB_VISIBILITY
37
+ #endif
38
+
39
+ /*******************************************************************************
40
+ * Zstd dictionary builder
41
+ *
42
+ * FAQ
43
+ * ===
44
+ * Why should I use a dictionary?
45
+ * ------------------------------
46
+ *
47
+ * Zstd can use dictionaries to improve compression ratio of small data.
48
+ * Traditionally small files don't compress well because there is very little
49
+ * repetition in a single sample, since it is small. But, if you are compressing
50
+ * many similar files, like a bunch of JSON records that share the same
51
+ * structure, you can train a dictionary ahead of time on some samples of
52
+ * these files. Then, zstd can use the dictionary to find repetitions that are
53
+ * present across samples. This can vastly improve compression ratio.
54
+ *
55
+ * When is a dictionary useful?
56
+ * ----------------------------
57
+ *
58
+ * Dictionaries are useful when compressing many small files that are similar.
59
+ * The larger a file is, the less benefit a dictionary will have. Generally,
60
+ * we don't expect dictionary compression to be effective past 100KB. And the
61
+ * smaller a file is, the more we would expect the dictionary to help.
62
+ *
63
+ * How do I use a dictionary?
64
+ * --------------------------
65
+ *
66
+ * Simply pass the dictionary to the zstd compressor with
67
+ * `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to
68
+ * the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other
69
+ * more advanced functions that allow selecting some options, see zstd.h for
70
+ * complete documentation.
71
+ *
72
+ * What is a zstd dictionary?
73
+ * --------------------------
74
+ *
75
+ * A zstd dictionary has two pieces: Its header, and its content. The header
76
+ * contains a magic number, the dictionary ID, and entropy tables. These
77
+ * entropy tables allow zstd to save on header costs in the compressed file,
78
+ * which really matters for small data. The content is just bytes, which are
79
+ * repeated content that is common across many samples.
80
+ *
81
+ * What is a raw content dictionary?
82
+ * ---------------------------------
83
+ *
84
+ * A raw content dictionary is just bytes. It doesn't have a zstd dictionary
85
+ * header, a dictionary ID, or entropy tables. Any buffer is a valid raw
86
+ * content dictionary.
87
+ *
88
+ * How do I train a dictionary?
89
+ * ----------------------------
90
+ *
91
+ * Gather samples from your use case. These samples should be similar to each
92
+ * other. If you have several use cases, you could try to train one dictionary
93
+ * per use case.
94
+ *
95
+ * Pass those samples to `ZDICT_trainFromBuffer()` and that will train your
96
+ * dictionary. There are a few advanced versions of this function, but this
97
+ * is a great starting point. If you want to further tune your dictionary
98
+ * you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow
99
+ * you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`.
100
+ *
101
+ * If the dictionary training function fails, that is likely because you
102
+ * either passed too few samples, or a dictionary would not be effective
103
+ * for your data. Look at the messages that the dictionary trainer printed,
104
+ * if it doesn't say too few samples, then a dictionary would not be effective.
105
+ *
106
+ * How large should my dictionary be?
107
+ * ----------------------------------
108
+ *
109
+ * A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB.
110
+ * The zstd CLI defaults to a 110KB dictionary. You likely don't need a
111
+ * dictionary larger than that. But, most use cases can get away with a
112
+ * smaller dictionary. The advanced dictionary builders can automatically
113
+ * shrink the dictionary for you, and select the smallest size that
114
+ * doesn't hurt compression ratio too much. See the `shrinkDict` parameter.
115
+ * A smaller dictionary can save memory, and potentially speed up
116
+ * compression.
117
+ *
118
+ * How many samples should I provide to the dictionary builder?
119
+ * ------------------------------------------------------------
120
+ *
121
+ * We generally recommend passing ~100x the size of the dictionary
122
+ * in samples. A few thousand should suffice. Having too few samples
123
+ * can hurt the dictionary's effectiveness. Having more samples will
124
+ * only improve the dictionary's effectiveness. But having too many
125
+ * samples can slow down the dictionary builder.
126
+ *
127
+ * How do I determine if a dictionary will be effective?
128
+ * -----------------------------------------------------
129
+ *
130
+ * Simply train a dictionary and try it out. You can use zstd's built in
131
+ * benchmarking tool to test the dictionary effectiveness.
132
+ *
133
+ * # Benchmark levels 1-3 without a dictionary
134
+ * zstd -b1e3 -r /path/to/my/files
135
+ * # Benchmark levels 1-3 with a dictionary
136
+ * zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary
137
+ *
138
+ * When should I retrain a dictionary?
139
+ * -----------------------------------
140
+ *
141
+ * You should retrain a dictionary when its effectiveness drops. Dictionary
142
+ * effectiveness drops as the data you are compressing changes. Generally, we do
143
+ * expect dictionaries to "decay" over time, as your data changes, but the rate
144
+ * at which they decay depends on your use case. Internally, we regularly
145
+ * retrain dictionaries, and if the new dictionary performs significantly
146
+ * better than the old dictionary, we will ship the new dictionary.
147
+ *
148
+ * I have a raw content dictionary, how do I turn it into a zstd dictionary?
149
+ * -------------------------------------------------------------------------
150
+ *
151
+ * If you have a raw content dictionary, e.g. by manually constructing it, or
152
+ * using a third-party dictionary builder, you can turn it into a zstd
153
+ * dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to
154
+ * provide some samples of the data. It will add the zstd header to the
155
+ * raw content, which contains a dictionary ID and entropy tables, which
156
+ * will improve compression ratio, and allow zstd to write the dictionary ID
157
+ * into the frame, if you so choose.
158
+ *
159
+ * Do I have to use zstd's dictionary builder?
160
+ * -------------------------------------------
161
+ *
162
+ * No! You can construct dictionary content however you please, it is just
163
+ * bytes. It will always be valid as a raw content dictionary. If you want
164
+ * a zstd dictionary, which can improve compression ratio, use
165
+ * `ZDICT_finalizeDictionary()`.
166
+ *
167
+ * What is the attack surface of a zstd dictionary?
168
+ * ------------------------------------------------
169
+ *
170
+ * Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so
171
+ * zstd should never crash, or access out-of-bounds memory no matter what
172
+ * the dictionary is. However, if an attacker can control the dictionary
173
+ * during decompression, they can cause zstd to generate arbitrary bytes,
174
+ * just like if they controlled the compressed data.
175
+ *
176
+ ******************************************************************************/
177
+
178
+
179
+ /*! ZDICT_trainFromBuffer():
180
+ * Train a dictionary from an array of samples.
181
+ * Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4,
182
+ * f=20, and accel=1.
183
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
184
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
185
+ * The resulting dictionary will be saved into `dictBuffer`.
186
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
187
+ * or an error code, which can be tested with ZDICT_isError().
188
+ * Note: Dictionary training will fail if there are not enough samples to construct a
189
+ * dictionary, or if most of the samples are too small (< 8 bytes being the lower limit).
190
+ * If dictionary training fails, you should use zstd without a dictionary, as the dictionary
191
+ * would've been ineffective anyways. If you believe your samples would benefit from a dictionary
192
+ * please open an issue with details, and we can look into it.
193
+ * Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB.
194
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
195
+ * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
196
+ * In general, it's recommended to provide a few thousand samples, though this can vary a lot.
197
+ * It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
198
+ */
199
+ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
200
+ const void* samplesBuffer,
201
+ const size_t* samplesSizes, unsigned nbSamples);
202
+
203
+ typedef struct {
204
+ int compressionLevel; /*< optimize for a specific zstd compression level; 0 means default */
205
+ unsigned notificationLevel; /*< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
206
+ unsigned dictID; /*< force dictID value; 0 means auto mode (32-bits random value)
207
+ * NOTE: The zstd format reserves some dictionary IDs for future use.
208
+ * You may use them in private settings, but be warned that they
209
+ * may be used by zstd in a public dictionary registry in the future.
210
+ * These dictionary IDs are:
211
+ * - low range : <= 32767
212
+ * - high range : >= (2^31)
213
+ */
214
+ } ZDICT_params_t;
215
+
216
+ /*! ZDICT_finalizeDictionary():
217
+ * Given a custom content as a basis for dictionary, and a set of samples,
218
+ * finalize dictionary by adding headers and statistics according to the zstd
219
+ * dictionary format.
220
+ *
221
+ * Samples must be stored concatenated in a flat buffer `samplesBuffer`,
222
+ * supplied with an array of sizes `samplesSizes`, providing the size of each
223
+ * sample in order. The samples are used to construct the statistics, so they
224
+ * should be representative of what you will compress with this dictionary.
225
+ *
226
+ * The compression level can be set in `parameters`. You should pass the
227
+ * compression level you expect to use in production. The statistics for each
228
+ * compression level differ, so tuning the dictionary for the compression level
229
+ * can help quite a bit.
230
+ *
231
+ * You can set an explicit dictionary ID in `parameters`, or allow us to pick
232
+ * a random dictionary ID for you, but we can't guarantee no collisions.
233
+ *
234
+ * The dstDictBuffer and the dictContent may overlap, and the content will be
235
+ * appended to the end of the header. If the header + the content doesn't fit in
236
+ * maxDictSize the beginning of the content is truncated to make room, since it
237
+ * is presumed that the most profitable content is at the end of the dictionary,
238
+ * since that is the cheapest to reference.
239
+ *
240
+ * `dictContentSize` must be >= ZDICT_CONTENTSIZE_MIN bytes.
241
+ * `maxDictSize` must be >= max(dictContentSize, ZDICT_DICTSIZE_MIN).
242
+ *
243
+ * @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`),
244
+ * or an error code, which can be tested by ZDICT_isError().
245
+ * Note: ZDICT_finalizeDictionary() will push notifications into stderr if
246
+ * instructed to, using notificationLevel>0.
247
+ * NOTE: This function currently may fail in several edge cases including:
248
+ * * Not enough samples
249
+ * * Samples are uncompressible
250
+ * * Samples are all exactly the same
251
+ */
252
+ ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize,
253
+ const void* dictContent, size_t dictContentSize,
254
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
255
+ ZDICT_params_t parameters);
256
+
257
+
258
+ /*====== Helper functions ======*/
259
+ ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID; @return zero if error (not a valid dictionary) */
260
+ ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /* returns dict header size; returns a ZSTD error code on failure */
261
+ ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode);
262
+ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
263
+
264
+
265
+
266
+ #ifdef ZDICT_STATIC_LINKING_ONLY
267
+
268
+ /* ====================================================================================
269
+ * The definitions in this section are considered experimental.
270
+ * They should never be used with a dynamic library, as they may change in the future.
271
+ * They are provided for advanced usages.
272
+ * Use them only in association with static linking.
273
+ * ==================================================================================== */
274
+
275
+ #define ZDICT_CONTENTSIZE_MIN 128
276
+ #define ZDICT_DICTSIZE_MIN 256
277
+
278
+ /*! ZDICT_cover_params_t:
279
+ * k and d are the only required parameters.
280
+ * For others, value 0 means default.
281
+ */
282
+ typedef struct {
283
+ unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
284
+ unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
285
+ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
286
+ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
287
+ double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
288
+ unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
289
+ unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
290
+ ZDICT_params_t zParams;
291
+ } ZDICT_cover_params_t;
292
+
293
+ typedef struct {
294
+ unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
295
+ unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
296
+ unsigned f; /* log of size of frequency array : constraint: 0 < f <= 31 : 1 means default(20)*/
297
+ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
298
+ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
299
+ double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
300
+ unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
301
+ unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
302
+ unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
303
+
304
+ ZDICT_params_t zParams;
305
+ } ZDICT_fastCover_params_t;
306
+
307
+ /*! ZDICT_trainFromBuffer_cover():
308
+ * Train a dictionary from an array of samples using the COVER algorithm.
309
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
310
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
311
+ * The resulting dictionary will be saved into `dictBuffer`.
312
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
313
+ * or an error code, which can be tested with ZDICT_isError().
314
+ * See ZDICT_trainFromBuffer() for details on failure modes.
315
+ * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
316
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
317
+ * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
318
+ * In general, it's recommended to provide a few thousand samples, though this can vary a lot.
319
+ * It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
320
+ */
321
+ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
322
+ void *dictBuffer, size_t dictBufferCapacity,
323
+ const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
324
+ ZDICT_cover_params_t parameters);
325
+
326
+ /*! ZDICT_optimizeTrainFromBuffer_cover():
327
+ * The same requirements as for ZDICT_trainFromBuffer_cover() hold for all the parameters except `parameters`.
328
+ * This function tries many parameter combinations and picks the best parameters.
329
+ * `*parameters` is filled with the best parameters found, and the
330
+ * dictionary constructed with those parameters is stored in `dictBuffer`.
331
+ *
332
+ * All of the parameters d, k, steps are optional.
333
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
334
+ * If steps is zero, its default value is used.
335
+ * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000].
336
+ *
337
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
338
+ * or an error code, which can be tested with ZDICT_isError().
339
+ * On success `*parameters` contains the parameters selected.
340
+ * See ZDICT_trainFromBuffer() for details on failure modes.
341
+ * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread.
342
+ */
343
+ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
344
+ void* dictBuffer, size_t dictBufferCapacity,
345
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
346
+ ZDICT_cover_params_t* parameters);
347
+
348
+ /*! ZDICT_trainFromBuffer_fastCover():
349
+ * Train a dictionary from an array of samples using a modified version of the COVER algorithm.
350
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
351
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
352
+ * d and k are required.
353
+ * All other parameters are optional and will use default values if not provided.
354
+ * The resulting dictionary will be saved into `dictBuffer`.
355
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
356
+ * or an error code, which can be tested with ZDICT_isError().
357
+ * See ZDICT_trainFromBuffer() for details on failure modes.
358
+ * Note: ZDICT_trainFromBuffer_fastCover() requires 6 * 2^f bytes of memory.
359
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
360
+ * It's possible to select a smaller or larger size, just by specifying `dictBufferCapacity`.
361
+ * In general, it's recommended to provide a few thousand samples, though this can vary a lot.
362
+ * It's recommended that the total size of all samples be about 100 times the target size of the dictionary.
363
+ */
364
+ ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer,
365
+ size_t dictBufferCapacity, const void *samplesBuffer,
366
+ const size_t *samplesSizes, unsigned nbSamples,
367
+ ZDICT_fastCover_params_t parameters);
368
+
369
+ /*! ZDICT_optimizeTrainFromBuffer_fastCover():
370
+ * The same requirements as for ZDICT_trainFromBuffer_fastCover() hold for all the parameters except `parameters`.
371
+ * This function tries many parameter combinations (specifically, k and d combinations)
372
+ * and picks the best parameters. `*parameters` is filled with the best parameters found, and the
373
+ * dictionary constructed with those parameters is stored in `dictBuffer`.
374
+ * All of the parameters d, k, steps, f, and accel are optional.
375
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
376
+ * If steps is zero, its default value is used.
377
+ * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000].
378
+ * If f is zero, default value of 20 is used.
379
+ * If accel is zero, default value of 1 is used.
380
+ *
381
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
382
+ * or an error code, which can be tested with ZDICT_isError().
383
+ * On success `*parameters` contains the parameters selected.
384
+ * See ZDICT_trainFromBuffer() for details on failure modes.
385
+ * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 6 * 2^f bytes of memory for each thread.
386
+ */
387
+ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
388
+ size_t dictBufferCapacity, const void* samplesBuffer,
389
+ const size_t* samplesSizes, unsigned nbSamples,
390
+ ZDICT_fastCover_params_t* parameters);
391
+
392
+ typedef struct { /* parameter set taken by ZDICT_trainFromBuffer_legacy() */
393
+ unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */
394
+ ZDICT_params_t zParams; /* ZDICT_params_t is defined outside this excerpt */
395
+ } ZDICT_legacy_params_t;
396
+
397
+ /*! ZDICT_trainFromBuffer_legacy():
398
+ * Train a dictionary from an array of samples.
399
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
400
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
401
+ * The resulting dictionary will be saved into `dictBuffer`.
402
+ * `parameters` is optional and can be provided with values set to 0 to mean "default".
403
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
404
+ * or an error code, which can be tested with ZDICT_isError().
405
+ * See ZDICT_trainFromBuffer() for details on failure modes.
406
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
407
+ * It's possible to select a smaller or larger size, just by specifying `dictBufferCapacity`.
408
+ * In general, it's recommended to provide a few thousand samples, though this can vary a lot.
409
+ * It's recommended that the total size of all samples be about 100 times the target size of the dictionary.
410
+ * Note: ZDICT_trainFromBuffer_legacy() will send notifications to stderr if instructed to, using notificationLevel>0.
411
+ */
412
+ ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
413
+ void* dictBuffer, size_t dictBufferCapacity,
414
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
415
+ ZDICT_legacy_params_t parameters);
416
+
417
+
418
+ /* Deprecation warnings */
419
+ /* It is generally possible to disable deprecation warnings from the compiler,
420
+ for example with -Wno-deprecated-declarations for gcc
421
+ or _CRT_SECURE_NO_WARNINGS in Visual Studio.
422
+ Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */
423
+ #ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS
424
+ # define ZDICT_DEPRECATED(message) ZDICTLIB_API /* disable deprecation warnings */
425
+ #else
426
+ # define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) /* major * 100 + minor, e.g. 4.5 -> 405 */
427
+ # if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
428
+ # define ZDICT_DEPRECATED(message) [[deprecated(message)]] ZDICTLIB_API
429
+ # elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405)
430
+ # define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message))) /* attribute carries the message */
431
+ # elif (ZDICT_GCC_VERSION >= 301)
432
+ # define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated)) /* message is dropped in this branch */
433
+ # elif defined(_MSC_VER)
434
+ # define ZDICT_DEPRECATED(message) ZDICTLIB_API __declspec(deprecated(message))
435
+ # else
436
+ # pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler")
437
+ # define ZDICT_DEPRECATED(message) ZDICTLIB_API /* fallback: plain export, no deprecation marker */
438
+ # endif
439
+ #endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */
440
+
441
+ ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead")
442
+ size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
443
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); /* declared deprecated via ZDICT_DEPRECATED; its message points callers to ZDICT_finalizeDictionary() */
444
+
445
+
446
+ #endif /* ZDICT_STATIC_LINKING_ONLY */
447
+
448
+ #if defined (__cplusplus)
449
+ }
450
+ #endif
451
+
452
+ #endif /* DICTBUILDER_H_001 */