zstd-ruby 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +11 -0
  6. data/CODE_OF_CONDUCT.md +74 -0
  7. data/Gemfile +4 -0
  8. data/LICENSE.txt +29 -0
  9. data/README.md +63 -0
  10. data/Rakefile +22 -0
  11. data/bin/console +14 -0
  12. data/bin/setup +8 -0
  13. data/ext/zstdruby/extconf.rb +23 -0
  14. data/ext/zstdruby/libzstd/.gitignore +2 -0
  15. data/ext/zstdruby/libzstd/Makefile +133 -0
  16. data/ext/zstdruby/libzstd/README.md +77 -0
  17. data/ext/zstdruby/libzstd/common/bitstream.h +414 -0
  18. data/ext/zstdruby/libzstd/common/entropy_common.c +227 -0
  19. data/ext/zstdruby/libzstd/common/error_private.c +43 -0
  20. data/ext/zstdruby/libzstd/common/error_private.h +76 -0
  21. data/ext/zstdruby/libzstd/common/fse.h +668 -0
  22. data/ext/zstdruby/libzstd/common/fse_decompress.c +329 -0
  23. data/ext/zstdruby/libzstd/common/huf.h +238 -0
  24. data/ext/zstdruby/libzstd/common/mem.h +372 -0
  25. data/ext/zstdruby/libzstd/common/xxhash.c +867 -0
  26. data/ext/zstdruby/libzstd/common/xxhash.h +309 -0
  27. data/ext/zstdruby/libzstd/common/zstd_common.c +77 -0
  28. data/ext/zstdruby/libzstd/common/zstd_errors.h +60 -0
  29. data/ext/zstdruby/libzstd/common/zstd_internal.h +270 -0
  30. data/ext/zstdruby/libzstd/compress/fse_compress.c +850 -0
  31. data/ext/zstdruby/libzstd/compress/huf_compress.c +609 -0
  32. data/ext/zstdruby/libzstd/compress/zstd_compress.c +3291 -0
  33. data/ext/zstdruby/libzstd/compress/zstd_opt.h +919 -0
  34. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +885 -0
  35. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +2154 -0
  36. data/ext/zstdruby/libzstd/deprecated/zbuff.h +210 -0
  37. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +145 -0
  38. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +74 -0
  39. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1913 -0
  40. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.h +67 -0
  41. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +1012 -0
  42. data/ext/zstdruby/libzstd/dictBuilder/zdict.h +111 -0
  43. data/ext/zstdruby/libzstd/dll/example/Makefile +47 -0
  44. data/ext/zstdruby/libzstd/dll/example/README.md +69 -0
  45. data/ext/zstdruby/libzstd/dll/example/build_package.bat +17 -0
  46. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.sln +25 -0
  47. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.vcxproj +179 -0
  48. data/ext/zstdruby/libzstd/dll/libzstd.def +86 -0
  49. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +259 -0
  50. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +2095 -0
  51. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +80 -0
  52. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +3518 -0
  53. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +79 -0
  54. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +3159 -0
  55. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +79 -0
  56. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +3795 -0
  57. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +128 -0
  58. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +4056 -0
  59. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +149 -0
  60. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +4167 -0
  61. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +159 -0
  62. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +4540 -0
  63. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +173 -0
  64. data/ext/zstdruby/libzstd/libzstd.pc.in +14 -0
  65. data/ext/zstdruby/libzstd/zstd.h +673 -0
  66. data/ext/zstdruby/zstdruby.c +117 -0
  67. data/ext/zstdruby/zstdruby.h +6 -0
  68. data/lib/zstd-ruby.rb +6 -0
  69. data/lib/zstd-ruby/version.rb +3 -0
  70. data/zstd-ruby.gemspec +37 -0
  71. metadata +170 -0
@@ -0,0 +1,67 @@
1
+ /*
2
+ * divsufsort.h for libdivsufsort-lite
3
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
4
+ *
5
+ * Permission is hereby granted, free of charge, to any person
6
+ * obtaining a copy of this software and associated documentation
7
+ * files (the "Software"), to deal in the Software without
8
+ * restriction, including without limitation the rights to use,
9
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ * copies of the Software, and to permit persons to whom the
11
+ * Software is furnished to do so, subject to the following
12
+ * conditions:
13
+ *
14
+ * The above copyright notice and this permission notice shall be
15
+ * included in all copies or substantial portions of the Software.
16
+ *
17
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24
+ * OTHER DEALINGS IN THE SOFTWARE.
25
+ */
26
+
27
+ #ifndef _DIVSUFSORT_H
28
+ #define _DIVSUFSORT_H 1
29
+
30
+ #ifdef __cplusplus
31
+ extern "C" {
32
+ #endif /* __cplusplus */
33
+
34
+
35
+ /*- Prototypes -*/
36
+
37
+ /**
38
+ * Constructs the suffix array of a given string.
39
+ * @param T [0..n-1] The input string.
40
+ * @param SA [0..n-1] The output array of suffixes.
41
+ * @param n The length of the given string.
42
+ * @param openMP enables OpenMP optimization.
43
+ * @return 0 if no error occurred, -1 or -2 otherwise.
44
+ */
45
+ int
46
+ divsufsort(const unsigned char *T, int *SA, int n, int openMP);
47
+
48
+ /**
49
+ * Constructs the burrows-wheeler transformed string of a given string.
50
+ * @param T [0..n-1] The input string.
51
+ * @param U [0..n-1] The output string. (can be T)
52
+ * @param A [0..n-1] The temporary array. (can be NULL)
53
+ * @param n The length of the given string.
54
+ * @param num_indexes The length of secondary indexes array. (can be NULL)
55
+ * @param indexes The secondary indexes array. (can be NULL)
56
+ * @param openMP enables OpenMP optimization.
57
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
58
+ */
59
+ int
60
+ divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP);
61
+
62
+
63
+ #ifdef __cplusplus
64
+ } /* extern "C" */
65
+ #endif /* __cplusplus */
66
+
67
+ #endif /* _DIVSUFSORT_H */
@@ -0,0 +1,1012 @@
1
+ /**
2
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under the BSD-style license found in the
6
+ * LICENSE file in the root directory of this source tree. An additional grant
7
+ * of patent rights can be found in the PATENTS file in the same directory.
8
+ */
9
+
10
+
11
+ /*-**************************************
12
+ * Tuning parameters
13
+ ****************************************/
14
+ #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
15
+ #define ZDICT_MIN_SAMPLES_SIZE 512
16
+
17
+
18
+ /*-**************************************
19
+ * Compiler Options
20
+ ****************************************/
21
+ /* Unix Large Files support (>4GB) */
22
+ #define _FILE_OFFSET_BITS 64
23
+ #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
24
+ # define _LARGEFILE_SOURCE
25
+ #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
26
+ # define _LARGEFILE64_SOURCE
27
+ #endif
28
+
29
+
30
+ /*-*************************************
31
+ * Dependencies
32
+ ***************************************/
33
+ #include <stdlib.h> /* malloc, free */
34
+ #include <string.h> /* memset */
35
+ #include <stdio.h> /* fprintf, fopen, ftello64 */
36
+ #include <time.h> /* clock */
37
+
38
+ #include "mem.h" /* read */
39
+ #include "error_private.h"
40
+ #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
41
+ #define HUF_STATIC_LINKING_ONLY
42
+ #include "huf.h"
43
+ #include "zstd_internal.h" /* includes zstd.h */
44
+ #include "xxhash.h"
45
+ #include "divsufsort.h"
46
+ #ifndef ZDICT_STATIC_LINKING_ONLY
47
+ # define ZDICT_STATIC_LINKING_ONLY
48
+ #endif
49
+ #include "zdict.h"
50
+
51
+
52
+ /*-*************************************
53
+ * Constants
54
+ ***************************************/
55
+ #define KB *(1 <<10)
56
+ #define MB *(1 <<20)
57
+ #define GB *(1U<<30)
58
+
59
+ #define DICTLISTSIZE_DEFAULT 10000
60
+
61
+ #define NOISELENGTH 32
62
+
63
+ #define MINRATIO 4
64
+ static const int g_compressionLevel_default = 5;
65
+ static const U32 g_selectivity_default = 9;
66
+ static const size_t g_provision_entropySize = 200;
67
+ static const size_t g_min_fast_dictContent = 192;
68
+
69
+
70
+ /*-*************************************
71
+ * Console display
72
+ ***************************************/
73
+ #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
74
+ #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
75
+
76
+ static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
77
+
78
+ static void ZDICT_printHex(const void* ptr, size_t length)
79
+ {
80
+ const BYTE* const b = (const BYTE*)ptr;
81
+ size_t u;
82
+ for (u=0; u<length; u++) {
83
+ BYTE c = b[u];
84
+ if (c<32 || c>126) c = '.'; /* non-printable char */
85
+ DISPLAY("%c", c);
86
+ }
87
+ }
88
+
89
+
90
+ /*-********************************************************
91
+ * Helper functions
92
+ **********************************************************/
93
+ unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
94
+
95
+ const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
96
+
97
+ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
98
+ {
99
+ if (dictSize < 8) return 0;
100
+ if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0;
101
+ return MEM_readLE32((const char*)dictBuffer + 4);
102
+ }
103
+
104
+
105
+ /*-********************************************************
106
+ * Dictionary training functions
107
+ **********************************************************/
108
+ static unsigned ZDICT_NbCommonBytes (register size_t val)
109
+ {
110
+ if (MEM_isLittleEndian()) {
111
+ if (MEM_64bits()) {
112
+ # if defined(_MSC_VER) && defined(_WIN64)
113
+ unsigned long r = 0;
114
+ _BitScanForward64( &r, (U64)val );
115
+ return (unsigned)(r>>3);
116
+ # elif defined(__GNUC__) && (__GNUC__ >= 3)
117
+ return (__builtin_ctzll((U64)val) >> 3);
118
+ # else
119
+ static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
120
+ return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
121
+ # endif
122
+ } else { /* 32 bits */
123
+ # if defined(_MSC_VER)
124
+ unsigned long r=0;
125
+ _BitScanForward( &r, (U32)val );
126
+ return (unsigned)(r>>3);
127
+ # elif defined(__GNUC__) && (__GNUC__ >= 3)
128
+ return (__builtin_ctz((U32)val) >> 3);
129
+ # else
130
+ static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
131
+ return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
132
+ # endif
133
+ }
134
+ } else { /* Big Endian CPU */
135
+ if (MEM_64bits()) {
136
+ # if defined(_MSC_VER) && defined(_WIN64)
137
+ unsigned long r = 0;
138
+ _BitScanReverse64( &r, val );
139
+ return (unsigned)(r>>3);
140
+ # elif defined(__GNUC__) && (__GNUC__ >= 3)
141
+ return (__builtin_clzll(val) >> 3);
142
+ # else
143
+ unsigned r;
144
+ const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
145
+ if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
146
+ if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
147
+ r += (!val);
148
+ return r;
149
+ # endif
150
+ } else { /* 32 bits */
151
+ # if defined(_MSC_VER)
152
+ unsigned long r = 0;
153
+ _BitScanReverse( &r, (unsigned long)val );
154
+ return (unsigned)(r>>3);
155
+ # elif defined(__GNUC__) && (__GNUC__ >= 3)
156
+ return (__builtin_clz((U32)val) >> 3);
157
+ # else
158
+ unsigned r;
159
+ if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
160
+ r += (!val);
161
+ return r;
162
+ # endif
163
+ } }
164
+ }
165
+
166
+
167
+ /*! ZDICT_count() :
168
+ Count the nb of common bytes between 2 pointers.
169
+ Note : this function presumes end of buffer followed by noisy guard band.
170
+ */
171
+ static size_t ZDICT_count(const void* pIn, const void* pMatch)
172
+ {
173
+ const char* const pStart = (const char*)pIn;
174
+ for (;;) {
175
+ size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
176
+ if (!diff) {
177
+ pIn = (const char*)pIn+sizeof(size_t);
178
+ pMatch = (const char*)pMatch+sizeof(size_t);
179
+ continue;
180
+ }
181
+ pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
182
+ return (size_t)((const char*)pIn - pStart);
183
+ }
184
+ }
185
+
186
+
187
+ typedef struct {
188
+ U32 pos;
189
+ U32 length;
190
+ U32 savings;
191
+ } dictItem;
192
+
193
+ static void ZDICT_initDictItem(dictItem* d)
194
+ {
195
+ d->pos = 1;
196
+ d->length = 0;
197
+ d->savings = (U32)(-1);
198
+ }
199
+
200
+
201
+ #define LLIMIT 64 /* heuristic determined experimentally */
202
+ #define MINMATCHLENGTH 7 /* heuristic determined experimentally */
203
+ static dictItem ZDICT_analyzePos(
204
+ BYTE* doneMarks,
205
+ const int* suffix, U32 start,
206
+ const void* buffer, U32 minRatio, U32 notificationLevel)
207
+ {
208
+ U32 lengthList[LLIMIT] = {0};
209
+ U32 cumulLength[LLIMIT] = {0};
210
+ U32 savings[LLIMIT] = {0};
211
+ const BYTE* b = (const BYTE*)buffer;
212
+ size_t length;
213
+ size_t maxLength = LLIMIT;
214
+ size_t pos = suffix[start];
215
+ U32 end = start;
216
+ dictItem solution;
217
+
218
+ /* init */
219
+ memset(&solution, 0, sizeof(solution));
220
+ doneMarks[pos] = 1;
221
+
222
+ /* trivial repetition cases */
223
+ if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
224
+ ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
225
+ ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
226
+ /* skip and mark segment */
227
+ U16 u16 = MEM_read16(b+pos+4);
228
+ U32 u, e = 6;
229
+ while (MEM_read16(b+pos+e) == u16) e+=2 ;
230
+ if (b[pos+e] == b[pos+e-1]) e++;
231
+ for (u=1; u<e; u++)
232
+ doneMarks[pos+u] = 1;
233
+ return solution;
234
+ }
235
+
236
+ /* look forward */
237
+ do {
238
+ end++;
239
+ length = ZDICT_count(b + pos, b + suffix[end]);
240
+ } while (length >=MINMATCHLENGTH);
241
+
242
+ /* look backward */
243
+ do {
244
+ length = ZDICT_count(b + pos, b + *(suffix+start-1));
245
+ if (length >=MINMATCHLENGTH) start--;
246
+ } while(length >= MINMATCHLENGTH);
247
+
248
+ /* exit if not found a minimum nb of repetitions */
249
+ if (end-start < minRatio) {
250
+ U32 idx;
251
+ for(idx=start; idx<end; idx++)
252
+ doneMarks[suffix[idx]] = 1;
253
+ return solution;
254
+ }
255
+
256
+ { int i;
257
+ U32 searchLength;
258
+ U32 refinedStart = start;
259
+ U32 refinedEnd = end;
260
+
261
+ DISPLAYLEVEL(4, "\n");
262
+ DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
263
+ DISPLAYLEVEL(4, "\n");
264
+
265
+ for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
266
+ BYTE currentChar = 0;
267
+ U32 currentCount = 0;
268
+ U32 currentID = refinedStart;
269
+ U32 id;
270
+ U32 selectedCount = 0;
271
+ U32 selectedID = currentID;
272
+ for (id =refinedStart; id < refinedEnd; id++) {
273
+ if (b[ suffix[id] + searchLength] != currentChar) {
274
+ if (currentCount > selectedCount) {
275
+ selectedCount = currentCount;
276
+ selectedID = currentID;
277
+ }
278
+ currentID = id;
279
+ currentChar = b[ suffix[id] + searchLength];
280
+ currentCount = 0;
281
+ }
282
+ currentCount ++;
283
+ }
284
+ if (currentCount > selectedCount) { /* for last */
285
+ selectedCount = currentCount;
286
+ selectedID = currentID;
287
+ }
288
+
289
+ if (selectedCount < minRatio)
290
+ break;
291
+ refinedStart = selectedID;
292
+ refinedEnd = refinedStart + selectedCount;
293
+ }
294
+
295
+ /* evaluate gain based on new ref */
296
+ start = refinedStart;
297
+ pos = suffix[refinedStart];
298
+ end = start;
299
+ memset(lengthList, 0, sizeof(lengthList));
300
+
301
+ /* look forward */
302
+ do {
303
+ end++;
304
+ length = ZDICT_count(b + pos, b + suffix[end]);
305
+ if (length >= LLIMIT) length = LLIMIT-1;
306
+ lengthList[length]++;
307
+ } while (length >=MINMATCHLENGTH);
308
+
309
+ /* look backward */
310
+ length = MINMATCHLENGTH;
311
+ while ((length >= MINMATCHLENGTH) & (start > 0)) {
312
+ length = ZDICT_count(b + pos, b + suffix[start - 1]);
313
+ if (length >= LLIMIT) length = LLIMIT - 1;
314
+ lengthList[length]++;
315
+ if (length >= MINMATCHLENGTH) start--;
316
+ }
317
+
318
+ /* largest useful length */
319
+ memset(cumulLength, 0, sizeof(cumulLength));
320
+ cumulLength[maxLength-1] = lengthList[maxLength-1];
321
+ for (i=(int)(maxLength-2); i>=0; i--)
322
+ cumulLength[i] = cumulLength[i+1] + lengthList[i];
323
+
324
+ for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
325
+ maxLength = i;
326
+
327
+ /* reduce maxLength in case of final into repetitive data */
328
+ { U32 l = (U32)maxLength;
329
+ BYTE const c = b[pos + maxLength-1];
330
+ while (b[pos+l-2]==c) l--;
331
+ maxLength = l;
332
+ }
333
+ if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */
334
+
335
+ /* calculate savings */
336
+ savings[5] = 0;
337
+ for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
338
+ savings[i] = savings[i-1] + (lengthList[i] * (i-3));
339
+
340
+ DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
341
+ (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
342
+
343
+ solution.pos = (U32)pos;
344
+ solution.length = (U32)maxLength;
345
+ solution.savings = savings[maxLength];
346
+
347
+ /* mark positions done */
348
+ { U32 id;
349
+ for (id=start; id<end; id++) {
350
+ U32 p, pEnd;
351
+ U32 const testedPos = suffix[id];
352
+ if (testedPos == pos)
353
+ length = solution.length;
354
+ else {
355
+ length = ZDICT_count(b+pos, b+testedPos);
356
+ if (length > solution.length) length = solution.length;
357
+ }
358
+ pEnd = (U32)(testedPos + length);
359
+ for (p=testedPos; p<pEnd; p++)
360
+ doneMarks[p] = 1;
361
+ } } }
362
+
363
+ return solution;
364
+ }
365
+
366
+
367
+ /*! ZDICT_checkMerge
368
+ check if dictItem can be merged, do it if possible
369
+ @return : id of destination elt, 0 if not merged
370
+ */
371
+ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
372
+ {
373
+ const U32 tableSize = table->pos;
374
+ const U32 eltEnd = elt.pos + elt.length;
375
+
376
+ /* tail overlap */
377
+ U32 u; for (u=1; u<tableSize; u++) {
378
+ if (u==eltNbToSkip) continue;
379
+ if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
380
+ /* append */
381
+ U32 addedLength = table[u].pos - elt.pos;
382
+ table[u].length += addedLength;
383
+ table[u].pos = elt.pos;
384
+ table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
385
+ table[u].savings += elt.length / 8; /* rough approx bonus */
386
+ elt = table[u];
387
+ /* sort : improve rank */
388
+ while ((u>1) && (table[u-1].savings < elt.savings))
389
+ table[u] = table[u-1], u--;
390
+ table[u] = elt;
391
+ return u;
392
+ } }
393
+
394
+ /* front overlap */
395
+ for (u=1; u<tableSize; u++) {
396
+ if (u==eltNbToSkip) continue;
397
+ if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
398
+ /* append */
399
+ int addedLength = (int)eltEnd - (table[u].pos + table[u].length);
400
+ table[u].savings += elt.length / 8; /* rough approx bonus */
401
+ if (addedLength > 0) { /* otherwise, elt fully included into existing */
402
+ table[u].length += addedLength;
403
+ table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
404
+ }
405
+ /* sort : improve rank */
406
+ elt = table[u];
407
+ while ((u>1) && (table[u-1].savings < elt.savings))
408
+ table[u] = table[u-1], u--;
409
+ table[u] = elt;
410
+ return u;
411
+ } }
412
+
413
+ return 0;
414
+ }
415
+
416
+
417
+ static void ZDICT_removeDictItem(dictItem* table, U32 id)
418
+ {
419
+ /* convention : first element is nb of elts */
420
+ U32 const max = table->pos;
421
+ U32 u;
422
+ if (!id) return; /* protection, should never happen */
423
+ for (u=id; u<max-1; u++)
424
+ table[u] = table[u+1];
425
+ table->pos--;
426
+ }
427
+
428
+
429
+ static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
430
+ {
431
+ /* merge if possible */
432
+ U32 mergeId = ZDICT_checkMerge(table, elt, 0);
433
+ if (mergeId) {
434
+ U32 newMerge = 1;
435
+ while (newMerge) {
436
+ newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
437
+ if (newMerge) ZDICT_removeDictItem(table, mergeId);
438
+ mergeId = newMerge;
439
+ }
440
+ return;
441
+ }
442
+
443
+ /* insert */
444
+ { U32 current;
445
+ U32 nextElt = table->pos;
446
+ if (nextElt >= maxSize) nextElt = maxSize-1;
447
+ current = nextElt-1;
448
+ while (table[current].savings < elt.savings) {
449
+ table[current+1] = table[current];
450
+ current--;
451
+ }
452
+ table[current+1] = elt;
453
+ table->pos = nextElt+1;
454
+ }
455
+ }
456
+
457
+
458
+ static U32 ZDICT_dictSize(const dictItem* dictList)
459
+ {
460
+ U32 u, dictSize = 0;
461
+ for (u=1; u<dictList[0].pos; u++)
462
+ dictSize += dictList[u].length;
463
+ return dictSize;
464
+ }
465
+
466
+
467
+ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
468
+ const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
469
+ const size_t* fileSizes, unsigned nbFiles,
470
+ U32 minRatio, U32 notificationLevel)
471
+ {
472
+ int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
473
+ int* const suffix = suffix0+1;
474
+ U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
475
+ BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
476
+ U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
477
+ size_t result = 0;
478
+ clock_t displayClock = 0;
479
+ clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
480
+
481
+ # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
482
+ if (ZDICT_clockSpan(displayClock) > refreshRate) \
483
+ { displayClock = clock(); DISPLAY(__VA_ARGS__); \
484
+ if (notificationLevel>=4) fflush(stdout); } }
485
+
486
+ /* init */
487
+ DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
488
+ if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
489
+ result = ERROR(memory_allocation);
490
+ goto _cleanup;
491
+ }
492
+ if (minRatio < MINRATIO) minRatio = MINRATIO;
493
+ memset(doneMarks, 0, bufferSize+16);
494
+
495
+ /* limit sample set size (divsufsort limitation)*/
496
+ if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
497
+ while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
498
+
499
+ /* sort */
500
+ DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
501
+ { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
502
+ if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
503
+ }
504
+ suffix[bufferSize] = (int)bufferSize; /* leads into noise */
505
+ suffix0[0] = (int)bufferSize; /* leads into noise */
506
+ /* build reverse suffix sort */
507
+ { size_t pos;
508
+ for (pos=0; pos < bufferSize; pos++)
509
+ reverseSuffix[suffix[pos]] = (U32)pos;
510
+ /* note filePos tracks borders between samples.
511
+ It's not used at this stage, but planned to become useful in a later update */
512
+ filePos[0] = 0;
513
+ for (pos=1; pos<nbFiles; pos++)
514
+ filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
515
+ }
516
+
517
+ DISPLAYLEVEL(2, "finding patterns ... \n");
518
+ DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);
519
+
520
+ { U32 cursor; for (cursor=0; cursor < bufferSize; ) {
521
+ dictItem solution;
522
+ if (doneMarks[cursor]) { cursor++; continue; }
523
+ solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
524
+ if (solution.length==0) { cursor++; continue; }
525
+ ZDICT_insertDictItem(dictList, dictListSize, solution);
526
+ cursor += solution.length;
527
+ DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
528
+ } }
529
+
530
+ _cleanup:
531
+ free(suffix0);
532
+ free(reverseSuffix);
533
+ free(doneMarks);
534
+ free(filePos);
535
+ return result;
536
+ }
537
+
538
+
539
+ static void ZDICT_fillNoise(void* buffer, size_t length)
540
+ {
541
+ unsigned const prime1 = 2654435761U;
542
+ unsigned const prime2 = 2246822519U;
543
+ unsigned acc = prime1;
544
+ size_t p=0;;
545
+ for (p=0; p<length; p++) {
546
+ acc *= prime2;
547
+ ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
548
+ }
549
+ }
550
+
551
+
552
+ typedef struct
553
+ {
554
+ ZSTD_CCtx* ref;
555
+ ZSTD_CCtx* zc;
556
+ void* workPlace; /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */
557
+ } EStats_ress_t;
558
+
559
+ #define MAXREPOFFSET 1024
560
+
561
+ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
562
+ U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
563
+ const void* src, size_t srcSize, U32 notificationLevel)
564
+ {
565
+ size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << params.cParams.windowLog);
566
+ size_t cSize;
567
+
568
+ if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
569
+ { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
570
+ if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
571
+ }
572
+ cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize);
573
+ if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
574
+
575
+ if (cSize) { /* if == 0; block is not compressible */
576
+ const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
577
+
578
+ /* literals stats */
579
+ { const BYTE* bytePtr;
580
+ for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
581
+ countLit[*bytePtr]++;
582
+ }
583
+
584
+ /* seqStats */
585
+ { U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
586
+ ZSTD_seqToCodes(seqStorePtr);
587
+
588
+ { const BYTE* codePtr = seqStorePtr->ofCode;
589
+ U32 u;
590
+ for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
591
+ }
592
+
593
+ { const BYTE* codePtr = seqStorePtr->mlCode;
594
+ U32 u;
595
+ for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
596
+ }
597
+
598
+ { const BYTE* codePtr = seqStorePtr->llCode;
599
+ U32 u;
600
+ for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
601
+ }
602
+
603
+ if (nbSeq >= 2) { /* rep offsets */
604
+ const seqDef* const seq = seqStorePtr->sequencesStart;
605
+ U32 offset1 = seq[0].offset - 3;
606
+ U32 offset2 = seq[1].offset - 3;
607
+ if (offset1 >= MAXREPOFFSET) offset1 = 0;
608
+ if (offset2 >= MAXREPOFFSET) offset2 = 0;
609
+ repOffsets[offset1] += 3;
610
+ repOffsets[offset2] += 1;
611
+ } } }
612
+ }
613
+
614
+ /*
615
+ static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
616
+ {
617
+ unsigned u;
618
+ size_t max=0;
619
+ for (u=0; u<nbFiles; u++)
620
+ if (max < fileSizes[u]) max = fileSizes[u];
621
+ return max;
622
+ }
623
+ */
624
+
625
+ static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
626
+ {
627
+ size_t total=0;
628
+ unsigned u;
629
+ for (u=0; u<nbFiles; u++) total += fileSizes[u];
630
+ return total;
631
+ }
632
+
633
+ typedef struct { U32 offset; U32 count; } offsetCount_t;
634
+
635
+ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count)
636
+ {
637
+ U32 u;
638
+ table[ZSTD_REP_NUM].offset = val;
639
+ table[ZSTD_REP_NUM].count = count;
640
+ for (u=ZSTD_REP_NUM; u>0; u--) {
641
+ offsetCount_t tmp;
642
+ if (table[u-1].count >= table[u].count) break;
643
+ tmp = table[u-1];
644
+ table[u-1] = table[u];
645
+ table[u] = tmp;
646
+ }
647
+ }
648
+
649
+
650
+ #define OFFCODE_MAX 30 /* only applicable to first block */
651
+ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
652
+ unsigned compressionLevel,
653
+ const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
654
+ const void* dictBuffer, size_t dictBufferSize,
655
+ unsigned notificationLevel)
656
+ {
657
+ U32 countLit[256];
658
+ HUF_CREATE_STATIC_CTABLE(hufTable, 255);
659
+ U32 offcodeCount[OFFCODE_MAX+1];
660
+ short offcodeNCount[OFFCODE_MAX+1];
661
+ U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
662
+ U32 matchLengthCount[MaxML+1];
663
+ short matchLengthNCount[MaxML+1];
664
+ U32 litLengthCount[MaxLL+1];
665
+ short litLengthNCount[MaxLL+1];
666
+ U32 repOffset[MAXREPOFFSET];
667
+ offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
668
+ EStats_ress_t esr;
669
+ ZSTD_parameters params;
670
+ U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
671
+ size_t pos = 0, errorCode;
672
+ size_t eSize = 0;
673
+ size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
674
+ size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
675
+ BYTE* dstPtr = (BYTE*)dstBuffer;
676
+
677
+ /* init */
678
+ esr.ref = ZSTD_createCCtx();
679
+ esr.zc = ZSTD_createCCtx();
680
+ esr.workPlace = malloc(ZSTD_BLOCKSIZE_ABSOLUTEMAX);
681
+ if (!esr.ref || !esr.zc || !esr.workPlace) {
682
+ eSize = ERROR(memory_allocation);
683
+ DISPLAYLEVEL(1, "Not enough memory \n");
684
+ goto _cleanup;
685
+ }
686
+ if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; } /* too large dictionary */
687
+ for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
688
+ for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1;
689
+ for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
690
+ for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
691
+ memset(repOffset, 0, sizeof(repOffset));
692
+ repOffset[1] = repOffset[4] = repOffset[8] = 1;
693
+ memset(bestRepOffset, 0, sizeof(bestRepOffset));
694
+ if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
695
+ params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
696
+ { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
697
+ if (ZSTD_isError(beginResult)) {
698
+ eSize = ERROR(GENERIC);
699
+ DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
700
+ goto _cleanup;
701
+ } }
702
+
703
+ /* collect stats on all files */
704
+ for (u=0; u<nbFiles; u++) {
705
+ ZDICT_countEStats(esr, params,
706
+ countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
707
+ (const char*)srcBuffer + pos, fileSizes[u],
708
+ notificationLevel);
709
+ pos += fileSizes[u];
710
+ }
711
+
712
+ /* analyze */
713
+ errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
714
+ if (HUF_isError(errorCode)) {
715
+ eSize = ERROR(GENERIC);
716
+ DISPLAYLEVEL(1, "HUF_buildCTable error \n");
717
+ goto _cleanup;
718
+ }
719
+ huffLog = (U32)errorCode;
720
+
721
+ /* looking for most common first offsets */
722
+ { U32 offset;
723
+ for (offset=1; offset<MAXREPOFFSET; offset++)
724
+ ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
725
+ }
726
+ /* note : the result of this phase should be used to better appreciate the impact on statistics */
727
+
728
+ total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
729
+ errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
730
+ if (FSE_isError(errorCode)) {
731
+ eSize = ERROR(GENERIC);
732
+ DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
733
+ goto _cleanup;
734
+ }
735
+ Offlog = (U32)errorCode;
736
+
737
+ total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
738
+ errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
739
+ if (FSE_isError(errorCode)) {
740
+ eSize = ERROR(GENERIC);
741
+ DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
742
+ goto _cleanup;
743
+ }
744
+ mlLog = (U32)errorCode;
745
+
746
+ total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
747
+ errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
748
+ if (FSE_isError(errorCode)) {
749
+ eSize = ERROR(GENERIC);
750
+ DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
751
+ goto _cleanup;
752
+ }
753
+ llLog = (U32)errorCode;
754
+
755
+ /* write result to buffer */
756
+ { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
757
+ if (HUF_isError(hhSize)) {
758
+ eSize = ERROR(GENERIC);
759
+ DISPLAYLEVEL(1, "HUF_writeCTable error \n");
760
+ goto _cleanup;
761
+ }
762
+ dstPtr += hhSize;
763
+ maxDstSize -= hhSize;
764
+ eSize += hhSize;
765
+ }
766
+
767
+ { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
768
+ if (FSE_isError(ohSize)) {
769
+ eSize = ERROR(GENERIC);
770
+ DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
771
+ goto _cleanup;
772
+ }
773
+ dstPtr += ohSize;
774
+ maxDstSize -= ohSize;
775
+ eSize += ohSize;
776
+ }
777
+
778
+ { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
779
+ if (FSE_isError(mhSize)) {
780
+ eSize = ERROR(GENERIC);
781
+ DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
782
+ goto _cleanup;
783
+ }
784
+ dstPtr += mhSize;
785
+ maxDstSize -= mhSize;
786
+ eSize += mhSize;
787
+ }
788
+
789
+ { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
790
+ if (FSE_isError(lhSize)) {
791
+ eSize = ERROR(GENERIC);
792
+ DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
793
+ goto _cleanup;
794
+ }
795
+ dstPtr += lhSize;
796
+ maxDstSize -= lhSize;
797
+ eSize += lhSize;
798
+ }
799
+
800
+ if (maxDstSize<12) {
801
+ eSize = ERROR(GENERIC);
802
+ DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
803
+ goto _cleanup;
804
+ }
805
+ # if 0
806
+ MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset);
807
+ MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset);
808
+ MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
809
+ #else
810
+ /* at this stage, we don't use the result of "most common first offset",
811
+ as the impact of statistics is not properly evaluated */
812
+ MEM_writeLE32(dstPtr+0, repStartValue[0]);
813
+ MEM_writeLE32(dstPtr+4, repStartValue[1]);
814
+ MEM_writeLE32(dstPtr+8, repStartValue[2]);
815
+ #endif
816
+ //dstPtr += 12;
817
+ eSize += 12;
818
+
819
+ _cleanup:
820
+ ZSTD_freeCCtx(esr.ref);
821
+ ZSTD_freeCCtx(esr.zc);
822
+ free(esr.workPlace);
823
+
824
+ return eSize;
825
+ }
826
+
827
+
828
+ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
829
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
830
+ ZDICT_params_t params)
831
+ {
832
+ size_t hSize;
833
+ int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
834
+ U32 const notificationLevel = params.notificationLevel;
835
+
836
+ /* dictionary header */
837
+ MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
838
+ { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
839
+ U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
840
+ U32 const dictID = params.dictID ? params.dictID : compliantID;
841
+ MEM_writeLE32((char*)dictBuffer+4, dictID);
842
+ }
843
+ hSize = 8;
844
+
845
+ /* entropy tables */
846
+ DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
847
+ DISPLAYLEVEL(2, "statistics ... \n");
848
+ { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
849
+ compressionLevel,
850
+ samplesBuffer, samplesSizes, nbSamples,
851
+ (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
852
+ notificationLevel);
853
+ if (ZDICT_isError(eSize)) return eSize;
854
+ hSize += eSize;
855
+ }
856
+
857
+
858
+ if (hSize + dictContentSize < dictBufferCapacity)
859
+ memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
860
+ return MIN(dictBufferCapacity, hSize+dictContentSize);
861
+ }
862
+
863
+
864
+ /*! ZDICT_trainFromBuffer_unsafe() :
865
+ * Warning : `samplesBuffer` must be followed by noisy guard band.
866
+ * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
867
+ */
868
+ size_t ZDICT_trainFromBuffer_unsafe(
869
+ void* dictBuffer, size_t maxDictSize,
870
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
871
+ ZDICT_params_t params)
872
+ {
873
+ U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
874
+ dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
875
+ unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
876
+ unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
877
+ size_t const targetDictSize = maxDictSize;
878
+ size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
879
+ size_t dictSize = 0;
880
+ U32 const notificationLevel = params.notificationLevel;
881
+
882
+ /* checks */
883
+ if (!dictList) return ERROR(memory_allocation);
884
+ if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
885
+ if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
886
+
887
+ /* init */
888
+ ZDICT_initDictItem(dictList);
889
+
890
+ /* build dictionary */
891
+ ZDICT_trainBuffer(dictList, dictListSize,
892
+ samplesBuffer, samplesBuffSize,
893
+ samplesSizes, nbSamples,
894
+ minRep, notificationLevel);
895
+
896
+ /* display best matches */
897
+ if (params.notificationLevel>= 3) {
898
+ U32 const nb = MIN(25, dictList[0].pos);
899
+ U32 const dictContentSize = ZDICT_dictSize(dictList);
900
+ U32 u;
901
+ DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize);
902
+ DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
903
+ for (u=1; u<nb; u++) {
904
+ U32 const pos = dictList[u].pos;
905
+ U32 const length = dictList[u].length;
906
+ U32 const printedLength = MIN(40, length);
907
+ if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize))
908
+ return ERROR(GENERIC); /* should never happen */
909
+ DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
910
+ u, length, pos, dictList[u].savings);
911
+ ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
912
+ DISPLAYLEVEL(3, "| \n");
913
+ } }
914
+
915
+
916
+ /* create dictionary */
917
+ { U32 dictContentSize = ZDICT_dictSize(dictList);
918
+ if (dictContentSize < targetDictSize/3) {
919
+ DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
920
+ if (minRep > MINRATIO) {
921
+ DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
922
+ DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
923
+ }
924
+ if (samplesBuffSize < 10 * targetDictSize)
925
+ DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
926
+ }
927
+
928
+ if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
929
+ U32 proposedSelectivity = selectivity-1;
930
+ while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
931
+ DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
932
+ DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
933
+ DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n");
934
+ }
935
+
936
+ /* limit dictionary size */
937
+ { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
938
+ U32 currentSize = 0;
939
+ U32 n; for (n=1; n<max; n++) {
940
+ currentSize += dictList[n].length;
941
+ if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
942
+ }
943
+ dictList->pos = n;
944
+ dictContentSize = currentSize;
945
+ }
946
+
947
+ /* build dict content */
948
+ { U32 u;
949
+ BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;
950
+ for (u=1; u<dictList->pos; u++) {
951
+ U32 l = dictList[u].length;
952
+ ptr -= l;
953
+ if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); } /* should not happen */
954
+ memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
955
+ } }
956
+
957
+ dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
958
+ samplesBuffer, samplesSizes, nbSamples,
959
+ params);
960
+ }
961
+
962
+ /* clean up */
963
+ free(dictList);
964
+ return dictSize;
965
+ }
966
+
967
+
968
+ /* issue : samplesBuffer need to be followed by a noisy guard band.
969
+ * work around : duplicate the buffer, and add the noise */
970
+ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
971
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
972
+ ZDICT_params_t params)
973
+ {
974
+ size_t result;
975
+ void* newBuff;
976
+ size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
977
+ if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */
978
+
979
+ newBuff = malloc(sBuffSize + NOISELENGTH);
980
+ if (!newBuff) return ERROR(memory_allocation);
981
+
982
+ memcpy(newBuff, samplesBuffer, sBuffSize);
983
+ ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
984
+
985
+ result = ZDICT_trainFromBuffer_unsafe(
986
+ dictBuffer, dictBufferCapacity,
987
+ newBuff, samplesSizes, nbSamples,
988
+ params);
989
+ free(newBuff);
990
+ return result;
991
+ }
992
+
993
+
994
+ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
995
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
996
+ {
997
+ ZDICT_params_t params;
998
+ memset(&params, 0, sizeof(params));
999
+ return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
1000
+ samplesBuffer, samplesSizes, nbSamples,
1001
+ params);
1002
+ }
1003
+
1004
+ size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1005
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1006
+ {
1007
+ ZDICT_params_t params;
1008
+ memset(&params, 0, sizeof(params));
1009
+ return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,
1010
+ samplesBuffer, samplesSizes, nbSamples,
1011
+ params);
1012
+ }