extzstd 0.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. checksums.yaml +5 -5
  2. data/HISTORY.ja.md +39 -0
  3. data/README.md +38 -56
  4. data/contrib/zstd/CHANGELOG +613 -0
  5. data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
  6. data/contrib/zstd/CONTRIBUTING.md +406 -0
  7. data/contrib/zstd/COPYING +339 -0
  8. data/contrib/zstd/Makefile +420 -0
  9. data/contrib/zstd/README.md +179 -41
  10. data/contrib/zstd/TESTING.md +44 -0
  11. data/contrib/zstd/appveyor.yml +292 -0
  12. data/contrib/zstd/lib/BUCK +234 -0
  13. data/contrib/zstd/lib/Makefile +451 -0
  14. data/contrib/zstd/lib/README.md +207 -0
  15. data/contrib/zstd/{common → lib/common}/bitstream.h +187 -138
  16. data/contrib/zstd/lib/common/compiler.h +288 -0
  17. data/contrib/zstd/lib/common/cpu.h +213 -0
  18. data/contrib/zstd/lib/common/debug.c +24 -0
  19. data/contrib/zstd/lib/common/debug.h +107 -0
  20. data/contrib/zstd/lib/common/entropy_common.c +362 -0
  21. data/contrib/zstd/{common → lib/common}/error_private.c +25 -12
  22. data/contrib/zstd/{common → lib/common}/error_private.h +14 -10
  23. data/contrib/zstd/{common → lib/common}/fse.h +173 -92
  24. data/contrib/zstd/{common → lib/common}/fse_decompress.c +149 -85
  25. data/contrib/zstd/lib/common/huf.h +361 -0
  26. data/contrib/zstd/{common → lib/common}/mem.h +115 -59
  27. data/contrib/zstd/lib/common/pool.c +350 -0
  28. data/contrib/zstd/lib/common/pool.h +84 -0
  29. data/contrib/zstd/lib/common/threading.c +122 -0
  30. data/contrib/zstd/lib/common/threading.h +155 -0
  31. data/contrib/zstd/{common → lib/common}/xxhash.c +55 -96
  32. data/contrib/zstd/{common → lib/common}/xxhash.h +23 -47
  33. data/contrib/zstd/lib/common/zstd_common.c +83 -0
  34. data/contrib/zstd/lib/common/zstd_deps.h +111 -0
  35. data/contrib/zstd/lib/common/zstd_errors.h +95 -0
  36. data/contrib/zstd/lib/common/zstd_internal.h +478 -0
  37. data/contrib/zstd/{compress → lib/compress}/fse_compress.c +214 -319
  38. data/contrib/zstd/lib/compress/hist.c +181 -0
  39. data/contrib/zstd/lib/compress/hist.h +75 -0
  40. data/contrib/zstd/lib/compress/huf_compress.c +913 -0
  41. data/contrib/zstd/lib/compress/zstd_compress.c +5208 -0
  42. data/contrib/zstd/lib/compress/zstd_compress_internal.h +1203 -0
  43. data/contrib/zstd/lib/compress/zstd_compress_literals.c +158 -0
  44. data/contrib/zstd/lib/compress/zstd_compress_literals.h +29 -0
  45. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +433 -0
  46. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +54 -0
  47. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +849 -0
  48. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +32 -0
  49. data/contrib/zstd/lib/compress/zstd_cwksp.h +561 -0
  50. data/contrib/zstd/lib/compress/zstd_double_fast.c +521 -0
  51. data/contrib/zstd/lib/compress/zstd_double_fast.h +38 -0
  52. data/contrib/zstd/lib/compress/zstd_fast.c +496 -0
  53. data/contrib/zstd/lib/compress/zstd_fast.h +37 -0
  54. data/contrib/zstd/lib/compress/zstd_lazy.c +1412 -0
  55. data/contrib/zstd/lib/compress/zstd_lazy.h +87 -0
  56. data/contrib/zstd/lib/compress/zstd_ldm.c +660 -0
  57. data/contrib/zstd/lib/compress/zstd_ldm.h +116 -0
  58. data/contrib/zstd/lib/compress/zstd_opt.c +1345 -0
  59. data/contrib/zstd/lib/compress/zstd_opt.h +56 -0
  60. data/contrib/zstd/lib/compress/zstdmt_compress.c +1811 -0
  61. data/contrib/zstd/lib/compress/zstdmt_compress.h +110 -0
  62. data/contrib/zstd/lib/decompress/huf_decompress.c +1350 -0
  63. data/contrib/zstd/lib/decompress/zstd_ddict.c +244 -0
  64. data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
  65. data/contrib/zstd/lib/decompress/zstd_decompress.c +1930 -0
  66. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1540 -0
  67. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +62 -0
  68. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +190 -0
  69. data/contrib/zstd/{common → lib/deprecated}/zbuff.h +68 -45
  70. data/contrib/zstd/lib/deprecated/zbuff_common.c +26 -0
  71. data/contrib/zstd/lib/deprecated/zbuff_compress.c +147 -0
  72. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +75 -0
  73. data/contrib/zstd/lib/dictBuilder/cover.c +1245 -0
  74. data/contrib/zstd/lib/dictBuilder/cover.h +157 -0
  75. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.c +3 -3
  76. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.h +0 -0
  77. data/contrib/zstd/lib/dictBuilder/fastcover.c +758 -0
  78. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/zdict.c +318 -194
  79. data/contrib/zstd/lib/dictBuilder/zdict.h +305 -0
  80. data/contrib/zstd/{legacy → lib/legacy}/zstd_legacy.h +171 -15
  81. data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.c +191 -124
  82. data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.h +19 -5
  83. data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.c +125 -125
  84. data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.h +19 -5
  85. data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.c +125 -124
  86. data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.h +20 -6
  87. data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.c +151 -299
  88. data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.h +19 -5
  89. data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.c +237 -243
  90. data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.h +19 -6
  91. data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.c +130 -143
  92. data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.h +18 -5
  93. data/contrib/zstd/{legacy → lib/legacy}/zstd_v07.c +158 -157
  94. data/contrib/zstd/{legacy → lib/legacy}/zstd_v07.h +19 -5
  95. data/contrib/zstd/lib/libzstd.pc.in +15 -0
  96. data/contrib/zstd/lib/zstd.h +2391 -0
  97. data/ext/depend +2 -0
  98. data/ext/extconf.rb +15 -6
  99. data/ext/extzstd.c +76 -145
  100. data/ext/extzstd.h +80 -31
  101. data/ext/extzstd_stream.c +417 -142
  102. data/ext/libzstd_conf.h +8 -0
  103. data/ext/zstd_common.c +10 -7
  104. data/ext/zstd_compress.c +14 -5
  105. data/ext/zstd_decompress.c +5 -4
  106. data/ext/zstd_dictbuilder.c +9 -4
  107. data/ext/zstd_dictbuilder_fastcover.c +3 -0
  108. data/ext/zstd_legacy_v01.c +3 -1
  109. data/ext/zstd_legacy_v02.c +3 -1
  110. data/ext/zstd_legacy_v03.c +3 -1
  111. data/ext/zstd_legacy_v04.c +3 -1
  112. data/ext/zstd_legacy_v05.c +3 -1
  113. data/ext/zstd_legacy_v06.c +3 -1
  114. data/ext/zstd_legacy_v07.c +3 -1
  115. data/gemstub.rb +10 -24
  116. data/lib/extzstd.rb +64 -179
  117. data/lib/extzstd/version.rb +6 -1
  118. data/test/test_basic.rb +9 -6
  119. metadata +113 -57
  120. data/HISTORY.ja +0 -5
  121. data/contrib/zstd/common/entropy_common.c +0 -225
  122. data/contrib/zstd/common/huf.h +0 -228
  123. data/contrib/zstd/common/zstd_common.c +0 -83
  124. data/contrib/zstd/common/zstd_errors.h +0 -60
  125. data/contrib/zstd/common/zstd_internal.h +0 -267
  126. data/contrib/zstd/compress/huf_compress.c +0 -533
  127. data/contrib/zstd/compress/zbuff_compress.c +0 -319
  128. data/contrib/zstd/compress/zstd_compress.c +0 -3264
  129. data/contrib/zstd/compress/zstd_opt.h +0 -900
  130. data/contrib/zstd/decompress/huf_decompress.c +0 -883
  131. data/contrib/zstd/decompress/zbuff_decompress.c +0 -252
  132. data/contrib/zstd/decompress/zstd_decompress.c +0 -1842
  133. data/contrib/zstd/dictBuilder/zdict.h +0 -111
  134. data/contrib/zstd/zstd.h +0 -640
@@ -1,18 +1,20 @@
1
- /**
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
1
+ /*
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
- * This source code is licensed under the BSD-style license found in the
6
- * LICENSE file in the root directory of this source tree. An additional grant
7
- * of patent rights can be found in the PATENTS file in the same directory.
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
8
9
  */
9
10
 
10
11
 
11
12
  /*-**************************************
12
13
  * Tuning parameters
13
14
  ****************************************/
15
+ #define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */
14
16
  #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
15
- #define ZDICT_MIN_SAMPLES_SIZE 512
17
+ #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
16
18
 
17
19
 
18
20
  /*-**************************************
@@ -35,18 +37,18 @@
35
37
  #include <stdio.h> /* fprintf, fopen, ftello64 */
36
38
  #include <time.h> /* clock */
37
39
 
38
- #include "mem.h" /* read */
39
- #include "error_private.h"
40
- #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
40
+ #include "../common/mem.h" /* read */
41
+ #include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
41
42
  #define HUF_STATIC_LINKING_ONLY
42
- #include "huf.h"
43
- #include "zstd_internal.h" /* includes zstd.h */
44
- #include "xxhash.h"
43
+ #include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
44
+ #include "../common/zstd_internal.h" /* includes zstd.h */
45
+ #include "../common/xxhash.h" /* XXH64 */
45
46
  #include "divsufsort.h"
46
47
  #ifndef ZDICT_STATIC_LINKING_ONLY
47
48
  # define ZDICT_STATIC_LINKING_ONLY
48
49
  #endif
49
50
  #include "zdict.h"
51
+ #include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
50
52
 
51
53
 
52
54
  /*-*************************************
@@ -60,17 +62,15 @@
60
62
 
61
63
  #define NOISELENGTH 32
62
64
 
63
- #define MINRATIO 4
64
- static const int g_compressionLevel_default = 5;
65
65
  static const U32 g_selectivity_default = 9;
66
- static const size_t g_provision_entropySize = 200;
67
- static const size_t g_min_fast_dictContent = 192;
68
66
 
69
67
 
70
68
  /*-*************************************
71
69
  * Console display
72
70
  ***************************************/
71
+ #undef DISPLAY
73
72
  #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
73
+ #undef DISPLAYLEVEL
74
74
  #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
75
75
 
76
76
  static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
@@ -97,15 +97,35 @@ const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(error
97
97
  unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
98
98
  {
99
99
  if (dictSize < 8) return 0;
100
- if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0;
100
+ if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
101
101
  return MEM_readLE32((const char*)dictBuffer + 4);
102
102
  }
103
103
 
104
+ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
105
+ {
106
+ size_t headerSize;
107
+ if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
108
+
109
+ { ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
110
+ U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
111
+ if (!bs || !wksp) {
112
+ headerSize = ERROR(memory_allocation);
113
+ } else {
114
+ ZSTD_reset_compressedBlockState(bs);
115
+ headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
116
+ }
117
+
118
+ free(bs);
119
+ free(wksp);
120
+ }
121
+
122
+ return headerSize;
123
+ }
104
124
 
105
125
  /*-********************************************************
106
126
  * Dictionary training functions
107
127
  **********************************************************/
108
- static unsigned ZDICT_NbCommonBytes (register size_t val)
128
+ static unsigned ZDICT_NbCommonBytes (size_t val)
109
129
  {
110
130
  if (MEM_isLittleEndian()) {
111
131
  if (MEM_64bits()) {
@@ -209,7 +229,6 @@ static dictItem ZDICT_analyzePos(
209
229
  U32 cumulLength[LLIMIT] = {0};
210
230
  U32 savings[LLIMIT] = {0};
211
231
  const BYTE* b = (const BYTE*)buffer;
212
- size_t length;
213
232
  size_t maxLength = LLIMIT;
214
233
  size_t pos = suffix[start];
215
234
  U32 end = start;
@@ -224,26 +243,30 @@ static dictItem ZDICT_analyzePos(
224
243
  ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
225
244
  ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
226
245
  /* skip and mark segment */
227
- U16 u16 = MEM_read16(b+pos+4);
228
- U32 u, e = 6;
229
- while (MEM_read16(b+pos+e) == u16) e+=2 ;
230
- if (b[pos+e] == b[pos+e-1]) e++;
231
- for (u=1; u<e; u++)
246
+ U16 const pattern16 = MEM_read16(b+pos+4);
247
+ U32 u, patternEnd = 6;
248
+ while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
249
+ if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
250
+ for (u=1; u<patternEnd; u++)
232
251
  doneMarks[pos+u] = 1;
233
252
  return solution;
234
253
  }
235
254
 
236
255
  /* look forward */
237
- do {
238
- end++;
239
- length = ZDICT_count(b + pos, b + suffix[end]);
240
- } while (length >=MINMATCHLENGTH);
256
+ { size_t length;
257
+ do {
258
+ end++;
259
+ length = ZDICT_count(b + pos, b + suffix[end]);
260
+ } while (length >= MINMATCHLENGTH);
261
+ }
241
262
 
242
263
  /* look backward */
243
- do {
244
- length = ZDICT_count(b + pos, b + *(suffix+start-1));
245
- if (length >=MINMATCHLENGTH) start--;
246
- } while(length >= MINMATCHLENGTH);
264
+ { size_t length;
265
+ do {
266
+ length = ZDICT_count(b + pos, b + *(suffix+start-1));
267
+ if (length >=MINMATCHLENGTH) start--;
268
+ } while(length >= MINMATCHLENGTH);
269
+ }
247
270
 
248
271
  /* exit if not found a minimum nb of repetitions */
249
272
  if (end-start < minRatio) {
@@ -254,15 +277,15 @@ static dictItem ZDICT_analyzePos(
254
277
  }
255
278
 
256
279
  { int i;
257
- U32 searchLength;
280
+ U32 mml;
258
281
  U32 refinedStart = start;
259
282
  U32 refinedEnd = end;
260
283
 
261
284
  DISPLAYLEVEL(4, "\n");
262
- DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
285
+ DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
263
286
  DISPLAYLEVEL(4, "\n");
264
287
 
265
- for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
288
+ for (mml = MINMATCHLENGTH ; ; mml++) {
266
289
  BYTE currentChar = 0;
267
290
  U32 currentCount = 0;
268
291
  U32 currentID = refinedStart;
@@ -270,13 +293,13 @@ static dictItem ZDICT_analyzePos(
270
293
  U32 selectedCount = 0;
271
294
  U32 selectedID = currentID;
272
295
  for (id =refinedStart; id < refinedEnd; id++) {
273
- if (b[ suffix[id] + searchLength] != currentChar) {
296
+ if (b[suffix[id] + mml] != currentChar) {
274
297
  if (currentCount > selectedCount) {
275
298
  selectedCount = currentCount;
276
299
  selectedID = currentID;
277
300
  }
278
301
  currentID = id;
279
- currentChar = b[ suffix[id] + searchLength];
302
+ currentChar = b[ suffix[id] + mml];
280
303
  currentCount = 0;
281
304
  }
282
305
  currentCount ++;
@@ -292,28 +315,31 @@ static dictItem ZDICT_analyzePos(
292
315
  refinedEnd = refinedStart + selectedCount;
293
316
  }
294
317
 
295
- /* evaluate gain based on new ref */
318
+ /* evaluate gain based on new dict */
296
319
  start = refinedStart;
297
320
  pos = suffix[refinedStart];
298
321
  end = start;
299
322
  memset(lengthList, 0, sizeof(lengthList));
300
323
 
301
324
  /* look forward */
302
- do {
303
- end++;
304
- length = ZDICT_count(b + pos, b + suffix[end]);
305
- if (length >= LLIMIT) length = LLIMIT-1;
306
- lengthList[length]++;
307
- } while (length >=MINMATCHLENGTH);
325
+ { size_t length;
326
+ do {
327
+ end++;
328
+ length = ZDICT_count(b + pos, b + suffix[end]);
329
+ if (length >= LLIMIT) length = LLIMIT-1;
330
+ lengthList[length]++;
331
+ } while (length >=MINMATCHLENGTH);
332
+ }
308
333
 
309
334
  /* look backward */
310
- length = MINMATCHLENGTH;
311
- while ((length >= MINMATCHLENGTH) & (start > 0)) {
312
- length = ZDICT_count(b + pos, b + suffix[start - 1]);
313
- if (length >= LLIMIT) length = LLIMIT - 1;
314
- lengthList[length]++;
315
- if (length >= MINMATCHLENGTH) start--;
316
- }
335
+ { size_t length = MINMATCHLENGTH;
336
+ while ((length >= MINMATCHLENGTH) & (start > 0)) {
337
+ length = ZDICT_count(b + pos, b + suffix[start - 1]);
338
+ if (length >= LLIMIT) length = LLIMIT - 1;
339
+ lengthList[length]++;
340
+ if (length >= MINMATCHLENGTH) start--;
341
+ }
342
+ }
317
343
 
318
344
  /* largest useful length */
319
345
  memset(cumulLength, 0, sizeof(cumulLength));
@@ -337,8 +363,8 @@ static dictItem ZDICT_analyzePos(
337
363
  for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
338
364
  savings[i] = savings[i-1] + (lengthList[i] * (i-3));
339
365
 
340
- DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
341
- (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
366
+ DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
367
+ (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
342
368
 
343
369
  solution.pos = (U32)pos;
344
370
  solution.length = (U32)maxLength;
@@ -347,12 +373,12 @@ static dictItem ZDICT_analyzePos(
347
373
  /* mark positions done */
348
374
  { U32 id;
349
375
  for (id=start; id<end; id++) {
350
- U32 p, pEnd;
376
+ U32 p, pEnd, length;
351
377
  U32 const testedPos = suffix[id];
352
378
  if (testedPos == pos)
353
379
  length = solution.length;
354
380
  else {
355
- length = ZDICT_count(b+pos, b+testedPos);
381
+ length = (U32)ZDICT_count(b+pos, b+testedPos);
356
382
  if (length > solution.length) length = solution.length;
357
383
  }
358
384
  pEnd = (U32)(testedPos + length);
@@ -364,21 +390,35 @@ static dictItem ZDICT_analyzePos(
364
390
  }
365
391
 
366
392
 
367
- /*! ZDICT_checkMerge
393
+ static int isIncluded(const void* in, const void* container, size_t length)
394
+ {
395
+ const char* const ip = (const char*) in;
396
+ const char* const into = (const char*) container;
397
+ size_t u;
398
+
399
+ for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */
400
+ if (ip[u] != into[u]) break;
401
+ }
402
+
403
+ return u==length;
404
+ }
405
+
406
+ /*! ZDICT_tryMerge() :
368
407
  check if dictItem can be merged, do it if possible
369
408
  @return : id of destination elt, 0 if not merged
370
409
  */
371
- static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
410
+ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
372
411
  {
373
412
  const U32 tableSize = table->pos;
374
413
  const U32 eltEnd = elt.pos + elt.length;
414
+ const char* const buf = (const char*) buffer;
375
415
 
376
416
  /* tail overlap */
377
417
  U32 u; for (u=1; u<tableSize; u++) {
378
418
  if (u==eltNbToSkip) continue;
379
419
  if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
380
420
  /* append */
381
- U32 addedLength = table[u].pos - elt.pos;
421
+ U32 const addedLength = table[u].pos - elt.pos;
382
422
  table[u].length += addedLength;
383
423
  table[u].pos = elt.pos;
384
424
  table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
@@ -394,9 +434,10 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
394
434
  /* front overlap */
395
435
  for (u=1; u<tableSize; u++) {
396
436
  if (u==eltNbToSkip) continue;
437
+
397
438
  if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
398
439
  /* append */
399
- int addedLength = (int)eltEnd - (table[u].pos + table[u].length);
440
+ int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
400
441
  table[u].savings += elt.length / 8; /* rough approx bonus */
401
442
  if (addedLength > 0) { /* otherwise, elt fully included into existing */
402
443
  table[u].length += addedLength;
@@ -408,7 +449,18 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
408
449
  table[u] = table[u-1], u--;
409
450
  table[u] = elt;
410
451
  return u;
411
- } }
452
+ }
453
+
454
+ if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
455
+ if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
456
+ size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
457
+ table[u].pos = elt.pos;
458
+ table[u].savings += (U32)(elt.savings * addedLength / elt.length);
459
+ table[u].length = MIN(elt.length, table[u].length + 1);
460
+ return u;
461
+ }
462
+ }
463
+ }
412
464
 
413
465
  return 0;
414
466
  }
@@ -416,8 +468,8 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
416
468
 
417
469
  static void ZDICT_removeDictItem(dictItem* table, U32 id)
418
470
  {
419
- /* convention : first element is nb of elts */
420
- U32 const max = table->pos;
471
+ /* convention : table[0].pos stores nb of elts */
472
+ U32 const max = table[0].pos;
421
473
  U32 u;
422
474
  if (!id) return; /* protection, should never happen */
423
475
  for (u=id; u<max-1; u++)
@@ -426,14 +478,14 @@ static void ZDICT_removeDictItem(dictItem* table, U32 id)
426
478
  }
427
479
 
428
480
 
429
- static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
481
+ static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
430
482
  {
431
483
  /* merge if possible */
432
- U32 mergeId = ZDICT_checkMerge(table, elt, 0);
484
+ U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
433
485
  if (mergeId) {
434
486
  U32 newMerge = 1;
435
487
  while (newMerge) {
436
- newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
488
+ newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
437
489
  if (newMerge) ZDICT_removeDictItem(table, mergeId);
438
490
  mergeId = newMerge;
439
491
  }
@@ -464,10 +516,10 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
464
516
  }
465
517
 
466
518
 
467
- static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
519
+ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
468
520
  const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
469
521
  const size_t* fileSizes, unsigned nbFiles,
470
- U32 minRatio, U32 notificationLevel)
522
+ unsigned minRatio, U32 notificationLevel)
471
523
  {
472
524
  int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
473
525
  int* const suffix = suffix0+1;
@@ -478,10 +530,11 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
478
530
  clock_t displayClock = 0;
479
531
  clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
480
532
 
533
+ # undef DISPLAYUPDATE
481
534
  # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
482
535
  if (ZDICT_clockSpan(displayClock) > refreshRate) \
483
536
  { displayClock = clock(); DISPLAY(__VA_ARGS__); \
484
- if (notificationLevel>=4) fflush(stdout); } }
537
+ if (notificationLevel>=4) fflush(stderr); } }
485
538
 
486
539
  /* init */
487
540
  DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
@@ -493,11 +546,11 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
493
546
  memset(doneMarks, 0, bufferSize+16);
494
547
 
495
548
  /* limit sample set size (divsufsort limitation)*/
496
- if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
549
+ if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
497
550
  while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
498
551
 
499
552
  /* sort */
500
- DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
553
+ DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
501
554
  { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
502
555
  if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
503
556
  }
@@ -522,7 +575,7 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
522
575
  if (doneMarks[cursor]) { cursor++; continue; }
523
576
  solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
524
577
  if (solution.length==0) { cursor++; continue; }
525
- ZDICT_insertDictItem(dictList, dictListSize, solution);
578
+ ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
526
579
  cursor += solution.length;
527
580
  DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
528
581
  } }
@@ -541,7 +594,7 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
541
594
  unsigned const prime1 = 2654435761U;
542
595
  unsigned const prime2 = 2246822519U;
543
596
  unsigned acc = prime1;
544
- size_t p=0;;
597
+ size_t p=0;
545
598
  for (p=0; p<length; p++) {
546
599
  acc *= prime2;
547
600
  ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
@@ -551,29 +604,31 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
551
604
 
552
605
  typedef struct
553
606
  {
554
- ZSTD_CCtx* ref;
555
- ZSTD_CCtx* zc;
556
- void* workPlace; /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */
607
+ ZSTD_CDict* dict; /* dictionary */
608
+ ZSTD_CCtx* zc; /* working context */
609
+ void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
557
610
  } EStats_ress_t;
558
611
 
559
612
  #define MAXREPOFFSET 1024
560
613
 
561
- static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
562
- U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
563
- const void* src, size_t srcSize, U32 notificationLevel)
614
+ static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
615
+ unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
616
+ const void* src, size_t srcSize,
617
+ U32 notificationLevel)
564
618
  {
565
- size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << params.cParams.windowLog);
619
+ size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
566
620
  size_t cSize;
567
621
 
568
622
  if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
569
- { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
570
- if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
623
+ { size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
624
+ if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
625
+
571
626
  }
572
- cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize);
573
- if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
627
+ cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
628
+ if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
574
629
 
575
630
  if (cSize) { /* if == 0; block is not compressible */
576
- const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
631
+ const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
577
632
 
578
633
  /* literals stats */
579
634
  { const BYTE* bytePtr;
@@ -611,17 +666,6 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
611
666
  } } }
612
667
  }
613
668
 
614
- /*
615
- static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
616
- {
617
- unsigned u;
618
- size_t max=0;
619
- for (u=0; u<nbFiles; u++)
620
- if (max < fileSizes[u]) max = fileSizes[u];
621
- return max;
622
- }
623
- */
624
-
625
669
  static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
626
670
  {
627
671
  size_t total=0;
@@ -646,26 +690,38 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
646
690
  }
647
691
  }
648
692
 
693
+ /* ZDICT_flatLit() :
694
+ * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
695
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
696
+ */
697
+ static void ZDICT_flatLit(unsigned* countLit)
698
+ {
699
+ int u;
700
+ for (u=1; u<256; u++) countLit[u] = 2;
701
+ countLit[0] = 4;
702
+ countLit[253] = 1;
703
+ countLit[254] = 1;
704
+ }
649
705
 
650
706
  #define OFFCODE_MAX 30 /* only applicable to first block */
651
707
  static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
652
- unsigned compressionLevel,
708
+ int compressionLevel,
653
709
  const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
654
710
  const void* dictBuffer, size_t dictBufferSize,
655
711
  unsigned notificationLevel)
656
712
  {
657
- U32 countLit[256];
713
+ unsigned countLit[256];
658
714
  HUF_CREATE_STATIC_CTABLE(hufTable, 255);
659
- U32 offcodeCount[OFFCODE_MAX+1];
715
+ unsigned offcodeCount[OFFCODE_MAX+1];
660
716
  short offcodeNCount[OFFCODE_MAX+1];
661
717
  U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
662
- U32 matchLengthCount[MaxML+1];
718
+ unsigned matchLengthCount[MaxML+1];
663
719
  short matchLengthNCount[MaxML+1];
664
- U32 litLengthCount[MaxLL+1];
720
+ unsigned litLengthCount[MaxLL+1];
665
721
  short litLengthNCount[MaxLL+1];
666
722
  U32 repOffset[MAXREPOFFSET];
667
723
  offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
668
- EStats_ress_t esr;
724
+ EStats_ress_t esr = { NULL, NULL, NULL };
669
725
  ZSTD_parameters params;
670
726
  U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
671
727
  size_t pos = 0, errorCode;
@@ -675,48 +731,51 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
675
731
  BYTE* dstPtr = (BYTE*)dstBuffer;
676
732
 
677
733
  /* init */
678
- esr.ref = ZSTD_createCCtx();
734
+ DEBUGLOG(4, "ZDICT_analyzeEntropy");
735
+ if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
736
+ for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
737
+ for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
738
+ for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
739
+ for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
740
+ memset(repOffset, 0, sizeof(repOffset));
741
+ repOffset[1] = repOffset[4] = repOffset[8] = 1;
742
+ memset(bestRepOffset, 0, sizeof(bestRepOffset));
743
+ if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT;
744
+ params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
745
+
746
+ esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
679
747
  esr.zc = ZSTD_createCCtx();
680
- esr.workPlace = malloc(ZSTD_BLOCKSIZE_ABSOLUTEMAX);
681
- if (!esr.ref || !esr.zc || !esr.workPlace) {
748
+ esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
749
+ if (!esr.dict || !esr.zc || !esr.workPlace) {
682
750
  eSize = ERROR(memory_allocation);
683
751
  DISPLAYLEVEL(1, "Not enough memory \n");
684
752
  goto _cleanup;
685
753
  }
686
- if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; } /* too large dictionary */
687
- for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
688
- for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1;
689
- for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
690
- for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
691
- memset(repOffset, 0, sizeof(repOffset));
692
- repOffset[1] = repOffset[4] = repOffset[8] = 1;
693
- memset(bestRepOffset, 0, sizeof(bestRepOffset));
694
- if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
695
- params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
696
- { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
697
- if (ZSTD_isError(beginResult)) {
698
- eSize = ERROR(GENERIC);
699
- DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
700
- goto _cleanup;
701
- } }
702
754
 
703
- /* collect stats on all files */
755
+ /* collect stats on all samples */
704
756
  for (u=0; u<nbFiles; u++) {
705
- ZDICT_countEStats(esr, params,
757
+ ZDICT_countEStats(esr, &params,
706
758
  countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
707
759
  (const char*)srcBuffer + pos, fileSizes[u],
708
760
  notificationLevel);
709
761
  pos += fileSizes[u];
710
762
  }
711
763
 
712
- /* analyze */
713
- errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
714
- if (HUF_isError(errorCode)) {
715
- eSize = ERROR(GENERIC);
716
- DISPLAYLEVEL(1, "HUF_buildCTable error \n");
717
- goto _cleanup;
764
+ /* analyze, build stats, starting with literals */
765
+ { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
766
+ if (HUF_isError(maxNbBits)) {
767
+ eSize = maxNbBits;
768
+ DISPLAYLEVEL(1, " HUF_buildCTable error \n");
769
+ goto _cleanup;
770
+ }
771
+ if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
772
+ DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
773
+ ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
774
+ maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
775
+ assert(maxNbBits==9);
776
+ }
777
+ huffLog = (U32)maxNbBits;
718
778
  }
719
- huffLog = (U32)errorCode;
720
779
 
721
780
  /* looking for most common first offsets */
722
781
  { U32 offset;
@@ -726,27 +785,27 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
726
785
  /* note : the result of this phase should be used to better appreciate the impact on statistics */
727
786
 
728
787
  total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
729
- errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
788
+ errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1);
730
789
  if (FSE_isError(errorCode)) {
731
- eSize = ERROR(GENERIC);
790
+ eSize = errorCode;
732
791
  DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
733
792
  goto _cleanup;
734
793
  }
735
794
  Offlog = (U32)errorCode;
736
795
 
737
796
  total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
738
- errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
797
+ errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1);
739
798
  if (FSE_isError(errorCode)) {
740
- eSize = ERROR(GENERIC);
799
+ eSize = errorCode;
741
800
  DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
742
801
  goto _cleanup;
743
802
  }
744
803
  mlLog = (U32)errorCode;
745
804
 
746
805
  total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
747
- errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
806
+ errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1);
748
807
  if (FSE_isError(errorCode)) {
749
- eSize = ERROR(GENERIC);
808
+ eSize = errorCode;
750
809
  DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
751
810
  goto _cleanup;
752
811
  }
@@ -755,7 +814,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
755
814
  /* write result to buffer */
756
815
  { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
757
816
  if (HUF_isError(hhSize)) {
758
- eSize = ERROR(GENERIC);
817
+ eSize = hhSize;
759
818
  DISPLAYLEVEL(1, "HUF_writeCTable error \n");
760
819
  goto _cleanup;
761
820
  }
@@ -766,7 +825,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
766
825
 
767
826
  { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
768
827
  if (FSE_isError(ohSize)) {
769
- eSize = ERROR(GENERIC);
828
+ eSize = ohSize;
770
829
  DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
771
830
  goto _cleanup;
772
831
  }
@@ -777,7 +836,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
777
836
 
778
837
  { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
779
838
  if (FSE_isError(mhSize)) {
780
- eSize = ERROR(GENERIC);
839
+ eSize = mhSize;
781
840
  DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
782
841
  goto _cleanup;
783
842
  }
@@ -788,7 +847,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
788
847
 
789
848
  { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
790
849
  if (FSE_isError(lhSize)) {
791
- eSize = ERROR(GENERIC);
850
+ eSize = lhSize;
792
851
  DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
793
852
  goto _cleanup;
794
853
  }
@@ -798,7 +857,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
798
857
  }
799
858
 
800
859
  if (maxDstSize<12) {
801
- eSize = ERROR(GENERIC);
860
+ eSize = ERROR(dstSize_tooSmall);
802
861
  DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
803
862
  goto _cleanup;
804
863
  }
@@ -813,11 +872,10 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
813
872
  MEM_writeLE32(dstPtr+4, repStartValue[1]);
814
873
  MEM_writeLE32(dstPtr+8, repStartValue[2]);
815
874
  #endif
816
- //dstPtr += 12;
817
875
  eSize += 12;
818
876
 
819
877
  _cleanup:
820
- ZSTD_freeCCtx(esr.ref);
878
+ ZSTD_freeCDict(esr.dict);
821
879
  ZSTD_freeCCtx(esr.zc);
822
880
  free(esr.workPlace);
823
881
 
@@ -825,26 +883,68 @@ _cleanup:
825
883
  }
826
884
 
827
885
 
828
- size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
829
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
830
- ZDICT_params_t params)
886
+
887
+ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
888
+ const void* customDictContent, size_t dictContentSize,
889
+ const void* samplesBuffer, const size_t* samplesSizes,
890
+ unsigned nbSamples, ZDICT_params_t params)
831
891
  {
832
892
  size_t hSize;
833
- int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
893
+ #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
894
+ BYTE header[HBUFFSIZE];
895
+ int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
834
896
  U32 const notificationLevel = params.notificationLevel;
835
897
 
898
+ /* check conditions */
899
+ DEBUGLOG(4, "ZDICT_finalizeDictionary");
900
+ if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
901
+ if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
902
+ if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
903
+
836
904
  /* dictionary header */
837
- MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
838
- { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
905
+ MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
906
+ { U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
839
907
  U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
840
908
  U32 const dictID = params.dictID ? params.dictID : compliantID;
841
- MEM_writeLE32((char*)dictBuffer+4, dictID);
909
+ MEM_writeLE32(header+4, dictID);
842
910
  }
843
911
  hSize = 8;
844
912
 
845
913
  /* entropy tables */
846
914
  DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
847
915
  DISPLAYLEVEL(2, "statistics ... \n");
916
+ { size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
917
+ compressionLevel,
918
+ samplesBuffer, samplesSizes, nbSamples,
919
+ customDictContent, dictContentSize,
920
+ notificationLevel);
921
+ if (ZDICT_isError(eSize)) return eSize;
922
+ hSize += eSize;
923
+ }
924
+
925
+ /* copy elements in final buffer ; note : src and dst buffer can overlap */
926
+ if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
927
+ { size_t const dictSize = hSize + dictContentSize;
928
+ char* dictEnd = (char*)dictBuffer + dictSize;
929
+ memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
930
+ memcpy(dictBuffer, header, hSize);
931
+ return dictSize;
932
+ }
933
+ }
934
+
935
+
936
+ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
937
+ void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
938
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
939
+ ZDICT_params_t params)
940
+ {
941
+ int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
942
+ U32 const notificationLevel = params.notificationLevel;
943
+ size_t hSize = 8;
944
+
945
+ /* calculate entropy tables */
946
+ DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
947
+ DISPLAYLEVEL(2, "statistics ... \n");
848
948
  { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
849
949
  compressionLevel,
850
950
  samplesBuffer, samplesSizes, nbSamples,
@@ -854,21 +954,32 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
854
954
  hSize += eSize;
855
955
  }
856
956
 
957
+ /* add dictionary header (after entropy tables) */
958
+ MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
959
+ { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
960
+ U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
961
+ U32 const dictID = params.dictID ? params.dictID : compliantID;
962
+ MEM_writeLE32((char*)dictBuffer+4, dictID);
963
+ }
857
964
 
858
965
  if (hSize + dictContentSize < dictBufferCapacity)
859
966
  memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
860
967
  return MIN(dictBufferCapacity, hSize+dictContentSize);
861
968
  }
862
969
 
863
-
864
- /*! ZDICT_trainFromBuffer_unsafe() :
970
+ /* Hidden declaration for dbio.c */
971
+ size_t ZDICT_trainFromBuffer_unsafe_legacy(
972
+ void* dictBuffer, size_t maxDictSize,
973
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
974
+ ZDICT_legacy_params_t params);
975
+ /*! ZDICT_trainFromBuffer_unsafe_legacy() :
865
976
  * Warning : `samplesBuffer` must be followed by noisy guard band.
866
977
  * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
867
978
  */
868
- size_t ZDICT_trainFromBuffer_unsafe(
979
+ size_t ZDICT_trainFromBuffer_unsafe_legacy(
869
980
  void* dictBuffer, size_t maxDictSize,
870
981
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
871
- ZDICT_params_t params)
982
+ ZDICT_legacy_params_t params)
872
983
  {
873
984
  U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
874
985
  dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
@@ -877,58 +988,63 @@ size_t ZDICT_trainFromBuffer_unsafe(
877
988
  size_t const targetDictSize = maxDictSize;
878
989
  size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
879
990
  size_t dictSize = 0;
880
- U32 const notificationLevel = params.notificationLevel;
991
+ U32 const notificationLevel = params.zParams.notificationLevel;
881
992
 
882
993
  /* checks */
883
994
  if (!dictList) return ERROR(memory_allocation);
884
- if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
885
- if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
995
+ if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */
996
+ if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */
886
997
 
887
998
  /* init */
888
999
  ZDICT_initDictItem(dictList);
889
1000
 
890
1001
  /* build dictionary */
891
- ZDICT_trainBuffer(dictList, dictListSize,
892
- samplesBuffer, samplesBuffSize,
893
- samplesSizes, nbSamples,
894
- minRep, notificationLevel);
1002
+ ZDICT_trainBuffer_legacy(dictList, dictListSize,
1003
+ samplesBuffer, samplesBuffSize,
1004
+ samplesSizes, nbSamples,
1005
+ minRep, notificationLevel);
895
1006
 
896
1007
  /* display best matches */
897
- if (params.notificationLevel>= 3) {
898
- U32 const nb = MIN(25, dictList[0].pos);
899
- U32 const dictContentSize = ZDICT_dictSize(dictList);
900
- U32 u;
901
- DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
902
- DISPLAYLEVEL(3, "list %u best segments \n", nb);
903
- for (u=1; u<=nb; u++) {
904
- U32 pos = dictList[u].pos;
905
- U32 length = dictList[u].length;
906
- U32 printedLength = MIN(40, length);
1008
+ if (params.zParams.notificationLevel>= 3) {
1009
+ unsigned const nb = MIN(25, dictList[0].pos);
1010
+ unsigned const dictContentSize = ZDICT_dictSize(dictList);
1011
+ unsigned u;
1012
+ DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
1013
+ DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
1014
+ for (u=1; u<nb; u++) {
1015
+ unsigned const pos = dictList[u].pos;
1016
+ unsigned const length = dictList[u].length;
1017
+ U32 const printedLength = MIN(40, length);
1018
+ if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
1019
+ free(dictList);
1020
+ return ERROR(GENERIC); /* should never happen */
1021
+ }
907
1022
  DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
908
- u, length, pos, dictList[u].savings);
1023
+ u, length, pos, (unsigned)dictList[u].savings);
909
1024
  ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
910
1025
  DISPLAYLEVEL(3, "| \n");
911
1026
  } }
912
1027
 
913
1028
 
914
1029
  /* create dictionary */
915
- { U32 dictContentSize = ZDICT_dictSize(dictList);
916
- if (dictContentSize < targetDictSize/3) {
917
- DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
1030
+ { unsigned dictContentSize = ZDICT_dictSize(dictList);
1031
+ if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
1032
+ if (dictContentSize < targetDictSize/4) {
1033
+ DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
1034
+ if (samplesBuffSize < 10 * targetDictSize)
1035
+ DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
918
1036
  if (minRep > MINRATIO) {
919
1037
  DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
920
1038
  DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
921
1039
  }
922
- if (samplesBuffSize < 10 * targetDictSize)
923
- DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
924
1040
  }
925
1041
 
926
1042
  if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
927
- U32 proposedSelectivity = selectivity-1;
1043
+ unsigned proposedSelectivity = selectivity-1;
928
1044
  while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
929
- DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
1045
+ DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
930
1046
  DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
931
- DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n");
1047
+ DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
932
1048
  }
933
1049
 
934
1050
  /* limit dictionary size */
@@ -954,7 +1070,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
954
1070
 
955
1071
  dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
956
1072
  samplesBuffer, samplesSizes, nbSamples,
957
- params);
1073
+ params.zParams);
958
1074
  }
959
1075
 
960
1076
  /* clean up */
@@ -963,11 +1079,12 @@ size_t ZDICT_trainFromBuffer_unsafe(
963
1079
  }
964
1080
 
965
1081
 
966
- /* issue : samplesBuffer need to be followed by a noisy guard band.
967
- * work around : duplicate the buffer, and add the noise */
968
- size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
969
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
970
- ZDICT_params_t params)
1082
+ /* ZDICT_trainFromBuffer_legacy() :
1083
+ * issue : samplesBuffer need to be followed by a noisy guard band.
1084
+ * work around : duplicate the buffer, and add the noise */
1085
+ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
1086
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1087
+ ZDICT_legacy_params_t params)
971
1088
  {
972
1089
  size_t result;
973
1090
  void* newBuff;
@@ -980,10 +1097,9 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
980
1097
  memcpy(newBuff, samplesBuffer, sBuffSize);
981
1098
  ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
982
1099
 
983
- result = ZDICT_trainFromBuffer_unsafe(
984
- dictBuffer, dictBufferCapacity,
985
- newBuff, samplesSizes, nbSamples,
986
- params);
1100
+ result =
1101
+ ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
1102
+ samplesSizes, nbSamples, params);
987
1103
  free(newBuff);
988
1104
  return result;
989
1105
  }
@@ -992,15 +1108,23 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
992
1108
  size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
993
1109
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
994
1110
  {
995
- ZDICT_params_t params;
1111
+ ZDICT_fastCover_params_t params;
1112
+ DEBUGLOG(3, "ZDICT_trainFromBuffer");
996
1113
  memset(&params, 0, sizeof(params));
997
- return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
998
- samplesBuffer, samplesSizes, nbSamples,
999
- params);
1114
+ params.d = 8;
1115
+ params.steps = 4;
1116
+ /* Use default level since no compression level information is available */
1117
+ params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
1118
+ #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
1119
+ params.zParams.notificationLevel = DEBUGLEVEL;
1120
+ #endif
1121
+ return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
1122
+ samplesBuffer, samplesSizes, nbSamples,
1123
+ &params);
1000
1124
  }
1001
1125
 
1002
1126
  size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1003
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1127
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1004
1128
  {
1005
1129
  ZDICT_params_t params;
1006
1130
  memset(&params, 0, sizeof(params));