extzstd 0.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. checksums.yaml +5 -5
  2. data/HISTORY.ja.md +39 -0
  3. data/README.md +38 -56
  4. data/contrib/zstd/CHANGELOG +613 -0
  5. data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
  6. data/contrib/zstd/CONTRIBUTING.md +406 -0
  7. data/contrib/zstd/COPYING +339 -0
  8. data/contrib/zstd/Makefile +420 -0
  9. data/contrib/zstd/README.md +179 -41
  10. data/contrib/zstd/TESTING.md +44 -0
  11. data/contrib/zstd/appveyor.yml +292 -0
  12. data/contrib/zstd/lib/BUCK +234 -0
  13. data/contrib/zstd/lib/Makefile +451 -0
  14. data/contrib/zstd/lib/README.md +207 -0
  15. data/contrib/zstd/{common → lib/common}/bitstream.h +187 -138
  16. data/contrib/zstd/lib/common/compiler.h +288 -0
  17. data/contrib/zstd/lib/common/cpu.h +213 -0
  18. data/contrib/zstd/lib/common/debug.c +24 -0
  19. data/contrib/zstd/lib/common/debug.h +107 -0
  20. data/contrib/zstd/lib/common/entropy_common.c +362 -0
  21. data/contrib/zstd/{common → lib/common}/error_private.c +25 -12
  22. data/contrib/zstd/{common → lib/common}/error_private.h +14 -10
  23. data/contrib/zstd/{common → lib/common}/fse.h +173 -92
  24. data/contrib/zstd/{common → lib/common}/fse_decompress.c +149 -85
  25. data/contrib/zstd/lib/common/huf.h +361 -0
  26. data/contrib/zstd/{common → lib/common}/mem.h +115 -59
  27. data/contrib/zstd/lib/common/pool.c +350 -0
  28. data/contrib/zstd/lib/common/pool.h +84 -0
  29. data/contrib/zstd/lib/common/threading.c +122 -0
  30. data/contrib/zstd/lib/common/threading.h +155 -0
  31. data/contrib/zstd/{common → lib/common}/xxhash.c +55 -96
  32. data/contrib/zstd/{common → lib/common}/xxhash.h +23 -47
  33. data/contrib/zstd/lib/common/zstd_common.c +83 -0
  34. data/contrib/zstd/lib/common/zstd_deps.h +111 -0
  35. data/contrib/zstd/lib/common/zstd_errors.h +95 -0
  36. data/contrib/zstd/lib/common/zstd_internal.h +478 -0
  37. data/contrib/zstd/{compress → lib/compress}/fse_compress.c +214 -319
  38. data/contrib/zstd/lib/compress/hist.c +181 -0
  39. data/contrib/zstd/lib/compress/hist.h +75 -0
  40. data/contrib/zstd/lib/compress/huf_compress.c +913 -0
  41. data/contrib/zstd/lib/compress/zstd_compress.c +5208 -0
  42. data/contrib/zstd/lib/compress/zstd_compress_internal.h +1203 -0
  43. data/contrib/zstd/lib/compress/zstd_compress_literals.c +158 -0
  44. data/contrib/zstd/lib/compress/zstd_compress_literals.h +29 -0
  45. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +433 -0
  46. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +54 -0
  47. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +849 -0
  48. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +32 -0
  49. data/contrib/zstd/lib/compress/zstd_cwksp.h +561 -0
  50. data/contrib/zstd/lib/compress/zstd_double_fast.c +521 -0
  51. data/contrib/zstd/lib/compress/zstd_double_fast.h +38 -0
  52. data/contrib/zstd/lib/compress/zstd_fast.c +496 -0
  53. data/contrib/zstd/lib/compress/zstd_fast.h +37 -0
  54. data/contrib/zstd/lib/compress/zstd_lazy.c +1412 -0
  55. data/contrib/zstd/lib/compress/zstd_lazy.h +87 -0
  56. data/contrib/zstd/lib/compress/zstd_ldm.c +660 -0
  57. data/contrib/zstd/lib/compress/zstd_ldm.h +116 -0
  58. data/contrib/zstd/lib/compress/zstd_opt.c +1345 -0
  59. data/contrib/zstd/lib/compress/zstd_opt.h +56 -0
  60. data/contrib/zstd/lib/compress/zstdmt_compress.c +1811 -0
  61. data/contrib/zstd/lib/compress/zstdmt_compress.h +110 -0
  62. data/contrib/zstd/lib/decompress/huf_decompress.c +1350 -0
  63. data/contrib/zstd/lib/decompress/zstd_ddict.c +244 -0
  64. data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
  65. data/contrib/zstd/lib/decompress/zstd_decompress.c +1930 -0
  66. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1540 -0
  67. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +62 -0
  68. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +190 -0
  69. data/contrib/zstd/{common → lib/deprecated}/zbuff.h +68 -45
  70. data/contrib/zstd/lib/deprecated/zbuff_common.c +26 -0
  71. data/contrib/zstd/lib/deprecated/zbuff_compress.c +147 -0
  72. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +75 -0
  73. data/contrib/zstd/lib/dictBuilder/cover.c +1245 -0
  74. data/contrib/zstd/lib/dictBuilder/cover.h +157 -0
  75. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.c +3 -3
  76. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.h +0 -0
  77. data/contrib/zstd/lib/dictBuilder/fastcover.c +758 -0
  78. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/zdict.c +318 -194
  79. data/contrib/zstd/lib/dictBuilder/zdict.h +305 -0
  80. data/contrib/zstd/{legacy → lib/legacy}/zstd_legacy.h +171 -15
  81. data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.c +191 -124
  82. data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.h +19 -5
  83. data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.c +125 -125
  84. data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.h +19 -5
  85. data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.c +125 -124
  86. data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.h +20 -6
  87. data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.c +151 -299
  88. data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.h +19 -5
  89. data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.c +237 -243
  90. data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.h +19 -6
  91. data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.c +130 -143
  92. data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.h +18 -5
  93. data/contrib/zstd/{legacy → lib/legacy}/zstd_v07.c +158 -157
  94. data/contrib/zstd/{legacy → lib/legacy}/zstd_v07.h +19 -5
  95. data/contrib/zstd/lib/libzstd.pc.in +15 -0
  96. data/contrib/zstd/lib/zstd.h +2391 -0
  97. data/ext/depend +2 -0
  98. data/ext/extconf.rb +15 -6
  99. data/ext/extzstd.c +76 -145
  100. data/ext/extzstd.h +80 -31
  101. data/ext/extzstd_stream.c +417 -142
  102. data/ext/libzstd_conf.h +8 -0
  103. data/ext/zstd_common.c +10 -7
  104. data/ext/zstd_compress.c +14 -5
  105. data/ext/zstd_decompress.c +5 -4
  106. data/ext/zstd_dictbuilder.c +9 -4
  107. data/ext/zstd_dictbuilder_fastcover.c +3 -0
  108. data/ext/zstd_legacy_v01.c +3 -1
  109. data/ext/zstd_legacy_v02.c +3 -1
  110. data/ext/zstd_legacy_v03.c +3 -1
  111. data/ext/zstd_legacy_v04.c +3 -1
  112. data/ext/zstd_legacy_v05.c +3 -1
  113. data/ext/zstd_legacy_v06.c +3 -1
  114. data/ext/zstd_legacy_v07.c +3 -1
  115. data/gemstub.rb +10 -24
  116. data/lib/extzstd.rb +64 -179
  117. data/lib/extzstd/version.rb +6 -1
  118. data/test/test_basic.rb +9 -6
  119. metadata +113 -57
  120. data/HISTORY.ja +0 -5
  121. data/contrib/zstd/common/entropy_common.c +0 -225
  122. data/contrib/zstd/common/huf.h +0 -228
  123. data/contrib/zstd/common/zstd_common.c +0 -83
  124. data/contrib/zstd/common/zstd_errors.h +0 -60
  125. data/contrib/zstd/common/zstd_internal.h +0 -267
  126. data/contrib/zstd/compress/huf_compress.c +0 -533
  127. data/contrib/zstd/compress/zbuff_compress.c +0 -319
  128. data/contrib/zstd/compress/zstd_compress.c +0 -3264
  129. data/contrib/zstd/compress/zstd_opt.h +0 -900
  130. data/contrib/zstd/decompress/huf_decompress.c +0 -883
  131. data/contrib/zstd/decompress/zbuff_decompress.c +0 -252
  132. data/contrib/zstd/decompress/zstd_decompress.c +0 -1842
  133. data/contrib/zstd/dictBuilder/zdict.h +0 -111
  134. data/contrib/zstd/zstd.h +0 -640
@@ -1,18 +1,20 @@
1
- /**
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
1
+ /*
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
- * This source code is licensed under the BSD-style license found in the
6
- * LICENSE file in the root directory of this source tree. An additional grant
7
- * of patent rights can be found in the PATENTS file in the same directory.
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
8
9
  */
9
10
 
10
11
 
11
12
  /*-**************************************
12
13
  * Tuning parameters
13
14
  ****************************************/
15
+ #define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */
14
16
  #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
15
- #define ZDICT_MIN_SAMPLES_SIZE 512
17
+ #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
16
18
 
17
19
 
18
20
  /*-**************************************
@@ -35,18 +37,18 @@
35
37
  #include <stdio.h> /* fprintf, fopen, ftello64 */
36
38
  #include <time.h> /* clock */
37
39
 
38
- #include "mem.h" /* read */
39
- #include "error_private.h"
40
- #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
40
+ #include "../common/mem.h" /* read */
41
+ #include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
41
42
  #define HUF_STATIC_LINKING_ONLY
42
- #include "huf.h"
43
- #include "zstd_internal.h" /* includes zstd.h */
44
- #include "xxhash.h"
43
+ #include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
44
+ #include "../common/zstd_internal.h" /* includes zstd.h */
45
+ #include "../common/xxhash.h" /* XXH64 */
45
46
  #include "divsufsort.h"
46
47
  #ifndef ZDICT_STATIC_LINKING_ONLY
47
48
  # define ZDICT_STATIC_LINKING_ONLY
48
49
  #endif
49
50
  #include "zdict.h"
51
+ #include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
50
52
 
51
53
 
52
54
  /*-*************************************
@@ -60,17 +62,15 @@
60
62
 
61
63
  #define NOISELENGTH 32
62
64
 
63
- #define MINRATIO 4
64
- static const int g_compressionLevel_default = 5;
65
65
  static const U32 g_selectivity_default = 9;
66
- static const size_t g_provision_entropySize = 200;
67
- static const size_t g_min_fast_dictContent = 192;
68
66
 
69
67
 
70
68
  /*-*************************************
71
69
  * Console display
72
70
  ***************************************/
71
+ #undef DISPLAY
73
72
  #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
73
+ #undef DISPLAYLEVEL
74
74
  #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
75
75
 
76
76
  static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
@@ -97,15 +97,35 @@ const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(error
97
97
  unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
98
98
  {
99
99
  if (dictSize < 8) return 0;
100
- if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0;
100
+ if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
101
101
  return MEM_readLE32((const char*)dictBuffer + 4);
102
102
  }
103
103
 
104
+ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
105
+ {
106
+ size_t headerSize;
107
+ if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
108
+
109
+ { ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
110
+ U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
111
+ if (!bs || !wksp) {
112
+ headerSize = ERROR(memory_allocation);
113
+ } else {
114
+ ZSTD_reset_compressedBlockState(bs);
115
+ headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
116
+ }
117
+
118
+ free(bs);
119
+ free(wksp);
120
+ }
121
+
122
+ return headerSize;
123
+ }
104
124
 
105
125
  /*-********************************************************
106
126
  * Dictionary training functions
107
127
  **********************************************************/
108
- static unsigned ZDICT_NbCommonBytes (register size_t val)
128
+ static unsigned ZDICT_NbCommonBytes (size_t val)
109
129
  {
110
130
  if (MEM_isLittleEndian()) {
111
131
  if (MEM_64bits()) {
@@ -209,7 +229,6 @@ static dictItem ZDICT_analyzePos(
209
229
  U32 cumulLength[LLIMIT] = {0};
210
230
  U32 savings[LLIMIT] = {0};
211
231
  const BYTE* b = (const BYTE*)buffer;
212
- size_t length;
213
232
  size_t maxLength = LLIMIT;
214
233
  size_t pos = suffix[start];
215
234
  U32 end = start;
@@ -224,26 +243,30 @@ static dictItem ZDICT_analyzePos(
224
243
  ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
225
244
  ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
226
245
  /* skip and mark segment */
227
- U16 u16 = MEM_read16(b+pos+4);
228
- U32 u, e = 6;
229
- while (MEM_read16(b+pos+e) == u16) e+=2 ;
230
- if (b[pos+e] == b[pos+e-1]) e++;
231
- for (u=1; u<e; u++)
246
+ U16 const pattern16 = MEM_read16(b+pos+4);
247
+ U32 u, patternEnd = 6;
248
+ while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
249
+ if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
250
+ for (u=1; u<patternEnd; u++)
232
251
  doneMarks[pos+u] = 1;
233
252
  return solution;
234
253
  }
235
254
 
236
255
  /* look forward */
237
- do {
238
- end++;
239
- length = ZDICT_count(b + pos, b + suffix[end]);
240
- } while (length >=MINMATCHLENGTH);
256
+ { size_t length;
257
+ do {
258
+ end++;
259
+ length = ZDICT_count(b + pos, b + suffix[end]);
260
+ } while (length >= MINMATCHLENGTH);
261
+ }
241
262
 
242
263
  /* look backward */
243
- do {
244
- length = ZDICT_count(b + pos, b + *(suffix+start-1));
245
- if (length >=MINMATCHLENGTH) start--;
246
- } while(length >= MINMATCHLENGTH);
264
+ { size_t length;
265
+ do {
266
+ length = ZDICT_count(b + pos, b + *(suffix+start-1));
267
+ if (length >=MINMATCHLENGTH) start--;
268
+ } while(length >= MINMATCHLENGTH);
269
+ }
247
270
 
248
271
  /* exit if not found a minimum nb of repetitions */
249
272
  if (end-start < minRatio) {
@@ -254,15 +277,15 @@ static dictItem ZDICT_analyzePos(
254
277
  }
255
278
 
256
279
  { int i;
257
- U32 searchLength;
280
+ U32 mml;
258
281
  U32 refinedStart = start;
259
282
  U32 refinedEnd = end;
260
283
 
261
284
  DISPLAYLEVEL(4, "\n");
262
- DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
285
+ DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
263
286
  DISPLAYLEVEL(4, "\n");
264
287
 
265
- for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
288
+ for (mml = MINMATCHLENGTH ; ; mml++) {
266
289
  BYTE currentChar = 0;
267
290
  U32 currentCount = 0;
268
291
  U32 currentID = refinedStart;
@@ -270,13 +293,13 @@ static dictItem ZDICT_analyzePos(
270
293
  U32 selectedCount = 0;
271
294
  U32 selectedID = currentID;
272
295
  for (id =refinedStart; id < refinedEnd; id++) {
273
- if (b[ suffix[id] + searchLength] != currentChar) {
296
+ if (b[suffix[id] + mml] != currentChar) {
274
297
  if (currentCount > selectedCount) {
275
298
  selectedCount = currentCount;
276
299
  selectedID = currentID;
277
300
  }
278
301
  currentID = id;
279
- currentChar = b[ suffix[id] + searchLength];
302
+ currentChar = b[ suffix[id] + mml];
280
303
  currentCount = 0;
281
304
  }
282
305
  currentCount ++;
@@ -292,28 +315,31 @@ static dictItem ZDICT_analyzePos(
292
315
  refinedEnd = refinedStart + selectedCount;
293
316
  }
294
317
 
295
- /* evaluate gain based on new ref */
318
+ /* evaluate gain based on new dict */
296
319
  start = refinedStart;
297
320
  pos = suffix[refinedStart];
298
321
  end = start;
299
322
  memset(lengthList, 0, sizeof(lengthList));
300
323
 
301
324
  /* look forward */
302
- do {
303
- end++;
304
- length = ZDICT_count(b + pos, b + suffix[end]);
305
- if (length >= LLIMIT) length = LLIMIT-1;
306
- lengthList[length]++;
307
- } while (length >=MINMATCHLENGTH);
325
+ { size_t length;
326
+ do {
327
+ end++;
328
+ length = ZDICT_count(b + pos, b + suffix[end]);
329
+ if (length >= LLIMIT) length = LLIMIT-1;
330
+ lengthList[length]++;
331
+ } while (length >=MINMATCHLENGTH);
332
+ }
308
333
 
309
334
  /* look backward */
310
- length = MINMATCHLENGTH;
311
- while ((length >= MINMATCHLENGTH) & (start > 0)) {
312
- length = ZDICT_count(b + pos, b + suffix[start - 1]);
313
- if (length >= LLIMIT) length = LLIMIT - 1;
314
- lengthList[length]++;
315
- if (length >= MINMATCHLENGTH) start--;
316
- }
335
+ { size_t length = MINMATCHLENGTH;
336
+ while ((length >= MINMATCHLENGTH) & (start > 0)) {
337
+ length = ZDICT_count(b + pos, b + suffix[start - 1]);
338
+ if (length >= LLIMIT) length = LLIMIT - 1;
339
+ lengthList[length]++;
340
+ if (length >= MINMATCHLENGTH) start--;
341
+ }
342
+ }
317
343
 
318
344
  /* largest useful length */
319
345
  memset(cumulLength, 0, sizeof(cumulLength));
@@ -337,8 +363,8 @@ static dictItem ZDICT_analyzePos(
337
363
  for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
338
364
  savings[i] = savings[i-1] + (lengthList[i] * (i-3));
339
365
 
340
- DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
341
- (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
366
+ DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
367
+ (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
342
368
 
343
369
  solution.pos = (U32)pos;
344
370
  solution.length = (U32)maxLength;
@@ -347,12 +373,12 @@ static dictItem ZDICT_analyzePos(
347
373
  /* mark positions done */
348
374
  { U32 id;
349
375
  for (id=start; id<end; id++) {
350
- U32 p, pEnd;
376
+ U32 p, pEnd, length;
351
377
  U32 const testedPos = suffix[id];
352
378
  if (testedPos == pos)
353
379
  length = solution.length;
354
380
  else {
355
- length = ZDICT_count(b+pos, b+testedPos);
381
+ length = (U32)ZDICT_count(b+pos, b+testedPos);
356
382
  if (length > solution.length) length = solution.length;
357
383
  }
358
384
  pEnd = (U32)(testedPos + length);
@@ -364,21 +390,35 @@ static dictItem ZDICT_analyzePos(
364
390
  }
365
391
 
366
392
 
367
- /*! ZDICT_checkMerge
393
+ static int isIncluded(const void* in, const void* container, size_t length)
394
+ {
395
+ const char* const ip = (const char*) in;
396
+ const char* const into = (const char*) container;
397
+ size_t u;
398
+
399
+ for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */
400
+ if (ip[u] != into[u]) break;
401
+ }
402
+
403
+ return u==length;
404
+ }
405
+
406
+ /*! ZDICT_tryMerge() :
368
407
  check if dictItem can be merged, do it if possible
369
408
  @return : id of destination elt, 0 if not merged
370
409
  */
371
- static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
410
+ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
372
411
  {
373
412
  const U32 tableSize = table->pos;
374
413
  const U32 eltEnd = elt.pos + elt.length;
414
+ const char* const buf = (const char*) buffer;
375
415
 
376
416
  /* tail overlap */
377
417
  U32 u; for (u=1; u<tableSize; u++) {
378
418
  if (u==eltNbToSkip) continue;
379
419
  if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
380
420
  /* append */
381
- U32 addedLength = table[u].pos - elt.pos;
421
+ U32 const addedLength = table[u].pos - elt.pos;
382
422
  table[u].length += addedLength;
383
423
  table[u].pos = elt.pos;
384
424
  table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
@@ -394,9 +434,10 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
394
434
  /* front overlap */
395
435
  for (u=1; u<tableSize; u++) {
396
436
  if (u==eltNbToSkip) continue;
437
+
397
438
  if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
398
439
  /* append */
399
- int addedLength = (int)eltEnd - (table[u].pos + table[u].length);
440
+ int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
400
441
  table[u].savings += elt.length / 8; /* rough approx bonus */
401
442
  if (addedLength > 0) { /* otherwise, elt fully included into existing */
402
443
  table[u].length += addedLength;
@@ -408,7 +449,18 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
408
449
  table[u] = table[u-1], u--;
409
450
  table[u] = elt;
410
451
  return u;
411
- } }
452
+ }
453
+
454
+ if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
455
+ if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
456
+ size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
457
+ table[u].pos = elt.pos;
458
+ table[u].savings += (U32)(elt.savings * addedLength / elt.length);
459
+ table[u].length = MIN(elt.length, table[u].length + 1);
460
+ return u;
461
+ }
462
+ }
463
+ }
412
464
 
413
465
  return 0;
414
466
  }
@@ -416,8 +468,8 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
416
468
 
417
469
  static void ZDICT_removeDictItem(dictItem* table, U32 id)
418
470
  {
419
- /* convention : first element is nb of elts */
420
- U32 const max = table->pos;
471
+ /* convention : table[0].pos stores nb of elts */
472
+ U32 const max = table[0].pos;
421
473
  U32 u;
422
474
  if (!id) return; /* protection, should never happen */
423
475
  for (u=id; u<max-1; u++)
@@ -426,14 +478,14 @@ static void ZDICT_removeDictItem(dictItem* table, U32 id)
426
478
  }
427
479
 
428
480
 
429
- static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
481
+ static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
430
482
  {
431
483
  /* merge if possible */
432
- U32 mergeId = ZDICT_checkMerge(table, elt, 0);
484
+ U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
433
485
  if (mergeId) {
434
486
  U32 newMerge = 1;
435
487
  while (newMerge) {
436
- newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
488
+ newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
437
489
  if (newMerge) ZDICT_removeDictItem(table, mergeId);
438
490
  mergeId = newMerge;
439
491
  }
@@ -464,10 +516,10 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
464
516
  }
465
517
 
466
518
 
467
- static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
519
+ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
468
520
  const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
469
521
  const size_t* fileSizes, unsigned nbFiles,
470
- U32 minRatio, U32 notificationLevel)
522
+ unsigned minRatio, U32 notificationLevel)
471
523
  {
472
524
  int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
473
525
  int* const suffix = suffix0+1;
@@ -478,10 +530,11 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
478
530
  clock_t displayClock = 0;
479
531
  clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
480
532
 
533
+ # undef DISPLAYUPDATE
481
534
  # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
482
535
  if (ZDICT_clockSpan(displayClock) > refreshRate) \
483
536
  { displayClock = clock(); DISPLAY(__VA_ARGS__); \
484
- if (notificationLevel>=4) fflush(stdout); } }
537
+ if (notificationLevel>=4) fflush(stderr); } }
485
538
 
486
539
  /* init */
487
540
  DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
@@ -493,11 +546,11 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
493
546
  memset(doneMarks, 0, bufferSize+16);
494
547
 
495
548
  /* limit sample set size (divsufsort limitation)*/
496
- if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
549
+ if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
497
550
  while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
498
551
 
499
552
  /* sort */
500
- DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
553
+ DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
501
554
  { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
502
555
  if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
503
556
  }
@@ -522,7 +575,7 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
522
575
  if (doneMarks[cursor]) { cursor++; continue; }
523
576
  solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
524
577
  if (solution.length==0) { cursor++; continue; }
525
- ZDICT_insertDictItem(dictList, dictListSize, solution);
578
+ ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
526
579
  cursor += solution.length;
527
580
  DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
528
581
  } }
@@ -541,7 +594,7 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
541
594
  unsigned const prime1 = 2654435761U;
542
595
  unsigned const prime2 = 2246822519U;
543
596
  unsigned acc = prime1;
544
- size_t p=0;;
597
+ size_t p=0;
545
598
  for (p=0; p<length; p++) {
546
599
  acc *= prime2;
547
600
  ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
@@ -551,29 +604,31 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
551
604
 
552
605
  typedef struct
553
606
  {
554
- ZSTD_CCtx* ref;
555
- ZSTD_CCtx* zc;
556
- void* workPlace; /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */
607
+ ZSTD_CDict* dict; /* dictionary */
608
+ ZSTD_CCtx* zc; /* working context */
609
+ void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
557
610
  } EStats_ress_t;
558
611
 
559
612
  #define MAXREPOFFSET 1024
560
613
 
561
- static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
562
- U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
563
- const void* src, size_t srcSize, U32 notificationLevel)
614
+ static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
615
+ unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
616
+ const void* src, size_t srcSize,
617
+ U32 notificationLevel)
564
618
  {
565
- size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << params.cParams.windowLog);
619
+ size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
566
620
  size_t cSize;
567
621
 
568
622
  if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
569
- { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
570
- if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
623
+ { size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
624
+ if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
625
+
571
626
  }
572
- cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize);
573
- if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
627
+ cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
628
+ if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
574
629
 
575
630
  if (cSize) { /* if == 0; block is not compressible */
576
- const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
631
+ const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
577
632
 
578
633
  /* literals stats */
579
634
  { const BYTE* bytePtr;
@@ -611,17 +666,6 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
611
666
  } } }
612
667
  }
613
668
 
614
- /*
615
- static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
616
- {
617
- unsigned u;
618
- size_t max=0;
619
- for (u=0; u<nbFiles; u++)
620
- if (max < fileSizes[u]) max = fileSizes[u];
621
- return max;
622
- }
623
- */
624
-
625
669
  static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
626
670
  {
627
671
  size_t total=0;
@@ -646,26 +690,38 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
646
690
  }
647
691
  }
648
692
 
693
+ /* ZDICT_flatLit() :
694
+ * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
695
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
696
+ */
697
+ static void ZDICT_flatLit(unsigned* countLit)
698
+ {
699
+ int u;
700
+ for (u=1; u<256; u++) countLit[u] = 2;
701
+ countLit[0] = 4;
702
+ countLit[253] = 1;
703
+ countLit[254] = 1;
704
+ }
649
705
 
650
706
  #define OFFCODE_MAX 30 /* only applicable to first block */
651
707
  static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
652
- unsigned compressionLevel,
708
+ int compressionLevel,
653
709
  const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
654
710
  const void* dictBuffer, size_t dictBufferSize,
655
711
  unsigned notificationLevel)
656
712
  {
657
- U32 countLit[256];
713
+ unsigned countLit[256];
658
714
  HUF_CREATE_STATIC_CTABLE(hufTable, 255);
659
- U32 offcodeCount[OFFCODE_MAX+1];
715
+ unsigned offcodeCount[OFFCODE_MAX+1];
660
716
  short offcodeNCount[OFFCODE_MAX+1];
661
717
  U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
662
- U32 matchLengthCount[MaxML+1];
718
+ unsigned matchLengthCount[MaxML+1];
663
719
  short matchLengthNCount[MaxML+1];
664
- U32 litLengthCount[MaxLL+1];
720
+ unsigned litLengthCount[MaxLL+1];
665
721
  short litLengthNCount[MaxLL+1];
666
722
  U32 repOffset[MAXREPOFFSET];
667
723
  offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
668
- EStats_ress_t esr;
724
+ EStats_ress_t esr = { NULL, NULL, NULL };
669
725
  ZSTD_parameters params;
670
726
  U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
671
727
  size_t pos = 0, errorCode;
@@ -675,48 +731,51 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
675
731
  BYTE* dstPtr = (BYTE*)dstBuffer;
676
732
 
677
733
  /* init */
678
- esr.ref = ZSTD_createCCtx();
734
+ DEBUGLOG(4, "ZDICT_analyzeEntropy");
735
+ if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
736
+ for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
737
+ for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
738
+ for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
739
+ for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
740
+ memset(repOffset, 0, sizeof(repOffset));
741
+ repOffset[1] = repOffset[4] = repOffset[8] = 1;
742
+ memset(bestRepOffset, 0, sizeof(bestRepOffset));
743
+ if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT;
744
+ params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
745
+
746
+ esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
679
747
  esr.zc = ZSTD_createCCtx();
680
- esr.workPlace = malloc(ZSTD_BLOCKSIZE_ABSOLUTEMAX);
681
- if (!esr.ref || !esr.zc || !esr.workPlace) {
748
+ esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
749
+ if (!esr.dict || !esr.zc || !esr.workPlace) {
682
750
  eSize = ERROR(memory_allocation);
683
751
  DISPLAYLEVEL(1, "Not enough memory \n");
684
752
  goto _cleanup;
685
753
  }
686
- if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; } /* too large dictionary */
687
- for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
688
- for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1;
689
- for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
690
- for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
691
- memset(repOffset, 0, sizeof(repOffset));
692
- repOffset[1] = repOffset[4] = repOffset[8] = 1;
693
- memset(bestRepOffset, 0, sizeof(bestRepOffset));
694
- if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
695
- params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
696
- { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
697
- if (ZSTD_isError(beginResult)) {
698
- eSize = ERROR(GENERIC);
699
- DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
700
- goto _cleanup;
701
- } }
702
754
 
703
- /* collect stats on all files */
755
+ /* collect stats on all samples */
704
756
  for (u=0; u<nbFiles; u++) {
705
- ZDICT_countEStats(esr, params,
757
+ ZDICT_countEStats(esr, &params,
706
758
  countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
707
759
  (const char*)srcBuffer + pos, fileSizes[u],
708
760
  notificationLevel);
709
761
  pos += fileSizes[u];
710
762
  }
711
763
 
712
- /* analyze */
713
- errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
714
- if (HUF_isError(errorCode)) {
715
- eSize = ERROR(GENERIC);
716
- DISPLAYLEVEL(1, "HUF_buildCTable error \n");
717
- goto _cleanup;
764
+ /* analyze, build stats, starting with literals */
765
+ { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
766
+ if (HUF_isError(maxNbBits)) {
767
+ eSize = maxNbBits;
768
+ DISPLAYLEVEL(1, " HUF_buildCTable error \n");
769
+ goto _cleanup;
770
+ }
771
+ if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
772
+ DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
773
+ ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
774
+ maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
775
+ assert(maxNbBits==9);
776
+ }
777
+ huffLog = (U32)maxNbBits;
718
778
  }
719
- huffLog = (U32)errorCode;
720
779
 
721
780
  /* looking for most common first offsets */
722
781
  { U32 offset;
@@ -726,27 +785,27 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
726
785
  /* note : the result of this phase should be used to better appreciate the impact on statistics */
727
786
 
728
787
  total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
729
- errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
788
+ errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1);
730
789
  if (FSE_isError(errorCode)) {
731
- eSize = ERROR(GENERIC);
790
+ eSize = errorCode;
732
791
  DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
733
792
  goto _cleanup;
734
793
  }
735
794
  Offlog = (U32)errorCode;
736
795
 
737
796
  total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
738
- errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
797
+ errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1);
739
798
  if (FSE_isError(errorCode)) {
740
- eSize = ERROR(GENERIC);
799
+ eSize = errorCode;
741
800
  DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
742
801
  goto _cleanup;
743
802
  }
744
803
  mlLog = (U32)errorCode;
745
804
 
746
805
  total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
747
- errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
806
+ errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1);
748
807
  if (FSE_isError(errorCode)) {
749
- eSize = ERROR(GENERIC);
808
+ eSize = errorCode;
750
809
  DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
751
810
  goto _cleanup;
752
811
  }
@@ -755,7 +814,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
755
814
  /* write result to buffer */
756
815
  { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
757
816
  if (HUF_isError(hhSize)) {
758
- eSize = ERROR(GENERIC);
817
+ eSize = hhSize;
759
818
  DISPLAYLEVEL(1, "HUF_writeCTable error \n");
760
819
  goto _cleanup;
761
820
  }
@@ -766,7 +825,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
766
825
 
767
826
  { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
768
827
  if (FSE_isError(ohSize)) {
769
- eSize = ERROR(GENERIC);
828
+ eSize = ohSize;
770
829
  DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
771
830
  goto _cleanup;
772
831
  }
@@ -777,7 +836,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
777
836
 
778
837
  { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
779
838
  if (FSE_isError(mhSize)) {
780
- eSize = ERROR(GENERIC);
839
+ eSize = mhSize;
781
840
  DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
782
841
  goto _cleanup;
783
842
  }
@@ -788,7 +847,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
788
847
 
789
848
  { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
790
849
  if (FSE_isError(lhSize)) {
791
- eSize = ERROR(GENERIC);
850
+ eSize = lhSize;
792
851
  DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
793
852
  goto _cleanup;
794
853
  }
@@ -798,7 +857,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
798
857
  }
799
858
 
800
859
  if (maxDstSize<12) {
801
- eSize = ERROR(GENERIC);
860
+ eSize = ERROR(dstSize_tooSmall);
802
861
  DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
803
862
  goto _cleanup;
804
863
  }
@@ -813,11 +872,10 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
813
872
  MEM_writeLE32(dstPtr+4, repStartValue[1]);
814
873
  MEM_writeLE32(dstPtr+8, repStartValue[2]);
815
874
  #endif
816
- //dstPtr += 12;
817
875
  eSize += 12;
818
876
 
819
877
  _cleanup:
820
- ZSTD_freeCCtx(esr.ref);
878
+ ZSTD_freeCDict(esr.dict);
821
879
  ZSTD_freeCCtx(esr.zc);
822
880
  free(esr.workPlace);
823
881
 
@@ -825,26 +883,68 @@ _cleanup:
825
883
  }
826
884
 
827
885
 
828
- size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
829
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
830
- ZDICT_params_t params)
886
+
887
+ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
888
+ const void* customDictContent, size_t dictContentSize,
889
+ const void* samplesBuffer, const size_t* samplesSizes,
890
+ unsigned nbSamples, ZDICT_params_t params)
831
891
  {
832
892
  size_t hSize;
833
- int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
893
+ #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
894
+ BYTE header[HBUFFSIZE];
895
+ int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
834
896
  U32 const notificationLevel = params.notificationLevel;
835
897
 
898
+ /* check conditions */
899
+ DEBUGLOG(4, "ZDICT_finalizeDictionary");
900
+ if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
901
+ if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
902
+ if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
903
+
836
904
  /* dictionary header */
837
- MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
838
- { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
905
+ MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
906
+ { U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
839
907
  U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
840
908
  U32 const dictID = params.dictID ? params.dictID : compliantID;
841
- MEM_writeLE32((char*)dictBuffer+4, dictID);
909
+ MEM_writeLE32(header+4, dictID);
842
910
  }
843
911
  hSize = 8;
844
912
 
845
913
  /* entropy tables */
846
914
  DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
847
915
  DISPLAYLEVEL(2, "statistics ... \n");
916
+ { size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
917
+ compressionLevel,
918
+ samplesBuffer, samplesSizes, nbSamples,
919
+ customDictContent, dictContentSize,
920
+ notificationLevel);
921
+ if (ZDICT_isError(eSize)) return eSize;
922
+ hSize += eSize;
923
+ }
924
+
925
+ /* copy elements in final buffer ; note : src and dst buffer can overlap */
926
+ if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
927
+ { size_t const dictSize = hSize + dictContentSize;
928
+ char* dictEnd = (char*)dictBuffer + dictSize;
929
+ memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
930
+ memcpy(dictBuffer, header, hSize);
931
+ return dictSize;
932
+ }
933
+ }
934
+
935
+
936
+ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
937
+ void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
938
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
939
+ ZDICT_params_t params)
940
+ {
941
+ int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
942
+ U32 const notificationLevel = params.notificationLevel;
943
+ size_t hSize = 8;
944
+
945
+ /* calculate entropy tables */
946
+ DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
947
+ DISPLAYLEVEL(2, "statistics ... \n");
848
948
  { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
849
949
  compressionLevel,
850
950
  samplesBuffer, samplesSizes, nbSamples,
@@ -854,21 +954,32 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
854
954
  hSize += eSize;
855
955
  }
856
956
 
957
+ /* add dictionary header (after entropy tables) */
958
+ MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
959
+ { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
960
+ U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
961
+ U32 const dictID = params.dictID ? params.dictID : compliantID;
962
+ MEM_writeLE32((char*)dictBuffer+4, dictID);
963
+ }
857
964
 
858
965
  if (hSize + dictContentSize < dictBufferCapacity)
859
966
  memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
860
967
  return MIN(dictBufferCapacity, hSize+dictContentSize);
861
968
  }
862
969
 
863
-
864
- /*! ZDICT_trainFromBuffer_unsafe() :
970
+ /* Hidden declaration for dbio.c */
971
+ size_t ZDICT_trainFromBuffer_unsafe_legacy(
972
+ void* dictBuffer, size_t maxDictSize,
973
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
974
+ ZDICT_legacy_params_t params);
975
+ /*! ZDICT_trainFromBuffer_unsafe_legacy() :
865
976
  * Warning : `samplesBuffer` must be followed by noisy guard band.
866
977
  * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
867
978
  */
868
- size_t ZDICT_trainFromBuffer_unsafe(
979
+ size_t ZDICT_trainFromBuffer_unsafe_legacy(
869
980
  void* dictBuffer, size_t maxDictSize,
870
981
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
871
- ZDICT_params_t params)
982
+ ZDICT_legacy_params_t params)
872
983
  {
873
984
  U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
874
985
  dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
@@ -877,58 +988,63 @@ size_t ZDICT_trainFromBuffer_unsafe(
877
988
  size_t const targetDictSize = maxDictSize;
878
989
  size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
879
990
  size_t dictSize = 0;
880
- U32 const notificationLevel = params.notificationLevel;
991
+ U32 const notificationLevel = params.zParams.notificationLevel;
881
992
 
882
993
  /* checks */
883
994
  if (!dictList) return ERROR(memory_allocation);
884
- if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
885
- if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
995
+ if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */
996
+ if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */
886
997
 
887
998
  /* init */
888
999
  ZDICT_initDictItem(dictList);
889
1000
 
890
1001
  /* build dictionary */
891
- ZDICT_trainBuffer(dictList, dictListSize,
892
- samplesBuffer, samplesBuffSize,
893
- samplesSizes, nbSamples,
894
- minRep, notificationLevel);
1002
+ ZDICT_trainBuffer_legacy(dictList, dictListSize,
1003
+ samplesBuffer, samplesBuffSize,
1004
+ samplesSizes, nbSamples,
1005
+ minRep, notificationLevel);
895
1006
 
896
1007
  /* display best matches */
897
- if (params.notificationLevel>= 3) {
898
- U32 const nb = MIN(25, dictList[0].pos);
899
- U32 const dictContentSize = ZDICT_dictSize(dictList);
900
- U32 u;
901
- DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
902
- DISPLAYLEVEL(3, "list %u best segments \n", nb);
903
- for (u=1; u<=nb; u++) {
904
- U32 pos = dictList[u].pos;
905
- U32 length = dictList[u].length;
906
- U32 printedLength = MIN(40, length);
1008
+ if (params.zParams.notificationLevel>= 3) {
1009
+ unsigned const nb = MIN(25, dictList[0].pos);
1010
+ unsigned const dictContentSize = ZDICT_dictSize(dictList);
1011
+ unsigned u;
1012
+ DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
1013
+ DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
1014
+ for (u=1; u<nb; u++) {
1015
+ unsigned const pos = dictList[u].pos;
1016
+ unsigned const length = dictList[u].length;
1017
+ U32 const printedLength = MIN(40, length);
1018
+ if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
1019
+ free(dictList);
1020
+ return ERROR(GENERIC); /* should never happen */
1021
+ }
907
1022
  DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
908
- u, length, pos, dictList[u].savings);
1023
+ u, length, pos, (unsigned)dictList[u].savings);
909
1024
  ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
910
1025
  DISPLAYLEVEL(3, "| \n");
911
1026
  } }
912
1027
 
913
1028
 
914
1029
  /* create dictionary */
915
- { U32 dictContentSize = ZDICT_dictSize(dictList);
916
- if (dictContentSize < targetDictSize/3) {
917
- DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
1030
+ { unsigned dictContentSize = ZDICT_dictSize(dictList);
1031
+ if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
1032
+ if (dictContentSize < targetDictSize/4) {
1033
+ DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
1034
+ if (samplesBuffSize < 10 * targetDictSize)
1035
+ DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
918
1036
  if (minRep > MINRATIO) {
919
1037
  DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
920
1038
  DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
921
1039
  }
922
- if (samplesBuffSize < 10 * targetDictSize)
923
- DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
924
1040
  }
925
1041
 
926
1042
  if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
927
- U32 proposedSelectivity = selectivity-1;
1043
+ unsigned proposedSelectivity = selectivity-1;
928
1044
  while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
929
- DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
1045
+ DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
930
1046
  DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
931
- DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n");
1047
+ DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
932
1048
  }
933
1049
 
934
1050
  /* limit dictionary size */
@@ -954,7 +1070,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
954
1070
 
955
1071
  dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
956
1072
  samplesBuffer, samplesSizes, nbSamples,
957
- params);
1073
+ params.zParams);
958
1074
  }
959
1075
 
960
1076
  /* clean up */
@@ -963,11 +1079,12 @@ size_t ZDICT_trainFromBuffer_unsafe(
963
1079
  }
964
1080
 
965
1081
 
966
- /* issue : samplesBuffer need to be followed by a noisy guard band.
967
- * work around : duplicate the buffer, and add the noise */
968
- size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
969
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
970
- ZDICT_params_t params)
1082
+ /* ZDICT_trainFromBuffer_legacy() :
1083
+ * issue : samplesBuffer need to be followed by a noisy guard band.
1084
+ * work around : duplicate the buffer, and add the noise */
1085
+ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
1086
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1087
+ ZDICT_legacy_params_t params)
971
1088
  {
972
1089
  size_t result;
973
1090
  void* newBuff;
@@ -980,10 +1097,9 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
980
1097
  memcpy(newBuff, samplesBuffer, sBuffSize);
981
1098
  ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
982
1099
 
983
- result = ZDICT_trainFromBuffer_unsafe(
984
- dictBuffer, dictBufferCapacity,
985
- newBuff, samplesSizes, nbSamples,
986
- params);
1100
+ result =
1101
+ ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
1102
+ samplesSizes, nbSamples, params);
987
1103
  free(newBuff);
988
1104
  return result;
989
1105
  }
@@ -992,15 +1108,23 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
992
1108
  size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
993
1109
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
994
1110
  {
995
- ZDICT_params_t params;
1111
+ ZDICT_fastCover_params_t params;
1112
+ DEBUGLOG(3, "ZDICT_trainFromBuffer");
996
1113
  memset(&params, 0, sizeof(params));
997
- return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
998
- samplesBuffer, samplesSizes, nbSamples,
999
- params);
1114
+ params.d = 8;
1115
+ params.steps = 4;
1116
+ /* Use default level since no compression level information is available */
1117
+ params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
1118
+ #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
1119
+ params.zParams.notificationLevel = DEBUGLEVEL;
1120
+ #endif
1121
+ return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
1122
+ samplesBuffer, samplesSizes, nbSamples,
1123
+ &params);
1000
1124
  }
1001
1125
 
1002
1126
  size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1003
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1127
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1004
1128
  {
1005
1129
  ZDICT_params_t params;
1006
1130
  memset(&params, 0, sizeof(params));