extzstd 0.2 → 0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/HISTORY.ja.md +13 -0
  3. data/README.md +17 -14
  4. data/contrib/zstd/{NEWS → CHANGELOG} +115 -2
  5. data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
  6. data/contrib/zstd/Makefile +99 -53
  7. data/contrib/zstd/README.md +59 -39
  8. data/contrib/zstd/TESTING.md +1 -1
  9. data/contrib/zstd/appveyor.yml +17 -6
  10. data/contrib/zstd/lib/BUCK +29 -2
  11. data/contrib/zstd/lib/Makefile +118 -21
  12. data/contrib/zstd/lib/README.md +84 -44
  13. data/contrib/zstd/lib/common/bitstream.h +17 -33
  14. data/contrib/zstd/lib/common/compiler.h +62 -8
  15. data/contrib/zstd/lib/common/cpu.h +215 -0
  16. data/contrib/zstd/lib/common/debug.c +44 -0
  17. data/contrib/zstd/lib/common/debug.h +134 -0
  18. data/contrib/zstd/lib/common/entropy_common.c +16 -1
  19. data/contrib/zstd/lib/common/error_private.c +7 -0
  20. data/contrib/zstd/lib/common/fse.h +48 -44
  21. data/contrib/zstd/lib/common/fse_decompress.c +3 -3
  22. data/contrib/zstd/lib/common/huf.h +169 -113
  23. data/contrib/zstd/lib/common/mem.h +20 -2
  24. data/contrib/zstd/lib/common/pool.c +135 -49
  25. data/contrib/zstd/lib/common/pool.h +40 -21
  26. data/contrib/zstd/lib/common/threading.c +2 -2
  27. data/contrib/zstd/lib/common/threading.h +12 -12
  28. data/contrib/zstd/lib/common/xxhash.c +3 -2
  29. data/contrib/zstd/lib/common/zstd_common.c +3 -6
  30. data/contrib/zstd/lib/common/zstd_errors.h +17 -7
  31. data/contrib/zstd/lib/common/zstd_internal.h +76 -48
  32. data/contrib/zstd/lib/compress/fse_compress.c +89 -209
  33. data/contrib/zstd/lib/compress/hist.c +203 -0
  34. data/contrib/zstd/lib/compress/hist.h +95 -0
  35. data/contrib/zstd/lib/compress/huf_compress.c +188 -80
  36. data/contrib/zstd/lib/compress/zstd_compress.c +2500 -1203
  37. data/contrib/zstd/lib/compress/zstd_compress_internal.h +463 -62
  38. data/contrib/zstd/lib/compress/zstd_double_fast.c +321 -131
  39. data/contrib/zstd/lib/compress/zstd_double_fast.h +13 -4
  40. data/contrib/zstd/lib/compress/zstd_fast.c +335 -108
  41. data/contrib/zstd/lib/compress/zstd_fast.h +12 -6
  42. data/contrib/zstd/lib/compress/zstd_lazy.c +654 -313
  43. data/contrib/zstd/lib/compress/zstd_lazy.h +44 -16
  44. data/contrib/zstd/lib/compress/zstd_ldm.c +310 -420
  45. data/contrib/zstd/lib/compress/zstd_ldm.h +63 -26
  46. data/contrib/zstd/lib/compress/zstd_opt.c +773 -325
  47. data/contrib/zstd/lib/compress/zstd_opt.h +31 -5
  48. data/contrib/zstd/lib/compress/zstdmt_compress.c +1468 -518
  49. data/contrib/zstd/lib/compress/zstdmt_compress.h +96 -45
  50. data/contrib/zstd/lib/decompress/huf_decompress.c +518 -282
  51. data/contrib/zstd/lib/decompress/zstd_ddict.c +240 -0
  52. data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
  53. data/contrib/zstd/lib/decompress/zstd_decompress.c +613 -1513
  54. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1311 -0
  55. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +59 -0
  56. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +175 -0
  57. data/contrib/zstd/lib/dictBuilder/cover.c +194 -113
  58. data/contrib/zstd/lib/dictBuilder/cover.h +112 -0
  59. data/contrib/zstd/lib/dictBuilder/divsufsort.c +3 -3
  60. data/contrib/zstd/lib/dictBuilder/fastcover.c +740 -0
  61. data/contrib/zstd/lib/dictBuilder/zdict.c +142 -106
  62. data/contrib/zstd/lib/dictBuilder/zdict.h +115 -49
  63. data/contrib/zstd/lib/legacy/zstd_legacy.h +44 -12
  64. data/contrib/zstd/lib/legacy/zstd_v01.c +41 -10
  65. data/contrib/zstd/lib/legacy/zstd_v01.h +12 -7
  66. data/contrib/zstd/lib/legacy/zstd_v02.c +37 -12
  67. data/contrib/zstd/lib/legacy/zstd_v02.h +12 -7
  68. data/contrib/zstd/lib/legacy/zstd_v03.c +38 -12
  69. data/contrib/zstd/lib/legacy/zstd_v03.h +12 -7
  70. data/contrib/zstd/lib/legacy/zstd_v04.c +55 -174
  71. data/contrib/zstd/lib/legacy/zstd_v04.h +12 -7
  72. data/contrib/zstd/lib/legacy/zstd_v05.c +59 -31
  73. data/contrib/zstd/lib/legacy/zstd_v05.h +12 -7
  74. data/contrib/zstd/lib/legacy/zstd_v06.c +48 -20
  75. data/contrib/zstd/lib/legacy/zstd_v06.h +10 -5
  76. data/contrib/zstd/lib/legacy/zstd_v07.c +62 -29
  77. data/contrib/zstd/lib/legacy/zstd_v07.h +10 -5
  78. data/contrib/zstd/lib/zstd.h +1346 -832
  79. data/ext/extzstd.c +27 -19
  80. data/ext/extzstd_stream.c +20 -4
  81. data/ext/zstd_compress.c +1 -0
  82. data/ext/zstd_decompress.c +4 -0
  83. data/ext/zstd_dictbuilder.c +4 -0
  84. data/ext/zstd_dictbuilder_fastcover.c +5 -0
  85. data/lib/extzstd.rb +52 -220
  86. data/lib/extzstd/version.rb +1 -1
  87. metadata +21 -7
  88. data/contrib/zstd/circle.yml +0 -63
@@ -207,7 +207,6 @@ static dictItem ZDICT_analyzePos(
207
207
  U32 cumulLength[LLIMIT] = {0};
208
208
  U32 savings[LLIMIT] = {0};
209
209
  const BYTE* b = (const BYTE*)buffer;
210
- size_t length;
211
210
  size_t maxLength = LLIMIT;
212
211
  size_t pos = suffix[start];
213
212
  U32 end = start;
@@ -222,26 +221,30 @@ static dictItem ZDICT_analyzePos(
222
221
  ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
223
222
  ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
224
223
  /* skip and mark segment */
225
- U16 u16 = MEM_read16(b+pos+4);
226
- U32 u, e = 6;
227
- while (MEM_read16(b+pos+e) == u16) e+=2 ;
228
- if (b[pos+e] == b[pos+e-1]) e++;
229
- for (u=1; u<e; u++)
224
+ U16 const pattern16 = MEM_read16(b+pos+4);
225
+ U32 u, patternEnd = 6;
226
+ while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
227
+ if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
228
+ for (u=1; u<patternEnd; u++)
230
229
  doneMarks[pos+u] = 1;
231
230
  return solution;
232
231
  }
233
232
 
234
233
  /* look forward */
235
- do {
236
- end++;
237
- length = ZDICT_count(b + pos, b + suffix[end]);
238
- } while (length >=MINMATCHLENGTH);
234
+ { size_t length;
235
+ do {
236
+ end++;
237
+ length = ZDICT_count(b + pos, b + suffix[end]);
238
+ } while (length >= MINMATCHLENGTH);
239
+ }
239
240
 
240
241
  /* look backward */
241
- do {
242
- length = ZDICT_count(b + pos, b + *(suffix+start-1));
243
- if (length >=MINMATCHLENGTH) start--;
244
- } while(length >= MINMATCHLENGTH);
242
+ { size_t length;
243
+ do {
244
+ length = ZDICT_count(b + pos, b + *(suffix+start-1));
245
+ if (length >=MINMATCHLENGTH) start--;
246
+ } while(length >= MINMATCHLENGTH);
247
+ }
245
248
 
246
249
  /* exit if not found a minimum nb of repetitions */
247
250
  if (end-start < minRatio) {
@@ -252,15 +255,15 @@ static dictItem ZDICT_analyzePos(
252
255
  }
253
256
 
254
257
  { int i;
255
- U32 searchLength;
258
+ U32 mml;
256
259
  U32 refinedStart = start;
257
260
  U32 refinedEnd = end;
258
261
 
259
262
  DISPLAYLEVEL(4, "\n");
260
- DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
263
+ DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
261
264
  DISPLAYLEVEL(4, "\n");
262
265
 
263
- for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
266
+ for (mml = MINMATCHLENGTH ; ; mml++) {
264
267
  BYTE currentChar = 0;
265
268
  U32 currentCount = 0;
266
269
  U32 currentID = refinedStart;
@@ -268,13 +271,13 @@ static dictItem ZDICT_analyzePos(
268
271
  U32 selectedCount = 0;
269
272
  U32 selectedID = currentID;
270
273
  for (id =refinedStart; id < refinedEnd; id++) {
271
- if (b[ suffix[id] + searchLength] != currentChar) {
274
+ if (b[suffix[id] + mml] != currentChar) {
272
275
  if (currentCount > selectedCount) {
273
276
  selectedCount = currentCount;
274
277
  selectedID = currentID;
275
278
  }
276
279
  currentID = id;
277
- currentChar = b[ suffix[id] + searchLength];
280
+ currentChar = b[ suffix[id] + mml];
278
281
  currentCount = 0;
279
282
  }
280
283
  currentCount ++;
@@ -290,27 +293,30 @@ static dictItem ZDICT_analyzePos(
290
293
  refinedEnd = refinedStart + selectedCount;
291
294
  }
292
295
 
293
- /* evaluate gain based on new ref */
296
+ /* evaluate gain based on new dict */
294
297
  start = refinedStart;
295
298
  pos = suffix[refinedStart];
296
299
  end = start;
297
300
  memset(lengthList, 0, sizeof(lengthList));
298
301
 
299
302
  /* look forward */
300
- do {
301
- end++;
302
- length = ZDICT_count(b + pos, b + suffix[end]);
303
- if (length >= LLIMIT) length = LLIMIT-1;
304
- lengthList[length]++;
305
- } while (length >=MINMATCHLENGTH);
303
+ { size_t length;
304
+ do {
305
+ end++;
306
+ length = ZDICT_count(b + pos, b + suffix[end]);
307
+ if (length >= LLIMIT) length = LLIMIT-1;
308
+ lengthList[length]++;
309
+ } while (length >=MINMATCHLENGTH);
310
+ }
306
311
 
307
312
  /* look backward */
308
- length = MINMATCHLENGTH;
309
- while ((length >= MINMATCHLENGTH) & (start > 0)) {
310
- length = ZDICT_count(b + pos, b + suffix[start - 1]);
311
- if (length >= LLIMIT) length = LLIMIT - 1;
312
- lengthList[length]++;
313
- if (length >= MINMATCHLENGTH) start--;
313
+ { size_t length = MINMATCHLENGTH;
314
+ while ((length >= MINMATCHLENGTH) & (start > 0)) {
315
+ length = ZDICT_count(b + pos, b + suffix[start - 1]);
316
+ if (length >= LLIMIT) length = LLIMIT - 1;
317
+ lengthList[length]++;
318
+ if (length >= MINMATCHLENGTH) start--;
319
+ }
314
320
  }
315
321
 
316
322
  /* largest useful length */
@@ -335,8 +341,8 @@ static dictItem ZDICT_analyzePos(
335
341
  for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
336
342
  savings[i] = savings[i-1] + (lengthList[i] * (i-3));
337
343
 
338
- DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
339
- (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
344
+ DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
345
+ (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
340
346
 
341
347
  solution.pos = (U32)pos;
342
348
  solution.length = (U32)maxLength;
@@ -345,12 +351,12 @@ static dictItem ZDICT_analyzePos(
345
351
  /* mark positions done */
346
352
  { U32 id;
347
353
  for (id=start; id<end; id++) {
348
- U32 p, pEnd;
354
+ U32 p, pEnd, length;
349
355
  U32 const testedPos = suffix[id];
350
356
  if (testedPos == pos)
351
357
  length = solution.length;
352
358
  else {
353
- length = ZDICT_count(b+pos, b+testedPos);
359
+ length = (U32)ZDICT_count(b+pos, b+testedPos);
354
360
  if (length > solution.length) length = solution.length;
355
361
  }
356
362
  pEnd = (U32)(testedPos + length);
@@ -491,7 +497,7 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
491
497
  static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
492
498
  const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
493
499
  const size_t* fileSizes, unsigned nbFiles,
494
- U32 minRatio, U32 notificationLevel)
500
+ unsigned minRatio, U32 notificationLevel)
495
501
  {
496
502
  int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
497
503
  int* const suffix = suffix0+1;
@@ -517,11 +523,11 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
517
523
  memset(doneMarks, 0, bufferSize+16);
518
524
 
519
525
  /* limit sample set size (divsufsort limitation)*/
520
- if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
526
+ if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
521
527
  while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
522
528
 
523
529
  /* sort */
524
- DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
530
+ DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
525
531
  { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
526
532
  if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
527
533
  }
@@ -575,29 +581,31 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
575
581
 
576
582
  typedef struct
577
583
  {
578
- ZSTD_CCtx* ref;
579
- ZSTD_CCtx* zc;
584
+ ZSTD_CDict* dict; /* dictionary */
585
+ ZSTD_CCtx* zc; /* working context */
580
586
  void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
581
587
  } EStats_ress_t;
582
588
 
583
589
  #define MAXREPOFFSET 1024
584
590
 
585
591
  static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
586
- U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
587
- const void* src, size_t srcSize, U32 notificationLevel)
592
+ unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
593
+ const void* src, size_t srcSize,
594
+ U32 notificationLevel)
588
595
  {
589
596
  size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
590
597
  size_t cSize;
591
598
 
592
599
  if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
593
- { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
594
- if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
600
+ { size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
601
+ if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
602
+
595
603
  }
596
604
  cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
597
- if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
605
+ if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
598
606
 
599
607
  if (cSize) { /* if == 0; block is not compressible */
600
- const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
608
+ const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
601
609
 
602
610
  /* literals stats */
603
611
  { const BYTE* bytePtr;
@@ -659,6 +667,18 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
659
667
  }
660
668
  }
661
669
 
670
+ /* ZDICT_flatLit() :
671
+ * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
672
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
673
+ */
674
+ static void ZDICT_flatLit(unsigned* countLit)
675
+ {
676
+ int u;
677
+ for (u=1; u<256; u++) countLit[u] = 2;
678
+ countLit[0] = 4;
679
+ countLit[253] = 1;
680
+ countLit[254] = 1;
681
+ }
662
682
 
663
683
  #define OFFCODE_MAX 30 /* only applicable to first block */
664
684
  static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
@@ -667,18 +687,18 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
667
687
  const void* dictBuffer, size_t dictBufferSize,
668
688
  unsigned notificationLevel)
669
689
  {
670
- U32 countLit[256];
690
+ unsigned countLit[256];
671
691
  HUF_CREATE_STATIC_CTABLE(hufTable, 255);
672
- U32 offcodeCount[OFFCODE_MAX+1];
692
+ unsigned offcodeCount[OFFCODE_MAX+1];
673
693
  short offcodeNCount[OFFCODE_MAX+1];
674
694
  U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
675
- U32 matchLengthCount[MaxML+1];
695
+ unsigned matchLengthCount[MaxML+1];
676
696
  short matchLengthNCount[MaxML+1];
677
- U32 litLengthCount[MaxLL+1];
697
+ unsigned litLengthCount[MaxLL+1];
678
698
  short litLengthNCount[MaxLL+1];
679
699
  U32 repOffset[MAXREPOFFSET];
680
700
  offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
681
- EStats_ress_t esr;
701
+ EStats_ress_t esr = { NULL, NULL, NULL };
682
702
  ZSTD_parameters params;
683
703
  U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
684
704
  size_t pos = 0, errorCode;
@@ -688,14 +708,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
688
708
  BYTE* dstPtr = (BYTE*)dstBuffer;
689
709
 
690
710
  /* init */
691
- esr.ref = ZSTD_createCCtx();
692
- esr.zc = ZSTD_createCCtx();
693
- esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
694
- if (!esr.ref || !esr.zc || !esr.workPlace) {
695
- eSize = ERROR(memory_allocation);
696
- DISPLAYLEVEL(1, "Not enough memory \n");
697
- goto _cleanup;
698
- }
711
+ DEBUGLOG(4, "ZDICT_analyzeEntropy");
699
712
  if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
700
713
  for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
701
714
  for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
@@ -704,16 +717,19 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
704
717
  memset(repOffset, 0, sizeof(repOffset));
705
718
  repOffset[1] = repOffset[4] = repOffset[8] = 1;
706
719
  memset(bestRepOffset, 0, sizeof(bestRepOffset));
707
- if (compressionLevel<=0) compressionLevel = g_compressionLevel_default;
720
+ if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
708
721
  params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
709
- { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
710
- if (ZSTD_isError(beginResult)) {
711
- DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
712
- eSize = ERROR(GENERIC);
713
- goto _cleanup;
714
- } }
715
722
 
716
- /* collect stats on all files */
723
+ esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
724
+ esr.zc = ZSTD_createCCtx();
725
+ esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
726
+ if (!esr.dict || !esr.zc || !esr.workPlace) {
727
+ eSize = ERROR(memory_allocation);
728
+ DISPLAYLEVEL(1, "Not enough memory \n");
729
+ goto _cleanup;
730
+ }
731
+
732
+ /* collect stats on all samples */
717
733
  for (u=0; u<nbFiles; u++) {
718
734
  ZDICT_countEStats(esr, params,
719
735
  countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
@@ -722,14 +738,21 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
722
738
  pos += fileSizes[u];
723
739
  }
724
740
 
725
- /* analyze */
726
- errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
727
- if (HUF_isError(errorCode)) {
728
- eSize = ERROR(GENERIC);
729
- DISPLAYLEVEL(1, "HUF_buildCTable error \n");
730
- goto _cleanup;
741
+ /* analyze, build stats, starting with literals */
742
+ { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
743
+ if (HUF_isError(maxNbBits)) {
744
+ eSize = ERROR(GENERIC);
745
+ DISPLAYLEVEL(1, " HUF_buildCTable error \n");
746
+ goto _cleanup;
747
+ }
748
+ if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
749
+ DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
750
+ ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
751
+ maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
752
+ assert(maxNbBits==9);
753
+ }
754
+ huffLog = (U32)maxNbBits;
731
755
  }
732
- huffLog = (U32)errorCode;
733
756
 
734
757
  /* looking for most common first offsets */
735
758
  { U32 offset;
@@ -829,7 +852,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
829
852
  eSize += 12;
830
853
 
831
854
  _cleanup:
832
- ZSTD_freeCCtx(esr.ref);
855
+ ZSTD_freeCDict(esr.dict);
833
856
  ZSTD_freeCCtx(esr.zc);
834
857
  free(esr.workPlace);
835
858
 
@@ -840,16 +863,17 @@ _cleanup:
840
863
 
841
864
  size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
842
865
  const void* customDictContent, size_t dictContentSize,
843
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
844
- ZDICT_params_t params)
866
+ const void* samplesBuffer, const size_t* samplesSizes,
867
+ unsigned nbSamples, ZDICT_params_t params)
845
868
  {
846
869
  size_t hSize;
847
870
  #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
848
871
  BYTE header[HBUFFSIZE];
849
- int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
872
+ int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
850
873
  U32 const notificationLevel = params.notificationLevel;
851
874
 
852
875
  /* check conditions */
876
+ DEBUGLOG(4, "ZDICT_finalizeDictionary");
853
877
  if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
854
878
  if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
855
879
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
@@ -886,11 +910,12 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
886
910
  }
887
911
 
888
912
 
889
- size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
890
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
891
- ZDICT_params_t params)
913
+ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
914
+ void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
915
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
916
+ ZDICT_params_t params)
892
917
  {
893
- int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
918
+ int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
894
919
  U32 const notificationLevel = params.notificationLevel;
895
920
  size_t hSize = 8;
896
921
 
@@ -919,7 +944,11 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
919
944
  return MIN(dictBufferCapacity, hSize+dictContentSize);
920
945
  }
921
946
 
922
-
947
+ /* Hidden declaration for dbio.c */
948
+ size_t ZDICT_trainFromBuffer_unsafe_legacy(
949
+ void* dictBuffer, size_t maxDictSize,
950
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
951
+ ZDICT_legacy_params_t params);
923
952
  /*! ZDICT_trainFromBuffer_unsafe_legacy() :
924
953
  * Warning : `samplesBuffer` must be followed by noisy guard band.
925
954
  * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
@@ -954,31 +983,33 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
954
983
 
955
984
  /* display best matches */
956
985
  if (params.zParams.notificationLevel>= 3) {
957
- U32 const nb = MIN(25, dictList[0].pos);
958
- U32 const dictContentSize = ZDICT_dictSize(dictList);
959
- U32 u;
960
- DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize);
986
+ unsigned const nb = MIN(25, dictList[0].pos);
987
+ unsigned const dictContentSize = ZDICT_dictSize(dictList);
988
+ unsigned u;
989
+ DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
961
990
  DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
962
991
  for (u=1; u<nb; u++) {
963
- U32 const pos = dictList[u].pos;
964
- U32 const length = dictList[u].length;
992
+ unsigned const pos = dictList[u].pos;
993
+ unsigned const length = dictList[u].length;
965
994
  U32 const printedLength = MIN(40, length);
966
- if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize))
995
+ if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
996
+ free(dictList);
967
997
  return ERROR(GENERIC); /* should never happen */
998
+ }
968
999
  DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
969
- u, length, pos, dictList[u].savings);
1000
+ u, length, pos, (unsigned)dictList[u].savings);
970
1001
  ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
971
1002
  DISPLAYLEVEL(3, "| \n");
972
1003
  } }
973
1004
 
974
1005
 
975
1006
  /* create dictionary */
976
- { U32 dictContentSize = ZDICT_dictSize(dictList);
1007
+ { unsigned dictContentSize = ZDICT_dictSize(dictList);
977
1008
  if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
978
1009
  if (dictContentSize < targetDictSize/4) {
979
- DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
1010
+ DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
980
1011
  if (samplesBuffSize < 10 * targetDictSize)
981
- DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
1012
+ DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
982
1013
  if (minRep > MINRATIO) {
983
1014
  DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
984
1015
  DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
@@ -986,9 +1017,9 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
986
1017
  }
987
1018
 
988
1019
  if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
989
- U32 proposedSelectivity = selectivity-1;
1020
+ unsigned proposedSelectivity = selectivity-1;
990
1021
  while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
991
- DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
1022
+ DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
992
1023
  DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
993
1024
  DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
994
1025
  }
@@ -1025,8 +1056,9 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
1025
1056
  }
1026
1057
 
1027
1058
 
1028
- /* issue : samplesBuffer need to be followed by a noisy guard band.
1029
- * work around : duplicate the buffer, and add the noise */
1059
+ /* ZDICT_trainFromBuffer_legacy() :
1060
+ * issue : samplesBuffer needs to be followed by a noisy guard band.
1061
+ * work around : duplicate the buffer, and add the noise */
1030
1062
  size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
1031
1063
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1032
1064
  ZDICT_legacy_params_t params)
@@ -1053,19 +1085,23 @@ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
1053
1085
  size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1054
1086
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1055
1087
  {
1056
- ZDICT_cover_params_t params;
1088
+ ZDICT_fastCover_params_t params;
1089
+ DEBUGLOG(3, "ZDICT_trainFromBuffer");
1057
1090
  memset(&params, 0, sizeof(params));
1058
1091
  params.d = 8;
1059
1092
  params.steps = 4;
1060
- /* Default to level 6 since no compression level information is avaialble */
1061
- params.zParams.compressionLevel = 6;
1062
- return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
1063
- samplesBuffer, samplesSizes,
1064
- nbSamples, &params);
1093
+ /* Default to level 3 since no compression level information is available */
1094
+ params.zParams.compressionLevel = 3;
1095
+ #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
1096
+ params.zParams.notificationLevel = DEBUGLEVEL;
1097
+ #endif
1098
+ return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
1099
+ samplesBuffer, samplesSizes, nbSamples,
1100
+ &params);
1065
1101
  }
1066
1102
 
1067
1103
  size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1068
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1104
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1069
1105
  {
1070
1106
  ZDICT_params_t params;
1071
1107
  memset(&params, 0, sizeof(params));