extzstd 0.2 → 0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/HISTORY.ja.md +13 -0
  3. data/README.md +17 -14
  4. data/contrib/zstd/{NEWS → CHANGELOG} +115 -2
  5. data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
  6. data/contrib/zstd/Makefile +99 -53
  7. data/contrib/zstd/README.md +59 -39
  8. data/contrib/zstd/TESTING.md +1 -1
  9. data/contrib/zstd/appveyor.yml +17 -6
  10. data/contrib/zstd/lib/BUCK +29 -2
  11. data/contrib/zstd/lib/Makefile +118 -21
  12. data/contrib/zstd/lib/README.md +84 -44
  13. data/contrib/zstd/lib/common/bitstream.h +17 -33
  14. data/contrib/zstd/lib/common/compiler.h +62 -8
  15. data/contrib/zstd/lib/common/cpu.h +215 -0
  16. data/contrib/zstd/lib/common/debug.c +44 -0
  17. data/contrib/zstd/lib/common/debug.h +134 -0
  18. data/contrib/zstd/lib/common/entropy_common.c +16 -1
  19. data/contrib/zstd/lib/common/error_private.c +7 -0
  20. data/contrib/zstd/lib/common/fse.h +48 -44
  21. data/contrib/zstd/lib/common/fse_decompress.c +3 -3
  22. data/contrib/zstd/lib/common/huf.h +169 -113
  23. data/contrib/zstd/lib/common/mem.h +20 -2
  24. data/contrib/zstd/lib/common/pool.c +135 -49
  25. data/contrib/zstd/lib/common/pool.h +40 -21
  26. data/contrib/zstd/lib/common/threading.c +2 -2
  27. data/contrib/zstd/lib/common/threading.h +12 -12
  28. data/contrib/zstd/lib/common/xxhash.c +3 -2
  29. data/contrib/zstd/lib/common/zstd_common.c +3 -6
  30. data/contrib/zstd/lib/common/zstd_errors.h +17 -7
  31. data/contrib/zstd/lib/common/zstd_internal.h +76 -48
  32. data/contrib/zstd/lib/compress/fse_compress.c +89 -209
  33. data/contrib/zstd/lib/compress/hist.c +203 -0
  34. data/contrib/zstd/lib/compress/hist.h +95 -0
  35. data/contrib/zstd/lib/compress/huf_compress.c +188 -80
  36. data/contrib/zstd/lib/compress/zstd_compress.c +2500 -1203
  37. data/contrib/zstd/lib/compress/zstd_compress_internal.h +463 -62
  38. data/contrib/zstd/lib/compress/zstd_double_fast.c +321 -131
  39. data/contrib/zstd/lib/compress/zstd_double_fast.h +13 -4
  40. data/contrib/zstd/lib/compress/zstd_fast.c +335 -108
  41. data/contrib/zstd/lib/compress/zstd_fast.h +12 -6
  42. data/contrib/zstd/lib/compress/zstd_lazy.c +654 -313
  43. data/contrib/zstd/lib/compress/zstd_lazy.h +44 -16
  44. data/contrib/zstd/lib/compress/zstd_ldm.c +310 -420
  45. data/contrib/zstd/lib/compress/zstd_ldm.h +63 -26
  46. data/contrib/zstd/lib/compress/zstd_opt.c +773 -325
  47. data/contrib/zstd/lib/compress/zstd_opt.h +31 -5
  48. data/contrib/zstd/lib/compress/zstdmt_compress.c +1468 -518
  49. data/contrib/zstd/lib/compress/zstdmt_compress.h +96 -45
  50. data/contrib/zstd/lib/decompress/huf_decompress.c +518 -282
  51. data/contrib/zstd/lib/decompress/zstd_ddict.c +240 -0
  52. data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
  53. data/contrib/zstd/lib/decompress/zstd_decompress.c +613 -1513
  54. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1311 -0
  55. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +59 -0
  56. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +175 -0
  57. data/contrib/zstd/lib/dictBuilder/cover.c +194 -113
  58. data/contrib/zstd/lib/dictBuilder/cover.h +112 -0
  59. data/contrib/zstd/lib/dictBuilder/divsufsort.c +3 -3
  60. data/contrib/zstd/lib/dictBuilder/fastcover.c +740 -0
  61. data/contrib/zstd/lib/dictBuilder/zdict.c +142 -106
  62. data/contrib/zstd/lib/dictBuilder/zdict.h +115 -49
  63. data/contrib/zstd/lib/legacy/zstd_legacy.h +44 -12
  64. data/contrib/zstd/lib/legacy/zstd_v01.c +41 -10
  65. data/contrib/zstd/lib/legacy/zstd_v01.h +12 -7
  66. data/contrib/zstd/lib/legacy/zstd_v02.c +37 -12
  67. data/contrib/zstd/lib/legacy/zstd_v02.h +12 -7
  68. data/contrib/zstd/lib/legacy/zstd_v03.c +38 -12
  69. data/contrib/zstd/lib/legacy/zstd_v03.h +12 -7
  70. data/contrib/zstd/lib/legacy/zstd_v04.c +55 -174
  71. data/contrib/zstd/lib/legacy/zstd_v04.h +12 -7
  72. data/contrib/zstd/lib/legacy/zstd_v05.c +59 -31
  73. data/contrib/zstd/lib/legacy/zstd_v05.h +12 -7
  74. data/contrib/zstd/lib/legacy/zstd_v06.c +48 -20
  75. data/contrib/zstd/lib/legacy/zstd_v06.h +10 -5
  76. data/contrib/zstd/lib/legacy/zstd_v07.c +62 -29
  77. data/contrib/zstd/lib/legacy/zstd_v07.h +10 -5
  78. data/contrib/zstd/lib/zstd.h +1346 -832
  79. data/ext/extzstd.c +27 -19
  80. data/ext/extzstd_stream.c +20 -4
  81. data/ext/zstd_compress.c +1 -0
  82. data/ext/zstd_decompress.c +4 -0
  83. data/ext/zstd_dictbuilder.c +4 -0
  84. data/ext/zstd_dictbuilder_fastcover.c +5 -0
  85. data/lib/extzstd.rb +52 -220
  86. data/lib/extzstd/version.rb +1 -1
  87. metadata +21 -7
  88. data/contrib/zstd/circle.yml +0 -63
@@ -207,7 +207,6 @@ static dictItem ZDICT_analyzePos(
207
207
  U32 cumulLength[LLIMIT] = {0};
208
208
  U32 savings[LLIMIT] = {0};
209
209
  const BYTE* b = (const BYTE*)buffer;
210
- size_t length;
211
210
  size_t maxLength = LLIMIT;
212
211
  size_t pos = suffix[start];
213
212
  U32 end = start;
@@ -222,26 +221,30 @@ static dictItem ZDICT_analyzePos(
222
221
  ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
223
222
  ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
224
223
  /* skip and mark segment */
225
- U16 u16 = MEM_read16(b+pos+4);
226
- U32 u, e = 6;
227
- while (MEM_read16(b+pos+e) == u16) e+=2 ;
228
- if (b[pos+e] == b[pos+e-1]) e++;
229
- for (u=1; u<e; u++)
224
+ U16 const pattern16 = MEM_read16(b+pos+4);
225
+ U32 u, patternEnd = 6;
226
+ while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
227
+ if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
228
+ for (u=1; u<patternEnd; u++)
230
229
  doneMarks[pos+u] = 1;
231
230
  return solution;
232
231
  }
233
232
 
234
233
  /* look forward */
235
- do {
236
- end++;
237
- length = ZDICT_count(b + pos, b + suffix[end]);
238
- } while (length >=MINMATCHLENGTH);
234
+ { size_t length;
235
+ do {
236
+ end++;
237
+ length = ZDICT_count(b + pos, b + suffix[end]);
238
+ } while (length >= MINMATCHLENGTH);
239
+ }
239
240
 
240
241
  /* look backward */
241
- do {
242
- length = ZDICT_count(b + pos, b + *(suffix+start-1));
243
- if (length >=MINMATCHLENGTH) start--;
244
- } while(length >= MINMATCHLENGTH);
242
+ { size_t length;
243
+ do {
244
+ length = ZDICT_count(b + pos, b + *(suffix+start-1));
245
+ if (length >=MINMATCHLENGTH) start--;
246
+ } while(length >= MINMATCHLENGTH);
247
+ }
245
248
 
246
249
  /* exit if not found a minimum nb of repetitions */
247
250
  if (end-start < minRatio) {
@@ -252,15 +255,15 @@ static dictItem ZDICT_analyzePos(
252
255
  }
253
256
 
254
257
  { int i;
255
- U32 searchLength;
258
+ U32 mml;
256
259
  U32 refinedStart = start;
257
260
  U32 refinedEnd = end;
258
261
 
259
262
  DISPLAYLEVEL(4, "\n");
260
- DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
263
+ DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
261
264
  DISPLAYLEVEL(4, "\n");
262
265
 
263
- for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
266
+ for (mml = MINMATCHLENGTH ; ; mml++) {
264
267
  BYTE currentChar = 0;
265
268
  U32 currentCount = 0;
266
269
  U32 currentID = refinedStart;
@@ -268,13 +271,13 @@ static dictItem ZDICT_analyzePos(
268
271
  U32 selectedCount = 0;
269
272
  U32 selectedID = currentID;
270
273
  for (id =refinedStart; id < refinedEnd; id++) {
271
- if (b[ suffix[id] + searchLength] != currentChar) {
274
+ if (b[suffix[id] + mml] != currentChar) {
272
275
  if (currentCount > selectedCount) {
273
276
  selectedCount = currentCount;
274
277
  selectedID = currentID;
275
278
  }
276
279
  currentID = id;
277
- currentChar = b[ suffix[id] + searchLength];
280
+ currentChar = b[ suffix[id] + mml];
278
281
  currentCount = 0;
279
282
  }
280
283
  currentCount ++;
@@ -290,27 +293,30 @@ static dictItem ZDICT_analyzePos(
290
293
  refinedEnd = refinedStart + selectedCount;
291
294
  }
292
295
 
293
- /* evaluate gain based on new ref */
296
+ /* evaluate gain based on new dict */
294
297
  start = refinedStart;
295
298
  pos = suffix[refinedStart];
296
299
  end = start;
297
300
  memset(lengthList, 0, sizeof(lengthList));
298
301
 
299
302
  /* look forward */
300
- do {
301
- end++;
302
- length = ZDICT_count(b + pos, b + suffix[end]);
303
- if (length >= LLIMIT) length = LLIMIT-1;
304
- lengthList[length]++;
305
- } while (length >=MINMATCHLENGTH);
303
+ { size_t length;
304
+ do {
305
+ end++;
306
+ length = ZDICT_count(b + pos, b + suffix[end]);
307
+ if (length >= LLIMIT) length = LLIMIT-1;
308
+ lengthList[length]++;
309
+ } while (length >=MINMATCHLENGTH);
310
+ }
306
311
 
307
312
  /* look backward */
308
- length = MINMATCHLENGTH;
309
- while ((length >= MINMATCHLENGTH) & (start > 0)) {
310
- length = ZDICT_count(b + pos, b + suffix[start - 1]);
311
- if (length >= LLIMIT) length = LLIMIT - 1;
312
- lengthList[length]++;
313
- if (length >= MINMATCHLENGTH) start--;
313
+ { size_t length = MINMATCHLENGTH;
314
+ while ((length >= MINMATCHLENGTH) & (start > 0)) {
315
+ length = ZDICT_count(b + pos, b + suffix[start - 1]);
316
+ if (length >= LLIMIT) length = LLIMIT - 1;
317
+ lengthList[length]++;
318
+ if (length >= MINMATCHLENGTH) start--;
319
+ }
314
320
  }
315
321
 
316
322
  /* largest useful length */
@@ -335,8 +341,8 @@ static dictItem ZDICT_analyzePos(
335
341
  for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
336
342
  savings[i] = savings[i-1] + (lengthList[i] * (i-3));
337
343
 
338
- DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
339
- (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
344
+ DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
345
+ (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
340
346
 
341
347
  solution.pos = (U32)pos;
342
348
  solution.length = (U32)maxLength;
@@ -345,12 +351,12 @@ static dictItem ZDICT_analyzePos(
345
351
  /* mark positions done */
346
352
  { U32 id;
347
353
  for (id=start; id<end; id++) {
348
- U32 p, pEnd;
354
+ U32 p, pEnd, length;
349
355
  U32 const testedPos = suffix[id];
350
356
  if (testedPos == pos)
351
357
  length = solution.length;
352
358
  else {
353
- length = ZDICT_count(b+pos, b+testedPos);
359
+ length = (U32)ZDICT_count(b+pos, b+testedPos);
354
360
  if (length > solution.length) length = solution.length;
355
361
  }
356
362
  pEnd = (U32)(testedPos + length);
@@ -491,7 +497,7 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
491
497
  static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
492
498
  const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
493
499
  const size_t* fileSizes, unsigned nbFiles,
494
- U32 minRatio, U32 notificationLevel)
500
+ unsigned minRatio, U32 notificationLevel)
495
501
  {
496
502
  int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
497
503
  int* const suffix = suffix0+1;
@@ -517,11 +523,11 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
517
523
  memset(doneMarks, 0, bufferSize+16);
518
524
 
519
525
  /* limit sample set size (divsufsort limitation)*/
520
- if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
526
+ if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
521
527
  while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
522
528
 
523
529
  /* sort */
524
- DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
530
+ DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
525
531
  { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
526
532
  if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
527
533
  }
@@ -575,29 +581,31 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
575
581
 
576
582
  typedef struct
577
583
  {
578
- ZSTD_CCtx* ref;
579
- ZSTD_CCtx* zc;
584
+ ZSTD_CDict* dict; /* dictionary */
585
+ ZSTD_CCtx* zc; /* working context */
580
586
  void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
581
587
  } EStats_ress_t;
582
588
 
583
589
  #define MAXREPOFFSET 1024
584
590
 
585
591
  static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
586
- U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
587
- const void* src, size_t srcSize, U32 notificationLevel)
592
+ unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
593
+ const void* src, size_t srcSize,
594
+ U32 notificationLevel)
588
595
  {
589
596
  size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
590
597
  size_t cSize;
591
598
 
592
599
  if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
593
- { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
594
- if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
600
+ { size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
601
+ if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
602
+
595
603
  }
596
604
  cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
597
- if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
605
+ if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
598
606
 
599
607
  if (cSize) { /* if == 0; block is not compressible */
600
- const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
608
+ const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
601
609
 
602
610
  /* literals stats */
603
611
  { const BYTE* bytePtr;
@@ -659,6 +667,18 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
659
667
  }
660
668
  }
661
669
 
670
+ /* ZDICT_flatLit() :
671
+ * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
672
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
673
+ */
674
+ static void ZDICT_flatLit(unsigned* countLit)
675
+ {
676
+ int u;
677
+ for (u=1; u<256; u++) countLit[u] = 2;
678
+ countLit[0] = 4;
679
+ countLit[253] = 1;
680
+ countLit[254] = 1;
681
+ }
662
682
 
663
683
  #define OFFCODE_MAX 30 /* only applicable to first block */
664
684
  static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
@@ -667,18 +687,18 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
667
687
  const void* dictBuffer, size_t dictBufferSize,
668
688
  unsigned notificationLevel)
669
689
  {
670
- U32 countLit[256];
690
+ unsigned countLit[256];
671
691
  HUF_CREATE_STATIC_CTABLE(hufTable, 255);
672
- U32 offcodeCount[OFFCODE_MAX+1];
692
+ unsigned offcodeCount[OFFCODE_MAX+1];
673
693
  short offcodeNCount[OFFCODE_MAX+1];
674
694
  U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
675
- U32 matchLengthCount[MaxML+1];
695
+ unsigned matchLengthCount[MaxML+1];
676
696
  short matchLengthNCount[MaxML+1];
677
- U32 litLengthCount[MaxLL+1];
697
+ unsigned litLengthCount[MaxLL+1];
678
698
  short litLengthNCount[MaxLL+1];
679
699
  U32 repOffset[MAXREPOFFSET];
680
700
  offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
681
- EStats_ress_t esr;
701
+ EStats_ress_t esr = { NULL, NULL, NULL };
682
702
  ZSTD_parameters params;
683
703
  U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
684
704
  size_t pos = 0, errorCode;
@@ -688,14 +708,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
688
708
  BYTE* dstPtr = (BYTE*)dstBuffer;
689
709
 
690
710
  /* init */
691
- esr.ref = ZSTD_createCCtx();
692
- esr.zc = ZSTD_createCCtx();
693
- esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
694
- if (!esr.ref || !esr.zc || !esr.workPlace) {
695
- eSize = ERROR(memory_allocation);
696
- DISPLAYLEVEL(1, "Not enough memory \n");
697
- goto _cleanup;
698
- }
711
+ DEBUGLOG(4, "ZDICT_analyzeEntropy");
699
712
  if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
700
713
  for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
701
714
  for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
@@ -704,16 +717,19 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
704
717
  memset(repOffset, 0, sizeof(repOffset));
705
718
  repOffset[1] = repOffset[4] = repOffset[8] = 1;
706
719
  memset(bestRepOffset, 0, sizeof(bestRepOffset));
707
- if (compressionLevel<=0) compressionLevel = g_compressionLevel_default;
720
+ if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
708
721
  params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
709
- { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
710
- if (ZSTD_isError(beginResult)) {
711
- DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
712
- eSize = ERROR(GENERIC);
713
- goto _cleanup;
714
- } }
715
722
 
716
- /* collect stats on all files */
723
+ esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
724
+ esr.zc = ZSTD_createCCtx();
725
+ esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
726
+ if (!esr.dict || !esr.zc || !esr.workPlace) {
727
+ eSize = ERROR(memory_allocation);
728
+ DISPLAYLEVEL(1, "Not enough memory \n");
729
+ goto _cleanup;
730
+ }
731
+
732
+ /* collect stats on all samples */
717
733
  for (u=0; u<nbFiles; u++) {
718
734
  ZDICT_countEStats(esr, params,
719
735
  countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
@@ -722,14 +738,21 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
722
738
  pos += fileSizes[u];
723
739
  }
724
740
 
725
- /* analyze */
726
- errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
727
- if (HUF_isError(errorCode)) {
728
- eSize = ERROR(GENERIC);
729
- DISPLAYLEVEL(1, "HUF_buildCTable error \n");
730
- goto _cleanup;
741
+ /* analyze, build stats, starting with literals */
742
+ { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
743
+ if (HUF_isError(maxNbBits)) {
744
+ eSize = ERROR(GENERIC);
745
+ DISPLAYLEVEL(1, " HUF_buildCTable error \n");
746
+ goto _cleanup;
747
+ }
748
+ if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
749
+ DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
750
+ ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
751
+ maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
752
+ assert(maxNbBits==9);
753
+ }
754
+ huffLog = (U32)maxNbBits;
731
755
  }
732
- huffLog = (U32)errorCode;
733
756
 
734
757
  /* looking for most common first offsets */
735
758
  { U32 offset;
@@ -829,7 +852,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
829
852
  eSize += 12;
830
853
 
831
854
  _cleanup:
832
- ZSTD_freeCCtx(esr.ref);
855
+ ZSTD_freeCDict(esr.dict);
833
856
  ZSTD_freeCCtx(esr.zc);
834
857
  free(esr.workPlace);
835
858
 
@@ -840,16 +863,17 @@ _cleanup:
840
863
 
841
864
  size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
842
865
  const void* customDictContent, size_t dictContentSize,
843
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
844
- ZDICT_params_t params)
866
+ const void* samplesBuffer, const size_t* samplesSizes,
867
+ unsigned nbSamples, ZDICT_params_t params)
845
868
  {
846
869
  size_t hSize;
847
870
  #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
848
871
  BYTE header[HBUFFSIZE];
849
- int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
872
+ int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
850
873
  U32 const notificationLevel = params.notificationLevel;
851
874
 
852
875
  /* check conditions */
876
+ DEBUGLOG(4, "ZDICT_finalizeDictionary");
853
877
  if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
854
878
  if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
855
879
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
@@ -886,11 +910,12 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
886
910
  }
887
911
 
888
912
 
889
- size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
890
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
891
- ZDICT_params_t params)
913
+ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
914
+ void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
915
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
916
+ ZDICT_params_t params)
892
917
  {
893
- int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
918
+ int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
894
919
  U32 const notificationLevel = params.notificationLevel;
895
920
  size_t hSize = 8;
896
921
 
@@ -919,7 +944,11 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
919
944
  return MIN(dictBufferCapacity, hSize+dictContentSize);
920
945
  }
921
946
 
922
-
947
+ /* Hidden declaration for dibio.c */
948
+ size_t ZDICT_trainFromBuffer_unsafe_legacy(
949
+ void* dictBuffer, size_t maxDictSize,
950
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
951
+ ZDICT_legacy_params_t params);
923
952
  /*! ZDICT_trainFromBuffer_unsafe_legacy() :
924
953
  * Warning : `samplesBuffer` must be followed by noisy guard band.
925
954
  * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
@@ -954,31 +983,33 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
954
983
 
955
984
  /* display best matches */
956
985
  if (params.zParams.notificationLevel>= 3) {
957
- U32 const nb = MIN(25, dictList[0].pos);
958
- U32 const dictContentSize = ZDICT_dictSize(dictList);
959
- U32 u;
960
- DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize);
986
+ unsigned const nb = MIN(25, dictList[0].pos);
987
+ unsigned const dictContentSize = ZDICT_dictSize(dictList);
988
+ unsigned u;
989
+ DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
961
990
  DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
962
991
  for (u=1; u<nb; u++) {
963
- U32 const pos = dictList[u].pos;
964
- U32 const length = dictList[u].length;
992
+ unsigned const pos = dictList[u].pos;
993
+ unsigned const length = dictList[u].length;
965
994
  U32 const printedLength = MIN(40, length);
966
- if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize))
995
+ if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
996
+ free(dictList);
967
997
  return ERROR(GENERIC); /* should never happen */
998
+ }
968
999
  DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
969
- u, length, pos, dictList[u].savings);
1000
+ u, length, pos, (unsigned)dictList[u].savings);
970
1001
  ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
971
1002
  DISPLAYLEVEL(3, "| \n");
972
1003
  } }
973
1004
 
974
1005
 
975
1006
  /* create dictionary */
976
- { U32 dictContentSize = ZDICT_dictSize(dictList);
1007
+ { unsigned dictContentSize = ZDICT_dictSize(dictList);
977
1008
  if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
978
1009
  if (dictContentSize < targetDictSize/4) {
979
- DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
1010
+ DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
980
1011
  if (samplesBuffSize < 10 * targetDictSize)
981
- DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
1012
+ DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
982
1013
  if (minRep > MINRATIO) {
983
1014
  DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
984
1015
  DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
@@ -986,9 +1017,9 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
986
1017
  }
987
1018
 
988
1019
  if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
989
- U32 proposedSelectivity = selectivity-1;
1020
+ unsigned proposedSelectivity = selectivity-1;
990
1021
  while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
991
- DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
1022
+ DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
992
1023
  DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
993
1024
  DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
994
1025
  }
@@ -1025,8 +1056,9 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
1025
1056
  }
1026
1057
 
1027
1058
 
1028
- /* issue : samplesBuffer need to be followed by a noisy guard band.
1029
- * work around : duplicate the buffer, and add the noise */
1059
+ /* ZDICT_trainFromBuffer_legacy() :
1060
+ * issue : samplesBuffer needs to be followed by a noisy guard band.
1061
+ * work around : duplicate the buffer, and add the noise */
1030
1062
  size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
1031
1063
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1032
1064
  ZDICT_legacy_params_t params)
@@ -1053,19 +1085,23 @@ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
1053
1085
  size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1054
1086
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1055
1087
  {
1056
- ZDICT_cover_params_t params;
1088
+ ZDICT_fastCover_params_t params;
1089
+ DEBUGLOG(3, "ZDICT_trainFromBuffer");
1057
1090
  memset(&params, 0, sizeof(params));
1058
1091
  params.d = 8;
1059
1092
  params.steps = 4;
1060
- /* Default to level 6 since no compression level information is avaialble */
1061
- params.zParams.compressionLevel = 6;
1062
- return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
1063
- samplesBuffer, samplesSizes,
1064
- nbSamples, &params);
1093
+ /* Default to level 3 since no compression level information is available */
1094
+ params.zParams.compressionLevel = 3;
1095
+ #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
1096
+ params.zParams.notificationLevel = DEBUGLEVEL;
1097
+ #endif
1098
+ return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
1099
+ samplesBuffer, samplesSizes, nbSamples,
1100
+ &params);
1065
1101
  }
1066
1102
 
1067
1103
  size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1068
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1104
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1069
1105
  {
1070
1106
  ZDICT_params_t params;
1071
1107
  memset(&params, 0, sizeof(params));