extzstd 0.1.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. checksums.yaml +5 -5
  2. data/HISTORY.ja.md +18 -0
  3. data/README.md +15 -50
  4. data/contrib/zstd/CONTRIBUTING.md +1 -1
  5. data/contrib/zstd/COPYING +339 -0
  6. data/contrib/zstd/Makefile +82 -51
  7. data/contrib/zstd/NEWS +92 -5
  8. data/contrib/zstd/README.md +50 -41
  9. data/contrib/zstd/appveyor.yml +164 -102
  10. data/contrib/zstd/circle.yml +10 -22
  11. data/contrib/zstd/lib/BUCK +31 -10
  12. data/contrib/zstd/lib/Makefile +57 -31
  13. data/contrib/zstd/lib/README.md +68 -37
  14. data/contrib/zstd/lib/common/bitstream.h +130 -76
  15. data/contrib/zstd/lib/common/compiler.h +86 -0
  16. data/contrib/zstd/lib/common/error_private.c +15 -11
  17. data/contrib/zstd/lib/common/error_private.h +8 -8
  18. data/contrib/zstd/lib/common/fse.h +19 -9
  19. data/contrib/zstd/lib/common/fse_decompress.c +3 -22
  20. data/contrib/zstd/lib/common/huf.h +68 -26
  21. data/contrib/zstd/lib/common/mem.h +23 -35
  22. data/contrib/zstd/lib/common/pool.c +123 -63
  23. data/contrib/zstd/lib/common/pool.h +19 -10
  24. data/contrib/zstd/lib/common/threading.c +11 -16
  25. data/contrib/zstd/lib/common/threading.h +52 -33
  26. data/contrib/zstd/lib/common/xxhash.c +28 -22
  27. data/contrib/zstd/lib/common/zstd_common.c +40 -27
  28. data/contrib/zstd/lib/common/zstd_errors.h +43 -34
  29. data/contrib/zstd/lib/common/zstd_internal.h +131 -123
  30. data/contrib/zstd/lib/compress/fse_compress.c +17 -33
  31. data/contrib/zstd/lib/compress/huf_compress.c +15 -9
  32. data/contrib/zstd/lib/compress/zstd_compress.c +2096 -2363
  33. data/contrib/zstd/lib/compress/zstd_compress_internal.h +462 -0
  34. data/contrib/zstd/lib/compress/zstd_double_fast.c +309 -0
  35. data/contrib/zstd/lib/compress/zstd_double_fast.h +29 -0
  36. data/contrib/zstd/lib/compress/zstd_fast.c +243 -0
  37. data/contrib/zstd/lib/compress/zstd_fast.h +31 -0
  38. data/contrib/zstd/lib/compress/zstd_lazy.c +765 -0
  39. data/contrib/zstd/lib/compress/zstd_lazy.h +39 -0
  40. data/contrib/zstd/lib/compress/zstd_ldm.c +707 -0
  41. data/contrib/zstd/lib/compress/zstd_ldm.h +68 -0
  42. data/contrib/zstd/lib/compress/zstd_opt.c +785 -0
  43. data/contrib/zstd/lib/compress/zstd_opt.h +19 -908
  44. data/contrib/zstd/lib/compress/zstdmt_compress.c +737 -327
  45. data/contrib/zstd/lib/compress/zstdmt_compress.h +88 -26
  46. data/contrib/zstd/lib/decompress/huf_decompress.c +158 -50
  47. data/contrib/zstd/lib/decompress/zstd_decompress.c +884 -699
  48. data/contrib/zstd/lib/deprecated/zbuff.h +5 -4
  49. data/contrib/zstd/lib/deprecated/zbuff_common.c +5 -5
  50. data/contrib/zstd/lib/deprecated/zbuff_compress.c +6 -4
  51. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +5 -4
  52. data/contrib/zstd/lib/dictBuilder/cover.c +93 -77
  53. data/contrib/zstd/lib/dictBuilder/zdict.c +107 -92
  54. data/contrib/zstd/lib/dictBuilder/zdict.h +112 -102
  55. data/contrib/zstd/lib/legacy/zstd_legacy.h +9 -4
  56. data/contrib/zstd/lib/legacy/zstd_v01.c +7 -6
  57. data/contrib/zstd/lib/legacy/zstd_v01.h +5 -4
  58. data/contrib/zstd/lib/legacy/zstd_v02.c +27 -99
  59. data/contrib/zstd/lib/legacy/zstd_v02.h +5 -4
  60. data/contrib/zstd/lib/legacy/zstd_v03.c +26 -98
  61. data/contrib/zstd/lib/legacy/zstd_v03.h +5 -4
  62. data/contrib/zstd/lib/legacy/zstd_v04.c +22 -91
  63. data/contrib/zstd/lib/legacy/zstd_v04.h +5 -4
  64. data/contrib/zstd/lib/legacy/zstd_v05.c +23 -99
  65. data/contrib/zstd/lib/legacy/zstd_v05.h +5 -4
  66. data/contrib/zstd/lib/legacy/zstd_v06.c +22 -96
  67. data/contrib/zstd/lib/legacy/zstd_v06.h +5 -4
  68. data/contrib/zstd/lib/legacy/zstd_v07.c +19 -95
  69. data/contrib/zstd/lib/legacy/zstd_v07.h +5 -4
  70. data/contrib/zstd/lib/zstd.h +895 -271
  71. data/ext/extconf.rb +11 -2
  72. data/ext/extzstd.c +45 -128
  73. data/ext/extzstd.h +74 -31
  74. data/ext/extzstd_stream.c +401 -142
  75. data/ext/zstd_common.c +5 -0
  76. data/ext/zstd_compress.c +8 -0
  77. data/ext/zstd_decompress.c +1 -0
  78. data/ext/zstd_dictbuilder.c +2 -0
  79. data/lib/extzstd/version.rb +1 -1
  80. data/lib/extzstd.rb +48 -1
  81. data/test/test_basic.rb +9 -1
  82. metadata +17 -7
  83. data/HISTORY.ja +0 -10
  84. data/contrib/zstd/LICENSE-examples +0 -11
  85. data/contrib/zstd/PATENTS +0 -33
@@ -1,18 +1,20 @@
1
- /**
1
+ /*
2
2
  * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
- * This source code is licensed under the BSD-style license found in the
6
- * LICENSE file in the root directory of this source tree. An additional grant
7
- * of patent rights can be found in the PATENTS file in the same directory.
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
8
9
  */
9
10
 
10
11
 
11
12
  /*-**************************************
12
13
  * Tuning parameters
13
14
  ****************************************/
15
+ #define MINRATIO 4 /* minimum number of occurrences to be selected in dictionary */
14
16
  #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
15
- #define ZDICT_MIN_SAMPLES_SIZE 512
17
+ #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
16
18
 
17
19
 
18
20
  /*-**************************************
@@ -59,11 +61,8 @@
59
61
 
60
62
  #define NOISELENGTH 32
61
63
 
62
- #define MINRATIO 4
63
- static const int g_compressionLevel_default = 6;
64
+ static const int g_compressionLevel_default = 3;
64
65
  static const U32 g_selectivity_default = 9;
65
- static const size_t g_provision_entropySize = 200;
66
- static const size_t g_min_fast_dictContent = 192;
67
66
 
68
67
 
69
68
  /*-*************************************
@@ -96,7 +95,7 @@ const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(error
96
95
  unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
97
96
  {
98
97
  if (dictSize < 8) return 0;
99
- if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0;
98
+ if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
100
99
  return MEM_readLE32((const char*)dictBuffer + 4);
101
100
  }
102
101
 
@@ -104,7 +103,7 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
104
103
  /*-********************************************************
105
104
  * Dictionary training functions
106
105
  **********************************************************/
107
- static unsigned ZDICT_NbCommonBytes (register size_t val)
106
+ static unsigned ZDICT_NbCommonBytes (size_t val)
108
107
  {
109
108
  if (MEM_isLittleEndian()) {
110
109
  if (MEM_64bits()) {
@@ -308,10 +307,10 @@ static dictItem ZDICT_analyzePos(
308
307
  /* look backward */
309
308
  length = MINMATCHLENGTH;
310
309
  while ((length >= MINMATCHLENGTH) & (start > 0)) {
311
- length = ZDICT_count(b + pos, b + suffix[start - 1]);
312
- if (length >= LLIMIT) length = LLIMIT - 1;
313
- lengthList[length]++;
314
- if (length >= MINMATCHLENGTH) start--;
310
+ length = ZDICT_count(b + pos, b + suffix[start - 1]);
311
+ if (length >= LLIMIT) length = LLIMIT - 1;
312
+ lengthList[length]++;
313
+ if (length >= MINMATCHLENGTH) start--;
315
314
  }
316
315
 
317
316
  /* largest useful length */
@@ -363,21 +362,35 @@ static dictItem ZDICT_analyzePos(
363
362
  }
364
363
 
365
364
 
366
- /*! ZDICT_checkMerge
365
+ static int isIncluded(const void* in, const void* container, size_t length)
366
+ {
367
+ const char* const ip = (const char*) in;
368
+ const char* const into = (const char*) container;
369
+ size_t u;
370
+
371
+ for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */
372
+ if (ip[u] != into[u]) break;
373
+ }
374
+
375
+ return u==length;
376
+ }
377
+
378
+ /*! ZDICT_tryMerge() :
367
379
  check if dictItem can be merged, do it if possible
368
380
  @return : id of destination elt, 0 if not merged
369
381
  */
370
- static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
382
+ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
371
383
  {
372
384
  const U32 tableSize = table->pos;
373
385
  const U32 eltEnd = elt.pos + elt.length;
386
+ const char* const buf = (const char*) buffer;
374
387
 
375
388
  /* tail overlap */
376
389
  U32 u; for (u=1; u<tableSize; u++) {
377
390
  if (u==eltNbToSkip) continue;
378
391
  if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
379
392
  /* append */
380
- U32 addedLength = table[u].pos - elt.pos;
393
+ U32 const addedLength = table[u].pos - elt.pos;
381
394
  table[u].length += addedLength;
382
395
  table[u].pos = elt.pos;
383
396
  table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
@@ -393,9 +406,10 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
393
406
  /* front overlap */
394
407
  for (u=1; u<tableSize; u++) {
395
408
  if (u==eltNbToSkip) continue;
409
+
396
410
  if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
397
411
  /* append */
398
- int addedLength = (int)eltEnd - (table[u].pos + table[u].length);
412
+ int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
399
413
  table[u].savings += elt.length / 8; /* rough approx bonus */
400
414
  if (addedLength > 0) { /* otherwise, elt fully included into existing */
401
415
  table[u].length += addedLength;
@@ -407,7 +421,18 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
407
421
  table[u] = table[u-1], u--;
408
422
  table[u] = elt;
409
423
  return u;
410
- } }
424
+ }
425
+
426
+ if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
427
+ if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
428
+ size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
429
+ table[u].pos = elt.pos;
430
+ table[u].savings += (U32)(elt.savings * addedLength / elt.length);
431
+ table[u].length = MIN(elt.length, table[u].length + 1);
432
+ return u;
433
+ }
434
+ }
435
+ }
411
436
 
412
437
  return 0;
413
438
  }
@@ -415,8 +440,8 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
415
440
 
416
441
  static void ZDICT_removeDictItem(dictItem* table, U32 id)
417
442
  {
418
- /* convention : first element is nb of elts */
419
- U32 const max = table->pos;
443
+ /* convention : table[0].pos stores nb of elts */
444
+ U32 const max = table[0].pos;
420
445
  U32 u;
421
446
  if (!id) return; /* protection, should never happen */
422
447
  for (u=id; u<max-1; u++)
@@ -425,14 +450,14 @@ static void ZDICT_removeDictItem(dictItem* table, U32 id)
425
450
  }
426
451
 
427
452
 
428
- static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
453
+ static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
429
454
  {
430
455
  /* merge if possible */
431
- U32 mergeId = ZDICT_checkMerge(table, elt, 0);
456
+ U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
432
457
  if (mergeId) {
433
458
  U32 newMerge = 1;
434
459
  while (newMerge) {
435
- newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
460
+ newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
436
461
  if (newMerge) ZDICT_removeDictItem(table, mergeId);
437
462
  mergeId = newMerge;
438
463
  }
@@ -463,7 +488,7 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
463
488
  }
464
489
 
465
490
 
466
- static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
491
+ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
467
492
  const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
468
493
  const size_t* fileSizes, unsigned nbFiles,
469
494
  U32 minRatio, U32 notificationLevel)
@@ -480,7 +505,7 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
480
505
  # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
481
506
  if (ZDICT_clockSpan(displayClock) > refreshRate) \
482
507
  { displayClock = clock(); DISPLAY(__VA_ARGS__); \
483
- if (notificationLevel>=4) fflush(stdout); } }
508
+ if (notificationLevel>=4) fflush(stderr); } }
484
509
 
485
510
  /* init */
486
511
  DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
@@ -521,7 +546,7 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
521
546
  if (doneMarks[cursor]) { cursor++; continue; }
522
547
  solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
523
548
  if (solution.length==0) { cursor++; continue; }
524
- ZDICT_insertDictItem(dictList, dictListSize, solution);
549
+ ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
525
550
  cursor += solution.length;
526
551
  DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
527
552
  } }
@@ -552,7 +577,7 @@ typedef struct
552
577
  {
553
578
  ZSTD_CCtx* ref;
554
579
  ZSTD_CCtx* zc;
555
- void* workPlace; /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */
580
+ void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
556
581
  } EStats_ress_t;
557
582
 
558
583
  #define MAXREPOFFSET 1024
@@ -561,14 +586,14 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
561
586
  U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
562
587
  const void* src, size_t srcSize, U32 notificationLevel)
563
588
  {
564
- size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << params.cParams.windowLog);
589
+ size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
565
590
  size_t cSize;
566
591
 
567
592
  if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
568
593
  { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
569
594
  if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
570
595
  }
571
- cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize);
596
+ cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
572
597
  if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
573
598
 
574
599
  if (cSize) { /* if == 0; block is not compressible */
@@ -610,17 +635,6 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
610
635
  } } }
611
636
  }
612
637
 
613
- /*
614
- static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
615
- {
616
- unsigned u;
617
- size_t max=0;
618
- for (u=0; u<nbFiles; u++)
619
- if (max < fileSizes[u]) max = fileSizes[u];
620
- return max;
621
- }
622
- */
623
-
624
638
  static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
625
639
  {
626
640
  size_t total=0;
@@ -676,26 +690,26 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
676
690
  /* init */
677
691
  esr.ref = ZSTD_createCCtx();
678
692
  esr.zc = ZSTD_createCCtx();
679
- esr.workPlace = malloc(ZSTD_BLOCKSIZE_ABSOLUTEMAX);
693
+ esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
680
694
  if (!esr.ref || !esr.zc || !esr.workPlace) {
681
695
  eSize = ERROR(memory_allocation);
682
696
  DISPLAYLEVEL(1, "Not enough memory \n");
683
697
  goto _cleanup;
684
698
  }
685
- if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; } /* too large dictionary */
686
- for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
687
- for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1;
688
- for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
689
- for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
699
+ if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
700
+ for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
701
+ for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
702
+ for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
703
+ for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
690
704
  memset(repOffset, 0, sizeof(repOffset));
691
705
  repOffset[1] = repOffset[4] = repOffset[8] = 1;
692
706
  memset(bestRepOffset, 0, sizeof(bestRepOffset));
693
- if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
707
+ if (compressionLevel<=0) compressionLevel = g_compressionLevel_default;
694
708
  params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
695
709
  { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
696
- if (ZSTD_isError(beginResult)) {
710
+ if (ZSTD_isError(beginResult)) {
711
+ DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
697
712
  eSize = ERROR(GENERIC);
698
- DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
699
713
  goto _cleanup;
700
714
  } }
701
715
 
@@ -812,7 +826,6 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
812
826
  MEM_writeLE32(dstPtr+4, repStartValue[1]);
813
827
  MEM_writeLE32(dstPtr+8, repStartValue[2]);
814
828
  #endif
815
- //dstPtr += 12;
816
829
  eSize += 12;
817
830
 
818
831
  _cleanup:
@@ -831,7 +844,7 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
831
844
  ZDICT_params_t params)
832
845
  {
833
846
  size_t hSize;
834
- #define HBUFFSIZE 256
847
+ #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
835
848
  BYTE header[HBUFFSIZE];
836
849
  int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
837
850
  U32 const notificationLevel = params.notificationLevel;
@@ -842,7 +855,7 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
842
855
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
843
856
 
844
857
  /* dictionary header */
845
- MEM_writeLE32(header, ZSTD_DICT_MAGIC);
858
+ MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
846
859
  { U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
847
860
  U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
848
861
  U32 const dictID = params.dictID ? params.dictID : compliantID;
@@ -877,20 +890,11 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
877
890
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
878
891
  ZDICT_params_t params)
879
892
  {
880
- size_t hSize;
881
893
  int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
882
894
  U32 const notificationLevel = params.notificationLevel;
895
+ size_t hSize = 8;
883
896
 
884
- /* dictionary header */
885
- MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
886
- { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
887
- U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
888
- U32 const dictID = params.dictID ? params.dictID : compliantID;
889
- MEM_writeLE32((char*)dictBuffer+4, dictID);
890
- }
891
- hSize = 8;
892
-
893
- /* entropy tables */
897
+ /* calculate entropy tables */
894
898
  DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
895
899
  DISPLAYLEVEL(2, "statistics ... \n");
896
900
  { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
@@ -902,6 +906,13 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
902
906
  hSize += eSize;
903
907
  }
904
908
 
909
+ /* add dictionary header (after entropy tables) */
910
+ MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
911
+ { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
912
+ U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
913
+ U32 const dictID = params.dictID ? params.dictID : compliantID;
914
+ MEM_writeLE32((char*)dictBuffer+4, dictID);
915
+ }
905
916
 
906
917
  if (hSize + dictContentSize < dictBufferCapacity)
907
918
  memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
@@ -909,14 +920,14 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
909
920
  }
910
921
 
911
922
 
912
- /*! ZDICT_trainFromBuffer_unsafe() :
923
+ /*! ZDICT_trainFromBuffer_unsafe_legacy() :
913
924
  * Warning : `samplesBuffer` must be followed by noisy guard band.
914
925
  * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
915
926
  */
916
- size_t ZDICT_trainFromBuffer_unsafe(
927
+ size_t ZDICT_trainFromBuffer_unsafe_legacy(
917
928
  void* dictBuffer, size_t maxDictSize,
918
929
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
919
- ZDICT_params_t params)
930
+ ZDICT_legacy_params_t params)
920
931
  {
921
932
  U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
922
933
  dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
@@ -925,24 +936,24 @@ size_t ZDICT_trainFromBuffer_unsafe(
925
936
  size_t const targetDictSize = maxDictSize;
926
937
  size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
927
938
  size_t dictSize = 0;
928
- U32 const notificationLevel = params.notificationLevel;
939
+ U32 const notificationLevel = params.zParams.notificationLevel;
929
940
 
930
941
  /* checks */
931
942
  if (!dictList) return ERROR(memory_allocation);
932
- if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
933
- if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
943
+ if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */
944
+ if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */
934
945
 
935
946
  /* init */
936
947
  ZDICT_initDictItem(dictList);
937
948
 
938
949
  /* build dictionary */
939
- ZDICT_trainBuffer(dictList, dictListSize,
940
- samplesBuffer, samplesBuffSize,
941
- samplesSizes, nbSamples,
942
- minRep, notificationLevel);
950
+ ZDICT_trainBuffer_legacy(dictList, dictListSize,
951
+ samplesBuffer, samplesBuffSize,
952
+ samplesSizes, nbSamples,
953
+ minRep, notificationLevel);
943
954
 
944
955
  /* display best matches */
945
- if (params.notificationLevel>= 3) {
956
+ if (params.zParams.notificationLevel>= 3) {
946
957
  U32 const nb = MIN(25, dictList[0].pos);
947
958
  U32 const dictContentSize = ZDICT_dictSize(dictList);
948
959
  U32 u;
@@ -963,14 +974,15 @@ size_t ZDICT_trainFromBuffer_unsafe(
963
974
 
964
975
  /* create dictionary */
965
976
  { U32 dictContentSize = ZDICT_dictSize(dictList);
966
- if (dictContentSize < targetDictSize/3) {
977
+ if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
978
+ if (dictContentSize < targetDictSize/4) {
967
979
  DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
980
+ if (samplesBuffSize < 10 * targetDictSize)
981
+ DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
968
982
  if (minRep > MINRATIO) {
969
983
  DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
970
984
  DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
971
985
  }
972
- if (samplesBuffSize < 10 * targetDictSize)
973
- DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
974
986
  }
975
987
 
976
988
  if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
@@ -978,7 +990,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
978
990
  while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
979
991
  DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
980
992
  DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
981
- DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n");
993
+ DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
982
994
  }
983
995
 
984
996
  /* limit dictionary size */
@@ -1004,7 +1016,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
1004
1016
 
1005
1017
  dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
1006
1018
  samplesBuffer, samplesSizes, nbSamples,
1007
- params);
1019
+ params.zParams);
1008
1020
  }
1009
1021
 
1010
1022
  /* clean up */
@@ -1015,9 +1027,9 @@ size_t ZDICT_trainFromBuffer_unsafe(
1015
1027
 
1016
1028
  /* issue : samplesBuffer needs to be followed by a noisy guard band.
1017
1029
  * work around : duplicate the buffer, and add the noise */
1018
- size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
1019
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1020
- ZDICT_params_t params)
1030
+ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
1031
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1032
+ ZDICT_legacy_params_t params)
1021
1033
  {
1022
1034
  size_t result;
1023
1035
  void* newBuff;
@@ -1030,10 +1042,9 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
1030
1042
  memcpy(newBuff, samplesBuffer, sBuffSize);
1031
1043
  ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
1032
1044
 
1033
- result = ZDICT_trainFromBuffer_unsafe(
1034
- dictBuffer, dictBufferCapacity,
1035
- newBuff, samplesSizes, nbSamples,
1036
- params);
1045
+ result =
1046
+ ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
1047
+ samplesSizes, nbSamples, params);
1037
1048
  free(newBuff);
1038
1049
  return result;
1039
1050
  }
@@ -1042,11 +1053,15 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
1042
1053
  size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1043
1054
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1044
1055
  {
1045
- ZDICT_params_t params;
1056
+ ZDICT_cover_params_t params;
1046
1057
  memset(&params, 0, sizeof(params));
1047
- return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
1048
- samplesBuffer, samplesSizes, nbSamples,
1049
- params);
1058
+ params.d = 8;
1059
+ params.steps = 4;
1060
+ /* Default to level 6 since no compression level information is available */
1061
+ params.zParams.compressionLevel = 6;
1062
+ return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
1063
+ samplesBuffer, samplesSizes,
1064
+ nbSamples, &params);
1050
1065
  }
1051
1066
 
1052
1067
  size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,