zstd-ruby 1.3.8.0 → 1.4.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +6 -5
  3. data/README.md +1 -1
  4. data/ext/zstdruby/libzstd/Makefile +133 -61
  5. data/ext/zstdruby/libzstd/README.md +51 -18
  6. data/ext/zstdruby/libzstd/common/bitstream.h +38 -39
  7. data/ext/zstdruby/libzstd/common/compiler.h +41 -6
  8. data/ext/zstdruby/libzstd/common/cpu.h +1 -1
  9. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  10. data/ext/zstdruby/libzstd/common/debug.h +11 -31
  11. data/ext/zstdruby/libzstd/common/entropy_common.c +13 -33
  12. data/ext/zstdruby/libzstd/common/error_private.c +2 -1
  13. data/ext/zstdruby/libzstd/common/error_private.h +6 -2
  14. data/ext/zstdruby/libzstd/common/fse.h +13 -33
  15. data/ext/zstdruby/libzstd/common/fse_decompress.c +12 -35
  16. data/ext/zstdruby/libzstd/common/huf.h +15 -33
  17. data/ext/zstdruby/libzstd/common/mem.h +75 -2
  18. data/ext/zstdruby/libzstd/common/pool.c +8 -4
  19. data/ext/zstdruby/libzstd/common/pool.h +2 -2
  20. data/ext/zstdruby/libzstd/common/threading.c +52 -6
  21. data/ext/zstdruby/libzstd/common/threading.h +36 -4
  22. data/ext/zstdruby/libzstd/common/xxhash.c +25 -37
  23. data/ext/zstdruby/libzstd/common/xxhash.h +11 -31
  24. data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
  25. data/ext/zstdruby/libzstd/common/zstd_errors.h +2 -1
  26. data/ext/zstdruby/libzstd/common/zstd_internal.h +203 -22
  27. data/ext/zstdruby/libzstd/compress/fse_compress.c +19 -42
  28. data/ext/zstdruby/libzstd/compress/hist.c +15 -35
  29. data/ext/zstdruby/libzstd/compress/hist.h +12 -32
  30. data/ext/zstdruby/libzstd/compress/huf_compress.c +92 -92
  31. data/ext/zstdruby/libzstd/compress/zstd_compress.c +1460 -1472
  32. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +330 -65
  33. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +158 -0
  34. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +29 -0
  35. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +419 -0
  36. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +54 -0
  37. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +845 -0
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  39. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +525 -0
  40. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +65 -43
  41. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
  42. data/ext/zstdruby/libzstd/compress/zstd_fast.c +264 -159
  43. data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
  44. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +74 -42
  45. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +2 -2
  46. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +33 -11
  47. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +7 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_opt.c +108 -125
  49. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  50. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +129 -93
  51. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +46 -28
  52. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +76 -60
  53. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +14 -10
  54. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +2 -2
  55. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +471 -258
  56. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +471 -346
  57. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +3 -3
  58. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +25 -4
  59. data/ext/zstdruby/libzstd/deprecated/zbuff.h +9 -8
  60. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
  61. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +1 -1
  62. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  63. data/ext/zstdruby/libzstd/dictBuilder/cover.c +220 -65
  64. data/ext/zstdruby/libzstd/dictBuilder/cover.h +81 -7
  65. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +85 -56
  66. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +43 -19
  67. data/ext/zstdruby/libzstd/dictBuilder/zdict.h +73 -35
  68. data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
  69. data/ext/zstdruby/libzstd/dll/example/build_package.bat +3 -2
  70. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +49 -15
  71. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +142 -117
  72. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +13 -8
  73. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +54 -25
  74. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +13 -8
  75. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +55 -25
  76. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +13 -8
  77. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +62 -29
  78. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +13 -8
  79. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +145 -109
  80. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +14 -9
  81. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +56 -26
  82. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +11 -6
  83. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +65 -28
  84. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +11 -6
  85. data/ext/zstdruby/libzstd/libzstd.pc.in +3 -2
  86. data/ext/zstdruby/libzstd/zstd.h +921 -597
  87. data/lib/zstd-ruby/version.rb +1 -1
  88. data/zstd-ruby.gemspec +2 -2
  89. metadata +19 -14
  90. data/ext/zstdruby/libzstd/dll/libzstd.def +0 -87
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -16,8 +16,8 @@
16
16
  * Dependencies
17
17
  *********************************************************/
18
18
  #include <stddef.h> /* size_t */
19
- #include "zstd.h" /* DCtx, and some public functions */
20
- #include "zstd_internal.h" /* blockProperties_t, and some public functions */
19
+ #include "../zstd.h" /* DCtx, and some public functions */
20
+ #include "../common/zstd_internal.h" /* blockProperties_t, and some public functions */
21
21
  #include "zstd_decompress_internal.h" /* ZSTD_seqSymbol */
22
22
 
23
23
 
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -19,8 +19,8 @@
19
19
  /*-*******************************************************
20
20
  * Dependencies
21
21
  *********************************************************/
22
- #include "mem.h" /* BYTE, U16, U32 */
23
- #include "zstd_internal.h" /* ZSTD_seqSymbol */
22
+ #include "../common/mem.h" /* BYTE, U16, U32 */
23
+ #include "../common/zstd_internal.h" /* ZSTD_seqSymbol */
24
24
 
25
25
 
26
26
 
@@ -89,6 +89,17 @@ typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
89
89
  typedef enum { zdss_init=0, zdss_loadHeader,
90
90
  zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
91
91
 
92
+ typedef enum {
93
+ ZSTD_use_indefinitely = -1, /* Use the dictionary indefinitely */
94
+ ZSTD_dont_use = 0, /* Do not use the dictionary (if one exists free it) */
95
+ ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */
96
+ } ZSTD_dictUses_e;
97
+
98
+ typedef enum {
99
+ ZSTD_obm_buffered = 0, /* Buffer the output */
100
+ ZSTD_obm_stable = 1 /* ZSTD_outBuffer is stable */
101
+ } ZSTD_outBufferMode_e;
102
+
92
103
  struct ZSTD_DCtx_s
93
104
  {
94
105
  const ZSTD_seqSymbol* LLTptr;
@@ -123,6 +134,7 @@ struct ZSTD_DCtx_s
123
134
  const ZSTD_DDict* ddict; /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
124
135
  U32 dictID;
125
136
  int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
137
+ ZSTD_dictUses_e dictUses;
126
138
 
127
139
  /* streaming */
128
140
  ZSTD_dStreamStage streamStage;
@@ -140,10 +152,19 @@ struct ZSTD_DCtx_s
140
152
  U32 legacyVersion;
141
153
  U32 hostageByte;
142
154
  int noForwardProgress;
155
+ ZSTD_outBufferMode_e outBufferMode;
156
+ ZSTD_outBuffer expectedOutBuffer;
143
157
 
144
158
  /* workspace */
145
159
  BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
146
160
  BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
161
+
162
+ size_t oversizedDuration;
163
+
164
+ #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
165
+ void const* dictContentBeginForFuzzing;
166
+ void const* dictContentEndForFuzzing;
167
+ #endif
147
168
  }; /* typedef'd to ZSTD_DCtx within "zstd.h" */
148
169
 
149
170
 
@@ -153,7 +174,7 @@ struct ZSTD_DCtx_s
153
174
 
154
175
  /*! ZSTD_loadDEntropy() :
155
176
  * dict : must point at beginning of a valid zstd dictionary.
156
- * @return : size of entropy tables read */
177
+ * @return : size of dictionary header (size of magic number + dict ID + entropy tables) */
157
178
  size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
158
179
  const void* const dict, size_t const dictSize);
159
180
 
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -28,7 +28,7 @@ extern "C" {
28
28
  * Dependencies
29
29
  ***************************************/
30
30
  #include <stddef.h> /* size_t */
31
- #include "zstd.h" /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */
31
+ #include "../zstd.h" /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */
32
32
 
33
33
 
34
34
  /* ***************************************************************
@@ -36,16 +36,17 @@ extern "C" {
36
36
  *****************************************************************/
37
37
  /* Deprecation warnings */
38
38
  /* Should these warnings be a problem,
39
- it is generally possible to disable them,
40
- typically with -Wno-deprecated-declarations for gcc
41
- or _CRT_SECURE_NO_WARNINGS in Visual.
42
- Otherwise, it's also possible to define ZBUFF_DISABLE_DEPRECATE_WARNINGS */
39
+ * it is generally possible to disable them,
40
+ * typically with -Wno-deprecated-declarations for gcc
41
+ * or _CRT_SECURE_NO_WARNINGS in Visual.
42
+ * Otherwise, it's also possible to define ZBUFF_DISABLE_DEPRECATE_WARNINGS
43
+ */
43
44
  #ifdef ZBUFF_DISABLE_DEPRECATE_WARNINGS
44
45
  # define ZBUFF_DEPRECATED(message) ZSTDLIB_API /* disable deprecation warnings */
45
46
  #else
46
47
  # if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
47
48
  # define ZBUFF_DEPRECATED(message) [[deprecated(message)]] ZSTDLIB_API
48
- # elif (defined(__GNUC__) && (__GNUC__ >= 5)) || defined(__clang__)
49
+ # elif (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) || defined(__clang__)
49
50
  # define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated(message)))
50
51
  # elif defined(__GNUC__) && (__GNUC__ >= 3)
51
52
  # define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated))
@@ -185,7 +186,7 @@ ZBUFF_DEPRECATED("use ZSTD_DStreamOutSize") size_t ZBUFF_recommendedDOutSize(voi
185
186
 
186
187
  /*--- Dependency ---*/
187
188
  #define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters, ZSTD_customMem */
188
- #include "zstd.h"
189
+ #include "../zstd.h"
189
190
 
190
191
 
191
192
  /*--- Custom memory allocator ---*/
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,7 +11,7 @@
11
11
  /*-*************************************
12
12
  * Dependencies
13
13
  ***************************************/
14
- #include "error_private.h"
14
+ #include "../common/error_private.h"
15
15
  #include "zbuff.h"
16
16
 
17
17
  /*-****************************************
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -26,11 +26,11 @@
26
26
  #include <string.h> /* memset */
27
27
  #include <time.h> /* clock */
28
28
 
29
- #include "mem.h" /* read */
30
- #include "pool.h"
31
- #include "threading.h"
29
+ #include "../common/mem.h" /* read */
30
+ #include "../common/pool.h"
31
+ #include "../common/threading.h"
32
32
  #include "cover.h"
33
- #include "zstd_internal.h" /* includes zstd.h */
33
+ #include "../common/zstd_internal.h" /* includes zstd.h */
34
34
  #ifndef ZDICT_STATIC_LINKING_ONLY
35
35
  #define ZDICT_STATIC_LINKING_ONLY
36
36
  #endif
@@ -391,7 +391,7 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
391
391
  *
392
392
  * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
393
393
  *
394
- * Once the dmer d is in the dictionay we set F(d) = 0.
394
+ * Once the dmer d is in the dictionary we set F(d) = 0.
395
395
  */
396
396
  static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
397
397
  COVER_map_t *activeDmers, U32 begin,
@@ -435,7 +435,7 @@ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
435
435
  U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
436
436
  activeSegment.begin += 1;
437
437
  *delDmerOcc -= 1;
438
- /* If this is the last occurence of the dmer, subtract its score */
438
+ /* If this is the last occurrence of the dmer, subtract its score */
439
439
  if (*delDmerOcc == 0) {
440
440
  COVER_map_remove(activeDmers, delDmer);
441
441
  activeSegment.score -= freqs[delDmer];
@@ -526,10 +526,10 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
526
526
  * Prepare a context for dictionary building.
527
527
  * The context is only dependent on the parameter `d` and can be used multiple
528
528
  * times.
529
- * Returns 1 on success or zero on error.
529
+ * Returns 0 on success or error code on error.
530
530
  * The context must be destroyed with `COVER_ctx_destroy()`.
531
531
  */
532
- static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
532
+ static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
533
533
  const size_t *samplesSizes, unsigned nbSamples,
534
534
  unsigned d, double splitPoint) {
535
535
  const BYTE *const samples = (const BYTE *)samplesBuffer;
@@ -544,17 +544,17 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
544
544
  totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
545
545
  DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
546
546
  (unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
547
- return 0;
547
+ return ERROR(srcSize_wrong);
548
548
  }
549
549
  /* Check if there are at least 5 training samples */
550
550
  if (nbTrainSamples < 5) {
551
551
  DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
552
- return 0;
552
+ return ERROR(srcSize_wrong);
553
553
  }
554
554
  /* Check if there's testing sample */
555
555
  if (nbTestSamples < 1) {
556
556
  DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
557
- return 0;
557
+ return ERROR(srcSize_wrong);
558
558
  }
559
559
  /* Zero the context */
560
560
  memset(ctx, 0, sizeof(*ctx));
@@ -577,7 +577,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
577
577
  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
578
578
  DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
579
579
  COVER_ctx_destroy(ctx);
580
- return 0;
580
+ return ERROR(memory_allocation);
581
581
  }
582
582
  ctx->freqs = NULL;
583
583
  ctx->d = d;
@@ -624,7 +624,40 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
624
624
  (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
625
625
  ctx->freqs = ctx->suffix;
626
626
  ctx->suffix = NULL;
627
- return 1;
627
+ return 0;
628
+ }
629
+
630
+ void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
631
+ {
632
+ const double ratio = (double)nbDmers / maxDictSize;
633
+ if (ratio >= 10) {
634
+ return;
635
+ }
636
+ LOCALDISPLAYLEVEL(displayLevel, 1,
637
+ "WARNING: The maximum dictionary size %u is too large "
638
+ "compared to the source size %u! "
639
+ "size(source)/size(dictionary) = %f, but it should be >= "
640
+ "10! This may lead to a subpar dictionary! We recommend "
641
+ "training on sources at least 10x, and preferably 100x "
642
+ "the size of the dictionary! \n", (U32)maxDictSize,
643
+ (U32)nbDmers, ratio);
644
+ }
645
+
646
+ COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
647
+ U32 nbDmers, U32 k, U32 passes)
648
+ {
649
+ const U32 minEpochSize = k * 10;
650
+ COVER_epoch_info_t epochs;
651
+ epochs.num = MAX(1, maxDictSize / k / passes);
652
+ epochs.size = nbDmers / epochs.num;
653
+ if (epochs.size >= minEpochSize) {
654
+ assert(epochs.size * epochs.num <= nbDmers);
655
+ return epochs;
656
+ }
657
+ epochs.size = MIN(minEpochSize, nbDmers);
658
+ epochs.num = nbDmers / epochs.size;
659
+ assert(epochs.size * epochs.num <= nbDmers);
660
+ return epochs;
628
661
  }
629
662
 
630
663
  /**
@@ -636,28 +669,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
636
669
  ZDICT_cover_params_t parameters) {
637
670
  BYTE *const dict = (BYTE *)dictBuffer;
638
671
  size_t tail = dictBufferCapacity;
639
- /* Divide the data up into epochs of equal size.
640
- * We will select at least one segment from each epoch.
641
- */
642
- const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k / 4));
643
- const unsigned epochSize = (U32)(ctx->suffixSize / epochs);
672
+ /* Divide the data into epochs. We will select one segment from each epoch. */
673
+ const COVER_epoch_info_t epochs = COVER_computeEpochs(
674
+ (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
675
+ const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
676
+ size_t zeroScoreRun = 0;
644
677
  size_t epoch;
645
678
  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
646
- epochs, epochSize);
679
+ (U32)epochs.num, (U32)epochs.size);
647
680
  /* Loop through the epochs until there are no more segments or the dictionary
648
681
  * is full.
649
682
  */
650
- for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
651
- const U32 epochBegin = (U32)(epoch * epochSize);
652
- const U32 epochEnd = epochBegin + epochSize;
683
+ for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
684
+ const U32 epochBegin = (U32)(epoch * epochs.size);
685
+ const U32 epochEnd = epochBegin + epochs.size;
653
686
  size_t segmentSize;
654
687
  /* Select a segment */
655
688
  COVER_segment_t segment = COVER_selectSegment(
656
689
  ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
657
- /* If the segment covers no dmers, then we are out of content */
690
+ /* If the segment covers no dmers, then we are out of content.
691
+ * There may be new content in other epochs, so continue for some time.
692
+ */
658
693
  if (segment.score == 0) {
659
- break;
694
+ if (++zeroScoreRun >= maxZeroScoreRun) {
695
+ break;
696
+ }
697
+ continue;
660
698
  }
699
+ zeroScoreRun = 0;
661
700
  /* Trim the segment if necessary and if it is too small then we are done */
662
701
  segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
663
702
  if (segmentSize < parameters.d) {
@@ -690,11 +729,11 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
690
729
  /* Checks */
691
730
  if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
692
731
  DISPLAYLEVEL(1, "Cover parameters incorrect\n");
693
- return ERROR(GENERIC);
732
+ return ERROR(parameter_outOfBound);
694
733
  }
695
734
  if (nbSamples == 0) {
696
735
  DISPLAYLEVEL(1, "Cover must have at least one input file\n");
697
- return ERROR(GENERIC);
736
+ return ERROR(srcSize_wrong);
698
737
  }
699
738
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
700
739
  DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
@@ -702,14 +741,18 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
702
741
  return ERROR(dstSize_tooSmall);
703
742
  }
704
743
  /* Initialize context and activeDmers */
705
- if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
706
- parameters.d, parameters.splitPoint)) {
707
- return ERROR(GENERIC);
744
+ {
745
+ size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
746
+ parameters.d, parameters.splitPoint);
747
+ if (ZSTD_isError(initVal)) {
748
+ return initVal;
749
+ }
708
750
  }
751
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
709
752
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
710
753
  DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
711
754
  COVER_ctx_destroy(&ctx);
712
- return ERROR(GENERIC);
755
+ return ERROR(memory_allocation);
713
756
  }
714
757
 
715
758
  DISPLAYLEVEL(2, "Building dictionary\n");
@@ -770,7 +813,7 @@ size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
770
813
  cctx, dst, dstCapacity, samples + offsets[i],
771
814
  samplesSizes[i], cdict);
772
815
  if (ZSTD_isError(size)) {
773
- totalCompressedSize = ERROR(GENERIC);
816
+ totalCompressedSize = size;
774
817
  goto _compressCleanup;
775
818
  }
776
819
  totalCompressedSize += size;
@@ -846,9 +889,11 @@ void COVER_best_start(COVER_best_t *best) {
846
889
  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
847
890
  * If this dictionary is the best so far save it and its parameters.
848
891
  */
849
- void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
850
- ZDICT_cover_params_t parameters, void *dict,
851
- size_t dictSize) {
892
+ void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
893
+ COVER_dictSelection_t selection) {
894
+ void* dict = selection.dictContent;
895
+ size_t compressedSize = selection.totalCompressedSize;
896
+ size_t dictSize = selection.dictSize;
852
897
  if (!best) {
853
898
  return;
854
899
  }
@@ -874,10 +919,12 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
874
919
  }
875
920
  }
876
921
  /* Save the dictionary, parameters, and size */
877
- memcpy(best->dict, dict, dictSize);
878
- best->dictSize = dictSize;
879
- best->parameters = parameters;
880
- best->compressedSize = compressedSize;
922
+ if (dict) {
923
+ memcpy(best->dict, dict, dictSize);
924
+ best->dictSize = dictSize;
925
+ best->parameters = parameters;
926
+ best->compressedSize = compressedSize;
927
+ }
881
928
  }
882
929
  if (liveJobs == 0) {
883
930
  ZSTD_pthread_cond_broadcast(&best->cond);
@@ -886,6 +933,111 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
886
933
  }
887
934
  }
888
935
 
936
+ COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
937
+ COVER_dictSelection_t selection = { NULL, 0, error };
938
+ return selection;
939
+ }
940
+
941
+ unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
942
+ return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
943
+ }
944
+
945
+ void COVER_dictSelectionFree(COVER_dictSelection_t selection){
946
+ free(selection.dictContent);
947
+ }
948
+
949
+ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
950
+ size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
951
+ size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
952
+
953
+ size_t largestDict = 0;
954
+ size_t largestCompressed = 0;
955
+ BYTE* customDictContentEnd = customDictContent + dictContentSize;
956
+
957
+ BYTE * largestDictbuffer = (BYTE *)malloc(dictContentSize);
958
+ BYTE * candidateDictBuffer = (BYTE *)malloc(dictContentSize);
959
+ double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
960
+
961
+ if (!largestDictbuffer || !candidateDictBuffer) {
962
+ free(largestDictbuffer);
963
+ free(candidateDictBuffer);
964
+ return COVER_dictSelectionError(dictContentSize);
965
+ }
966
+
967
+ /* Initial dictionary size and compressed size */
968
+ memcpy(largestDictbuffer, customDictContent, dictContentSize);
969
+ dictContentSize = ZDICT_finalizeDictionary(
970
+ largestDictbuffer, dictContentSize, customDictContent, dictContentSize,
971
+ samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
972
+
973
+ if (ZDICT_isError(dictContentSize)) {
974
+ free(largestDictbuffer);
975
+ free(candidateDictBuffer);
976
+ return COVER_dictSelectionError(dictContentSize);
977
+ }
978
+
979
+ totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
980
+ samplesBuffer, offsets,
981
+ nbCheckSamples, nbSamples,
982
+ largestDictbuffer, dictContentSize);
983
+
984
+ if (ZSTD_isError(totalCompressedSize)) {
985
+ free(largestDictbuffer);
986
+ free(candidateDictBuffer);
987
+ return COVER_dictSelectionError(totalCompressedSize);
988
+ }
989
+
990
+ if (params.shrinkDict == 0) {
991
+ COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
992
+ free(candidateDictBuffer);
993
+ return selection;
994
+ }
995
+
996
+ largestDict = dictContentSize;
997
+ largestCompressed = totalCompressedSize;
998
+ dictContentSize = ZDICT_DICTSIZE_MIN;
999
+
1000
+ /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
1001
+ while (dictContentSize < largestDict) {
1002
+ memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
1003
+ dictContentSize = ZDICT_finalizeDictionary(
1004
+ candidateDictBuffer, dictContentSize, customDictContentEnd - dictContentSize, dictContentSize,
1005
+ samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
1006
+
1007
+ if (ZDICT_isError(dictContentSize)) {
1008
+ free(largestDictbuffer);
1009
+ free(candidateDictBuffer);
1010
+ return COVER_dictSelectionError(dictContentSize);
1011
+
1012
+ }
1013
+
1014
+ totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
1015
+ samplesBuffer, offsets,
1016
+ nbCheckSamples, nbSamples,
1017
+ candidateDictBuffer, dictContentSize);
1018
+
1019
+ if (ZSTD_isError(totalCompressedSize)) {
1020
+ free(largestDictbuffer);
1021
+ free(candidateDictBuffer);
1022
+ return COVER_dictSelectionError(totalCompressedSize);
1023
+ }
1024
+
1025
+ if (totalCompressedSize <= largestCompressed * regressionTolerance) {
1026
+ COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
1027
+ free(largestDictbuffer);
1028
+ return selection;
1029
+ }
1030
+ dictContentSize *= 2;
1031
+ }
1032
+ dictContentSize = largestDict;
1033
+ totalCompressedSize = largestCompressed;
1034
+ {
1035
+ COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
1036
+ free(candidateDictBuffer);
1037
+ return selection;
1038
+ }
1039
+ }
1040
+
889
1041
  /**
890
1042
  * Parameters for COVER_tryParameters().
891
1043
  */
@@ -911,6 +1063,7 @@ static void COVER_tryParameters(void *opaque) {
911
1063
  /* Allocate space for hash table, dict, and freqs */
912
1064
  COVER_map_t activeDmers;
913
1065
  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
1066
+ COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
914
1067
  U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
915
1068
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
916
1069
  DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
@@ -926,29 +1079,21 @@ static void COVER_tryParameters(void *opaque) {
926
1079
  {
927
1080
  const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
928
1081
  dictBufferCapacity, parameters);
929
- dictBufferCapacity = ZDICT_finalizeDictionary(
930
- dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
931
- ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples,
932
- parameters.zParams);
933
- if (ZDICT_isError(dictBufferCapacity)) {
934
- DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
1082
+ selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
1083
+ ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
1084
+ totalCompressedSize);
1085
+
1086
+ if (COVER_dictSelectionIsError(selection)) {
1087
+ DISPLAYLEVEL(1, "Failed to select dictionary\n");
935
1088
  goto _cleanup;
936
1089
  }
937
1090
  }
938
- /* Check total compressed size */
939
- totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
940
- ctx->samples, ctx->offsets,
941
- ctx->nbTrainSamples, ctx->nbSamples,
942
- dict, dictBufferCapacity);
943
-
944
1091
  _cleanup:
945
- COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
946
- dictBufferCapacity);
1092
+ free(dict);
1093
+ COVER_best_finish(data->best, parameters, selection);
947
1094
  free(data);
948
1095
  COVER_map_destroy(&activeDmers);
949
- if (dict) {
950
- free(dict);
951
- }
1096
+ COVER_dictSelectionFree(selection);
952
1097
  if (freqs) {
953
1098
  free(freqs);
954
1099
  }
@@ -970,6 +1115,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
970
1115
  const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
971
1116
  const unsigned kIterations =
972
1117
  (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
1118
+ const unsigned shrinkDict = 0;
973
1119
  /* Local variables */
974
1120
  const int displayLevel = parameters->zParams.notificationLevel;
975
1121
  unsigned iteration = 1;
@@ -977,19 +1123,20 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
977
1123
  unsigned k;
978
1124
  COVER_best_t best;
979
1125
  POOL_ctx *pool = NULL;
1126
+ int warned = 0;
980
1127
 
981
1128
  /* Checks */
982
1129
  if (splitPoint <= 0 || splitPoint > 1) {
983
1130
  LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
984
- return ERROR(GENERIC);
1131
+ return ERROR(parameter_outOfBound);
985
1132
  }
986
1133
  if (kMinK < kMaxD || kMaxK < kMinK) {
987
1134
  LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
988
- return ERROR(GENERIC);
1135
+ return ERROR(parameter_outOfBound);
989
1136
  }
990
1137
  if (nbSamples == 0) {
991
1138
  DISPLAYLEVEL(1, "Cover must have at least one input file\n");
992
- return ERROR(GENERIC);
1139
+ return ERROR(srcSize_wrong);
993
1140
  }
994
1141
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
995
1142
  DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
@@ -1013,11 +1160,18 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
1013
1160
  /* Initialize the context for this value of d */
1014
1161
  COVER_ctx_t ctx;
1015
1162
  LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
1016
- if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint)) {
1017
- LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
1018
- COVER_best_destroy(&best);
1019
- POOL_free(pool);
1020
- return ERROR(GENERIC);
1163
+ {
1164
+ const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
1165
+ if (ZSTD_isError(initVal)) {
1166
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
1167
+ COVER_best_destroy(&best);
1168
+ POOL_free(pool);
1169
+ return initVal;
1170
+ }
1171
+ }
1172
+ if (!warned) {
1173
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
1174
+ warned = 1;
1021
1175
  }
1022
1176
  /* Loop through k reusing the same context */
1023
1177
  for (k = kMinK; k <= kMaxK; k += kStepSize) {
@@ -1030,7 +1184,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
1030
1184
  COVER_best_destroy(&best);
1031
1185
  COVER_ctx_destroy(&ctx);
1032
1186
  POOL_free(pool);
1033
- return ERROR(GENERIC);
1187
+ return ERROR(memory_allocation);
1034
1188
  }
1035
1189
  data->ctx = &ctx;
1036
1190
  data->best = &best;
@@ -1040,6 +1194,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
1040
1194
  data->parameters.d = d;
1041
1195
  data->parameters.splitPoint = splitPoint;
1042
1196
  data->parameters.steps = kSteps;
1197
+ data->parameters.shrinkDict = shrinkDict;
1043
1198
  data->parameters.zParams.notificationLevel = g_displayLevel;
1044
1199
  /* Check the parameters */
1045
1200
  if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {