zstd-ruby 1.3.8.0 → 1.4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +6 -5
  3. data/README.md +1 -1
  4. data/ext/zstdruby/libzstd/Makefile +133 -61
  5. data/ext/zstdruby/libzstd/README.md +51 -18
  6. data/ext/zstdruby/libzstd/common/bitstream.h +38 -39
  7. data/ext/zstdruby/libzstd/common/compiler.h +41 -6
  8. data/ext/zstdruby/libzstd/common/cpu.h +1 -1
  9. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  10. data/ext/zstdruby/libzstd/common/debug.h +11 -31
  11. data/ext/zstdruby/libzstd/common/entropy_common.c +13 -33
  12. data/ext/zstdruby/libzstd/common/error_private.c +2 -1
  13. data/ext/zstdruby/libzstd/common/error_private.h +6 -2
  14. data/ext/zstdruby/libzstd/common/fse.h +13 -33
  15. data/ext/zstdruby/libzstd/common/fse_decompress.c +12 -35
  16. data/ext/zstdruby/libzstd/common/huf.h +15 -33
  17. data/ext/zstdruby/libzstd/common/mem.h +75 -2
  18. data/ext/zstdruby/libzstd/common/pool.c +8 -4
  19. data/ext/zstdruby/libzstd/common/pool.h +2 -2
  20. data/ext/zstdruby/libzstd/common/threading.c +52 -6
  21. data/ext/zstdruby/libzstd/common/threading.h +36 -4
  22. data/ext/zstdruby/libzstd/common/xxhash.c +25 -37
  23. data/ext/zstdruby/libzstd/common/xxhash.h +11 -31
  24. data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
  25. data/ext/zstdruby/libzstd/common/zstd_errors.h +2 -1
  26. data/ext/zstdruby/libzstd/common/zstd_internal.h +203 -22
  27. data/ext/zstdruby/libzstd/compress/fse_compress.c +19 -42
  28. data/ext/zstdruby/libzstd/compress/hist.c +15 -35
  29. data/ext/zstdruby/libzstd/compress/hist.h +12 -32
  30. data/ext/zstdruby/libzstd/compress/huf_compress.c +92 -92
  31. data/ext/zstdruby/libzstd/compress/zstd_compress.c +1460 -1472
  32. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +330 -65
  33. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +158 -0
  34. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +29 -0
  35. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +419 -0
  36. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +54 -0
  37. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +845 -0
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  39. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +525 -0
  40. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +65 -43
  41. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
  42. data/ext/zstdruby/libzstd/compress/zstd_fast.c +264 -159
  43. data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
  44. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +74 -42
  45. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +2 -2
  46. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +33 -11
  47. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +7 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_opt.c +108 -125
  49. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  50. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +129 -93
  51. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +46 -28
  52. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +76 -60
  53. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +14 -10
  54. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +2 -2
  55. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +471 -258
  56. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +471 -346
  57. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +3 -3
  58. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +25 -4
  59. data/ext/zstdruby/libzstd/deprecated/zbuff.h +9 -8
  60. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
  61. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +1 -1
  62. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  63. data/ext/zstdruby/libzstd/dictBuilder/cover.c +220 -65
  64. data/ext/zstdruby/libzstd/dictBuilder/cover.h +81 -7
  65. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +85 -56
  66. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +43 -19
  67. data/ext/zstdruby/libzstd/dictBuilder/zdict.h +73 -35
  68. data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
  69. data/ext/zstdruby/libzstd/dll/example/build_package.bat +3 -2
  70. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +49 -15
  71. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +142 -117
  72. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +13 -8
  73. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +54 -25
  74. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +13 -8
  75. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +55 -25
  76. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +13 -8
  77. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +62 -29
  78. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +13 -8
  79. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +145 -109
  80. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +14 -9
  81. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +56 -26
  82. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +11 -6
  83. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +65 -28
  84. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +11 -6
  85. data/ext/zstdruby/libzstd/libzstd.pc.in +3 -2
  86. data/ext/zstdruby/libzstd/zstd.h +921 -597
  87. data/lib/zstd-ruby/version.rb +1 -1
  88. data/zstd-ruby.gemspec +2 -2
  89. metadata +19 -14
  90. data/ext/zstdruby/libzstd/dll/libzstd.def +0 -87
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -16,8 +16,8 @@
16
16
  * Dependencies
17
17
  *********************************************************/
18
18
  #include <stddef.h> /* size_t */
19
- #include "zstd.h" /* DCtx, and some public functions */
20
- #include "zstd_internal.h" /* blockProperties_t, and some public functions */
19
+ #include "../zstd.h" /* DCtx, and some public functions */
20
+ #include "../common/zstd_internal.h" /* blockProperties_t, and some public functions */
21
21
  #include "zstd_decompress_internal.h" /* ZSTD_seqSymbol */
22
22
 
23
23
 
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -19,8 +19,8 @@
19
19
  /*-*******************************************************
20
20
  * Dependencies
21
21
  *********************************************************/
22
- #include "mem.h" /* BYTE, U16, U32 */
23
- #include "zstd_internal.h" /* ZSTD_seqSymbol */
22
+ #include "../common/mem.h" /* BYTE, U16, U32 */
23
+ #include "../common/zstd_internal.h" /* ZSTD_seqSymbol */
24
24
 
25
25
 
26
26
 
@@ -89,6 +89,17 @@ typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
89
89
  typedef enum { zdss_init=0, zdss_loadHeader,
90
90
  zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
91
91
 
92
+ typedef enum {
93
+ ZSTD_use_indefinitely = -1, /* Use the dictionary indefinitely */
94
+ ZSTD_dont_use = 0, /* Do not use the dictionary (if one exists free it) */
95
+ ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */
96
+ } ZSTD_dictUses_e;
97
+
98
+ typedef enum {
99
+ ZSTD_obm_buffered = 0, /* Buffer the output */
100
+ ZSTD_obm_stable = 1 /* ZSTD_outBuffer is stable */
101
+ } ZSTD_outBufferMode_e;
102
+
92
103
  struct ZSTD_DCtx_s
93
104
  {
94
105
  const ZSTD_seqSymbol* LLTptr;
@@ -123,6 +134,7 @@ struct ZSTD_DCtx_s
123
134
  const ZSTD_DDict* ddict; /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
124
135
  U32 dictID;
125
136
  int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
137
+ ZSTD_dictUses_e dictUses;
126
138
 
127
139
  /* streaming */
128
140
  ZSTD_dStreamStage streamStage;
@@ -140,10 +152,19 @@ struct ZSTD_DCtx_s
140
152
  U32 legacyVersion;
141
153
  U32 hostageByte;
142
154
  int noForwardProgress;
155
+ ZSTD_outBufferMode_e outBufferMode;
156
+ ZSTD_outBuffer expectedOutBuffer;
143
157
 
144
158
  /* workspace */
145
159
  BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
146
160
  BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
161
+
162
+ size_t oversizedDuration;
163
+
164
+ #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
165
+ void const* dictContentBeginForFuzzing;
166
+ void const* dictContentEndForFuzzing;
167
+ #endif
147
168
  }; /* typedef'd to ZSTD_DCtx within "zstd.h" */
148
169
 
149
170
 
@@ -153,7 +174,7 @@ struct ZSTD_DCtx_s
153
174
 
154
175
  /*! ZSTD_loadDEntropy() :
155
176
  * dict : must point at beginning of a valid zstd dictionary.
156
- * @return : size of entropy tables read */
177
+ * @return : size of dictionary header (size of magic number + dict ID + entropy tables) */
157
178
  size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
158
179
  const void* const dict, size_t const dictSize);
159
180
 
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -28,7 +28,7 @@ extern "C" {
28
28
  * Dependencies
29
29
  ***************************************/
30
30
  #include <stddef.h> /* size_t */
31
- #include "zstd.h" /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */
31
+ #include "../zstd.h" /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */
32
32
 
33
33
 
34
34
  /* ***************************************************************
@@ -36,16 +36,17 @@ extern "C" {
36
36
  *****************************************************************/
37
37
  /* Deprecation warnings */
38
38
  /* Should these warnings be a problem,
39
- it is generally possible to disable them,
40
- typically with -Wno-deprecated-declarations for gcc
41
- or _CRT_SECURE_NO_WARNINGS in Visual.
42
- Otherwise, it's also possible to define ZBUFF_DISABLE_DEPRECATE_WARNINGS */
39
+ * it is generally possible to disable them,
40
+ * typically with -Wno-deprecated-declarations for gcc
41
+ * or _CRT_SECURE_NO_WARNINGS in Visual.
42
+ * Otherwise, it's also possible to define ZBUFF_DISABLE_DEPRECATE_WARNINGS
43
+ */
43
44
  #ifdef ZBUFF_DISABLE_DEPRECATE_WARNINGS
44
45
  # define ZBUFF_DEPRECATED(message) ZSTDLIB_API /* disable deprecation warnings */
45
46
  #else
46
47
  # if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
47
48
  # define ZBUFF_DEPRECATED(message) [[deprecated(message)]] ZSTDLIB_API
48
- # elif (defined(__GNUC__) && (__GNUC__ >= 5)) || defined(__clang__)
49
+ # elif (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) || defined(__clang__)
49
50
  # define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated(message)))
50
51
  # elif defined(__GNUC__) && (__GNUC__ >= 3)
51
52
  # define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated))
@@ -185,7 +186,7 @@ ZBUFF_DEPRECATED("use ZSTD_DStreamOutSize") size_t ZBUFF_recommendedDOutSize(voi
185
186
 
186
187
  /*--- Dependency ---*/
187
188
  #define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters, ZSTD_customMem */
188
- #include "zstd.h"
189
+ #include "../zstd.h"
189
190
 
190
191
 
191
192
  /*--- Custom memory allocator ---*/
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,7 +11,7 @@
11
11
  /*-*************************************
12
12
  * Dependencies
13
13
  ***************************************/
14
- #include "error_private.h"
14
+ #include "../common/error_private.h"
15
15
  #include "zbuff.h"
16
16
 
17
17
  /*-****************************************
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -26,11 +26,11 @@
26
26
  #include <string.h> /* memset */
27
27
  #include <time.h> /* clock */
28
28
 
29
- #include "mem.h" /* read */
30
- #include "pool.h"
31
- #include "threading.h"
29
+ #include "../common/mem.h" /* read */
30
+ #include "../common/pool.h"
31
+ #include "../common/threading.h"
32
32
  #include "cover.h"
33
- #include "zstd_internal.h" /* includes zstd.h */
33
+ #include "../common/zstd_internal.h" /* includes zstd.h */
34
34
  #ifndef ZDICT_STATIC_LINKING_ONLY
35
35
  #define ZDICT_STATIC_LINKING_ONLY
36
36
  #endif
@@ -391,7 +391,7 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
391
391
  *
392
392
  * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
393
393
  *
394
- * Once the dmer d is in the dictionay we set F(d) = 0.
394
+ * Once the dmer d is in the dictionary we set F(d) = 0.
395
395
  */
396
396
  static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
397
397
  COVER_map_t *activeDmers, U32 begin,
@@ -435,7 +435,7 @@ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
435
435
  U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
436
436
  activeSegment.begin += 1;
437
437
  *delDmerOcc -= 1;
438
- /* If this is the last occurence of the dmer, subtract its score */
438
+ /* If this is the last occurrence of the dmer, subtract its score */
439
439
  if (*delDmerOcc == 0) {
440
440
  COVER_map_remove(activeDmers, delDmer);
441
441
  activeSegment.score -= freqs[delDmer];
@@ -526,10 +526,10 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
526
526
  * Prepare a context for dictionary building.
527
527
  * The context is only dependent on the parameter `d` and can be used multiple
528
528
  * times.
529
- * Returns 1 on success or zero on error.
529
+ * Returns 0 on success or error code on error.
530
530
  * The context must be destroyed with `COVER_ctx_destroy()`.
531
531
  */
532
- static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
532
+ static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
533
533
  const size_t *samplesSizes, unsigned nbSamples,
534
534
  unsigned d, double splitPoint) {
535
535
  const BYTE *const samples = (const BYTE *)samplesBuffer;
@@ -544,17 +544,17 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
544
544
  totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
545
545
  DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
546
546
  (unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
547
- return 0;
547
+ return ERROR(srcSize_wrong);
548
548
  }
549
549
  /* Check if there are at least 5 training samples */
550
550
  if (nbTrainSamples < 5) {
551
551
  DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
552
- return 0;
552
+ return ERROR(srcSize_wrong);
553
553
  }
554
554
  /* Check if there's testing sample */
555
555
  if (nbTestSamples < 1) {
556
556
  DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
557
- return 0;
557
+ return ERROR(srcSize_wrong);
558
558
  }
559
559
  /* Zero the context */
560
560
  memset(ctx, 0, sizeof(*ctx));
@@ -577,7 +577,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
577
577
  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
578
578
  DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
579
579
  COVER_ctx_destroy(ctx);
580
- return 0;
580
+ return ERROR(memory_allocation);
581
581
  }
582
582
  ctx->freqs = NULL;
583
583
  ctx->d = d;
@@ -624,7 +624,40 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
624
624
  (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
625
625
  ctx->freqs = ctx->suffix;
626
626
  ctx->suffix = NULL;
627
- return 1;
627
+ return 0;
628
+ }
629
+
630
+ void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
631
+ {
632
+ const double ratio = (double)nbDmers / maxDictSize;
633
+ if (ratio >= 10) {
634
+ return;
635
+ }
636
+ LOCALDISPLAYLEVEL(displayLevel, 1,
637
+ "WARNING: The maximum dictionary size %u is too large "
638
+ "compared to the source size %u! "
639
+ "size(source)/size(dictionary) = %f, but it should be >= "
640
+ "10! This may lead to a subpar dictionary! We recommend "
641
+ "training on sources at least 10x, and preferably 100x "
642
+ "the size of the dictionary! \n", (U32)maxDictSize,
643
+ (U32)nbDmers, ratio);
644
+ }
645
+
646
+ COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
647
+ U32 nbDmers, U32 k, U32 passes)
648
+ {
649
+ const U32 minEpochSize = k * 10;
650
+ COVER_epoch_info_t epochs;
651
+ epochs.num = MAX(1, maxDictSize / k / passes);
652
+ epochs.size = nbDmers / epochs.num;
653
+ if (epochs.size >= minEpochSize) {
654
+ assert(epochs.size * epochs.num <= nbDmers);
655
+ return epochs;
656
+ }
657
+ epochs.size = MIN(minEpochSize, nbDmers);
658
+ epochs.num = nbDmers / epochs.size;
659
+ assert(epochs.size * epochs.num <= nbDmers);
660
+ return epochs;
628
661
  }
629
662
 
630
663
  /**
@@ -636,28 +669,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
636
669
  ZDICT_cover_params_t parameters) {
637
670
  BYTE *const dict = (BYTE *)dictBuffer;
638
671
  size_t tail = dictBufferCapacity;
639
- /* Divide the data up into epochs of equal size.
640
- * We will select at least one segment from each epoch.
641
- */
642
- const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k / 4));
643
- const unsigned epochSize = (U32)(ctx->suffixSize / epochs);
672
+ /* Divide the data into epochs. We will select one segment from each epoch. */
673
+ const COVER_epoch_info_t epochs = COVER_computeEpochs(
674
+ (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
675
+ const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
676
+ size_t zeroScoreRun = 0;
644
677
  size_t epoch;
645
678
  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
646
- epochs, epochSize);
679
+ (U32)epochs.num, (U32)epochs.size);
647
680
  /* Loop through the epochs until there are no more segments or the dictionary
648
681
  * is full.
649
682
  */
650
- for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
651
- const U32 epochBegin = (U32)(epoch * epochSize);
652
- const U32 epochEnd = epochBegin + epochSize;
683
+ for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
684
+ const U32 epochBegin = (U32)(epoch * epochs.size);
685
+ const U32 epochEnd = epochBegin + epochs.size;
653
686
  size_t segmentSize;
654
687
  /* Select a segment */
655
688
  COVER_segment_t segment = COVER_selectSegment(
656
689
  ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
657
- /* If the segment covers no dmers, then we are out of content */
690
+ /* If the segment covers no dmers, then we are out of content.
691
+ * There may be new content in other epochs, so continue for some time.
692
+ */
658
693
  if (segment.score == 0) {
659
- break;
694
+ if (++zeroScoreRun >= maxZeroScoreRun) {
695
+ break;
696
+ }
697
+ continue;
660
698
  }
699
+ zeroScoreRun = 0;
661
700
  /* Trim the segment if necessary and if it is too small then we are done */
662
701
  segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
663
702
  if (segmentSize < parameters.d) {
@@ -690,11 +729,11 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
690
729
  /* Checks */
691
730
  if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
692
731
  DISPLAYLEVEL(1, "Cover parameters incorrect\n");
693
- return ERROR(GENERIC);
732
+ return ERROR(parameter_outOfBound);
694
733
  }
695
734
  if (nbSamples == 0) {
696
735
  DISPLAYLEVEL(1, "Cover must have at least one input file\n");
697
- return ERROR(GENERIC);
736
+ return ERROR(srcSize_wrong);
698
737
  }
699
738
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
700
739
  DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
@@ -702,14 +741,18 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
702
741
  return ERROR(dstSize_tooSmall);
703
742
  }
704
743
  /* Initialize context and activeDmers */
705
- if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
706
- parameters.d, parameters.splitPoint)) {
707
- return ERROR(GENERIC);
744
+ {
745
+ size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
746
+ parameters.d, parameters.splitPoint);
747
+ if (ZSTD_isError(initVal)) {
748
+ return initVal;
749
+ }
708
750
  }
751
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
709
752
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
710
753
  DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
711
754
  COVER_ctx_destroy(&ctx);
712
- return ERROR(GENERIC);
755
+ return ERROR(memory_allocation);
713
756
  }
714
757
 
715
758
  DISPLAYLEVEL(2, "Building dictionary\n");
@@ -770,7 +813,7 @@ size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
770
813
  cctx, dst, dstCapacity, samples + offsets[i],
771
814
  samplesSizes[i], cdict);
772
815
  if (ZSTD_isError(size)) {
773
- totalCompressedSize = ERROR(GENERIC);
816
+ totalCompressedSize = size;
774
817
  goto _compressCleanup;
775
818
  }
776
819
  totalCompressedSize += size;
@@ -846,9 +889,11 @@ void COVER_best_start(COVER_best_t *best) {
846
889
  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
847
890
  * If this dictionary is the best so far save it and its parameters.
848
891
  */
849
- void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
850
- ZDICT_cover_params_t parameters, void *dict,
851
- size_t dictSize) {
892
+ void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
893
+ COVER_dictSelection_t selection) {
894
+ void* dict = selection.dictContent;
895
+ size_t compressedSize = selection.totalCompressedSize;
896
+ size_t dictSize = selection.dictSize;
852
897
  if (!best) {
853
898
  return;
854
899
  }
@@ -874,10 +919,12 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
874
919
  }
875
920
  }
876
921
  /* Save the dictionary, parameters, and size */
877
- memcpy(best->dict, dict, dictSize);
878
- best->dictSize = dictSize;
879
- best->parameters = parameters;
880
- best->compressedSize = compressedSize;
922
+ if (dict) {
923
+ memcpy(best->dict, dict, dictSize);
924
+ best->dictSize = dictSize;
925
+ best->parameters = parameters;
926
+ best->compressedSize = compressedSize;
927
+ }
881
928
  }
882
929
  if (liveJobs == 0) {
883
930
  ZSTD_pthread_cond_broadcast(&best->cond);
@@ -886,6 +933,111 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
886
933
  }
887
934
  }
888
935
 
936
+ COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
937
+ COVER_dictSelection_t selection = { NULL, 0, error };
938
+ return selection;
939
+ }
940
+
941
+ unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
942
+ return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
943
+ }
944
+
945
+ void COVER_dictSelectionFree(COVER_dictSelection_t selection){
946
+ free(selection.dictContent);
947
+ }
948
+
949
+ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
950
+ size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
951
+ size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
952
+
953
+ size_t largestDict = 0;
954
+ size_t largestCompressed = 0;
955
+ BYTE* customDictContentEnd = customDictContent + dictContentSize;
956
+
957
+ BYTE * largestDictbuffer = (BYTE *)malloc(dictContentSize);
958
+ BYTE * candidateDictBuffer = (BYTE *)malloc(dictContentSize);
959
+ double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
960
+
961
+ if (!largestDictbuffer || !candidateDictBuffer) {
962
+ free(largestDictbuffer);
963
+ free(candidateDictBuffer);
964
+ return COVER_dictSelectionError(dictContentSize);
965
+ }
966
+
967
+ /* Initial dictionary size and compressed size */
968
+ memcpy(largestDictbuffer, customDictContent, dictContentSize);
969
+ dictContentSize = ZDICT_finalizeDictionary(
970
+ largestDictbuffer, dictContentSize, customDictContent, dictContentSize,
971
+ samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
972
+
973
+ if (ZDICT_isError(dictContentSize)) {
974
+ free(largestDictbuffer);
975
+ free(candidateDictBuffer);
976
+ return COVER_dictSelectionError(dictContentSize);
977
+ }
978
+
979
+ totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
980
+ samplesBuffer, offsets,
981
+ nbCheckSamples, nbSamples,
982
+ largestDictbuffer, dictContentSize);
983
+
984
+ if (ZSTD_isError(totalCompressedSize)) {
985
+ free(largestDictbuffer);
986
+ free(candidateDictBuffer);
987
+ return COVER_dictSelectionError(totalCompressedSize);
988
+ }
989
+
990
+ if (params.shrinkDict == 0) {
991
+ COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
992
+ free(candidateDictBuffer);
993
+ return selection;
994
+ }
995
+
996
+ largestDict = dictContentSize;
997
+ largestCompressed = totalCompressedSize;
998
+ dictContentSize = ZDICT_DICTSIZE_MIN;
999
+
1000
+ /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
1001
+ while (dictContentSize < largestDict) {
1002
+ memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
1003
+ dictContentSize = ZDICT_finalizeDictionary(
1004
+ candidateDictBuffer, dictContentSize, customDictContentEnd - dictContentSize, dictContentSize,
1005
+ samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
1006
+
1007
+ if (ZDICT_isError(dictContentSize)) {
1008
+ free(largestDictbuffer);
1009
+ free(candidateDictBuffer);
1010
+ return COVER_dictSelectionError(dictContentSize);
1011
+
1012
+ }
1013
+
1014
+ totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
1015
+ samplesBuffer, offsets,
1016
+ nbCheckSamples, nbSamples,
1017
+ candidateDictBuffer, dictContentSize);
1018
+
1019
+ if (ZSTD_isError(totalCompressedSize)) {
1020
+ free(largestDictbuffer);
1021
+ free(candidateDictBuffer);
1022
+ return COVER_dictSelectionError(totalCompressedSize);
1023
+ }
1024
+
1025
+ if (totalCompressedSize <= largestCompressed * regressionTolerance) {
1026
+ COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
1027
+ free(largestDictbuffer);
1028
+ return selection;
1029
+ }
1030
+ dictContentSize *= 2;
1031
+ }
1032
+ dictContentSize = largestDict;
1033
+ totalCompressedSize = largestCompressed;
1034
+ {
1035
+ COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
1036
+ free(candidateDictBuffer);
1037
+ return selection;
1038
+ }
1039
+ }
1040
+
889
1041
  /**
890
1042
  * Parameters for COVER_tryParameters().
891
1043
  */
@@ -911,6 +1063,7 @@ static void COVER_tryParameters(void *opaque) {
911
1063
  /* Allocate space for hash table, dict, and freqs */
912
1064
  COVER_map_t activeDmers;
913
1065
  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
1066
+ COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
914
1067
  U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
915
1068
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
916
1069
  DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
@@ -926,29 +1079,21 @@ static void COVER_tryParameters(void *opaque) {
926
1079
  {
927
1080
  const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
928
1081
  dictBufferCapacity, parameters);
929
- dictBufferCapacity = ZDICT_finalizeDictionary(
930
- dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
931
- ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples,
932
- parameters.zParams);
933
- if (ZDICT_isError(dictBufferCapacity)) {
934
- DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
1082
+ selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
1083
+ ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
1084
+ totalCompressedSize);
1085
+
1086
+ if (COVER_dictSelectionIsError(selection)) {
1087
+ DISPLAYLEVEL(1, "Failed to select dictionary\n");
935
1088
  goto _cleanup;
936
1089
  }
937
1090
  }
938
- /* Check total compressed size */
939
- totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
940
- ctx->samples, ctx->offsets,
941
- ctx->nbTrainSamples, ctx->nbSamples,
942
- dict, dictBufferCapacity);
943
-
944
1091
  _cleanup:
945
- COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
946
- dictBufferCapacity);
1092
+ free(dict);
1093
+ COVER_best_finish(data->best, parameters, selection);
947
1094
  free(data);
948
1095
  COVER_map_destroy(&activeDmers);
949
- if (dict) {
950
- free(dict);
951
- }
1096
+ COVER_dictSelectionFree(selection);
952
1097
  if (freqs) {
953
1098
  free(freqs);
954
1099
  }
@@ -970,6 +1115,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
970
1115
  const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
971
1116
  const unsigned kIterations =
972
1117
  (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
1118
+ const unsigned shrinkDict = 0;
973
1119
  /* Local variables */
974
1120
  const int displayLevel = parameters->zParams.notificationLevel;
975
1121
  unsigned iteration = 1;
@@ -977,19 +1123,20 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
977
1123
  unsigned k;
978
1124
  COVER_best_t best;
979
1125
  POOL_ctx *pool = NULL;
1126
+ int warned = 0;
980
1127
 
981
1128
  /* Checks */
982
1129
  if (splitPoint <= 0 || splitPoint > 1) {
983
1130
  LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
984
- return ERROR(GENERIC);
1131
+ return ERROR(parameter_outOfBound);
985
1132
  }
986
1133
  if (kMinK < kMaxD || kMaxK < kMinK) {
987
1134
  LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
988
- return ERROR(GENERIC);
1135
+ return ERROR(parameter_outOfBound);
989
1136
  }
990
1137
  if (nbSamples == 0) {
991
1138
  DISPLAYLEVEL(1, "Cover must have at least one input file\n");
992
- return ERROR(GENERIC);
1139
+ return ERROR(srcSize_wrong);
993
1140
  }
994
1141
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
995
1142
  DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
@@ -1013,11 +1160,18 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
1013
1160
  /* Initialize the context for this value of d */
1014
1161
  COVER_ctx_t ctx;
1015
1162
  LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
1016
- if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint)) {
1017
- LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
1018
- COVER_best_destroy(&best);
1019
- POOL_free(pool);
1020
- return ERROR(GENERIC);
1163
+ {
1164
+ const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
1165
+ if (ZSTD_isError(initVal)) {
1166
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
1167
+ COVER_best_destroy(&best);
1168
+ POOL_free(pool);
1169
+ return initVal;
1170
+ }
1171
+ }
1172
+ if (!warned) {
1173
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
1174
+ warned = 1;
1021
1175
  }
1022
1176
  /* Loop through k reusing the same context */
1023
1177
  for (k = kMinK; k <= kMaxK; k += kStepSize) {
@@ -1030,7 +1184,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
1030
1184
  COVER_best_destroy(&best);
1031
1185
  COVER_ctx_destroy(&ctx);
1032
1186
  POOL_free(pool);
1033
- return ERROR(GENERIC);
1187
+ return ERROR(memory_allocation);
1034
1188
  }
1035
1189
  data->ctx = &ctx;
1036
1190
  data->best = &best;
@@ -1040,6 +1194,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
1040
1194
  data->parameters.d = d;
1041
1195
  data->parameters.splitPoint = splitPoint;
1042
1196
  data->parameters.steps = kSteps;
1197
+ data->parameters.shrinkDict = shrinkDict;
1043
1198
  data->parameters.zParams.notificationLevel = g_displayLevel;
1044
1199
  /* Check the parameters */
1045
1200
  if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {