zstd-ruby 1.3.8.0 → 1.4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +6 -5
  3. data/README.md +1 -1
  4. data/ext/zstdruby/libzstd/Makefile +133 -61
  5. data/ext/zstdruby/libzstd/README.md +51 -18
  6. data/ext/zstdruby/libzstd/common/bitstream.h +38 -39
  7. data/ext/zstdruby/libzstd/common/compiler.h +41 -6
  8. data/ext/zstdruby/libzstd/common/cpu.h +1 -1
  9. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  10. data/ext/zstdruby/libzstd/common/debug.h +11 -31
  11. data/ext/zstdruby/libzstd/common/entropy_common.c +13 -33
  12. data/ext/zstdruby/libzstd/common/error_private.c +2 -1
  13. data/ext/zstdruby/libzstd/common/error_private.h +6 -2
  14. data/ext/zstdruby/libzstd/common/fse.h +13 -33
  15. data/ext/zstdruby/libzstd/common/fse_decompress.c +12 -35
  16. data/ext/zstdruby/libzstd/common/huf.h +15 -33
  17. data/ext/zstdruby/libzstd/common/mem.h +75 -2
  18. data/ext/zstdruby/libzstd/common/pool.c +8 -4
  19. data/ext/zstdruby/libzstd/common/pool.h +2 -2
  20. data/ext/zstdruby/libzstd/common/threading.c +52 -6
  21. data/ext/zstdruby/libzstd/common/threading.h +36 -4
  22. data/ext/zstdruby/libzstd/common/xxhash.c +25 -37
  23. data/ext/zstdruby/libzstd/common/xxhash.h +11 -31
  24. data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
  25. data/ext/zstdruby/libzstd/common/zstd_errors.h +2 -1
  26. data/ext/zstdruby/libzstd/common/zstd_internal.h +203 -22
  27. data/ext/zstdruby/libzstd/compress/fse_compress.c +19 -42
  28. data/ext/zstdruby/libzstd/compress/hist.c +15 -35
  29. data/ext/zstdruby/libzstd/compress/hist.h +12 -32
  30. data/ext/zstdruby/libzstd/compress/huf_compress.c +92 -92
  31. data/ext/zstdruby/libzstd/compress/zstd_compress.c +1460 -1472
  32. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +330 -65
  33. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +158 -0
  34. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +29 -0
  35. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +419 -0
  36. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +54 -0
  37. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +845 -0
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  39. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +525 -0
  40. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +65 -43
  41. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
  42. data/ext/zstdruby/libzstd/compress/zstd_fast.c +264 -159
  43. data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
  44. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +74 -42
  45. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +2 -2
  46. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +33 -11
  47. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +7 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_opt.c +108 -125
  49. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  50. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +129 -93
  51. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +46 -28
  52. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +76 -60
  53. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +14 -10
  54. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +2 -2
  55. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +471 -258
  56. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +471 -346
  57. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +3 -3
  58. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +25 -4
  59. data/ext/zstdruby/libzstd/deprecated/zbuff.h +9 -8
  60. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
  61. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +1 -1
  62. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  63. data/ext/zstdruby/libzstd/dictBuilder/cover.c +220 -65
  64. data/ext/zstdruby/libzstd/dictBuilder/cover.h +81 -7
  65. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +85 -56
  66. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +43 -19
  67. data/ext/zstdruby/libzstd/dictBuilder/zdict.h +73 -35
  68. data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
  69. data/ext/zstdruby/libzstd/dll/example/build_package.bat +3 -2
  70. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +49 -15
  71. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +142 -117
  72. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +13 -8
  73. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +54 -25
  74. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +13 -8
  75. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +55 -25
  76. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +13 -8
  77. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +62 -29
  78. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +13 -8
  79. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +145 -109
  80. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +14 -9
  81. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +56 -26
  82. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +11 -6
  83. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +65 -28
  84. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +11 -6
  85. data/ext/zstdruby/libzstd/libzstd.pc.in +3 -2
  86. data/ext/zstdruby/libzstd/zstd.h +921 -597
  87. data/lib/zstd-ruby/version.rb +1 -1
  88. data/zstd-ruby.gemspec +2 -2
  89. metadata +19 -14
  90. data/ext/zstdruby/libzstd/dll/libzstd.def +0 -87
@@ -1,11 +1,21 @@
1
+ /*
2
+ * Copyright (c) 2017-2020, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
1
11
  #include <stdio.h> /* fprintf */
2
12
  #include <stdlib.h> /* malloc, free, qsort */
3
13
  #include <string.h> /* memset */
4
14
  #include <time.h> /* clock */
5
- #include "mem.h" /* read */
6
- #include "pool.h"
7
- #include "threading.h"
8
- #include "zstd_internal.h" /* includes zstd.h */
15
+ #include "../common/mem.h" /* read */
16
+ #include "../common/pool.h"
17
+ #include "../common/threading.h"
18
+ #include "../common/zstd_internal.h" /* includes zstd.h */
9
19
  #ifndef ZDICT_STATIC_LINKING_ONLY
10
20
  #define ZDICT_STATIC_LINKING_ONLY
11
21
  #endif
@@ -38,6 +48,44 @@ typedef struct {
38
48
  U32 score;
39
49
  } COVER_segment_t;
40
50
 
51
+ /**
52
+ *Number of epochs and size of each epoch.
53
+ */
54
+ typedef struct {
55
+ U32 num;
56
+ U32 size;
57
+ } COVER_epoch_info_t;
58
+
59
+ /**
60
+ * Struct used for the dictionary selection function.
61
+ */
62
+ typedef struct COVER_dictSelection {
63
+ BYTE* dictContent;
64
+ size_t dictSize;
65
+ size_t totalCompressedSize;
66
+ } COVER_dictSelection_t;
67
+
68
+ /**
69
+ * Computes the number of epochs and the size of each epoch.
70
+ * We will make sure that each epoch gets at least 10 * k bytes.
71
+ *
72
+ * The COVER algorithms divide the data up into epochs of equal size and
73
+ * select one segment from each epoch.
74
+ *
75
+ * @param maxDictSize The maximum allowed dictionary size.
76
+ * @param nbDmers The number of dmers we are training on.
77
+ * @param k The parameter k (segment size).
78
+ * @param passes The target number of passes over the dmer corpus.
79
+ * More passes means a better dictionary.
80
+ */
81
+ COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
82
+ U32 k, U32 passes);
83
+
84
+ /**
85
+ * Warns the user when their corpus is too small.
86
+ */
87
+ void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
88
+
41
89
  /**
42
90
  * Checks total compressed size of a dictionary
43
91
  */
@@ -78,6 +126,32 @@ void COVER_best_start(COVER_best_t *best);
78
126
  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
79
127
  * If this dictionary is the best so far save it and its parameters.
80
128
  */
81
- void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
82
- ZDICT_cover_params_t parameters, void *dict,
83
- size_t dictSize);
129
+ void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
130
+ COVER_dictSelection_t selection);
131
+ /**
132
+ * Error function for COVER_selectDict function. Checks if the return
133
+ * value is an error.
134
+ */
135
+ unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
136
+
137
+ /**
138
+ * Error function for COVER_selectDict function. Returns a struct where
139
+ * return.totalCompressedSize is a ZSTD error.
140
+ */
141
+ COVER_dictSelection_t COVER_dictSelectionError(size_t error);
142
+
143
+ /**
144
+ * Always call after selectDict is called to free up used memory from
145
+ * newly created dictionary.
146
+ */
147
+ void COVER_dictSelectionFree(COVER_dictSelection_t selection);
148
+
149
+ /**
150
+ * Called to finalize the dictionary and select one based on whether or not
151
+ * the shrink-dict flag was enabled. If enabled the dictionary used is the
152
+ * smallest dictionary within a specified regression of the compressed size
153
+ * from the largest dictionary.
154
+ */
155
+ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
156
+ size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
157
+ size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
@@ -1,3 +1,13 @@
1
+ /*
2
+ * Copyright (c) 2018-2020, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
1
11
  /*-*************************************
2
12
  * Dependencies
3
13
  ***************************************/
@@ -6,11 +16,11 @@
6
16
  #include <string.h> /* memset */
7
17
  #include <time.h> /* clock */
8
18
 
9
- #include "mem.h" /* read */
10
- #include "pool.h"
11
- #include "threading.h"
19
+ #include "../common/mem.h" /* read */
20
+ #include "../common/pool.h"
21
+ #include "../common/threading.h"
12
22
  #include "cover.h"
13
- #include "zstd_internal.h" /* includes zstd.h */
23
+ #include "../common/zstd_internal.h" /* includes zstd.h */
14
24
  #ifndef ZDICT_STATIC_LINKING_ONLY
15
25
  #define ZDICT_STATIC_LINKING_ONLY
16
26
  #endif
@@ -132,7 +142,7 @@ typedef struct {
132
142
  *
133
143
  * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
134
144
  *
135
- * Once the dmer with hash value d is in the dictionay we set F(d) = 0.
145
+ * Once the dmer with hash value d is in the dictionary we set F(d) = 0.
136
146
  */
137
147
  static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
138
148
  U32 *freqs, U32 begin, U32 end,
@@ -161,7 +171,7 @@ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
161
171
  /* Get hash value of current dmer */
162
172
  const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
163
173
 
164
- /* Add frequency of this index to score if this is the first occurence of index in active segment */
174
+ /* Add frequency of this index to score if this is the first occurrence of index in active segment */
165
175
  if (segmentFreqs[idx] == 0) {
166
176
  activeSegment.score += freqs[idx];
167
177
  }
@@ -287,10 +297,10 @@ FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)
287
297
  * Prepare a context for dictionary building.
288
298
  * The context is only dependent on the parameter `d` and can used multiple
289
299
  * times.
290
- * Returns 1 on success or zero on error.
300
+ * Returns 0 on success or error code on error.
291
301
  * The context must be destroyed with `FASTCOVER_ctx_destroy()`.
292
302
  */
293
- static int
303
+ static size_t
294
304
  FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
295
305
  const void* samplesBuffer,
296
306
  const size_t* samplesSizes, unsigned nbSamples,
@@ -310,19 +320,19 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
310
320
  totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
311
321
  DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
312
322
  (unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
313
- return 0;
323
+ return ERROR(srcSize_wrong);
314
324
  }
315
325
 
316
326
  /* Check if there are at least 5 training samples */
317
327
  if (nbTrainSamples < 5) {
318
328
  DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
319
- return 0;
329
+ return ERROR(srcSize_wrong);
320
330
  }
321
331
 
322
332
  /* Check if there's testing sample */
323
333
  if (nbTestSamples < 1) {
324
334
  DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
325
- return 0;
335
+ return ERROR(srcSize_wrong);
326
336
  }
327
337
 
328
338
  /* Zero the context */
@@ -347,7 +357,7 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
347
357
  if (ctx->offsets == NULL) {
348
358
  DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
349
359
  FASTCOVER_ctx_destroy(ctx);
350
- return 0;
360
+ return ERROR(memory_allocation);
351
361
  }
352
362
 
353
363
  /* Fill offsets from the samplesSizes */
@@ -364,13 +374,13 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
364
374
  if (ctx->freqs == NULL) {
365
375
  DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
366
376
  FASTCOVER_ctx_destroy(ctx);
367
- return 0;
377
+ return ERROR(memory_allocation);
368
378
  }
369
379
 
370
380
  DISPLAYLEVEL(2, "Computing frequencies\n");
371
381
  FASTCOVER_computeFrequency(ctx->freqs, ctx);
372
382
 
373
- return 1;
383
+ return 0;
374
384
  }
375
385
 
376
386
 
@@ -386,29 +396,35 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
386
396
  {
387
397
  BYTE *const dict = (BYTE *)dictBuffer;
388
398
  size_t tail = dictBufferCapacity;
389
- /* Divide the data up into epochs of equal size.
390
- * We will select at least one segment from each epoch.
391
- */
392
- const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
393
- const unsigned epochSize = (U32)(ctx->nbDmers / epochs);
399
+ /* Divide the data into epochs. We will select one segment from each epoch. */
400
+ const COVER_epoch_info_t epochs = COVER_computeEpochs(
401
+ (U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
402
+ const size_t maxZeroScoreRun = 10;
403
+ size_t zeroScoreRun = 0;
394
404
  size_t epoch;
395
405
  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
396
- epochs, epochSize);
406
+ (U32)epochs.num, (U32)epochs.size);
397
407
  /* Loop through the epochs until there are no more segments or the dictionary
398
408
  * is full.
399
409
  */
400
- for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
401
- const U32 epochBegin = (U32)(epoch * epochSize);
402
- const U32 epochEnd = epochBegin + epochSize;
410
+ for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
411
+ const U32 epochBegin = (U32)(epoch * epochs.size);
412
+ const U32 epochEnd = epochBegin + epochs.size;
403
413
  size_t segmentSize;
404
414
  /* Select a segment */
405
415
  COVER_segment_t segment = FASTCOVER_selectSegment(
406
416
  ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
407
417
 
408
- /* If the segment covers no dmers, then we are out of content */
418
+ /* If the segment covers no dmers, then we are out of content.
419
+ * There may be new content in other epochs, for continue for some time.
420
+ */
409
421
  if (segment.score == 0) {
410
- break;
422
+ if (++zeroScoreRun >= maxZeroScoreRun) {
423
+ break;
424
+ }
425
+ continue;
411
426
  }
427
+ zeroScoreRun = 0;
412
428
 
413
429
  /* Trim the segment if necessary and if it is too small then we are done */
414
430
  segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
@@ -429,7 +445,6 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
429
445
  return tail;
430
446
  }
431
447
 
432
-
433
448
  /**
434
449
  * Parameters for FASTCOVER_tryParameters().
435
450
  */
@@ -458,6 +473,7 @@ static void FASTCOVER_tryParameters(void *opaque)
458
473
  U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
459
474
  /* Allocate space for hash table, dict, and freqs */
460
475
  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
476
+ COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
461
477
  U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
462
478
  if (!segmentFreqs || !dict || !freqs) {
463
479
  DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
@@ -467,27 +483,24 @@ static void FASTCOVER_tryParameters(void *opaque)
467
483
  memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
468
484
  /* Build the dictionary */
469
485
  { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
470
- parameters, segmentFreqs);
486
+ parameters, segmentFreqs);
487
+
471
488
  const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
472
- dictBufferCapacity = ZDICT_finalizeDictionary(
473
- dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
474
- ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams);
475
- if (ZDICT_isError(dictBufferCapacity)) {
476
- DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
489
+ selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
490
+ ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
491
+ totalCompressedSize);
492
+
493
+ if (COVER_dictSelectionIsError(selection)) {
494
+ DISPLAYLEVEL(1, "Failed to select dictionary\n");
477
495
  goto _cleanup;
478
496
  }
479
497
  }
480
- /* Check total compressed size */
481
- totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
482
- ctx->samples, ctx->offsets,
483
- ctx->nbTrainSamples, ctx->nbSamples,
484
- dict, dictBufferCapacity);
485
498
  _cleanup:
486
- COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
487
- dictBufferCapacity);
499
+ free(dict);
500
+ COVER_best_finish(data->best, parameters, selection);
488
501
  free(data);
489
502
  free(segmentFreqs);
490
- free(dict);
503
+ COVER_dictSelectionFree(selection);
491
504
  free(freqs);
492
505
  }
493
506
 
@@ -502,6 +515,7 @@ FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
502
515
  coverParams->nbThreads = fastCoverParams.nbThreads;
503
516
  coverParams->splitPoint = fastCoverParams.splitPoint;
504
517
  coverParams->zParams = fastCoverParams.zParams;
518
+ coverParams->shrinkDict = fastCoverParams.shrinkDict;
505
519
  }
506
520
 
507
521
 
@@ -518,6 +532,7 @@ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
518
532
  fastCoverParams->f = f;
519
533
  fastCoverParams->accel = accel;
520
534
  fastCoverParams->zParams = coverParams.zParams;
535
+ fastCoverParams->shrinkDict = coverParams.shrinkDict;
521
536
  }
522
537
 
523
538
 
@@ -544,11 +559,11 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
544
559
  if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
545
560
  parameters.accel)) {
546
561
  DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
547
- return ERROR(GENERIC);
562
+ return ERROR(parameter_outOfBound);
548
563
  }
549
564
  if (nbSamples == 0) {
550
565
  DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
551
- return ERROR(GENERIC);
566
+ return ERROR(srcSize_wrong);
552
567
  }
553
568
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
554
569
  DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
@@ -558,12 +573,16 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
558
573
  /* Assign corresponding FASTCOVER_accel_t to accelParams*/
559
574
  accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
560
575
  /* Initialize context */
561
- if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
576
+ {
577
+ size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
562
578
  coverParams.d, parameters.splitPoint, parameters.f,
563
- accelParams)) {
564
- DISPLAYLEVEL(1, "Failed to initialize context\n");
565
- return ERROR(GENERIC);
579
+ accelParams);
580
+ if (ZSTD_isError(initVal)) {
581
+ DISPLAYLEVEL(1, "Failed to initialize context\n");
582
+ return initVal;
583
+ }
566
584
  }
585
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel);
567
586
  /* Build the dictionary */
568
587
  DISPLAYLEVEL(2, "Building dictionary\n");
569
588
  {
@@ -609,6 +628,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
609
628
  (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
610
629
  const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
611
630
  const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
631
+ const unsigned shrinkDict = 0;
612
632
  /* Local variables */
613
633
  const int displayLevel = parameters->zParams.notificationLevel;
614
634
  unsigned iteration = 1;
@@ -616,22 +636,23 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
616
636
  unsigned k;
617
637
  COVER_best_t best;
618
638
  POOL_ctx *pool = NULL;
639
+ int warned = 0;
619
640
  /* Checks */
620
641
  if (splitPoint <= 0 || splitPoint > 1) {
621
642
  LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
622
- return ERROR(GENERIC);
643
+ return ERROR(parameter_outOfBound);
623
644
  }
624
645
  if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
625
646
  LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
626
- return ERROR(GENERIC);
647
+ return ERROR(parameter_outOfBound);
627
648
  }
628
649
  if (kMinK < kMaxD || kMaxK < kMinK) {
629
650
  LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
630
- return ERROR(GENERIC);
651
+ return ERROR(parameter_outOfBound);
631
652
  }
632
653
  if (nbSamples == 0) {
633
654
  LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
634
- return ERROR(GENERIC);
655
+ return ERROR(srcSize_wrong);
635
656
  }
636
657
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
637
658
  LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
@@ -658,11 +679,18 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
658
679
  /* Initialize the context for this value of d */
659
680
  FASTCOVER_ctx_t ctx;
660
681
  LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
661
- if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams)) {
662
- LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
663
- COVER_best_destroy(&best);
664
- POOL_free(pool);
665
- return ERROR(GENERIC);
682
+ {
683
+ size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams);
684
+ if (ZSTD_isError(initVal)) {
685
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
686
+ COVER_best_destroy(&best);
687
+ POOL_free(pool);
688
+ return initVal;
689
+ }
690
+ }
691
+ if (!warned) {
692
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel);
693
+ warned = 1;
666
694
  }
667
695
  /* Loop through k reusing the same context */
668
696
  for (k = kMinK; k <= kMaxK; k += kStepSize) {
@@ -675,7 +703,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
675
703
  COVER_best_destroy(&best);
676
704
  FASTCOVER_ctx_destroy(&ctx);
677
705
  POOL_free(pool);
678
- return ERROR(GENERIC);
706
+ return ERROR(memory_allocation);
679
707
  }
680
708
  data->ctx = &ctx;
681
709
  data->best = &best;
@@ -685,6 +713,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
685
713
  data->parameters.d = d;
686
714
  data->parameters.splitPoint = splitPoint;
687
715
  data->parameters.steps = kSteps;
716
+ data->parameters.shrinkDict = shrinkDict;
688
717
  data->parameters.zParams.notificationLevel = g_displayLevel;
689
718
  /* Check the parameters */
690
719
  if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -37,17 +37,18 @@
37
37
  #include <stdio.h> /* fprintf, fopen, ftello64 */
38
38
  #include <time.h> /* clock */
39
39
 
40
- #include "mem.h" /* read */
41
- #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
40
+ #include "../common/mem.h" /* read */
41
+ #include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
42
42
  #define HUF_STATIC_LINKING_ONLY
43
- #include "huf.h" /* HUF_buildCTable, HUF_writeCTable */
44
- #include "zstd_internal.h" /* includes zstd.h */
45
- #include "xxhash.h" /* XXH64 */
43
+ #include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
44
+ #include "../common/zstd_internal.h" /* includes zstd.h */
45
+ #include "../common/xxhash.h" /* XXH64 */
46
46
  #include "divsufsort.h"
47
47
  #ifndef ZDICT_STATIC_LINKING_ONLY
48
48
  # define ZDICT_STATIC_LINKING_ONLY
49
49
  #endif
50
50
  #include "zdict.h"
51
+ #include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
51
52
 
52
53
 
53
54
  /*-*************************************
@@ -99,6 +100,29 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
99
100
  return MEM_readLE32((const char*)dictBuffer + 4);
100
101
  }
101
102
 
103
+ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
104
+ {
105
+ size_t headerSize;
106
+ if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
107
+
108
+ { unsigned offcodeMaxValue = MaxOff;
109
+ ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
110
+ U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
111
+ short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short));
112
+ if (!bs || !wksp || !offcodeNCount) {
113
+ headerSize = ERROR(memory_allocation);
114
+ } else {
115
+ ZSTD_reset_compressedBlockState(bs);
116
+ headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize);
117
+ }
118
+
119
+ free(bs);
120
+ free(wksp);
121
+ free(offcodeNCount);
122
+ }
123
+
124
+ return headerSize;
125
+ }
102
126
 
103
127
  /*-********************************************************
104
128
  * Dictionary training functions
@@ -571,7 +595,7 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
571
595
  unsigned const prime1 = 2654435761U;
572
596
  unsigned const prime2 = 2246822519U;
573
597
  unsigned acc = prime1;
574
- size_t p=0;;
598
+ size_t p=0;
575
599
  for (p=0; p<length; p++) {
576
600
  acc *= prime2;
577
601
  ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
@@ -588,12 +612,12 @@ typedef struct
588
612
 
589
613
  #define MAXREPOFFSET 1024
590
614
 
591
- static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
615
+ static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
592
616
  unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
593
617
  const void* src, size_t srcSize,
594
618
  U32 notificationLevel)
595
619
  {
596
- size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
620
+ size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
597
621
  size_t cSize;
598
622
 
599
623
  if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
@@ -731,7 +755,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
731
755
 
732
756
  /* collect stats on all samples */
733
757
  for (u=0; u<nbFiles; u++) {
734
- ZDICT_countEStats(esr, params,
758
+ ZDICT_countEStats(esr, &params,
735
759
  countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
736
760
  (const char*)srcBuffer + pos, fileSizes[u],
737
761
  notificationLevel);
@@ -741,7 +765,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
741
765
  /* analyze, build stats, starting with literals */
742
766
  { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
743
767
  if (HUF_isError(maxNbBits)) {
744
- eSize = ERROR(GENERIC);
768
+ eSize = maxNbBits;
745
769
  DISPLAYLEVEL(1, " HUF_buildCTable error \n");
746
770
  goto _cleanup;
747
771
  }
@@ -764,7 +788,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
764
788
  total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
765
789
  errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
766
790
  if (FSE_isError(errorCode)) {
767
- eSize = ERROR(GENERIC);
791
+ eSize = errorCode;
768
792
  DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
769
793
  goto _cleanup;
770
794
  }
@@ -773,7 +797,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
773
797
  total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
774
798
  errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
775
799
  if (FSE_isError(errorCode)) {
776
- eSize = ERROR(GENERIC);
800
+ eSize = errorCode;
777
801
  DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
778
802
  goto _cleanup;
779
803
  }
@@ -782,7 +806,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
782
806
  total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
783
807
  errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
784
808
  if (FSE_isError(errorCode)) {
785
- eSize = ERROR(GENERIC);
809
+ eSize = errorCode;
786
810
  DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
787
811
  goto _cleanup;
788
812
  }
@@ -791,7 +815,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
791
815
  /* write result to buffer */
792
816
  { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
793
817
  if (HUF_isError(hhSize)) {
794
- eSize = ERROR(GENERIC);
818
+ eSize = hhSize;
795
819
  DISPLAYLEVEL(1, "HUF_writeCTable error \n");
796
820
  goto _cleanup;
797
821
  }
@@ -802,7 +826,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
802
826
 
803
827
  { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
804
828
  if (FSE_isError(ohSize)) {
805
- eSize = ERROR(GENERIC);
829
+ eSize = ohSize;
806
830
  DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
807
831
  goto _cleanup;
808
832
  }
@@ -813,7 +837,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
813
837
 
814
838
  { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
815
839
  if (FSE_isError(mhSize)) {
816
- eSize = ERROR(GENERIC);
840
+ eSize = mhSize;
817
841
  DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
818
842
  goto _cleanup;
819
843
  }
@@ -824,7 +848,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
824
848
 
825
849
  { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
826
850
  if (FSE_isError(lhSize)) {
827
- eSize = ERROR(GENERIC);
851
+ eSize = lhSize;
828
852
  DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
829
853
  goto _cleanup;
830
854
  }
@@ -834,7 +858,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
834
858
  }
835
859
 
836
860
  if (maxDstSize<12) {
837
- eSize = ERROR(GENERIC);
861
+ eSize = ERROR(dstSize_tooSmall);
838
862
  DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
839
863
  goto _cleanup;
840
864
  }