zstd-ruby 1.3.8.0 → 1.4.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +6 -5
  3. data/README.md +1 -1
  4. data/ext/zstdruby/libzstd/Makefile +133 -61
  5. data/ext/zstdruby/libzstd/README.md +51 -18
  6. data/ext/zstdruby/libzstd/common/bitstream.h +38 -39
  7. data/ext/zstdruby/libzstd/common/compiler.h +41 -6
  8. data/ext/zstdruby/libzstd/common/cpu.h +1 -1
  9. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  10. data/ext/zstdruby/libzstd/common/debug.h +11 -31
  11. data/ext/zstdruby/libzstd/common/entropy_common.c +13 -33
  12. data/ext/zstdruby/libzstd/common/error_private.c +2 -1
  13. data/ext/zstdruby/libzstd/common/error_private.h +6 -2
  14. data/ext/zstdruby/libzstd/common/fse.h +13 -33
  15. data/ext/zstdruby/libzstd/common/fse_decompress.c +12 -35
  16. data/ext/zstdruby/libzstd/common/huf.h +15 -33
  17. data/ext/zstdruby/libzstd/common/mem.h +75 -2
  18. data/ext/zstdruby/libzstd/common/pool.c +8 -4
  19. data/ext/zstdruby/libzstd/common/pool.h +2 -2
  20. data/ext/zstdruby/libzstd/common/threading.c +52 -6
  21. data/ext/zstdruby/libzstd/common/threading.h +36 -4
  22. data/ext/zstdruby/libzstd/common/xxhash.c +25 -37
  23. data/ext/zstdruby/libzstd/common/xxhash.h +11 -31
  24. data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
  25. data/ext/zstdruby/libzstd/common/zstd_errors.h +2 -1
  26. data/ext/zstdruby/libzstd/common/zstd_internal.h +203 -22
  27. data/ext/zstdruby/libzstd/compress/fse_compress.c +19 -42
  28. data/ext/zstdruby/libzstd/compress/hist.c +15 -35
  29. data/ext/zstdruby/libzstd/compress/hist.h +12 -32
  30. data/ext/zstdruby/libzstd/compress/huf_compress.c +92 -92
  31. data/ext/zstdruby/libzstd/compress/zstd_compress.c +1460 -1472
  32. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +330 -65
  33. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +158 -0
  34. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +29 -0
  35. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +419 -0
  36. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +54 -0
  37. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +845 -0
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  39. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +525 -0
  40. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +65 -43
  41. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
  42. data/ext/zstdruby/libzstd/compress/zstd_fast.c +264 -159
  43. data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
  44. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +74 -42
  45. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +2 -2
  46. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +33 -11
  47. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +7 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_opt.c +108 -125
  49. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  50. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +129 -93
  51. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +46 -28
  52. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +76 -60
  53. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +14 -10
  54. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +2 -2
  55. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +471 -258
  56. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +471 -346
  57. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +3 -3
  58. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +25 -4
  59. data/ext/zstdruby/libzstd/deprecated/zbuff.h +9 -8
  60. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
  61. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +1 -1
  62. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  63. data/ext/zstdruby/libzstd/dictBuilder/cover.c +220 -65
  64. data/ext/zstdruby/libzstd/dictBuilder/cover.h +81 -7
  65. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +85 -56
  66. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +43 -19
  67. data/ext/zstdruby/libzstd/dictBuilder/zdict.h +73 -35
  68. data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
  69. data/ext/zstdruby/libzstd/dll/example/build_package.bat +3 -2
  70. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +49 -15
  71. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +142 -117
  72. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +13 -8
  73. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +54 -25
  74. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +13 -8
  75. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +55 -25
  76. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +13 -8
  77. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +62 -29
  78. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +13 -8
  79. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +145 -109
  80. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +14 -9
  81. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +56 -26
  82. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +11 -6
  83. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +65 -28
  84. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +11 -6
  85. data/ext/zstdruby/libzstd/libzstd.pc.in +3 -2
  86. data/ext/zstdruby/libzstd/zstd.h +921 -597
  87. data/lib/zstd-ruby/version.rb +1 -1
  88. data/zstd-ruby.gemspec +2 -2
  89. metadata +19 -14
  90. data/ext/zstdruby/libzstd/dll/libzstd.def +0 -87
@@ -1,11 +1,21 @@
1
+ /*
2
+ * Copyright (c) 2017-2020, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
1
11
  #include <stdio.h> /* fprintf */
2
12
  #include <stdlib.h> /* malloc, free, qsort */
3
13
  #include <string.h> /* memset */
4
14
  #include <time.h> /* clock */
5
- #include "mem.h" /* read */
6
- #include "pool.h"
7
- #include "threading.h"
8
- #include "zstd_internal.h" /* includes zstd.h */
15
+ #include "../common/mem.h" /* read */
16
+ #include "../common/pool.h"
17
+ #include "../common/threading.h"
18
+ #include "../common/zstd_internal.h" /* includes zstd.h */
9
19
  #ifndef ZDICT_STATIC_LINKING_ONLY
10
20
  #define ZDICT_STATIC_LINKING_ONLY
11
21
  #endif
@@ -38,6 +48,44 @@ typedef struct {
38
48
  U32 score;
39
49
  } COVER_segment_t;
40
50
 
51
+ /**
52
+ *Number of epochs and size of each epoch.
53
+ */
54
+ typedef struct {
55
+ U32 num;
56
+ U32 size;
57
+ } COVER_epoch_info_t;
58
+
59
+ /**
60
+ * Struct used for the dictionary selection function.
61
+ */
62
+ typedef struct COVER_dictSelection {
63
+ BYTE* dictContent;
64
+ size_t dictSize;
65
+ size_t totalCompressedSize;
66
+ } COVER_dictSelection_t;
67
+
68
+ /**
69
+ * Computes the number of epochs and the size of each epoch.
70
+ * We will make sure that each epoch gets at least 10 * k bytes.
71
+ *
72
+ * The COVER algorithms divide the data up into epochs of equal size and
73
+ * select one segment from each epoch.
74
+ *
75
+ * @param maxDictSize The maximum allowed dictionary size.
76
+ * @param nbDmers The number of dmers we are training on.
77
+ * @param k The parameter k (segment size).
78
+ * @param passes The target number of passes over the dmer corpus.
79
+ * More passes means a better dictionary.
80
+ */
81
+ COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
82
+ U32 k, U32 passes);
83
+
84
+ /**
85
+ * Warns the user when their corpus is too small.
86
+ */
87
+ void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
88
+
41
89
  /**
42
90
  * Checks total compressed size of a dictionary
43
91
  */
@@ -78,6 +126,32 @@ void COVER_best_start(COVER_best_t *best);
78
126
  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
79
127
  * If this dictionary is the best so far save it and its parameters.
80
128
  */
81
- void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
82
- ZDICT_cover_params_t parameters, void *dict,
83
- size_t dictSize);
129
+ void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
130
+ COVER_dictSelection_t selection);
131
+ /**
132
+ * Error function for COVER_selectDict function. Checks if the return
133
+ * value is an error.
134
+ */
135
+ unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
136
+
137
+ /**
138
+ * Error function for COVER_selectDict function. Returns a struct where
139
+ * return.totalCompressedSize is a ZSTD error.
140
+ */
141
+ COVER_dictSelection_t COVER_dictSelectionError(size_t error);
142
+
143
+ /**
144
+ * Always call after selectDict is called to free up used memory from
145
+ * newly created dictionary.
146
+ */
147
+ void COVER_dictSelectionFree(COVER_dictSelection_t selection);
148
+
149
+ /**
150
+ * Called to finalize the dictionary and select one based on whether or not
151
+ * the shrink-dict flag was enabled. If enabled the dictionary used is the
152
+ * smallest dictionary within a specified regression of the compressed size
153
+ * from the largest dictionary.
154
+ */
155
+ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
156
+ size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
157
+ size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
@@ -1,3 +1,13 @@
1
+ /*
2
+ * Copyright (c) 2018-2020, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
1
11
  /*-*************************************
2
12
  * Dependencies
3
13
  ***************************************/
@@ -6,11 +16,11 @@
6
16
  #include <string.h> /* memset */
7
17
  #include <time.h> /* clock */
8
18
 
9
- #include "mem.h" /* read */
10
- #include "pool.h"
11
- #include "threading.h"
19
+ #include "../common/mem.h" /* read */
20
+ #include "../common/pool.h"
21
+ #include "../common/threading.h"
12
22
  #include "cover.h"
13
- #include "zstd_internal.h" /* includes zstd.h */
23
+ #include "../common/zstd_internal.h" /* includes zstd.h */
14
24
  #ifndef ZDICT_STATIC_LINKING_ONLY
15
25
  #define ZDICT_STATIC_LINKING_ONLY
16
26
  #endif
@@ -132,7 +142,7 @@ typedef struct {
132
142
  *
133
143
  * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
134
144
  *
135
- * Once the dmer with hash value d is in the dictionay we set F(d) = 0.
145
+ * Once the dmer with hash value d is in the dictionary we set F(d) = 0.
136
146
  */
137
147
  static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
138
148
  U32 *freqs, U32 begin, U32 end,
@@ -161,7 +171,7 @@ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
161
171
  /* Get hash value of current dmer */
162
172
  const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
163
173
 
164
- /* Add frequency of this index to score if this is the first occurence of index in active segment */
174
+ /* Add frequency of this index to score if this is the first occurrence of index in active segment */
165
175
  if (segmentFreqs[idx] == 0) {
166
176
  activeSegment.score += freqs[idx];
167
177
  }
@@ -287,10 +297,10 @@ FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)
287
297
  * Prepare a context for dictionary building.
288
298
  * The context is only dependent on the parameter `d` and can be used multiple
289
299
  * times.
290
- * Returns 1 on success or zero on error.
300
+ * Returns 0 on success or error code on error.
291
301
  * The context must be destroyed with `FASTCOVER_ctx_destroy()`.
292
302
  */
293
- static int
303
+ static size_t
294
304
  FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
295
305
  const void* samplesBuffer,
296
306
  const size_t* samplesSizes, unsigned nbSamples,
@@ -310,19 +320,19 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
310
320
  totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
311
321
  DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
312
322
  (unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
313
- return 0;
323
+ return ERROR(srcSize_wrong);
314
324
  }
315
325
 
316
326
  /* Check if there are at least 5 training samples */
317
327
  if (nbTrainSamples < 5) {
318
328
  DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
319
- return 0;
329
+ return ERROR(srcSize_wrong);
320
330
  }
321
331
 
322
332
  /* Check if there's testing sample */
323
333
  if (nbTestSamples < 1) {
324
334
  DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
325
- return 0;
335
+ return ERROR(srcSize_wrong);
326
336
  }
327
337
 
328
338
  /* Zero the context */
@@ -347,7 +357,7 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
347
357
  if (ctx->offsets == NULL) {
348
358
  DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
349
359
  FASTCOVER_ctx_destroy(ctx);
350
- return 0;
360
+ return ERROR(memory_allocation);
351
361
  }
352
362
 
353
363
  /* Fill offsets from the samplesSizes */
@@ -364,13 +374,13 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
364
374
  if (ctx->freqs == NULL) {
365
375
  DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
366
376
  FASTCOVER_ctx_destroy(ctx);
367
- return 0;
377
+ return ERROR(memory_allocation);
368
378
  }
369
379
 
370
380
  DISPLAYLEVEL(2, "Computing frequencies\n");
371
381
  FASTCOVER_computeFrequency(ctx->freqs, ctx);
372
382
 
373
- return 1;
383
+ return 0;
374
384
  }
375
385
 
376
386
 
@@ -386,29 +396,35 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
386
396
  {
387
397
  BYTE *const dict = (BYTE *)dictBuffer;
388
398
  size_t tail = dictBufferCapacity;
389
- /* Divide the data up into epochs of equal size.
390
- * We will select at least one segment from each epoch.
391
- */
392
- const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
393
- const unsigned epochSize = (U32)(ctx->nbDmers / epochs);
399
+ /* Divide the data into epochs. We will select one segment from each epoch. */
400
+ const COVER_epoch_info_t epochs = COVER_computeEpochs(
401
+ (U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
402
+ const size_t maxZeroScoreRun = 10;
403
+ size_t zeroScoreRun = 0;
394
404
  size_t epoch;
395
405
  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
396
- epochs, epochSize);
406
+ (U32)epochs.num, (U32)epochs.size);
397
407
  /* Loop through the epochs until there are no more segments or the dictionary
398
408
  * is full.
399
409
  */
400
- for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
401
- const U32 epochBegin = (U32)(epoch * epochSize);
402
- const U32 epochEnd = epochBegin + epochSize;
410
+ for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
411
+ const U32 epochBegin = (U32)(epoch * epochs.size);
412
+ const U32 epochEnd = epochBegin + epochs.size;
403
413
  size_t segmentSize;
404
414
  /* Select a segment */
405
415
  COVER_segment_t segment = FASTCOVER_selectSegment(
406
416
  ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
407
417
 
408
- /* If the segment covers no dmers, then we are out of content */
418
+ /* If the segment covers no dmers, then we are out of content.
419
+ * There may be new content in other epochs, so continue for some time.
420
+ */
409
421
  if (segment.score == 0) {
410
- break;
422
+ if (++zeroScoreRun >= maxZeroScoreRun) {
423
+ break;
424
+ }
425
+ continue;
411
426
  }
427
+ zeroScoreRun = 0;
412
428
 
413
429
  /* Trim the segment if necessary and if it is too small then we are done */
414
430
  segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
@@ -429,7 +445,6 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
429
445
  return tail;
430
446
  }
431
447
 
432
-
433
448
  /**
434
449
  * Parameters for FASTCOVER_tryParameters().
435
450
  */
@@ -458,6 +473,7 @@ static void FASTCOVER_tryParameters(void *opaque)
458
473
  U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
459
474
  /* Allocate space for hash table, dict, and freqs */
460
475
  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
476
+ COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
461
477
  U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
462
478
  if (!segmentFreqs || !dict || !freqs) {
463
479
  DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
@@ -467,27 +483,24 @@ static void FASTCOVER_tryParameters(void *opaque)
467
483
  memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
468
484
  /* Build the dictionary */
469
485
  { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
470
- parameters, segmentFreqs);
486
+ parameters, segmentFreqs);
487
+
471
488
  const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
472
- dictBufferCapacity = ZDICT_finalizeDictionary(
473
- dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
474
- ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams);
475
- if (ZDICT_isError(dictBufferCapacity)) {
476
- DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
489
+ selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
490
+ ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
491
+ totalCompressedSize);
492
+
493
+ if (COVER_dictSelectionIsError(selection)) {
494
+ DISPLAYLEVEL(1, "Failed to select dictionary\n");
477
495
  goto _cleanup;
478
496
  }
479
497
  }
480
- /* Check total compressed size */
481
- totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
482
- ctx->samples, ctx->offsets,
483
- ctx->nbTrainSamples, ctx->nbSamples,
484
- dict, dictBufferCapacity);
485
498
  _cleanup:
486
- COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
487
- dictBufferCapacity);
499
+ free(dict);
500
+ COVER_best_finish(data->best, parameters, selection);
488
501
  free(data);
489
502
  free(segmentFreqs);
490
- free(dict);
503
+ COVER_dictSelectionFree(selection);
491
504
  free(freqs);
492
505
  }
493
506
 
@@ -502,6 +515,7 @@ FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
502
515
  coverParams->nbThreads = fastCoverParams.nbThreads;
503
516
  coverParams->splitPoint = fastCoverParams.splitPoint;
504
517
  coverParams->zParams = fastCoverParams.zParams;
518
+ coverParams->shrinkDict = fastCoverParams.shrinkDict;
505
519
  }
506
520
 
507
521
 
@@ -518,6 +532,7 @@ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
518
532
  fastCoverParams->f = f;
519
533
  fastCoverParams->accel = accel;
520
534
  fastCoverParams->zParams = coverParams.zParams;
535
+ fastCoverParams->shrinkDict = coverParams.shrinkDict;
521
536
  }
522
537
 
523
538
 
@@ -544,11 +559,11 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
544
559
  if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
545
560
  parameters.accel)) {
546
561
  DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
547
- return ERROR(GENERIC);
562
+ return ERROR(parameter_outOfBound);
548
563
  }
549
564
  if (nbSamples == 0) {
550
565
  DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
551
- return ERROR(GENERIC);
566
+ return ERROR(srcSize_wrong);
552
567
  }
553
568
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
554
569
  DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
@@ -558,12 +573,16 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
558
573
  /* Assign corresponding FASTCOVER_accel_t to accelParams*/
559
574
  accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
560
575
  /* Initialize context */
561
- if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
576
+ {
577
+ size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
562
578
  coverParams.d, parameters.splitPoint, parameters.f,
563
- accelParams)) {
564
- DISPLAYLEVEL(1, "Failed to initialize context\n");
565
- return ERROR(GENERIC);
579
+ accelParams);
580
+ if (ZSTD_isError(initVal)) {
581
+ DISPLAYLEVEL(1, "Failed to initialize context\n");
582
+ return initVal;
583
+ }
566
584
  }
585
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel);
567
586
  /* Build the dictionary */
568
587
  DISPLAYLEVEL(2, "Building dictionary\n");
569
588
  {
@@ -609,6 +628,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
609
628
  (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
610
629
  const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
611
630
  const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
631
+ const unsigned shrinkDict = 0;
612
632
  /* Local variables */
613
633
  const int displayLevel = parameters->zParams.notificationLevel;
614
634
  unsigned iteration = 1;
@@ -616,22 +636,23 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
616
636
  unsigned k;
617
637
  COVER_best_t best;
618
638
  POOL_ctx *pool = NULL;
639
+ int warned = 0;
619
640
  /* Checks */
620
641
  if (splitPoint <= 0 || splitPoint > 1) {
621
642
  LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
622
- return ERROR(GENERIC);
643
+ return ERROR(parameter_outOfBound);
623
644
  }
624
645
  if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
625
646
  LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
626
- return ERROR(GENERIC);
647
+ return ERROR(parameter_outOfBound);
627
648
  }
628
649
  if (kMinK < kMaxD || kMaxK < kMinK) {
629
650
  LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
630
- return ERROR(GENERIC);
651
+ return ERROR(parameter_outOfBound);
631
652
  }
632
653
  if (nbSamples == 0) {
633
654
  LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
634
- return ERROR(GENERIC);
655
+ return ERROR(srcSize_wrong);
635
656
  }
636
657
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
637
658
  LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
@@ -658,11 +679,18 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
658
679
  /* Initialize the context for this value of d */
659
680
  FASTCOVER_ctx_t ctx;
660
681
  LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
661
- if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams)) {
662
- LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
663
- COVER_best_destroy(&best);
664
- POOL_free(pool);
665
- return ERROR(GENERIC);
682
+ {
683
+ size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams);
684
+ if (ZSTD_isError(initVal)) {
685
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
686
+ COVER_best_destroy(&best);
687
+ POOL_free(pool);
688
+ return initVal;
689
+ }
690
+ }
691
+ if (!warned) {
692
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel);
693
+ warned = 1;
666
694
  }
667
695
  /* Loop through k reusing the same context */
668
696
  for (k = kMinK; k <= kMaxK; k += kStepSize) {
@@ -675,7 +703,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
675
703
  COVER_best_destroy(&best);
676
704
  FASTCOVER_ctx_destroy(&ctx);
677
705
  POOL_free(pool);
678
- return ERROR(GENERIC);
706
+ return ERROR(memory_allocation);
679
707
  }
680
708
  data->ctx = &ctx;
681
709
  data->best = &best;
@@ -685,6 +713,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
685
713
  data->parameters.d = d;
686
714
  data->parameters.splitPoint = splitPoint;
687
715
  data->parameters.steps = kSteps;
716
+ data->parameters.shrinkDict = shrinkDict;
688
717
  data->parameters.zParams.notificationLevel = g_displayLevel;
689
718
  /* Check the parameters */
690
719
  if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -37,17 +37,18 @@
37
37
  #include <stdio.h> /* fprintf, fopen, ftello64 */
38
38
  #include <time.h> /* clock */
39
39
 
40
- #include "mem.h" /* read */
41
- #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
40
+ #include "../common/mem.h" /* read */
41
+ #include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
42
42
  #define HUF_STATIC_LINKING_ONLY
43
- #include "huf.h" /* HUF_buildCTable, HUF_writeCTable */
44
- #include "zstd_internal.h" /* includes zstd.h */
45
- #include "xxhash.h" /* XXH64 */
43
+ #include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
44
+ #include "../common/zstd_internal.h" /* includes zstd.h */
45
+ #include "../common/xxhash.h" /* XXH64 */
46
46
  #include "divsufsort.h"
47
47
  #ifndef ZDICT_STATIC_LINKING_ONLY
48
48
  # define ZDICT_STATIC_LINKING_ONLY
49
49
  #endif
50
50
  #include "zdict.h"
51
+ #include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
51
52
 
52
53
 
53
54
  /*-*************************************
@@ -99,6 +100,29 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
99
100
  return MEM_readLE32((const char*)dictBuffer + 4);
100
101
  }
101
102
 
103
+ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
104
+ {
105
+ size_t headerSize;
106
+ if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
107
+
108
+ { unsigned offcodeMaxValue = MaxOff;
109
+ ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
110
+ U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
111
+ short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short));
112
+ if (!bs || !wksp || !offcodeNCount) {
113
+ headerSize = ERROR(memory_allocation);
114
+ } else {
115
+ ZSTD_reset_compressedBlockState(bs);
116
+ headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize);
117
+ }
118
+
119
+ free(bs);
120
+ free(wksp);
121
+ free(offcodeNCount);
122
+ }
123
+
124
+ return headerSize;
125
+ }
102
126
 
103
127
  /*-********************************************************
104
128
  * Dictionary training functions
@@ -571,7 +595,7 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
571
595
  unsigned const prime1 = 2654435761U;
572
596
  unsigned const prime2 = 2246822519U;
573
597
  unsigned acc = prime1;
574
- size_t p=0;;
598
+ size_t p=0;
575
599
  for (p=0; p<length; p++) {
576
600
  acc *= prime2;
577
601
  ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
@@ -588,12 +612,12 @@ typedef struct
588
612
 
589
613
  #define MAXREPOFFSET 1024
590
614
 
591
- static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
615
+ static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
592
616
  unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
593
617
  const void* src, size_t srcSize,
594
618
  U32 notificationLevel)
595
619
  {
596
- size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
620
+ size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
597
621
  size_t cSize;
598
622
 
599
623
  if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
@@ -731,7 +755,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
731
755
 
732
756
  /* collect stats on all samples */
733
757
  for (u=0; u<nbFiles; u++) {
734
- ZDICT_countEStats(esr, params,
758
+ ZDICT_countEStats(esr, &params,
735
759
  countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
736
760
  (const char*)srcBuffer + pos, fileSizes[u],
737
761
  notificationLevel);
@@ -741,7 +765,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
741
765
  /* analyze, build stats, starting with literals */
742
766
  { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
743
767
  if (HUF_isError(maxNbBits)) {
744
- eSize = ERROR(GENERIC);
768
+ eSize = maxNbBits;
745
769
  DISPLAYLEVEL(1, " HUF_buildCTable error \n");
746
770
  goto _cleanup;
747
771
  }
@@ -764,7 +788,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
764
788
  total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
765
789
  errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
766
790
  if (FSE_isError(errorCode)) {
767
- eSize = ERROR(GENERIC);
791
+ eSize = errorCode;
768
792
  DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
769
793
  goto _cleanup;
770
794
  }
@@ -773,7 +797,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
773
797
  total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
774
798
  errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
775
799
  if (FSE_isError(errorCode)) {
776
- eSize = ERROR(GENERIC);
800
+ eSize = errorCode;
777
801
  DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
778
802
  goto _cleanup;
779
803
  }
@@ -782,7 +806,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
782
806
  total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
783
807
  errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
784
808
  if (FSE_isError(errorCode)) {
785
- eSize = ERROR(GENERIC);
809
+ eSize = errorCode;
786
810
  DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
787
811
  goto _cleanup;
788
812
  }
@@ -791,7 +815,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
791
815
  /* write result to buffer */
792
816
  { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
793
817
  if (HUF_isError(hhSize)) {
794
- eSize = ERROR(GENERIC);
818
+ eSize = hhSize;
795
819
  DISPLAYLEVEL(1, "HUF_writeCTable error \n");
796
820
  goto _cleanup;
797
821
  }
@@ -802,7 +826,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
802
826
 
803
827
  { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
804
828
  if (FSE_isError(ohSize)) {
805
- eSize = ERROR(GENERIC);
829
+ eSize = ohSize;
806
830
  DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
807
831
  goto _cleanup;
808
832
  }
@@ -813,7 +837,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
813
837
 
814
838
  { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
815
839
  if (FSE_isError(mhSize)) {
816
- eSize = ERROR(GENERIC);
840
+ eSize = mhSize;
817
841
  DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
818
842
  goto _cleanup;
819
843
  }
@@ -824,7 +848,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
824
848
 
825
849
  { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
826
850
  if (FSE_isError(lhSize)) {
827
- eSize = ERROR(GENERIC);
851
+ eSize = lhSize;
828
852
  DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
829
853
  goto _cleanup;
830
854
  }
@@ -834,7 +858,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
834
858
  }
835
859
 
836
860
  if (maxDstSize<12) {
837
- eSize = ERROR(GENERIC);
861
+ eSize = ERROR(dstSize_tooSmall);
838
862
  DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
839
863
  goto _cleanup;
840
864
  }