zstd-ruby 1.3.8.0 → 1.4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +6 -5
- data/README.md +1 -1
- data/ext/zstdruby/libzstd/Makefile +133 -61
- data/ext/zstdruby/libzstd/README.md +51 -18
- data/ext/zstdruby/libzstd/common/bitstream.h +38 -39
- data/ext/zstdruby/libzstd/common/compiler.h +41 -6
- data/ext/zstdruby/libzstd/common/cpu.h +1 -1
- data/ext/zstdruby/libzstd/common/debug.c +11 -31
- data/ext/zstdruby/libzstd/common/debug.h +11 -31
- data/ext/zstdruby/libzstd/common/entropy_common.c +13 -33
- data/ext/zstdruby/libzstd/common/error_private.c +2 -1
- data/ext/zstdruby/libzstd/common/error_private.h +6 -2
- data/ext/zstdruby/libzstd/common/fse.h +13 -33
- data/ext/zstdruby/libzstd/common/fse_decompress.c +12 -35
- data/ext/zstdruby/libzstd/common/huf.h +15 -33
- data/ext/zstdruby/libzstd/common/mem.h +75 -2
- data/ext/zstdruby/libzstd/common/pool.c +8 -4
- data/ext/zstdruby/libzstd/common/pool.h +2 -2
- data/ext/zstdruby/libzstd/common/threading.c +52 -6
- data/ext/zstdruby/libzstd/common/threading.h +36 -4
- data/ext/zstdruby/libzstd/common/xxhash.c +25 -37
- data/ext/zstdruby/libzstd/common/xxhash.h +11 -31
- data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
- data/ext/zstdruby/libzstd/common/zstd_errors.h +2 -1
- data/ext/zstdruby/libzstd/common/zstd_internal.h +203 -22
- data/ext/zstdruby/libzstd/compress/fse_compress.c +19 -42
- data/ext/zstdruby/libzstd/compress/hist.c +15 -35
- data/ext/zstdruby/libzstd/compress/hist.h +12 -32
- data/ext/zstdruby/libzstd/compress/huf_compress.c +92 -92
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +1460 -1472
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +330 -65
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +158 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +29 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +419 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +54 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +845 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
- data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +525 -0
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +65 -43
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +264 -159
- data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_lazy.c +74 -42
- data/ext/zstdruby/libzstd/compress/zstd_lazy.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +33 -11
- data/ext/zstdruby/libzstd/compress/zstd_ldm.h +7 -2
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +108 -125
- data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +129 -93
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +46 -28
- data/ext/zstdruby/libzstd/decompress/huf_decompress.c +76 -60
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +14 -10
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +2 -2
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +471 -258
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +471 -346
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +3 -3
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +25 -4
- data/ext/zstdruby/libzstd/deprecated/zbuff.h +9 -8
- data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
- data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +1 -1
- data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +220 -65
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +81 -7
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +85 -56
- data/ext/zstdruby/libzstd/dictBuilder/zdict.c +43 -19
- data/ext/zstdruby/libzstd/dictBuilder/zdict.h +73 -35
- data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
- data/ext/zstdruby/libzstd/dll/example/build_package.bat +3 -2
- data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +49 -15
- data/ext/zstdruby/libzstd/legacy/zstd_v01.c +142 -117
- data/ext/zstdruby/libzstd/legacy/zstd_v01.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v02.c +54 -25
- data/ext/zstdruby/libzstd/legacy/zstd_v02.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v03.c +55 -25
- data/ext/zstdruby/libzstd/legacy/zstd_v03.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v04.c +62 -29
- data/ext/zstdruby/libzstd/legacy/zstd_v04.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v05.c +145 -109
- data/ext/zstdruby/libzstd/legacy/zstd_v05.h +14 -9
- data/ext/zstdruby/libzstd/legacy/zstd_v06.c +56 -26
- data/ext/zstdruby/libzstd/legacy/zstd_v06.h +11 -6
- data/ext/zstdruby/libzstd/legacy/zstd_v07.c +65 -28
- data/ext/zstdruby/libzstd/legacy/zstd_v07.h +11 -6
- data/ext/zstdruby/libzstd/libzstd.pc.in +3 -2
- data/ext/zstdruby/libzstd/zstd.h +921 -597
- data/lib/zstd-ruby/version.rb +1 -1
- data/zstd-ruby.gemspec +2 -2
- metadata +19 -14
- data/ext/zstdruby/libzstd/dll/libzstd.def +0 -87
|
@@ -1,11 +1,21 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) 2017-2020, Facebook, Inc.
|
|
3
|
+
* All rights reserved.
|
|
4
|
+
*
|
|
5
|
+
* This source code is licensed under both the BSD-style license (found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
|
7
|
+
* in the COPYING file in the root directory of this source tree).
|
|
8
|
+
* You may select, at your option, one of the above-listed licenses.
|
|
9
|
+
*/
|
|
10
|
+
|
|
1
11
|
#include <stdio.h> /* fprintf */
|
|
2
12
|
#include <stdlib.h> /* malloc, free, qsort */
|
|
3
13
|
#include <string.h> /* memset */
|
|
4
14
|
#include <time.h> /* clock */
|
|
5
|
-
#include "mem.h" /* read */
|
|
6
|
-
#include "pool.h"
|
|
7
|
-
#include "threading.h"
|
|
8
|
-
#include "zstd_internal.h" /* includes zstd.h */
|
|
15
|
+
#include "../common/mem.h" /* read */
|
|
16
|
+
#include "../common/pool.h"
|
|
17
|
+
#include "../common/threading.h"
|
|
18
|
+
#include "../common/zstd_internal.h" /* includes zstd.h */
|
|
9
19
|
#ifndef ZDICT_STATIC_LINKING_ONLY
|
|
10
20
|
#define ZDICT_STATIC_LINKING_ONLY
|
|
11
21
|
#endif
|
|
@@ -38,6 +48,44 @@ typedef struct {
|
|
|
38
48
|
U32 score;
|
|
39
49
|
} COVER_segment_t;
|
|
40
50
|
|
|
51
|
+
/**
|
|
52
|
+
* Number of epochs and size of each epoch.
|
|
53
|
+
*/
|
|
54
|
+
typedef struct {
|
|
55
|
+
U32 num;
|
|
56
|
+
U32 size;
|
|
57
|
+
} COVER_epoch_info_t;
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* Struct used for the dictionary selection function.
|
|
61
|
+
*/
|
|
62
|
+
typedef struct COVER_dictSelection {
|
|
63
|
+
BYTE* dictContent;
|
|
64
|
+
size_t dictSize;
|
|
65
|
+
size_t totalCompressedSize;
|
|
66
|
+
} COVER_dictSelection_t;
|
|
67
|
+
|
|
68
|
+
/**
|
|
69
|
+
* Computes the number of epochs and the size of each epoch.
|
|
70
|
+
* We will make sure that each epoch gets at least 10 * k bytes.
|
|
71
|
+
*
|
|
72
|
+
* The COVER algorithms divide the data up into epochs of equal size and
|
|
73
|
+
* select one segment from each epoch.
|
|
74
|
+
*
|
|
75
|
+
* @param maxDictSize The maximum allowed dictionary size.
|
|
76
|
+
* @param nbDmers The number of dmers we are training on.
|
|
77
|
+
* @param k The parameter k (segment size).
|
|
78
|
+
* @param passes The target number of passes over the dmer corpus.
|
|
79
|
+
* More passes means a better dictionary.
|
|
80
|
+
*/
|
|
81
|
+
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
|
|
82
|
+
U32 k, U32 passes);
|
|
83
|
+
|
|
84
|
+
/**
|
|
85
|
+
* Warns the user when their corpus is too small.
|
|
86
|
+
*/
|
|
87
|
+
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
|
|
88
|
+
|
|
41
89
|
/**
|
|
42
90
|
* Checks total compressed size of a dictionary
|
|
43
91
|
*/
|
|
@@ -78,6 +126,32 @@ void COVER_best_start(COVER_best_t *best);
|
|
|
78
126
|
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
|
|
79
127
|
* If this dictionary is the best so far save it and its parameters.
|
|
80
128
|
*/
|
|
81
|
-
void COVER_best_finish(COVER_best_t *best,
|
|
82
|
-
|
|
83
|
-
|
|
129
|
+
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
|
|
130
|
+
COVER_dictSelection_t selection);
|
|
131
|
+
/**
|
|
132
|
+
* Error function for COVER_selectDict function. Checks if the return
|
|
133
|
+
* value is an error.
|
|
134
|
+
*/
|
|
135
|
+
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
|
|
136
|
+
|
|
137
|
+
/**
|
|
138
|
+
* Error function for COVER_selectDict function. Returns a struct where
|
|
139
|
+
* return.totalCompressedSize is a ZSTD error.
|
|
140
|
+
*/
|
|
141
|
+
COVER_dictSelection_t COVER_dictSelectionError(size_t error);
|
|
142
|
+
|
|
143
|
+
/**
|
|
144
|
+
* Always call after selectDict is called to free up used memory from
|
|
145
|
+
* newly created dictionary.
|
|
146
|
+
*/
|
|
147
|
+
void COVER_dictSelectionFree(COVER_dictSelection_t selection);
|
|
148
|
+
|
|
149
|
+
/**
|
|
150
|
+
* Called to finalize the dictionary and select one based on whether or not
|
|
151
|
+
* the shrink-dict flag was enabled. If enabled the dictionary used is the
|
|
152
|
+
* smallest dictionary within a specified regression of the compressed size
|
|
153
|
+
* from the largest dictionary.
|
|
154
|
+
*/
|
|
155
|
+
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
|
|
156
|
+
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
|
|
157
|
+
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
|
|
@@ -1,3 +1,13 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) 2018-2020, Facebook, Inc.
|
|
3
|
+
* All rights reserved.
|
|
4
|
+
*
|
|
5
|
+
* This source code is licensed under both the BSD-style license (found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
|
7
|
+
* in the COPYING file in the root directory of this source tree).
|
|
8
|
+
* You may select, at your option, one of the above-listed licenses.
|
|
9
|
+
*/
|
|
10
|
+
|
|
1
11
|
/*-*************************************
|
|
2
12
|
* Dependencies
|
|
3
13
|
***************************************/
|
|
@@ -6,11 +16,11 @@
|
|
|
6
16
|
#include <string.h> /* memset */
|
|
7
17
|
#include <time.h> /* clock */
|
|
8
18
|
|
|
9
|
-
#include "mem.h" /* read */
|
|
10
|
-
#include "pool.h"
|
|
11
|
-
#include "threading.h"
|
|
19
|
+
#include "../common/mem.h" /* read */
|
|
20
|
+
#include "../common/pool.h"
|
|
21
|
+
#include "../common/threading.h"
|
|
12
22
|
#include "cover.h"
|
|
13
|
-
#include "zstd_internal.h" /* includes zstd.h */
|
|
23
|
+
#include "../common/zstd_internal.h" /* includes zstd.h */
|
|
14
24
|
#ifndef ZDICT_STATIC_LINKING_ONLY
|
|
15
25
|
#define ZDICT_STATIC_LINKING_ONLY
|
|
16
26
|
#endif
|
|
@@ -132,7 +142,7 @@ typedef struct {
|
|
|
132
142
|
*
|
|
133
143
|
* Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
|
|
134
144
|
*
|
|
135
|
-
* Once the dmer with hash value d is in the
|
|
145
|
+
* Once the dmer with hash value d is in the dictionary we set F(d) = 0.
|
|
136
146
|
*/
|
|
137
147
|
static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
|
|
138
148
|
U32 *freqs, U32 begin, U32 end,
|
|
@@ -161,7 +171,7 @@ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
|
|
|
161
171
|
/* Get hash value of current dmer */
|
|
162
172
|
const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
|
|
163
173
|
|
|
164
|
-
/* Add frequency of this index to score if this is the first
|
|
174
|
+
/* Add frequency of this index to score if this is the first occurrence of index in active segment */
|
|
165
175
|
if (segmentFreqs[idx] == 0) {
|
|
166
176
|
activeSegment.score += freqs[idx];
|
|
167
177
|
}
|
|
@@ -287,10 +297,10 @@ FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)
|
|
|
287
297
|
* Prepare a context for dictionary building.
|
|
288
298
|
* The context is only dependent on the parameter `d` and can be used multiple
|
|
289
299
|
* times.
|
|
290
|
-
* Returns
|
|
300
|
+
* Returns 0 on success or error code on error.
|
|
291
301
|
* The context must be destroyed with `FASTCOVER_ctx_destroy()`.
|
|
292
302
|
*/
|
|
293
|
-
static
|
|
303
|
+
static size_t
|
|
294
304
|
FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
|
295
305
|
const void* samplesBuffer,
|
|
296
306
|
const size_t* samplesSizes, unsigned nbSamples,
|
|
@@ -310,19 +320,19 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
|
|
310
320
|
totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
|
|
311
321
|
DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
|
|
312
322
|
(unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
|
|
313
|
-
return
|
|
323
|
+
return ERROR(srcSize_wrong);
|
|
314
324
|
}
|
|
315
325
|
|
|
316
326
|
/* Check if there are at least 5 training samples */
|
|
317
327
|
if (nbTrainSamples < 5) {
|
|
318
328
|
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
|
|
319
|
-
return
|
|
329
|
+
return ERROR(srcSize_wrong);
|
|
320
330
|
}
|
|
321
331
|
|
|
322
332
|
/* Check if there's testing sample */
|
|
323
333
|
if (nbTestSamples < 1) {
|
|
324
334
|
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
|
|
325
|
-
return
|
|
335
|
+
return ERROR(srcSize_wrong);
|
|
326
336
|
}
|
|
327
337
|
|
|
328
338
|
/* Zero the context */
|
|
@@ -347,7 +357,7 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
|
|
347
357
|
if (ctx->offsets == NULL) {
|
|
348
358
|
DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
|
|
349
359
|
FASTCOVER_ctx_destroy(ctx);
|
|
350
|
-
return
|
|
360
|
+
return ERROR(memory_allocation);
|
|
351
361
|
}
|
|
352
362
|
|
|
353
363
|
/* Fill offsets from the samplesSizes */
|
|
@@ -364,13 +374,13 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
|
|
364
374
|
if (ctx->freqs == NULL) {
|
|
365
375
|
DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
|
|
366
376
|
FASTCOVER_ctx_destroy(ctx);
|
|
367
|
-
return
|
|
377
|
+
return ERROR(memory_allocation);
|
|
368
378
|
}
|
|
369
379
|
|
|
370
380
|
DISPLAYLEVEL(2, "Computing frequencies\n");
|
|
371
381
|
FASTCOVER_computeFrequency(ctx->freqs, ctx);
|
|
372
382
|
|
|
373
|
-
return
|
|
383
|
+
return 0;
|
|
374
384
|
}
|
|
375
385
|
|
|
376
386
|
|
|
@@ -386,29 +396,35 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
|
|
|
386
396
|
{
|
|
387
397
|
BYTE *const dict = (BYTE *)dictBuffer;
|
|
388
398
|
size_t tail = dictBufferCapacity;
|
|
389
|
-
/* Divide the data
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
const
|
|
393
|
-
|
|
399
|
+
/* Divide the data into epochs. We will select one segment from each epoch. */
|
|
400
|
+
const COVER_epoch_info_t epochs = COVER_computeEpochs(
|
|
401
|
+
(U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
|
|
402
|
+
const size_t maxZeroScoreRun = 10;
|
|
403
|
+
size_t zeroScoreRun = 0;
|
|
394
404
|
size_t epoch;
|
|
395
405
|
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
|
|
396
|
-
epochs,
|
|
406
|
+
(U32)epochs.num, (U32)epochs.size);
|
|
397
407
|
/* Loop through the epochs until there are no more segments or the dictionary
|
|
398
408
|
* is full.
|
|
399
409
|
*/
|
|
400
|
-
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
|
|
401
|
-
const U32 epochBegin = (U32)(epoch *
|
|
402
|
-
const U32 epochEnd = epochBegin +
|
|
410
|
+
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
|
|
411
|
+
const U32 epochBegin = (U32)(epoch * epochs.size);
|
|
412
|
+
const U32 epochEnd = epochBegin + epochs.size;
|
|
403
413
|
size_t segmentSize;
|
|
404
414
|
/* Select a segment */
|
|
405
415
|
COVER_segment_t segment = FASTCOVER_selectSegment(
|
|
406
416
|
ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
|
|
407
417
|
|
|
408
|
-
/* If the segment covers no dmers, then we are out of content
|
|
418
|
+
/* If the segment covers no dmers, then we are out of content.
|
|
419
|
+
* There may be new content in other epochs, so continue for some time.
|
|
420
|
+
*/
|
|
409
421
|
if (segment.score == 0) {
|
|
410
|
-
|
|
422
|
+
if (++zeroScoreRun >= maxZeroScoreRun) {
|
|
423
|
+
break;
|
|
424
|
+
}
|
|
425
|
+
continue;
|
|
411
426
|
}
|
|
427
|
+
zeroScoreRun = 0;
|
|
412
428
|
|
|
413
429
|
/* Trim the segment if necessary and if it is too small then we are done */
|
|
414
430
|
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
|
|
@@ -429,7 +445,6 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
|
|
|
429
445
|
return tail;
|
|
430
446
|
}
|
|
431
447
|
|
|
432
|
-
|
|
433
448
|
/**
|
|
434
449
|
* Parameters for FASTCOVER_tryParameters().
|
|
435
450
|
*/
|
|
@@ -458,6 +473,7 @@ static void FASTCOVER_tryParameters(void *opaque)
|
|
|
458
473
|
U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
|
|
459
474
|
/* Allocate space for hash table, dict, and freqs */
|
|
460
475
|
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
|
|
476
|
+
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
|
|
461
477
|
U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
|
|
462
478
|
if (!segmentFreqs || !dict || !freqs) {
|
|
463
479
|
DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
|
|
@@ -467,27 +483,24 @@ static void FASTCOVER_tryParameters(void *opaque)
|
|
|
467
483
|
memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
|
|
468
484
|
/* Build the dictionary */
|
|
469
485
|
{ const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
|
|
470
|
-
|
|
486
|
+
parameters, segmentFreqs);
|
|
487
|
+
|
|
471
488
|
const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
489
|
+
selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
|
|
490
|
+
ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
|
|
491
|
+
totalCompressedSize);
|
|
492
|
+
|
|
493
|
+
if (COVER_dictSelectionIsError(selection)) {
|
|
494
|
+
DISPLAYLEVEL(1, "Failed to select dictionary\n");
|
|
477
495
|
goto _cleanup;
|
|
478
496
|
}
|
|
479
497
|
}
|
|
480
|
-
/* Check total compressed size */
|
|
481
|
-
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
|
|
482
|
-
ctx->samples, ctx->offsets,
|
|
483
|
-
ctx->nbTrainSamples, ctx->nbSamples,
|
|
484
|
-
dict, dictBufferCapacity);
|
|
485
498
|
_cleanup:
|
|
486
|
-
|
|
487
|
-
|
|
499
|
+
free(dict);
|
|
500
|
+
COVER_best_finish(data->best, parameters, selection);
|
|
488
501
|
free(data);
|
|
489
502
|
free(segmentFreqs);
|
|
490
|
-
|
|
503
|
+
COVER_dictSelectionFree(selection);
|
|
491
504
|
free(freqs);
|
|
492
505
|
}
|
|
493
506
|
|
|
@@ -502,6 +515,7 @@ FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
|
|
|
502
515
|
coverParams->nbThreads = fastCoverParams.nbThreads;
|
|
503
516
|
coverParams->splitPoint = fastCoverParams.splitPoint;
|
|
504
517
|
coverParams->zParams = fastCoverParams.zParams;
|
|
518
|
+
coverParams->shrinkDict = fastCoverParams.shrinkDict;
|
|
505
519
|
}
|
|
506
520
|
|
|
507
521
|
|
|
@@ -518,6 +532,7 @@ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
|
|
|
518
532
|
fastCoverParams->f = f;
|
|
519
533
|
fastCoverParams->accel = accel;
|
|
520
534
|
fastCoverParams->zParams = coverParams.zParams;
|
|
535
|
+
fastCoverParams->shrinkDict = coverParams.shrinkDict;
|
|
521
536
|
}
|
|
522
537
|
|
|
523
538
|
|
|
@@ -544,11 +559,11 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
|
|
|
544
559
|
if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
|
|
545
560
|
parameters.accel)) {
|
|
546
561
|
DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
|
|
547
|
-
return ERROR(
|
|
562
|
+
return ERROR(parameter_outOfBound);
|
|
548
563
|
}
|
|
549
564
|
if (nbSamples == 0) {
|
|
550
565
|
DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
|
|
551
|
-
return ERROR(
|
|
566
|
+
return ERROR(srcSize_wrong);
|
|
552
567
|
}
|
|
553
568
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
|
554
569
|
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
|
@@ -558,12 +573,16 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
|
|
|
558
573
|
/* Assign corresponding FASTCOVER_accel_t to accelParams*/
|
|
559
574
|
accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
|
|
560
575
|
/* Initialize context */
|
|
561
|
-
|
|
576
|
+
{
|
|
577
|
+
size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
|
|
562
578
|
coverParams.d, parameters.splitPoint, parameters.f,
|
|
563
|
-
accelParams)
|
|
564
|
-
|
|
565
|
-
|
|
579
|
+
accelParams);
|
|
580
|
+
if (ZSTD_isError(initVal)) {
|
|
581
|
+
DISPLAYLEVEL(1, "Failed to initialize context\n");
|
|
582
|
+
return initVal;
|
|
583
|
+
}
|
|
566
584
|
}
|
|
585
|
+
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel);
|
|
567
586
|
/* Build the dictionary */
|
|
568
587
|
DISPLAYLEVEL(2, "Building dictionary\n");
|
|
569
588
|
{
|
|
@@ -609,6 +628,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
|
609
628
|
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
|
|
610
629
|
const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
|
|
611
630
|
const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
|
|
631
|
+
const unsigned shrinkDict = 0;
|
|
612
632
|
/* Local variables */
|
|
613
633
|
const int displayLevel = parameters->zParams.notificationLevel;
|
|
614
634
|
unsigned iteration = 1;
|
|
@@ -616,22 +636,23 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
|
616
636
|
unsigned k;
|
|
617
637
|
COVER_best_t best;
|
|
618
638
|
POOL_ctx *pool = NULL;
|
|
639
|
+
int warned = 0;
|
|
619
640
|
/* Checks */
|
|
620
641
|
if (splitPoint <= 0 || splitPoint > 1) {
|
|
621
642
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
|
|
622
|
-
return ERROR(
|
|
643
|
+
return ERROR(parameter_outOfBound);
|
|
623
644
|
}
|
|
624
645
|
if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
|
|
625
646
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
|
|
626
|
-
return ERROR(
|
|
647
|
+
return ERROR(parameter_outOfBound);
|
|
627
648
|
}
|
|
628
649
|
if (kMinK < kMaxD || kMaxK < kMinK) {
|
|
629
650
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
|
|
630
|
-
return ERROR(
|
|
651
|
+
return ERROR(parameter_outOfBound);
|
|
631
652
|
}
|
|
632
653
|
if (nbSamples == 0) {
|
|
633
654
|
LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
|
|
634
|
-
return ERROR(
|
|
655
|
+
return ERROR(srcSize_wrong);
|
|
635
656
|
}
|
|
636
657
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
|
637
658
|
LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
|
|
@@ -658,11 +679,18 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
|
658
679
|
/* Initialize the context for this value of d */
|
|
659
680
|
FASTCOVER_ctx_t ctx;
|
|
660
681
|
LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
682
|
+
{
|
|
683
|
+
size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams);
|
|
684
|
+
if (ZSTD_isError(initVal)) {
|
|
685
|
+
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
|
|
686
|
+
COVER_best_destroy(&best);
|
|
687
|
+
POOL_free(pool);
|
|
688
|
+
return initVal;
|
|
689
|
+
}
|
|
690
|
+
}
|
|
691
|
+
if (!warned) {
|
|
692
|
+
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel);
|
|
693
|
+
warned = 1;
|
|
666
694
|
}
|
|
667
695
|
/* Loop through k reusing the same context */
|
|
668
696
|
for (k = kMinK; k <= kMaxK; k += kStepSize) {
|
|
@@ -675,7 +703,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
|
675
703
|
COVER_best_destroy(&best);
|
|
676
704
|
FASTCOVER_ctx_destroy(&ctx);
|
|
677
705
|
POOL_free(pool);
|
|
678
|
-
return ERROR(
|
|
706
|
+
return ERROR(memory_allocation);
|
|
679
707
|
}
|
|
680
708
|
data->ctx = &ctx;
|
|
681
709
|
data->best = &best;
|
|
@@ -685,6 +713,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
|
685
713
|
data->parameters.d = d;
|
|
686
714
|
data->parameters.splitPoint = splitPoint;
|
|
687
715
|
data->parameters.steps = kSteps;
|
|
716
|
+
data->parameters.shrinkDict = shrinkDict;
|
|
688
717
|
data->parameters.zParams.notificationLevel = g_displayLevel;
|
|
689
718
|
/* Check the parameters */
|
|
690
719
|
if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/*
|
|
2
|
-
* Copyright (c) 2016-
|
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
|
3
3
|
* All rights reserved.
|
|
4
4
|
*
|
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -37,17 +37,18 @@
|
|
|
37
37
|
#include <stdio.h> /* fprintf, fopen, ftello64 */
|
|
38
38
|
#include <time.h> /* clock */
|
|
39
39
|
|
|
40
|
-
#include "mem.h" /* read */
|
|
41
|
-
#include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
|
|
40
|
+
#include "../common/mem.h" /* read */
|
|
41
|
+
#include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
|
|
42
42
|
#define HUF_STATIC_LINKING_ONLY
|
|
43
|
-
#include "huf.h" /* HUF_buildCTable, HUF_writeCTable */
|
|
44
|
-
#include "zstd_internal.h" /* includes zstd.h */
|
|
45
|
-
#include "xxhash.h" /* XXH64 */
|
|
43
|
+
#include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
|
|
44
|
+
#include "../common/zstd_internal.h" /* includes zstd.h */
|
|
45
|
+
#include "../common/xxhash.h" /* XXH64 */
|
|
46
46
|
#include "divsufsort.h"
|
|
47
47
|
#ifndef ZDICT_STATIC_LINKING_ONLY
|
|
48
48
|
# define ZDICT_STATIC_LINKING_ONLY
|
|
49
49
|
#endif
|
|
50
50
|
#include "zdict.h"
|
|
51
|
+
#include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
|
|
51
52
|
|
|
52
53
|
|
|
53
54
|
/*-*************************************
|
|
@@ -99,6 +100,29 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
|
|
|
99
100
|
return MEM_readLE32((const char*)dictBuffer + 4);
|
|
100
101
|
}
|
|
101
102
|
|
|
103
|
+
size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
|
|
104
|
+
{
|
|
105
|
+
size_t headerSize;
|
|
106
|
+
if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
|
|
107
|
+
|
|
108
|
+
{ unsigned offcodeMaxValue = MaxOff;
|
|
109
|
+
ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
|
|
110
|
+
U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
|
|
111
|
+
short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short));
|
|
112
|
+
if (!bs || !wksp || !offcodeNCount) {
|
|
113
|
+
headerSize = ERROR(memory_allocation);
|
|
114
|
+
} else {
|
|
115
|
+
ZSTD_reset_compressedBlockState(bs);
|
|
116
|
+
headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize);
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
free(bs);
|
|
120
|
+
free(wksp);
|
|
121
|
+
free(offcodeNCount);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
return headerSize;
|
|
125
|
+
}
|
|
102
126
|
|
|
103
127
|
/*-********************************************************
|
|
104
128
|
* Dictionary training functions
|
|
@@ -571,7 +595,7 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
|
|
|
571
595
|
unsigned const prime1 = 2654435761U;
|
|
572
596
|
unsigned const prime2 = 2246822519U;
|
|
573
597
|
unsigned acc = prime1;
|
|
574
|
-
size_t p=0
|
|
598
|
+
size_t p=0;
|
|
575
599
|
for (p=0; p<length; p++) {
|
|
576
600
|
acc *= prime2;
|
|
577
601
|
((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
|
|
@@ -588,12 +612,12 @@ typedef struct
|
|
|
588
612
|
|
|
589
613
|
#define MAXREPOFFSET 1024
|
|
590
614
|
|
|
591
|
-
static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
|
615
|
+
static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
|
|
592
616
|
unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
|
|
593
617
|
const void* src, size_t srcSize,
|
|
594
618
|
U32 notificationLevel)
|
|
595
619
|
{
|
|
596
|
-
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params
|
|
620
|
+
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
|
|
597
621
|
size_t cSize;
|
|
598
622
|
|
|
599
623
|
if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
|
|
@@ -731,7 +755,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
731
755
|
|
|
732
756
|
/* collect stats on all samples */
|
|
733
757
|
for (u=0; u<nbFiles; u++) {
|
|
734
|
-
ZDICT_countEStats(esr, params,
|
|
758
|
+
ZDICT_countEStats(esr, &params,
|
|
735
759
|
countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
|
|
736
760
|
(const char*)srcBuffer + pos, fileSizes[u],
|
|
737
761
|
notificationLevel);
|
|
@@ -741,7 +765,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
741
765
|
/* analyze, build stats, starting with literals */
|
|
742
766
|
{ size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
|
|
743
767
|
if (HUF_isError(maxNbBits)) {
|
|
744
|
-
eSize =
|
|
768
|
+
eSize = maxNbBits;
|
|
745
769
|
DISPLAYLEVEL(1, " HUF_buildCTable error \n");
|
|
746
770
|
goto _cleanup;
|
|
747
771
|
}
|
|
@@ -764,7 +788,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
764
788
|
total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
|
|
765
789
|
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
|
|
766
790
|
if (FSE_isError(errorCode)) {
|
|
767
|
-
eSize =
|
|
791
|
+
eSize = errorCode;
|
|
768
792
|
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
|
|
769
793
|
goto _cleanup;
|
|
770
794
|
}
|
|
@@ -773,7 +797,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
773
797
|
total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
|
|
774
798
|
errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
|
|
775
799
|
if (FSE_isError(errorCode)) {
|
|
776
|
-
eSize =
|
|
800
|
+
eSize = errorCode;
|
|
777
801
|
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
|
|
778
802
|
goto _cleanup;
|
|
779
803
|
}
|
|
@@ -782,7 +806,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
782
806
|
total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
|
|
783
807
|
errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
|
|
784
808
|
if (FSE_isError(errorCode)) {
|
|
785
|
-
eSize =
|
|
809
|
+
eSize = errorCode;
|
|
786
810
|
DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
|
|
787
811
|
goto _cleanup;
|
|
788
812
|
}
|
|
@@ -791,7 +815,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
791
815
|
/* write result to buffer */
|
|
792
816
|
{ size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
|
|
793
817
|
if (HUF_isError(hhSize)) {
|
|
794
|
-
eSize =
|
|
818
|
+
eSize = hhSize;
|
|
795
819
|
DISPLAYLEVEL(1, "HUF_writeCTable error \n");
|
|
796
820
|
goto _cleanup;
|
|
797
821
|
}
|
|
@@ -802,7 +826,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
802
826
|
|
|
803
827
|
{ size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
|
|
804
828
|
if (FSE_isError(ohSize)) {
|
|
805
|
-
eSize =
|
|
829
|
+
eSize = ohSize;
|
|
806
830
|
DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
|
|
807
831
|
goto _cleanup;
|
|
808
832
|
}
|
|
@@ -813,7 +837,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
813
837
|
|
|
814
838
|
{ size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
|
|
815
839
|
if (FSE_isError(mhSize)) {
|
|
816
|
-
eSize =
|
|
840
|
+
eSize = mhSize;
|
|
817
841
|
DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
|
|
818
842
|
goto _cleanup;
|
|
819
843
|
}
|
|
@@ -824,7 +848,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
824
848
|
|
|
825
849
|
{ size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
|
|
826
850
|
if (FSE_isError(lhSize)) {
|
|
827
|
-
eSize =
|
|
851
|
+
eSize = lhSize;
|
|
828
852
|
DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
|
|
829
853
|
goto _cleanup;
|
|
830
854
|
}
|
|
@@ -834,7 +858,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
834
858
|
}
|
|
835
859
|
|
|
836
860
|
if (maxDstSize<12) {
|
|
837
|
-
eSize = ERROR(
|
|
861
|
+
eSize = ERROR(dstSize_tooSmall);
|
|
838
862
|
DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
|
|
839
863
|
goto _cleanup;
|
|
840
864
|
}
|