zstd-ruby 1.3.8.0 → 1.4.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +6 -5
- data/README.md +1 -1
- data/ext/zstdruby/libzstd/Makefile +133 -61
- data/ext/zstdruby/libzstd/README.md +51 -18
- data/ext/zstdruby/libzstd/common/bitstream.h +38 -39
- data/ext/zstdruby/libzstd/common/compiler.h +41 -6
- data/ext/zstdruby/libzstd/common/cpu.h +1 -1
- data/ext/zstdruby/libzstd/common/debug.c +11 -31
- data/ext/zstdruby/libzstd/common/debug.h +11 -31
- data/ext/zstdruby/libzstd/common/entropy_common.c +13 -33
- data/ext/zstdruby/libzstd/common/error_private.c +2 -1
- data/ext/zstdruby/libzstd/common/error_private.h +6 -2
- data/ext/zstdruby/libzstd/common/fse.h +13 -33
- data/ext/zstdruby/libzstd/common/fse_decompress.c +12 -35
- data/ext/zstdruby/libzstd/common/huf.h +15 -33
- data/ext/zstdruby/libzstd/common/mem.h +75 -2
- data/ext/zstdruby/libzstd/common/pool.c +8 -4
- data/ext/zstdruby/libzstd/common/pool.h +2 -2
- data/ext/zstdruby/libzstd/common/threading.c +52 -6
- data/ext/zstdruby/libzstd/common/threading.h +36 -4
- data/ext/zstdruby/libzstd/common/xxhash.c +25 -37
- data/ext/zstdruby/libzstd/common/xxhash.h +11 -31
- data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
- data/ext/zstdruby/libzstd/common/zstd_errors.h +2 -1
- data/ext/zstdruby/libzstd/common/zstd_internal.h +203 -22
- data/ext/zstdruby/libzstd/compress/fse_compress.c +19 -42
- data/ext/zstdruby/libzstd/compress/hist.c +15 -35
- data/ext/zstdruby/libzstd/compress/hist.h +12 -32
- data/ext/zstdruby/libzstd/compress/huf_compress.c +92 -92
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +1460 -1472
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +330 -65
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +158 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +29 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +419 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +54 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +845 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
- data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +525 -0
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +65 -43
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +264 -159
- data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_lazy.c +74 -42
- data/ext/zstdruby/libzstd/compress/zstd_lazy.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +33 -11
- data/ext/zstdruby/libzstd/compress/zstd_ldm.h +7 -2
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +108 -125
- data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +129 -93
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +46 -28
- data/ext/zstdruby/libzstd/decompress/huf_decompress.c +76 -60
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +14 -10
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +2 -2
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +471 -258
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +471 -346
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +3 -3
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +25 -4
- data/ext/zstdruby/libzstd/deprecated/zbuff.h +9 -8
- data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
- data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +1 -1
- data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +220 -65
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +81 -7
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +85 -56
- data/ext/zstdruby/libzstd/dictBuilder/zdict.c +43 -19
- data/ext/zstdruby/libzstd/dictBuilder/zdict.h +73 -35
- data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
- data/ext/zstdruby/libzstd/dll/example/build_package.bat +3 -2
- data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +49 -15
- data/ext/zstdruby/libzstd/legacy/zstd_v01.c +142 -117
- data/ext/zstdruby/libzstd/legacy/zstd_v01.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v02.c +54 -25
- data/ext/zstdruby/libzstd/legacy/zstd_v02.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v03.c +55 -25
- data/ext/zstdruby/libzstd/legacy/zstd_v03.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v04.c +62 -29
- data/ext/zstdruby/libzstd/legacy/zstd_v04.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v05.c +145 -109
- data/ext/zstdruby/libzstd/legacy/zstd_v05.h +14 -9
- data/ext/zstdruby/libzstd/legacy/zstd_v06.c +56 -26
- data/ext/zstdruby/libzstd/legacy/zstd_v06.h +11 -6
- data/ext/zstdruby/libzstd/legacy/zstd_v07.c +65 -28
- data/ext/zstdruby/libzstd/legacy/zstd_v07.h +11 -6
- data/ext/zstdruby/libzstd/libzstd.pc.in +3 -2
- data/ext/zstdruby/libzstd/zstd.h +921 -597
- data/lib/zstd-ruby/version.rb +1 -1
- data/zstd-ruby.gemspec +2 -2
- metadata +19 -14
- data/ext/zstdruby/libzstd/dll/libzstd.def +0 -87
@@ -1,11 +1,21 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2017-2020, Facebook, Inc.
|
3
|
+
* All rights reserved.
|
4
|
+
*
|
5
|
+
* This source code is licensed under both the BSD-style license (found in the
|
6
|
+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
7
|
+
* in the COPYING file in the root directory of this source tree).
|
8
|
+
* You may select, at your option, one of the above-listed licenses.
|
9
|
+
*/
|
10
|
+
|
1
11
|
#include <stdio.h> /* fprintf */
|
2
12
|
#include <stdlib.h> /* malloc, free, qsort */
|
3
13
|
#include <string.h> /* memset */
|
4
14
|
#include <time.h> /* clock */
|
5
|
-
#include "mem.h" /* read */
|
6
|
-
#include "pool.h"
|
7
|
-
#include "threading.h"
|
8
|
-
#include "zstd_internal.h" /* includes zstd.h */
|
15
|
+
#include "../common/mem.h" /* read */
|
16
|
+
#include "../common/pool.h"
|
17
|
+
#include "../common/threading.h"
|
18
|
+
#include "../common/zstd_internal.h" /* includes zstd.h */
|
9
19
|
#ifndef ZDICT_STATIC_LINKING_ONLY
|
10
20
|
#define ZDICT_STATIC_LINKING_ONLY
|
11
21
|
#endif
|
@@ -38,6 +48,44 @@ typedef struct {
|
|
38
48
|
U32 score;
|
39
49
|
} COVER_segment_t;
|
40
50
|
|
51
|
+
/**
|
52
|
+
* Number of epochs and size of each epoch.
|
53
|
+
*/
|
54
|
+
typedef struct {
|
55
|
+
U32 num;
|
56
|
+
U32 size;
|
57
|
+
} COVER_epoch_info_t;
|
58
|
+
|
59
|
+
/**
|
60
|
+
* Struct used for the dictionary selection function.
|
61
|
+
*/
|
62
|
+
typedef struct COVER_dictSelection {
|
63
|
+
BYTE* dictContent;
|
64
|
+
size_t dictSize;
|
65
|
+
size_t totalCompressedSize;
|
66
|
+
} COVER_dictSelection_t;
|
67
|
+
|
68
|
+
/**
|
69
|
+
* Computes the number of epochs and the size of each epoch.
|
70
|
+
* We will make sure that each epoch gets at least 10 * k bytes.
|
71
|
+
*
|
72
|
+
* The COVER algorithms divide the data up into epochs of equal size and
|
73
|
+
* select one segment from each epoch.
|
74
|
+
*
|
75
|
+
* @param maxDictSize The maximum allowed dictionary size.
|
76
|
+
* @param nbDmers The number of dmers we are training on.
|
77
|
+
* @param k The parameter k (segment size).
|
78
|
+
* @param passes The target number of passes over the dmer corpus.
|
79
|
+
* More passes means a better dictionary.
|
80
|
+
*/
|
81
|
+
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
|
82
|
+
U32 k, U32 passes);
|
83
|
+
|
84
|
+
/**
|
85
|
+
* Warns the user when their corpus is too small.
|
86
|
+
*/
|
87
|
+
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
|
88
|
+
|
41
89
|
/**
|
42
90
|
* Checks total compressed size of a dictionary
|
43
91
|
*/
|
@@ -78,6 +126,32 @@ void COVER_best_start(COVER_best_t *best);
|
|
78
126
|
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
|
79
127
|
* If this dictionary is the best so far save it and its parameters.
|
80
128
|
*/
|
81
|
-
void COVER_best_finish(COVER_best_t *best,
|
82
|
-
|
83
|
-
|
129
|
+
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
|
130
|
+
COVER_dictSelection_t selection);
|
131
|
+
/**
|
132
|
+
* Error function for COVER_selectDict function. Checks if the return
|
133
|
+
* value is an error.
|
134
|
+
*/
|
135
|
+
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
|
136
|
+
|
137
|
+
/**
|
138
|
+
* Error function for COVER_selectDict function. Returns a struct where
|
139
|
+
* return.totalCompressedSize is a ZSTD error.
|
140
|
+
*/
|
141
|
+
COVER_dictSelection_t COVER_dictSelectionError(size_t error);
|
142
|
+
|
143
|
+
/**
|
144
|
+
* Always call after selectDict is called to free up used memory from
|
145
|
+
* newly created dictionary.
|
146
|
+
*/
|
147
|
+
void COVER_dictSelectionFree(COVER_dictSelection_t selection);
|
148
|
+
|
149
|
+
/**
|
150
|
+
* Called to finalize the dictionary and select one based on whether or not
|
151
|
+
* the shrink-dict flag was enabled. If enabled the dictionary used is the
|
152
|
+
* smallest dictionary within a specified regression of the compressed size
|
153
|
+
* from the largest dictionary.
|
154
|
+
*/
|
155
|
+
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
|
156
|
+
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
|
157
|
+
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
|
@@ -1,3 +1,13 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2018-2020, Facebook, Inc.
|
3
|
+
* All rights reserved.
|
4
|
+
*
|
5
|
+
* This source code is licensed under both the BSD-style license (found in the
|
6
|
+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
7
|
+
* in the COPYING file in the root directory of this source tree).
|
8
|
+
* You may select, at your option, one of the above-listed licenses.
|
9
|
+
*/
|
10
|
+
|
1
11
|
/*-*************************************
|
2
12
|
* Dependencies
|
3
13
|
***************************************/
|
@@ -6,11 +16,11 @@
|
|
6
16
|
#include <string.h> /* memset */
|
7
17
|
#include <time.h> /* clock */
|
8
18
|
|
9
|
-
#include "mem.h" /* read */
|
10
|
-
#include "pool.h"
|
11
|
-
#include "threading.h"
|
19
|
+
#include "../common/mem.h" /* read */
|
20
|
+
#include "../common/pool.h"
|
21
|
+
#include "../common/threading.h"
|
12
22
|
#include "cover.h"
|
13
|
-
#include "zstd_internal.h" /* includes zstd.h */
|
23
|
+
#include "../common/zstd_internal.h" /* includes zstd.h */
|
14
24
|
#ifndef ZDICT_STATIC_LINKING_ONLY
|
15
25
|
#define ZDICT_STATIC_LINKING_ONLY
|
16
26
|
#endif
|
@@ -132,7 +142,7 @@ typedef struct {
|
|
132
142
|
*
|
133
143
|
* Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
|
134
144
|
*
|
135
|
-
* Once the dmer with hash value d is in the
|
145
|
+
* Once the dmer with hash value d is in the dictionary we set F(d) = 0.
|
136
146
|
*/
|
137
147
|
static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
|
138
148
|
U32 *freqs, U32 begin, U32 end,
|
@@ -161,7 +171,7 @@ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
|
|
161
171
|
/* Get hash value of current dmer */
|
162
172
|
const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
|
163
173
|
|
164
|
-
/* Add frequency of this index to score if this is the first
|
174
|
+
/* Add frequency of this index to score if this is the first occurrence of index in active segment */
|
165
175
|
if (segmentFreqs[idx] == 0) {
|
166
176
|
activeSegment.score += freqs[idx];
|
167
177
|
}
|
@@ -287,10 +297,10 @@ FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)
|
|
287
297
|
* Prepare a context for dictionary building.
|
288
298
|
* The context is only dependent on the parameter `d` and can be used multiple
|
289
299
|
* times.
|
290
|
-
* Returns
|
300
|
+
* Returns 0 on success or error code on error.
|
291
301
|
* The context must be destroyed with `FASTCOVER_ctx_destroy()`.
|
292
302
|
*/
|
293
|
-
static
|
303
|
+
static size_t
|
294
304
|
FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
295
305
|
const void* samplesBuffer,
|
296
306
|
const size_t* samplesSizes, unsigned nbSamples,
|
@@ -310,19 +320,19 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
|
310
320
|
totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
|
311
321
|
DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
|
312
322
|
(unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
|
313
|
-
return
|
323
|
+
return ERROR(srcSize_wrong);
|
314
324
|
}
|
315
325
|
|
316
326
|
/* Check if there are at least 5 training samples */
|
317
327
|
if (nbTrainSamples < 5) {
|
318
328
|
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
|
319
|
-
return
|
329
|
+
return ERROR(srcSize_wrong);
|
320
330
|
}
|
321
331
|
|
322
332
|
/* Check if there is at least one testing sample */
|
323
333
|
if (nbTestSamples < 1) {
|
324
334
|
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
|
325
|
-
return
|
335
|
+
return ERROR(srcSize_wrong);
|
326
336
|
}
|
327
337
|
|
328
338
|
/* Zero the context */
|
@@ -347,7 +357,7 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
|
347
357
|
if (ctx->offsets == NULL) {
|
348
358
|
DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
|
349
359
|
FASTCOVER_ctx_destroy(ctx);
|
350
|
-
return
|
360
|
+
return ERROR(memory_allocation);
|
351
361
|
}
|
352
362
|
|
353
363
|
/* Fill offsets from the samplesSizes */
|
@@ -364,13 +374,13 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
|
364
374
|
if (ctx->freqs == NULL) {
|
365
375
|
DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
|
366
376
|
FASTCOVER_ctx_destroy(ctx);
|
367
|
-
return
|
377
|
+
return ERROR(memory_allocation);
|
368
378
|
}
|
369
379
|
|
370
380
|
DISPLAYLEVEL(2, "Computing frequencies\n");
|
371
381
|
FASTCOVER_computeFrequency(ctx->freqs, ctx);
|
372
382
|
|
373
|
-
return
|
383
|
+
return 0;
|
374
384
|
}
|
375
385
|
|
376
386
|
|
@@ -386,29 +396,35 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
|
|
386
396
|
{
|
387
397
|
BYTE *const dict = (BYTE *)dictBuffer;
|
388
398
|
size_t tail = dictBufferCapacity;
|
389
|
-
/* Divide the data
|
390
|
-
|
391
|
-
|
392
|
-
const
|
393
|
-
|
399
|
+
/* Divide the data into epochs. We will select one segment from each epoch. */
|
400
|
+
const COVER_epoch_info_t epochs = COVER_computeEpochs(
|
401
|
+
(U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
|
402
|
+
const size_t maxZeroScoreRun = 10;
|
403
|
+
size_t zeroScoreRun = 0;
|
394
404
|
size_t epoch;
|
395
405
|
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
|
396
|
-
epochs,
|
406
|
+
(U32)epochs.num, (U32)epochs.size);
|
397
407
|
/* Loop through the epochs until there are no more segments or the dictionary
|
398
408
|
* is full.
|
399
409
|
*/
|
400
|
-
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
|
401
|
-
const U32 epochBegin = (U32)(epoch *
|
402
|
-
const U32 epochEnd = epochBegin +
|
410
|
+
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
|
411
|
+
const U32 epochBegin = (U32)(epoch * epochs.size);
|
412
|
+
const U32 epochEnd = epochBegin + epochs.size;
|
403
413
|
size_t segmentSize;
|
404
414
|
/* Select a segment */
|
405
415
|
COVER_segment_t segment = FASTCOVER_selectSegment(
|
406
416
|
ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
|
407
417
|
|
408
|
-
/* If the segment covers no dmers, then we are out of content
|
418
|
+
/* If the segment covers no dmers, then we are out of content.
|
419
|
+
* There may be new content in other epochs, so continue for some time.
|
420
|
+
*/
|
409
421
|
if (segment.score == 0) {
|
410
|
-
|
422
|
+
if (++zeroScoreRun >= maxZeroScoreRun) {
|
423
|
+
break;
|
424
|
+
}
|
425
|
+
continue;
|
411
426
|
}
|
427
|
+
zeroScoreRun = 0;
|
412
428
|
|
413
429
|
/* Trim the segment if necessary and if it is too small then we are done */
|
414
430
|
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
|
@@ -429,7 +445,6 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
|
|
429
445
|
return tail;
|
430
446
|
}
|
431
447
|
|
432
|
-
|
433
448
|
/**
|
434
449
|
* Parameters for FASTCOVER_tryParameters().
|
435
450
|
*/
|
@@ -458,6 +473,7 @@ static void FASTCOVER_tryParameters(void *opaque)
|
|
458
473
|
U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
|
459
474
|
/* Allocate space for hash table, dict, and freqs */
|
460
475
|
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
|
476
|
+
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
|
461
477
|
U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
|
462
478
|
if (!segmentFreqs || !dict || !freqs) {
|
463
479
|
DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
|
@@ -467,27 +483,24 @@ static void FASTCOVER_tryParameters(void *opaque)
|
|
467
483
|
memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
|
468
484
|
/* Build the dictionary */
|
469
485
|
{ const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
|
470
|
-
|
486
|
+
parameters, segmentFreqs);
|
487
|
+
|
471
488
|
const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
489
|
+
selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
|
490
|
+
ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
|
491
|
+
totalCompressedSize);
|
492
|
+
|
493
|
+
if (COVER_dictSelectionIsError(selection)) {
|
494
|
+
DISPLAYLEVEL(1, "Failed to select dictionary\n");
|
477
495
|
goto _cleanup;
|
478
496
|
}
|
479
497
|
}
|
480
|
-
/* Check total compressed size */
|
481
|
-
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
|
482
|
-
ctx->samples, ctx->offsets,
|
483
|
-
ctx->nbTrainSamples, ctx->nbSamples,
|
484
|
-
dict, dictBufferCapacity);
|
485
498
|
_cleanup:
|
486
|
-
|
487
|
-
|
499
|
+
free(dict);
|
500
|
+
COVER_best_finish(data->best, parameters, selection);
|
488
501
|
free(data);
|
489
502
|
free(segmentFreqs);
|
490
|
-
|
503
|
+
COVER_dictSelectionFree(selection);
|
491
504
|
free(freqs);
|
492
505
|
}
|
493
506
|
|
@@ -502,6 +515,7 @@ FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
|
|
502
515
|
coverParams->nbThreads = fastCoverParams.nbThreads;
|
503
516
|
coverParams->splitPoint = fastCoverParams.splitPoint;
|
504
517
|
coverParams->zParams = fastCoverParams.zParams;
|
518
|
+
coverParams->shrinkDict = fastCoverParams.shrinkDict;
|
505
519
|
}
|
506
520
|
|
507
521
|
|
@@ -518,6 +532,7 @@ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
|
|
518
532
|
fastCoverParams->f = f;
|
519
533
|
fastCoverParams->accel = accel;
|
520
534
|
fastCoverParams->zParams = coverParams.zParams;
|
535
|
+
fastCoverParams->shrinkDict = coverParams.shrinkDict;
|
521
536
|
}
|
522
537
|
|
523
538
|
|
@@ -544,11 +559,11 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
|
|
544
559
|
if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
|
545
560
|
parameters.accel)) {
|
546
561
|
DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
|
547
|
-
return ERROR(
|
562
|
+
return ERROR(parameter_outOfBound);
|
548
563
|
}
|
549
564
|
if (nbSamples == 0) {
|
550
565
|
DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
|
551
|
-
return ERROR(
|
566
|
+
return ERROR(srcSize_wrong);
|
552
567
|
}
|
553
568
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
554
569
|
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
@@ -558,12 +573,16 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
|
|
558
573
|
/* Assign corresponding FASTCOVER_accel_t to accelParams*/
|
559
574
|
accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
|
560
575
|
/* Initialize context */
|
561
|
-
|
576
|
+
{
|
577
|
+
size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
|
562
578
|
coverParams.d, parameters.splitPoint, parameters.f,
|
563
|
-
accelParams)
|
564
|
-
|
565
|
-
|
579
|
+
accelParams);
|
580
|
+
if (ZSTD_isError(initVal)) {
|
581
|
+
DISPLAYLEVEL(1, "Failed to initialize context\n");
|
582
|
+
return initVal;
|
583
|
+
}
|
566
584
|
}
|
585
|
+
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel);
|
567
586
|
/* Build the dictionary */
|
568
587
|
DISPLAYLEVEL(2, "Building dictionary\n");
|
569
588
|
{
|
@@ -609,6 +628,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
609
628
|
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
|
610
629
|
const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
|
611
630
|
const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
|
631
|
+
const unsigned shrinkDict = 0;
|
612
632
|
/* Local variables */
|
613
633
|
const int displayLevel = parameters->zParams.notificationLevel;
|
614
634
|
unsigned iteration = 1;
|
@@ -616,22 +636,23 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
616
636
|
unsigned k;
|
617
637
|
COVER_best_t best;
|
618
638
|
POOL_ctx *pool = NULL;
|
639
|
+
int warned = 0;
|
619
640
|
/* Checks */
|
620
641
|
if (splitPoint <= 0 || splitPoint > 1) {
|
621
642
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
|
622
|
-
return ERROR(
|
643
|
+
return ERROR(parameter_outOfBound);
|
623
644
|
}
|
624
645
|
if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
|
625
646
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
|
626
|
-
return ERROR(
|
647
|
+
return ERROR(parameter_outOfBound);
|
627
648
|
}
|
628
649
|
if (kMinK < kMaxD || kMaxK < kMinK) {
|
629
650
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
|
630
|
-
return ERROR(
|
651
|
+
return ERROR(parameter_outOfBound);
|
631
652
|
}
|
632
653
|
if (nbSamples == 0) {
|
633
654
|
LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
|
634
|
-
return ERROR(
|
655
|
+
return ERROR(srcSize_wrong);
|
635
656
|
}
|
636
657
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
637
658
|
LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
|
@@ -658,11 +679,18 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
658
679
|
/* Initialize the context for this value of d */
|
659
680
|
FASTCOVER_ctx_t ctx;
|
660
681
|
LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
682
|
+
{
|
683
|
+
size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams);
|
684
|
+
if (ZSTD_isError(initVal)) {
|
685
|
+
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
|
686
|
+
COVER_best_destroy(&best);
|
687
|
+
POOL_free(pool);
|
688
|
+
return initVal;
|
689
|
+
}
|
690
|
+
}
|
691
|
+
if (!warned) {
|
692
|
+
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel);
|
693
|
+
warned = 1;
|
666
694
|
}
|
667
695
|
/* Loop through k reusing the same context */
|
668
696
|
for (k = kMinK; k <= kMaxK; k += kStepSize) {
|
@@ -675,7 +703,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
675
703
|
COVER_best_destroy(&best);
|
676
704
|
FASTCOVER_ctx_destroy(&ctx);
|
677
705
|
POOL_free(pool);
|
678
|
-
return ERROR(
|
706
|
+
return ERROR(memory_allocation);
|
679
707
|
}
|
680
708
|
data->ctx = &ctx;
|
681
709
|
data->best = &best;
|
@@ -685,6 +713,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
685
713
|
data->parameters.d = d;
|
686
714
|
data->parameters.splitPoint = splitPoint;
|
687
715
|
data->parameters.steps = kSteps;
|
716
|
+
data->parameters.shrinkDict = shrinkDict;
|
688
717
|
data->parameters.zParams.notificationLevel = g_displayLevel;
|
689
718
|
/* Check the parameters */
|
690
719
|
if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
|
@@ -1,5 +1,5 @@
|
|
1
1
|
/*
|
2
|
-
* Copyright (c) 2016-
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
3
3
|
* All rights reserved.
|
4
4
|
*
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
@@ -37,17 +37,18 @@
|
|
37
37
|
#include <stdio.h> /* fprintf, fopen, ftello64 */
|
38
38
|
#include <time.h> /* clock */
|
39
39
|
|
40
|
-
#include "mem.h" /* read */
|
41
|
-
#include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
|
40
|
+
#include "../common/mem.h" /* read */
|
41
|
+
#include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
|
42
42
|
#define HUF_STATIC_LINKING_ONLY
|
43
|
-
#include "huf.h" /* HUF_buildCTable, HUF_writeCTable */
|
44
|
-
#include "zstd_internal.h" /* includes zstd.h */
|
45
|
-
#include "xxhash.h" /* XXH64 */
|
43
|
+
#include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
|
44
|
+
#include "../common/zstd_internal.h" /* includes zstd.h */
|
45
|
+
#include "../common/xxhash.h" /* XXH64 */
|
46
46
|
#include "divsufsort.h"
|
47
47
|
#ifndef ZDICT_STATIC_LINKING_ONLY
|
48
48
|
# define ZDICT_STATIC_LINKING_ONLY
|
49
49
|
#endif
|
50
50
|
#include "zdict.h"
|
51
|
+
#include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
|
51
52
|
|
52
53
|
|
53
54
|
/*-*************************************
|
@@ -99,6 +100,29 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
|
|
99
100
|
return MEM_readLE32((const char*)dictBuffer + 4);
|
100
101
|
}
|
101
102
|
|
103
|
+
size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
|
104
|
+
{
|
105
|
+
size_t headerSize;
|
106
|
+
if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
|
107
|
+
|
108
|
+
{ unsigned offcodeMaxValue = MaxOff;
|
109
|
+
ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
|
110
|
+
U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
|
111
|
+
short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short));
|
112
|
+
if (!bs || !wksp || !offcodeNCount) {
|
113
|
+
headerSize = ERROR(memory_allocation);
|
114
|
+
} else {
|
115
|
+
ZSTD_reset_compressedBlockState(bs);
|
116
|
+
headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize);
|
117
|
+
}
|
118
|
+
|
119
|
+
free(bs);
|
120
|
+
free(wksp);
|
121
|
+
free(offcodeNCount);
|
122
|
+
}
|
123
|
+
|
124
|
+
return headerSize;
|
125
|
+
}
|
102
126
|
|
103
127
|
/*-********************************************************
|
104
128
|
* Dictionary training functions
|
@@ -571,7 +595,7 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
|
|
571
595
|
unsigned const prime1 = 2654435761U;
|
572
596
|
unsigned const prime2 = 2246822519U;
|
573
597
|
unsigned acc = prime1;
|
574
|
-
size_t p=0
|
598
|
+
size_t p=0;
|
575
599
|
for (p=0; p<length; p++) {
|
576
600
|
acc *= prime2;
|
577
601
|
((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
|
@@ -588,12 +612,12 @@ typedef struct
|
|
588
612
|
|
589
613
|
#define MAXREPOFFSET 1024
|
590
614
|
|
591
|
-
static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
615
|
+
static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
|
592
616
|
unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
|
593
617
|
const void* src, size_t srcSize,
|
594
618
|
U32 notificationLevel)
|
595
619
|
{
|
596
|
-
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params
|
620
|
+
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
|
597
621
|
size_t cSize;
|
598
622
|
|
599
623
|
if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
|
@@ -731,7 +755,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
731
755
|
|
732
756
|
/* collect stats on all samples */
|
733
757
|
for (u=0; u<nbFiles; u++) {
|
734
|
-
ZDICT_countEStats(esr, params,
|
758
|
+
ZDICT_countEStats(esr, &params,
|
735
759
|
countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
|
736
760
|
(const char*)srcBuffer + pos, fileSizes[u],
|
737
761
|
notificationLevel);
|
@@ -741,7 +765,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
741
765
|
/* analyze, build stats, starting with literals */
|
742
766
|
{ size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
|
743
767
|
if (HUF_isError(maxNbBits)) {
|
744
|
-
eSize =
|
768
|
+
eSize = maxNbBits;
|
745
769
|
DISPLAYLEVEL(1, " HUF_buildCTable error \n");
|
746
770
|
goto _cleanup;
|
747
771
|
}
|
@@ -764,7 +788,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
764
788
|
total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
|
765
789
|
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
|
766
790
|
if (FSE_isError(errorCode)) {
|
767
|
-
eSize =
|
791
|
+
eSize = errorCode;
|
768
792
|
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
|
769
793
|
goto _cleanup;
|
770
794
|
}
|
@@ -773,7 +797,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
773
797
|
total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
|
774
798
|
errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
|
775
799
|
if (FSE_isError(errorCode)) {
|
776
|
-
eSize =
|
800
|
+
eSize = errorCode;
|
777
801
|
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
|
778
802
|
goto _cleanup;
|
779
803
|
}
|
@@ -782,7 +806,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
782
806
|
total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
|
783
807
|
errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
|
784
808
|
if (FSE_isError(errorCode)) {
|
785
|
-
eSize =
|
809
|
+
eSize = errorCode;
|
786
810
|
DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
|
787
811
|
goto _cleanup;
|
788
812
|
}
|
@@ -791,7 +815,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
791
815
|
/* write result to buffer */
|
792
816
|
{ size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
|
793
817
|
if (HUF_isError(hhSize)) {
|
794
|
-
eSize =
|
818
|
+
eSize = hhSize;
|
795
819
|
DISPLAYLEVEL(1, "HUF_writeCTable error \n");
|
796
820
|
goto _cleanup;
|
797
821
|
}
|
@@ -802,7 +826,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
802
826
|
|
803
827
|
{ size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
|
804
828
|
if (FSE_isError(ohSize)) {
|
805
|
-
eSize =
|
829
|
+
eSize = ohSize;
|
806
830
|
DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
|
807
831
|
goto _cleanup;
|
808
832
|
}
|
@@ -813,7 +837,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
813
837
|
|
814
838
|
{ size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
|
815
839
|
if (FSE_isError(mhSize)) {
|
816
|
-
eSize =
|
840
|
+
eSize = mhSize;
|
817
841
|
DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
|
818
842
|
goto _cleanup;
|
819
843
|
}
|
@@ -824,7 +848,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
824
848
|
|
825
849
|
{ size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
|
826
850
|
if (FSE_isError(lhSize)) {
|
827
|
-
eSize =
|
851
|
+
eSize = lhSize;
|
828
852
|
DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
|
829
853
|
goto _cleanup;
|
830
854
|
}
|
@@ -834,7 +858,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
834
858
|
}
|
835
859
|
|
836
860
|
if (maxDstSize<12) {
|
837
|
-
eSize = ERROR(
|
861
|
+
eSize = ERROR(dstSize_tooSmall);
|
838
862
|
DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
|
839
863
|
goto _cleanup;
|
840
864
|
}
|