zstd-ruby 1.3.8.0 → 1.4.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +6 -5
- data/README.md +1 -1
- data/ext/zstdruby/libzstd/Makefile +133 -61
- data/ext/zstdruby/libzstd/README.md +51 -18
- data/ext/zstdruby/libzstd/common/bitstream.h +38 -39
- data/ext/zstdruby/libzstd/common/compiler.h +41 -6
- data/ext/zstdruby/libzstd/common/cpu.h +1 -1
- data/ext/zstdruby/libzstd/common/debug.c +11 -31
- data/ext/zstdruby/libzstd/common/debug.h +11 -31
- data/ext/zstdruby/libzstd/common/entropy_common.c +13 -33
- data/ext/zstdruby/libzstd/common/error_private.c +2 -1
- data/ext/zstdruby/libzstd/common/error_private.h +6 -2
- data/ext/zstdruby/libzstd/common/fse.h +13 -33
- data/ext/zstdruby/libzstd/common/fse_decompress.c +12 -35
- data/ext/zstdruby/libzstd/common/huf.h +15 -33
- data/ext/zstdruby/libzstd/common/mem.h +75 -2
- data/ext/zstdruby/libzstd/common/pool.c +8 -4
- data/ext/zstdruby/libzstd/common/pool.h +2 -2
- data/ext/zstdruby/libzstd/common/threading.c +52 -6
- data/ext/zstdruby/libzstd/common/threading.h +36 -4
- data/ext/zstdruby/libzstd/common/xxhash.c +25 -37
- data/ext/zstdruby/libzstd/common/xxhash.h +11 -31
- data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
- data/ext/zstdruby/libzstd/common/zstd_errors.h +2 -1
- data/ext/zstdruby/libzstd/common/zstd_internal.h +203 -22
- data/ext/zstdruby/libzstd/compress/fse_compress.c +19 -42
- data/ext/zstdruby/libzstd/compress/hist.c +15 -35
- data/ext/zstdruby/libzstd/compress/hist.h +12 -32
- data/ext/zstdruby/libzstd/compress/huf_compress.c +92 -92
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +1460 -1472
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +330 -65
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +158 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +29 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +419 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +54 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +845 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
- data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +525 -0
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +65 -43
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +264 -159
- data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_lazy.c +74 -42
- data/ext/zstdruby/libzstd/compress/zstd_lazy.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +33 -11
- data/ext/zstdruby/libzstd/compress/zstd_ldm.h +7 -2
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +108 -125
- data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +129 -93
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +46 -28
- data/ext/zstdruby/libzstd/decompress/huf_decompress.c +76 -60
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +14 -10
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +2 -2
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +471 -258
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +471 -346
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +3 -3
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +25 -4
- data/ext/zstdruby/libzstd/deprecated/zbuff.h +9 -8
- data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
- data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +1 -1
- data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +220 -65
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +81 -7
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +85 -56
- data/ext/zstdruby/libzstd/dictBuilder/zdict.c +43 -19
- data/ext/zstdruby/libzstd/dictBuilder/zdict.h +73 -35
- data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
- data/ext/zstdruby/libzstd/dll/example/build_package.bat +3 -2
- data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +49 -15
- data/ext/zstdruby/libzstd/legacy/zstd_v01.c +142 -117
- data/ext/zstdruby/libzstd/legacy/zstd_v01.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v02.c +54 -25
- data/ext/zstdruby/libzstd/legacy/zstd_v02.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v03.c +55 -25
- data/ext/zstdruby/libzstd/legacy/zstd_v03.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v04.c +62 -29
- data/ext/zstdruby/libzstd/legacy/zstd_v04.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v05.c +145 -109
- data/ext/zstdruby/libzstd/legacy/zstd_v05.h +14 -9
- data/ext/zstdruby/libzstd/legacy/zstd_v06.c +56 -26
- data/ext/zstdruby/libzstd/legacy/zstd_v06.h +11 -6
- data/ext/zstdruby/libzstd/legacy/zstd_v07.c +65 -28
- data/ext/zstdruby/libzstd/legacy/zstd_v07.h +11 -6
- data/ext/zstdruby/libzstd/libzstd.pc.in +3 -2
- data/ext/zstdruby/libzstd/zstd.h +921 -597
- data/lib/zstd-ruby/version.rb +1 -1
- data/zstd-ruby.gemspec +2 -2
- metadata +19 -14
- data/ext/zstdruby/libzstd/dll/libzstd.def +0 -87
@@ -1,5 +1,5 @@
|
|
1
1
|
/*
|
2
|
-
* Copyright (c) 2016-
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
3
3
|
* All rights reserved.
|
4
4
|
*
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
@@ -16,8 +16,8 @@
|
|
16
16
|
* Dependencies
|
17
17
|
*********************************************************/
|
18
18
|
#include <stddef.h> /* size_t */
|
19
|
-
#include "zstd.h" /* DCtx, and some public functions */
|
20
|
-
#include "zstd_internal.h" /* blockProperties_t, and some public functions */
|
19
|
+
#include "../zstd.h" /* DCtx, and some public functions */
|
20
|
+
#include "../common/zstd_internal.h" /* blockProperties_t, and some public functions */
|
21
21
|
#include "zstd_decompress_internal.h" /* ZSTD_seqSymbol */
|
22
22
|
|
23
23
|
|
@@ -1,5 +1,5 @@
|
|
1
1
|
/*
|
2
|
-
* Copyright (c) 2016-
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
3
3
|
* All rights reserved.
|
4
4
|
*
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
@@ -19,8 +19,8 @@
|
|
19
19
|
/*-*******************************************************
|
20
20
|
* Dependencies
|
21
21
|
*********************************************************/
|
22
|
-
#include "mem.h" /* BYTE, U16, U32 */
|
23
|
-
#include "zstd_internal.h" /* ZSTD_seqSymbol */
|
22
|
+
#include "../common/mem.h" /* BYTE, U16, U32 */
|
23
|
+
#include "../common/zstd_internal.h" /* ZSTD_seqSymbol */
|
24
24
|
|
25
25
|
|
26
26
|
|
@@ -89,6 +89,17 @@ typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
|
|
89
89
|
typedef enum { zdss_init=0, zdss_loadHeader,
|
90
90
|
zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
|
91
91
|
|
92
|
+
typedef enum {
|
93
|
+
ZSTD_use_indefinitely = -1, /* Use the dictionary indefinitely */
|
94
|
+
ZSTD_dont_use = 0, /* Do not use the dictionary (if one exists free it) */
|
95
|
+
ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */
|
96
|
+
} ZSTD_dictUses_e;
|
97
|
+
|
98
|
+
typedef enum {
|
99
|
+
ZSTD_obm_buffered = 0, /* Buffer the output */
|
100
|
+
ZSTD_obm_stable = 1 /* ZSTD_outBuffer is stable */
|
101
|
+
} ZSTD_outBufferMode_e;
|
102
|
+
|
92
103
|
struct ZSTD_DCtx_s
|
93
104
|
{
|
94
105
|
const ZSTD_seqSymbol* LLTptr;
|
@@ -123,6 +134,7 @@ struct ZSTD_DCtx_s
|
|
123
134
|
const ZSTD_DDict* ddict; /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
|
124
135
|
U32 dictID;
|
125
136
|
int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
|
137
|
+
ZSTD_dictUses_e dictUses;
|
126
138
|
|
127
139
|
/* streaming */
|
128
140
|
ZSTD_dStreamStage streamStage;
|
@@ -140,10 +152,19 @@ struct ZSTD_DCtx_s
|
|
140
152
|
U32 legacyVersion;
|
141
153
|
U32 hostageByte;
|
142
154
|
int noForwardProgress;
|
155
|
+
ZSTD_outBufferMode_e outBufferMode;
|
156
|
+
ZSTD_outBuffer expectedOutBuffer;
|
143
157
|
|
144
158
|
/* workspace */
|
145
159
|
BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
|
146
160
|
BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
|
161
|
+
|
162
|
+
size_t oversizedDuration;
|
163
|
+
|
164
|
+
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
165
|
+
void const* dictContentBeginForFuzzing;
|
166
|
+
void const* dictContentEndForFuzzing;
|
167
|
+
#endif
|
147
168
|
}; /* typedef'd to ZSTD_DCtx within "zstd.h" */
|
148
169
|
|
149
170
|
|
@@ -153,7 +174,7 @@ struct ZSTD_DCtx_s
|
|
153
174
|
|
154
175
|
/*! ZSTD_loadDEntropy() :
|
155
176
|
* dict : must point at beginning of a valid zstd dictionary.
|
156
|
-
* @return : size of entropy tables
|
177
|
+
* @return : size of dictionary header (size of magic number + dict ID + entropy tables) */
|
157
178
|
size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
|
158
179
|
const void* const dict, size_t const dictSize);
|
159
180
|
|
@@ -1,5 +1,5 @@
|
|
1
1
|
/*
|
2
|
-
* Copyright (c) 2016-
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
3
3
|
* All rights reserved.
|
4
4
|
*
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
@@ -28,7 +28,7 @@ extern "C" {
|
|
28
28
|
* Dependencies
|
29
29
|
***************************************/
|
30
30
|
#include <stddef.h> /* size_t */
|
31
|
-
#include "zstd.h" /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */
|
31
|
+
#include "../zstd.h" /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */
|
32
32
|
|
33
33
|
|
34
34
|
/* ***************************************************************
|
@@ -36,16 +36,17 @@ extern "C" {
|
|
36
36
|
*****************************************************************/
|
37
37
|
/* Deprecation warnings */
|
38
38
|
/* Should these warnings be a problem,
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
39
|
+
* it is generally possible to disable them,
|
40
|
+
* typically with -Wno-deprecated-declarations for gcc
|
41
|
+
* or _CRT_SECURE_NO_WARNINGS in Visual.
|
42
|
+
* Otherwise, it's also possible to define ZBUFF_DISABLE_DEPRECATE_WARNINGS
|
43
|
+
*/
|
43
44
|
#ifdef ZBUFF_DISABLE_DEPRECATE_WARNINGS
|
44
45
|
# define ZBUFF_DEPRECATED(message) ZSTDLIB_API /* disable deprecation warnings */
|
45
46
|
#else
|
46
47
|
# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
|
47
48
|
# define ZBUFF_DEPRECATED(message) [[deprecated(message)]] ZSTDLIB_API
|
48
|
-
# elif (defined(
|
49
|
+
# elif (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) || defined(__clang__)
|
49
50
|
# define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated(message)))
|
50
51
|
# elif defined(__GNUC__) && (__GNUC__ >= 3)
|
51
52
|
# define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated))
|
@@ -185,7 +186,7 @@ ZBUFF_DEPRECATED("use ZSTD_DStreamOutSize") size_t ZBUFF_recommendedDOutSize(voi
|
|
185
186
|
|
186
187
|
/*--- Dependency ---*/
|
187
188
|
#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters, ZSTD_customMem */
|
188
|
-
#include "zstd.h"
|
189
|
+
#include "../zstd.h"
|
189
190
|
|
190
191
|
|
191
192
|
/*--- Custom memory allocator ---*/
|
@@ -1,5 +1,5 @@
|
|
1
1
|
/*
|
2
|
-
* Copyright (c) 2016-
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
3
3
|
* All rights reserved.
|
4
4
|
*
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
@@ -11,7 +11,7 @@
|
|
11
11
|
/*-*************************************
|
12
12
|
* Dependencies
|
13
13
|
***************************************/
|
14
|
-
#include "error_private.h"
|
14
|
+
#include "../common/error_private.h"
|
15
15
|
#include "zbuff.h"
|
16
16
|
|
17
17
|
/*-****************************************
|
@@ -1,5 +1,5 @@
|
|
1
1
|
/*
|
2
|
-
* Copyright (c) 2016-
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
3
3
|
* All rights reserved.
|
4
4
|
*
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
@@ -26,11 +26,11 @@
|
|
26
26
|
#include <string.h> /* memset */
|
27
27
|
#include <time.h> /* clock */
|
28
28
|
|
29
|
-
#include "mem.h" /* read */
|
30
|
-
#include "pool.h"
|
31
|
-
#include "threading.h"
|
29
|
+
#include "../common/mem.h" /* read */
|
30
|
+
#include "../common/pool.h"
|
31
|
+
#include "../common/threading.h"
|
32
32
|
#include "cover.h"
|
33
|
-
#include "zstd_internal.h" /* includes zstd.h */
|
33
|
+
#include "../common/zstd_internal.h" /* includes zstd.h */
|
34
34
|
#ifndef ZDICT_STATIC_LINKING_ONLY
|
35
35
|
#define ZDICT_STATIC_LINKING_ONLY
|
36
36
|
#endif
|
@@ -391,7 +391,7 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
|
|
391
391
|
*
|
392
392
|
* Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
|
393
393
|
*
|
394
|
-
* Once the dmer d is in the
|
394
|
+
* Once the dmer d is in the dictionary we set F(d) = 0.
|
395
395
|
*/
|
396
396
|
static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
|
397
397
|
COVER_map_t *activeDmers, U32 begin,
|
@@ -435,7 +435,7 @@ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
|
|
435
435
|
U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
|
436
436
|
activeSegment.begin += 1;
|
437
437
|
*delDmerOcc -= 1;
|
438
|
-
/* If this is the last
|
438
|
+
/* If this is the last occurrence of the dmer, subtract its score */
|
439
439
|
if (*delDmerOcc == 0) {
|
440
440
|
COVER_map_remove(activeDmers, delDmer);
|
441
441
|
activeSegment.score -= freqs[delDmer];
|
@@ -526,10 +526,10 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
|
|
526
526
|
* Prepare a context for dictionary building.
|
527
527
|
* The context is only dependent on the parameter `d` and can be used multiple
|
528
528
|
* times.
|
529
|
-
* Returns
|
529
|
+
* Returns 0 on success or error code on error.
|
530
530
|
* The context must be destroyed with `COVER_ctx_destroy()`.
|
531
531
|
*/
|
532
|
-
static
|
532
|
+
static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
533
533
|
const size_t *samplesSizes, unsigned nbSamples,
|
534
534
|
unsigned d, double splitPoint) {
|
535
535
|
const BYTE *const samples = (const BYTE *)samplesBuffer;
|
@@ -544,17 +544,17 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
|
544
544
|
totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
|
545
545
|
DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
|
546
546
|
(unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
|
547
|
-
return
|
547
|
+
return ERROR(srcSize_wrong);
|
548
548
|
}
|
549
549
|
/* Check if there are at least 5 training samples */
|
550
550
|
if (nbTrainSamples < 5) {
|
551
551
|
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
|
552
|
-
return
|
552
|
+
return ERROR(srcSize_wrong);
|
553
553
|
}
|
554
554
|
/* Check if there's testing sample */
|
555
555
|
if (nbTestSamples < 1) {
|
556
556
|
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
|
557
|
-
return
|
557
|
+
return ERROR(srcSize_wrong);
|
558
558
|
}
|
559
559
|
/* Zero the context */
|
560
560
|
memset(ctx, 0, sizeof(*ctx));
|
@@ -577,7 +577,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
|
577
577
|
if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
|
578
578
|
DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
|
579
579
|
COVER_ctx_destroy(ctx);
|
580
|
-
return
|
580
|
+
return ERROR(memory_allocation);
|
581
581
|
}
|
582
582
|
ctx->freqs = NULL;
|
583
583
|
ctx->d = d;
|
@@ -624,7 +624,40 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
|
624
624
|
(ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
|
625
625
|
ctx->freqs = ctx->suffix;
|
626
626
|
ctx->suffix = NULL;
|
627
|
-
return
|
627
|
+
return 0;
|
628
|
+
}
|
629
|
+
|
630
|
+
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
|
631
|
+
{
|
632
|
+
const double ratio = (double)nbDmers / maxDictSize;
|
633
|
+
if (ratio >= 10) {
|
634
|
+
return;
|
635
|
+
}
|
636
|
+
LOCALDISPLAYLEVEL(displayLevel, 1,
|
637
|
+
"WARNING: The maximum dictionary size %u is too large "
|
638
|
+
"compared to the source size %u! "
|
639
|
+
"size(source)/size(dictionary) = %f, but it should be >= "
|
640
|
+
"10! This may lead to a subpar dictionary! We recommend "
|
641
|
+
"training on sources at least 10x, and preferably 100x "
|
642
|
+
"the size of the dictionary! \n", (U32)maxDictSize,
|
643
|
+
(U32)nbDmers, ratio);
|
644
|
+
}
|
645
|
+
|
646
|
+
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
|
647
|
+
U32 nbDmers, U32 k, U32 passes)
|
648
|
+
{
|
649
|
+
const U32 minEpochSize = k * 10;
|
650
|
+
COVER_epoch_info_t epochs;
|
651
|
+
epochs.num = MAX(1, maxDictSize / k / passes);
|
652
|
+
epochs.size = nbDmers / epochs.num;
|
653
|
+
if (epochs.size >= minEpochSize) {
|
654
|
+
assert(epochs.size * epochs.num <= nbDmers);
|
655
|
+
return epochs;
|
656
|
+
}
|
657
|
+
epochs.size = MIN(minEpochSize, nbDmers);
|
658
|
+
epochs.num = nbDmers / epochs.size;
|
659
|
+
assert(epochs.size * epochs.num <= nbDmers);
|
660
|
+
return epochs;
|
628
661
|
}
|
629
662
|
|
630
663
|
/**
|
@@ -636,28 +669,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
|
|
636
669
|
ZDICT_cover_params_t parameters) {
|
637
670
|
BYTE *const dict = (BYTE *)dictBuffer;
|
638
671
|
size_t tail = dictBufferCapacity;
|
639
|
-
/* Divide the data
|
640
|
-
|
641
|
-
|
642
|
-
const
|
643
|
-
|
672
|
+
/* Divide the data into epochs. We will select one segment from each epoch. */
|
673
|
+
const COVER_epoch_info_t epochs = COVER_computeEpochs(
|
674
|
+
(U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
|
675
|
+
const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
|
676
|
+
size_t zeroScoreRun = 0;
|
644
677
|
size_t epoch;
|
645
678
|
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
|
646
|
-
epochs,
|
679
|
+
(U32)epochs.num, (U32)epochs.size);
|
647
680
|
/* Loop through the epochs until there are no more segments or the dictionary
|
648
681
|
* is full.
|
649
682
|
*/
|
650
|
-
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
|
651
|
-
const U32 epochBegin = (U32)(epoch *
|
652
|
-
const U32 epochEnd = epochBegin +
|
683
|
+
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
|
684
|
+
const U32 epochBegin = (U32)(epoch * epochs.size);
|
685
|
+
const U32 epochEnd = epochBegin + epochs.size;
|
653
686
|
size_t segmentSize;
|
654
687
|
/* Select a segment */
|
655
688
|
COVER_segment_t segment = COVER_selectSegment(
|
656
689
|
ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
|
657
|
-
/* If the segment covers no dmers, then we are out of content
|
690
|
+
/* If the segment covers no dmers, then we are out of content.
|
691
|
+
* There may be new content in other epochs, so continue for some time.
|
692
|
+
*/
|
658
693
|
if (segment.score == 0) {
|
659
|
-
|
694
|
+
if (++zeroScoreRun >= maxZeroScoreRun) {
|
695
|
+
break;
|
696
|
+
}
|
697
|
+
continue;
|
660
698
|
}
|
699
|
+
zeroScoreRun = 0;
|
661
700
|
/* Trim the segment if necessary and if it is too small then we are done */
|
662
701
|
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
|
663
702
|
if (segmentSize < parameters.d) {
|
@@ -690,11 +729,11 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
|
|
690
729
|
/* Checks */
|
691
730
|
if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
|
692
731
|
DISPLAYLEVEL(1, "Cover parameters incorrect\n");
|
693
|
-
return ERROR(
|
732
|
+
return ERROR(parameter_outOfBound);
|
694
733
|
}
|
695
734
|
if (nbSamples == 0) {
|
696
735
|
DISPLAYLEVEL(1, "Cover must have at least one input file\n");
|
697
|
-
return ERROR(
|
736
|
+
return ERROR(srcSize_wrong);
|
698
737
|
}
|
699
738
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
700
739
|
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
@@ -702,14 +741,18 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
|
|
702
741
|
return ERROR(dstSize_tooSmall);
|
703
742
|
}
|
704
743
|
/* Initialize context and activeDmers */
|
705
|
-
|
706
|
-
|
707
|
-
|
744
|
+
{
|
745
|
+
size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
|
746
|
+
parameters.d, parameters.splitPoint);
|
747
|
+
if (ZSTD_isError(initVal)) {
|
748
|
+
return initVal;
|
749
|
+
}
|
708
750
|
}
|
751
|
+
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
|
709
752
|
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
|
710
753
|
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
|
711
754
|
COVER_ctx_destroy(&ctx);
|
712
|
-
return ERROR(
|
755
|
+
return ERROR(memory_allocation);
|
713
756
|
}
|
714
757
|
|
715
758
|
DISPLAYLEVEL(2, "Building dictionary\n");
|
@@ -770,7 +813,7 @@ size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
|
|
770
813
|
cctx, dst, dstCapacity, samples + offsets[i],
|
771
814
|
samplesSizes[i], cdict);
|
772
815
|
if (ZSTD_isError(size)) {
|
773
|
-
totalCompressedSize =
|
816
|
+
totalCompressedSize = size;
|
774
817
|
goto _compressCleanup;
|
775
818
|
}
|
776
819
|
totalCompressedSize += size;
|
@@ -846,9 +889,11 @@ void COVER_best_start(COVER_best_t *best) {
|
|
846
889
|
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
|
847
890
|
* If this dictionary is the best so far save it and its parameters.
|
848
891
|
*/
|
849
|
-
void COVER_best_finish(COVER_best_t *best,
|
850
|
-
|
851
|
-
|
892
|
+
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
|
893
|
+
COVER_dictSelection_t selection) {
|
894
|
+
void* dict = selection.dictContent;
|
895
|
+
size_t compressedSize = selection.totalCompressedSize;
|
896
|
+
size_t dictSize = selection.dictSize;
|
852
897
|
if (!best) {
|
853
898
|
return;
|
854
899
|
}
|
@@ -874,10 +919,12 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
|
874
919
|
}
|
875
920
|
}
|
876
921
|
/* Save the dictionary, parameters, and size */
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
922
|
+
if (dict) {
|
923
|
+
memcpy(best->dict, dict, dictSize);
|
924
|
+
best->dictSize = dictSize;
|
925
|
+
best->parameters = parameters;
|
926
|
+
best->compressedSize = compressedSize;
|
927
|
+
}
|
881
928
|
}
|
882
929
|
if (liveJobs == 0) {
|
883
930
|
ZSTD_pthread_cond_broadcast(&best->cond);
|
@@ -886,6 +933,111 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
|
886
933
|
}
|
887
934
|
}
|
888
935
|
|
936
|
+
COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
|
937
|
+
COVER_dictSelection_t selection = { NULL, 0, error };
|
938
|
+
return selection;
|
939
|
+
}
|
940
|
+
|
941
|
+
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
|
942
|
+
return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
|
943
|
+
}
|
944
|
+
|
945
|
+
void COVER_dictSelectionFree(COVER_dictSelection_t selection){
|
946
|
+
free(selection.dictContent);
|
947
|
+
}
|
948
|
+
|
949
|
+
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
|
950
|
+
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
|
951
|
+
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
|
952
|
+
|
953
|
+
size_t largestDict = 0;
|
954
|
+
size_t largestCompressed = 0;
|
955
|
+
BYTE* customDictContentEnd = customDictContent + dictContentSize;
|
956
|
+
|
957
|
+
BYTE * largestDictbuffer = (BYTE *)malloc(dictContentSize);
|
958
|
+
BYTE * candidateDictBuffer = (BYTE *)malloc(dictContentSize);
|
959
|
+
double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
|
960
|
+
|
961
|
+
if (!largestDictbuffer || !candidateDictBuffer) {
|
962
|
+
free(largestDictbuffer);
|
963
|
+
free(candidateDictBuffer);
|
964
|
+
return COVER_dictSelectionError(dictContentSize);
|
965
|
+
}
|
966
|
+
|
967
|
+
/* Initial dictionary size and compressed size */
|
968
|
+
memcpy(largestDictbuffer, customDictContent, dictContentSize);
|
969
|
+
dictContentSize = ZDICT_finalizeDictionary(
|
970
|
+
largestDictbuffer, dictContentSize, customDictContent, dictContentSize,
|
971
|
+
samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
|
972
|
+
|
973
|
+
if (ZDICT_isError(dictContentSize)) {
|
974
|
+
free(largestDictbuffer);
|
975
|
+
free(candidateDictBuffer);
|
976
|
+
return COVER_dictSelectionError(dictContentSize);
|
977
|
+
}
|
978
|
+
|
979
|
+
totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
|
980
|
+
samplesBuffer, offsets,
|
981
|
+
nbCheckSamples, nbSamples,
|
982
|
+
largestDictbuffer, dictContentSize);
|
983
|
+
|
984
|
+
if (ZSTD_isError(totalCompressedSize)) {
|
985
|
+
free(largestDictbuffer);
|
986
|
+
free(candidateDictBuffer);
|
987
|
+
return COVER_dictSelectionError(totalCompressedSize);
|
988
|
+
}
|
989
|
+
|
990
|
+
if (params.shrinkDict == 0) {
|
991
|
+
COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
|
992
|
+
free(candidateDictBuffer);
|
993
|
+
return selection;
|
994
|
+
}
|
995
|
+
|
996
|
+
largestDict = dictContentSize;
|
997
|
+
largestCompressed = totalCompressedSize;
|
998
|
+
dictContentSize = ZDICT_DICTSIZE_MIN;
|
999
|
+
|
1000
|
+
/* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
|
1001
|
+
while (dictContentSize < largestDict) {
|
1002
|
+
memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
|
1003
|
+
dictContentSize = ZDICT_finalizeDictionary(
|
1004
|
+
candidateDictBuffer, dictContentSize, customDictContentEnd - dictContentSize, dictContentSize,
|
1005
|
+
samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
|
1006
|
+
|
1007
|
+
if (ZDICT_isError(dictContentSize)) {
|
1008
|
+
free(largestDictbuffer);
|
1009
|
+
free(candidateDictBuffer);
|
1010
|
+
return COVER_dictSelectionError(dictContentSize);
|
1011
|
+
|
1012
|
+
}
|
1013
|
+
|
1014
|
+
totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
|
1015
|
+
samplesBuffer, offsets,
|
1016
|
+
nbCheckSamples, nbSamples,
|
1017
|
+
candidateDictBuffer, dictContentSize);
|
1018
|
+
|
1019
|
+
if (ZSTD_isError(totalCompressedSize)) {
|
1020
|
+
free(largestDictbuffer);
|
1021
|
+
free(candidateDictBuffer);
|
1022
|
+
return COVER_dictSelectionError(totalCompressedSize);
|
1023
|
+
}
|
1024
|
+
|
1025
|
+
if (totalCompressedSize <= largestCompressed * regressionTolerance) {
|
1026
|
+
COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
|
1027
|
+
free(largestDictbuffer);
|
1028
|
+
return selection;
|
1029
|
+
}
|
1030
|
+
dictContentSize *= 2;
|
1031
|
+
}
|
1032
|
+
dictContentSize = largestDict;
|
1033
|
+
totalCompressedSize = largestCompressed;
|
1034
|
+
{
|
1035
|
+
COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
|
1036
|
+
free(candidateDictBuffer);
|
1037
|
+
return selection;
|
1038
|
+
}
|
1039
|
+
}
|
1040
|
+
|
889
1041
|
/**
|
890
1042
|
* Parameters for COVER_tryParameters().
|
891
1043
|
*/
|
@@ -911,6 +1063,7 @@ static void COVER_tryParameters(void *opaque) {
|
|
911
1063
|
/* Allocate space for hash table, dict, and freqs */
|
912
1064
|
COVER_map_t activeDmers;
|
913
1065
|
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
|
1066
|
+
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
|
914
1067
|
U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
|
915
1068
|
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
|
916
1069
|
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
|
@@ -926,29 +1079,21 @@ static void COVER_tryParameters(void *opaque) {
|
|
926
1079
|
{
|
927
1080
|
const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
|
928
1081
|
dictBufferCapacity, parameters);
|
929
|
-
|
930
|
-
|
931
|
-
|
932
|
-
|
933
|
-
if (
|
934
|
-
DISPLAYLEVEL(1, "Failed to
|
1082
|
+
selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
|
1083
|
+
ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
|
1084
|
+
totalCompressedSize);
|
1085
|
+
|
1086
|
+
if (COVER_dictSelectionIsError(selection)) {
|
1087
|
+
DISPLAYLEVEL(1, "Failed to select dictionary\n");
|
935
1088
|
goto _cleanup;
|
936
1089
|
}
|
937
1090
|
}
|
938
|
-
/* Check total compressed size */
|
939
|
-
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
|
940
|
-
ctx->samples, ctx->offsets,
|
941
|
-
ctx->nbTrainSamples, ctx->nbSamples,
|
942
|
-
dict, dictBufferCapacity);
|
943
|
-
|
944
1091
|
_cleanup:
|
945
|
-
|
946
|
-
|
1092
|
+
free(dict);
|
1093
|
+
COVER_best_finish(data->best, parameters, selection);
|
947
1094
|
free(data);
|
948
1095
|
COVER_map_destroy(&activeDmers);
|
949
|
-
|
950
|
-
free(dict);
|
951
|
-
}
|
1096
|
+
COVER_dictSelectionFree(selection);
|
952
1097
|
if (freqs) {
|
953
1098
|
free(freqs);
|
954
1099
|
}
|
@@ -970,6 +1115,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
970
1115
|
const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
|
971
1116
|
const unsigned kIterations =
|
972
1117
|
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
|
1118
|
+
const unsigned shrinkDict = 0;
|
973
1119
|
/* Local variables */
|
974
1120
|
const int displayLevel = parameters->zParams.notificationLevel;
|
975
1121
|
unsigned iteration = 1;
|
@@ -977,19 +1123,20 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
977
1123
|
unsigned k;
|
978
1124
|
COVER_best_t best;
|
979
1125
|
POOL_ctx *pool = NULL;
|
1126
|
+
int warned = 0;
|
980
1127
|
|
981
1128
|
/* Checks */
|
982
1129
|
if (splitPoint <= 0 || splitPoint > 1) {
|
983
1130
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
|
984
|
-
return ERROR(
|
1131
|
+
return ERROR(parameter_outOfBound);
|
985
1132
|
}
|
986
1133
|
if (kMinK < kMaxD || kMaxK < kMinK) {
|
987
1134
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
|
988
|
-
return ERROR(
|
1135
|
+
return ERROR(parameter_outOfBound);
|
989
1136
|
}
|
990
1137
|
if (nbSamples == 0) {
|
991
1138
|
DISPLAYLEVEL(1, "Cover must have at least one input file\n");
|
992
|
-
return ERROR(
|
1139
|
+
return ERROR(srcSize_wrong);
|
993
1140
|
}
|
994
1141
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
995
1142
|
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
@@ -1013,11 +1160,18 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
1013
1160
|
/* Initialize the context for this value of d */
|
1014
1161
|
COVER_ctx_t ctx;
|
1015
1162
|
LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
|
1016
|
-
|
1017
|
-
|
1018
|
-
|
1019
|
-
|
1020
|
-
|
1163
|
+
{
|
1164
|
+
const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
|
1165
|
+
if (ZSTD_isError(initVal)) {
|
1166
|
+
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
|
1167
|
+
COVER_best_destroy(&best);
|
1168
|
+
POOL_free(pool);
|
1169
|
+
return initVal;
|
1170
|
+
}
|
1171
|
+
}
|
1172
|
+
if (!warned) {
|
1173
|
+
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
|
1174
|
+
warned = 1;
|
1021
1175
|
}
|
1022
1176
|
/* Loop through k reusing the same context */
|
1023
1177
|
for (k = kMinK; k <= kMaxK; k += kStepSize) {
|
@@ -1030,7 +1184,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
1030
1184
|
COVER_best_destroy(&best);
|
1031
1185
|
COVER_ctx_destroy(&ctx);
|
1032
1186
|
POOL_free(pool);
|
1033
|
-
return ERROR(
|
1187
|
+
return ERROR(memory_allocation);
|
1034
1188
|
}
|
1035
1189
|
data->ctx = &ctx;
|
1036
1190
|
data->best = &best;
|
@@ -1040,6 +1194,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
1040
1194
|
data->parameters.d = d;
|
1041
1195
|
data->parameters.splitPoint = splitPoint;
|
1042
1196
|
data->parameters.steps = kSteps;
|
1197
|
+
data->parameters.shrinkDict = shrinkDict;
|
1043
1198
|
data->parameters.zParams.notificationLevel = g_displayLevel;
|
1044
1199
|
/* Check the parameters */
|
1045
1200
|
if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
|