zstd-ruby 1.3.8.0 → 1.4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +6 -5
- data/README.md +1 -1
- data/ext/zstdruby/libzstd/Makefile +133 -61
- data/ext/zstdruby/libzstd/README.md +51 -18
- data/ext/zstdruby/libzstd/common/bitstream.h +38 -39
- data/ext/zstdruby/libzstd/common/compiler.h +41 -6
- data/ext/zstdruby/libzstd/common/cpu.h +1 -1
- data/ext/zstdruby/libzstd/common/debug.c +11 -31
- data/ext/zstdruby/libzstd/common/debug.h +11 -31
- data/ext/zstdruby/libzstd/common/entropy_common.c +13 -33
- data/ext/zstdruby/libzstd/common/error_private.c +2 -1
- data/ext/zstdruby/libzstd/common/error_private.h +6 -2
- data/ext/zstdruby/libzstd/common/fse.h +13 -33
- data/ext/zstdruby/libzstd/common/fse_decompress.c +12 -35
- data/ext/zstdruby/libzstd/common/huf.h +15 -33
- data/ext/zstdruby/libzstd/common/mem.h +75 -2
- data/ext/zstdruby/libzstd/common/pool.c +8 -4
- data/ext/zstdruby/libzstd/common/pool.h +2 -2
- data/ext/zstdruby/libzstd/common/threading.c +52 -6
- data/ext/zstdruby/libzstd/common/threading.h +36 -4
- data/ext/zstdruby/libzstd/common/xxhash.c +25 -37
- data/ext/zstdruby/libzstd/common/xxhash.h +11 -31
- data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
- data/ext/zstdruby/libzstd/common/zstd_errors.h +2 -1
- data/ext/zstdruby/libzstd/common/zstd_internal.h +203 -22
- data/ext/zstdruby/libzstd/compress/fse_compress.c +19 -42
- data/ext/zstdruby/libzstd/compress/hist.c +15 -35
- data/ext/zstdruby/libzstd/compress/hist.h +12 -32
- data/ext/zstdruby/libzstd/compress/huf_compress.c +92 -92
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +1460 -1472
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +330 -65
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +158 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +29 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +419 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +54 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +845 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
- data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +525 -0
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +65 -43
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +264 -159
- data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_lazy.c +74 -42
- data/ext/zstdruby/libzstd/compress/zstd_lazy.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +33 -11
- data/ext/zstdruby/libzstd/compress/zstd_ldm.h +7 -2
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +108 -125
- data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +129 -93
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +46 -28
- data/ext/zstdruby/libzstd/decompress/huf_decompress.c +76 -60
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +14 -10
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +2 -2
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +471 -258
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +471 -346
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +3 -3
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +25 -4
- data/ext/zstdruby/libzstd/deprecated/zbuff.h +9 -8
- data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
- data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +1 -1
- data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +220 -65
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +81 -7
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +85 -56
- data/ext/zstdruby/libzstd/dictBuilder/zdict.c +43 -19
- data/ext/zstdruby/libzstd/dictBuilder/zdict.h +73 -35
- data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
- data/ext/zstdruby/libzstd/dll/example/build_package.bat +3 -2
- data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +49 -15
- data/ext/zstdruby/libzstd/legacy/zstd_v01.c +142 -117
- data/ext/zstdruby/libzstd/legacy/zstd_v01.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v02.c +54 -25
- data/ext/zstdruby/libzstd/legacy/zstd_v02.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v03.c +55 -25
- data/ext/zstdruby/libzstd/legacy/zstd_v03.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v04.c +62 -29
- data/ext/zstdruby/libzstd/legacy/zstd_v04.h +13 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v05.c +145 -109
- data/ext/zstdruby/libzstd/legacy/zstd_v05.h +14 -9
- data/ext/zstdruby/libzstd/legacy/zstd_v06.c +56 -26
- data/ext/zstdruby/libzstd/legacy/zstd_v06.h +11 -6
- data/ext/zstdruby/libzstd/legacy/zstd_v07.c +65 -28
- data/ext/zstdruby/libzstd/legacy/zstd_v07.h +11 -6
- data/ext/zstdruby/libzstd/libzstd.pc.in +3 -2
- data/ext/zstdruby/libzstd/zstd.h +921 -597
- data/lib/zstd-ruby/version.rb +1 -1
- data/zstd-ruby.gemspec +2 -2
- metadata +19 -14
- data/ext/zstdruby/libzstd/dll/libzstd.def +0 -87
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/*
|
|
2
|
-
* Copyright (c) 2016-
|
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
|
3
3
|
* All rights reserved.
|
|
4
4
|
*
|
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -16,8 +16,8 @@
|
|
|
16
16
|
* Dependencies
|
|
17
17
|
*********************************************************/
|
|
18
18
|
#include <stddef.h> /* size_t */
|
|
19
|
-
#include "zstd.h" /* DCtx, and some public functions */
|
|
20
|
-
#include "zstd_internal.h" /* blockProperties_t, and some public functions */
|
|
19
|
+
#include "../zstd.h" /* DCtx, and some public functions */
|
|
20
|
+
#include "../common/zstd_internal.h" /* blockProperties_t, and some public functions */
|
|
21
21
|
#include "zstd_decompress_internal.h" /* ZSTD_seqSymbol */
|
|
22
22
|
|
|
23
23
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/*
|
|
2
|
-
* Copyright (c) 2016-
|
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
|
3
3
|
* All rights reserved.
|
|
4
4
|
*
|
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -19,8 +19,8 @@
|
|
|
19
19
|
/*-*******************************************************
|
|
20
20
|
* Dependencies
|
|
21
21
|
*********************************************************/
|
|
22
|
-
#include "mem.h" /* BYTE, U16, U32 */
|
|
23
|
-
#include "zstd_internal.h" /* ZSTD_seqSymbol */
|
|
22
|
+
#include "../common/mem.h" /* BYTE, U16, U32 */
|
|
23
|
+
#include "../common/zstd_internal.h" /* ZSTD_seqSymbol */
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
|
|
@@ -89,6 +89,17 @@ typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
|
|
|
89
89
|
typedef enum { zdss_init=0, zdss_loadHeader,
|
|
90
90
|
zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
|
|
91
91
|
|
|
92
|
+
typedef enum {
|
|
93
|
+
ZSTD_use_indefinitely = -1, /* Use the dictionary indefinitely */
|
|
94
|
+
ZSTD_dont_use = 0, /* Do not use the dictionary (if one exists free it) */
|
|
95
|
+
ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */
|
|
96
|
+
} ZSTD_dictUses_e;
|
|
97
|
+
|
|
98
|
+
typedef enum {
|
|
99
|
+
ZSTD_obm_buffered = 0, /* Buffer the output */
|
|
100
|
+
ZSTD_obm_stable = 1 /* ZSTD_outBuffer is stable */
|
|
101
|
+
} ZSTD_outBufferMode_e;
|
|
102
|
+
|
|
92
103
|
struct ZSTD_DCtx_s
|
|
93
104
|
{
|
|
94
105
|
const ZSTD_seqSymbol* LLTptr;
|
|
@@ -123,6 +134,7 @@ struct ZSTD_DCtx_s
|
|
|
123
134
|
const ZSTD_DDict* ddict; /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
|
|
124
135
|
U32 dictID;
|
|
125
136
|
int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
|
|
137
|
+
ZSTD_dictUses_e dictUses;
|
|
126
138
|
|
|
127
139
|
/* streaming */
|
|
128
140
|
ZSTD_dStreamStage streamStage;
|
|
@@ -140,10 +152,19 @@ struct ZSTD_DCtx_s
|
|
|
140
152
|
U32 legacyVersion;
|
|
141
153
|
U32 hostageByte;
|
|
142
154
|
int noForwardProgress;
|
|
155
|
+
ZSTD_outBufferMode_e outBufferMode;
|
|
156
|
+
ZSTD_outBuffer expectedOutBuffer;
|
|
143
157
|
|
|
144
158
|
/* workspace */
|
|
145
159
|
BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
|
|
146
160
|
BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
|
|
161
|
+
|
|
162
|
+
size_t oversizedDuration;
|
|
163
|
+
|
|
164
|
+
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
|
165
|
+
void const* dictContentBeginForFuzzing;
|
|
166
|
+
void const* dictContentEndForFuzzing;
|
|
167
|
+
#endif
|
|
147
168
|
}; /* typedef'd to ZSTD_DCtx within "zstd.h" */
|
|
148
169
|
|
|
149
170
|
|
|
@@ -153,7 +174,7 @@ struct ZSTD_DCtx_s
|
|
|
153
174
|
|
|
154
175
|
/*! ZSTD_loadDEntropy() :
|
|
155
176
|
* dict : must point at beginning of a valid zstd dictionary.
|
|
156
|
-
* @return : size of entropy tables
|
|
177
|
+
* @return : size of dictionary header (size of magic number + dict ID + entropy tables) */
|
|
157
178
|
size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
|
|
158
179
|
const void* const dict, size_t const dictSize);
|
|
159
180
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/*
|
|
2
|
-
* Copyright (c) 2016-
|
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
|
3
3
|
* All rights reserved.
|
|
4
4
|
*
|
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -28,7 +28,7 @@ extern "C" {
|
|
|
28
28
|
* Dependencies
|
|
29
29
|
***************************************/
|
|
30
30
|
#include <stddef.h> /* size_t */
|
|
31
|
-
#include "zstd.h" /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */
|
|
31
|
+
#include "../zstd.h" /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
/* ***************************************************************
|
|
@@ -36,16 +36,17 @@ extern "C" {
|
|
|
36
36
|
*****************************************************************/
|
|
37
37
|
/* Deprecation warnings */
|
|
38
38
|
/* Should these warnings be a problem,
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
39
|
+
* it is generally possible to disable them,
|
|
40
|
+
* typically with -Wno-deprecated-declarations for gcc
|
|
41
|
+
* or _CRT_SECURE_NO_WARNINGS in Visual.
|
|
42
|
+
* Otherwise, it's also possible to define ZBUFF_DISABLE_DEPRECATE_WARNINGS
|
|
43
|
+
*/
|
|
43
44
|
#ifdef ZBUFF_DISABLE_DEPRECATE_WARNINGS
|
|
44
45
|
# define ZBUFF_DEPRECATED(message) ZSTDLIB_API /* disable deprecation warnings */
|
|
45
46
|
#else
|
|
46
47
|
# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
|
|
47
48
|
# define ZBUFF_DEPRECATED(message) [[deprecated(message)]] ZSTDLIB_API
|
|
48
|
-
# elif (defined(
|
|
49
|
+
# elif (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__)
|
|
49
50
|
# define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated(message)))
|
|
50
51
|
# elif defined(__GNUC__) && (__GNUC__ >= 3)
|
|
51
52
|
# define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated))
|
|
@@ -185,7 +186,7 @@ ZBUFF_DEPRECATED("use ZSTD_DStreamOutSize") size_t ZBUFF_recommendedDOutSize(voi
|
|
|
185
186
|
|
|
186
187
|
/*--- Dependency ---*/
|
|
187
188
|
#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters, ZSTD_customMem */
|
|
188
|
-
#include "zstd.h"
|
|
189
|
+
#include "../zstd.h"
|
|
189
190
|
|
|
190
191
|
|
|
191
192
|
/*--- Custom memory allocator ---*/
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/*
|
|
2
|
-
* Copyright (c) 2016-
|
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
|
3
3
|
* All rights reserved.
|
|
4
4
|
*
|
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
/*-*************************************
|
|
12
12
|
* Dependencies
|
|
13
13
|
***************************************/
|
|
14
|
-
#include "error_private.h"
|
|
14
|
+
#include "../common/error_private.h"
|
|
15
15
|
#include "zbuff.h"
|
|
16
16
|
|
|
17
17
|
/*-****************************************
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/*
|
|
2
|
-
* Copyright (c) 2016-
|
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
|
3
3
|
* All rights reserved.
|
|
4
4
|
*
|
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -26,11 +26,11 @@
|
|
|
26
26
|
#include <string.h> /* memset */
|
|
27
27
|
#include <time.h> /* clock */
|
|
28
28
|
|
|
29
|
-
#include "mem.h" /* read */
|
|
30
|
-
#include "pool.h"
|
|
31
|
-
#include "threading.h"
|
|
29
|
+
#include "../common/mem.h" /* read */
|
|
30
|
+
#include "../common/pool.h"
|
|
31
|
+
#include "../common/threading.h"
|
|
32
32
|
#include "cover.h"
|
|
33
|
-
#include "zstd_internal.h" /* includes zstd.h */
|
|
33
|
+
#include "../common/zstd_internal.h" /* includes zstd.h */
|
|
34
34
|
#ifndef ZDICT_STATIC_LINKING_ONLY
|
|
35
35
|
#define ZDICT_STATIC_LINKING_ONLY
|
|
36
36
|
#endif
|
|
@@ -391,7 +391,7 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
|
|
|
391
391
|
*
|
|
392
392
|
* Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
|
|
393
393
|
*
|
|
394
|
-
* Once the dmer d is in the
|
|
394
|
+
* Once the dmer d is in the dictionary we set F(d) = 0.
|
|
395
395
|
*/
|
|
396
396
|
static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
|
|
397
397
|
COVER_map_t *activeDmers, U32 begin,
|
|
@@ -435,7 +435,7 @@ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
|
|
|
435
435
|
U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
|
|
436
436
|
activeSegment.begin += 1;
|
|
437
437
|
*delDmerOcc -= 1;
|
|
438
|
-
/* If this is the last
|
|
438
|
+
/* If this is the last occurrence of the dmer, subtract its score */
|
|
439
439
|
if (*delDmerOcc == 0) {
|
|
440
440
|
COVER_map_remove(activeDmers, delDmer);
|
|
441
441
|
activeSegment.score -= freqs[delDmer];
|
|
@@ -526,10 +526,10 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
|
|
|
526
526
|
* Prepare a context for dictionary building.
|
|
527
527
|
* The context is only dependent on the parameter `d` and can be used multiple
|
|
528
528
|
* times.
|
|
529
|
-
* Returns
|
|
529
|
+
* Returns 0 on success or error code on error.
|
|
530
530
|
* The context must be destroyed with `COVER_ctx_destroy()`.
|
|
531
531
|
*/
|
|
532
|
-
static
|
|
532
|
+
static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
|
533
533
|
const size_t *samplesSizes, unsigned nbSamples,
|
|
534
534
|
unsigned d, double splitPoint) {
|
|
535
535
|
const BYTE *const samples = (const BYTE *)samplesBuffer;
|
|
@@ -544,17 +544,17 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
|
|
544
544
|
totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
|
|
545
545
|
DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
|
|
546
546
|
(unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
|
|
547
|
-
return
|
|
547
|
+
return ERROR(srcSize_wrong);
|
|
548
548
|
}
|
|
549
549
|
/* Check if there are at least 5 training samples */
|
|
550
550
|
if (nbTrainSamples < 5) {
|
|
551
551
|
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
|
|
552
|
-
return
|
|
552
|
+
return ERROR(srcSize_wrong);
|
|
553
553
|
}
|
|
554
554
|
/* Check if there's testing sample */
|
|
555
555
|
if (nbTestSamples < 1) {
|
|
556
556
|
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
|
|
557
|
-
return
|
|
557
|
+
return ERROR(srcSize_wrong);
|
|
558
558
|
}
|
|
559
559
|
/* Zero the context */
|
|
560
560
|
memset(ctx, 0, sizeof(*ctx));
|
|
@@ -577,7 +577,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
|
|
577
577
|
if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
|
|
578
578
|
DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
|
|
579
579
|
COVER_ctx_destroy(ctx);
|
|
580
|
-
return
|
|
580
|
+
return ERROR(memory_allocation);
|
|
581
581
|
}
|
|
582
582
|
ctx->freqs = NULL;
|
|
583
583
|
ctx->d = d;
|
|
@@ -624,7 +624,40 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
|
|
624
624
|
(ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
|
|
625
625
|
ctx->freqs = ctx->suffix;
|
|
626
626
|
ctx->suffix = NULL;
|
|
627
|
-
return
|
|
627
|
+
return 0;
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
|
|
631
|
+
{
|
|
632
|
+
const double ratio = (double)nbDmers / maxDictSize;
|
|
633
|
+
if (ratio >= 10) {
|
|
634
|
+
return;
|
|
635
|
+
}
|
|
636
|
+
LOCALDISPLAYLEVEL(displayLevel, 1,
|
|
637
|
+
"WARNING: The maximum dictionary size %u is too large "
|
|
638
|
+
"compared to the source size %u! "
|
|
639
|
+
"size(source)/size(dictionary) = %f, but it should be >= "
|
|
640
|
+
"10! This may lead to a subpar dictionary! We recommend "
|
|
641
|
+
"training on sources at least 10x, and preferably 100x "
|
|
642
|
+
"the size of the dictionary! \n", (U32)maxDictSize,
|
|
643
|
+
(U32)nbDmers, ratio);
|
|
644
|
+
}
|
|
645
|
+
|
|
646
|
+
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
|
|
647
|
+
U32 nbDmers, U32 k, U32 passes)
|
|
648
|
+
{
|
|
649
|
+
const U32 minEpochSize = k * 10;
|
|
650
|
+
COVER_epoch_info_t epochs;
|
|
651
|
+
epochs.num = MAX(1, maxDictSize / k / passes);
|
|
652
|
+
epochs.size = nbDmers / epochs.num;
|
|
653
|
+
if (epochs.size >= minEpochSize) {
|
|
654
|
+
assert(epochs.size * epochs.num <= nbDmers);
|
|
655
|
+
return epochs;
|
|
656
|
+
}
|
|
657
|
+
epochs.size = MIN(minEpochSize, nbDmers);
|
|
658
|
+
epochs.num = nbDmers / epochs.size;
|
|
659
|
+
assert(epochs.size * epochs.num <= nbDmers);
|
|
660
|
+
return epochs;
|
|
628
661
|
}
|
|
629
662
|
|
|
630
663
|
/**
|
|
@@ -636,28 +669,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
|
|
|
636
669
|
ZDICT_cover_params_t parameters) {
|
|
637
670
|
BYTE *const dict = (BYTE *)dictBuffer;
|
|
638
671
|
size_t tail = dictBufferCapacity;
|
|
639
|
-
/* Divide the data
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
const
|
|
643
|
-
|
|
672
|
+
/* Divide the data into epochs. We will select one segment from each epoch. */
|
|
673
|
+
const COVER_epoch_info_t epochs = COVER_computeEpochs(
|
|
674
|
+
(U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
|
|
675
|
+
const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
|
|
676
|
+
size_t zeroScoreRun = 0;
|
|
644
677
|
size_t epoch;
|
|
645
678
|
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
|
|
646
|
-
epochs,
|
|
679
|
+
(U32)epochs.num, (U32)epochs.size);
|
|
647
680
|
/* Loop through the epochs until there are no more segments or the dictionary
|
|
648
681
|
* is full.
|
|
649
682
|
*/
|
|
650
|
-
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
|
|
651
|
-
const U32 epochBegin = (U32)(epoch *
|
|
652
|
-
const U32 epochEnd = epochBegin +
|
|
683
|
+
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
|
|
684
|
+
const U32 epochBegin = (U32)(epoch * epochs.size);
|
|
685
|
+
const U32 epochEnd = epochBegin + epochs.size;
|
|
653
686
|
size_t segmentSize;
|
|
654
687
|
/* Select a segment */
|
|
655
688
|
COVER_segment_t segment = COVER_selectSegment(
|
|
656
689
|
ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
|
|
657
|
-
/* If the segment covers no dmers, then we are out of content
|
|
690
|
+
/* If the segment covers no dmers, then we are out of content.
|
|
691
|
+
* There may be new content in other epochs, so continue for some time.
|
|
692
|
+
*/
|
|
658
693
|
if (segment.score == 0) {
|
|
659
|
-
|
|
694
|
+
if (++zeroScoreRun >= maxZeroScoreRun) {
|
|
695
|
+
break;
|
|
696
|
+
}
|
|
697
|
+
continue;
|
|
660
698
|
}
|
|
699
|
+
zeroScoreRun = 0;
|
|
661
700
|
/* Trim the segment if necessary and if it is too small then we are done */
|
|
662
701
|
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
|
|
663
702
|
if (segmentSize < parameters.d) {
|
|
@@ -690,11 +729,11 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
|
|
|
690
729
|
/* Checks */
|
|
691
730
|
if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
|
|
692
731
|
DISPLAYLEVEL(1, "Cover parameters incorrect\n");
|
|
693
|
-
return ERROR(
|
|
732
|
+
return ERROR(parameter_outOfBound);
|
|
694
733
|
}
|
|
695
734
|
if (nbSamples == 0) {
|
|
696
735
|
DISPLAYLEVEL(1, "Cover must have at least one input file\n");
|
|
697
|
-
return ERROR(
|
|
736
|
+
return ERROR(srcSize_wrong);
|
|
698
737
|
}
|
|
699
738
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
|
700
739
|
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
|
@@ -702,14 +741,18 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
|
|
|
702
741
|
return ERROR(dstSize_tooSmall);
|
|
703
742
|
}
|
|
704
743
|
/* Initialize context and activeDmers */
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
744
|
+
{
|
|
745
|
+
size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
|
|
746
|
+
parameters.d, parameters.splitPoint);
|
|
747
|
+
if (ZSTD_isError(initVal)) {
|
|
748
|
+
return initVal;
|
|
749
|
+
}
|
|
708
750
|
}
|
|
751
|
+
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
|
|
709
752
|
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
|
|
710
753
|
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
|
|
711
754
|
COVER_ctx_destroy(&ctx);
|
|
712
|
-
return ERROR(
|
|
755
|
+
return ERROR(memory_allocation);
|
|
713
756
|
}
|
|
714
757
|
|
|
715
758
|
DISPLAYLEVEL(2, "Building dictionary\n");
|
|
@@ -770,7 +813,7 @@ size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
|
|
|
770
813
|
cctx, dst, dstCapacity, samples + offsets[i],
|
|
771
814
|
samplesSizes[i], cdict);
|
|
772
815
|
if (ZSTD_isError(size)) {
|
|
773
|
-
totalCompressedSize =
|
|
816
|
+
totalCompressedSize = size;
|
|
774
817
|
goto _compressCleanup;
|
|
775
818
|
}
|
|
776
819
|
totalCompressedSize += size;
|
|
@@ -846,9 +889,11 @@ void COVER_best_start(COVER_best_t *best) {
|
|
|
846
889
|
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
|
|
847
890
|
* If this dictionary is the best so far save it and its parameters.
|
|
848
891
|
*/
|
|
849
|
-
void COVER_best_finish(COVER_best_t *best,
|
|
850
|
-
|
|
851
|
-
|
|
892
|
+
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
|
|
893
|
+
COVER_dictSelection_t selection) {
|
|
894
|
+
void* dict = selection.dictContent;
|
|
895
|
+
size_t compressedSize = selection.totalCompressedSize;
|
|
896
|
+
size_t dictSize = selection.dictSize;
|
|
852
897
|
if (!best) {
|
|
853
898
|
return;
|
|
854
899
|
}
|
|
@@ -874,10 +919,12 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
|
|
874
919
|
}
|
|
875
920
|
}
|
|
876
921
|
/* Save the dictionary, parameters, and size */
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
922
|
+
if (dict) {
|
|
923
|
+
memcpy(best->dict, dict, dictSize);
|
|
924
|
+
best->dictSize = dictSize;
|
|
925
|
+
best->parameters = parameters;
|
|
926
|
+
best->compressedSize = compressedSize;
|
|
927
|
+
}
|
|
881
928
|
}
|
|
882
929
|
if (liveJobs == 0) {
|
|
883
930
|
ZSTD_pthread_cond_broadcast(&best->cond);
|
|
@@ -886,6 +933,111 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
|
|
886
933
|
}
|
|
887
934
|
}
|
|
888
935
|
|
|
936
|
+
COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
|
|
937
|
+
COVER_dictSelection_t selection = { NULL, 0, error };
|
|
938
|
+
return selection;
|
|
939
|
+
}
|
|
940
|
+
|
|
941
|
+
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
|
|
942
|
+
return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
|
|
943
|
+
}
|
|
944
|
+
|
|
945
|
+
void COVER_dictSelectionFree(COVER_dictSelection_t selection){
|
|
946
|
+
free(selection.dictContent);
|
|
947
|
+
}
|
|
948
|
+
|
|
949
|
+
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
|
|
950
|
+
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
|
|
951
|
+
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
|
|
952
|
+
|
|
953
|
+
size_t largestDict = 0;
|
|
954
|
+
size_t largestCompressed = 0;
|
|
955
|
+
BYTE* customDictContentEnd = customDictContent + dictContentSize;
|
|
956
|
+
|
|
957
|
+
BYTE * largestDictbuffer = (BYTE *)malloc(dictContentSize);
|
|
958
|
+
BYTE * candidateDictBuffer = (BYTE *)malloc(dictContentSize);
|
|
959
|
+
double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
|
|
960
|
+
|
|
961
|
+
if (!largestDictbuffer || !candidateDictBuffer) {
|
|
962
|
+
free(largestDictbuffer);
|
|
963
|
+
free(candidateDictBuffer);
|
|
964
|
+
return COVER_dictSelectionError(dictContentSize);
|
|
965
|
+
}
|
|
966
|
+
|
|
967
|
+
/* Initial dictionary size and compressed size */
|
|
968
|
+
memcpy(largestDictbuffer, customDictContent, dictContentSize);
|
|
969
|
+
dictContentSize = ZDICT_finalizeDictionary(
|
|
970
|
+
largestDictbuffer, dictContentSize, customDictContent, dictContentSize,
|
|
971
|
+
samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
|
|
972
|
+
|
|
973
|
+
if (ZDICT_isError(dictContentSize)) {
|
|
974
|
+
free(largestDictbuffer);
|
|
975
|
+
free(candidateDictBuffer);
|
|
976
|
+
return COVER_dictSelectionError(dictContentSize);
|
|
977
|
+
}
|
|
978
|
+
|
|
979
|
+
totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
|
|
980
|
+
samplesBuffer, offsets,
|
|
981
|
+
nbCheckSamples, nbSamples,
|
|
982
|
+
largestDictbuffer, dictContentSize);
|
|
983
|
+
|
|
984
|
+
if (ZSTD_isError(totalCompressedSize)) {
|
|
985
|
+
free(largestDictbuffer);
|
|
986
|
+
free(candidateDictBuffer);
|
|
987
|
+
return COVER_dictSelectionError(totalCompressedSize);
|
|
988
|
+
}
|
|
989
|
+
|
|
990
|
+
if (params.shrinkDict == 0) {
|
|
991
|
+
COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
|
|
992
|
+
free(candidateDictBuffer);
|
|
993
|
+
return selection;
|
|
994
|
+
}
|
|
995
|
+
|
|
996
|
+
largestDict = dictContentSize;
|
|
997
|
+
largestCompressed = totalCompressedSize;
|
|
998
|
+
dictContentSize = ZDICT_DICTSIZE_MIN;
|
|
999
|
+
|
|
1000
|
+
/* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
|
|
1001
|
+
while (dictContentSize < largestDict) {
|
|
1002
|
+
memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
|
|
1003
|
+
dictContentSize = ZDICT_finalizeDictionary(
|
|
1004
|
+
candidateDictBuffer, dictContentSize, customDictContentEnd - dictContentSize, dictContentSize,
|
|
1005
|
+
samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
|
|
1006
|
+
|
|
1007
|
+
if (ZDICT_isError(dictContentSize)) {
|
|
1008
|
+
free(largestDictbuffer);
|
|
1009
|
+
free(candidateDictBuffer);
|
|
1010
|
+
return COVER_dictSelectionError(dictContentSize);
|
|
1011
|
+
|
|
1012
|
+
}
|
|
1013
|
+
|
|
1014
|
+
totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
|
|
1015
|
+
samplesBuffer, offsets,
|
|
1016
|
+
nbCheckSamples, nbSamples,
|
|
1017
|
+
candidateDictBuffer, dictContentSize);
|
|
1018
|
+
|
|
1019
|
+
if (ZSTD_isError(totalCompressedSize)) {
|
|
1020
|
+
free(largestDictbuffer);
|
|
1021
|
+
free(candidateDictBuffer);
|
|
1022
|
+
return COVER_dictSelectionError(totalCompressedSize);
|
|
1023
|
+
}
|
|
1024
|
+
|
|
1025
|
+
if (totalCompressedSize <= largestCompressed * regressionTolerance) {
|
|
1026
|
+
COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
|
|
1027
|
+
free(largestDictbuffer);
|
|
1028
|
+
return selection;
|
|
1029
|
+
}
|
|
1030
|
+
dictContentSize *= 2;
|
|
1031
|
+
}
|
|
1032
|
+
dictContentSize = largestDict;
|
|
1033
|
+
totalCompressedSize = largestCompressed;
|
|
1034
|
+
{
|
|
1035
|
+
COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
|
|
1036
|
+
free(candidateDictBuffer);
|
|
1037
|
+
return selection;
|
|
1038
|
+
}
|
|
1039
|
+
}
|
|
1040
|
+
|
|
889
1041
|
/**
|
|
890
1042
|
* Parameters for COVER_tryParameters().
|
|
891
1043
|
*/
|
|
@@ -911,6 +1063,7 @@ static void COVER_tryParameters(void *opaque) {
|
|
|
911
1063
|
/* Allocate space for hash table, dict, and freqs */
|
|
912
1064
|
COVER_map_t activeDmers;
|
|
913
1065
|
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
|
|
1066
|
+
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
|
|
914
1067
|
U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
|
|
915
1068
|
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
|
|
916
1069
|
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
|
|
@@ -926,29 +1079,21 @@ static void COVER_tryParameters(void *opaque) {
|
|
|
926
1079
|
{
|
|
927
1080
|
const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
|
|
928
1081
|
dictBufferCapacity, parameters);
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
if (
|
|
934
|
-
DISPLAYLEVEL(1, "Failed to
|
|
1082
|
+
selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
|
|
1083
|
+
ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
|
|
1084
|
+
totalCompressedSize);
|
|
1085
|
+
|
|
1086
|
+
if (COVER_dictSelectionIsError(selection)) {
|
|
1087
|
+
DISPLAYLEVEL(1, "Failed to select dictionary\n");
|
|
935
1088
|
goto _cleanup;
|
|
936
1089
|
}
|
|
937
1090
|
}
|
|
938
|
-
/* Check total compressed size */
|
|
939
|
-
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
|
|
940
|
-
ctx->samples, ctx->offsets,
|
|
941
|
-
ctx->nbTrainSamples, ctx->nbSamples,
|
|
942
|
-
dict, dictBufferCapacity);
|
|
943
|
-
|
|
944
1091
|
_cleanup:
|
|
945
|
-
|
|
946
|
-
|
|
1092
|
+
free(dict);
|
|
1093
|
+
COVER_best_finish(data->best, parameters, selection);
|
|
947
1094
|
free(data);
|
|
948
1095
|
COVER_map_destroy(&activeDmers);
|
|
949
|
-
|
|
950
|
-
free(dict);
|
|
951
|
-
}
|
|
1096
|
+
COVER_dictSelectionFree(selection);
|
|
952
1097
|
if (freqs) {
|
|
953
1098
|
free(freqs);
|
|
954
1099
|
}
|
|
@@ -970,6 +1115,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
|
970
1115
|
const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
|
|
971
1116
|
const unsigned kIterations =
|
|
972
1117
|
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
|
|
1118
|
+
const unsigned shrinkDict = 0;
|
|
973
1119
|
/* Local variables */
|
|
974
1120
|
const int displayLevel = parameters->zParams.notificationLevel;
|
|
975
1121
|
unsigned iteration = 1;
|
|
@@ -977,19 +1123,20 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
|
977
1123
|
unsigned k;
|
|
978
1124
|
COVER_best_t best;
|
|
979
1125
|
POOL_ctx *pool = NULL;
|
|
1126
|
+
int warned = 0;
|
|
980
1127
|
|
|
981
1128
|
/* Checks */
|
|
982
1129
|
if (splitPoint <= 0 || splitPoint > 1) {
|
|
983
1130
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
|
|
984
|
-
return ERROR(
|
|
1131
|
+
return ERROR(parameter_outOfBound);
|
|
985
1132
|
}
|
|
986
1133
|
if (kMinK < kMaxD || kMaxK < kMinK) {
|
|
987
1134
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
|
|
988
|
-
return ERROR(
|
|
1135
|
+
return ERROR(parameter_outOfBound);
|
|
989
1136
|
}
|
|
990
1137
|
if (nbSamples == 0) {
|
|
991
1138
|
DISPLAYLEVEL(1, "Cover must have at least one input file\n");
|
|
992
|
-
return ERROR(
|
|
1139
|
+
return ERROR(srcSize_wrong);
|
|
993
1140
|
}
|
|
994
1141
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
|
995
1142
|
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
|
@@ -1013,11 +1160,18 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
|
1013
1160
|
/* Initialize the context for this value of d */
|
|
1014
1161
|
COVER_ctx_t ctx;
|
|
1015
1162
|
LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1163
|
+
{
|
|
1164
|
+
const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
|
|
1165
|
+
if (ZSTD_isError(initVal)) {
|
|
1166
|
+
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
|
|
1167
|
+
COVER_best_destroy(&best);
|
|
1168
|
+
POOL_free(pool);
|
|
1169
|
+
return initVal;
|
|
1170
|
+
}
|
|
1171
|
+
}
|
|
1172
|
+
if (!warned) {
|
|
1173
|
+
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
|
|
1174
|
+
warned = 1;
|
|
1021
1175
|
}
|
|
1022
1176
|
/* Loop through k reusing the same context */
|
|
1023
1177
|
for (k = kMinK; k <= kMaxK; k += kStepSize) {
|
|
@@ -1030,7 +1184,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
|
1030
1184
|
COVER_best_destroy(&best);
|
|
1031
1185
|
COVER_ctx_destroy(&ctx);
|
|
1032
1186
|
POOL_free(pool);
|
|
1033
|
-
return ERROR(
|
|
1187
|
+
return ERROR(memory_allocation);
|
|
1034
1188
|
}
|
|
1035
1189
|
data->ctx = &ctx;
|
|
1036
1190
|
data->best = &best;
|
|
@@ -1040,6 +1194,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
|
1040
1194
|
data->parameters.d = d;
|
|
1041
1195
|
data->parameters.splitPoint = splitPoint;
|
|
1042
1196
|
data->parameters.steps = kSteps;
|
|
1197
|
+
data->parameters.shrinkDict = shrinkDict;
|
|
1043
1198
|
data->parameters.zParams.notificationLevel = g_displayLevel;
|
|
1044
1199
|
/* Check the parameters */
|
|
1045
1200
|
if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
|