extzstd 0.2 → 0.3
- checksums.yaml +4 -4
- data/HISTORY.ja.md +13 -0
- data/README.md +17 -14
- data/contrib/zstd/{NEWS → CHANGELOG} +115 -2
- data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
- data/contrib/zstd/Makefile +99 -53
- data/contrib/zstd/README.md +59 -39
- data/contrib/zstd/TESTING.md +1 -1
- data/contrib/zstd/appveyor.yml +17 -6
- data/contrib/zstd/lib/BUCK +29 -2
- data/contrib/zstd/lib/Makefile +118 -21
- data/contrib/zstd/lib/README.md +84 -44
- data/contrib/zstd/lib/common/bitstream.h +17 -33
- data/contrib/zstd/lib/common/compiler.h +62 -8
- data/contrib/zstd/lib/common/cpu.h +215 -0
- data/contrib/zstd/lib/common/debug.c +44 -0
- data/contrib/zstd/lib/common/debug.h +134 -0
- data/contrib/zstd/lib/common/entropy_common.c +16 -1
- data/contrib/zstd/lib/common/error_private.c +7 -0
- data/contrib/zstd/lib/common/fse.h +48 -44
- data/contrib/zstd/lib/common/fse_decompress.c +3 -3
- data/contrib/zstd/lib/common/huf.h +169 -113
- data/contrib/zstd/lib/common/mem.h +20 -2
- data/contrib/zstd/lib/common/pool.c +135 -49
- data/contrib/zstd/lib/common/pool.h +40 -21
- data/contrib/zstd/lib/common/threading.c +2 -2
- data/contrib/zstd/lib/common/threading.h +12 -12
- data/contrib/zstd/lib/common/xxhash.c +3 -2
- data/contrib/zstd/lib/common/zstd_common.c +3 -6
- data/contrib/zstd/lib/common/zstd_errors.h +17 -7
- data/contrib/zstd/lib/common/zstd_internal.h +76 -48
- data/contrib/zstd/lib/compress/fse_compress.c +89 -209
- data/contrib/zstd/lib/compress/hist.c +203 -0
- data/contrib/zstd/lib/compress/hist.h +95 -0
- data/contrib/zstd/lib/compress/huf_compress.c +188 -80
- data/contrib/zstd/lib/compress/zstd_compress.c +2500 -1203
- data/contrib/zstd/lib/compress/zstd_compress_internal.h +463 -62
- data/contrib/zstd/lib/compress/zstd_double_fast.c +321 -131
- data/contrib/zstd/lib/compress/zstd_double_fast.h +13 -4
- data/contrib/zstd/lib/compress/zstd_fast.c +335 -108
- data/contrib/zstd/lib/compress/zstd_fast.h +12 -6
- data/contrib/zstd/lib/compress/zstd_lazy.c +654 -313
- data/contrib/zstd/lib/compress/zstd_lazy.h +44 -16
- data/contrib/zstd/lib/compress/zstd_ldm.c +310 -420
- data/contrib/zstd/lib/compress/zstd_ldm.h +63 -26
- data/contrib/zstd/lib/compress/zstd_opt.c +773 -325
- data/contrib/zstd/lib/compress/zstd_opt.h +31 -5
- data/contrib/zstd/lib/compress/zstdmt_compress.c +1468 -518
- data/contrib/zstd/lib/compress/zstdmt_compress.h +96 -45
- data/contrib/zstd/lib/decompress/huf_decompress.c +518 -282
- data/contrib/zstd/lib/decompress/zstd_ddict.c +240 -0
- data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
- data/contrib/zstd/lib/decompress/zstd_decompress.c +613 -1513
- data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1311 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_block.h +59 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +175 -0
- data/contrib/zstd/lib/dictBuilder/cover.c +194 -113
- data/contrib/zstd/lib/dictBuilder/cover.h +112 -0
- data/contrib/zstd/lib/dictBuilder/divsufsort.c +3 -3
- data/contrib/zstd/lib/dictBuilder/fastcover.c +740 -0
- data/contrib/zstd/lib/dictBuilder/zdict.c +142 -106
- data/contrib/zstd/lib/dictBuilder/zdict.h +115 -49
- data/contrib/zstd/lib/legacy/zstd_legacy.h +44 -12
- data/contrib/zstd/lib/legacy/zstd_v01.c +41 -10
- data/contrib/zstd/lib/legacy/zstd_v01.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v02.c +37 -12
- data/contrib/zstd/lib/legacy/zstd_v02.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v03.c +38 -12
- data/contrib/zstd/lib/legacy/zstd_v03.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v04.c +55 -174
- data/contrib/zstd/lib/legacy/zstd_v04.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v05.c +59 -31
- data/contrib/zstd/lib/legacy/zstd_v05.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v06.c +48 -20
- data/contrib/zstd/lib/legacy/zstd_v06.h +10 -5
- data/contrib/zstd/lib/legacy/zstd_v07.c +62 -29
- data/contrib/zstd/lib/legacy/zstd_v07.h +10 -5
- data/contrib/zstd/lib/zstd.h +1346 -832
- data/ext/extzstd.c +27 -19
- data/ext/extzstd_stream.c +20 -4
- data/ext/zstd_compress.c +1 -0
- data/ext/zstd_decompress.c +4 -0
- data/ext/zstd_dictbuilder.c +4 -0
- data/ext/zstd_dictbuilder_fastcover.c +5 -0
- data/lib/extzstd.rb +52 -220
- data/lib/extzstd/version.rb +1 -1
- metadata +21 -7
- data/contrib/zstd/circle.yml +0 -63
data/contrib/zstd/lib/decompress/zstd_decompress_block.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#ifndef ZSTD_DEC_BLOCK_H
+#define ZSTD_DEC_BLOCK_H
+
+/*-*******************************************************
+ * Dependencies
+ *********************************************************/
+#include <stddef.h>   /* size_t */
+#include "zstd.h"    /* DCtx, and some public functions */
+#include "zstd_internal.h"  /* blockProperties_t, and some public functions */
+#include "zstd_decompress_internal.h"  /* ZSTD_seqSymbol */
+
+
+/* ===   Prototypes   === */
+
+/* note: prototypes already published within `zstd.h` :
+ * ZSTD_decompressBlock()
+ */
+
+/* note: prototypes already published within `zstd_internal.h` :
+ * ZSTD_getcBlockSize()
+ * ZSTD_decodeSeqHeaders()
+ */
+
+
+/* ZSTD_decompressBlock_internal() :
+ * decompress block, starting at `src`,
+ * into destination buffer `dst`.
+ * @return : decompressed block size,
+ *           or an error code (which can be tested using ZSTD_isError())
+ */
+size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize, const int frame);
+
+/* ZSTD_buildFSETable() :
+ * generate FSE decoding table for one symbol (ll, ml or off)
+ * this function must be called with valid parameters only
+ * (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.)
+ * in which case it cannot fail.
+ * Internal use only.
+ */
+void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+            const short* normalizedCounter, unsigned maxSymbolValue,
+            const U32* baseValue, const U32* nbAdditionalBits,
+            unsigned tableLog);
+
+
+#endif /* ZSTD_DEC_BLOCK_H */
data/contrib/zstd/lib/decompress/zstd_decompress_internal.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* zstd_decompress_internal:
+ * objects and definitions shared within lib/decompress modules */
+
+#ifndef ZSTD_DECOMPRESS_INTERNAL_H
+#define ZSTD_DECOMPRESS_INTERNAL_H
+
+
+/*-*******************************************************
+ * Dependencies
+ *********************************************************/
+#include "mem.h"             /* BYTE, U16, U32 */
+#include "zstd_internal.h"   /* ZSTD_seqSymbol */
+
+
+
+/*-*******************************************************
+ * Constants
+ *********************************************************/
+static const U32 LL_base[MaxLL+1] = {
+                 0, 1, 2, 3, 4, 5, 6, 7,
+                 8, 9, 10, 11, 12, 13, 14, 15,
+                 16, 18, 20, 22, 24, 28, 32, 40,
+                 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                 0x2000, 0x4000, 0x8000, 0x10000 };
+
+static const U32 OF_base[MaxOff+1] = {
+                 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D,
+                 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD,
+                 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+                 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD };
+
+static const U32 OF_bits[MaxOff+1] = {
+                 0, 1, 2, 3, 4, 5, 6, 7,
+                 8, 9, 10, 11, 12, 13, 14, 15,
+                 16, 17, 18, 19, 20, 21, 22, 23,
+                 24, 25, 26, 27, 28, 29, 30, 31 };
+
+static const U32 ML_base[MaxML+1] = {
+                 3, 4, 5, 6, 7, 8, 9, 10,
+                 11, 12, 13, 14, 15, 16, 17, 18,
+                 19, 20, 21, 22, 23, 24, 25, 26,
+                 27, 28, 29, 30, 31, 32, 33, 34,
+                 35, 37, 39, 41, 43, 47, 51, 59,
+                 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+                 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
+
+
+/*-*******************************************************
+ * Decompression types
+ *********************************************************/
+typedef struct {
+    U32 fastMode;
+    U32 tableLog;
+} ZSTD_seqSymbol_header;
+
+typedef struct {
+    U16  nextState;
+    BYTE nbAdditionalBits;
+    BYTE nbBits;
+    U32  baseValue;
+} ZSTD_seqSymbol;
+
+#define SEQSYMBOL_TABLE_SIZE(log)   (1 + (1 << (log)))
+
+typedef struct {
+    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];    /* Note : Space reserved for FSE Tables */
+    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];   /* is also used as temporary workspace while building hufTable during DDict creation */
+    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];    /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
+    HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)];  /* can accommodate HUF_decompress4X */
+    U32 rep[ZSTD_REP_NUM];
+} ZSTD_entropyDTables_t;
+
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock,
+               ZSTDds_decompressLastBlock, ZSTDds_checkChecksum,
+               ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage;
+
+typedef enum { zdss_init=0, zdss_loadHeader,
+               zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
+
+typedef enum {
+    ZSTD_use_indefinitely = -1,  /* Use the dictionary indefinitely */
+    ZSTD_dont_use = 0,           /* Do not use the dictionary (if one exists free it) */
+    ZSTD_use_once = 1            /* Use the dictionary once and set to ZSTD_dont_use */
+} ZSTD_dictUses_e;
+
+struct ZSTD_DCtx_s
+{
+    const ZSTD_seqSymbol* LLTptr;
+    const ZSTD_seqSymbol* MLTptr;
+    const ZSTD_seqSymbol* OFTptr;
+    const HUF_DTable* HUFptr;
+    ZSTD_entropyDTables_t entropy;
+    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];   /* space needed when building huffman tables */
+    const void* previousDstEnd;   /* detect continuity */
+    const void* prefixStart;      /* start of current segment */
+    const void* virtualStart;     /* virtual start of previous segment if it was just before current one */
+    const void* dictEnd;          /* end of previous segment */
+    size_t expected;
+    ZSTD_frameHeader fParams;
+    U64 decodedSize;
+    blockType_e bType;            /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */
+    ZSTD_dStage stage;
+    U32 litEntropy;
+    U32 fseEntropy;
+    XXH64_state_t xxhState;
+    size_t headerSize;
+    ZSTD_format_e format;
+    const BYTE* litPtr;
+    ZSTD_customMem customMem;
+    size_t litSize;
+    size_t rleSize;
+    size_t staticSize;
+    int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+
+    /* dictionary */
+    ZSTD_DDict* ddictLocal;
+    const ZSTD_DDict* ddict;      /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
+    U32 dictID;
+    int ddictIsCold;              /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
+    ZSTD_dictUses_e dictUses;
+
+    /* streaming */
+    ZSTD_dStreamStage streamStage;
+    char*  inBuff;
+    size_t inBuffSize;
+    size_t inPos;
+    size_t maxWindowSize;
+    char*  outBuff;
+    size_t outBuffSize;
+    size_t outStart;
+    size_t outEnd;
+    size_t lhSize;
+    void* legacyContext;
+    U32 previousLegacyVersion;
+    U32 legacyVersion;
+    U32 hostageByte;
+    int noForwardProgress;
+
+    /* workspace */
+    BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
+    BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
+};  /* typedef'd to ZSTD_DCtx within "zstd.h" */
+
+
+/*-*******************************************************
+ * Shared internal functions
+ *********************************************************/
+
+/*! ZSTD_loadDEntropy() :
+ *  dict : must point at beginning of a valid zstd dictionary.
+ * @return : size of entropy tables read */
+size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
+                         const void* const dict, size_t const dictSize);
+
+/*! ZSTD_checkContinuity() :
+ *  check if next `dst` follows previous position, where decompression ended.
+ *  If yes, do nothing (continue on current segment).
+ *  If not, classify previous segment as "external dictionary", and start a new segment.
+ *  This function cannot fail. */
+void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst);
+
+
+#endif /* ZSTD_DECOMPRESS_INTERNAL_H */
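The `ZSTD_seqSymbol` layout and the `SEQSYMBOL_TABLE_SIZE` macro above make the per-context table footprint easy to estimate. The sketch below is illustration only and is not part of the diff: each entry packs into 8 bytes (U16 + BYTE + BYTE + U32), and a table of log `L` holds `1 + (1 << L)` entries. The concrete values of `LLFSELog`, `OffFSELog`, `MLFSELog` and `HufLog` come from `zstd_internal.h` and are not shown here; a table log of 9 is assumed purely for the example.

```c
/* Illustration only: mirrors the ZSTD_seqSymbol layout and the
 * SEQSYMBOL_TABLE_SIZE macro from the header above to estimate one
 * FSE decoding table's footprint.  The table log of 9 is an assumption,
 * not a value taken from this diff. */
#include <stdio.h>
#include <stdint.h>

typedef struct {
    uint16_t nextState;
    uint8_t  nbAdditionalBits;
    uint8_t  nbBits;
    uint32_t baseValue;
} seqSymbol_like;                       /* same layout as ZSTD_seqSymbol above */

#define SEQSYMBOL_TABLE_SIZE(log) (1 + (1 << (log)))

int main(void) {
    size_t const entrySize = sizeof(seqSymbol_like);    /* typically 8 bytes */
    size_t const nbEntries = SEQSYMBOL_TABLE_SIZE(9);   /* 513 for an assumed log of 9 */
    printf("one FSE decoding table: %zu entries, %zu bytes\n",
           nbEntries, nbEntries * entrySize);            /* 513 entries, 4104 bytes */
    return 0;
}
```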
data/contrib/zstd/lib/dictBuilder/cover.c
@@ -29,6 +29,7 @@
 #include "mem.h" /* read */
 #include "pool.h"
 #include "threading.h"
+#include "cover.h"
 #include "zstd_internal.h" /* includes zstd.h */
 #ifndef ZDICT_STATIC_LINKING_ONLY
 #define ZDICT_STATIC_LINKING_ONLY
@@ -38,7 +39,8 @@
 /*-*************************************
 * Constants
 ***************************************/
-#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((
+#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
+#define DEFAULT_SPLITPOINT 1.0
 
 /*-*************************************
 * Console display
@@ -184,7 +186,7 @@ static void COVER_map_remove(COVER_map_t *map, U32 key) {
 }
 
 /**
- *
+ * Destroys a map that is inited with COVER_map_init().
  */
 static void COVER_map_destroy(COVER_map_t *map) {
   if (map->data) {
@@ -203,6 +205,8 @@ typedef struct {
   size_t *offsets;
   const size_t *samplesSizes;
   size_t nbSamples;
+  size_t nbTrainSamples;
+  size_t nbTestSamples;
   U32 *suffix;
   size_t suffixSize;
   U32 *freqs;
@@ -220,9 +224,9 @@ static COVER_ctx_t *g_ctx = NULL;
 /**
  * Returns the sum of the sample sizes.
  */
-
+size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
   size_t sum = 0;
-
+  unsigned i;
   for (i = 0; i < nbSamples; ++i) {
     sum += samplesSizes[i];
   }
@@ -377,14 +381,6 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
   ctx->suffix[dmerId] = freq;
 }
 
-/**
- * A segment is a range in the source as well as the score of the segment.
- */
-typedef struct {
-  U32 begin;
-  U32 end;
-  U32 score;
-} COVER_segment_t;
 
 /**
  * Selects the best segment in an epoch.
@@ -395,7 +391,7 @@ typedef struct {
  *
  * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
  *
- * Once the dmer d is in the
+ * Once the dmer d is in the dictionary we set F(d) = 0.
  */
 static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                            COVER_map_t *activeDmers, U32 begin,
@@ -439,7 +435,7 @@ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
       U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
       activeSegment.begin += 1;
       *delDmerOcc -= 1;
-      /* If this is the last
+      /* If this is the last occurrence of the dmer, subtract its score */
       if (*delDmerOcc == 0) {
         COVER_map_remove(activeDmers, delDmer);
         activeSegment.score -= freqs[delDmer];
@@ -494,6 +490,10 @@ static int COVER_checkParameters(ZDICT_cover_params_t parameters,
   if (parameters.d > parameters.k) {
     return 0;
   }
+  /* 0 < splitPoint <= 1 */
+  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){
+    return 0;
+  }
   return 1;
 }
 
@@ -531,25 +531,44 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
  */
 static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                           const size_t *samplesSizes, unsigned nbSamples,
-                          unsigned d) {
+                          unsigned d, double splitPoint) {
   const BYTE *const samples = (const BYTE *)samplesBuffer;
   const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+  /* Split samples into testing and training sets */
+  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+  const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+  const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
   /* Checks */
   if (totalSamplesSize < MAX(d, sizeof(U64)) ||
       totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
-    DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n",
-                 (COVER_MAX_SAMPLES_SIZE >> 20));
+    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                 (unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
+    return 0;
+  }
+  /* Check if there are at least 5 training samples */
+  if (nbTrainSamples < 5) {
+    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
+    return 0;
+  }
+  /* Check if there's testing sample */
+  if (nbTestSamples < 1) {
+    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
    return 0;
   }
   /* Zero the context */
   memset(ctx, 0, sizeof(*ctx));
-  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n",
-               (
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
+               (unsigned)trainingSamplesSize);
+  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
+               (unsigned)testSamplesSize);
   ctx->samples = samples;
   ctx->samplesSizes = samplesSizes;
   ctx->nbSamples = nbSamples;
+  ctx->nbTrainSamples = nbTrainSamples;
+  ctx->nbTestSamples = nbTestSamples;
   /* Partial suffix array */
-  ctx->suffixSize =
+  ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
   ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
   /* Maps index to the dmerID */
   ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
@@ -563,7 +582,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
   ctx->freqs = NULL;
   ctx->d = d;
 
-  /* Fill offsets from the
+  /* Fill offsets from the samplesSizes */
   {
     U32 i;
     ctx->offsets[0] = 0;
@@ -581,10 +600,17 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
     for (i = 0; i < ctx->suffixSize; ++i) {
       ctx->suffix[i] = i;
     }
-    /* qsort doesn't take an opaque pointer, so pass as a global
+    /* qsort doesn't take an opaque pointer, so pass as a global.
+     * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
+     */
     g_ctx = ctx;
+#if defined(__OpenBSD__)
+    mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
+          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+#else
     qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
           (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+#endif
   }
   DISPLAYLEVEL(2, "Computing frequencies\n");
   /* For each dmer group (group of positions with the same first d bytes):
@@ -601,6 +627,39 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
   return 1;
 }
 
+void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
+{
+  const double ratio = (double)nbDmers / maxDictSize;
+  if (ratio >= 10) {
+    return;
+  }
+  LOCALDISPLAYLEVEL(displayLevel, 1,
+                    "WARNING: The maximum dictionary size %u is too large "
+                    "compared to the source size %u! "
+                    "size(source)/size(dictionary) = %f, but it should be >= "
+                    "10! This may lead to a subpar dictionary! We recommend "
+                    "training on sources at least 10x, and up to 100x the "
+                    "size of the dictionary!\n", (U32)maxDictSize,
+                    (U32)nbDmers, ratio);
+}
+
+COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
+                                       U32 nbDmers, U32 k, U32 passes)
+{
+  const U32 minEpochSize = k * 10;
+  COVER_epoch_info_t epochs;
+  epochs.num = MAX(1, maxDictSize / k / passes);
+  epochs.size = nbDmers / epochs.num;
+  if (epochs.size >= minEpochSize) {
+      assert(epochs.size * epochs.num <= nbDmers);
+      return epochs;
+  }
+  epochs.size = MIN(minEpochSize, nbDmers);
+  epochs.num = nbDmers / epochs.size;
+  assert(epochs.size * epochs.num <= nbDmers);
+  return epochs;
+}
+
 /**
  * Given the prepared context build the dictionary.
 */
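The new `COVER_computeEpochs()` above centralizes the epoch arithmetic that used to live inline in `COVER_buildDictionary()`. The short sketch below restates that arithmetic outside of zstd, with made-up inputs, just to show how the dictionary budget, `k`, and the dmer count translate into epoch count and size (including the small-corpus fallback branch); it is not part of the diff.

```c
/* Standalone restatement of COVER_computeEpochs()'s arithmetic, for
 * illustration only; the input numbers below are hypothetical. */
#include <stdio.h>

static void compute_epochs(unsigned maxDictSize, unsigned nbDmers,
                           unsigned k, unsigned passes) {
    unsigned const minEpochSize = k * 10;
    unsigned num = maxDictSize / k / passes;    /* MAX(1, ...) below */
    unsigned size;
    if (num < 1) num = 1;
    size = nbDmers / num;
    if (size < minEpochSize) {                  /* fallback for small corpora */
        size = (minEpochSize < nbDmers) ? minEpochSize : nbDmers;
        num  = nbDmers / size;
    }
    printf("epochs.num=%u epochs.size=%u\n", num, size);
}

int main(void) {
    compute_epochs(112640, 1000000, 1024, 4);   /* num=27, size=37037 */
    compute_epochs(112640,   50000, 1024, 4);   /* small corpus: num=4, size=10240 */
    return 0;
}
```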
@@ -610,28 +669,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                     ZDICT_cover_params_t parameters) {
   BYTE *const dict = (BYTE *)dictBuffer;
   size_t tail = dictBufferCapacity;
-  /* Divide the data
-
-
-  const
-
+  /* Divide the data into epochs. We will select one segment from each epoch. */
+  const COVER_epoch_info_t epochs = COVER_computeEpochs(
+      (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
+  const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
+  size_t zeroScoreRun = 0;
   size_t epoch;
-  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
-
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
+               (U32)epochs.num, (U32)epochs.size);
   /* Loop through the epochs until there are no more segments or the dictionary
    * is full.
    */
-  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
-    const U32 epochBegin = (U32)(epoch *
-    const U32 epochEnd = epochBegin +
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+    const U32 epochBegin = (U32)(epoch * epochs.size);
+    const U32 epochEnd = epochBegin + epochs.size;
     size_t segmentSize;
     /* Select a segment */
     COVER_segment_t segment = COVER_selectSegment(
         ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
-    /* If the segment covers no dmers, then we are out of content
+    /* If the segment covers no dmers, then we are out of content.
+     * There may be new content in other epochs, for continue for some time.
+     */
     if (segment.score == 0) {
-
+      if (++zeroScoreRun >= maxZeroScoreRun) {
+          break;
+      }
+      continue;
     }
+    zeroScoreRun = 0;
     /* Trim the segment if necessary and if it is too small then we are done */
     segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
     if (segmentSize < parameters.d) {
@@ -644,19 +709,23 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
     memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
     DISPLAYUPDATE(
         2, "\r%u%% ",
-        (
+        (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
   }
   DISPLAYLEVEL(2, "\r%79s\r", "");
   return tail;
 }
 
 ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
-    void *dictBuffer, size_t dictBufferCapacity,
-    const size_t *samplesSizes, unsigned nbSamples,
-    ZDICT_cover_params_t parameters)
-
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t parameters)
+{
+  BYTE* const dict = (BYTE*)dictBuffer;
   COVER_ctx_t ctx;
   COVER_map_t activeDmers;
+  parameters.splitPoint = 1.0;
+  /* Initialize global data */
+  g_displayLevel = parameters.zParams.notificationLevel;
   /* Checks */
   if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
     DISPLAYLEVEL(1, "Cover parameters incorrect\n");
@@ -671,13 +740,12 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
                  ZDICT_DICTSIZE_MIN);
     return ERROR(dstSize_tooSmall);
   }
-  /* Initialize global data */
-  g_displayLevel = parameters.zParams.notificationLevel;
   /* Initialize context and activeDmers */
   if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
-                      parameters.d)) {
+                      parameters.d, parameters.splitPoint)) {
     return ERROR(GENERIC);
   }
+  COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
     DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
     COVER_ctx_destroy(&ctx);
@@ -694,7 +762,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
         samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
     if (!ZSTD_isError(dictionarySize)) {
       DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
-                   (
+                   (unsigned)dictionarySize);
     }
     COVER_ctx_destroy(&ctx);
     COVER_map_destroy(&activeDmers);
@@ -702,28 +770,65 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
   }
 }
 
-
-
-
- *
- *
-
- *
-
-
-
-
-
-
- size_t
-
-
-
+
+
+size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+                                      const size_t *samplesSizes, const BYTE *samples,
+                                      size_t *offsets,
+                                      size_t nbTrainSamples, size_t nbSamples,
+                                      BYTE *const dict, size_t dictBufferCapacity) {
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Pointers */
+  ZSTD_CCtx *cctx;
+  ZSTD_CDict *cdict;
+  void *dst;
+  /* Local variables */
+  size_t dstCapacity;
+  size_t i;
+  /* Allocate dst with enough space to compress the maximum sized sample */
+  {
+    size_t maxSampleSize = 0;
+    i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+    for (; i < nbSamples; ++i) {
+      maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
+    }
+    dstCapacity = ZSTD_compressBound(maxSampleSize);
+    dst = malloc(dstCapacity);
+  }
+  /* Create the cctx and cdict */
+  cctx = ZSTD_createCCtx();
+  cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                           parameters.zParams.compressionLevel);
+  if (!dst || !cctx || !cdict) {
+    goto _compressCleanup;
+  }
+  /* Compress each sample and sum their sizes (or error) */
+  totalCompressedSize = dictBufferCapacity;
+  i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+  for (; i < nbSamples; ++i) {
+    const size_t size = ZSTD_compress_usingCDict(
+        cctx, dst, dstCapacity, samples + offsets[i],
+        samplesSizes[i], cdict);
+    if (ZSTD_isError(size)) {
+      totalCompressedSize = ERROR(GENERIC);
+      goto _compressCleanup;
+    }
+    totalCompressedSize += size;
+  }
+_compressCleanup:
+  ZSTD_freeCCtx(cctx);
+  ZSTD_freeCDict(cdict);
+  if (dst) {
+    free(dst);
+  }
+  return totalCompressedSize;
+}
+
 
 /**
  * Initialize the `COVER_best_t`.
 */
-
+void COVER_best_init(COVER_best_t *best) {
   if (best==NULL) return; /* compatible with init on NULL */
   (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
   (void)ZSTD_pthread_cond_init(&best->cond, NULL);
@@ -737,7 +842,7 @@ static void COVER_best_init(COVER_best_t *best) {
 /**
  * Wait until liveJobs == 0.
 */
-
+void COVER_best_wait(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -751,7 +856,7 @@ static void COVER_best_wait(COVER_best_t *best) {
 /**
  * Call COVER_best_wait() and then destroy the COVER_best_t.
 */
-
+void COVER_best_destroy(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -767,7 +872,7 @@ static void COVER_best_destroy(COVER_best_t *best) {
  * Called when a thread is about to be launched.
  * Increments liveJobs.
 */
-
+void COVER_best_start(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -781,7 +886,7 @@ static void COVER_best_start(COVER_best_t *best) {
 * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
 * If this dictionary is the best so far save it and its parameters.
 */
-
+void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
                        ZDICT_cover_params_t parameters, void *dict,
                        size_t dictSize) {
   if (!best) {
@@ -803,6 +908,8 @@ static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
       if (!best->dict) {
         best->compressedSize = ERROR(GENERIC);
         best->dictSize = 0;
+        ZSTD_pthread_cond_signal(&best->cond);
+        ZSTD_pthread_mutex_unlock(&best->mutex);
         return;
       }
     }
@@ -812,10 +919,10 @@ static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
       best->parameters = parameters;
       best->compressedSize = compressedSize;
     }
-    ZSTD_pthread_mutex_unlock(&best->mutex);
     if (liveJobs == 0) {
       ZSTD_pthread_cond_broadcast(&best->cond);
     }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
   }
 }
 
@@ -830,7 +937,7 @@ typedef struct COVER_tryParameters_data_s {
 } COVER_tryParameters_data_t;
 
 /**
- * Tries a set of parameters and
+ * Tries a set of parameters and updates the COVER_best_t with the results.
 * This function is thread safe if zstd is compiled with multithreaded support.
 * It takes its parameters as an *OWNING* opaque pointer to support threading.
 */
@@ -861,7 +968,7 @@ static void COVER_tryParameters(void *opaque) {
                                               dictBufferCapacity, parameters);
     dictBufferCapacity = ZDICT_finalizeDictionary(
         dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
-        ctx->samples, ctx->samplesSizes, (unsigned)ctx->
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples,
         parameters.zParams);
     if (ZDICT_isError(dictBufferCapacity)) {
       DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
@@ -869,49 +976,10 @@ static void COVER_tryParameters(void *opaque) {
     }
   }
   /* Check total compressed size */
-
-
-
-
-    void *dst;
-    /* Local variables */
-    size_t dstCapacity;
-    size_t i;
-    /* Allocate dst with enough space to compress the maximum sized sample */
-    {
-      size_t maxSampleSize = 0;
-      for (i = 0; i < ctx->nbSamples; ++i) {
-        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
-      }
-      dstCapacity = ZSTD_compressBound(maxSampleSize);
-      dst = malloc(dstCapacity);
-    }
-    /* Create the cctx and cdict */
-    cctx = ZSTD_createCCtx();
-    cdict = ZSTD_createCDict(dict, dictBufferCapacity,
-                             parameters.zParams.compressionLevel);
-    if (!dst || !cctx || !cdict) {
-      goto _compressCleanup;
-    }
-    /* Compress each sample and sum their sizes (or error) */
-    totalCompressedSize = dictBufferCapacity;
-    for (i = 0; i < ctx->nbSamples; ++i) {
-      const size_t size = ZSTD_compress_usingCDict(
-          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
-          ctx->samplesSizes[i], cdict);
-      if (ZSTD_isError(size)) {
-        totalCompressedSize = ERROR(GENERIC);
-        goto _compressCleanup;
-      }
-      totalCompressedSize += size;
-    }
-  _compressCleanup:
-    ZSTD_freeCCtx(cctx);
-    ZSTD_freeCDict(cdict);
-    if (dst) {
-      free(dst);
-    }
-  }
+  totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
+                                                       ctx->samples, ctx->offsets,
+                                                       ctx->nbTrainSamples, ctx->nbSamples,
+                                                       dict, dictBufferCapacity);
 
 _cleanup:
   COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
@@ -932,6 +1000,8 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
     ZDICT_cover_params_t *parameters) {
   /* constants */
   const unsigned nbThreads = parameters->nbThreads;
+  const double splitPoint =
+      parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
   const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
   const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
   const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
@@ -947,7 +1017,13 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
   unsigned k;
   COVER_best_t best;
   POOL_ctx *pool = NULL;
+  int warned = 0;
+
   /* Checks */
+  if (splitPoint <= 0 || splitPoint > 1) {
+    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+    return ERROR(GENERIC);
+  }
   if (kMinK < kMaxD || kMaxK < kMinK) {
     LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
     return ERROR(GENERIC);
@@ -978,12 +1054,16 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
     /* Initialize the context for this value of d */
     COVER_ctx_t ctx;
     LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
-    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) {
+    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint)) {
       LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
       COVER_best_destroy(&best);
       POOL_free(pool);
       return ERROR(GENERIC);
     }
+    if (!warned) {
+      COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
+      warned = 1;
+    }
     /* Loop through k reusing the same context */
     for (k = kMinK; k <= kMaxK; k += kStepSize) {
       /* Prepare the arguments */
@@ -1003,6 +1083,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
       data->parameters = *parameters;
       data->parameters.k = k;
       data->parameters.d = d;
+      data->parameters.splitPoint = splitPoint;
       data->parameters.steps = kSteps;
       data->parameters.zParams.notificationLevel = g_displayLevel;
       /* Check the parameters */
@@ -1020,7 +1101,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
       }
       /* Print status */
       LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
-                         (
+                         (unsigned)((iteration * 100) / kIterations));
       ++iteration;
     }
     COVER_best_wait(&best);
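Taken together, the cover.c changes above add a train/test split (`splitPoint`), a small-corpus warning, and shared epoch computation to the cover dictionary trainer. A minimal sketch of how an application might drive the optimizer with the new parameter follows; it assumes zstd's static-linking-only `zdict.h` API, and the buffer names (`dictBuf`, `samples`, `sampleSizes`) are hypothetical application data, not part of this diff.

```c
/* Minimal sketch (not from the diff): train a dictionary with the cover
 * algorithm, holding out 20% of the samples for testing via splitPoint.
 * Assumes zdict.h with ZDICT_STATIC_LINKING_ONLY. */
#define ZDICT_STATIC_LINKING_ONLY
#include <zdict.h>
#include <string.h>
#include <stdio.h>

size_t train_cover_dict(void* dictBuf, size_t dictCap,
                        const void* samples, const size_t* sampleSizes,
                        unsigned nbSamples)
{
    ZDICT_cover_params_t params;
    memset(&params, 0, sizeof(params));
    params.d = 0;            /* 0 lets the optimizer try d = 6 and 8 */
    params.k = 0;            /* 0 lets the optimizer search over k */
    params.steps = 40;
    params.splitPoint = 0.8; /* train on 80% of samples, test on the rest */
    params.nbThreads = 2;    /* only effective if zstd was built with multithreading */

    {   size_t const dictSize = ZDICT_optimizeTrainFromBuffer_cover(
            dictBuf, dictCap, samples, sampleSizes, nbSamples, &params);
        if (ZDICT_isError(dictSize)) {
            fprintf(stderr, "training failed: %s\n", ZDICT_getErrorName(dictSize));
            return 0;
        }
        return dictSize;   /* params now holds the winning k and d */
    }
}
```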