extzstd 0.2 → 0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/HISTORY.ja.md +13 -0
- data/README.md +17 -14
- data/contrib/zstd/{NEWS → CHANGELOG} +115 -2
- data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
- data/contrib/zstd/Makefile +99 -53
- data/contrib/zstd/README.md +59 -39
- data/contrib/zstd/TESTING.md +1 -1
- data/contrib/zstd/appveyor.yml +17 -6
- data/contrib/zstd/lib/BUCK +29 -2
- data/contrib/zstd/lib/Makefile +118 -21
- data/contrib/zstd/lib/README.md +84 -44
- data/contrib/zstd/lib/common/bitstream.h +17 -33
- data/contrib/zstd/lib/common/compiler.h +62 -8
- data/contrib/zstd/lib/common/cpu.h +215 -0
- data/contrib/zstd/lib/common/debug.c +44 -0
- data/contrib/zstd/lib/common/debug.h +134 -0
- data/contrib/zstd/lib/common/entropy_common.c +16 -1
- data/contrib/zstd/lib/common/error_private.c +7 -0
- data/contrib/zstd/lib/common/fse.h +48 -44
- data/contrib/zstd/lib/common/fse_decompress.c +3 -3
- data/contrib/zstd/lib/common/huf.h +169 -113
- data/contrib/zstd/lib/common/mem.h +20 -2
- data/contrib/zstd/lib/common/pool.c +135 -49
- data/contrib/zstd/lib/common/pool.h +40 -21
- data/contrib/zstd/lib/common/threading.c +2 -2
- data/contrib/zstd/lib/common/threading.h +12 -12
- data/contrib/zstd/lib/common/xxhash.c +3 -2
- data/contrib/zstd/lib/common/zstd_common.c +3 -6
- data/contrib/zstd/lib/common/zstd_errors.h +17 -7
- data/contrib/zstd/lib/common/zstd_internal.h +76 -48
- data/contrib/zstd/lib/compress/fse_compress.c +89 -209
- data/contrib/zstd/lib/compress/hist.c +203 -0
- data/contrib/zstd/lib/compress/hist.h +95 -0
- data/contrib/zstd/lib/compress/huf_compress.c +188 -80
- data/contrib/zstd/lib/compress/zstd_compress.c +2500 -1203
- data/contrib/zstd/lib/compress/zstd_compress_internal.h +463 -62
- data/contrib/zstd/lib/compress/zstd_double_fast.c +321 -131
- data/contrib/zstd/lib/compress/zstd_double_fast.h +13 -4
- data/contrib/zstd/lib/compress/zstd_fast.c +335 -108
- data/contrib/zstd/lib/compress/zstd_fast.h +12 -6
- data/contrib/zstd/lib/compress/zstd_lazy.c +654 -313
- data/contrib/zstd/lib/compress/zstd_lazy.h +44 -16
- data/contrib/zstd/lib/compress/zstd_ldm.c +310 -420
- data/contrib/zstd/lib/compress/zstd_ldm.h +63 -26
- data/contrib/zstd/lib/compress/zstd_opt.c +773 -325
- data/contrib/zstd/lib/compress/zstd_opt.h +31 -5
- data/contrib/zstd/lib/compress/zstdmt_compress.c +1468 -518
- data/contrib/zstd/lib/compress/zstdmt_compress.h +96 -45
- data/contrib/zstd/lib/decompress/huf_decompress.c +518 -282
- data/contrib/zstd/lib/decompress/zstd_ddict.c +240 -0
- data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
- data/contrib/zstd/lib/decompress/zstd_decompress.c +613 -1513
- data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1311 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_block.h +59 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +175 -0
- data/contrib/zstd/lib/dictBuilder/cover.c +194 -113
- data/contrib/zstd/lib/dictBuilder/cover.h +112 -0
- data/contrib/zstd/lib/dictBuilder/divsufsort.c +3 -3
- data/contrib/zstd/lib/dictBuilder/fastcover.c +740 -0
- data/contrib/zstd/lib/dictBuilder/zdict.c +142 -106
- data/contrib/zstd/lib/dictBuilder/zdict.h +115 -49
- data/contrib/zstd/lib/legacy/zstd_legacy.h +44 -12
- data/contrib/zstd/lib/legacy/zstd_v01.c +41 -10
- data/contrib/zstd/lib/legacy/zstd_v01.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v02.c +37 -12
- data/contrib/zstd/lib/legacy/zstd_v02.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v03.c +38 -12
- data/contrib/zstd/lib/legacy/zstd_v03.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v04.c +55 -174
- data/contrib/zstd/lib/legacy/zstd_v04.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v05.c +59 -31
- data/contrib/zstd/lib/legacy/zstd_v05.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v06.c +48 -20
- data/contrib/zstd/lib/legacy/zstd_v06.h +10 -5
- data/contrib/zstd/lib/legacy/zstd_v07.c +62 -29
- data/contrib/zstd/lib/legacy/zstd_v07.h +10 -5
- data/contrib/zstd/lib/zstd.h +1346 -832
- data/ext/extzstd.c +27 -19
- data/ext/extzstd_stream.c +20 -4
- data/ext/zstd_compress.c +1 -0
- data/ext/zstd_decompress.c +4 -0
- data/ext/zstd_dictbuilder.c +4 -0
- data/ext/zstd_dictbuilder_fastcover.c +5 -0
- data/lib/extzstd.rb +52 -220
- data/lib/extzstd/version.rb +1 -1
- metadata +21 -7
- data/contrib/zstd/circle.yml +0 -63

data/contrib/zstd/lib/decompress/zstd_decompress_block.h (new file):

@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#ifndef ZSTD_DEC_BLOCK_H
+#define ZSTD_DEC_BLOCK_H
+
+/*-*******************************************************
+ *  Dependencies
+ *********************************************************/
+#include <stddef.h>   /* size_t */
+#include "zstd.h"    /* DCtx, and some public functions */
+#include "zstd_internal.h"  /* blockProperties_t, and some public functions */
+#include "zstd_decompress_internal.h"  /* ZSTD_seqSymbol */
+
+
+/* ===   Prototypes   === */
+
+/* note: prototypes already published within `zstd.h` :
+ * ZSTD_decompressBlock()
+ */
+
+/* note: prototypes already published within `zstd_internal.h` :
+ * ZSTD_getcBlockSize()
+ * ZSTD_decodeSeqHeaders()
+ */
+
+
+/* ZSTD_decompressBlock_internal() :
+ * decompress block, starting at `src`,
+ * into destination buffer `dst`.
+ * @return : decompressed block size,
+ * or an error code (which can be tested using ZSTD_isError())
+ */
+size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+                      void* dst, size_t dstCapacity,
+                const void* src, size_t srcSize, const int frame);
+
+/* ZSTD_buildFSETable() :
+ * generate FSE decoding table for one symbol (ll, ml or off)
+ * this function must be called with valid parameters only
+ * (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.)
+ * in which case it cannot fail.
+ * Internal use only.
+ */
+void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+             const short* normalizedCounter, unsigned maxSymbolValue,
+             const U32* baseValue, const U32* nbAdditionalBits,
+             unsigned tableLog);
+
+
+#endif /* ZSTD_DEC_BLOCK_H */
data/contrib/zstd/lib/decompress/zstd_decompress_internal.h (new file):

@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* zstd_decompress_internal:
+ * objects and definitions shared within lib/decompress modules */
+
+#ifndef ZSTD_DECOMPRESS_INTERNAL_H
+#define ZSTD_DECOMPRESS_INTERNAL_H
+
+
+/*-*******************************************************
+ *  Dependencies
+ *********************************************************/
+#include "mem.h"             /* BYTE, U16, U32 */
+#include "zstd_internal.h"   /* ZSTD_seqSymbol */
+
+
+
+/*-*******************************************************
+ *  Constants
+ *********************************************************/
+static const U32 LL_base[MaxLL+1] = {
+                 0,    1,    2,     3,     4,     5,     6,      7,
+                 8,    9,   10,    11,    12,    13,    14,     15,
+                16,   18,   20,    22,    24,    28,    32,     40,
+                48,   64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                0x2000, 0x4000, 0x8000, 0x10000 };
+
+static const U32 OF_base[MaxOff+1] = {
+                 0,        1,       1,       5,     0xD,     0x1D,     0x3D,     0x7D,
+                 0xFD,   0x1FD,   0x3FD,   0x7FD,   0xFFD,   0x1FFD,   0x3FFD,   0x7FFD,
+                 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+                 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD };
+
+static const U32 OF_bits[MaxOff+1] = {
+                     0,  1,  2,  3,  4,  5,  6,  7,
+                     8,  9, 10, 11, 12, 13, 14, 15,
+                    16, 17, 18, 19, 20, 21, 22, 23,
+                    24, 25, 26, 27, 28, 29, 30, 31 };
+
+static const U32 ML_base[MaxML+1] = {
+                     3,  4,  5,    6,     7,     8,     9,    10,
+                    11, 12, 13,   14,    15,    16,    17,    18,
+                    19, 20, 21,   22,    23,    24,    25,    26,
+                    27, 28, 29,   30,    31,    32,    33,    34,
+                    35, 37, 39,   41,    43,    47,    51,    59,
+                    67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+                    0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
+
+
+/*-*******************************************************
+ *  Decompression types
+ *********************************************************/
+typedef struct {
+    U32 fastMode;
+    U32 tableLog;
+} ZSTD_seqSymbol_header;
+
+typedef struct {
+    U16  nextState;
+    BYTE nbAdditionalBits;
+    BYTE nbBits;
+    U32  baseValue;
+} ZSTD_seqSymbol;
+
+#define SEQSYMBOL_TABLE_SIZE(log)   (1 + (1 << (log)))
+
+typedef struct {
+    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];    /* Note : Space reserved for FSE Tables */
+    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];   /* is also used as temporary workspace while building hufTable during DDict creation */
+    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];    /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
+    HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)];  /* can accommodate HUF_decompress4X */
+    U32 rep[ZSTD_REP_NUM];
+} ZSTD_entropyDTables_t;
+
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock,
+               ZSTDds_decompressLastBlock, ZSTDds_checkChecksum,
+               ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage;
+
+typedef enum { zdss_init=0, zdss_loadHeader,
+               zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
+
+typedef enum {
+    ZSTD_use_indefinitely = -1,  /* Use the dictionary indefinitely */
+    ZSTD_dont_use = 0,           /* Do not use the dictionary (if one exists free it) */
+    ZSTD_use_once = 1            /* Use the dictionary once and set to ZSTD_dont_use */
+} ZSTD_dictUses_e;
+
+struct ZSTD_DCtx_s
+{
+    const ZSTD_seqSymbol* LLTptr;
+    const ZSTD_seqSymbol* MLTptr;
+    const ZSTD_seqSymbol* OFTptr;
+    const HUF_DTable* HUFptr;
+    ZSTD_entropyDTables_t entropy;
+    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];   /* space needed when building huffman tables */
+    const void* previousDstEnd;   /* detect continuity */
+    const void* prefixStart;      /* start of current segment */
+    const void* virtualStart;     /* virtual start of previous segment if it was just before current one */
+    const void* dictEnd;          /* end of previous segment */
+    size_t expected;
+    ZSTD_frameHeader fParams;
+    U64 decodedSize;
+    blockType_e bType;            /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */
+    ZSTD_dStage stage;
+    U32 litEntropy;
+    U32 fseEntropy;
+    XXH64_state_t xxhState;
+    size_t headerSize;
+    ZSTD_format_e format;
+    const BYTE* litPtr;
+    ZSTD_customMem customMem;
+    size_t litSize;
+    size_t rleSize;
+    size_t staticSize;
+    int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+
+    /* dictionary */
+    ZSTD_DDict* ddictLocal;
+    const ZSTD_DDict* ddict;      /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
+    U32 dictID;
+    int ddictIsCold;              /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
+    ZSTD_dictUses_e dictUses;
+
+    /* streaming */
+    ZSTD_dStreamStage streamStage;
+    char*  inBuff;
+    size_t inBuffSize;
+    size_t inPos;
+    size_t maxWindowSize;
+    char*  outBuff;
+    size_t outBuffSize;
+    size_t outStart;
+    size_t outEnd;
+    size_t lhSize;
+    void* legacyContext;
+    U32 previousLegacyVersion;
+    U32 legacyVersion;
+    U32 hostageByte;
+    int noForwardProgress;
+
+    /* workspace */
+    BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
+    BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
+};  /* typedef'd to ZSTD_DCtx within "zstd.h" */
+
+
+/*-*******************************************************
+ *  Shared internal functions
+ *********************************************************/
+
+/*! ZSTD_loadDEntropy() :
+ *  dict : must point at beginning of a valid zstd dictionary.
+ * @return : size of entropy tables read */
+size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
+                         const void* const dict, size_t const dictSize);
+
+/*! ZSTD_checkContinuity() :
+ *  check if next `dst` follows previous position, where decompression ended.
+ *  If yes, do nothing (continue on current segment).
+ *  If not, classify previous segment as "external dictionary", and start a new segment.
+ *  This function cannot fail. */
+void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst);
+
+
+#endif /* ZSTD_DECOMPRESS_INTERNAL_H */
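The new `ZSTD_entropyDTables_t` above sizes its per-symbol FSE tables with the `SEQSYMBOL_TABLE_SIZE(log)` macro and its Huffman table with `HUF_DTABLE_SIZE()`. As a rough sanity check of that arithmetic, here is a small standalone sketch that mirrors the two macros; the log constants (`LLFSELog = 9`, `OffFSELog = 8`, `MLFSELog = 9`, `HufLog = 12`) are assumed values copied by hand from the bundled `zstd_internal.h`/`huf.h`, not included from the real headers.

```c
/* Standalone sketch: mirrors SEQSYMBOL_TABLE_SIZE() and HUF_DTABLE_SIZE()
 * from the header above to show how many entries each decoding table
 * reserves. The *FSELog and HufLog values are hard-coded assumptions. */
#include <stdio.h>

#define SEQSYMBOL_TABLE_SIZE(log)    (1 + (1 << (log)))   /* same formula as zstd_decompress_internal.h */
#define HUF_DTABLE_SIZE(maxTableLog) (1 + (1 << (maxTableLog)))

enum { LLFSELog = 9, OffFSELog = 8, MLFSELog = 9, HufLog = 12 };  /* assumed values */

int main(void) {
    printf("LLTable  : %d entries\n", SEQSYMBOL_TABLE_SIZE(LLFSELog));   /* 513  */
    printf("OFTable  : %d entries\n", SEQSYMBOL_TABLE_SIZE(OffFSELog));  /* 257  */
    printf("MLTable  : %d entries\n", SEQSYMBOL_TABLE_SIZE(MLFSELog));   /* 513  */
    printf("hufTable : %d entries\n", HUF_DTABLE_SIZE(HufLog));          /* 4097 */
    return 0;
}
```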
data/contrib/zstd/lib/dictBuilder/cover.c (several removed lines are clipped in the source view and are kept as shown):

@@ -29,6 +29,7 @@
 #include "mem.h" /* read */
 #include "pool.h"
 #include "threading.h"
+#include "cover.h"
 #include "zstd_internal.h" /* includes zstd.h */
 #ifndef ZDICT_STATIC_LINKING_ONLY
 #define ZDICT_STATIC_LINKING_ONLY
@@ -38,7 +39,8 @@
 /*-*************************************
 *  Constants
 ***************************************/
-#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((
+#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
+#define DEFAULT_SPLITPOINT 1.0
 
 /*-*************************************
 *  Console display
@@ -184,7 +186,7 @@ static void COVER_map_remove(COVER_map_t *map, U32 key) {
 }
 
 /**
- *
+ * Destroys a map that is inited with COVER_map_init().
  */
 static void COVER_map_destroy(COVER_map_t *map) {
   if (map->data) {
@@ -203,6 +205,8 @@ typedef struct {
   size_t *offsets;
   const size_t *samplesSizes;
   size_t nbSamples;
+  size_t nbTrainSamples;
+  size_t nbTestSamples;
   U32 *suffix;
   size_t suffixSize;
   U32 *freqs;
@@ -220,9 +224,9 @@ static COVER_ctx_t *g_ctx = NULL;
 /**
  * Returns the sum of the sample sizes.
  */
-
+size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
   size_t sum = 0;
-
+  unsigned i;
   for (i = 0; i < nbSamples; ++i) {
     sum += samplesSizes[i];
   }
@@ -377,14 +381,6 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
     ctx->suffix[dmerId] = freq;
 }
 
-/**
- * A segment is a range in the source as well as the score of the segment.
- */
-typedef struct {
-  U32 begin;
-  U32 end;
-  U32 score;
-} COVER_segment_t;
 
 /**
  * Selects the best segment in an epoch.
@@ -395,7 +391,7 @@ typedef struct {
  *
  *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
  *
- * Once the dmer d is in the
+ * Once the dmer d is in the dictionary we set F(d) = 0.
  */
 static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                            COVER_map_t *activeDmers, U32 begin,
@@ -439,7 +435,7 @@ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
       U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
       activeSegment.begin += 1;
       *delDmerOcc -= 1;
-      /* If this is the last
+      /* If this is the last occurrence of the dmer, subtract its score */
       if (*delDmerOcc == 0) {
         COVER_map_remove(activeDmers, delDmer);
         activeSegment.score -= freqs[delDmer];
@@ -494,6 +490,10 @@ static int COVER_checkParameters(ZDICT_cover_params_t parameters,
   if (parameters.d > parameters.k) {
     return 0;
   }
+  /* 0 < splitPoint <= 1 */
+  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){
+    return 0;
+  }
   return 1;
 }
 
@@ -531,25 +531,44 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
  */
 static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                           const size_t *samplesSizes, unsigned nbSamples,
-                          unsigned d) {
+                          unsigned d, double splitPoint) {
   const BYTE *const samples = (const BYTE *)samplesBuffer;
   const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+  /* Split samples into testing and training sets */
+  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+  const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+  const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
   /* Checks */
   if (totalSamplesSize < MAX(d, sizeof(U64)) ||
       totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
-    DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n",
-                 (COVER_MAX_SAMPLES_SIZE >> 20));
+    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                 (unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
+    return 0;
+  }
+  /* Check if there are at least 5 training samples */
+  if (nbTrainSamples < 5) {
+    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
+    return 0;
+  }
+  /* Check if there's testing sample */
+  if (nbTestSamples < 1) {
+    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
     return 0;
   }
   /* Zero the context */
   memset(ctx, 0, sizeof(*ctx));
-  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n",
-               (
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
+               (unsigned)trainingSamplesSize);
+  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
+               (unsigned)testSamplesSize);
   ctx->samples = samples;
   ctx->samplesSizes = samplesSizes;
   ctx->nbSamples = nbSamples;
+  ctx->nbTrainSamples = nbTrainSamples;
+  ctx->nbTestSamples = nbTestSamples;
   /* Partial suffix array */
-  ctx->suffixSize =
+  ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
   ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
   /* Maps index to the dmerID */
   ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
@@ -563,7 +582,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
   ctx->freqs = NULL;
   ctx->d = d;
 
-  /* Fill offsets from the
+  /* Fill offsets from the samplesSizes */
   {
     U32 i;
     ctx->offsets[0] = 0;
@@ -581,10 +600,17 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
     for (i = 0; i < ctx->suffixSize; ++i) {
       ctx->suffix[i] = i;
     }
-    /* qsort doesn't take an opaque pointer, so pass as a global
+    /* qsort doesn't take an opaque pointer, so pass as a global.
+     * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
+     */
     g_ctx = ctx;
+#if defined(__OpenBSD__)
+    mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
+          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+#else
     qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+#endif
   }
   DISPLAYLEVEL(2, "Computing frequencies\n");
   /* For each dmer group (group of positions with the same first d bytes):
@@ -601,6 +627,39 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
   return 1;
 }
 
+void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
+{
+  const double ratio = (double)nbDmers / maxDictSize;
+  if (ratio >= 10) {
+      return;
+  }
+  LOCALDISPLAYLEVEL(displayLevel, 1,
+                    "WARNING: The maximum dictionary size %u is too large "
+                    "compared to the source size %u! "
+                    "size(source)/size(dictionary) = %f, but it should be >= "
+                    "10! This may lead to a subpar dictionary! We recommend "
+                    "training on sources at least 10x, and up to 100x the "
+                    "size of the dictionary!\n", (U32)maxDictSize,
+                    (U32)nbDmers, ratio);
+}
+
+COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
+                                       U32 nbDmers, U32 k, U32 passes)
+{
+  const U32 minEpochSize = k * 10;
+  COVER_epoch_info_t epochs;
+  epochs.num = MAX(1, maxDictSize / k / passes);
+  epochs.size = nbDmers / epochs.num;
+  if (epochs.size >= minEpochSize) {
+      assert(epochs.size * epochs.num <= nbDmers);
+      return epochs;
+  }
+  epochs.size = MIN(minEpochSize, nbDmers);
+  epochs.num = nbDmers / epochs.size;
+  assert(epochs.size * epochs.num <= nbDmers);
+  return epochs;
+}
+
 /**
  * Given the prepared context build the dictionary.
  */
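The `COVER_computeEpochs()` helper added above decides how the dmer range is cut into epochs; `COVER_buildDictionary()` (next hunk) then selects one segment per epoch. Below is a minimal standalone sketch of the same sizing logic with made-up inputs, just to show the numbers it produces; the struct name and inputs are illustrative, not taken from the library.

```c
/* Standalone sketch of the epoch sizing added above. The arithmetic mirrors
 * COVER_computeEpochs(); the inputs in main() are invented for illustration. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

typedef struct { uint32_t num; uint32_t size; } epoch_info;  /* stand-in for COVER_epoch_info_t */

static epoch_info compute_epochs(uint32_t maxDictSize, uint32_t nbDmers,
                                 uint32_t k, uint32_t passes) {
    const uint32_t minEpochSize = k * 10;
    epoch_info epochs;
    epochs.num  = MAX(1, maxDictSize / k / passes);  /* roughly `passes` segments per k bytes of budget */
    epochs.size = nbDmers / epochs.num;
    if (epochs.size >= minEpochSize) {
        assert(epochs.size * epochs.num <= nbDmers);
        return epochs;
    }
    /* epochs would be too small: clamp the size and recompute the count */
    epochs.size = MIN(minEpochSize, nbDmers);
    epochs.num  = nbDmers / epochs.size;
    assert(epochs.size * epochs.num <= nbDmers);
    return epochs;
}

int main(void) {
    /* e.g. a 110 KB dictionary budget, 1M dmers, k = 1024, 4 passes */
    epoch_info e = compute_epochs(110 * 1024, 1000000, 1024, 4);
    printf("epochs: num=%u size=%u dmers\n", e.num, e.size);  /* num=27, size=37037 */
    return 0;
}
```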
@@ -610,28 +669,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                     ZDICT_cover_params_t parameters) {
   BYTE *const dict = (BYTE *)dictBuffer;
   size_t tail = dictBufferCapacity;
-  /* Divide the data
-
-
-  const
-
+  /* Divide the data into epochs. We will select one segment from each epoch. */
+  const COVER_epoch_info_t epochs = COVER_computeEpochs(
+      (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
+  const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
+  size_t zeroScoreRun = 0;
   size_t epoch;
-  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
-
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
+                (U32)epochs.num, (U32)epochs.size);
   /* Loop through the epochs until there are no more segments or the dictionary
    * is full.
    */
-  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
-    const U32 epochBegin = (U32)(epoch *
-    const U32 epochEnd = epochBegin +
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+    const U32 epochBegin = (U32)(epoch * epochs.size);
+    const U32 epochEnd = epochBegin + epochs.size;
     size_t segmentSize;
     /* Select a segment */
     COVER_segment_t segment = COVER_selectSegment(
         ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
-    /* If the segment covers no dmers, then we are out of content
+    /* If the segment covers no dmers, then we are out of content.
+     * There may be new content in other epochs, for continue for some time.
+     */
     if (segment.score == 0) {
-
+      if (++zeroScoreRun >= maxZeroScoreRun) {
+          break;
+      }
+      continue;
     }
+    zeroScoreRun = 0;
     /* Trim the segment if necessary and if it is too small then we are done */
     segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
     if (segmentSize < parameters.d) {
@@ -644,19 +709,23 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
     memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
     DISPLAYUPDATE(
         2, "\r%u%%       ",
-        (
+        (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
   }
   DISPLAYLEVEL(2, "\r%79s\r", "");
   return tail;
 }
 
 ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
-    void *dictBuffer, size_t dictBufferCapacity,
-    const size_t *samplesSizes, unsigned nbSamples,
-    ZDICT_cover_params_t parameters)
-
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t parameters)
+{
+  BYTE* const dict = (BYTE*)dictBuffer;
   COVER_ctx_t ctx;
   COVER_map_t activeDmers;
+  parameters.splitPoint = 1.0;
+  /* Initialize global data */
+  g_displayLevel = parameters.zParams.notificationLevel;
   /* Checks */
   if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
     DISPLAYLEVEL(1, "Cover parameters incorrect\n");
@@ -671,13 +740,12 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
                  ZDICT_DICTSIZE_MIN);
     return ERROR(dstSize_tooSmall);
   }
-  /* Initialize global data */
-  g_displayLevel = parameters.zParams.notificationLevel;
   /* Initialize context and activeDmers */
   if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
-                      parameters.d)) {
+                      parameters.d, parameters.splitPoint)) {
     return ERROR(GENERIC);
   }
+  COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
     DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
     COVER_ctx_destroy(&ctx);
@@ -694,7 +762,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
         samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
     if (!ZSTD_isError(dictionarySize)) {
       DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
-                   (
+                   (unsigned)dictionarySize);
     }
     COVER_ctx_destroy(&ctx);
     COVER_map_destroy(&activeDmers);
@@ -702,28 +770,65 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
   }
 }
 
-
-
-
- *
- *
-
- *
-
-
-
-
-
-
-size_t
-
-
-
+
+
+size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+                                      const size_t *samplesSizes, const BYTE *samples,
+                                      size_t *offsets,
+                                      size_t nbTrainSamples, size_t nbSamples,
+                                      BYTE *const dict, size_t dictBufferCapacity) {
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Pointers */
+  ZSTD_CCtx *cctx;
+  ZSTD_CDict *cdict;
+  void *dst;
+  /* Local variables */
+  size_t dstCapacity;
+  size_t i;
+  /* Allocate dst with enough space to compress the maximum sized sample */
+  {
+    size_t maxSampleSize = 0;
+    i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+    for (; i < nbSamples; ++i) {
+      maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
+    }
+    dstCapacity = ZSTD_compressBound(maxSampleSize);
+    dst = malloc(dstCapacity);
+  }
+  /* Create the cctx and cdict */
+  cctx = ZSTD_createCCtx();
+  cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                           parameters.zParams.compressionLevel);
+  if (!dst || !cctx || !cdict) {
+    goto _compressCleanup;
+  }
+  /* Compress each sample and sum their sizes (or error) */
+  totalCompressedSize = dictBufferCapacity;
+  i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+  for (; i < nbSamples; ++i) {
+    const size_t size = ZSTD_compress_usingCDict(
+        cctx, dst, dstCapacity, samples + offsets[i],
+        samplesSizes[i], cdict);
+    if (ZSTD_isError(size)) {
+      totalCompressedSize = ERROR(GENERIC);
+      goto _compressCleanup;
+    }
+    totalCompressedSize += size;
+  }
+_compressCleanup:
+  ZSTD_freeCCtx(cctx);
+  ZSTD_freeCDict(cdict);
+  if (dst) {
+    free(dst);
+  }
+  return totalCompressedSize;
+}
+
 
 /**
  * Initialize the `COVER_best_t`.
 */
-
+void COVER_best_init(COVER_best_t *best) {
   if (best==NULL) return; /* compatible with init on NULL */
   (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
   (void)ZSTD_pthread_cond_init(&best->cond, NULL);
@@ -737,7 +842,7 @@ static void COVER_best_init(COVER_best_t *best) {
 /**
  * Wait until liveJobs == 0.
 */
-
+void COVER_best_wait(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -751,7 +856,7 @@ static void COVER_best_wait(COVER_best_t *best) {
 /**
  * Call COVER_best_wait() and then destroy the COVER_best_t.
 */
-
+void COVER_best_destroy(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -767,7 +872,7 @@ static void COVER_best_destroy(COVER_best_t *best) {
  * Called when a thread is about to be launched.
 * Increments liveJobs.
 */
-
+void COVER_best_start(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -781,7 +886,7 @@ static void COVER_best_start(COVER_best_t *best) {
 * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
 * If this dictionary is the best so far save it and its parameters.
 */
-
+void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
                        ZDICT_cover_params_t parameters, void *dict,
                        size_t dictSize) {
   if (!best) {
@@ -803,6 +908,8 @@ static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
       if (!best->dict) {
         best->compressedSize = ERROR(GENERIC);
         best->dictSize = 0;
+        ZSTD_pthread_cond_signal(&best->cond);
+        ZSTD_pthread_mutex_unlock(&best->mutex);
         return;
       }
     }
@@ -812,10 +919,10 @@ static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
       best->parameters = parameters;
       best->compressedSize = compressedSize;
     }
-    ZSTD_pthread_mutex_unlock(&best->mutex);
     if (liveJobs == 0) {
       ZSTD_pthread_cond_broadcast(&best->cond);
     }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
   }
 }
 
@@ -830,7 +937,7 @@ typedef struct COVER_tryParameters_data_s {
 } COVER_tryParameters_data_t;
 
 /**
- * Tries a set of parameters and
+ * Tries a set of parameters and updates the COVER_best_t with the results.
 * This function is thread safe if zstd is compiled with multithreaded support.
 * It takes its parameters as an *OWNING* opaque pointer to support threading.
 */
@@ -861,7 +968,7 @@ static void COVER_tryParameters(void *opaque) {
                                               dictBufferCapacity, parameters);
     dictBufferCapacity = ZDICT_finalizeDictionary(
         dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
-        ctx->samples, ctx->samplesSizes, (unsigned)ctx->
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples,
         parameters.zParams);
     if (ZDICT_isError(dictBufferCapacity)) {
       DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
@@ -869,49 +976,10 @@ static void COVER_tryParameters(void *opaque) {
     }
   }
   /* Check total compressed size */
-
-
-
-
-    void *dst;
-    /* Local variables */
-    size_t dstCapacity;
-    size_t i;
-    /* Allocate dst with enough space to compress the maximum sized sample */
-    {
-      size_t maxSampleSize = 0;
-      for (i = 0; i < ctx->nbSamples; ++i) {
-        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
-      }
-      dstCapacity = ZSTD_compressBound(maxSampleSize);
-      dst = malloc(dstCapacity);
-    }
-    /* Create the cctx and cdict */
-    cctx = ZSTD_createCCtx();
-    cdict = ZSTD_createCDict(dict, dictBufferCapacity,
-                             parameters.zParams.compressionLevel);
-    if (!dst || !cctx || !cdict) {
-      goto _compressCleanup;
-    }
-    /* Compress each sample and sum their sizes (or error) */
-    totalCompressedSize = dictBufferCapacity;
-    for (i = 0; i < ctx->nbSamples; ++i) {
-      const size_t size = ZSTD_compress_usingCDict(
-          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
-          ctx->samplesSizes[i], cdict);
-      if (ZSTD_isError(size)) {
-        totalCompressedSize = ERROR(GENERIC);
-        goto _compressCleanup;
-      }
-      totalCompressedSize += size;
-    }
-  _compressCleanup:
-    ZSTD_freeCCtx(cctx);
-    ZSTD_freeCDict(cdict);
-    if (dst) {
-      free(dst);
-    }
-  }
+  totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
+                                                       ctx->samples, ctx->offsets,
+                                                       ctx->nbTrainSamples, ctx->nbSamples,
+                                                       dict, dictBufferCapacity);
 
 _cleanup:
   COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
@@ -932,6 +1000,8 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
     ZDICT_cover_params_t *parameters) {
   /* constants */
   const unsigned nbThreads = parameters->nbThreads;
+  const double splitPoint =
+      parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
   const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
   const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
   const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
@@ -947,7 +1017,13 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
   unsigned k;
   COVER_best_t best;
   POOL_ctx *pool = NULL;
+  int warned = 0;
+
   /* Checks */
+  if (splitPoint <= 0 || splitPoint > 1) {
+    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+    return ERROR(GENERIC);
+  }
   if (kMinK < kMaxD || kMaxK < kMinK) {
     LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
     return ERROR(GENERIC);
@@ -978,12 +1054,16 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
     /* Initialize the context for this value of d */
     COVER_ctx_t ctx;
     LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
-    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) {
+    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint)) {
      LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
      COVER_best_destroy(&best);
      POOL_free(pool);
      return ERROR(GENERIC);
    }
+    if (!warned) {
+      COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
+      warned = 1;
+    }
    /* Loop through k reusing the same context */
    for (k = kMinK; k <= kMaxK; k += kStepSize) {
      /* Prepare the arguments */
@@ -1003,6 +1083,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
       data->parameters = *parameters;
       data->parameters.k = k;
       data->parameters.d = d;
+      data->parameters.splitPoint = splitPoint;
       data->parameters.steps = kSteps;
       data->parameters.zParams.notificationLevel = g_displayLevel;
       /* Check the parameters */
@@ -1020,7 +1101,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
       }
       /* Print status */
       LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
-                         (
+                         (unsigned)((iteration * 100) / kIterations));
       ++iteration;
     }
     COVER_best_wait(&best);