extzstd 0.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/HISTORY.ja.md +39 -0
- data/README.md +38 -56
- data/contrib/zstd/CHANGELOG +613 -0
- data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
- data/contrib/zstd/CONTRIBUTING.md +406 -0
- data/contrib/zstd/COPYING +339 -0
- data/contrib/zstd/Makefile +420 -0
- data/contrib/zstd/README.md +179 -41
- data/contrib/zstd/TESTING.md +44 -0
- data/contrib/zstd/appveyor.yml +292 -0
- data/contrib/zstd/lib/BUCK +234 -0
- data/contrib/zstd/lib/Makefile +451 -0
- data/contrib/zstd/lib/README.md +207 -0
- data/contrib/zstd/{common → lib/common}/bitstream.h +187 -138
- data/contrib/zstd/lib/common/compiler.h +288 -0
- data/contrib/zstd/lib/common/cpu.h +213 -0
- data/contrib/zstd/lib/common/debug.c +24 -0
- data/contrib/zstd/lib/common/debug.h +107 -0
- data/contrib/zstd/lib/common/entropy_common.c +362 -0
- data/contrib/zstd/{common → lib/common}/error_private.c +25 -12
- data/contrib/zstd/{common → lib/common}/error_private.h +14 -10
- data/contrib/zstd/{common → lib/common}/fse.h +173 -92
- data/contrib/zstd/{common → lib/common}/fse_decompress.c +149 -85
- data/contrib/zstd/lib/common/huf.h +361 -0
- data/contrib/zstd/{common → lib/common}/mem.h +115 -59
- data/contrib/zstd/lib/common/pool.c +350 -0
- data/contrib/zstd/lib/common/pool.h +84 -0
- data/contrib/zstd/lib/common/threading.c +122 -0
- data/contrib/zstd/lib/common/threading.h +155 -0
- data/contrib/zstd/{common → lib/common}/xxhash.c +55 -96
- data/contrib/zstd/{common → lib/common}/xxhash.h +23 -47
- data/contrib/zstd/lib/common/zstd_common.c +83 -0
- data/contrib/zstd/lib/common/zstd_deps.h +111 -0
- data/contrib/zstd/lib/common/zstd_errors.h +95 -0
- data/contrib/zstd/lib/common/zstd_internal.h +478 -0
- data/contrib/zstd/{compress → lib/compress}/fse_compress.c +214 -319
- data/contrib/zstd/lib/compress/hist.c +181 -0
- data/contrib/zstd/lib/compress/hist.h +75 -0
- data/contrib/zstd/lib/compress/huf_compress.c +913 -0
- data/contrib/zstd/lib/compress/zstd_compress.c +5208 -0
- data/contrib/zstd/lib/compress/zstd_compress_internal.h +1203 -0
- data/contrib/zstd/lib/compress/zstd_compress_literals.c +158 -0
- data/contrib/zstd/lib/compress/zstd_compress_literals.h +29 -0
- data/contrib/zstd/lib/compress/zstd_compress_sequences.c +433 -0
- data/contrib/zstd/lib/compress/zstd_compress_sequences.h +54 -0
- data/contrib/zstd/lib/compress/zstd_compress_superblock.c +849 -0
- data/contrib/zstd/lib/compress/zstd_compress_superblock.h +32 -0
- data/contrib/zstd/lib/compress/zstd_cwksp.h +561 -0
- data/contrib/zstd/lib/compress/zstd_double_fast.c +521 -0
- data/contrib/zstd/lib/compress/zstd_double_fast.h +38 -0
- data/contrib/zstd/lib/compress/zstd_fast.c +496 -0
- data/contrib/zstd/lib/compress/zstd_fast.h +37 -0
- data/contrib/zstd/lib/compress/zstd_lazy.c +1412 -0
- data/contrib/zstd/lib/compress/zstd_lazy.h +87 -0
- data/contrib/zstd/lib/compress/zstd_ldm.c +660 -0
- data/contrib/zstd/lib/compress/zstd_ldm.h +116 -0
- data/contrib/zstd/lib/compress/zstd_opt.c +1345 -0
- data/contrib/zstd/lib/compress/zstd_opt.h +56 -0
- data/contrib/zstd/lib/compress/zstdmt_compress.c +1811 -0
- data/contrib/zstd/lib/compress/zstdmt_compress.h +110 -0
- data/contrib/zstd/lib/decompress/huf_decompress.c +1350 -0
- data/contrib/zstd/lib/decompress/zstd_ddict.c +244 -0
- data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
- data/contrib/zstd/lib/decompress/zstd_decompress.c +1930 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1540 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_block.h +62 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +190 -0
- data/contrib/zstd/{common → lib/deprecated}/zbuff.h +68 -45
- data/contrib/zstd/lib/deprecated/zbuff_common.c +26 -0
- data/contrib/zstd/lib/deprecated/zbuff_compress.c +147 -0
- data/contrib/zstd/lib/deprecated/zbuff_decompress.c +75 -0
- data/contrib/zstd/lib/dictBuilder/cover.c +1245 -0
- data/contrib/zstd/lib/dictBuilder/cover.h +157 -0
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.c +3 -3
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.h +0 -0
- data/contrib/zstd/lib/dictBuilder/fastcover.c +758 -0
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/zdict.c +318 -194
- data/contrib/zstd/lib/dictBuilder/zdict.h +305 -0
- data/contrib/zstd/{legacy → lib/legacy}/zstd_legacy.h +171 -15
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.c +191 -124
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.h +19 -5
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.c +125 -125
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.h +19 -5
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.c +125 -124
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.h +20 -6
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.c +151 -299
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.h +19 -5
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.c +237 -243
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.h +19 -6
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.c +130 -143
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.h +18 -5
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v07.c +158 -157
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v07.h +19 -5
- data/contrib/zstd/lib/libzstd.pc.in +15 -0
- data/contrib/zstd/lib/zstd.h +2391 -0
- data/ext/depend +2 -0
- data/ext/extconf.rb +15 -6
- data/ext/extzstd.c +76 -145
- data/ext/extzstd.h +80 -31
- data/ext/extzstd_stream.c +417 -142
- data/ext/libzstd_conf.h +8 -0
- data/ext/zstd_common.c +10 -7
- data/ext/zstd_compress.c +14 -5
- data/ext/zstd_decompress.c +5 -4
- data/ext/zstd_dictbuilder.c +9 -4
- data/ext/zstd_dictbuilder_fastcover.c +3 -0
- data/ext/zstd_legacy_v01.c +3 -1
- data/ext/zstd_legacy_v02.c +3 -1
- data/ext/zstd_legacy_v03.c +3 -1
- data/ext/zstd_legacy_v04.c +3 -1
- data/ext/zstd_legacy_v05.c +3 -1
- data/ext/zstd_legacy_v06.c +3 -1
- data/ext/zstd_legacy_v07.c +3 -1
- data/gemstub.rb +10 -24
- data/lib/extzstd.rb +64 -179
- data/lib/extzstd/version.rb +6 -1
- data/test/test_basic.rb +9 -6
- metadata +113 -57
- data/HISTORY.ja +0 -5
- data/contrib/zstd/common/entropy_common.c +0 -225
- data/contrib/zstd/common/huf.h +0 -228
- data/contrib/zstd/common/zstd_common.c +0 -83
- data/contrib/zstd/common/zstd_errors.h +0 -60
- data/contrib/zstd/common/zstd_internal.h +0 -267
- data/contrib/zstd/compress/huf_compress.c +0 -533
- data/contrib/zstd/compress/zbuff_compress.c +0 -319
- data/contrib/zstd/compress/zstd_compress.c +0 -3264
- data/contrib/zstd/compress/zstd_opt.h +0 -900
- data/contrib/zstd/decompress/huf_decompress.c +0 -883
- data/contrib/zstd/decompress/zbuff_decompress.c +0 -252
- data/contrib/zstd/decompress/zstd_decompress.c +0 -1842
- data/contrib/zstd/dictBuilder/zdict.h +0 -111
- data/contrib/zstd/zstd.h +0 -640
@@ -0,0 +1,147 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
3
|
+
* All rights reserved.
|
4
|
+
*
|
5
|
+
* This source code is licensed under both the BSD-style license (found in the
|
6
|
+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
7
|
+
* in the COPYING file in the root directory of this source tree).
|
8
|
+
* You may select, at your option, one of the above-listed licenses.
|
9
|
+
*/
|
10
|
+
|
11
|
+
|
12
|
+
|
13
|
+
/* *************************************
|
14
|
+
* Dependencies
|
15
|
+
***************************************/
|
16
|
+
#define ZBUFF_STATIC_LINKING_ONLY
|
17
|
+
#include "zbuff.h"
|
18
|
+
|
19
|
+
|
20
|
+
/*-***********************************************************
|
21
|
+
* Streaming compression
|
22
|
+
*
|
23
|
+
* A ZBUFF_CCtx object is required to track streaming operation.
|
24
|
+
* Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources.
|
25
|
+
* Use ZBUFF_compressInit() to start a new compression operation.
|
26
|
+
* ZBUFF_CCtx objects can be reused multiple times.
|
27
|
+
*
|
28
|
+
* Use ZBUFF_compressContinue() repetitively to consume your input.
|
29
|
+
* *srcSizePtr and *dstCapacityPtr can be any size.
|
30
|
+
* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
|
31
|
+
* Note that it may not consume the entire input, in which case it's up to the caller to call again the function with remaining input.
|
32
|
+
* The content of dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change dst .
|
33
|
+
* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
|
34
|
+
* or an error code, which can be tested using ZBUFF_isError().
|
35
|
+
*
|
36
|
+
* ZBUFF_compressFlush() can be used to instruct ZBUFF to compress and output whatever remains within its buffer.
|
37
|
+
* Note that it will not output more than *dstCapacityPtr.
|
38
|
+
* Therefore, some content might still be left into its internal buffer if dst buffer is too small.
|
39
|
+
* @return : nb of bytes still present into internal buffer (0 if it's empty)
|
40
|
+
* or an error code, which can be tested using ZBUFF_isError().
|
41
|
+
*
|
42
|
+
* ZBUFF_compressEnd() instructs to finish a frame.
|
43
|
+
* It will perform a flush and write frame epilogue.
|
44
|
+
* Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small.
|
45
|
+
* @return : nb of bytes still present into internal buffer (0 if it's empty)
|
46
|
+
* or an error code, which can be tested using ZBUFF_isError().
|
47
|
+
*
|
48
|
+
* Hint : recommended buffer sizes (not compulsory)
|
49
|
+
* input : ZSTD_BLOCKSIZE_MAX (128 KB), internal unit size, it improves latency to use this value.
|
50
|
+
* output : ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + ZBUFF_endFrameSize : ensures it's always possible to write/flush/end a full block at best speed.
|
51
|
+
* ***********************************************************/
|
52
|
+
|
53
|
+
ZBUFF_CCtx* ZBUFF_createCCtx(void)
|
54
|
+
{
|
55
|
+
return ZSTD_createCStream();
|
56
|
+
}
|
57
|
+
|
58
|
+
ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem)
|
59
|
+
{
|
60
|
+
return ZSTD_createCStream_advanced(customMem);
|
61
|
+
}
|
62
|
+
|
63
|
+
size_t ZBUFF_freeCCtx(ZBUFF_CCtx* zbc)
|
64
|
+
{
|
65
|
+
return ZSTD_freeCStream(zbc);
|
66
|
+
}
|
67
|
+
|
68
|
+
|
69
|
+
/* ====== Initialization ====== */
|
70
|
+
|
71
|
+
size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc,
|
72
|
+
const void* dict, size_t dictSize,
|
73
|
+
ZSTD_parameters params, unsigned long long pledgedSrcSize)
|
74
|
+
{
|
75
|
+
if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; /* preserve "0 == unknown" behavior */
|
76
|
+
return ZSTD_initCStream_advanced(zbc, dict, dictSize, params, pledgedSrcSize);
|
77
|
+
}
|
78
|
+
|
79
|
+
|
80
|
+
size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* zbc, const void* dict, size_t dictSize, int compressionLevel)
|
81
|
+
{
|
82
|
+
return ZSTD_initCStream_usingDict(zbc, dict, dictSize, compressionLevel);
|
83
|
+
}
|
84
|
+
|
85
|
+
size_t ZBUFF_compressInit(ZBUFF_CCtx* zbc, int compressionLevel)
|
86
|
+
{
|
87
|
+
return ZSTD_initCStream(zbc, compressionLevel);
|
88
|
+
}
|
89
|
+
|
90
|
+
/* ====== Compression ====== */
|
91
|
+
|
92
|
+
|
93
|
+
size_t ZBUFF_compressContinue(ZBUFF_CCtx* zbc,
|
94
|
+
void* dst, size_t* dstCapacityPtr,
|
95
|
+
const void* src, size_t* srcSizePtr)
|
96
|
+
{
|
97
|
+
size_t result;
|
98
|
+
ZSTD_outBuffer outBuff;
|
99
|
+
ZSTD_inBuffer inBuff;
|
100
|
+
outBuff.dst = dst;
|
101
|
+
outBuff.pos = 0;
|
102
|
+
outBuff.size = *dstCapacityPtr;
|
103
|
+
inBuff.src = src;
|
104
|
+
inBuff.pos = 0;
|
105
|
+
inBuff.size = *srcSizePtr;
|
106
|
+
result = ZSTD_compressStream(zbc, &outBuff, &inBuff);
|
107
|
+
*dstCapacityPtr = outBuff.pos;
|
108
|
+
*srcSizePtr = inBuff.pos;
|
109
|
+
return result;
|
110
|
+
}
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
/* ====== Finalize ====== */
|
115
|
+
|
116
|
+
size_t ZBUFF_compressFlush(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr)
|
117
|
+
{
|
118
|
+
size_t result;
|
119
|
+
ZSTD_outBuffer outBuff;
|
120
|
+
outBuff.dst = dst;
|
121
|
+
outBuff.pos = 0;
|
122
|
+
outBuff.size = *dstCapacityPtr;
|
123
|
+
result = ZSTD_flushStream(zbc, &outBuff);
|
124
|
+
*dstCapacityPtr = outBuff.pos;
|
125
|
+
return result;
|
126
|
+
}
|
127
|
+
|
128
|
+
|
129
|
+
size_t ZBUFF_compressEnd(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr)
|
130
|
+
{
|
131
|
+
size_t result;
|
132
|
+
ZSTD_outBuffer outBuff;
|
133
|
+
outBuff.dst = dst;
|
134
|
+
outBuff.pos = 0;
|
135
|
+
outBuff.size = *dstCapacityPtr;
|
136
|
+
result = ZSTD_endStream(zbc, &outBuff);
|
137
|
+
*dstCapacityPtr = outBuff.pos;
|
138
|
+
return result;
|
139
|
+
}
|
140
|
+
|
141
|
+
|
142
|
+
|
143
|
+
/* *************************************
|
144
|
+
* Tool functions
|
145
|
+
***************************************/
|
146
|
+
size_t ZBUFF_recommendedCInSize(void) { return ZSTD_CStreamInSize(); }
|
147
|
+
size_t ZBUFF_recommendedCOutSize(void) { return ZSTD_CStreamOutSize(); }
|
@@ -0,0 +1,75 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
3
|
+
* All rights reserved.
|
4
|
+
*
|
5
|
+
* This source code is licensed under both the BSD-style license (found in the
|
6
|
+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
7
|
+
* in the COPYING file in the root directory of this source tree).
|
8
|
+
* You may select, at your option, one of the above-listed licenses.
|
9
|
+
*/
|
10
|
+
|
11
|
+
|
12
|
+
|
13
|
+
/* *************************************
|
14
|
+
* Dependencies
|
15
|
+
***************************************/
|
16
|
+
#define ZBUFF_STATIC_LINKING_ONLY
|
17
|
+
#include "zbuff.h"
|
18
|
+
|
19
|
+
|
20
|
+
ZBUFF_DCtx* ZBUFF_createDCtx(void)
|
21
|
+
{
|
22
|
+
return ZSTD_createDStream();
|
23
|
+
}
|
24
|
+
|
25
|
+
ZBUFF_DCtx* ZBUFF_createDCtx_advanced(ZSTD_customMem customMem)
|
26
|
+
{
|
27
|
+
return ZSTD_createDStream_advanced(customMem);
|
28
|
+
}
|
29
|
+
|
30
|
+
size_t ZBUFF_freeDCtx(ZBUFF_DCtx* zbd)
|
31
|
+
{
|
32
|
+
return ZSTD_freeDStream(zbd);
|
33
|
+
}
|
34
|
+
|
35
|
+
|
36
|
+
/* *** Initialization *** */
|
37
|
+
|
38
|
+
size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* zbd, const void* dict, size_t dictSize)
|
39
|
+
{
|
40
|
+
return ZSTD_initDStream_usingDict(zbd, dict, dictSize);
|
41
|
+
}
|
42
|
+
|
43
|
+
size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbd)
|
44
|
+
{
|
45
|
+
return ZSTD_initDStream(zbd);
|
46
|
+
}
|
47
|
+
|
48
|
+
|
49
|
+
/* *** Decompression *** */
|
50
|
+
|
51
|
+
size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbd,
|
52
|
+
void* dst, size_t* dstCapacityPtr,
|
53
|
+
const void* src, size_t* srcSizePtr)
|
54
|
+
{
|
55
|
+
ZSTD_outBuffer outBuff;
|
56
|
+
ZSTD_inBuffer inBuff;
|
57
|
+
size_t result;
|
58
|
+
outBuff.dst = dst;
|
59
|
+
outBuff.pos = 0;
|
60
|
+
outBuff.size = *dstCapacityPtr;
|
61
|
+
inBuff.src = src;
|
62
|
+
inBuff.pos = 0;
|
63
|
+
inBuff.size = *srcSizePtr;
|
64
|
+
result = ZSTD_decompressStream(zbd, &outBuff, &inBuff);
|
65
|
+
*dstCapacityPtr = outBuff.pos;
|
66
|
+
*srcSizePtr = inBuff.pos;
|
67
|
+
return result;
|
68
|
+
}
|
69
|
+
|
70
|
+
|
71
|
+
/* *************************************
|
72
|
+
* Tool functions
|
73
|
+
***************************************/
|
74
|
+
size_t ZBUFF_recommendedDInSize(void) { return ZSTD_DStreamInSize(); }
|
75
|
+
size_t ZBUFF_recommendedDOutSize(void) { return ZSTD_DStreamOutSize(); }
|
@@ -0,0 +1,1245 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
3
|
+
* All rights reserved.
|
4
|
+
*
|
5
|
+
* This source code is licensed under both the BSD-style license (found in the
|
6
|
+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
7
|
+
* in the COPYING file in the root directory of this source tree).
|
8
|
+
* You may select, at your option, one of the above-listed licenses.
|
9
|
+
*/
|
10
|
+
|
11
|
+
/* *****************************************************************************
|
12
|
+
* Constructs a dictionary using a heuristic based on the following paper:
|
13
|
+
*
|
14
|
+
* Liao, Petri, Moffat, Wirth
|
15
|
+
* Effective Construction of Relative Lempel-Ziv Dictionaries
|
16
|
+
* Published in WWW 2016.
|
17
|
+
*
|
18
|
+
* Adapted from code originally written by @ot (Giuseppe Ottaviano).
|
19
|
+
******************************************************************************/
|
20
|
+
|
21
|
+
/*-*************************************
|
22
|
+
* Dependencies
|
23
|
+
***************************************/
|
24
|
+
#include <stdio.h> /* fprintf */
|
25
|
+
#include <stdlib.h> /* malloc, free, qsort */
|
26
|
+
#include <string.h> /* memset */
|
27
|
+
#include <time.h> /* clock */
|
28
|
+
|
29
|
+
#include "../common/mem.h" /* read */
|
30
|
+
#include "../common/pool.h"
|
31
|
+
#include "../common/threading.h"
|
32
|
+
#include "cover.h"
|
33
|
+
#include "../common/zstd_internal.h" /* includes zstd.h */
|
34
|
+
#ifndef ZDICT_STATIC_LINKING_ONLY
|
35
|
+
#define ZDICT_STATIC_LINKING_ONLY
|
36
|
+
#endif
|
37
|
+
#include "zdict.h"
|
38
|
+
|
39
|
+
/*-*************************************
|
40
|
+
* Constants
|
41
|
+
***************************************/
|
42
|
+
#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
|
43
|
+
#define COVER_DEFAULT_SPLITPOINT 1.0
|
44
|
+
|
45
|
+
/*-*************************************
|
46
|
+
* Console display
|
47
|
+
***************************************/
|
48
|
+
#ifndef LOCALDISPLAYLEVEL
|
49
|
+
static int g_displayLevel = 2;
|
50
|
+
#endif
|
51
|
+
#undef DISPLAY
|
52
|
+
#define DISPLAY(...) \
|
53
|
+
{ \
|
54
|
+
fprintf(stderr, __VA_ARGS__); \
|
55
|
+
fflush(stderr); \
|
56
|
+
}
|
57
|
+
#undef LOCALDISPLAYLEVEL
|
58
|
+
#define LOCALDISPLAYLEVEL(displayLevel, l, ...) \
|
59
|
+
if (displayLevel >= l) { \
|
60
|
+
DISPLAY(__VA_ARGS__); \
|
61
|
+
} /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
|
62
|
+
#undef DISPLAYLEVEL
|
63
|
+
#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
|
64
|
+
|
65
|
+
#ifndef LOCALDISPLAYUPDATE
|
66
|
+
static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
|
67
|
+
static clock_t g_time = 0;
|
68
|
+
#endif
|
69
|
+
#undef LOCALDISPLAYUPDATE
|
70
|
+
#define LOCALDISPLAYUPDATE(displayLevel, l, ...) \
|
71
|
+
if (displayLevel >= l) { \
|
72
|
+
if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) { \
|
73
|
+
g_time = clock(); \
|
74
|
+
DISPLAY(__VA_ARGS__); \
|
75
|
+
} \
|
76
|
+
}
|
77
|
+
#undef DISPLAYUPDATE
|
78
|
+
#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
|
79
|
+
|
80
|
+
/*-*************************************
|
81
|
+
* Hash table
|
82
|
+
***************************************
|
83
|
+
* A small specialized hash map for storing activeDmers.
|
84
|
+
* The map does not resize, so if it becomes full it will loop forever.
|
85
|
+
* Thus, the map must be large enough to store every value.
|
86
|
+
* The map implements linear probing and keeps its load less than 0.5.
|
87
|
+
*/
|
88
|
+
|
89
|
+
#define MAP_EMPTY_VALUE ((U32)-1)
|
90
|
+
typedef struct COVER_map_pair_t_s {
|
91
|
+
U32 key;
|
92
|
+
U32 value;
|
93
|
+
} COVER_map_pair_t;
|
94
|
+
|
95
|
+
typedef struct COVER_map_s {
|
96
|
+
COVER_map_pair_t *data;
|
97
|
+
U32 sizeLog;
|
98
|
+
U32 size;
|
99
|
+
U32 sizeMask;
|
100
|
+
} COVER_map_t;
|
101
|
+
|
102
|
+
/**
|
103
|
+
* Clear the map.
|
104
|
+
*/
|
105
|
+
static void COVER_map_clear(COVER_map_t *map) {
|
106
|
+
memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
|
107
|
+
}
|
108
|
+
|
109
|
+
/**
|
110
|
+
* Initializes a map of the given size.
|
111
|
+
* Returns 1 on success and 0 on failure.
|
112
|
+
* The map must be destroyed with COVER_map_destroy().
|
113
|
+
* The map is only guaranteed to be large enough to hold size elements.
|
114
|
+
*/
|
115
|
+
static int COVER_map_init(COVER_map_t *map, U32 size) {
|
116
|
+
map->sizeLog = ZSTD_highbit32(size) + 2;
|
117
|
+
map->size = (U32)1 << map->sizeLog;
|
118
|
+
map->sizeMask = map->size - 1;
|
119
|
+
map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
|
120
|
+
if (!map->data) {
|
121
|
+
map->sizeLog = 0;
|
122
|
+
map->size = 0;
|
123
|
+
return 0;
|
124
|
+
}
|
125
|
+
COVER_map_clear(map);
|
126
|
+
return 1;
|
127
|
+
}
|
128
|
+
|
129
|
+
/**
|
130
|
+
* Internal hash function
|
131
|
+
*/
|
132
|
+
static const U32 COVER_prime4bytes = 2654435761U;
|
133
|
+
static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
|
134
|
+
return (key * COVER_prime4bytes) >> (32 - map->sizeLog);
|
135
|
+
}
|
136
|
+
|
137
|
+
/**
|
138
|
+
* Helper function that returns the index that a key should be placed into.
|
139
|
+
*/
|
140
|
+
static U32 COVER_map_index(COVER_map_t *map, U32 key) {
|
141
|
+
const U32 hash = COVER_map_hash(map, key);
|
142
|
+
U32 i;
|
143
|
+
for (i = hash;; i = (i + 1) & map->sizeMask) {
|
144
|
+
COVER_map_pair_t *pos = &map->data[i];
|
145
|
+
if (pos->value == MAP_EMPTY_VALUE) {
|
146
|
+
return i;
|
147
|
+
}
|
148
|
+
if (pos->key == key) {
|
149
|
+
return i;
|
150
|
+
}
|
151
|
+
}
|
152
|
+
}
|
153
|
+
|
154
|
+
/**
|
155
|
+
* Returns the pointer to the value for key.
|
156
|
+
* If key is not in the map, it is inserted and the value is set to 0.
|
157
|
+
* The map must not be full.
|
158
|
+
*/
|
159
|
+
static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
|
160
|
+
COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
|
161
|
+
if (pos->value == MAP_EMPTY_VALUE) {
|
162
|
+
pos->key = key;
|
163
|
+
pos->value = 0;
|
164
|
+
}
|
165
|
+
return &pos->value;
|
166
|
+
}
|
167
|
+
|
168
|
+
/**
|
169
|
+
* Deletes key from the map if present.
|
170
|
+
*/
|
171
|
+
static void COVER_map_remove(COVER_map_t *map, U32 key) {
|
172
|
+
U32 i = COVER_map_index(map, key);
|
173
|
+
COVER_map_pair_t *del = &map->data[i];
|
174
|
+
U32 shift = 1;
|
175
|
+
if (del->value == MAP_EMPTY_VALUE) {
|
176
|
+
return;
|
177
|
+
}
|
178
|
+
for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
|
179
|
+
COVER_map_pair_t *const pos = &map->data[i];
|
180
|
+
/* If the position is empty we are done */
|
181
|
+
if (pos->value == MAP_EMPTY_VALUE) {
|
182
|
+
del->value = MAP_EMPTY_VALUE;
|
183
|
+
return;
|
184
|
+
}
|
185
|
+
/* If pos can be moved to del do so */
|
186
|
+
if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
|
187
|
+
del->key = pos->key;
|
188
|
+
del->value = pos->value;
|
189
|
+
del = pos;
|
190
|
+
shift = 1;
|
191
|
+
} else {
|
192
|
+
++shift;
|
193
|
+
}
|
194
|
+
}
|
195
|
+
}
|
196
|
+
|
197
|
+
/**
|
198
|
+
* Destroys a map that is inited with COVER_map_init().
|
199
|
+
*/
|
200
|
+
static void COVER_map_destroy(COVER_map_t *map) {
|
201
|
+
if (map->data) {
|
202
|
+
free(map->data);
|
203
|
+
}
|
204
|
+
map->data = NULL;
|
205
|
+
map->size = 0;
|
206
|
+
}
|
207
|
+
|
208
|
+
/*-*************************************
|
209
|
+
* Context
|
210
|
+
***************************************/
|
211
|
+
|
212
|
+
typedef struct {
|
213
|
+
const BYTE *samples;
|
214
|
+
size_t *offsets;
|
215
|
+
const size_t *samplesSizes;
|
216
|
+
size_t nbSamples;
|
217
|
+
size_t nbTrainSamples;
|
218
|
+
size_t nbTestSamples;
|
219
|
+
U32 *suffix;
|
220
|
+
size_t suffixSize;
|
221
|
+
U32 *freqs;
|
222
|
+
U32 *dmerAt;
|
223
|
+
unsigned d;
|
224
|
+
} COVER_ctx_t;
|
225
|
+
|
226
|
+
/* We need a global context for qsort... */
|
227
|
+
static COVER_ctx_t *g_coverCtx = NULL;
|
228
|
+
|
229
|
+
/*-*************************************
|
230
|
+
* Helper functions
|
231
|
+
***************************************/
|
232
|
+
|
233
|
+
/**
|
234
|
+
* Returns the sum of the sample sizes.
|
235
|
+
*/
|
236
|
+
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
|
237
|
+
size_t sum = 0;
|
238
|
+
unsigned i;
|
239
|
+
for (i = 0; i < nbSamples; ++i) {
|
240
|
+
sum += samplesSizes[i];
|
241
|
+
}
|
242
|
+
return sum;
|
243
|
+
}
|
244
|
+
|
245
|
+
/**
|
246
|
+
* Returns -1 if the dmer at lp is less than the dmer at rp.
|
247
|
+
* Return 0 if the dmers at lp and rp are equal.
|
248
|
+
* Returns 1 if the dmer at lp is greater than the dmer at rp.
|
249
|
+
*/
|
250
|
+
static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
|
251
|
+
U32 const lhs = *(U32 const *)lp;
|
252
|
+
U32 const rhs = *(U32 const *)rp;
|
253
|
+
return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
|
254
|
+
}
|
255
|
+
/**
|
256
|
+
* Faster version for d <= 8.
|
257
|
+
*/
|
258
|
+
static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
|
259
|
+
U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
|
260
|
+
U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;
|
261
|
+
U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;
|
262
|
+
if (lhs < rhs) {
|
263
|
+
return -1;
|
264
|
+
}
|
265
|
+
return (lhs > rhs);
|
266
|
+
}
|
267
|
+
|
268
|
+
/**
|
269
|
+
* Same as COVER_cmp() except ties are broken by pointer value
|
270
|
+
* NOTE: g_coverCtx must be set to call this function. A global is required because
|
271
|
+
* qsort doesn't take an opaque pointer.
|
272
|
+
*/
|
273
|
+
static int WIN_CDECL COVER_strict_cmp(const void *lp, const void *rp) {
|
274
|
+
int result = COVER_cmp(g_coverCtx, lp, rp);
|
275
|
+
if (result == 0) {
|
276
|
+
result = lp < rp ? -1 : 1;
|
277
|
+
}
|
278
|
+
return result;
|
279
|
+
}
|
280
|
+
/**
|
281
|
+
* Faster version for d <= 8.
|
282
|
+
*/
|
283
|
+
static int WIN_CDECL COVER_strict_cmp8(const void *lp, const void *rp) {
|
284
|
+
int result = COVER_cmp8(g_coverCtx, lp, rp);
|
285
|
+
if (result == 0) {
|
286
|
+
result = lp < rp ? -1 : 1;
|
287
|
+
}
|
288
|
+
return result;
|
289
|
+
}
|
290
|
+
|
291
|
+
/**
|
292
|
+
* Returns the first pointer in [first, last) whose element does not compare
|
293
|
+
* less than value. If no such element exists it returns last.
|
294
|
+
*/
|
295
|
+
static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
|
296
|
+
size_t value) {
|
297
|
+
size_t count = last - first;
|
298
|
+
while (count != 0) {
|
299
|
+
size_t step = count / 2;
|
300
|
+
const size_t *ptr = first;
|
301
|
+
ptr += step;
|
302
|
+
if (*ptr < value) {
|
303
|
+
first = ++ptr;
|
304
|
+
count -= step + 1;
|
305
|
+
} else {
|
306
|
+
count = step;
|
307
|
+
}
|
308
|
+
}
|
309
|
+
return first;
|
310
|
+
}
|
311
|
+
|
312
|
+
/**
|
313
|
+
* Generic groupBy function.
|
314
|
+
* Groups an array sorted by cmp into groups with equivalent values.
|
315
|
+
* Calls grp for each group.
|
316
|
+
*/
|
317
|
+
static void
|
318
|
+
COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
|
319
|
+
int (*cmp)(COVER_ctx_t *, const void *, const void *),
|
320
|
+
void (*grp)(COVER_ctx_t *, const void *, const void *)) {
|
321
|
+
const BYTE *ptr = (const BYTE *)data;
|
322
|
+
size_t num = 0;
|
323
|
+
while (num < count) {
|
324
|
+
const BYTE *grpEnd = ptr + size;
|
325
|
+
++num;
|
326
|
+
while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
|
327
|
+
grpEnd += size;
|
328
|
+
++num;
|
329
|
+
}
|
330
|
+
grp(ctx, ptr, grpEnd);
|
331
|
+
ptr = grpEnd;
|
332
|
+
}
|
333
|
+
}
|
334
|
+
|
335
|
+
/*-*************************************
|
336
|
+
* Cover functions
|
337
|
+
***************************************/
|
338
|
+
|
339
|
+
/**
|
340
|
+
* Called on each group of positions with the same dmer.
|
341
|
+
* Counts the frequency of each dmer and saves it in the suffix array.
|
342
|
+
* Fills `ctx->dmerAt`.
|
343
|
+
*/
|
344
|
+
static void COVER_group(COVER_ctx_t *ctx, const void *group,
|
345
|
+
const void *groupEnd) {
|
346
|
+
/* The group consists of all the positions with the same first d bytes. */
|
347
|
+
const U32 *grpPtr = (const U32 *)group;
|
348
|
+
const U32 *grpEnd = (const U32 *)groupEnd;
|
349
|
+
/* The dmerId is how we will reference this dmer.
|
350
|
+
* This allows us to map the whole dmer space to a much smaller space, the
|
351
|
+
* size of the suffix array.
|
352
|
+
*/
|
353
|
+
const U32 dmerId = (U32)(grpPtr - ctx->suffix);
|
354
|
+
/* Count the number of samples this dmer shows up in */
|
355
|
+
U32 freq = 0;
|
356
|
+
/* Details */
|
357
|
+
const size_t *curOffsetPtr = ctx->offsets;
|
358
|
+
const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
|
359
|
+
/* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
|
360
|
+
* different sample than the last.
|
361
|
+
*/
|
362
|
+
size_t curSampleEnd = ctx->offsets[0];
|
363
|
+
for (; grpPtr != grpEnd; ++grpPtr) {
|
364
|
+
/* Save the dmerId for this position so we can get back to it. */
|
365
|
+
ctx->dmerAt[*grpPtr] = dmerId;
|
366
|
+
/* Dictionaries only help for the first reference to the dmer.
|
367
|
+
* After that zstd can reference the match from the previous reference.
|
368
|
+
* So only count each dmer once for each sample it is in.
|
369
|
+
*/
|
370
|
+
if (*grpPtr < curSampleEnd) {
|
371
|
+
continue;
|
372
|
+
}
|
373
|
+
freq += 1;
|
374
|
+
/* Binary search to find the end of the sample *grpPtr is in.
|
375
|
+
* In the common case that grpPtr + 1 == grpEnd we can skip the binary
|
376
|
+
* search because the loop is over.
|
377
|
+
*/
|
378
|
+
if (grpPtr + 1 != grpEnd) {
|
379
|
+
const size_t *sampleEndPtr =
|
380
|
+
COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
|
381
|
+
curSampleEnd = *sampleEndPtr;
|
382
|
+
curOffsetPtr = sampleEndPtr + 1;
|
383
|
+
}
|
384
|
+
}
|
385
|
+
/* At this point we are never going to look at this segment of the suffix
|
386
|
+
* array again. We take advantage of this fact to save memory.
|
387
|
+
* We store the frequency of the dmer in the first position of the group,
|
388
|
+
* which is dmerId.
|
389
|
+
*/
|
390
|
+
ctx->suffix[dmerId] = freq;
|
391
|
+
}
|
392
|
+
|
393
|
+
|
394
|
+
/**
 * Selects the best segment in an epoch.
 * Segments are scored according to the function:
 *
 * Let F(d) be the frequency of dmer d.
 * Let S_i be the dmer at position i of segment S which has length k.
 *
 * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
 *
 * Once the dmer d is in the dictionary we set F(d) = 0.
 *
 * @param ctx          Prepared context (provides dmerAt lookup).
 * @param freqs        Per-dmer frequencies; read here, zeroed for the winner.
 * @param activeDmers  Scratch hash map counting dmer occurrences in the
 *                     sliding window; cleared on entry.
 * @param begin,end    The epoch's range of positions to slide over.
 * @return The best-scoring segment, trimmed of zero-frequency edges.
 */
static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                           COVER_map_t *activeDmers, U32 begin,
                                           U32 end,
                                           ZDICT_cover_params_t parameters) {
  /* Constants */
  const U32 k = parameters.k;
  const U32 d = parameters.d;
  /* A k-byte segment contains exactly k - d + 1 dmers. */
  const U32 dmersInK = k - d + 1;
  /* Try each segment (activeSegment) and save the best (bestSegment) */
  COVER_segment_t bestSegment = {0, 0, 0};
  COVER_segment_t activeSegment;
  /* Reset the activeDmers in the segment */
  COVER_map_clear(activeDmers);
  /* The activeSegment starts at the beginning of the epoch. */
  activeSegment.begin = begin;
  activeSegment.end = begin;
  activeSegment.score = 0;
  /* Slide the activeSegment through the whole epoch.
   * Save the best segment in bestSegment.
   */
  while (activeSegment.end < end) {
    /* The dmerId for the dmer at the next position */
    U32 newDmer = ctx->dmerAt[activeSegment.end];
    /* The entry in activeDmers for this dmerId */
    U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
    /* If the dmer isn't already present in the segment add its score. */
    if (*newDmerOcc == 0) {
      /* The paper suggest using the L-0.5 norm, but experiments show that it
       * doesn't help.
       */
      activeSegment.score += freqs[newDmer];
    }
    /* Add the dmer to the segment */
    activeSegment.end += 1;
    *newDmerOcc += 1;

    /* If the window is now too large, drop the first position */
    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
      U32 delDmer = ctx->dmerAt[activeSegment.begin];
      U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
      activeSegment.begin += 1;
      *delDmerOcc -= 1;
      /* If this is the last occurrence of the dmer, subtract its score */
      if (*delDmerOcc == 0) {
        COVER_map_remove(activeDmers, delDmer);
        activeSegment.score -= freqs[delDmer];
      }
    }

    /* If this segment is the best so far save it */
    if (activeSegment.score > bestSegment.score) {
      bestSegment = activeSegment;
    }
  }
  {
    /* Trim off the zero frequency head and tail from the segment.
     * Start with an empty (inverted) range and grow it over every position
     * whose dmer still has a nonzero frequency. */
    U32 newBegin = bestSegment.end;
    U32 newEnd = bestSegment.begin;
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      U32 freq = freqs[ctx->dmerAt[pos]];
      if (freq != 0) {
        newBegin = MIN(newBegin, pos);
        newEnd = pos + 1;
      }
    }
    bestSegment.begin = newBegin;
    bestSegment.end = newEnd;
  }
  {
    /* Zero out the frequency of each dmer covered by the chosen segment,
     * so later epochs do not score already-covered content. */
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      freqs[ctx->dmerAt[pos]] = 0;
    }
  }
  return bestSegment;
}
/**
|
485
|
+
* Check the validity of the parameters.
|
486
|
+
* Returns non-zero if the parameters are valid and 0 otherwise.
|
487
|
+
*/
|
488
|
+
static int COVER_checkParameters(ZDICT_cover_params_t parameters,
|
489
|
+
size_t maxDictSize) {
|
490
|
+
/* k and d are required parameters */
|
491
|
+
if (parameters.d == 0 || parameters.k == 0) {
|
492
|
+
return 0;
|
493
|
+
}
|
494
|
+
/* k <= maxDictSize */
|
495
|
+
if (parameters.k > maxDictSize) {
|
496
|
+
return 0;
|
497
|
+
}
|
498
|
+
/* d <= k */
|
499
|
+
if (parameters.d > parameters.k) {
|
500
|
+
return 0;
|
501
|
+
}
|
502
|
+
/* 0 < splitPoint <= 1 */
|
503
|
+
if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){
|
504
|
+
return 0;
|
505
|
+
}
|
506
|
+
return 1;
|
507
|
+
}
|
508
|
+
|
509
|
+
/**
|
510
|
+
* Clean up a context initialized with `COVER_ctx_init()`.
|
511
|
+
*/
|
512
|
+
static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
|
513
|
+
if (!ctx) {
|
514
|
+
return;
|
515
|
+
}
|
516
|
+
if (ctx->suffix) {
|
517
|
+
free(ctx->suffix);
|
518
|
+
ctx->suffix = NULL;
|
519
|
+
}
|
520
|
+
if (ctx->freqs) {
|
521
|
+
free(ctx->freqs);
|
522
|
+
ctx->freqs = NULL;
|
523
|
+
}
|
524
|
+
if (ctx->dmerAt) {
|
525
|
+
free(ctx->dmerAt);
|
526
|
+
ctx->dmerAt = NULL;
|
527
|
+
}
|
528
|
+
if (ctx->offsets) {
|
529
|
+
free(ctx->offsets);
|
530
|
+
ctx->offsets = NULL;
|
531
|
+
}
|
532
|
+
}
|
533
|
+
|
534
|
+
/**
 * Prepare a context for dictionary building.
 * The context is only dependent on the parameter `d` and can used multiple
 * times.
 * Returns 0 on success or error code on error.
 * The context must be destroyed with `COVER_ctx_destroy()`.
 *
 * @param ctx            Output context; zeroed then filled on success.
 * @param samplesBuffer  Concatenated samples (not copied; must outlive ctx).
 * @param samplesSizes   Size of each sample in samplesBuffer.
 * @param nbSamples      Number of samples.
 * @param d              The dmer length to sort/group by.
 * @param splitPoint     Fraction of samples used for training; 1.0 means
 *                       train and test on the full set.
 */
static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                             const size_t *samplesSizes, unsigned nbSamples,
                             unsigned d, double splitPoint) {
  const BYTE *const samples = (const BYTE *)samplesBuffer;
  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
  /* Split samples into testing and training sets */
  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
  const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
  const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
  /* Checks */
  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
      totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
                 (unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
    return ERROR(srcSize_wrong);
  }
  /* Check if there are at least 5 training samples */
  if (nbTrainSamples < 5) {
    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
    return ERROR(srcSize_wrong);
  }
  /* Check if there's testing sample */
  if (nbTestSamples < 1) {
    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
    return ERROR(srcSize_wrong);
  }
  /* Zero the context */
  memset(ctx, 0, sizeof(*ctx));
  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
               (unsigned)trainingSamplesSize);
  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
               (unsigned)testSamplesSize);
  ctx->samples = samples;
  ctx->samplesSizes = samplesSizes;
  ctx->nbSamples = nbSamples;
  ctx->nbTrainSamples = nbTrainSamples;
  ctx->nbTestSamples = nbTestSamples;
  /* Partial suffix array */
  ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
  ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* Maps index to the dmerID */
  ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* The offsets of each file */
  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
    COVER_ctx_destroy(ctx);
    return ERROR(memory_allocation);
  }
  ctx->freqs = NULL;
  ctx->d = d;

  /* Fill offsets from the samplesSizes: offsets[i] is the cumulative size of
   * samples 0..i-1, so offsets[nbSamples] is the total size. */
  {
    U32 i;
    ctx->offsets[0] = 0;
    for (i = 1; i <= nbSamples; ++i) {
      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
    }
  }
  DISPLAYLEVEL(2, "Constructing partial suffix array\n");
  {
    /* suffix is a partial suffix array.
     * It only sorts suffixes by their first parameters.d bytes.
     * The sort is stable, so each dmer group is sorted by position in input.
     */
    U32 i;
    for (i = 0; i < ctx->suffixSize; ++i) {
      ctx->suffix[i] = i;
    }
    /* qsort doesn't take an opaque pointer, so pass as a global.
     * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
     * NOTE(review): the global makes this function non-reentrant; callers
     * must not run two inits concurrently.
     */
    g_coverCtx = ctx;
#if defined(__OpenBSD__)
    mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
#else
    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
#endif
  }
  DISPLAYLEVEL(2, "Computing frequencies\n");
  /* For each dmer group (group of positions with the same first d bytes):
   * 1. For each position we set dmerAt[position] = dmerID. The dmerID is
   *    (groupBeginPtr - suffix). This allows us to go from position to
   *    dmerID so we can look up values in freq.
   * 2. We calculate how many samples the dmer occurs in and save it in
   *    freqs[dmerId].
   */
  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
                (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
  /* The suffix array buffer is repurposed in-place as the frequency table;
   * ownership transfers to ctx->freqs (freed by COVER_ctx_destroy). */
  ctx->freqs = ctx->suffix;
  ctx->suffix = NULL;
  return 0;
}
/* Emit a warning when the training corpus is small relative to the requested
 * dictionary size (ratio below 10), since that tends to produce a subpar
 * dictionary. No-op otherwise. */
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
{
  const double ratio = (double)nbDmers / maxDictSize;
  if (ratio >= 10) {
    return;
  }
  LOCALDISPLAYLEVEL(
      displayLevel, 1,
      "WARNING: The maximum dictionary size %u is too large "
      "compared to the source size %u! "
      "size(source)/size(dictionary) = %f, but it should be >= "
      "10! This may lead to a subpar dictionary! We recommend "
      "training on sources at least 10x, and preferably 100x "
      "the size of the dictionary! \n",
      (U32)maxDictSize, (U32)nbDmers, ratio);
}
/* Divide nbDmers into epochs for segment selection.
 * Starts from roughly (maxDictSize / k / passes) epochs, then, if that makes
 * each epoch smaller than 10 segments' worth of dmers (k * 10), clamps the
 * epoch size up and recomputes the count. */
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
                                       U32 nbDmers, U32 k, U32 passes)
{
  const U32 smallestUsefulEpoch = k * 10;
  COVER_epoch_info_t result;
  result.num = MAX(1, maxDictSize / k / passes);
  result.size = nbDmers / result.num;
  if (result.size < smallestUsefulEpoch) {
    result.size = MIN(smallestUsefulEpoch, nbDmers);
    result.num = nbDmers / result.size;
  }
  assert(result.size * result.num <= nbDmers);
  return result;
}
/**
 * Given the prepared context build the dictionary.
 *
 * Fills dictBuffer from the back with the best segment from each epoch,
 * cycling through epochs until the buffer is full or content runs out.
 *
 * @return The tail offset: the dictionary content occupies
 *         dictBuffer[tail .. dictBufferCapacity).
 */
static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                    COVER_map_t *activeDmers, void *dictBuffer,
                                    size_t dictBufferCapacity,
                                    ZDICT_cover_params_t parameters) {
  BYTE *const dict = (BYTE *)dictBuffer;
  size_t tail = dictBufferCapacity;
  /* Divide the data into epochs. We will select one segment from each epoch. */
  const COVER_epoch_info_t epochs = COVER_computeEpochs(
      (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
  /* Give up after this many consecutive zero-score epochs. */
  const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
  size_t zeroScoreRun = 0;
  size_t epoch;
  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
               (U32)epochs.num, (U32)epochs.size);
  /* Loop through the epochs until there are no more segments or the dictionary
   * is full.
   */
  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
    const U32 epochBegin = (U32)(epoch * epochs.size);
    const U32 epochEnd = epochBegin + epochs.size;
    size_t segmentSize;
    /* Select a segment */
    COVER_segment_t segment = COVER_selectSegment(
        ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
    /* If the segment covers no dmers, then we are out of content.
     * There may be new content in other epochs, for continue for some time.
     */
    if (segment.score == 0) {
      if (++zeroScoreRun >= maxZeroScoreRun) {
        break;
      }
      continue;
    }
    zeroScoreRun = 0;
    /* Trim the segment if necessary and if it is too small then we are done.
     * segment.end - segment.begin counts dmers; + d - 1 converts to bytes. */
    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
    if (segmentSize < parameters.d) {
      break;
    }
    /* We fill the dictionary from the back to allow the best segments to be
     * referenced with the smallest offsets.
     */
    tail -= segmentSize;
    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
    DISPLAYUPDATE(
        2, "\r%u%%       ",
        (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
  }
  DISPLAYLEVEL(2, "\r%79s\r", "");
  return tail;
}
/**
 * Train a dictionary with the COVER algorithm using fixed parameters.
 *
 * @param dictBuffer          Output buffer for the dictionary.
 * @param dictBufferCapacity  Capacity of dictBuffer; must be at least
 *                            ZDICT_DICTSIZE_MIN.
 * @param samplesBuffer       Concatenated training samples.
 * @param samplesSizes        Size of each sample.
 * @param nbSamples           Number of samples (must be > 0).
 * @param parameters          COVER parameters; k and d are required.
 *                            splitPoint is forced to 1.0 (no test split here).
 * @return The dictionary size on success, or a ZSTD error code.
 */
ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
    void *dictBuffer, size_t dictBufferCapacity,
    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
    ZDICT_cover_params_t parameters)
{
    BYTE* const dict = (BYTE*)dictBuffer;
    COVER_ctx_t ctx;
    COVER_map_t activeDmers;
    /* This entry point trains and finalizes on the full sample set. */
    parameters.splitPoint = 1.0;
    /* Initialize global data */
    g_displayLevel = parameters.zParams.notificationLevel;
    /* Checks */
    if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
      DISPLAYLEVEL(1, "Cover parameters incorrect\n");
      return ERROR(parameter_outOfBound);
    }
    if (nbSamples == 0) {
      DISPLAYLEVEL(1, "Cover must have at least one input file\n");
      return ERROR(srcSize_wrong);
    }
    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
      DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                   ZDICT_DICTSIZE_MIN);
      return ERROR(dstSize_tooSmall);
    }
    /* Initialize context and activeDmers */
    {
      size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
                            parameters.d, parameters.splitPoint);
      if (ZSTD_isError(initVal)) {
        return initVal;
      }
    }
    COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
    if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
      DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
      COVER_ctx_destroy(&ctx);
      return ERROR(memory_allocation);
    }

    DISPLAYLEVEL(2, "Building dictionary\n");
    {
      /* Dictionary content occupies dict[tail .. dictBufferCapacity). */
      const size_t tail =
          COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
                                dictBufferCapacity, parameters);
      const size_t dictionarySize = ZDICT_finalizeDictionary(
          dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
          samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
      if (!ZSTD_isError(dictionarySize)) {
        DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
                     (unsigned)dictionarySize);
      }
      /* Cleanup runs on both success and error paths; an error code from
       * ZDICT_finalizeDictionary is returned unchanged. */
      COVER_ctx_destroy(&ctx);
      COVER_map_destroy(&activeDmers);
      return dictionarySize;
    }
}
/**
 * Score a candidate dictionary by compressing the check samples with it.
 *
 * Compresses each check sample with a CDict built from `dict` and returns
 * dictBufferCapacity plus the sum of compressed sizes, or a ZSTD error code
 * on any failure (allocation or compression).
 *
 * When splitPoint < 1.0 only the test partition (samples nbTrainSamples..
 * nbSamples) is compressed; otherwise all samples are used.
 */
size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
                                      const size_t *samplesSizes, const BYTE *samples,
                                      size_t *offsets,
                                      size_t nbTrainSamples, size_t nbSamples,
                                      BYTE *const dict, size_t dictBufferCapacity) {
  size_t totalCompressedSize = ERROR(GENERIC);
  /* Pointers */
  ZSTD_CCtx *cctx;
  ZSTD_CDict *cdict;
  void *dst;
  /* Local variables */
  size_t dstCapacity;
  size_t i;
  /* Allocate dst with enough space to compress the maximum sized sample */
  {
    size_t maxSampleSize = 0;
    i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
    for (; i < nbSamples; ++i) {
      maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
    }
    dstCapacity = ZSTD_compressBound(maxSampleSize);
    dst = malloc(dstCapacity);
  }
  /* Create the cctx and cdict */
  cctx = ZSTD_createCCtx();
  cdict = ZSTD_createCDict(dict, dictBufferCapacity,
                           parameters.zParams.compressionLevel);
  if (!dst || !cctx || !cdict) {
    /* totalCompressedSize is still ERROR(GENERIC) here. */
    goto _compressCleanup;
  }
  /* Compress each sample and sum their sizes (or error) */
  totalCompressedSize = dictBufferCapacity;
  i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
  for (; i < nbSamples; ++i) {
    const size_t size = ZSTD_compress_usingCDict(
        cctx, dst, dstCapacity, samples + offsets[i],
        samplesSizes[i], cdict);
    if (ZSTD_isError(size)) {
      totalCompressedSize = size;
      goto _compressCleanup;
    }
    totalCompressedSize += size;
  }
_compressCleanup:
  /* ZSTD_free*() and free() all accept NULL, so no guards are needed. */
  ZSTD_freeCCtx(cctx);
  ZSTD_freeCDict(cdict);
  free(dst);
  return totalCompressedSize;
}
/**
|
841
|
+
* Initialize the `COVER_best_t`.
|
842
|
+
*/
|
843
|
+
void COVER_best_init(COVER_best_t *best) {
|
844
|
+
if (best==NULL) return; /* compatible with init on NULL */
|
845
|
+
(void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
|
846
|
+
(void)ZSTD_pthread_cond_init(&best->cond, NULL);
|
847
|
+
best->liveJobs = 0;
|
848
|
+
best->dict = NULL;
|
849
|
+
best->dictSize = 0;
|
850
|
+
best->compressedSize = (size_t)-1;
|
851
|
+
memset(&best->parameters, 0, sizeof(best->parameters));
|
852
|
+
}
|
853
|
+
|
854
|
+
/**
|
855
|
+
* Wait until liveJobs == 0.
|
856
|
+
*/
|
857
|
+
void COVER_best_wait(COVER_best_t *best) {
|
858
|
+
if (!best) {
|
859
|
+
return;
|
860
|
+
}
|
861
|
+
ZSTD_pthread_mutex_lock(&best->mutex);
|
862
|
+
while (best->liveJobs != 0) {
|
863
|
+
ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
|
864
|
+
}
|
865
|
+
ZSTD_pthread_mutex_unlock(&best->mutex);
|
866
|
+
}
|
867
|
+
|
868
|
+
/**
|
869
|
+
* Call COVER_best_wait() and then destroy the COVER_best_t.
|
870
|
+
*/
|
871
|
+
void COVER_best_destroy(COVER_best_t *best) {
|
872
|
+
if (!best) {
|
873
|
+
return;
|
874
|
+
}
|
875
|
+
COVER_best_wait(best);
|
876
|
+
if (best->dict) {
|
877
|
+
free(best->dict);
|
878
|
+
}
|
879
|
+
ZSTD_pthread_mutex_destroy(&best->mutex);
|
880
|
+
ZSTD_pthread_cond_destroy(&best->cond);
|
881
|
+
}
|
882
|
+
|
883
|
+
/**
|
884
|
+
* Called when a thread is about to be launched.
|
885
|
+
* Increments liveJobs.
|
886
|
+
*/
|
887
|
+
void COVER_best_start(COVER_best_t *best) {
|
888
|
+
if (!best) {
|
889
|
+
return;
|
890
|
+
}
|
891
|
+
ZSTD_pthread_mutex_lock(&best->mutex);
|
892
|
+
++best->liveJobs;
|
893
|
+
ZSTD_pthread_mutex_unlock(&best->mutex);
|
894
|
+
}
|
895
|
+
|
896
|
+
/**
 * Called when a thread finishes executing, both on error or success.
 * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
 * If this dictionary is the best so far save it and its parameters.
 *
 * Does NOT take ownership of selection; the caller still frees it
 * (the dictionary bytes are copied into best->dict under the mutex).
 */
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
                       COVER_dictSelection_t selection) {
  void* dict = selection.dictContent;
  size_t compressedSize = selection.totalCompressedSize;
  size_t dictSize = selection.dictSize;
  if (!best) {
    return;
  }
  {
    size_t liveJobs;
    ZSTD_pthread_mutex_lock(&best->mutex);
    --best->liveJobs;
    liveJobs = best->liveJobs;
    /* If the new dictionary is better */
    if (compressedSize < best->compressedSize) {
      /* Allocate space if necessary */
      if (!best->dict || best->dictSize < dictSize) {
        if (best->dict) {
          free(best->dict);
        }
        best->dict = malloc(dictSize);
        if (!best->dict) {
          /* Allocation failed: record the error, then wake waiters before
           * unlocking so COVER_best_wait() is not left hanging. */
          best->compressedSize = ERROR(GENERIC);
          best->dictSize = 0;
          ZSTD_pthread_cond_signal(&best->cond);
          ZSTD_pthread_mutex_unlock(&best->mutex);
          return;
        }
      }
      /* Save the dictionary, parameters, and size */
      if (dict) {
        memcpy(best->dict, dict, dictSize);
        best->dictSize = dictSize;
        best->parameters = parameters;
        best->compressedSize = compressedSize;
      }
    }
    /* This was the last live job: wake every waiter. */
    if (liveJobs == 0) {
      ZSTD_pthread_cond_broadcast(&best->cond);
    }
    ZSTD_pthread_mutex_unlock(&best->mutex);
  }
}
/* Build a selection that encodes failure: no content, zero size, and the
 * error code stored in totalCompressedSize. */
COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
  COVER_dictSelection_t errorSelection;
  errorSelection.dictContent = NULL;
  errorSelection.dictSize = 0;
  errorSelection.totalCompressedSize = error;
  return errorSelection;
}
/* A selection is an error when it carries no content or its recorded
 * compressed size is a ZSTD error code. */
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
  if (selection.dictContent == NULL) {
    return 1;
  }
  return ZSTD_isError(selection.totalCompressedSize) ? 1 : 0;
}
/* Release the buffer owned by a selection; free(NULL) is a no-op, so this is
 * safe on error selections too. */
void COVER_dictSelectionFree(COVER_dictSelection_t selection) {
  free(selection.dictContent);
}
/**
 * Finalize the raw dictionary content and, if shrinking is enabled, search
 * for the smallest dictionary whose compressed size stays within
 * shrinkDictMaxRegression percent of the largest dictionary's.
 *
 * Ownership: allocates two scratch buffers; exactly one is handed to the
 * returned selection (caller frees via COVER_dictSelectionFree), the other is
 * freed here. On error both are freed and an error selection is returned.
 *
 * @param totalCompressedSize  Incoming value is unused; it is overwritten by
 *                             the first compression check below.
 */
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
      size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
      size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {

  size_t largestDict = 0;
  size_t largestCompressed = 0;
  /* Candidate dictionaries are taken from the END of the content, so track
   * its end for the shrink loop below. */
  BYTE* customDictContentEnd = customDictContent + dictContentSize;

  BYTE * largestDictbuffer = (BYTE *)malloc(dictBufferCapacity);
  BYTE * candidateDictBuffer = (BYTE *)malloc(dictBufferCapacity);
  /* e.g. shrinkDictMaxRegression == 5 allows up to 5% larger compressed output. */
  double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;

  if (!largestDictbuffer || !candidateDictBuffer) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(dictContentSize);
  }

  /* Initial dictionary size and compressed size */
  memcpy(largestDictbuffer, customDictContent, dictContentSize);
  dictContentSize = ZDICT_finalizeDictionary(
    largestDictbuffer, dictBufferCapacity, customDictContent, dictContentSize,
    samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);

  if (ZDICT_isError(dictContentSize)) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(dictContentSize);
  }

  totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                       samplesBuffer, offsets,
                                                       nbCheckSamples, nbSamples,
                                                       largestDictbuffer, dictContentSize);

  if (ZSTD_isError(totalCompressedSize)) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(totalCompressedSize);
  }

  if (params.shrinkDict == 0) {
    /* No shrinking requested: the largest dictionary wins outright.
     * largestDictbuffer is transferred to the selection. */
    COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
    free(candidateDictBuffer);
    return selection;
  }

  largestDict = dictContentSize;
  largestCompressed = totalCompressedSize;
  dictContentSize = ZDICT_DICTSIZE_MIN;

  /* Largest dict is initially at least ZDICT_DICTSIZE_MIN.
   * Try doubling candidate sizes; accept the first one whose compressed size
   * is within the regression tolerance of the largest dictionary's. */
  while (dictContentSize < largestDict) {
    memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
    dictContentSize = ZDICT_finalizeDictionary(
      candidateDictBuffer, dictBufferCapacity, customDictContentEnd - dictContentSize, dictContentSize,
      samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);

    if (ZDICT_isError(dictContentSize)) {
      free(largestDictbuffer);
      free(candidateDictBuffer);
      return COVER_dictSelectionError(dictContentSize);

    }

    totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                         samplesBuffer, offsets,
                                                         nbCheckSamples, nbSamples,
                                                         candidateDictBuffer, dictContentSize);

    if (ZSTD_isError(totalCompressedSize)) {
      free(largestDictbuffer);
      free(candidateDictBuffer);
      return COVER_dictSelectionError(totalCompressedSize);
    }

    if (totalCompressedSize <= largestCompressed * regressionTolerance) {
      /* Small-enough regression: candidateDictBuffer is transferred to the
       * selection and the other buffer is freed. */
      COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
      free(largestDictbuffer);
      return selection;
    }
    dictContentSize *= 2;
  }
  /* No smaller dictionary was acceptable: fall back to the largest. */
  dictContentSize = largestDict;
  totalCompressedSize = largestCompressed;
  {
    COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
    free(candidateDictBuffer);
    return selection;
  }
}
/**
 * Parameters for COVER_tryParameters().
 * Heap-allocated by the caller; ownership passes to COVER_tryParameters(),
 * which frees it.
 */
typedef struct COVER_tryParameters_data_s {
  const COVER_ctx_t *ctx;          /* Shared read-only context (not owned) */
  COVER_best_t *best;              /* Collects the best result across jobs */
  size_t dictBufferCapacity;       /* Maximum dictionary size in bytes */
  ZDICT_cover_params_t parameters; /* The k/d parameter combination to try */
} COVER_tryParameters_data_t;
/**
|
1061
|
+
* Tries a set of parameters and updates the COVER_best_t with the results.
|
1062
|
+
* This function is thread safe if zstd is compiled with multithreaded support.
|
1063
|
+
* It takes its parameters as an *OWNING* opaque pointer to support threading.
|
1064
|
+
*/
|
1065
|
+
static void COVER_tryParameters(void *opaque) {
|
1066
|
+
/* Save parameters as local variables */
|
1067
|
+
COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
|
1068
|
+
const COVER_ctx_t *const ctx = data->ctx;
|
1069
|
+
const ZDICT_cover_params_t parameters = data->parameters;
|
1070
|
+
size_t dictBufferCapacity = data->dictBufferCapacity;
|
1071
|
+
size_t totalCompressedSize = ERROR(GENERIC);
|
1072
|
+
/* Allocate space for hash table, dict, and freqs */
|
1073
|
+
COVER_map_t activeDmers;
|
1074
|
+
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
|
1075
|
+
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
|
1076
|
+
U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
|
1077
|
+
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
|
1078
|
+
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
|
1079
|
+
goto _cleanup;
|
1080
|
+
}
|
1081
|
+
if (!dict || !freqs) {
|
1082
|
+
DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
|
1083
|
+
goto _cleanup;
|
1084
|
+
}
|
1085
|
+
/* Copy the frequencies because we need to modify them */
|
1086
|
+
memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
|
1087
|
+
/* Build the dictionary */
|
1088
|
+
{
|
1089
|
+
const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
|
1090
|
+
dictBufferCapacity, parameters);
|
1091
|
+
selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
|
1092
|
+
ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
|
1093
|
+
totalCompressedSize);
|
1094
|
+
|
1095
|
+
if (COVER_dictSelectionIsError(selection)) {
|
1096
|
+
DISPLAYLEVEL(1, "Failed to select dictionary\n");
|
1097
|
+
goto _cleanup;
|
1098
|
+
}
|
1099
|
+
}
|
1100
|
+
_cleanup:
|
1101
|
+
free(dict);
|
1102
|
+
COVER_best_finish(data->best, parameters, selection);
|
1103
|
+
free(data);
|
1104
|
+
COVER_map_destroy(&activeDmers);
|
1105
|
+
COVER_dictSelectionFree(selection);
|
1106
|
+
if (freqs) {
|
1107
|
+
free(freqs);
|
1108
|
+
}
|
1109
|
+
}
|
1110
|
+
|
1111
|
+
ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
1112
|
+
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
|
1113
|
+
const size_t *samplesSizes, unsigned nbSamples,
|
1114
|
+
ZDICT_cover_params_t *parameters) {
|
1115
|
+
/* constants */
|
1116
|
+
const unsigned nbThreads = parameters->nbThreads;
|
1117
|
+
const double splitPoint =
|
1118
|
+
parameters->splitPoint <= 0.0 ? COVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
|
1119
|
+
const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
|
1120
|
+
const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
|
1121
|
+
const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
|
1122
|
+
const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
|
1123
|
+
const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
|
1124
|
+
const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
|
1125
|
+
const unsigned kIterations =
|
1126
|
+
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
|
1127
|
+
const unsigned shrinkDict = 0;
|
1128
|
+
/* Local variables */
|
1129
|
+
const int displayLevel = parameters->zParams.notificationLevel;
|
1130
|
+
unsigned iteration = 1;
|
1131
|
+
unsigned d;
|
1132
|
+
unsigned k;
|
1133
|
+
COVER_best_t best;
|
1134
|
+
POOL_ctx *pool = NULL;
|
1135
|
+
int warned = 0;
|
1136
|
+
|
1137
|
+
/* Checks */
|
1138
|
+
if (splitPoint <= 0 || splitPoint > 1) {
|
1139
|
+
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
|
1140
|
+
return ERROR(parameter_outOfBound);
|
1141
|
+
}
|
1142
|
+
if (kMinK < kMaxD || kMaxK < kMinK) {
|
1143
|
+
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
|
1144
|
+
return ERROR(parameter_outOfBound);
|
1145
|
+
}
|
1146
|
+
if (nbSamples == 0) {
|
1147
|
+
DISPLAYLEVEL(1, "Cover must have at least one input file\n");
|
1148
|
+
return ERROR(srcSize_wrong);
|
1149
|
+
}
|
1150
|
+
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
1151
|
+
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
1152
|
+
ZDICT_DICTSIZE_MIN);
|
1153
|
+
return ERROR(dstSize_tooSmall);
|
1154
|
+
}
|
1155
|
+
if (nbThreads > 1) {
|
1156
|
+
pool = POOL_create(nbThreads, 1);
|
1157
|
+
if (!pool) {
|
1158
|
+
return ERROR(memory_allocation);
|
1159
|
+
}
|
1160
|
+
}
|
1161
|
+
/* Initialization */
|
1162
|
+
COVER_best_init(&best);
|
1163
|
+
/* Turn down global display level to clean up display at level 2 and below */
|
1164
|
+
g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
|
1165
|
+
/* Loop through d first because each new value needs a new context */
|
1166
|
+
LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
|
1167
|
+
kIterations);
|
1168
|
+
for (d = kMinD; d <= kMaxD; d += 2) {
|
1169
|
+
/* Initialize the context for this value of d */
|
1170
|
+
COVER_ctx_t ctx;
|
1171
|
+
LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
|
1172
|
+
{
|
1173
|
+
const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
|
1174
|
+
if (ZSTD_isError(initVal)) {
|
1175
|
+
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
|
1176
|
+
COVER_best_destroy(&best);
|
1177
|
+
POOL_free(pool);
|
1178
|
+
return initVal;
|
1179
|
+
}
|
1180
|
+
}
|
1181
|
+
if (!warned) {
|
1182
|
+
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
|
1183
|
+
warned = 1;
|
1184
|
+
}
|
1185
|
+
/* Loop through k reusing the same context */
|
1186
|
+
for (k = kMinK; k <= kMaxK; k += kStepSize) {
|
1187
|
+
/* Prepare the arguments */
|
1188
|
+
COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
|
1189
|
+
sizeof(COVER_tryParameters_data_t));
|
1190
|
+
LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
|
1191
|
+
if (!data) {
|
1192
|
+
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
|
1193
|
+
COVER_best_destroy(&best);
|
1194
|
+
COVER_ctx_destroy(&ctx);
|
1195
|
+
POOL_free(pool);
|
1196
|
+
return ERROR(memory_allocation);
|
1197
|
+
}
|
1198
|
+
data->ctx = &ctx;
|
1199
|
+
data->best = &best;
|
1200
|
+
data->dictBufferCapacity = dictBufferCapacity;
|
1201
|
+
data->parameters = *parameters;
|
1202
|
+
data->parameters.k = k;
|
1203
|
+
data->parameters.d = d;
|
1204
|
+
data->parameters.splitPoint = splitPoint;
|
1205
|
+
data->parameters.steps = kSteps;
|
1206
|
+
data->parameters.shrinkDict = shrinkDict;
|
1207
|
+
data->parameters.zParams.notificationLevel = g_displayLevel;
|
1208
|
+
/* Check the parameters */
|
1209
|
+
if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
|
1210
|
+
DISPLAYLEVEL(1, "Cover parameters incorrect\n");
|
1211
|
+
free(data);
|
1212
|
+
continue;
|
1213
|
+
}
|
1214
|
+
/* Call the function and pass ownership of data to it */
|
1215
|
+
COVER_best_start(&best);
|
1216
|
+
if (pool) {
|
1217
|
+
POOL_add(pool, &COVER_tryParameters, data);
|
1218
|
+
} else {
|
1219
|
+
COVER_tryParameters(data);
|
1220
|
+
}
|
1221
|
+
/* Print status */
|
1222
|
+
LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
|
1223
|
+
(unsigned)((iteration * 100) / kIterations));
|
1224
|
+
++iteration;
|
1225
|
+
}
|
1226
|
+
COVER_best_wait(&best);
|
1227
|
+
COVER_ctx_destroy(&ctx);
|
1228
|
+
}
|
1229
|
+
LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
|
1230
|
+
/* Fill the output buffer and parameters with output of the best parameters */
|
1231
|
+
{
|
1232
|
+
const size_t dictSize = best.dictSize;
|
1233
|
+
if (ZSTD_isError(best.compressedSize)) {
|
1234
|
+
const size_t compressedSize = best.compressedSize;
|
1235
|
+
COVER_best_destroy(&best);
|
1236
|
+
POOL_free(pool);
|
1237
|
+
return compressedSize;
|
1238
|
+
}
|
1239
|
+
*parameters = best.parameters;
|
1240
|
+
memcpy(dictBuffer, best.dict, dictSize);
|
1241
|
+
COVER_best_destroy(&best);
|
1242
|
+
POOL_free(pool);
|
1243
|
+
return dictSize;
|
1244
|
+
}
|
1245
|
+
}
|