extzstd 0.0.3.CONCEPT → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/HISTORY.ja.md +39 -0
- data/LICENSE +6 -6
- data/README.md +26 -45
- data/contrib/zstd/CHANGELOG +555 -0
- data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
- data/contrib/zstd/CONTRIBUTING.md +392 -0
- data/contrib/zstd/COPYING +339 -0
- data/contrib/zstd/LICENSE +13 -9
- data/contrib/zstd/Makefile +414 -0
- data/contrib/zstd/README.md +170 -45
- data/contrib/zstd/TESTING.md +44 -0
- data/contrib/zstd/appveyor.yml +289 -0
- data/contrib/zstd/lib/BUCK +234 -0
- data/contrib/zstd/lib/Makefile +354 -0
- data/contrib/zstd/lib/README.md +179 -0
- data/contrib/zstd/{common → lib/common}/bitstream.h +170 -130
- data/contrib/zstd/lib/common/compiler.h +175 -0
- data/contrib/zstd/lib/common/cpu.h +215 -0
- data/contrib/zstd/lib/common/debug.c +24 -0
- data/contrib/zstd/lib/common/debug.h +114 -0
- data/contrib/zstd/{common → lib/common}/entropy_common.c +79 -94
- data/contrib/zstd/lib/common/error_private.c +55 -0
- data/contrib/zstd/lib/common/error_private.h +80 -0
- data/contrib/zstd/{common → lib/common}/fse.h +153 -93
- data/contrib/zstd/{common → lib/common}/fse_decompress.c +37 -82
- data/contrib/zstd/lib/common/huf.h +340 -0
- data/contrib/zstd/{common → lib/common}/mem.h +154 -78
- data/contrib/zstd/lib/common/pool.c +344 -0
- data/contrib/zstd/lib/common/pool.h +84 -0
- data/contrib/zstd/lib/common/threading.c +121 -0
- data/contrib/zstd/lib/common/threading.h +155 -0
- data/contrib/zstd/{common → lib/common}/xxhash.c +85 -75
- data/contrib/zstd/{common → lib/common}/xxhash.h +85 -73
- data/contrib/zstd/lib/common/zstd_common.c +83 -0
- data/contrib/zstd/lib/common/zstd_errors.h +94 -0
- data/contrib/zstd/lib/common/zstd_internal.h +447 -0
- data/contrib/zstd/{compress → lib/compress}/fse_compress.c +194 -303
- data/contrib/zstd/lib/compress/hist.c +183 -0
- data/contrib/zstd/lib/compress/hist.h +75 -0
- data/contrib/zstd/lib/compress/huf_compress.c +798 -0
- data/contrib/zstd/lib/compress/zstd_compress.c +4278 -0
- data/contrib/zstd/lib/compress/zstd_compress_internal.h +1125 -0
- data/contrib/zstd/lib/compress/zstd_compress_literals.c +158 -0
- data/contrib/zstd/lib/compress/zstd_compress_literals.h +29 -0
- data/contrib/zstd/lib/compress/zstd_compress_sequences.c +419 -0
- data/contrib/zstd/lib/compress/zstd_compress_sequences.h +54 -0
- data/contrib/zstd/lib/compress/zstd_compress_superblock.c +845 -0
- data/contrib/zstd/lib/compress/zstd_compress_superblock.h +32 -0
- data/contrib/zstd/lib/compress/zstd_cwksp.h +525 -0
- data/contrib/zstd/lib/compress/zstd_double_fast.c +521 -0
- data/contrib/zstd/lib/compress/zstd_double_fast.h +38 -0
- data/contrib/zstd/lib/compress/zstd_fast.c +496 -0
- data/contrib/zstd/lib/compress/zstd_fast.h +37 -0
- data/contrib/zstd/lib/compress/zstd_lazy.c +1138 -0
- data/contrib/zstd/lib/compress/zstd_lazy.h +67 -0
- data/contrib/zstd/lib/compress/zstd_ldm.c +619 -0
- data/contrib/zstd/lib/compress/zstd_ldm.h +110 -0
- data/contrib/zstd/lib/compress/zstd_opt.c +1200 -0
- data/contrib/zstd/lib/compress/zstd_opt.h +56 -0
- data/contrib/zstd/lib/compress/zstdmt_compress.c +2143 -0
- data/contrib/zstd/lib/compress/zstdmt_compress.h +192 -0
- data/contrib/zstd/lib/decompress/huf_decompress.c +1248 -0
- data/contrib/zstd/lib/decompress/zstd_ddict.c +244 -0
- data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
- data/contrib/zstd/lib/decompress/zstd_decompress.c +1885 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1432 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_block.h +59 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +189 -0
- data/contrib/zstd/{common → lib/deprecated}/zbuff.h +86 -69
- data/contrib/zstd/lib/deprecated/zbuff_common.c +26 -0
- data/contrib/zstd/lib/deprecated/zbuff_compress.c +147 -0
- data/contrib/zstd/lib/deprecated/zbuff_decompress.c +75 -0
- data/contrib/zstd/lib/dictBuilder/cover.c +1236 -0
- data/contrib/zstd/lib/dictBuilder/cover.h +157 -0
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.c +3 -3
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.h +5 -5
- data/contrib/zstd/lib/dictBuilder/fastcover.c +757 -0
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/zdict.c +437 -347
- data/contrib/zstd/lib/dictBuilder/zdict.h +305 -0
- data/contrib/zstd/lib/legacy/zstd_legacy.h +415 -0
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.c +272 -292
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.h +26 -32
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.c +162 -392
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.h +26 -32
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.c +162 -391
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.h +27 -33
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.c +195 -604
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.h +26 -32
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.c +300 -575
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.h +22 -31
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.c +165 -592
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.h +54 -67
- data/contrib/zstd/lib/legacy/zstd_v07.c +4541 -0
- data/contrib/zstd/lib/legacy/zstd_v07.h +187 -0
- data/contrib/zstd/lib/libzstd.pc.in +15 -0
- data/contrib/zstd/lib/zstd.h +2090 -0
- data/ext/depend +2 -0
- data/ext/extconf.rb +18 -5
- data/ext/extzstd.c +296 -214
- data/ext/extzstd.h +81 -36
- data/ext/extzstd_nogvls.h +0 -117
- data/ext/extzstd_stream.c +622 -0
- data/ext/libzstd_conf.h +8 -0
- data/ext/zstd_common.c +11 -0
- data/ext/zstd_compress.c +15 -0
- data/ext/zstd_decompress.c +6 -0
- data/ext/zstd_dictbuilder.c +10 -0
- data/ext/zstd_dictbuilder_fastcover.c +3 -0
- data/ext/zstd_legacy_v01.c +3 -1
- data/ext/zstd_legacy_v02.c +3 -1
- data/ext/zstd_legacy_v03.c +3 -1
- data/ext/zstd_legacy_v04.c +3 -1
- data/ext/zstd_legacy_v05.c +3 -1
- data/ext/zstd_legacy_v06.c +3 -1
- data/ext/zstd_legacy_v07.c +3 -0
- data/gemstub.rb +27 -21
- data/lib/extzstd.rb +82 -161
- data/lib/extzstd/version.rb +1 -1
- data/test/test_basic.rb +19 -6
- metadata +127 -59
- data/contrib/zstd/common/error_private.h +0 -125
- data/contrib/zstd/common/error_public.h +0 -77
- data/contrib/zstd/common/huf.h +0 -228
- data/contrib/zstd/common/zstd.h +0 -475
- data/contrib/zstd/common/zstd_common.c +0 -91
- data/contrib/zstd/common/zstd_internal.h +0 -238
- data/contrib/zstd/compress/huf_compress.c +0 -577
- data/contrib/zstd/compress/zbuff_compress.c +0 -327
- data/contrib/zstd/compress/zstd_compress.c +0 -3074
- data/contrib/zstd/compress/zstd_opt.h +0 -1046
- data/contrib/zstd/decompress/huf_decompress.c +0 -894
- data/contrib/zstd/decompress/zbuff_decompress.c +0 -294
- data/contrib/zstd/decompress/zstd_decompress.c +0 -1362
- data/contrib/zstd/dictBuilder/zdict.h +0 -113
- data/contrib/zstd/legacy/zstd_legacy.h +0 -140
- data/ext/extzstd_buffered.c +0 -265
- data/ext/zstd_amalgam.c +0 -18
@@ -1,40 +1,20 @@
|
|
1
1
|
/*
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
* Redistributions of source code must retain the above copyright
|
12
|
-
notice, this list of conditions and the following disclaimer.
|
13
|
-
* Redistributions in binary form must reproduce the above
|
14
|
-
copyright notice, this list of conditions and the following disclaimer
|
15
|
-
in the documentation and/or other materials provided with the
|
16
|
-
distribution.
|
17
|
-
|
18
|
-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
19
|
-
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
20
|
-
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
21
|
-
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
22
|
-
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
23
|
-
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
24
|
-
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
25
|
-
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
26
|
-
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
27
|
-
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
28
|
-
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
29
|
-
|
30
|
-
You can contact the author at :
|
31
|
-
- Zstd homepage : https://www.zstd.net
|
32
|
-
*/
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
3
|
+
* All rights reserved.
|
4
|
+
*
|
5
|
+
* This source code is licensed under both the BSD-style license (found in the
|
6
|
+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
7
|
+
* in the COPYING file in the root directory of this source tree).
|
8
|
+
* You may select, at your option, one of the above-listed licenses.
|
9
|
+
*/
|
10
|
+
|
33
11
|
|
34
12
|
/*-**************************************
|
35
13
|
* Tuning parameters
|
36
14
|
****************************************/
|
15
|
+
#define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */
|
37
16
|
#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
|
17
|
+
#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
|
38
18
|
|
39
19
|
|
40
20
|
/*-**************************************
|
@@ -57,18 +37,18 @@
|
|
57
37
|
#include <stdio.h> /* fprintf, fopen, ftello64 */
|
58
38
|
#include <time.h> /* clock */
|
59
39
|
|
60
|
-
#include "mem.h" /* read */
|
61
|
-
#include "
|
62
|
-
#include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
|
40
|
+
#include "../common/mem.h" /* read */
|
41
|
+
#include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
|
63
42
|
#define HUF_STATIC_LINKING_ONLY
|
64
|
-
#include "huf.h"
|
65
|
-
#include "zstd_internal.h" /* includes zstd.h */
|
66
|
-
#include "xxhash.h"
|
43
|
+
#include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
|
44
|
+
#include "../common/zstd_internal.h" /* includes zstd.h */
|
45
|
+
#include "../common/xxhash.h" /* XXH64 */
|
67
46
|
#include "divsufsort.h"
|
68
47
|
#ifndef ZDICT_STATIC_LINKING_ONLY
|
69
48
|
# define ZDICT_STATIC_LINKING_ONLY
|
70
49
|
#endif
|
71
50
|
#include "zdict.h"
|
51
|
+
#include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
|
72
52
|
|
73
53
|
|
74
54
|
/*-*************************************
|
@@ -78,43 +58,30 @@
|
|
78
58
|
#define MB *(1 <<20)
|
79
59
|
#define GB *(1U<<30)
|
80
60
|
|
81
|
-
#define
|
61
|
+
#define DICTLISTSIZE_DEFAULT 10000
|
82
62
|
|
83
63
|
#define NOISELENGTH 32
|
84
|
-
#define PRIME1 2654435761U
|
85
|
-
#define PRIME2 2246822519U
|
86
64
|
|
87
|
-
|
88
|
-
static const U32 g_compressionLevel_default = 5;
|
65
|
+
static const int g_compressionLevel_default = 3;
|
89
66
|
static const U32 g_selectivity_default = 9;
|
90
|
-
static const size_t g_provision_entropySize = 200;
|
91
|
-
static const size_t g_min_fast_dictContent = 192;
|
92
67
|
|
93
68
|
|
94
69
|
/*-*************************************
|
95
70
|
* Console display
|
96
71
|
***************************************/
|
97
72
|
#define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
|
98
|
-
#define DISPLAYLEVEL(l, ...) if (
|
99
|
-
static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
|
100
|
-
|
101
|
-
#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
|
102
|
-
if (ZDICT_clockSpan(g_time) > refreshRate) \
|
103
|
-
{ g_time = clock(); DISPLAY(__VA_ARGS__); \
|
104
|
-
if (g_displayLevel>=4) fflush(stdout); } }
|
105
|
-
static const clock_t refreshRate = CLOCKS_PER_SEC * 3 / 10;
|
106
|
-
static clock_t g_time = 0;
|
73
|
+
#define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
|
107
74
|
|
108
75
|
static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
|
109
76
|
|
110
|
-
static void ZDICT_printHex(
|
77
|
+
static void ZDICT_printHex(const void* ptr, size_t length)
|
111
78
|
{
|
112
79
|
const BYTE* const b = (const BYTE*)ptr;
|
113
80
|
size_t u;
|
114
81
|
for (u=0; u<length; u++) {
|
115
82
|
BYTE c = b[u];
|
116
83
|
if (c<32 || c>126) c = '.'; /* non-printable char */
|
117
|
-
|
84
|
+
DISPLAY("%c", c);
|
118
85
|
}
|
119
86
|
}
|
120
87
|
|
@@ -126,11 +93,41 @@ unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
|
|
126
93
|
|
127
94
|
const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
|
128
95
|
|
96
|
+
unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
|
97
|
+
{
|
98
|
+
if (dictSize < 8) return 0;
|
99
|
+
if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
|
100
|
+
return MEM_readLE32((const char*)dictBuffer + 4);
|
101
|
+
}
|
102
|
+
|
103
|
+
size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
|
104
|
+
{
|
105
|
+
size_t headerSize;
|
106
|
+
if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
|
107
|
+
|
108
|
+
{ unsigned offcodeMaxValue = MaxOff;
|
109
|
+
ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
|
110
|
+
U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
|
111
|
+
short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short));
|
112
|
+
if (!bs || !wksp || !offcodeNCount) {
|
113
|
+
headerSize = ERROR(memory_allocation);
|
114
|
+
} else {
|
115
|
+
ZSTD_reset_compressedBlockState(bs);
|
116
|
+
headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize);
|
117
|
+
}
|
118
|
+
|
119
|
+
free(bs);
|
120
|
+
free(wksp);
|
121
|
+
free(offcodeNCount);
|
122
|
+
}
|
123
|
+
|
124
|
+
return headerSize;
|
125
|
+
}
|
129
126
|
|
130
127
|
/*-********************************************************
|
131
128
|
* Dictionary training functions
|
132
129
|
**********************************************************/
|
133
|
-
static unsigned ZDICT_NbCommonBytes (
|
130
|
+
static unsigned ZDICT_NbCommonBytes (size_t val)
|
134
131
|
{
|
135
132
|
if (MEM_isLittleEndian()) {
|
136
133
|
if (MEM_64bits()) {
|
@@ -228,13 +225,12 @@ static void ZDICT_initDictItem(dictItem* d)
|
|
228
225
|
static dictItem ZDICT_analyzePos(
|
229
226
|
BYTE* doneMarks,
|
230
227
|
const int* suffix, U32 start,
|
231
|
-
const void* buffer, U32 minRatio)
|
228
|
+
const void* buffer, U32 minRatio, U32 notificationLevel)
|
232
229
|
{
|
233
230
|
U32 lengthList[LLIMIT] = {0};
|
234
231
|
U32 cumulLength[LLIMIT] = {0};
|
235
232
|
U32 savings[LLIMIT] = {0};
|
236
233
|
const BYTE* b = (const BYTE*)buffer;
|
237
|
-
size_t length;
|
238
234
|
size_t maxLength = LLIMIT;
|
239
235
|
size_t pos = suffix[start];
|
240
236
|
U32 end = start;
|
@@ -249,26 +245,30 @@ static dictItem ZDICT_analyzePos(
|
|
249
245
|
||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
|
250
246
|
||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
|
251
247
|
/* skip and mark segment */
|
252
|
-
U16
|
253
|
-
U32 u,
|
254
|
-
while (MEM_read16(b+pos+
|
255
|
-
if (b[pos+
|
256
|
-
for (u=1; u<
|
248
|
+
U16 const pattern16 = MEM_read16(b+pos+4);
|
249
|
+
U32 u, patternEnd = 6;
|
250
|
+
while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
|
251
|
+
if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
|
252
|
+
for (u=1; u<patternEnd; u++)
|
257
253
|
doneMarks[pos+u] = 1;
|
258
254
|
return solution;
|
259
255
|
}
|
260
256
|
|
261
257
|
/* look forward */
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
258
|
+
{ size_t length;
|
259
|
+
do {
|
260
|
+
end++;
|
261
|
+
length = ZDICT_count(b + pos, b + suffix[end]);
|
262
|
+
} while (length >= MINMATCHLENGTH);
|
263
|
+
}
|
266
264
|
|
267
265
|
/* look backward */
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
266
|
+
{ size_t length;
|
267
|
+
do {
|
268
|
+
length = ZDICT_count(b + pos, b + *(suffix+start-1));
|
269
|
+
if (length >=MINMATCHLENGTH) start--;
|
270
|
+
} while(length >= MINMATCHLENGTH);
|
271
|
+
}
|
272
272
|
|
273
273
|
/* exit if not found a minimum nb of repetitions */
|
274
274
|
if (end-start < minRatio) {
|
@@ -279,15 +279,15 @@ static dictItem ZDICT_analyzePos(
|
|
279
279
|
}
|
280
280
|
|
281
281
|
{ int i;
|
282
|
-
U32
|
282
|
+
U32 mml;
|
283
283
|
U32 refinedStart = start;
|
284
284
|
U32 refinedEnd = end;
|
285
285
|
|
286
286
|
DISPLAYLEVEL(4, "\n");
|
287
|
-
DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (
|
287
|
+
DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
|
288
288
|
DISPLAYLEVEL(4, "\n");
|
289
289
|
|
290
|
-
for (
|
290
|
+
for (mml = MINMATCHLENGTH ; ; mml++) {
|
291
291
|
BYTE currentChar = 0;
|
292
292
|
U32 currentCount = 0;
|
293
293
|
U32 currentID = refinedStart;
|
@@ -295,13 +295,13 @@ static dictItem ZDICT_analyzePos(
|
|
295
295
|
U32 selectedCount = 0;
|
296
296
|
U32 selectedID = currentID;
|
297
297
|
for (id =refinedStart; id < refinedEnd; id++) {
|
298
|
-
if (b[
|
298
|
+
if (b[suffix[id] + mml] != currentChar) {
|
299
299
|
if (currentCount > selectedCount) {
|
300
300
|
selectedCount = currentCount;
|
301
301
|
selectedID = currentID;
|
302
302
|
}
|
303
303
|
currentID = id;
|
304
|
-
currentChar = b[ suffix[id] +
|
304
|
+
currentChar = b[ suffix[id] + mml];
|
305
305
|
currentCount = 0;
|
306
306
|
}
|
307
307
|
currentCount ++;
|
@@ -317,27 +317,31 @@ static dictItem ZDICT_analyzePos(
|
|
317
317
|
refinedEnd = refinedStart + selectedCount;
|
318
318
|
}
|
319
319
|
|
320
|
-
/* evaluate gain based on new
|
320
|
+
/* evaluate gain based on new dict */
|
321
321
|
start = refinedStart;
|
322
322
|
pos = suffix[refinedStart];
|
323
323
|
end = start;
|
324
324
|
memset(lengthList, 0, sizeof(lengthList));
|
325
325
|
|
326
326
|
/* look forward */
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
327
|
+
{ size_t length;
|
328
|
+
do {
|
329
|
+
end++;
|
330
|
+
length = ZDICT_count(b + pos, b + suffix[end]);
|
331
|
+
if (length >= LLIMIT) length = LLIMIT-1;
|
332
|
+
lengthList[length]++;
|
333
|
+
} while (length >=MINMATCHLENGTH);
|
334
|
+
}
|
333
335
|
|
334
336
|
/* look backward */
|
335
|
-
|
336
|
-
length
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
337
|
+
{ size_t length = MINMATCHLENGTH;
|
338
|
+
while ((length >= MINMATCHLENGTH) & (start > 0)) {
|
339
|
+
length = ZDICT_count(b + pos, b + suffix[start - 1]);
|
340
|
+
if (length >= LLIMIT) length = LLIMIT - 1;
|
341
|
+
lengthList[length]++;
|
342
|
+
if (length >= MINMATCHLENGTH) start--;
|
343
|
+
}
|
344
|
+
}
|
341
345
|
|
342
346
|
/* largest useful length */
|
343
347
|
memset(cumulLength, 0, sizeof(cumulLength));
|
@@ -361,8 +365,8 @@ static dictItem ZDICT_analyzePos(
|
|
361
365
|
for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
|
362
366
|
savings[i] = savings[i-1] + (lengthList[i] * (i-3));
|
363
367
|
|
364
|
-
DISPLAYLEVEL(4, "Selected
|
365
|
-
(
|
368
|
+
DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
|
369
|
+
(unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
|
366
370
|
|
367
371
|
solution.pos = (U32)pos;
|
368
372
|
solution.length = (U32)maxLength;
|
@@ -371,12 +375,12 @@ static dictItem ZDICT_analyzePos(
|
|
371
375
|
/* mark positions done */
|
372
376
|
{ U32 id;
|
373
377
|
for (id=start; id<end; id++) {
|
374
|
-
U32 p, pEnd;
|
378
|
+
U32 p, pEnd, length;
|
375
379
|
U32 const testedPos = suffix[id];
|
376
380
|
if (testedPos == pos)
|
377
381
|
length = solution.length;
|
378
382
|
else {
|
379
|
-
length = ZDICT_count(b+pos, b+testedPos);
|
383
|
+
length = (U32)ZDICT_count(b+pos, b+testedPos);
|
380
384
|
if (length > solution.length) length = solution.length;
|
381
385
|
}
|
382
386
|
pEnd = (U32)(testedPos + length);
|
@@ -388,28 +392,43 @@ static dictItem ZDICT_analyzePos(
|
|
388
392
|
}
|
389
393
|
|
390
394
|
|
391
|
-
|
395
|
+
static int isIncluded(const void* in, const void* container, size_t length)
|
396
|
+
{
|
397
|
+
const char* const ip = (const char*) in;
|
398
|
+
const char* const into = (const char*) container;
|
399
|
+
size_t u;
|
400
|
+
|
401
|
+
for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */
|
402
|
+
if (ip[u] != into[u]) break;
|
403
|
+
}
|
404
|
+
|
405
|
+
return u==length;
|
406
|
+
}
|
407
|
+
|
408
|
+
/*! ZDICT_tryMerge() :
|
392
409
|
check if dictItem can be merged, do it if possible
|
393
410
|
@return : id of destination elt, 0 if not merged
|
394
411
|
*/
|
395
|
-
static U32
|
412
|
+
static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
|
396
413
|
{
|
397
414
|
const U32 tableSize = table->pos;
|
398
|
-
const U32
|
415
|
+
const U32 eltEnd = elt.pos + elt.length;
|
416
|
+
const char* const buf = (const char*) buffer;
|
399
417
|
|
400
418
|
/* tail overlap */
|
401
419
|
U32 u; for (u=1; u<tableSize; u++) {
|
402
420
|
if (u==eltNbToSkip) continue;
|
403
|
-
if ((table[u].pos > elt.pos) && (table[u].pos
|
421
|
+
if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
|
404
422
|
/* append */
|
405
|
-
U32 addedLength = table[u].pos - elt.pos;
|
423
|
+
U32 const addedLength = table[u].pos - elt.pos;
|
406
424
|
table[u].length += addedLength;
|
407
425
|
table[u].pos = elt.pos;
|
408
426
|
table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
|
409
|
-
table[u].savings += elt.length / 8; /* rough approx */
|
427
|
+
table[u].savings += elt.length / 8; /* rough approx bonus */
|
410
428
|
elt = table[u];
|
429
|
+
/* sort : improve rank */
|
411
430
|
while ((u>1) && (table[u-1].savings < elt.savings))
|
412
|
-
|
431
|
+
table[u] = table[u-1], u--;
|
413
432
|
table[u] = elt;
|
414
433
|
return u;
|
415
434
|
} }
|
@@ -417,20 +436,33 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
|
417
436
|
/* front overlap */
|
418
437
|
for (u=1; u<tableSize; u++) {
|
419
438
|
if (u==eltNbToSkip) continue;
|
420
|
-
|
439
|
+
|
440
|
+
if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
|
421
441
|
/* append */
|
422
|
-
int addedLength = (
|
423
|
-
table[u].savings += elt.length / 8; /* rough approx */
|
424
|
-
if (addedLength > 0) { /* otherwise,
|
442
|
+
int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
|
443
|
+
table[u].savings += elt.length / 8; /* rough approx bonus */
|
444
|
+
if (addedLength > 0) { /* otherwise, elt fully included into existing */
|
425
445
|
table[u].length += addedLength;
|
426
446
|
table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
|
427
447
|
}
|
448
|
+
/* sort : improve rank */
|
428
449
|
elt = table[u];
|
429
450
|
while ((u>1) && (table[u-1].savings < elt.savings))
|
430
451
|
table[u] = table[u-1], u--;
|
431
452
|
table[u] = elt;
|
432
453
|
return u;
|
433
|
-
|
454
|
+
}
|
455
|
+
|
456
|
+
if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
|
457
|
+
if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
|
458
|
+
size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
|
459
|
+
table[u].pos = elt.pos;
|
460
|
+
table[u].savings += (U32)(elt.savings * addedLength / elt.length);
|
461
|
+
table[u].length = MIN(elt.length, table[u].length + 1);
|
462
|
+
return u;
|
463
|
+
}
|
464
|
+
}
|
465
|
+
}
|
434
466
|
|
435
467
|
return 0;
|
436
468
|
}
|
@@ -438,8 +470,8 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
|
438
470
|
|
439
471
|
static void ZDICT_removeDictItem(dictItem* table, U32 id)
|
440
472
|
{
|
441
|
-
/* convention :
|
442
|
-
U32 const max = table
|
473
|
+
/* convention : table[0].pos stores nb of elts */
|
474
|
+
U32 const max = table[0].pos;
|
443
475
|
U32 u;
|
444
476
|
if (!id) return; /* protection, should never happen */
|
445
477
|
for (u=id; u<max-1; u++)
|
@@ -448,14 +480,14 @@ static void ZDICT_removeDictItem(dictItem* table, U32 id)
|
|
448
480
|
}
|
449
481
|
|
450
482
|
|
451
|
-
static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
|
483
|
+
static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
|
452
484
|
{
|
453
485
|
/* merge if possible */
|
454
|
-
U32 mergeId =
|
486
|
+
U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
|
455
487
|
if (mergeId) {
|
456
488
|
U32 newMerge = 1;
|
457
489
|
while (newMerge) {
|
458
|
-
newMerge =
|
490
|
+
newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
|
459
491
|
if (newMerge) ZDICT_removeDictItem(table, mergeId);
|
460
492
|
mergeId = newMerge;
|
461
493
|
}
|
@@ -486,18 +518,24 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
|
|
486
518
|
}
|
487
519
|
|
488
520
|
|
489
|
-
static size_t
|
521
|
+
static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
|
490
522
|
const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
|
491
523
|
const size_t* fileSizes, unsigned nbFiles,
|
492
|
-
|
524
|
+
unsigned minRatio, U32 notificationLevel)
|
493
525
|
{
|
494
526
|
int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
|
495
527
|
int* const suffix = suffix0+1;
|
496
528
|
U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
|
497
529
|
BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
|
498
530
|
U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
|
499
|
-
U32 minRatio = nbFiles >> shiftRatio;
|
500
531
|
size_t result = 0;
|
532
|
+
clock_t displayClock = 0;
|
533
|
+
clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
|
534
|
+
|
535
|
+
# define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
|
536
|
+
if (ZDICT_clockSpan(displayClock) > refreshRate) \
|
537
|
+
{ displayClock = clock(); DISPLAY(__VA_ARGS__); \
|
538
|
+
if (notificationLevel>=4) fflush(stderr); } }
|
501
539
|
|
502
540
|
/* init */
|
503
541
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
@@ -509,11 +547,11 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
509
547
|
memset(doneMarks, 0, bufferSize+16);
|
510
548
|
|
511
549
|
/* limit sample set size (divsufsort limitation)*/
|
512
|
-
if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (
|
550
|
+
if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
|
513
551
|
while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
|
514
552
|
|
515
553
|
/* sort */
|
516
|
-
DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (
|
554
|
+
DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
|
517
555
|
{ int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
|
518
556
|
if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
|
519
557
|
}
|
@@ -523,7 +561,8 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
523
561
|
{ size_t pos;
|
524
562
|
for (pos=0; pos < bufferSize; pos++)
|
525
563
|
reverseSuffix[suffix[pos]] = (U32)pos;
|
526
|
-
/*
|
564
|
+
/* note filePos tracks borders between samples.
|
565
|
+
It's not used at this stage, but planned to become useful in a later update */
|
527
566
|
filePos[0] = 0;
|
528
567
|
for (pos=1; pos<nbFiles; pos++)
|
529
568
|
filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
|
@@ -535,23 +574,13 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
535
574
|
{ U32 cursor; for (cursor=0; cursor < bufferSize; ) {
|
536
575
|
dictItem solution;
|
537
576
|
if (doneMarks[cursor]) { cursor++; continue; }
|
538
|
-
solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
|
577
|
+
solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
|
539
578
|
if (solution.length==0) { cursor++; continue; }
|
540
|
-
ZDICT_insertDictItem(dictList, dictListSize, solution);
|
579
|
+
ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
|
541
580
|
cursor += solution.length;
|
542
581
|
DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
|
543
582
|
} }
|
544
583
|
|
545
|
-
/* limit dictionary size */
|
546
|
-
{ U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
|
547
|
-
U32 currentSize = 0;
|
548
|
-
U32 n; for (n=1; n<max; n++) {
|
549
|
-
currentSize += dictList[n].length;
|
550
|
-
if (currentSize > maxDictSize) break;
|
551
|
-
}
|
552
|
-
dictList->pos = n;
|
553
|
-
}
|
554
|
-
|
555
584
|
_cleanup:
|
556
585
|
free(suffix0);
|
557
586
|
free(reverseSuffix);
|
@@ -563,10 +592,12 @@ _cleanup:
|
|
563
592
|
|
564
593
|
static void ZDICT_fillNoise(void* buffer, size_t length)
|
565
594
|
{
|
566
|
-
unsigned
|
567
|
-
|
595
|
+
unsigned const prime1 = 2654435761U;
|
596
|
+
unsigned const prime2 = 2246822519U;
|
597
|
+
unsigned acc = prime1;
|
598
|
+
size_t p=0;
|
568
599
|
for (p=0; p<length; p++) {
|
569
|
-
acc *=
|
600
|
+
acc *= prime2;
|
570
601
|
((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
|
571
602
|
}
|
572
603
|
}
|
@@ -574,29 +605,31 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
|
|
574
605
|
|
575
606
|
typedef struct
|
576
607
|
{
|
577
|
-
|
578
|
-
ZSTD_CCtx* zc;
|
608
|
+
ZSTD_CDict* dict; /* dictionary */
|
609
|
+
ZSTD_CCtx* zc; /* working context */
|
579
610
|
void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
|
580
611
|
} EStats_ress_t;
|
581
612
|
|
582
613
|
#define MAXREPOFFSET 1024
|
583
614
|
|
584
|
-
static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
585
|
-
|
586
|
-
|
615
|
+
static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
|
616
|
+
unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
|
617
|
+
const void* src, size_t srcSize,
|
618
|
+
U32 notificationLevel)
|
587
619
|
{
|
588
|
-
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params
|
620
|
+
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
|
589
621
|
size_t cSize;
|
590
622
|
|
591
623
|
if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
|
592
|
-
|
593
|
-
|
594
|
-
|
624
|
+
{ size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
|
625
|
+
if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
|
626
|
+
|
627
|
+
}
|
595
628
|
cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
|
596
|
-
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(
|
629
|
+
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
|
597
630
|
|
598
631
|
if (cSize) { /* if == 0; block is not compressible */
|
599
|
-
const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
|
632
|
+
const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
|
600
633
|
|
601
634
|
/* literals stats */
|
602
635
|
{ const BYTE* bytePtr;
|
@@ -605,46 +638,34 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
|
605
638
|
}
|
606
639
|
|
607
640
|
/* seqStats */
|
608
|
-
{
|
609
|
-
ZSTD_seqToCodes(seqStorePtr
|
641
|
+
{ U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
|
642
|
+
ZSTD_seqToCodes(seqStorePtr);
|
610
643
|
|
611
|
-
{ const BYTE* codePtr = seqStorePtr->
|
612
|
-
|
644
|
+
{ const BYTE* codePtr = seqStorePtr->ofCode;
|
645
|
+
U32 u;
|
613
646
|
for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
|
614
647
|
}
|
615
648
|
|
616
|
-
{ const BYTE* codePtr = seqStorePtr->
|
617
|
-
|
649
|
+
{ const BYTE* codePtr = seqStorePtr->mlCode;
|
650
|
+
U32 u;
|
618
651
|
for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
|
619
652
|
}
|
620
653
|
|
621
|
-
{ const BYTE* codePtr = seqStorePtr->
|
622
|
-
|
654
|
+
{ const BYTE* codePtr = seqStorePtr->llCode;
|
655
|
+
U32 u;
|
623
656
|
for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
|
624
|
-
|
625
|
-
|
626
|
-
/* rep offsets */
|
627
|
-
{ const U32* const offsetPtr = seqStorePtr->offsetStart;
|
628
|
-
U32 offset1 = offsetPtr[0] - 3;
|
629
|
-
U32 offset2 = offsetPtr[1] - 3;
|
630
|
-
if (offset1 >= MAXREPOFFSET) offset1 = 0;
|
631
|
-
if (offset2 >= MAXREPOFFSET) offset2 = 0;
|
632
|
-
repOffsets[offset1] += 3;
|
633
|
-
repOffsets[offset2] += 1;
|
634
|
-
}
|
635
|
-
}
|
636
|
-
}
|
657
|
+
}
|
637
658
|
|
638
|
-
/*
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
659
|
+
if (nbSeq >= 2) { /* rep offsets */
|
660
|
+
const seqDef* const seq = seqStorePtr->sequencesStart;
|
661
|
+
U32 offset1 = seq[0].offset - 3;
|
662
|
+
U32 offset2 = seq[1].offset - 3;
|
663
|
+
if (offset1 >= MAXREPOFFSET) offset1 = 0;
|
664
|
+
if (offset2 >= MAXREPOFFSET) offset2 = 0;
|
665
|
+
repOffsets[offset1] += 3;
|
666
|
+
repOffsets[offset2] += 1;
|
667
|
+
} } }
|
646
668
|
}
|
647
|
-
*/
|
648
669
|
|
649
670
|
static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
|
650
671
|
{
|
@@ -670,72 +691,92 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
|
|
670
691
|
}
|
671
692
|
}
|
672
693
|
|
694
|
+
/* ZDICT_flatLit() :
|
695
|
+
* rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
|
696
|
+
* necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
|
697
|
+
*/
|
698
|
+
static void ZDICT_flatLit(unsigned* countLit)
|
699
|
+
{
|
700
|
+
int u;
|
701
|
+
for (u=1; u<256; u++) countLit[u] = 2;
|
702
|
+
countLit[0] = 4;
|
703
|
+
countLit[253] = 1;
|
704
|
+
countLit[254] = 1;
|
705
|
+
}
|
673
706
|
|
674
|
-
#define OFFCODE_MAX
|
707
|
+
#define OFFCODE_MAX 30 /* only applicable to first block */
|
675
708
|
static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
676
|
-
|
677
|
-
|
678
|
-
|
709
|
+
unsigned compressionLevel,
|
710
|
+
const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
|
711
|
+
const void* dictBuffer, size_t dictBufferSize,
|
712
|
+
unsigned notificationLevel)
|
679
713
|
{
|
680
|
-
|
714
|
+
unsigned countLit[256];
|
681
715
|
HUF_CREATE_STATIC_CTABLE(hufTable, 255);
|
682
|
-
|
716
|
+
unsigned offcodeCount[OFFCODE_MAX+1];
|
683
717
|
short offcodeNCount[OFFCODE_MAX+1];
|
684
|
-
U32
|
718
|
+
U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
|
719
|
+
unsigned matchLengthCount[MaxML+1];
|
685
720
|
short matchLengthNCount[MaxML+1];
|
686
|
-
|
721
|
+
unsigned litLengthCount[MaxLL+1];
|
687
722
|
short litLengthNCount[MaxLL+1];
|
688
|
-
U32 repOffset[MAXREPOFFSET]
|
723
|
+
U32 repOffset[MAXREPOFFSET];
|
689
724
|
offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
|
690
|
-
EStats_ress_t esr;
|
725
|
+
EStats_ress_t esr = { NULL, NULL, NULL };
|
691
726
|
ZSTD_parameters params;
|
692
|
-
U32 u, huffLog =
|
727
|
+
U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
|
693
728
|
size_t pos = 0, errorCode;
|
694
729
|
size_t eSize = 0;
|
695
730
|
size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
|
696
|
-
size_t const averageSampleSize = totalSrcSize / nbFiles;
|
731
|
+
size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
|
697
732
|
BYTE* dstPtr = (BYTE*)dstBuffer;
|
698
733
|
|
699
734
|
/* init */
|
700
|
-
|
701
|
-
|
702
|
-
for (u=0; u
|
703
|
-
for (u=0; u<=
|
735
|
+
DEBUGLOG(4, "ZDICT_analyzeEntropy");
|
736
|
+
if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
|
737
|
+
for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
|
738
|
+
for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
|
739
|
+
for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
|
740
|
+
for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
|
741
|
+
memset(repOffset, 0, sizeof(repOffset));
|
704
742
|
repOffset[1] = repOffset[4] = repOffset[8] = 1;
|
705
743
|
memset(bestRepOffset, 0, sizeof(bestRepOffset));
|
706
|
-
|
744
|
+
if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
|
745
|
+
params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
|
746
|
+
|
747
|
+
esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
|
707
748
|
esr.zc = ZSTD_createCCtx();
|
708
749
|
esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
|
709
|
-
if (!esr.
|
710
|
-
|
711
|
-
|
712
|
-
|
750
|
+
if (!esr.dict || !esr.zc || !esr.workPlace) {
|
751
|
+
eSize = ERROR(memory_allocation);
|
752
|
+
DISPLAYLEVEL(1, "Not enough memory \n");
|
753
|
+
goto _cleanup;
|
713
754
|
}
|
714
|
-
|
715
|
-
|
716
|
-
{ size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
|
717
|
-
if (ZSTD_isError(beginResult)) {
|
718
|
-
eSize = ERROR(GENERIC);
|
719
|
-
DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed ");
|
720
|
-
goto _cleanup;
|
721
|
-
} }
|
722
|
-
|
723
|
-
/* collect stats on all files */
|
755
|
+
|
756
|
+
/* collect stats on all samples */
|
724
757
|
for (u=0; u<nbFiles; u++) {
|
725
|
-
ZDICT_countEStats(esr, params,
|
726
|
-
|
727
|
-
|
758
|
+
ZDICT_countEStats(esr, &params,
|
759
|
+
countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
|
760
|
+
(const char*)srcBuffer + pos, fileSizes[u],
|
761
|
+
notificationLevel);
|
728
762
|
pos += fileSizes[u];
|
729
763
|
}
|
730
764
|
|
731
|
-
/* analyze */
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
765
|
+
/* analyze, build stats, starting with literals */
|
766
|
+
{ size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
|
767
|
+
if (HUF_isError(maxNbBits)) {
|
768
|
+
eSize = maxNbBits;
|
769
|
+
DISPLAYLEVEL(1, " HUF_buildCTable error \n");
|
770
|
+
goto _cleanup;
|
771
|
+
}
|
772
|
+
if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
|
773
|
+
DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
|
774
|
+
ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
|
775
|
+
maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
|
776
|
+
assert(maxNbBits==9);
|
777
|
+
}
|
778
|
+
huffLog = (U32)maxNbBits;
|
737
779
|
}
|
738
|
-
huffLog = (U32)errorCode;
|
739
780
|
|
740
781
|
/* looking for most common first offsets */
|
741
782
|
{ U32 offset;
|
@@ -744,11 +785,11 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
744
785
|
}
|
745
786
|
/* note : the result of this phase should be used to better appreciate the impact on statistics */
|
746
787
|
|
747
|
-
total=0; for (u=0; u<=
|
748
|
-
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total,
|
788
|
+
total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
|
789
|
+
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
|
749
790
|
if (FSE_isError(errorCode)) {
|
750
|
-
eSize =
|
751
|
-
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount");
|
791
|
+
eSize = errorCode;
|
792
|
+
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
|
752
793
|
goto _cleanup;
|
753
794
|
}
|
754
795
|
Offlog = (U32)errorCode;
|
@@ -756,8 +797,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
756
797
|
total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
|
757
798
|
errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
|
758
799
|
if (FSE_isError(errorCode)) {
|
759
|
-
eSize =
|
760
|
-
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount");
|
800
|
+
eSize = errorCode;
|
801
|
+
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
|
761
802
|
goto _cleanup;
|
762
803
|
}
|
763
804
|
mlLog = (U32)errorCode;
|
@@ -765,18 +806,17 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
765
806
|
total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
|
766
807
|
errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
|
767
808
|
if (FSE_isError(errorCode)) {
|
768
|
-
eSize =
|
769
|
-
DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount");
|
809
|
+
eSize = errorCode;
|
810
|
+
DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
|
770
811
|
goto _cleanup;
|
771
812
|
}
|
772
813
|
llLog = (U32)errorCode;
|
773
814
|
|
774
|
-
|
775
815
|
/* write result to buffer */
|
776
816
|
{ size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
|
777
817
|
if (HUF_isError(hhSize)) {
|
778
|
-
eSize =
|
779
|
-
DISPLAYLEVEL(1, "HUF_writeCTable error");
|
818
|
+
eSize = hhSize;
|
819
|
+
DISPLAYLEVEL(1, "HUF_writeCTable error \n");
|
780
820
|
goto _cleanup;
|
781
821
|
}
|
782
822
|
dstPtr += hhSize;
|
@@ -786,8 +826,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
786
826
|
|
787
827
|
{ size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
|
788
828
|
if (FSE_isError(ohSize)) {
|
789
|
-
eSize =
|
790
|
-
DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount");
|
829
|
+
eSize = ohSize;
|
830
|
+
DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
|
791
831
|
goto _cleanup;
|
792
832
|
}
|
793
833
|
dstPtr += ohSize;
|
@@ -797,8 +837,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
797
837
|
|
798
838
|
{ size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
|
799
839
|
if (FSE_isError(mhSize)) {
|
800
|
-
eSize =
|
801
|
-
DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount");
|
840
|
+
eSize = mhSize;
|
841
|
+
DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
|
802
842
|
goto _cleanup;
|
803
843
|
}
|
804
844
|
dstPtr += mhSize;
|
@@ -808,8 +848,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
808
848
|
|
809
849
|
{ size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
|
810
850
|
if (FSE_isError(lhSize)) {
|
811
|
-
eSize =
|
812
|
-
DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount");
|
851
|
+
eSize = lhSize;
|
852
|
+
DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
|
813
853
|
goto _cleanup;
|
814
854
|
}
|
815
855
|
dstPtr += lhSize;
|
@@ -818,8 +858,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
818
858
|
}
|
819
859
|
|
820
860
|
if (maxDstSize<12) {
|
821
|
-
eSize = ERROR(
|
822
|
-
DISPLAYLEVEL(1, "not enough space to write RepOffsets");
|
861
|
+
eSize = ERROR(dstSize_tooSmall);
|
862
|
+
DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
|
823
863
|
goto _cleanup;
|
824
864
|
}
|
825
865
|
# if 0
|
@@ -833,11 +873,10 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
833
873
|
MEM_writeLE32(dstPtr+4, repStartValue[1]);
|
834
874
|
MEM_writeLE32(dstPtr+8, repStartValue[2]);
|
835
875
|
#endif
|
836
|
-
dstPtr += 12;
|
837
876
|
eSize += 12;
|
838
877
|
|
839
878
|
_cleanup:
|
840
|
-
|
879
|
+
ZSTD_freeCDict(esr.dict);
|
841
880
|
ZSTD_freeCCtx(esr.zc);
|
842
881
|
free(esr.workPlace);
|
843
882
|
|
@@ -845,129 +884,180 @@ _cleanup:
|
|
845
884
|
}
|
846
885
|
|
847
886
|
|
848
|
-
#define DIB_FASTSEGMENTSIZE 64
|
849
|
-
/*! ZDICT_fastSampling() (based on an idea proposed by Giuseppe Ottaviano) :
|
850
|
-
Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`,
|
851
|
-
up to `dictSize`.
|
852
|
-
Filling starts from the end of `dictBuffer`, down to maximum possible.
|
853
|
-
if `dictSize` is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
|
854
|
-
@return : amount of data written into `dictBuffer`,
|
855
|
-
or an error code
|
856
|
-
*/
|
857
|
-
static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
|
858
|
-
const void* samplesBuffer, size_t samplesSize)
|
859
|
-
{
|
860
|
-
char* dstPtr = (char*)dictBuffer + dictSize;
|
861
|
-
const char* srcPtr = (const char*)samplesBuffer;
|
862
|
-
size_t const nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
|
863
|
-
size_t segNb, interSize;
|
864
|
-
|
865
|
-
if (nbSegments <= 2) return ERROR(srcSize_wrong);
|
866
|
-
if (samplesSize < dictSize) return ERROR(srcSize_wrong);
|
867
|
-
|
868
|
-
/* first and last segments are part of dictionary, in case they contain interesting header/footer */
|
869
|
-
dstPtr -= DIB_FASTSEGMENTSIZE;
|
870
|
-
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
|
871
|
-
dstPtr -= DIB_FASTSEGMENTSIZE;
|
872
|
-
memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
|
873
|
-
|
874
|
-
/* regularly copy a segment */
|
875
|
-
interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
|
876
|
-
srcPtr += DIB_FASTSEGMENTSIZE;
|
877
|
-
for (segNb=2; segNb < nbSegments; segNb++) {
|
878
|
-
srcPtr += interSize;
|
879
|
-
dstPtr -= DIB_FASTSEGMENTSIZE;
|
880
|
-
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
|
881
|
-
srcPtr += DIB_FASTSEGMENTSIZE;
|
882
|
-
}
|
883
|
-
|
884
|
-
return nbSegments * DIB_FASTSEGMENTSIZE;
|
885
|
-
}
|
886
887
|
|
887
|
-
size_t
|
888
|
-
|
889
|
-
|
888
|
+
size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
|
889
|
+
const void* customDictContent, size_t dictContentSize,
|
890
|
+
const void* samplesBuffer, const size_t* samplesSizes,
|
891
|
+
unsigned nbSamples, ZDICT_params_t params)
|
890
892
|
{
|
891
893
|
size_t hSize;
|
892
|
-
|
894
|
+
#define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
|
895
|
+
BYTE header[HBUFFSIZE];
|
896
|
+
int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
|
897
|
+
U32 const notificationLevel = params.notificationLevel;
|
898
|
+
|
899
|
+
/* check conditions */
|
900
|
+
DEBUGLOG(4, "ZDICT_finalizeDictionary");
|
901
|
+
if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
|
902
|
+
if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
|
903
|
+
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
|
893
904
|
|
894
905
|
/* dictionary header */
|
895
|
-
MEM_writeLE32(
|
896
|
-
{ U64 const randomID = XXH64(
|
906
|
+
MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
|
907
|
+
{ U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
|
897
908
|
U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
|
898
909
|
U32 const dictID = params.dictID ? params.dictID : compliantID;
|
899
|
-
MEM_writeLE32(
|
910
|
+
MEM_writeLE32(header+4, dictID);
|
900
911
|
}
|
901
912
|
hSize = 8;
|
902
913
|
|
903
914
|
/* entropy tables */
|
904
915
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
905
916
|
DISPLAYLEVEL(2, "statistics ... \n");
|
906
|
-
|
917
|
+
{ size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
|
918
|
+
compressionLevel,
|
919
|
+
samplesBuffer, samplesSizes, nbSamples,
|
920
|
+
customDictContent, dictContentSize,
|
921
|
+
notificationLevel);
|
922
|
+
if (ZDICT_isError(eSize)) return eSize;
|
923
|
+
hSize += eSize;
|
924
|
+
}
|
925
|
+
|
926
|
+
/* copy elements in final buffer ; note : src and dst buffer can overlap */
|
927
|
+
if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
|
928
|
+
{ size_t const dictSize = hSize + dictContentSize;
|
929
|
+
char* dictEnd = (char*)dictBuffer + dictSize;
|
930
|
+
memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
|
931
|
+
memcpy(dictBuffer, header, hSize);
|
932
|
+
return dictSize;
|
933
|
+
}
|
934
|
+
}
|
935
|
+
|
936
|
+
|
937
|
+
static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
|
938
|
+
void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
|
939
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
940
|
+
ZDICT_params_t params)
|
941
|
+
{
|
942
|
+
int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
|
943
|
+
U32 const notificationLevel = params.notificationLevel;
|
944
|
+
size_t hSize = 8;
|
945
|
+
|
946
|
+
/* calculate entropy tables */
|
947
|
+
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
948
|
+
DISPLAYLEVEL(2, "statistics ... \n");
|
949
|
+
{ size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
|
907
950
|
compressionLevel,
|
908
951
|
samplesBuffer, samplesSizes, nbSamples,
|
909
|
-
(char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize
|
952
|
+
(char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
|
953
|
+
notificationLevel);
|
954
|
+
if (ZDICT_isError(eSize)) return eSize;
|
955
|
+
hSize += eSize;
|
956
|
+
}
|
957
|
+
|
958
|
+
/* add dictionary header (after entropy tables) */
|
959
|
+
MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
|
960
|
+
{ U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
|
961
|
+
U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
|
962
|
+
U32 const dictID = params.dictID ? params.dictID : compliantID;
|
963
|
+
MEM_writeLE32((char*)dictBuffer+4, dictID);
|
964
|
+
}
|
910
965
|
|
911
966
|
if (hSize + dictContentSize < dictBufferCapacity)
|
912
967
|
memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
|
913
968
|
return MIN(dictBufferCapacity, hSize+dictContentSize);
|
914
969
|
}
|
915
970
|
|
916
|
-
|
917
|
-
|
918
|
-
|
919
|
-
*
|
920
|
-
|
971
|
+
/* Hidden declaration for dbio.c */
|
972
|
+
size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
973
|
+
void* dictBuffer, size_t maxDictSize,
|
974
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
975
|
+
ZDICT_legacy_params_t params);
|
976
|
+
/*! ZDICT_trainFromBuffer_unsafe_legacy() :
|
977
|
+
* Warning : `samplesBuffer` must be followed by noisy guard band.
|
978
|
+
* @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
|
921
979
|
*/
|
922
|
-
size_t
|
980
|
+
size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
923
981
|
void* dictBuffer, size_t maxDictSize,
|
924
982
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
925
|
-
|
983
|
+
ZDICT_legacy_params_t params)
|
926
984
|
{
|
927
|
-
U32 const dictListSize = MAX(
|
985
|
+
U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
|
928
986
|
dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
|
929
|
-
unsigned selectivity = params.selectivityLevel;
|
987
|
+
unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
|
988
|
+
unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
|
930
989
|
size_t const targetDictSize = maxDictSize;
|
931
|
-
size_t
|
990
|
+
size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
|
932
991
|
size_t dictSize = 0;
|
992
|
+
U32 const notificationLevel = params.zParams.notificationLevel;
|
933
993
|
|
934
994
|
/* checks */
|
935
995
|
if (!dictList) return ERROR(memory_allocation);
|
936
|
-
if (maxDictSize
|
996
|
+
if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */
|
997
|
+
if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */
|
937
998
|
|
938
999
|
/* init */
|
939
|
-
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
|
940
|
-
if (sBuffSize < DIB_MINSAMPLESSIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
|
941
1000
|
ZDICT_initDictItem(dictList);
|
942
|
-
g_displayLevel = params.notificationLevel;
|
943
|
-
if (selectivity==0) selectivity = g_selectivity_default;
|
944
1001
|
|
945
1002
|
/* build dictionary */
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
|
965
|
-
|
966
|
-
|
967
|
-
|
1003
|
+
ZDICT_trainBuffer_legacy(dictList, dictListSize,
|
1004
|
+
samplesBuffer, samplesBuffSize,
|
1005
|
+
samplesSizes, nbSamples,
|
1006
|
+
minRep, notificationLevel);
|
1007
|
+
|
1008
|
+
/* display best matches */
|
1009
|
+
if (params.zParams.notificationLevel>= 3) {
|
1010
|
+
unsigned const nb = MIN(25, dictList[0].pos);
|
1011
|
+
unsigned const dictContentSize = ZDICT_dictSize(dictList);
|
1012
|
+
unsigned u;
|
1013
|
+
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
|
1014
|
+
DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
|
1015
|
+
for (u=1; u<nb; u++) {
|
1016
|
+
unsigned const pos = dictList[u].pos;
|
1017
|
+
unsigned const length = dictList[u].length;
|
1018
|
+
U32 const printedLength = MIN(40, length);
|
1019
|
+
if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
|
1020
|
+
free(dictList);
|
1021
|
+
return ERROR(GENERIC); /* should never happen */
|
1022
|
+
}
|
1023
|
+
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
1024
|
+
u, length, pos, (unsigned)dictList[u].savings);
|
1025
|
+
ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
|
1026
|
+
DISPLAYLEVEL(3, "| \n");
|
1027
|
+
} }
|
1028
|
+
|
968
1029
|
|
969
1030
|
/* create dictionary */
|
970
|
-
{
|
1031
|
+
{ unsigned dictContentSize = ZDICT_dictSize(dictList);
|
1032
|
+
if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
|
1033
|
+
if (dictContentSize < targetDictSize/4) {
|
1034
|
+
DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
|
1035
|
+
if (samplesBuffSize < 10 * targetDictSize)
|
1036
|
+
DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
|
1037
|
+
if (minRep > MINRATIO) {
|
1038
|
+
DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
|
1039
|
+
DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
|
1040
|
+
}
|
1041
|
+
}
|
1042
|
+
|
1043
|
+
if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
|
1044
|
+
unsigned proposedSelectivity = selectivity-1;
|
1045
|
+
while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
|
1046
|
+
DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
|
1047
|
+
DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
|
1048
|
+
DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
|
1049
|
+
}
|
1050
|
+
|
1051
|
+
/* limit dictionary size */
|
1052
|
+
{ U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
|
1053
|
+
U32 currentSize = 0;
|
1054
|
+
U32 n; for (n=1; n<max; n++) {
|
1055
|
+
currentSize += dictList[n].length;
|
1056
|
+
if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
|
1057
|
+
}
|
1058
|
+
dictList->pos = n;
|
1059
|
+
dictContentSize = currentSize;
|
1060
|
+
}
|
971
1061
|
|
972
1062
|
/* build dict content */
|
973
1063
|
{ U32 u;
|
@@ -979,17 +1069,9 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
979
1069
|
memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
|
980
1070
|
} }
|
981
1071
|
|
982
|
-
/* fast mode dict content */
|
983
|
-
if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
|
984
|
-
DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
|
985
|
-
DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
|
986
|
-
dictContentSize = (U32)ZDICT_fastSampling(dictBuffer, targetDictSize,
|
987
|
-
samplesBuffer, sBuffSize);
|
988
|
-
}
|
989
|
-
|
990
1072
|
dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
|
991
1073
|
samplesBuffer, samplesSizes, nbSamples,
|
992
|
-
params);
|
1074
|
+
params.zParams);
|
993
1075
|
}
|
994
1076
|
|
995
1077
|
/* clean up */
|
@@ -998,44 +1080,52 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
998
1080
|
}
|
999
1081
|
|
1000
1082
|
|
1001
|
-
/*
|
1002
|
-
*
|
1003
|
-
|
1004
|
-
|
1005
|
-
|
1083
|
+
/* ZDICT_trainFromBuffer_legacy() :
|
1084
|
+
* issue : samplesBuffer need to be followed by a noisy guard band.
|
1085
|
+
* work around : duplicate the buffer, and add the noise */
|
1086
|
+
size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
|
1087
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
1088
|
+
ZDICT_legacy_params_t params)
|
1006
1089
|
{
|
1090
|
+
size_t result;
|
1007
1091
|
void* newBuff;
|
1008
|
-
size_t sBuffSize;
|
1092
|
+
size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
|
1093
|
+
if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */
|
1009
1094
|
|
1010
|
-
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
|
1011
|
-
if (sBuffSize==0) return 0; /* empty content => no dictionary */
|
1012
1095
|
newBuff = malloc(sBuffSize + NOISELENGTH);
|
1013
1096
|
if (!newBuff) return ERROR(memory_allocation);
|
1014
1097
|
|
1015
1098
|
memcpy(newBuff, samplesBuffer, sBuffSize);
|
1016
1099
|
ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
1017
1100
|
|
1018
|
-
|
1019
|
-
|
1020
|
-
|
1021
|
-
|
1022
|
-
|
1023
|
-
return result; }
|
1101
|
+
result =
|
1102
|
+
ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
|
1103
|
+
samplesSizes, nbSamples, params);
|
1104
|
+
free(newBuff);
|
1105
|
+
return result;
|
1024
1106
|
}
|
1025
1107
|
|
1026
1108
|
|
1027
1109
|
size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
|
1028
1110
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
|
1029
1111
|
{
|
1030
|
-
|
1112
|
+
ZDICT_fastCover_params_t params;
|
1113
|
+
DEBUGLOG(3, "ZDICT_trainFromBuffer");
|
1031
1114
|
memset(&params, 0, sizeof(params));
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1115
|
+
params.d = 8;
|
1116
|
+
params.steps = 4;
|
1117
|
+
/* Default to level 6 since no compression level information is available */
|
1118
|
+
params.zParams.compressionLevel = 3;
|
1119
|
+
#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
|
1120
|
+
params.zParams.notificationLevel = DEBUGLEVEL;
|
1121
|
+
#endif
|
1122
|
+
return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
|
1123
|
+
samplesBuffer, samplesSizes, nbSamples,
|
1124
|
+
&params);
|
1035
1125
|
}
|
1036
1126
|
|
1037
1127
|
size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
|
1038
|
-
|
1128
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
|
1039
1129
|
{
|
1040
1130
|
ZDICT_params_t params;
|
1041
1131
|
memset(&params, 0, sizeof(params));
|