extzstd 0.0.3.CONCEPT → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/HISTORY.ja.md +39 -0
- data/LICENSE +6 -6
- data/README.md +26 -45
- data/contrib/zstd/CHANGELOG +555 -0
- data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
- data/contrib/zstd/CONTRIBUTING.md +392 -0
- data/contrib/zstd/COPYING +339 -0
- data/contrib/zstd/LICENSE +13 -9
- data/contrib/zstd/Makefile +414 -0
- data/contrib/zstd/README.md +170 -45
- data/contrib/zstd/TESTING.md +44 -0
- data/contrib/zstd/appveyor.yml +289 -0
- data/contrib/zstd/lib/BUCK +234 -0
- data/contrib/zstd/lib/Makefile +354 -0
- data/contrib/zstd/lib/README.md +179 -0
- data/contrib/zstd/{common → lib/common}/bitstream.h +170 -130
- data/contrib/zstd/lib/common/compiler.h +175 -0
- data/contrib/zstd/lib/common/cpu.h +215 -0
- data/contrib/zstd/lib/common/debug.c +24 -0
- data/contrib/zstd/lib/common/debug.h +114 -0
- data/contrib/zstd/{common → lib/common}/entropy_common.c +79 -94
- data/contrib/zstd/lib/common/error_private.c +55 -0
- data/contrib/zstd/lib/common/error_private.h +80 -0
- data/contrib/zstd/{common → lib/common}/fse.h +153 -93
- data/contrib/zstd/{common → lib/common}/fse_decompress.c +37 -82
- data/contrib/zstd/lib/common/huf.h +340 -0
- data/contrib/zstd/{common → lib/common}/mem.h +154 -78
- data/contrib/zstd/lib/common/pool.c +344 -0
- data/contrib/zstd/lib/common/pool.h +84 -0
- data/contrib/zstd/lib/common/threading.c +121 -0
- data/contrib/zstd/lib/common/threading.h +155 -0
- data/contrib/zstd/{common → lib/common}/xxhash.c +85 -75
- data/contrib/zstd/{common → lib/common}/xxhash.h +85 -73
- data/contrib/zstd/lib/common/zstd_common.c +83 -0
- data/contrib/zstd/lib/common/zstd_errors.h +94 -0
- data/contrib/zstd/lib/common/zstd_internal.h +447 -0
- data/contrib/zstd/{compress → lib/compress}/fse_compress.c +194 -303
- data/contrib/zstd/lib/compress/hist.c +183 -0
- data/contrib/zstd/lib/compress/hist.h +75 -0
- data/contrib/zstd/lib/compress/huf_compress.c +798 -0
- data/contrib/zstd/lib/compress/zstd_compress.c +4278 -0
- data/contrib/zstd/lib/compress/zstd_compress_internal.h +1125 -0
- data/contrib/zstd/lib/compress/zstd_compress_literals.c +158 -0
- data/contrib/zstd/lib/compress/zstd_compress_literals.h +29 -0
- data/contrib/zstd/lib/compress/zstd_compress_sequences.c +419 -0
- data/contrib/zstd/lib/compress/zstd_compress_sequences.h +54 -0
- data/contrib/zstd/lib/compress/zstd_compress_superblock.c +845 -0
- data/contrib/zstd/lib/compress/zstd_compress_superblock.h +32 -0
- data/contrib/zstd/lib/compress/zstd_cwksp.h +525 -0
- data/contrib/zstd/lib/compress/zstd_double_fast.c +521 -0
- data/contrib/zstd/lib/compress/zstd_double_fast.h +38 -0
- data/contrib/zstd/lib/compress/zstd_fast.c +496 -0
- data/contrib/zstd/lib/compress/zstd_fast.h +37 -0
- data/contrib/zstd/lib/compress/zstd_lazy.c +1138 -0
- data/contrib/zstd/lib/compress/zstd_lazy.h +67 -0
- data/contrib/zstd/lib/compress/zstd_ldm.c +619 -0
- data/contrib/zstd/lib/compress/zstd_ldm.h +110 -0
- data/contrib/zstd/lib/compress/zstd_opt.c +1200 -0
- data/contrib/zstd/lib/compress/zstd_opt.h +56 -0
- data/contrib/zstd/lib/compress/zstdmt_compress.c +2143 -0
- data/contrib/zstd/lib/compress/zstdmt_compress.h +192 -0
- data/contrib/zstd/lib/decompress/huf_decompress.c +1248 -0
- data/contrib/zstd/lib/decompress/zstd_ddict.c +244 -0
- data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
- data/contrib/zstd/lib/decompress/zstd_decompress.c +1885 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1432 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_block.h +59 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +189 -0
- data/contrib/zstd/{common → lib/deprecated}/zbuff.h +86 -69
- data/contrib/zstd/lib/deprecated/zbuff_common.c +26 -0
- data/contrib/zstd/lib/deprecated/zbuff_compress.c +147 -0
- data/contrib/zstd/lib/deprecated/zbuff_decompress.c +75 -0
- data/contrib/zstd/lib/dictBuilder/cover.c +1236 -0
- data/contrib/zstd/lib/dictBuilder/cover.h +157 -0
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.c +3 -3
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.h +5 -5
- data/contrib/zstd/lib/dictBuilder/fastcover.c +757 -0
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/zdict.c +437 -347
- data/contrib/zstd/lib/dictBuilder/zdict.h +305 -0
- data/contrib/zstd/lib/legacy/zstd_legacy.h +415 -0
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.c +272 -292
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.h +26 -32
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.c +162 -392
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.h +26 -32
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.c +162 -391
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.h +27 -33
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.c +195 -604
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.h +26 -32
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.c +300 -575
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.h +22 -31
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.c +165 -592
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.h +54 -67
- data/contrib/zstd/lib/legacy/zstd_v07.c +4541 -0
- data/contrib/zstd/lib/legacy/zstd_v07.h +187 -0
- data/contrib/zstd/lib/libzstd.pc.in +15 -0
- data/contrib/zstd/lib/zstd.h +2090 -0
- data/ext/depend +2 -0
- data/ext/extconf.rb +18 -5
- data/ext/extzstd.c +296 -214
- data/ext/extzstd.h +81 -36
- data/ext/extzstd_nogvls.h +0 -117
- data/ext/extzstd_stream.c +622 -0
- data/ext/libzstd_conf.h +8 -0
- data/ext/zstd_common.c +11 -0
- data/ext/zstd_compress.c +15 -0
- data/ext/zstd_decompress.c +6 -0
- data/ext/zstd_dictbuilder.c +10 -0
- data/ext/zstd_dictbuilder_fastcover.c +3 -0
- data/ext/zstd_legacy_v01.c +3 -1
- data/ext/zstd_legacy_v02.c +3 -1
- data/ext/zstd_legacy_v03.c +3 -1
- data/ext/zstd_legacy_v04.c +3 -1
- data/ext/zstd_legacy_v05.c +3 -1
- data/ext/zstd_legacy_v06.c +3 -1
- data/ext/zstd_legacy_v07.c +3 -0
- data/gemstub.rb +27 -21
- data/lib/extzstd.rb +82 -161
- data/lib/extzstd/version.rb +1 -1
- data/test/test_basic.rb +19 -6
- metadata +127 -59
- data/contrib/zstd/common/error_private.h +0 -125
- data/contrib/zstd/common/error_public.h +0 -77
- data/contrib/zstd/common/huf.h +0 -228
- data/contrib/zstd/common/zstd.h +0 -475
- data/contrib/zstd/common/zstd_common.c +0 -91
- data/contrib/zstd/common/zstd_internal.h +0 -238
- data/contrib/zstd/compress/huf_compress.c +0 -577
- data/contrib/zstd/compress/zbuff_compress.c +0 -327
- data/contrib/zstd/compress/zstd_compress.c +0 -3074
- data/contrib/zstd/compress/zstd_opt.h +0 -1046
- data/contrib/zstd/decompress/huf_decompress.c +0 -894
- data/contrib/zstd/decompress/zbuff_decompress.c +0 -294
- data/contrib/zstd/decompress/zstd_decompress.c +0 -1362
- data/contrib/zstd/dictBuilder/zdict.h +0 -113
- data/contrib/zstd/legacy/zstd_legacy.h +0 -140
- data/ext/extzstd_buffered.c +0 -265
- data/ext/zstd_amalgam.c +0 -18
|
@@ -1,40 +1,20 @@
|
|
|
1
1
|
/*
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
* Redistributions of source code must retain the above copyright
|
|
12
|
-
notice, this list of conditions and the following disclaimer.
|
|
13
|
-
* Redistributions in binary form must reproduce the above
|
|
14
|
-
copyright notice, this list of conditions and the following disclaimer
|
|
15
|
-
in the documentation and/or other materials provided with the
|
|
16
|
-
distribution.
|
|
17
|
-
|
|
18
|
-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
19
|
-
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
20
|
-
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
21
|
-
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
22
|
-
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
23
|
-
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
24
|
-
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
25
|
-
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
26
|
-
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
27
|
-
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
28
|
-
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
29
|
-
|
|
30
|
-
You can contact the author at :
|
|
31
|
-
- Zstd homepage : https://www.zstd.net
|
|
32
|
-
*/
|
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
|
3
|
+
* All rights reserved.
|
|
4
|
+
*
|
|
5
|
+
* This source code is licensed under both the BSD-style license (found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
|
7
|
+
* in the COPYING file in the root directory of this source tree).
|
|
8
|
+
* You may select, at your option, one of the above-listed licenses.
|
|
9
|
+
*/
|
|
10
|
+
|
|
33
11
|
|
|
34
12
|
/*-**************************************
|
|
35
13
|
* Tuning parameters
|
|
36
14
|
****************************************/
|
|
15
|
+
#define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */
|
|
37
16
|
#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
|
|
17
|
+
#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
|
|
38
18
|
|
|
39
19
|
|
|
40
20
|
/*-**************************************
|
|
@@ -57,18 +37,18 @@
|
|
|
57
37
|
#include <stdio.h> /* fprintf, fopen, ftello64 */
|
|
58
38
|
#include <time.h> /* clock */
|
|
59
39
|
|
|
60
|
-
#include "mem.h" /* read */
|
|
61
|
-
#include "
|
|
62
|
-
#include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
|
|
40
|
+
#include "../common/mem.h" /* read */
|
|
41
|
+
#include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
|
|
63
42
|
#define HUF_STATIC_LINKING_ONLY
|
|
64
|
-
#include "huf.h"
|
|
65
|
-
#include "zstd_internal.h" /* includes zstd.h */
|
|
66
|
-
#include "xxhash.h"
|
|
43
|
+
#include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
|
|
44
|
+
#include "../common/zstd_internal.h" /* includes zstd.h */
|
|
45
|
+
#include "../common/xxhash.h" /* XXH64 */
|
|
67
46
|
#include "divsufsort.h"
|
|
68
47
|
#ifndef ZDICT_STATIC_LINKING_ONLY
|
|
69
48
|
# define ZDICT_STATIC_LINKING_ONLY
|
|
70
49
|
#endif
|
|
71
50
|
#include "zdict.h"
|
|
51
|
+
#include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
|
|
72
52
|
|
|
73
53
|
|
|
74
54
|
/*-*************************************
|
|
@@ -78,43 +58,30 @@
|
|
|
78
58
|
#define MB *(1 <<20)
|
|
79
59
|
#define GB *(1U<<30)
|
|
80
60
|
|
|
81
|
-
#define
|
|
61
|
+
#define DICTLISTSIZE_DEFAULT 10000
|
|
82
62
|
|
|
83
63
|
#define NOISELENGTH 32
|
|
84
|
-
#define PRIME1 2654435761U
|
|
85
|
-
#define PRIME2 2246822519U
|
|
86
64
|
|
|
87
|
-
|
|
88
|
-
static const U32 g_compressionLevel_default = 5;
|
|
65
|
+
static const int g_compressionLevel_default = 3;
|
|
89
66
|
static const U32 g_selectivity_default = 9;
|
|
90
|
-
static const size_t g_provision_entropySize = 200;
|
|
91
|
-
static const size_t g_min_fast_dictContent = 192;
|
|
92
67
|
|
|
93
68
|
|
|
94
69
|
/*-*************************************
|
|
95
70
|
* Console display
|
|
96
71
|
***************************************/
|
|
97
72
|
#define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
|
|
98
|
-
#define DISPLAYLEVEL(l, ...) if (
|
|
99
|
-
static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
|
|
100
|
-
|
|
101
|
-
#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
|
|
102
|
-
if (ZDICT_clockSpan(g_time) > refreshRate) \
|
|
103
|
-
{ g_time = clock(); DISPLAY(__VA_ARGS__); \
|
|
104
|
-
if (g_displayLevel>=4) fflush(stdout); } }
|
|
105
|
-
static const clock_t refreshRate = CLOCKS_PER_SEC * 3 / 10;
|
|
106
|
-
static clock_t g_time = 0;
|
|
73
|
+
#define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
|
|
107
74
|
|
|
108
75
|
static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
|
|
109
76
|
|
|
110
|
-
static void ZDICT_printHex(
|
|
77
|
+
static void ZDICT_printHex(const void* ptr, size_t length)
|
|
111
78
|
{
|
|
112
79
|
const BYTE* const b = (const BYTE*)ptr;
|
|
113
80
|
size_t u;
|
|
114
81
|
for (u=0; u<length; u++) {
|
|
115
82
|
BYTE c = b[u];
|
|
116
83
|
if (c<32 || c>126) c = '.'; /* non-printable char */
|
|
117
|
-
|
|
84
|
+
DISPLAY("%c", c);
|
|
118
85
|
}
|
|
119
86
|
}
|
|
120
87
|
|
|
@@ -126,11 +93,41 @@ unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
|
|
|
126
93
|
|
|
127
94
|
const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
|
|
128
95
|
|
|
96
|
+
unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
|
|
97
|
+
{
|
|
98
|
+
if (dictSize < 8) return 0;
|
|
99
|
+
if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
|
|
100
|
+
return MEM_readLE32((const char*)dictBuffer + 4);
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
|
|
104
|
+
{
|
|
105
|
+
size_t headerSize;
|
|
106
|
+
if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
|
|
107
|
+
|
|
108
|
+
{ unsigned offcodeMaxValue = MaxOff;
|
|
109
|
+
ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
|
|
110
|
+
U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
|
|
111
|
+
short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short));
|
|
112
|
+
if (!bs || !wksp || !offcodeNCount) {
|
|
113
|
+
headerSize = ERROR(memory_allocation);
|
|
114
|
+
} else {
|
|
115
|
+
ZSTD_reset_compressedBlockState(bs);
|
|
116
|
+
headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize);
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
free(bs);
|
|
120
|
+
free(wksp);
|
|
121
|
+
free(offcodeNCount);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
return headerSize;
|
|
125
|
+
}
|
|
129
126
|
|
|
130
127
|
/*-********************************************************
|
|
131
128
|
* Dictionary training functions
|
|
132
129
|
**********************************************************/
|
|
133
|
-
static unsigned ZDICT_NbCommonBytes (
|
|
130
|
+
static unsigned ZDICT_NbCommonBytes (size_t val)
|
|
134
131
|
{
|
|
135
132
|
if (MEM_isLittleEndian()) {
|
|
136
133
|
if (MEM_64bits()) {
|
|
@@ -228,13 +225,12 @@ static void ZDICT_initDictItem(dictItem* d)
|
|
|
228
225
|
static dictItem ZDICT_analyzePos(
|
|
229
226
|
BYTE* doneMarks,
|
|
230
227
|
const int* suffix, U32 start,
|
|
231
|
-
const void* buffer, U32 minRatio)
|
|
228
|
+
const void* buffer, U32 minRatio, U32 notificationLevel)
|
|
232
229
|
{
|
|
233
230
|
U32 lengthList[LLIMIT] = {0};
|
|
234
231
|
U32 cumulLength[LLIMIT] = {0};
|
|
235
232
|
U32 savings[LLIMIT] = {0};
|
|
236
233
|
const BYTE* b = (const BYTE*)buffer;
|
|
237
|
-
size_t length;
|
|
238
234
|
size_t maxLength = LLIMIT;
|
|
239
235
|
size_t pos = suffix[start];
|
|
240
236
|
U32 end = start;
|
|
@@ -249,26 +245,30 @@ static dictItem ZDICT_analyzePos(
|
|
|
249
245
|
||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
|
|
250
246
|
||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
|
|
251
247
|
/* skip and mark segment */
|
|
252
|
-
U16
|
|
253
|
-
U32 u,
|
|
254
|
-
while (MEM_read16(b+pos+
|
|
255
|
-
if (b[pos+
|
|
256
|
-
for (u=1; u<
|
|
248
|
+
U16 const pattern16 = MEM_read16(b+pos+4);
|
|
249
|
+
U32 u, patternEnd = 6;
|
|
250
|
+
while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
|
|
251
|
+
if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
|
|
252
|
+
for (u=1; u<patternEnd; u++)
|
|
257
253
|
doneMarks[pos+u] = 1;
|
|
258
254
|
return solution;
|
|
259
255
|
}
|
|
260
256
|
|
|
261
257
|
/* look forward */
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
258
|
+
{ size_t length;
|
|
259
|
+
do {
|
|
260
|
+
end++;
|
|
261
|
+
length = ZDICT_count(b + pos, b + suffix[end]);
|
|
262
|
+
} while (length >= MINMATCHLENGTH);
|
|
263
|
+
}
|
|
266
264
|
|
|
267
265
|
/* look backward */
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
266
|
+
{ size_t length;
|
|
267
|
+
do {
|
|
268
|
+
length = ZDICT_count(b + pos, b + *(suffix+start-1));
|
|
269
|
+
if (length >=MINMATCHLENGTH) start--;
|
|
270
|
+
} while(length >= MINMATCHLENGTH);
|
|
271
|
+
}
|
|
272
272
|
|
|
273
273
|
/* exit if not found a minimum nb of repetitions */
|
|
274
274
|
if (end-start < minRatio) {
|
|
@@ -279,15 +279,15 @@ static dictItem ZDICT_analyzePos(
|
|
|
279
279
|
}
|
|
280
280
|
|
|
281
281
|
{ int i;
|
|
282
|
-
U32
|
|
282
|
+
U32 mml;
|
|
283
283
|
U32 refinedStart = start;
|
|
284
284
|
U32 refinedEnd = end;
|
|
285
285
|
|
|
286
286
|
DISPLAYLEVEL(4, "\n");
|
|
287
|
-
DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (
|
|
287
|
+
DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
|
|
288
288
|
DISPLAYLEVEL(4, "\n");
|
|
289
289
|
|
|
290
|
-
for (
|
|
290
|
+
for (mml = MINMATCHLENGTH ; ; mml++) {
|
|
291
291
|
BYTE currentChar = 0;
|
|
292
292
|
U32 currentCount = 0;
|
|
293
293
|
U32 currentID = refinedStart;
|
|
@@ -295,13 +295,13 @@ static dictItem ZDICT_analyzePos(
|
|
|
295
295
|
U32 selectedCount = 0;
|
|
296
296
|
U32 selectedID = currentID;
|
|
297
297
|
for (id =refinedStart; id < refinedEnd; id++) {
|
|
298
|
-
if (b[
|
|
298
|
+
if (b[suffix[id] + mml] != currentChar) {
|
|
299
299
|
if (currentCount > selectedCount) {
|
|
300
300
|
selectedCount = currentCount;
|
|
301
301
|
selectedID = currentID;
|
|
302
302
|
}
|
|
303
303
|
currentID = id;
|
|
304
|
-
currentChar = b[ suffix[id] +
|
|
304
|
+
currentChar = b[ suffix[id] + mml];
|
|
305
305
|
currentCount = 0;
|
|
306
306
|
}
|
|
307
307
|
currentCount ++;
|
|
@@ -317,27 +317,31 @@ static dictItem ZDICT_analyzePos(
|
|
|
317
317
|
refinedEnd = refinedStart + selectedCount;
|
|
318
318
|
}
|
|
319
319
|
|
|
320
|
-
/* evaluate gain based on new
|
|
320
|
+
/* evaluate gain based on new dict */
|
|
321
321
|
start = refinedStart;
|
|
322
322
|
pos = suffix[refinedStart];
|
|
323
323
|
end = start;
|
|
324
324
|
memset(lengthList, 0, sizeof(lengthList));
|
|
325
325
|
|
|
326
326
|
/* look forward */
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
327
|
+
{ size_t length;
|
|
328
|
+
do {
|
|
329
|
+
end++;
|
|
330
|
+
length = ZDICT_count(b + pos, b + suffix[end]);
|
|
331
|
+
if (length >= LLIMIT) length = LLIMIT-1;
|
|
332
|
+
lengthList[length]++;
|
|
333
|
+
} while (length >=MINMATCHLENGTH);
|
|
334
|
+
}
|
|
333
335
|
|
|
334
336
|
/* look backward */
|
|
335
|
-
|
|
336
|
-
length
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
337
|
+
{ size_t length = MINMATCHLENGTH;
|
|
338
|
+
while ((length >= MINMATCHLENGTH) & (start > 0)) {
|
|
339
|
+
length = ZDICT_count(b + pos, b + suffix[start - 1]);
|
|
340
|
+
if (length >= LLIMIT) length = LLIMIT - 1;
|
|
341
|
+
lengthList[length]++;
|
|
342
|
+
if (length >= MINMATCHLENGTH) start--;
|
|
343
|
+
}
|
|
344
|
+
}
|
|
341
345
|
|
|
342
346
|
/* largest useful length */
|
|
343
347
|
memset(cumulLength, 0, sizeof(cumulLength));
|
|
@@ -361,8 +365,8 @@ static dictItem ZDICT_analyzePos(
|
|
|
361
365
|
for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
|
|
362
366
|
savings[i] = savings[i-1] + (lengthList[i] * (i-3));
|
|
363
367
|
|
|
364
|
-
DISPLAYLEVEL(4, "Selected
|
|
365
|
-
(
|
|
368
|
+
DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
|
|
369
|
+
(unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
|
|
366
370
|
|
|
367
371
|
solution.pos = (U32)pos;
|
|
368
372
|
solution.length = (U32)maxLength;
|
|
@@ -371,12 +375,12 @@ static dictItem ZDICT_analyzePos(
|
|
|
371
375
|
/* mark positions done */
|
|
372
376
|
{ U32 id;
|
|
373
377
|
for (id=start; id<end; id++) {
|
|
374
|
-
U32 p, pEnd;
|
|
378
|
+
U32 p, pEnd, length;
|
|
375
379
|
U32 const testedPos = suffix[id];
|
|
376
380
|
if (testedPos == pos)
|
|
377
381
|
length = solution.length;
|
|
378
382
|
else {
|
|
379
|
-
length = ZDICT_count(b+pos, b+testedPos);
|
|
383
|
+
length = (U32)ZDICT_count(b+pos, b+testedPos);
|
|
380
384
|
if (length > solution.length) length = solution.length;
|
|
381
385
|
}
|
|
382
386
|
pEnd = (U32)(testedPos + length);
|
|
@@ -388,28 +392,43 @@ static dictItem ZDICT_analyzePos(
|
|
|
388
392
|
}
|
|
389
393
|
|
|
390
394
|
|
|
391
|
-
|
|
395
|
+
static int isIncluded(const void* in, const void* container, size_t length)
|
|
396
|
+
{
|
|
397
|
+
const char* const ip = (const char*) in;
|
|
398
|
+
const char* const into = (const char*) container;
|
|
399
|
+
size_t u;
|
|
400
|
+
|
|
401
|
+
for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */
|
|
402
|
+
if (ip[u] != into[u]) break;
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
return u==length;
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
/*! ZDICT_tryMerge() :
|
|
392
409
|
check if dictItem can be merged, do it if possible
|
|
393
410
|
@return : id of destination elt, 0 if not merged
|
|
394
411
|
*/
|
|
395
|
-
static U32
|
|
412
|
+
static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
|
|
396
413
|
{
|
|
397
414
|
const U32 tableSize = table->pos;
|
|
398
|
-
const U32
|
|
415
|
+
const U32 eltEnd = elt.pos + elt.length;
|
|
416
|
+
const char* const buf = (const char*) buffer;
|
|
399
417
|
|
|
400
418
|
/* tail overlap */
|
|
401
419
|
U32 u; for (u=1; u<tableSize; u++) {
|
|
402
420
|
if (u==eltNbToSkip) continue;
|
|
403
|
-
if ((table[u].pos > elt.pos) && (table[u].pos
|
|
421
|
+
if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
|
|
404
422
|
/* append */
|
|
405
|
-
U32 addedLength = table[u].pos - elt.pos;
|
|
423
|
+
U32 const addedLength = table[u].pos - elt.pos;
|
|
406
424
|
table[u].length += addedLength;
|
|
407
425
|
table[u].pos = elt.pos;
|
|
408
426
|
table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
|
|
409
|
-
table[u].savings += elt.length / 8; /* rough approx */
|
|
427
|
+
table[u].savings += elt.length / 8; /* rough approx bonus */
|
|
410
428
|
elt = table[u];
|
|
429
|
+
/* sort : improve rank */
|
|
411
430
|
while ((u>1) && (table[u-1].savings < elt.savings))
|
|
412
|
-
|
|
431
|
+
table[u] = table[u-1], u--;
|
|
413
432
|
table[u] = elt;
|
|
414
433
|
return u;
|
|
415
434
|
} }
|
|
@@ -417,20 +436,33 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
|
|
417
436
|
/* front overlap */
|
|
418
437
|
for (u=1; u<tableSize; u++) {
|
|
419
438
|
if (u==eltNbToSkip) continue;
|
|
420
|
-
|
|
439
|
+
|
|
440
|
+
if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
|
|
421
441
|
/* append */
|
|
422
|
-
int addedLength = (
|
|
423
|
-
table[u].savings += elt.length / 8; /* rough approx */
|
|
424
|
-
if (addedLength > 0) { /* otherwise,
|
|
442
|
+
int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
|
|
443
|
+
table[u].savings += elt.length / 8; /* rough approx bonus */
|
|
444
|
+
if (addedLength > 0) { /* otherwise, elt fully included into existing */
|
|
425
445
|
table[u].length += addedLength;
|
|
426
446
|
table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
|
|
427
447
|
}
|
|
448
|
+
/* sort : improve rank */
|
|
428
449
|
elt = table[u];
|
|
429
450
|
while ((u>1) && (table[u-1].savings < elt.savings))
|
|
430
451
|
table[u] = table[u-1], u--;
|
|
431
452
|
table[u] = elt;
|
|
432
453
|
return u;
|
|
433
|
-
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
|
|
457
|
+
if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
|
|
458
|
+
size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
|
|
459
|
+
table[u].pos = elt.pos;
|
|
460
|
+
table[u].savings += (U32)(elt.savings * addedLength / elt.length);
|
|
461
|
+
table[u].length = MIN(elt.length, table[u].length + 1);
|
|
462
|
+
return u;
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
}
|
|
434
466
|
|
|
435
467
|
return 0;
|
|
436
468
|
}
|
|
@@ -438,8 +470,8 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
|
|
438
470
|
|
|
439
471
|
static void ZDICT_removeDictItem(dictItem* table, U32 id)
|
|
440
472
|
{
|
|
441
|
-
/* convention :
|
|
442
|
-
U32 const max = table
|
|
473
|
+
/* convention : table[0].pos stores nb of elts */
|
|
474
|
+
U32 const max = table[0].pos;
|
|
443
475
|
U32 u;
|
|
444
476
|
if (!id) return; /* protection, should never happen */
|
|
445
477
|
for (u=id; u<max-1; u++)
|
|
@@ -448,14 +480,14 @@ static void ZDICT_removeDictItem(dictItem* table, U32 id)
|
|
|
448
480
|
}
|
|
449
481
|
|
|
450
482
|
|
|
451
|
-
static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
|
|
483
|
+
static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
|
|
452
484
|
{
|
|
453
485
|
/* merge if possible */
|
|
454
|
-
U32 mergeId =
|
|
486
|
+
U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
|
|
455
487
|
if (mergeId) {
|
|
456
488
|
U32 newMerge = 1;
|
|
457
489
|
while (newMerge) {
|
|
458
|
-
newMerge =
|
|
490
|
+
newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
|
|
459
491
|
if (newMerge) ZDICT_removeDictItem(table, mergeId);
|
|
460
492
|
mergeId = newMerge;
|
|
461
493
|
}
|
|
@@ -486,18 +518,24 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
|
|
|
486
518
|
}
|
|
487
519
|
|
|
488
520
|
|
|
489
|
-
static size_t
|
|
521
|
+
static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
|
|
490
522
|
const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
|
|
491
523
|
const size_t* fileSizes, unsigned nbFiles,
|
|
492
|
-
|
|
524
|
+
unsigned minRatio, U32 notificationLevel)
|
|
493
525
|
{
|
|
494
526
|
int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
|
|
495
527
|
int* const suffix = suffix0+1;
|
|
496
528
|
U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
|
|
497
529
|
BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
|
|
498
530
|
U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
|
|
499
|
-
U32 minRatio = nbFiles >> shiftRatio;
|
|
500
531
|
size_t result = 0;
|
|
532
|
+
clock_t displayClock = 0;
|
|
533
|
+
clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
|
|
534
|
+
|
|
535
|
+
# define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
|
|
536
|
+
if (ZDICT_clockSpan(displayClock) > refreshRate) \
|
|
537
|
+
{ displayClock = clock(); DISPLAY(__VA_ARGS__); \
|
|
538
|
+
if (notificationLevel>=4) fflush(stderr); } }
|
|
501
539
|
|
|
502
540
|
/* init */
|
|
503
541
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
|
@@ -509,11 +547,11 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
|
509
547
|
memset(doneMarks, 0, bufferSize+16);
|
|
510
548
|
|
|
511
549
|
/* limit sample set size (divsufsort limitation)*/
|
|
512
|
-
if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (
|
|
550
|
+
if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
|
|
513
551
|
while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
|
|
514
552
|
|
|
515
553
|
/* sort */
|
|
516
|
-
DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (
|
|
554
|
+
DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
|
|
517
555
|
{ int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
|
|
518
556
|
if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
|
|
519
557
|
}
|
|
@@ -523,7 +561,8 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
|
523
561
|
{ size_t pos;
|
|
524
562
|
for (pos=0; pos < bufferSize; pos++)
|
|
525
563
|
reverseSuffix[suffix[pos]] = (U32)pos;
|
|
526
|
-
/*
|
|
564
|
+
/* note filePos tracks borders between samples.
|
|
565
|
+
It's not used at this stage, but planned to become useful in a later update */
|
|
527
566
|
filePos[0] = 0;
|
|
528
567
|
for (pos=1; pos<nbFiles; pos++)
|
|
529
568
|
filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
|
|
@@ -535,23 +574,13 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
|
535
574
|
{ U32 cursor; for (cursor=0; cursor < bufferSize; ) {
|
|
536
575
|
dictItem solution;
|
|
537
576
|
if (doneMarks[cursor]) { cursor++; continue; }
|
|
538
|
-
solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
|
|
577
|
+
solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
|
|
539
578
|
if (solution.length==0) { cursor++; continue; }
|
|
540
|
-
ZDICT_insertDictItem(dictList, dictListSize, solution);
|
|
579
|
+
ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
|
|
541
580
|
cursor += solution.length;
|
|
542
581
|
DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
|
|
543
582
|
} }
|
|
544
583
|
|
|
545
|
-
/* limit dictionary size */
|
|
546
|
-
{ U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
|
|
547
|
-
U32 currentSize = 0;
|
|
548
|
-
U32 n; for (n=1; n<max; n++) {
|
|
549
|
-
currentSize += dictList[n].length;
|
|
550
|
-
if (currentSize > maxDictSize) break;
|
|
551
|
-
}
|
|
552
|
-
dictList->pos = n;
|
|
553
|
-
}
|
|
554
|
-
|
|
555
584
|
_cleanup:
|
|
556
585
|
free(suffix0);
|
|
557
586
|
free(reverseSuffix);
|
|
@@ -563,10 +592,12 @@ _cleanup:
|
|
|
563
592
|
|
|
564
593
|
static void ZDICT_fillNoise(void* buffer, size_t length)
|
|
565
594
|
{
|
|
566
|
-
unsigned
|
|
567
|
-
|
|
595
|
+
unsigned const prime1 = 2654435761U;
|
|
596
|
+
unsigned const prime2 = 2246822519U;
|
|
597
|
+
unsigned acc = prime1;
|
|
598
|
+
size_t p=0;
|
|
568
599
|
for (p=0; p<length; p++) {
|
|
569
|
-
acc *=
|
|
600
|
+
acc *= prime2;
|
|
570
601
|
((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
|
|
571
602
|
}
|
|
572
603
|
}
|
|
@@ -574,29 +605,31 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
|
|
|
574
605
|
|
|
575
606
|
typedef struct
|
|
576
607
|
{
|
|
577
|
-
|
|
578
|
-
ZSTD_CCtx* zc;
|
|
608
|
+
ZSTD_CDict* dict; /* dictionary */
|
|
609
|
+
ZSTD_CCtx* zc; /* working context */
|
|
579
610
|
void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
|
|
580
611
|
} EStats_ress_t;
|
|
581
612
|
|
|
582
613
|
#define MAXREPOFFSET 1024
|
|
583
614
|
|
|
584
|
-
static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
|
585
|
-
|
|
586
|
-
|
|
615
|
+
static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
|
|
616
|
+
unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
|
|
617
|
+
const void* src, size_t srcSize,
|
|
618
|
+
U32 notificationLevel)
|
|
587
619
|
{
|
|
588
|
-
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params
|
|
620
|
+
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
|
|
589
621
|
size_t cSize;
|
|
590
622
|
|
|
591
623
|
if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
624
|
+
{ size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
|
|
625
|
+
if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
|
|
626
|
+
|
|
627
|
+
}
|
|
595
628
|
cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
|
|
596
|
-
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(
|
|
629
|
+
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
|
|
597
630
|
|
|
598
631
|
if (cSize) { /* if == 0; block is not compressible */
|
|
599
|
-
const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
|
|
632
|
+
const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
|
|
600
633
|
|
|
601
634
|
/* literals stats */
|
|
602
635
|
{ const BYTE* bytePtr;
|
|
@@ -605,46 +638,34 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
|
|
605
638
|
}
|
|
606
639
|
|
|
607
640
|
/* seqStats */
|
|
608
|
-
{
|
|
609
|
-
ZSTD_seqToCodes(seqStorePtr
|
|
641
|
+
{ U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
|
|
642
|
+
ZSTD_seqToCodes(seqStorePtr);
|
|
610
643
|
|
|
611
|
-
{ const BYTE* codePtr = seqStorePtr->
|
|
612
|
-
|
|
644
|
+
{ const BYTE* codePtr = seqStorePtr->ofCode;
|
|
645
|
+
U32 u;
|
|
613
646
|
for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
|
|
614
647
|
}
|
|
615
648
|
|
|
616
|
-
{ const BYTE* codePtr = seqStorePtr->
|
|
617
|
-
|
|
649
|
+
{ const BYTE* codePtr = seqStorePtr->mlCode;
|
|
650
|
+
U32 u;
|
|
618
651
|
for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
|
|
619
652
|
}
|
|
620
653
|
|
|
621
|
-
{ const BYTE* codePtr = seqStorePtr->
|
|
622
|
-
|
|
654
|
+
{ const BYTE* codePtr = seqStorePtr->llCode;
|
|
655
|
+
U32 u;
|
|
623
656
|
for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
/* rep offsets */
|
|
627
|
-
{ const U32* const offsetPtr = seqStorePtr->offsetStart;
|
|
628
|
-
U32 offset1 = offsetPtr[0] - 3;
|
|
629
|
-
U32 offset2 = offsetPtr[1] - 3;
|
|
630
|
-
if (offset1 >= MAXREPOFFSET) offset1 = 0;
|
|
631
|
-
if (offset2 >= MAXREPOFFSET) offset2 = 0;
|
|
632
|
-
repOffsets[offset1] += 3;
|
|
633
|
-
repOffsets[offset2] += 1;
|
|
634
|
-
}
|
|
635
|
-
}
|
|
636
|
-
}
|
|
657
|
+
}
|
|
637
658
|
|
|
638
|
-
/*
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
659
|
+
if (nbSeq >= 2) { /* rep offsets */
|
|
660
|
+
const seqDef* const seq = seqStorePtr->sequencesStart;
|
|
661
|
+
U32 offset1 = seq[0].offset - 3;
|
|
662
|
+
U32 offset2 = seq[1].offset - 3;
|
|
663
|
+
if (offset1 >= MAXREPOFFSET) offset1 = 0;
|
|
664
|
+
if (offset2 >= MAXREPOFFSET) offset2 = 0;
|
|
665
|
+
repOffsets[offset1] += 3;
|
|
666
|
+
repOffsets[offset2] += 1;
|
|
667
|
+
} } }
|
|
646
668
|
}
|
|
647
|
-
*/
|
|
648
669
|
|
|
649
670
|
static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
|
|
650
671
|
{
|
|
@@ -670,72 +691,92 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
|
|
|
670
691
|
}
|
|
671
692
|
}
|
|
672
693
|
|
|
694
|
+
/* ZDICT_flatLit() :
|
|
695
|
+
* rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
|
|
696
|
+
* necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
|
|
697
|
+
*/
|
|
698
|
+
static void ZDICT_flatLit(unsigned* countLit)
|
|
699
|
+
{
|
|
700
|
+
int u;
|
|
701
|
+
for (u=1; u<256; u++) countLit[u] = 2;
|
|
702
|
+
countLit[0] = 4;
|
|
703
|
+
countLit[253] = 1;
|
|
704
|
+
countLit[254] = 1;
|
|
705
|
+
}
|
|
673
706
|
|
|
674
|
-
#define OFFCODE_MAX
|
|
707
|
+
#define OFFCODE_MAX 30 /* only applicable to first block */
|
|
675
708
|
static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
709
|
+
unsigned compressionLevel,
|
|
710
|
+
const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
|
|
711
|
+
const void* dictBuffer, size_t dictBufferSize,
|
|
712
|
+
unsigned notificationLevel)
|
|
679
713
|
{
|
|
680
|
-
|
|
714
|
+
unsigned countLit[256];
|
|
681
715
|
HUF_CREATE_STATIC_CTABLE(hufTable, 255);
|
|
682
|
-
|
|
716
|
+
unsigned offcodeCount[OFFCODE_MAX+1];
|
|
683
717
|
short offcodeNCount[OFFCODE_MAX+1];
|
|
684
|
-
U32
|
|
718
|
+
U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
|
|
719
|
+
unsigned matchLengthCount[MaxML+1];
|
|
685
720
|
short matchLengthNCount[MaxML+1];
|
|
686
|
-
|
|
721
|
+
unsigned litLengthCount[MaxLL+1];
|
|
687
722
|
short litLengthNCount[MaxLL+1];
|
|
688
|
-
U32 repOffset[MAXREPOFFSET]
|
|
723
|
+
U32 repOffset[MAXREPOFFSET];
|
|
689
724
|
offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
|
|
690
|
-
EStats_ress_t esr;
|
|
725
|
+
EStats_ress_t esr = { NULL, NULL, NULL };
|
|
691
726
|
ZSTD_parameters params;
|
|
692
|
-
U32 u, huffLog =
|
|
727
|
+
U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
|
|
693
728
|
size_t pos = 0, errorCode;
|
|
694
729
|
size_t eSize = 0;
|
|
695
730
|
size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
|
|
696
|
-
size_t const averageSampleSize = totalSrcSize / nbFiles;
|
|
731
|
+
size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
|
|
697
732
|
BYTE* dstPtr = (BYTE*)dstBuffer;
|
|
698
733
|
|
|
699
734
|
/* init */
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
for (u=0; u
|
|
703
|
-
for (u=0; u<=
|
|
735
|
+
DEBUGLOG(4, "ZDICT_analyzeEntropy");
|
|
736
|
+
if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
|
|
737
|
+
for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
|
|
738
|
+
for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
|
|
739
|
+
for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
|
|
740
|
+
for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
|
|
741
|
+
memset(repOffset, 0, sizeof(repOffset));
|
|
704
742
|
repOffset[1] = repOffset[4] = repOffset[8] = 1;
|
|
705
743
|
memset(bestRepOffset, 0, sizeof(bestRepOffset));
|
|
706
|
-
|
|
744
|
+
if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
|
|
745
|
+
params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
|
|
746
|
+
|
|
747
|
+
esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
|
|
707
748
|
esr.zc = ZSTD_createCCtx();
|
|
708
749
|
esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
|
|
709
|
-
if (!esr.
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
750
|
+
if (!esr.dict || !esr.zc || !esr.workPlace) {
|
|
751
|
+
eSize = ERROR(memory_allocation);
|
|
752
|
+
DISPLAYLEVEL(1, "Not enough memory \n");
|
|
753
|
+
goto _cleanup;
|
|
713
754
|
}
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
{ size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
|
|
717
|
-
if (ZSTD_isError(beginResult)) {
|
|
718
|
-
eSize = ERROR(GENERIC);
|
|
719
|
-
DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed ");
|
|
720
|
-
goto _cleanup;
|
|
721
|
-
} }
|
|
722
|
-
|
|
723
|
-
/* collect stats on all files */
|
|
755
|
+
|
|
756
|
+
/* collect stats on all samples */
|
|
724
757
|
for (u=0; u<nbFiles; u++) {
|
|
725
|
-
ZDICT_countEStats(esr, params,
|
|
726
|
-
|
|
727
|
-
|
|
758
|
+
ZDICT_countEStats(esr, &params,
|
|
759
|
+
countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
|
|
760
|
+
(const char*)srcBuffer + pos, fileSizes[u],
|
|
761
|
+
notificationLevel);
|
|
728
762
|
pos += fileSizes[u];
|
|
729
763
|
}
|
|
730
764
|
|
|
731
|
-
/* analyze */
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
765
|
+
/* analyze, build stats, starting with literals */
|
|
766
|
+
{ size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
|
|
767
|
+
if (HUF_isError(maxNbBits)) {
|
|
768
|
+
eSize = maxNbBits;
|
|
769
|
+
DISPLAYLEVEL(1, " HUF_buildCTable error \n");
|
|
770
|
+
goto _cleanup;
|
|
771
|
+
}
|
|
772
|
+
if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
|
|
773
|
+
DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
|
|
774
|
+
ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
|
|
775
|
+
maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
|
|
776
|
+
assert(maxNbBits==9);
|
|
777
|
+
}
|
|
778
|
+
huffLog = (U32)maxNbBits;
|
|
737
779
|
}
|
|
738
|
-
huffLog = (U32)errorCode;
|
|
739
780
|
|
|
740
781
|
/* looking for most common first offsets */
|
|
741
782
|
{ U32 offset;
|
|
@@ -744,11 +785,11 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
744
785
|
}
|
|
745
786
|
/* note : the result of this phase should be used to better appreciate the impact on statistics */
|
|
746
787
|
|
|
747
|
-
total=0; for (u=0; u<=
|
|
748
|
-
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total,
|
|
788
|
+
total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
|
|
789
|
+
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
|
|
749
790
|
if (FSE_isError(errorCode)) {
|
|
750
|
-
eSize =
|
|
751
|
-
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount");
|
|
791
|
+
eSize = errorCode;
|
|
792
|
+
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
|
|
752
793
|
goto _cleanup;
|
|
753
794
|
}
|
|
754
795
|
Offlog = (U32)errorCode;
|
|
@@ -756,8 +797,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
756
797
|
total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
|
|
757
798
|
errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
|
|
758
799
|
if (FSE_isError(errorCode)) {
|
|
759
|
-
eSize =
|
|
760
|
-
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount");
|
|
800
|
+
eSize = errorCode;
|
|
801
|
+
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
|
|
761
802
|
goto _cleanup;
|
|
762
803
|
}
|
|
763
804
|
mlLog = (U32)errorCode;
|
|
@@ -765,18 +806,17 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
765
806
|
total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
|
|
766
807
|
errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
|
|
767
808
|
if (FSE_isError(errorCode)) {
|
|
768
|
-
eSize =
|
|
769
|
-
DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount");
|
|
809
|
+
eSize = errorCode;
|
|
810
|
+
DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
|
|
770
811
|
goto _cleanup;
|
|
771
812
|
}
|
|
772
813
|
llLog = (U32)errorCode;
|
|
773
814
|
|
|
774
|
-
|
|
775
815
|
/* write result to buffer */
|
|
776
816
|
{ size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
|
|
777
817
|
if (HUF_isError(hhSize)) {
|
|
778
|
-
eSize =
|
|
779
|
-
DISPLAYLEVEL(1, "HUF_writeCTable error");
|
|
818
|
+
eSize = hhSize;
|
|
819
|
+
DISPLAYLEVEL(1, "HUF_writeCTable error \n");
|
|
780
820
|
goto _cleanup;
|
|
781
821
|
}
|
|
782
822
|
dstPtr += hhSize;
|
|
@@ -786,8 +826,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
786
826
|
|
|
787
827
|
{ size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
|
|
788
828
|
if (FSE_isError(ohSize)) {
|
|
789
|
-
eSize =
|
|
790
|
-
DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount");
|
|
829
|
+
eSize = ohSize;
|
|
830
|
+
DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
|
|
791
831
|
goto _cleanup;
|
|
792
832
|
}
|
|
793
833
|
dstPtr += ohSize;
|
|
@@ -797,8 +837,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
797
837
|
|
|
798
838
|
{ size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
|
|
799
839
|
if (FSE_isError(mhSize)) {
|
|
800
|
-
eSize =
|
|
801
|
-
DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount");
|
|
840
|
+
eSize = mhSize;
|
|
841
|
+
DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
|
|
802
842
|
goto _cleanup;
|
|
803
843
|
}
|
|
804
844
|
dstPtr += mhSize;
|
|
@@ -808,8 +848,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
808
848
|
|
|
809
849
|
{ size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
|
|
810
850
|
if (FSE_isError(lhSize)) {
|
|
811
|
-
eSize =
|
|
812
|
-
DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount");
|
|
851
|
+
eSize = lhSize;
|
|
852
|
+
DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
|
|
813
853
|
goto _cleanup;
|
|
814
854
|
}
|
|
815
855
|
dstPtr += lhSize;
|
|
@@ -818,8 +858,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
818
858
|
}
|
|
819
859
|
|
|
820
860
|
if (maxDstSize<12) {
|
|
821
|
-
eSize = ERROR(
|
|
822
|
-
DISPLAYLEVEL(1, "not enough space to write RepOffsets");
|
|
861
|
+
eSize = ERROR(dstSize_tooSmall);
|
|
862
|
+
DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
|
|
823
863
|
goto _cleanup;
|
|
824
864
|
}
|
|
825
865
|
# if 0
|
|
@@ -833,11 +873,10 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
833
873
|
MEM_writeLE32(dstPtr+4, repStartValue[1]);
|
|
834
874
|
MEM_writeLE32(dstPtr+8, repStartValue[2]);
|
|
835
875
|
#endif
|
|
836
|
-
dstPtr += 12;
|
|
837
876
|
eSize += 12;
|
|
838
877
|
|
|
839
878
|
_cleanup:
|
|
840
|
-
|
|
879
|
+
ZSTD_freeCDict(esr.dict);
|
|
841
880
|
ZSTD_freeCCtx(esr.zc);
|
|
842
881
|
free(esr.workPlace);
|
|
843
882
|
|
|
@@ -845,129 +884,180 @@ _cleanup:
|
|
|
845
884
|
}
|
|
846
885
|
|
|
847
886
|
|
|
848
|
-
#define DIB_FASTSEGMENTSIZE 64
|
|
849
|
-
/*! ZDICT_fastSampling() (based on an idea proposed by Giuseppe Ottaviano) :
|
|
850
|
-
Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`,
|
|
851
|
-
up to `dictSize`.
|
|
852
|
-
Filling starts from the end of `dictBuffer`, down to maximum possible.
|
|
853
|
-
if `dictSize` is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
|
|
854
|
-
@return : amount of data written into `dictBuffer`,
|
|
855
|
-
or an error code
|
|
856
|
-
*/
|
|
857
|
-
static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
|
|
858
|
-
const void* samplesBuffer, size_t samplesSize)
|
|
859
|
-
{
|
|
860
|
-
char* dstPtr = (char*)dictBuffer + dictSize;
|
|
861
|
-
const char* srcPtr = (const char*)samplesBuffer;
|
|
862
|
-
size_t const nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
|
|
863
|
-
size_t segNb, interSize;
|
|
864
|
-
|
|
865
|
-
if (nbSegments <= 2) return ERROR(srcSize_wrong);
|
|
866
|
-
if (samplesSize < dictSize) return ERROR(srcSize_wrong);
|
|
867
|
-
|
|
868
|
-
/* first and last segments are part of dictionary, in case they contain interesting header/footer */
|
|
869
|
-
dstPtr -= DIB_FASTSEGMENTSIZE;
|
|
870
|
-
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
|
|
871
|
-
dstPtr -= DIB_FASTSEGMENTSIZE;
|
|
872
|
-
memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
|
|
873
|
-
|
|
874
|
-
/* regularly copy a segment */
|
|
875
|
-
interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
|
|
876
|
-
srcPtr += DIB_FASTSEGMENTSIZE;
|
|
877
|
-
for (segNb=2; segNb < nbSegments; segNb++) {
|
|
878
|
-
srcPtr += interSize;
|
|
879
|
-
dstPtr -= DIB_FASTSEGMENTSIZE;
|
|
880
|
-
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
|
|
881
|
-
srcPtr += DIB_FASTSEGMENTSIZE;
|
|
882
|
-
}
|
|
883
|
-
|
|
884
|
-
return nbSegments * DIB_FASTSEGMENTSIZE;
|
|
885
|
-
}
|
|
886
887
|
|
|
887
|
-
size_t
|
|
888
|
-
|
|
889
|
-
|
|
888
|
+
size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
|
|
889
|
+
const void* customDictContent, size_t dictContentSize,
|
|
890
|
+
const void* samplesBuffer, const size_t* samplesSizes,
|
|
891
|
+
unsigned nbSamples, ZDICT_params_t params)
|
|
890
892
|
{
|
|
891
893
|
size_t hSize;
|
|
892
|
-
|
|
894
|
+
#define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
|
|
895
|
+
BYTE header[HBUFFSIZE];
|
|
896
|
+
int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
|
|
897
|
+
U32 const notificationLevel = params.notificationLevel;
|
|
898
|
+
|
|
899
|
+
/* check conditions */
|
|
900
|
+
DEBUGLOG(4, "ZDICT_finalizeDictionary");
|
|
901
|
+
if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
|
|
902
|
+
if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
|
|
903
|
+
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
|
|
893
904
|
|
|
894
905
|
/* dictionary header */
|
|
895
|
-
MEM_writeLE32(
|
|
896
|
-
{ U64 const randomID = XXH64(
|
|
906
|
+
MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
|
|
907
|
+
{ U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
|
|
897
908
|
U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
|
|
898
909
|
U32 const dictID = params.dictID ? params.dictID : compliantID;
|
|
899
|
-
MEM_writeLE32(
|
|
910
|
+
MEM_writeLE32(header+4, dictID);
|
|
900
911
|
}
|
|
901
912
|
hSize = 8;
|
|
902
913
|
|
|
903
914
|
/* entropy tables */
|
|
904
915
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
|
905
916
|
DISPLAYLEVEL(2, "statistics ... \n");
|
|
906
|
-
|
|
917
|
+
{ size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
|
|
918
|
+
compressionLevel,
|
|
919
|
+
samplesBuffer, samplesSizes, nbSamples,
|
|
920
|
+
customDictContent, dictContentSize,
|
|
921
|
+
notificationLevel);
|
|
922
|
+
if (ZDICT_isError(eSize)) return eSize;
|
|
923
|
+
hSize += eSize;
|
|
924
|
+
}
|
|
925
|
+
|
|
926
|
+
/* copy elements in final buffer ; note : src and dst buffer can overlap */
|
|
927
|
+
if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
|
|
928
|
+
{ size_t const dictSize = hSize + dictContentSize;
|
|
929
|
+
char* dictEnd = (char*)dictBuffer + dictSize;
|
|
930
|
+
memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
|
|
931
|
+
memcpy(dictBuffer, header, hSize);
|
|
932
|
+
return dictSize;
|
|
933
|
+
}
|
|
934
|
+
}
|
|
935
|
+
|
|
936
|
+
|
|
937
|
+
static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
|
|
938
|
+
void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
|
|
939
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
|
940
|
+
ZDICT_params_t params)
|
|
941
|
+
{
|
|
942
|
+
int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
|
|
943
|
+
U32 const notificationLevel = params.notificationLevel;
|
|
944
|
+
size_t hSize = 8;
|
|
945
|
+
|
|
946
|
+
/* calculate entropy tables */
|
|
947
|
+
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
|
948
|
+
DISPLAYLEVEL(2, "statistics ... \n");
|
|
949
|
+
{ size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
|
|
907
950
|
compressionLevel,
|
|
908
951
|
samplesBuffer, samplesSizes, nbSamples,
|
|
909
|
-
(char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize
|
|
952
|
+
(char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
|
|
953
|
+
notificationLevel);
|
|
954
|
+
if (ZDICT_isError(eSize)) return eSize;
|
|
955
|
+
hSize += eSize;
|
|
956
|
+
}
|
|
957
|
+
|
|
958
|
+
/* add dictionary header (after entropy tables) */
|
|
959
|
+
MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
|
|
960
|
+
{ U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
|
|
961
|
+
U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
|
|
962
|
+
U32 const dictID = params.dictID ? params.dictID : compliantID;
|
|
963
|
+
MEM_writeLE32((char*)dictBuffer+4, dictID);
|
|
964
|
+
}
|
|
910
965
|
|
|
911
966
|
if (hSize + dictContentSize < dictBufferCapacity)
|
|
912
967
|
memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
|
|
913
968
|
return MIN(dictBufferCapacity, hSize+dictContentSize);
|
|
914
969
|
}
|
|
915
970
|
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
*
|
|
920
|
-
|
|
971
|
+
/* Hidden declaration for dbio.c */
|
|
972
|
+
size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
|
973
|
+
void* dictBuffer, size_t maxDictSize,
|
|
974
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
|
975
|
+
ZDICT_legacy_params_t params);
|
|
976
|
+
/*! ZDICT_trainFromBuffer_unsafe_legacy() :
|
|
977
|
+
* Warning : `samplesBuffer` must be followed by noisy guard band.
|
|
978
|
+
* @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
|
|
921
979
|
*/
|
|
922
|
-
size_t
|
|
980
|
+
size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
|
923
981
|
void* dictBuffer, size_t maxDictSize,
|
|
924
982
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
|
925
|
-
|
|
983
|
+
ZDICT_legacy_params_t params)
|
|
926
984
|
{
|
|
927
|
-
U32 const dictListSize = MAX(
|
|
985
|
+
U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
|
|
928
986
|
dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
|
|
929
|
-
unsigned selectivity = params.selectivityLevel;
|
|
987
|
+
unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
|
|
988
|
+
unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
|
|
930
989
|
size_t const targetDictSize = maxDictSize;
|
|
931
|
-
size_t
|
|
990
|
+
size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
|
|
932
991
|
size_t dictSize = 0;
|
|
992
|
+
U32 const notificationLevel = params.zParams.notificationLevel;
|
|
933
993
|
|
|
934
994
|
/* checks */
|
|
935
995
|
if (!dictList) return ERROR(memory_allocation);
|
|
936
|
-
if (maxDictSize
|
|
996
|
+
if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */
|
|
997
|
+
if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */
|
|
937
998
|
|
|
938
999
|
/* init */
|
|
939
|
-
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
|
|
940
|
-
if (sBuffSize < DIB_MINSAMPLESSIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
|
|
941
1000
|
ZDICT_initDictItem(dictList);
|
|
942
|
-
g_displayLevel = params.notificationLevel;
|
|
943
|
-
if (selectivity==0) selectivity = g_selectivity_default;
|
|
944
1001
|
|
|
945
1002
|
/* build dictionary */
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
1003
|
+
ZDICT_trainBuffer_legacy(dictList, dictListSize,
|
|
1004
|
+
samplesBuffer, samplesBuffSize,
|
|
1005
|
+
samplesSizes, nbSamples,
|
|
1006
|
+
minRep, notificationLevel);
|
|
1007
|
+
|
|
1008
|
+
/* display best matches */
|
|
1009
|
+
if (params.zParams.notificationLevel>= 3) {
|
|
1010
|
+
unsigned const nb = MIN(25, dictList[0].pos);
|
|
1011
|
+
unsigned const dictContentSize = ZDICT_dictSize(dictList);
|
|
1012
|
+
unsigned u;
|
|
1013
|
+
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
|
|
1014
|
+
DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
|
|
1015
|
+
for (u=1; u<nb; u++) {
|
|
1016
|
+
unsigned const pos = dictList[u].pos;
|
|
1017
|
+
unsigned const length = dictList[u].length;
|
|
1018
|
+
U32 const printedLength = MIN(40, length);
|
|
1019
|
+
if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
|
|
1020
|
+
free(dictList);
|
|
1021
|
+
return ERROR(GENERIC); /* should never happen */
|
|
1022
|
+
}
|
|
1023
|
+
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
|
1024
|
+
u, length, pos, (unsigned)dictList[u].savings);
|
|
1025
|
+
ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
|
|
1026
|
+
DISPLAYLEVEL(3, "| \n");
|
|
1027
|
+
} }
|
|
1028
|
+
|
|
968
1029
|
|
|
969
1030
|
/* create dictionary */
|
|
970
|
-
{
|
|
1031
|
+
{ unsigned dictContentSize = ZDICT_dictSize(dictList);
|
|
1032
|
+
if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
|
|
1033
|
+
if (dictContentSize < targetDictSize/4) {
|
|
1034
|
+
DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
|
|
1035
|
+
if (samplesBuffSize < 10 * targetDictSize)
|
|
1036
|
+
DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
|
|
1037
|
+
if (minRep > MINRATIO) {
|
|
1038
|
+
DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
|
|
1039
|
+
DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
|
|
1040
|
+
}
|
|
1041
|
+
}
|
|
1042
|
+
|
|
1043
|
+
if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
|
|
1044
|
+
unsigned proposedSelectivity = selectivity-1;
|
|
1045
|
+
while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
|
|
1046
|
+
DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
|
|
1047
|
+
DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
|
|
1048
|
+
DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
|
|
1049
|
+
}
|
|
1050
|
+
|
|
1051
|
+
/* limit dictionary size */
|
|
1052
|
+
{ U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
|
|
1053
|
+
U32 currentSize = 0;
|
|
1054
|
+
U32 n; for (n=1; n<max; n++) {
|
|
1055
|
+
currentSize += dictList[n].length;
|
|
1056
|
+
if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
|
|
1057
|
+
}
|
|
1058
|
+
dictList->pos = n;
|
|
1059
|
+
dictContentSize = currentSize;
|
|
1060
|
+
}
|
|
971
1061
|
|
|
972
1062
|
/* build dict content */
|
|
973
1063
|
{ U32 u;
|
|
@@ -979,17 +1069,9 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
|
979
1069
|
memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
|
|
980
1070
|
} }
|
|
981
1071
|
|
|
982
|
-
/* fast mode dict content */
|
|
983
|
-
if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
|
|
984
|
-
DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
|
|
985
|
-
DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
|
|
986
|
-
dictContentSize = (U32)ZDICT_fastSampling(dictBuffer, targetDictSize,
|
|
987
|
-
samplesBuffer, sBuffSize);
|
|
988
|
-
}
|
|
989
|
-
|
|
990
1072
|
dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
|
|
991
1073
|
samplesBuffer, samplesSizes, nbSamples,
|
|
992
|
-
params);
|
|
1074
|
+
params.zParams);
|
|
993
1075
|
}
|
|
994
1076
|
|
|
995
1077
|
/* clean up */
|
|
@@ -998,44 +1080,52 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
|
998
1080
|
}
|
|
999
1081
|
|
|
1000
1082
|
|
|
1001
|
-
/*
|
|
1002
|
-
*
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1083
|
+
/* ZDICT_trainFromBuffer_legacy() :
|
|
1084
|
+
* issue : samplesBuffer need to be followed by a noisy guard band.
|
|
1085
|
+
* work around : duplicate the buffer, and add the noise */
|
|
1086
|
+
size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
|
|
1087
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
|
1088
|
+
ZDICT_legacy_params_t params)
|
|
1006
1089
|
{
|
|
1090
|
+
size_t result;
|
|
1007
1091
|
void* newBuff;
|
|
1008
|
-
size_t sBuffSize;
|
|
1092
|
+
size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
|
|
1093
|
+
if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */
|
|
1009
1094
|
|
|
1010
|
-
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
|
|
1011
|
-
if (sBuffSize==0) return 0; /* empty content => no dictionary */
|
|
1012
1095
|
newBuff = malloc(sBuffSize + NOISELENGTH);
|
|
1013
1096
|
if (!newBuff) return ERROR(memory_allocation);
|
|
1014
1097
|
|
|
1015
1098
|
memcpy(newBuff, samplesBuffer, sBuffSize);
|
|
1016
1099
|
ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
|
1017
1100
|
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
return result; }
|
|
1101
|
+
result =
|
|
1102
|
+
ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
|
|
1103
|
+
samplesSizes, nbSamples, params);
|
|
1104
|
+
free(newBuff);
|
|
1105
|
+
return result;
|
|
1024
1106
|
}
|
|
1025
1107
|
|
|
1026
1108
|
|
|
1027
1109
|
size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
|
|
1028
1110
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
|
|
1029
1111
|
{
|
|
1030
|
-
|
|
1112
|
+
ZDICT_fastCover_params_t params;
|
|
1113
|
+
DEBUGLOG(3, "ZDICT_trainFromBuffer");
|
|
1031
1114
|
memset(&params, 0, sizeof(params));
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1115
|
+
params.d = 8;
|
|
1116
|
+
params.steps = 4;
|
|
1117
|
+
/* Default to level 6 since no compression level information is available */
|
|
1118
|
+
params.zParams.compressionLevel = 3;
|
|
1119
|
+
#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
|
|
1120
|
+
params.zParams.notificationLevel = DEBUGLEVEL;
|
|
1121
|
+
#endif
|
|
1122
|
+
return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
|
|
1123
|
+
samplesBuffer, samplesSizes, nbSamples,
|
|
1124
|
+
&params);
|
|
1035
1125
|
}
|
|
1036
1126
|
|
|
1037
1127
|
size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
|
|
1038
|
-
|
|
1128
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
|
|
1039
1129
|
{
|
|
1040
1130
|
ZDICT_params_t params;
|
|
1041
1131
|
memset(&params, 0, sizeof(params));
|