extzstd 0.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/HISTORY.ja.md +39 -0
- data/README.md +38 -56
- data/contrib/zstd/CHANGELOG +613 -0
- data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
- data/contrib/zstd/CONTRIBUTING.md +406 -0
- data/contrib/zstd/COPYING +339 -0
- data/contrib/zstd/Makefile +420 -0
- data/contrib/zstd/README.md +179 -41
- data/contrib/zstd/TESTING.md +44 -0
- data/contrib/zstd/appveyor.yml +292 -0
- data/contrib/zstd/lib/BUCK +234 -0
- data/contrib/zstd/lib/Makefile +451 -0
- data/contrib/zstd/lib/README.md +207 -0
- data/contrib/zstd/{common → lib/common}/bitstream.h +187 -138
- data/contrib/zstd/lib/common/compiler.h +288 -0
- data/contrib/zstd/lib/common/cpu.h +213 -0
- data/contrib/zstd/lib/common/debug.c +24 -0
- data/contrib/zstd/lib/common/debug.h +107 -0
- data/contrib/zstd/lib/common/entropy_common.c +362 -0
- data/contrib/zstd/{common → lib/common}/error_private.c +25 -12
- data/contrib/zstd/{common → lib/common}/error_private.h +14 -10
- data/contrib/zstd/{common → lib/common}/fse.h +173 -92
- data/contrib/zstd/{common → lib/common}/fse_decompress.c +149 -85
- data/contrib/zstd/lib/common/huf.h +361 -0
- data/contrib/zstd/{common → lib/common}/mem.h +115 -59
- data/contrib/zstd/lib/common/pool.c +350 -0
- data/contrib/zstd/lib/common/pool.h +84 -0
- data/contrib/zstd/lib/common/threading.c +122 -0
- data/contrib/zstd/lib/common/threading.h +155 -0
- data/contrib/zstd/{common → lib/common}/xxhash.c +55 -96
- data/contrib/zstd/{common → lib/common}/xxhash.h +23 -47
- data/contrib/zstd/lib/common/zstd_common.c +83 -0
- data/contrib/zstd/lib/common/zstd_deps.h +111 -0
- data/contrib/zstd/lib/common/zstd_errors.h +95 -0
- data/contrib/zstd/lib/common/zstd_internal.h +478 -0
- data/contrib/zstd/{compress → lib/compress}/fse_compress.c +214 -319
- data/contrib/zstd/lib/compress/hist.c +181 -0
- data/contrib/zstd/lib/compress/hist.h +75 -0
- data/contrib/zstd/lib/compress/huf_compress.c +913 -0
- data/contrib/zstd/lib/compress/zstd_compress.c +5208 -0
- data/contrib/zstd/lib/compress/zstd_compress_internal.h +1203 -0
- data/contrib/zstd/lib/compress/zstd_compress_literals.c +158 -0
- data/contrib/zstd/lib/compress/zstd_compress_literals.h +29 -0
- data/contrib/zstd/lib/compress/zstd_compress_sequences.c +433 -0
- data/contrib/zstd/lib/compress/zstd_compress_sequences.h +54 -0
- data/contrib/zstd/lib/compress/zstd_compress_superblock.c +849 -0
- data/contrib/zstd/lib/compress/zstd_compress_superblock.h +32 -0
- data/contrib/zstd/lib/compress/zstd_cwksp.h +561 -0
- data/contrib/zstd/lib/compress/zstd_double_fast.c +521 -0
- data/contrib/zstd/lib/compress/zstd_double_fast.h +38 -0
- data/contrib/zstd/lib/compress/zstd_fast.c +496 -0
- data/contrib/zstd/lib/compress/zstd_fast.h +37 -0
- data/contrib/zstd/lib/compress/zstd_lazy.c +1412 -0
- data/contrib/zstd/lib/compress/zstd_lazy.h +87 -0
- data/contrib/zstd/lib/compress/zstd_ldm.c +660 -0
- data/contrib/zstd/lib/compress/zstd_ldm.h +116 -0
- data/contrib/zstd/lib/compress/zstd_opt.c +1345 -0
- data/contrib/zstd/lib/compress/zstd_opt.h +56 -0
- data/contrib/zstd/lib/compress/zstdmt_compress.c +1811 -0
- data/contrib/zstd/lib/compress/zstdmt_compress.h +110 -0
- data/contrib/zstd/lib/decompress/huf_decompress.c +1350 -0
- data/contrib/zstd/lib/decompress/zstd_ddict.c +244 -0
- data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
- data/contrib/zstd/lib/decompress/zstd_decompress.c +1930 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1540 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_block.h +62 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +190 -0
- data/contrib/zstd/{common → lib/deprecated}/zbuff.h +68 -45
- data/contrib/zstd/lib/deprecated/zbuff_common.c +26 -0
- data/contrib/zstd/lib/deprecated/zbuff_compress.c +147 -0
- data/contrib/zstd/lib/deprecated/zbuff_decompress.c +75 -0
- data/contrib/zstd/lib/dictBuilder/cover.c +1245 -0
- data/contrib/zstd/lib/dictBuilder/cover.h +157 -0
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.c +3 -3
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.h +0 -0
- data/contrib/zstd/lib/dictBuilder/fastcover.c +758 -0
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/zdict.c +318 -194
- data/contrib/zstd/lib/dictBuilder/zdict.h +305 -0
- data/contrib/zstd/{legacy → lib/legacy}/zstd_legacy.h +171 -15
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.c +191 -124
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.h +19 -5
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.c +125 -125
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.h +19 -5
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.c +125 -124
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.h +20 -6
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.c +151 -299
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.h +19 -5
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.c +237 -243
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.h +19 -6
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.c +130 -143
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.h +18 -5
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v07.c +158 -157
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v07.h +19 -5
- data/contrib/zstd/lib/libzstd.pc.in +15 -0
- data/contrib/zstd/lib/zstd.h +2391 -0
- data/ext/depend +2 -0
- data/ext/extconf.rb +15 -6
- data/ext/extzstd.c +76 -145
- data/ext/extzstd.h +80 -31
- data/ext/extzstd_stream.c +417 -142
- data/ext/libzstd_conf.h +8 -0
- data/ext/zstd_common.c +10 -7
- data/ext/zstd_compress.c +14 -5
- data/ext/zstd_decompress.c +5 -4
- data/ext/zstd_dictbuilder.c +9 -4
- data/ext/zstd_dictbuilder_fastcover.c +3 -0
- data/ext/zstd_legacy_v01.c +3 -1
- data/ext/zstd_legacy_v02.c +3 -1
- data/ext/zstd_legacy_v03.c +3 -1
- data/ext/zstd_legacy_v04.c +3 -1
- data/ext/zstd_legacy_v05.c +3 -1
- data/ext/zstd_legacy_v06.c +3 -1
- data/ext/zstd_legacy_v07.c +3 -1
- data/gemstub.rb +10 -24
- data/lib/extzstd.rb +64 -179
- data/lib/extzstd/version.rb +6 -1
- data/test/test_basic.rb +9 -6
- metadata +113 -57
- data/HISTORY.ja +0 -5
- data/contrib/zstd/common/entropy_common.c +0 -225
- data/contrib/zstd/common/huf.h +0 -228
- data/contrib/zstd/common/zstd_common.c +0 -83
- data/contrib/zstd/common/zstd_errors.h +0 -60
- data/contrib/zstd/common/zstd_internal.h +0 -267
- data/contrib/zstd/compress/huf_compress.c +0 -533
- data/contrib/zstd/compress/zbuff_compress.c +0 -319
- data/contrib/zstd/compress/zstd_compress.c +0 -3264
- data/contrib/zstd/compress/zstd_opt.h +0 -900
- data/contrib/zstd/decompress/huf_decompress.c +0 -883
- data/contrib/zstd/decompress/zbuff_decompress.c +0 -252
- data/contrib/zstd/decompress/zstd_decompress.c +0 -1842
- data/contrib/zstd/dictBuilder/zdict.h +0 -111
- data/contrib/zstd/zstd.h +0 -640
@@ -1,18 +1,20 @@
|
|
1
|
-
|
2
|
-
* Copyright (c) 2016-
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
|
3
3
|
* All rights reserved.
|
4
4
|
*
|
5
|
-
* This source code is licensed under the BSD-style license found in the
|
6
|
-
* LICENSE file in the root directory of this source tree
|
7
|
-
*
|
5
|
+
* This source code is licensed under both the BSD-style license (found in the
|
6
|
+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
7
|
+
* in the COPYING file in the root directory of this source tree).
|
8
|
+
* You may select, at your option, one of the above-listed licenses.
|
8
9
|
*/
|
9
10
|
|
10
11
|
|
11
12
|
/*-**************************************
|
12
13
|
* Tuning parameters
|
13
14
|
****************************************/
|
15
|
+
#define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */
|
14
16
|
#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
|
15
|
-
#define ZDICT_MIN_SAMPLES_SIZE
|
17
|
+
#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
|
16
18
|
|
17
19
|
|
18
20
|
/*-**************************************
|
@@ -35,18 +37,18 @@
|
|
35
37
|
#include <stdio.h> /* fprintf, fopen, ftello64 */
|
36
38
|
#include <time.h> /* clock */
|
37
39
|
|
38
|
-
#include "mem.h" /* read */
|
39
|
-
#include "
|
40
|
-
#include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
|
40
|
+
#include "../common/mem.h" /* read */
|
41
|
+
#include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
|
41
42
|
#define HUF_STATIC_LINKING_ONLY
|
42
|
-
#include "huf.h"
|
43
|
-
#include "zstd_internal.h" /* includes zstd.h */
|
44
|
-
#include "xxhash.h"
|
43
|
+
#include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
|
44
|
+
#include "../common/zstd_internal.h" /* includes zstd.h */
|
45
|
+
#include "../common/xxhash.h" /* XXH64 */
|
45
46
|
#include "divsufsort.h"
|
46
47
|
#ifndef ZDICT_STATIC_LINKING_ONLY
|
47
48
|
# define ZDICT_STATIC_LINKING_ONLY
|
48
49
|
#endif
|
49
50
|
#include "zdict.h"
|
51
|
+
#include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
|
50
52
|
|
51
53
|
|
52
54
|
/*-*************************************
|
@@ -60,17 +62,15 @@
|
|
60
62
|
|
61
63
|
#define NOISELENGTH 32
|
62
64
|
|
63
|
-
#define MINRATIO 4
|
64
|
-
static const int g_compressionLevel_default = 5;
|
65
65
|
static const U32 g_selectivity_default = 9;
|
66
|
-
static const size_t g_provision_entropySize = 200;
|
67
|
-
static const size_t g_min_fast_dictContent = 192;
|
68
66
|
|
69
67
|
|
70
68
|
/*-*************************************
|
71
69
|
* Console display
|
72
70
|
***************************************/
|
71
|
+
#undef DISPLAY
|
73
72
|
#define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
|
73
|
+
#undef DISPLAYLEVEL
|
74
74
|
#define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
|
75
75
|
|
76
76
|
static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
|
@@ -97,15 +97,35 @@ const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(error
|
|
97
97
|
unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
|
98
98
|
{
|
99
99
|
if (dictSize < 8) return 0;
|
100
|
-
if (MEM_readLE32(dictBuffer) !=
|
100
|
+
if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
|
101
101
|
return MEM_readLE32((const char*)dictBuffer + 4);
|
102
102
|
}
|
103
103
|
|
104
|
+
size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
|
105
|
+
{
|
106
|
+
size_t headerSize;
|
107
|
+
if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
|
108
|
+
|
109
|
+
{ ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
|
110
|
+
U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
|
111
|
+
if (!bs || !wksp) {
|
112
|
+
headerSize = ERROR(memory_allocation);
|
113
|
+
} else {
|
114
|
+
ZSTD_reset_compressedBlockState(bs);
|
115
|
+
headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
|
116
|
+
}
|
117
|
+
|
118
|
+
free(bs);
|
119
|
+
free(wksp);
|
120
|
+
}
|
121
|
+
|
122
|
+
return headerSize;
|
123
|
+
}
|
104
124
|
|
105
125
|
/*-********************************************************
|
106
126
|
* Dictionary training functions
|
107
127
|
**********************************************************/
|
108
|
-
static unsigned ZDICT_NbCommonBytes (
|
128
|
+
static unsigned ZDICT_NbCommonBytes (size_t val)
|
109
129
|
{
|
110
130
|
if (MEM_isLittleEndian()) {
|
111
131
|
if (MEM_64bits()) {
|
@@ -209,7 +229,6 @@ static dictItem ZDICT_analyzePos(
|
|
209
229
|
U32 cumulLength[LLIMIT] = {0};
|
210
230
|
U32 savings[LLIMIT] = {0};
|
211
231
|
const BYTE* b = (const BYTE*)buffer;
|
212
|
-
size_t length;
|
213
232
|
size_t maxLength = LLIMIT;
|
214
233
|
size_t pos = suffix[start];
|
215
234
|
U32 end = start;
|
@@ -224,26 +243,30 @@ static dictItem ZDICT_analyzePos(
|
|
224
243
|
||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
|
225
244
|
||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
|
226
245
|
/* skip and mark segment */
|
227
|
-
U16
|
228
|
-
U32 u,
|
229
|
-
while (MEM_read16(b+pos+
|
230
|
-
if (b[pos+
|
231
|
-
for (u=1; u<
|
246
|
+
U16 const pattern16 = MEM_read16(b+pos+4);
|
247
|
+
U32 u, patternEnd = 6;
|
248
|
+
while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
|
249
|
+
if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
|
250
|
+
for (u=1; u<patternEnd; u++)
|
232
251
|
doneMarks[pos+u] = 1;
|
233
252
|
return solution;
|
234
253
|
}
|
235
254
|
|
236
255
|
/* look forward */
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
256
|
+
{ size_t length;
|
257
|
+
do {
|
258
|
+
end++;
|
259
|
+
length = ZDICT_count(b + pos, b + suffix[end]);
|
260
|
+
} while (length >= MINMATCHLENGTH);
|
261
|
+
}
|
241
262
|
|
242
263
|
/* look backward */
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
264
|
+
{ size_t length;
|
265
|
+
do {
|
266
|
+
length = ZDICT_count(b + pos, b + *(suffix+start-1));
|
267
|
+
if (length >=MINMATCHLENGTH) start--;
|
268
|
+
} while(length >= MINMATCHLENGTH);
|
269
|
+
}
|
247
270
|
|
248
271
|
/* exit if not found a minimum nb of repetitions */
|
249
272
|
if (end-start < minRatio) {
|
@@ -254,15 +277,15 @@ static dictItem ZDICT_analyzePos(
|
|
254
277
|
}
|
255
278
|
|
256
279
|
{ int i;
|
257
|
-
U32
|
280
|
+
U32 mml;
|
258
281
|
U32 refinedStart = start;
|
259
282
|
U32 refinedEnd = end;
|
260
283
|
|
261
284
|
DISPLAYLEVEL(4, "\n");
|
262
|
-
DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (
|
285
|
+
DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
|
263
286
|
DISPLAYLEVEL(4, "\n");
|
264
287
|
|
265
|
-
for (
|
288
|
+
for (mml = MINMATCHLENGTH ; ; mml++) {
|
266
289
|
BYTE currentChar = 0;
|
267
290
|
U32 currentCount = 0;
|
268
291
|
U32 currentID = refinedStart;
|
@@ -270,13 +293,13 @@ static dictItem ZDICT_analyzePos(
|
|
270
293
|
U32 selectedCount = 0;
|
271
294
|
U32 selectedID = currentID;
|
272
295
|
for (id =refinedStart; id < refinedEnd; id++) {
|
273
|
-
if (b[
|
296
|
+
if (b[suffix[id] + mml] != currentChar) {
|
274
297
|
if (currentCount > selectedCount) {
|
275
298
|
selectedCount = currentCount;
|
276
299
|
selectedID = currentID;
|
277
300
|
}
|
278
301
|
currentID = id;
|
279
|
-
currentChar = b[ suffix[id] +
|
302
|
+
currentChar = b[ suffix[id] + mml];
|
280
303
|
currentCount = 0;
|
281
304
|
}
|
282
305
|
currentCount ++;
|
@@ -292,28 +315,31 @@ static dictItem ZDICT_analyzePos(
|
|
292
315
|
refinedEnd = refinedStart + selectedCount;
|
293
316
|
}
|
294
317
|
|
295
|
-
/* evaluate gain based on new
|
318
|
+
/* evaluate gain based on new dict */
|
296
319
|
start = refinedStart;
|
297
320
|
pos = suffix[refinedStart];
|
298
321
|
end = start;
|
299
322
|
memset(lengthList, 0, sizeof(lengthList));
|
300
323
|
|
301
324
|
/* look forward */
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
325
|
+
{ size_t length;
|
326
|
+
do {
|
327
|
+
end++;
|
328
|
+
length = ZDICT_count(b + pos, b + suffix[end]);
|
329
|
+
if (length >= LLIMIT) length = LLIMIT-1;
|
330
|
+
lengthList[length]++;
|
331
|
+
} while (length >=MINMATCHLENGTH);
|
332
|
+
}
|
308
333
|
|
309
334
|
/* look backward */
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
335
|
+
{ size_t length = MINMATCHLENGTH;
|
336
|
+
while ((length >= MINMATCHLENGTH) & (start > 0)) {
|
337
|
+
length = ZDICT_count(b + pos, b + suffix[start - 1]);
|
338
|
+
if (length >= LLIMIT) length = LLIMIT - 1;
|
339
|
+
lengthList[length]++;
|
340
|
+
if (length >= MINMATCHLENGTH) start--;
|
341
|
+
}
|
342
|
+
}
|
317
343
|
|
318
344
|
/* largest useful length */
|
319
345
|
memset(cumulLength, 0, sizeof(cumulLength));
|
@@ -337,8 +363,8 @@ static dictItem ZDICT_analyzePos(
|
|
337
363
|
for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
|
338
364
|
savings[i] = savings[i-1] + (lengthList[i] * (i-3));
|
339
365
|
|
340
|
-
DISPLAYLEVEL(4, "Selected
|
341
|
-
(
|
366
|
+
DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
|
367
|
+
(unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
|
342
368
|
|
343
369
|
solution.pos = (U32)pos;
|
344
370
|
solution.length = (U32)maxLength;
|
@@ -347,12 +373,12 @@ static dictItem ZDICT_analyzePos(
|
|
347
373
|
/* mark positions done */
|
348
374
|
{ U32 id;
|
349
375
|
for (id=start; id<end; id++) {
|
350
|
-
U32 p, pEnd;
|
376
|
+
U32 p, pEnd, length;
|
351
377
|
U32 const testedPos = suffix[id];
|
352
378
|
if (testedPos == pos)
|
353
379
|
length = solution.length;
|
354
380
|
else {
|
355
|
-
length = ZDICT_count(b+pos, b+testedPos);
|
381
|
+
length = (U32)ZDICT_count(b+pos, b+testedPos);
|
356
382
|
if (length > solution.length) length = solution.length;
|
357
383
|
}
|
358
384
|
pEnd = (U32)(testedPos + length);
|
@@ -364,21 +390,35 @@ static dictItem ZDICT_analyzePos(
|
|
364
390
|
}
|
365
391
|
|
366
392
|
|
367
|
-
|
393
|
+
static int isIncluded(const void* in, const void* container, size_t length)
|
394
|
+
{
|
395
|
+
const char* const ip = (const char*) in;
|
396
|
+
const char* const into = (const char*) container;
|
397
|
+
size_t u;
|
398
|
+
|
399
|
+
for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */
|
400
|
+
if (ip[u] != into[u]) break;
|
401
|
+
}
|
402
|
+
|
403
|
+
return u==length;
|
404
|
+
}
|
405
|
+
|
406
|
+
/*! ZDICT_tryMerge() :
|
368
407
|
check if dictItem can be merged, do it if possible
|
369
408
|
@return : id of destination elt, 0 if not merged
|
370
409
|
*/
|
371
|
-
static U32
|
410
|
+
static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
|
372
411
|
{
|
373
412
|
const U32 tableSize = table->pos;
|
374
413
|
const U32 eltEnd = elt.pos + elt.length;
|
414
|
+
const char* const buf = (const char*) buffer;
|
375
415
|
|
376
416
|
/* tail overlap */
|
377
417
|
U32 u; for (u=1; u<tableSize; u++) {
|
378
418
|
if (u==eltNbToSkip) continue;
|
379
419
|
if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
|
380
420
|
/* append */
|
381
|
-
U32 addedLength = table[u].pos - elt.pos;
|
421
|
+
U32 const addedLength = table[u].pos - elt.pos;
|
382
422
|
table[u].length += addedLength;
|
383
423
|
table[u].pos = elt.pos;
|
384
424
|
table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
|
@@ -394,9 +434,10 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
|
394
434
|
/* front overlap */
|
395
435
|
for (u=1; u<tableSize; u++) {
|
396
436
|
if (u==eltNbToSkip) continue;
|
437
|
+
|
397
438
|
if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
|
398
439
|
/* append */
|
399
|
-
int addedLength = (int)eltEnd - (table[u].pos + table[u].length);
|
440
|
+
int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
|
400
441
|
table[u].savings += elt.length / 8; /* rough approx bonus */
|
401
442
|
if (addedLength > 0) { /* otherwise, elt fully included into existing */
|
402
443
|
table[u].length += addedLength;
|
@@ -408,7 +449,18 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
|
408
449
|
table[u] = table[u-1], u--;
|
409
450
|
table[u] = elt;
|
410
451
|
return u;
|
411
|
-
|
452
|
+
}
|
453
|
+
|
454
|
+
if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
|
455
|
+
if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
|
456
|
+
size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
|
457
|
+
table[u].pos = elt.pos;
|
458
|
+
table[u].savings += (U32)(elt.savings * addedLength / elt.length);
|
459
|
+
table[u].length = MIN(elt.length, table[u].length + 1);
|
460
|
+
return u;
|
461
|
+
}
|
462
|
+
}
|
463
|
+
}
|
412
464
|
|
413
465
|
return 0;
|
414
466
|
}
|
@@ -416,8 +468,8 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
|
416
468
|
|
417
469
|
static void ZDICT_removeDictItem(dictItem* table, U32 id)
|
418
470
|
{
|
419
|
-
/* convention :
|
420
|
-
U32 const max = table
|
471
|
+
/* convention : table[0].pos stores nb of elts */
|
472
|
+
U32 const max = table[0].pos;
|
421
473
|
U32 u;
|
422
474
|
if (!id) return; /* protection, should never happen */
|
423
475
|
for (u=id; u<max-1; u++)
|
@@ -426,14 +478,14 @@ static void ZDICT_removeDictItem(dictItem* table, U32 id)
|
|
426
478
|
}
|
427
479
|
|
428
480
|
|
429
|
-
static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
|
481
|
+
static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
|
430
482
|
{
|
431
483
|
/* merge if possible */
|
432
|
-
U32 mergeId =
|
484
|
+
U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
|
433
485
|
if (mergeId) {
|
434
486
|
U32 newMerge = 1;
|
435
487
|
while (newMerge) {
|
436
|
-
newMerge =
|
488
|
+
newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
|
437
489
|
if (newMerge) ZDICT_removeDictItem(table, mergeId);
|
438
490
|
mergeId = newMerge;
|
439
491
|
}
|
@@ -464,10 +516,10 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
|
|
464
516
|
}
|
465
517
|
|
466
518
|
|
467
|
-
static size_t
|
519
|
+
static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
|
468
520
|
const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
|
469
521
|
const size_t* fileSizes, unsigned nbFiles,
|
470
|
-
|
522
|
+
unsigned minRatio, U32 notificationLevel)
|
471
523
|
{
|
472
524
|
int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
|
473
525
|
int* const suffix = suffix0+1;
|
@@ -478,10 +530,11 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
478
530
|
clock_t displayClock = 0;
|
479
531
|
clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
|
480
532
|
|
533
|
+
# undef DISPLAYUPDATE
|
481
534
|
# define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
|
482
535
|
if (ZDICT_clockSpan(displayClock) > refreshRate) \
|
483
536
|
{ displayClock = clock(); DISPLAY(__VA_ARGS__); \
|
484
|
-
if (notificationLevel>=4) fflush(
|
537
|
+
if (notificationLevel>=4) fflush(stderr); } }
|
485
538
|
|
486
539
|
/* init */
|
487
540
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
@@ -493,11 +546,11 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
493
546
|
memset(doneMarks, 0, bufferSize+16);
|
494
547
|
|
495
548
|
/* limit sample set size (divsufsort limitation)*/
|
496
|
-
if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (
|
549
|
+
if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
|
497
550
|
while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
|
498
551
|
|
499
552
|
/* sort */
|
500
|
-
DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (
|
553
|
+
DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
|
501
554
|
{ int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
|
502
555
|
if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
|
503
556
|
}
|
@@ -522,7 +575,7 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
522
575
|
if (doneMarks[cursor]) { cursor++; continue; }
|
523
576
|
solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
|
524
577
|
if (solution.length==0) { cursor++; continue; }
|
525
|
-
ZDICT_insertDictItem(dictList, dictListSize, solution);
|
578
|
+
ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
|
526
579
|
cursor += solution.length;
|
527
580
|
DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
|
528
581
|
} }
|
@@ -541,7 +594,7 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
|
|
541
594
|
unsigned const prime1 = 2654435761U;
|
542
595
|
unsigned const prime2 = 2246822519U;
|
543
596
|
unsigned acc = prime1;
|
544
|
-
size_t p=0
|
597
|
+
size_t p=0;
|
545
598
|
for (p=0; p<length; p++) {
|
546
599
|
acc *= prime2;
|
547
600
|
((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
|
@@ -551,29 +604,31 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
|
|
551
604
|
|
552
605
|
typedef struct
|
553
606
|
{
|
554
|
-
|
555
|
-
ZSTD_CCtx* zc;
|
556
|
-
void* workPlace; /* must be
|
607
|
+
ZSTD_CDict* dict; /* dictionary */
|
608
|
+
ZSTD_CCtx* zc; /* working context */
|
609
|
+
void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
|
557
610
|
} EStats_ress_t;
|
558
611
|
|
559
612
|
#define MAXREPOFFSET 1024
|
560
613
|
|
561
|
-
static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
562
|
-
|
563
|
-
|
614
|
+
static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
|
615
|
+
unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
|
616
|
+
const void* src, size_t srcSize,
|
617
|
+
U32 notificationLevel)
|
564
618
|
{
|
565
|
-
size_t const blockSizeMax = MIN (
|
619
|
+
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
|
566
620
|
size_t cSize;
|
567
621
|
|
568
622
|
if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
|
569
|
-
{
|
570
|
-
|
623
|
+
{ size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
|
624
|
+
if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
|
625
|
+
|
571
626
|
}
|
572
|
-
cSize = ZSTD_compressBlock(esr.zc, esr.workPlace,
|
573
|
-
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(
|
627
|
+
cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
|
628
|
+
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
|
574
629
|
|
575
630
|
if (cSize) { /* if == 0; block is not compressible */
|
576
|
-
const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
|
631
|
+
const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
|
577
632
|
|
578
633
|
/* literals stats */
|
579
634
|
{ const BYTE* bytePtr;
|
@@ -611,17 +666,6 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
|
611
666
|
} } }
|
612
667
|
}
|
613
668
|
|
614
|
-
/*
|
615
|
-
static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
|
616
|
-
{
|
617
|
-
unsigned u;
|
618
|
-
size_t max=0;
|
619
|
-
for (u=0; u<nbFiles; u++)
|
620
|
-
if (max < fileSizes[u]) max = fileSizes[u];
|
621
|
-
return max;
|
622
|
-
}
|
623
|
-
*/
|
624
|
-
|
625
669
|
static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
|
626
670
|
{
|
627
671
|
size_t total=0;
|
@@ -646,26 +690,38 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
|
|
646
690
|
}
|
647
691
|
}
|
648
692
|
|
693
|
+
/* ZDICT_flatLit() :
|
694
|
+
* rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
|
695
|
+
* necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
|
696
|
+
*/
|
697
|
+
static void ZDICT_flatLit(unsigned* countLit)
|
698
|
+
{
|
699
|
+
int u;
|
700
|
+
for (u=1; u<256; u++) countLit[u] = 2;
|
701
|
+
countLit[0] = 4;
|
702
|
+
countLit[253] = 1;
|
703
|
+
countLit[254] = 1;
|
704
|
+
}
|
649
705
|
|
650
706
|
#define OFFCODE_MAX 30 /* only applicable to first block */
|
651
707
|
static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
652
|
-
|
708
|
+
int compressionLevel,
|
653
709
|
const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
|
654
710
|
const void* dictBuffer, size_t dictBufferSize,
|
655
711
|
unsigned notificationLevel)
|
656
712
|
{
|
657
|
-
|
713
|
+
unsigned countLit[256];
|
658
714
|
HUF_CREATE_STATIC_CTABLE(hufTable, 255);
|
659
|
-
|
715
|
+
unsigned offcodeCount[OFFCODE_MAX+1];
|
660
716
|
short offcodeNCount[OFFCODE_MAX+1];
|
661
717
|
U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
|
662
|
-
|
718
|
+
unsigned matchLengthCount[MaxML+1];
|
663
719
|
short matchLengthNCount[MaxML+1];
|
664
|
-
|
720
|
+
unsigned litLengthCount[MaxLL+1];
|
665
721
|
short litLengthNCount[MaxLL+1];
|
666
722
|
U32 repOffset[MAXREPOFFSET];
|
667
723
|
offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
|
668
|
-
EStats_ress_t esr;
|
724
|
+
EStats_ress_t esr = { NULL, NULL, NULL };
|
669
725
|
ZSTD_parameters params;
|
670
726
|
U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
|
671
727
|
size_t pos = 0, errorCode;
|
@@ -675,48 +731,51 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
675
731
|
BYTE* dstPtr = (BYTE*)dstBuffer;
|
676
732
|
|
677
733
|
/* init */
|
678
|
-
|
734
|
+
DEBUGLOG(4, "ZDICT_analyzeEntropy");
|
735
|
+
if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
|
736
|
+
for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
|
737
|
+
for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
|
738
|
+
for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
|
739
|
+
for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
|
740
|
+
memset(repOffset, 0, sizeof(repOffset));
|
741
|
+
repOffset[1] = repOffset[4] = repOffset[8] = 1;
|
742
|
+
memset(bestRepOffset, 0, sizeof(bestRepOffset));
|
743
|
+
if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT;
|
744
|
+
params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
|
745
|
+
|
746
|
+
esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
|
679
747
|
esr.zc = ZSTD_createCCtx();
|
680
|
-
esr.workPlace = malloc(
|
681
|
-
if (!esr.
|
748
|
+
esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
|
749
|
+
if (!esr.dict || !esr.zc || !esr.workPlace) {
|
682
750
|
eSize = ERROR(memory_allocation);
|
683
751
|
DISPLAYLEVEL(1, "Not enough memory \n");
|
684
752
|
goto _cleanup;
|
685
753
|
}
|
686
|
-
if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; } /* too large dictionary */
|
687
|
-
for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
|
688
|
-
for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1;
|
689
|
-
for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
|
690
|
-
for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
|
691
|
-
memset(repOffset, 0, sizeof(repOffset));
|
692
|
-
repOffset[1] = repOffset[4] = repOffset[8] = 1;
|
693
|
-
memset(bestRepOffset, 0, sizeof(bestRepOffset));
|
694
|
-
if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
|
695
|
-
params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
|
696
|
-
{ size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
|
697
|
-
if (ZSTD_isError(beginResult)) {
|
698
|
-
eSize = ERROR(GENERIC);
|
699
|
-
DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
|
700
|
-
goto _cleanup;
|
701
|
-
} }
|
702
754
|
|
703
|
-
/* collect stats on all
|
755
|
+
/* collect stats on all samples */
|
704
756
|
for (u=0; u<nbFiles; u++) {
|
705
|
-
ZDICT_countEStats(esr, params,
|
757
|
+
ZDICT_countEStats(esr, &params,
|
706
758
|
countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
|
707
759
|
(const char*)srcBuffer + pos, fileSizes[u],
|
708
760
|
notificationLevel);
|
709
761
|
pos += fileSizes[u];
|
710
762
|
}
|
711
763
|
|
712
|
-
/* analyze */
|
713
|
-
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
764
|
+
/* analyze, build stats, starting with literals */
|
765
|
+
{ size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
|
766
|
+
if (HUF_isError(maxNbBits)) {
|
767
|
+
eSize = maxNbBits;
|
768
|
+
DISPLAYLEVEL(1, " HUF_buildCTable error \n");
|
769
|
+
goto _cleanup;
|
770
|
+
}
|
771
|
+
if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
|
772
|
+
DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
|
773
|
+
ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
|
774
|
+
maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
|
775
|
+
assert(maxNbBits==9);
|
776
|
+
}
|
777
|
+
huffLog = (U32)maxNbBits;
|
718
778
|
}
|
719
|
-
huffLog = (U32)errorCode;
|
720
779
|
|
721
780
|
/* looking for most common first offsets */
|
722
781
|
{ U32 offset;
|
@@ -726,27 +785,27 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
726
785
|
/* note : the result of this phase should be used to better appreciate the impact on statistics */
|
727
786
|
|
728
787
|
total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
|
729
|
-
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
|
788
|
+
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1);
|
730
789
|
if (FSE_isError(errorCode)) {
|
731
|
-
eSize =
|
790
|
+
eSize = errorCode;
|
732
791
|
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
|
733
792
|
goto _cleanup;
|
734
793
|
}
|
735
794
|
Offlog = (U32)errorCode;
|
736
795
|
|
737
796
|
total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
|
738
|
-
errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
|
797
|
+
errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1);
|
739
798
|
if (FSE_isError(errorCode)) {
|
740
|
-
eSize =
|
799
|
+
eSize = errorCode;
|
741
800
|
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
|
742
801
|
goto _cleanup;
|
743
802
|
}
|
744
803
|
mlLog = (U32)errorCode;
|
745
804
|
|
746
805
|
total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
|
747
|
-
errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
|
806
|
+
errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1);
|
748
807
|
if (FSE_isError(errorCode)) {
|
749
|
-
eSize =
|
808
|
+
eSize = errorCode;
|
750
809
|
DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
|
751
810
|
goto _cleanup;
|
752
811
|
}
|
@@ -755,7 +814,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
755
814
|
/* write result to buffer */
|
756
815
|
{ size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
|
757
816
|
if (HUF_isError(hhSize)) {
|
758
|
-
eSize =
|
817
|
+
eSize = hhSize;
|
759
818
|
DISPLAYLEVEL(1, "HUF_writeCTable error \n");
|
760
819
|
goto _cleanup;
|
761
820
|
}
|
@@ -766,7 +825,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
766
825
|
|
767
826
|
{ size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
|
768
827
|
if (FSE_isError(ohSize)) {
|
769
|
-
eSize =
|
828
|
+
eSize = ohSize;
|
770
829
|
DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
|
771
830
|
goto _cleanup;
|
772
831
|
}
|
@@ -777,7 +836,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
777
836
|
|
778
837
|
{ size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
|
779
838
|
if (FSE_isError(mhSize)) {
|
780
|
-
eSize =
|
839
|
+
eSize = mhSize;
|
781
840
|
DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
|
782
841
|
goto _cleanup;
|
783
842
|
}
|
@@ -788,7 +847,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
788
847
|
|
789
848
|
{ size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
|
790
849
|
if (FSE_isError(lhSize)) {
|
791
|
-
eSize =
|
850
|
+
eSize = lhSize;
|
792
851
|
DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
|
793
852
|
goto _cleanup;
|
794
853
|
}
|
@@ -798,7 +857,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
798
857
|
}
|
799
858
|
|
800
859
|
if (maxDstSize<12) {
|
801
|
-
eSize = ERROR(
|
860
|
+
eSize = ERROR(dstSize_tooSmall);
|
802
861
|
DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
|
803
862
|
goto _cleanup;
|
804
863
|
}
|
@@ -813,11 +872,10 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
813
872
|
MEM_writeLE32(dstPtr+4, repStartValue[1]);
|
814
873
|
MEM_writeLE32(dstPtr+8, repStartValue[2]);
|
815
874
|
#endif
|
816
|
-
//dstPtr += 12;
|
817
875
|
eSize += 12;
|
818
876
|
|
819
877
|
_cleanup:
|
820
|
-
|
878
|
+
ZSTD_freeCDict(esr.dict);
|
821
879
|
ZSTD_freeCCtx(esr.zc);
|
822
880
|
free(esr.workPlace);
|
823
881
|
|
@@ -825,26 +883,68 @@ _cleanup:
|
|
825
883
|
}
|
826
884
|
|
827
885
|
|
828
|
-
|
829
|
-
|
830
|
-
|
886
|
+
|
887
|
+
size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
|
888
|
+
const void* customDictContent, size_t dictContentSize,
|
889
|
+
const void* samplesBuffer, const size_t* samplesSizes,
|
890
|
+
unsigned nbSamples, ZDICT_params_t params)
|
831
891
|
{
|
832
892
|
size_t hSize;
|
833
|
-
|
893
|
+
#define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
|
894
|
+
BYTE header[HBUFFSIZE];
|
895
|
+
int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
|
834
896
|
U32 const notificationLevel = params.notificationLevel;
|
835
897
|
|
898
|
+
/* check conditions */
|
899
|
+
DEBUGLOG(4, "ZDICT_finalizeDictionary");
|
900
|
+
if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
|
901
|
+
if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
|
902
|
+
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
|
903
|
+
|
836
904
|
/* dictionary header */
|
837
|
-
MEM_writeLE32(
|
838
|
-
{ U64 const randomID = XXH64(
|
905
|
+
MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
|
906
|
+
{ U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
|
839
907
|
U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
|
840
908
|
U32 const dictID = params.dictID ? params.dictID : compliantID;
|
841
|
-
MEM_writeLE32(
|
909
|
+
MEM_writeLE32(header+4, dictID);
|
842
910
|
}
|
843
911
|
hSize = 8;
|
844
912
|
|
845
913
|
/* entropy tables */
|
846
914
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
847
915
|
DISPLAYLEVEL(2, "statistics ... \n");
|
916
|
+
{ size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
|
917
|
+
compressionLevel,
|
918
|
+
samplesBuffer, samplesSizes, nbSamples,
|
919
|
+
customDictContent, dictContentSize,
|
920
|
+
notificationLevel);
|
921
|
+
if (ZDICT_isError(eSize)) return eSize;
|
922
|
+
hSize += eSize;
|
923
|
+
}
|
924
|
+
|
925
|
+
/* copy elements in final buffer ; note : src and dst buffer can overlap */
|
926
|
+
if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
|
927
|
+
{ size_t const dictSize = hSize + dictContentSize;
|
928
|
+
char* dictEnd = (char*)dictBuffer + dictSize;
|
929
|
+
memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
|
930
|
+
memcpy(dictBuffer, header, hSize);
|
931
|
+
return dictSize;
|
932
|
+
}
|
933
|
+
}
|
934
|
+
|
935
|
+
|
936
|
+
static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
|
937
|
+
void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
|
938
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
939
|
+
ZDICT_params_t params)
|
940
|
+
{
|
941
|
+
int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
|
942
|
+
U32 const notificationLevel = params.notificationLevel;
|
943
|
+
size_t hSize = 8;
|
944
|
+
|
945
|
+
/* calculate entropy tables */
|
946
|
+
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
947
|
+
DISPLAYLEVEL(2, "statistics ... \n");
|
848
948
|
{ size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
|
849
949
|
compressionLevel,
|
850
950
|
samplesBuffer, samplesSizes, nbSamples,
|
@@ -854,21 +954,32 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
|
|
854
954
|
hSize += eSize;
|
855
955
|
}
|
856
956
|
|
957
|
+
/* add dictionary header (after entropy tables) */
|
958
|
+
MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
|
959
|
+
{ U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
|
960
|
+
U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
|
961
|
+
U32 const dictID = params.dictID ? params.dictID : compliantID;
|
962
|
+
MEM_writeLE32((char*)dictBuffer+4, dictID);
|
963
|
+
}
|
857
964
|
|
858
965
|
if (hSize + dictContentSize < dictBufferCapacity)
|
859
966
|
memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
|
860
967
|
return MIN(dictBufferCapacity, hSize+dictContentSize);
|
861
968
|
}
|
862
969
|
|
863
|
-
|
864
|
-
|
970
|
+
/* Hidden declaration for dbio.c */
|
971
|
+
size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
972
|
+
void* dictBuffer, size_t maxDictSize,
|
973
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
974
|
+
ZDICT_legacy_params_t params);
|
975
|
+
/*! ZDICT_trainFromBuffer_unsafe_legacy() :
|
865
976
|
* Warning : `samplesBuffer` must be followed by noisy guard band.
|
866
977
|
* @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
|
867
978
|
*/
|
868
|
-
size_t
|
979
|
+
size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
869
980
|
void* dictBuffer, size_t maxDictSize,
|
870
981
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
871
|
-
|
982
|
+
ZDICT_legacy_params_t params)
|
872
983
|
{
|
873
984
|
U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
|
874
985
|
dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
|
@@ -877,58 +988,63 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
877
988
|
size_t const targetDictSize = maxDictSize;
|
878
989
|
size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
|
879
990
|
size_t dictSize = 0;
|
880
|
-
U32 const notificationLevel = params.notificationLevel;
|
991
|
+
U32 const notificationLevel = params.zParams.notificationLevel;
|
881
992
|
|
882
993
|
/* checks */
|
883
994
|
if (!dictList) return ERROR(memory_allocation);
|
884
|
-
if (maxDictSize
|
885
|
-
if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return
|
995
|
+
if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */
|
996
|
+
if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */
|
886
997
|
|
887
998
|
/* init */
|
888
999
|
ZDICT_initDictItem(dictList);
|
889
1000
|
|
890
1001
|
/* build dictionary */
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
1002
|
+
ZDICT_trainBuffer_legacy(dictList, dictListSize,
|
1003
|
+
samplesBuffer, samplesBuffSize,
|
1004
|
+
samplesSizes, nbSamples,
|
1005
|
+
minRep, notificationLevel);
|
895
1006
|
|
896
1007
|
/* display best matches */
|
897
|
-
if (params.notificationLevel>= 3) {
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
|
902
|
-
DISPLAYLEVEL(3, "list %u best segments \n", nb);
|
903
|
-
for (u=1; u
|
904
|
-
|
905
|
-
|
906
|
-
U32 printedLength = MIN(40, length);
|
1008
|
+
if (params.zParams.notificationLevel>= 3) {
|
1009
|
+
unsigned const nb = MIN(25, dictList[0].pos);
|
1010
|
+
unsigned const dictContentSize = ZDICT_dictSize(dictList);
|
1011
|
+
unsigned u;
|
1012
|
+
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
|
1013
|
+
DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
|
1014
|
+
for (u=1; u<nb; u++) {
|
1015
|
+
unsigned const pos = dictList[u].pos;
|
1016
|
+
unsigned const length = dictList[u].length;
|
1017
|
+
U32 const printedLength = MIN(40, length);
|
1018
|
+
if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
|
1019
|
+
free(dictList);
|
1020
|
+
return ERROR(GENERIC); /* should never happen */
|
1021
|
+
}
|
907
1022
|
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
908
|
-
u, length, pos, dictList[u].savings);
|
1023
|
+
u, length, pos, (unsigned)dictList[u].savings);
|
909
1024
|
ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
|
910
1025
|
DISPLAYLEVEL(3, "| \n");
|
911
1026
|
} }
|
912
1027
|
|
913
1028
|
|
914
1029
|
/* create dictionary */
|
915
|
-
{
|
916
|
-
if (dictContentSize <
|
917
|
-
|
1030
|
+
{ unsigned dictContentSize = ZDICT_dictSize(dictList);
|
1031
|
+
if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
|
1032
|
+
if (dictContentSize < targetDictSize/4) {
|
1033
|
+
DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
|
1034
|
+
if (samplesBuffSize < 10 * targetDictSize)
|
1035
|
+
DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
|
918
1036
|
if (minRep > MINRATIO) {
|
919
1037
|
DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
|
920
1038
|
DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
|
921
1039
|
}
|
922
|
-
if (samplesBuffSize < 10 * targetDictSize)
|
923
|
-
DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
|
924
1040
|
}
|
925
1041
|
|
926
1042
|
if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
|
927
|
-
|
1043
|
+
unsigned proposedSelectivity = selectivity-1;
|
928
1044
|
while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
|
929
|
-
DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (
|
1045
|
+
DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
|
930
1046
|
DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
|
931
|
-
DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n");
|
1047
|
+
DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
|
932
1048
|
}
|
933
1049
|
|
934
1050
|
/* limit dictionary size */
|
@@ -954,7 +1070,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
954
1070
|
|
955
1071
|
dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
|
956
1072
|
samplesBuffer, samplesSizes, nbSamples,
|
957
|
-
params);
|
1073
|
+
params.zParams);
|
958
1074
|
}
|
959
1075
|
|
960
1076
|
/* clean up */
|
@@ -963,11 +1079,12 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
963
1079
|
}
|
964
1080
|
|
965
1081
|
|
966
|
-
/*
|
967
|
-
*
|
968
|
-
|
969
|
-
|
970
|
-
|
1082
|
+
/* ZDICT_trainFromBuffer_legacy() :
|
1083
|
+
* issue : samplesBuffer need to be followed by a noisy guard band.
|
1084
|
+
* work around : duplicate the buffer, and add the noise */
|
1085
|
+
size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
|
1086
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
1087
|
+
ZDICT_legacy_params_t params)
|
971
1088
|
{
|
972
1089
|
size_t result;
|
973
1090
|
void* newBuff;
|
@@ -980,10 +1097,9 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
|
|
980
1097
|
memcpy(newBuff, samplesBuffer, sBuffSize);
|
981
1098
|
ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
982
1099
|
|
983
|
-
result =
|
984
|
-
|
985
|
-
|
986
|
-
params);
|
1100
|
+
result =
|
1101
|
+
ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
|
1102
|
+
samplesSizes, nbSamples, params);
|
987
1103
|
free(newBuff);
|
988
1104
|
return result;
|
989
1105
|
}
|
@@ -992,15 +1108,23 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
|
|
992
1108
|
size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
|
993
1109
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
|
994
1110
|
{
|
995
|
-
|
1111
|
+
ZDICT_fastCover_params_t params;
|
1112
|
+
DEBUGLOG(3, "ZDICT_trainFromBuffer");
|
996
1113
|
memset(&params, 0, sizeof(params));
|
997
|
-
|
998
|
-
|
999
|
-
|
1114
|
+
params.d = 8;
|
1115
|
+
params.steps = 4;
|
1116
|
+
/* Use default level since no compression level information is available */
|
1117
|
+
params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
|
1118
|
+
#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
|
1119
|
+
params.zParams.notificationLevel = DEBUGLEVEL;
|
1120
|
+
#endif
|
1121
|
+
return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
|
1122
|
+
samplesBuffer, samplesSizes, nbSamples,
|
1123
|
+
&params);
|
1000
1124
|
}
|
1001
1125
|
|
1002
1126
|
size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
|
1003
|
-
|
1127
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
|
1004
1128
|
{
|
1005
1129
|
ZDICT_params_t params;
|
1006
1130
|
memset(&params, 0, sizeof(params));
|