extzstd 0.2 → 0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/HISTORY.ja.md +13 -0
- data/README.md +17 -14
- data/contrib/zstd/{NEWS → CHANGELOG} +115 -2
- data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
- data/contrib/zstd/Makefile +99 -53
- data/contrib/zstd/README.md +59 -39
- data/contrib/zstd/TESTING.md +1 -1
- data/contrib/zstd/appveyor.yml +17 -6
- data/contrib/zstd/lib/BUCK +29 -2
- data/contrib/zstd/lib/Makefile +118 -21
- data/contrib/zstd/lib/README.md +84 -44
- data/contrib/zstd/lib/common/bitstream.h +17 -33
- data/contrib/zstd/lib/common/compiler.h +62 -8
- data/contrib/zstd/lib/common/cpu.h +215 -0
- data/contrib/zstd/lib/common/debug.c +44 -0
- data/contrib/zstd/lib/common/debug.h +134 -0
- data/contrib/zstd/lib/common/entropy_common.c +16 -1
- data/contrib/zstd/lib/common/error_private.c +7 -0
- data/contrib/zstd/lib/common/fse.h +48 -44
- data/contrib/zstd/lib/common/fse_decompress.c +3 -3
- data/contrib/zstd/lib/common/huf.h +169 -113
- data/contrib/zstd/lib/common/mem.h +20 -2
- data/contrib/zstd/lib/common/pool.c +135 -49
- data/contrib/zstd/lib/common/pool.h +40 -21
- data/contrib/zstd/lib/common/threading.c +2 -2
- data/contrib/zstd/lib/common/threading.h +12 -12
- data/contrib/zstd/lib/common/xxhash.c +3 -2
- data/contrib/zstd/lib/common/zstd_common.c +3 -6
- data/contrib/zstd/lib/common/zstd_errors.h +17 -7
- data/contrib/zstd/lib/common/zstd_internal.h +76 -48
- data/contrib/zstd/lib/compress/fse_compress.c +89 -209
- data/contrib/zstd/lib/compress/hist.c +203 -0
- data/contrib/zstd/lib/compress/hist.h +95 -0
- data/contrib/zstd/lib/compress/huf_compress.c +188 -80
- data/contrib/zstd/lib/compress/zstd_compress.c +2500 -1203
- data/contrib/zstd/lib/compress/zstd_compress_internal.h +463 -62
- data/contrib/zstd/lib/compress/zstd_double_fast.c +321 -131
- data/contrib/zstd/lib/compress/zstd_double_fast.h +13 -4
- data/contrib/zstd/lib/compress/zstd_fast.c +335 -108
- data/contrib/zstd/lib/compress/zstd_fast.h +12 -6
- data/contrib/zstd/lib/compress/zstd_lazy.c +654 -313
- data/contrib/zstd/lib/compress/zstd_lazy.h +44 -16
- data/contrib/zstd/lib/compress/zstd_ldm.c +310 -420
- data/contrib/zstd/lib/compress/zstd_ldm.h +63 -26
- data/contrib/zstd/lib/compress/zstd_opt.c +773 -325
- data/contrib/zstd/lib/compress/zstd_opt.h +31 -5
- data/contrib/zstd/lib/compress/zstdmt_compress.c +1468 -518
- data/contrib/zstd/lib/compress/zstdmt_compress.h +96 -45
- data/contrib/zstd/lib/decompress/huf_decompress.c +518 -282
- data/contrib/zstd/lib/decompress/zstd_ddict.c +240 -0
- data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
- data/contrib/zstd/lib/decompress/zstd_decompress.c +613 -1513
- data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1311 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_block.h +59 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +175 -0
- data/contrib/zstd/lib/dictBuilder/cover.c +194 -113
- data/contrib/zstd/lib/dictBuilder/cover.h +112 -0
- data/contrib/zstd/lib/dictBuilder/divsufsort.c +3 -3
- data/contrib/zstd/lib/dictBuilder/fastcover.c +740 -0
- data/contrib/zstd/lib/dictBuilder/zdict.c +142 -106
- data/contrib/zstd/lib/dictBuilder/zdict.h +115 -49
- data/contrib/zstd/lib/legacy/zstd_legacy.h +44 -12
- data/contrib/zstd/lib/legacy/zstd_v01.c +41 -10
- data/contrib/zstd/lib/legacy/zstd_v01.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v02.c +37 -12
- data/contrib/zstd/lib/legacy/zstd_v02.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v03.c +38 -12
- data/contrib/zstd/lib/legacy/zstd_v03.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v04.c +55 -174
- data/contrib/zstd/lib/legacy/zstd_v04.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v05.c +59 -31
- data/contrib/zstd/lib/legacy/zstd_v05.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v06.c +48 -20
- data/contrib/zstd/lib/legacy/zstd_v06.h +10 -5
- data/contrib/zstd/lib/legacy/zstd_v07.c +62 -29
- data/contrib/zstd/lib/legacy/zstd_v07.h +10 -5
- data/contrib/zstd/lib/zstd.h +1346 -832
- data/ext/extzstd.c +27 -19
- data/ext/extzstd_stream.c +20 -4
- data/ext/zstd_compress.c +1 -0
- data/ext/zstd_decompress.c +4 -0
- data/ext/zstd_dictbuilder.c +4 -0
- data/ext/zstd_dictbuilder_fastcover.c +5 -0
- data/lib/extzstd.rb +52 -220
- data/lib/extzstd/version.rb +1 -1
- metadata +21 -7
- data/contrib/zstd/circle.yml +0 -63

```diff
--- a/data/contrib/zstd/lib/dictBuilder/zdict.c
+++ b/data/contrib/zstd/lib/dictBuilder/zdict.c
@@ -207,7 +207,6 @@ static dictItem ZDICT_analyzePos(
     U32 cumulLength[LLIMIT] = {0};
     U32 savings[LLIMIT] = {0};
     const BYTE* b = (const BYTE*)buffer;
-    size_t length;
     size_t maxLength = LLIMIT;
     size_t pos = suffix[start];
     U32 end = start;
@@ -222,26 +221,30 @@ static dictItem ZDICT_analyzePos(
        ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
        ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
         /* skip and mark segment */
-        U16
-        U32 u,
-        while (MEM_read16(b+pos+
-        if (b[pos+
-        for (u=1; u<
+        U16 const pattern16 = MEM_read16(b+pos+4);
+        U32 u, patternEnd = 6;
+        while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
+        if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
+        for (u=1; u<patternEnd; u++)
             doneMarks[pos+u] = 1;
         return solution;
     }

     /* look forward */
-
-
-
-
+    {   size_t length;
+        do {
+            end++;
+            length = ZDICT_count(b + pos, b + suffix[end]);
+        } while (length >= MINMATCHLENGTH);
+    }

     /* look backward */
-
-
-
-
+    {   size_t length;
+        do {
+            length = ZDICT_count(b + pos, b + *(suffix+start-1));
+            if (length >=MINMATCHLENGTH) start--;
+        } while(length >= MINMATCHLENGTH);
+    }

     /* exit if not found a minimum nb of repetitions */
     if (end-start < minRatio) {
@@ -252,15 +255,15 @@ static dictItem ZDICT_analyzePos(
     }

     {   int i;
-        U32
+        U32 mml;
         U32 refinedStart = start;
         U32 refinedEnd = end;

         DISPLAYLEVEL(4, "\n");
-        DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (
+        DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
         DISPLAYLEVEL(4, "\n");

-        for (
+        for (mml = MINMATCHLENGTH ; ; mml++) {
             BYTE currentChar = 0;
             U32 currentCount = 0;
             U32 currentID = refinedStart;
@@ -268,13 +271,13 @@ static dictItem ZDICT_analyzePos(
             U32 selectedCount = 0;
             U32 selectedID = currentID;
             for (id =refinedStart; id < refinedEnd; id++) {
-                if (b[
+                if (b[suffix[id] + mml] != currentChar) {
                     if (currentCount > selectedCount) {
                         selectedCount = currentCount;
                         selectedID = currentID;
                     }
                     currentID = id;
-                    currentChar = b[ suffix[id] +
+                    currentChar = b[ suffix[id] + mml];
                     currentCount = 0;
                 }
                 currentCount ++;
@@ -290,27 +293,30 @@ static dictItem ZDICT_analyzePos(
             refinedEnd = refinedStart + selectedCount;
     }

-    /* evaluate gain based on new
+    /* evaluate gain based on new dict */
     start = refinedStart;
     pos = suffix[refinedStart];
     end = start;
     memset(lengthList, 0, sizeof(lengthList));

     /* look forward */
-
-
-
-
-
-
+    {   size_t length;
+        do {
+            end++;
+            length = ZDICT_count(b + pos, b + suffix[end]);
+            if (length >= LLIMIT) length = LLIMIT-1;
+            lengthList[length]++;
+        } while (length >=MINMATCHLENGTH);
+    }

     /* look backward */
-    length = MINMATCHLENGTH;
-
-
-
-
-
+    {   size_t length = MINMATCHLENGTH;
+        while ((length >= MINMATCHLENGTH) & (start > 0)) {
+            length = ZDICT_count(b + pos, b + suffix[start - 1]);
+            if (length >= LLIMIT) length = LLIMIT - 1;
+            lengthList[length]++;
+            if (length >= MINMATCHLENGTH) start--;
+        }
     }

     /* largest useful length */
@@ -335,8 +341,8 @@ static dictItem ZDICT_analyzePos(
     for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
         savings[i] = savings[i-1] + (lengthList[i] * (i-3));

-    DISPLAYLEVEL(4, "Selected
-                    (
+    DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
+                    (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);

     solution.pos = (U32)pos;
     solution.length = (U32)maxLength;
@@ -345,12 +351,12 @@ static dictItem ZDICT_analyzePos(
     /* mark positions done */
     {   U32 id;
         for (id=start; id<end; id++) {
-            U32 p, pEnd;
+            U32 p, pEnd, length;
             U32 const testedPos = suffix[id];
             if (testedPos == pos)
                 length = solution.length;
             else {
-                length = ZDICT_count(b+pos, b+testedPos);
+                length = (U32)ZDICT_count(b+pos, b+testedPos);
                 if (length > solution.length) length = solution.length;
             }
             pEnd = (U32)(testedPos + length);
@@ -491,7 +497,7 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
 static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
                             const void* const buffer, size_t bufferSize,   /* buffer must end with noisy guard band */
                             const size_t* fileSizes, unsigned nbFiles,
-
+                            unsigned minRatio, U32 notificationLevel)
 {
     int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
     int* const suffix = suffix0+1;
@@ -517,11 +523,11 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
     memset(doneMarks, 0, bufferSize+16);

     /* limit sample set size (divsufsort limitation)*/
-    if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (
+    if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
     while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];

     /* sort */
-    DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (
+    DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
     {   int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
         if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
     }
@@ -575,29 +581,31 @@ static void ZDICT_fillNoise(void* buffer, size_t length)

 typedef struct
 {
-
-    ZSTD_CCtx* zc;
+    ZSTD_CDict* dict;    /* dictionary */
+    ZSTD_CCtx* zc;       /* working context */
     void* workPlace;   /* must be ZSTD_BLOCKSIZE_MAX allocated */
 } EStats_ress_t;

 #define MAXREPOFFSET 1024

 static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
-
-
+                              unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
+                              const void* src, size_t srcSize,
+                              U32 notificationLevel)
 {
     size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
     size_t cSize;

     if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
-    {
-
+    {   size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
+        if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
+
     }
     cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
-    if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (
+    if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }

     if (cSize) {  /* if == 0; block is not compressible */
-        const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
+        const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);

         /* literals stats */
         {   const BYTE* bytePtr;
@@ -659,6 +667,18 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
     }
 }

+/* ZDICT_flatLit() :
+ * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
+ */
+static void ZDICT_flatLit(unsigned* countLit)
+{
+    int u;
+    for (u=1; u<256; u++) countLit[u] = 2;
+    countLit[0]   = 4;
+    countLit[253] = 1;
+    countLit[254] = 1;
+}

 #define OFFCODE_MAX 30  /* only applicable to first block */
 static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
@@ -667,18 +687,18 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
                             const void* dictBuffer, size_t dictBufferSize,
                             unsigned notificationLevel)
 {
-
+    unsigned countLit[256];
     HUF_CREATE_STATIC_CTABLE(hufTable, 255);
-
+    unsigned offcodeCount[OFFCODE_MAX+1];
     short offcodeNCount[OFFCODE_MAX+1];
     U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
-
+    unsigned matchLengthCount[MaxML+1];
     short matchLengthNCount[MaxML+1];
-
+    unsigned litLengthCount[MaxLL+1];
     short litLengthNCount[MaxLL+1];
     U32 repOffset[MAXREPOFFSET];
     offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
-    EStats_ress_t esr;
+    EStats_ress_t esr = { NULL, NULL, NULL };
     ZSTD_parameters params;
     U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
     size_t pos = 0, errorCode;
@@ -688,14 +708,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
     BYTE* dstPtr = (BYTE*)dstBuffer;

     /* init */
-
-    esr.zc = ZSTD_createCCtx();
-    esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
-    if (!esr.ref || !esr.zc || !esr.workPlace) {
-        eSize = ERROR(memory_allocation);
-        DISPLAYLEVEL(1, "Not enough memory \n");
-        goto _cleanup;
-    }
+    DEBUGLOG(4, "ZDICT_analyzeEntropy");
     if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; }   /* too large dictionary */
     for (u=0; u<256; u++) countLit[u] = 1;   /* any character must be described */
     for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
@@ -704,16 +717,19 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
     memset(repOffset, 0, sizeof(repOffset));
     repOffset[1] = repOffset[4] = repOffset[8] = 1;
     memset(bestRepOffset, 0, sizeof(bestRepOffset));
-    if (compressionLevel
+    if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
     params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
-    {   size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
-        if (ZSTD_isError(beginResult)) {
-            DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
-            eSize = ERROR(GENERIC);
-            goto _cleanup;
-    }   }

-
+    esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
+    esr.zc = ZSTD_createCCtx();
+    esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
+    if (!esr.dict || !esr.zc || !esr.workPlace) {
+        eSize = ERROR(memory_allocation);
+        DISPLAYLEVEL(1, "Not enough memory \n");
+        goto _cleanup;
+    }
+
+    /* collect stats on all samples */
     for (u=0; u<nbFiles; u++) {
         ZDICT_countEStats(esr, params,
                           countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
```
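For context on the stats-collection rewrite above: instead of re-running `ZSTD_compressBegin_advanced()` against the raw dictionary content for every sample, the trainer now loads the draft dictionary once, by reference, into a `ZSTD_CDict`, and references it from the working `ZSTD_CCtx` before compressing each sample block. A minimal sketch of that pattern against zstd's block-level API (experimental, requires `ZSTD_STATIC_LINKING_ONLY`); function and variable names here are illustrative, not taken from the diff:

```c
#define ZSTD_STATIC_LINKING_ONLY   /* block-level and advanced APIs */
#include <zstd.h>

/* Compress one sample block against a draft dictionary, mirroring the
 * CDict-based flow introduced above.  Returns the compressed size,
 * 0 if the block is not compressible, or a zstd error code. */
static size_t compress_block_with_dict(const void* dict, size_t dictSize,
                                       const void* sample, size_t sampleSize,
                                       void* workPlace /* >= ZSTD_BLOCKSIZE_MAX bytes */)
{
    ZSTD_compressionParameters const cParams = ZSTD_getCParams(3, sampleSize, dictSize);
    ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize,
                                                        ZSTD_dlm_byRef, ZSTD_dct_rawContent,
                                                        cParams, ZSTD_defaultCMem);
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    size_t cSize = 0;

    if (cdict != NULL && cctx != NULL
        && !ZSTD_isError(ZSTD_compressBegin_usingCDict(cctx, cdict))) {
        cSize = ZSTD_compressBlock(cctx, workPlace, ZSTD_BLOCKSIZE_MAX, sample, sampleSize);
    }
    ZSTD_freeCCtx(cctx);
    ZSTD_freeCDict(cdict);
    return cSize;
}
```

In the actual trainer the sample is additionally capped at the block/window size, and the `ZSTD_CDict` is created once and reused for every sample rather than per block as in this sketch.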
```diff
@@ -722,14 +738,21 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
         pos += fileSizes[u];
     }

-    /* analyze */
-
-
-
-
-
+    /* analyze, build stats, starting with literals */
+    {   size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+        if (HUF_isError(maxNbBits)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, " HUF_buildCTable error \n");
+            goto _cleanup;
+        }
+        if (maxNbBits==8) {  /* not compressible : will fail on HUF_writeCTable() */
+            DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
+            ZDICT_flatLit(countLit);  /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
+            maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+            assert(maxNbBits==9);
+        }
+        huffLog = (U32)maxNbBits;
     }
-    huffLog = (U32)errorCode;

     /* looking for most common first offsets */
     {   U32 offset;
```
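The fallback above hinges on `HUF_buildCTable()` reporting the largest code length it assigned: for a full 0..255 literal alphabet, a result of 8 bits means every symbol costs a whole byte, so the table is useless and `HUF_writeCTable()` has nothing compressible to serialize. `ZDICT_flatLit()` replaces the histogram with counts summing to 512 in which two symbols get a count of 1; most literals then keep 8-bit codes, symbol 0 gets a slightly shorter one, and the two rare symbols land one level deeper at 9 bits, which is why the retry can `assert(maxNbBits==9)`. A small sketch of the same retry control flow, using zstd's private `huf.h` interface (an internal header, shown only to illustrate the flow; `flatten_literals` is a hypothetical stand-in for the static `ZDICT_flatLit`):

```c
#include "huf.h"   /* zstd internal header: HUF_CREATE_STATIC_CTABLE, HUF_buildCTable, HUF_isError */

/* Hypothetical stand-in for the static ZDICT_flatLit() added in this diff. */
static void flatten_literals(unsigned countLit[256])
{
    int u;
    for (u = 1; u < 256; u++) countLit[u] = 2;
    countLit[0]   = 4;
    countLit[253] = 1;
    countLit[254] = 1;
}

/* Retry pattern from the hunk above: build the literal Huffman table, and
 * if the histogram turns out incompressible (8 bits per symbol), flatten it
 * and rebuild so the table can be serialized into the dictionary header. */
static size_t build_literal_ctable(unsigned countLit[256], unsigned huffLog)
{
    HUF_CREATE_STATIC_CTABLE(hufTable, 255);
    size_t maxNbBits = HUF_buildCTable(hufTable, countLit, 255, huffLog);
    if (HUF_isError(maxNbBits)) return maxNbBits;
    if (maxNbBits == 8) {              /* not compressible */
        flatten_literals(countLit);
        maxNbBits = HUF_buildCTable(hufTable, countLit, 255, huffLog);   /* now 9 */
    }
    return maxNbBits;
}
```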
```diff
@@ -829,7 +852,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
     eSize += 12;

 _cleanup:
-
+    ZSTD_freeCDict(esr.dict);
     ZSTD_freeCCtx(esr.zc);
     free(esr.workPlace);

@@ -840,16 +863,17 @@ _cleanup:

 size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
                           const void* customDictContent, size_t dictContentSize,
-                          const void* samplesBuffer, const size_t* samplesSizes,
-                          ZDICT_params_t params)
+                          const void* samplesBuffer, const size_t* samplesSizes,
+                          unsigned nbSamples, ZDICT_params_t params)
 {
     size_t hSize;
 #define HBUFFSIZE 256   /* should prove large enough for all entropy headers */
     BYTE header[HBUFFSIZE];
-    int const compressionLevel = (params.compressionLevel
+    int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
     U32 const notificationLevel = params.notificationLevel;

     /* check conditions */
+    DEBUGLOG(4, "ZDICT_finalizeDictionary");
     if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
     if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
     if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
@@ -886,11 +910,12 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
 }


-size_t ZDICT_addEntropyTablesFromBuffer_advanced(
-
-
+static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
+        void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
+        const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+        ZDICT_params_t params)
 {
-    int const compressionLevel = (params.compressionLevel
+    int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
     U32 const notificationLevel = params.notificationLevel;
     size_t hSize = 8;

@@ -919,7 +944,11 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
     return MIN(dictBufferCapacity, hSize+dictContentSize);
 }

-
+/* Hidden declaration for dbio.c */
+size_t ZDICT_trainFromBuffer_unsafe_legacy(
+                            void* dictBuffer, size_t maxDictSize,
+                            const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                            ZDICT_legacy_params_t params);
 /*! ZDICT_trainFromBuffer_unsafe_legacy() :
 *   Warning : `samplesBuffer` must be followed by noisy guard band.
 *   @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
@@ -954,31 +983,33 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(

     /* display best matches */
     if (params.zParams.notificationLevel>= 3) {
-
-
-
-        DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize);
+        unsigned const nb = MIN(25, dictList[0].pos);
+        unsigned const dictContentSize = ZDICT_dictSize(dictList);
+        unsigned u;
+        DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
         DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
         for (u=1; u<nb; u++) {
-
-
+            unsigned const pos = dictList[u].pos;
+            unsigned const length = dictList[u].length;
             U32 const printedLength = MIN(40, length);
-            if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize))
+            if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
+                free(dictList);
                 return ERROR(GENERIC);   /* should never happen */
+            }
             DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
-                         u, length, pos, dictList[u].savings);
+                         u, length, pos, (unsigned)dictList[u].savings);
             ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
             DISPLAYLEVEL(3, "| \n");
     }   }


     /* create dictionary */
-    {
+    {   unsigned dictContentSize = ZDICT_dictSize(dictList);
         if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* dictionary content too small */
         if (dictContentSize < targetDictSize/4) {
-            DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (
+            DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
             if (samplesBuffSize < 10 * targetDictSize)
-                DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (
+                DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
             if (minRep > MINRATIO) {
                 DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
                 DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
@@ -986,9 +1017,9 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
         }

         if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
-
+            unsigned proposedSelectivity = selectivity-1;
             while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
-            DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (
+            DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
             DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
             DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
         }
@@ -1025,8 +1056,9 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
 }


-/*
- *
+/* ZDICT_trainFromBuffer_legacy() :
+ * issue : samplesBuffer need to be followed by a noisy guard band.
+ * work around : duplicate the buffer, and add the noise */
 size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
                               const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
                               ZDICT_legacy_params_t params)
```
```diff
@@ -1053,19 +1085,23 @@ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
 {
-
+    ZDICT_fastCover_params_t params;
+    DEBUGLOG(3, "ZDICT_trainFromBuffer");
     memset(&params, 0, sizeof(params));
     params.d = 8;
     params.steps = 4;
-    /* Default to level 6 since no compression level information is
-    params.zParams.compressionLevel =
-
-
-
+    /* Default to level 6 since no compression level information is available */
+    params.zParams.compressionLevel = 3;
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
+    params.zParams.notificationLevel = DEBUGLEVEL;
+#endif
+    return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
+                                                   samplesBuffer, samplesSizes, nbSamples,
+                                                   &params);
 }

 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
-
+                                        const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
 {
     ZDICT_params_t params;
     memset(&params, 0, sizeof(params));
```