extzstd 0.2 → 0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/HISTORY.ja.md +13 -0
- data/README.md +17 -14
- data/contrib/zstd/{NEWS → CHANGELOG} +115 -2
- data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
- data/contrib/zstd/Makefile +99 -53
- data/contrib/zstd/README.md +59 -39
- data/contrib/zstd/TESTING.md +1 -1
- data/contrib/zstd/appveyor.yml +17 -6
- data/contrib/zstd/lib/BUCK +29 -2
- data/contrib/zstd/lib/Makefile +118 -21
- data/contrib/zstd/lib/README.md +84 -44
- data/contrib/zstd/lib/common/bitstream.h +17 -33
- data/contrib/zstd/lib/common/compiler.h +62 -8
- data/contrib/zstd/lib/common/cpu.h +215 -0
- data/contrib/zstd/lib/common/debug.c +44 -0
- data/contrib/zstd/lib/common/debug.h +134 -0
- data/contrib/zstd/lib/common/entropy_common.c +16 -1
- data/contrib/zstd/lib/common/error_private.c +7 -0
- data/contrib/zstd/lib/common/fse.h +48 -44
- data/contrib/zstd/lib/common/fse_decompress.c +3 -3
- data/contrib/zstd/lib/common/huf.h +169 -113
- data/contrib/zstd/lib/common/mem.h +20 -2
- data/contrib/zstd/lib/common/pool.c +135 -49
- data/contrib/zstd/lib/common/pool.h +40 -21
- data/contrib/zstd/lib/common/threading.c +2 -2
- data/contrib/zstd/lib/common/threading.h +12 -12
- data/contrib/zstd/lib/common/xxhash.c +3 -2
- data/contrib/zstd/lib/common/zstd_common.c +3 -6
- data/contrib/zstd/lib/common/zstd_errors.h +17 -7
- data/contrib/zstd/lib/common/zstd_internal.h +76 -48
- data/contrib/zstd/lib/compress/fse_compress.c +89 -209
- data/contrib/zstd/lib/compress/hist.c +203 -0
- data/contrib/zstd/lib/compress/hist.h +95 -0
- data/contrib/zstd/lib/compress/huf_compress.c +188 -80
- data/contrib/zstd/lib/compress/zstd_compress.c +2500 -1203
- data/contrib/zstd/lib/compress/zstd_compress_internal.h +463 -62
- data/contrib/zstd/lib/compress/zstd_double_fast.c +321 -131
- data/contrib/zstd/lib/compress/zstd_double_fast.h +13 -4
- data/contrib/zstd/lib/compress/zstd_fast.c +335 -108
- data/contrib/zstd/lib/compress/zstd_fast.h +12 -6
- data/contrib/zstd/lib/compress/zstd_lazy.c +654 -313
- data/contrib/zstd/lib/compress/zstd_lazy.h +44 -16
- data/contrib/zstd/lib/compress/zstd_ldm.c +310 -420
- data/contrib/zstd/lib/compress/zstd_ldm.h +63 -26
- data/contrib/zstd/lib/compress/zstd_opt.c +773 -325
- data/contrib/zstd/lib/compress/zstd_opt.h +31 -5
- data/contrib/zstd/lib/compress/zstdmt_compress.c +1468 -518
- data/contrib/zstd/lib/compress/zstdmt_compress.h +96 -45
- data/contrib/zstd/lib/decompress/huf_decompress.c +518 -282
- data/contrib/zstd/lib/decompress/zstd_ddict.c +240 -0
- data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
- data/contrib/zstd/lib/decompress/zstd_decompress.c +613 -1513
- data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1311 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_block.h +59 -0
- data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +175 -0
- data/contrib/zstd/lib/dictBuilder/cover.c +194 -113
- data/contrib/zstd/lib/dictBuilder/cover.h +112 -0
- data/contrib/zstd/lib/dictBuilder/divsufsort.c +3 -3
- data/contrib/zstd/lib/dictBuilder/fastcover.c +740 -0
- data/contrib/zstd/lib/dictBuilder/zdict.c +142 -106
- data/contrib/zstd/lib/dictBuilder/zdict.h +115 -49
- data/contrib/zstd/lib/legacy/zstd_legacy.h +44 -12
- data/contrib/zstd/lib/legacy/zstd_v01.c +41 -10
- data/contrib/zstd/lib/legacy/zstd_v01.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v02.c +37 -12
- data/contrib/zstd/lib/legacy/zstd_v02.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v03.c +38 -12
- data/contrib/zstd/lib/legacy/zstd_v03.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v04.c +55 -174
- data/contrib/zstd/lib/legacy/zstd_v04.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v05.c +59 -31
- data/contrib/zstd/lib/legacy/zstd_v05.h +12 -7
- data/contrib/zstd/lib/legacy/zstd_v06.c +48 -20
- data/contrib/zstd/lib/legacy/zstd_v06.h +10 -5
- data/contrib/zstd/lib/legacy/zstd_v07.c +62 -29
- data/contrib/zstd/lib/legacy/zstd_v07.h +10 -5
- data/contrib/zstd/lib/zstd.h +1346 -832
- data/ext/extzstd.c +27 -19
- data/ext/extzstd_stream.c +20 -4
- data/ext/zstd_compress.c +1 -0
- data/ext/zstd_decompress.c +4 -0
- data/ext/zstd_dictbuilder.c +4 -0
- data/ext/zstd_dictbuilder_fastcover.c +5 -0
- data/lib/extzstd.rb +52 -220
- data/lib/extzstd/version.rb +1 -1
- metadata +21 -7
- data/contrib/zstd/circle.yml +0 -63
@@ -207,7 +207,6 @@ static dictItem ZDICT_analyzePos(
|
|
207
207
|
U32 cumulLength[LLIMIT] = {0};
|
208
208
|
U32 savings[LLIMIT] = {0};
|
209
209
|
const BYTE* b = (const BYTE*)buffer;
|
210
|
-
size_t length;
|
211
210
|
size_t maxLength = LLIMIT;
|
212
211
|
size_t pos = suffix[start];
|
213
212
|
U32 end = start;
|
@@ -222,26 +221,30 @@ static dictItem ZDICT_analyzePos(
|
|
222
221
|
||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
|
223
222
|
||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
|
224
223
|
/* skip and mark segment */
|
225
|
-
U16
|
226
|
-
U32 u,
|
227
|
-
while (MEM_read16(b+pos+
|
228
|
-
if (b[pos+
|
229
|
-
for (u=1; u<
|
224
|
+
U16 const pattern16 = MEM_read16(b+pos+4);
|
225
|
+
U32 u, patternEnd = 6;
|
226
|
+
while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
|
227
|
+
if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
|
228
|
+
for (u=1; u<patternEnd; u++)
|
230
229
|
doneMarks[pos+u] = 1;
|
231
230
|
return solution;
|
232
231
|
}
|
233
232
|
|
234
233
|
/* look forward */
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
234
|
+
{ size_t length;
|
235
|
+
do {
|
236
|
+
end++;
|
237
|
+
length = ZDICT_count(b + pos, b + suffix[end]);
|
238
|
+
} while (length >= MINMATCHLENGTH);
|
239
|
+
}
|
239
240
|
|
240
241
|
/* look backward */
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
242
|
+
{ size_t length;
|
243
|
+
do {
|
244
|
+
length = ZDICT_count(b + pos, b + *(suffix+start-1));
|
245
|
+
if (length >=MINMATCHLENGTH) start--;
|
246
|
+
} while(length >= MINMATCHLENGTH);
|
247
|
+
}
|
245
248
|
|
246
249
|
/* exit if not found a minimum nb of repetitions */
|
247
250
|
if (end-start < minRatio) {
|
@@ -252,15 +255,15 @@ static dictItem ZDICT_analyzePos(
|
|
252
255
|
}
|
253
256
|
|
254
257
|
{ int i;
|
255
|
-
U32
|
258
|
+
U32 mml;
|
256
259
|
U32 refinedStart = start;
|
257
260
|
U32 refinedEnd = end;
|
258
261
|
|
259
262
|
DISPLAYLEVEL(4, "\n");
|
260
|
-
DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (
|
263
|
+
DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
|
261
264
|
DISPLAYLEVEL(4, "\n");
|
262
265
|
|
263
|
-
for (
|
266
|
+
for (mml = MINMATCHLENGTH ; ; mml++) {
|
264
267
|
BYTE currentChar = 0;
|
265
268
|
U32 currentCount = 0;
|
266
269
|
U32 currentID = refinedStart;
|
@@ -268,13 +271,13 @@ static dictItem ZDICT_analyzePos(
|
|
268
271
|
U32 selectedCount = 0;
|
269
272
|
U32 selectedID = currentID;
|
270
273
|
for (id =refinedStart; id < refinedEnd; id++) {
|
271
|
-
if (b[
|
274
|
+
if (b[suffix[id] + mml] != currentChar) {
|
272
275
|
if (currentCount > selectedCount) {
|
273
276
|
selectedCount = currentCount;
|
274
277
|
selectedID = currentID;
|
275
278
|
}
|
276
279
|
currentID = id;
|
277
|
-
currentChar = b[ suffix[id] +
|
280
|
+
currentChar = b[ suffix[id] + mml];
|
278
281
|
currentCount = 0;
|
279
282
|
}
|
280
283
|
currentCount ++;
|
@@ -290,27 +293,30 @@ static dictItem ZDICT_analyzePos(
|
|
290
293
|
refinedEnd = refinedStart + selectedCount;
|
291
294
|
}
|
292
295
|
|
293
|
-
/* evaluate gain based on new
|
296
|
+
/* evaluate gain based on new dict */
|
294
297
|
start = refinedStart;
|
295
298
|
pos = suffix[refinedStart];
|
296
299
|
end = start;
|
297
300
|
memset(lengthList, 0, sizeof(lengthList));
|
298
301
|
|
299
302
|
/* look forward */
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
303
|
+
{ size_t length;
|
304
|
+
do {
|
305
|
+
end++;
|
306
|
+
length = ZDICT_count(b + pos, b + suffix[end]);
|
307
|
+
if (length >= LLIMIT) length = LLIMIT-1;
|
308
|
+
lengthList[length]++;
|
309
|
+
} while (length >=MINMATCHLENGTH);
|
310
|
+
}
|
306
311
|
|
307
312
|
/* look backward */
|
308
|
-
length = MINMATCHLENGTH;
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
313
|
+
{ size_t length = MINMATCHLENGTH;
|
314
|
+
while ((length >= MINMATCHLENGTH) & (start > 0)) {
|
315
|
+
length = ZDICT_count(b + pos, b + suffix[start - 1]);
|
316
|
+
if (length >= LLIMIT) length = LLIMIT - 1;
|
317
|
+
lengthList[length]++;
|
318
|
+
if (length >= MINMATCHLENGTH) start--;
|
319
|
+
}
|
314
320
|
}
|
315
321
|
|
316
322
|
/* largest useful length */
|
@@ -335,8 +341,8 @@ static dictItem ZDICT_analyzePos(
|
|
335
341
|
for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
|
336
342
|
savings[i] = savings[i-1] + (lengthList[i] * (i-3));
|
337
343
|
|
338
|
-
DISPLAYLEVEL(4, "Selected
|
339
|
-
(
|
344
|
+
DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
|
345
|
+
(unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
|
340
346
|
|
341
347
|
solution.pos = (U32)pos;
|
342
348
|
solution.length = (U32)maxLength;
|
@@ -345,12 +351,12 @@ static dictItem ZDICT_analyzePos(
|
|
345
351
|
/* mark positions done */
|
346
352
|
{ U32 id;
|
347
353
|
for (id=start; id<end; id++) {
|
348
|
-
U32 p, pEnd;
|
354
|
+
U32 p, pEnd, length;
|
349
355
|
U32 const testedPos = suffix[id];
|
350
356
|
if (testedPos == pos)
|
351
357
|
length = solution.length;
|
352
358
|
else {
|
353
|
-
length = ZDICT_count(b+pos, b+testedPos);
|
359
|
+
length = (U32)ZDICT_count(b+pos, b+testedPos);
|
354
360
|
if (length > solution.length) length = solution.length;
|
355
361
|
}
|
356
362
|
pEnd = (U32)(testedPos + length);
|
@@ -491,7 +497,7 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
|
|
491
497
|
static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
|
492
498
|
const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
|
493
499
|
const size_t* fileSizes, unsigned nbFiles,
|
494
|
-
|
500
|
+
unsigned minRatio, U32 notificationLevel)
|
495
501
|
{
|
496
502
|
int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
|
497
503
|
int* const suffix = suffix0+1;
|
@@ -517,11 +523,11 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
|
|
517
523
|
memset(doneMarks, 0, bufferSize+16);
|
518
524
|
|
519
525
|
/* limit sample set size (divsufsort limitation)*/
|
520
|
-
if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (
|
526
|
+
if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
|
521
527
|
while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
|
522
528
|
|
523
529
|
/* sort */
|
524
|
-
DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (
|
530
|
+
DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
|
525
531
|
{ int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
|
526
532
|
if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
|
527
533
|
}
|
@@ -575,29 +581,31 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
|
|
575
581
|
|
576
582
|
typedef struct
|
577
583
|
{
|
578
|
-
|
579
|
-
ZSTD_CCtx* zc;
|
584
|
+
ZSTD_CDict* dict; /* dictionary */
|
585
|
+
ZSTD_CCtx* zc; /* working context */
|
580
586
|
void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
|
581
587
|
} EStats_ress_t;
|
582
588
|
|
583
589
|
#define MAXREPOFFSET 1024
|
584
590
|
|
585
591
|
static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
586
|
-
|
587
|
-
|
592
|
+
unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
|
593
|
+
const void* src, size_t srcSize,
|
594
|
+
U32 notificationLevel)
|
588
595
|
{
|
589
596
|
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
|
590
597
|
size_t cSize;
|
591
598
|
|
592
599
|
if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
|
593
|
-
{
|
594
|
-
|
600
|
+
{ size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
|
601
|
+
if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
|
602
|
+
|
595
603
|
}
|
596
604
|
cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
|
597
|
-
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (
|
605
|
+
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
|
598
606
|
|
599
607
|
if (cSize) { /* if == 0; block is not compressible */
|
600
|
-
const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
|
608
|
+
const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
|
601
609
|
|
602
610
|
/* literals stats */
|
603
611
|
{ const BYTE* bytePtr;
|
@@ -659,6 +667,18 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
|
|
659
667
|
}
|
660
668
|
}
|
661
669
|
|
670
|
+
/* ZDICT_flatLit() :
|
671
|
+
* rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
|
672
|
+
* necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
|
673
|
+
*/
|
674
|
+
static void ZDICT_flatLit(unsigned* countLit)
|
675
|
+
{
|
676
|
+
int u;
|
677
|
+
for (u=1; u<256; u++) countLit[u] = 2;
|
678
|
+
countLit[0] = 4;
|
679
|
+
countLit[253] = 1;
|
680
|
+
countLit[254] = 1;
|
681
|
+
}
|
662
682
|
|
663
683
|
#define OFFCODE_MAX 30 /* only applicable to first block */
|
664
684
|
static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
@@ -667,18 +687,18 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
667
687
|
const void* dictBuffer, size_t dictBufferSize,
|
668
688
|
unsigned notificationLevel)
|
669
689
|
{
|
670
|
-
|
690
|
+
unsigned countLit[256];
|
671
691
|
HUF_CREATE_STATIC_CTABLE(hufTable, 255);
|
672
|
-
|
692
|
+
unsigned offcodeCount[OFFCODE_MAX+1];
|
673
693
|
short offcodeNCount[OFFCODE_MAX+1];
|
674
694
|
U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
|
675
|
-
|
695
|
+
unsigned matchLengthCount[MaxML+1];
|
676
696
|
short matchLengthNCount[MaxML+1];
|
677
|
-
|
697
|
+
unsigned litLengthCount[MaxLL+1];
|
678
698
|
short litLengthNCount[MaxLL+1];
|
679
699
|
U32 repOffset[MAXREPOFFSET];
|
680
700
|
offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
|
681
|
-
EStats_ress_t esr;
|
701
|
+
EStats_ress_t esr = { NULL, NULL, NULL };
|
682
702
|
ZSTD_parameters params;
|
683
703
|
U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
|
684
704
|
size_t pos = 0, errorCode;
|
@@ -688,14 +708,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
688
708
|
BYTE* dstPtr = (BYTE*)dstBuffer;
|
689
709
|
|
690
710
|
/* init */
|
691
|
-
|
692
|
-
esr.zc = ZSTD_createCCtx();
|
693
|
-
esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
|
694
|
-
if (!esr.ref || !esr.zc || !esr.workPlace) {
|
695
|
-
eSize = ERROR(memory_allocation);
|
696
|
-
DISPLAYLEVEL(1, "Not enough memory \n");
|
697
|
-
goto _cleanup;
|
698
|
-
}
|
711
|
+
DEBUGLOG(4, "ZDICT_analyzeEntropy");
|
699
712
|
if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
|
700
713
|
for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
|
701
714
|
for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
|
@@ -704,16 +717,19 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
704
717
|
memset(repOffset, 0, sizeof(repOffset));
|
705
718
|
repOffset[1] = repOffset[4] = repOffset[8] = 1;
|
706
719
|
memset(bestRepOffset, 0, sizeof(bestRepOffset));
|
707
|
-
if (compressionLevel
|
720
|
+
if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
|
708
721
|
params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
|
709
|
-
{ size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
|
710
|
-
if (ZSTD_isError(beginResult)) {
|
711
|
-
DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
|
712
|
-
eSize = ERROR(GENERIC);
|
713
|
-
goto _cleanup;
|
714
|
-
} }
|
715
722
|
|
716
|
-
|
723
|
+
esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
|
724
|
+
esr.zc = ZSTD_createCCtx();
|
725
|
+
esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
|
726
|
+
if (!esr.dict || !esr.zc || !esr.workPlace) {
|
727
|
+
eSize = ERROR(memory_allocation);
|
728
|
+
DISPLAYLEVEL(1, "Not enough memory \n");
|
729
|
+
goto _cleanup;
|
730
|
+
}
|
731
|
+
|
732
|
+
/* collect stats on all samples */
|
717
733
|
for (u=0; u<nbFiles; u++) {
|
718
734
|
ZDICT_countEStats(esr, params,
|
719
735
|
countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
|
@@ -722,14 +738,21 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
722
738
|
pos += fileSizes[u];
|
723
739
|
}
|
724
740
|
|
725
|
-
/* analyze */
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
741
|
+
/* analyze, build stats, starting with literals */
|
742
|
+
{ size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
|
743
|
+
if (HUF_isError(maxNbBits)) {
|
744
|
+
eSize = ERROR(GENERIC);
|
745
|
+
DISPLAYLEVEL(1, " HUF_buildCTable error \n");
|
746
|
+
goto _cleanup;
|
747
|
+
}
|
748
|
+
if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
|
749
|
+
DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
|
750
|
+
ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
|
751
|
+
maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
|
752
|
+
assert(maxNbBits==9);
|
753
|
+
}
|
754
|
+
huffLog = (U32)maxNbBits;
|
731
755
|
}
|
732
|
-
huffLog = (U32)errorCode;
|
733
756
|
|
734
757
|
/* looking for most common first offsets */
|
735
758
|
{ U32 offset;
|
@@ -829,7 +852,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
829
852
|
eSize += 12;
|
830
853
|
|
831
854
|
_cleanup:
|
832
|
-
|
855
|
+
ZSTD_freeCDict(esr.dict);
|
833
856
|
ZSTD_freeCCtx(esr.zc);
|
834
857
|
free(esr.workPlace);
|
835
858
|
|
@@ -840,16 +863,17 @@ _cleanup:
|
|
840
863
|
|
841
864
|
size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
|
842
865
|
const void* customDictContent, size_t dictContentSize,
|
843
|
-
const void* samplesBuffer, const size_t* samplesSizes,
|
844
|
-
ZDICT_params_t params)
|
866
|
+
const void* samplesBuffer, const size_t* samplesSizes,
|
867
|
+
unsigned nbSamples, ZDICT_params_t params)
|
845
868
|
{
|
846
869
|
size_t hSize;
|
847
870
|
#define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
|
848
871
|
BYTE header[HBUFFSIZE];
|
849
|
-
int const compressionLevel = (params.compressionLevel
|
872
|
+
int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
|
850
873
|
U32 const notificationLevel = params.notificationLevel;
|
851
874
|
|
852
875
|
/* check conditions */
|
876
|
+
DEBUGLOG(4, "ZDICT_finalizeDictionary");
|
853
877
|
if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
|
854
878
|
if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
|
855
879
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
|
@@ -886,11 +910,12 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
|
|
886
910
|
}
|
887
911
|
|
888
912
|
|
889
|
-
size_t ZDICT_addEntropyTablesFromBuffer_advanced(
|
890
|
-
|
891
|
-
|
913
|
+
static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
|
914
|
+
void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
|
915
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
916
|
+
ZDICT_params_t params)
|
892
917
|
{
|
893
|
-
int const compressionLevel = (params.compressionLevel
|
918
|
+
int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
|
894
919
|
U32 const notificationLevel = params.notificationLevel;
|
895
920
|
size_t hSize = 8;
|
896
921
|
|
@@ -919,7 +944,11 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
|
|
919
944
|
return MIN(dictBufferCapacity, hSize+dictContentSize);
|
920
945
|
}
|
921
946
|
|
922
|
-
|
947
|
+
/* Hidden declaration for dbio.c */
|
948
|
+
size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
949
|
+
void* dictBuffer, size_t maxDictSize,
|
950
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
951
|
+
ZDICT_legacy_params_t params);
|
923
952
|
/*! ZDICT_trainFromBuffer_unsafe_legacy() :
|
924
953
|
* Warning : `samplesBuffer` must be followed by noisy guard band.
|
925
954
|
* @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
|
@@ -954,31 +983,33 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
|
954
983
|
|
955
984
|
/* display best matches */
|
956
985
|
if (params.zParams.notificationLevel>= 3) {
|
957
|
-
|
958
|
-
|
959
|
-
|
960
|
-
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize);
|
986
|
+
unsigned const nb = MIN(25, dictList[0].pos);
|
987
|
+
unsigned const dictContentSize = ZDICT_dictSize(dictList);
|
988
|
+
unsigned u;
|
989
|
+
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
|
961
990
|
DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
|
962
991
|
for (u=1; u<nb; u++) {
|
963
|
-
|
964
|
-
|
992
|
+
unsigned const pos = dictList[u].pos;
|
993
|
+
unsigned const length = dictList[u].length;
|
965
994
|
U32 const printedLength = MIN(40, length);
|
966
|
-
if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize))
|
995
|
+
if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
|
996
|
+
free(dictList);
|
967
997
|
return ERROR(GENERIC); /* should never happen */
|
998
|
+
}
|
968
999
|
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
969
|
-
u, length, pos, dictList[u].savings);
|
1000
|
+
u, length, pos, (unsigned)dictList[u].savings);
|
970
1001
|
ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
|
971
1002
|
DISPLAYLEVEL(3, "| \n");
|
972
1003
|
} }
|
973
1004
|
|
974
1005
|
|
975
1006
|
/* create dictionary */
|
976
|
-
{
|
1007
|
+
{ unsigned dictContentSize = ZDICT_dictSize(dictList);
|
977
1008
|
if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
|
978
1009
|
if (dictContentSize < targetDictSize/4) {
|
979
|
-
DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (
|
1010
|
+
DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
|
980
1011
|
if (samplesBuffSize < 10 * targetDictSize)
|
981
|
-
DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (
|
1012
|
+
DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
|
982
1013
|
if (minRep > MINRATIO) {
|
983
1014
|
DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
|
984
1015
|
DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
|
@@ -986,9 +1017,9 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
|
986
1017
|
}
|
987
1018
|
|
988
1019
|
if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
|
989
|
-
|
1020
|
+
unsigned proposedSelectivity = selectivity-1;
|
990
1021
|
while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
|
991
|
-
DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (
|
1022
|
+
DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
|
992
1023
|
DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
|
993
1024
|
DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
|
994
1025
|
}
|
@@ -1025,8 +1056,9 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
|
1025
1056
|
}
|
1026
1057
|
|
1027
1058
|
|
1028
|
-
/*
|
1029
|
-
*
|
1059
|
+
/* ZDICT_trainFromBuffer_legacy() :
|
1060
|
+
* issue : samplesBuffer need to be followed by a noisy guard band.
|
1061
|
+
* work around : duplicate the buffer, and add the noise */
|
1030
1062
|
size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
|
1031
1063
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
1032
1064
|
ZDICT_legacy_params_t params)
|
@@ -1053,19 +1085,23 @@ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
|
|
1053
1085
|
size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
|
1054
1086
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
|
1055
1087
|
{
|
1056
|
-
|
1088
|
+
ZDICT_fastCover_params_t params;
|
1089
|
+
DEBUGLOG(3, "ZDICT_trainFromBuffer");
|
1057
1090
|
memset(&params, 0, sizeof(params));
|
1058
1091
|
params.d = 8;
|
1059
1092
|
params.steps = 4;
|
1060
|
-
/* Default to level 6 since no compression level information is
|
1061
|
-
params.zParams.compressionLevel =
|
1062
|
-
|
1063
|
-
|
1064
|
-
|
1093
|
+
/* Default to level 6 since no compression level information is available */
|
1094
|
+
params.zParams.compressionLevel = 3;
|
1095
|
+
#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
|
1096
|
+
params.zParams.notificationLevel = DEBUGLEVEL;
|
1097
|
+
#endif
|
1098
|
+
return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
|
1099
|
+
samplesBuffer, samplesSizes, nbSamples,
|
1100
|
+
&params);
|
1065
1101
|
}
|
1066
1102
|
|
1067
1103
|
size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
|
1068
|
-
|
1104
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
|
1069
1105
|
{
|
1070
1106
|
ZDICT_params_t params;
|
1071
1107
|
memset(&params, 0, sizeof(params));
|