zstd-ruby 1.3.8.0 → 1.4.0.0
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.travis.yml +6 -5
- data/README.md +1 -1
- data/ext/zstdruby/libzstd/Makefile +7 -3
- data/ext/zstdruby/libzstd/README.md +4 -2
- data/ext/zstdruby/libzstd/common/compiler.h +1 -1
- data/ext/zstdruby/libzstd/common/fse.h +1 -1
- data/ext/zstdruby/libzstd/common/threading.c +2 -2
- data/ext/zstdruby/libzstd/common/xxhash.c +2 -2
- data/ext/zstdruby/libzstd/common/zstd_internal.h +55 -2
- data/ext/zstdruby/libzstd/compress/fse_compress.c +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +423 -296
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +14 -11
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +203 -124
- data/ext/zstdruby/libzstd/compress/zstd_lazy.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +27 -11
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +41 -49
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +43 -26
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +4 -4
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +257 -164
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +51 -47
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +7 -0
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +58 -13
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +29 -0
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +25 -13
- data/ext/zstdruby/libzstd/dictBuilder/zdict.h +18 -8
- data/ext/zstdruby/libzstd/dll/example/build_package.bat +3 -2
- data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +42 -12
- data/ext/zstdruby/libzstd/legacy/zstd_v01.c +32 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v01.h +12 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v02.c +31 -12
- data/ext/zstdruby/libzstd/legacy/zstd_v02.h +12 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v03.c +32 -12
- data/ext/zstdruby/libzstd/legacy/zstd_v03.h +12 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v04.c +32 -12
- data/ext/zstdruby/libzstd/legacy/zstd_v04.h +12 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v05.c +32 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v05.h +12 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v06.c +36 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v06.h +10 -5
- data/ext/zstdruby/libzstd/legacy/zstd_v07.c +40 -9
- data/ext/zstdruby/libzstd/legacy/zstd_v07.h +10 -5
- data/ext/zstdruby/libzstd/zstd.h +689 -542
- data/lib/zstd-ruby/version.rb +1 -1
- data/zstd-ruby.gemspec +1 -1
- metadata +6 -7
- data/ext/zstdruby/libzstd/dll/libzstd.def +0 -87
data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c

@@ -56,14 +56,15 @@ static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
 size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
                           blockProperties_t* bpPtr)
 {
-    if (srcSize < ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong);
+
     {   U32 const cBlockHeader = MEM_readLE24(src);
         U32 const cSize = cBlockHeader >> 3;
         bpPtr->lastBlock = cBlockHeader & 1;
         bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
         bpPtr->origSize = cSize;   /* only useful for RLE */
         if (bpPtr->blockType == bt_rle) return 1;
-        if (bpPtr->blockType == bt_reserved) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected);
         return cSize;
     }
 }
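Most of the changes in this file replace the older `if (cond) return ERROR(code);` pattern with the `RETURN_ERROR_IF` / `RETURN_ERROR` macros that zstd 1.4.0 introduces. Below is a minimal sketch of the idea only, using hypothetical `MY_`-prefixed stand-ins rather than libzstd's actual error machinery; the real macros also log the failing condition and the optional trailing message when debug logging is enabled:

```c
#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-ins for libzstd's error codes and ERROR() sentinel. */
typedef enum { my_srcSize_wrong = 72 } my_ErrorCode;
#define MY_ERROR(name) ((size_t)-(size_t)my_##name)

/* Sketch: bail out with a zstd-style error sentinel when cond holds.
 * The variadic slot carries an optional explanatory message, which the
 * real macro prints at high debug levels; here it is simply dropped. */
#define MY_RETURN_ERROR_IF(cond, err, ...) \
    do { if (cond) return MY_ERROR(err); } while (0)

static size_t getBlockSizeChecked(size_t srcSize)
{
    MY_RETURN_ERROR_IF(srcSize < 3, srcSize_wrong, "need full 3-byte block header");
    return srcSize;
}

int main(void)
{
    printf("%zu\n", getBlockSizeChecked(2));  /* huge sentinel value == error */
    return 0;
}
```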
@@ -78,7 +79,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
 size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                           const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
 {
-    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+    RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected);
 
     {   const BYTE* const istart = (const BYTE*) src;
         symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);

@@ -86,11 +87,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
         switch(litEncType)
         {
         case set_repeat:
-            if (dctx->litEntropy==0) return ERROR(dictionary_corrupted);
+            RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted);
             /* fall-through */
 
         case set_compressed:
-            if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
+            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
             {   size_t lhSize, litSize, litCSize;
                 U32 singleStream=0;
                 U32 const lhlCode = (istart[0] >> 2) & 3;

@@ -118,8 +119,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     litCSize = (lhc >> 22) + (istart[4] << 10);
                     break;
                 }
-                if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected);
-                if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected);
+                RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected);
 
                 /* prefetch huffman table if cold */
                 if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {

@@ -157,7 +158,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     }
                 }
 
-                if (HUF_isError(hufSuccess)) return ERROR(corruption_detected);
+                RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected);
 
                 dctx->litPtr = dctx->litBuffer;
                 dctx->litSize = litSize;

@@ -187,7 +188,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                 }
 
                 if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
-                    if (litSize+lhSize > srcSize) return ERROR(corruption_detected);
+                    RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected);
                     memcpy(dctx->litBuffer, istart+lhSize, litSize);
                     dctx->litPtr = dctx->litBuffer;
                     dctx->litSize = litSize;

@@ -216,17 +217,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                 case 3:
                     lhSize = 3;
                     litSize = MEM_readLE24(istart) >> 4;
-                    if (srcSize<4) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
+                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
                     break;
                 }
-                if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected);
+                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected);
                 memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
                 dctx->litPtr = dctx->litBuffer;
                 dctx->litSize = litSize;
                 return lhSize+1;
             }
         default:
-            return ERROR(corruption_detected);   /* impossible */
+            RETURN_ERROR(corruption_detected, "impossible");
         }
     }
 }
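The added size checks track the literals-section header layout: the two low bits of the first byte encode the literals type, the next two bits a size format, and for size format 3 a raw/RLE header spans 3 bytes with a 20-bit regenerated size, which is exactly what `litSize = MEM_readLE24(istart) >> 4` extracts. A self-contained sketch of that case, with made-up header bytes:

```c
#include <stdint.h>
#include <stdio.h>

/* Little-endian 24-bit read, as MEM_readLE24 does in libzstd. */
static uint32_t readLE24(const uint8_t* p)
{
    return (uint32_t)p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16);
}

int main(void)
{
    const uint8_t hdr[3] = { 0x0D, 0x34, 0x12 };    /* illustrative header bytes */
    uint32_t const sizeFormat = (hdr[0] >> 2) & 3;  /* == 3: 3-byte header form */
    uint32_t const litSize = readLE24(hdr) >> 4;    /* 20-bit regenerated size */
    /* With lhSize == 3, the decoder needs srcSize >= lhSize+1 == 4, hence the check. */
    printf("sizeFormat=%u litSize=%u\n", sizeFormat, litSize);
    return 0;
}
```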
@@ -436,8 +437,8 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
     switch(type)
     {
     case set_rle :
-        if (!srcSize) return ERROR(srcSize_wrong);
-        if ( (*(const BYTE*)src) > max) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(!srcSize, srcSize_wrong);
+        RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected);
         {   U32 const symbol = *(const BYTE*)src;
             U32 const baseline = baseValue[symbol];
             U32 const nbBits = nbAdditionalBits[symbol];

@@ -449,7 +450,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
         *DTablePtr = defaultTable;
         return 0;
     case set_repeat:
-        if (!flagRepeatTable) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(!flagRepeatTable, corruption_detected);
         /* prefetch FSE table if used */
         if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
             const void* const pStart = *DTablePtr;

@@ -461,15 +462,15 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
         {   unsigned tableLog;
             S16 norm[MaxSeq+1];
             size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
-            if (FSE_isError(headerSize)) return ERROR(corruption_detected);
-            if (tableLog > maxLog) return ERROR(corruption_detected);
+            RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected);
+            RETURN_ERROR_IF(tableLog > maxLog, corruption_detected);
             ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
             *DTablePtr = DTableSpace;
             return headerSize;
         }
-    default :   /* impossible */
+    default :
         assert(0);
-        return ERROR(GENERIC);
+        RETURN_ERROR(GENERIC, "impossible");
     }
 }
 
@@ -483,28 +484,28 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
     DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
 
     /* check */
-    if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);
+    RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong);
 
     /* SeqHead */
     nbSeq = *ip++;
     if (!nbSeq) {
         *nbSeqPtr=0;
-        if (srcSize != 1) return ERROR(srcSize_wrong);
+        RETURN_ERROR_IF(srcSize != 1, srcSize_wrong);
         return 1;
     }
     if (nbSeq > 0x7F) {
         if (nbSeq == 0xFF) {
-            if (ip+2 > iend) return ERROR(srcSize_wrong);
+            RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong);
             nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
         } else {
-            if (ip >= iend) return ERROR(srcSize_wrong);
+            RETURN_ERROR_IF(ip >= iend, srcSize_wrong);
             nbSeq = ((nbSeq-0x80)<<8) + *ip++;
         }
     }
     *nbSeqPtr = nbSeq;
 
     /* FSE table descriptors */
-    if (ip+4 > iend) return ERROR(srcSize_wrong); /* minimum possible size */
+    RETURN_ERROR_IF(ip+4 > iend, srcSize_wrong); /* minimum possible size */
     {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
         symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
         symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
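The new `ip`/`iend` bounds checks guard the variable-length sequence-count encoding visible in this hunk: one byte for counts up to 0x7F, two bytes for larger counts, and a three-byte form (0xFF marker plus a little-endian 16-bit value biased by `LONGNBSEQ`, which libzstd defines as 0x7F00). A standalone sketch of the same decoding with explicit bounds checks:

```c
#include <stdint.h>
#include <stdio.h>

#define LONGNBSEQ 0x7F00  /* bias libzstd applies in the 3-byte form */

/* Returns 0 on success, -1 on truncated input (srcSize_wrong in libzstd). */
static int readNbSeq(const uint8_t* ip, const uint8_t* iend, int* nbSeqPtr)
{
    int nbSeq;
    if (ip >= iend) return -1;
    nbSeq = *ip++;
    if (nbSeq > 0x7F) {
        if (nbSeq == 0xFF) {                    /* 3-byte form */
            if (ip + 2 > iend) return -1;
            nbSeq = (ip[0] | (ip[1] << 8)) + LONGNBSEQ; ip += 2;
        } else {                                /* 2-byte form */
            if (ip >= iend) return -1;
            nbSeq = ((nbSeq - 0x80) << 8) + *ip++;
        }
    }
    *nbSeqPtr = nbSeq;
    return 0;
}

int main(void)
{
    const uint8_t hdr[3] = { 0xFF, 0x02, 0x01 };  /* 0x0102 + 0x7F00 = 32770 */
    int nbSeq;
    if (readNbSeq(hdr, hdr + 3, &nbSeq) == 0) printf("nbSeq=%d\n", nbSeq);
    return 0;
}
```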
@@ -517,7 +518,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                       LL_base, LL_bits,
                                                       LL_defaultDTable, dctx->fseEntropy,
                                                       dctx->ddictIsCold, nbSeq);
-            if (ZSTD_isError(llhSize)) return ERROR(corruption_detected);
+            RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected);
             ip += llhSize;
         }
 
@@ -527,7 +528,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                       OF_base, OF_bits,
                                                       OF_defaultDTable, dctx->fseEntropy,
                                                       dctx->ddictIsCold, nbSeq);
-            if (ZSTD_isError(ofhSize)) return ERROR(corruption_detected);
+            RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected);
             ip += ofhSize;
         }
 
@@ -537,7 +538,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                       ML_base, ML_bits,
                                                       ML_defaultDTable, dctx->fseEntropy,
                                                       dctx->ddictIsCold, nbSeq);
-            if (ZSTD_isError(mlhSize)) return ERROR(corruption_detected);
+            RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected);
             ip += mlhSize;
         }
     }

@@ -590,8 +591,8 @@ size_t ZSTD_execSequenceLast7(BYTE* op,
     const BYTE* match = oLitEnd - sequence.offset;
 
     /* check */
-    if (oMatchEnd>oend) return ERROR(dstSize_tooSmall);   /* last match must fit within dstBuffer */
-    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* try to read beyond literal buffer */
+    RETURN_ERROR_IF(oMatchEnd>oend, dstSize_tooSmall, "last match must fit within dstBuffer");
+    RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "try to read beyond literal buffer");
 
     /* copy literals */
     while (op < oLitEnd) *op++ = *(*litPtr)++;

@@ -599,7 +600,7 @@ size_t ZSTD_execSequenceLast7(BYTE* op,
     /* copy Match */
     if (sequence.offset > (size_t)(oLitEnd - base)) {
         /* offset beyond prefix */
-        if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - vBase),corruption_detected);
         match = dictEnd - (base-match);
         if (match + sequence.matchLength <= dictEnd) {
             memmove(oLitEnd, match, sequence.matchLength);

@@ -631,8 +632,8 @@ size_t ZSTD_execSequence(BYTE* op,
     const BYTE* match = oLitEnd - sequence.offset;
 
     /* check */
-    if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
-    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+    RETURN_ERROR_IF(oMatchEnd>oend, dstSize_tooSmall, "last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend");
+    RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "over-read beyond lit buffer");
     if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
 
     /* copy Literals */

@@ -645,8 +646,7 @@ size_t ZSTD_execSequence(BYTE* op,
     /* copy Match */
     if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
         /* offset beyond prefix -> go into extDict */
-        if (sequence.offset > (size_t)(oLitEnd - virtualStart))
-            return ERROR(corruption_detected);
+        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected);
         match = dictEnd + (match - prefixStart);
         if (match + sequence.matchLength <= dictEnd) {
             memmove(oLitEnd, match, sequence.matchLength);

@@ -712,8 +712,8 @@ size_t ZSTD_execSequenceLong(BYTE* op,
     const BYTE* match = sequence.match;
 
     /* check */
-    if (oMatchEnd > oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
-    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+    RETURN_ERROR_IF(oMatchEnd > oend, dstSize_tooSmall, "last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend");
+    RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "over-read beyond lit buffer");
     if (oLitEnd > oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, dictStart, dictEnd);
 
     /* copy Literals */

@@ -726,7 +726,7 @@ size_t ZSTD_execSequenceLong(BYTE* op,
     /* copy Match */
     if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
         /* offset beyond prefix */
-        if (sequence.offset > (size_t)(oLitEnd - dictStart)) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - dictStart), corruption_detected);
         if (match + sequence.matchLength <= dictEnd) {
             memmove(oLitEnd, match, sequence.matchLength);
             return sequenceLength;
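Several of the checks above guard the remapping of a match offset into the external dictionary: when `sequence.offset` reaches back further than the decoded prefix, the remainder of the offset must land inside the `[dictStart, dictEnd)` segment, otherwise the frame is corrupt. A toy model of that arithmetic, with buffers and offsets invented for illustration:

```c
#include <stddef.h>
#include <stdio.h>

int main(void)
{
    const char dict[] = "0123456789";   /* extDict segment, 10 bytes */
    const char* const dictEnd = dict + 10;
    size_t const prefixLen = 5;         /* bytes decoded so far: oLitEnd - prefixStart */
    size_t const offset = 8;            /* sequence.offset, reaches past the prefix */

    if (offset > prefixLen) {           /* match starts in the external dictionary */
        size_t const back = offset - prefixLen;
        if (back > 10) {                /* past dictStart: the corruption case */
            puts("corruption_detected");
            return 1;
        }
        printf("match starts at extDict byte '%c'\n", dictEnd[-(ptrdiff_t)back]); /* '7' */
    }
    return 0;
}
```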
@@ -801,7 +801,7 @@ ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
 /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
  * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
  * bits before reloading. This value is the maximum number of bytes we read
- * after reloading when we are decoding long offets.
+ * after reloading when we are decoding long offsets.
  */
 #define LONG_OFFSETS_MAX_EXTRA_BITS_32                       \
     (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32       \
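For scale: completing the macro past the two lines shown (the `?:` continuation follows the libzstd source) and assuming the usual values of these constants, `ZSTD_WINDOWLOG_MAX_32 == 30` from zstd.h and `STREAM_ACCUMULATOR_MIN_32 == 25` from bitstream.h, it evaluates to 5 extra bits on 32-bit builds:

```c
/* Assumed constants; illustration of how the macro evaluates. */
#define ZSTD_WINDOWLOG_MAX_32       30
#define STREAM_ACCUMULATOR_MIN_32   25
#define LONG_OFFSETS_MAX_EXTRA_BITS_32                       \
    (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32       \
        ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32  \
        : 0)
/* => 30 - 25 = 5 extra bits that may be consumed after a reload
 *    when decoding long offsets on a 32-bit target. */
```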
@@ -911,7 +911,9 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
         seqState_t seqState;
         dctx->fseEntropy = 1;
         { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
-        CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected);
+        RETURN_ERROR_IF(
+            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
+            corruption_detected);
         ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
         ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
         ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);

@@ -927,14 +929,14 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
 
         /* check if reached exact end */
         DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
-        if (nbSeq) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(nbSeq, corruption_detected);
         /* save reps for next block */
         { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
     }
 
     /* last literal segment */
     {   size_t const lastLLSize = litEnd - litPtr;
-        if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall);
+        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall);
         memcpy(op, litPtr, lastLLSize);
         op += lastLLSize;
     }

@@ -1066,7 +1068,9 @@ ZSTD_decompressSequencesLong_body(
         seqState.pos = (size_t)(op-prefixStart);
         seqState.dictEnd = dictEnd;
         assert(iend >= ip);
-        CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected);
+        RETURN_ERROR_IF(
+            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
+            corruption_detected);
         ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
         ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
         ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);

@@ -1076,7 +1080,7 @@ ZSTD_decompressSequencesLong_body(
             sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
             PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
         }
-        if (seqNb<seqAdvance) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected);
 
         /* decode and decompress */
         for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {

@@ -1087,7 +1091,7 @@ ZSTD_decompressSequencesLong_body(
             sequences[seqNb & STORED_SEQS_MASK] = sequence;
             op += oneSeqSize;
         }
-        if (seqNb<nbSeq) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected);
 
         /* finish queue */
         seqNb -= seqAdvance;

@@ -1103,7 +1107,7 @@ ZSTD_decompressSequencesLong_body(
 
     /* last literal segment */
     {   size_t const lastLLSize = litEnd - litPtr;
-        if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall);
+        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall);
         memcpy(op, litPtr, lastLLSize);
         op += lastLLSize;
     }

@@ -1176,7 +1180,7 @@ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
 /* ZSTD_decompressSequencesLong() :
  * decompression function triggered when a minimum share of offsets is considered "long",
  * aka out of cache.
- * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes mearning "farther than memory cache distance".
+ * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance".
  * This function will try to mitigate main memory latency through the use of prefetching */
 static size_t
 ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,

@@ -1240,7 +1244,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
     ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
     DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
 
-    if (srcSize >= ZSTD_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);
+    RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong);
 
     /* Decode literals section */
     {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h

@@ -89,6 +89,12 @@ typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
 typedef enum { zdss_init=0, zdss_loadHeader,
                zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
 
+typedef enum {
+    ZSTD_use_indefinitely = -1,  /* Use the dictionary indefinitely */
+    ZSTD_dont_use = 0,           /* Do not use the dictionary (if one exists free it) */
+    ZSTD_use_once = 1            /* Use the dictionary once and set to ZSTD_dont_use */
+} ZSTD_dictUses_e;
+
 struct ZSTD_DCtx_s
 {
     const ZSTD_seqSymbol* LLTptr;

@@ -123,6 +129,7 @@ struct ZSTD_DCtx_s
     const ZSTD_DDict* ddict;     /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
     U32 dictID;
     int ddictIsCold;             /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
+    ZSTD_dictUses_e dictUses;
 
     /* streaming */
     ZSTD_dStreamStage streamStage;
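The new `ZSTD_dictUses_e` field gives the decompression context a small state machine for dictionary lifetime. Below is a hedged sketch of how such a counter can be consumed when a frame starts; `DCtxSketch`, `clearDict`, and `selectDictForFrame` are illustrative stand-ins, not libzstd API:

```c
#include <stdio.h>

typedef enum {
    ZSTD_use_indefinitely = -1,  /* keep applying the dictionary to every frame */
    ZSTD_dont_use         =  0,  /* do not use it; free any stale one */
    ZSTD_use_once         =  1   /* apply to the next frame only, then drop to dont_use */
} ZSTD_dictUses_e;

typedef struct { ZSTD_dictUses_e dictUses; int hasDict; } DCtxSketch;

static void clearDict(DCtxSketch* dctx) { dctx->hasDict = 0; }

/* Returns 1 if the dictionary should be applied to the frame being started. */
static int selectDictForFrame(DCtxSketch* dctx)
{
    switch (dctx->dictUses) {
    case ZSTD_use_once:
        dctx->dictUses = ZSTD_dont_use;  /* consume the single permitted use */
        return dctx->hasDict;
    case ZSTD_use_indefinitely:
        return dctx->hasDict;
    case ZSTD_dont_use:
    default:
        clearDict(dctx);                 /* free any stale dictionary */
        return 0;
    }
}

int main(void)
{
    DCtxSketch dctx = { ZSTD_use_once, 1 };
    printf("frame 1 uses dict: %d\n", selectDictForFrame(&dctx));  /* 1 */
    printf("frame 2 uses dict: %d\n", selectDictForFrame(&dctx));  /* 0 */
    return 0;
}
```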
data/ext/zstdruby/libzstd/dictBuilder/cover.c

@@ -391,7 +391,7 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
  *
  * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
  *
- * Once the dmer d is in the dictionay we set F(d) = 0.
+ * Once the dmer d is in the dictionary we set F(d) = 0.
  */
 static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                            COVER_map_t *activeDmers, U32 begin,

@@ -435,7 +435,7 @@ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
       U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
       activeSegment.begin += 1;
       *delDmerOcc -= 1;
-      /* If this is the last occurence of the dmer, subtract its score */
+      /* If this is the last occurrence of the dmer, subtract its score */
       if (*delDmerOcc == 0) {
         COVER_map_remove(activeDmers, delDmer);
         activeSegment.score -= freqs[delDmer];
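For intuition about the scoring that `COVER_selectSegment` maintains incrementally: a segment of length k contains k-d+1 dmers, and its score sums `freqs[]` over the distinct dmers it covers, which is why the occurrence map must detect first and last occurrences as the window slides. A toy computation with pre-computed dmer ids:

```c
#include <stdio.h>

int main(void)
{
    unsigned freqs[] = { 0, 5, 9, 9, 2 };  /* F(d) per dmer id */
    unsigned dmers[] = { 2, 3, 2 };        /* dmers of one segment: k-d+1 == 3 */
    unsigned seen[5] = { 0 };
    unsigned score = 0;
    int i;
    for (i = 0; i < 3; i++) {
        if (seen[dmers[i]]++ == 0)         /* count each distinct dmer once */
            score += freqs[dmers[i]];
    }
    printf("Score(S) = %u\n", score);      /* 9 + 9 = 18: dmer 2 counted once */
    return 0;
}
```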
@@ -627,6 +627,39 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
   return 1;
 }
 
+void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
+{
+  const double ratio = (double)nbDmers / maxDictSize;
+  if (ratio >= 10) {
+      return;
+  }
+  LOCALDISPLAYLEVEL(displayLevel, 1,
+                    "WARNING: The maximum dictionary size %u is too large "
+                    "compared to the source size %u! "
+                    "size(source)/size(dictionary) = %f, but it should be >= "
+                    "10! This may lead to a subpar dictionary! We recommend "
+                    "training on sources at least 10x, and up to 100x the "
+                    "size of the dictionary!\n", (U32)maxDictSize,
+                    (U32)nbDmers, ratio);
+}
+
+COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
+                                       U32 nbDmers, U32 k, U32 passes)
+{
+  const U32 minEpochSize = k * 10;
+  COVER_epoch_info_t epochs;
+  epochs.num = MAX(1, maxDictSize / k / passes);
+  epochs.size = nbDmers / epochs.num;
+  if (epochs.size >= minEpochSize) {
+      assert(epochs.size * epochs.num <= nbDmers);
+      return epochs;
+  }
+  epochs.size = MIN(minEpochSize, nbDmers);
+  epochs.num = nbDmers / epochs.size;
+  assert(epochs.size * epochs.num <= nbDmers);
+  return epochs;
+}
+
 /**
  * Given the prepared context build the dictionary.
  */
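A worked run of the new epoch computation with illustrative numbers (a 100 KiB dictionary budget, 4 MiB of dmers, k = 1024, 4 passes), restated as a standalone program:

```c
#include <stdio.h>
#include <assert.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

typedef struct { unsigned num; unsigned size; } COVER_epoch_info_t;

static COVER_epoch_info_t computeEpochs(unsigned maxDictSize, unsigned nbDmers,
                                        unsigned k, unsigned passes)
{
    const unsigned minEpochSize = k * 10;
    COVER_epoch_info_t epochs;
    epochs.num = MAX(1, maxDictSize / k / passes);
    epochs.size = nbDmers / epochs.num;
    if (epochs.size >= minEpochSize) {
        assert(epochs.size * epochs.num <= nbDmers);
        return epochs;
    }
    /* Not enough data per epoch: clamp the size and recompute the count. */
    epochs.size = MIN(minEpochSize, nbDmers);
    epochs.num = nbDmers / epochs.size;
    assert(epochs.size * epochs.num <= nbDmers);
    return epochs;
}

int main(void)
{
    COVER_epoch_info_t const e = computeEpochs(100 * 1024, 4 * 1024 * 1024, 1024, 4);
    /* 102400/1024/4 = 25 epochs, each of 4194304/25 = 167772 dmers */
    printf("num=%u size=%u\n", e.num, e.size);
    return 0;
}
```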
@@ -636,28 +669,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                     ZDICT_cover_params_t parameters) {
   BYTE *const dict = (BYTE *)dictBuffer;
   size_t tail = dictBufferCapacity;
-  /* Divide the data up into epochs of equal size.
-   * We will select at least one segment from each epoch.
-   */
-  const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k / 4));
-  const U32 epochSize = (U32)(ctx->suffixSize / epochs);
+  /* Divide the data into epochs. We will select one segment from each epoch. */
+  const COVER_epoch_info_t epochs = COVER_computeEpochs(
+      (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
+  const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
+  size_t zeroScoreRun = 0;
   size_t epoch;
   DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
-               epochs, epochSize);
+               (U32)epochs.num, (U32)epochs.size);
   /* Loop through the epochs until there are no more segments or the dictionary
    * is full.
    */
-  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
-    const U32 epochBegin = (U32)(epoch * epochSize);
-    const U32 epochEnd = epochBegin + epochSize;
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+    const U32 epochBegin = (U32)(epoch * epochs.size);
+    const U32 epochEnd = epochBegin + epochs.size;
     size_t segmentSize;
     /* Select a segment */
     COVER_segment_t segment = COVER_selectSegment(
         ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
-    /* If the segment covers no dmers, then we are out of content */
+    /* If the segment covers no dmers, then we are out of content.
+     * There may be new content in other epochs, for continue for some time.
+     */
     if (segment.score == 0) {
-      break;
+      if (++zeroScoreRun >= maxZeroScoreRun) {
+        break;
+      }
+      continue;
     }
+    zeroScoreRun = 0;
     /* Trim the segment if necessary and if it is too small then we are done */
     segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
     if (segmentSize < parameters.d) {

@@ -706,6 +745,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
                     parameters.d, parameters.splitPoint)) {
     return ERROR(GENERIC);
   }
+  COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
     DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
     COVER_ctx_destroy(&ctx);

@@ -977,6 +1017,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
   unsigned k;
   COVER_best_t best;
   POOL_ctx *pool = NULL;
+  int warned = 0;
 
   /* Checks */
   if (splitPoint <= 0 || splitPoint > 1) {

@@ -1019,6 +1060,10 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
       POOL_free(pool);
       return ERROR(GENERIC);
     }
+    if (!warned) {
+      COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
+      warned = 1;
+    }
     /* Loop through k reusing the same context */
     for (k = kMinK; k <= kMaxK; k += kStepSize) {
       /* Prepare the arguments */