zstd-ruby 1.3.8.0 → 1.4.0.0
- checksums.yaml +4 -4
- data/.travis.yml +6 -5
- data/README.md +1 -1
- data/ext/zstdruby/libzstd/Makefile +7 -3
- data/ext/zstdruby/libzstd/README.md +4 -2
- data/ext/zstdruby/libzstd/common/compiler.h +1 -1
- data/ext/zstdruby/libzstd/common/fse.h +1 -1
- data/ext/zstdruby/libzstd/common/threading.c +2 -2
- data/ext/zstdruby/libzstd/common/xxhash.c +2 -2
- data/ext/zstdruby/libzstd/common/zstd_internal.h +55 -2
- data/ext/zstdruby/libzstd/compress/fse_compress.c +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +423 -296
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +14 -11
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +203 -124
- data/ext/zstdruby/libzstd/compress/zstd_lazy.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +27 -11
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +41 -49
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +43 -26
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +4 -4
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +257 -164
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +51 -47
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +7 -0
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +58 -13
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +29 -0
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +25 -13
- data/ext/zstdruby/libzstd/dictBuilder/zdict.h +18 -8
- data/ext/zstdruby/libzstd/dll/example/build_package.bat +3 -2
- data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +42 -12
- data/ext/zstdruby/libzstd/legacy/zstd_v01.c +32 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v01.h +12 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v02.c +31 -12
- data/ext/zstdruby/libzstd/legacy/zstd_v02.h +12 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v03.c +32 -12
- data/ext/zstdruby/libzstd/legacy/zstd_v03.h +12 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v04.c +32 -12
- data/ext/zstdruby/libzstd/legacy/zstd_v04.h +12 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v05.c +32 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v05.h +12 -7
- data/ext/zstdruby/libzstd/legacy/zstd_v06.c +36 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v06.h +10 -5
- data/ext/zstdruby/libzstd/legacy/zstd_v07.c +40 -9
- data/ext/zstdruby/libzstd/legacy/zstd_v07.h +10 -5
- data/ext/zstdruby/libzstd/zstd.h +689 -542
- data/lib/zstd-ruby/version.rb +1 -1
- data/zstd-ruby.gemspec +1 -1
- metadata +6 -7
- data/ext/zstdruby/libzstd/dll/libzstd.def +0 -87
data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c
@@ -56,14 +56,15 @@ static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
 size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
                           blockProperties_t* bpPtr)
 {
-    if (srcSize < ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong);
+
     {   U32 const cBlockHeader = MEM_readLE24(src);
         U32 const cSize = cBlockHeader >> 3;
         bpPtr->lastBlock = cBlockHeader & 1;
         bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
         bpPtr->origSize = cSize;   /* only useful for RLE */
         if (bpPtr->blockType == bt_rle) return 1;
-        if (bpPtr->blockType == bt_reserved) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected);
         return cSize;
     }
 }
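Most of the churn in this file comes from upstream zstd 1.4.0 replacing bare "if (cond) return ERROR(err);" checks with RETURN_ERROR_IF() / RETURN_ERROR() macros that can also carry a human-readable message. Below is a minimal sketch of their behavior, assuming zstd's existing ERROR() helper; it is not upstream's exact definition, which additionally logs the failing check in debug builds.

/* Minimal sketch, not upstream's exact definition: zstd 1.4.0's real
 * macros also RAWLOG the failed condition and the optional printf-style
 * message when debug logging is enabled. */
#define RETURN_ERROR_IF(cond, err, ...) \
    do { if (cond) { return ERROR(err); } } while (0)

#define RETURN_ERROR(err, ...) \
    do { return ERROR(err); } while (0)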
@@ -78,7 +79,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
 size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                           const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
 {
-    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+    RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected);
 
     {   const BYTE* const istart = (const BYTE*) src;
         symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
@@ -86,11 +87,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
         switch(litEncType)
         {
         case set_repeat:
-            if (dctx->litEntropy==0) return ERROR(dictionary_corrupted);
+            RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted);
             /* fall-through */
 
         case set_compressed:
-            if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
+            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
             {   size_t lhSize, litSize, litCSize;
                 U32 singleStream=0;
                 U32 const lhlCode = (istart[0] >> 2) & 3;
@@ -118,8 +119,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     litCSize = (lhc >> 22) + (istart[4] << 10);
                     break;
                 }
-                if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected);
-                if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected);
+                RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected);
 
                 /* prefetch huffman table if cold */
                 if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
@@ -157,7 +158,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     }
                 }
 
-                if (HUF_isError(hufSuccess)) return ERROR(corruption_detected);
+                RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected);
 
                 dctx->litPtr = dctx->litBuffer;
                 dctx->litSize = litSize;
@@ -187,7 +188,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                 }
 
                 if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
-                    if (litSize+lhSize > srcSize) return ERROR(corruption_detected);
+                    RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected);
                     memcpy(dctx->litBuffer, istart+lhSize, litSize);
                     dctx->litPtr = dctx->litBuffer;
                     dctx->litSize = litSize;
@@ -216,17 +217,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                 case 3:
                     lhSize = 3;
                     litSize = MEM_readLE24(istart) >> 4;
-                    if (srcSize<4) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
+                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
                     break;
                 }
-                if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected);
+                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected);
                 memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
                 dctx->litPtr = dctx->litBuffer;
                 dctx->litSize = litSize;
                 return lhSize+1;
             }
         default:
-            return ERROR(corruption_detected);   /* impossible */
+            RETURN_ERROR(corruption_detected, "impossible");
         }
     }
 }
@@ -436,8 +437,8 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
     switch(type)
     {
     case set_rle :
-        if (!srcSize) return ERROR(srcSize_wrong);
-        if ((*(const BYTE*)src) > max) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(!srcSize, srcSize_wrong);
+        RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected);
         {   U32 const symbol = *(const BYTE*)src;
             U32 const baseline = baseValue[symbol];
             U32 const nbBits = nbAdditionalBits[symbol];
@@ -449,7 +450,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
         *DTablePtr = defaultTable;
         return 0;
     case set_repeat:
-        if (!flagRepeatTable) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(!flagRepeatTable, corruption_detected);
         /* prefetch FSE table if used */
         if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
             const void* const pStart = *DTablePtr;
@@ -461,15 +462,15 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
         {   unsigned tableLog;
             S16 norm[MaxSeq+1];
             size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
-            if (FSE_isError(headerSize)) return ERROR(corruption_detected);
-            if (tableLog > maxLog) return ERROR(corruption_detected);
+            RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected);
+            RETURN_ERROR_IF(tableLog > maxLog, corruption_detected);
             ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
             *DTablePtr = DTableSpace;
             return headerSize;
         }
-    default :   /* impossible */
+    default :
         assert(0);
-        return ERROR(GENERIC);
+        RETURN_ERROR(GENERIC, "impossible");
     }
 }
 
@@ -483,28 +484,28 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
     DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
 
     /* check */
-    if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);
+    RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong);
 
     /* SeqHead */
     nbSeq = *ip++;
     if (!nbSeq) {
         *nbSeqPtr=0;
-        if (srcSize != 1) return ERROR(srcSize_wrong);
+        RETURN_ERROR_IF(srcSize != 1, srcSize_wrong);
         return 1;
     }
     if (nbSeq > 0x7F) {
         if (nbSeq == 0xFF) {
-            if (ip+2 > iend) return ERROR(srcSize_wrong);
+            RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong);
             nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
         } else {
-            if (ip >= iend) return ERROR(srcSize_wrong);
+            RETURN_ERROR_IF(ip >= iend, srcSize_wrong);
             nbSeq = ((nbSeq-0x80)<<8) + *ip++;
         }
     }
     *nbSeqPtr = nbSeq;
 
     /* FSE table descriptors */
-    if (ip+4 > iend) return ERROR(srcSize_wrong); /* minimum possible size */
+    RETURN_ERROR_IF(ip+4 > iend, srcSize_wrong); /* minimum possible size */
     {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
         symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
         symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
@@ -517,7 +518,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                       LL_base, LL_bits,
                                                       LL_defaultDTable, dctx->fseEntropy,
                                                       dctx->ddictIsCold, nbSeq);
-            if (ZSTD_isError(llhSize)) return ERROR(corruption_detected);
+            RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected);
             ip += llhSize;
         }
 
@@ -527,7 +528,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                       OF_base, OF_bits,
                                                       OF_defaultDTable, dctx->fseEntropy,
                                                       dctx->ddictIsCold, nbSeq);
-            if (ZSTD_isError(ofhSize)) return ERROR(corruption_detected);
+            RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected);
             ip += ofhSize;
         }
 
@@ -537,7 +538,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                       ML_base, ML_bits,
                                                       ML_defaultDTable, dctx->fseEntropy,
                                                       dctx->ddictIsCold, nbSeq);
-            if (ZSTD_isError(mlhSize)) return ERROR(corruption_detected);
+            RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected);
             ip += mlhSize;
         }
     }
@@ -590,8 +591,8 @@ size_t ZSTD_execSequenceLast7(BYTE* op,
     const BYTE* match = oLitEnd - sequence.offset;
 
     /* check */
-    if (oMatchEnd>oend) return ERROR(dstSize_tooSmall);   /* last match must fit within dstBuffer */
-    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* try to read beyond literal buffer */
+    RETURN_ERROR_IF(oMatchEnd>oend, dstSize_tooSmall, "last match must fit within dstBuffer");
+    RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "try to read beyond literal buffer");
 
     /* copy literals */
     while (op < oLitEnd) *op++ = *(*litPtr)++;
@@ -599,7 +600,7 @@ size_t ZSTD_execSequenceLast7(BYTE* op,
     /* copy Match */
     if (sequence.offset > (size_t)(oLitEnd - base)) {
         /* offset beyond prefix */
-        if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - vBase),corruption_detected);
         match = dictEnd - (base-match);
         if (match + sequence.matchLength <= dictEnd) {
             memmove(oLitEnd, match, sequence.matchLength);
@@ -631,8 +632,8 @@ size_t ZSTD_execSequence(BYTE* op,
     const BYTE* match = oLitEnd - sequence.offset;
 
     /* check */
-    if (oMatchEnd>oend) return ERROR(dstSize_tooSmall);   /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
-    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+    RETURN_ERROR_IF(oMatchEnd>oend, dstSize_tooSmall, "last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend");
+    RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "over-read beyond lit buffer");
     if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
 
     /* copy Literals */
@@ -645,8 +646,7 @@ size_t ZSTD_execSequence(BYTE* op,
     /* copy Match */
     if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
         /* offset beyond prefix -> go into extDict */
-        if (sequence.offset > (size_t)(oLitEnd - virtualStart))
-            return ERROR(corruption_detected);
+        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected);
         match = dictEnd + (match - prefixStart);
         if (match + sequence.matchLength <= dictEnd) {
             memmove(oLitEnd, match, sequence.matchLength);
@@ -712,8 +712,8 @@ size_t ZSTD_execSequenceLong(BYTE* op,
     const BYTE* match = sequence.match;
 
     /* check */
-    if (oMatchEnd > oend) return ERROR(dstSize_tooSmall);   /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
-    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+    RETURN_ERROR_IF(oMatchEnd > oend, dstSize_tooSmall, "last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend");
+    RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "over-read beyond lit buffer");
     if (oLitEnd > oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, dictStart, dictEnd);
 
     /* copy Literals */
@@ -726,7 +726,7 @@ size_t ZSTD_execSequenceLong(BYTE* op,
     /* copy Match */
     if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
         /* offset beyond prefix */
-        if (sequence.offset > (size_t)(oLitEnd - dictStart)) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - dictStart), corruption_detected);
         if (match + sequence.matchLength <= dictEnd) {
             memmove(oLitEnd, match, sequence.matchLength);
             return sequenceLength;
@@ -801,7 +801,7 @@ ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
 /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
  * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
  * bits before reloading. This value is the maximum number of bytes we read
- * after reloading when we are decoding long
+ * after reloading when we are decoding long offsets.
  */
 #define LONG_OFFSETS_MAX_EXTRA_BITS_32 \
     (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \
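For concreteness, the constant documented above works out as follows, assuming the values these macros carry in the bundled zstd headers (ZSTD_WINDOWLOG_MAX_32 == 30 in zstd.h, STREAM_ACCUMULATOR_MIN_32 == 25 in zstd_internal.h; stated here from memory of those sources, so treat them as an assumption of this note):

/* Worked instance of the macro above; the two constant values are an
 * assumption of this note, taken from the bundled zstd headers. */
#define ZSTD_WINDOWLOG_MAX_32     30
#define STREAM_ACCUMULATOR_MIN_32 25
/* 30 > 25, so LONG_OFFSETS_MAX_EXTRA_BITS_32 == 30 - 25 == 5 :
 * after a bit-stream reload, decoding a maximal long offset on a
 * 32-bit build may still need up to 5 more offset bits. */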
@@ -911,7 +911,9 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
         seqState_t seqState;
         dctx->fseEntropy = 1;
         { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
-        CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected);
+        RETURN_ERROR_IF(
+            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
+            corruption_detected);
         ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
         ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
         ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
@@ -927,14 +929,14 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
 
         /* check if reached exact end */
         DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
-        if (nbSeq) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(nbSeq, corruption_detected);
         /* save reps for next block */
         { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
     }
 
     /* last literal segment */
     {   size_t const lastLLSize = litEnd - litPtr;
-        if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall);
+        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall);
         memcpy(op, litPtr, lastLLSize);
         op += lastLLSize;
     }
@@ -1066,7 +1068,9 @@ ZSTD_decompressSequencesLong_body(
         seqState.pos = (size_t)(op-prefixStart);
         seqState.dictEnd = dictEnd;
         assert(iend >= ip);
-        CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected);
+        RETURN_ERROR_IF(
+            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
+            corruption_detected);
         ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
         ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
         ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
@@ -1076,7 +1080,7 @@ ZSTD_decompressSequencesLong_body(
             sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
             PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
         }
-        if (seqNb<seqAdvance) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected);
 
         /* decode and decompress */
         for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
@@ -1087,7 +1091,7 @@ ZSTD_decompressSequencesLong_body(
             sequences[seqNb & STORED_SEQS_MASK] = sequence;
             op += oneSeqSize;
         }
-        if (seqNb<nbSeq) return ERROR(corruption_detected);
+        RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected);
 
         /* finish queue */
         seqNb -= seqAdvance;
@@ -1103,7 +1107,7 @@ ZSTD_decompressSequencesLong_body(
 
     /* last literal segment */
     {   size_t const lastLLSize = litEnd - litPtr;
-        if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall);
+        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall);
         memcpy(op, litPtr, lastLLSize);
         op += lastLLSize;
     }
@@ -1176,7 +1180,7 @@ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
 /* ZSTD_decompressSequencesLong() :
  * decompression function triggered when a minimum share of offsets is considered "long",
  * aka out of cache.
- * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes
+ * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance".
  * This function will try to mitigate main memory latency through the use of prefetching */
 static size_t
 ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
@@ -1240,7 +1244,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
     ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
     DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
 
-    if (srcSize >= ZSTD_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);
+    RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong);
 
     /* Decode literals section */
     {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h
@@ -89,6 +89,12 @@ typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
 typedef enum { zdss_init=0, zdss_loadHeader,
                zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
 
+typedef enum {
+    ZSTD_use_indefinitely = -1,  /* Use the dictionary indefinitely */
+    ZSTD_dont_use = 0,           /* Do not use the dictionary (if one exists free it) */
+    ZSTD_use_once = 1            /* Use the dictionary once and set to ZSTD_dont_use */
+} ZSTD_dictUses_e;
+
 struct ZSTD_DCtx_s
 {
     const ZSTD_seqSymbol* LLTptr;
@@ -123,6 +129,7 @@ struct ZSTD_DCtx_s
     const ZSTD_DDict* ddict;     /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
     U32 dictID;
     int ddictIsCold;             /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
+    ZSTD_dictUses_e dictUses;
 
     /* streaming */
     ZSTD_dStreamStage streamStage;
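The new ZSTD_dictUses_e enum plus the dictUses field let a decompression context treat its referenced dictionary as persistent, disabled, or single-shot. Below is a hedged sketch of how such a field is typically consulted when a new frame starts; the helper name and exact flow are illustrative assumptions, not zstd's actual code.

/* Hypothetical helper, for illustration only: picks the dictionary a
 * new frame should use and consumes a single-shot reference. */
static const ZSTD_DDict* DCtx_selectDDict(ZSTD_DCtx* dctx)
{
    switch (dctx->dictUses) {
    case ZSTD_use_indefinitely:
        return dctx->ddict;               /* reuse it frame after frame */
    case ZSTD_use_once:
        dctx->dictUses = ZSTD_dont_use;   /* valid for this frame only */
        return dctx->ddict;
    case ZSTD_dont_use:
    default:
        return NULL;                      /* decompress dictionary-less */
    }
}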
data/ext/zstdruby/libzstd/dictBuilder/cover.c
@@ -391,7 +391,7 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
  *
  *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
  *
- * Once the dmer d is in the
+ * Once the dmer d is in the dictionary we set F(d) = 0.
  */
 static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                            COVER_map_t *activeDmers, U32 begin,
@@ -435,7 +435,7 @@ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
     U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
     activeSegment.begin += 1;
     *delDmerOcc -= 1;
-    /* If this is the last
+    /* If this is the last occurrence of the dmer, subtract its score */
     if (*delDmerOcc == 0) {
       COVER_map_remove(activeDmers, delDmer);
       activeSegment.score -= freqs[delDmer];
@@ -627,6 +627,39 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
   return 1;
 }
 
+void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
+{
+  const double ratio = (double)nbDmers / maxDictSize;
+  if (ratio >= 10) {
+    return;
+  }
+  LOCALDISPLAYLEVEL(displayLevel, 1,
+                    "WARNING: The maximum dictionary size %u is too large "
+                    "compared to the source size %u! "
+                    "size(source)/size(dictionary) = %f, but it should be >= "
+                    "10! This may lead to a subpar dictionary! We recommend "
+                    "training on sources at least 10x, and up to 100x the "
+                    "size of the dictionary!\n", (U32)maxDictSize,
+                    (U32)nbDmers, ratio);
+}
+
+COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
+                                       U32 nbDmers, U32 k, U32 passes)
+{
+  const U32 minEpochSize = k * 10;
+  COVER_epoch_info_t epochs;
+  epochs.num = MAX(1, maxDictSize / k / passes);
+  epochs.size = nbDmers / epochs.num;
+  if (epochs.size >= minEpochSize) {
+    assert(epochs.size * epochs.num <= nbDmers);
+    return epochs;
+  }
+  epochs.size = MIN(minEpochSize, nbDmers);
+  epochs.num = nbDmers / epochs.size;
+  assert(epochs.size * epochs.num <= nbDmers);
+  return epochs;
+}
+
 /**
  * Given the prepared context build the dictionary.
 */
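A worked example of the arithmetic in COVER_computeEpochs() above, with illustrative inputs (the numbers are chosen for this note, not taken from zstd):

#include <assert.h>

/* Mirrors the arithmetic of COVER_computeEpochs() above for one set of
 * illustrative inputs: a 110 KB dictionary budget, one million dmers,
 * k = 1024, passes = 4. */
int main(void)
{
    unsigned const maxDictSize  = 112640;     /* 110 KB */
    unsigned const nbDmers      = 1000000;
    unsigned const k            = 1024;
    unsigned const passes       = 4;
    unsigned const minEpochSize = k * 10;     /* 10240 */

    unsigned const num  = maxDictSize / k / passes;  /* 112640/1024/4 = 27 */
    unsigned const size = nbDmers / num;             /* 1000000/27 = 37037 */

    assert(num == 27 && size == 37037);
    assert(size >= minEpochSize);   /* epochs are big enough: keep this split */

    /* COVER_buildDictionary (next hunk) then derives its give-up bound:
     * maxZeroScoreRun = MAX(10, MIN(100, num >> 3)) = MAX(10, 3) = 10. */
    return 0;
}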
@@ -636,28 +669,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                     ZDICT_cover_params_t parameters) {
   BYTE *const dict = (BYTE *)dictBuffer;
   size_t tail = dictBufferCapacity;
-  /* Divide the data up into epochs of equal size.
-   * We will select at least one segment from each epoch.
-   */
-  const U32 epochs = (U32)(dictBufferCapacity / parameters.k);
-  const U32 epochSize = (U32)(ctx->suffixSize / epochs);
+  /* Divide the data into epochs. We will select one segment from each epoch. */
+  const COVER_epoch_info_t epochs = COVER_computeEpochs(
+      (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
+  const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
+  size_t zeroScoreRun = 0;
   size_t epoch;
   DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
-               epochs, epochSize);
+               (U32)epochs.num, (U32)epochs.size);
   /* Loop through the epochs until there are no more segments or the dictionary
    * is full.
    */
-  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
-    const U32 epochBegin = (U32)(epoch * epochSize);
-    const U32 epochEnd = epochBegin + epochSize;
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+    const U32 epochBegin = (U32)(epoch * epochs.size);
+    const U32 epochEnd = epochBegin + epochs.size;
     size_t segmentSize;
     /* Select a segment */
     COVER_segment_t segment = COVER_selectSegment(
        ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
-    /* If the segment covers no dmers, then we are out of content */
+    /* If the segment covers no dmers, then we are out of content.
+     * There may be new content in other epochs, for continue for some time.
+     */
     if (segment.score == 0) {
-      break;
+      if (++zeroScoreRun >= maxZeroScoreRun) {
+        break;
+      }
+      continue;
     }
+    zeroScoreRun = 0;
    /* Trim the segment if necessary and if it is too small then we are done */
     segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
     if (segmentSize < parameters.d) {
@@ -706,6 +745,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
                  parameters.d, parameters.splitPoint)) {
     return ERROR(GENERIC);
   }
+  COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
     DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
     COVER_ctx_destroy(&ctx);
@@ -977,6 +1017,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
   unsigned k;
   COVER_best_t best;
   POOL_ctx *pool = NULL;
+  int warned = 0;
 
   /* Checks */
   if (splitPoint <= 0 || splitPoint > 1) {
@@ -1019,6 +1060,10 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
       POOL_free(pool);
       return ERROR(GENERIC);
     }
+    if (!warned) {
+      COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
+      warned = 1;
+    }
     /* Loop through k reusing the same context */
     for (k = kMinK; k <= kMaxK; k += kStepSize) {
       /* Prepare the arguments */