zstd-ruby 1.5.5.0 → 1.5.6.6
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +81 -11
- data/ext/zstdruby/common.h +172 -1
- data/ext/zstdruby/extconf.rb +3 -1
- data/ext/zstdruby/libzstd/common/allocations.h +1 -1
- data/ext/zstdruby/libzstd/common/bitstream.h +49 -29
- data/ext/zstdruby/libzstd/common/compiler.h +114 -22
- data/ext/zstdruby/libzstd/common/cpu.h +36 -0
- data/ext/zstdruby/libzstd/common/debug.c +6 -0
- data/ext/zstdruby/libzstd/common/debug.h +20 -11
- data/ext/zstdruby/libzstd/common/error_private.h +45 -36
- data/ext/zstdruby/libzstd/common/fse.h +3 -2
- data/ext/zstdruby/libzstd/common/fse_decompress.c +19 -17
- data/ext/zstdruby/libzstd/common/huf.h +14 -1
- data/ext/zstdruby/libzstd/common/mem.h +0 -9
- data/ext/zstdruby/libzstd/common/pool.c +1 -1
- data/ext/zstdruby/libzstd/common/pool.h +1 -1
- data/ext/zstdruby/libzstd/common/portability_macros.h +2 -0
- data/ext/zstdruby/libzstd/common/threading.c +8 -2
- data/ext/zstdruby/libzstd/common/xxhash.c +5 -11
- data/ext/zstdruby/libzstd/common/xxhash.h +2341 -1007
- data/ext/zstdruby/libzstd/common/zstd_internal.h +5 -5
- data/ext/zstdruby/libzstd/compress/fse_compress.c +8 -7
- data/ext/zstdruby/libzstd/compress/huf_compress.c +54 -25
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +282 -161
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +29 -27
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +224 -113
- data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +19 -13
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +17 -5
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +11 -0
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +14 -6
- data/ext/zstdruby/libzstd/compress/zstd_lazy.c +129 -87
- data/ext/zstdruby/libzstd/compress/zstd_lazy.h +103 -28
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +8 -2
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +216 -112
- data/ext/zstdruby/libzstd/compress/zstd_opt.h +31 -7
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +94 -79
- data/ext/zstdruby/libzstd/decompress/huf_decompress.c +188 -126
- data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +38 -19
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +84 -32
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +231 -208
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +1 -1
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +2 -0
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +16 -12
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +2 -8
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +2 -2
- data/ext/zstdruby/libzstd/dictBuilder/zdict.c +12 -6
- data/ext/zstdruby/libzstd/zstd.h +129 -60
- data/ext/zstdruby/main.c +2 -1
- data/ext/zstdruby/skippable_frame.c +1 -1
- data/ext/zstdruby/streaming_compress.c +75 -23
- data/ext/zstdruby/streaming_decompress.c +41 -40
- data/ext/zstdruby/zstdruby.c +60 -52
- data/lib/zstd-ruby/stream_reader.rb +22 -0
- data/lib/zstd-ruby/stream_writer.rb +23 -0
- data/lib/zstd-ruby/version.rb +1 -1
- data/lib/zstd-ruby.rb +2 -0
- data/renovate.json +6 -0
- data/zstd-ruby.gemspec +2 -1
- metadata +20 -4
- data/ext/zstdruby/streaming_compress.h +0 -5
data/ext/zstdruby/libzstd/decompress/huf_decompress.c

@@ -34,6 +34,12 @@
 * Macros
 ****************************************************************/
 
+#ifdef HUF_DISABLE_FAST_DECODE
+# define HUF_ENABLE_FAST_DECODE 0
+#else
+# define HUF_ENABLE_FAST_DECODE 1
+#endif
+
 /* These two optional macros force the use one way or another of the two
 * Huffman decompression implementations. You can't force in both directions
 * at the same time.
@@ -158,17 +164,18 @@ static size_t HUF_initFastDStream(BYTE const* ip) {
 * op [in/out] - The output pointers, must be updated to reflect what is written.
 * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
 * dt [in] - The decoding table.
- *
+ * ilowest [in] - The beginning of the valid range of the input. Decoders may read
+ *                down to this pointer. It may be below iend[0].
 * oend [in] - The end of the output stream. op[3] must not cross oend.
 * iend [in] - The end of each input stream. ip[i] may cross iend[i],
- *             as long as it is above
+ *             as long as it is above ilowest, but that indicates corruption.
 */
 typedef struct {
     BYTE const* ip[4];
     BYTE* op[4];
     U64 bits[4];
     void const* dt;
-    BYTE const*
+    BYTE const* ilowest;
     BYTE* oend;
     BYTE const* iend[4];
 } HUF_DecompressFastArgs;
@@ -186,9 +193,9 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
     void const* dt = DTable + 1;
     U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
 
-    const BYTE* const
+    const BYTE* const istart = (const BYTE*)src;
 
-    BYTE* const oend = (BYTE*)dst
+    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
 
     /* The fast decoding loop assumes 64-bit little-endian.
      * This condition is false on x32.
@@ -196,6 +203,11 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
     if (!MEM_isLittleEndian() || MEM_32bits())
         return 0;
 
+    /* Avoid nullptr addition */
+    if (dstSize == 0)
+        return 0;
+    assert(dst != NULL);
+
     /* strict minimum : jump table + 1 byte per stream */
     if (srcSize < 10)
         return ERROR(corruption_detected);
@@ -209,7 +221,6 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
 
     /* Read the jump table. */
     {
-        const BYTE* const istart = (const BYTE*)src;
         size_t const length1 = MEM_readLE16(istart);
         size_t const length2 = MEM_readLE16(istart+2);
         size_t const length3 = MEM_readLE16(istart+4);
@@ -221,10 +232,8 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
 
         /* HUF_initFastDStream() requires this, and this small of an input
          * won't benefit from the ASM loop anyways.
-         * length1 must be >= 16 so that ip[0] >= ilimit before the loop
-         * starts.
          */
-        if (length1 <
+        if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8)
             return 0;
         if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
     }
@@ -256,11 +265,12 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
     args->bits[2] = HUF_initFastDStream(args->ip[2]);
     args->bits[3] = HUF_initFastDStream(args->ip[3]);
 
-    /*
-
-
-
-
+    /* The decoders must be sure to never read beyond ilowest.
+     * This is lower than iend[0], but allowing decoders to read
+     * down to ilowest can allow an extra iteration or two in the
+     * fast loop.
+     */
+    args->ilowest = istart;
 
     args->oend = oend;
     args->dt = dt;
@@ -285,13 +295,31 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArg
     assert(sizeof(size_t) == 8);
     bit->bitContainer = MEM_readLEST(args->ip[stream]);
     bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
-    bit->start = (const char*)args->
+    bit->start = (const char*)args->ilowest;
     bit->limitPtr = bit->start + sizeof(size_t);
     bit->ptr = (const char*)args->ip[stream];
 
     return 0;
 }
 
+/* Calls X(N) for each stream 0, 1, 2, 3. */
+#define HUF_4X_FOR_EACH_STREAM(X) \
+    do {                          \
+        X(0);                     \
+        X(1);                     \
+        X(2);                     \
+        X(3);                     \
+    } while (0)
+
+/* Calls X(N, var) for each stream 0, 1, 2, 3. */
+#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \
+    do {                                        \
+        X(0, (var));                            \
+        X(1, (var));                            \
+        X(2, (var));                            \
+        X(3, (var));                            \
+    } while (0)
+
 
 #ifndef HUF_FORCE_DECOMPRESS_X2
 
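
The two helper macros added above do nothing more than apply their argument to streams 0 through 3, wrapped in do/while(0) so each expansion is a single statement. A self-contained illustration of the expansion (only HUF_4X_FOR_EACH_STREAM is taken from the diff; the rest is made up):

#include <stdio.h>

#define HUF_4X_FOR_EACH_STREAM(X) \
    do {                          \
        X(0);                     \
        X(1);                     \
        X(2);                     \
        X(3);                     \
    } while (0)

#define PRINT_STREAM(n) printf("decoding stream %d\n", (n))

int main(void)
{
    /* Expands to four PRINT_STREAM calls packaged as one statement. */
    HUF_4X_FOR_EACH_STREAM(PRINT_STREAM);
    return 0;
}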
@@ -500,15 +528,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog
 }
 
 #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
-    *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
+    do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0)
 
-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)
-
-
+#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)          \
+    do {                                                \
+        if (MEM_64bits() || (HUF_TABLELOG_MAX<=12))     \
+            HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr);     \
+    } while (0)
 
-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr)
-
-
+#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr)          \
+    do {                                                \
+        if (MEM_64bits())                               \
+            HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr);     \
+    } while (0)
 
 HINT_INLINE size_t
 HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
@@ -546,7 +578,7 @@ HUF_decompress1X1_usingDTable_internal_body(
           const HUF_DTable* DTable)
 {
     BYTE* op = (BYTE*)dst;
-    BYTE* const oend = op
+    BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize);
     const void* dtPtr = DTable + 1;
     const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
     BIT_DStream_t bitD;
@@ -574,6 +606,7 @@ HUF_decompress4X1_usingDTable_internal_body(
 {
     /* Check */
     if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
+    if (dstSize < 6) return ERROR(corruption_detected);  /* stream 4-split doesn't work */
 
     {   const BYTE* const istart = (const BYTE*) cSrc;
         BYTE* const ostart = (BYTE*) dst;
@@ -609,7 +642,7 @@ HUF_decompress4X1_usingDTable_internal_body(
 
         if (length4 > cSrcSize) return ERROR(corruption_detected);  /* overflow */
         if (opStart4 > oend) return ERROR(corruption_detected);     /* overflow */
-
+        assert(dstSize >= 6); /* validated above */
         CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
         CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
         CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -692,7 +725,7 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
     BYTE* op[4];
     U16 const* const dtable = (U16 const*)args->dt;
     BYTE* const oend = args->oend;
-    BYTE const* const
+    BYTE const* const ilowest = args->ilowest;
 
     /* Copy the arguments to local variables */
     ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
@@ -705,13 +738,12 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
     for (;;) {
         BYTE* olimit;
         int stream;
-        int symbol;
 
         /* Assert loop preconditions */
 #ifndef NDEBUG
         for (stream = 0; stream < 4; ++stream) {
             assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
-            assert(ip[stream] >=
+            assert(ip[stream] >= ilowest);
         }
 #endif
         /* Compute olimit */
@@ -721,7 +753,7 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
             /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
              * per stream.
              */
-            size_t const iiters = (size_t)(ip[0] -
+            size_t const iiters = (size_t)(ip[0] - ilowest) / 7;
             /* We can safely run iters iterations before running bounds checks */
             size_t const iters = MIN(oiters, iiters);
             size_t const symbols = iters * 5;
@@ -732,8 +764,8 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
              */
             olimit = op[3] + symbols;
 
-            /* Exit fast decoding loop once we
-            if (op[3]
+            /* Exit fast decoding loop once we reach the end. */
+            if (op[3] == olimit)
                 break;
 
             /* Exit the decoding loop if any input pointer has crossed the
@@ -752,27 +784,42 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
         }
 #endif
 
+#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol)                 \
+    do {                                                        \
+        int const index = (int)(bits[(_stream)] >> 53);         \
+        int const entry = (int)dtable[index];                   \
+        bits[(_stream)] <<= (entry & 0x3F);                     \
+        op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
+    } while (0)
+
+#define HUF_4X1_RELOAD_STREAM(_stream)                              \
+    do {                                                            \
+        int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
+        int const nbBits = ctz & 7;                                 \
+        int const nbBytes = ctz >> 3;                               \
+        op[(_stream)] += 5;                                         \
+        ip[(_stream)] -= nbBytes;                                   \
+        bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;            \
+        bits[(_stream)] <<= nbBits;                                 \
+    } while (0)
+
+        /* Manually unroll the loop because compilers don't consistently
+         * unroll the inner loops, which destroys performance.
+         */
         do {
             /* Decode 5 symbols in each of the 4 streams */
-
-
-
-
-
-
-
-
-            /* Reload the bitstreams */
-            for (stream = 0; stream < 4; ++stream) {
-                int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
-                int const nbBits = ctz & 7;
-                int const nbBytes = ctz >> 3;
-                op[stream] += 5;
-                ip[stream] -= nbBytes;
-                bits[stream] = MEM_read64(ip[stream]) | 1;
-                bits[stream] <<= nbBits;
-            }
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
+
+            /* Reload each of the 4 the bitstreams */
+            HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
         } while (op[3] < olimit);
+
+#undef HUF_4X1_DECODE_SYMBOL
+#undef HUF_4X1_RELOAD_STREAM
 }
 
 _out:
@@ -797,8 +844,8 @@ HUF_decompress4X1_usingDTable_internal_fast(
            HUF_DecompressFastLoopFn loopFn)
 {
     void const* dt = DTable + 1;
-    const
-    BYTE* const oend = (BYTE*)dst
+    BYTE const* const ilowest = (BYTE const*)cSrc;
+    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
     HUF_DecompressFastArgs args;
     {   size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
         FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
@@ -806,18 +853,22 @@ HUF_decompress4X1_usingDTable_internal_fast(
             return 0;
     }
 
-    assert(args.ip[0] >= args.
+    assert(args.ip[0] >= args.ilowest);
     loopFn(&args);
 
-    /* Our loop guarantees that ip[] >=
+    /* Our loop guarantees that ip[] >= ilowest and that we haven't
      * overwritten any op[].
      */
-    assert(args.ip[0] >=
-    assert(args.ip[
-    assert(args.ip[
-    assert(args.ip[
+    assert(args.ip[0] >= ilowest);
+    assert(args.ip[0] >= ilowest);
+    assert(args.ip[1] >= ilowest);
+    assert(args.ip[2] >= ilowest);
+    assert(args.ip[3] >= ilowest);
     assert(args.op[3] <= oend);
-
+
+    assert(ilowest == args.ilowest);
+    assert(ilowest + 6 == args.iend[0]);
+    (void)ilowest;
 
     /* finish bit streams one by one. */
     {   size_t const segmentSize = (dstSize+3) / 4;
@@ -868,7 +919,7 @@ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize,
     }
 #endif
 
-    if (!(flags & HUF_flags_disableFast)) {
+    if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
         size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
         if (ret != 0)
             return ret;
@@ -1239,15 +1290,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c
 }
 
 #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
-    ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+    do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0)
 
-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr)
-
-
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr)                      \
+    do {                                                            \
+        if (MEM_64bits() || (HUF_TABLELOG_MAX<=12))                 \
+            ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog);  \
+    } while (0)
 
-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr)
-
-
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr)                      \
+    do {                                                            \
+        if (MEM_64bits())                                           \
+            ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog);  \
+    } while (0)
 
 HINT_INLINE size_t
 HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
@@ -1307,7 +1362,7 @@ HUF_decompress1X2_usingDTable_internal_body(
 
     /* decode */
     {   BYTE* const ostart = (BYTE*) dst;
-        BYTE* const oend = ostart
+        BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize);
         const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
         const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
         DTableDesc const dtd = HUF_getDTableDesc(DTable);
@@ -1332,6 +1387,7 @@ HUF_decompress4X2_usingDTable_internal_body(
           const HUF_DTable* DTable)
 {
     if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+    if (dstSize < 6) return ERROR(corruption_detected);     /* stream 4-split doesn't work */
 
     {   const BYTE* const istart = (const BYTE*) cSrc;
         BYTE* const ostart = (BYTE*) dst;
@@ -1367,7 +1423,7 @@ HUF_decompress4X2_usingDTable_internal_body(
 
         if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
         if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
-
+        assert(dstSize >= 6 /* validated above */);
         CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
         CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
         CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -1472,7 +1528,7 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
     BYTE* op[4];
     BYTE* oend[4];
     HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
-    BYTE const* const
+    BYTE const* const ilowest = args->ilowest;
 
     /* Copy the arguments to local registers. */
     ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
@@ -1490,13 +1546,12 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
     for (;;) {
         BYTE* olimit;
         int stream;
-        int symbol;
 
         /* Assert loop preconditions */
 #ifndef NDEBUG
         for (stream = 0; stream < 4; ++stream) {
             assert(op[stream] <= oend[stream]);
-            assert(ip[stream] >=
+            assert(ip[stream] >= ilowest);
         }
 #endif
         /* Compute olimit */
@@ -1509,7 +1564,7 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
              * We also know that each input pointer is >= ip[0]. So we can run
              * iters loops before running out of input.
              */
-            size_t iters = (size_t)(ip[0] -
+            size_t iters = (size_t)(ip[0] - ilowest) / 7;
             /* Each iteration can produce up to 10 bytes of output per stream.
              * Each output stream my advance at different rates. So take the
              * minimum number of safe iterations among all the output streams.
@@ -1527,8 +1582,8 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
              */
             olimit = op[3] + (iters * 5);
 
-            /* Exit the fast decoding loop
-            if (op[3]
+            /* Exit the fast decoding loop once we reach the end. */
+            if (op[3] == olimit)
                 break;
 
             /* Exit the decoding loop if any input pointer has crossed the
@@ -1547,54 +1602,58 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
         }
 #endif
 
+#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3)            \
+    do {                                                    \
+        if ((_decode3) || (_stream) != 3) {                 \
+            int const index = (int)(bits[(_stream)] >> 53); \
+            HUF_DEltX2 const entry = dtable[index];         \
+            MEM_write16(op[(_stream)], entry.sequence);     \
+            bits[(_stream)] <<= (entry.nbBits) & 0x3F;      \
+            op[(_stream)] += (entry.length);                \
+        }                                                   \
+    } while (0)
+
+#define HUF_4X2_RELOAD_STREAM(_stream)                                  \
+    do {                                                                \
+        HUF_4X2_DECODE_SYMBOL(3, 1);                                    \
+        {                                                               \
+            int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
+            int const nbBits = ctz & 7;                                 \
+            int const nbBytes = ctz >> 3;                               \
+            ip[(_stream)] -= nbBytes;                                   \
+            bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;            \
+            bits[(_stream)] <<= nbBits;                                 \
+        }                                                               \
+    } while (0)
+
+        /* Manually unroll the loop because compilers don't consistently
+         * unroll the inner loops, which destroys performance.
+         */
         do {
-            /*
-
-
-
-
-
-
-
-
-
-            /*
-
-
-
-
-
-
-
-            /* Do 4 table lookups from the final stream & reload bitstreams */
-            for (stream = 0; stream < 4; ++stream) {
-                /* Do a table lookup from the final stream.
-                 * This is interleaved with the reloading to reduce register
-                 * pressure. This shouldn't be necessary, but compilers can
-                 * struggle with codegen with high register pressure.
-                 */
-                {
-                    int const index = (int)(bits[3] >> 53);
-                    HUF_DEltX2 const entry = dtable[index];
-                    MEM_write16(op[3], entry.sequence);
-                    bits[3] <<= (entry.nbBits);
-                    op[3] += (entry.length);
-                }
-                /* Reload the bistreams. The final bitstream must be reloaded
-                 * after the 5th symbol was decoded.
-                 */
-                {
-                    int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
-                    int const nbBits = ctz & 7;
-                    int const nbBytes = ctz >> 3;
-                    ip[stream] -= nbBytes;
-                    bits[stream] = MEM_read64(ip[stream]) | 1;
-                    bits[stream] <<= nbBits;
-                }
-            }
+            /* Decode 5 symbols from each of the first 3 streams.
+             * The final stream will be decoded during the reload phase
+             * to reduce register pressure.
+             */
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+
+            /* Decode one symbol from the final stream */
+            HUF_4X2_DECODE_SYMBOL(3, 1);
+
+            /* Decode 4 symbols from the final stream & reload bitstreams.
+             * The final stream is reloaded last, meaning that all 5 symbols
+             * are decoded from the final stream before it is reloaded.
+             */
+            HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
         } while (op[3] < olimit);
     }
 
+#undef HUF_4X2_DECODE_SYMBOL
++#undef HUF_4X2_RELOAD_STREAM
+
 _out:
 
     /* Save the final values of each of the state variables back to args. */
@@ -1611,8 +1670,8 @@ HUF_decompress4X2_usingDTable_internal_fast(
                     const HUF_DTable* DTable,
                     HUF_DecompressFastLoopFn loopFn) {
     void const* dt = DTable + 1;
-    const BYTE* const
-    BYTE* const oend = (BYTE*)dst
+    const BYTE* const ilowest = (const BYTE*)cSrc;
+    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
     HUF_DecompressFastArgs args;
     {
         size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
@@ -1621,16 +1680,19 @@ HUF_decompress4X2_usingDTable_internal_fast(
             return 0;
     }
 
-    assert(args.ip[0] >= args.
+    assert(args.ip[0] >= args.ilowest);
     loopFn(&args);
 
     /* note : op4 already verified within main loop */
-    assert(args.ip[0] >=
-    assert(args.ip[1] >=
-    assert(args.ip[2] >=
-    assert(args.ip[3] >=
+    assert(args.ip[0] >= ilowest);
+    assert(args.ip[1] >= ilowest);
+    assert(args.ip[2] >= ilowest);
+    assert(args.ip[3] >= ilowest);
     assert(args.op[3] <= oend);
-
+
+    assert(ilowest == args.ilowest);
+    assert(ilowest + 6 == args.iend[0]);
+    (void)ilowest;
 
     /* finish bitStreams one by one */
     {
@@ -1679,7 +1741,7 @@ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize,
     }
 #endif
 
-    if (!(flags & HUF_flags_disableFast)) {
+    if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
         size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
         if (ret != 0)
             return ret;