zstdlib 0.8.0-x86-mingw32 → 0.9.0-x86-mingw32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGES.md +10 -0
- data/README.md +7 -1
- data/Rakefile +38 -8
- data/ext/{zstdlib → zstdlib_c}/extconf.rb +10 -5
- data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.2/zstdlib.c +2 -2
- data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.3/zstdlib.c +2 -2
- data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.4/zstdlib.c +2 -2
- data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.5/zstdlib.c +2 -2
- data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.6/zstdlib.c +2 -2
- data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.7/zstdlib.c +2 -2
- data/ext/{zstdlib → zstdlib_c}/ruby/zlib-3.0/zstdlib.c +2 -2
- data/ext/zstdlib_c/ruby/zlib-3.1/zstdlib.c +5076 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/adler32.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/compress.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/crc32.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/crc32.h +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/deflate.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/deflate.h +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzclose.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzguts.h +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzlib.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzread.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzwrite.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/infback.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inffast.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inffast.h +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inffixed.h +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inflate.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inflate.h +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inftrees.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inftrees.h +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/trees.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/trees.h +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/uncompr.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/zconf.h +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/zlib.h +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/zutil.c +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/zutil.h +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlib.mk +0 -0
- data/ext/{zstdlib → zstdlib_c}/zlibwrapper/zlibwrapper.c +1 -5
- data/ext/{zstdlib → zstdlib_c}/zlibwrapper.mk +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/bitstream.h +24 -9
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/compiler.h +89 -43
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/cpu.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/debug.c +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/debug.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/entropy_common.c +11 -5
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/error_private.c +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/error_private.h +79 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/fse.h +2 -1
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/fse_decompress.c +1 -1
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/huf.h +24 -22
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/mem.h +18 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/pool.c +11 -6
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/pool.h +2 -2
- data/ext/zstdlib_c/zstd-1.5.2/lib/common/portability_macros.h +137 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/threading.c +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/threading.h +0 -0
- data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.c +24 -0
- data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.h +5686 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_common.c +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_deps.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_internal.h +95 -92
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_trace.h +12 -3
- data/ext/zstdlib_c/zstd-1.5.2/lib/compress/clevels.h +134 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/fse_compress.c +63 -27
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/hist.c +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/hist.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/huf_compress.c +537 -104
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress.c +307 -373
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_internal.h +174 -83
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_literals.c +4 -3
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_literals.h +3 -1
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_sequences.c +15 -14
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_sequences.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_superblock.c +4 -3
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_superblock.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_cwksp.h +41 -27
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_double_fast.c +295 -120
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_double_fast.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_fast.c +309 -130
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_fast.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_lazy.c +482 -562
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_lazy.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_ldm.c +9 -7
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_ldm.h +1 -1
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_ldm_geartab.h +4 -1
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_opt.c +249 -148
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_opt.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstdmt_compress.c +76 -38
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstdmt_compress.h +4 -1
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/huf_decompress.c +727 -189
- data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/huf_decompress_amd64.S +585 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_ddict.c +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_ddict.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress.c +85 -22
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_block.c +744 -220
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_block.h +8 -2
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_internal.h +34 -3
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/zdict.h +4 -4
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/zstd.h +179 -136
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/zstd_errors.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzclose.c +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzcompatibility.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzguts.h +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzlib.c +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzread.c +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzwrite.c +0 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/zstd_zlibwrapper.c +7 -0
- data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/zstd_zlibwrapper.h +0 -0
- data/ext/zstdlib_c/zstd.mk +15 -0
- data/lib/2.4/zstdlib_c.so +0 -0
- data/lib/2.5/zstdlib_c.so +0 -0
- data/lib/2.6/zstdlib_c.so +0 -0
- data/lib/2.7/zstdlib_c.so +0 -0
- data/lib/3.0/zstdlib_c.so +0 -0
- data/lib/3.1/zstdlib_c.so +0 -0
- data/lib/zstdlib.rb +2 -2
- metadata +125 -121
- data/ext/zstdlib/zstd-1.5.0/lib/common/xxhash.c +0 -824
- data/ext/zstdlib/zstd-1.5.0/lib/common/xxhash.h +0 -285
- data/ext/zstdlib/zstd.mk +0 -14
- data/lib/2.2/zstdlib.so +0 -0
- data/lib/2.3/zstdlib.so +0 -0
- data/lib/2.4/zstdlib.so +0 -0
- data/lib/2.5/zstdlib.so +0 -0
- data/lib/2.6/zstdlib.so +0 -0
- data/lib/2.7/zstdlib.so +0 -0
@@ -22,6 +22,13 @@
|
|
22
22
|
#define HUF_STATIC_LINKING_ONLY
|
23
23
|
#include "../common/huf.h"
|
24
24
|
#include "../common/error_private.h"
|
25
|
+
#include "../common/zstd_internal.h"
|
26
|
+
|
27
|
+
/* **************************************************************
|
28
|
+
* Constants
|
29
|
+
****************************************************************/
|
30
|
+
|
31
|
+
#define HUF_DECODER_FAST_TABLELOG 11
|
25
32
|
|
26
33
|
/* **************************************************************
|
27
34
|
* Macros
|
@@ -36,6 +43,30 @@
|
|
36
43
|
#error "Cannot force the use of the X1 and X2 decoders at the same time!"
|
37
44
|
#endif
|
38
45
|
|
46
|
+
#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2
|
47
|
+
# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
|
48
|
+
#else
|
49
|
+
# define HUF_ASM_X86_64_BMI2_ATTRS
|
50
|
+
#endif
|
51
|
+
|
52
|
+
#ifdef __cplusplus
|
53
|
+
# define HUF_EXTERN_C extern "C"
|
54
|
+
#else
|
55
|
+
# define HUF_EXTERN_C
|
56
|
+
#endif
|
57
|
+
#define HUF_ASM_DECL HUF_EXTERN_C
|
58
|
+
|
59
|
+
#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
|
60
|
+
# define HUF_NEED_BMI2_FUNCTION 1
|
61
|
+
#else
|
62
|
+
# define HUF_NEED_BMI2_FUNCTION 0
|
63
|
+
#endif
|
64
|
+
|
65
|
+
#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
|
66
|
+
# define HUF_NEED_DEFAULT_FUNCTION 1
|
67
|
+
#else
|
68
|
+
# define HUF_NEED_DEFAULT_FUNCTION 0
|
69
|
+
#endif
|
39
70
|
|
40
71
|
/* **************************************************************
|
41
72
|
* Error Management
|
@@ -65,7 +96,7 @@
|
|
65
96
|
return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
|
66
97
|
} \
|
67
98
|
\
|
68
|
-
static
|
99
|
+
static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \
|
69
100
|
void* dst, size_t dstSize, \
|
70
101
|
const void* cSrc, size_t cSrcSize, \
|
71
102
|
const HUF_DTable* DTable) \
|
@@ -107,13 +138,147 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
|
|
107
138
|
return dtd;
|
108
139
|
}
|
109
140
|
|
141
|
+
#if ZSTD_ENABLE_ASM_X86_64_BMI2
|
142
|
+
|
143
|
+
static size_t HUF_initDStream(BYTE const* ip) {
|
144
|
+
BYTE const lastByte = ip[7];
|
145
|
+
size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
|
146
|
+
size_t const value = MEM_readLEST(ip) | 1;
|
147
|
+
assert(bitsConsumed <= 8);
|
148
|
+
return value << bitsConsumed;
|
149
|
+
}
|
150
|
+
typedef struct {
|
151
|
+
BYTE const* ip[4];
|
152
|
+
BYTE* op[4];
|
153
|
+
U64 bits[4];
|
154
|
+
void const* dt;
|
155
|
+
BYTE const* ilimit;
|
156
|
+
BYTE* oend;
|
157
|
+
BYTE const* iend[4];
|
158
|
+
} HUF_DecompressAsmArgs;
|
159
|
+
|
160
|
+
/**
|
161
|
+
* Initializes args for the asm decoding loop.
|
162
|
+
* @returns 0 on success
|
163
|
+
* 1 if the fallback implementation should be used.
|
164
|
+
* Or an error code on failure.
|
165
|
+
*/
|
166
|
+
static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
|
167
|
+
{
|
168
|
+
void const* dt = DTable + 1;
|
169
|
+
U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
|
170
|
+
|
171
|
+
const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
|
172
|
+
|
173
|
+
BYTE* const oend = (BYTE*)dst + dstSize;
|
174
|
+
|
175
|
+
/* The following condition is false on x32 platform,
|
176
|
+
* but HUF_asm is not compatible with this ABI */
|
177
|
+
if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1;
|
178
|
+
|
179
|
+
/* strict minimum : jump table + 1 byte per stream */
|
180
|
+
if (srcSize < 10)
|
181
|
+
return ERROR(corruption_detected);
|
182
|
+
|
183
|
+
/* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers.
|
184
|
+
* If table log is not correct at this point, fallback to the old decoder.
|
185
|
+
* On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
|
186
|
+
*/
|
187
|
+
if (dtLog != HUF_DECODER_FAST_TABLELOG)
|
188
|
+
return 1;
|
189
|
+
|
190
|
+
/* Read the jump table. */
|
191
|
+
{
|
192
|
+
const BYTE* const istart = (const BYTE*)src;
|
193
|
+
size_t const length1 = MEM_readLE16(istart);
|
194
|
+
size_t const length2 = MEM_readLE16(istart+2);
|
195
|
+
size_t const length3 = MEM_readLE16(istart+4);
|
196
|
+
size_t const length4 = srcSize - (length1 + length2 + length3 + 6);
|
197
|
+
args->iend[0] = istart + 6; /* jumpTable */
|
198
|
+
args->iend[1] = args->iend[0] + length1;
|
199
|
+
args->iend[2] = args->iend[1] + length2;
|
200
|
+
args->iend[3] = args->iend[2] + length3;
|
201
|
+
|
202
|
+
/* HUF_initDStream() requires this, and this small of an input
|
203
|
+
* won't benefit from the ASM loop anyways.
|
204
|
+
* length1 must be >= 16 so that ip[0] >= ilimit before the loop
|
205
|
+
* starts.
|
206
|
+
*/
|
207
|
+
if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
|
208
|
+
return 1;
|
209
|
+
if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
|
210
|
+
}
|
211
|
+
/* ip[] contains the position that is currently loaded into bits[]. */
|
212
|
+
args->ip[0] = args->iend[1] - sizeof(U64);
|
213
|
+
args->ip[1] = args->iend[2] - sizeof(U64);
|
214
|
+
args->ip[2] = args->iend[3] - sizeof(U64);
|
215
|
+
args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64);
|
216
|
+
|
217
|
+
/* op[] contains the output pointers. */
|
218
|
+
args->op[0] = (BYTE*)dst;
|
219
|
+
args->op[1] = args->op[0] + (dstSize+3)/4;
|
220
|
+
args->op[2] = args->op[1] + (dstSize+3)/4;
|
221
|
+
args->op[3] = args->op[2] + (dstSize+3)/4;
|
222
|
+
|
223
|
+
/* No point in calling the ASM loop for tiny outputs. */
|
224
|
+
if (args->op[3] >= oend)
|
225
|
+
return 1;
|
226
|
+
|
227
|
+
/* bits[] is the bit container.
|
228
|
+
* It is read from the MSB down to the LSB.
|
229
|
+
* It is shifted left as it is read, and zeros are
|
230
|
+
* shifted in. After the lowest valid bit a 1 is
|
231
|
+
* set, so that CountTrailingZeros(bits[]) can be used
|
232
|
+
* to count how many bits we've consumed.
|
233
|
+
*/
|
234
|
+
args->bits[0] = HUF_initDStream(args->ip[0]);
|
235
|
+
args->bits[1] = HUF_initDStream(args->ip[1]);
|
236
|
+
args->bits[2] = HUF_initDStream(args->ip[2]);
|
237
|
+
args->bits[3] = HUF_initDStream(args->ip[3]);
|
238
|
+
|
239
|
+
/* If ip[] >= ilimit, it is guaranteed to be safe to
|
240
|
+
* reload bits[]. It may be beyond its section, but is
|
241
|
+
* guaranteed to be valid (>= istart).
|
242
|
+
*/
|
243
|
+
args->ilimit = ilimit;
|
244
|
+
|
245
|
+
args->oend = oend;
|
246
|
+
args->dt = dt;
|
247
|
+
|
248
|
+
return 0;
|
249
|
+
}
|
250
|
+
|
251
|
+
static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd)
|
252
|
+
{
|
253
|
+
/* Validate that we haven't written past the end of this stream's output segment. */
|
254
|
+
if (args->op[stream] > segmentEnd)
|
255
|
+
return ERROR(corruption_detected);
|
256
|
+
/* Validate that we haven't read beyond iend[].
|
257
|
+
* Note that ip[] may be < iend[] because the MSB is
|
258
|
+
* the next bit to read, and we may have consumed 100%
|
259
|
+
* of the stream, so down to iend[i] - 8 is valid.
|
260
|
+
*/
|
261
|
+
if (args->ip[stream] < args->iend[stream] - 8)
|
262
|
+
return ERROR(corruption_detected);
|
263
|
+
|
264
|
+
/* Construct the BIT_DStream_t. */
|
265
|
+
bit->bitContainer = MEM_readLE64(args->ip[stream]);
|
266
|
+
bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
|
267
|
+
bit->start = (const char*)args->iend[0];
|
268
|
+
bit->limitPtr = bit->start + sizeof(size_t);
|
269
|
+
bit->ptr = (const char*)args->ip[stream];
|
270
|
+
|
271
|
+
return 0;
|
272
|
+
}
|
273
|
+
#endif
|
274
|
+
|
110
275
|
|
111
276
|
#ifndef HUF_FORCE_DECOMPRESS_X2
|
112
277
|
|
113
278
|
/*-***************************/
|
114
279
|
/* single-symbol decoding */
|
115
280
|
/*-***************************/
|
116
|
-
typedef struct { BYTE
|
281
|
+
typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */
|
117
282
|
|
118
283
|
/**
|
119
284
|
* Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at
|
@@ -122,14 +287,44 @@ typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decodi
|
|
122
287
|
static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
|
123
288
|
U64 D4;
|
124
289
|
if (MEM_isLittleEndian()) {
|
125
|
-
D4 = symbol + (nbBits << 8);
|
126
|
-
} else {
|
127
290
|
D4 = (symbol << 8) + nbBits;
|
291
|
+
} else {
|
292
|
+
D4 = symbol + (nbBits << 8);
|
128
293
|
}
|
129
294
|
D4 *= 0x0001000100010001ULL;
|
130
295
|
return D4;
|
131
296
|
}
|
132
297
|
|
298
|
+
/**
|
299
|
+
* Increases the tableLog to targetTableLog and rescales the stats.
|
300
|
+
* If tableLog >= targetTableLog this is a no-op.
|
301
|
+
* @returns New tableLog
|
302
|
+
*/
|
303
|
+
static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog)
|
304
|
+
{
|
305
|
+
if (tableLog > targetTableLog)
|
306
|
+
return tableLog;
|
307
|
+
if (tableLog < targetTableLog) {
|
308
|
+
U32 const scale = targetTableLog - tableLog;
|
309
|
+
U32 s;
|
310
|
+
/* Increase the weight for all non-zero probability symbols by scale. */
|
311
|
+
for (s = 0; s < nbSymbols; ++s) {
|
312
|
+
huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale);
|
313
|
+
}
|
314
|
+
/* Update rankVal to reflect the new weights.
|
315
|
+
* All weights except 0 get moved to weight + scale.
|
316
|
+
* Weights [1, scale] are empty.
|
317
|
+
*/
|
318
|
+
for (s = targetTableLog; s > scale; --s) {
|
319
|
+
rankVal[s] = rankVal[s - scale];
|
320
|
+
}
|
321
|
+
for (s = scale; s > 0; --s) {
|
322
|
+
rankVal[s] = 0;
|
323
|
+
}
|
324
|
+
}
|
325
|
+
return targetTableLog;
|
326
|
+
}
|
327
|
+
|
133
328
|
typedef struct {
|
134
329
|
U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];
|
135
330
|
U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1];
|
@@ -162,8 +357,12 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
|
|
162
357
|
iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
|
163
358
|
if (HUF_isError(iSize)) return iSize;
|
164
359
|
|
360
|
+
|
165
361
|
/* Table header */
|
166
362
|
{ DTableDesc dtd = HUF_getDTableDesc(DTable);
|
363
|
+
U32 const maxTableLog = dtd.maxTableLog + 1;
|
364
|
+
U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG);
|
365
|
+
tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog);
|
167
366
|
if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */
|
168
367
|
dtd.tableType = 0;
|
169
368
|
dtd.tableLog = (BYTE)tableLog;
|
@@ -207,7 +406,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
|
|
207
406
|
|
208
407
|
/* fill DTable
|
209
408
|
* We fill all entries of each weight in order.
|
210
|
-
* That way length is a constant for each iteration of the
|
409
|
+
* That way length is a constant for each iteration of the outer loop.
|
211
410
|
* We can switch based on the length to a different inner loop which is
|
212
411
|
* optimized for that particular case.
|
213
412
|
*/
|
@@ -304,11 +503,15 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
|
|
304
503
|
BYTE* const pStart = p;
|
305
504
|
|
306
505
|
/* up to 4 symbols at a time */
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
506
|
+
if ((pEnd - p) > 3) {
|
507
|
+
while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
|
508
|
+
HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
|
509
|
+
HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
|
510
|
+
HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
|
511
|
+
HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
|
512
|
+
}
|
513
|
+
} else {
|
514
|
+
BIT_reloadDStream(bitDPtr);
|
312
515
|
}
|
313
516
|
|
314
517
|
/* [0-3] symbols remaining */
|
@@ -388,33 +591,36 @@ HUF_decompress4X1_usingDTable_internal_body(
|
|
388
591
|
U32 endSignal = 1;
|
389
592
|
|
390
593
|
if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
|
594
|
+
if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
|
391
595
|
CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
|
392
596
|
CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
|
393
597
|
CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
|
394
598
|
CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
|
395
599
|
|
396
600
|
/* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
601
|
+
if ((size_t)(oend - op4) >= sizeof(size_t)) {
|
602
|
+
for ( ; (endSignal) & (op4 < olimit) ; ) {
|
603
|
+
HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
|
604
|
+
HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
|
605
|
+
HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
|
606
|
+
HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
|
607
|
+
HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
|
608
|
+
HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
|
609
|
+
HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
|
610
|
+
HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
|
611
|
+
HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
|
612
|
+
HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
|
613
|
+
HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
|
614
|
+
HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
|
615
|
+
HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
|
616
|
+
HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
|
617
|
+
HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
|
618
|
+
HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
|
619
|
+
endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
|
620
|
+
endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
|
621
|
+
endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
|
622
|
+
endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
|
623
|
+
}
|
418
624
|
}
|
419
625
|
|
420
626
|
/* check corruption */
|
@@ -440,6 +646,79 @@ HUF_decompress4X1_usingDTable_internal_body(
|
|
440
646
|
}
|
441
647
|
}
|
442
648
|
|
649
|
+
#if HUF_NEED_BMI2_FUNCTION
|
650
|
+
static BMI2_TARGET_ATTRIBUTE
|
651
|
+
size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
|
652
|
+
size_t cSrcSize, HUF_DTable const* DTable) {
|
653
|
+
return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
|
654
|
+
}
|
655
|
+
#endif
|
656
|
+
|
657
|
+
#if HUF_NEED_DEFAULT_FUNCTION
|
658
|
+
static
|
659
|
+
size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
|
660
|
+
size_t cSrcSize, HUF_DTable const* DTable) {
|
661
|
+
return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
|
662
|
+
}
|
663
|
+
#endif
|
664
|
+
|
665
|
+
#if ZSTD_ENABLE_ASM_X86_64_BMI2
|
666
|
+
|
667
|
+
HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
|
668
|
+
|
669
|
+
static HUF_ASM_X86_64_BMI2_ATTRS
|
670
|
+
size_t
|
671
|
+
HUF_decompress4X1_usingDTable_internal_bmi2_asm(
|
672
|
+
void* dst, size_t dstSize,
|
673
|
+
const void* cSrc, size_t cSrcSize,
|
674
|
+
const HUF_DTable* DTable)
|
675
|
+
{
|
676
|
+
void const* dt = DTable + 1;
|
677
|
+
const BYTE* const iend = (const BYTE*)cSrc + 6;
|
678
|
+
BYTE* const oend = (BYTE*)dst + dstSize;
|
679
|
+
HUF_DecompressAsmArgs args;
|
680
|
+
{
|
681
|
+
size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
|
682
|
+
FORWARD_IF_ERROR(ret, "Failed to init asm args");
|
683
|
+
if (ret != 0)
|
684
|
+
return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
|
685
|
+
}
|
686
|
+
|
687
|
+
assert(args.ip[0] >= args.ilimit);
|
688
|
+
HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
|
689
|
+
|
690
|
+
/* Our loop guarantees that ip[] >= ilimit and that we haven't
|
691
|
+
* overwritten any op[].
|
692
|
+
*/
|
693
|
+
assert(args.ip[0] >= iend);
|
694
|
+
assert(args.ip[1] >= iend);
|
695
|
+
assert(args.ip[2] >= iend);
|
696
|
+
assert(args.ip[3] >= iend);
|
697
|
+
assert(args.op[3] <= oend);
|
698
|
+
(void)iend;
|
699
|
+
|
700
|
+
/* finish bit streams one by one. */
|
701
|
+
{
|
702
|
+
size_t const segmentSize = (dstSize+3) / 4;
|
703
|
+
BYTE* segmentEnd = (BYTE*)dst;
|
704
|
+
int i;
|
705
|
+
for (i = 0; i < 4; ++i) {
|
706
|
+
BIT_DStream_t bit;
|
707
|
+
if (segmentSize <= (size_t)(oend - segmentEnd))
|
708
|
+
segmentEnd += segmentSize;
|
709
|
+
else
|
710
|
+
segmentEnd = oend;
|
711
|
+
FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
|
712
|
+
/* Decompress and validate that we've produced exactly the expected length. */
|
713
|
+
args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG);
|
714
|
+
if (args.op[i] != segmentEnd) return ERROR(corruption_detected);
|
715
|
+
}
|
716
|
+
}
|
717
|
+
|
718
|
+
/* decoded size */
|
719
|
+
return dstSize;
|
720
|
+
}
|
721
|
+
#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
|
443
722
|
|
444
723
|
typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
|
445
724
|
const void *cSrc,
|
@@ -447,8 +726,28 @@ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
|
|
447
726
|
const HUF_DTable *DTable);
|
448
727
|
|
449
728
|
HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
|
450
|
-
HUF_DGEN(HUF_decompress4X1_usingDTable_internal)
|
451
729
|
|
730
|
+
static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
|
731
|
+
size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
|
732
|
+
{
|
733
|
+
#if DYNAMIC_BMI2
|
734
|
+
if (bmi2) {
|
735
|
+
# if ZSTD_ENABLE_ASM_X86_64_BMI2
|
736
|
+
return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
|
737
|
+
# else
|
738
|
+
return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
|
739
|
+
# endif
|
740
|
+
}
|
741
|
+
#else
|
742
|
+
(void)bmi2;
|
743
|
+
#endif
|
744
|
+
|
745
|
+
#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
|
746
|
+
return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
|
747
|
+
#else
|
748
|
+
return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
|
749
|
+
#endif
|
750
|
+
}
|
452
751
|
|
453
752
|
|
454
753
|
size_t HUF_decompress1X1_usingDTable(
|
@@ -518,106 +817,226 @@ size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
|
|
518
817
|
/* *************************/
|
519
818
|
|
520
819
|
typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */
|
521
|
-
typedef struct { BYTE symbol;
|
820
|
+
typedef struct { BYTE symbol; } sortedSymbol_t;
|
522
821
|
typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
|
523
822
|
typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
|
524
823
|
|
824
|
+
/**
|
825
|
+
* Constructs a HUF_DEltX2 in a U32.
|
826
|
+
*/
|
827
|
+
static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level)
|
828
|
+
{
|
829
|
+
U32 seq;
|
830
|
+
DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0);
|
831
|
+
DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2);
|
832
|
+
DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3);
|
833
|
+
DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32));
|
834
|
+
if (MEM_isLittleEndian()) {
|
835
|
+
seq = level == 1 ? symbol : (baseSeq + (symbol << 8));
|
836
|
+
return seq + (nbBits << 16) + ((U32)level << 24);
|
837
|
+
} else {
|
838
|
+
seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol);
|
839
|
+
return (seq << 16) + (nbBits << 8) + (U32)level;
|
840
|
+
}
|
841
|
+
}
|
525
842
|
|
526
|
-
|
527
|
-
*
|
528
|
-
|
529
|
-
|
530
|
-
const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
|
531
|
-
U32 nbBitsBaseline, U16 baseSeq, U32* wksp, size_t wkspSize)
|
843
|
+
/**
|
844
|
+
* Constructs a HUF_DEltX2.
|
845
|
+
*/
|
846
|
+
static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level)
|
532
847
|
{
|
533
848
|
HUF_DEltX2 DElt;
|
534
|
-
U32
|
849
|
+
U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
|
850
|
+
DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val));
|
851
|
+
ZSTD_memcpy(&DElt, &val, sizeof(val));
|
852
|
+
return DElt;
|
853
|
+
}
|
535
854
|
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
855
|
+
/**
|
856
|
+
* Constructs 2 HUF_DEltX2s and packs them into a U64.
|
857
|
+
*/
|
858
|
+
static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level)
|
859
|
+
{
|
860
|
+
U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
|
861
|
+
return (U64)DElt + ((U64)DElt << 32);
|
862
|
+
}
|
540
863
|
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
864
|
+
/**
|
865
|
+
* Fills the DTable rank with all the symbols from [begin, end) that are each
|
866
|
+
* nbBits long.
|
867
|
+
*
|
868
|
+
* @param DTableRank The start of the rank in the DTable.
|
869
|
+
* @param begin The first symbol to fill (inclusive).
|
870
|
+
* @param end One past the last symbol to fill (exclusive).
|
871
|
+
* @param nbBits Each symbol is nbBits long.
|
872
|
+
* @param tableLog The table log.
|
873
|
+
* @param baseSeq If level == 1 { 0 } else { the first level symbol }
|
874
|
+
* @param level The level in the table. Must be 1 or 2.
|
875
|
+
*/
|
876
|
+
static void HUF_fillDTableX2ForWeight(
|
877
|
+
HUF_DEltX2* DTableRank,
|
878
|
+
sortedSymbol_t const* begin, sortedSymbol_t const* end,
|
879
|
+
U32 nbBits, U32 tableLog,
|
880
|
+
U16 baseSeq, int const level)
|
881
|
+
{
|
882
|
+
U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */);
|
883
|
+
const sortedSymbol_t* ptr;
|
884
|
+
assert(level >= 1 && level <= 2);
|
885
|
+
switch (length) {
|
886
|
+
case 1:
|
887
|
+
for (ptr = begin; ptr != end; ++ptr) {
|
888
|
+
HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
|
889
|
+
*DTableRank++ = DElt;
|
890
|
+
}
|
891
|
+
break;
|
892
|
+
case 2:
|
893
|
+
for (ptr = begin; ptr != end; ++ptr) {
|
894
|
+
HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
|
895
|
+
DTableRank[0] = DElt;
|
896
|
+
DTableRank[1] = DElt;
|
897
|
+
DTableRank += 2;
|
898
|
+
}
|
899
|
+
break;
|
900
|
+
case 4:
|
901
|
+
for (ptr = begin; ptr != end; ++ptr) {
|
902
|
+
U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
|
903
|
+
ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
|
904
|
+
ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
|
905
|
+
DTableRank += 4;
|
906
|
+
}
|
907
|
+
break;
|
908
|
+
case 8:
|
909
|
+
for (ptr = begin; ptr != end; ++ptr) {
|
910
|
+
U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
|
911
|
+
ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
|
912
|
+
ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
|
913
|
+
ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
|
914
|
+
ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
|
915
|
+
DTableRank += 8;
|
916
|
+
}
|
917
|
+
break;
|
918
|
+
default:
|
919
|
+
for (ptr = begin; ptr != end; ++ptr) {
|
920
|
+
U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
|
921
|
+
HUF_DEltX2* const DTableRankEnd = DTableRank + length;
|
922
|
+
for (; DTableRank != DTableRankEnd; DTableRank += 8) {
|
923
|
+
ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
|
924
|
+
ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
|
925
|
+
ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
|
926
|
+
ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
|
927
|
+
}
|
928
|
+
}
|
929
|
+
break;
|
549
930
|
}
|
931
|
+
}
|
550
932
|
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
933
|
+
/* HUF_fillDTableX2Level2() :
|
934
|
+
* `rankVal` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
|
935
|
+
static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits,
|
936
|
+
const U32* rankVal, const int minWeight, const int maxWeight1,
|
937
|
+
const sortedSymbol_t* sortedSymbols, U32 const* rankStart,
|
938
|
+
U32 nbBitsBaseline, U16 baseSeq)
|
939
|
+
{
|
940
|
+
/* Fill skipped values (all positions up to rankVal[minWeight]).
|
941
|
+
* These positions only get a single symbol because the combined weight
|
942
|
+
* is too large.
|
943
|
+
*/
|
944
|
+
if (minWeight>1) {
|
945
|
+
U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */);
|
946
|
+
U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1);
|
947
|
+
int const skipSize = rankVal[minWeight];
|
948
|
+
assert(length > 1);
|
949
|
+
assert((U32)skipSize < length);
|
950
|
+
switch (length) {
|
951
|
+
case 2:
|
952
|
+
assert(skipSize == 1);
|
953
|
+
ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2));
|
954
|
+
break;
|
955
|
+
case 4:
|
956
|
+
assert(skipSize <= 4);
|
957
|
+
ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2));
|
958
|
+
ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2));
|
959
|
+
break;
|
960
|
+
default:
|
961
|
+
{
|
962
|
+
int i;
|
963
|
+
for (i = 0; i < skipSize; i += 8) {
|
964
|
+
ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2));
|
965
|
+
ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2));
|
966
|
+
ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2));
|
967
|
+
ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2));
|
968
|
+
}
|
969
|
+
}
|
970
|
+
}
|
971
|
+
}
|
565
972
|
|
566
|
-
|
567
|
-
|
973
|
+
/* Fill each of the second level symbols by weight. */
|
974
|
+
{
|
975
|
+
int w;
|
976
|
+
for (w = minWeight; w < maxWeight1; ++w) {
|
977
|
+
int const begin = rankStart[w];
|
978
|
+
int const end = rankStart[w+1];
|
979
|
+
U32 const nbBits = nbBitsBaseline - w;
|
980
|
+
U32 const totalBits = nbBits + consumedBits;
|
981
|
+
HUF_fillDTableX2ForWeight(
|
982
|
+
DTable + rankVal[w],
|
983
|
+
sortedSymbols + begin, sortedSymbols + end,
|
984
|
+
totalBits, targetLog,
|
985
|
+
baseSeq, /* level */ 2);
|
986
|
+
}
|
987
|
+
}
|
568
988
|
}
|
569
989
|
|
570
|
-
|
571
990
|
static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
|
572
|
-
const sortedSymbol_t* sortedList,
|
991
|
+
const sortedSymbol_t* sortedList,
|
573
992
|
const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
|
574
|
-
const U32 nbBitsBaseline
|
993
|
+
const U32 nbBitsBaseline)
|
575
994
|
{
|
576
|
-
U32* rankVal =
|
995
|
+
U32* const rankVal = rankValOrigin[0];
|
577
996
|
const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */
|
578
997
|
const U32 minBits = nbBitsBaseline - maxWeight;
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
const U32 start = rankVal[weight];
|
593
|
-
const U32 length = 1 << (targetLog-nbBits);
|
594
|
-
|
595
|
-
if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */
|
596
|
-
U32 sortedRank;
|
998
|
+
int w;
|
999
|
+
int const wEnd = (int)maxWeight + 1;
|
1000
|
+
|
1001
|
+
/* Fill DTable in order of weight. */
|
1002
|
+
for (w = 1; w < wEnd; ++w) {
|
1003
|
+
int const begin = (int)rankStart[w];
|
1004
|
+
int const end = (int)rankStart[w+1];
|
1005
|
+
U32 const nbBits = nbBitsBaseline - w;
|
1006
|
+
|
1007
|
+
if (targetLog-nbBits >= minBits) {
|
1008
|
+
/* Enough room for a second symbol. */
|
1009
|
+
int start = rankVal[w];
|
1010
|
+
U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */);
|
597
1011
|
int minWeight = nbBits + scaleLog;
|
1012
|
+
int s;
|
598
1013
|
if (minWeight < 1) minWeight = 1;
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
1014
|
+
/* Fill the DTable for every symbol of weight w.
|
1015
|
+
* These symbols get at least 1 second symbol.
|
1016
|
+
*/
|
1017
|
+
for (s = begin; s != end; ++s) {
|
1018
|
+
HUF_fillDTableX2Level2(
|
1019
|
+
DTable + start, targetLog, nbBits,
|
1020
|
+
rankValOrigin[nbBits], minWeight, wEnd,
|
1021
|
+
sortedList, rankStart,
|
1022
|
+
nbBitsBaseline, sortedList[s].symbol);
|
1023
|
+
start += length;
|
1024
|
+
}
|
604
1025
|
} else {
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
} }
|
613
|
-
rankVal[weight] += length;
|
1026
|
+
/* Only a single symbol. */
|
1027
|
+
HUF_fillDTableX2ForWeight(
|
1028
|
+
DTable + rankVal[w],
|
1029
|
+
sortedList + begin, sortedList + end,
|
1030
|
+
nbBits, targetLog,
|
1031
|
+
/* baseSeq */ 0, /* level */ 1);
|
1032
|
+
}
|
614
1033
|
}
|
615
1034
|
}
|
616
1035
|
|
617
1036
|
typedef struct {
|
618
1037
|
rankValCol_t rankVal[HUF_TABLELOG_MAX];
|
619
1038
|
U32 rankStats[HUF_TABLELOG_MAX + 1];
|
620
|
-
U32 rankStart0[HUF_TABLELOG_MAX +
|
1039
|
+
U32 rankStart0[HUF_TABLELOG_MAX + 3];
|
621
1040
|
sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
|
622
1041
|
BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
|
623
1042
|
U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
|
@@ -627,9 +1046,16 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
|
|
627
1046
|
const void* src, size_t srcSize,
|
628
1047
|
void* workSpace, size_t wkspSize)
|
629
1048
|
{
|
630
|
-
|
1049
|
+
return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
|
1050
|
+
}
|
1051
|
+
|
1052
|
+
size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
|
1053
|
+
const void* src, size_t srcSize,
|
1054
|
+
void* workSpace, size_t wkspSize, int bmi2)
|
1055
|
+
{
|
1056
|
+
U32 tableLog, maxW, nbSymbols;
|
631
1057
|
DTableDesc dtd = HUF_getDTableDesc(DTable);
|
632
|
-
U32
|
1058
|
+
U32 maxTableLog = dtd.maxTableLog;
|
633
1059
|
size_t iSize;
|
634
1060
|
void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */
|
635
1061
|
HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
|
@@ -647,11 +1073,12 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
|
|
647
1073
|
if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
|
648
1074
|
/* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzers complain ... */
|
649
1075
|
|
650
|
-
iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp),
|
1076
|
+
iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
|
651
1077
|
if (HUF_isError(iSize)) return iSize;
|
652
1078
|
|
653
1079
|
/* check result */
|
654
1080
|
if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */
|
1081
|
+
if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG;
|
655
1082
|
|
656
1083
|
/* find maxWeight */
|
657
1084
|
for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
|
@@ -664,7 +1091,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
|
|
664
1091
|
rankStart[w] = curr;
|
665
1092
|
}
|
666
1093
|
rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/
|
667
|
-
|
1094
|
+
rankStart[maxW+1] = nextRankStart;
|
668
1095
|
}
|
669
1096
|
|
670
1097
|
/* sort symbols by weight */
|
@@ -673,7 +1100,6 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
|
|
673
1100
|
U32 const w = wksp->weightList[s];
|
674
1101
|
U32 const r = rankStart[w]++;
|
675
1102
|
wksp->sortedSymbol[r].symbol = (BYTE)s;
|
676
|
-
wksp->sortedSymbol[r].weight = (BYTE)w;
|
677
1103
|
}
|
678
1104
|
rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */
|
679
1105
|
}
|
@@ -698,10 +1124,9 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
|
|
698
1124
|
} } } }
|
699
1125
|
|
700
1126
|
HUF_fillDTableX2(dt, maxTableLog,
|
701
|
-
wksp->sortedSymbol,
|
1127
|
+
wksp->sortedSymbol,
|
702
1128
|
wksp->rankStart0, wksp->rankVal, maxW,
|
703
|
-
tableLog+1
|
704
|
-
wksp->calleeWksp, sizeof(wksp->calleeWksp) / sizeof(U32));
|
1129
|
+
tableLog+1);
|
705
1130
|
|
706
1131
|
dtd.tableLog = (BYTE)maxTableLog;
|
707
1132
|
dtd.tableType = 1;
|
@@ -714,7 +1139,7 @@ FORCE_INLINE_TEMPLATE U32
|
|
714
1139
|
HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
|
715
1140
|
{
|
716
1141
|
size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
|
717
|
-
ZSTD_memcpy(op, dt
|
1142
|
+
ZSTD_memcpy(op, &dt[val].sequence, 2);
|
718
1143
|
BIT_skipBits(DStream, dt[val].nbBits);
|
719
1144
|
return dt[val].length;
|
720
1145
|
}
|
@@ -723,15 +1148,17 @@ FORCE_INLINE_TEMPLATE U32
|
|
723
1148
|
HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
|
724
1149
|
{
|
725
1150
|
size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
|
726
|
-
ZSTD_memcpy(op, dt
|
727
|
-
if (dt[val].length==1)
|
728
|
-
|
1151
|
+
ZSTD_memcpy(op, &dt[val].sequence, 1);
|
1152
|
+
if (dt[val].length==1) {
|
1153
|
+
BIT_skipBits(DStream, dt[val].nbBits);
|
1154
|
+
} else {
|
729
1155
|
if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
|
730
1156
|
BIT_skipBits(DStream, dt[val].nbBits);
|
731
1157
|
if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
|
732
1158
|
/* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
|
733
1159
|
DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
|
734
|
-
|
1160
|
+
}
|
1161
|
+
}
|
735
1162
|
return 1;
|
736
1163
|
}
|
737
1164
|
|
@@ -753,19 +1180,37 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
|
|
753
1180
|
BYTE* const pStart = p;
|
754
1181
|
|
755
1182
|
/* up to 8 symbols at a time */
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
1183
|
+
if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) {
|
1184
|
+
if (dtLog <= 11 && MEM_64bits()) {
|
1185
|
+
/* up to 10 symbols at a time */
|
1186
|
+
while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) {
|
1187
|
+
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
1188
|
+
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
1189
|
+
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
1190
|
+
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
1191
|
+
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
1192
|
+
}
|
1193
|
+
} else {
|
1194
|
+
/* up to 8 symbols at a time */
|
1195
|
+
while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
|
1196
|
+
HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
|
1197
|
+
HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
|
1198
|
+
HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
|
1199
|
+
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
1200
|
+
}
|
1201
|
+
}
|
1202
|
+
} else {
|
1203
|
+
BIT_reloadDStream(bitDPtr);
|
761
1204
|
}
|
762
1205
|
|
763
1206
|
/* closer to end : up to 2 symbols at a time */
|
764
|
-
|
765
|
-
|
1207
|
+
if ((size_t)(pEnd - p) >= 2) {
|
1208
|
+
while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
|
1209
|
+
HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
|
766
1210
|
|
767
|
-
|
768
|
-
|
1211
|
+
while (p <= pEnd-2)
|
1212
|
+
HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
|
1213
|
+
}
|
769
1214
|
|
770
1215
|
if (p < pEnd)
|
771
1216
|
p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
|
@@ -799,7 +1244,6 @@ HUF_decompress1X2_usingDTable_internal_body(
|
|
799
1244
|
/* decoded size */
|
800
1245
|
return dstSize;
|
801
1246
|
}
|
802
|
-
|
803
1247
|
FORCE_INLINE_TEMPLATE size_t
|
804
1248
|
HUF_decompress4X2_usingDTable_internal_body(
|
805
1249
|
void* dst, size_t dstSize,
|
@@ -841,57 +1285,60 @@ HUF_decompress4X2_usingDTable_internal_body(
|
|
841
1285
|
U32 const dtLog = dtd.tableLog;
|
842
1286
|
|
843
1287
|
if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
|
1288
|
+
if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
|
844
1289
|
CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
|
845
1290
|
CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
|
846
1291
|
CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
|
847
1292
|
CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
|
848
1293
|
|
849
1294
|
/* 16-32 symbols per loop (4-8 symbols per stream) */
|
850
|
-
|
1295
|
+
if ((size_t)(oend - op4) >= sizeof(size_t)) {
|
1296
|
+
for ( ; (endSignal) & (op4 < olimit); ) {
|
851
1297
|
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
1298
|
+
HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
|
1299
|
+
HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
|
1300
|
+
HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
|
1301
|
+
HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
|
1302
|
+
HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
|
1303
|
+
HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
|
1304
|
+
HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
|
1305
|
+
HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
|
1306
|
+
endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
|
1307
|
+
endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
|
1308
|
+
HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
|
1309
|
+
HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
|
1310
|
+
HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
|
1311
|
+
HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
|
1312
|
+
HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
|
1313
|
+
HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
|
1314
|
+
HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
|
1315
|
+
HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
|
1316
|
+
endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
|
1317
|
+
endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
|
872
1318
|
#else
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
1319
|
+
HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
|
1320
|
+
HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
|
1321
|
+
HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
|
1322
|
+
HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
|
1323
|
+
HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
|
1324
|
+
HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
|
1325
|
+
HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
|
1326
|
+
HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
|
1327
|
+
HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
|
1328
|
+
HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
|
1329
|
+
HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
|
1330
|
+
HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
|
1331
|
+
HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
|
1332
|
+
HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
|
1333
|
+
HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
|
1334
|
+
HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
|
1335
|
+
endSignal = (U32)LIKELY((U32)
|
1336
|
+
(BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
|
1337
|
+
& (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
|
1338
|
+
& (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
|
1339
|
+
& (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
|
894
1340
|
#endif
|
1341
|
+
}
|
895
1342
|
}
|
896
1343
|
|
897
1344
|
/* check corruption */
|
@@ -915,8 +1362,99 @@ HUF_decompress4X2_usingDTable_internal_body(
|
|
915
1362
|
}
|
916
1363
|
}
|
917
1364
|
|
1365
|
+
#if HUF_NEED_BMI2_FUNCTION
|
1366
|
+
static BMI2_TARGET_ATTRIBUTE
|
1367
|
+
size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
|
1368
|
+
size_t cSrcSize, HUF_DTable const* DTable) {
|
1369
|
+
return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
|
1370
|
+
}
|
1371
|
+
#endif
|
1372
|
+
|
1373
|
+
#if HUF_NEED_DEFAULT_FUNCTION
|
1374
|
+
static
|
1375
|
+
size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
|
1376
|
+
size_t cSrcSize, HUF_DTable const* DTable) {
|
1377
|
+
return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
|
1378
|
+
}
|
1379
|
+
#endif
|
1380
|
+
|
1381
|
+
#if ZSTD_ENABLE_ASM_X86_64_BMI2
|
1382
|
+
|
1383
|
+
HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
|
1384
|
+
|
1385
|
+
static HUF_ASM_X86_64_BMI2_ATTRS size_t
|
1386
|
+
HUF_decompress4X2_usingDTable_internal_bmi2_asm(
|
1387
|
+
void* dst, size_t dstSize,
|
1388
|
+
const void* cSrc, size_t cSrcSize,
|
1389
|
+
const HUF_DTable* DTable) {
|
1390
|
+
void const* dt = DTable + 1;
|
1391
|
+
const BYTE* const iend = (const BYTE*)cSrc + 6;
|
1392
|
+
BYTE* const oend = (BYTE*)dst + dstSize;
|
1393
|
+
HUF_DecompressAsmArgs args;
|
1394
|
+
{
|
1395
|
+
size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
|
1396
|
+
FORWARD_IF_ERROR(ret, "Failed to init asm args");
|
1397
|
+
if (ret != 0)
|
1398
|
+
return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
|
1399
|
+
}
|
1400
|
+
|
1401
|
+
assert(args.ip[0] >= args.ilimit);
|
1402
|
+
HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args);
|
1403
|
+
|
1404
|
+
/* note : op4 already verified within main loop */
|
1405
|
+
assert(args.ip[0] >= iend);
|
1406
|
+
assert(args.ip[1] >= iend);
|
1407
|
+
assert(args.ip[2] >= iend);
|
1408
|
+
assert(args.ip[3] >= iend);
|
1409
|
+
assert(args.op[3] <= oend);
|
1410
|
+
(void)iend;
|
1411
|
+
|
1412
|
+
/* finish bitStreams one by one */
|
1413
|
+
{
|
1414
|
+
size_t const segmentSize = (dstSize+3) / 4;
|
1415
|
+
BYTE* segmentEnd = (BYTE*)dst;
|
1416
|
+
int i;
|
1417
|
+
for (i = 0; i < 4; ++i) {
|
1418
|
+
BIT_DStream_t bit;
|
1419
|
+
if (segmentSize <= (size_t)(oend - segmentEnd))
|
1420
|
+
segmentEnd += segmentSize;
|
1421
|
+
else
|
1422
|
+
segmentEnd = oend;
|
1423
|
+
FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
|
1424
|
+
args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG);
|
1425
|
+
if (args.op[i] != segmentEnd)
|
1426
|
+
return ERROR(corruption_detected);
|
1427
|
+
}
|
1428
|
+
}
|
1429
|
+
|
1430
|
+
/* decoded size */
|
1431
|
+
return dstSize;
|
1432
|
+
}
|
1433
|
+
#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
|
1434
|
+
|
1435
|
+
static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
|
1436
|
+
size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
|
1437
|
+
{
|
1438
|
+
#if DYNAMIC_BMI2
|
1439
|
+
if (bmi2) {
|
1440
|
+
# if ZSTD_ENABLE_ASM_X86_64_BMI2
|
1441
|
+
return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
|
1442
|
+
# else
|
1443
|
+
return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
|
1444
|
+
# endif
|
1445
|
+
}
|
1446
|
+
#else
|
1447
|
+
(void)bmi2;
|
1448
|
+
#endif
|
1449
|
+
|
1450
|
+
#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
|
1451
|
+
return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
|
1452
|
+
#else
|
1453
|
+
return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
|
1454
|
+
#endif
|
1455
|
+
}
|
1456
|
+
|
918
1457
|
HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
|
919
|
-
HUF_DGEN(HUF_decompress4X2_usingDTable_internal)
|
920
1458
|
|
921
1459
|
size_t HUF_decompress1X2_usingDTable(
|
922
1460
|
void* dst, size_t dstSize,
|
@@ -1025,25 +1563,25 @@ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
|
|
1025
1563
|
|
1026
1564
|
#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
|
1027
1565
|
typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
|
1028
|
-
static const algo_time_t algoTime[16 /* Quantization */][
|
1566
|
+
static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] =
|
1029
1567
|
{
|
1030
1568
|
/* single, double, quad */
|
1031
|
-
{{0,0}, {1,1}
|
1032
|
-
{{0,0}, {1,1}
|
1033
|
-
{{
|
1034
|
-
{{
|
1035
|
-
{{
|
1036
|
-
{{
|
1037
|
-
{{
|
1038
|
-
{{
|
1039
|
-
{{
|
1040
|
-
{{
|
1041
|
-
{{
|
1042
|
-
{{
|
1043
|
-
{{
|
1044
|
-
{{
|
1045
|
-
{{
|
1046
|
-
{{
|
1569
|
+
{{0,0}, {1,1}}, /* Q==0 : impossible */
|
1570
|
+
{{0,0}, {1,1}}, /* Q==1 : impossible */
|
1571
|
+
{{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */
|
1572
|
+
{{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */
|
1573
|
+
{{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */
|
1574
|
+
{{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */
|
1575
|
+
{{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */
|
1576
|
+
{{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */
|
1577
|
+
{{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */
|
1578
|
+
{{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */
|
1579
|
+
{{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */
|
1580
|
+
{{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */
|
1581
|
+
{{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */
|
1582
|
+
{{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */
|
1583
|
+
{{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */
|
1584
|
+
{{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */
|
1047
1585
|
};
|
1048
1586
|
#endif
|
1049
1587
|
|
@@ -1070,7 +1608,7 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
|
|
1070
1608
|
U32 const D256 = (U32)(dstSize >> 8);
|
1071
1609
|
U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
|
1072
1610
|
U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
|
1073
|
-
DTime1 += DTime1 >>
|
1611
|
+
DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */
|
1074
1612
|
return DTime1 < DTime0;
|
1075
1613
|
}
|
1076
1614
|
#endif
|