zstd-ruby 1.3.3.0 → 1.3.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44)
  1. checksums.yaml +5 -5
  2. data/README.md +1 -1
  3. data/ext/zstdruby/libzstd/BUCK +13 -0
  4. data/ext/zstdruby/libzstd/README.md +32 -25
  5. data/ext/zstdruby/libzstd/common/bitstream.h +1 -1
  6. data/ext/zstdruby/libzstd/common/compiler.h +25 -0
  7. data/ext/zstdruby/libzstd/common/cpu.h +216 -0
  8. data/ext/zstdruby/libzstd/common/error_private.c +1 -0
  9. data/ext/zstdruby/libzstd/common/fse.h +1 -1
  10. data/ext/zstdruby/libzstd/common/fse_decompress.c +2 -2
  11. data/ext/zstdruby/libzstd/common/huf.h +114 -89
  12. data/ext/zstdruby/libzstd/common/pool.c +46 -17
  13. data/ext/zstdruby/libzstd/common/pool.h +18 -9
  14. data/ext/zstdruby/libzstd/common/threading.h +12 -12
  15. data/ext/zstdruby/libzstd/common/zstd_errors.h +16 -7
  16. data/ext/zstdruby/libzstd/common/zstd_internal.h +4 -5
  17. data/ext/zstdruby/libzstd/compress/fse_compress.c +19 -11
  18. data/ext/zstdruby/libzstd/compress/huf_compress.c +160 -62
  19. data/ext/zstdruby/libzstd/compress/zstd_compress.c +973 -644
  20. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +281 -34
  21. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +80 -62
  22. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +11 -4
  23. data/ext/zstdruby/libzstd/compress/zstd_fast.c +87 -71
  24. data/ext/zstdruby/libzstd/compress/zstd_fast.h +10 -6
  25. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +333 -274
  26. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +33 -16
  27. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +305 -359
  28. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +64 -21
  29. data/ext/zstdruby/libzstd/compress/zstd_opt.c +194 -56
  30. data/ext/zstdruby/libzstd/compress/zstd_opt.h +17 -5
  31. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +1131 -449
  32. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +32 -16
  33. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +390 -290
  34. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +777 -439
  35. data/ext/zstdruby/libzstd/dictBuilder/cover.c +11 -8
  36. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +83 -50
  37. data/ext/zstdruby/libzstd/dictBuilder/zdict.h +44 -43
  38. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +2 -0
  39. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +42 -118
  40. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +2 -2
  41. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +2 -2
  42. data/ext/zstdruby/libzstd/zstd.h +254 -254
  43. data/lib/zstd-ruby/version.rb +1 -1
  44. metadata +4 -3
@@ -30,15 +30,15 @@
30
30
 
31
31
  /* === Memory management === */
32
32
  typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx;
33
- ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbThreads);
34
- ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbThreads,
33
+ ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers);
34
+ ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers,
35
35
  ZSTD_customMem cMem);
36
36
  ZSTDLIB_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx);
37
37
 
38
38
  ZSTDLIB_API size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx);
39
39
 
40
40
 
41
- /* === Simple buffer-to-butter one-pass function === */
41
+ /* === Simple one-pass compression function === */
42
42
 
43
43
  ZSTDLIB_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
44
44
  void* dst, size_t dstCapacity,
@@ -50,7 +50,7 @@ ZSTDLIB_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
50
50
  /* === Streaming functions === */
51
51
 
52
52
  ZSTDLIB_API size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel);
53
- ZSTDLIB_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize); /**< if srcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it may change in the future, to mean "empty" */
53
+ ZSTDLIB_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize); /**< if srcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it will change in the future to mean "empty" */
54
54
 
55
55
  ZSTDLIB_API size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
56
56
 
@@ -68,7 +68,7 @@ ZSTDLIB_API size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx,
68
68
  void* dst, size_t dstCapacity,
69
69
  const void* src, size_t srcSize,
70
70
  const ZSTD_CDict* cdict,
71
- ZSTD_parameters const params,
71
+ ZSTD_parameters params,
72
72
  unsigned overlapLog);
73
73
 
74
74
  ZSTDLIB_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx,
@@ -85,7 +85,7 @@ ZSTDLIB_API size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx,
85
85
  * List of parameters that can be set using ZSTDMT_setMTCtxParameter() */
86
86
  typedef enum {
87
87
  ZSTDMT_p_jobSize, /* Each job is compressed in parallel. By default, this value is dynamically determined depending on compression parameters. Can be set explicitly here. */
88
- ZSTDMT_p_overlapSectionLog /* Each job may reload a part of previous job to enhance compressionr ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window */
88
+ ZSTDMT_p_overlapSectionLog /* Each job may reload a part of previous job to enhance compressionr ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window. This is a "sticky" parameter : its value will be re-used on next compression job */
89
89
  } ZSTDMT_parameter;
90
90
 
91
91
  /* ZSTDMT_setMTCtxParameter() :
@@ -97,30 +97,46 @@ ZSTDLIB_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter
97
97
 
98
98
 
99
99
  /*! ZSTDMT_compressStream_generic() :
100
- * Combines ZSTDMT_compressStream() with ZSTDMT_flushStream() or ZSTDMT_endStream()
100
+ * Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream()
101
101
  * depending on flush directive.
102
102
  * @return : minimum amount of data still to be flushed
103
103
  * 0 if fully flushed
104
- * or an error code */
104
+ * or an error code
105
+ * note : needs to be init using any ZSTD_initCStream*() variant */
105
106
  ZSTDLIB_API size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
106
107
  ZSTD_outBuffer* output,
107
108
  ZSTD_inBuffer* input,
108
109
  ZSTD_EndDirective endOp);
109
110
 
110
111
 
111
- /* === Private definitions; never ever use directly === */
112
+ /* ========================================================
113
+ * === Private interface, for use by ZSTD_compress.c ===
114
+ * === Not exposed in libzstd. Never invoke directly ===
115
+ * ======================================================== */
112
116
 
113
117
  size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, unsigned value);
114
118
 
115
- /* ZSTDMT_CCtxParam_setNbThreads()
116
- * Set nbThreads, and clamp it correctly,
117
- * also reset jobSize and overlapLog */
118
- size_t ZSTDMT_CCtxParam_setNbThreads(ZSTD_CCtx_params* params, unsigned nbThreads);
119
+ /* ZSTDMT_CCtxParam_setNbWorkers()
120
+ * Set nbWorkers, and clamp it.
121
+ * Also reset jobSize and overlapLog */
122
+ size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers);
119
123
 
120
- /* ZSTDMT_getNbThreads():
124
+ /*! ZSTDMT_updateCParams_whileCompressing() :
125
+ * Updates only a selected set of compression parameters, to remain compatible with current frame.
126
+ * New parameters will be applied to next compression job. */
127
+ void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams);
128
+
129
+ /* ZSTDMT_getNbWorkers():
121
130
  * @return nb threads currently active in mtctx.
122
131
  * mtctx must be valid */
123
- size_t ZSTDMT_getNbThreads(const ZSTDMT_CCtx* mtctx);
132
+ unsigned ZSTDMT_getNbWorkers(const ZSTDMT_CCtx* mtctx);
133
+
134
+ /* ZSTDMT_getFrameProgression():
135
+ * tells how much data has been consumed (input) and produced (output) for current frame.
136
+ * able to count progression inside worker threads.
137
+ */
138
+ ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx);
139
+
124
140
 
125
141
  /*! ZSTDMT_initCStream_internal() :
126
142
  * Private use only. Init streaming operation.
@@ -128,7 +144,7 @@ size_t ZSTDMT_getNbThreads(const ZSTDMT_CCtx* mtctx);
128
144
  * must receive dict, or cdict, or none, but not both.
129
145
  * @return : 0, or an error code */
130
146
  size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs,
131
- const void* dict, size_t dictSize, ZSTD_dictMode_e dictMode,
147
+ const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
132
148
  const ZSTD_CDict* cdict,
133
149
  ZSTD_CCtx_params params, unsigned long long pledgedSrcSize);
134
150
 
@@ -49,18 +49,19 @@
49
49
  ****************************************************************/
50
50
  #define HUF_isError ERR_isError
51
51
  #define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */
52
+ #define CHECK_F(f) { size_t const err_ = (f); if (HUF_isError(err_)) return err_; }
52
53
 
53
54
 
54
55
  /* **************************************************************
55
56
  * Byte alignment for workSpace management
56
57
  ****************************************************************/
57
- #define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1)
58
+ #define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1)
58
59
  #define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
59
60
 
61
+
60
62
  /*-***************************/
61
63
  /* generic DTableDesc */
62
64
  /*-***************************/
63
-
64
65
  typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc;
65
66
 
66
67
  static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
@@ -74,7 +75,6 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
74
75
  /*-***************************/
75
76
  /* single-symbol decoding */
76
77
  /*-***************************/
77
-
78
78
  typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2; /* single-symbol decoding */
79
79
 
80
80
  size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
@@ -94,10 +94,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize
94
94
  huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
95
95
  spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
96
96
 
97
- if ((spaceUsed32 << 2) > wkspSize)
98
- return ERROR(tableLog_tooLarge);
99
- workSpace = (U32 *)workSpace + spaceUsed32;
100
- wkspSize -= (spaceUsed32 << 2);
97
+ if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
101
98
 
102
99
  HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
103
100
  /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
@@ -144,8 +141,10 @@ size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
144
141
  workSpace, sizeof(workSpace));
145
142
  }
146
143
 
144
+ typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4; /* double-symbols decoding */
147
145
 
148
- static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
146
+ FORCE_INLINE_TEMPLATE BYTE
147
+ HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
149
148
  {
150
149
  size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
151
150
  BYTE const c = dt[val].byte;
@@ -156,7 +155,7 @@ static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, con
156
155
  #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
157
156
  *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)
158
157
 
159
- #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
158
+ #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
160
159
  if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
161
160
  HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
162
161
 
@@ -164,30 +163,33 @@ static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, con
164
163
  if (MEM_64bits()) \
165
164
  HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
166
165
 
167
- HINT_INLINE size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
166
+ HINT_INLINE size_t
167
+ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
168
168
  {
169
169
  BYTE* const pStart = p;
170
170
 
171
171
  /* up to 4 symbols at a time */
172
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4)) {
172
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
173
173
  HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
174
174
  HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
175
175
  HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
176
176
  HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
177
177
  }
178
178
 
179
- /* closer to the end */
180
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd))
181
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
179
+ /* [0-3] symbols remaining */
180
+ if (MEM_32bits())
181
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd))
182
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
182
183
 
183
- /* no more data to retrieve from bitstream, hence no need to reload */
184
+ /* no more data to retrieve from bitstream, no need to reload */
184
185
  while (p < pEnd)
185
186
  HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
186
187
 
187
188
  return pEnd-pStart;
188
189
  }
189
190
 
190
- static size_t HUF_decompress1X2_usingDTable_internal(
191
+ FORCE_INLINE_TEMPLATE size_t
192
+ HUF_decompress1X2_usingDTable_internal_body(
191
193
  void* dst, size_t dstSize,
192
194
  const void* cSrc, size_t cSrcSize,
193
195
  const HUF_DTable* DTable)
@@ -200,58 +202,17 @@ static size_t HUF_decompress1X2_usingDTable_internal(
200
202
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
201
203
  U32 const dtLog = dtd.tableLog;
202
204
 
203
- { size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);
204
- if (HUF_isError(errorCode)) return errorCode; }
205
+ CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
205
206
 
206
207
  HUF_decodeStreamX2(op, &bitD, oend, dt, dtLog);
207
208
 
208
- /* check */
209
209
  if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
210
210
 
211
211
  return dstSize;
212
212
  }
213
213
 
214
- size_t HUF_decompress1X2_usingDTable(
215
- void* dst, size_t dstSize,
216
- const void* cSrc, size_t cSrcSize,
217
- const HUF_DTable* DTable)
218
- {
219
- DTableDesc dtd = HUF_getDTableDesc(DTable);
220
- if (dtd.tableType != 0) return ERROR(GENERIC);
221
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
222
- }
223
-
224
- size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
225
- const void* cSrc, size_t cSrcSize,
226
- void* workSpace, size_t wkspSize)
227
- {
228
- const BYTE* ip = (const BYTE*) cSrc;
229
-
230
- size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
231
- if (HUF_isError(hSize)) return hSize;
232
- if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
233
- ip += hSize; cSrcSize -= hSize;
234
-
235
- return HUF_decompress1X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx);
236
- }
237
-
238
-
239
- size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
240
- const void* cSrc, size_t cSrcSize)
241
- {
242
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
243
- return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
244
- workSpace, sizeof(workSpace));
245
- }
246
-
247
- size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
248
- {
249
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
250
- return HUF_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
251
- }
252
-
253
-
254
- static size_t HUF_decompress4X2_usingDTable_internal(
214
+ FORCE_INLINE_TEMPLATE size_t
215
+ HUF_decompress4X2_usingDTable_internal_body(
255
216
  void* dst, size_t dstSize,
256
217
  const void* cSrc, size_t cSrcSize,
257
218
  const HUF_DTable* DTable)
@@ -286,23 +247,19 @@ static size_t HUF_decompress4X2_usingDTable_internal(
286
247
  BYTE* op2 = opStart2;
287
248
  BYTE* op3 = opStart3;
288
249
  BYTE* op4 = opStart4;
289
- U32 endSignal;
250
+ U32 endSignal = BIT_DStream_unfinished;
290
251
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
291
252
  U32 const dtLog = dtd.tableLog;
292
253
 
293
254
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
294
- { size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1);
295
- if (HUF_isError(errorCode)) return errorCode; }
296
- { size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2);
297
- if (HUF_isError(errorCode)) return errorCode; }
298
- { size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3);
299
- if (HUF_isError(errorCode)) return errorCode; }
300
- { size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4);
301
- if (HUF_isError(errorCode)) return errorCode; }
255
+ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
256
+ CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
257
+ CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
258
+ CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
302
259
 
303
- /* 16-32 symbols per loop (4-8 symbols per stream) */
260
+ /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
304
261
  endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
305
- for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; ) {
262
+ while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) {
306
263
  HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
307
264
  HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
308
265
  HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
@@ -319,10 +276,15 @@ static size_t HUF_decompress4X2_usingDTable_internal(
319
276
  HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
320
277
  HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
321
278
  HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
322
- endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
279
+ BIT_reloadDStream(&bitD1);
280
+ BIT_reloadDStream(&bitD2);
281
+ BIT_reloadDStream(&bitD3);
282
+ BIT_reloadDStream(&bitD4);
323
283
  }
324
284
 
325
285
  /* check corruption */
286
+ /* note : should not be necessary : op# advance in lock step, and we control op4.
287
+ * but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */
326
288
  if (op1 > opStart2) return ERROR(corruption_detected);
327
289
  if (op2 > opStart3) return ERROR(corruption_detected);
328
290
  if (op3 > opStart4) return ERROR(corruption_detected);
@@ -335,8 +297,8 @@ static size_t HUF_decompress4X2_usingDTable_internal(
335
297
  HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog);
336
298
 
337
299
  /* check */
338
- endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
339
- if (!endSignal) return ERROR(corruption_detected);
300
+ { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
301
+ if (!endCheck) return ERROR(corruption_detected); }
340
302
 
341
303
  /* decoded size */
342
304
  return dstSize;
@@ -344,30 +306,309 @@ static size_t HUF_decompress4X2_usingDTable_internal(
344
306
  }
345
307
 
346
308
 
347
- size_t HUF_decompress4X2_usingDTable(
309
+ FORCE_INLINE_TEMPLATE U32
310
+ HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
311
+ {
312
+ size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
313
+ memcpy(op, dt+val, 2);
314
+ BIT_skipBits(DStream, dt[val].nbBits);
315
+ return dt[val].length;
316
+ }
317
+
318
+ FORCE_INLINE_TEMPLATE U32
319
+ HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
320
+ {
321
+ size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
322
+ memcpy(op, dt+val, 1);
323
+ if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
324
+ else {
325
+ if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
326
+ BIT_skipBits(DStream, dt[val].nbBits);
327
+ if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
328
+ /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
329
+ DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
330
+ } }
331
+ return 1;
332
+ }
333
+
334
+ #define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
335
+ ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
336
+
337
+ #define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
338
+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
339
+ ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
340
+
341
+ #define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
342
+ if (MEM_64bits()) \
343
+ ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
344
+
345
+ HINT_INLINE size_t
346
+ HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
347
+ const HUF_DEltX4* const dt, const U32 dtLog)
348
+ {
349
+ BYTE* const pStart = p;
350
+
351
+ /* up to 8 symbols at a time */
352
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
353
+ HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
354
+ HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
355
+ HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
356
+ HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
357
+ }
358
+
359
+ /* closer to end : up to 2 symbols at a time */
360
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
361
+ HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
362
+
363
+ while (p <= pEnd-2)
364
+ HUF_DECODE_SYMBOLX4_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
365
+
366
+ if (p < pEnd)
367
+ p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
368
+
369
+ return p-pStart;
370
+ }
371
+
372
+ FORCE_INLINE_TEMPLATE size_t
373
+ HUF_decompress1X4_usingDTable_internal_body(
374
+ void* dst, size_t dstSize,
375
+ const void* cSrc, size_t cSrcSize,
376
+ const HUF_DTable* DTable)
377
+ {
378
+ BIT_DStream_t bitD;
379
+
380
+ /* Init */
381
+ CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
382
+
383
+ /* decode */
384
+ { BYTE* const ostart = (BYTE*) dst;
385
+ BYTE* const oend = ostart + dstSize;
386
+ const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */
387
+ const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
388
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
389
+ HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog);
390
+ }
391
+
392
+ /* check */
393
+ if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
394
+
395
+ /* decoded size */
396
+ return dstSize;
397
+ }
398
+
399
+
400
+ FORCE_INLINE_TEMPLATE size_t
401
+ HUF_decompress4X4_usingDTable_internal_body(
402
+ void* dst, size_t dstSize,
403
+ const void* cSrc, size_t cSrcSize,
404
+ const HUF_DTable* DTable)
405
+ {
406
+ if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
407
+
408
+ { const BYTE* const istart = (const BYTE*) cSrc;
409
+ BYTE* const ostart = (BYTE*) dst;
410
+ BYTE* const oend = ostart + dstSize;
411
+ const void* const dtPtr = DTable+1;
412
+ const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
413
+
414
+ /* Init */
415
+ BIT_DStream_t bitD1;
416
+ BIT_DStream_t bitD2;
417
+ BIT_DStream_t bitD3;
418
+ BIT_DStream_t bitD4;
419
+ size_t const length1 = MEM_readLE16(istart);
420
+ size_t const length2 = MEM_readLE16(istart+2);
421
+ size_t const length3 = MEM_readLE16(istart+4);
422
+ size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
423
+ const BYTE* const istart1 = istart + 6; /* jumpTable */
424
+ const BYTE* const istart2 = istart1 + length1;
425
+ const BYTE* const istart3 = istart2 + length2;
426
+ const BYTE* const istart4 = istart3 + length3;
427
+ size_t const segmentSize = (dstSize+3) / 4;
428
+ BYTE* const opStart2 = ostart + segmentSize;
429
+ BYTE* const opStart3 = opStart2 + segmentSize;
430
+ BYTE* const opStart4 = opStart3 + segmentSize;
431
+ BYTE* op1 = ostart;
432
+ BYTE* op2 = opStart2;
433
+ BYTE* op3 = opStart3;
434
+ BYTE* op4 = opStart4;
435
+ U32 endSignal;
436
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
437
+ U32 const dtLog = dtd.tableLog;
438
+
439
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
440
+ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
441
+ CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
442
+ CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
443
+ CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
444
+
445
+ /* 16-32 symbols per loop (4-8 symbols per stream) */
446
+ endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
447
+ for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
448
+ HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
449
+ HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
450
+ HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
451
+ HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
452
+ HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
453
+ HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
454
+ HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
455
+ HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
456
+ HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
457
+ HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
458
+ HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
459
+ HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
460
+ HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
461
+ HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
462
+ HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
463
+ HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
464
+
465
+ endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
466
+ }
467
+
468
+ /* check corruption */
469
+ if (op1 > opStart2) return ERROR(corruption_detected);
470
+ if (op2 > opStart3) return ERROR(corruption_detected);
471
+ if (op3 > opStart4) return ERROR(corruption_detected);
472
+ /* note : op4 already verified within main loop */
473
+
474
+ /* finish bitStreams one by one */
475
+ HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
476
+ HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
477
+ HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
478
+ HUF_decodeStreamX4(op4, &bitD4, oend, dt, dtLog);
479
+
480
+ /* check */
481
+ { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
482
+ if (!endCheck) return ERROR(corruption_detected); }
483
+
484
+ /* decoded size */
485
+ return dstSize;
486
+ }
487
+ }
488
+
489
+
490
+ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
491
+ const void *cSrc,
492
+ size_t cSrcSize,
493
+ const HUF_DTable *DTable);
494
+ #if DYNAMIC_BMI2
495
+
496
+ #define X(fn) \
497
+ \
498
+ static size_t fn##_default( \
499
+ void* dst, size_t dstSize, \
500
+ const void* cSrc, size_t cSrcSize, \
501
+ const HUF_DTable* DTable) \
502
+ { \
503
+ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
504
+ } \
505
+ \
506
+ static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \
507
+ void* dst, size_t dstSize, \
508
+ const void* cSrc, size_t cSrcSize, \
509
+ const HUF_DTable* DTable) \
510
+ { \
511
+ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
512
+ } \
513
+ \
514
+ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
515
+ size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
516
+ { \
517
+ if (bmi2) { \
518
+ return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
519
+ } \
520
+ return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
521
+ }
522
+
523
+ #else
524
+
525
+ #define X(fn) \
526
+ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
527
+ size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
528
+ { \
529
+ (void)bmi2; \
530
+ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
531
+ }
532
+
533
+ #endif
534
+
535
+ X(HUF_decompress1X2_usingDTable_internal)
536
+ X(HUF_decompress4X2_usingDTable_internal)
537
+ X(HUF_decompress1X4_usingDTable_internal)
538
+ X(HUF_decompress4X4_usingDTable_internal)
539
+
540
+ #undef X
541
+
542
+
543
+ size_t HUF_decompress1X2_usingDTable(
348
544
  void* dst, size_t dstSize,
349
545
  const void* cSrc, size_t cSrcSize,
350
546
  const HUF_DTable* DTable)
351
547
  {
352
548
  DTableDesc dtd = HUF_getDTableDesc(DTable);
353
549
  if (dtd.tableType != 0) return ERROR(GENERIC);
354
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
550
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
355
551
  }
356
552
 
357
-
358
- size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
553
+ size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
359
554
  const void* cSrc, size_t cSrcSize,
360
555
  void* workSpace, size_t wkspSize)
361
556
  {
362
557
  const BYTE* ip = (const BYTE*) cSrc;
363
558
 
559
+ size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
560
+ if (HUF_isError(hSize)) return hSize;
561
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
562
+ ip += hSize; cSrcSize -= hSize;
563
+
564
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
565
+ }
566
+
567
+
568
+ size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
569
+ const void* cSrc, size_t cSrcSize)
570
+ {
571
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
572
+ return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
573
+ workSpace, sizeof(workSpace));
574
+ }
575
+
576
+ size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
577
+ {
578
+ HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
579
+ return HUF_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
580
+ }
581
+
582
+ size_t HUF_decompress4X2_usingDTable(
583
+ void* dst, size_t dstSize,
584
+ const void* cSrc, size_t cSrcSize,
585
+ const HUF_DTable* DTable)
586
+ {
587
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
588
+ if (dtd.tableType != 0) return ERROR(GENERIC);
589
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
590
+ }
591
+
592
+ static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
593
+ const void* cSrc, size_t cSrcSize,
594
+ void* workSpace, size_t wkspSize, int bmi2)
595
+ {
596
+ const BYTE* ip = (const BYTE*) cSrc;
597
+
364
598
  size_t const hSize = HUF_readDTableX2_wksp (dctx, cSrc, cSrcSize,
365
599
  workSpace, wkspSize);
366
600
  if (HUF_isError(hSize)) return hSize;
367
601
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
368
602
  ip += hSize; cSrcSize -= hSize;
369
603
 
370
- return HUF_decompress4X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, dctx);
604
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
605
+ }
606
+
607
+ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
608
+ const void* cSrc, size_t cSrcSize,
609
+ void* workSpace, size_t wkspSize)
610
+ {
611
+ return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
371
612
  }
372
613
 
373
614
 
@@ -387,8 +628,6 @@ size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cS
387
628
  /* *************************/
388
629
  /* double-symbols decoding */
389
630
  /* *************************/
390
- typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4; /* double-symbols decoding */
391
-
392
631
  typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
393
632
 
394
633
  /* HUF_fillDTableX4Level2() :
@@ -508,10 +747,7 @@ size_t HUF_readDTableX4_wksp(HUF_DTable* DTable, const void* src,
508
747
  weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
509
748
  spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
510
749
 
511
- if ((spaceUsed32 << 2) > wkspSize)
512
- return ERROR(tableLog_tooLarge);
513
- workSpace = (U32 *)workSpace + spaceUsed32;
514
- wkspSize -= (spaceUsed32 << 2);
750
+ if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
515
751
 
516
752
  rankStart = rankStart0 + 1;
517
753
  memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
@@ -588,95 +824,6 @@ size_t HUF_readDTableX4(HUF_DTable* DTable, const void* src, size_t srcSize)
588
824
  workSpace, sizeof(workSpace));
589
825
  }
590
826
 
591
- static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
592
- {
593
- size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
594
- memcpy(op, dt+val, 2);
595
- BIT_skipBits(DStream, dt[val].nbBits);
596
- return dt[val].length;
597
- }
598
-
599
- static U32 HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
600
- {
601
- size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
602
- memcpy(op, dt+val, 1);
603
- if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
604
- else {
605
- if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
606
- BIT_skipBits(DStream, dt[val].nbBits);
607
- if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
608
- /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
609
- DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
610
- } }
611
- return 1;
612
- }
613
-
614
-
615
- #define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
616
- ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
617
-
618
- #define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
619
- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
620
- ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
621
-
622
- #define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
623
- if (MEM_64bits()) \
624
- ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
625
-
626
- HINT_INLINE size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const HUF_DEltX4* const dt, const U32 dtLog)
627
- {
628
- BYTE* const pStart = p;
629
-
630
- /* up to 8 symbols at a time */
631
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
632
- HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
633
- HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
634
- HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
635
- HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
636
- }
637
-
638
- /* closer to end : up to 2 symbols at a time */
639
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
640
- HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
641
-
642
- while (p <= pEnd-2)
643
- HUF_DECODE_SYMBOLX4_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
644
-
645
- if (p < pEnd)
646
- p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
647
-
648
- return p-pStart;
649
- }
650
-
651
-
652
- static size_t HUF_decompress1X4_usingDTable_internal(
653
- void* dst, size_t dstSize,
654
- const void* cSrc, size_t cSrcSize,
655
- const HUF_DTable* DTable)
656
- {
657
- BIT_DStream_t bitD;
658
-
659
- /* Init */
660
- { size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);
661
- if (HUF_isError(errorCode)) return errorCode;
662
- }
663
-
664
- /* decode */
665
- { BYTE* const ostart = (BYTE*) dst;
666
- BYTE* const oend = ostart + dstSize;
667
- const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */
668
- const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
669
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
670
- HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog);
671
- }
672
-
673
- /* check */
674
- if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
675
-
676
- /* decoded size */
677
- return dstSize;
678
- }
679
-
680
827
  size_t HUF_decompress1X4_usingDTable(
681
828
  void* dst, size_t dstSize,
682
829
  const void* cSrc, size_t cSrcSize,
@@ -684,7 +831,7 @@ size_t HUF_decompress1X4_usingDTable(
684
831
  {
685
832
  DTableDesc dtd = HUF_getDTableDesc(DTable);
686
833
  if (dtd.tableType != 1) return ERROR(GENERIC);
687
- return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
834
+ return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
688
835
  }
689
836
 
690
837
  size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
@@ -699,7 +846,7 @@ size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
699
846
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
700
847
  ip += hSize; cSrcSize -= hSize;
701
848
 
702
- return HUF_decompress1X4_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx);
849
+ return HUF_decompress1X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
703
850
  }
704
851
 
705
852
 
@@ -717,99 +864,6 @@ size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cS
717
864
  return HUF_decompress1X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
718
865
  }
719
866
 
720
- static size_t HUF_decompress4X4_usingDTable_internal(
721
- void* dst, size_t dstSize,
722
- const void* cSrc, size_t cSrcSize,
723
- const HUF_DTable* DTable)
724
- {
725
- if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
726
-
727
- { const BYTE* const istart = (const BYTE*) cSrc;
728
- BYTE* const ostart = (BYTE*) dst;
729
- BYTE* const oend = ostart + dstSize;
730
- const void* const dtPtr = DTable+1;
731
- const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
732
-
733
- /* Init */
734
- BIT_DStream_t bitD1;
735
- BIT_DStream_t bitD2;
736
- BIT_DStream_t bitD3;
737
- BIT_DStream_t bitD4;
738
- size_t const length1 = MEM_readLE16(istart);
739
- size_t const length2 = MEM_readLE16(istart+2);
740
- size_t const length3 = MEM_readLE16(istart+4);
741
- size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
742
- const BYTE* const istart1 = istart + 6; /* jumpTable */
743
- const BYTE* const istart2 = istart1 + length1;
744
- const BYTE* const istart3 = istart2 + length2;
745
- const BYTE* const istart4 = istart3 + length3;
746
- size_t const segmentSize = (dstSize+3) / 4;
747
- BYTE* const opStart2 = ostart + segmentSize;
748
- BYTE* const opStart3 = opStart2 + segmentSize;
749
- BYTE* const opStart4 = opStart3 + segmentSize;
750
- BYTE* op1 = ostart;
751
- BYTE* op2 = opStart2;
752
- BYTE* op3 = opStart3;
753
- BYTE* op4 = opStart4;
754
- U32 endSignal;
755
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
756
- U32 const dtLog = dtd.tableLog;
757
-
758
- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
759
- { size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1);
760
- if (HUF_isError(errorCode)) return errorCode; }
761
- { size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2);
762
- if (HUF_isError(errorCode)) return errorCode; }
763
- { size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3);
764
- if (HUF_isError(errorCode)) return errorCode; }
765
- { size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4);
766
- if (HUF_isError(errorCode)) return errorCode; }
767
-
768
- /* 16-32 symbols per loop (4-8 symbols per stream) */
769
- endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
770
- for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
771
- HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
772
- HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
773
- HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
774
- HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
775
- HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
776
- HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
777
- HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
778
- HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
779
- HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
780
- HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
781
- HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
782
- HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
783
- HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
784
- HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
785
- HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
786
- HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
787
-
788
- endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
789
- }
790
-
791
- /* check corruption */
792
- if (op1 > opStart2) return ERROR(corruption_detected);
793
- if (op2 > opStart3) return ERROR(corruption_detected);
794
- if (op3 > opStart4) return ERROR(corruption_detected);
795
- /* note : op4 already verified within main loop */
796
-
797
- /* finish bitStreams one by one */
798
- HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
799
- HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
800
- HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
801
- HUF_decodeStreamX4(op4, &bitD4, oend, dt, dtLog);
802
-
803
- /* check */
804
- { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
805
- if (!endCheck) return ERROR(corruption_detected); }
806
-
807
- /* decoded size */
808
- return dstSize;
809
- }
810
- }
811
-
812
-
813
867
  size_t HUF_decompress4X4_usingDTable(
814
868
  void* dst, size_t dstSize,
815
869
  const void* cSrc, size_t cSrcSize,
@@ -817,13 +871,12 @@ size_t HUF_decompress4X4_usingDTable(
817
871
  {
818
872
  DTableDesc dtd = HUF_getDTableDesc(DTable);
819
873
  if (dtd.tableType != 1) return ERROR(GENERIC);
820
- return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
874
+ return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
821
875
  }
822
876
 
823
-
824
- size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
877
+ static size_t HUF_decompress4X4_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
825
878
  const void* cSrc, size_t cSrcSize,
826
- void* workSpace, size_t wkspSize)
879
+ void* workSpace, size_t wkspSize, int bmi2)
827
880
  {
828
881
  const BYTE* ip = (const BYTE*) cSrc;
829
882
 
@@ -833,7 +886,14 @@ size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
833
886
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
834
887
  ip += hSize; cSrcSize -= hSize;
835
888
 
836
- return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx);
889
+ return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
890
+ }
891
+
892
+ size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
893
+ const void* cSrc, size_t cSrcSize,
894
+ void* workSpace, size_t wkspSize)
895
+ {
896
+ return HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
837
897
  }
838
898
 
839
899
 
@@ -861,8 +921,8 @@ size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
861
921
  const HUF_DTable* DTable)
862
922
  {
863
923
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
864
- return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) :
865
- HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
924
+ return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
925
+ HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
866
926
  }
867
927
 
868
928
  size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
@@ -870,8 +930,8 @@ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
870
930
  const HUF_DTable* DTable)
871
931
  {
872
932
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
873
- return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) :
874
- HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
933
+ return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
934
+ HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
875
935
  }
876
936
 
877
937
 
@@ -898,21 +958,22 @@ static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, qu
898
958
  };
899
959
 
900
960
  /** HUF_selectDecoder() :
901
- * Tells which decoder is likely to decode faster,
902
- * based on a set of pre-determined metrics.
903
- * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
904
- * Assumption : 0 < cSrcSize, dstSize <= 128 KB */
961
+ * Tells which decoder is likely to decode faster,
962
+ * based on a set of pre-computed metrics.
963
+ * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
964
+ * Assumption : 0 < dstSize <= 128 KB */
905
965
  U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
906
966
  {
967
+ assert(dstSize > 0);
968
+ assert(dstSize <= 128 KB);
907
969
  /* decoder timing evaluation */
908
- U32 const Q = cSrcSize >= dstSize ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */
909
- U32 const D256 = (U32)(dstSize >> 8);
910
- U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
911
- U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
912
- DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, for cache eviction */
913
-
914
- return DTime1 < DTime0;
915
- }
970
+ { U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */
971
+ U32 const D256 = (U32)(dstSize >> 8);
972
+ U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
973
+ U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
974
+ DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */
975
+ return DTime1 < DTime0;
976
+ } }
916
977
 
917
978
 
918
979
  typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
@@ -994,3 +1055,42 @@ size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
994
1055
  return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
995
1056
  workSpace, sizeof(workSpace));
996
1057
  }
1058
+
1059
+
1060
+ size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1061
+ {
1062
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
1063
+ return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1064
+ HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1065
+ }
1066
+
1067
+ size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1068
+ {
1069
+ const BYTE* ip = (const BYTE*) cSrc;
1070
+
1071
+ size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
1072
+ if (HUF_isError(hSize)) return hSize;
1073
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1074
+ ip += hSize; cSrcSize -= hSize;
1075
+
1076
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
1077
+ }
1078
+
1079
+ size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1080
+ {
1081
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
1082
+ return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1083
+ HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1084
+ }
1085
+
1086
+ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1087
+ {
1088
+ /* validation checks */
1089
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
1090
+ if (cSrcSize == 0) return ERROR(corruption_detected);
1091
+
1092
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1093
+ return algoNb ? HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
1094
+ HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1095
+ }
1096
+ }