zstd-ruby 1.3.3.0 → 1.3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +5 -5
  2. data/README.md +1 -1
  3. data/ext/zstdruby/libzstd/BUCK +13 -0
  4. data/ext/zstdruby/libzstd/README.md +32 -25
  5. data/ext/zstdruby/libzstd/common/bitstream.h +1 -1
  6. data/ext/zstdruby/libzstd/common/compiler.h +25 -0
  7. data/ext/zstdruby/libzstd/common/cpu.h +216 -0
  8. data/ext/zstdruby/libzstd/common/error_private.c +1 -0
  9. data/ext/zstdruby/libzstd/common/fse.h +1 -1
  10. data/ext/zstdruby/libzstd/common/fse_decompress.c +2 -2
  11. data/ext/zstdruby/libzstd/common/huf.h +114 -89
  12. data/ext/zstdruby/libzstd/common/pool.c +46 -17
  13. data/ext/zstdruby/libzstd/common/pool.h +18 -9
  14. data/ext/zstdruby/libzstd/common/threading.h +12 -12
  15. data/ext/zstdruby/libzstd/common/zstd_errors.h +16 -7
  16. data/ext/zstdruby/libzstd/common/zstd_internal.h +4 -5
  17. data/ext/zstdruby/libzstd/compress/fse_compress.c +19 -11
  18. data/ext/zstdruby/libzstd/compress/huf_compress.c +160 -62
  19. data/ext/zstdruby/libzstd/compress/zstd_compress.c +973 -644
  20. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +281 -34
  21. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +80 -62
  22. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +11 -4
  23. data/ext/zstdruby/libzstd/compress/zstd_fast.c +87 -71
  24. data/ext/zstdruby/libzstd/compress/zstd_fast.h +10 -6
  25. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +333 -274
  26. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +33 -16
  27. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +305 -359
  28. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +64 -21
  29. data/ext/zstdruby/libzstd/compress/zstd_opt.c +194 -56
  30. data/ext/zstdruby/libzstd/compress/zstd_opt.h +17 -5
  31. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +1131 -449
  32. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +32 -16
  33. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +390 -290
  34. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +777 -439
  35. data/ext/zstdruby/libzstd/dictBuilder/cover.c +11 -8
  36. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +83 -50
  37. data/ext/zstdruby/libzstd/dictBuilder/zdict.h +44 -43
  38. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +2 -0
  39. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +42 -118
  40. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +2 -2
  41. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +2 -2
  42. data/ext/zstdruby/libzstd/zstd.h +254 -254
  43. data/lib/zstd-ruby/version.rb +1 -1
  44. metadata +4 -3
@@ -30,15 +30,15 @@
30
30
 
31
31
  /* === Memory management === */
32
32
  typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx;
33
- ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbThreads);
34
- ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbThreads,
33
+ ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers);
34
+ ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers,
35
35
  ZSTD_customMem cMem);
36
36
  ZSTDLIB_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx);
37
37
 
38
38
  ZSTDLIB_API size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx);
39
39
 
40
40
 
41
- /* === Simple buffer-to-butter one-pass function === */
41
+ /* === Simple one-pass compression function === */
42
42
 
43
43
  ZSTDLIB_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
44
44
  void* dst, size_t dstCapacity,
@@ -50,7 +50,7 @@ ZSTDLIB_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
50
50
  /* === Streaming functions === */
51
51
 
52
52
  ZSTDLIB_API size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel);
53
- ZSTDLIB_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize); /**< if srcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it may change in the future, to mean "empty" */
53
+ ZSTDLIB_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize); /**< if srcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it will change in the future to mean "empty" */
54
54
 
55
55
  ZSTDLIB_API size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
56
56
 
@@ -68,7 +68,7 @@ ZSTDLIB_API size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx,
68
68
  void* dst, size_t dstCapacity,
69
69
  const void* src, size_t srcSize,
70
70
  const ZSTD_CDict* cdict,
71
- ZSTD_parameters const params,
71
+ ZSTD_parameters params,
72
72
  unsigned overlapLog);
73
73
 
74
74
  ZSTDLIB_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx,
@@ -85,7 +85,7 @@ ZSTDLIB_API size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx,
85
85
  * List of parameters that can be set using ZSTDMT_setMTCtxParameter() */
86
86
  typedef enum {
87
87
  ZSTDMT_p_jobSize, /* Each job is compressed in parallel. By default, this value is dynamically determined depending on compression parameters. Can be set explicitly here. */
88
- ZSTDMT_p_overlapSectionLog /* Each job may reload a part of previous job to enhance compressionr ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window */
88
+ ZSTDMT_p_overlapSectionLog /* Each job may reload a part of previous job to enhance compression ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window. This is a "sticky" parameter : its value will be re-used on next compression job */
89
89
  } ZSTDMT_parameter;
90
90
 
91
91
  /* ZSTDMT_setMTCtxParameter() :
@@ -97,30 +97,46 @@ ZSTDLIB_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter
97
97
 
98
98
 
99
99
  /*! ZSTDMT_compressStream_generic() :
100
- * Combines ZSTDMT_compressStream() with ZSTDMT_flushStream() or ZSTDMT_endStream()
100
+ * Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream()
101
101
  * depending on flush directive.
102
102
  * @return : minimum amount of data still to be flushed
103
103
  * 0 if fully flushed
104
- * or an error code */
104
+ * or an error code
105
+ * note : must be initialized using any ZSTD_initCStream*() variant */
105
106
  ZSTDLIB_API size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
106
107
  ZSTD_outBuffer* output,
107
108
  ZSTD_inBuffer* input,
108
109
  ZSTD_EndDirective endOp);
109
110
 
110
111
 
111
- /* === Private definitions; never ever use directly === */
112
+ /* ========================================================
113
+ * === Private interface, for use by ZSTD_compress.c ===
114
+ * === Not exposed in libzstd. Never invoke directly ===
115
+ * ======================================================== */
112
116
 
113
117
  size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, unsigned value);
114
118
 
115
- /* ZSTDMT_CCtxParam_setNbThreads()
116
- * Set nbThreads, and clamp it correctly,
117
- * also reset jobSize and overlapLog */
118
- size_t ZSTDMT_CCtxParam_setNbThreads(ZSTD_CCtx_params* params, unsigned nbThreads);
119
+ /* ZSTDMT_CCtxParam_setNbWorkers()
120
+ * Set nbWorkers, and clamp it.
121
+ * Also reset jobSize and overlapLog */
122
+ size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers);
119
123
 
120
- /* ZSTDMT_getNbThreads():
124
+ /*! ZSTDMT_updateCParams_whileCompressing() :
125
+ * Updates only a selected set of compression parameters, to remain compatible with current frame.
126
+ * New parameters will be applied to next compression job. */
127
+ void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams);
128
+
129
+ /* ZSTDMT_getNbWorkers():
121
130
  * @return nb threads currently active in mtctx.
122
131
  * mtctx must be valid */
123
- size_t ZSTDMT_getNbThreads(const ZSTDMT_CCtx* mtctx);
132
+ unsigned ZSTDMT_getNbWorkers(const ZSTDMT_CCtx* mtctx);
133
+
134
+ /* ZSTDMT_getFrameProgression():
135
+ * tells how much data has been consumed (input) and produced (output) for current frame.
136
+ * able to count progression inside worker threads.
137
+ */
138
+ ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx);
139
+
124
140
 
125
141
  /*! ZSTDMT_initCStream_internal() :
126
142
  * Private use only. Init streaming operation.
@@ -128,7 +144,7 @@ size_t ZSTDMT_getNbThreads(const ZSTDMT_CCtx* mtctx);
128
144
  * must receive dict, or cdict, or none, but not both.
129
145
  * @return : 0, or an error code */
130
146
  size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs,
131
- const void* dict, size_t dictSize, ZSTD_dictMode_e dictMode,
147
+ const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
132
148
  const ZSTD_CDict* cdict,
133
149
  ZSTD_CCtx_params params, unsigned long long pledgedSrcSize);
134
150
 
@@ -49,18 +49,19 @@
49
49
  ****************************************************************/
50
50
  #define HUF_isError ERR_isError
51
51
  #define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */
52
+ #define CHECK_F(f) { size_t const err_ = (f); if (HUF_isError(err_)) return err_; }
52
53
 
53
54
 
54
55
  /* **************************************************************
55
56
  * Byte alignment for workSpace management
56
57
  ****************************************************************/
57
- #define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1)
58
+ #define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1)
58
59
  #define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
59
60
 
61
+
60
62
  /*-***************************/
61
63
  /* generic DTableDesc */
62
64
  /*-***************************/
63
-
64
65
  typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc;
65
66
 
66
67
  static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
@@ -74,7 +75,6 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
74
75
  /*-***************************/
75
76
  /* single-symbol decoding */
76
77
  /*-***************************/
77
-
78
78
  typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2; /* single-symbol decoding */
79
79
 
80
80
  size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
@@ -94,10 +94,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize
94
94
  huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
95
95
  spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
96
96
 
97
- if ((spaceUsed32 << 2) > wkspSize)
98
- return ERROR(tableLog_tooLarge);
99
- workSpace = (U32 *)workSpace + spaceUsed32;
100
- wkspSize -= (spaceUsed32 << 2);
97
+ if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
101
98
 
102
99
  HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
103
100
  /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzers complain ... */
@@ -144,8 +141,10 @@ size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
144
141
  workSpace, sizeof(workSpace));
145
142
  }
146
143
 
144
+ typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4; /* double-symbols decoding */
147
145
 
148
- static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
146
+ FORCE_INLINE_TEMPLATE BYTE
147
+ HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
149
148
  {
150
149
  size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
151
150
  BYTE const c = dt[val].byte;
@@ -156,7 +155,7 @@ static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, con
156
155
  #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
157
156
  *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)
158
157
 
159
- #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
158
+ #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
160
159
  if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
161
160
  HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
162
161
 
@@ -164,30 +163,33 @@ static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, con
164
163
  if (MEM_64bits()) \
165
164
  HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
166
165
 
167
- HINT_INLINE size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
166
+ HINT_INLINE size_t
167
+ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
168
168
  {
169
169
  BYTE* const pStart = p;
170
170
 
171
171
  /* up to 4 symbols at a time */
172
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4)) {
172
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
173
173
  HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
174
174
  HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
175
175
  HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
176
176
  HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
177
177
  }
178
178
 
179
- /* closer to the end */
180
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd))
181
- HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
179
+ /* [0-3] symbols remaining */
180
+ if (MEM_32bits())
181
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd))
182
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
182
183
 
183
- /* no more data to retrieve from bitstream, hence no need to reload */
184
+ /* no more data to retrieve from bitstream, no need to reload */
184
185
  while (p < pEnd)
185
186
  HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
186
187
 
187
188
  return pEnd-pStart;
188
189
  }
189
190
 
190
- static size_t HUF_decompress1X2_usingDTable_internal(
191
+ FORCE_INLINE_TEMPLATE size_t
192
+ HUF_decompress1X2_usingDTable_internal_body(
191
193
  void* dst, size_t dstSize,
192
194
  const void* cSrc, size_t cSrcSize,
193
195
  const HUF_DTable* DTable)
@@ -200,58 +202,17 @@ static size_t HUF_decompress1X2_usingDTable_internal(
200
202
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
201
203
  U32 const dtLog = dtd.tableLog;
202
204
 
203
- { size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);
204
- if (HUF_isError(errorCode)) return errorCode; }
205
+ CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
205
206
 
206
207
  HUF_decodeStreamX2(op, &bitD, oend, dt, dtLog);
207
208
 
208
- /* check */
209
209
  if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
210
210
 
211
211
  return dstSize;
212
212
  }
213
213
 
214
- size_t HUF_decompress1X2_usingDTable(
215
- void* dst, size_t dstSize,
216
- const void* cSrc, size_t cSrcSize,
217
- const HUF_DTable* DTable)
218
- {
219
- DTableDesc dtd = HUF_getDTableDesc(DTable);
220
- if (dtd.tableType != 0) return ERROR(GENERIC);
221
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
222
- }
223
-
224
- size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
225
- const void* cSrc, size_t cSrcSize,
226
- void* workSpace, size_t wkspSize)
227
- {
228
- const BYTE* ip = (const BYTE*) cSrc;
229
-
230
- size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
231
- if (HUF_isError(hSize)) return hSize;
232
- if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
233
- ip += hSize; cSrcSize -= hSize;
234
-
235
- return HUF_decompress1X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx);
236
- }
237
-
238
-
239
- size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
240
- const void* cSrc, size_t cSrcSize)
241
- {
242
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
243
- return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
244
- workSpace, sizeof(workSpace));
245
- }
246
-
247
- size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
248
- {
249
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
250
- return HUF_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
251
- }
252
-
253
-
254
- static size_t HUF_decompress4X2_usingDTable_internal(
214
+ FORCE_INLINE_TEMPLATE size_t
215
+ HUF_decompress4X2_usingDTable_internal_body(
255
216
  void* dst, size_t dstSize,
256
217
  const void* cSrc, size_t cSrcSize,
257
218
  const HUF_DTable* DTable)
@@ -286,23 +247,19 @@ static size_t HUF_decompress4X2_usingDTable_internal(
286
247
  BYTE* op2 = opStart2;
287
248
  BYTE* op3 = opStart3;
288
249
  BYTE* op4 = opStart4;
289
- U32 endSignal;
250
+ U32 endSignal = BIT_DStream_unfinished;
290
251
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
291
252
  U32 const dtLog = dtd.tableLog;
292
253
 
293
254
  if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
294
- { size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1);
295
- if (HUF_isError(errorCode)) return errorCode; }
296
- { size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2);
297
- if (HUF_isError(errorCode)) return errorCode; }
298
- { size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3);
299
- if (HUF_isError(errorCode)) return errorCode; }
300
- { size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4);
301
- if (HUF_isError(errorCode)) return errorCode; }
255
+ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
256
+ CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
257
+ CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
258
+ CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
302
259
 
303
- /* 16-32 symbols per loop (4-8 symbols per stream) */
260
+ /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
304
261
  endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
305
- for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; ) {
262
+ while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) {
306
263
  HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
307
264
  HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
308
265
  HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
@@ -319,10 +276,15 @@ static size_t HUF_decompress4X2_usingDTable_internal(
319
276
  HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
320
277
  HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
321
278
  HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
322
- endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
279
+ BIT_reloadDStream(&bitD1);
280
+ BIT_reloadDStream(&bitD2);
281
+ BIT_reloadDStream(&bitD3);
282
+ BIT_reloadDStream(&bitD4);
323
283
  }
324
284
 
325
285
  /* check corruption */
286
+ /* note : should not be necessary : op# advance in lock step, and we control op4.
287
+ * but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */
326
288
  if (op1 > opStart2) return ERROR(corruption_detected);
327
289
  if (op2 > opStart3) return ERROR(corruption_detected);
328
290
  if (op3 > opStart4) return ERROR(corruption_detected);
@@ -335,8 +297,8 @@ static size_t HUF_decompress4X2_usingDTable_internal(
335
297
  HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog);
336
298
 
337
299
  /* check */
338
- endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
339
- if (!endSignal) return ERROR(corruption_detected);
300
+ { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
301
+ if (!endCheck) return ERROR(corruption_detected); }
340
302
 
341
303
  /* decoded size */
342
304
  return dstSize;
@@ -344,30 +306,309 @@ static size_t HUF_decompress4X2_usingDTable_internal(
344
306
  }
345
307
 
346
308
 
347
- size_t HUF_decompress4X2_usingDTable(
309
+ FORCE_INLINE_TEMPLATE U32
310
+ HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
311
+ {
312
+ size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
313
+ memcpy(op, dt+val, 2);
314
+ BIT_skipBits(DStream, dt[val].nbBits);
315
+ return dt[val].length;
316
+ }
317
+
318
+ FORCE_INLINE_TEMPLATE U32
319
+ HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
320
+ {
321
+ size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
322
+ memcpy(op, dt+val, 1);
323
+ if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
324
+ else {
325
+ if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
326
+ BIT_skipBits(DStream, dt[val].nbBits);
327
+ if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
328
+ /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
329
+ DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
330
+ } }
331
+ return 1;
332
+ }
333
+
334
+ #define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
335
+ ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
336
+
337
+ #define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
338
+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
339
+ ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
340
+
341
+ #define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
342
+ if (MEM_64bits()) \
343
+ ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
344
+
345
+ HINT_INLINE size_t
346
+ HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
347
+ const HUF_DEltX4* const dt, const U32 dtLog)
348
+ {
349
+ BYTE* const pStart = p;
350
+
351
+ /* up to 8 symbols at a time */
352
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
353
+ HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
354
+ HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
355
+ HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
356
+ HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
357
+ }
358
+
359
+ /* closer to end : up to 2 symbols at a time */
360
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
361
+ HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
362
+
363
+ while (p <= pEnd-2)
364
+ HUF_DECODE_SYMBOLX4_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
365
+
366
+ if (p < pEnd)
367
+ p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
368
+
369
+ return p-pStart;
370
+ }
371
+
372
+ FORCE_INLINE_TEMPLATE size_t
373
+ HUF_decompress1X4_usingDTable_internal_body(
374
+ void* dst, size_t dstSize,
375
+ const void* cSrc, size_t cSrcSize,
376
+ const HUF_DTable* DTable)
377
+ {
378
+ BIT_DStream_t bitD;
379
+
380
+ /* Init */
381
+ CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
382
+
383
+ /* decode */
384
+ { BYTE* const ostart = (BYTE*) dst;
385
+ BYTE* const oend = ostart + dstSize;
386
+ const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */
387
+ const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
388
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
389
+ HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog);
390
+ }
391
+
392
+ /* check */
393
+ if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
394
+
395
+ /* decoded size */
396
+ return dstSize;
397
+ }
398
+
399
+
400
+ FORCE_INLINE_TEMPLATE size_t
401
+ HUF_decompress4X4_usingDTable_internal_body(
402
+ void* dst, size_t dstSize,
403
+ const void* cSrc, size_t cSrcSize,
404
+ const HUF_DTable* DTable)
405
+ {
406
+ if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
407
+
408
+ { const BYTE* const istart = (const BYTE*) cSrc;
409
+ BYTE* const ostart = (BYTE*) dst;
410
+ BYTE* const oend = ostart + dstSize;
411
+ const void* const dtPtr = DTable+1;
412
+ const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
413
+
414
+ /* Init */
415
+ BIT_DStream_t bitD1;
416
+ BIT_DStream_t bitD2;
417
+ BIT_DStream_t bitD3;
418
+ BIT_DStream_t bitD4;
419
+ size_t const length1 = MEM_readLE16(istart);
420
+ size_t const length2 = MEM_readLE16(istart+2);
421
+ size_t const length3 = MEM_readLE16(istart+4);
422
+ size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
423
+ const BYTE* const istart1 = istart + 6; /* jumpTable */
424
+ const BYTE* const istart2 = istart1 + length1;
425
+ const BYTE* const istart3 = istart2 + length2;
426
+ const BYTE* const istart4 = istart3 + length3;
427
+ size_t const segmentSize = (dstSize+3) / 4;
428
+ BYTE* const opStart2 = ostart + segmentSize;
429
+ BYTE* const opStart3 = opStart2 + segmentSize;
430
+ BYTE* const opStart4 = opStart3 + segmentSize;
431
+ BYTE* op1 = ostart;
432
+ BYTE* op2 = opStart2;
433
+ BYTE* op3 = opStart3;
434
+ BYTE* op4 = opStart4;
435
+ U32 endSignal;
436
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
437
+ U32 const dtLog = dtd.tableLog;
438
+
439
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
440
+ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
441
+ CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
442
+ CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
443
+ CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
444
+
445
+ /* 16-32 symbols per loop (4-8 symbols per stream) */
446
+ endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
447
+ for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
448
+ HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
449
+ HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
450
+ HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
451
+ HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
452
+ HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
453
+ HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
454
+ HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
455
+ HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
456
+ HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
457
+ HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
458
+ HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
459
+ HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
460
+ HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
461
+ HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
462
+ HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
463
+ HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
464
+
465
+ endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
466
+ }
467
+
468
+ /* check corruption */
469
+ if (op1 > opStart2) return ERROR(corruption_detected);
470
+ if (op2 > opStart3) return ERROR(corruption_detected);
471
+ if (op3 > opStart4) return ERROR(corruption_detected);
472
+ /* note : op4 already verified within main loop */
473
+
474
+ /* finish bitStreams one by one */
475
+ HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
476
+ HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
477
+ HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
478
+ HUF_decodeStreamX4(op4, &bitD4, oend, dt, dtLog);
479
+
480
+ /* check */
481
+ { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
482
+ if (!endCheck) return ERROR(corruption_detected); }
483
+
484
+ /* decoded size */
485
+ return dstSize;
486
+ }
487
+ }
488
+
489
+
490
+ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
491
+ const void *cSrc,
492
+ size_t cSrcSize,
493
+ const HUF_DTable *DTable);
494
+ #if DYNAMIC_BMI2
495
+
496
+ #define X(fn) \
497
+ \
498
+ static size_t fn##_default( \
499
+ void* dst, size_t dstSize, \
500
+ const void* cSrc, size_t cSrcSize, \
501
+ const HUF_DTable* DTable) \
502
+ { \
503
+ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
504
+ } \
505
+ \
506
+ static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \
507
+ void* dst, size_t dstSize, \
508
+ const void* cSrc, size_t cSrcSize, \
509
+ const HUF_DTable* DTable) \
510
+ { \
511
+ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
512
+ } \
513
+ \
514
+ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
515
+ size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
516
+ { \
517
+ if (bmi2) { \
518
+ return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
519
+ } \
520
+ return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
521
+ }
522
+
523
+ #else
524
+
525
+ #define X(fn) \
526
+ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
527
+ size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
528
+ { \
529
+ (void)bmi2; \
530
+ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
531
+ }
532
+
533
+ #endif
534
+
535
+ X(HUF_decompress1X2_usingDTable_internal)
536
+ X(HUF_decompress4X2_usingDTable_internal)
537
+ X(HUF_decompress1X4_usingDTable_internal)
538
+ X(HUF_decompress4X4_usingDTable_internal)
539
+
540
+ #undef X
541
+
542
+
543
+ size_t HUF_decompress1X2_usingDTable(
348
544
  void* dst, size_t dstSize,
349
545
  const void* cSrc, size_t cSrcSize,
350
546
  const HUF_DTable* DTable)
351
547
  {
352
548
  DTableDesc dtd = HUF_getDTableDesc(DTable);
353
549
  if (dtd.tableType != 0) return ERROR(GENERIC);
354
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
550
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
355
551
  }
356
552
 
357
-
358
- size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
553
+ size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
359
554
  const void* cSrc, size_t cSrcSize,
360
555
  void* workSpace, size_t wkspSize)
361
556
  {
362
557
  const BYTE* ip = (const BYTE*) cSrc;
363
558
 
559
+ size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
560
+ if (HUF_isError(hSize)) return hSize;
561
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
562
+ ip += hSize; cSrcSize -= hSize;
563
+
564
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
565
+ }
566
+
567
+
568
+ size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
569
+ const void* cSrc, size_t cSrcSize)
570
+ {
571
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
572
+ return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
573
+ workSpace, sizeof(workSpace));
574
+ }
575
+
576
+ size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
577
+ {
578
+ HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
579
+ return HUF_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
580
+ }
581
+
582
+ size_t HUF_decompress4X2_usingDTable(
583
+ void* dst, size_t dstSize,
584
+ const void* cSrc, size_t cSrcSize,
585
+ const HUF_DTable* DTable)
586
+ {
587
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
588
+ if (dtd.tableType != 0) return ERROR(GENERIC);
589
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
590
+ }
591
+
592
+ static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
593
+ const void* cSrc, size_t cSrcSize,
594
+ void* workSpace, size_t wkspSize, int bmi2)
595
+ {
596
+ const BYTE* ip = (const BYTE*) cSrc;
597
+
364
598
  size_t const hSize = HUF_readDTableX2_wksp (dctx, cSrc, cSrcSize,
365
599
  workSpace, wkspSize);
366
600
  if (HUF_isError(hSize)) return hSize;
367
601
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
368
602
  ip += hSize; cSrcSize -= hSize;
369
603
 
370
- return HUF_decompress4X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, dctx);
604
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
605
+ }
606
+
607
+ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
608
+ const void* cSrc, size_t cSrcSize,
609
+ void* workSpace, size_t wkspSize)
610
+ {
611
+ return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
371
612
  }
372
613
 
373
614
 
@@ -387,8 +628,6 @@ size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cS
387
628
  /* *************************/
388
629
  /* double-symbols decoding */
389
630
  /* *************************/
390
- typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4; /* double-symbols decoding */
391
-
392
631
  typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
393
632
 
394
633
  /* HUF_fillDTableX4Level2() :
@@ -508,10 +747,7 @@ size_t HUF_readDTableX4_wksp(HUF_DTable* DTable, const void* src,
508
747
  weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
509
748
  spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
510
749
 
511
- if ((spaceUsed32 << 2) > wkspSize)
512
- return ERROR(tableLog_tooLarge);
513
- workSpace = (U32 *)workSpace + spaceUsed32;
514
- wkspSize -= (spaceUsed32 << 2);
750
+ if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
515
751
 
516
752
  rankStart = rankStart0 + 1;
517
753
  memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
@@ -588,95 +824,6 @@ size_t HUF_readDTableX4(HUF_DTable* DTable, const void* src, size_t srcSize)
588
824
  workSpace, sizeof(workSpace));
589
825
  }
590
826
 
591
- static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
592
- {
593
- size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
594
- memcpy(op, dt+val, 2);
595
- BIT_skipBits(DStream, dt[val].nbBits);
596
- return dt[val].length;
597
- }
598
-
599
- static U32 HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
600
- {
601
- size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
602
- memcpy(op, dt+val, 1);
603
- if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
604
- else {
605
- if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
606
- BIT_skipBits(DStream, dt[val].nbBits);
607
- if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
608
- /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
609
- DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
610
- } }
611
- return 1;
612
- }
613
-
614
-
615
- #define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
616
- ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
617
-
618
- #define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
619
- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
620
- ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
621
-
622
- #define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
623
- if (MEM_64bits()) \
624
- ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
625
-
626
- HINT_INLINE size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const HUF_DEltX4* const dt, const U32 dtLog)
627
- {
628
- BYTE* const pStart = p;
629
-
630
- /* up to 8 symbols at a time */
631
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
632
- HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
633
- HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
634
- HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
635
- HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
636
- }
637
-
638
- /* closer to end : up to 2 symbols at a time */
639
- while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
640
- HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
641
-
642
- while (p <= pEnd-2)
643
- HUF_DECODE_SYMBOLX4_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
644
-
645
- if (p < pEnd)
646
- p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
647
-
648
- return p-pStart;
649
- }
650
-
651
-
652
- static size_t HUF_decompress1X4_usingDTable_internal(
653
- void* dst, size_t dstSize,
654
- const void* cSrc, size_t cSrcSize,
655
- const HUF_DTable* DTable)
656
- {
657
- BIT_DStream_t bitD;
658
-
659
- /* Init */
660
- { size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);
661
- if (HUF_isError(errorCode)) return errorCode;
662
- }
663
-
664
- /* decode */
665
- { BYTE* const ostart = (BYTE*) dst;
666
- BYTE* const oend = ostart + dstSize;
667
- const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */
668
- const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
669
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
670
- HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog);
671
- }
672
-
673
- /* check */
674
- if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
675
-
676
- /* decoded size */
677
- return dstSize;
678
- }
679
-
680
827
  size_t HUF_decompress1X4_usingDTable(
681
828
  void* dst, size_t dstSize,
682
829
  const void* cSrc, size_t cSrcSize,
@@ -684,7 +831,7 @@ size_t HUF_decompress1X4_usingDTable(
684
831
  {
685
832
  DTableDesc dtd = HUF_getDTableDesc(DTable);
686
833
  if (dtd.tableType != 1) return ERROR(GENERIC);
687
- return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
834
+ return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
688
835
  }
689
836
 
690
837
  size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
@@ -699,7 +846,7 @@ size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
699
846
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
700
847
  ip += hSize; cSrcSize -= hSize;
701
848
 
702
- return HUF_decompress1X4_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx);
849
+ return HUF_decompress1X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
703
850
  }
704
851
 
705
852
 
@@ -717,99 +864,6 @@ size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cS
717
864
  return HUF_decompress1X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
718
865
  }
719
866
 
720
- static size_t HUF_decompress4X4_usingDTable_internal(
721
- void* dst, size_t dstSize,
722
- const void* cSrc, size_t cSrcSize,
723
- const HUF_DTable* DTable)
724
- {
725
- if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
726
-
727
- { const BYTE* const istart = (const BYTE*) cSrc;
728
- BYTE* const ostart = (BYTE*) dst;
729
- BYTE* const oend = ostart + dstSize;
730
- const void* const dtPtr = DTable+1;
731
- const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
732
-
733
- /* Init */
734
- BIT_DStream_t bitD1;
735
- BIT_DStream_t bitD2;
736
- BIT_DStream_t bitD3;
737
- BIT_DStream_t bitD4;
738
- size_t const length1 = MEM_readLE16(istart);
739
- size_t const length2 = MEM_readLE16(istart+2);
740
- size_t const length3 = MEM_readLE16(istart+4);
741
- size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
742
- const BYTE* const istart1 = istart + 6; /* jumpTable */
743
- const BYTE* const istart2 = istart1 + length1;
744
- const BYTE* const istart3 = istart2 + length2;
745
- const BYTE* const istart4 = istart3 + length3;
746
- size_t const segmentSize = (dstSize+3) / 4;
747
- BYTE* const opStart2 = ostart + segmentSize;
748
- BYTE* const opStart3 = opStart2 + segmentSize;
749
- BYTE* const opStart4 = opStart3 + segmentSize;
750
- BYTE* op1 = ostart;
751
- BYTE* op2 = opStart2;
752
- BYTE* op3 = opStart3;
753
- BYTE* op4 = opStart4;
754
- U32 endSignal;
755
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
756
- U32 const dtLog = dtd.tableLog;
757
-
758
- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
759
- { size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1);
760
- if (HUF_isError(errorCode)) return errorCode; }
761
- { size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2);
762
- if (HUF_isError(errorCode)) return errorCode; }
763
- { size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3);
764
- if (HUF_isError(errorCode)) return errorCode; }
765
- { size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4);
766
- if (HUF_isError(errorCode)) return errorCode; }
767
-
768
- /* 16-32 symbols per loop (4-8 symbols per stream) */
769
- endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
770
- for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
771
- HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
772
- HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
773
- HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
774
- HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
775
- HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
776
- HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
777
- HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
778
- HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
779
- HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
780
- HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
781
- HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
782
- HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
783
- HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
784
- HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
785
- HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
786
- HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
787
-
788
- endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
789
- }
790
-
791
- /* check corruption */
792
- if (op1 > opStart2) return ERROR(corruption_detected);
793
- if (op2 > opStart3) return ERROR(corruption_detected);
794
- if (op3 > opStart4) return ERROR(corruption_detected);
795
- /* note : op4 already verified within main loop */
796
-
797
- /* finish bitStreams one by one */
798
- HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
799
- HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
800
- HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
801
- HUF_decodeStreamX4(op4, &bitD4, oend, dt, dtLog);
802
-
803
- /* check */
804
- { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
805
- if (!endCheck) return ERROR(corruption_detected); }
806
-
807
- /* decoded size */
808
- return dstSize;
809
- }
810
- }
811
-
812
-
813
867
  size_t HUF_decompress4X4_usingDTable(
814
868
  void* dst, size_t dstSize,
815
869
  const void* cSrc, size_t cSrcSize,
@@ -817,13 +871,12 @@ size_t HUF_decompress4X4_usingDTable(
817
871
  {
818
872
  DTableDesc dtd = HUF_getDTableDesc(DTable);
819
873
  if (dtd.tableType != 1) return ERROR(GENERIC);
820
- return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
874
+ return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
821
875
  }
822
876
 
823
-
824
- size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
877
+ static size_t HUF_decompress4X4_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
825
878
  const void* cSrc, size_t cSrcSize,
826
- void* workSpace, size_t wkspSize)
879
+ void* workSpace, size_t wkspSize, int bmi2)
827
880
  {
828
881
  const BYTE* ip = (const BYTE*) cSrc;
829
882
 
@@ -833,7 +886,14 @@ size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
833
886
  if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
834
887
  ip += hSize; cSrcSize -= hSize;
835
888
 
836
- return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx);
889
+ return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
890
+ }
891
+
892
+ size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
893
+ const void* cSrc, size_t cSrcSize,
894
+ void* workSpace, size_t wkspSize)
895
+ {
896
+ return HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
837
897
  }
838
898
 
839
899
 
@@ -861,8 +921,8 @@ size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
861
921
  const HUF_DTable* DTable)
862
922
  {
863
923
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
864
- return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) :
865
- HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
924
+ return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
925
+ HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
866
926
  }
867
927
 
868
928
  size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
@@ -870,8 +930,8 @@ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
870
930
  const HUF_DTable* DTable)
871
931
  {
872
932
  DTableDesc const dtd = HUF_getDTableDesc(DTable);
873
- return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) :
874
- HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
933
+ return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
934
+ HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
875
935
  }
876
936
 
877
937
 
@@ -898,21 +958,22 @@ static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, qu
898
958
  };
899
959
 
900
960
  /** HUF_selectDecoder() :
901
- * Tells which decoder is likely to decode faster,
902
- * based on a set of pre-determined metrics.
903
- * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
904
- * Assumption : 0 < cSrcSize, dstSize <= 128 KB */
961
+ * Tells which decoder is likely to decode faster,
962
+ * based on a set of pre-computed metrics.
963
+ * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
964
+ * Assumption : 0 < dstSize <= 128 KB */
905
965
  U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
906
966
  {
967
+ assert(dstSize > 0);
968
+ assert(dstSize <= 128 KB);
907
969
  /* decoder timing evaluation */
908
- U32 const Q = cSrcSize >= dstSize ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */
909
- U32 const D256 = (U32)(dstSize >> 8);
910
- U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
911
- U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
912
- DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, for cache eviction */
913
-
914
- return DTime1 < DTime0;
915
- }
970
+ { U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */
971
+ U32 const D256 = (U32)(dstSize >> 8);
972
+ U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
973
+ U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
974
+ DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */
975
+ return DTime1 < DTime0;
976
+ } }
916
977
 
917
978
 
918
979
  typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
@@ -994,3 +1055,42 @@ size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
994
1055
  return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
995
1056
  workSpace, sizeof(workSpace));
996
1057
  }
1058
+
1059
+
1060
+ size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1061
+ {
1062
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
1063
+ return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1064
+ HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1065
+ }
1066
+
1067
+ size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1068
+ {
1069
+ const BYTE* ip = (const BYTE*) cSrc;
1070
+
1071
+ size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
1072
+ if (HUF_isError(hSize)) return hSize;
1073
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
1074
+ ip += hSize; cSrcSize -= hSize;
1075
+
1076
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
1077
+ }
1078
+
1079
+ size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
1080
+ {
1081
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
1082
+ return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
1083
+ HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
1084
+ }
1085
+
1086
+ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
1087
+ {
1088
+ /* validation checks */
1089
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
1090
+ if (cSrcSize == 0) return ERROR(corruption_detected);
1091
+
1092
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
1093
+ return algoNb ? HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
1094
+ HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
1095
+ }
1096
+ }