zstd-ruby 1.5.2.2 → 1.5.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. checksums.yaml +4 -4
  2. data/README.md +15 -3
  3. data/ext/zstdruby/common.h +7 -0
  4. data/ext/zstdruby/libzstd/common/bits.h +175 -0
  5. data/ext/zstdruby/libzstd/common/bitstream.h +18 -59
  6. data/ext/zstdruby/libzstd/common/compiler.h +22 -3
  7. data/ext/zstdruby/libzstd/common/cpu.h +1 -1
  8. data/ext/zstdruby/libzstd/common/debug.c +1 -1
  9. data/ext/zstdruby/libzstd/common/debug.h +1 -1
  10. data/ext/zstdruby/libzstd/common/entropy_common.c +12 -40
  11. data/ext/zstdruby/libzstd/common/error_private.c +9 -2
  12. data/ext/zstdruby/libzstd/common/error_private.h +1 -1
  13. data/ext/zstdruby/libzstd/common/fse.h +5 -83
  14. data/ext/zstdruby/libzstd/common/fse_decompress.c +7 -99
  15. data/ext/zstdruby/libzstd/common/huf.h +65 -156
  16. data/ext/zstdruby/libzstd/common/mem.h +39 -46
  17. data/ext/zstdruby/libzstd/common/pool.c +26 -10
  18. data/ext/zstdruby/libzstd/common/pool.h +7 -1
  19. data/ext/zstdruby/libzstd/common/portability_macros.h +22 -3
  20. data/ext/zstdruby/libzstd/common/threading.c +68 -14
  21. data/ext/zstdruby/libzstd/common/threading.h +5 -10
  22. data/ext/zstdruby/libzstd/common/xxhash.c +2 -2
  23. data/ext/zstdruby/libzstd/common/xxhash.h +8 -8
  24. data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
  25. data/ext/zstdruby/libzstd/common/zstd_deps.h +1 -1
  26. data/ext/zstdruby/libzstd/common/zstd_internal.h +17 -113
  27. data/ext/zstdruby/libzstd/common/zstd_trace.h +3 -3
  28. data/ext/zstdruby/libzstd/compress/clevels.h +1 -1
  29. data/ext/zstdruby/libzstd/compress/fse_compress.c +7 -124
  30. data/ext/zstdruby/libzstd/compress/hist.c +1 -1
  31. data/ext/zstdruby/libzstd/compress/hist.h +1 -1
  32. data/ext/zstdruby/libzstd/compress/huf_compress.c +234 -169
  33. data/ext/zstdruby/libzstd/compress/zstd_compress.c +1055 -455
  34. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +165 -145
  35. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +115 -39
  36. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +16 -8
  37. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +3 -3
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +1 -1
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +25 -21
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +1 -1
  41. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +5 -3
  42. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +95 -33
  43. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +3 -2
  44. data/ext/zstdruby/libzstd/compress/zstd_fast.c +433 -148
  45. data/ext/zstdruby/libzstd/compress/zstd_fast.h +3 -2
  46. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +306 -283
  47. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +4 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +5 -5
  49. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  50. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +1 -1
  51. data/ext/zstdruby/libzstd/compress/zstd_opt.c +104 -80
  52. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  53. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +12 -5
  54. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +1 -1
  55. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +434 -441
  56. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +30 -39
  57. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +3 -4
  58. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +1 -1
  59. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +164 -42
  60. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +186 -65
  61. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +1 -1
  62. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +4 -2
  63. data/ext/zstdruby/libzstd/dictBuilder/cover.c +19 -15
  64. data/ext/zstdruby/libzstd/dictBuilder/cover.h +1 -1
  65. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +2 -2
  66. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +9 -87
  67. data/ext/zstdruby/libzstd/zdict.h +53 -31
  68. data/ext/zstdruby/libzstd/zstd.h +489 -90
  69. data/ext/zstdruby/libzstd/zstd_errors.h +27 -8
  70. data/ext/zstdruby/main.c +4 -0
  71. data/ext/zstdruby/streaming_compress.c +1 -7
  72. data/ext/zstdruby/zstdruby.c +110 -26
  73. data/lib/zstd-ruby/version.rb +1 -1
  74. data/lib/zstd-ruby.rb +0 -1
  75. metadata +7 -6
data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -20,12 +20,12 @@
  #include "../common/mem.h" /* low level memory routines */
  #define FSE_STATIC_LINKING_ONLY
  #include "../common/fse.h"
- #define HUF_STATIC_LINKING_ONLY
  #include "../common/huf.h"
  #include "../common/zstd_internal.h"
  #include "zstd_decompress_internal.h" /* ZSTD_DCtx */
  #include "zstd_ddict.h" /* ZSTD_DDictDictContent */
  #include "zstd_decompress_block.h"
+ #include "../common/bits.h" /* ZSTD_highbit32 */

  /*_*******************************************************
  * Macros
@@ -89,7 +89,7 @@ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const
  dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
  }
  else {
- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */
+ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
  dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
  dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
  }
@@ -134,13 +134,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  ZSTD_FALLTHROUGH;

  case set_compressed:
- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
+ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
  { size_t lhSize, litSize, litCSize;
  U32 singleStream=0;
  U32 const lhlCode = (istart[0] >> 2) & 3;
  U32 const lhc = MEM_readLE32(istart);
  size_t hufSuccess;
  size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
+ int const flags = 0
+ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
+ | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
  switch(lhlCode)
  {
  case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -165,6 +168,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  }
  RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
  RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+ if (!singleStream)
+ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
+ "Not enough literals (%zu) for the 4-streams mode (min %u)",
+ litSize, MIN_LITERALS_FOR_4_STREAMS);
  RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
  RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
  ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);
@@ -176,13 +183,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,

  if (litEncType==set_repeat) {
  if (singleStream) {
- hufSuccess = HUF_decompress1X_usingDTable_bmi2(
+ hufSuccess = HUF_decompress1X_usingDTable(
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
+ dctx->HUFptr, flags);
  } else {
- hufSuccess = HUF_decompress4X_usingDTable_bmi2(
+ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
+ hufSuccess = HUF_decompress4X_usingDTable(
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
+ dctx->HUFptr, flags);
  }
  } else {
  if (singleStream) {
@@ -190,18 +198,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  hufSuccess = HUF_decompress1X_DCtx_wksp(
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
  istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace));
+ sizeof(dctx->workspace), flags);
  #else
- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
+ hufSuccess = HUF_decompress1X1_DCtx_wksp(
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
  istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
+ sizeof(dctx->workspace), flags);
  #endif
  } else {
- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
+ hufSuccess = HUF_decompress4X_hufOnly_wksp(
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
  istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
+ sizeof(dctx->workspace), flags);
  }
  }
  if (dctx->litBufferLocation == ZSTD_split)
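The hunks above all track the same API migration: libzstd 1.5.4 retires the separate HUF_*_bmi2() entry points in favor of single functions that take a flags bitfield. A minimal sketch of the new calling convention follows; the wrapper name and its bmi2/noAsm parameters are hypothetical, while HUF_flags_bmi2, HUF_flags_disableAsm, and HUF_decompress4X_usingDTable are taken from the diff itself:

#include <stddef.h>
#include "huf.h"  /* libzstd internal header: HUF_DTable, HUF_flags_*, HUF_decompress4X_usingDTable */

/* Build the flags word once, then pass it to any HUF decode entry point. */
static size_t decompress4X_sketch(void* dst, size_t dstSize,
                                  const void* cSrc, size_t cSrcSize,
                                  const HUF_DTable* dtable,
                                  int bmi2, int noAsm)
{
    int flags = 0;
    if (bmi2)  flags |= HUF_flags_bmi2;        /* select the BMI2 code path */
    if (noAsm) flags |= HUF_flags_disableAsm;  /* skip the hand-written x86-64 asm */
    return HUF_decompress4X_usingDTable(dst, dstSize, cSrc, cSrcSize, dtable, flags);
}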
@@ -237,6 +245,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  break;
  case 3:
  lhSize = 3;
+ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
  litSize = MEM_readLE24(istart) >> 4;
  break;
  }
@@ -279,12 +288,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  break;
  case 1:
  lhSize = 2;
+ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
  litSize = MEM_readLE16(istart) >> 4;
  break;
  case 3:
  lhSize = 3;
+ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
  litSize = MEM_readLE24(istart) >> 4;
- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
  break;
  }
  RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
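To make the header parsing above concrete, here is a worked example with a fabricated two-byte literals header (constructed for illustration, not taken from the package):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* Hypothetical raw-literals header bytes: size format 1 => lhSize == 2 */
    uint8_t const istart[2] = { 0x54, 0x0A };
    unsigned const lhlCode = (istart[0] >> 2) & 3;
    unsigned const le16 = (unsigned)istart[0] | ((unsigned)istart[1] << 8);
    assert(lhlCode == 1);        /* 2-byte header, 12-bit litSize */
    assert((le16 >> 4) == 165);  /* litSize: the upper 12 bits of 0x0A54 */
    return 0;
}

This also shows why the new srcSize<3 guard matters: reading the 16-bit field requires both header bytes to be present in the source buffer.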
@@ -506,14 +516,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
  for (i = 8; i < n; i += 8) {
  MEM_write64(spread + pos + i, sv);
  }
- pos += n;
+ assert(n>=0);
+ pos += (size_t)n;
  }
  }
  /* Now we spread those positions across the table.
- * The benefit of doing it in two stages is that we avoid the the
+ * The benefit of doing it in two stages is that we avoid the
  * variable size inner loop, which caused lots of branch misses.
  * Now we can run through all the positions without any branch misses.
- * We unroll the loop twice, since that is what emperically worked best.
+ * We unroll the loop twice, since that is what empirically worked best.
  */
  {
  size_t position = 0;
@@ -540,7 +551,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
  for (i=0; i<n; i++) {
  tableDecode[position].baseValue = s;
  position = (position + step) & tableMask;
- while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
+ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */
  } }
  assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
  }
@@ -551,7 +562,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
  for (u=0; u<tableSize; u++) {
  U32 const symbol = tableDecode[u].baseValue;
  U32 const nextState = symbolNext[symbol]++;
- tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+ tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
  tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
  assert(nbAdditionalBits[symbol] < 255);
  tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
@@ -964,6 +975,11 @@ size_t ZSTD_execSequence(BYTE* op,

  assert(op != NULL /* Precondition */);
  assert(oend_w < oend /* No underflow */);
+
+ #if defined(__aarch64__)
+ /* prefetch sequence starting from match that will be used for copy later */
+ PREFETCH_L1(match);
+ #endif
  /* Handle edge cases in a slow path:
  * - Read beyond end of literals
  * - Match end is within WILDCOPY_OVERLIMIT of oend
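For context on the prefetch added above (an assumption from libzstd's lib/common/compiler.h, which is not part of this diff): on GCC and Clang, PREFETCH_L1 expands roughly as below, and aarch64 lowers it to a prfm pldl1keep instruction.

/* Paraphrased sketch, not the verbatim libzstd definition: */
#if defined(__GNUC__) || defined(__clang__)
#  define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* read */, 3 /* high temporal locality */)
#else
#  define PREFETCH_L1(ptr) (void)(ptr)  /* no-op fallback */
#endif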
@@ -1154,7 +1170,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16
  }

  /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
+ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
  * bits before reloading. This value is the maximum number of bytes we read
  * after reloading when we are decoding long offsets.
  */
@@ -1169,9 +1185,27 @@ FORCE_INLINE_TEMPLATE seq_t
  ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
  {
  seq_t seq;
+ /*
+ * ZSTD_seqSymbol is a structure with a total of 64 bits wide. So it can be
+ * loaded in one operation and extracted its fields by simply shifting or
+ * bit-extracting on aarch64.
+ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
+ * operations that cause performance drop. This can be avoided by using this
+ * ZSTD_memcpy hack.
+ */
+ #if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
+ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
+ ZSTD_seqSymbol* const llDInfo = &llDInfoS;
+ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
+ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
+ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
+ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
+ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
+ #else
  const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
  const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
  const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
+ #endif
  seq.matchLength = mlDInfo->baseValue;
  seq.litLength = llDInfo->baseValue;
  { U32 const ofBase = ofDInfo->baseValue;
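The ZSTD_memcpy workaround above is a general idiom. A standalone illustration, using a hypothetical 8-byte entry type rather than zstd's actual ZSTD_seqSymbol field layout: a fixed-size memcpy of a whole 64-bit struct is folded away by the compiler into a single 8-byte load plus shifts, instead of separate ldrh/ldrb/ldr field loads, and it stays within strict-aliasing rules.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical 8-byte table entry, standing in for ZSTD_seqSymbol. */
typedef struct { uint16_t nextState; uint8_t nbAddBits; uint8_t nbBits; uint32_t baseValue; } Entry;

static inline Entry load_entry(const Entry* table, size_t state)
{
    Entry e;
    memcpy(&e, table + state, sizeof e);  /* one 64-bit load on aarch64/GCC */
    return e;
}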
@@ -1186,9 +1220,13 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
  U32 const llnbBits = llDInfo->nbBits;
  U32 const mlnbBits = mlDInfo->nbBits;
  U32 const ofnbBits = ofDInfo->nbBits;
+
+ assert(llBits <= MaxLLBits);
+ assert(mlBits <= MaxMLBits);
+ assert(ofBits <= MaxOff);
  /*
  * As gcc has better branch and block analyzers, sometimes it is only
- * valuable to mark likelyness for clang, it gives around 3-4% of
+ * valuable to mark likeliness for clang, it gives around 3-4% of
  * performance.
  */

@@ -1201,13 +1239,16 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
  #endif
  ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
  ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
- assert(ofBits <= MaxOff);
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
  if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
+ /* Always read extra bits, this keeps the logic simple,
+ * avoids branches, and avoids accidentally reading 0 bits.
+ */
+ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
  offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
  BIT_reloadDStream(&seqState->DStream);
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
+ offset += BIT_readBitsFast(&seqState->DStream, extraBits);
  } else {
  offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
  if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
@@ -1552,7 +1593,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
  const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
  const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
  const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
- DEBUGLOG(5, "ZSTD_decompressSequences_body");
+ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
  (void)frame;

  /* Regen sequences */
@@ -1945,34 +1986,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */


+ /**
+ * @returns The total size of the history referencable by zstd, including
+ * both the prefix and the extDict. At @p op any offset larger than this
+ * is invalid.
+ */
+ static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
+ {
+ return (size_t)(op - virtualStart);
+ }

- #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
- /* ZSTD_getLongOffsetsShare() :
+ typedef struct {
+ unsigned longOffsetShare;
+ unsigned maxNbAdditionalBits;
+ } ZSTD_OffsetInfo;
+
+ /* ZSTD_getOffsetInfo() :
  * condition : offTable must be valid
  * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
- * compared to maximum possible of (1<<OffFSELog) */
- static unsigned
- ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
+ * compared to maximum possible of (1<<OffFSELog),
+ * as well as the maximum number additional bits required.
+ */
+ static ZSTD_OffsetInfo
+ ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
  {
- const void* ptr = offTable;
- U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
- const ZSTD_seqSymbol* table = offTable + 1;
- U32 const max = 1 << tableLog;
- U32 u, total = 0;
- DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
-
- assert(max <= (1 << OffFSELog)); /* max not too large */
- for (u=0; u<max; u++) {
- if (table[u].nbAdditionalBits > 22) total += 1;
+ ZSTD_OffsetInfo info = {0, 0};
+ /* If nbSeq == 0, then the offTable is uninitialized, but we have
+ * no sequences, so both values should be 0.
+ */
+ if (nbSeq != 0) {
+ const void* ptr = offTable;
+ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
+ const ZSTD_seqSymbol* table = offTable + 1;
+ U32 const max = 1 << tableLog;
+ U32 u;
+ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
+
+ assert(max <= (1 << OffFSELog)); /* max not too large */
+ for (u=0; u<max; u++) {
+ info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
+ if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
+ }
+
+ assert(tableLog <= OffFSELog);
+ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */
  }

- assert(tableLog <= OffFSELog);
- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */
+ return info;
+ }

- return total;
+ /**
+ * @returns The maximum offset we can decode in one read of our bitstream, without
+ * reloading more bits in the middle of the offset bits read. Any offsets larger
+ * than this must use the long offset decoder.
+ */
+ static size_t ZSTD_maxShortOffset(void)
+ {
+ if (MEM_64bits()) {
+ /* We can decode any offset without reloading bits.
+ * This might change if the max window size grows.
+ */
+ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+ return (size_t)-1;
+ } else {
+ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
+ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
+ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
+ */
+ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
+ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
+ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
+ return maxOffset;
+ }
  }
- #endif

  size_t
  ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
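A quick numeric check of the 32-bit branch of ZSTD_maxShortOffset() above, assuming STREAM_ACCUMULATOR_MIN resolves to 25 on 32-bit targets (per libzstd's bitstream.h) and ZSTD_REP_NUM == 3; neither definition appears in this diff:

#include <assert.h>

int main(void)
{
    unsigned long const accMin = 25;  /* assumed 32-bit STREAM_ACCUMULATOR_MIN */
    unsigned long const maxOffbase = (1UL << (accMin + 1)) - 1;
    assert(maxOffbase == 67108863UL);      /* 2^26 - 1 */
    assert(maxOffbase - 3 == 67108860UL);  /* minus ZSTD_REP_NUM: ~64 MiB */
    return 0;
}

So on 32-bit builds, any match offset beyond roughly 64 MiB must take the long-offset decoding path, while 64-bit builds never need it.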
@@ -1980,20 +2066,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
  { /* blockType == blockCompressed */
  const BYTE* ip = (const BYTE*)src;
- /* isLongOffset must be true if there are long offsets.
- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
- * We don't expect that to be the case in 64-bit mode.
- * In block mode, window size is not known, so we have to be conservative.
- * (note: but it could be evaluated from current-lowLimit)
- */
- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
  DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);

- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
+ /* Note : the wording of the specification
+ * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX.
+ * This generally does not happen, as it makes little sense,
+ * since an uncompressed block would feature same size and have no decompression cost.
+ * Also, note that decoder from reference libzstd before < v1.5.4
+ * would consider this edge case as an error.
+ * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX
+ * for broader compatibility with the deployed ecosystem of zstd decoders */
+ RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");

  /* Decode literals section */
  { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
  if (ZSTD_isError(litCSize)) return litCSize;
  ip += litCSize;
  srcSize -= litCSize;
@@ -2001,6 +2088,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,

  /* Build Decoding Tables */
  {
+ /* Compute the maximum block size, which must also work when !frame and fParams are unset.
+ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
+ */
+ size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX));
+ size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart);
+ /* isLongOffset must be true if there are long offsets.
+ * Offsets are long if they are larger than ZSTD_maxShortOffset().
+ * We don't expect that to be the case in 64-bit mode.
+ *
+ * We check here to see if our history is large enough to allow long offsets.
+ * If it isn't, then we can't possible have (valid) long offsets. If the offset
+ * is invalid, then it is okay to read it incorrectly.
+ *
+ * If isLongOffsets is true, then we will later check our decoding table to see
+ * if it is even possible to generate long offsets.
+ */
+ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
  /* These macros control at build-time which decompressor implementation
  * we use. If neither is defined, we do some inspection and dispatch at
  * runtime.
@@ -2008,6 +2112,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
  !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
  int usePrefetchDecoder = dctx->ddictIsCold;
+ #else
+ /* Set to 1 to avoid computing offset info if we don't need to.
+ * Otherwise this value is ignored.
+ */
+ int usePrefetchDecoder = 1;
  #endif
  int nbSeq;
  size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
@@ -2017,26 +2126,38 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,

  RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");

- #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
- if ( !usePrefetchDecoder
- && (!frame || (dctx->fParams.windowSize > (1<<24)))
- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */
- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
- usePrefetchDecoder = (shareLongOffsets >= minShare);
+ /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
+ * compute information about the share of long offsets, and the maximum nbAdditionalBits.
+ * NOTE: could probably use a larger nbSeq limit
+ */
+ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
+ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
+ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
+ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
+ * enough, then we know it is impossible to have too long an offset in this block, so we can
+ * use the regular offset decoder.
+ */
+ isLongOffset = ZSTD_lo_isRegularOffset;
+ }
+ if (!usePrefetchDecoder) {
+ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+ usePrefetchDecoder = (info.longOffsetShare >= minShare);
+ }
  }
- #endif

  dctx->ddictIsCold = 0;

  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
  !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
- if (usePrefetchDecoder)
+ if (usePrefetchDecoder) {
+ #else
+ (void)usePrefetchDecoder;
+ {
  #endif
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
  return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
  #endif
+ }

  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
  /* else */
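A note on the heuristic retained above: OffFSELog is 8 in libzstd (an assumption; the constant is defined outside this diff), so the share is scaled to a table of 1 << 8 = 256 cells, and the minShare thresholds of 7 and 20 correspond to 7/256 ≈ 2.73% and 20/256 ≈ 7.81% of offset codes being long, exactly the percentages quoted in the comment.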
data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -75,12 +75,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = {

  #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
  #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32))
+ #define ZSTD_HUFFDTABLE_CAPACITY_LOG 12

  typedef struct {
  ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */
  ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */
  ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
+ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */
  U32 rep[ZSTD_REP_NUM];
  U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32];
  } ZSTD_entropyDTables_t;
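For scale: assuming huf.h's definition HUF_DTABLE_SIZE(log) == 1 + (1 << (log)), which is not shown in this diff, pinning the capacity log at 12 makes hufTable 1 + (1 << 12) = 4097 U32 entries, about 16 KiB, decoupling the allocation from the HufLog constant it previously tracked.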
@@ -164,6 +165,7 @@ struct ZSTD_DCtx_s
  ZSTD_dictUses_e dictUses;
  ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */
  ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
+ int disableHufAsm;

  /* streaming */
  ZSTD_dStreamStage streamStage;
data/ext/zstdruby/libzstd/dictBuilder/cover.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -34,6 +34,7 @@
  #include "../common/pool.h"
  #include "../common/threading.h"
  #include "../common/zstd_internal.h" /* includes zstd.h */
+ #include "../common/bits.h" /* ZSTD_highbit32 */
  #include "../zdict.h"
  #include "cover.h"

@@ -541,7 +542,7 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) {

  /**
  * Prepare a context for dictionary building.
- * The context is only dependent on the parameter `d` and can used multiple
+ * The context is only dependent on the parameter `d` and can be used multiple
  * times.
  * Returns 0 on success or error code on error.
  * The context must be destroyed with `COVER_ctx_destroy()`.
@@ -646,7 +647,7 @@ static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,

  void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
  {
- const double ratio = (double)nbDmers / maxDictSize;
+ const double ratio = (double)nbDmers / (double)maxDictSize;
  if (ratio >= 10) {
  return;
  }
@@ -950,9 +951,17 @@ void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
  }
  }

+ static COVER_dictSelection_t setDictSelection(BYTE* buf, size_t s, size_t csz)
+ {
+ COVER_dictSelection_t ds;
+ ds.dictContent = buf;
+ ds.dictSize = s;
+ ds.totalCompressedSize = csz;
+ return ds;
+ }
+
  COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
- COVER_dictSelection_t selection = { NULL, 0, error };
- return selection;
+ return setDictSelection(NULL, 0, error);
  }

  unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
@@ -1005,9 +1014,8 @@ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBuffe
  }

  if (params.shrinkDict == 0) {
- COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
  free(candidateDictBuffer);
- return selection;
+ return setDictSelection(largestDictbuffer, dictContentSize, totalCompressedSize);
  }

  largestDict = dictContentSize;
@@ -1039,20 +1047,16 @@
  return COVER_dictSelectionError(totalCompressedSize);
  }

- if (totalCompressedSize <= largestCompressed * regressionTolerance) {
- COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
+ if ((double)totalCompressedSize <= (double)largestCompressed * regressionTolerance) {
  free(largestDictbuffer);
- return selection;
+ return setDictSelection( candidateDictBuffer, dictContentSize, totalCompressedSize );
  }
  dictContentSize *= 2;
  }
  dictContentSize = largestDict;
  totalCompressedSize = largestCompressed;
- {
- COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
- free(candidateDictBuffer);
- return selection;
- }
+ free(candidateDictBuffer);
+ return setDictSelection( largestDictbuffer, dictContentSize, totalCompressedSize );
  }

  /**
data/ext/zstdruby/libzstd/dictBuilder/cover.h
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
data/ext/zstdruby/libzstd/dictBuilder/fastcover.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -304,7 +304,7 @@ FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)

  /**
  * Prepare a context for dictionary building.
- * The context is only dependent on the parameter `d` and can used multiple
+ * The context is only dependent on the parameter `d` and can be used multiple
  * times.
  * Returns 0 on success or error code on error.
  * The context must be destroyed with `FASTCOVER_ctx_destroy()`.