zstdlib 0.7.0 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (75) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGES.md +5 -0
  3. data/ext/zstdlib/extconf.rb +1 -1
  4. data/ext/zstdlib/ruby/zlib-3.0/zstdlib.c +4994 -0
  5. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/bitstream.h +25 -16
  6. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/compiler.h +118 -4
  7. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/cpu.h +1 -3
  8. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/debug.c +1 -1
  9. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/debug.h +12 -19
  10. data/ext/zstdlib/zstd-1.5.0/lib/common/entropy_common.c +362 -0
  11. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/error_private.c +2 -1
  12. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/error_private.h +3 -3
  13. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/fse.h +40 -12
  14. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/fse_decompress.c +139 -22
  15. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/huf.h +29 -7
  16. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/mem.h +69 -98
  17. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/pool.c +23 -17
  18. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/pool.h +2 -2
  19. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/threading.c +6 -5
  20. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/threading.h +0 -0
  21. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/xxhash.c +20 -60
  22. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/xxhash.h +2 -2
  23. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/zstd_common.c +10 -10
  24. data/ext/zstdlib/zstd-1.5.0/lib/common/zstd_deps.h +111 -0
  25. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/common/zstd_internal.h +105 -62
  26. data/ext/zstdlib/zstd-1.5.0/lib/common/zstd_trace.h +154 -0
  27. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/fse_compress.c +31 -24
  28. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/hist.c +27 -29
  29. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/hist.h +2 -2
  30. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/huf_compress.c +265 -126
  31. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress.c +2843 -728
  32. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_internal.h +305 -63
  33. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_literals.c +8 -8
  34. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_literals.h +1 -1
  35. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_sequences.c +29 -7
  36. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_sequences.h +1 -1
  37. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_superblock.c +22 -295
  38. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_compress_superblock.h +1 -1
  39. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_cwksp.h +204 -67
  40. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_double_fast.c +25 -25
  41. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_double_fast.h +1 -1
  42. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_fast.c +23 -23
  43. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_fast.h +1 -1
  44. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_lazy.c +2184 -0
  45. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_lazy.h +125 -0
  46. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_ldm.c +314 -211
  47. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_ldm.h +9 -2
  48. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_ldm_geartab.h +103 -0
  49. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_opt.c +191 -46
  50. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstd_opt.h +1 -1
  51. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/compress/zstdmt_compress.c +93 -415
  52. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstdmt_compress.h +110 -0
  53. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/huf_decompress.c +342 -239
  54. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/zstd_ddict.c +9 -9
  55. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/zstd_ddict.h +2 -2
  56. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/zstd_decompress.c +369 -87
  57. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/zstd_decompress_block.c +191 -75
  58. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/zstd_decompress_block.h +6 -3
  59. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/decompress/zstd_decompress_internal.h +27 -11
  60. data/ext/zstdlib/zstd-1.5.0/lib/zdict.h +452 -0
  61. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/lib/zstd.h +568 -126
  62. data/ext/zstdlib/{zstd-1.4.5/lib/common → zstd-1.5.0/lib}/zstd_errors.h +2 -1
  63. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/gzclose.c +0 -0
  64. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/gzcompatibility.h +1 -1
  65. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/gzguts.h +0 -0
  66. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/gzlib.c +0 -0
  67. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/gzread.c +0 -0
  68. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/gzwrite.c +0 -0
  69. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/zstd_zlibwrapper.c +126 -44
  70. data/ext/zstdlib/{zstd-1.4.5 → zstd-1.5.0}/zlibWrapper/zstd_zlibwrapper.h +1 -1
  71. metadata +70 -65
  72. data/ext/zstdlib/zstd-1.4.5/lib/common/entropy_common.c +0 -216
  73. data/ext/zstdlib/zstd-1.4.5/lib/compress/zstd_lazy.c +0 -1138
  74. data/ext/zstdlib/zstd-1.4.5/lib/compress/zstd_lazy.h +0 -67
  75. data/ext/zstdlib/zstd-1.4.5/lib/compress/zstdmt_compress.h +0 -192
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,7 +14,7 @@
14
14
  /*-*******************************************************
15
15
  * Dependencies
16
16
  *********************************************************/
17
- #include <string.h> /* memcpy, memmove, memset */
17
+ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
18
18
  #include "../common/compiler.h" /* prefetch */
19
19
  #include "../common/cpu.h" /* bmi2 */
20
20
  #include "../common/mem.h" /* low level memory routines */
@@ -44,7 +44,7 @@
44
44
  /*_*******************************************************
45
45
  * Memory operations
46
46
  **********************************************************/
47
- static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
47
+ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }
48
48
 
49
49
 
50
50
  /*-*************************************************************
@@ -166,7 +166,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
166
166
  dctx->litSize = litSize;
167
167
  dctx->litEntropy = 1;
168
168
  if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
169
- memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
169
+ ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
170
170
  return litCSize + lhSize;
171
171
  }
172
172
 
@@ -191,10 +191,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
191
191
 
192
192
  if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
193
193
  RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
194
- memcpy(dctx->litBuffer, istart+lhSize, litSize);
194
+ ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize);
195
195
  dctx->litPtr = dctx->litBuffer;
196
196
  dctx->litSize = litSize;
197
- memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
197
+ ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
198
198
  return lhSize+litSize;
199
199
  }
200
200
  /* direct reference into compressed stream */
@@ -223,7 +223,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
223
223
  break;
224
224
  }
225
225
  RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
226
- memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
226
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
227
227
  dctx->litPtr = dctx->litBuffer;
228
228
  dctx->litSize = litSize;
229
229
  return lhSize+1;
@@ -236,7 +236,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
236
236
 
237
237
  /* Default FSE distribution tables.
238
238
  * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
239
- * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
239
+ * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
240
240
  * They were generated programmatically with following method :
241
241
  * - start from default distributions, present in /lib/common/zstd_internal.h
242
242
  * - generate tables normally, using ZSTD_buildFSETable()
@@ -364,23 +364,26 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
364
364
  * generate FSE decoding table for one symbol (ll, ml or off)
365
365
  * cannot fail if input is valid =>
366
366
  * all inputs are presumed validated at this stage */
367
- void
368
- ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
367
+ FORCE_INLINE_TEMPLATE
368
+ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
369
369
  const short* normalizedCounter, unsigned maxSymbolValue,
370
370
  const U32* baseValue, const U32* nbAdditionalBits,
371
- unsigned tableLog)
371
+ unsigned tableLog, void* wksp, size_t wkspSize)
372
372
  {
373
373
  ZSTD_seqSymbol* const tableDecode = dt+1;
374
- U16 symbolNext[MaxSeq+1];
375
-
376
374
  U32 const maxSV1 = maxSymbolValue + 1;
377
375
  U32 const tableSize = 1 << tableLog;
378
- U32 highThreshold = tableSize-1;
376
+
377
+ U16* symbolNext = (U16*)wksp;
378
+ BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
379
+ U32 highThreshold = tableSize - 1;
380
+
379
381
 
380
382
  /* Sanity Checks */
381
383
  assert(maxSymbolValue <= MaxSeq);
382
384
  assert(tableLog <= MaxFSELog);
383
-
385
+ assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
386
+ (void)wkspSize;
384
387
  /* Init, lay down lowprob symbols */
385
388
  { ZSTD_seqSymbol_header DTableH;
386
389
  DTableH.tableLog = tableLog;
@@ -396,16 +399,69 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
396
399
  assert(normalizedCounter[s]>=0);
397
400
  symbolNext[s] = (U16)normalizedCounter[s];
398
401
  } } }
399
- memcpy(dt, &DTableH, sizeof(DTableH));
402
+ ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
400
403
  }
401
404
 
402
405
  /* Spread symbols */
403
- { U32 const tableMask = tableSize-1;
406
+ assert(tableSize <= 512);
407
+ /* Specialized symbol spreading for the case when there are
408
+ * no low probability (-1 count) symbols. When compressing
409
+ * small blocks we avoid low probability symbols to hit this
410
+ * case, since header decoding speed matters more.
411
+ */
412
+ if (highThreshold == tableSize - 1) {
413
+ size_t const tableMask = tableSize-1;
414
+ size_t const step = FSE_TABLESTEP(tableSize);
415
+ /* First lay down the symbols in order.
416
+ * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
417
+ * misses since small blocks generally have small table logs, so nearly
418
+ * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
419
+ * our buffer to handle the over-write.
420
+ */
421
+ {
422
+ U64 const add = 0x0101010101010101ull;
423
+ size_t pos = 0;
424
+ U64 sv = 0;
425
+ U32 s;
426
+ for (s=0; s<maxSV1; ++s, sv += add) {
427
+ int i;
428
+ int const n = normalizedCounter[s];
429
+ MEM_write64(spread + pos, sv);
430
+ for (i = 8; i < n; i += 8) {
431
+ MEM_write64(spread + pos + i, sv);
432
+ }
433
+ pos += n;
434
+ }
435
+ }
436
+ /* Now we spread those positions across the table.
437
+ * The benefit of doing it in two stages is that we avoid the
438
+ * variable size inner loop, which caused lots of branch misses.
439
+ * Now we can run through all the positions without any branch misses.
440
+ * We unroll the loop twice, since that is what empirically worked best.
441
+ */
442
+ {
443
+ size_t position = 0;
444
+ size_t s;
445
+ size_t const unroll = 2;
446
+ assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
447
+ for (s = 0; s < (size_t)tableSize; s += unroll) {
448
+ size_t u;
449
+ for (u = 0; u < unroll; ++u) {
450
+ size_t const uPosition = (position + (u * step)) & tableMask;
451
+ tableDecode[uPosition].baseValue = spread[s + u];
452
+ }
453
+ position = (position + (unroll * step)) & tableMask;
454
+ }
455
+ assert(position == 0);
456
+ }
457
+ } else {
458
+ U32 const tableMask = tableSize-1;
404
459
  U32 const step = FSE_TABLESTEP(tableSize);
405
460
  U32 s, position = 0;
406
461
  for (s=0; s<maxSV1; s++) {
407
462
  int i;
408
- for (i=0; i<normalizedCounter[s]; i++) {
463
+ int const n = normalizedCounter[s];
464
+ for (i=0; i<n; i++) {
409
465
  tableDecode[position].baseValue = s;
410
466
  position = (position + step) & tableMask;
411
467
  while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
@@ -414,7 +470,8 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
414
470
  }
415
471
 
416
472
  /* Build Decoding table */
417
- { U32 u;
473
+ {
474
+ U32 u;
418
475
  for (u=0; u<tableSize; u++) {
419
476
  U32 const symbol = tableDecode[u].baseValue;
420
477
  U32 const nextState = symbolNext[symbol]++;
@@ -423,7 +480,46 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
423
480
  assert(nbAdditionalBits[symbol] < 255);
424
481
  tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
425
482
  tableDecode[u].baseValue = baseValue[symbol];
426
- } }
483
+ }
484
+ }
485
+ }
486
+
487
+ /* Avoids the FORCE_INLINE of the _body() function. */
488
+ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
489
+ const short* normalizedCounter, unsigned maxSymbolValue,
490
+ const U32* baseValue, const U32* nbAdditionalBits,
491
+ unsigned tableLog, void* wksp, size_t wkspSize)
492
+ {
493
+ ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
494
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
495
+ }
496
+
497
+ #if DYNAMIC_BMI2
498
+ TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
499
+ const short* normalizedCounter, unsigned maxSymbolValue,
500
+ const U32* baseValue, const U32* nbAdditionalBits,
501
+ unsigned tableLog, void* wksp, size_t wkspSize)
502
+ {
503
+ ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
504
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
505
+ }
506
+ #endif
507
+
508
+ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
509
+ const short* normalizedCounter, unsigned maxSymbolValue,
510
+ const U32* baseValue, const U32* nbAdditionalBits,
511
+ unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
512
+ {
513
+ #if DYNAMIC_BMI2
514
+ if (bmi2) {
515
+ ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
516
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
517
+ return;
518
+ }
519
+ #endif
520
+ (void)bmi2;
521
+ ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
522
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
427
523
  }
428
524
 
429
525
 
@@ -435,7 +531,8 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
435
531
  const void* src, size_t srcSize,
436
532
  const U32* baseValue, const U32* nbAdditionalBits,
437
533
  const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
438
- int ddictIsCold, int nbSeq)
534
+ int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
535
+ int bmi2)
439
536
  {
440
537
  switch(type)
441
538
  {
@@ -467,7 +564,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
467
564
  size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
468
565
  RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
469
566
  RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
470
- ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
567
+ ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
471
568
  *DTablePtr = DTableSpace;
472
569
  return headerSize;
473
570
  }
@@ -480,7 +577,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
480
577
  size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
481
578
  const void* src, size_t srcSize)
482
579
  {
483
- const BYTE* const istart = (const BYTE* const)src;
580
+ const BYTE* const istart = (const BYTE*)src;
484
581
  const BYTE* const iend = istart + srcSize;
485
582
  const BYTE* ip = istart;
486
583
  int nbSeq;
@@ -499,7 +596,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
499
596
  if (nbSeq > 0x7F) {
500
597
  if (nbSeq == 0xFF) {
501
598
  RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
502
- nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
599
+ nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
600
+ ip+=2;
503
601
  } else {
504
602
  RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
505
603
  nbSeq = ((nbSeq-0x80)<<8) + *ip++;
@@ -520,7 +618,9 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
520
618
  ip, iend-ip,
521
619
  LL_base, LL_bits,
522
620
  LL_defaultDTable, dctx->fseEntropy,
523
- dctx->ddictIsCold, nbSeq);
621
+ dctx->ddictIsCold, nbSeq,
622
+ dctx->workspace, sizeof(dctx->workspace),
623
+ dctx->bmi2);
524
624
  RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
525
625
  ip += llhSize;
526
626
  }
@@ -530,7 +630,9 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
530
630
  ip, iend-ip,
531
631
  OF_base, OF_bits,
532
632
  OF_defaultDTable, dctx->fseEntropy,
533
- dctx->ddictIsCold, nbSeq);
633
+ dctx->ddictIsCold, nbSeq,
634
+ dctx->workspace, sizeof(dctx->workspace),
635
+ dctx->bmi2);
534
636
  RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
535
637
  ip += ofhSize;
536
638
  }
@@ -540,7 +642,9 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
540
642
  ip, iend-ip,
541
643
  ML_base, ML_bits,
542
644
  ML_defaultDTable, dctx->fseEntropy,
543
- dctx->ddictIsCold, nbSeq);
645
+ dctx->ddictIsCold, nbSeq,
646
+ dctx->workspace, sizeof(dctx->workspace),
647
+ dctx->bmi2);
544
648
  RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
545
649
  ip += mlhSize;
546
650
  }
@@ -554,7 +658,6 @@ typedef struct {
554
658
  size_t litLength;
555
659
  size_t matchLength;
556
660
  size_t offset;
557
- const BYTE* match;
558
661
  } seq_t;
559
662
 
560
663
  typedef struct {
@@ -568,9 +671,6 @@ typedef struct {
568
671
  ZSTD_fseState stateOffb;
569
672
  ZSTD_fseState stateML;
570
673
  size_t prevOffset[ZSTD_REP_NUM];
571
- const BYTE* prefixStart;
572
- const BYTE* dictEnd;
573
- size_t pos;
574
674
  } seqState_t;
575
675
 
576
676
  /*! ZSTD_overlapCopy8() :
@@ -686,12 +786,12 @@ size_t ZSTD_execSequenceEnd(BYTE* op,
686
786
  RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
687
787
  match = dictEnd - (prefixStart-match);
688
788
  if (match + sequence.matchLength <= dictEnd) {
689
- memmove(oLitEnd, match, sequence.matchLength);
789
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
690
790
  return sequenceLength;
691
791
  }
692
792
  /* span extDict & currentPrefixSegment */
693
793
  { size_t const length1 = dictEnd - match;
694
- memmove(oLitEnd, match, length1);
794
+ ZSTD_memmove(oLitEnd, match, length1);
695
795
  op = oLitEnd + length1;
696
796
  sequence.matchLength -= length1;
697
797
  match = prefixStart;
@@ -752,12 +852,12 @@ size_t ZSTD_execSequence(BYTE* op,
752
852
  RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
753
853
  match = dictEnd + (match - prefixStart);
754
854
  if (match + sequence.matchLength <= dictEnd) {
755
- memmove(oLitEnd, match, sequence.matchLength);
855
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
756
856
  return sequenceLength;
757
857
  }
758
858
  /* span extDict & currentPrefixSegment */
759
859
  { size_t const length1 = dictEnd - match;
760
- memmove(oLitEnd, match, length1);
860
+ ZSTD_memmove(oLitEnd, match, length1);
761
861
  op = oLitEnd + length1;
762
862
  sequence.matchLength -= length1;
763
863
  match = prefixStart;
@@ -832,10 +932,9 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD
832
932
  : 0)
833
933
 
834
934
  typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
835
- typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e;
836
935
 
837
936
  FORCE_INLINE_TEMPLATE seq_t
838
- ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch)
937
+ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
839
938
  {
840
939
  seq_t seq;
841
940
  ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
@@ -910,14 +1009,6 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, c
910
1009
  DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
911
1010
  (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
912
1011
 
913
- if (prefetch == ZSTD_p_prefetch) {
914
- size_t const pos = seqState->pos + seq.litLength;
915
- const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
916
- seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
917
- * No consequence though : no memory access will occur, offset is only used for prefetching */
918
- seqState->pos = pos + seq.matchLength;
919
- }
920
-
921
1012
  /* ANS state update
922
1013
  * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
923
1014
  * clang-9.2.0 does 7% worse with ZSTD_updateFseState().
@@ -948,7 +1039,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, c
948
1039
  }
949
1040
 
950
1041
  #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
951
- static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
1042
+ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
952
1043
  {
953
1044
  size_t const windowSize = dctx->fParams.windowSize;
954
1045
  /* No dictionary used. */
@@ -969,6 +1060,7 @@ MEM_STATIC void ZSTD_assertValidSequence(
969
1060
  seq_t const seq,
970
1061
  BYTE const* prefixStart, BYTE const* virtualStart)
971
1062
  {
1063
+ #if DEBUGLEVEL >= 1
972
1064
  size_t const windowSize = dctx->fParams.windowSize;
973
1065
  size_t const sequenceSize = seq.litLength + seq.matchLength;
974
1066
  BYTE const* const oLitEnd = op + seq.litLength;
@@ -986,6 +1078,9 @@ MEM_STATIC void ZSTD_assertValidSequence(
986
1078
  /* Offset must be within our window. */
987
1079
  assert(seq.offset <= windowSize);
988
1080
  }
1081
+ #else
1082
+ (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
1083
+ #endif
989
1084
  }
990
1085
  #endif
991
1086
 
@@ -1000,7 +1095,7 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1000
1095
  {
1001
1096
  const BYTE* ip = (const BYTE*)seqStart;
1002
1097
  const BYTE* const iend = ip + seqSize;
1003
- BYTE* const ostart = (BYTE* const)dst;
1098
+ BYTE* const ostart = (BYTE*)dst;
1004
1099
  BYTE* const oend = ostart + maxDstSize;
1005
1100
  BYTE* op = ostart;
1006
1101
  const BYTE* litPtr = dctx->litPtr;
@@ -1014,7 +1109,6 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1014
1109
  /* Regen sequences */
1015
1110
  if (nbSeq) {
1016
1111
  seqState_t seqState;
1017
- size_t error = 0;
1018
1112
  dctx->fseEntropy = 1;
1019
1113
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
1020
1114
  RETURN_ERROR_IF(
@@ -1048,13 +1142,14 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1048
1142
  * If you see most cycles served out of the DSB you've hit the good case.
1049
1143
  * If it is pretty even then you may be in an okay case.
1050
1144
  *
1051
- * I've been able to reproduce this issue on the following CPUs:
1145
+ * This issue has been reproduced on the following CPUs:
1052
1146
  * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1053
1147
  * Use Instruments->Counters to get DSB/MITE cycles.
1054
1148
  * I never got performance swings, but I was able to
1055
1149
  * go from the good case of mostly DSB to half of the
1056
1150
  * cycles served from MITE.
1057
1151
  * - Coffeelake: Intel i9-9900k
1152
+ * - Coffeelake: Intel i7-9700k
1058
1153
  *
1059
1154
  * I haven't been able to reproduce the instability or DSB misses on any
1060
1155
  * of the following CPUs:
@@ -1067,33 +1162,35 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1067
1162
  *
1068
1163
  * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
1069
1164
  */
1165
+ __asm__(".p2align 6");
1166
+ __asm__("nop");
1070
1167
  __asm__(".p2align 5");
1071
1168
  __asm__("nop");
1169
+ # if __GNUC__ >= 9
1170
+ /* better for gcc-9 and gcc-10, worse for clang and gcc-8 */
1171
+ __asm__(".p2align 3");
1172
+ # else
1072
1173
  __asm__(".p2align 4");
1174
+ # endif
1073
1175
  #endif
1074
1176
  for ( ; ; ) {
1075
- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch);
1177
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1076
1178
  size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
1077
1179
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1078
1180
  assert(!ZSTD_isError(oneSeqSize));
1079
1181
  if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1080
1182
  #endif
1183
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1184
+ return oneSeqSize;
1081
1185
  DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1186
+ op += oneSeqSize;
1187
+ if (UNLIKELY(!--nbSeq))
1188
+ break;
1082
1189
  BIT_reloadDStream(&(seqState.DStream));
1083
- /* gcc and clang both don't like early returns in this loop.
1084
- * gcc doesn't like early breaks either.
1085
- * Instead save an error and report it at the end.
1086
- * When there is an error, don't increment op, so we don't
1087
- * overwrite.
1088
- */
1089
- if (UNLIKELY(ZSTD_isError(oneSeqSize))) error = oneSeqSize;
1090
- else op += oneSeqSize;
1091
- if (UNLIKELY(!--nbSeq)) break;
1092
1190
  }
1093
1191
 
1094
1192
  /* check if reached exact end */
1095
1193
  DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
1096
- if (ZSTD_isError(error)) return error;
1097
1194
  RETURN_ERROR_IF(nbSeq, corruption_detected, "");
1098
1195
  RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
1099
1196
  /* save reps for next block */
@@ -1104,7 +1201,7 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1104
1201
  { size_t const lastLLSize = litEnd - litPtr;
1105
1202
  RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
1106
1203
  if (op != NULL) {
1107
- memcpy(op, litPtr, lastLLSize);
1204
+ ZSTD_memcpy(op, litPtr, lastLLSize);
1108
1205
  op += lastLLSize;
1109
1206
  }
1110
1207
  }
@@ -1124,6 +1221,24 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
1124
1221
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1125
1222
 
1126
1223
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
1224
+
1225
+ FORCE_INLINE_TEMPLATE size_t
1226
+ ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
1227
+ const BYTE* const prefixStart, const BYTE* const dictEnd)
1228
+ {
1229
+ prefetchPos += sequence.litLength;
1230
+ { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
1231
+ const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
1232
+ * No consequence though : memory address is only used for prefetching, not for dereferencing */
1233
+ PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
1234
+ }
1235
+ return prefetchPos + sequence.matchLength;
1236
+ }
1237
+
1238
+ /* This decoding function employs prefetching
1239
+ * to reduce latency impact of cache misses.
1240
+ * It's generally employed when block contains a significant portion of long-distance matches
1241
+ * or when coupled with a "cold" dictionary */
1127
1242
  FORCE_INLINE_TEMPLATE size_t
1128
1243
  ZSTD_decompressSequencesLong_body(
1129
1244
  ZSTD_DCtx* dctx,
@@ -1134,7 +1249,7 @@ ZSTD_decompressSequencesLong_body(
1134
1249
  {
1135
1250
  const BYTE* ip = (const BYTE*)seqStart;
1136
1251
  const BYTE* const iend = ip + seqSize;
1137
- BYTE* const ostart = (BYTE* const)dst;
1252
+ BYTE* const ostart = (BYTE*)dst;
1138
1253
  BYTE* const oend = ostart + maxDstSize;
1139
1254
  BYTE* op = ostart;
1140
1255
  const BYTE* litPtr = dctx->litPtr;
@@ -1146,18 +1261,17 @@ ZSTD_decompressSequencesLong_body(
1146
1261
 
1147
1262
  /* Regen sequences */
1148
1263
  if (nbSeq) {
1149
- #define STORED_SEQS 4
1264
+ #define STORED_SEQS 8
1150
1265
  #define STORED_SEQS_MASK (STORED_SEQS-1)
1151
- #define ADVANCED_SEQS 4
1266
+ #define ADVANCED_SEQS STORED_SEQS
1152
1267
  seq_t sequences[STORED_SEQS];
1153
1268
  int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
1154
1269
  seqState_t seqState;
1155
1270
  int seqNb;
1271
+ size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */
1272
+
1156
1273
  dctx->fseEntropy = 1;
1157
1274
  { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
1158
- seqState.prefixStart = prefixStart;
1159
- seqState.pos = (size_t)(op-prefixStart);
1160
- seqState.dictEnd = dictEnd;
1161
1275
  assert(dst != NULL);
1162
1276
  assert(iend >= ip);
1163
1277
  RETURN_ERROR_IF(
@@ -1169,21 +1283,23 @@ ZSTD_decompressSequencesLong_body(
1169
1283
 
1170
1284
  /* prepare in advance */
1171
1285
  for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
1172
- sequences[seqNb] = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
1173
- PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
1286
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1287
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
1288
+ sequences[seqNb] = sequence;
1174
1289
  }
1175
1290
  RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
1176
1291
 
1177
1292
  /* decode and decompress */
1178
1293
  for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
1179
- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
1294
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1180
1295
  size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
1181
1296
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1182
1297
  assert(!ZSTD_isError(oneSeqSize));
1183
1298
  if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
1184
1299
  #endif
1185
1300
  if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1186
- PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
1301
+
1302
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
1187
1303
  sequences[seqNb & STORED_SEQS_MASK] = sequence;
1188
1304
  op += oneSeqSize;
1189
1305
  }
@@ -1209,7 +1325,7 @@ ZSTD_decompressSequencesLong_body(
1209
1325
  { size_t const lastLLSize = litEnd - litPtr;
1210
1326
  RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
1211
1327
  if (op != NULL) {
1212
- memcpy(op, litPtr, lastLLSize);
1328
+ ZSTD_memcpy(op, litPtr, lastLLSize);
1213
1329
  op += lastLLSize;
1214
1330
  }
1215
1331
  }
@@ -1409,9 +1525,9 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1409
1525
  }
1410
1526
 
1411
1527
 
1412
- void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
1528
+ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
1413
1529
  {
1414
- if (dst != dctx->previousDstEnd) { /* not contiguous */
1530
+ if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */
1415
1531
  dctx->dictEnd = dctx->previousDstEnd;
1416
1532
  dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
1417
1533
  dctx->prefixStart = dst;
@@ -1425,7 +1541,7 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
1425
1541
  const void* src, size_t srcSize)
1426
1542
  {
1427
1543
  size_t dSize;
1428
- ZSTD_checkContinuity(dctx, dst);
1544
+ ZSTD_checkContinuity(dctx, dst, dstCapacity);
1429
1545
  dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
1430
1546
  dctx->previousDstEnd = (char*)dst + dSize;
1431
1547
  return dSize;