zstdlib 0.10.0 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGES.md +8 -0
  3. data/ext/zstdlib_c/extconf.rb +2 -2
  4. data/ext/zstdlib_c/ruby/zlib-3.2/zstdlib.c +5090 -0
  5. data/ext/zstdlib_c/zstd-1.5.5/lib/common/allocations.h +55 -0
  6. data/ext/zstdlib_c/zstd-1.5.5/lib/common/bits.h +200 -0
  7. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/bitstream.h +19 -60
  8. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/compiler.h +26 -3
  9. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/cpu.h +1 -1
  10. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/debug.c +1 -1
  11. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/debug.h +1 -1
  12. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/entropy_common.c +12 -40
  13. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/error_private.c +9 -2
  14. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/error_private.h +1 -1
  15. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/fse.h +5 -83
  16. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/fse_decompress.c +7 -99
  17. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/huf.h +65 -156
  18. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/mem.h +39 -46
  19. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/pool.c +26 -10
  20. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/pool.h +7 -1
  21. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/portability_macros.h +22 -3
  22. data/ext/zstdlib_c/zstd-1.5.5/lib/common/threading.c +176 -0
  23. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/threading.h +5 -10
  24. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/xxhash.c +2 -2
  25. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/xxhash.h +8 -8
  26. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/zstd_common.c +1 -36
  27. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/zstd_deps.h +1 -1
  28. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/zstd_internal.h +17 -118
  29. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/zstd_trace.h +3 -3
  30. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/clevels.h +1 -1
  31. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/fse_compress.c +7 -124
  32. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/hist.c +1 -1
  33. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/hist.h +1 -1
  34. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/huf_compress.c +234 -169
  35. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress.c +1243 -538
  36. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress_internal.h +225 -151
  37. data/ext/zstdlib_c/zstd-1.5.5/lib/compress/zstd_compress_literals.c +235 -0
  38. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress_literals.h +16 -8
  39. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress_sequences.c +3 -3
  40. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress_sequences.h +1 -1
  41. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress_superblock.c +25 -21
  42. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress_superblock.h +1 -1
  43. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_cwksp.h +128 -62
  44. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_double_fast.c +95 -33
  45. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_double_fast.h +3 -2
  46. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_fast.c +433 -148
  47. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_fast.h +3 -2
  48. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_lazy.c +398 -345
  49. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_lazy.h +4 -2
  50. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_ldm.c +5 -5
  51. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_ldm.h +1 -1
  52. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_ldm_geartab.h +1 -1
  53. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_opt.c +106 -80
  54. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_opt.h +1 -1
  55. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstdmt_compress.c +17 -9
  56. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstdmt_compress.h +1 -1
  57. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/huf_decompress.c +434 -441
  58. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/huf_decompress_amd64.S +30 -39
  59. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/zstd_ddict.c +4 -4
  60. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/zstd_ddict.h +1 -1
  61. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/zstd_decompress.c +205 -80
  62. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/zstd_decompress_block.c +201 -81
  63. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/zstd_decompress_block.h +6 -1
  64. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/zstd_decompress_internal.h +4 -2
  65. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/zdict.h +53 -31
  66. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/zstd.h +580 -135
  67. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/zstd_errors.h +27 -8
  68. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/gzclose.c +1 -1
  69. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/gzcompatibility.h +8 -8
  70. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/gzguts.h +10 -10
  71. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/gzlib.c +3 -3
  72. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/gzread.c +10 -10
  73. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/gzwrite.c +5 -5
  74. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/zstd_zlibwrapper.c +46 -44
  75. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/zstd_zlibwrapper.h +4 -1
  76. metadata +77 -74
  77. data/ext/zstdlib_c/zstd-1.5.2/lib/common/threading.c +0 -122
  78. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_literals.c +0 -159
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -10,6 +10,9 @@
10
10
 
11
11
  #include "zstd_compress_internal.h"
12
12
  #include "zstd_lazy.h"
13
+ #include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
14
+
15
+ #define kLazySkippingStep 8
13
16
 
14
17
 
15
18
  /*-*************************************
@@ -197,8 +200,8 @@ ZSTD_DUBT_findBetterDictMatch (
197
200
  U32 matchIndex = dictMatchIndex + dictIndexDelta;
198
201
  if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
199
202
  DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
200
- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex);
201
- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
203
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
204
+ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
202
205
  }
203
206
  if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
204
207
  break; /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,7 +221,7 @@ ZSTD_DUBT_findBetterDictMatch (
218
221
  }
219
222
 
220
223
  if (bestLength >= MINMATCH) {
221
- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
224
+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
222
225
  DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
223
226
  curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
224
227
  }
@@ -230,7 +233,7 @@ ZSTD_DUBT_findBetterDictMatch (
230
233
  static size_t
231
234
  ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
232
235
  const BYTE* const ip, const BYTE* const iend,
233
- size_t* offsetPtr,
236
+ size_t* offBasePtr,
234
237
  U32 const mls,
235
238
  const ZSTD_dictMode_e dictMode)
236
239
  {
@@ -327,8 +330,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
327
330
  if (matchLength > bestLength) {
328
331
  if (matchLength > matchEndIdx - matchIndex)
329
332
  matchEndIdx = matchIndex + (U32)matchLength;
330
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
331
- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
333
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
334
+ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
332
335
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
333
336
  if (dictMode == ZSTD_dictMatchState) {
334
337
  nbCompares = 0; /* in addition to avoiding checking any
@@ -361,16 +364,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
361
364
  if (dictMode == ZSTD_dictMatchState && nbCompares) {
362
365
  bestLength = ZSTD_DUBT_findBetterDictMatch(
363
366
  ms, ip, iend,
364
- offsetPtr, bestLength, nbCompares,
367
+ offBasePtr, bestLength, nbCompares,
365
368
  mls, dictMode);
366
369
  }
367
370
 
368
371
  assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
369
372
  ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
370
373
  if (bestLength >= MINMATCH) {
371
- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
374
+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
372
375
  DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
373
- curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
376
+ curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
374
377
  }
375
378
  return bestLength;
376
379
  }
@@ -381,14 +384,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
381
384
  FORCE_INLINE_TEMPLATE size_t
382
385
  ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
383
386
  const BYTE* const ip, const BYTE* const iLimit,
384
- size_t* offsetPtr,
387
+ size_t* offBasePtr,
385
388
  const U32 mls /* template */,
386
389
  const ZSTD_dictMode_e dictMode)
387
390
  {
388
391
  DEBUGLOG(7, "ZSTD_BtFindBestMatch");
389
392
  if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
390
393
  ZSTD_updateDUBT(ms, ip, iLimit, mls);
391
- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
394
+ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
392
395
  }
393
396
 
394
397
  /***********************************
@@ -561,7 +564,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
561
564
  /* save best solution */
562
565
  if (currentMl > ml) {
563
566
  ml = currentMl;
564
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
567
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
565
568
  if (ip+currentMl == iLimit) {
566
569
  /* best possible, avoids read overflow on next attempt */
567
570
  return ml;
@@ -598,7 +601,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
598
601
  /* save best solution */
599
602
  if (currentMl > ml) {
600
603
  ml = currentMl;
601
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
604
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
602
605
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
603
606
  }
604
607
  }
@@ -617,7 +620,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
617
620
  FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
618
621
  ZSTD_matchState_t* ms,
619
622
  const ZSTD_compressionParameters* const cParams,
620
- const BYTE* ip, U32 const mls)
623
+ const BYTE* ip, U32 const mls, U32 const lazySkipping)
621
624
  {
622
625
  U32* const hashTable = ms->hashTable;
623
626
  const U32 hashLog = cParams->hashLog;
@@ -632,6 +635,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
632
635
  NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
633
636
  hashTable[h] = idx;
634
637
  idx++;
638
+ /* Stop inserting every position when in the lazy skipping mode. */
639
+ if (lazySkipping)
640
+ break;
635
641
  }
636
642
 
637
643
  ms->nextToUpdate = target;
@@ -640,7 +646,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
640
646
 
641
647
  U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
642
648
  const ZSTD_compressionParameters* const cParams = &ms->cParams;
643
- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
649
+ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
644
650
  }
645
651
 
646
652
  /* inlining is important to hardwire a hot branch (template emulation) */
@@ -684,14 +690,15 @@ size_t ZSTD_HcFindBestMatch(
684
690
  }
685
691
 
686
692
  /* HC4 match finder */
687
- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
693
+ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);
688
694
 
689
695
  for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
690
696
  size_t currentMl=0;
691
697
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
692
698
  const BYTE* const match = base + matchIndex;
693
699
  assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
694
- if (match[ml] == ip[ml]) /* potentially better */
700
+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
701
+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
695
702
  currentMl = ZSTD_count(ip, match, iLimit);
696
703
  } else {
697
704
  const BYTE* const match = dictBase + matchIndex;
@@ -703,7 +710,7 @@ size_t ZSTD_HcFindBestMatch(
703
710
  /* save best solution */
704
711
  if (currentMl > ml) {
705
712
  ml = currentMl;
706
- *offsetPtr = STORE_OFFSET(curr - matchIndex);
713
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
707
714
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
708
715
  }
709
716
 
@@ -739,7 +746,7 @@ size_t ZSTD_HcFindBestMatch(
739
746
  if (currentMl > ml) {
740
747
  ml = currentMl;
741
748
  assert(curr > matchIndex + dmsIndexDelta);
742
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
749
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
743
750
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
744
751
  }
745
752
 
@@ -756,8 +763,6 @@ size_t ZSTD_HcFindBestMatch(
756
763
  * (SIMD) Row-based matchfinder
757
764
  ***********************************/
758
765
  /* Constants for row-based hash */
759
- #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
760
- #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
761
766
  #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
762
767
  #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
763
768
 
@@ -769,73 +774,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr
769
774
  * Starting from the LSB, returns the idx of the next non-zero bit.
770
775
  * Basically counting the nb of trailing zeroes.
771
776
  */
772
- static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
773
- assert(val != 0);
774
- # if defined(_MSC_VER) && defined(_WIN64)
775
- if (val != 0) {
776
- unsigned long r;
777
- _BitScanForward64(&r, val);
778
- return (U32)(r);
779
- } else {
780
- /* Should not reach this code path */
781
- __assume(0);
782
- }
783
- # elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
784
- if (sizeof(size_t) == 4) {
785
- U32 mostSignificantWord = (U32)(val >> 32);
786
- U32 leastSignificantWord = (U32)val;
787
- if (leastSignificantWord == 0) {
788
- return 32 + (U32)__builtin_ctz(mostSignificantWord);
789
- } else {
790
- return (U32)__builtin_ctz(leastSignificantWord);
791
- }
792
- } else {
793
- return (U32)__builtin_ctzll(val);
794
- }
795
- # else
796
- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
797
- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
798
- */
799
- val = ~val & (val - 1ULL); /* Lowest set bit mask */
800
- val = val - ((val >> 1) & 0x5555555555555555);
801
- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
802
- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
803
- # endif
804
- }
805
-
806
- /* ZSTD_rotateRight_*():
807
- * Rotates a bitfield to the right by "count" bits.
808
- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
809
- */
810
- FORCE_INLINE_TEMPLATE
811
- U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
812
- assert(count < 64);
813
- count &= 0x3F; /* for fickle pattern recognition */
814
- return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
815
- }
816
-
817
- FORCE_INLINE_TEMPLATE
818
- U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
819
- assert(count < 32);
820
- count &= 0x1F; /* for fickle pattern recognition */
821
- return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
822
- }
823
-
824
- FORCE_INLINE_TEMPLATE
825
- U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
826
- assert(count < 16);
827
- count &= 0x0F; /* for fickle pattern recognition */
828
- return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
777
+ MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
778
+ return ZSTD_countTrailingZeros64(val);
829
779
  }
830
780
 
831
781
  /* ZSTD_row_nextIndex():
832
782
  * Returns the next index to insert at within a tagTable row, and updates the "head"
833
- * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
783
+ * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
834
784
  */
835
785
  FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
836
- U32 const next = (*tagRow - 1) & rowMask;
837
- *tagRow = (BYTE)next;
838
- return next;
786
+ U32 next = (*tagRow-1) & rowMask;
787
+ next += (next == 0) ? rowMask : 0; /* skip first position */
788
+ *tagRow = (BYTE)next;
789
+ return next;
839
790
  }
840
791
 
841
792
  /* ZSTD_isAligned():
@@ -849,7 +800,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
849
800
  /* ZSTD_row_prefetch():
850
801
  * Performs prefetching for the hashTable and tagTable at a given row.
851
802
  */
852
- FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
803
+ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
853
804
  PREFETCH_L1(hashTable + relRow);
854
805
  if (rowLog >= 5) {
855
806
  PREFETCH_L1(hashTable + relRow + 16);
@@ -873,13 +824,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
873
824
  U32 idx, const BYTE* const iLimit)
874
825
  {
875
826
  U32 const* const hashTable = ms->hashTable;
876
- U16 const* const tagTable = ms->tagTable;
827
+ BYTE const* const tagTable = ms->tagTable;
877
828
  U32 const hashLog = ms->rowHashLog;
878
829
  U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
879
830
  U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
880
831
 
881
832
  for (; idx < lim; ++idx) {
882
- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
833
+ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
883
834
  U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
884
835
  ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
885
836
  ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
@@ -895,11 +846,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
895
846
  * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
896
847
  */
897
848
  FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
898
- U16 const* tagTable, BYTE const* base,
849
+ BYTE const* tagTable, BYTE const* base,
899
850
  U32 idx, U32 const hashLog,
900
- U32 const rowLog, U32 const mls)
851
+ U32 const rowLog, U32 const mls,
852
+ U64 const hashSalt)
901
853
  {
902
- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
854
+ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
903
855
  U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
904
856
  ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
905
857
  { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
@@ -917,22 +869,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
917
869
  U32 const rowMask, U32 const useCache)
918
870
  {
919
871
  U32* const hashTable = ms->hashTable;
920
- U16* const tagTable = ms->tagTable;
872
+ BYTE* const tagTable = ms->tagTable;
921
873
  U32 const hashLog = ms->rowHashLog;
922
874
  const BYTE* const base = ms->window.base;
923
875
 
924
876
  DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
925
877
  for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
926
- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
927
- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
878
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
879
+ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
928
880
  U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
929
881
  U32* const row = hashTable + relRow;
930
- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
931
- Explicit cast allows us to get exact desired position within each row */
882
+ BYTE* tagRow = tagTable + relRow;
932
883
  U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
933
884
 
934
- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
935
- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
885
+ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
886
+ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
936
887
  row[pos] = updateStartIdx;
937
888
  }
938
889
  }
@@ -980,7 +931,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
980
931
  const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
981
932
 
982
933
  DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
983
- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
934
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
935
+ }
936
+
937
+ /* Returns the width, in bits, of the group that represents one row entry in the
938
+ * match mask. Not all architectures have a cheap movemask instruction, so using
939
+ * groups of bits makes iterating over the matches easier and faster.
940
+ */
941
+ FORCE_INLINE_TEMPLATE U32
942
+ ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
943
+ {
944
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
945
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
946
+ (void)rowEntries;
947
+ #if defined(ZSTD_ARCH_ARM_NEON)
948
+ /* NEON path only works for little endian */
949
+ if (!MEM_isLittleEndian()) {
950
+ return 1;
951
+ }
952
+ if (rowEntries == 16) {
953
+ return 4;
954
+ }
955
+ if (rowEntries == 32) {
956
+ return 2;
957
+ }
958
+ if (rowEntries == 64) {
959
+ return 1;
960
+ }
961
+ #endif
962
+ return 1;
984
963
  }
985
964
 
986
965
  #if defined(ZSTD_ARCH_X86_SSE2)
@@ -1003,71 +982,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
1003
982
  }
1004
983
  #endif
1005
984
 
1006
- /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
1007
- * the hash at the nth position in a row of the tagTable.
1008
- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1009
- * to match up with the actual layout of the entries within the hashTable */
985
+ #if defined(ZSTD_ARCH_ARM_NEON)
986
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
987
+ ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
988
+ {
989
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
990
+ if (rowEntries == 16) {
991
+ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
992
+ * After that groups of 4 bits represent the equalMask. We lower
993
+ * all bits except the highest in these groups by doing AND with
994
+ * 0x88 = 0b10001000.
995
+ */
996
+ const uint8x16_t chunk = vld1q_u8(src);
997
+ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
998
+ const uint8x8_t res = vshrn_n_u16(equalMask, 4);
999
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
1000
+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
1001
+ } else if (rowEntries == 32) {
1002
+ /* Same idea as with rowEntries == 16 but doing AND with
1003
+ * 0x55 = 0b01010101.
1004
+ */
1005
+ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
1006
+ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
1007
+ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
1008
+ const uint8x16_t dup = vdupq_n_u8(tag);
1009
+ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
1010
+ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
1011
+ const uint8x8_t res = vsli_n_u8(t0, t1, 4);
1012
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
1013
+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
1014
+ } else { /* rowEntries == 64 */
1015
+ const uint8x16x4_t chunk = vld4q_u8(src);
1016
+ const uint8x16_t dup = vdupq_n_u8(tag);
1017
+ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
1018
+ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
1019
+ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
1020
+ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
1021
+
1022
+ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
1023
+ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
1024
+ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
1025
+ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
1026
+ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
1027
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
1028
+ return ZSTD_rotateRight_U64(matches, headGrouped);
1029
+ }
1030
+ }
1031
+ #endif
1032
+
1033
+ /* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
1034
+ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
1035
+ * matches the hash at the nth position in a row of the tagTable.
1036
+ * Each row is a circular buffer beginning at the value of "headGrouped". So we
1037
+ * must rotate the "matches" bitfield to match up with the actual layout of the
1038
+ * entries within the hashTable */
1010
1039
  FORCE_INLINE_TEMPLATE ZSTD_VecMask
1011
- ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
1040
+ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
1012
1041
  {
1013
- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
1042
+ const BYTE* const src = tagRow;
1014
1043
  assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
1015
1044
  assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
1045
+ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);
1016
1046
 
1017
1047
  #if defined(ZSTD_ARCH_X86_SSE2)
1018
1048
 
1019
- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
1049
+ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
1020
1050
 
1021
1051
  #else /* SW or NEON-LE */
1022
1052
 
1023
1053
  # if defined(ZSTD_ARCH_ARM_NEON)
1024
1054
  /* This NEON path only works for little endian - otherwise use SWAR below */
1025
1055
  if (MEM_isLittleEndian()) {
1026
- if (rowEntries == 16) {
1027
- const uint8x16_t chunk = vld1q_u8(src);
1028
- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
1029
- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
1030
- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
1031
- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
1032
- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
1033
- const U16 hi = (U16)vgetq_lane_u8(t3, 8);
1034
- const U16 lo = (U16)vgetq_lane_u8(t3, 0);
1035
- return ZSTD_rotateRight_U16((hi << 8) | lo, head);
1036
- } else if (rowEntries == 32) {
1037
- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
1038
- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
1039
- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
1040
- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
1041
- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
1042
- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
1043
- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
1044
- const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
1045
- const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
1046
- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
1047
- const uint8x8x2_t t3 = vuzp_u8(t2, t0);
1048
- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
1049
- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
1050
- return ZSTD_rotateRight_U32(matches, head);
1051
- } else { /* rowEntries == 64 */
1052
- const uint8x16x4_t chunk = vld4q_u8(src);
1053
- const uint8x16_t dup = vdupq_n_u8(tag);
1054
- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
1055
- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
1056
- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
1057
- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
1058
-
1059
- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
1060
- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
1061
- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
1062
- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
1063
- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
1064
- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
1065
- return ZSTD_rotateRight_U64(matches, head);
1066
- }
1056
+ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
1067
1057
  }
1068
1058
  # endif /* ZSTD_ARCH_ARM_NEON */
1069
1059
  /* SWAR */
1070
- { const size_t chunkSize = sizeof(size_t);
1060
+ { const int chunkSize = sizeof(size_t);
1071
1061
  const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
1072
1062
  const size_t xFF = ~((size_t)0);
1073
1063
  const size_t x01 = xFF / 0xFF;
@@ -1100,11 +1090,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
1100
1090
  }
1101
1091
  matches = ~matches;
1102
1092
  if (rowEntries == 16) {
1103
- return ZSTD_rotateRight_U16((U16)matches, head);
1093
+ return ZSTD_rotateRight_U16((U16)matches, headGrouped);
1104
1094
  } else if (rowEntries == 32) {
1105
- return ZSTD_rotateRight_U32((U32)matches, head);
1095
+ return ZSTD_rotateRight_U32((U32)matches, headGrouped);
1106
1096
  } else {
1107
- return ZSTD_rotateRight_U64((U64)matches, head);
1097
+ return ZSTD_rotateRight_U64((U64)matches, headGrouped);
1108
1098
  }
1109
1099
  }
1110
1100
  #endif
@@ -1134,7 +1124,7 @@ size_t ZSTD_RowFindBestMatch(
1134
1124
  const U32 rowLog)
1135
1125
  {
1136
1126
  U32* const hashTable = ms->hashTable;
1137
- U16* const tagTable = ms->tagTable;
1127
+ BYTE* const tagTable = ms->tagTable;
1138
1128
  U32* const hashCache = ms->hashCache;
1139
1129
  const U32 hashLog = ms->rowHashLog;
1140
1130
  const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -1152,8 +1142,11 @@ size_t ZSTD_RowFindBestMatch(
1152
1142
  const U32 rowEntries = (1U << rowLog);
1153
1143
  const U32 rowMask = rowEntries - 1;
1154
1144
  const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
1145
+ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
1146
+ const U64 hashSalt = ms->hashSalt;
1155
1147
  U32 nbAttempts = 1U << cappedSearchLog;
1156
1148
  size_t ml=4-1;
1149
+ U32 hash;
1157
1150
 
1158
1151
  /* DMS/DDS variables that may be referenced later */
1159
1152
  const ZSTD_matchState_t* const dms = ms->dictMatchState;
@@ -1177,7 +1170,7 @@ size_t ZSTD_RowFindBestMatch(
1177
1170
  if (dictMode == ZSTD_dictMatchState) {
1178
1171
  /* Prefetch DMS rows */
1179
1172
  U32* const dmsHashTable = dms->hashTable;
1180
- U16* const dmsTagTable = dms->tagTable;
1173
+ BYTE* const dmsTagTable = dms->tagTable;
1181
1174
  U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1182
1175
  U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1183
1176
  dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
@@ -1187,23 +1180,34 @@ size_t ZSTD_RowFindBestMatch(
1187
1180
  }
1188
1181
 
1189
1182
  /* Update the hashTable and tagTable up to (but not including) ip */
1190
- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
1183
+ if (!ms->lazySkipping) {
1184
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
1185
+ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
1186
+ } else {
1187
+ /* Stop inserting every position when in the lazy skipping mode.
1188
+ * The hash cache is also not kept up to date in this mode.
1189
+ */
1190
+ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
1191
+ ms->nextToUpdate = curr;
1192
+ }
1193
+ ms->hashSaltEntropy += hash; /* collect salt entropy */
1194
+
1191
1195
  { /* Get the hash for ip, compute the appropriate row */
1192
- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
1193
1196
  U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1194
1197
  U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
1195
1198
  U32* const row = hashTable + relRow;
1196
1199
  BYTE* tagRow = (BYTE*)(tagTable + relRow);
1197
- U32 const head = *tagRow & rowMask;
1200
+ U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
1198
1201
  U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1199
1202
  size_t numMatches = 0;
1200
1203
  size_t currMatch = 0;
1201
- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
1204
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
1202
1205
 
1203
1206
  /* Cycle through the matches and prefetch */
1204
- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1205
- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1207
+ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
1208
+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
1206
1209
  U32 const matchIndex = row[matchPos];
1210
+ if(matchPos == 0) continue;
1207
1211
  assert(numMatches < rowEntries);
1208
1212
  if (matchIndex < lowLimit)
1209
1213
  break;
@@ -1213,13 +1217,14 @@ size_t ZSTD_RowFindBestMatch(
1213
1217
  PREFETCH_L1(dictBase + matchIndex);
1214
1218
  }
1215
1219
  matchBuffer[numMatches++] = matchIndex;
1220
+ --nbAttempts;
1216
1221
  }
1217
1222
 
1218
1223
  /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
1219
1224
  in ZSTD_row_update_internal() at the next search. */
1220
1225
  {
1221
1226
  U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1222
- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
1227
+ tagRow[pos] = (BYTE)tag;
1223
1228
  row[pos] = ms->nextToUpdate++;
1224
1229
  }
1225
1230
 
@@ -1233,7 +1238,8 @@ size_t ZSTD_RowFindBestMatch(
1233
1238
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1234
1239
  const BYTE* const match = base + matchIndex;
1235
1240
  assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
1236
- if (match[ml] == ip[ml]) /* potentially better */
1241
+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
1242
+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
1237
1243
  currentMl = ZSTD_count(ip, match, iLimit);
1238
1244
  } else {
1239
1245
  const BYTE* const match = dictBase + matchIndex;
@@ -1245,7 +1251,7 @@ size_t ZSTD_RowFindBestMatch(
1245
1251
  /* Save best solution */
1246
1252
  if (currentMl > ml) {
1247
1253
  ml = currentMl;
1248
- *offsetPtr = STORE_OFFSET(curr - matchIndex);
1254
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
1249
1255
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
1250
1256
  }
1251
1257
  }
@@ -1263,19 +1269,21 @@ size_t ZSTD_RowFindBestMatch(
1263
1269
  const U32 dmsSize = (U32)(dmsEnd - dmsBase);
1264
1270
  const U32 dmsIndexDelta = dictLimit - dmsSize;
1265
1271
 
1266
- { U32 const head = *dmsTagRow & rowMask;
1272
+ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
1267
1273
  U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1268
1274
  size_t numMatches = 0;
1269
1275
  size_t currMatch = 0;
1270
- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
1276
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
1271
1277
 
1272
- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1273
- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1278
+ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
1279
+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
1274
1280
  U32 const matchIndex = dmsRow[matchPos];
1281
+ if(matchPos == 0) continue;
1275
1282
  if (matchIndex < dmsLowestIndex)
1276
1283
  break;
1277
1284
  PREFETCH_L1(dmsBase + matchIndex);
1278
1285
  matchBuffer[numMatches++] = matchIndex;
1286
+ --nbAttempts;
1279
1287
  }
1280
1288
 
1281
1289
  /* Return the longest match */
@@ -1294,7 +1302,7 @@ size_t ZSTD_RowFindBestMatch(
1294
1302
  if (currentMl > ml) {
1295
1303
  ml = currentMl;
1296
1304
  assert(curr > matchIndex + dmsIndexDelta);
1297
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
1305
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
1298
1306
  if (ip+currentMl == iLimit) break;
1299
1307
  }
1300
1308
  }
@@ -1304,14 +1312,10 @@ size_t ZSTD_RowFindBestMatch(
1304
1312
  }
1305
1313
 
1306
1314
 
1307
- typedef size_t (*searchMax_f)(
1308
- ZSTD_matchState_t* ms,
1309
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1310
-
1311
1315
  /**
1312
- * This struct contains the functions necessary for lazy to search.
1313
- * Currently, that is only searchMax. However, it is still valuable to have the
1314
- * VTable because this makes it easier to add more functions to the VTable later.
1316
+ * Generate search functions templated on (dictMode, mls, rowLog).
1317
+ * These functions are outlined for code size & compilation time.
1318
+ * ZSTD_searchMax() dispatches to the correct implementation function.
1315
1319
  *
1316
1320
  * TODO: The start of the search function involves loading and calculating a
1317
1321
  * bunch of constants from the ZSTD_matchState_t. These computations could be
@@ -1329,25 +1333,25 @@ typedef size_t (*searchMax_f)(
1329
1333
  * the single segment loop. It should go in searchMax instead of its own
1330
1334
  * function to avoid having multiple virtual function calls per search.
1331
1335
  */
1332
- typedef struct {
1333
- searchMax_f searchMax;
1334
- } ZSTD_LazyVTable;
1335
1336
 
1336
- #define GEN_ZSTD_BT_VTABLE(dictMode, mls) \
1337
- static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \
1338
- ZSTD_matchState_t* ms, \
1339
- const BYTE* ip, const BYTE* const iLimit, \
1340
- size_t* offsetPtr) \
1341
- { \
1342
- assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1343
- return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1344
- } \
1345
- static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \
1346
- ZSTD_BtFindBestMatch_##dictMode##_##mls \
1347
- };
1337
+ #define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
1338
+ #define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
1339
+ #define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog
1348
1340
 
1349
- #define GEN_ZSTD_HC_VTABLE(dictMode, mls) \
1350
- static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \
1341
+ #define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
1342
+
1343
+ #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \
1344
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \
1345
+ ZSTD_matchState_t* ms, \
1346
+ const BYTE* ip, const BYTE* const iLimit, \
1347
+ size_t* offBasePtr) \
1348
+ { \
1349
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1350
+ return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
1351
+ } \
1352
+
1353
+ #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \
1354
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \
1351
1355
  ZSTD_matchState_t* ms, \
1352
1356
  const BYTE* ip, const BYTE* const iLimit, \
1353
1357
  size_t* offsetPtr) \
@@ -1355,12 +1359,9 @@ typedef struct {
1355
1359
  assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1356
1360
  return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1357
1361
  } \
1358
- static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
1359
- ZSTD_HcFindBestMatch_##dictMode##_##mls \
1360
- };
1361
1362
 
1362
- #define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \
1363
- static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \
1363
+ #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \
1364
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \
1364
1365
  ZSTD_matchState_t* ms, \
1365
1366
  const BYTE* ip, const BYTE* const iLimit, \
1366
1367
  size_t* offsetPtr) \
@@ -1369,9 +1370,6 @@ typedef struct {
1369
1370
  assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
1370
1371
  return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
1371
1372
  } \
1372
- static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \
1373
- ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \
1374
- };
1375
1373
 
1376
1374
  #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
1377
1375
  X(dictMode, mls, 4) \
@@ -1394,84 +1392,103 @@ typedef struct {
1394
1392
  X(__VA_ARGS__, dictMatchState) \
1395
1393
  X(__VA_ARGS__, dedicatedDictSearch)
1396
1394
 
1397
- /* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
1398
- ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
1399
- /* Generate Binary Tree VTables for each combination of (dictMode, mls) */
1400
- ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
1401
- /* Generate Hash Chain VTables for each combination of (dictMode, mls) */
1402
- ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
1403
-
1404
- #define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
1405
- { \
1406
- &ZSTD_BtVTable_##dictMode##_4, \
1407
- &ZSTD_BtVTable_##dictMode##_5, \
1408
- &ZSTD_BtVTable_##dictMode##_6 \
1409
- }
1395
+ /* Generate row search fns for each combination of (dictMode, mls, rowLog) */
1396
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
1397
+ /* Generate binary tree search fns for each combination of (dictMode, mls) */
1398
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
1399
+ /* Generate hash chain search fns for each combination of (dictMode, mls) */
1400
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)
1410
1401
 
1411
- #define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
1412
- { \
1413
- &ZSTD_HcVTable_##dictMode##_4, \
1414
- &ZSTD_HcVTable_##dictMode##_5, \
1415
- &ZSTD_HcVTable_##dictMode##_6 \
1416
- }
1417
-
1418
- #define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
1419
- { \
1420
- &ZSTD_RowVTable_##dictMode##_##mls##_4, \
1421
- &ZSTD_RowVTable_##dictMode##_##mls##_5, \
1422
- &ZSTD_RowVTable_##dictMode##_##mls##_6 \
1423
- }
1402
+ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1424
1403
 
1425
- #define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
1426
- { \
1427
- GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
1428
- GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
1429
- GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
1404
+ #define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
1405
+ case mls: \
1406
+ return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
1407
+ #define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
1408
+ case mls: \
1409
+ return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
1410
+ #define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \
1411
+ case rowLog: \
1412
+ return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
1413
+
1414
+ #define ZSTD_SWITCH_MLS(X, dictMode) \
1415
+ switch (mls) { \
1416
+ ZSTD_FOR_EACH_MLS(X, dictMode) \
1430
1417
  }
1431
1418
 
1432
- #define GEN_ZSTD_VTABLE_ARRAY(X) \
1433
- { \
1434
- X(noDict), \
1435
- X(extDict), \
1436
- X(dictMatchState), \
1437
- X(dedicatedDictSearch) \
1438
- }
1439
-
1440
- /* *******************************
1441
- * Common parser - lazy strategy
1442
- *********************************/
1443
- typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1419
+ #define ZSTD_SWITCH_ROWLOG(dictMode, mls) \
1420
+ case mls: \
1421
+ switch (rowLog) { \
1422
+ ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
1423
+ } \
1424
+ ZSTD_UNREACHABLE; \
1425
+ break;
1426
+
1427
+ #define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \
1428
+ switch (searchMethod) { \
1429
+ case search_hashChain: \
1430
+ ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
1431
+ break; \
1432
+ case search_binaryTree: \
1433
+ ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
1434
+ break; \
1435
+ case search_rowHash: \
1436
+ ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \
1437
+ break; \
1438
+ } \
1439
+ ZSTD_UNREACHABLE;
1444
1440
 
1445
1441
  /**
1446
- * This table is indexed first by the four ZSTD_dictMode_e values, and then
1447
- * by the two searchMethod_e values. NULLs are placed for configurations
1448
- * that should never occur (extDict modes go to the other implementation
1449
- * below and there is no DDSS for binary tree search yet).
1442
+ * Searches for the longest match at @p ip.
1443
+ * Dispatches to the correct implementation function based on the
1444
+ * (searchMethod, dictMode, mls, rowLog). We use switch statements
1445
+ * here instead of using an indirect function call through a function
1446
+ * pointer because after Spectre and Meltdown mitigations, indirect
1447
+ * function calls can be very costly, especially in the kernel.
1448
+ *
1449
+ * NOTE: dictMode and searchMethod should be templated, so those switch
1450
+ * statements should be optimized out. Only the mls & rowLog switches
1451
+ * should be left.
1452
+ *
1453
+ * @param ms The match state.
1454
+ * @param ip The position to search at.
1455
+ * @param iend The end of the input data.
1456
+ * @param[out] offsetPtr Stores the match offset into this pointer.
1457
+ * @param mls The minimum search length, in the range [4, 6].
1458
+ * @param rowLog The row log (if applicable), in the range [4, 6].
1459
+ * @param searchMethod The search method to use (templated).
1460
+ * @param dictMode The dictMode (templated).
1461
+ *
1462
+ * @returns The length of the longest match found, or < mls if no match is found.
1463
+ * If a match is found its offset is stored in @p offsetPtr.
1450
1464
  */
1451
-
1452
- static ZSTD_LazyVTable const*
1453
- ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
1465
+ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
1466
+ ZSTD_matchState_t* ms,
1467
+ const BYTE* ip,
1468
+ const BYTE* iend,
1469
+ size_t* offsetPtr,
1470
+ U32 const mls,
1471
+ U32 const rowLog,
1472
+ searchMethod_e const searchMethod,
1473
+ ZSTD_dictMode_e const dictMode)
1454
1474
  {
1455
- /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
1456
- ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
1457
- ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
1458
- /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
1459
- ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
1460
-
1461
- U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
1462
- U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1463
- switch (searchMethod) {
1464
- case search_hashChain:
1465
- return hcVTables[dictMode][mls - 4];
1466
- case search_binaryTree:
1467
- return btVTables[dictMode][mls - 4];
1468
- case search_rowHash:
1469
- return rowVTables[dictMode][mls - 4][rowLog - 4];
1470
- default:
1471
- return NULL;
1475
+ if (dictMode == ZSTD_noDict) {
1476
+ ZSTD_SWITCH_SEARCH_METHOD(noDict)
1477
+ } else if (dictMode == ZSTD_extDict) {
1478
+ ZSTD_SWITCH_SEARCH_METHOD(extDict)
1479
+ } else if (dictMode == ZSTD_dictMatchState) {
1480
+ ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
1481
+ } else if (dictMode == ZSTD_dedicatedDictSearch) {
1482
+ ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
1472
1483
  }
1484
+ ZSTD_UNREACHABLE;
1485
+ return 0;
1473
1486
  }
1474
1487
 
1488
+ /* *******************************
1489
+ * Common parser - lazy strategy
1490
+ *********************************/
1491
+
1475
1492
  FORCE_INLINE_TEMPLATE size_t
1476
1493
  ZSTD_compressBlock_lazy_generic(
1477
1494
  ZSTD_matchState_t* ms, seqStore_t* seqStore,
@@ -1488,9 +1505,11 @@ ZSTD_compressBlock_lazy_generic(
1488
1505
  const BYTE* const base = ms->window.base;
1489
1506
  const U32 prefixLowestIndex = ms->window.dictLimit;
1490
1507
  const BYTE* const prefixLowest = base + prefixLowestIndex;
1508
+ const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
1509
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
1491
1510
 
1492
- searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
1493
- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
1511
+ U32 offset_1 = rep[0], offset_2 = rep[1];
1512
+ U32 offsetSaved1 = 0, offsetSaved2 = 0;
1494
1513
 
1495
1514
  const int isDMS = dictMode == ZSTD_dictMatchState;
1496
1515
  const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
@@ -1505,16 +1524,14 @@ ZSTD_compressBlock_lazy_generic(
1505
1524
  0;
1506
1525
  const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
1507
1526
 
1508
- assert(searchMax != NULL);
1509
-
1510
1527
  DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
1511
1528
  ip += (dictAndPrefixLength == 0);
1512
1529
  if (dictMode == ZSTD_noDict) {
1513
1530
  U32 const curr = (U32)(ip - base);
1514
1531
  U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
1515
1532
  U32 const maxRep = curr - windowLow;
1516
- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
1517
- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
1533
+ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
1534
+ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
1518
1535
  }
1519
1536
  if (isDxS) {
1520
1537
  /* dictMatchState repCode checks don't currently handle repCode == 0
@@ -1523,11 +1540,11 @@ ZSTD_compressBlock_lazy_generic(
1523
1540
  assert(offset_2 <= dictAndPrefixLength);
1524
1541
  }
1525
1542
 
1543
+ /* Reset the lazy skipping state */
1544
+ ms->lazySkipping = 0;
1545
+
1526
1546
  if (searchMethod == search_rowHash) {
1527
- const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1528
- ZSTD_row_fillHashCache(ms, base, rowLog,
1529
- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1530
- ms->nextToUpdate, ilimit);
1547
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
1531
1548
  }
1532
1549
 
1533
1550
  /* Match Loop */
@@ -1539,7 +1556,7 @@ ZSTD_compressBlock_lazy_generic(
1539
1556
  #endif
1540
1557
  while (ip < ilimit) {
1541
1558
  size_t matchLength=0;
1542
- size_t offcode=STORE_REPCODE_1;
1559
+ size_t offBase = REPCODE1_TO_OFFBASE;
1543
1560
  const BYTE* start=ip+1;
1544
1561
  DEBUGLOG(7, "search baseline (depth 0)");
1545
1562
 
@@ -1564,14 +1581,23 @@ ZSTD_compressBlock_lazy_generic(
1564
1581
  }
1565
1582
 
1566
1583
  /* first search (depth 0) */
1567
- { size_t offsetFound = 999999999;
1568
- size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
1584
+ { size_t offbaseFound = 999999999;
1585
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
1569
1586
  if (ml2 > matchLength)
1570
- matchLength = ml2, start = ip, offcode=offsetFound;
1587
+ matchLength = ml2, start = ip, offBase = offbaseFound;
1571
1588
  }
1572
1589
 
1573
1590
  if (matchLength < 4) {
1574
- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
1591
+ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */;
1592
+ ip += step;
1593
+ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
1594
+ * In this mode we stop inserting every position into our tables, and only insert
1595
+ * positions that we search, which is one in step positions.
1596
+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
1597
+ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
1598
+ * triggered once we've gone 2KB without finding any matches.
1599
+ */
1600
+ ms->lazySkipping = step > kLazySkippingStep;
1575
1601
  continue;
1576
1602
  }
1577
1603
 
@@ -1581,12 +1607,12 @@ ZSTD_compressBlock_lazy_generic(
1581
1607
  DEBUGLOG(7, "search depth 1");
1582
1608
  ip ++;
1583
1609
  if ( (dictMode == ZSTD_noDict)
1584
- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1610
+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1585
1611
  size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
1586
1612
  int const gain2 = (int)(mlRep * 3);
1587
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
1613
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
1588
1614
  if ((mlRep >= 4) && (gain2 > gain1))
1589
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1615
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
1590
1616
  }
1591
1617
  if (isDxS) {
1592
1618
  const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1598,17 +1624,17 @@ ZSTD_compressBlock_lazy_generic(
1598
1624
  const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1599
1625
  size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
1600
1626
  int const gain2 = (int)(mlRep * 3);
1601
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
1627
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
1602
1628
  if ((mlRep >= 4) && (gain2 > gain1))
1603
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1629
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
1604
1630
  }
1605
1631
  }
1606
- { size_t offset2=999999999;
1607
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1608
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
1609
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
1632
+ { size_t ofbCandidate=999999999;
1633
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
1634
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
1635
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
1610
1636
  if ((ml2 >= 4) && (gain2 > gain1)) {
1611
- matchLength = ml2, offcode = offset2, start = ip;
1637
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
1612
1638
  continue; /* search a better one */
1613
1639
  } }
1614
1640
 
@@ -1617,12 +1643,12 @@ ZSTD_compressBlock_lazy_generic(
1617
1643
  DEBUGLOG(7, "search depth 2");
1618
1644
  ip ++;
1619
1645
  if ( (dictMode == ZSTD_noDict)
1620
- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1646
+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1621
1647
  size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
1622
1648
  int const gain2 = (int)(mlRep * 4);
1623
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
1649
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
1624
1650
  if ((mlRep >= 4) && (gain2 > gain1))
1625
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1651
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
1626
1652
  }
1627
1653
  if (isDxS) {
1628
1654
  const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1634,17 +1660,17 @@ ZSTD_compressBlock_lazy_generic(
1634
1660
  const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1635
1661
  size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
1636
1662
  int const gain2 = (int)(mlRep * 4);
1637
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
1663
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
1638
1664
  if ((mlRep >= 4) && (gain2 > gain1))
1639
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1665
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
1640
1666
  }
1641
1667
  }
1642
- { size_t offset2=999999999;
1643
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1644
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
1645
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
1668
+ { size_t ofbCandidate=999999999;
1669
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
1670
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
1671
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
1646
1672
  if ((ml2 >= 4) && (gain2 > gain1)) {
1647
- matchLength = ml2, offcode = offset2, start = ip;
1673
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
1648
1674
  continue;
1649
1675
  } } }
1650
1676
  break; /* nothing found : store previous solution */
@@ -1655,26 +1681,33 @@ ZSTD_compressBlock_lazy_generic(
1655
1681
  * notably if `value` is unsigned, resulting in a large positive `-value`.
1656
1682
  */
1657
1683
  /* catch up */
1658
- if (STORED_IS_OFFSET(offcode)) {
1684
+ if (OFFBASE_IS_OFFSET(offBase)) {
1659
1685
  if (dictMode == ZSTD_noDict) {
1660
- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
1661
- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */
1686
+ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
1687
+ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */
1662
1688
  { start--; matchLength++; }
1663
1689
  }
1664
1690
  if (isDxS) {
1665
- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
1691
+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
1666
1692
  const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
1667
1693
  const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
1668
1694
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
1669
1695
  }
1670
- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
1696
+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
1671
1697
  }
1672
1698
  /* store sequence */
1673
1699
  _storeSequence:
1674
1700
  { size_t const litLength = (size_t)(start - anchor);
1675
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
1701
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
1676
1702
  anchor = ip = start + matchLength;
1677
1703
  }
1704
+ if (ms->lazySkipping) {
1705
+ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
1706
+ if (searchMethod == search_rowHash) {
1707
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
1708
+ }
1709
+ ms->lazySkipping = 0;
1710
+ }
1678
1711
 
1679
1712
  /* check immediate repcode */
1680
1713
  if (isDxS) {
@@ -1688,8 +1721,8 @@ _storeSequence:
1688
1721
  && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
1689
1722
  const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
1690
1723
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
1691
- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */
1692
- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
1724
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */
1725
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
1693
1726
  ip += matchLength;
1694
1727
  anchor = ip;
1695
1728
  continue;
@@ -1703,16 +1736,20 @@ _storeSequence:
1703
1736
  && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
1704
1737
  /* store sequence */
1705
1738
  matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
1706
- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */
1707
- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
1739
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
1740
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
1708
1741
  ip += matchLength;
1709
1742
  anchor = ip;
1710
1743
  continue; /* faster when present ... (?) */
1711
1744
  } } }
1712
1745
 
1713
- /* Save reps for next block */
1714
- rep[0] = offset_1 ? offset_1 : savedOffset;
1715
- rep[1] = offset_2 ? offset_2 : savedOffset;
1746
+ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
1747
+ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
1748
+ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
1749
+
1750
+ /* save reps for next block */
1751
+ rep[0] = offset_1 ? offset_1 : offsetSaved1;
1752
+ rep[1] = offset_2 ? offset_2 : offsetSaved2;
1716
1753
 
1717
1754
  /* Return the last literals size */
1718
1755
  return (size_t)(iend - anchor);
@@ -1881,19 +1918,20 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1881
1918
  const BYTE* const dictEnd = dictBase + dictLimit;
1882
1919
  const BYTE* const dictStart = dictBase + ms->window.lowLimit;
1883
1920
  const U32 windowLog = ms->cParams.windowLog;
1884
- const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1921
+ const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
1922
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
1885
1923
 
1886
- searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
1887
1924
  U32 offset_1 = rep[0], offset_2 = rep[1];
1888
1925
 
1889
1926
  DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
1890
1927
 
1928
+ /* Reset the lazy skipping state */
1929
+ ms->lazySkipping = 0;
1930
+
1891
1931
  /* init */
1892
1932
  ip += (ip == prefixStart);
1893
1933
  if (searchMethod == search_rowHash) {
1894
- ZSTD_row_fillHashCache(ms, base, rowLog,
1895
- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1896
- ms->nextToUpdate, ilimit);
1934
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
1897
1935
  }
1898
1936
 
1899
1937
  /* Match Loop */
@@ -1905,7 +1943,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1905
1943
  #endif
1906
1944
  while (ip < ilimit) {
1907
1945
  size_t matchLength=0;
1908
- size_t offcode=STORE_REPCODE_1;
1946
+ size_t offBase = REPCODE1_TO_OFFBASE;
1909
1947
  const BYTE* start=ip+1;
1910
1948
  U32 curr = (U32)(ip-base);
1911
1949
 
@@ -1924,14 +1962,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1924
1962
  } }
1925
1963
 
1926
1964
  /* first search (depth 0) */
1927
- { size_t offsetFound = 999999999;
1928
- size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
1965
+ { size_t ofbCandidate = 999999999;
1966
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
1929
1967
  if (ml2 > matchLength)
1930
- matchLength = ml2, start = ip, offcode=offsetFound;
1968
+ matchLength = ml2, start = ip, offBase = ofbCandidate;
1931
1969
  }
1932
1970
 
1933
1971
  if (matchLength < 4) {
1934
- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
1972
+ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
1973
+ ip += step + 1; /* jump faster over incompressible sections */
1974
+ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
1975
+ * In this mode we stop inserting every position into our tables, and only insert
1976
+ * positions that we search, which is one in step positions.
1977
+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
1978
+ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
1979
+ * triggered once we've gone 2KB without finding any matches.
1980
+ */
1981
+ ms->lazySkipping = step > kLazySkippingStep;
1935
1982
  continue;
1936
1983
  }
1937
1984
 
@@ -1941,7 +1988,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1941
1988
  ip ++;
1942
1989
  curr++;
1943
1990
  /* check repCode */
1944
- if (offcode) {
1991
+ if (offBase) {
1945
1992
  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
1946
1993
  const U32 repIndex = (U32)(curr - offset_1);
1947
1994
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1953,18 +2000,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1953
2000
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
1954
2001
  size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
1955
2002
  int const gain2 = (int)(repLength * 3);
1956
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
2003
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
1957
2004
  if ((repLength >= 4) && (gain2 > gain1))
1958
- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
2005
+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
1959
2006
  } }
1960
2007
 
1961
2008
  /* search match, depth 1 */
1962
- { size_t offset2=999999999;
1963
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1964
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
1965
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
2009
+ { size_t ofbCandidate = 999999999;
2010
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
2011
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
2012
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
1966
2013
  if ((ml2 >= 4) && (gain2 > gain1)) {
1967
- matchLength = ml2, offcode = offset2, start = ip;
2014
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
1968
2015
  continue; /* search a better one */
1969
2016
  } }
1970
2017
 
@@ -1973,7 +2020,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1973
2020
  ip ++;
1974
2021
  curr++;
1975
2022
  /* check repCode */
1976
- if (offcode) {
2023
+ if (offBase) {
1977
2024
  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
1978
2025
  const U32 repIndex = (U32)(curr - offset_1);
1979
2026
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1985,38 +2032,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1985
2032
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
1986
2033
  size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
1987
2034
  int const gain2 = (int)(repLength * 4);
1988
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
2035
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
1989
2036
  if ((repLength >= 4) && (gain2 > gain1))
1990
- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
2037
+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
1991
2038
  } }
1992
2039
 
1993
2040
  /* search match, depth 2 */
1994
- { size_t offset2=999999999;
1995
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1996
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
1997
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
2041
+ { size_t ofbCandidate = 999999999;
2042
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
2043
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
2044
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
1998
2045
  if ((ml2 >= 4) && (gain2 > gain1)) {
1999
- matchLength = ml2, offcode = offset2, start = ip;
2046
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
2000
2047
  continue;
2001
2048
  } } }
2002
2049
  break; /* nothing found : store previous solution */
2003
2050
  }
2004
2051
 
2005
2052
  /* catch up */
2006
- if (STORED_IS_OFFSET(offcode)) {
2007
- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
2053
+ if (OFFBASE_IS_OFFSET(offBase)) {
2054
+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
2008
2055
  const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
2009
2056
  const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
2010
2057
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
2011
- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
2058
+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
2012
2059
  }
2013
2060
 
2014
2061
  /* store sequence */
2015
2062
  _storeSequence:
2016
2063
  { size_t const litLength = (size_t)(start - anchor);
2017
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
2064
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
2018
2065
  anchor = ip = start + matchLength;
2019
2066
  }
2067
+ if (ms->lazySkipping) {
2068
+ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
2069
+ if (searchMethod == search_rowHash) {
2070
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
2071
+ }
2072
+ ms->lazySkipping = 0;
2073
+ }
2020
2074
 
2021
2075
  /* check immediate repcode */
2022
2076
  while (ip <= ilimit) {
@@ -2031,8 +2085,8 @@ _storeSequence:
2031
2085
  /* repcode detected we should take it */
2032
2086
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2033
2087
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
2034
- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */
2035
- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
2088
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */
2089
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
2036
2090
  ip += matchLength;
2037
2091
  anchor = ip;
2038
2092
  continue; /* faster when present ... (?) */
@@ -2098,7 +2152,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row(
2098
2152
  size_t ZSTD_compressBlock_lazy2_extDict_row(
2099
2153
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2100
2154
  void const* src, size_t srcSize)
2101
-
2102
2155
  {
2103
2156
  return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
2104
2157
  }