zstdlib 0.10.0-x86-mingw32 → 0.11.0-x86-mingw32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. checksums.yaml +4 -4
  2. data/CHANGES.md +8 -0
  3. data/ext/zstdlib_c/extconf.rb +2 -2
  4. data/ext/zstdlib_c/ruby/zlib-3.2/zstdlib.c +5090 -0
  5. data/ext/zstdlib_c/zstd-1.5.5/lib/common/allocations.h +55 -0
  6. data/ext/zstdlib_c/zstd-1.5.5/lib/common/bits.h +200 -0
  7. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/bitstream.h +19 -60
  8. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/compiler.h +26 -3
  9. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/cpu.h +1 -1
  10. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/debug.c +1 -1
  11. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/debug.h +1 -1
  12. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/entropy_common.c +12 -40
  13. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/error_private.c +9 -2
  14. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/error_private.h +1 -1
  15. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/fse.h +5 -83
  16. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/fse_decompress.c +7 -99
  17. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/huf.h +65 -156
  18. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/mem.h +39 -46
  19. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/pool.c +26 -10
  20. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/pool.h +7 -1
  21. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/portability_macros.h +22 -3
  22. data/ext/zstdlib_c/zstd-1.5.5/lib/common/threading.c +176 -0
  23. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/threading.h +5 -10
  24. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/xxhash.c +2 -2
  25. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/xxhash.h +8 -8
  26. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/zstd_common.c +1 -36
  27. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/zstd_deps.h +1 -1
  28. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/zstd_internal.h +17 -118
  29. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/common/zstd_trace.h +3 -3
  30. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/clevels.h +1 -1
  31. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/fse_compress.c +7 -124
  32. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/hist.c +1 -1
  33. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/hist.h +1 -1
  34. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/huf_compress.c +234 -169
  35. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress.c +1243 -538
  36. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress_internal.h +225 -151
  37. data/ext/zstdlib_c/zstd-1.5.5/lib/compress/zstd_compress_literals.c +235 -0
  38. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress_literals.h +16 -8
  39. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress_sequences.c +3 -3
  40. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress_sequences.h +1 -1
  41. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress_superblock.c +25 -21
  42. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_compress_superblock.h +1 -1
  43. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_cwksp.h +128 -62
  44. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_double_fast.c +95 -33
  45. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_double_fast.h +3 -2
  46. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_fast.c +433 -148
  47. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_fast.h +3 -2
  48. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_lazy.c +398 -345
  49. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_lazy.h +4 -2
  50. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_ldm.c +5 -5
  51. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_ldm.h +1 -1
  52. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_ldm_geartab.h +1 -1
  53. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_opt.c +106 -80
  54. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstd_opt.h +1 -1
  55. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstdmt_compress.c +17 -9
  56. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/compress/zstdmt_compress.h +1 -1
  57. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/huf_decompress.c +434 -441
  58. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/huf_decompress_amd64.S +30 -39
  59. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/zstd_ddict.c +4 -4
  60. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/zstd_ddict.h +1 -1
  61. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/zstd_decompress.c +205 -80
  62. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/zstd_decompress_block.c +201 -81
  63. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/zstd_decompress_block.h +6 -1
  64. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/decompress/zstd_decompress_internal.h +4 -2
  65. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/zdict.h +53 -31
  66. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/zstd.h +580 -135
  67. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/lib/zstd_errors.h +27 -8
  68. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/gzclose.c +1 -1
  69. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/gzcompatibility.h +8 -8
  70. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/gzguts.h +10 -10
  71. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/gzlib.c +3 -3
  72. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/gzread.c +10 -10
  73. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/gzwrite.c +5 -5
  74. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/zstd_zlibwrapper.c +46 -44
  75. data/ext/zstdlib_c/{zstd-1.5.2 → zstd-1.5.5}/zlibWrapper/zstd_zlibwrapper.h +4 -1
  76. data/lib/2.4/zstdlib_c.so +0 -0
  77. data/lib/2.5/zstdlib_c.so +0 -0
  78. data/lib/2.6/zstdlib_c.so +0 -0
  79. data/lib/2.7/zstdlib_c.so +0 -0
  80. data/lib/3.0/zstdlib_c.so +0 -0
  81. data/lib/3.1/zstdlib_c.so +0 -0
  82. data/lib/3.2/zstdlib_c.so +0 -0
  83. metadata +82 -78
  84. data/ext/zstdlib_c/zstd-1.5.2/lib/common/threading.c +0 -122
  85. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_literals.c +0 -159
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -10,6 +10,9 @@

  #include "zstd_compress_internal.h"
  #include "zstd_lazy.h"
+ #include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
+
+ #define kLazySkippingStep 8


  /*-*************************************
@@ -197,8 +200,8 @@ ZSTD_DUBT_findBetterDictMatch (
  U32 matchIndex = dictMatchIndex + dictIndexDelta;
  if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
  DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex);
- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
+ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  }
  if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
  break; /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,7 +221,7 @@ ZSTD_DUBT_findBetterDictMatch (
  }

  if (bestLength >= MINMATCH) {
- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
  DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
  curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
  }
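The STORE_* → *_OFFBASE renames above (and throughout this file) reflect zstd 1.5.5's clearer naming for its combined offset/repcode encoding. A minimal sketch of the idea, assuming the convention that repcodes occupy the low offBase values 1..3 and real offsets are stored shifted past them; the constant and helper names are illustrative, not the library's exact macros:

    #include <assert.h>

    #define REP_NUM 3u   /* number of repeat-offset codes (illustrative) */

    /* offBase packs "repcode or real offset" into one value: 1..REP_NUM are repcodes, */
    static unsigned repcode_to_offbase(unsigned rep)    { assert(rep >= 1 && rep <= REP_NUM); return rep; }
    /* and a real offset is stored as offset + REP_NUM, so the two ranges never collide. */
    static unsigned offset_to_offbase(unsigned offset)  { assert(offset > 0); return offset + REP_NUM; }
    static unsigned offbase_to_offset(unsigned offBase) { assert(offBase > REP_NUM); return offBase - REP_NUM; }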
@@ -230,7 +233,7 @@ ZSTD_DUBT_findBetterDictMatch (
230
233
  static size_t
231
234
  ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
232
235
  const BYTE* const ip, const BYTE* const iend,
233
- size_t* offsetPtr,
236
+ size_t* offBasePtr,
234
237
  U32 const mls,
235
238
  const ZSTD_dictMode_e dictMode)
236
239
  {
@@ -327,8 +330,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
327
330
  if (matchLength > bestLength) {
328
331
  if (matchLength > matchEndIdx - matchIndex)
329
332
  matchEndIdx = matchIndex + (U32)matchLength;
330
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
331
- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
333
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
334
+ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
332
335
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
333
336
  if (dictMode == ZSTD_dictMatchState) {
334
337
  nbCompares = 0; /* in addition to avoiding checking any
@@ -361,16 +364,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
361
364
  if (dictMode == ZSTD_dictMatchState && nbCompares) {
362
365
  bestLength = ZSTD_DUBT_findBetterDictMatch(
363
366
  ms, ip, iend,
364
- offsetPtr, bestLength, nbCompares,
367
+ offBasePtr, bestLength, nbCompares,
365
368
  mls, dictMode);
366
369
  }
367
370
 
368
371
  assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
369
372
  ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
370
373
  if (bestLength >= MINMATCH) {
371
- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
374
+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
372
375
  DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
373
- curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
376
+ curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
374
377
  }
375
378
  return bestLength;
376
379
  }
@@ -381,14 +384,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
381
384
  FORCE_INLINE_TEMPLATE size_t
382
385
  ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
383
386
  const BYTE* const ip, const BYTE* const iLimit,
384
- size_t* offsetPtr,
387
+ size_t* offBasePtr,
385
388
  const U32 mls /* template */,
386
389
  const ZSTD_dictMode_e dictMode)
387
390
  {
388
391
  DEBUGLOG(7, "ZSTD_BtFindBestMatch");
389
392
  if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
390
393
  ZSTD_updateDUBT(ms, ip, iLimit, mls);
391
- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
394
+ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
392
395
  }
393
396
 
394
397
  /***********************************
@@ -561,7 +564,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
561
564
  /* save best solution */
562
565
  if (currentMl > ml) {
563
566
  ml = currentMl;
564
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
567
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
565
568
  if (ip+currentMl == iLimit) {
566
569
  /* best possible, avoids read overflow on next attempt */
567
570
  return ml;
@@ -598,7 +601,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
598
601
  /* save best solution */
599
602
  if (currentMl > ml) {
600
603
  ml = currentMl;
601
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
604
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
602
605
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
603
606
  }
604
607
  }
@@ -617,7 +620,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
  FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
  ZSTD_matchState_t* ms,
  const ZSTD_compressionParameters* const cParams,
- const BYTE* ip, U32 const mls)
+ const BYTE* ip, U32 const mls, U32 const lazySkipping)
  {
  U32* const hashTable = ms->hashTable;
  const U32 hashLog = cParams->hashLog;
@@ -632,6 +635,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
  NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
  hashTable[h] = idx;
  idx++;
+ /* Stop inserting every position when in the lazy skipping mode. */
+ if (lazySkipping)
+ break;
  }

  ms->nextToUpdate = target;
@@ -640,7 +646,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(

  U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
  const ZSTD_compressionParameters* const cParams = &ms->cParams;
- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
+ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
  }

  /* inlining is important to hardwire a hot branch (template emulation) */
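For readers unfamiliar with the hash-chain layout this loop updates, here is a simplified model of the insertion step and of how the new lazySkipping flag short-circuits it. The table sizes and hash function are placeholders, not zstd's actual parameters:

    #include <stdint.h>

    #define HASH_LOG   14
    #define CHAIN_LOG  16
    #define CHAIN_MASK ((1u << CHAIN_LOG) - 1)

    typedef struct {
        uint32_t hashTable[1u << HASH_LOG];   /* head of chain per hash bucket */
        uint32_t chainTable[1u << CHAIN_LOG]; /* "next" link per position, indexed by pos & CHAIN_MASK */
        uint32_t nextToUpdate;                /* first position not yet inserted */
    } matchState;

    static uint32_t hash4(const uint8_t* p)
    {
        uint32_t const v = (uint32_t)p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
        return (v * 2654435761u) >> (32 - HASH_LOG);
    }

    /* Insert every position in [ms->nextToUpdate, target); in lazy-skipping mode,
     * insert only the first one, but still advance nextToUpdate to target. */
    static void insertUpTo(matchState* ms, const uint8_t* base, uint32_t target, int lazySkipping)
    {
        uint32_t idx = ms->nextToUpdate;
        while (idx < target) {
            uint32_t const h = hash4(base + idx);
            ms->chainTable[idx & CHAIN_MASK] = ms->hashTable[h]; /* previous head becomes our "next" */
            ms->hashTable[h] = idx;                              /* idx is the new chain head */
            idx++;
            if (lazySkipping) break;  /* stop indexing every position */
        }
        ms->nextToUpdate = target;
    }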
@@ -684,14 +690,15 @@ size_t ZSTD_HcFindBestMatch(
  }

  /* HC4 match finder */
- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);

  for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
  size_t currentMl=0;
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
  const BYTE* const match = base + matchIndex;
  assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
- if (match[ml] == ip[ml]) /* potentially better */
+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
  currentMl = ZSTD_count(ip, match, iLimit);
  } else {
  const BYTE* const match = dictBase + matchIndex;
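Candidate filtering in the hash-chain search (and, further down, the row search) now compares the 4 bytes that end at position ml instead of the single byte match[ml]: a candidate can only beat the current best length ml if it matches ip on at least ml+1 leading bytes, so checking bytes [ml-3, ml] rejects losing candidates far more often per comparison. A sketch of the test, valid because ml starts at 3 in these searchers:

    #include <stdint.h>
    #include <string.h>

    static uint32_t read32(const void* p) { uint32_t v; memcpy(&v, p, sizeof v); return v; }

    /* Cheap pre-check: can `match` possibly produce a match longer than `ml`?
     * If so, its bytes [ml-3, ml] must all equal ip's - one 32-bit compare. */
    static int canBeatBestLength(const uint8_t* ip, const uint8_t* match, size_t ml)
    {
        return read32(match + ml - 3) == read32(ip + ml - 3);
    }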
@@ -703,7 +710,7 @@ size_t ZSTD_HcFindBestMatch(
703
710
  /* save best solution */
704
711
  if (currentMl > ml) {
705
712
  ml = currentMl;
706
- *offsetPtr = STORE_OFFSET(curr - matchIndex);
713
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
707
714
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
708
715
  }
709
716
 
@@ -739,7 +746,7 @@ size_t ZSTD_HcFindBestMatch(
739
746
  if (currentMl > ml) {
740
747
  ml = currentMl;
741
748
  assert(curr > matchIndex + dmsIndexDelta);
742
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
749
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
743
750
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
744
751
  }
745
752
 
@@ -756,8 +763,6 @@ size_t ZSTD_HcFindBestMatch(
756
763
  * (SIMD) Row-based matchfinder
757
764
  ***********************************/
758
765
  /* Constants for row-based hash */
759
- #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
760
- #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
761
766
  #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
762
767
  #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
763
768
 
@@ -769,73 +774,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr
769
774
  * Starting from the LSB, returns the idx of the next non-zero bit.
770
775
  * Basically counting the nb of trailing zeroes.
771
776
  */
772
- static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
773
- assert(val != 0);
774
- # if defined(_MSC_VER) && defined(_WIN64)
775
- if (val != 0) {
776
- unsigned long r;
777
- _BitScanForward64(&r, val);
778
- return (U32)(r);
779
- } else {
780
- /* Should not reach this code path */
781
- __assume(0);
782
- }
783
- # elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
784
- if (sizeof(size_t) == 4) {
785
- U32 mostSignificantWord = (U32)(val >> 32);
786
- U32 leastSignificantWord = (U32)val;
787
- if (leastSignificantWord == 0) {
788
- return 32 + (U32)__builtin_ctz(mostSignificantWord);
789
- } else {
790
- return (U32)__builtin_ctz(leastSignificantWord);
791
- }
792
- } else {
793
- return (U32)__builtin_ctzll(val);
794
- }
795
- # else
796
- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
797
- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
798
- */
799
- val = ~val & (val - 1ULL); /* Lowest set bit mask */
800
- val = val - ((val >> 1) & 0x5555555555555555);
801
- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
802
- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
803
- # endif
804
- }
805
-
806
- /* ZSTD_rotateRight_*():
807
- * Rotates a bitfield to the right by "count" bits.
808
- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
809
- */
810
- FORCE_INLINE_TEMPLATE
811
- U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
812
- assert(count < 64);
813
- count &= 0x3F; /* for fickle pattern recognition */
814
- return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
815
- }
816
-
817
- FORCE_INLINE_TEMPLATE
818
- U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
819
- assert(count < 32);
820
- count &= 0x1F; /* for fickle pattern recognition */
821
- return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
822
- }
823
-
824
- FORCE_INLINE_TEMPLATE
825
- U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
826
- assert(count < 16);
827
- count &= 0x0F; /* for fickle pattern recognition */
828
- return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
777
+ MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
778
+ return ZSTD_countTrailingZeros64(val);
829
779
  }
830
780
 
831
781
  /* ZSTD_row_nextIndex():
832
782
  * Returns the next index to insert at within a tagTable row, and updates the "head"
833
- * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
783
+ * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
834
784
  */
835
785
  FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
836
- U32 const next = (*tagRow - 1) & rowMask;
837
- *tagRow = (BYTE)next;
838
- return next;
786
+ U32 next = (*tagRow-1) & rowMask;
787
+ next += (next == 0) ? rowMask : 0; /* skip first position */
788
+ *tagRow = (BYTE)next;
789
+ return next;
839
790
  }
840
791
 
841
792
  /* ZSTD_isAligned():
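With the tag table becoming a plain byte array (see the hunks below), the first byte of each row doubles as the insertion "head" counter, which is why the rewritten ZSTD_row_nextIndex() above never returns slot 0 and why the search loop later skips matchPos == 0. A standalone sketch of the cycling behaviour, assuming a 16-entry row:

    #include <stdio.h>

    /* Same stepping rule as the new ZSTD_row_nextIndex(): the head counter lives in
     * slot 0 of the row, so usable tag slots cycle backwards over [1, rowMask]. */
    static unsigned nextIndex(unsigned char* head, unsigned rowMask)
    {
        unsigned next = (*head - 1u) & rowMask;
        next += (next == 0) ? rowMask : 0;   /* skip first position */
        *head = (unsigned char)next;
        return next;
    }

    int main(void)
    {
        unsigned char head = 0;
        for (int i = 0; i < 18; i++)
            printf("%u ", nextIndex(&head, 15));  /* prints 15 14 ... 1 15 14 ..., never 0 */
        printf("\n");
        return 0;
    }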
@@ -849,7 +800,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
849
800
  /* ZSTD_row_prefetch():
850
801
  * Performs prefetching for the hashTable and tagTable at a given row.
851
802
  */
852
- FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
803
+ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
853
804
  PREFETCH_L1(hashTable + relRow);
854
805
  if (rowLog >= 5) {
855
806
  PREFETCH_L1(hashTable + relRow + 16);
@@ -873,13 +824,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
  U32 idx, const BYTE* const iLimit)
  {
  U32 const* const hashTable = ms->hashTable;
- U16 const* const tagTable = ms->tagTable;
+ BYTE const* const tagTable = ms->tagTable;
  U32 const hashLog = ms->rowHashLog;
  U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
  U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);

  for (; idx < lim; ++idx) {
- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
  U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
  ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
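Row hashing now goes through ZSTD_hashPtrSalted(), which mixes a per-matchState salt (ms->hashSalt) into every position hash. A rough sketch of what salting a position hash looks like; the read width, multiplier and mixing are illustrative only, not zstd's actual hash function:

    #include <stdint.h>
    #include <string.h>

    /* Illustrative only: perturb the hashed bytes with a salt so that the same
     * input maps to different rows/tags when the salt changes. */
    static uint32_t hashPtrSalted(const void* p, uint32_t hashLog, uint64_t salt)
    {
        uint64_t v;
        memcpy(&v, p, sizeof v);                 /* like MEM_read64 */
        v ^= salt;
        v *= 0x9E3779B185EBCA87ULL;              /* a 64-bit mixing prime */
        return (uint32_t)(v >> (64 - hashLog));  /* keep the top hashLog bits */
    }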
@@ -895,11 +846,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
895
846
  * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
896
847
  */
897
848
  FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
898
- U16 const* tagTable, BYTE const* base,
849
+ BYTE const* tagTable, BYTE const* base,
899
850
  U32 idx, U32 const hashLog,
900
- U32 const rowLog, U32 const mls)
851
+ U32 const rowLog, U32 const mls,
852
+ U64 const hashSalt)
901
853
  {
902
- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
854
+ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
903
855
  U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
904
856
  ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
905
857
  { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
@@ -917,22 +869,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
917
869
  U32 const rowMask, U32 const useCache)
918
870
  {
919
871
  U32* const hashTable = ms->hashTable;
920
- U16* const tagTable = ms->tagTable;
872
+ BYTE* const tagTable = ms->tagTable;
921
873
  U32 const hashLog = ms->rowHashLog;
922
874
  const BYTE* const base = ms->window.base;
923
875
 
924
876
  DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
925
877
  for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
926
- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
927
- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
878
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
879
+ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
928
880
  U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
929
881
  U32* const row = hashTable + relRow;
930
- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
931
- Explicit cast allows us to get exact desired position within each row */
882
+ BYTE* tagRow = tagTable + relRow;
932
883
  U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
933
884
 
934
- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
935
- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
885
+ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
886
+ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
936
887
  row[pos] = updateStartIdx;
937
888
  }
938
889
  }
@@ -980,7 +931,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
  const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);

  DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
+ }
+
+ /* Returns the mask width of bits group of which will be set to 1. Given not all
+ * architectures have easy movemask instruction, this helps to iterate over
+ * groups of bits easier and faster.
+ */
+ FORCE_INLINE_TEMPLATE U32
+ ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
+ {
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+ (void)rowEntries;
+ #if defined(ZSTD_ARCH_ARM_NEON)
+ /* NEON path only works for little endian */
+ if (!MEM_isLittleEndian()) {
+ return 1;
+ }
+ if (rowEntries == 16) {
+ return 4;
+ }
+ if (rowEntries == 32) {
+ return 2;
+ }
+ if (rowEntries == 64) {
+ return 1;
+ }
+ #endif
+ return 1;
  }

  #if defined(ZSTD_ARCH_X86_SSE2)
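ZSTD_row_matchMaskGroupWidth() exists because the NEON paths below emulate SSE2's movemask with several bits per row entry (4 bits per entry for 16-entry rows, 2 for 32-entry rows), so callers now walk the mask in groups: head becomes headGrouped = head * groupWidth and each candidate slot is recovered by dividing back down. A sketch of that iteration, mirroring the loop in ZSTD_RowFindBestMatch() further down (GCC/Clang builtin used for the trailing-zero count):

    #include <stdint.h>

    /* Visit every candidate slot encoded in a grouped match mask. */
    static void forEachCandidate(uint64_t matches, unsigned headGrouped, unsigned groupWidth,
                                 unsigned rowMask, void (*visit)(unsigned pos))
    {
        for (; matches != 0; matches &= matches - 1) {                 /* clear lowest set bit */
            unsigned const bit = (unsigned)__builtin_ctzll(matches);   /* like ZSTD_VecMask_next() */
            unsigned const pos = ((headGrouped + bit) / groupWidth) & rowMask;
            if (pos == 0) continue;                                    /* slot 0 holds the head */
            visit(pos);
        }
    }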
@@ -1003,71 +982,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
1003
982
  }
1004
983
  #endif
1005
984
 
1006
- /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
1007
- * the hash at the nth position in a row of the tagTable.
1008
- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1009
- * to match up with the actual layout of the entries within the hashTable */
985
+ #if defined(ZSTD_ARCH_ARM_NEON)
986
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
987
+ ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
988
+ {
989
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
990
+ if (rowEntries == 16) {
991
+ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
992
+ * After that groups of 4 bits represent the equalMask. We lower
993
+ * all bits except the highest in these groups by doing AND with
994
+ * 0x88 = 0b10001000.
995
+ */
996
+ const uint8x16_t chunk = vld1q_u8(src);
997
+ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
998
+ const uint8x8_t res = vshrn_n_u16(equalMask, 4);
999
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
1000
+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
1001
+ } else if (rowEntries == 32) {
1002
+ /* Same idea as with rowEntries == 16 but doing AND with
1003
+ * 0x55 = 0b01010101.
1004
+ */
1005
+ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
1006
+ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
1007
+ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
1008
+ const uint8x16_t dup = vdupq_n_u8(tag);
1009
+ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
1010
+ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
1011
+ const uint8x8_t res = vsli_n_u8(t0, t1, 4);
1012
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
1013
+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
1014
+ } else { /* rowEntries == 64 */
1015
+ const uint8x16x4_t chunk = vld4q_u8(src);
1016
+ const uint8x16_t dup = vdupq_n_u8(tag);
1017
+ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
1018
+ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
1019
+ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
1020
+ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
1021
+
1022
+ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
1023
+ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
1024
+ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
1025
+ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
1026
+ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
1027
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
1028
+ return ZSTD_rotateRight_U64(matches, headGrouped);
1029
+ }
1030
+ }
1031
+ #endif
1032
+
1033
+ /* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
1034
+ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
1035
+ * matches the hash at the nth position in a row of the tagTable.
1036
+ * Each row is a circular buffer beginning at the value of "headGrouped". So we
1037
+ * must rotate the "matches" bitfield to match up with the actual layout of the
1038
+ * entries within the hashTable */
1010
1039
  FORCE_INLINE_TEMPLATE ZSTD_VecMask
1011
- ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
1040
+ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
1012
1041
  {
1013
- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
1042
+ const BYTE* const src = tagRow;
1014
1043
  assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
1015
1044
  assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
1045
+ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);
1016
1046
 
1017
1047
  #if defined(ZSTD_ARCH_X86_SSE2)
1018
1048
 
1019
- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
1049
+ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
1020
1050
 
1021
1051
  #else /* SW or NEON-LE */
1022
1052
 
1023
1053
  # if defined(ZSTD_ARCH_ARM_NEON)
1024
1054
  /* This NEON path only works for little endian - otherwise use SWAR below */
1025
1055
  if (MEM_isLittleEndian()) {
1026
- if (rowEntries == 16) {
1027
- const uint8x16_t chunk = vld1q_u8(src);
1028
- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
1029
- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
1030
- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
1031
- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
1032
- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
1033
- const U16 hi = (U16)vgetq_lane_u8(t3, 8);
1034
- const U16 lo = (U16)vgetq_lane_u8(t3, 0);
1035
- return ZSTD_rotateRight_U16((hi << 8) | lo, head);
1036
- } else if (rowEntries == 32) {
1037
- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
1038
- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
1039
- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
1040
- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
1041
- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
1042
- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
1043
- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
1044
- const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
1045
- const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
1046
- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
1047
- const uint8x8x2_t t3 = vuzp_u8(t2, t0);
1048
- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
1049
- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
1050
- return ZSTD_rotateRight_U32(matches, head);
1051
- } else { /* rowEntries == 64 */
1052
- const uint8x16x4_t chunk = vld4q_u8(src);
1053
- const uint8x16_t dup = vdupq_n_u8(tag);
1054
- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
1055
- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
1056
- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
1057
- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
1058
-
1059
- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
1060
- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
1061
- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
1062
- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
1063
- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
1064
- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
1065
- return ZSTD_rotateRight_U64(matches, head);
1066
- }
1056
+ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
1067
1057
  }
1068
1058
  # endif /* ZSTD_ARCH_ARM_NEON */
1069
1059
  /* SWAR */
1070
- { const size_t chunkSize = sizeof(size_t);
1060
+ { const int chunkSize = sizeof(size_t);
1071
1061
  const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
1072
1062
  const size_t xFF = ~((size_t)0);
1073
1063
  const size_t x01 = xFF / 0xFF;
@@ -1100,11 +1090,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
  }
  matches = ~matches;
  if (rowEntries == 16) {
- return ZSTD_rotateRight_U16((U16)matches, head);
+ return ZSTD_rotateRight_U16((U16)matches, headGrouped);
  } else if (rowEntries == 32) {
- return ZSTD_rotateRight_U32((U32)matches, head);
+ return ZSTD_rotateRight_U32((U32)matches, headGrouped);
  } else {
- return ZSTD_rotateRight_U64((U64)matches, head);
+ return ZSTD_rotateRight_U64((U64)matches, headGrouped);
  }
  }
  #endif
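When neither SSE2 nor NEON is available, the match mask comes from the SWAR (SIMD-within-a-register) fallback whose tail is shown above. A sketch of the underlying byte-equality trick on one 64-bit chunk, simplified from the constants (x01, x80) set up in that block:

    #include <stdint.h>
    #include <string.h>

    /* Build a per-byte equality mask for one 64-bit chunk: bit 7 of byte i is set
     * iff src[i] == tag. The (x|x80)-x01 form avoids borrows between bytes, so the
     * result is exact - no false positives. */
    static uint64_t swarEqualMask(const uint8_t* src, uint8_t tag)
    {
        uint64_t const x01 = 0x0101010101010101ULL;
        uint64_t const x80 = 0x8080808080808080ULL;
        uint64_t x;
        memcpy(&x, src, sizeof x);
        x ^= (uint64_t)tag * x01;                       /* equal bytes become 0x00 */
        return ~((((x | x80) - x01) | x) & x80) & x80;  /* MSB set exactly where a byte is zero */
    }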
@@ -1134,7 +1124,7 @@ size_t ZSTD_RowFindBestMatch(
1134
1124
  const U32 rowLog)
1135
1125
  {
1136
1126
  U32* const hashTable = ms->hashTable;
1137
- U16* const tagTable = ms->tagTable;
1127
+ BYTE* const tagTable = ms->tagTable;
1138
1128
  U32* const hashCache = ms->hashCache;
1139
1129
  const U32 hashLog = ms->rowHashLog;
1140
1130
  const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -1152,8 +1142,11 @@ size_t ZSTD_RowFindBestMatch(
1152
1142
  const U32 rowEntries = (1U << rowLog);
1153
1143
  const U32 rowMask = rowEntries - 1;
1154
1144
  const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
1145
+ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
1146
+ const U64 hashSalt = ms->hashSalt;
1155
1147
  U32 nbAttempts = 1U << cappedSearchLog;
1156
1148
  size_t ml=4-1;
1149
+ U32 hash;
1157
1150
 
1158
1151
  /* DMS/DDS variables that may be referenced laster */
1159
1152
  const ZSTD_matchState_t* const dms = ms->dictMatchState;
@@ -1177,7 +1170,7 @@ size_t ZSTD_RowFindBestMatch(
1177
1170
  if (dictMode == ZSTD_dictMatchState) {
1178
1171
  /* Prefetch DMS rows */
1179
1172
  U32* const dmsHashTable = dms->hashTable;
1180
- U16* const dmsTagTable = dms->tagTable;
1173
+ BYTE* const dmsTagTable = dms->tagTable;
1181
1174
  U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1182
1175
  U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1183
1176
  dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
@@ -1187,23 +1180,34 @@ size_t ZSTD_RowFindBestMatch(
1187
1180
  }
1188
1181
 
1189
1182
  /* Update the hashTable and tagTable up to (but not including) ip */
1190
- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
1183
+ if (!ms->lazySkipping) {
1184
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
1185
+ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
1186
+ } else {
1187
+ /* Stop inserting every position when in the lazy skipping mode.
1188
+ * The hash cache is also not kept up to date in this mode.
1189
+ */
1190
+ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
1191
+ ms->nextToUpdate = curr;
1192
+ }
1193
+ ms->hashSaltEntropy += hash; /* collect salt entropy */
1194
+
1191
1195
  { /* Get the hash for ip, compute the appropriate row */
1192
- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
1193
1196
  U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1194
1197
  U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
1195
1198
  U32* const row = hashTable + relRow;
1196
1199
  BYTE* tagRow = (BYTE*)(tagTable + relRow);
1197
- U32 const head = *tagRow & rowMask;
1200
+ U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
1198
1201
  U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1199
1202
  size_t numMatches = 0;
1200
1203
  size_t currMatch = 0;
1201
- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
1204
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
1202
1205
 
1203
1206
  /* Cycle through the matches and prefetch */
1204
- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1205
- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1207
+ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
1208
+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
1206
1209
  U32 const matchIndex = row[matchPos];
1210
+ if(matchPos == 0) continue;
1207
1211
  assert(numMatches < rowEntries);
1208
1212
  if (matchIndex < lowLimit)
1209
1213
  break;
@@ -1213,13 +1217,14 @@ size_t ZSTD_RowFindBestMatch(
1213
1217
  PREFETCH_L1(dictBase + matchIndex);
1214
1218
  }
1215
1219
  matchBuffer[numMatches++] = matchIndex;
1220
+ --nbAttempts;
1216
1221
  }
1217
1222
 
1218
1223
  /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
1219
1224
  in ZSTD_row_update_internal() at the next search. */
1220
1225
  {
1221
1226
  U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1222
- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
1227
+ tagRow[pos] = (BYTE)tag;
1223
1228
  row[pos] = ms->nextToUpdate++;
1224
1229
  }
1225
1230
 
@@ -1233,7 +1238,8 @@ size_t ZSTD_RowFindBestMatch(
1233
1238
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1234
1239
  const BYTE* const match = base + matchIndex;
1235
1240
  assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
1236
- if (match[ml] == ip[ml]) /* potentially better */
1241
+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
1242
+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
1237
1243
  currentMl = ZSTD_count(ip, match, iLimit);
1238
1244
  } else {
1239
1245
  const BYTE* const match = dictBase + matchIndex;
@@ -1245,7 +1251,7 @@ size_t ZSTD_RowFindBestMatch(
1245
1251
  /* Save best solution */
1246
1252
  if (currentMl > ml) {
1247
1253
  ml = currentMl;
1248
- *offsetPtr = STORE_OFFSET(curr - matchIndex);
1254
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
1249
1255
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
1250
1256
  }
1251
1257
  }
@@ -1263,19 +1269,21 @@ size_t ZSTD_RowFindBestMatch(
1263
1269
  const U32 dmsSize = (U32)(dmsEnd - dmsBase);
1264
1270
  const U32 dmsIndexDelta = dictLimit - dmsSize;
1265
1271
 
1266
- { U32 const head = *dmsTagRow & rowMask;
1272
+ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
1267
1273
  U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1268
1274
  size_t numMatches = 0;
1269
1275
  size_t currMatch = 0;
1270
- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
1276
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
1271
1277
 
1272
- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1273
- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1278
+ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
1279
+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
1274
1280
  U32 const matchIndex = dmsRow[matchPos];
1281
+ if(matchPos == 0) continue;
1275
1282
  if (matchIndex < dmsLowestIndex)
1276
1283
  break;
1277
1284
  PREFETCH_L1(dmsBase + matchIndex);
1278
1285
  matchBuffer[numMatches++] = matchIndex;
1286
+ --nbAttempts;
1279
1287
  }
1280
1288
 
1281
1289
  /* Return the longest match */
@@ -1294,7 +1302,7 @@ size_t ZSTD_RowFindBestMatch(
1294
1302
  if (currentMl > ml) {
1295
1303
  ml = currentMl;
1296
1304
  assert(curr > matchIndex + dmsIndexDelta);
1297
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
1305
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
1298
1306
  if (ip+currentMl == iLimit) break;
1299
1307
  }
1300
1308
  }
@@ -1304,14 +1312,10 @@ size_t ZSTD_RowFindBestMatch(
1304
1312
  }
1305
1313
 
1306
1314
 
1307
- typedef size_t (*searchMax_f)(
1308
- ZSTD_matchState_t* ms,
1309
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1310
-
1311
1315
  /**
1312
- * This struct contains the functions necessary for lazy to search.
1313
- * Currently, that is only searchMax. However, it is still valuable to have the
1314
- * VTable because this makes it easier to add more functions to the VTable later.
1316
+ * Generate search functions templated on (dictMode, mls, rowLog).
1317
+ * These functions are outlined for code size & compilation time.
1318
+ * ZSTD_searchMax() dispatches to the correct implementation function.
1315
1319
  *
1316
1320
  * TODO: The start of the search function involves loading and calculating a
1317
1321
  * bunch of constants from the ZSTD_matchState_t. These computations could be
@@ -1329,25 +1333,25 @@ typedef size_t (*searchMax_f)(
1329
1333
  * the single segment loop. It should go in searchMax instead of its own
1330
1334
  * function to avoid having multiple virtual function calls per search.
1331
1335
  */
1332
- typedef struct {
1333
- searchMax_f searchMax;
1334
- } ZSTD_LazyVTable;
1335
1336
 
1336
- #define GEN_ZSTD_BT_VTABLE(dictMode, mls) \
1337
- static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \
1338
- ZSTD_matchState_t* ms, \
1339
- const BYTE* ip, const BYTE* const iLimit, \
1340
- size_t* offsetPtr) \
1341
- { \
1342
- assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1343
- return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1344
- } \
1345
- static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \
1346
- ZSTD_BtFindBestMatch_##dictMode##_##mls \
1347
- };
1337
+ #define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
1338
+ #define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
1339
+ #define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog
1348
1340
 
1349
- #define GEN_ZSTD_HC_VTABLE(dictMode, mls) \
1350
- static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \
1341
+ #define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
1342
+
1343
+ #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \
1344
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \
1345
+ ZSTD_matchState_t* ms, \
1346
+ const BYTE* ip, const BYTE* const iLimit, \
1347
+ size_t* offBasePtr) \
1348
+ { \
1349
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1350
+ return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
1351
+ } \
1352
+
1353
+ #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \
1354
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \
1351
1355
  ZSTD_matchState_t* ms, \
1352
1356
  const BYTE* ip, const BYTE* const iLimit, \
1353
1357
  size_t* offsetPtr) \
@@ -1355,12 +1359,9 @@ typedef struct {
1355
1359
  assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1356
1360
  return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1357
1361
  } \
1358
- static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
1359
- ZSTD_HcFindBestMatch_##dictMode##_##mls \
1360
- };
1361
1362
 
1362
- #define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \
1363
- static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \
1363
+ #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \
1364
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \
1364
1365
  ZSTD_matchState_t* ms, \
1365
1366
  const BYTE* ip, const BYTE* const iLimit, \
1366
1367
  size_t* offsetPtr) \
@@ -1369,9 +1370,6 @@ typedef struct {
1369
1370
  assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
1370
1371
  return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
1371
1372
  } \
1372
- static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \
1373
- ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \
1374
- };
1375
1373
 
1376
1374
  #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
1377
1375
  X(dictMode, mls, 4) \
@@ -1394,84 +1392,103 @@ typedef struct {
1394
1392
  X(__VA_ARGS__, dictMatchState) \
1395
1393
  X(__VA_ARGS__, dedicatedDictSearch)
1396
1394
 
1397
- /* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
1398
- ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
1399
- /* Generate Binary Tree VTables for each combination of (dictMode, mls) */
1400
- ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
1401
- /* Generate Hash Chain VTables for each combination of (dictMode, mls) */
1402
- ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
1403
-
1404
- #define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
1405
- { \
1406
- &ZSTD_BtVTable_##dictMode##_4, \
1407
- &ZSTD_BtVTable_##dictMode##_5, \
1408
- &ZSTD_BtVTable_##dictMode##_6 \
1409
- }
1395
+ /* Generate row search fns for each combination of (dictMode, mls, rowLog) */
1396
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
1397
+ /* Generate binary Tree search fns for each combination of (dictMode, mls) */
1398
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
1399
+ /* Generate hash chain search fns for each combination of (dictMode, mls) */
1400
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)
1410
1401
 
1411
- #define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
1412
- { \
1413
- &ZSTD_HcVTable_##dictMode##_4, \
1414
- &ZSTD_HcVTable_##dictMode##_5, \
1415
- &ZSTD_HcVTable_##dictMode##_6 \
1416
- }
1417
-
1418
- #define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
1419
- { \
1420
- &ZSTD_RowVTable_##dictMode##_##mls##_4, \
1421
- &ZSTD_RowVTable_##dictMode##_##mls##_5, \
1422
- &ZSTD_RowVTable_##dictMode##_##mls##_6 \
1423
- }
1402
+ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1424
1403
 
1425
- #define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
1426
- { \
1427
- GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
1428
- GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
1429
- GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
1404
+ #define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
1405
+ case mls: \
1406
+ return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
1407
+ #define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
1408
+ case mls: \
1409
+ return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
1410
+ #define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \
1411
+ case rowLog: \
1412
+ return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
1413
+
1414
+ #define ZSTD_SWITCH_MLS(X, dictMode) \
1415
+ switch (mls) { \
1416
+ ZSTD_FOR_EACH_MLS(X, dictMode) \
1430
1417
  }
1431
1418
 
1432
- #define GEN_ZSTD_VTABLE_ARRAY(X) \
1433
- { \
1434
- X(noDict), \
1435
- X(extDict), \
1436
- X(dictMatchState), \
1437
- X(dedicatedDictSearch) \
1438
- }
1439
-
1440
- /* *******************************
1441
- * Common parser - lazy strategy
1442
- *********************************/
1443
- typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1419
+ #define ZSTD_SWITCH_ROWLOG(dictMode, mls) \
1420
+ case mls: \
1421
+ switch (rowLog) { \
1422
+ ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
1423
+ } \
1424
+ ZSTD_UNREACHABLE; \
1425
+ break;
1426
+
1427
+ #define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \
1428
+ switch (searchMethod) { \
1429
+ case search_hashChain: \
1430
+ ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
1431
+ break; \
1432
+ case search_binaryTree: \
1433
+ ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
1434
+ break; \
1435
+ case search_rowHash: \
1436
+ ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \
1437
+ break; \
1438
+ } \
1439
+ ZSTD_UNREACHABLE;
1444
1440
 
1445
1441
  /**
1446
- * This table is indexed first by the four ZSTD_dictMode_e values, and then
1447
- * by the two searchMethod_e values. NULLs are placed for configurations
1448
- * that should never occur (extDict modes go to the other implementation
1449
- * below and there is no DDSS for binary tree search yet).
1442
+ * Searches for the longest match at @p ip.
1443
+ * Dispatches to the correct implementation function based on the
1444
+ * (searchMethod, dictMode, mls, rowLog). We use switch statements
1445
+ * here instead of using an indirect function call through a function
1446
+ * pointer because after Spectre and Meltdown mitigations, indirect
1447
+ * function calls can be very costly, especially in the kernel.
1448
+ *
1449
+ * NOTE: dictMode and searchMethod should be templated, so those switch
1450
+ * statements should be optimized out. Only the mls & rowLog switches
1451
+ * should be left.
1452
+ *
1453
+ * @param ms The match state.
1454
+ * @param ip The position to search at.
1455
+ * @param iend The end of the input data.
1456
+ * @param[out] offsetPtr Stores the match offset into this pointer.
1457
+ * @param mls The minimum search length, in the range [4, 6].
1458
+ * @param rowLog The row log (if applicable), in the range [4, 6].
1459
+ * @param searchMethod The search method to use (templated).
1460
+ * @param dictMode The dictMode (templated).
1461
+ *
1462
+ * @returns The length of the longest match found, or < mls if no match is found.
1463
+ * If a match is found its offset is stored in @p offsetPtr.
1450
1464
  */
1451
-
1452
- static ZSTD_LazyVTable const*
1453
- ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
1465
+ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
1466
+ ZSTD_matchState_t* ms,
1467
+ const BYTE* ip,
1468
+ const BYTE* iend,
1469
+ size_t* offsetPtr,
1470
+ U32 const mls,
1471
+ U32 const rowLog,
1472
+ searchMethod_e const searchMethod,
1473
+ ZSTD_dictMode_e const dictMode)
1454
1474
  {
1455
- /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
1456
- ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
1457
- ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
1458
- /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
1459
- ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
1460
-
1461
- U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
1462
- U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1463
- switch (searchMethod) {
1464
- case search_hashChain:
1465
- return hcVTables[dictMode][mls - 4];
1466
- case search_binaryTree:
1467
- return btVTables[dictMode][mls - 4];
1468
- case search_rowHash:
1469
- return rowVTables[dictMode][mls - 4][rowLog - 4];
1470
- default:
1471
- return NULL;
1475
+ if (dictMode == ZSTD_noDict) {
1476
+ ZSTD_SWITCH_SEARCH_METHOD(noDict)
1477
+ } else if (dictMode == ZSTD_extDict) {
1478
+ ZSTD_SWITCH_SEARCH_METHOD(extDict)
1479
+ } else if (dictMode == ZSTD_dictMatchState) {
1480
+ ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
1481
+ } else if (dictMode == ZSTD_dedicatedDictSearch) {
1482
+ ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
1472
1483
  }
1484
+ ZSTD_UNREACHABLE;
1485
+ return 0;
1473
1486
  }
1474
1487
 
1488
+ /* *******************************
1489
+ * Common parser - lazy strategy
1490
+ *********************************/
1491
+
1475
1492
  FORCE_INLINE_TEMPLATE size_t
1476
1493
  ZSTD_compressBlock_lazy_generic(
1477
1494
  ZSTD_matchState_t* ms, seqStore_t* seqStore,
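The old searchMax_f function pointer and ZSTD_LazyVTable are gone: search implementations are still generated per (dictMode, mls, rowLog), but the hunk above selects them through nested switch statements, because indirect calls are comparatively expensive under Spectre/Meltdown mitigations (e.g. retpolines), particularly in kernel builds. A generic sketch of the pattern with made-up names; when the outer selector is a compile-time constant, as it is for the templated callers here, that switch folds away entirely:

    #include <stddef.h>

    typedef enum { method_hashChain, method_binaryTree } method_e;

    /* One outlined implementation per (method, mls) combination (stubs here). */
    static size_t search_hc_4(const void* ip) { (void)ip; return 0; }
    static size_t search_hc_5(const void* ip) { (void)ip; return 0; }
    static size_t search_bt_4(const void* ip) { (void)ip; return 0; }
    static size_t search_bt_5(const void* ip) { (void)ip; return 0; }

    /* Dispatch by switch instead of storing a function pointer in a vtable. */
    static size_t searchMax(method_e method, unsigned mls, const void* ip)
    {
        switch (method) {
        case method_hashChain:
            switch (mls) { case 4: return search_hc_4(ip); default: return search_hc_5(ip); }
        case method_binaryTree:
            switch (mls) { case 4: return search_bt_4(ip); default: return search_bt_5(ip); }
        }
        return 0;
    }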
@@ -1488,9 +1505,11 @@ ZSTD_compressBlock_lazy_generic(
1488
1505
  const BYTE* const base = ms->window.base;
1489
1506
  const U32 prefixLowestIndex = ms->window.dictLimit;
1490
1507
  const BYTE* const prefixLowest = base + prefixLowestIndex;
1508
+ const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
1509
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
1491
1510
 
1492
- searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
1493
- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
1511
+ U32 offset_1 = rep[0], offset_2 = rep[1];
1512
+ U32 offsetSaved1 = 0, offsetSaved2 = 0;
1494
1513
 
1495
1514
  const int isDMS = dictMode == ZSTD_dictMatchState;
1496
1515
  const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
@@ -1505,16 +1524,14 @@ ZSTD_compressBlock_lazy_generic(
1505
1524
  0;
1506
1525
  const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
1507
1526
 
1508
- assert(searchMax != NULL);
1509
-
1510
1527
  DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
1511
1528
  ip += (dictAndPrefixLength == 0);
1512
1529
  if (dictMode == ZSTD_noDict) {
1513
1530
  U32 const curr = (U32)(ip - base);
1514
1531
  U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
1515
1532
  U32 const maxRep = curr - windowLow;
1516
- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
1517
- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
1533
+ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
1534
+ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
1518
1535
  }
1519
1536
  if (isDxS) {
1520
1537
  /* dictMatchState repCode checks don't currently handle repCode == 0
@@ -1523,11 +1540,11 @@ ZSTD_compressBlock_lazy_generic(
1523
1540
  assert(offset_2 <= dictAndPrefixLength);
1524
1541
  }
1525
1542
 
1543
+ /* Reset the lazy skipping state */
1544
+ ms->lazySkipping = 0;
1545
+
1526
1546
  if (searchMethod == search_rowHash) {
1527
- const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1528
- ZSTD_row_fillHashCache(ms, base, rowLog,
1529
- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1530
- ms->nextToUpdate, ilimit);
1547
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
1531
1548
  }
1532
1549
 
1533
1550
  /* Match Loop */
@@ -1539,7 +1556,7 @@ ZSTD_compressBlock_lazy_generic(
1539
1556
  #endif
1540
1557
  while (ip < ilimit) {
1541
1558
  size_t matchLength=0;
1542
- size_t offcode=STORE_REPCODE_1;
1559
+ size_t offBase = REPCODE1_TO_OFFBASE;
1543
1560
  const BYTE* start=ip+1;
1544
1561
  DEBUGLOG(7, "search baseline (depth 0)");
1545
1562
 
@@ -1564,14 +1581,23 @@ ZSTD_compressBlock_lazy_generic(
  }

  /* first search (depth 0) */
- { size_t offsetFound = 999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
+ { size_t offbaseFound = 999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
  if (ml2 > matchLength)
- matchLength = ml2, start = ip, offcode=offsetFound;
+ matchLength = ml2, start = ip, offBase = offbaseFound;
  }

  if (matchLength < 4) {
- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
+ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */;
+ ip += step;
+ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+ * In this mode we stop inserting every position into our tables, and only insert
+ * positions that we search, which is one in step positions.
+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+ * triggered once we've gone 2KB without finding any matches.
+ */
+ ms->lazySkipping = step > kLazySkippingStep;
  continue;
  }

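The skip step above grows with the distance since the last match, and lazy skipping switches on once step exceeds kLazySkippingStep (8). Assuming kSearchStrength is 8, as in zstd's compress internals, that threshold corresponds to the "2KB without finding any matches" mentioned in the comment:

    #include <stdio.h>

    int main(void)
    {
        unsigned const kSearchStrength = 8, kLazySkippingStep = 8;   /* assumed values */
        for (unsigned dist = 0; dist <= 4096; dist += 1024) {
            unsigned const step = (dist >> kSearchStrength) + 1;
            printf("dist=%4u  step=%2u  lazySkipping=%d\n",
                   dist, step, step > kLazySkippingStep);            /* flips on at dist >= 2048 */
        }
        return 0;
    }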
@@ -1581,12 +1607,12 @@ ZSTD_compressBlock_lazy_generic(
  DEBUGLOG(7, "search depth 1");
  ip ++;
  if ( (dictMode == ZSTD_noDict)
- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
  size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
  int const gain2 = (int)(mlRep * 3);
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
  if ((mlRep >= 4) && (gain2 > gain1))
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  }
  if (isDxS) {
  const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1598,17 +1624,17 @@ ZSTD_compressBlock_lazy_generic(
  const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
  size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
  int const gain2 = (int)(mlRep * 3);
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
  if ((mlRep >= 4) && (gain2 > gain1))
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  }
  }
- { size_t offset2=999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
+ { size_t ofbCandidate=999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
  if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offcode = offset2, start = ip;
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
  continue; /* search a better one */
  } }

@@ -1617,12 +1643,12 @@ ZSTD_compressBlock_lazy_generic(
  DEBUGLOG(7, "search depth 2");
  ip ++;
  if ( (dictMode == ZSTD_noDict)
- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
  size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
  int const gain2 = (int)(mlRep * 4);
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  if ((mlRep >= 4) && (gain2 > gain1))
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  }
  if (isDxS) {
  const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1634,17 +1660,17 @@ ZSTD_compressBlock_lazy_generic(
  const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
  size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
  int const gain2 = (int)(mlRep * 4);
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  if ((mlRep >= 4) && (gain2 > gain1))
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  }
  }
- { size_t offset2=999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
+ { size_t ofbCandidate=999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
  if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offcode = offset2, start = ip;
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
  continue;
  } } }
  break; /* nothing found : store previous solution */
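In the depth-1 and depth-2 re-searches above, a candidate found one byte later only displaces the current match when its "gain" is strictly larger: ZSTD_highbit32 of the offBase value stands in for the bit cost of encoding the offset, and the +4 / +7 constants handicap the challenger in favour of the match already in hand. The standalone sketch below mirrors that comparison; highbit32 is a portable stand-in for ZSTD_highbit32 and the function name is invented for illustration.

    #include <stddef.h>

    static unsigned highbit32(unsigned v)   /* index of the highest set bit, v != 0 */
    {
        unsigned n = 0;
        while (v >>= 1) n++;
        return n;
    }

    /* handicap is 4 at depth 1 and 7 at depth 2, as in the hunks above */
    static int candidateWins(size_t ml2, size_t ofbCandidate,
                             size_t matchLength, size_t offBase, int handicap)
    {
        int const gain2 = (int)(ml2 * 4 - highbit32((unsigned)ofbCandidate));             /* raw approx */
        int const gain1 = (int)(matchLength * 4 - highbit32((unsigned)offBase) + handicap);
        return (ml2 >= 4) && (gain2 > gain1);
    }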
@@ -1655,26 +1681,33 @@ ZSTD_compressBlock_lazy_generic(
  * notably if `value` is unsigned, resulting in a large positive `-value`.
  */
  /* catch up */
- if (STORED_IS_OFFSET(offcode)) {
+ if (OFFBASE_IS_OFFSET(offBase)) {
  if (dictMode == ZSTD_noDict) {
- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */
+ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
+ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */
  { start--; matchLength++; }
  }
  if (isDxS) {
- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
  const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
  const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
  }
- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
  }
  /* store sequence */
  _storeSequence:
  { size_t const litLength = (size_t)(start - anchor);
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
  anchor = ip = start + matchLength;
  }
+ if (ms->lazySkipping) {
+ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+ if (searchMethod == search_rowHash) {
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+ }
+ ms->lazySkipping = 0;
+ }

  /* check immediate repcode */
  if (isDxS) {
@@ -1688,8 +1721,8 @@ _storeSequence:
  && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
  const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  ip += matchLength;
  anchor = ip;
  continue;
@@ -1703,16 +1736,20 @@ _storeSequence:
  && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
  /* store sequence */
  matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  ip += matchLength;
  anchor = ip;
  continue; /* faster when present ... (?) */
  } } }

- /* Save reps for next block */
- rep[0] = offset_1 ? offset_1 : savedOffset;
- rep[1] = offset_2 ? offset_2 : savedOffset;
+ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+ /* save reps for next block */
+ rep[0] = offset_1 ? offset_1 : offsetSaved1;
+ rep[1] = offset_2 ? offset_2 : offsetSaved2;

  /* Return the last literals size */
  return (size_t)(iend - anchor);
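The end-of-block bookkeeping above is the visible part of a wider 1.5.5 change: instead of one savedOffset shared by both rep slots, each slot keeps its own stashed value, and a stashed offset_1 is rotated into the offset_2 slot if the loop produced a fresh offset_1. A small sketch of just that logic, with names copied from the diff (the helper itself is invented for illustration):

    typedef unsigned U32;

    /* offsetSaved1/2 hold rep values that were out of the window at block start
     * and were zeroed before the match loop; they are restored only if the loop
     * never replaced them. */
    static void saveRepsForNextBlock(U32 rep[2], U32 offset_1, U32 offset_2,
                                     U32 offsetSaved1, U32 offsetSaved2)
    {
        /* offset_1 started invalid but became valid: its stashed value now
         * belongs to the second rep slot. */
        offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;

        rep[0] = offset_1 ? offset_1 : offsetSaved1;
        rep[1] = offset_2 ? offset_2 : offsetSaved2;
    }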
@@ -1881,19 +1918,20 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  const BYTE* const dictEnd = dictBase + dictLimit;
  const BYTE* const dictStart = dictBase + ms->window.lowLimit;
  const U32 windowLog = ms->cParams.windowLog;
- const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
+ const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);

- searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
  U32 offset_1 = rep[0], offset_2 = rep[1];

  DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);

+ /* Reset the lazy skipping state */
+ ms->lazySkipping = 0;
+
  /* init */
  ip += (ip == prefixStart);
  if (searchMethod == search_rowHash) {
- ZSTD_row_fillHashCache(ms, base, rowLog,
- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
- ms->nextToUpdate, ilimit);
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
  }

  /* Match Loop */
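The extDict variant now derives both mls and rowLog with BOUNDED instead of the old searchLog < 5 ? 4 : 5 expression, and feeds the clamped mls straight into ZSTD_row_fillHashCache. BOUNDED is presumed to be the usual clamp macro from zstd's internal headers; the expansion below is an assumption, shown only to make the new bounds explicit:

    #define MIN(a,b) ((a) < (b) ? (a) : (b))
    #define MAX(a,b) ((a) > (b) ? (a) : (b))
    #define BOUNDED(min,val,max) (MAX(min, MIN(val, max)))
    /* e.g. searchLog 3 -> rowLog 4, searchLog 7 -> rowLog 6, minMatch 7 -> mls 6 */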
@@ -1905,7 +1943,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  #endif
  while (ip < ilimit) {
  size_t matchLength=0;
- size_t offcode=STORE_REPCODE_1;
+ size_t offBase = REPCODE1_TO_OFFBASE;
  const BYTE* start=ip+1;
  U32 curr = (U32)(ip-base);

@@ -1924,14 +1962,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  } }

  /* first search (depth 0) */
- { size_t offsetFound = 999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
+ { size_t ofbCandidate = 999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
  if (ml2 > matchLength)
- matchLength = ml2, start = ip, offcode=offsetFound;
+ matchLength = ml2, start = ip, offBase = ofbCandidate;
  }

  if (matchLength < 4) {
- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
+ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
+ ip += step + 1; /* jump faster over incompressible sections */
+ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+ * In this mode we stop inserting every position into our tables, and only insert
+ * positions that we search, which is one in step positions.
+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+ * triggered once we've gone 2KB without finding any matches.
+ */
+ ms->lazySkipping = step > kLazySkippingStep;
  continue;
  }

@@ -1941,7 +1988,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  ip ++;
  curr++;
  /* check repCode */
- if (offcode) {
+ if (offBase) {
  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
  const U32 repIndex = (U32)(curr - offset_1);
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1953,18 +2000,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  int const gain2 = (int)(repLength * 3);
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
  if ((repLength >= 4) && (gain2 > gain1))
- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
  } }

  /* search match, depth 1 */
- { size_t offset2=999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
+ { size_t ofbCandidate = 999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
  if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offcode = offset2, start = ip;
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
  continue; /* search a better one */
  } }

@@ -1973,7 +2020,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  ip ++;
  curr++;
  /* check repCode */
- if (offcode) {
+ if (offBase) {
  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
  const U32 repIndex = (U32)(curr - offset_1);
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1985,38 +2032,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  int const gain2 = (int)(repLength * 4);
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  if ((repLength >= 4) && (gain2 > gain1))
- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
  } }

  /* search match, depth 2 */
- { size_t offset2=999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
+ { size_t ofbCandidate = 999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
  if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offcode = offset2, start = ip;
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
  continue;
  } } }
  break; /* nothing found : store previous solution */
  }

  /* catch up */
- if (STORED_IS_OFFSET(offcode)) {
- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
+ if (OFFBASE_IS_OFFSET(offBase)) {
+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
  const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
  const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
  }

  /* store sequence */
  _storeSequence:
  { size_t const litLength = (size_t)(start - anchor);
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
  anchor = ip = start + matchLength;
  }
+ if (ms->lazySkipping) {
+ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+ if (searchMethod == search_rowHash) {
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+ }
+ ms->lazySkipping = 0;
+ }

  /* check immediate repcode */
  while (ip <= ilimit) {
@@ -2031,8 +2085,8 @@ _storeSequence:
  /* repcode detected we should take it */
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  ip += matchLength;
  anchor = ip;
  continue; /* faster when present ... (?) */
@@ -2098,7 +2152,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row(
  size_t ZSTD_compressBlock_lazy2_extDict_row(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
-
  {
  return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
  }