zstd-ruby 1.5.1.1 → 1.5.5.0

Files changed (117)
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/README.md +78 -5
  4. data/Rakefile +8 -2
  5. data/ext/zstdruby/common.h +15 -0
  6. data/ext/zstdruby/extconf.rb +1 -1
  7. data/ext/zstdruby/libzstd/common/allocations.h +55 -0
  8. data/ext/zstdruby/libzstd/common/bits.h +200 -0
  9. data/ext/zstdruby/libzstd/common/bitstream.h +19 -60
  10. data/ext/zstdruby/libzstd/common/compiler.h +26 -3
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -1
  12. data/ext/zstdruby/libzstd/common/debug.c +1 -1
  13. data/ext/zstdruby/libzstd/common/debug.h +1 -1
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +12 -40
  15. data/ext/zstdruby/libzstd/common/error_private.c +9 -2
  16. data/ext/zstdruby/libzstd/common/error_private.h +1 -1
  17. data/ext/zstdruby/libzstd/common/fse.h +5 -83
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +7 -99
  19. data/ext/zstdruby/libzstd/common/huf.h +65 -156
  20. data/ext/zstdruby/libzstd/common/mem.h +39 -46
  21. data/ext/zstdruby/libzstd/common/pool.c +37 -16
  22. data/ext/zstdruby/libzstd/common/pool.h +9 -3
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +28 -3
  24. data/ext/zstdruby/libzstd/common/threading.c +68 -14
  25. data/ext/zstdruby/libzstd/common/threading.h +5 -10
  26. data/ext/zstdruby/libzstd/common/xxhash.c +2 -2
  27. data/ext/zstdruby/libzstd/common/xxhash.h +8 -8
  28. data/ext/zstdruby/libzstd/common/zstd_common.c +1 -36
  29. data/ext/zstdruby/libzstd/common/zstd_deps.h +1 -1
  30. data/ext/zstdruby/libzstd/common/zstd_internal.h +20 -122
  31. data/ext/zstdruby/libzstd/common/zstd_trace.h +3 -3
  32. data/ext/zstdruby/libzstd/compress/clevels.h +1 -1
  33. data/ext/zstdruby/libzstd/compress/fse_compress.c +7 -124
  34. data/ext/zstdruby/libzstd/compress/hist.c +1 -1
  35. data/ext/zstdruby/libzstd/compress/hist.h +1 -1
  36. data/ext/zstdruby/libzstd/compress/huf_compress.c +234 -169
  37. data/ext/zstdruby/libzstd/compress/zstd_compress.c +1317 -594
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +272 -165
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +115 -39
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +16 -8
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +13 -13
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +1 -1
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +25 -21
  44. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +1 -1
  45. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +162 -82
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +95 -33
  47. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +3 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.c +434 -149
  49. data/ext/zstdruby/libzstd/compress/zstd_fast.h +3 -2
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +405 -348
  51. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +4 -2
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +9 -7
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  54. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +1 -1
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.c +149 -100
  56. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +32 -16
  58. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +5 -2
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +434 -441
  60. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +42 -37
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +4 -4
  62. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +1 -1
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +205 -80
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +201 -81
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +6 -1
  66. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +4 -2
  67. data/ext/zstdruby/libzstd/dictBuilder/cover.c +19 -15
  68. data/ext/zstdruby/libzstd/dictBuilder/cover.h +1 -1
  69. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +2 -2
  70. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +13 -91
  71. data/ext/zstdruby/libzstd/zdict.h +53 -31
  72. data/ext/zstdruby/libzstd/zstd.h +580 -135
  73. data/ext/zstdruby/libzstd/zstd_errors.h +27 -8
  74. data/ext/zstdruby/main.c +20 -0
  75. data/ext/zstdruby/skippable_frame.c +63 -0
  76. data/ext/zstdruby/streaming_compress.c +177 -0
  77. data/ext/zstdruby/streaming_compress.h +5 -0
  78. data/ext/zstdruby/streaming_decompress.c +123 -0
  79. data/ext/zstdruby/zstdruby.c +113 -31
  80. data/lib/zstd-ruby/version.rb +1 -1
  81. data/lib/zstd-ruby.rb +0 -1
  82. data/zstd-ruby.gemspec +1 -1
  83. metadata +11 -37
  84. data/.github/dependabot.yml +0 -8
  85. data/.github/workflows/ruby.yml +0 -35
  86. data/ext/zstdruby/libzstd/.gitignore +0 -3
  87. data/ext/zstdruby/libzstd/BUCK +0 -232
  88. data/ext/zstdruby/libzstd/Makefile +0 -357
  89. data/ext/zstdruby/libzstd/README.md +0 -217
  90. data/ext/zstdruby/libzstd/deprecated/zbuff.h +0 -214
  91. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +0 -26
  92. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +0 -167
  93. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +0 -75
  94. data/ext/zstdruby/libzstd/dll/example/Makefile +0 -48
  95. data/ext/zstdruby/libzstd/dll/example/README.md +0 -63
  96. data/ext/zstdruby/libzstd/dll/example/build_package.bat +0 -20
  97. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.sln +0 -25
  98. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.vcxproj +0 -181
  99. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +0 -415
  100. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +0 -2158
  101. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +0 -94
  102. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +0 -3518
  103. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +0 -93
  104. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +0 -3160
  105. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +0 -93
  106. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +0 -3647
  107. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +0 -142
  108. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +0 -4050
  109. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +0 -162
  110. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +0 -4154
  111. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +0 -172
  112. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +0 -4541
  113. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +0 -187
  114. data/ext/zstdruby/libzstd/libzstd.mk +0 -185
  115. data/ext/zstdruby/libzstd/libzstd.pc.in +0 -16
  116. data/ext/zstdruby/libzstd/modulemap/module.modulemap +0 -4
  117. data/ext/zstdruby/zstdruby.h +0 -6
The hunks shown below are from data/ext/zstdruby/libzstd/compress/zstd_lazy.c, the lazy match-finder of the bundled libzstd and one of the largest changes in this upgrade:

@@ -1,5 +1,5 @@
  /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -10,6 +10,9 @@

  #include "zstd_compress_internal.h"
  #include "zstd_lazy.h"
+ #include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
+
+ #define kLazySkippingStep 8


  /*-*************************************
@@ -197,8 +200,8 @@ ZSTD_DUBT_findBetterDictMatch (
  U32 matchIndex = dictMatchIndex + dictIndexDelta;
  if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
  DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
+ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  }
  if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
  break; /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,7 +221,7 @@ ZSTD_DUBT_findBetterDictMatch (
  }

  if (bestLength >= MINMATCH) {
- U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
  DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
  curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
  }
@@ -230,7 +233,7 @@ ZSTD_DUBT_findBetterDictMatch (
  static size_t
  ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  const BYTE* const ip, const BYTE* const iend,
- size_t* offsetPtr,
+ size_t* offBasePtr,
  U32 const mls,
  const ZSTD_dictMode_e dictMode)
  {
@@ -327,8 +330,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  if (matchLength > bestLength) {
  if (matchLength > matchEndIdx - matchIndex)
  matchEndIdx = matchIndex + (U32)matchLength;
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
+ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
  if (dictMode == ZSTD_dictMatchState) {
  nbCompares = 0; /* in addition to avoiding checking any
@@ -361,16 +364,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  if (dictMode == ZSTD_dictMatchState && nbCompares) {
  bestLength = ZSTD_DUBT_findBetterDictMatch(
  ms, ip, iend,
- offsetPtr, bestLength, nbCompares,
+ offBasePtr, bestLength, nbCompares,
  mls, dictMode);
  }

  assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
  ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
  if (bestLength >= MINMATCH) {
- U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
  DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
- curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
+ curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
  }
  return bestLength;
  }
@@ -381,14 +384,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  FORCE_INLINE_TEMPLATE size_t
  ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
  const BYTE* const ip, const BYTE* const iLimit,
- size_t* offsetPtr,
+ size_t* offBasePtr,
  const U32 mls /* template */,
  const ZSTD_dictMode_e dictMode)
  {
  DEBUGLOG(7, "ZSTD_BtFindBestMatch");
  if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
  ZSTD_updateDUBT(ms, ip, iLimit, mls);
- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
+ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
  }

  /***********************************
@@ -561,7 +564,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
  /* save best solution */
  if (currentMl > ml) {
  ml = currentMl;
- *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
  if (ip+currentMl == iLimit) {
  /* best possible, avoids read overflow on next attempt */
  return ml;
@@ -598,7 +601,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
  /* save best solution */
  if (currentMl > ml) {
  ml = currentMl;
- *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  }
  }
@@ -617,7 +620,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
  FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
  ZSTD_matchState_t* ms,
  const ZSTD_compressionParameters* const cParams,
- const BYTE* ip, U32 const mls)
+ const BYTE* ip, U32 const mls, U32 const lazySkipping)
  {
  U32* const hashTable = ms->hashTable;
  const U32 hashLog = cParams->hashLog;
@@ -632,6 +635,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
  NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
  hashTable[h] = idx;
  idx++;
+ /* Stop inserting every position when in the lazy skipping mode. */
+ if (lazySkipping)
+ break;
  }

  ms->nextToUpdate = target;
@@ -640,7 +646,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(

  U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
  const ZSTD_compressionParameters* const cParams = &ms->cParams;
- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
+ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
  }

  /* inlining is important to hardwire a hot branch (template emulation) */
@@ -684,14 +690,15 @@ size_t ZSTD_HcFindBestMatch(
  }

  /* HC4 match finder */
- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);

  for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
  size_t currentMl=0;
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
  const BYTE* const match = base + matchIndex;
  assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
- if (match[ml] == ip[ml]) /* potentially better */
+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
  currentMl = ZSTD_count(ip, match, iLimit);
  } else {
  const BYTE* const match = dictBase + matchIndex;
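Worth noting in the hunk above: the old single-byte filter (match[ml] == ip[ml]) becomes a 4-byte comparison ending at the same position. A candidate can only beat the current best if it matches at least ml+1 bytes, so it must agree on bytes ml-3 .. ml; checking all four rejects more losing candidates before the expensive ZSTD_count() call. A standalone sketch of the idea (not the library's code; read32 stands in for MEM_read32, and ml >= 3 is assumed so the reads stay in bounds):

    #include <stdint.h>
    #include <string.h>

    static uint32_t read32(const void* p) { uint32_t v; memcpy(&v, p, sizeof v); return v; }

    /* A candidate is only worth a full length count if it could exceed the
     * current best length ml, i.e. it must agree on bytes ml-3 .. ml.
     * Assumes ml >= 3 and both buffers have at least ml+1 readable bytes. */
    static int worth_full_compare(const uint8_t* ip, const uint8_t* match, size_t ml)
    {
        return read32(match + ml - 3) == read32(ip + ml - 3);
    }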
@@ -703,7 +710,7 @@ size_t ZSTD_HcFindBestMatch(
  /* save best solution */
  if (currentMl > ml) {
  ml = currentMl;
- *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  }

@@ -738,7 +745,8 @@ size_t ZSTD_HcFindBestMatch(
  /* save best solution */
  if (currentMl > ml) {
  ml = currentMl;
- *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+ assert(curr > matchIndex + dmsIndexDelta);
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  }

@@ -755,8 +763,6 @@ size_t ZSTD_HcFindBestMatch(
  * (SIMD) Row-based matchfinder
  ***********************************/
  /* Constants for row-based hash */
- #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
- #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
  #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
  #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */

@@ -768,73 +774,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr
  * Starting from the LSB, returns the idx of the next non-zero bit.
  * Basically counting the nb of trailing zeroes.
  */
- static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
- assert(val != 0);
- # if defined(_MSC_VER) && defined(_WIN64)
- if (val != 0) {
- unsigned long r;
- _BitScanForward64(&r, val);
- return (U32)(r);
- } else {
- /* Should not reach this code path */
- __assume(0);
- }
- # elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
- if (sizeof(size_t) == 4) {
- U32 mostSignificantWord = (U32)(val >> 32);
- U32 leastSignificantWord = (U32)val;
- if (leastSignificantWord == 0) {
- return 32 + (U32)__builtin_ctz(mostSignificantWord);
- } else {
- return (U32)__builtin_ctz(leastSignificantWord);
- }
- } else {
- return (U32)__builtin_ctzll(val);
- }
- # else
- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
- */
- val = ~val & (val - 1ULL); /* Lowest set bit mask */
- val = val - ((val >> 1) & 0x5555555555555555);
- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
- # endif
- }
-
- /* ZSTD_rotateRight_*():
- * Rotates a bitfield to the right by "count" bits.
- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
- */
- FORCE_INLINE_TEMPLATE
- U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
- assert(count < 64);
- count &= 0x3F; /* for fickle pattern recognition */
- return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
- }
-
- FORCE_INLINE_TEMPLATE
- U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
- assert(count < 32);
- count &= 0x1F; /* for fickle pattern recognition */
- return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
- }
-
- FORCE_INLINE_TEMPLATE
- U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
- assert(count < 16);
- count &= 0x0F; /* for fickle pattern recognition */
- return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
+ MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
+ return ZSTD_countTrailingZeros64(val);
  }

  /* ZSTD_row_nextIndex():
  * Returns the next index to insert at within a tagTable row, and updates the "head"
- * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
+ * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
  */
  FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
- U32 const next = (*tagRow - 1) & rowMask;
- *tagRow = (BYTE)next;
- return next;
+ U32 next = (*tagRow-1) & rowMask;
+ next += (next == 0) ? rowMask : 0; /* skip first position */
+ *tagRow = (BYTE)next;
+ return next;
  }

  /* ZSTD_isAligned():
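The hunk above drops the hand-rolled trailing-zero-count and rotate helpers from this file; ZSTD_VecMask_next() now simply calls ZSTD_countTrailingZeros64() from the newly added common/bits.h (included at the top of the file), which presumably also hosts the rotate helpers still used further down. The mask it consumes is walked lowest-set-bit first, roughly as in this sketch (not the library's code; __builtin_ctzll stands in for the portable helper):

    #include <stdint.h>

    /* Stand-in for ZSTD_countTrailingZeros64(); the real helper is expected to
     * fall back to portable code where no intrinsic is available. */
    static unsigned ctz64(uint64_t v) { return (unsigned)__builtin_ctzll(v); }

    /* Visit every set bit of a match mask, lowest first, the way the row
     * matchfinder walks its candidate slots. */
    static void visit_matches(uint64_t matches)
    {
        for (; matches > 0; matches &= (matches - 1)) {  /* clear lowest set bit */
            unsigned const slot = ctz64(matches);        /* candidate slot index */
            (void)slot;                                  /* look up row[slot] here */
        }
    }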
@@ -848,7 +800,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
  /* ZSTD_row_prefetch():
  * Performs prefetching for the hashTable and tagTable at a given row.
  */
- FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
+ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
  PREFETCH_L1(hashTable + relRow);
  if (rowLog >= 5) {
  PREFETCH_L1(hashTable + relRow + 16);
@@ -872,13 +824,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
  U32 idx, const BYTE* const iLimit)
  {
  U32 const* const hashTable = ms->hashTable;
- U16 const* const tagTable = ms->tagTable;
+ BYTE const* const tagTable = ms->tagTable;
  U32 const hashLog = ms->rowHashLog;
  U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
  U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);

  for (; idx < lim; ++idx) {
- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
  U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
  ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
@@ -894,11 +846,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
  * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
  */
  FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
- U16 const* tagTable, BYTE const* base,
+ BYTE const* tagTable, BYTE const* base,
  U32 idx, U32 const hashLog,
- U32 const rowLog, U32 const mls)
+ U32 const rowLog, U32 const mls,
+ U64 const hashSalt)
  {
- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
  U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
  { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
@@ -916,22 +869,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
  U32 const rowMask, U32 const useCache)
  {
  U32* const hashTable = ms->hashTable;
- U16* const tagTable = ms->tagTable;
+ BYTE* const tagTable = ms->tagTable;
  U32 const hashLog = ms->rowHashLog;
  const BYTE* const base = ms->window.base;

  DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
  for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
+ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
  U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  U32* const row = hashTable + relRow;
- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
- Explicit cast allows us to get exact desired position within each row */
+ BYTE* tagRow = tagTable + relRow;
  U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);

- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
+ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
+ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
  row[pos] = updateStartIdx;
  }
  }
@@ -979,7 +931,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
  const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);

  DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
+ }
+
+ /* Returns the mask width of bits group of which will be set to 1. Given not all
+ * architectures have easy movemask instruction, this helps to iterate over
+ * groups of bits easier and faster.
+ */
+ FORCE_INLINE_TEMPLATE U32
+ ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
+ {
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+ (void)rowEntries;
+ #if defined(ZSTD_ARCH_ARM_NEON)
+ /* NEON path only works for little endian */
+ if (!MEM_isLittleEndian()) {
+ return 1;
+ }
+ if (rowEntries == 16) {
+ return 4;
+ }
+ if (rowEntries == 32) {
+ return 2;
+ }
+ if (rowEntries == 64) {
+ return 1;
+ }
+ #endif
+ return 1;
  }

  #if defined(ZSTD_ARCH_X86_SSE2)
@@ -1002,71 +982,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
  }
  #endif

- /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
- * the hash at the nth position in a row of the tagTable.
- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
- * to match up with the actual layout of the entries within the hashTable */
+ #if defined(ZSTD_ARCH_ARM_NEON)
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
+ {
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+ if (rowEntries == 16) {
+ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
+ * After that groups of 4 bits represent the equalMask. We lower
+ * all bits except the highest in these groups by doing AND with
+ * 0x88 = 0b10001000.
+ */
+ const uint8x16_t chunk = vld1q_u8(src);
+ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
+ const uint8x8_t res = vshrn_n_u16(equalMask, 4);
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
+ } else if (rowEntries == 32) {
+ /* Same idea as with rowEntries == 16 but doing AND with
+ * 0x55 = 0b01010101.
+ */
+ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
+ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
+ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
+ const uint8x16_t dup = vdupq_n_u8(tag);
+ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
+ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
+ const uint8x8_t res = vsli_n_u8(t0, t1, 4);
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
+ } else { /* rowEntries == 64 */
+ const uint8x16x4_t chunk = vld4q_u8(src);
+ const uint8x16_t dup = vdupq_n_u8(tag);
+ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
+ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
+ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
+ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
+
+ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
+ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
+ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
+ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
+ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
+ return ZSTD_rotateRight_U64(matches, headGrouped);
+ }
+ }
+ #endif
+
+ /* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
+ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
+ * matches the hash at the nth position in a row of the tagTable.
+ * Each row is a circular buffer beginning at the value of "headGrouped". So we
+ * must rotate the "matches" bitfield to match up with the actual layout of the
+ * entries within the hashTable */
  FORCE_INLINE_TEMPLATE ZSTD_VecMask
- ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
+ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
  {
- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
+ const BYTE* const src = tagRow;
  assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
  assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);

  #if defined(ZSTD_ARCH_X86_SSE2)

- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
+ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);

  #else /* SW or NEON-LE */

  # if defined(ZSTD_ARCH_ARM_NEON)
  /* This NEON path only works for little endian - otherwise use SWAR below */
  if (MEM_isLittleEndian()) {
- if (rowEntries == 16) {
- const uint8x16_t chunk = vld1q_u8(src);
- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
- const U16 hi = (U16)vgetq_lane_u8(t3, 8);
- const U16 lo = (U16)vgetq_lane_u8(t3, 0);
- return ZSTD_rotateRight_U16((hi << 8) | lo, head);
- } else if (rowEntries == 32) {
- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
- const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
- const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
- const uint8x8x2_t t3 = vuzp_u8(t2, t0);
- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
- return ZSTD_rotateRight_U32(matches, head);
- } else { /* rowEntries == 64 */
- const uint8x16x4_t chunk = vld4q_u8(src);
- const uint8x16_t dup = vdupq_n_u8(tag);
- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
-
- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
- return ZSTD_rotateRight_U64(matches, head);
- }
+ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
  }
  # endif /* ZSTD_ARCH_ARM_NEON */
  /* SWAR */
- { const size_t chunkSize = sizeof(size_t);
+ { const int chunkSize = sizeof(size_t);
  const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
  const size_t xFF = ~((size_t)0);
  const size_t x01 = xFF / 0xFF;
@@ -1099,11 +1090,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
  }
  matches = ~matches;
  if (rowEntries == 16) {
- return ZSTD_rotateRight_U16((U16)matches, head);
+ return ZSTD_rotateRight_U16((U16)matches, headGrouped);
  } else if (rowEntries == 32) {
- return ZSTD_rotateRight_U32((U32)matches, head);
+ return ZSTD_rotateRight_U32((U32)matches, headGrouped);
  } else {
- return ZSTD_rotateRight_U64((U64)matches, head);
+ return ZSTD_rotateRight_U64((U64)matches, headGrouped);
  }
  }
  #endif
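The scalar (SWAR) fallback above compares a whole machine word of one-byte tags at once. It is a close relative of the classic zero-byte detection trick: XOR the word with the tag splatted into every byte, then flag the bytes that became zero (the hunk shows the complementary form being inverted with matches = ~matches). A minimal sketch of that trick, not the exact zstd expression:

    #include <stdint.h>

    /* High bit of each result byte is set iff the corresponding byte of v is zero. */
    static uint64_t has_zero_byte(uint64_t v)
    {
        return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
    }

    /* One flag bit (high bit of each byte) for every byte of chunk equal to tag. */
    static uint64_t tag_eq_mask(uint64_t chunk, uint8_t tag)
    {
        uint64_t const splat = 0x0101010101010101ULL * tag;  /* tag in every byte */
        return has_zero_byte(chunk ^ splat);
    }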
@@ -1133,7 +1124,7 @@ size_t ZSTD_RowFindBestMatch(
  const U32 rowLog)
  {
  U32* const hashTable = ms->hashTable;
- U16* const tagTable = ms->tagTable;
+ BYTE* const tagTable = ms->tagTable;
  U32* const hashCache = ms->hashCache;
  const U32 hashLog = ms->rowHashLog;
  const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -1151,8 +1142,11 @@ size_t ZSTD_RowFindBestMatch(
  const U32 rowEntries = (1U << rowLog);
  const U32 rowMask = rowEntries - 1;
  const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
+ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
+ const U64 hashSalt = ms->hashSalt;
  U32 nbAttempts = 1U << cappedSearchLog;
  size_t ml=4-1;
+ U32 hash;

  /* DMS/DDS variables that may be referenced laster */
  const ZSTD_matchState_t* const dms = ms->dictMatchState;
@@ -1176,7 +1170,7 @@ size_t ZSTD_RowFindBestMatch(
  if (dictMode == ZSTD_dictMatchState) {
  /* Prefetch DMS rows */
  U32* const dmsHashTable = dms->hashTable;
- U16* const dmsTagTable = dms->tagTable;
+ BYTE* const dmsTagTable = dms->tagTable;
  U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
  U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
@@ -1186,23 +1180,34 @@ size_t ZSTD_RowFindBestMatch(
  }

  /* Update the hashTable and tagTable up to (but not including) ip */
- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+ if (!ms->lazySkipping) {
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
+ } else {
+ /* Stop inserting every position when in the lazy skipping mode.
+ * The hash cache is also not kept up to date in this mode.
+ */
+ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
+ ms->nextToUpdate = curr;
+ }
+ ms->hashSaltEntropy += hash; /* collect salt entropy */
+
  { /* Get the hash for ip, compute the appropriate row */
- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
  U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
  U32* const row = hashTable + relRow;
  BYTE* tagRow = (BYTE*)(tagTable + relRow);
- U32 const head = *tagRow & rowMask;
+ U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
  U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
  size_t numMatches = 0;
  size_t currMatch = 0;
- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);

  /* Cycle through the matches and prefetch */
- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
  U32 const matchIndex = row[matchPos];
+ if(matchPos == 0) continue;
  assert(numMatches < rowEntries);
  if (matchIndex < lowLimit)
  break;
@@ -1212,13 +1217,14 @@ size_t ZSTD_RowFindBestMatch(
  PREFETCH_L1(dictBase + matchIndex);
  }
  matchBuffer[numMatches++] = matchIndex;
+ --nbAttempts;
  }

  /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
  in ZSTD_row_update_internal() at the next search. */
  {
  U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
+ tagRow[pos] = (BYTE)tag;
  row[pos] = ms->nextToUpdate++;
  }

@@ -1232,7 +1238,8 @@ size_t ZSTD_RowFindBestMatch(
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
  const BYTE* const match = base + matchIndex;
  assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
- if (match[ml] == ip[ml]) /* potentially better */
+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
  currentMl = ZSTD_count(ip, match, iLimit);
  } else {
  const BYTE* const match = dictBase + matchIndex;
@@ -1244,7 +1251,7 @@ size_t ZSTD_RowFindBestMatch(
  /* Save best solution */
  if (currentMl > ml) {
  ml = currentMl;
- *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  }
  }
@@ -1262,19 +1269,21 @@ size_t ZSTD_RowFindBestMatch(
  const U32 dmsSize = (U32)(dmsEnd - dmsBase);
  const U32 dmsIndexDelta = dictLimit - dmsSize;

- { U32 const head = *dmsTagRow & rowMask;
+ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
  U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
  size_t numMatches = 0;
  size_t currMatch = 0;
- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);

- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
  U32 const matchIndex = dmsRow[matchPos];
+ if(matchPos == 0) continue;
  if (matchIndex < dmsLowestIndex)
  break;
  PREFETCH_L1(dmsBase + matchIndex);
  matchBuffer[numMatches++] = matchIndex;
+ --nbAttempts;
  }

  /* Return the longest match */
@@ -1292,7 +1301,8 @@ size_t ZSTD_RowFindBestMatch(

  if (currentMl > ml) {
  ml = currentMl;
- *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+ assert(curr > matchIndex + dmsIndexDelta);
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
  if (ip+currentMl == iLimit) break;
  }
  }
@@ -1302,14 +1312,10 @@ size_t ZSTD_RowFindBestMatch(
  }


- typedef size_t (*searchMax_f)(
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
-
  /**
- * This struct contains the functions necessary for lazy to search.
- * Currently, that is only searchMax. However, it is still valuable to have the
- * VTable because this makes it easier to add more functions to the VTable later.
+ * Generate search functions templated on (dictMode, mls, rowLog).
+ * These functions are outlined for code size & compilation time.
+ * ZSTD_searchMax() dispatches to the correct implementation function.
  *
  * TODO: The start of the search function involves loading and calculating a
  * bunch of constants from the ZSTD_matchState_t. These computations could be
@@ -1327,25 +1333,25 @@ typedef size_t (*searchMax_f)(
  * the single segment loop. It should go in searchMax instead of its own
  * function to avoid having multiple virtual function calls per search.
  */
- typedef struct {
- searchMax_f searchMax;
- } ZSTD_LazyVTable;

- #define GEN_ZSTD_BT_VTABLE(dictMode, mls) \
- static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \
- ZSTD_matchState_t* ms, \
- const BYTE* ip, const BYTE* const iLimit, \
- size_t* offsetPtr) \
- { \
- assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
- return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
- } \
- static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \
- ZSTD_BtFindBestMatch_##dictMode##_##mls \
- };
+ #define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
+ #define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
+ #define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog

- #define GEN_ZSTD_HC_VTABLE(dictMode, mls) \
- static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \
+ #define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
+
+ #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \
+ ZSTD_matchState_t* ms, \
+ const BYTE* ip, const BYTE* const iLimit, \
+ size_t* offBasePtr) \
+ { \
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
+ return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
+ } \
+
+ #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \
  ZSTD_matchState_t* ms, \
  const BYTE* ip, const BYTE* const iLimit, \
  size_t* offsetPtr) \
@@ -1353,12 +1359,9 @@ typedef struct {
  assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
  return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
  } \
- static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
- ZSTD_HcFindBestMatch_##dictMode##_##mls \
- };

- #define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \
- static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \
+ #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \
  ZSTD_matchState_t* ms, \
  const BYTE* ip, const BYTE* const iLimit, \
  size_t* offsetPtr) \
@@ -1367,9 +1370,6 @@ typedef struct {
  assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
  return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
  } \
- static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \
- ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \
- };

  #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
  X(dictMode, mls, 4) \
@@ -1392,84 +1392,103 @@ typedef struct {
  X(__VA_ARGS__, dictMatchState) \
  X(__VA_ARGS__, dedicatedDictSearch)

- /* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
- ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
- /* Generate Binary Tree VTables for each combination of (dictMode, mls) */
- ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
- /* Generate Hash Chain VTables for each combination of (dictMode, mls) */
- ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
-
- #define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
- { \
- &ZSTD_BtVTable_##dictMode##_4, \
- &ZSTD_BtVTable_##dictMode##_5, \
- &ZSTD_BtVTable_##dictMode##_6 \
- }
+ /* Generate row search fns for each combination of (dictMode, mls, rowLog) */
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
+ /* Generate binary Tree search fns for each combination of (dictMode, mls) */
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
+ /* Generate hash chain search fns for each combination of (dictMode, mls) */
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)

- #define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
- { \
- &ZSTD_HcVTable_##dictMode##_4, \
- &ZSTD_HcVTable_##dictMode##_5, \
- &ZSTD_HcVTable_##dictMode##_6 \
- }
-
- #define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
- { \
- &ZSTD_RowVTable_##dictMode##_##mls##_4, \
- &ZSTD_RowVTable_##dictMode##_##mls##_5, \
- &ZSTD_RowVTable_##dictMode##_##mls##_6 \
- }
+ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;

- #define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
- { \
- GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
- GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
- GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
+ #define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
+ case mls: \
+ return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
+ #define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
+ case mls: \
+ return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
+ #define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \
+ case rowLog: \
+ return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
+
+ #define ZSTD_SWITCH_MLS(X, dictMode) \
+ switch (mls) { \
+ ZSTD_FOR_EACH_MLS(X, dictMode) \
  }

- #define GEN_ZSTD_VTABLE_ARRAY(X) \
- { \
- X(noDict), \
- X(extDict), \
- X(dictMatchState), \
- X(dedicatedDictSearch) \
- }
-
- /* *******************************
- * Common parser - lazy strategy
- *********************************/
- typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
+ #define ZSTD_SWITCH_ROWLOG(dictMode, mls) \
+ case mls: \
+ switch (rowLog) { \
+ ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
+ } \
+ ZSTD_UNREACHABLE; \
+ break;
+
+ #define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \
+ switch (searchMethod) { \
+ case search_hashChain: \
+ ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
+ break; \
+ case search_binaryTree: \
+ ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
+ break; \
+ case search_rowHash: \
+ ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \
+ break; \
+ } \
+ ZSTD_UNREACHABLE;

  /**
- * This table is indexed first by the four ZSTD_dictMode_e values, and then
- * by the two searchMethod_e values. NULLs are placed for configurations
- * that should never occur (extDict modes go to the other implementation
- * below and there is no DDSS for binary tree search yet).
+ * Searches for the longest match at @p ip.
+ * Dispatches to the correct implementation function based on the
+ * (searchMethod, dictMode, mls, rowLog). We use switch statements
+ * here instead of using an indirect function call through a function
+ * pointer because after Spectre and Meltdown mitigations, indirect
+ * function calls can be very costly, especially in the kernel.
+ *
+ * NOTE: dictMode and searchMethod should be templated, so those switch
+ * statements should be optimized out. Only the mls & rowLog switches
+ * should be left.
+ *
+ * @param ms The match state.
+ * @param ip The position to search at.
+ * @param iend The end of the input data.
+ * @param[out] offsetPtr Stores the match offset into this pointer.
+ * @param mls The minimum search length, in the range [4, 6].
+ * @param rowLog The row log (if applicable), in the range [4, 6].
+ * @param searchMethod The search method to use (templated).
+ * @param dictMode The dictMode (templated).
+ *
+ * @returns The length of the longest match found, or < mls if no match is found.
+ * If a match is found its offset is stored in @p offsetPtr.
  */
-
- static ZSTD_LazyVTable const*
- ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
+ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
+ ZSTD_matchState_t* ms,
+ const BYTE* ip,
+ const BYTE* iend,
+ size_t* offsetPtr,
+ U32 const mls,
+ U32 const rowLog,
+ searchMethod_e const searchMethod,
+ ZSTD_dictMode_e const dictMode)
  {
- /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
- ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
- ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
- /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
- ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
-
- U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
- U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
- switch (searchMethod) {
- case search_hashChain:
- return hcVTables[dictMode][mls - 4];
- case search_binaryTree:
- return btVTables[dictMode][mls - 4];
- case search_rowHash:
- return rowVTables[dictMode][mls - 4][rowLog - 4];
- default:
- return NULL;
+ if (dictMode == ZSTD_noDict) {
+ ZSTD_SWITCH_SEARCH_METHOD(noDict)
+ } else if (dictMode == ZSTD_extDict) {
+ ZSTD_SWITCH_SEARCH_METHOD(extDict)
+ } else if (dictMode == ZSTD_dictMatchState) {
+ ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
+ } else if (dictMode == ZSTD_dedicatedDictSearch) {
+ ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
  }
+ ZSTD_UNREACHABLE;
+ return 0;
  }

+ /* *******************************
+ * Common parser - lazy strategy
+ *********************************/
+
  FORCE_INLINE_TEMPLATE size_t
  ZSTD_compressBlock_lazy_generic(
  ZSTD_matchState_t* ms, seqStore_t* seqStore,
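This hunk is the heart of the refactor: the table of searchMax function pointers (the old ZSTD_LazyVTable) is replaced by ZSTD_searchMax(), which resolves (searchMethod, dictMode, mls, rowLog) through nested switches into direct calls. The motivation is stated in the new comment: indirect calls are expensive under Spectre/Meltdown mitigations, and since dictMode and searchMethod are compile-time template parameters their switches fold away. A stripped-down illustration of the pattern with hypothetical names, not the zstd macros:

    #include <stddef.h>

    /* Two concrete search variants; in zstd these are generated by macros. */
    static size_t search_mls4(const char* ip) { (void)ip; return 4; }
    static size_t search_mls5(const char* ip) { (void)ip; return 5; }

    /* Dispatcher: when 'mls' is a constant at the (force-inlined) call site,
     * the compiler folds the switch and emits a direct call instead of an
     * indirect call through a function-pointer table. */
    static size_t search_max(const char* ip, unsigned mls)
    {
        switch (mls) {
            case 4: return search_mls4(ip);
            case 5: return search_mls5(ip);
            default: return 0;
        }
    }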
@@ -1486,9 +1505,11 @@ ZSTD_compressBlock_lazy_generic(
  const BYTE* const base = ms->window.base;
  const U32 prefixLowestIndex = ms->window.dictLimit;
  const BYTE* const prefixLowest = base + prefixLowestIndex;
+ const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);

- searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
+ U32 offset_1 = rep[0], offset_2 = rep[1];
+ U32 offsetSaved1 = 0, offsetSaved2 = 0;

  const int isDMS = dictMode == ZSTD_dictMatchState;
  const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
@@ -1503,16 +1524,14 @@ ZSTD_compressBlock_lazy_generic(
  0;
  const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));

- assert(searchMax != NULL);
-
  DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
  ip += (dictAndPrefixLength == 0);
  if (dictMode == ZSTD_noDict) {
  U32 const curr = (U32)(ip - base);
  U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
  U32 const maxRep = curr - windowLow;
- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
+ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
  }
  if (isDxS) {
  /* dictMatchState repCode checks don't currently handle repCode == 0
@@ -1521,11 +1540,11 @@ ZSTD_compressBlock_lazy_generic(
  assert(offset_2 <= dictAndPrefixLength);
  }

+ /* Reset the lazy skipping state */
+ ms->lazySkipping = 0;
+
  if (searchMethod == search_rowHash) {
- const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
- ZSTD_row_fillHashCache(ms, base, rowLog,
- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
- ms->nextToUpdate, ilimit);
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
  }

  /* Match Loop */
@@ -1537,8 +1556,9 @@ ZSTD_compressBlock_lazy_generic(
  #endif
  while (ip < ilimit) {
  size_t matchLength=0;
- size_t offset=0;
+ size_t offBase = REPCODE1_TO_OFFBASE;
  const BYTE* start=ip+1;
+ DEBUGLOG(7, "search baseline (depth 0)");

  /* check repCode */
  if (isDxS) {
@@ -1561,28 +1581,38 @@ ZSTD_compressBlock_lazy_generic(
1561
1581
  }
1562
1582
 
1563
1583
  /* first search (depth 0) */
1564
- { size_t offsetFound = 999999999;
1565
- size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
1584
+ { size_t offbaseFound = 999999999;
1585
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
1566
1586
  if (ml2 > matchLength)
1567
- matchLength = ml2, start = ip, offset=offsetFound;
1587
+ matchLength = ml2, start = ip, offBase = offbaseFound;
1568
1588
  }
1569
1589
 
1570
1590
  if (matchLength < 4) {
1571
- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
1591
+ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */;
1592
+ ip += step;
1593
+ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
1594
+ * In this mode we stop inserting every position into our tables, and only insert
1595
+ * positions that we search, which is one in step positions.
1596
+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
1597
+ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
1598
+ * triggered once we've gone 2KB without finding any matches.
1599
+ */
1600
+ ms->lazySkipping = step > kLazySkippingStep;
1572
1601
  continue;
1573
1602
  }
1574
1603
 
1575
1604
  /* let's try to find a better solution */
1576
1605
  if (depth>=1)
1577
1606
  while (ip<ilimit) {
1607
+ DEBUGLOG(7, "search depth 1");
1578
1608
  ip ++;
1579
1609
  if ( (dictMode == ZSTD_noDict)
1580
- && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1610
+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1581
1611
  size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
1582
1612
  int const gain2 = (int)(mlRep * 3);
1583
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
1613
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
1584
1614
  if ((mlRep >= 4) && (gain2 > gain1))
1585
- matchLength = mlRep, offset = 0, start = ip;
1615
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
1586
1616
  }
1587
1617
  if (isDxS) {
1588
1618
  const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1594,30 +1624,31 @@ ZSTD_compressBlock_lazy_generic(
1594
1624
  const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1595
1625
  size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
1596
1626
  int const gain2 = (int)(mlRep * 3);
1597
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
1627
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
1598
1628
  if ((mlRep >= 4) && (gain2 > gain1))
1599
- matchLength = mlRep, offset = 0, start = ip;
1629
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
1600
1630
  }
1601
1631
  }
1602
- { size_t offset2=999999999;
1603
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1604
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
1605
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
1632
+ { size_t ofbCandidate=999999999;
1633
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
1634
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
1635
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
1606
1636
  if ((ml2 >= 4) && (gain2 > gain1)) {
1607
- matchLength = ml2, offset = offset2, start = ip;
1637
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
1608
1638
  continue; /* search a better one */
1609
1639
  } }
1610
1640
 
1611
1641
  /* let's find an even better one */
1612
1642
  if ((depth==2) && (ip<ilimit)) {
1643
+ DEBUGLOG(7, "search depth 2");
1613
1644
  ip ++;
1614
1645
  if ( (dictMode == ZSTD_noDict)
1615
- && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1646
+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1616
1647
  size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
1617
1648
  int const gain2 = (int)(mlRep * 4);
1618
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
1649
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
1619
1650
  if ((mlRep >= 4) && (gain2 > gain1))
1620
- matchLength = mlRep, offset = 0, start = ip;
1651
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
1621
1652
  }
1622
1653
  if (isDxS) {
1623
1654
  const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1629,48 +1660,54 @@ ZSTD_compressBlock_lazy_generic(
  const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
  size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
  int const gain2 = (int)(mlRep * 4);
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  if ((mlRep >= 4) && (gain2 > gain1))
- matchLength = mlRep, offset = 0, start = ip;
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  }
  }
- { size_t offset2=999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+ { size_t ofbCandidate=999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
  if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offset = offset2, start = ip;
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
  continue;
  } } }
  break; /* nothing found : store previous solution */
  }

  /* NOTE:
- * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior.
- * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which
- * overflows the pointer, which is undefined behavior.
+ * Pay attention that `start[-value]` can lead to strange undefined behavior
+ * notably if `value` is unsigned, resulting in a large positive `-value`.
  */
  /* catch up */
- if (offset) {
+ if (OFFBASE_IS_OFFSET(offBase)) {
  if (dictMode == ZSTD_noDict) {
- while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest))
- && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
+ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
+ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */
  { start--; matchLength++; }
  }
  if (isDxS) {
- U32 const matchIndex = (U32)((size_t)(start-base) - (offset - ZSTD_REP_MOVE));
+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
  const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
  const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
  }
- offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
  }
  /* store sequence */
  _storeSequence:
  { size_t const litLength = (size_t)(start - anchor);
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
  anchor = ip = start + matchLength;
  }
+ if (ms->lazySkipping) {
+ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+ if (searchMethod == search_rowHash) {
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+ }
+ ms->lazySkipping = 0;
+ }

  /* check immediate repcode */
  if (isDxS) {
@@ -1684,8 +1721,8 @@ _storeSequence:
  && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
  const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
- offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  ip += matchLength;
  anchor = ip;
  continue;
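In the immediate-repcode path above, the two most recent offsets swap so the offset that just matched becomes offset_1 again before the sequence is stored with REPCODE1_TO_OFFBASE. A minimal sketch of that two-entry history rotation (helper name is illustrative, not part of libzstd):

    /* Illustrative only: the repcode-history swap performed when offset_2 matches. */
    static void swap_rep_history(U32 rep[2])
    {
        U32 const used = rep[1];  /* offset_2 produced the match */
        rep[1] = rep[0];          /* previous offset_1 becomes offset_2 */
        rep[0] = used;            /* the offset just used is promoted to offset_1 */
    }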
@@ -1699,16 +1736,20 @@ _storeSequence:
  && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
  /* store sequence */
  matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
- offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  ip += matchLength;
  anchor = ip;
  continue; /* faster when present ... (?) */
  } } }

- /* Save reps for next block */
- rep[0] = offset_1 ? offset_1 : savedOffset;
- rep[1] = offset_2 ? offset_2 : savedOffset;
+ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+ /* save reps for next block */
+ rep[0] = offset_1 ? offset_1 : offsetSaved1;
+ rep[1] = offset_2 ? offset_2 : offsetSaved2;

  /* Return the last literals size */
  return (size_t)(iend - anchor);
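The offset to offBase renaming throughout this function follows the newer single value space that ZSTD_storeSeq expects: small values denote repcodes, larger values encode a real offset shifted past the repcode range, and matchLength is now passed in full rather than as matchLength-MINMATCH. A rough sketch of that convention, assuming ZSTD_REP_NUM == 3 (the real OFFBASE_* macros live in zstd_compress_internal.h):

    /* Sketch of the offBase encoding implied by the diff; SKETCH_ names are
     * illustrative, the library defines the real macros. */
    #define SKETCH_REP_NUM               3
    #define SKETCH_REPCODE1_TO_OFFBASE   1                        /* repcodes occupy 1..3 */
    #define SKETCH_OFFSET_TO_OFFBASE(o)  ((o) + SKETCH_REP_NUM)   /* real offsets start at 4 */
    #define SKETCH_OFFBASE_IS_OFFSET(b)  ((b) > SKETCH_REP_NUM)
    #define SKETCH_OFFBASE_TO_OFFSET(b)  ((b) - SKETCH_REP_NUM)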
@@ -1877,19 +1918,20 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  const BYTE* const dictEnd = dictBase + dictLimit;
  const BYTE* const dictStart = dictBase + ms->window.lowLimit;
  const U32 windowLog = ms->cParams.windowLog;
- const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
+ const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);

- searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
  U32 offset_1 = rep[0], offset_2 = rep[1];

  DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);

+ /* Reset the lazy skipping state */
+ ms->lazySkipping = 0;
+
  /* init */
  ip += (ip == prefixStart);
  if (searchMethod == search_rowHash) {
- ZSTD_row_fillHashCache(ms, base, rowLog,
- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
- ms->nextToUpdate, ilimit);
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
  }

  /* Match Loop */
@@ -1901,7 +1943,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  #endif
  while (ip < ilimit) {
  size_t matchLength=0;
- size_t offset=0;
+ size_t offBase = REPCODE1_TO_OFFBASE;
  const BYTE* start=ip+1;
  U32 curr = (U32)(ip-base);

@@ -1920,14 +1962,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  } }

  /* first search (depth 0) */
- { size_t offsetFound = 999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
+ { size_t ofbCandidate = 999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
  if (ml2 > matchLength)
- matchLength = ml2, start = ip, offset=offsetFound;
+ matchLength = ml2, start = ip, offBase = ofbCandidate;
  }

  if (matchLength < 4) {
- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
+ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
+ ip += step + 1; /* jump faster over incompressible sections */
+ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+ * In this mode we stop inserting every position into our tables, and only insert
+ * positions that we search, which is one in step positions.
+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+ * triggered once we've gone 2KB without finding any matches.
+ */
+ ms->lazySkipping = step > kLazySkippingStep;
  continue;
  }
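Both copies of the skipping logic derive step from the distance since the last stored sequence and turn on ms->lazySkipping once that step exceeds kLazySkippingStep. Assuming kSearchStrength == 8 and kLazySkippingStep == 8 (their values elsewhere in zstd_lazy.c), the mode engages after roughly 2KB without a match, which matches the comment in the hunk; a small sketch of the arithmetic:

    /* Sketch of the skip-step arithmetic (assumes kSearchStrength == 8 and
     * kLazySkippingStep == 8, as defined elsewhere in zstd_lazy.c). */
    static size_t skip_step(size_t bytesSinceLastMatch)
    {
        size_t const step = (bytesSinceLastMatch >> 8) + 1;
        /* step first exceeds 8 at bytesSinceLastMatch >= 2048, i.e. ~2KB */
        return step;
    }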
@@ -1937,7 +1988,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  ip ++;
  curr++;
  /* check repCode */
- if (offset) {
+ if (offBase) {
  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
  const U32 repIndex = (U32)(curr - offset_1);
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1949,18 +2000,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  int const gain2 = (int)(repLength * 3);
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
  if ((repLength >= 4) && (gain2 > gain1))
- matchLength = repLength, offset = 0, start = ip;
+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
  } }

  /* search match, depth 1 */
- { size_t offset2=999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+ { size_t ofbCandidate = 999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
  if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offset = offset2, start = ip;
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
  continue; /* search a better one */
  } }

@@ -1969,7 +2020,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  ip ++;
  curr++;
  /* check repCode */
- if (offset) {
+ if (offBase) {
  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
  const U32 repIndex = (U32)(curr - offset_1);
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1981,38 +2032,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  int const gain2 = (int)(repLength * 4);
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  if ((repLength >= 4) && (gain2 > gain1))
- matchLength = repLength, offset = 0, start = ip;
+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
  } }

  /* search match, depth 2 */
- { size_t offset2=999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+ { size_t ofbCandidate = 999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
  if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offset = offset2, start = ip;
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
  continue;
  } } }
  break; /* nothing found : store previous solution */
  }

  /* catch up */
- if (offset) {
- U32 const matchIndex = (U32)((size_t)(start-base) - (offset - ZSTD_REP_MOVE));
+ if (OFFBASE_IS_OFFSET(offBase)) {
+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
  const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
  const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
- offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
  }

  /* store sequence */
  _storeSequence:
  { size_t const litLength = (size_t)(start - anchor);
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
  anchor = ip = start + matchLength;
  }
+ if (ms->lazySkipping) {
+ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+ if (searchMethod == search_rowHash) {
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+ }
+ ms->lazySkipping = 0;
+ }

  /* check immediate repcode */
  while (ip <= ilimit) {
@@ -2027,8 +2085,8 @@ _storeSequence:
  /* repcode detected we should take it */
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
- offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  ip += matchLength;
  anchor = ip;
  continue; /* faster when present ... (?) */
@@ -2094,7 +2152,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row(
  size_t ZSTD_compressBlock_lazy2_extDict_row(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
-
  {
  return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
  }
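Across the depth-1 and depth-2 arbitration in these hunks, a candidate only displaces the current best when its gain (match length scaled by 3 or 4, minus the bit-cost proxy ZSTD_highbit32 of its offBase) beats the incumbent's gain plus a small bonus. A sketch with made-up numbers, mirroring the depth-1 comparison from the diff (helper name is illustrative):

    /* Illustrative helper mirroring the depth-1 gain comparison above. */
    static int candidate_wins(size_t ml2, size_t ofbCandidate,
                              size_t matchLength, size_t offBase)
    {
        int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));
        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
        return (ml2 >= 4) && (gain2 > gain1);
    }
    /* Example: matchLength=6, offBase=103       -> gain1 = 24 - 6 + 4 = 22
     *          ml2=7,         ofbCandidate=5003 -> gain2 = 28 - 12    = 16
     * 16 <= 22, so the longer but farther candidate is rejected and the
     * incumbent match is kept. */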