zstd-ruby 1.5.2.3 → 1.5.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71):
  1. checksums.yaml +4 -4
  2. data/README.md +3 -3
  3. data/ext/zstdruby/libzstd/common/bits.h +175 -0
  4. data/ext/zstdruby/libzstd/common/bitstream.h +18 -59
  5. data/ext/zstdruby/libzstd/common/compiler.h +22 -3
  6. data/ext/zstdruby/libzstd/common/cpu.h +1 -1
  7. data/ext/zstdruby/libzstd/common/debug.c +1 -1
  8. data/ext/zstdruby/libzstd/common/debug.h +1 -1
  9. data/ext/zstdruby/libzstd/common/entropy_common.c +12 -40
  10. data/ext/zstdruby/libzstd/common/error_private.c +9 -2
  11. data/ext/zstdruby/libzstd/common/error_private.h +1 -1
  12. data/ext/zstdruby/libzstd/common/fse.h +5 -83
  13. data/ext/zstdruby/libzstd/common/fse_decompress.c +7 -99
  14. data/ext/zstdruby/libzstd/common/huf.h +65 -156
  15. data/ext/zstdruby/libzstd/common/mem.h +39 -46
  16. data/ext/zstdruby/libzstd/common/pool.c +26 -10
  17. data/ext/zstdruby/libzstd/common/pool.h +7 -1
  18. data/ext/zstdruby/libzstd/common/portability_macros.h +22 -3
  19. data/ext/zstdruby/libzstd/common/threading.c +68 -14
  20. data/ext/zstdruby/libzstd/common/threading.h +5 -10
  21. data/ext/zstdruby/libzstd/common/xxhash.c +2 -2
  22. data/ext/zstdruby/libzstd/common/xxhash.h +8 -8
  23. data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
  24. data/ext/zstdruby/libzstd/common/zstd_deps.h +1 -1
  25. data/ext/zstdruby/libzstd/common/zstd_internal.h +17 -113
  26. data/ext/zstdruby/libzstd/common/zstd_trace.h +3 -3
  27. data/ext/zstdruby/libzstd/compress/clevels.h +1 -1
  28. data/ext/zstdruby/libzstd/compress/fse_compress.c +7 -124
  29. data/ext/zstdruby/libzstd/compress/hist.c +1 -1
  30. data/ext/zstdruby/libzstd/compress/hist.h +1 -1
  31. data/ext/zstdruby/libzstd/compress/huf_compress.c +234 -169
  32. data/ext/zstdruby/libzstd/compress/zstd_compress.c +1055 -455
  33. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +165 -145
  34. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +115 -39
  35. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +16 -8
  36. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +3 -3
  37. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +1 -1
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +25 -21
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +1 -1
  40. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +5 -3
  41. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +95 -33
  42. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +3 -2
  43. data/ext/zstdruby/libzstd/compress/zstd_fast.c +433 -148
  44. data/ext/zstdruby/libzstd/compress/zstd_fast.h +3 -2
  45. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +306 -283
  46. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +4 -2
  47. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +5 -5
  48. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  49. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +1 -1
  50. data/ext/zstdruby/libzstd/compress/zstd_opt.c +104 -80
  51. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  52. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +12 -5
  53. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +1 -1
  54. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +434 -441
  55. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +30 -39
  56. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +3 -4
  57. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +1 -1
  58. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +164 -42
  59. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +186 -65
  60. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +1 -1
  61. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +4 -2
  62. data/ext/zstdruby/libzstd/dictBuilder/cover.c +19 -15
  63. data/ext/zstdruby/libzstd/dictBuilder/cover.h +1 -1
  64. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +2 -2
  65. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +9 -87
  66. data/ext/zstdruby/libzstd/zdict.h +53 -31
  67. data/ext/zstdruby/libzstd/zstd.h +489 -90
  68. data/ext/zstdruby/libzstd/zstd_errors.h +27 -8
  69. data/ext/zstdruby/main.c +4 -0
  70. data/lib/zstd-ruby/version.rb +1 -1
  71. metadata +7 -6
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -10,6 +10,7 @@
10
10
 
11
11
  #include "zstd_compress_internal.h"
12
12
  #include "zstd_lazy.h"
13
+ #include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
13
14
 
14
15
 
15
16
  /*-*************************************
@@ -197,8 +198,8 @@ ZSTD_DUBT_findBetterDictMatch (
197
198
  U32 matchIndex = dictMatchIndex + dictIndexDelta;
198
199
  if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
199
200
  DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
200
- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex);
201
- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
201
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
202
+ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
202
203
  }
203
204
  if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
204
205
  break; /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,7 +219,7 @@ ZSTD_DUBT_findBetterDictMatch (
218
219
  }
219
220
 
220
221
  if (bestLength >= MINMATCH) {
221
- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
222
+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
222
223
  DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
223
224
  curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
224
225
  }
@@ -230,7 +231,7 @@ ZSTD_DUBT_findBetterDictMatch (
230
231
  static size_t
231
232
  ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
232
233
  const BYTE* const ip, const BYTE* const iend,
233
- size_t* offsetPtr,
234
+ size_t* offBasePtr,
234
235
  U32 const mls,
235
236
  const ZSTD_dictMode_e dictMode)
236
237
  {
@@ -327,8 +328,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
327
328
  if (matchLength > bestLength) {
328
329
  if (matchLength > matchEndIdx - matchIndex)
329
330
  matchEndIdx = matchIndex + (U32)matchLength;
330
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
331
- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
331
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
332
+ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
332
333
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
333
334
  if (dictMode == ZSTD_dictMatchState) {
334
335
  nbCompares = 0; /* in addition to avoiding checking any
@@ -361,16 +362,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
361
362
  if (dictMode == ZSTD_dictMatchState && nbCompares) {
362
363
  bestLength = ZSTD_DUBT_findBetterDictMatch(
363
364
  ms, ip, iend,
364
- offsetPtr, bestLength, nbCompares,
365
+ offBasePtr, bestLength, nbCompares,
365
366
  mls, dictMode);
366
367
  }
367
368
 
368
369
  assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
369
370
  ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
370
371
  if (bestLength >= MINMATCH) {
371
- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
372
+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
372
373
  DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
373
- curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
374
+ curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
374
375
  }
375
376
  return bestLength;
376
377
  }
@@ -381,14 +382,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
381
382
  FORCE_INLINE_TEMPLATE size_t
382
383
  ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
383
384
  const BYTE* const ip, const BYTE* const iLimit,
384
- size_t* offsetPtr,
385
+ size_t* offBasePtr,
385
386
  const U32 mls /* template */,
386
387
  const ZSTD_dictMode_e dictMode)
387
388
  {
388
389
  DEBUGLOG(7, "ZSTD_BtFindBestMatch");
389
390
  if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
390
391
  ZSTD_updateDUBT(ms, ip, iLimit, mls);
391
- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
392
+ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
392
393
  }
393
394
 
394
395
  /***********************************
@@ -561,7 +562,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
561
562
  /* save best solution */
562
563
  if (currentMl > ml) {
563
564
  ml = currentMl;
564
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
565
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
565
566
  if (ip+currentMl == iLimit) {
566
567
  /* best possible, avoids read overflow on next attempt */
567
568
  return ml;
@@ -598,7 +599,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
598
599
  /* save best solution */
599
600
  if (currentMl > ml) {
600
601
  ml = currentMl;
601
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
602
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
602
603
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
603
604
  }
604
605
  }
@@ -691,7 +692,8 @@ size_t ZSTD_HcFindBestMatch(
691
692
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
692
693
  const BYTE* const match = base + matchIndex;
693
694
  assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
694
- if (match[ml] == ip[ml]) /* potentially better */
695
+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
696
+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
695
697
  currentMl = ZSTD_count(ip, match, iLimit);
696
698
  } else {
697
699
  const BYTE* const match = dictBase + matchIndex;
@@ -703,7 +705,7 @@ size_t ZSTD_HcFindBestMatch(
703
705
  /* save best solution */
704
706
  if (currentMl > ml) {
705
707
  ml = currentMl;
706
- *offsetPtr = STORE_OFFSET(curr - matchIndex);
708
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
707
709
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
708
710
  }
709
711
 
@@ -739,7 +741,7 @@ size_t ZSTD_HcFindBestMatch(
739
741
  if (currentMl > ml) {
740
742
  ml = currentMl;
741
743
  assert(curr > matchIndex + dmsIndexDelta);
742
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
744
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
743
745
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
744
746
  }
745
747
 
@@ -757,7 +759,6 @@ size_t ZSTD_HcFindBestMatch(
757
759
  ***********************************/
758
760
  /* Constants for row-based hash */
759
761
  #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
760
- #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
761
762
  #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
762
763
  #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
763
764
 
@@ -769,38 +770,8 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr
769
770
  * Starting from the LSB, returns the idx of the next non-zero bit.
770
771
  * Basically counting the nb of trailing zeroes.
771
772
  */
772
- static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
773
- assert(val != 0);
774
- # if defined(_MSC_VER) && defined(_WIN64)
775
- if (val != 0) {
776
- unsigned long r;
777
- _BitScanForward64(&r, val);
778
- return (U32)(r);
779
- } else {
780
- /* Should not reach this code path */
781
- __assume(0);
782
- }
783
- # elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
784
- if (sizeof(size_t) == 4) {
785
- U32 mostSignificantWord = (U32)(val >> 32);
786
- U32 leastSignificantWord = (U32)val;
787
- if (leastSignificantWord == 0) {
788
- return 32 + (U32)__builtin_ctz(mostSignificantWord);
789
- } else {
790
- return (U32)__builtin_ctz(leastSignificantWord);
791
- }
792
- } else {
793
- return (U32)__builtin_ctzll(val);
794
- }
795
- # else
796
- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
797
- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
798
- */
799
- val = ~val & (val - 1ULL); /* Lowest set bit mask */
800
- val = val - ((val >> 1) & 0x5555555555555555);
801
- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
802
- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
803
- # endif
773
+ MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
774
+ return ZSTD_countTrailingZeros64(val);
804
775
  }
805
776
 
806
777
  /* ZSTD_rotateRight_*():
@@ -980,7 +951,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
980
951
  const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
981
952
 
982
953
  DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
983
- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
954
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
955
+ }
956
+
957
+ /* Returns the mask width of bits group of which will be set to 1. Given not all
958
+ * architectures have easy movemask instruction, this helps to iterate over
959
+ * groups of bits easier and faster.
960
+ */
961
+ FORCE_INLINE_TEMPLATE U32
962
+ ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
963
+ {
964
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
965
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
966
+ (void)rowEntries;
967
+ #if defined(ZSTD_ARCH_ARM_NEON)
968
+ /* NEON path only works for little endian */
969
+ if (!MEM_isLittleEndian()) {
970
+ return 1;
971
+ }
972
+ if (rowEntries == 16) {
973
+ return 4;
974
+ }
975
+ if (rowEntries == 32) {
976
+ return 2;
977
+ }
978
+ if (rowEntries == 64) {
979
+ return 1;
980
+ }
981
+ #endif
982
+ return 1;
984
983
  }
985
984
 
986
985
  #if defined(ZSTD_ARCH_X86_SSE2)
@@ -1003,71 +1002,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
1003
1002
  }
1004
1003
  #endif
1005
1004
 
1006
- /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
1007
- * the hash at the nth position in a row of the tagTable.
1008
- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1009
- * to match up with the actual layout of the entries within the hashTable */
1005
+ #if defined(ZSTD_ARCH_ARM_NEON)
1010
1006
  FORCE_INLINE_TEMPLATE ZSTD_VecMask
1011
- ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
1007
+ ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
1008
+ {
1009
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
1010
+ if (rowEntries == 16) {
1011
+ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
1012
+ * After that groups of 4 bits represent the equalMask. We lower
1013
+ * all bits except the highest in these groups by doing AND with
1014
+ * 0x88 = 0b10001000.
1015
+ */
1016
+ const uint8x16_t chunk = vld1q_u8(src);
1017
+ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
1018
+ const uint8x8_t res = vshrn_n_u16(equalMask, 4);
1019
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
1020
+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
1021
+ } else if (rowEntries == 32) {
1022
+ /* Same idea as with rowEntries == 16 but doing AND with
1023
+ * 0x55 = 0b01010101.
1024
+ */
1025
+ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
1026
+ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
1027
+ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
1028
+ const uint8x16_t dup = vdupq_n_u8(tag);
1029
+ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
1030
+ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
1031
+ const uint8x8_t res = vsli_n_u8(t0, t1, 4);
1032
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
1033
+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
1034
+ } else { /* rowEntries == 64 */
1035
+ const uint8x16x4_t chunk = vld4q_u8(src);
1036
+ const uint8x16_t dup = vdupq_n_u8(tag);
1037
+ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
1038
+ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
1039
+ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
1040
+ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
1041
+
1042
+ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
1043
+ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
1044
+ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
1045
+ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
1046
+ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
1047
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
1048
+ return ZSTD_rotateRight_U64(matches, headGrouped);
1049
+ }
1050
+ }
1051
+ #endif
1052
+
1053
+ /* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
1054
+ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
1055
+ * matches the hash at the nth position in a row of the tagTable.
1056
+ * Each row is a circular buffer beginning at the value of "headGrouped". So we
1057
+ * must rotate the "matches" bitfield to match up with the actual layout of the
1058
+ * entries within the hashTable */
1059
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
1060
+ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
1012
1061
  {
1013
1062
  const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
1014
1063
  assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
1015
1064
  assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
1065
+ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);
1016
1066
 
1017
1067
  #if defined(ZSTD_ARCH_X86_SSE2)
1018
1068
 
1019
- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
1069
+ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
1020
1070
 
1021
1071
  #else /* SW or NEON-LE */
1022
1072
 
1023
1073
  # if defined(ZSTD_ARCH_ARM_NEON)
1024
1074
  /* This NEON path only works for little endian - otherwise use SWAR below */
1025
1075
  if (MEM_isLittleEndian()) {
1026
- if (rowEntries == 16) {
1027
- const uint8x16_t chunk = vld1q_u8(src);
1028
- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
1029
- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
1030
- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
1031
- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
1032
- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
1033
- const U16 hi = (U16)vgetq_lane_u8(t3, 8);
1034
- const U16 lo = (U16)vgetq_lane_u8(t3, 0);
1035
- return ZSTD_rotateRight_U16((hi << 8) | lo, head);
1036
- } else if (rowEntries == 32) {
1037
- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
1038
- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
1039
- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
1040
- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
1041
- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
1042
- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
1043
- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
1044
- const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
1045
- const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
1046
- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
1047
- const uint8x8x2_t t3 = vuzp_u8(t2, t0);
1048
- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
1049
- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
1050
- return ZSTD_rotateRight_U32(matches, head);
1051
- } else { /* rowEntries == 64 */
1052
- const uint8x16x4_t chunk = vld4q_u8(src);
1053
- const uint8x16_t dup = vdupq_n_u8(tag);
1054
- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
1055
- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
1056
- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
1057
- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
1058
-
1059
- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
1060
- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
1061
- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
1062
- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
1063
- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
1064
- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
1065
- return ZSTD_rotateRight_U64(matches, head);
1066
- }
1076
+ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
1067
1077
  }
1068
1078
  # endif /* ZSTD_ARCH_ARM_NEON */
1069
1079
  /* SWAR */
1070
- { const size_t chunkSize = sizeof(size_t);
1080
+ { const int chunkSize = sizeof(size_t);
1071
1081
  const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
1072
1082
  const size_t xFF = ~((size_t)0);
1073
1083
  const size_t x01 = xFF / 0xFF;
@@ -1100,11 +1110,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
1100
1110
  }
1101
1111
  matches = ~matches;
1102
1112
  if (rowEntries == 16) {
1103
- return ZSTD_rotateRight_U16((U16)matches, head);
1113
+ return ZSTD_rotateRight_U16((U16)matches, headGrouped);
1104
1114
  } else if (rowEntries == 32) {
1105
- return ZSTD_rotateRight_U32((U32)matches, head);
1115
+ return ZSTD_rotateRight_U32((U32)matches, headGrouped);
1106
1116
  } else {
1107
- return ZSTD_rotateRight_U64((U64)matches, head);
1117
+ return ZSTD_rotateRight_U64((U64)matches, headGrouped);
1108
1118
  }
1109
1119
  }
1110
1120
  #endif
@@ -1152,6 +1162,7 @@ size_t ZSTD_RowFindBestMatch(
1152
1162
  const U32 rowEntries = (1U << rowLog);
1153
1163
  const U32 rowMask = rowEntries - 1;
1154
1164
  const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
1165
+ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
1155
1166
  U32 nbAttempts = 1U << cappedSearchLog;
1156
1167
  size_t ml=4-1;
1157
1168
 
@@ -1194,15 +1205,15 @@ size_t ZSTD_RowFindBestMatch(
1194
1205
  U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
1195
1206
  U32* const row = hashTable + relRow;
1196
1207
  BYTE* tagRow = (BYTE*)(tagTable + relRow);
1197
- U32 const head = *tagRow & rowMask;
1208
+ U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
1198
1209
  U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1199
1210
  size_t numMatches = 0;
1200
1211
  size_t currMatch = 0;
1201
- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
1212
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
1202
1213
 
1203
1214
  /* Cycle through the matches and prefetch */
1204
1215
  for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1205
- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1216
+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
1206
1217
  U32 const matchIndex = row[matchPos];
1207
1218
  assert(numMatches < rowEntries);
1208
1219
  if (matchIndex < lowLimit)
@@ -1233,7 +1244,8 @@ size_t ZSTD_RowFindBestMatch(
1233
1244
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1234
1245
  const BYTE* const match = base + matchIndex;
1235
1246
  assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
1236
- if (match[ml] == ip[ml]) /* potentially better */
1247
+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
1248
+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
1237
1249
  currentMl = ZSTD_count(ip, match, iLimit);
1238
1250
  } else {
1239
1251
  const BYTE* const match = dictBase + matchIndex;
@@ -1245,7 +1257,7 @@ size_t ZSTD_RowFindBestMatch(
1245
1257
  /* Save best solution */
1246
1258
  if (currentMl > ml) {
1247
1259
  ml = currentMl;
1248
- *offsetPtr = STORE_OFFSET(curr - matchIndex);
1260
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
1249
1261
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
1250
1262
  }
1251
1263
  }
@@ -1263,14 +1275,14 @@ size_t ZSTD_RowFindBestMatch(
1263
1275
  const U32 dmsSize = (U32)(dmsEnd - dmsBase);
1264
1276
  const U32 dmsIndexDelta = dictLimit - dmsSize;
1265
1277
 
1266
- { U32 const head = *dmsTagRow & rowMask;
1278
+ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
1267
1279
  U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1268
1280
  size_t numMatches = 0;
1269
1281
  size_t currMatch = 0;
1270
- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
1282
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
1271
1283
 
1272
1284
  for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1273
- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1285
+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
1274
1286
  U32 const matchIndex = dmsRow[matchPos];
1275
1287
  if (matchIndex < dmsLowestIndex)
1276
1288
  break;
@@ -1294,7 +1306,7 @@ size_t ZSTD_RowFindBestMatch(
1294
1306
  if (currentMl > ml) {
1295
1307
  ml = currentMl;
1296
1308
  assert(curr > matchIndex + dmsIndexDelta);
1297
- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
1309
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
1298
1310
  if (ip+currentMl == iLimit) break;
1299
1311
  }
1300
1312
  }
@@ -1304,14 +1316,10 @@ size_t ZSTD_RowFindBestMatch(
1304
1316
  }
1305
1317
 
1306
1318
 
1307
- typedef size_t (*searchMax_f)(
1308
- ZSTD_matchState_t* ms,
1309
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1310
-
1311
1319
  /**
1312
- * This struct contains the functions necessary for lazy to search.
1313
- * Currently, that is only searchMax. However, it is still valuable to have the
1314
- * VTable because this makes it easier to add more functions to the VTable later.
1320
+ * Generate search functions templated on (dictMode, mls, rowLog).
1321
+ * These functions are outlined for code size & compilation time.
1322
+ * ZSTD_searchMax() dispatches to the correct implementation function.
1315
1323
  *
1316
1324
  * TODO: The start of the search function involves loading and calculating a
1317
1325
  * bunch of constants from the ZSTD_matchState_t. These computations could be
@@ -1329,25 +1337,25 @@ typedef size_t (*searchMax_f)(
1329
1337
  * the single segment loop. It should go in searchMax instead of its own
1330
1338
  * function to avoid having multiple virtual function calls per search.
1331
1339
  */
1332
- typedef struct {
1333
- searchMax_f searchMax;
1334
- } ZSTD_LazyVTable;
1335
1340
 
1336
- #define GEN_ZSTD_BT_VTABLE(dictMode, mls) \
1337
- static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \
1338
- ZSTD_matchState_t* ms, \
1339
- const BYTE* ip, const BYTE* const iLimit, \
1340
- size_t* offsetPtr) \
1341
- { \
1342
- assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1343
- return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1344
- } \
1345
- static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \
1346
- ZSTD_BtFindBestMatch_##dictMode##_##mls \
1347
- };
1341
+ #define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
1342
+ #define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
1343
+ #define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog
1344
+
1345
+ #define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
1346
+
1347
+ #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \
1348
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \
1349
+ ZSTD_matchState_t* ms, \
1350
+ const BYTE* ip, const BYTE* const iLimit, \
1351
+ size_t* offBasePtr) \
1352
+ { \
1353
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1354
+ return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
1355
+ } \
1348
1356
 
1349
- #define GEN_ZSTD_HC_VTABLE(dictMode, mls) \
1350
- static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \
1357
+ #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \
1358
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \
1351
1359
  ZSTD_matchState_t* ms, \
1352
1360
  const BYTE* ip, const BYTE* const iLimit, \
1353
1361
  size_t* offsetPtr) \
@@ -1355,12 +1363,9 @@ typedef struct {
1355
1363
  assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1356
1364
  return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1357
1365
  } \
1358
- static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
1359
- ZSTD_HcFindBestMatch_##dictMode##_##mls \
1360
- };
1361
1366
 
1362
- #define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \
1363
- static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \
1367
+ #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \
1368
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \
1364
1369
  ZSTD_matchState_t* ms, \
1365
1370
  const BYTE* ip, const BYTE* const iLimit, \
1366
1371
  size_t* offsetPtr) \
@@ -1369,9 +1374,6 @@ typedef struct {
1369
1374
  assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
1370
1375
  return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
1371
1376
  } \
1372
- static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \
1373
- ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \
1374
- };
1375
1377
 
1376
1378
  #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
1377
1379
  X(dictMode, mls, 4) \
@@ -1394,84 +1396,103 @@ typedef struct {
1394
1396
  X(__VA_ARGS__, dictMatchState) \
1395
1397
  X(__VA_ARGS__, dedicatedDictSearch)
1396
1398
 
1397
- /* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
1398
- ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
1399
- /* Generate Binary Tree VTables for each combination of (dictMode, mls) */
1400
- ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
1401
- /* Generate Hash Chain VTables for each combination of (dictMode, mls) */
1402
- ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
1403
-
1404
- #define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
1405
- { \
1406
- &ZSTD_BtVTable_##dictMode##_4, \
1407
- &ZSTD_BtVTable_##dictMode##_5, \
1408
- &ZSTD_BtVTable_##dictMode##_6 \
1409
- }
1410
-
1411
- #define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
1412
- { \
1413
- &ZSTD_HcVTable_##dictMode##_4, \
1414
- &ZSTD_HcVTable_##dictMode##_5, \
1415
- &ZSTD_HcVTable_##dictMode##_6 \
1416
- }
1399
+ /* Generate row search fns for each combination of (dictMode, mls, rowLog) */
1400
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
1401
+ /* Generate binary Tree search fns for each combination of (dictMode, mls) */
1402
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
1403
+ /* Generate hash chain search fns for each combination of (dictMode, mls) */
1404
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)
1417
1405
 
1418
- #define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
1419
- { \
1420
- &ZSTD_RowVTable_##dictMode##_##mls##_4, \
1421
- &ZSTD_RowVTable_##dictMode##_##mls##_5, \
1422
- &ZSTD_RowVTable_##dictMode##_##mls##_6 \
1423
- }
1424
-
1425
- #define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
1426
- { \
1427
- GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
1428
- GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
1429
- GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
1430
- }
1406
+ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1431
1407
 
1432
- #define GEN_ZSTD_VTABLE_ARRAY(X) \
1433
- { \
1434
- X(noDict), \
1435
- X(extDict), \
1436
- X(dictMatchState), \
1437
- X(dedicatedDictSearch) \
1408
+ #define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
1409
+ case mls: \
1410
+ return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
1411
+ #define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
1412
+ case mls: \
1413
+ return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
1414
+ #define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \
1415
+ case rowLog: \
1416
+ return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
1417
+
1418
+ #define ZSTD_SWITCH_MLS(X, dictMode) \
1419
+ switch (mls) { \
1420
+ ZSTD_FOR_EACH_MLS(X, dictMode) \
1438
1421
  }
1439
1422
 
1440
- /* *******************************
1441
- * Common parser - lazy strategy
1442
- *********************************/
1443
- typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1423
+ #define ZSTD_SWITCH_ROWLOG(dictMode, mls) \
1424
+ case mls: \
1425
+ switch (rowLog) { \
1426
+ ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
1427
+ } \
1428
+ ZSTD_UNREACHABLE; \
1429
+ break;
1430
+
1431
+ #define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \
1432
+ switch (searchMethod) { \
1433
+ case search_hashChain: \
1434
+ ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
1435
+ break; \
1436
+ case search_binaryTree: \
1437
+ ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
1438
+ break; \
1439
+ case search_rowHash: \
1440
+ ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \
1441
+ break; \
1442
+ } \
1443
+ ZSTD_UNREACHABLE;
1444
1444
 
1445
1445
  /**
1446
- * This table is indexed first by the four ZSTD_dictMode_e values, and then
1447
- * by the two searchMethod_e values. NULLs are placed for configurations
1448
- * that should never occur (extDict modes go to the other implementation
1449
- * below and there is no DDSS for binary tree search yet).
1446
+ * Searches for the longest match at @p ip.
1447
+ * Dispatches to the correct implementation function based on the
1448
+ * (searchMethod, dictMode, mls, rowLog). We use switch statements
1449
+ * here instead of using an indirect function call through a function
1450
+ * pointer because after Spectre and Meltdown mitigations, indirect
1451
+ * function calls can be very costly, especially in the kernel.
1452
+ *
1453
+ * NOTE: dictMode and searchMethod should be templated, so those switch
1454
+ * statements should be optimized out. Only the mls & rowLog switches
1455
+ * should be left.
1456
+ *
1457
+ * @param ms The match state.
1458
+ * @param ip The position to search at.
1459
+ * @param iend The end of the input data.
1460
+ * @param[out] offsetPtr Stores the match offset into this pointer.
1461
+ * @param mls The minimum search length, in the range [4, 6].
1462
+ * @param rowLog The row log (if applicable), in the range [4, 6].
1463
+ * @param searchMethod The search method to use (templated).
1464
+ * @param dictMode The dictMode (templated).
1465
+ *
1466
+ * @returns The length of the longest match found, or < mls if no match is found.
1467
+ * If a match is found its offset is stored in @p offsetPtr.
1450
1468
  */
1451
-
1452
- static ZSTD_LazyVTable const*
1453
- ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
1469
+ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
1470
+ ZSTD_matchState_t* ms,
1471
+ const BYTE* ip,
1472
+ const BYTE* iend,
1473
+ size_t* offsetPtr,
1474
+ U32 const mls,
1475
+ U32 const rowLog,
1476
+ searchMethod_e const searchMethod,
1477
+ ZSTD_dictMode_e const dictMode)
1454
1478
  {
1455
- /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
1456
- ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
1457
- ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
1458
- /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
1459
- ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
1460
-
1461
- U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
1462
- U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1463
- switch (searchMethod) {
1464
- case search_hashChain:
1465
- return hcVTables[dictMode][mls - 4];
1466
- case search_binaryTree:
1467
- return btVTables[dictMode][mls - 4];
1468
- case search_rowHash:
1469
- return rowVTables[dictMode][mls - 4][rowLog - 4];
1470
- default:
1471
- return NULL;
1479
+ if (dictMode == ZSTD_noDict) {
1480
+ ZSTD_SWITCH_SEARCH_METHOD(noDict)
1481
+ } else if (dictMode == ZSTD_extDict) {
1482
+ ZSTD_SWITCH_SEARCH_METHOD(extDict)
1483
+ } else if (dictMode == ZSTD_dictMatchState) {
1484
+ ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
1485
+ } else if (dictMode == ZSTD_dedicatedDictSearch) {
1486
+ ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
1472
1487
  }
1488
+ ZSTD_UNREACHABLE;
1489
+ return 0;
1473
1490
  }
1474
1491
 
1492
+ /* *******************************
1493
+ * Common parser - lazy strategy
1494
+ *********************************/
1495
+
1475
1496
  FORCE_INLINE_TEMPLATE size_t
1476
1497
  ZSTD_compressBlock_lazy_generic(
1477
1498
  ZSTD_matchState_t* ms, seqStore_t* seqStore,
@@ -1488,9 +1509,11 @@ ZSTD_compressBlock_lazy_generic(
1488
1509
  const BYTE* const base = ms->window.base;
1489
1510
  const U32 prefixLowestIndex = ms->window.dictLimit;
1490
1511
  const BYTE* const prefixLowest = base + prefixLowestIndex;
1512
+ const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
1513
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
1491
1514
 
1492
- searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
1493
- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
1515
+ U32 offset_1 = rep[0], offset_2 = rep[1];
1516
+ U32 offsetSaved1 = 0, offsetSaved2 = 0;
1494
1517
 
1495
1518
  const int isDMS = dictMode == ZSTD_dictMatchState;
1496
1519
  const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
@@ -1505,16 +1528,14 @@ ZSTD_compressBlock_lazy_generic(
1505
1528
  0;
1506
1529
  const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
1507
1530
 
1508
- assert(searchMax != NULL);
1509
-
1510
1531
  DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
1511
1532
  ip += (dictAndPrefixLength == 0);
1512
1533
  if (dictMode == ZSTD_noDict) {
1513
1534
  U32 const curr = (U32)(ip - base);
1514
1535
  U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
1515
1536
  U32 const maxRep = curr - windowLow;
1516
- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
1517
- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
1537
+ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
1538
+ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
1518
1539
  }
1519
1540
  if (isDxS) {
1520
1541
  /* dictMatchState repCode checks don't currently handle repCode == 0
@@ -1524,7 +1545,6 @@ ZSTD_compressBlock_lazy_generic(
1524
1545
  }
1525
1546
 
1526
1547
  if (searchMethod == search_rowHash) {
1527
- const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1528
1548
  ZSTD_row_fillHashCache(ms, base, rowLog,
1529
1549
  MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1530
1550
  ms->nextToUpdate, ilimit);
@@ -1539,7 +1559,7 @@ ZSTD_compressBlock_lazy_generic(
1539
1559
  #endif
1540
1560
  while (ip < ilimit) {
1541
1561
  size_t matchLength=0;
1542
- size_t offcode=STORE_REPCODE_1;
1562
+ size_t offBase = REPCODE1_TO_OFFBASE;
1543
1563
  const BYTE* start=ip+1;
1544
1564
  DEBUGLOG(7, "search baseline (depth 0)");
1545
1565
 
@@ -1564,10 +1584,10 @@ ZSTD_compressBlock_lazy_generic(
1564
1584
  }
1565
1585
 
1566
1586
  /* first search (depth 0) */
1567
- { size_t offsetFound = 999999999;
1568
- size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
1587
+ { size_t offbaseFound = 999999999;
1588
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
1569
1589
  if (ml2 > matchLength)
1570
- matchLength = ml2, start = ip, offcode=offsetFound;
1590
+ matchLength = ml2, start = ip, offBase = offbaseFound;
1571
1591
  }
1572
1592
 
1573
1593
  if (matchLength < 4) {
@@ -1581,12 +1601,12 @@ ZSTD_compressBlock_lazy_generic(
1581
1601
  DEBUGLOG(7, "search depth 1");
1582
1602
  ip ++;
1583
1603
  if ( (dictMode == ZSTD_noDict)
1584
- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1604
+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1585
1605
  size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
1586
1606
  int const gain2 = (int)(mlRep * 3);
1587
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
1607
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
1588
1608
  if ((mlRep >= 4) && (gain2 > gain1))
1589
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1609
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
1590
1610
  }
1591
1611
  if (isDxS) {
1592
1612
  const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1598,17 +1618,17 @@ ZSTD_compressBlock_lazy_generic(
1598
1618
  const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1599
1619
  size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
1600
1620
  int const gain2 = (int)(mlRep * 3);
1601
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
1621
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
1602
1622
  if ((mlRep >= 4) && (gain2 > gain1))
1603
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1623
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
1604
1624
  }
1605
1625
  }
1606
- { size_t offset2=999999999;
1607
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1608
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
1609
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
1626
+ { size_t ofbCandidate=999999999;
1627
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
1628
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
1629
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
1610
1630
  if ((ml2 >= 4) && (gain2 > gain1)) {
1611
- matchLength = ml2, offcode = offset2, start = ip;
1631
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
1612
1632
  continue; /* search a better one */
1613
1633
  } }
1614
1634
 
@@ -1617,12 +1637,12 @@ ZSTD_compressBlock_lazy_generic(
1617
1637
  DEBUGLOG(7, "search depth 2");
1618
1638
  ip ++;
1619
1639
  if ( (dictMode == ZSTD_noDict)
1620
- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1640
+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1621
1641
  size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
1622
1642
  int const gain2 = (int)(mlRep * 4);
1623
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
1643
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
1624
1644
  if ((mlRep >= 4) && (gain2 > gain1))
1625
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1645
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
1626
1646
  }
1627
1647
  if (isDxS) {
1628
1648
  const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1634,17 +1654,17 @@ ZSTD_compressBlock_lazy_generic(
1634
1654
  const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1635
1655
  size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
1636
1656
  int const gain2 = (int)(mlRep * 4);
1637
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
1657
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
1638
1658
  if ((mlRep >= 4) && (gain2 > gain1))
1639
- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1659
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
1640
1660
  }
1641
1661
  }
1642
- { size_t offset2=999999999;
1643
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1644
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
1645
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
1662
+ { size_t ofbCandidate=999999999;
1663
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
1664
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
1665
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
1646
1666
  if ((ml2 >= 4) && (gain2 > gain1)) {
1647
- matchLength = ml2, offcode = offset2, start = ip;
1667
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
1648
1668
  continue;
1649
1669
  } } }
1650
1670
  break; /* nothing found : store previous solution */
@@ -1655,24 +1675,24 @@ ZSTD_compressBlock_lazy_generic(
1655
1675
  * notably if `value` is unsigned, resulting in a large positive `-value`.
1656
1676
  */
1657
1677
  /* catch up */
1658
- if (STORED_IS_OFFSET(offcode)) {
1678
+ if (OFFBASE_IS_OFFSET(offBase)) {
1659
1679
  if (dictMode == ZSTD_noDict) {
1660
- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
1661
- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */
1680
+ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
1681
+ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */
1662
1682
  { start--; matchLength++; }
1663
1683
  }
1664
1684
  if (isDxS) {
1665
- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
1685
+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
1666
1686
  const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
1667
1687
  const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
1668
1688
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
1669
1689
  }
1670
- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
1690
+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
1671
1691
  }
1672
1692
  /* store sequence */
1673
1693
  _storeSequence:
1674
1694
  { size_t const litLength = (size_t)(start - anchor);
1675
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
1695
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
1676
1696
  anchor = ip = start + matchLength;
1677
1697
  }
1678
1698
 
@@ -1688,8 +1708,8 @@ _storeSequence:
1688
1708
  && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
1689
1709
  const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
1690
1710
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
1691
- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */
1692
- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
1711
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */
1712
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
1693
1713
  ip += matchLength;
1694
1714
  anchor = ip;
1695
1715
  continue;
@@ -1703,16 +1723,20 @@ _storeSequence:
1703
1723
  && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
1704
1724
  /* store sequence */
1705
1725
  matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
1706
- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */
1707
- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
1726
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
1727
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
1708
1728
  ip += matchLength;
1709
1729
  anchor = ip;
1710
1730
  continue; /* faster when present ... (?) */
1711
1731
  } } }
1712
1732
 
1713
- /* Save reps for next block */
1714
- rep[0] = offset_1 ? offset_1 : savedOffset;
1715
- rep[1] = offset_2 ? offset_2 : savedOffset;
1733
+ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
1734
+ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
1735
+ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
1736
+
1737
+ /* save reps for next block */
1738
+ rep[0] = offset_1 ? offset_1 : offsetSaved1;
1739
+ rep[1] = offset_2 ? offset_2 : offsetSaved2;
1716
1740
 
1717
1741
  /* Return the last literals size */
1718
1742
  return (size_t)(iend - anchor);
@@ -1881,9 +1905,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1881
1905
  const BYTE* const dictEnd = dictBase + dictLimit;
1882
1906
  const BYTE* const dictStart = dictBase + ms->window.lowLimit;
1883
1907
  const U32 windowLog = ms->cParams.windowLog;
1884
- const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1908
+ const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
1909
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
1885
1910
 
1886
- searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
1887
1911
  U32 offset_1 = rep[0], offset_2 = rep[1];
1888
1912
 
1889
1913
  DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
@@ -1905,7 +1929,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1905
1929
  #endif
1906
1930
  while (ip < ilimit) {
1907
1931
  size_t matchLength=0;
1908
- size_t offcode=STORE_REPCODE_1;
1932
+ size_t offBase = REPCODE1_TO_OFFBASE;
1909
1933
  const BYTE* start=ip+1;
1910
1934
  U32 curr = (U32)(ip-base);
1911
1935
 
@@ -1924,10 +1948,10 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1924
1948
  } }
1925
1949
 
1926
1950
  /* first search (depth 0) */
1927
- { size_t offsetFound = 999999999;
1928
- size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
1951
+ { size_t ofbCandidate = 999999999;
1952
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
1929
1953
  if (ml2 > matchLength)
1930
- matchLength = ml2, start = ip, offcode=offsetFound;
1954
+ matchLength = ml2, start = ip, offBase = ofbCandidate;
1931
1955
  }
1932
1956
 
1933
1957
  if (matchLength < 4) {
@@ -1941,7 +1965,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1941
1965
  ip ++;
1942
1966
  curr++;
1943
1967
  /* check repCode */
1944
- if (offcode) {
1968
+ if (offBase) {
1945
1969
  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
1946
1970
  const U32 repIndex = (U32)(curr - offset_1);
1947
1971
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1953,18 +1977,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1953
1977
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
1954
1978
  size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
1955
1979
  int const gain2 = (int)(repLength * 3);
1956
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
1980
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
1957
1981
  if ((repLength >= 4) && (gain2 > gain1))
1958
- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
1982
+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
1959
1983
  } }
1960
1984
 
1961
1985
  /* search match, depth 1 */
1962
- { size_t offset2=999999999;
1963
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1964
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
1965
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
1986
+ { size_t ofbCandidate = 999999999;
1987
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
1988
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
1989
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
1966
1990
  if ((ml2 >= 4) && (gain2 > gain1)) {
1967
- matchLength = ml2, offcode = offset2, start = ip;
1991
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
1968
1992
  continue; /* search a better one */
1969
1993
  } }
1970
1994
 
@@ -1973,7 +1997,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1973
1997
  ip ++;
1974
1998
  curr++;
1975
1999
  /* check repCode */
1976
- if (offcode) {
2000
+ if (offBase) {
1977
2001
  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
1978
2002
  const U32 repIndex = (U32)(curr - offset_1);
1979
2003
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1985,36 +2009,36 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1985
2009
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
1986
2010
  size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
1987
2011
  int const gain2 = (int)(repLength * 4);
1988
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
2012
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
1989
2013
  if ((repLength >= 4) && (gain2 > gain1))
1990
- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
2014
+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
1991
2015
  } }
1992
2016
 
1993
2017
  /* search match, depth 2 */
1994
- { size_t offset2=999999999;
1995
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1996
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
1997
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
2018
+ { size_t ofbCandidate = 999999999;
2019
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
2020
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
2021
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
1998
2022
  if ((ml2 >= 4) && (gain2 > gain1)) {
1999
- matchLength = ml2, offcode = offset2, start = ip;
2023
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
2000
2024
  continue;
2001
2025
  } } }
2002
2026
  break; /* nothing found : store previous solution */
2003
2027
  }
2004
2028
 
2005
2029
  /* catch up */
2006
- if (STORED_IS_OFFSET(offcode)) {
2007
- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
2030
+ if (OFFBASE_IS_OFFSET(offBase)) {
2031
+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
2008
2032
  const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
2009
2033
  const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
2010
2034
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
2011
- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
2035
+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
2012
2036
  }
2013
2037
 
2014
2038
  /* store sequence */
2015
2039
  _storeSequence:
2016
2040
  { size_t const litLength = (size_t)(start - anchor);
2017
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
2041
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
2018
2042
  anchor = ip = start + matchLength;
2019
2043
  }
2020
2044
 
@@ -2031,8 +2055,8 @@ _storeSequence:
2031
2055
  /* repcode detected we should take it */
2032
2056
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2033
2057
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
2034
- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */
2035
- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
2058
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */
2059
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
2036
2060
  ip += matchLength;
2037
2061
  anchor = ip;
2038
2062
  continue; /* faster when present ... (?) */
@@ -2098,7 +2122,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row(
2098
2122
  size_t ZSTD_compressBlock_lazy2_extDict_row(
2099
2123
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2100
2124
  void const* src, size_t srcSize)
2101
-
2102
2125
  {
2103
2126
  return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
2104
2127
  }