zstd-ruby 1.5.2.2 → 1.5.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. checksums.yaml +4 -4
  2. data/README.md +15 -3
  3. data/ext/zstdruby/common.h +7 -0
  4. data/ext/zstdruby/libzstd/common/bits.h +175 -0
  5. data/ext/zstdruby/libzstd/common/bitstream.h +18 -59
  6. data/ext/zstdruby/libzstd/common/compiler.h +22 -3
  7. data/ext/zstdruby/libzstd/common/cpu.h +1 -1
  8. data/ext/zstdruby/libzstd/common/debug.c +1 -1
  9. data/ext/zstdruby/libzstd/common/debug.h +1 -1
  10. data/ext/zstdruby/libzstd/common/entropy_common.c +12 -40
  11. data/ext/zstdruby/libzstd/common/error_private.c +9 -2
  12. data/ext/zstdruby/libzstd/common/error_private.h +1 -1
  13. data/ext/zstdruby/libzstd/common/fse.h +5 -83
  14. data/ext/zstdruby/libzstd/common/fse_decompress.c +7 -99
  15. data/ext/zstdruby/libzstd/common/huf.h +65 -156
  16. data/ext/zstdruby/libzstd/common/mem.h +39 -46
  17. data/ext/zstdruby/libzstd/common/pool.c +26 -10
  18. data/ext/zstdruby/libzstd/common/pool.h +7 -1
  19. data/ext/zstdruby/libzstd/common/portability_macros.h +22 -3
  20. data/ext/zstdruby/libzstd/common/threading.c +68 -14
  21. data/ext/zstdruby/libzstd/common/threading.h +5 -10
  22. data/ext/zstdruby/libzstd/common/xxhash.c +2 -2
  23. data/ext/zstdruby/libzstd/common/xxhash.h +8 -8
  24. data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
  25. data/ext/zstdruby/libzstd/common/zstd_deps.h +1 -1
  26. data/ext/zstdruby/libzstd/common/zstd_internal.h +17 -113
  27. data/ext/zstdruby/libzstd/common/zstd_trace.h +3 -3
  28. data/ext/zstdruby/libzstd/compress/clevels.h +1 -1
  29. data/ext/zstdruby/libzstd/compress/fse_compress.c +7 -124
  30. data/ext/zstdruby/libzstd/compress/hist.c +1 -1
  31. data/ext/zstdruby/libzstd/compress/hist.h +1 -1
  32. data/ext/zstdruby/libzstd/compress/huf_compress.c +234 -169
  33. data/ext/zstdruby/libzstd/compress/zstd_compress.c +1055 -455
  34. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +165 -145
  35. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +115 -39
  36. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +16 -8
  37. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +3 -3
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +1 -1
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +25 -21
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +1 -1
  41. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +5 -3
  42. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +95 -33
  43. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +3 -2
  44. data/ext/zstdruby/libzstd/compress/zstd_fast.c +433 -148
  45. data/ext/zstdruby/libzstd/compress/zstd_fast.h +3 -2
  46. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +306 -283
  47. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +4 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +5 -5
  49. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  50. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +1 -1
  51. data/ext/zstdruby/libzstd/compress/zstd_opt.c +104 -80
  52. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  53. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +12 -5
  54. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +1 -1
  55. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +434 -441
  56. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +30 -39
  57. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +3 -4
  58. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +1 -1
  59. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +164 -42
  60. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +186 -65
  61. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +1 -1
  62. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +4 -2
  63. data/ext/zstdruby/libzstd/dictBuilder/cover.c +19 -15
  64. data/ext/zstdruby/libzstd/dictBuilder/cover.h +1 -1
  65. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +2 -2
  66. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +9 -87
  67. data/ext/zstdruby/libzstd/zdict.h +53 -31
  68. data/ext/zstdruby/libzstd/zstd.h +489 -90
  69. data/ext/zstdruby/libzstd/zstd_errors.h +27 -8
  70. data/ext/zstdruby/main.c +4 -0
  71. data/ext/zstdruby/streaming_compress.c +1 -7
  72. data/ext/zstdruby/zstdruby.c +110 -26
  73. data/lib/zstd-ruby/version.rb +1 -1
  74. data/lib/zstd-ruby.rb +0 -1
  75. metadata +7 -6
data/ext/zstdruby/libzstd/compress/zstd_lazy.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -10,6 +10,7 @@
 
 #include "zstd_compress_internal.h"
 #include "zstd_lazy.h"
+#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
 
 
 /*-*************************************
@@ -197,8 +198,8 @@ ZSTD_DUBT_findBetterDictMatch (
             U32 matchIndex = dictMatchIndex + dictIndexDelta;
             if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
                 DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
-                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex);
-                bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
+                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
+                bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
             }
             if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
                 break;   /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,7 +219,7 @@ ZSTD_DUBT_findBetterDictMatch (
     }
 
     if (bestLength >= MINMATCH) {
-        U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
+        U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
         DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                     curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
     }
@@ -230,7 +231,7 @@ ZSTD_DUBT_findBetterDictMatch (
 static size_t
 ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
                         const BYTE* const ip, const BYTE* const iend,
-                        size_t* offsetPtr,
+                        size_t* offBasePtr,
                         U32 const mls,
                         const ZSTD_dictMode_e dictMode)
 {
@@ -327,8 +328,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
         if (matchLength > bestLength) {
             if (matchLength > matchEndIdx - matchIndex)
                 matchEndIdx = matchIndex + (U32)matchLength;
-            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
-                bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
+            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
+                bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
             if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
                 if (dictMode == ZSTD_dictMatchState) {
                     nbCompares = 0; /* in addition to avoiding checking any
@@ -361,16 +362,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
     if (dictMode == ZSTD_dictMatchState && nbCompares) {
         bestLength = ZSTD_DUBT_findBetterDictMatch(
                 ms, ip, iend,
-                offsetPtr, bestLength, nbCompares,
+                offBasePtr, bestLength, nbCompares,
                 mls, dictMode);
     }
 
     assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
     ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
     if (bestLength >= MINMATCH) {
-        U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
+        U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
         DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
-                    curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
+                    curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
     }
     return bestLength;
 }
@@ -381,14 +382,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
 FORCE_INLINE_TEMPLATE size_t
 ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
                 const BYTE* const ip, const BYTE* const iLimit,
-                      size_t* offsetPtr,
+                      size_t* offBasePtr,
                 const U32 mls /* template */,
                 const ZSTD_dictMode_e dictMode)
 {
     DEBUGLOG(7, "ZSTD_BtFindBestMatch");
     if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
     ZSTD_updateDUBT(ms, ip, iLimit, mls);
-    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
+    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
 }
 
 /***********************************
@@ -561,7 +562,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
             /* save best solution */
             if (currentMl > ml) {
                 ml = currentMl;
-                *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
                 if (ip+currentMl == iLimit) {
                     /* best possible, avoids read overflow on next attempt */
                     return ml;
@@ -598,7 +599,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
             /* save best solution */
             if (currentMl > ml) {
                 ml = currentMl;
-                *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
                 if (ip+currentMl == iLimit) break;   /* best possible, avoids read overflow on next attempt */
             }
         }
@@ -691,7 +692,8 @@ size_t ZSTD_HcFindBestMatch(
         if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
             const BYTE* const match = base + matchIndex;
             assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
-            if (match[ml] == ip[ml])   /* potentially better */
+            /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+            if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                 currentMl = ZSTD_count(ip, match, iLimit);
         } else {
             const BYTE* const match = dictBase + matchIndex;
@@ -703,7 +705,7 @@ size_t ZSTD_HcFindBestMatch(
         /* save best solution */
         if (currentMl > ml) {
             ml = currentMl;
-            *offsetPtr = STORE_OFFSET(curr - matchIndex);
+            *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
             if (ip+currentMl == iLimit) break;   /* best possible, avoids read overflow on next attempt */
         }
 
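
The one-byte guard `if (match[ml] == ip[ml])` becomes a four-byte one in both ZSTD_HcFindBestMatch and (further down) ZSTD_RowFindBestMatch: a candidate can only improve on the current best length ml if it agrees with ip through position ml, so comparing the 4 bytes that end at index ml (i.e. starting at ml - 3) rejects more losing candidates for a single unaligned load. A standalone sketch of the idea, with hypothetical helper names (zstd itself uses MEM_read32, and ml >= 3 always holds because ml is initialized to 4-1):

    #include <stdint.h>
    #include <string.h>

    /* Portable unaligned 32-bit load, the same idiom MEM_read32 relies on. */
    static uint32_t read32(const void* p) { uint32_t v; memcpy(&v, p, sizeof v); return v; }

    /* Nonzero if candidate `cand` can still beat a best match of length `ml`
     * against `ip`: the 4 bytes ending at index `ml` must agree. Requires
     * ml >= 3 and at least ml+1 readable bytes behind both pointers. */
    static int may_beat_best(const uint8_t* ip, const uint8_t* cand, size_t ml)
    {
        return read32(cand + ml - 3) == read32(ip + ml - 3);
    }
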
@@ -739,7 +741,7 @@ size_t ZSTD_HcFindBestMatch(
             if (currentMl > ml) {
                 ml = currentMl;
                 assert(curr > matchIndex + dmsIndexDelta);
-                *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                 if (ip+currentMl == iLimit) break;   /* best possible, avoids read overflow on next attempt */
             }
 
@@ -757,7 +759,6 @@ size_t ZSTD_HcFindBestMatch(
 ***********************************/
 /* Constants for row-based hash */
 #define ZSTD_ROW_HASH_TAG_OFFSET 16     /* byte offset of hashes in the match state's tagTable from the beginning of a row */
-#define ZSTD_ROW_HASH_TAG_BITS 8        /* nb bits to use for the tag */
 #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
 #define ZSTD_ROW_HASH_MAX_ENTRIES 64    /* absolute maximum number of entries per row, for all configurations */
 
@@ -769,38 +770,8 @@ typedef U64 ZSTD_VecMask;   /* Clarifies when we are interacting with a U64 repr
  * Starting from the LSB, returns the idx of the next non-zero bit.
  * Basically counting the nb of trailing zeroes.
  */
-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
-    assert(val != 0);
-#   if defined(_MSC_VER) && defined(_WIN64)
-    if (val != 0) {
-        unsigned long r;
-        _BitScanForward64(&r, val);
-        return (U32)(r);
-    } else {
-        /* Should not reach this code path */
-        __assume(0);
-    }
-#   elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
-    if (sizeof(size_t) == 4) {
-        U32 mostSignificantWord = (U32)(val >> 32);
-        U32 leastSignificantWord = (U32)val;
-        if (leastSignificantWord == 0) {
-            return 32 + (U32)__builtin_ctz(mostSignificantWord);
-        } else {
-            return (U32)__builtin_ctz(leastSignificantWord);
-        }
-    } else {
-        return (U32)__builtin_ctzll(val);
-    }
-#   else
-    /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
-     * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
-     */
-    val = ~val & (val - 1ULL); /* Lowest set bit mask */
-    val = val - ((val >> 1) & 0x5555555555555555);
-    val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
-    return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
-#   endif
+MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
+    return ZSTD_countTrailingZeros64(val);
 }
 
 /* ZSTD_rotateRight_*():
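
The hand-rolled MSVC/GCC/software ctz above moves into the new common/bits.h as ZSTD_countTrailingZeros64, so every caller shares one implementation. As a rough model of what such a helper does (illustrative only, not the exact bits.h source), a portable 64-bit trailing-zero count looks like:

    #include <assert.h>
    #include <stdint.h>

    /* Count trailing zero bits of a nonzero 64-bit value. Uses the compiler
     * builtin when available, with a simple portable fallback otherwise. */
    static unsigned ctz64(uint64_t val)
    {
        assert(val != 0);
    #if defined(__GNUC__) || defined(__clang__)
        return (unsigned)__builtin_ctzll(val);
    #else
        {   unsigned n = 0;
            while ((val & 1) == 0) { val >>= 1; n++; }
            return n;
        }
    #endif
    }
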
@@ -980,7 +951,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
     const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
 
     DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
-    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
+    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
+}
+
+/* Returns the mask width of bits group of which will be set to 1. Given not all
+ * architectures have easy movemask instruction, this helps to iterate over
+ * groups of bits easier and faster.
+ */
+FORCE_INLINE_TEMPLATE U32
+ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
+{
+    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+    assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+    (void)rowEntries;
+#if defined(ZSTD_ARCH_ARM_NEON)
+    /* NEON path only works for little endian */
+    if (!MEM_isLittleEndian()) {
+        return 1;
+    }
+    if (rowEntries == 16) {
+        return 4;
+    }
+    if (rowEntries == 32) {
+        return 2;
+    }
+    if (rowEntries == 64) {
+        return 1;
+    }
+#endif
+    return 1;
 }
 
 #if defined(ZSTD_ARCH_X86_SSE2)
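
ZSTD_row_matchMaskGroupWidth exists because NEON has no cheap PMOVMSKB equivalent: instead of producing one bit per row entry, the NEON paths below leave each entry's comparison result spread across a group of 1, 2, or 4 bits, and the search loop converts grouped bit positions back to entry indices by dividing by the group width. A self-contained sketch of that iteration pattern (our names, mirroring the loop in ZSTD_RowFindBestMatch further down):

    #include <stdint.h>

    static unsigned ctz64_(uint64_t v) { unsigned n = 0; while (!(v & 1)) { v >>= 1; n++; } return n; }

    /* Walk a match mask where each row entry owns `groupWidth` bits.
     * headGrouped is the circular-buffer head pre-multiplied by groupWidth. */
    static void for_each_match(uint64_t matches, uint32_t groupWidth,
                               uint32_t headGrouped, uint32_t rowMask)
    {
        for (; matches != 0; matches &= matches - 1) {   /* clear lowest set bit */
            uint32_t const bit = ctz64_(matches);
            uint32_t const matchPos = ((headGrouped + bit) / groupWidth) & rowMask;
            (void)matchPos;   /* ...prefetch / examine row[matchPos]... */
        }
    }
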
@@ -1003,71 +1002,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
 }
 #endif
 
-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
- * the hash at the nth position in a row of the tagTable.
- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
- * to match up with the actual layout of the entries within the hashTable */
+#if defined(ZSTD_ARCH_ARM_NEON)
 FORCE_INLINE_TEMPLATE ZSTD_VecMask
-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
+ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
+{
+    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+    if (rowEntries == 16) {
+        /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
+         * After that groups of 4 bits represent the equalMask. We lower
+         * all bits except the highest in these groups by doing AND with
+         * 0x88 = 0b10001000.
+         */
+        const uint8x16_t chunk = vld1q_u8(src);
+        const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
+        const uint8x8_t res = vshrn_n_u16(equalMask, 4);
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
+        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
+    } else if (rowEntries == 32) {
+        /* Same idea as with rowEntries == 16 but doing AND with
+         * 0x55 = 0b01010101.
+         */
+        const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
+        const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
+        const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
+        const uint8x16_t dup = vdupq_n_u8(tag);
+        const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
+        const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
+        const uint8x8_t res = vsli_n_u8(t0, t1, 4);
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
+        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
+    } else { /* rowEntries == 64 */
+        const uint8x16x4_t chunk = vld4q_u8(src);
+        const uint8x16_t dup = vdupq_n_u8(tag);
+        const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
+        const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
+        const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
+        const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
+
+        const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
+        const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
+        const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
+        const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
+        const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
+        return ZSTD_rotateRight_U64(matches, headGrouped);
+    }
+}
+#endif
+
+/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
+ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
+ * matches the hash at the nth position in a row of the tagTable.
+ * Each row is a circular buffer beginning at the value of "headGrouped". So we
+ * must rotate the "matches" bitfield to match up with the actual layout of the
+ * entries within the hashTable */
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
 {
     const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
     assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
     assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+    assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);
 
 #if defined(ZSTD_ARCH_X86_SSE2)
 
-    return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
+    return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
 
 #else /* SW or NEON-LE */
 
# if defined(ZSTD_ARCH_ARM_NEON)
     /* This NEON path only works for little endian - otherwise use SWAR below */
     if (MEM_isLittleEndian()) {
-        if (rowEntries == 16) {
-            const uint8x16_t chunk = vld1q_u8(src);
-            const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
-            const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
-            const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
-            const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
-            const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
-            const U16 hi = (U16)vgetq_lane_u8(t3, 8);
-            const U16 lo = (U16)vgetq_lane_u8(t3, 0);
-            return ZSTD_rotateRight_U16((hi << 8) | lo, head);
-        } else if (rowEntries == 32) {
-            const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
-            const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
-            const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
-            const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
-            const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
-            const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
-            const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
-            const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
-            const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
-            const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
-            const uint8x8x2_t t3 = vuzp_u8(t2, t0);
-            const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
-            const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
-            return ZSTD_rotateRight_U32(matches, head);
-        } else { /* rowEntries == 64 */
-            const uint8x16x4_t chunk = vld4q_u8(src);
-            const uint8x16_t dup = vdupq_n_u8(tag);
-            const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
-            const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
-            const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
-            const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
-
-            const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
-            const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
-            const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
-            const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
-            const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
-            const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
-            return ZSTD_rotateRight_U64(matches, head);
-        }
+        return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
     }
# endif /* ZSTD_ARCH_ARM_NEON */
     /* SWAR */
-    {   const size_t chunkSize = sizeof(size_t);
+    {   const int chunkSize = sizeof(size_t);
         const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
         const size_t xFF = ~((size_t)0);
         const size_t x01 = xFF / 0xFF;
@@ -1100,11 +1110,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
         }
         matches = ~matches;
         if (rowEntries == 16) {
-            return ZSTD_rotateRight_U16((U16)matches, head);
+            return ZSTD_rotateRight_U16((U16)matches, headGrouped);
         } else if (rowEntries == 32) {
-            return ZSTD_rotateRight_U32((U32)matches, head);
+            return ZSTD_rotateRight_U32((U32)matches, headGrouped);
         } else {
-            return ZSTD_rotateRight_U64((U64)matches, head);
+            return ZSTD_rotateRight_U64((U64)matches, headGrouped);
         }
     }
 #endif
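
The SWAR fallback finds which tag bytes equal the probe tag using only integer arithmetic; the x01-style constants set up the classic zero-byte test. The underlying trick, in isolation (illustrative; the zstd code applies it per size_t chunk of the tag row and then packs the per-byte results down to one bit per entry):

    #include <stdint.h>

    /* For each byte of x equal to `tag`, set the MSB of that byte in the result.
     * XOR zeroes out equal bytes; (t - 0x01..) & ~t & 0x80.. then flags exactly
     * the zero bytes, with no false positives. */
    static uint64_t bytes_equal(uint64_t x, uint8_t tag)
    {
        uint64_t const x01 = 0x0101010101010101ULL;   /* 1 in every byte lane */
        uint64_t const x80 = 0x8080808080808080ULL;   /* MSB of every byte lane */
        uint64_t const t   = x ^ (x01 * tag);         /* equal bytes become 0 */
        return (t - x01) & ~t & x80;
    }
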
@@ -1152,6 +1162,7 @@ size_t ZSTD_RowFindBestMatch(
     const U32 rowEntries = (1U << rowLog);
     const U32 rowMask = rowEntries - 1;
     const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
+    const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
     U32 nbAttempts = 1U << cappedSearchLog;
     size_t ml=4-1;
 
@@ -1194,15 +1205,15 @@ size_t ZSTD_RowFindBestMatch(
         U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
         U32* const row = hashTable + relRow;
         BYTE* tagRow = (BYTE*)(tagTable + relRow);
-        U32 const head = *tagRow & rowMask;
+        U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
         U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
         size_t numMatches = 0;
         size_t currMatch = 0;
-        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
+        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
 
         /* Cycle through the matches and prefetch */
         for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
-            U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+            U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
             U32 const matchIndex = row[matchPos];
             assert(numMatches < rowEntries);
             if (matchIndex < lowLimit)
@@ -1233,7 +1244,8 @@ size_t ZSTD_RowFindBestMatch(
             if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
                 const BYTE* const match = base + matchIndex;
                 assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
-                if (match[ml] == ip[ml])   /* potentially better */
+                /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+                if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                     currentMl = ZSTD_count(ip, match, iLimit);
             } else {
                 const BYTE* const match = dictBase + matchIndex;
@@ -1245,7 +1257,7 @@ size_t ZSTD_RowFindBestMatch(
             /* Save best solution */
             if (currentMl > ml) {
                 ml = currentMl;
-                *offsetPtr = STORE_OFFSET(curr - matchIndex);
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                 if (ip+currentMl == iLimit) break;   /* best possible, avoids read overflow on next attempt */
             }
         }
@@ -1263,14 +1275,14 @@ size_t ZSTD_RowFindBestMatch(
         const U32 dmsSize = (U32)(dmsEnd - dmsBase);
         const U32 dmsIndexDelta = dictLimit - dmsSize;
 
-        {   U32 const head = *dmsTagRow & rowMask;
+        {   U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
             U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
             size_t numMatches = 0;
             size_t currMatch = 0;
-            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
+            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
 
             for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
-                U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+                U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
                 U32 const matchIndex = dmsRow[matchPos];
                 if (matchIndex < dmsLowestIndex)
                     break;
@@ -1294,7 +1306,7 @@ size_t ZSTD_RowFindBestMatch(
                 if (currentMl > ml) {
                     ml = currentMl;
                     assert(curr > matchIndex + dmsIndexDelta);
-                    *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
+                    *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                     if (ip+currentMl == iLimit) break;
                 }
             }
@@ -1304,14 +1316,10 @@ size_t ZSTD_RowFindBestMatch(
 }
 
 
-typedef size_t (*searchMax_f)(
-                    ZSTD_matchState_t* ms,
-                    const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
-
 /**
- * This struct contains the functions necessary for lazy to search.
- * Currently, that is only searchMax. However, it is still valuable to have the
- * VTable because this makes it easier to add more functions to the VTable later.
+ * Generate search functions templated on (dictMode, mls, rowLog).
+ * These functions are outlined for code size & compilation time.
+ * ZSTD_searchMax() dispatches to the correct implementation function.
  *
  * TODO: The start of the search function involves loading and calculating a
  * bunch of constants from the ZSTD_matchState_t. These computations could be
@@ -1329,25 +1337,25 @@ typedef size_t (*searchMax_f)(
 * the single segment loop. It should go in searchMax instead of its own
 * function to avoid having multiple virtual function calls per search.
 */
-typedef struct {
-    searchMax_f searchMax;
-} ZSTD_LazyVTable;
 
-#define GEN_ZSTD_BT_VTABLE(dictMode, mls) \
-    static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \
-            ZSTD_matchState_t* ms, \
-            const BYTE* ip, const BYTE* const iLimit, \
-            size_t* offsetPtr) \
-    { \
-        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
-        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
-    } \
-    static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \
-        ZSTD_BtFindBestMatch_##dictMode##_##mls \
-    };
+#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
+#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
+#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog
+
+#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
+
+#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \
+    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \
+            ZSTD_matchState_t* ms, \
+            const BYTE* ip, const BYTE* const iLimit, \
+            size_t* offBasePtr) \
+    { \
+        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
+        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
+    } \
 
-#define GEN_ZSTD_HC_VTABLE(dictMode, mls) \
-    static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \
+#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \
+    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \
            ZSTD_matchState_t* ms, \
            const BYTE* ip, const BYTE* const iLimit, \
            size_t* offsetPtr) \
@@ -1355,12 +1363,9 @@ typedef struct {
        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
        return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
    } \
-    static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
-        ZSTD_HcFindBestMatch_##dictMode##_##mls \
-    };
 
-#define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \
-    static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \
+#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \
+    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \
            ZSTD_matchState_t* ms, \
            const BYTE* ip, const BYTE* const iLimit, \
            size_t* offsetPtr) \
@@ -1369,9 +1374,6 @@ typedef struct {
        assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
        return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
    } \
-    static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \
-        ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \
-    };
 
 #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
     X(dictMode, mls, 4) \
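
Expanding one instantiation makes the new scheme concrete: GEN_ZSTD_HC_SEARCH_FN(noDict, 4) produces (modulo whitespace) the outlined wrapper below. FORCE_NOINLINE keeps each (dictMode, mls) combination a real out-of-line function, which is what bounds code size and compile time now that the vtables are gone:

    FORCE_NOINLINE size_t ZSTD_HcFindBestMatch_noDict_4(
            ZSTD_matchState_t* ms,
            const BYTE* ip, const BYTE* const iLimit,
            size_t* offsetPtr)
    {
        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == 4);
        return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
    }
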
@@ -1394,84 +1396,103 @@ typedef struct {
     X(__VA_ARGS__, dictMatchState) \
     X(__VA_ARGS__, dedicatedDictSearch)
 
-/* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
-ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
-/* Generate Binary Tree VTables for each combination of (dictMode, mls) */
-ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
-/* Generate Hash Chain VTables for each combination of (dictMode, mls) */
-ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
-
-#define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
-    { \
-        &ZSTD_BtVTable_##dictMode##_4, \
-        &ZSTD_BtVTable_##dictMode##_5, \
-        &ZSTD_BtVTable_##dictMode##_6 \
-    }
-
-#define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
-    { \
-        &ZSTD_HcVTable_##dictMode##_4, \
-        &ZSTD_HcVTable_##dictMode##_5, \
-        &ZSTD_HcVTable_##dictMode##_6 \
-    }
+/* Generate row search fns for each combination of (dictMode, mls, rowLog) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
+/* Generate binary Tree search fns for each combination of (dictMode, mls) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
+/* Generate hash chain search fns for each combination of (dictMode, mls) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)
 
-#define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
-    { \
-        &ZSTD_RowVTable_##dictMode##_##mls##_4, \
-        &ZSTD_RowVTable_##dictMode##_##mls##_5, \
-        &ZSTD_RowVTable_##dictMode##_##mls##_6 \
-    }
-
-#define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
-    { \
-        GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
-        GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
-        GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
-    }
+typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
 
-#define GEN_ZSTD_VTABLE_ARRAY(X) \
-    { \
-        X(noDict), \
-        X(extDict), \
-        X(dictMatchState), \
-        X(dedicatedDictSearch) \
+#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
+    case mls: \
+        return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
+#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
+    case mls: \
+        return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
+#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \
+    case rowLog: \
+        return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
+
+#define ZSTD_SWITCH_MLS(X, dictMode) \
+    switch (mls) { \
+        ZSTD_FOR_EACH_MLS(X, dictMode) \
     }
 
-/* *******************************
-*  Common parser - lazy strategy
-*********************************/
-typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
+#define ZSTD_SWITCH_ROWLOG(dictMode, mls) \
+    case mls: \
+        switch (rowLog) { \
+            ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
+        } \
+        ZSTD_UNREACHABLE; \
+        break;
+
+#define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \
+    switch (searchMethod) { \
+        case search_hashChain: \
+            ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
+            break; \
+        case search_binaryTree: \
+            ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
+            break; \
+        case search_rowHash: \
+            ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \
+            break; \
+    } \
+    ZSTD_UNREACHABLE;
 
 /**
- * This table is indexed first by the four ZSTD_dictMode_e values, and then
- * by the two searchMethod_e values. NULLs are placed for configurations
- * that should never occur (extDict modes go to the other implementation
- * below and there is no DDSS for binary tree search yet).
+ * Searches for the longest match at @p ip.
+ * Dispatches to the correct implementation function based on the
+ * (searchMethod, dictMode, mls, rowLog). We use switch statements
+ * here instead of using an indirect function call through a function
+ * pointer because after Spectre and Meltdown mitigations, indirect
+ * function calls can be very costly, especially in the kernel.
+ *
+ * NOTE: dictMode and searchMethod should be templated, so those switch
+ * statements should be optimized out. Only the mls & rowLog switches
+ * should be left.
+ *
+ * @param ms The match state.
+ * @param ip The position to search at.
+ * @param iend The end of the input data.
+ * @param[out] offsetPtr Stores the match offset into this pointer.
+ * @param mls The minimum search length, in the range [4, 6].
+ * @param rowLog The row log (if applicable), in the range [4, 6].
+ * @param searchMethod The search method to use (templated).
+ * @param dictMode The dictMode (templated).
+ *
+ * @returns The length of the longest match found, or < mls if no match is found.
+ * If a match is found its offset is stored in @p offsetPtr.
 */
-
-static ZSTD_LazyVTable const*
-ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
+FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
+    ZSTD_matchState_t* ms,
+    const BYTE* ip,
+    const BYTE* iend,
+    size_t* offsetPtr,
+    U32 const mls,
+    U32 const rowLog,
+    searchMethod_e const searchMethod,
+    ZSTD_dictMode_e const dictMode)
 {
-    /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
-    ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
-    ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
-    /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
-    ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
-
-    U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
-    U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
-    switch (searchMethod) {
-        case search_hashChain:
-            return hcVTables[dictMode][mls - 4];
-        case search_binaryTree:
-            return btVTables[dictMode][mls - 4];
-        case search_rowHash:
-            return rowVTables[dictMode][mls - 4][rowLog - 4];
-        default:
-            return NULL;
+    if (dictMode == ZSTD_noDict) {
+        ZSTD_SWITCH_SEARCH_METHOD(noDict)
+    } else if (dictMode == ZSTD_extDict) {
+        ZSTD_SWITCH_SEARCH_METHOD(extDict)
+    } else if (dictMode == ZSTD_dictMatchState) {
+        ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
+    } else if (dictMode == ZSTD_dedicatedDictSearch) {
+        ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
     }
+    ZSTD_UNREACHABLE;
+    return 0;
 }
 
+/* *******************************
+*  Common parser - lazy strategy
+*********************************/
+
 FORCE_INLINE_TEMPLATE size_t
 ZSTD_compressBlock_lazy_generic(
                         ZSTD_matchState_t* ms, seqStore_t* seqStore,
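
ZSTD_searchMax above replaces the 1.5.2 function-pointer vtable with nested switches. The payoff relies on constant propagation: the dispatcher is force-inlined into callers that pass compile-time-constant searchMethod and dictMode, so those switches fold away and no indirect call (and no retpoline under Spectre mitigations) is emitted. A minimal self-contained model of the pattern (toy names and bodies, not zstd's):

    #include <stddef.h>

    typedef enum { method_a, method_b } method_e;

    static size_t impl_a_4(const char* p) { return (size_t)(unsigned char)p[0] + 4; }
    static size_t impl_a_5(const char* p) { return (size_t)(unsigned char)p[0] + 5; }
    static size_t impl_b_4(const char* p) { return (size_t)(unsigned char)p[0] * 4; }
    static size_t impl_b_5(const char* p) { return (size_t)(unsigned char)p[0] * 5; }

    /* When inlined with a constant `method`, the outer switch disappears and
     * only the runtime `mls` switch survives -- a direct call either way. */
    static inline size_t dispatch(method_e method, int mls, const char* p)
    {
        switch (method) {
        case method_a: switch (mls) { case 4: return impl_a_4(p); case 5: return impl_a_5(p); } break;
        case method_b: switch (mls) { case 4: return impl_b_4(p); case 5: return impl_b_5(p); } break;
        }
        return 0;
    }
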
@@ -1488,9 +1509,11 @@ ZSTD_compressBlock_lazy_generic(
     const BYTE* const base = ms->window.base;
     const U32 prefixLowestIndex = ms->window.dictLimit;
     const BYTE* const prefixLowest = base + prefixLowestIndex;
+    const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
+    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
 
-    searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
-    U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
+    U32 offset_1 = rep[0], offset_2 = rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
 
     const int isDMS = dictMode == ZSTD_dictMatchState;
     const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
@@ -1505,16 +1528,14 @@ ZSTD_compressBlock_lazy_generic(
                      0;
     const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
 
-    assert(searchMax != NULL);
-
     DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
     ip += (dictAndPrefixLength == 0);
     if (dictMode == ZSTD_noDict) {
         U32 const curr = (U32)(ip - base);
         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
         U32 const maxRep = curr - windowLow;
-        if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
-        if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
+        if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
     }
     if (isDxS) {
         /* dictMatchState repCode checks don't currently handle repCode == 0
@@ -1524,7 +1545,6 @@ ZSTD_compressBlock_lazy_generic(
     }
 
     if (searchMethod == search_rowHash) {
-        const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
         ZSTD_row_fillHashCache(ms, base, rowLog,
                             MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
                             ms->nextToUpdate, ilimit);
@@ -1539,7 +1559,7 @@ ZSTD_compressBlock_lazy_generic(
 #endif
     while (ip < ilimit) {
         size_t matchLength=0;
-        size_t offcode=STORE_REPCODE_1;
+        size_t offBase = REPCODE1_TO_OFFBASE;
         const BYTE* start=ip+1;
         DEBUGLOG(7, "search baseline (depth 0)");
 
@@ -1564,10 +1584,10 @@ ZSTD_compressBlock_lazy_generic(
         }
 
         /* first search (depth 0) */
-        {   size_t offsetFound = 999999999;
-            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
+        {   size_t offbaseFound = 999999999;
+            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
             if (ml2 > matchLength)
-                matchLength = ml2, start = ip, offcode=offsetFound;
+                matchLength = ml2, start = ip, offBase = offbaseFound;
         }
 
         if (matchLength < 4) {
@@ -1581,12 +1601,12 @@ ZSTD_compressBlock_lazy_generic(
             DEBUGLOG(7, "search depth 1");
             ip ++;
             if ( (dictMode == ZSTD_noDict)
-              && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+              && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                 size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                 int const gain2 = (int)(mlRep * 3);
-                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                 if ((mlRep >= 4) && (gain2 > gain1))
-                    matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                    matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
             }
             if (isDxS) {
                 const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1598,17 +1618,17 @@ ZSTD_compressBlock_lazy_generic(
                     const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                     size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                     int const gain2 = (int)(mlRep * 3);
-                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                     if ((mlRep >= 4) && (gain2 > gain1))
-                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                 }
             }
-            {   size_t offset2=999999999;
-                size_t const ml2 = searchMax(ms, ip, iend, &offset2);
-                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
+            {   size_t ofbCandidate=999999999;
+                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
                 if ((ml2 >= 4) && (gain2 > gain1)) {
-                    matchLength = ml2, offcode = offset2, start = ip;
+                    matchLength = ml2, offBase = ofbCandidate, start = ip;
                     continue;   /* search a better one */
             }   }
 
@@ -1617,12 +1637,12 @@ ZSTD_compressBlock_lazy_generic(
                 DEBUGLOG(7, "search depth 2");
                 ip ++;
                 if ( (dictMode == ZSTD_noDict)
-                  && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                  && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                     size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                     int const gain2 = (int)(mlRep * 4);
-                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                     if ((mlRep >= 4) && (gain2 > gain1))
-                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                 }
                 if (isDxS) {
                     const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1634,17 +1654,17 @@ ZSTD_compressBlock_lazy_generic(
                         const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                         size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                         int const gain2 = (int)(mlRep * 4);
-                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                         if ((mlRep >= 4) && (gain2 > gain1))
-                            matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                            matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                     }
                 }
-                {   size_t offset2=999999999;
-                    size_t const ml2 = searchMax(ms, ip, iend, &offset2);
-                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
+                {   size_t ofbCandidate=999999999;
+                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
                     if ((ml2 >= 4) && (gain2 > gain1)) {
-                        matchLength = ml2, offcode = offset2, start = ip;
+                        matchLength = ml2, offBase = ofbCandidate, start = ip;
                         continue;
             }   }   }
             break;  /* nothing found : store previous solution */
@@ -1655,24 +1675,24 @@ ZSTD_compressBlock_lazy_generic(
          * notably if `value` is unsigned, resulting in a large positive `-value`.
          */
         /* catch up */
-        if (STORED_IS_OFFSET(offcode)) {
+        if (OFFBASE_IS_OFFSET(offBase)) {
             if (dictMode == ZSTD_noDict) {
-                while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
-                     && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) )  /* only search for offset within prefix */
+                while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
+                     && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) )  /* only search for offset within prefix */
                     { start--; matchLength++; }
             }
             if (isDxS) {
-                U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
+                U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
                 const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
                 const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
                 while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
             }
-            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
+            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
         }
         /* store sequence */
_storeSequence:
         {   size_t const litLength = (size_t)(start - anchor);
-            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
+            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
             anchor = ip = start + matchLength;
         }
 
@@ -1688,8 +1708,8 @@ _storeSequence:
                  && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                     const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
                     matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
-                    offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+                    offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                     ip += matchLength;
                     anchor = ip;
                     continue;
@@ -1703,16 +1723,20 @@ _storeSequence:
                  && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
                 /* store sequence */
                 matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
-                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap repcodes */
-                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap repcodes */
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                 ip += matchLength;
                 anchor = ip;
                 continue;   /* faster when present ... (?) */
    }   }   }
 
-    /* Save reps for next block */
-    rep[0] = offset_1 ? offset_1 : savedOffset;
-    rep[1] = offset_2 ? offset_2 : savedOffset;
+    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+    /* save reps for next block */
+    rep[0] = offset_1 ? offset_1 : offsetSaved1;
+    rep[1] = offset_2 ? offset_2 : offsetSaved2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
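
The STORE_*/STORED_* to *_OFFBASE renames running through this file reflect one unified representation: offBase is a single value that encodes either a repcode (small values) or a real offset (values shifted past the repcode range), which is why gain1 can now take ZSTD_highbit32((U32)offBase) directly instead of converting through STORED_TO_OFFBASE first. A schematic codec for such an encoding (illustrative constants and names; the real macros live in zstd_compress_internal.h):

    #include <assert.h>
    #include <stddef.h>

    #define REP_NUM 3   /* repcode history slots, as ZSTD_REP_NUM */

    /* One integer carries both cases: 1..REP_NUM are repcodes, larger values
     * are (offset + REP_NUM). */
    static size_t offset_to_offbase(size_t offset)  { assert(offset > 0); return offset + REP_NUM; }
    static size_t repcode_to_offbase(size_t rep)    { assert(1 <= rep && rep <= REP_NUM); return rep; }
    static int    offbase_is_offset(size_t offBase) { return offBase > REP_NUM; }
    static size_t offbase_to_offset(size_t offBase) { assert(offbase_is_offset(offBase)); return offBase - REP_NUM; }
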
@@ -1881,9 +1905,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
     const BYTE* const dictEnd = dictBase + dictLimit;
     const BYTE* const dictStart = dictBase + ms->window.lowLimit;
     const U32 windowLog = ms->cParams.windowLog;
-    const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
+    const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
+    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
 
-    searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
     U32 offset_1 = rep[0], offset_2 = rep[1];
 
     DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
@@ -1905,7 +1929,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 #endif
     while (ip < ilimit) {
         size_t matchLength=0;
-        size_t offcode=STORE_REPCODE_1;
+        size_t offBase = REPCODE1_TO_OFFBASE;
         const BYTE* start=ip+1;
         U32 curr = (U32)(ip-base);
 
@@ -1924,10 +1948,10 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
         }   }
 
         /* first search (depth 0) */
-        {   size_t offsetFound = 999999999;
-            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
+        {   size_t ofbCandidate = 999999999;
+            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
             if (ml2 > matchLength)
-                matchLength = ml2, start = ip, offcode=offsetFound;
+                matchLength = ml2, start = ip, offBase = ofbCandidate;
         }
 
         if (matchLength < 4) {
@@ -1941,7 +1965,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
             ip ++;
             curr++;
             /* check repCode */
-            if (offcode) {
+            if (offBase) {
                 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                 const U32 repIndex = (U32)(curr - offset_1);
                 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1953,18 +1977,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                     const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                     size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                     int const gain2 = (int)(repLength * 3);
-                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                     if ((repLength >= 4) && (gain2 > gain1))
-                        matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
+                        matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
            }   }
 
            /* search match, depth 1 */
-            {   size_t offset2=999999999;
-                size_t const ml2 = searchMax(ms, ip, iend, &offset2);
-                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
+            {   size_t ofbCandidate = 999999999;
+                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
                if ((ml2 >= 4) && (gain2 > gain1)) {
-                    matchLength = ml2, offcode = offset2, start = ip;
+                    matchLength = ml2, offBase = ofbCandidate, start = ip;
                    continue;   /* search a better one */
            }   }
 
@@ -1973,7 +1997,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                ip ++;
                curr++;
                /* check repCode */
-                if (offcode) {
+                if (offBase) {
                    const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                    const U32 repIndex = (U32)(curr - offset_1);
                    const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1985,36 +2009,36 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                        const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                        size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                        int const gain2 = (int)(repLength * 4);
-                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                        if ((repLength >= 4) && (gain2 > gain1))
-                            matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
+                            matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
                }   }
 
                /* search match, depth 2 */
-                {   size_t offset2=999999999;
-                    size_t const ml2 = searchMax(ms, ip, iend, &offset2);
-                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
+                {   size_t ofbCandidate = 999999999;
+                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
                    if ((ml2 >= 4) && (gain2 > gain1)) {
-                        matchLength = ml2, offcode = offset2, start = ip;
+                        matchLength = ml2, offBase = ofbCandidate, start = ip;
                        continue;
            }   }   }
            break;  /* nothing found : store previous solution */
        }
 
        /* catch up */
-        if (STORED_IS_OFFSET(offcode)) {
-            U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
+        if (OFFBASE_IS_OFFSET(offBase)) {
+            U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
            const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
            const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
            while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
-            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
+            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
        }
 
        /* store sequence */
_storeSequence:
        {   size_t const litLength = (size_t)(start - anchor);
-            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
+            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
            anchor = ip = start + matchLength;
        }
 
@@ -2031,8 +2055,8 @@ _storeSequence:
                /* repcode detected we should take it */
                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
-                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset history */
-                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset history */
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                ip += matchLength;
                anchor = ip;
                continue;   /* faster when present ... (?) */
@@ -2098,7 +2122,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row(
 size_t ZSTD_compressBlock_lazy2_extDict_row(
     ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
     void const* src, size_t srcSize)
-
 {
     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
 }