zstd-ruby 1.5.0.0 → 1.5.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +2 -2
  3. data/README.md +1 -1
  4. data/ext/zstdruby/extconf.rb +1 -0
  5. data/ext/zstdruby/libzstd/Makefile +50 -175
  6. data/ext/zstdruby/libzstd/README.md +7 -1
  7. data/ext/zstdruby/libzstd/common/bitstream.h +24 -9
  8. data/ext/zstdruby/libzstd/common/compiler.h +89 -43
  9. data/ext/zstdruby/libzstd/common/entropy_common.c +11 -5
  10. data/ext/zstdruby/libzstd/common/error_private.h +79 -0
  11. data/ext/zstdruby/libzstd/common/fse.h +2 -1
  12. data/ext/zstdruby/libzstd/common/fse_decompress.c +1 -1
  13. data/ext/zstdruby/libzstd/common/huf.h +24 -22
  14. data/ext/zstdruby/libzstd/common/mem.h +18 -0
  15. data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
  16. data/ext/zstdruby/libzstd/common/xxhash.c +5 -805
  17. data/ext/zstdruby/libzstd/common/xxhash.h +5568 -167
  18. data/ext/zstdruby/libzstd/common/zstd_internal.h +92 -88
  19. data/ext/zstdruby/libzstd/common/zstd_trace.h +12 -3
  20. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  21. data/ext/zstdruby/libzstd/compress/fse_compress.c +63 -27
  22. data/ext/zstdruby/libzstd/compress/huf_compress.c +537 -104
  23. data/ext/zstdruby/libzstd/compress/zstd_compress.c +194 -278
  24. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +102 -44
  25. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +4 -3
  26. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +3 -1
  27. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +5 -4
  28. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +3 -2
  29. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +3 -3
  30. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +289 -114
  31. data/ext/zstdruby/libzstd/compress/zstd_fast.c +302 -123
  32. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +418 -502
  33. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +4 -4
  34. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  35. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +4 -1
  36. data/ext/zstdruby/libzstd/compress/zstd_opt.c +186 -108
  37. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +59 -29
  38. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +727 -189
  39. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
  40. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +85 -22
  41. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +744 -220
  42. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +8 -2
  43. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +34 -3
  44. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +23 -3
  45. data/ext/zstdruby/libzstd/dictBuilder/cover.c +9 -2
  46. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +11 -4
  47. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +99 -28
  48. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +2 -6
  49. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +3 -7
  50. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +3 -7
  51. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +3 -7
  52. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +3 -7
  53. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +3 -7
  54. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +3 -7
  55. data/ext/zstdruby/libzstd/libzstd.mk +185 -0
  56. data/ext/zstdruby/libzstd/libzstd.pc.in +1 -0
  57. data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
  58. data/ext/zstdruby/libzstd/zdict.h +4 -4
  59. data/ext/zstdruby/libzstd/zstd.h +179 -136
  60. data/ext/zstdruby/zstdruby.c +2 -2
  61. data/lib/zstd-ruby/version.rb +1 -1
  62. metadata +8 -3
@@ -61,7 +61,7 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
61
61
  * assumption : curr >= btlow == (curr - btmask)
62
62
  * doesn't fail */
63
63
  static void
64
- ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
64
+ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
65
65
  U32 curr, const BYTE* inputEnd,
66
66
  U32 nbCompares, U32 btLow,
67
67
  const ZSTD_dictMode_e dictMode)
@@ -93,7 +93,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
93
93
  assert(curr >= btLow);
94
94
  assert(ip < iend); /* condition for ZSTD_count */
95
95
 
96
- while (nbCompares-- && (matchIndex > windowLow)) {
96
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
97
97
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
98
98
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
99
99
  assert(matchIndex < curr);
@@ -151,7 +151,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
151
151
 
152
152
  static size_t
153
153
  ZSTD_DUBT_findBetterDictMatch (
154
- ZSTD_matchState_t* ms,
154
+ const ZSTD_matchState_t* ms,
155
155
  const BYTE* const ip, const BYTE* const iend,
156
156
  size_t* offsetPtr,
157
157
  size_t bestLength,
@@ -185,7 +185,7 @@ ZSTD_DUBT_findBetterDictMatch (
185
185
  (void)dictMode;
186
186
  assert(dictMode == ZSTD_dictMatchState);
187
187
 
188
- while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
188
+ for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
189
189
  U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
190
190
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
191
191
  const BYTE* match = dictBase + dictMatchIndex;
@@ -309,7 +309,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
309
309
  matchIndex = hashTable[h];
310
310
  hashTable[h] = curr; /* Update Hash Table */
311
311
 
312
- while (nbCompares-- && (matchIndex > windowLow)) {
312
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
313
313
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
314
314
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
315
315
  const BYTE* match;
@@ -357,6 +357,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
357
357
 
358
358
  *smallerPtr = *largerPtr = 0;
359
359
 
360
+ assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
360
361
  if (dictMode == ZSTD_dictMatchState && nbCompares) {
361
362
  bestLength = ZSTD_DUBT_findBetterDictMatch(
362
363
  ms, ip, iend,
@@ -390,54 +391,6 @@ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
390
391
  return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
391
392
  }
392
393
 
393
-
394
- static size_t
395
- ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms,
396
- const BYTE* ip, const BYTE* const iLimit,
397
- size_t* offsetPtr)
398
- {
399
- switch(ms->cParams.minMatch)
400
- {
401
- default : /* includes case 3 */
402
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
403
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
404
- case 7 :
405
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
406
- }
407
- }
408
-
409
-
410
- static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
411
- ZSTD_matchState_t* ms,
412
- const BYTE* ip, const BYTE* const iLimit,
413
- size_t* offsetPtr)
414
- {
415
- switch(ms->cParams.minMatch)
416
- {
417
- default : /* includes case 3 */
418
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
419
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
420
- case 7 :
421
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
422
- }
423
- }
424
-
425
-
426
- static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
427
- ZSTD_matchState_t* ms,
428
- const BYTE* ip, const BYTE* const iLimit,
429
- size_t* offsetPtr)
430
- {
431
- switch(ms->cParams.minMatch)
432
- {
433
- default : /* includes case 3 */
434
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
435
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
436
- case 7 :
437
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
438
- }
439
- }
440
-
441
394
  /***********************************
442
395
  * Dedicated dict search
443
396
  ***********************************/
@@ -450,7 +403,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
450
403
  U32* const chainTable = ms->chainTable;
451
404
  U32 const chainSize = 1 << ms->cParams.chainLog;
452
405
  U32 idx = ms->nextToUpdate;
453
- U32 const minChain = chainSize < target ? target - chainSize : idx;
406
+ U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
454
407
  U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
455
408
  U32 const cacheSize = bucketSize - 1;
456
409
  U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
@@ -464,7 +417,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
464
417
  U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
465
418
  U32* const tmpHashTable = hashTable;
466
419
  U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
467
- U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
420
+ U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
468
421
  U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
469
422
  U32 hashIdx;
470
423
 
@@ -692,7 +645,7 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
692
645
 
693
646
  /* inlining is important to hardwire a hot branch (template emulation) */
694
647
  FORCE_INLINE_TEMPLATE
695
- size_t ZSTD_HcFindBestMatch_generic (
648
+ size_t ZSTD_HcFindBestMatch(
696
649
  ZSTD_matchState_t* ms,
697
650
  const BYTE* const ip, const BYTE* const iLimit,
698
651
  size_t* offsetPtr,
@@ -758,6 +711,7 @@ size_t ZSTD_HcFindBestMatch_generic (
758
711
  matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
759
712
  }
760
713
 
714
+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
761
715
  if (dictMode == ZSTD_dedicatedDictSearch) {
762
716
  ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
763
717
  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
@@ -797,310 +751,80 @@ size_t ZSTD_HcFindBestMatch_generic (
797
751
  return ml;
798
752
  }
799
753
 
800
-
801
- FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
802
- ZSTD_matchState_t* ms,
803
- const BYTE* ip, const BYTE* const iLimit,
804
- size_t* offsetPtr)
805
- {
806
- switch(ms->cParams.minMatch)
807
- {
808
- default : /* includes case 3 */
809
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
810
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
811
- case 7 :
812
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
813
- }
814
- }
815
-
816
-
817
- static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
818
- ZSTD_matchState_t* ms,
819
- const BYTE* ip, const BYTE* const iLimit,
820
- size_t* offsetPtr)
821
- {
822
- switch(ms->cParams.minMatch)
823
- {
824
- default : /* includes case 3 */
825
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
826
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
827
- case 7 :
828
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
829
- }
830
- }
831
-
832
-
833
- static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS (
834
- ZSTD_matchState_t* ms,
835
- const BYTE* ip, const BYTE* const iLimit,
836
- size_t* offsetPtr)
837
- {
838
- switch(ms->cParams.minMatch)
839
- {
840
- default : /* includes case 3 */
841
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch);
842
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch);
843
- case 7 :
844
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch);
845
- }
846
- }
847
-
848
-
849
- FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
850
- ZSTD_matchState_t* ms,
851
- const BYTE* ip, const BYTE* const iLimit,
852
- size_t* offsetPtr)
853
- {
854
- switch(ms->cParams.minMatch)
855
- {
856
- default : /* includes case 3 */
857
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
858
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
859
- case 7 :
860
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
861
- }
862
- }
863
-
864
754
  /* *********************************
865
755
  * (SIMD) Row-based matchfinder
866
756
  ***********************************/
867
757
  /* Constants for row-based hash */
868
- #define ZSTD_ROW_HASH_TAG_OFFSET 1 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
869
- #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
758
+ #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
759
+ #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
870
760
  #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
761
+ #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
871
762
 
872
763
  #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
873
764
 
874
- typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */
875
-
876
- #if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */
877
-
878
- #include <emmintrin.h>
879
- typedef __m128i ZSTD_Vec128;
880
-
881
- /* Returns a 128-bit container with 128-bits from src */
882
- static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
883
- return _mm_loadu_si128((ZSTD_Vec128 const*)src);
884
- }
885
-
886
- /* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */
887
- static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
888
- return _mm_set1_epi8((char)val);
889
- }
890
-
891
- /* Do byte-by-byte comparison result of x and y. Then collapse 128-bit resultant mask
892
- * into a 32-bit mask that is the MSB of each byte.
893
- * */
894
- static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
895
- return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
896
- }
897
-
898
- typedef struct {
899
- __m128i fst;
900
- __m128i snd;
901
- } ZSTD_Vec256;
902
-
903
- static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
904
- ZSTD_Vec256 v;
905
- v.fst = ZSTD_Vec128_read(ptr);
906
- v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
907
- return v;
908
- }
909
-
910
- static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
911
- ZSTD_Vec256 v;
912
- v.fst = ZSTD_Vec128_set8(val);
913
- v.snd = ZSTD_Vec128_set8(val);
914
- return v;
915
- }
916
-
917
- static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
918
- ZSTD_VecMask fstMask;
919
- ZSTD_VecMask sndMask;
920
- fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
921
- sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
922
- return fstMask | (sndMask << 16);
923
- }
924
-
925
- #elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */
926
-
927
- #include <arm_neon.h>
928
- typedef uint8x16_t ZSTD_Vec128;
929
-
930
- static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
931
- return vld1q_u8((const BYTE* const)src);
932
- }
933
-
934
- static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
935
- return vdupq_n_u8(val);
936
- }
937
-
938
- /* Mimics '_mm_movemask_epi8()' from SSE */
939
- static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) {
940
- /* Shift out everything but the MSB bits in each byte */
941
- uint16x8_t highBits = vreinterpretq_u16_u8(vshrq_n_u8(val, 7));
942
- /* Merge the even lanes together with vsra (right shift and add) */
943
- uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(highBits, highBits, 7));
944
- uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
945
- uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
946
- /* Extract the low 8 bits from each lane, merge */
947
- return vgetq_lane_u8(paired64, 0) | ((U32)vgetq_lane_u8(paired64, 8) << 8);
948
- }
949
-
950
- static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
951
- return (ZSTD_VecMask)ZSTD_vmovmaskq_u8(vceqq_u8(x, y));
952
- }
953
-
954
- typedef struct {
955
- uint8x16_t fst;
956
- uint8x16_t snd;
957
- } ZSTD_Vec256;
958
-
959
- static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
960
- ZSTD_Vec256 v;
961
- v.fst = ZSTD_Vec128_read(ptr);
962
- v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
963
- return v;
964
- }
965
-
966
- static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
967
- ZSTD_Vec256 v;
968
- v.fst = ZSTD_Vec128_set8(val);
969
- v.snd = ZSTD_Vec128_set8(val);
970
- return v;
971
- }
972
-
973
- static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
974
- ZSTD_VecMask fstMask;
975
- ZSTD_VecMask sndMask;
976
- fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
977
- sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
978
- return fstMask | (sndMask << 16);
979
- }
980
-
981
- #else /* Scalar fallback version */
982
-
983
- #define VEC128_NB_SIZE_T (16 / sizeof(size_t))
984
- typedef struct {
985
- size_t vec[VEC128_NB_SIZE_T];
986
- } ZSTD_Vec128;
987
-
988
- static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
989
- ZSTD_Vec128 ret;
990
- ZSTD_memcpy(ret.vec, src, VEC128_NB_SIZE_T*sizeof(size_t));
991
- return ret;
992
- }
993
-
994
- static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
995
- ZSTD_Vec128 ret = { {0} };
996
- int startBit = sizeof(size_t) * 8 - 8;
997
- for (;startBit >= 0; startBit -= 8) {
998
- unsigned j = 0;
999
- for (;j < VEC128_NB_SIZE_T; ++j) {
1000
- ret.vec[j] |= ((size_t)val << startBit);
1001
- }
1002
- }
1003
- return ret;
1004
- }
1005
-
1006
- /* Compare x to y, byte by byte, generating a "matches" bitfield */
1007
- static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
1008
- ZSTD_VecMask res = 0;
1009
- unsigned i = 0;
1010
- unsigned l = 0;
1011
- for (; i < VEC128_NB_SIZE_T; ++i) {
1012
- const size_t cmp1 = x.vec[i];
1013
- const size_t cmp2 = y.vec[i];
1014
- unsigned j = 0;
1015
- for (; j < sizeof(size_t); ++j, ++l) {
1016
- if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
1017
- res |= ((U32)1 << (j+i*sizeof(size_t)));
1018
- }
1019
- }
1020
- }
1021
- return res;
1022
- }
1023
-
1024
- #define VEC256_NB_SIZE_T 2*VEC128_NB_SIZE_T
1025
- typedef struct {
1026
- size_t vec[VEC256_NB_SIZE_T];
1027
- } ZSTD_Vec256;
1028
-
1029
- static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) {
1030
- ZSTD_Vec256 ret;
1031
- ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t));
1032
- return ret;
1033
- }
1034
-
1035
- static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
1036
- ZSTD_Vec256 ret = { {0} };
1037
- int startBit = sizeof(size_t) * 8 - 8;
1038
- for (;startBit >= 0; startBit -= 8) {
1039
- unsigned j = 0;
1040
- for (;j < VEC256_NB_SIZE_T; ++j) {
1041
- ret.vec[j] |= ((size_t)val << startBit);
1042
- }
1043
- }
1044
- return ret;
1045
- }
1046
-
1047
- /* Compare x to y, byte by byte, generating a "matches" bitfield */
1048
- static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
1049
- ZSTD_VecMask res = 0;
1050
- unsigned i = 0;
1051
- unsigned l = 0;
1052
- for (; i < VEC256_NB_SIZE_T; ++i) {
1053
- const size_t cmp1 = x.vec[i];
1054
- const size_t cmp2 = y.vec[i];
1055
- unsigned j = 0;
1056
- for (; j < sizeof(size_t); ++j, ++l) {
1057
- if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
1058
- res |= ((U32)1 << (j+i*sizeof(size_t)));
1059
- }
1060
- }
1061
- }
1062
- return res;
1063
- }
1064
-
1065
- #endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */
765
+ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */
1066
766
 
1067
767
  /* ZSTD_VecMask_next():
1068
768
  * Starting from the LSB, returns the idx of the next non-zero bit.
1069
769
  * Basically counting the nb of trailing zeroes.
1070
770
  */
1071
771
  static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
1072
- # if defined(_MSC_VER) /* Visual */
1073
- unsigned long r=0;
1074
- return _BitScanForward(&r, val) ? (U32)r : 0;
1075
- # elif defined(__GNUC__) && (__GNUC__ >= 3)
1076
- return (U32)__builtin_ctz(val);
772
+ assert(val != 0);
773
+ # if defined(_MSC_VER) && defined(_WIN64)
774
+ if (val != 0) {
775
+ unsigned long r;
776
+ _BitScanForward64(&r, val);
777
+ return (U32)(r);
778
+ } else {
779
+ /* Should not reach this code path */
780
+ __assume(0);
781
+ }
782
+ # elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
783
+ if (sizeof(size_t) == 4) {
784
+ U32 mostSignificantWord = (U32)(val >> 32);
785
+ U32 leastSignificantWord = (U32)val;
786
+ if (leastSignificantWord == 0) {
787
+ return 32 + (U32)__builtin_ctz(mostSignificantWord);
788
+ } else {
789
+ return (U32)__builtin_ctz(leastSignificantWord);
790
+ }
791
+ } else {
792
+ return (U32)__builtin_ctzll(val);
793
+ }
1077
794
  # else
1078
- /* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */
1079
- static const U32 multiplyDeBruijnBitPosition[32] =
1080
- {
1081
- 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
1082
- 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
1083
- };
1084
- return multiplyDeBruijnBitPosition[((U32)((v & -(int)v) * 0x077CB531U)) >> 27];
795
+ /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
796
+ * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
797
+ */
798
+ val = ~val & (val - 1ULL); /* Lowest set bit mask */
799
+ val = val - ((val >> 1) & 0x5555555555555555);
800
+ val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
801
+ return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
1085
802
  # endif
1086
803
  }
1087
804
 
1088
- /* ZSTD_VecMask_rotateRight():
1089
- * Rotates a bitfield to the right by "rotation" bits.
1090
- * If the rotation is greater than totalBits, the returned mask is 0.
805
+ /* ZSTD_rotateRight_*():
806
+ * Rotates a bitfield to the right by "count" bits.
807
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
1091
808
  */
1092
- FORCE_INLINE_TEMPLATE ZSTD_VecMask
1093
- ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) {
1094
- if (rotation == 0)
1095
- return mask;
1096
- switch (totalBits) {
1097
- default:
1098
- assert(0);
1099
- case 16:
1100
- return (mask >> rotation) | (U16)(mask << (16 - rotation));
1101
- case 32:
1102
- return (mask >> rotation) | (U32)(mask << (32 - rotation));
1103
- }
809
+ FORCE_INLINE_TEMPLATE
810
+ U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
811
+ assert(count < 64);
812
+ count &= 0x3F; /* for fickle pattern recognition */
813
+ return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
814
+ }
815
+
816
+ FORCE_INLINE_TEMPLATE
817
+ U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
818
+ assert(count < 32);
819
+ count &= 0x1F; /* for fickle pattern recognition */
820
+ return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
821
+ }
822
+
823
+ FORCE_INLINE_TEMPLATE
824
+ U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
825
+ assert(count < 16);
826
+ count &= 0x0F; /* for fickle pattern recognition */
827
+ return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
1104
828
  }
1105
829
 
1106
830
  /* ZSTD_row_nextIndex():
@@ -1126,20 +850,24 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
1126
850
  */
1127
851
  FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
1128
852
  PREFETCH_L1(hashTable + relRow);
1129
- if (rowLog == 5) {
853
+ if (rowLog >= 5) {
1130
854
  PREFETCH_L1(hashTable + relRow + 16);
855
+ /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
1131
856
  }
1132
857
  PREFETCH_L1(tagTable + relRow);
1133
- assert(rowLog == 4 || rowLog == 5);
858
+ if (rowLog == 6) {
859
+ PREFETCH_L1(tagTable + relRow + 32);
860
+ }
861
+ assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
1134
862
  assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
1135
- assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */
863
+ assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
1136
864
  }
1137
865
 
1138
866
  /* ZSTD_row_fillHashCache():
1139
867
  * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
1140
868
  * but not beyond iLimit.
1141
869
  */
1142
- static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
870
+ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
1143
871
  U32 const rowLog, U32 const mls,
1144
872
  U32 idx, const BYTE* const iLimit)
1145
873
  {
@@ -1179,35 +907,65 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab
1179
907
  }
1180
908
  }
1181
909
 
1182
- /* ZSTD_row_update_internal():
1183
- * Inserts the byte at ip into the appropriate position in the hash table.
1184
- * Determines the relative row, and the position within the {16, 32} entry row to insert at.
910
+ /* ZSTD_row_update_internalImpl():
911
+ * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
1185
912
  */
1186
- FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
1187
- U32 const mls, U32 const rowLog,
1188
- U32 const rowMask, U32 const useCache)
913
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
914
+ U32 updateStartIdx, U32 const updateEndIdx,
915
+ U32 const mls, U32 const rowLog,
916
+ U32 const rowMask, U32 const useCache)
1189
917
  {
1190
918
  U32* const hashTable = ms->hashTable;
1191
919
  U16* const tagTable = ms->tagTable;
1192
920
  U32 const hashLog = ms->rowHashLog;
1193
921
  const BYTE* const base = ms->window.base;
1194
- const U32 target = (U32)(ip - base);
1195
- U32 idx = ms->nextToUpdate;
1196
922
 
1197
- DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target);
1198
- for (; idx < target; ++idx) {
1199
- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls)
1200
- : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
923
+ DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
924
+ for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
925
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
926
+ : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1201
927
  U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1202
928
  U32* const row = hashTable + relRow;
1203
929
  BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
1204
930
  Explicit cast allows us to get exact desired position within each row */
1205
931
  U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1206
932
 
1207
- assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
933
+ assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
1208
934
  ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
1209
- row[pos] = idx;
935
+ row[pos] = updateStartIdx;
1210
936
  }
937
+ }
938
+
939
+ /* ZSTD_row_update_internal():
940
+ * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
941
+ * Skips sections of long matches as is necessary.
942
+ */
943
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
944
+ U32 const mls, U32 const rowLog,
945
+ U32 const rowMask, U32 const useCache)
946
+ {
947
+ U32 idx = ms->nextToUpdate;
948
+ const BYTE* const base = ms->window.base;
949
+ const U32 target = (U32)(ip - base);
950
+ const U32 kSkipThreshold = 384;
951
+ const U32 kMaxMatchStartPositionsToUpdate = 96;
952
+ const U32 kMaxMatchEndPositionsToUpdate = 32;
953
+
954
+ if (useCache) {
955
+ /* Only skip positions when using hash cache, i.e.
956
+ * if we are loading a dict, don't skip anything.
957
+ * If we decide to skip, then we only update a set number
958
+ * of positions at the beginning and end of the match.
959
+ */
960
+ if (UNLIKELY(target - idx > kSkipThreshold)) {
961
+ U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
962
+ ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
963
+ idx = target - kMaxMatchEndPositionsToUpdate;
964
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
965
+ }
966
+ }
967
+ assert(target >= idx);
968
+ ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
1211
969
  ms->nextToUpdate = target;
1212
970
  }
1213
971
 
@@ -1216,7 +974,7 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const
1216
974
  * processing.
1217
975
  */
1218
976
  void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
1219
- const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
977
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
1220
978
  const U32 rowMask = (1u << rowLog) - 1;
1221
979
  const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
1222
980
 
@@ -1224,26 +982,131 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
1224
982
  ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
1225
983
  }
1226
984
 
985
#if defined(ZSTD_ARCH_X86_SSE2)
/* ZSTD_row_getSSEMask():
 * Compares nbChunks consecutive 16-byte chunks starting at src against `tag`.
 * Each chunk yields a 16-bit mask (bit n set when byte n equals tag); the masks
 * are concatenated with chunk 0 in the lowest bits and the result is passed
 * through the ZSTD_rotateRight_U16/U32/U64 helper matching its width, with `head`
 * as the rotation amount.
 * nbChunks must be 1, 2 or 4.
 */
FORCE_INLINE_TEMPLATE ZSTD_VecMask
ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
{
    const __m128i splatTag = _mm_set1_epi8((char)tag);
    int chunkMask[4] = {0};
    int c;
    assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
    for (c = 0; c < nbChunks; c++) {
        const __m128i bytes = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*c));
        chunkMask[c] = _mm_movemask_epi8(_mm_cmpeq_epi8(bytes, splatTag));
    }
    switch (nbChunks) {
    case 1:
        return ZSTD_rotateRight_U16((U16)chunkMask[0], head);
    case 2:
        return ZSTD_rotateRight_U32((U32)chunkMask[1] << 16 | (U32)chunkMask[0], head);
    default:
        assert(nbChunks == 4);
        return ZSTD_rotateRight_U64((U64)chunkMask[3] << 48 | (U64)chunkMask[2] << 32
                                  | (U64)chunkMask[1] << 16 | (U64)chunkMask[0], head);
    }
}
#endif
1004
+
1227
1005
  /* Returns a ZSTD_VecMask (wide enough for up to 64 row entries) that has the nth bit set to 1 if the newly-computed "tag" matches
1228
1006
  * the hash at the nth position in a row of the tagTable.
1229
- */
1230
- FORCE_INLINE_TEMPLATE
1231
- ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) {
1232
- ZSTD_VecMask matches = 0;
1233
- if (rowEntries == 16) {
1234
- ZSTD_Vec128 hashes = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
1235
- ZSTD_Vec128 expandedTags = ZSTD_Vec128_set8(tag);
1236
- matches = ZSTD_Vec128_cmpMask8(hashes, expandedTags);
1237
- } else if (rowEntries == 32) {
1238
- ZSTD_Vec256 hashes = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
1239
- ZSTD_Vec256 expandedTags = ZSTD_Vec256_set8(tag);
1240
- matches = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
1241
- } else {
1242
- assert(0);
1007
+ * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1008
+ * to match up with the actual layout of the entries within the hashTable */
1009
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
1010
+ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
1011
+ {
1012
+ const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
1013
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
1014
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
1015
+
1016
+ #if defined(ZSTD_ARCH_X86_SSE2)
1017
+
1018
+ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
1019
+
1020
+ #else /* SW or NEON-LE */
1021
+
1022
+ # if defined(ZSTD_ARCH_ARM_NEON)
1023
+ /* This NEON path only works for little endian - otherwise use SWAR below */
1024
+ if (MEM_isLittleEndian()) {
1025
+ if (rowEntries == 16) {
1026
+ const uint8x16_t chunk = vld1q_u8(src);
1027
+ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
1028
+ const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
1029
+ const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
1030
+ const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
1031
+ const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
1032
+ const U16 hi = (U16)vgetq_lane_u8(t3, 8);
1033
+ const U16 lo = (U16)vgetq_lane_u8(t3, 0);
1034
+ return ZSTD_rotateRight_U16((hi << 8) | lo, head);
1035
+ } else if (rowEntries == 32) {
1036
+ const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
1037
+ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
1038
+ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
1039
+ const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
1040
+ const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
1041
+ const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
1042
+ const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
1043
+ const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
1044
+ const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
1045
+ const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
1046
+ const uint8x8x2_t t3 = vuzp_u8(t2, t0);
1047
+ const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
1048
+ const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
1049
+ return ZSTD_rotateRight_U32(matches, head);
1050
+ } else { /* rowEntries == 64 */
1051
+ const uint8x16x4_t chunk = vld4q_u8(src);
1052
+ const uint8x16_t dup = vdupq_n_u8(tag);
1053
+ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
1054
+ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
1055
+ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
1056
+ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
1057
+
1058
+ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
1059
+ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
1060
+ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
1061
+ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
1062
+ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
1063
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
1064
+ return ZSTD_rotateRight_U64(matches, head);
1065
+ }
1243
1066
  }
1244
- /* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1245
- to match up with the actual layout of the entries within the hashTable */
1246
- return ZSTD_VecMask_rotateRight(matches, head, rowEntries);
1067
+ # endif /* ZSTD_ARCH_ARM_NEON */
1068
+ /* SWAR */
1069
+ { const size_t chunkSize = sizeof(size_t);
1070
+ const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
1071
+ const size_t xFF = ~((size_t)0);
1072
+ const size_t x01 = xFF / 0xFF;
1073
+ const size_t x80 = x01 << 7;
1074
+ const size_t splatChar = tag * x01;
1075
+ ZSTD_VecMask matches = 0;
1076
+ int i = rowEntries - chunkSize;
1077
+ assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
1078
+ if (MEM_isLittleEndian()) { /* runtime check so have two loops */
1079
+ const size_t extractMagic = (xFF / 0x7F) >> chunkSize;
1080
+ do {
1081
+ size_t chunk = MEM_readST(&src[i]);
1082
+ chunk ^= splatChar;
1083
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
1084
+ matches <<= chunkSize;
1085
+ matches |= (chunk * extractMagic) >> shiftAmount;
1086
+ i -= chunkSize;
1087
+ } while (i >= 0);
1088
+ } else { /* big endian: reverse bits during extraction */
1089
+ const size_t msb = xFF ^ (xFF >> 1);
1090
+ const size_t extractMagic = (msb / 0x1FF) | msb;
1091
+ do {
1092
+ size_t chunk = MEM_readST(&src[i]);
1093
+ chunk ^= splatChar;
1094
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
1095
+ matches <<= chunkSize;
1096
+ matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
1097
+ i -= chunkSize;
1098
+ } while (i >= 0);
1099
+ }
1100
+ matches = ~matches;
1101
+ if (rowEntries == 16) {
1102
+ return ZSTD_rotateRight_U16((U16)matches, head);
1103
+ } else if (rowEntries == 32) {
1104
+ return ZSTD_rotateRight_U32((U32)matches, head);
1105
+ } else {
1106
+ return ZSTD_rotateRight_U64((U64)matches, head);
1107
+ }
1108
+ }
1109
+ #endif
1247
1110
  }
1248
1111
 
1249
1112
  /* The high-level approach of the SIMD row based match finder is as follows:
@@ -1262,7 +1125,7 @@ ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, con
1262
1125
  * - Pick the longest match.
1263
1126
  */
1264
1127
  FORCE_INLINE_TEMPLATE
1265
- size_t ZSTD_RowFindBestMatch_generic (
1128
+ size_t ZSTD_RowFindBestMatch(
1266
1129
  ZSTD_matchState_t* ms,
1267
1130
  const BYTE* const ip, const BYTE* const iLimit,
1268
1131
  size_t* offsetPtr,
@@ -1293,11 +1156,13 @@ size_t ZSTD_RowFindBestMatch_generic (
1293
1156
 
1294
1157
  /* DMS/DDS variables that may be referenced later */
1295
1158
  const ZSTD_matchState_t* const dms = ms->dictMatchState;
1296
- size_t ddsIdx;
1297
- U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
1298
- U32 dmsTag;
1299
- U32* dmsRow;
1300
- BYTE* dmsTagRow;
1159
+
1160
+ /* Initialize the following variables to satisfy static analyzer */
1161
+ size_t ddsIdx = 0;
1162
+ U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
1163
+ U32 dmsTag = 0;
1164
+ U32* dmsRow = NULL;
1165
+ BYTE* dmsTagRow = NULL;
1301
1166
 
1302
1167
  if (dictMode == ZSTD_dedicatedDictSearch) {
1303
1168
  const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
@@ -1329,7 +1194,7 @@ size_t ZSTD_RowFindBestMatch_generic (
1329
1194
  U32* const row = hashTable + relRow;
1330
1195
  BYTE* tagRow = (BYTE*)(tagTable + relRow);
1331
1196
  U32 const head = *tagRow & rowMask;
1332
- U32 matchBuffer[32 /* maximum nb entries per row */];
1197
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1333
1198
  size_t numMatches = 0;
1334
1199
  size_t currMatch = 0;
1335
1200
  ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
@@ -1385,6 +1250,7 @@ size_t ZSTD_RowFindBestMatch_generic (
1385
1250
  }
1386
1251
  }
1387
1252
 
1253
+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
1388
1254
  if (dictMode == ZSTD_dedicatedDictSearch) {
1389
1255
  ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
1390
1256
  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
@@ -1397,7 +1263,7 @@ size_t ZSTD_RowFindBestMatch_generic (
1397
1263
  const U32 dmsIndexDelta = dictLimit - dmsSize;
1398
1264
 
1399
1265
  { U32 const head = *dmsTagRow & rowMask;
1400
- U32 matchBuffer[32 /* maximum nb row entries */];
1266
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1401
1267
  size_t numMatches = 0;
1402
1268
  size_t currMatch = 0;
1403
1269
  ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
@@ -1435,84 +1301,175 @@ size_t ZSTD_RowFindBestMatch_generic (
1435
1301
  return ml;
1436
1302
  }
1437
1303
 
1438
- /* Inlining is important to hardwire a hot branch (template emulation) */
1439
- FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS (
1440
- ZSTD_matchState_t* ms,
1441
- const BYTE* ip, const BYTE* const iLimit,
1442
- const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog)
1443
- {
1444
- switch(ms->cParams.minMatch)
1445
- {
1446
- default : /* includes case 3 */
1447
- case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog);
1448
- case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog);
1449
- case 7 :
1450
- case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog);
1451
- }
1452
- }
1453
1304
 
1454
- FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog (
1455
- ZSTD_matchState_t* ms,
1456
- const BYTE* ip, const BYTE* const iLimit,
1457
- size_t* offsetPtr)
1458
- {
1459
- const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1460
- switch(cappedSearchLog)
1461
- {
1462
- default :
1463
- case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4);
1464
- case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5);
1305
+ typedef size_t (*searchMax_f)(
1306
+ ZSTD_matchState_t* ms,
1307
+ const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1308
+
1309
+ /**
1310
+ * This struct contains the functions necessary for lazy to search.
1311
+ * Currently, that is only searchMax. However, it is still valuable to have the
1312
+ * VTable because this makes it easier to add more functions to the VTable later.
1313
+ *
1314
+ * TODO: The start of the search function involves loading and calculating a
1315
+ * bunch of constants from the ZSTD_matchState_t. These computations could be
1316
+ * done in an initialization function, and saved somewhere in the match state.
1317
+ * Then we could pass a pointer to the saved state instead of the match state,
1318
+ * and avoid duplicate computations.
1319
+ *
1320
+ * TODO: Move the match re-winding into searchMax. This improves compression
1321
+ * ratio, and unlocks further simplifications with the next TODO.
1322
+ *
1323
+ * TODO: Try moving the repcode search into searchMax. After the re-winding
1324
+ * and repcode search are in searchMax, there is no more logic in the match
1325
+ * finder loop that requires knowledge about the dictMode. So we should be
1326
+ * able to avoid force inlining it, and we can join the extDict loop with
1327
+ * the single segment loop. It should go in searchMax instead of its own
1328
+ * function to avoid having multiple virtual function calls per search.
1329
+ */
1330
+ typedef struct {
1331
+ searchMax_f searchMax;
1332
+ } ZSTD_LazyVTable;
1333
+
1334
/* GEN_ZSTD_BT_VTABLE(dictMode, mls):
 * Defines ZSTD_BtFindBestMatch_<dictMode>_<mls>(), a searchMax_f wrapper that
 * forwards to ZSTD_BtFindBestMatch() with `mls` and ZSTD_<dictMode> as literal
 * arguments, and the ZSTD_BtVTable_<dictMode>_<mls> table that refers to it. */
#define GEN_ZSTD_BT_VTABLE(dictMode, mls)                                             \
    static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls(                            \
            ZSTD_matchState_t* ms,                                                    \
            const BYTE* ip, const BYTE* const iLimit,                                 \
            size_t* offsetPtr)                                                        \
    {                                                                                 \
        assert(BOUNDED(4, ms->cParams.minMatch, 6) == mls);                           \
        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
    }                                                                                 \
    static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = {                 \
        ZSTD_BtFindBestMatch_##dictMode##_##mls                                       \
    };

/* GEN_ZSTD_HC_VTABLE(dictMode, mls):
 * Same as GEN_ZSTD_BT_VTABLE, but the wrapper ZSTD_HcFindBestMatch_<dictMode>_<mls>()
 * forwards to ZSTD_HcFindBestMatch() and the table is ZSTD_HcVTable_<dictMode>_<mls>. */
#define GEN_ZSTD_HC_VTABLE(dictMode, mls)                                             \
    static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls(                            \
            ZSTD_matchState_t* ms,                                                    \
            const BYTE* ip, const BYTE* const iLimit,                                 \
            size_t* offsetPtr)                                                        \
    {                                                                                 \
        assert(BOUNDED(4, ms->cParams.minMatch, 6) == mls);                           \
        return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
    }                                                                                 \
    static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = {                 \
        ZSTD_HcFindBestMatch_##dictMode##_##mls                                       \
    };

/* GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog):
 * Defines ZSTD_RowFindBestMatch_<dictMode>_<mls>_<rowLog>(), a searchMax_f wrapper
 * that forwards to ZSTD_RowFindBestMatch() with `mls`, ZSTD_<dictMode> and `rowLog`
 * as literal arguments, and the ZSTD_RowVTable_<dictMode>_<mls>_<rowLog> table
 * that refers to it. */
#define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog)                                             \
    static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog(                         \
            ZSTD_matchState_t* ms,                                                             \
            const BYTE* ip, const BYTE* const iLimit,                                          \
            size_t* offsetPtr)                                                                 \
    {                                                                                          \
        assert(BOUNDED(4, ms->cParams.minMatch, 6) == mls);                                    \
        assert(BOUNDED(4, ms->cParams.searchLog, 6) == rowLog);                                \
        return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
    }                                                                                          \
    static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = {              \
        ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog                                    \
    };
1373
+
1374
+ #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
1375
+ X(dictMode, mls, 4) \
1376
+ X(dictMode, mls, 5) \
1377
+ X(dictMode, mls, 6)
1378
+
1379
+ #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
1380
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \
1381
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \
1382
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)
1383
+
1384
+ #define ZSTD_FOR_EACH_MLS(X, dictMode) \
1385
+ X(dictMode, 4) \
1386
+ X(dictMode, 5) \
1387
+ X(dictMode, 6)
1388
+
1389
+ #define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
1390
+ X(__VA_ARGS__, noDict) \
1391
+ X(__VA_ARGS__, extDict) \
1392
+ X(__VA_ARGS__, dictMatchState) \
1393
+ X(__VA_ARGS__, dedicatedDictSearch)
1394
+
1395
+ /* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
1396
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
1397
+ /* Generate Binary Tree VTables for each combination of (dictMode, mls) */
1398
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
1399
+ /* Generate Hash Chain VTables for each combination of (dictMode, mls) */
1400
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
1401
+
1402
+ #define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
1403
+ { \
1404
+ &ZSTD_BtVTable_##dictMode##_4, \
1405
+ &ZSTD_BtVTable_##dictMode##_5, \
1406
+ &ZSTD_BtVTable_##dictMode##_6 \
1465
1407
  }
1466
- }
1467
1408
 
1468
- FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog(
1469
- ZSTD_matchState_t* ms,
1470
- const BYTE* ip, const BYTE* const iLimit,
1471
- size_t* offsetPtr)
1472
- {
1473
- const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1474
- switch(cappedSearchLog)
1475
- {
1476
- default :
1477
- case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4);
1478
- case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5);
1409
+ #define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
1410
+ { \
1411
+ &ZSTD_HcVTable_##dictMode##_4, \
1412
+ &ZSTD_HcVTable_##dictMode##_5, \
1413
+ &ZSTD_HcVTable_##dictMode##_6 \
1479
1414
  }
1480
- }
1481
1415
 
1482
- FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog(
1483
- ZSTD_matchState_t* ms,
1484
- const BYTE* ip, const BYTE* const iLimit,
1485
- size_t* offsetPtr)
1486
- {
1487
- const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1488
- switch(cappedSearchLog)
1489
- {
1490
- default :
1491
- case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4);
1492
- case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5);
1416
+ #define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
1417
+ { \
1418
+ &ZSTD_RowVTable_##dictMode##_##mls##_4, \
1419
+ &ZSTD_RowVTable_##dictMode##_##mls##_5, \
1420
+ &ZSTD_RowVTable_##dictMode##_##mls##_6 \
1493
1421
  }
1494
- }
1495
1422
 
1496
- FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog (
1497
- ZSTD_matchState_t* ms,
1498
- const BYTE* ip, const BYTE* const iLimit,
1499
- size_t* offsetPtr)
1500
- {
1501
- const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1502
- switch(cappedSearchLog)
1503
- {
1504
- default :
1505
- case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4);
1506
- case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5);
1423
+ #define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
1424
+ { \
1425
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
1426
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
1427
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
1507
1428
  }
1508
- }
1509
1429
 
1430
+ #define GEN_ZSTD_VTABLE_ARRAY(X) \
1431
+ { \
1432
+ X(noDict), \
1433
+ X(extDict), \
1434
+ X(dictMatchState), \
1435
+ X(dedicatedDictSearch) \
1436
+ }
1510
1437
 
1511
1438
  /* *******************************
1512
1439
  * Common parser - lazy strategy
1513
1440
  *********************************/
1514
1441
  typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1515
1442
 
1443
+ /**
1444
+ * This table is indexed first by the four ZSTD_dictMode_e values, and then
1445
+ * by the two searchMethod_e values. NULLs are placed for configurations
1446
+ * that should never occur (extDict modes go to the other implementation
1447
+ * below and there is no DDSS for binary tree search yet).
1448
+ */
1449
+
1450
+ static ZSTD_LazyVTable const*
1451
+ ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
1452
+ {
1453
+ /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
1454
+ ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
1455
+ ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
1456
+ /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
1457
+ ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
1458
+
1459
+ U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
1460
+ U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1461
+ switch (searchMethod) {
1462
+ case search_hashChain:
1463
+ return hcVTables[dictMode][mls - 4];
1464
+ case search_binaryTree:
1465
+ return btVTables[dictMode][mls - 4];
1466
+ case search_rowHash:
1467
+ return rowVTables[dictMode][mls - 4][rowLog - 4];
1468
+ default:
1469
+ return NULL;
1470
+ }
1471
+ }
1472
+
1516
1473
  FORCE_INLINE_TEMPLATE size_t
1517
1474
  ZSTD_compressBlock_lazy_generic(
1518
1475
  ZSTD_matchState_t* ms, seqStore_t* seqStore,
@@ -1525,46 +1482,12 @@ ZSTD_compressBlock_lazy_generic(
1525
1482
  const BYTE* ip = istart;
1526
1483
  const BYTE* anchor = istart;
1527
1484
  const BYTE* const iend = istart + srcSize;
1528
- const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
1485
+ const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
1529
1486
  const BYTE* const base = ms->window.base;
1530
1487
  const U32 prefixLowestIndex = ms->window.dictLimit;
1531
1488
  const BYTE* const prefixLowest = base + prefixLowestIndex;
1532
- const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1533
1489
 
1534
- typedef size_t (*searchMax_f)(
1535
- ZSTD_matchState_t* ms,
1536
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1537
-
1538
- /**
1539
- * This table is indexed first by the four ZSTD_dictMode_e values, and then
1540
- * by the two searchMethod_e values. NULLs are placed for configurations
1541
- * that should never occur (extDict modes go to the other implementation
1542
- * below and there is no DDSS for binary tree search yet).
1543
- */
1544
- const searchMax_f searchFuncs[4][3] = {
1545
- {
1546
- ZSTD_HcFindBestMatch_selectMLS,
1547
- ZSTD_BtFindBestMatch_selectMLS,
1548
- ZSTD_RowFindBestMatch_selectRowLog
1549
- },
1550
- {
1551
- NULL,
1552
- NULL,
1553
- NULL
1554
- },
1555
- {
1556
- ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
1557
- ZSTD_BtFindBestMatch_dictMatchState_selectMLS,
1558
- ZSTD_RowFindBestMatch_dictMatchState_selectRowLog
1559
- },
1560
- {
1561
- ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
1562
- NULL,
1563
- ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog
1564
- }
1565
- };
1566
-
1567
- searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod];
1490
+ searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
1568
1491
  U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
1569
1492
 
1570
1493
  const int isDMS = dictMode == ZSTD_dictMatchState;
@@ -1599,6 +1522,7 @@ ZSTD_compressBlock_lazy_generic(
1599
1522
  }
1600
1523
 
1601
1524
  if (searchMethod == search_rowHash) {
1525
+ const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1602
1526
  ZSTD_row_fillHashCache(ms, base, rowLog,
1603
1527
  MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1604
1528
  ms->nextToUpdate, ilimit);
@@ -1734,7 +1658,7 @@ ZSTD_compressBlock_lazy_generic(
1734
1658
  { start--; matchLength++; }
1735
1659
  }
1736
1660
  if (isDxS) {
1737
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
1661
+ U32 const matchIndex = (U32)((size_t)(start-base) - (offset - ZSTD_REP_MOVE));
1738
1662
  const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
1739
1663
  const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
1740
1664
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
@@ -1743,7 +1667,7 @@ ZSTD_compressBlock_lazy_generic(
1743
1667
  }
1744
1668
  /* store sequence */
1745
1669
  _storeSequence:
1746
- { size_t const litLength = start - anchor;
1670
+ { size_t const litLength = (size_t)(start - anchor);
1747
1671
  ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
1748
1672
  anchor = ip = start + matchLength;
1749
1673
  }
@@ -1955,15 +1879,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1955
1879
  const U32 windowLog = ms->cParams.windowLog;
1956
1880
  const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1957
1881
 
1958
- typedef size_t (*searchMax_f)(
1959
- ZSTD_matchState_t* ms,
1960
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1961
- const searchMax_f searchFuncs[3] = {
1962
- ZSTD_HcFindBestMatch_extDict_selectMLS,
1963
- ZSTD_BtFindBestMatch_extDict_selectMLS,
1964
- ZSTD_RowFindBestMatch_extDict_selectRowLog
1965
- };
1966
- searchMax_f searchMax = searchFuncs[(int)searchMethod];
1882
+ searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
1967
1883
  U32 offset_1 = rep[0], offset_2 = rep[1];
1968
1884
 
1969
1885
  DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
@@ -1995,7 +1911,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1995
1911
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1996
1912
  const BYTE* const repMatch = repBase + repIndex;
1997
1913
  if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
1998
- & (offset_1 < curr+1 - windowLow) ) /* note: we are searching at curr+1 */
1914
+ & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
1999
1915
  if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
2000
1916
  /* repcode detected we should take it */
2001
1917
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -2010,7 +1926,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
2010
1926
  matchLength = ml2, start = ip, offset=offsetFound;
2011
1927
  }
2012
1928
 
2013
- if (matchLength < 4) {
1929
+ if (matchLength < 4) {
2014
1930
  ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
2015
1931
  continue;
2016
1932
  }
@@ -2027,7 +1943,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
2027
1943
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
2028
1944
  const BYTE* const repMatch = repBase + repIndex;
2029
1945
  if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2030
- & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
1946
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2031
1947
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
2032
1948
  /* repcode detected */
2033
1949
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -2059,7 +1975,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
2059
1975
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
2060
1976
  const BYTE* const repMatch = repBase + repIndex;
2061
1977
  if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2062
- & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
1978
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2063
1979
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
2064
1980
  /* repcode detected */
2065
1981
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -2084,7 +2000,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
2084
2000
 
2085
2001
  /* catch up */
2086
2002
  if (offset) {
2087
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
2003
+ U32 const matchIndex = (U32)((size_t)(start-base) - (offset - ZSTD_REP_MOVE));
2088
2004
  const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
2089
2005
  const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
2090
2006
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
@@ -2093,7 +2009,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
2093
2009
 
2094
2010
  /* store sequence */
2095
2011
  _storeSequence:
2096
- { size_t const litLength = start - anchor;
2012
+ { size_t const litLength = (size_t)(start - anchor);
2097
2013
  ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
2098
2014
  anchor = ip = start + matchLength;
2099
2015
  }
@@ -2106,7 +2022,7 @@ _storeSequence:
2106
2022
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
2107
2023
  const BYTE* const repMatch = repBase + repIndex;
2108
2024
  if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2109
- & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2025
+ & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2110
2026
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
2111
2027
  /* repcode detected we should take it */
2112
2028
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;