zstd-ruby 1.5.0.0 → 1.5.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +2 -2
  3. data/README.md +1 -1
  4. data/ext/zstdruby/extconf.rb +2 -1
  5. data/ext/zstdruby/libzstd/Makefile +50 -175
  6. data/ext/zstdruby/libzstd/README.md +7 -1
  7. data/ext/zstdruby/libzstd/common/bitstream.h +24 -9
  8. data/ext/zstdruby/libzstd/common/compiler.h +89 -43
  9. data/ext/zstdruby/libzstd/common/entropy_common.c +11 -5
  10. data/ext/zstdruby/libzstd/common/error_private.h +79 -0
  11. data/ext/zstdruby/libzstd/common/fse.h +2 -1
  12. data/ext/zstdruby/libzstd/common/fse_decompress.c +1 -1
  13. data/ext/zstdruby/libzstd/common/huf.h +24 -22
  14. data/ext/zstdruby/libzstd/common/mem.h +18 -0
  15. data/ext/zstdruby/libzstd/common/pool.c +11 -6
  16. data/ext/zstdruby/libzstd/common/pool.h +2 -2
  17. data/ext/zstdruby/libzstd/common/portability_macros.h +137 -0
  18. data/ext/zstdruby/libzstd/common/xxhash.c +5 -805
  19. data/ext/zstdruby/libzstd/common/xxhash.h +5568 -167
  20. data/ext/zstdruby/libzstd/common/zstd_internal.h +95 -92
  21. data/ext/zstdruby/libzstd/common/zstd_trace.h +12 -3
  22. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  23. data/ext/zstdruby/libzstd/compress/fse_compress.c +63 -27
  24. data/ext/zstdruby/libzstd/compress/huf_compress.c +537 -104
  25. data/ext/zstdruby/libzstd/compress/zstd_compress.c +307 -373
  26. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +174 -83
  27. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +4 -3
  28. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +3 -1
  29. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +15 -14
  30. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +4 -3
  31. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +41 -27
  32. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +295 -120
  33. data/ext/zstdruby/libzstd/compress/zstd_fast.c +309 -130
  34. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +482 -562
  35. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +9 -7
  36. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  37. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +4 -1
  38. data/ext/zstdruby/libzstd/compress/zstd_opt.c +249 -148
  39. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +76 -38
  40. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +4 -1
  41. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +727 -189
  42. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +585 -0
  43. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +85 -22
  44. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +744 -220
  45. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +8 -2
  46. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +34 -3
  47. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +23 -3
  48. data/ext/zstdruby/libzstd/dictBuilder/cover.c +9 -2
  49. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +11 -4
  50. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +101 -30
  51. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +2 -6
  52. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +3 -7
  53. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +3 -7
  54. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +3 -7
  55. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +3 -7
  56. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +3 -7
  57. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +3 -7
  58. data/ext/zstdruby/libzstd/libzstd.mk +203 -0
  59. data/ext/zstdruby/libzstd/libzstd.pc.in +1 -0
  60. data/ext/zstdruby/libzstd/module.modulemap +25 -0
  61. data/ext/zstdruby/libzstd/zdict.h +4 -4
  62. data/ext/zstdruby/libzstd/zstd.h +179 -136
  63. data/ext/zstdruby/zstdruby.c +2 -2
  64. data/lib/zstd-ruby/version.rb +1 -1
  65. metadata +11 -6
@@ -61,7 +61,7 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
61
61
  * assumption : curr >= btlow == (curr - btmask)
62
62
  * doesn't fail */
63
63
  static void
64
- ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
64
+ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
65
65
  U32 curr, const BYTE* inputEnd,
66
66
  U32 nbCompares, U32 btLow,
67
67
  const ZSTD_dictMode_e dictMode)
@@ -93,7 +93,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
93
93
  assert(curr >= btLow);
94
94
  assert(ip < iend); /* condition for ZSTD_count */
95
95
 
96
- while (nbCompares-- && (matchIndex > windowLow)) {
96
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
97
97
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
98
98
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
99
99
  assert(matchIndex < curr);
@@ -151,7 +151,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
151
151
 
152
152
  static size_t
153
153
  ZSTD_DUBT_findBetterDictMatch (
154
- ZSTD_matchState_t* ms,
154
+ const ZSTD_matchState_t* ms,
155
155
  const BYTE* const ip, const BYTE* const iend,
156
156
  size_t* offsetPtr,
157
157
  size_t bestLength,
@@ -185,7 +185,7 @@ ZSTD_DUBT_findBetterDictMatch (
185
185
  (void)dictMode;
186
186
  assert(dictMode == ZSTD_dictMatchState);
187
187
 
188
- while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
188
+ for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
189
189
  U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
190
190
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
191
191
  const BYTE* match = dictBase + dictMatchIndex;
@@ -197,8 +197,8 @@ ZSTD_DUBT_findBetterDictMatch (
197
197
  U32 matchIndex = dictMatchIndex + dictIndexDelta;
198
198
  if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
199
199
  DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
200
- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
201
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
200
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex);
201
+ bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
202
202
  }
203
203
  if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
204
204
  break; /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,7 +218,7 @@ ZSTD_DUBT_findBetterDictMatch (
218
218
  }
219
219
 
220
220
  if (bestLength >= MINMATCH) {
221
- U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
221
+ U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
222
222
  DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
223
223
  curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
224
224
  }
@@ -309,7 +309,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
309
309
  matchIndex = hashTable[h];
310
310
  hashTable[h] = curr; /* Update Hash Table */
311
311
 
312
- while (nbCompares-- && (matchIndex > windowLow)) {
312
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
313
313
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
314
314
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
315
315
  const BYTE* match;
@@ -328,7 +328,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
328
328
  if (matchLength > matchEndIdx - matchIndex)
329
329
  matchEndIdx = matchIndex + (U32)matchLength;
330
330
  if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
331
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
331
+ bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
332
332
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
333
333
  if (dictMode == ZSTD_dictMatchState) {
334
334
  nbCompares = 0; /* in addition to avoiding checking any
@@ -357,6 +357,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
357
357
 
358
358
  *smallerPtr = *largerPtr = 0;
359
359
 
360
+ assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
360
361
  if (dictMode == ZSTD_dictMatchState && nbCompares) {
361
362
  bestLength = ZSTD_DUBT_findBetterDictMatch(
362
363
  ms, ip, iend,
@@ -367,7 +368,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
367
368
  assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
368
369
  ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
369
370
  if (bestLength >= MINMATCH) {
370
- U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
371
+ U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
371
372
  DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
372
373
  curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
373
374
  }
@@ -390,54 +391,6 @@ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
390
391
  return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
391
392
  }
392
393
 
393
-
394
- static size_t
395
- ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms,
396
- const BYTE* ip, const BYTE* const iLimit,
397
- size_t* offsetPtr)
398
- {
399
- switch(ms->cParams.minMatch)
400
- {
401
- default : /* includes case 3 */
402
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
403
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
404
- case 7 :
405
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
406
- }
407
- }
408
-
409
-
410
- static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
411
- ZSTD_matchState_t* ms,
412
- const BYTE* ip, const BYTE* const iLimit,
413
- size_t* offsetPtr)
414
- {
415
- switch(ms->cParams.minMatch)
416
- {
417
- default : /* includes case 3 */
418
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
419
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
420
- case 7 :
421
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
422
- }
423
- }
424
-
425
-
426
- static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
427
- ZSTD_matchState_t* ms,
428
- const BYTE* ip, const BYTE* const iLimit,
429
- size_t* offsetPtr)
430
- {
431
- switch(ms->cParams.minMatch)
432
- {
433
- default : /* includes case 3 */
434
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
435
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
436
- case 7 :
437
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
438
- }
439
- }
440
-
441
394
  /***********************************
442
395
  * Dedicated dict search
443
396
  ***********************************/
@@ -450,7 +403,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
450
403
  U32* const chainTable = ms->chainTable;
451
404
  U32 const chainSize = 1 << ms->cParams.chainLog;
452
405
  U32 idx = ms->nextToUpdate;
453
- U32 const minChain = chainSize < target ? target - chainSize : idx;
406
+ U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
454
407
  U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
455
408
  U32 const cacheSize = bucketSize - 1;
456
409
  U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
@@ -464,7 +417,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
464
417
  U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
465
418
  U32* const tmpHashTable = hashTable;
466
419
  U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
467
- U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
420
+ U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
468
421
  U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
469
422
  U32 hashIdx;
470
423
 
@@ -608,7 +561,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
608
561
  /* save best solution */
609
562
  if (currentMl > ml) {
610
563
  ml = currentMl;
611
- *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
564
+ *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
612
565
  if (ip+currentMl == iLimit) {
613
566
  /* best possible, avoids read overflow on next attempt */
614
567
  return ml;
@@ -645,7 +598,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
645
598
  /* save best solution */
646
599
  if (currentMl > ml) {
647
600
  ml = currentMl;
648
- *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
601
+ *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
649
602
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
650
603
  }
651
604
  }
@@ -692,7 +645,7 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
692
645
 
693
646
  /* inlining is important to hardwire a hot branch (template emulation) */
694
647
  FORCE_INLINE_TEMPLATE
695
- size_t ZSTD_HcFindBestMatch_generic (
648
+ size_t ZSTD_HcFindBestMatch(
696
649
  ZSTD_matchState_t* ms,
697
650
  const BYTE* const ip, const BYTE* const iLimit,
698
651
  size_t* offsetPtr,
@@ -750,7 +703,7 @@ size_t ZSTD_HcFindBestMatch_generic (
750
703
  /* save best solution */
751
704
  if (currentMl > ml) {
752
705
  ml = currentMl;
753
- *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
706
+ *offsetPtr = STORE_OFFSET(curr - matchIndex);
754
707
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
755
708
  }
756
709
 
@@ -758,6 +711,7 @@ size_t ZSTD_HcFindBestMatch_generic (
758
711
  matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
759
712
  }
760
713
 
714
+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
761
715
  if (dictMode == ZSTD_dedicatedDictSearch) {
762
716
  ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
763
717
  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
@@ -784,7 +738,8 @@ size_t ZSTD_HcFindBestMatch_generic (
784
738
  /* save best solution */
785
739
  if (currentMl > ml) {
786
740
  ml = currentMl;
787
- *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
741
+ assert(curr > matchIndex + dmsIndexDelta);
742
+ *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
788
743
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
789
744
  }
790
745
 
@@ -797,310 +752,80 @@ size_t ZSTD_HcFindBestMatch_generic (
797
752
  return ml;
798
753
  }
799
754
 
800
-
801
- FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
802
- ZSTD_matchState_t* ms,
803
- const BYTE* ip, const BYTE* const iLimit,
804
- size_t* offsetPtr)
805
- {
806
- switch(ms->cParams.minMatch)
807
- {
808
- default : /* includes case 3 */
809
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
810
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
811
- case 7 :
812
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
813
- }
814
- }
815
-
816
-
817
- static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
818
- ZSTD_matchState_t* ms,
819
- const BYTE* ip, const BYTE* const iLimit,
820
- size_t* offsetPtr)
821
- {
822
- switch(ms->cParams.minMatch)
823
- {
824
- default : /* includes case 3 */
825
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
826
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
827
- case 7 :
828
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
829
- }
830
- }
831
-
832
-
833
- static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS (
834
- ZSTD_matchState_t* ms,
835
- const BYTE* ip, const BYTE* const iLimit,
836
- size_t* offsetPtr)
837
- {
838
- switch(ms->cParams.minMatch)
839
- {
840
- default : /* includes case 3 */
841
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch);
842
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch);
843
- case 7 :
844
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch);
845
- }
846
- }
847
-
848
-
849
- FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
850
- ZSTD_matchState_t* ms,
851
- const BYTE* ip, const BYTE* const iLimit,
852
- size_t* offsetPtr)
853
- {
854
- switch(ms->cParams.minMatch)
855
- {
856
- default : /* includes case 3 */
857
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
858
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
859
- case 7 :
860
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
861
- }
862
- }
863
-
864
755
  /* *********************************
865
756
  * (SIMD) Row-based matchfinder
866
757
  ***********************************/
867
758
  /* Constants for row-based hash */
868
- #define ZSTD_ROW_HASH_TAG_OFFSET 1 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
869
- #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
759
+ #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
760
+ #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
870
761
  #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
762
+ #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
871
763
 
872
764
  #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
873
765
 
874
- typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */
875
-
876
- #if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */
877
-
878
- #include <emmintrin.h>
879
- typedef __m128i ZSTD_Vec128;
880
-
881
- /* Returns a 128-bit container with 128-bits from src */
882
- static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
883
- return _mm_loadu_si128((ZSTD_Vec128 const*)src);
884
- }
885
-
886
- /* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */
887
- static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
888
- return _mm_set1_epi8((char)val);
889
- }
890
-
891
- /* Do byte-by-byte comparison result of x and y. Then collapse 128-bit resultant mask
892
- * into a 32-bit mask that is the MSB of each byte.
893
- * */
894
- static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
895
- return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
896
- }
897
-
898
- typedef struct {
899
- __m128i fst;
900
- __m128i snd;
901
- } ZSTD_Vec256;
902
-
903
- static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
904
- ZSTD_Vec256 v;
905
- v.fst = ZSTD_Vec128_read(ptr);
906
- v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
907
- return v;
908
- }
909
-
910
- static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
911
- ZSTD_Vec256 v;
912
- v.fst = ZSTD_Vec128_set8(val);
913
- v.snd = ZSTD_Vec128_set8(val);
914
- return v;
915
- }
916
-
917
- static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
918
- ZSTD_VecMask fstMask;
919
- ZSTD_VecMask sndMask;
920
- fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
921
- sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
922
- return fstMask | (sndMask << 16);
923
- }
924
-
925
- #elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */
926
-
927
- #include <arm_neon.h>
928
- typedef uint8x16_t ZSTD_Vec128;
929
-
930
- static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
931
- return vld1q_u8((const BYTE* const)src);
932
- }
933
-
934
- static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
935
- return vdupq_n_u8(val);
936
- }
937
-
938
- /* Mimics '_mm_movemask_epi8()' from SSE */
939
- static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) {
940
- /* Shift out everything but the MSB bits in each byte */
941
- uint16x8_t highBits = vreinterpretq_u16_u8(vshrq_n_u8(val, 7));
942
- /* Merge the even lanes together with vsra (right shift and add) */
943
- uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(highBits, highBits, 7));
944
- uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
945
- uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
946
- /* Extract the low 8 bits from each lane, merge */
947
- return vgetq_lane_u8(paired64, 0) | ((U32)vgetq_lane_u8(paired64, 8) << 8);
948
- }
949
-
950
- static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
951
- return (ZSTD_VecMask)ZSTD_vmovmaskq_u8(vceqq_u8(x, y));
952
- }
953
-
954
- typedef struct {
955
- uint8x16_t fst;
956
- uint8x16_t snd;
957
- } ZSTD_Vec256;
958
-
959
- static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
960
- ZSTD_Vec256 v;
961
- v.fst = ZSTD_Vec128_read(ptr);
962
- v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
963
- return v;
964
- }
965
-
966
- static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
967
- ZSTD_Vec256 v;
968
- v.fst = ZSTD_Vec128_set8(val);
969
- v.snd = ZSTD_Vec128_set8(val);
970
- return v;
971
- }
972
-
973
- static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
974
- ZSTD_VecMask fstMask;
975
- ZSTD_VecMask sndMask;
976
- fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
977
- sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
978
- return fstMask | (sndMask << 16);
979
- }
980
-
981
- #else /* Scalar fallback version */
982
-
983
- #define VEC128_NB_SIZE_T (16 / sizeof(size_t))
984
- typedef struct {
985
- size_t vec[VEC128_NB_SIZE_T];
986
- } ZSTD_Vec128;
987
-
988
- static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
989
- ZSTD_Vec128 ret;
990
- ZSTD_memcpy(ret.vec, src, VEC128_NB_SIZE_T*sizeof(size_t));
991
- return ret;
992
- }
993
-
994
- static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
995
- ZSTD_Vec128 ret = { {0} };
996
- int startBit = sizeof(size_t) * 8 - 8;
997
- for (;startBit >= 0; startBit -= 8) {
998
- unsigned j = 0;
999
- for (;j < VEC128_NB_SIZE_T; ++j) {
1000
- ret.vec[j] |= ((size_t)val << startBit);
1001
- }
1002
- }
1003
- return ret;
1004
- }
1005
-
1006
- /* Compare x to y, byte by byte, generating a "matches" bitfield */
1007
- static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
1008
- ZSTD_VecMask res = 0;
1009
- unsigned i = 0;
1010
- unsigned l = 0;
1011
- for (; i < VEC128_NB_SIZE_T; ++i) {
1012
- const size_t cmp1 = x.vec[i];
1013
- const size_t cmp2 = y.vec[i];
1014
- unsigned j = 0;
1015
- for (; j < sizeof(size_t); ++j, ++l) {
1016
- if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
1017
- res |= ((U32)1 << (j+i*sizeof(size_t)));
1018
- }
1019
- }
1020
- }
1021
- return res;
1022
- }
1023
-
1024
- #define VEC256_NB_SIZE_T 2*VEC128_NB_SIZE_T
1025
- typedef struct {
1026
- size_t vec[VEC256_NB_SIZE_T];
1027
- } ZSTD_Vec256;
1028
-
1029
- static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) {
1030
- ZSTD_Vec256 ret;
1031
- ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t));
1032
- return ret;
1033
- }
1034
-
1035
- static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
1036
- ZSTD_Vec256 ret = { {0} };
1037
- int startBit = sizeof(size_t) * 8 - 8;
1038
- for (;startBit >= 0; startBit -= 8) {
1039
- unsigned j = 0;
1040
- for (;j < VEC256_NB_SIZE_T; ++j) {
1041
- ret.vec[j] |= ((size_t)val << startBit);
1042
- }
1043
- }
1044
- return ret;
1045
- }
1046
-
1047
- /* Compare x to y, byte by byte, generating a "matches" bitfield */
1048
- static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
1049
- ZSTD_VecMask res = 0;
1050
- unsigned i = 0;
1051
- unsigned l = 0;
1052
- for (; i < VEC256_NB_SIZE_T; ++i) {
1053
- const size_t cmp1 = x.vec[i];
1054
- const size_t cmp2 = y.vec[i];
1055
- unsigned j = 0;
1056
- for (; j < sizeof(size_t); ++j, ++l) {
1057
- if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
1058
- res |= ((U32)1 << (j+i*sizeof(size_t)));
1059
- }
1060
- }
1061
- }
1062
- return res;
1063
- }
1064
-
1065
- #endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */
766
+ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */
1066
767
 
1067
768
  /* ZSTD_VecMask_next():
1068
769
  * Starting from the LSB, returns the idx of the next non-zero bit.
1069
770
  * Basically counting the nb of trailing zeroes.
1070
771
  */
1071
772
  static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
1072
- # if defined(_MSC_VER) /* Visual */
1073
- unsigned long r=0;
1074
- return _BitScanForward(&r, val) ? (U32)r : 0;
1075
- # elif defined(__GNUC__) && (__GNUC__ >= 3)
1076
- return (U32)__builtin_ctz(val);
773
+ assert(val != 0);
774
+ # if defined(_MSC_VER) && defined(_WIN64)
775
+ if (val != 0) {
776
+ unsigned long r;
777
+ _BitScanForward64(&r, val);
778
+ return (U32)(r);
779
+ } else {
780
+ /* Should not reach this code path */
781
+ __assume(0);
782
+ }
783
+ # elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
784
+ if (sizeof(size_t) == 4) {
785
+ U32 mostSignificantWord = (U32)(val >> 32);
786
+ U32 leastSignificantWord = (U32)val;
787
+ if (leastSignificantWord == 0) {
788
+ return 32 + (U32)__builtin_ctz(mostSignificantWord);
789
+ } else {
790
+ return (U32)__builtin_ctz(leastSignificantWord);
791
+ }
792
+ } else {
793
+ return (U32)__builtin_ctzll(val);
794
+ }
1077
795
  # else
1078
- /* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */
1079
- static const U32 multiplyDeBruijnBitPosition[32] =
1080
- {
1081
- 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
1082
- 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
1083
- };
1084
- return multiplyDeBruijnBitPosition[((U32)((v & -(int)v) * 0x077CB531U)) >> 27];
796
+ /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
797
+ * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
798
+ */
799
+ val = ~val & (val - 1ULL); /* Lowest set bit mask */
800
+ val = val - ((val >> 1) & 0x5555555555555555);
801
+ val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
802
+ return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
1085
803
  # endif
1086
804
  }
1087
805
 
1088
- /* ZSTD_VecMask_rotateRight():
1089
- * Rotates a bitfield to the right by "rotation" bits.
1090
- * If the rotation is greater than totalBits, the returned mask is 0.
806
+ /* ZSTD_rotateRight_*():
807
+ * Rotates a bitfield to the right by "count" bits.
808
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
1091
809
  */
1092
- FORCE_INLINE_TEMPLATE ZSTD_VecMask
1093
- ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) {
1094
- if (rotation == 0)
1095
- return mask;
1096
- switch (totalBits) {
1097
- default:
1098
- assert(0);
1099
- case 16:
1100
- return (mask >> rotation) | (U16)(mask << (16 - rotation));
1101
- case 32:
1102
- return (mask >> rotation) | (U32)(mask << (32 - rotation));
1103
- }
810
+ FORCE_INLINE_TEMPLATE
811
+ U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
812
+ assert(count < 64);
813
+ count &= 0x3F; /* for fickle pattern recognition */
814
+ return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
815
+ }
816
+
817
+ FORCE_INLINE_TEMPLATE
818
+ U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
819
+ assert(count < 32);
820
+ count &= 0x1F; /* for fickle pattern recognition */
821
+ return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
822
+ }
823
+
824
+ FORCE_INLINE_TEMPLATE
825
+ U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
826
+ assert(count < 16);
827
+ count &= 0x0F; /* for fickle pattern recognition */
828
+ return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
1104
829
  }
1105
830
 
1106
831
  /* ZSTD_row_nextIndex():
@@ -1126,20 +851,24 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
1126
851
  */
1127
852
  FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
1128
853
  PREFETCH_L1(hashTable + relRow);
1129
- if (rowLog == 5) {
854
+ if (rowLog >= 5) {
1130
855
  PREFETCH_L1(hashTable + relRow + 16);
856
+ /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
1131
857
  }
1132
858
  PREFETCH_L1(tagTable + relRow);
1133
- assert(rowLog == 4 || rowLog == 5);
859
+ if (rowLog == 6) {
860
+ PREFETCH_L1(tagTable + relRow + 32);
861
+ }
862
+ assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
1134
863
  assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
1135
- assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */
864
+ assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
1136
865
  }
1137
866
 
1138
867
  /* ZSTD_row_fillHashCache():
1139
868
  * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
1140
869
  * but not beyond iLimit.
1141
870
  */
1142
- static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
871
+ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
1143
872
  U32 const rowLog, U32 const mls,
1144
873
  U32 idx, const BYTE* const iLimit)
1145
874
  {
@@ -1179,35 +908,65 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab
1179
908
  }
1180
909
  }
1181
910
 
1182
- /* ZSTD_row_update_internal():
1183
- * Inserts the byte at ip into the appropriate position in the hash table.
1184
- * Determines the relative row, and the position within the {16, 32} entry row to insert at.
911
+ /* ZSTD_row_update_internalImpl():
912
+ * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
1185
913
  */
1186
- FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
1187
- U32 const mls, U32 const rowLog,
1188
- U32 const rowMask, U32 const useCache)
914
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
915
+ U32 updateStartIdx, U32 const updateEndIdx,
916
+ U32 const mls, U32 const rowLog,
917
+ U32 const rowMask, U32 const useCache)
1189
918
  {
1190
919
  U32* const hashTable = ms->hashTable;
1191
920
  U16* const tagTable = ms->tagTable;
1192
921
  U32 const hashLog = ms->rowHashLog;
1193
922
  const BYTE* const base = ms->window.base;
1194
- const U32 target = (U32)(ip - base);
1195
- U32 idx = ms->nextToUpdate;
1196
923
 
1197
- DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target);
1198
- for (; idx < target; ++idx) {
1199
- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls)
1200
- : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
924
+ DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
925
+ for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
926
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
927
+ : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1201
928
  U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1202
929
  U32* const row = hashTable + relRow;
1203
930
  BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
1204
931
  Explicit cast allows us to get exact desired position within each row */
1205
932
  U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1206
933
 
1207
- assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
934
+ assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
1208
935
  ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
1209
- row[pos] = idx;
936
+ row[pos] = updateStartIdx;
1210
937
  }
938
+ }
939
+
940
+ /* ZSTD_row_update_internal():
941
+ * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
942
+ * Skips sections of long matches as is necessary.
943
+ */
944
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
945
+ U32 const mls, U32 const rowLog,
946
+ U32 const rowMask, U32 const useCache)
947
+ {
948
+ U32 idx = ms->nextToUpdate;
949
+ const BYTE* const base = ms->window.base;
950
+ const U32 target = (U32)(ip - base);
951
+ const U32 kSkipThreshold = 384;
952
+ const U32 kMaxMatchStartPositionsToUpdate = 96;
953
+ const U32 kMaxMatchEndPositionsToUpdate = 32;
954
+
955
+ if (useCache) {
956
+ /* Only skip positions when using hash cache, i.e.
957
+ * if we are loading a dict, don't skip anything.
958
+ * If we decide to skip, then we only update a set number
959
+ * of positions at the beginning and end of the match.
960
+ */
961
+ if (UNLIKELY(target - idx > kSkipThreshold)) {
962
+ U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
963
+ ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
964
+ idx = target - kMaxMatchEndPositionsToUpdate;
965
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
966
+ }
967
+ }
968
+ assert(target >= idx);
969
+ ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
1211
970
  ms->nextToUpdate = target;
1212
971
  }
1213
972
 
@@ -1216,7 +975,7 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const
1216
975
  * processing.
1217
976
  */
1218
977
  void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
1219
- const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
978
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
1220
979
  const U32 rowMask = (1u << rowLog) - 1;
1221
980
  const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
1222
981
 
@@ -1224,26 +983,131 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
1224
983
  ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
1225
984
  }
1226
985
 
986
+ #if defined(ZSTD_ARCH_X86_SSE2)
987
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
988
+ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
989
+ {
990
+ const __m128i comparisonMask = _mm_set1_epi8((char)tag);
991
+ int matches[4] = {0};
992
+ int i;
993
+ assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
994
+ for (i=0; i<nbChunks; i++) {
995
+ const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i));
996
+ const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
997
+ matches[i] = _mm_movemask_epi8(equalMask);
998
+ }
999
+ if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head);
1000
+ if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
1001
+ assert(nbChunks == 4);
1002
+ return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
1003
+ }
1004
+ #endif
1005
+
1227
1006
  /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
1228
1007
  * the hash at the nth position in a row of the tagTable.
1229
- */
1230
- FORCE_INLINE_TEMPLATE
1231
- ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) {
1232
- ZSTD_VecMask matches = 0;
1233
- if (rowEntries == 16) {
1234
- ZSTD_Vec128 hashes = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
1235
- ZSTD_Vec128 expandedTags = ZSTD_Vec128_set8(tag);
1236
- matches = ZSTD_Vec128_cmpMask8(hashes, expandedTags);
1237
- } else if (rowEntries == 32) {
1238
- ZSTD_Vec256 hashes = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
1239
- ZSTD_Vec256 expandedTags = ZSTD_Vec256_set8(tag);
1240
- matches = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
1241
- } else {
1242
- assert(0);
1008
+ * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1009
+ * to match up with the actual layout of the entries within the hashTable */
1010
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
1011
+ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
1012
+ {
1013
+ const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
1014
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
1015
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
1016
+
1017
+ #if defined(ZSTD_ARCH_X86_SSE2)
1018
+
1019
+ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
1020
+
1021
+ #else /* SW or NEON-LE */
1022
+
1023
+ # if defined(ZSTD_ARCH_ARM_NEON)
1024
+ /* This NEON path only works for little endian - otherwise use SWAR below */
1025
+ if (MEM_isLittleEndian()) {
1026
+ if (rowEntries == 16) {
1027
+ const uint8x16_t chunk = vld1q_u8(src);
1028
+ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
1029
+ const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
1030
+ const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
1031
+ const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
1032
+ const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
1033
+ const U16 hi = (U16)vgetq_lane_u8(t3, 8);
1034
+ const U16 lo = (U16)vgetq_lane_u8(t3, 0);
1035
+ return ZSTD_rotateRight_U16((hi << 8) | lo, head);
1036
+ } else if (rowEntries == 32) {
1037
+ const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
1038
+ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
1039
+ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
1040
+ const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
1041
+ const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
1042
+ const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
1043
+ const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
1044
+ const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
1045
+ const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
1046
+ const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
1047
+ const uint8x8x2_t t3 = vuzp_u8(t2, t0);
1048
+ const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
1049
+ const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
1050
+ return ZSTD_rotateRight_U32(matches, head);
1051
+ } else { /* rowEntries == 64 */
1052
+ const uint8x16x4_t chunk = vld4q_u8(src);
1053
+ const uint8x16_t dup = vdupq_n_u8(tag);
1054
+ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
1055
+ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
1056
+ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
1057
+ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
1058
+
1059
+ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
1060
+ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
1061
+ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
1062
+ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
1063
+ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
1064
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
1065
+ return ZSTD_rotateRight_U64(matches, head);
1066
+ }
1243
1067
  }
1244
- /* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1245
- to match up with the actual layout of the entries within the hashTable */
1246
- return ZSTD_VecMask_rotateRight(matches, head, rowEntries);
1068
+ # endif /* ZSTD_ARCH_ARM_NEON */
1069
+ /* SWAR */
1070
+ { const size_t chunkSize = sizeof(size_t);
1071
+ const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
1072
+ const size_t xFF = ~((size_t)0);
1073
+ const size_t x01 = xFF / 0xFF;
1074
+ const size_t x80 = x01 << 7;
1075
+ const size_t splatChar = tag * x01;
1076
+ ZSTD_VecMask matches = 0;
1077
+ int i = rowEntries - chunkSize;
1078
+ assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
1079
+ if (MEM_isLittleEndian()) { /* runtime check so have two loops */
1080
+ const size_t extractMagic = (xFF / 0x7F) >> chunkSize;
1081
+ do {
1082
+ size_t chunk = MEM_readST(&src[i]);
1083
+ chunk ^= splatChar;
1084
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
1085
+ matches <<= chunkSize;
1086
+ matches |= (chunk * extractMagic) >> shiftAmount;
1087
+ i -= chunkSize;
1088
+ } while (i >= 0);
1089
+ } else { /* big endian: reverse bits during extraction */
1090
+ const size_t msb = xFF ^ (xFF >> 1);
1091
+ const size_t extractMagic = (msb / 0x1FF) | msb;
1092
+ do {
1093
+ size_t chunk = MEM_readST(&src[i]);
1094
+ chunk ^= splatChar;
1095
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
1096
+ matches <<= chunkSize;
1097
+ matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
1098
+ i -= chunkSize;
1099
+ } while (i >= 0);
1100
+ }
1101
+ matches = ~matches;
1102
+ if (rowEntries == 16) {
1103
+ return ZSTD_rotateRight_U16((U16)matches, head);
1104
+ } else if (rowEntries == 32) {
1105
+ return ZSTD_rotateRight_U32((U32)matches, head);
1106
+ } else {
1107
+ return ZSTD_rotateRight_U64((U64)matches, head);
1108
+ }
1109
+ }
1110
+ #endif
1247
1111
  }
1248
1112
 
1249
1113
  /* The high-level approach of the SIMD row based match finder is as follows:
@@ -1262,7 +1126,7 @@ ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, con
1262
1126
  * - Pick the longest match.
1263
1127
  */
1264
1128
  FORCE_INLINE_TEMPLATE
1265
- size_t ZSTD_RowFindBestMatch_generic (
1129
+ size_t ZSTD_RowFindBestMatch(
1266
1130
  ZSTD_matchState_t* ms,
1267
1131
  const BYTE* const ip, const BYTE* const iLimit,
1268
1132
  size_t* offsetPtr,
@@ -1293,11 +1157,13 @@ size_t ZSTD_RowFindBestMatch_generic (
1293
1157
 
1294
1158
  /* DMS/DDS variables that may be referenced later */
1295
1159
  const ZSTD_matchState_t* const dms = ms->dictMatchState;
1296
- size_t ddsIdx;
1297
- U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
1298
- U32 dmsTag;
1299
- U32* dmsRow;
1300
- BYTE* dmsTagRow;
1160
+
1161
+ /* Initialize the following variables to satisfy static analyzer */
1162
+ size_t ddsIdx = 0;
1163
+ U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
1164
+ U32 dmsTag = 0;
1165
+ U32* dmsRow = NULL;
1166
+ BYTE* dmsTagRow = NULL;
1301
1167
 
1302
1168
  if (dictMode == ZSTD_dedicatedDictSearch) {
1303
1169
  const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
@@ -1329,7 +1195,7 @@ size_t ZSTD_RowFindBestMatch_generic (
1329
1195
  U32* const row = hashTable + relRow;
1330
1196
  BYTE* tagRow = (BYTE*)(tagTable + relRow);
1331
1197
  U32 const head = *tagRow & rowMask;
1332
- U32 matchBuffer[32 /* maximum nb entries per row */];
1198
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1333
1199
  size_t numMatches = 0;
1334
1200
  size_t currMatch = 0;
1335
1201
  ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
@@ -1379,12 +1245,13 @@ size_t ZSTD_RowFindBestMatch_generic (
1379
1245
  /* Save best solution */
1380
1246
  if (currentMl > ml) {
1381
1247
  ml = currentMl;
1382
- *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
1248
+ *offsetPtr = STORE_OFFSET(curr - matchIndex);
1383
1249
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
1384
1250
  }
1385
1251
  }
1386
1252
  }
1387
1253
 
1254
+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
1388
1255
  if (dictMode == ZSTD_dedicatedDictSearch) {
1389
1256
  ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
1390
1257
  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
@@ -1397,7 +1264,7 @@ size_t ZSTD_RowFindBestMatch_generic (
1397
1264
  const U32 dmsIndexDelta = dictLimit - dmsSize;
1398
1265
 
1399
1266
  { U32 const head = *dmsTagRow & rowMask;
1400
- U32 matchBuffer[32 /* maximum nb row entries */];
1267
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1401
1268
  size_t numMatches = 0;
1402
1269
  size_t currMatch = 0;
1403
1270
  ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
@@ -1426,7 +1293,8 @@ size_t ZSTD_RowFindBestMatch_generic (
1426
1293
 
1427
1294
  if (currentMl > ml) {
1428
1295
  ml = currentMl;
1429
- *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
1296
+ assert(curr > matchIndex + dmsIndexDelta);
1297
+ *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
1430
1298
  if (ip+currentMl == iLimit) break;
1431
1299
  }
1432
1300
  }
@@ -1435,84 +1303,175 @@ size_t ZSTD_RowFindBestMatch_generic (
1435
1303
  return ml;
1436
1304
  }
1437
1305
 
1438
- /* Inlining is important to hardwire a hot branch (template emulation) */
1439
- FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS (
1440
- ZSTD_matchState_t* ms,
1441
- const BYTE* ip, const BYTE* const iLimit,
1442
- const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog)
1443
- {
1444
- switch(ms->cParams.minMatch)
1445
- {
1446
- default : /* includes case 3 */
1447
- case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog);
1448
- case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog);
1449
- case 7 :
1450
- case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog);
1451
- }
1452
- }
1453
1306
 
1454
- FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog (
1455
- ZSTD_matchState_t* ms,
1456
- const BYTE* ip, const BYTE* const iLimit,
1457
- size_t* offsetPtr)
1458
- {
1459
- const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1460
- switch(cappedSearchLog)
1461
- {
1462
- default :
1463
- case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4);
1464
- case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5);
1307
+ typedef size_t (*searchMax_f)(
1308
+ ZSTD_matchState_t* ms,
1309
+ const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1310
+
1311
+ /**
1312
+ * This struct contains the functions necessary for lazy to search.
1313
+ * Currently, that is only searchMax. However, it is still valuable to have the
1314
+ * VTable because this makes it easier to add more functions to the VTable later.
1315
+ *
1316
+ * TODO: The start of the search function involves loading and calculating a
1317
+ * bunch of constants from the ZSTD_matchState_t. These computations could be
1318
+ * done in an initialization function, and saved somewhere in the match state.
1319
+ * Then we could pass a pointer to the saved state instead of the match state,
1320
+ * and avoid duplicate computations.
1321
+ *
1322
+ * TODO: Move the match re-winding into searchMax. This improves compression
1323
+ * ratio, and unlocks further simplifications with the next TODO.
1324
+ *
1325
+ * TODO: Try moving the repcode search into searchMax. After the re-winding
1326
+ * and repcode search are in searchMax, there is no more logic in the match
1327
+ * finder loop that requires knowledge about the dictMode. So we should be
1328
+ * able to avoid force inlining it, and we can join the extDict loop with
1329
+ * the single segment loop. It should go in searchMax instead of its own
1330
+ * function to avoid having multiple virtual function calls per search.
1331
+ */
1332
+ typedef struct {
1333
+ searchMax_f searchMax;
1334
+ } ZSTD_LazyVTable;
1335
+
1336
+ #define GEN_ZSTD_BT_VTABLE(dictMode, mls) \
1337
+ static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \
1338
+ ZSTD_matchState_t* ms, \
1339
+ const BYTE* ip, const BYTE* const iLimit, \
1340
+ size_t* offsetPtr) \
1341
+ { \
1342
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1343
+ return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1344
+ } \
1345
+ static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \
1346
+ ZSTD_BtFindBestMatch_##dictMode##_##mls \
1347
+ };
1348
+
1349
+ #define GEN_ZSTD_HC_VTABLE(dictMode, mls) \
1350
+ static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \
1351
+ ZSTD_matchState_t* ms, \
1352
+ const BYTE* ip, const BYTE* const iLimit, \
1353
+ size_t* offsetPtr) \
1354
+ { \
1355
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1356
+ return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1357
+ } \
1358
+ static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
1359
+ ZSTD_HcFindBestMatch_##dictMode##_##mls \
1360
+ };
1361
+
1362
+ #define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \
1363
+ static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \
1364
+ ZSTD_matchState_t* ms, \
1365
+ const BYTE* ip, const BYTE* const iLimit, \
1366
+ size_t* offsetPtr) \
1367
+ { \
1368
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1369
+ assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
1370
+ return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
1371
+ } \
1372
+ static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \
1373
+ ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \
1374
+ };
1375
+
1376
+ #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
1377
+ X(dictMode, mls, 4) \
1378
+ X(dictMode, mls, 5) \
1379
+ X(dictMode, mls, 6)
1380
+
1381
+ #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
1382
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \
1383
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \
1384
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)
1385
+
1386
+ #define ZSTD_FOR_EACH_MLS(X, dictMode) \
1387
+ X(dictMode, 4) \
1388
+ X(dictMode, 5) \
1389
+ X(dictMode, 6)
1390
+
1391
+ #define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
1392
+ X(__VA_ARGS__, noDict) \
1393
+ X(__VA_ARGS__, extDict) \
1394
+ X(__VA_ARGS__, dictMatchState) \
1395
+ X(__VA_ARGS__, dedicatedDictSearch)
1396
+
1397
+ /* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
1398
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
1399
+ /* Generate Binary Tree VTables for each combination of (dictMode, mls) */
1400
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
1401
+ /* Generate Hash Chain VTables for each combination of (dictMode, mls) */
1402
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
1403
+
1404
+ #define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
1405
+ { \
1406
+ &ZSTD_BtVTable_##dictMode##_4, \
1407
+ &ZSTD_BtVTable_##dictMode##_5, \
1408
+ &ZSTD_BtVTable_##dictMode##_6 \
1465
1409
  }
1466
- }
1467
1410
 
1468
- FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog(
1469
- ZSTD_matchState_t* ms,
1470
- const BYTE* ip, const BYTE* const iLimit,
1471
- size_t* offsetPtr)
1472
- {
1473
- const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1474
- switch(cappedSearchLog)
1475
- {
1476
- default :
1477
- case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4);
1478
- case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5);
1411
+ #define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
1412
+ { \
1413
+ &ZSTD_HcVTable_##dictMode##_4, \
1414
+ &ZSTD_HcVTable_##dictMode##_5, \
1415
+ &ZSTD_HcVTable_##dictMode##_6 \
1479
1416
  }
1480
- }
1481
1417
 
1482
- FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog(
1483
- ZSTD_matchState_t* ms,
1484
- const BYTE* ip, const BYTE* const iLimit,
1485
- size_t* offsetPtr)
1486
- {
1487
- const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1488
- switch(cappedSearchLog)
1489
- {
1490
- default :
1491
- case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4);
1492
- case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5);
1418
+ #define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
1419
+ { \
1420
+ &ZSTD_RowVTable_##dictMode##_##mls##_4, \
1421
+ &ZSTD_RowVTable_##dictMode##_##mls##_5, \
1422
+ &ZSTD_RowVTable_##dictMode##_##mls##_6 \
1493
1423
  }
1494
- }
1495
1424
 
1496
- FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog (
1497
- ZSTD_matchState_t* ms,
1498
- const BYTE* ip, const BYTE* const iLimit,
1499
- size_t* offsetPtr)
1500
- {
1501
- const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1502
- switch(cappedSearchLog)
1503
- {
1504
- default :
1505
- case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4);
1506
- case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5);
1425
+ #define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
1426
+ { \
1427
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
1428
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
1429
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
1507
1430
  }
1508
- }
1509
1431
 
1432
+ #define GEN_ZSTD_VTABLE_ARRAY(X) \
1433
+ { \
1434
+ X(noDict), \
1435
+ X(extDict), \
1436
+ X(dictMatchState), \
1437
+ X(dedicatedDictSearch) \
1438
+ }
1510
1439
 
1511
1440
  /* *******************************
1512
1441
  * Common parser - lazy strategy
1513
1442
  *********************************/
1514
1443
  typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1515
1444
 
1445
+ /**
1446
+ * This table is indexed first by the four ZSTD_dictMode_e values, and then
1447
+ * by the three searchMethod_e values. NULLs are placed for configurations
1448
+ * that should never occur (extDict modes go to the other implementation
1449
+ * below and there is no DDSS for binary tree search yet).
1450
+ */
1451
+
1452
+ static ZSTD_LazyVTable const*
1453
+ ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
1454
+ {
1455
+ /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
1456
+ ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
1457
+ ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
1458
+ /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
1459
+ ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
1460
+
1461
+ U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
1462
+ U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1463
+ switch (searchMethod) {
1464
+ case search_hashChain:
1465
+ return hcVTables[dictMode][mls - 4];
1466
+ case search_binaryTree:
1467
+ return btVTables[dictMode][mls - 4];
1468
+ case search_rowHash:
1469
+ return rowVTables[dictMode][mls - 4][rowLog - 4];
1470
+ default:
1471
+ return NULL;
1472
+ }
1473
+ }
1474
+
1516
1475
  FORCE_INLINE_TEMPLATE size_t
1517
1476
  ZSTD_compressBlock_lazy_generic(
1518
1477
  ZSTD_matchState_t* ms, seqStore_t* seqStore,
@@ -1525,46 +1484,12 @@ ZSTD_compressBlock_lazy_generic(
1525
1484
  const BYTE* ip = istart;
1526
1485
  const BYTE* anchor = istart;
1527
1486
  const BYTE* const iend = istart + srcSize;
1528
- const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
1487
+ const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
1529
1488
  const BYTE* const base = ms->window.base;
1530
1489
  const U32 prefixLowestIndex = ms->window.dictLimit;
1531
1490
  const BYTE* const prefixLowest = base + prefixLowestIndex;
1532
- const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1533
1491
 
1534
- typedef size_t (*searchMax_f)(
1535
- ZSTD_matchState_t* ms,
1536
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1537
-
1538
- /**
1539
- * This table is indexed first by the four ZSTD_dictMode_e values, and then
1540
- * by the two searchMethod_e values. NULLs are placed for configurations
1541
- * that should never occur (extDict modes go to the other implementation
1542
- * below and there is no DDSS for binary tree search yet).
1543
- */
1544
- const searchMax_f searchFuncs[4][3] = {
1545
- {
1546
- ZSTD_HcFindBestMatch_selectMLS,
1547
- ZSTD_BtFindBestMatch_selectMLS,
1548
- ZSTD_RowFindBestMatch_selectRowLog
1549
- },
1550
- {
1551
- NULL,
1552
- NULL,
1553
- NULL
1554
- },
1555
- {
1556
- ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
1557
- ZSTD_BtFindBestMatch_dictMatchState_selectMLS,
1558
- ZSTD_RowFindBestMatch_dictMatchState_selectRowLog
1559
- },
1560
- {
1561
- ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
1562
- NULL,
1563
- ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog
1564
- }
1565
- };
1566
-
1567
- searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod];
1492
+ searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
1568
1493
  U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
1569
1494
 
1570
1495
  const int isDMS = dictMode == ZSTD_dictMatchState;
@@ -1599,6 +1524,7 @@ ZSTD_compressBlock_lazy_generic(
1599
1524
  }
1600
1525
 
1601
1526
  if (searchMethod == search_rowHash) {
1527
+ const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1602
1528
  ZSTD_row_fillHashCache(ms, base, rowLog,
1603
1529
  MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1604
1530
  ms->nextToUpdate, ilimit);
@@ -1613,8 +1539,9 @@ ZSTD_compressBlock_lazy_generic(
1613
1539
  #endif
1614
1540
  while (ip < ilimit) {
1615
1541
  size_t matchLength=0;
1616
- size_t offset=0;
1542
+ size_t offcode=STORE_REPCODE_1;
1617
1543
  const BYTE* start=ip+1;
1544
+ DEBUGLOG(7, "search baseline (depth 0)");
1618
1545
 
1619
1546
  /* check repCode */
1620
1547
  if (isDxS) {
@@ -1640,7 +1567,7 @@ ZSTD_compressBlock_lazy_generic(
1640
1567
  { size_t offsetFound = 999999999;
1641
1568
  size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
1642
1569
  if (ml2 > matchLength)
1643
- matchLength = ml2, start = ip, offset=offsetFound;
1570
+ matchLength = ml2, start = ip, offcode=offsetFound;
1644
1571
  }
1645
1572
 
1646
1573
  if (matchLength < 4) {
@@ -1651,14 +1578,15 @@ ZSTD_compressBlock_lazy_generic(
1651
1578
  /* let's try to find a better solution */
1652
1579
  if (depth>=1)
1653
1580
  while (ip<ilimit) {
1581
+ DEBUGLOG(7, "search depth 1");
1654
1582
  ip ++;
1655
1583
  if ( (dictMode == ZSTD_noDict)
1656
- && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1584
+ && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1657
1585
  size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
1658
1586
  int const gain2 = (int)(mlRep * 3);
1659
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
1587
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
1660
1588
  if ((mlRep >= 4) && (gain2 > gain1))
1661
- matchLength = mlRep, offset = 0, start = ip;
1589
+ matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1662
1590
  }
1663
1591
  if (isDxS) {
1664
1592
  const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1670,30 +1598,31 @@ ZSTD_compressBlock_lazy_generic(
1670
1598
  const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1671
1599
  size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
1672
1600
  int const gain2 = (int)(mlRep * 3);
1673
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
1601
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
1674
1602
  if ((mlRep >= 4) && (gain2 > gain1))
1675
- matchLength = mlRep, offset = 0, start = ip;
1603
+ matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1676
1604
  }
1677
1605
  }
1678
1606
  { size_t offset2=999999999;
1679
1607
  size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1680
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
1681
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
1608
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
1609
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
1682
1610
  if ((ml2 >= 4) && (gain2 > gain1)) {
1683
- matchLength = ml2, offset = offset2, start = ip;
1611
+ matchLength = ml2, offcode = offset2, start = ip;
1684
1612
  continue; /* search a better one */
1685
1613
  } }
1686
1614
 
1687
1615
  /* let's find an even better one */
1688
1616
  if ((depth==2) && (ip<ilimit)) {
1617
+ DEBUGLOG(7, "search depth 2");
1689
1618
  ip ++;
1690
1619
  if ( (dictMode == ZSTD_noDict)
1691
- && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1620
+ && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1692
1621
  size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
1693
1622
  int const gain2 = (int)(mlRep * 4);
1694
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
1623
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
1695
1624
  if ((mlRep >= 4) && (gain2 > gain1))
1696
- matchLength = mlRep, offset = 0, start = ip;
1625
+ matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1697
1626
  }
1698
1627
  if (isDxS) {
1699
1628
  const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1705,46 +1634,45 @@ ZSTD_compressBlock_lazy_generic(
1705
1634
  const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1706
1635
  size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
1707
1636
  int const gain2 = (int)(mlRep * 4);
1708
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
1637
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
1709
1638
  if ((mlRep >= 4) && (gain2 > gain1))
1710
- matchLength = mlRep, offset = 0, start = ip;
1639
+ matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
1711
1640
  }
1712
1641
  }
1713
1642
  { size_t offset2=999999999;
1714
1643
  size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1715
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
1716
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
1644
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
1645
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
1717
1646
  if ((ml2 >= 4) && (gain2 > gain1)) {
1718
- matchLength = ml2, offset = offset2, start = ip;
1647
+ matchLength = ml2, offcode = offset2, start = ip;
1719
1648
  continue;
1720
1649
  } } }
1721
1650
  break; /* nothing found : store previous solution */
1722
1651
  }
1723
1652
 
1724
1653
  /* NOTE:
1725
- * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior.
1726
- * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which
1727
- * overflows the pointer, which is undefined behavior.
1654
+ * Pay attention that `start[-value]` can lead to strange undefined behavior
1655
+ * notably if `value` is unsigned, resulting in a large positive `-value`.
1728
1656
  */
1729
1657
  /* catch up */
1730
- if (offset) {
1658
+ if (STORED_IS_OFFSET(offcode)) {
1731
1659
  if (dictMode == ZSTD_noDict) {
1732
- while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest))
1733
- && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
1660
+ while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
1661
+ && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */
1734
1662
  { start--; matchLength++; }
1735
1663
  }
1736
1664
  if (isDxS) {
1737
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
1665
+ U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
1738
1666
  const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
1739
1667
  const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
1740
1668
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
1741
1669
  }
1742
- offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
1670
+ offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
1743
1671
  }
1744
1672
  /* store sequence */
1745
1673
  _storeSequence:
1746
- { size_t const litLength = start - anchor;
1747
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
1674
+ { size_t const litLength = (size_t)(start - anchor);
1675
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
1748
1676
  anchor = ip = start + matchLength;
1749
1677
  }
1750
1678
 
@@ -1760,8 +1688,8 @@ _storeSequence:
1760
1688
  && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
1761
1689
  const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
1762
1690
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
1763
- offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */
1764
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
1691
+ offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */
1692
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
1765
1693
  ip += matchLength;
1766
1694
  anchor = ip;
1767
1695
  continue;
@@ -1775,8 +1703,8 @@ _storeSequence:
1775
1703
  && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
1776
1704
  /* store sequence */
1777
1705
  matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
1778
- offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
1779
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
1706
+ offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */
1707
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
1780
1708
  ip += matchLength;
1781
1709
  anchor = ip;
1782
1710
  continue; /* faster when present ... (?) */
@@ -1955,15 +1883,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1955
1883
  const U32 windowLog = ms->cParams.windowLog;
1956
1884
  const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1957
1885
 
1958
- typedef size_t (*searchMax_f)(
1959
- ZSTD_matchState_t* ms,
1960
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1961
- const searchMax_f searchFuncs[3] = {
1962
- ZSTD_HcFindBestMatch_extDict_selectMLS,
1963
- ZSTD_BtFindBestMatch_extDict_selectMLS,
1964
- ZSTD_RowFindBestMatch_extDict_selectRowLog
1965
- };
1966
- searchMax_f searchMax = searchFuncs[(int)searchMethod];
1886
+ searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
1967
1887
  U32 offset_1 = rep[0], offset_2 = rep[1];
1968
1888
 
1969
1889
  DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
@@ -1985,7 +1905,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1985
1905
  #endif
1986
1906
  while (ip < ilimit) {
1987
1907
  size_t matchLength=0;
1988
- size_t offset=0;
1908
+ size_t offcode=STORE_REPCODE_1;
1989
1909
  const BYTE* start=ip+1;
1990
1910
  U32 curr = (U32)(ip-base);
1991
1911
 
@@ -1995,7 +1915,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1995
1915
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1996
1916
  const BYTE* const repMatch = repBase + repIndex;
1997
1917
  if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
1998
- & (offset_1 < curr+1 - windowLow) ) /* note: we are searching at curr+1 */
1918
+ & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
1999
1919
  if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
2000
1920
  /* repcode detected we should take it */
2001
1921
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -2007,10 +1927,10 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
2007
1927
  { size_t offsetFound = 999999999;
2008
1928
  size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
2009
1929
  if (ml2 > matchLength)
2010
- matchLength = ml2, start = ip, offset=offsetFound;
1930
+ matchLength = ml2, start = ip, offcode=offsetFound;
2011
1931
  }
2012
1932
 
2013
- if (matchLength < 4) {
1933
+ if (matchLength < 4) {
2014
1934
  ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
2015
1935
  continue;
2016
1936
  }
@@ -2021,30 +1941,30 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
2021
1941
  ip ++;
2022
1942
  curr++;
2023
1943
  /* check repCode */
2024
- if (offset) {
1944
+ if (offcode) {
2025
1945
  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
2026
1946
  const U32 repIndex = (U32)(curr - offset_1);
2027
1947
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
2028
1948
  const BYTE* const repMatch = repBase + repIndex;
2029
1949
  if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2030
- & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
1950
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2031
1951
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
2032
1952
  /* repcode detected */
2033
1953
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2034
1954
  size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
2035
1955
  int const gain2 = (int)(repLength * 3);
2036
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
1956
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
2037
1957
  if ((repLength >= 4) && (gain2 > gain1))
2038
- matchLength = repLength, offset = 0, start = ip;
1958
+ matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
2039
1959
  } }
2040
1960
 
2041
1961
  /* search match, depth 1 */
2042
1962
  { size_t offset2=999999999;
2043
1963
  size_t const ml2 = searchMax(ms, ip, iend, &offset2);
2044
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
2045
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
1964
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
1965
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
2046
1966
  if ((ml2 >= 4) && (gain2 > gain1)) {
2047
- matchLength = ml2, offset = offset2, start = ip;
1967
+ matchLength = ml2, offcode = offset2, start = ip;
2048
1968
  continue; /* search a better one */
2049
1969
  } }
2050
1970
 
@@ -2053,48 +1973,48 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
2053
1973
  ip ++;
2054
1974
  curr++;
2055
1975
  /* check repCode */
2056
- if (offset) {
1976
+ if (offcode) {
2057
1977
  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
2058
1978
  const U32 repIndex = (U32)(curr - offset_1);
2059
1979
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
2060
1980
  const BYTE* const repMatch = repBase + repIndex;
2061
1981
  if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2062
- & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
1982
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2063
1983
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
2064
1984
  /* repcode detected */
2065
1985
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2066
1986
  size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
2067
1987
  int const gain2 = (int)(repLength * 4);
2068
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
1988
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
2069
1989
  if ((repLength >= 4) && (gain2 > gain1))
2070
- matchLength = repLength, offset = 0, start = ip;
1990
+ matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
2071
1991
  } }
2072
1992
 
2073
1993
  /* search match, depth 2 */
2074
1994
  { size_t offset2=999999999;
2075
1995
  size_t const ml2 = searchMax(ms, ip, iend, &offset2);
2076
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
2077
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
1996
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
1997
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
2078
1998
  if ((ml2 >= 4) && (gain2 > gain1)) {
2079
- matchLength = ml2, offset = offset2, start = ip;
1999
+ matchLength = ml2, offcode = offset2, start = ip;
2080
2000
  continue;
2081
2001
  } } }
2082
2002
  break; /* nothing found : store previous solution */
2083
2003
  }
2084
2004
 
2085
2005
  /* catch up */
2086
- if (offset) {
2087
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
2006
+ if (STORED_IS_OFFSET(offcode)) {
2007
+ U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
2088
2008
  const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
2089
2009
  const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
2090
2010
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
2091
- offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
2011
+ offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
2092
2012
  }
2093
2013
 
2094
2014
  /* store sequence */
2095
2015
  _storeSequence:
2096
- { size_t const litLength = start - anchor;
2097
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
2016
+ { size_t const litLength = (size_t)(start - anchor);
2017
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
2098
2018
  anchor = ip = start + matchLength;
2099
2019
  }
2100
2020
 
@@ -2106,13 +2026,13 @@ _storeSequence:
2106
2026
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
2107
2027
  const BYTE* const repMatch = repBase + repIndex;
2108
2028
  if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2109
- & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2029
+ & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2110
2030
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
2111
2031
  /* repcode detected we should take it */
2112
2032
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2113
2033
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
2114
- offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */
2115
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
2034
+ offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */
2035
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
2116
2036
  ip += matchLength;
2117
2037
  anchor = ip;
2118
2038
  continue; /* faster when present ... (?) */