zstd-ruby 1.4.5.0 → 1.5.1.1

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (101)
  1. checksums.yaml +4 -4
  2. data/.github/dependabot.yml +8 -0
  3. data/.github/workflows/ruby.yml +35 -0
  4. data/README.md +2 -2
  5. data/ext/zstdruby/extconf.rb +2 -1
  6. data/ext/zstdruby/libzstd/BUCK +5 -7
  7. data/ext/zstdruby/libzstd/Makefile +225 -222
  8. data/ext/zstdruby/libzstd/README.md +43 -5
  9. data/ext/zstdruby/libzstd/common/bitstream.h +46 -22
  10. data/ext/zstdruby/libzstd/common/compiler.h +182 -22
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  12. data/ext/zstdruby/libzstd/common/debug.c +1 -1
  13. data/ext/zstdruby/libzstd/common/debug.h +12 -19
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +196 -44
  15. data/ext/zstdruby/libzstd/common/error_private.c +2 -1
  16. data/ext/zstdruby/libzstd/common/error_private.h +82 -3
  17. data/ext/zstdruby/libzstd/common/fse.h +41 -12
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +139 -22
  19. data/ext/zstdruby/libzstd/common/huf.h +47 -23
  20. data/ext/zstdruby/libzstd/common/mem.h +87 -98
  21. data/ext/zstdruby/libzstd/common/pool.c +23 -17
  22. data/ext/zstdruby/libzstd/common/pool.h +2 -2
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
  24. data/ext/zstdruby/libzstd/common/threading.c +6 -5
  25. data/ext/zstdruby/libzstd/common/xxhash.c +6 -846
  26. data/ext/zstdruby/libzstd/common/xxhash.h +5568 -167
  27. data/ext/zstdruby/libzstd/common/zstd_common.c +10 -10
  28. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  29. data/ext/zstdruby/libzstd/common/zstd_internal.h +189 -142
  30. data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
  31. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  32. data/ext/zstdruby/libzstd/compress/fse_compress.c +89 -46
  33. data/ext/zstdruby/libzstd/compress/hist.c +27 -29
  34. data/ext/zstdruby/libzstd/compress/hist.h +2 -2
  35. data/ext/zstdruby/libzstd/compress/huf_compress.c +770 -198
  36. data/ext/zstdruby/libzstd/compress/zstd_compress.c +2894 -863
  37. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +390 -90
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +12 -11
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +4 -2
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +31 -8
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +1 -1
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +25 -297
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +1 -1
  44. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +206 -69
  45. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +307 -132
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +1 -1
  47. data/ext/zstdruby/libzstd/compress/zstd_fast.c +322 -143
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.h +1 -1
  49. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1136 -174
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +59 -1
  51. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +316 -213
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +9 -2
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
  54. data/ext/zstdruby/libzstd/compress/zstd_opt.c +373 -150
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  56. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +152 -444
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +31 -113
  58. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1044 -403
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
  60. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +9 -9
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +2 -2
  62. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +450 -105
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +913 -273
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +14 -5
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +59 -12
  66. data/ext/zstdruby/libzstd/deprecated/zbuff.h +1 -1
  67. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +1 -1
  68. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +24 -4
  69. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  70. data/ext/zstdruby/libzstd/dictBuilder/cover.c +55 -38
  71. data/ext/zstdruby/libzstd/dictBuilder/cover.h +7 -6
  72. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  73. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +43 -34
  74. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +128 -58
  75. data/ext/zstdruby/libzstd/dll/example/Makefile +1 -1
  76. data/ext/zstdruby/libzstd/dll/example/README.md +16 -22
  77. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +1 -1
  78. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +8 -8
  79. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +1 -1
  80. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +9 -9
  81. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +1 -1
  82. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +9 -9
  83. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +1 -1
  84. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +10 -10
  85. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +1 -1
  86. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +13 -13
  87. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +1 -1
  88. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +13 -13
  89. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +1 -1
  90. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +13 -13
  91. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +1 -1
  92. data/ext/zstdruby/libzstd/libzstd.mk +185 -0
  93. data/ext/zstdruby/libzstd/libzstd.pc.in +4 -3
  94. data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
  95. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +154 -7
  96. data/ext/zstdruby/libzstd/zstd.h +699 -214
  97. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +2 -1
  98. data/ext/zstdruby/zstdruby.c +2 -2
  99. data/lib/zstd-ruby/version.rb +1 -1
  100. metadata +15 -6
  101. data/.travis.yml +0 -14
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -58,11 +58,11 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
58
58
 
59
59
  /** ZSTD_insertDUBT1() :
60
60
  * sort one already inserted but unsorted position
61
- * assumption : current >= btlow == (current - btmask)
61
+ * assumption : curr >= btlow == (curr - btmask)
62
62
  * doesn't fail */
63
63
  static void
64
- ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
65
- U32 current, const BYTE* inputEnd,
64
+ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
65
+ U32 curr, const BYTE* inputEnd,
66
66
  U32 nbCompares, U32 btLow,
67
67
  const ZSTD_dictMode_e dictMode)
68
68
  {
@@ -74,41 +74,41 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
74
74
  const BYTE* const base = ms->window.base;
75
75
  const BYTE* const dictBase = ms->window.dictBase;
76
76
  const U32 dictLimit = ms->window.dictLimit;
77
- const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current;
78
- const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit;
77
+ const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
78
+ const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
79
79
  const BYTE* const dictEnd = dictBase + dictLimit;
80
80
  const BYTE* const prefixStart = base + dictLimit;
81
81
  const BYTE* match;
82
- U32* smallerPtr = bt + 2*(current&btMask);
82
+ U32* smallerPtr = bt + 2*(curr&btMask);
83
83
  U32* largerPtr = smallerPtr + 1;
84
84
  U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
85
85
  U32 dummy32; /* to be nullified at the end */
86
86
  U32 const windowValid = ms->window.lowLimit;
87
87
  U32 const maxDistance = 1U << cParams->windowLog;
88
- U32 const windowLow = (current - windowValid > maxDistance) ? current - maxDistance : windowValid;
88
+ U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;
89
89
 
90
90
 
91
91
  DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
92
- current, dictLimit, windowLow);
93
- assert(current >= btLow);
92
+ curr, dictLimit, windowLow);
93
+ assert(curr >= btLow);
94
94
  assert(ip < iend); /* condition for ZSTD_count */
95
95
 
96
- while (nbCompares-- && (matchIndex > windowLow)) {
96
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
97
97
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
98
98
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
99
- assert(matchIndex < current);
99
+ assert(matchIndex < curr);
100
100
  /* note : all candidates are now supposed sorted,
101
101
  * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
102
102
  * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
103
103
 
104
104
  if ( (dictMode != ZSTD_extDict)
105
105
  || (matchIndex+matchLength >= dictLimit) /* both in current segment*/
106
- || (current < dictLimit) /* both in extDict */) {
106
+ || (curr < dictLimit) /* both in extDict */) {
107
107
  const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
108
108
  || (matchIndex+matchLength >= dictLimit)) ?
109
109
  base : dictBase;
110
110
  assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */
111
- || (current < dictLimit) );
111
+ || (curr < dictLimit) );
112
112
  match = mBase + matchIndex;
113
113
  matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
114
114
  } else {
@@ -119,7 +119,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
119
119
  }
120
120
 
121
121
  DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
122
- current, matchIndex, (U32)matchLength);
122
+ curr, matchIndex, (U32)matchLength);
123
123
 
124
124
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
125
125
  break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
@@ -151,7 +151,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
151
151
 
152
152
  static size_t
153
153
  ZSTD_DUBT_findBetterDictMatch (
154
- ZSTD_matchState_t* ms,
154
+ const ZSTD_matchState_t* ms,
155
155
  const BYTE* const ip, const BYTE* const iend,
156
156
  size_t* offsetPtr,
157
157
  size_t bestLength,
@@ -168,7 +168,7 @@ ZSTD_DUBT_findBetterDictMatch (
168
168
 
169
169
  const BYTE* const base = ms->window.base;
170
170
  const BYTE* const prefixStart = base + ms->window.dictLimit;
171
- U32 const current = (U32)(ip-base);
171
+ U32 const curr = (U32)(ip-base);
172
172
  const BYTE* const dictBase = dms->window.base;
173
173
  const BYTE* const dictEnd = dms->window.nextSrc;
174
174
  U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
@@ -185,7 +185,7 @@ ZSTD_DUBT_findBetterDictMatch (
185
185
  (void)dictMode;
186
186
  assert(dictMode == ZSTD_dictMatchState);
187
187
 
188
- while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
188
+ for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
189
189
  U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
190
190
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
191
191
  const BYTE* match = dictBase + dictMatchIndex;
@@ -195,10 +195,10 @@ ZSTD_DUBT_findBetterDictMatch (
195
195
 
196
196
  if (matchLength > bestLength) {
197
197
  U32 matchIndex = dictMatchIndex + dictIndexDelta;
198
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
198
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
199
199
  DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
200
- current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex);
201
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
200
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
201
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
202
202
  }
203
203
  if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
204
204
  break; /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,9 +218,9 @@ ZSTD_DUBT_findBetterDictMatch (
218
218
  }
219
219
 
220
220
  if (bestLength >= MINMATCH) {
221
- U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
221
+ U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
222
222
  DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
223
- current, (U32)bestLength, (U32)*offsetPtr, mIndex);
223
+ curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
224
224
  }
225
225
  return bestLength;
226
226
 
@@ -241,13 +241,13 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
241
241
  U32 matchIndex = hashTable[h];
242
242
 
243
243
  const BYTE* const base = ms->window.base;
244
- U32 const current = (U32)(ip-base);
245
- U32 const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog);
244
+ U32 const curr = (U32)(ip-base);
245
+ U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);
246
246
 
247
247
  U32* const bt = ms->chainTable;
248
248
  U32 const btLog = cParams->chainLog - 1;
249
249
  U32 const btMask = (1 << btLog) - 1;
250
- U32 const btLow = (btMask >= current) ? 0 : current - btMask;
250
+ U32 const btLow = (btMask >= curr) ? 0 : curr - btMask;
251
251
  U32 const unsortLimit = MAX(btLow, windowLow);
252
252
 
253
253
  U32* nextCandidate = bt + 2*(matchIndex&btMask);
@@ -256,8 +256,9 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
256
256
  U32 nbCandidates = nbCompares;
257
257
  U32 previousCandidate = 0;
258
258
 
259
- DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current);
259
+ DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
260
260
  assert(ip <= iend-8); /* required for h calculation */
261
+ assert(dictMode != ZSTD_dedicatedDictSearch);
261
262
 
262
263
  /* reach end of unsorted candidates list */
263
264
  while ( (matchIndex > unsortLimit)
@@ -299,16 +300,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
299
300
  const U32 dictLimit = ms->window.dictLimit;
300
301
  const BYTE* const dictEnd = dictBase + dictLimit;
301
302
  const BYTE* const prefixStart = base + dictLimit;
302
- U32* smallerPtr = bt + 2*(current&btMask);
303
- U32* largerPtr = bt + 2*(current&btMask) + 1;
304
- U32 matchEndIdx = current + 8 + 1;
303
+ U32* smallerPtr = bt + 2*(curr&btMask);
304
+ U32* largerPtr = bt + 2*(curr&btMask) + 1;
305
+ U32 matchEndIdx = curr + 8 + 1;
305
306
  U32 dummy32; /* to be nullified at the end */
306
307
  size_t bestLength = 0;
307
308
 
308
309
  matchIndex = hashTable[h];
309
- hashTable[h] = current; /* Update Hash Table */
310
+ hashTable[h] = curr; /* Update Hash Table */
310
311
 
311
- while (nbCompares-- && (matchIndex > windowLow)) {
312
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
312
313
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
313
314
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
314
315
  const BYTE* match;
@@ -326,8 +327,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
326
327
  if (matchLength > bestLength) {
327
328
  if (matchLength > matchEndIdx - matchIndex)
328
329
  matchEndIdx = matchIndex + (U32)matchLength;
329
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
330
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
330
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
331
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
331
332
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
332
333
  if (dictMode == ZSTD_dictMatchState) {
333
334
  nbCompares = 0; /* in addition to avoiding checking any
@@ -356,6 +357,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
356
357
 
357
358
  *smallerPtr = *largerPtr = 0;
358
359
 
360
+ assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
359
361
  if (dictMode == ZSTD_dictMatchState && nbCompares) {
360
362
  bestLength = ZSTD_DUBT_findBetterDictMatch(
361
363
  ms, ip, iend,
@@ -363,12 +365,12 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
363
365
  mls, dictMode);
364
366
  }
365
367
 
366
- assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */
368
+ assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
367
369
  ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
368
370
  if (bestLength >= MINMATCH) {
369
- U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
371
+ U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
370
372
  DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
371
- current, (U32)bestLength, (U32)*offsetPtr, mIndex);
373
+ curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
372
374
  }
373
375
  return bestLength;
374
376
  }
@@ -389,56 +391,222 @@ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
389
391
  return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
390
392
  }
391
393
 
394
+ /***********************************
395
+ * Dedicated dict search
396
+ ***********************************/
392
397
 
393
- static size_t
394
- ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms,
395
- const BYTE* ip, const BYTE* const iLimit,
396
- size_t* offsetPtr)
398
+ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
397
399
  {
398
- switch(ms->cParams.minMatch)
400
+ const BYTE* const base = ms->window.base;
401
+ U32 const target = (U32)(ip - base);
402
+ U32* const hashTable = ms->hashTable;
403
+ U32* const chainTable = ms->chainTable;
404
+ U32 const chainSize = 1 << ms->cParams.chainLog;
405
+ U32 idx = ms->nextToUpdate;
406
+ U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
407
+ U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
408
+ U32 const cacheSize = bucketSize - 1;
409
+ U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
410
+ U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
411
+
412
+ /* We know the hashtable is oversized by a factor of `bucketSize`.
413
+ * We are going to temporarily pretend `bucketSize == 1`, keeping only a
414
+ * single entry. We will use the rest of the space to construct a temporary
415
+ * chaintable.
416
+ */
417
+ U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
418
+ U32* const tmpHashTable = hashTable;
419
+ U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
420
+ U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
421
+ U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
422
+ U32 hashIdx;
423
+
424
+ assert(ms->cParams.chainLog <= 24);
425
+ assert(ms->cParams.hashLog > ms->cParams.chainLog);
426
+ assert(idx != 0);
427
+ assert(tmpMinChain <= minChain);
428
+
429
+ /* fill conventional hash table and conventional chain table */
430
+ for ( ; idx < target; idx++) {
431
+ U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
432
+ if (idx >= tmpMinChain) {
433
+ tmpChainTable[idx - tmpMinChain] = hashTable[h];
434
+ }
435
+ tmpHashTable[h] = idx;
436
+ }
437
+
438
+ /* sort chains into ddss chain table */
399
439
  {
400
- default : /* includes case 3 */
401
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
402
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
403
- case 7 :
404
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
440
+ U32 chainPos = 0;
441
+ for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
442
+ U32 count;
443
+ U32 countBeyondMinChain = 0;
444
+ U32 i = tmpHashTable[hashIdx];
445
+ for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
446
+ /* skip through the chain to the first position that won't be
447
+ * in the hash cache bucket */
448
+ if (i < minChain) {
449
+ countBeyondMinChain++;
450
+ }
451
+ i = tmpChainTable[i - tmpMinChain];
452
+ }
453
+ if (count == cacheSize) {
454
+ for (count = 0; count < chainLimit;) {
455
+ if (i < minChain) {
456
+ if (!i || ++countBeyondMinChain > cacheSize) {
457
+ /* only allow pulling `cacheSize` number of entries
458
+ * into the cache or chainTable beyond `minChain`,
459
+ * to replace the entries pulled out of the
460
+ * chainTable into the cache. This lets us reach
461
+ * back further without increasing the total number
462
+ * of entries in the chainTable, guaranteeing the
463
+ * DDSS chain table will fit into the space
464
+ * allocated for the regular one. */
465
+ break;
466
+ }
467
+ }
468
+ chainTable[chainPos++] = i;
469
+ count++;
470
+ if (i < tmpMinChain) {
471
+ break;
472
+ }
473
+ i = tmpChainTable[i - tmpMinChain];
474
+ }
475
+ } else {
476
+ count = 0;
477
+ }
478
+ if (count) {
479
+ tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
480
+ } else {
481
+ tmpHashTable[hashIdx] = 0;
482
+ }
483
+ }
484
+ assert(chainPos <= chainSize); /* I believe this is guaranteed... */
485
+ }
486
+
487
+ /* move chain pointers into the last entry of each hash bucket */
488
+ for (hashIdx = (1 << hashLog); hashIdx; ) {
489
+ U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
490
+ U32 const chainPackedPointer = tmpHashTable[hashIdx];
491
+ U32 i;
492
+ for (i = 0; i < cacheSize; i++) {
493
+ hashTable[bucketIdx + i] = 0;
494
+ }
495
+ hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
496
+ }
497
+
498
+ /* fill the buckets of the hash table */
499
+ for (idx = ms->nextToUpdate; idx < target; idx++) {
500
+ U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
501
+ << ZSTD_LAZY_DDSS_BUCKET_LOG;
502
+ U32 i;
503
+ /* Shift hash cache down 1. */
504
+ for (i = cacheSize - 1; i; i--)
505
+ hashTable[h + i] = hashTable[h + i - 1];
506
+ hashTable[h] = idx;
405
507
  }
508
+
509
+ ms->nextToUpdate = target;
406
510
  }
407
511
 
512
+ /* Returns the longest match length found in the dedicated dict search structure.
513
+ * If none are longer than the argument ml, then ml will be returned.
514
+ */
515
+ FORCE_INLINE_TEMPLATE
516
+ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
517
+ const ZSTD_matchState_t* const dms,
518
+ const BYTE* const ip, const BYTE* const iLimit,
519
+ const BYTE* const prefixStart, const U32 curr,
520
+ const U32 dictLimit, const size_t ddsIdx) {
521
+ const U32 ddsLowestIndex = dms->window.dictLimit;
522
+ const BYTE* const ddsBase = dms->window.base;
523
+ const BYTE* const ddsEnd = dms->window.nextSrc;
524
+ const U32 ddsSize = (U32)(ddsEnd - ddsBase);
525
+ const U32 ddsIndexDelta = dictLimit - ddsSize;
526
+ const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
527
+ const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
528
+ U32 ddsAttempt;
529
+ U32 matchIndex;
530
+
531
+ for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
532
+ PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
533
+ }
408
534
 
409
- static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
410
- ZSTD_matchState_t* ms,
411
- const BYTE* ip, const BYTE* const iLimit,
412
- size_t* offsetPtr)
413
- {
414
- switch(ms->cParams.minMatch)
415
535
  {
416
- default : /* includes case 3 */
417
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
418
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
419
- case 7 :
420
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
536
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
537
+ U32 const chainIndex = chainPackedPointer >> 8;
538
+
539
+ PREFETCH_L1(&dms->chainTable[chainIndex]);
421
540
  }
422
- }
423
541
 
542
+ for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
543
+ size_t currentMl=0;
544
+ const BYTE* match;
545
+ matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
546
+ match = ddsBase + matchIndex;
547
+
548
+ if (!matchIndex) {
549
+ return ml;
550
+ }
551
+
552
+ /* guaranteed by table construction */
553
+ (void)ddsLowestIndex;
554
+ assert(matchIndex >= ddsLowestIndex);
555
+ assert(match+4 <= ddsEnd);
556
+ if (MEM_read32(match) == MEM_read32(ip)) {
557
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
558
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
559
+ }
560
+
561
+ /* save best solution */
562
+ if (currentMl > ml) {
563
+ ml = currentMl;
564
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
565
+ if (ip+currentMl == iLimit) {
566
+ /* best possible, avoids read overflow on next attempt */
567
+ return ml;
568
+ }
569
+ }
570
+ }
424
571
 
425
- static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
426
- ZSTD_matchState_t* ms,
427
- const BYTE* ip, const BYTE* const iLimit,
428
- size_t* offsetPtr)
429
- {
430
- switch(ms->cParams.minMatch)
431
572
  {
432
- default : /* includes case 3 */
433
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
434
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
435
- case 7 :
436
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
573
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
574
+ U32 chainIndex = chainPackedPointer >> 8;
575
+ U32 const chainLength = chainPackedPointer & 0xFF;
576
+ U32 const chainAttempts = nbAttempts - ddsAttempt;
577
+ U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
578
+ U32 chainAttempt;
579
+
580
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
581
+ PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
582
+ }
583
+
584
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
585
+ size_t currentMl=0;
586
+ const BYTE* match;
587
+ matchIndex = dms->chainTable[chainIndex];
588
+ match = ddsBase + matchIndex;
589
+
590
+ /* guaranteed by table construction */
591
+ assert(matchIndex >= ddsLowestIndex);
592
+ assert(match+4 <= ddsEnd);
593
+ if (MEM_read32(match) == MEM_read32(ip)) {
594
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
595
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
596
+ }
597
+
598
+ /* save best solution */
599
+ if (currentMl > ml) {
600
+ ml = currentMl;
601
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
602
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
603
+ }
604
+ }
437
605
  }
606
+ return ml;
438
607
  }
439
608
 
440
609
 
441
-
442
610
  /* *********************************
443
611
  * Hash Chain
444
612
  ***********************************/
@@ -446,7 +614,7 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
446
614
 
447
615
  /* Update chains up to ip (excluded)
448
616
  Assumption : always within prefix (i.e. not within extDict) */
449
- static U32 ZSTD_insertAndFindFirstIndex_internal(
617
+ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
450
618
  ZSTD_matchState_t* ms,
451
619
  const ZSTD_compressionParameters* const cParams,
452
620
  const BYTE* ip, U32 const mls)
@@ -475,10 +643,9 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
475
643
  return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
476
644
  }
477
645
 
478
-
479
646
  /* inlining is important to hardwire a hot branch (template emulation) */
480
647
  FORCE_INLINE_TEMPLATE
481
- size_t ZSTD_HcFindBestMatch_generic (
648
+ size_t ZSTD_HcFindBestMatch(
482
649
  ZSTD_matchState_t* ms,
483
650
  const BYTE* const ip, const BYTE* const iLimit,
484
651
  size_t* offsetPtr,
@@ -493,20 +660,33 @@ size_t ZSTD_HcFindBestMatch_generic (
493
660
  const U32 dictLimit = ms->window.dictLimit;
494
661
  const BYTE* const prefixStart = base + dictLimit;
495
662
  const BYTE* const dictEnd = dictBase + dictLimit;
496
- const U32 current = (U32)(ip-base);
663
+ const U32 curr = (U32)(ip-base);
497
664
  const U32 maxDistance = 1U << cParams->windowLog;
498
665
  const U32 lowestValid = ms->window.lowLimit;
499
- const U32 withinMaxDistance = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid;
666
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
500
667
  const U32 isDictionary = (ms->loadedDictEnd != 0);
501
668
  const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
502
- const U32 minChain = current > chainSize ? current - chainSize : 0;
669
+ const U32 minChain = curr > chainSize ? curr - chainSize : 0;
503
670
  U32 nbAttempts = 1U << cParams->searchLog;
504
671
  size_t ml=4-1;
505
672
 
673
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
674
+ const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
675
+ ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
676
+ const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
677
+ ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
678
+
679
+ U32 matchIndex;
680
+
681
+ if (dictMode == ZSTD_dedicatedDictSearch) {
682
+ const U32* entry = &dms->hashTable[ddsIdx];
683
+ PREFETCH_L1(entry);
684
+ }
685
+
506
686
  /* HC4 match finder */
507
- U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
687
+ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
508
688
 
509
- for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
689
+ for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
510
690
  size_t currentMl=0;
511
691
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
512
692
  const BYTE* const match = base + matchIndex;
@@ -523,7 +703,7 @@ size_t ZSTD_HcFindBestMatch_generic (
523
703
  /* save best solution */
524
704
  if (currentMl > ml) {
525
705
  ml = currentMl;
526
- *offsetPtr = current - matchIndex + ZSTD_REP_MOVE;
706
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
527
707
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
528
708
  }
529
709
 
@@ -531,8 +711,11 @@ size_t ZSTD_HcFindBestMatch_generic (
531
711
  matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
532
712
  }
533
713
 
534
- if (dictMode == ZSTD_dictMatchState) {
535
- const ZSTD_matchState_t* const dms = ms->dictMatchState;
714
+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
715
+ if (dictMode == ZSTD_dedicatedDictSearch) {
716
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
717
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
718
+ } else if (dictMode == ZSTD_dictMatchState) {
536
719
  const U32* const dmsChainTable = dms->chainTable;
537
720
  const U32 dmsChainSize = (1 << dms->cParams.chainLog);
538
721
  const U32 dmsChainMask = dmsChainSize - 1;
@@ -545,7 +728,7 @@ size_t ZSTD_HcFindBestMatch_generic (
545
728
 
546
729
  matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
547
730
 
548
- for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
731
+ for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
549
732
  size_t currentMl=0;
550
733
  const BYTE* const match = dmsBase + matchIndex;
551
734
  assert(match+4 <= dmsEnd);
@@ -555,11 +738,12 @@ size_t ZSTD_HcFindBestMatch_generic (
555
738
  /* save best solution */
556
739
  if (currentMl > ml) {
557
740
  ml = currentMl;
558
- *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
741
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
559
742
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
560
743
  }
561
744
 
562
745
  if (matchIndex <= dmsMinChain) break;
746
+
563
747
  matchIndex = dmsChainTable[matchIndex & dmsChainMask];
564
748
  }
565
749
  }
@@ -567,59 +751,724 @@ size_t ZSTD_HcFindBestMatch_generic (
567
751
  return ml;
568
752
  }
569
753
 
754
+ /* *********************************
755
+ * (SIMD) Row-based matchfinder
756
+ ***********************************/
757
+ /* Constants for row-based hash */
758
+ #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
759
+ #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
760
+ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
761
+ #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
570
762
 
571
- FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
572
- ZSTD_matchState_t* ms,
573
- const BYTE* ip, const BYTE* const iLimit,
574
- size_t* offsetPtr)
763
+ #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
764
+
765
+ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */
766
+
767
+ /* ZSTD_VecMask_next():
768
+ * Starting from the LSB, returns the idx of the next non-zero bit.
769
+ * Basically counting the nb of trailing zeroes.
770
+ */
771
+ static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
772
+ assert(val != 0);
773
+ # if defined(_MSC_VER) && defined(_WIN64)
774
+ if (val != 0) {
775
+ unsigned long r;
776
+ _BitScanForward64(&r, val);
777
+ return (U32)(r);
778
+ } else {
779
+ /* Should not reach this code path */
780
+ __assume(0);
781
+ }
782
+ # elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
783
+ if (sizeof(size_t) == 4) {
784
+ U32 mostSignificantWord = (U32)(val >> 32);
785
+ U32 leastSignificantWord = (U32)val;
786
+ if (leastSignificantWord == 0) {
787
+ return 32 + (U32)__builtin_ctz(mostSignificantWord);
788
+ } else {
789
+ return (U32)__builtin_ctz(leastSignificantWord);
790
+ }
791
+ } else {
792
+ return (U32)__builtin_ctzll(val);
793
+ }
794
+ # else
795
+ /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
796
+ * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
797
+ */
798
+ val = ~val & (val - 1ULL); /* Lowest set bit mask */
799
+ val = val - ((val >> 1) & 0x5555555555555555);
800
+ val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
801
+ return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
802
+ # endif
803
+ }
804
+
805
+ /* ZSTD_rotateRight_*():
806
+ * Rotates a bitfield to the right by "count" bits.
807
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
808
+ */
809
+ FORCE_INLINE_TEMPLATE
810
+ U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
811
+ assert(count < 64);
812
+ count &= 0x3F; /* for fickle pattern recognition */
813
+ return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
814
+ }
815
+
816
+ FORCE_INLINE_TEMPLATE
817
+ U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
818
+ assert(count < 32);
819
+ count &= 0x1F; /* for fickle pattern recognition */
820
+ return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
821
+ }
822
+
823
+ FORCE_INLINE_TEMPLATE
824
+ U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
825
+ assert(count < 16);
826
+ count &= 0x0F; /* for fickle pattern recognition */
827
+ return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
828
+ }
829
+
830
+ /* ZSTD_row_nextIndex():
831
+ * Returns the next index to insert at within a tagTable row, and updates the "head"
832
+ * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
833
+ */
834
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
835
+ U32 const next = (*tagRow - 1) & rowMask;
836
+ *tagRow = (BYTE)next;
837
+ return next;
838
+ }
839
+
840
+ /* ZSTD_isAligned():
841
+ * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
842
+ */
843
+ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
844
+ assert((align & (align - 1)) == 0);
845
+ return (((size_t)ptr) & (align - 1)) == 0;
846
+ }
847
+
848
+ /* ZSTD_row_prefetch():
849
+ * Performs prefetching for the hashTable and tagTable at a given row.
850
+ */
851
+ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
852
+ PREFETCH_L1(hashTable + relRow);
853
+ if (rowLog >= 5) {
854
+ PREFETCH_L1(hashTable + relRow + 16);
855
+ /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
856
+ }
857
+ PREFETCH_L1(tagTable + relRow);
858
+ if (rowLog == 6) {
859
+ PREFETCH_L1(tagTable + relRow + 32);
860
+ }
861
+ assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
862
+ assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
863
+ assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
864
+ }
865
+
866
+ /* ZSTD_row_fillHashCache():
867
+ * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
868
+ * but not beyond iLimit.
869
+ */
870
+ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
871
+ U32 const rowLog, U32 const mls,
872
+ U32 idx, const BYTE* const iLimit)
575
873
  {
576
- switch(ms->cParams.minMatch)
577
- {
578
- default : /* includes case 3 */
579
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
580
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
581
- case 7 :
582
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
874
+ U32 const* const hashTable = ms->hashTable;
875
+ U16 const* const tagTable = ms->tagTable;
876
+ U32 const hashLog = ms->rowHashLog;
877
+ U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
878
+ U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
879
+
880
+ for (; idx < lim; ++idx) {
881
+ U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
882
+ U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
883
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
884
+ ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
583
885
  }
886
+
887
+ DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
888
+ ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
889
+ ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
584
890
  }
585
891
 
892
+ /* ZSTD_row_nextCachedHash():
893
+ * Returns the cached hash of base + idx, and replaces that hash cache slot with the hash computed at
894
+ * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
895
+ */
896
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
897
+ U16 const* tagTable, BYTE const* base,
898
+ U32 idx, U32 const hashLog,
899
+ U32 const rowLog, U32 const mls)
900
+ {
901
+ U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
902
+ U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
903
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
904
+ { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
905
+ cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
906
+ return hash;
907
+ }
908
+ }
586
909
 
587
- static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
588
- ZSTD_matchState_t* ms,
589
- const BYTE* ip, const BYTE* const iLimit,
590
- size_t* offsetPtr)
910
+ /* ZSTD_row_update_internalImpl():
911
+ * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
912
+ */
913
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
914
+ U32 updateStartIdx, U32 const updateEndIdx,
915
+ U32 const mls, U32 const rowLog,
916
+ U32 const rowMask, U32 const useCache)
591
917
  {
592
- switch(ms->cParams.minMatch)
593
- {
594
- default : /* includes case 3 */
595
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
596
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
597
- case 7 :
598
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
918
+ U32* const hashTable = ms->hashTable;
919
+ U16* const tagTable = ms->tagTable;
920
+ U32 const hashLog = ms->rowHashLog;
921
+ const BYTE* const base = ms->window.base;
922
+
923
+ DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
924
+ for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
925
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
926
+ : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
927
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
928
+ U32* const row = hashTable + relRow;
929
+ BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
930
+ Explicit cast allows us to get exact desired position within each row */
931
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
932
+
933
+ assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
934
+ ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
935
+ row[pos] = updateStartIdx;
936
+ }
937
+ }
938
+
939
+ /* ZSTD_row_update_internal():
940
+ * Inserts the positions from ms->nextToUpdate up to (but not including) ip into the hash table, and updates ms->nextToUpdate.
941
+ * Skips sections of long matches as is necessary.
942
+ */
943
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
944
+ U32 const mls, U32 const rowLog,
945
+ U32 const rowMask, U32 const useCache)
946
+ {
947
+ U32 idx = ms->nextToUpdate;
948
+ const BYTE* const base = ms->window.base;
949
+ const U32 target = (U32)(ip - base);
950
+ const U32 kSkipThreshold = 384;
951
+ const U32 kMaxMatchStartPositionsToUpdate = 96;
952
+ const U32 kMaxMatchEndPositionsToUpdate = 32;
953
+
954
+ if (useCache) {
955
+ /* Only skip positions when using hash cache, i.e.
956
+ * if we are loading a dict, don't skip anything.
957
+ * If we decide to skip, then we only update a set number
958
+ * of positions at the beginning and end of the match.
959
+ */
960
+ if (UNLIKELY(target - idx > kSkipThreshold)) {
961
+ U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
962
+ ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
963
+ idx = target - kMaxMatchEndPositionsToUpdate;
964
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
965
+ }
599
966
  }
967
+ assert(target >= idx);
968
+ ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
969
+ ms->nextToUpdate = target;
970
+ }
971
+
972
+ /* ZSTD_row_update():
973
+ * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
974
+ * processing.
975
+ */
976
+ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
977
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
978
+ const U32 rowMask = (1u << rowLog) - 1;
979
+ const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
980
+
981
+ DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
982
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
600
983
  }
601
984
 
985
+ #if defined(ZSTD_ARCH_X86_SSE2)
986
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
987
+ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
988
+ {
989
+ const __m128i comparisonMask = _mm_set1_epi8((char)tag);
990
+ int matches[4] = {0};
991
+ int i;
992
+ assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
993
+ for (i=0; i<nbChunks; i++) {
994
+ const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i));
995
+ const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
996
+ matches[i] = _mm_movemask_epi8(equalMask);
997
+ }
998
+ if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head);
999
+ if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
1000
+ assert(nbChunks == 4);
1001
+ return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
1002
+ }
1003
+ #endif
602
1004
 
603
- FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
1005
+ /* Returns a ZSTD_VecMask (U64) that has the nth bit set to 1 if the newly-computed "tag" matches
1006
+ * the hash at the nth position in a row of the tagTable.
1007
+ * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1008
+ * to match up with the actual layout of the entries within the hashTable */
1009
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
1010
+ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
1011
+ {
1012
+ const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
1013
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
1014
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
1015
+
1016
+ #if defined(ZSTD_ARCH_X86_SSE2)
1017
+
1018
+ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
1019
+
1020
+ #else /* SW or NEON-LE */
1021
+
1022
+ # if defined(ZSTD_ARCH_ARM_NEON)
1023
+ /* This NEON path only works for little endian - otherwise use SWAR below */
1024
+ if (MEM_isLittleEndian()) {
1025
+ if (rowEntries == 16) {
1026
+ const uint8x16_t chunk = vld1q_u8(src);
1027
+ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
1028
+ const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
1029
+ const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
1030
+ const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
1031
+ const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
1032
+ const U16 hi = (U16)vgetq_lane_u8(t3, 8);
1033
+ const U16 lo = (U16)vgetq_lane_u8(t3, 0);
1034
+ return ZSTD_rotateRight_U16((hi << 8) | lo, head);
1035
+ } else if (rowEntries == 32) {
1036
+ const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
1037
+ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
1038
+ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
1039
+ const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
1040
+ const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
1041
+ const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
1042
+ const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
1043
+ const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
1044
+ const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
1045
+ const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
1046
+ const uint8x8x2_t t3 = vuzp_u8(t2, t0);
1047
+ const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
1048
+ const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
1049
+ return ZSTD_rotateRight_U32(matches, head);
1050
+ } else { /* rowEntries == 64 */
1051
+ const uint8x16x4_t chunk = vld4q_u8(src);
1052
+ const uint8x16_t dup = vdupq_n_u8(tag);
1053
+ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
1054
+ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
1055
+ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
1056
+ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
1057
+
1058
+ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
1059
+ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
1060
+ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
1061
+ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
1062
+ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
1063
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
1064
+ return ZSTD_rotateRight_U64(matches, head);
1065
+ }
1066
+ }
1067
+ # endif /* ZSTD_ARCH_ARM_NEON */
1068
+ /* SWAR */
1069
+ { const size_t chunkSize = sizeof(size_t);
1070
+ const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
1071
+ const size_t xFF = ~((size_t)0);
1072
+ const size_t x01 = xFF / 0xFF;
1073
+ const size_t x80 = x01 << 7;
1074
+ const size_t splatChar = tag * x01;
1075
+ ZSTD_VecMask matches = 0;
1076
+ int i = rowEntries - chunkSize;
1077
+ assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
1078
+ if (MEM_isLittleEndian()) { /* runtime check so have two loops */
1079
+ const size_t extractMagic = (xFF / 0x7F) >> chunkSize;
1080
+ do {
1081
+ size_t chunk = MEM_readST(&src[i]);
1082
+ chunk ^= splatChar;
1083
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
1084
+ matches <<= chunkSize;
1085
+ matches |= (chunk * extractMagic) >> shiftAmount;
1086
+ i -= chunkSize;
1087
+ } while (i >= 0);
1088
+ } else { /* big endian: reverse bits during extraction */
1089
+ const size_t msb = xFF ^ (xFF >> 1);
1090
+ const size_t extractMagic = (msb / 0x1FF) | msb;
1091
+ do {
1092
+ size_t chunk = MEM_readST(&src[i]);
1093
+ chunk ^= splatChar;
1094
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
1095
+ matches <<= chunkSize;
1096
+ matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
1097
+ i -= chunkSize;
1098
+ } while (i >= 0);
1099
+ }
1100
+ matches = ~matches;
1101
+ if (rowEntries == 16) {
1102
+ return ZSTD_rotateRight_U16((U16)matches, head);
1103
+ } else if (rowEntries == 32) {
1104
+ return ZSTD_rotateRight_U32((U32)matches, head);
1105
+ } else {
1106
+ return ZSTD_rotateRight_U64((U64)matches, head);
1107
+ }
1108
+ }
1109
+ #endif
1110
+ }
1111
+
1112
+ /* The high-level approach of the SIMD row based match finder is as follows:
1113
+ * - Figure out where to insert the new entry:
1114
+ * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
1115
+ * - The hashTable is effectively split into groups or "rows" of 16, 32 or 64 entries of U32, and the hash determines
1116
+ * which row to insert into.
1117
+ * - Determine the correct position within the row to insert the entry into. Each row of 16, 32 or 64 can
1118
+ * be considered as a circular buffer with a "head" index that resides in the tagTable.
1119
+ * - Also insert the "tag" into the equivalent row and position in the tagTable.
1120
+ * - Note: Each tagTable row stores a 1-byte "head" entry at offset 0, then 16, 32 or 64 1-byte tags starting at byte offset ZSTD_ROW_HASH_TAG_OFFSET.
1121
+ * The rows are spaced out to occur every 32, 64 or 128 bytes, respectively,
1122
+ * for alignment/performance reasons, leaving some bytes unused.
1123
+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
1124
+ * generate a bitfield that we can cycle through to check the collisions in the hash table.
1125
+ * - Pick the longest match.
1126
+ */
1127
+ FORCE_INLINE_TEMPLATE
1128
+ size_t ZSTD_RowFindBestMatch(
604
1129
  ZSTD_matchState_t* ms,
605
- const BYTE* ip, const BYTE* const iLimit,
606
- size_t* offsetPtr)
1130
+ const BYTE* const ip, const BYTE* const iLimit,
1131
+ size_t* offsetPtr,
1132
+ const U32 mls, const ZSTD_dictMode_e dictMode,
1133
+ const U32 rowLog)
607
1134
  {
608
- switch(ms->cParams.minMatch)
609
- {
610
- default : /* includes case 3 */
611
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
612
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
613
- case 7 :
614
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
1135
+ U32* const hashTable = ms->hashTable;
1136
+ U16* const tagTable = ms->tagTable;
1137
+ U32* const hashCache = ms->hashCache;
1138
+ const U32 hashLog = ms->rowHashLog;
1139
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
1140
+ const BYTE* const base = ms->window.base;
1141
+ const BYTE* const dictBase = ms->window.dictBase;
1142
+ const U32 dictLimit = ms->window.dictLimit;
1143
+ const BYTE* const prefixStart = base + dictLimit;
1144
+ const BYTE* const dictEnd = dictBase + dictLimit;
1145
+ const U32 curr = (U32)(ip-base);
1146
+ const U32 maxDistance = 1U << cParams->windowLog;
1147
+ const U32 lowestValid = ms->window.lowLimit;
1148
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
1149
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
1150
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
1151
+ const U32 rowEntries = (1U << rowLog);
1152
+ const U32 rowMask = rowEntries - 1;
1153
+ const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
1154
+ U32 nbAttempts = 1U << cappedSearchLog;
1155
+ size_t ml=4-1;
1156
+
1157
+ /* DMS/DDS variables that may be referenced later */
1158
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
1159
+
1160
+ /* Initialize the following variables to satisfy static analyzer */
1161
+ size_t ddsIdx = 0;
1162
+ U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
1163
+ U32 dmsTag = 0;
1164
+ U32* dmsRow = NULL;
1165
+ BYTE* dmsTagRow = NULL;
1166
+
1167
+ if (dictMode == ZSTD_dedicatedDictSearch) {
1168
+ const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
1169
+ { /* Prefetch DDS hashtable entry */
1170
+ ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
1171
+ PREFETCH_L1(&dms->hashTable[ddsIdx]);
1172
+ }
1173
+ ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
1174
+ }
1175
+
1176
+ if (dictMode == ZSTD_dictMatchState) {
1177
+ /* Prefetch DMS rows */
1178
+ U32* const dmsHashTable = dms->hashTable;
1179
+ U16* const dmsTagTable = dms->tagTable;
1180
+ U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1181
+ U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1182
+ dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
1183
+ dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
1184
+ dmsRow = dmsHashTable + dmsRelRow;
1185
+ ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
1186
+ }
1187
+
1188
+ /* Update the hashTable and tagTable up to (but not including) ip */
1189
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
1190
+ { /* Get the hash for ip, compute the appropriate row */
1191
+ U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
1192
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1193
+ U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
1194
+ U32* const row = hashTable + relRow;
1195
+ BYTE* tagRow = (BYTE*)(tagTable + relRow);
1196
+ U32 const head = *tagRow & rowMask;
1197
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1198
+ size_t numMatches = 0;
1199
+ size_t currMatch = 0;
1200
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
1201
+
1202
+ /* Cycle through the matches and prefetch */
1203
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1204
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1205
+ U32 const matchIndex = row[matchPos];
1206
+ assert(numMatches < rowEntries);
1207
+ if (matchIndex < lowLimit)
1208
+ break;
1209
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1210
+ PREFETCH_L1(base + matchIndex);
1211
+ } else {
1212
+ PREFETCH_L1(dictBase + matchIndex);
1213
+ }
1214
+ matchBuffer[numMatches++] = matchIndex;
1215
+ }
1216
+
1217
+ /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
1218
+ in ZSTD_row_update_internal() at the next search. */
1219
+ {
1220
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1221
+ tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
1222
+ row[pos] = ms->nextToUpdate++;
1223
+ }
1224
+
1225
+ /* Return the longest match */
1226
+ for (; currMatch < numMatches; ++currMatch) {
1227
+ U32 const matchIndex = matchBuffer[currMatch];
1228
+ size_t currentMl=0;
1229
+ assert(matchIndex < curr);
1230
+ assert(matchIndex >= lowLimit);
1231
+
1232
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1233
+ const BYTE* const match = base + matchIndex;
1234
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
1235
+ if (match[ml] == ip[ml]) /* potentially better */
1236
+ currentMl = ZSTD_count(ip, match, iLimit);
1237
+ } else {
1238
+ const BYTE* const match = dictBase + matchIndex;
1239
+ assert(match+4 <= dictEnd);
1240
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
1241
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
1242
+ }
1243
+
1244
+ /* Save best solution */
1245
+ if (currentMl > ml) {
1246
+ ml = currentMl;
1247
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
1248
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
1249
+ }
1250
+ }
615
1251
  }
1252
+
1253
+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
1254
+ if (dictMode == ZSTD_dedicatedDictSearch) {
1255
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
1256
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
1257
+ } else if (dictMode == ZSTD_dictMatchState) {
1258
+ /* TODO: Measure and potentially add prefetching to DMS */
1259
+ const U32 dmsLowestIndex = dms->window.dictLimit;
1260
+ const BYTE* const dmsBase = dms->window.base;
1261
+ const BYTE* const dmsEnd = dms->window.nextSrc;
1262
+ const U32 dmsSize = (U32)(dmsEnd - dmsBase);
1263
+ const U32 dmsIndexDelta = dictLimit - dmsSize;
1264
+
1265
+ { U32 const head = *dmsTagRow & rowMask;
1266
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1267
+ size_t numMatches = 0;
1268
+ size_t currMatch = 0;
1269
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
1270
+
1271
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1272
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1273
+ U32 const matchIndex = dmsRow[matchPos];
1274
+ if (matchIndex < dmsLowestIndex)
1275
+ break;
1276
+ PREFETCH_L1(dmsBase + matchIndex);
1277
+ matchBuffer[numMatches++] = matchIndex;
1278
+ }
1279
+
1280
+ /* Return the longest match */
1281
+ for (; currMatch < numMatches; ++currMatch) {
1282
+ U32 const matchIndex = matchBuffer[currMatch];
1283
+ size_t currentMl=0;
1284
+ assert(matchIndex >= dmsLowestIndex);
1285
+ assert(matchIndex < curr);
1286
+
1287
+ { const BYTE* const match = dmsBase + matchIndex;
1288
+ assert(match+4 <= dmsEnd);
1289
+ if (MEM_read32(match) == MEM_read32(ip))
1290
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
1291
+ }
1292
+
1293
+ if (currentMl > ml) {
1294
+ ml = currentMl;
1295
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
1296
+ if (ip+currentMl == iLimit) break;
1297
+ }
1298
+ }
1299
+ }
1300
+ }
1301
+ return ml;
616
1302
  }
617
1303
 
618
1304
 
1305
+ typedef size_t (*searchMax_f)(
1306
+ ZSTD_matchState_t* ms,
1307
+ const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1308
+
1309
+ /**
1310
+ * This struct contains the functions necessary for lazy to search.
1311
+ * Currently, that is only searchMax. However, it is still valuable to have the
1312
+ * VTable because this makes it easier to add more functions to the VTable later.
1313
+ *
1314
+ * TODO: The start of the search function involves loading and calculating a
1315
+ * bunch of constants from the ZSTD_matchState_t. These computations could be
1316
+ * done in an initialization function, and saved somewhere in the match state.
1317
+ * Then we could pass a pointer to the saved state instead of the match state,
1318
+ * and avoid duplicate computations.
1319
+ *
1320
+ * TODO: Move the match re-winding into searchMax. This improves compression
1321
+ * ratio, and unlocks further simplifications with the next TODO.
1322
+ *
1323
+ * TODO: Try moving the repcode search into searchMax. After the re-winding
1324
+ * and repcode search are in searchMax, there is no more logic in the match
1325
+ * finder loop that requires knowledge about the dictMode. So we should be
1326
+ * able to avoid force inlining it, and we can join the extDict loop with
1327
+ * the single segment loop. It should go in searchMax instead of its own
1328
+ * function to avoid having multiple virtual function calls per search.
1329
+ */
1330
+ typedef struct {
1331
+ searchMax_f searchMax;
1332
+ } ZSTD_LazyVTable;
1333
+
1334
+ #define GEN_ZSTD_BT_VTABLE(dictMode, mls) \
1335
+ static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \
1336
+ ZSTD_matchState_t* ms, \
1337
+ const BYTE* ip, const BYTE* const iLimit, \
1338
+ size_t* offsetPtr) \
1339
+ { \
1340
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1341
+ return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1342
+ } \
1343
+ static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \
1344
+ ZSTD_BtFindBestMatch_##dictMode##_##mls \
1345
+ };
1346
+
1347
+ #define GEN_ZSTD_HC_VTABLE(dictMode, mls) \
1348
+ static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \
1349
+ ZSTD_matchState_t* ms, \
1350
+ const BYTE* ip, const BYTE* const iLimit, \
1351
+ size_t* offsetPtr) \
1352
+ { \
1353
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1354
+ return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1355
+ } \
1356
+ static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
1357
+ ZSTD_HcFindBestMatch_##dictMode##_##mls \
1358
+ };
1359
+
1360
+ #define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \
1361
+ static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \
1362
+ ZSTD_matchState_t* ms, \
1363
+ const BYTE* ip, const BYTE* const iLimit, \
1364
+ size_t* offsetPtr) \
1365
+ { \
1366
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1367
+ assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
1368
+ return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
1369
+ } \
1370
+ static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \
1371
+ ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \
1372
+ };
1373
+
1374
+ #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
1375
+ X(dictMode, mls, 4) \
1376
+ X(dictMode, mls, 5) \
1377
+ X(dictMode, mls, 6)
1378
+
1379
+ #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
1380
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \
1381
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \
1382
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)
1383
+
1384
+ #define ZSTD_FOR_EACH_MLS(X, dictMode) \
1385
+ X(dictMode, 4) \
1386
+ X(dictMode, 5) \
1387
+ X(dictMode, 6)
1388
+
1389
+ #define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
1390
+ X(__VA_ARGS__, noDict) \
1391
+ X(__VA_ARGS__, extDict) \
1392
+ X(__VA_ARGS__, dictMatchState) \
1393
+ X(__VA_ARGS__, dedicatedDictSearch)
1394
+
1395
+ /* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
1396
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
1397
+ /* Generate Binary Tree VTables for each combination of (dictMode, mls) */
1398
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
1399
+ /* Generate Hash Chain VTables for each combination of (dictMode, mls) */
1400
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
1401
+
1402
+ #define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
1403
+ { \
1404
+ &ZSTD_BtVTable_##dictMode##_4, \
1405
+ &ZSTD_BtVTable_##dictMode##_5, \
1406
+ &ZSTD_BtVTable_##dictMode##_6 \
1407
+ }
1408
+
1409
+ #define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
1410
+ { \
1411
+ &ZSTD_HcVTable_##dictMode##_4, \
1412
+ &ZSTD_HcVTable_##dictMode##_5, \
1413
+ &ZSTD_HcVTable_##dictMode##_6 \
1414
+ }
1415
+
1416
+ #define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
1417
+ { \
1418
+ &ZSTD_RowVTable_##dictMode##_##mls##_4, \
1419
+ &ZSTD_RowVTable_##dictMode##_##mls##_5, \
1420
+ &ZSTD_RowVTable_##dictMode##_##mls##_6 \
1421
+ }
1422
+
1423
+ #define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
1424
+ { \
1425
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
1426
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
1427
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
1428
+ }
1429
+
1430
+ #define GEN_ZSTD_VTABLE_ARRAY(X) \
1431
+ { \
1432
+ X(noDict), \
1433
+ X(extDict), \
1434
+ X(dictMatchState), \
1435
+ X(dedicatedDictSearch) \
1436
+ }
1437
+
619
1438
  /* *******************************
620
1439
  * Common parser - lazy strategy
621
1440
  *********************************/
622
- typedef enum { search_hashChain, search_binaryTree } searchMethod_e;
1441
+ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1442
+
1443
+ /**
1444
+ * The vtable arrays below are indexed first by the four ZSTD_dictMode_e values
1445
+ * and then by (mls - 4); the row-hash array adds a third index, (rowLog - 4).
1446
+ * The searchMethod_e value selects which array is consulted, and NULL is
1447
+ * returned only for a searchMethod that matches none of the three cases.
1448
+ */
1449
+
1450
+ static ZSTD_LazyVTable const*
1451
+ ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
1452
+ {
1453
+ /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
1454
+ ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
1455
+ ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
1456
+ /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
1457
+ ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
1458
+
1459
+ U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
1460
+ U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1461
+ switch (searchMethod) {
1462
+ case search_hashChain:
1463
+ return hcVTables[dictMode][mls - 4];
1464
+ case search_binaryTree:
1465
+ return btVTables[dictMode][mls - 4];
1466
+ case search_rowHash:
1467
+ return rowVTables[dictMode][mls - 4][rowLog - 4];
1468
+ default:
1469
+ return NULL;
1470
+ }
1471
+ }
623
1472
 
624
1473
  FORCE_INLINE_TEMPLATE size_t
625
1474
  ZSTD_compressBlock_lazy_generic(
@@ -633,53 +1482,52 @@ ZSTD_compressBlock_lazy_generic(
633
1482
  const BYTE* ip = istart;
634
1483
  const BYTE* anchor = istart;
635
1484
  const BYTE* const iend = istart + srcSize;
636
- const BYTE* const ilimit = iend - 8;
1485
+ const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
637
1486
  const BYTE* const base = ms->window.base;
638
1487
  const U32 prefixLowestIndex = ms->window.dictLimit;
639
1488
  const BYTE* const prefixLowest = base + prefixLowestIndex;
640
1489
 
641
- typedef size_t (*searchMax_f)(
642
- ZSTD_matchState_t* ms,
643
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
644
- searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ?
645
- (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS
646
- : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) :
647
- (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_selectMLS
648
- : ZSTD_HcFindBestMatch_selectMLS);
1490
+ searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
649
1491
  U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
650
1492
 
1493
+ const int isDMS = dictMode == ZSTD_dictMatchState;
1494
+ const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
1495
+ const int isDxS = isDMS || isDDS;
651
1496
  const ZSTD_matchState_t* const dms = ms->dictMatchState;
652
- const U32 dictLowestIndex = dictMode == ZSTD_dictMatchState ?
653
- dms->window.dictLimit : 0;
654
- const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ?
655
- dms->window.base : NULL;
656
- const BYTE* const dictLowest = dictMode == ZSTD_dictMatchState ?
657
- dictBase + dictLowestIndex : NULL;
658
- const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ?
659
- dms->window.nextSrc : NULL;
660
- const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ?
1497
+ const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0;
1498
+ const BYTE* const dictBase = isDxS ? dms->window.base : NULL;
1499
+ const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
1500
+ const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL;
1501
+ const U32 dictIndexDelta = isDxS ?
661
1502
  prefixLowestIndex - (U32)(dictEnd - dictBase) :
662
1503
  0;
663
1504
  const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
664
1505
 
665
- DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode);
1506
+ assert(searchMax != NULL);
666
1507
 
667
- /* init */
1508
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
668
1509
  ip += (dictAndPrefixLength == 0);
669
1510
  if (dictMode == ZSTD_noDict) {
670
- U32 const current = (U32)(ip - base);
671
- U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, ms->cParams.windowLog);
672
- U32 const maxRep = current - windowLow;
1511
+ U32 const curr = (U32)(ip - base);
1512
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
1513
+ U32 const maxRep = curr - windowLow;
673
1514
  if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
674
1515
  if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
675
1516
  }
676
- if (dictMode == ZSTD_dictMatchState) {
1517
+ if (isDxS) {
677
1518
  /* dictMatchState repCode checks don't currently handle repCode == 0
678
1519
  * disabling. */
679
1520
  assert(offset_1 <= dictAndPrefixLength);
680
1521
  assert(offset_2 <= dictAndPrefixLength);
681
1522
  }
682
1523
 
1524
+ if (searchMethod == search_rowHash) {
1525
+ const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1526
+ ZSTD_row_fillHashCache(ms, base, rowLog,
1527
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1528
+ ms->nextToUpdate, ilimit);
1529
+ }
1530
+
683
1531
  /* Match Loop */
684
1532
  #if defined(__GNUC__) && defined(__x86_64__)
685
1533
  /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
@@ -693,9 +1541,9 @@ ZSTD_compressBlock_lazy_generic(
693
1541
  const BYTE* start=ip+1;
694
1542
 
695
1543
  /* check repCode */
696
- if (dictMode == ZSTD_dictMatchState) {
1544
+ if (isDxS) {
697
1545
  const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
698
- const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
1546
+ const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
699
1547
  && repIndex < prefixLowestIndex) ?
700
1548
  dictBase + (repIndex - dictIndexDelta) :
701
1549
  base + repIndex;
@@ -736,7 +1584,7 @@ ZSTD_compressBlock_lazy_generic(
736
1584
  if ((mlRep >= 4) && (gain2 > gain1))
737
1585
  matchLength = mlRep, offset = 0, start = ip;
738
1586
  }
739
- if (dictMode == ZSTD_dictMatchState) {
1587
+ if (isDxS) {
740
1588
  const U32 repIndex = (U32)(ip - base) - offset_1;
741
1589
  const BYTE* repMatch = repIndex < prefixLowestIndex ?
742
1590
  dictBase + (repIndex - dictIndexDelta) :
@@ -771,7 +1619,7 @@ ZSTD_compressBlock_lazy_generic(
771
1619
  if ((mlRep >= 4) && (gain2 > gain1))
772
1620
  matchLength = mlRep, offset = 0, start = ip;
773
1621
  }
774
- if (dictMode == ZSTD_dictMatchState) {
1622
+ if (isDxS) {
775
1623
  const U32 repIndex = (U32)(ip - base) - offset_1;
776
1624
  const BYTE* repMatch = repIndex < prefixLowestIndex ?
777
1625
  dictBase + (repIndex - dictIndexDelta) :
@@ -809,8 +1657,8 @@ ZSTD_compressBlock_lazy_generic(
809
1657
  && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
810
1658
  { start--; matchLength++; }
811
1659
  }
812
- if (dictMode == ZSTD_dictMatchState) {
813
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
1660
+ if (isDxS) {
1661
+ U32 const matchIndex = (U32)((size_t)(start-base) - (offset - ZSTD_REP_MOVE));
814
1662
  const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
815
1663
  const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
816
1664
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
@@ -819,18 +1667,17 @@ ZSTD_compressBlock_lazy_generic(
819
1667
  }
820
1668
  /* store sequence */
821
1669
  _storeSequence:
822
- { size_t const litLength = start - anchor;
1670
+ { size_t const litLength = (size_t)(start - anchor);
823
1671
  ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
824
1672
  anchor = ip = start + matchLength;
825
1673
  }
826
1674
 
827
1675
  /* check immediate repcode */
828
- if (dictMode == ZSTD_dictMatchState) {
1676
+ if (isDxS) {
829
1677
  while (ip <= ilimit) {
830
1678
  U32 const current2 = (U32)(ip-base);
831
1679
  U32 const repIndex = current2 - offset_2;
832
- const BYTE* repMatch = dictMode == ZSTD_dictMatchState
833
- && repIndex < prefixLowestIndex ?
1680
+ const BYTE* repMatch = repIndex < prefixLowestIndex ?
834
1681
  dictBase - dictIndexDelta + repIndex :
835
1682
  base + repIndex;
836
1683
  if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
@@ -925,6 +1772,92 @@ size_t ZSTD_compressBlock_greedy_dictMatchState(
925
1772
  }
926
1773
 
927
1774
 
1775
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
1776
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1777
+ void const* src, size_t srcSize)
1778
+ {
1779
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
1780
+ }
1781
+
1782
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
1783
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1784
+ void const* src, size_t srcSize)
1785
+ {
1786
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
1787
+ }
1788
+
1789
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
1790
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1791
+ void const* src, size_t srcSize)
1792
+ {
1793
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
1794
+ }
1795
+
1796
+ /* Row-based matchfinder */
1797
+ size_t ZSTD_compressBlock_lazy2_row(
1798
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1799
+ void const* src, size_t srcSize)
1800
+ {
1801
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
1802
+ }
1803
+
1804
+ size_t ZSTD_compressBlock_lazy_row(
1805
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1806
+ void const* src, size_t srcSize)
1807
+ {
1808
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
1809
+ }
1810
+
1811
+ size_t ZSTD_compressBlock_greedy_row(
1812
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1813
+ void const* src, size_t srcSize)
1814
+ {
1815
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
1816
+ }
1817
+
1818
+ size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
1819
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1820
+ void const* src, size_t srcSize)
1821
+ {
1822
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
1823
+ }
1824
+
1825
+ size_t ZSTD_compressBlock_lazy_dictMatchState_row(
1826
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1827
+ void const* src, size_t srcSize)
1828
+ {
1829
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
1830
+ }
1831
+
1832
+ size_t ZSTD_compressBlock_greedy_dictMatchState_row(
1833
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1834
+ void const* src, size_t srcSize)
1835
+ {
1836
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
1837
+ }
1838
+
1839
+
1840
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
1841
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1842
+ void const* src, size_t srcSize)
1843
+ {
1844
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
1845
+ }
1846
+
1847
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
1848
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1849
+ void const* src, size_t srcSize)
1850
+ {
1851
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
1852
+ }
1853
+
1854
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
1855
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1856
+ void const* src, size_t srcSize)
1857
+ {
1858
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
1859
+ }
1860
+
928
1861
  FORCE_INLINE_TEMPLATE
929
1862
  size_t ZSTD_compressBlock_lazy_extDict_generic(
930
1863
  ZSTD_matchState_t* ms, seqStore_t* seqStore,
@@ -936,7 +1869,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
936
1869
  const BYTE* ip = istart;
937
1870
  const BYTE* anchor = istart;
938
1871
  const BYTE* const iend = istart + srcSize;
939
- const BYTE* const ilimit = iend - 8;
1872
+ const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
940
1873
  const BYTE* const base = ms->window.base;
941
1874
  const U32 dictLimit = ms->window.dictLimit;
942
1875
  const BYTE* const prefixStart = base + dictLimit;
@@ -944,18 +1877,20 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
944
1877
  const BYTE* const dictEnd = dictBase + dictLimit;
945
1878
  const BYTE* const dictStart = dictBase + ms->window.lowLimit;
946
1879
  const U32 windowLog = ms->cParams.windowLog;
1880
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
947
1881
 
948
- typedef size_t (*searchMax_f)(
949
- ZSTD_matchState_t* ms,
950
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
951
- searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
952
-
1882
+ searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
953
1883
  U32 offset_1 = rep[0], offset_2 = rep[1];
954
1884
 
955
- DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic");
1885
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
956
1886
 
957
1887
  /* init */
958
1888
  ip += (ip == prefixStart);
1889
+ if (searchMethod == search_rowHash) {
1890
+ ZSTD_row_fillHashCache(ms, base, rowLog,
1891
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1892
+ ms->nextToUpdate, ilimit);
1893
+ }
959
1894
 
960
1895
  /* Match Loop */
961
1896
  #if defined(__GNUC__) && defined(__x86_64__)
@@ -968,14 +1903,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
968
1903
  size_t matchLength=0;
969
1904
  size_t offset=0;
970
1905
  const BYTE* start=ip+1;
971
- U32 current = (U32)(ip-base);
1906
+ U32 curr = (U32)(ip-base);
972
1907
 
973
1908
  /* check repCode */
974
- { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current+1, windowLog);
975
- const U32 repIndex = (U32)(current+1 - offset_1);
1909
+ { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
1910
+ const U32 repIndex = (U32)(curr+1 - offset_1);
976
1911
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
977
1912
  const BYTE* const repMatch = repBase + repIndex;
978
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
1913
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
1914
+ & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
979
1915
  if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
980
1916
  /* repcode detected we should take it */
981
1917
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -990,7 +1926,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
990
1926
  matchLength = ml2, start = ip, offset=offsetFound;
991
1927
  }
992
1928
 
993
- if (matchLength < 4) {
1929
+ if (matchLength < 4) {
994
1930
  ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
995
1931
  continue;
996
1932
  }
@@ -999,14 +1935,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
999
1935
  if (depth>=1)
1000
1936
  while (ip<ilimit) {
1001
1937
  ip ++;
1002
- current++;
1938
+ curr++;
1003
1939
  /* check repCode */
1004
1940
  if (offset) {
1005
- const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current, windowLog);
1006
- const U32 repIndex = (U32)(current - offset_1);
1941
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
1942
+ const U32 repIndex = (U32)(curr - offset_1);
1007
1943
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1008
1944
  const BYTE* const repMatch = repBase + repIndex;
1009
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
1945
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
1946
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
1010
1947
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
1011
1948
  /* repcode detected */
1012
1949
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1030,14 +1967,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1030
1967
  /* let's find an even better one */
1031
1968
  if ((depth==2) && (ip<ilimit)) {
1032
1969
  ip ++;
1033
- current++;
1970
+ curr++;
1034
1971
  /* check repCode */
1035
1972
  if (offset) {
1036
- const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current, windowLog);
1037
- const U32 repIndex = (U32)(current - offset_1);
1973
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
1974
+ const U32 repIndex = (U32)(curr - offset_1);
1038
1975
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1039
1976
  const BYTE* const repMatch = repBase + repIndex;
1040
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
1977
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
1978
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
1041
1979
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
1042
1980
  /* repcode detected */
1043
1981
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1062,7 +2000,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1062
2000
 
1063
2001
  /* catch up */
1064
2002
  if (offset) {
1065
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
2003
+ U32 const matchIndex = (U32)((size_t)(start-base) - (offset - ZSTD_REP_MOVE));
1066
2004
  const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
1067
2005
  const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
1068
2006
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
@@ -1071,7 +2009,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1071
2009
 
1072
2010
  /* store sequence */
1073
2011
  _storeSequence:
1074
- { size_t const litLength = start - anchor;
2012
+ { size_t const litLength = (size_t)(start - anchor);
1075
2013
  ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
1076
2014
  anchor = ip = start + matchLength;
1077
2015
  }
@@ -1083,7 +2021,8 @@ _storeSequence:
1083
2021
  const U32 repIndex = repCurrent - offset_2;
1084
2022
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1085
2023
  const BYTE* const repMatch = repBase + repIndex;
1086
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
2024
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2025
+ & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
1087
2026
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
1088
2027
  /* repcode detected we should take it */
1089
2028
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1136,3 +2075,26 @@ size_t ZSTD_compressBlock_btlazy2_extDict(
1136
2075
  {
1137
2076
  return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
1138
2077
  }
2078
+
2079
+ size_t ZSTD_compressBlock_greedy_extDict_row(
2080
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2081
+ void const* src, size_t srcSize)
2082
+ {
2083
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
2084
+ }
2085
+
2086
+ size_t ZSTD_compressBlock_lazy_extDict_row(
2087
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2088
+ void const* src, size_t srcSize)
2089
+
2090
+ {
2091
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
2092
+ }
2093
+
2094
+ size_t ZSTD_compressBlock_lazy2_extDict_row(
2095
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2096
+ void const* src, size_t srcSize)
2097
+
2098
+ {
2099
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
2100
+ }