zstd-ruby 1.4.4.0 → 1.5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. checksums.yaml +4 -4
  2. data/.github/dependabot.yml +8 -0
  3. data/.github/workflows/ruby.yml +35 -0
  4. data/README.md +2 -2
  5. data/ext/zstdruby/extconf.rb +1 -0
  6. data/ext/zstdruby/libzstd/BUCK +5 -7
  7. data/ext/zstdruby/libzstd/Makefile +241 -173
  8. data/ext/zstdruby/libzstd/README.md +76 -18
  9. data/ext/zstdruby/libzstd/common/bitstream.h +75 -57
  10. data/ext/zstdruby/libzstd/common/compiler.h +196 -20
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  12. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  13. data/ext/zstdruby/libzstd/common/debug.h +22 -49
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +208 -76
  15. data/ext/zstdruby/libzstd/common/error_private.c +3 -1
  16. data/ext/zstdruby/libzstd/common/error_private.h +87 -4
  17. data/ext/zstdruby/libzstd/common/fse.h +51 -42
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +149 -57
  19. data/ext/zstdruby/libzstd/common/huf.h +60 -54
  20. data/ext/zstdruby/libzstd/common/mem.h +87 -98
  21. data/ext/zstdruby/libzstd/common/pool.c +23 -17
  22. data/ext/zstdruby/libzstd/common/pool.h +3 -3
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
  24. data/ext/zstdruby/libzstd/common/threading.c +10 -8
  25. data/ext/zstdruby/libzstd/common/threading.h +4 -3
  26. data/ext/zstdruby/libzstd/common/xxhash.c +15 -873
  27. data/ext/zstdruby/libzstd/common/xxhash.h +5572 -191
  28. data/ext/zstdruby/libzstd/common/zstd_common.c +10 -10
  29. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  30. data/ext/zstdruby/libzstd/common/zstd_internal.h +252 -108
  31. data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
  32. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  33. data/ext/zstdruby/libzstd/compress/fse_compress.c +105 -85
  34. data/ext/zstdruby/libzstd/compress/hist.c +41 -63
  35. data/ext/zstdruby/libzstd/compress/hist.h +13 -33
  36. data/ext/zstdruby/libzstd/compress/huf_compress.c +831 -259
  37. data/ext/zstdruby/libzstd/compress/zstd_compress.c +3213 -1007
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +493 -71
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +21 -16
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +4 -2
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +51 -24
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +10 -3
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +573 -0
  44. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  45. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +208 -81
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +315 -137
  47. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.c +319 -128
  49. data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1156 -171
  51. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +59 -1
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +331 -206
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
  54. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.c +403 -226
  56. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +188 -453
  58. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +32 -114
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1065 -410
  60. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +20 -16
  62. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +691 -230
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1072 -323
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +16 -7
  66. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +71 -10
  67. data/ext/zstdruby/libzstd/deprecated/zbuff.h +3 -3
  68. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
  69. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +24 -4
  70. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  71. data/ext/zstdruby/libzstd/dictBuilder/cover.c +57 -40
  72. data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
  73. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  74. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +54 -35
  75. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +151 -57
  76. data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
  77. data/ext/zstdruby/libzstd/dll/example/README.md +16 -22
  78. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +4 -4
  79. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +25 -19
  80. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +1 -1
  81. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +18 -14
  82. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +1 -1
  83. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +18 -14
  84. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +1 -1
  85. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +22 -16
  86. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +1 -1
  87. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +29 -25
  88. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +2 -2
  89. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +29 -25
  90. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +1 -1
  91. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +34 -26
  92. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +1 -1
  93. data/ext/zstdruby/libzstd/libzstd.mk +185 -0
  94. data/ext/zstdruby/libzstd/libzstd.pc.in +4 -3
  95. data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
  96. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +201 -31
  97. data/ext/zstdruby/libzstd/zstd.h +760 -234
  98. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +3 -1
  99. data/ext/zstdruby/zstdruby.c +2 -2
  100. data/lib/zstd-ruby/version.rb +1 -1
  101. metadata +20 -9
  102. data/.travis.yml +0 -14
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -58,11 +58,11 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
58
58
 
59
59
  /** ZSTD_insertDUBT1() :
60
60
  * sort one already inserted but unsorted position
61
- * assumption : current >= btlow == (current - btmask)
61
+ * assumption : curr >= btlow == (curr - btmask)
62
62
  * doesn't fail */
63
63
  static void
64
- ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
65
- U32 current, const BYTE* inputEnd,
64
+ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
65
+ U32 curr, const BYTE* inputEnd,
66
66
  U32 nbCompares, U32 btLow,
67
67
  const ZSTD_dictMode_e dictMode)
68
68
  {
@@ -74,41 +74,41 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
74
74
  const BYTE* const base = ms->window.base;
75
75
  const BYTE* const dictBase = ms->window.dictBase;
76
76
  const U32 dictLimit = ms->window.dictLimit;
77
- const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current;
78
- const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit;
77
+ const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
78
+ const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
79
79
  const BYTE* const dictEnd = dictBase + dictLimit;
80
80
  const BYTE* const prefixStart = base + dictLimit;
81
81
  const BYTE* match;
82
- U32* smallerPtr = bt + 2*(current&btMask);
82
+ U32* smallerPtr = bt + 2*(curr&btMask);
83
83
  U32* largerPtr = smallerPtr + 1;
84
84
  U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
85
85
  U32 dummy32; /* to be nullified at the end */
86
86
  U32 const windowValid = ms->window.lowLimit;
87
87
  U32 const maxDistance = 1U << cParams->windowLog;
88
- U32 const windowLow = (current - windowValid > maxDistance) ? current - maxDistance : windowValid;
88
+ U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;
89
89
 
90
90
 
91
91
  DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
92
- current, dictLimit, windowLow);
93
- assert(current >= btLow);
92
+ curr, dictLimit, windowLow);
93
+ assert(curr >= btLow);
94
94
  assert(ip < iend); /* condition for ZSTD_count */
95
95
 
96
- while (nbCompares-- && (matchIndex > windowLow)) {
96
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
97
97
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
98
98
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
99
- assert(matchIndex < current);
99
+ assert(matchIndex < curr);
100
100
  /* note : all candidates are now supposed sorted,
101
101
  * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
102
102
  * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
103
103
 
104
104
  if ( (dictMode != ZSTD_extDict)
105
105
  || (matchIndex+matchLength >= dictLimit) /* both in current segment*/
106
- || (current < dictLimit) /* both in extDict */) {
106
+ || (curr < dictLimit) /* both in extDict */) {
107
107
  const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
108
108
  || (matchIndex+matchLength >= dictLimit)) ?
109
109
  base : dictBase;
110
110
  assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */
111
- || (current < dictLimit) );
111
+ || (curr < dictLimit) );
112
112
  match = mBase + matchIndex;
113
113
  matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
114
114
  } else {
@@ -119,7 +119,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
119
119
  }
120
120
 
121
121
  DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
122
- current, matchIndex, (U32)matchLength);
122
+ curr, matchIndex, (U32)matchLength);
123
123
 
124
124
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
125
125
  break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
@@ -151,7 +151,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
151
151
 
152
152
  static size_t
153
153
  ZSTD_DUBT_findBetterDictMatch (
154
- ZSTD_matchState_t* ms,
154
+ const ZSTD_matchState_t* ms,
155
155
  const BYTE* const ip, const BYTE* const iend,
156
156
  size_t* offsetPtr,
157
157
  size_t bestLength,
@@ -168,7 +168,7 @@ ZSTD_DUBT_findBetterDictMatch (
168
168
 
169
169
  const BYTE* const base = ms->window.base;
170
170
  const BYTE* const prefixStart = base + ms->window.dictLimit;
171
- U32 const current = (U32)(ip-base);
171
+ U32 const curr = (U32)(ip-base);
172
172
  const BYTE* const dictBase = dms->window.base;
173
173
  const BYTE* const dictEnd = dms->window.nextSrc;
174
174
  U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
@@ -185,7 +185,7 @@ ZSTD_DUBT_findBetterDictMatch (
185
185
  (void)dictMode;
186
186
  assert(dictMode == ZSTD_dictMatchState);
187
187
 
188
- while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
188
+ for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
189
189
  U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
190
190
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
191
191
  const BYTE* match = dictBase + dictMatchIndex;
@@ -195,10 +195,10 @@ ZSTD_DUBT_findBetterDictMatch (
195
195
 
196
196
  if (matchLength > bestLength) {
197
197
  U32 matchIndex = dictMatchIndex + dictIndexDelta;
198
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
198
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
199
199
  DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
200
- current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex);
201
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
200
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
201
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
202
202
  }
203
203
  if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
204
204
  break; /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,9 +218,9 @@ ZSTD_DUBT_findBetterDictMatch (
218
218
  }
219
219
 
220
220
  if (bestLength >= MINMATCH) {
221
- U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
221
+ U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
222
222
  DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
223
- current, (U32)bestLength, (U32)*offsetPtr, mIndex);
223
+ curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
224
224
  }
225
225
  return bestLength;
226
226
 
@@ -241,13 +241,13 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
241
241
  U32 matchIndex = hashTable[h];
242
242
 
243
243
  const BYTE* const base = ms->window.base;
244
- U32 const current = (U32)(ip-base);
245
- U32 const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog);
244
+ U32 const curr = (U32)(ip-base);
245
+ U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);
246
246
 
247
247
  U32* const bt = ms->chainTable;
248
248
  U32 const btLog = cParams->chainLog - 1;
249
249
  U32 const btMask = (1 << btLog) - 1;
250
- U32 const btLow = (btMask >= current) ? 0 : current - btMask;
250
+ U32 const btLow = (btMask >= curr) ? 0 : curr - btMask;
251
251
  U32 const unsortLimit = MAX(btLow, windowLow);
252
252
 
253
253
  U32* nextCandidate = bt + 2*(matchIndex&btMask);
@@ -256,8 +256,9 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
256
256
  U32 nbCandidates = nbCompares;
257
257
  U32 previousCandidate = 0;
258
258
 
259
- DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current);
259
+ DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
260
260
  assert(ip <= iend-8); /* required for h calculation */
261
+ assert(dictMode != ZSTD_dedicatedDictSearch);
261
262
 
262
263
  /* reach end of unsorted candidates list */
263
264
  while ( (matchIndex > unsortLimit)
@@ -299,16 +300,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
299
300
  const U32 dictLimit = ms->window.dictLimit;
300
301
  const BYTE* const dictEnd = dictBase + dictLimit;
301
302
  const BYTE* const prefixStart = base + dictLimit;
302
- U32* smallerPtr = bt + 2*(current&btMask);
303
- U32* largerPtr = bt + 2*(current&btMask) + 1;
304
- U32 matchEndIdx = current + 8 + 1;
303
+ U32* smallerPtr = bt + 2*(curr&btMask);
304
+ U32* largerPtr = bt + 2*(curr&btMask) + 1;
305
+ U32 matchEndIdx = curr + 8 + 1;
305
306
  U32 dummy32; /* to be nullified at the end */
306
307
  size_t bestLength = 0;
307
308
 
308
309
  matchIndex = hashTable[h];
309
- hashTable[h] = current; /* Update Hash Table */
310
+ hashTable[h] = curr; /* Update Hash Table */
310
311
 
311
- while (nbCompares-- && (matchIndex > windowLow)) {
312
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
312
313
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
313
314
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
314
315
  const BYTE* match;
@@ -326,8 +327,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
326
327
  if (matchLength > bestLength) {
327
328
  if (matchLength > matchEndIdx - matchIndex)
328
329
  matchEndIdx = matchIndex + (U32)matchLength;
329
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
330
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
330
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
331
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
331
332
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
332
333
  if (dictMode == ZSTD_dictMatchState) {
333
334
  nbCompares = 0; /* in addition to avoiding checking any
@@ -356,6 +357,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
356
357
 
357
358
  *smallerPtr = *largerPtr = 0;
358
359
 
360
+ assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
359
361
  if (dictMode == ZSTD_dictMatchState && nbCompares) {
360
362
  bestLength = ZSTD_DUBT_findBetterDictMatch(
361
363
  ms, ip, iend,
@@ -363,12 +365,12 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
363
365
  mls, dictMode);
364
366
  }
365
367
 
366
- assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */
368
+ assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
367
369
  ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
368
370
  if (bestLength >= MINMATCH) {
369
- U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
371
+ U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
370
372
  DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
371
- current, (U32)bestLength, (U32)*offsetPtr, mIndex);
373
+ curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
372
374
  }
373
375
  return bestLength;
374
376
  }
@@ -389,56 +391,222 @@ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
389
391
  return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
390
392
  }
391
393
 
394
+ /***********************************
395
+ * Dedicated dict search
396
+ ***********************************/
392
397
 
393
- static size_t
394
- ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms,
395
- const BYTE* ip, const BYTE* const iLimit,
396
- size_t* offsetPtr)
398
+ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
397
399
  {
398
- switch(ms->cParams.minMatch)
400
+ const BYTE* const base = ms->window.base;
401
+ U32 const target = (U32)(ip - base);
402
+ U32* const hashTable = ms->hashTable;
403
+ U32* const chainTable = ms->chainTable;
404
+ U32 const chainSize = 1 << ms->cParams.chainLog;
405
+ U32 idx = ms->nextToUpdate;
406
+ U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
407
+ U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
408
+ U32 const cacheSize = bucketSize - 1;
409
+ U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
410
+ U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
411
+
412
+ /* We know the hashtable is oversized by a factor of `bucketSize`.
413
+ * We are going to temporarily pretend `bucketSize == 1`, keeping only a
414
+ * single entry. We will use the rest of the space to construct a temporary
415
+ * chaintable.
416
+ */
417
+ U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
418
+ U32* const tmpHashTable = hashTable;
419
+ U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
420
+ U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
421
+ U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
422
+ U32 hashIdx;
423
+
424
+ assert(ms->cParams.chainLog <= 24);
425
+ assert(ms->cParams.hashLog > ms->cParams.chainLog);
426
+ assert(idx != 0);
427
+ assert(tmpMinChain <= minChain);
428
+
429
+ /* fill conventional hash table and conventional chain table */
430
+ for ( ; idx < target; idx++) {
431
+ U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
432
+ if (idx >= tmpMinChain) {
433
+ tmpChainTable[idx - tmpMinChain] = hashTable[h];
434
+ }
435
+ tmpHashTable[h] = idx;
436
+ }
437
+
438
+ /* sort chains into ddss chain table */
399
439
  {
400
- default : /* includes case 3 */
401
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
402
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
403
- case 7 :
404
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
440
+ U32 chainPos = 0;
441
+ for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
442
+ U32 count;
443
+ U32 countBeyondMinChain = 0;
444
+ U32 i = tmpHashTable[hashIdx];
445
+ for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
446
+ /* skip through the chain to the first position that won't be
447
+ * in the hash cache bucket */
448
+ if (i < minChain) {
449
+ countBeyondMinChain++;
450
+ }
451
+ i = tmpChainTable[i - tmpMinChain];
452
+ }
453
+ if (count == cacheSize) {
454
+ for (count = 0; count < chainLimit;) {
455
+ if (i < minChain) {
456
+ if (!i || ++countBeyondMinChain > cacheSize) {
457
+ /* only allow pulling `cacheSize` number of entries
458
+ * into the cache or chainTable beyond `minChain`,
459
+ * to replace the entries pulled out of the
460
+ * chainTable into the cache. This lets us reach
461
+ * back further without increasing the total number
462
+ * of entries in the chainTable, guaranteeing the
463
+ * DDSS chain table will fit into the space
464
+ * allocated for the regular one. */
465
+ break;
466
+ }
467
+ }
468
+ chainTable[chainPos++] = i;
469
+ count++;
470
+ if (i < tmpMinChain) {
471
+ break;
472
+ }
473
+ i = tmpChainTable[i - tmpMinChain];
474
+ }
475
+ } else {
476
+ count = 0;
477
+ }
478
+ if (count) {
479
+ tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
480
+ } else {
481
+ tmpHashTable[hashIdx] = 0;
482
+ }
483
+ }
484
+ assert(chainPos <= chainSize); /* I believe this is guaranteed... */
405
485
  }
486
+
487
+ /* move chain pointers into the last entry of each hash bucket */
488
+ for (hashIdx = (1 << hashLog); hashIdx; ) {
489
+ U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
490
+ U32 const chainPackedPointer = tmpHashTable[hashIdx];
491
+ U32 i;
492
+ for (i = 0; i < cacheSize; i++) {
493
+ hashTable[bucketIdx + i] = 0;
494
+ }
495
+ hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
496
+ }
497
+
498
+ /* fill the buckets of the hash table */
499
+ for (idx = ms->nextToUpdate; idx < target; idx++) {
500
+ U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
501
+ << ZSTD_LAZY_DDSS_BUCKET_LOG;
502
+ U32 i;
503
+ /* Shift hash cache down 1. */
504
+ for (i = cacheSize - 1; i; i--)
505
+ hashTable[h + i] = hashTable[h + i - 1];
506
+ hashTable[h] = idx;
507
+ }
508
+
509
+ ms->nextToUpdate = target;
406
510
  }
407
511
 
512
+ /* Returns the longest match length found in the dedicated dict search structure.
513
+ * If none are longer than the argument ml, then ml will be returned.
514
+ */
515
+ FORCE_INLINE_TEMPLATE
516
+ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
517
+ const ZSTD_matchState_t* const dms,
518
+ const BYTE* const ip, const BYTE* const iLimit,
519
+ const BYTE* const prefixStart, const U32 curr,
520
+ const U32 dictLimit, const size_t ddsIdx) {
521
+ const U32 ddsLowestIndex = dms->window.dictLimit;
522
+ const BYTE* const ddsBase = dms->window.base;
523
+ const BYTE* const ddsEnd = dms->window.nextSrc;
524
+ const U32 ddsSize = (U32)(ddsEnd - ddsBase);
525
+ const U32 ddsIndexDelta = dictLimit - ddsSize;
526
+ const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
527
+ const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
528
+ U32 ddsAttempt;
529
+ U32 matchIndex;
530
+
531
+ for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
532
+ PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
533
+ }
408
534
 
409
- static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
410
- ZSTD_matchState_t* ms,
411
- const BYTE* ip, const BYTE* const iLimit,
412
- size_t* offsetPtr)
413
- {
414
- switch(ms->cParams.minMatch)
415
535
  {
416
- default : /* includes case 3 */
417
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
418
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
419
- case 7 :
420
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
536
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
537
+ U32 const chainIndex = chainPackedPointer >> 8;
538
+
539
+ PREFETCH_L1(&dms->chainTable[chainIndex]);
421
540
  }
422
- }
423
541
 
542
+ for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
543
+ size_t currentMl=0;
544
+ const BYTE* match;
545
+ matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
546
+ match = ddsBase + matchIndex;
547
+
548
+ if (!matchIndex) {
549
+ return ml;
550
+ }
551
+
552
+ /* guaranteed by table construction */
553
+ (void)ddsLowestIndex;
554
+ assert(matchIndex >= ddsLowestIndex);
555
+ assert(match+4 <= ddsEnd);
556
+ if (MEM_read32(match) == MEM_read32(ip)) {
557
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
558
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
559
+ }
560
+
561
+ /* save best solution */
562
+ if (currentMl > ml) {
563
+ ml = currentMl;
564
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
565
+ if (ip+currentMl == iLimit) {
566
+ /* best possible, avoids read overflow on next attempt */
567
+ return ml;
568
+ }
569
+ }
570
+ }
424
571
 
425
- static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
426
- ZSTD_matchState_t* ms,
427
- const BYTE* ip, const BYTE* const iLimit,
428
- size_t* offsetPtr)
429
- {
430
- switch(ms->cParams.minMatch)
431
572
  {
432
- default : /* includes case 3 */
433
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
434
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
435
- case 7 :
436
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
573
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
574
+ U32 chainIndex = chainPackedPointer >> 8;
575
+ U32 const chainLength = chainPackedPointer & 0xFF;
576
+ U32 const chainAttempts = nbAttempts - ddsAttempt;
577
+ U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
578
+ U32 chainAttempt;
579
+
580
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
581
+ PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
582
+ }
583
+
584
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
585
+ size_t currentMl=0;
586
+ const BYTE* match;
587
+ matchIndex = dms->chainTable[chainIndex];
588
+ match = ddsBase + matchIndex;
589
+
590
+ /* guaranteed by table construction */
591
+ assert(matchIndex >= ddsLowestIndex);
592
+ assert(match+4 <= ddsEnd);
593
+ if (MEM_read32(match) == MEM_read32(ip)) {
594
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
595
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
596
+ }
597
+
598
+ /* save best solution */
599
+ if (currentMl > ml) {
600
+ ml = currentMl;
601
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
602
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
603
+ }
604
+ }
437
605
  }
606
+ return ml;
438
607
  }
439
608
 
440
609
 
441
-
442
610
  /* *********************************
443
611
  * Hash Chain
444
612
  ***********************************/
@@ -446,7 +614,7 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
446
614
 
447
615
  /* Update chains up to ip (excluded)
448
616
  Assumption : always within prefix (i.e. not within extDict) */
449
- static U32 ZSTD_insertAndFindFirstIndex_internal(
617
+ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
450
618
  ZSTD_matchState_t* ms,
451
619
  const ZSTD_compressionParameters* const cParams,
452
620
  const BYTE* ip, U32 const mls)
@@ -475,10 +643,9 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
475
643
  return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
476
644
  }
477
645
 
478
-
479
646
  /* inlining is important to hardwire a hot branch (template emulation) */
480
647
  FORCE_INLINE_TEMPLATE
481
- size_t ZSTD_HcFindBestMatch_generic (
648
+ size_t ZSTD_HcFindBestMatch(
482
649
  ZSTD_matchState_t* ms,
483
650
  const BYTE* const ip, const BYTE* const iLimit,
484
651
  size_t* offsetPtr,
@@ -493,20 +660,33 @@ size_t ZSTD_HcFindBestMatch_generic (
493
660
  const U32 dictLimit = ms->window.dictLimit;
494
661
  const BYTE* const prefixStart = base + dictLimit;
495
662
  const BYTE* const dictEnd = dictBase + dictLimit;
496
- const U32 current = (U32)(ip-base);
663
+ const U32 curr = (U32)(ip-base);
497
664
  const U32 maxDistance = 1U << cParams->windowLog;
498
665
  const U32 lowestValid = ms->window.lowLimit;
499
- const U32 withinMaxDistance = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid;
666
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
500
667
  const U32 isDictionary = (ms->loadedDictEnd != 0);
501
668
  const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
502
- const U32 minChain = current > chainSize ? current - chainSize : 0;
669
+ const U32 minChain = curr > chainSize ? curr - chainSize : 0;
503
670
  U32 nbAttempts = 1U << cParams->searchLog;
504
671
  size_t ml=4-1;
505
672
 
673
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
674
+ const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
675
+ ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
676
+ const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
677
+ ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
678
+
679
+ U32 matchIndex;
680
+
681
+ if (dictMode == ZSTD_dedicatedDictSearch) {
682
+ const U32* entry = &dms->hashTable[ddsIdx];
683
+ PREFETCH_L1(entry);
684
+ }
685
+
506
686
  /* HC4 match finder */
507
- U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
687
+ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
508
688
 
509
- for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
689
+ for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
510
690
  size_t currentMl=0;
511
691
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
512
692
  const BYTE* const match = base + matchIndex;
@@ -523,7 +703,7 @@ size_t ZSTD_HcFindBestMatch_generic (
523
703
  /* save best solution */
524
704
  if (currentMl > ml) {
525
705
  ml = currentMl;
526
- *offsetPtr = current - matchIndex + ZSTD_REP_MOVE;
706
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
527
707
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
528
708
  }
529
709
 
@@ -531,8 +711,11 @@ size_t ZSTD_HcFindBestMatch_generic (
531
711
  matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
532
712
  }
533
713
 
534
- if (dictMode == ZSTD_dictMatchState) {
535
- const ZSTD_matchState_t* const dms = ms->dictMatchState;
714
+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
715
+ if (dictMode == ZSTD_dedicatedDictSearch) {
716
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
717
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
718
+ } else if (dictMode == ZSTD_dictMatchState) {
536
719
  const U32* const dmsChainTable = dms->chainTable;
537
720
  const U32 dmsChainSize = (1 << dms->cParams.chainLog);
538
721
  const U32 dmsChainMask = dmsChainSize - 1;
@@ -545,7 +728,7 @@ size_t ZSTD_HcFindBestMatch_generic (
545
728
 
546
729
  matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
547
730
 
548
- for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
731
+ for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
549
732
  size_t currentMl=0;
550
733
  const BYTE* const match = dmsBase + matchIndex;
551
734
  assert(match+4 <= dmsEnd);
@@ -555,11 +738,12 @@ size_t ZSTD_HcFindBestMatch_generic (
555
738
  /* save best solution */
556
739
  if (currentMl > ml) {
557
740
  ml = currentMl;
558
- *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
741
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
559
742
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
560
743
  }
561
744
 
562
745
  if (matchIndex <= dmsMinChain) break;
746
+
563
747
  matchIndex = dmsChainTable[matchIndex & dmsChainMask];
564
748
  }
565
749
  }
@@ -567,59 +751,724 @@ size_t ZSTD_HcFindBestMatch_generic (
567
751
  return ml;
568
752
  }
569
753
 
754
+ /* *********************************
755
+ * (SIMD) Row-based matchfinder
756
+ ***********************************/
757
+ /* Constants for row-based hash */
758
+ #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
759
+ #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
760
+ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
761
+ #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
570
762
 
571
- FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
572
- ZSTD_matchState_t* ms,
573
- const BYTE* ip, const BYTE* const iLimit,
574
- size_t* offsetPtr)
763
+ #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
764
+
765
+ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */
766
+
767
+ /* ZSTD_VecMask_next():
768
+ * Starting from the LSB, returns the idx of the next non-zero bit.
769
+ * Basically counting the nb of trailing zeroes.
770
+ */
771
+ static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
772
+ assert(val != 0);
773
+ # if defined(_MSC_VER) && defined(_WIN64)
774
+ if (val != 0) {
775
+ unsigned long r;
776
+ _BitScanForward64(&r, val);
777
+ return (U32)(r);
778
+ } else {
779
+ /* Should not reach this code path */
780
+ __assume(0);
781
+ }
782
+ # elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
783
+ if (sizeof(size_t) == 4) {
784
+ U32 mostSignificantWord = (U32)(val >> 32);
785
+ U32 leastSignificantWord = (U32)val;
786
+ if (leastSignificantWord == 0) {
787
+ return 32 + (U32)__builtin_ctz(mostSignificantWord);
788
+ } else {
789
+ return (U32)__builtin_ctz(leastSignificantWord);
790
+ }
791
+ } else {
792
+ return (U32)__builtin_ctzll(val);
793
+ }
794
+ # else
795
+ /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
796
+ * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
797
+ */
798
+ val = ~val & (val - 1ULL); /* Lowest set bit mask */
799
+ val = val - ((val >> 1) & 0x5555555555555555);
800
+ val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
801
+ return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
802
+ # endif
803
+ }
804
+
805
+ /* ZSTD_rotateRight_*():
806
+ * Rotates a bitfield to the right by "count" bits.
807
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
808
+ */
809
+ FORCE_INLINE_TEMPLATE
810
+ U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
811
+ assert(count < 64);
812
+ count &= 0x3F; /* for fickle pattern recognition */
813
+ return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
814
+ }
815
+
816
+ FORCE_INLINE_TEMPLATE
817
+ U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
818
+ assert(count < 32);
819
+ count &= 0x1F; /* for fickle pattern recognition */
820
+ return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
821
+ }
822
+
823
+ FORCE_INLINE_TEMPLATE
824
+ U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
825
+ assert(count < 16);
826
+ count &= 0x0F; /* for fickle pattern recognition */
827
+ return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
828
+ }
829
+
830
+ /* ZSTD_row_nextIndex():
831
+ * Returns the next index to insert at within a tagTable row, and updates the "head"
832
+ * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
833
+ */
834
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
835
+ U32 const next = (*tagRow - 1) & rowMask;
836
+ *tagRow = (BYTE)next;
837
+ return next;
838
+ }
839
+
840
+ /* ZSTD_isAligned():
841
+ * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
842
+ */
843
+ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
844
+ assert((align & (align - 1)) == 0);
845
+ return (((size_t)ptr) & (align - 1)) == 0;
846
+ }
847
+
848
+ /* ZSTD_row_prefetch():
849
+ * Performs prefetching for the hashTable and tagTable at a given row.
850
+ */
851
+ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
852
+ PREFETCH_L1(hashTable + relRow);
853
+ if (rowLog >= 5) {
854
+ PREFETCH_L1(hashTable + relRow + 16);
855
+ /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
856
+ }
857
+ PREFETCH_L1(tagTable + relRow);
858
+ if (rowLog == 6) {
859
+ PREFETCH_L1(tagTable + relRow + 32);
860
+ }
861
+ assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
862
+ assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
863
+ assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
864
+ }
865
+
866
+ /* ZSTD_row_fillHashCache():
867
+ * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
868
+ * but not beyond iLimit.
869
+ */
870
+ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
871
+ U32 const rowLog, U32 const mls,
872
+ U32 idx, const BYTE* const iLimit)
575
873
  {
576
- switch(ms->cParams.minMatch)
577
- {
578
- default : /* includes case 3 */
579
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
580
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
581
- case 7 :
582
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
874
+ U32 const* const hashTable = ms->hashTable;
875
+ U16 const* const tagTable = ms->tagTable;
876
+ U32 const hashLog = ms->rowHashLog;
877
+ U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
878
+ U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
879
+
880
+ for (; idx < lim; ++idx) {
881
+ U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
882
+ U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
883
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
884
+ ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
583
885
  }
886
+
887
+ DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
888
+ ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
889
+ ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
584
890
  }
585
891
 
892
+ /* ZSTD_row_nextCachedHash():
893
+ * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
894
+ * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
895
+ */
896
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
897
+ U16 const* tagTable, BYTE const* base,
898
+ U32 idx, U32 const hashLog,
899
+ U32 const rowLog, U32 const mls)
900
+ {
901
+ U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
902
+ U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
903
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
904
+ { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
905
+ cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
906
+ return hash;
907
+ }
908
+ }
586
909
 
587
- static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
588
- ZSTD_matchState_t* ms,
589
- const BYTE* ip, const BYTE* const iLimit,
590
- size_t* offsetPtr)
910
+ /* ZSTD_row_update_internalImpl():
911
+ * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
912
+ */
913
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
914
+ U32 updateStartIdx, U32 const updateEndIdx,
915
+ U32 const mls, U32 const rowLog,
916
+ U32 const rowMask, U32 const useCache)
591
917
  {
592
- switch(ms->cParams.minMatch)
593
- {
594
- default : /* includes case 3 */
595
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
596
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
597
- case 7 :
598
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
918
+ U32* const hashTable = ms->hashTable;
919
+ U16* const tagTable = ms->tagTable;
920
+ U32 const hashLog = ms->rowHashLog;
921
+ const BYTE* const base = ms->window.base;
922
+
923
+ DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
924
+ for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
925
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
926
+ : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
927
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
928
+ U32* const row = hashTable + relRow;
929
+ BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
930
+ Explicit cast allows us to get exact desired position within each row */
931
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
932
+
933
+ assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
934
+ ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
935
+ row[pos] = updateStartIdx;
599
936
  }
600
937
  }
601
938
 
939
+ /* ZSTD_row_update_internal():
940
+ * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
941
+ * Skips sections of long matches as is necessary.
942
+ */
943
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
944
+ U32 const mls, U32 const rowLog,
945
+ U32 const rowMask, U32 const useCache)
946
+ {
947
+ U32 idx = ms->nextToUpdate;
948
+ const BYTE* const base = ms->window.base;
949
+ const U32 target = (U32)(ip - base);
950
+ const U32 kSkipThreshold = 384;
951
+ const U32 kMaxMatchStartPositionsToUpdate = 96;
952
+ const U32 kMaxMatchEndPositionsToUpdate = 32;
953
+
954
+ if (useCache) {
955
+ /* Only skip positions when using hash cache, i.e.
956
+ * if we are loading a dict, don't skip anything.
957
+ * If we decide to skip, then we only update a set number
958
+ * of positions at the beginning and end of the match.
959
+ */
960
+ if (UNLIKELY(target - idx > kSkipThreshold)) {
961
+ U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
962
+ ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
963
+ idx = target - kMaxMatchEndPositionsToUpdate;
964
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
965
+ }
966
+ }
967
+ assert(target >= idx);
968
+ ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
969
+ ms->nextToUpdate = target;
970
+ }
602
971
 
603
- FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
972
+ /* ZSTD_row_update():
973
+ * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
974
+ * processing.
975
+ */
976
+ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
977
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
978
+ const U32 rowMask = (1u << rowLog) - 1;
979
+ const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
980
+
981
+ DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
982
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
983
+ }
984
+
985
+ #if defined(ZSTD_ARCH_X86_SSE2)
986
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
987
+ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
988
+ {
989
+ const __m128i comparisonMask = _mm_set1_epi8((char)tag);
990
+ int matches[4] = {0};
991
+ int i;
992
+ assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
993
+ for (i=0; i<nbChunks; i++) {
994
+ const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i));
995
+ const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
996
+ matches[i] = _mm_movemask_epi8(equalMask);
997
+ }
998
+ if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head);
999
+ if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
1000
+ assert(nbChunks == 4);
1001
+ return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
1002
+ }
1003
+ #endif
1004
+
1005
+ /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
1006
+ * the hash at the nth position in a row of the tagTable.
1007
+ * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1008
+ * to match up with the actual layout of the entries within the hashTable */
1009
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
1010
+ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
1011
+ {
1012
+ const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
1013
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
1014
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
1015
+
1016
+ #if defined(ZSTD_ARCH_X86_SSE2)
1017
+
1018
+ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
1019
+
1020
+ #else /* SW or NEON-LE */
1021
+
1022
+ # if defined(ZSTD_ARCH_ARM_NEON)
1023
+ /* This NEON path only works for little endian - otherwise use SWAR below */
1024
+ if (MEM_isLittleEndian()) {
1025
+ if (rowEntries == 16) {
1026
+ const uint8x16_t chunk = vld1q_u8(src);
1027
+ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
1028
+ const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
1029
+ const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
1030
+ const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
1031
+ const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
1032
+ const U16 hi = (U16)vgetq_lane_u8(t3, 8);
1033
+ const U16 lo = (U16)vgetq_lane_u8(t3, 0);
1034
+ return ZSTD_rotateRight_U16((hi << 8) | lo, head);
1035
+ } else if (rowEntries == 32) {
1036
+ const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
1037
+ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
1038
+ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
1039
+ const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
1040
+ const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
1041
+ const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
1042
+ const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
1043
+ const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
1044
+ const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
1045
+ const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
1046
+ const uint8x8x2_t t3 = vuzp_u8(t2, t0);
1047
+ const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
1048
+ const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
1049
+ return ZSTD_rotateRight_U32(matches, head);
1050
+ } else { /* rowEntries == 64 */
1051
+ const uint8x16x4_t chunk = vld4q_u8(src);
1052
+ const uint8x16_t dup = vdupq_n_u8(tag);
1053
+ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
1054
+ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
1055
+ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
1056
+ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
1057
+
1058
+ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
1059
+ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
1060
+ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
1061
+ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
1062
+ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
1063
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
1064
+ return ZSTD_rotateRight_U64(matches, head);
1065
+ }
1066
+ }
1067
+ # endif /* ZSTD_ARCH_ARM_NEON */
1068
+ /* SWAR */
1069
+ { const size_t chunkSize = sizeof(size_t);
1070
+ const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
1071
+ const size_t xFF = ~((size_t)0);
1072
+ const size_t x01 = xFF / 0xFF;
1073
+ const size_t x80 = x01 << 7;
1074
+ const size_t splatChar = tag * x01;
1075
+ ZSTD_VecMask matches = 0;
1076
+ int i = rowEntries - chunkSize;
1077
+ assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
1078
+ if (MEM_isLittleEndian()) { /* runtime check so have two loops */
1079
+ const size_t extractMagic = (xFF / 0x7F) >> chunkSize;
1080
+ do {
1081
+ size_t chunk = MEM_readST(&src[i]);
1082
+ chunk ^= splatChar;
1083
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
1084
+ matches <<= chunkSize;
1085
+ matches |= (chunk * extractMagic) >> shiftAmount;
1086
+ i -= chunkSize;
1087
+ } while (i >= 0);
1088
+ } else { /* big endian: reverse bits during extraction */
1089
+ const size_t msb = xFF ^ (xFF >> 1);
1090
+ const size_t extractMagic = (msb / 0x1FF) | msb;
1091
+ do {
1092
+ size_t chunk = MEM_readST(&src[i]);
1093
+ chunk ^= splatChar;
1094
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
1095
+ matches <<= chunkSize;
1096
+ matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
1097
+ i -= chunkSize;
1098
+ } while (i >= 0);
1099
+ }
1100
+ matches = ~matches;
1101
+ if (rowEntries == 16) {
1102
+ return ZSTD_rotateRight_U16((U16)matches, head);
1103
+ } else if (rowEntries == 32) {
1104
+ return ZSTD_rotateRight_U32((U32)matches, head);
1105
+ } else {
1106
+ return ZSTD_rotateRight_U64((U64)matches, head);
1107
+ }
1108
+ }
1109
+ #endif
1110
+ }
1111
+
1112
+ /* The high-level approach of the SIMD row based match finder is as follows:
1113
+ * - Figure out where to insert the new entry:
1114
+ * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
1115
+ * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
1116
+ * which row to insert into.
1117
+ * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
1118
+ * be considered as a circular buffer with a "head" index that resides in the tagTable.
1119
+ * - Also insert the "tag" into the equivalent row and position in the tagTable.
1120
+ * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
1121
+ * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
1122
+ * for alignment/performance reasons, leaving some bytes unused.
1123
+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
1124
+ * generate a bitfield that we can cycle through to check the collisions in the hash table.
1125
+ * - Pick the longest match.
1126
+ */
1127
+ FORCE_INLINE_TEMPLATE
1128
+ size_t ZSTD_RowFindBestMatch(
604
1129
  ZSTD_matchState_t* ms,
605
- const BYTE* ip, const BYTE* const iLimit,
606
- size_t* offsetPtr)
1130
+ const BYTE* const ip, const BYTE* const iLimit,
1131
+ size_t* offsetPtr,
1132
+ const U32 mls, const ZSTD_dictMode_e dictMode,
1133
+ const U32 rowLog)
607
1134
  {
608
- switch(ms->cParams.minMatch)
609
- {
610
- default : /* includes case 3 */
611
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
612
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
613
- case 7 :
614
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
1135
+ U32* const hashTable = ms->hashTable;
1136
+ U16* const tagTable = ms->tagTable;
1137
+ U32* const hashCache = ms->hashCache;
1138
+ const U32 hashLog = ms->rowHashLog;
1139
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
1140
+ const BYTE* const base = ms->window.base;
1141
+ const BYTE* const dictBase = ms->window.dictBase;
1142
+ const U32 dictLimit = ms->window.dictLimit;
1143
+ const BYTE* const prefixStart = base + dictLimit;
1144
+ const BYTE* const dictEnd = dictBase + dictLimit;
1145
+ const U32 curr = (U32)(ip-base);
1146
+ const U32 maxDistance = 1U << cParams->windowLog;
1147
+ const U32 lowestValid = ms->window.lowLimit;
1148
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
1149
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
1150
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
1151
+ const U32 rowEntries = (1U << rowLog);
1152
+ const U32 rowMask = rowEntries - 1;
1153
+ const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
1154
+ U32 nbAttempts = 1U << cappedSearchLog;
1155
+ size_t ml=4-1;
1156
+
1157
+ /* DMS/DDS variables that may be referenced laster */
1158
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
1159
+
1160
+ /* Initialize the following variables to satisfy static analyzer */
1161
+ size_t ddsIdx = 0;
1162
+ U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
1163
+ U32 dmsTag = 0;
1164
+ U32* dmsRow = NULL;
1165
+ BYTE* dmsTagRow = NULL;
1166
+
1167
+ if (dictMode == ZSTD_dedicatedDictSearch) {
1168
+ const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
1169
+ { /* Prefetch DDS hashtable entry */
1170
+ ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
1171
+ PREFETCH_L1(&dms->hashTable[ddsIdx]);
1172
+ }
1173
+ ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
1174
+ }
1175
+
1176
+ if (dictMode == ZSTD_dictMatchState) {
1177
+ /* Prefetch DMS rows */
1178
+ U32* const dmsHashTable = dms->hashTable;
1179
+ U16* const dmsTagTable = dms->tagTable;
1180
+ U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1181
+ U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1182
+ dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
1183
+ dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
1184
+ dmsRow = dmsHashTable + dmsRelRow;
1185
+ ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
1186
+ }
1187
+
1188
+ /* Update the hashTable and tagTable up to (but not including) ip */
1189
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
1190
+ { /* Get the hash for ip, compute the appropriate row */
1191
+ U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
1192
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1193
+ U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
1194
+ U32* const row = hashTable + relRow;
1195
+ BYTE* tagRow = (BYTE*)(tagTable + relRow);
1196
+ U32 const head = *tagRow & rowMask;
1197
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1198
+ size_t numMatches = 0;
1199
+ size_t currMatch = 0;
1200
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
1201
+
1202
+ /* Cycle through the matches and prefetch */
1203
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1204
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1205
+ U32 const matchIndex = row[matchPos];
1206
+ assert(numMatches < rowEntries);
1207
+ if (matchIndex < lowLimit)
1208
+ break;
1209
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1210
+ PREFETCH_L1(base + matchIndex);
1211
+ } else {
1212
+ PREFETCH_L1(dictBase + matchIndex);
1213
+ }
1214
+ matchBuffer[numMatches++] = matchIndex;
1215
+ }
1216
+
1217
+ /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
1218
+ in ZSTD_row_update_internal() at the next search. */
1219
+ {
1220
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1221
+ tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
1222
+ row[pos] = ms->nextToUpdate++;
1223
+ }
1224
+
1225
+ /* Return the longest match */
1226
+ for (; currMatch < numMatches; ++currMatch) {
1227
+ U32 const matchIndex = matchBuffer[currMatch];
1228
+ size_t currentMl=0;
1229
+ assert(matchIndex < curr);
1230
+ assert(matchIndex >= lowLimit);
1231
+
1232
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1233
+ const BYTE* const match = base + matchIndex;
1234
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
1235
+ if (match[ml] == ip[ml]) /* potentially better */
1236
+ currentMl = ZSTD_count(ip, match, iLimit);
1237
+ } else {
1238
+ const BYTE* const match = dictBase + matchIndex;
1239
+ assert(match+4 <= dictEnd);
1240
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
1241
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
1242
+ }
1243
+
1244
+ /* Save best solution */
1245
+ if (currentMl > ml) {
1246
+ ml = currentMl;
1247
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
1248
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
1249
+ }
1250
+ }
1251
+ }
1252
+
1253
+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
1254
+ if (dictMode == ZSTD_dedicatedDictSearch) {
1255
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
1256
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
1257
+ } else if (dictMode == ZSTD_dictMatchState) {
1258
+ /* TODO: Measure and potentially add prefetching to DMS */
1259
+ const U32 dmsLowestIndex = dms->window.dictLimit;
1260
+ const BYTE* const dmsBase = dms->window.base;
1261
+ const BYTE* const dmsEnd = dms->window.nextSrc;
1262
+ const U32 dmsSize = (U32)(dmsEnd - dmsBase);
1263
+ const U32 dmsIndexDelta = dictLimit - dmsSize;
1264
+
1265
+ { U32 const head = *dmsTagRow & rowMask;
1266
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
1267
+ size_t numMatches = 0;
1268
+ size_t currMatch = 0;
1269
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
1270
+
1271
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1272
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1273
+ U32 const matchIndex = dmsRow[matchPos];
1274
+ if (matchIndex < dmsLowestIndex)
1275
+ break;
1276
+ PREFETCH_L1(dmsBase + matchIndex);
1277
+ matchBuffer[numMatches++] = matchIndex;
1278
+ }
1279
+
1280
+ /* Return the longest match */
1281
+ for (; currMatch < numMatches; ++currMatch) {
1282
+ U32 const matchIndex = matchBuffer[currMatch];
1283
+ size_t currentMl=0;
1284
+ assert(matchIndex >= dmsLowestIndex);
1285
+ assert(matchIndex < curr);
1286
+
1287
+ { const BYTE* const match = dmsBase + matchIndex;
1288
+ assert(match+4 <= dmsEnd);
1289
+ if (MEM_read32(match) == MEM_read32(ip))
1290
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
1291
+ }
1292
+
1293
+ if (currentMl > ml) {
1294
+ ml = currentMl;
1295
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
1296
+ if (ip+currentMl == iLimit) break;
1297
+ }
1298
+ }
1299
+ }
615
1300
  }
1301
+ return ml;
616
1302
  }
617
1303
 
618
1304
 
1305
+ typedef size_t (*searchMax_f)(
1306
+ ZSTD_matchState_t* ms,
1307
+ const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1308
+
1309
+ /**
1310
+ * This struct contains the functions necessary for lazy to search.
1311
+ * Currently, that is only searchMax. However, it is still valuable to have the
1312
+ * VTable because this makes it easier to add more functions to the VTable later.
1313
+ *
1314
+ * TODO: The start of the search function involves loading and calculating a
1315
+ * bunch of constants from the ZSTD_matchState_t. These computations could be
1316
+ * done in an initialization function, and saved somewhere in the match state.
1317
+ * Then we could pass a pointer to the saved state instead of the match state,
1318
+ * and avoid duplicate computations.
1319
+ *
1320
+ * TODO: Move the match re-winding into searchMax. This improves compression
1321
+ * ratio, and unlocks further simplifications with the next TODO.
1322
+ *
1323
+ * TODO: Try moving the repcode search into searchMax. After the re-winding
1324
+ * and repcode search are in searchMax, there is no more logic in the match
1325
+ * finder loop that requires knowledge about the dictMode. So we should be
1326
+ * able to avoid force inlining it, and we can join the extDict loop with
1327
+ * the single segment loop. It should go in searchMax instead of its own
1328
+ * function to avoid having multiple virtual function calls per search.
1329
+ */
1330
+ typedef struct {
1331
+ searchMax_f searchMax;
1332
+ } ZSTD_LazyVTable;
1333
+
1334
+ #define GEN_ZSTD_BT_VTABLE(dictMode, mls) \
1335
+ static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \
1336
+ ZSTD_matchState_t* ms, \
1337
+ const BYTE* ip, const BYTE* const iLimit, \
1338
+ size_t* offsetPtr) \
1339
+ { \
1340
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1341
+ return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1342
+ } \
1343
+ static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \
1344
+ ZSTD_BtFindBestMatch_##dictMode##_##mls \
1345
+ };
1346
+
1347
+ #define GEN_ZSTD_HC_VTABLE(dictMode, mls) \
1348
+ static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \
1349
+ ZSTD_matchState_t* ms, \
1350
+ const BYTE* ip, const BYTE* const iLimit, \
1351
+ size_t* offsetPtr) \
1352
+ { \
1353
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1354
+ return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1355
+ } \
1356
+ static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
1357
+ ZSTD_HcFindBestMatch_##dictMode##_##mls \
1358
+ };
1359
+
1360
+ #define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \
1361
+ static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \
1362
+ ZSTD_matchState_t* ms, \
1363
+ const BYTE* ip, const BYTE* const iLimit, \
1364
+ size_t* offsetPtr) \
1365
+ { \
1366
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1367
+ assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
1368
+ return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
1369
+ } \
1370
+ static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \
1371
+ ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \
1372
+ };
1373
+
1374
+ #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
1375
+ X(dictMode, mls, 4) \
1376
+ X(dictMode, mls, 5) \
1377
+ X(dictMode, mls, 6)
1378
+
1379
+ #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
1380
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \
1381
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \
1382
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)
1383
+
1384
+ #define ZSTD_FOR_EACH_MLS(X, dictMode) \
1385
+ X(dictMode, 4) \
1386
+ X(dictMode, 5) \
1387
+ X(dictMode, 6)
1388
+
1389
+ #define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
1390
+ X(__VA_ARGS__, noDict) \
1391
+ X(__VA_ARGS__, extDict) \
1392
+ X(__VA_ARGS__, dictMatchState) \
1393
+ X(__VA_ARGS__, dedicatedDictSearch)
1394
+
1395
+ /* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
1396
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
1397
+ /* Generate Binary Tree VTables for each combination of (dictMode, mls) */
1398
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
1399
+ /* Generate Hash Chain VTables for each combination of (dictMode, mls) */
1400
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
1401
+
1402
+ #define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
1403
+ { \
1404
+ &ZSTD_BtVTable_##dictMode##_4, \
1405
+ &ZSTD_BtVTable_##dictMode##_5, \
1406
+ &ZSTD_BtVTable_##dictMode##_6 \
1407
+ }
1408
+
1409
+ #define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
1410
+ { \
1411
+ &ZSTD_HcVTable_##dictMode##_4, \
1412
+ &ZSTD_HcVTable_##dictMode##_5, \
1413
+ &ZSTD_HcVTable_##dictMode##_6 \
1414
+ }
1415
+
1416
+ #define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
1417
+ { \
1418
+ &ZSTD_RowVTable_##dictMode##_##mls##_4, \
1419
+ &ZSTD_RowVTable_##dictMode##_##mls##_5, \
1420
+ &ZSTD_RowVTable_##dictMode##_##mls##_6 \
1421
+ }
1422
+
1423
+ #define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
1424
+ { \
1425
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
1426
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
1427
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
1428
+ }
1429
+
1430
+ #define GEN_ZSTD_VTABLE_ARRAY(X) \
1431
+ { \
1432
+ X(noDict), \
1433
+ X(extDict), \
1434
+ X(dictMatchState), \
1435
+ X(dedicatedDictSearch) \
1436
+ }
1437
+
619
1438
  /* *******************************
620
1439
  * Common parser - lazy strategy
621
1440
  *********************************/
622
- typedef enum { search_hashChain, search_binaryTree } searchMethod_e;
1441
+ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1442
+
1443
+ /**
1444
+ * This table is indexed first by the four ZSTD_dictMode_e values, and then
1445
+ * by the two searchMethod_e values. NULLs are placed for configurations
1446
+ * that should never occur (extDict modes go to the other implementation
1447
+ * below and there is no DDSS for binary tree search yet).
1448
+ */
1449
+
1450
+ static ZSTD_LazyVTable const*
1451
+ ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
1452
+ {
1453
+ /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
1454
+ ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
1455
+ ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
1456
+ /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
1457
+ ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
1458
+
1459
+ U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
1460
+ U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1461
+ switch (searchMethod) {
1462
+ case search_hashChain:
1463
+ return hcVTables[dictMode][mls - 4];
1464
+ case search_binaryTree:
1465
+ return btVTables[dictMode][mls - 4];
1466
+ case search_rowHash:
1467
+ return rowVTables[dictMode][mls - 4][rowLog - 4];
1468
+ default:
1469
+ return NULL;
1470
+ }
1471
+ }
623
1472
 
624
1473
  FORCE_INLINE_TEMPLATE size_t
625
1474
  ZSTD_compressBlock_lazy_generic(
@@ -633,59 +1482,68 @@ ZSTD_compressBlock_lazy_generic(
633
1482
  const BYTE* ip = istart;
634
1483
  const BYTE* anchor = istart;
635
1484
  const BYTE* const iend = istart + srcSize;
636
- const BYTE* const ilimit = iend - 8;
1485
+ const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
637
1486
  const BYTE* const base = ms->window.base;
638
1487
  const U32 prefixLowestIndex = ms->window.dictLimit;
639
1488
  const BYTE* const prefixLowest = base + prefixLowestIndex;
640
1489
 
641
- typedef size_t (*searchMax_f)(
642
- ZSTD_matchState_t* ms,
643
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
644
- searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ?
645
- (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS
646
- : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) :
647
- (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_selectMLS
648
- : ZSTD_HcFindBestMatch_selectMLS);
1490
+ searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
649
1491
  U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
650
1492
 
1493
+ const int isDMS = dictMode == ZSTD_dictMatchState;
1494
+ const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
1495
+ const int isDxS = isDMS || isDDS;
651
1496
  const ZSTD_matchState_t* const dms = ms->dictMatchState;
652
- const U32 dictLowestIndex = dictMode == ZSTD_dictMatchState ?
653
- dms->window.dictLimit : 0;
654
- const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ?
655
- dms->window.base : NULL;
656
- const BYTE* const dictLowest = dictMode == ZSTD_dictMatchState ?
657
- dictBase + dictLowestIndex : NULL;
658
- const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ?
659
- dms->window.nextSrc : NULL;
660
- const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ?
1497
+ const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0;
1498
+ const BYTE* const dictBase = isDxS ? dms->window.base : NULL;
1499
+ const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
1500
+ const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL;
1501
+ const U32 dictIndexDelta = isDxS ?
661
1502
  prefixLowestIndex - (U32)(dictEnd - dictBase) :
662
1503
  0;
663
- const U32 dictAndPrefixLength = (U32)(ip - prefixLowest + dictEnd - dictLowest);
1504
+ const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
664
1505
 
665
- /* init */
1506
+ assert(searchMax != NULL);
1507
+
1508
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
666
1509
  ip += (dictAndPrefixLength == 0);
667
1510
  if (dictMode == ZSTD_noDict) {
668
- U32 const maxRep = (U32)(ip - prefixLowest);
1511
+ U32 const curr = (U32)(ip - base);
1512
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
1513
+ U32 const maxRep = curr - windowLow;
669
1514
  if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
670
1515
  if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
671
1516
  }
672
- if (dictMode == ZSTD_dictMatchState) {
1517
+ if (isDxS) {
673
1518
  /* dictMatchState repCode checks don't currently handle repCode == 0
674
1519
  * disabling. */
675
1520
  assert(offset_1 <= dictAndPrefixLength);
676
1521
  assert(offset_2 <= dictAndPrefixLength);
677
1522
  }
678
1523
 
1524
+ if (searchMethod == search_rowHash) {
1525
+ const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1526
+ ZSTD_row_fillHashCache(ms, base, rowLog,
1527
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1528
+ ms->nextToUpdate, ilimit);
1529
+ }
1530
+
679
1531
  /* Match Loop */
1532
+ #if defined(__GNUC__) && defined(__x86_64__)
1533
+ /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
1534
+ * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
1535
+ */
1536
+ __asm__(".p2align 5");
1537
+ #endif
680
1538
  while (ip < ilimit) {
681
1539
  size_t matchLength=0;
682
1540
  size_t offset=0;
683
1541
  const BYTE* start=ip+1;
684
1542
 
685
1543
  /* check repCode */
686
- if (dictMode == ZSTD_dictMatchState) {
1544
+ if (isDxS) {
687
1545
  const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
688
- const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
1546
+ const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
689
1547
  && repIndex < prefixLowestIndex) ?
690
1548
  dictBase + (repIndex - dictIndexDelta) :
691
1549
  base + repIndex;
@@ -726,7 +1584,7 @@ ZSTD_compressBlock_lazy_generic(
726
1584
  if ((mlRep >= 4) && (gain2 > gain1))
727
1585
  matchLength = mlRep, offset = 0, start = ip;
728
1586
  }
729
- if (dictMode == ZSTD_dictMatchState) {
1587
+ if (isDxS) {
730
1588
  const U32 repIndex = (U32)(ip - base) - offset_1;
731
1589
  const BYTE* repMatch = repIndex < prefixLowestIndex ?
732
1590
  dictBase + (repIndex - dictIndexDelta) :
@@ -761,7 +1619,7 @@ ZSTD_compressBlock_lazy_generic(
761
1619
  if ((mlRep >= 4) && (gain2 > gain1))
762
1620
  matchLength = mlRep, offset = 0, start = ip;
763
1621
  }
764
- if (dictMode == ZSTD_dictMatchState) {
1622
+ if (isDxS) {
765
1623
  const U32 repIndex = (U32)(ip - base) - offset_1;
766
1624
  const BYTE* repMatch = repIndex < prefixLowestIndex ?
767
1625
  dictBase + (repIndex - dictIndexDelta) :
@@ -799,8 +1657,8 @@ ZSTD_compressBlock_lazy_generic(
799
1657
  && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
800
1658
  { start--; matchLength++; }
801
1659
  }
802
- if (dictMode == ZSTD_dictMatchState) {
803
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
1660
+ if (isDxS) {
1661
+ U32 const matchIndex = (U32)((size_t)(start-base) - (offset - ZSTD_REP_MOVE));
804
1662
  const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
805
1663
  const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
806
1664
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
@@ -809,18 +1667,17 @@ ZSTD_compressBlock_lazy_generic(
809
1667
  }
810
1668
  /* store sequence */
811
1669
  _storeSequence:
812
- { size_t const litLength = start - anchor;
1670
+ { size_t const litLength = (size_t)(start - anchor);
813
1671
  ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
814
1672
  anchor = ip = start + matchLength;
815
1673
  }
816
1674
 
817
1675
  /* check immediate repcode */
818
- if (dictMode == ZSTD_dictMatchState) {
1676
+ if (isDxS) {
819
1677
  while (ip <= ilimit) {
820
1678
  U32 const current2 = (U32)(ip-base);
821
1679
  U32 const repIndex = current2 - offset_2;
822
- const BYTE* repMatch = dictMode == ZSTD_dictMatchState
823
- && repIndex < prefixLowestIndex ?
1680
+ const BYTE* repMatch = repIndex < prefixLowestIndex ?
824
1681
  dictBase - dictIndexDelta + repIndex :
825
1682
  base + repIndex;
826
1683
  if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
@@ -915,6 +1772,92 @@ size_t ZSTD_compressBlock_greedy_dictMatchState(
915
1772
  }
916
1773
 
917
1774
 
1775
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
1776
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1777
+ void const* src, size_t srcSize)
1778
+ {
1779
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
1780
+ }
1781
+
1782
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
1783
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1784
+ void const* src, size_t srcSize)
1785
+ {
1786
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
1787
+ }
1788
+
1789
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
1790
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1791
+ void const* src, size_t srcSize)
1792
+ {
1793
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
1794
+ }
1795
+
1796
+ /* Row-based matchfinder */
1797
+ size_t ZSTD_compressBlock_lazy2_row(
1798
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1799
+ void const* src, size_t srcSize)
1800
+ {
1801
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
1802
+ }
1803
+
1804
+ size_t ZSTD_compressBlock_lazy_row(
1805
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1806
+ void const* src, size_t srcSize)
1807
+ {
1808
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
1809
+ }
1810
+
1811
+ size_t ZSTD_compressBlock_greedy_row(
1812
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1813
+ void const* src, size_t srcSize)
1814
+ {
1815
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
1816
+ }
1817
+
1818
+ size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
1819
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1820
+ void const* src, size_t srcSize)
1821
+ {
1822
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
1823
+ }
1824
+
1825
+ size_t ZSTD_compressBlock_lazy_dictMatchState_row(
1826
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1827
+ void const* src, size_t srcSize)
1828
+ {
1829
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
1830
+ }
1831
+
1832
+ size_t ZSTD_compressBlock_greedy_dictMatchState_row(
1833
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1834
+ void const* src, size_t srcSize)
1835
+ {
1836
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
1837
+ }
1838
+
1839
+
1840
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
1841
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1842
+ void const* src, size_t srcSize)
1843
+ {
1844
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
1845
+ }
1846
+
1847
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
1848
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1849
+ void const* src, size_t srcSize)
1850
+ {
1851
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
1852
+ }
1853
+
1854
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
1855
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1856
+ void const* src, size_t srcSize)
1857
+ {
1858
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
1859
+ }
1860
+
918
1861
  FORCE_INLINE_TEMPLATE
919
1862
  size_t ZSTD_compressBlock_lazy_extDict_generic(
920
1863
  ZSTD_matchState_t* ms, seqStore_t* seqStore,
@@ -926,37 +1869,49 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
926
1869
  const BYTE* ip = istart;
927
1870
  const BYTE* anchor = istart;
928
1871
  const BYTE* const iend = istart + srcSize;
929
- const BYTE* const ilimit = iend - 8;
1872
+ const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
930
1873
  const BYTE* const base = ms->window.base;
931
1874
  const U32 dictLimit = ms->window.dictLimit;
932
- const U32 lowestIndex = ms->window.lowLimit;
933
1875
  const BYTE* const prefixStart = base + dictLimit;
934
1876
  const BYTE* const dictBase = ms->window.dictBase;
935
1877
  const BYTE* const dictEnd = dictBase + dictLimit;
936
- const BYTE* const dictStart = dictBase + lowestIndex;
937
-
938
- typedef size_t (*searchMax_f)(
939
- ZSTD_matchState_t* ms,
940
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
941
- searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
1878
+ const BYTE* const dictStart = dictBase + ms->window.lowLimit;
1879
+ const U32 windowLog = ms->cParams.windowLog;
1880
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
942
1881
 
1882
+ searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
943
1883
  U32 offset_1 = rep[0], offset_2 = rep[1];
944
1884
 
1885
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
1886
+
945
1887
  /* init */
946
1888
  ip += (ip == prefixStart);
1889
+ if (searchMethod == search_rowHash) {
1890
+ ZSTD_row_fillHashCache(ms, base, rowLog,
1891
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1892
+ ms->nextToUpdate, ilimit);
1893
+ }
947
1894
 
948
1895
  /* Match Loop */
1896
+ #if defined(__GNUC__) && defined(__x86_64__)
1897
+ /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
1898
+ * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
1899
+ */
1900
+ __asm__(".p2align 5");
1901
+ #endif
949
1902
  while (ip < ilimit) {
950
1903
  size_t matchLength=0;
951
1904
  size_t offset=0;
952
1905
  const BYTE* start=ip+1;
953
- U32 current = (U32)(ip-base);
1906
+ U32 curr = (U32)(ip-base);
954
1907
 
955
1908
  /* check repCode */
956
- { const U32 repIndex = (U32)(current+1 - offset_1);
1909
+ { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
1910
+ const U32 repIndex = (U32)(curr+1 - offset_1);
957
1911
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
958
1912
  const BYTE* const repMatch = repBase + repIndex;
959
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
1913
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
1914
+ & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
960
1915
  if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
961
1916
  /* repcode detected we should take it */
962
1917
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -971,7 +1926,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
971
1926
  matchLength = ml2, start = ip, offset=offsetFound;
972
1927
  }
973
1928
 
974
- if (matchLength < 4) {
1929
+ if (matchLength < 4) {
975
1930
  ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
976
1931
  continue;
977
1932
  }
@@ -980,13 +1935,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
980
1935
  if (depth>=1)
981
1936
  while (ip<ilimit) {
982
1937
  ip ++;
983
- current++;
1938
+ curr++;
984
1939
  /* check repCode */
985
1940
  if (offset) {
986
- const U32 repIndex = (U32)(current - offset_1);
1941
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
1942
+ const U32 repIndex = (U32)(curr - offset_1);
987
1943
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
988
1944
  const BYTE* const repMatch = repBase + repIndex;
989
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
1945
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
1946
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
990
1947
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
991
1948
  /* repcode detected */
992
1949
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1010,13 +1967,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1010
1967
  /* let's find an even better one */
1011
1968
  if ((depth==2) && (ip<ilimit)) {
1012
1969
  ip ++;
1013
- current++;
1970
+ curr++;
1014
1971
  /* check repCode */
1015
1972
  if (offset) {
1016
- const U32 repIndex = (U32)(current - offset_1);
1973
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
1974
+ const U32 repIndex = (U32)(curr - offset_1);
1017
1975
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1018
1976
  const BYTE* const repMatch = repBase + repIndex;
1019
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
1977
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
1978
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
1020
1979
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
1021
1980
  /* repcode detected */
1022
1981
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1041,7 +2000,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1041
2000
 
1042
2001
  /* catch up */
1043
2002
  if (offset) {
1044
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
2003
+ U32 const matchIndex = (U32)((size_t)(start-base) - (offset - ZSTD_REP_MOVE));
1045
2004
  const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
1046
2005
  const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
1047
2006
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
@@ -1050,17 +2009,20 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1050
2009
 
1051
2010
  /* store sequence */
1052
2011
  _storeSequence:
1053
- { size_t const litLength = start - anchor;
2012
+ { size_t const litLength = (size_t)(start - anchor);
1054
2013
  ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
1055
2014
  anchor = ip = start + matchLength;
1056
2015
  }
1057
2016
 
1058
2017
  /* check immediate repcode */
1059
2018
  while (ip <= ilimit) {
1060
- const U32 repIndex = (U32)((ip-base) - offset_2);
2019
+ const U32 repCurrent = (U32)(ip-base);
2020
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
2021
+ const U32 repIndex = repCurrent - offset_2;
1061
2022
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1062
2023
  const BYTE* const repMatch = repBase + repIndex;
1063
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
2024
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2025
+ & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
1064
2026
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
1065
2027
  /* repcode detected we should take it */
1066
2028
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1113,3 +2075,26 @@ size_t ZSTD_compressBlock_btlazy2_extDict(
1113
2075
  {
1114
2076
  return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
1115
2077
  }
2078
+
2079
+ size_t ZSTD_compressBlock_greedy_extDict_row(
2080
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2081
+ void const* src, size_t srcSize)
2082
+ {
2083
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
2084
+ }
2085
+
2086
+ size_t ZSTD_compressBlock_lazy_extDict_row(
2087
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2088
+ void const* src, size_t srcSize)
2089
+
2090
+ {
2091
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
2092
+ }
2093
+
2094
+ size_t ZSTD_compressBlock_lazy2_extDict_row(
2095
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2096
+ void const* src, size_t srcSize)
2097
+
2098
+ {
2099
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
2100
+ }