zstd-ruby 1.4.4.0 → 1.5.1.0

Files changed (102)
  1. checksums.yaml +4 -4
  2. data/.github/dependabot.yml +8 -0
  3. data/.github/workflows/ruby.yml +35 -0
  4. data/README.md +2 -2
  5. data/ext/zstdruby/extconf.rb +1 -0
  6. data/ext/zstdruby/libzstd/BUCK +5 -7
  7. data/ext/zstdruby/libzstd/Makefile +241 -173
  8. data/ext/zstdruby/libzstd/README.md +76 -18
  9. data/ext/zstdruby/libzstd/common/bitstream.h +75 -57
  10. data/ext/zstdruby/libzstd/common/compiler.h +196 -20
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  12. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  13. data/ext/zstdruby/libzstd/common/debug.h +22 -49
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +208 -76
  15. data/ext/zstdruby/libzstd/common/error_private.c +3 -1
  16. data/ext/zstdruby/libzstd/common/error_private.h +87 -4
  17. data/ext/zstdruby/libzstd/common/fse.h +51 -42
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +149 -57
  19. data/ext/zstdruby/libzstd/common/huf.h +60 -54
  20. data/ext/zstdruby/libzstd/common/mem.h +87 -98
  21. data/ext/zstdruby/libzstd/common/pool.c +23 -17
  22. data/ext/zstdruby/libzstd/common/pool.h +3 -3
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
  24. data/ext/zstdruby/libzstd/common/threading.c +10 -8
  25. data/ext/zstdruby/libzstd/common/threading.h +4 -3
  26. data/ext/zstdruby/libzstd/common/xxhash.c +15 -873
  27. data/ext/zstdruby/libzstd/common/xxhash.h +5572 -191
  28. data/ext/zstdruby/libzstd/common/zstd_common.c +10 -10
  29. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  30. data/ext/zstdruby/libzstd/common/zstd_internal.h +252 -108
  31. data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
  32. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  33. data/ext/zstdruby/libzstd/compress/fse_compress.c +105 -85
  34. data/ext/zstdruby/libzstd/compress/hist.c +41 -63
  35. data/ext/zstdruby/libzstd/compress/hist.h +13 -33
  36. data/ext/zstdruby/libzstd/compress/huf_compress.c +831 -259
  37. data/ext/zstdruby/libzstd/compress/zstd_compress.c +3213 -1007
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +493 -71
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +21 -16
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +4 -2
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +51 -24
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +10 -3
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +573 -0
  44. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  45. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +208 -81
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +315 -137
  47. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.c +319 -128
  49. data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1156 -171
  51. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +59 -1
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +331 -206
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
  54. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.c +403 -226
  56. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +188 -453
  58. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +32 -114
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1065 -410
  60. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +20 -16
  62. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +691 -230
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1072 -323
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +16 -7
  66. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +71 -10
  67. data/ext/zstdruby/libzstd/deprecated/zbuff.h +3 -3
  68. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
  69. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +24 -4
  70. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  71. data/ext/zstdruby/libzstd/dictBuilder/cover.c +57 -40
  72. data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
  73. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  74. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +54 -35
  75. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +151 -57
  76. data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
  77. data/ext/zstdruby/libzstd/dll/example/README.md +16 -22
  78. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +4 -4
  79. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +25 -19
  80. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +1 -1
  81. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +18 -14
  82. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +1 -1
  83. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +18 -14
  84. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +1 -1
  85. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +22 -16
  86. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +1 -1
  87. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +29 -25
  88. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +2 -2
  89. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +29 -25
  90. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +1 -1
  91. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +34 -26
  92. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +1 -1
  93. data/ext/zstdruby/libzstd/libzstd.mk +185 -0
  94. data/ext/zstdruby/libzstd/libzstd.pc.in +4 -3
  95. data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
  96. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +201 -31
  97. data/ext/zstdruby/libzstd/zstd.h +760 -234
  98. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +3 -1
  99. data/ext/zstdruby/zstdruby.c +2 -2
  100. data/lib/zstd-ruby/version.rb +1 -1
  101. metadata +20 -9
  102. data/.travis.yml +0 -14
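
The diff below covers mostly the vendored libzstd sources that the gem compiles. For orientation, a minimal round-trip with the gem looks roughly like the sketch below (assuming the gem's top-level Zstd.compress / Zstd.decompress API; the input file name is illustrative only):

    require 'zstd-ruby'

    data = File.binread('example.txt')       # any binary string
    compressed = Zstd.compress(data)         # compresses with the bundled libzstd
    restored = Zstd.decompress(compressed)   # round-trips back to the original bytes
    raise 'zstd round-trip failed' unless restored == data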
data/ext/zstdruby/libzstd/compress/zstd_lazy.c CHANGED
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -58,11 +58,11 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,

  /** ZSTD_insertDUBT1() :
  * sort one already inserted but unsorted position
- * assumption : current >= btlow == (current - btmask)
+ * assumption : curr >= btlow == (curr - btmask)
  * doesn't fail */
  static void
- ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
- U32 current, const BYTE* inputEnd,
+ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
+ U32 curr, const BYTE* inputEnd,
  U32 nbCompares, U32 btLow,
  const ZSTD_dictMode_e dictMode)
  {
@@ -74,41 +74,41 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
  const BYTE* const base = ms->window.base;
  const BYTE* const dictBase = ms->window.dictBase;
  const U32 dictLimit = ms->window.dictLimit;
- const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current;
- const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit;
+ const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
+ const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
  const BYTE* const dictEnd = dictBase + dictLimit;
  const BYTE* const prefixStart = base + dictLimit;
  const BYTE* match;
- U32* smallerPtr = bt + 2*(current&btMask);
+ U32* smallerPtr = bt + 2*(curr&btMask);
  U32* largerPtr = smallerPtr + 1;
  U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
  U32 dummy32; /* to be nullified at the end */
  U32 const windowValid = ms->window.lowLimit;
  U32 const maxDistance = 1U << cParams->windowLog;
- U32 const windowLow = (current - windowValid > maxDistance) ? current - maxDistance : windowValid;
+ U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;


  DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
- current, dictLimit, windowLow);
- assert(current >= btLow);
+ curr, dictLimit, windowLow);
+ assert(curr >= btLow);
  assert(ip < iend); /* condition for ZSTD_count */

- while (nbCompares-- && (matchIndex > windowLow)) {
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
- assert(matchIndex < current);
+ assert(matchIndex < curr);
  /* note : all candidates are now supposed sorted,
  * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
  * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */

  if ( (dictMode != ZSTD_extDict)
  || (matchIndex+matchLength >= dictLimit) /* both in current segment*/
- || (current < dictLimit) /* both in extDict */) {
+ || (curr < dictLimit) /* both in extDict */) {
  const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
  || (matchIndex+matchLength >= dictLimit)) ?
  base : dictBase;
  assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */
- || (current < dictLimit) );
+ || (curr < dictLimit) );
  match = mBase + matchIndex;
  matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
  } else {
@@ -119,7 +119,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
  }

  DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
- current, matchIndex, (U32)matchLength);
+ curr, matchIndex, (U32)matchLength);

  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
  break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
@@ -151,7 +151,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,

  static size_t
  ZSTD_DUBT_findBetterDictMatch (
- ZSTD_matchState_t* ms,
+ const ZSTD_matchState_t* ms,
  const BYTE* const ip, const BYTE* const iend,
  size_t* offsetPtr,
  size_t bestLength,
@@ -168,7 +168,7 @@ ZSTD_DUBT_findBetterDictMatch (

  const BYTE* const base = ms->window.base;
  const BYTE* const prefixStart = base + ms->window.dictLimit;
- U32 const current = (U32)(ip-base);
+ U32 const curr = (U32)(ip-base);
  const BYTE* const dictBase = dms->window.base;
  const BYTE* const dictEnd = dms->window.nextSrc;
  U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
@@ -185,7 +185,7 @@ ZSTD_DUBT_findBetterDictMatch (
  (void)dictMode;
  assert(dictMode == ZSTD_dictMatchState);

- while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
+ for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
  U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
  const BYTE* match = dictBase + dictMatchIndex;
@@ -195,10 +195,10 @@ ZSTD_DUBT_findBetterDictMatch (

  if (matchLength > bestLength) {
  U32 matchIndex = dictMatchIndex + dictIndexDelta;
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
  DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
- current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex);
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
  }
  if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
  break; /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,9 +218,9 @@ ZSTD_DUBT_findBetterDictMatch (
  }

  if (bestLength >= MINMATCH) {
- U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
  DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
- current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+ curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
  }
  return bestLength;

@@ -241,13 +241,13 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  U32 matchIndex = hashTable[h];

  const BYTE* const base = ms->window.base;
- U32 const current = (U32)(ip-base);
- U32 const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog);
+ U32 const curr = (U32)(ip-base);
+ U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);

  U32* const bt = ms->chainTable;
  U32 const btLog = cParams->chainLog - 1;
  U32 const btMask = (1 << btLog) - 1;
- U32 const btLow = (btMask >= current) ? 0 : current - btMask;
+ U32 const btLow = (btMask >= curr) ? 0 : curr - btMask;
  U32 const unsortLimit = MAX(btLow, windowLow);

  U32* nextCandidate = bt + 2*(matchIndex&btMask);
@@ -256,8 +256,9 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  U32 nbCandidates = nbCompares;
  U32 previousCandidate = 0;

- DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current);
+ DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
  assert(ip <= iend-8); /* required for h calculation */
+ assert(dictMode != ZSTD_dedicatedDictSearch);

  /* reach end of unsorted candidates list */
  while ( (matchIndex > unsortLimit)
@@ -299,16 +300,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  const U32 dictLimit = ms->window.dictLimit;
  const BYTE* const dictEnd = dictBase + dictLimit;
  const BYTE* const prefixStart = base + dictLimit;
- U32* smallerPtr = bt + 2*(current&btMask);
- U32* largerPtr = bt + 2*(current&btMask) + 1;
- U32 matchEndIdx = current + 8 + 1;
+ U32* smallerPtr = bt + 2*(curr&btMask);
+ U32* largerPtr = bt + 2*(curr&btMask) + 1;
+ U32 matchEndIdx = curr + 8 + 1;
  U32 dummy32; /* to be nullified at the end */
  size_t bestLength = 0;

  matchIndex = hashTable[h];
- hashTable[h] = current; /* Update Hash Table */
+ hashTable[h] = curr; /* Update Hash Table */

- while (nbCompares-- && (matchIndex > windowLow)) {
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
  const BYTE* match;
@@ -326,8 +327,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  if (matchLength > bestLength) {
  if (matchLength > matchEndIdx - matchIndex)
  matchEndIdx = matchIndex + (U32)matchLength;
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
  if (dictMode == ZSTD_dictMatchState) {
  nbCompares = 0; /* in addition to avoiding checking any
@@ -356,6 +357,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,

  *smallerPtr = *largerPtr = 0;

+ assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
  if (dictMode == ZSTD_dictMatchState && nbCompares) {
  bestLength = ZSTD_DUBT_findBetterDictMatch(
  ms, ip, iend,
@@ -363,12 +365,12 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  mls, dictMode);
  }

- assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */
+ assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
  ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
  if (bestLength >= MINMATCH) {
- U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
  DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
- current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+ curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
  }
  return bestLength;
  }
@@ -389,56 +391,222 @@ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
  return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
  }

+ /***********************************
+ * Dedicated dict search
+ ***********************************/

- static size_t
- ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
+ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
  {
- switch(ms->cParams.minMatch)
+ const BYTE* const base = ms->window.base;
+ U32 const target = (U32)(ip - base);
+ U32* const hashTable = ms->hashTable;
+ U32* const chainTable = ms->chainTable;
+ U32 const chainSize = 1 << ms->cParams.chainLog;
+ U32 idx = ms->nextToUpdate;
+ U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
+ U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
+ U32 const cacheSize = bucketSize - 1;
+ U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
+ U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
+
+ /* We know the hashtable is oversized by a factor of `bucketSize`.
+ * We are going to temporarily pretend `bucketSize == 1`, keeping only a
+ * single entry. We will use the rest of the space to construct a temporary
+ * chaintable.
+ */
+ U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
+ U32* const tmpHashTable = hashTable;
+ U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
+ U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
+ U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
+ U32 hashIdx;
+
+ assert(ms->cParams.chainLog <= 24);
+ assert(ms->cParams.hashLog > ms->cParams.chainLog);
+ assert(idx != 0);
+ assert(tmpMinChain <= minChain);
+
+ /* fill conventional hash table and conventional chain table */
+ for ( ; idx < target; idx++) {
+ U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
+ if (idx >= tmpMinChain) {
+ tmpChainTable[idx - tmpMinChain] = hashTable[h];
+ }
+ tmpHashTable[h] = idx;
+ }
+
+ /* sort chains into ddss chain table */
  {
- default : /* includes case 3 */
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
- case 7 :
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
+ U32 chainPos = 0;
+ for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
+ U32 count;
+ U32 countBeyondMinChain = 0;
+ U32 i = tmpHashTable[hashIdx];
+ for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
+ /* skip through the chain to the first position that won't be
+ * in the hash cache bucket */
+ if (i < minChain) {
+ countBeyondMinChain++;
+ }
+ i = tmpChainTable[i - tmpMinChain];
+ }
+ if (count == cacheSize) {
+ for (count = 0; count < chainLimit;) {
+ if (i < minChain) {
+ if (!i || ++countBeyondMinChain > cacheSize) {
+ /* only allow pulling `cacheSize` number of entries
+ * into the cache or chainTable beyond `minChain`,
+ * to replace the entries pulled out of the
+ * chainTable into the cache. This lets us reach
+ * back further without increasing the total number
+ * of entries in the chainTable, guaranteeing the
+ * DDSS chain table will fit into the space
+ * allocated for the regular one. */
+ break;
+ }
+ }
+ chainTable[chainPos++] = i;
+ count++;
+ if (i < tmpMinChain) {
+ break;
+ }
+ i = tmpChainTable[i - tmpMinChain];
+ }
+ } else {
+ count = 0;
+ }
+ if (count) {
+ tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
+ } else {
+ tmpHashTable[hashIdx] = 0;
+ }
+ }
+ assert(chainPos <= chainSize); /* I believe this is guaranteed... */
  }
+
+ /* move chain pointers into the last entry of each hash bucket */
+ for (hashIdx = (1 << hashLog); hashIdx; ) {
+ U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
+ U32 const chainPackedPointer = tmpHashTable[hashIdx];
+ U32 i;
+ for (i = 0; i < cacheSize; i++) {
+ hashTable[bucketIdx + i] = 0;
+ }
+ hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
+ }
+
+ /* fill the buckets of the hash table */
+ for (idx = ms->nextToUpdate; idx < target; idx++) {
+ U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
+ << ZSTD_LAZY_DDSS_BUCKET_LOG;
+ U32 i;
+ /* Shift hash cache down 1. */
+ for (i = cacheSize - 1; i; i--)
+ hashTable[h + i] = hashTable[h + i - 1];
+ hashTable[h] = idx;
+ }
+
+ ms->nextToUpdate = target;
  }

+ /* Returns the longest match length found in the dedicated dict search structure.
+ * If none are longer than the argument ml, then ml will be returned.
+ */
+ FORCE_INLINE_TEMPLATE
+ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
+ const ZSTD_matchState_t* const dms,
+ const BYTE* const ip, const BYTE* const iLimit,
+ const BYTE* const prefixStart, const U32 curr,
+ const U32 dictLimit, const size_t ddsIdx) {
+ const U32 ddsLowestIndex = dms->window.dictLimit;
+ const BYTE* const ddsBase = dms->window.base;
+ const BYTE* const ddsEnd = dms->window.nextSrc;
+ const U32 ddsSize = (U32)(ddsEnd - ddsBase);
+ const U32 ddsIndexDelta = dictLimit - ddsSize;
+ const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
+ const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
+ U32 ddsAttempt;
+ U32 matchIndex;
+
+ for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
+ PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
+ }

- static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
- {
- switch(ms->cParams.minMatch)
  {
- default : /* includes case 3 */
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
- case 7 :
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+ U32 const chainIndex = chainPackedPointer >> 8;
+
+ PREFETCH_L1(&dms->chainTable[chainIndex]);
  }
- }

+ for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
+ size_t currentMl=0;
+ const BYTE* match;
+ matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
+ match = ddsBase + matchIndex;
+
+ if (!matchIndex) {
+ return ml;
+ }
+
+ /* guaranteed by table construction */
+ (void)ddsLowestIndex;
+ assert(matchIndex >= ddsLowestIndex);
+ assert(match+4 <= ddsEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) {
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+ }
+
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) {
+ /* best possible, avoids read overflow on next attempt */
+ return ml;
+ }
+ }
+ }

- static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
- {
- switch(ms->cParams.minMatch)
  {
- default : /* includes case 3 */
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
- case 7 :
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+ U32 chainIndex = chainPackedPointer >> 8;
+ U32 const chainLength = chainPackedPointer & 0xFF;
+ U32 const chainAttempts = nbAttempts - ddsAttempt;
+ U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
+ U32 chainAttempt;
+
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
+ PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
+ }
+
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
+ size_t currentMl=0;
+ const BYTE* match;
+ matchIndex = dms->chainTable[chainIndex];
+ match = ddsBase + matchIndex;
+
+ /* guaranteed by table construction */
+ assert(matchIndex >= ddsLowestIndex);
+ assert(match+4 <= ddsEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) {
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+ }
+
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+ }
  }
+ return ml;
  }


-
  /* *********************************
  * Hash Chain
  ***********************************/
@@ -446,7 +614,7 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (

  /* Update chains up to ip (excluded)
  Assumption : always within prefix (i.e. not within extDict) */
- static U32 ZSTD_insertAndFindFirstIndex_internal(
+ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
  ZSTD_matchState_t* ms,
  const ZSTD_compressionParameters* const cParams,
  const BYTE* ip, U32 const mls)
@@ -475,10 +643,9 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
  return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
  }

-
  /* inlining is important to hardwire a hot branch (template emulation) */
  FORCE_INLINE_TEMPLATE
- size_t ZSTD_HcFindBestMatch_generic (
+ size_t ZSTD_HcFindBestMatch(
  ZSTD_matchState_t* ms,
  const BYTE* const ip, const BYTE* const iLimit,
  size_t* offsetPtr,
@@ -493,20 +660,33 @@ size_t ZSTD_HcFindBestMatch_generic (
  const U32 dictLimit = ms->window.dictLimit;
  const BYTE* const prefixStart = base + dictLimit;
  const BYTE* const dictEnd = dictBase + dictLimit;
- const U32 current = (U32)(ip-base);
+ const U32 curr = (U32)(ip-base);
  const U32 maxDistance = 1U << cParams->windowLog;
  const U32 lowestValid = ms->window.lowLimit;
- const U32 withinMaxDistance = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid;
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
  const U32 isDictionary = (ms->loadedDictEnd != 0);
  const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
- const U32 minChain = current > chainSize ? current - chainSize : 0;
+ const U32 minChain = curr > chainSize ? curr - chainSize : 0;
  U32 nbAttempts = 1U << cParams->searchLog;
  size_t ml=4-1;

+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
+ ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
+ const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
+ ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
+
+ U32 matchIndex;
+
+ if (dictMode == ZSTD_dedicatedDictSearch) {
+ const U32* entry = &dms->hashTable[ddsIdx];
+ PREFETCH_L1(entry);
+ }
+
  /* HC4 match finder */
- U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);

- for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
+ for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
  size_t currentMl=0;
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
  const BYTE* const match = base + matchIndex;
@@ -523,7 +703,7 @@ size_t ZSTD_HcFindBestMatch_generic (
  /* save best solution */
  if (currentMl > ml) {
  ml = currentMl;
- *offsetPtr = current - matchIndex + ZSTD_REP_MOVE;
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  }

@@ -531,8 +711,11 @@ size_t ZSTD_HcFindBestMatch_generic (
  matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
  }

- if (dictMode == ZSTD_dictMatchState) {
- const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
+ if (dictMode == ZSTD_dedicatedDictSearch) {
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
+ } else if (dictMode == ZSTD_dictMatchState) {
  const U32* const dmsChainTable = dms->chainTable;
  const U32 dmsChainSize = (1 << dms->cParams.chainLog);
  const U32 dmsChainMask = dmsChainSize - 1;
@@ -545,7 +728,7 @@ size_t ZSTD_HcFindBestMatch_generic (

  matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];

- for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
+ for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
  size_t currentMl=0;
  const BYTE* const match = dmsBase + matchIndex;
  assert(match+4 <= dmsEnd);
@@ -555,11 +738,12 @@ size_t ZSTD_HcFindBestMatch_generic (
  /* save best solution */
  if (currentMl > ml) {
  ml = currentMl;
- *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  }

  if (matchIndex <= dmsMinChain) break;
+
  matchIndex = dmsChainTable[matchIndex & dmsChainMask];
  }
  }
@@ -567,59 +751,724 @@ size_t ZSTD_HcFindBestMatch_generic (
  return ml;
  }

+ /* *********************************
+ * (SIMD) Row-based matchfinder
+ ***********************************/
+ /* Constants for row-based hash */
+ #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
+ #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
+ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
+ #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */

- FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
+ #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
+
+ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */
+
+ /* ZSTD_VecMask_next():
+ * Starting from the LSB, returns the idx of the next non-zero bit.
+ * Basically counting the nb of trailing zeroes.
+ */
+ static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
+ assert(val != 0);
+ # if defined(_MSC_VER) && defined(_WIN64)
+ if (val != 0) {
+ unsigned long r;
+ _BitScanForward64(&r, val);
+ return (U32)(r);
+ } else {
+ /* Should not reach this code path */
+ __assume(0);
+ }
+ # elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
+ if (sizeof(size_t) == 4) {
+ U32 mostSignificantWord = (U32)(val >> 32);
+ U32 leastSignificantWord = (U32)val;
+ if (leastSignificantWord == 0) {
+ return 32 + (U32)__builtin_ctz(mostSignificantWord);
+ } else {
+ return (U32)__builtin_ctz(leastSignificantWord);
+ }
+ } else {
+ return (U32)__builtin_ctzll(val);
+ }
+ # else
+ /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
+ * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
+ */
+ val = ~val & (val - 1ULL); /* Lowest set bit mask */
+ val = val - ((val >> 1) & 0x5555555555555555);
+ val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
+ return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
+ # endif
+ }
+
+ /* ZSTD_rotateRight_*():
+ * Rotates a bitfield to the right by "count" bits.
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
+ */
+ FORCE_INLINE_TEMPLATE
+ U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
+ assert(count < 64);
+ count &= 0x3F; /* for fickle pattern recognition */
+ return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
+ }
+
+ FORCE_INLINE_TEMPLATE
+ U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
+ assert(count < 32);
+ count &= 0x1F; /* for fickle pattern recognition */
+ return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
+ }
+
+ FORCE_INLINE_TEMPLATE
+ U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
+ assert(count < 16);
+ count &= 0x0F; /* for fickle pattern recognition */
+ return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
+ }
+
+ /* ZSTD_row_nextIndex():
+ * Returns the next index to insert at within a tagTable row, and updates the "head"
+ * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
+ */
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
+ U32 const next = (*tagRow - 1) & rowMask;
+ *tagRow = (BYTE)next;
+ return next;
+ }
+
+ /* ZSTD_isAligned():
+ * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
+ */
+ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
+ assert((align & (align - 1)) == 0);
+ return (((size_t)ptr) & (align - 1)) == 0;
+ }
+
+ /* ZSTD_row_prefetch():
+ * Performs prefetching for the hashTable and tagTable at a given row.
+ */
+ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
+ PREFETCH_L1(hashTable + relRow);
+ if (rowLog >= 5) {
+ PREFETCH_L1(hashTable + relRow + 16);
+ /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
+ }
+ PREFETCH_L1(tagTable + relRow);
+ if (rowLog == 6) {
+ PREFETCH_L1(tagTable + relRow + 32);
+ }
+ assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
+ assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
+ assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
+ }
+
+ /* ZSTD_row_fillHashCache():
+ * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
+ * but not beyond iLimit.
+ */
+ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
+ U32 const rowLog, U32 const mls,
+ U32 idx, const BYTE* const iLimit)
  {
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
- case 7 :
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
+ U32 const* const hashTable = ms->hashTable;
+ U16 const* const tagTable = ms->tagTable;
+ U32 const hashLog = ms->rowHashLog;
+ U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
+ U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
+
+ for (; idx < lim; ++idx) {
+ U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
+ ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
  }
+
+ DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
+ ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
+ ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
  }

+ /* ZSTD_row_nextCachedHash():
+ * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
+ * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
+ */
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
+ U16 const* tagTable, BYTE const* base,
+ U32 idx, U32 const hashLog,
+ U32 const rowLog, U32 const mls)
+ {
+ U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
+ { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
+ cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
+ return hash;
+ }
+ }

- static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
+ /* ZSTD_row_update_internalImpl():
+ * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
+ */
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
+ U32 updateStartIdx, U32 const updateEndIdx,
+ U32 const mls, U32 const rowLog,
+ U32 const rowMask, U32 const useCache)
  {
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
- case 7 :
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
+ U32* const hashTable = ms->hashTable;
+ U16* const tagTable = ms->tagTable;
+ U32 const hashLog = ms->rowHashLog;
+ const BYTE* const base = ms->window.base;
+
+ DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
+ for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
+ : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ U32* const row = hashTable + relRow;
+ BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
+ Explicit cast allows us to get exact desired position within each row */
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+
+ assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
+ ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
+ row[pos] = updateStartIdx;
  }
  }

+ /* ZSTD_row_update_internal():
+ * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
+ * Skips sections of long matches as is necessary.
+ */
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
+ U32 const mls, U32 const rowLog,
+ U32 const rowMask, U32 const useCache)
+ {
+ U32 idx = ms->nextToUpdate;
+ const BYTE* const base = ms->window.base;
+ const U32 target = (U32)(ip - base);
+ const U32 kSkipThreshold = 384;
+ const U32 kMaxMatchStartPositionsToUpdate = 96;
+ const U32 kMaxMatchEndPositionsToUpdate = 32;
+
+ if (useCache) {
+ /* Only skip positions when using hash cache, i.e.
+ * if we are loading a dict, don't skip anything.
+ * If we decide to skip, then we only update a set number
+ * of positions at the beginning and end of the match.
+ */
+ if (UNLIKELY(target - idx > kSkipThreshold)) {
+ U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
+ ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
+ idx = target - kMaxMatchEndPositionsToUpdate;
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
+ }
+ }
+ assert(target >= idx);
+ ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
+ ms->nextToUpdate = target;
+ }

- FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
+ /* ZSTD_row_update():
+ * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
+ * processing.
+ */
+ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
+ const U32 rowMask = (1u << rowLog) - 1;
+ const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
+
+ DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
+ }
+
+ #if defined(ZSTD_ARCH_X86_SSE2)
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
+ {
+ const __m128i comparisonMask = _mm_set1_epi8((char)tag);
+ int matches[4] = {0};
+ int i;
+ assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
+ for (i=0; i<nbChunks; i++) {
+ const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i));
+ const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
+ matches[i] = _mm_movemask_epi8(equalMask);
+ }
+ if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head);
+ if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
+ assert(nbChunks == 4);
+ return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
+ }
+ #endif
+
+ /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
+ * the hash at the nth position in a row of the tagTable.
+ * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
+ * to match up with the actual layout of the entries within the hashTable */
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
+ {
+ const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+
+ #if defined(ZSTD_ARCH_X86_SSE2)
+
+ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
+
+ #else /* SW or NEON-LE */
+
+ # if defined(ZSTD_ARCH_ARM_NEON)
+ /* This NEON path only works for little endian - otherwise use SWAR below */
+ if (MEM_isLittleEndian()) {
+ if (rowEntries == 16) {
+ const uint8x16_t chunk = vld1q_u8(src);
+ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
+ const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
+ const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
+ const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
+ const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
+ const U16 hi = (U16)vgetq_lane_u8(t3, 8);
+ const U16 lo = (U16)vgetq_lane_u8(t3, 0);
+ return ZSTD_rotateRight_U16((hi << 8) | lo, head);
+ } else if (rowEntries == 32) {
+ const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
+ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
+ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
+ const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
+ const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
+ const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
+ const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
+ const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
+ const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
+ const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
+ const uint8x8x2_t t3 = vuzp_u8(t2, t0);
+ const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
+ const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
+ return ZSTD_rotateRight_U32(matches, head);
+ } else { /* rowEntries == 64 */
+ const uint8x16x4_t chunk = vld4q_u8(src);
+ const uint8x16_t dup = vdupq_n_u8(tag);
+ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
+ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
+ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
+ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
+
+ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
+ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
+ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
+ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
+ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
+ return ZSTD_rotateRight_U64(matches, head);
+ }
+ }
+ # endif /* ZSTD_ARCH_ARM_NEON */
+ /* SWAR */
+ { const size_t chunkSize = sizeof(size_t);
+ const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
+ const size_t xFF = ~((size_t)0);
+ const size_t x01 = xFF / 0xFF;
+ const size_t x80 = x01 << 7;
+ const size_t splatChar = tag * x01;
+ ZSTD_VecMask matches = 0;
+ int i = rowEntries - chunkSize;
+ assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
+ if (MEM_isLittleEndian()) { /* runtime check so have two loops */
+ const size_t extractMagic = (xFF / 0x7F) >> chunkSize;
+ do {
+ size_t chunk = MEM_readST(&src[i]);
+ chunk ^= splatChar;
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
+ matches <<= chunkSize;
+ matches |= (chunk * extractMagic) >> shiftAmount;
+ i -= chunkSize;
+ } while (i >= 0);
+ } else { /* big endian: reverse bits during extraction */
+ const size_t msb = xFF ^ (xFF >> 1);
+ const size_t extractMagic = (msb / 0x1FF) | msb;
+ do {
+ size_t chunk = MEM_readST(&src[i]);
+ chunk ^= splatChar;
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
+ matches <<= chunkSize;
+ matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
+ i -= chunkSize;
+ } while (i >= 0);
+ }
+ matches = ~matches;
+ if (rowEntries == 16) {
+ return ZSTD_rotateRight_U16((U16)matches, head);
+ } else if (rowEntries == 32) {
+ return ZSTD_rotateRight_U32((U32)matches, head);
+ } else {
+ return ZSTD_rotateRight_U64((U64)matches, head);
+ }
+ }
+ #endif
+ }
+
+ /* The high-level approach of the SIMD row based match finder is as follows:
+ * - Figure out where to insert the new entry:
+ * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
+ * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
+ * which row to insert into.
+ * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
+ * be considered as a circular buffer with a "head" index that resides in the tagTable.
+ * - Also insert the "tag" into the equivalent row and position in the tagTable.
+ * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
+ * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
+ * for alignment/performance reasons, leaving some bytes unused.
+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
+ * generate a bitfield that we can cycle through to check the collisions in the hash table.
+ * - Pick the longest match.
+ */
+ FORCE_INLINE_TEMPLATE
+ size_t ZSTD_RowFindBestMatch(
  ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
+ const BYTE* const ip, const BYTE* const iLimit,
+ size_t* offsetPtr,
+ const U32 mls, const ZSTD_dictMode_e dictMode,
+ const U32 rowLog)
  {
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
- case 7 :
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
+ U32* const hashTable = ms->hashTable;
+ U16* const tagTable = ms->tagTable;
+ U32* const hashCache = ms->hashCache;
+ const U32 hashLog = ms->rowHashLog;
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const U32 dictLimit = ms->window.dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const U32 curr = (U32)(ip-base);
+ const U32 maxDistance = 1U << cParams->windowLog;
+ const U32 lowestValid = ms->window.lowLimit;
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
+ const U32 rowEntries = (1U << rowLog);
+ const U32 rowMask = rowEntries - 1;
+ const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
+ U32 nbAttempts = 1U << cappedSearchLog;
+ size_t ml=4-1;
+
+ /* DMS/DDS variables that may be referenced laster */
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+
+ /* Initialize the following variables to satisfy static analyzer */
+ size_t ddsIdx = 0;
+ U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
+ U32 dmsTag = 0;
+ U32* dmsRow = NULL;
+ BYTE* dmsTagRow = NULL;
+
+ if (dictMode == ZSTD_dedicatedDictSearch) {
+ const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
+ { /* Prefetch DDS hashtable entry */
+ ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
+ PREFETCH_L1(&dms->hashTable[ddsIdx]);
+ }
+ ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
+ }
+
+ if (dictMode == ZSTD_dictMatchState) {
+ /* Prefetch DMS rows */
+ U32* const dmsHashTable = dms->hashTable;
+ U16* const dmsTagTable = dms->tagTable;
+ U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
+ dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
+ dmsRow = dmsHashTable + dmsRelRow;
+ ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
+ }
+
+ /* Update the hashTable and tagTable up to (but not including) ip */
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+ { /* Get the hash for ip, compute the appropriate row */
+ U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
+ U32* const row = hashTable + relRow;
+ BYTE* tagRow = (BYTE*)(tagTable + relRow);
+ U32 const head = *tagRow & rowMask;
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
+ size_t numMatches = 0;
+ size_t currMatch = 0;
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
+
+ /* Cycle through the matches and prefetch */
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+ U32 const matchIndex = row[matchPos];
+ assert(numMatches < rowEntries);
+ if (matchIndex < lowLimit)
+ break;
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+ PREFETCH_L1(base + matchIndex);
+ } else {
+ PREFETCH_L1(dictBase + matchIndex);
+ }
+ matchBuffer[numMatches++] = matchIndex;
+ }
+
+ /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
+ in ZSTD_row_update_internal() at the next search. */
+ {
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+ tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
+ row[pos] = ms->nextToUpdate++;
+ }
+
+ /* Return the longest match */
+ for (; currMatch < numMatches; ++currMatch) {
+ U32 const matchIndex = matchBuffer[currMatch];
+ size_t currentMl=0;
+ assert(matchIndex < curr);
+ assert(matchIndex >= lowLimit);
+
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+ const BYTE* const match = base + matchIndex;
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
+ if (match[ml] == ip[ml]) /* potentially better */
+ currentMl = ZSTD_count(ip, match, iLimit);
+ } else {
+ const BYTE* const match = dictBase + matchIndex;
+ assert(match+4 <= dictEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
+ }
+
+ /* Save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+ }
+ }
+
+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
+ if (dictMode == ZSTD_dedicatedDictSearch) {
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
+ } else if (dictMode == ZSTD_dictMatchState) {
+ /* TODO: Measure and potentially add prefetching to DMS */
+ const U32 dmsLowestIndex = dms->window.dictLimit;
+ const BYTE* const dmsBase = dms->window.base;
+ const BYTE* const dmsEnd = dms->window.nextSrc;
+ const U32 dmsSize = (U32)(dmsEnd - dmsBase);
+ const U32 dmsIndexDelta = dictLimit - dmsSize;
+
+ { U32 const head = *dmsTagRow & rowMask;
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
+ size_t numMatches = 0;
+ size_t currMatch = 0;
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
+
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+ U32 const matchIndex = dmsRow[matchPos];
+ if (matchIndex < dmsLowestIndex)
+ break;
+ PREFETCH_L1(dmsBase + matchIndex);
+ matchBuffer[numMatches++] = matchIndex;
+ }
+
+ /* Return the longest match */
+ for (; currMatch < numMatches; ++currMatch) {
+ U32 const matchIndex = matchBuffer[currMatch];
+ size_t currentMl=0;
+ assert(matchIndex >= dmsLowestIndex);
+ assert(matchIndex < curr);
+
+ { const BYTE* const match = dmsBase + matchIndex;
+ assert(match+4 <= dmsEnd);
+ if (MEM_read32(match) == MEM_read32(ip))
1290
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
1291
+ }
1292
+
1293
+ if (currentMl > ml) {
1294
+ ml = currentMl;
1295
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
1296
+ if (ip+currentMl == iLimit) break;
1297
+ }
1298
+ }
1299
+ }
615
1300
  }
1301
+ return ml;
616
1302
  }
617
1303
 
618
1304
 
1305
+ typedef size_t (*searchMax_f)(
1306
+ ZSTD_matchState_t* ms,
1307
+ const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1308
+
1309
+ /**
1310
+ * This struct contains the functions necessary for the lazy match finders to search.
1311
+ * Currently, that is only searchMax. However, it is still valuable to have the
1312
+ * VTable because this makes it easier to add more functions to the VTable later.
1313
+ *
1314
+ * TODO: The start of the search function involves loading and calculating a
1315
+ * bunch of constants from the ZSTD_matchState_t. These computations could be
1316
+ * done in an initialization function, and saved somewhere in the match state.
1317
+ * Then we could pass a pointer to the saved state instead of the match state,
1318
+ * and avoid duplicate computations.
1319
+ *
1320
+ * TODO: Move the match re-winding into searchMax. This improves compression
1321
+ * ratio, and unlocks further simplifications with the next TODO.
1322
+ *
1323
+ * TODO: Try moving the repcode search into searchMax. After the re-winding
1324
+ * and repcode search are in searchMax, there is no more logic in the match
1325
+ * finder loop that requires knowledge about the dictMode. So we should be
1326
+ * able to avoid force inlining it, and we can join the extDict loop with
1327
+ * the single segment loop. It should go in searchMax instead of its own
1328
+ * function to avoid having multiple virtual function calls per search.
1329
+ */
1330
+ typedef struct {
1331
+ searchMax_f searchMax;
1332
+ } ZSTD_LazyVTable;
1333
+
1334
+ #define GEN_ZSTD_BT_VTABLE(dictMode, mls) \
1335
+ static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \
1336
+ ZSTD_matchState_t* ms, \
1337
+ const BYTE* ip, const BYTE* const iLimit, \
1338
+ size_t* offsetPtr) \
1339
+ { \
1340
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1341
+ return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1342
+ } \
1343
+ static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \
1344
+ ZSTD_BtFindBestMatch_##dictMode##_##mls \
1345
+ };
1346
+
1347
+ #define GEN_ZSTD_HC_VTABLE(dictMode, mls) \
1348
+ static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \
1349
+ ZSTD_matchState_t* ms, \
1350
+ const BYTE* ip, const BYTE* const iLimit, \
1351
+ size_t* offsetPtr) \
1352
+ { \
1353
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1354
+ return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
1355
+ } \
1356
+ static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
1357
+ ZSTD_HcFindBestMatch_##dictMode##_##mls \
1358
+ };
1359
+
1360
+ #define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \
1361
+ static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \
1362
+ ZSTD_matchState_t* ms, \
1363
+ const BYTE* ip, const BYTE* const iLimit, \
1364
+ size_t* offsetPtr) \
1365
+ { \
1366
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
1367
+ assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
1368
+ return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
1369
+ } \
1370
+ static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \
1371
+ ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \
1372
+ };
1373
+
1374
+ #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
1375
+ X(dictMode, mls, 4) \
1376
+ X(dictMode, mls, 5) \
1377
+ X(dictMode, mls, 6)
1378
+
1379
+ #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
1380
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \
1381
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \
1382
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)
1383
+
1384
+ #define ZSTD_FOR_EACH_MLS(X, dictMode) \
1385
+ X(dictMode, 4) \
1386
+ X(dictMode, 5) \
1387
+ X(dictMode, 6)
1388
+
1389
+ #define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
1390
+ X(__VA_ARGS__, noDict) \
1391
+ X(__VA_ARGS__, extDict) \
1392
+ X(__VA_ARGS__, dictMatchState) \
1393
+ X(__VA_ARGS__, dedicatedDictSearch)
1394
+
1395
+ /* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
1396
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
1397
+ /* Generate Binary Tree VTables for each combination of (dictMode, mls) */
1398
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
1399
+ /* Generate Hash Chain VTables for each combination of (dictMode, mls) */
1400
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
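(Editor's note, not part of the diff: to make the generator macros above concrete, this is approximately what a single invocation, GEN_ZSTD_HC_VTABLE(noDict, 4), expands to once ZSTD_FOR_EACH_DICT_MODE / ZSTD_FOR_EACH_MLS have walked all combinations.)

    /* Approximate expansion of GEN_ZSTD_HC_VTABLE(noDict, 4) */
    static size_t ZSTD_HcFindBestMatch_noDict_4(
            ZSTD_matchState_t* ms,
            const BYTE* ip, const BYTE* const iLimit,
            size_t* offsetPtr)
    {
        /* mls is fixed at compile time, so the generic search fully specializes. */
        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == 4);
        return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
    }
    static const ZSTD_LazyVTable ZSTD_HcVTable_noDict_4 = {
        ZSTD_HcFindBestMatch_noDict_4
    };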
1401
+
1402
+ #define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
1403
+ { \
1404
+ &ZSTD_BtVTable_##dictMode##_4, \
1405
+ &ZSTD_BtVTable_##dictMode##_5, \
1406
+ &ZSTD_BtVTable_##dictMode##_6 \
1407
+ }
1408
+
1409
+ #define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
1410
+ { \
1411
+ &ZSTD_HcVTable_##dictMode##_4, \
1412
+ &ZSTD_HcVTable_##dictMode##_5, \
1413
+ &ZSTD_HcVTable_##dictMode##_6 \
1414
+ }
1415
+
1416
+ #define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
1417
+ { \
1418
+ &ZSTD_RowVTable_##dictMode##_##mls##_4, \
1419
+ &ZSTD_RowVTable_##dictMode##_##mls##_5, \
1420
+ &ZSTD_RowVTable_##dictMode##_##mls##_6 \
1421
+ }
1422
+
1423
+ #define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
1424
+ { \
1425
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
1426
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
1427
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
1428
+ }
1429
+
1430
+ #define GEN_ZSTD_VTABLE_ARRAY(X) \
1431
+ { \
1432
+ X(noDict), \
1433
+ X(extDict), \
1434
+ X(dictMatchState), \
1435
+ X(dedicatedDictSearch) \
1436
+ }
1437
+
619
1438
  /* *******************************
620
1439
  * Common parser - lazy strategy
621
1440
  *********************************/
622
- typedef enum { search_hashChain, search_binaryTree } searchMethod_e;
1441
+ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1442
+
1443
+ /**
1444
+ * This table is indexed first by the four ZSTD_dictMode_e values, and then
1445
+ * by the two searchMethod_e values. NULLs are placed for configurations
1446
+ * that should never occur (extDict modes go to the other implementation
1447
+ * below and there is no DDSS for binary tree search yet).
1448
+ */
1449
+
1450
+ static ZSTD_LazyVTable const*
1451
+ ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
1452
+ {
1453
+ /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
1454
+ ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
1455
+ ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
1456
+ /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
1457
+ ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
1458
+
1459
+ U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
1460
+ U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1461
+ switch (searchMethod) {
1462
+ case search_hashChain:
1463
+ return hcVTables[dictMode][mls - 4];
1464
+ case search_binaryTree:
1465
+ return btVTables[dictMode][mls - 4];
1466
+ case search_rowHash:
1467
+ return rowVTables[dictMode][mls - 4][rowLog - 4];
1468
+ default:
1469
+ return NULL;
1470
+ }
1471
+ }
623
1472
 
624
1473
  FORCE_INLINE_TEMPLATE size_t
625
1474
  ZSTD_compressBlock_lazy_generic(
@@ -633,59 +1482,68 @@ ZSTD_compressBlock_lazy_generic(
633
1482
  const BYTE* ip = istart;
634
1483
  const BYTE* anchor = istart;
635
1484
  const BYTE* const iend = istart + srcSize;
636
- const BYTE* const ilimit = iend - 8;
1485
+ const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
637
1486
  const BYTE* const base = ms->window.base;
638
1487
  const U32 prefixLowestIndex = ms->window.dictLimit;
639
1488
  const BYTE* const prefixLowest = base + prefixLowestIndex;
640
1489
 
641
- typedef size_t (*searchMax_f)(
642
- ZSTD_matchState_t* ms,
643
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
644
- searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ?
645
- (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS
646
- : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) :
647
- (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_selectMLS
648
- : ZSTD_HcFindBestMatch_selectMLS);
1490
+ searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
649
1491
  U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
650
1492
 
1493
+ const int isDMS = dictMode == ZSTD_dictMatchState;
1494
+ const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
1495
+ const int isDxS = isDMS || isDDS;
651
1496
  const ZSTD_matchState_t* const dms = ms->dictMatchState;
652
- const U32 dictLowestIndex = dictMode == ZSTD_dictMatchState ?
653
- dms->window.dictLimit : 0;
654
- const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ?
655
- dms->window.base : NULL;
656
- const BYTE* const dictLowest = dictMode == ZSTD_dictMatchState ?
657
- dictBase + dictLowestIndex : NULL;
658
- const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ?
659
- dms->window.nextSrc : NULL;
660
- const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ?
1497
+ const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0;
1498
+ const BYTE* const dictBase = isDxS ? dms->window.base : NULL;
1499
+ const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
1500
+ const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL;
1501
+ const U32 dictIndexDelta = isDxS ?
661
1502
  prefixLowestIndex - (U32)(dictEnd - dictBase) :
662
1503
  0;
663
- const U32 dictAndPrefixLength = (U32)(ip - prefixLowest + dictEnd - dictLowest);
1504
+ const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
664
1505
 
665
- /* init */
1506
+ assert(searchMax != NULL);
1507
+
1508
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
666
1509
  ip += (dictAndPrefixLength == 0);
667
1510
  if (dictMode == ZSTD_noDict) {
668
- U32 const maxRep = (U32)(ip - prefixLowest);
1511
+ U32 const curr = (U32)(ip - base);
1512
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
1513
+ U32 const maxRep = curr - windowLow;
669
1514
  if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
670
1515
  if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
671
1516
  }
672
- if (dictMode == ZSTD_dictMatchState) {
1517
+ if (isDxS) {
673
1518
  /* dictMatchState repCode checks don't currently handle repCode == 0
674
1519
  * disabling. */
675
1520
  assert(offset_1 <= dictAndPrefixLength);
676
1521
  assert(offset_2 <= dictAndPrefixLength);
677
1522
  }
678
1523
 
1524
+ if (searchMethod == search_rowHash) {
1525
+ const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
1526
+ ZSTD_row_fillHashCache(ms, base, rowLog,
1527
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1528
+ ms->nextToUpdate, ilimit);
1529
+ }
1530
+
679
1531
  /* Match Loop */
1532
+ #if defined(__GNUC__) && defined(__x86_64__)
1533
+ /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
1534
+ * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
1535
+ */
1536
+ __asm__(".p2align 5");
1537
+ #endif
680
1538
  while (ip < ilimit) {
681
1539
  size_t matchLength=0;
682
1540
  size_t offset=0;
683
1541
  const BYTE* start=ip+1;
684
1542
 
685
1543
  /* check repCode */
686
- if (dictMode == ZSTD_dictMatchState) {
1544
+ if (isDxS) {
687
1545
  const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
688
- const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
1546
+ const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
689
1547
  && repIndex < prefixLowestIndex) ?
690
1548
  dictBase + (repIndex - dictIndexDelta) :
691
1549
  base + repIndex;
@@ -726,7 +1584,7 @@ ZSTD_compressBlock_lazy_generic(
726
1584
  if ((mlRep >= 4) && (gain2 > gain1))
727
1585
  matchLength = mlRep, offset = 0, start = ip;
728
1586
  }
729
- if (dictMode == ZSTD_dictMatchState) {
1587
+ if (isDxS) {
730
1588
  const U32 repIndex = (U32)(ip - base) - offset_1;
731
1589
  const BYTE* repMatch = repIndex < prefixLowestIndex ?
732
1590
  dictBase + (repIndex - dictIndexDelta) :
@@ -761,7 +1619,7 @@ ZSTD_compressBlock_lazy_generic(
761
1619
  if ((mlRep >= 4) && (gain2 > gain1))
762
1620
  matchLength = mlRep, offset = 0, start = ip;
763
1621
  }
764
- if (dictMode == ZSTD_dictMatchState) {
1622
+ if (isDxS) {
765
1623
  const U32 repIndex = (U32)(ip - base) - offset_1;
766
1624
  const BYTE* repMatch = repIndex < prefixLowestIndex ?
767
1625
  dictBase + (repIndex - dictIndexDelta) :
@@ -799,8 +1657,8 @@ ZSTD_compressBlock_lazy_generic(
799
1657
  && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
800
1658
  { start--; matchLength++; }
801
1659
  }
802
- if (dictMode == ZSTD_dictMatchState) {
803
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
1660
+ if (isDxS) {
1661
+ U32 const matchIndex = (U32)((size_t)(start-base) - (offset - ZSTD_REP_MOVE));
804
1662
  const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
805
1663
  const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
806
1664
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
@@ -809,18 +1667,17 @@ ZSTD_compressBlock_lazy_generic(
809
1667
  }
810
1668
  /* store sequence */
811
1669
  _storeSequence:
812
- { size_t const litLength = start - anchor;
1670
+ { size_t const litLength = (size_t)(start - anchor);
813
1671
  ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
814
1672
  anchor = ip = start + matchLength;
815
1673
  }
816
1674
 
817
1675
  /* check immediate repcode */
818
- if (dictMode == ZSTD_dictMatchState) {
1676
+ if (isDxS) {
819
1677
  while (ip <= ilimit) {
820
1678
  U32 const current2 = (U32)(ip-base);
821
1679
  U32 const repIndex = current2 - offset_2;
822
- const BYTE* repMatch = dictMode == ZSTD_dictMatchState
823
- && repIndex < prefixLowestIndex ?
1680
+ const BYTE* repMatch = repIndex < prefixLowestIndex ?
824
1681
  dictBase - dictIndexDelta + repIndex :
825
1682
  base + repIndex;
826
1683
  if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
@@ -915,6 +1772,92 @@ size_t ZSTD_compressBlock_greedy_dictMatchState(
915
1772
  }
916
1773
 
917
1774
 
1775
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
1776
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1777
+ void const* src, size_t srcSize)
1778
+ {
1779
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
1780
+ }
1781
+
1782
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
1783
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1784
+ void const* src, size_t srcSize)
1785
+ {
1786
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
1787
+ }
1788
+
1789
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
1790
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1791
+ void const* src, size_t srcSize)
1792
+ {
1793
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
1794
+ }
1795
+
1796
+ /* Row-based matchfinder */
1797
+ size_t ZSTD_compressBlock_lazy2_row(
1798
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1799
+ void const* src, size_t srcSize)
1800
+ {
1801
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
1802
+ }
1803
+
1804
+ size_t ZSTD_compressBlock_lazy_row(
1805
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1806
+ void const* src, size_t srcSize)
1807
+ {
1808
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
1809
+ }
1810
+
1811
+ size_t ZSTD_compressBlock_greedy_row(
1812
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1813
+ void const* src, size_t srcSize)
1814
+ {
1815
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
1816
+ }
1817
+
1818
+ size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
1819
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1820
+ void const* src, size_t srcSize)
1821
+ {
1822
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
1823
+ }
1824
+
1825
+ size_t ZSTD_compressBlock_lazy_dictMatchState_row(
1826
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1827
+ void const* src, size_t srcSize)
1828
+ {
1829
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
1830
+ }
1831
+
1832
+ size_t ZSTD_compressBlock_greedy_dictMatchState_row(
1833
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1834
+ void const* src, size_t srcSize)
1835
+ {
1836
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
1837
+ }
1838
+
1839
+
1840
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
1841
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1842
+ void const* src, size_t srcSize)
1843
+ {
1844
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
1845
+ }
1846
+
1847
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
1848
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1849
+ void const* src, size_t srcSize)
1850
+ {
1851
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
1852
+ }
1853
+
1854
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
1855
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1856
+ void const* src, size_t srcSize)
1857
+ {
1858
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
1859
+ }
1860
+
918
1861
  FORCE_INLINE_TEMPLATE
919
1862
  size_t ZSTD_compressBlock_lazy_extDict_generic(
920
1863
  ZSTD_matchState_t* ms, seqStore_t* seqStore,
@@ -926,37 +1869,49 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
926
1869
  const BYTE* ip = istart;
927
1870
  const BYTE* anchor = istart;
928
1871
  const BYTE* const iend = istart + srcSize;
929
- const BYTE* const ilimit = iend - 8;
1872
+ const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
930
1873
  const BYTE* const base = ms->window.base;
931
1874
  const U32 dictLimit = ms->window.dictLimit;
932
- const U32 lowestIndex = ms->window.lowLimit;
933
1875
  const BYTE* const prefixStart = base + dictLimit;
934
1876
  const BYTE* const dictBase = ms->window.dictBase;
935
1877
  const BYTE* const dictEnd = dictBase + dictLimit;
936
- const BYTE* const dictStart = dictBase + lowestIndex;
937
-
938
- typedef size_t (*searchMax_f)(
939
- ZSTD_matchState_t* ms,
940
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
941
- searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
1878
+ const BYTE* const dictStart = dictBase + ms->window.lowLimit;
1879
+ const U32 windowLog = ms->cParams.windowLog;
1880
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
942
1881
 
1882
+ searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
943
1883
  U32 offset_1 = rep[0], offset_2 = rep[1];
944
1884
 
1885
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
1886
+
945
1887
  /* init */
946
1888
  ip += (ip == prefixStart);
1889
+ if (searchMethod == search_rowHash) {
1890
+ ZSTD_row_fillHashCache(ms, base, rowLog,
1891
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1892
+ ms->nextToUpdate, ilimit);
1893
+ }
947
1894
 
948
1895
  /* Match Loop */
1896
+ #if defined(__GNUC__) && defined(__x86_64__)
1897
+ /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
1898
+ * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
1899
+ */
1900
+ __asm__(".p2align 5");
1901
+ #endif
949
1902
  while (ip < ilimit) {
950
1903
  size_t matchLength=0;
951
1904
  size_t offset=0;
952
1905
  const BYTE* start=ip+1;
953
- U32 current = (U32)(ip-base);
1906
+ U32 curr = (U32)(ip-base);
954
1907
 
955
1908
  /* check repCode */
956
- { const U32 repIndex = (U32)(current+1 - offset_1);
1909
+ { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
1910
+ const U32 repIndex = (U32)(curr+1 - offset_1);
957
1911
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
958
1912
  const BYTE* const repMatch = repBase + repIndex;
959
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
1913
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
1914
+ & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
960
1915
  if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
961
1916
  /* repcode detected we should take it */
962
1917
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -971,7 +1926,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
971
1926
  matchLength = ml2, start = ip, offset=offsetFound;
972
1927
  }
973
1928
 
974
- if (matchLength < 4) {
1929
+ if (matchLength < 4) {
975
1930
  ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
976
1931
  continue;
977
1932
  }
@@ -980,13 +1935,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
980
1935
  if (depth>=1)
981
1936
  while (ip<ilimit) {
982
1937
  ip ++;
983
- current++;
1938
+ curr++;
984
1939
  /* check repCode */
985
1940
  if (offset) {
986
- const U32 repIndex = (U32)(current - offset_1);
1941
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
1942
+ const U32 repIndex = (U32)(curr - offset_1);
987
1943
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
988
1944
  const BYTE* const repMatch = repBase + repIndex;
989
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
1945
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
1946
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
990
1947
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
991
1948
  /* repcode detected */
992
1949
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1010,13 +1967,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1010
1967
  /* let's find an even better one */
1011
1968
  if ((depth==2) && (ip<ilimit)) {
1012
1969
  ip ++;
1013
- current++;
1970
+ curr++;
1014
1971
  /* check repCode */
1015
1972
  if (offset) {
1016
- const U32 repIndex = (U32)(current - offset_1);
1973
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
1974
+ const U32 repIndex = (U32)(curr - offset_1);
1017
1975
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1018
1976
  const BYTE* const repMatch = repBase + repIndex;
1019
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
1977
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
1978
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
1020
1979
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
1021
1980
  /* repcode detected */
1022
1981
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1041,7 +2000,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1041
2000
 
1042
2001
  /* catch up */
1043
2002
  if (offset) {
1044
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
2003
+ U32 const matchIndex = (U32)((size_t)(start-base) - (offset - ZSTD_REP_MOVE));
1045
2004
  const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
1046
2005
  const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
1047
2006
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
@@ -1050,17 +2009,20 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1050
2009
 
1051
2010
  /* store sequence */
1052
2011
  _storeSequence:
1053
- { size_t const litLength = start - anchor;
2012
+ { size_t const litLength = (size_t)(start - anchor);
1054
2013
  ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
1055
2014
  anchor = ip = start + matchLength;
1056
2015
  }
1057
2016
 
1058
2017
  /* check immediate repcode */
1059
2018
  while (ip <= ilimit) {
1060
- const U32 repIndex = (U32)((ip-base) - offset_2);
2019
+ const U32 repCurrent = (U32)(ip-base);
2020
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
2021
+ const U32 repIndex = repCurrent - offset_2;
1061
2022
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1062
2023
  const BYTE* const repMatch = repBase + repIndex;
1063
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
2024
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2025
+ & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
1064
2026
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
1065
2027
  /* repcode detected we should take it */
1066
2028
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1113,3 +2075,26 @@ size_t ZSTD_compressBlock_btlazy2_extDict(
1113
2075
  {
1114
2076
  return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
1115
2077
  }
2078
+
2079
+ size_t ZSTD_compressBlock_greedy_extDict_row(
2080
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2081
+ void const* src, size_t srcSize)
2082
+ {
2083
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
2084
+ }
2085
+
2086
+ size_t ZSTD_compressBlock_lazy_extDict_row(
2087
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2088
+ void const* src, size_t srcSize)
2089
+
2090
+ {
2091
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
2092
+ }
2093
+
2094
+ size_t ZSTD_compressBlock_lazy2_extDict_row(
2095
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2096
+ void const* src, size_t srcSize)
2097
+
2098
+ {
2099
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
2100
+ }