zstd-ruby 1.4.1.0 → 1.5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.github/dependabot.yml +8 -0
  3. data/.github/workflows/ruby.yml +35 -0
  4. data/README.md +2 -2
  5. data/ext/zstdruby/libzstd/BUCK +5 -7
  6. data/ext/zstdruby/libzstd/Makefile +304 -113
  7. data/ext/zstdruby/libzstd/README.md +83 -20
  8. data/ext/zstdruby/libzstd/common/bitstream.h +59 -51
  9. data/ext/zstdruby/libzstd/common/compiler.h +150 -8
  10. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  11. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  12. data/ext/zstdruby/libzstd/common/debug.h +22 -49
  13. data/ext/zstdruby/libzstd/common/entropy_common.c +201 -75
  14. data/ext/zstdruby/libzstd/common/error_private.c +3 -1
  15. data/ext/zstdruby/libzstd/common/error_private.h +8 -4
  16. data/ext/zstdruby/libzstd/common/fse.h +50 -42
  17. data/ext/zstdruby/libzstd/common/fse_decompress.c +149 -55
  18. data/ext/zstdruby/libzstd/common/huf.h +43 -39
  19. data/ext/zstdruby/libzstd/common/mem.h +69 -25
  20. data/ext/zstdruby/libzstd/common/pool.c +30 -20
  21. data/ext/zstdruby/libzstd/common/pool.h +3 -3
  22. data/ext/zstdruby/libzstd/common/threading.c +51 -4
  23. data/ext/zstdruby/libzstd/common/threading.h +36 -4
  24. data/ext/zstdruby/libzstd/common/xxhash.c +40 -92
  25. data/ext/zstdruby/libzstd/common/xxhash.h +12 -32
  26. data/ext/zstdruby/libzstd/common/zstd_common.c +10 -10
  27. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  28. data/ext/zstdruby/libzstd/common/zstd_internal.h +230 -111
  29. data/ext/zstdruby/libzstd/common/zstd_trace.h +154 -0
  30. data/ext/zstdruby/libzstd/compress/fse_compress.c +47 -63
  31. data/ext/zstdruby/libzstd/compress/hist.c +41 -63
  32. data/ext/zstdruby/libzstd/compress/hist.h +13 -33
  33. data/ext/zstdruby/libzstd/compress/huf_compress.c +332 -193
  34. data/ext/zstdruby/libzstd/compress/zstd_compress.c +3614 -1696
  35. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +546 -86
  36. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +158 -0
  37. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +29 -0
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +441 -0
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +54 -0
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +572 -0
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  42. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +662 -0
  43. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +43 -41
  44. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
  45. data/ext/zstdruby/libzstd/compress/zstd_fast.c +85 -80
  46. data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
  47. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1184 -111
  48. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +59 -1
  49. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +333 -208
  50. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
  51. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +103 -0
  52. data/ext/zstdruby/libzstd/compress/zstd_opt.c +228 -129
  53. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  54. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +151 -440
  55. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +32 -114
  56. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +395 -276
  57. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +20 -16
  58. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
  59. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +630 -231
  60. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +606 -380
  61. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +8 -5
  62. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +39 -9
  63. data/ext/zstdruby/libzstd/deprecated/zbuff.h +9 -8
  64. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
  65. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +1 -1
  66. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  67. data/ext/zstdruby/libzstd/dictBuilder/cover.c +55 -46
  68. data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
  69. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  70. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +43 -31
  71. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +53 -30
  72. data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
  73. data/ext/zstdruby/libzstd/dll/example/README.md +16 -22
  74. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +4 -4
  75. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +24 -14
  76. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +1 -1
  77. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +17 -8
  78. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +1 -1
  79. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +17 -8
  80. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +1 -1
  81. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +25 -11
  82. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +1 -1
  83. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +43 -32
  84. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +2 -2
  85. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +27 -19
  86. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +1 -1
  87. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +32 -20
  88. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +1 -1
  89. data/ext/zstdruby/libzstd/libzstd.pc.in +2 -1
  90. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +201 -31
  91. data/ext/zstdruby/libzstd/zstd.h +740 -153
  92. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +3 -1
  93. data/lib/zstd-ruby/version.rb +1 -1
  94. data/zstd-ruby.gemspec +1 -1
  95. metadata +21 -10
  96. data/.travis.yml +0 -14
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -15,7 +15,7 @@
15
15
  extern "C" {
16
16
  #endif
17
17
 
18
- #include "mem.h" /* U32 */
18
+ #include "../common/mem.h" /* U32 */
19
19
  #include "zstd_compress_internal.h"
20
20
 
21
21
  void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -58,11 +58,11 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
58
58
 
59
59
  /** ZSTD_insertDUBT1() :
60
60
  * sort one already inserted but unsorted position
61
- * assumption : current >= btlow == (current - btmask)
61
+ * assumption : curr >= btlow == (curr - btmask)
62
62
  * doesn't fail */
63
63
  static void
64
64
  ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
65
- U32 current, const BYTE* inputEnd,
65
+ U32 curr, const BYTE* inputEnd,
66
66
  U32 nbCompares, U32 btLow,
67
67
  const ZSTD_dictMode_e dictMode)
68
68
  {
@@ -74,41 +74,41 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
74
74
  const BYTE* const base = ms->window.base;
75
75
  const BYTE* const dictBase = ms->window.dictBase;
76
76
  const U32 dictLimit = ms->window.dictLimit;
77
- const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current;
78
- const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit;
77
+ const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
78
+ const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
79
79
  const BYTE* const dictEnd = dictBase + dictLimit;
80
80
  const BYTE* const prefixStart = base + dictLimit;
81
81
  const BYTE* match;
82
- U32* smallerPtr = bt + 2*(current&btMask);
82
+ U32* smallerPtr = bt + 2*(curr&btMask);
83
83
  U32* largerPtr = smallerPtr + 1;
84
84
  U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
85
85
  U32 dummy32; /* to be nullified at the end */
86
86
  U32 const windowValid = ms->window.lowLimit;
87
87
  U32 const maxDistance = 1U << cParams->windowLog;
88
- U32 const windowLow = (current - windowValid > maxDistance) ? current - maxDistance : windowValid;
88
+ U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;
89
89
 
90
90
 
91
91
  DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
92
- current, dictLimit, windowLow);
93
- assert(current >= btLow);
92
+ curr, dictLimit, windowLow);
93
+ assert(curr >= btLow);
94
94
  assert(ip < iend); /* condition for ZSTD_count */
95
95
 
96
96
  while (nbCompares-- && (matchIndex > windowLow)) {
97
97
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
98
98
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
99
- assert(matchIndex < current);
99
+ assert(matchIndex < curr);
100
100
  /* note : all candidates are now supposed sorted,
101
101
  * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
102
102
  * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
103
103
 
104
104
  if ( (dictMode != ZSTD_extDict)
105
105
  || (matchIndex+matchLength >= dictLimit) /* both in current segment*/
106
- || (current < dictLimit) /* both in extDict */) {
106
+ || (curr < dictLimit) /* both in extDict */) {
107
107
  const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
108
108
  || (matchIndex+matchLength >= dictLimit)) ?
109
109
  base : dictBase;
110
110
  assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */
111
- || (current < dictLimit) );
111
+ || (curr < dictLimit) );
112
112
  match = mBase + matchIndex;
113
113
  matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
114
114
  } else {
@@ -119,7 +119,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
119
119
  }
120
120
 
121
121
  DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
122
- current, matchIndex, (U32)matchLength);
122
+ curr, matchIndex, (U32)matchLength);
123
123
 
124
124
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
125
125
  break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
@@ -168,7 +168,7 @@ ZSTD_DUBT_findBetterDictMatch (
168
168
 
169
169
  const BYTE* const base = ms->window.base;
170
170
  const BYTE* const prefixStart = base + ms->window.dictLimit;
171
- U32 const current = (U32)(ip-base);
171
+ U32 const curr = (U32)(ip-base);
172
172
  const BYTE* const dictBase = dms->window.base;
173
173
  const BYTE* const dictEnd = dms->window.nextSrc;
174
174
  U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
@@ -195,10 +195,10 @@ ZSTD_DUBT_findBetterDictMatch (
195
195
 
196
196
  if (matchLength > bestLength) {
197
197
  U32 matchIndex = dictMatchIndex + dictIndexDelta;
198
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
198
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
199
199
  DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
200
- current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex);
201
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
200
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
201
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
202
202
  }
203
203
  if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
204
204
  break; /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,9 +218,9 @@ ZSTD_DUBT_findBetterDictMatch (
218
218
  }
219
219
 
220
220
  if (bestLength >= MINMATCH) {
221
- U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
221
+ U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
222
222
  DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
223
- current, (U32)bestLength, (U32)*offsetPtr, mIndex);
223
+ curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
224
224
  }
225
225
  return bestLength;
226
226
 
@@ -241,15 +241,13 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
241
241
  U32 matchIndex = hashTable[h];
242
242
 
243
243
  const BYTE* const base = ms->window.base;
244
- U32 const current = (U32)(ip-base);
245
- U32 const maxDistance = 1U << cParams->windowLog;
246
- U32 const windowValid = ms->window.lowLimit;
247
- U32 const windowLow = (current - windowValid > maxDistance) ? current - maxDistance : windowValid;
244
+ U32 const curr = (U32)(ip-base);
245
+ U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);
248
246
 
249
247
  U32* const bt = ms->chainTable;
250
248
  U32 const btLog = cParams->chainLog - 1;
251
249
  U32 const btMask = (1 << btLog) - 1;
252
- U32 const btLow = (btMask >= current) ? 0 : current - btMask;
250
+ U32 const btLow = (btMask >= curr) ? 0 : curr - btMask;
253
251
  U32 const unsortLimit = MAX(btLow, windowLow);
254
252
 
255
253
  U32* nextCandidate = bt + 2*(matchIndex&btMask);
@@ -258,8 +256,9 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
258
256
  U32 nbCandidates = nbCompares;
259
257
  U32 previousCandidate = 0;
260
258
 
261
- DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current);
259
+ DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
262
260
  assert(ip <= iend-8); /* required for h calculation */
261
+ assert(dictMode != ZSTD_dedicatedDictSearch);
263
262
 
264
263
  /* reach end of unsorted candidates list */
265
264
  while ( (matchIndex > unsortLimit)
@@ -301,14 +300,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
301
300
  const U32 dictLimit = ms->window.dictLimit;
302
301
  const BYTE* const dictEnd = dictBase + dictLimit;
303
302
  const BYTE* const prefixStart = base + dictLimit;
304
- U32* smallerPtr = bt + 2*(current&btMask);
305
- U32* largerPtr = bt + 2*(current&btMask) + 1;
306
- U32 matchEndIdx = current + 8 + 1;
303
+ U32* smallerPtr = bt + 2*(curr&btMask);
304
+ U32* largerPtr = bt + 2*(curr&btMask) + 1;
305
+ U32 matchEndIdx = curr + 8 + 1;
307
306
  U32 dummy32; /* to be nullified at the end */
308
307
  size_t bestLength = 0;
309
308
 
310
309
  matchIndex = hashTable[h];
311
- hashTable[h] = current; /* Update Hash Table */
310
+ hashTable[h] = curr; /* Update Hash Table */
312
311
 
313
312
  while (nbCompares-- && (matchIndex > windowLow)) {
314
313
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
@@ -328,8 +327,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
328
327
  if (matchLength > bestLength) {
329
328
  if (matchLength > matchEndIdx - matchIndex)
330
329
  matchEndIdx = matchIndex + (U32)matchLength;
331
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
332
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
330
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
331
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
333
332
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
334
333
  if (dictMode == ZSTD_dictMatchState) {
335
334
  nbCompares = 0; /* in addition to avoiding checking any
@@ -365,12 +364,12 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
365
364
  mls, dictMode);
366
365
  }
367
366
 
368
- assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */
367
+ assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
369
368
  ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
370
369
  if (bestLength >= MINMATCH) {
371
- U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
370
+ U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
372
371
  DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
373
- current, (U32)bestLength, (U32)*offsetPtr, mIndex);
372
+ curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
374
373
  }
375
374
  return bestLength;
376
375
  }
@@ -439,6 +438,220 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
439
438
  }
440
439
  }
441
440
 
441
+ /***********************************
442
+ * Dedicated dict search
443
+ ***********************************/
444
+
445
+ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
446
+ {
447
+ const BYTE* const base = ms->window.base;
448
+ U32 const target = (U32)(ip - base);
449
+ U32* const hashTable = ms->hashTable;
450
+ U32* const chainTable = ms->chainTable;
451
+ U32 const chainSize = 1 << ms->cParams.chainLog;
452
+ U32 idx = ms->nextToUpdate;
453
+ U32 const minChain = chainSize < target ? target - chainSize : idx;
454
+ U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
455
+ U32 const cacheSize = bucketSize - 1;
456
+ U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
457
+ U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
458
+
459
+ /* We know the hashtable is oversized by a factor of `bucketSize`.
460
+ * We are going to temporarily pretend `bucketSize == 1`, keeping only a
461
+ * single entry. We will use the rest of the space to construct a temporary
462
+ * chaintable.
463
+ */
464
+ U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
465
+ U32* const tmpHashTable = hashTable;
466
+ U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
467
+ U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
468
+ U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
469
+ U32 hashIdx;
470
+
471
+ assert(ms->cParams.chainLog <= 24);
472
+ assert(ms->cParams.hashLog > ms->cParams.chainLog);
473
+ assert(idx != 0);
474
+ assert(tmpMinChain <= minChain);
475
+
476
+ /* fill conventional hash table and conventional chain table */
477
+ for ( ; idx < target; idx++) {
478
+ U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
479
+ if (idx >= tmpMinChain) {
480
+ tmpChainTable[idx - tmpMinChain] = hashTable[h];
481
+ }
482
+ tmpHashTable[h] = idx;
483
+ }
484
+
485
+ /* sort chains into ddss chain table */
486
+ {
487
+ U32 chainPos = 0;
488
+ for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
489
+ U32 count;
490
+ U32 countBeyondMinChain = 0;
491
+ U32 i = tmpHashTable[hashIdx];
492
+ for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
493
+ /* skip through the chain to the first position that won't be
494
+ * in the hash cache bucket */
495
+ if (i < minChain) {
496
+ countBeyondMinChain++;
497
+ }
498
+ i = tmpChainTable[i - tmpMinChain];
499
+ }
500
+ if (count == cacheSize) {
501
+ for (count = 0; count < chainLimit;) {
502
+ if (i < minChain) {
503
+ if (!i || ++countBeyondMinChain > cacheSize) {
504
+ /* only allow pulling `cacheSize` number of entries
505
+ * into the cache or chainTable beyond `minChain`,
506
+ * to replace the entries pulled out of the
507
+ * chainTable into the cache. This lets us reach
508
+ * back further without increasing the total number
509
+ * of entries in the chainTable, guaranteeing the
510
+ * DDSS chain table will fit into the space
511
+ * allocated for the regular one. */
512
+ break;
513
+ }
514
+ }
515
+ chainTable[chainPos++] = i;
516
+ count++;
517
+ if (i < tmpMinChain) {
518
+ break;
519
+ }
520
+ i = tmpChainTable[i - tmpMinChain];
521
+ }
522
+ } else {
523
+ count = 0;
524
+ }
525
+ if (count) {
526
+ tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
527
+ } else {
528
+ tmpHashTable[hashIdx] = 0;
529
+ }
530
+ }
531
+ assert(chainPos <= chainSize); /* I believe this is guaranteed... */
532
+ }
533
+
534
+ /* move chain pointers into the last entry of each hash bucket */
535
+ for (hashIdx = (1 << hashLog); hashIdx; ) {
536
+ U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
537
+ U32 const chainPackedPointer = tmpHashTable[hashIdx];
538
+ U32 i;
539
+ for (i = 0; i < cacheSize; i++) {
540
+ hashTable[bucketIdx + i] = 0;
541
+ }
542
+ hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
543
+ }
544
+
545
+ /* fill the buckets of the hash table */
546
+ for (idx = ms->nextToUpdate; idx < target; idx++) {
547
+ U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
548
+ << ZSTD_LAZY_DDSS_BUCKET_LOG;
549
+ U32 i;
550
+ /* Shift hash cache down 1. */
551
+ for (i = cacheSize - 1; i; i--)
552
+ hashTable[h + i] = hashTable[h + i - 1];
553
+ hashTable[h] = idx;
554
+ }
555
+
556
+ ms->nextToUpdate = target;
557
+ }
558
+
559
+ /* Returns the longest match length found in the dedicated dict search structure.
560
+ * If none are longer than the argument ml, then ml will be returned.
561
+ */
562
+ FORCE_INLINE_TEMPLATE
563
+ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
564
+ const ZSTD_matchState_t* const dms,
565
+ const BYTE* const ip, const BYTE* const iLimit,
566
+ const BYTE* const prefixStart, const U32 curr,
567
+ const U32 dictLimit, const size_t ddsIdx) {
568
+ const U32 ddsLowestIndex = dms->window.dictLimit;
569
+ const BYTE* const ddsBase = dms->window.base;
570
+ const BYTE* const ddsEnd = dms->window.nextSrc;
571
+ const U32 ddsSize = (U32)(ddsEnd - ddsBase);
572
+ const U32 ddsIndexDelta = dictLimit - ddsSize;
573
+ const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
574
+ const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
575
+ U32 ddsAttempt;
576
+ U32 matchIndex;
577
+
578
+ for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
579
+ PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
580
+ }
581
+
582
+ {
583
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
584
+ U32 const chainIndex = chainPackedPointer >> 8;
585
+
586
+ PREFETCH_L1(&dms->chainTable[chainIndex]);
587
+ }
588
+
589
+ for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
590
+ size_t currentMl=0;
591
+ const BYTE* match;
592
+ matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
593
+ match = ddsBase + matchIndex;
594
+
595
+ if (!matchIndex) {
596
+ return ml;
597
+ }
598
+
599
+ /* guaranteed by table construction */
600
+ (void)ddsLowestIndex;
601
+ assert(matchIndex >= ddsLowestIndex);
602
+ assert(match+4 <= ddsEnd);
603
+ if (MEM_read32(match) == MEM_read32(ip)) {
604
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
605
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
606
+ }
607
+
608
+ /* save best solution */
609
+ if (currentMl > ml) {
610
+ ml = currentMl;
611
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
612
+ if (ip+currentMl == iLimit) {
613
+ /* best possible, avoids read overflow on next attempt */
614
+ return ml;
615
+ }
616
+ }
617
+ }
618
+
619
+ {
620
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
621
+ U32 chainIndex = chainPackedPointer >> 8;
622
+ U32 const chainLength = chainPackedPointer & 0xFF;
623
+ U32 const chainAttempts = nbAttempts - ddsAttempt;
624
+ U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
625
+ U32 chainAttempt;
626
+
627
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
628
+ PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
629
+ }
630
+
631
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
632
+ size_t currentMl=0;
633
+ const BYTE* match;
634
+ matchIndex = dms->chainTable[chainIndex];
635
+ match = ddsBase + matchIndex;
636
+
637
+ /* guaranteed by table construction */
638
+ assert(matchIndex >= ddsLowestIndex);
639
+ assert(match+4 <= ddsEnd);
640
+ if (MEM_read32(match) == MEM_read32(ip)) {
641
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
642
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
643
+ }
644
+
645
+ /* save best solution */
646
+ if (currentMl > ml) {
647
+ ml = currentMl;
648
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
649
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
650
+ }
651
+ }
652
+ }
653
+ return ml;
654
+ }
442
655
 
443
656
 
444
657
  /* *********************************
@@ -448,7 +661,7 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
448
661
 
449
662
  /* Update chains up to ip (excluded)
450
663
  Assumption : always within prefix (i.e. not within extDict) */
451
- static U32 ZSTD_insertAndFindFirstIndex_internal(
664
+ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
452
665
  ZSTD_matchState_t* ms,
453
666
  const ZSTD_compressionParameters* const cParams,
454
667
  const BYTE* ip, U32 const mls)
@@ -477,7 +690,6 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
477
690
  return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
478
691
  }
479
692
 
480
-
481
693
  /* inlining is important to hardwire a hot branch (template emulation) */
482
694
  FORCE_INLINE_TEMPLATE
483
695
  size_t ZSTD_HcFindBestMatch_generic (
@@ -495,18 +707,33 @@ size_t ZSTD_HcFindBestMatch_generic (
495
707
  const U32 dictLimit = ms->window.dictLimit;
496
708
  const BYTE* const prefixStart = base + dictLimit;
497
709
  const BYTE* const dictEnd = dictBase + dictLimit;
498
- const U32 current = (U32)(ip-base);
710
+ const U32 curr = (U32)(ip-base);
499
711
  const U32 maxDistance = 1U << cParams->windowLog;
500
- const U32 lowValid = ms->window.lowLimit;
501
- const U32 lowLimit = (current - lowValid > maxDistance) ? current - maxDistance : lowValid;
502
- const U32 minChain = current > chainSize ? current - chainSize : 0;
712
+ const U32 lowestValid = ms->window.lowLimit;
713
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
714
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
715
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
716
+ const U32 minChain = curr > chainSize ? curr - chainSize : 0;
503
717
  U32 nbAttempts = 1U << cParams->searchLog;
504
718
  size_t ml=4-1;
505
719
 
720
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
721
+ const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
722
+ ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
723
+ const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
724
+ ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
725
+
726
+ U32 matchIndex;
727
+
728
+ if (dictMode == ZSTD_dedicatedDictSearch) {
729
+ const U32* entry = &dms->hashTable[ddsIdx];
730
+ PREFETCH_L1(entry);
731
+ }
732
+
506
733
  /* HC4 match finder */
507
- U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
734
+ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
508
735
 
509
- for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
736
+ for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
510
737
  size_t currentMl=0;
511
738
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
512
739
  const BYTE* const match = base + matchIndex;
@@ -523,7 +750,7 @@ size_t ZSTD_HcFindBestMatch_generic (
523
750
  /* save best solution */
524
751
  if (currentMl > ml) {
525
752
  ml = currentMl;
526
- *offsetPtr = current - matchIndex + ZSTD_REP_MOVE;
753
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
527
754
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
528
755
  }
529
756
 
@@ -531,8 +758,10 @@ size_t ZSTD_HcFindBestMatch_generic (
531
758
  matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
532
759
  }
533
760
 
534
- if (dictMode == ZSTD_dictMatchState) {
535
- const ZSTD_matchState_t* const dms = ms->dictMatchState;
761
+ if (dictMode == ZSTD_dedicatedDictSearch) {
762
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
763
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
764
+ } else if (dictMode == ZSTD_dictMatchState) {
536
765
  const U32* const dmsChainTable = dms->chainTable;
537
766
  const U32 dmsChainSize = (1 << dms->cParams.chainLog);
538
767
  const U32 dmsChainMask = dmsChainSize - 1;
@@ -545,7 +774,7 @@ size_t ZSTD_HcFindBestMatch_generic (
545
774
 
546
775
  matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
547
776
 
548
- for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
777
+ for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
549
778
  size_t currentMl=0;
550
779
  const BYTE* const match = dmsBase + matchIndex;
551
780
  assert(match+4 <= dmsEnd);
@@ -555,11 +784,12 @@ size_t ZSTD_HcFindBestMatch_generic (
555
784
  /* save best solution */
556
785
  if (currentMl > ml) {
557
786
  ml = currentMl;
558
- *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
787
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
559
788
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
560
789
  }
561
790
 
562
791
  if (matchIndex <= dmsMinChain) break;
792
+
563
793
  matchIndex = dmsChainTable[matchIndex & dmsChainMask];
564
794
  }
565
795
  }
@@ -600,6 +830,22 @@ static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
600
830
  }
601
831
 
602
832
 
833
+ static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS (
834
+ ZSTD_matchState_t* ms,
835
+ const BYTE* ip, const BYTE* const iLimit,
836
+ size_t* offsetPtr)
837
+ {
838
+ switch(ms->cParams.minMatch)
839
+ {
840
+ default : /* includes case 3 */
841
+ case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch);
842
+ case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch);
843
+ case 7 :
844
+ case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch);
845
+ }
846
+ }
847
+
848
+
603
849
  FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
604
850
  ZSTD_matchState_t* ms,
605
851
  const BYTE* ip, const BYTE* const iLimit,
@@ -615,73 +861,765 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
615
861
  }
616
862
  }
617
863
 
864
+ /* *********************************
865
+ * (SIMD) Row-based matchfinder
866
+ ***********************************/
867
+ /* Constants for row-based hash */
868
+ #define ZSTD_ROW_HASH_TAG_OFFSET 1 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
869
+ #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
870
+ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
871
+
872
+ #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
873
+
874
+ typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */
875
+
876
+ #if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */
877
+
878
+ #include <emmintrin.h>
879
+ typedef __m128i ZSTD_Vec128;
880
+
881
+ /* Returns a 128-bit container with 128-bits from src */
882
+ static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
883
+ return _mm_loadu_si128((ZSTD_Vec128 const*)src);
884
+ }
885
+
886
+ /* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */
887
+ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
888
+ return _mm_set1_epi8((char)val);
889
+ }
890
+
891
+ /* Do a byte-by-byte comparison of x and y. Then collapse the 128-bit resultant mask
892
+ * into a 32-bit mask that is the MSB of each byte.
893
+ * */
894
+ static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
895
+ return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
896
+ }
897
+
898
+ typedef struct {
899
+ __m128i fst;
900
+ __m128i snd;
901
+ } ZSTD_Vec256;
902
+
903
+ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
904
+ ZSTD_Vec256 v;
905
+ v.fst = ZSTD_Vec128_read(ptr);
906
+ v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
907
+ return v;
908
+ }
909
+
910
+ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
911
+ ZSTD_Vec256 v;
912
+ v.fst = ZSTD_Vec128_set8(val);
913
+ v.snd = ZSTD_Vec128_set8(val);
914
+ return v;
915
+ }
916
+
917
+ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
918
+ ZSTD_VecMask fstMask;
919
+ ZSTD_VecMask sndMask;
920
+ fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
921
+ sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
922
+ return fstMask | (sndMask << 16);
923
+ }
924
+
925
+ #elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */
926
+
927
+ #include <arm_neon.h>
928
+ typedef uint8x16_t ZSTD_Vec128;
929
+
930
+ static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
931
+ return vld1q_u8((const BYTE* const)src);
932
+ }
933
+
934
+ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
935
+ return vdupq_n_u8(val);
936
+ }
937
+
938
+ /* Mimics '_mm_movemask_epi8()' from SSE */
939
+ static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) {
940
+ /* Shift out everything but the MSB bits in each byte */
941
+ uint16x8_t highBits = vreinterpretq_u16_u8(vshrq_n_u8(val, 7));
942
+ /* Merge the even lanes together with vsra (right shift and add) */
943
+ uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(highBits, highBits, 7));
944
+ uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
945
+ uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
946
+ /* Extract the low 8 bits from each lane, merge */
947
+ return vgetq_lane_u8(paired64, 0) | ((U32)vgetq_lane_u8(paired64, 8) << 8);
948
+ }
949
+
950
+ static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
951
+ return (ZSTD_VecMask)ZSTD_vmovmaskq_u8(vceqq_u8(x, y));
952
+ }
953
+
954
+ typedef struct {
955
+ uint8x16_t fst;
956
+ uint8x16_t snd;
957
+ } ZSTD_Vec256;
958
+
959
+ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
960
+ ZSTD_Vec256 v;
961
+ v.fst = ZSTD_Vec128_read(ptr);
962
+ v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
963
+ return v;
964
+ }
965
+
966
+ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
967
+ ZSTD_Vec256 v;
968
+ v.fst = ZSTD_Vec128_set8(val);
969
+ v.snd = ZSTD_Vec128_set8(val);
970
+ return v;
971
+ }
972
+
973
+ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
974
+ ZSTD_VecMask fstMask;
975
+ ZSTD_VecMask sndMask;
976
+ fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
977
+ sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
978
+ return fstMask | (sndMask << 16);
979
+ }
980
+
981
+ #else /* Scalar fallback version */
982
+
983
+ #define VEC128_NB_SIZE_T (16 / sizeof(size_t))
984
+ typedef struct {
985
+ size_t vec[VEC128_NB_SIZE_T];
986
+ } ZSTD_Vec128;
987
+
988
+ static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
989
+ ZSTD_Vec128 ret;
990
+ ZSTD_memcpy(ret.vec, src, VEC128_NB_SIZE_T*sizeof(size_t));
991
+ return ret;
992
+ }
993
+
994
+ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
995
+ ZSTD_Vec128 ret = { {0} };
996
+ int startBit = sizeof(size_t) * 8 - 8;
997
+ for (;startBit >= 0; startBit -= 8) {
998
+ unsigned j = 0;
999
+ for (;j < VEC128_NB_SIZE_T; ++j) {
1000
+ ret.vec[j] |= ((size_t)val << startBit);
1001
+ }
1002
+ }
1003
+ return ret;
1004
+ }
1005
+
1006
+ /* Compare x to y, byte by byte, generating a "matches" bitfield */
1007
+ static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
1008
+ ZSTD_VecMask res = 0;
1009
+ unsigned i = 0;
1010
+ unsigned l = 0;
1011
+ for (; i < VEC128_NB_SIZE_T; ++i) {
1012
+ const size_t cmp1 = x.vec[i];
1013
+ const size_t cmp2 = y.vec[i];
1014
+ unsigned j = 0;
1015
+ for (; j < sizeof(size_t); ++j, ++l) {
1016
+ if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
1017
+ res |= ((U32)1 << (j+i*sizeof(size_t)));
1018
+ }
1019
+ }
1020
+ }
1021
+ return res;
1022
+ }
1023
+
1024
+ #define VEC256_NB_SIZE_T (2*VEC128_NB_SIZE_T) /* nb of size_t words in a 256-bit vector; parenthesized so the macro expands safely inside larger expressions */
1025
+ typedef struct {
1026
+ size_t vec[VEC256_NB_SIZE_T];
1027
+ } ZSTD_Vec256;
1028
+
1029
+ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) {
1030
+ ZSTD_Vec256 ret;
1031
+ ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t));
1032
+ return ret;
1033
+ }
1034
+
1035
+ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
1036
+ ZSTD_Vec256 ret = { {0} };
1037
+ int startBit = sizeof(size_t) * 8 - 8;
1038
+ for (;startBit >= 0; startBit -= 8) {
1039
+ unsigned j = 0;
1040
+ for (;j < VEC256_NB_SIZE_T; ++j) {
1041
+ ret.vec[j] |= ((size_t)val << startBit);
1042
+ }
1043
+ }
1044
+ return ret;
1045
+ }
1046
+
1047
+ /* Compare x to y, byte by byte, generating a "matches" bitfield */
1048
+ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
1049
+ ZSTD_VecMask res = 0;
1050
+ unsigned i = 0;
1051
+ unsigned l = 0;
1052
+ for (; i < VEC256_NB_SIZE_T; ++i) {
1053
+ const size_t cmp1 = x.vec[i];
1054
+ const size_t cmp2 = y.vec[i];
1055
+ unsigned j = 0;
1056
+ for (; j < sizeof(size_t); ++j, ++l) {
1057
+ if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
1058
+ res |= ((U32)1 << (j+i*sizeof(size_t)));
1059
+ }
1060
+ }
1061
+ }
1062
+ return res;
1063
+ }
1064
+
1065
+ #endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */
1066
+
1067
+ /* ZSTD_VecMask_next():
1068
+ * Starting from the LSB, returns the idx of the next non-zero bit.
1069
+ * Basically counting the nb of trailing zeroes.
1070
+ */
1071
+ static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
1072
+ # if defined(_MSC_VER) /* Visual */
1073
+ unsigned long r=0;
1074
+ return _BitScanForward(&r, val) ? (U32)r : 0; /* _BitScanForward() reports failure when val == 0; index 0 is returned in that case */
1075
+ # elif defined(__GNUC__) && (__GNUC__ >= 3)
1076
+ return (U32)__builtin_ctz(val); /* result is undefined for val == 0, as for the builtin itself */
1077
+ # else
1078
+ /* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */
1079
+ static const U32 multiplyDeBruijnBitPosition[32] =
1080
+ {
1081
+ 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
1082
+ 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
1083
+ };
1084
+ return multiplyDeBruijnBitPosition[((U32)((val & (0U - val)) * 0x077CB531U)) >> 27]; /* val & (0U - val) isolates the lowest set bit using unsigned arithmetic only */
1085
+ # endif
1086
+ }
1087
+
1088
+ /* ZSTD_VecMask_rotateRight():
1089
+ * Rotates a bitfield to the right by "rotation" bits.
1090
+ * "rotation" is expected to be smaller than totalBits (the caller in this file masks it with rowMask); totalBits must be 16 or 32.
1091
+ */
1092
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
1093
+ ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) {
1094
+ if (rotation == 0) /* handled up front so the (totalBits - rotation) shift below is never a full 32-bit shift */
1095
+ return mask;
1096
+ switch (totalBits) {
1097
+ default:
1098
+ assert(0); /* unsupported width; when asserts are compiled out, control falls through to the 16-bit case */
1099
+ case 16:
1100
+ return (mask >> rotation) | (U16)(mask << (16 - rotation)); /* the (U16) cast keeps only the low 16 bits of the wrapped-around part */
1101
+ case 32:
1102
+ return (mask >> rotation) | (U32)(mask << (32 - rotation));
1103
+ }
1104
+ }
1105
+
1106
+ /* ZSTD_row_nextIndex():
1107
+ * Returns the next index to insert at within a tagTable row, and updates the "head"
1108
+ * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
1109
+ */
1110
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
1111
+ U32 const next = (*tagRow - 1) & rowMask;
1112
+ *tagRow = (BYTE)next;
1113
+ return next;
1114
+ }
1115
+
1116
+ /* ZSTD_isAligned():
1117
+ * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
1118
+ */
1119
+ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
1120
+ assert((align & (align - 1)) == 0);
1121
+ return (((size_t)ptr) & (align - 1)) == 0;
1122
+ }
1123
+
1124
+ /* ZSTD_row_prefetch():
1125
+ * Performs prefetching for the hashTable and tagTable at a given row.
1126
+ */
1127
+ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
1128
+ PREFETCH_L1(hashTable + relRow);
1129
+ if (rowLog == 5) {
1130
+ PREFETCH_L1(hashTable + relRow + 16);
1131
+ }
1132
+ PREFETCH_L1(tagTable + relRow);
1133
+ assert(rowLog == 4 || rowLog == 5);
1134
+ assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
1135
+ assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */
1136
+ }
1137
+
1138
+ /* ZSTD_row_fillHashCache():
1139
+ * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
1140
+ * but not beyond iLimit.
1141
+ */
1142
+ static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
1143
+ U32 const rowLog, U32 const mls,
1144
+ U32 idx, const BYTE* const iLimit)
1145
+ {
1146
+ U32 const* const hashTable = ms->hashTable;
1147
+ U16 const* const tagTable = ms->tagTable;
1148
+ U32 const hashLog = ms->rowHashLog;
1149
+ U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
1150
+ U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
1151
+
1152
+ for (; idx < lim; ++idx) {
1153
+ U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1154
+ U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1155
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
1156
+ ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
1157
+ }
1158
+
1159
+ DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
1160
+ ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
1161
+ ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
1162
+ }
1163
+
1164
+ /* ZSTD_row_nextCachedHash():
1165
+ * Returns the cached hash of base + idx, and replaces it in the hash cache with the hash of the bytes at
1166
+ * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
1167
+ */
1168
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
1169
+ U16 const* tagTable, BYTE const* base,
1170
+ U32 idx, U32 const hashLog,
1171
+ U32 const rowLog, U32 const mls)
1172
+ {
1173
+ U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1174
+ U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1175
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
1176
+ { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
1177
+ cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
1178
+ return hash;
1179
+ }
1180
+ }
1181
+
1182
+ /* ZSTD_row_update_internal():
1183
+ * Inserts every position from ms->nextToUpdate up to (but not including) ip into the hash table and tagTable.
1184
+ * Determines the relative row, and the position within the {16, 32} entry row to insert at.
1185
+ */
1186
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
1187
+ U32 const mls, U32 const rowLog,
1188
+ U32 const rowMask, U32 const useCache)
1189
+ {
1190
+ U32* const hashTable = ms->hashTable;
1191
+ U16* const tagTable = ms->tagTable;
1192
+ U32 const hashLog = ms->rowHashLog;
1193
+ const BYTE* const base = ms->window.base;
1194
+ const U32 target = (U32)(ip - base);
1195
+ U32 idx = ms->nextToUpdate;
1196
+
1197
+ DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target);
1198
+ for (; idx < target; ++idx) {
1199
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls)
1200
+ : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1201
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1202
+ U32* const row = hashTable + relRow;
1203
+ BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
1204
+ Explicit cast allows us to get exact desired position within each row */
1205
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1206
+
1207
+ assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
1208
+ ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
1209
+ row[pos] = idx;
1210
+ }
1211
+ ms->nextToUpdate = target;
1212
+ }
1213
+
1214
+ /* ZSTD_row_update():
1215
+ * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
1216
+ * processing.
1217
+ */
1218
+ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
1219
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1220
+ const U32 rowMask = (1u << rowLog) - 1;
1221
+ const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
1222
+
1223
+ DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
1224
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
1225
+ }
1226
+
1227
+ /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
1228
+ * the hash at the nth position in a row of the tagTable.
1229
+ */
1230
+ FORCE_INLINE_TEMPLATE
1231
+ ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) {
1232
+ ZSTD_VecMask matches = 0;
1233
+ if (rowEntries == 16) {
1234
+ ZSTD_Vec128 hashes = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
1235
+ ZSTD_Vec128 expandedTags = ZSTD_Vec128_set8(tag);
1236
+ matches = ZSTD_Vec128_cmpMask8(hashes, expandedTags);
1237
+ } else if (rowEntries == 32) {
1238
+ ZSTD_Vec256 hashes = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
1239
+ ZSTD_Vec256 expandedTags = ZSTD_Vec256_set8(tag);
1240
+ matches = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
1241
+ } else {
1242
+ assert(0);
1243
+ }
1244
+ /* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1245
+ to match up with the actual layout of the entries within the hashTable */
1246
+ return ZSTD_VecMask_rotateRight(matches, head, rowEntries);
1247
+ }
1248
+
1249
+ /* The high-level approach of the SIMD row based match finder is as follows:
1250
+ * - Figure out where to insert the new entry:
1251
+ * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
1252
+ * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
1253
+ * which row to insert into.
1254
+ * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
1255
+ * be considered as a circular buffer with a "head" index that resides in the tagTable.
1256
+ * - Also insert the "tag" into the equivalent row and position in the tagTable.
1257
+ * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
1258
+ * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
1259
+ * for alignment/performance reasons, leaving some bytes unused.
1260
+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
1261
+ * generate a bitfield that we can cycle through to check the collisions in the hash table.
1262
+ * - Pick the longest match.
1263
+ */
1264
+ FORCE_INLINE_TEMPLATE
1265
+ size_t ZSTD_RowFindBestMatch_generic (
1266
+ ZSTD_matchState_t* ms,
1267
+ const BYTE* const ip, const BYTE* const iLimit,
1268
+ size_t* offsetPtr,
1269
+ const U32 mls, const ZSTD_dictMode_e dictMode,
1270
+ const U32 rowLog)
1271
+ {
1272
+ U32* const hashTable = ms->hashTable;
1273
+ U16* const tagTable = ms->tagTable;
1274
+ U32* const hashCache = ms->hashCache;
1275
+ const U32 hashLog = ms->rowHashLog;
1276
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
1277
+ const BYTE* const base = ms->window.base;
1278
+ const BYTE* const dictBase = ms->window.dictBase;
1279
+ const U32 dictLimit = ms->window.dictLimit;
1280
+ const BYTE* const prefixStart = base + dictLimit;
1281
+ const BYTE* const dictEnd = dictBase + dictLimit;
1282
+ const U32 curr = (U32)(ip-base);
1283
+ const U32 maxDistance = 1U << cParams->windowLog;
1284
+ const U32 lowestValid = ms->window.lowLimit;
1285
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
1286
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
1287
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
1288
+ const U32 rowEntries = (1U << rowLog);
1289
+ const U32 rowMask = rowEntries - 1;
1290
+ const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
1291
+ U32 nbAttempts = 1U << cappedSearchLog;
1292
+ size_t ml=4-1;
1293
+
1294
+ /* DMS/DDS variables that may be referenced later */
1295
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
1296
+ size_t ddsIdx;
1297
+ U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
1298
+ U32 dmsTag;
1299
+ U32* dmsRow;
1300
+ BYTE* dmsTagRow;
1301
+
1302
+ if (dictMode == ZSTD_dedicatedDictSearch) {
1303
+ const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
1304
+ { /* Prefetch DDS hashtable entry */
1305
+ ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
1306
+ PREFETCH_L1(&dms->hashTable[ddsIdx]);
1307
+ }
1308
+ ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
1309
+ }
1310
+
1311
+ if (dictMode == ZSTD_dictMatchState) {
1312
+ /* Prefetch DMS rows */
1313
+ U32* const dmsHashTable = dms->hashTable;
1314
+ U16* const dmsTagTable = dms->tagTable;
1315
+ U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1316
+ U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1317
+ dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
1318
+ dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
1319
+ dmsRow = dmsHashTable + dmsRelRow;
1320
+ ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
1321
+ }
1322
+
1323
+ /* Update the hashTable and tagTable up to (but not including) ip */
1324
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
1325
+ { /* Get the hash for ip, compute the appropriate row */
1326
+ U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
1327
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1328
+ U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
1329
+ U32* const row = hashTable + relRow;
1330
+ BYTE* tagRow = (BYTE*)(tagTable + relRow);
1331
+ U32 const head = *tagRow & rowMask;
1332
+ U32 matchBuffer[32 /* maximum nb entries per row */];
1333
+ size_t numMatches = 0;
1334
+ size_t currMatch = 0;
1335
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
1336
+
1337
+ /* Cycle through the matches and prefetch */
1338
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1339
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1340
+ U32 const matchIndex = row[matchPos];
1341
+ assert(numMatches < rowEntries);
1342
+ if (matchIndex < lowLimit)
1343
+ break;
1344
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1345
+ PREFETCH_L1(base + matchIndex);
1346
+ } else {
1347
+ PREFETCH_L1(dictBase + matchIndex);
1348
+ }
1349
+ matchBuffer[numMatches++] = matchIndex;
1350
+ }
1351
+
1352
+ /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
1353
+ in ZSTD_row_update_internal() at the next search. */
1354
+ {
1355
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1356
+ tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
1357
+ row[pos] = ms->nextToUpdate++;
1358
+ }
1359
+
1360
+ /* Return the longest match */
1361
+ for (; currMatch < numMatches; ++currMatch) {
1362
+ U32 const matchIndex = matchBuffer[currMatch];
1363
+ size_t currentMl=0;
1364
+ assert(matchIndex < curr);
1365
+ assert(matchIndex >= lowLimit);
1366
+
1367
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1368
+ const BYTE* const match = base + matchIndex;
1369
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
1370
+ if (match[ml] == ip[ml]) /* potentially better */
1371
+ currentMl = ZSTD_count(ip, match, iLimit);
1372
+ } else {
1373
+ const BYTE* const match = dictBase + matchIndex;
1374
+ assert(match+4 <= dictEnd);
1375
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
1376
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
1377
+ }
1378
+
1379
+ /* Save best solution */
1380
+ if (currentMl > ml) {
1381
+ ml = currentMl;
1382
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
1383
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
1384
+ }
1385
+ }
1386
+ }
1387
+
1388
+ if (dictMode == ZSTD_dedicatedDictSearch) {
1389
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
1390
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
1391
+ } else if (dictMode == ZSTD_dictMatchState) {
1392
+ /* TODO: Measure and potentially add prefetching to DMS */
1393
+ const U32 dmsLowestIndex = dms->window.dictLimit;
1394
+ const BYTE* const dmsBase = dms->window.base;
1395
+ const BYTE* const dmsEnd = dms->window.nextSrc;
1396
+ const U32 dmsSize = (U32)(dmsEnd - dmsBase);
1397
+ const U32 dmsIndexDelta = dictLimit - dmsSize;
1398
+
1399
+ { U32 const head = *dmsTagRow & rowMask;
1400
+ U32 matchBuffer[32 /* maximum nb row entries */];
1401
+ size_t numMatches = 0;
1402
+ size_t currMatch = 0;
1403
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
1404
+
1405
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1406
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1407
+ U32 const matchIndex = dmsRow[matchPos];
1408
+ if (matchIndex < dmsLowestIndex)
1409
+ break;
1410
+ PREFETCH_L1(dmsBase + matchIndex);
1411
+ matchBuffer[numMatches++] = matchIndex;
1412
+ }
1413
+
1414
+ /* Return the longest match */
1415
+ for (; currMatch < numMatches; ++currMatch) {
1416
+ U32 const matchIndex = matchBuffer[currMatch];
1417
+ size_t currentMl=0;
1418
+ assert(matchIndex >= dmsLowestIndex);
1419
+ assert(matchIndex < curr);
1420
+
1421
+ { const BYTE* const match = dmsBase + matchIndex;
1422
+ assert(match+4 <= dmsEnd);
1423
+ if (MEM_read32(match) == MEM_read32(ip))
1424
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
1425
+ }
1426
+
1427
+ if (currentMl > ml) {
1428
+ ml = currentMl;
1429
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
1430
+ if (ip+currentMl == iLimit) break;
1431
+ }
1432
+ }
1433
+ }
1434
+ }
1435
+ return ml;
1436
+ }
1437
+
1438
+ /* Inlining is important to hardwire a hot branch (template emulation) */
1439
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS (
1440
+ ZSTD_matchState_t* ms,
1441
+ const BYTE* ip, const BYTE* const iLimit,
1442
+ const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog)
1443
+ {
1444
+ switch(ms->cParams.minMatch)
1445
+ {
1446
+ default : /* includes case 3 */
1447
+ case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog);
1448
+ case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog);
1449
+ case 7 :
1450
+ case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog);
1451
+ }
1452
+ }
1453
+
1454
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog (
1455
+ ZSTD_matchState_t* ms,
1456
+ const BYTE* ip, const BYTE* const iLimit,
1457
+ size_t* offsetPtr)
1458
+ {
1459
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1460
+ switch(cappedSearchLog)
1461
+ {
1462
+ default :
1463
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4);
1464
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5);
1465
+ }
1466
+ }
1467
+
1468
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog(
1469
+ ZSTD_matchState_t* ms,
1470
+ const BYTE* ip, const BYTE* const iLimit,
1471
+ size_t* offsetPtr)
1472
+ {
1473
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1474
+ switch(cappedSearchLog)
1475
+ {
1476
+ default :
1477
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4);
1478
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5);
1479
+ }
1480
+ }
1481
+
1482
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog(
1483
+ ZSTD_matchState_t* ms,
1484
+ const BYTE* ip, const BYTE* const iLimit,
1485
+ size_t* offsetPtr)
1486
+ {
1487
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1488
+ switch(cappedSearchLog)
1489
+ {
1490
+ default :
1491
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4);
1492
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5);
1493
+ }
1494
+ }
1495
+
1496
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog (
1497
+ ZSTD_matchState_t* ms,
1498
+ const BYTE* ip, const BYTE* const iLimit,
1499
+ size_t* offsetPtr)
1500
+ {
1501
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1502
+ switch(cappedSearchLog)
1503
+ {
1504
+ default :
1505
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4);
1506
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5);
1507
+ }
1508
+ }
1509
+
618
1510
 
619
1511
  /* *******************************
620
1512
  * Common parser - lazy strategy
621
1513
  *********************************/
622
- FORCE_INLINE_TEMPLATE
623
- size_t ZSTD_compressBlock_lazy_generic(
1514
+ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1515
+
1516
+ FORCE_INLINE_TEMPLATE size_t
1517
+ ZSTD_compressBlock_lazy_generic(
624
1518
  ZSTD_matchState_t* ms, seqStore_t* seqStore,
625
1519
  U32 rep[ZSTD_REP_NUM],
626
1520
  const void* src, size_t srcSize,
627
- const U32 searchMethod, const U32 depth,
1521
+ const searchMethod_e searchMethod, const U32 depth,
628
1522
  ZSTD_dictMode_e const dictMode)
629
1523
  {
630
1524
  const BYTE* const istart = (const BYTE*)src;
631
1525
  const BYTE* ip = istart;
632
1526
  const BYTE* anchor = istart;
633
1527
  const BYTE* const iend = istart + srcSize;
634
- const BYTE* const ilimit = iend - 8;
1528
+ const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
635
1529
  const BYTE* const base = ms->window.base;
636
1530
  const U32 prefixLowestIndex = ms->window.dictLimit;
637
1531
  const BYTE* const prefixLowest = base + prefixLowestIndex;
1532
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
638
1533
 
639
1534
  typedef size_t (*searchMax_f)(
640
1535
  ZSTD_matchState_t* ms,
641
1536
  const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
642
- searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ?
643
- (searchMethod ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) :
644
- (searchMethod ? ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS);
1537
+
1538
+ /**
1539
+ * This table is indexed first by the four ZSTD_dictMode_e values, and then
1540
+ * by the three searchMethod_e values. NULLs are placed for configurations
1541
+ * that should never occur (extDict modes go to the other implementation
1542
+ * below and there is no DDSS for binary tree search yet).
1543
+ */
1544
+ const searchMax_f searchFuncs[4][3] = {
1545
+ {
1546
+ ZSTD_HcFindBestMatch_selectMLS,
1547
+ ZSTD_BtFindBestMatch_selectMLS,
1548
+ ZSTD_RowFindBestMatch_selectRowLog
1549
+ },
1550
+ {
1551
+ NULL,
1552
+ NULL,
1553
+ NULL
1554
+ },
1555
+ {
1556
+ ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
1557
+ ZSTD_BtFindBestMatch_dictMatchState_selectMLS,
1558
+ ZSTD_RowFindBestMatch_dictMatchState_selectRowLog
1559
+ },
1560
+ {
1561
+ ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
1562
+ NULL,
1563
+ ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog
1564
+ }
1565
+ };
1566
+
1567
+ searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod];
645
1568
  U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
646
1569
 
1570
+ const int isDMS = dictMode == ZSTD_dictMatchState;
1571
+ const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
1572
+ const int isDxS = isDMS || isDDS;
647
1573
  const ZSTD_matchState_t* const dms = ms->dictMatchState;
648
- const U32 dictLowestIndex = dictMode == ZSTD_dictMatchState ?
649
- dms->window.dictLimit : 0;
650
- const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ?
651
- dms->window.base : NULL;
652
- const BYTE* const dictLowest = dictMode == ZSTD_dictMatchState ?
653
- dictBase + dictLowestIndex : NULL;
654
- const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ?
655
- dms->window.nextSrc : NULL;
656
- const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ?
1574
+ const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0;
1575
+ const BYTE* const dictBase = isDxS ? dms->window.base : NULL;
1576
+ const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
1577
+ const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL;
1578
+ const U32 dictIndexDelta = isDxS ?
657
1579
  prefixLowestIndex - (U32)(dictEnd - dictBase) :
658
1580
  0;
659
- const U32 dictAndPrefixLength = (U32)(ip - prefixLowest + dictEnd - dictLowest);
1581
+ const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
660
1582
 
661
- /* init */
1583
+ assert(searchMax != NULL);
1584
+
1585
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
662
1586
  ip += (dictAndPrefixLength == 0);
663
1587
  if (dictMode == ZSTD_noDict) {
664
- U32 const maxRep = (U32)(ip - prefixLowest);
1588
+ U32 const curr = (U32)(ip - base);
1589
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
1590
+ U32 const maxRep = curr - windowLow;
665
1591
  if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
666
1592
  if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
667
1593
  }
668
- if (dictMode == ZSTD_dictMatchState) {
1594
+ if (isDxS) {
669
1595
  /* dictMatchState repCode checks don't currently handle repCode == 0
670
1596
  * disabling. */
671
1597
  assert(offset_1 <= dictAndPrefixLength);
672
1598
  assert(offset_2 <= dictAndPrefixLength);
673
1599
  }
674
1600
 
1601
+ if (searchMethod == search_rowHash) {
1602
+ ZSTD_row_fillHashCache(ms, base, rowLog,
1603
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1604
+ ms->nextToUpdate, ilimit);
1605
+ }
1606
+
675
1607
  /* Match Loop */
1608
+ #if defined(__GNUC__) && defined(__x86_64__)
1609
+ /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
1610
+ * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
1611
+ */
1612
+ __asm__(".p2align 5");
1613
+ #endif
676
1614
  while (ip < ilimit) {
677
1615
  size_t matchLength=0;
678
1616
  size_t offset=0;
679
1617
  const BYTE* start=ip+1;
680
1618
 
681
1619
  /* check repCode */
682
- if (dictMode == ZSTD_dictMatchState) {
1620
+ if (isDxS) {
683
1621
  const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
684
- const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
1622
+ const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
685
1623
  && repIndex < prefixLowestIndex) ?
686
1624
  dictBase + (repIndex - dictIndexDelta) :
687
1625
  base + repIndex;
@@ -722,7 +1660,7 @@ size_t ZSTD_compressBlock_lazy_generic(
722
1660
  if ((mlRep >= 4) && (gain2 > gain1))
723
1661
  matchLength = mlRep, offset = 0, start = ip;
724
1662
  }
725
- if (dictMode == ZSTD_dictMatchState) {
1663
+ if (isDxS) {
726
1664
  const U32 repIndex = (U32)(ip - base) - offset_1;
727
1665
  const BYTE* repMatch = repIndex < prefixLowestIndex ?
728
1666
  dictBase + (repIndex - dictIndexDelta) :
@@ -757,7 +1695,7 @@ size_t ZSTD_compressBlock_lazy_generic(
757
1695
  if ((mlRep >= 4) && (gain2 > gain1))
758
1696
  matchLength = mlRep, offset = 0, start = ip;
759
1697
  }
760
- if (dictMode == ZSTD_dictMatchState) {
1698
+ if (isDxS) {
761
1699
  const U32 repIndex = (U32)(ip - base) - offset_1;
762
1700
  const BYTE* repMatch = repIndex < prefixLowestIndex ?
763
1701
  dictBase + (repIndex - dictIndexDelta) :
@@ -795,7 +1733,7 @@ size_t ZSTD_compressBlock_lazy_generic(
795
1733
  && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
796
1734
  { start--; matchLength++; }
797
1735
  }
798
- if (dictMode == ZSTD_dictMatchState) {
1736
+ if (isDxS) {
799
1737
  U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
800
1738
  const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
801
1739
  const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
@@ -806,17 +1744,16 @@ size_t ZSTD_compressBlock_lazy_generic(
806
1744
  /* store sequence */
807
1745
  _storeSequence:
808
1746
  { size_t const litLength = start - anchor;
809
- ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH);
1747
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
810
1748
  anchor = ip = start + matchLength;
811
1749
  }
812
1750
 
813
1751
  /* check immediate repcode */
814
- if (dictMode == ZSTD_dictMatchState) {
1752
+ if (isDxS) {
815
1753
  while (ip <= ilimit) {
816
1754
  U32 const current2 = (U32)(ip-base);
817
1755
  U32 const repIndex = current2 - offset_2;
818
- const BYTE* repMatch = dictMode == ZSTD_dictMatchState
819
- && repIndex < prefixLowestIndex ?
1756
+ const BYTE* repMatch = repIndex < prefixLowestIndex ?
820
1757
  dictBase - dictIndexDelta + repIndex :
821
1758
  base + repIndex;
822
1759
  if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
@@ -824,7 +1761,7 @@ _storeSequence:
824
1761
  const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
825
1762
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
826
1763
  offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */
827
- ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
1764
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
828
1765
  ip += matchLength;
829
1766
  anchor = ip;
830
1767
  continue;
@@ -839,7 +1776,7 @@ _storeSequence:
839
1776
  /* store sequence */
840
1777
  matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
841
1778
  offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
842
- ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
1779
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
843
1780
  ip += matchLength;
844
1781
  anchor = ip;
845
1782
  continue; /* faster when present ... (?) */
@@ -850,7 +1787,7 @@ _storeSequence:
850
1787
  rep[1] = offset_2 ? offset_2 : savedOffset;
851
1788
 
852
1789
  /* Return the last literals size */
853
- return iend - anchor;
1790
+ return (size_t)(iend - anchor);
854
1791
  }
855
1792
 
856
1793
 
@@ -858,101 +1795,207 @@ size_t ZSTD_compressBlock_btlazy2(
858
1795
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
859
1796
  void const* src, size_t srcSize)
860
1797
  {
861
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 1, 2, ZSTD_noDict);
1798
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
862
1799
  }
863
1800
 
864
1801
  size_t ZSTD_compressBlock_lazy2(
865
1802
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
866
1803
  void const* src, size_t srcSize)
867
1804
  {
868
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 2, ZSTD_noDict);
1805
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
869
1806
  }
870
1807
 
871
1808
  size_t ZSTD_compressBlock_lazy(
872
1809
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
873
1810
  void const* src, size_t srcSize)
874
1811
  {
875
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 1, ZSTD_noDict);
1812
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
876
1813
  }
877
1814
 
878
1815
  size_t ZSTD_compressBlock_greedy(
879
1816
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
880
1817
  void const* src, size_t srcSize)
881
1818
  {
882
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 0, ZSTD_noDict);
1819
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
883
1820
  }
884
1821
 
885
1822
  size_t ZSTD_compressBlock_btlazy2_dictMatchState(
886
1823
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
887
1824
  void const* src, size_t srcSize)
888
1825
  {
889
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 1, 2, ZSTD_dictMatchState);
1826
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
890
1827
  }
891
1828
 
892
1829
  size_t ZSTD_compressBlock_lazy2_dictMatchState(
893
1830
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
894
1831
  void const* src, size_t srcSize)
895
1832
  {
896
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 2, ZSTD_dictMatchState);
1833
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
897
1834
  }
898
1835
 
899
1836
  size_t ZSTD_compressBlock_lazy_dictMatchState(
900
1837
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
901
1838
  void const* src, size_t srcSize)
902
1839
  {
903
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 1, ZSTD_dictMatchState);
1840
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
904
1841
  }
905
1842
 
906
1843
  size_t ZSTD_compressBlock_greedy_dictMatchState(
907
1844
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
908
1845
  void const* src, size_t srcSize)
909
1846
  {
910
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 0, ZSTD_dictMatchState);
1847
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
1848
+ }
1849
+
1850
+
1851
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
1852
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1853
+ void const* src, size_t srcSize)
1854
+ {
1855
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
1856
+ }
1857
+
1858
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
1859
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1860
+ void const* src, size_t srcSize)
1861
+ {
1862
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
1863
+ }
1864
+
1865
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
1866
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1867
+ void const* src, size_t srcSize)
1868
+ {
1869
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
1870
+ }
1871
+
1872
+ /* Row-based matchfinder */
1873
+ size_t ZSTD_compressBlock_lazy2_row(
1874
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1875
+ void const* src, size_t srcSize)
1876
+ {
1877
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
1878
+ }
1879
+
1880
+ size_t ZSTD_compressBlock_lazy_row(
1881
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1882
+ void const* src, size_t srcSize)
1883
+ {
1884
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
1885
+ }
1886
+
1887
+ size_t ZSTD_compressBlock_greedy_row(
1888
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1889
+ void const* src, size_t srcSize)
1890
+ {
1891
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
1892
+ }
1893
+
1894
+ size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
1895
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1896
+ void const* src, size_t srcSize)
1897
+ {
1898
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
911
1899
  }
912
1900
 
1901
+ size_t ZSTD_compressBlock_lazy_dictMatchState_row(
1902
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1903
+ void const* src, size_t srcSize)
1904
+ {
1905
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
1906
+ }
1907
+
1908
+ size_t ZSTD_compressBlock_greedy_dictMatchState_row(
1909
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1910
+ void const* src, size_t srcSize)
1911
+ {
1912
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
1913
+ }
1914
+
1915
+
1916
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
1917
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1918
+ void const* src, size_t srcSize)
1919
+ {
1920
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
1921
+ }
1922
+
1923
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
1924
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1925
+ void const* src, size_t srcSize)
1926
+ {
1927
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
1928
+ }
1929
+
1930
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
1931
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1932
+ void const* src, size_t srcSize)
1933
+ {
1934
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
1935
+ }
913
1936
 
914
1937
  FORCE_INLINE_TEMPLATE
915
1938
  size_t ZSTD_compressBlock_lazy_extDict_generic(
916
1939
  ZSTD_matchState_t* ms, seqStore_t* seqStore,
917
1940
  U32 rep[ZSTD_REP_NUM],
918
1941
  const void* src, size_t srcSize,
919
- const U32 searchMethod, const U32 depth)
1942
+ const searchMethod_e searchMethod, const U32 depth)
920
1943
  {
921
1944
  const BYTE* const istart = (const BYTE*)src;
922
1945
  const BYTE* ip = istart;
923
1946
  const BYTE* anchor = istart;
924
1947
  const BYTE* const iend = istart + srcSize;
925
- const BYTE* const ilimit = iend - 8;
1948
+ const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
926
1949
  const BYTE* const base = ms->window.base;
927
1950
  const U32 dictLimit = ms->window.dictLimit;
928
- const U32 lowestIndex = ms->window.lowLimit;
929
1951
  const BYTE* const prefixStart = base + dictLimit;
930
1952
  const BYTE* const dictBase = ms->window.dictBase;
931
1953
  const BYTE* const dictEnd = dictBase + dictLimit;
932
- const BYTE* const dictStart = dictBase + lowestIndex;
1954
+ const BYTE* const dictStart = dictBase + ms->window.lowLimit;
1955
+ const U32 windowLog = ms->cParams.windowLog;
1956
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
933
1957
 
934
1958
  typedef size_t (*searchMax_f)(
935
1959
  ZSTD_matchState_t* ms,
936
1960
  const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
937
- searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
938
-
1961
+ const searchMax_f searchFuncs[3] = {
1962
+ ZSTD_HcFindBestMatch_extDict_selectMLS,
1963
+ ZSTD_BtFindBestMatch_extDict_selectMLS,
1964
+ ZSTD_RowFindBestMatch_extDict_selectRowLog
1965
+ };
1966
+ searchMax_f searchMax = searchFuncs[(int)searchMethod];
939
1967
  U32 offset_1 = rep[0], offset_2 = rep[1];
940
1968
 
1969
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
1970
+
941
1971
  /* init */
942
1972
  ip += (ip == prefixStart);
1973
+ if (searchMethod == search_rowHash) {
1974
+ ZSTD_row_fillHashCache(ms, base, rowLog,
1975
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1976
+ ms->nextToUpdate, ilimit);
1977
+ }
943
1978
 
944
1979
  /* Match Loop */
1980
+ #if defined(__GNUC__) && defined(__x86_64__)
1981
+ /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
1982
+ * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
1983
+ */
1984
+ __asm__(".p2align 5");
1985
+ #endif
945
1986
  while (ip < ilimit) {
946
1987
  size_t matchLength=0;
947
1988
  size_t offset=0;
948
1989
  const BYTE* start=ip+1;
949
- U32 current = (U32)(ip-base);
1990
+ U32 curr = (U32)(ip-base);
950
1991
 
951
1992
  /* check repCode */
952
- { const U32 repIndex = (U32)(current+1 - offset_1);
1993
+ { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
1994
+ const U32 repIndex = (U32)(curr+1 - offset_1);
953
1995
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
954
1996
  const BYTE* const repMatch = repBase + repIndex;
955
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
1997
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
1998
+ & (offset_1 < curr+1 - windowLow) ) /* note: we are searching at curr+1 */
956
1999
  if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
957
2000
  /* repcode detected we should take it */
958
2001
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -976,13 +2019,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
976
2019
  if (depth>=1)
977
2020
  while (ip<ilimit) {
978
2021
  ip ++;
979
- current++;
2022
+ curr++;
980
2023
  /* check repCode */
981
2024
  if (offset) {
982
- const U32 repIndex = (U32)(current - offset_1);
2025
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
2026
+ const U32 repIndex = (U32)(curr - offset_1);
983
2027
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
984
2028
  const BYTE* const repMatch = repBase + repIndex;
985
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
2029
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2030
+ & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
986
2031
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
987
2032
  /* repcode detected */
988
2033
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1006,13 +2051,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1006
2051
  /* let's find an even better one */
1007
2052
  if ((depth==2) && (ip<ilimit)) {
1008
2053
  ip ++;
1009
- current++;
2054
+ curr++;
1010
2055
  /* check repCode */
1011
2056
  if (offset) {
1012
- const U32 repIndex = (U32)(current - offset_1);
2057
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
2058
+ const U32 repIndex = (U32)(curr - offset_1);
1013
2059
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1014
2060
  const BYTE* const repMatch = repBase + repIndex;
1015
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
2061
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2062
+ & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
1016
2063
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
1017
2064
  /* repcode detected */
1018
2065
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1047,22 +2094,25 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
1047
2094
  /* store sequence */
1048
2095
  _storeSequence:
1049
2096
  { size_t const litLength = start - anchor;
1050
- ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH);
2097
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
1051
2098
  anchor = ip = start + matchLength;
1052
2099
  }
1053
2100
 
1054
2101
  /* check immediate repcode */
1055
2102
  while (ip <= ilimit) {
1056
- const U32 repIndex = (U32)((ip-base) - offset_2);
2103
+ const U32 repCurrent = (U32)(ip-base);
2104
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
2105
+ const U32 repIndex = repCurrent - offset_2;
1057
2106
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1058
2107
  const BYTE* const repMatch = repBase + repIndex;
1059
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
2108
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2109
+ & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
1060
2110
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
1061
2111
  /* repcode detected we should take it */
1062
2112
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
1063
2113
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
1064
2114
  offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */
1065
- ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
2115
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
1066
2116
  ip += matchLength;
1067
2117
  anchor = ip;
1068
2118
  continue; /* faster when present ... (?) */
@@ -1075,7 +2125,7 @@ _storeSequence:
1075
2125
  rep[1] = offset_2;
1076
2126
 
1077
2127
  /* Return the last literals size */
1078
- return iend - anchor;
2128
+ return (size_t)(iend - anchor);
1079
2129
  }
1080
2130
 
1081
2131
 
@@ -1083,7 +2133,7 @@ size_t ZSTD_compressBlock_greedy_extDict(
1083
2133
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1084
2134
  void const* src, size_t srcSize)
1085
2135
  {
1086
- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 0);
2136
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
1087
2137
  }
1088
2138
 
1089
2139
  size_t ZSTD_compressBlock_lazy_extDict(
@@ -1091,7 +2141,7 @@ size_t ZSTD_compressBlock_lazy_extDict(
1091
2141
  void const* src, size_t srcSize)
1092
2142
 
1093
2143
  {
1094
- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 1);
2144
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
1095
2145
  }
1096
2146
 
1097
2147
  size_t ZSTD_compressBlock_lazy2_extDict(
@@ -1099,7 +2149,7 @@ size_t ZSTD_compressBlock_lazy2_extDict(
1099
2149
  void const* src, size_t srcSize)
1100
2150
 
1101
2151
  {
1102
- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 2);
2152
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
1103
2153
  }
1104
2154
 
1105
2155
  size_t ZSTD_compressBlock_btlazy2_extDict(
@@ -1107,5 +2157,28 @@ size_t ZSTD_compressBlock_btlazy2_extDict(
1107
2157
  void const* src, size_t srcSize)
1108
2158
 
1109
2159
  {
1110
- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 1, 2);
2160
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
2161
+ }
2162
+
2163
+ size_t ZSTD_compressBlock_greedy_extDict_row(
2164
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2165
+ void const* src, size_t srcSize)
2166
+ {
2167
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
2168
+ }
2169
+
2170
+ size_t ZSTD_compressBlock_lazy_extDict_row(
2171
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2172
+ void const* src, size_t srcSize)
2173
+
2174
+ {
2175
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
2176
+ }
2177
+
2178
+ size_t ZSTD_compressBlock_lazy2_extDict_row(
2179
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2180
+ void const* src, size_t srcSize)
2181
+
2182
+ {
2183
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
1111
2184
  }