zstd-ruby 1.4.1.0 → 1.5.0.0

Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.github/dependabot.yml +8 -0
  3. data/.github/workflows/ruby.yml +35 -0
  4. data/README.md +2 -2
  5. data/ext/zstdruby/libzstd/BUCK +5 -7
  6. data/ext/zstdruby/libzstd/Makefile +304 -113
  7. data/ext/zstdruby/libzstd/README.md +83 -20
  8. data/ext/zstdruby/libzstd/common/bitstream.h +59 -51
  9. data/ext/zstdruby/libzstd/common/compiler.h +150 -8
  10. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  11. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  12. data/ext/zstdruby/libzstd/common/debug.h +22 -49
  13. data/ext/zstdruby/libzstd/common/entropy_common.c +201 -75
  14. data/ext/zstdruby/libzstd/common/error_private.c +3 -1
  15. data/ext/zstdruby/libzstd/common/error_private.h +8 -4
  16. data/ext/zstdruby/libzstd/common/fse.h +50 -42
  17. data/ext/zstdruby/libzstd/common/fse_decompress.c +149 -55
  18. data/ext/zstdruby/libzstd/common/huf.h +43 -39
  19. data/ext/zstdruby/libzstd/common/mem.h +69 -25
  20. data/ext/zstdruby/libzstd/common/pool.c +30 -20
  21. data/ext/zstdruby/libzstd/common/pool.h +3 -3
  22. data/ext/zstdruby/libzstd/common/threading.c +51 -4
  23. data/ext/zstdruby/libzstd/common/threading.h +36 -4
  24. data/ext/zstdruby/libzstd/common/xxhash.c +40 -92
  25. data/ext/zstdruby/libzstd/common/xxhash.h +12 -32
  26. data/ext/zstdruby/libzstd/common/zstd_common.c +10 -10
  27. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  28. data/ext/zstdruby/libzstd/common/zstd_internal.h +230 -111
  29. data/ext/zstdruby/libzstd/common/zstd_trace.h +154 -0
  30. data/ext/zstdruby/libzstd/compress/fse_compress.c +47 -63
  31. data/ext/zstdruby/libzstd/compress/hist.c +41 -63
  32. data/ext/zstdruby/libzstd/compress/hist.h +13 -33
  33. data/ext/zstdruby/libzstd/compress/huf_compress.c +332 -193
  34. data/ext/zstdruby/libzstd/compress/zstd_compress.c +3614 -1696
  35. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +546 -86
  36. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +158 -0
  37. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +29 -0
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +441 -0
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +54 -0
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +572 -0
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  42. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +662 -0
  43. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +43 -41
  44. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
  45. data/ext/zstdruby/libzstd/compress/zstd_fast.c +85 -80
  46. data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
  47. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1184 -111
  48. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +59 -1
  49. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +333 -208
  50. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
  51. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +103 -0
  52. data/ext/zstdruby/libzstd/compress/zstd_opt.c +228 -129
  53. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  54. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +151 -440
  55. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +32 -114
  56. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +395 -276
  57. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +20 -16
  58. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
  59. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +630 -231
  60. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +606 -380
  61. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +8 -5
  62. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +39 -9
  63. data/ext/zstdruby/libzstd/deprecated/zbuff.h +9 -8
  64. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
  65. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +1 -1
  66. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  67. data/ext/zstdruby/libzstd/dictBuilder/cover.c +55 -46
  68. data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
  69. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  70. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +43 -31
  71. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +53 -30
  72. data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
  73. data/ext/zstdruby/libzstd/dll/example/README.md +16 -22
  74. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +4 -4
  75. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +24 -14
  76. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +1 -1
  77. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +17 -8
  78. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +1 -1
  79. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +17 -8
  80. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +1 -1
  81. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +25 -11
  82. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +1 -1
  83. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +43 -32
  84. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +2 -2
  85. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +27 -19
  86. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +1 -1
  87. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +32 -20
  88. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +1 -1
  89. data/ext/zstdruby/libzstd/libzstd.pc.in +2 -1
  90. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +201 -31
  91. data/ext/zstdruby/libzstd/zstd.h +740 -153
  92. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +3 -1
  93. data/lib/zstd-ruby/version.rb +1 -1
  94. data/zstd-ruby.gemspec +1 -1
  95. metadata +21 -10
  96. data/.travis.yml +0 -14
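The bundled libzstd sources are upgraded with this release (the gem version tracks upstream zstd 1.5.0): new files such as zstd_cwksp.h and the zstd_compress_literals/sequences/superblock split appear, and zdict.h and zstd_errors.h move to the library root. The gem's public behavior is unchanged. For reference, a minimal round-trip through the one-shot libzstd API that the extension binds — standard zstd.h calls only; the buffer sizes here are illustrative:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zstd.h>

int main(void) {
    const char* src = "zstd-ruby bundles libzstd; same data in, same data out.";
    size_t const srcSize = strlen(src) + 1;
    size_t const bound = ZSTD_compressBound(srcSize);  /* worst-case compressed size */
    void* const dst = malloc(bound);
    if (!dst) return 1;
    size_t const cSize = ZSTD_compress(dst, bound, src, srcSize, 3 /* level */);
    if (ZSTD_isError(cSize)) { fprintf(stderr, "%s\n", ZSTD_getErrorName(cSize)); return 1; }
    /* decompress and verify the round trip */
    char roundtrip[128];
    size_t const dSize = ZSTD_decompress(roundtrip, sizeof(roundtrip), dst, cSize);
    if (ZSTD_isError(dSize) || dSize != srcSize || memcmp(roundtrip, src, srcSize)) return 1;
    printf("%zu -> %zu bytes\n", srcSize, cSize);
    free(dst);
    return 0;
}
```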
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -15,7 +15,7 @@
  extern "C" {
  #endif

- #include "mem.h" /* U32 */
+ #include "../common/mem.h" /* U32 */
  #include "zstd_compress_internal.h"

  void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -58,11 +58,11 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,

  /** ZSTD_insertDUBT1() :
  * sort one already inserted but unsorted position
- * assumption : current >= btlow == (current - btmask)
+ * assumption : curr >= btlow == (curr - btmask)
  * doesn't fail */
  static void
  ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
- U32 current, const BYTE* inputEnd,
+ U32 curr, const BYTE* inputEnd,
  U32 nbCompares, U32 btLow,
  const ZSTD_dictMode_e dictMode)
  {
@@ -74,41 +74,41 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
  const BYTE* const base = ms->window.base;
  const BYTE* const dictBase = ms->window.dictBase;
  const U32 dictLimit = ms->window.dictLimit;
- const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current;
- const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit;
+ const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
+ const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
  const BYTE* const dictEnd = dictBase + dictLimit;
  const BYTE* const prefixStart = base + dictLimit;
  const BYTE* match;
- U32* smallerPtr = bt + 2*(current&btMask);
+ U32* smallerPtr = bt + 2*(curr&btMask);
  U32* largerPtr = smallerPtr + 1;
  U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
  U32 dummy32; /* to be nullified at the end */
  U32 const windowValid = ms->window.lowLimit;
  U32 const maxDistance = 1U << cParams->windowLog;
- U32 const windowLow = (current - windowValid > maxDistance) ? current - maxDistance : windowValid;
+ U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;


  DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
- current, dictLimit, windowLow);
- assert(current >= btLow);
+ curr, dictLimit, windowLow);
+ assert(curr >= btLow);
  assert(ip < iend); /* condition for ZSTD_count */

  while (nbCompares-- && (matchIndex > windowLow)) {
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
- assert(matchIndex < current);
+ assert(matchIndex < curr);
  /* note : all candidates are now supposed sorted,
  * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
  * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */

  if ( (dictMode != ZSTD_extDict)
  || (matchIndex+matchLength >= dictLimit) /* both in current segment*/
- || (current < dictLimit) /* both in extDict */) {
+ || (curr < dictLimit) /* both in extDict */) {
  const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
  || (matchIndex+matchLength >= dictLimit)) ?
  base : dictBase;
  assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */
- || (current < dictLimit) );
+ || (curr < dictLimit) );
  match = mBase + matchIndex;
  matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
  } else {
@@ -119,7 +119,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
  }

  DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
- current, matchIndex, (U32)matchLength);
+ curr, matchIndex, (U32)matchLength);

  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
  break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
@@ -168,7 +168,7 @@ ZSTD_DUBT_findBetterDictMatch (

  const BYTE* const base = ms->window.base;
  const BYTE* const prefixStart = base + ms->window.dictLimit;
- U32 const current = (U32)(ip-base);
+ U32 const curr = (U32)(ip-base);
  const BYTE* const dictBase = dms->window.base;
  const BYTE* const dictEnd = dms->window.nextSrc;
  U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
@@ -195,10 +195,10 @@ ZSTD_DUBT_findBetterDictMatch (

  if (matchLength > bestLength) {
  U32 matchIndex = dictMatchIndex + dictIndexDelta;
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
  DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
- current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex);
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
  }
  if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
  break; /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,9 +218,9 @@ ZSTD_DUBT_findBetterDictMatch (
  }

  if (bestLength >= MINMATCH) {
- U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
  DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
- current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+ curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
  }
  return bestLength;

@@ -241,15 +241,13 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  U32 matchIndex = hashTable[h];

  const BYTE* const base = ms->window.base;
- U32 const current = (U32)(ip-base);
- U32 const maxDistance = 1U << cParams->windowLog;
- U32 const windowValid = ms->window.lowLimit;
- U32 const windowLow = (current - windowValid > maxDistance) ? current - maxDistance : windowValid;
+ U32 const curr = (U32)(ip-base);
+ U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);

  U32* const bt = ms->chainTable;
  U32 const btLog = cParams->chainLog - 1;
  U32 const btMask = (1 << btLog) - 1;
- U32 const btLow = (btMask >= current) ? 0 : current - btMask;
+ U32 const btLow = (btMask >= curr) ? 0 : curr - btMask;
  U32 const unsortLimit = MAX(btLow, windowLow);

  U32* nextCandidate = bt + 2*(matchIndex&btMask);
@@ -258,8 +256,9 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  U32 nbCandidates = nbCompares;
  U32 previousCandidate = 0;

- DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current);
+ DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
  assert(ip <= iend-8); /* required for h calculation */
+ assert(dictMode != ZSTD_dedicatedDictSearch);

  /* reach end of unsorted candidates list */
  while ( (matchIndex > unsortLimit)
@@ -301,14 +300,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  const U32 dictLimit = ms->window.dictLimit;
  const BYTE* const dictEnd = dictBase + dictLimit;
  const BYTE* const prefixStart = base + dictLimit;
- U32* smallerPtr = bt + 2*(current&btMask);
- U32* largerPtr = bt + 2*(current&btMask) + 1;
- U32 matchEndIdx = current + 8 + 1;
+ U32* smallerPtr = bt + 2*(curr&btMask);
+ U32* largerPtr = bt + 2*(curr&btMask) + 1;
+ U32 matchEndIdx = curr + 8 + 1;
  U32 dummy32; /* to be nullified at the end */
  size_t bestLength = 0;

  matchIndex = hashTable[h];
- hashTable[h] = current; /* Update Hash Table */
+ hashTable[h] = curr; /* Update Hash Table */

  while (nbCompares-- && (matchIndex > windowLow)) {
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
@@ -328,8 +327,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  if (matchLength > bestLength) {
  if (matchLength > matchEndIdx - matchIndex)
  matchEndIdx = matchIndex + (U32)matchLength;
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
  if (dictMode == ZSTD_dictMatchState) {
  nbCompares = 0; /* in addition to avoiding checking any
@@ -365,12 +364,12 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  mls, dictMode);
  }

- assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */
+ assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
  ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
  if (bestLength >= MINMATCH) {
- U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
  DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
- current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+ curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
  }
  return bestLength;
  }
@@ -439,6 +438,220 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
  }
  }

+ /***********************************
+ * Dedicated dict search
+ ***********************************/
+
+ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
+ {
+ const BYTE* const base = ms->window.base;
+ U32 const target = (U32)(ip - base);
+ U32* const hashTable = ms->hashTable;
+ U32* const chainTable = ms->chainTable;
+ U32 const chainSize = 1 << ms->cParams.chainLog;
+ U32 idx = ms->nextToUpdate;
+ U32 const minChain = chainSize < target ? target - chainSize : idx;
+ U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
+ U32 const cacheSize = bucketSize - 1;
+ U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
+ U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
+
+ /* We know the hashtable is oversized by a factor of `bucketSize`.
+ * We are going to temporarily pretend `bucketSize == 1`, keeping only a
+ * single entry. We will use the rest of the space to construct a temporary
+ * chaintable.
+ */
+ U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
+ U32* const tmpHashTable = hashTable;
+ U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
+ U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
+ U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
+ U32 hashIdx;
+
+ assert(ms->cParams.chainLog <= 24);
+ assert(ms->cParams.hashLog > ms->cParams.chainLog);
+ assert(idx != 0);
+ assert(tmpMinChain <= minChain);
+
+ /* fill conventional hash table and conventional chain table */
+ for ( ; idx < target; idx++) {
+ U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
+ if (idx >= tmpMinChain) {
+ tmpChainTable[idx - tmpMinChain] = hashTable[h];
+ }
+ tmpHashTable[h] = idx;
+ }
+
+ /* sort chains into ddss chain table */
+ {
+ U32 chainPos = 0;
+ for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
+ U32 count;
+ U32 countBeyondMinChain = 0;
+ U32 i = tmpHashTable[hashIdx];
+ for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
+ /* skip through the chain to the first position that won't be
+ * in the hash cache bucket */
+ if (i < minChain) {
+ countBeyondMinChain++;
+ }
+ i = tmpChainTable[i - tmpMinChain];
+ }
+ if (count == cacheSize) {
+ for (count = 0; count < chainLimit;) {
+ if (i < minChain) {
+ if (!i || ++countBeyondMinChain > cacheSize) {
+ /* only allow pulling `cacheSize` number of entries
+ * into the cache or chainTable beyond `minChain`,
+ * to replace the entries pulled out of the
+ * chainTable into the cache. This lets us reach
+ * back further without increasing the total number
+ * of entries in the chainTable, guaranteeing the
+ * DDSS chain table will fit into the space
+ * allocated for the regular one. */
+ break;
+ }
+ }
+ chainTable[chainPos++] = i;
+ count++;
+ if (i < tmpMinChain) {
+ break;
+ }
+ i = tmpChainTable[i - tmpMinChain];
+ }
+ } else {
+ count = 0;
+ }
+ if (count) {
+ tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
+ } else {
+ tmpHashTable[hashIdx] = 0;
+ }
+ }
+ assert(chainPos <= chainSize); /* I believe this is guaranteed... */
+ }
+
+ /* move chain pointers into the last entry of each hash bucket */
+ for (hashIdx = (1 << hashLog); hashIdx; ) {
+ U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
+ U32 const chainPackedPointer = tmpHashTable[hashIdx];
+ U32 i;
+ for (i = 0; i < cacheSize; i++) {
+ hashTable[bucketIdx + i] = 0;
+ }
+ hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
+ }
+
+ /* fill the buckets of the hash table */
+ for (idx = ms->nextToUpdate; idx < target; idx++) {
+ U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
+ << ZSTD_LAZY_DDSS_BUCKET_LOG;
+ U32 i;
+ /* Shift hash cache down 1. */
+ for (i = cacheSize - 1; i; i--)
+ hashTable[h + i] = hashTable[h + i - 1];
+ hashTable[h] = idx;
+ }
+
+ ms->nextToUpdate = target;
+ }
+
+ /* Returns the longest match length found in the dedicated dict search structure.
+ * If none are longer than the argument ml, then ml will be returned.
+ */
+ FORCE_INLINE_TEMPLATE
+ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
+ const ZSTD_matchState_t* const dms,
+ const BYTE* const ip, const BYTE* const iLimit,
+ const BYTE* const prefixStart, const U32 curr,
+ const U32 dictLimit, const size_t ddsIdx) {
+ const U32 ddsLowestIndex = dms->window.dictLimit;
+ const BYTE* const ddsBase = dms->window.base;
+ const BYTE* const ddsEnd = dms->window.nextSrc;
+ const U32 ddsSize = (U32)(ddsEnd - ddsBase);
+ const U32 ddsIndexDelta = dictLimit - ddsSize;
+ const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
+ const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
+ U32 ddsAttempt;
+ U32 matchIndex;
+
+ for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
+ PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
+ }
+
+ {
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+ U32 const chainIndex = chainPackedPointer >> 8;
+
+ PREFETCH_L1(&dms->chainTable[chainIndex]);
+ }
+
+ for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
+ size_t currentMl=0;
+ const BYTE* match;
+ matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
+ match = ddsBase + matchIndex;
+
+ if (!matchIndex) {
+ return ml;
+ }
+
+ /* guaranteed by table construction */
+ (void)ddsLowestIndex;
+ assert(matchIndex >= ddsLowestIndex);
+ assert(match+4 <= ddsEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) {
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+ }
+
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) {
+ /* best possible, avoids read overflow on next attempt */
+ return ml;
+ }
+ }
+ }
+
+ {
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+ U32 chainIndex = chainPackedPointer >> 8;
+ U32 const chainLength = chainPackedPointer & 0xFF;
+ U32 const chainAttempts = nbAttempts - ddsAttempt;
+ U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
+ U32 chainAttempt;
+
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
+ PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
+ }
+
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
+ size_t currentMl=0;
+ const BYTE* match;
+ matchIndex = dms->chainTable[chainIndex];
+ match = ddsBase + matchIndex;
+
+ /* guaranteed by table construction */
+ assert(matchIndex >= ddsLowestIndex);
+ assert(match+4 <= ddsEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) {
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+ }
+
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+ }
+ }
+ return ml;
+ }

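A note on the bucket encoding above: ZSTD_dedicatedDictSearch_lazy_loadDictionary() stores, in the last slot of each hash bucket, a chain descriptor packed as ((chainPos - count) << 8) + count, which ZSTD_dedicatedDictSearch_lazy_search() unpacks with >> 8 and & 0xFF. This is also why the loader asserts chainLog <= 24 (the start index must fit in 24 bits) and caps chainLimit at 255 (the length lives in the low byte). A standalone sketch of that packing — ddss_pack is a hypothetical helper, not part of the diff:

```c
#include <assert.h>
typedef unsigned U32;

/* Pack a chain start index (24 bits) and chain length (8 bits) into one U32,
 * mirroring ((chainPos - count) << 8) + count in the loader above. */
static U32 ddss_pack(U32 chainStart, U32 chainLength) {
    assert(chainLength <= 255);       /* length lives in the low byte */
    assert(chainStart < (1u << 24));  /* start index lives in the top 24 bits */
    return (chainStart << 8) + chainLength;
}

int main(void) {
    U32 const packed = ddss_pack(12345, 42);
    U32 const chainIndex  = packed >> 8;    /* == 12345, as in the search code */
    U32 const chainLength = packed & 0xFF;  /* == 42 */
    return (chainIndex == 12345 && chainLength == 42) ? 0 : 1;
}
```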
  /* *********************************
@@ -448,7 +661,7 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (

  /* Update chains up to ip (excluded)
  Assumption : always within prefix (i.e. not within extDict) */
- static U32 ZSTD_insertAndFindFirstIndex_internal(
+ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
  ZSTD_matchState_t* ms,
  const ZSTD_compressionParameters* const cParams,
  const BYTE* ip, U32 const mls)
@@ -477,7 +690,6 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
  return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
  }

-
  /* inlining is important to hardwire a hot branch (template emulation) */
  FORCE_INLINE_TEMPLATE
  size_t ZSTD_HcFindBestMatch_generic (
@@ -495,18 +707,33 @@ size_t ZSTD_HcFindBestMatch_generic (
  const U32 dictLimit = ms->window.dictLimit;
  const BYTE* const prefixStart = base + dictLimit;
  const BYTE* const dictEnd = dictBase + dictLimit;
- const U32 current = (U32)(ip-base);
+ const U32 curr = (U32)(ip-base);
  const U32 maxDistance = 1U << cParams->windowLog;
- const U32 lowValid = ms->window.lowLimit;
- const U32 lowLimit = (current - lowValid > maxDistance) ? current - maxDistance : lowValid;
- const U32 minChain = current > chainSize ? current - chainSize : 0;
+ const U32 lowestValid = ms->window.lowLimit;
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
+ const U32 minChain = curr > chainSize ? curr - chainSize : 0;
  U32 nbAttempts = 1U << cParams->searchLog;
  size_t ml=4-1;

+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
+ ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
+ const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
+ ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
+
+ U32 matchIndex;
+
+ if (dictMode == ZSTD_dedicatedDictSearch) {
+ const U32* entry = &dms->hashTable[ddsIdx];
+ PREFETCH_L1(entry);
+ }
+
  /* HC4 match finder */
- U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);

- for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
+ for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
  size_t currentMl=0;
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
  const BYTE* const match = base + matchIndex;
@@ -523,7 +750,7 @@ size_t ZSTD_HcFindBestMatch_generic (
  /* save best solution */
  if (currentMl > ml) {
  ml = currentMl;
- *offsetPtr = current - matchIndex + ZSTD_REP_MOVE;
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  }

@@ -531,8 +758,10 @@ size_t ZSTD_HcFindBestMatch_generic (
  matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
  }

- if (dictMode == ZSTD_dictMatchState) {
- const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ if (dictMode == ZSTD_dedicatedDictSearch) {
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
+ } else if (dictMode == ZSTD_dictMatchState) {
  const U32* const dmsChainTable = dms->chainTable;
  const U32 dmsChainSize = (1 << dms->cParams.chainLog);
  const U32 dmsChainMask = dmsChainSize - 1;
@@ -545,7 +774,7 @@ size_t ZSTD_HcFindBestMatch_generic (

  matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];

- for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
+ for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
  size_t currentMl=0;
  const BYTE* const match = dmsBase + matchIndex;
  assert(match+4 <= dmsEnd);
@@ -555,11 +784,12 @@ size_t ZSTD_HcFindBestMatch_generic (
  /* save best solution */
  if (currentMl > ml) {
  ml = currentMl;
- *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  }

  if (matchIndex <= dmsMinChain) break;
+
  matchIndex = dmsChainTable[matchIndex & dmsChainMask];
  }
  }
@@ -600,6 +830,22 @@ static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
  }


+ static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+ {
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch);
+ case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch);
+ case 7 :
+ case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch);
+ }
+ }
+
+
  FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
  ZSTD_matchState_t* ms,
  const BYTE* ip, const BYTE* const iLimit,
@@ -615,73 +861,765 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
  }
  }

+ /* *********************************
+ * (SIMD) Row-based matchfinder
+ ***********************************/
+ /* Constants for row-based hash */
+ #define ZSTD_ROW_HASH_TAG_OFFSET 1 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
+ #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
+ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
+
+ #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
+
+ typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */
+
+ #if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */
+
+ #include <emmintrin.h>
+ typedef __m128i ZSTD_Vec128;
+
+ /* Returns a 128-bit container with 128-bits from src */
+ static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
+ return _mm_loadu_si128((ZSTD_Vec128 const*)src);
+ }
+
+ /* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */
+ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
+ return _mm_set1_epi8((char)val);
+ }
+
+ /* Do byte-by-byte comparison result of x and y. Then collapse 128-bit resultant mask
+ * into a 32-bit mask that is the MSB of each byte.
+ * */
+ static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
+ return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
+ }
+
+ typedef struct {
+ __m128i fst;
+ __m128i snd;
+ } ZSTD_Vec256;
+
+ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
+ ZSTD_Vec256 v;
+ v.fst = ZSTD_Vec128_read(ptr);
+ v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
+ return v;
+ }
+
+ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
+ ZSTD_Vec256 v;
+ v.fst = ZSTD_Vec128_set8(val);
+ v.snd = ZSTD_Vec128_set8(val);
+ return v;
+ }
+
+ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
+ ZSTD_VecMask fstMask;
+ ZSTD_VecMask sndMask;
+ fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
+ sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
+ return fstMask | (sndMask << 16);
+ }
+
+ #elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */
+
+ #include <arm_neon.h>
+ typedef uint8x16_t ZSTD_Vec128;
+
+ static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
+ return vld1q_u8((const BYTE* const)src);
+ }
+
+ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
+ return vdupq_n_u8(val);
+ }
+
+ /* Mimics '_mm_movemask_epi8()' from SSE */
+ static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) {
+ /* Shift out everything but the MSB bits in each byte */
+ uint16x8_t highBits = vreinterpretq_u16_u8(vshrq_n_u8(val, 7));
+ /* Merge the even lanes together with vsra (right shift and add) */
+ uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(highBits, highBits, 7));
+ uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+ uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+ /* Extract the low 8 bits from each lane, merge */
+ return vgetq_lane_u8(paired64, 0) | ((U32)vgetq_lane_u8(paired64, 8) << 8);
+ }
+
+ static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
+ return (ZSTD_VecMask)ZSTD_vmovmaskq_u8(vceqq_u8(x, y));
+ }
+
+ typedef struct {
+ uint8x16_t fst;
+ uint8x16_t snd;
+ } ZSTD_Vec256;
+
+ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
+ ZSTD_Vec256 v;
+ v.fst = ZSTD_Vec128_read(ptr);
+ v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
+ return v;
+ }
+
+ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
+ ZSTD_Vec256 v;
+ v.fst = ZSTD_Vec128_set8(val);
+ v.snd = ZSTD_Vec128_set8(val);
+ return v;
+ }
+
+ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
+ ZSTD_VecMask fstMask;
+ ZSTD_VecMask sndMask;
+ fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
+ sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
+ return fstMask | (sndMask << 16);
+ }
+
+ #else /* Scalar fallback version */
+
+ #define VEC128_NB_SIZE_T (16 / sizeof(size_t))
+ typedef struct {
+ size_t vec[VEC128_NB_SIZE_T];
+ } ZSTD_Vec128;
+
+ static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
+ ZSTD_Vec128 ret;
+ ZSTD_memcpy(ret.vec, src, VEC128_NB_SIZE_T*sizeof(size_t));
+ return ret;
+ }
+
+ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
+ ZSTD_Vec128 ret = { {0} };
+ int startBit = sizeof(size_t) * 8 - 8;
+ for (;startBit >= 0; startBit -= 8) {
+ unsigned j = 0;
+ for (;j < VEC128_NB_SIZE_T; ++j) {
+ ret.vec[j] |= ((size_t)val << startBit);
+ }
+ }
+ return ret;
+ }
+
+ /* Compare x to y, byte by byte, generating a "matches" bitfield */
+ static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
+ ZSTD_VecMask res = 0;
+ unsigned i = 0;
+ unsigned l = 0;
+ for (; i < VEC128_NB_SIZE_T; ++i) {
+ const size_t cmp1 = x.vec[i];
+ const size_t cmp2 = y.vec[i];
+ unsigned j = 0;
+ for (; j < sizeof(size_t); ++j, ++l) {
+ if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
+ res |= ((U32)1 << (j+i*sizeof(size_t)));
+ }
+ }
+ }
+ return res;
+ }
+
+ #define VEC256_NB_SIZE_T 2*VEC128_NB_SIZE_T
+ typedef struct {
+ size_t vec[VEC256_NB_SIZE_T];
+ } ZSTD_Vec256;
+
+ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) {
+ ZSTD_Vec256 ret;
+ ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t));
+ return ret;
+ }
+
+ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
+ ZSTD_Vec256 ret = { {0} };
+ int startBit = sizeof(size_t) * 8 - 8;
+ for (;startBit >= 0; startBit -= 8) {
+ unsigned j = 0;
+ for (;j < VEC256_NB_SIZE_T; ++j) {
+ ret.vec[j] |= ((size_t)val << startBit);
+ }
+ }
+ return ret;
+ }
+
+ /* Compare x to y, byte by byte, generating a "matches" bitfield */
+ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
+ ZSTD_VecMask res = 0;
+ unsigned i = 0;
+ unsigned l = 0;
+ for (; i < VEC256_NB_SIZE_T; ++i) {
+ const size_t cmp1 = x.vec[i];
+ const size_t cmp2 = y.vec[i];
+ unsigned j = 0;
+ for (; j < sizeof(size_t); ++j, ++l) {
+ if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
+ res |= ((U32)1 << (j+i*sizeof(size_t)));
+ }
+ }
+ }
+ return res;
+ }
+
+ #endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */
+
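Whichever backend is compiled in, all three variants honor the same contract: bit n of the returned ZSTD_VecMask is set when byte n of the row equals the probe tag. A scalar sketch of how such a mask is produced and then consumed bit by bit, the way the search loop below does it — plain-C stand-ins, not part of the diff:

```c
#include <stdio.h>
typedef unsigned U32;
typedef unsigned char BYTE;

static U32 ctz32(U32 v) {              /* stand-in for ZSTD_VecMask_next() */
    U32 n = 0;
    while (!(v & 1)) { v >>= 1; n++; }
    return n;
}

int main(void) {
    BYTE const tags[16] = { 7, 3, 9, 3, 0, 3, 1, 2, 8, 6, 3, 5, 4, 0, 0, 3 };
    BYTE const probe = 3;
    U32 matches = 0;
    U32 i;
    for (i = 0; i < 16; ++i)           /* what cmpMask8 does in one SIMD op */
        if (tags[i] == probe) matches |= (1u << i);
    /* walk set bits from the LSB up, clearing each as we go */
    for (; matches > 0; matches &= matches - 1)
        printf("candidate at row position %u\n", ctz32(matches));
    return 0;                          /* prints positions 1, 3, 5, 10, 15 */
}
```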
+ /* ZSTD_VecMask_next():
+ * Starting from the LSB, returns the idx of the next non-zero bit.
+ * Basically counting the nb of trailing zeroes.
+ */
+ static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
+ # if defined(_MSC_VER) /* Visual */
+ unsigned long r=0;
+ return _BitScanForward(&r, val) ? (U32)r : 0;
+ # elif defined(__GNUC__) && (__GNUC__ >= 3)
+ return (U32)__builtin_ctz(val);
+ # else
+ /* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */
+ static const U32 multiplyDeBruijnBitPosition[32] =
+ {
+ 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+ 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+ };
+ return multiplyDeBruijnBitPosition[((U32)((val & -(int)val) * 0x077CB531U)) >> 27];
+ # endif
+ }
+
+ /* ZSTD_VecMask_rotateRight():
+ * Rotates a bitfield to the right by "rotation" bits.
+ * If the rotation is greater than totalBits, the returned mask is 0.
+ */
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) {
+ if (rotation == 0)
+ return mask;
+ switch (totalBits) {
+ default:
+ assert(0);
+ case 16:
+ return (mask >> rotation) | (U16)(mask << (16 - rotation));
+ case 32:
+ return (mask >> rotation) | (U32)(mask << (32 - rotation));
+ }
+ }
+
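The software fallback in ZSTD_VecMask_next() is the classic De Bruijn trailing-zero count from the linked bithacks page: val & -val isolates the lowest set bit, and multiplying by the De Bruijn constant 0x077CB531 places a unique 5-bit pattern in the top bits, which indexes the lookup table. A self-contained check of that trick (illustrative only, not part of the diff):

```c
#include <assert.h>
typedef unsigned U32;

static U32 debruijn_ctz(U32 val) {
    /* same table and constant as the fallback above */
    static const U32 tbl[32] = {
        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
    };
    return tbl[((U32)((val & -(int)val) * 0x077CB531U)) >> 27];
}

int main(void) {
    U32 bit;
    for (bit = 0; bit < 32; ++bit)
        assert(debruijn_ctz(1u << bit) == bit);  /* exact for every single bit */
    assert(debruijn_ctz(0x50u) == 4);            /* the lowest set bit wins */
    return 0;
}
```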
+ /* ZSTD_row_nextIndex():
+ * Returns the next index to insert at within a tagTable row, and updates the "head"
+ * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
+ */
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
+ U32 const next = (*tagRow - 1) & rowMask;
+ *tagRow = (BYTE)next;
+ return next;
+ }
+
+ /* ZSTD_isAligned():
+ * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
+ */
+ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
+ assert((align & (align - 1)) == 0);
+ return (((size_t)ptr) & (align - 1)) == 0;
+ }
+
+ /* ZSTD_row_prefetch():
+ * Performs prefetching for the hashTable and tagTable at a given row.
+ */
+ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
+ PREFETCH_L1(hashTable + relRow);
+ if (rowLog == 5) {
+ PREFETCH_L1(hashTable + relRow + 16);
+ }
+ PREFETCH_L1(tagTable + relRow);
+ assert(rowLog == 4 || rowLog == 5);
+ assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
+ assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */
+ }
+
+ /* ZSTD_row_fillHashCache():
+ * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
+ * but not beyond iLimit.
+ */
+ static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
+ U32 const rowLog, U32 const mls,
+ U32 idx, const BYTE* const iLimit)
+ {
+ U32 const* const hashTable = ms->hashTable;
+ U16 const* const tagTable = ms->tagTable;
+ U32 const hashLog = ms->rowHashLog;
+ U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
+ U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
+
+ for (; idx < lim; ++idx) {
+ U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
+ ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
+ }
+
+ DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
+ ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
+ ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
+ }
+
+ /* ZSTD_row_nextCachedHash():
+ * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
+ * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
+ */
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
+ U16 const* tagTable, BYTE const* base,
+ U32 idx, U32 const hashLog,
+ U32 const rowLog, U32 const mls)
+ {
+ U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
+ { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
+ cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
+ return hash;
+ }
+ }
+
+ /* ZSTD_row_update_internal():
+ * Inserts the byte at ip into the appropriate position in the hash table.
+ * Determines the relative row, and the position within the {16, 32} entry row to insert at.
+ */
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
+ U32 const mls, U32 const rowLog,
+ U32 const rowMask, U32 const useCache)
+ {
+ U32* const hashTable = ms->hashTable;
+ U16* const tagTable = ms->tagTable;
+ U32 const hashLog = ms->rowHashLog;
+ const BYTE* const base = ms->window.base;
+ const U32 target = (U32)(ip - base);
+ U32 idx = ms->nextToUpdate;
+
+ DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target);
+ for (; idx < target; ++idx) {
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls)
+ : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ U32* const row = hashTable + relRow;
+ BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
+ Explicit cast allows us to get exact desired position within each row */
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+
+ assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
+ ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
+ row[pos] = idx;
+ }
+ ms->nextToUpdate = target;
+ }
+
+ /* ZSTD_row_update():
+ * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
+ * processing.
+ */
+ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
+ const U32 rowMask = (1u << rowLog) - 1;
+ const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
+
+ DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
+ }
+
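Because ZSTD_row_nextIndex() decrements the head byte, each row behaves as a circular buffer filled from the head downward; this is the same head value that ZSTD_row_getMatchMask() later uses as the rotation amount. A tiny demo of the insertion order, mirroring the decrement-and-mask logic above (illustrative, not part of the diff):

```c
#include <stdio.h>
typedef unsigned U32;
typedef unsigned char BYTE;

static U32 row_nextIndex(BYTE* head, U32 rowMask) {  /* mirrors ZSTD_row_nextIndex() */
    U32 const next = (U32)(*head - 1) & rowMask;
    *head = (BYTE)next;
    return next;
}

int main(void) {
    BYTE head = 0;
    U32 const rowMask = 15;            /* a 16-entry row */
    U32 i;
    for (i = 0; i < 4; ++i)
        printf("insert #%u -> slot %u\n", i, row_nextIndex(&head, rowMask));
    /* prints slots 15, 14, 13, 12: the newest entry always sits at the head */
    return 0;
}
```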
+ /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
+ * the hash at the nth position in a row of the tagTable.
+ */
+ FORCE_INLINE_TEMPLATE
+ ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) {
+ ZSTD_VecMask matches = 0;
+ if (rowEntries == 16) {
+ ZSTD_Vec128 hashes = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
+ ZSTD_Vec128 expandedTags = ZSTD_Vec128_set8(tag);
+ matches = ZSTD_Vec128_cmpMask8(hashes, expandedTags);
+ } else if (rowEntries == 32) {
+ ZSTD_Vec256 hashes = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
+ ZSTD_Vec256 expandedTags = ZSTD_Vec256_set8(tag);
+ matches = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
+ } else {
+ assert(0);
+ }
+ /* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
+ to match up with the actual layout of the entries within the hashTable */
+ return ZSTD_VecMask_rotateRight(matches, head, rowEntries);
+ }
+
+ /* The high-level approach of the SIMD row based match finder is as follows:
+ * - Figure out where to insert the new entry:
+ * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
+ * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
+ * which row to insert into.
+ * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
+ * be considered as a circular buffer with a "head" index that resides in the tagTable.
+ * - Also insert the "tag" into the equivalent row and position in the tagTable.
+ * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
+ * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
+ * for alignment/performance reasons, leaving some bytes unused.
+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
+ * generate a bitfield that we can cycle through to check the collisions in the hash table.
+ * - Pick the longest match.
+ */
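To make the comment concrete: the row and the 1-byte tag for a position are both carved out of a single hash value, exactly as in the expressions inside ZSTD_RowFindBestMatch_generic() below. A minimal sketch — the hash constant here is made up for the demo:

```c
#include <stdio.h>
typedef unsigned U32;

#define ROW_HASH_TAG_BITS 8                         /* == ZSTD_ROW_HASH_TAG_BITS */
#define ROW_HASH_TAG_MASK ((1u << ROW_HASH_TAG_BITS) - 1)

int main(void) {
    U32 const rowLog = 4;                           /* 16 entries per row */
    U32 const hash = 0xA1B2C3D4;                    /* pretend ZSTD_hashPtr() output */
    U32 const relRow = (hash >> ROW_HASH_TAG_BITS) << rowLog; /* U32 offset of the row */
    U32 const tag = hash & ROW_HASH_TAG_MASK;       /* low byte, stored in the tagTable */
    printf("relRow=%u tag=0x%02X\n", relRow, tag);
    return 0;
}
```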
1264
+ FORCE_INLINE_TEMPLATE
1265
+ size_t ZSTD_RowFindBestMatch_generic (
1266
+ ZSTD_matchState_t* ms,
1267
+ const BYTE* const ip, const BYTE* const iLimit,
1268
+ size_t* offsetPtr,
1269
+ const U32 mls, const ZSTD_dictMode_e dictMode,
1270
+ const U32 rowLog)
1271
+ {
1272
+ U32* const hashTable = ms->hashTable;
1273
+ U16* const tagTable = ms->tagTable;
1274
+ U32* const hashCache = ms->hashCache;
1275
+ const U32 hashLog = ms->rowHashLog;
1276
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
1277
+ const BYTE* const base = ms->window.base;
1278
+ const BYTE* const dictBase = ms->window.dictBase;
1279
+ const U32 dictLimit = ms->window.dictLimit;
1280
+ const BYTE* const prefixStart = base + dictLimit;
1281
+ const BYTE* const dictEnd = dictBase + dictLimit;
1282
+ const U32 curr = (U32)(ip-base);
1283
+ const U32 maxDistance = 1U << cParams->windowLog;
1284
+ const U32 lowestValid = ms->window.lowLimit;
1285
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
1286
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
1287
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
1288
+ const U32 rowEntries = (1U << rowLog);
1289
+ const U32 rowMask = rowEntries - 1;
1290
+ const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
1291
+ U32 nbAttempts = 1U << cappedSearchLog;
1292
+ size_t ml=4-1;
1293
+
1294
+ /* DMS/DDS variables that may be referenced laster */
1295
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
1296
+ size_t ddsIdx;
1297
+ U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
1298
+ U32 dmsTag;
1299
+ U32* dmsRow;
1300
+ BYTE* dmsTagRow;
1301
+
1302
+ if (dictMode == ZSTD_dedicatedDictSearch) {
1303
+ const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
1304
+ { /* Prefetch DDS hashtable entry */
1305
+ ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
1306
+ PREFETCH_L1(&dms->hashTable[ddsIdx]);
1307
+ }
1308
+ ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
1309
+ }
1310
+
1311
+ if (dictMode == ZSTD_dictMatchState) {
1312
+ /* Prefetch DMS rows */
1313
+ U32* const dmsHashTable = dms->hashTable;
1314
+ U16* const dmsTagTable = dms->tagTable;
1315
+ U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1316
+ U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1317
+ dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
1318
+ dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
1319
+ dmsRow = dmsHashTable + dmsRelRow;
1320
+ ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
1321
+ }
1322
+
1323
+ /* Update the hashTable and tagTable up to (but not including) ip */
1324
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
1325
+ { /* Get the hash for ip, compute the appropriate row */
1326
+ U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
1327
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1328
+ U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
1329
+ U32* const row = hashTable + relRow;
1330
+ BYTE* tagRow = (BYTE*)(tagTable + relRow);
1331
+ U32 const head = *tagRow & rowMask;
1332
+ U32 matchBuffer[32 /* maximum nb entries per row */];
1333
+ size_t numMatches = 0;
1334
+ size_t currMatch = 0;
1335
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
1336
+
1337
+ /* Cycle through the matches and prefetch */
1338
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1339
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1340
+ U32 const matchIndex = row[matchPos];
1341
+ assert(numMatches < rowEntries);
1342
+ if (matchIndex < lowLimit)
1343
+ break;
1344
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1345
+ PREFETCH_L1(base + matchIndex);
1346
+ } else {
1347
+ PREFETCH_L1(dictBase + matchIndex);
1348
+ }
1349
+ matchBuffer[numMatches++] = matchIndex;
1350
+ }
1351
+
1352
+ /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
1353
+ in ZSTD_row_update_internal() at the next search. */
1354
+ {
1355
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1356
+ tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
1357
+ row[pos] = ms->nextToUpdate++;
1358
+ }
1359
+
1360
+ /* Return the longest match */
1361
+ for (; currMatch < numMatches; ++currMatch) {
1362
+ U32 const matchIndex = matchBuffer[currMatch];
1363
+ size_t currentMl=0;
1364
+ assert(matchIndex < curr);
1365
+ assert(matchIndex >= lowLimit);
1366
+
1367
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1368
+ const BYTE* const match = base + matchIndex;
1369
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
1370
+ if (match[ml] == ip[ml]) /* potentially better */
1371
+ currentMl = ZSTD_count(ip, match, iLimit);
1372
+ } else {
1373
+ const BYTE* const match = dictBase + matchIndex;
1374
+ assert(match+4 <= dictEnd);
1375
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
1376
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
1377
+ }
1378
+
1379
+ /* Save best solution */
1380
+ if (currentMl > ml) {
1381
+ ml = currentMl;
1382
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
1383
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
1384
+ }
1385
+ }
1386
+ }
1387
+
1388
+ if (dictMode == ZSTD_dedicatedDictSearch) {
1389
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
1390
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
1391
+ } else if (dictMode == ZSTD_dictMatchState) {
1392
+ /* TODO: Measure and potentially add prefetching to DMS */
1393
+ const U32 dmsLowestIndex = dms->window.dictLimit;
1394
+ const BYTE* const dmsBase = dms->window.base;
1395
+ const BYTE* const dmsEnd = dms->window.nextSrc;
1396
+ const U32 dmsSize = (U32)(dmsEnd - dmsBase);
1397
+ const U32 dmsIndexDelta = dictLimit - dmsSize;
1398
+
1399
+ { U32 const head = *dmsTagRow & rowMask;
1400
+ U32 matchBuffer[32 /* maximum nb row entries */];
1401
+ size_t numMatches = 0;
1402
+ size_t currMatch = 0;
1403
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
1404
+
1405
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1406
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1407
+ U32 const matchIndex = dmsRow[matchPos];
1408
+ if (matchIndex < dmsLowestIndex)
1409
+ break;
1410
+ PREFETCH_L1(dmsBase + matchIndex);
1411
+ matchBuffer[numMatches++] = matchIndex;
1412
+ }
1413
+
1414
+ /* Return the longest match */
1415
+ for (; currMatch < numMatches; ++currMatch) {
1416
+ U32 const matchIndex = matchBuffer[currMatch];
1417
+ size_t currentMl=0;
1418
+ assert(matchIndex >= dmsLowestIndex);
1419
+ assert(matchIndex < curr);
1420
+
1421
+ { const BYTE* const match = dmsBase + matchIndex;
1422
+ assert(match+4 <= dmsEnd);
1423
+ if (MEM_read32(match) == MEM_read32(ip))
1424
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
1425
+ }
1426
+
1427
+ if (currentMl > ml) {
1428
+ ml = currentMl;
1429
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
1430
+ if (ip+currentMl == iLimit) break;
1431
+ }
1432
+ }
1433
+ }
1434
+ }
1435
+ return ml;
1436
+ }
1437
+
1438
+ /* Inlining is important to hardwire a hot branch (template emulation) */
1439
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS (
1440
+ ZSTD_matchState_t* ms,
1441
+ const BYTE* ip, const BYTE* const iLimit,
1442
+ const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog)
1443
+ {
1444
+ switch(ms->cParams.minMatch)
1445
+ {
1446
+ default : /* includes case 3 */
1447
+ case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog);
1448
+ case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog);
1449
+ case 7 :
1450
+ case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog);
1451
+ }
1452
+ }
+
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog (
+                         ZSTD_matchState_t* ms,
+                         const BYTE* ip, const BYTE* const iLimit,
+                         size_t* offsetPtr)
+ {
+     const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+     switch(cappedSearchLog)
+     {
+     default :
+     case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4);
+     case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5);
+     }
+ }
+
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog(
+                         ZSTD_matchState_t* ms,
+                         const BYTE* ip, const BYTE* const iLimit,
+                         size_t* offsetPtr)
+ {
+     const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+     switch(cappedSearchLog)
+     {
+     default :
+     case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4);
+     case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5);
+     }
+ }
+
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog(
+                         ZSTD_matchState_t* ms,
+                         const BYTE* ip, const BYTE* const iLimit,
+                         size_t* offsetPtr)
+ {
+     const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+     switch(cappedSearchLog)
+     {
+     default :
+     case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4);
+     case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5);
+     }
+ }
+
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog (
+                         ZSTD_matchState_t* ms,
+                         const BYTE* ip, const BYTE* const iLimit,
+                         size_t* offsetPtr)
+ {
+     const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+     switch(cappedSearchLog)
+     {
+     default :
+     case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4);
+     case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5);
+     }
+ }
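All four wrappers clamp searchLog the same way, because the row matchfinder only implements rows of 2^4 = 16 and 2^5 = 32 tag entries; values below 4 land on the default label and also select rowLog 4, and 32 is exactly the matchBuffer bound used in the DMS search earlier. Restated as a tiny standalone helper (the function name is hypothetical):

#include <stdio.h>

/* Hypothetical restatement of the clamp performed by the wrappers above:
 * only 16- and 32-entry rows exist, so every searchLog collapses onto
 * rowLog 4 or 5. The real code expresses this as MIN() plus the switch. */
static unsigned row_entries_for(unsigned searchLog)
{
    unsigned const rowLog = searchLog < 4 ? 4 : (searchLog > 5 ? 5 : searchLog);
    return 1u << rowLog;
}

int main(void)
{
    unsigned s;
    for (s = 1; s <= 8; s++)
        printf("searchLog %u -> %u row entries\n", s, row_entries_for(s));
    return 0;
}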
+

  /* *******************************
  * Common parser - lazy strategy
  *********************************/
- FORCE_INLINE_TEMPLATE
- size_t ZSTD_compressBlock_lazy_generic(
+ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
+
+ FORCE_INLINE_TEMPLATE size_t
+ ZSTD_compressBlock_lazy_generic(
                          ZSTD_matchState_t* ms, seqStore_t* seqStore,
                          U32 rep[ZSTD_REP_NUM],
                          const void* src, size_t srcSize,
-                         const U32 searchMethod, const U32 depth,
+                         const searchMethod_e searchMethod, const U32 depth,
                          ZSTD_dictMode_e const dictMode)
  {
      const BYTE* const istart = (const BYTE*)src;
      const BYTE* ip = istart;
      const BYTE* anchor = istart;
      const BYTE* const iend = istart + srcSize;
-     const BYTE* const ilimit = iend - 8;
+     const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
      const BYTE* const base = ms->window.base;
      const U32 prefixLowestIndex = ms->window.dictLimit;
      const BYTE* const prefixLowest = base + prefixLowestIndex;
+     const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;

      typedef size_t (*searchMax_f)(
                          ZSTD_matchState_t* ms,
                          const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
-     searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ?
-         (searchMethod ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) :
-         (searchMethod ? ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS);
+
+     /**
+      * This table is indexed first by the four ZSTD_dictMode_e values, and then
+      * by the three searchMethod_e values. NULLs are placed for configurations
+      * that should never occur (extDict modes go to the other implementation
+      * below and there is no DDSS for binary tree search yet).
+      */
+     const searchMax_f searchFuncs[4][3] = {
+         {
+             ZSTD_HcFindBestMatch_selectMLS,
+             ZSTD_BtFindBestMatch_selectMLS,
+             ZSTD_RowFindBestMatch_selectRowLog
+         },
+         {
+             NULL,
+             NULL,
+             NULL
+         },
+         {
+             ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
+             ZSTD_BtFindBestMatch_dictMatchState_selectMLS,
+             ZSTD_RowFindBestMatch_dictMatchState_selectRowLog
+         },
+         {
+             ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
+             NULL,
+             ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog
+         }
+     };
+
+     searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod];
      U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;

+     const int isDMS = dictMode == ZSTD_dictMatchState;
+     const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
+     const int isDxS = isDMS || isDDS;
      const ZSTD_matchState_t* const dms = ms->dictMatchState;
-     const U32 dictLowestIndex = dictMode == ZSTD_dictMatchState ?
-                                 dms->window.dictLimit : 0;
-     const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ?
-                                 dms->window.base : NULL;
-     const BYTE* const dictLowest = dictMode == ZSTD_dictMatchState ?
-                                 dictBase + dictLowestIndex : NULL;
-     const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ?
-                                 dms->window.nextSrc : NULL;
-     const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ?
+     const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0;
+     const BYTE* const dictBase = isDxS ? dms->window.base : NULL;
+     const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
+     const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL;
+     const U32 dictIndexDelta = isDxS ?
                                  prefixLowestIndex - (U32)(dictEnd - dictBase) :
                                  0;
-     const U32 dictAndPrefixLength = (U32)(ip - prefixLowest + dictEnd - dictLowest);
+     const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));

-     /* init */
+     assert(searchMax != NULL);
+
+     DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
      ip += (dictAndPrefixLength == 0);
      if (dictMode == ZSTD_noDict) {
-         U32 const maxRep = (U32)(ip - prefixLowest);
+         U32 const curr = (U32)(ip - base);
+         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
+         U32 const maxRep = curr - windowLow;
          if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
          if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
      }
-     if (dictMode == ZSTD_dictMatchState) {
+     if (isDxS) {
          /* dictMatchState repCode checks don't currently handle repCode == 0
          * disabling. */
          assert(offset_1 <= dictAndPrefixLength);
          assert(offset_2 <= dictAndPrefixLength);
      }

+     if (searchMethod == search_rowHash) {
+         ZSTD_row_fillHashCache(ms, base, rowLog,
+                                MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+                                ms->nextToUpdate, ilimit);
+     }
+
      /* Match Loop */
+ #if defined(__GNUC__) && defined(__x86_64__)
+     /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
+      * code alignment is perturbed. To fix the instability, align the loop on a 32-byte boundary.
+      */
+     __asm__(".p2align 5");
+ #endif
      while (ip < ilimit) {
          size_t matchLength=0;
          size_t offset=0;
          const BYTE* start=ip+1;

          /* check repCode */
-         if (dictMode == ZSTD_dictMatchState) {
+         if (isDxS) {
              const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
-             const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
+             const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
                                  && repIndex < prefixLowestIndex) ?
                                     dictBase + (repIndex - dictIndexDelta) :
                                     base + repIndex;
@@ -722,7 +1660,7 @@ size_t ZSTD_compressBlock_lazy_generic(
              if ((mlRep >= 4) && (gain2 > gain1))
                  matchLength = mlRep, offset = 0, start = ip;
          }
-         if (dictMode == ZSTD_dictMatchState) {
+         if (isDxS) {
              const U32 repIndex = (U32)(ip - base) - offset_1;
              const BYTE* repMatch = repIndex < prefixLowestIndex ?
                                     dictBase + (repIndex - dictIndexDelta) :
@@ -757,7 +1695,7 @@ size_t ZSTD_compressBlock_lazy_generic(
              if ((mlRep >= 4) && (gain2 > gain1))
                  matchLength = mlRep, offset = 0, start = ip;
          }
-         if (dictMode == ZSTD_dictMatchState) {
+         if (isDxS) {
              const U32 repIndex = (U32)(ip - base) - offset_1;
              const BYTE* repMatch = repIndex < prefixLowestIndex ?
                                     dictBase + (repIndex - dictIndexDelta) :
@@ -795,7 +1733,7 @@ size_t ZSTD_compressBlock_lazy_generic(
              && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
              { start--; matchLength++; }
          }
-         if (dictMode == ZSTD_dictMatchState) {
+         if (isDxS) {
              U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
              const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
              const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
@@ -806,17 +1744,16 @@ size_t ZSTD_compressBlock_lazy_generic(
          /* store sequence */
  _storeSequence:
          {   size_t const litLength = start - anchor;
-             ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH);
+             ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
              anchor = ip = start + matchLength;
          }

          /* check immediate repcode */
-         if (dictMode == ZSTD_dictMatchState) {
+         if (isDxS) {
              while (ip <= ilimit) {
                  U32 const current2 = (U32)(ip-base);
                  U32 const repIndex = current2 - offset_2;
-                 const BYTE* repMatch = dictMode == ZSTD_dictMatchState
-                     && repIndex < prefixLowestIndex ?
+                 const BYTE* repMatch = repIndex < prefixLowestIndex ?
                          dictBase - dictIndexDelta + repIndex :
                          base + repIndex;
                  if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
@@ -824,7 +1761,7 @@ _storeSequence:
                      const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
                      matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
                      offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */
-                     ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
+                     ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
                      ip += matchLength;
                      anchor = ip;
                      continue;
@@ -839,7 +1776,7 @@ _storeSequence:
              /* store sequence */
              matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
              offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
-             ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
+             ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
              ip += matchLength;
              anchor = ip;
              continue; /* faster when present ... (?) */
@@ -850,7 +1787,7 @@ _storeSequence:
      rep[1] = offset_2 ? offset_2 : savedOffset;

      /* Return the last literals size */
-     return iend - anchor;
+     return (size_t)(iend - anchor);
  }
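The searchFuncs[4][3] table above replaces the old nested ternaries: the row is picked by dictMode, the column by searchMethod, NULL marks combinations callers never request, and assert(searchMax != NULL) is the guard. A self-contained miniature of the same dispatch pattern (all names hypothetical):

#include <assert.h>
#include <stdio.h>

typedef enum { search_hashChain = 0, search_binaryTree = 1, search_rowHash = 2 } method_e;
typedef enum { mode_noDict = 0, mode_extDict = 1, mode_dms = 2, mode_dds = 3 } mode_e;

typedef const char* (*search_f)(void);

static const char* hc(void)  { return "hash-chain"; }
static const char* bt(void)  { return "binary-tree"; }
static const char* row(void) { return "row-hash"; }

/* rows = dictionary mode, columns = search method; NULL marks combinations
 * that callers must never request, enforced by the assert below */
static const search_f table[4][3] = {
    { hc,   bt,   row  },   /* noDict */
    { NULL, NULL, NULL },   /* extDict handled by a separate generic */
    { hc,   bt,   row  },   /* dictMatchState */
    { hc,   NULL, row  },   /* dedicatedDictSearch: no binary tree yet */
};

static const char* dispatch(mode_e mode, method_e method)
{
    search_f const f = table[mode][method];
    assert(f != NULL);   /* an invalid combination is a programming error */
    return f();
}

int main(void)
{
    printf("noDict + rowHash   -> %s\n", dispatch(mode_noDict, search_rowHash));
    printf("dds    + hashChain -> %s\n", dispatch(mode_dds, search_hashChain));
    return 0;
}

Because searchMethod and dictMode are compile-time constants at every call site of the force-inlined generic, the table indexing folds away and each wrapper still compiles down to a direct call.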


@@ -858,101 +1795,207 @@ size_t ZSTD_compressBlock_btlazy2(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize)
  {
-     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 1, 2, ZSTD_noDict);
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
  }

  size_t ZSTD_compressBlock_lazy2(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize)
  {
-     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 2, ZSTD_noDict);
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
  }

  size_t ZSTD_compressBlock_lazy(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize)
  {
-     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 1, ZSTD_noDict);
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
  }

  size_t ZSTD_compressBlock_greedy(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize)
  {
-     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 0, ZSTD_noDict);
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
  }

  size_t ZSTD_compressBlock_btlazy2_dictMatchState(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize)
  {
-     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 1, 2, ZSTD_dictMatchState);
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
  }

  size_t ZSTD_compressBlock_lazy2_dictMatchState(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize)
  {
-     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 2, ZSTD_dictMatchState);
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
  }

  size_t ZSTD_compressBlock_lazy_dictMatchState(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize)
  {
-     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 1, ZSTD_dictMatchState);
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
  }

  size_t ZSTD_compressBlock_greedy_dictMatchState(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize)
  {
-     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 0, ZSTD_dictMatchState);
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
+ }
+
+
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
+ }
+
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
+ }
+
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
+ }
+
+ /* Row-based matchfinder */
+ size_t ZSTD_compressBlock_lazy2_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
+ }
+
+ size_t ZSTD_compressBlock_lazy_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
+ }
+
+ size_t ZSTD_compressBlock_greedy_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
+ }
+
+ size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
  }

+ size_t ZSTD_compressBlock_lazy_dictMatchState_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
+ }
+
+ size_t ZSTD_compressBlock_greedy_dictMatchState_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
+ }
+
+
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
+ }
+
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
+ }
+
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
+ }

  FORCE_INLINE_TEMPLATE
  size_t ZSTD_compressBlock_lazy_extDict_generic(
                          ZSTD_matchState_t* ms, seqStore_t* seqStore,
                          U32 rep[ZSTD_REP_NUM],
                          const void* src, size_t srcSize,
-                         const U32 searchMethod, const U32 depth)
+                         const searchMethod_e searchMethod, const U32 depth)
  {
      const BYTE* const istart = (const BYTE*)src;
      const BYTE* ip = istart;
      const BYTE* anchor = istart;
      const BYTE* const iend = istart + srcSize;
-     const BYTE* const ilimit = iend - 8;
+     const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
      const BYTE* const base = ms->window.base;
      const U32 dictLimit = ms->window.dictLimit;
-     const U32 lowestIndex = ms->window.lowLimit;
      const BYTE* const prefixStart = base + dictLimit;
      const BYTE* const dictBase = ms->window.dictBase;
      const BYTE* const dictEnd = dictBase + dictLimit;
-     const BYTE* const dictStart = dictBase + lowestIndex;
+     const BYTE* const dictStart = dictBase + ms->window.lowLimit;
+     const U32 windowLog = ms->cParams.windowLog;
+     const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;

      typedef size_t (*searchMax_f)(
                          ZSTD_matchState_t* ms,
                          const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
-     searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
-
+     const searchMax_f searchFuncs[3] = {
+         ZSTD_HcFindBestMatch_extDict_selectMLS,
+         ZSTD_BtFindBestMatch_extDict_selectMLS,
+         ZSTD_RowFindBestMatch_extDict_selectRowLog
+     };
+     searchMax_f searchMax = searchFuncs[(int)searchMethod];
      U32 offset_1 = rep[0], offset_2 = rep[1];

+     DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
+
      /* init */
      ip += (ip == prefixStart);
+     if (searchMethod == search_rowHash) {
+         ZSTD_row_fillHashCache(ms, base, rowLog,
+                                MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+                                ms->nextToUpdate, ilimit);
+     }

      /* Match Loop */
+ #if defined(__GNUC__) && defined(__x86_64__)
+     /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
+      * code alignment is perturbed. To fix the instability, align the loop on a 32-byte boundary.
+      */
+     __asm__(".p2align 5");
+ #endif
      while (ip < ilimit) {
          size_t matchLength=0;
          size_t offset=0;
          const BYTE* start=ip+1;
-         U32 current = (U32)(ip-base);
+         U32 curr = (U32)(ip-base);

          /* check repCode */
-         {   const U32 repIndex = (U32)(current+1 - offset_1);
+         {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
+             const U32 repIndex = (U32)(curr+1 - offset_1);
              const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
              const BYTE* const repMatch = repBase + repIndex;
-             if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
+             if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
+                & (offset_1 < curr+1 - windowLow) ) /* note: we are searching at curr+1 */
              if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
                  /* repcode detected, we should take it */
                  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -976,13 +2019,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
          if (depth>=1)
          while (ip<ilimit) {
              ip ++;
-             current++;
+             curr++;
              /* check repCode */
              if (offset) {
-                 const U32 repIndex = (U32)(current - offset_1);
+                 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
+                 const U32 repIndex = (U32)(curr - offset_1);
                  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                  const BYTE* const repMatch = repBase + repIndex;
-                 if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
+                 if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+                    & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
                  if (MEM_read32(ip) == MEM_read32(repMatch)) {
                      /* repcode detected */
                      const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1006,13 +2051,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
              /* let's find an even better one */
              if ((depth==2) && (ip<ilimit)) {
                  ip ++;
-                 current++;
+                 curr++;
                  /* check repCode */
                  if (offset) {
-                     const U32 repIndex = (U32)(current - offset_1);
+                     const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
+                     const U32 repIndex = (U32)(curr - offset_1);
                      const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                      const BYTE* const repMatch = repBase + repIndex;
-                     if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
+                     if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+                        & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
                      if (MEM_read32(ip) == MEM_read32(repMatch)) {
                          /* repcode detected */
                          const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1047,22 +2094,25 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
          /* store sequence */
  _storeSequence:
          {   size_t const litLength = start - anchor;
-             ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH);
+             ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
              anchor = ip = start + matchLength;
          }

          /* check immediate repcode */
          while (ip <= ilimit) {
-             const U32 repIndex = (U32)((ip-base) - offset_2);
+             const U32 repCurrent = (U32)(ip-base);
+             const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
+             const U32 repIndex = repCurrent - offset_2;
              const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
              const BYTE* const repMatch = repBase + repIndex;
-             if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
+             if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+                & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
              if (MEM_read32(ip) == MEM_read32(repMatch)) {
                  /* repcode detected, we should take it */
                  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                  offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */
-                 ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
+                 ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
                  ip += matchLength;
                  anchor = ip;
                  continue; /* faster when present ... (?) */
@@ -1075,7 +2125,7 @@ _storeSequence:
      rep[1] = offset_2;

      /* Return the last literals size */
-     return iend - anchor;
+     return (size_t)(iend - anchor);
  }
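Two guards recur in the repcode checks above. `(U32)((dictLimit-1) - repIndex) >= 3` uses deliberate unsigned wrap-around to reject only repIndex values within 3 positions of dictLimit, i.e. candidates whose 4-byte read would straddle the dictionary/prefix boundary (anything at or above dictLimit wraps to a huge value and passes). And `offset < curr - windowLow` validates the offset against the window entirely in unsigned arithmetic: for offsets >= 1 it is the same as `windowLow < repIndex < curr`, without ever forming a difference that could underflow. A small self-check of that equivalence:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t windowLow, curr, offset;
    for (windowLow = 0; windowLow < 40; windowLow++)
    for (curr = windowLow; curr < 80; curr++)
    for (offset = 1; offset <= curr; offset++) {
        uint32_t const repIndex = curr - offset;
        int const fast = offset < curr - windowLow;              /* form used above */
        int const slow = (repIndex > windowLow) && (repIndex < curr);
        assert(fast == slow);
    }
    puts("offset < curr - windowLow  <=>  windowLow < repIndex < curr");
    return 0;
}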


@@ -1083,7 +2133,7 @@ size_t ZSTD_compressBlock_greedy_extDict(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize)
  {
-     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 0);
+     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
  }

  size_t ZSTD_compressBlock_lazy_extDict(
@@ -1091,7 +2141,7 @@ size_t ZSTD_compressBlock_lazy_extDict(
          void const* src, size_t srcSize)

  {
-     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 1);
+     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
  }

  size_t ZSTD_compressBlock_lazy2_extDict(
@@ -1099,7 +2149,7 @@ size_t ZSTD_compressBlock_lazy2_extDict(
          void const* src, size_t srcSize)

  {
-     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 2);
+     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
  }

  size_t ZSTD_compressBlock_btlazy2_extDict(
@@ -1107,5 +2157,28 @@ size_t ZSTD_compressBlock_btlazy2_extDict(
          void const* src, size_t srcSize)

  {
-     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 1, 2);
+     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
+ }
+
+ size_t ZSTD_compressBlock_greedy_extDict_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
+ }
+
+ size_t ZSTD_compressBlock_lazy_extDict_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+
+ {
+     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
+ }
+
+ size_t ZSTD_compressBlock_lazy2_extDict_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)

+ {
+     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
  }
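For context on how these new `_row` entry points get exercised: upstream zstd v1.5.0 exposes the row-based matchfinder through an experimental compression parameter in its static-only API. The sketch below assumes that parameter (ZSTD_c_useRowMatchFinder, with 2 meaning force-enable in v1.5.0's ZSTD_useRowMatchFinderMode_e); it illustrates the upstream C API, not the zstd-ruby wrapper itself, and the parameter may differ in later releases:

/* Hedged usage sketch: force-enable the row-based matchfinder. */
#define ZSTD_STATIC_LINKING_ONLY   /* required for experimental parameters */
#include <zstd.h>
#include <stdio.h>

int main(void)
{
    char const src[] = "row hash row hash row hash row hash";
    char dst[256];
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 7);   /* a lazy level */
    /* assumption: 2 == force-enable in v1.5.0; check zstd.h for your version */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_useRowMatchFinder, 2);
    {   size_t const csize = ZSTD_compress2(cctx, dst, sizeof dst, src, sizeof src);
        if (!ZSTD_isError(csize))
            printf("compressed %zu -> %zu bytes\n", sizeof src, csize);
    }
    ZSTD_freeCCtx(cctx);
    return 0;
}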