extzstd 0.3.2 → 0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. checksums.yaml +4 -4
  2. data/README.md +4 -3
  3. data/contrib/zstd/CHANGELOG +225 -1
  4. data/contrib/zstd/CONTRIBUTING.md +158 -75
  5. data/contrib/zstd/LICENSE +4 -4
  6. data/contrib/zstd/Makefile +106 -69
  7. data/contrib/zstd/Package.swift +36 -0
  8. data/contrib/zstd/README.md +64 -36
  9. data/contrib/zstd/SECURITY.md +15 -0
  10. data/contrib/zstd/TESTING.md +2 -3
  11. data/contrib/zstd/lib/BUCK +5 -7
  12. data/contrib/zstd/lib/Makefile +117 -199
  13. data/contrib/zstd/lib/README.md +37 -7
  14. data/contrib/zstd/lib/common/allocations.h +55 -0
  15. data/contrib/zstd/lib/common/bits.h +200 -0
  16. data/contrib/zstd/lib/common/bitstream.h +80 -86
  17. data/contrib/zstd/lib/common/compiler.h +225 -63
  18. data/contrib/zstd/lib/common/cpu.h +37 -1
  19. data/contrib/zstd/lib/common/debug.c +7 -1
  20. data/contrib/zstd/lib/common/debug.h +21 -12
  21. data/contrib/zstd/lib/common/entropy_common.c +15 -37
  22. data/contrib/zstd/lib/common/error_private.c +9 -2
  23. data/contrib/zstd/lib/common/error_private.h +93 -5
  24. data/contrib/zstd/lib/common/fse.h +12 -87
  25. data/contrib/zstd/lib/common/fse_decompress.c +37 -117
  26. data/contrib/zstd/lib/common/huf.h +97 -172
  27. data/contrib/zstd/lib/common/mem.h +58 -58
  28. data/contrib/zstd/lib/common/pool.c +38 -17
  29. data/contrib/zstd/lib/common/pool.h +10 -4
  30. data/contrib/zstd/lib/common/portability_macros.h +158 -0
  31. data/contrib/zstd/lib/common/threading.c +74 -14
  32. data/contrib/zstd/lib/common/threading.h +5 -10
  33. data/contrib/zstd/lib/common/xxhash.c +6 -814
  34. data/contrib/zstd/lib/common/xxhash.h +6930 -195
  35. data/contrib/zstd/lib/common/zstd_common.c +1 -36
  36. data/contrib/zstd/lib/common/zstd_deps.h +1 -1
  37. data/contrib/zstd/lib/common/zstd_internal.h +68 -154
  38. data/contrib/zstd/lib/common/zstd_trace.h +163 -0
  39. data/contrib/zstd/lib/compress/clevels.h +134 -0
  40. data/contrib/zstd/lib/compress/fse_compress.c +75 -155
  41. data/contrib/zstd/lib/compress/hist.c +1 -1
  42. data/contrib/zstd/lib/compress/hist.h +1 -1
  43. data/contrib/zstd/lib/compress/huf_compress.c +810 -259
  44. data/contrib/zstd/lib/compress/zstd_compress.c +2864 -919
  45. data/contrib/zstd/lib/compress/zstd_compress_internal.h +523 -192
  46. data/contrib/zstd/lib/compress/zstd_compress_literals.c +117 -40
  47. data/contrib/zstd/lib/compress/zstd_compress_literals.h +16 -6
  48. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +28 -19
  49. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +1 -1
  50. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +251 -412
  51. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +1 -1
  52. data/contrib/zstd/lib/compress/zstd_cwksp.h +284 -97
  53. data/contrib/zstd/lib/compress/zstd_double_fast.c +382 -133
  54. data/contrib/zstd/lib/compress/zstd_double_fast.h +14 -2
  55. data/contrib/zstd/lib/compress/zstd_fast.c +732 -260
  56. data/contrib/zstd/lib/compress/zstd_fast.h +3 -2
  57. data/contrib/zstd/lib/compress/zstd_lazy.c +1177 -390
  58. data/contrib/zstd/lib/compress/zstd_lazy.h +129 -14
  59. data/contrib/zstd/lib/compress/zstd_ldm.c +280 -210
  60. data/contrib/zstd/lib/compress/zstd_ldm.h +3 -2
  61. data/contrib/zstd/lib/compress/zstd_ldm_geartab.h +106 -0
  62. data/contrib/zstd/lib/compress/zstd_opt.c +516 -285
  63. data/contrib/zstd/lib/compress/zstd_opt.h +32 -8
  64. data/contrib/zstd/lib/compress/zstdmt_compress.c +202 -131
  65. data/contrib/zstd/lib/compress/zstdmt_compress.h +9 -6
  66. data/contrib/zstd/lib/decompress/huf_decompress.c +1149 -555
  67. data/contrib/zstd/lib/decompress/huf_decompress_amd64.S +595 -0
  68. data/contrib/zstd/lib/decompress/zstd_ddict.c +4 -4
  69. data/contrib/zstd/lib/decompress/zstd_ddict.h +1 -1
  70. data/contrib/zstd/lib/decompress/zstd_decompress.c +583 -106
  71. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1054 -379
  72. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +14 -3
  73. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +56 -6
  74. data/contrib/zstd/lib/deprecated/zbuff.h +1 -1
  75. data/contrib/zstd/lib/deprecated/zbuff_common.c +1 -1
  76. data/contrib/zstd/lib/deprecated/zbuff_compress.c +24 -4
  77. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +3 -1
  78. data/contrib/zstd/lib/dictBuilder/cover.c +60 -44
  79. data/contrib/zstd/lib/dictBuilder/cover.h +6 -11
  80. data/contrib/zstd/lib/dictBuilder/divsufsort.c +1 -1
  81. data/contrib/zstd/lib/dictBuilder/fastcover.c +26 -18
  82. data/contrib/zstd/lib/dictBuilder/zdict.c +100 -101
  83. data/contrib/zstd/lib/legacy/zstd_legacy.h +38 -1
  84. data/contrib/zstd/lib/legacy/zstd_v01.c +18 -53
  85. data/contrib/zstd/lib/legacy/zstd_v01.h +1 -1
  86. data/contrib/zstd/lib/legacy/zstd_v02.c +28 -85
  87. data/contrib/zstd/lib/legacy/zstd_v02.h +1 -1
  88. data/contrib/zstd/lib/legacy/zstd_v03.c +29 -88
  89. data/contrib/zstd/lib/legacy/zstd_v03.h +1 -1
  90. data/contrib/zstd/lib/legacy/zstd_v04.c +27 -80
  91. data/contrib/zstd/lib/legacy/zstd_v04.h +1 -1
  92. data/contrib/zstd/lib/legacy/zstd_v05.c +36 -85
  93. data/contrib/zstd/lib/legacy/zstd_v05.h +1 -1
  94. data/contrib/zstd/lib/legacy/zstd_v06.c +44 -96
  95. data/contrib/zstd/lib/legacy/zstd_v06.h +1 -1
  96. data/contrib/zstd/lib/legacy/zstd_v07.c +37 -92
  97. data/contrib/zstd/lib/legacy/zstd_v07.h +1 -1
  98. data/contrib/zstd/lib/libzstd.mk +237 -0
  99. data/contrib/zstd/lib/libzstd.pc.in +4 -3
  100. data/contrib/zstd/lib/module.modulemap +35 -0
  101. data/contrib/zstd/lib/{dictBuilder/zdict.h → zdict.h} +202 -33
  102. data/contrib/zstd/lib/zstd.h +1030 -332
  103. data/contrib/zstd/lib/{common/zstd_errors.h → zstd_errors.h} +27 -8
  104. data/ext/extconf.rb +26 -7
  105. data/ext/extzstd.c +51 -24
  106. data/ext/extzstd.h +33 -6
  107. data/ext/extzstd_stream.c +74 -31
  108. data/ext/libzstd_conf.h +0 -1
  109. data/ext/zstd_decompress_asm.S +1 -0
  110. metadata +17 -7
  111. data/contrib/zstd/appveyor.yml +0 -292
  112. data/ext/depend +0 -2
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -10,14 +10,23 @@

  #include "zstd_compress_internal.h"
  #include "zstd_lazy.h"
+ #include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
+
+ #if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
+
+ #define kLazySkippingStep 8


  /*-*************************************
  * Binary Tree search
  ***************************************/

- static void
- ZSTD_updateDUBT(ZSTD_matchState_t* ms,
+ static
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ void ZSTD_updateDUBT(ZSTD_matchState_t* ms,
  const BYTE* ip, const BYTE* iend,
  U32 mls)
  {
@@ -60,8 +69,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
  * sort one already inserted but unsorted position
  * assumption : curr >= btlow == (curr - btmask)
  * doesn't fail */
- static void
- ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
+ static
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
  U32 curr, const BYTE* inputEnd,
  U32 nbCompares, U32 btLow,
  const ZSTD_dictMode_e dictMode)
@@ -93,7 +103,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
  assert(curr >= btLow);
  assert(ip < iend); /* condition for ZSTD_count */

- while (nbCompares-- && (matchIndex > windowLow)) {
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
  assert(matchIndex < curr);
@@ -149,9 +159,10 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
  }


- static size_t
- ZSTD_DUBT_findBetterDictMatch (
- ZSTD_matchState_t* ms,
+ static
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_DUBT_findBetterDictMatch (
+ const ZSTD_matchState_t* ms,
  const BYTE* const ip, const BYTE* const iend,
  size_t* offsetPtr,
  size_t bestLength,
@@ -185,7 +196,7 @@ ZSTD_DUBT_findBetterDictMatch (
  (void)dictMode;
  assert(dictMode == ZSTD_dictMatchState);

- while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
+ for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
  U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
  const BYTE* match = dictBase + dictMatchIndex;
@@ -197,8 +208,8 @@ ZSTD_DUBT_findBetterDictMatch (
  U32 matchIndex = dictMatchIndex + dictIndexDelta;
  if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
  DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
+ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  }
  if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
  break; /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,7 +229,7 @@ ZSTD_DUBT_findBetterDictMatch (
  }

  if (bestLength >= MINMATCH) {
- U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
  DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
  curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
  }
@@ -227,10 +238,11 @@ ZSTD_DUBT_findBetterDictMatch (
  }


- static size_t
- ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+ static
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  const BYTE* const ip, const BYTE* const iend,
- size_t* offsetPtr,
+ size_t* offBasePtr,
  U32 const mls,
  const ZSTD_dictMode_e dictMode)
  {
@@ -309,7 +321,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  matchIndex = hashTable[h];
  hashTable[h] = curr; /* Update Hash Table */

- while (nbCompares-- && (matchIndex > windowLow)) {
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
  U32* const nextPtr = bt + 2*(matchIndex & btMask);
  size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
  const BYTE* match;
@@ -327,8 +339,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  if (matchLength > bestLength) {
  if (matchLength > matchEndIdx - matchIndex)
  matchEndIdx = matchIndex + (U32)matchLength;
- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
+ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
  if (dictMode == ZSTD_dictMatchState) {
  nbCompares = 0; /* in addition to avoiding checking any
@@ -357,19 +369,20 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,

  *smallerPtr = *largerPtr = 0;

+ assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
  if (dictMode == ZSTD_dictMatchState && nbCompares) {
  bestLength = ZSTD_DUBT_findBetterDictMatch(
  ms, ip, iend,
- offsetPtr, bestLength, nbCompares,
+ offBasePtr, bestLength, nbCompares,
  mls, dictMode);
  }

  assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
  ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
  if (bestLength >= MINMATCH) {
- U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
  DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
- curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
+ curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
  }
  return bestLength;
  }
@@ -377,104 +390,23 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,


  /** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
- FORCE_INLINE_TEMPLATE size_t
- ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
+ FORCE_INLINE_TEMPLATE
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
  const BYTE* const ip, const BYTE* const iLimit,
- size_t* offsetPtr,
+ size_t* offBasePtr,
  const U32 mls /* template */,
  const ZSTD_dictMode_e dictMode)
  {
  DEBUGLOG(7, "ZSTD_BtFindBestMatch");
  if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
  ZSTD_updateDUBT(ms, ip, iLimit, mls);
- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
- }
-
-
- static size_t
- ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
- {
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
- case 7 :
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
- }
+ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
  }

-
- static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
- {
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
- case 7 :
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
- }
- }
-
-
- static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
- {
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
- case 7 :
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
- }
- }
-
-
-
- /* *********************************
- * Hash Chain
+ /***********************************
+ * Dedicated dict search
  ***********************************/
- #define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
-
- /* Update chains up to ip (excluded)
- Assumption : always within prefix (i.e. not within extDict) */
- FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
- ZSTD_matchState_t* ms,
- const ZSTD_compressionParameters* const cParams,
- const BYTE* ip, U32 const mls)
- {
- U32* const hashTable = ms->hashTable;
- const U32 hashLog = cParams->hashLog;
- U32* const chainTable = ms->chainTable;
- const U32 chainMask = (1 << cParams->chainLog) - 1;
- const BYTE* const base = ms->window.base;
- const U32 target = (U32)(ip - base);
- U32 idx = ms->nextToUpdate;
-
- while(idx < target) { /* catch up */
- size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
- NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
- hashTable[h] = idx;
- idx++;
- }
-
- ms->nextToUpdate = target;
- return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
- }
-
- U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
- const ZSTD_compressionParameters* const cParams = &ms->cParams;
- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
- }

  void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
  {
@@ -484,7 +416,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
  U32* const chainTable = ms->chainTable;
  U32 const chainSize = 1 << ms->cParams.chainLog;
  U32 idx = ms->nextToUpdate;
- U32 const minChain = chainSize < target ? target - chainSize : idx;
+ U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
  U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
  U32 const cacheSize = bucketSize - 1;
  U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
@@ -498,13 +430,12 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
  U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
  U32* const tmpHashTable = hashTable;
  U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
- U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
+ U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
  U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
-
  U32 hashIdx;

  assert(ms->cParams.chainLog <= 24);
- assert(ms->cParams.hashLog >= ms->cParams.chainLog);
+ assert(ms->cParams.hashLog > ms->cParams.chainLog);
  assert(idx != 0);
  assert(tmpMinChain <= minChain);

@@ -535,7 +466,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
  if (count == cacheSize) {
  for (count = 0; count < chainLimit;) {
  if (i < minChain) {
- if (!i || countBeyondMinChain++ > cacheSize) {
+ if (!i || ++countBeyondMinChain > cacheSize) {
  /* only allow pulling `cacheSize` number of entries
  * into the cache or chainTable beyond `minChain`,
  * to replace the entries pulled out of the
@@ -591,10 +522,149 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
  ms->nextToUpdate = target;
  }

+ /* Returns the longest match length found in the dedicated dict search structure.
+ * If none are longer than the argument ml, then ml will be returned.
+ */
+ FORCE_INLINE_TEMPLATE
+ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
+ const ZSTD_matchState_t* const dms,
+ const BYTE* const ip, const BYTE* const iLimit,
+ const BYTE* const prefixStart, const U32 curr,
+ const U32 dictLimit, const size_t ddsIdx) {
+ const U32 ddsLowestIndex = dms->window.dictLimit;
+ const BYTE* const ddsBase = dms->window.base;
+ const BYTE* const ddsEnd = dms->window.nextSrc;
+ const U32 ddsSize = (U32)(ddsEnd - ddsBase);
+ const U32 ddsIndexDelta = dictLimit - ddsSize;
+ const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
+ const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
+ U32 ddsAttempt;
+ U32 matchIndex;
+
+ for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
+ PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
+ }
+
+ {
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+ U32 const chainIndex = chainPackedPointer >> 8;
+
+ PREFETCH_L1(&dms->chainTable[chainIndex]);
+ }
+
+ for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
+ size_t currentMl=0;
+ const BYTE* match;
+ matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
+ match = ddsBase + matchIndex;
+
+ if (!matchIndex) {
+ return ml;
+ }
+
+ /* guaranteed by table construction */
+ (void)ddsLowestIndex;
+ assert(matchIndex >= ddsLowestIndex);
+ assert(match+4 <= ddsEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) {
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+ }
+
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
+ if (ip+currentMl == iLimit) {
+ /* best possible, avoids read overflow on next attempt */
+ return ml;
+ }
+ }
+ }
+
+ {
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+ U32 chainIndex = chainPackedPointer >> 8;
+ U32 const chainLength = chainPackedPointer & 0xFF;
+ U32 const chainAttempts = nbAttempts - ddsAttempt;
+ U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
+ U32 chainAttempt;
+
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
+ PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
+ }
+
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
+ size_t currentMl=0;
+ const BYTE* match;
+ matchIndex = dms->chainTable[chainIndex];
+ match = ddsBase + matchIndex;
+
+ /* guaranteed by table construction */
+ assert(matchIndex >= ddsLowestIndex);
+ assert(match+4 <= ddsEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) {
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+ }
+
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+ }
+ }
+ return ml;
+ }
+
+
+ /* *********************************
+ * Hash Chain
+ ***********************************/
+ #define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
+
+ /* Update chains up to ip (excluded)
+ Assumption : always within prefix (i.e. not within extDict) */
+ FORCE_INLINE_TEMPLATE
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ U32 ZSTD_insertAndFindFirstIndex_internal(
+ ZSTD_matchState_t* ms,
+ const ZSTD_compressionParameters* const cParams,
+ const BYTE* ip, U32 const mls, U32 const lazySkipping)
+ {
+ U32* const hashTable = ms->hashTable;
+ const U32 hashLog = cParams->hashLog;
+ U32* const chainTable = ms->chainTable;
+ const U32 chainMask = (1 << cParams->chainLog) - 1;
+ const BYTE* const base = ms->window.base;
+ const U32 target = (U32)(ip - base);
+ U32 idx = ms->nextToUpdate;
+
+ while(idx < target) { /* catch up */
+ size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
+ NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
+ hashTable[h] = idx;
+ idx++;
+ /* Stop inserting every position when in the lazy skipping mode. */
+ if (lazySkipping)
+ break;
+ }
+
+ ms->nextToUpdate = target;
+ return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
+ }
+
+ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
+ }

  /* inlining is important to hardwire a hot branch (template emulation) */
  FORCE_INLINE_TEMPLATE
- size_t ZSTD_HcFindBestMatch_generic (
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_HcFindBestMatch(
  ZSTD_matchState_t* ms,
  const BYTE* const ip, const BYTE* const iLimit,
  size_t* offsetPtr,
@@ -633,14 +703,15 @@ size_t ZSTD_HcFindBestMatch_generic (
  }

  /* HC4 match finder */
- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);

  for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
  size_t currentMl=0;
  if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
  const BYTE* const match = base + matchIndex;
  assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
- if (match[ml] == ip[ml]) /* potentially better */
+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
  currentMl = ZSTD_count(ip, match, iLimit);
  } else {
  const BYTE* const match = dictBase + matchIndex;
@@ -652,7 +723,7 @@ size_t ZSTD_HcFindBestMatch_generic (
  /* save best solution */
  if (currentMl > ml) {
  ml = currentMl;
- *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  }

@@ -660,91 +731,10 @@ size_t ZSTD_HcFindBestMatch_generic (
  matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
  }

+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
  if (dictMode == ZSTD_dedicatedDictSearch) {
- const U32 ddsLowestIndex = dms->window.dictLimit;
- const BYTE* const ddsBase = dms->window.base;
- const BYTE* const ddsEnd = dms->window.nextSrc;
- const U32 ddsSize = (U32)(ddsEnd - ddsBase);
- const U32 ddsIndexDelta = dictLimit - ddsSize;
- const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
- const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
- U32 ddsAttempt;
-
- for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
- PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
- }
-
- {
- U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
- U32 const chainIndex = chainPackedPointer >> 8;
-
- PREFETCH_L1(&dms->chainTable[chainIndex]);
- }
-
- for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
- size_t currentMl=0;
- const BYTE* match;
- matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
- match = ddsBase + matchIndex;
-
- if (!matchIndex) {
- return ml;
- }
-
- /* guaranteed by table construction */
- (void)ddsLowestIndex;
- assert(matchIndex >= ddsLowestIndex);
- assert(match+4 <= ddsEnd);
- if (MEM_read32(match) == MEM_read32(ip)) {
- /* assumption : matchIndex <= dictLimit-4 (by table construction) */
- currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
- }
-
- /* save best solution */
- if (currentMl > ml) {
- ml = currentMl;
- *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
- if (ip+currentMl == iLimit) {
- /* best possible, avoids read overflow on next attempt */
- return ml;
- }
- }
- }
-
- {
- U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
- U32 chainIndex = chainPackedPointer >> 8;
- U32 const chainLength = chainPackedPointer & 0xFF;
- U32 const chainAttempts = nbAttempts - ddsAttempt;
- U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
- U32 chainAttempt;
-
- for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
- PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
- }
-
- for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
- size_t currentMl=0;
- const BYTE* match;
- matchIndex = dms->chainTable[chainIndex];
- match = ddsBase + matchIndex;
-
- /* guaranteed by table construction */
- assert(matchIndex >= ddsLowestIndex);
- assert(match+4 <= ddsEnd);
- if (MEM_read32(match) == MEM_read32(ip)) {
- /* assumption : matchIndex <= dictLimit-4 (by table construction) */
- currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
- }
-
- /* save best solution */
- if (currentMl > ml) {
- ml = currentMl;
- *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
- if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
- }
- }
- }
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
  } else if (dictMode == ZSTD_dictMatchState) {
  const U32* const dmsChainTable = dms->chainTable;
  const U32 dmsChainSize = (1 << dms->cParams.chainLog);
@@ -768,7 +758,8 @@ size_t ZSTD_HcFindBestMatch_generic (
  /* save best solution */
  if (currentMl > ml) {
  ml = currentMl;
- *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+ assert(curr > matchIndex + dmsIndexDelta);
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  }

@@ -781,78 +772,748 @@ size_t ZSTD_HcFindBestMatch_generic (
781
772
  return ml;
782
773
  }
783
774
 
775
+ /* *********************************
776
+ * (SIMD) Row-based matchfinder
777
+ ***********************************/
778
+ /* Constants for row-based hash */
779
+ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
780
+ #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
784
781
 
785
- FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
786
- ZSTD_matchState_t* ms,
787
- const BYTE* ip, const BYTE* const iLimit,
788
- size_t* offsetPtr)
782
+ #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
783
+
784
+ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */
785
+
786
+ /* ZSTD_VecMask_next():
787
+ * Starting from the LSB, returns the idx of the next non-zero bit.
788
+ * Basically counting the nb of trailing zeroes.
789
+ */
790
+ MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
791
+ return ZSTD_countTrailingZeros64(val);
792
+ }
793
+
794
+ /* ZSTD_row_nextIndex():
795
+ * Returns the next index to insert at within a tagTable row, and updates the "head"
796
+ * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
797
+ */
798
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
799
+ U32 next = (*tagRow-1) & rowMask;
800
+ next += (next == 0) ? rowMask : 0; /* skip first position */
801
+ *tagRow = (BYTE)next;
802
+ return next;
803
+ }
804
+
805
+ /* ZSTD_isAligned():
806
+ * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
807
+ */
808
+ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
809
+ assert((align & (align - 1)) == 0);
810
+ return (((size_t)ptr) & (align - 1)) == 0;
811
+ }
812
+
813
+ /* ZSTD_row_prefetch():
814
+ * Performs prefetching for the hashTable and tagTable at a given row.
815
+ */
816
+ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
817
+ PREFETCH_L1(hashTable + relRow);
818
+ if (rowLog >= 5) {
819
+ PREFETCH_L1(hashTable + relRow + 16);
820
+ /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
821
+ }
822
+ PREFETCH_L1(tagTable + relRow);
823
+ if (rowLog == 6) {
824
+ PREFETCH_L1(tagTable + relRow + 32);
825
+ }
826
+ assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
827
+ assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
828
+ assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
829
+ }
830
+
831
+ /* ZSTD_row_fillHashCache():
832
+ * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
833
+ * but not beyond iLimit.
834
+ */
835
+ FORCE_INLINE_TEMPLATE
836
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
837
+ void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
838
+ U32 const rowLog, U32 const mls,
839
+ U32 idx, const BYTE* const iLimit)
789
840
  {
790
- switch(ms->cParams.minMatch)
791
- {
792
- default : /* includes case 3 */
793
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
794
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
795
- case 7 :
796
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
841
+ U32 const* const hashTable = ms->hashTable;
842
+ BYTE const* const tagTable = ms->tagTable;
843
+ U32 const hashLog = ms->rowHashLog;
844
+ U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
845
+ U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
846
+
847
+ for (; idx < lim; ++idx) {
848
+ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
849
+ U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
850
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
851
+ ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
797
852
  }
853
+
854
+ DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
855
+ ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
856
+ ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
798
857
  }
799
858
 
859
+ /* ZSTD_row_nextCachedHash():
860
+ * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
861
+ * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
862
+ */
863
+ FORCE_INLINE_TEMPLATE
864
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
865
+ U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
866
+ BYTE const* tagTable, BYTE const* base,
867
+ U32 idx, U32 const hashLog,
868
+ U32 const rowLog, U32 const mls,
869
+ U64 const hashSalt)
870
+ {
871
+ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
872
+ U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
873
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
874
+ { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
875
+ cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
876
+ return hash;
877
+ }
878
+ }
800
879
 
801
- static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
802
- ZSTD_matchState_t* ms,
803
- const BYTE* ip, const BYTE* const iLimit,
804
- size_t* offsetPtr)
880
+ /* ZSTD_row_update_internalImpl():
881
+ * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
882
+ */
883
+ FORCE_INLINE_TEMPLATE
884
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
885
+ void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
886
+ U32 updateStartIdx, U32 const updateEndIdx,
887
+ U32 const mls, U32 const rowLog,
888
+ U32 const rowMask, U32 const useCache)
805
889
  {
806
- switch(ms->cParams.minMatch)
807
- {
808
- default : /* includes case 3 */
809
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
810
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
811
- case 7 :
812
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
890
+ U32* const hashTable = ms->hashTable;
891
+ BYTE* const tagTable = ms->tagTable;
892
+ U32 const hashLog = ms->rowHashLog;
893
+ const BYTE* const base = ms->window.base;
894
+
895
+ DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
896
+ for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
897
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
898
+ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
899
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
900
+ U32* const row = hashTable + relRow;
901
+ BYTE* tagRow = tagTable + relRow;
902
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
903
+
904
+ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
905
+ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
906
+ row[pos] = updateStartIdx;
907
+ }
908
+ }
909
+
910
+ /* ZSTD_row_update_internal():
911
+ * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
912
+ * Skips sections of long matches as is necessary.
913
+ */
914
+ FORCE_INLINE_TEMPLATE
915
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
916
+ void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
917
+ U32 const mls, U32 const rowLog,
918
+ U32 const rowMask, U32 const useCache)
919
+ {
920
+ U32 idx = ms->nextToUpdate;
921
+ const BYTE* const base = ms->window.base;
922
+ const U32 target = (U32)(ip - base);
923
+ const U32 kSkipThreshold = 384;
924
+ const U32 kMaxMatchStartPositionsToUpdate = 96;
925
+ const U32 kMaxMatchEndPositionsToUpdate = 32;
926
+
927
+ if (useCache) {
928
+ /* Only skip positions when using hash cache, i.e.
929
+ * if we are loading a dict, don't skip anything.
930
+ * If we decide to skip, then we only update a set number
931
+ * of positions at the beginning and end of the match.
932
+ */
933
+ if (UNLIKELY(target - idx > kSkipThreshold)) {
934
+ U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
935
+ ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
936
+ idx = target - kMaxMatchEndPositionsToUpdate;
937
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
938
+ }
813
939
  }
940
+ assert(target >= idx);
941
+ ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
942
+ ms->nextToUpdate = target;
814
943
  }
815
944
 
945
+ /* ZSTD_row_update():
946
+ * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
947
+ * processing.
948
+ */
949
+ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
950
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
951
+ const U32 rowMask = (1u << rowLog) - 1;
952
+ const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
816
953
 
817
- static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS (
818
- ZSTD_matchState_t* ms,
819
- const BYTE* ip, const BYTE* const iLimit,
820
- size_t* offsetPtr)
954
+ DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
955
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
956
+ }
957
+
958
+ /* Returns the mask width of bits group of which will be set to 1. Given not all
959
+ * architectures have easy movemask instruction, this helps to iterate over
960
+ * groups of bits easier and faster.
961
+ */
962
+ FORCE_INLINE_TEMPLATE U32
963
+ ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
821
964
  {
822
- switch(ms->cParams.minMatch)
823
- {
824
- default : /* includes case 3 */
825
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch);
826
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch);
827
- case 7 :
828
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch);
965
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
966
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
967
+ (void)rowEntries;
968
+ #if defined(ZSTD_ARCH_ARM_NEON)
969
+ /* NEON path only works for little endian */
970
+ if (!MEM_isLittleEndian()) {
971
+ return 1;
829
972
  }
973
+ if (rowEntries == 16) {
974
+ return 4;
975
+ }
976
+ if (rowEntries == 32) {
977
+ return 2;
978
+ }
979
+ if (rowEntries == 64) {
980
+ return 1;
981
+ }
982
+ #endif
983
+ return 1;
830
984
  }
831
985
 
986
+ #if defined(ZSTD_ARCH_X86_SSE2)
987
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
988
+ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
989
+ {
990
+ const __m128i comparisonMask = _mm_set1_epi8((char)tag);
991
+ int matches[4] = {0};
992
+ int i;
993
+ assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
994
+ for (i=0; i<nbChunks; i++) {
995
+ const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i));
996
+ const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
997
+ matches[i] = _mm_movemask_epi8(equalMask);
998
+ }
999
+ if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head);
1000
+ if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
1001
+ assert(nbChunks == 4);
1002
+ return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
1003
+ }
1004
+ #endif
1005
+
1006
+ #if defined(ZSTD_ARCH_ARM_NEON)
1007
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
1008
+ ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
1009
+ {
1010
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
1011
+ if (rowEntries == 16) {
1012
+ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
1013
+ * After that groups of 4 bits represent the equalMask. We lower
1014
+ * all bits except the highest in these groups by doing AND with
1015
+ * 0x88 = 0b10001000.
1016
+ */
1017
+ const uint8x16_t chunk = vld1q_u8(src);
1018
+ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
1019
+ const uint8x8_t res = vshrn_n_u16(equalMask, 4);
1020
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
1021
+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
1022
+ } else if (rowEntries == 32) {
1023
+ /* Same idea as with rowEntries == 16 but doing AND with
1024
+ * 0x55 = 0b01010101.
1025
+ */
1026
+ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
1027
+ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
1028
+ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
1029
+ const uint8x16_t dup = vdupq_n_u8(tag);
1030
+ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
1031
+ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
1032
+ const uint8x8_t res = vsli_n_u8(t0, t1, 4);
1033
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
1034
+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
1035
+ } else { /* rowEntries == 64 */
1036
+ const uint8x16x4_t chunk = vld4q_u8(src);
1037
+ const uint8x16_t dup = vdupq_n_u8(tag);
1038
+ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
1039
+ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
1040
+ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
1041
+ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
1042
+
1043
+ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
1044
+ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
1045
+ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
1046
+ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
1047
+ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
1048
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
1049
+ return ZSTD_rotateRight_U64(matches, headGrouped);
1050
+ }
1051
+ }
1052
+ #endif
1053
+
1054
+ /* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
1055
+ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
1056
+ * matches the hash at the nth position in a row of the tagTable.
1057
+ * Each row is a circular buffer beginning at the value of "headGrouped". So we
1058
+ * must rotate the "matches" bitfield to match up with the actual layout of the
1059
+ * entries within the hashTable */
1060
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
1061
+ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
1062
+ {
1063
+ const BYTE* const src = tagRow;
1064
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
1065
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
1066
+ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);
1067
+
1068
+ #if defined(ZSTD_ARCH_X86_SSE2)
1069
+
1070
+ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
1071
+
1072
+ #else /* SW or NEON-LE */
1073
+
1074
+ # if defined(ZSTD_ARCH_ARM_NEON)
1075
+ /* This NEON path only works for little endian - otherwise use SWAR below */
1076
+ if (MEM_isLittleEndian()) {
1077
+ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
1078
+ }
1079
+ # endif /* ZSTD_ARCH_ARM_NEON */
1080
+ /* SWAR */
1081
+ { const int chunkSize = sizeof(size_t);
1082
+ const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
1083
+ const size_t xFF = ~((size_t)0);
1084
+ const size_t x01 = xFF / 0xFF;
1085
+ const size_t x80 = x01 << 7;
1086
+ const size_t splatChar = tag * x01;
1087
+ ZSTD_VecMask matches = 0;
1088
+ int i = rowEntries - chunkSize;
1089
+ assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
1090
+ if (MEM_isLittleEndian()) { /* runtime check so have two loops */
1091
+ const size_t extractMagic = (xFF / 0x7F) >> chunkSize;
1092
+ do {
1093
+ size_t chunk = MEM_readST(&src[i]);
1094
+ chunk ^= splatChar;
1095
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
1096
+ matches <<= chunkSize;
1097
+ matches |= (chunk * extractMagic) >> shiftAmount;
1098
+ i -= chunkSize;
1099
+ } while (i >= 0);
1100
+ } else { /* big endian: reverse bits during extraction */
1101
+ const size_t msb = xFF ^ (xFF >> 1);
1102
+ const size_t extractMagic = (msb / 0x1FF) | msb;
1103
+ do {
1104
+ size_t chunk = MEM_readST(&src[i]);
1105
+ chunk ^= splatChar;
1106
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
1107
+ matches <<= chunkSize;
1108
+ matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
1109
+ i -= chunkSize;
1110
+ } while (i >= 0);
1111
+ }
1112
+ matches = ~matches;
1113
+ if (rowEntries == 16) {
1114
+ return ZSTD_rotateRight_U16((U16)matches, headGrouped);
1115
+ } else if (rowEntries == 32) {
1116
+ return ZSTD_rotateRight_U32((U32)matches, headGrouped);
1117
+ } else {
1118
+ return ZSTD_rotateRight_U64((U64)matches, headGrouped);
1119
+ }
1120
+ }
1121
+ #endif
1122
+ }
832
1123
 
833
- FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
1124
+ /* The high-level approach of the SIMD row based match finder is as follows:
1125
+ * - Figure out where to insert the new entry:
1126
+ * - Generate a hash for current input posistion and split it into a one byte of tag and `rowHashLog` bits of index.
1127
+ * - The hash is salted by a value that changes on every contex reset, so when the same table is used
1128
+ * we will avoid collisions that would otherwise slow us down by intorducing phantom matches.
1129
+ * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines
1130
+ * which row to insert into.
1131
+ * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can
1132
+ * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes
1133
+ * per row).
1134
+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and
1135
+ * generate a bitfield that we can cycle through to check the collisions in the hash table.
1136
+ * - Pick the longest match.
1137
+ * - Insert the tag into the equivalent row and position in the tagTable.
1138
+ */
1139
+ FORCE_INLINE_TEMPLATE
1140
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
1141
+ size_t ZSTD_RowFindBestMatch(
834
1142
  ZSTD_matchState_t* ms,
835
- const BYTE* ip, const BYTE* const iLimit,
836
- size_t* offsetPtr)
1143
+ const BYTE* const ip, const BYTE* const iLimit,
1144
+ size_t* offsetPtr,
1145
+ const U32 mls, const ZSTD_dictMode_e dictMode,
1146
+ const U32 rowLog)
837
1147
  {
838
- switch(ms->cParams.minMatch)
839
- {
840
- default : /* includes case 3 */
841
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
842
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
843
- case 7 :
844
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
1148
+ U32* const hashTable = ms->hashTable;
1149
+ BYTE* const tagTable = ms->tagTable;
1150
+ U32* const hashCache = ms->hashCache;
1151
+ const U32 hashLog = ms->rowHashLog;
1152
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
1153
+ const BYTE* const base = ms->window.base;
1154
+ const BYTE* const dictBase = ms->window.dictBase;
1155
+ const U32 dictLimit = ms->window.dictLimit;
1156
+ const BYTE* const prefixStart = base + dictLimit;
1157
+ const BYTE* const dictEnd = dictBase + dictLimit;
1158
+ const U32 curr = (U32)(ip-base);
1159
+ const U32 maxDistance = 1U << cParams->windowLog;
1160
+ const U32 lowestValid = ms->window.lowLimit;
1161
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
1162
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
1163
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
1164
+ const U32 rowEntries = (1U << rowLog);
1165
+ const U32 rowMask = rowEntries - 1;
1166
+ const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
1167
+ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
1168
+ const U64 hashSalt = ms->hashSalt;
1169
+ U32 nbAttempts = 1U << cappedSearchLog;
1170
+ size_t ml=4-1;
1171
+ U32 hash;
1172
+
1173
+ /* DMS/DDS variables that may be referenced laster */
1174
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
1175
+
1176
+ /* Initialize the following variables to satisfy static analyzer */
1177
+ size_t ddsIdx = 0;
1178
+ U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
1179
+ U32 dmsTag = 0;
1180
+ U32* dmsRow = NULL;
1181
+ BYTE* dmsTagRow = NULL;
1182
+
1183
+ if (dictMode == ZSTD_dedicatedDictSearch) {
1184
+ const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
1185
+ { /* Prefetch DDS hashtable entry */
1186
+ ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
1187
+ PREFETCH_L1(&dms->hashTable[ddsIdx]);
1188
+ }
1189
+ ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
1190
+ }
1191
+
1192
+ if (dictMode == ZSTD_dictMatchState) {
1193
+ /* Prefetch DMS rows */
1194
+ U32* const dmsHashTable = dms->hashTable;
1195
+ BYTE* const dmsTagTable = dms->tagTable;
1196
+ U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1197
+ U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1198
+ dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
1199
+ dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
1200
+ dmsRow = dmsHashTable + dmsRelRow;
1201
+ ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
1202
+ }
1203
+
1204
+ /* Update the hashTable and tagTable up to (but not including) ip */
1205
+ if (!ms->lazySkipping) {
1206
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
1207
+ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
1208
+ } else {
1209
+ /* Stop inserting every position when in the lazy skipping mode.
1210
+ * The hash cache is also not kept up to date in this mode.
1211
+ */
1212
+ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
1213
+ ms->nextToUpdate = curr;
1214
+ }
1215
+ ms->hashSaltEntropy += hash; /* collect salt entropy */
1216
+
+ { /* Get the hash for ip, compute the appropriate row */
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
+ U32* const row = hashTable + relRow;
+ BYTE* tagRow = (BYTE*)(tagTable + relRow);
+ U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
+ size_t numMatches = 0;
+ size_t currMatch = 0;
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
+
+ /* Cycle through the matches and prefetch */
+ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
+ U32 const matchIndex = row[matchPos];
+ if (matchPos == 0) continue;
+ assert(numMatches < rowEntries);
+ if (matchIndex < lowLimit)
+ break;
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+ PREFETCH_L1(base + matchIndex);
+ } else {
+ PREFETCH_L1(dictBase + matchIndex);
+ }
+ matchBuffer[numMatches++] = matchIndex;
+ --nbAttempts;
+ }
+
+ /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
+ in ZSTD_row_update_internal() at the next search. */
+ {
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+ tagRow[pos] = (BYTE)tag;
+ row[pos] = ms->nextToUpdate++;
+ }
+
+ /* Return the longest match */
+ for (; currMatch < numMatches; ++currMatch) {
+ U32 const matchIndex = matchBuffer[currMatch];
+ size_t currentMl=0;
+ assert(matchIndex < curr);
+ assert(matchIndex >= lowLimit);
+
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+ const BYTE* const match = base + matchIndex;
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
+ currentMl = ZSTD_count(ip, match, iLimit);
+ } else {
+ const BYTE* const match = dictBase + matchIndex;
+ assert(match+4 <= dictEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
+ }
+
+ /* Save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+ }
+ }
+
+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
+ if (dictMode == ZSTD_dedicatedDictSearch) {
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
+ } else if (dictMode == ZSTD_dictMatchState) {
+ /* TODO: Measure and potentially add prefetching to DMS */
+ const U32 dmsLowestIndex = dms->window.dictLimit;
+ const BYTE* const dmsBase = dms->window.base;
+ const BYTE* const dmsEnd = dms->window.nextSrc;
+ const U32 dmsSize = (U32)(dmsEnd - dmsBase);
+ const U32 dmsIndexDelta = dictLimit - dmsSize;
+
+ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
+ size_t numMatches = 0;
+ size_t currMatch = 0;
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
+
+ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
+ U32 const matchIndex = dmsRow[matchPos];
+ if (matchPos == 0) continue;
+ if (matchIndex < dmsLowestIndex)
+ break;
+ PREFETCH_L1(dmsBase + matchIndex);
+ matchBuffer[numMatches++] = matchIndex;
+ --nbAttempts;
+ }
+
+ /* Return the longest match */
+ for (; currMatch < numMatches; ++currMatch) {
+ U32 const matchIndex = matchBuffer[currMatch];
+ size_t currentMl=0;
+ assert(matchIndex >= dmsLowestIndex);
+ assert(matchIndex < curr);
+
+ { const BYTE* const match = dmsBase + matchIndex;
+ assert(match+4 <= dmsEnd);
+ if (MEM_read32(match) == MEM_read32(ip))
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
+ }
+
+ if (currentMl > ml) {
+ ml = currentMl;
+ assert(curr > matchIndex + dmsIndexDelta);
+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
+ if (ip+currentMl == iLimit) break;
+ }
+ }
+ }
+ }
  }
+ return ml;
  }
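
An aside on the tag-matching step above: ZSTD_row_getMatchMask() compares a one-byte hash "tag" against every entry of a row at once and returns a bitmask of candidate slots. A minimal scalar sketch of that idea, assuming 1-byte tags and at most 64 row entries (row_find_matches is a hypothetical name; the real function uses SIMD paths and a rotating head index):

typedef unsigned long long VecMask;  /* one bit per row entry */

/* Set bit i when the stored tag of entry i equals the probe tag. */
static VecMask row_find_matches(const unsigned char* tagRow,
                                unsigned char tag, unsigned rowEntries)
{
    VecMask matches = 0;
    unsigned i;
    for (i = 0; i < rowEntries; ++i)
        matches |= (VecMask)(tagRow[i] == tag) << i;
    return matches;  /* caller walks set bits via matches &= (matches - 1) */
}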
 
 
+ /**
+ * Generate search functions templated on (dictMode, mls, rowLog).
+ * These functions are outlined for code size & compilation time.
+ * ZSTD_searchMax() dispatches to the correct implementation function.
+ *
+ * TODO: The start of the search function involves loading and calculating a
+ * bunch of constants from the ZSTD_matchState_t. These computations could be
+ * done in an initialization function, and saved somewhere in the match state.
+ * Then we could pass a pointer to the saved state instead of the match state,
+ * and avoid duplicate computations.
+ *
+ * TODO: Move the match re-winding into searchMax. This improves compression
+ * ratio, and unlocks further simplifications with the next TODO.
+ *
+ * TODO: Try moving the repcode search into searchMax. After the re-winding
+ * and repcode search are in searchMax, there is no more logic in the match
+ * finder loop that requires knowledge about the dictMode. So we should be
+ * able to avoid force inlining it, and we can join the extDict loop with
+ * the single segment loop. It should go in searchMax instead of its own
+ * function to avoid having multiple virtual function calls per search.
+ */
+
+ #define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
+ #define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
+ #define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog
+
+ #define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
+
+ #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \
+ ZSTD_matchState_t* ms, \
+ const BYTE* ip, const BYTE* const iLimit, \
+ size_t* offBasePtr) \
+ { \
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
+ return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
+ } \
+
+ #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \
+ ZSTD_matchState_t* ms, \
+ const BYTE* ip, const BYTE* const iLimit, \
+ size_t* offsetPtr) \
+ { \
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
+ return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
+ } \
+
+ #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \
+ ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \
+ ZSTD_matchState_t* ms, \
+ const BYTE* ip, const BYTE* const iLimit, \
+ size_t* offsetPtr) \
+ { \
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
+ assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
+ return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
+ } \
+
+ #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
+ X(dictMode, mls, 4) \
+ X(dictMode, mls, 5) \
+ X(dictMode, mls, 6)
+
+ #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)
+
+ #define ZSTD_FOR_EACH_MLS(X, dictMode) \
+ X(dictMode, 4) \
+ X(dictMode, 5) \
+ X(dictMode, 6)
+
+ #define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
+ X(__VA_ARGS__, noDict) \
+ X(__VA_ARGS__, extDict) \
+ X(__VA_ARGS__, dictMatchState) \
+ X(__VA_ARGS__, dedicatedDictSearch)
+
+ /* Generate row search fns for each combination of (dictMode, mls, rowLog) */
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
+ /* Generate binary Tree search fns for each combination of (dictMode, mls) */
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
+ /* Generate hash chain search fns for each combination of (dictMode, mls) */
+ ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)
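
To see what this X-macro machinery emits, a single instantiation such as GEN_ZSTD_ROW_SEARCH_FN(noDict, 4, 4) expands to roughly the following outlined wrapper (a sketch of the preprocessor output, not additional source):

FORCE_NOINLINE size_t ZSTD_RowFindBestMatch_noDict_4_4(
    ZSTD_matchState_t* ms,
    const BYTE* ip, const BYTE* const iLimit,
    size_t* offsetPtr)
{
    assert(MAX(4, MIN(6, ms->cParams.minMatch)) == 4);
    assert(MAX(4, MIN(6, ms->cParams.searchLog)) == 4);
    return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict, 4);
}

The three iterators therefore generate 4 dictModes x 3 mls values x 3 rowLogs = 36 row searchers, plus 12 binary-tree and 12 hash-chain variants, each a small non-inlined function the dispatcher below can call directly.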
+
+ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
+
+ #define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
+ case mls: \
+ return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
+ #define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
+ case mls: \
+ return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
+ #define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \
+ case rowLog: \
+ return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
+
+ #define ZSTD_SWITCH_MLS(X, dictMode) \
+ switch (mls) { \
+ ZSTD_FOR_EACH_MLS(X, dictMode) \
+ }
+
+ #define ZSTD_SWITCH_ROWLOG(dictMode, mls) \
+ case mls: \
+ switch (rowLog) { \
+ ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
+ } \
+ ZSTD_UNREACHABLE; \
+ break;
+
+ #define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \
+ switch (searchMethod) { \
+ case search_hashChain: \
+ ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
+ break; \
+ case search_binaryTree: \
+ ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
+ break; \
+ case search_rowHash: \
+ ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \
+ break; \
+ } \
+ ZSTD_UNREACHABLE;
+
+ /**
+ * Searches for the longest match at @p ip.
+ * Dispatches to the correct implementation function based on the
+ * (searchMethod, dictMode, mls, rowLog). We use switch statements
+ * here instead of using an indirect function call through a function
+ * pointer because after Spectre and Meltdown mitigations, indirect
+ * function calls can be very costly, especially in the kernel.
+ *
+ * NOTE: dictMode and searchMethod should be templated, so those switch
+ * statements should be optimized out. Only the mls & rowLog switches
+ * should be left.
+ *
+ * @param ms The match state.
+ * @param ip The position to search at.
+ * @param iend The end of the input data.
+ * @param[out] offsetPtr Stores the match offset into this pointer.
+ * @param mls The minimum search length, in the range [4, 6].
+ * @param rowLog The row log (if applicable), in the range [4, 6].
+ * @param searchMethod The search method to use (templated).
+ * @param dictMode The dictMode (templated).
+ *
+ * @returns The length of the longest match found, or a value < mls if no match is found.
+ * If a match is found, its offset is stored in @p offsetPtr.
+ */
+ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
+ ZSTD_matchState_t* ms,
+ const BYTE* ip,
+ const BYTE* iend,
+ size_t* offsetPtr,
+ U32 const mls,
+ U32 const rowLog,
+ searchMethod_e const searchMethod,
+ ZSTD_dictMode_e const dictMode)
+ {
+ if (dictMode == ZSTD_noDict) {
+ ZSTD_SWITCH_SEARCH_METHOD(noDict)
+ } else if (dictMode == ZSTD_extDict) {
+ ZSTD_SWITCH_SEARCH_METHOD(extDict)
+ } else if (dictMode == ZSTD_dictMatchState) {
+ ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
+ } else if (dictMode == ZSTD_dedicatedDictSearch) {
+ ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
+ }
+ ZSTD_UNREACHABLE;
+ return 0;
+ }
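
The point of the switch ladder is branch-predictable direct calls: once searchMethod and dictMode are compile-time constants at a call site, the outer switches fold away and only the small mls/rowLog switches survive. A self-contained toy contrast of the two dispatch styles (hypothetical names, not zstd API):

#include <stddef.h>

static size_t search_hc(const void* ctx)  { (void)ctx; return 1; }  /* stand-ins */
static size_t search_bt(const void* ctx)  { (void)ctx; return 2; }
static size_t search_row(const void* ctx) { (void)ctx; return 3; }

/* Indirect dispatch: one hard-to-predict (and, with retpolines, slow) call. */
typedef size_t (*search_fn)(const void* ctx);
static size_t dispatch_indirect(search_fn fn, const void* ctx) { return fn(ctx); }

/* Switch dispatch: compiles down to a direct call when `method` is constant. */
static size_t dispatch_switch(int method, const void* ctx)
{
    switch (method) {
    case 0:  return search_hc(ctx);
    case 1:  return search_bt(ctx);
    default: return search_row(ctx);
    }
}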
+
  /* *******************************
  * Common parser - lazy strategy
  *********************************/
- typedef enum { search_hashChain, search_binaryTree } searchMethod_e;
 
- FORCE_INLINE_TEMPLATE size_t
- ZSTD_compressBlock_lazy_generic(
+ FORCE_INLINE_TEMPLATE
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_compressBlock_lazy_generic(
  ZSTD_matchState_t* ms, seqStore_t* seqStore,
  U32 rep[ZSTD_REP_NUM],
  const void* src, size_t srcSize,
@@ -863,42 +1524,15 @@ ZSTD_compressBlock_lazy_generic(
  const BYTE* ip = istart;
  const BYTE* anchor = istart;
  const BYTE* const iend = istart + srcSize;
- const BYTE* const ilimit = iend - 8;
+ const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
  const BYTE* const base = ms->window.base;
  const U32 prefixLowestIndex = ms->window.dictLimit;
  const BYTE* const prefixLowest = base + prefixLowestIndex;
+ const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
 
- typedef size_t (*searchMax_f)(
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
-
- /**
- * This table is indexed first by the four ZSTD_dictMode_e values, and then
- * by the two searchMethod_e values. NULLs are placed for configurations
- * that should never occur (extDict modes go to the other implementation
- * below and there is no DDSS for binary tree search yet).
- */
- const searchMax_f searchFuncs[4][2] = {
- {
- ZSTD_HcFindBestMatch_selectMLS,
- ZSTD_BtFindBestMatch_selectMLS
- },
- {
- NULL,
- NULL
- },
- {
- ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
- ZSTD_BtFindBestMatch_dictMatchState_selectMLS
- },
- {
- ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
- NULL
- }
- };
-
- searchMax_f const searchMax = searchFuncs[dictMode][searchMethod == search_binaryTree];
- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
+ U32 offset_1 = rep[0], offset_2 = rep[1];
+ U32 offsetSaved1 = 0, offsetSaved2 = 0;
 
  const int isDMS = dictMode == ZSTD_dictMatchState;
  const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
@@ -913,18 +1547,14 @@ ZSTD_compressBlock_lazy_generic(
  0;
  const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
 
- assert(searchMax != NULL);
-
- DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode);
-
- /* init */
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
  ip += (dictAndPrefixLength == 0);
  if (dictMode == ZSTD_noDict) {
  U32 const curr = (U32)(ip - base);
  U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
  U32 const maxRep = curr - windowLow;
- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
+ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
  }
  if (isDxS) {
  /* dictMatchState repCode checks don't currently handle repCode == 0
@@ -933,6 +1563,13 @@ ZSTD_compressBlock_lazy_generic(
  assert(offset_2 <= dictAndPrefixLength);
  }
 
+ /* Reset the lazy skipping state */
+ ms->lazySkipping = 0;
+
+ if (searchMethod == search_rowHash) {
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+ }
+
  /* Match Loop */
  #if defined(__GNUC__) && defined(__x86_64__)
  /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
@@ -942,8 +1579,9 @@ ZSTD_compressBlock_lazy_generic(
  #endif
  while (ip < ilimit) {
  size_t matchLength=0;
- size_t offset=0;
+ size_t offBase = REPCODE1_TO_OFFBASE;
  const BYTE* start=ip+1;
+ DEBUGLOG(7, "search baseline (depth 0)");
 
  /* check repCode */
  if (isDxS) {
@@ -966,28 +1604,38 @@ ZSTD_compressBlock_lazy_generic(
  }
 
  /* first search (depth 0) */
- { size_t offsetFound = 999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
+ { size_t offbaseFound = 999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
  if (ml2 > matchLength)
- matchLength = ml2, start = ip, offset=offsetFound;
+ matchLength = ml2, start = ip, offBase = offbaseFound;
  }
 
  if (matchLength < 4) {
- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
+ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
+ ip += step;
+ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+ * In this mode we stop inserting every position into our tables, and only insert
+ * positions that we search, which is one in step positions.
+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+ * triggered once we've gone 2KB without finding any matches.
+ */
+ ms->lazySkipping = step > kLazySkippingStep;
  continue;
  }
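
Making the "2KB" figure concrete: assuming kSearchStrength == 8 and kLazySkippingStep == 8 (the internal tuning constants this version of zstd_lazy.c appears to use; both values are assumptions here), step > kLazySkippingStep first holds once ip - anchor reaches 8 * 256 = 2048 bytes. A small self-checking sketch:

#include <assert.h>

static void lazy_skip_threshold_demo(void)
{
    unsigned const kSearchStrength_ = 8, kLazySkippingStep_ = 8;  /* assumed */
    unsigned span;
    for (span = 0; span < 4096; ++span) {
        unsigned const step = (span >> kSearchStrength_) + 1;
        /* skipping mode engages exactly at a 2KB match-free run */
        assert((step > kLazySkippingStep_) == (span >= 2048));
    }
}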
 
  /* let's try to find a better solution */
  if (depth>=1)
  while (ip<ilimit) {
+ DEBUGLOG(7, "search depth 1");
  ip ++;
  if ( (dictMode == ZSTD_noDict)
- && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
  size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
  int const gain2 = (int)(mlRep * 3);
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
  if ((mlRep >= 4) && (gain2 > gain1))
- matchLength = mlRep, offset = 0, start = ip;
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  }
  if (isDxS) {
  const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -999,30 +1647,31 @@ ZSTD_compressBlock_lazy_generic(
  const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
  size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
  int const gain2 = (int)(mlRep * 3);
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
  if ((mlRep >= 4) && (gain2 > gain1))
- matchLength = mlRep, offset = 0, start = ip;
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  }
  }
- { size_t offset2=999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+ { size_t ofbCandidate=999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
  if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offset = offset2, start = ip;
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
  continue; /* search a better one */
  } }
 
  /* let's find an even better one */
  if ((depth==2) && (ip<ilimit)) {
+ DEBUGLOG(7, "search depth 2");
  ip ++;
  if ( (dictMode == ZSTD_noDict)
- && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
  size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
  int const gain2 = (int)(mlRep * 4);
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  if ((mlRep >= 4) && (gain2 > gain1))
- matchLength = mlRep, offset = 0, start = ip;
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  }
  if (isDxS) {
  const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1034,48 +1683,54 @@ ZSTD_compressBlock_lazy_generic(
  const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
  size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
  int const gain2 = (int)(mlRep * 4);
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  if ((mlRep >= 4) && (gain2 > gain1))
- matchLength = mlRep, offset = 0, start = ip;
+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  }
  }
- { size_t offset2=999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+ { size_t ofbCandidate=999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
  if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offset = offset2, start = ip;
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
  continue;
  } } }
  break; /* nothing found : store previous solution */
  }
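
The arbitration above prices a candidate at roughly 4 units per matched byte minus the log2 cost of its offset code, with a fixed bonus (+4 at depth 1, +7 at depth 2) protecting the incumbent. A minimal sketch of the comparison, with highbit32 as a portable stand-in for ZSTD_highbit32 and lazy_prefers_candidate a hypothetical name:

#include <stddef.h>

/* index of the highest set bit; v must be non-zero */
static int highbit32(unsigned v) { int n = 0; while (v >>= 1) ++n; return n; }

/* Mirrors `gain2 > gain1`: take the new (ml2, ofb2) candidate only if it beats
 * the incumbent (ml1, ofb1) after paying its offset cost plus the bonus. */
static int lazy_prefers_candidate(size_t ml2, unsigned ofb2,
                                  size_t ml1, unsigned ofb1, int bonus)
{
    int const gain2 = (int)(ml2 * 4) - highbit32(ofb2);
    int const gain1 = (int)(ml1 * 4) - highbit32(ofb1) + bonus;
    return (ml2 >= 4) && (gain2 > gain1);
}

For instance, an incumbent of length 4 at offBase 2048 scores 16 - 11 + 4 = 9 at depth 1, so a length-5 candidate at offBase 64 (20 - 6 = 14) displaces it, while an equally long candidate at a much larger offset would not.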
 
  /* NOTE:
- * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior.
- * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which
- * overflows the pointer, which is undefined behavior.
+ * Pay attention that `start[-value]` can lead to strange undefined behavior
+ * notably if `value` is unsigned, resulting in a large positive `-value`.
  */
  /* catch up */
- if (offset) {
+ if (OFFBASE_IS_OFFSET(offBase)) {
  if (dictMode == ZSTD_noDict) {
- while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest))
- && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
+ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
+ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */
  { start--; matchLength++; }
  }
  if (isDxS) {
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
  const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
  const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
  }
- offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
  }
  /* store sequence */
 _storeSequence:
- { size_t const litLength = start - anchor;
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
+ { size_t const litLength = (size_t)(start - anchor);
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
  anchor = ip = start + matchLength;
  }
+ if (ms->lazySkipping) {
+ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+ if (searchMethod == search_rowHash) {
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+ }
+ ms->lazySkipping = 0;
+ }
 
  /* check immediate repcode */
  if (isDxS) {
@@ -1089,8 +1744,8 @@ _storeSequence:
  && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
  const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
- offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  ip += matchLength;
  anchor = ip;
  continue;
@@ -1104,62 +1759,77 @@ _storeSequence:
  && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
  /* store sequence */
  matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
- offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  ip += matchLength;
  anchor = ip;
  continue; /* faster when present ... (?) */
  } } }
 
- /* Save reps for next block */
- rep[0] = offset_1 ? offset_1 : savedOffset;
- rep[1] = offset_2 ? offset_2 : savedOffset;
+ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+ /* save reps for next block */
+ rep[0] = offset_1 ? offset_1 : offsetSaved1;
+ rep[1] = offset_2 ? offset_2 : offsetSaved2;
 
  /* Return the last literals size */
  return (size_t)(iend - anchor);
  }
+ #endif /* build exclusions */
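
A note on the offBase convention this rewrite switches to (replacing the old offset/ZSTD_REP_MOVE arithmetic): sequences now carry a single "offBase" code in which values 1..3 name the three repcodes and real offsets are stored shifted up by ZSTD_REP_NUM. A simplified sketch of what the OFFBASE_* helpers appear to encode (the real macros live in zstd_compress_internal.h and carry asserts; the names below are shortened stand-ins):

#define REP_NUM_ 3                            /* repcode slots */
#define OFF_TO_OFFBASE_(o)  ((o) + REP_NUM_)  /* offset o >= 1 -> 4, 5, ... */
#define REP_TO_OFFBASE_(r)  (r)               /* repcode 1..3  -> 1, 2, 3   */
#define OFFBASE_IS_OFF_(ob) ((ob) > REP_NUM_)
#define OFFBASE_TO_OFF_(ob) ((ob) - REP_NUM_)

Under this reading, REPCODE1_TO_OFFBASE is simply 1, and the catch-up code's OFFBASE_TO_OFFSET(offBase) recovers the raw distance that OFFSET_TO_OFFBASE stored in the searchers above.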
 
 
- size_t ZSTD_compressBlock_btlazy2(
+ #ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
+ size_t ZSTD_compressBlock_greedy(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
  {
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
  }
 
- size_t ZSTD_compressBlock_lazy2(
+ size_t ZSTD_compressBlock_greedy_dictMatchState(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
  {
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
  }
 
- size_t ZSTD_compressBlock_lazy(
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
  {
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
  }
 
- size_t ZSTD_compressBlock_greedy(
+ size_t ZSTD_compressBlock_greedy_row(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
  {
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
  }
 
- size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+ size_t ZSTD_compressBlock_greedy_dictMatchState_row(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
  {
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
  }
 
- size_t ZSTD_compressBlock_lazy2_dictMatchState(
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
  {
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
+ }
+ #endif
+
+ #ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
+ size_t ZSTD_compressBlock_lazy(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
  }
 
  size_t ZSTD_compressBlock_lazy_dictMatchState(
@@ -1169,13 +1839,49 @@ size_t ZSTD_compressBlock_lazy_dictMatchState(
  return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
  }
 
- size_t ZSTD_compressBlock_greedy_dictMatchState(
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
  {
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
+ }
+
+ size_t ZSTD_compressBlock_lazy_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
+ }
+
+ size_t ZSTD_compressBlock_lazy_dictMatchState_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
+ }
+
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
+ }
+ #endif
+
+ #ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
+ size_t ZSTD_compressBlock_lazy2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
  }
 
+ size_t ZSTD_compressBlock_lazy2_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
+ }
 
  size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@@ -1184,22 +1890,50 @@ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
  return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
  }
 
- size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
+ size_t ZSTD_compressBlock_lazy2_row(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
  {
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
  }
 
- size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
+ size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
  {
- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
+ }
+
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
  }
+ #endif
 
+ #ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
+ size_t ZSTD_compressBlock_btlazy2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
+ }
+
+ size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
+ }
+ #endif
 
+ #if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
  FORCE_INLINE_TEMPLATE
+ ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  size_t ZSTD_compressBlock_lazy_extDict_generic(
  ZSTD_matchState_t* ms, seqStore_t* seqStore,
  U32 rep[ZSTD_REP_NUM],
@@ -1210,7 +1944,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  const BYTE* ip = istart;
  const BYTE* anchor = istart;
  const BYTE* const iend = istart + srcSize;
- const BYTE* const ilimit = iend - 8;
+ const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
  const BYTE* const base = ms->window.base;
  const U32 dictLimit = ms->window.dictLimit;
  const BYTE* const prefixStart = base + dictLimit;
@@ -1218,18 +1952,21 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  const BYTE* const dictEnd = dictBase + dictLimit;
  const BYTE* const dictStart = dictBase + ms->window.lowLimit;
  const U32 windowLog = ms->cParams.windowLog;
-
- typedef size_t (*searchMax_f)(
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
- searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
+ const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
 
  U32 offset_1 = rep[0], offset_2 = rep[1];
 
- DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic");
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
+
+ /* Reset the lazy skipping state */
+ ms->lazySkipping = 0;
 
  /* init */
  ip += (ip == prefixStart);
+ if (searchMethod == search_rowHash) {
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+ }
 
  /* Match Loop */
  #if defined(__GNUC__) && defined(__x86_64__)
@@ -1240,7 +1977,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  #endif
  while (ip < ilimit) {
  size_t matchLength=0;
- size_t offset=0;
+ size_t offBase = REPCODE1_TO_OFFBASE;
  const BYTE* start=ip+1;
  U32 curr = (U32)(ip-base);
 
@@ -1249,7 +1986,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  const U32 repIndex = (U32)(curr+1 - offset_1);
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
  const BYTE* const repMatch = repBase + repIndex;
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
+ & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
  if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
  /* repcode detected we should take it */
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1258,14 +1996,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  } }
 
  /* first search (depth 0) */
- { size_t offsetFound = 999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
+ { size_t ofbCandidate = 999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
  if (ml2 > matchLength)
- matchLength = ml2, start = ip, offset=offsetFound;
+ matchLength = ml2, start = ip, offBase = ofbCandidate;
  }
 
- if (matchLength < 4) {
- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
+ if (matchLength < 4) {
+ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
+ ip += step + 1; /* jump faster over incompressible sections */
+ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+ * In this mode we stop inserting every position into our tables, and only insert
+ * positions that we search, which is one in step positions.
+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+ * triggered once we've gone 2KB without finding any matches.
+ */
+ ms->lazySkipping = step > kLazySkippingStep;
  continue;
  }
 
@@ -1275,29 +2022,30 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  ip ++;
  curr++;
  /* check repCode */
- if (offset) {
+ if (offBase) {
  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
  const U32 repIndex = (U32)(curr - offset_1);
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
  const BYTE* const repMatch = repBase + repIndex;
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
  /* repcode detected */
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  int const gain2 = (int)(repLength * 3);
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
  if ((repLength >= 4) && (gain2 > gain1))
- matchLength = repLength, offset = 0, start = ip;
+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
  } }
 
  /* search match, depth 1 */
- { size_t offset2=999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+ { size_t ofbCandidate = 999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
  if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offset = offset2, start = ip;
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
  continue; /* search a better one */
  } }
 
@@ -1306,49 +2054,57 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  ip ++;
  curr++;
  /* check repCode */
- if (offset) {
+ if (offBase) {
  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
  const U32 repIndex = (U32)(curr - offset_1);
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
  const BYTE* const repMatch = repBase + repIndex;
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
  /* repcode detected */
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  int const gain2 = (int)(repLength * 4);
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  if ((repLength >= 4) && (gain2 > gain1))
- matchLength = repLength, offset = 0, start = ip;
+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
  } }
 
  /* search match, depth 2 */
- { size_t offset2=999999999;
- size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+ { size_t ofbCandidate = 999999999;
+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
  if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offset = offset2, start = ip;
+ matchLength = ml2, offBase = ofbCandidate, start = ip;
  continue;
  } } }
  break; /* nothing found : store previous solution */
  }
 
  /* catch up */
- if (offset) {
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+ if (OFFBASE_IS_OFFSET(offBase)) {
+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
  const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
  const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
- offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
  }
 
  /* store sequence */
 _storeSequence:
- { size_t const litLength = start - anchor;
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
+ { size_t const litLength = (size_t)(start - anchor);
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
  anchor = ip = start + matchLength;
  }
+ if (ms->lazySkipping) {
+ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+ if (searchMethod == search_rowHash) {
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+ }
+ ms->lazySkipping = 0;
+ }
 
  /* check immediate repcode */
  while (ip <= ilimit) {
@@ -1357,13 +2113,14 @@ _storeSequence:
  const U32 repIndex = repCurrent - offset_2;
  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
  const BYTE* const repMatch = repBase + repIndex;
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+ & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
  if (MEM_read32(ip) == MEM_read32(repMatch)) {
  /* repcode detected we should take it */
  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
- offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  ip += matchLength;
  anchor = ip;
  continue; /* faster when present ... (?) */
@@ -1378,8 +2135,9 @@ _storeSequence:
  /* Return the last literals size */
  return (size_t)(iend - anchor);
  }
+ #endif /* build exclusions */
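
The repcode guards rewritten throughout this function swap `repIndex > windowLow` for the wrap-safe `offset_1 <= curr - windowLow`. Given curr >= windowLow and offset_1 >= 1, that single unsigned comparison matches requiring windowLow <= repIndex < curr even when repIndex = curr - offset_1 wraps around. A small verification sketch (not library code):

#include <assert.h>

static void repcode_guard_demo(unsigned curr, unsigned windowLow, unsigned offset_1)
{
    assert(curr >= windowLow && offset_1 >= 1);
    {   unsigned const repIndex = curr - offset_1;  /* wraps when offset_1 > curr */
        int const longForm  = (offset_1 <= curr) && (repIndex >= windowLow) && (repIndex < curr);
        int const guardForm = (offset_1 <= curr - windowLow);
        assert(longForm == guardForm);
    }
}

For example, repcode_guard_demo(1000, 100, 5) passes with both forms true, and repcode_guard_demo(1000, 100, 1500) passes with both forms false despite repIndex wrapping.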
 
-
+ #ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
  size_t ZSTD_compressBlock_greedy_extDict(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
@@ -1387,6 +2145,15 @@ size_t ZSTD_compressBlock_greedy_extDict(
  return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
  }
 
+ size_t ZSTD_compressBlock_greedy_extDict_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
+ }
+ #endif
+
+ #ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
  size_t ZSTD_compressBlock_lazy_extDict(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
@@ -1395,6 +2162,16 @@ size_t ZSTD_compressBlock_lazy_extDict(
  return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
  }
 
+ size_t ZSTD_compressBlock_lazy_extDict_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
+ }
+ #endif
+
+ #ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
  size_t ZSTD_compressBlock_lazy2_extDict(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
@@ -1403,6 +2180,15 @@ size_t ZSTD_compressBlock_lazy2_extDict(
  return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
  }
 
+ size_t ZSTD_compressBlock_lazy2_extDict_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
+ }
+ #endif
+
+ #ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
  size_t ZSTD_compressBlock_btlazy2_extDict(
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  void const* src, size_t srcSize)
@@ -1410,3 +2196,4 @@ size_t ZSTD_compressBlock_btlazy2_extDict(
  {
  return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
  }
+ #endif
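
Finally, on the new ZSTD_EXCLUDE_*_BLOCK_COMPRESSOR guards bracketing everything above: they let size-sensitive builds compile out whole strategy families. An illustrative build configuration (an assumption about intended usage, not a switch documented by this gem):

/* Define the exclusion macros before the library is compiled, e.g. on the
 * compiler command line:
 *   cc -DZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR \
 *      -DZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR \
 *      -DZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR \
 *      -c zstd_lazy.c
 * The guarded ZSTD_compressBlock_* entry points are then omitted, shrinking
 * the object file; compression levels that map to the excluded strategies
 * stop being available in such a build. */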