zstdlib 0.3.0-x64-mingw32 → 0.8.0-x64-mingw32

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGES.md +30 -1
  3. data/README.md +2 -2
  4. data/Rakefile +1 -1
  5. data/ext/zstdlib/extconf.rb +3 -3
  6. data/ext/zstdlib/ruby/zlib-2.7/zstdlib.c +4895 -0
  7. data/ext/zstdlib/ruby/zlib-3.0/zstdlib.c +4994 -0
  8. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/bitstream.h +59 -51
  9. data/ext/zstdlib/zstd-1.5.0/lib/common/compiler.h +289 -0
  10. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/cpu.h +1 -3
  11. data/ext/zstdlib/zstd-1.5.0/lib/common/debug.c +24 -0
  12. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/debug.h +22 -49
  13. data/ext/zstdlib/zstd-1.5.0/lib/common/entropy_common.c +362 -0
  14. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/error_private.c +3 -1
  15. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/error_private.h +8 -4
  16. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/fse.h +50 -42
  17. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/fse_decompress.c +149 -55
  18. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/huf.h +43 -39
  19. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/mem.h +69 -25
  20. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/pool.c +30 -20
  21. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/pool.h +3 -3
  22. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/threading.c +51 -4
  23. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/threading.h +36 -4
  24. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/xxhash.c +40 -92
  25. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/xxhash.h +12 -32
  26. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/common/zstd_common.c +10 -10
  27. data/ext/zstdlib/zstd-1.5.0/lib/common/zstd_deps.h +111 -0
  28. data/ext/zstdlib/zstd-1.5.0/lib/common/zstd_internal.h +490 -0
  29. data/ext/zstdlib/zstd-1.5.0/lib/common/zstd_trace.h +154 -0
  30. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/fse_compress.c +47 -63
  31. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/hist.c +41 -63
  32. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/hist.h +13 -33
  33. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/huf_compress.c +332 -193
  34. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_compress.c +6393 -0
  35. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_internal.h +522 -86
  36. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_literals.c +25 -16
  37. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_literals.h +2 -2
  38. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_sequences.c +50 -24
  39. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_compress_sequences.h +11 -4
  40. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_compress_superblock.c +572 -0
  41. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_compress_superblock.h +32 -0
  42. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_cwksp.h +662 -0
  43. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_double_fast.c +43 -41
  44. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_double_fast.h +2 -2
  45. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_fast.c +85 -80
  46. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_fast.h +2 -2
  47. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_lazy.c +2184 -0
  48. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_lazy.h +125 -0
  49. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_ldm.c +333 -208
  50. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_ldm.h +15 -3
  51. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstd_ldm_geartab.h +103 -0
  52. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_opt.c +228 -129
  53. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstd_opt.h +1 -1
  54. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/compress/zstdmt_compress.c +151 -440
  55. data/ext/zstdlib/zstd-1.5.0/lib/compress/zstdmt_compress.h +110 -0
  56. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/huf_decompress.c +395 -276
  57. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_ddict.c +20 -16
  58. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_ddict.h +3 -3
  59. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_decompress.c +628 -231
  60. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_decompress_block.c +606 -380
  61. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_decompress_block.h +8 -5
  62. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/decompress/zstd_decompress_internal.h +39 -9
  63. data/ext/zstdlib/zstd-1.5.0/lib/zdict.h +452 -0
  64. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/lib/zstd.h +740 -153
  65. data/ext/zstdlib/{zstd-1.4.2/lib/common → zstd-1.5.0/lib}/zstd_errors.h +3 -1
  66. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzclose.c +1 -1
  67. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzcompatibility.h +1 -1
  68. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzguts.h +0 -0
  69. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzlib.c +9 -9
  70. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzread.c +16 -8
  71. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/gzwrite.c +8 -8
  72. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/zstd_zlibwrapper.c +131 -45
  73. data/ext/zstdlib/{zstd-1.4.2 → zstd-1.5.0}/zlibWrapper/zstd_zlibwrapper.h +1 -1
  74. data/lib/2.2/zstdlib.so +0 -0
  75. data/lib/2.3/zstdlib.so +0 -0
  76. data/lib/2.4/zstdlib.so +0 -0
  77. data/lib/2.5/zstdlib.so +0 -0
  78. data/lib/2.6/zstdlib.so +0 -0
  79. data/lib/2.7/zstdlib.so +0 -0
  80. metadata +76 -67
  81. data/ext/zstdlib/zstd-1.4.2/lib/common/compiler.h +0 -147
  82. data/ext/zstdlib/zstd-1.4.2/lib/common/debug.c +0 -44
  83. data/ext/zstdlib/zstd-1.4.2/lib/common/entropy_common.c +0 -236
  84. data/ext/zstdlib/zstd-1.4.2/lib/common/zstd_internal.h +0 -371
  85. data/ext/zstdlib/zstd-1.4.2/lib/compress/zstd_compress.c +0 -3904
  86. data/ext/zstdlib/zstd-1.4.2/lib/compress/zstd_lazy.c +0 -1111
  87. data/ext/zstdlib/zstd-1.4.2/lib/compress/zstd_lazy.h +0 -67
  88. data/ext/zstdlib/zstd-1.4.2/lib/compress/zstdmt_compress.h +0 -192
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -15,7 +15,7 @@
15
15
  extern "C" {
16
16
  #endif
17
17
 
18
- #include "mem.h" /* U32 */
18
+ #include "../common/mem.h" /* U32 */
19
19
  #include "zstd_compress_internal.h"
20
20
 
21
21
  void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
@@ -0,0 +1,2184 @@
1
+ /*
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
11
+ #include "zstd_compress_internal.h"
12
+ #include "zstd_lazy.h"
13
+
14
+
15
+ /*-*************************************
16
+ * Binary Tree search
17
+ ***************************************/
18
+
19
+ static void
20
+ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
21
+ const BYTE* ip, const BYTE* iend,
22
+ U32 mls)
23
+ {
24
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
25
+ U32* const hashTable = ms->hashTable;
26
+ U32 const hashLog = cParams->hashLog;
27
+
28
+ U32* const bt = ms->chainTable;
29
+ U32 const btLog = cParams->chainLog - 1;
30
+ U32 const btMask = (1 << btLog) - 1;
31
+
32
+ const BYTE* const base = ms->window.base;
33
+ U32 const target = (U32)(ip - base);
34
+ U32 idx = ms->nextToUpdate;
35
+
36
+ if (idx != target)
37
+ DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
38
+ idx, target, ms->window.dictLimit);
39
+ assert(ip + 8 <= iend); /* condition for ZSTD_hashPtr */
40
+ (void)iend;
41
+
42
+ assert(idx >= ms->window.dictLimit); /* condition for valid base+idx */
43
+ for ( ; idx < target ; idx++) {
44
+ size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); /* assumption : ip + 8 <= iend */
45
+ U32 const matchIndex = hashTable[h];
46
+
47
+ U32* const nextCandidatePtr = bt + 2*(idx&btMask);
48
+ U32* const sortMarkPtr = nextCandidatePtr + 1;
49
+
50
+ DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
51
+ hashTable[h] = idx; /* Update Hash Table */
52
+ *nextCandidatePtr = matchIndex; /* update BT like a chain */
53
+ *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
54
+ }
55
+ ms->nextToUpdate = target;
56
+ }
57
+
58
+
59
+ /** ZSTD_insertDUBT1() :
60
+ * sort one already inserted but unsorted position
61
+ * assumption : curr >= btlow == (curr - btmask)
62
+ * doesn't fail */
63
+ static void
64
+ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
65
+ U32 curr, const BYTE* inputEnd,
66
+ U32 nbCompares, U32 btLow,
67
+ const ZSTD_dictMode_e dictMode)
68
+ {
69
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
70
+ U32* const bt = ms->chainTable;
71
+ U32 const btLog = cParams->chainLog - 1;
72
+ U32 const btMask = (1 << btLog) - 1;
73
+ size_t commonLengthSmaller=0, commonLengthLarger=0;
74
+ const BYTE* const base = ms->window.base;
75
+ const BYTE* const dictBase = ms->window.dictBase;
76
+ const U32 dictLimit = ms->window.dictLimit;
77
+ const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
78
+ const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
79
+ const BYTE* const dictEnd = dictBase + dictLimit;
80
+ const BYTE* const prefixStart = base + dictLimit;
81
+ const BYTE* match;
82
+ U32* smallerPtr = bt + 2*(curr&btMask);
83
+ U32* largerPtr = smallerPtr + 1;
84
+ U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
85
+ U32 dummy32; /* to be nullified at the end */
86
+ U32 const windowValid = ms->window.lowLimit;
87
+ U32 const maxDistance = 1U << cParams->windowLog;
88
+ U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;
89
+
90
+
91
+ DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
92
+ curr, dictLimit, windowLow);
93
+ assert(curr >= btLow);
94
+ assert(ip < iend); /* condition for ZSTD_count */
95
+
96
+ while (nbCompares-- && (matchIndex > windowLow)) {
97
+ U32* const nextPtr = bt + 2*(matchIndex & btMask);
98
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
99
+ assert(matchIndex < curr);
100
+ /* note : all candidates are now supposed sorted,
101
+ * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
102
+ * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
103
+
104
+ if ( (dictMode != ZSTD_extDict)
105
+ || (matchIndex+matchLength >= dictLimit) /* both in current segment*/
106
+ || (curr < dictLimit) /* both in extDict */) {
107
+ const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
108
+ || (matchIndex+matchLength >= dictLimit)) ?
109
+ base : dictBase;
110
+ assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */
111
+ || (curr < dictLimit) );
112
+ match = mBase + matchIndex;
113
+ matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
114
+ } else {
115
+ match = dictBase + matchIndex;
116
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
117
+ if (matchIndex+matchLength >= dictLimit)
118
+ match = base + matchIndex; /* preparation for next read of match[matchLength] */
119
+ }
120
+
121
+ DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
122
+ curr, matchIndex, (U32)matchLength);
123
+
124
+ if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
125
+ break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
126
+ }
127
+
128
+ if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */
129
+ /* match is smaller than current */
130
+ *smallerPtr = matchIndex; /* update smaller idx */
131
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
132
+ if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */
133
+ DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
134
+ matchIndex, btLow, nextPtr[1]);
135
+ smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */
136
+ matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */
137
+ } else {
138
+ /* match is larger than current */
139
+ *largerPtr = matchIndex;
140
+ commonLengthLarger = matchLength;
141
+ if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */
142
+ DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
143
+ matchIndex, btLow, nextPtr[0]);
144
+ largerPtr = nextPtr;
145
+ matchIndex = nextPtr[0];
146
+ } }
147
+
148
+ *smallerPtr = *largerPtr = 0;
149
+ }
150
+
151
+
152
+ static size_t
153
+ ZSTD_DUBT_findBetterDictMatch (
154
+ ZSTD_matchState_t* ms,
155
+ const BYTE* const ip, const BYTE* const iend,
156
+ size_t* offsetPtr,
157
+ size_t bestLength,
158
+ U32 nbCompares,
159
+ U32 const mls,
160
+ const ZSTD_dictMode_e dictMode)
161
+ {
162
+ const ZSTD_matchState_t * const dms = ms->dictMatchState;
163
+ const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
164
+ const U32 * const dictHashTable = dms->hashTable;
165
+ U32 const hashLog = dmsCParams->hashLog;
166
+ size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
167
+ U32 dictMatchIndex = dictHashTable[h];
168
+
169
+ const BYTE* const base = ms->window.base;
170
+ const BYTE* const prefixStart = base + ms->window.dictLimit;
171
+ U32 const curr = (U32)(ip-base);
172
+ const BYTE* const dictBase = dms->window.base;
173
+ const BYTE* const dictEnd = dms->window.nextSrc;
174
+ U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
175
+ U32 const dictLowLimit = dms->window.lowLimit;
176
+ U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit;
177
+
178
+ U32* const dictBt = dms->chainTable;
179
+ U32 const btLog = dmsCParams->chainLog - 1;
180
+ U32 const btMask = (1 << btLog) - 1;
181
+ U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;
182
+
183
+ size_t commonLengthSmaller=0, commonLengthLarger=0;
184
+
185
+ (void)dictMode;
186
+ assert(dictMode == ZSTD_dictMatchState);
187
+
188
+ while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
189
+ U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
190
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
191
+ const BYTE* match = dictBase + dictMatchIndex;
192
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
193
+ if (dictMatchIndex+matchLength >= dictHighLimit)
194
+ match = base + dictMatchIndex + dictIndexDelta; /* to prepare for next usage of match[matchLength] */
195
+
196
+ if (matchLength > bestLength) {
197
+ U32 matchIndex = dictMatchIndex + dictIndexDelta;
198
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
199
+ DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
200
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
201
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
202
+ }
203
+ if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
204
+ break; /* drop, to guarantee consistency (miss a little bit of compression) */
205
+ }
206
+ }
207
+
208
+ if (match[matchLength] < ip[matchLength]) {
209
+ if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */
210
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
211
+ dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
212
+ } else {
213
+ /* match is larger than current */
214
+ if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */
215
+ commonLengthLarger = matchLength;
216
+ dictMatchIndex = nextPtr[0];
217
+ }
218
+ }
219
+
220
+ if (bestLength >= MINMATCH) {
221
+ U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
222
+ DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
223
+ curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
224
+ }
225
+ return bestLength;
226
+
227
+ }
228
+
229
+
230
+ static size_t
231
+ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
232
+ const BYTE* const ip, const BYTE* const iend,
233
+ size_t* offsetPtr,
234
+ U32 const mls,
235
+ const ZSTD_dictMode_e dictMode)
236
+ {
237
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
238
+ U32* const hashTable = ms->hashTable;
239
+ U32 const hashLog = cParams->hashLog;
240
+ size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
241
+ U32 matchIndex = hashTable[h];
242
+
243
+ const BYTE* const base = ms->window.base;
244
+ U32 const curr = (U32)(ip-base);
245
+ U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);
246
+
247
+ U32* const bt = ms->chainTable;
248
+ U32 const btLog = cParams->chainLog - 1;
249
+ U32 const btMask = (1 << btLog) - 1;
250
+ U32 const btLow = (btMask >= curr) ? 0 : curr - btMask;
251
+ U32 const unsortLimit = MAX(btLow, windowLow);
252
+
253
+ U32* nextCandidate = bt + 2*(matchIndex&btMask);
254
+ U32* unsortedMark = bt + 2*(matchIndex&btMask) + 1;
255
+ U32 nbCompares = 1U << cParams->searchLog;
256
+ U32 nbCandidates = nbCompares;
257
+ U32 previousCandidate = 0;
258
+
259
+ DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
260
+ assert(ip <= iend-8); /* required for h calculation */
261
+ assert(dictMode != ZSTD_dedicatedDictSearch);
262
+
263
+ /* reach end of unsorted candidates list */
264
+ while ( (matchIndex > unsortLimit)
265
+ && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
266
+ && (nbCandidates > 1) ) {
267
+ DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
268
+ matchIndex);
269
+ *unsortedMark = previousCandidate; /* the unsortedMark becomes a reversed chain, to move up back to original position */
270
+ previousCandidate = matchIndex;
271
+ matchIndex = *nextCandidate;
272
+ nextCandidate = bt + 2*(matchIndex&btMask);
273
+ unsortedMark = bt + 2*(matchIndex&btMask) + 1;
274
+ nbCandidates --;
275
+ }
276
+
277
+ /* nullify last candidate if it's still unsorted
278
+ * simplification, detrimental to compression ratio, beneficial for speed */
279
+ if ( (matchIndex > unsortLimit)
280
+ && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
281
+ DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
282
+ matchIndex);
283
+ *nextCandidate = *unsortedMark = 0;
284
+ }
285
+
286
+ /* batch sort stacked candidates */
287
+ matchIndex = previousCandidate;
288
+ while (matchIndex) { /* will end on matchIndex == 0 */
289
+ U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
290
+ U32 const nextCandidateIdx = *nextCandidateIdxPtr;
291
+ ZSTD_insertDUBT1(ms, matchIndex, iend,
292
+ nbCandidates, unsortLimit, dictMode);
293
+ matchIndex = nextCandidateIdx;
294
+ nbCandidates++;
295
+ }
296
+
297
+ /* find longest match */
298
+ { size_t commonLengthSmaller = 0, commonLengthLarger = 0;
299
+ const BYTE* const dictBase = ms->window.dictBase;
300
+ const U32 dictLimit = ms->window.dictLimit;
301
+ const BYTE* const dictEnd = dictBase + dictLimit;
302
+ const BYTE* const prefixStart = base + dictLimit;
303
+ U32* smallerPtr = bt + 2*(curr&btMask);
304
+ U32* largerPtr = bt + 2*(curr&btMask) + 1;
305
+ U32 matchEndIdx = curr + 8 + 1;
306
+ U32 dummy32; /* to be nullified at the end */
307
+ size_t bestLength = 0;
308
+
309
+ matchIndex = hashTable[h];
310
+ hashTable[h] = curr; /* Update Hash Table */
311
+
312
+ while (nbCompares-- && (matchIndex > windowLow)) {
313
+ U32* const nextPtr = bt + 2*(matchIndex & btMask);
314
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
315
+ const BYTE* match;
316
+
317
+ if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
318
+ match = base + matchIndex;
319
+ matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
320
+ } else {
321
+ match = dictBase + matchIndex;
322
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
323
+ if (matchIndex+matchLength >= dictLimit)
324
+ match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
325
+ }
326
+
327
+ if (matchLength > bestLength) {
328
+ if (matchLength > matchEndIdx - matchIndex)
329
+ matchEndIdx = matchIndex + (U32)matchLength;
330
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
331
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
332
+ if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
333
+ if (dictMode == ZSTD_dictMatchState) {
334
+ nbCompares = 0; /* in addition to avoiding checking any
335
+ * further in this loop, make sure we
336
+ * skip checking in the dictionary. */
337
+ }
338
+ break; /* drop, to guarantee consistency (miss a little bit of compression) */
339
+ }
340
+ }
341
+
342
+ if (match[matchLength] < ip[matchLength]) {
343
+ /* match is smaller than current */
344
+ *smallerPtr = matchIndex; /* update smaller idx */
345
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
346
+ if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */
347
+ smallerPtr = nextPtr+1; /* new "smaller" => larger of match */
348
+ matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
349
+ } else {
350
+ /* match is larger than current */
351
+ *largerPtr = matchIndex;
352
+ commonLengthLarger = matchLength;
353
+ if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */
354
+ largerPtr = nextPtr;
355
+ matchIndex = nextPtr[0];
356
+ } }
357
+
358
+ *smallerPtr = *largerPtr = 0;
359
+
360
+ if (dictMode == ZSTD_dictMatchState && nbCompares) {
361
+ bestLength = ZSTD_DUBT_findBetterDictMatch(
362
+ ms, ip, iend,
363
+ offsetPtr, bestLength, nbCompares,
364
+ mls, dictMode);
365
+ }
366
+
367
+ assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
368
+ ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
369
+ if (bestLength >= MINMATCH) {
370
+ U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
371
+ DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
372
+ curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
373
+ }
374
+ return bestLength;
375
+ }
376
+ }
377
+
378
+
379
+ /** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
380
+ FORCE_INLINE_TEMPLATE size_t
381
+ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
382
+ const BYTE* const ip, const BYTE* const iLimit,
383
+ size_t* offsetPtr,
384
+ const U32 mls /* template */,
385
+ const ZSTD_dictMode_e dictMode)
386
+ {
387
+ DEBUGLOG(7, "ZSTD_BtFindBestMatch");
388
+ if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
389
+ ZSTD_updateDUBT(ms, ip, iLimit, mls);
390
+ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
391
+ }
392
+
393
+
394
+ static size_t
395
+ ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms,
396
+ const BYTE* ip, const BYTE* const iLimit,
397
+ size_t* offsetPtr)
398
+ {
399
+ switch(ms->cParams.minMatch)
400
+ {
401
+ default : /* includes case 3 */
402
+ case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
403
+ case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
404
+ case 7 :
405
+ case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
406
+ }
407
+ }
408
+
409
+
410
+ static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
411
+ ZSTD_matchState_t* ms,
412
+ const BYTE* ip, const BYTE* const iLimit,
413
+ size_t* offsetPtr)
414
+ {
415
+ switch(ms->cParams.minMatch)
416
+ {
417
+ default : /* includes case 3 */
418
+ case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
419
+ case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
420
+ case 7 :
421
+ case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
422
+ }
423
+ }
424
+
425
+
426
+ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
427
+ ZSTD_matchState_t* ms,
428
+ const BYTE* ip, const BYTE* const iLimit,
429
+ size_t* offsetPtr)
430
+ {
431
+ switch(ms->cParams.minMatch)
432
+ {
433
+ default : /* includes case 3 */
434
+ case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
435
+ case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
436
+ case 7 :
437
+ case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
438
+ }
439
+ }
440
+
441
+ /***********************************
442
+ * Dedicated dict search
443
+ ***********************************/
444
+
445
+ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
446
+ {
447
+ const BYTE* const base = ms->window.base;
448
+ U32 const target = (U32)(ip - base);
449
+ U32* const hashTable = ms->hashTable;
450
+ U32* const chainTable = ms->chainTable;
451
+ U32 const chainSize = 1 << ms->cParams.chainLog;
452
+ U32 idx = ms->nextToUpdate;
453
+ U32 const minChain = chainSize < target ? target - chainSize : idx;
454
+ U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
455
+ U32 const cacheSize = bucketSize - 1;
456
+ U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
457
+ U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
458
+
459
+ /* We know the hashtable is oversized by a factor of `bucketSize`.
460
+ * We are going to temporarily pretend `bucketSize == 1`, keeping only a
461
+ * single entry. We will use the rest of the space to construct a temporary
462
+ * chaintable.
463
+ */
464
+ U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
465
+ U32* const tmpHashTable = hashTable;
466
+ U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
467
+ U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
468
+ U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
469
+ U32 hashIdx;
470
+
471
+ assert(ms->cParams.chainLog <= 24);
472
+ assert(ms->cParams.hashLog > ms->cParams.chainLog);
473
+ assert(idx != 0);
474
+ assert(tmpMinChain <= minChain);
475
+
476
+ /* fill conventional hash table and conventional chain table */
477
+ for ( ; idx < target; idx++) {
478
+ U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
479
+ if (idx >= tmpMinChain) {
480
+ tmpChainTable[idx - tmpMinChain] = hashTable[h];
481
+ }
482
+ tmpHashTable[h] = idx;
483
+ }
484
+
485
+ /* sort chains into ddss chain table */
486
+ {
487
+ U32 chainPos = 0;
488
+ for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
489
+ U32 count;
490
+ U32 countBeyondMinChain = 0;
491
+ U32 i = tmpHashTable[hashIdx];
492
+ for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
493
+ /* skip through the chain to the first position that won't be
494
+ * in the hash cache bucket */
495
+ if (i < minChain) {
496
+ countBeyondMinChain++;
497
+ }
498
+ i = tmpChainTable[i - tmpMinChain];
499
+ }
500
+ if (count == cacheSize) {
501
+ for (count = 0; count < chainLimit;) {
502
+ if (i < minChain) {
503
+ if (!i || ++countBeyondMinChain > cacheSize) {
504
+ /* only allow pulling `cacheSize` number of entries
505
+ * into the cache or chainTable beyond `minChain`,
506
+ * to replace the entries pulled out of the
507
+ * chainTable into the cache. This lets us reach
508
+ * back further without increasing the total number
509
+ * of entries in the chainTable, guaranteeing the
510
+ * DDSS chain table will fit into the space
511
+ * allocated for the regular one. */
512
+ break;
513
+ }
514
+ }
515
+ chainTable[chainPos++] = i;
516
+ count++;
517
+ if (i < tmpMinChain) {
518
+ break;
519
+ }
520
+ i = tmpChainTable[i - tmpMinChain];
521
+ }
522
+ } else {
523
+ count = 0;
524
+ }
525
+ if (count) {
526
+ tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
527
+ } else {
528
+ tmpHashTable[hashIdx] = 0;
529
+ }
530
+ }
531
+ assert(chainPos <= chainSize); /* I believe this is guaranteed... */
532
+ }
533
+
534
+ /* move chain pointers into the last entry of each hash bucket */
535
+ for (hashIdx = (1 << hashLog); hashIdx; ) {
536
+ U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
537
+ U32 const chainPackedPointer = tmpHashTable[hashIdx];
538
+ U32 i;
539
+ for (i = 0; i < cacheSize; i++) {
540
+ hashTable[bucketIdx + i] = 0;
541
+ }
542
+ hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
543
+ }
544
+
545
+ /* fill the buckets of the hash table */
546
+ for (idx = ms->nextToUpdate; idx < target; idx++) {
547
+ U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
548
+ << ZSTD_LAZY_DDSS_BUCKET_LOG;
549
+ U32 i;
550
+ /* Shift hash cache down 1. */
551
+ for (i = cacheSize - 1; i; i--)
552
+ hashTable[h + i] = hashTable[h + i - 1];
553
+ hashTable[h] = idx;
554
+ }
555
+
556
+ ms->nextToUpdate = target;
557
+ }
558
+
559
+ /* Returns the longest match length found in the dedicated dict search structure.
560
+ * If none are longer than the argument ml, then ml will be returned.
561
+ */
562
+ FORCE_INLINE_TEMPLATE
563
+ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
564
+ const ZSTD_matchState_t* const dms,
565
+ const BYTE* const ip, const BYTE* const iLimit,
566
+ const BYTE* const prefixStart, const U32 curr,
567
+ const U32 dictLimit, const size_t ddsIdx) {
568
+ const U32 ddsLowestIndex = dms->window.dictLimit;
569
+ const BYTE* const ddsBase = dms->window.base;
570
+ const BYTE* const ddsEnd = dms->window.nextSrc;
571
+ const U32 ddsSize = (U32)(ddsEnd - ddsBase);
572
+ const U32 ddsIndexDelta = dictLimit - ddsSize;
573
+ const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
574
+ const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
575
+ U32 ddsAttempt;
576
+ U32 matchIndex;
577
+
578
+ for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
579
+ PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
580
+ }
581
+
582
+ {
583
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
584
+ U32 const chainIndex = chainPackedPointer >> 8;
585
+
586
+ PREFETCH_L1(&dms->chainTable[chainIndex]);
587
+ }
588
+
589
+ for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
590
+ size_t currentMl=0;
591
+ const BYTE* match;
592
+ matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
593
+ match = ddsBase + matchIndex;
594
+
595
+ if (!matchIndex) {
596
+ return ml;
597
+ }
598
+
599
+ /* guaranteed by table construction */
600
+ (void)ddsLowestIndex;
601
+ assert(matchIndex >= ddsLowestIndex);
602
+ assert(match+4 <= ddsEnd);
603
+ if (MEM_read32(match) == MEM_read32(ip)) {
604
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
605
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
606
+ }
607
+
608
+ /* save best solution */
609
+ if (currentMl > ml) {
610
+ ml = currentMl;
611
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
612
+ if (ip+currentMl == iLimit) {
613
+ /* best possible, avoids read overflow on next attempt */
614
+ return ml;
615
+ }
616
+ }
617
+ }
618
+
619
+ {
620
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
621
+ U32 chainIndex = chainPackedPointer >> 8;
622
+ U32 const chainLength = chainPackedPointer & 0xFF;
623
+ U32 const chainAttempts = nbAttempts - ddsAttempt;
624
+ U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
625
+ U32 chainAttempt;
626
+
627
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
628
+ PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
629
+ }
630
+
631
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
632
+ size_t currentMl=0;
633
+ const BYTE* match;
634
+ matchIndex = dms->chainTable[chainIndex];
635
+ match = ddsBase + matchIndex;
636
+
637
+ /* guaranteed by table construction */
638
+ assert(matchIndex >= ddsLowestIndex);
639
+ assert(match+4 <= ddsEnd);
640
+ if (MEM_read32(match) == MEM_read32(ip)) {
641
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
642
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
643
+ }
644
+
645
+ /* save best solution */
646
+ if (currentMl > ml) {
647
+ ml = currentMl;
648
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
649
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
650
+ }
651
+ }
652
+ }
653
+ return ml;
654
+ }
655
+
656
+
657
+ /* *********************************
658
+ * Hash Chain
659
+ ***********************************/
660
+ #define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
661
+
662
+ /* Update chains up to ip (excluded)
663
+ Assumption : always within prefix (i.e. not within extDict) */
664
+ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
665
+ ZSTD_matchState_t* ms,
666
+ const ZSTD_compressionParameters* const cParams,
667
+ const BYTE* ip, U32 const mls)
668
+ {
669
+ U32* const hashTable = ms->hashTable;
670
+ const U32 hashLog = cParams->hashLog;
671
+ U32* const chainTable = ms->chainTable;
672
+ const U32 chainMask = (1 << cParams->chainLog) - 1;
673
+ const BYTE* const base = ms->window.base;
674
+ const U32 target = (U32)(ip - base);
675
+ U32 idx = ms->nextToUpdate;
676
+
677
+ while(idx < target) { /* catch up */
678
+ size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
679
+ NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
680
+ hashTable[h] = idx;
681
+ idx++;
682
+ }
683
+
684
+ ms->nextToUpdate = target;
685
+ return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
686
+ }
687
+
688
+ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
689
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
690
+ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
691
+ }
692
+
693
+ /* inlining is important to hardwire a hot branch (template emulation) */
694
+ FORCE_INLINE_TEMPLATE
695
+ size_t ZSTD_HcFindBestMatch_generic (
696
+ ZSTD_matchState_t* ms,
697
+ const BYTE* const ip, const BYTE* const iLimit,
698
+ size_t* offsetPtr,
699
+ const U32 mls, const ZSTD_dictMode_e dictMode)
700
+ {
701
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
702
+ U32* const chainTable = ms->chainTable;
703
+ const U32 chainSize = (1 << cParams->chainLog);
704
+ const U32 chainMask = chainSize-1;
705
+ const BYTE* const base = ms->window.base;
706
+ const BYTE* const dictBase = ms->window.dictBase;
707
+ const U32 dictLimit = ms->window.dictLimit;
708
+ const BYTE* const prefixStart = base + dictLimit;
709
+ const BYTE* const dictEnd = dictBase + dictLimit;
710
+ const U32 curr = (U32)(ip-base);
711
+ const U32 maxDistance = 1U << cParams->windowLog;
712
+ const U32 lowestValid = ms->window.lowLimit;
713
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
714
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
715
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
716
+ const U32 minChain = curr > chainSize ? curr - chainSize : 0;
717
+ U32 nbAttempts = 1U << cParams->searchLog;
718
+ size_t ml=4-1;
719
+
720
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
721
+ const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
722
+ ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
723
+ const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
724
+ ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
725
+
726
+ U32 matchIndex;
727
+
728
+ if (dictMode == ZSTD_dedicatedDictSearch) {
729
+ const U32* entry = &dms->hashTable[ddsIdx];
730
+ PREFETCH_L1(entry);
731
+ }
732
+
733
+ /* HC4 match finder */
734
+ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
735
+
736
+ for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
737
+ size_t currentMl=0;
738
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
739
+ const BYTE* const match = base + matchIndex;
740
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
741
+ if (match[ml] == ip[ml]) /* potentially better */
742
+ currentMl = ZSTD_count(ip, match, iLimit);
743
+ } else {
744
+ const BYTE* const match = dictBase + matchIndex;
745
+ assert(match+4 <= dictEnd);
746
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
747
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
748
+ }
749
+
750
+ /* save best solution */
751
+ if (currentMl > ml) {
752
+ ml = currentMl;
753
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
754
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
755
+ }
756
+
757
+ if (matchIndex <= minChain) break;
758
+ matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
759
+ }
760
+
761
+ if (dictMode == ZSTD_dedicatedDictSearch) {
762
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
763
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
764
+ } else if (dictMode == ZSTD_dictMatchState) {
765
+ const U32* const dmsChainTable = dms->chainTable;
766
+ const U32 dmsChainSize = (1 << dms->cParams.chainLog);
767
+ const U32 dmsChainMask = dmsChainSize - 1;
768
+ const U32 dmsLowestIndex = dms->window.dictLimit;
769
+ const BYTE* const dmsBase = dms->window.base;
770
+ const BYTE* const dmsEnd = dms->window.nextSrc;
771
+ const U32 dmsSize = (U32)(dmsEnd - dmsBase);
772
+ const U32 dmsIndexDelta = dictLimit - dmsSize;
773
+ const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;
774
+
775
+ matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
776
+
777
+ for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
778
+ size_t currentMl=0;
779
+ const BYTE* const match = dmsBase + matchIndex;
780
+ assert(match+4 <= dmsEnd);
781
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
782
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
783
+
784
+ /* save best solution */
785
+ if (currentMl > ml) {
786
+ ml = currentMl;
787
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
788
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
789
+ }
790
+
791
+ if (matchIndex <= dmsMinChain) break;
792
+
793
+ matchIndex = dmsChainTable[matchIndex & dmsChainMask];
794
+ }
795
+ }
796
+
797
+ return ml;
798
+ }
799
+
800
+
801
+ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
802
+ ZSTD_matchState_t* ms,
803
+ const BYTE* ip, const BYTE* const iLimit,
804
+ size_t* offsetPtr)
805
+ {
806
+ switch(ms->cParams.minMatch)
807
+ {
808
+ default : /* includes case 3 */
809
+ case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
810
+ case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
811
+ case 7 :
812
+ case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
813
+ }
814
+ }
815
+
816
+
817
+ static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
818
+ ZSTD_matchState_t* ms,
819
+ const BYTE* ip, const BYTE* const iLimit,
820
+ size_t* offsetPtr)
821
+ {
822
+ switch(ms->cParams.minMatch)
823
+ {
824
+ default : /* includes case 3 */
825
+ case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
826
+ case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
827
+ case 7 :
828
+ case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
829
+ }
830
+ }
831
+
832
+
833
+ static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS (
834
+ ZSTD_matchState_t* ms,
835
+ const BYTE* ip, const BYTE* const iLimit,
836
+ size_t* offsetPtr)
837
+ {
838
+ switch(ms->cParams.minMatch)
839
+ {
840
+ default : /* includes case 3 */
841
+ case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch);
842
+ case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch);
843
+ case 7 :
844
+ case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch);
845
+ }
846
+ }
847
+
848
+
849
+ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
850
+ ZSTD_matchState_t* ms,
851
+ const BYTE* ip, const BYTE* const iLimit,
852
+ size_t* offsetPtr)
853
+ {
854
+ switch(ms->cParams.minMatch)
855
+ {
856
+ default : /* includes case 3 */
857
+ case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
858
+ case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
859
+ case 7 :
860
+ case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
861
+ }
862
+ }
863
+
864
+ /* *********************************
865
+ * (SIMD) Row-based matchfinder
866
+ ***********************************/
867
+ /* Constants for row-based hash */
868
+ #define ZSTD_ROW_HASH_TAG_OFFSET 1 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
869
+ #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
870
+ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
871
+
872
+ #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
873
+
874
+ typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */
875
+
876
+ #if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */
877
+
878
+ #include <emmintrin.h>
879
+ typedef __m128i ZSTD_Vec128;
880
+
881
+ /* Returns a 128-bit container with 128-bits from src */
882
+ static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
883
+ return _mm_loadu_si128((ZSTD_Vec128 const*)src);
884
+ }
885
+
886
+ /* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */
887
+ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
888
+ return _mm_set1_epi8((char)val);
889
+ }
890
+
891
+ /* Do byte-by-byte comparison result of x and y. Then collapse 128-bit resultant mask
892
+ * into a 32-bit mask that is the MSB of each byte.
893
+ * */
894
+ static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
895
+ return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
896
+ }
897
+
898
+ typedef struct {
899
+ __m128i fst;
900
+ __m128i snd;
901
+ } ZSTD_Vec256;
902
+
903
+ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
904
+ ZSTD_Vec256 v;
905
+ v.fst = ZSTD_Vec128_read(ptr);
906
+ v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
907
+ return v;
908
+ }
909
+
910
+ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
911
+ ZSTD_Vec256 v;
912
+ v.fst = ZSTD_Vec128_set8(val);
913
+ v.snd = ZSTD_Vec128_set8(val);
914
+ return v;
915
+ }
916
+
917
+ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
918
+ ZSTD_VecMask fstMask;
919
+ ZSTD_VecMask sndMask;
920
+ fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
921
+ sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
922
+ return fstMask | (sndMask << 16);
923
+ }
924
+
925
+ #elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */
926
+
927
+ #include <arm_neon.h>
928
+ typedef uint8x16_t ZSTD_Vec128;
929
+
930
+ static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
931
+ return vld1q_u8((const BYTE* const)src);
932
+ }
933
+
934
+ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
935
+ return vdupq_n_u8(val);
936
+ }
937
+
938
+ /* Mimics '_mm_movemask_epi8()' from SSE */
939
+ static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) {
940
+ /* Shift out everything but the MSB bits in each byte */
941
+ uint16x8_t highBits = vreinterpretq_u16_u8(vshrq_n_u8(val, 7));
942
+ /* Merge the even lanes together with vsra (right shift and add) */
943
+ uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(highBits, highBits, 7));
944
+ uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
945
+ uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
946
+ /* Extract the low 8 bits from each lane, merge */
947
+ return vgetq_lane_u8(paired64, 0) | ((U32)vgetq_lane_u8(paired64, 8) << 8);
948
+ }
949
+
950
+ static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
951
+ return (ZSTD_VecMask)ZSTD_vmovmaskq_u8(vceqq_u8(x, y));
952
+ }
953
+
954
+ typedef struct {
955
+ uint8x16_t fst;
956
+ uint8x16_t snd;
957
+ } ZSTD_Vec256;
958
+
959
+ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
960
+ ZSTD_Vec256 v;
961
+ v.fst = ZSTD_Vec128_read(ptr);
962
+ v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
963
+ return v;
964
+ }
965
+
966
+ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
967
+ ZSTD_Vec256 v;
968
+ v.fst = ZSTD_Vec128_set8(val);
969
+ v.snd = ZSTD_Vec128_set8(val);
970
+ return v;
971
+ }
972
+
973
+ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
974
+ ZSTD_VecMask fstMask;
975
+ ZSTD_VecMask sndMask;
976
+ fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
977
+ sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
978
+ return fstMask | (sndMask << 16);
979
+ }
980
+
981
+ #else /* Scalar fallback version */
982
+
983
+ #define VEC128_NB_SIZE_T (16 / sizeof(size_t))
984
+ typedef struct {
985
+ size_t vec[VEC128_NB_SIZE_T];
986
+ } ZSTD_Vec128;
987
+
988
+ static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
989
+ ZSTD_Vec128 ret;
990
+ ZSTD_memcpy(ret.vec, src, VEC128_NB_SIZE_T*sizeof(size_t));
991
+ return ret;
992
+ }
993
+
994
+ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
995
+ ZSTD_Vec128 ret = { {0} };
996
+ int startBit = sizeof(size_t) * 8 - 8;
997
+ for (;startBit >= 0; startBit -= 8) {
998
+ unsigned j = 0;
999
+ for (;j < VEC128_NB_SIZE_T; ++j) {
1000
+ ret.vec[j] |= ((size_t)val << startBit);
1001
+ }
1002
+ }
1003
+ return ret;
1004
+ }
1005
+
1006
+ /* Compare x to y, byte by byte, generating a "matches" bitfield */
1007
+ static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
1008
+ ZSTD_VecMask res = 0;
1009
+ unsigned i = 0;
1010
+ unsigned l = 0;
1011
+ for (; i < VEC128_NB_SIZE_T; ++i) {
1012
+ const size_t cmp1 = x.vec[i];
1013
+ const size_t cmp2 = y.vec[i];
1014
+ unsigned j = 0;
1015
+ for (; j < sizeof(size_t); ++j, ++l) {
1016
+ if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
1017
+ res |= ((U32)1 << (j+i*sizeof(size_t)));
1018
+ }
1019
+ }
1020
+ }
1021
+ return res;
1022
+ }
1023
+
1024
+ #define VEC256_NB_SIZE_T 2*VEC128_NB_SIZE_T
1025
+ typedef struct {
1026
+ size_t vec[VEC256_NB_SIZE_T];
1027
+ } ZSTD_Vec256;
1028
+
1029
+ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) {
1030
+ ZSTD_Vec256 ret;
1031
+ ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t));
1032
+ return ret;
1033
+ }
1034
+
1035
+ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
1036
+ ZSTD_Vec256 ret = { {0} };
1037
+ int startBit = sizeof(size_t) * 8 - 8;
1038
+ for (;startBit >= 0; startBit -= 8) {
1039
+ unsigned j = 0;
1040
+ for (;j < VEC256_NB_SIZE_T; ++j) {
1041
+ ret.vec[j] |= ((size_t)val << startBit);
1042
+ }
1043
+ }
1044
+ return ret;
1045
+ }
1046
+
1047
+ /* Compare x to y, byte by byte, generating a "matches" bitfield */
1048
+ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
1049
+ ZSTD_VecMask res = 0;
1050
+ unsigned i = 0;
1051
+ unsigned l = 0;
1052
+ for (; i < VEC256_NB_SIZE_T; ++i) {
1053
+ const size_t cmp1 = x.vec[i];
1054
+ const size_t cmp2 = y.vec[i];
1055
+ unsigned j = 0;
1056
+ for (; j < sizeof(size_t); ++j, ++l) {
1057
+ if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
1058
+ res |= ((U32)1 << (j+i*sizeof(size_t)));
1059
+ }
1060
+ }
1061
+ }
1062
+ return res;
1063
+ }
1064
+
1065
+ #endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */
1066
+
1067
+ /* ZSTD_VecMask_next():
1068
+ * Starting from the LSB, returns the idx of the next non-zero bit.
1069
+ * Basically counting the nb of trailing zeroes.
1070
+ */
1071
+ static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
1072
+ # if defined(_MSC_VER) /* Visual */
1073
+ unsigned long r=0;
1074
+ return _BitScanForward(&r, val) ? (U32)r : 0;
1075
+ # elif defined(__GNUC__) && (__GNUC__ >= 3)
1076
+ return (U32)__builtin_ctz(val);
1077
+ # else
1078
+ /* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */
1079
+ static const U32 multiplyDeBruijnBitPosition[32] =
1080
+ {
1081
+ 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
1082
+ 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
1083
+ };
1084
+ return multiplyDeBruijnBitPosition[((U32)((v & -(int)v) * 0x077CB531U)) >> 27];
1085
+ # endif
1086
+ }
1087
+
1088
+ /* ZSTD_VecMask_rotateRight():
1089
+ * Rotates a bitfield to the right by "rotation" bits.
1090
+ * If the rotation is greater than totalBits, the returned mask is 0.
1091
+ */
1092
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
1093
+ ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) {
1094
+ if (rotation == 0)
1095
+ return mask;
1096
+ switch (totalBits) {
1097
+ default:
1098
+ assert(0);
1099
+ case 16:
1100
+ return (mask >> rotation) | (U16)(mask << (16 - rotation));
1101
+ case 32:
1102
+ return (mask >> rotation) | (U32)(mask << (32 - rotation));
1103
+ }
1104
+ }
1105
+
1106
+ /* ZSTD_row_nextIndex():
1107
+ * Returns the next index to insert at within a tagTable row, and updates the "head"
1108
+ * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
1109
+ */
1110
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
1111
+ U32 const next = (*tagRow - 1) & rowMask;
1112
+ *tagRow = (BYTE)next;
1113
+ return next;
1114
+ }
1115
+
1116
+ /* ZSTD_isAligned():
1117
+ * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
1118
+ */
1119
+ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
1120
+ assert((align & (align - 1)) == 0);
1121
+ return (((size_t)ptr) & (align - 1)) == 0;
1122
+ }
1123
+
1124
+ /* ZSTD_row_prefetch():
1125
+ * Performs prefetching for the hashTable and tagTable at a given row.
1126
+ */
1127
+ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
1128
+ PREFETCH_L1(hashTable + relRow);
1129
+ if (rowLog == 5) {
1130
+ PREFETCH_L1(hashTable + relRow + 16);
1131
+ }
1132
+ PREFETCH_L1(tagTable + relRow);
1133
+ assert(rowLog == 4 || rowLog == 5);
1134
+ assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
1135
+ assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */
1136
+ }
1137
+
1138
+ /* ZSTD_row_fillHashCache():
1139
+ * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
1140
+ * but not beyond iLimit.
1141
+ */
1142
+ static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
1143
+ U32 const rowLog, U32 const mls,
1144
+ U32 idx, const BYTE* const iLimit)
1145
+ {
1146
+ U32 const* const hashTable = ms->hashTable;
1147
+ U16 const* const tagTable = ms->tagTable;
1148
+ U32 const hashLog = ms->rowHashLog;
1149
+ U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
1150
+ U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
1151
+
1152
+ for (; idx < lim; ++idx) {
1153
+ U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1154
+ U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1155
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
1156
+ ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
1157
+ }
1158
+
1159
+ DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
1160
+ ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
1161
+ ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
1162
+ }
1163
+
1164
+ /* ZSTD_row_nextCachedHash():
1165
+ * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
1166
+ * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
1167
+ */
1168
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
1169
+ U16 const* tagTable, BYTE const* base,
1170
+ U32 idx, U32 const hashLog,
1171
+ U32 const rowLog, U32 const mls)
1172
+ {
1173
+ U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1174
+ U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1175
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
1176
+ { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
1177
+ cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
1178
+ return hash;
1179
+ }
1180
+ }
1181
+
1182
+ /* ZSTD_row_update_internal():
1183
+ * Inserts the byte at ip into the appropriate position in the hash table.
1184
+ * Determines the relative row, and the position within the {16, 32} entry row to insert at.
1185
+ */
1186
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
1187
+ U32 const mls, U32 const rowLog,
1188
+ U32 const rowMask, U32 const useCache)
1189
+ {
1190
+ U32* const hashTable = ms->hashTable;
1191
+ U16* const tagTable = ms->tagTable;
1192
+ U32 const hashLog = ms->rowHashLog;
1193
+ const BYTE* const base = ms->window.base;
1194
+ const U32 target = (U32)(ip - base);
1195
+ U32 idx = ms->nextToUpdate;
1196
+
1197
+ DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target);
1198
+ for (; idx < target; ++idx) {
1199
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls)
1200
+ : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1201
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1202
+ U32* const row = hashTable + relRow;
1203
+ BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
1204
+ Explicit cast allows us to get exact desired position within each row */
1205
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1206
+
1207
+ assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
1208
+ ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
1209
+ row[pos] = idx;
1210
+ }
1211
+ ms->nextToUpdate = target;
1212
+ }
1213
+
1214
+ /* ZSTD_row_update():
1215
+ * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
1216
+ * processing.
1217
+ */
1218
+ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
1219
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1220
+ const U32 rowMask = (1u << rowLog) - 1;
1221
+ const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
1222
+
1223
+ DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
1224
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
1225
+ }
1226
+
1227
+ /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
1228
+ * the hash at the nth position in a row of the tagTable.
1229
+ */
1230
+ FORCE_INLINE_TEMPLATE
1231
+ ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) {
1232
+ ZSTD_VecMask matches = 0;
1233
+ if (rowEntries == 16) {
1234
+ ZSTD_Vec128 hashes = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
1235
+ ZSTD_Vec128 expandedTags = ZSTD_Vec128_set8(tag);
1236
+ matches = ZSTD_Vec128_cmpMask8(hashes, expandedTags);
1237
+ } else if (rowEntries == 32) {
1238
+ ZSTD_Vec256 hashes = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
1239
+ ZSTD_Vec256 expandedTags = ZSTD_Vec256_set8(tag);
1240
+ matches = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
1241
+ } else {
1242
+ assert(0);
1243
+ }
1244
+ /* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1245
+ to match up with the actual layout of the entries within the hashTable */
1246
+ return ZSTD_VecMask_rotateRight(matches, head, rowEntries);
1247
+ }
1248
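
The vector compare in ZSTD_row_getMatchMask() can be read as the scalar loop below: set bit n when the n-th tag equals the probe tag, then rotate the bitfield right by the head so bit positions line up with the row's circular layout. This is a portable sketch with an assumed 16-entry row, not the SIMD path the library actually takes:

#include <stdint.h>
#include <stdio.h>

/* set bit n if tags[n] equals tag (scalar stand-in for the SIMD byte compare) */
static uint32_t getMatchMask(const uint8_t* tags, uint8_t tag, unsigned rowEntries) {
    uint32_t m = 0;
    for (unsigned n = 0; n < rowEntries; ++n)
        m |= (uint32_t)(tags[n] == tag) << n;
    return m;
}

/* rotate a rowEntries-bit mask right by `head` positions */
static uint32_t rotateRight(uint32_t mask, unsigned head, unsigned rowEntries) {
    uint32_t const full = (rowEntries == 32) ? 0xFFFFFFFFu : ((1u << rowEntries) - 1u);
    if (head == 0) return mask;
    return ((mask >> head) | (mask << (rowEntries - head))) & full;
}

int main(void) {
    uint8_t tags[16] = { 7, 3, 7, 9, 0, 7, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    uint32_t const raw = getMatchMask(tags, 7, 16);      /* bits 0, 2, 5, 12 set */
    uint32_t const rot = rotateRight(raw, 3 /* head */, 16);
    printf("raw=0x%04x rotated=0x%04x\n", (unsigned)raw, (unsigned)rot);
    return 0;
}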
+
1249
+ /* The high-level approach of the SIMD row based match finder is as follows:
1250
+ * - Figure out where to insert the new entry:
1251
+ * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
1252
+ * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
1253
+ * which row to insert into.
1254
+ * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
1255
+ * be considered as a circular buffer with a "head" index that resides in the tagTable.
1256
+ * - Also insert the "tag" into the equivalent row and position in the tagTable.
1257
+ * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
1258
+ * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
1259
+ * for alignment/performance reasons, leaving some bytes unused.
1260
+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
1261
+ * generate a bitfield that we can cycle through to check the collisions in the hash table.
1262
+ * - Pick the longest match.
1263
+ */
1264
+ FORCE_INLINE_TEMPLATE
1265
+ size_t ZSTD_RowFindBestMatch_generic (
1266
+ ZSTD_matchState_t* ms,
1267
+ const BYTE* const ip, const BYTE* const iLimit,
1268
+ size_t* offsetPtr,
1269
+ const U32 mls, const ZSTD_dictMode_e dictMode,
1270
+ const U32 rowLog)
1271
+ {
1272
+ U32* const hashTable = ms->hashTable;
1273
+ U16* const tagTable = ms->tagTable;
1274
+ U32* const hashCache = ms->hashCache;
1275
+ const U32 hashLog = ms->rowHashLog;
1276
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
1277
+ const BYTE* const base = ms->window.base;
1278
+ const BYTE* const dictBase = ms->window.dictBase;
1279
+ const U32 dictLimit = ms->window.dictLimit;
1280
+ const BYTE* const prefixStart = base + dictLimit;
1281
+ const BYTE* const dictEnd = dictBase + dictLimit;
1282
+ const U32 curr = (U32)(ip-base);
1283
+ const U32 maxDistance = 1U << cParams->windowLog;
1284
+ const U32 lowestValid = ms->window.lowLimit;
1285
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
1286
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
1287
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
1288
+ const U32 rowEntries = (1U << rowLog);
1289
+ const U32 rowMask = rowEntries - 1;
1290
+ const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
1291
+ U32 nbAttempts = 1U << cappedSearchLog;
1292
+ size_t ml=4-1;
1293
+
1294
+ /* DMS/DDS variables that may be referenced later */
1295
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
1296
+ size_t ddsIdx;
1297
+ U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
1298
+ U32 dmsTag;
1299
+ U32* dmsRow;
1300
+ BYTE* dmsTagRow;
1301
+
1302
+ if (dictMode == ZSTD_dedicatedDictSearch) {
1303
+ const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
1304
+ { /* Prefetch DDS hashtable entry */
1305
+ ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
1306
+ PREFETCH_L1(&dms->hashTable[ddsIdx]);
1307
+ }
1308
+ ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
1309
+ }
1310
+
1311
+ if (dictMode == ZSTD_dictMatchState) {
1312
+ /* Prefetch DMS rows */
1313
+ U32* const dmsHashTable = dms->hashTable;
1314
+ U16* const dmsTagTable = dms->tagTable;
1315
+ U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1316
+ U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1317
+ dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
1318
+ dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
1319
+ dmsRow = dmsHashTable + dmsRelRow;
1320
+ ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
1321
+ }
1322
+
1323
+ /* Update the hashTable and tagTable up to (but not including) ip */
1324
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
1325
+ { /* Get the hash for ip, compute the appropriate row */
1326
+ U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
1327
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1328
+ U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
1329
+ U32* const row = hashTable + relRow;
1330
+ BYTE* tagRow = (BYTE*)(tagTable + relRow);
1331
+ U32 const head = *tagRow & rowMask;
1332
+ U32 matchBuffer[32 /* maximum nb entries per row */];
1333
+ size_t numMatches = 0;
1334
+ size_t currMatch = 0;
1335
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
1336
+
1337
+ /* Cycle through the matches and prefetch */
1338
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1339
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1340
+ U32 const matchIndex = row[matchPos];
1341
+ assert(numMatches < rowEntries);
1342
+ if (matchIndex < lowLimit)
1343
+ break;
1344
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1345
+ PREFETCH_L1(base + matchIndex);
1346
+ } else {
1347
+ PREFETCH_L1(dictBase + matchIndex);
1348
+ }
1349
+ matchBuffer[numMatches++] = matchIndex;
1350
+ }
1351
+
1352
+ /* Speed opt: insert the current position into the hashtable too. This allows us to avoid one iteration of the loop
1353
+ in ZSTD_row_update_internal() at the next search. */
1354
+ {
1355
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1356
+ tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
1357
+ row[pos] = ms->nextToUpdate++;
1358
+ }
1359
+
1360
+ /* Return the longest match */
1361
+ for (; currMatch < numMatches; ++currMatch) {
1362
+ U32 const matchIndex = matchBuffer[currMatch];
1363
+ size_t currentMl=0;
1364
+ assert(matchIndex < curr);
1365
+ assert(matchIndex >= lowLimit);
1366
+
1367
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1368
+ const BYTE* const match = base + matchIndex;
1369
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
1370
+ if (match[ml] == ip[ml]) /* potentially better */
1371
+ currentMl = ZSTD_count(ip, match, iLimit);
1372
+ } else {
1373
+ const BYTE* const match = dictBase + matchIndex;
1374
+ assert(match+4 <= dictEnd);
1375
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
1376
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
1377
+ }
1378
+
1379
+ /* Save best solution */
1380
+ if (currentMl > ml) {
1381
+ ml = currentMl;
1382
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
1383
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
1384
+ }
1385
+ }
1386
+ }
1387
+
1388
+ if (dictMode == ZSTD_dedicatedDictSearch) {
1389
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
1390
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
1391
+ } else if (dictMode == ZSTD_dictMatchState) {
1392
+ /* TODO: Measure and potentially add prefetching to DMS */
1393
+ const U32 dmsLowestIndex = dms->window.dictLimit;
1394
+ const BYTE* const dmsBase = dms->window.base;
1395
+ const BYTE* const dmsEnd = dms->window.nextSrc;
1396
+ const U32 dmsSize = (U32)(dmsEnd - dmsBase);
1397
+ const U32 dmsIndexDelta = dictLimit - dmsSize;
1398
+
1399
+ { U32 const head = *dmsTagRow & rowMask;
1400
+ U32 matchBuffer[32 /* maximum nb row entries */];
1401
+ size_t numMatches = 0;
1402
+ size_t currMatch = 0;
1403
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
1404
+
1405
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1406
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1407
+ U32 const matchIndex = dmsRow[matchPos];
1408
+ if (matchIndex < dmsLowestIndex)
1409
+ break;
1410
+ PREFETCH_L1(dmsBase + matchIndex);
1411
+ matchBuffer[numMatches++] = matchIndex;
1412
+ }
1413
+
1414
+ /* Return the longest match */
1415
+ for (; currMatch < numMatches; ++currMatch) {
1416
+ U32 const matchIndex = matchBuffer[currMatch];
1417
+ size_t currentMl=0;
1418
+ assert(matchIndex >= dmsLowestIndex);
1419
+ assert(matchIndex < curr);
1420
+
1421
+ { const BYTE* const match = dmsBase + matchIndex;
1422
+ assert(match+4 <= dmsEnd);
1423
+ if (MEM_read32(match) == MEM_read32(ip))
1424
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
1425
+ }
1426
+
1427
+ if (currentMl > ml) {
1428
+ ml = currentMl;
1429
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
1430
+ if (ip+currentMl == iLimit) break;
1431
+ }
1432
+ }
1433
+ }
1434
+ }
1435
+ return ml;
1436
+ }
1437
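
The match offsets returned through offsetPtr are biased: the search stores curr - matchIndex + ZSTD_REP_MOVE, and the parser later subtracts ZSTD_REP_MOVE again, which keeps the lowest values free to mean "repeat offset" (offset == 0 in the parser below signals a repcode). A small sketch of that encoding with a stand-in constant; REP_MOVE_STANDIN is illustrative, ZSTD_REP_MOVE's real definition lives elsewhere in the library:

#include <stdint.h>
#include <stdio.h>

/* REP_MOVE_STANDIN plays the role of ZSTD_REP_MOVE: match-finder offsets are
 * biased upward so that the low values stay available to signal "reuse a
 * recent offset" (a repcode) instead of a fresh match. */
#define REP_MOVE_STANDIN 2u

static uint32_t encodeMatchOffset(uint32_t curr, uint32_t matchIndex) {
    return curr - matchIndex + REP_MOVE_STANDIN;     /* what the search stores in *offsetPtr */
}

static uint32_t decodeMatchDistance(uint32_t storedOffset) {
    return storedOffset - REP_MOVE_STANDIN;          /* what the parser uses for "catch up" */
}

int main(void) {
    uint32_t const curr = 1000, matchIndex = 400;
    uint32_t const stored = encodeMatchOffset(curr, matchIndex);
    printf("distance=%u stored=%u decoded=%u\n",
           (unsigned)(curr - matchIndex), (unsigned)stored, (unsigned)decodeMatchDistance(stored));
    return 0;
}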
+
1438
+ /* Inlining is important to hardwire a hot branch (template emulation) */
1439
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS (
1440
+ ZSTD_matchState_t* ms,
1441
+ const BYTE* ip, const BYTE* const iLimit,
1442
+ const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog)
1443
+ {
1444
+ switch(ms->cParams.minMatch)
1445
+ {
1446
+ default : /* includes case 3 */
1447
+ case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog);
1448
+ case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog);
1449
+ case 7 :
1450
+ case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog);
1451
+ }
1452
+ }
1453
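
The *_selectMLS / *_selectRowLog wrappers emulate templates: a force-inlined generic body is called from a switch with compile-time-constant arguments, so the compiler emits one specialised copy per case and the hot branch is hardwired. A minimal sketch of the pattern; the attribute macro and function names below are illustrative, not zstd's:

#include <stdint.h>
#include <stdio.h>

#if defined(__GNUC__)
#  define FORCE_INLINE static inline __attribute__((always_inline))
#else
#  define FORCE_INLINE static inline
#endif

/* generic body: reads every mls-th byte; `mls` is expected to be a
 * compile-time constant at each call site so the loop gets specialised */
FORCE_INLINE uint32_t sumStride(const uint8_t* p, size_t len, unsigned mls) {
    uint32_t s = 0;
    for (size_t i = 0; i < len; i += mls) s += p[i];
    return s;
}

/* dispatcher: hardwires the hot branch once, like ZSTD_RowFindBestMatch_selectMLS */
static uint32_t sumStride_select(const uint8_t* p, size_t len, unsigned minMatch) {
    switch (minMatch) {
    default:
    case 4: return sumStride(p, len, 4);
    case 5: return sumStride(p, len, 5);
    case 6: return sumStride(p, len, 6);
    }
}

int main(void) {
    uint8_t buf[32] = {0};
    printf("%u %u %u\n", (unsigned)sumStride_select(buf, 32, 4),
           (unsigned)sumStride_select(buf, 32, 5), (unsigned)sumStride_select(buf, 32, 6));
    return 0;
}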
+
1454
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog (
1455
+ ZSTD_matchState_t* ms,
1456
+ const BYTE* ip, const BYTE* const iLimit,
1457
+ size_t* offsetPtr)
1458
+ {
1459
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1460
+ switch(cappedSearchLog)
1461
+ {
1462
+ default :
1463
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4);
1464
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5);
1465
+ }
1466
+ }
1467
+
1468
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog(
1469
+ ZSTD_matchState_t* ms,
1470
+ const BYTE* ip, const BYTE* const iLimit,
1471
+ size_t* offsetPtr)
1472
+ {
1473
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1474
+ switch(cappedSearchLog)
1475
+ {
1476
+ default :
1477
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4);
1478
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5);
1479
+ }
1480
+ }
1481
+
1482
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog(
1483
+ ZSTD_matchState_t* ms,
1484
+ const BYTE* ip, const BYTE* const iLimit,
1485
+ size_t* offsetPtr)
1486
+ {
1487
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1488
+ switch(cappedSearchLog)
1489
+ {
1490
+ default :
1491
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4);
1492
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5);
1493
+ }
1494
+ }
1495
+
1496
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog (
1497
+ ZSTD_matchState_t* ms,
1498
+ const BYTE* ip, const BYTE* const iLimit,
1499
+ size_t* offsetPtr)
1500
+ {
1501
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1502
+ switch(cappedSearchLog)
1503
+ {
1504
+ default :
1505
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4);
1506
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5);
1507
+ }
1508
+ }
1509
+
1510
+
1511
+ /* *******************************
1512
+ * Common parser - lazy strategy
1513
+ *********************************/
1514
+ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1515
+
1516
+ FORCE_INLINE_TEMPLATE size_t
1517
+ ZSTD_compressBlock_lazy_generic(
1518
+ ZSTD_matchState_t* ms, seqStore_t* seqStore,
1519
+ U32 rep[ZSTD_REP_NUM],
1520
+ const void* src, size_t srcSize,
1521
+ const searchMethod_e searchMethod, const U32 depth,
1522
+ ZSTD_dictMode_e const dictMode)
1523
+ {
1524
+ const BYTE* const istart = (const BYTE*)src;
1525
+ const BYTE* ip = istart;
1526
+ const BYTE* anchor = istart;
1527
+ const BYTE* const iend = istart + srcSize;
1528
+ const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
1529
+ const BYTE* const base = ms->window.base;
1530
+ const U32 prefixLowestIndex = ms->window.dictLimit;
1531
+ const BYTE* const prefixLowest = base + prefixLowestIndex;
1532
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1533
+
1534
+ typedef size_t (*searchMax_f)(
1535
+ ZSTD_matchState_t* ms,
1536
+ const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1537
+
1538
+ /**
1539
+ * This table is indexed first by the four ZSTD_dictMode_e values, and then
1540
+ * by the two searchMethod_e values. NULLs are placed for configurations
1541
+ * that should never occur (extDict modes go to the other implementation
1542
+ * below and there is no DDSS for binary tree search yet).
1543
+ */
1544
+ const searchMax_f searchFuncs[4][3] = {
1545
+ {
1546
+ ZSTD_HcFindBestMatch_selectMLS,
1547
+ ZSTD_BtFindBestMatch_selectMLS,
1548
+ ZSTD_RowFindBestMatch_selectRowLog
1549
+ },
1550
+ {
1551
+ NULL,
1552
+ NULL,
1553
+ NULL
1554
+ },
1555
+ {
1556
+ ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
1557
+ ZSTD_BtFindBestMatch_dictMatchState_selectMLS,
1558
+ ZSTD_RowFindBestMatch_dictMatchState_selectRowLog
1559
+ },
1560
+ {
1561
+ ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
1562
+ NULL,
1563
+ ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog
1564
+ }
1565
+ };
1566
+
1567
+ searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod];
1568
+ U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
1569
+
1570
+ const int isDMS = dictMode == ZSTD_dictMatchState;
1571
+ const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
1572
+ const int isDxS = isDMS || isDDS;
1573
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
1574
+ const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0;
1575
+ const BYTE* const dictBase = isDxS ? dms->window.base : NULL;
1576
+ const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
1577
+ const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL;
1578
+ const U32 dictIndexDelta = isDxS ?
1579
+ prefixLowestIndex - (U32)(dictEnd - dictBase) :
1580
+ 0;
1581
+ const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
1582
+
1583
+ assert(searchMax != NULL);
1584
+
1585
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
1586
+ ip += (dictAndPrefixLength == 0);
1587
+ if (dictMode == ZSTD_noDict) {
1588
+ U32 const curr = (U32)(ip - base);
1589
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
1590
+ U32 const maxRep = curr - windowLow;
1591
+ if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
1592
+ if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
1593
+ }
1594
+ if (isDxS) {
1595
+ /* dictMatchState repCode checks don't currently handle repCode == 0
1596
+ * disabling. */
1597
+ assert(offset_1 <= dictAndPrefixLength);
1598
+ assert(offset_2 <= dictAndPrefixLength);
1599
+ }
1600
+
1601
+ if (searchMethod == search_rowHash) {
1602
+ ZSTD_row_fillHashCache(ms, base, rowLog,
1603
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1604
+ ms->nextToUpdate, ilimit);
1605
+ }
1606
+
1607
+ /* Match Loop */
1608
+ #if defined(__GNUC__) && defined(__x86_64__)
1609
+ /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
1610
+ * code alignment is perturbed. To fix the instability, align the loop on a 32-byte boundary.
1611
+ */
1612
+ __asm__(".p2align 5");
1613
+ #endif
1614
+ while (ip < ilimit) {
1615
+ size_t matchLength=0;
1616
+ size_t offset=0;
1617
+ const BYTE* start=ip+1;
1618
+
1619
+ /* check repCode */
1620
+ if (isDxS) {
1621
+ const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
1622
+ const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
1623
+ && repIndex < prefixLowestIndex) ?
1624
+ dictBase + (repIndex - dictIndexDelta) :
1625
+ base + repIndex;
1626
+ if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
1627
+ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
1628
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1629
+ matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
1630
+ if (depth==0) goto _storeSequence;
1631
+ }
1632
+ }
1633
+ if ( dictMode == ZSTD_noDict
1634
+ && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
1635
+ matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
1636
+ if (depth==0) goto _storeSequence;
1637
+ }
1638
+
1639
+ /* first search (depth 0) */
1640
+ { size_t offsetFound = 999999999;
1641
+ size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
1642
+ if (ml2 > matchLength)
1643
+ matchLength = ml2, start = ip, offset=offsetFound;
1644
+ }
1645
+
1646
+ if (matchLength < 4) {
1647
+ ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
1648
+ continue;
1649
+ }
1650
+
1651
+ /* let's try to find a better solution */
1652
+ if (depth>=1)
1653
+ while (ip<ilimit) {
1654
+ ip ++;
1655
+ if ( (dictMode == ZSTD_noDict)
1656
+ && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1657
+ size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
1658
+ int const gain2 = (int)(mlRep * 3);
1659
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
1660
+ if ((mlRep >= 4) && (gain2 > gain1))
1661
+ matchLength = mlRep, offset = 0, start = ip;
1662
+ }
1663
+ if (isDxS) {
1664
+ const U32 repIndex = (U32)(ip - base) - offset_1;
1665
+ const BYTE* repMatch = repIndex < prefixLowestIndex ?
1666
+ dictBase + (repIndex - dictIndexDelta) :
1667
+ base + repIndex;
1668
+ if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
1669
+ && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
1670
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1671
+ size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
1672
+ int const gain2 = (int)(mlRep * 3);
1673
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
1674
+ if ((mlRep >= 4) && (gain2 > gain1))
1675
+ matchLength = mlRep, offset = 0, start = ip;
1676
+ }
1677
+ }
1678
+ { size_t offset2=999999999;
1679
+ size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1680
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
1681
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
1682
+ if ((ml2 >= 4) && (gain2 > gain1)) {
1683
+ matchLength = ml2, offset = offset2, start = ip;
1684
+ continue; /* search a better one */
1685
+ } }
1686
+
1687
+ /* let's find an even better one */
1688
+ if ((depth==2) && (ip<ilimit)) {
1689
+ ip ++;
1690
+ if ( (dictMode == ZSTD_noDict)
1691
+ && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1692
+ size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
1693
+ int const gain2 = (int)(mlRep * 4);
1694
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
1695
+ if ((mlRep >= 4) && (gain2 > gain1))
1696
+ matchLength = mlRep, offset = 0, start = ip;
1697
+ }
1698
+ if (isDxS) {
1699
+ const U32 repIndex = (U32)(ip - base) - offset_1;
1700
+ const BYTE* repMatch = repIndex < prefixLowestIndex ?
1701
+ dictBase + (repIndex - dictIndexDelta) :
1702
+ base + repIndex;
1703
+ if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
1704
+ && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
1705
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1706
+ size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
1707
+ int const gain2 = (int)(mlRep * 4);
1708
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
1709
+ if ((mlRep >= 4) && (gain2 > gain1))
1710
+ matchLength = mlRep, offset = 0, start = ip;
1711
+ }
1712
+ }
1713
+ { size_t offset2=999999999;
1714
+ size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1715
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
1716
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
1717
+ if ((ml2 >= 4) && (gain2 > gain1)) {
1718
+ matchLength = ml2, offset = offset2, start = ip;
1719
+ continue;
1720
+ } } }
1721
+ break; /* nothing found : store previous solution */
1722
+ }
1723
+
1724
+ /* NOTE:
1725
+ * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior.
1726
+ * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which
1727
+ * overflows the pointer, which is undefined behavior.
1728
+ */
1729
+ /* catch up */
1730
+ if (offset) {
1731
+ if (dictMode == ZSTD_noDict) {
1732
+ while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest))
1733
+ && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
1734
+ { start--; matchLength++; }
1735
+ }
1736
+ if (isDxS) {
1737
+ U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
1738
+ const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
1739
+ const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
1740
+ while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
1741
+ }
1742
+ offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
1743
+ }
1744
+ /* store sequence */
1745
+ _storeSequence:
1746
+ { size_t const litLength = start - anchor;
1747
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
1748
+ anchor = ip = start + matchLength;
1749
+ }
1750
+
1751
+ /* check immediate repcode */
1752
+ if (isDxS) {
1753
+ while (ip <= ilimit) {
1754
+ U32 const current2 = (U32)(ip-base);
1755
+ U32 const repIndex = current2 - offset_2;
1756
+ const BYTE* repMatch = repIndex < prefixLowestIndex ?
1757
+ dictBase - dictIndexDelta + repIndex :
1758
+ base + repIndex;
1759
+ if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
1760
+ && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
1761
+ const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
1762
+ matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
1763
+ offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */
1764
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
1765
+ ip += matchLength;
1766
+ anchor = ip;
1767
+ continue;
1768
+ }
1769
+ break;
1770
+ }
1771
+ }
1772
+
1773
+ if (dictMode == ZSTD_noDict) {
1774
+ while ( ((ip <= ilimit) & (offset_2>0))
1775
+ && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
1776
+ /* store sequence */
1777
+ matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
1778
+ offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
1779
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
1780
+ ip += matchLength;
1781
+ anchor = ip;
1782
+ continue; /* faster when present ... (?) */
1783
+ } } }
1784
+
1785
+ /* Save reps for next block */
1786
+ rep[0] = offset_1 ? offset_1 : savedOffset;
1787
+ rep[1] = offset_2 ? offset_2 : savedOffset;
1788
+
1789
+ /* Return the last literals size */
1790
+ return (size_t)(iend - anchor);
1791
+ }
1792
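
The depth-1/depth-2 loops above decide whether a match found one byte later beats the current candidate by weighing length against the bit cost of the offset: roughly four points per matched byte minus log2(offset), with a small bonus for the incumbent so ties keep the earlier match. A self-contained sketch of that comparison, with a portable highbit helper standing in for ZSTD_highbit32:

#include <stdint.h>
#include <stdio.h>

/* index of the highest set bit; stand-in for ZSTD_highbit32 (assumes v != 0) */
static unsigned highbit32(uint32_t v) {
    unsigned r = 0;
    while (v >>= 1) r++;
    return r;
}

/* Should the parser prefer a new candidate (ml2, offset2) found one position
 * later over the current one (ml1, offset1)? Mirrors the depth-1 comparison:
 * the incumbent gets a +4 bonus, so the challenger must clearly win. */
static int preferNewMatch(size_t ml1, size_t offset1, size_t ml2, size_t offset2) {
    int const gain2 = (int)(ml2 * 4 - highbit32((uint32_t)offset2 + 1));
    int const gain1 = (int)(ml1 * 4 - highbit32((uint32_t)offset1 + 1) + 4);
    return (ml2 >= 4) && (gain2 > gain1);
}

int main(void) {
    /* same length but a much closer match one byte later: worth switching */
    printf("%d\n", preferNewMatch(8, 65536, 8, 64));
    /* one byte longer but a much larger offset: keep the incumbent */
    printf("%d\n", preferNewMatch(8, 64, 9, 1u << 20));
    return 0;
}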
+
1793
+
1794
+ size_t ZSTD_compressBlock_btlazy2(
1795
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1796
+ void const* src, size_t srcSize)
1797
+ {
1798
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
1799
+ }
1800
+
1801
+ size_t ZSTD_compressBlock_lazy2(
1802
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1803
+ void const* src, size_t srcSize)
1804
+ {
1805
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
1806
+ }
1807
+
1808
+ size_t ZSTD_compressBlock_lazy(
1809
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1810
+ void const* src, size_t srcSize)
1811
+ {
1812
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
1813
+ }
1814
+
1815
+ size_t ZSTD_compressBlock_greedy(
1816
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1817
+ void const* src, size_t srcSize)
1818
+ {
1819
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
1820
+ }
1821
+
1822
+ size_t ZSTD_compressBlock_btlazy2_dictMatchState(
1823
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1824
+ void const* src, size_t srcSize)
1825
+ {
1826
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
1827
+ }
1828
+
1829
+ size_t ZSTD_compressBlock_lazy2_dictMatchState(
1830
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1831
+ void const* src, size_t srcSize)
1832
+ {
1833
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
1834
+ }
1835
+
1836
+ size_t ZSTD_compressBlock_lazy_dictMatchState(
1837
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1838
+ void const* src, size_t srcSize)
1839
+ {
1840
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
1841
+ }
1842
+
1843
+ size_t ZSTD_compressBlock_greedy_dictMatchState(
1844
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1845
+ void const* src, size_t srcSize)
1846
+ {
1847
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
1848
+ }
1849
+
1850
+
1851
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
1852
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1853
+ void const* src, size_t srcSize)
1854
+ {
1855
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
1856
+ }
1857
+
1858
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
1859
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1860
+ void const* src, size_t srcSize)
1861
+ {
1862
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
1863
+ }
1864
+
1865
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
1866
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1867
+ void const* src, size_t srcSize)
1868
+ {
1869
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
1870
+ }
1871
+
1872
+ /* Row-based matchfinder */
1873
+ size_t ZSTD_compressBlock_lazy2_row(
1874
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1875
+ void const* src, size_t srcSize)
1876
+ {
1877
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
1878
+ }
1879
+
1880
+ size_t ZSTD_compressBlock_lazy_row(
1881
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1882
+ void const* src, size_t srcSize)
1883
+ {
1884
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
1885
+ }
1886
+
1887
+ size_t ZSTD_compressBlock_greedy_row(
1888
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1889
+ void const* src, size_t srcSize)
1890
+ {
1891
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
1892
+ }
1893
+
1894
+ size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
1895
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1896
+ void const* src, size_t srcSize)
1897
+ {
1898
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
1899
+ }
1900
+
1901
+ size_t ZSTD_compressBlock_lazy_dictMatchState_row(
1902
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1903
+ void const* src, size_t srcSize)
1904
+ {
1905
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
1906
+ }
1907
+
1908
+ size_t ZSTD_compressBlock_greedy_dictMatchState_row(
1909
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1910
+ void const* src, size_t srcSize)
1911
+ {
1912
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
1913
+ }
1914
+
1915
+
1916
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
1917
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1918
+ void const* src, size_t srcSize)
1919
+ {
1920
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
1921
+ }
1922
+
1923
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
1924
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1925
+ void const* src, size_t srcSize)
1926
+ {
1927
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
1928
+ }
1929
+
1930
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
1931
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1932
+ void const* src, size_t srcSize)
1933
+ {
1934
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
1935
+ }
1936
+
1937
+ FORCE_INLINE_TEMPLATE
1938
+ size_t ZSTD_compressBlock_lazy_extDict_generic(
1939
+ ZSTD_matchState_t* ms, seqStore_t* seqStore,
1940
+ U32 rep[ZSTD_REP_NUM],
1941
+ const void* src, size_t srcSize,
1942
+ const searchMethod_e searchMethod, const U32 depth)
1943
+ {
1944
+ const BYTE* const istart = (const BYTE*)src;
1945
+ const BYTE* ip = istart;
1946
+ const BYTE* anchor = istart;
1947
+ const BYTE* const iend = istart + srcSize;
1948
+ const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
1949
+ const BYTE* const base = ms->window.base;
1950
+ const U32 dictLimit = ms->window.dictLimit;
1951
+ const BYTE* const prefixStart = base + dictLimit;
1952
+ const BYTE* const dictBase = ms->window.dictBase;
1953
+ const BYTE* const dictEnd = dictBase + dictLimit;
1954
+ const BYTE* const dictStart = dictBase + ms->window.lowLimit;
1955
+ const U32 windowLog = ms->cParams.windowLog;
1956
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1957
+
1958
+ typedef size_t (*searchMax_f)(
1959
+ ZSTD_matchState_t* ms,
1960
+ const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1961
+ const searchMax_f searchFuncs[3] = {
1962
+ ZSTD_HcFindBestMatch_extDict_selectMLS,
1963
+ ZSTD_BtFindBestMatch_extDict_selectMLS,
1964
+ ZSTD_RowFindBestMatch_extDict_selectRowLog
1965
+ };
1966
+ searchMax_f searchMax = searchFuncs[(int)searchMethod];
1967
+ U32 offset_1 = rep[0], offset_2 = rep[1];
1968
+
1969
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
1970
+
1971
+ /* init */
1972
+ ip += (ip == prefixStart);
1973
+ if (searchMethod == search_rowHash) {
1974
+ ZSTD_row_fillHashCache(ms, base, rowLog,
1975
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1976
+ ms->nextToUpdate, ilimit);
1977
+ }
1978
+
1979
+ /* Match Loop */
1980
+ #if defined(__GNUC__) && defined(__x86_64__)
1981
+ /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
1982
+ * code alignment is perturbed. To fix the instability, align the loop on a 32-byte boundary.
1983
+ */
1984
+ __asm__(".p2align 5");
1985
+ #endif
1986
+ while (ip < ilimit) {
1987
+ size_t matchLength=0;
1988
+ size_t offset=0;
1989
+ const BYTE* start=ip+1;
1990
+ U32 curr = (U32)(ip-base);
1991
+
1992
+ /* check repCode */
1993
+ { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
1994
+ const U32 repIndex = (U32)(curr+1 - offset_1);
1995
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1996
+ const BYTE* const repMatch = repBase + repIndex;
1997
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
1998
+ & (offset_1 < curr+1 - windowLow) ) /* note: we are searching at curr+1 */
1999
+ if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
2000
+ /* repcode detected we should take it */
2001
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2002
+ matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
2003
+ if (depth==0) goto _storeSequence;
2004
+ } }
2005
+
2006
+ /* first search (depth 0) */
2007
+ { size_t offsetFound = 999999999;
2008
+ size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
2009
+ if (ml2 > matchLength)
2010
+ matchLength = ml2, start = ip, offset=offsetFound;
2011
+ }
2012
+
2013
+ if (matchLength < 4) {
2014
+ ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
2015
+ continue;
2016
+ }
2017
+
2018
+ /* let's try to find a better solution */
2019
+ if (depth>=1)
2020
+ while (ip<ilimit) {
2021
+ ip ++;
2022
+ curr++;
2023
+ /* check repCode */
2024
+ if (offset) {
2025
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
2026
+ const U32 repIndex = (U32)(curr - offset_1);
2027
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
2028
+ const BYTE* const repMatch = repBase + repIndex;
2029
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2030
+ & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2031
+ if (MEM_read32(ip) == MEM_read32(repMatch)) {
2032
+ /* repcode detected */
2033
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2034
+ size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
2035
+ int const gain2 = (int)(repLength * 3);
2036
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
2037
+ if ((repLength >= 4) && (gain2 > gain1))
2038
+ matchLength = repLength, offset = 0, start = ip;
2039
+ } }
2040
+
2041
+ /* search match, depth 1 */
2042
+ { size_t offset2=999999999;
2043
+ size_t const ml2 = searchMax(ms, ip, iend, &offset2);
2044
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
2045
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
2046
+ if ((ml2 >= 4) && (gain2 > gain1)) {
2047
+ matchLength = ml2, offset = offset2, start = ip;
2048
+ continue; /* search a better one */
2049
+ } }
2050
+
2051
+ /* let's find an even better one */
2052
+ if ((depth==2) && (ip<ilimit)) {
2053
+ ip ++;
2054
+ curr++;
2055
+ /* check repCode */
2056
+ if (offset) {
2057
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
2058
+ const U32 repIndex = (U32)(curr - offset_1);
2059
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
2060
+ const BYTE* const repMatch = repBase + repIndex;
2061
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2062
+ & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2063
+ if (MEM_read32(ip) == MEM_read32(repMatch)) {
2064
+ /* repcode detected */
2065
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2066
+ size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
2067
+ int const gain2 = (int)(repLength * 4);
2068
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
2069
+ if ((repLength >= 4) && (gain2 > gain1))
2070
+ matchLength = repLength, offset = 0, start = ip;
2071
+ } }
2072
+
2073
+ /* search match, depth 2 */
2074
+ { size_t offset2=999999999;
2075
+ size_t const ml2 = searchMax(ms, ip, iend, &offset2);
2076
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
2077
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
2078
+ if ((ml2 >= 4) && (gain2 > gain1)) {
2079
+ matchLength = ml2, offset = offset2, start = ip;
2080
+ continue;
2081
+ } } }
2082
+ break; /* nothing found : store previous solution */
2083
+ }
2084
+
2085
+ /* catch up */
2086
+ if (offset) {
2087
+ U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
2088
+ const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
2089
+ const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
2090
+ while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
2091
+ offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
2092
+ }
2093
+
2094
+ /* store sequence */
2095
+ _storeSequence:
2096
+ { size_t const litLength = start - anchor;
2097
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
2098
+ anchor = ip = start + matchLength;
2099
+ }
2100
+
2101
+ /* check immediate repcode */
2102
+ while (ip <= ilimit) {
2103
+ const U32 repCurrent = (U32)(ip-base);
2104
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
2105
+ const U32 repIndex = repCurrent - offset_2;
2106
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
2107
+ const BYTE* const repMatch = repBase + repIndex;
2108
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
2109
+ & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2110
+ if (MEM_read32(ip) == MEM_read32(repMatch)) {
2111
+ /* repcode detected we should take it */
2112
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2113
+ matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
2114
+ offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */
2115
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
2116
+ ip += matchLength;
2117
+ anchor = ip;
2118
+ continue; /* faster when present ... (?) */
2119
+ }
2120
+ break;
2121
+ } }
2122
+
2123
+ /* Save reps for next block */
2124
+ rep[0] = offset_1;
2125
+ rep[1] = offset_2;
2126
+
2127
+ /* Return the last literals size */
2128
+ return (size_t)(iend - anchor);
2129
+ }
2130
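
In extDict mode a match can start in the dictionary segment and spill over into the current prefix, which is why the loop above measures lengths with ZSTD_count_2segments(). A simplified, self-contained sketch of two-segment counting; the helper names and buffers are illustrative, not the library's:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* number of leading bytes that are equal, bounded by both ends */
static size_t countEqual(const uint8_t* a, const uint8_t* b,
                         const uint8_t* aEnd, const uint8_t* bEnd) {
    size_t n = 0;
    while (a + n < aEnd && b + n < bEnd && a[n] == b[n]) n++;
    return n;
}

/* count ip vs match where `match` lives in an "ext" segment ending at mEnd;
 * if the comparison reaches mEnd, continue matching ip against the prefix start */
static size_t count2Segments(const uint8_t* ip, const uint8_t* match,
                             const uint8_t* iEnd, const uint8_t* mEnd,
                             const uint8_t* prefixStart) {
    size_t const seg1 = countEqual(ip, match, iEnd, mEnd);
    if (match + seg1 < mEnd || ip + seg1 >= iEnd) return seg1;
    return seg1 + countEqual(ip + seg1, prefixStart, iEnd, iEnd);
}

int main(void) {
    uint8_t const dict[]   = { 'a', 'b', 'c' };                              /* ext segment */
    uint8_t const prefix[] = { 'd', 'e', 'x', 'a', 'b', 'c', 'd', 'e', 'y' };
    /* ip = "abcdey" at prefix[3]; match = "abc" in dict, continuing as "de" at the prefix start */
    size_t const ml = count2Segments(prefix + 3, dict,
                                     prefix + sizeof(prefix), dict + sizeof(dict), prefix);
    printf("match length = %zu\n", ml);                                      /* prints 5 */
    return 0;
}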
+
2131
+
2132
+ size_t ZSTD_compressBlock_greedy_extDict(
2133
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2134
+ void const* src, size_t srcSize)
2135
+ {
2136
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
2137
+ }
2138
+
2139
+ size_t ZSTD_compressBlock_lazy_extDict(
2140
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2141
+ void const* src, size_t srcSize)
2142
+
2143
+ {
2144
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
2145
+ }
2146
+
2147
+ size_t ZSTD_compressBlock_lazy2_extDict(
2148
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2149
+ void const* src, size_t srcSize)
2150
+
2151
+ {
2152
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
2153
+ }
2154
+
2155
+ size_t ZSTD_compressBlock_btlazy2_extDict(
2156
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2157
+ void const* src, size_t srcSize)
2158
+
2159
+ {
2160
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
2161
+ }
2162
+
2163
+ size_t ZSTD_compressBlock_greedy_extDict_row(
2164
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2165
+ void const* src, size_t srcSize)
2166
+ {
2167
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
2168
+ }
2169
+
2170
+ size_t ZSTD_compressBlock_lazy_extDict_row(
2171
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2172
+ void const* src, size_t srcSize)
2173
+
2174
+ {
2175
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
2176
+ }
2177
+
2178
+ size_t ZSTD_compressBlock_lazy2_extDict_row(
2179
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2180
+ void const* src, size_t srcSize)
2181
+
2182
+ {
2183
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
2184
+ }