zstd-ruby 1.4.1.0 → 1.5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/dependabot.yml +8 -0
- data/.github/workflows/ruby.yml +35 -0
- data/README.md +2 -2
- data/ext/zstdruby/libzstd/BUCK +5 -7
- data/ext/zstdruby/libzstd/Makefile +304 -113
- data/ext/zstdruby/libzstd/README.md +83 -20
- data/ext/zstdruby/libzstd/common/bitstream.h +59 -51
- data/ext/zstdruby/libzstd/common/compiler.h +150 -8
- data/ext/zstdruby/libzstd/common/cpu.h +1 -3
- data/ext/zstdruby/libzstd/common/debug.c +11 -31
- data/ext/zstdruby/libzstd/common/debug.h +22 -49
- data/ext/zstdruby/libzstd/common/entropy_common.c +201 -75
- data/ext/zstdruby/libzstd/common/error_private.c +3 -1
- data/ext/zstdruby/libzstd/common/error_private.h +8 -4
- data/ext/zstdruby/libzstd/common/fse.h +50 -42
- data/ext/zstdruby/libzstd/common/fse_decompress.c +149 -55
- data/ext/zstdruby/libzstd/common/huf.h +43 -39
- data/ext/zstdruby/libzstd/common/mem.h +69 -25
- data/ext/zstdruby/libzstd/common/pool.c +30 -20
- data/ext/zstdruby/libzstd/common/pool.h +3 -3
- data/ext/zstdruby/libzstd/common/threading.c +51 -4
- data/ext/zstdruby/libzstd/common/threading.h +36 -4
- data/ext/zstdruby/libzstd/common/xxhash.c +40 -92
- data/ext/zstdruby/libzstd/common/xxhash.h +12 -32
- data/ext/zstdruby/libzstd/common/zstd_common.c +10 -10
- data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
- data/ext/zstdruby/libzstd/common/zstd_internal.h +230 -111
- data/ext/zstdruby/libzstd/common/zstd_trace.h +154 -0
- data/ext/zstdruby/libzstd/compress/fse_compress.c +47 -63
- data/ext/zstdruby/libzstd/compress/hist.c +41 -63
- data/ext/zstdruby/libzstd/compress/hist.h +13 -33
- data/ext/zstdruby/libzstd/compress/huf_compress.c +332 -193
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +3614 -1696
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +546 -86
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +158 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +29 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +441 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +54 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +572 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
- data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +662 -0
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +43 -41
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +85 -80
- data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1184 -111
- data/ext/zstdruby/libzstd/compress/zstd_lazy.h +59 -1
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +333 -208
- data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
- data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +103 -0
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +228 -129
- data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +151 -440
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +32 -114
- data/ext/zstdruby/libzstd/decompress/huf_decompress.c +395 -276
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +20 -16
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +630 -231
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +606 -380
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +8 -5
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +39 -9
- data/ext/zstdruby/libzstd/deprecated/zbuff.h +9 -8
- data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
- data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +1 -1
- data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +55 -46
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
- data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +43 -31
- data/ext/zstdruby/libzstd/dictBuilder/zdict.c +53 -30
- data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
- data/ext/zstdruby/libzstd/dll/example/README.md +16 -22
- data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +4 -4
- data/ext/zstdruby/libzstd/legacy/zstd_v01.c +24 -14
- data/ext/zstdruby/libzstd/legacy/zstd_v01.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v02.c +17 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v02.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v03.c +17 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v03.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v04.c +25 -11
- data/ext/zstdruby/libzstd/legacy/zstd_v04.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v05.c +43 -32
- data/ext/zstdruby/libzstd/legacy/zstd_v05.h +2 -2
- data/ext/zstdruby/libzstd/legacy/zstd_v06.c +27 -19
- data/ext/zstdruby/libzstd/legacy/zstd_v06.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v07.c +32 -20
- data/ext/zstdruby/libzstd/legacy/zstd_v07.h +1 -1
- data/ext/zstdruby/libzstd/libzstd.pc.in +2 -1
- data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +201 -31
- data/ext/zstdruby/libzstd/zstd.h +740 -153
- data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +3 -1
- data/lib/zstd-ruby/version.rb +1 -1
- data/zstd-ruby.gemspec +1 -1
- metadata +21 -10
- data/.travis.yml +0 -14
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/*
|
|
2
|
-
* Copyright (c)
|
|
2
|
+
* Copyright (c) Yann Collet, Facebook, Inc.
|
|
3
3
|
* All rights reserved.
|
|
4
4
|
*
|
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
extern "C" {
|
|
16
16
|
#endif
|
|
17
17
|
|
|
18
|
-
#include "mem.h" /* U32 */
|
|
18
|
+
#include "../common/mem.h" /* U32 */
|
|
19
19
|
#include "zstd_compress_internal.h"
|
|
20
20
|
|
|
21
21
|
void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/*
|
|
2
|
-
* Copyright (c)
|
|
2
|
+
* Copyright (c) Yann Collet, Facebook, Inc.
|
|
3
3
|
* All rights reserved.
|
|
4
4
|
*
|
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -58,11 +58,11 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
|
|
|
58
58
|
|
|
59
59
|
/** ZSTD_insertDUBT1() :
|
|
60
60
|
* sort one already inserted but unsorted position
|
|
61
|
-
* assumption :
|
|
61
|
+
* assumption : curr >= btlow == (curr - btmask)
|
|
62
62
|
* doesn't fail */
|
|
63
63
|
static void
|
|
64
64
|
ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
|
|
65
|
-
U32
|
|
65
|
+
U32 curr, const BYTE* inputEnd,
|
|
66
66
|
U32 nbCompares, U32 btLow,
|
|
67
67
|
const ZSTD_dictMode_e dictMode)
|
|
68
68
|
{
|
|
@@ -74,41 +74,41 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
|
|
|
74
74
|
const BYTE* const base = ms->window.base;
|
|
75
75
|
const BYTE* const dictBase = ms->window.dictBase;
|
|
76
76
|
const U32 dictLimit = ms->window.dictLimit;
|
|
77
|
-
const BYTE* const ip = (
|
|
78
|
-
const BYTE* const iend = (
|
|
77
|
+
const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
|
|
78
|
+
const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
|
|
79
79
|
const BYTE* const dictEnd = dictBase + dictLimit;
|
|
80
80
|
const BYTE* const prefixStart = base + dictLimit;
|
|
81
81
|
const BYTE* match;
|
|
82
|
-
U32* smallerPtr = bt + 2*(
|
|
82
|
+
U32* smallerPtr = bt + 2*(curr&btMask);
|
|
83
83
|
U32* largerPtr = smallerPtr + 1;
|
|
84
84
|
U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
|
|
85
85
|
U32 dummy32; /* to be nullified at the end */
|
|
86
86
|
U32 const windowValid = ms->window.lowLimit;
|
|
87
87
|
U32 const maxDistance = 1U << cParams->windowLog;
|
|
88
|
-
U32 const windowLow = (
|
|
88
|
+
U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;
|
|
89
89
|
|
|
90
90
|
|
|
91
91
|
DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
|
|
92
|
-
|
|
93
|
-
assert(
|
|
92
|
+
curr, dictLimit, windowLow);
|
|
93
|
+
assert(curr >= btLow);
|
|
94
94
|
assert(ip < iend); /* condition for ZSTD_count */
|
|
95
95
|
|
|
96
96
|
while (nbCompares-- && (matchIndex > windowLow)) {
|
|
97
97
|
U32* const nextPtr = bt + 2*(matchIndex & btMask);
|
|
98
98
|
size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
|
|
99
|
-
assert(matchIndex <
|
|
99
|
+
assert(matchIndex < curr);
|
|
100
100
|
/* note : all candidates are now supposed sorted,
|
|
101
101
|
* but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
|
|
102
102
|
* when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
|
|
103
103
|
|
|
104
104
|
if ( (dictMode != ZSTD_extDict)
|
|
105
105
|
|| (matchIndex+matchLength >= dictLimit) /* both in current segment*/
|
|
106
|
-
|| (
|
|
106
|
+
|| (curr < dictLimit) /* both in extDict */) {
|
|
107
107
|
const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
|
|
108
108
|
|| (matchIndex+matchLength >= dictLimit)) ?
|
|
109
109
|
base : dictBase;
|
|
110
110
|
assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */
|
|
111
|
-
|| (
|
|
111
|
+
|| (curr < dictLimit) );
|
|
112
112
|
match = mBase + matchIndex;
|
|
113
113
|
matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
|
|
114
114
|
} else {
|
|
@@ -119,7 +119,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
|
|
|
119
119
|
}
|
|
120
120
|
|
|
121
121
|
DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
|
|
122
|
-
|
|
122
|
+
curr, matchIndex, (U32)matchLength);
|
|
123
123
|
|
|
124
124
|
if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
|
|
125
125
|
break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
|
|
@@ -168,7 +168,7 @@ ZSTD_DUBT_findBetterDictMatch (
|
|
|
168
168
|
|
|
169
169
|
const BYTE* const base = ms->window.base;
|
|
170
170
|
const BYTE* const prefixStart = base + ms->window.dictLimit;
|
|
171
|
-
U32 const
|
|
171
|
+
U32 const curr = (U32)(ip-base);
|
|
172
172
|
const BYTE* const dictBase = dms->window.base;
|
|
173
173
|
const BYTE* const dictEnd = dms->window.nextSrc;
|
|
174
174
|
U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
|
|
@@ -195,10 +195,10 @@ ZSTD_DUBT_findBetterDictMatch (
|
|
|
195
195
|
|
|
196
196
|
if (matchLength > bestLength) {
|
|
197
197
|
U32 matchIndex = dictMatchIndex + dictIndexDelta;
|
|
198
|
-
if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(
|
|
198
|
+
if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
|
|
199
199
|
DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
|
|
200
|
-
|
|
201
|
-
bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE +
|
|
200
|
+
curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
|
|
201
|
+
bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
|
|
202
202
|
}
|
|
203
203
|
if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
|
|
204
204
|
break; /* drop, to guarantee consistency (miss a little bit of compression) */
|
|
@@ -218,9 +218,9 @@ ZSTD_DUBT_findBetterDictMatch (
|
|
|
218
218
|
}
|
|
219
219
|
|
|
220
220
|
if (bestLength >= MINMATCH) {
|
|
221
|
-
U32 const mIndex =
|
|
221
|
+
U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
|
|
222
222
|
DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
|
|
223
|
-
|
|
223
|
+
curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
|
|
224
224
|
}
|
|
225
225
|
return bestLength;
|
|
226
226
|
|
|
@@ -241,15 +241,13 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
|
|
|
241
241
|
U32 matchIndex = hashTable[h];
|
|
242
242
|
|
|
243
243
|
const BYTE* const base = ms->window.base;
|
|
244
|
-
U32 const
|
|
245
|
-
U32 const
|
|
246
|
-
U32 const windowValid = ms->window.lowLimit;
|
|
247
|
-
U32 const windowLow = (current - windowValid > maxDistance) ? current - maxDistance : windowValid;
|
|
244
|
+
U32 const curr = (U32)(ip-base);
|
|
245
|
+
U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);
|
|
248
246
|
|
|
249
247
|
U32* const bt = ms->chainTable;
|
|
250
248
|
U32 const btLog = cParams->chainLog - 1;
|
|
251
249
|
U32 const btMask = (1 << btLog) - 1;
|
|
252
|
-
U32 const btLow = (btMask >=
|
|
250
|
+
U32 const btLow = (btMask >= curr) ? 0 : curr - btMask;
|
|
253
251
|
U32 const unsortLimit = MAX(btLow, windowLow);
|
|
254
252
|
|
|
255
253
|
U32* nextCandidate = bt + 2*(matchIndex&btMask);
|
|
@@ -258,8 +256,9 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
|
|
|
258
256
|
U32 nbCandidates = nbCompares;
|
|
259
257
|
U32 previousCandidate = 0;
|
|
260
258
|
|
|
261
|
-
DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ",
|
|
259
|
+
DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
|
|
262
260
|
assert(ip <= iend-8); /* required for h calculation */
|
|
261
|
+
assert(dictMode != ZSTD_dedicatedDictSearch);
|
|
263
262
|
|
|
264
263
|
/* reach end of unsorted candidates list */
|
|
265
264
|
while ( (matchIndex > unsortLimit)
|
|
@@ -301,14 +300,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
|
|
|
301
300
|
const U32 dictLimit = ms->window.dictLimit;
|
|
302
301
|
const BYTE* const dictEnd = dictBase + dictLimit;
|
|
303
302
|
const BYTE* const prefixStart = base + dictLimit;
|
|
304
|
-
U32* smallerPtr = bt + 2*(
|
|
305
|
-
U32* largerPtr = bt + 2*(
|
|
306
|
-
U32 matchEndIdx =
|
|
303
|
+
U32* smallerPtr = bt + 2*(curr&btMask);
|
|
304
|
+
U32* largerPtr = bt + 2*(curr&btMask) + 1;
|
|
305
|
+
U32 matchEndIdx = curr + 8 + 1;
|
|
307
306
|
U32 dummy32; /* to be nullified at the end */
|
|
308
307
|
size_t bestLength = 0;
|
|
309
308
|
|
|
310
309
|
matchIndex = hashTable[h];
|
|
311
|
-
hashTable[h] =
|
|
310
|
+
hashTable[h] = curr; /* Update Hash Table */
|
|
312
311
|
|
|
313
312
|
while (nbCompares-- && (matchIndex > windowLow)) {
|
|
314
313
|
U32* const nextPtr = bt + 2*(matchIndex & btMask);
|
|
@@ -328,8 +327,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
|
|
|
328
327
|
if (matchLength > bestLength) {
|
|
329
328
|
if (matchLength > matchEndIdx - matchIndex)
|
|
330
329
|
matchEndIdx = matchIndex + (U32)matchLength;
|
|
331
|
-
if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(
|
|
332
|
-
bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE +
|
|
330
|
+
if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
|
|
331
|
+
bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
|
|
333
332
|
if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
|
|
334
333
|
if (dictMode == ZSTD_dictMatchState) {
|
|
335
334
|
nbCompares = 0; /* in addition to avoiding checking any
|
|
@@ -365,12 +364,12 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
|
|
|
365
364
|
mls, dictMode);
|
|
366
365
|
}
|
|
367
366
|
|
|
368
|
-
assert(matchEndIdx >
|
|
367
|
+
assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
|
|
369
368
|
ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
|
|
370
369
|
if (bestLength >= MINMATCH) {
|
|
371
|
-
U32 const mIndex =
|
|
370
|
+
U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
|
|
372
371
|
DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
|
|
373
|
-
|
|
372
|
+
curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
|
|
374
373
|
}
|
|
375
374
|
return bestLength;
|
|
376
375
|
}
|
|
@@ -439,6 +438,220 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
|
|
|
439
438
|
}
|
|
440
439
|
}
|
|
441
440
|
|
|
441
|
+
/***********************************
|
|
442
|
+
* Dedicated dict search
|
|
443
|
+
***********************************/
|
|
444
|
+
|
|
445
|
+
void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
|
|
446
|
+
{
|
|
447
|
+
const BYTE* const base = ms->window.base;
|
|
448
|
+
U32 const target = (U32)(ip - base);
|
|
449
|
+
U32* const hashTable = ms->hashTable;
|
|
450
|
+
U32* const chainTable = ms->chainTable;
|
|
451
|
+
U32 const chainSize = 1 << ms->cParams.chainLog;
|
|
452
|
+
U32 idx = ms->nextToUpdate;
|
|
453
|
+
U32 const minChain = chainSize < target ? target - chainSize : idx;
|
|
454
|
+
U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
|
|
455
|
+
U32 const cacheSize = bucketSize - 1;
|
|
456
|
+
U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
|
|
457
|
+
U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
|
|
458
|
+
|
|
459
|
+
/* We know the hashtable is oversized by a factor of `bucketSize`.
|
|
460
|
+
* We are going to temporarily pretend `bucketSize == 1`, keeping only a
|
|
461
|
+
* single entry. We will use the rest of the space to construct a temporary
|
|
462
|
+
* chaintable.
|
|
463
|
+
*/
|
|
464
|
+
U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
|
|
465
|
+
U32* const tmpHashTable = hashTable;
|
|
466
|
+
U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
|
|
467
|
+
U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
|
|
468
|
+
U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
|
|
469
|
+
U32 hashIdx;
|
|
470
|
+
|
|
471
|
+
assert(ms->cParams.chainLog <= 24);
|
|
472
|
+
assert(ms->cParams.hashLog > ms->cParams.chainLog);
|
|
473
|
+
assert(idx != 0);
|
|
474
|
+
assert(tmpMinChain <= minChain);
|
|
475
|
+
|
|
476
|
+
/* fill conventional hash table and conventional chain table */
|
|
477
|
+
for ( ; idx < target; idx++) {
|
|
478
|
+
U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
|
|
479
|
+
if (idx >= tmpMinChain) {
|
|
480
|
+
tmpChainTable[idx - tmpMinChain] = hashTable[h];
|
|
481
|
+
}
|
|
482
|
+
tmpHashTable[h] = idx;
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
/* sort chains into ddss chain table */
|
|
486
|
+
{
|
|
487
|
+
U32 chainPos = 0;
|
|
488
|
+
for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
|
|
489
|
+
U32 count;
|
|
490
|
+
U32 countBeyondMinChain = 0;
|
|
491
|
+
U32 i = tmpHashTable[hashIdx];
|
|
492
|
+
for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
|
|
493
|
+
/* skip through the chain to the first position that won't be
|
|
494
|
+
* in the hash cache bucket */
|
|
495
|
+
if (i < minChain) {
|
|
496
|
+
countBeyondMinChain++;
|
|
497
|
+
}
|
|
498
|
+
i = tmpChainTable[i - tmpMinChain];
|
|
499
|
+
}
|
|
500
|
+
if (count == cacheSize) {
|
|
501
|
+
for (count = 0; count < chainLimit;) {
|
|
502
|
+
if (i < minChain) {
|
|
503
|
+
if (!i || ++countBeyondMinChain > cacheSize) {
|
|
504
|
+
/* only allow pulling `cacheSize` number of entries
|
|
505
|
+
* into the cache or chainTable beyond `minChain`,
|
|
506
|
+
* to replace the entries pulled out of the
|
|
507
|
+
* chainTable into the cache. This lets us reach
|
|
508
|
+
* back further without increasing the total number
|
|
509
|
+
* of entries in the chainTable, guaranteeing the
|
|
510
|
+
* DDSS chain table will fit into the space
|
|
511
|
+
* allocated for the regular one. */
|
|
512
|
+
break;
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
chainTable[chainPos++] = i;
|
|
516
|
+
count++;
|
|
517
|
+
if (i < tmpMinChain) {
|
|
518
|
+
break;
|
|
519
|
+
}
|
|
520
|
+
i = tmpChainTable[i - tmpMinChain];
|
|
521
|
+
}
|
|
522
|
+
} else {
|
|
523
|
+
count = 0;
|
|
524
|
+
}
|
|
525
|
+
if (count) {
|
|
526
|
+
tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
|
|
527
|
+
} else {
|
|
528
|
+
tmpHashTable[hashIdx] = 0;
|
|
529
|
+
}
|
|
530
|
+
}
|
|
531
|
+
assert(chainPos <= chainSize); /* I believe this is guaranteed... */
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
/* move chain pointers into the last entry of each hash bucket */
|
|
535
|
+
for (hashIdx = (1 << hashLog); hashIdx; ) {
|
|
536
|
+
U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
|
|
537
|
+
U32 const chainPackedPointer = tmpHashTable[hashIdx];
|
|
538
|
+
U32 i;
|
|
539
|
+
for (i = 0; i < cacheSize; i++) {
|
|
540
|
+
hashTable[bucketIdx + i] = 0;
|
|
541
|
+
}
|
|
542
|
+
hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
/* fill the buckets of the hash table */
|
|
546
|
+
for (idx = ms->nextToUpdate; idx < target; idx++) {
|
|
547
|
+
U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
|
|
548
|
+
<< ZSTD_LAZY_DDSS_BUCKET_LOG;
|
|
549
|
+
U32 i;
|
|
550
|
+
/* Shift hash cache down 1. */
|
|
551
|
+
for (i = cacheSize - 1; i; i--)
|
|
552
|
+
hashTable[h + i] = hashTable[h + i - 1];
|
|
553
|
+
hashTable[h] = idx;
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
ms->nextToUpdate = target;
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
/* Returns the longest match length found in the dedicated dict search structure.
|
|
560
|
+
* If none are longer than the argument ml, then ml will be returned.
|
|
561
|
+
*/
|
|
562
|
+
FORCE_INLINE_TEMPLATE
|
|
563
|
+
size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
|
|
564
|
+
const ZSTD_matchState_t* const dms,
|
|
565
|
+
const BYTE* const ip, const BYTE* const iLimit,
|
|
566
|
+
const BYTE* const prefixStart, const U32 curr,
|
|
567
|
+
const U32 dictLimit, const size_t ddsIdx) {
|
|
568
|
+
const U32 ddsLowestIndex = dms->window.dictLimit;
|
|
569
|
+
const BYTE* const ddsBase = dms->window.base;
|
|
570
|
+
const BYTE* const ddsEnd = dms->window.nextSrc;
|
|
571
|
+
const U32 ddsSize = (U32)(ddsEnd - ddsBase);
|
|
572
|
+
const U32 ddsIndexDelta = dictLimit - ddsSize;
|
|
573
|
+
const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
|
|
574
|
+
const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
|
|
575
|
+
U32 ddsAttempt;
|
|
576
|
+
U32 matchIndex;
|
|
577
|
+
|
|
578
|
+
for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
|
|
579
|
+
PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
|
|
580
|
+
}
|
|
581
|
+
|
|
582
|
+
{
|
|
583
|
+
U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
|
|
584
|
+
U32 const chainIndex = chainPackedPointer >> 8;
|
|
585
|
+
|
|
586
|
+
PREFETCH_L1(&dms->chainTable[chainIndex]);
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
|
|
590
|
+
size_t currentMl=0;
|
|
591
|
+
const BYTE* match;
|
|
592
|
+
matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
|
|
593
|
+
match = ddsBase + matchIndex;
|
|
594
|
+
|
|
595
|
+
if (!matchIndex) {
|
|
596
|
+
return ml;
|
|
597
|
+
}
|
|
598
|
+
|
|
599
|
+
/* guaranteed by table construction */
|
|
600
|
+
(void)ddsLowestIndex;
|
|
601
|
+
assert(matchIndex >= ddsLowestIndex);
|
|
602
|
+
assert(match+4 <= ddsEnd);
|
|
603
|
+
if (MEM_read32(match) == MEM_read32(ip)) {
|
|
604
|
+
/* assumption : matchIndex <= dictLimit-4 (by table construction) */
|
|
605
|
+
currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
|
|
606
|
+
}
|
|
607
|
+
|
|
608
|
+
/* save best solution */
|
|
609
|
+
if (currentMl > ml) {
|
|
610
|
+
ml = currentMl;
|
|
611
|
+
*offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
|
|
612
|
+
if (ip+currentMl == iLimit) {
|
|
613
|
+
/* best possible, avoids read overflow on next attempt */
|
|
614
|
+
return ml;
|
|
615
|
+
}
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
{
|
|
620
|
+
U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
|
|
621
|
+
U32 chainIndex = chainPackedPointer >> 8;
|
|
622
|
+
U32 const chainLength = chainPackedPointer & 0xFF;
|
|
623
|
+
U32 const chainAttempts = nbAttempts - ddsAttempt;
|
|
624
|
+
U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
|
|
625
|
+
U32 chainAttempt;
|
|
626
|
+
|
|
627
|
+
for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
|
|
628
|
+
PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
|
|
629
|
+
}
|
|
630
|
+
|
|
631
|
+
for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
|
|
632
|
+
size_t currentMl=0;
|
|
633
|
+
const BYTE* match;
|
|
634
|
+
matchIndex = dms->chainTable[chainIndex];
|
|
635
|
+
match = ddsBase + matchIndex;
|
|
636
|
+
|
|
637
|
+
/* guaranteed by table construction */
|
|
638
|
+
assert(matchIndex >= ddsLowestIndex);
|
|
639
|
+
assert(match+4 <= ddsEnd);
|
|
640
|
+
if (MEM_read32(match) == MEM_read32(ip)) {
|
|
641
|
+
/* assumption : matchIndex <= dictLimit-4 (by table construction) */
|
|
642
|
+
currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
|
|
643
|
+
}
|
|
644
|
+
|
|
645
|
+
/* save best solution */
|
|
646
|
+
if (currentMl > ml) {
|
|
647
|
+
ml = currentMl;
|
|
648
|
+
*offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
|
|
649
|
+
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
|
|
650
|
+
}
|
|
651
|
+
}
|
|
652
|
+
}
|
|
653
|
+
return ml;
|
|
654
|
+
}
|
|
442
655
|
|
|
443
656
|
|
|
444
657
|
/* *********************************
|
|
@@ -448,7 +661,7 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
|
|
|
448
661
|
|
|
449
662
|
/* Update chains up to ip (excluded)
|
|
450
663
|
Assumption : always within prefix (i.e. not within extDict) */
|
|
451
|
-
|
|
664
|
+
FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
|
|
452
665
|
ZSTD_matchState_t* ms,
|
|
453
666
|
const ZSTD_compressionParameters* const cParams,
|
|
454
667
|
const BYTE* ip, U32 const mls)
|
|
@@ -477,7 +690,6 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
|
|
|
477
690
|
return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
|
|
478
691
|
}
|
|
479
692
|
|
|
480
|
-
|
|
481
693
|
/* inlining is important to hardwire a hot branch (template emulation) */
|
|
482
694
|
FORCE_INLINE_TEMPLATE
|
|
483
695
|
size_t ZSTD_HcFindBestMatch_generic (
|
|
@@ -495,18 +707,33 @@ size_t ZSTD_HcFindBestMatch_generic (
|
|
|
495
707
|
const U32 dictLimit = ms->window.dictLimit;
|
|
496
708
|
const BYTE* const prefixStart = base + dictLimit;
|
|
497
709
|
const BYTE* const dictEnd = dictBase + dictLimit;
|
|
498
|
-
const U32
|
|
710
|
+
const U32 curr = (U32)(ip-base);
|
|
499
711
|
const U32 maxDistance = 1U << cParams->windowLog;
|
|
500
|
-
const U32
|
|
501
|
-
const U32
|
|
502
|
-
const U32
|
|
712
|
+
const U32 lowestValid = ms->window.lowLimit;
|
|
713
|
+
const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
|
|
714
|
+
const U32 isDictionary = (ms->loadedDictEnd != 0);
|
|
715
|
+
const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
|
|
716
|
+
const U32 minChain = curr > chainSize ? curr - chainSize : 0;
|
|
503
717
|
U32 nbAttempts = 1U << cParams->searchLog;
|
|
504
718
|
size_t ml=4-1;
|
|
505
719
|
|
|
720
|
+
const ZSTD_matchState_t* const dms = ms->dictMatchState;
|
|
721
|
+
const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
|
|
722
|
+
? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
|
|
723
|
+
const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
|
|
724
|
+
? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
|
|
725
|
+
|
|
726
|
+
U32 matchIndex;
|
|
727
|
+
|
|
728
|
+
if (dictMode == ZSTD_dedicatedDictSearch) {
|
|
729
|
+
const U32* entry = &dms->hashTable[ddsIdx];
|
|
730
|
+
PREFETCH_L1(entry);
|
|
731
|
+
}
|
|
732
|
+
|
|
506
733
|
/* HC4 match finder */
|
|
507
|
-
|
|
734
|
+
matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
|
|
508
735
|
|
|
509
|
-
for ( ; (matchIndex
|
|
736
|
+
for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
|
|
510
737
|
size_t currentMl=0;
|
|
511
738
|
if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
|
|
512
739
|
const BYTE* const match = base + matchIndex;
|
|
@@ -523,7 +750,7 @@ size_t ZSTD_HcFindBestMatch_generic (
|
|
|
523
750
|
/* save best solution */
|
|
524
751
|
if (currentMl > ml) {
|
|
525
752
|
ml = currentMl;
|
|
526
|
-
*offsetPtr =
|
|
753
|
+
*offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
|
|
527
754
|
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
|
|
528
755
|
}
|
|
529
756
|
|
|
@@ -531,8 +758,10 @@ size_t ZSTD_HcFindBestMatch_generic (
|
|
|
531
758
|
matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
|
|
532
759
|
}
|
|
533
760
|
|
|
534
|
-
if (dictMode ==
|
|
535
|
-
|
|
761
|
+
if (dictMode == ZSTD_dedicatedDictSearch) {
|
|
762
|
+
ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
|
|
763
|
+
ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
|
|
764
|
+
} else if (dictMode == ZSTD_dictMatchState) {
|
|
536
765
|
const U32* const dmsChainTable = dms->chainTable;
|
|
537
766
|
const U32 dmsChainSize = (1 << dms->cParams.chainLog);
|
|
538
767
|
const U32 dmsChainMask = dmsChainSize - 1;
|
|
@@ -545,7 +774,7 @@ size_t ZSTD_HcFindBestMatch_generic (
|
|
|
545
774
|
|
|
546
775
|
matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
|
|
547
776
|
|
|
548
|
-
for ( ; (matchIndex
|
|
777
|
+
for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
|
|
549
778
|
size_t currentMl=0;
|
|
550
779
|
const BYTE* const match = dmsBase + matchIndex;
|
|
551
780
|
assert(match+4 <= dmsEnd);
|
|
@@ -555,11 +784,12 @@ size_t ZSTD_HcFindBestMatch_generic (
|
|
|
555
784
|
/* save best solution */
|
|
556
785
|
if (currentMl > ml) {
|
|
557
786
|
ml = currentMl;
|
|
558
|
-
*offsetPtr =
|
|
787
|
+
*offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
|
|
559
788
|
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
|
|
560
789
|
}
|
|
561
790
|
|
|
562
791
|
if (matchIndex <= dmsMinChain) break;
|
|
792
|
+
|
|
563
793
|
matchIndex = dmsChainTable[matchIndex & dmsChainMask];
|
|
564
794
|
}
|
|
565
795
|
}
|
|
@@ -600,6 +830,22 @@ static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
|
|
|
600
830
|
}
|
|
601
831
|
|
|
602
832
|
|
|
833
|
+
static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS (
|
|
834
|
+
ZSTD_matchState_t* ms,
|
|
835
|
+
const BYTE* ip, const BYTE* const iLimit,
|
|
836
|
+
size_t* offsetPtr)
|
|
837
|
+
{
|
|
838
|
+
switch(ms->cParams.minMatch)
|
|
839
|
+
{
|
|
840
|
+
default : /* includes case 3 */
|
|
841
|
+
case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch);
|
|
842
|
+
case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch);
|
|
843
|
+
case 7 :
|
|
844
|
+
case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch);
|
|
845
|
+
}
|
|
846
|
+
}
|
|
847
|
+
|
|
848
|
+
|
|
603
849
|
FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
|
|
604
850
|
ZSTD_matchState_t* ms,
|
|
605
851
|
const BYTE* ip, const BYTE* const iLimit,
|
|
@@ -615,73 +861,765 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
|
|
|
615
861
|
}
|
|
616
862
|
}
|
|
617
863
|
|
|
864
|
+
/* *********************************
|
|
865
|
+
* (SIMD) Row-based matchfinder
|
|
866
|
+
***********************************/
|
|
867
|
+
/* Constants for row-based hash */
|
|
868
|
+
#define ZSTD_ROW_HASH_TAG_OFFSET 1 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
|
|
869
|
+
#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
|
|
870
|
+
#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
|
|
871
|
+
|
|
872
|
+
#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
|
|
873
|
+
|
|
874
|
+
typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */
|
|
875
|
+
|
|
876
|
+
#if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */
|
|
877
|
+
|
|
878
|
+
#include <emmintrin.h>
|
|
879
|
+
typedef __m128i ZSTD_Vec128;
|
|
880
|
+
|
|
881
|
+
/* Returns a 128-bit container with 128-bits from src */
|
|
882
|
+
static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
|
|
883
|
+
return _mm_loadu_si128((ZSTD_Vec128 const*)src);
|
|
884
|
+
}
|
|
885
|
+
|
|
886
|
+
/* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */
|
|
887
|
+
static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
|
|
888
|
+
return _mm_set1_epi8((char)val);
|
|
889
|
+
}
|
|
890
|
+
|
|
891
|
+
/* Do byte-by-byte comparison result of x and y. Then collapse 128-bit resultant mask
|
|
892
|
+
* into a 32-bit mask that is the MSB of each byte.
|
|
893
|
+
* */
|
|
894
|
+
static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
|
|
895
|
+
return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
|
|
896
|
+
}
|
|
897
|
+
|
|
898
|
+
typedef struct {
|
|
899
|
+
__m128i fst;
|
|
900
|
+
__m128i snd;
|
|
901
|
+
} ZSTD_Vec256;
|
|
902
|
+
|
|
903
|
+
static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
|
|
904
|
+
ZSTD_Vec256 v;
|
|
905
|
+
v.fst = ZSTD_Vec128_read(ptr);
|
|
906
|
+
v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
|
|
907
|
+
return v;
|
|
908
|
+
}
|
|
909
|
+
|
|
910
|
+
/* ZSTD_Vec256_set8() (SSE2):
 * Returns a 256-bit vector whose 32 byte lanes all hold "val". */
static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
    ZSTD_Vec128 const broadcast = ZSTD_Vec128_set8(val);
    ZSTD_Vec256 result;
    result.fst = broadcast;
    result.snd = broadcast;
    return result;
}
|
|
916
|
+
|
|
917
|
+
/* ZSTD_Vec256_cmpMask8() (SSE2):
 * Byte-wise equality mask over 32 bytes: the comparison of the "fst" halves
 * fills bits [0, 16), the comparison of the "snd" halves fills bits [16, 32). */
static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
    ZSTD_VecMask const lowHalf  = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
    ZSTD_VecMask const highHalf = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
    return (highHalf << 16) | lowHalf;
}
|
|
924
|
+
|
|
925
|
+
#elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */
|
|
926
|
+
|
|
927
|
+
#include <arm_neon.h>
|
|
928
|
+
typedef uint8x16_t ZSTD_Vec128;
|
|
929
|
+
|
|
930
|
+
/* ZSTD_Vec128_read() (NEON):
 * Loads the 16 bytes starting at src into a uint8x16_t vector. */
static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
    const BYTE* const bytes = (const BYTE*)src;
    return vld1q_u8(bytes);
}
|
|
933
|
+
|
|
934
|
+
/* ZSTD_Vec128_set8() (NEON):
 * Returns a vector whose 16 byte lanes all hold "val". */
static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
    ZSTD_Vec128 const broadcast = vdupq_n_u8(val);
    return broadcast;
}
|
|
937
|
+
|
|
938
|
+
/* ZSTD_vmovmaskq_u8() (NEON):
 * Mimics '_mm_movemask_epi8()' from SSE: gathers the MSB of each of the
 * 16 bytes of val into a 16-bit mask (bit n <- MSB of byte n).
 */
static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) {
    /* Move the MSB of every byte down to bit 0 of that byte; all other bits become 0 */
    uint16x8_t const msbPerByte = vreinterpretq_u16_u8(vshrq_n_u8(val, 7));
    /* Each shift-right-and-accumulate folds the upper half of a lane onto its lower half,
     * doubling the number of gathered bits held in the lane's lowest byte: 2, then 4, then 8 */
    uint32x4_t const fold16 = vreinterpretq_u32_u16(vsraq_n_u16(msbPerByte, msbPerByte, 7));
    uint64x2_t const fold32 = vreinterpretq_u64_u32(vsraq_n_u32(fold16, fold16, 14));
    uint8x16_t const fold64 = vreinterpretq_u8_u64(vsraq_n_u64(fold32, fold32, 28));
    /* The lowest byte of each 64-bit half now carries 8 mask bits; merge the two */
    U32 const lowByte  = vgetq_lane_u8(fold64, 0);
    U32 const highByte = vgetq_lane_u8(fold64, 8);
    return lowByte | (highByte << 8);
}
|
|
949
|
+
|
|
950
|
+
/* ZSTD_Vec128_cmpMask8() (NEON):
 * Byte-wise equality of x and y, collapsed into a mask in which
 * bit n is set when byte n of x equals byte n of y. */
static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
    ZSTD_Vec128 const equalBytes = vceqq_u8(x, y);   /* 0xFF where equal, 0x00 elsewhere */
    U32 const mask = ZSTD_vmovmaskq_u8(equalBytes);
    return (ZSTD_VecMask)mask;
}
|
|
953
|
+
|
|
954
|
+
/* 256-bit vector emulated with a pair of 128-bit NEON registers */
typedef struct {
    uint8x16_t fst;   /* bytes [0, 16) */
    uint8x16_t snd;   /* bytes [16, 32) */
} ZSTD_Vec256;

/* ZSTD_Vec256_read() (NEON):
 * Loads 32 consecutive bytes from ptr as two 16-byte reads. */
static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
    const ZSTD_Vec128* const halves = (const ZSTD_Vec128*)ptr;
    ZSTD_Vec256 result;
    result.fst = ZSTD_Vec128_read(halves);
    result.snd = ZSTD_Vec128_read(halves + 1);
    return result;
}
|
|
965
|
+
|
|
966
|
+
/* ZSTD_Vec256_set8() (NEON):
 * Returns a 256-bit vector whose 32 byte lanes all hold "val". */
static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
    ZSTD_Vec128 const broadcast = ZSTD_Vec128_set8(val);
    ZSTD_Vec256 result;
    result.fst = broadcast;
    result.snd = broadcast;
    return result;
}
|
|
972
|
+
|
|
973
|
+
/* ZSTD_Vec256_cmpMask8() (NEON):
 * Byte-wise equality mask over 32 bytes: the comparison of the "fst" halves
 * fills bits [0, 16), the comparison of the "snd" halves fills bits [16, 32). */
static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
    ZSTD_VecMask const lowHalf  = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
    ZSTD_VecMask const highHalf = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
    return (highHalf << 16) | lowHalf;
}
|
|
980
|
+
|
|
981
|
+
#else /* Scalar fallback version */
|
|
982
|
+
|
|
983
|
+
/* Number of size_t words needed to hold 128 bits */
#define VEC128_NB_SIZE_T (16 / sizeof(size_t))

/* Scalar stand-in for a 128-bit vector: 16 bytes stored as machine words */
typedef struct {
    size_t vec[VEC128_NB_SIZE_T];
} ZSTD_Vec128;

/* ZSTD_Vec128_read() (scalar):
 * Copies the 16 bytes starting at src into the word array. */
static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
    ZSTD_Vec128 loaded;
    ZSTD_memcpy(loaded.vec, src, sizeof(loaded.vec));
    return loaded;
}
|
|
993
|
+
|
|
994
|
+
/* ZSTD_Vec128_set8() (scalar):
 * Returns a vector in which every byte of every word equals "val". */
static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
    ZSTD_Vec128 filled;
    size_t word = 0;
    size_t byteIdx;
    unsigned w;

    /* replicate val into each byte of one machine word */
    for (byteIdx = 0; byteIdx < sizeof(size_t); ++byteIdx) {
        word |= (size_t)val << (byteIdx * 8);
    }
    /* then store that word into every slot */
    for (w = 0; w < VEC128_NB_SIZE_T; ++w) {
        filled.vec[w] = word;
    }
    return filled;
}
|
|
1005
|
+
|
|
1006
|
+
/* Compare x to y, byte by byte, generating a "matches" bitfield */
|
|
1007
|
+
static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
|
|
1008
|
+
ZSTD_VecMask res = 0;
|
|
1009
|
+
unsigned i = 0;
|
|
1010
|
+
unsigned l = 0;
|
|
1011
|
+
for (; i < VEC128_NB_SIZE_T; ++i) {
|
|
1012
|
+
const size_t cmp1 = x.vec[i];
|
|
1013
|
+
const size_t cmp2 = y.vec[i];
|
|
1014
|
+
unsigned j = 0;
|
|
1015
|
+
for (; j < sizeof(size_t); ++j, ++l) {
|
|
1016
|
+
if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
|
|
1017
|
+
res |= ((U32)1 << (j+i*sizeof(size_t)));
|
|
1018
|
+
}
|
|
1019
|
+
}
|
|
1020
|
+
}
|
|
1021
|
+
return res;
|
|
1022
|
+
}
|
|
1023
|
+
|
|
1024
|
+
#define VEC256_NB_SIZE_T 2*VEC128_NB_SIZE_T
|
|
1025
|
+
typedef struct {
|
|
1026
|
+
size_t vec[VEC256_NB_SIZE_T];
|
|
1027
|
+
} ZSTD_Vec256;
|
|
1028
|
+
|
|
1029
|
+
static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) {
|
|
1030
|
+
ZSTD_Vec256 ret;
|
|
1031
|
+
ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t));
|
|
1032
|
+
return ret;
|
|
1033
|
+
}
|
|
1034
|
+
|
|
1035
|
+
/* ZSTD_Vec256_set8() (scalar):
 * Returns a 256-bit vector in which every byte of every word equals "val". */
static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
    ZSTD_Vec256 filled;
    size_t word = 0;
    size_t byteIdx;
    unsigned w;

    /* replicate val into each byte of one machine word */
    for (byteIdx = 0; byteIdx < sizeof(size_t); ++byteIdx) {
        word |= (size_t)val << (byteIdx * 8);
    }
    /* then store that word into every slot */
    for (w = 0; w < VEC256_NB_SIZE_T; ++w) {
        filled.vec[w] = word;
    }
    return filled;
}
|
|
1046
|
+
|
|
1047
|
+
/* Compare x to y, byte by byte, generating a "matches" bitfield */
|
|
1048
|
+
static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
|
|
1049
|
+
ZSTD_VecMask res = 0;
|
|
1050
|
+
unsigned i = 0;
|
|
1051
|
+
unsigned l = 0;
|
|
1052
|
+
for (; i < VEC256_NB_SIZE_T; ++i) {
|
|
1053
|
+
const size_t cmp1 = x.vec[i];
|
|
1054
|
+
const size_t cmp2 = y.vec[i];
|
|
1055
|
+
unsigned j = 0;
|
|
1056
|
+
for (; j < sizeof(size_t); ++j, ++l) {
|
|
1057
|
+
if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
|
|
1058
|
+
res |= ((U32)1 << (j+i*sizeof(size_t)));
|
|
1059
|
+
}
|
|
1060
|
+
}
|
|
1061
|
+
}
|
|
1062
|
+
return res;
|
|
1063
|
+
}
|
|
1064
|
+
|
|
1065
|
+
#endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */
|
|
1066
|
+
|
|
1067
|
+
/* ZSTD_VecMask_next():
|
|
1068
|
+
* Starting from the LSB, returns the idx of the next non-zero bit.
|
|
1069
|
+
* Basically counting the nb of trailing zeroes.
|
|
1070
|
+
*/
|
|
1071
|
+
static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
|
|
1072
|
+
# if defined(_MSC_VER) /* Visual */
|
|
1073
|
+
unsigned long r=0;
|
|
1074
|
+
return _BitScanForward(&r, val) ? (U32)r : 0;
|
|
1075
|
+
# elif defined(__GNUC__) && (__GNUC__ >= 3)
|
|
1076
|
+
return (U32)__builtin_ctz(val);
|
|
1077
|
+
# else
|
|
1078
|
+
/* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */
|
|
1079
|
+
static const U32 multiplyDeBruijnBitPosition[32] =
|
|
1080
|
+
{
|
|
1081
|
+
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
|
|
1082
|
+
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
|
|
1083
|
+
};
|
|
1084
|
+
return multiplyDeBruijnBitPosition[((U32)((v & -(int)v) * 0x077CB531U)) >> 27];
|
|
1085
|
+
# endif
|
|
1086
|
+
}
|
|
1087
|
+
|
|
1088
|
+
/* ZSTD_VecMask_rotateRight():
|
|
1089
|
+
* Rotates a bitfield to the right by "rotation" bits.
|
|
1090
|
+
* If the rotation is greater than totalBits, the returned mask is 0.
|
|
1091
|
+
*/
|
|
1092
|
+
FORCE_INLINE_TEMPLATE ZSTD_VecMask
|
|
1093
|
+
ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) {
|
|
1094
|
+
if (rotation == 0)
|
|
1095
|
+
return mask;
|
|
1096
|
+
switch (totalBits) {
|
|
1097
|
+
default:
|
|
1098
|
+
assert(0);
|
|
1099
|
+
case 16:
|
|
1100
|
+
return (mask >> rotation) | (U16)(mask << (16 - rotation));
|
|
1101
|
+
case 32:
|
|
1102
|
+
return (mask >> rotation) | (U32)(mask << (32 - rotation));
|
|
1103
|
+
}
|
|
1104
|
+
}
|
|
1105
|
+
|
|
1106
|
+
/* ZSTD_row_nextIndex():
|
|
1107
|
+
* Returns the next index to insert at within a tagTable row, and updates the "head"
|
|
1108
|
+
* value to reflect the update. Essentially cycles backwards from [0, {entries per row})
|
|
1109
|
+
*/
|
|
1110
|
+
FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
|
|
1111
|
+
U32 const next = (*tagRow - 1) & rowMask;
|
|
1112
|
+
*tagRow = (BYTE)next;
|
|
1113
|
+
return next;
|
|
1114
|
+
}
|
|
1115
|
+
|
|
1116
|
+
/* ZSTD_isAligned():
|
|
1117
|
+
* Checks that a pointer is aligned to "align" bytes which must be a power of 2.
|
|
1118
|
+
*/
|
|
1119
|
+
MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
|
|
1120
|
+
assert((align & (align - 1)) == 0);
|
|
1121
|
+
return (((size_t)ptr) & (align - 1)) == 0;
|
|
1122
|
+
}
|
|
1123
|
+
|
|
1124
|
+
/* ZSTD_row_prefetch():
|
|
1125
|
+
* Performs prefetching for the hashTable and tagTable at a given row.
|
|
1126
|
+
*/
|
|
1127
|
+
FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
|
|
1128
|
+
PREFETCH_L1(hashTable + relRow);
|
|
1129
|
+
if (rowLog == 5) {
|
|
1130
|
+
PREFETCH_L1(hashTable + relRow + 16);
|
|
1131
|
+
}
|
|
1132
|
+
PREFETCH_L1(tagTable + relRow);
|
|
1133
|
+
assert(rowLog == 4 || rowLog == 5);
|
|
1134
|
+
assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
|
|
1135
|
+
assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */
|
|
1136
|
+
}
|
|
1137
|
+
|
|
1138
|
+
/* ZSTD_row_fillHashCache():
|
|
1139
|
+
* Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
|
|
1140
|
+
* but not beyond iLimit.
|
|
1141
|
+
*/
|
|
1142
|
+
static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
|
|
1143
|
+
U32 const rowLog, U32 const mls,
|
|
1144
|
+
U32 idx, const BYTE* const iLimit)
|
|
1145
|
+
{
|
|
1146
|
+
U32 const* const hashTable = ms->hashTable;
|
|
1147
|
+
U16 const* const tagTable = ms->tagTable;
|
|
1148
|
+
U32 const hashLog = ms->rowHashLog;
|
|
1149
|
+
U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
|
|
1150
|
+
U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
|
|
1151
|
+
|
|
1152
|
+
for (; idx < lim; ++idx) {
|
|
1153
|
+
U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
|
|
1154
|
+
U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
|
|
1155
|
+
ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
|
|
1156
|
+
ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
|
|
1157
|
+
}
|
|
1158
|
+
|
|
1159
|
+
DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
|
|
1160
|
+
ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
|
|
1161
|
+
ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
|
|
1162
|
+
}
|
|
1163
|
+
|
|
1164
|
+
/* ZSTD_row_nextCachedHash():
|
|
1165
|
+
* Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
|
|
1166
|
+
* base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
|
|
1167
|
+
*/
|
|
1168
|
+
FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
|
|
1169
|
+
U16 const* tagTable, BYTE const* base,
|
|
1170
|
+
U32 idx, U32 const hashLog,
|
|
1171
|
+
U32 const rowLog, U32 const mls)
|
|
1172
|
+
{
|
|
1173
|
+
U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
|
|
1174
|
+
U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
|
|
1175
|
+
ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
|
|
1176
|
+
{ U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
|
|
1177
|
+
cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
|
|
1178
|
+
return hash;
|
|
1179
|
+
}
|
|
1180
|
+
}
|
|
1181
|
+
|
|
1182
|
+
/* ZSTD_row_update_internal():
|
|
1183
|
+
* Inserts the byte at ip into the appropriate position in the hash table.
|
|
1184
|
+
* Determines the relative row, and the position within the {16, 32} entry row to insert at.
|
|
1185
|
+
*/
|
|
1186
|
+
FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
|
|
1187
|
+
U32 const mls, U32 const rowLog,
|
|
1188
|
+
U32 const rowMask, U32 const useCache)
|
|
1189
|
+
{
|
|
1190
|
+
U32* const hashTable = ms->hashTable;
|
|
1191
|
+
U16* const tagTable = ms->tagTable;
|
|
1192
|
+
U32 const hashLog = ms->rowHashLog;
|
|
1193
|
+
const BYTE* const base = ms->window.base;
|
|
1194
|
+
const U32 target = (U32)(ip - base);
|
|
1195
|
+
U32 idx = ms->nextToUpdate;
|
|
1196
|
+
|
|
1197
|
+
DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target);
|
|
1198
|
+
for (; idx < target; ++idx) {
|
|
1199
|
+
U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls)
|
|
1200
|
+
: (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
|
|
1201
|
+
U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
|
|
1202
|
+
U32* const row = hashTable + relRow;
|
|
1203
|
+
BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
|
|
1204
|
+
Explicit cast allows us to get exact desired position within each row */
|
|
1205
|
+
U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
|
|
1206
|
+
|
|
1207
|
+
assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
|
|
1208
|
+
((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
|
|
1209
|
+
row[pos] = idx;
|
|
1210
|
+
}
|
|
1211
|
+
ms->nextToUpdate = target;
|
|
1212
|
+
}
|
|
1213
|
+
|
|
1214
|
+
/* ZSTD_row_update():
|
|
1215
|
+
* External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
|
|
1216
|
+
* processing.
|
|
1217
|
+
*/
|
|
1218
|
+
void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
|
|
1219
|
+
const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
|
|
1220
|
+
const U32 rowMask = (1u << rowLog) - 1;
|
|
1221
|
+
const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
|
|
1222
|
+
|
|
1223
|
+
DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
|
|
1224
|
+
ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
|
|
1225
|
+
}
|
|
1226
|
+
|
|
1227
|
+
/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
|
|
1228
|
+
* the hash at the nth position in a row of the tagTable.
|
|
1229
|
+
*/
|
|
1230
|
+
FORCE_INLINE_TEMPLATE
|
|
1231
|
+
ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) {
|
|
1232
|
+
ZSTD_VecMask matches = 0;
|
|
1233
|
+
if (rowEntries == 16) {
|
|
1234
|
+
ZSTD_Vec128 hashes = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
|
|
1235
|
+
ZSTD_Vec128 expandedTags = ZSTD_Vec128_set8(tag);
|
|
1236
|
+
matches = ZSTD_Vec128_cmpMask8(hashes, expandedTags);
|
|
1237
|
+
} else if (rowEntries == 32) {
|
|
1238
|
+
ZSTD_Vec256 hashes = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
|
|
1239
|
+
ZSTD_Vec256 expandedTags = ZSTD_Vec256_set8(tag);
|
|
1240
|
+
matches = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
|
|
1241
|
+
} else {
|
|
1242
|
+
assert(0);
|
|
1243
|
+
}
|
|
1244
|
+
/* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
|
|
1245
|
+
to match up with the actual layout of the entries within the hashTable */
|
|
1246
|
+
return ZSTD_VecMask_rotateRight(matches, head, rowEntries);
|
|
1247
|
+
}
|
|
1248
|
+
|
|
1249
|
+
/* The high-level approach of the SIMD row based match finder is as follows:
|
|
1250
|
+
* - Figure out where to insert the new entry:
|
|
1251
|
+
* - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
|
|
1252
|
+
* - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
|
|
1253
|
+
* which row to insert into.
|
|
1254
|
+
* - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
|
|
1255
|
+
* be considered as a circular buffer with a "head" index that resides in the tagTable.
|
|
1256
|
+
* - Also insert the "tag" into the equivalent row and position in the tagTable.
|
|
1257
|
+
* - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
|
|
1258
|
+
* The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
|
|
1259
|
+
* for alignment/performance reasons, leaving some bytes unused.
|
|
1260
|
+
* - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
|
|
1261
|
+
* generate a bitfield that we can cycle through to check the collisions in the hash table.
|
|
1262
|
+
* - Pick the longest match.
|
|
1263
|
+
*/
|
|
1264
|
+
FORCE_INLINE_TEMPLATE
|
|
1265
|
+
size_t ZSTD_RowFindBestMatch_generic (
|
|
1266
|
+
ZSTD_matchState_t* ms,
|
|
1267
|
+
const BYTE* const ip, const BYTE* const iLimit,
|
|
1268
|
+
size_t* offsetPtr,
|
|
1269
|
+
const U32 mls, const ZSTD_dictMode_e dictMode,
|
|
1270
|
+
const U32 rowLog)
|
|
1271
|
+
{
|
|
1272
|
+
U32* const hashTable = ms->hashTable;
|
|
1273
|
+
U16* const tagTable = ms->tagTable;
|
|
1274
|
+
U32* const hashCache = ms->hashCache;
|
|
1275
|
+
const U32 hashLog = ms->rowHashLog;
|
|
1276
|
+
const ZSTD_compressionParameters* const cParams = &ms->cParams;
|
|
1277
|
+
const BYTE* const base = ms->window.base;
|
|
1278
|
+
const BYTE* const dictBase = ms->window.dictBase;
|
|
1279
|
+
const U32 dictLimit = ms->window.dictLimit;
|
|
1280
|
+
const BYTE* const prefixStart = base + dictLimit;
|
|
1281
|
+
const BYTE* const dictEnd = dictBase + dictLimit;
|
|
1282
|
+
const U32 curr = (U32)(ip-base);
|
|
1283
|
+
const U32 maxDistance = 1U << cParams->windowLog;
|
|
1284
|
+
const U32 lowestValid = ms->window.lowLimit;
|
|
1285
|
+
const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
|
|
1286
|
+
const U32 isDictionary = (ms->loadedDictEnd != 0);
|
|
1287
|
+
const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
|
|
1288
|
+
const U32 rowEntries = (1U << rowLog);
|
|
1289
|
+
const U32 rowMask = rowEntries - 1;
|
|
1290
|
+
const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
|
|
1291
|
+
U32 nbAttempts = 1U << cappedSearchLog;
|
|
1292
|
+
size_t ml=4-1;
|
|
1293
|
+
|
|
1294
|
+
/* DMS/DDS variables that may be referenced laster */
|
|
1295
|
+
const ZSTD_matchState_t* const dms = ms->dictMatchState;
|
|
1296
|
+
size_t ddsIdx;
|
|
1297
|
+
U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
|
|
1298
|
+
U32 dmsTag;
|
|
1299
|
+
U32* dmsRow;
|
|
1300
|
+
BYTE* dmsTagRow;
|
|
1301
|
+
|
|
1302
|
+
if (dictMode == ZSTD_dedicatedDictSearch) {
|
|
1303
|
+
const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
|
|
1304
|
+
{ /* Prefetch DDS hashtable entry */
|
|
1305
|
+
ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
|
|
1306
|
+
PREFETCH_L1(&dms->hashTable[ddsIdx]);
|
|
1307
|
+
}
|
|
1308
|
+
ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
|
|
1309
|
+
}
|
|
1310
|
+
|
|
1311
|
+
if (dictMode == ZSTD_dictMatchState) {
|
|
1312
|
+
/* Prefetch DMS rows */
|
|
1313
|
+
U32* const dmsHashTable = dms->hashTable;
|
|
1314
|
+
U16* const dmsTagTable = dms->tagTable;
|
|
1315
|
+
U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
|
|
1316
|
+
U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
|
|
1317
|
+
dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
|
|
1318
|
+
dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
|
|
1319
|
+
dmsRow = dmsHashTable + dmsRelRow;
|
|
1320
|
+
ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
|
|
1321
|
+
}
|
|
1322
|
+
|
|
1323
|
+
/* Update the hashTable and tagTable up to (but not including) ip */
|
|
1324
|
+
ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
|
|
1325
|
+
{ /* Get the hash for ip, compute the appropriate row */
|
|
1326
|
+
U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
|
|
1327
|
+
U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
|
|
1328
|
+
U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
|
|
1329
|
+
U32* const row = hashTable + relRow;
|
|
1330
|
+
BYTE* tagRow = (BYTE*)(tagTable + relRow);
|
|
1331
|
+
U32 const head = *tagRow & rowMask;
|
|
1332
|
+
U32 matchBuffer[32 /* maximum nb entries per row */];
|
|
1333
|
+
size_t numMatches = 0;
|
|
1334
|
+
size_t currMatch = 0;
|
|
1335
|
+
ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
|
|
1336
|
+
|
|
1337
|
+
/* Cycle through the matches and prefetch */
|
|
1338
|
+
for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
|
|
1339
|
+
U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
|
|
1340
|
+
U32 const matchIndex = row[matchPos];
|
|
1341
|
+
assert(numMatches < rowEntries);
|
|
1342
|
+
if (matchIndex < lowLimit)
|
|
1343
|
+
break;
|
|
1344
|
+
if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
|
|
1345
|
+
PREFETCH_L1(base + matchIndex);
|
|
1346
|
+
} else {
|
|
1347
|
+
PREFETCH_L1(dictBase + matchIndex);
|
|
1348
|
+
}
|
|
1349
|
+
matchBuffer[numMatches++] = matchIndex;
|
|
1350
|
+
}
|
|
1351
|
+
|
|
1352
|
+
/* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
|
|
1353
|
+
in ZSTD_row_update_internal() at the next search. */
|
|
1354
|
+
{
|
|
1355
|
+
U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
|
|
1356
|
+
tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
|
|
1357
|
+
row[pos] = ms->nextToUpdate++;
|
|
1358
|
+
}
|
|
1359
|
+
|
|
1360
|
+
/* Return the longest match */
|
|
1361
|
+
for (; currMatch < numMatches; ++currMatch) {
|
|
1362
|
+
U32 const matchIndex = matchBuffer[currMatch];
|
|
1363
|
+
size_t currentMl=0;
|
|
1364
|
+
assert(matchIndex < curr);
|
|
1365
|
+
assert(matchIndex >= lowLimit);
|
|
1366
|
+
|
|
1367
|
+
if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
|
|
1368
|
+
const BYTE* const match = base + matchIndex;
|
|
1369
|
+
assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
|
|
1370
|
+
if (match[ml] == ip[ml]) /* potentially better */
|
|
1371
|
+
currentMl = ZSTD_count(ip, match, iLimit);
|
|
1372
|
+
} else {
|
|
1373
|
+
const BYTE* const match = dictBase + matchIndex;
|
|
1374
|
+
assert(match+4 <= dictEnd);
|
|
1375
|
+
if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
|
|
1376
|
+
currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
|
|
1377
|
+
}
|
|
1378
|
+
|
|
1379
|
+
/* Save best solution */
|
|
1380
|
+
if (currentMl > ml) {
|
|
1381
|
+
ml = currentMl;
|
|
1382
|
+
*offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
|
|
1383
|
+
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
|
|
1384
|
+
}
|
|
1385
|
+
}
|
|
1386
|
+
}
|
|
1387
|
+
|
|
1388
|
+
if (dictMode == ZSTD_dedicatedDictSearch) {
|
|
1389
|
+
ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
|
|
1390
|
+
ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
|
|
1391
|
+
} else if (dictMode == ZSTD_dictMatchState) {
|
|
1392
|
+
/* TODO: Measure and potentially add prefetching to DMS */
|
|
1393
|
+
const U32 dmsLowestIndex = dms->window.dictLimit;
|
|
1394
|
+
const BYTE* const dmsBase = dms->window.base;
|
|
1395
|
+
const BYTE* const dmsEnd = dms->window.nextSrc;
|
|
1396
|
+
const U32 dmsSize = (U32)(dmsEnd - dmsBase);
|
|
1397
|
+
const U32 dmsIndexDelta = dictLimit - dmsSize;
|
|
1398
|
+
|
|
1399
|
+
{ U32 const head = *dmsTagRow & rowMask;
|
|
1400
|
+
U32 matchBuffer[32 /* maximum nb row entries */];
|
|
1401
|
+
size_t numMatches = 0;
|
|
1402
|
+
size_t currMatch = 0;
|
|
1403
|
+
ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
|
|
1404
|
+
|
|
1405
|
+
for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
|
|
1406
|
+
U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
|
|
1407
|
+
U32 const matchIndex = dmsRow[matchPos];
|
|
1408
|
+
if (matchIndex < dmsLowestIndex)
|
|
1409
|
+
break;
|
|
1410
|
+
PREFETCH_L1(dmsBase + matchIndex);
|
|
1411
|
+
matchBuffer[numMatches++] = matchIndex;
|
|
1412
|
+
}
|
|
1413
|
+
|
|
1414
|
+
/* Return the longest match */
|
|
1415
|
+
for (; currMatch < numMatches; ++currMatch) {
|
|
1416
|
+
U32 const matchIndex = matchBuffer[currMatch];
|
|
1417
|
+
size_t currentMl=0;
|
|
1418
|
+
assert(matchIndex >= dmsLowestIndex);
|
|
1419
|
+
assert(matchIndex < curr);
|
|
1420
|
+
|
|
1421
|
+
{ const BYTE* const match = dmsBase + matchIndex;
|
|
1422
|
+
assert(match+4 <= dmsEnd);
|
|
1423
|
+
if (MEM_read32(match) == MEM_read32(ip))
|
|
1424
|
+
currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
|
|
1425
|
+
}
|
|
1426
|
+
|
|
1427
|
+
if (currentMl > ml) {
|
|
1428
|
+
ml = currentMl;
|
|
1429
|
+
*offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
|
|
1430
|
+
if (ip+currentMl == iLimit) break;
|
|
1431
|
+
}
|
|
1432
|
+
}
|
|
1433
|
+
}
|
|
1434
|
+
}
|
|
1435
|
+
return ml;
|
|
1436
|
+
}
|
|
1437
|
+
|
|
1438
|
+
/* Inlining is important to hardwire a hot branch (template emulation) */
|
|
1439
|
+
FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS (
|
|
1440
|
+
ZSTD_matchState_t* ms,
|
|
1441
|
+
const BYTE* ip, const BYTE* const iLimit,
|
|
1442
|
+
const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog)
|
|
1443
|
+
{
|
|
1444
|
+
switch(ms->cParams.minMatch)
|
|
1445
|
+
{
|
|
1446
|
+
default : /* includes case 3 */
|
|
1447
|
+
case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog);
|
|
1448
|
+
case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog);
|
|
1449
|
+
case 7 :
|
|
1450
|
+
case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog);
|
|
1451
|
+
}
|
|
1452
|
+
}
|
|
1453
|
+
|
|
1454
|
+
/* ZSTD_RowFindBestMatch_selectRowLog():
 * Row-based search without dictionary (ZSTD_noDict).
 * Uses rowLog 5 when searchLog >= 5, and rowLog 4 otherwise;
 * rowLog is passed as a literal constant at each call site. */
FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog (
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    if (ms->cParams.searchLog >= 5)
        return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5);

    return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4);
}
|
|
1467
|
+
|
|
1468
|
+
/* ZSTD_RowFindBestMatch_dictMatchState_selectRowLog():
 * Row-based search in ZSTD_dictMatchState mode.
 * Uses rowLog 5 when searchLog >= 5, and rowLog 4 otherwise;
 * rowLog is passed as a literal constant at each call site. */
FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog(
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    if (ms->cParams.searchLog >= 5)
        return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5);

    return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4);
}
|
|
1481
|
+
|
|
1482
|
+
/* ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog():
 * Row-based search in ZSTD_dedicatedDictSearch mode.
 * Uses rowLog 5 when searchLog >= 5, and rowLog 4 otherwise;
 * rowLog is passed as a literal constant at each call site. */
FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog(
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    if (ms->cParams.searchLog >= 5)
        return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5);

    return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4);
}
|
|
1495
|
+
|
|
1496
|
+
/* ZSTD_RowFindBestMatch_extDict_selectRowLog():
 * Row-based search in ZSTD_extDict mode.
 * Uses rowLog 5 when searchLog >= 5, and rowLog 4 otherwise;
 * rowLog is passed as a literal constant at each call site. */
FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog (
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    if (ms->cParams.searchLog >= 5)
        return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5);

    return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4);
}
|
|
1509
|
+
|
|
618
1510
|
|
|
619
1511
|
/* *******************************
|
|
620
1512
|
* Common parser - lazy strategy
|
|
621
1513
|
*********************************/
|
|
622
|
-
|
|
623
|
-
|
|
1514
|
+
/* Match-finder selector; the numeric values double as column indices of a lookup table */
typedef enum {
    search_hashChain  = 0,
    search_binaryTree = 1,
    search_rowHash    = 2
} searchMethod_e;
|
|
1515
|
+
|
|
1516
|
+
FORCE_INLINE_TEMPLATE size_t
|
|
1517
|
+
ZSTD_compressBlock_lazy_generic(
|
|
624
1518
|
ZSTD_matchState_t* ms, seqStore_t* seqStore,
|
|
625
1519
|
U32 rep[ZSTD_REP_NUM],
|
|
626
1520
|
const void* src, size_t srcSize,
|
|
627
|
-
const
|
|
1521
|
+
const searchMethod_e searchMethod, const U32 depth,
|
|
628
1522
|
ZSTD_dictMode_e const dictMode)
|
|
629
1523
|
{
|
|
630
1524
|
const BYTE* const istart = (const BYTE*)src;
|
|
631
1525
|
const BYTE* ip = istart;
|
|
632
1526
|
const BYTE* anchor = istart;
|
|
633
1527
|
const BYTE* const iend = istart + srcSize;
|
|
634
|
-
const BYTE* const ilimit = iend - 8;
|
|
1528
|
+
const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
|
|
635
1529
|
const BYTE* const base = ms->window.base;
|
|
636
1530
|
const U32 prefixLowestIndex = ms->window.dictLimit;
|
|
637
1531
|
const BYTE* const prefixLowest = base + prefixLowestIndex;
|
|
1532
|
+
const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
|
|
638
1533
|
|
|
639
1534
|
typedef size_t (*searchMax_f)(
|
|
640
1535
|
ZSTD_matchState_t* ms,
|
|
641
1536
|
const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
1537
|
+
|
|
1538
|
+
/**
|
|
1539
|
+
* This table is indexed first by the four ZSTD_dictMode_e values, and then
|
|
1540
|
+
* by the three searchMethod_e values. NULLs are placed for configurations
|
|
1541
|
+
* that should never occur (extDict modes go to the other implementation
|
|
1542
|
+
* below and there is no DDSS for binary tree search yet).
|
|
1543
|
+
*/
|
|
1544
|
+
const searchMax_f searchFuncs[4][3] = {
|
|
1545
|
+
{
|
|
1546
|
+
ZSTD_HcFindBestMatch_selectMLS,
|
|
1547
|
+
ZSTD_BtFindBestMatch_selectMLS,
|
|
1548
|
+
ZSTD_RowFindBestMatch_selectRowLog
|
|
1549
|
+
},
|
|
1550
|
+
{
|
|
1551
|
+
NULL,
|
|
1552
|
+
NULL,
|
|
1553
|
+
NULL
|
|
1554
|
+
},
|
|
1555
|
+
{
|
|
1556
|
+
ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
|
|
1557
|
+
ZSTD_BtFindBestMatch_dictMatchState_selectMLS,
|
|
1558
|
+
ZSTD_RowFindBestMatch_dictMatchState_selectRowLog
|
|
1559
|
+
},
|
|
1560
|
+
{
|
|
1561
|
+
ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
|
|
1562
|
+
NULL,
|
|
1563
|
+
ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog
|
|
1564
|
+
}
|
|
1565
|
+
};
|
|
1566
|
+
|
|
1567
|
+
searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod];
|
|
645
1568
|
U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
|
|
646
1569
|
|
|
1570
|
+
const int isDMS = dictMode == ZSTD_dictMatchState;
|
|
1571
|
+
const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
|
|
1572
|
+
const int isDxS = isDMS || isDDS;
|
|
647
1573
|
const ZSTD_matchState_t* const dms = ms->dictMatchState;
|
|
648
|
-
const U32 dictLowestIndex =
|
|
649
|
-
|
|
650
|
-
const BYTE* const
|
|
651
|
-
|
|
652
|
-
const
|
|
653
|
-
dictBase + dictLowestIndex : NULL;
|
|
654
|
-
const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ?
|
|
655
|
-
dms->window.nextSrc : NULL;
|
|
656
|
-
const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ?
|
|
1574
|
+
const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0;
|
|
1575
|
+
const BYTE* const dictBase = isDxS ? dms->window.base : NULL;
|
|
1576
|
+
const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
|
|
1577
|
+
const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL;
|
|
1578
|
+
const U32 dictIndexDelta = isDxS ?
|
|
657
1579
|
prefixLowestIndex - (U32)(dictEnd - dictBase) :
|
|
658
1580
|
0;
|
|
659
|
-
const U32 dictAndPrefixLength = (U32)(ip - prefixLowest + dictEnd - dictLowest);
|
|
1581
|
+
const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
|
|
660
1582
|
|
|
661
|
-
|
|
1583
|
+
assert(searchMax != NULL);
|
|
1584
|
+
|
|
1585
|
+
DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
|
|
662
1586
|
ip += (dictAndPrefixLength == 0);
|
|
663
1587
|
if (dictMode == ZSTD_noDict) {
|
|
664
|
-
U32 const
|
|
1588
|
+
U32 const curr = (U32)(ip - base);
|
|
1589
|
+
U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
|
|
1590
|
+
U32 const maxRep = curr - windowLow;
|
|
665
1591
|
if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
|
|
666
1592
|
if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
|
|
667
1593
|
}
|
|
668
|
-
if (
|
|
1594
|
+
if (isDxS) {
|
|
669
1595
|
/* dictMatchState repCode checks don't currently handle repCode == 0
|
|
670
1596
|
* disabling. */
|
|
671
1597
|
assert(offset_1 <= dictAndPrefixLength);
|
|
672
1598
|
assert(offset_2 <= dictAndPrefixLength);
|
|
673
1599
|
}
|
|
674
1600
|
|
|
1601
|
+
if (searchMethod == search_rowHash) {
|
|
1602
|
+
ZSTD_row_fillHashCache(ms, base, rowLog,
|
|
1603
|
+
MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
|
|
1604
|
+
ms->nextToUpdate, ilimit);
|
|
1605
|
+
}
|
|
1606
|
+
|
|
675
1607
|
/* Match Loop */
|
|
1608
|
+
#if defined(__GNUC__) && defined(__x86_64__)
|
|
1609
|
+
/* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
|
|
1610
|
+
* code alignment is perturbed. To fix the instability, align the loop on a 32-byte boundary.
|
|
1611
|
+
*/
|
|
1612
|
+
__asm__(".p2align 5");
|
|
1613
|
+
#endif
|
|
676
1614
|
while (ip < ilimit) {
|
|
677
1615
|
size_t matchLength=0;
|
|
678
1616
|
size_t offset=0;
|
|
679
1617
|
const BYTE* start=ip+1;
|
|
680
1618
|
|
|
681
1619
|
/* check repCode */
|
|
682
|
-
if (
|
|
1620
|
+
if (isDxS) {
|
|
683
1621
|
const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
|
|
684
|
-
const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
|
|
1622
|
+
const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
|
|
685
1623
|
&& repIndex < prefixLowestIndex) ?
|
|
686
1624
|
dictBase + (repIndex - dictIndexDelta) :
|
|
687
1625
|
base + repIndex;
|
|
@@ -722,7 +1660,7 @@ size_t ZSTD_compressBlock_lazy_generic(
|
|
|
722
1660
|
if ((mlRep >= 4) && (gain2 > gain1))
|
|
723
1661
|
matchLength = mlRep, offset = 0, start = ip;
|
|
724
1662
|
}
|
|
725
|
-
if (
|
|
1663
|
+
if (isDxS) {
|
|
726
1664
|
const U32 repIndex = (U32)(ip - base) - offset_1;
|
|
727
1665
|
const BYTE* repMatch = repIndex < prefixLowestIndex ?
|
|
728
1666
|
dictBase + (repIndex - dictIndexDelta) :
|
|
@@ -757,7 +1695,7 @@ size_t ZSTD_compressBlock_lazy_generic(
|
|
|
757
1695
|
if ((mlRep >= 4) && (gain2 > gain1))
|
|
758
1696
|
matchLength = mlRep, offset = 0, start = ip;
|
|
759
1697
|
}
|
|
760
|
-
if (
|
|
1698
|
+
if (isDxS) {
|
|
761
1699
|
const U32 repIndex = (U32)(ip - base) - offset_1;
|
|
762
1700
|
const BYTE* repMatch = repIndex < prefixLowestIndex ?
|
|
763
1701
|
dictBase + (repIndex - dictIndexDelta) :
|
|
@@ -795,7 +1733,7 @@ size_t ZSTD_compressBlock_lazy_generic(
|
|
|
795
1733
|
&& (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
|
|
796
1734
|
{ start--; matchLength++; }
|
|
797
1735
|
}
|
|
798
|
-
if (
|
|
1736
|
+
if (isDxS) {
|
|
799
1737
|
U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
|
|
800
1738
|
const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
|
|
801
1739
|
const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
|
|
@@ -806,17 +1744,16 @@ size_t ZSTD_compressBlock_lazy_generic(
|
|
|
806
1744
|
/* store sequence */
|
|
807
1745
|
_storeSequence:
|
|
808
1746
|
{ size_t const litLength = start - anchor;
|
|
809
|
-
ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH);
|
|
1747
|
+
ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
|
|
810
1748
|
anchor = ip = start + matchLength;
|
|
811
1749
|
}
|
|
812
1750
|
|
|
813
1751
|
/* check immediate repcode */
|
|
814
|
-
if (
|
|
1752
|
+
if (isDxS) {
|
|
815
1753
|
while (ip <= ilimit) {
|
|
816
1754
|
U32 const current2 = (U32)(ip-base);
|
|
817
1755
|
U32 const repIndex = current2 - offset_2;
|
|
818
|
-
const BYTE* repMatch =
|
|
819
|
-
&& repIndex < prefixLowestIndex ?
|
|
1756
|
+
const BYTE* repMatch = repIndex < prefixLowestIndex ?
|
|
820
1757
|
dictBase - dictIndexDelta + repIndex :
|
|
821
1758
|
base + repIndex;
|
|
822
1759
|
if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
|
|
@@ -824,7 +1761,7 @@ _storeSequence:
|
|
|
824
1761
|
const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
|
|
825
1762
|
matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
|
|
826
1763
|
offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */
|
|
827
|
-
ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
|
|
1764
|
+
ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
|
|
828
1765
|
ip += matchLength;
|
|
829
1766
|
anchor = ip;
|
|
830
1767
|
continue;
|
|
@@ -839,7 +1776,7 @@ _storeSequence:
|
|
|
839
1776
|
/* store sequence */
|
|
840
1777
|
matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
|
|
841
1778
|
offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
|
|
842
|
-
ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
|
|
1779
|
+
ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
|
|
843
1780
|
ip += matchLength;
|
|
844
1781
|
anchor = ip;
|
|
845
1782
|
continue; /* faster when present ... (?) */
|
|
@@ -850,7 +1787,7 @@ _storeSequence:
|
|
|
850
1787
|
rep[1] = offset_2 ? offset_2 : savedOffset;
|
|
851
1788
|
|
|
852
1789
|
/* Return the last literals size */
|
|
853
|
-
return iend - anchor;
|
|
1790
|
+
return (size_t)(iend - anchor);
|
|
854
1791
|
}
|
|
855
1792
|
|
|
856
1793
|
|
|
@@ -858,101 +1795,207 @@ size_t ZSTD_compressBlock_btlazy2(
|
|
|
858
1795
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
859
1796
|
void const* src, size_t srcSize)
|
|
860
1797
|
{
|
|
861
|
-
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize,
|
|
1798
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
|
|
862
1799
|
}
|
|
863
1800
|
|
|
864
1801
|
size_t ZSTD_compressBlock_lazy2(
|
|
865
1802
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
866
1803
|
void const* src, size_t srcSize)
|
|
867
1804
|
{
|
|
868
|
-
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize,
|
|
1805
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
|
|
869
1806
|
}
|
|
870
1807
|
|
|
871
1808
|
size_t ZSTD_compressBlock_lazy(
|
|
872
1809
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
873
1810
|
void const* src, size_t srcSize)
|
|
874
1811
|
{
|
|
875
|
-
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize,
|
|
1812
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
|
|
876
1813
|
}
|
|
877
1814
|
|
|
878
1815
|
size_t ZSTD_compressBlock_greedy(
|
|
879
1816
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
880
1817
|
void const* src, size_t srcSize)
|
|
881
1818
|
{
|
|
882
|
-
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize,
|
|
1819
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
|
|
883
1820
|
}
|
|
884
1821
|
|
|
885
1822
|
size_t ZSTD_compressBlock_btlazy2_dictMatchState(
|
|
886
1823
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
887
1824
|
void const* src, size_t srcSize)
|
|
888
1825
|
{
|
|
889
|
-
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize,
|
|
1826
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
|
|
890
1827
|
}
|
|
891
1828
|
|
|
892
1829
|
size_t ZSTD_compressBlock_lazy2_dictMatchState(
|
|
893
1830
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
894
1831
|
void const* src, size_t srcSize)
|
|
895
1832
|
{
|
|
896
|
-
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize,
|
|
1833
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
|
|
897
1834
|
}
|
|
898
1835
|
|
|
899
1836
|
size_t ZSTD_compressBlock_lazy_dictMatchState(
|
|
900
1837
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
901
1838
|
void const* src, size_t srcSize)
|
|
902
1839
|
{
|
|
903
|
-
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize,
|
|
1840
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
|
|
904
1841
|
}
|
|
905
1842
|
|
|
906
1843
|
size_t ZSTD_compressBlock_greedy_dictMatchState(
|
|
907
1844
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
908
1845
|
void const* src, size_t srcSize)
|
|
909
1846
|
{
|
|
910
|
-
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize,
|
|
1847
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
|
|
1848
|
+
}
|
|
1849
|
+
|
|
1850
|
+
|
|
1851
|
+
size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
|
|
1852
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
1853
|
+
void const* src, size_t srcSize)
|
|
1854
|
+
{
|
|
1855
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
|
|
1856
|
+
}
|
|
1857
|
+
|
|
1858
|
+
size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
|
|
1859
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
1860
|
+
void const* src, size_t srcSize)
|
|
1861
|
+
{
|
|
1862
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
|
|
1863
|
+
}
|
|
1864
|
+
|
|
1865
|
+
size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
|
|
1866
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
1867
|
+
void const* src, size_t srcSize)
|
|
1868
|
+
{
|
|
1869
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
|
|
1870
|
+
}
|
|
1871
|
+
|
|
1872
|
+
/* Row-based matchfinder */
|
|
1873
|
+
size_t ZSTD_compressBlock_lazy2_row(
|
|
1874
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
1875
|
+
void const* src, size_t srcSize)
|
|
1876
|
+
{
|
|
1877
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
|
|
1878
|
+
}
|
|
1879
|
+
|
|
1880
|
+
size_t ZSTD_compressBlock_lazy_row(
|
|
1881
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
1882
|
+
void const* src, size_t srcSize)
|
|
1883
|
+
{
|
|
1884
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
|
|
1885
|
+
}
|
|
1886
|
+
|
|
1887
|
+
size_t ZSTD_compressBlock_greedy_row(
|
|
1888
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
1889
|
+
void const* src, size_t srcSize)
|
|
1890
|
+
{
|
|
1891
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
|
|
1892
|
+
}
|
|
1893
|
+
|
|
1894
|
+
size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
|
|
1895
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
1896
|
+
void const* src, size_t srcSize)
|
|
1897
|
+
{
|
|
1898
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
|
|
911
1899
|
}
|
|
912
1900
|
|
|
1901
|
+
size_t ZSTD_compressBlock_lazy_dictMatchState_row(
|
|
1902
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
1903
|
+
void const* src, size_t srcSize)
|
|
1904
|
+
{
|
|
1905
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
|
|
1906
|
+
}
|
|
1907
|
+
|
|
1908
|
+
size_t ZSTD_compressBlock_greedy_dictMatchState_row(
|
|
1909
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
1910
|
+
void const* src, size_t srcSize)
|
|
1911
|
+
{
|
|
1912
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
|
|
1913
|
+
}
|
|
1914
|
+
|
|
1915
|
+
|
|
1916
|
+
size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
|
|
1917
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
1918
|
+
void const* src, size_t srcSize)
|
|
1919
|
+
{
|
|
1920
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
|
|
1921
|
+
}
|
|
1922
|
+
|
|
1923
|
+
size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
|
|
1924
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
1925
|
+
void const* src, size_t srcSize)
|
|
1926
|
+
{
|
|
1927
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
|
|
1928
|
+
}
|
|
1929
|
+
|
|
1930
|
+
size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
|
|
1931
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
1932
|
+
void const* src, size_t srcSize)
|
|
1933
|
+
{
|
|
1934
|
+
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
|
|
1935
|
+
}
|
|
913
1936
|
|
|
914
1937
|
FORCE_INLINE_TEMPLATE
|
|
915
1938
|
size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
916
1939
|
ZSTD_matchState_t* ms, seqStore_t* seqStore,
|
|
917
1940
|
U32 rep[ZSTD_REP_NUM],
|
|
918
1941
|
const void* src, size_t srcSize,
|
|
919
|
-
const
|
|
1942
|
+
const searchMethod_e searchMethod, const U32 depth)
|
|
920
1943
|
{
|
|
921
1944
|
const BYTE* const istart = (const BYTE*)src;
|
|
922
1945
|
const BYTE* ip = istart;
|
|
923
1946
|
const BYTE* anchor = istart;
|
|
924
1947
|
const BYTE* const iend = istart + srcSize;
|
|
925
|
-
const BYTE* const ilimit = iend - 8;
|
|
1948
|
+
const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
|
|
926
1949
|
const BYTE* const base = ms->window.base;
|
|
927
1950
|
const U32 dictLimit = ms->window.dictLimit;
|
|
928
|
-
const U32 lowestIndex = ms->window.lowLimit;
|
|
929
1951
|
const BYTE* const prefixStart = base + dictLimit;
|
|
930
1952
|
const BYTE* const dictBase = ms->window.dictBase;
|
|
931
1953
|
const BYTE* const dictEnd = dictBase + dictLimit;
|
|
932
|
-
const BYTE* const dictStart = dictBase +
|
|
1954
|
+
const BYTE* const dictStart = dictBase + ms->window.lowLimit;
|
|
1955
|
+
const U32 windowLog = ms->cParams.windowLog;
|
|
1956
|
+
const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
|
|
933
1957
|
|
|
934
1958
|
typedef size_t (*searchMax_f)(
|
|
935
1959
|
ZSTD_matchState_t* ms,
|
|
936
1960
|
const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
|
|
937
|
-
searchMax_f
|
|
938
|
-
|
|
1961
|
+
const searchMax_f searchFuncs[3] = {
|
|
1962
|
+
ZSTD_HcFindBestMatch_extDict_selectMLS,
|
|
1963
|
+
ZSTD_BtFindBestMatch_extDict_selectMLS,
|
|
1964
|
+
ZSTD_RowFindBestMatch_extDict_selectRowLog
|
|
1965
|
+
};
|
|
1966
|
+
searchMax_f searchMax = searchFuncs[(int)searchMethod];
|
|
939
1967
|
U32 offset_1 = rep[0], offset_2 = rep[1];
|
|
940
1968
|
|
|
1969
|
+
DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
|
|
1970
|
+
|
|
941
1971
|
/* init */
|
|
942
1972
|
ip += (ip == prefixStart);
|
|
1973
|
+
if (searchMethod == search_rowHash) {
|
|
1974
|
+
ZSTD_row_fillHashCache(ms, base, rowLog,
|
|
1975
|
+
MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
|
|
1976
|
+
ms->nextToUpdate, ilimit);
|
|
1977
|
+
}
|
|
943
1978
|
|
|
944
1979
|
/* Match Loop */
|
|
1980
|
+
#if defined(__GNUC__) && defined(__x86_64__)
|
|
1981
|
+
/* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
|
|
1982
|
+
* code alignment is perturbed. To fix the instability, align the loop on a 32-byte boundary.
|
|
1983
|
+
*/
|
|
1984
|
+
__asm__(".p2align 5");
|
|
1985
|
+
#endif
|
|
945
1986
|
while (ip < ilimit) {
|
|
946
1987
|
size_t matchLength=0;
|
|
947
1988
|
size_t offset=0;
|
|
948
1989
|
const BYTE* start=ip+1;
|
|
949
|
-
U32
|
|
1990
|
+
U32 curr = (U32)(ip-base);
|
|
950
1991
|
|
|
951
1992
|
/* check repCode */
|
|
952
|
-
{ const U32
|
|
1993
|
+
{ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
|
|
1994
|
+
const U32 repIndex = (U32)(curr+1 - offset_1);
|
|
953
1995
|
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
|
|
954
1996
|
const BYTE* const repMatch = repBase + repIndex;
|
|
955
|
-
if (((U32)((dictLimit-1) - repIndex) >= 3)
|
|
1997
|
+
if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
|
|
1998
|
+
& (offset_1 < curr+1 - windowLow) ) /* note: we are searching at curr+1 */
|
|
956
1999
|
if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
|
|
957
2000
|
/* repcode detected we should take it */
|
|
958
2001
|
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
|
|
@@ -976,13 +2019,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
|
976
2019
|
if (depth>=1)
|
|
977
2020
|
while (ip<ilimit) {
|
|
978
2021
|
ip ++;
|
|
979
|
-
|
|
2022
|
+
curr++;
|
|
980
2023
|
/* check repCode */
|
|
981
2024
|
if (offset) {
|
|
982
|
-
const U32
|
|
2025
|
+
const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
|
|
2026
|
+
const U32 repIndex = (U32)(curr - offset_1);
|
|
983
2027
|
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
|
|
984
2028
|
const BYTE* const repMatch = repBase + repIndex;
|
|
985
|
-
if (((U32)((dictLimit-1) - repIndex) >= 3)
|
|
2029
|
+
if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
|
|
2030
|
+
& (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
|
|
986
2031
|
if (MEM_read32(ip) == MEM_read32(repMatch)) {
|
|
987
2032
|
/* repcode detected */
|
|
988
2033
|
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
|
|
@@ -1006,13 +2051,15 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
|
1006
2051
|
/* let's find an even better one */
|
|
1007
2052
|
if ((depth==2) && (ip<ilimit)) {
|
|
1008
2053
|
ip ++;
|
|
1009
|
-
|
|
2054
|
+
curr++;
|
|
1010
2055
|
/* check repCode */
|
|
1011
2056
|
if (offset) {
|
|
1012
|
-
const U32
|
|
2057
|
+
const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
|
|
2058
|
+
const U32 repIndex = (U32)(curr - offset_1);
|
|
1013
2059
|
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
|
|
1014
2060
|
const BYTE* const repMatch = repBase + repIndex;
|
|
1015
|
-
if (((U32)((dictLimit-1) - repIndex) >= 3)
|
|
2061
|
+
if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
|
|
2062
|
+
& (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
|
|
1016
2063
|
if (MEM_read32(ip) == MEM_read32(repMatch)) {
|
|
1017
2064
|
/* repcode detected */
|
|
1018
2065
|
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
|
|
@@ -1047,22 +2094,25 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
|
1047
2094
|
/* store sequence */
|
|
1048
2095
|
_storeSequence:
|
|
1049
2096
|
{ size_t const litLength = start - anchor;
|
|
1050
|
-
ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH);
|
|
2097
|
+
ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
|
|
1051
2098
|
anchor = ip = start + matchLength;
|
|
1052
2099
|
}
|
|
1053
2100
|
|
|
1054
2101
|
/* check immediate repcode */
|
|
1055
2102
|
while (ip <= ilimit) {
|
|
1056
|
-
const U32
|
|
2103
|
+
const U32 repCurrent = (U32)(ip-base);
|
|
2104
|
+
const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
|
|
2105
|
+
const U32 repIndex = repCurrent - offset_2;
|
|
1057
2106
|
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
|
|
1058
2107
|
const BYTE* const repMatch = repBase + repIndex;
|
|
1059
|
-
if (((U32)((dictLimit-1) - repIndex) >= 3)
|
|
2108
|
+
if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
|
|
2109
|
+
& (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
|
|
1060
2110
|
if (MEM_read32(ip) == MEM_read32(repMatch)) {
|
|
1061
2111
|
/* repcode detected we should take it */
|
|
1062
2112
|
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
|
|
1063
2113
|
matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
|
|
1064
2114
|
offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */
|
|
1065
|
-
ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
|
|
2115
|
+
ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
|
|
1066
2116
|
ip += matchLength;
|
|
1067
2117
|
anchor = ip;
|
|
1068
2118
|
continue; /* faster when present ... (?) */
|
|
@@ -1075,7 +2125,7 @@ _storeSequence:
|
|
|
1075
2125
|
rep[1] = offset_2;
|
|
1076
2126
|
|
|
1077
2127
|
/* Return the last literals size */
|
|
1078
|
-
return iend - anchor;
|
|
2128
|
+
return (size_t)(iend - anchor);
|
|
1079
2129
|
}
|
|
1080
2130
|
|
|
1081
2131
|
|
|
@@ -1083,7 +2133,7 @@ size_t ZSTD_compressBlock_greedy_extDict(
|
|
|
1083
2133
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
1084
2134
|
void const* src, size_t srcSize)
|
|
1085
2135
|
{
|
|
1086
|
-
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize,
|
|
2136
|
+
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
|
|
1087
2137
|
}
|
|
1088
2138
|
|
|
1089
2139
|
size_t ZSTD_compressBlock_lazy_extDict(
|
|
@@ -1091,7 +2141,7 @@ size_t ZSTD_compressBlock_lazy_extDict(
|
|
|
1091
2141
|
void const* src, size_t srcSize)
|
|
1092
2142
|
|
|
1093
2143
|
{
|
|
1094
|
-
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize,
|
|
2144
|
+
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
|
|
1095
2145
|
}
|
|
1096
2146
|
|
|
1097
2147
|
size_t ZSTD_compressBlock_lazy2_extDict(
|
|
@@ -1099,7 +2149,7 @@ size_t ZSTD_compressBlock_lazy2_extDict(
|
|
|
1099
2149
|
void const* src, size_t srcSize)
|
|
1100
2150
|
|
|
1101
2151
|
{
|
|
1102
|
-
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize,
|
|
2152
|
+
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
|
|
1103
2153
|
}
|
|
1104
2154
|
|
|
1105
2155
|
size_t ZSTD_compressBlock_btlazy2_extDict(
|
|
@@ -1107,5 +2157,28 @@ size_t ZSTD_compressBlock_btlazy2_extDict(
|
|
|
1107
2157
|
void const* src, size_t srcSize)
|
|
1108
2158
|
|
|
1109
2159
|
{
|
|
1110
|
-
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize,
|
|
2160
|
+
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
|
|
2161
|
+
}
|
|
2162
|
+
|
|
2163
|
+
size_t ZSTD_compressBlock_greedy_extDict_row(
|
|
2164
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
2165
|
+
void const* src, size_t srcSize)
|
|
2166
|
+
{
|
|
2167
|
+
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
|
|
2168
|
+
}
|
|
2169
|
+
|
|
2170
|
+
size_t ZSTD_compressBlock_lazy_extDict_row(
|
|
2171
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
2172
|
+
void const* src, size_t srcSize)
|
|
2173
|
+
|
|
2174
|
+
{
|
|
2175
|
+
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
|
|
2176
|
+
}
|
|
2177
|
+
|
|
2178
|
+
size_t ZSTD_compressBlock_lazy2_extDict_row(
|
|
2179
|
+
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
2180
|
+
void const* src, size_t srcSize)
|
|
2181
|
+
|
|
2182
|
+
{
|
|
2183
|
+
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
|
|
1111
2184
|
}
|