zstd-ruby 1.5.2.2 → 1.5.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +15 -3
- data/ext/zstdruby/common.h +7 -0
- data/ext/zstdruby/libzstd/common/bits.h +175 -0
- data/ext/zstdruby/libzstd/common/bitstream.h +18 -59
- data/ext/zstdruby/libzstd/common/compiler.h +22 -3
- data/ext/zstdruby/libzstd/common/cpu.h +1 -1
- data/ext/zstdruby/libzstd/common/debug.c +1 -1
- data/ext/zstdruby/libzstd/common/debug.h +1 -1
- data/ext/zstdruby/libzstd/common/entropy_common.c +12 -40
- data/ext/zstdruby/libzstd/common/error_private.c +9 -2
- data/ext/zstdruby/libzstd/common/error_private.h +1 -1
- data/ext/zstdruby/libzstd/common/fse.h +5 -83
- data/ext/zstdruby/libzstd/common/fse_decompress.c +7 -99
- data/ext/zstdruby/libzstd/common/huf.h +65 -156
- data/ext/zstdruby/libzstd/common/mem.h +39 -46
- data/ext/zstdruby/libzstd/common/pool.c +26 -10
- data/ext/zstdruby/libzstd/common/pool.h +7 -1
- data/ext/zstdruby/libzstd/common/portability_macros.h +22 -3
- data/ext/zstdruby/libzstd/common/threading.c +68 -14
- data/ext/zstdruby/libzstd/common/threading.h +5 -10
- data/ext/zstdruby/libzstd/common/xxhash.c +2 -2
- data/ext/zstdruby/libzstd/common/xxhash.h +8 -8
- data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
- data/ext/zstdruby/libzstd/common/zstd_deps.h +1 -1
- data/ext/zstdruby/libzstd/common/zstd_internal.h +17 -113
- data/ext/zstdruby/libzstd/common/zstd_trace.h +3 -3
- data/ext/zstdruby/libzstd/compress/clevels.h +1 -1
- data/ext/zstdruby/libzstd/compress/fse_compress.c +7 -124
- data/ext/zstdruby/libzstd/compress/hist.c +1 -1
- data/ext/zstdruby/libzstd/compress/hist.h +1 -1
- data/ext/zstdruby/libzstd/compress/huf_compress.c +234 -169
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +1055 -455
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +165 -145
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +115 -39
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +16 -8
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +3 -3
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +25 -21
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +5 -3
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +95 -33
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +3 -2
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +433 -148
- data/ext/zstdruby/libzstd/compress/zstd_fast.h +3 -2
- data/ext/zstdruby/libzstd/compress/zstd_lazy.c +306 -283
- data/ext/zstdruby/libzstd/compress/zstd_lazy.h +4 -2
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +5 -5
- data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +104 -80
- data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +12 -5
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +1 -1
- data/ext/zstdruby/libzstd/decompress/huf_decompress.c +434 -441
- data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +30 -39
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +3 -4
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +1 -1
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +164 -42
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +186 -65
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +1 -1
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +4 -2
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +19 -15
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +1 -1
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +2 -2
- data/ext/zstdruby/libzstd/dictBuilder/zdict.c +9 -87
- data/ext/zstdruby/libzstd/zdict.h +53 -31
- data/ext/zstdruby/libzstd/zstd.h +489 -90
- data/ext/zstdruby/libzstd/zstd_errors.h +27 -8
- data/ext/zstdruby/main.c +4 -0
- data/ext/zstdruby/streaming_compress.c +1 -7
- data/ext/zstdruby/zstdruby.c +110 -26
- data/lib/zstd-ruby/version.rb +1 -1
- data/lib/zstd-ruby.rb +0 -1
- metadata +7 -6
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/*
|
|
2
|
-
* Copyright (c)
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
3
|
* All rights reserved.
|
|
4
4
|
*
|
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
|
|
11
11
|
#include "zstd_compress_internal.h"
|
|
12
12
|
#include "zstd_lazy.h"
|
|
13
|
+
#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
|
|
13
14
|
|
|
14
15
|
|
|
15
16
|
/*-*************************************
|
|
@@ -197,8 +198,8 @@ ZSTD_DUBT_findBetterDictMatch (
|
|
|
197
198
|
U32 matchIndex = dictMatchIndex + dictIndexDelta;
|
|
198
199
|
if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
|
|
199
200
|
DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
|
|
200
|
-
curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr,
|
|
201
|
-
bestLength = matchLength, *offsetPtr =
|
|
201
|
+
curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
|
|
202
|
+
bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
|
|
202
203
|
}
|
|
203
204
|
if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
|
|
204
205
|
break; /* drop, to guarantee consistency (miss a little bit of compression) */
|
|
@@ -218,7 +219,7 @@ ZSTD_DUBT_findBetterDictMatch (
|
|
|
218
219
|
}
|
|
219
220
|
|
|
220
221
|
if (bestLength >= MINMATCH) {
|
|
221
|
-
U32 const mIndex = curr - (U32)
|
|
222
|
+
U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
|
|
222
223
|
DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
|
|
223
224
|
curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
|
|
224
225
|
}
|
|
@@ -230,7 +231,7 @@ ZSTD_DUBT_findBetterDictMatch (
|
|
|
230
231
|
static size_t
|
|
231
232
|
ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
|
|
232
233
|
const BYTE* const ip, const BYTE* const iend,
|
|
233
|
-
size_t*
|
|
234
|
+
size_t* offBasePtr,
|
|
234
235
|
U32 const mls,
|
|
235
236
|
const ZSTD_dictMode_e dictMode)
|
|
236
237
|
{
|
|
@@ -327,8 +328,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
|
|
|
327
328
|
if (matchLength > bestLength) {
|
|
328
329
|
if (matchLength > matchEndIdx - matchIndex)
|
|
329
330
|
matchEndIdx = matchIndex + (U32)matchLength;
|
|
330
|
-
if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)
|
|
331
|
-
bestLength = matchLength, *
|
|
331
|
+
if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
|
|
332
|
+
bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
|
|
332
333
|
if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
|
|
333
334
|
if (dictMode == ZSTD_dictMatchState) {
|
|
334
335
|
nbCompares = 0; /* in addition to avoiding checking any
|
|
@@ -361,16 +362,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
|
|
|
361
362
|
if (dictMode == ZSTD_dictMatchState && nbCompares) {
|
|
362
363
|
bestLength = ZSTD_DUBT_findBetterDictMatch(
|
|
363
364
|
ms, ip, iend,
|
|
364
|
-
|
|
365
|
+
offBasePtr, bestLength, nbCompares,
|
|
365
366
|
mls, dictMode);
|
|
366
367
|
}
|
|
367
368
|
|
|
368
369
|
assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
|
|
369
370
|
ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
|
|
370
371
|
if (bestLength >= MINMATCH) {
|
|
371
|
-
U32 const mIndex = curr - (U32)
|
|
372
|
+
U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
|
|
372
373
|
DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
|
|
373
|
-
curr, (U32)bestLength, (U32)*
|
|
374
|
+
curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
|
|
374
375
|
}
|
|
375
376
|
return bestLength;
|
|
376
377
|
}
|
|
@@ -381,14 +382,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
|
|
|
381
382
|
FORCE_INLINE_TEMPLATE size_t
|
|
382
383
|
ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
|
|
383
384
|
const BYTE* const ip, const BYTE* const iLimit,
|
|
384
|
-
size_t*
|
|
385
|
+
size_t* offBasePtr,
|
|
385
386
|
const U32 mls /* template */,
|
|
386
387
|
const ZSTD_dictMode_e dictMode)
|
|
387
388
|
{
|
|
388
389
|
DEBUGLOG(7, "ZSTD_BtFindBestMatch");
|
|
389
390
|
if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
|
|
390
391
|
ZSTD_updateDUBT(ms, ip, iLimit, mls);
|
|
391
|
-
return ZSTD_DUBT_findBestMatch(ms, ip, iLimit,
|
|
392
|
+
return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
|
|
392
393
|
}
|
|
393
394
|
|
|
394
395
|
/***********************************
|
|
@@ -561,7 +562,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
|
|
|
561
562
|
/* save best solution */
|
|
562
563
|
if (currentMl > ml) {
|
|
563
564
|
ml = currentMl;
|
|
564
|
-
*offsetPtr =
|
|
565
|
+
*offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
|
|
565
566
|
if (ip+currentMl == iLimit) {
|
|
566
567
|
/* best possible, avoids read overflow on next attempt */
|
|
567
568
|
return ml;
|
|
@@ -598,7 +599,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
|
|
|
598
599
|
/* save best solution */
|
|
599
600
|
if (currentMl > ml) {
|
|
600
601
|
ml = currentMl;
|
|
601
|
-
*offsetPtr =
|
|
602
|
+
*offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
|
|
602
603
|
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
|
|
603
604
|
}
|
|
604
605
|
}
|
|
@@ -691,7 +692,8 @@ size_t ZSTD_HcFindBestMatch(
|
|
|
691
692
|
if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
|
|
692
693
|
const BYTE* const match = base + matchIndex;
|
|
693
694
|
assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
|
|
694
|
-
|
|
695
|
+
/* read 4B starting from (match + ml + 1 - sizeof(U32)) */
|
|
696
|
+
if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
|
|
695
697
|
currentMl = ZSTD_count(ip, match, iLimit);
|
|
696
698
|
} else {
|
|
697
699
|
const BYTE* const match = dictBase + matchIndex;
|
|
@@ -703,7 +705,7 @@ size_t ZSTD_HcFindBestMatch(
|
|
|
703
705
|
/* save best solution */
|
|
704
706
|
if (currentMl > ml) {
|
|
705
707
|
ml = currentMl;
|
|
706
|
-
*offsetPtr =
|
|
708
|
+
*offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
|
|
707
709
|
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
|
|
708
710
|
}
|
|
709
711
|
|
|
@@ -739,7 +741,7 @@ size_t ZSTD_HcFindBestMatch(
|
|
|
739
741
|
if (currentMl > ml) {
|
|
740
742
|
ml = currentMl;
|
|
741
743
|
assert(curr > matchIndex + dmsIndexDelta);
|
|
742
|
-
*offsetPtr =
|
|
744
|
+
*offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
|
|
743
745
|
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
|
|
744
746
|
}
|
|
745
747
|
|
|
@@ -757,7 +759,6 @@ size_t ZSTD_HcFindBestMatch(
|
|
|
757
759
|
***********************************/
|
|
758
760
|
/* Constants for row-based hash */
|
|
759
761
|
#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
|
|
760
|
-
#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
|
|
761
762
|
#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
|
|
762
763
|
#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
|
|
763
764
|
|
|
@@ -769,38 +770,8 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr
|
|
|
769
770
|
* Starting from the LSB, returns the idx of the next non-zero bit.
|
|
770
771
|
* Basically counting the nb of trailing zeroes.
|
|
771
772
|
*/
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
# if defined(_MSC_VER) && defined(_WIN64)
|
|
775
|
-
if (val != 0) {
|
|
776
|
-
unsigned long r;
|
|
777
|
-
_BitScanForward64(&r, val);
|
|
778
|
-
return (U32)(r);
|
|
779
|
-
} else {
|
|
780
|
-
/* Should not reach this code path */
|
|
781
|
-
__assume(0);
|
|
782
|
-
}
|
|
783
|
-
# elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
|
|
784
|
-
if (sizeof(size_t) == 4) {
|
|
785
|
-
U32 mostSignificantWord = (U32)(val >> 32);
|
|
786
|
-
U32 leastSignificantWord = (U32)val;
|
|
787
|
-
if (leastSignificantWord == 0) {
|
|
788
|
-
return 32 + (U32)__builtin_ctz(mostSignificantWord);
|
|
789
|
-
} else {
|
|
790
|
-
return (U32)__builtin_ctz(leastSignificantWord);
|
|
791
|
-
}
|
|
792
|
-
} else {
|
|
793
|
-
return (U32)__builtin_ctzll(val);
|
|
794
|
-
}
|
|
795
|
-
# else
|
|
796
|
-
/* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
|
|
797
|
-
* and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
|
|
798
|
-
*/
|
|
799
|
-
val = ~val & (val - 1ULL); /* Lowest set bit mask */
|
|
800
|
-
val = val - ((val >> 1) & 0x5555555555555555);
|
|
801
|
-
val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
|
|
802
|
-
return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
|
|
803
|
-
# endif
|
|
773
|
+
MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
|
|
774
|
+
return ZSTD_countTrailingZeros64(val);
|
|
804
775
|
}
|
|
805
776
|
|
|
806
777
|
/* ZSTD_rotateRight_*():
|
|
@@ -980,7 +951,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
|
|
|
980
951
|
const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
|
|
981
952
|
|
|
982
953
|
DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
|
|
983
|
-
ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /*
|
|
954
|
+
ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
|
|
955
|
+
}
|
|
956
|
+
|
|
957
|
+
/* Returns the mask width of bits group of which will be set to 1. Given not all
|
|
958
|
+
* architectures have easy movemask instruction, this helps to iterate over
|
|
959
|
+
* groups of bits easier and faster.
|
|
960
|
+
*/
|
|
961
|
+
FORCE_INLINE_TEMPLATE U32
|
|
962
|
+
ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
|
|
963
|
+
{
|
|
964
|
+
assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
|
|
965
|
+
assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
|
|
966
|
+
(void)rowEntries;
|
|
967
|
+
#if defined(ZSTD_ARCH_ARM_NEON)
|
|
968
|
+
/* NEON path only works for little endian */
|
|
969
|
+
if (!MEM_isLittleEndian()) {
|
|
970
|
+
return 1;
|
|
971
|
+
}
|
|
972
|
+
if (rowEntries == 16) {
|
|
973
|
+
return 4;
|
|
974
|
+
}
|
|
975
|
+
if (rowEntries == 32) {
|
|
976
|
+
return 2;
|
|
977
|
+
}
|
|
978
|
+
if (rowEntries == 64) {
|
|
979
|
+
return 1;
|
|
980
|
+
}
|
|
981
|
+
#endif
|
|
982
|
+
return 1;
|
|
984
983
|
}
|
|
985
984
|
|
|
986
985
|
#if defined(ZSTD_ARCH_X86_SSE2)
|
|
@@ -1003,71 +1002,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
|
|
|
1003
1002
|
}
|
|
1004
1003
|
#endif
|
|
1005
1004
|
|
|
1006
|
-
|
|
1007
|
-
* the hash at the nth position in a row of the tagTable.
|
|
1008
|
-
* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
|
|
1009
|
-
* to match up with the actual layout of the entries within the hashTable */
|
|
1005
|
+
#if defined(ZSTD_ARCH_ARM_NEON)
|
|
1010
1006
|
FORCE_INLINE_TEMPLATE ZSTD_VecMask
|
|
1011
|
-
|
|
1007
|
+
ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
|
|
1008
|
+
{
|
|
1009
|
+
assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
|
|
1010
|
+
if (rowEntries == 16) {
|
|
1011
|
+
/* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
|
|
1012
|
+
* After that groups of 4 bits represent the equalMask. We lower
|
|
1013
|
+
* all bits except the highest in these groups by doing AND with
|
|
1014
|
+
* 0x88 = 0b10001000.
|
|
1015
|
+
*/
|
|
1016
|
+
const uint8x16_t chunk = vld1q_u8(src);
|
|
1017
|
+
const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
|
|
1018
|
+
const uint8x8_t res = vshrn_n_u16(equalMask, 4);
|
|
1019
|
+
const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
|
|
1020
|
+
return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
|
|
1021
|
+
} else if (rowEntries == 32) {
|
|
1022
|
+
/* Same idea as with rowEntries == 16 but doing AND with
|
|
1023
|
+
* 0x55 = 0b01010101.
|
|
1024
|
+
*/
|
|
1025
|
+
const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
|
|
1026
|
+
const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
|
|
1027
|
+
const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
|
|
1028
|
+
const uint8x16_t dup = vdupq_n_u8(tag);
|
|
1029
|
+
const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
|
|
1030
|
+
const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
|
|
1031
|
+
const uint8x8_t res = vsli_n_u8(t0, t1, 4);
|
|
1032
|
+
const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
|
|
1033
|
+
return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
|
|
1034
|
+
} else { /* rowEntries == 64 */
|
|
1035
|
+
const uint8x16x4_t chunk = vld4q_u8(src);
|
|
1036
|
+
const uint8x16_t dup = vdupq_n_u8(tag);
|
|
1037
|
+
const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
|
|
1038
|
+
const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
|
|
1039
|
+
const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
|
|
1040
|
+
const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
|
|
1041
|
+
|
|
1042
|
+
const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
|
|
1043
|
+
const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
|
|
1044
|
+
const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
|
|
1045
|
+
const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
|
|
1046
|
+
const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
|
|
1047
|
+
const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
|
|
1048
|
+
return ZSTD_rotateRight_U64(matches, headGrouped);
|
|
1049
|
+
}
|
|
1050
|
+
}
|
|
1051
|
+
#endif
|
|
1052
|
+
|
|
1053
|
+
/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
|
|
1054
|
+
* ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
|
|
1055
|
+
* matches the hash at the nth position in a row of the tagTable.
|
|
1056
|
+
* Each row is a circular buffer beginning at the value of "headGrouped". So we
|
|
1057
|
+
* must rotate the "matches" bitfield to match up with the actual layout of the
|
|
1058
|
+
* entries within the hashTable */
|
|
1059
|
+
FORCE_INLINE_TEMPLATE ZSTD_VecMask
|
|
1060
|
+
ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
|
|
1012
1061
|
{
|
|
1013
1062
|
const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
|
|
1014
1063
|
assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
|
|
1015
1064
|
assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
|
|
1065
|
+
assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);
|
|
1016
1066
|
|
|
1017
1067
|
#if defined(ZSTD_ARCH_X86_SSE2)
|
|
1018
1068
|
|
|
1019
|
-
return ZSTD_row_getSSEMask(rowEntries / 16, src, tag,
|
|
1069
|
+
return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
|
|
1020
1070
|
|
|
1021
1071
|
#else /* SW or NEON-LE */
|
|
1022
1072
|
|
|
1023
1073
|
# if defined(ZSTD_ARCH_ARM_NEON)
|
|
1024
1074
|
/* This NEON path only works for little endian - otherwise use SWAR below */
|
|
1025
1075
|
if (MEM_isLittleEndian()) {
|
|
1026
|
-
|
|
1027
|
-
const uint8x16_t chunk = vld1q_u8(src);
|
|
1028
|
-
const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
|
|
1029
|
-
const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
|
|
1030
|
-
const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
|
|
1031
|
-
const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
|
|
1032
|
-
const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
|
|
1033
|
-
const U16 hi = (U16)vgetq_lane_u8(t3, 8);
|
|
1034
|
-
const U16 lo = (U16)vgetq_lane_u8(t3, 0);
|
|
1035
|
-
return ZSTD_rotateRight_U16((hi << 8) | lo, head);
|
|
1036
|
-
} else if (rowEntries == 32) {
|
|
1037
|
-
const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
|
|
1038
|
-
const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
|
|
1039
|
-
const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
|
|
1040
|
-
const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
|
|
1041
|
-
const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
|
|
1042
|
-
const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
|
|
1043
|
-
const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
|
|
1044
|
-
const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
|
|
1045
|
-
const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
|
|
1046
|
-
const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
|
|
1047
|
-
const uint8x8x2_t t3 = vuzp_u8(t2, t0);
|
|
1048
|
-
const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
|
|
1049
|
-
const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
|
|
1050
|
-
return ZSTD_rotateRight_U32(matches, head);
|
|
1051
|
-
} else { /* rowEntries == 64 */
|
|
1052
|
-
const uint8x16x4_t chunk = vld4q_u8(src);
|
|
1053
|
-
const uint8x16_t dup = vdupq_n_u8(tag);
|
|
1054
|
-
const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
|
|
1055
|
-
const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
|
|
1056
|
-
const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
|
|
1057
|
-
const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
|
|
1058
|
-
|
|
1059
|
-
const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
|
|
1060
|
-
const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
|
|
1061
|
-
const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
|
|
1062
|
-
const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
|
|
1063
|
-
const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
|
|
1064
|
-
const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
|
|
1065
|
-
return ZSTD_rotateRight_U64(matches, head);
|
|
1066
|
-
}
|
|
1076
|
+
return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
|
|
1067
1077
|
}
|
|
1068
1078
|
# endif /* ZSTD_ARCH_ARM_NEON */
|
|
1069
1079
|
/* SWAR */
|
|
1070
|
-
{ const
|
|
1080
|
+
{ const int chunkSize = sizeof(size_t);
|
|
1071
1081
|
const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
|
|
1072
1082
|
const size_t xFF = ~((size_t)0);
|
|
1073
1083
|
const size_t x01 = xFF / 0xFF;
|
|
@@ -1100,11 +1110,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
|
|
|
1100
1110
|
}
|
|
1101
1111
|
matches = ~matches;
|
|
1102
1112
|
if (rowEntries == 16) {
|
|
1103
|
-
return ZSTD_rotateRight_U16((U16)matches,
|
|
1113
|
+
return ZSTD_rotateRight_U16((U16)matches, headGrouped);
|
|
1104
1114
|
} else if (rowEntries == 32) {
|
|
1105
|
-
return ZSTD_rotateRight_U32((U32)matches,
|
|
1115
|
+
return ZSTD_rotateRight_U32((U32)matches, headGrouped);
|
|
1106
1116
|
} else {
|
|
1107
|
-
return ZSTD_rotateRight_U64((U64)matches,
|
|
1117
|
+
return ZSTD_rotateRight_U64((U64)matches, headGrouped);
|
|
1108
1118
|
}
|
|
1109
1119
|
}
|
|
1110
1120
|
#endif
|
|
@@ -1152,6 +1162,7 @@ size_t ZSTD_RowFindBestMatch(
|
|
|
1152
1162
|
const U32 rowEntries = (1U << rowLog);
|
|
1153
1163
|
const U32 rowMask = rowEntries - 1;
|
|
1154
1164
|
const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
|
|
1165
|
+
const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
|
|
1155
1166
|
U32 nbAttempts = 1U << cappedSearchLog;
|
|
1156
1167
|
size_t ml=4-1;
|
|
1157
1168
|
|
|
@@ -1194,15 +1205,15 @@ size_t ZSTD_RowFindBestMatch(
|
|
|
1194
1205
|
U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
|
|
1195
1206
|
U32* const row = hashTable + relRow;
|
|
1196
1207
|
BYTE* tagRow = (BYTE*)(tagTable + relRow);
|
|
1197
|
-
U32 const
|
|
1208
|
+
U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
|
|
1198
1209
|
U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
|
|
1199
1210
|
size_t numMatches = 0;
|
|
1200
1211
|
size_t currMatch = 0;
|
|
1201
|
-
ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag,
|
|
1212
|
+
ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
|
|
1202
1213
|
|
|
1203
1214
|
/* Cycle through the matches and prefetch */
|
|
1204
1215
|
for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
|
|
1205
|
-
U32 const matchPos = (
|
|
1216
|
+
U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
|
|
1206
1217
|
U32 const matchIndex = row[matchPos];
|
|
1207
1218
|
assert(numMatches < rowEntries);
|
|
1208
1219
|
if (matchIndex < lowLimit)
|
|
@@ -1233,7 +1244,8 @@ size_t ZSTD_RowFindBestMatch(
|
|
|
1233
1244
|
if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
|
|
1234
1245
|
const BYTE* const match = base + matchIndex;
|
|
1235
1246
|
assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
|
|
1236
|
-
|
|
1247
|
+
/* read 4B starting from (match + ml + 1 - sizeof(U32)) */
|
|
1248
|
+
if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
|
|
1237
1249
|
currentMl = ZSTD_count(ip, match, iLimit);
|
|
1238
1250
|
} else {
|
|
1239
1251
|
const BYTE* const match = dictBase + matchIndex;
|
|
@@ -1245,7 +1257,7 @@ size_t ZSTD_RowFindBestMatch(
|
|
|
1245
1257
|
/* Save best solution */
|
|
1246
1258
|
if (currentMl > ml) {
|
|
1247
1259
|
ml = currentMl;
|
|
1248
|
-
*offsetPtr =
|
|
1260
|
+
*offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
|
|
1249
1261
|
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
|
|
1250
1262
|
}
|
|
1251
1263
|
}
|
|
@@ -1263,14 +1275,14 @@ size_t ZSTD_RowFindBestMatch(
|
|
|
1263
1275
|
const U32 dmsSize = (U32)(dmsEnd - dmsBase);
|
|
1264
1276
|
const U32 dmsIndexDelta = dictLimit - dmsSize;
|
|
1265
1277
|
|
|
1266
|
-
{ U32 const
|
|
1278
|
+
{ U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
|
|
1267
1279
|
U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
|
|
1268
1280
|
size_t numMatches = 0;
|
|
1269
1281
|
size_t currMatch = 0;
|
|
1270
|
-
ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag,
|
|
1282
|
+
ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
|
|
1271
1283
|
|
|
1272
1284
|
for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
|
|
1273
|
-
U32 const matchPos = (
|
|
1285
|
+
U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
|
|
1274
1286
|
U32 const matchIndex = dmsRow[matchPos];
|
|
1275
1287
|
if (matchIndex < dmsLowestIndex)
|
|
1276
1288
|
break;
|
|
@@ -1294,7 +1306,7 @@ size_t ZSTD_RowFindBestMatch(
|
|
|
1294
1306
|
if (currentMl > ml) {
|
|
1295
1307
|
ml = currentMl;
|
|
1296
1308
|
assert(curr > matchIndex + dmsIndexDelta);
|
|
1297
|
-
*offsetPtr =
|
|
1309
|
+
*offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
|
|
1298
1310
|
if (ip+currentMl == iLimit) break;
|
|
1299
1311
|
}
|
|
1300
1312
|
}
|
|
@@ -1304,14 +1316,10 @@ size_t ZSTD_RowFindBestMatch(
|
|
|
1304
1316
|
}
|
|
1305
1317
|
|
|
1306
1318
|
|
|
1307
|
-
typedef size_t (*searchMax_f)(
|
|
1308
|
-
ZSTD_matchState_t* ms,
|
|
1309
|
-
const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
|
|
1310
|
-
|
|
1311
1319
|
/**
|
|
1312
|
-
*
|
|
1313
|
-
*
|
|
1314
|
-
*
|
|
1320
|
+
* Generate search functions templated on (dictMode, mls, rowLog).
|
|
1321
|
+
* These functions are outlined for code size & compilation time.
|
|
1322
|
+
* ZSTD_searchMax() dispatches to the correct implementation function.
|
|
1315
1323
|
*
|
|
1316
1324
|
* TODO: The start of the search function involves loading and calculating a
|
|
1317
1325
|
* bunch of constants from the ZSTD_matchState_t. These computations could be
|
|
@@ -1329,25 +1337,25 @@ typedef size_t (*searchMax_f)(
|
|
|
1329
1337
|
* the single segment loop. It should go in searchMax instead of its own
|
|
1330
1338
|
* function to avoid having multiple virtual function calls per search.
|
|
1331
1339
|
*/
|
|
1332
|
-
typedef struct {
|
|
1333
|
-
searchMax_f searchMax;
|
|
1334
|
-
} ZSTD_LazyVTable;
|
|
1335
1340
|
|
|
1336
|
-
#define
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1341
|
+
#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
|
|
1342
|
+
#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
|
|
1343
|
+
#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog
|
|
1344
|
+
|
|
1345
|
+
#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
|
|
1346
|
+
|
|
1347
|
+
#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \
|
|
1348
|
+
ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \
|
|
1349
|
+
ZSTD_matchState_t* ms, \
|
|
1350
|
+
const BYTE* ip, const BYTE* const iLimit, \
|
|
1351
|
+
size_t* offBasePtr) \
|
|
1352
|
+
{ \
|
|
1353
|
+
assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
|
|
1354
|
+
return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
|
|
1355
|
+
} \
|
|
1348
1356
|
|
|
1349
|
-
#define
|
|
1350
|
-
|
|
1357
|
+
#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \
|
|
1358
|
+
ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \
|
|
1351
1359
|
ZSTD_matchState_t* ms, \
|
|
1352
1360
|
const BYTE* ip, const BYTE* const iLimit, \
|
|
1353
1361
|
size_t* offsetPtr) \
|
|
@@ -1355,12 +1363,9 @@ typedef struct {
|
|
|
1355
1363
|
assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
|
|
1356
1364
|
return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
|
|
1357
1365
|
} \
|
|
1358
|
-
static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
|
|
1359
|
-
ZSTD_HcFindBestMatch_##dictMode##_##mls \
|
|
1360
|
-
};
|
|
1361
1366
|
|
|
1362
|
-
#define
|
|
1363
|
-
|
|
1367
|
+
#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \
|
|
1368
|
+
ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \
|
|
1364
1369
|
ZSTD_matchState_t* ms, \
|
|
1365
1370
|
const BYTE* ip, const BYTE* const iLimit, \
|
|
1366
1371
|
size_t* offsetPtr) \
|
|
@@ -1369,9 +1374,6 @@ typedef struct {
|
|
|
1369
1374
|
assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
|
|
1370
1375
|
return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
|
|
1371
1376
|
} \
|
|
1372
|
-
static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \
|
|
1373
|
-
ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \
|
|
1374
|
-
};
|
|
1375
1377
|
|
|
1376
1378
|
#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
|
|
1377
1379
|
X(dictMode, mls, 4) \
|
|
@@ -1394,84 +1396,103 @@ typedef struct {
|
|
|
1394
1396
|
X(__VA_ARGS__, dictMatchState) \
|
|
1395
1397
|
X(__VA_ARGS__, dedicatedDictSearch)
|
|
1396
1398
|
|
|
1397
|
-
/* Generate
|
|
1398
|
-
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG,
|
|
1399
|
-
/* Generate
|
|
1400
|
-
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS,
|
|
1401
|
-
/* Generate
|
|
1402
|
-
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS,
|
|
1403
|
-
|
|
1404
|
-
#define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
|
|
1405
|
-
{ \
|
|
1406
|
-
&ZSTD_BtVTable_##dictMode##_4, \
|
|
1407
|
-
&ZSTD_BtVTable_##dictMode##_5, \
|
|
1408
|
-
&ZSTD_BtVTable_##dictMode##_6 \
|
|
1409
|
-
}
|
|
1410
|
-
|
|
1411
|
-
#define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
|
|
1412
|
-
{ \
|
|
1413
|
-
&ZSTD_HcVTable_##dictMode##_4, \
|
|
1414
|
-
&ZSTD_HcVTable_##dictMode##_5, \
|
|
1415
|
-
&ZSTD_HcVTable_##dictMode##_6 \
|
|
1416
|
-
}
|
|
1399
|
+
/* Generate row search fns for each combination of (dictMode, mls, rowLog) */
|
|
1400
|
+
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
|
|
1401
|
+
/* Generate binary Tree search fns for each combination of (dictMode, mls) */
|
|
1402
|
+
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
|
|
1403
|
+
/* Generate hash chain search fns for each combination of (dictMode, mls) */
|
|
1404
|
+
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)
|
|
1417
1405
|
|
|
1418
|
-
|
|
1419
|
-
{ \
|
|
1420
|
-
&ZSTD_RowVTable_##dictMode##_##mls##_4, \
|
|
1421
|
-
&ZSTD_RowVTable_##dictMode##_##mls##_5, \
|
|
1422
|
-
&ZSTD_RowVTable_##dictMode##_##mls##_6 \
|
|
1423
|
-
}
|
|
1424
|
-
|
|
1425
|
-
#define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
|
|
1426
|
-
{ \
|
|
1427
|
-
GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
|
|
1428
|
-
GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
|
|
1429
|
-
GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
|
|
1430
|
-
}
|
|
1406
|
+
typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
|
|
1431
1407
|
|
|
1432
|
-
#define
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1408
|
+
#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
|
|
1409
|
+
case mls: \
|
|
1410
|
+
return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
|
|
1411
|
+
#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
|
|
1412
|
+
case mls: \
|
|
1413
|
+
return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
|
|
1414
|
+
#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \
|
|
1415
|
+
case rowLog: \
|
|
1416
|
+
return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
|
|
1417
|
+
|
|
1418
|
+
#define ZSTD_SWITCH_MLS(X, dictMode) \
|
|
1419
|
+
switch (mls) { \
|
|
1420
|
+
ZSTD_FOR_EACH_MLS(X, dictMode) \
|
|
1438
1421
|
}
|
|
1439
1422
|
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1423
|
+
#define ZSTD_SWITCH_ROWLOG(dictMode, mls) \
|
|
1424
|
+
case mls: \
|
|
1425
|
+
switch (rowLog) { \
|
|
1426
|
+
ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
|
|
1427
|
+
} \
|
|
1428
|
+
ZSTD_UNREACHABLE; \
|
|
1429
|
+
break;
|
|
1430
|
+
|
|
1431
|
+
#define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \
|
|
1432
|
+
switch (searchMethod) { \
|
|
1433
|
+
case search_hashChain: \
|
|
1434
|
+
ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
|
|
1435
|
+
break; \
|
|
1436
|
+
case search_binaryTree: \
|
|
1437
|
+
ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
|
|
1438
|
+
break; \
|
|
1439
|
+
case search_rowHash: \
|
|
1440
|
+
ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \
|
|
1441
|
+
break; \
|
|
1442
|
+
} \
|
|
1443
|
+
ZSTD_UNREACHABLE;
|
|
1444
1444
|
|
|
1445
1445
|
/**
|
|
1446
|
-
*
|
|
1447
|
-
*
|
|
1448
|
-
*
|
|
1449
|
-
*
|
|
1446
|
+
* Searches for the longest match at @p ip.
|
|
1447
|
+
* Dispatches to the correct implementation function based on the
|
|
1448
|
+
* (searchMethod, dictMode, mls, rowLog). We use switch statements
|
|
1449
|
+
* here instead of using an indirect function call through a function
|
|
1450
|
+
* pointer because after Spectre and Meltdown mitigations, indirect
|
|
1451
|
+
* function calls can be very costly, especially in the kernel.
|
|
1452
|
+
*
|
|
1453
|
+
* NOTE: dictMode and searchMethod should be templated, so those switch
|
|
1454
|
+
* statements should be optimized out. Only the mls & rowLog switches
|
|
1455
|
+
* should be left.
|
|
1456
|
+
*
|
|
1457
|
+
* @param ms The match state.
|
|
1458
|
+
* @param ip The position to search at.
|
|
1459
|
+
* @param iend The end of the input data.
|
|
1460
|
+
* @param[out] offsetPtr Stores the match offset into this pointer.
|
|
1461
|
+
* @param mls The minimum search length, in the range [4, 6].
|
|
1462
|
+
* @param rowLog The row log (if applicable), in the range [4, 6].
|
|
1463
|
+
* @param searchMethod The search method to use (templated).
|
|
1464
|
+
* @param dictMode The dictMode (templated).
|
|
1465
|
+
*
|
|
1466
|
+
* @returns The length of the longest match found, or < mls if no match is found.
|
|
1467
|
+
* If a match is found its offset is stored in @p offsetPtr.
|
|
1450
1468
|
*/
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1469
|
+
FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
|
|
1470
|
+
ZSTD_matchState_t* ms,
|
|
1471
|
+
const BYTE* ip,
|
|
1472
|
+
const BYTE* iend,
|
|
1473
|
+
size_t* offsetPtr,
|
|
1474
|
+
U32 const mls,
|
|
1475
|
+
U32 const rowLog,
|
|
1476
|
+
searchMethod_e const searchMethod,
|
|
1477
|
+
ZSTD_dictMode_e const dictMode)
|
|
1454
1478
|
{
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
switch (searchMethod) {
|
|
1464
|
-
case search_hashChain:
|
|
1465
|
-
return hcVTables[dictMode][mls - 4];
|
|
1466
|
-
case search_binaryTree:
|
|
1467
|
-
return btVTables[dictMode][mls - 4];
|
|
1468
|
-
case search_rowHash:
|
|
1469
|
-
return rowVTables[dictMode][mls - 4][rowLog - 4];
|
|
1470
|
-
default:
|
|
1471
|
-
return NULL;
|
|
1479
|
+
if (dictMode == ZSTD_noDict) {
|
|
1480
|
+
ZSTD_SWITCH_SEARCH_METHOD(noDict)
|
|
1481
|
+
} else if (dictMode == ZSTD_extDict) {
|
|
1482
|
+
ZSTD_SWITCH_SEARCH_METHOD(extDict)
|
|
1483
|
+
} else if (dictMode == ZSTD_dictMatchState) {
|
|
1484
|
+
ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
|
|
1485
|
+
} else if (dictMode == ZSTD_dedicatedDictSearch) {
|
|
1486
|
+
ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
|
|
1472
1487
|
}
|
|
1488
|
+
ZSTD_UNREACHABLE;
|
|
1489
|
+
return 0;
|
|
1473
1490
|
}
|
|
1474
1491
|
|
|
1492
|
+
/* *******************************
|
|
1493
|
+
* Common parser - lazy strategy
|
|
1494
|
+
*********************************/
|
|
1495
|
+
|
|
1475
1496
|
FORCE_INLINE_TEMPLATE size_t
|
|
1476
1497
|
ZSTD_compressBlock_lazy_generic(
|
|
1477
1498
|
ZSTD_matchState_t* ms, seqStore_t* seqStore,
|
|
@@ -1488,9 +1509,11 @@ ZSTD_compressBlock_lazy_generic(
|
|
|
1488
1509
|
const BYTE* const base = ms->window.base;
|
|
1489
1510
|
const U32 prefixLowestIndex = ms->window.dictLimit;
|
|
1490
1511
|
const BYTE* const prefixLowest = base + prefixLowestIndex;
|
|
1512
|
+
const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
|
|
1513
|
+
const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
|
|
1491
1514
|
|
|
1492
|
-
|
|
1493
|
-
U32
|
|
1515
|
+
U32 offset_1 = rep[0], offset_2 = rep[1];
|
|
1516
|
+
U32 offsetSaved1 = 0, offsetSaved2 = 0;
|
|
1494
1517
|
|
|
1495
1518
|
const int isDMS = dictMode == ZSTD_dictMatchState;
|
|
1496
1519
|
const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
|
|
@@ -1505,16 +1528,14 @@ ZSTD_compressBlock_lazy_generic(
|
|
|
1505
1528
|
0;
|
|
1506
1529
|
const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
|
|
1507
1530
|
|
|
1508
|
-
assert(searchMax != NULL);
|
|
1509
|
-
|
|
1510
1531
|
DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
|
|
1511
1532
|
ip += (dictAndPrefixLength == 0);
|
|
1512
1533
|
if (dictMode == ZSTD_noDict) {
|
|
1513
1534
|
U32 const curr = (U32)(ip - base);
|
|
1514
1535
|
U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
|
|
1515
1536
|
U32 const maxRep = curr - windowLow;
|
|
1516
|
-
if (offset_2 > maxRep)
|
|
1517
|
-
if (offset_1 > maxRep)
|
|
1537
|
+
if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
|
|
1538
|
+
if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
|
|
1518
1539
|
}
|
|
1519
1540
|
if (isDxS) {
|
|
1520
1541
|
/* dictMatchState repCode checks don't currently handle repCode == 0
|
|
@@ -1524,7 +1545,6 @@ ZSTD_compressBlock_lazy_generic(
|
|
|
1524
1545
|
}
|
|
1525
1546
|
|
|
1526
1547
|
if (searchMethod == search_rowHash) {
|
|
1527
|
-
const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
|
|
1528
1548
|
ZSTD_row_fillHashCache(ms, base, rowLog,
|
|
1529
1549
|
MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
|
|
1530
1550
|
ms->nextToUpdate, ilimit);
|
|
@@ -1539,7 +1559,7 @@ ZSTD_compressBlock_lazy_generic(
|
|
|
1539
1559
|
#endif
|
|
1540
1560
|
while (ip < ilimit) {
|
|
1541
1561
|
size_t matchLength=0;
|
|
1542
|
-
size_t
|
|
1562
|
+
size_t offBase = REPCODE1_TO_OFFBASE;
|
|
1543
1563
|
const BYTE* start=ip+1;
|
|
1544
1564
|
DEBUGLOG(7, "search baseline (depth 0)");
|
|
1545
1565
|
|
|
@@ -1564,10 +1584,10 @@ ZSTD_compressBlock_lazy_generic(
|
|
|
1564
1584
|
}
|
|
1565
1585
|
|
|
1566
1586
|
/* first search (depth 0) */
|
|
1567
|
-
{ size_t
|
|
1568
|
-
size_t const ml2 =
|
|
1587
|
+
{ size_t offbaseFound = 999999999;
|
|
1588
|
+
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
|
|
1569
1589
|
if (ml2 > matchLength)
|
|
1570
|
-
matchLength = ml2, start = ip,
|
|
1590
|
+
matchLength = ml2, start = ip, offBase = offbaseFound;
|
|
1571
1591
|
}
|
|
1572
1592
|
|
|
1573
1593
|
if (matchLength < 4) {
|
|
@@ -1581,12 +1601,12 @@ ZSTD_compressBlock_lazy_generic(
|
|
|
1581
1601
|
DEBUGLOG(7, "search depth 1");
|
|
1582
1602
|
ip ++;
|
|
1583
1603
|
if ( (dictMode == ZSTD_noDict)
|
|
1584
|
-
&& (
|
|
1604
|
+
&& (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
|
|
1585
1605
|
size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
|
|
1586
1606
|
int const gain2 = (int)(mlRep * 3);
|
|
1587
|
-
int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)
|
|
1607
|
+
int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
|
|
1588
1608
|
if ((mlRep >= 4) && (gain2 > gain1))
|
|
1589
|
-
matchLength = mlRep,
|
|
1609
|
+
matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
|
|
1590
1610
|
}
|
|
1591
1611
|
if (isDxS) {
|
|
1592
1612
|
const U32 repIndex = (U32)(ip - base) - offset_1;
|
|
@@ -1598,17 +1618,17 @@ ZSTD_compressBlock_lazy_generic(
|
|
|
1598
1618
|
const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
|
|
1599
1619
|
size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
|
|
1600
1620
|
int const gain2 = (int)(mlRep * 3);
|
|
1601
|
-
int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)
|
|
1621
|
+
int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
|
|
1602
1622
|
if ((mlRep >= 4) && (gain2 > gain1))
|
|
1603
|
-
matchLength = mlRep,
|
|
1623
|
+
matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
|
|
1604
1624
|
}
|
|
1605
1625
|
}
|
|
1606
|
-
{ size_t
|
|
1607
|
-
size_t const ml2 =
|
|
1608
|
-
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)
|
|
1609
|
-
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)
|
|
1626
|
+
{ size_t ofbCandidate=999999999;
|
|
1627
|
+
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
|
|
1628
|
+
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
|
|
1629
|
+
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
|
|
1610
1630
|
if ((ml2 >= 4) && (gain2 > gain1)) {
|
|
1611
|
-
matchLength = ml2,
|
|
1631
|
+
matchLength = ml2, offBase = ofbCandidate, start = ip;
|
|
1612
1632
|
continue; /* search a better one */
|
|
1613
1633
|
} }
|
|
1614
1634
|
|
|
@@ -1617,12 +1637,12 @@ ZSTD_compressBlock_lazy_generic(
|
|
|
1617
1637
|
DEBUGLOG(7, "search depth 2");
|
|
1618
1638
|
ip ++;
|
|
1619
1639
|
if ( (dictMode == ZSTD_noDict)
|
|
1620
|
-
&& (
|
|
1640
|
+
&& (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
|
|
1621
1641
|
size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
|
|
1622
1642
|
int const gain2 = (int)(mlRep * 4);
|
|
1623
|
-
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)
|
|
1643
|
+
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
|
|
1624
1644
|
if ((mlRep >= 4) && (gain2 > gain1))
|
|
1625
|
-
matchLength = mlRep,
|
|
1645
|
+
matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
|
|
1626
1646
|
}
|
|
1627
1647
|
if (isDxS) {
|
|
1628
1648
|
const U32 repIndex = (U32)(ip - base) - offset_1;
|
|
@@ -1634,17 +1654,17 @@ ZSTD_compressBlock_lazy_generic(
|
|
|
1634
1654
|
const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
|
|
1635
1655
|
size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
|
|
1636
1656
|
int const gain2 = (int)(mlRep * 4);
|
|
1637
|
-
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)
|
|
1657
|
+
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
|
|
1638
1658
|
if ((mlRep >= 4) && (gain2 > gain1))
|
|
1639
|
-
matchLength = mlRep,
|
|
1659
|
+
matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
|
|
1640
1660
|
}
|
|
1641
1661
|
}
|
|
1642
|
-
{ size_t
|
|
1643
|
-
size_t const ml2 =
|
|
1644
|
-
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)
|
|
1645
|
-
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)
|
|
1662
|
+
{ size_t ofbCandidate=999999999;
|
|
1663
|
+
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
|
|
1664
|
+
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
|
|
1665
|
+
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
|
|
1646
1666
|
if ((ml2 >= 4) && (gain2 > gain1)) {
|
|
1647
|
-
matchLength = ml2,
|
|
1667
|
+
matchLength = ml2, offBase = ofbCandidate, start = ip;
|
|
1648
1668
|
continue;
|
|
1649
1669
|
} } }
|
|
1650
1670
|
break; /* nothing found : store previous solution */
|
|
@@ -1655,24 +1675,24 @@ ZSTD_compressBlock_lazy_generic(
|
|
|
1655
1675
|
* notably if `value` is unsigned, resulting in a large positive `-value`.
|
|
1656
1676
|
*/
|
|
1657
1677
|
/* catch up */
|
|
1658
|
-
if (
|
|
1678
|
+
if (OFFBASE_IS_OFFSET(offBase)) {
|
|
1659
1679
|
if (dictMode == ZSTD_noDict) {
|
|
1660
|
-
while ( ((start > anchor) & (start -
|
|
1661
|
-
&& (start[-1] == (start-
|
|
1680
|
+
while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
|
|
1681
|
+
&& (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */
|
|
1662
1682
|
{ start--; matchLength++; }
|
|
1663
1683
|
}
|
|
1664
1684
|
if (isDxS) {
|
|
1665
|
-
U32 const matchIndex = (U32)((size_t)(start-base) -
|
|
1685
|
+
U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
|
|
1666
1686
|
const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
|
|
1667
1687
|
const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
|
|
1668
1688
|
while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
|
|
1669
1689
|
}
|
|
1670
|
-
offset_2 = offset_1; offset_1 = (U32)
|
|
1690
|
+
offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
|
|
1671
1691
|
}
|
|
1672
1692
|
/* store sequence */
|
|
1673
1693
|
_storeSequence:
|
|
1674
1694
|
{ size_t const litLength = (size_t)(start - anchor);
|
|
1675
|
-
ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)
|
|
1695
|
+
ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
|
|
1676
1696
|
anchor = ip = start + matchLength;
|
|
1677
1697
|
}
|
|
1678
1698
|
|
|
@@ -1688,8 +1708,8 @@ _storeSequence:
|
|
|
1688
1708
|
&& (MEM_read32(repMatch) == MEM_read32(ip)) ) {
|
|
1689
1709
|
const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
|
|
1690
1710
|
matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
|
|
1691
|
-
|
|
1692
|
-
ZSTD_storeSeq(seqStore, 0, anchor, iend,
|
|
1711
|
+
offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */
|
|
1712
|
+
ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
|
|
1693
1713
|
ip += matchLength;
|
|
1694
1714
|
anchor = ip;
|
|
1695
1715
|
continue;
|
|
@@ -1703,16 +1723,20 @@ _storeSequence:
|
|
|
1703
1723
|
&& (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
|
|
1704
1724
|
/* store sequence */
|
|
1705
1725
|
matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
|
|
1706
|
-
|
|
1707
|
-
ZSTD_storeSeq(seqStore, 0, anchor, iend,
|
|
1726
|
+
offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
|
|
1727
|
+
ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
|
|
1708
1728
|
ip += matchLength;
|
|
1709
1729
|
anchor = ip;
|
|
1710
1730
|
continue; /* faster when present ... (?) */
|
|
1711
1731
|
} } }
|
|
1712
1732
|
|
|
1713
|
-
/*
|
|
1714
|
-
|
|
1715
|
-
|
|
1733
|
+
/* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
|
|
1734
|
+
* rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
|
|
1735
|
+
offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
|
|
1736
|
+
|
|
1737
|
+
/* save reps for next block */
|
|
1738
|
+
rep[0] = offset_1 ? offset_1 : offsetSaved1;
|
|
1739
|
+
rep[1] = offset_2 ? offset_2 : offsetSaved2;
|
|
1716
1740
|
|
|
1717
1741
|
/* Return the last literals size */
|
|
1718
1742
|
return (size_t)(iend - anchor);
|
|
@@ -1881,9 +1905,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
|
1881
1905
|
const BYTE* const dictEnd = dictBase + dictLimit;
|
|
1882
1906
|
const BYTE* const dictStart = dictBase + ms->window.lowLimit;
|
|
1883
1907
|
const U32 windowLog = ms->cParams.windowLog;
|
|
1884
|
-
const U32
|
|
1908
|
+
const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
|
|
1909
|
+
const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
|
|
1885
1910
|
|
|
1886
|
-
searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
|
|
1887
1911
|
U32 offset_1 = rep[0], offset_2 = rep[1];
|
|
1888
1912
|
|
|
1889
1913
|
DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
|
|
@@ -1905,7 +1929,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
|
1905
1929
|
#endif
|
|
1906
1930
|
while (ip < ilimit) {
|
|
1907
1931
|
size_t matchLength=0;
|
|
1908
|
-
size_t
|
|
1932
|
+
size_t offBase = REPCODE1_TO_OFFBASE;
|
|
1909
1933
|
const BYTE* start=ip+1;
|
|
1910
1934
|
U32 curr = (U32)(ip-base);
|
|
1911
1935
|
|
|
@@ -1924,10 +1948,10 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
|
1924
1948
|
} }
|
|
1925
1949
|
|
|
1926
1950
|
/* first search (depth 0) */
|
|
1927
|
-
{ size_t
|
|
1928
|
-
size_t const ml2 =
|
|
1951
|
+
{ size_t ofbCandidate = 999999999;
|
|
1952
|
+
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
|
|
1929
1953
|
if (ml2 > matchLength)
|
|
1930
|
-
matchLength = ml2, start = ip,
|
|
1954
|
+
matchLength = ml2, start = ip, offBase = ofbCandidate;
|
|
1931
1955
|
}
|
|
1932
1956
|
|
|
1933
1957
|
if (matchLength < 4) {
|
|
@@ -1941,7 +1965,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
|
1941
1965
|
ip ++;
|
|
1942
1966
|
curr++;
|
|
1943
1967
|
/* check repCode */
|
|
1944
|
-
if (
|
|
1968
|
+
if (offBase) {
|
|
1945
1969
|
const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
|
|
1946
1970
|
const U32 repIndex = (U32)(curr - offset_1);
|
|
1947
1971
|
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
|
|
@@ -1953,18 +1977,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
|
1953
1977
|
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
|
|
1954
1978
|
size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
|
|
1955
1979
|
int const gain2 = (int)(repLength * 3);
|
|
1956
|
-
int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)
|
|
1980
|
+
int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
|
|
1957
1981
|
if ((repLength >= 4) && (gain2 > gain1))
|
|
1958
|
-
matchLength = repLength,
|
|
1982
|
+
matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
|
|
1959
1983
|
} }
|
|
1960
1984
|
|
|
1961
1985
|
/* search match, depth 1 */
|
|
1962
|
-
{ size_t
|
|
1963
|
-
size_t const ml2 =
|
|
1964
|
-
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)
|
|
1965
|
-
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)
|
|
1986
|
+
{ size_t ofbCandidate = 999999999;
|
|
1987
|
+
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
|
|
1988
|
+
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
|
|
1989
|
+
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
|
|
1966
1990
|
if ((ml2 >= 4) && (gain2 > gain1)) {
|
|
1967
|
-
matchLength = ml2,
|
|
1991
|
+
matchLength = ml2, offBase = ofbCandidate, start = ip;
|
|
1968
1992
|
continue; /* search a better one */
|
|
1969
1993
|
} }
|
|
1970
1994
|
|
|
@@ -1973,7 +1997,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
|
1973
1997
|
ip ++;
|
|
1974
1998
|
curr++;
|
|
1975
1999
|
/* check repCode */
|
|
1976
|
-
if (
|
|
2000
|
+
if (offBase) {
|
|
1977
2001
|
const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
|
|
1978
2002
|
const U32 repIndex = (U32)(curr - offset_1);
|
|
1979
2003
|
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
|
|
@@ -1985,36 +2009,36 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
|
|
|
1985
2009
|
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
|
|
1986
2010
|
size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
|
|
1987
2011
|
int const gain2 = (int)(repLength * 4);
|
|
1988
|
-
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)
|
|
2012
|
+
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
|
|
1989
2013
|
if ((repLength >= 4) && (gain2 > gain1))
|
|
1990
|
-
matchLength = repLength,
|
|
2014
|
+
matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
|
|
1991
2015
|
} }
|
|
1992
2016
|
|
|
1993
2017
|
/* search match, depth 2 */
|
|
1994
|
-
{ size_t
|
|
1995
|
-
size_t const ml2 =
|
|
1996
|
-
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)
|
|
1997
|
-
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)
|
|
2018
|
+
{ size_t ofbCandidate = 999999999;
|
|
2019
|
+
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
|
|
2020
|
+
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
|
|
2021
|
+
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
|
|
1998
2022
|
if ((ml2 >= 4) && (gain2 > gain1)) {
|
|
1999
|
-
matchLength = ml2,
|
|
2023
|
+
matchLength = ml2, offBase = ofbCandidate, start = ip;
|
|
2000
2024
|
continue;
|
|
2001
2025
|
} } }
|
|
2002
2026
|
break; /* nothing found : store previous solution */
|
|
2003
2027
|
}
|
|
2004
2028
|
|
|
2005
2029
|
/* catch up */
|
|
2006
|
-
if (
|
|
2007
|
-
U32 const matchIndex = (U32)((size_t)(start-base) -
|
|
2030
|
+
if (OFFBASE_IS_OFFSET(offBase)) {
|
|
2031
|
+
U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
|
|
2008
2032
|
const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
|
|
2009
2033
|
const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
|
|
2010
2034
|
while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
|
|
2011
|
-
offset_2 = offset_1; offset_1 = (U32)
|
|
2035
|
+
offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
|
|
2012
2036
|
}
|
|
2013
2037
|
|
|
2014
2038
|
/* store sequence */
|
|
2015
2039
|
_storeSequence:
|
|
2016
2040
|
{ size_t const litLength = (size_t)(start - anchor);
|
|
2017
|
-
ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)
|
|
2041
|
+
ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
|
|
2018
2042
|
anchor = ip = start + matchLength;
|
|
2019
2043
|
}
|
|
2020
2044
|
|
|
@@ -2031,8 +2055,8 @@ _storeSequence:
|
|
|
2031
2055
|
/* repcode detected we should take it */
|
|
2032
2056
|
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
|
|
2033
2057
|
matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
|
|
2034
|
-
|
|
2035
|
-
ZSTD_storeSeq(seqStore, 0, anchor, iend,
|
|
2058
|
+
offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */
|
|
2059
|
+
ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
|
|
2036
2060
|
ip += matchLength;
|
|
2037
2061
|
anchor = ip;
|
|
2038
2062
|
continue; /* faster when present ... (?) */
|
|
@@ -2098,7 +2122,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row(
|
|
|
2098
2122
|
size_t ZSTD_compressBlock_lazy2_extDict_row(
|
|
2099
2123
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
|
2100
2124
|
void const* src, size_t srcSize)
|
|
2101
|
-
|
|
2102
2125
|
{
|
|
2103
2126
|
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
|
|
2104
2127
|
}
|