duckdb 0.6.1-dev240.0 → 0.6.1-dev247.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb.cpp +532 -529
- package/src/duckdb.hpp +2 -2
- package/src/parquet-amalgamation.cpp +29186 -29187
package/src/duckdb.cpp
CHANGED
|
@@ -228764,626 +228764,629 @@ size_t duckdb_fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* sy
|
|
|
228764
228764
|
|
|
228765
228765
|
|
|
228766
228766
|
// Builds the symbol whose bytes are those of `a` followed by those of `b`.
//
// The combined length is clamped to Symbol::maxLength and stored, together with
// FSST_CODE_MASK, through set_code_len(). The payload is assembled in val.num with
// `a` in the low-order bytes and `b` shifted up past them; bytes of `b` that do not
// fit are shifted out.
Symbol concat(Symbol a, Symbol b) {
    Symbol s;
    u32 length = a.length() + b.length();
    if (length > Symbol::maxLength) length = Symbol::maxLength;
    s.set_code_len(FSST_CODE_MASK, length);

    // Shifting by the full width of val.num (or more) is undefined behaviour, so when
    // `a` already occupies every byte there is no room for `b` and it contributes nothing.
    const u32 shiftBits = 8 * a.length();
    s.val.num = a.val.num;
    if (shiftBits < 8 * sizeof(s.val.num)) {
        s.val.num |= b.val.num << shiftBits;
    }
    return s;
}
|
|
228774
228774
|
|
|
228775
228775
|
namespace std {
|
|
228776
228776
|
template <>
|
|
228777
228777
|
class hash<QSymbol> {
|
|
228778
|
-
|
|
228779
|
-
|
|
228780
|
-
|
|
228781
|
-
|
|
228782
|
-
|
|
228783
|
-
|
|
228784
|
-
|
|
228785
|
-
|
|
228786
|
-
|
|
228787
|
-
|
|
228788
|
-
|
|
228789
|
-
|
|
228790
|
-
|
|
228791
|
-
|
|
228792
|
-
|
|
228793
|
-
|
|
228778
|
+
public:
|
|
228779
|
+
size_t operator()(const QSymbol& q) const {
|
|
228780
|
+
uint64_t k = q.symbol.val.num;
|
|
228781
|
+
const uint64_t m = 0xc6a4a7935bd1e995;
|
|
228782
|
+
const int r = 47;
|
|
228783
|
+
uint64_t h = 0x8445d61a4e774912 ^ (8*m);
|
|
228784
|
+
k *= m;
|
|
228785
|
+
k ^= k >> r;
|
|
228786
|
+
k *= m;
|
|
228787
|
+
h ^= k;
|
|
228788
|
+
h *= m;
|
|
228789
|
+
h ^= h >> r;
|
|
228790
|
+
h *= m;
|
|
228791
|
+
h ^= h >> r;
|
|
228792
|
+
return h;
|
|
228793
|
+
}
|
|
228794
228794
|
};
|
|
228795
228795
|
}
|
|
228796
228796
|
|
|
228797
228797
|
// True when `pos` lies below FSST_CODE_BASE. The gain computation in buildSymbolTable
// adds this result to the per-symbol output cost, i.e. such codes cost one extra byte.
bool isEscapeCode(u16 pos) {
    return FSST_CODE_BASE > pos;
}
|
|
228798
228798
|
|
|
228799
228799
|
std::ostream& operator<<(std::ostream& out, const Symbol& s) {
|
|
228800
|
-
|
|
228801
|
-
|
|
228802
|
-
|
|
228800
|
+
for (u32 i=0; i<s.length(); i++)
|
|
228801
|
+
out << s.val.str[i];
|
|
228802
|
+
return out;
|
|
228803
228803
|
}
|
|
228804
|
-
|
|
228804
|
+
// File-local counter. NOTE(review): no read or write of `iter` is visible in the code around this definition -- confirm it is still used before relying on it.
static u64 iter = 0;
|
|
228805
228805
|
|
|
228806
228806
|
// Trains a symbol table on a sample of strings and returns the best table found.
//
// counters        scratch frequency counters; zeroed at the start of every round and
//                 left holding the (restored) counts of the best round.
// line            sample strings (taken by value).
// len             len[i] is the byte length of line[i].
// zeroTerminated  when true, byte 0 is used as the terminator regardless of its frequency.
//
// Returns a heap-allocated SymbolTable; ownership passes to the caller.
//
// The routine runs several rounds with a growing sampleFrac. Each round compresses (part of)
// the sample with the current table while counting symbol and symbol-pair frequencies, keeps
// the table if its gain is the best so far, and then derives the next table from the counts.
SymbolTable *buildSymbolTable(Counters& counters, vector<u8*> line, size_t len[], bool zeroTerminated=false) {
    // Owned through unique_ptr so neither table leaks if a step below throws
    // (for example the container allocations inside makeTable).
    std::unique_ptr<SymbolTable> st(new SymbolTable());
    std::unique_ptr<SymbolTable> bestTable(new SymbolTable());
    int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception)
    size_t sampleFrac = 128;

    // start by determining the terminator. We use the (lowest) most infrequent byte as terminator
    st->zeroTerminated = zeroTerminated;
    if (zeroTerminated) {
        st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency
    } else {
        u16 byteHisto[256];
        memset(byteHisto, 0, sizeof(byteHisto));
        for(size_t i=0; i<line.size(); i++) {
            u8* cur = line[i];
            u8* end = cur + len[i];
            while(cur < end) byteHisto[*cur++]++;
        }
        // scan from 255 down to 0 so that, among equally rare bytes, the lowest value wins
        u32 minSize = FSST_SAMPLEMAXSZ, i = st->terminator = 256;
        while(i-- > 0) {
            if (byteHisto[i] > minSize) continue;
            st->terminator = i;
            minSize = byteHisto[i];
        }
    }
    assert(st->terminator != 256);

    // a deterministic pseudo-random number in [1,128], derived from the line index and sampleFrac
    auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); };

    // compress sample, and compute (pair-)frequencies
    auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain
        int gain = 0;

        for(size_t i=0; i<line.size(); i++) {
            u8* cur = line[i];
            u8* end = cur + len[i];

            if (sampleFrac < 128) {
                // in earlier rounds (sampleFrac < 128) we skip data in the sample (reduces overall work ~2x)
                if (rnd128(i) > sampleFrac) continue;
            }
            if (cur < end) {
                u8* start = cur;
                u16 code2 = 255, code1 = st->findLongestSymbol(cur, end);
                cur += st->symbols[code1].length();
                gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1)));
                while (true) {
                    // count single symbol (i.e. an option is not extending it)
                    counters.count1Inc(code1);

                    // as an alternative, consider just using the next byte..
                    if (st->symbols[code1].length() != 1) // .. but do not count single byte symbols doubly
                        counters.count1Inc(*start);

                    if (cur==end) {
                        break;
                    }

                    // now match a new symbol
                    start = cur;
                    if (cur<end-7) {
                        // at least 8 bytes remain: probe the hash table and short-code table directly
                        u64 word = fsst_unaligned_load(cur);
                        size_t code = word & 0xFFFFFF;
                        size_t idx = FSST_HASH(code)&(st->hashTabSize-1);
                        Symbol s = st->hashTab[idx];
                        code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK;
                        word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
                        if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) {
                            code2 = s.code();
                            cur += s.length();
                        } else if (code2 >= FSST_CODE_BASE) {
                            cur += 2;
                        } else {
                            code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK;
                            cur += 1;
                        }
                    } else {
                        // near the end of the string: use the bounds-aware lookup
                        code2 = st->findLongestSymbol(cur, end);
                        cur += st->symbols[code2].length();
                    }

                    // compute compressed output size
                    gain += ((int) (cur-start))-(1+isEscapeCode(code2));

                    // now count the subsequent two symbols we encode as an extension possibility
                    if (sampleFrac < 128) { // no need to count pairs in final round
                        // consider the symbol that is the concatenation of the two last symbols
                        counters.count2Inc(code1, code2);

                        // as an alternative, consider just extending with the next byte..
                        if ((cur-start) > 1) // ..but do not count single byte extensions doubly
                            counters.count2Inc(code1, *start);
                    }
                    code1 = code2;
                }
            }
        }
        return gain;
    };

    auto makeTable = [&](SymbolTable *st, Counters &counters) {
        // hash set of candidates (needed because we can generate duplicate candidates)
        unordered_set<QSymbol> cands;

        // artificially make terminator the most frequent symbol so it gets included
        u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator;
        counters.count1Set(terminator,65535);

        auto addOrInc = [&](unordered_set<QSymbol> &cands, Symbol s, u64 count) {
            if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!!
            QSymbol q;
            q.symbol = s;
            q.gain = count * s.length();
            auto it = cands.find(q);
            if (it != cands.end()) {
                // same symbol seen before: merge the gains and replace the stored entry
                q.gain += (*it).gain;
                cands.erase(*it);
            }
            cands.insert(q);
        };

        // add candidate symbols based on counted frequency
        for (u32 pos1=0; pos1<FSST_CODE_BASE+(size_t) st->nSymbols; pos1++) {
            u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!!
            if (!cnt1) continue;

            // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed
            Symbol s1 = st->symbols[pos1];
            addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1);

            if (sampleFrac >= 128 || // last round we do not create new (combined) symbols
                s1.length() == Symbol::maxLength || // symbol cannot be extended
                s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte
                continue;
            }
            for (u32 pos2=0; pos2<FSST_CODE_BASE+(size_t)st->nSymbols; pos2++) {
                u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!!
                if (!cnt2) continue;

                // create a new symbol
                Symbol s2 = st->symbols[pos2];
                Symbol s3 = concat(s1, s2);
                if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte
                    addOrInc(cands, s3, cnt2);
            }
        }

        // insert candidates into priority queue (by gain); ties are ordered on val.num
        auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.val.num > q2.symbol.val.num); };
        priority_queue<QSymbol,vector<QSymbol>,decltype(cmpGn)> pq(cmpGn);
        for (auto& q : cands)
            pq.push(q);

        // Create new symbol map using best candidates
        st->clear();
        while (st->nSymbols < 255 && !pq.empty()) {
            QSymbol q = pq.top();
            pq.pop();
            st->add(q.symbol);
        }
    };

    u8 bestCounters[512*sizeof(u16)];
#ifdef NONOPT_FSST
    for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) {
        sampleFrac = frac;
#else
    for(sampleFrac=8; true; sampleFrac += 30) {
#endif
        memset(&counters, 0, sizeof(Counters));
        const int gain = compressCount(st.get(), counters);
        if (gain >= bestGain) { // a new best solution!
            counters.backup1(bestCounters);
            *bestTable = *st;
            bestGain = gain;
        }
        if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128)
        makeTable(st.get(), counters);
    }
    st.reset(); // the working table is no longer needed
    counters.restore1(bestCounters);
    makeTable(bestTable.get(), counters);
    bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression
    return bestTable.release();
}
|
|
228981
228990
|
|
|
228982
228991
|
// Compresses up to `nlines` strings in batches of at most 512 jobs, using
// duckdb_fsst_compressAVX512 when a batch holds at least 32 non-empty jobs and scalar
// code for whatever remains.
//
// symbolTable  table used for encoding.
// symbolBase   temporary buffer: input chunks are staged at symbolBase, encoded output
//              at symbolBase + (1<<18).
// nlines       number of input strings.
// len, line    len[i] is the byte length of line[i].
// size, dst    capacity and start of the caller's output buffer.
// lenOut       receives the compressed length of each processed line.
// strOut       receives the start of each processed line's output inside dst.
// unroll       forwarded to duckdb_fsst_compressAVX512.
//
// Returns the number of lines processed; every line below that index has its
// strOut/lenOut filled in. Strings longer than 511 bytes are split into 511-byte jobs.
static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, size_t len[], u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) {
    size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size;
    u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings
    SIMDjob input[512];  // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer
    SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this)
    size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs)

    // Encodes every job registered in the current batch, appends the results to dst and
    // resets the batch state. `stillReserved` is the worst-case output size of the part of
    // the current line that has not been registered yet; it stays subtracted from the budget.
    auto flushBatch = [&](size_t stillReserved) {
        if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling)
            // radix-sort jobs on length (longest string first)
            // -- this provides best load balancing and allows to skip empty jobs at the end
            u16 sortpos[513];
            memset(sortpos, 0, sizeof(sortpos));

            // calculate length histo
            for(size_t i=0; i<batchPos; i++) {
                size_t jobLen = input[i].end - input[i].cur;
                sortpos[512UL - jobLen]++;
            }
            // calculate running sum
            for(size_t i=1; i<=512; i++)
                sortpos[i] += sortpos[i-1];

            // move jobs to their final destination
            SIMDjob inputOrdered[512];
            for(size_t i=0; i<batchPos; i++) {
                size_t jobLen = input[i].end - input[i].cur;
                size_t slot = sortpos[511UL - jobLen]++;
                inputOrdered[slot] = input[i];
            }
            // finally.. SIMD compress max 256KB of simdbuf into (max) 512KB of simdbuf (but presumably much less..)
            // jobs the SIMD kernel did not report on are passed through unchanged
            for(size_t done = duckdb_fsst_compressAVX512(symbolTable, codeBase, symbolBase, inputOrdered, output, batchPos-empty, unroll);
                done < batchPos; done++) output[done] = inputOrdered[done];
        } else {
            memcpy(output, input, batchPos*sizeof(SIMDjob));
        }

        // finish encoding (unfinished strings in process, plus the few last strings not yet processed)
        for(size_t i=0; i<batchPos; i++) {
            SIMDjob job = output[i];
            if (job.cur < job.end) { // finish encoding this string with scalar code
                u8* cur = symbolBase + job.cur;
                u8* end = symbolBase + job.end;
                u8* out = codeBase + job.out;
                while (cur < end) {
                    u64 word = fsst_unaligned_load(cur);
                    size_t code = symbolTable.shortCodes[word & 0xFFFF];
                    size_t pos = word & 0xFFFFFF;
                    size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-1);
                    Symbol s = symbolTable.hashTab[idx];
                    out[1] = (u8) word; // speculatively write out escaped byte
                    word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
                    if ((s.icl < FSST_ICL_FREE) && s.val.num == word) {
                        *out++ = (u8) s.code();
                        cur += s.length();
                    } else {
                        // could be a 2-byte or 1-byte code, or miss
                        // handle everything with predication
                        *out = (u8) code;
                        out += 1+((code&FSST_CODE_BASE)>>8);
                        cur += (code>>FSST_LEN_BITS);
                    }
                }
                job.out = out - codeBase;
            }
            // postprocess job info
            job.cur = 0;
            job.end = job.out - input[job.pos].out; // misuse .end field as compressed size
            job.out = input[job.pos].out; // reset offset to start of encoded string
            input[job.pos] = job;
        }

        // copy out the result data
        for(size_t i=0; i<batchPos; i++) {
            size_t lineNr = jobLine[i]; // the sort must be order-preserving, as we concatenate results string in order
            size_t sz = input[i].end; // had stored compressed lengths here
            if (!strOut[lineNr]) strOut[lineNr] = dst; // first segment will be the strOut pointer
            lenOut[lineNr] += sz; // add segment (lenOut starts at 0 for this reason)
            memcpy(dst, codeBase+input[i].out, sz);
            dst += sz;
        }

        // go for the next batch of 512 chunks
        inOff = outOff = batchPos = empty = 0;
        // Recompute the budget from what is really left in dst, but keep the reservation for
        // the unregistered remainder of a line that was split across this flush.
        const size_t avail = (size_t) (lim - dst);
        budget = (avail > stillReserved) ? (avail - stillReserved) : 0;
    };

    while (curLine < nlines && outOff <= (1<<19)) {
        size_t prevLine = curLine, chunk, curOff = 0;

        // bail out if the output buffer cannot hold the compressed next string fully
        if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7
        else budget -= (len[curLine]-curOff)*2;

        strOut[curLine] = (u8*) 0;
        lenOut[curLine] = 0;

        do {
            do {
                chunk = len[curLine] - curOff;
                if (chunk > 511) {
                    chunk = 511; // large strings need to be chopped up into segments of 511 bytes
                }
                // create a job in this batch
                SIMDjob job;
                job.cur = inOff;
                job.end = job.cur + chunk;
                job.pos = batchPos;
                job.out = outOff;

                // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros)
                outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) bytes.
                if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk

                // register job in this batch
                input[batchPos] = job;
                jobLine[batchPos] = curLine;

                if (chunk == 0) {
                    empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out
                } else {
                    // copy string chunk into temp buffer
                    memcpy(symbolBase + inOff, line[curLine] + curOff, chunk);
                    inOff += chunk;
                    curOff += chunk;
                    symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded
                }
                if (++batchPos == 512) break;
            } while(curOff < len[curLine]);

            // curLine only advances when the batch still has room, i.e. the line was fully registered
            const bool batchFull = (batchPos == 512) || (outOff > (1<<19));
            if (batchFull || (++curLine >= nlines)) { // cannot accumulate more?
                // when the batch filled up mid-line, the rest of that line is still reserved
                flushBatch(batchFull ? 2*(len[curLine]-curOff) : 0);
            }
        } while (curLine == prevLine && outOff <= (1<<19));
    }

    // The loop can stop on the budget check while earlier, fully registered lines are still
    // queued in the batch. Encode and emit them so every line below the returned index has output.
    if (batchPos > 0) flushBatch(0);
    return curLine;
}
|
|
229114
229123
|
|
|
229115
229124
|
|
|
229116
229125
|
// optimized adaptive *scalar* compression method
|
|
229117
229126
|
static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, size_t lenIn[], u8* strIn[], size_t size, u8* out, size_t lenOut[], u8* strOut[], bool noSuffixOpt, bool avoidBranch) {
|
|
229118
|
-
|
|
229119
|
-
|
|
229120
|
-
|
|
229121
|
-
|
|
229122
|
-
|
|
229123
|
-
|
|
229124
|
-
|
|
229125
|
-
|
|
229126
|
-
|
|
229127
|
-
|
|
229128
|
-
|
|
229129
|
-
|
|
229130
|
-
|
|
229131
|
-
|
|
229132
|
-
|
|
229133
|
-
|
|
229134
|
-
|
|
229135
|
-
|
|
229136
|
-
|
|
229137
|
-
|
|
229138
|
-
|
|
229139
|
-
|
|
229140
|
-
|
|
229141
|
-
|
|
229142
|
-
|
|
229143
|
-
|
|
229144
|
-
|
|
229145
|
-
|
|
229146
|
-
|
|
229147
|
-
|
|
229148
|
-
|
|
229149
|
-
|
|
229150
|
-
|
|
229151
|
-
|
|
229152
|
-
|
|
229153
|
-
|
|
229154
|
-
|
|
229155
|
-
|
|
229156
|
-
|
|
229157
|
-
|
|
229158
|
-
|
|
229159
|
-
}
|
|
229160
|
-
}
|
|
229161
|
-
};
|
|
229127
|
+
u8 *cur = NULL, *end = NULL, *lim = out + size;
|
|
229128
|
+
size_t curLine, suffixLim = symbolTable.suffixLim;
|
|
229129
|
+
u8 byteLim = symbolTable.nSymbols + symbolTable.zeroTerminated - symbolTable.lenHisto[0];
|
|
229130
|
+
|
|
229131
|
+
u8 buf[512+7]; /* +7 sentinel is to avoid 8-byte unaligned-loads going beyond 511 out-of-bounds */
|
|
229132
|
+
memset(buf+511, 0, 8); /* and initialize the sentinal bytes */
|
|
229133
|
+
|
|
229134
|
+
// three variants are possible. dead code falls away since the bool arguments are constants
|
|
229135
|
+
auto compressVariant = [&](bool noSuffixOpt, bool avoidBranch) {
|
|
229136
|
+
while (cur < end) {
|
|
229137
|
+
u64 word = fsst_unaligned_load(cur);
|
|
229138
|
+
size_t code = symbolTable.shortCodes[word & 0xFFFF];
|
|
229139
|
+
if (noSuffixOpt && ((u8) code) < suffixLim) {
|
|
229140
|
+
// 2 byte code without having to worry about longer matches
|
|
229141
|
+
*out++ = (u8) code; cur += 2;
|
|
229142
|
+
} else {
|
|
229143
|
+
size_t pos = word & 0xFFFFFF;
|
|
229144
|
+
size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-1);
|
|
229145
|
+
Symbol s = symbolTable.hashTab[idx];
|
|
229146
|
+
out[1] = (u8) word; // speculatively write out escaped byte
|
|
229147
|
+
word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
|
|
229148
|
+
if ((s.icl < FSST_ICL_FREE) && s.val.num == word) {
|
|
229149
|
+
*out++ = (u8) s.code(); cur += s.length();
|
|
229150
|
+
} else if (avoidBranch) {
|
|
229151
|
+
// could be a 2-byte or 1-byte code, or miss
|
|
229152
|
+
// handle everything with predication
|
|
229153
|
+
*out = (u8) code;
|
|
229154
|
+
out += 1+((code&FSST_CODE_BASE)>>8);
|
|
229155
|
+
cur += (code>>FSST_LEN_BITS);
|
|
229156
|
+
} else if ((u8) code < byteLim) {
|
|
229157
|
+
// 2 byte code after checking there is no longer pattern
|
|
229158
|
+
*out++ = (u8) code; cur += 2;
|
|
229159
|
+
} else {
|
|
229160
|
+
// 1 byte code or miss.
|
|
229161
|
+
*out = (u8) code;
|
|
229162
|
+
out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse
|
|
229163
|
+
cur++;
|
|
229164
|
+
}
|
|
229165
|
+
}
|
|
229166
|
+
}
|
|
229167
|
+
};
|
|
229162
229168
|
|
|
229163
|
-
|
|
229164
|
-
|
|
229165
|
-
|
|
229166
|
-
|
|
229167
|
-
|
|
229168
|
-
|
|
229169
|
-
|
|
229170
|
-
|
|
229171
|
-
|
|
229172
|
-
|
|
229173
|
-
|
|
229174
|
-
|
|
229175
|
-
|
|
229176
|
-
|
|
229177
|
-
|
|
229178
|
-
|
|
229179
|
-
|
|
229180
|
-
|
|
229181
|
-
|
|
229182
|
-
|
|
229183
|
-
|
|
229184
|
-
|
|
229185
|
-
|
|
229186
|
-
|
|
229187
|
-
|
|
229188
|
-
|
|
229189
|
-
|
|
229190
|
-
|
|
229191
|
-
|
|
229192
|
-
|
|
229193
|
-
}
|
|
229194
|
-
return curLine;
|
|
229169
|
+
for(curLine=0; curLine<nlines; curLine++) {
|
|
229170
|
+
size_t chunk, curOff = 0;
|
|
229171
|
+
strOut[curLine] = out;
|
|
229172
|
+
do {
|
|
229173
|
+
cur = strIn[curLine] + curOff;
|
|
229174
|
+
chunk = lenIn[curLine] - curOff;
|
|
229175
|
+
if (chunk > 511) {
|
|
229176
|
+
chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST
|
|
229177
|
+
}
|
|
229178
|
+
if ((2*chunk+7) > (size_t) (lim-out)) {
|
|
229179
|
+
return curLine; // out of memory
|
|
229180
|
+
}
|
|
229181
|
+
// copy the string to the 511-byte buffer
|
|
229182
|
+
memcpy(buf, cur, chunk);
|
|
229183
|
+
buf[chunk] = (u8) symbolTable.terminator;
|
|
229184
|
+
cur = buf;
|
|
229185
|
+
end = cur + chunk;
|
|
229186
|
+
|
|
229187
|
+
// based on symboltable stats, choose a variant that is nice to the branch predictor
|
|
229188
|
+
if (noSuffixOpt) {
|
|
229189
|
+
compressVariant(true,false);
|
|
229190
|
+
} else if (avoidBranch) {
|
|
229191
|
+
compressVariant(false,true);
|
|
229192
|
+
} else {
|
|
229193
|
+
compressVariant(false, false);
|
|
229194
|
+
}
|
|
229195
|
+
} while((curOff += chunk) < lenIn[curLine]);
|
|
229196
|
+
lenOut[curLine] = (size_t) (out - strOut[curLine]);
|
|
229197
|
+
}
|
|
229198
|
+
return curLine;
|
|
229195
229199
|
}
|
|
229196
229200
|
|
|
229197
229201
|
#define FSST_SAMPLELINE ((size_t) 512)
|
|
229198
229202
|
|
|
229199
229203
|
// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes
|
|
229200
229204
|
vector<u8*> makeSample(u8* sampleBuf, u8* strIn[], size_t **lenRef, size_t nlines) {
|
|
229201
|
-
|
|
229202
|
-
|
|
229205
|
+
size_t totSize = 0, *lenIn = *lenRef;
|
|
229206
|
+
vector<u8*> sample;
|
|
229203
229207
|
|
|
229204
|
-
|
|
229205
|
-
|
|
229208
|
+
for(size_t i=0; i<nlines; i++)
|
|
229209
|
+
totSize += lenIn[i];
|
|
229206
229210
|
|
|
229207
|
-
|
|
229208
|
-
|
|
229209
|
-
|
|
229210
|
-
|
|
229211
|
-
|
|
229212
|
-
|
|
229213
|
-
|
|
229214
|
-
|
|
229215
|
-
|
|
229216
|
-
|
|
229217
|
-
|
|
229218
|
-
|
|
229219
|
-
|
|
229220
|
-
|
|
229221
|
-
|
|
229222
|
-
|
|
229223
|
-
|
|
229224
|
-
|
|
229225
|
-
|
|
229226
|
-
|
|
229227
|
-
|
|
229228
|
-
|
|
229229
|
-
|
|
229230
|
-
|
|
229231
|
-
|
|
229232
|
-
|
|
229233
|
-
|
|
229234
|
-
|
|
229211
|
+
if (totSize < FSST_SAMPLETARGET) {
|
|
229212
|
+
for(size_t i=0; i<nlines; i++)
|
|
229213
|
+
sample.push_back(strIn[i]);
|
|
229214
|
+
} else {
|
|
229215
|
+
size_t sampleRnd = FSST_HASH(4637947);
|
|
229216
|
+
u8* sampleLim = sampleBuf + FSST_SAMPLETARGET;
|
|
229217
|
+
size_t *sampleLen = *lenRef = new size_t[nlines + FSST_SAMPLEMAXSZ/FSST_SAMPLELINE];
|
|
229218
|
+
|
|
229219
|
+
while(sampleBuf < sampleLim) {
|
|
229220
|
+
// choose a non-empty line
|
|
229221
|
+
sampleRnd = FSST_HASH(sampleRnd);
|
|
229222
|
+
size_t linenr = sampleRnd % nlines;
|
|
229223
|
+
while (lenIn[linenr] == 0)
|
|
229224
|
+
if (++linenr == nlines) linenr = 0;
|
|
229225
|
+
|
|
229226
|
+
// choose a chunk
|
|
229227
|
+
size_t chunks = 1 + ((lenIn[linenr]-1) / FSST_SAMPLELINE);
|
|
229228
|
+
sampleRnd = FSST_HASH(sampleRnd);
|
|
229229
|
+
size_t chunk = FSST_SAMPLELINE*(sampleRnd % chunks);
|
|
229230
|
+
|
|
229231
|
+
// add the chunk to the sample
|
|
229232
|
+
size_t len = min(lenIn[linenr]-chunk,FSST_SAMPLELINE);
|
|
229233
|
+
memcpy(sampleBuf, strIn[linenr]+chunk, len);
|
|
229234
|
+
sample.push_back(sampleBuf);
|
|
229235
|
+
sampleBuf += *sampleLen++ = len;
|
|
229236
|
+
}
|
|
229237
|
+
}
|
|
229238
|
+
return sample;
|
|
229235
229239
|
}
|
|
229236
229240
|
|
|
229237
229241
|
/* Build an encoder (symbol table) from n input strings.
 *
 * n              : number of input strings
 * lenIn / strIn  : length and start pointer of each string
 * zeroTerminated : forwarded to buildSymbolTable
 *
 * Returns a heap-allocated encoder; release it with duckdb_fsst_destroy.
 *
 * The temporary sample buffer and the (optional) sampled-lengths array are owned by
 * unique_ptrs, so they are released on every exit path, including when allocation of the
 * Encoder or symbol-table construction throws.
 */
extern "C" duckdb_fsst_encoder_t* duckdb_fsst_create(size_t n, size_t lenIn[], u8 *strIn[], int zeroTerminated) {
   std::unique_ptr<u8[]> sampleBuf(new u8[FSST_SAMPLEMAXSZ]);
   size_t *sampleLen = lenIn;

   // careful handling of input to get a right-size and representative sample
   // NOTE(review): for n == 0 makeSample is still asked to look at one line, so lenIn[0]/strIn[0]
   // are read -- presumably callers always provide at least one readable slot; confirm.
   vector<u8*> sample = makeSample(sampleBuf.get(), strIn, &sampleLen, n?n:1);

   // makeSample only redirects sampleLen when it allocated a new array; own it in that case
   std::unique_ptr<size_t[]> sampleLenOwner(sampleLen != lenIn ? sampleLen : nullptr);

   std::unique_ptr<Encoder> encoder(new Encoder());
   encoder->symbolTable = shared_ptr<SymbolTable>(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated));
   return (duckdb_fsst_encoder_t*) encoder.release();
}
|
|
229247
229251
|
|
|
229248
229252
|
/* create another encoder instance, necessary to do multi-threaded encoding using the same symbol table */
|
|
229249
229253
|
extern "C" duckdb_fsst_encoder_t* duckdb_fsst_duplicate(duckdb_fsst_encoder_t *encoder) {
|
|
229250
|
-
|
|
229251
|
-
|
|
229252
|
-
|
|
229254
|
+
Encoder *e = new Encoder();
|
|
229255
|
+
e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr
|
|
229256
|
+
return (duckdb_fsst_encoder_t*) e;
|
|
229253
229257
|
}
|
|
229254
229258
|
|
|
229255
|
-
// export a symbol table in compact format.
|
|
229259
|
+
// export a symbol table in compact format.
|
|
229256
229260
|
extern "C" u32 duckdb_fsst_export(duckdb_fsst_encoder_t *encoder, u8 *buf) {
|
|
229257
|
-
|
|
229258
|
-
|
|
229259
|
-
|
|
229260
|
-
|
|
229261
|
-
|
|
229262
|
-
|
|
229263
|
-
|
|
229264
|
-
|
|
229265
|
-
|
|
229266
|
-
|
|
229267
|
-
|
|
229268
|
-
|
|
229269
|
-
|
|
229270
|
-
|
|
229271
|
-
|
|
229272
|
-
|
|
229273
|
-
|
|
229274
|
-
|
|
229275
|
-
|
|
229276
|
-
|
|
229261
|
+
Encoder *e = (Encoder*) encoder;
|
|
229262
|
+
// In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there.
|
|
229263
|
+
// This is sufficient in principle to *reconstruct* a duckdb_fsst_encoder_t from a duckdb_fsst_decoder_t
|
|
229264
|
+
// (such functionality could be useful to append compressed data to an existing block).
|
|
229265
|
+
//
|
|
229266
|
+
// However, the hash function in the encoder hash table is endian-sensitive, and given its
|
|
229267
|
+
// 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables.
|
|
229268
|
+
// Doing a endian-conversion during hashing will be slow and self-defeating.
|
|
229269
|
+
//
|
|
229270
|
+
// Overall, we could support reconstructing an encoder for incremental compression, but
|
|
229271
|
+
// should enforce equal-endianness. Bit of a bummer. Not going there now.
|
|
229272
|
+
//
|
|
229273
|
+
// The version field is now there just for future-proofness, but not used yet
|
|
229274
|
+
|
|
229275
|
+
// version allows keeping track of fsst versions, track endianness, and encoder reconstruction
|
|
229276
|
+
u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0
|
|
229277
|
+
(((u64) e->symbolTable->suffixLim) << 24) |
|
|
229278
|
+
(((u64) e->symbolTable->terminator) << 16) |
|
|
229279
|
+
(((u64) e->symbolTable->nSymbols) << 8) |
|
|
229280
|
+
FSST_ENDIAN_MARKER; // least significant byte is nonzero
|
|
229277
229281
|
|
|
229278
|
-
|
|
229279
|
-
|
|
229280
|
-
|
|
229281
|
-
|
|
229282
|
-
|
|
229283
|
-
|
|
229282
|
+
/* do not assume unaligned reads here */
|
|
229283
|
+
memcpy(buf, &version, 8);
|
|
229284
|
+
buf[8] = e->symbolTable->zeroTerminated;
|
|
229285
|
+
for(u32 i=0; i<8; i++)
|
|
229286
|
+
buf[9+i] = (u8) e->symbolTable->lenHisto[i];
|
|
229287
|
+
u32 pos = 17;
|
|
229284
229288
|
|
|
229285
|
-
|
|
229286
|
-
|
|
229287
|
-
|
|
229288
|
-
|
|
229289
|
+
// emit only the used bytes of the symbols
|
|
229290
|
+
for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++)
|
|
229291
|
+
for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++)
|
|
229292
|
+
buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes
|
|
229289
229293
|
|
|
229290
|
-
|
|
229294
|
+
return pos; // length of what was serialized
|
|
229291
229295
|
}
|
|
229292
229296
|
|
|
229293
229297
|
#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */
|
|
229294
229298
|
|
|
229295
229299
|
/* Fill a decoder from a symbol table serialized by duckdb_fsst_export.
 *
 * decoder : decoder to fill
 * buf     : serialized symbol table (version word, zeroTerminated byte, 8-byte length
 *           histogram, then the symbol bytes)
 *
 * Returns the number of bytes consumed from buf, or 0 when the buffer is rejected: either the
 * version does not match, or the length histogram describes more symbols than there are codes.
 * In the latter case the decoder may already be partially written.
 */
extern "C" u32 duckdb_fsst_import(duckdb_fsst_decoder_t *decoder, u8 *buf) {
   u64 version = 0;
   u32 code, pos = 17;
   u8 lenHisto[8];

   // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped)
   memcpy(&version, buf, 8);
   if ((version>>32) != FSST_VERSION) return 0;
   decoder->zeroTerminated = buf[8]&1;
   memcpy(lenHisto, buf+9, 8);

   // in case of zero-terminated, first symbol is "" (zero always, may be overwritten)
   decoder->len[0] = 1;
   decoder->symbol[0] = 0;

   // we use lenHisto[0] as 1-byte symbol run length (at the end)
   code = decoder->zeroTerminated;
   if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end

   // now get all symbols from the buffer
   for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */
      for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) {
         // The histogram comes straight from the buffer. A corrupt one (including the u8
         // wrap-around of the decrement above) must not push 'code' beyond the last symbol
         // code; the fill loop below treats 255 as the end of the code space.
         if (code >= 255) return 0;
         decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */
         decoder->symbol[code] = 0;
         for(u32 j=0; j<decoder->len[code]; j++)
            ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols
      }
   }
   if (decoder->zeroTerminated) lenHisto[0]++; // restores the local copy (not read afterwards)

   // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols).
   while(code<255) {
      decoder->symbol[code] = FSST_CORRUPT;
      decoder->len[code++] = 8;
   }
   return pos;
}
|
|
229332
229336
|
|
|
229333
229337
|
// runtime check for simd
|
|
229334
229338
|
inline size_t _compressImpl(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
|
|
229335
229339
|
#ifndef NONOPT_FSST
|
|
229336
|
-
|
|
229337
|
-
|
|
229340
|
+
if (simd && duckdb_fsst_hasAVX512())
|
|
229341
|
+
return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd);
|
|
229338
229342
|
#endif
|
|
229339
|
-
|
|
229340
|
-
|
|
229343
|
+
(void) simd;
|
|
229344
|
+
return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch);
|
|
229341
229345
|
}
|
|
229342
229346
|
// Non-inline entry point that forwards all arguments unchanged to _compressImpl.
size_t compressImpl(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
   const size_t result = _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd);
   return result;
}
|
|
229345
229349
|
|
|
229346
|
-
// adaptive choosing of scalar compression method based on symbol length histogram
|
|
229350
|
+
// adaptive choosing of scalar compression method based on symbol length histogram
|
|
229347
229351
|
inline size_t _compressAuto(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) {
|
|
229348
|
-
|
|
229349
|
-
|
|
229350
|
-
|
|
229351
|
-
|
|
229352
|
-
|
|
229353
|
-
|
|
229354
|
-
|
|
229355
|
-
|
|
229356
|
-
|
|
229352
|
+
bool avoidBranch = false, noSuffixOpt = false;
|
|
229353
|
+
if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) {
|
|
229354
|
+
noSuffixOpt = true;
|
|
229355
|
+
} else if ((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) &&
|
|
229356
|
+
(e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) &&
|
|
229357
|
+
(e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) {
|
|
229358
|
+
avoidBranch = true;
|
|
229359
|
+
}
|
|
229360
|
+
return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd);
|
|
229357
229361
|
}
|
|
229358
229362
|
// Non-inline entry point that forwards all arguments unchanged to _compressAuto.
size_t compressAuto(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) {
   const size_t result = _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd);
   return result;
}
|
|
229361
229365
|
|
|
229362
229366
|
// the main compression function (everything automatic)
|
|
229363
229367
|
extern "C" size_t duckdb_fsst_compress(duckdb_fsst_encoder_t *encoder, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) {
|
|
229364
|
-
|
|
229365
|
-
|
|
229366
|
-
|
|
229367
|
-
|
|
229368
|
+
// to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB)
|
|
229369
|
+
size_t totLen = accumulate(lenIn, lenIn+nlines, 0);
|
|
229370
|
+
int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15);
|
|
229371
|
+
return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd);
|
|
229368
229372
|
}
|
|
229369
229373
|
|
|
229370
229374
|
/* deallocate encoder */
|
|
229371
229375
|
extern "C" void duckdb_fsst_destroy(duckdb_fsst_encoder_t* encoder) {
|
|
229372
|
-
|
|
229373
|
-
|
|
229376
|
+
Encoder *e = (Encoder*) encoder;
|
|
229377
|
+
delete e;
|
|
229374
229378
|
}
|
|
229375
229379
|
|
|
229376
229380
|
/* very lazy implementation relying on export and import */
|
|
229377
229381
|
extern "C" duckdb_fsst_decoder_t duckdb_fsst_decoder(duckdb_fsst_encoder_t *encoder) {
|
|
229378
|
-
|
|
229379
|
-
|
|
229380
|
-
|
|
229381
|
-
|
|
229382
|
-
|
|
229383
|
-
|
|
229382
|
+
u8 buf[sizeof(duckdb_fsst_decoder_t)];
|
|
229383
|
+
u32 cnt1 = duckdb_fsst_export(encoder, buf);
|
|
229384
|
+
duckdb_fsst_decoder_t decoder;
|
|
229385
|
+
u32 cnt2 = duckdb_fsst_import(&decoder, buf);
|
|
229386
|
+
assert(cnt1 == cnt2); (void) cnt1; (void) cnt2;
|
|
229387
|
+
return decoder;
|
|
229384
229388
|
}
|
|
229385
229389
|
|
|
229386
|
-
|
|
229387
229390
|
// LICENSE_CHANGE_END
|
|
229388
229391
|
|
|
229389
229392
|
|