duckdb 0.6.1-dev240.0 → 0.6.1-dev247.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/duckdb.cpp CHANGED
@@ -228764,626 +228764,629 @@ size_t duckdb_fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* sy
228764
228764
 
228765
228765
 
228766
228766
  Symbol concat(Symbol a, Symbol b) {
228767
- Symbol s;
228768
- u32 length = a.length()+b.length();
228769
- if (length > Symbol::maxLength) length = Symbol::maxLength;
228770
- s.set_code_len(FSST_CODE_MASK, length);
228771
- s.val.num = (b.val.num << (8*a.length())) | a.val.num;
228772
- return s;
228767
+ Symbol s;
228768
+ u32 length = a.length()+b.length();
228769
+ if (length > Symbol::maxLength) length = Symbol::maxLength;
228770
+ s.set_code_len(FSST_CODE_MASK, length);
228771
+ s.val.num = (b.val.num << (8*a.length())) | a.val.num;
228772
+ return s;
228773
228773
  }
228774
228774
 
228775
228775
  namespace std {
228776
228776
  template <>
228777
228777
  class hash<QSymbol> {
228778
- public:
228779
- size_t operator()(const QSymbol& q) const {
228780
- uint64_t k = q.symbol.val.num;
228781
- const uint64_t m = 0xc6a4a7935bd1e995;
228782
- const int r = 47;
228783
- uint64_t h = 0x8445d61a4e774912 ^ (8*m);
228784
- k *= m;
228785
- k ^= k >> r;
228786
- k *= m;
228787
- h ^= k;
228788
- h *= m;
228789
- h ^= h >> r;
228790
- h *= m;
228791
- h ^= h >> r;
228792
- return h;
228793
- }
228778
+ public:
228779
+ size_t operator()(const QSymbol& q) const {
228780
+ uint64_t k = q.symbol.val.num;
228781
+ const uint64_t m = 0xc6a4a7935bd1e995;
228782
+ const int r = 47;
228783
+ uint64_t h = 0x8445d61a4e774912 ^ (8*m);
228784
+ k *= m;
228785
+ k ^= k >> r;
228786
+ k *= m;
228787
+ h ^= k;
228788
+ h *= m;
228789
+ h ^= h >> r;
228790
+ h *= m;
228791
+ h ^= h >> r;
228792
+ return h;
228793
+ }
228794
228794
  };
228795
228795
  }
228796
228796
 
228797
228797
  bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; }
228798
228798
 
228799
228799
  std::ostream& operator<<(std::ostream& out, const Symbol& s) {
228800
- for (u32 i=0; i<s.length(); i++)
228801
- out << s.val.str[i];
228802
- return out;
228800
+ for (u32 i=0; i<s.length(); i++)
228801
+ out << s.val.str[i];
228802
+ return out;
228803
228803
  }
228804
- //static u64 iter = 0;
228804
+ static u64 iter = 0;
228805
228805
 
228806
228806
  SymbolTable *buildSymbolTable(Counters& counters, vector<u8*> line, size_t len[], bool zeroTerminated=false) {
228807
- SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable();
228808
- int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception)
228809
- size_t sampleFrac = 128;
228810
-
228811
- // start by determining the terminator. We use the (lowest) most infrequent byte as terminator
228812
- st->zeroTerminated = zeroTerminated;
228813
- if (zeroTerminated) {
228814
- st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency
228815
- } else {
228816
- u16 byteHisto[256];
228817
- memset(byteHisto, 0, sizeof(byteHisto));
228818
- for(size_t i=0; i<line.size(); i++) {
228819
- u8* cur = line[i];
228820
- u8* end = cur + len[i];
228821
- while(cur < end) byteHisto[*cur++]++;
228822
- }
228823
- u32 minSize = FSST_SAMPLEMAXSZ, i = st->terminator = 256;
228824
- while(i-- > 0) {
228825
- if (byteHisto[i] > minSize) continue;
228826
- st->terminator = i;
228827
- minSize = byteHisto[i];
228828
- }
228829
- }
228830
- assert(st->terminator != 256);
228807
+ SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable();
228808
+ int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception)
228809
+ size_t sampleFrac = 128;
228810
+
228811
+ // start by determining the terminator. We use the (lowest) most infrequent byte as terminator
228812
+ st->zeroTerminated = zeroTerminated;
228813
+ if (zeroTerminated) {
228814
+ st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency
228815
+ } else {
228816
+ u16 byteHisto[256];
228817
+ memset(byteHisto, 0, sizeof(byteHisto));
228818
+ for(size_t i=0; i<line.size(); i++) {
228819
+ u8* cur = line[i];
228820
+ u8* end = cur + len[i];
228821
+ while(cur < end) byteHisto[*cur++]++;
228822
+ }
228823
+ u32 minSize = FSST_SAMPLEMAXSZ, i = st->terminator = 256;
228824
+ while(i-- > 0) {
228825
+ if (byteHisto[i] > minSize) continue;
228826
+ st->terminator = i;
228827
+ minSize = byteHisto[i];
228828
+ }
228829
+ }
228830
+ assert(st->terminator != 256);
228831
+
228832
+ // a random number between 0 and 128
228833
+ auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); };
228834
+
228835
+ // compress sample, and compute (pair-)frequencies
228836
+ auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain
228837
+ int gain = 0;
228838
+
228839
+ for(size_t i=0; i<line.size(); i++) {
228840
+ u8* cur = line[i];
228841
+ u8* end = cur + len[i];
228842
+
228843
+ if (sampleFrac < 128) {
228844
+ // in earlier rounds (sampleFrac < 128) we skip data in the sample (reduces overall work ~2x)
228845
+ if (rnd128(i) > sampleFrac) continue;
228846
+ }
228847
+ if (cur < end) {
228848
+ u8* start = cur;
228849
+ u16 code2 = 255, code1 = st->findLongestSymbol(cur, end);
228850
+ cur += st->symbols[code1].length();
228851
+ gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1)));
228852
+ while (true) {
228853
+ // count single symbol (i.e. an option is not extending it)
228854
+ counters.count1Inc(code1);
228831
228855
 
228832
- // a random number between 0 and 128
228833
- auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); };
228856
+ // as an alternative, consider just using the next byte..
228857
+ if (st->symbols[code1].length() != 1) // .. but do not count single byte symbols doubly
228858
+ counters.count1Inc(*start);
228834
228859
 
228835
- // compress sample, and compute (pair-)frequencies
228836
- auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain
228837
- int gain = 0;
228860
+ if (cur==end) {
228861
+ break;
228862
+ }
228838
228863
 
228839
- for(size_t i=0; i<line.size(); i++) {
228840
- u8* cur = line[i];
228841
- u8* end = cur + len[i];
228864
+ // now match a new symbol
228865
+ start = cur;
228866
+ if (cur<end-7) {
228867
+ u64 word = fsst_unaligned_load(cur);
228868
+ size_t code = word & 0xFFFFFF;
228869
+ size_t idx = FSST_HASH(code)&(st->hashTabSize-1);
228870
+ Symbol s = st->hashTab[idx];
228871
+ code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK;
228872
+ word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
228873
+ if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) {
228874
+ code2 = s.code();
228875
+ cur += s.length();
228876
+ } else if (code2 >= FSST_CODE_BASE) {
228877
+ cur += 2;
228878
+ } else {
228879
+ code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK;
228880
+ cur += 1;
228881
+ }
228882
+ } else {
228883
+ code2 = st->findLongestSymbol(cur, end);
228884
+ cur += st->symbols[code2].length();
228885
+ }
228842
228886
 
228843
- if (sampleFrac < 128) {
228844
- // in earlier rounds (sampleFrac < 128) we skip data in the sample (reduces overall work ~2x)
228845
- if (rnd128(i) > sampleFrac) continue;
228846
- }
228847
- if (cur < end) {
228848
- u16 pos2 = 255, pos1 = st->findLongestSymbol(cur, end);
228849
- cur += st->symbols[pos1].length();
228850
- gain += (int) (st->symbols[pos1].length()-(1+isEscapeCode(pos1)));
228851
- while (true) {
228852
- u8* old = cur;
228853
- counters.count1Inc(pos1);
228854
- // count single symbol (i.e. an option is not extending it)
228855
- if (cur>=end)
228856
- break;
228857
- if (st->symbols[pos1].length() != 1)
228858
- counters.count1Inc(*cur);
228859
- if (cur<end-7) {
228860
- u64 word = fsst_unaligned_load(cur);
228861
- size_t pos = word & 0xFFFFFF;
228862
- size_t idx = FSST_HASH(pos)&(st->hashTabSize-1);
228863
- Symbol s = st->hashTab[idx];
228864
- pos2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK;
228865
- word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
228866
- if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) {
228867
- pos2 = s.code();
228868
- cur += s.length();
228869
- } else if (pos2 >= FSST_CODE_BASE) {
228870
- cur += 2;
228871
- } else {
228872
- pos2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK;
228873
- cur += 1;
228874
- }
228875
- } else {
228876
- assert(cur<end);
228877
- pos2 = st->findLongestSymbol(cur, end);
228878
- cur += st->symbols[pos2].length();
228879
- }
228880
-
228881
- // compute compressed output size
228882
- gain += ((int) (cur-old))-(1+isEscapeCode(pos2));
228883
-
228884
- // now count the subsequent two symbols we encode as an extension possibility
228885
- if (sampleFrac < 128) { // no need to count pairs in final round
228886
- counters.count2Inc(pos1, pos2);
228887
- if ((cur-old) > 1) // do not count escaped bytes doubly
228888
- counters.count2Inc(pos1, *old);
228889
- }
228890
- pos1 = pos2;
228891
- }
228892
- }
228893
- }
228894
- return gain;
228895
- };
228887
+ // compute compressed output size
228888
+ gain += ((int) (cur-start))-(1+isEscapeCode(code2));
228896
228889
 
228897
- auto makeTable = [&](SymbolTable *st, Counters &counters) {
228898
- // hashmap of c (needed because we can generate duplicate candidates)
228899
- unordered_set<QSymbol> cands;
228900
-
228901
- // artificially make terminater the most frequent symbol so it gets included
228902
- u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator;
228903
- counters.count1Set(terminator,65535);
228904
-
228905
- auto addOrInc = [&](unordered_set<QSymbol> &cands, Symbol s, u64 count) {
228906
- if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!!
228907
- QSymbol q;
228908
- q.symbol = s;
228909
- q.gain = count * s.length();
228910
- auto it = cands.find(q);
228911
- if (it != cands.end()) {
228912
- q.gain += (*it).gain;
228913
- cands.erase(*it);
228914
- }
228915
- cands.insert(q);
228916
- };
228890
+ // now count the subsequent two symbols we encode as an extension codesibility
228891
+ if (sampleFrac < 128) { // no need to count pairs in final round
228892
+ // consider the symbol that is the concatenation of the two last symbols
228893
+ counters.count2Inc(code1, code2);
228917
228894
 
228918
- // add candidate symbols based on counted frequency
228919
- for (u32 pos1=0; pos1<FSST_CODE_BASE+(size_t) st->nSymbols; pos1++) {
228920
- u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!!
228921
- if (!cnt1) continue;
228895
+ // as an alternative, consider just extending with the next byte..
228896
+ if ((cur-start) > 1) // ..but do not count single byte extensions doubly
228897
+ counters.count2Inc(code1, *start);
228898
+ }
228899
+ code1 = code2;
228900
+ }
228901
+ }
228902
+ }
228903
+ return gain;
228904
+ };
228922
228905
 
228923
- // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed
228924
- Symbol s1 = st->symbols[pos1];
228925
- addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1);
228906
+ auto makeTable = [&](SymbolTable *st, Counters &counters) {
228907
+ // hashmap of c (needed because we can generate duplicate candidates)
228908
+ unordered_set<QSymbol> cands;
228909
+
228910
+ // artificially make terminater the most frequent symbol so it gets included
228911
+ u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator;
228912
+ counters.count1Set(terminator,65535);
228913
+
228914
+ auto addOrInc = [&](unordered_set<QSymbol> &cands, Symbol s, u64 count) {
228915
+ if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!!
228916
+ QSymbol q;
228917
+ q.symbol = s;
228918
+ q.gain = count * s.length();
228919
+ auto it = cands.find(q);
228920
+ if (it != cands.end()) {
228921
+ q.gain += (*it).gain;
228922
+ cands.erase(*it);
228923
+ }
228924
+ cands.insert(q);
228925
+ };
228926
228926
 
228927
- if (sampleFrac >= 128 || // last round we do not create new (combined) symbols
228928
- s1.length() == Symbol::maxLength || // symbol cannot be extended
228929
- s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte
228930
- continue;
228931
- }
228932
- for (u32 pos2=0; pos2<FSST_CODE_BASE+(size_t)st->nSymbols; pos2++) {
228933
- u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!!
228934
- if (!cnt2) continue;
228935
-
228936
- // create a new symbol
228937
- Symbol s2 = st->symbols[pos2];
228938
- Symbol s3 = concat(s1, s2);
228939
- if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte
228940
- addOrInc(cands, s3, cnt2);
228941
- }
228942
- }
228927
+ // add candidate symbols based on counted frequency
228928
+ for (u32 pos1=0; pos1<FSST_CODE_BASE+(size_t) st->nSymbols; pos1++) {
228929
+ u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!!
228930
+ if (!cnt1) continue;
228943
228931
 
228944
- // insert candidates into priority queue (by gain)
228945
- auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.val.num > q2.symbol.val.num); };
228946
- priority_queue<QSymbol,vector<QSymbol>,decltype(cmpGn)> pq(cmpGn);
228947
- for (auto& q : cands)
228948
- pq.push(q);
228949
-
228950
- // Create new symbol map using best candidates
228951
- st->clear();
228952
- while (st->nSymbols < 255 && !pq.empty()) {
228953
- QSymbol q = pq.top();
228954
- pq.pop();
228955
- st->add(q.symbol);
228956
- }
228957
- };
228932
+ // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed
228933
+ Symbol s1 = st->symbols[pos1];
228934
+ addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1);
228958
228935
 
228959
- u8 bestCounters[512*sizeof(u16)];
228936
+ if (sampleFrac >= 128 || // last round we do not create new (combined) symbols
228937
+ s1.length() == Symbol::maxLength || // symbol cannot be extended
228938
+ s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte
228939
+ continue;
228940
+ }
228941
+ for (u32 pos2=0; pos2<FSST_CODE_BASE+(size_t)st->nSymbols; pos2++) {
228942
+ u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!!
228943
+ if (!cnt2) continue;
228944
+
228945
+ // create a new symbol
228946
+ Symbol s2 = st->symbols[pos2];
228947
+ Symbol s3 = concat(s1, s2);
228948
+ if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte
228949
+ addOrInc(cands, s3, cnt2);
228950
+ }
228951
+ }
228952
+
228953
+ // insert candidates into priority queue (by gain)
228954
+ auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.val.num > q2.symbol.val.num); };
228955
+ priority_queue<QSymbol,vector<QSymbol>,decltype(cmpGn)> pq(cmpGn);
228956
+ for (auto& q : cands)
228957
+ pq.push(q);
228958
+
228959
+ // Create new symbol map using best candidates
228960
+ st->clear();
228961
+ while (st->nSymbols < 255 && !pq.empty()) {
228962
+ QSymbol q = pq.top();
228963
+ pq.pop();
228964
+ st->add(q.symbol);
228965
+ }
228966
+ };
228967
+
228968
+ u8 bestCounters[512*sizeof(u16)];
228960
228969
  #ifdef NONOPT_FSST
228961
- for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) {
228962
- sampleFrac = frac;
228970
+ for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) {
228971
+ sampleFrac = frac;
228963
228972
  #else
228964
- for(sampleFrac=8; true; sampleFrac += 30) {
228973
+ for(sampleFrac=8; true; sampleFrac += 30) {
228965
228974
  #endif
228966
- memset(&counters, 0, sizeof(Counters));
228967
- long gain = compressCount(st, counters);
228968
- if (gain >= bestGain) { // a new best solution!
228969
- counters.backup1(bestCounters);
228970
- *bestTable = *st; bestGain = gain;
228971
- }
228972
- if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128)
228973
- makeTable(st, counters);
228974
- }
228975
- delete st;
228976
- counters.restore1(bestCounters);
228977
- makeTable(bestTable, counters);
228978
- bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression
228979
- return bestTable;
228975
+ memset(&counters, 0, sizeof(Counters));
228976
+ long gain = compressCount(st, counters);
228977
+ if (gain >= bestGain) { // a new best solution!
228978
+ counters.backup1(bestCounters);
228979
+ *bestTable = *st; bestGain = gain;
228980
+ }
228981
+ if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128)
228982
+ makeTable(st, counters);
228983
+ }
228984
+ delete st;
228985
+ counters.restore1(bestCounters);
228986
+ makeTable(bestTable, counters);
228987
+ bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression
228988
+ return bestTable;
228980
228989
  }
228981
228990
 
228982
228991
  static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, size_t len[], u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) {
228983
- size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size;
228984
- u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings
228985
- SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer
228986
- SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this)
228987
- size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs)
228988
-
228989
- while (curLine < nlines && outOff <= (1<<19)) {
228990
- size_t prevLine = curLine, chunk, curOff = 0;
228991
-
228992
- // bail out if the output buffer cannot hold the compressed next string fully
228993
- if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7
228994
- else budget -= (len[curLine]-curOff)*2;
228995
-
228996
- strOut[curLine] = (u8*) 0;
228997
- lenOut[curLine] = 0;
228992
+ size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size;
228993
+ u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings
228994
+ SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer
228995
+ SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this)
228996
+ size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs)
228998
228997
 
228999
- do {
229000
- do {
229001
- chunk = len[curLine] - curOff;
229002
- if (chunk > 511) {
229003
- chunk = 511; // large strings need to be chopped up into segments of 511 bytes
229004
- }
229005
- // create a job in this batch
229006
- SIMDjob job;
229007
- job.cur = inOff;
229008
- job.end = job.cur + chunk;
229009
- job.pos = batchPos;
229010
- job.out = outOff;
229011
-
229012
- // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros)
229013
- outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) bytes.
229014
- if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk
229015
-
229016
- // register job in this batch
229017
- input[batchPos] = job;
229018
- jobLine[batchPos] = curLine;
229019
-
229020
- if (chunk == 0) {
229021
- empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out
229022
- } else {
229023
- // copy string chunk into temp buffer
229024
- memcpy(symbolBase + inOff, line[curLine] + curOff, chunk);
229025
- inOff += chunk;
229026
- curOff += chunk;
229027
- symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded
229028
- }
229029
- if (++batchPos == 512) break;
229030
- } while(curOff < len[curLine]);
229031
-
229032
- if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines)) { // cannot accumulate more?
229033
- if (batchPos-empty >= 32) { // if we have enough work, fire off duckdb_fsst_compressAVX512 (32 is due to max 4x8 unrolling)
229034
- // radix-sort jobs on length (longest string first)
229035
- // -- this provides best load balancing and allows to skip empty jobs at the end
229036
- u16 sortpos[513];
229037
- memset(sortpos, 0, sizeof(sortpos));
229038
-
229039
- // calculate length histo
229040
- for(size_t i=0; i<batchPos; i++) {
229041
- size_t len = input[i].end - input[i].cur;
229042
- sortpos[512UL - len]++;
229043
- }
229044
- // calculate running sum
229045
- for(size_t i=1; i<=512; i++)
229046
- sortpos[i] += sortpos[i-1];
229047
-
229048
- // move jobs to their final destination
229049
- SIMDjob inputOrdered[512];
229050
- for(size_t i=0; i<batchPos; i++) {
229051
- size_t len = input[i].end - input[i].cur;
229052
- size_t pos = sortpos[511UL - len]++;
229053
- inputOrdered[pos] = input[i];
229054
- }
229055
- // finally.. SIMD compress max 256KB of simdbuf into (max) 512KB of simdbuf (but presumably much less..)
229056
- for(size_t done = duckdb_fsst_compressAVX512(symbolTable, codeBase, symbolBase, inputOrdered, output, batchPos-empty, unroll);
229057
- done < batchPos; done++) output[done] = inputOrdered[done];
229058
- } else {
229059
- memcpy(output, input, batchPos*sizeof(SIMDjob));
229060
- }
229061
-
229062
- // finish encoding (unfinished strings in process, plus the few last strings not yet processed)
229063
- for(size_t i=0; i<batchPos; i++) {
229064
- SIMDjob job = output[i];
229065
- if (job.cur < job.end) { // finish encoding this string with scalar code
229066
- u8* cur = symbolBase + job.cur;
229067
- u8* end = symbolBase + job.end;
229068
- u8* out = codeBase + job.out;
229069
- while (cur < end) {
229070
- u64 word = fsst_unaligned_load(cur);
229071
- size_t code = symbolTable.shortCodes[word & 0xFFFF];
229072
- size_t pos = word & 0xFFFFFF;
229073
- size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-1);
229074
- Symbol s = symbolTable.hashTab[idx];
229075
- out[1] = (u8) word; // speculatively write out escaped byte
229076
- word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
229077
- if ((s.icl < FSST_ICL_FREE) && s.val.num == word) {
229078
- *out++ = (u8) s.code(); cur += s.length();
229079
- } else {
229080
- // could be a 2-byte or 1-byte code, or miss
229081
- // handle everything with predication
229082
- *out = (u8) code;
229083
- out += 1+((code&FSST_CODE_BASE)>>8);
229084
- cur += (code>>FSST_LEN_BITS);
229085
- }
229086
- }
229087
- job.out = out - codeBase;
229088
- }
229089
- // postprocess job info
229090
- job.cur = 0;
229091
- job.end = job.out - input[job.pos].out; // misuse .end field as compressed size
229092
- job.out = input[job.pos].out; // reset offset to start of encoded string
229093
- input[job.pos] = job;
229094
- }
229095
-
229096
- // copy out the result data
229097
- for(size_t i=0; i<batchPos; i++) {
229098
- size_t lineNr = jobLine[i]; // the sort must be order-preserving, as we concatenate results string in order
229099
- size_t sz = input[i].end; // had stored compressed lengths here
229100
- if (!strOut[lineNr]) strOut[lineNr] = dst; // first segment will be the strOut pointer
229101
- lenOut[lineNr] += sz; // add segment (lenOut starts at 0 for this reason)
229102
- memcpy(dst, codeBase+input[i].out, sz);
229103
- dst += sz;
229104
- }
229105
-
229106
- // go for the next batch of 512 chunks
229107
- inOff = outOff = batchPos = empty = 0;
229108
- budget = (size_t) (lim - dst);
229109
- }
229110
- } while (curLine == prevLine && outOff <= (1<<19));
229111
- }
229112
- return curLine;
228998
+ while (curLine < nlines && outOff <= (1<<19)) {
228999
+ size_t prevLine = curLine, chunk, curOff = 0;
229000
+
229001
+ // bail out if the output buffer cannot hold the compressed next string fully
229002
+ if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7
229003
+ else budget -= (len[curLine]-curOff)*2;
229004
+
229005
+ strOut[curLine] = (u8*) 0;
229006
+ lenOut[curLine] = 0;
229007
+
229008
+ do {
229009
+ do {
229010
+ chunk = len[curLine] - curOff;
229011
+ if (chunk > 511) {
229012
+ chunk = 511; // large strings need to be chopped up into segments of 511 bytes
229013
+ }
229014
+ // create a job in this batch
229015
+ SIMDjob job;
229016
+ job.cur = inOff;
229017
+ job.end = job.cur + chunk;
229018
+ job.pos = batchPos;
229019
+ job.out = outOff;
229020
+
229021
+ // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros)
229022
+ outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) bytes.
229023
+ if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk
229024
+
229025
+ // register job in this batch
229026
+ input[batchPos] = job;
229027
+ jobLine[batchPos] = curLine;
229028
+
229029
+ if (chunk == 0) {
229030
+ empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out
229031
+ } else {
229032
+ // copy string chunk into temp buffer
229033
+ memcpy(symbolBase + inOff, line[curLine] + curOff, chunk);
229034
+ inOff += chunk;
229035
+ curOff += chunk;
229036
+ symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded
229037
+ }
229038
+ if (++batchPos == 512) break;
229039
+ } while(curOff < len[curLine]);
229040
+
229041
+ if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines)) { // cannot accumulate more?
229042
+ if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling)
229043
+ // radix-sort jobs on length (longest string first)
229044
+ // -- this provides best load balancing and allows to skip empty jobs at the end
229045
+ u16 sortpos[513];
229046
+ memset(sortpos, 0, sizeof(sortpos));
229047
+
229048
+ // calculate length histo
229049
+ for(size_t i=0; i<batchPos; i++) {
229050
+ size_t len = input[i].end - input[i].cur;
229051
+ sortpos[512UL - len]++;
229052
+ }
229053
+ // calculate running sum
229054
+ for(size_t i=1; i<=512; i++)
229055
+ sortpos[i] += sortpos[i-1];
229056
+
229057
+ // move jobs to their final destination
229058
+ SIMDjob inputOrdered[512];
229059
+ for(size_t i=0; i<batchPos; i++) {
229060
+ size_t len = input[i].end - input[i].cur;
229061
+ size_t pos = sortpos[511UL - len]++;
229062
+ inputOrdered[pos] = input[i];
229063
+ }
229064
+ // finally.. SIMD compress max 256KB of simdbuf into (max) 512KB of simdbuf (but presumably much less..)
229065
+ for(size_t done = duckdb_fsst_compressAVX512(symbolTable, codeBase, symbolBase, inputOrdered, output, batchPos-empty, unroll);
229066
+ done < batchPos; done++) output[done] = inputOrdered[done];
229067
+ } else {
229068
+ memcpy(output, input, batchPos*sizeof(SIMDjob));
229069
+ }
229070
+
229071
+ // finish encoding (unfinished strings in process, plus the few last strings not yet processed)
229072
+ for(size_t i=0; i<batchPos; i++) {
229073
+ SIMDjob job = output[i];
229074
+ if (job.cur < job.end) { // finish encoding this string with scalar code
229075
+ u8* cur = symbolBase + job.cur;
229076
+ u8* end = symbolBase + job.end;
229077
+ u8* out = codeBase + job.out;
229078
+ while (cur < end) {
229079
+ u64 word = fsst_unaligned_load(cur);
229080
+ size_t code = symbolTable.shortCodes[word & 0xFFFF];
229081
+ size_t pos = word & 0xFFFFFF;
229082
+ size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-1);
229083
+ Symbol s = symbolTable.hashTab[idx];
229084
+ out[1] = (u8) word; // speculatively write out escaped byte
229085
+ word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
229086
+ if ((s.icl < FSST_ICL_FREE) && s.val.num == word) {
229087
+ *out++ = (u8) s.code(); cur += s.length();
229088
+ } else {
229089
+ // could be a 2-byte or 1-byte code, or miss
229090
+ // handle everything with predication
229091
+ *out = (u8) code;
229092
+ out += 1+((code&FSST_CODE_BASE)>>8);
229093
+ cur += (code>>FSST_LEN_BITS);
229094
+ }
229095
+ }
229096
+ job.out = out - codeBase;
229097
+ }
229098
+ // postprocess job info
229099
+ job.cur = 0;
229100
+ job.end = job.out - input[job.pos].out; // misuse .end field as compressed size
229101
+ job.out = input[job.pos].out; // reset offset to start of encoded string
229102
+ input[job.pos] = job;
229103
+ }
229104
+
229105
+ // copy out the result data
229106
+ for(size_t i=0; i<batchPos; i++) {
229107
+ size_t lineNr = jobLine[i]; // the sort must be order-preserving, as we concatenate results string in order
229108
+ size_t sz = input[i].end; // had stored compressed lengths here
229109
+ if (!strOut[lineNr]) strOut[lineNr] = dst; // first segment will be the strOut pointer
229110
+ lenOut[lineNr] += sz; // add segment (lenOut starts at 0 for this reason)
229111
+ memcpy(dst, codeBase+input[i].out, sz);
229112
+ dst += sz;
229113
+ }
229114
+
229115
+ // go for the next batch of 512 chunks
229116
+ inOff = outOff = batchPos = empty = 0;
229117
+ budget = (size_t) (lim - dst);
229118
+ }
229119
+ } while (curLine == prevLine && outOff <= (1<<19));
229120
+ }
229121
+ return curLine;
229113
229122
  }
229114
229123
 
229115
229124
 
229116
229125
  // optimized adaptive *scalar* compression method
229117
229126
  static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, size_t lenIn[], u8* strIn[], size_t size, u8* out, size_t lenOut[], u8* strOut[], bool noSuffixOpt, bool avoidBranch) {
229118
- // TODO: PR this fix into main fsst REPO?
229119
- // - the issue is that for strings over the 512 buf size, the unaligned load will read past the end of the buf
229120
- // due to the unaligned_load loading 64 bits, simply increasing the buffer size should be ok since the read word
229121
- // is masked with 0xFFFF anyway
229122
- u8 buf[512 + 8];
229123
-
229124
- u8 *cur = NULL, *end = NULL, *lim = out + size;
229125
- size_t curLine, suffixLim = symbolTable.suffixLim;
229126
- u8 byteLim = symbolTable.nSymbols + symbolTable.zeroTerminated - symbolTable.lenHisto[0];
229127
-
229128
- // three variants are possible. dead code falls away since the bool arguments are constants
229129
- auto compressVariant = [&](bool noSuffixOpt, bool avoidBranch) {
229130
- while (cur < end) {
229131
- u64 word = fsst_unaligned_load(cur);
229132
- size_t code = symbolTable.shortCodes[word & 0xFFFF];
229133
- if (noSuffixOpt && ((u8) code) < suffixLim) {
229134
- // 2 byte code without having to worry about longer matches
229135
- *out++ = (u8) code; cur += 2;
229136
- } else {
229137
- size_t pos = word & 0xFFFFFF;
229138
- size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-1);
229139
- Symbol s = symbolTable.hashTab[idx];
229140
- out[1] = (u8) word; // speculatively write out escaped byte
229141
- word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
229142
- if ((s.icl < FSST_ICL_FREE) && s.val.num == word) {
229143
- *out++ = (u8) s.code(); cur += s.length();
229144
- } else if (avoidBranch) {
229145
- // could be a 2-byte or 1-byte code, or miss
229146
- // handle everything with predication
229147
- *out = (u8) code;
229148
- out += 1+((code&FSST_CODE_BASE)>>8);
229149
- cur += (code>>FSST_LEN_BITS);
229150
- } else if ((u8) code < byteLim) {
229151
- // 2 byte code after checking there is no longer pattern
229152
- *out++ = (u8) code; cur += 2;
229153
- } else {
229154
- // 1 byte code or miss.
229155
- *out = (u8) code;
229156
- out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse
229157
- cur++;
229158
- }
229159
- }
229160
- }
229161
- };
229127
+ u8 *cur = NULL, *end = NULL, *lim = out + size;
229128
+ size_t curLine, suffixLim = symbolTable.suffixLim;
229129
+ u8 byteLim = symbolTable.nSymbols + symbolTable.zeroTerminated - symbolTable.lenHisto[0];
229130
+
229131
+ u8 buf[512+7]; /* +7 sentinel is to avoid 8-byte unaligned-loads going beyond 511 out-of-bounds */
229132
+ memset(buf+511, 0, 8); /* and initialize the sentinal bytes */
229133
+
229134
+ // three variants are possible. dead code falls away since the bool arguments are constants
229135
+ auto compressVariant = [&](bool noSuffixOpt, bool avoidBranch) {
229136
+ while (cur < end) {
229137
+ u64 word = fsst_unaligned_load(cur);
229138
+ size_t code = symbolTable.shortCodes[word & 0xFFFF];
229139
+ if (noSuffixOpt && ((u8) code) < suffixLim) {
229140
+ // 2 byte code without having to worry about longer matches
229141
+ *out++ = (u8) code; cur += 2;
229142
+ } else {
229143
+ size_t pos = word & 0xFFFFFF;
229144
+ size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-1);
229145
+ Symbol s = symbolTable.hashTab[idx];
229146
+ out[1] = (u8) word; // speculatively write out escaped byte
229147
+ word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
229148
+ if ((s.icl < FSST_ICL_FREE) && s.val.num == word) {
229149
+ *out++ = (u8) s.code(); cur += s.length();
229150
+ } else if (avoidBranch) {
229151
+ // could be a 2-byte or 1-byte code, or miss
229152
+ // handle everything with predication
229153
+ *out = (u8) code;
229154
+ out += 1+((code&FSST_CODE_BASE)>>8);
229155
+ cur += (code>>FSST_LEN_BITS);
229156
+ } else if ((u8) code < byteLim) {
229157
+ // 2 byte code after checking there is no longer pattern
229158
+ *out++ = (u8) code; cur += 2;
229159
+ } else {
229160
+ // 1 byte code or miss.
229161
+ *out = (u8) code;
229162
+ out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse
229163
+ cur++;
229164
+ }
229165
+ }
229166
+ }
229167
+ };
229162
229168
 
229163
- for(curLine=0; curLine<nlines; curLine++) {
229164
- size_t chunk, curOff = 0;
229165
- strOut[curLine] = out;
229166
- do {
229167
- bool skipCopy = symbolTable.zeroTerminated;
229168
- cur = strIn[curLine] + curOff;
229169
- chunk = lenIn[curLine] - curOff;
229170
- if (chunk > 511) {
229171
- chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST
229172
- skipCopy = false; // need to put terminator, so no in place mem usage possible
229173
- }
229174
- if ((2*chunk+7) > (size_t) (lim-out)) {
229175
- return curLine; // out of memory
229176
- }
229177
- if (!skipCopy) { // only in case of short zero-terminated strings, we can avoid copying
229178
- memcpy(buf, cur, chunk);
229179
- cur = buf;
229180
- buf[chunk] = (u8) symbolTable.terminator;
229181
- }
229182
- end = cur + chunk;
229183
- // based on symboltable stats, choose a variant that is nice to the branch predictor
229184
- if (noSuffixOpt) {
229185
- compressVariant(true,false);
229186
- } else if (avoidBranch) {
229187
- compressVariant(false,true);
229188
- } else {
229189
- compressVariant(false, false);
229190
- }
229191
- } while((curOff += chunk) < lenIn[curLine]);
229192
- lenOut[curLine] = (size_t) (out - strOut[curLine]);
229193
- }
229194
- return curLine;
229169
+ for(curLine=0; curLine<nlines; curLine++) {
229170
+ size_t chunk, curOff = 0;
229171
+ strOut[curLine] = out;
229172
+ do {
229173
+ cur = strIn[curLine] + curOff;
229174
+ chunk = lenIn[curLine] - curOff;
229175
+ if (chunk > 511) {
229176
+ chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST
229177
+ }
229178
+ if ((2*chunk+7) > (size_t) (lim-out)) {
229179
+ return curLine; // out of memory
229180
+ }
229181
+ // copy the string to the 511-byte buffer
229182
+ memcpy(buf, cur, chunk);
229183
+ buf[chunk] = (u8) symbolTable.terminator;
229184
+ cur = buf;
229185
+ end = cur + chunk;
229186
+
229187
+ // based on symboltable stats, choose a variant that is nice to the branch predictor
229188
+ if (noSuffixOpt) {
229189
+ compressVariant(true,false);
229190
+ } else if (avoidBranch) {
229191
+ compressVariant(false,true);
229192
+ } else {
229193
+ compressVariant(false, false);
229194
+ }
229195
+ } while((curOff += chunk) < lenIn[curLine]);
229196
+ lenOut[curLine] = (size_t) (out - strOut[curLine]);
229197
+ }
229198
+ return curLine;
229195
229199
  }
229196
229200
 
229197
229201
  #define FSST_SAMPLELINE ((size_t) 512)
229198
229202
 
229199
229203
  // quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes
229200
229204
  vector<u8*> makeSample(u8* sampleBuf, u8* strIn[], size_t **lenRef, size_t nlines) {
229201
- size_t totSize = 0, *lenIn = *lenRef;
229202
- vector<u8*> sample;
229205
+ size_t totSize = 0, *lenIn = *lenRef;
229206
+ vector<u8*> sample;
229203
229207
 
229204
- for(size_t i=0; i<nlines; i++)
229205
- totSize += lenIn[i];
229208
+ for(size_t i=0; i<nlines; i++)
229209
+ totSize += lenIn[i];
229206
229210
 
229207
- if (totSize < FSST_SAMPLETARGET) {
229208
- for(size_t i=0; i<nlines; i++)
229209
- sample.push_back(strIn[i]);
229210
- } else {
229211
- size_t sampleRnd = FSST_HASH(4637947);
229212
- u8* sampleLim = sampleBuf + FSST_SAMPLETARGET;
229213
- size_t *sampleLen = *lenRef = new size_t[nlines + FSST_SAMPLEMAXSZ/FSST_SAMPLELINE];
229214
-
229215
- while(sampleBuf < sampleLim) {
229216
- // choose a non-empty line
229217
- sampleRnd = FSST_HASH(sampleRnd);
229218
- size_t linenr = sampleRnd % nlines;
229219
- while (lenIn[linenr] == 0)
229220
- if (++linenr == nlines) linenr = 0;
229221
-
229222
- // choose a chunk
229223
- size_t chunks = 1 + ((lenIn[linenr]-1) / FSST_SAMPLELINE);
229224
- sampleRnd = FSST_HASH(sampleRnd);
229225
- size_t chunk = FSST_SAMPLELINE*(sampleRnd % chunks);
229226
-
229227
- // add the chunk to the sample
229228
- size_t len = min(lenIn[linenr]-chunk,FSST_SAMPLELINE);
229229
- memcpy(sampleBuf, strIn[linenr]+chunk, len);
229230
- sample.push_back(sampleBuf);
229231
- sampleBuf += *sampleLen++ = len;
229232
- }
229233
- }
229234
- return sample;
229211
+ if (totSize < FSST_SAMPLETARGET) {
229212
+ for(size_t i=0; i<nlines; i++)
229213
+ sample.push_back(strIn[i]);
229214
+ } else {
229215
+ size_t sampleRnd = FSST_HASH(4637947);
229216
+ u8* sampleLim = sampleBuf + FSST_SAMPLETARGET;
229217
+ size_t *sampleLen = *lenRef = new size_t[nlines + FSST_SAMPLEMAXSZ/FSST_SAMPLELINE];
229218
+
229219
+ while(sampleBuf < sampleLim) {
229220
+ // choose a non-empty line
229221
+ sampleRnd = FSST_HASH(sampleRnd);
229222
+ size_t linenr = sampleRnd % nlines;
229223
+ while (lenIn[linenr] == 0)
229224
+ if (++linenr == nlines) linenr = 0;
229225
+
229226
+ // choose a chunk
229227
+ size_t chunks = 1 + ((lenIn[linenr]-1) / FSST_SAMPLELINE);
229228
+ sampleRnd = FSST_HASH(sampleRnd);
229229
+ size_t chunk = FSST_SAMPLELINE*(sampleRnd % chunks);
229230
+
229231
+ // add the chunk to the sample
229232
+ size_t len = min(lenIn[linenr]-chunk,FSST_SAMPLELINE);
229233
+ memcpy(sampleBuf, strIn[linenr]+chunk, len);
229234
+ sample.push_back(sampleBuf);
229235
+ sampleBuf += *sampleLen++ = len;
229236
+ }
229237
+ }
229238
+ return sample;
229235
229239
  }
229236
229240
 
229237
229241
  extern "C" duckdb_fsst_encoder_t* duckdb_fsst_create(size_t n, size_t lenIn[], u8 *strIn[], int zeroTerminated) {
229238
- u8* sampleBuf = new u8[FSST_SAMPLEMAXSZ];
229239
- size_t *sampleLen = lenIn;
229240
- vector<u8*> sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample
229241
- Encoder *encoder = new Encoder();
229242
- encoder->symbolTable = shared_ptr<SymbolTable>(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated));
229243
- if (sampleLen != lenIn) delete[] sampleLen;
229244
- delete[] sampleBuf;
229245
- return (duckdb_fsst_encoder_t*) encoder;
229242
+ u8* sampleBuf = new u8[FSST_SAMPLEMAXSZ];
229243
+ size_t *sampleLen = lenIn;
229244
+ vector<u8*> sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample
229245
+ Encoder *encoder = new Encoder();
229246
+ encoder->symbolTable = shared_ptr<SymbolTable>(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated));
229247
+ if (sampleLen != lenIn) delete[] sampleLen;
229248
+ delete[] sampleBuf;
229249
+ return (duckdb_fsst_encoder_t*) encoder;
229246
229250
  }
229247
229251
 
229248
229252
  /* create another encoder instance, necessary to do multi-threaded encoding using the same symbol table */
229249
229253
  extern "C" duckdb_fsst_encoder_t* duckdb_fsst_duplicate(duckdb_fsst_encoder_t *encoder) {
229250
- Encoder *e = new Encoder();
229251
- e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr
229252
- return (duckdb_fsst_encoder_t*) e;
229254
+ Encoder *e = new Encoder();
229255
+ e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr
229256
+ return (duckdb_fsst_encoder_t*) e;
229253
229257
  }
229254
229258
 
229255
- // export a symbol table in compact format.
229259
+ // export a symbol table in compact format.
229256
229260
  extern "C" u32 duckdb_fsst_export(duckdb_fsst_encoder_t *encoder, u8 *buf) {
229257
- Encoder *e = (Encoder*) encoder;
229258
- // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there.
229259
- // This is sufficient in principle to *reconstruct* a duckdb_fsst_encoder_t from a duckdb_fsst_decoder_t
229260
- // (such functionality could be useful to append compressed data to an existing block).
229261
- //
229262
- // However, the hash function in the encoder hash table is endian-sensitive, and given its
229263
- // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables.
229264
- // Doing a endian-conversion during hashing will be slow and self-defeating.
229265
- //
229266
- // Overall, we could support reconstructing an encoder for incremental compression, but
229267
- // should enforce equal-endianness. Bit of a bummer. Not going there now.
229268
- //
229269
- // The version field is now there just for future-proofness, but not used yet
229270
-
229271
- // version allows keeping track of fsst versions, track endianness, and encoder reconstruction
229272
- u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0
229273
- (((u64) e->symbolTable->suffixLim) << 24) |
229274
- (((u64) e->symbolTable->terminator) << 16) |
229275
- (((u64) e->symbolTable->nSymbols) << 8) |
229276
- FSST_ENDIAN_MARKER; // least significant byte is nonzero
229261
+ Encoder *e = (Encoder*) encoder;
229262
+ // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there.
229263
+ // This is sufficient in principle to *reconstruct* a duckdb_fsst_encoder_t from a duckdb_fsst_decoder_t
229264
+ // (such functionality could be useful to append compressed data to an existing block).
229265
+ //
229266
+ // However, the hash function in the encoder hash table is endian-sensitive, and given its
229267
+ // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables.
229268
+ // Doing a endian-conversion during hashing will be slow and self-defeating.
229269
+ //
229270
+ // Overall, we could support reconstructing an encoder for incremental compression, but
229271
+ // should enforce equal-endianness. Bit of a bummer. Not going there now.
229272
+ //
229273
+ // The version field is now there just for future-proofness, but not used yet
229274
+
229275
+ // version allows keeping track of fsst versions, track endianness, and encoder reconstruction
229276
+ u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0
229277
+ (((u64) e->symbolTable->suffixLim) << 24) |
229278
+ (((u64) e->symbolTable->terminator) << 16) |
229279
+ (((u64) e->symbolTable->nSymbols) << 8) |
229280
+ FSST_ENDIAN_MARKER; // least significant byte is nonzero
229277
229281
 
229278
- /* do not assume unaligned reads here */
229279
- memcpy(buf, &version, 8);
229280
- buf[8] = e->symbolTable->zeroTerminated;
229281
- for(u32 i=0; i<8; i++)
229282
- buf[9+i] = (u8) e->symbolTable->lenHisto[i];
229283
- u32 pos = 17;
229282
+ /* do not assume unaligned reads here */
229283
+ memcpy(buf, &version, 8);
229284
+ buf[8] = e->symbolTable->zeroTerminated;
229285
+ for(u32 i=0; i<8; i++)
229286
+ buf[9+i] = (u8) e->symbolTable->lenHisto[i];
229287
+ u32 pos = 17;
229284
229288
 
229285
- // emit only the used bytes of the symbols
229286
- for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++)
229287
- for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++)
229288
- buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes
229289
+ // emit only the used bytes of the symbols
229290
+ for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++)
229291
+ for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++)
229292
+ buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes
229289
229293
 
229290
- return pos; // length of what was serialized
229294
+ return pos; // length of what was serialized
229291
229295
  }
229292
229296
 
229293
229297
  #define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */
229294
229298
 
229295
229299
  extern "C" u32 duckdb_fsst_import(duckdb_fsst_decoder_t *decoder, u8 *buf) {
229296
- u64 version = 0;
229297
- u32 code, pos = 17;
229298
- u8 lenHisto[8];
229299
-
229300
- // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped)
229301
- memcpy(&version, buf, 8);
229302
- if ((version>>32) != FSST_VERSION) return 0;
229303
- decoder->zeroTerminated = buf[8]&1;
229304
- memcpy(lenHisto, buf+9, 8);
229305
-
229306
- // in case of zero-terminated, first symbol is "" (zero always, may be overwritten)
229307
- decoder->len[0] = 1;
229308
- decoder->symbol[0] = 0;
229309
-
229310
- // we use lenHisto[0] as 1-byte symbol run length (at the end)
229311
- code = decoder->zeroTerminated;
229312
- if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end
229313
-
229314
- // now get all symbols from the buffer
229315
- for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */
229316
- for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) {
229317
- decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */
229318
- decoder->symbol[code] = 0;
229319
- for(u32 j=0; j<decoder->len[code]; j++)
229320
- ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols
229321
- }
229322
- }
229323
- if (decoder->zeroTerminated) lenHisto[0]++;
229300
+ u64 version = 0;
229301
+ u32 code, pos = 17;
229302
+ u8 lenHisto[8];
229324
229303
 
229325
- // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols).
229326
- while(code<255) {
229327
- decoder->symbol[code] = FSST_CORRUPT;
229328
- decoder->len[code++] = 8;
229329
- }
229330
- return pos;
229304
+ // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped)
229305
+ memcpy(&version, buf, 8);
229306
+ if ((version>>32) != FSST_VERSION) return 0;
229307
+ decoder->zeroTerminated = buf[8]&1;
229308
+ memcpy(lenHisto, buf+9, 8);
229309
+
229310
+ // in case of zero-terminated, first symbol is "" (zero always, may be overwritten)
229311
+ decoder->len[0] = 1;
229312
+ decoder->symbol[0] = 0;
229313
+
229314
+ // we use lenHisto[0] as 1-byte symbol run length (at the end)
229315
+ code = decoder->zeroTerminated;
229316
+ if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end
229317
+
229318
+ // now get all symbols from the buffer
229319
+ for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */
229320
+ for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) {
229321
+ decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */
229322
+ decoder->symbol[code] = 0;
229323
+ for(u32 j=0; j<decoder->len[code]; j++)
229324
+ ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols
229325
+ }
229326
+ }
229327
+ if (decoder->zeroTerminated) lenHisto[0]++;
229328
+
229329
+ // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols).
229330
+ while(code<255) {
229331
+ decoder->symbol[code] = FSST_CORRUPT;
229332
+ decoder->len[code++] = 8;
229333
+ }
229334
+ return pos;
229331
229335
  }
229332
229336
 
229333
229337
  // runtime check for simd
229334
229338
  inline size_t _compressImpl(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
229335
229339
  #ifndef NONOPT_FSST
229336
- if (simd && duckdb_fsst_hasAVX512())
229337
- return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd);
229340
+ if (simd && duckdb_fsst_hasAVX512())
229341
+ return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd);
229338
229342
  #endif
229339
- (void) simd;
229340
- return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch);
229343
+ (void) simd;
229344
+ return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch);
229341
229345
  }
229342
229346
  size_t compressImpl(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
229343
- return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd);
229347
+ return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd);
229344
229348
  }
229345
229349
 
229346
- // adaptive choosing of scalar compression method based on symbol length histogram
229350
+ // adaptive choosing of scalar compression method based on symbol length histogram
229347
229351
  inline size_t _compressAuto(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) {
229348
- bool avoidBranch = false, noSuffixOpt = false;
229349
- if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) {
229350
- noSuffixOpt = true;
229351
- } else if ((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) &&
229352
- (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) &&
229353
- (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) {
229354
- avoidBranch = true;
229355
- }
229356
- return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd);
229352
+ bool avoidBranch = false, noSuffixOpt = false;
229353
+ if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) {
229354
+ noSuffixOpt = true;
229355
+ } else if ((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) &&
229356
+ (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) &&
229357
+ (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) {
229358
+ avoidBranch = true;
229359
+ }
229360
+ return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd);
229357
229361
  }
229358
229362
  size_t compressAuto(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) {
229359
- return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd);
229363
+ return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd);
229360
229364
  }
229361
229365
 
229362
229366
  // the main compression function (everything automatic)
229363
229367
  extern "C" size_t duckdb_fsst_compress(duckdb_fsst_encoder_t *encoder, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) {
229364
- // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB)
229365
- size_t totLen = accumulate(lenIn, lenIn+nlines, 0);
229366
- int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15);
229367
- return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd);
229368
+ // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB)
229369
+ size_t totLen = accumulate(lenIn, lenIn+nlines, 0);
229370
+ int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15);
229371
+ return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd);
229368
229372
  }
229369
229373
 
229370
229374
  /* deallocate encoder */
229371
229375
  extern "C" void duckdb_fsst_destroy(duckdb_fsst_encoder_t* encoder) {
229372
- Encoder *e = (Encoder*) encoder;
229373
- delete e;
229376
+ Encoder *e = (Encoder*) encoder;
229377
+ delete e;
229374
229378
  }
229375
229379
 
229376
229380
  /* very lazy implementation relying on export and import */
229377
229381
  extern "C" duckdb_fsst_decoder_t duckdb_fsst_decoder(duckdb_fsst_encoder_t *encoder) {
229378
- u8 buf[sizeof(duckdb_fsst_decoder_t)];
229379
- u32 cnt1 = duckdb_fsst_export(encoder, buf);
229380
- duckdb_fsst_decoder_t decoder;
229381
- u32 cnt2 = duckdb_fsst_import(&decoder, buf);
229382
- assert(cnt1 == cnt2); (void) cnt1; (void) cnt2;
229383
- return decoder;
229382
+ u8 buf[sizeof(duckdb_fsst_decoder_t)];
229383
+ u32 cnt1 = duckdb_fsst_export(encoder, buf);
229384
+ duckdb_fsst_decoder_t decoder;
229385
+ u32 cnt2 = duckdb_fsst_import(&decoder, buf);
229386
+ assert(cnt1 == cnt2); (void) cnt1; (void) cnt2;
229387
+ return decoder;
229384
229388
  }
229385
229389
 
229386
-
229387
229390
  // LICENSE_CHANGE_END
229388
229391
 
229389
229392