duckdb 0.6.2-dev960.0 → 0.6.2-dev969.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
|
2
|
-
#define DUCKDB_VERSION "0.6.2-
|
|
2
|
+
#define DUCKDB_VERSION "0.6.2-dev969"
|
|
3
3
|
#endif
|
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
|
5
|
+
#define DUCKDB_SOURCE_ID "8fa1b6e786"
|
|
6
6
|
#endif
|
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
|
8
8
|
#include "duckdb/main/database.hpp"
|
|
@@ -129,19 +129,9 @@ void StructColumnData::Append(BaseStatistics &stats, ColumnAppendState &state, V
|
|
|
129
129
|
// append the null values
|
|
130
130
|
validity.Append(*stats.validity_stats, state.child_appends[0], vector, count);
|
|
131
131
|
|
|
132
|
-
auto &struct_validity = FlatVector::Validity(vector);
|
|
133
|
-
|
|
134
132
|
auto &struct_stats = (StructStatistics &)stats;
|
|
135
133
|
auto &child_entries = StructVector::GetEntries(vector);
|
|
136
134
|
for (idx_t i = 0; i < child_entries.size(); i++) {
|
|
137
|
-
if (!struct_validity.AllValid()) {
|
|
138
|
-
// we set the child entries of the struct to NULL
|
|
139
|
-
// for any values in which the struct itself is NULL
|
|
140
|
-
child_entries[i]->Flatten(count);
|
|
141
|
-
|
|
142
|
-
auto &child_validity = FlatVector::Validity(*child_entries[i]);
|
|
143
|
-
child_validity.Combine(struct_validity, count);
|
|
144
|
-
}
|
|
145
135
|
sub_columns[i]->Append(*struct_stats.child_stats[i], state.child_appends[i + 1], *child_entries[i], count);
|
|
146
136
|
}
|
|
147
137
|
}
|
|
@@ -454,21 +454,24 @@ static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, size_
|
|
|
454
454
|
#define FSST_SAMPLELINE ((size_t) 512)
|
|
455
455
|
|
|
456
456
|
// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes
|
|
457
|
-
vector<u8*> makeSample(u8* sampleBuf, u8* strIn[], size_t
|
|
458
|
-
|
|
457
|
+
vector<u8*> makeSample(u8* sampleBuf, u8* strIn[], size_t *lenIn, size_t nlines,
|
|
458
|
+
unique_ptr<vector<size_t>>& sample_len_out) {
|
|
459
|
+
size_t totSize = 0;
|
|
459
460
|
vector<u8*> sample;
|
|
460
461
|
|
|
461
462
|
for(size_t i=0; i<nlines; i++)
|
|
462
463
|
totSize += lenIn[i];
|
|
463
|
-
|
|
464
464
|
if (totSize < FSST_SAMPLETARGET) {
|
|
465
465
|
for(size_t i=0; i<nlines; i++)
|
|
466
466
|
sample.push_back(strIn[i]);
|
|
467
467
|
} else {
|
|
468
468
|
size_t sampleRnd = FSST_HASH(4637947);
|
|
469
469
|
u8* sampleLim = sampleBuf + FSST_SAMPLETARGET;
|
|
470
|
-
size_t *sampleLen = *lenRef = new size_t[nlines + FSST_SAMPLEMAXSZ/FSST_SAMPLELINE];
|
|
471
470
|
|
|
471
|
+
sample_len_out = unique_ptr<vector<size_t>>(new vector<size_t>());
|
|
472
|
+
sample_len_out->reserve(nlines + FSST_SAMPLEMAXSZ/FSST_SAMPLELINE);
|
|
473
|
+
|
|
474
|
+
// This fails if we have a lot of small strings and a few big ones?
|
|
472
475
|
while(sampleBuf < sampleLim) {
|
|
473
476
|
// choose a non-empty line
|
|
474
477
|
sampleRnd = FSST_HASH(sampleRnd);
|
|
@@ -485,7 +488,9 @@ vector<u8*> makeSample(u8* sampleBuf, u8* strIn[], size_t **lenRef, size_t nline
|
|
|
485
488
|
size_t len = min(lenIn[linenr]-chunk,FSST_SAMPLELINE);
|
|
486
489
|
memcpy(sampleBuf, strIn[linenr]+chunk, len);
|
|
487
490
|
sample.push_back(sampleBuf);
|
|
488
|
-
|
|
491
|
+
|
|
492
|
+
sample_len_out->push_back(len);
|
|
493
|
+
sampleBuf += len;
|
|
489
494
|
}
|
|
490
495
|
}
|
|
491
496
|
return sample;
|
|
@@ -493,11 +498,11 @@ vector<u8*> makeSample(u8* sampleBuf, u8* strIn[], size_t **lenRef, size_t nline
|
|
|
493
498
|
|
|
494
499
|
extern "C" duckdb_fsst_encoder_t* duckdb_fsst_create(size_t n, size_t lenIn[], u8 *strIn[], int zeroTerminated) {
|
|
495
500
|
u8* sampleBuf = new u8[FSST_SAMPLEMAXSZ];
|
|
496
|
-
size_t
|
|
497
|
-
vector<u8*> sample = makeSample(sampleBuf, strIn,
|
|
501
|
+
unique_ptr<vector<size_t>> sample_sizes;
|
|
502
|
+
vector<u8*> sample = makeSample(sampleBuf, strIn, lenIn, n?n:1, sample_sizes); // careful handling of input to get a right-size and representative sample
|
|
498
503
|
Encoder *encoder = new Encoder();
|
|
504
|
+
size_t* sampleLen = sample_sizes ? sample_sizes->data() : &lenIn[0];
|
|
499
505
|
encoder->symbolTable = shared_ptr<SymbolTable>(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated));
|
|
500
|
-
if (sampleLen != lenIn) delete[] sampleLen;
|
|
501
506
|
delete[] sampleBuf;
|
|
502
507
|
return (duckdb_fsst_encoder_t*) encoder;
|
|
503
508
|
}
|