duckdb 0.6.2-dev32.0 → 0.6.2-dev416.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb.cpp +2537 -1098
- package/src/duckdb.hpp +3667 -3446
- package/src/parquet-amalgamation.cpp +17147 -16936
- package/src/parquet-amalgamation.hpp +46 -4
|
@@ -7193,7 +7193,7 @@ public:
|
|
|
7193
7193
|
return;
|
|
7194
7194
|
}
|
|
7195
7195
|
if (new_size > alloc_len) {
|
|
7196
|
-
alloc_len = new_size;
|
|
7196
|
+
alloc_len = NextPowerOfTwo(new_size);
|
|
7197
7197
|
allocated_data = allocator.Allocate(alloc_len);
|
|
7198
7198
|
ptr = (char *)allocated_data.get();
|
|
7199
7199
|
}
|
|
@@ -7387,7 +7387,6 @@ namespace duckdb {
|
|
|
7387
7387
|
class DbpDecoder {
|
|
7388
7388
|
public:
|
|
7389
7389
|
DbpDecoder(const uint8_t *buffer, uint32_t buffer_len) : buffer_((char *)buffer, buffer_len) {
|
|
7390
|
-
|
|
7391
7390
|
//<block size in values> <number of miniblocks in a block> <total value count> <first value>
|
|
7392
7391
|
// overall header
|
|
7393
7392
|
block_value_count = ParquetDecodeUtils::VarintDecode<uint64_t>(buffer_);
|
|
@@ -7396,6 +7395,7 @@ public:
|
|
|
7396
7395
|
start_value = ParquetDecodeUtils::ZigzagToInt(ParquetDecodeUtils::VarintDecode<int64_t>(buffer_));
|
|
7397
7396
|
|
|
7398
7397
|
// some derivatives
|
|
7398
|
+
D_ASSERT(miniblocks_per_block > 0);
|
|
7399
7399
|
values_per_miniblock = block_value_count / miniblocks_per_block;
|
|
7400
7400
|
miniblock_bit_widths = std::unique_ptr<uint8_t[]>(new data_t[miniblocks_per_block]);
|
|
7401
7401
|
|
|
@@ -7474,6 +7474,17 @@ public:
|
|
|
7474
7474
|
}
|
|
7475
7475
|
start_value = values[batch_size - 1];
|
|
7476
7476
|
}
|
|
7477
|
+
void Finalize() {
|
|
7478
|
+
if (values_left_in_miniblock == 0) {
|
|
7479
|
+
return;
|
|
7480
|
+
}
|
|
7481
|
+
auto data = unique_ptr<uint32_t[]>(new uint32_t[values_left_in_miniblock]);
|
|
7482
|
+
GetBatch<uint32_t>((char *)data.get(), values_left_in_miniblock);
|
|
7483
|
+
}
|
|
7484
|
+
|
|
7485
|
+
uint64_t TotalValues() {
|
|
7486
|
+
return total_value_count;
|
|
7487
|
+
}
|
|
7477
7488
|
|
|
7478
7489
|
private:
|
|
7479
7490
|
ByteBuffer buffer_;
|
|
@@ -7579,11 +7590,31 @@ public:
|
|
|
7579
7590
|
|
|
7580
7591
|
virtual unique_ptr<BaseStatistics> Stats(idx_t row_group_idx_p, const std::vector<ColumnChunk> &columns);
|
|
7581
7592
|
|
|
7593
|
+
template <class VALUE_TYPE, class CONVERSION>
|
|
7594
|
+
void PlainTemplated(shared_ptr<ByteBuffer> plain_data, uint8_t *defines, uint64_t num_values,
|
|
7595
|
+
parquet_filter_t &filter, idx_t result_offset, Vector &result) {
|
|
7596
|
+
auto result_ptr = FlatVector::GetData<VALUE_TYPE>(result);
|
|
7597
|
+
auto &result_mask = FlatVector::Validity(result);
|
|
7598
|
+
for (idx_t row_idx = 0; row_idx < num_values; row_idx++) {
|
|
7599
|
+
if (HasDefines() && defines[row_idx + result_offset] != max_define) {
|
|
7600
|
+
result_mask.SetInvalid(row_idx + result_offset);
|
|
7601
|
+
continue;
|
|
7602
|
+
}
|
|
7603
|
+
if (filter[row_idx + result_offset]) {
|
|
7604
|
+
VALUE_TYPE val = CONVERSION::PlainRead(*plain_data, *this);
|
|
7605
|
+
result_ptr[row_idx + result_offset] = val;
|
|
7606
|
+
} else { // there is still some data there that we have to skip over
|
|
7607
|
+
CONVERSION::PlainSkip(*plain_data, *this);
|
|
7608
|
+
}
|
|
7609
|
+
}
|
|
7610
|
+
}
|
|
7611
|
+
|
|
7582
7612
|
protected:
|
|
7613
|
+
Allocator &GetAllocator();
|
|
7583
7614
|
// readers that use the default Read() need to implement those
|
|
7584
7615
|
virtual void Plain(shared_ptr<ByteBuffer> plain_data, uint8_t *defines, idx_t num_values, parquet_filter_t &filter,
|
|
7585
7616
|
idx_t result_offset, Vector &result);
|
|
7586
|
-
virtual void Dictionary(shared_ptr<
|
|
7617
|
+
virtual void Dictionary(shared_ptr<ResizeableBuffer> dictionary_data, idx_t num_entries);
|
|
7587
7618
|
virtual void Offsets(uint32_t *offsets, uint8_t *defines, idx_t num_values, parquet_filter_t &filter,
|
|
7588
7619
|
idx_t result_offset, Vector &result);
|
|
7589
7620
|
|
|
@@ -7591,6 +7622,11 @@ protected:
|
|
|
7591
7622
|
virtual void DictReference(Vector &result);
|
|
7592
7623
|
virtual void PlainReference(shared_ptr<ByteBuffer>, Vector &result);
|
|
7593
7624
|
|
|
7625
|
+
virtual void PrepareDeltaLengthByteArray(ResizeableBuffer &buffer);
|
|
7626
|
+
virtual void PrepareDeltaByteArray(ResizeableBuffer &buffer);
|
|
7627
|
+
virtual void DeltaByteArray(uint8_t *defines, idx_t num_values, parquet_filter_t &filter, idx_t result_offset,
|
|
7628
|
+
Vector &result);
|
|
7629
|
+
|
|
7594
7630
|
// applies any skips that were registered using Skip()
|
|
7595
7631
|
virtual void ApplyPendingSkips(idx_t num_values);
|
|
7596
7632
|
|
|
@@ -7611,10 +7647,13 @@ protected:
|
|
|
7611
7647
|
|
|
7612
7648
|
ParquetReader &reader;
|
|
7613
7649
|
LogicalType type;
|
|
7650
|
+
unique_ptr<Vector> byte_array_data;
|
|
7614
7651
|
|
|
7615
7652
|
idx_t pending_skips = 0;
|
|
7616
7653
|
|
|
7617
7654
|
private:
|
|
7655
|
+
void AllocateBlock(idx_t size);
|
|
7656
|
+
void AllocateCompressed(idx_t size);
|
|
7618
7657
|
void PrepareRead(parquet_filter_t &filter);
|
|
7619
7658
|
void PreparePage(PageHeader &page_hdr);
|
|
7620
7659
|
void PrepareDataPage(PageHeader &page_hdr);
|
|
@@ -7630,12 +7669,14 @@ private:
|
|
|
7630
7669
|
|
|
7631
7670
|
shared_ptr<ResizeableBuffer> block;
|
|
7632
7671
|
|
|
7672
|
+
ResizeableBuffer compressed_buffer;
|
|
7633
7673
|
ResizeableBuffer offset_buffer;
|
|
7634
7674
|
|
|
7635
7675
|
unique_ptr<RleBpDecoder> dict_decoder;
|
|
7636
7676
|
unique_ptr<RleBpDecoder> defined_decoder;
|
|
7637
7677
|
unique_ptr<RleBpDecoder> repeated_decoder;
|
|
7638
7678
|
unique_ptr<DbpDecoder> dbp_decoder;
|
|
7679
|
+
unique_ptr<RleBpDecoder> rle_decoder;
|
|
7639
7680
|
|
|
7640
7681
|
// dummies for Skip()
|
|
7641
7682
|
parquet_filter_t none_filter;
|
|
@@ -7908,7 +7949,8 @@ public:
|
|
|
7908
7949
|
idx_t max_repeat = 0, idx_t max_define = 1,
|
|
7909
7950
|
bool can_have_nulls = true);
|
|
7910
7951
|
|
|
7911
|
-
virtual unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::format::RowGroup &row_group
|
|
7952
|
+
virtual unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::format::RowGroup &row_group,
|
|
7953
|
+
Allocator &allocator) = 0;
|
|
7912
7954
|
|
|
7913
7955
|
//! indicates whether the write need to analyse the data before preparing it
|
|
7914
7956
|
virtual bool HasAnalyze() {
|