duckdb 0.6.2-dev32.0 → 0.6.2-dev416.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7193,7 +7193,7 @@ public:
7193
7193
  return;
7194
7194
  }
7195
7195
  if (new_size > alloc_len) {
7196
- alloc_len = new_size;
7196
+ alloc_len = NextPowerOfTwo(new_size);
7197
7197
  allocated_data = allocator.Allocate(alloc_len);
7198
7198
  ptr = (char *)allocated_data.get();
7199
7199
  }
@@ -7387,7 +7387,6 @@ namespace duckdb {
7387
7387
  class DbpDecoder {
7388
7388
  public:
7389
7389
  DbpDecoder(const uint8_t *buffer, uint32_t buffer_len) : buffer_((char *)buffer, buffer_len) {
7390
-
7391
7390
  //<block size in values> <number of miniblocks in a block> <total value count> <first value>
7392
7391
  // overall header
7393
7392
  block_value_count = ParquetDecodeUtils::VarintDecode<uint64_t>(buffer_);
@@ -7396,6 +7395,7 @@ public:
7396
7395
  start_value = ParquetDecodeUtils::ZigzagToInt(ParquetDecodeUtils::VarintDecode<int64_t>(buffer_));
7397
7396
 
7398
7397
  // some derivatives
7398
+ D_ASSERT(miniblocks_per_block > 0);
7399
7399
  values_per_miniblock = block_value_count / miniblocks_per_block;
7400
7400
  miniblock_bit_widths = std::unique_ptr<uint8_t[]>(new data_t[miniblocks_per_block]);
7401
7401
 
@@ -7474,6 +7474,17 @@ public:
7474
7474
  }
7475
7475
  start_value = values[batch_size - 1];
7476
7476
  }
7477
+ void Finalize() {
7478
+ if (values_left_in_miniblock == 0) {
7479
+ return;
7480
+ }
7481
+ auto data = unique_ptr<uint32_t[]>(new uint32_t[values_left_in_miniblock]);
7482
+ GetBatch<uint32_t>((char *)data.get(), values_left_in_miniblock);
7483
+ }
7484
+
7485
+ uint64_t TotalValues() {
7486
+ return total_value_count;
7487
+ }
7477
7488
 
7478
7489
  private:
7479
7490
  ByteBuffer buffer_;
@@ -7579,11 +7590,31 @@ public:
7579
7590
 
7580
7591
  virtual unique_ptr<BaseStatistics> Stats(idx_t row_group_idx_p, const std::vector<ColumnChunk> &columns);
7581
7592
 
7593
+ template <class VALUE_TYPE, class CONVERSION>
7594
+ void PlainTemplated(shared_ptr<ByteBuffer> plain_data, uint8_t *defines, uint64_t num_values,
7595
+ parquet_filter_t &filter, idx_t result_offset, Vector &result) {
7596
+ auto result_ptr = FlatVector::GetData<VALUE_TYPE>(result);
7597
+ auto &result_mask = FlatVector::Validity(result);
7598
+ for (idx_t row_idx = 0; row_idx < num_values; row_idx++) {
7599
+ if (HasDefines() && defines[row_idx + result_offset] != max_define) {
7600
+ result_mask.SetInvalid(row_idx + result_offset);
7601
+ continue;
7602
+ }
7603
+ if (filter[row_idx + result_offset]) {
7604
+ VALUE_TYPE val = CONVERSION::PlainRead(*plain_data, *this);
7605
+ result_ptr[row_idx + result_offset] = val;
7606
+ } else { // there is still some data there that we have to skip over
7607
+ CONVERSION::PlainSkip(*plain_data, *this);
7608
+ }
7609
+ }
7610
+ }
7611
+
7582
7612
  protected:
7613
+ Allocator &GetAllocator();
7583
7614
  // readers that use the default Read() need to implement those
7584
7615
  virtual void Plain(shared_ptr<ByteBuffer> plain_data, uint8_t *defines, idx_t num_values, parquet_filter_t &filter,
7585
7616
  idx_t result_offset, Vector &result);
7586
- virtual void Dictionary(shared_ptr<ByteBuffer> dictionary_data, idx_t num_entries);
7617
+ virtual void Dictionary(shared_ptr<ResizeableBuffer> dictionary_data, idx_t num_entries);
7587
7618
  virtual void Offsets(uint32_t *offsets, uint8_t *defines, idx_t num_values, parquet_filter_t &filter,
7588
7619
  idx_t result_offset, Vector &result);
7589
7620
 
@@ -7591,6 +7622,11 @@ protected:
7591
7622
  virtual void DictReference(Vector &result);
7592
7623
  virtual void PlainReference(shared_ptr<ByteBuffer>, Vector &result);
7593
7624
 
7625
+ virtual void PrepareDeltaLengthByteArray(ResizeableBuffer &buffer);
7626
+ virtual void PrepareDeltaByteArray(ResizeableBuffer &buffer);
7627
+ virtual void DeltaByteArray(uint8_t *defines, idx_t num_values, parquet_filter_t &filter, idx_t result_offset,
7628
+ Vector &result);
7629
+
7594
7630
  // applies any skips that were registered using Skip()
7595
7631
  virtual void ApplyPendingSkips(idx_t num_values);
7596
7632
 
@@ -7611,10 +7647,13 @@ protected:
7611
7647
 
7612
7648
  ParquetReader &reader;
7613
7649
  LogicalType type;
7650
+ unique_ptr<Vector> byte_array_data;
7614
7651
 
7615
7652
  idx_t pending_skips = 0;
7616
7653
 
7617
7654
  private:
7655
+ void AllocateBlock(idx_t size);
7656
+ void AllocateCompressed(idx_t size);
7618
7657
  void PrepareRead(parquet_filter_t &filter);
7619
7658
  void PreparePage(PageHeader &page_hdr);
7620
7659
  void PrepareDataPage(PageHeader &page_hdr);
@@ -7630,12 +7669,14 @@ private:
7630
7669
 
7631
7670
  shared_ptr<ResizeableBuffer> block;
7632
7671
 
7672
+ ResizeableBuffer compressed_buffer;
7633
7673
  ResizeableBuffer offset_buffer;
7634
7674
 
7635
7675
  unique_ptr<RleBpDecoder> dict_decoder;
7636
7676
  unique_ptr<RleBpDecoder> defined_decoder;
7637
7677
  unique_ptr<RleBpDecoder> repeated_decoder;
7638
7678
  unique_ptr<DbpDecoder> dbp_decoder;
7679
+ unique_ptr<RleBpDecoder> rle_decoder;
7639
7680
 
7640
7681
  // dummies for Skip()
7641
7682
  parquet_filter_t none_filter;
@@ -7908,7 +7949,8 @@ public:
7908
7949
  idx_t max_repeat = 0, idx_t max_define = 1,
7909
7950
  bool can_have_nulls = true);
7910
7951
 
7911
- virtual unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::format::RowGroup &row_group) = 0;
7952
+ virtual unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::format::RowGroup &row_group,
7953
+ Allocator &allocator) = 0;
7912
7954
 
7913
7955
  //! indicates whether the write need to analyse the data before preparing it
7914
7956
  virtual bool HasAnalyze() {