duckdb 0.5.2-dev2196.0 → 0.5.2-dev2214.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.5.2-dev2196.0",
5
+ "version": "0.5.2-dev2214.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
package/src/duckdb.cpp CHANGED
@@ -39768,6 +39768,7 @@ void GlobalSortState::Print() {
39768
39768
 
39769
39769
 
39770
39770
 
39771
+
39771
39772
  #include <numeric>
39772
39773
 
39773
39774
  namespace duckdb {
@@ -40047,85 +40048,65 @@ void SBScanState::SetIndices(idx_t block_idx_to, idx_t entry_idx_to) {
40047
40048
  entry_idx = entry_idx_to;
40048
40049
  }
40049
40050
 
40050
- PayloadScanner::PayloadScanner(SortedData &sorted_data, GlobalSortState &global_sort_state, bool flush_p)
40051
- : sorted_data(sorted_data), read_state(global_sort_state.buffer_manager, global_sort_state),
40052
- total_count(sorted_data.Count()), total_scanned(0), flush(flush_p),
40053
- unswizzling(!sorted_data.layout.AllConstant() && global_sort_state.external) {
40054
- ValidateUnscannedBlock();
40051
+ PayloadScanner::PayloadScanner(SortedData &sorted_data, GlobalSortState &global_sort_state, bool flush_p) {
40052
+ auto count = sorted_data.Count();
40053
+ auto &layout = sorted_data.layout;
40054
+
40055
+ // Create collections to put the data into so we can use RowDataCollectionScanner
40056
+ rows = make_unique<RowDataCollection>(global_sort_state.buffer_manager, (idx_t)Storage::BLOCK_SIZE, 1);
40057
+ rows->count = count;
40058
+
40059
+ heap = make_unique<RowDataCollection>(global_sort_state.buffer_manager, (idx_t)Storage::BLOCK_SIZE, 1);
40060
+ if (!sorted_data.layout.AllConstant()) {
40061
+ heap->count = count;
40062
+ }
40063
+
40064
+ if (flush_p) {
40065
+ // If we are flushing, we can just move the data
40066
+ rows->blocks = move(sorted_data.data_blocks);
40067
+ if (!layout.AllConstant()) {
40068
+ heap->blocks = move(sorted_data.heap_blocks);
40069
+ }
40070
+ } else {
40071
+ // Not flushing, create references to the blocks
40072
+ for (auto &block : sorted_data.data_blocks) {
40073
+ rows->blocks.emplace_back(block->Copy());
40074
+ }
40075
+ if (!layout.AllConstant()) {
40076
+ for (auto &block : sorted_data.heap_blocks) {
40077
+ heap->blocks.emplace_back(block->Copy());
40078
+ }
40079
+ }
40080
+ }
40081
+
40082
+ scanner = make_unique<RowDataCollectionScanner>(*rows, *heap, layout, global_sort_state.external, flush_p);
40055
40083
  }
40056
40084
 
40057
40085
  PayloadScanner::PayloadScanner(GlobalSortState &global_sort_state, bool flush_p)
40058
40086
  : PayloadScanner(*global_sort_state.sorted_blocks[0]->payload_data, global_sort_state, flush_p) {
40059
40087
  }
40060
40088
 
40061
- PayloadScanner::PayloadScanner(GlobalSortState &global_sort_state, idx_t block_idx)
40062
- : sorted_data(*global_sort_state.sorted_blocks[0]->payload_data),
40063
- read_state(global_sort_state.buffer_manager, global_sort_state),
40064
- total_count(sorted_data.data_blocks[block_idx]->count), total_scanned(0), flush(false),
40065
- unswizzling(!sorted_data.layout.AllConstant() && global_sort_state.external) {
40066
- read_state.SetIndices(block_idx, 0);
40067
- ValidateUnscannedBlock();
40068
- }
40089
+ PayloadScanner::PayloadScanner(GlobalSortState &global_sort_state, idx_t block_idx) {
40090
+ auto &sorted_data = *global_sort_state.sorted_blocks[0]->payload_data;
40091
+ auto count = sorted_data.data_blocks[block_idx]->count;
40092
+ auto &layout = sorted_data.layout;
40069
40093
 
40070
- void PayloadScanner::ValidateUnscannedBlock() const {
40071
- if (unswizzling && read_state.block_idx < sorted_data.data_blocks.size()) {
40072
- D_ASSERT(sorted_data.data_blocks[read_state.block_idx]->block->IsSwizzled());
40094
+ // Create collections to put the data into so we can use RowDataCollectionScanner
40095
+ rows = make_unique<RowDataCollection>(global_sort_state.buffer_manager, (idx_t)Storage::BLOCK_SIZE, 1);
40096
+ rows->blocks.emplace_back(sorted_data.data_blocks[block_idx]->Copy());
40097
+ rows->count = count;
40098
+
40099
+ heap = make_unique<RowDataCollection>(global_sort_state.buffer_manager, (idx_t)Storage::BLOCK_SIZE, 1);
40100
+ if (!sorted_data.layout.AllConstant() && sorted_data.swizzled) {
40101
+ heap->blocks.emplace_back(sorted_data.heap_blocks[block_idx]->Copy());
40102
+ heap->count = count;
40073
40103
  }
40104
+
40105
+ scanner = make_unique<RowDataCollectionScanner>(*rows, *heap, layout, global_sort_state.external, false);
40074
40106
  }
40075
40107
 
40076
40108
  void PayloadScanner::Scan(DataChunk &chunk) {
40077
- auto count = MinValue((idx_t)STANDARD_VECTOR_SIZE, total_count - total_scanned);
40078
- if (count == 0) {
40079
- chunk.SetCardinality(count);
40080
- return;
40081
- }
40082
- // Eagerly delete references to blocks that we've passed
40083
- if (flush) {
40084
- for (idx_t i = 0; i < read_state.block_idx; i++) {
40085
- sorted_data.data_blocks[i]->block = nullptr;
40086
- if (unswizzling) {
40087
- sorted_data.heap_blocks[i]->block = nullptr;
40088
- }
40089
- }
40090
- }
40091
- const idx_t &row_width = sorted_data.layout.GetRowWidth();
40092
- // Set up a batch of pointers to scan data from
40093
- idx_t scanned = 0;
40094
- auto data_pointers = FlatVector::GetData<data_ptr_t>(addresses);
40095
- while (scanned < count) {
40096
- read_state.PinData(sorted_data);
40097
- auto &data_block = *sorted_data.data_blocks[read_state.block_idx];
40098
- idx_t next = MinValue(data_block.count - read_state.entry_idx, count - scanned);
40099
- const data_ptr_t data_ptr = read_state.payload_data_handle.Ptr() + read_state.entry_idx * row_width;
40100
- // Set up the next pointers
40101
- data_ptr_t row_ptr = data_ptr;
40102
- for (idx_t i = 0; i < next; i++) {
40103
- data_pointers[scanned + i] = row_ptr;
40104
- row_ptr += row_width;
40105
- }
40106
- // Unswizzle the offsets back to pointers (if needed)
40107
- if (unswizzling) {
40108
- RowOperations::UnswizzlePointers(sorted_data.layout, data_ptr, read_state.payload_heap_handle.Ptr(), next);
40109
- sorted_data.data_blocks[read_state.block_idx]->block->SetSwizzling("PayloadScanner::Scan");
40110
- }
40111
- // Update state indices
40112
- read_state.entry_idx += next;
40113
- if (read_state.entry_idx == data_block.count) {
40114
- read_state.block_idx++;
40115
- read_state.entry_idx = 0;
40116
- ValidateUnscannedBlock();
40117
- }
40118
- scanned += next;
40119
- }
40120
- D_ASSERT(scanned == count);
40121
- // Deserialize the payload data
40122
- for (idx_t col_no = 0; col_no < sorted_data.layout.ColumnCount(); col_no++) {
40123
- RowOperations::Gather(addresses, *FlatVector::IncrementalSelectionVector(), chunk.data[col_no],
40124
- *FlatVector::IncrementalSelectionVector(), count, sorted_data.layout, col_no);
40125
- }
40126
- chunk.SetCardinality(count);
40127
- chunk.Verify();
40128
- total_scanned += scanned;
40109
+ scanner->Scan(chunk);
40129
40110
  }
40130
40111
 
40131
40112
  int SBIterator::ComparisonValue(ExpressionType comparison) {
@@ -43412,14 +43393,21 @@ ColumnDataRowCollection::ColumnDataRowCollection(const ColumnDataCollection &col
43412
43393
  return;
43413
43394
  }
43414
43395
  // read all the chunks
43415
- ColumnDataScanState scan_state;
43416
- collection.InitializeScan(scan_state);
43396
+ ColumnDataScanState temp_scan_state;
43397
+ collection.InitializeScan(temp_scan_state);
43417
43398
  while (true) {
43418
43399
  auto chunk = make_unique<DataChunk>();
43419
43400
  collection.InitializeScanChunk(*chunk);
43420
- if (!collection.Scan(scan_state, *chunk)) {
43401
+ if (!collection.Scan(temp_scan_state, *chunk)) {
43421
43402
  break;
43422
43403
  }
43404
+ // we keep the BufferHandles that are needed for the materialized collection pinned in the supplied scan_state
43405
+ auto &temp_handles = temp_scan_state.current_chunk_state.handles;
43406
+ auto &scan_handles = scan_state.current_chunk_state.handles;
43407
+ for (auto &temp_handle_pair : temp_handles) {
43408
+ auto handle_copy = make_pair<uint32_t, BufferHandle>(scan_handles.size(), move(temp_handle_pair.second));
43409
+ scan_state.current_chunk_state.handles.insert(move(handle_copy));
43410
+ }
43423
43411
  chunks.push_back(move(chunk));
43424
43412
  }
43425
43413
  // now create all of the column data rows
@@ -47745,6 +47733,7 @@ void RowDataCollection::Merge(RowDataCollection &other) {
47745
47733
  temp.block_capacity = other.block_capacity;
47746
47734
  temp.entry_size = other.entry_size;
47747
47735
  temp.blocks = move(other.blocks);
47736
+ temp.pinned_blocks = move(other.pinned_blocks);
47748
47737
  }
47749
47738
  other.Clear();
47750
47739
 
@@ -47761,115 +47750,6 @@ void RowDataCollection::Merge(RowDataCollection &other) {
47761
47750
  }
47762
47751
 
47763
47752
  } // namespace duckdb
47764
- //===----------------------------------------------------------------------===//
47765
- // DuckDB
47766
- //
47767
- // duckdb/common/types/row_data_collection_scanner.hpp
47768
- //
47769
- //
47770
- //===----------------------------------------------------------------------===//
47771
-
47772
-
47773
-
47774
-
47775
-
47776
-
47777
- namespace duckdb {
47778
-
47779
- class BufferHandle;
47780
- class RowDataCollection;
47781
- struct RowDataBlock;
47782
- class DataChunk;
47783
-
47784
- //! Used to scan the data into DataChunks after sorting
47785
- struct RowDataCollectionScanner {
47786
- public:
47787
- using Types = vector<LogicalType>;
47788
-
47789
- struct ScanState {
47790
- explicit ScanState(const RowDataCollectionScanner &scanner_p) : scanner(scanner_p), block_idx(0), entry_idx(0) {
47791
- }
47792
-
47793
- void PinData();
47794
-
47795
- //! The data layout
47796
- const RowDataCollectionScanner &scanner;
47797
-
47798
- idx_t block_idx;
47799
- idx_t entry_idx;
47800
-
47801
- BufferHandle data_handle;
47802
- BufferHandle heap_handle;
47803
-
47804
- // We must pin ALL blocks we are going to gather from
47805
- vector<BufferHandle> pinned_blocks;
47806
- };
47807
-
47808
- //! Ensure that heap blocks correspond to row blocks
47809
- static void AlignHeapBlocks(RowDataCollection &dst_block_collection, RowDataCollection &dst_string_heap,
47810
- RowDataCollection &src_block_collection, RowDataCollection &src_string_heap,
47811
- const RowLayout &layout);
47812
-
47813
- RowDataCollectionScanner(RowDataCollection &rows, RowDataCollection &heap, const RowLayout &layout, bool external,
47814
- bool flush = true);
47815
-
47816
- //! The type layout of the payload
47817
- inline const vector<LogicalType> &GetTypes() const {
47818
- return layout.GetTypes();
47819
- }
47820
-
47821
- //! The number of rows in the collection
47822
- inline idx_t Count() const {
47823
- return total_count;
47824
- }
47825
-
47826
- //! The number of rows scanned so far
47827
- inline idx_t Scanned() const {
47828
- return total_scanned;
47829
- }
47830
-
47831
- //! The number of remaining rows
47832
- inline idx_t Remaining() const {
47833
- return total_count - total_scanned;
47834
- }
47835
-
47836
- //! Swizzle the blocks for external scanning
47837
- //! Swizzling is all or nothing, so if we have scanned previously,
47838
- //! we need to re-swizzle.
47839
- void ReSwizzle();
47840
-
47841
- void SwizzleBlock(RowDataBlock &data_block, RowDataBlock &heap_block);
47842
-
47843
- //! Scans the next data chunk from the sorted data
47844
- void Scan(DataChunk &chunk);
47845
-
47846
- private:
47847
- //! The row data being scanned
47848
- RowDataCollection &rows;
47849
- //! The row heap being scanned
47850
- RowDataCollection &heap;
47851
- //! The data layout
47852
- const RowLayout layout;
47853
- //! Read state
47854
- ScanState read_state;
47855
- //! The total count of sorted_data
47856
- const idx_t total_count;
47857
- //! The number of rows scanned so far
47858
- idx_t total_scanned;
47859
- //! Addresses used to gather from the sorted data
47860
- Vector addresses = Vector(LogicalType::POINTER);
47861
- //! Whether the blocks can be flushed to disk
47862
- const bool external;
47863
- //! Whether to flush the blocks after scanning
47864
- const bool flush;
47865
- //! Whether we are unswizzling the blocks
47866
- const bool unswizzling;
47867
-
47868
- //! Checks that the newest block is valid
47869
- void ValidateUnscannedBlock() const;
47870
- };
47871
-
47872
- } // namespace duckdb
47873
47753
 
47874
47754
 
47875
47755
 
@@ -63506,6 +63386,8 @@ public:
63506
63386
 
63507
63387
  //! Fill the pointer with all the addresses from the hashtable for full scan
63508
63388
  idx_t FillWithHTOffsets(data_ptr_t *key_locations, JoinHTScanState &state);
63389
+ //! Pins all fixed-size blocks
63390
+ void PinAllBlocks();
63509
63391
 
63510
63392
  idx_t Count() const {
63511
63393
  return block_collection->count;
@@ -64602,6 +64484,12 @@ idx_t JoinHashTable::FillWithHTOffsets(data_ptr_t *key_locations, JoinHTScanStat
64602
64484
  return key_count;
64603
64485
  }
64604
64486
 
64487
+ void JoinHashTable::PinAllBlocks() {
64488
+ for (auto &block : block_collection->blocks) {
64489
+ pinned_handles.push_back(buffer_manager.Pin(block->block));
64490
+ }
64491
+ }
64492
+
64605
64493
  void JoinHashTable::SwizzleBlocks() {
64606
64494
  if (block_collection->count == 0) {
64607
64495
  return;
@@ -73113,6 +73001,9 @@ bool PerfectHashJoinExecutor::BuildPerfectHashTable(LogicalType &key_type) {
73113
73001
  bitmap_build_idx = unique_ptr<bool[]>(new bool[build_size]);
73114
73002
  memset(bitmap_build_idx.get(), 0, sizeof(bool) * build_size); // set false
73115
73003
 
73004
+ // pin all fixed-size blocks (variable-sized should still be pinned)
73005
+ ht.PinAllBlocks();
73006
+
73116
73007
  // Now fill columns with build data
73117
73008
  JoinHTScanState join_ht_state;
73118
73009
  return FullScanHashTable(join_ht_state, key_type);
@@ -76735,6 +76626,7 @@ public:
76735
76626
 
76736
76627
  ArenaAllocator arena_allocator;
76737
76628
  vector<Key> keys;
76629
+ unique_ptr<ColumnFetchState> fetch_state;
76738
76630
 
76739
76631
  public:
76740
76632
  void Finalize(PhysicalOperator *op, ExecutionContext &context) override {
@@ -76814,9 +76706,9 @@ void PhysicalIndexJoin::Output(ExecutionContext &context, DataChunk &input, Data
76814
76706
  return;
76815
76707
  }
76816
76708
  state.rhs_chunk.Reset();
76817
- ColumnFetchState fetch_state;
76709
+ state.fetch_state = make_unique<ColumnFetchState>();
76818
76710
  Vector row_ids(LogicalType::ROW_TYPE, (data_ptr_t)&fetch_rows[0]);
76819
- tbl->Fetch(transaction, state.rhs_chunk, fetch_ids, row_ids, output_sel_idx, fetch_state);
76711
+ tbl->Fetch(transaction, state.rhs_chunk, fetch_ids, row_ids, output_sel_idx, *state.fetch_state);
76820
76712
  }
76821
76713
 
76822
76714
  //! Now we actually produce our result chunk
@@ -77824,6 +77716,7 @@ public:
77824
77716
  RowLayout lhs_layout;
77825
77717
  unique_ptr<LocalSortedTable> lhs_local_table;
77826
77718
  unique_ptr<GlobalSortState> lhs_global_state;
77719
+ unique_ptr<PayloadScanner> scanner;
77827
77720
 
77828
77721
  // Simple scans
77829
77722
  idx_t left_position;
@@ -77840,7 +77733,7 @@ public:
77840
77733
  DataChunk rhs_keys;
77841
77734
  DataChunk rhs_input;
77842
77735
  ExpressionExecutor rhs_executor;
77843
- BufferHandle payload_heap_handle;
77736
+ vector<BufferHandle> payload_heap_handles;
77844
77737
 
77845
77738
  public:
77846
77739
  void ResolveJoinKeys(DataChunk &input) {
@@ -77862,9 +77755,9 @@ public:
77862
77755
  // Scan the sorted payload
77863
77756
  D_ASSERT(lhs_global_state->sorted_blocks.size() == 1);
77864
77757
 
77865
- PayloadScanner scanner(*lhs_global_state->sorted_blocks[0]->payload_data, *lhs_global_state);
77758
+ scanner = make_unique<PayloadScanner>(*lhs_global_state->sorted_blocks[0]->payload_data, *lhs_global_state);
77866
77759
  lhs_payload.Reset();
77867
- scanner.Scan(lhs_payload);
77760
+ scanner->Scan(lhs_payload);
77868
77761
 
77869
77762
  // Recompute the sorted keys from the sorted input
77870
77763
  lhs_local_table->keys.Reset();
@@ -78138,6 +78031,8 @@ OperatorResultType PhysicalPiecewiseMergeJoin::ResolveComplexJoin(ExecutionConte
78138
78031
  auto &rsorted = *gstate.table->global_sort_state.sorted_blocks[0];
78139
78032
  const auto left_cols = input.ColumnCount();
78140
78033
  const auto tail_cols = conditions.size() - 1;
78034
+
78035
+ state.payload_heap_handles.clear();
78141
78036
  do {
78142
78037
  if (state.first_fetch) {
78143
78038
  state.ResolveJoinKeys(input);
@@ -78188,8 +78083,8 @@ OperatorResultType PhysicalPiecewiseMergeJoin::ResolveComplexJoin(ExecutionConte
78188
78083
  for (idx_t c = 0; c < state.lhs_payload.ColumnCount(); ++c) {
78189
78084
  chunk.data[c].Slice(state.lhs_payload.data[c], left_info.result, result_count);
78190
78085
  }
78191
- state.payload_heap_handle = SliceSortedPayload(chunk, right_info.state, right_info.block_idx,
78192
- right_info.result, result_count, left_cols);
78086
+ state.payload_heap_handles.push_back(SliceSortedPayload(chunk, right_info.state, right_info.block_idx,
78087
+ right_info.result, result_count, left_cols));
78193
78088
  chunk.SetCardinality(result_count);
78194
78089
 
78195
78090
  auto sel = FlatVector::IncrementalSelectionVector();
@@ -202218,6 +202113,16 @@ void BufferManager::AddToEvictionQueue(shared_ptr<BlockHandle> &handle) {
202218
202113
  queue->q.enqueue(BufferEvictionNode(weak_ptr<BlockHandle>(handle), handle->eviction_timestamp));
202219
202114
  }
202220
202115
 
202116
+ void BufferManager::VerifyZeroReaders(shared_ptr<BlockHandle> &handle) {
202117
+ #ifdef DUCKDB_DEBUG_DESTROY_BLOCKS
202118
+ auto replacement_buffer = make_unique<FileBuffer>(Allocator::Get(db), handle->buffer->type,
202119
+ handle->memory_usage - Storage::BLOCK_HEADER_SIZE);
202120
+ memcpy(replacement_buffer->buffer, handle->buffer->buffer, handle->buffer->size);
202121
+ memset(handle->buffer->buffer, 190, handle->buffer->size);
202122
+ handle->buffer = move(replacement_buffer);
202123
+ #endif
202124
+ }
202125
+
202221
202126
  void BufferManager::Unpin(shared_ptr<BlockHandle> &handle) {
202222
202127
  lock_guard<mutex> lock(handle->lock);
202223
202128
  if (!handle->buffer || handle->buffer->type == FileBufferType::TINY_BUFFER) {
@@ -202226,6 +202131,7 @@ void BufferManager::Unpin(shared_ptr<BlockHandle> &handle) {
202226
202131
  D_ASSERT(handle->readers > 0);
202227
202132
  handle->readers--;
202228
202133
  if (handle->readers == 0) {
202134
+ VerifyZeroReaders(handle);
202229
202135
  AddToEvictionQueue(handle);
202230
202136
  }
202231
202137
  }
@@ -217286,6 +217192,7 @@ void ColumnData::InitializeScanWithOffset(ColumnScanState &state, idx_t row_idx)
217286
217192
  }
217287
217193
 
217288
217194
  idx_t ColumnData::ScanVector(ColumnScanState &state, Vector &result, idx_t remaining) {
217195
+ state.previous_states.clear();
217289
217196
  if (state.version != version) {
217290
217197
  InitializeScanWithOffset(state, state.row_index);
217291
217198
  state.current->InitializeScan(state);
@@ -217320,6 +217227,7 @@ idx_t ColumnData::ScanVector(ColumnScanState &state, Vector &result, idx_t remai
217320
217227
  if (!state.current->next) {
217321
217228
  break;
217322
217229
  }
217230
+ state.previous_states.emplace_back(move(state.scan_state));
217323
217231
  state.current = (ColumnSegment *)state.current->Next();
217324
217232
  state.current->InitializeScan(state);
217325
217233
  state.segment_checked = false;