duckdb 0.7.2-dev1898.0 → 0.7.2-dev2144.0
This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
- package/binding.gyp +2 -0
- package/package.json +1 -1
- package/src/data_chunk.cpp +13 -1
- package/src/duckdb/extension/parquet/include/parquet_writer.hpp +1 -1
- package/src/duckdb/extension/parquet/parquet_metadata.cpp +4 -2
- package/src/duckdb/src/catalog/catalog_entry/duck_index_entry.cpp +1 -1
- package/src/duckdb/src/common/arrow/arrow_appender.cpp +69 -44
- package/src/duckdb/src/common/arrow/arrow_converter.cpp +1 -1
- package/src/duckdb/src/common/arrow/arrow_wrapper.cpp +20 -2
- package/src/duckdb/src/common/box_renderer.cpp +4 -2
- package/src/duckdb/src/common/constants.cpp +10 -1
- package/src/duckdb/src/common/filename_pattern.cpp +41 -0
- package/src/duckdb/src/common/hive_partitioning.cpp +144 -15
- package/src/duckdb/src/common/radix_partitioning.cpp +101 -369
- package/src/duckdb/src/common/row_operations/row_aggregate.cpp +8 -9
- package/src/duckdb/src/common/row_operations/row_external.cpp +1 -1
- package/src/duckdb/src/common/row_operations/row_gather.cpp +5 -3
- package/src/duckdb/src/common/row_operations/row_match.cpp +117 -22
- package/src/duckdb/src/common/row_operations/row_scatter.cpp +2 -2
- package/src/duckdb/src/common/sort/partition_state.cpp +1 -1
- package/src/duckdb/src/common/sort/sort_state.cpp +2 -1
- package/src/duckdb/src/common/sort/sorted_block.cpp +1 -1
- package/src/duckdb/src/common/types/{column_data_allocator.cpp → column/column_data_allocator.cpp} +2 -2
- package/src/duckdb/src/common/types/{column_data_collection.cpp → column/column_data_collection.cpp} +22 -4
- package/src/duckdb/src/common/types/{column_data_collection_segment.cpp → column/column_data_collection_segment.cpp} +2 -1
- package/src/duckdb/src/common/types/{column_data_consumer.cpp → column/column_data_consumer.cpp} +1 -1
- package/src/duckdb/src/common/types/{partitioned_column_data.cpp → column/partitioned_column_data.cpp} +11 -9
- package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +316 -0
- package/src/duckdb/src/common/types/{row_data_collection.cpp → row/row_data_collection.cpp} +1 -1
- package/src/duckdb/src/common/types/{row_data_collection_scanner.cpp → row/row_data_collection_scanner.cpp} +2 -2
- package/src/duckdb/src/common/types/{row_layout.cpp → row/row_layout.cpp} +1 -1
- package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +465 -0
- package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +511 -0
- package/src/duckdb/src/common/types/row/tuple_data_iterator.cpp +96 -0
- package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +119 -0
- package/src/duckdb/src/common/types/row/tuple_data_scatter_gather.cpp +1200 -0
- package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +170 -0
- package/src/duckdb/src/common/types/vector.cpp +1 -1
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +252 -290
- package/src/duckdb/src/execution/join_hashtable.cpp +192 -328
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +4 -4
- package/src/duckdb/src/execution/operator/helper/physical_execute.cpp +3 -3
- package/src/duckdb/src/execution/operator/helper/physical_limit_percent.cpp +2 -3
- package/src/duckdb/src/execution/operator/helper/physical_result_collector.cpp +2 -3
- package/src/duckdb/src/execution/operator/join/perfect_hash_join_executor.cpp +36 -21
- package/src/duckdb/src/execution/operator/join/physical_blockwise_nl_join.cpp +2 -2
- package/src/duckdb/src/execution/operator/join/physical_cross_product.cpp +1 -1
- package/src/duckdb/src/execution/operator/join/physical_delim_join.cpp +2 -2
- package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +166 -144
- package/src/duckdb/src/execution/operator/join/physical_index_join.cpp +5 -5
- package/src/duckdb/src/execution/operator/join/physical_join.cpp +2 -10
- package/src/duckdb/src/execution/operator/join/physical_positional_join.cpp +0 -1
- package/src/duckdb/src/execution/operator/order/physical_top_n.cpp +2 -2
- package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +13 -11
- package/src/duckdb/src/execution/operator/persistent/physical_delete.cpp +3 -2
- package/src/duckdb/src/execution/operator/persistent/physical_export.cpp +25 -24
- package/src/duckdb/src/execution/operator/persistent/physical_insert.cpp +1 -1
- package/src/duckdb/src/execution/operator/persistent/physical_update.cpp +4 -3
- package/src/duckdb/src/execution/operator/schema/physical_create_type.cpp +1 -1
- package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +3 -3
- package/src/duckdb/src/execution/partitionable_hashtable.cpp +9 -37
- package/src/duckdb/src/execution/physical_operator.cpp +1 -1
- package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +19 -18
- package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp +2 -1
- package/src/duckdb/src/execution/physical_plan/plan_execute.cpp +2 -2
- package/src/duckdb/src/execution/physical_plan/plan_explain.cpp +5 -6
- package/src/duckdb/src/execution/physical_plan/plan_expression_get.cpp +2 -2
- package/src/duckdb/src/execution/physical_plan/plan_recursive_cte.cpp +3 -3
- package/src/duckdb/src/execution/physical_plan_generator.cpp +1 -1
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +39 -17
- package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp +2 -2
- package/src/duckdb/src/function/table/pragma_detailed_profiling_output.cpp +5 -5
- package/src/duckdb/src/function/table/pragma_last_profiling_output.cpp +2 -2
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry/index_catalog_entry.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/constants.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/exception.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/fast_mem.hpp +528 -0
- package/src/duckdb/src/include/duckdb/common/filename_pattern.hpp +34 -0
- package/src/duckdb/src/include/duckdb/common/helper.hpp +10 -0
- package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +13 -3
- package/src/duckdb/src/include/duckdb/common/optional_ptr.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +34 -0
- package/src/duckdb/src/include/duckdb/common/radix_partitioning.hpp +80 -27
- package/src/duckdb/src/include/duckdb/common/reference_map.hpp +38 -0
- package/src/duckdb/src/include/duckdb/common/row_operations/row_operations.hpp +7 -6
- package/src/duckdb/src/include/duckdb/common/sort/comparators.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/sort/sort.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/sort/sorted_block.hpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/batched_data_collection.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/types/{column_data_allocator.hpp → column/column_data_allocator.hpp} +4 -4
- package/src/duckdb/src/include/duckdb/common/types/{column_data_collection.hpp → column/column_data_collection.hpp} +2 -2
- package/src/duckdb/src/include/duckdb/common/types/{column_data_collection_iterators.hpp → column/column_data_collection_iterators.hpp} +2 -2
- package/src/duckdb/src/include/duckdb/common/types/{column_data_collection_segment.hpp → column/column_data_collection_segment.hpp} +3 -3
- package/src/duckdb/src/include/duckdb/common/types/{column_data_consumer.hpp → column/column_data_consumer.hpp} +8 -4
- package/src/duckdb/src/include/duckdb/common/types/{column_data_scan_states.hpp → column/column_data_scan_states.hpp} +1 -1
- package/src/duckdb/src/include/duckdb/common/types/{partitioned_column_data.hpp → column/partitioned_column_data.hpp} +15 -7
- package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +140 -0
- package/src/duckdb/src/include/duckdb/common/types/{row_data_collection.hpp → row/row_data_collection.hpp} +1 -1
- package/src/duckdb/src/include/duckdb/common/types/{row_data_collection_scanner.hpp → row/row_data_collection_scanner.hpp} +2 -2
- package/src/duckdb/src/include/duckdb/common/types/{row_layout.hpp → row/row_layout.hpp} +3 -1
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +116 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +239 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_iterator.hpp +64 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +113 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +124 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +74 -0
- package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/types/value.hpp +4 -12
- package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +34 -31
- package/src/duckdb/src/include/duckdb/execution/base_aggregate_hashtable.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/execution_context.hpp +3 -2
- package/src/duckdb/src/include/duckdb/execution/expression_executor.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +41 -67
- package/src/duckdb/src/include/duckdb/execution/nested_loop_join.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_execute.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_result_collector.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/join/outer_join_marker.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/join/perfect_hash_join_executor.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/join/physical_cross_product.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/join/physical_hash_join.hpp +0 -2
- package/src/duckdb/src/include/duckdb/execution/operator/join/physical_index_join.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/join/physical_positional_join.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp +4 -1
- package/src/duckdb/src/include/duckdb/execution/operator/scan/physical_column_data_scan.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/set/physical_recursive_cte.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/materialized_query_result.hpp +1 -1
- package/src/duckdb/src/include/duckdb/main/query_result.hpp +14 -1
- package/src/duckdb/src/include/duckdb/optimizer/expression_rewriter.hpp +3 -3
- package/src/duckdb/src/include/duckdb/optimizer/join_order/cardinality_estimator.hpp +16 -16
- package/src/duckdb/src/include/duckdb/optimizer/join_order/join_node.hpp +8 -8
- package/src/duckdb/src/include/duckdb/optimizer/join_order/join_order_optimizer.hpp +23 -15
- package/src/duckdb/src/include/duckdb/optimizer/join_order/join_relation.hpp +9 -10
- package/src/duckdb/src/include/duckdb/optimizer/join_order/query_graph.hpp +18 -11
- package/src/duckdb/src/include/duckdb/parallel/meta_pipeline.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/parsed_data/exported_table_data.hpp +5 -1
- package/src/duckdb/src/include/duckdb/parser/parsed_data/vacuum_info.hpp +3 -2
- package/src/duckdb/src/include/duckdb/parser/query_error_context.hpp +4 -2
- package/src/duckdb/src/include/duckdb/parser/transformer.hpp +9 -35
- package/src/duckdb/src/include/duckdb/planner/binder.hpp +24 -23
- package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +3 -3
- package/src/duckdb/src/include/duckdb/planner/operator/logical_column_data_get.hpp +1 -1
- package/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp +3 -1
- package/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp +1 -1
- package/src/duckdb/src/main/appender.cpp +6 -6
- package/src/duckdb/src/main/client_context.cpp +1 -1
- package/src/duckdb/src/main/connection.cpp +2 -2
- package/src/duckdb/src/main/query_result.cpp +13 -0
- package/src/duckdb/src/optimizer/expression_rewriter.cpp +4 -4
- package/src/duckdb/src/optimizer/join_order/cardinality_estimator.cpp +91 -105
- package/src/duckdb/src/optimizer/join_order/join_node.cpp +5 -8
- package/src/duckdb/src/optimizer/join_order/join_order_optimizer.cpp +163 -160
- package/src/duckdb/src/optimizer/join_order/join_relation_set.cpp +30 -30
- package/src/duckdb/src/optimizer/join_order/query_graph.cpp +37 -38
- package/src/duckdb/src/parallel/executor.cpp +1 -1
- package/src/duckdb/src/parallel/meta_pipeline.cpp +2 -2
- package/src/duckdb/src/parser/transform/helpers/transform_cte.cpp +1 -1
- package/src/duckdb/src/parser/transform/tableref/transform_subquery.cpp +1 -1
- package/src/duckdb/src/parser/transformer.cpp +50 -9
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +15 -5
- package/src/duckdb/src/planner/binder/statement/bind_create.cpp +19 -17
- package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +4 -4
- package/src/duckdb/src/planner/binder/statement/bind_export.cpp +20 -21
- package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +24 -22
- package/src/duckdb/src/planner/binder/tableref/bind_subqueryref.cpp +2 -2
- package/src/duckdb/src/planner/binder.cpp +16 -19
- package/src/duckdb/src/planner/expression_binder.cpp +8 -8
- package/src/duckdb/src/planner/operator/logical_copy_to_file.cpp +3 -3
- package/src/duckdb/src/storage/checkpoint_manager.cpp +23 -23
- package/src/duckdb/src/storage/standard_buffer_manager.cpp +1 -1
- package/src/duckdb/src/storage/table_index_list.cpp +3 -3
- package/src/duckdb/src/verification/statement_verifier.cpp +1 -1
- package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +5552 -5598
- package/src/duckdb/ub_src_common.cpp +2 -0
- package/src/duckdb/ub_src_common_types.cpp +0 -16
- package/src/duckdb/ub_src_common_types_column.cpp +10 -0
- package/src/duckdb/ub_src_common_types_row.cpp +20 -0
- package/test/udf.test.ts +9 -0
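Most of the churn in this version bump comes from DuckDB's new row-layout infrastructure (the new `common/types/row/tuple_data_*` files and the moves into `common/types/column/` and `common/types/row/`) and from rewriting the join and aggregate hash tables on top of it. In the `join_hashtable.cpp` diff below, the build side is now sunk into a `RadixPartitionedTupleData` that starts at 4 radix bits and is repartitioned when the largest partition would not fit in memory. The following standalone sketch mirrors only the repartitioning arithmetic from the `RequiresPartitioning` hunk; the budget, row counts, and the `PointerTableBytes` helper are illustrative stand-ins, not DuckDB's actual `PointerTableSize` / `max_ht_size`.

```cpp
// Illustrative sketch (not DuckDB code): the repartitioning decision from the
// RequiresPartitioning hunk below, with made-up constants and a hypothetical
// PointerTableBytes stand-in for JoinHashTable::PointerTableSize.
#include <cstdint>
#include <cstdio>

// Assumption: one pointer slot per build-side row (the real pointer table is
// sized up to a power of two; this is simplified).
static uint64_t PointerTableBytes(uint64_t count) {
    return count * sizeof(void *);
}

int main() {
    const uint64_t max_ht_size = 256ULL * 1024 * 1024;          // assumed memory budget
    uint64_t radix_bits = 4;                                     // initial radix_bits(4) from the constructor
    const uint64_t max_partition_count = 3000000;                // rows in the largest partition (made up)
    const uint64_t max_partition_bytes = 512ULL * 1024 * 1024;   // data size of that partition (made up)

    // Add radix bits until the estimated largest partition, including its
    // pointer table, fits in a quarter of the budget (capped below 8 bits).
    uint64_t added_bits = 1;
    for (; added_bits < 8 - radix_bits; added_bits++) {
        double multiplier = double(uint64_t(1) << added_bits);   // partitions created per added bit
        double est_count = double(max_partition_count) / multiplier;
        double est_size = double(max_partition_bytes) / multiplier;
        if (est_size + double(PointerTableBytes(uint64_t(est_count))) <= double(max_ht_size) / 4) {
            break;
        }
    }
    radix_bits += added_bits;
    std::printf("repartition to %llu radix bits (%llu partitions)\n",
                (unsigned long long)radix_bits, (unsigned long long)(uint64_t(1) << radix_bits));
    return 0;
}
```

With the made-up numbers above the loop never finds a small enough estimate, so it caps out and repartitions to 8 radix bits (256 partitions), which matches the `max_added_bits = 8 - radix_bits` bound visible in the diff.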
package/src/duckdb/src/execution/join_hashtable.cpp

@@ -2,9 +2,7 @@
 
 #include "duckdb/common/exception.hpp"
 #include "duckdb/common/row_operations/row_operations.hpp"
-#include "duckdb/common/types/column_data_collection_segment.hpp"
-#include "duckdb/common/types/row_data_collection.hpp"
-#include "duckdb/common/types/row_data_collection_scanner.hpp"
+#include "duckdb/common/types/column/column_data_collection_segment.hpp"
 #include "duckdb/common/vector_operations/vector_operations.hpp"
 #include "duckdb/main/client_context.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
@@ -16,11 +14,11 @@ using ScanStructure = JoinHashTable::ScanStructure;
 using ProbeSpill = JoinHashTable::ProbeSpill;
 using ProbeSpillLocalState = JoinHashTable::ProbeSpillLocalAppendState;
 
-JoinHashTable::JoinHashTable(BufferManager &
-                             vector<LogicalType> btypes, JoinType
-    : buffer_manager(
-      tuple_size(0), vfound(Value::BOOLEAN(false)), join_type(
-
+JoinHashTable::JoinHashTable(BufferManager &buffer_manager_p, const vector<JoinCondition> &conditions_p,
+                             vector<LogicalType> btypes, JoinType type_p)
+    : buffer_manager(buffer_manager_p), conditions(conditions_p), build_types(std::move(btypes)), entry_size(0),
+      tuple_size(0), vfound(Value::BOOLEAN(false)), join_type(type_p), finalized(false), has_null(false),
+      external(false), radix_bits(4), partition_start(0), partition_end(0) {
     for (auto &condition : conditions) {
         D_ASSERT(condition.left->return_type == condition.right->return_type);
         auto type = condition.left->return_type;
@@ -59,23 +57,18 @@ JoinHashTable::JoinHashTable(BufferManager &buffer_manager, const vector<JoinCon
     pointer_offset = offsets.back();
     entry_size = layout.GetRowWidth();
 
-
-
-
-    string_heap = make_uniq<RowDataCollection>(buffer_manager, (idx_t)Storage::BLOCK_SIZE, 1, true);
-    swizzled_block_collection = block_collection->CloneEmpty();
-    swizzled_string_heap = string_heap->CloneEmpty();
+    data_collection = make_uniq<TupleDataCollection>(buffer_manager, layout);
+    sink_collection =
+        make_uniq<RadixPartitionedTupleData>(buffer_manager, layout, radix_bits, layout.ColumnCount() - 1);
 }
 
 JoinHashTable::~JoinHashTable() {
 }
 
 void JoinHashTable::Merge(JoinHashTable &other) {
-
-
-
-    string_heap->Merge(*other.string_heap);
-    swizzled_string_heap->Merge(*other.swizzled_string_heap);
+    {
+        lock_guard<mutex> guard(data_lock);
+        data_collection->Combine(*other.data_collection);
     }
 
     if (join_type == JoinType::MARK) {
@@ -88,28 +81,7 @@ void JoinHashTable::Merge(JoinHashTable &other) {
         }
     }
 
-
-    if (partition_block_collections.empty()) {
-        D_ASSERT(partition_string_heaps.empty());
-        // Move partitions to this HT
-        for (idx_t p = 0; p < other.partition_block_collections.size(); p++) {
-            partition_block_collections.push_back(std::move(other.partition_block_collections[p]));
-            if (!layout.AllConstant()) {
-                partition_string_heaps.push_back(std::move(other.partition_string_heaps[p]));
-            }
-        }
-        return;
-    }
-
-    // Should have same number of partitions
-    D_ASSERT(partition_block_collections.size() == other.partition_block_collections.size());
-    D_ASSERT(partition_string_heaps.size() == other.partition_string_heaps.size());
-    for (idx_t idx = 0; idx < other.partition_block_collections.size(); idx++) {
-        partition_block_collections[idx]->Merge(*other.partition_block_collections[idx]);
-        if (!layout.AllConstant()) {
-            partition_string_heaps[idx]->Merge(*other.partition_string_heaps[idx]);
-        }
-    }
+    sink_collection->Combine(*other.sink_collection);
 }
 
 void JoinHashTable::ApplyBitmask(Vector &hashes, idx_t count) {
@@ -194,7 +166,7 @@ idx_t JoinHashTable::PrepareKeys(DataChunk &keys, unique_ptr<UnifiedVectorFormat
     return added_count;
 }
 
-void JoinHashTable::Build(DataChunk &keys, DataChunk &payload) {
+void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChunk &keys, DataChunk &payload) {
     D_ASSERT(!finalized);
     D_ASSERT(keys.size() == payload.size());
     if (keys.size() == 0) {
@@ -236,61 +208,42 @@ void JoinHashTable::Build(DataChunk &keys, DataChunk &payload) {
         return;
     }
 
-    // build out the buffer space
-    Vector addresses(LogicalType::POINTER);
-    auto key_locations = FlatVector::GetData<data_ptr_t>(addresses);
-    auto handles = block_collection->Build(added_count, key_locations, nullptr, current_sel);
-
     // hash the keys and obtain an entry in the list
     // note that we only hash the keys used in the equality comparison
     Vector hash_values(LogicalType::HASH);
     Hash(keys, *current_sel, added_count, hash_values);
 
-    // build a chunk
+    // build a chunk to append to the data collection [keys, payload, (optional "found" boolean), hash]
     DataChunk source_chunk;
     source_chunk.InitializeEmpty(layout.GetTypes());
-
-    vector<UnifiedVectorFormat> source_data;
-    source_data.reserve(layout.ColumnCount());
-
-    // serialize the keys to the key locations
     for (idx_t i = 0; i < keys.ColumnCount(); i++) {
         source_chunk.data[i].Reference(keys.data[i]);
-        source_data.emplace_back(std::move(key_data[i]));
     }
-
+    idx_t col_offset = keys.ColumnCount();
     D_ASSERT(build_types.size() == payload.ColumnCount());
     for (idx_t i = 0; i < payload.ColumnCount(); i++) {
-        source_chunk.data[
-        UnifiedVectorFormat pdata;
-        payload.data[i].ToUnifiedFormat(payload.size(), pdata);
-        source_data.emplace_back(std::move(pdata));
+        source_chunk.data[col_offset + i].Reference(payload.data[i]);
     }
+    col_offset += payload.ColumnCount();
     if (IsRightOuterJoin(join_type)) {
         // for FULL/RIGHT OUTER joins initialize the "found" boolean to false
-        source_chunk.data[
-
-        vfound.ToUnifiedFormat(keys.size(), fdata);
-        source_data.emplace_back(std::move(fdata));
+        source_chunk.data[col_offset].Reference(vfound);
+        col_offset++;
     }
-
-    // serialise the hashes at the end
-    source_chunk.data[source_data.size()].Reference(hash_values);
-    UnifiedVectorFormat hdata;
-    hash_values.ToUnifiedFormat(keys.size(), hdata);
-    source_data.emplace_back(std::move(hdata));
-
+    source_chunk.data[col_offset].Reference(hash_values);
    source_chunk.SetCardinality(keys);
 
-
-
+    if (added_count < keys.size()) {
+        source_chunk.Slice(*current_sel, added_count);
+    }
+    sink_collection->Append(append_state, source_chunk);
 }
 
 template <bool PARALLEL>
 static inline void InsertHashesLoop(atomic<data_ptr_t> pointers[], const hash_t indices[], const idx_t count,
                                     const data_ptr_t key_locations[], const idx_t pointer_offset) {
     for (idx_t i = 0; i < count; i++) {
-        auto index = indices[i];
+        const auto index = indices[i];
         if (PARALLEL) {
             data_ptr_t head;
             do {
@@ -327,74 +280,48 @@ void JoinHashTable::InsertHashes(Vector &hashes, idx_t count, data_ptr_t key_loc
 }
 
 void JoinHashTable::InitializePointerTable() {
-    idx_t
-
-
-
-
-
-
-
+    idx_t capacity = PointerTableCapacity(Count());
+    D_ASSERT(IsPowerOfTwo(capacity));
+
+    if (hash_map.get()) {
+        // There is already a hash map
+        auto current_capacity = hash_map.GetSize() / sizeof(data_ptr_t);
+        if (capacity > current_capacity) {
+            // Need more space
+            hash_map = buffer_manager.GetBufferAllocator().Allocate(capacity * sizeof(data_ptr_t));
+        } else {
+            // Just use the current hash map
+            capacity = current_capacity;
+        }
+    } else {
+        // Allocate a hash map
        hash_map = buffer_manager.GetBufferAllocator().Allocate(capacity * sizeof(data_ptr_t));
     }
     D_ASSERT(hash_map.GetSize() == capacity * sizeof(data_ptr_t));
 
     // initialize HT with all-zero entries
-
+    std::fill_n((data_ptr_t *)hash_map.get(), capacity, nullptr);
+
+    bitmask = capacity - 1;
 }
 
-void JoinHashTable::Finalize(idx_t
+void JoinHashTable::Finalize(idx_t chunk_idx_from, idx_t chunk_idx_to, bool parallel) {
     // Pointer table should be allocated
     D_ASSERT(hash_map.get());
 
-    const auto unswizzle = external && !layout.AllConstant();
-    vector<BufferHandle> local_pinned_handles;
-
     Vector hashes(LogicalType::HASH);
     auto hash_data = FlatVector::GetData<hash_t>(hashes);
-    data_ptr_t key_locations[STANDARD_VECTOR_SIZE];
-    // now construct the actual hash table; scan the nodes
-    // as we scan the nodes we pin all the blocks of the HT and keep them pinned until the HT is destroyed
-    // this is so that we can keep pointers around to the blocks
-    for (idx_t block_idx = block_idx_start; block_idx < block_idx_end; block_idx++) {
-        auto &block = block_collection->blocks[block_idx];
-        auto handle = buffer_manager.Pin(block->block);
-        data_ptr_t dataptr = handle.Ptr();
-
-        data_ptr_t heap_ptr = nullptr;
-        if (unswizzle) {
-            auto &heap_block = string_heap->blocks[block_idx];
-            auto heap_handle = buffer_manager.Pin(heap_block->block);
-            heap_ptr = heap_handle.Ptr();
-            local_pinned_handles.push_back(std::move(heap_handle));
-        }
-
-        idx_t entry = 0;
-        while (entry < block->count) {
-            idx_t next = MinValue<idx_t>(STANDARD_VECTOR_SIZE, block->count - entry);
-
-            if (unswizzle) {
-                RowOperations::UnswizzlePointers(layout, dataptr, heap_ptr, next);
-            }
-
-            // fetch the next vector of entries from the blocks
-            for (idx_t i = 0; i < next; i++) {
-                hash_data[i] = Load<hash_t>((data_ptr_t)(dataptr + pointer_offset));
-                key_locations[i] = dataptr;
-                dataptr += entry_size;
-            }
-            // now insert into the hash table
-            InsertHashes(hashes, next, key_locations, parallel);
 
-
+    TupleDataChunkIterator iterator(*data_collection, TupleDataPinProperties::KEEP_EVERYTHING_PINNED, chunk_idx_from,
+                                    chunk_idx_to, false);
+    const auto row_locations = iterator.GetRowLocations();
+    do {
+        const auto count = iterator.GetCurrentChunkCount();
+        for (idx_t i = 0; i < count; i++) {
+            hash_data[i] = Load<hash_t>(row_locations[i] + pointer_offset);
        }
-
-    }
-
-    lock_guard<mutex> lock(pinned_handles_lock);
-    for (auto &local_pinned_handle : local_pinned_handles) {
-        pinned_handles.push_back(std::move(local_pinned_handle));
-    }
+        InsertHashes(hashes, count, row_locations, parallel);
+    } while (iterator.Next());
 }
 
 unique_ptr<ScanStructure> JoinHashTable::InitializeScanStructure(DataChunk &keys, const SelectionVector *&current_sel) {
@@ -540,7 +467,7 @@ void ScanStructure::AdvancePointers() {
 
 void ScanStructure::GatherResult(Vector &result, const SelectionVector &result_vector,
                                  const SelectionVector &sel_vector, const idx_t count, const idx_t col_no) {
-
+    ht.data_collection->Gather(pointers, sel_vector, count, col_no, result, result_vector);
 }
 
 void ScanStructure::GatherResult(Vector &result, const SelectionVector &sel_vector, const idx_t count,
@@ -838,22 +765,25 @@ void ScanStructure::NextSingleJoin(DataChunk &keys, DataChunk &input, DataChunk
     finished = true;
 }
 
-
+void JoinHashTable::ScanFullOuter(JoinHTScanState &state, Vector &addresses, DataChunk &result) {
     // scan the HT starting from the current position and check which rows from the build side did not find a match
     auto key_locations = FlatVector::GetData<data_ptr_t>(addresses);
     idx_t found_entries = 0;
-
-
-
-
-
-
-
+
+    auto &iterator = state.iterator;
+    if (iterator.Done()) {
+        return;
+    }
+
+    const auto row_locations = iterator.GetRowLocations();
+    do {
+        const auto count = iterator.GetCurrentChunkCount();
+        for (idx_t i = state.offset_in_chunk; i < count; i++) {
+            auto found_match = Load<bool>(row_locations[i] + tuple_size);
            if (!found_match) {
-                key_locations[found_entries++] =
+                key_locations[found_entries++] = row_locations[i];
                if (found_entries == STANDARD_VECTOR_SIZE) {
-                    state.
-                    state.scan_index++;
+                    state.offset_in_chunk = i + 1;
                    break;
                }
            }
@@ -861,11 +791,10 @@ idx_t JoinHashTable::ScanFullOuter(JoinHTScanState &state, Vector &addresses) {
        if (found_entries == STANDARD_VECTOR_SIZE) {
            break;
        }
-
-
-    }
+        state.offset_in_chunk = 0;
+    } while (iterator.Next());
 
-
+    // now gather from the found rows
     if (found_entries == 0) {
         return;
     }
@@ -878,243 +807,170 @@ void JoinHashTable::GatherFullOuter(DataChunk &result, Vector &addresses, idx_t
        vec.SetVectorType(VectorType::CONSTANT_VECTOR);
        ConstantVector::SetNull(vec, true);
    }
+
    // gather the values from the RHS
    for (idx_t i = 0; i < build_types.size(); i++) {
        auto &vector = result.data[left_column_count + i];
        D_ASSERT(vector.GetType() == build_types[i]);
        const auto col_no = condition_types.size() + i;
-
+        data_collection->Gather(addresses, sel_vector, found_entries, col_no, vector, sel_vector);
    }
 }
 
-idx_t JoinHashTable::FillWithHTOffsets(
-    // iterate over
+idx_t JoinHashTable::FillWithHTOffsets(JoinHTScanState &state, Vector &addresses) {
+    // iterate over HT
+    auto key_locations = FlatVector::GetData<data_ptr_t>(addresses);
    idx_t key_count = 0;
-
-
-
-
-
-
-
-        // store its locations
-        key_locations[key_count++] = tuple_base;
-        state.position++;
+
+    auto &iterator = state.iterator;
+    const auto row_locations = iterator.GetRowLocations();
+    do {
+        const auto count = iterator.GetCurrentChunkCount();
+        for (idx_t i = 0; i < count; i++) {
+            key_locations[key_count + i] = row_locations[i];
        }
-
-
-
+        key_count += count;
+    } while (iterator.Next());
+
    return key_count;
 }
 
-
-
-
+bool JoinHashTable::RequiresExternalJoin(ClientConfig &config, vector<unique_ptr<JoinHashTable>> &local_hts) {
+    total_count = 0;
+    idx_t data_size = 0;
+    for (auto &ht : local_hts) {
+        auto &local_sink_collection = ht->GetSinkCollection();
+        total_count += local_sink_collection.Count();
+        data_size += local_sink_collection.SizeInBytes();
    }
-    }
 
-
-
-    return;
+    if (total_count == 0) {
+        return false;
    }
 
-    if (
-    //
-
-
+    if (config.force_external) {
+        // Do ~3 rounds if forcing external join to test all code paths
+        auto data_size_per_round = (data_size + 2) / 3;
+        auto count_per_round = (total_count + 2) / 3;
+        max_ht_size = data_size_per_round + PointerTableSize(count_per_round);
+        external = true;
+    } else {
+        auto ht_size = data_size + PointerTableSize(total_count);
+        external = ht_size > max_ht_size;
    }
+    return external;
+}
 
-
-    auto &
-
-    idx_t heap_block_remaining = heap_blocks[heap_block_idx]->count;
-    for (auto &data_block : block_collection->blocks) {
-        if (heap_block_remaining == 0) {
-            heap_block_remaining = heap_blocks[++heap_block_idx]->count;
-        }
-
-        // Pin the data block and swizzle the pointers within the rows
-        auto data_handle = buffer_manager.Pin(data_block->block);
-        auto data_ptr = data_handle.Ptr();
-        RowOperations::SwizzleColumns(layout, data_ptr, data_block->count);
-
-        // We want to copy as little of the heap data as possible, check how the data and heap blocks line up
-        if (heap_block_remaining >= data_block->count) {
-            // Easy: current heap block contains all strings for this data block, just copy (reference) the block
-            swizzled_string_heap->blocks.emplace_back(heap_blocks[heap_block_idx]->Copy());
-            swizzled_string_heap->blocks.back()->count = data_block->count;
-
-            // Swizzle the heap pointer
-            auto heap_handle = buffer_manager.Pin(swizzled_string_heap->blocks.back()->block);
-            auto heap_ptr = Load<data_ptr_t>(data_ptr + layout.GetHeapOffset());
-            auto heap_offset = heap_ptr - heap_handle.Ptr();
-            RowOperations::SwizzleHeapPointer(layout, data_ptr, heap_ptr, data_block->count, heap_offset);
-
-            // Update counter
-            heap_block_remaining -= data_block->count;
-        } else {
-            // Strings for this data block are spread over the current heap block and the next (and possibly more)
-            idx_t data_block_remaining = data_block->count;
-            vector<std::pair<data_ptr_t, idx_t>> ptrs_and_sizes;
-            idx_t total_size = 0;
-            while (data_block_remaining > 0) {
-                if (heap_block_remaining == 0) {
-                    heap_block_remaining = heap_blocks[++heap_block_idx]->count;
-                }
-                auto next = MinValue<idx_t>(data_block_remaining, heap_block_remaining);
-
-                // Figure out where to start copying strings, and how many bytes we need to copy
-                auto heap_start_ptr = Load<data_ptr_t>(data_ptr + layout.GetHeapOffset());
-                auto heap_end_ptr =
-                    Load<data_ptr_t>(data_ptr + layout.GetHeapOffset() + (next - 1) * layout.GetRowWidth());
-                idx_t size = heap_end_ptr - heap_start_ptr + Load<uint32_t>(heap_end_ptr);
-                ptrs_and_sizes.emplace_back(heap_start_ptr, size);
-                D_ASSERT(size <= heap_blocks[heap_block_idx]->byte_offset);
-
-                // Swizzle the heap pointer
-                RowOperations::SwizzleHeapPointer(layout, data_ptr, heap_start_ptr, next, total_size);
-                total_size += size;
-
-                // Update where we are in the data and heap blocks
-                data_ptr += next * layout.GetRowWidth();
-                data_block_remaining -= next;
-                heap_block_remaining -= next;
-            }
-
-            // Finally, we allocate a new heap block and copy data to it
-            swizzled_string_heap->blocks.emplace_back(
-                make_uniq<RowDataBlock>(buffer_manager, MaxValue<idx_t>(total_size, (idx_t)Storage::BLOCK_SIZE), 1));
-            auto new_heap_handle = buffer_manager.Pin(swizzled_string_heap->blocks.back()->block);
-            auto new_heap_ptr = new_heap_handle.Ptr();
-            for (auto &ptr_and_size : ptrs_and_sizes) {
-                memcpy(new_heap_ptr, ptr_and_size.first, ptr_and_size.second);
-                new_heap_ptr += ptr_and_size.second;
-            }
-        }
+void JoinHashTable::Unpartition() {
+    for (auto &partition : sink_collection->GetPartitions()) {
+        data_collection->Combine(*partition);
    }
-
-    // We're done with variable-sized data, now just merge the fixed-size data
-    swizzled_block_collection->Merge(*block_collection);
-    D_ASSERT(swizzled_block_collection->blocks.size() == swizzled_string_heap->blocks.size());
-
-    // Update counts and cleanup
-    swizzled_string_heap->count = string_heap->count;
-    string_heap->Clear();
 }
 
-
-
-external
+bool JoinHashTable::RequiresPartitioning(ClientConfig &config, vector<unique_ptr<JoinHashTable>> &local_hts) {
+    D_ASSERT(total_count != 0);
+    D_ASSERT(external);
 
-
-
-    idx_t
+    idx_t num_partitions = RadixPartitioning::NumberOfPartitions(radix_bits);
+    vector<idx_t> partition_counts(num_partitions, 0);
+    vector<idx_t> partition_sizes(num_partitions, 0);
    for (auto &ht : local_hts) {
-
-
-
-
+        const auto &local_partitions = ht->GetSinkCollection().GetPartitions();
+        for (idx_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) {
+            auto &local_partition = local_partitions[partition_idx];
+            partition_counts[partition_idx] += local_partition->Count();
+            partition_sizes[partition_idx] += local_partition->SizeInBytes();
+        }
    }
 
-    if
-
+    // Figure out if we can fit all single partitions in memory
+    idx_t max_partition_idx = 0;
+    idx_t max_partition_size = 0;
+    for (idx_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) {
+        const auto &partition_count = partition_counts[partition_idx];
+        const auto &partition_size = partition_sizes[partition_idx];
+        auto partition_ht_size = partition_size + PointerTableSize(partition_count);
+        if (partition_ht_size > max_partition_size) {
+            max_partition_size = partition_ht_size;
+            max_partition_idx = partition_idx;
+        }
    }
 
-
-
-
+    if (config.force_external || max_partition_size > max_ht_size) {
+        const auto partition_count = partition_counts[max_partition_idx];
+        const auto partition_size = partition_sizes[max_partition_idx];
 
-
-
-
-
+        const auto max_added_bits = 8 - radix_bits;
+        idx_t added_bits;
+        for (added_bits = 1; added_bits < max_added_bits; added_bits++) {
+            double partition_multiplier = RadixPartitioning::NumberOfPartitions(added_bits);
 
-
-
-
-    auto avg_partition_size = total_size / num_partitions;
+            auto new_estimated_count = double(partition_count) / partition_multiplier;
+            auto new_estimated_size = double(partition_size) / partition_multiplier;
+            auto new_estimated_ht_size = new_estimated_size + PointerTableSize(new_estimated_count);
 
-
-
-
+            if (new_estimated_ht_size <= double(max_ht_size) / 4) {
+                // Aim for an estimated partition size of max_ht_size / 4
+                break;
+            }
        }
+        radix_bits += added_bits;
+        sink_collection =
+            make_uniq<RadixPartitionedTupleData>(buffer_manager, layout, radix_bits, layout.ColumnCount() - 1);
+        return true;
+    } else {
+        return false;
    }
 }
 
 void JoinHashTable::Partition(JoinHashTable &global_ht) {
-
-
-
-
-    }
-#endif
-
-    // Swizzle and Partition
-    SwizzleBlocks();
-    RadixPartitioning::PartitionRowData(global_ht.buffer_manager, global_ht.layout, global_ht.pointer_offset,
-                                        *swizzled_block_collection, *swizzled_string_heap, partition_block_collections,
-                                        partition_string_heaps, global_ht.radix_bits);
-
-    // Add to global HT
+    auto new_sink_collection =
+        make_uniq<RadixPartitionedTupleData>(buffer_manager, layout, global_ht.radix_bits, layout.ColumnCount() - 1);
+    sink_collection->Repartition(*new_sink_collection);
+    sink_collection = std::move(new_sink_collection);
    global_ht.Merge(*this);
 }
 
 void JoinHashTable::Reset() {
-
-    block_collection->Clear();
-    string_heap->Clear();
+    data_collection->Reset();
    finalized = false;
 }
 
 bool JoinHashTable::PrepareExternalFinalize() {
-    idx_t num_partitions = RadixPartitioning::NumberOfPartitions(radix_bits);
-    if (partition_block_collections.empty() || partition_end == num_partitions) {
-        return false;
-    }
-
    if (finalized) {
        Reset();
    }
 
+    const auto num_partitions = RadixPartitioning::NumberOfPartitions(radix_bits);
+    if (partition_end == num_partitions) {
+        return false;
+    }
+
+    // Start where we left off
+    auto &partitions = sink_collection->GetPartitions();
+    partition_start = partition_end;
+
    // Determine how many partitions we can do next (at least one)
-    idx_t next = 0;
    idx_t count = 0;
-
-
-
-
-
-
+    idx_t data_size = 0;
+    idx_t partition_idx;
+    for (partition_idx = partition_start; partition_idx < num_partitions; partition_idx++) {
+        auto incl_count = count + partitions[partition_idx]->Count();
+        auto incl_data_size = data_size + partitions[partition_idx]->SizeInBytes();
+        auto incl_ht_size = incl_data_size + PointerTableSize(incl_count);
+        if (count > 0 && incl_ht_size > max_ht_size) {
            break;
        }
-
-
-    }
-    partition_end += next;
-
-    // Move specific partitions to the swizzled_... collections so they can be unswizzled
-    D_ASSERT(SwizzledCount() == 0);
-    for (idx_t p = partition_start; p < partition_end; p++) {
-        auto &p_block_collection = *partition_block_collections[p];
-        if (!layout.AllConstant()) {
-            auto &p_string_heap = *partition_string_heaps[p];
-            D_ASSERT(p_block_collection.count == p_string_heap.count);
-            swizzled_string_heap->Merge(p_string_heap);
-            // Remove after merging
-            partition_string_heaps[p] = nullptr;
-        }
-        swizzled_block_collection->Merge(p_block_collection);
-        // Remove after merging
-        partition_block_collections[p] = nullptr;
+        count = incl_count;
+        data_size = incl_data_size;
    }
-
+    partition_end = partition_idx;
 
-    //
-
-
-
-
-    D_ASSERT(count == Count());
+    // Move the partitions to the main data collection
+    for (partition_idx = partition_start; partition_idx < partition_end; partition_idx++) {
+        data_collection->Combine(*partitions[partition_idx]);
+    }
+    D_ASSERT(Count() == count);
 
    return true;
 }
@@ -1178,7 +1034,10 @@ unique_ptr<ScanStructure> JoinHashTable::ProbeAndSpill(DataChunk &keys, DataChun
 
 ProbeSpill::ProbeSpill(JoinHashTable &ht, ClientContext &context, const vector<LogicalType> &probe_types)
     : ht(ht), context(context), probe_types(probe_types) {
-
+    auto remaining_count = ht.GetSinkCollection().Count();
+    auto remaining_data_size = ht.GetSinkCollection().SizeInBytes();
+    auto remaining_ht_size = remaining_data_size + ht.PointerTableSize(remaining_count);
+    if (remaining_ht_size <= ht.max_ht_size) {
        // No need to partition as we will only have one more probe round
        partitioned = false;
    } else {
@@ -1260,7 +1119,12 @@ void ProbeSpill::PrepareNextProbe() {
        // Move specific partitions to the global spill collection
        global_spill_collection = std::move(partitions[ht.partition_start]);
        for (idx_t i = ht.partition_start + 1; i < ht.partition_end; i++) {
-
+            auto &partition = partitions[i];
+            if (global_spill_collection->Count() == 0) {
+                global_spill_collection = std::move(partition);
+            } else {
+                global_spill_collection->Combine(*partition);
+            }
        }
    }
 }