duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +2 -0
- package/package.json +1 -1
- package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
- package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
- package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
- package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
- package/src/duckdb/extension/json/json_scan.cpp +1 -1
- package/src/duckdb/extension/json/json_serializer.cpp +26 -69
- package/src/duckdb/src/common/enum_util.cpp +119 -7
- package/src/duckdb/src/common/extra_type_info.cpp +7 -3
- package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
- package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
- package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
- package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
- package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
- package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
- package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
- package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
- package/src/duckdb/src/common/types/interval.cpp +3 -0
- package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
- package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
- package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
- package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
- package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
- package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
- package/src/duckdb/src/common/types/value.cpp +63 -42
- package/src/duckdb/src/common/types/vector.cpp +33 -67
- package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
- package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
- package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
- package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
- package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
- package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
- package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
- package/src/duckdb/src/execution/window_executor.cpp +6 -5
- package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
- package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
- package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
- package/src/duckdb/src/function/table/read_csv.cpp +150 -136
- package/src/duckdb/src/function/table/table_scan.cpp +0 -2
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
- package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
- package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
- package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
- package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
- package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
- package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
- package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
- package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
- package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
- package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
- package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
- package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
- package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
- package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
- package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
- package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
- package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
- package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
- package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
- package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
- package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
- package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
- package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
- package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
- package/src/duckdb/src/include/duckdb.h +12 -0
- package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
- package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
- package/src/duckdb/src/main/client_verify.cpp +1 -0
- package/src/duckdb/src/main/config.cpp +2 -2
- package/src/duckdb/src/main/connection.cpp +3 -3
- package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
- package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
- package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
- package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
- package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
- package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
- package/src/duckdb/src/planner/logical_operator.cpp +1 -1
- package/src/duckdb/src/planner/planner.cpp +1 -1
- package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
- package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
- package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
- package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
- package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
- package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
- package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
- package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
- package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
- package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
- package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
- package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
- package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
- package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
- package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
- package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
- package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
- package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
- package/src/duckdb/src/storage/storage_info.cpp +1 -1
- package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
- package/src/duckdb/src/storage/table/row_group.cpp +68 -1
- package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
- package/src/duckdb/src/storage/wal_replay.cpp +2 -2
- package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
- package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
- package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
- package/src/duckdb/ub_src_execution.cpp +0 -2
- package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
- package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
- package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
- package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
- package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
- package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
- package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
- package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -7,13 +7,9 @@
|
|
7
7
|
#include "duckdb/common/row_operations/row_operations.hpp"
|
8
8
|
#include "duckdb/common/types/null_value.hpp"
|
9
9
|
#include "duckdb/common/types/row/tuple_data_iterator.hpp"
|
10
|
-
#include "duckdb/common/vector_operations/unary_executor.hpp"
|
11
10
|
#include "duckdb/common/vector_operations/vector_operations.hpp"
|
12
11
|
#include "duckdb/execution/expression_executor.hpp"
|
13
12
|
#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
|
14
|
-
#include "duckdb/storage/buffer_manager.hpp"
|
15
|
-
|
16
|
-
#include <cmath>
|
17
13
|
|
18
14
|
namespace duckdb {
|
19
15
|
|
@@ -22,9 +18,9 @@ using ValidityBytes = TupleDataLayout::ValidityBytes;
|
|
22
18
|
GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
|
23
19
|
vector<LogicalType> group_types, vector<LogicalType> payload_types,
|
24
20
|
const vector<BoundAggregateExpression *> &bindings,
|
25
|
-
|
21
|
+
idx_t initial_capacity, idx_t radix_bits)
|
26
22
|
: GroupedAggregateHashTable(context, allocator, std::move(group_types), std::move(payload_types),
|
27
|
-
AggregateObject::CreateAggregateObjects(bindings),
|
23
|
+
AggregateObject::CreateAggregateObjects(bindings), initial_capacity, radix_bits) {
|
28
24
|
}
|
29
25
|
|
30
26
|
GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
|
@@ -32,205 +28,189 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
|
|
32
28
|
: GroupedAggregateHashTable(context, allocator, std::move(group_types), {}, vector<AggregateObject>()) {
|
33
29
|
}
|
34
30
|
|
35
|
-
AggregateHTAppendState::AggregateHTAppendState()
|
36
|
-
: ht_offsets(
|
37
|
-
|
38
|
-
|
39
|
-
chunk_state_initialized(false) {
|
31
|
+
GroupedAggregateHashTable::AggregateHTAppendState::AggregateHTAppendState()
|
32
|
+
: ht_offsets(LogicalType::UBIGINT), hash_salts(LogicalType::HASH), group_compare_vector(STANDARD_VECTOR_SIZE),
|
33
|
+
no_match_vector(STANDARD_VECTOR_SIZE), empty_vector(STANDARD_VECTOR_SIZE), new_groups(STANDARD_VECTOR_SIZE),
|
34
|
+
addresses(LogicalType::POINTER) {
|
40
35
|
}
|
41
36
|
|
42
37
|
GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
|
43
38
|
vector<LogicalType> group_types_p,
|
44
39
|
vector<LogicalType> payload_types_p,
|
45
40
|
vector<AggregateObject> aggregate_objects_p,
|
46
|
-
|
41
|
+
idx_t initial_capacity, idx_t radix_bits)
|
47
42
|
: BaseAggregateHashTable(context, allocator, aggregate_objects_p, std::move(payload_types_p)),
|
48
|
-
|
49
|
-
|
43
|
+
radix_bits(radix_bits), count(0), capacity(0), aggregate_allocator(make_shared<ArenaAllocator>(allocator)) {
|
44
|
+
|
50
45
|
// Append hash column to the end and initialise the row layout
|
51
46
|
group_types_p.emplace_back(LogicalType::HASH);
|
52
47
|
layout.Initialize(std::move(group_types_p), std::move(aggregate_objects_p));
|
53
|
-
tuple_size = layout.GetRowWidth();
|
54
|
-
tuples_per_block = Storage::BLOCK_SIZE / tuple_size;
|
55
|
-
|
56
|
-
// HT layout
|
57
48
|
hash_offset = layout.GetOffsets()[layout.ColumnCount() - 1];
|
58
|
-
data_collection = make_uniq<TupleDataCollection>(buffer_manager, layout);
|
59
|
-
data_collection->InitializeAppend(td_pin_state, TupleDataPinProperties::KEEP_EVERYTHING_PINNED);
|
60
|
-
|
61
|
-
switch (entry_type) {
|
62
|
-
case HtEntryType::HT_WIDTH_64: {
|
63
|
-
hash_prefix_shift = (HASH_WIDTH - sizeof(aggr_ht_entry_64::salt)) * 8;
|
64
|
-
Resize<aggr_ht_entry_64>(initial_capacity);
|
65
|
-
break;
|
66
|
-
}
|
67
|
-
case HtEntryType::HT_WIDTH_32: {
|
68
|
-
hash_prefix_shift = (HASH_WIDTH - sizeof(aggr_ht_entry_32::salt)) * 8;
|
69
|
-
Resize<aggr_ht_entry_32>(initial_capacity);
|
70
|
-
break;
|
71
|
-
}
|
72
|
-
default:
|
73
|
-
throw InternalException("Unknown HT entry width");
|
74
|
-
}
|
75
49
|
|
50
|
+
// Partitioned data and pointer table
|
51
|
+
InitializePartitionedData();
|
52
|
+
Resize(initial_capacity);
|
53
|
+
|
54
|
+
// Predicates
|
76
55
|
predicates.resize(layout.ColumnCount() - 1, ExpressionType::COMPARE_EQUAL);
|
77
56
|
}
|
78
57
|
|
58
|
+
void GroupedAggregateHashTable::InitializePartitionedData() {
|
59
|
+
if (!partitioned_data || RadixPartitioning::RadixBits(partitioned_data->PartitionCount()) != radix_bits) {
|
60
|
+
D_ASSERT(!partitioned_data || partitioned_data->Count() == 0);
|
61
|
+
partitioned_data =
|
62
|
+
make_uniq<RadixPartitionedTupleData>(buffer_manager, layout, radix_bits, layout.ColumnCount() - 1);
|
63
|
+
} else {
|
64
|
+
partitioned_data->Reset();
|
65
|
+
}
|
66
|
+
|
67
|
+
D_ASSERT(GetLayout().GetAggrWidth() == layout.GetAggrWidth());
|
68
|
+
D_ASSERT(GetLayout().GetDataWidth() == layout.GetDataWidth());
|
69
|
+
D_ASSERT(GetLayout().GetRowWidth() == layout.GetRowWidth());
|
70
|
+
|
71
|
+
partitioned_data->InitializeAppendState(state.append_state, TupleDataPinProperties::KEEP_EVERYTHING_PINNED);
|
72
|
+
}
|
73
|
+
|
74
|
+
unique_ptr<PartitionedTupleData> &GroupedAggregateHashTable::GetPartitionedData() {
|
75
|
+
return partitioned_data;
|
76
|
+
}
|
77
|
+
|
78
|
+
shared_ptr<ArenaAllocator> GroupedAggregateHashTable::GetAggregateAllocator() {
|
79
|
+
return aggregate_allocator;
|
80
|
+
}
|
81
|
+
|
79
82
|
GroupedAggregateHashTable::~GroupedAggregateHashTable() {
|
80
83
|
Destroy();
|
81
84
|
}
|
82
85
|
|
83
86
|
void GroupedAggregateHashTable::Destroy() {
|
84
|
-
if (
|
85
|
-
return;
|
86
|
-
}
|
87
|
-
|
88
|
-
// Check if there is an aggregate with a destructor
|
89
|
-
bool has_destructor = false;
|
90
|
-
for (auto &aggr : layout.GetAggregates()) {
|
91
|
-
if (aggr.function.destructor) {
|
92
|
-
has_destructor = true;
|
93
|
-
}
|
94
|
-
}
|
95
|
-
if (!has_destructor) {
|
87
|
+
if (!partitioned_data || partitioned_data->Count() == 0 || !layout.HasDestructor()) {
|
96
88
|
return;
|
97
89
|
}
|
98
90
|
|
99
91
|
// There are aggregates with destructors: Call the destructor for each of the aggregates
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
data_collection->Reset();
|
107
|
-
}
|
108
|
-
|
109
|
-
template <class ENTRY>
|
110
|
-
void GroupedAggregateHashTable::VerifyInternal() {
|
111
|
-
auto hashes_ptr = (ENTRY *)hashes_hdl_ptr;
|
112
|
-
idx_t count = 0;
|
113
|
-
for (idx_t i = 0; i < capacity; i++) {
|
114
|
-
if (hashes_ptr[i].page_nr > 0) {
|
115
|
-
D_ASSERT(hashes_ptr[i].page_offset < tuples_per_block);
|
116
|
-
D_ASSERT(hashes_ptr[i].page_nr <= payload_hds_ptrs.size());
|
117
|
-
auto ptr = payload_hds_ptrs[hashes_ptr[i].page_nr - 1] + ((hashes_ptr[i].page_offset) * tuple_size);
|
118
|
-
auto hash = Load<hash_t>(ptr + hash_offset);
|
119
|
-
D_ASSERT((hashes_ptr[i].salt) == (hash >> hash_prefix_shift));
|
120
|
-
|
121
|
-
count++;
|
92
|
+
// Currently does not happen because aggregate destructors are called while scanning in RadixPartitionedHashTable
|
93
|
+
// LCOV_EXCL_START
|
94
|
+
RowOperationsState row_state(*aggregate_allocator);
|
95
|
+
for (auto &data_collection : partitioned_data->GetPartitions()) {
|
96
|
+
if (data_collection->Count() == 0) {
|
97
|
+
continue;
|
122
98
|
}
|
99
|
+
TupleDataChunkIterator iterator(*data_collection, TupleDataPinProperties::DESTROY_AFTER_DONE, false);
|
100
|
+
auto &row_locations = iterator.GetChunkState().row_locations;
|
101
|
+
do {
|
102
|
+
RowOperations::DestroyStates(row_state, layout, row_locations, iterator.GetCurrentChunkCount());
|
103
|
+
} while (iterator.Next());
|
104
|
+
data_collection->Reset();
|
123
105
|
}
|
124
|
-
|
125
|
-
|
106
|
+
// LCOV_EXCL_STOP
|
107
|
+
}
|
108
|
+
|
109
|
+
const TupleDataLayout &GroupedAggregateHashTable::GetLayout() const {
|
110
|
+
return partitioned_data->GetLayout();
|
111
|
+
}
|
112
|
+
|
113
|
+
idx_t GroupedAggregateHashTable::Count() const {
|
114
|
+
return count;
|
126
115
|
}
|
127
116
|
|
128
117
|
idx_t GroupedAggregateHashTable::InitialCapacity() {
|
129
118
|
return STANDARD_VECTOR_SIZE * 2ULL;
|
130
119
|
}
|
131
120
|
|
132
|
-
idx_t GroupedAggregateHashTable::
|
133
|
-
idx_t
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
break;
|
141
|
-
case HtEntryType::HT_WIDTH_64:
|
142
|
-
max_pages = NumericLimits<uint32_t>::Maximum();
|
143
|
-
max_tuples = NumericLimits<uint16_t>::Maximum();
|
144
|
-
break;
|
145
|
-
default:
|
146
|
-
throw InternalException("Unsupported hash table width");
|
147
|
-
}
|
121
|
+
idx_t GroupedAggregateHashTable::GetCapacityForCount(idx_t count) {
|
122
|
+
count = MaxValue<idx_t>(InitialCapacity(), count);
|
123
|
+
return NextPowerOfTwo(count * LOAD_FACTOR);
|
124
|
+
}
|
125
|
+
|
126
|
+
idx_t GroupedAggregateHashTable::Capacity() const {
|
127
|
+
return capacity;
|
128
|
+
}
|
148
129
|
|
149
|
-
|
130
|
+
idx_t GroupedAggregateHashTable::ResizeThreshold() const {
|
131
|
+
return Capacity() / LOAD_FACTOR;
|
150
132
|
}
|
151
133
|
|
152
|
-
idx_t GroupedAggregateHashTable::
|
153
|
-
return
|
134
|
+
idx_t GroupedAggregateHashTable::ApplyBitMask(hash_t hash) const {
|
135
|
+
return hash & bitmask;
|
154
136
|
}
|
155
137
|
|
156
138
|
void GroupedAggregateHashTable::Verify() {
|
157
139
|
#ifdef DEBUG
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
140
|
+
idx_t total_count = 0;
|
141
|
+
for (idx_t i = 0; i < capacity; i++) {
|
142
|
+
const auto &entry = entries[i];
|
143
|
+
if (!entry.IsOccupied()) {
|
144
|
+
continue;
|
145
|
+
}
|
146
|
+
auto hash = Load<hash_t>(entry.GetPointer() + hash_offset);
|
147
|
+
D_ASSERT(entry.GetSalt() == aggr_ht_entry_t::ExtractSalt(hash));
|
148
|
+
total_count++;
|
165
149
|
}
|
150
|
+
D_ASSERT(total_count == Count());
|
166
151
|
#endif
|
167
152
|
}
|
168
153
|
|
169
|
-
|
154
|
+
void GroupedAggregateHashTable::ClearPointerTable() {
|
155
|
+
std::fill_n(entries, capacity, aggr_ht_entry_t(0));
|
156
|
+
}
|
157
|
+
|
158
|
+
void GroupedAggregateHashTable::ResetCount() {
|
159
|
+
count = 0;
|
160
|
+
}
|
161
|
+
|
162
|
+
void GroupedAggregateHashTable::SetRadixBits(idx_t radix_bits_p) {
|
163
|
+
radix_bits = radix_bits_p;
|
164
|
+
}
|
165
|
+
|
170
166
|
void GroupedAggregateHashTable::Resize(idx_t size) {
|
171
|
-
D_ASSERT(!is_finalized);
|
172
167
|
D_ASSERT(size >= STANDARD_VECTOR_SIZE);
|
173
168
|
D_ASSERT(IsPowerOfTwo(size));
|
174
|
-
|
175
169
|
if (size < capacity) {
|
176
170
|
throw InternalException("Cannot downsize a hash table!");
|
177
171
|
}
|
178
|
-
capacity = size;
|
179
172
|
|
173
|
+
capacity = size;
|
174
|
+
hash_map = buffer_manager.GetBufferAllocator().Allocate(capacity * sizeof(aggr_ht_entry_t));
|
175
|
+
entries = reinterpret_cast<aggr_ht_entry_t *>(hash_map.get());
|
176
|
+
ClearPointerTable();
|
180
177
|
bitmask = capacity - 1;
|
181
|
-
const auto byte_size = capacity * sizeof(ENTRY);
|
182
|
-
hashes_hdl = buffer_manager.GetBufferAllocator().Allocate(byte_size);
|
183
|
-
hashes_hdl_ptr = hashes_hdl.get();
|
184
|
-
memset(hashes_hdl_ptr, 0, byte_size);
|
185
178
|
|
186
179
|
if (Count() != 0) {
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
D_ASSERT((row_location - block_pointer) % tuple_size == 0);
|
207
|
-
|
208
|
-
const auto hash = Load<hash_t>(row_location + hash_offset);
|
209
|
-
D_ASSERT((hash & bitmask) == (hash % capacity));
|
210
|
-
D_ASSERT(hash >> hash_prefix_shift <= NumericLimits<uint16_t>::Maximum());
|
211
|
-
|
212
|
-
auto entry_idx = (idx_t)hash & bitmask;
|
213
|
-
while (hashes_arr[entry_idx].page_nr > 0) {
|
214
|
-
entry_idx++;
|
215
|
-
if (entry_idx >= capacity) {
|
216
|
-
entry_idx = 0;
|
180
|
+
for (auto &data_collection : partitioned_data->GetPartitions()) {
|
181
|
+
if (data_collection->Count() == 0) {
|
182
|
+
continue;
|
183
|
+
}
|
184
|
+
TupleDataChunkIterator iterator(*data_collection, TupleDataPinProperties::ALREADY_PINNED, false);
|
185
|
+
const auto row_locations = iterator.GetRowLocations();
|
186
|
+
do {
|
187
|
+
for (idx_t i = 0; i < iterator.GetCurrentChunkCount(); i++) {
|
188
|
+
const auto &row_location = row_locations[i];
|
189
|
+
const auto hash = Load<hash_t>(row_location + hash_offset);
|
190
|
+
|
191
|
+
// Find an empty entry
|
192
|
+
auto entry_idx = ApplyBitMask(hash);
|
193
|
+
D_ASSERT(entry_idx == hash % capacity);
|
194
|
+
while (entries[entry_idx].IsOccupied() > 0) {
|
195
|
+
entry_idx++;
|
196
|
+
if (entry_idx >= capacity) {
|
197
|
+
entry_idx = 0;
|
198
|
+
}
|
217
199
|
}
|
200
|
+
auto &entry = entries[entry_idx];
|
201
|
+
D_ASSERT(!entry.IsOccupied());
|
202
|
+
entry.SetSalt(aggr_ht_entry_t::ExtractSalt(hash));
|
203
|
+
entry.SetPointer(row_location);
|
204
|
+
D_ASSERT(entry.IsOccupied());
|
218
205
|
}
|
219
|
-
|
220
|
-
|
221
|
-
D_ASSERT(!ht_entry.page_nr);
|
222
|
-
ht_entry.salt = hash >> hash_prefix_shift;
|
223
|
-
ht_entry.page_nr = block_id + 1;
|
224
|
-
ht_entry.page_offset = (row_location - block_pointer) / tuple_size;
|
225
|
-
}
|
226
|
-
} while (iterator.Next());
|
206
|
+
} while (iterator.Next());
|
207
|
+
}
|
227
208
|
}
|
228
209
|
|
229
210
|
Verify();
|
230
211
|
}
|
231
212
|
|
232
|
-
idx_t GroupedAggregateHashTable::AddChunk(
|
233
|
-
AggregateType filter) {
|
213
|
+
idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, DataChunk &payload, AggregateType filter) {
|
234
214
|
unsafe_vector<idx_t> aggregate_filter;
|
235
215
|
|
236
216
|
auto &aggregates = layout.GetAggregates();
|
@@ -240,20 +220,18 @@ idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChu
|
|
240
220
|
aggregate_filter.push_back(i);
|
241
221
|
}
|
242
222
|
}
|
243
|
-
return AddChunk(
|
223
|
+
return AddChunk(groups, payload, aggregate_filter);
|
244
224
|
}
|
245
225
|
|
246
|
-
idx_t GroupedAggregateHashTable::AddChunk(
|
247
|
-
const unsafe_vector<idx_t> &filter) {
|
226
|
+
idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, DataChunk &payload, const unsafe_vector<idx_t> &filter) {
|
248
227
|
Vector hashes(LogicalType::HASH);
|
249
228
|
groups.Hash(hashes);
|
250
229
|
|
251
|
-
return AddChunk(
|
230
|
+
return AddChunk(groups, hashes, payload, filter);
|
252
231
|
}
|
253
232
|
|
254
|
-
idx_t GroupedAggregateHashTable::AddChunk(
|
255
|
-
|
256
|
-
D_ASSERT(!is_finalized);
|
233
|
+
idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, Vector &group_hashes, DataChunk &payload,
|
234
|
+
const unsafe_vector<idx_t> &filter) {
|
257
235
|
if (groups.size() == 0) {
|
258
236
|
return 0;
|
259
237
|
}
|
@@ -265,7 +243,7 @@ idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChu
|
|
265
243
|
}
|
266
244
|
#endif
|
267
245
|
|
268
|
-
auto new_group_count = FindOrCreateGroups(
|
246
|
+
const auto new_group_count = FindOrCreateGroups(groups, group_hashes, state.addresses, state.new_groups);
|
269
247
|
VectorOperations::AddInPlace(state.addresses, layout.GetAggrOffset(), payload.size());
|
270
248
|
|
271
249
|
// Now every cell has an entry, update the aggregates
|
@@ -301,11 +279,14 @@ idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChu
|
|
301
279
|
}
|
302
280
|
|
303
281
|
void GroupedAggregateHashTable::FetchAggregates(DataChunk &groups, DataChunk &result) {
|
282
|
+
#ifdef DEBUG
|
304
283
|
groups.Verify();
|
305
284
|
D_ASSERT(groups.ColumnCount() + 1 == layout.ColumnCount());
|
306
285
|
for (idx_t i = 0; i < result.ColumnCount(); i++) {
|
307
286
|
D_ASSERT(result.data[i].GetType() == payload_types[i]);
|
308
287
|
}
|
288
|
+
#endif
|
289
|
+
|
309
290
|
result.SetCardinality(groups);
|
310
291
|
if (groups.size() == 0) {
|
311
292
|
return;
|
@@ -313,57 +294,46 @@ void GroupedAggregateHashTable::FetchAggregates(DataChunk &groups, DataChunk &re
|
|
313
294
|
|
314
295
|
// find the groups associated with the addresses
|
315
296
|
// FIXME: this should not use the FindOrCreateGroups, creating them is unnecessary
|
316
|
-
AggregateHTAppendState append_state;
|
317
297
|
Vector addresses(LogicalType::POINTER);
|
318
|
-
FindOrCreateGroups(
|
298
|
+
FindOrCreateGroups(groups, addresses);
|
319
299
|
// now fetch the aggregates
|
320
300
|
RowOperationsState row_state(*aggregate_allocator);
|
321
301
|
RowOperations::FinalizeStates(row_state, layout, addresses, result, 0);
|
322
302
|
}
|
323
303
|
|
324
|
-
idx_t GroupedAggregateHashTable::
|
325
|
-
|
326
|
-
}
|
327
|
-
|
328
|
-
template <class ENTRY>
|
329
|
-
idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendState &state, DataChunk &groups,
|
330
|
-
Vector &group_hashes_v, Vector &addresses_v,
|
331
|
-
SelectionVector &new_groups_out) {
|
332
|
-
D_ASSERT(!is_finalized);
|
304
|
+
idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, Vector &group_hashes_v,
|
305
|
+
Vector &addresses_v, SelectionVector &new_groups_out) {
|
333
306
|
D_ASSERT(groups.ColumnCount() + 1 == layout.ColumnCount());
|
334
307
|
D_ASSERT(group_hashes_v.GetType() == LogicalType::HASH);
|
335
308
|
D_ASSERT(state.ht_offsets.GetVectorType() == VectorType::FLAT_VECTOR);
|
336
|
-
D_ASSERT(state.ht_offsets.GetType() == LogicalType::
|
309
|
+
D_ASSERT(state.ht_offsets.GetType() == LogicalType::UBIGINT);
|
337
310
|
D_ASSERT(addresses_v.GetType() == LogicalType::POINTER);
|
338
|
-
D_ASSERT(state.hash_salts.GetType() == LogicalType::
|
339
|
-
|
340
|
-
if (Count() + groups.size() > MaxCapacity()) {
|
341
|
-
throw InternalException("Hash table capacity reached");
|
342
|
-
}
|
311
|
+
D_ASSERT(state.hash_salts.GetType() == LogicalType::HASH);
|
343
312
|
|
344
|
-
//
|
345
|
-
if (
|
313
|
+
// Need to fit the entire vector, and resize at threshold
|
314
|
+
if (Count() + groups.size() > capacity || Count() + groups.size() > ResizeThreshold()) {
|
346
315
|
Verify();
|
347
|
-
Resize
|
316
|
+
Resize(capacity * 2);
|
348
317
|
}
|
349
318
|
D_ASSERT(capacity - Count() >= groups.size()); // we need to be able to fit at least one vector of data
|
350
319
|
|
351
320
|
group_hashes_v.Flatten(groups.size());
|
352
|
-
auto
|
321
|
+
auto hashes = FlatVector::GetData<hash_t>(group_hashes_v);
|
353
322
|
|
354
323
|
addresses_v.Flatten(groups.size());
|
355
324
|
auto addresses = FlatVector::GetData<data_ptr_t>(addresses_v);
|
356
325
|
|
357
326
|
// Compute the entry in the table based on the hash using a modulo,
|
358
327
|
// and precompute the hash salts for faster comparison below
|
359
|
-
auto
|
360
|
-
auto
|
328
|
+
auto ht_offsets = FlatVector::GetData<uint64_t>(state.ht_offsets);
|
329
|
+
auto hash_salts = FlatVector::GetData<hash_t>(state.hash_salts);
|
361
330
|
for (idx_t r = 0; r < groups.size(); r++) {
|
362
|
-
auto
|
363
|
-
|
364
|
-
|
365
|
-
|
331
|
+
const auto &hash = hashes[r];
|
332
|
+
ht_offsets[r] = ApplyBitMask(hash);
|
333
|
+
D_ASSERT(ht_offsets[r] == hash % capacity);
|
334
|
+
hash_salts[r] = aggr_ht_entry_t::ExtractSalt(hash);
|
366
335
|
}
|
336
|
+
|
367
337
|
// we start out with all entries [0, 1, 2, ..., groups.size()]
|
368
338
|
const SelectionVector *sel_vector = FlatVector::IncrementalSelectionVector();
|
369
339
|
|
@@ -379,15 +349,12 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendSta
|
|
379
349
|
state.group_chunk.SetCardinality(groups);
|
380
350
|
|
381
351
|
// convert all vectors to unified format
|
382
|
-
|
383
|
-
|
384
|
-
state.chunk_state_initialized = true;
|
385
|
-
}
|
386
|
-
TupleDataCollection::ToUnifiedFormat(state.chunk_state, state.group_chunk);
|
352
|
+
auto &chunk_state = state.append_state.chunk_state;
|
353
|
+
TupleDataCollection::ToUnifiedFormat(chunk_state, state.group_chunk);
|
387
354
|
if (!state.group_data) {
|
388
355
|
state.group_data = make_unsafe_uniq_array<UnifiedVectorFormat>(state.group_chunk.ColumnCount());
|
389
356
|
}
|
390
|
-
TupleDataCollection::GetVectorData(
|
357
|
+
TupleDataCollection::GetVectorData(chunk_state, state.group_data.get());
|
391
358
|
|
392
359
|
idx_t new_group_count = 0;
|
393
360
|
idx_t remaining_entries = groups.size();
|
@@ -398,57 +365,42 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendSta
|
|
398
365
|
|
399
366
|
// For each remaining entry, figure out whether or not it belongs to a full or empty group
|
400
367
|
for (idx_t i = 0; i < remaining_entries; i++) {
|
401
|
-
const
|
402
|
-
auto &
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
// Set page nr to 1 for now to mark it as occupied (will be corrected later) and set the salt
|
408
|
-
ht_entry.page_nr = 1;
|
409
|
-
ht_entry.salt = group_hashes[index] >> hash_prefix_shift;
|
410
|
-
|
411
|
-
// Update selection lists for outer loops
|
412
|
-
state.empty_vector.set_index(new_entry_count++, index);
|
413
|
-
new_groups_out.set_index(new_group_count++, index);
|
414
|
-
} else { // Cell is occupied: Compare salts
|
415
|
-
if (ht_entry.salt == hash_salts_ptr[index]) {
|
368
|
+
const auto index = sel_vector->get_index(i);
|
369
|
+
const auto &salt = hash_salts[index];
|
370
|
+
auto &entry = entries[ht_offsets[index]];
|
371
|
+
if (entry.IsOccupied()) { // Cell is occupied: Compare salts
|
372
|
+
if (entry.GetSalt() == salt) {
|
416
373
|
state.group_compare_vector.set_index(need_compare_count++, index);
|
417
374
|
} else {
|
418
375
|
state.no_match_vector.set_index(no_match_count++, index);
|
419
376
|
}
|
377
|
+
} else { // Cell is unoccupied
|
378
|
+
// Set salt (also marks as occupied)
|
379
|
+
entry.SetSalt(salt);
|
380
|
+
|
381
|
+
// Update selection lists for outer loops
|
382
|
+
state.empty_vector.set_index(new_entry_count++, index);
|
383
|
+
new_groups_out.set_index(new_group_count++, index);
|
420
384
|
}
|
421
385
|
}
|
422
386
|
|
423
387
|
if (new_entry_count != 0) {
|
424
388
|
// Append everything that belongs to an empty group
|
425
|
-
|
426
|
-
|
427
|
-
RowOperations::InitializeStates(layout, state.chunk_state.row_locations,
|
389
|
+
partitioned_data->AppendUnified(state.append_state, state.group_chunk, state.empty_vector, new_entry_count);
|
390
|
+
RowOperations::InitializeStates(layout, chunk_state.row_locations,
|
428
391
|
*FlatVector::IncrementalSelectionVector(), new_entry_count);
|
429
392
|
|
430
|
-
//
|
431
|
-
|
432
|
-
|
433
|
-
auto block_pointer = payload_hds_ptrs[block_id];
|
434
|
-
auto block_end = block_pointer + tuples_per_block * tuple_size;
|
435
|
-
|
436
|
-
// Set the page nrs/offsets in the 1st part of the HT now that the data has been appended
|
437
|
-
const auto row_locations = FlatVector::GetData<data_ptr_t>(state.chunk_state.row_locations);
|
393
|
+
// Set the entry pointers in the 1st part of the HT now that the data has been appended
|
394
|
+
const auto row_locations = FlatVector::GetData<data_ptr_t>(chunk_state.row_locations);
|
395
|
+
const auto &row_sel = state.append_state.reverse_partition_sel;
|
438
396
|
for (idx_t new_entry_idx = 0; new_entry_idx < new_entry_count; new_entry_idx++) {
|
439
|
-
const auto &row_location = row_locations[new_entry_idx];
|
440
|
-
if (row_location > block_end || row_location < block_pointer) {
|
441
|
-
block_id++;
|
442
|
-
D_ASSERT(block_id < payload_hds_ptrs.size());
|
443
|
-
block_pointer = payload_hds_ptrs[block_id];
|
444
|
-
block_end = block_pointer + tuples_per_block * tuple_size;
|
445
|
-
}
|
446
|
-
D_ASSERT(row_location >= block_pointer && row_location < block_end);
|
447
|
-
D_ASSERT((row_location - block_pointer) % tuple_size == 0);
|
448
397
|
const auto index = state.empty_vector.get_index(new_entry_idx);
|
449
|
-
auto
|
450
|
-
|
451
|
-
|
398
|
+
const auto row_idx = row_sel.get_index(index);
|
399
|
+
const auto &row_location = row_locations[row_idx];
|
400
|
+
|
401
|
+
auto &entry = entries[ht_offsets[index]];
|
402
|
+
|
403
|
+
entry.SetPointer(row_location);
|
452
404
|
addresses[index] = row_location;
|
453
405
|
}
|
454
406
|
}
|
@@ -457,10 +409,8 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendSta
|
|
457
409
|
// Get the pointers to the rows that need to be compared
|
458
410
|
for (idx_t need_compare_idx = 0; need_compare_idx < need_compare_count; need_compare_idx++) {
|
459
411
|
const auto index = state.group_compare_vector.get_index(need_compare_idx);
|
460
|
-
const auto &
|
461
|
-
|
462
|
-
auto page_offset = ht_entry.page_offset * tuple_size;
|
463
|
-
addresses[index] = page_ptr + page_offset;
|
412
|
+
const auto &entry = entries[ht_offsets[index]];
|
413
|
+
addresses[index] = entry.GetPointer();
|
464
414
|
}
|
465
415
|
|
466
416
|
// Perform group comparisons
|
@@ -472,55 +422,36 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendSta
|
|
472
422
|
// Linear probing: each of the entries that do not match move to the next entry in the HT
|
473
423
|
for (idx_t i = 0; i < no_match_count; i++) {
|
474
424
|
idx_t index = state.no_match_vector.get_index(i);
|
475
|
-
|
476
|
-
if (
|
477
|
-
|
425
|
+
ht_offsets[index]++;
|
426
|
+
if (ht_offsets[index] >= capacity) {
|
427
|
+
ht_offsets[index] = 0;
|
478
428
|
}
|
479
429
|
}
|
480
430
|
sel_vector = &state.no_match_vector;
|
481
431
|
remaining_entries = no_match_count;
|
482
432
|
}
|
483
433
|
|
434
|
+
count += new_group_count;
|
484
435
|
return new_group_count;
|
485
436
|
}
|
486
437
|
|
487
|
-
void GroupedAggregateHashTable::UpdateBlockPointers() {
|
488
|
-
for (const auto &id_and_handle : td_pin_state.row_handles) {
|
489
|
-
const auto &id = id_and_handle.first;
|
490
|
-
const auto &handle = id_and_handle.second;
|
491
|
-
if (payload_hds_ptrs.empty() || id > payload_hds_ptrs.size() - 1) {
|
492
|
-
payload_hds_ptrs.resize(id + 1);
|
493
|
-
}
|
494
|
-
payload_hds_ptrs[id] = handle.Ptr();
|
495
|
-
}
|
496
|
-
}
|
497
|
-
|
498
438
|
// this is to support distinct aggregations where we need to record whether we
|
499
439
|
// have already seen a value for a group
|
500
|
-
idx_t GroupedAggregateHashTable::FindOrCreateGroups(
|
501
|
-
Vector &group_hashes, Vector &addresses_out,
|
440
|
+
idx_t GroupedAggregateHashTable::FindOrCreateGroups(DataChunk &groups, Vector &group_hashes, Vector &addresses_out,
|
502
441
|
SelectionVector &new_groups_out) {
|
503
|
-
|
504
|
-
case HtEntryType::HT_WIDTH_64:
|
505
|
-
return FindOrCreateGroupsInternal<aggr_ht_entry_64>(state, groups, group_hashes, addresses_out, new_groups_out);
|
506
|
-
case HtEntryType::HT_WIDTH_32:
|
507
|
-
return FindOrCreateGroupsInternal<aggr_ht_entry_32>(state, groups, group_hashes, addresses_out, new_groups_out);
|
508
|
-
default:
|
509
|
-
throw InternalException("Unknown HT entry width");
|
510
|
-
}
|
442
|
+
return FindOrCreateGroupsInternal(groups, group_hashes, addresses_out, new_groups_out);
|
511
443
|
}
|
512
444
|
|
513
|
-
void GroupedAggregateHashTable::FindOrCreateGroups(
|
514
|
-
Vector &addresses) {
|
445
|
+
void GroupedAggregateHashTable::FindOrCreateGroups(DataChunk &groups, Vector &addresses) {
|
515
446
|
// create a dummy new_groups sel vector
|
516
|
-
FindOrCreateGroups(
|
447
|
+
FindOrCreateGroups(groups, addresses, state.new_groups);
|
517
448
|
}
|
518
449
|
|
519
|
-
idx_t GroupedAggregateHashTable::FindOrCreateGroups(
|
520
|
-
|
450
|
+
idx_t GroupedAggregateHashTable::FindOrCreateGroups(DataChunk &groups, Vector &addresses_out,
|
451
|
+
SelectionVector &new_groups_out) {
|
521
452
|
Vector hashes(LogicalType::HASH);
|
522
453
|
groups.Hash(hashes);
|
523
|
-
return FindOrCreateGroups(
|
454
|
+
return FindOrCreateGroups(groups, hashes, addresses_out, new_groups_out);
|
524
455
|
}
|
525
456
|
|
526
457
|
struct FlushMoveState {
|
@@ -533,13 +464,21 @@ struct FlushMoveState {
|
|
533
464
|
for (idx_t col_idx = 0; col_idx < layout.ColumnCount() - 1; col_idx++) {
|
534
465
|
column_ids.emplace_back(col_idx);
|
535
466
|
}
|
536
|
-
|
537
|
-
collection.InitializeScan(scan_state, column_ids, TupleDataPinProperties::UNPIN_AFTER_DONE);
|
467
|
+
collection.InitializeScan(scan_state, column_ids, TupleDataPinProperties::DESTROY_AFTER_DONE);
|
538
468
|
collection.InitializeScanChunk(scan_state, groups);
|
539
469
|
hash_col_idx = layout.ColumnCount() - 1;
|
540
470
|
}
|
541
471
|
|
542
|
-
bool Scan()
|
472
|
+
bool Scan() {
|
473
|
+
if (collection.Scan(scan_state, groups)) {
|
474
|
+
collection.Gather(scan_state.chunk_state.row_locations, *FlatVector::IncrementalSelectionVector(),
|
475
|
+
groups.size(), hash_col_idx, hashes, *FlatVector::IncrementalSelectionVector());
|
476
|
+
return true;
|
477
|
+
}
|
478
|
+
|
479
|
+
collection.FinalizePinState(scan_state.pin_state);
|
480
|
+
return false;
|
481
|
+
}
|
543
482
|
|
544
483
|
TupleDataCollection &collection;
|
545
484
|
TupleDataScanState scan_state;
|
@@ -548,52 +487,13 @@ struct FlushMoveState {
|
|
548
487
|
idx_t hash_col_idx;
|
549
488
|
Vector hashes;
|
550
489
|
|
551
|
-
AggregateHTAppendState append_state;
|
552
490
|
Vector group_addresses;
|
553
491
|
SelectionVector new_groups_sel;
|
554
492
|
};
|
555
493
|
|
556
|
-
bool FlushMoveState::Scan() {
|
557
|
-
if (collection.Scan(scan_state, groups)) {
|
558
|
-
collection.Gather(scan_state.chunk_state.row_locations, *FlatVector::IncrementalSelectionVector(),
|
559
|
-
groups.size(), hash_col_idx, hashes, *FlatVector::IncrementalSelectionVector());
|
560
|
-
return true;
|
561
|
-
}
|
562
|
-
|
563
|
-
collection.FinalizePinState(scan_state.pin_state);
|
564
|
-
return false;
|
565
|
-
}
|
566
|
-
|
567
494
|
void GroupedAggregateHashTable::Combine(GroupedAggregateHashTable &other) {
|
568
|
-
|
569
|
-
|
570
|
-
D_ASSERT(other.layout.GetAggrWidth() == layout.GetAggrWidth());
|
571
|
-
D_ASSERT(other.layout.GetDataWidth() == layout.GetDataWidth());
|
572
|
-
D_ASSERT(other.layout.GetRowWidth() == layout.GetRowWidth());
|
573
|
-
|
574
|
-
if (other.Count() == 0) {
|
575
|
-
return;
|
576
|
-
}
|
577
|
-
|
578
|
-
FlushMoveState state(*other.data_collection);
|
579
|
-
RowOperationsState row_state(*aggregate_allocator);
|
580
|
-
while (state.Scan()) {
|
581
|
-
FindOrCreateGroups(state.append_state, state.groups, state.hashes, state.group_addresses, state.new_groups_sel);
|
582
|
-
RowOperations::CombineStates(row_state, layout, state.scan_state.chunk_state.row_locations,
|
583
|
-
state.group_addresses, state.groups.size());
|
584
|
-
}
|
585
|
-
|
586
|
-
Verify();
|
587
|
-
|
588
|
-
// if we combine states, then we also need to combine the arena allocators
|
589
|
-
for (auto &stored_allocator : other.stored_allocators) {
|
590
|
-
stored_allocators.push_back(stored_allocator);
|
591
|
-
}
|
592
|
-
stored_allocators.push_back(other.aggregate_allocator);
|
593
|
-
}
|
594
|
-
|
595
|
-
void GroupedAggregateHashTable::Append(GroupedAggregateHashTable &other) {
|
596
|
-
data_collection->Combine(other.GetDataCollection());
|
495
|
+
auto other_data = other.partitioned_data->GetUnpartitioned();
|
496
|
+
Combine(*other_data);
|
597
497
|
|
598
498
|
// Inherit ownership to all stored aggregate allocators
|
599
499
|
stored_allocators.emplace_back(other.aggregate_allocator);
|
@@ -602,75 +502,33 @@ void GroupedAggregateHashTable::Append(GroupedAggregateHashTable &other) {
|
|
602
502
|
}
|
603
503
|
}
|
604
504
|
|
605
|
-
void GroupedAggregateHashTable::
|
606
|
-
|
607
|
-
|
608
|
-
D_ASSERT(
|
609
|
-
|
610
|
-
// Partition the data
|
611
|
-
auto pin_properties =
|
612
|
-
sink_done ? TupleDataPinProperties::UNPIN_AFTER_DONE : TupleDataPinProperties::KEEP_EVERYTHING_PINNED;
|
613
|
-
auto partitioned_data =
|
614
|
-
make_uniq<RadixPartitionedTupleData>(buffer_manager, layout, radix_bits, layout.ColumnCount() - 1);
|
615
|
-
partitioned_data->Partition(*data_collection, pin_properties);
|
616
|
-
D_ASSERT(partitioned_data->GetPartitions().size() == num_partitions);
|
617
|
-
|
618
|
-
// Move the partitioned data collections to the partitioned hash tables and initialize the 1st part of the HT
|
619
|
-
auto &partitions = partitioned_data->GetPartitions();
|
620
|
-
for (idx_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) {
|
621
|
-
auto &partition_ht = *partition_hts[partition_idx];
|
622
|
-
partition_ht.data_collection = std::move(partitions[partition_idx]);
|
623
|
-
|
624
|
-
// Inherit ownership to all stored aggregate allocators
|
625
|
-
partition_ht.stored_allocators.emplace_back(aggregate_allocator);
|
626
|
-
for (const auto &stored_allocator : stored_allocators) {
|
627
|
-
partition_ht.stored_allocators.emplace_back(stored_allocator);
|
628
|
-
}
|
629
|
-
|
630
|
-
if (!sink_done) {
|
631
|
-
partition_ht.InitializeFirstPart();
|
632
|
-
partition_ht.Verify();
|
633
|
-
}
|
634
|
-
}
|
635
|
-
}
|
505
|
+
void GroupedAggregateHashTable::Combine(TupleDataCollection &other_data) {
|
506
|
+
D_ASSERT(other_data.GetLayout().GetAggrWidth() == layout.GetAggrWidth());
|
507
|
+
D_ASSERT(other_data.GetLayout().GetDataWidth() == layout.GetDataWidth());
|
508
|
+
D_ASSERT(other_data.GetLayout().GetRowWidth() == layout.GetRowWidth());
|
636
509
|
|
637
|
-
|
638
|
-
|
639
|
-
auto size = MaxValue<idx_t>(NextPowerOfTwo(Count() * 2L), capacity);
|
640
|
-
switch (entry_type) {
|
641
|
-
case HtEntryType::HT_WIDTH_64:
|
642
|
-
Resize<aggr_ht_entry_64>(size);
|
643
|
-
break;
|
644
|
-
case HtEntryType::HT_WIDTH_32:
|
645
|
-
Resize<aggr_ht_entry_32>(size);
|
646
|
-
break;
|
647
|
-
default:
|
648
|
-
throw InternalException("Unknown HT entry width");
|
510
|
+
if (other_data.Count() == 0) {
|
511
|
+
return;
|
649
512
|
}
|
650
|
-
}
|
651
|
-
|
652
|
-
idx_t GroupedAggregateHashTable::Scan(TupleDataParallelScanState &gstate, TupleDataLocalScanState &lstate,
|
653
|
-
DataChunk &result) {
|
654
|
-
data_collection->Scan(gstate, lstate, result);
|
655
513
|
|
514
|
+
FlushMoveState fm_state(other_data);
|
656
515
|
RowOperationsState row_state(*aggregate_allocator);
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
return;
|
516
|
+
while (fm_state.Scan()) {
|
517
|
+
FindOrCreateGroups(fm_state.groups, fm_state.hashes, fm_state.group_addresses, fm_state.new_groups_sel);
|
518
|
+
RowOperations::CombineStates(row_state, layout, fm_state.scan_state.chunk_state.row_locations,
|
519
|
+
fm_state.group_addresses, fm_state.groups.size());
|
520
|
+
if (layout.HasDestructor()) {
|
521
|
+
RowOperations::DestroyStates(row_state, layout, fm_state.scan_state.chunk_state.row_locations,
|
522
|
+
fm_state.groups.size());
|
523
|
+
}
|
666
524
|
}
|
667
525
|
|
668
|
-
|
669
|
-
|
670
|
-
data_collection->FinalizePinState(td_pin_state);
|
671
|
-
data_collection->Unpin();
|
526
|
+
Verify();
|
527
|
+
}
|
672
528
|
|
673
|
-
|
529
|
+
void GroupedAggregateHashTable::UnpinData() {
|
530
|
+
partitioned_data->FlushAppendState(state.append_state);
|
531
|
+
partitioned_data->Unpin();
|
674
532
|
}
|
675
533
|
|
676
534
|
} // namespace duckdb
|