duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -7,13 +7,9 @@
7
7
  #include "duckdb/common/row_operations/row_operations.hpp"
8
8
  #include "duckdb/common/types/null_value.hpp"
9
9
  #include "duckdb/common/types/row/tuple_data_iterator.hpp"
10
- #include "duckdb/common/vector_operations/unary_executor.hpp"
11
10
  #include "duckdb/common/vector_operations/vector_operations.hpp"
12
11
  #include "duckdb/execution/expression_executor.hpp"
13
12
  #include "duckdb/planner/expression/bound_aggregate_expression.hpp"
14
- #include "duckdb/storage/buffer_manager.hpp"
15
-
16
- #include <cmath>
17
13
 
18
14
  namespace duckdb {
19
15
 
@@ -22,9 +18,9 @@ using ValidityBytes = TupleDataLayout::ValidityBytes;
22
18
  GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
23
19
  vector<LogicalType> group_types, vector<LogicalType> payload_types,
24
20
  const vector<BoundAggregateExpression *> &bindings,
25
- HtEntryType entry_type, idx_t initial_capacity)
21
+ idx_t initial_capacity, idx_t radix_bits)
26
22
  : GroupedAggregateHashTable(context, allocator, std::move(group_types), std::move(payload_types),
27
- AggregateObject::CreateAggregateObjects(bindings), entry_type, initial_capacity) {
23
+ AggregateObject::CreateAggregateObjects(bindings), initial_capacity, radix_bits) {
28
24
  }
29
25
 
30
26
  GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
@@ -32,205 +28,189 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
32
28
  : GroupedAggregateHashTable(context, allocator, std::move(group_types), {}, vector<AggregateObject>()) {
33
29
  }
34
30
 
35
- AggregateHTAppendState::AggregateHTAppendState()
36
- : ht_offsets(LogicalTypeId::BIGINT), hash_salts(LogicalTypeId::SMALLINT),
37
- group_compare_vector(STANDARD_VECTOR_SIZE), no_match_vector(STANDARD_VECTOR_SIZE),
38
- empty_vector(STANDARD_VECTOR_SIZE), new_groups(STANDARD_VECTOR_SIZE), addresses(LogicalType::POINTER),
39
- chunk_state_initialized(false) {
31
+ GroupedAggregateHashTable::AggregateHTAppendState::AggregateHTAppendState()
32
+ : ht_offsets(LogicalType::UBIGINT), hash_salts(LogicalType::HASH), group_compare_vector(STANDARD_VECTOR_SIZE),
33
+ no_match_vector(STANDARD_VECTOR_SIZE), empty_vector(STANDARD_VECTOR_SIZE), new_groups(STANDARD_VECTOR_SIZE),
34
+ addresses(LogicalType::POINTER) {
40
35
  }
41
36
 
42
37
  GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
43
38
  vector<LogicalType> group_types_p,
44
39
  vector<LogicalType> payload_types_p,
45
40
  vector<AggregateObject> aggregate_objects_p,
46
- HtEntryType entry_type, idx_t initial_capacity)
41
+ idx_t initial_capacity, idx_t radix_bits)
47
42
  : BaseAggregateHashTable(context, allocator, aggregate_objects_p, std::move(payload_types_p)),
48
- entry_type(entry_type), capacity(0), is_finalized(false),
49
- aggregate_allocator(make_shared<ArenaAllocator>(allocator)) {
43
+ radix_bits(radix_bits), count(0), capacity(0), aggregate_allocator(make_shared<ArenaAllocator>(allocator)) {
44
+
50
45
  // Append hash column to the end and initialise the row layout
51
46
  group_types_p.emplace_back(LogicalType::HASH);
52
47
  layout.Initialize(std::move(group_types_p), std::move(aggregate_objects_p));
53
- tuple_size = layout.GetRowWidth();
54
- tuples_per_block = Storage::BLOCK_SIZE / tuple_size;
55
-
56
- // HT layout
57
48
  hash_offset = layout.GetOffsets()[layout.ColumnCount() - 1];
58
- data_collection = make_uniq<TupleDataCollection>(buffer_manager, layout);
59
- data_collection->InitializeAppend(td_pin_state, TupleDataPinProperties::KEEP_EVERYTHING_PINNED);
60
-
61
- switch (entry_type) {
62
- case HtEntryType::HT_WIDTH_64: {
63
- hash_prefix_shift = (HASH_WIDTH - sizeof(aggr_ht_entry_64::salt)) * 8;
64
- Resize<aggr_ht_entry_64>(initial_capacity);
65
- break;
66
- }
67
- case HtEntryType::HT_WIDTH_32: {
68
- hash_prefix_shift = (HASH_WIDTH - sizeof(aggr_ht_entry_32::salt)) * 8;
69
- Resize<aggr_ht_entry_32>(initial_capacity);
70
- break;
71
- }
72
- default:
73
- throw InternalException("Unknown HT entry width");
74
- }
75
49
 
50
+ // Partitioned data and pointer table
51
+ InitializePartitionedData();
52
+ Resize(initial_capacity);
53
+
54
+ // Predicates
76
55
  predicates.resize(layout.ColumnCount() - 1, ExpressionType::COMPARE_EQUAL);
77
56
  }
78
57
 
58
+ void GroupedAggregateHashTable::InitializePartitionedData() {
59
+ if (!partitioned_data || RadixPartitioning::RadixBits(partitioned_data->PartitionCount()) != radix_bits) {
60
+ D_ASSERT(!partitioned_data || partitioned_data->Count() == 0);
61
+ partitioned_data =
62
+ make_uniq<RadixPartitionedTupleData>(buffer_manager, layout, radix_bits, layout.ColumnCount() - 1);
63
+ } else {
64
+ partitioned_data->Reset();
65
+ }
66
+
67
+ D_ASSERT(GetLayout().GetAggrWidth() == layout.GetAggrWidth());
68
+ D_ASSERT(GetLayout().GetDataWidth() == layout.GetDataWidth());
69
+ D_ASSERT(GetLayout().GetRowWidth() == layout.GetRowWidth());
70
+
71
+ partitioned_data->InitializeAppendState(state.append_state, TupleDataPinProperties::KEEP_EVERYTHING_PINNED);
72
+ }
73
+
74
+ unique_ptr<PartitionedTupleData> &GroupedAggregateHashTable::GetPartitionedData() {
75
+ return partitioned_data;
76
+ }
77
+
78
+ shared_ptr<ArenaAllocator> GroupedAggregateHashTable::GetAggregateAllocator() {
79
+ return aggregate_allocator;
80
+ }
81
+
79
82
  GroupedAggregateHashTable::~GroupedAggregateHashTable() {
80
83
  Destroy();
81
84
  }
82
85
 
83
86
  void GroupedAggregateHashTable::Destroy() {
84
- if (data_collection->Count() == 0) {
85
- return;
86
- }
87
-
88
- // Check if there is an aggregate with a destructor
89
- bool has_destructor = false;
90
- for (auto &aggr : layout.GetAggregates()) {
91
- if (aggr.function.destructor) {
92
- has_destructor = true;
93
- }
94
- }
95
- if (!has_destructor) {
87
+ if (!partitioned_data || partitioned_data->Count() == 0 || !layout.HasDestructor()) {
96
88
  return;
97
89
  }
98
90
 
99
91
  // There are aggregates with destructors: Call the destructor for each of the aggregates
100
- RowOperationsState state(*aggregate_allocator);
101
- TupleDataChunkIterator iterator(*data_collection, TupleDataPinProperties::DESTROY_AFTER_DONE, false);
102
- auto &row_locations = iterator.GetChunkState().row_locations;
103
- do {
104
- RowOperations::DestroyStates(state, layout, row_locations, iterator.GetCurrentChunkCount());
105
- } while (iterator.Next());
106
- data_collection->Reset();
107
- }
108
-
109
- template <class ENTRY>
110
- void GroupedAggregateHashTable::VerifyInternal() {
111
- auto hashes_ptr = (ENTRY *)hashes_hdl_ptr;
112
- idx_t count = 0;
113
- for (idx_t i = 0; i < capacity; i++) {
114
- if (hashes_ptr[i].page_nr > 0) {
115
- D_ASSERT(hashes_ptr[i].page_offset < tuples_per_block);
116
- D_ASSERT(hashes_ptr[i].page_nr <= payload_hds_ptrs.size());
117
- auto ptr = payload_hds_ptrs[hashes_ptr[i].page_nr - 1] + ((hashes_ptr[i].page_offset) * tuple_size);
118
- auto hash = Load<hash_t>(ptr + hash_offset);
119
- D_ASSERT((hashes_ptr[i].salt) == (hash >> hash_prefix_shift));
120
-
121
- count++;
92
+ // Currently does not happen because aggregate destructors are called while scanning in RadixPartitionedHashTable
93
+ // LCOV_EXCL_START
94
+ RowOperationsState row_state(*aggregate_allocator);
95
+ for (auto &data_collection : partitioned_data->GetPartitions()) {
96
+ if (data_collection->Count() == 0) {
97
+ continue;
122
98
  }
99
+ TupleDataChunkIterator iterator(*data_collection, TupleDataPinProperties::DESTROY_AFTER_DONE, false);
100
+ auto &row_locations = iterator.GetChunkState().row_locations;
101
+ do {
102
+ RowOperations::DestroyStates(row_state, layout, row_locations, iterator.GetCurrentChunkCount());
103
+ } while (iterator.Next());
104
+ data_collection->Reset();
123
105
  }
124
- (void)count;
125
- D_ASSERT(count == Count());
106
+ // LCOV_EXCL_STOP
107
+ }
108
+
109
+ const TupleDataLayout &GroupedAggregateHashTable::GetLayout() const {
110
+ return partitioned_data->GetLayout();
111
+ }
112
+
113
+ idx_t GroupedAggregateHashTable::Count() const {
114
+ return count;
126
115
  }
127
116
 
128
117
  idx_t GroupedAggregateHashTable::InitialCapacity() {
129
118
  return STANDARD_VECTOR_SIZE * 2ULL;
130
119
  }
131
120
 
132
- idx_t GroupedAggregateHashTable::GetMaxCapacity(HtEntryType entry_type, idx_t tuple_size) {
133
- idx_t max_pages;
134
- idx_t max_tuples;
135
-
136
- switch (entry_type) {
137
- case HtEntryType::HT_WIDTH_32:
138
- max_pages = NumericLimits<uint8_t>::Maximum();
139
- max_tuples = NumericLimits<uint16_t>::Maximum();
140
- break;
141
- case HtEntryType::HT_WIDTH_64:
142
- max_pages = NumericLimits<uint32_t>::Maximum();
143
- max_tuples = NumericLimits<uint16_t>::Maximum();
144
- break;
145
- default:
146
- throw InternalException("Unsupported hash table width");
147
- }
121
+ idx_t GroupedAggregateHashTable::GetCapacityForCount(idx_t count) {
122
+ count = MaxValue<idx_t>(InitialCapacity(), count);
123
+ return NextPowerOfTwo(count * LOAD_FACTOR);
124
+ }
125
+
126
+ idx_t GroupedAggregateHashTable::Capacity() const {
127
+ return capacity;
128
+ }
148
129
 
149
- return max_pages * MinValue(max_tuples, (idx_t)Storage::BLOCK_SIZE / tuple_size);
130
+ idx_t GroupedAggregateHashTable::ResizeThreshold() const {
131
+ return Capacity() / LOAD_FACTOR;
150
132
  }
151
133
 
152
- idx_t GroupedAggregateHashTable::MaxCapacity() {
153
- return GetMaxCapacity(entry_type, tuple_size);
134
+ idx_t GroupedAggregateHashTable::ApplyBitMask(hash_t hash) const {
135
+ return hash & bitmask;
154
136
  }
155
137
 
156
138
  void GroupedAggregateHashTable::Verify() {
157
139
  #ifdef DEBUG
158
- switch (entry_type) {
159
- case HtEntryType::HT_WIDTH_32:
160
- VerifyInternal<aggr_ht_entry_32>();
161
- break;
162
- case HtEntryType::HT_WIDTH_64:
163
- VerifyInternal<aggr_ht_entry_64>();
164
- break;
140
+ idx_t total_count = 0;
141
+ for (idx_t i = 0; i < capacity; i++) {
142
+ const auto &entry = entries[i];
143
+ if (!entry.IsOccupied()) {
144
+ continue;
145
+ }
146
+ auto hash = Load<hash_t>(entry.GetPointer() + hash_offset);
147
+ D_ASSERT(entry.GetSalt() == aggr_ht_entry_t::ExtractSalt(hash));
148
+ total_count++;
165
149
  }
150
+ D_ASSERT(total_count == Count());
166
151
  #endif
167
152
  }
168
153
 
169
- template <class ENTRY>
154
+ void GroupedAggregateHashTable::ClearPointerTable() {
155
+ std::fill_n(entries, capacity, aggr_ht_entry_t(0));
156
+ }
157
+
158
+ void GroupedAggregateHashTable::ResetCount() {
159
+ count = 0;
160
+ }
161
+
162
+ void GroupedAggregateHashTable::SetRadixBits(idx_t radix_bits_p) {
163
+ radix_bits = radix_bits_p;
164
+ }
165
+
170
166
  void GroupedAggregateHashTable::Resize(idx_t size) {
171
- D_ASSERT(!is_finalized);
172
167
  D_ASSERT(size >= STANDARD_VECTOR_SIZE);
173
168
  D_ASSERT(IsPowerOfTwo(size));
174
-
175
169
  if (size < capacity) {
176
170
  throw InternalException("Cannot downsize a hash table!");
177
171
  }
178
- capacity = size;
179
172
 
173
+ capacity = size;
174
+ hash_map = buffer_manager.GetBufferAllocator().Allocate(capacity * sizeof(aggr_ht_entry_t));
175
+ entries = reinterpret_cast<aggr_ht_entry_t *>(hash_map.get());
176
+ ClearPointerTable();
180
177
  bitmask = capacity - 1;
181
- const auto byte_size = capacity * sizeof(ENTRY);
182
- hashes_hdl = buffer_manager.GetBufferAllocator().Allocate(byte_size);
183
- hashes_hdl_ptr = hashes_hdl.get();
184
- memset(hashes_hdl_ptr, 0, byte_size);
185
178
 
186
179
  if (Count() != 0) {
187
- D_ASSERT(!payload_hds_ptrs.empty());
188
- auto hashes_arr = (ENTRY *)hashes_hdl_ptr;
189
-
190
- idx_t block_id = 0;
191
- auto block_pointer = payload_hds_ptrs[block_id];
192
- auto block_end = block_pointer + tuples_per_block * tuple_size;
193
-
194
- TupleDataChunkIterator iterator(*data_collection, TupleDataPinProperties::ALREADY_PINNED, false);
195
- const auto row_locations = iterator.GetRowLocations();
196
- do {
197
- for (idx_t i = 0; i < iterator.GetCurrentChunkCount(); i++) {
198
- const auto &row_location = row_locations[i];
199
- if (row_location > block_end || row_location < block_pointer) {
200
- block_id++;
201
- D_ASSERT(block_id < payload_hds_ptrs.size());
202
- block_pointer = payload_hds_ptrs[block_id];
203
- block_end = block_pointer + tuples_per_block * tuple_size;
204
- }
205
- D_ASSERT(row_location >= block_pointer && row_location < block_end);
206
- D_ASSERT((row_location - block_pointer) % tuple_size == 0);
207
-
208
- const auto hash = Load<hash_t>(row_location + hash_offset);
209
- D_ASSERT((hash & bitmask) == (hash % capacity));
210
- D_ASSERT(hash >> hash_prefix_shift <= NumericLimits<uint16_t>::Maximum());
211
-
212
- auto entry_idx = (idx_t)hash & bitmask;
213
- while (hashes_arr[entry_idx].page_nr > 0) {
214
- entry_idx++;
215
- if (entry_idx >= capacity) {
216
- entry_idx = 0;
180
+ for (auto &data_collection : partitioned_data->GetPartitions()) {
181
+ if (data_collection->Count() == 0) {
182
+ continue;
183
+ }
184
+ TupleDataChunkIterator iterator(*data_collection, TupleDataPinProperties::ALREADY_PINNED, false);
185
+ const auto row_locations = iterator.GetRowLocations();
186
+ do {
187
+ for (idx_t i = 0; i < iterator.GetCurrentChunkCount(); i++) {
188
+ const auto &row_location = row_locations[i];
189
+ const auto hash = Load<hash_t>(row_location + hash_offset);
190
+
191
+ // Find an empty entry
192
+ auto entry_idx = ApplyBitMask(hash);
193
+ D_ASSERT(entry_idx == hash % capacity);
194
+ while (entries[entry_idx].IsOccupied() > 0) {
195
+ entry_idx++;
196
+ if (entry_idx >= capacity) {
197
+ entry_idx = 0;
198
+ }
217
199
  }
200
+ auto &entry = entries[entry_idx];
201
+ D_ASSERT(!entry.IsOccupied());
202
+ entry.SetSalt(aggr_ht_entry_t::ExtractSalt(hash));
203
+ entry.SetPointer(row_location);
204
+ D_ASSERT(entry.IsOccupied());
218
205
  }
219
-
220
- auto &ht_entry = hashes_arr[entry_idx];
221
- D_ASSERT(!ht_entry.page_nr);
222
- ht_entry.salt = hash >> hash_prefix_shift;
223
- ht_entry.page_nr = block_id + 1;
224
- ht_entry.page_offset = (row_location - block_pointer) / tuple_size;
225
- }
226
- } while (iterator.Next());
206
+ } while (iterator.Next());
207
+ }
227
208
  }
228
209
 
229
210
  Verify();
230
211
  }
231
212
 
232
- idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChunk &groups, DataChunk &payload,
233
- AggregateType filter) {
213
+ idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, DataChunk &payload, AggregateType filter) {
234
214
  unsafe_vector<idx_t> aggregate_filter;
235
215
 
236
216
  auto &aggregates = layout.GetAggregates();
@@ -240,20 +220,18 @@ idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChu
240
220
  aggregate_filter.push_back(i);
241
221
  }
242
222
  }
243
- return AddChunk(state, groups, payload, aggregate_filter);
223
+ return AddChunk(groups, payload, aggregate_filter);
244
224
  }
245
225
 
246
- idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChunk &groups, DataChunk &payload,
247
- const unsafe_vector<idx_t> &filter) {
226
+ idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, DataChunk &payload, const unsafe_vector<idx_t> &filter) {
248
227
  Vector hashes(LogicalType::HASH);
249
228
  groups.Hash(hashes);
250
229
 
251
- return AddChunk(state, groups, hashes, payload, filter);
230
+ return AddChunk(groups, hashes, payload, filter);
252
231
  }
253
232
 
254
- idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChunk &groups, Vector &group_hashes,
255
- DataChunk &payload, const unsafe_vector<idx_t> &filter) {
256
- D_ASSERT(!is_finalized);
233
+ idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, Vector &group_hashes, DataChunk &payload,
234
+ const unsafe_vector<idx_t> &filter) {
257
235
  if (groups.size() == 0) {
258
236
  return 0;
259
237
  }
@@ -265,7 +243,7 @@ idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChu
265
243
  }
266
244
  #endif
267
245
 
268
- auto new_group_count = FindOrCreateGroups(state, groups, group_hashes, state.addresses, state.new_groups);
246
+ const auto new_group_count = FindOrCreateGroups(groups, group_hashes, state.addresses, state.new_groups);
269
247
  VectorOperations::AddInPlace(state.addresses, layout.GetAggrOffset(), payload.size());
270
248
 
271
249
  // Now every cell has an entry, update the aggregates
@@ -301,11 +279,14 @@ idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChu
301
279
  }
302
280
 
303
281
  void GroupedAggregateHashTable::FetchAggregates(DataChunk &groups, DataChunk &result) {
282
+ #ifdef DEBUG
304
283
  groups.Verify();
305
284
  D_ASSERT(groups.ColumnCount() + 1 == layout.ColumnCount());
306
285
  for (idx_t i = 0; i < result.ColumnCount(); i++) {
307
286
  D_ASSERT(result.data[i].GetType() == payload_types[i]);
308
287
  }
288
+ #endif
289
+
309
290
  result.SetCardinality(groups);
310
291
  if (groups.size() == 0) {
311
292
  return;
@@ -313,57 +294,46 @@ void GroupedAggregateHashTable::FetchAggregates(DataChunk &groups, DataChunk &re
313
294
 
314
295
  // find the groups associated with the addresses
315
296
  // FIXME: this should not use the FindOrCreateGroups, creating them is unnecessary
316
- AggregateHTAppendState append_state;
317
297
  Vector addresses(LogicalType::POINTER);
318
- FindOrCreateGroups(append_state, groups, addresses);
298
+ FindOrCreateGroups(groups, addresses);
319
299
  // now fetch the aggregates
320
300
  RowOperationsState row_state(*aggregate_allocator);
321
301
  RowOperations::FinalizeStates(row_state, layout, addresses, result, 0);
322
302
  }
323
303
 
324
- idx_t GroupedAggregateHashTable::ResizeThreshold() {
325
- return capacity / LOAD_FACTOR;
326
- }
327
-
328
- template <class ENTRY>
329
- idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendState &state, DataChunk &groups,
330
- Vector &group_hashes_v, Vector &addresses_v,
331
- SelectionVector &new_groups_out) {
332
- D_ASSERT(!is_finalized);
304
+ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, Vector &group_hashes_v,
305
+ Vector &addresses_v, SelectionVector &new_groups_out) {
333
306
  D_ASSERT(groups.ColumnCount() + 1 == layout.ColumnCount());
334
307
  D_ASSERT(group_hashes_v.GetType() == LogicalType::HASH);
335
308
  D_ASSERT(state.ht_offsets.GetVectorType() == VectorType::FLAT_VECTOR);
336
- D_ASSERT(state.ht_offsets.GetType() == LogicalType::BIGINT);
309
+ D_ASSERT(state.ht_offsets.GetType() == LogicalType::UBIGINT);
337
310
  D_ASSERT(addresses_v.GetType() == LogicalType::POINTER);
338
- D_ASSERT(state.hash_salts.GetType() == LogicalType::SMALLINT);
339
-
340
- if (Count() + groups.size() > MaxCapacity()) {
341
- throw InternalException("Hash table capacity reached");
342
- }
311
+ D_ASSERT(state.hash_salts.GetType() == LogicalType::HASH);
343
312
 
344
- // Resize at 50% capacity, also need to fit the entire vector
345
- if (capacity - Count() <= groups.size() || Count() > ResizeThreshold()) {
313
+ // Need to fit the entire vector, and resize at threshold
314
+ if (Count() + groups.size() > capacity || Count() + groups.size() > ResizeThreshold()) {
346
315
  Verify();
347
- Resize<ENTRY>(capacity * 2);
316
+ Resize(capacity * 2);
348
317
  }
349
318
  D_ASSERT(capacity - Count() >= groups.size()); // we need to be able to fit at least one vector of data
350
319
 
351
320
  group_hashes_v.Flatten(groups.size());
352
- auto group_hashes = FlatVector::GetData<hash_t>(group_hashes_v);
321
+ auto hashes = FlatVector::GetData<hash_t>(group_hashes_v);
353
322
 
354
323
  addresses_v.Flatten(groups.size());
355
324
  auto addresses = FlatVector::GetData<data_ptr_t>(addresses_v);
356
325
 
357
326
  // Compute the entry in the table based on the hash using a modulo,
358
327
  // and precompute the hash salts for faster comparison below
359
- auto ht_offsets_ptr = FlatVector::GetData<uint64_t>(state.ht_offsets);
360
- auto hash_salts_ptr = FlatVector::GetData<uint16_t>(state.hash_salts);
328
+ auto ht_offsets = FlatVector::GetData<uint64_t>(state.ht_offsets);
329
+ auto hash_salts = FlatVector::GetData<hash_t>(state.hash_salts);
361
330
  for (idx_t r = 0; r < groups.size(); r++) {
362
- auto element = group_hashes[r];
363
- D_ASSERT((element & bitmask) == (element % capacity));
364
- ht_offsets_ptr[r] = element & bitmask;
365
- hash_salts_ptr[r] = element >> hash_prefix_shift;
331
+ const auto &hash = hashes[r];
332
+ ht_offsets[r] = ApplyBitMask(hash);
333
+ D_ASSERT(ht_offsets[r] == hash % capacity);
334
+ hash_salts[r] = aggr_ht_entry_t::ExtractSalt(hash);
366
335
  }
336
+
367
337
  // we start out with all entries [0, 1, 2, ..., groups.size()]
368
338
  const SelectionVector *sel_vector = FlatVector::IncrementalSelectionVector();
369
339
 
@@ -379,15 +349,12 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendSta
379
349
  state.group_chunk.SetCardinality(groups);
380
350
 
381
351
  // convert all vectors to unified format
382
- if (!state.chunk_state_initialized) {
383
- data_collection->InitializeAppend(state.chunk_state);
384
- state.chunk_state_initialized = true;
385
- }
386
- TupleDataCollection::ToUnifiedFormat(state.chunk_state, state.group_chunk);
352
+ auto &chunk_state = state.append_state.chunk_state;
353
+ TupleDataCollection::ToUnifiedFormat(chunk_state, state.group_chunk);
387
354
  if (!state.group_data) {
388
355
  state.group_data = make_unsafe_uniq_array<UnifiedVectorFormat>(state.group_chunk.ColumnCount());
389
356
  }
390
- TupleDataCollection::GetVectorData(state.chunk_state, state.group_data.get());
357
+ TupleDataCollection::GetVectorData(chunk_state, state.group_data.get());
391
358
 
392
359
  idx_t new_group_count = 0;
393
360
  idx_t remaining_entries = groups.size();
@@ -398,57 +365,42 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendSta
398
365
 
399
366
  // For each remaining entry, figure out whether or not it belongs to a full or empty group
400
367
  for (idx_t i = 0; i < remaining_entries; i++) {
401
- const idx_t index = sel_vector->get_index(i);
402
- auto &ht_entry = *(((ENTRY *)this->hashes_hdl_ptr) + ht_offsets_ptr[index]);
403
- if (ht_entry.page_nr == 0) { // Cell is unoccupied (we use page number 0 as a "unused marker")
404
- D_ASSERT(group_hashes[index] >> hash_prefix_shift <= NumericLimits<uint16_t>::Maximum());
405
- D_ASSERT(payload_hds_ptrs.size() < NumericLimits<uint32_t>::Maximum());
406
-
407
- // Set page nr to 1 for now to mark it as occupied (will be corrected later) and set the salt
408
- ht_entry.page_nr = 1;
409
- ht_entry.salt = group_hashes[index] >> hash_prefix_shift;
410
-
411
- // Update selection lists for outer loops
412
- state.empty_vector.set_index(new_entry_count++, index);
413
- new_groups_out.set_index(new_group_count++, index);
414
- } else { // Cell is occupied: Compare salts
415
- if (ht_entry.salt == hash_salts_ptr[index]) {
368
+ const auto index = sel_vector->get_index(i);
369
+ const auto &salt = hash_salts[index];
370
+ auto &entry = entries[ht_offsets[index]];
371
+ if (entry.IsOccupied()) { // Cell is occupied: Compare salts
372
+ if (entry.GetSalt() == salt) {
416
373
  state.group_compare_vector.set_index(need_compare_count++, index);
417
374
  } else {
418
375
  state.no_match_vector.set_index(no_match_count++, index);
419
376
  }
377
+ } else { // Cell is unoccupied
378
+ // Set salt (also marks as occupied)
379
+ entry.SetSalt(salt);
380
+
381
+ // Update selection lists for outer loops
382
+ state.empty_vector.set_index(new_entry_count++, index);
383
+ new_groups_out.set_index(new_group_count++, index);
420
384
  }
421
385
  }
422
386
 
423
387
  if (new_entry_count != 0) {
424
388
  // Append everything that belongs to an empty group
425
- data_collection->AppendUnified(td_pin_state, state.chunk_state, state.group_chunk, state.empty_vector,
426
- new_entry_count);
427
- RowOperations::InitializeStates(layout, state.chunk_state.row_locations,
389
+ partitioned_data->AppendUnified(state.append_state, state.group_chunk, state.empty_vector, new_entry_count);
390
+ RowOperations::InitializeStates(layout, chunk_state.row_locations,
428
391
  *FlatVector::IncrementalSelectionVector(), new_entry_count);
429
392
 
430
- // Get the pointers to the (possibly) newly created blocks of the data collection
431
- idx_t block_id = payload_hds_ptrs.empty() ? 0 : payload_hds_ptrs.size() - 1;
432
- UpdateBlockPointers();
433
- auto block_pointer = payload_hds_ptrs[block_id];
434
- auto block_end = block_pointer + tuples_per_block * tuple_size;
435
-
436
- // Set the page nrs/offsets in the 1st part of the HT now that the data has been appended
437
- const auto row_locations = FlatVector::GetData<data_ptr_t>(state.chunk_state.row_locations);
393
+ // Set the entry pointers in the 1st part of the HT now that the data has been appended
394
+ const auto row_locations = FlatVector::GetData<data_ptr_t>(chunk_state.row_locations);
395
+ const auto &row_sel = state.append_state.reverse_partition_sel;
438
396
  for (idx_t new_entry_idx = 0; new_entry_idx < new_entry_count; new_entry_idx++) {
439
- const auto &row_location = row_locations[new_entry_idx];
440
- if (row_location > block_end || row_location < block_pointer) {
441
- block_id++;
442
- D_ASSERT(block_id < payload_hds_ptrs.size());
443
- block_pointer = payload_hds_ptrs[block_id];
444
- block_end = block_pointer + tuples_per_block * tuple_size;
445
- }
446
- D_ASSERT(row_location >= block_pointer && row_location < block_end);
447
- D_ASSERT((row_location - block_pointer) % tuple_size == 0);
448
397
  const auto index = state.empty_vector.get_index(new_entry_idx);
449
- auto &ht_entry = *(((ENTRY *)this->hashes_hdl_ptr) + ht_offsets_ptr[index]);
450
- ht_entry.page_nr = block_id + 1;
451
- ht_entry.page_offset = (row_location - block_pointer) / tuple_size;
398
+ const auto row_idx = row_sel.get_index(index);
399
+ const auto &row_location = row_locations[row_idx];
400
+
401
+ auto &entry = entries[ht_offsets[index]];
402
+
403
+ entry.SetPointer(row_location);
452
404
  addresses[index] = row_location;
453
405
  }
454
406
  }
@@ -457,10 +409,8 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendSta
457
409
  // Get the pointers to the rows that need to be compared
458
410
  for (idx_t need_compare_idx = 0; need_compare_idx < need_compare_count; need_compare_idx++) {
459
411
  const auto index = state.group_compare_vector.get_index(need_compare_idx);
460
- const auto &ht_entry = *(((ENTRY *)this->hashes_hdl_ptr) + ht_offsets_ptr[index]);
461
- auto page_ptr = payload_hds_ptrs[ht_entry.page_nr - 1];
462
- auto page_offset = ht_entry.page_offset * tuple_size;
463
- addresses[index] = page_ptr + page_offset;
412
+ const auto &entry = entries[ht_offsets[index]];
413
+ addresses[index] = entry.GetPointer();
464
414
  }
465
415
 
466
416
  // Perform group comparisons
@@ -472,55 +422,36 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendSta
472
422
  // Linear probing: each of the entries that do not match move to the next entry in the HT
473
423
  for (idx_t i = 0; i < no_match_count; i++) {
474
424
  idx_t index = state.no_match_vector.get_index(i);
475
- ht_offsets_ptr[index]++;
476
- if (ht_offsets_ptr[index] >= capacity) {
477
- ht_offsets_ptr[index] = 0;
425
+ ht_offsets[index]++;
426
+ if (ht_offsets[index] >= capacity) {
427
+ ht_offsets[index] = 0;
478
428
  }
479
429
  }
480
430
  sel_vector = &state.no_match_vector;
481
431
  remaining_entries = no_match_count;
482
432
  }
483
433
 
434
+ count += new_group_count;
484
435
  return new_group_count;
485
436
  }
486
437
 
487
- void GroupedAggregateHashTable::UpdateBlockPointers() {
488
- for (const auto &id_and_handle : td_pin_state.row_handles) {
489
- const auto &id = id_and_handle.first;
490
- const auto &handle = id_and_handle.second;
491
- if (payload_hds_ptrs.empty() || id > payload_hds_ptrs.size() - 1) {
492
- payload_hds_ptrs.resize(id + 1);
493
- }
494
- payload_hds_ptrs[id] = handle.Ptr();
495
- }
496
- }
497
-
498
438
  // this is to support distinct aggregations where we need to record whether we
499
439
  // have already seen a value for a group
500
- idx_t GroupedAggregateHashTable::FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups,
501
- Vector &group_hashes, Vector &addresses_out,
440
+ idx_t GroupedAggregateHashTable::FindOrCreateGroups(DataChunk &groups, Vector &group_hashes, Vector &addresses_out,
502
441
  SelectionVector &new_groups_out) {
503
- switch (entry_type) {
504
- case HtEntryType::HT_WIDTH_64:
505
- return FindOrCreateGroupsInternal<aggr_ht_entry_64>(state, groups, group_hashes, addresses_out, new_groups_out);
506
- case HtEntryType::HT_WIDTH_32:
507
- return FindOrCreateGroupsInternal<aggr_ht_entry_32>(state, groups, group_hashes, addresses_out, new_groups_out);
508
- default:
509
- throw InternalException("Unknown HT entry width");
510
- }
442
+ return FindOrCreateGroupsInternal(groups, group_hashes, addresses_out, new_groups_out);
511
443
  }
512
444
 
513
- void GroupedAggregateHashTable::FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups,
514
- Vector &addresses) {
445
+ void GroupedAggregateHashTable::FindOrCreateGroups(DataChunk &groups, Vector &addresses) {
515
446
  // create a dummy new_groups sel vector
516
- FindOrCreateGroups(state, groups, addresses, state.new_groups);
447
+ FindOrCreateGroups(groups, addresses, state.new_groups);
517
448
  }
518
449
 
519
- idx_t GroupedAggregateHashTable::FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups,
520
- Vector &addresses_out, SelectionVector &new_groups_out) {
450
+ idx_t GroupedAggregateHashTable::FindOrCreateGroups(DataChunk &groups, Vector &addresses_out,
451
+ SelectionVector &new_groups_out) {
521
452
  Vector hashes(LogicalType::HASH);
522
453
  groups.Hash(hashes);
523
- return FindOrCreateGroups(state, groups, hashes, addresses_out, new_groups_out);
454
+ return FindOrCreateGroups(groups, hashes, addresses_out, new_groups_out);
524
455
  }
525
456
 
526
457
  struct FlushMoveState {
@@ -533,13 +464,21 @@ struct FlushMoveState {
533
464
  for (idx_t col_idx = 0; col_idx < layout.ColumnCount() - 1; col_idx++) {
534
465
  column_ids.emplace_back(col_idx);
535
466
  }
536
- // FIXME DESTROY_AFTER_DONE if we make it possible to pass a selection vector to RowOperations::DestroyStates?
537
- collection.InitializeScan(scan_state, column_ids, TupleDataPinProperties::UNPIN_AFTER_DONE);
467
+ collection.InitializeScan(scan_state, column_ids, TupleDataPinProperties::DESTROY_AFTER_DONE);
538
468
  collection.InitializeScanChunk(scan_state, groups);
539
469
  hash_col_idx = layout.ColumnCount() - 1;
540
470
  }
541
471
 
542
- bool Scan();
472
+ bool Scan() {
473
+ if (collection.Scan(scan_state, groups)) {
474
+ collection.Gather(scan_state.chunk_state.row_locations, *FlatVector::IncrementalSelectionVector(),
475
+ groups.size(), hash_col_idx, hashes, *FlatVector::IncrementalSelectionVector());
476
+ return true;
477
+ }
478
+
479
+ collection.FinalizePinState(scan_state.pin_state);
480
+ return false;
481
+ }
543
482
 
544
483
  TupleDataCollection &collection;
545
484
  TupleDataScanState scan_state;
@@ -548,52 +487,13 @@ struct FlushMoveState {
548
487
  idx_t hash_col_idx;
549
488
  Vector hashes;
550
489
 
551
- AggregateHTAppendState append_state;
552
490
  Vector group_addresses;
553
491
  SelectionVector new_groups_sel;
554
492
  };
555
493
 
556
- bool FlushMoveState::Scan() {
557
- if (collection.Scan(scan_state, groups)) {
558
- collection.Gather(scan_state.chunk_state.row_locations, *FlatVector::IncrementalSelectionVector(),
559
- groups.size(), hash_col_idx, hashes, *FlatVector::IncrementalSelectionVector());
560
- return true;
561
- }
562
-
563
- collection.FinalizePinState(scan_state.pin_state);
564
- return false;
565
- }
566
-
567
494
  void GroupedAggregateHashTable::Combine(GroupedAggregateHashTable &other) {
568
- D_ASSERT(!is_finalized);
569
-
570
- D_ASSERT(other.layout.GetAggrWidth() == layout.GetAggrWidth());
571
- D_ASSERT(other.layout.GetDataWidth() == layout.GetDataWidth());
572
- D_ASSERT(other.layout.GetRowWidth() == layout.GetRowWidth());
573
-
574
- if (other.Count() == 0) {
575
- return;
576
- }
577
-
578
- FlushMoveState state(*other.data_collection);
579
- RowOperationsState row_state(*aggregate_allocator);
580
- while (state.Scan()) {
581
- FindOrCreateGroups(state.append_state, state.groups, state.hashes, state.group_addresses, state.new_groups_sel);
582
- RowOperations::CombineStates(row_state, layout, state.scan_state.chunk_state.row_locations,
583
- state.group_addresses, state.groups.size());
584
- }
585
-
586
- Verify();
587
-
588
- // if we combine states, then we also need to combine the arena allocators
589
- for (auto &stored_allocator : other.stored_allocators) {
590
- stored_allocators.push_back(stored_allocator);
591
- }
592
- stored_allocators.push_back(other.aggregate_allocator);
593
- }
594
-
595
- void GroupedAggregateHashTable::Append(GroupedAggregateHashTable &other) {
596
- data_collection->Combine(other.GetDataCollection());
495
+ auto other_data = other.partitioned_data->GetUnpartitioned();
496
+ Combine(*other_data);
597
497
 
598
498
  // Inherit ownership to all stored aggregate allocators
599
499
  stored_allocators.emplace_back(other.aggregate_allocator);
@@ -602,75 +502,33 @@ void GroupedAggregateHashTable::Append(GroupedAggregateHashTable &other) {
602
502
  }
603
503
  }
604
504
 
605
- void GroupedAggregateHashTable::Partition(vector<GroupedAggregateHashTable *> &partition_hts, idx_t radix_bits,
606
- bool sink_done) {
607
- const auto num_partitions = RadixPartitioning::NumberOfPartitions(radix_bits);
608
- D_ASSERT(partition_hts.size() == num_partitions);
609
-
610
- // Partition the data
611
- auto pin_properties =
612
- sink_done ? TupleDataPinProperties::UNPIN_AFTER_DONE : TupleDataPinProperties::KEEP_EVERYTHING_PINNED;
613
- auto partitioned_data =
614
- make_uniq<RadixPartitionedTupleData>(buffer_manager, layout, radix_bits, layout.ColumnCount() - 1);
615
- partitioned_data->Partition(*data_collection, pin_properties);
616
- D_ASSERT(partitioned_data->GetPartitions().size() == num_partitions);
617
-
618
- // Move the partitioned data collections to the partitioned hash tables and initialize the 1st part of the HT
619
- auto &partitions = partitioned_data->GetPartitions();
620
- for (idx_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) {
621
- auto &partition_ht = *partition_hts[partition_idx];
622
- partition_ht.data_collection = std::move(partitions[partition_idx]);
623
-
624
- // Inherit ownership to all stored aggregate allocators
625
- partition_ht.stored_allocators.emplace_back(aggregate_allocator);
626
- for (const auto &stored_allocator : stored_allocators) {
627
- partition_ht.stored_allocators.emplace_back(stored_allocator);
628
- }
629
-
630
- if (!sink_done) {
631
- partition_ht.InitializeFirstPart();
632
- partition_ht.Verify();
633
- }
634
- }
635
- }
505
+ void GroupedAggregateHashTable::Combine(TupleDataCollection &other_data) {
506
+ D_ASSERT(other_data.GetLayout().GetAggrWidth() == layout.GetAggrWidth());
507
+ D_ASSERT(other_data.GetLayout().GetDataWidth() == layout.GetDataWidth());
508
+ D_ASSERT(other_data.GetLayout().GetRowWidth() == layout.GetRowWidth());
636
509
 
637
- void GroupedAggregateHashTable::InitializeFirstPart() {
638
- data_collection->GetBlockPointers(payload_hds_ptrs);
639
- auto size = MaxValue<idx_t>(NextPowerOfTwo(Count() * 2L), capacity);
640
- switch (entry_type) {
641
- case HtEntryType::HT_WIDTH_64:
642
- Resize<aggr_ht_entry_64>(size);
643
- break;
644
- case HtEntryType::HT_WIDTH_32:
645
- Resize<aggr_ht_entry_32>(size);
646
- break;
647
- default:
648
- throw InternalException("Unknown HT entry width");
510
+ if (other_data.Count() == 0) {
511
+ return;
649
512
  }
650
- }
651
-
652
- idx_t GroupedAggregateHashTable::Scan(TupleDataParallelScanState &gstate, TupleDataLocalScanState &lstate,
653
- DataChunk &result) {
654
- data_collection->Scan(gstate, lstate, result);
655
513
 
514
+ FlushMoveState fm_state(other_data);
656
515
  RowOperationsState row_state(*aggregate_allocator);
657
- const auto group_cols = layout.ColumnCount() - 1;
658
- RowOperations::FinalizeStates(row_state, layout, lstate.chunk_state.row_locations, result, group_cols);
659
-
660
- return result.size();
661
- }
662
-
663
- void GroupedAggregateHashTable::Finalize() {
664
- if (is_finalized) {
665
- return;
516
+ while (fm_state.Scan()) {
517
+ FindOrCreateGroups(fm_state.groups, fm_state.hashes, fm_state.group_addresses, fm_state.new_groups_sel);
518
+ RowOperations::CombineStates(row_state, layout, fm_state.scan_state.chunk_state.row_locations,
519
+ fm_state.group_addresses, fm_state.groups.size());
520
+ if (layout.HasDestructor()) {
521
+ RowOperations::DestroyStates(row_state, layout, fm_state.scan_state.chunk_state.row_locations,
522
+ fm_state.groups.size());
523
+ }
666
524
  }
667
525
 
668
- // Early release hashes (not needed for partition/scan) and data collection (will be pinned again when scanning)
669
- hashes_hdl.Reset();
670
- data_collection->FinalizePinState(td_pin_state);
671
- data_collection->Unpin();
526
+ Verify();
527
+ }
672
528
 
673
- is_finalized = true;
529
+ void GroupedAggregateHashTable::UnpinData() {
530
+ partitioned_data->FlushAppendState(state.append_state);
531
+ partitioned_data->Unpin();
674
532
  }
675
533
 
676
534
  } // namespace duckdb