duckdb 0.7.2-dev1901.0 → 0.7.2-dev2233.0

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (198)
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/parquet/column_reader.cpp +3 -0
  4. package/src/duckdb/extension/parquet/include/parquet_writer.hpp +1 -1
  5. package/src/duckdb/extension/parquet/parquet_metadata.cpp +4 -2
  6. package/src/duckdb/src/catalog/catalog_entry/duck_index_entry.cpp +1 -1
  7. package/src/duckdb/src/common/arrow/arrow_appender.cpp +69 -44
  8. package/src/duckdb/src/common/arrow/arrow_converter.cpp +1 -1
  9. package/src/duckdb/src/common/arrow/arrow_wrapper.cpp +20 -2
  10. package/src/duckdb/src/common/box_renderer.cpp +4 -2
  11. package/src/duckdb/src/common/constants.cpp +10 -1
  12. package/src/duckdb/src/common/filename_pattern.cpp +41 -0
  13. package/src/duckdb/src/common/hive_partitioning.cpp +144 -15
  14. package/src/duckdb/src/common/radix_partitioning.cpp +101 -369
  15. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +8 -9
  16. package/src/duckdb/src/common/row_operations/row_external.cpp +1 -1
  17. package/src/duckdb/src/common/row_operations/row_gather.cpp +5 -3
  18. package/src/duckdb/src/common/row_operations/row_match.cpp +117 -22
  19. package/src/duckdb/src/common/row_operations/row_scatter.cpp +2 -2
  20. package/src/duckdb/src/common/sort/partition_state.cpp +1 -1
  21. package/src/duckdb/src/common/sort/sort_state.cpp +2 -1
  22. package/src/duckdb/src/common/sort/sorted_block.cpp +1 -1
  23. package/src/duckdb/src/common/types/{column_data_allocator.cpp → column/column_data_allocator.cpp} +2 -2
  24. package/src/duckdb/src/common/types/{column_data_collection.cpp → column/column_data_collection.cpp} +29 -6
  25. package/src/duckdb/src/common/types/{column_data_collection_segment.cpp → column/column_data_collection_segment.cpp} +2 -1
  26. package/src/duckdb/src/common/types/{column_data_consumer.cpp → column/column_data_consumer.cpp} +1 -1
  27. package/src/duckdb/src/common/types/{partitioned_column_data.cpp → column/partitioned_column_data.cpp} +11 -9
  28. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +316 -0
  29. package/src/duckdb/src/common/types/{row_data_collection.cpp → row/row_data_collection.cpp} +1 -1
  30. package/src/duckdb/src/common/types/{row_data_collection_scanner.cpp → row/row_data_collection_scanner.cpp} +2 -2
  31. package/src/duckdb/src/common/types/{row_layout.cpp → row/row_layout.cpp} +1 -1
  32. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +465 -0
  33. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +511 -0
  34. package/src/duckdb/src/common/types/row/tuple_data_iterator.cpp +96 -0
  35. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +119 -0
  36. package/src/duckdb/src/common/types/row/tuple_data_scatter_gather.cpp +1200 -0
  37. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +170 -0
  38. package/src/duckdb/src/common/types/vector.cpp +1 -1
  39. package/src/duckdb/src/execution/aggregate_hashtable.cpp +252 -290
  40. package/src/duckdb/src/execution/join_hashtable.cpp +192 -328
  41. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +4 -4
  42. package/src/duckdb/src/execution/operator/helper/physical_execute.cpp +3 -3
  43. package/src/duckdb/src/execution/operator/helper/physical_limit_percent.cpp +2 -3
  44. package/src/duckdb/src/execution/operator/helper/physical_result_collector.cpp +2 -3
  45. package/src/duckdb/src/execution/operator/join/perfect_hash_join_executor.cpp +36 -21
  46. package/src/duckdb/src/execution/operator/join/physical_blockwise_nl_join.cpp +2 -2
  47. package/src/duckdb/src/execution/operator/join/physical_cross_product.cpp +1 -1
  48. package/src/duckdb/src/execution/operator/join/physical_delim_join.cpp +2 -2
  49. package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +166 -144
  50. package/src/duckdb/src/execution/operator/join/physical_index_join.cpp +5 -5
  51. package/src/duckdb/src/execution/operator/join/physical_join.cpp +2 -10
  52. package/src/duckdb/src/execution/operator/join/physical_positional_join.cpp +0 -1
  53. package/src/duckdb/src/execution/operator/order/physical_top_n.cpp +2 -2
  54. package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +3 -0
  55. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +71 -22
  56. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +17 -13
  57. package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +0 -7
  58. package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +124 -29
  59. package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +13 -11
  60. package/src/duckdb/src/execution/operator/persistent/physical_delete.cpp +3 -2
  61. package/src/duckdb/src/execution/operator/persistent/physical_export.cpp +25 -24
  62. package/src/duckdb/src/execution/operator/persistent/physical_insert.cpp +1 -1
  63. package/src/duckdb/src/execution/operator/persistent/physical_update.cpp +4 -3
  64. package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
  65. package/src/duckdb/src/execution/operator/schema/physical_create_type.cpp +1 -1
  66. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +3 -3
  67. package/src/duckdb/src/execution/partitionable_hashtable.cpp +9 -37
  68. package/src/duckdb/src/execution/physical_operator.cpp +1 -1
  69. package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +19 -18
  70. package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp +2 -1
  71. package/src/duckdb/src/execution/physical_plan/plan_execute.cpp +2 -2
  72. package/src/duckdb/src/execution/physical_plan/plan_explain.cpp +5 -6
  73. package/src/duckdb/src/execution/physical_plan/plan_expression_get.cpp +2 -2
  74. package/src/duckdb/src/execution/physical_plan/plan_recursive_cte.cpp +3 -3
  75. package/src/duckdb/src/execution/physical_plan_generator.cpp +1 -1
  76. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +39 -17
  77. package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp +2 -2
  78. package/src/duckdb/src/function/table/pragma_detailed_profiling_output.cpp +5 -5
  79. package/src/duckdb/src/function/table/pragma_last_profiling_output.cpp +2 -2
  80. package/src/duckdb/src/function/table/read_csv.cpp +124 -58
  81. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  82. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/index_catalog_entry.hpp +1 -1
  83. package/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp +1 -1
  84. package/src/duckdb/src/include/duckdb/common/constants.hpp +2 -0
  85. package/src/duckdb/src/include/duckdb/common/exception.hpp +3 -0
  86. package/src/duckdb/src/include/duckdb/common/fast_mem.hpp +528 -0
  87. package/src/duckdb/src/include/duckdb/common/filename_pattern.hpp +34 -0
  88. package/src/duckdb/src/include/duckdb/common/helper.hpp +10 -0
  89. package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +13 -3
  90. package/src/duckdb/src/include/duckdb/common/optional_ptr.hpp +8 -0
  91. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +34 -0
  92. package/src/duckdb/src/include/duckdb/common/radix_partitioning.hpp +80 -27
  93. package/src/duckdb/src/include/duckdb/common/reference_map.hpp +38 -0
  94. package/src/duckdb/src/include/duckdb/common/row_operations/row_operations.hpp +7 -6
  95. package/src/duckdb/src/include/duckdb/common/sort/comparators.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +1 -1
  97. package/src/duckdb/src/include/duckdb/common/sort/sort.hpp +1 -1
  98. package/src/duckdb/src/include/duckdb/common/sort/sorted_block.hpp +2 -2
  99. package/src/duckdb/src/include/duckdb/common/types/batched_data_collection.hpp +1 -1
  100. package/src/duckdb/src/include/duckdb/common/types/{column_data_allocator.hpp → column/column_data_allocator.hpp} +4 -4
  101. package/src/duckdb/src/include/duckdb/common/types/{column_data_collection.hpp → column/column_data_collection.hpp} +4 -4
  102. package/src/duckdb/src/include/duckdb/common/types/{column_data_collection_iterators.hpp → column/column_data_collection_iterators.hpp} +2 -2
  103. package/src/duckdb/src/include/duckdb/common/types/{column_data_collection_segment.hpp → column/column_data_collection_segment.hpp} +3 -3
  104. package/src/duckdb/src/include/duckdb/common/types/{column_data_consumer.hpp → column/column_data_consumer.hpp} +8 -4
  105. package/src/duckdb/src/include/duckdb/common/types/{column_data_scan_states.hpp → column/column_data_scan_states.hpp} +1 -1
  106. package/src/duckdb/src/include/duckdb/common/types/{partitioned_column_data.hpp → column/partitioned_column_data.hpp} +15 -7
  107. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +140 -0
  108. package/src/duckdb/src/include/duckdb/common/types/{row_data_collection.hpp → row/row_data_collection.hpp} +1 -1
  109. package/src/duckdb/src/include/duckdb/common/types/{row_data_collection_scanner.hpp → row/row_data_collection_scanner.hpp} +2 -2
  110. package/src/duckdb/src/include/duckdb/common/types/{row_layout.hpp → row/row_layout.hpp} +3 -1
  111. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +116 -0
  112. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +239 -0
  113. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_iterator.hpp +64 -0
  114. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +113 -0
  115. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +124 -0
  116. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +74 -0
  117. package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/common/types/value.hpp +4 -12
  119. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +34 -31
  120. package/src/duckdb/src/include/duckdb/execution/base_aggregate_hashtable.hpp +2 -2
  121. package/src/duckdb/src/include/duckdb/execution/execution_context.hpp +3 -2
  122. package/src/duckdb/src/include/duckdb/execution/expression_executor.hpp +1 -1
  123. package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +41 -67
  124. package/src/duckdb/src/include/duckdb/execution/nested_loop_join.hpp +1 -1
  125. package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_execute.hpp +2 -2
  126. package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_result_collector.hpp +1 -1
  127. package/src/duckdb/src/include/duckdb/execution/operator/join/outer_join_marker.hpp +2 -2
  128. package/src/duckdb/src/include/duckdb/execution/operator/join/perfect_hash_join_executor.hpp +1 -1
  129. package/src/duckdb/src/include/duckdb/execution/operator/join/physical_cross_product.hpp +1 -1
  130. package/src/duckdb/src/include/duckdb/execution/operator/join/physical_hash_join.hpp +0 -2
  131. package/src/duckdb/src/include/duckdb/execution/operator/join/physical_index_join.hpp +2 -2
  132. package/src/duckdb/src/include/duckdb/execution/operator/join/physical_positional_join.hpp +1 -1
  133. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +4 -1
  134. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +8 -3
  135. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +5 -7
  136. package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +5 -1
  137. package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp +4 -1
  138. package/src/duckdb/src/include/duckdb/execution/operator/scan/physical_column_data_scan.hpp +1 -1
  139. package/src/duckdb/src/include/duckdb/execution/operator/set/physical_recursive_cte.hpp +1 -1
  140. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +2 -2
  141. package/src/duckdb/src/include/duckdb/function/function.hpp +2 -0
  142. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +25 -0
  143. package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -0
  144. package/src/duckdb/src/include/duckdb/main/config.hpp +0 -2
  145. package/src/duckdb/src/include/duckdb/main/materialized_query_result.hpp +1 -1
  146. package/src/duckdb/src/include/duckdb/main/query_result.hpp +14 -1
  147. package/src/duckdb/src/include/duckdb/optimizer/expression_rewriter.hpp +3 -3
  148. package/src/duckdb/src/include/duckdb/optimizer/join_order/cardinality_estimator.hpp +16 -16
  149. package/src/duckdb/src/include/duckdb/optimizer/join_order/join_node.hpp +8 -8
  150. package/src/duckdb/src/include/duckdb/optimizer/join_order/join_order_optimizer.hpp +23 -15
  151. package/src/duckdb/src/include/duckdb/optimizer/join_order/join_relation.hpp +9 -10
  152. package/src/duckdb/src/include/duckdb/optimizer/join_order/query_graph.hpp +18 -11
  153. package/src/duckdb/src/include/duckdb/parallel/meta_pipeline.hpp +1 -1
  154. package/src/duckdb/src/include/duckdb/parser/parsed_data/exported_table_data.hpp +5 -1
  155. package/src/duckdb/src/include/duckdb/parser/parsed_data/vacuum_info.hpp +3 -2
  156. package/src/duckdb/src/include/duckdb/parser/query_error_context.hpp +4 -2
  157. package/src/duckdb/src/include/duckdb/parser/transformer.hpp +9 -35
  158. package/src/duckdb/src/include/duckdb/planner/binder.hpp +24 -23
  159. package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +3 -3
  160. package/src/duckdb/src/include/duckdb/planner/operator/logical_column_data_get.hpp +1 -1
  161. package/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp +3 -1
  162. package/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp +1 -1
  163. package/src/duckdb/src/main/appender.cpp +6 -6
  164. package/src/duckdb/src/main/client_context.cpp +1 -1
  165. package/src/duckdb/src/main/connection.cpp +2 -2
  166. package/src/duckdb/src/main/query_result.cpp +13 -0
  167. package/src/duckdb/src/main/settings/settings.cpp +3 -4
  168. package/src/duckdb/src/optimizer/expression_rewriter.cpp +4 -4
  169. package/src/duckdb/src/optimizer/join_order/cardinality_estimator.cpp +91 -105
  170. package/src/duckdb/src/optimizer/join_order/join_node.cpp +5 -8
  171. package/src/duckdb/src/optimizer/join_order/join_order_optimizer.cpp +163 -160
  172. package/src/duckdb/src/optimizer/join_order/join_relation_set.cpp +30 -30
  173. package/src/duckdb/src/optimizer/join_order/query_graph.cpp +37 -38
  174. package/src/duckdb/src/parallel/executor.cpp +1 -1
  175. package/src/duckdb/src/parallel/meta_pipeline.cpp +2 -2
  176. package/src/duckdb/src/parser/transform/helpers/transform_cte.cpp +1 -1
  177. package/src/duckdb/src/parser/transform/tableref/transform_subquery.cpp +1 -1
  178. package/src/duckdb/src/parser/transformer.cpp +50 -9
  179. package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +13 -0
  180. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +15 -5
  181. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +19 -17
  182. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +4 -4
  183. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +20 -21
  184. package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +24 -22
  185. package/src/duckdb/src/planner/binder/tableref/bind_subqueryref.cpp +2 -2
  186. package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +9 -0
  187. package/src/duckdb/src/planner/binder.cpp +16 -19
  188. package/src/duckdb/src/planner/expression_binder.cpp +8 -8
  189. package/src/duckdb/src/planner/operator/logical_copy_to_file.cpp +3 -3
  190. package/src/duckdb/src/storage/checkpoint_manager.cpp +23 -23
  191. package/src/duckdb/src/storage/standard_buffer_manager.cpp +1 -1
  192. package/src/duckdb/src/storage/table_index_list.cpp +3 -3
  193. package/src/duckdb/src/verification/statement_verifier.cpp +1 -1
  194. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +5552 -5598
  195. package/src/duckdb/ub_src_common.cpp +2 -0
  196. package/src/duckdb/ub_src_common_types.cpp +0 -16
  197. package/src/duckdb/ub_src_common_types_column.cpp +10 -0
  198. package/src/duckdb/ub_src_common_types_row.cpp +20 -0
@@ -3,9 +3,10 @@
3
3
  #include "duckdb/catalog/catalog_entry/aggregate_function_catalog_entry.hpp"
4
4
  #include "duckdb/common/algorithm.hpp"
5
5
  #include "duckdb/common/exception.hpp"
6
+ #include "duckdb/common/radix_partitioning.hpp"
6
7
  #include "duckdb/common/row_operations/row_operations.hpp"
7
8
  #include "duckdb/common/types/null_value.hpp"
8
- #include "duckdb/common/types/row_data_collection.hpp"
9
+ #include "duckdb/common/types/row/tuple_data_iterator.hpp"
9
10
  #include "duckdb/common/vector_operations/unary_executor.hpp"
10
11
  #include "duckdb/common/vector_operations/vector_operations.hpp"
11
12
  #include "duckdb/execution/expression_executor.hpp"
@@ -16,7 +17,7 @@
16
17
 
17
18
  namespace duckdb {
18
19
 
19
- using ValidityBytes = RowLayout::ValidityBytes;
20
+ using ValidityBytes = TupleDataLayout::ValidityBytes;
20
21
 
21
22
  GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
22
23
  vector<LogicalType> group_types, vector<LogicalType> payload_types,
@@ -34,7 +35,8 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
34
35
  AggregateHTAppendState::AggregateHTAppendState()
35
36
  : ht_offsets(LogicalTypeId::BIGINT), hash_salts(LogicalTypeId::SMALLINT),
36
37
  group_compare_vector(STANDARD_VECTOR_SIZE), no_match_vector(STANDARD_VECTOR_SIZE),
37
- empty_vector(STANDARD_VECTOR_SIZE), new_groups(STANDARD_VECTOR_SIZE), addresses(LogicalType::POINTER) {
38
+ empty_vector(STANDARD_VECTOR_SIZE), new_groups(STANDARD_VECTOR_SIZE), addresses(LogicalType::POINTER),
39
+ chunk_state_initialized(false) {
38
40
  }
39
41
 
40
42
  GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
@@ -43,19 +45,19 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
43
45
  vector<AggregateObject> aggregate_objects_p,
44
46
  HtEntryType entry_type, idx_t initial_capacity)
45
47
  : BaseAggregateHashTable(context, allocator, aggregate_objects_p, std::move(payload_types_p)),
46
- entry_type(entry_type), capacity(0), entries(0), payload_page_offset(0), is_finalized(false),
47
- aggregate_allocator(allocator) {
48
+ entry_type(entry_type), capacity(0), is_finalized(false),
49
+ aggregate_allocator(make_shared<ArenaAllocator>(allocator)) {
48
50
  // Append hash column to the end and initialise the row layout
49
51
  group_types_p.emplace_back(LogicalType::HASH);
50
52
  layout.Initialize(std::move(group_types_p), std::move(aggregate_objects_p));
53
+ tuple_size = layout.GetRowWidth();
54
+ tuples_per_block = Storage::BLOCK_SIZE / tuple_size;
51
55
 
52
56
  // HT layout
53
57
  hash_offset = layout.GetOffsets()[layout.ColumnCount() - 1];
58
+ data_collection = make_uniq<TupleDataCollection>(buffer_manager, layout);
59
+ data_collection->InitializeAppend(td_pin_state, TupleDataPinProperties::KEEP_EVERYTHING_PINNED);
54
60
 
55
- tuple_size = layout.GetRowWidth();
56
-
57
- D_ASSERT(tuple_size <= Storage::BLOCK_SIZE);
58
- tuples_per_block = Storage::BLOCK_SIZE / tuple_size;
59
61
  hashes_hdl = buffer_manager.Allocate(Storage::BLOCK_SIZE);
60
62
  hashes_hdl_ptr = hashes_hdl.Ptr();
61
63
 
@@ -75,44 +77,18 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
75
77
  }
76
78
 
77
79
  predicates.resize(layout.ColumnCount() - 1, ExpressionType::COMPARE_EQUAL);
78
- string_heap = make_uniq<RowDataCollection>(buffer_manager, (idx_t)Storage::BLOCK_SIZE, 1, true);
79
80
  }
80
81
 
81
82
  GroupedAggregateHashTable::~GroupedAggregateHashTable() {
82
83
  Destroy();
83
84
  }
84
85
 
85
- template <class FUNC>
86
- void GroupedAggregateHashTable::PayloadApply(FUNC fun) {
87
- if (entries == 0) {
86
+ void GroupedAggregateHashTable::Destroy() {
87
+ if (data_collection->Count() == 0) {
88
88
  return;
89
89
  }
90
- idx_t apply_entries = entries;
91
- idx_t page_nr = 0;
92
- idx_t page_offset = 0;
93
-
94
- for (auto &payload_chunk_ptr : payload_hds_ptrs) {
95
- auto this_entries = MinValue(tuples_per_block, apply_entries);
96
- page_offset = 0;
97
- for (data_ptr_t ptr = payload_chunk_ptr, end = payload_chunk_ptr + this_entries * tuple_size; ptr < end;
98
- ptr += tuple_size) {
99
- fun(page_nr, page_offset++, ptr);
100
- }
101
- apply_entries -= this_entries;
102
- page_nr++;
103
- }
104
- D_ASSERT(apply_entries == 0);
105
- }
106
90
 
107
- void GroupedAggregateHashTable::NewBlock() {
108
- auto pin = buffer_manager.Allocate(Storage::BLOCK_SIZE);
109
- payload_hds.push_back(std::move(pin));
110
- payload_hds_ptrs.push_back(payload_hds.back().Ptr());
111
- payload_page_offset = 0;
112
- }
113
-
114
- void GroupedAggregateHashTable::Destroy() {
115
- // check if there is a destructor
91
+ // Check if there is an aggregate with a destructor
116
92
  bool has_destructor = false;
117
93
  for (auto &aggr : layout.GetAggregates()) {
118
94
  if (aggr.function.destructor) {
@@ -122,32 +98,25 @@ void GroupedAggregateHashTable::Destroy() {
122
98
  if (!has_destructor) {
123
99
  return;
124
100
  }
125
- // there are aggregates with destructors: loop over the hash table
126
- // and call the destructor method for each of the aggregates
127
- data_ptr_t data_pointers[STANDARD_VECTOR_SIZE];
128
- Vector state_vector(LogicalType::POINTER, (data_ptr_t)data_pointers);
129
- idx_t count = 0;
130
101
 
131
- RowOperationsState state(aggregate_allocator.GetAllocator());
132
- PayloadApply([&](idx_t page_nr, idx_t page_offset, data_ptr_t ptr) {
133
- data_pointers[count++] = ptr;
134
- if (count == STANDARD_VECTOR_SIZE) {
135
- RowOperations::DestroyStates(state, layout, state_vector, count);
136
- count = 0;
137
- }
138
- });
139
- RowOperations::DestroyStates(state, layout, state_vector, count);
102
+ // There are aggregates with destructors: Call the destructor for each of the aggregates
103
+ RowOperationsState state(aggregate_allocator->GetAllocator());
104
+ TupleDataChunkIterator iterator(*data_collection, TupleDataPinProperties::DESTROY_AFTER_DONE, false);
105
+ auto &row_locations = iterator.GetChunkState().row_locations;
106
+ do {
107
+ RowOperations::DestroyStates(state, layout, row_locations, iterator.GetCurrentChunkCount());
108
+ } while (iterator.Next());
109
+ data_collection->Reset();
140
110
  }
141
111
 
142
112
  template <class ENTRY>
143
113
  void GroupedAggregateHashTable::VerifyInternal() {
144
114
  auto hashes_ptr = (ENTRY *)hashes_hdl_ptr;
145
- D_ASSERT(payload_hds.size() == payload_hds_ptrs.size());
146
115
  idx_t count = 0;
147
116
  for (idx_t i = 0; i < capacity; i++) {
148
117
  if (hashes_ptr[i].page_nr > 0) {
149
118
  D_ASSERT(hashes_ptr[i].page_offset < tuples_per_block);
150
- D_ASSERT(hashes_ptr[i].page_nr <= payload_hds.size());
119
+ D_ASSERT(hashes_ptr[i].page_nr <= payload_hds_ptrs.size());
151
120
  auto ptr = payload_hds_ptrs[hashes_ptr[i].page_nr - 1] + ((hashes_ptr[i].page_offset) * tuple_size);
152
121
  auto hash = Load<hash_t>(ptr + hash_offset);
153
122
  D_ASSERT((hashes_ptr[i].salt) == (hash >> hash_prefix_shift));
@@ -156,7 +125,7 @@ void GroupedAggregateHashTable::VerifyInternal() {
156
125
  }
157
126
  }
158
127
  (void)count;
159
- D_ASSERT(count == entries);
128
+ D_ASSERT(count == Count());
160
129
  }
161
130
 
162
131
  idx_t GroupedAggregateHashTable::InitialCapacity() {
@@ -202,47 +171,65 @@ void GroupedAggregateHashTable::Verify() {
202
171
 
203
172
  template <class ENTRY>
204
173
  void GroupedAggregateHashTable::Resize(idx_t size) {
205
- Verify();
206
-
207
174
  D_ASSERT(!is_finalized);
175
+ D_ASSERT(size >= STANDARD_VECTOR_SIZE);
176
+ D_ASSERT(IsPowerOfTwo(size));
208
177
 
209
- if (size <= capacity) {
178
+ if (size < capacity) {
210
179
  throw InternalException("Cannot downsize a hash table!");
211
180
  }
212
- D_ASSERT(size >= STANDARD_VECTOR_SIZE);
213
-
214
- // size needs to be a power of 2
215
- D_ASSERT((size & (size - 1)) == 0);
216
- bitmask = size - 1;
181
+ capacity = size;
217
182
 
218
- auto byte_size = size * sizeof(ENTRY);
183
+ bitmask = capacity - 1;
184
+ const auto byte_size = capacity * sizeof(ENTRY);
219
185
  if (byte_size > (idx_t)Storage::BLOCK_SIZE) {
220
186
  hashes_hdl = buffer_manager.Allocate(byte_size);
221
187
  hashes_hdl_ptr = hashes_hdl.Ptr();
222
188
  }
223
189
  memset(hashes_hdl_ptr, 0, byte_size);
224
- capacity = size;
225
190
 
226
- auto hashes_arr = (ENTRY *)hashes_hdl_ptr;
191
+ if (Count() != 0) {
192
+ D_ASSERT(!payload_hds_ptrs.empty());
193
+ auto hashes_arr = (ENTRY *)hashes_hdl_ptr;
194
+
195
+ idx_t block_id = 0;
196
+ auto block_pointer = payload_hds_ptrs[block_id];
197
+ auto block_end = block_pointer + tuples_per_block * tuple_size;
198
+
199
+ TupleDataChunkIterator iterator(*data_collection, TupleDataPinProperties::ALREADY_PINNED, false);
200
+ const auto row_locations = iterator.GetRowLocations();
201
+ do {
202
+ for (idx_t i = 0; i < iterator.GetCurrentChunkCount(); i++) {
203
+ const auto &row_location = row_locations[i];
204
+ if (row_location > block_end || row_location < block_pointer) {
205
+ block_id++;
206
+ D_ASSERT(block_id < payload_hds_ptrs.size());
207
+ block_pointer = payload_hds_ptrs[block_id];
208
+ block_end = block_pointer + tuples_per_block * tuple_size;
209
+ }
210
+ D_ASSERT(row_location >= block_pointer && row_location < block_end);
211
+ D_ASSERT((row_location - block_pointer) % tuple_size == 0);
212
+
213
+ const auto hash = Load<hash_t>(row_location + hash_offset);
214
+ D_ASSERT((hash & bitmask) == (hash % capacity));
215
+ D_ASSERT(hash >> hash_prefix_shift <= NumericLimits<uint16_t>::Maximum());
216
+
217
+ auto entry_idx = (idx_t)hash & bitmask;
218
+ while (hashes_arr[entry_idx].page_nr > 0) {
219
+ entry_idx++;
220
+ if (entry_idx >= capacity) {
221
+ entry_idx = 0;
222
+ }
223
+ }
227
224
 
228
- PayloadApply([&](idx_t page_nr, idx_t page_offset, data_ptr_t ptr) {
229
- auto hash = Load<hash_t>(ptr + hash_offset);
230
- D_ASSERT((hash & bitmask) == (hash % capacity));
231
- auto entry_idx = (idx_t)hash & bitmask;
232
- while (hashes_arr[entry_idx].page_nr > 0) {
233
- entry_idx++;
234
- if (entry_idx >= capacity) {
235
- entry_idx = 0;
225
+ auto &ht_entry = hashes_arr[entry_idx];
226
+ D_ASSERT(!ht_entry.page_nr);
227
+ ht_entry.salt = hash >> hash_prefix_shift;
228
+ ht_entry.page_nr = block_id + 1;
229
+ ht_entry.page_offset = (row_location - block_pointer) / tuple_size;
236
230
  }
237
- }
238
-
239
- D_ASSERT(!hashes_arr[entry_idx].page_nr);
240
- D_ASSERT(hash >> hash_prefix_shift <= NumericLimits<uint16_t>::Maximum());
241
-
242
- hashes_arr[entry_idx].salt = hash >> hash_prefix_shift;
243
- hashes_arr[entry_idx].page_nr = page_nr + 1;
244
- hashes_arr[entry_idx].page_offset = page_offset;
245
- });
231
+ } while (iterator.Next());
232
+ }
246
233
 
247
234
  Verify();
248
235
  }
@@ -272,26 +259,25 @@ idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChu
272
259
  idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChunk &groups, Vector &group_hashes,
273
260
  DataChunk &payload, const vector<idx_t> &filter) {
274
261
  D_ASSERT(!is_finalized);
275
-
276
262
  if (groups.size() == 0) {
277
263
  return 0;
278
264
  }
279
265
 
266
+ #ifdef DEBUG
280
267
  D_ASSERT(groups.ColumnCount() + 1 == layout.ColumnCount());
281
268
  for (idx_t i = 0; i < groups.ColumnCount(); i++) {
282
269
  D_ASSERT(groups.GetTypes()[i] == layout.GetTypes()[i]);
283
270
  }
271
+ #endif
284
272
 
285
273
  auto new_group_count = FindOrCreateGroups(state, groups, group_hashes, state.addresses, state.new_groups);
286
274
  VectorOperations::AddInPlace(state.addresses, layout.GetAggrOffset(), payload.size());
287
275
 
288
- // now every cell has an entry
289
- // update the aggregates
290
- idx_t payload_idx = 0;
291
-
276
+ // Now every cell has an entry, update the aggregates
292
277
  auto &aggregates = layout.GetAggregates();
293
278
  idx_t filter_idx = 0;
294
- RowOperationsState row_state(aggregate_allocator.GetAllocator());
279
+ idx_t payload_idx = 0;
280
+ RowOperationsState row_state(aggregate_allocator->GetAllocator());
295
281
  for (idx_t i = 0; i < aggregates.size(); i++) {
296
282
  auto &aggr = aggregates[i];
297
283
  if (filter_idx >= filter.size() || i < filter[filter_idx]) {
@@ -309,7 +295,7 @@ idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChu
309
295
  RowOperations::UpdateStates(row_state, aggr, state.addresses, payload, payload_idx, payload.size());
310
296
  }
311
297
 
312
- // move to the next aggregate
298
+ // Move to the next aggregate
313
299
  payload_idx += aggr.child_count;
314
300
  VectorOperations::AddInPlace(state.addresses, aggr.payload_size, payload.size());
315
301
  filter_idx++;
@@ -336,7 +322,7 @@ void GroupedAggregateHashTable::FetchAggregates(DataChunk &groups, DataChunk &re
336
322
  Vector addresses(LogicalType::POINTER);
337
323
  FindOrCreateGroups(append_state, groups, addresses);
338
324
  // now fetch the aggregates
339
- RowOperationsState row_state(aggregate_allocator.GetAllocator());
325
+ RowOperationsState row_state(aggregate_allocator->GetAllocator());
340
326
  RowOperations::FinalizeStates(row_state, layout, addresses, result, 0);
341
327
  }
342
328
 
@@ -346,42 +332,39 @@ idx_t GroupedAggregateHashTable::ResizeThreshold() {
346
332
 
347
333
  template <class ENTRY>
348
334
  idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendState &state, DataChunk &groups,
349
- Vector &group_hashes, Vector &addresses,
335
+ Vector &group_hashes_v, Vector &addresses_v,
350
336
  SelectionVector &new_groups_out) {
351
337
  D_ASSERT(!is_finalized);
338
+ D_ASSERT(groups.ColumnCount() + 1 == layout.ColumnCount());
339
+ D_ASSERT(group_hashes_v.GetType() == LogicalType::HASH);
340
+ D_ASSERT(state.ht_offsets.GetVectorType() == VectorType::FLAT_VECTOR);
341
+ D_ASSERT(state.ht_offsets.GetType() == LogicalType::BIGINT);
342
+ D_ASSERT(addresses_v.GetType() == LogicalType::POINTER);
343
+ D_ASSERT(state.hash_salts.GetType() == LogicalType::SMALLINT);
352
344
 
353
- if (entries + groups.size() > MaxCapacity()) {
345
+ if (Count() + groups.size() > MaxCapacity()) {
354
346
  throw InternalException("Hash table capacity reached");
355
347
  }
356
348
 
357
- // resize at 50% capacity, also need to fit the entire vector
358
- if (capacity - entries <= groups.size() || entries > ResizeThreshold()) {
349
+ // Resize at 50% capacity, also need to fit the entire vector
350
+ if (capacity - Count() <= groups.size() || Count() > ResizeThreshold()) {
351
+ Verify();
359
352
  Resize<ENTRY>(capacity * 2);
360
353
  }
354
+ D_ASSERT(capacity - Count() >= groups.size()); // we need to be able to fit at least one vector of data
361
355
 
362
- D_ASSERT(capacity - entries >= groups.size());
363
- D_ASSERT(groups.ColumnCount() + 1 == layout.ColumnCount());
364
- // we need to be able to fit at least one vector of data
365
- D_ASSERT(capacity - entries >= groups.size());
366
- D_ASSERT(group_hashes.GetType() == LogicalType::HASH);
356
+ group_hashes_v.Flatten(groups.size());
357
+ auto group_hashes = FlatVector::GetData<hash_t>(group_hashes_v);
367
358
 
368
- group_hashes.Flatten(groups.size());
369
- auto group_hashes_ptr = FlatVector::GetData<hash_t>(group_hashes);
359
+ addresses_v.Flatten(groups.size());
360
+ auto addresses = FlatVector::GetData<data_ptr_t>(addresses_v);
370
361
 
371
- D_ASSERT(state.ht_offsets.GetVectorType() == VectorType::FLAT_VECTOR);
372
- D_ASSERT(state.ht_offsets.GetType() == LogicalType::BIGINT);
373
-
374
- D_ASSERT(addresses.GetType() == LogicalType::POINTER);
375
- addresses.Flatten(groups.size());
376
- auto addresses_ptr = FlatVector::GetData<data_ptr_t>(addresses);
377
-
378
- // compute the entry in the table based on the hash using a modulo
362
+ // Compute the entry in the table based on the hash using a modulo,
379
363
  // and precompute the hash salts for faster comparison below
380
- D_ASSERT(state.hash_salts.GetType() == LogicalType::SMALLINT);
381
364
  auto ht_offsets_ptr = FlatVector::GetData<uint64_t>(state.ht_offsets);
382
365
  auto hash_salts_ptr = FlatVector::GetData<uint16_t>(state.hash_salts);
383
366
  for (idx_t r = 0; r < groups.size(); r++) {
384
- auto element = group_hashes_ptr[r];
367
+ auto element = group_hashes[r];
385
368
  D_ASSERT((element & bitmask) == (element % capacity));
386
369
  ht_offsets_ptr[r] = element & bitmask;
387
370
  hash_salts_ptr[r] = element >> hash_prefix_shift;
@@ -389,9 +372,7 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendSta
389
372
  // we start out with all entries [0, 1, 2, ..., groups.size()]
390
373
  const SelectionVector *sel_vector = FlatVector::IncrementalSelectionVector();
391
374
 
392
- idx_t remaining_entries = groups.size();
393
-
394
- // make a chunk that references the groups and the hashes
375
+ // Make a chunk that references the groups and the hashes and convert to unified format
395
376
  if (state.group_chunk.ColumnCount() == 0) {
396
377
  state.group_chunk.InitializeEmpty(layout.GetTypes());
397
378
  }
@@ -399,81 +380,101 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendSta
399
380
  for (idx_t grp_idx = 0; grp_idx < groups.ColumnCount(); grp_idx++) {
400
381
  state.group_chunk.data[grp_idx].Reference(groups.data[grp_idx]);
401
382
  }
402
- state.group_chunk.data[groups.ColumnCount()].Reference(group_hashes);
383
+ state.group_chunk.data[groups.ColumnCount()].Reference(group_hashes_v);
403
384
  state.group_chunk.SetCardinality(groups);
404
385
 
405
386
  // convert all vectors to unified format
387
+ if (!state.chunk_state_initialized) {
388
+ data_collection->InitializeAppend(state.chunk_state);
389
+ state.chunk_state_initialized = true;
390
+ }
391
+ TupleDataCollection::ToUnifiedFormat(state.chunk_state, state.group_chunk);
406
392
  if (!state.group_data) {
407
393
  state.group_data = unique_ptr<UnifiedVectorFormat[]>(new UnifiedVectorFormat[state.group_chunk.ColumnCount()]);
408
394
  }
409
- for (idx_t col_idx = 0; col_idx < state.group_chunk.ColumnCount(); col_idx++) {
410
- state.group_chunk.data[col_idx].ToUnifiedFormat(state.group_chunk.size(), state.group_data[col_idx]);
411
- }
395
+ TupleDataCollection::GetVectorData(state.chunk_state, state.group_data.get());
412
396
 
413
397
  idx_t new_group_count = 0;
398
+ idx_t remaining_entries = groups.size();
414
399
  while (remaining_entries > 0) {
415
400
  idx_t new_entry_count = 0;
416
401
  idx_t need_compare_count = 0;
417
402
  idx_t no_match_count = 0;
418
403
 
419
- // first figure out for each remaining whether or not it belongs to a full or empty group
404
+ // For each remaining entry, figure out whether or not it belongs to a full or empty group
420
405
  for (idx_t i = 0; i < remaining_entries; i++) {
421
406
  const idx_t index = sel_vector->get_index(i);
422
- const auto ht_entry_ptr = ((ENTRY *)this->hashes_hdl_ptr) + ht_offsets_ptr[index];
423
- if (ht_entry_ptr->page_nr == 0) { // we use page number 0 as a "unused marker"
424
- // cell is empty; setup the new entry
425
- if (payload_page_offset == tuples_per_block || payload_hds.empty()) {
426
- NewBlock();
427
- }
428
-
429
- auto entry_payload_ptr = payload_hds_ptrs.back() + (payload_page_offset * tuple_size);
407
+ auto &ht_entry = *(((ENTRY *)this->hashes_hdl_ptr) + ht_offsets_ptr[index]);
408
+ if (ht_entry.page_nr == 0) { // Cell is unoccupied (we use page number 0 as a "unused marker")
409
+ D_ASSERT(group_hashes[index] >> hash_prefix_shift <= NumericLimits<uint16_t>::Maximum());
410
+ D_ASSERT(payload_hds_ptrs.size() < NumericLimits<uint32_t>::Maximum());
430
411
 
431
- D_ASSERT(group_hashes_ptr[index] >> hash_prefix_shift <= NumericLimits<uint16_t>::Maximum());
432
- D_ASSERT(payload_page_offset < tuples_per_block);
433
- D_ASSERT(payload_hds.size() < NumericLimits<uint32_t>::Maximum());
434
- D_ASSERT(payload_page_offset + 1 < NumericLimits<uint16_t>::Maximum());
412
+ // Set page nr to 1 for now to mark it as occupied (will be corrected later) and set the salt
413
+ ht_entry.page_nr = 1;
414
+ ht_entry.salt = group_hashes[index] >> hash_prefix_shift;
435
415
 
436
- ht_entry_ptr->salt = group_hashes_ptr[index] >> hash_prefix_shift;
437
-
438
- // page numbers start at one so we can use 0 as empty flag
439
- // GetPtr undoes this
440
- ht_entry_ptr->page_nr = payload_hds.size();
441
- ht_entry_ptr->page_offset = payload_page_offset++;
442
-
443
- // update selection lists for outer loops
416
+ // Update selection lists for outer loops
444
417
  state.empty_vector.set_index(new_entry_count++, index);
445
418
  new_groups_out.set_index(new_group_count++, index);
446
- entries++;
447
-
448
- addresses_ptr[index] = entry_payload_ptr;
449
-
450
- } else {
451
- // cell is occupied: add to check list
452
- // only need to check if hash salt in ptr == prefix of hash in payload
453
- if (ht_entry_ptr->salt == hash_salts_ptr[index]) {
419
+ } else { // Cell is occupied: Compare salts
420
+ if (ht_entry.salt == hash_salts_ptr[index]) {
454
421
  state.group_compare_vector.set_index(need_compare_count++, index);
455
-
456
- auto page_ptr = payload_hds_ptrs[ht_entry_ptr->page_nr - 1];
457
- auto page_offset = ht_entry_ptr->page_offset * tuple_size;
458
- addresses_ptr[index] = page_ptr + page_offset;
459
-
460
422
  } else {
461
423
  state.no_match_vector.set_index(no_match_count++, index);
462
424
  }
463
425
  }
464
426
  }
465
427
 
466
- // for each of the locations that are empty, serialize the group columns to the locations
467
- RowOperations::Scatter(state.group_chunk, state.group_data.get(), layout, addresses, *string_heap,
468
- state.empty_vector, new_entry_count);
469
- RowOperations::InitializeStates(layout, addresses, state.empty_vector, new_entry_count);
428
+ if (new_entry_count != 0) {
429
+ // Append everything that belongs to an empty group
430
+ data_collection->AppendUnified(td_pin_state, state.chunk_state, state.group_chunk, state.empty_vector,
431
+ new_entry_count);
432
+ RowOperations::InitializeStates(layout, state.chunk_state.row_locations,
433
+ *FlatVector::IncrementalSelectionVector(), new_entry_count);
434
+
435
+ // Get the pointers to the (possibly) newly created blocks of the data collection
436
+ idx_t block_id = payload_hds_ptrs.empty() ? 0 : payload_hds_ptrs.size() - 1;
437
+ UpdateBlockPointers();
438
+ auto block_pointer = payload_hds_ptrs[block_id];
439
+ auto block_end = block_pointer + tuples_per_block * tuple_size;
440
+
441
+ // Set the page nrs/offsets in the 1st part of the HT now that the data has been appended
442
+ const auto row_locations = FlatVector::GetData<data_ptr_t>(state.chunk_state.row_locations);
443
+ for (idx_t new_entry_idx = 0; new_entry_idx < new_entry_count; new_entry_idx++) {
444
+ const auto &row_location = row_locations[new_entry_idx];
445
+ if (row_location > block_end || row_location < block_pointer) {
446
+ block_id++;
447
+ D_ASSERT(block_id < payload_hds_ptrs.size());
448
+ block_pointer = payload_hds_ptrs[block_id];
449
+ block_end = block_pointer + tuples_per_block * tuple_size;
450
+ }
451
+ D_ASSERT(row_location >= block_pointer && row_location < block_end);
452
+ D_ASSERT((row_location - block_pointer) % tuple_size == 0);
453
+ const auto index = state.empty_vector.get_index(new_entry_idx);
454
+ auto &ht_entry = *(((ENTRY *)this->hashes_hdl_ptr) + ht_offsets_ptr[index]);
455
+ ht_entry.page_nr = block_id + 1;
456
+ ht_entry.page_offset = (row_location - block_pointer) / tuple_size;
457
+ addresses[index] = row_location;
458
+ }
459
+ }
460
+
461
+ if (need_compare_count != 0) {
462
+ // Get the pointers to the rows that need to be compared
463
+ for (idx_t need_compare_idx = 0; need_compare_idx < need_compare_count; need_compare_idx++) {
464
+ const auto index = state.group_compare_vector.get_index(need_compare_idx);
465
+ const auto &ht_entry = *(((ENTRY *)this->hashes_hdl_ptr) + ht_offsets_ptr[index]);
466
+ auto page_ptr = payload_hds_ptrs[ht_entry.page_nr - 1];
467
+ auto page_offset = ht_entry.page_offset * tuple_size;
468
+ addresses[index] = page_ptr + page_offset;
469
+ }
470
470
 
471
- // now we have only the tuples remaining that might match to an existing group
472
- // start performing comparisons with each of the groups
473
- RowOperations::Match(state.group_chunk, state.group_data.get(), layout, addresses, predicates,
474
- state.group_compare_vector, need_compare_count, &state.no_match_vector, no_match_count);
471
+ // Perform group comparisons
472
+ RowOperations::Match(state.group_chunk, state.group_data.get(), layout, addresses_v, predicates,
473
+ state.group_compare_vector, need_compare_count, &state.no_match_vector,
474
+ no_match_count);
475
+ }
475
476
 
476
- // each of the entries that do not match we move them to the next entry in the HT
477
+ // Linear probing: each of the entries that do not match move to the next entry in the HT
477
478
  for (idx_t i = 0; i < no_match_count; i++) {
478
479
  idx_t index = state.no_match_vector.get_index(i);
479
480
  ht_offsets_ptr[index]++;
@@ -488,6 +489,17 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendSta
488
489
  return new_group_count;
489
490
  }
490
491
 
492
+ void GroupedAggregateHashTable::UpdateBlockPointers() {
493
+ for (const auto &id_and_handle : td_pin_state.row_handles) {
494
+ const auto &id = id_and_handle.first;
495
+ const auto &handle = id_and_handle.second;
496
+ if (payload_hds_ptrs.empty() || id > payload_hds_ptrs.size() - 1) {
497
+ payload_hds_ptrs.resize(id + 1);
498
+ }
499
+ payload_hds_ptrs[id] = handle.Ptr();
500
+ }
501
+ }
502
+
491
503
  // this is to support distinct aggregations where we need to record whether we
492
504
  // have already seen a value for a group
493
505
  idx_t GroupedAggregateHashTable::FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups,
@@ -517,37 +529,44 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroups(AggregateHTAppendState &stat
517
529
  }
518
530
 
519
531
  struct FlushMoveState {
520
- FlushMoveState(Allocator &allocator, RowLayout &layout)
521
- : new_groups(STANDARD_VECTOR_SIZE), group_addresses(LogicalType::POINTER),
532
+ explicit FlushMoveState(TupleDataCollection &collection_p)
533
+ : collection(collection_p), hashes(LogicalType::HASH), group_addresses(LogicalType::POINTER),
522
534
  new_groups_sel(STANDARD_VECTOR_SIZE) {
523
- vector<LogicalType> group_types(layout.GetTypes().begin(), layout.GetTypes().end() - 1);
524
- groups.Initialize(allocator, group_types);
535
+ const auto &layout = collection.GetLayout();
536
+ vector<column_t> column_ids;
537
+ column_ids.reserve(layout.ColumnCount() - 1);
538
+ for (idx_t col_idx = 0; col_idx < layout.ColumnCount() - 1; col_idx++) {
539
+ column_ids.emplace_back(col_idx);
540
+ }
541
+ // FIXME DESTROY_AFTER_DONE if we make it possible to pass a selection vector to RowOperations::DestroyStates?
542
+ collection.InitializeScan(scan_state, column_ids, TupleDataPinProperties::UNPIN_AFTER_DONE);
543
+ collection.InitializeScanChunk(scan_state, groups);
544
+ hash_col_idx = layout.ColumnCount() - 1;
525
545
  }
526
546
 
547
+ bool Scan();
548
+
549
+ TupleDataCollection &collection;
550
+ TupleDataScanState scan_state;
527
551
  DataChunk groups;
528
- SelectionVector new_groups;
552
+
553
+ idx_t hash_col_idx;
554
+ Vector hashes;
555
+
556
+ AggregateHTAppendState append_state;
529
557
  Vector group_addresses;
530
558
  SelectionVector new_groups_sel;
531
559
  };
532
560
 
533
- void GroupedAggregateHashTable::FlushMove(FlushMoveState &state, Vector &source_addresses, Vector &source_hashes,
534
- idx_t count) {
535
- D_ASSERT(source_addresses.GetType() == LogicalType::POINTER);
536
- D_ASSERT(source_hashes.GetType() == LogicalType::HASH);
537
-
538
- state.groups.Reset();
539
- state.groups.SetCardinality(count);
540
- for (idx_t col_no = 0; col_no < state.groups.ColumnCount(); col_no++) {
541
- auto &column = state.groups.data[col_no];
542
- RowOperations::Gather(source_addresses, *FlatVector::IncrementalSelectionVector(), column,
543
- *FlatVector::IncrementalSelectionVector(), count, layout, col_no);
561
+ bool FlushMoveState::Scan() {
562
+ if (collection.Scan(scan_state, groups)) {
563
+ collection.Gather(scan_state.chunk_state.row_locations, *FlatVector::IncrementalSelectionVector(),
564
+ groups.size(), hash_col_idx, hashes, *FlatVector::IncrementalSelectionVector());
565
+ return true;
544
566
  }
545
567
 
546
- AggregateHTAppendState append_state;
547
- FindOrCreateGroups(append_state, state.groups, source_hashes, state.group_addresses, state.new_groups_sel);
548
-
549
- RowOperationsState row_state(aggregate_allocator.GetAllocator());
550
- RowOperations::CombineStates(row_state, layout, source_addresses, state.group_addresses, count);
568
+ collection.FinalizePinState(scan_state.pin_state);
569
+ return false;
551
570
  }
552
571
 
553
572
  void GroupedAggregateHashTable::Combine(GroupedAggregateHashTable &other) {
@@ -556,127 +575,67 @@ void GroupedAggregateHashTable::Combine(GroupedAggregateHashTable &other) {
556
575
  D_ASSERT(other.layout.GetAggrWidth() == layout.GetAggrWidth());
557
576
  D_ASSERT(other.layout.GetDataWidth() == layout.GetDataWidth());
558
577
  D_ASSERT(other.layout.GetRowWidth() == layout.GetRowWidth());
559
- D_ASSERT(other.tuples_per_block == tuples_per_block);
560
578
 
561
- if (other.entries == 0) {
579
+ if (other.Count() == 0) {
562
580
  return;
563
581
  }
564
582
 
565
- Vector addresses(LogicalType::POINTER);
566
- auto addresses_ptr = FlatVector::GetData<data_ptr_t>(addresses);
567
-
568
- Vector hashes(LogicalType::HASH);
569
- auto hashes_ptr = FlatVector::GetData<hash_t>(hashes);
570
-
571
- idx_t group_idx = 0;
572
-
573
- FlushMoveState state(allocator, layout);
574
- other.PayloadApply([&](idx_t page_nr, idx_t page_offset, data_ptr_t ptr) {
575
- auto hash = Load<hash_t>(ptr + hash_offset);
583
+ FlushMoveState state(*other.data_collection);
584
+ RowOperationsState row_state(aggregate_allocator->GetAllocator());
585
+ while (state.Scan()) {
586
+ FindOrCreateGroups(state.append_state, state.groups, state.hashes, state.group_addresses, state.new_groups_sel);
587
+ RowOperations::CombineStates(row_state, layout, state.scan_state.chunk_state.row_locations,
588
+ state.group_addresses, state.groups.size());
589
+ }
576
590
 
577
- hashes_ptr[group_idx] = hash;
578
- addresses_ptr[group_idx] = ptr;
579
- group_idx++;
580
- if (group_idx == STANDARD_VECTOR_SIZE) {
581
- FlushMove(state, addresses, hashes, group_idx);
582
- group_idx = 0;
583
- }
584
- });
585
- FlushMove(state, addresses, hashes, group_idx);
586
- string_heap->Merge(*other.string_heap);
587
591
  Verify();
588
592
  }
589
593
 
590
- struct PartitionInfo {
591
- PartitionInfo() : addresses(LogicalType::POINTER), hashes(LogicalType::HASH), group_count(0) {
592
- addresses_ptr = FlatVector::GetData<data_ptr_t>(addresses);
593
- hashes_ptr = FlatVector::GetData<hash_t>(hashes);
594
- };
595
- Vector addresses;
596
- Vector hashes;
597
- idx_t group_count;
598
- data_ptr_t *addresses_ptr;
599
- hash_t *hashes_ptr;
600
- };
601
-
602
- void GroupedAggregateHashTable::Partition(vector<GroupedAggregateHashTable *> &partition_hts, hash_t mask,
603
- idx_t shift) {
604
- D_ASSERT(partition_hts.size() > 1);
605
- vector<PartitionInfo> partition_info(partition_hts.size());
606
-
607
- FlushMoveState state(allocator, layout);
608
- PayloadApply([&](idx_t page_nr, idx_t page_offset, data_ptr_t ptr) {
609
- auto hash = Load<hash_t>(ptr + hash_offset);
610
-
611
- idx_t partition = (hash & mask) >> shift;
612
- D_ASSERT(partition < partition_hts.size());
613
-
614
- auto &info = partition_info[partition];
615
-
616
- info.hashes_ptr[info.group_count] = hash;
617
- info.addresses_ptr[info.group_count] = ptr;
618
- info.group_count++;
619
- if (info.group_count == STANDARD_VECTOR_SIZE) {
620
- D_ASSERT(partition_hts[partition]);
621
- partition_hts[partition]->FlushMove(state, info.addresses, info.hashes, info.group_count);
622
- info.group_count = 0;
623
- }
624
- });
625
-
626
- idx_t info_idx = 0;
627
- idx_t total_count = 0;
628
- for (auto &partition_entry : partition_hts) {
629
- auto &info = partition_info[info_idx++];
630
- partition_entry->FlushMove(state, info.addresses, info.hashes, info.group_count);
631
-
632
- partition_entry->string_heap->Merge(*string_heap);
633
- partition_entry->Verify();
634
- total_count += partition_entry->Size();
594
+ void GroupedAggregateHashTable::Partition(vector<GroupedAggregateHashTable *> &partition_hts, idx_t radix_bits) {
595
+ const auto num_partitions = RadixPartitioning::NumberOfPartitions(radix_bits);
596
+ D_ASSERT(partition_hts.size() == num_partitions);
597
+
598
+ // Partition the data
599
+ auto partitioned_data =
600
+ make_uniq<RadixPartitionedTupleData>(buffer_manager, layout, radix_bits, layout.ColumnCount() - 1);
601
+ partitioned_data->Partition(*data_collection, TupleDataPinProperties::KEEP_EVERYTHING_PINNED);
602
+ D_ASSERT(partitioned_data->GetPartitions().size() == num_partitions);
603
+
604
+ // Move the partitioned data collections to the partitioned hash tables and initialize the 1st part of the HT
605
+ auto &partitions = partitioned_data->GetPartitions();
606
+ for (idx_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) {
607
+ auto &partition_ht = *partition_hts[partition_idx];
608
+ partition_ht.data_collection = std::move(partitions[partition_idx]);
609
+ partition_ht.aggregate_allocator = aggregate_allocator;
610
+ partition_ht.InitializeFirstPart();
611
+ partition_ht.Verify();
635
612
  }
636
- (void)total_count;
637
- D_ASSERT(total_count == entries);
638
613
  }
639
614
 
640
- idx_t GroupedAggregateHashTable::Scan(AggregateHTScanState &scan_state, DataChunk &result) {
641
- idx_t this_n;
642
- Vector addresses(LogicalType::POINTER);
643
- auto data_pointers = FlatVector::GetData<data_ptr_t>(addresses);
644
- {
645
- lock_guard<mutex> l(scan_state.lock);
646
- if (scan_state.scan_position >= entries) {
647
- return 0;
648
- }
649
- auto remaining = entries - scan_state.scan_position;
650
- this_n = MinValue((idx_t)STANDARD_VECTOR_SIZE, remaining);
651
-
652
- auto chunk_idx = scan_state.scan_position / tuples_per_block;
653
- auto chunk_offset = (scan_state.scan_position % tuples_per_block) * tuple_size;
654
- D_ASSERT(chunk_offset + tuple_size <= Storage::BLOCK_SIZE);
655
-
656
- auto read_ptr = payload_hds_ptrs[chunk_idx++];
657
- for (idx_t i = 0; i < this_n; i++) {
658
- data_pointers[i] = read_ptr + chunk_offset;
659
- chunk_offset += tuple_size;
660
- if (chunk_offset >= tuples_per_block * tuple_size) {
661
- read_ptr = payload_hds_ptrs[chunk_idx++];
662
- chunk_offset = 0;
663
- }
664
- }
665
- scan_state.scan_position += this_n;
615
+ void GroupedAggregateHashTable::InitializeFirstPart() {
616
+ data_collection->GetBlockPointers(payload_hds_ptrs);
617
+ auto size = MaxValue<idx_t>(NextPowerOfTwo(Count() * 2L), capacity);
618
+ switch (entry_type) {
619
+ case HtEntryType::HT_WIDTH_64:
620
+ Resize<aggr_ht_entry_64>(size);
621
+ break;
622
+ case HtEntryType::HT_WIDTH_32:
623
+ Resize<aggr_ht_entry_32>(size);
624
+ break;
625
+ default:
626
+ throw InternalException("Unknown HT entry width");
666
627
  }
628
+ }
667
629
 
668
- result.SetCardinality(this_n);
669
- // fetch the group columns (ignoring the final hash column
630
+ idx_t GroupedAggregateHashTable::Scan(TupleDataParallelScanState &gstate, TupleDataLocalScanState &lstate,
631
+ DataChunk &result) {
632
+ data_collection->Scan(gstate, lstate, result);
633
+
634
+ RowOperationsState row_state(aggregate_allocator->GetAllocator());
670
635
  const auto group_cols = layout.ColumnCount() - 1;
671
- for (idx_t col_no = 0; col_no < group_cols; col_no++) {
672
- auto &column = result.data[col_no];
673
- RowOperations::Gather(addresses, *FlatVector::IncrementalSelectionVector(), column,
674
- *FlatVector::IncrementalSelectionVector(), result.size(), layout, col_no);
675
- }
636
+ RowOperations::FinalizeStates(row_state, layout, lstate.chunk_state.row_locations, result, group_cols);
676
637
 
677
- RowOperationsState row_state(aggregate_allocator.GetAllocator());
678
- RowOperations::FinalizeStates(row_state, layout, addresses, result, group_cols);
679
- return this_n;
638
+ return result.size();
680
639
  }
681
640
 
682
641
  void GroupedAggregateHashTable::Finalize() {
@@ -684,8 +643,11 @@ void GroupedAggregateHashTable::Finalize() {
684
643
  return;
685
644
  }
686
645
 
687
- // early release hashes, not needed for partition/scan
646
+ // Early release hashes (not needed for partition/scan) and data collection (will be pinned again when scanning)
688
647
  hashes_hdl.Destroy();
648
+ data_collection->FinalizePinState(td_pin_state);
649
+ data_collection->Unpin();
650
+
689
651
  is_finalized = true;
690
652
  }
691
653