duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -1,49 +1,28 @@
 #include "duckdb/execution/radix_partitioned_hashtable.hpp"
 
 #include "duckdb/common/radix_partitioning.hpp"
+#include "duckdb/common/row_operations/row_operations.hpp"
 #include "duckdb/common/types/row/tuple_data_collection.hpp"
+#include "duckdb/common/types/row/tuple_data_iterator.hpp"
+#include "duckdb/execution/aggregate_hashtable.hpp"
 #include "duckdb/execution/executor.hpp"
 #include "duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp"
+#include "duckdb/main/config.hpp"
 #include "duckdb/parallel/event.hpp"
-#include "duckdb/parallel/task_scheduler.hpp"
 #include "duckdb/planner/expression/bound_reference_expression.hpp"
 
 namespace duckdb {
 
-// compute the GROUPING values
-// for each parameter to the GROUPING clause, we check if the hash table groups on this particular group
-// if it does, we return 0, otherwise we return 1
-// we then use bitshifts to combine these values
-void RadixPartitionedHashTable::SetGroupingValues() {
-	auto &grouping_functions = op.GetGroupingFunctions();
-	for (auto &grouping : grouping_functions) {
-		int64_t grouping_value = 0;
-		D_ASSERT(grouping.size() < sizeof(int64_t) * 8);
-		for (idx_t i = 0; i < grouping.size(); i++) {
-			if (grouping_set.find(grouping[i]) == grouping_set.end()) {
-				// we don't group on this value!
-				grouping_value += (int64_t)1 << (grouping.size() - (i + 1));
-			}
-		}
-		grouping_values.push_back(Value::BIGINT(grouping_value));
-	}
-}
-
 RadixPartitionedHashTable::RadixPartitionedHashTable(GroupingSet &grouping_set_p, const GroupedAggregateData &op_p)
     : grouping_set(grouping_set_p), op(op_p) {
-
 	auto groups_count = op.GroupCount();
 	for (idx_t i = 0; i < groups_count; i++) {
 		if (grouping_set.find(i) == grouping_set.end()) {
 			null_groups.push_back(i);
 		}
 	}
-
-	// 10000 seems like a good compromise here
-	radix_limit = 10000;
-
 	if (grouping_set.empty()) {
-		// fake a single group with a constant value for aggregation without groups
+		// Fake a single group with a constant value for aggregation without groups
 		group_types.emplace_back(LogicalType::TINYINT);
 	}
 	for (auto &entry : grouping_set) {
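
The `SetGroupingValues` function removed above is re-added further down in this diff: for each argument of a `GROUPING(...)` call it emits one bit, 0 if the hash table groups on that column and 1 if it does not, with the first argument in the most significant position. A minimal standalone sketch of that bit arithmetic, using plain C++ containers and a hypothetical two-column example rather than DuckDB's internal types:

```cpp
#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

// Mirror of the bitshift logic in RadixPartitionedHashTable::SetGroupingValues:
// for GROUPING(c0, c1, ..., c{n-1}), bit (n-1-i) is set iff column ci is NOT grouped on.
int64_t GroupingValue(const std::vector<size_t> &grouping_args, const std::set<size_t> &grouping_set) {
	int64_t grouping_value = 0;
	for (size_t i = 0; i < grouping_args.size(); i++) {
		if (grouping_set.find(grouping_args[i]) == grouping_set.end()) {
			// This column is not grouped on -> contribute a 1 bit
			grouping_value += int64_t(1) << (grouping_args.size() - (i + 1));
		}
	}
	return grouping_value;
}

int main() {
	// Hypothetical example: the hash table groups on column 0 only.
	std::set<size_t> grouping_set {0};
	// GROUPING(col0, col1): col0 is grouped (bit 1 = 0), col1 is not (bit 0 = 1) -> 0b01
	std::cout << GroupingValue({0, 1}, grouping_set) << "\n"; // prints 1
	// GROUPING(col1, col0): -> 0b10
	std::cout << GroupingValue({1, 0}, grouping_set) << "\n"; // prints 2
}
```
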
@@ -51,79 +30,279 @@ RadixPartitionedHashTable::RadixPartitionedHashTable(GroupingSet &grouping_set_p
 		group_types.push_back(op.group_types[entry]);
 	}
 	SetGroupingValues();
+
+	auto group_types_copy = group_types;
+	group_types_copy.emplace_back(LogicalType::HASH);
+	layout.Initialize(std::move(group_types_copy), AggregateObject::CreateAggregateObjects(op.bindings));
+}
+
+void RadixPartitionedHashTable::SetGroupingValues() {
+	// Compute the GROUPING values:
+	// For each parameter to the GROUPING clause, we check if the hash table groups on this particular group
+	// If it does, we return 0, otherwise we return 1
+	// We then use bitshifts to combine these values
+	auto &grouping_functions = op.GetGroupingFunctions();
+	for (auto &grouping : grouping_functions) {
+		int64_t grouping_value = 0;
+		D_ASSERT(grouping.size() < sizeof(int64_t) * 8);
+		for (idx_t i = 0; i < grouping.size(); i++) {
+			if (grouping_set.find(grouping[i]) == grouping_set.end()) {
+				// We don't group on this value!
+				grouping_value += (int64_t)1 << (grouping.size() - (i + 1));
+			}
+		}
+		grouping_values.push_back(Value::BIGINT(grouping_value));
+	}
+}
+
+const TupleDataLayout &RadixPartitionedHashTable::GetLayout() const {
+	return layout;
+}
+
+unique_ptr<GroupedAggregateHashTable> RadixPartitionedHashTable::CreateHT(ClientContext &context, const idx_t capacity,
+                                                                          const idx_t radix_bits) const {
+	return make_uniq<GroupedAggregateHashTable>(context, BufferAllocator::Get(context), group_types, op.payload_types,
+	                                            op.bindings, capacity, radix_bits);
 }
 
 //===--------------------------------------------------------------------===//
 // Sink
 //===--------------------------------------------------------------------===//
-class RadixHTGlobalState : public GlobalSinkState {
-	constexpr const static idx_t MAX_RADIX_PARTITIONS = 32;
+struct AggregatePartition {
+	explicit AggregatePartition(unique_ptr<TupleDataCollection> data_p) : data(std::move(data_p)), finalized(false) {
+	}
+	unique_ptr<TupleDataCollection> data;
+	atomic<bool> finalized;
+};
 
+class RadixHTGlobalSinkState;
+
+struct RadixHTConfig {
 public:
-	explicit RadixHTGlobalState(ClientContext &context)
-	    : is_empty(true), multi_scan(true), partitioned(false),
-	      partition_info(make_uniq<RadixPartitionInfo>(
-	          MinValue<idx_t>(MAX_RADIX_PARTITIONS, TaskScheduler::GetScheduler(context).NumberOfThreads()))) {
-	}
+	explicit RadixHTConfig(ClientContext &context, RadixHTGlobalSinkState &sink);
 
-	vector<unique_ptr<PartitionableHashTable>> intermediate_hts;
-	vector<shared_ptr<GroupedAggregateHashTable>> finalized_hts;
+	void SetRadixBits(idx_t radix_bits_p);
+	bool SetRadixBitsToExternal();
+	idx_t GetRadixBits() const;
 
-	//! Whether or not any tuples were added to the HT
-	bool is_empty;
-	//! Whether or not the hash table should be scannable multiple times
-	bool multi_scan;
-	//! The lock for updating the global aggregate state
-	mutex lock;
-	//! Whether or not any thread has crossed the partitioning threshold
-	atomic<bool> partitioned;
-
-	bool is_finalized = false;
-	bool is_partitioned = false;
-
-	unique_ptr<RadixPartitionInfo> partition_info;
-	AggregateHTAppendState append_state;
-
-	//! Repartitioned HT info
-	bool repartitioned = false;
-	idx_t repartition_tasks_per_partition;
-	vector<vector<unique_ptr<PartitionableHashTable>>> repartition_tasks;
-	unique_array<atomic<idx_t>> repartition_tasks_assigned;
-	unique_array<atomic<idx_t>> repartition_tasks_done;
-	unique_array<atomic<bool>> finalize_assigned;
+private:
+	void SetRadixBitsInternal(const idx_t radix_bits_p, bool external);
+	static idx_t InitialSinkRadixBits(ClientContext &context);
+	static idx_t MaximumSinkRadixBits(ClientContext &context);
+	static idx_t ExternalRadixBits(const idx_t &maximum_sink_radix_bits_p);
+	static idx_t SinkCapacity(ClientContext &context);
+
+private:
+	//! Assume (1 << 15) = 32KB L1 cache per core, divided by two because hyperthreading
+	static constexpr const idx_t L1_CACHE_SIZE = 32768 / 2;
+	//! Assume (1 << 20) = 1MB L2 cache per core, divided by two because hyperthreading
+	static constexpr const idx_t L2_CACHE_SIZE = 1048576 / 2;
+	//! Assume (1 << 20) + (1 << 19) = 1.5MB L3 cache per core (shared), divided by two because hyperthreading
+	static constexpr const idx_t L3_CACHE_SIZE = 1572864 / 2;
+
+	//! Sink radix bits to initialize with
+	static constexpr const idx_t MAXIMUM_INITIAL_SINK_RADIX_BITS = 3;
+	//! Maximum Sink radix bits (independent of threads)
+	static constexpr const idx_t MAXIMUM_FINAL_SINK_RADIX_BITS = 7;
+	//! By how many radix bits to increment if we go external
+	static constexpr const idx_t EXTERNAL_RADIX_BITS_INCREMENT = 3;
+
+	//! The global sink state
+	RadixHTGlobalSinkState &sink;
+	//! Current thread-global sink radix bits
+	atomic<idx_t> sink_radix_bits;
+	//! Maximum Sink radix bits (set based on number of threads)
+	const idx_t maximum_sink_radix_bits;
+	//! Radix bits if we go external
+	const idx_t external_radix_bits;
+
+public:
+	//! Capacity of HTs during the Sink
+	const idx_t sink_capacity;
+
+	//! If we fill this many blocks per partition, we trigger a repartition
+	static constexpr const double BLOCK_FILL_FACTOR = 1.8;
+	//! By how many bits to repartition if a repartition is triggered
+	static constexpr const idx_t REPARTITION_RADIX_BITS = 2;
 };
 
-class RadixHTLocalState : public LocalSinkState {
+class RadixHTGlobalSinkState : public GlobalSinkState {
+public:
+	RadixHTGlobalSinkState(ClientContext &context, const RadixPartitionedHashTable &radix_ht);
+
+	//! Destroys aggregate states (if multi-scan)
+	~RadixHTGlobalSinkState() override;
+	void Destroy();
+
 public:
-	explicit RadixHTLocalState(const RadixPartitionedHashTable &ht) : total_groups(0), is_empty(true) {
-		// if there are no groups we create a fake group so everything has the same group
-		group_chunk.InitializeEmpty(ht.group_types);
-		if (ht.grouping_set.empty()) {
-			group_chunk.data[0].Reference(Value::TINYINT(42));
+	//! The radix HT
+	const RadixPartitionedHashTable &radix_ht;
+	//! Config for partitioning
+	RadixHTConfig config;
+
+	//! Whether we've called Finalize
+	bool finalized;
+	//! Whether we are doing an external aggregation
+	atomic<bool> external;
+	//! Threads that have called Sink
+	atomic<idx_t> active_threads;
+	//! If any thread has called combine
+	atomic<bool> any_combined;
+
+	//! Lock for uncombined_data/stored_allocators
+	mutex lock;
+	//! Uncombined partitioned data that will be put into the AggregatePartitions
+	unique_ptr<PartitionedTupleData> uncombined_data;
+	//! Allocators used during the Sink/Finalize
+	vector<shared_ptr<ArenaAllocator>> stored_allocators;
+
+	//! Partitions that are finalized during GetData
+	vector<unique_ptr<AggregatePartition>> partitions;
+
+	//! For synchronizing finalize tasks
+	atomic<idx_t> finalize_idx;
+
+	//! Pin properties when scanning
+	TupleDataPinProperties scan_pin_properties;
+	//! Total count before combining
+	idx_t count_before_combining;
+};
+
+RadixHTGlobalSinkState::RadixHTGlobalSinkState(ClientContext &context, const RadixPartitionedHashTable &radix_ht_p)
+    : radix_ht(radix_ht_p), config(context, *this), finalized(false), external(false), active_threads(0),
+      any_combined(false), finalize_idx(0), scan_pin_properties(TupleDataPinProperties::DESTROY_AFTER_DONE),
+      count_before_combining(0) {
+}
+
+RadixHTGlobalSinkState::~RadixHTGlobalSinkState() {
+	Destroy();
+}
+
+// LCOV_EXCL_START
+void RadixHTGlobalSinkState::Destroy() {
+	if (scan_pin_properties == TupleDataPinProperties::DESTROY_AFTER_DONE || count_before_combining == 0 ||
+	    partitions.empty()) {
+		// Already destroyed / empty
+		return;
+	}
+
+	TupleDataLayout layout = partitions[0]->data->GetLayout().Copy();
+	if (!layout.HasDestructor()) {
+		return; // No destructors, exit
+	}
+
+	// There are aggregates with destructors: Call the destructor for each of the aggregates
+	RowOperationsState row_state(*stored_allocators.back());
+	for (auto &partition : partitions) {
+		auto &data_collection = *partition->data;
+		if (data_collection.Count() == 0) {
+			continue;
 		}
+		TupleDataChunkIterator iterator(data_collection, TupleDataPinProperties::DESTROY_AFTER_DONE, false);
+		auto &row_locations = iterator.GetChunkState().row_locations;
+		do {
+			RowOperations::DestroyStates(row_state, layout, row_locations, iterator.GetCurrentChunkCount());
+		} while (iterator.Next());
+		data_collection.Reset();
 	}
+}
+// LCOV_EXCL_STOP
 
+RadixHTConfig::RadixHTConfig(ClientContext &context, RadixHTGlobalSinkState &sink_p)
+    : sink(sink_p), sink_radix_bits(InitialSinkRadixBits(context)),
+      maximum_sink_radix_bits(MaximumSinkRadixBits(context)),
+      external_radix_bits(ExternalRadixBits(maximum_sink_radix_bits)), sink_capacity(SinkCapacity(context)) {
+}
+
+void RadixHTConfig::SetRadixBits(idx_t radix_bits_p) {
+	SetRadixBitsInternal(MinValue(radix_bits_p, maximum_sink_radix_bits), false);
+}
+
+bool RadixHTConfig::SetRadixBitsToExternal() {
+	SetRadixBitsInternal(external_radix_bits, true);
+	return sink.external;
+}
+
+idx_t RadixHTConfig::GetRadixBits() const {
+	return sink_radix_bits;
+}
+
+void RadixHTConfig::SetRadixBitsInternal(const idx_t radix_bits_p, bool external) {
+	if (sink_radix_bits >= radix_bits_p || sink.any_combined) {
+		return;
+	}
+
+	lock_guard<mutex> guard(sink.lock);
+	if (sink_radix_bits >= radix_bits_p || sink.any_combined) {
+		return;
+	}
+
+	if (external) {
+		sink.external = true;
+	}
+	sink_radix_bits = radix_bits_p;
+	return;
+}
+
+idx_t RadixHTConfig::InitialSinkRadixBits(ClientContext &context) {
+	const idx_t active_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
+	return MinValue(RadixPartitioning::RadixBits(NextPowerOfTwo(active_threads)), MAXIMUM_INITIAL_SINK_RADIX_BITS);
+}
+
+idx_t RadixHTConfig::MaximumSinkRadixBits(ClientContext &context) {
+	const idx_t active_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
+	return MinValue(RadixPartitioning::RadixBits(NextPowerOfTwo(active_threads)), MAXIMUM_FINAL_SINK_RADIX_BITS);
+}
+
+idx_t RadixHTConfig::ExternalRadixBits(const idx_t &maximum_sink_radix_bits_p) {
+	return MinValue(maximum_sink_radix_bits_p + EXTERNAL_RADIX_BITS_INCREMENT, MAXIMUM_FINAL_SINK_RADIX_BITS);
+}
+
+idx_t RadixHTConfig::SinkCapacity(ClientContext &context) {
+	// Get active and maximum number of threads
+	const idx_t active_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
+	const auto max_threads = DBConfig::GetSystemMaxThreads(FileSystem::GetFileSystem(context));
+
+	// Compute cache size per active thread (assuming cache is shared)
+	const auto total_shared_cache_size = max_threads * L3_CACHE_SIZE;
+	const auto cache_per_active_thread = L1_CACHE_SIZE + L2_CACHE_SIZE + total_shared_cache_size / active_threads;
+
+	// Divide cache per active thread by entry size, round up to next power of two, to get capacity
+	const auto size_per_entry = sizeof(aggr_ht_entry_t) * GroupedAggregateHashTable::LOAD_FACTOR;
+	const auto capacity = NextPowerOfTwo(cache_per_active_thread / size_per_entry);
+
+	// Capacity must be at least the minimum capacity
+	return MaxValue<idx_t>(capacity, GroupedAggregateHashTable::InitialCapacity());
+}
+
+class RadixHTLocalSinkState : public LocalSinkState {
+public:
+	RadixHTLocalSinkState(ClientContext &context, const RadixPartitionedHashTable &radix_ht);
+
+public:
+	//! Thread-local HT that is re-used after abandoning
+	unique_ptr<GroupedAggregateHashTable> ht;
+	//! Chunk with group columns
 	DataChunk group_chunk;
-	//! The aggregate HT
-	unique_ptr<PartitionableHashTable> ht;
-	//! The total number of groups found by this thread
-	idx_t total_groups;
 
-	//! Whether or not any tuples were added to the HT
-	bool is_empty;
+	//! Data that is abandoned ends up here (only if we're doing external aggregation)
+	unique_ptr<PartitionedTupleData> abandoned_data;
 };
 
-void RadixPartitionedHashTable::SetMultiScan(GlobalSinkState &state) {
-	auto &gstate = state.Cast<RadixHTGlobalState>();
-	gstate.multi_scan = true;
+RadixHTLocalSinkState::RadixHTLocalSinkState(ClientContext &, const RadixPartitionedHashTable &radix_ht) {
+	// If there are no groups we create a fake group so everything has the same group
+	group_chunk.InitializeEmpty(radix_ht.group_types);
+	if (radix_ht.grouping_set.empty()) {
+		group_chunk.data[0].Reference(Value::TINYINT(42));
+	}
 }
 
 unique_ptr<GlobalSinkState> RadixPartitionedHashTable::GetGlobalSinkState(ClientContext &context) const {
-	return make_uniq<RadixHTGlobalState>(context);
+	return make_uniq<RadixHTGlobalSinkState>(context, *this);
 }
 
 unique_ptr<LocalSinkState> RadixPartitionedHashTable::GetLocalSinkState(ExecutionContext &context) const {
-	return make_uniq<RadixHTLocalState>(*this);
+	return make_uniq<RadixHTLocalSinkState>(context.client, *this);
 }
 
 void RadixPartitionedHashTable::PopulateGroupChunk(DataChunk &group_chunk, DataChunk &input_chunk) const {
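
The new `RadixHTConfig::SinkCapacity` above sizes each thread-local hash table so that its pointer table stays roughly cache-resident: per-thread cache is estimated as L1 + L2 + (max_threads * L3) / active_threads, divided by the bytes one entry occupies at the target load factor, then rounded up to a power of two. A self-contained sketch of the same arithmetic; the 8-byte entry size, 1.5 load factor, and 2048 minimum capacity are illustrative assumptions, not values stated in this diff:

```cpp
#include <cstdint>
#include <iostream>

// Assumed per-core cache sizes, halved for hyperthreading (as in RadixHTConfig)
static constexpr uint64_t L1_CACHE_SIZE = 32768 / 2;
static constexpr uint64_t L2_CACHE_SIZE = 1048576 / 2;
static constexpr uint64_t L3_CACHE_SIZE = 1572864 / 2;

// Round up to the next power of two (returns v itself if v is already one)
static uint64_t NextPowerOfTwo(uint64_t v) {
	uint64_t result = 1;
	while (result < v) {
		result *= 2;
	}
	return result;
}

// Same shape as RadixHTConfig::SinkCapacity; entry size, load factor, and
// minimum capacity are illustrative assumptions, not the exact DuckDB constants.
static uint64_t SinkCapacity(uint64_t active_threads, uint64_t max_threads) {
	const uint64_t entry_size = 8;          // assumed sizeof(aggr_ht_entry_t)
	const double load_factor = 1.5;         // assumed LOAD_FACTOR
	const uint64_t minimum_capacity = 2048; // assumed InitialCapacity()

	const uint64_t total_shared_cache = max_threads * L3_CACHE_SIZE;
	const uint64_t cache_per_thread = L1_CACHE_SIZE + L2_CACHE_SIZE + total_shared_cache / active_threads;
	const auto size_per_entry = uint64_t(entry_size * load_factor);
	const uint64_t capacity = NextPowerOfTwo(cache_per_thread / size_per_entry);
	return capacity < minimum_capacity ? minimum_capacity : capacity;
}

int main() {
	// E.g., 8 active threads on an 8-core machine:
	// cache/thread = 16KiB + 512KiB + 768KiB = 1296KiB; 1327104B / 12B = 110592 -> 131072
	std::cout << SinkCapacity(8, 8) << "\n";
}
```
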
@@ -141,507 +320,448 @@ void RadixPartitionedHashTable::PopulateGroupChunk(DataChunk &group_chunk, DataC
141
320
  group_chunk.Verify();
142
321
  }
143
322
 
144
- void RadixPartitionedHashTable::Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input,
145
- DataChunk &payload_input, const unsafe_vector<idx_t> &filter) const {
146
- auto &llstate = input.local_state.Cast<RadixHTLocalState>();
147
- auto &gstate = input.global_state.Cast<RadixHTGlobalState>();
148
- D_ASSERT(!gstate.is_finalized);
149
-
150
- DataChunk &group_chunk = llstate.group_chunk;
151
- PopulateGroupChunk(group_chunk, chunk);
323
+ bool MaybeRepartition(ClientContext &context, RadixHTGlobalSinkState &gstate, RadixHTLocalSinkState &lstate) {
324
+ auto &config = gstate.config;
325
+ auto &ht = *lstate.ht;
326
+ auto &partitioned_data = ht.GetPartitionedData();
327
+
328
+ // Check if we're approaching the memory limit
329
+ const idx_t n_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
330
+ const idx_t limit = BufferManager::GetBufferManager(context).GetMaxMemory();
331
+ const idx_t thread_limit = 0.6 * limit / n_threads;
332
+ if (ht.GetPartitionedData()->SizeInBytes() > thread_limit || context.config.force_external) {
333
+ if (gstate.config.SetRadixBitsToExternal()) {
334
+ // We're approaching the memory limit, unpin the data
335
+ if (!lstate.abandoned_data) {
336
+ lstate.abandoned_data = make_uniq<RadixPartitionedTupleData>(
337
+ BufferManager::GetBufferManager(context), gstate.radix_ht.GetLayout(), config.GetRadixBits(),
338
+ gstate.radix_ht.GetLayout().ColumnCount() - 1);
339
+ }
152
340
 
153
- // if we have non-combinable aggregates (e.g. string_agg) we cannot keep parallel hash
154
- // tables
155
- if (ForceSingleHT(input.global_state)) {
156
- lock_guard<mutex> glock(gstate.lock);
157
- gstate.is_empty = gstate.is_empty && group_chunk.size() == 0;
158
- if (gstate.finalized_hts.empty()) {
159
- // Create a finalized ht in the global state, that we can populate
160
- gstate.finalized_hts.push_back(make_shared<GroupedAggregateHashTable>(
161
- context.client, BufferAllocator::Get(context.client), group_types, op.payload_types, op.bindings,
162
- HtEntryType::HT_WIDTH_64));
341
+ ht.UnpinData();
342
+ partitioned_data->Repartition(*lstate.abandoned_data);
343
+ ht.SetRadixBits(gstate.config.GetRadixBits());
344
+ ht.InitializePartitionedData();
345
+ return true;
163
346
  }
164
- D_ASSERT(gstate.finalized_hts.size() == 1);
165
- D_ASSERT(gstate.finalized_hts[0]);
166
- llstate.total_groups +=
167
- gstate.finalized_hts[0]->AddChunk(gstate.append_state, group_chunk, payload_input, filter);
168
- return;
169
347
  }
170
348
 
171
- if (group_chunk.size() > 0) {
172
- llstate.is_empty = false;
173
- }
349
+ const auto partition_count = partitioned_data->PartitionCount();
350
+ const auto current_radix_bits = RadixPartitioning::RadixBits(partition_count);
351
+ D_ASSERT(current_radix_bits <= config.GetRadixBits());
174
352
 
175
- if (!llstate.ht) {
176
- llstate.ht =
177
- make_uniq<PartitionableHashTable>(context.client, BufferAllocator::Get(context.client),
178
- *gstate.partition_info, group_types, op.payload_types, op.bindings);
179
- if (context.client.config.force_external) {
180
- gstate.partitioned = true;
181
- }
353
+ const auto row_size_per_partition =
354
+ partitioned_data->Count() * partitioned_data->GetLayout().GetRowWidth() / partition_count;
355
+ if (row_size_per_partition > config.BLOCK_FILL_FACTOR * Storage::BLOCK_SIZE) {
356
+ // We crossed our block filling threshold, try to increment radix bits
357
+ config.SetRadixBits(current_radix_bits + config.REPARTITION_RADIX_BITS);
182
358
  }
183
359
 
184
- llstate.total_groups += llstate.ht->AddChunk(group_chunk, payload_input,
185
- gstate.partitioned && gstate.partition_info->n_partitions > 1, filter);
186
- if (llstate.total_groups >= radix_limit) {
187
- gstate.partitioned = true;
360
+ const auto global_radix_bits = config.GetRadixBits();
361
+ if (current_radix_bits == global_radix_bits) {
362
+ return false; // We're already on the right number of radix bits
188
363
  }
364
+
365
+ // We're out-of-sync with the global radix bits, repartition
366
+ ht.UnpinData();
367
+ auto old_partitioned_data = std::move(partitioned_data);
368
+ ht.SetRadixBits(global_radix_bits);
369
+ ht.InitializePartitionedData();
370
+ old_partitioned_data->Repartition(*ht.GetPartitionedData());
371
+ return true;
189
372
  }
190
373
 
191
- void RadixPartitionedHashTable::Combine(ExecutionContext &context, GlobalSinkState &state,
192
- LocalSinkState &lstate) const {
193
- auto &llstate = lstate.Cast<RadixHTLocalState>();
194
- auto &gstate = state.Cast<RadixHTGlobalState>();
195
- D_ASSERT(!gstate.is_finalized);
374
+ void RadixPartitionedHashTable::Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input,
375
+ DataChunk &payload_input, const unsafe_vector<idx_t> &filter) const {
376
+ auto &gstate = input.global_state.Cast<RadixHTGlobalSinkState>();
377
+ auto &lstate = input.local_state.Cast<RadixHTLocalSinkState>();
378
+ if (!lstate.ht) {
379
+ lstate.ht = CreateHT(context.client, gstate.config.sink_capacity, gstate.config.GetRadixBits());
380
+ gstate.active_threads++;
381
+ }
196
382
 
197
- // this actually does not do a lot but just pushes the local HTs into the global state so we can later combine them
198
- // in parallel
383
+ auto &group_chunk = lstate.group_chunk;
384
+ PopulateGroupChunk(group_chunk, chunk);
199
385
 
200
- if (ForceSingleHT(state)) {
201
- D_ASSERT(gstate.finalized_hts.size() <= 1);
202
- return;
203
- }
386
+ auto &ht = *lstate.ht;
387
+ ht.AddChunk(group_chunk, payload_input, filter);
204
388
 
205
- if (!llstate.ht) {
206
- return; // no data
389
+ if (ht.Count() + STANDARD_VECTOR_SIZE < ht.ResizeThreshold()) {
390
+ return; // We can fit another chunk
207
391
  }
208
392
 
209
- if (!llstate.ht->IsPartitioned() && gstate.partition_info->n_partitions > 1 && gstate.partitioned) {
210
- llstate.ht->Partition(true);
393
+ if (gstate.active_threads > 2) {
394
+ // 'Reset' the HT without taking its data, we can just keep appending to the same collection
395
+ // This only works because we never resize the HT
396
+ ht.ClearPointerTable();
397
+ ht.ResetCount();
398
+ // We don't do this when running with 1 or 2 threads, it only makes sense when there's many threads
211
399
  }
212
400
 
213
- // we will never add new values to these HTs so we can drop the first part of the HT
214
- llstate.ht->Finalize();
401
+ // Check if we need to repartition
402
+ auto repartitioned = MaybeRepartition(context.client, gstate, lstate);
215
403
 
216
- lock_guard<mutex> glock(gstate.lock);
217
- if (!llstate.is_empty) {
218
- gstate.is_empty = false;
404
+ if (repartitioned && ht.Count() != 0) {
405
+ // We repartitioned, but we didn't clear the pointer table / reset the count because we're on 1 or 2 threads
406
+ ht.ClearPointerTable();
407
+ ht.ResetCount();
219
408
  }
220
- // at this point we just collect them the PhysicalHashAggregateFinalizeTask (below) will merge them in parallel
221
- gstate.intermediate_hts.push_back(std::move(llstate.ht));
409
+
410
+ // TODO: combine early and often
222
411
  }
223
412
 
224
- void RadixPartitionedHashTable::InitializeFinalizedHTs(ClientContext &context, GlobalSinkState &gstate_p) const {
225
- auto &gstate = gstate_p.Cast<RadixHTGlobalState>();
226
- auto &allocator = BufferAllocator::Get(context);
227
- gstate.finalized_hts.resize(gstate.partition_info->n_partitions);
228
- for (idx_t r = 0; r < gstate.partition_info->n_partitions; r++) {
229
- gstate.finalized_hts[r] = make_shared<GroupedAggregateHashTable>(
230
- context, allocator, group_types, op.payload_types, op.bindings, HtEntryType::HT_WIDTH_64);
413
+ void RadixPartitionedHashTable::Combine(ExecutionContext &context, GlobalSinkState &gstate_p,
414
+ LocalSinkState &lstate_p) const {
415
+ auto &gstate = gstate_p.Cast<RadixHTGlobalSinkState>();
416
+ auto &lstate = lstate_p.Cast<RadixHTLocalSinkState>();
417
+ if (!lstate.ht) {
418
+ return;
231
419
  }
232
- }
233
420
 
234
- bool RadixPartitionedHashTable::Finalize(ClientContext &context, GlobalSinkState &gstate_p) const {
235
- auto &gstate = gstate_p.Cast<RadixHTGlobalState>();
236
- D_ASSERT(!gstate.is_finalized);
237
- gstate.is_finalized = true;
421
+ // Set any_combined, then check one last time whether we need to repartition
422
+ gstate.any_combined = true;
423
+ MaybeRepartition(context.client, gstate, lstate);
238
424
 
239
- // special case if we have non-combinable aggregates
240
- // we have already aggregated into a global shared HT that does not require any additional finalization steps
241
- if (ForceSingleHT(gstate)) {
242
- D_ASSERT(gstate.finalized_hts.size() <= 1);
243
- D_ASSERT(gstate.finalized_hts.empty() || gstate.finalized_hts[0]);
244
- return false;
425
+ auto &ht = *lstate.ht;
426
+ ht.UnpinData();
427
+
428
+ if (lstate.abandoned_data) {
429
+ D_ASSERT(gstate.external);
430
+ D_ASSERT(lstate.abandoned_data->PartitionCount() == lstate.ht->GetPartitionedData()->PartitionCount());
431
+ D_ASSERT(lstate.abandoned_data->PartitionCount() ==
432
+ RadixPartitioning::NumberOfPartitions(gstate.config.GetRadixBits()));
433
+ lstate.abandoned_data->Combine(*lstate.ht->GetPartitionedData());
434
+ } else {
435
+ lstate.abandoned_data = std::move(ht.GetPartitionedData());
245
436
  }
246
437
 
247
- // we can have two cases now, non-partitioned for few groups and radix-partitioned for very many groups.
248
- auto &allocator = BufferAllocator::Get(context);
249
- if (AnyPartitioned(gstate_p)) {
250
- // if one is partitioned, all have to be
251
- // this should mostly have already happened in Combine, but if not we do it here
252
- for (auto &pht : gstate.intermediate_hts) {
253
- if (!pht->IsPartitioned()) {
254
- pht->Partition(true);
255
- }
256
- }
257
- // schedule additional tasks to combine the partial HTs
258
- InitializeFinalizedHTs(context, gstate_p);
259
- gstate.is_partitioned = true;
260
- return true;
261
- } else { // in the non-partitioned case we immediately combine all the unpartitioned hts created by the threads.
262
- // TODO possible optimization, if total count < limit for 32 bit ht, use that one
263
- // create this ht here so finalize needs no lock on gstate
264
-
265
- gstate.finalized_hts.push_back(make_shared<GroupedAggregateHashTable>(
266
- context, allocator, group_types, op.payload_types, op.bindings, HtEntryType::HT_WIDTH_64));
267
- for (auto &pht : gstate.intermediate_hts) {
268
- auto unpartitioned = pht->GetUnpartitioned();
269
- for (auto &unpartitioned_ht : unpartitioned) {
270
- D_ASSERT(unpartitioned_ht);
271
- gstate.finalized_hts[0]->Combine(*unpartitioned_ht);
272
- unpartitioned_ht.reset();
273
- }
274
- unpartitioned.clear();
275
- }
276
- D_ASSERT(gstate.finalized_hts[0]);
277
- gstate.finalized_hts[0]->Finalize();
278
- return false;
438
+ lock_guard<mutex> guard(gstate.lock);
439
+ if (gstate.uncombined_data) {
440
+ gstate.uncombined_data->Combine(*lstate.abandoned_data);
441
+ } else {
442
+ gstate.uncombined_data = std::move(lstate.abandoned_data);
279
443
  }
444
+ gstate.stored_allocators.emplace_back(ht.GetAggregateAllocator());
280
445
  }
281
446
 
282
- // this task is run in multiple threads and combines the radix-partitioned hash tables into a single one and then
283
- // folds them into the global ht finally.
284
- class RadixAggregateFinalizeTask : public ExecutorTask {
285
- public:
286
- RadixAggregateFinalizeTask(Executor &executor, shared_ptr<Event> event_p, RadixHTGlobalState &state_p,
287
- idx_t radix_p)
288
- : ExecutorTask(executor), event(std::move(event_p)), state(state_p), radix(radix_p) {
289
- }
290
-
291
- static void FinalizeHT(RadixHTGlobalState &gstate, idx_t radix) {
292
- D_ASSERT(gstate.partition_info->n_partitions <= gstate.finalized_hts.size());
293
- D_ASSERT(gstate.finalized_hts[radix]);
294
-
295
- idx_t pht_idx_from = 0;
296
- idx_t pht_idx_to = gstate.intermediate_hts.size();
297
- if (gstate.repartitioned) {
298
- const auto num_partitions_before = gstate.repartition_tasks.size();
299
- const auto multiplier = gstate.partition_info->n_partitions / num_partitions_before;
300
- const auto radix_before = radix / multiplier;
301
- pht_idx_from = radix_before * gstate.repartition_tasks_per_partition;
302
- pht_idx_to = pht_idx_from + gstate.repartition_tasks_per_partition;
303
- }
447
+ void RadixPartitionedHashTable::Finalize(ClientContext &, GlobalSinkState &gstate_p) const {
448
+ auto &gstate = gstate_p.Cast<RadixHTGlobalSinkState>();
449
+
450
+ if (gstate.uncombined_data) {
451
+ auto &uncombined_data = *gstate.uncombined_data;
452
+ gstate.count_before_combining = uncombined_data.Count();
453
+
454
+ // If true there is no need to combine, it was all done by a single thread in a single HT
455
+ const auto single_ht = !gstate.external && gstate.active_threads == 1;
304
456
 
305
- for (idx_t i = pht_idx_from; i < pht_idx_to; i++) {
306
- for (auto &ht : gstate.intermediate_hts[i]->GetPartition(radix)) {
307
- gstate.finalized_hts[radix]->Combine(*ht);
308
- ht.reset();
457
+ auto &uncombined_partition_data = uncombined_data.GetPartitions();
458
+ const auto n_partitions = uncombined_partition_data.size();
459
+ gstate.partitions.reserve(n_partitions);
460
+ for (idx_t i = 0; i < n_partitions; i++) {
461
+ gstate.partitions.emplace_back(make_uniq<AggregatePartition>(std::move(uncombined_partition_data[i])));
462
+ if (single_ht) {
463
+ gstate.finalize_idx++;
464
+ gstate.partitions.back()->finalized = true;
309
465
  }
310
466
  }
311
- gstate.finalized_hts[radix]->Finalize();
467
+ } else {
468
+ gstate.count_before_combining = 0;
312
469
  }
313
470
 
314
- TaskExecutionResult ExecuteTask(TaskExecutionMode mode) override {
315
- FinalizeHT(state, radix);
316
- event->FinishTask();
317
- return TaskExecutionResult::TASK_FINISHED;
318
- }
471
+ gstate.finalized = true;
472
+ }
319
473
 
320
- private:
321
- shared_ptr<Event> event;
322
- RadixHTGlobalState &state;
323
- idx_t radix;
474
+ //===--------------------------------------------------------------------===//
475
+ // Source
476
+ //===--------------------------------------------------------------------===//
477
+ idx_t RadixPartitionedHashTable::Count(GlobalSinkState &sink_p) const {
478
+ const auto count = CountInternal(sink_p);
479
+ return count == 0 && grouping_set.empty() ? 1 : count;
480
+ }
481
+
482
+ idx_t RadixPartitionedHashTable::CountInternal(GlobalSinkState &sink_p) const {
483
+ auto &sink = sink_p.Cast<RadixHTGlobalSinkState>();
484
+ return sink.count_before_combining;
485
+ }
486
+
487
+ void RadixPartitionedHashTable::SetMultiScan(GlobalSinkState &sink_p) {
488
+ auto &sink = sink_p.Cast<RadixHTGlobalSinkState>();
489
+ sink.scan_pin_properties = TupleDataPinProperties::UNPIN_AFTER_DONE;
490
+ }
491
+
492
+ enum class RadixHTSourceTaskType : uint8_t { NO_TASK, FINALIZE, SCAN };
493
+
494
+ class RadixHTLocalSourceState;
495
+
496
+ class RadixHTGlobalSourceState : public GlobalSourceState {
497
+ public:
498
+ RadixHTGlobalSourceState(ClientContext &context, const RadixPartitionedHashTable &radix_ht);
499
+
500
+ //! Assigns a task to a local source state
501
+ bool AssignTask(RadixHTGlobalSinkState &sink, RadixHTLocalSourceState &lstate);
502
+
503
+ public:
504
+ //! The client context
505
+ ClientContext &context;
506
+ //! For synchronizing the source phase
507
+ atomic<bool> finished;
508
+
509
+ //! Column ids for scanning
510
+ vector<column_t> column_ids;
511
+
512
+ //! For synchronizing scan tasks
513
+ atomic<idx_t> scan_idx;
514
+ atomic<idx_t> scan_done;
324
515
  };
325
516
 
326
- class RadixAggregateRepartitionTask : public ExecutorTask {
517
+ enum class RadixHTScanStatus : uint8_t { INIT, IN_PROGRESS, DONE };
518
+
519
+ class RadixHTLocalSourceState : public LocalSourceState {
327
520
  public:
328
- RadixAggregateRepartitionTask(Executor &executor, shared_ptr<Event> event_p, RadixHTGlobalState &state_p,
329
- idx_t num_partitions_before_p)
330
- : ExecutorTask(executor), event(std::move(event_p)), state(state_p),
331
- num_partitions_before(num_partitions_before_p) {
332
- }
333
-
334
- TaskExecutionResult ExecuteTask(TaskExecutionMode mode) override {
335
- const auto multiplier = state.partition_info->n_partitions / num_partitions_before;
336
-
337
- idx_t repartition_radix = 0;
338
- idx_t finalize_radix = 0;
339
- while (repartition_radix < num_partitions_before && finalize_radix < state.partition_info->n_partitions) {
340
- // Loop over original partitions until we find one that we can repartition
341
- for (; repartition_radix < num_partitions_before; repartition_radix++) {
342
- auto task_idx = state.repartition_tasks_assigned[repartition_radix]++;
343
- if (task_idx >= state.repartition_tasks_per_partition) {
344
- continue;
345
- }
346
- auto &ht = state.repartition_tasks[repartition_radix][task_idx];
347
- ht->Partition(true);
348
- state.intermediate_hts[repartition_radix * state.repartition_tasks_per_partition + task_idx] =
349
- std::move(ht);
350
- state.repartition_tasks_done[repartition_radix]++;
351
- break;
352
- }
521
+ explicit RadixHTLocalSourceState(ExecutionContext &context, const RadixPartitionedHashTable &radix_ht);
353
522
 
354
- // Loop over repartitioned partitions
355
- for (; finalize_radix < state.partition_info->n_partitions; finalize_radix++) {
356
- const auto original_radix = finalize_radix / multiplier;
357
- if (state.repartition_tasks_done[original_radix] != state.repartition_tasks_per_partition) {
358
- break; // Needs more repartitioning
359
- }
360
-
361
- if (state.finalize_assigned[finalize_radix]) {
362
- continue; // Already assigned
363
- }
364
-
365
- {
366
- lock_guard<mutex> guard(state.lock);
367
- if (state.finalize_assigned[finalize_radix]) {
368
- // LCOV_EXCL_START
369
- continue; // Check again with lock, but already assigned
370
- // LCOV_EXCL_STOP
371
- }
372
- state.finalize_assigned[finalize_radix] = true;
373
- }
374
-
375
- // We can finalize!
376
- RadixAggregateFinalizeTask::FinalizeHT(state, finalize_radix);
377
- }
378
- }
379
- event->FinishTask();
380
- return TaskExecutionResult::TASK_FINISHED;
381
- }
523
+ public:
524
+ //! Do the work this thread has been assigned
525
+ void ExecuteTask(RadixHTGlobalSinkState &sink, RadixHTGlobalSourceState &gstate, DataChunk &chunk);
526
+ //! Whether this thread has finished the work it has been assigned
527
+ bool TaskFinished();
382
528
 
383
529
  private:
384
- shared_ptr<Event> event;
385
- RadixHTGlobalState &state;
386
- const idx_t num_partitions_before;
530
+ //! Execute the finalize or scan task
531
+ void Finalize(RadixHTGlobalSinkState &sink, RadixHTGlobalSourceState &gstate);
532
+ void Scan(RadixHTGlobalSinkState &sink, RadixHTGlobalSourceState &gstate, DataChunk &chunk);
533
+
534
+ public:
535
+ //! Current task and index
536
+ RadixHTSourceTaskType task;
537
+ idx_t task_idx;
538
+
539
+ //! Thread-local HT that is re-used to Finalize
540
+ unique_ptr<GroupedAggregateHashTable> ht;
541
+ //! Current status of a Scan
542
+ RadixHTScanStatus scan_status;
543
+
544
+ private:
545
+ //! Allocator and layout for finalizing state
546
+ TupleDataLayout layout;
547
+ ArenaAllocator aggregate_allocator;
548
+
549
+ //! State and chunk for scanning
550
+ TupleDataScanState scan_state;
551
+ DataChunk scan_chunk;
387
552
  };
388
553
 
389
- void RadixPartitionedHashTable::ScheduleTasks(Executor &executor, const shared_ptr<Event> &event,
390
- GlobalSinkState &state, vector<shared_ptr<Task>> &tasks) const {
391
- auto &gstate = state.Cast<RadixHTGlobalState>();
392
- if (!gstate.is_partitioned) {
393
- return;
554
+ unique_ptr<GlobalSourceState> RadixPartitionedHashTable::GetGlobalSourceState(ClientContext &context) const {
555
+ return make_uniq<RadixHTGlobalSourceState>(context, *this);
556
+ }
557
+
558
+ unique_ptr<LocalSourceState> RadixPartitionedHashTable::GetLocalSourceState(ExecutionContext &context) const {
559
+ return make_uniq<RadixHTLocalSourceState>(context, *this);
560
+ }
561
+
562
+ RadixHTGlobalSourceState::RadixHTGlobalSourceState(ClientContext &context_p, const RadixPartitionedHashTable &radix_ht)
563
+ : context(context_p), finished(false), scan_idx(0), scan_done(0) {
564
+ for (column_t column_id = 0; column_id < radix_ht.group_types.size(); column_id++) {
565
+ column_ids.push_back(column_id);
394
566
  }
567
+ }
395
568
 
396
- idx_t repartition_radix_bits;
397
- idx_t concurrent_repartitions;
398
- idx_t tasks_per_partition;
399
- GetRepartitionInfo(executor.context, state, repartition_radix_bits, concurrent_repartitions, tasks_per_partition);
400
- if (repartition_radix_bits == gstate.partition_info->radix_bits) {
401
- // No repartitioning necessary
402
- for (idx_t r = 0; r < gstate.partition_info->n_partitions; r++) {
403
- D_ASSERT(gstate.partition_info->n_partitions <= gstate.finalized_hts.size());
404
- D_ASSERT(gstate.finalized_hts[r]);
405
- tasks.push_back(make_uniq<RadixAggregateFinalizeTask>(executor, event, gstate, r));
406
- }
407
- } else {
408
- // Schedule repartition / finalize tasks
409
- ScheduleRepartitionTasks(executor, event, state, tasks, repartition_radix_bits, concurrent_repartitions,
410
- tasks_per_partition);
411
- }
412
- }
413
-
414
- void RadixPartitionedHashTable::ScheduleRepartitionTasks(Executor &executor, const shared_ptr<Event> &event,
415
- GlobalSinkState &state, vector<shared_ptr<Task>> &tasks,
416
- const idx_t repartition_radix_bits,
417
- const idx_t concurrent_repartitions,
418
- const idx_t tasks_per_partition) const {
419
- auto &gstate = state.Cast<RadixHTGlobalState>();
-	D_ASSERT(repartition_radix_bits > gstate.partition_info->radix_bits);
-	const auto num_partitions_before = gstate.partition_info->n_partitions;
-	const auto multiplier = RadixPartitioning::NumberOfPartitions(repartition_radix_bits) / num_partitions_before;
-
-	// Initialize gstate
-	auto new_partition_info =
-	    make_uniq<RadixPartitionInfo>(RadixPartitioning::NumberOfPartitions(repartition_radix_bits));
-	gstate.repartitioned = true;
-	gstate.repartition_tasks_per_partition = tasks_per_partition;
-	gstate.repartition_tasks.resize(num_partitions_before);
-	gstate.repartition_tasks_assigned = make_uniq_array<atomic<idx_t>>(num_partitions_before);
-	gstate.repartition_tasks_done = make_uniq_array<atomic<idx_t>>(num_partitions_before);
-	gstate.finalize_assigned = make_uniq_array<atomic<bool>>(new_partition_info->n_partitions);
-	for (idx_t partition_idx = 0; partition_idx < num_partitions_before; partition_idx++) {
-		gstate.repartition_tasks_assigned[partition_idx] = 0;
-		gstate.repartition_tasks_done[partition_idx] = 0;
-
-		// Grab intermediate data from gstate
-		HashTableList partition_list;
-		for (auto &pht : gstate.intermediate_hts) {
-			for (auto &ht : pht->GetPartition(partition_idx)) {
-				partition_list.push_back(std::move(ht));
-			}
-		}
+bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTLocalSourceState &lstate) {
+	D_ASSERT(lstate.scan_status != RadixHTScanStatus::IN_PROGRESS);
 
-		// Spread the data across the tasks
-		const idx_t hts_per_task = (partition_list.size() + tasks_per_partition - 1) / tasks_per_partition;
-		idx_t ht_idx = 0;
-		for (idx_t task_idx = 0; task_idx < tasks_per_partition; task_idx++) {
-			auto task_ht =
-			    make_uniq<PartitionableHashTable>(executor.context, BufferAllocator::Get(executor.context),
-			                                      *new_partition_info, group_types, op.payload_types, op.bindings);
-			auto ht_idx_to = MinValue<idx_t>(ht_idx + hts_per_task, partition_list.size());
-			for (; ht_idx < ht_idx_to; ht_idx++) {
-				auto &ht = partition_list[ht_idx];
-				task_ht->Append(*ht);
-				ht.reset();
-			}
-			gstate.repartition_tasks[partition_idx].push_back(std::move(task_ht));
-		}
+	const auto n_partitions = sink.partitions.size();
+	if (scan_done == n_partitions) {
+		finished = true;
+		return false;
+	}
+	// We first try to assign a Scan task, then a Finalize task if that didn't work, without using any locks
 
-		for (idx_t i = 0; i < multiplier; i++) {
-			gstate.finalize_assigned[partition_idx * multiplier + i] = false;
+	// We need an atomic compare-and-swap to assign a Scan task, because we need to only increment
+	// the 'scan_idx' atomic if the 'finalize' of that partition is true, i.e., ready to be scanned
+	bool scan_assigned = true;
+	do {
+		lstate.task_idx = scan_idx.load();
+		if (lstate.task_idx >= n_partitions || !sink.partitions[lstate.task_idx]->finalized) {
+			scan_assigned = false;
+			break;
 		}
-	}
+	} while (!std::atomic_compare_exchange_weak(&scan_idx, &lstate.task_idx, lstate.task_idx + 1));
 
-	// Schedule tasks equal to number of threads
-	const idx_t num_threads = TaskScheduler::GetScheduler(executor.context).NumberOfThreads();
-	for (idx_t i = 0; i < num_threads; i++) {
-		tasks.emplace_back(make_shared<RadixAggregateRepartitionTask>(executor, event, gstate, num_partitions_before));
+	if (scan_assigned) {
+		// We successfully assigned a Scan task
+		D_ASSERT(lstate.task_idx < n_partitions && sink.partitions[lstate.task_idx]->finalized);
+		lstate.task = RadixHTSourceTaskType::SCAN;
+		lstate.scan_status = RadixHTScanStatus::INIT;
+		return true;
 	}
 
-	gstate.intermediate_hts.clear();
-	gstate.intermediate_hts.resize(num_partitions_before * tasks_per_partition);
+	// We can just increment the atomic here, much simpler than assigning the scan task
+	lstate.task_idx = sink.finalize_idx++;
+	if (lstate.task_idx < n_partitions) {
+		// We successfully assigned a Finalize task
+		lstate.task = RadixHTSourceTaskType::FINALIZE;
+		return true;
+	}
 
-	gstate.partition_info = std::move(new_partition_info);
-	InitializeFinalizedHTs(executor.context, state);
+	// We didn't manage to assign a finalize task
+	return false;
 }
 
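A note on the lock-free assignment in `AssignTask` above: `scan_idx` may only move past a partition once that partition's `finalized` flag is set, so a plain `fetch_add` would not be safe for Scan tasks; the compare-and-swap re-checks readiness every time it retries. Below is a minimal, self-contained sketch of the same claiming pattern (illustrative only, not DuckDB code; `Partition`, `TryClaimScan`, and `NUM_PARTITIONS` are invented names):

```cpp
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdio>

// Illustrative stand-ins; the real types in the diff above differ.
struct Partition {
	std::atomic<bool> finalized {false};
};

constexpr std::size_t NUM_PARTITIONS = 8;
static std::array<Partition, NUM_PARTITIONS> partitions;
static std::atomic<std::size_t> scan_idx {0};

// Try to claim the next partition for scanning. The index is only
// advanced if the candidate partition is ready, mirroring AssignTask.
static bool TryClaimScan(std::size_t &out_idx) {
	std::size_t idx = scan_idx.load();
	do {
		if (idx >= NUM_PARTITIONS || !partitions[idx].finalized.load()) {
			return false; // next partition is not ready (or none are left)
		}
		// On failure, compare_exchange_weak reloads 'idx' with the current
		// value of scan_idx, so the readiness check above runs again for
		// the new candidate before the next attempt.
	} while (!scan_idx.compare_exchange_weak(idx, idx + 1));
	out_idx = idx;
	return true;
}

int main() {
	partitions[0].finalized = true;
	std::size_t idx;
	if (TryClaimScan(idx)) {
		std::printf("claimed partition %zu\n", idx); // prints: claimed partition 0
	}
	return 0;
}
```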
-bool RadixPartitionedHashTable::ForceSingleHT(GlobalSinkState &state) {
-	auto &gstate = state.Cast<RadixHTGlobalState>();
-	return gstate.partition_info->n_partitions < 2;
+RadixHTLocalSourceState::RadixHTLocalSourceState(ExecutionContext &context, const RadixPartitionedHashTable &radix_ht)
+    : task(RadixHTSourceTaskType::NO_TASK), scan_status(RadixHTScanStatus::DONE), layout(radix_ht.GetLayout().Copy()),
+      aggregate_allocator(BufferAllocator::Get(context.client)) {
+	auto &allocator = BufferAllocator::Get(context.client);
+	auto scan_chunk_types = radix_ht.group_types;
+	for (auto &aggr_type : radix_ht.op.aggregate_return_types) {
+		scan_chunk_types.push_back(aggr_type);
+	}
+	scan_chunk.Initialize(allocator, scan_chunk_types);
 }
 
-bool RadixPartitionedHashTable::AnyPartitioned(GlobalSinkState &state) {
-	auto &gstate = state.Cast<RadixHTGlobalState>();
-	for (auto &pht : gstate.intermediate_hts) {
-		if (pht->IsPartitioned()) {
-			return true;
-		}
+void RadixHTLocalSourceState::ExecuteTask(RadixHTGlobalSinkState &sink, RadixHTGlobalSourceState &gstate,
+                                          DataChunk &chunk) {
+	switch (task) {
+	case RadixHTSourceTaskType::FINALIZE:
+		Finalize(sink, gstate);
+		break;
+	case RadixHTSourceTaskType::SCAN:
+		Scan(sink, gstate, chunk);
+		break;
+	default:
+		throw InternalException("Unexpected RadixHTSourceTaskType in ExecuteTask!");
 	}
-	return false;
 }
 
-void RadixPartitionedHashTable::GetRepartitionInfo(ClientContext &context, GlobalSinkState &state,
-                                                   idx_t &repartition_radix_bits, idx_t &concurrent_repartitions,
-                                                   idx_t &tasks_per_partition) {
-	auto &gstate = state.Cast<RadixHTGlobalState>();
-	const auto num_partitions = gstate.partition_info->n_partitions;
-	const auto radix_bits = gstate.partition_info->radix_bits;
-	D_ASSERT(IsPowerOfTwo(num_partitions));
-
-	vector<idx_t> partition_counts(num_partitions, 0);
-	vector<idx_t> partition_sizes(num_partitions, 0);
-	for (const auto &ht : gstate.intermediate_hts) {
-		for (idx_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) {
-			partition_counts[partition_idx] += ht->GetPartitionCount(partition_idx);
-			partition_sizes[partition_idx] += ht->GetPartitionSize(partition_idx);
-		}
-	}
+void RadixHTLocalSourceState::Finalize(RadixHTGlobalSinkState &sink, RadixHTGlobalSourceState &gstate) {
+	D_ASSERT(task == RadixHTSourceTaskType::FINALIZE);
+	D_ASSERT(scan_status != RadixHTScanStatus::IN_PROGRESS);
 
-	idx_t total_size = 0;
-	idx_t max_partition_idx = 0;
-	idx_t max_partition_size = 0;
-	for (idx_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) {
-		const auto &partition_count = partition_counts[partition_idx];
-		const auto &partition_size = partition_sizes[partition_idx];
-		auto partition_ht_size =
-		    partition_size + GroupedAggregateHashTable::FirstPartSize(partition_count, HtEntryType::HT_WIDTH_64);
-		if (partition_ht_size > max_partition_size) {
-			max_partition_idx = partition_idx;
-			max_partition_size = partition_ht_size;
-		}
-		total_size += partition_ht_size;
-	}
-
-	// Switch to out-of-core finalize at ~60%
-	const auto max_ht_size = double(0.6) * BufferManager::GetBufferManager(context).GetMaxMemory();
-	const idx_t n_threads = PreviousPowerOfTwo(TaskScheduler::GetScheduler(context).NumberOfThreads());
-	D_ASSERT(IsPowerOfTwo(n_threads));
-	if (!context.config.force_external && total_size < max_ht_size) {
-		// In-memory finalize
-		if (num_partitions >= n_threads) { // Can already keep all threads busy
-			repartition_radix_bits = radix_bits;
-			tasks_per_partition = 1;
-		} else { // Repartition to keep all threads busy
-			// Can't have coverage because RadixHTGlobalState::MAX_RADIX_PARTITIONS > threads on github actions
-			// LCOV_EXCL_START
-			repartition_radix_bits = RadixPartitioning::RadixBits(NextPowerOfTwo(n_threads));
-			tasks_per_partition = n_threads / num_partitions;
-			// LCOV_EXCL_STOP
-		}
-		concurrent_repartitions = num_partitions;
+	auto &partition = *sink.partitions[task_idx];
+	if (partition.data->Count() == 0) {
+		partition.finalized = true;
 		return;
 	}
 
-	// Out-of-core finalize
-	const auto partition_count = partition_counts[max_partition_idx];
-	const auto partition_size = partition_sizes[max_partition_idx];
+	if (!ht) {
+		// Create a HT with sufficient capacity
+		const auto capacity = GroupedAggregateHashTable::GetCapacityForCount(partition.data->Count());
+		ht = sink.radix_ht.CreateHT(gstate.context, capacity, 0);
+	} else {
+		// We may want to resize here to the size of this partition, but for now we just assume uniform partition sizes
+		ht->InitializePartitionedData();
+		ht->ClearPointerTable();
+		ht->ResetCount();
+	}
 
-	const auto max_added_bits = RadixPartitioning::MAX_RADIX_BITS - radix_bits;
-	idx_t added_bits;
-	for (added_bits = 1; added_bits < max_added_bits; added_bits++) {
-		double partition_multiplier = RadixPartitioning::NumberOfPartitions(added_bits);
+	// Now combine the uncombined data using this thread's HT
+	ht->Combine(*partition.data);
+	ht->UnpinData();
 
-		auto new_estimated_count = double(partition_count) / partition_multiplier;
-		auto new_estimated_size = double(partition_size) / partition_multiplier;
-		auto new_estimated_ht_size = new_estimated_size + GroupedAggregateHashTable::FirstPartSize(
-		                                                      new_estimated_count, HtEntryType::HT_WIDTH_64);
+	// Move the combined data back to the partition
+	partition.data =
+	    make_uniq<TupleDataCollection>(BufferManager::GetBufferManager(gstate.context), sink.radix_ht.GetLayout());
+	partition.data->Combine(*ht->GetPartitionedData()->GetPartitions()[0]);
 
-		if (new_estimated_ht_size <= max_ht_size / n_threads) {
-			break; // Max HT size is safe
-		}
-	}
-	repartition_radix_bits = radix_bits + added_bits;
-	concurrent_repartitions = MinValue<idx_t>(MaxValue<idx_t>(1, max_ht_size / max_partition_size), n_threads);
-	tasks_per_partition = NextPowerOfTwo(n_threads / concurrent_repartitions);
+	// Mark partition as ready to scan
+	partition.finalized = true;
+
+	// Make sure this thread's aggregate allocator does not get lost
+	lock_guard<mutex> guard(sink.lock);
+	sink.stored_allocators.emplace_back(ht->GetAggregateAllocator());
 }
 
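The Finalize task above has a deliberate shape: each worker keeps one thread-local hash table that it reuses across partitions (clearing the pointer table and count rather than reallocating), combines a partition's uncombined data into it, writes the combined result back into the partition, and only then sets `finalized` so scanners never observe partial data. Here is a simplified sketch of that combine-then-publish pattern, with a plain `std::unordered_map` standing in for `GroupedAggregateHashTable` (all names below are invented for illustration):

```cpp
#include <atomic>
#include <unordered_map>
#include <utility>
#include <vector>

// Invented stand-ins for the partition/HT machinery in the diff above.
struct SketchPartition {
	std::vector<std::pair<int, long>> data; // uncombined (group, value) pairs
	std::vector<std::pair<int, long>> combined;
	std::atomic<bool> finalized {false};
};

struct SketchWorker {
	// One scratch table per worker thread, reused across partitions;
	// analogous to ClearPointerTable/ResetCount instead of reallocating.
	std::unordered_map<int, long> ht;

	void Finalize(SketchPartition &partition) {
		ht.clear();
		for (const auto &kv : partition.data) {
			ht[kv.first] += kv.second; // combine the uncombined data
		}
		// Publish the combined result back into the partition first...
		partition.combined.assign(ht.begin(), ht.end());
		// ...and only then mark it ready, so a concurrent scanner that
		// observes 'finalized == true' never sees partial data.
		partition.finalized.store(true);
	}
};
```

The real code additionally hands the worker's aggregate allocator to the sink under a lock, so any state the combined rows still reference outlives the worker.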
-//===--------------------------------------------------------------------===//
-// Source
-//===--------------------------------------------------------------------===//
-class RadixHTGlobalSourceState : public GlobalSourceState {
-public:
-	explicit RadixHTGlobalSourceState(Allocator &allocator, const RadixPartitionedHashTable &ht)
-	    : ht_index(0), initialized(false), finished(false) {
-	}
+void RadixHTLocalSourceState::Scan(RadixHTGlobalSinkState &sink, RadixHTGlobalSourceState &gstate, DataChunk &chunk) {
+	D_ASSERT(task == RadixHTSourceTaskType::SCAN);
+	D_ASSERT(scan_status != RadixHTScanStatus::DONE);
 
-	//! Heavy handed for now.
-	mutex lock;
-	//! The current position to scan the HT for output tuples
-	idx_t ht_index;
-	//! The set of aggregate scan states
-	unsafe_unique_array<TupleDataParallelScanState> ht_scan_states;
-	atomic<bool> initialized;
-	atomic<bool> finished;
-};
+	auto &partition = *sink.partitions[task_idx];
+	D_ASSERT(partition.finalized);
+	auto &data_collection = *partition.data;
 
-class RadixHTLocalSourceState : public LocalSourceState {
-public:
-	explicit RadixHTLocalSourceState(ExecutionContext &context, const RadixPartitionedHashTable &ht) {
-		auto &allocator = BufferAllocator::Get(context.client);
-		auto scan_chunk_types = ht.group_types;
-		for (auto &aggr_type : ht.op.aggregate_return_types) {
-			scan_chunk_types.push_back(aggr_type);
+	if (data_collection.Count() == 0) {
+		scan_status = RadixHTScanStatus::DONE;
+		if (++gstate.scan_done == sink.partitions.size()) {
+			gstate.finished = true;
 		}
-		scan_chunk.Initialize(allocator, scan_chunk_types);
+		return;
 	}
 
-	//! Materialized GROUP BY expressions & aggregates
-	DataChunk scan_chunk;
-	//! HT index
-	idx_t ht_index = DConstants::INVALID_INDEX;
-	//! A reference to the current HT that we are scanning
-	shared_ptr<GroupedAggregateHashTable> ht;
-	//! Scan state for the current HT
-	TupleDataLocalScanState scan_state;
-};
+	if (scan_status == RadixHTScanStatus::INIT) {
+		data_collection.InitializeScan(scan_state, gstate.column_ids, sink.scan_pin_properties);
+		scan_status = RadixHTScanStatus::IN_PROGRESS;
+	}
 
-unique_ptr<GlobalSourceState> RadixPartitionedHashTable::GetGlobalSourceState(ClientContext &context) const {
-	return make_uniq<RadixHTGlobalSourceState>(BufferAllocator::Get(context), *this);
-}
+	if (!data_collection.Scan(scan_state, scan_chunk)) {
+		scan_status = RadixHTScanStatus::DONE;
+		if (++gstate.scan_done == sink.partitions.size()) {
+			gstate.finished = true;
+		}
+		if (sink.scan_pin_properties == TupleDataPinProperties::DESTROY_AFTER_DONE) {
+			data_collection.Reset();
+		}
+		return;
+	}
 
-unique_ptr<LocalSourceState> RadixPartitionedHashTable::GetLocalSourceState(ExecutionContext &context) const {
-	return make_uniq<RadixHTLocalSourceState>(context, *this);
-}
+	RowOperationsState row_state(aggregate_allocator);
+	const auto group_cols = layout.ColumnCount() - 1;
+	RowOperations::FinalizeStates(row_state, layout, scan_state.chunk_state.row_locations, scan_chunk, group_cols);
 
-idx_t RadixPartitionedHashTable::Size(GlobalSinkState &sink_state) const {
-	auto &gstate = sink_state.Cast<RadixHTGlobalState>();
-	if (gstate.is_empty && grouping_set.empty()) {
-		return 1;
+	if (sink.scan_pin_properties == TupleDataPinProperties::DESTROY_AFTER_DONE && layout.HasDestructor()) {
+		RowOperations::DestroyStates(row_state, layout, scan_state.chunk_state.row_locations, scan_chunk.size());
 	}
 
-	idx_t count = 0;
-	for (const auto &ht : gstate.finalized_hts) {
-		count += ht->Count();
+	auto &radix_ht = sink.radix_ht;
+	idx_t chunk_index = 0;
+	for (auto &entry : radix_ht.grouping_set) {
+		chunk.data[entry].Reference(scan_chunk.data[chunk_index++]);
+	}
+	for (auto null_group : radix_ht.null_groups) {
+		chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
+		ConstantVector::SetNull(chunk.data[null_group], true);
+	}
+	D_ASSERT(radix_ht.grouping_set.size() + radix_ht.null_groups.size() == radix_ht.op.GroupCount());
+	for (idx_t col_idx = 0; col_idx < radix_ht.op.aggregates.size(); col_idx++) {
+		chunk.data[radix_ht.op.GroupCount() + col_idx].Reference(
+		    scan_chunk.data[radix_ht.group_types.size() + col_idx]);
+	}
+	D_ASSERT(radix_ht.op.grouping_functions.size() == radix_ht.grouping_values.size());
+	for (idx_t i = 0; i < radix_ht.op.grouping_functions.size(); i++) {
+		chunk.data[radix_ht.op.GroupCount() + radix_ht.op.aggregates.size() + i].Reference(radix_ht.grouping_values[i]);
+	}
+	chunk.SetCardinality(scan_chunk);
+	D_ASSERT(chunk.size() != 0);
+}
+
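The Scan routine above is a small per-task state machine: INIT performs one-time scan initialization, IN_PROGRESS emits chunks, and the transition to DONE (on an empty or exhausted collection) bumps the shared `scan_done` counter, with the last partition flipping the global `finished` flag. A reduced sketch of those transitions (illustrative only; `ScanSketch` and `Emit` are invented names):

```cpp
#include <atomic>
#include <cstddef>

// The scan lifecycle from the diff above, reduced to its state machine.
enum class ScanState { INIT, IN_PROGRESS, DONE };

struct ScanSketch {
	ScanState state = ScanState::INIT;

	// 'rows_left' stands in for data_collection.Scan() producing chunks.
	// Returns true while output was produced; on exhaustion, the last
	// partition to finish marks the whole source as finished.
	bool Emit(std::size_t &rows_left, std::atomic<std::size_t> &scan_done,
	          std::size_t n_partitions, std::atomic<bool> &finished) {
		if (state == ScanState::INIT) {
			state = ScanState::IN_PROGRESS; // one-time scan initialization
		}
		if (rows_left == 0) {
			state = ScanState::DONE;
			if (++scan_done == n_partitions) {
				finished.store(true);
			}
			return false;
		}
		--rows_left; // emit one "chunk"
		return true;
	}
};
```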
+bool RadixHTLocalSourceState::TaskFinished() {
+	switch (task) {
+	case RadixHTSourceTaskType::FINALIZE:
+		return true;
+	case RadixHTSourceTaskType::SCAN:
+		return scan_status == RadixHTScanStatus::DONE;
+	default:
+		D_ASSERT(task == RadixHTSourceTaskType::NO_TASK);
+		return true;
 	}
-	return count;
 }
 
 SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, DataChunk &chunk,
-                                                    GlobalSinkState &sink_state, OperatorSourceInput &input) const {
-	auto &gstate = sink_state.Cast<RadixHTGlobalState>();
-	auto &state = input.global_state.Cast<RadixHTGlobalSourceState>();
+                                                    GlobalSinkState &sink_p, OperatorSourceInput &input) const {
+	auto &sink = sink_p.Cast<RadixHTGlobalSinkState>();
+	D_ASSERT(sink.finalized);
+
+	auto &gstate = input.global_state.Cast<RadixHTGlobalSourceState>();
 	auto &lstate = input.local_state.Cast<RadixHTLocalSourceState>();
-	D_ASSERT(gstate.is_finalized);
-	if (state.finished) {
+	D_ASSERT(sink.scan_pin_properties == TupleDataPinProperties::UNPIN_AFTER_DONE ||
+	         sink.scan_pin_properties == TupleDataPinProperties::DESTROY_AFTER_DONE);
+
+	if (gstate.finished) {
 		return SourceResultType::FINISHED;
 	}
 
-	// special case hack to sort out aggregating from empty intermediates
-	// for aggregations without groups
-	if (gstate.is_empty && grouping_set.empty()) {
+	// Special case hack to sort out aggregating from empty intermediates for aggregations without groups
+	if (CountInternal(sink_p) == 0 && grouping_set.empty()) {
 		D_ASSERT(chunk.ColumnCount() == null_groups.size() + op.aggregates.size() + op.grouping_functions.size());
-		// for each column in the aggregates, set to initial state
+		// For each column in the aggregates, set to initial state
 		chunk.SetCardinality(1);
 		for (auto null_group : null_groups) {
 			chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
@@ -666,97 +786,17 @@ SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, D
 		for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
 			chunk.data[null_groups.size() + op.aggregates.size() + i].Reference(grouping_values[i]);
 		}
-		state.finished = true;
-		return chunk.size() == 0 ? SourceResultType::FINISHED : SourceResultType::HAVE_MORE_OUTPUT;
-	}
-	if (gstate.is_empty) {
-		state.finished = true;
-		return chunk.size() == 0 ? SourceResultType::FINISHED : SourceResultType::HAVE_MORE_OUTPUT;
-	}
-	idx_t elements_found = 0;
-
-	lstate.scan_chunk.Reset();
-	if (!state.initialized) {
-		lock_guard<mutex> l(state.lock);
-		if (!state.initialized) {
-			auto &finalized_hts = gstate.finalized_hts;
-			state.ht_scan_states = make_unsafe_uniq_array<TupleDataParallelScanState>(finalized_hts.size());
-
-			const auto &layout = gstate.finalized_hts[0]->GetDataCollection().GetLayout();
-			vector<column_t> column_ids;
-			column_ids.reserve(layout.ColumnCount() - 1);
-			for (idx_t col_idx = 0; col_idx < layout.ColumnCount() - 1; col_idx++) {
-				column_ids.emplace_back(col_idx);
-			}
-
-			for (idx_t ht_idx = 0; ht_idx < finalized_hts.size(); ht_idx++) {
-				gstate.finalized_hts[ht_idx]->GetDataCollection().InitializeScan(
-				    state.ht_scan_states.get()[ht_idx].scan_state, column_ids);
-			}
-			state.initialized = true;
-		}
+		gstate.finished = true;
+		return SourceResultType::HAVE_MORE_OUTPUT;
 	}
 
-	auto &local_scan_state = lstate.scan_state;
-	while (true) {
-		D_ASSERT(state.ht_scan_states);
-		idx_t ht_index;
-		{
-			lock_guard<mutex> l(state.lock);
-			ht_index = state.ht_index;
-			if (ht_index >= gstate.finalized_hts.size()) {
-				state.finished = true;
-				return chunk.size() == 0 ? SourceResultType::FINISHED : SourceResultType::HAVE_MORE_OUTPUT;
-			}
-		}
-		D_ASSERT(ht_index < gstate.finalized_hts.size());
-		if (lstate.ht_index != DConstants::INVALID_INDEX && ht_index != lstate.ht_index) {
-			lstate.ht->GetDataCollection().FinalizePinState(local_scan_state.pin_state);
-		}
-		lstate.ht_index = ht_index;
-		lstate.ht = gstate.finalized_hts[ht_index];
-		D_ASSERT(lstate.ht);
-
-		auto &global_scan_state = state.ht_scan_states[ht_index];
-		elements_found = lstate.ht->Scan(global_scan_state, local_scan_state, lstate.scan_chunk);
-		if (elements_found > 0) {
-			break;
-		}
-		lstate.ht->GetDataCollection().FinalizePinState(local_scan_state.pin_state);
-
-		// move to the next hash table
-		lock_guard<mutex> l(state.lock);
-		ht_index++;
-		if (ht_index > state.ht_index) {
-			// we have not yet worked on the table
-			// move the global index forwards
-			if (!gstate.multi_scan) {
-				gstate.finalized_hts[state.ht_index].reset();
-			}
-			state.ht_index = ht_index;
+	while (!gstate.finished && chunk.size() == 0) {
+		if (!lstate.TaskFinished() || gstate.AssignTask(sink, lstate)) {
+			lstate.ExecuteTask(sink, gstate, chunk);
 		}
 	}
 
-	// compute the final projection list
-	chunk.SetCardinality(elements_found);
-
-	idx_t chunk_index = 0;
-	for (auto &entry : grouping_set) {
-		chunk.data[entry].Reference(lstate.scan_chunk.data[chunk_index++]);
-	}
-	for (auto null_group : null_groups) {
-		chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
-		ConstantVector::SetNull(chunk.data[null_group], true);
-	}
-	D_ASSERT(grouping_set.size() + null_groups.size() == op.GroupCount());
-	for (idx_t col_idx = 0; col_idx < op.aggregates.size(); col_idx++) {
-		chunk.data[op.GroupCount() + col_idx].Reference(lstate.scan_chunk.data[group_types.size() + col_idx]);
-	}
-	D_ASSERT(op.grouping_functions.size() == grouping_values.size());
-	for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
-		chunk.data[op.GroupCount() + op.aggregates.size() + i].Reference(grouping_values[i]);
-	}
-	return chunk.size() == 0 ? SourceResultType::FINISHED : SourceResultType::HAVE_MORE_OUTPUT;
+	return SourceResultType::HAVE_MORE_OUTPUT;
 }
 
 } // namespace duckdb
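Taken together, the reworked `GetData` is a small cooperative scheduler: each call keeps the thread busy until it either produces a chunk or all partitions have been finalized and scanned, resuming an unfinished task before claiming a new one. A compile-only sketch of that control flow (invented names; as in `AssignTask` above, the assignment step is what eventually sets `finished`):

```cpp
#include <atomic>
#include <functional>

// Invented stand-ins for the source-state interplay in GetData above.
struct LoopSketch {
	std::atomic<bool> finished {false};
	bool task_unfinished = false;          // stands in for !lstate.TaskFinished()
	std::function<bool()> try_assign_task; // stands in for gstate.AssignTask(...)
	std::function<bool()> execute_task;    // stands in for lstate.ExecuteTask(...); true if a chunk was produced

	// Mirrors: while (!gstate.finished && chunk.size() == 0) { ... }
	bool PullChunk() {
		bool have_output = false;
		while (!finished.load() && !have_output) {
			// Resume an unfinished task before claiming a new one; if neither
			// applies, keep looping until another thread (or try_assign_task
			// itself) sets 'finished' once all partitions are scanned.
			if (task_unfinished || try_assign_task()) {
				have_output = execute_task();
			}
		}
		return have_output;
	}
};
```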